Example #1
    def test_split_data(self):
        # Count how many samples of each gesture class the raw data file contains.
        with open("./data/complete_data", "r") as f:
            for line in f:
                dic = json.loads(line)
                for ges in self.num_dic:
                    if dic["gesture"] == ges:
                        self.num_dic[ges] += 1
        # Split with different train/validation fractions; the remainder forms the test set.
        train_data_0, valid_data_0, test_data_100 = split_data(self.data, 0, 0)
        train_data_50, valid_data_50, test_data_0 = split_data(self.data, 0.5, 0.5)
        train_data_60, valid_data_20, test_data_20 = split_data(self.data, 0.6, 0.2)
        # Expected sizes, computed per gesture class (int() truncates each class count).
        len_60 = sum(int(self.num_dic[ges] * 0.6)
                     for ges in ("wing", "ring", "slope", "negative"))
        len_50 = sum(int(self.num_dic[ges] * 0.5)
                     for ges in ("wing", "ring", "slope", "negative"))
        len_20 = sum(int(self.num_dic[ges] * 0.2)
                     for ges in ("wing", "ring", "slope", "negative"))
        self.assertEqual(len(train_data_0), 0)
        self.assertEqual(len(train_data_50), len_50)
        self.assertEqual(len(train_data_60), len_60)
        self.assertEqual(len(valid_data_0), 0)
        self.assertEqual(len(valid_data_50), len_50)
        self.assertEqual(len(valid_data_20), len_20)
        self.assertEqual(len(test_data_100), self.num)
        self.assertEqual(len(test_data_0), self.num - 2 * len_50)
        self.assertEqual(len(test_data_20), self.num - len_60 - len_20)
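The assertions above pin down this variant's contract: split_data(data, train_frac, valid_frac) splits per gesture class, takes int(count * frac) samples of each class for the train and validation sets, and puts the remainder of each class in the test set. Below is a minimal sketch consistent with those assertions; the per-sample "gesture" key and the function body are assumptions, not the project's actual implementation.

# Hypothetical sketch of split_data(data, train_frac, valid_frac) -- reconstructed
# from the test's assertions, not the project's real implementation.
from collections import defaultdict


def split_data(data, train_frac, valid_frac):
    # Group samples by gesture class (assumes each sample is a dict with a "gesture" key).
    by_class = defaultdict(list)
    for sample in data:
        by_class[sample["gesture"]].append(sample)

    train, valid, test = [], [], []
    for samples in by_class.values():
        n_train = int(len(samples) * train_frac)   # int() truncates, matching len_60/len_50/len_20
        n_valid = int(len(samples) * valid_frac)
        train.extend(samples[:n_train])
        valid.extend(samples[n_train:n_train + n_valid])
        test.extend(samples[n_train + n_valid:])   # remainder of each class goes to the test set
    return train, valid, test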
Example #2
import json
import os
import numpy as np
from opts import get_opts
import missingpy as miss
import tqdm
import copy
import warnings
from data_imputation import pipeline
from data_split import split_data
import argparse

if __name__ == '__main__':
    args = get_opts()

    # Extract the nodes (and their positions) from the raw data, then split them.
    split_tool = split_data(args)
    nodes, pos = split_tool.node_extracing()

    res = split_tool.data_spliting(nodes)
    split_tool.save_json(res)
    print('>>>>>> Splitting data saving is done! <<<<<<')

    # Impute missing values with the pipeline's k-nearest-neighbours strategy.
    model = pipeline()
    model.data_imputating(model.k_nearest)
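The driver script above also fixes the interface that data_split.split_data and data_imputation.pipeline must expose. The skeleton below is inferred from those calls only; the docstrings, the k_nearest attribute and the method bodies are guesses, not the real classes.

# Interface skeleton inferred from the calls above; bodies are placeholders.
class split_data:
    def __init__(self, args):
        self.args = args  # options namespace returned by get_opts()

    def node_extracing(self):
        """Return (nodes, pos): the extracted nodes and their positions."""
        raise NotImplementedError

    def data_spliting(self, nodes):
        """Split the nodes into subsets and return the result."""
        raise NotImplementedError

    def save_json(self, res):
        """Write the split result to a JSON file."""
        raise NotImplementedError


class pipeline:
    k_nearest = 'k_nearest'  # imputation strategy selector passed to data_imputating()

    def data_imputating(self, method):
        """Impute missing values with the chosen strategy (e.g. a KNN imputer from missingpy)."""
        raise NotImplementedError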
Example #3
    print('using gpu')
else:
    print('using cpu')


# Flickr doesn't need to be split at the root node
def iterate_data(h5_file):
    for x in h5_file.root:
        yield x


f_nodes = [node for node in iterate_data(data_file)]

# split the database into train, test and validation sets. The default settings use the
# JSON file with the Karpathy split
train, test, val = split_data(f_nodes, args.split_loc)
############################### Neural network setup #################################################
# network modules
img_net = img_encoder(image_config)
cap_net = text_gru_encoder(token_config)

# Adam optimiser. I found SGD to work terribly and could not find appropriate parameter settings for it.
optimizer = torch.optim.Adam(
    list(img_net.parameters()) + list(cap_net.parameters()), 1)

#plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.9, patience = 100,
#                                                   threshold = 0.0001, min_lr = 1e-8, cooldown = 100)

#step_scheduler = lr_scheduler.StepLR(optimizer, 1000, gamma=0.1, last_epoch=-1)
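As the comment before the split_data call notes, the split comes from the Karpathy JSON file, which lists every image together with a 'split' field ('train', 'val', 'test', plus 'restval' for COCO). The sketch below shows one way such a split_data(f_nodes, split_loc) could work; the node_name mapping and the default-to-train fallback are assumptions, not the function imported above.

# Hypothetical sketch of split_data(f_nodes, split_loc) for a Karpathy-style split
# file; not necessarily the function used in the script above.
import json


def split_data(f_nodes, split_loc, node_name=lambda node: node._v_name):
    with open(split_loc) as fh:
        images = json.load(fh)['images']
    # The Karpathy JSON tags each image with 'train', 'val', 'test' (or 'restval' for COCO).
    split_of = {img['filename'].split('.')[0]: img['split'] for img in images}

    train, val, test = [], [], []
    for node in f_nodes:
        split = split_of.get(node_name(node), 'train')  # unmatched nodes default to train
        if split in ('train', 'restval'):
            train.append(node)
        elif split == 'val':
            val.append(node)
        else:
            test.append(node)
    return train, test, val  # same order as the call above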

Example #4
                replace_text = resolve(fulltext_string, tokenize)
                sent_tokens = sentence_tokenize(replace_text)
            else:
                sent_tokens = sentence_tokenize(fulltext_string)
            if len(sent_tokens) > args.min_sentences:
                try:
                    labels[key] = get_binary_labels(abstract_string,
                                                    sent_tokens)
                    abstracts[key] = abstract_string.strip()
                    fulltexts[key] = tokenized
                    sentence_tokens[key] = sent_tokens
                except ValueError:
                    continue
        est_time = ((time.time() - t) / (i + 1)) * (len(docs) - (i + 1))
        print(
            "Document {} of {} completed. Estimated time remaining: {} seconds\r"
            .format(i + 1, len(docs), round(est_time, 3)))

    write_dict_to_json(abstracts,
                       os.path.join(args.output_dir, 'abstracts.json'))
    write_dict_to_json(fulltexts,
                       os.path.join(args.output_dir, 'fulltexts.json'))
    write_dict_to_json(sentence_tokens,
                       os.path.join(args.output_dir, 'sentence_tokens.json'))
    write_dict_to_json(labels, os.path.join(args.output_dir, 'labels.json'))

    key_set = set(fulltexts.keys())
    splits = split_data(key_set, num_train_subset=5000)
    write_dict_to_json(splits, os.path.join(args.output_dir,
                                            'data_splits.json'))
Example #5
from data_fea_gen import gen_feature_file
from data_split import split_data
from model_train import train_model
from model_pred import pred_model
from evaluation import evaluation_result
if __name__ == '__main__':
    # Load the data to local storage
    data_to_local()
    # Process the data
    process_data()
    # Extract features: the argument is the prediction target name
    gen_feature_file('oper_rev')
    # Split the dataset
    split_data('2011-03-31',
               '2016-12-31',
               '2017-03-31',
               '2017-12-31',
               '2017-12-31',
               '2018-03-31', 'oper_rev')
    # Train the model and save it
    train_model('train.csv', 'valid.csv', '2018-03-31', 'oper_rev')
    # Predict
    pred_model('2018-03-31', 'oper_rev')
    # Evaluate
    evaluation_result('2018-03-31', 'oper_rev')
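For reference, a hedged sketch of what the date-based split_data called above might do: cut a per-target feature file into train, validation and test windows by report date and write the train.csv / valid.csv files that train_model() reads later. The feature file name, the date column and the inclusive boundaries are all assumptions, not the real data_split implementation.

# Hypothetical sketch of the date-based split_data(); file and column names are assumed.
import pandas as pd


def split_data(train_start, train_end, valid_start, valid_end,
               test_start, test_end, target):
    df = pd.read_csv('{}_features.csv'.format(target), parse_dates=['date'])

    def window(start, end):
        # Inclusive date window over the report-date column.
        return df[(df['date'] >= start) & (df['date'] <= end)]

    window(train_start, train_end).to_csv('train.csv', index=False)
    window(valid_start, valid_end).to_csv('valid.csv', index=False)
    window(test_start, test_end).to_csv('test.csv', index=False)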