def test_split_data(self):
    with open("./data/complete_data", "r") as f:
        lines = f.readlines()
    # Count how many samples each gesture class has.
    for idx, line in enumerate(lines):  # pylint: disable=unused-variable
        dic = json.loads(line)
        for ges in self.num_dic:
            if dic["gesture"] == ges:
                self.num_dic[ges] += 1
    # Split with three different (train, valid) fraction pairs.
    train_data_0, valid_data_0, test_data_100 = split_data(self.data, 0, 0)
    train_data_50, valid_data_50, test_data_0 = split_data(
        self.data, 0.5, 0.5)
    train_data_60, valid_data_20, test_data_20 = split_data(
        self.data, 0.6, 0.2)
    # Expected split sizes: each class count is truncated with int() before summing.
    len_60 = int(self.num_dic["wing"] * 0.6) + int(
        self.num_dic["ring"] * 0.6) + int(
            self.num_dic["slope"] * 0.6) + int(
                self.num_dic["negative"] * 0.6)
    len_50 = int(self.num_dic["wing"] * 0.5) + int(
        self.num_dic["ring"] * 0.5) + int(
            self.num_dic["slope"] * 0.5) + int(
                self.num_dic["negative"] * 0.5)
    len_20 = int(self.num_dic["wing"] * 0.2) + int(
        self.num_dic["ring"] * 0.2) + int(
            self.num_dic["slope"] * 0.2) + int(
                self.num_dic["negative"] * 0.2)
    self.assertEqual(len(train_data_0), 0)
    self.assertEqual(len(train_data_50), len_50)
    self.assertEqual(len(train_data_60), len_60)
    self.assertEqual(len(valid_data_0), 0)
    self.assertEqual(len(valid_data_50), len_50)
    self.assertEqual(len(valid_data_20), len_20)
    # Whatever is not assigned to train/valid must end up in the test split.
    self.assertEqual(len(test_data_100), self.num)
    self.assertEqual(len(test_data_0), (self.num - 2 * len_50))
    self.assertEqual(len(test_data_20), (self.num - len_60 - len_20))
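# --- Hypothetical sketch, not taken from the code under test ---
# The assertions above only pin down the contract of split_data: it takes the
# samples plus a train fraction and a validation fraction, splits each gesture
# class separately with int() truncation, and sends the remainder to the test
# split. A minimal implementation consistent with those assertions could look
# like this; the "gesture" key matches the JSON records read by the test, all
# other names and the record ordering are assumptions.
from collections import defaultdict

def split_data(samples, train_frac, valid_frac):
    by_gesture = defaultdict(list)
    for sample in samples:
        by_gesture[sample["gesture"]].append(sample)

    train, valid, test = [], [], []
    for records in by_gesture.values():
        n_train = int(len(records) * train_frac)
        n_valid = int(len(records) * valid_frac)
        train.extend(records[:n_train])
        valid.extend(records[n_train:n_train + n_valid])
        test.extend(records[n_train + n_valid:])
    return train, valid, test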
import json
import os
import numpy as np
from opts import get_opts
import missingpy as miss
import tqdm
import copy
import warnings
from data_imputation import pipeline
from data_split import split_data
import argparse

if __name__ == '__main__':
    args = get_opts()

    # Extract the nodes, split them into train/valid/test sets and save to JSON.
    split_tool = split_data(args)
    nodes, pos = split_tool.node_extracing()
    res = split_tool.data_spliting(nodes)
    split_tool.save_json(res)
    print('>>>>>> Splitting and saving the data is done! <<<<<<')

    # Impute missing values with the k-nearest-neighbour strategy.
    model = pipeline()
    model.data_imputating(model.k_nearest)
    print('using gpu')
else:
    print('using cpu')

# flickr does not need to be split at the root node
def iterate_data(h5_file):
    for x in h5_file.root:
        yield x

f_nodes = [node for node in iterate_data(data_file)]

# Split the database into train, test and validation sets. The default settings use the
# json file with the Karpathy split.
train, test, val = split_data(f_nodes, args.split_loc)

############################### Neural network setup #################################################

# network modules
img_net = img_encoder(image_config)
cap_net = text_gru_encoder(token_config)

# Adam optimiser. I found SGD to work terribly and could not find appropriate parameter settings for it.
optimizer = torch.optim.Adam(
    list(img_net.parameters()) + list(cap_net.parameters()), 1)

#plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', factor = 0.9, patience = 100,
#                                                   threshold = 0.0001, min_lr = 1e-8, cooldown = 100)
#step_scheduler = lr_scheduler.StepLR(optimizer, 1000, gamma=0.1, last_epoch=-1)
        replace_text = resolve(fulltext_string, tokenize)
        sent_tokens = sentence_tokenize(replace_text)
    else:
        sent_tokens = sentence_tokenize(fulltext_string)

    # Keep only documents with enough sentences to label.
    if len(sent_tokens) > args.min_sentences:
        try:
            labels[key] = get_binary_labels(abstract_string, sent_tokens)
            abstracts[key] = abstract_string.strip()
            fulltexts[key] = tokenized
            sentence_tokens[key] = sent_tokens
        except ValueError:
            continue

    est_time = ((time.time() - t) / (i + 1)) * (len(docs) - (i + 1))
    print(
        "Document {} of {} completed. Estimated time remaining: {} seconds\r"
        .format(i + 1, len(docs), round(est_time, 3)))

write_dict_to_json(abstracts, os.path.join(args.output_dir, 'abstracts.json'))
write_dict_to_json(fulltexts, os.path.join(args.output_dir, 'fulltexts.json'))
write_dict_to_json(sentence_tokens,
                   os.path.join(args.output_dir, 'sentence_tokens.json'))
write_dict_to_json(labels, os.path.join(args.output_dir, 'labels.json'))

# Build the train/validation/test key splits and save them alongside the data.
key_set = set(fulltexts.keys())
splits = split_data(key_set, num_train_subset=5000)
write_dict_to_json(splits, os.path.join(args.output_dir, 'data_splits.json'))
from data_fea_gen import gen_feature_file
from data_split import split_data
from model_train import train_model
from model_pred import pred_model
from evaluation import evaluation_result

if __name__ == '__main__':
    # Fetch the raw data to local storage.
    data_to_local()
    # Preprocess the data.
    process_data()
    # Extract features: the argument is the name of the prediction target.
    gen_feature_file('oper_rev')
    # Split the data set by report date.
    split_data('2011-03-31', '2016-12-31', '2017-03-31', '2017-12-31',
               '2017-12-31', '2018-03-31', 'oper_rev')
    # Train the model and save it.
    train_model('train.csv', 'valid.csv', '2018-03-31', 'oper_rev')
    # Predict.
    pred_model('2018-03-31', 'oper_rev')
    # Evaluate.
    evaluation_result('2018-03-31', 'oper_rev')