def load_data_nonclass(filepath_news, filepath_stock):
    # Read raw news articles and split them into sentences
    newslist = preprocess.read_news(filepath_news)
    sentences = preprocess.news_to_sentences(newslist)

    # Read the stock prices and scale them
    prices = preprocess.read_price(filepath_stock)
    prices = scaler.fit_transform(prices)

    # Convert the sentences into a numeric array
    news = preprocess.sentences_to_nparray(sentences)
    # news = scaler_news.fit_transform(news)

    # Build look-back windows of historical prices and their target values
    hisprice, y = preprocess.data_process(prices, look_back)
    return news[look_back:], hisprice, y
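# `preprocess.data_process` is not shown in this section. Below is a minimal
# sketch of how a look-back style helper is commonly implemented, assuming it
# turns a scaled price series into (history window, next value) pairs; the
# actual implementation may differ.
import numpy as np

def _data_process_sketch(prices, look_back):
    x, y = [], []
    for i in range(len(prices) - look_back):
        x.append(prices[i:i + look_back])  # window of `look_back` past prices
        y.append(prices[i + look_back])    # value to predict
    return np.array(x), np.array(y)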
def inference(model):
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = data_process(valid_dir, config.seq_length)

    # Create session and restore the model checkpoint
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=saver_dir)

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    # Generate predictions batch by batch
    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1
    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls,
                                                  feed_dict=feed_dict)

    # Evaluate precision, recall and F1-score per class
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_pred_cls,
                                        target_names=label_category))

    # Print confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
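# The `evaluate` helper used above (and in the training loop below) is not
# shown in this section. A minimal sketch, assuming the model exposes `loss`,
# `acc` and `keep_prob` tensors and that `batch_iter` yields (x, y) batches;
# the real `evaluate(session, x_, y_)` presumably closes over a module-level
# `model` instead of taking it as an argument.
def _evaluate_sketch(session, x_, y_, model, batch_size=128):
    total_loss, total_acc, data_len = 0.0, 0.0, len(x_)
    for x_batch, y_batch in batch_iter(x_, y_, batch_size):
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch,
            model.keep_prob: 1.0  # no dropout at evaluation time
        }
        loss, acc = session.run([model.loss, model.acc], feed_dict=feed_dict)
        total_loss += loss * len(x_batch)
        total_acc += acc * len(x_batch)
    return total_loss / data_len, total_acc / data_len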
def get_dataloader(args):
    data = None
    setattr(args, 'mode', 'test')
    if args.raw_file:
        # Tokenize the raw file and map tokens to source-vocabulary indices
        source = data_process(filelist=[args.raw_file],
                              word2index=args.src_word2index,
                              lower=args.lower)
        del args.src_word2index
        max_src_len = max(len(seq) for seq in source)
        data = {'source': source, 'max_src_len': max_src_len}
    dataset, batch_size = get_data(args=args, data=data)
    dataset = DataLoader(dataset=dataset,
                         batch_size=batch_size,
                         shuffle=False,
                         num_workers=0,
                         pin_memory=True)
    setattr(args, 'data', dataset)
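# A hypothetical usage sketch for `get_dataloader`. The attribute names
# (`raw_file`, `src_word2index`, `lower`, `data`) follow the function above;
# the path, the toy vocabulary and the downstream loop are placeholders.
from argparse import Namespace

def _run_test_sketch():
    args = Namespace(raw_file="data/test.txt",               # hypothetical path
                     src_word2index={"<pad>": 0, "<unk>": 1}, # normally the training vocab
                     lower=True)
    get_dataloader(args)         # attaches a torch DataLoader to args.data
    for batch in args.data:      # iterate test batches; shuffling is disabled
        pass                     # feed `batch` to the model here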
def train(model):
    """Train model: load data, run training, save the best model and print results."""
    # Configure TensorBoard and summaries
    print('Configuring TensorBoard and Saver ...')
    if not os.path.exists(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    tf.summary.scalar('loss', model.loss)
    tf.summary.scalar('accuracy', model.acc)
    merged_summary = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure the graph saver
    saver = tf.train.Saver()
    if not os.path.exists(saver_dir):
        os.mkdir(saver_dir)

    # Load training data and validation data
    print('Loading training data and validation data ...')
    start_time = time.time()
    x_train, y_train = data_process(train_dir, config.max_length)
    x_valid, y_valid = data_process(valid_dir, config.max_length)
    time_dif = get_time_dif(start_time)
    print('Loading data ok!')
    print('Time usage: %f' % time_dif)

    # Create session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    summary_writer.add_graph(session.graph)

    # Bookkeeping for training progress and early stopping
    total_batch = 0
    best_val_acc = 0.
    last_improved = 0
    early_stop_batch = 1000

    print('Training and evaluating ...')
    start_time = time.time()
    is_early_stop = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = {
                model.input_x: x_batch,
                model.input_y: y_batch,
                model.dropout_keep_prob: config.dropout_keep_prob
            }

            # Every save_per_batch batches, write a summary
            if total_batch % config.save_per_batch == 0:
                summary = session.run(merged_summary, feed_dict=feed_dict)
                summary_writer.add_summary(summary, total_batch)

            # Every print_per_batch batches, report train/validation metrics
            if total_batch % config.print_per_batch == 0:
                feed_dict[model.dropout_keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc],
                                                    feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, x_valid, y_valid)

                # Save the best model by validation accuracy
                if acc_val > best_val_acc:
                    best_val_acc = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=saver_dir)
                    improved_str = '//improved'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train,
                                 loss_val, acc_val, time_dif, improved_str))

            # Restore dropout before the optimization step, then optimize
            feed_dict[model.dropout_keep_prob] = config.dropout_keep_prob
            session.run(model.optim, feed_dict=feed_dict)
            total_batch += 1

            # Early stop when validation accuracy has not improved for a while
            if total_batch - last_improved > early_stop_batch:
                print("No optimization for a long time, auto-stopping...")
                is_early_stop = True
                break
        if is_early_stop:
            break
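# The `batch_iter` generator used in the training loop is not shown in this
# section. A minimal sketch, assuming x and y are index-aligned numpy arrays
# and each epoch is reshuffled; the real helper may differ.
def _batch_iter_sketch(x, y, batch_size=64):
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1
    indices = np.random.permutation(np.arange(data_len))  # shuffle every epoch
    x_shuffled, y_shuffled = x[indices], y[indices]
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffled[start_id:end_id], y_shuffled[start_id:end_id]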
def train(chm, model_name, data_path, generations, window_size, smooth_size,
          missing, n_cores, verbose):

    if verbose:
        print("Preprocessing data...")

    # ------------------ Config ------------------
    model_name += "_chm_" + chm
    model_repo = join_paths("./" + instance_name, "models", verb=False)
    model_repo = join_paths(model_repo, model_name, verb=False)
    model_path = model_repo + "/" + model_name + ".pkl"

    train1_paths = [
        data_path + "/chm" + chm + "/simulation_output/train1/gen_" + str(gen) + "/"
        for gen in generations
    ]
    train2_paths = [
        data_path + "/chm" + chm + "/simulation_output/train2/gen_" + str(gen) + "/"
        for gen in generations
    ]
    val_paths = [
        data_path + "/chm" + chm + "/simulation_output/val/gen_" + str(gen) + "/"
        for gen in generations
    ]  # only validate on 4th gen

    position_map_file = data_path + "/chm" + chm + "/positions.txt"
    reference_map_file = data_path + "/chm" + chm + "/references.txt"
    population_map_file = data_path + "/populations.txt"

    # ------------------ Process data ------------------
    # gather feature data files (binary representation of variants)
    X_fname = "mat_vcf_2d.npy"
    X_train1_files = [p + X_fname for p in train1_paths]
    X_train2_files = [p + X_fname for p in train2_paths]
    X_val_files = [p + X_fname for p in val_paths]

    # gather label data files (population)
    labels_fname = "mat_map.npy"
    labels_train1_files = [p + labels_fname for p in train1_paths]
    labels_train2_files = [p + labels_fname for p in train2_paths]
    labels_val_files = [p + labels_fname for p in val_paths]

    # load the data
    train_val_files = [
        X_train1_files, labels_train1_files, X_train2_files,
        labels_train2_files, X_val_files, labels_val_files
    ]
    X_train1_raw, labels_train1_raw, X_train2_raw, labels_train2_raw, \
        X_val_raw, labels_val_raw = [load_np_data(f) for f in train_val_files]

    # adding generation 0
    if gen_0:
        if verbose:
            print("Including generation 0...")
        # get it
        gen_0_sets = ["train1", "train2"]
        X_train1_raw_gen_0, y_train1_raw_gen_0, X_train2_raw_gen_0, y_train2_raw_gen_0 = \
            get_gen_0(data_path + "/chm" + chm, population_map_file, gen_0_sets)
        # add it
        X_train1_raw = np.concatenate([X_train1_raw, X_train1_raw_gen_0])
        labels_train1_raw = np.concatenate([labels_train1_raw, y_train1_raw_gen_0])
        X_train2_raw = np.concatenate([X_train2_raw, X_train2_raw_gen_0])
        labels_train2_raw = np.concatenate([labels_train2_raw, y_train2_raw_gen_0])
        # delete it
        del X_train1_raw_gen_0, y_train1_raw_gen_0, X_train2_raw_gen_0, y_train2_raw_gen_0

    # reshape according to window size
    X_train1, labels_window_train1 = data_process(X_train1_raw, labels_train1_raw,
                                                  window_size, missing)
    X_train2, labels_window_train2 = data_process(X_train2_raw, labels_train2_raw,
                                                  window_size, missing)
    X_val, labels_window_val = data_process(X_val_raw, labels_val_raw,
                                            window_size, missing)
    del X_train1_raw, X_train2_raw, X_val_raw
    del labels_train1_raw, labels_train2_raw, labels_val_raw

    # necessary arguments for the model
    snp_pos = np.loadtxt(position_map_file, delimiter='\n').astype("int")
    snp_ref = np.loadtxt(reference_map_file, delimiter='\n', dtype=str)
    pop_order = np.genfromtxt(population_map_file, dtype="str")
    chm_len = len(snp_pos)
    num_anc = len(pop_order)

    # ------------------ Train model ------------------
    # init, train, evaluate and save the model
    if verbose:
        print("Initializing XGMix model and training...")
    model = XGMIX(chm_len, window_size, smooth_size, num_anc, snp_pos, snp_ref,
                  pop_order, calibrate=calibrate, cores=n_cores)
    model.train(X_train1, labels_window_train1, X_train2, labels_window_train2,
                X_val, labels_window_val, retrain_base=retrain_base, verbose=verbose)

    # evaluate the model on the validation windows
    analysis_path = join_paths(model_repo, "analysis", verb=False)
    CM(labels_window_val.ravel(), model.predict(X_val).ravel(), pop_order,
       analysis_path, verbose)

    # save the trained model
    pickle.dump(model, open(model_path, "wb"))

    return model
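# A hypothetical invocation sketch for the XGMix-style `train` entry point
# above. The chromosome, paths, generation list and window/smoother sizes are
# placeholders, and the module-level settings it relies on (`instance_name`,
# `gen_0`, `calibrate`, `retrain_base`) are assumed to be set in the config.
def _train_example_sketch():
    trained_model = train(chm="20",
                          model_name="xgmix_model",
                          data_path="./generated_data",
                          generations=[2, 4, 6, 8],
                          window_size=500,
                          smooth_size=75,
                          missing=0.0,
                          n_cores=4,
                          verbose=True)
    return trained_model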