def _run_one_epoch(self, sess, train_data, valid_data, epoch, saver):
    start_time = time.time()
    data_len = len(train_data)
    num_batches = (data_len + self.batch_size - 1) // self.batch_size
    batches = next_batch(
        train_data, (self.word_vocab, self.pos_vocab, self.label_vocab),
        self.batch_size, shuffle=False)
    # Pick a rotating slice of the training data to validate on each epoch.
    valid_start_index = epoch * self.batch_size
    valid_end_index = (epoch + 1) * self.batch_size
    if valid_start_index >= data_len:
        valid_start_index = 0
        valid_end_index = self.batch_size
    elif valid_end_index > data_len:
        valid_end_index = data_len  # clamp to the end of the data
    train_valid_data = train_data[valid_start_index:valid_end_index]
    for step, (batch_words, batch_poses, batch_labels) in enumerate(batches):
        real_step = epoch * num_batches + step + 1
        feed_dict, _ = self._get_feed_dict(batch_words, batch_poses, batch_labels)
        _, loss, summary, global_step = sess.run(
            [self.train_op, self.loss, self.summary_merged, self.global_step],
            feed_dict=feed_dict)
        self.file_writer.add_summary(summary, global_step=global_step)
        if self.save_flag:
            saver.save(sess, self.model_checkpoint_path, global_step=real_step)
            self.save_flag = False
        # On the last batch, evaluate on the held-out training slice.
        if step + 1 == num_batches and epoch > 0:
            train_predict_label_list, _ = self.valid(sess, train_valid_data)
            tmp_file_path = get_tmp_file_name('output')
            write_result(train_valid_data, train_predict_label_list,
                         self.id2label, tmp_file_path)
            avg_f1 = compute_prf_score(tmp_file_path)
            if avg_f1 > self.best_f1 and avg_f1 > 0.9:
                self.save_flag = True
            print('TRAIN: epoch {}, avg f1 {}\n'.format(epoch, avg_f1))
    # Evaluate on the validation set after every epoch.
    predict_label_list, _ = self.valid(sess, valid_data, training_flag=False)
    tmp_file_path = get_tmp_file_name('output')
    write_result(valid_data, predict_label_list, self.id2label, tmp_file_path)
    avg_f1 = compute_prf_score(tmp_file_path)
    if self.best_f1 < avg_f1:
        self.best_f1 = avg_f1
        self.best_epoch = epoch
        self.save_flag = True
    print('TEST: epoch {}, cost time is {}, avg f1 {}\n'.format(
        epoch, time.time() - start_time, avg_f1))
def replay(self):
    # 1. Updating the variables
    self.results.append(self.clicks)
    self.result_set.add(self.clicks)
    # 2. Writing results
    if self.repeat == 16:
        self.blocks.append([deepcopy(self.results),
                            (datetime.datetime.now() - self.block_start_time).total_seconds(), 1])
        utils.write_result('1', self, False)
        self.results = []
        self.result_set = set()
        self.repeat = 0
        self.block_start_time = datetime.datetime.now()
    else:
        utils.write_result('1', self, True)
    self.frequency[self.clicks] += 1  # frequency calculated later
    self.total_frequency[self.clicks] += 1
    # 3. Checking replay conditions
    if self.repeat == 0 and len(self.blocks) >= int(self.settings['blocks1']) \
            and utils.Stability(self.blocks, float(self.settings['stability'])):
        self.rgb = np.array([0.0, 200.0, 0.0])
        self.win_txt = tkinter.Label(
            self.master,
            bg="#%02x%02x%02x" % (0, 200, 0),
            fg="#%02x%02x%02x" % (0, 200, 0),
            text='ATÉ O MOMENTO VOCÊ ACUMULOU '
                 + str(int(self.points.get()) + int(self.prev_sc.points.get()))
                 + ' PONTOS!',
            font=Font(family='Helvetica', size=16, weight='bold'))
        self.master.after(20, self.fadeResetText)
    else:
        self.clicks = ''
        self.round_start_time = datetime.datetime.now()
        self.main_bg.configure(bg="#%02x%02x%02x" % (255, 255, 255))
        self.createButtons()
        self.ableButtonsAndMouse()
def pvsimulator(fname, qu):
    ctx = pv_new_context().set('curve_start', time(7, 0))
    print("Started simulator")
    if not os.path.isfile(fname):
        utils.write_headers(fname)
        print("Created file " + fname)
    try:
        while True:
            body = qu.get()
            consup = int(body)
            dt = datetime.now()
            ctx = (update_pv_change(ctx, random.random())
                   .set('rnd_changep', random.random())
                   .set('dt', dt))
            ctx = simulate_pv(pv_sba, ctx)
            power = ctx['value']
            print("Processing consumption and pv value: " + body + ", " + str(power))
            utils.write_result(fname, consup, power, dt)
    except Exception as e:
        print(e)
    print("Finished pvsimulator")
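# utils.write_headers and utils.write_result are not shown in the snippet
# above. A minimal sketch of what they might look like, assuming a CSV file
# with one reading per queue message -- the column layout (timestamp,
# consumption, PV power, sum) is an assumption for illustration only:
import csv

def write_headers(fname):
    # Hypothetical header row matching the values passed to write_result.
    with open(fname, 'w', newline='') as f:
        csv.writer(f).writerow(['timestamp', 'consumption_w', 'pv_power_w', 'sum_w'])

def write_result(fname, consumption, power, dt):
    # Append one reading per processed message.
    with open(fname, 'a', newline='') as f:
        csv.writer(f).writerow([dt.isoformat(), consumption, power, consumption + power])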
def replay(self):
    # 1. Updating the variables
    self.results.append(self.clicks)
    self.result_set.add(self.clicks)
    # 2. Writing results
    if self.repeat == 16:
        self.blocks.append([deepcopy(self.results),
                            (datetime.datetime.now() - self.block_start_time).total_seconds(), 1])
        utils.write_result('4-Azul', self, False)
        self.results = []
        self.result_set = set()
        self.repeat = 0
        self.block_start_time = datetime.datetime.now()
        self.frequency[self.clicks] += 1  # frequency calculated later
        self.total_frequency[self.clicks] += 1
        self.master.after(20, self.reset)
    else:
        utils.write_result('4-Azul', self, True)
        self.frequency[self.clicks] += 1  # frequency calculated later
        self.total_frequency[self.clicks] += 1
        # 3. Checking replay conditions
        self.clicks = ''
        self.round_start_time = datetime.datetime.now()
        self.main_bg.configure(bg="#%02x%02x%02x" % (115, 190, 255))
        self.createButtons()
        self.ableButtonsAndMouse()
def replay(self):
    if self.experiment != 1 and self.saved_order[0] == 2:
        if self.clicks[0] == 'E':
            self.clicks = self.left_txt
        else:
            self.clicks = self.right_txt
    print('| Clicks:', self.clicks)
    # 1. Writing results
    if self.repeat == 24:
        self.blocks.append([deepcopy(self.results),
                            (datetime.datetime.now() - self.block_start_time).total_seconds(), 1])
        if self.saved_order[0] == 2:
            self.memo_reinforced.pop(0)
            utils.write_result('4-Amarelo', self, False, True)
        else:
            utils.write_result('4-Amarelo', self, False, False)
        self.results.append(self.clicks)
        self.result_set.add(self.clicks)
        self.frequency[self.clicks] += 1  # frequency calculated later
        self.total_frequency[self.clicks] += 1
        self.saved_order.pop(0)
        self.results = []
        self.result_set = set()
        self.repeat = 0
        self.block_start_time = datetime.datetime.now()
        self.master.after(20, self.reset)
    else:
        if self.saved_order[0] == 2:
            self.memo_reinforced.pop(0)
            utils.write_result('4-Amarelo', self, True, True)
        else:
            utils.write_result('4-Amarelo', self, True, False)
        self.results.append(self.clicks)
        self.result_set.add(self.clicks)
        self.frequency[self.clicks] += 1  # frequency calculated later
        self.total_frequency[self.clicks] += 1
        self.saved_order.pop(0)
        self.clicks = ''
        self.round_start_time = datetime.datetime.now()
        self.main_bg.configure(bg="#%02x%02x%02x" % (255, 255, 110))
        if self.saved_order[0] == 2:
            if self.experiment != 1:
                if self.memo_reinforced[0][1]:
                    self.createJokerButton()
                    self.master.configure(cursor='')
                    self.reset_mouse_position()
                else:
                    self.createImgButtons()
                    self.ableButtonsAndMouse()
            else:
                self.createButtons()
                self.ableButtonsAndMouse()
        else:
            self.createButtons()
            self.ableButtonsAndMouse()
def test(self, data):
    with tf.Session(graph=self.graph) as sess:
        ckpt_path = tf.train.latest_checkpoint(self.model_directory)
        self.saver.restore(sess, ckpt_path)
        self.is_training = False
        predict_label_list, _ = self.valid(sess, data)
        tmp_file_path = get_tmp_file_name('output')
        write_result(data, predict_label_list, self.id2label, tmp_file_path)
        avg_f1 = compute_prf_score(tmp_file_path)
        print(avg_f1)
        print('predict done!!!')
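# The write_result used by this snippet (and by _run_one_epoch above) is not
# shown. Judging from the call sites, it presumably writes one
# "token gold-label predicted-label" line per token so that compute_prf_score
# can run a CoNLL-style evaluation over the file. A minimal sketch under that
# assumption -- the (words, poses, gold_ids) tuple layout is a guess:
def write_result(data, predict_label_list, id2label, path):
    with open(path, 'w') as f:
        for (words, _, gold_ids), pred_ids in zip(data, predict_label_list):
            for word, gold_id, pred_id in zip(words, gold_ids, pred_ids):
                f.write('{} {} {}\n'.format(word, id2label[gold_id], id2label[pred_id]))
            f.write('\n')  # blank line marks the sentence boundary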
def main(unused):
    # Load parameters
    model_params = getattr(params, FLAGS.params)()

    # Define and compile the model
    q_generation = model.QG(model_params)
    q_generation.compile(optimizer=tf.keras.optimizers.Adam(),
                         loss=loss_function, run_eagerly=True,
                         metrics=[BleuScore()])

    # Training dataset
    train_sentence = np.load(FLAGS.train_sentence)  # train_data
    train_question = np.load(FLAGS.train_question)  # train_label
    TRAIN_BUFFER_SIZE = len(train_sentence)
    train_input_data = tf.data.Dataset.from_tensor_slices(
        (train_sentence, train_question)).shuffle(TRAIN_BUFFER_SIZE).batch(
            model_params['batch_size'], drop_remainder=True)

    # Evaluation dataset
    eval_sentence = np.load(FLAGS.eval_sentence)
    eval_question = np.load(FLAGS.eval_question)
    EVAL_BUFFER_SIZE = len(eval_sentence)
    eval_input_data = tf.data.Dataset.from_tensor_slices(
        (eval_sentence, eval_question)).shuffle(EVAL_BUFFER_SIZE).batch(
            model_params['batch_size'], drop_remainder=True)

    # Train and evaluate
    if FLAGS.mode == 'train':
        example_input_batch, example_target_batch = next(iter(train_input_data))
        print("Shape train_input_data: ", example_input_batch.shape,
              example_target_batch.shape)
        q_generation.fit(train_input_data, epochs=FLAGS.num_epochs,
                         validation_data=eval_input_data)
        q_generation.summary()
    elif FLAGS.mode == 'eval':
        q_generation.evaluate(eval_input_data)
        # exp_nn.evaluate(delay_secs=0)
    else:  # 'pred'
        # Load test data
        test_sentence = np.load(FLAGS.test_sentence)

        # Prediction input for the model
        test_input_data = tf.data.Dataset.from_tensor_slices(
            {'enc_inputs': test_sentence}).batch(model_params['batch_size'],
                                                 drop_remainder=True)

        # Prediction
        predict_results = q_generation.predict(test_input_data)

        # Write the generated questions to file
        write_result(predict_results, FLAGS.dic_dir, FLAGS.pred_dir)
def main():
    # Build parser and check arguments
    args = _build_parser()
    _check_args(args)

    # Set up estimators
    '''Estimator names:
        xgb:   XGBoost Classifier
        lgb:   LightGBM Classifier
        log:   Logistic Regression
        knn:   KNeighbors Classifier
        rfo:   RandomForest Classifier
        ada:   AdaBoost Classifier
        ext:   ExtraTrees Classifier
        svc:   Support Vector Classifier
        keras: Keras Neural Networks
    '''
    if args.estimator == 'all':
        estimators = ['xgb', 'lgb', 'log', 'rfo', 'ext', 'ada', 'knn', 'svc']
    else:
        estimators = [args.estimator]

    # Training neural nets with Keras
    if args.train_nn:
        estimator_name = 'keras'
        print('Training %s...' % estimator_name)
        params = {
            'n_features': n_features,
            'n_classes': n_classes,
            'dropout': args.dropout,
            'hidden_unit': args.hidden_unit,
            'n_layers': args.layers,
            'optimizer': args.optimizer,
            'init': args.init,
            'batch_size': args.batch_size,
            'epochs': args.epochs,
        }
        estimator = keras_model(**params)
        train_kwargs = {
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'score_name': args.score,
            'num': args.num,
        }
        _ = estimator.train(**train_kwargs)
        print('params: \n', params)

    # Training random search CV with scikit-learn models
    if args.train_random:
        for estimator_name in estimators:
            print('Training %s...' % estimator_name)
            if estimator_name != 'keras':
                seed = args.seed if args.seed is not None else np.random.randint(100)
                estimator, params = select_model(estimator_name, n_features,
                                                 n_classes, seed)
                # kwargs dict for train and predict
                train_kwargs = {
                    'estimator': estimator,
                    'params': params,
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_val': X_val,
                    'y_val': y_val,
                    'n_iter': args.n_iter,
                    'score_name': args.score,
                }
                # Train model and predict results
                best_params, best_score, val_score = random_model(**train_kwargs)
                timestamp = get_timestamp()
                # Write params to file
                write_params(estimator_name, best_params, best_score,
                             val_score, timestamp, args.num)
            else:  # estimator_name == 'keras'
                space_params = {
                    'n_features': n_features,
                    'n_classes': n_classes,
                    'dropout': hp.uniform('dropout', .20, .80),
                    'hidden_unit': hp.quniform('hidden_unit', 10, 50, q=1),
                    'n_layers': hp.choice('n_layers', [1, 2, 3, 4]),
                    'optimizer': hp.choice('optimizer', ['adam', 'adadelta', 'sgd']),
                    'init': hp.choice('init', ['glorot_uniform', 'normal', 'uniform']),
                    'batch_size': hp.choice('batch_size', [16, 32, 64, 128]),
                    'epochs': hp.quniform('epochs', 100, 1000, q=1),
                    'score_name': args.score,
                    'num': args.num,
                }
                trials = Trials()
                best_params = fmin(random_nn, space_params, algo=tpe.suggest,
                                   max_evals=args.n_iter, trials=trials)
                print('best_params \n', best_params)

    # Evaluate with ensemble method and predict result
    if args.predict:
        eva_kwargs = {
            'estimators': estimators,
            'threshold': args.threshold,
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'X_test': X_test,
            'score_name': args.score,
            'n_classes': n_classes,
        }
        # Predict with ensemble voting and write result
        prediction = ensemble(**eva_kwargs)
        if args.ensemble == 'vote':
            result = prediction.vote()
        elif args.ensemble == 'stack':
            result = prediction.stack(args.num_imp)
        timestamp = get_timestamp()
        write_result(result, label_list, timestamp)
# TODO: The sounds should be recorded and played on a frontend
def play_audio(sr, wav):
    # Duct-tape conversion: scale float samples to 16-bit PCM, round-trip
    # through a WAV file, and play it back.
    wav = np.multiply(wav, (2 ** 15)).astype(np.int16)
    wavfile.write("output.wav", rate=sr, data=wav)
    sound = AudioSegment.from_wav('output.wav')
    play(sound)


if __name__ == "__main__":
    bot = SmallTalkAgent()
    for _ in range(10):
        play(tink)
        user_input = recognize(7)  # 7 is the duration of the recorded sound
        play(morse)
        print(f"you: {user_input}")
        write_result(user_input, 'you')
        response = bot.talk(user_input)
        write_result(response, 'bot')
        print(f"bot: {response}")
        sr, wav = pronounce(response)
        play_audio(sr, wav)
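# write_result is not defined in this snippet; given the calls above it is
# presumably a tiny transcript logger. A hypothetical sketch -- the file name
# and line format are assumptions:
def write_result(text, speaker):
    # Append one "speaker: text" line per utterance to a running transcript.
    with open('dialog_log.txt', 'a') as f:
        f.write(f"{speaker}: {text}\n")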
"loss_docking": train_losses_docking, "loss_screening": train_losses_screening, }, epoch) writer.add_scalars( "test", { "total_loss": test_total_losses, "loss": test_losses, "loss_der1": test_losses_der1, "loss_der2": test_losses_der2, "loss_var": test_losses_var, "loss_docking": test_losses_docking, "loss_screening": test_losses_screening, }, epoch) # Write prediction utils.write_result(args.train_result_filename, train_pred, train_true) utils.write_result(args.test_result_filename, test_pred, test_true) utils.write_result(args.train_result_docking_filename, train_pred_docking, train_true_docking) utils.write_result(args.test_result_docking_filename, test_pred_docking, test_true_docking) utils.write_result(args.train_result_screening_filename, train_pred_screening, train_true_screening) utils.write_result(args.test_result_screening_filename, test_pred_screening, test_true_screening) end = time.time() # Cal R2 train_r2 = r2_score([train_true[k] for k in train_true.keys()], [train_pred[k].sum() for k in train_true.keys()]) test_r2 = r2_score([test_true[k] for k in test_true.keys()],
# df['fscore'] = df['fscore'] / df['fscore'].sum()
# df.to_csv("temp/feat_importance.csv", index=False, encoding="utf-8")
# df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
# plt.rcParams["font.sans-serif"] = ["SimHei"]
# plt.title('XGBoost Feature Importance')
# plt.xlabel('relative importance')
# plt.show()

# Prediction on the held-out test split
preds = bst.predict(dtest)
labels = dtest.get_label()
print(tl.loss_function(preds, labels))

# Load the competition test set
# exam_set = tl.load_match_data("data/d_test_A_20180102.csv")
exam_set = tl.load_match_data("data/d_test_A_20180102_new.csv")
# Preprocess the data
exam_set = tl.pre_process(exam_set)
# Drop the hepatitis-B marker columns
del exam_set["乙肝表面抗原"]
del exam_set["乙肝表面抗体"]
del exam_set["乙肝e抗原"]
del exam_set["乙肝e抗体"]
del exam_set["乙肝核心抗体"]
# Predict on the competition set
exam_set = xgb.DMatrix(exam_set)
y_exam = bst.predict(exam_set)
# Write the results out
tl.write_result(y_exam)
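# tl.write_result is not shown above. For a regression competition like this
# the submission is typically one predicted value per line; a minimal sketch
# under that assumption (the output path is hypothetical):
def write_result(y_pred, out_path='temp/submission.csv'):
    with open(out_path, 'w') as f:
        for value in y_pred:
            f.write('%.3f\n' % value)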
# Not totally resumable if it was stopped during pre-training.
if resume_mode:
    load_model_params(model, save_path_experiment)
    trainer_status = utils.load_dict_from_json_file(
        os.path.join(save_path_experiment, "trainer_status"))

# # TRAINING LEARNER ####
best_epoch, total_train_time = train_model(
    model, dataset, trainingparams['look_ahead'],
    trainingparams['shuffle_mask'], trainingparams['nb_shuffle_per_valid'],
    trainingparams['max_epochs'], trainingparams['batch_size'],
    trainingparams['shuffling_type'], save_path_experiment, trainer_status)

# # Loading best model
load_model_params(model, save_path_experiment)

# # EVALUATING BEST MODEL ####
model_evaluation = {}
print '\n### Evaluating best model from Epoch {0} ###'.format(best_epoch)
for log_prob_func_name in ['test', 'valid', 'train']:
    if trainingparams['shuffle_mask'] > 0:
        model.reset(trainingparams['shuffling_type'])
    if log_prob_func_name == "train":
        model_evaluation[log_prob_func_name] = get_mean_error_and_std_final(
            model, model.train_log_prob_batch,
            dataset[log_prob_func_name]['length'],
            trainingparams['shuffle_mask'],
            trainingparams['shuffling_type'], 1000)
    else:
        model_evaluation[log_prob_func_name] = get_mean_error_and_std(
            model, model.__dict__['{}_log_prob'.format(log_prob_func_name)],
            dataset[log_prob_func_name]['length'],
            trainingparams['shuffle_mask'],
            trainingparams['shuffling_type'], 1000)
    print "\tBest {1} error is : {0:.6f}".format(
        model_evaluation[log_prob_func_name][0], log_prob_func_name.upper())

# # WRITING RESULTS #####
model_info = [
    trainingparams['learning_rate'], trainingparams['decrease_constant'],
    hyperparams['hidden_sizes'], hyperparams['random_seed'],
    hyperparams['hidden_activation'], trainingparams['max_epochs'],
    best_epoch, trainingparams['look_ahead'], trainingparams['batch_size'],
    trainingparams['shuffle_mask'], trainingparams['shuffling_type'],
    trainingparams['nb_shuffle_per_valid'], hyperparams['use_cond_mask'],
    hyperparams['direct_input_connect'], hyperparams['direct_output_connect'],
    trainingparams['pre_training'], trainingparams['pre_training_max_epoc'],
    trainingparams['update_rule'], trainingparams['dropout_rate'],
    hyperparams['weights_initialization'], hyperparams['mask_distribution'],
    float(model_evaluation['train'][0]), float(model_evaluation['train'][1]),
    float(model_evaluation['valid'][0]), float(model_evaluation['valid'][1]),
    float(model_evaluation['test'][0]), float(model_evaluation['test'][1]),
    total_train_time
]
utils.write_result(dataset_name, model_info, experiment_name)
    # Remove duplicate words
    query_text = set(query_text)
    for doc_id in doc_dict:
        # Retrieve the document text
        doc_text = doc_dict[doc_id]
        # Remove duplicate words
        doc_text = set(doc_text)
        # Find the overlap between query and document
        overlap = 0
        for word in query_text:
            if word in doc_text:
                overlap += 1
        # Store the overlap score in the dictionary, keyed by the id tuple
        result[(query_id, doc_id)] = overlap
    return result


if __name__ == '__main__':
    query_dict = utils.process_data('data/qrys.txt')
    doc_dict = utils.process_data('data/docs.txt')
    overlap_scores = calculate_overlap(query_dict, doc_dict)
    with open('results/overlap.top', 'w') as output_file:
        utils.write_result(overlap_scores, output_file)
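# utils.write_result is assumed here to dump the score dictionary as ranked
# run-file lines (the '.top' extension hints at the TREC convention of
# "query_id 0 doc_id rank score tag"). A sketch under that assumption:
def write_result(score_dict, output_file):
    # Group by query, highest score first, and emit one line per pair.
    for (query_id, doc_id), score in sorted(score_dict.items(),
                                            key=lambda kv: (kv[0][0], -kv[1])):
        output_file.write('{} 0 {} 0 {} exp\n'.format(query_id, doc_id, score))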
            precision, recall, f1 = calculate_quality(_OUTPUT_PATTERN % metric,
                                                      _REFERENCE_FILENAME)
            print("Metric %s:" % metric)
            print("\tPrecision: %f, recall: %f, f1: %f" % (precision, recall, f1))
        except:
            pass
    time2 = time()
    print("Run for %f s." % (time2 - time1))
else:
    metric_txt = action
    metric = (dice_metric if action == 'dice'
              else cosine_metric if action == 'cosine'
              else lcs_metric)
    print("Preprocessing data...")
    print("Input: %s" % _INPUT_FILENAME)
    counter = 0
    preprocessed = {}
    result = {}
    with open(_INPUT_FILENAME) as input_file:
        for line in input_file:
            preprocessed_line = process(line)
            preprocessed[line] = preprocessed_line
            if _DEBUG and counter % 50 == 0:
                print("%s => %s" % (line, preprocessed_line))
            counter += 1
    print("Clustering...")
    clusters = cluster(preprocessed, metric, _THRESHOLDS[metric_txt], _DEBUG)
    for line, preprocessed_line in preprocessed.items():
        result[line] = clusters[preprocessed_line]
    print("Writing result...")
    write_result(result, _OUTPUT_PATTERN % metric_txt)
    time2 = time()
    print("Run for %f s." % (time2 - time1))
def train(model, args):
    vocab = json.load(open(args.vocab_path))
    querys = json.load(open(args.query_path))
    gloss = json.load(open(args.gloss_path))
    # Docs = json.load(open(args.docs_path))
    pred = json.load(open(args.preds_path))
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    preds = {}
    # doc_tf = get_doctf(Docs)
    # Doc_inds = prepare(vocab, Docs, args.max_len)
    test_datas = json.load(open(args.datapath))
    train_datas = json.load(open(args.train_path))
    keys = [key for key in gloss]
    if args.only_test:
        datas = test_datas
    if args.merge_train:
        random.shuffle(train_datas)
        datas = train_datas[:len(test_datas)]
    for ele in datas:
        target_word = ele['target_word']
        uid = ele['id']
        preds[uid] = pred[target_word]
    for ele in test_datas:
        target_word = ele['target_word']
        uid = ele['id']
        preds[uid] = pred[target_word]
    prior_preds = preds.copy()
    for i, data in enumerate(datas):
        target_word = data['target_word']
        data['neg_candidates'] = querys[target_word]
        datas[i] = data
    # if args.labeled:
    #     for i, ele in enumerate(train_datas):
    #         uid = ele['id']
    #         target_word = ele['target_word']
    #         preds[uid] = [0 if sense != ele['target_sense'] else 1
    #                       for sense in querys[target_word]]
    #         ele['neg_candidates'] = [sense for sense in querys[target_word]
    #                                  if sense != ele['target_sense']]
    #         train_datas[i] = ele
    #     datas = test_datas + train_datas
    # query_lens = [len(preds[key]) for key in preds]
    # print(max(query_lens))
    # print(sum(query_lens) / len(query_lens))
    # datas = preprocess_data(datas, querys, vocab, preds, args)
    # train_dataset = Traindataset(train_datas, tokenizer, args)
    all_dataset = Mydataset(datas, tokenizer, querys, pred, args)
    parameters = [p for p in model.parameters() if p.requires_grad]
    if args.optim == 'adam':
        optim = Adam(parameters, lr=args.lr, weight_decay=args.weight_decay)
    if args.optim == 'sgd':
        optim = SGD(parameters, lr=args.lr)
    tot_loss = 0
    loss_time = 0
    max_F1 = 0
    max_val_F1 = 0
    stop = 0
    # init
    confirm = 0
    for key in preds:
        lis = preds[key]
        score = max(lis)
        confirm += 1 if score > 0.9 else 0
    logging.info('confirm:%s/%s' % (confirm, len(datas)))
    # print('confirm:', confirm, '/', len(datas))
    # em
    goal_sum = 0
    # preds = E_step(model, datas, querys, vocab, args)  # gloss init
    # M_step
    for epoch in range(args.epoch):
        tot_loss = 0
        loss_time = 0
        tot_pos_score = 0
        tot_neg_score = 0
        tot_eloss = 0
        for i in range(args.update_steps):
            # batch = generate_batch(datas, preds, querys, vocab, keys, args)
            batch = all_dataset.generate_batch(preds, querys, vocab, keys,
                                               gloss, args)
            loss, (pos_score, neg_score, e_loss) = M_step(
                model, batch, querys, vocab, prior_preds, tokenizer, gloss, args)
            optim.zero_grad()
            loss.backward()
            optim.step()
            l = loss.item()
            tot_loss += l
            loss_time += 1
            tot_eloss += e_loss.item()
            tot_pos_score += pos_score.item()
            tot_neg_score += neg_score.item()
            # if i % args.print_every == 0:
            #     print('goal:' + str(cal_goal(model, datas, querys, vocab, preds, args)))
        F1, val_F1, results = test(model, test_datas, querys, vocab, gloss,
                                   tokenizer, prior_preds, args)
        stop += 1
        writer.add_scalars('loss', {
            'loss': tot_loss / loss_time,
            'eloss': tot_eloss / loss_time
        }, epoch)
        writer.add_scalars('scores', {
            'pos_score': tot_pos_score / loss_time,
            'neg_score': tot_neg_score / loss_time
        }, epoch)
        writer.add_scalar('F1', F1, epoch)
        if F1 > max_F1:
            stop = 0
            # Drop the artifacts of the previous best model before saving.
            if os.path.exists(args.save_dir + str(max_F1) + 'all.model.pkl'):
                os.remove(args.save_dir + str(max_F1) + 'all.test.res')
                os.remove(args.save_dir + str(max_F1) + 'all.model.pkl')
                os.remove(args.save_dir + str(max_F1) + 'all.preds.txt')
            max_F1 = F1
            write_result(args.save_dir + str(max_F1) + 'all.test.res', results)
            with open(args.save_dir + str(max_F1) + 'all.preds.txt', 'w') as fout:
                for key in preds:
                    fout.write(key + '\t' + str(preds[key]) + '\n')
            check_point = {}
            check_point['model_dict'] = model.state_dict()
            torch.save(check_point, args.save_dir + str(max_F1) + 'all.model.pkl')
            args.model_path = args.save_dir + str(max_F1) + 'all.model.pkl'
        max_val_F1 = max(val_F1, max_val_F1)
        write_result(args.save_dir + 'epoch' + str(epoch) + 'all.test.res', results)
        with open(args.save_dir + 'epoch' + str(epoch) + 'all.preds.txt', 'w') as fout:
            for key in preds:
                fout.write(key + '\t' + str(preds[key]) + '\n')
        logging.info('\nepoch: %s, avg_loss:%s' % (epoch, tot_loss / loss_time))
        loss_time = 0
        tot_loss = 0
        logging.info('F1: %s/%s' % (F1, max_F1))
        logging.info('val F1: %s/%s' % (val_F1, max_val_F1))
        if stop > 50:
            exit()
        if args.merge_train:
            results = E_step(model, datas, querys, vocab, gloss, tokenizer,
                             prior_preds, args)
            new_preds = {}
            for ele in results:
                tem_pred = [x[1] for x in ele[2]]
                tot = sum(tem_pred)
                tem_pred = [x / tot for x in tem_pred]
                new_preds[ele[0]] = tem_pred
            # Count how many argmax predictions the E-step changed.
            tot_dis = 0
            for key in new_preds:
                new_pred = np.array(new_preds[key])
                old_pred = np.array(preds[key])
                if np.argmax(new_pred) != np.argmax(old_pred):
                    tot_dis += 1
            writer.add_scalar('tot_update_dis', tot_dis, epoch)
            if not args.wo_estep:
                preds.update(new_preds)
            logging.info('E_step')
            confirm = 0
            for key in preds:
                lis = preds[key]
                score = max(lis)
                confirm += 1 if score > 0.9 else 0
            writer.add_scalar('confirm', confirm, epoch)
            logging.info('confirm:%s/%s' % (confirm, len(datas)))
        with open(args.save_dir + 'epoch' + str(epoch) + 'samplecount.txt', 'w') as fout:
            for ele in test_datas:
                key = ele['id']
                neg_count = (all_dataset.neg_counts[key]
                             if key in all_dataset.neg_counts else [])
                fout.write(key + '\t' + str(neg_count) + '\n')
def test(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)
    logger = Logger()
    prefix = ''
    if opt['use_double_length']:
        prefix += '_2'
    print prefix

    if opt['use_char']:
        logger.info('Load char data starting...')
        opt['embed_num'] = opt['char_embed_num']
        embed_mat = np.load(opt['char_embed'])
        test_title = np.load(opt['test_title_char' + prefix])
        test_desc = np.load(opt['test_desc_char' + prefix])
        logger.info('Load char data finished!')
    elif opt['use_word']:
        logger.info('Load word data starting...')
        opt['embed_num'] = opt['word_embed_num']
        embed_mat = np.load(opt['word_embed'])
        test_title = np.load(opt['test_title_word' + prefix])
        test_desc = np.load(opt['test_desc_word' + prefix])
        logger.info('Load word data finished!')
    elif opt['use_char_word']:
        logger.info('Load char-word data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        test_title = np.load(opt['test_title_char' + prefix])
        test_desc = np.load(opt['test_desc_word' + prefix])
        logger.info('Load char-word data finished!')
    elif opt['use_word_char']:
        logger.info('Load word-char data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        test_title = np.load(opt['test_title_word' + prefix])
        test_desc = np.load(opt['test_desc_char' + prefix])
        logger.info('Load word-char data finished!')

    test_idx = np.load(opt['test_idx'])
    topic_idx = np.load(opt['topic_idx'])
    test_dataset = Dataset(test=True, title=test_title, desc=test_desc)
    test_loader = data.DataLoader(test_dataset, shuffle=False,
                                  batch_size=opt['batch_size'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(embed_mat, opt)
    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model, model_dir=opt['model_dir'],
                               model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'],
                               model_name=opt['model'], name=opt['load_name'])
    if opt['device'] is not None:
        torch.cuda.set_device(opt['device'])
    if opt['cuda']:
        model.cuda()

    logger.info('Start testing...')
    model.eval()
    predict_label_list = []
    res = torch.Tensor(opt['test_num'], opt['class_num'])
    for i, batch in enumerate(test_loader, 0):
        batch_size = batch[0].size(0)
        title, desc = batch
        title, desc = Variable(title), Variable(desc)
        if opt['cuda']:
            title, desc = title.cuda(), desc.cuda()
        logit = model(title, desc)
        if opt.get('save_resmat', False):
            res[i * opt['batch_size']:i * opt['batch_size'] + batch_size] = \
                logit.data.cpu()
        predict_label_list += [list(ii) for ii in logit.topk(5, 1)[1].data]
    if opt.get('save_resmat', False):
        torch.save(res, '{}/{}_test_res.pt'.format(opt['result_dir'], opt['model']))
        return

    lines = []
    for qid, top5 in zip(test_idx, predict_label_list):
        topic_ids = [topic_idx[i] for i in top5]
        lines.append('{},{}'.format(qid, ','.join(topic_ids)))
    if opt.get('load_name', None) is None:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'],
                     result_dir=opt['result_dir'])
    else:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'],
                     name=opt['load_name'], result_dir=opt['result_dir'])
def replay(self):
    if self.experiment != 1 and self.order[0] == 2:
        if self.clicks[0] == 'E':
            self.clicks = self.left_txt
        else:
            self.clicks = self.right_txt
    print('| Clicks:', self.clicks)
    # 1. Writing results
    if self.repeat == 24:
        self.blocks.append([deepcopy(self.results),
                            (datetime.datetime.now() - self.block_start_time).total_seconds(), 1])
        if self.order[0] == 2:
            self.memory.pop(0)
            utils.write_result('3', self, False, True)
        else:
            utils.write_result('3', self, False, False)
        self.results.append(self.clicks)
        self.result_set.add(self.clicks)
        self.frequency[self.clicks] += 1  # frequency calculated later
        self.total_frequency[self.clicks] += 1
        self.order.pop(0)
        self.results = []
        self.result_set = set()
        self.repeat = 0
        self.block_start_time = datetime.datetime.now()
        self.order = self.shuffleMode()
        self.memory = self.shuffleMemory()
    else:
        if self.order[0] == 2:
            self.memory.pop(0)
            utils.write_result('3', self, True, True)
        else:
            utils.write_result('3', self, True, False)
        self.results.append(self.clicks)
        self.result_set.add(self.clicks)
        self.frequency[self.clicks] += 1  # frequency calculated later
        self.total_frequency[self.clicks] += 1
        self.order.pop(0)
    # 3. Checking replay conditions
    if self.repeat == 0 and len(self.blocks) >= int(self.settings['blocks3']) \
            and self.reinforcement[-1] \
            and self.memo_accuracy >= int(self.settings['min_memo']):
        self.rgb = np.array([0.0, 200.0, 0.0])
        self.win_txt = tkinter.Label(
            self.master,
            bg="#%02x%02x%02x" % (0, 200, 0),
            fg="#%02x%02x%02x" % (0, 200, 0),
            text='ATÉ O MOMENTO VOCÊ ACUMULOU '
                 + str(int(self.points.get()) + int(self.prev_sc.points.get()))
                 + ' PONTOS!',
            font=Font(family='Helvetica', size=16, weight='bold'))
        self.master.after(20, self.fadeResetText)
    elif self.repeat == 0 and len(self.blocks) >= int(self.settings['blocks3']) \
            and not self.reinforcement[-1] \
            and self.memo_accuracy >= int(self.settings['min_memo']):
        self.rgb = np.array([0.0, 0.0, 0.0])
        self.win_txt = tkinter.Label(
            self.master,
            bg="#%02x%02x%02x" % (0, 0, 0),
            fg="#%02x%02x%02x" % (0, 0, 0),
            text='ATÉ O MOMENTO VOCÊ ACUMULOU '
                 + str(int(self.points.get()) + int(self.prev_sc.points.get()))
                 + ' PONTOS!',
            font=Font(family='Helvetica', size=16, weight='bold'))
        self.master.after(20, self.fadeResetText)
    else:
        self.clicks = ''
        self.round_start_time = datetime.datetime.now()
        self.main_bg.configure(bg="#%02x%02x%02x" % (255, 255, 255))
        if self.order[0] == 2:
            if self.experiment != 1:
                self.createImgButtons()
            else:
                self.createButtons()
        else:
            self.createButtons()
        self.ableButtonsAndMouse()
def test_stack(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)
    logger = Logger()
    result_dir = '/home/dyj/'
    resmat = [result_dir + 'TextCNN1_2017-07-27#12:30:16_test_res.pt',
              result_dir + 'TextCNN2_2017-07-27#12:22:42_test_res.pt',
              result_dir + 'RNN1_2017-07-27#12:35:51_test_res.pt',
              result_dir + 'RNN2_2017-07-27#11:33:24_test_res.pt',
              result_dir + 'RCNN1_2017-07-27#11:30:42_test_res.pt',
              result_dir + 'RCNNcha_2017-07-27#16:00:33_test_res.pt',
              result_dir + 'FastText4_2017-07-28#17:20:21_test_res.pt',
              result_dir + 'FastText1_2017-07-29#10:47:46_test_res.pt']
    opt['stack_num'] = len(resmat)
    test_dataset = Stack_Dataset(resmat=resmat, test=True)
    test_loader = data.DataLoader(test_dataset, shuffle=False,
                                  batch_size=opt['batch_size'])
    test_idx = np.load(opt['test_idx'])
    topic_idx = np.load(opt['topic_idx'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(opt)
    print model
    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model, model_dir=opt['model_dir'],
                               model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'],
                               model_name=opt['model'], name=opt['load_name'])
    if opt['device'] is not None:
        torch.cuda.set_device(opt['device'])
    if opt['cuda']:
        model.cuda()

    logger.info('Start testing...')
    model.eval()
    predict_label_list = []
    res = torch.Tensor(opt['test_num'], opt['class_num'])
    for i, batch in enumerate(test_loader, 0):
        batch_size = batch[0].size(0)
        resmat_batch = [Variable(ii) for ii in batch]
        if opt['cuda']:
            resmat_batch = [ii.cuda() for ii in resmat_batch]
        logit = model(resmat_batch)
        if opt.get('save_resmat', False):
            res[i * opt['batch_size']:i * opt['batch_size'] + batch_size] = \
                logit.data.cpu()
        predict_label_list += [list(ii) for ii in logit.topk(5, 1)[1].data]
    if opt.get('save_resmat', False):
        torch.save(res, '{}/{}_{}_test_res.pt'.format(
            opt['result_dir'], opt['model'],
            datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')))
        return

    lines = []
    for qid, top5 in zip(test_idx, predict_label_list):
        topic_ids = [topic_idx[i] for i in top5]
        lines.append('{},{}'.format(qid, ','.join(topic_ids)))
    if opt.get('load_name', None) is None:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'],
                     result_dir=opt['result_dir'])
    else:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'],
                     name=opt['load_name'], result_dir=opt['result_dir'])
        keys_list.append(keys)
    print(keys_list)
    return keys_list


def load_excel(filename: str, sheet_index: int, title_col: int):
    excel_file = xlrd.open_workbook(filename)
    sheet = excel_file.sheet_by_index(sheet_index)
    article_list_ = []
    for i in range(1, sheet.nrows):
        article = {'title': sheet.cell(i, title_col).value}
        article_list_.append(article)
    return article_list_


if __name__ == '__main__':
    EXTENSIONS = ['xls', 'xlsm', 'xlsx']
    std_keys = utils.load_std_keys()
    dir_ = 'F:/pycharm/zbj2/test_data/'
    for path in os.listdir(dir_):
        if path.split('.')[-1] in EXTENSIONS:
            article_list = load_excel(dir_ + path, sheet_index=1, title_col=0)
            # keys_list_ = load_key('C:/excel/数据.xls')
            result_list = utils.classify_subject(article_list, std_keys,
                                                 index='std_key')
            utils.write_result(dir_ + path, result_list, w_key_col=5,
                               sheet_index=1, w_id_col=4)
    # article_list = load_excel('C:/excel/信息公开目录.xls')
    # # keys_list_ = load_key('C:/excel/数据.xls')
    # result_list = classify(article_list, keys_list_new)
    # write_result('C:/excel/信息公开目录.xls', result_list)
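# utils.write_result is assumed to write the classification results back into
# the workbook at the column indices given in the call above. A sketch using
# xlrd plus xlutils; the result_list keys ('id', 'std_key') are assumptions:
from xlrd import open_workbook
from xlutils.copy import copy as copy_workbook

def write_result(filename, result_list, w_key_col, sheet_index, w_id_col):
    book = copy_workbook(open_workbook(filename))
    sheet = book.get_sheet(sheet_index)
    for row, result in enumerate(result_list, start=1):  # row 0 is the header
        sheet.write(row, w_id_col, result.get('id', ''))
        sheet.write(row, w_key_col, result.get('std_key', ''))
    book.save(filename)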
        doc_text = doc_dict[doc_id]
        top_doc_word_list += doc_text
    # Find the most frequent words
    freq_dist = nltk.FreqDist(word for word in top_doc_word_list)
    best_words = [word for word, _ in freq_dist.most_common(num_words)]
    # Add them to the query
    new_query = query_text + best_words
    # Recalculate the tf-idf scores and add them to the score dictionary
    for doc_id, tfidf_score in calculate_tfidf(new_query, doc_dict,
                                               average_doc_length, k):
        score_dict[query_id, doc_id] = tfidf_score
    return score_dict


if __name__ == "__main__":
    query_dict = utils.process_data('data/qrys.txt')
    doc_dict = utils.process_data('data/docs.txt')
    standard_tfidf_scores = standard_tfidf(query_dict, doc_dict)
    with open('results/tfidf.top', 'w') as output_file:
        utils.write_result(standard_tfidf_scores, output_file)
    tfidf_with_prf_scores = tfidf_pseudo_relevance_feedback(query_dict, doc_dict)
    with open('results/best.top', 'w') as output_file:
        utils.write_result(tfidf_with_prf_scores, output_file)
from utils import read_subject, write_result

for f in ["subject/a_example.in",
          "subject/b_should_be_easy.in",
          "subject/c_no_hurry.in",
          "subject/d_metropolis.in",
          "subject/e_high_bonus.in"]:
    STEP_COUNT, BONUS, rides, cars = read_subject(f)
    step = 0
    print f, STEP_COUNT, len(cars), len(rides)
    while step < STEP_COUNT:
        if step % 100 == 0:
            print step
        for car in cars:
            if car.available(step):
                r = car.findRide(rides, step, STEP_COUNT)
                if r is not None:
                    rides.remove(r)
        step += 1
    write_result(f, cars)
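# write_result is imported from utils but not shown. For this kind of
# ride-assignment problem the submission format is typically one line per
# vehicle: the ride count followed by the assigned ride ids. A sketch under
# that assumption -- the car.rides attribute and the output path are guesses:
def write_result(subject_filename, cars):
    out_name = subject_filename.replace('.in', '.out')
    with open(out_name, 'w') as out:
        for car in cars:
            # car.rides is assumed to hold the ride ids in pickup order.
            out.write('%d %s\n' % (len(car.rides),
                                   ' '.join(str(r) for r in car.rides)))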