Example #1
    def _run_one_epoch(self, sess, train_data, valid_data, epoch, saver):
        save_flag = False
        start_time = time.time()
        data_len = len(train_data)
        num_batches = (data_len + self.batch_size - 1) // self.batch_size
        batches = next_batch(
            train_data, (self.word_vocab, self.pos_vocab, self.label_vocab),
            self.batch_size,
            shuffle=False)
        valid_start_index = epoch * self.batch_size
        valid_end_index = (epoch + 1) * self.batch_size
        if valid_start_index >= len(train_data):
            valid_start_index = 0
            valid_end_index = self.batch_size
        else:
            if valid_end_index <= len(train_data):
                valid_end_index = (epoch + 1) * self.batch_size
            else:
                valid_end_index = -1
        train_valid_data = train_data[valid_start_index:valid_end_index]
        for step, (batch_words, batch_poses,
                   batch_labels) in enumerate(batches):
            real_step = epoch * num_batches + step + 1
            feed_dict, _ = self._get_feed_dict(batch_words, batch_poses,
                                               batch_labels)
            _, loss, summary, global_step = sess.run([
                self.train_op, self.loss, self.summary_merged, self.global_step
            ],
                                                     feed_dict=feed_dict)
            self.file_writer.add_summary(summary, global_step=global_step)
            if epoch >= 0 and self.save_flag:
                saver.save(sess,
                           self.model_checkpoint_path,
                           global_step=real_step)
                self.save_flag = False
            if step + 1 == num_batches and epoch > 0:
                train_predict_label_list, _ = self.valid(
                    sess, train_valid_data)
                tmp_file_path = get_tmp_file_name('output')
                write_result(train_valid_data, train_predict_label_list,
                             self.id2label, tmp_file_path)
                avg_f1 = compute_prf_score(tmp_file_path)
                if avg_f1 > self.best_f1 and avg_f1 > 0.9:
                    self.save_flag = True
                print('TRAIN: epoch {}, avg f1 {}\n'.format(epoch, avg_f1))

        predict_label_list, _ = self.valid(sess,
                                           valid_data,
                                           training_flag=False)
        tmp_file_path = get_tmp_file_name('output')
        write_result(valid_data, predict_label_list, self.id2label,
                     tmp_file_path)
        avg_f1 = compute_prf_score(tmp_file_path)
        if self.best_f1 < avg_f1:
            self.best_f1 = avg_f1
            self.best_epoch = epoch
            self.save_flag = True
        print('TEST: epoch {}, cost time is {}, avg f1 {}\n'.format(
            epoch,
            time.time() - start_time, avg_f1))
Example #2
    def replay(self):
        # 1. Updating the variables
        self.results.append(self.clicks)
        self.result_set.add(self.clicks)

        # 2. Writing results
        if self.repeat == 16:
            self.blocks.append([deepcopy(self.results),\
            (datetime.datetime.now() - self.block_start_time).total_seconds() ,1])
            utils.write_result('1', self, False)

            self.results = []
            self.result_set = set()
            self.repeat = 0
            self.block_start_time = datetime.datetime.now()
        else:
            utils.write_result('1', self, True)

        self.frequency[self.clicks] += 1  #frequency calculated later
        self.total_frequency[self.clicks] += 1

        # 3. Checking replay conditions
        if self.repeat == 0 and len(self.blocks) >= int(self.settings['blocks1'])\
        and utils.Stability(self.blocks,float(self.settings['stability'])):
            self.rgb = np.array([0.0, 200.0, 0.0])
            self.win_txt = tkinter.Label(self.master, bg= "#%02x%02x%02x" % (0, 200, 0), fg = "#%02x%02x%02x" % (0, 200, 0),\
              text='ATÉ O MOMENTO VOCÊ ACUMULOU '+str(int(self.points.get())+int(self.prev_sc.points.get()))+\
              ' PONTOS!', font=Font(family='Helvetica', size=16, weight='bold'))
            self.master.after(20, self.fadeResetText)
        else:
            self.clicks = ''
            self.round_start_time = datetime.datetime.now()
            self.main_bg.configure(bg="#%02x%02x%02x" % (255, 255, 255))
            self.createButtons()
            self.ableButtonsAndMouse()
Example #3
def pvsimulator(fname, qu):
    ctx = pv_new_context().set('curve_start', time(7, 0))
    print("Started simulator")

    if not os.path.isfile(fname):
        utils.write_headers(fname)
        print("Created file " + fname)

    try:
        while True:
            body = qu.get()
            consup = int(body)
            dt = datetime.now()
            ctx = (update_pv_change(ctx, random.random()).set(
                'rnd_changep', random.random()).set('dt', dt))
            ctx = simulate_pv(pv_sba, ctx)
            power = ctx['value']

            print("Processing consumption and pv value: " + body + ", " +
                  str(power))
            utils.write_result(fname, consup, power, dt)

    except Exception as e:
        print(e)
        print("Finished pvsimulator")
Example #4
    def replay(self):
        # 1. Updating the variables
        self.results.append(self.clicks)
        self.result_set.add(self.clicks)

        # 2. Writing results
        if self.repeat == 16:
            self.blocks.append([deepcopy(self.results),\
            (datetime.datetime.now() - self.block_start_time).total_seconds() ,1])
            utils.write_result('4-Azul', self, False)

            self.results = []
            self.result_set = set()
            self.repeat = 0
            self.block_start_time = datetime.datetime.now()

            self.frequency[self.clicks] += 1  #frequency calculated later
            self.total_frequency[self.clicks] += 1

            self.master.after(20, self.reset)

        else:
            utils.write_result('4-Azul', self, True)

            self.frequency[self.clicks] += 1  #frequency calculated later
            self.total_frequency[self.clicks] += 1

            # 3. Checking replay conditions
            self.clicks = ''
            self.round_start_time = datetime.datetime.now()
            self.main_bg.configure(bg="#%02x%02x%02x" % (115, 190, 255))
            self.createButtons()
            self.ableButtonsAndMouse()
Example #5
    def replay(self):
        if self.experiment != 1 and self.saved_order[0] == 2:
            if self.clicks[0] == 'E':
                self.clicks = self.left_txt
            else:
                self.clicks = self.right_txt
        print('| Clicks:', self.clicks)

        # 1. Writing results
        if self.repeat == 24:
            self.blocks.append([deepcopy(self.results),\
            (datetime.datetime.now() - self.block_start_time).total_seconds() ,1])
            if self.saved_order[0] == 2:
                self.memo_reinforced.pop(0)
                utils.write_result('4-Amarelo', self, False, True)
            else:
                utils.write_result('4-Amarelo', self, False, False)
                self.results.append(self.clicks)
                self.result_set.add(self.clicks)
                self.frequency[self.clicks] += 1  #frequency calculated later
                self.total_frequency[self.clicks] += 1
            self.saved_order.pop(0)

            self.results = []
            self.result_set = set()
            self.repeat = 0
            self.block_start_time = datetime.datetime.now()

            self.master.after(20, self.reset)

        else:
            if self.saved_order[0] == 2:
                self.memo_reinforced.pop(0)
                utils.write_result('4-Amarelo', self, True, True)
            else:
                utils.write_result('4-Amarelo', self, True, False)
                self.results.append(self.clicks)
                self.result_set.add(self.clicks)
                self.frequency[self.clicks] += 1  #frequency calculated later
                self.total_frequency[self.clicks] += 1
            self.saved_order.pop(0)

            self.clicks = ''
            self.round_start_time = datetime.datetime.now()
            self.main_bg.configure(bg="#%02x%02x%02x" % (255, 255, 110))
            if self.saved_order[0] == 2:
                if self.experiment != 1:
                    if self.memo_reinforced[0][1]:
                        self.createJokerButton()
                        self.master.configure(cursor='')
                        self.reset_mouse_position()
                    else:
                        self.createImgButtons()
                        self.ableButtonsAndMouse()
                else:
                    self.createButtons()
                    self.ableButtonsAndMouse()
            else:
                self.createButtons()
                self.ableButtonsAndMouse()
Example #6
    def test(self, data):
        with tf.Session(graph=self.graph) as sess:
            ckpt_path = tf.train.latest_checkpoint(self.model_directory)
            self.saver.restore(sess, ckpt_path)
            self.is_training = False
            predict_label_list, _ = self.valid(sess, data)
            tmp_file_path = get_tmp_file_name('output')
            write_result(data, predict_label_list, self.id2label,
                         tmp_file_path)
            avg_f1 = compute_prf_score(tmp_file_path)
            print(avg_f1)
            print('predict done!!!')

        pass
Example #7
def main(unused):
    # Load parameters
    model_params = getattr(params, FLAGS.params)()

    # Define estimator
    q_generation = model.QG(model_params)

    q_generation.compile(optimizer=tf.keras.optimizers.Adam(),
                         loss=loss_function,
                         run_eagerly=True,
                         metrics=[BleuScore()])

    # Training dataset
    train_sentence = np.load(FLAGS.train_sentence)  # train_data
    train_question = np.load(FLAGS.train_question)  # train_label
    TRAIN_BUFFER_SIZE = len(train_sentence)
    train_input_data = tf.data.Dataset.from_tensor_slices((train_sentence, train_question)).shuffle(
        TRAIN_BUFFER_SIZE).batch(model_params['batch_size'], drop_remainder=True)

    # Evaluation dataset
    eval_sentence = np.load(FLAGS.eval_sentence)
    eval_question = np.load(FLAGS.eval_question)
    EVAL_BUFFER_SIZE = len(eval_sentence)
    eval_input_data = tf.data.Dataset.from_tensor_slices((eval_sentence, eval_question)).shuffle(
        EVAL_BUFFER_SIZE).batch(model_params['batch_size'], drop_remainder=True)

    # train and evaluate
    if FLAGS.mode == 'train':
        example_input_batch, example_target_batch = next(iter(train_input_data))
        print("Shape train_input_data: ", example_input_batch.shape, example_target_batch.shape)
        q_generation.fit(train_input_data,
                         epochs=FLAGS.num_epochs,
                         validation_data=eval_input_data)
        q_generation.summary()

    elif FLAGS.mode == 'eval':
        q_generation.evaluate(eval_input_data)
        # exp_nn.evaluate(delay_secs=0)

    else:  # 'pred'
        # Load test data
        test_sentence = np.load(FLAGS.test_sentence)

        # prediction input function for estimator
        test_input_data = tf.data.Dataset.from_tensor_slices(
            {'enc_inputs': test_sentence}).batch(model_params['batch_size'], drop_remainder=True)

        # prediction
        predict_results = q_generation.predict(test_input_data)

        # write result(question) into file
        write_result(predict_results, FLAGS.dic_dir, FLAGS.pred_dir)
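Example #8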
def main():

    # build parser and check arguments
    args = _build_parser()
    _check_args(args)

    # Setup Estimator
    '''Estimator name: 
    xgb: XGBoost Classifier
    log: Logistic Regression
    knn: KNeighbors Classifier
    rfo: RandomForest Classifier 
    ada: AdaBoost Classifier
    ext: ExtraTrees Classifier
    svc: Support Vector Classifier
    keras: Keras Neural Networks
    '''

    if args.estimator == 'all':
        estimators = ['xgb', 'lgb', 'log', 'rfo', 'ext', 'ada', 'knn', 'svc']
    else:
        estimators = [args.estimator]

    # Training neural nets with keras
    if args.train_nn:
        estimator_name = 'keras'
        print('Training %s...' % estimator_name)

        params = {
            'n_features': n_features,
            'n_classes': n_classes,
            'dropout': args.dropout,
            'hidden_unit': args.hidden_unit,
            'n_layers': args.layers,
            'optimizer': args.optimizer,
            'init': args.init,
            'batch_size': args.batch_size,
            'epochs': args.epochs,
        }
        estimator = keras_model(**params)

        train_kwargs = {
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'score_name': args.score,
            'num': args.num
        }
        _ = estimator.train(**train_kwargs)
        print('params: \n', params)

    # Training random search CV with scikit-learn models
    if args.train_random:
        for estimator_name in estimators:
            print('Training %s...' % estimator_name)

            if not estimator_name == 'keras':
                seed = args.seed if args.seed != None else np.random.randint(
                    100)
                estimator, params = select_model(estimator_name, n_features,
                                                 n_classes, seed)

                # kwargs dict for train and predict
                train_kwargs = {
                    'estimator': estimator,
                    'params': params,
                    'X_train': X_train,
                    'y_train': y_train,
                    'X_val': X_val,
                    'y_val': y_val,
                    'n_iter': args.n_iter,
                    'score_name': args.score,
                }

                # Train model and Predict results
                best_params, best_score, val_score = random_model(
                    **train_kwargs)
                timestamp = get_timestamp()

                # Write params to file
                write_params(estimator_name, best_params, best_score,
                             val_score, timestamp, args.num)

            elif estimator_name == 'keras':

                space_params = {
                    'n_features': n_features,
                    'n_classes': n_classes,
                    'dropout': hp.uniform('dropout', .20, .80),
                    'hidden_unit': hp.quniform('hidden_unit', 10, 50, q=1),
                    'n_layers': hp.choice('n_layers', [1, 2, 3, 4]),
                    'optimizer': hp.choice('optimizer', ['adam', 'adadelta', 'sgd']),
                    'init': hp.choice('init', ['glorot_uniform', 'normal', 'uniform']),
                    'batch_size': hp.choice('batch_size', [16, 32, 64, 128]),
                    'epochs': hp.quniform('epochs', 100, 1000, q=1),
                    'score_name': args.score,
                    'num': args.num,
                }
                trials = Trials()
                best_params = fmin(random_nn,
                                   space_params,
                                   algo=tpe.suggest,
                                   max_evals=args.n_iter,
                                   trials=trials)
                print('best_params \n', best_params)

    # Evaluate with ensemble method and predict result
    if args.predict:

        eva_kwargs = {
            'estimators': estimators,
            'threshold': args.threshold,
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val,
            'X_test': X_test,
            'score_name': args.score,
            'n_classes': n_classes,
        }

        # Predict with ensemble voting and write result
        prediction = ensemble(**eva_kwargs)
        if args.ensemble == 'vote':
            result = prediction.vote()
        elif args.ensemble == 'stack':
            result = prediction.stack(args.num_imp)

        timestamp = get_timestamp()
        write_result(result, label_list, timestamp)
Example #9
# TODO: The sounds should be recorded and played on a frontend

def play_audio(sr, wav):
    # I bet you've seen this duct-tape before
    wav = np.multiply(wav, (2**15)).astype(np.int16)
    wavfile.write("output.wav", rate=sr, data=wav)
    sound = AudioSegment.from_wav('output.wav')
    play(sound)


if __name__ == "__main__":
    bot = SmallTalkAgent()

    for _ in range(10):
        play(tink)
        user_input = recognize(7)  # 7 is the duration of the recorded sound
        play(morse)

        print(f"you: {user_input}")
        write_result(user_input, 'you')

        response = bot.talk(user_input)
        write_result(response, 'bot')

        print(f"bot: {response}")

        sr, wav = pronounce(response)
        play_audio(sr, wav)
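
The write_result used in this loop is not shown in the snippet. A minimal sketch of what such a chat-log helper could look like, assuming it simply appends speaker-tagged lines to a transcript file (the file name and format are guesses, not the project's actual implementation):

# Hypothetical chat-log helper (assumed behaviour, not from the source project).
def write_result(text, speaker, path='dialogue.log'):
    # Append one "<speaker>: <text>" line per utterance.
    with open(path, 'a', encoding='utf-8') as log:
        log.write(f"{speaker}: {text}\n")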

Example #10
            "loss_docking": train_losses_docking,
            "loss_screening": train_losses_screening,
        }, epoch)
    writer.add_scalars(
        "test", {
            "total_loss": test_total_losses,
            "loss": test_losses,
            "loss_der1": test_losses_der1,
            "loss_der2": test_losses_der2,
            "loss_var": test_losses_var,
            "loss_docking": test_losses_docking,
            "loss_screening": test_losses_screening,
        }, epoch)

    # Write prediction
    utils.write_result(args.train_result_filename, train_pred, train_true)
    utils.write_result(args.test_result_filename, test_pred, test_true)
    utils.write_result(args.train_result_docking_filename, train_pred_docking,
                       train_true_docking)
    utils.write_result(args.test_result_docking_filename, test_pred_docking,
                       test_true_docking)
    utils.write_result(args.train_result_screening_filename,
                       train_pred_screening, train_true_screening)
    utils.write_result(args.test_result_screening_filename,
                       test_pred_screening, test_true_screening)
    end = time.time()

    # Cal R2
    train_r2 = r2_score([train_true[k] for k in train_true.keys()],
                        [train_pred[k].sum() for k in train_true.keys()])
    test_r2 = r2_score([test_true[k] for k in test_true.keys()],
Example #11
                model.__dict__['{}_log_prob'.format(log_prob_func_name)],
                dataset[log_prob_func_name]['length'],
                trainingparams['shuffle_mask'],
                trainingparams['shuffling_type'], 1000)
        print "\tBest {1} error is : {0:.6f}".format(
            model_evaluation[log_prob_func_name][0],
            log_prob_func_name.upper())

    #
    # WRITING RESULTS #####
    model_info = [
        trainingparams['learning_rate'], trainingparams['decrease_constant'],
        hyperparams['hidden_sizes'], hyperparams['random_seed'],
        hyperparams['hidden_activation'], trainingparams['max_epochs'],
        best_epoch, trainingparams['look_ahead'], trainingparams['batch_size'],
        trainingparams['shuffle_mask'], trainingparams['shuffling_type'],
        trainingparams['nb_shuffle_per_valid'], hyperparams['use_cond_mask'],
        hyperparams['direct_input_connect'],
        hyperparams['direct_output_connect'], trainingparams['pre_training'],
        trainingparams['pre_training_max_epoc'], trainingparams['update_rule'],
        trainingparams['dropout_rate'], hyperparams['weights_initialization'],
        hyperparams['mask_distribution'],
        float(model_evaluation['train'][0]),
        float(model_evaluation['train'][1]),
        float(model_evaluation['valid'][0]),
        float(model_evaluation['valid'][1]),
        float(model_evaluation['test'][0]),
        float(model_evaluation['test'][1]), total_train_time
    ]
    utils.write_result(dataset_name, model_info, experiment_name)
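Example #12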
# df['fscore'] = df['fscore'] / df['fscore'].sum()
# df.to_csv("temp/feat_importance.csv", index=False,encoding="utf-8")
# df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
# plt.rcParams["font.sans-serif"] = ["SimHei"]
# plt.title('XGBoost Feature Importance')
# plt.xlabel('relative importance')
# plt.show()

# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()
print(tl.loss_function(preds, labels))

# Load the competition test set
# exam_set = tl.load_match_data("data/d_test_A_20180102.csv")
exam_set = tl.load_match_data("data/d_test_A_20180102_new.csv")
# Preprocess the data
exam_set = tl.pre_process(exam_set)
del exam_set["乙肝表面抗原"]
del exam_set["乙肝表面抗体"]
del exam_set["乙肝e抗原"]
del exam_set["乙肝e抗体"]
del exam_set["乙肝核心抗体"]

# Predict on the test set
exam_set = xgb.DMatrix(exam_set)
y_exam = bst.predict(exam_set)

# Write the predictions to file
tl.write_result(y_exam)
Example #13
    # Not totally resumable if it was stopped during pre-training.
    if resume_mode:
        load_model_params(model, save_path_experiment)
        trainer_status = utils.load_dict_from_json_file(os.path.join(save_path_experiment, "trainer_status"))

    #
    # TRAINING LEARNER ####
    best_epoch, total_train_time = train_model(model, dataset, trainingparams['look_ahead'], trainingparams['shuffle_mask'], trainingparams['nb_shuffle_per_valid'], trainingparams['max_epochs'], trainingparams['batch_size'], trainingparams['shuffling_type'], save_path_experiment, trainer_status)

    #
    # Loading best model
    load_model_params(model, save_path_experiment)

    #
    # EVALUATING BEST MODEL ####
    model_evaluation = {}
    print '\n### Evaluating best model from Epoch {0} ###'.format(best_epoch)
    for log_prob_func_name in ['test', 'valid', 'train']:
        if trainingparams['shuffle_mask'] > 0:
            model.reset(trainingparams['shuffling_type'])
        if log_prob_func_name == "train":
            model_evaluation[log_prob_func_name] = get_mean_error_and_std_final(model, model.train_log_prob_batch, dataset[log_prob_func_name]['length'], trainingparams['shuffle_mask'], trainingparams['shuffling_type'], 1000)
        else:
            model_evaluation[log_prob_func_name] = get_mean_error_and_std(model, model.__dict__['{}_log_prob'.format(log_prob_func_name)], dataset[log_prob_func_name]['length'], trainingparams['shuffle_mask'], trainingparams['shuffling_type'], 1000)
        print "\tBest {1} error is : {0:.6f}".format(model_evaluation[log_prob_func_name][0], log_prob_func_name.upper())

    #
    # WRITING RESULTS #####
    model_info = [trainingparams['learning_rate'], trainingparams['decrease_constant'], hyperparams['hidden_sizes'], hyperparams['random_seed'], hyperparams['hidden_activation'], trainingparams['max_epochs'], best_epoch, trainingparams['look_ahead'], trainingparams['batch_size'], trainingparams['shuffle_mask'], trainingparams['shuffling_type'], trainingparams['nb_shuffle_per_valid'], hyperparams['use_cond_mask'], hyperparams['direct_input_connect'], hyperparams['direct_output_connect'], trainingparams['pre_training'], trainingparams['pre_training_max_epoc'], trainingparams['update_rule'], trainingparams['dropout_rate'], hyperparams['weights_initialization'], hyperparams['mask_distribution'], float(model_evaluation['train'][0]), float(model_evaluation['train'][1]), float(model_evaluation['valid'][0]), float(model_evaluation['valid'][1]), float(model_evaluation['test'][0]), float(model_evaluation['test'][1]), total_train_time]
    utils.write_result(dataset_name, model_info, experiment_name)
Example #14
		# Remove duplicate words
		query_text = set(query_text)

		for doc_id in doc_dict:
			# retrieve the document text
			doc_text = doc_dict[doc_id]

			# remove duplicate words
			doc_text = set(doc_text)

			# find the overlap between query and document
			overlap = 0
			for word in query_text:
				if word in doc_text:
					overlap += 1

			# store overlap score in dictionary with id tuple as key
			result[(query_id, doc_id)] = overlap

	return result


if __name__ == '__main__':
	query_dict = utils.process_data('data/qrys.txt')
	doc_dict = utils.process_data('data/docs.txt')

	overlap_scores = calculate_overlap(query_dict, doc_dict)

	with open('results/overlap.top','w') as output_file:
		output_file = utils.write_result(overlap_scores, output_file)
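
The utils.write_result called here (and in the similar tf-idf examples below) is not included in the excerpt. As a sketch only: assuming the scores argument is a dict keyed by (query_id, doc_id) tuples, a writer that ranks documents per query and emits one "qid 0 docid rank score run" line might look like the following; the output layout and run name are assumptions, not taken from the project.

# Hypothetical sketch of utils.write_result for these retrieval examples;
# the "qid 0 docid rank score run" layout is an assumption.
from collections import defaultdict

def write_result(scores, output_file, run_name='overlap'):
    # Group scores per query, rank documents by descending score,
    # and write one line per (query, document) pair.
    per_query = defaultdict(list)
    for (query_id, doc_id), score in scores.items():
        per_query[query_id].append((doc_id, score))
    for query_id, docs in sorted(per_query.items()):
        docs.sort(key=lambda pair: pair[1], reverse=True)
        for rank, (doc_id, score) in enumerate(docs, start=1):
            output_file.write('%s 0 %s %d %s %s\n' %
                              (query_id, doc_id, rank, score, run_name))
    return output_file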
Example #15
                precision, recall, f1 = calculate_quality(_OUTPUT_PATTERN % metric, _REFERENCE_FILENAME)
                print("Metric %s:" % metric)
                print("\tPrecision: %f, recall: %f, f1: %f" % (precision, recall, f1))
            except:
                pass
        time2 = time()
        print("Run for %f s." % (time2 - time1))
    else:
        metric_txt = action
        metric = dice_metric if action == 'dice' else cosine_metric if action == 'cosine' else lcs_metric
        print("Preprocessing data...")
        print("Input: %s" % _INPUT_FILENAME)
        counter = 0
        preprocessed = {}
        result = {}
        with open(_INPUT_FILENAME) as input:
            for line in input:
                preprocessed_line = process(line)
                preprocessed[line] = preprocessed_line
                if _DEBUG and counter % 50 == 0:
                    print("%s => %s" % (line, preprocessed_line))
                counter += 1
        print("Clustering...")
        clusters = cluster(preprocessed, metric, _THRESHOLDS[metric_txt], _DEBUG)
        for line, preprocessed_line in preprocessed.items():
            result[line] = clusters[preprocessed_line]
        print("Writing result...")
        write_result(result, _OUTPUT_PATTERN % metric_txt)
        time2 = time()
        print("Run for %f s." % (time2 - time1))
Example #16
        # find the most frequent words
        freq_dist = nltk.FreqDist(word for word in top_doc_word_list)
        best_words = freq_dist.keys()[:num_words]

        # add to the query
        new_query = query_text + best_words

        # recalculate tfidf score and add to score dictionary
        for doc_id, tfidf_score in calculate_tfidf(new_query, doc_dict,
                                                   average_doc_length, k):
            score_dict[query_id, doc_id] = tfidf_score

    return score_dict


if __name__ == "__main__":
    query_dict = utils.process_data('data/qrys.txt')
    doc_dict = utils.process_data('data/docs.txt')

    standard_tfidf_scores = standard_tfidf(query_dict, doc_dict)

    with open('results/tfidf.top', 'w') as output_file:
        output_file = utils.write_result(standard_tfidf_scores, output_file)

    tfidf_with_prf_scores = tfidf_pseudo_relevance_feedback(
        query_dict, doc_dict)

    with open('results/best.top', 'w') as output_file:
        output_file = utils.write_result(tfidf_with_prf_scores, output_file)
Example #17
def train(model, args):
    vocab = json.load(open(args.vocab_path))
    querys = json.load(open(args.query_path))
    gloss = json.load(open(args.gloss_path))
    # Docs = json.load(open(args.docs_path))
    pred = json.load(open(args.preds_path))
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    preds = {}
    # doc_tf = get_doctf(Docs)
    # Doc_inds = prepare(vocab, Docs,args.max_len)
    test_datas = json.load(open(args.datapath))
    train_datas = json.load(open(args.train_path))
    keys = [key for key in gloss]
    if args.only_test:
        datas = test_datas
    if args.merge_train:
        random.shuffle(train_datas)
        datas = train_datas[:len(test_datas)]
    for ele in datas:
        target_word = ele['target_word']
        uid = ele['id']
        preds[uid] = pred[target_word]
    for ele in test_datas:
        target_word = ele['target_word']
        uid = ele['id']
        preds[uid] = pred[target_word]
    prior_preds = preds.copy()
    for i, data in enumerate(datas):
        target_word = data['target_word']
        data['neg_candidates'] = querys[target_word]
        datas[i] = data

    # if args.labeled:
    #     for i,ele in enumerate(train_datas):
    #         uid = ele['id']
    #         target_word = ele['target_word']
    #         preds[uid] = [0 if sense != ele['target_sense'] else 1 for sense in querys[target_word]]
    #         ele['neg_candidates'] = [sense for sense in querys[target_word] if sense != ele['target_sense']]
    #         train_datas[i] = ele
    #     datas = test_datas + train_datas
    #     query_lens = [len(preds[key]) for key in preds]
    #     print(max(query_lens))
    #     print(sum(query_lens) / len(query_lens))

    # datas = preprocess_data(datas, querys, vocab, preds, args)
    # train_dataset = Traindataset(train_datas, tokenizer, args)
    all_dataset = Mydataset(datas, tokenizer, querys, pred, args)
    parameters = [p for p in model.parameters() if p.requires_grad]
    if args.optim == 'adam':
        optim = Adam(parameters, lr=args.lr, weight_decay=args.weight_decay)
    if args.optim == 'sgd':
        optim = SGD(parameters, lr=args.lr)
    tot_loss = 0
    loss_time = 0
    max_F1 = 0
    max_val_F1 = 0
    stop = 0
    # init
    confirm = 0
    for key in preds:
        lis = preds[key]
        score = max(lis)
        confirm += 1 if score > 0.9 else 0
    logging.info('confirm:%s/%s' % (confirm, len(datas)))
    # print('confirm:', confirm, '/', len(datas))
    # em
    goal_sum = 0
    # preds = E_step(model, datas, querys, vocab, args)

    # gloss init

    # M_step
    for epoch in range(args.epoch):
        tot_loss = 0
        loss_time = 0
        tot_pos_score = 0
        tot_neg_score = 0
        tot_eloss = 0
        for i in range(args.update_steps):
            # batch = generate_batch(datas, preds, querys, vocab, keys, args)
            batch = all_dataset.generate_batch(preds, querys, vocab, keys,
                                               gloss, args)
            loss, (pos_score, neg_score,
                   e_loss) = M_step(model, batch, querys, vocab, prior_preds,
                                    tokenizer, gloss, args)
            optim.zero_grad()
            loss.backward()
            optim.step()
            l = loss.item()
            tot_loss += l
            loss_time += 1
            tot_eloss += e_loss.item()
            tot_pos_score += pos_score.item()
            tot_neg_score += neg_score.item()
            # if i % args.print_every == 0:
            # print('goal:' + str(cal_goal(model,datas,querys,vocab,preds,args)))
        F1, val_F1, results = test(model, test_datas, querys, vocab, gloss,
                                   tokenizer, prior_preds, args)
        stop += 1
        writer.add_scalars('loss', {
            'loss': tot_loss / loss_time,
            'eloss': tot_eloss / loss_time
        }, epoch)
        writer.add_scalars(
            'scores', {
                'pos_score': tot_pos_score / loss_time,
                'neg_score': tot_neg_score / loss_time
            }, epoch)
        writer.add_scalar('F1', F1, epoch)
        if F1 > max_F1:
            stop = 0
            if os.path.exists(args.save_dir + str(max_F1) + 'all.model.pkl'):
                os.remove(args.save_dir + str(max_F1) + 'all.test.res')
                os.remove(args.save_dir + str(max_F1) + 'all.model.pkl')
                os.remove(args.save_dir + str(max_F1) + 'all.preds.txt')
            max_F1 = F1
            write_result(args.save_dir + str(max_F1) + 'all.test.res', results)
            with open(args.save_dir + str(max_F1) + 'all.preds.txt',
                      'w') as fout:
                for key in preds:
                    fout.write(key + '\t' + str(preds[key]) + '\n')
            check_point = {}
            check_point['model_dict'] = model.state_dict()
            torch.save(check_point,
                       args.save_dir + str(max_F1) + 'all.model.pkl')
            args.model_path = args.save_dir + str(max_F1) + 'all.model.pkl'
        max_val_F1 = max(val_F1, max_val_F1)
        write_result(args.save_dir + 'epoch' + str(epoch) + 'all.test.res',
                     results)
        with open(args.save_dir + 'epoch' + str(epoch) + 'all.preds.txt',
                  'w') as fout:
            for key in preds:
                fout.write(key + '\t' + str(preds[key]) + '\n')
        logging.info('\nepoch: %s, avg_loss:%s' %
                     (epoch, tot_loss / loss_time))
        loss_time = 0
        tot_loss = 0
        logging.info('F1: %s/%s' % (F1, max_F1))
        logging.info('val F1: %s/%s' % (val_F1, max_val_F1))
        if stop > 50:
            exit()
        if args.merge_train:
            results = E_step(model, datas, querys, vocab, gloss, tokenizer,
                             prior_preds, args)
        new_preds = {}
        for ele in results:
            tem_pred = [x[1] for x in ele[2]]
            tot = sum(tem_pred)
            tem_pred = [x / tot for x in tem_pred]
            new_preds[ele[0]] = tem_pred
        tot_dis = 0
        for key in new_preds:
            new_pred = np.array(new_preds[key])
            pred = np.array(preds[key])
            if np.argmax(new_pred) != np.argmax(pred):
                tot_dis += 1
        writer.add_scalar('tot_update_dis', tot_dis, epoch)
        if not args.wo_estep:
            preds.update(new_preds)
        logging.info('E_step')
        confirm = 0
        for key in preds:
            lis = preds[key]
            score = max(lis)
            confirm += 1 if score > 0.9 else 0
        writer.add_scalar('confirm', confirm, epoch)
        logging.info('confirm:%s/%s' % (confirm, len(datas)))
        with open(args.save_dir + 'epoch' + str(epoch) + 'samplecount.txt',
                  'w') as fout:
            for ele in test_datas:
                key = ele['id']
                neg_count = all_dataset.neg_counts[
                    key] if key in all_dataset.neg_counts else []
                fout.write(key + '\t' + str(neg_count) + '\n')
Example #18
def test(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)

    logger = Logger()

    prefix = ''
    if opt['use_double_length']: prefix += '_2'
    print prefix
    if opt['use_char']:
        logger.info('Load char data starting...')
        opt['embed_num'] = opt['char_embed_num']
        embed_mat = np.load(opt['char_embed'])
        test_title = np.load(opt['test_title_char' + prefix])
        test_desc = np.load(opt['test_desc_char' + prefix])
        logger.info('Load char data finished!')
    elif opt['use_word']:
        logger.info('Load word data starting...')
        opt['embed_num'] = opt['word_embed_num']
        embed_mat = np.load(opt['word_embed'])
        test_title = np.load(opt['test_title_word' + prefix])
        test_desc = np.load(opt['test_desc_word' + prefix])
        logger.info('Load word data finished!')
    elif opt['use_char_word']:
        logger.info('Load char-word data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        test_title = np.load(opt['test_title_char' + prefix])
        test_desc = np.load(opt['test_desc_word' + prefix])
        logger.info('Load char-word data finished!')
    elif opt['use_word_char']:
        logger.info('Load word-char data starting...')
        embed_mat_char = np.load(opt['char_embed'])
        embed_mat_word = np.load(opt['word_embed'])
        embed_mat = np.vstack((embed_mat_char, embed_mat_word))
        test_title = np.load(opt['test_title_word' + prefix])
        test_desc = np.load(opt['test_desc_char' + prefix])
        logger.info('Load word-char data finished!')

    test_idx = np.load(opt['test_idx'])
    topic_idx = np.load(opt['topic_idx'])

    test_dataset = Dataset(test=True, title=test_title, desc=test_desc)
    test_loader = data.DataLoader(test_dataset,
                                  shuffle=False,
                                  batch_size=opt['batch_size'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(embed_mat, opt)

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model,
                               model_dir=opt['model_dir'],
                               model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'], \
                              name=opt['load_name'])

    if opt['device'] != None:
        torch.cuda.set_device(opt['device'])

    if opt['cuda']:
        model.cuda()

    logger.info('Start testing...')

    model.eval()
    predict_label_list = []
    res = torch.Tensor(opt['test_num'], opt['class_num'])
    for i, batch in enumerate(test_loader, 0):
        batch_size = batch[0].size(0)
        title, desc = batch
        title, desc = Variable(title), Variable(desc)
        if opt['cuda']:
            title, desc = title.cuda(), desc.cuda()
        logit = model(title, desc)
        if opt.get('save_resmat', False):
            res[i * opt['batch_size']:i * opt['batch_size'] +
                batch_size] = logit.data.cpu()
        predict_label_list += [list(ii) for ii in logit.topk(5, 1)[1].data]

    if opt.get('save_resmat', False):
        torch.save(res, '{}/{}_test_res.pt'.format(opt['result_dir'],
                                                   opt['model']))
        return

    lines = []
    for qid, top5 in zip(test_idx, predict_label_list):
        topic_ids = [topic_idx[i] for i in top5]
        lines.append('{},{}'.format(qid, ','.join(topic_ids)))

    if opt.get('load_name', None) is None:
        write_result(lines,
                     model_dir=opt['model_dir'],
                     model_name=opt['model'],
                     result_dir=opt['result_dir'])
    else:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'], \
                          name=opt['load_name'], result_dir=opt['result_dir'])
Example #19
    def replay(self):
        if self.experiment != 1 and self.order[0] == 2:
            if self.clicks[0] == 'E':
                self.clicks = self.left_txt
            else:
                self.clicks = self.right_txt
        print('| Clicks:', self.clicks)

        # 1. Writing results
        if self.repeat == 24:
            self.blocks.append([deepcopy(self.results),\
            (datetime.datetime.now() - self.block_start_time).total_seconds() ,1])
            if self.order[0] == 2:
                self.memory.pop(0)
                utils.write_result('3', self, False, True)
            else:
                utils.write_result('3', self, False, False)
                self.results.append(self.clicks)
                self.result_set.add(self.clicks)
                self.frequency[self.clicks] += 1  #frequency calculated later
                self.total_frequency[self.clicks] += 1
            self.order.pop(0)

            self.results = []
            self.result_set = set()
            self.repeat = 0
            self.block_start_time = datetime.datetime.now()

            self.order = self.shuffleMode()
            self.memory = self.shuffleMemory()
        else:
            if self.order[0] == 2:
                self.memory.pop(0)
                utils.write_result('3', self, True, True)
            else:
                utils.write_result('3', self, True, False)
                self.results.append(self.clicks)
                self.result_set.add(self.clicks)
                self.frequency[self.clicks] += 1  #frequency calculated later
                self.total_frequency[self.clicks] += 1
            self.order.pop(0)

        # 3. Checking replay conditions
        if self.repeat == 0 and len(self.blocks) >= int(self.settings['blocks3']) and self.reinforcement[-1]\
        and self.memo_accuracy >= int(self.settings['min_memo']):
            self.rgb = np.array([0.0, 200.0, 0.0])
            self.win_txt = tkinter.Label(self.master, bg= "#%02x%02x%02x" % (0, 200, 0), fg = "#%02x%02x%02x" % (0, 200, 0),\
              text='ATÉ O MOMENTO VOCÊ ACUMULOU '+str(int(self.points.get())+int(self.prev_sc.points.get()))+\
              ' PONTOS!', font=Font(family='Helvetica', size=16, weight='bold'))
            self.master.after(20, self.fadeResetText)
        elif self.repeat == 0 and len(self.blocks) >= int(self.settings['blocks3']) and not self.reinforcement[-1]\
        and self.memo_accuracy >= int(self.settings['min_memo']):
            self.rgb = np.array([0.0, 0.0, 0.0])
            self.win_txt = tkinter.Label(self.master, bg= "#%02x%02x%02x" % (0, 0, 0), fg = "#%02x%02x%02x" % (0, 0, 0),\
              text='ATÉ O MOMENTO VOCÊ ACUMULOU '+str(int(self.points.get())+int(self.prev_sc.points.get()))+\
              ' PONTOS!', font=Font(family='Helvetica', size=16, weight='bold'))
            self.master.after(20, self.fadeResetText)
        else:
            self.clicks = ''
            self.round_start_time = datetime.datetime.now()
            self.main_bg.configure(bg="#%02x%02x%02x" % (255, 255, 255))
            if self.order[0] == 2:
                if self.experiment != 1:
                    self.createImgButtons()
                else:
                    self.createButtons()
            else:
                self.createButtons()
            self.ableButtonsAndMouse()
Example #20
def test_stack(**kwargs):
    opt = DefaultConfig()
    opt.update(**kwargs)

    logger = Logger()

    result_dir = '/home/dyj/'
    resmat = [result_dir+'TextCNN1_2017-07-27#12:30:16_test_res.pt',\
              result_dir+'TextCNN2_2017-07-27#12:22:42_test_res.pt', \
              result_dir+'RNN1_2017-07-27#12:35:51_test_res.pt',\
              result_dir+'RNN2_2017-07-27#11:33:24_test_res.pt',\
              result_dir+'RCNN1_2017-07-27#11:30:42_test_res.pt',\
              result_dir+'RCNNcha_2017-07-27#16:00:33_test_res.pt',\
              result_dir+'FastText4_2017-07-28#17:20:21_test_res.pt',\
              result_dir+'FastText1_2017-07-29#10:47:46_test_res.pt']
    opt['stack_num'] = len(resmat)

    test_dataset = Stack_Dataset(resmat=resmat, test=True)
    test_loader = data.DataLoader(test_dataset,
                                  shuffle=False,
                                  batch_size=opt['batch_size'])

    test_idx = np.load(opt['test_idx'])
    topic_idx = np.load(opt['topic_idx'])

    logger.info('Using model {}'.format(opt['model']))
    Model = getattr(models, opt['model'])
    model = Model(opt)
    print model

    if opt['load']:
        if opt.get('load_name', None) is None:
            model = load_model(model,
                               model_dir=opt['model_dir'],
                               model_name=opt['model'])
        else:
            model = load_model(model, model_dir=opt['model_dir'], model_name=opt['model'], \
                              name=opt['load_name'])

    if opt['device'] != None:
        torch.cuda.set_device(opt['device'])

    if opt['cuda']:
        model.cuda()

    logger.info('Start testing...')

    model.eval()
    predict_label_list = []
    res = torch.Tensor(opt['test_num'], opt['class_num'])
    for i, batch in enumerate(test_loader, 0):
        batch_size = batch[0].size(0)
        resmat = batch
        resmat = [Variable(ii) for ii in resmat]
        if opt['cuda']:
            resmat = [ii.cuda() for ii in resmat]
        logit = model(resmat)
        if opt.get('save_resmat', False):
            res[i * opt['batch_size']:i * opt['batch_size'] +
                batch_size] = logit.data.cpu()
        predict_label_list += [list(ii) for ii in logit.topk(5, 1)[1].data]

    if opt.get('save_resmat', False):
        torch.save(
            res, '{}/{}_{}_test_res.pt'.format(
                opt['result_dir'], opt['model'],
                datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S')))
        return

    lines = []
    for qid, top5 in zip(test_idx, predict_label_list):
        topic_ids = [topic_idx[i] for i in top5]
        lines.append('{},{}'.format(qid, ','.join(topic_ids)))

    if opt.get('load_name', None) is None:
        write_result(lines,
                     model_dir=opt['model_dir'],
                     model_name=opt['model'],
                     result_dir=opt['result_dir'])
    else:
        write_result(lines, model_dir=opt['model_dir'], model_name=opt['model'], \
                          name=opt['load_name'], result_dir=opt['result_dir'])
Example #21
        keys_list.append(keys)
    print(keys_list)
    return keys_list


def load_excel(filename: str, sheet_index: int, title_col: int):
    excel_file = xlrd.open_workbook(filename)
    sheet = excel_file.sheet_by_index(sheet_index)
    article_list_ = []
    for i in range(1, sheet.nrows):
        article = {'title': sheet.cell(i, title_col).value}
        article_list_.append(article)
    return article_list_


if __name__ == '__main__':
    EXTERSIONS = ['xls', 'xlsm', 'xlsx']
    std_keys = utils.load_std_keys()
    dir_ = 'F:/pycharm/zbj2/test_data/'
    for path in os.listdir(dir_):
        if EXTERSIONS.count(path.split('.')[-1]) > 0:
            article_list = load_excel(dir_ + path, sheet_index=1, title_col=0)
            # keys_list_ = load_key('C:/excel/数据.xls')
            result_list = utils.classify_subject(article_list, std_keys, index='std_key')
            utils.write_result(dir_ + path, result_list, w_key_col=5, sheet_index=1, w_id_col=4)

    # article_list = load_excel('C:/excel/信息公开目录.xls')
    # # keys_list_ = load_key('C:/excel/数据.xls')
    # result_list = classify(article_list, keys_list_new)
    # write_result('C:/excel/信息公开目录.xls', result_list)
Example #22
            doc_text = doc_dict[doc_id]
            top_doc_word_list+=doc_text

        # find the most frequent words
        freq_dist = nltk.FreqDist(word for word in top_doc_word_list)
        best_words = freq_dist.keys()[:num_words]

        # add to the query
        new_query = query_text + best_words

        # recalculate tfidf score and add to score dictionary
        for doc_id, tfidf_score in calculate_tfidf(new_query, doc_dict, average_doc_length, k):
            score_dict[query_id, doc_id] = tfidf_score

    return score_dict


if __name__ == "__main__":
    query_dict = utils.process_data('data/qrys.txt')
    doc_dict = utils.process_data('data/docs.txt')

    standard_tfidf_scores = standard_tfidf(query_dict, doc_dict)

    with open('results/tfidf.top', 'w') as output_file:
        output_file = utils.write_result(standard_tfidf_scores, output_file)

    tfidf_with_prf_scores = tfidf_pseudo_relevance_feedback(query_dict, doc_dict)

    with open('results/best.top', 'w') as output_file:
        output_file = utils.write_result(tfidf_with_prf_scores, output_file)
Example #23
from utils import read_subject, write_result

for f in [
        "subject/a_example.in", "subject/b_should_be_easy.in",
        "subject/c_no_hurry.in", "subject/d_metropolis.in",
        "subject/e_high_bonus.in"
]:
    STEP_COUNT, BONUS, rides, cars = read_subject(f)
    step = 0
    print f, STEP_COUNT, len(cars), len(rides)
    while step < STEP_COUNT:
        if step % 100 == 0:
            print step
        for car in cars:
            if car.available(step):
                r = car.findRide(rides, step, STEP_COUNT)
                if r is not None:
                    rides.remove(r)
        step += 1
    write_result(f, cars)
Example #24
        # Remove duplicate words
        query_text = set(query_text)

        for doc_id in doc_dict:
            # retrieve the document text
            doc_text = doc_dict[doc_id]

            # remove duplicate words
            doc_text = set(doc_text)

            # find the overlap between query and document
            overlap = 0
            for word in query_text:
                if word in doc_text:
                    overlap += 1

            # store overlap score in dictionary with id tuple as key
            result[(query_id, doc_id)] = overlap

    return result


if __name__ == '__main__':
    query_dict = utils.process_data('data/qrys.txt')
    doc_dict = utils.process_data('data/docs.txt')

    overlap_scores = calculate_overlap(query_dict, doc_dict)

    with open('results/overlap.top', 'w') as output_file:
        output_file = utils.write_result(overlap_scores, output_file)