def evaluate(machine, valid_problem, info_suffix):
    test_label_file_path = "test_label_" + str(info_suffix) + ".txt"
    pred_label_file_path = "pred_label_" + str(info_suffix) + ".txt"
    test_raw_label_file_path = "test_raw_label_" + str(info_suffix) + ".txt"
    pred_raw_label_file_path = "pred_raw_label_" + str(info_suffix) + ".txt"

    test_label_file = open(test_label_file_path, "w")
    pred_label_file = open(pred_label_file_path, "w")

    test_raw_label_file = open(test_raw_label_file_path, "w")
    pred_raw_label_file = open(pred_raw_label_file_path, "w")

    test_types = []
    pred_types = []
    for valid_sentence in valid_problem.sentences():
        test_labels = []
        pred_labels = []

        sentence_str = " ".join([word.content for word in valid_sentence.words()])
        srl_x, srl_y = valid_problem.get_dataset_for_sentence(valid_sentence)
        pred_y = machine.predict(srl_x.astype(theano.config.floatX))

        test_types.append(sentence_str)
        test_types.append("\t".join([SrlTypes.LABEL_SRLTYPE_MAP[l] for l in srl_y]))
        pred_types.append(sentence_str)
        pred_types.append("\t".join([SrlTypes.LABEL_SRLTYPE_MAP[l] for l in pred_y]))

        test_labels.append(srl_y)
        pred_labels.append(pred_y)

        test_label_str = valid_problem.pretty_srl_test_label(valid_sentence, test_labels)
        pred_label_str = valid_problem.pretty_srl_predict_label(valid_sentence, pred_labels)

        test_label_file.write(test_label_str)
        pred_label_file.write(pred_label_str)

        test_raw_label_file.write("\n".join(test_types))
        pred_raw_label_file.write("\n".join(pred_types))

    test_label_file.close()
    pred_label_file.close()

    test_raw_label_file.close()
    pred_raw_label_file.close()

    valid_result = eval_srl(test_label_file_path, pred_label_file_path)
    valid_info = 'validation info {0}% '.format(
                                    valid_result)
    print valid_info
# 예제 #2 (Example #2) — separator from the original code listing
# 0
def test_srl_label_formatter(data_file_path):

    conll05corpora = Conll05Corpora()
    conll05corpora.load(data_file_path)
    print 'load done'

    test_label_file_path = "test_label.txt"
    pred_label_file_path = "pred_label.txt"

    srl_problem = SRLProblem(conll05corpora)

    for valid_sentence in srl_problem.sentences():
        test_labels = []
        pred_labels = []

        test_label_file = open(test_label_file_path, "w")
        pred_label_file = open(pred_label_file_path, "w")

        for srl_x, srl_y in srl_problem.get_dataset_for_sentence(valid_sentence):
            pred_y = [random.choice(SrlTypes.LABEL_SRLTYPE_MAP.keys()) for i in range(srl_y.size)]

            test_labels.append(srl_y)
            pred_labels.append(pred_y)

        test_label_str = srl_problem.pretty_srl_test_label(valid_sentence, test_labels)
        pred_label_str = srl_problem.pretty_srl_predict_label(valid_sentence, pred_labels)

        test_label_file.write(test_label_str)
        pred_label_file.write(pred_label_str)

        test_label_file.close()
        pred_label_file.close()

        try:

            valid_result = eval_srl(test_label_file_path, pred_label_file_path)
        except:
            print "label = "
            print "\n".join([" ".join([SrlTypes.LABEL_SRLTYPE_MAP[x] for x in  label]) for label in pred_labels])
            print "formatted_label = "
            print pred_label_str
# 예제 #3 (Example #3) — separator from the original code listing
# 0
def train_srl_neural_model(train_problem, valid_problem,
                           nn_architecture,  hyper_param,
                           model_path=None, model_tag=None):
    """Train an SRLNetwork with minibatch SGD, validating periodically.

    Builds an SRLNetwork from the training problem's properties, optionally
    warm-starts it from a saved model, then loops over epochs of training
    minibatches.  Every `validation_frequency` minibatches the model is run
    over the whole validation set, the labels are written to disk, and
    eval_srl is invoked on the resulting files.  The learning rate is decayed
    (with a lower bound) after each epoch.

    Parameters:
        train_problem:   supplies get_problem_property(), get_trans_mat_prior()
                         and get_data_batch() minibatches of (X, y).
        valid_problem:   supplies sentences() and get_dataset_for_sentence()
                         for periodic validation.
        nn_architecture: network architecture description passed to SRLNetwork.
        hyper_param:     carries learning_rate, l1_reg, l2_reg, n_epochs,
                         learning_rate_decay_ratio, learning_rate_lowerbound.
                         NOTE: learning_rate is mutated in place each epoch.
        model_path/model_tag: optional checkpoint to load before training.
    """


    problem_character = train_problem.get_problem_property()
    trans_mat_prior = train_problem.get_trans_mat_prior()

    srl_nn = SRLNetwork(problem_character, nn_architecture, trans_mat_prior)

    # Warm-start from a previously saved model when a path is given.
    if model_path != None:
        srl_nn.load_model(model_path, model_tag)

    # Early-stopping parameters (currently unused: the early-stopping logic
    # below is commented out, so these have no effect on the loop).
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant


    best_params = None
    best_validation_loss = np.inf  # never updated while early stopping is disabled
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False  # only early stopping (disabled) would set this True

    validation_frequency = 10000  # validate every N minibatches (global count)

    total_minibatch = 0

    # Compiled theano-style functions: one training step, plus test/predict.
    train_func = get_train_func(srl_nn, hyper_param.learning_rate, hyper_param.l1_reg, hyper_param.l2_reg)

    valid_func = get_test_func(srl_nn)

    #stat_func = get_pred_stat_func(srl_nn)
    pred_func = get_pred_func(srl_nn)

    while (epoch < hyper_param.n_epochs) and (not done_looping):
        epoch = epoch + 1

        minibatch = 0
        for X, y in train_problem.get_data_batch():

            # Skip this batch when X[0][0] < 3 — presumably a length/size
            # field too small to train on; TODO(review): confirm what
            # X[0][0] encodes in the batch layout.
            if X[0][0] < 3:
                continue

            start_time = time.clock()


            minibatch_avg_cost= train_func(X.astype("float32"), y.astype('int32'))

            end_time = time.clock()

            minibatch += 1
            total_minibatch += 1
            # Progress line every 100 minibatches within the epoch.
            if minibatch % 100 == 0:

                debug_info = 'epoch {0}.{1}, cost = {2}, time = {3}'.format(epoch,minibatch,minibatch_avg_cost,end_time - start_time)
                print debug_info
            '''
                numpy.savetxt(str(minibatch) +  ".X.txt",
                              numpy.asarray(srl_nn.hidden_output(T.shared(X)).eval()))
                numpy.savetxt(str(minibatch) +  ".y.txt",
                              y)
            '''


            # Periodic validation pass over the full validation set.
            if total_minibatch  % validation_frequency == 0:

#                srl_nn.dump_model('./models/',str(total_minibatch/validation_frequency))

                # compute zero-one loss on validation set
                validation_losses = 0
                sample_num = 0
                validation_pred = []
                validation_label = []
                test_num = 0
                all_same = 0

                same_rate = 0
                # Label files are numbered by validation round so earlier
                # rounds are not overwritten.
                test_label_file_path = "test_label_" + str(total_minibatch/validation_frequency) + ".txt"
                pred_label_file_path = "pred_label_" + str(total_minibatch/validation_frequency) + ".txt"

                test_label_file = open(test_label_file_path, "w")
                pred_label_file = open(pred_label_file_path, "w")
                start_time = time.clock()
                for sentence in valid_problem.sentences():
                    test_labels = []
                    pred_labels = []
                    for srl_x, srl_y in valid_problem.get_dataset_for_sentence(sentence):
                        test_labels.append(srl_y)
                        pred_labels.append(pred_func(srl_x.astype("float32")))

                    # NOTE(review): other functions in this file call
                    # pretty_srl_test_label / pretty_srl_predict_label;
                    # confirm pretty_srl_label exists on this problem class.
                    test_label_str = valid_problem.pretty_srl_label(sentence, test_labels)
                    pred_label_str = valid_problem.pretty_srl_label(sentence, pred_labels)

                    test_label_file.write(test_label_str)
                    pred_label_file.write(pred_label_str)

                test_label_file.close()
                pred_label_file.close()


                # External CoNLL-style scorer on the two label files.
                valid_result = eval_srl(test_label_file_path, pred_label_file_path)
                valid_info = 'minibatch {0}, validation info {1}% '.format(
                    total_minibatch, valid_result)
                print valid_info


                # # if we got the best validation score until now
                # if validation_losses < best_validation_loss:
                #     #improve patience if loss improvement is good enough
                #     if validation_losses < best_validation_loss *  \
                #            improvement_threshold:
                #         patience = max(patience, epoch * patience_increase)
                #
                #     best_validation_loss = validation_losses
                #     best_iter = epoch
                #
                # if patience <= epoch:
                #     done_looping = True
                #     break

        # Exponential learning-rate decay per epoch, clamped to a floor.
        # NOTE(review): train_func captured the ORIGINAL learning_rate at
        # compile time; whether this mutation affects later updates depends
        # on how get_train_func uses it — confirm.
        hyper_param.learning_rate *= hyper_param.learning_rate_decay_ratio
        if hyper_param.learning_rate <= hyper_param.learning_rate_lowerbound:
            hyper_param.learning_rate = hyper_param.learning_rate_lowerbound

    # best_validation_loss stays np.inf while early stopping is commented
    # out, so this prints 'inf' for the score.
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i.') %
          (best_validation_loss * 100., epoch))