Example #1
0
    def __init__(self, path_train, path_dev, path_test, aug_num,
                 distance_train, distance_dev, distance_test):
        """Load the raw train/dev/test splits plus their pickled distance
        tables, then convert everything into the new internal format.

        Args:
            path_train: path to the raw training split.
            path_dev: path to the raw dev split.
            path_test: path to the raw test split.
            aug_num: augmentation count (stored here; consumed elsewhere —
                presumably by ``_load_new``; verify against that method).
            distance_train: path to the pickled train-vs-train distances.
            distance_dev: path to the pickled dev-vs-train distances.
            distance_test: path to the pickled test-vs-train distances.
        """
        # Keep the raw paths and the augmentation count around.
        self.path_train = path_train
        self.path_dev = path_dev
        self.path_test = path_test
        self.aug_num = aug_num

        # Raw dataset splits, loaded as-is.
        self.old_train = load_dataset_input(path_train)
        self.old_dev = load_dataset_input(path_dev)
        self.old_test = load_dataset_input(path_test)

        # Precomputed <split>-to-train distance tables.
        self.train_train = load_pickle(distance_train)
        self.dev_train = load_pickle(distance_dev)
        self.test_train = load_pickle(distance_test)

        tprint('load is done, begin transfer data to new format')
        self._load_new()
        tprint('dataset load done')
Example #2
0
def pruning_sem_f1_score(target, predict, predicate_correct, predicate_sum,
                         out_of_pruning, argument2idx):
    """Compute SRL argument precision/recall/F1 under candidate pruning.

    Labeled scores (P, R, F1) fold predicate identification counts into
    both numerator and denominator, and charge gold arguments lost to
    pruning (``out_of_pruning``) against recall only.  The N-prefixed
    scores (NP, NR, NF1) cover arguments alone.

    Args:
        target: sequence of gold argument-label indices.
        predict: sequence of predicted argument-label indices, parallel
            to ``target``.
        predicate_correct: number of correctly identified predicates.
        predicate_sum: total number of predicates.
        out_of_pruning: count of gold arguments discarded by pruning.
        argument2idx: label-to-index map; must contain ``_PAD_``,
            ``_UNK_`` and the null label ``'_'``.

    Returns:
        Tuple ``(P, R, F1, NP, NR, NF1)``.
    """
    # Hoist the dict lookups that are invariant across the loop.
    pad_idx = argument2idx[_PAD_]
    unk_idx = argument2idx[_UNK_]
    null_idx = argument2idx['_']

    predict_args = 0
    golden_args = 0
    correct_args = 0
    num_correct = 0
    total = 0
    for i in range(len(target)):
        pred_i = predict[i]
        golden_i = target[i]
        # Padding positions are not scored at all.
        if golden_i == pad_idx:
            continue
        total += 1
        # Unknown labels count as the null label on both sides.
        if pred_i == unk_idx:
            pred_i = null_idx
        if golden_i == unk_idx:
            golden_i = null_idx
        if pred_i != null_idx:
            predict_args += 1
        if golden_i != null_idx:
            golden_args += 1
        if golden_i != null_idx and pred_i == golden_i:
            correct_args += 1
        if pred_i == golden_i:
            num_correct += 1

    eps = 1e-13  # avoid division by zero in the ratio computations

    P = (correct_args + predicate_correct) / (predict_args + predicate_sum +
                                              eps)

    # Pruned-away gold arguments are unrecoverable, so they only hurt recall.
    R = (correct_args + predicate_correct) / (golden_args + out_of_pruning +
                                              predicate_sum + eps)

    NP = correct_args / (predict_args + eps)

    NR = correct_args / (golden_args + eps)

    F1 = 2 * P * R / (P + R + eps)

    NF1 = 2 * NP * NR / (NP + NR + eps)

    # Guard the one unprotected division: an all-padding input leaves
    # total == 0, which previously raised ZeroDivisionError.
    accuracy = num_correct / total * 100 if total else 0.0

    tprint(
        '\teval accurate:{:.2f} predict:{} golden:{} correct:{} P:{:.2f} R:{:.2f} F1:{:.2f} NP:{:.2f} NR:{:.2f} NF1:{:.2f}'
        .format(accuracy, predict_args,
                golden_args + out_of_pruning, correct_args, P * 100, R * 100,
                F1 * 100, NP * 100, NR * 100, NF1 * 100))

    return (P, R, F1, NP, NR, NF1)
Example #3
0
def sem_f1_score(target,
                 predict,
                 predicate_correct,
                 predicate_sum,
                 argument2idx,
                 output_to_file=None):
    """Compute SRL argument precision/recall/F1 (no pruning variant).

    Labeled scores (P, R, F1) fold predicate identification counts into
    numerator and denominator; the N-prefixed scores (NP, NR, NF1) cover
    arguments alone.

    Args:
        target: sequence of gold argument-label indices.
        predict: sequence of predicted argument-label indices, parallel
            to ``target``.
        predicate_correct: number of correctly identified predicates.
        predicate_sum: total number of predicates.
        argument2idx: label-to-index map; must contain ``_PAD_``,
            ``_UNK_`` and the null label ``'_'``.
        output_to_file: optional path; when given, the summary line is
            appended to that file (assumes ``tprint`` returns the
            formatted line — verify against its definition).

    Returns:
        Tuple ``(P, R, F1, NP, NR, NF1)``.
    """
    # Hoist the dict lookups that are invariant across the loop.
    pad_idx = argument2idx[_PAD_]
    unk_idx = argument2idx[_UNK_]
    null_idx = argument2idx['_']

    predict_args = 0
    golden_args = 0
    correct_args = 0
    num_correct = 0
    total = 0
    for i in range(len(target)):
        pred_i = predict[i]
        golden_i = target[i]
        # Padding positions are not scored at all.
        if golden_i == pad_idx:
            continue
        total += 1
        # Unknown labels count as the null label on both sides.
        if pred_i == unk_idx:
            pred_i = null_idx
        if golden_i == unk_idx:
            golden_i = null_idx
        if pred_i != null_idx:
            predict_args += 1
        if golden_i != null_idx:
            golden_args += 1
        if golden_i != null_idx and pred_i == golden_i:
            correct_args += 1
        if pred_i == golden_i:
            num_correct += 1

    eps = 1e-13  # avoid division by zero in the ratio computations

    P = (correct_args + predicate_correct) / (predict_args + predicate_sum +
                                              eps)

    R = (correct_args + predicate_correct) / (golden_args + predicate_sum +
                                              eps)

    NP = correct_args / (predict_args + eps)

    NR = correct_args / (golden_args + eps)

    F1 = 2 * P * R / (P + R + eps)

    NF1 = 2 * NP * NR / (NP + NR + eps)

    # Guard the one unprotected division: an all-padding input leaves
    # total == 0, which previously raised ZeroDivisionError.
    accuracy = num_correct / total * 100 if total else 0.0

    outs = tprint(
        'eval accurate:{:.2f} predict:{} golden:{} correct:{} P:{:.2f} R:{:.2f} F1:{:.2f} NP:{:.2f} NR:{:.2f} NF1:{:.2f}'
        .format(accuracy, predict_args, golden_args,
                correct_args, P * 100, R * 100, F1 * 100, NP * 100, NR * 100,
                NF1 * 100))

    if output_to_file is not None:
        with open(output_to_file, 'a') as files:
            files.write(outs)

    return (P, R, F1, NP, NR, NF1)
Example #4
0
def eval_train_batch(epoch,
                     batch_i,
                     loss,
                     golden_batch,
                     predict_batch,
                     argument2idx,
                     output_to_file=None):
    """Score one training batch and log precision/recall/F for arguments.

    Args:
        epoch: current epoch number (logging only).
        batch_i: current batch index (logging only).
        loss: batch loss value (logging only).
        golden_batch: sequence of gold argument-label indices.
        predict_batch: sequence of predicted argument-label indices,
            parallel to ``golden_batch``.
        argument2idx: label-to-index map; must contain ``_PAD_``,
            ``_UNK_`` and the null label ``'_'``.
        output_to_file: optional path; when given, the summary line is
            appended to that file (assumes ``tprint`` returns the
            formatted line — verify against its definition).

    Returns:
        Tuple ``(correct_args, golden_args, predict_args)`` so the caller
        can accumulate counts across batches.
    """
    # Hoist the dict lookups that are invariant across the loop.
    pad_idx = argument2idx[_PAD_]
    unk_idx = argument2idx[_UNK_]
    null_idx = argument2idx['_']

    predict_args = 0
    golden_args = 0
    correct_args = 0
    num_correct = 0
    batch_total = 0
    for i in range(len(golden_batch)):
        pred_i = predict_batch[i]
        golden_i = golden_batch[i]
        # Padding positions are not scored at all.
        if golden_i == pad_idx:
            continue
        batch_total += 1
        # Unknown labels count as the null label on both sides.
        if pred_i == unk_idx:
            pred_i = null_idx
        if golden_i == unk_idx:
            golden_i = null_idx
        if pred_i != null_idx:
            predict_args += 1
        if golden_i != null_idx:
            golden_args += 1
        if golden_i != null_idx and pred_i == golden_i:
            correct_args += 1
        if pred_i == golden_i:
            num_correct += 1

    recall = correct_args / (golden_args + 1e-13)
    precision = correct_args / (predict_args + 1e-13)
    F = 2 * recall * precision / (recall + precision + 1e-13)

    # Guard the one unprotected division: an all-padding batch leaves
    # batch_total == 0, which previously raised ZeroDivisionError.
    accuracy = num_correct / batch_total * 100 if batch_total else 0.0

    outs = tprint(
        'epoch {} batch {} loss:{:4f} accurate:{:.2f} precision:{} recall:{} F:{}'
        .format(epoch, batch_i, loss, accuracy,
                precision, recall, F))

    if output_to_file is not None:
        with open(output_to_file, 'a') as out_file:
            out_file.write(outs)

    return correct_args, golden_args, predict_args
Example #5
0
        # NOTE(review): fragment — the enclosing function and loop start
        # before this chunk; `ele`, `train_tmp` and `result_data_path`
        # are bound above the visible lines.
        # Build (index, value) pairs for every element of `ele`.
        res_tmp = []
        for i,e in enumerate(ele):
            res_tmp.append([i,e])
        # Sort pairs by value and keep only the 300 smallest
        # (presumably the 300 nearest neighbours — TODO confirm).
        res_tmp = sorted(res_tmp,key = lambda x:x[1])[:300]
        # The comprehension's `ele` is local to it and shadows the outer
        # `ele`; this collects just the kept indices.
        train_tmp.append([ele[0] for ele in res_tmp])
    pickle_save(train_tmp,result_data_path )
     
if __name__ == '__main__':
    # CLI: the first argument selects which split to process
    # ('train' or 'dev'; the elif chain may continue past this chunk).
    target_name = sys.argv[1]
    if target_name == 'train':
        train = get_sentences('./data/train.txt')
        # train file is so large, we need to split them to 18 file for process
        # NOTE(review): this re-reads the same file already loaded into
        # `train` above — could reuse `train`; verify before changing.
        target_2 = get_sentences('./data/train.txt')
        cur_idx = 0
        # 179014 sentences total -> 18 chunks of 10000.
        while cur_idx < 179014:
            tprint('begin processing %d/18' % (cur_idx // 10000 + 1))
            target = target_2[cur_idx:cur_idx+10000]
            result = get_max_similar(target, train)
            # Intermediate files are keyed by the chunk's START INDEX
            # (0, 10000, 20000, ...).
            pickle_save(result, './temp/train-train-%d-result.bin' % (cur_idx))
            analyse_train_file('./temp/train-train-%d-result.bin' % (cur_idx), './temp/train-train-%d-part.bin' % (cur_idx))
            cur_idx += 10000
        total = []
        for i in range(18):
            # NOTE(review): parts were saved keyed by cur_idx (0, 10000, ...)
            # above, but are loaded here keyed by i (0..17) — the filenames
            # do not match, so this load likely fails. Confirm the intended
            # key ('%d' % (i * 10000) would match the writer).
            total.extend(pickle_load('./temp/train-train-%d-part.bin' % (i)))
        pickle_save(total, './temp/train_train.bin')

    elif target_name == 'dev':
        # Dev split is small enough to process in a single pass.
        target = get_sentences('./data/dev.txt')
        train = get_sentences('./data/train.txt')
        result = get_max_similar(target, train)
        pickle_save(result, './temp/dev-train-result.bin')