Example #1
        dev_best_score = None
        test_best_score = None
        test_ood_best_score = None

        for epoch in range(0):

            epoch_start = time.time()
            for batch_i, train_input_data in enumerate(
                    inter_utils.get_batch(train_dataset,
                                          batch_size,
                                          word2idx,
                                          fr_word2idx,
                                          lemma2idx,
                                          pos2idx,
                                          pretrain2idx,
                                          fr_pretrain2idx,
                                          deprel2idx,
                                          argument2idx,
                                          idx2word,
                                          shuffle=True,
                                          withParrallel=True)):
                srl_model.train()
                target_argument = train_input_data['argument']
                flat_argument = train_input_data['flat_argument']
                target_batch_variable = get_torch_variable_from_np(
                    flat_argument)

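                # forward pass with ELMo embeddings; withParallel is switched off for this step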
                out = srl_model(train_input_data,
                                elmo,
                                withParallel=False,
Example #2
        log('\nStart training...')

        dev_best_score = None
        test_best_score = None
        test_ood_best_score = None

        for epoch in range(max_epoch):

            epoch_start = time.time()
            for batch_i, train_input_data in enumerate(
                    inter_utils.get_batch(train_dataset,
                                          batch_size,
                                          word2idx,
                                          lemma2idx,
                                          pos2idx,
                                          pretrain2idx,
                                          deprel2idx,
                                          argument2idx,
                                          idx2word,
                                          shuffle=True)):

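                # gold labels for the argument task and the auxiliary POS / predicate-flag / dependency-relation tasks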
                target_argument = train_input_data['argument']

                flat_argument = train_input_data['flat_argument']

                gold_pos = train_input_data['gold_pos']

                gold_PI = train_input_data['predicates_flag']

                gold_deprel = train_input_data['sep_dep_rel']
Example #3
def eval_data(model,
              elmo,
              dataset,
              batch_size,
              word2idx,
              fr_word2idx,
              lemma2idx,
              pos2idx,
              pretrain2idx,
              fr_pretrain2idx,
              deprel2idx,
              argument2idx,
              idx2argument,
              idx2word,
              unify_pred=False,
              predicate_correct=0,
              predicate_sum=0,
              isPretrain=False):

    model.eval()
    golden = []
    predict = []

    output_data = []
    cur_sentence = None
    cur_sentence_data = None

    for batch_i, input_data in enumerate(inter_utils.get_batch(dataset, batch_size, word2idx, fr_word2idx,
                                                             lemma2idx, pos2idx, pretrain2idx,
                                                             fr_pretrain2idx, deprel2idx, argument2idx, idx2word, lang="En")):
        target_flags = input_data['sen_flags']
        flat_argument = input_data['flat_argument']
        target_batch_variable = get_torch_variable_from_np(flat_argument)

        sentence_id = input_data['sentence_id']
        predicate_id = input_data['predicate_id']
        word_id = input_data['word_id']
        sentence_len = input_data['sentence_len']
        seq_len = input_data['seq_len']
        bs = input_data['batch_size']
        psl = input_data['pad_seq_len']
        out = model(input_data, lang='En')

        _, pred = torch.max(out, 1)
        pred = get_data(pred)
        pred = np.reshape(pred, target_flags.shape)

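        # collect per-token predictions and gold flags for later F1 scoring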
        for idx in range(pred.shape[0]):
            predict.append(list(pred[idx]))
            golden.append(list(target_flags[idx]))

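        # write each prediction back to its word position in the sentence; unfilled positions keep the '_' placeholder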
        pre_data = []
        for b in range(len(seq_len)):
            line_data = ['_' for _ in range(sentence_len[b])]
            for s in range(seq_len[b]):
                wid = word_id[b][s]
                #line_data[wid-1] = idx2argument[pred[b][s]]
                line_data[wid - 1] = str(pred[b][s])
            pre_data.append(line_data)

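        # group per-predicate prediction columns by sentence id so each sentence forms one output block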
        for b in range(len(sentence_id)):
            if cur_sentence != sentence_id[b]:
                if cur_sentence_data is not None:
                    output_data.append(cur_sentence_data)
                cur_sentence_data = [[sentence_id[b]]*len(pre_data[b]),pre_data[b]]
                cur_sentence = sentence_id[b]
            else:
                assert cur_sentence_data is not None
                cur_sentence_data.append(pre_data[b])

    if cur_sentence_data is not None and len(cur_sentence_data)>0:
        output_data.append(cur_sentence_data)
    
    score = sem_f1_score(golden, predict, argument2idx, unify_pred, predicate_correct, predicate_sum)
    """
    P = correct_pos / NonullPredict_pos
    R = correct_pos / NonullTruth_pos
    F = 2 * P * R / (P + R)
    log("POS: ", P, R, F)

    P = correct_PI / NonullPredict_PI
    R = correct_PI / NonullTruth_PI
    F = 2 * P * R / (P + R)
    log("PI: ", P, R, F)

    P = correct_deprel / NonullPredict_deprel
    R = correct_deprel / NonullTruth_deprel
    F = 2 * P * R / (P + R)
    log("deprel: ", P, R, F)

    P = correct_link / NonullPredict_link
    R = correct_link / NonullTruth_link
    F = 2 * P * R / (P + R)
    log(correct_link, NonullPredict_link, NonullTruth_link)
    log("link: ", P, R, F)
    """

    model.train()

    return score, output_data
Example #4
def train_1_epoc(srl_model,
                 criterion,
                 optimizer,
                 train_dataset,
                 labeled_dataset_fr,
                 batch_size,
                 word2idx,
                 fr_word2idx,
                 lemma2idx,
                 pos2idx,
                 pretrain2idx,
                 fr_pretrain2idx,
                 deprel2idx,
                 argument2idx,
                 idx2word,
                 shuffle=False,
                 lang='En',
                 dev_best_score=None,
                 test_best_score=None,
                 test_ood_best_score=None):
    for batch_i, train_input_data in enumerate(
            inter_utils.get_batch(train_dataset,
                                  batch_size,
                                  word2idx,
                                  fr_word2idx,
                                  lemma2idx,
                                  pos2idx,
                                  pretrain2idx,
                                  fr_pretrain2idx,
                                  deprel2idx,
                                  argument2idx,
                                  idx2word,
                                  shuffle=shuffle,
                                  lang=lang)):

        flat_argument = train_input_data['flat_argument']
        target_batch_variable = get_torch_variable_from_np(flat_argument)

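        # the model returns two score tensors; both are trained against the flattened argument labels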
        out, out_word = srl_model(train_input_data, lang='En')
        loss = criterion(out, target_batch_variable)
        loss_word = criterion(out_word, target_batch_variable)
        if batch_i % 50 == 0:
            log(batch_i, loss, loss_word)

        optimizer.zero_grad()
        (loss + loss_word).backward()
        optimizer.step()

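        # periodic logging: evaluate on the current training batch and on the labeled French set, keeping the best dev score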
        if batch_i > 0 and batch_i % show_steps == 0:

            _, pred = torch.max(out, 1)

            pred = get_data(pred)

            # pred = pred.reshape([bs, sl])

            log('\n')
            log('*' * 80)

            eval_train_batch(epoch, batch_i, loss.data[0], flat_argument, pred,
                             argument2idx)

            log('FR test:')
            score, dev_output = eval_data(srl_model,
                                          elmo,
                                          labeled_dataset_fr,
                                          batch_size,
                                          word2idx,
                                          fr_word2idx,
                                          lemma2idx,
                                          pos2idx,
                                          pretrain2idx,
                                          fr_pretrain2idx,
                                          deprel2idx,
                                          argument2idx,
                                          idx2argument,
                                          idx2word,
                                          False,
                                          dev_predicate_correct,
                                          dev_predicate_sum,
                                          lang='Fr')

            if dev_best_score is None or score[5] > dev_best_score[5]:
                dev_best_score = score
                output_predict(
                    os.path.join(
                        result_path, 'dev_argument_{:.2f}.pred'.format(
                            dev_best_score[2] * 100)), dev_output)
                # torch.save(srl_model, os.path.join(os.path.dirname(__file__),'model/best_{:.2f}.pkl'.format(dev_best_score[2]*100)))
            log('\tdev best P:{:.2f} R:{:.2f} F1:{:.2f} NP:{:.2f} NR:{:.2f} NF1:{:.2f}'
                .format(dev_best_score[0] * 100, dev_best_score[1] * 100,
                        dev_best_score[2] * 100, dev_best_score[3] * 100,
                        dev_best_score[4] * 100, dev_best_score[5] * 100))
    return dev_best_score
Example #5
def parallel_train_1_epoc(srl_model,
                          criterion,
                          optimizer,
                          train_dataset,
                          labeled_dataset_fr,
                          batch_size,
                          word2idx,
                          fr_word2idx,
                          lemma2idx,
                          pos2idx,
                          pretrain2idx,
                          fr_pretrain2idx,
                          deprel2idx,
                          argument2idx,
                          idx2word,
                          shuffle=False,
                          lang='En',
                          dev_best_score=None,
                          test_best_score=None,
                          test_ood_best_score=None):
    unlabeled_dataset_en, unlabeled_dataset_fr = train_dataset
    unlabeled_Generator_En = inter_utils.get_batch(unlabeled_dataset_en,
                                                   batch_size,
                                                   word2idx,
                                                   fr_word2idx,
                                                   lemma2idx,
                                                   pos2idx,
                                                   pretrain2idx,
                                                   fr_pretrain2idx,
                                                   deprel2idx,
                                                   argument2idx,
                                                   idx2word,
                                                   shuffle=False,
                                                   lang="En")

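    # iterate over unlabeled French batches, drawing a matching unlabeled English batch each step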
    for batch_i, unlabeled_data_fr in enumerate(
            inter_utils.get_batch(unlabeled_dataset_fr,
                                  batch_size,
                                  word2idx,
                                  fr_word2idx,
                                  lemma2idx,
                                  pos2idx,
                                  pretrain2idx,
                                  fr_pretrain2idx,
                                  deprel2idx,
                                  argument2idx,
                                  idx2word,
                                  shuffle=False,
                                  lang='Fr')):
        srl_model.train()
        unlabeled_data_en = next(unlabeled_Generator_En)

        predicates_1D = unlabeled_data_en['predicates_idx']
        predicates_1D_fr = unlabeled_data_fr['predicates_idx']
        #log(predicates_1D, predicates_1D_fr)
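        # the model consumes the paired En/Fr batch and returns an unsupervised loss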
        u_loss = srl_model((unlabeled_data_en, unlabeled_data_fr),
                           lang='En',
                           unlabeled='True')
        optimizer.zero_grad()
        u_loss.backward()
        optimizer.step()

        if batch_i % 50 == 0:
            log(batch_i, u_loss)

        if batch_i % 500 == 0:
            log('\n')
            log('*' * 80)
            srl_model.eval()
            # eval_train_batch(epoch, batch_i, loss.data[0], flat_argument, pred, argument2idx)

            log('FR test:')
            score, dev_output = eval_data(srl_model,
                                          elmo,
                                          labeled_dataset_fr,
                                          30,
                                          word2idx,
                                          fr_word2idx,
                                          lemma2idx,
                                          pos2idx,
                                          pretrain2idx,
                                          fr_pretrain2idx,
                                          deprel2idx,
                                          argument2idx,
                                          idx2argument,
                                          idx2word,
                                          False,
                                          dev_predicate_correct,
                                          dev_predicate_sum,
                                          lang='Fr')

            if dev_best_score is None or score[5] > dev_best_score[5]:
                dev_best_score = score
                output_predict(
                    os.path.join(
                        result_path, 'dev_argument_{:.2f}.pred'.format(
                            dev_best_score[2] * 100)), dev_output)
                # torch.save(srl_model, os.path.join(os.path.dirname(__file__),'model/best_{:.2f}.pkl'.format(dev_best_score[2]*100)))
            log('\tdev best P:{:.2f} R:{:.2f} F1:{:.2f} NP:{:.2f} NR:{:.2f} NF1:{:.2f}'
                .format(dev_best_score[0] * 100, dev_best_score[1] * 100,
                        dev_best_score[2] * 100, dev_best_score[3] * 100,
                        dev_best_score[4] * 100, dev_best_score[5] * 100))
    return dev_best_score
Example #6
        print(srl_model)

        print('\t model build finished! consuming {} s'.format(
            int(time.time() - start_t)))

        print('\nStart training...')
        dev_best_score = None
        test_best_score = None
        test_ood_best_score = None
        use_bert = True

        unlabeled_Generator_En = inter_utils.get_batch(unlabeled_dataset_en,
                                                       batch_size,
                                                       argument2idx,
                                                       shuffle=False,
                                                       lang="En",
                                                       use_bert=use_bert,
                                                       para=True)
        unlabeled_Generator_Fr = inter_utils.get_batch(unlabeled_dataset_fr,
                                                       batch_size,
                                                       argument2idx,
                                                       shuffle=False,
                                                       lang="Fr",
                                                       use_bert=use_bert,
                                                       para=True)

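        # the unlabeled En/Fr generators are created once here, while labeled batches are recreated inside each epoch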
        for epoch in range(30):

            for batch_i, train_input_data in enumerate(
                    inter_utils.get_batch(train_dataset,
Example #7
        log('\t model build finished! consuming {} s'.format(
            int(time.time() - start_t)))

        log('\nStart training...')

        dev_best_score = None
        test_best_score = None
        test_ood_best_score = None
        unlabeled_Generator_En = inter_utils.get_batch(unlabeled_dataset_en,
                                                       batch_size,
                                                       word2idx,
                                                       fr_word2idx,
                                                       lemma2idx,
                                                       pos2idx,
                                                       pretrain2idx,
                                                       fr_pretrain2idx,
                                                       deprel2idx,
                                                       argument2idx,
                                                       idx2word,
                                                       shuffle=False,
                                                       lang="En")
        unlabeled_Generator_Fr = inter_utils.get_batch(unlabeled_dataset_fr,
                                                       batch_size,
                                                       word2idx,
                                                       fr_word2idx,
                                                       lemma2idx,
                                                       pos2idx,
                                                       pretrain2idx,
                                                       fr_pretrain2idx,
                                                       deprel2idx,
Example #8
        dev_best_score = None
        test_best_score = None
        test_ood_best_score = None

        for epoch in range(max_epoch):

            epoch_start = time.time()
            for batch_i, train_input_data in enumerate(
                    inter_utils.get_batch(train_dataset,
                                          batch_size,
                                          word2idx,
                                          fr_word2idx,
                                          lemma2idx,
                                          pos2idx,
                                          pretrain2idx,
                                          fr_pretrain2idx,
                                          deprel2idx,
                                          argument2idx,
                                          idx2word,
                                          shuffle=False,
                                          withParrallel=True)):

                target_argument = train_input_data['argument']

                flat_argument = train_input_data['flat_argument']

                gold_pos = train_input_data['gold_pos']

                gold_PI = train_input_data['predicates_flag']
Example #9
def pruning_eval_data(model,
                      elmo,
                      dataset,
                      batch_size,
                      out_of_pruning,
                      word2idx,
                      lemma2idx,
                      pos2idx,
                      pretrain2idx,
                      deprel2idx,
                      argument2idx,
                      idx2argument,
                      unify_pred=False,
                      predicate_correct=0,
                      predicate_sum=0):

    model.eval()
    golden = []
    predict = []

    output_data = []
    cur_sentence = None
    cur_sentence_data = None

    for batch_i, input_data in enumerate(
            inter_utils.get_batch(dataset, batch_size, word2idx, lemma2idx,
                                  pos2idx, pretrain2idx, deprel2idx,
                                  argument2idx)):

        target_argument = input_data['argument']

        flat_argument = input_data['flat_argument']

        target_batch_variable = get_torch_variable_from_np(flat_argument)

        sentence_id = input_data['sentence_id']
        predicate_id = input_data['predicate_id']
        word_id = input_data['word_id']
        sentence_len = input_data['sentence_len']
        seq_len = input_data['seq_len']
        bs = input_data['batch_size']
        psl = input_data['pad_seq_len']

        out = model(input_data, elmo)

        # loss = criterion(out, target_batch_variable)

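        # take the highest-scoring argument label for every token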
        _, pred = torch.max(out, 1)

        pred = get_data(pred)

        # print(target_argument.shape)
        # print(pred.shape)

        pred = np.reshape(pred, target_argument.shape)

        # golden += flat_argument.tolist()
        # predict += pred

        for idx in range(pred.shape[0]):
            predict.append(list(pred[idx]))
            golden.append(list(target_argument[idx]))

        pre_data = []
        for b in range(len(seq_len)):
            line_data = ['_' for _ in range(sentence_len[b])]
            for s in range(seq_len[b]):
                wid = word_id[b][s]
                line_data[wid - 1] = idx2argument[pred[b][s]]
            pre_data.append(line_data)

        for b in range(len(sentence_id)):
            if cur_sentence != sentence_id[b]:
                if cur_sentence_data is not None:
                    output_data.append(cur_sentence_data)
                cur_sentence_data = [[sentence_id[b]] * len(pre_data[b]),
                                     pre_data[b]]
                cur_sentence = sentence_id[b]
            else:
                assert cur_sentence_data is not None
                cur_sentence_data.append(pre_data[b])

    if cur_sentence_data is not None and len(cur_sentence_data) > 0:
        output_data.append(cur_sentence_data)

    score = pruning_sem_f1_score(golden, predict, out_of_pruning, argument2idx,
                                 unify_pred, predicate_correct, predicate_sum)

    model.train()

    return score, output_data
Example #10
        dev_best_score = None
        test_best_score = None
        test_ood_best_score = None

        for epoch in range(max_epoch):

            epoch_start = time.time()
            for batch_i, train_input_data in enumerate(
                    inter_utils.get_batch(train_dataset,
                                          batch_size,
                                          word2idx,
                                          fr_word2idx,
                                          lemma2idx,
                                          pos2idx,
                                          pretrain2idx,
                                          fr_pretrain2idx,
                                          deprel2idx,
                                          argument2idx,
                                          idx2word,
                                          shuffle=False,
                                          lang='Fr')):

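                # this variant trains on the flattened flag targets ('flat_flags') instead of argument labels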
                flat_flags = train_input_data['flat_flags']
                target_batch_variable = get_torch_variable_from_np(flat_flags)

                bs = train_input_data['batch_size']
                sl = train_input_data['seq_len']
                out = srl_model(train_input_data, lang='En')
                loss = criterion(out, target_batch_variable)
                if batch_i % 50 == 0:
Example #11
def my_eval(epoch,
            step,
            model,
            criterion,
            dataset,
            batch_size,
            predicate_correct,
            predicate_sum,
            word2idx,
            lemma2idx,
            pos2idx,
            pretrain2idx,
            argument2idx,
            use_attention=False,
            aug_data=None):
    model.eval()
    gold = predict = correct = total = num_correct = 0
    for batch_i, input_data in enumerate(
            inter_utils.get_batch(dataset, batch_size, word2idx, lemma2idx,
                                  pos2idx, pretrain2idx, argument2idx)):
        target_argument = input_data["argument"]
        flat_argument = input_data["flat_argument"]
        target_batch_variable = get_torch_variable_from_np(flat_argument)

        if use_attention:
            out = model((input_data, aug_data))
        else:
            out = model(input_data)

        _, pred = torch.max(out, 1)

        pred = get_data(pred)

        loss = criterion(out, target_batch_variable)
        cur_correct, cur_golden, cur_predict, cur_total, cur_nc = my_eval_batch(
            flat_argument, pred, argument2idx)
        gold += cur_golden
        predict += cur_predict
        correct += cur_correct
        total += cur_total
        num_correct += cur_nc

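    # P/R/F1 fold in the predicate counts; the N-prefixed scores cover arguments only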
    P = (correct + predicate_correct) / (predict + predicate_sum + 1e-13)
    R = (correct + predicate_correct) / (gold + predicate_sum + 1e-13)
    NP = correct / (predict + 1e-13)
    NR = correct / (gold + 1e-13)
    F1 = 2 * P * R / (P + R + 1e-13)
    NF1 = 2 * NP * NR / (NP + NR + 1e-13)
    acc = num_correct / total
    print_dict = {
        'e': epoch,
        's': step,
        'acc': acc,
        'p': predict,
        'g': gold,
        'c': correct,
        'P': P,
        'R': R,
        'F1': F1,
        'NP': NP,
        'NR': NR,
        'NF1': NF1
    }
    print_result(print_dict)
    model.train()
    return print_dict
Example #12
def eval_data(model,
              elmo,
              dataset,
              batch_size,
              word2idx,
              lemma2idx,
              pos2idx,
              pretrain2idx,
              deprel2idx,
              argument2idx,
              idx2argument,
              idx2word,
              unify_pred=False,
              predicate_correct=0,
              predicate_sum=0):

    model.eval()
    golden = []
    predict = []

    output_data = []
    cur_sentence = None
    cur_sentence_data = None

    correct_pos, NonullPredict_pos, NonullTruth_pos = 0.1, 0.1, 0.
    correct_PI, NonullPredict_PI, NonullTruth_PI = 0., 0., 0.
    correct_deprel, NonullPredict_deprel, NonullTruth_deprel = 0., 0., 0.
    correct_link, NonullPredict_link, NonullTruth_link = 0., 0., 0.

    for batch_i, input_data in enumerate(
            inter_utils.get_batch(dataset, batch_size, word2idx, lemma2idx,
                                  pos2idx, pretrain2idx, deprel2idx,
                                  argument2idx, idx2word)):

        target_argument = input_data['argument']

        flat_argument = input_data['flat_argument']

        gold_pos = input_data['gold_pos']

        gold_PI = input_data['predicates_flag']

        gold_deprel = input_data['sep_dep_rel']

        gold_link = input_data['sep_dep_link']

        target_batch_variable = get_torch_variable_from_np(flat_argument)
        gold_pos_batch_variable = get_torch_variable_from_np(gold_pos)
        gold_PI_batch_variable = get_torch_variable_from_np(gold_PI)
        gold_deprel_batch_variable = get_torch_variable_from_np(gold_deprel)
        gold_link_batch_variable = get_torch_variable_from_np(gold_link)

        sentence_id = input_data['sentence_id']
        predicate_id = input_data['predicate_id']
        word_id = input_data['word_id']
        sentence_len = input_data['sentence_len']
        seq_len = input_data['seq_len']
        bs = input_data['batch_size']
        psl = input_data['pad_seq_len']

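        # joint forward pass: argument scores plus auxiliary POS, predicate-flag, dependency-relation and dependency-link outputs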
        out, out_pos, out_PI, out_deprel, out_link = model(input_data, elmo)

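        # accumulate correct / non-null-predicted / non-null-gold counts for each auxiliary task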
        a, b, c = get_PRF(out_pos, gold_pos_batch_variable.view(-1))
        correct_pos += a
        NonullPredict_pos += b
        NonullTruth_pos += c

        a, b, c = get_PRF(out_PI, gold_PI_batch_variable.view(-1))
        correct_PI += a
        NonullPredict_PI += b
        NonullTruth_PI += c

        a, b, c = get_PRF(out_deprel, gold_deprel_batch_variable.view(-1))
        correct_deprel += a
        NonullPredict_deprel += b
        NonullTruth_deprel += c

        a, b, c = get_PRF(out_link, gold_link_batch_variable.view(-1))
        correct_link += a
        NonullPredict_link += b
        NonullTruth_link += c

        _, pred = torch.max(out, 1)

        pred = get_data(pred)

        pred = np.reshape(pred, target_argument.shape)

        for idx in range(pred.shape[0]):
            predict.append(list(pred[idx]))
            golden.append(list(target_argument[idx]))

        pre_data = []
        for b in range(len(seq_len)):
            line_data = ['_' for _ in range(sentence_len[b])]
            for s in range(seq_len[b]):
                wid = word_id[b][s]
                line_data[wid - 1] = idx2argument[pred[b][s]]
            pre_data.append(line_data)

        for b in range(len(sentence_id)):
            if cur_sentence != sentence_id[b]:
                if cur_sentence_data is not None:
                    output_data.append(cur_sentence_data)
                cur_sentence_data = [[sentence_id[b]] * len(pre_data[b]),
                                     pre_data[b]]
                cur_sentence = sentence_id[b]
            else:
                assert cur_sentence_data is not None
                cur_sentence_data.append(pre_data[b])

    if cur_sentence_data is not None and len(cur_sentence_data) > 0:
        output_data.append(cur_sentence_data)

    score = sem_f1_score(golden, predict, argument2idx, unify_pred,
                         predicate_correct, predicate_sum)
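
    # precision/recall/F1 for each auxiliary prediction task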
    P = correct_pos / NonullPredict_pos
    R = correct_pos / NonullTruth_pos
    F = 2 * P * R / (P + R)
    log("POS: ", P, R, F)

    P = correct_PI / NonullPredict_PI
    R = correct_PI / NonullTruth_PI
    F = 2 * P * R / (P + R)
    log("PI: ", P, R, F)

    P = correct_deprel / NonullPredict_deprel
    R = correct_deprel / NonullTruth_deprel
    F = 2 * P * R / (P + R)
    log("deprel: ", P, R, F)

    P = correct_link / NonullPredict_link
    R = correct_link / NonullTruth_link
    F = 2 * P * R / (P + R)
    log(correct_link, NonullPredict_link, NonullTruth_link)
    log("link: ", P, R, F)

    model.train()

    return score, output_data