Example #1
def test_entities_at_the_end():
    words = "Shyam lives in New York".split()
    gold = "B-PER O O B-LOC I-LOC".split()
    pred = "B-PER O O B-LOC O".split()

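    # conlleval.evaluate consumes an iterable of "token gold pred" lines, so each triple is joined with spaces.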
    print("Input gold. This should be perfect.")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 1.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0

    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, pred, gold)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.5
    assert by_type["PER"].fscore == 1.0
    assert by_type["LOC"].fscore == 0.0
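Most of the examples below build the same line-based input for conlleval: one "token gold pred" entry per token, with an empty entry marking a sentence boundary. A minimal standalone sketch of that format (not taken from any repository above; it assumes the conlleval.py port exposing evaluate, metrics and report is importable):

from conlleval import evaluate, metrics, report  # assumed import of the conlleval.py port

lines = [
    "Shyam B-PER B-PER",
    "lives O O",
    "in O O",
    "New B-LOC B-LOC",
    "York I-LOC O",
    "",  # sentence boundary
]
counts = evaluate(lines)            # chunk-level tp/fp/fn counts
overall, by_type = metrics(counts)  # namedtuples with prec, rec and fscore fields
report(counts)
print(overall.prec, overall.rec, overall.fscore)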
Example #2
def test(data):
    if not os.path.exists(RESULT_PATH):
        os.makedirs(RESULT_PATH)

    with tf.Graph().as_default() as g:
        x = tf.placeholder(tf.int32, [None, None])
        y_ = tf.placeholder(tf.int64, [None, None])

        word_emb = tf.Variable(datautil._word_emb,
                               dtype=tf.float32,
                               name='word_emb')
        x_emb = tf.nn.embedding_lookup(word_emb, x)

        y = ner_forward.forward(x_emb, is_train=False, regularizer=None)
        predict = tf.argmax(y, -1)

        saver = tf.train.Saver()

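        # Pad every token-id sequence to MAX_SEQ_LEN so the whole test set can be fed as one batch.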
        x_batch = []
        for i in range(len(data)):
            pad_lst = [0] * (MAX_SEQ_LEN - len(data[i][0]))
            x_pad = data[i][0] + pad_lst
            x_batch.append(x_pad)

        while True:
            with tf.Session() as sess:
                # get_checkpoint_state returns a CheckpointState proto;
                # ckpt.model_checkpoint_path is the path string of the latest checkpoint.
                # tf.train.latest_checkpoint would return the same path.
                ckpt = tf.train.get_checkpoint_state(
                    ner_backward.MODEL_SAVE_PATH)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(sess, ckpt.model_checkpoint_path)
                    # Parse the checkpoint filename to extract the global step number
                    global_step = ckpt.model_checkpoint_path.split(
                        '/')[-1].split('-')[-1]
                    predict_id = sess.run(predict, feed_dict={x: x_batch})

                    filename = os.path.join(RESULT_PATH, 'ner.result')
                    fw = open(filename, 'w')
                    for i in range(len(data)):
                        fw.write('{} {} {}\n'.format("<S>", "O", "O"))
                        for j in range(len(data[i][0])):
                            word = data[i][2][j]
                            predict_str = datautil.id2label(predict_id[i][j])
                            label_str = datautil.id2label(data[i][1][j])
                            fw.write('{} {} {}\n'.format(
                                word, label_str, predict_str))
                        fw.write('{} {} {}\n\n'.format("<E>", "O", "O"))
                    fw.close()
                    print("After %s training step(s), test result is:" %
                          (global_step))
                    conlleval.evaluate(filename)
            time.sleep(TEST_INTERVAL_SECS)
Example #3
    def eval(self, sess, dataset):
        IOB_output = []
        for batch in dataset:
            input_sents, input_words, input_tags, input_lengths, input_words_lens = batch
            feed_dict = {
                self.input_sents: input_sents,
                self.input_words: input_words,
                self.input_tags: input_tags,
                self.input_lengths: input_lengths,
                self.input_words_lens: input_words_lens,
                self.dropout_keep_prob: 1.0
            }

            predicted_seqs = sess.run(self.predicted_seqs, feed_dict=feed_dict)

            batch_size = predicted_seqs.shape[0]

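            # Emit one "- - gold pred" line per token; conlleval reads only the last two columns.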
            for i in range(batch_size):
                seq_length = input_lengths[i]
                to_tag = FLAGS.id_to_tags_map
                for j in range(seq_length):
                    line = ['-', '-', to_tag[input_tags[i, j]],
                            to_tag[predicted_seqs[i, j]]]
                    IOB_output.append(' '.join(line))
                IOB_output.append('\n')

        print(len(IOB_output))
        print(IOB_output[:10])

        return conlleval.evaluate(IOB_output)
Example #4
def conll_summary(tokens, gold, pred, config):
    """Return string summarizing performance using CoNLL criteria."""
    index_to_label = {v: k for k, v in config.label_to_index.items()}

    acc = accuracy(gold, pred)
    # Materialize as lists so the length check below works in Python 3.
    gold = [index_to_label[i] for i in as_dense(gold)]
    pred = [index_to_label[i] for i in as_dense(pred)]

    # Format as space-separated (token, gold, pred) strings for CoNLL eval.
    if len(tokens) != len(gold) or len(gold) != len(pred):
        raise ValueError('counts do not match')
    formatted = [' '.join(t) for t in zip(tokens, gold, pred)]

    o, by_type = conlleval.metrics(conlleval.evaluate(formatted))
    nlen = max(len(name) for name in by_type.keys())
    summaries = [
        '%.2f%% acc %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' %
        (100. * acc, 100. * o.fscore, 100. * o.prec, 100. * o.rec, o.tp, o.fp,
         o.fn)
    ]
    config.results_log[config.model_name_log][
        config.dataset_name_log] = o.fscore
    for name, r in sorted(by_type.items()):
        summaries.append('%*s %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' %
                         (nlen, name, 100. * r.fscore, 100. * r.prec,
                          100. * r.rec, r.tp, r.fp, r.fn))

    return '\n'.join(summaries)
Example #5
    def validate_one_batch(self, test_batch, task_name, log_writer, epoch):
        S, Q = test_batch
        Q_tag_ids = Q['tag_ids']
        S_tag_ids = S['tag_ids']
        Q_seq_len_list = Q['lens']
        Q_seq_len_list_plus2 = [x + 2 for x in Q_seq_len_list]
        Q_tag_ids_padded = pad_tag_ids(Q_tag_ids)
        S_tag_ids_padded = pad_tag_ids(S_tag_ids)
        Q['tag_ids'] = Q_tag_ids_padded
        S['tag_ids'] = S_tag_ids_padded

        logits = self([S, Q])
        loss = self.crf_loss(logits, Q_tag_ids_padded, Q_seq_len_list_plus2)
        pred_tags, pred_best_score = crf.crf_decode(
            potentials=logits,
            transition_params=self.trans_p,
            sequence_length=Q_seq_len_list_plus2)
        pred_tags_masked = seq_masking(pred_tags, Q_seq_len_list_plus2)
        p_tags_char, _ = get_id2tag_V2(pred_tags_masked,
                                       Q_seq_len_list_plus2,
                                       taskname=task_name)
        t_tags_char, _ = get_id2tag_V2(Q_tag_ids_padded,
                                       Q_seq_len_list_plus2,
                                       taskname=task_name)
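        # Entity-level scoring of the decoded query tags against the padded gold tags.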
        (P, R, F1), _ = evaluate(t_tags_char, p_tags_char, verbose=True)
        write_to_log(loss, P, R, F1, t_tags_char, log_writer, epoch)
        return (loss, pred_tags_masked, Q_tag_ids_padded, P, R, F1)
Example #6
def evaluate(model, x_test, y_test, labels, MAX_SEQUENCE_LENGTH):

    total = []
    for lang in get_lang_list():  # x_test.keys():
        y_pred = model.predict(x_test[lang])
        pred_tags_all = []
        true_tags_all = []
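        # Flatten per-token predictions, skipping positions whose gold label is [PAD] or [CLS].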
        for i, seq in enumerate(y_pred):
            for j in range(MAX_SEQUENCE_LENGTH):
                indx = np.argmax(y_test[lang][i][j])
                true_label = labels[indx]
                if "[PAD]" in true_label or "[CLS]" in true_label in true_label:
                    continue
                true_tags_all.append(true_label)
                indx = np.argmax(seq[j])
                pred_label = labels[indx]
                pred_tags_all.append(pred_label)
        prec, rec, f1 = conlleval.evaluate(true_tags_all,
                                           pred_tags_all,
                                           verbose=False)
        print("Lang {} scores {} {} {}".format(lang, prec, rec, f1))
        total.append(f1)
    print("All f-scores {}".format(total))
    print("Overall average f-score mean {} and variance {}".format(
        np.mean(total), np.var(total)))
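Note that this example, like #10, #18, #20 and #22 below, uses a conlleval variant whose evaluate takes parallel lists of gold and predicted tag strings and returns (prec, rec, f1) directly, rather than an iterable of formatted lines. A minimal sketch under that assumption (not taken from any repository above):

from conlleval import evaluate  # assumed: a port with evaluate(true_seqs, pred_seqs, verbose)

true_tags = "B-PER O O B-LOC I-LOC".split()
pred_tags = "B-PER O O B-LOC O".split()

prec, rec, f1 = evaluate(true_tags, pred_tags, verbose=False)
print(prec, rec, f1)  # entity-level precision, recall and F1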
Example #7
    def eval(self, sess, dataset):
        iterator = dataset.make_one_shot_iterator()
        next_batch_op = iterator.get_next()
        IOB_output = []
        while True:
            try:
                next_batch = sess.run(next_batch_op)
                input_x, input_y, input_lengths = next_batch
            except tf.errors.OutOfRangeError:
                # Raised by the one-shot iterator once the dataset is exhausted.
                break
            feed_dict = {
                self.input_sents: input_x,
                self.input_lengths: input_lengths,
                self.dropout_keep_prob: 1.0
            }
            predicted_seqs = sess.run(self.predicted_seqs, feed_dict=feed_dict)

            batch_size = predicted_seqs.shape[0]

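            # Build "- - gold pred" lines per token; a blank entry separates sentences for conlleval.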
            for i in range(batch_size):
                seq_length = input_lengths[i]
                to_tag = FLAGS.id_to_tag_map
                for j in range(seq_length):
                    line = ['-', '-', to_tag[input_y[i, j]],
                            to_tag[predicted_seqs[i, j]]]
                    IOB_output.append(' '.join(line))
                IOB_output.append('\n')

        print(len(IOB_output))
        print(IOB_output[:10])

        return conlleval.evaluate(IOB_output)
Example #8
    def chunking_eval(self, dataloader):
        self.coarse_tagger.eval()
        binary_preds, binary_golds = [], []
        pbar = tqdm(enumerate(dataloader), total=len(dataloader))
        for i, (X, lengths, y_0, y_bin, y_final, y_dm) in pbar:
            binary_golds.extend(y_0)

            X, lengths = X.cuda(), lengths.cuda()
            preds = self.coarse_tagger.chunking(X, y_0, True, lengths)

            binary_preds.extend(preds)

        binary_preds = np.concatenate(binary_preds, axis=0)
        binary_preds = list(binary_preds)
        binary_golds = np.concatenate(binary_golds, axis=0)
        binary_golds = list(binary_golds)

        _bin_pred = []
        _bin_gold = []

        temp = {"B": "B-A", "I": "I-A", "O": "O"}

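        # Map binary B/I/O tags onto a single pseudo entity type "A" so conlleval can compute chunk-level F1.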
        for bin_pred, bin_gold in zip(binary_preds, binary_golds):

            bin_slot_pred = y0_set[bin_pred]
            bin_slot_gold = y0_set[bin_gold]

            _bin_gold.append(temp[bin_slot_gold])
            _bin_pred.append(temp[bin_slot_pred])

        (pre, rec, f1), d = conlleval.evaluate(_bin_gold, _bin_pred, logger)
        return f1
Example #9
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A List of which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) of a batch.
    :param idx_token: Index to token dictionary.
    :param idx_label: Index to label dictionary.
    :param writer: An object (file object) with a write() function. Extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b, len_b,
                                                     tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK_INDEX)
                outputs.append('{} {} {}'.format(token, idx_label.get(g, 0),
                                                 idx_label.get(p, 0)))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
Example #10
def evaluate(model, dataset, word_vocab, label_vocab):
    model.eval()
    losses = []
    scores = []
    true_tags = []
    pred_tags = []
    sents = []
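    # Decode each sentence individually, collecting gold and predicted tag strings plus the tokens.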
    for i, (sent, tags) in enumerate(dataset):
        with torch.no_grad():
            sent, tags = sent.to(device), tags.to(device)
            sent = sent.unsqueeze(0)
            tags = tags.unsqueeze(0)
            losses.append(model.loss(sent, tags).cpu().detach().item())
            score, pred_tag_seq = model(sent)
            scores.append(score.cpu().detach().numpy())
            true_tags.append([label_vocab.itos[i] for i in tags.tolist()[0]])
            pred_tags.append([label_vocab.itos[i] for i in pred_tag_seq[0]])
            sents.append([word_vocab.itos[i] for i in sent[0]])

    print('Avg evaluation loss:', np.mean(losses))
    prec, rec, f1 = conlleval.evaluate(
        [tag for tags in true_tags for tag in tags],
        [tag for tags in pred_tags for tag in tags],
        verbose=True)
    # print('\n5 random evaluation samples:')
    # for idx in np.random.randint(0, len(sents), size=5):
    #     print('SENT:', ' '.join(sents[idx]))
    #     print('TRUE:', ' '.join(true_tags[idx]))
    #     print('PRED:', ' '.join(pred_tags[idx]))
    #     print('-'*20)
    return sents, true_tags, pred_tags, f1
Example #11
    def get_results(self, name):
        p = (float(self.main_correct_count) / float(self.main_predicted_count)) if (self.main_predicted_count > 0) else 0.0
        r = (float(self.main_correct_count) / float(self.main_total_count)) if (self.main_total_count > 0) else 0.0
        f = (2.0 * p * r / (p + r)) if (p+r > 0.0) else 0.0
        f05 = ((1.0 + 0.5*0.5) * p * r / ((0.5*0.5 * p) + r)) if (p+r > 0.0) else 0.0

        results = collections.OrderedDict()
        results[name + "_cost_avg"] = self.cost_sum / float(self.token_count)
        results[name + "_cost_sum"] = self.cost_sum
        results[name + "_main_predicted_count"] = self.main_predicted_count
        results[name + "_main_total_count"] = self.main_total_count
        results[name + "_main_correct_count"] = self.main_correct_count
        results[name + "_p"] = p
        results[name + "_r"] = r
        results[name + "_f"] = f
        results[name + "_f05"] = f05
        results[name + "_accuracy"] = self.correct_sum / float(self.token_count)
        results[name + "_token_count"] = self.token_count
        results[name + "_time"] = float(time.time()) - float(self.start_time)
        results[name + "_correct_sum"] = self.correct_sum

        if self.label2id is not None and self.conll_eval == True:
            conll_counts = conlleval.evaluate(self.conll_format)
            conll_metrics_overall, conll_metrics_by_type = conlleval.metrics(conll_counts)
            results[name + "_conll_accuracy"] = float(conll_counts.correct_tags) / float(conll_counts.token_counter)
            results[name + "_conll_p"] = conll_metrics_overall.prec
            results[name + "_conll_r"] = conll_metrics_overall.rec
            results[name + "_conll_f"] = conll_metrics_overall.fscore
#            for i, m in sorted(conll_metrics_by_type.items()):
#                results[name + "_conll_p_" + str(i)] = m.prec
#                results[name + "_conll_r_" + str(i)] = m.rec
#                results[name + "_conll_f_" + str(i)] = m.fscore #str(m.fscore) + " " + str(conll_counts.t_found_guessed[i])

        return results, self.conll_format
Example #12
def test_format():
    words = "Shyam lives in New York .".split()
    gold = "B-PER O O B-LOC I-LOC O".split()
    pred = "B-PER O O B-LOC O O".split()
    print("Testing inputting the wrong format. This should get an exception")
    try:
        evaluate([1, 2, 3])
    except Exception as e:
        print(e)

    pred = "B-PER O O B-LOC I-MISC O".split()
    print("This should be 50% F1")
    counts = evaluate(map(lambda p: " ".join(p), zip(words, gold, pred)))
    overall, by_type = metrics(counts)
    report(counts)
    assert overall.fscore == 0.4
Example #13
def conlleval_evaluate(documents):
    """Return conlleval evaluation results for Documents as counts."""
    # conlleval.py has a file-based API, so use StringIO
    conll_string = StringIO()
    write_conll(documents, out=conll_string)
    conll_string.seek(0)
    return evaluate(conll_string)
Example #14
    def eval(self, dataset_name, log_output=None):
        dataset = self.datasets.get(dataset_name, None)
        if dataset is None:
            return

        results = []
        logger.info('Evaluating {} ({})'.format(self.name, dataset_name))
        set_loss = 0
        for tokens, labels, chars, seq_lens, char_lens in dataset.get_dataset(
                volatile=True, gpu=self.gpu):
            preds, loss = self.model.predict(tokens, labels, seq_lens, chars,
                                             char_lens)
            set_loss += float(loss.data[0])
            for pred, gold, seq_len, ts in zip(preds, labels, seq_lens,
                                               tokens):
                l = int(seq_len.data[0])
                pred = pred.data.tolist()[:l]
                gold = gold.data.tolist()[:l]
                ts = ts.data.tolist()[:l]
                for p, g, t in zip(pred, gold, ts):
                    t = self.idx_token.get(t, 'UNK')
                    results.append('{} {} {}'.format(t, self.idx_label[g],
                                                     self.idx_label[p]))
                results.append('')
        counts = evaluate(results)
        overall, by_type = metrics(counts)
        report(counts)
        logger.info('Loss: {:.5f}'.format(set_loss))
        return SCORES(fscore=overall.fscore,
                      precision=overall.prec,
                      recall=overall.rec,
                      loss=set_loss)
Example #15
def main(argv):
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length  # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    train_data = process_sentences(train_words, train_tags, tokenizer, seq_len)
    test_data = process_sentences(test_words, test_tags, tokenizer, seq_len)

    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}

    init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)

    train_y, train_weights = label_encode(train_data.combined_labels, tag_map,
                                          seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels, tag_map,
                                        seq_len)

    ner_model = create_ner_model(pretrained_model, len(tag_map))
    optimizer = create_optimizer(len(train_x[0]), args)

    ner_model.compile(optimizer,
                      loss='sparse_categorical_crossentropy',
                      sample_weight_mode='temporal',
                      metrics=['sparse_categorical_accuracy'])

    ner_model.fit(train_x,
                  train_y,
                  sample_weight=train_weights,
                  epochs=args.num_train_epochs,
                  batch_size=args.batch_size)

    if args.ner_model_dir is not None:
        label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
        save_ner_model(ner_model, tokenizer, label_list, args)
        save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

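    # Map predicted label ids back to tag strings, skipping the [CLS] position and trailing padding.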
    pred_tags = []
    for i, pred in enumerate(preds):
        pred_tags.append(
            [inv_tag_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]])

    lines = write_result(args.output_file, test_data.words, test_data.lengths,
                         test_data.tokens, test_data.labels, pred_tags)

    c = conlleval.evaluate(lines)
    conlleval.report(c)
    return 0
Example #16
    def check_path(self, path, files, zip_data):
        logging.info("path: {}".format(path))
        logging.info("files: {}".format(files))
        testfile_path = ''
        testfile_key = ''
        path_key = ''
        for filename in files:
            logging.info("testing filename: {}".format(filename))
            if path is None or path == '':
                logging.info("filename={}".format(filename))
                testfile_path = os.path.abspath(
                    os.path.join(self.ref_dir, filename))
                testfile_key = filename
                path_key = filename
            else:
                logging.info("path={}".format(path))
                testfile_path = os.path.abspath(
                    os.path.join(self.ref_dir, path, filename))
                testfile_key = os.path.join(path, filename)
                path_key = path

            logging.info("path_key={}".format(path_key))

            # set up score value for matching output correctly
            score = self.default_score
            if path_key in self.path_score:
                score = self.path_score[path_key]
            tally = 0.0
            self.perf[path_key] = 0.0

            logging.info("Checking {}".format(testfile_key))
            if testfile_key in zip_data:
                with open(testfile_path, 'rt') as ref:
                    ref_data = list(
                        filter(
                            lambda k: k,
                            [str(x).strip() for x in ref.read().splitlines()]))
                    output_data = list(
                        filter(lambda k: k, [
                            str(x, 'utf-8').strip()
                            for x in zip_data[testfile_key].splitlines()
                        ]))
                    output_data = output_data[:len(ref_data)]
                    if len(ref_data) == len(output_data):
                        logging.info("ref, output {}".format(
                            list(zip(ref_data, output_data))))
                        (prec, recall,
                         tally) = conlleval.evaluate(ref_data, output_data)
                        logging.info("score {}: {}".format(
                            testfile_key, tally))
                    else:
                        logging.info(
                            "length mismatch between output and reference")
                        tally = 0.

            self.perf[path_key] = tally
Example #17
def evaluate(args, data, model, id2label, all_ori_tokens):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data,
                            sampler=sampler,
                            batch_size=args.train_batch_size)

    logger.info("***** Running eval *****")
    # logger.info(f" Num examples = {len(data)}")
    # logger.info(f" Batch size = {args.eval_batch_size}")
    pred_labels = []
    ori_labels = []

    for b_i, (input_ids, input_mask, segment_ids, label_ids, bbox, bbox_pos_id,
              bbox_num) in enumerate(tqdm(dataloader, desc="Evaluating")):

        input_ids = input_ids.to(args.device)
        input_mask = input_mask.to(args.device)
        segment_ids = segment_ids.to(args.device)
        label_ids = label_ids.to(args.device)
        bbox = bbox.to(args.device)
        bbox_pos_id = bbox_pos_id.to(args.device)
        bbox_num = bbox_num.to(args.device)

        with torch.no_grad():
            logits = model.predict(input_ids, segment_ids, input_mask, bbox,
                                   bbox_pos_id, bbox_num)
        # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        # logits = logits.detach().cpu().numpy()

        for l in logits:  # logits-> List[List[int]]
            pred_labels.append([id2label[idx] for idx in l])

        for l in label_ids:  # tensor
            ori_labels.append([id2label[idx.item()] for idx in l])

    eval_list = []

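    # Build "token gold pred" lines, dropping [CLS]/[SEP] and any token that would break the 3-column format.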
    for ori_tokens, oril, prel in zip(all_ori_tokens, ori_labels, pred_labels):
        for ot, ol, pl in zip(ori_tokens, oril, prel):
            if ot in ["[CLS]", "[SEP]"]:
                continue
            if len(f"{ot} {ol} {pl}\n".split(" ")) != 3:
                continue
            eval_list.append(f"{ot} {ol} {pl}\n")
        eval_list.append("\n")

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)

    # namedtuple('Metrics', 'tp fp fn prec rec fscore')
    overall, by_type = conlleval.metrics(counts)

    return overall, by_type
Example #18
def evaluating_batch(model, datas):

    save = False
    adv = 0
    true_tags_all = []
    pred_tags_all = []
    macro = []
    for data in datas:
        true_tags = []
        pred_tags = []

        sentence_in = Variable(torch.LongTensor(data['words']))
        chars2_mask = Variable(torch.LongTensor(data['chars']))
        caps = Variable(torch.LongTensor(data['caps']))
        targets = torch.LongTensor(data['tags'])
        chars2_length = data['char_length']
        word_length = data['word_length']
        ground_truth_id = data['tags'][0]

        if use_gpu:
            sentence_in = sentence_in.cuda()
            targets = targets.cuda()
            chars2_mask = chars2_mask.cuda()
            caps = caps.cuda()
        val, out = model(sentence=sentence_in, caps=caps, chars=chars2_mask,
                         chars2_length=chars2_length, word_length=word_length)

        predicted_id = out

        for (true_id, pred_id) in zip(ground_truth_id, predicted_id[0]):
            true_tags.append(id_to_tag[true_id])
            pred_tags.append(id_to_tag[pred_id])

        true_tags_all.extend(true_tags)
        pred_tags_all.extend(pred_tags)

        df = pd.DataFrame({'true': true_tags, 'pred': pred_tags})

        if sum(df['true'] != df['pred']) > 0:
            adv = adv + 1
        df = df[df['true'] != 'O']  # only tags
        if len(df) != 0:
            macro.append(sum(df['true'] == df['pred']) / len(df))

    df_tags = pd.DataFrame({'true': true_tags_all, 'pred': pred_tags_all})
    df_tags = df_tags[df_tags['true'] != 'O']

    print('Micro acc_tag:', sum(df_tags['true'] == df_tags['pred']) / len(df_tags))
    print('Macro acc_tag:', np.mean(macro))
    prec, rec, new_F = evaluate(true_tags_all, pred_tags_all, verbose=False)
    print('F 1:', new_F)
    print('Hit:', adv / len(datas))
Example #19
def evaluate(args, task_id, data, model, id2label, all_ori_words, file_name=None):
    model.eval()
    sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, batch_size=args.train_batch_size)
    task_id = torch.tensor(task_id, dtype=torch.long).to(args.device)

    logger.info("***** Running eval *****")
    logger.info(f" Num examples = {len(data)}")

    pred_labels = []
    ori_labels = []
    for b_i, batch in enumerate(tqdm(dataloader, desc="Evaluating")):
        batch = tuple(t.to(args.device) for t in batch)
        if args.need_charcnn:
            input_word_ids, input_mask, label_ids, label_mask, char_ids = batch
        else:
            input_word_ids, input_mask, label_ids, label_mask = batch
            char_ids = None

        with torch.no_grad():
            logits = model.predict(task_id, input_word_ids, char_ids, input_mask)

        # print(len(all_ori_words), [len(x) for x in all_ori_words])
        # print(len(logits), [len(x) for x in logits])
        # print(len(label_ids), [len(x) for x in label_ids])
        # print(len(input_mask), [sum(x) for x in input_mask])
        # print(len(label_mask), [sum(x) for x in label_mask])

        for predL, goldL, maskL in zip(logits, label_ids, label_mask):
            for p, g, mask in zip(predL, goldL, maskL):
                if mask.item() == 1:
                    pred_labels.append(id2label[p])
                    ori_labels.append(id2label[g.item()])
            pred_labels.append(None)
            ori_labels.append(None)
    ori_words = []
    for sent in all_ori_words:
        ori_words.extend(sent+[None])
    eval_list = []
    # print(len(pred_labels), len(ori_labels), len(ori_words))
    for plabel, olabel, word in zip(pred_labels, ori_labels, ori_words):
        if plabel is not None:
            eval_list.append(f"{word} {olabel} {plabel}\n")
        else:
            eval_list.append("\n")

    if file_name is not None:
        with open(file_name, "w", encoding="utf-8") as f:
            for line in eval_list:
                f.write(line)

    # eval the model
    counts = conlleval.evaluate(eval_list)
    conlleval.report(counts)
Example #20
def evaluate_f1(labels_pred_list, label_true_list):
    id2tag = {
        test_data_loader.tag2label[key]: key
        for key in test_data_loader.tag2label
    }
    res_pred = []
    for label_ in labels_pred_list:
        res_pred.extend([id2tag[l] for l in label_])
    res_true = []
    for label_ in label_true_list:
        res_true.extend([id2tag[l] for l in label_])
    # conlleval expects (true_seqs, pred_seqs); keep that order so precision and recall are not swapped.
    prec, rec, f1 = evaluate(res_true, res_pred, verbose=True)
    return prec, rec, f1
Example #21
def calculate_labeling_scores(results, report=True):
    outputs = []
    for p_b, g_b, t_b, l_b in results:
        for p_s, g_s, t_s, l_s in zip(p_b, g_b, t_b, l_b):
            p_s = p_s[:l_s]
            for p, g, t in zip(p_s, g_s, t_s):
                outputs.append('{} {} {}'.format(t, g, p))
            outputs.append('')
    counts = conlleval.evaluate(outputs)
    overall, by_type = conlleval.metrics(counts)
    if report:
        conlleval.report(counts)
    return (overall.fscore * 100.0, overall.prec * 100.0, overall.rec * 100.0)
Example #22
def eval2(model):
    y_true = []
    y_pred = []

    for sentence, tags in zip(test_sentences, test_tags):
        tags_pred = model.decode([sentence])
        for word, true_tag, pred_tag in zip(sentence, tags, tags_pred[0]):
            y_true.append(true_tag)
            y_pred.append(pred_tag)

    precision, recall, f1_score = evaluate(y_true, y_pred, verbose=False)
    return "Precision: %.2f%%\tRecall: %.2f%%\tF1_score: %.2f%%".format(
        precision, recall, f1_score)
Example #23
    def inner_train_one_step(self, batches, epochNum, task_name, log_writer,
                             log_dir):
        '''
        :param self:
        :param batches: one batch data: [[sentence],[sentence],....]
                               sentence=[[chars],[charids],[tags],[tag_ids]]
        :param inner_epochNum:
        :return:
        '''
        # tf.summary.trace_on(graph=True, profiler=True)  # enable tracing (optional)
        batch_Nums = len(batches)

        losses, P_ts, R_ts, F1_ts = [], [], [], []
        # =====run model=======
        with tqdm(total=batch_Nums) as bar:
            for batch_num in range(batch_Nums):
                batch = batches[batch_num]
                seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(
                    batch)
                with tf.GradientTape() as tape:
                    # print(batch[0])  # for debugging
                    logits = self(seq_ids_padded)
                    loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
                    pred_tags, pred_best_score = crf.crf_decode(
                        potentials=logits,
                        transition_params=self.trans_p,
                        sequence_length=seq_len_list)
                grads = tape.gradient(loss, self.trainable_variables)
                self.optimizer.apply_gradients(
                    zip(grads, self.trainable_variables))
                # optimizer.minimize(loss, [myModel_bilstm.trainable_variables])

                pred_tags_masked = seq_masking(pred_tags, seq_len_list)
                p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked,
                                                           taskname=task_name)
                t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded,
                                                           taskname=task_name)
                (P_t, R_t, F1_t), _ = evaluate(t_tags_char,
                                               p_tags_char,
                                               verbose=False)
                losses.append(loss)
                P_ts.append(P_t)
                R_ts.append(R_t)
                F1_ts.append(F1_t)
                print('train_loss:{}, train_P:{}'.format(loss, P_t))
                bar.update(1)
        with log_writer.as_default():
            tf.summary.scalar("loss", np.mean(losses), step=epochNum)
            tf.summary.scalar("P", np.mean(P_ts), step=epochNum)
            tf.summary.scalar("R", np.mean(R_ts), step=epochNum)
            tf.summary.scalar("F1", np.mean(F1_ts), step=epochNum)
Example #24
def compare(gold_toks, gold_tags, pred_toks, pred_tags):
    if len(gold_toks) != len(pred_toks):
        raise ValueError('sentence count mismatch: {} in gold, {} in pred'.\
                         format(len(gold_toks), len(pred_toks)))
    lines = []
    for g_toks, g_tags, p_toks, p_tags in zip(gold_toks, gold_tags, pred_toks,
                                              pred_tags):
        if g_toks != p_toks:
            raise ValueError('text mismatch: gold "{}", pred "{}"'.\
                             format(g_toks, p_toks))
        for (g_tok, g_tag, p_tag) in zip(g_toks, g_tags, p_tags):
            lines.append('{}\t{}\t{}'.format(g_tok, g_tag, p_tag))

    return conlleval.report(conlleval.evaluate(lines))
Example #25
def calculate_metrics(dataset):
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = ner_model.predict(x)
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])

        true_tag_ids = np.reshape(y, [-1])

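        # Keep only positions where both the gold and the predicted tag id are non-zero (drops padding).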
        mask = (true_tag_ids > 0) & (predictions > 0)
        true_tag_ids = true_tag_ids[mask]
        predicted_tag_ids = predictions[mask]

        all_true_tag_ids.append(true_tag_ids)
        all_predicted_tag_ids.append(predicted_tag_ids)

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    evaluate(real_tags, predicted_tags)
Example #26
def get_output_file(all_logit, all_label, decode, out):
    decode.pop(len(decode) - 1)
    assert len(all_logit) == len(all_label)
    evalseq = []
    for i in range(len(all_logit)):
        evalseq.append("{} {} {}".format(
            i,
            decode[int(all_label[i])]
            if int(all_label[i]) in decode.keys() else "O",
            decode[int(all_logit[i])]
            if int(all_logit[i]) in decode.keys() else "O",
        ))

    count = conlleval.evaluate(evalseq)
    conlleval.report(count, out)
Example #27
def conll_eval_counts(ypred, ytruth, labels):
    ytruth_max = ytruth.argmax(axis=2)
    ypred_max = ypred.argmax(axis=2)
    conf_matrix = None
    eval_counts = ceval.EvalCounts()
    label_keys = ordered_label_keys(labels)
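    # Score each sequence separately, accumulating conlleval counts and a confusion matrix over tag labels.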
    for i in range(len(ypred_max)):
        true_seq = [labels[x] for x in ytruth_max[i].tolist()]
        pred_seq = [labels[x] for x in ypred_max[i].tolist()]
        c = ceval.evaluate(['%s %s' % x for x in zip(true_seq, pred_seq)])
        eval_counts.add(c)

        cm = confusion_matrix(true_seq, pred_seq, label_keys)
        conf_matrix = cm if conf_matrix is None else conf_matrix + cm

    return eval_counts, conf_matrix
Example #28
    def inner_train_one_step(self, batches, inner_iters, inner_epochNum,
                             outer_epochNum, task_name, log_writer):
        '''
        :param self:
        :param batches: one batch data: [[sentence],[sentence],....]
                               sentence=[[chars],[charids],[tags],[tag_ids]]
        :param inner_epochNum:
        :return:
        '''

        batches_len = len(batches)

        # =====run model=======
        for batch_num in range(batches_len):
            batch = batches[batch_num]
            seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(
                batch)
            with tf.GradientTape() as tape:
                logits = self(seq_ids_padded)
                loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
                pred_tags, pred_best_score = crf.crf_decode(
                    potentials=logits,
                    transition_params=self.trans_p,
                    sequence_length=seq_len_list)
            grads = tape.gradient(loss, self.trainable_variables)
            self.optimizer.apply_gradients(zip(grads,
                                               self.trainable_variables))
            # optimizer.minimize(loss, [myModel_bilstm.trainable_variables])

        pred_tags_masked = seq_masking(pred_tags, seq_len_list)
        p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked,
                                                   taskname=task_name)
        t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded,
                                                   taskname=task_name)
        (P_t, R_t, F1_t), _ = evaluate(t_tags_char, p_tags_char, verbose=False)
        with log_writer.as_default():
            step = batch_num + 1 + inner_epochNum * batches_len
            tf.summary.scalar("loss",
                              loss,
                              step=inner_epochNum +
                              outer_epochNum * inner_iters)
            tf.summary.scalar("P", P_t, step=inner_epochNum)
            tf.summary.scalar("R", R_t, step=inner_epochNum)
            tf.summary.scalar("F", F1_t, step=inner_epochNum)
        return (loss, P_t)
Example #29
    def inner_train_one_step(self, batches, inner_epochNum, ckpt_manager, log_writer=None):
        '''
        :param self:
        :param batches: one batch data: [[sentence],[sentence],....]
                               sentence=[[chars],[charids],[tags],[tag_ids]]
        :param inner_epochNum:
        :return:
        '''

        batch_size = len(batches)
        print('======================== batch_size:', batch_size)

        # =====run model=======
        with tqdm(total=batch_size) as bar:
            for batch_num in range(batch_size):

                batch = batches[batch_num]
                seq_ids_padded, tag_ids_padded, seq_len_list = get_train_data_from_batch(batch)
                with tf.GradientTape() as tape:
                    logits = self(seq_ids_padded)
                    loss = self.crf_loss(logits, tag_ids_padded, seq_len_list)
                    pred_tags, pred_best_score = crf.crf_decode(potentials=logits, transition_params=self.trans_p,
                                                                sequence_length=seq_len_list)
                grads = tape.gradient(loss, self.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
                # optimizer.minimize(loss, [myModel_bilstm.trainable_variables])

                bar.update(1)

            pred_tags_masked = seq_masking(pred_tags, seq_len_list)
            p_tags_char, p_tagsid_flatten = get_id2tag(pred_tags_masked)
            t_tags_char, t_tagsid_flatten = get_id2tag(tag_ids_padded)
            try:
                (P_t, R_t, F1_t),_ = evaluate(t_tags_char, p_tags_char, verbose=True)
            except Exception as e:
                print(e)
            with log_writer.as_default():
                step = batch_num + 1 + inner_epochNum * batch_size
                tf.summary.scalar("loss", loss, step=inner_epochNum)
                tf.summary.scalar("P", P_t, step=inner_epochNum)
                tf.summary.scalar("R", R_t, step=inner_epochNum)
                tf.summary.scalar("F", F1_t, step=inner_epochNum)

            ckpt_manager.save(checkpoint_number=inner_epochNum)
Example #30
def evaluate(results, idx_token, idx_label, writer=None):
    """Evaluate prediction results.

    :param results: A List of which each item is a tuple
        (predictions, gold labels, sequence lengths, tokens) of a batch.
    :param idx_token: Index to token dictionary.
    :param idx_label: Index to label dictionary.
    :param writer: An object (file object) with a write() function. Extra output.
    :return: F-score, precision, and recall.
    """
    # b: batch, s: sequence
    outputs = []
    # preds: predicted label ids
    # golds: gold label ids
    # len: sequence lengths
    # tokens: token ids
    for preds_b, golds_b, len_b, tokens_b in results:
        for preds_s, golds_s, len_s, tokens_s in zip(preds_b, golds_b, len_b, tokens_b):
            l = int(len_s.item())
            preds_s = preds_s.data.tolist()[:l]
            golds_s = golds_s.data.tolist()[:l]
            tokens_s = tokens_s.data.tolist()[:l]
            for p, g, t in zip(preds_s, golds_s, tokens_s):
                token = idx_token.get(t, C.UNK)
                # if token == '':  # debug
                #     token = '<$UNK$>'
                # print(idx_token)  # debug
                # print("p: ", p, ", g: ", g, ", t: ", t, ", corresponding token:", token, "|")  # DEBUG
                outputs.append('{} {} {}'.format(
                    token, idx_label.get(g, 0), idx_label.get(p, 0)))
            outputs.append('')
    # print("OUTPUTS: ", outputs)  # DEBUG # seems like outputs is right but counts is wrong
    # Why is english-covered-test not like the other, uncovered datasets? is this causing an issue?
    counts = conlleval.evaluate(outputs)
    # print("counts: ", counts)  # DEBUG
    overall, by_type = conlleval.metrics(counts)
    conlleval.report(counts)
    if writer:
        conlleval.report(counts, out=writer)
        writer.flush()
    return overall.fscore, overall.prec, overall.rec
Example #31
def conll_summary(tokens, gold, pred, config):
  """Return string summarizing performance using CoNLL criteria."""
  index_to_label = { v: k for k, v in config.label_to_index.items() }

  acc = accuracy(gold, pred)
  # Materialize as lists so the length check below works in Python 3.
  gold = [index_to_label[i] for i in as_dense(gold)]
  pred = [index_to_label[i] for i in as_dense(pred)]

  # Format as space-separated (token, gold, pred) strings for CoNLL eval.
  if len(tokens) != len(gold) or len(gold) != len(pred):
    raise ValueError('counts do not match')
  formatted = [' '.join(t) for t in zip(tokens, gold, pred)]

  o, by_type = conlleval.metrics(conlleval.evaluate(formatted))
  nlen = max(len(name) for name in by_type.keys())
  summaries = ['%.2f%% acc %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' % (
    100.*acc, 100.*o.fscore, 100.*o.prec, 100.*o.rec,  o.tp, o.fp, o.fn
  )]
  for name, r in sorted(by_type.items()):
    summaries.append('%*s %.2f%% f (%.1fp %.1fr %dtp %dfp %dfn)' % (
      nlen, name, 100.*r.fscore, 100.*r.prec, 100.*r.rec, r.tp, r.fp, r.fn
    ))

  return '\n'.join(summaries)