Python read_ground_truth примеры, politiquices.nlp.utils.utils.read_ground_truth Python примеры использования

Пример #1

0

Показать файл

Файл: evaluate_ner.py Проект: davidsbatista/politiquices

def main():
    """Evaluate NER on the annotated arquivo.pt titles and print precision/recall.

    Every sample with a non-empty label is passed through ``evaluate_ner``
    (cleaned title plus the two annotated entities); the per-sample
    true-positive / false-positive / false-negative counts are summed and
    turned into corpus-level precision and recall.
    """
    arquivo = read_ground_truth("../../../../annotations/arquivo.tsv")
    # NOTE(review): "../../../../annotations/publico_politica.tsv" used to be
    # loaded here as well, but it was only consumed by a disabled loop, so the
    # unused read was dropped. Re-add it together with its loop if needed.
    tp_total = 0
    fp_total = 0
    fn_total = 0

    for x in arquivo:
        # samples without a label are not part of the evaluation
        if x['label'] == '':
            continue
        tp, fp, fn = evaluate_ner(clean_title_re(x['title']), [x['ent1'], x['ent2']])
        tp_total += tp
        fp_total += fp
        fn_total += fn

    # Guard the ratios: with no labelled samples (or no predicted/expected
    # entities at all) the denominators are zero.
    n_predicted = tp_total + fp_total
    n_expected = tp_total + fn_total
    print("Precision: ", tp_total / n_predicted if n_predicted else 0.0)
    print("Recall   : ", tp_total / n_expected if n_expected else 0.0)

Пример #2

0

Показать файл

def main():
    """Score the rule-based direction classifier against the annotated data.

    Only samples whose label mentions "supports" or "opposes" are evaluated,
    and only when both entity strings occur in the cleaned title. Prints a
    classification report followed by how often each matching pattern led to
    a wrong or a correct prediction.
    """
    samples = read_ground_truth("../politiquices_data_v1.0.csv")
    direction_clf = DirectionClassifier()
    gold = []
    guessed = []

    patterns_wrong = defaultdict(int)
    patterns_right = defaultdict(int)

    for sample in samples:
        if not ("supports" in sample["label"] or "opposes" in sample["label"]):
            continue

        title = clean_title_quotes(clean_title_re(sample['title']))
        first = sample["ent1"]
        second = sample["ent2"]

        # the classifier needs both surface strings present in the title
        if not (first in title and second in title):
            print("skipped: ", title)
            continue

        # a label ending in "ent1" means the relation goes from ent2 to ent1
        if sample["label"].endswith("ent1"):
            expected = "ent2_rel_ent1"
        else:
            expected = "ent1_rel_ent2"
        gold.append(expected)

        outcome, pattern, _context, _pos_tags = direction_clf.detect_direction(
            title, first, second)
        guessed.append(outcome)

        if expected != outcome:
            patterns_wrong[pattern] += 1
        elif expected == outcome:
            patterns_right[pattern] += 1

    print(classification_report(gold, guessed))

    print("\nPATTERNS WRONG PREDICTION")
    print("----------------------------")
    for name, count in patterns_wrong.items():
        print(name, count)

    print("\nPATTERNS CORRECT PREDICTION")
    print("----------------------------")
    for name, count in patterns_right.items():
        print(name, count)

Пример #3

0

Показать файл

def main():
    """Cross-validate ``RelationshipClassifier`` and print an aggregated report.

    Runs a 4-fold stratified split over the annotated data, trains a fresh
    model on each training fold, evaluates it on the held-out fold, and
    finally prints a classification report and confusion matrix over the
    predictions of all folds.
    """
    all_data = read_ground_truth("../politiquices_data_v1.0.csv")
    labels = remap_y_target([s['label'] for s in all_data])
    skf = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

    print("Loading embeddings...")
    word2embedding, word2index = get_embeddings()

    all_data_shuffled = []
    all_preds = []
    all_trues = []
    fold_n = 0

    for train_index, test_index in skf.split(all_data, labels):
        # Select the fold members by direct indexing; the previous
        # `idx in train_index` test scanned the index array for every sample.
        x_train = [all_data[i] for i in train_index]
        x_test = [all_data[i] for i in test_index]
        y_train = [labels[i] for i in train_index]
        y_test = [labels[i] for i in test_index]

        # get textual contexts
        train_textual_context = get_text_tokens(x_train, tokenized=True)
        test_textual_context = get_text_tokens(x_test, tokenized=True)

        model = RelationshipClassifier(epochs=10)
        # NOTE(review): the held-out fold is also passed as validation data.
        model.train(train_textual_context,
                    y_train,
                    word2index,
                    word2embedding,
                    x_val_tks=test_textual_context,
                    y_val=y_test)

        report_str, misclassifications, correct, pred_labels = model.evaluate(
            x_test, y_test)

        # NOTE(review): this collects the *training* docs while the two lists
        # below collect test-fold values - confirm x_test was not intended.
        all_data_shuffled.extend(x_train)
        all_trues.extend(y_test)
        all_preds.extend(pred_labels)

        fold_n += 1

    print("\n\nFINAL REPORT")
    print(classification_report(all_trues, all_preds, zero_division=0.00))
    cm = confusion_matrix(all_trues,
                          all_preds,
                          labels=['opposes', 'other', 'supports'])
    print_cm(cm, labels=['opposes', 'other', 'supports'])
    print()

Пример #4

0

Показать файл

Файл: train_clf_avg_att.py Проект: davidsbatista/politiquices

def main():
    """Cross-validate and then train the Keras relationship classifier.

    Loads the publico and arquivo annotations, reports a classification
    report for each of two stratified folds, then fits a final model on all
    documents and saves it under ``trained_models/relationship_clf``.
    """
    data_publico = read_ground_truth(
        "../../../../annotations/publico_politica.tsv")
    data_arquivo = read_ground_truth("../../../../annotations/arquivo.tsv")
    docs, labels = pre_process_train_data(data_arquivo + data_publico)

    skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(docs, labels):
        # Select the fold members by direct indexing; the previous
        # `idx in train_index` test scanned the index array for every sample.
        x_train = [docs[i] for i in train_index]
        x_test = [docs[i] for i in test_index]
        y_train = [labels[i] for i in train_index]
        y_test = [labels[i] for i in test_index]

        # input length is derived from the training documents only
        max_length = max(len(x) for x in x_train)
        kclf = KerasTextClassifier(input_length=max_length,
                                   n_classes=len(set(labels)),
                                   max_words=15000,
                                   emb_dim=50)
        kclf.fit(x_train,
                 y_train,
                 X_val=x_test,
                 y_val=y_test,
                 epochs=15,
                 batch_size=16)

        predictions = kclf.encoder.inverse_transform(kclf.predict(x_test))
        print(classification_report(y_test, predictions))

    # Final model: trained on every document, no held-out data.
    max_length = max(len(x) for x in docs)
    # NOTE(review): max_words is 150000 here but 15000 in the folds above,
    # and batch_size differs too (8 vs 16) - confirm both are intentional.
    kclf = KerasTextClassifier(input_length=max_length,
                               n_classes=len(set(labels)),
                               max_words=150000,
                               emb_dim=50)
    kclf.fit(docs, labels, epochs=15, batch_size=8)
    kclf.save(path="trained_models/relationship_clf")

Пример #5

0

Показать файл

Файл: download_wikidata_entities.py Проект: davidsbatista/politiquices

def get_wiki_ids_from_annotations():
    """Return the distinct entity ids referenced by the annotated data.

    Each annotation contributes the last "/"-separated segment of its
    ``ent1_id`` and ``ent2_id``; annotations where either id is the string
    'None' are ignored. The number of ids found is printed before returning.
    """
    wiki_ids = set()
    for row in read_ground_truth("../politiquices_data_v1.0.csv"):
        first_uri = row["ent1_id"]
        second_uri = row["ent2_id"]
        # both entities must be linked for the pair to count
        if 'None' in (first_uri, second_uri):
            continue
        for uri in (first_uri, second_uri):
            wiki_ids.add(uri.split("/")[-1])

    print(f"{len(wiki_ids)} entities from annotations")
    return list(wiki_ids)

Пример #6

0

Показать файл

def main():
    """Collect articles from the selected sources and populate the graph.

    Depending on the parsed command-line arguments this loads the gold
    annotations and/or the arquivo.pt, publico.pt and CHAVE articles,
    de-duplicates them, extracts persons and relationships, and hands
    everything (including the gold data) to ``populate_graph``.
    """
    args = parse_args()

    # defaults used for every source that was not requested
    arquivo, publico, chave = [], [], []
    rels_gold, gold_articles, arquivo_publico_urls = [], [], []
    gold_urls = None
    gold_persons = None

    if args.annotations:
        training_data = read_ground_truth(
            "../classifiers/politiquices_data_v1.0.csv")
        processed = process_gold(training_data)
        gold_articles, gold_persons, rels_gold, urls_to_ignore = processed
        print("ground truth : ", len(gold_articles))
        gold_urls = [article.url for article in gold_articles]
        gold_urls = gold_urls + urls_to_ignore

    if args.arquivo:
        # step 1: keep only unique urls
        by_url = remove_duplicates_with_same_url(args.arquivo)
        # step 2: same crawled url but different crawl date, keep oldest version
        unique = remove_url_crawled_diff_dates_duplicates(by_url)
        # step 3: title + crawl date same and URL overlaps, e.g.: with/without params
        arquivo = remove_duplicates_same_domain(unique)
        # gather all publico.pt URLs
        arquivo_publico_urls = get_publico_urls_in_arquivo(unique)
        print("arquivo.pt   : ", len(arquivo))

    if args.publico:
        publico = remove_duplicates_with_same_url(args.publico)
        print("publico.pt   : ", len(publico))

    if args.chave:
        chave = list(processed_titles(args.chave))
        print("CHAVE        : ", len(chave))

    crawled = arquivo + publico + chave
    articles, persons, relationships = process_data(
        crawled,
        persons=gold_persons,
        publico_urls_in_arquivo=arquivo_publico_urls,
        gold_urls=gold_urls,
    )

    all_articles = articles + gold_articles
    all_relationships = relationships + rels_gold
    populate_graph(all_articles, persons, all_relationships)

Пример #7

0

Показать файл

Файл: politiquices_app.py Проект: davidsbatista/politiquices

def annotations():
    """Render the annotation page for relationships not yet annotated.

    Candidates from ``get_relationships_to_annotate`` whose title already
    occurs in the dataset or in the web-app annotations are skipped. Each
    remaining candidate gets an ``id`` and a ``title_clickable`` field in
    which both entity mentions are wrapped in links to their entity pages.

    Returns:
        The rendered ``annotate_other.html`` template.
    """
    dataset = read_ground_truth(
        "../../../nlp/classifiers/politiquices_data_v1.0.csv")
    webapp_data = read_ground_truth(
        "../../../nlp/api_annotations/annotations_from_webapp.tsv")

    # A set gives O(1) membership tests; the previous list was scanned in
    # full for every candidate.
    annotated_titles = {d['title'] for d in dataset + webapp_data}

    to_annotate = []
    skipped = 0
    for doc in get_relationships_to_annotate():
        if doc['title'] in annotated_titles:
            skipped += 1
            continue
        to_annotate.append(doc)

    print(f"Skipped {skipped} titles, already in annotated data")

    for idx, r in enumerate(to_annotate):
        ent1_id = r["ent1"].split("/")[-1]
        ent2_id = r["ent2"].split("/")[-1]
        ent1_str = r["ent1_str"]
        ent2_str = r["ent2_str"]

        # NOTE(review): entity strings are inserted into the markup
        # unescaped, and the second replace also runs over the anchor created
        # by the first one - verify this is safe for the data served here.
        link_one = r["title"].replace(
            ent1_str,
            f'<a id="ent_1" href="entity?q={ent1_id}">{ent1_str}</a>',
        )
        title_link = link_one.replace(
            ent2_str,
            f'<a id="ent_2" href="entity?q={ent2_id}">{ent2_str}</a>',
        )

        r["title_clickable"] = title_link
        r["id"] = idx

    return render_template("annotate_other.html", items=to_annotate)

Пример #8

0

Показать файл

def main():
    """Fine-tune a pretrained transformer on the publico annotations.

    The first 500 pre-processed documents are used for training and the
    next 500 for testing. Labels are stripped of their ``ent1``/``ent2``
    direction markers before being encoded. Model weights are saved to
    ``model_test_10`` and a classification report plus confusion matrix for
    the test slice are printed.
    """
    # NOTE(review): a DistilBERT tokenizer used to be loaded here and was
    # immediately overwritten by the T5 tokenizer below, so it never had any
    # effect; the dead load was removed. Confirm T5 is the intended tokenizer
    # for the model built by fine_tuning_pretrained_transformer_model().
    model_name = 'unicamp-dl/ptt5-base-portuguese-vocab'
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    annotations_path = "../../../../data/annotations/publico_politica.tsv"
    data = read_ground_truth(annotations_path, only_label=True)
    docs, labels = pre_process_train_data(data)
    # drop the direction part of each label, e.g. the "ent1"/"ent2" markers
    plain_labels = [re.sub(r"_?ent[1-2]_?", "", y_sample) for y_sample in labels]

    # tokenize
    train_data = tokenize(docs[0:500], tokenizer)
    train_label = plain_labels[0:500]

    le = LabelEncoder()
    y_train_encoded = le.fit_transform(train_label)
    y_train_vec = to_categorical(y_train_encoded, num_classes=None)

    model = fine_tuning_pretrained_transformer_model()
    model.compile(loss={"output": categorical_crossentropy},
                  optimizer="adam",
                  metrics=["accuracy"])

    model.fit(train_data, y_train_vec, epochs=10)

    weights_path = "model_test_10"
    model.save_weights(weights_path)

    test_data = tokenize(docs[500:1000], tokenizer)
    test_label = plain_labels[500:1000]

    # highest-probability class index per sample, mapped back to label names
    x_predicted_probs = model.predict(test_data)
    labels_idx = np.argmax(x_predicted_probs, axis=1)
    pred_labels = le.inverse_transform(labels_idx)

    print("\n" + classification_report(test_label, pred_labels))
    cm = confusion_matrix(test_label, pred_labels, labels=le.classes_)
    print_cm(cm, labels=le.classes_)
    print()

Пример #9

0

Показать файл

def main():
    """Cross-validate a TF-IDF + SVC relationship classifier.

    Runs a 4-fold stratified split over the annotated data. For each fold a
    TF-IDF representation of the pre-tokenized textual contexts is fitted on
    the training part, an SVC is trained on it, and the held-out part is
    predicted and passed to ``error_analysis``. Afterwards the samples in
    ``opposes_classified_as_other`` are printed, followed by a classification
    report and confusion matrix over all folds. Finally ``train_all_data`` is
    called with the complete data set.

    Blocks of disabled debugging code that were kept as bare string literals
    were removed; recover them from version control if needed.
    """
    all_data = read_ground_truth("../politiquices_data_v1.0.csv")
    labels = remap_y_target([s['label'] for s in all_data])
    skf = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

    all_predictions = []
    all_trues = []

    for fold_n, (train_index, test_index) in enumerate(skf.split(all_data, labels)):
        print(f"fold: {fold_n}")
        # Select the fold members by direct indexing; the previous
        # `idx in train_index` test scanned the index array for every sample.
        x_train = [all_data[i] for i in train_index]
        x_test = [all_data[i] for i in test_index]
        y_train = [labels[i] for i in train_index]
        y_test = [labels[i] for i in test_index]

        # target vector
        le = LabelEncoder()
        y_train_encoded = le.fit_transform(y_train)

        # get textual contexts
        train_textual_context = get_text_tokens(x_train, tokenized=True)
        test_textual_context = get_text_tokens(x_test, tokenized=True)

        # The contexts are already tokenized, so both the tokenizer and the
        # preprocessor are replaced by dummy_fun.
        tfidf = TfidfVectorizer(
            tokenizer=dummy_fun,
            preprocessor=dummy_fun,
        )
        tf_idf_weights = tfidf.fit_transform(train_textual_context)

        # alternatives tried previously:
        # clf = LogisticRegression(multi_class='multinomial', class_weight='balanced')
        # clf = SGDClassifier(max_iter=15000, class_weight='balanced')
        # clf = LinearSVC(class_weight='balanced', max_iter=2000)
        # clf = MultinomialNB(alpha=0.3, fit_prior=True, class_prior=None)
        clf = SVC(C=1.0,
                  class_weight='balanced',
                  decision_function_shape='ovo',
                  probability=True)

        clf.fit(tf_idf_weights, y_train_encoded)
        test_tf_idf_weights = tfidf.transform(test_textual_context)
        predicted = clf.predict(test_tf_idf_weights)
        prob_predictions = clf.predict_proba(test_tf_idf_weights)
        # map encoded class indices back to label names
        y_pred_label = [le.classes_[pred] for pred in predicted]

        # TODO: print title, class probabilities, predicted index and
        # predicted label for the misclassified samples.

        all_trues.extend(y_test)
        all_predictions.extend(y_pred_label)
        # x_test holds exactly the test-fold samples that were previously
        # rebuilt a second time as `all_data_test`.
        error_analysis(y_pred_label, predicted, prob_predictions, y_test,
                       x_test)

    # opposes classified as other
    # NOTE(review): `opposes_classified_as_other` is not defined in this
    # function; presumably a module-level collection filled by
    # error_analysis() - verify.
    for sample, prob_pred in opposes_classified_as_other:
        print(sample['title'])
        print("true: opposes")
        print("pred: other")
        print(prob_pred)
        print("\n\n--------------")

    print("\n\nFINAL REPORT")
    print(classification_report(all_trues, all_predictions,
                                zero_division=0.00))
    cm = confusion_matrix(all_trues,
                          all_predictions,
                          labels=['opposes', 'other', 'supports'])
    print_cm(cm, labels=['opposes', 'other', 'supports'])
    print()

    train_all_data(all_data, labels)

Пример #10

0

Показать файл

Файл: relationship_EDA.py Проект: davidsbatista/politiquices

def main():
    """Inspect 'other' samples whose entities look syntactically related.

    Every sample whose remapped label is neither 'opposes' nor 'supports' is
    parsed with ``nlp``; when ``are_entities_related`` reports a relation the
    title, both entities, the label and a per-token dump (text, POS tag,
    dependency label, children) are printed. Totals of related and
    not-related 'other' samples are printed at the end.

    Blocks of disabled exploration code that were kept as bare string
    literals (including an unterminated one at the end of the function) were
    removed; recover them from version control if needed.
    """
    all_data = read_ground_truth("../politiquices_data_v1.0.csv")
    labels = remap_y_target([s['label'] for s in all_data])
    relevant_labels = ('opposes', 'supports')

    other_related = 0
    other_not_related = 0
    for sample, label in zip(all_data, labels):
        # Filter first: parsing the title is wasted work for samples that are
        # skipped anyway.
        if label in relevant_labels:
            continue

        doc = nlp(sample["title"])
        if not are_entities_related(doc, sample):
            other_not_related += 1
            continue

        other_related += 1
        print(sample['title'])
        print("ent1: ", sample['ent1'])
        print("ent2: ", sample['ent2'])
        print(label)
        for token in doc:
            print(
                f"{token.text:<10} \t {token.pos_:>10} \t {token.dep_:>10} \t {list(token.children)}"
            )
        print()

    print("other_related: ", other_related)
    print("other_not_related: ", other_not_related)
    print()

Пример #11

0

Показать файл

def main():
    """Evaluate entity linking on the annotated data and report statistics.

    Calls ``evaluate_one`` for both entities of every annotation (publico.pt
    URLs are first rewritten to their short https form), then prints counts
    and descriptive statistics derived from the module-level collections
    (``freqs``, ``ent_true``, ``ent_pred`` and the surface-string lists),
    saves a box plot to ``unique.png``, prints correct / not found / wrong
    totals plus accuracy, and writes the not-found and wrong cases to text
    files.
    """
    for row in read_ground_truth("../politiquices_data_v1.0.csv"):
        first_str, first_id = row["ent1"], row["ent1_id"]
        second_str, second_id = row["ent2"], row["ent2_id"]
        url = row['url']
        if url.startswith('http://www.publico.pt'):
            # keep only the trailing news id and rebuild the short URL
            url = 'https://publico.pt/' + url.split("/")[-1]
        evaluate_one(first_str, first_id, url)
        evaluate_one(second_str, second_id, url)

    wiki_id = []
    unique_ne = []
    ambiguous_named_entities = defaultdict(list)

    # ids with the most mentions first
    ranked_ids = sorted(freqs, key=lambda x: len(freqs[x]), reverse=True)
    for key in ranked_ids:
        for surface in freqs[key]:
            ambiguous_named_entities[surface].append(key)
        unique_ne.append(len(set(freqs[key])))
        if key != 'None':
            wiki_id.append(key)

    print()
    print("#named-entities (surface strings): ", len(ent_true))
    print("#unique ids: ", len(freqs.keys()))
    print("#unique surface strings: ", len(set(all_ent_surface_string)))
    print("  with Wikidata        : ", len(set(ent_surface_string_with_wiki)))
    print("  without Wikidata     : ",
          len(set(ent_surface_string_without_wiki)))
    print()
    print("mean     (unique_ne): ", mean(unique_ne))
    print("mode     (unique_ne): ", mode(unique_ne))
    print("median   (unique_ne): ", median(unique_ne))
    print("st. dev. (unique_ne): ", stdev(unique_ne))

    seaborn.boxplot(data=unique_ne)
    plt.savefig('unique.png', dpi=300)

    not_found = []
    correct = []
    wrong = []

    triples = zip(all_ent_surface_string, ent_true, ent_pred)
    for ent_string, true_id, pred_id in triples:
        # ids are compared by their last "/"-separated segment
        if true_id.split("/")[-1] == pred_id.split("/")[-1]:
            correct.append((ent_string, true_id))
            continue
        if true_id == pred_id:
            continue
        if pred_id == 'None' and true_id != 'None':
            not_found.append((ent_string, true_id))
        else:
            wrong.append((ent_string, true_id, pred_id))

    print("CORRECT  : ", len(correct))
    print("NOT FOUND: ", len(not_found))
    print("WRONG    : ", len(wrong))
    print()
    print("accuracy: ", accuracy_score(ent_true, ent_pred))

    write_iterator_to_file(sorted(not_found),
                           "entity_linking_could_not_disambiguate.txt")
    write_iterator_to_file(sorted(wrong), "entity_linking_wrong.txt")

Пример #12

0

Показать файл

def main():
    """Evaluate a keyword baseline for the relationship label.

    For every annotated sample the context returned by ``get_context`` for
    the two entities is lemmatized and joined into one string. The sample is
    predicted 'other' when that string is exactly one of the ``other``
    markers, 'supports' or 'opposes' when it contains one of the respective
    keywords, and 'other' otherwise. Prints the context strings by
    decreasing frequency, then a classification report and confusion matrix.

    Exits with status -1 as soon as a sample has no context.
    """
    training_data = read_ground_truth("../politiquices_training_data.tsv")
    training_data_webapp = read_ground_truth("../../api_annotations/annotations_from_webapp.tsv")
    all_data = training_data + training_data_webapp

    contexts = defaultdict(int)

    other = ['e', ',']
    supports = ['apoiar', 'convidar', 'elogiar', 'confiança', 'felicitar']
    opposes = ['acusar', 'criticar', 'responsabilizar', 'desmentir', 'atacar', 'contrariar']

    true_labels = []
    pred_labels = []

    for sample in all_data:

        if sample['label'] in other_labels:
            true_labels.append('other')
        else:
            # strip the direction markers, keeping only the relation name
            true_labels.append(re.sub(r"_?ent[1-2]_?", "", sample['label']))

        title = sample["title"]
        ent1 = sample["ent1"]
        ent2 = sample["ent2"]
        pos_tags = get_pos_tags(title)
        context = get_context(pos_tags, ent1, ent2)

        if context is None:
            # same effect as exit(-1) without relying on the `site` builtin
            raise SystemExit(-1)

        context_text = ' '.join(t.lemma_ for t in context)
        contexts[context_text] += 1

        # The markers must be compared with the joined text; the previous
        # comparison against the token list `context` could never be true.
        if context_text in other:
            pred_label = 'other'
        elif any(x in context_text for x in supports):
            pred_label = 'supports'
        elif any(x in context_text for x in opposes):
            pred_label = 'opposes'
        else:
            pred_label = 'other'

        pred_labels.append(pred_label)

    for x in sorted(contexts, key=lambda x: contexts[x], reverse=True):
        print(x, contexts[x])

    print()

    print(classification_report(true_labels, pred_labels, zero_division=0.00))
    cm = confusion_matrix(true_labels, pred_labels, labels=['opposes', 'other', 'supports'])
    print_cm(cm, labels=['opposes', 'other', 'supports'])
    print()

Python read_ground_truth примеры использования