from politiquices.nlp.classifiers.entity_linking.entitly_linking_clf import EntityLinking


def test_fuzzy_match_one_candidate_substring_matches_case_1():
    expanded = ['Marques Mendes']
    candidates = [{
        'wiki': 'Q550243',
        'label': 'Luís Marques Mendes',
        'aliases': ['Luís Manuel Gonçalves Marques Mendes']
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True


def test_fuzzy_match_one_candidate_clean_string_matches_case_1():
    expanded = ['José Pedro Aguiar-Branco']
    candidates = [{
        'wiki': 'Q1555060',
        'label': 'José Pedro Aguiar Branco',
        'aliases': ['José Pedro Correia de Aguiar Branco']
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True


def test_fuzzy_match_one_candidate_substring_matches_case_2():
    expanded = ['Morais Castro']
    candidates = [{
        'wiki': 'Q934980',
        'label': 'José Morais e Castro',
        'aliases': None
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True


def test_fuzzy_match_one_candidate_substring_matches_case_3():
    expanded = ['Ribeiro e Castro']
    candidates = [{
        'wiki': 'Q1386216',
        'label': 'José Ribeiro e Castro',
        'aliases': ['José Duarte de Almeida Ribeiro e Castro']
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True


def test_fuzzy_match_one_candidate_substring_matches_case_4():
    expanded = ['António Marinho']
    candidates = [{
        'wiki': 'Q611182',
        'label': 'Marinho Pinto',
        'aliases': [
            'António Marinho Pinto', 'António Marinho e Pinto',
            'António de Sousa Marinho e Pinto'
        ]
    }]
    assert EntityLinking.fuzzy_match(expanded[0], candidates[0]) is True
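

# EntityLinking.fuzzy_match itself is not shown in this listing. As a reading
# aid, here is a minimal, hypothetical sketch consistent with the cases above:
# a match occurs when every token of the surface string appears in the
# candidate's label or in one of its aliases, after normalising hyphens and
# stray quote characters.
import re


def fuzzy_match_sketch(entity: str, candidate: dict) -> bool:
    def tokens(text: str) -> set:
        return set(re.sub(r'[-”"]', ' ', text).split())

    # 'aliases' may be None (see case_2 above)
    names = [candidate['label']] + (candidate.get('aliases') or [])
    return any(tokens(entity) <= tokens(name) for name in names)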


def test_merge_substrings():
    result = EntityLinking.merge_substrings(
        ['Luís Filipe Menezes', 'Dr. Menezes', 'doutor Menezes'])
    assert result == ['Luís Filipe Menezes']

    result = EntityLinking.merge_substrings(
        ['Pedro Silva Pereira', '”Pedro Silva Pereira'])
    assert result == ['Pedro Silva Pereira']

    result = EntityLinking.merge_substrings(
        ['Luís Marques Mendes', 'Marques Mendes'])
    assert result == ['Luís Marques Mendes']

    result = EntityLinking.merge_substrings(
        ['Filipe Anacoreta Correia', 'Anacoreta Correia'])
    assert result == ['Filipe Anacoreta Correia']

    result = EntityLinking.merge_substrings(
        ['Freitas do Amaral', 'Diogo Freitas do Amaral'])
    assert result == ['Diogo Freitas do Amaral']

    result = EntityLinking.merge_substrings(['George Bush', 'George W. Bush'])
    assert result == ['George W. Bush']

    result = EntityLinking.merge_substrings(['Víto Gaspar', 'Vítor Gaspar'])
    assert result == ['Vítor Gaspar']

    result = EntityLinking.merge_substrings(
        ['Jerónimo Sousa', 'Jerónimo de Sousa'])
    assert result == ['Jerónimo de Sousa']

    result = EntityLinking.merge_substrings(
        ['Nicolas Maduro', 'Nicolás Maduro'])
    assert result == ['Nicolás Maduro']
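

# merge_substrings is likewise not part of this listing; below is a
# hypothetical sketch of the behaviour the assertions above pin down: keep the
# longest variant and fold in names whose tokens it contains (honorifics
# stripped) as well as near-duplicate spellings. The tie-break between
# equal-length variants such as 'Nicolas'/'Nicolás' is not reproduced here.
from difflib import SequenceMatcher

HONORIFICS = {'dr.', 'doutor'}  # assumed list of stripped titles


def merge_substrings_sketch(names: list) -> list:
    def core_tokens(name: str) -> set:
        return {t for t in name.split() if t.lower() not in HONORIFICS}

    def subsumes(longer: str, shorter: str) -> bool:
        if core_tokens(shorter) <= core_tokens(longer):
            return True
        # catch small typos and accent variants, e.g. 'Víto' vs 'Vítor'
        return SequenceMatcher(None, longer, shorter).ratio() > 0.9

    cleaned = [n.replace('”', '').strip() for n in names]
    kept = []
    for name in sorted(cleaned, key=len, reverse=True):
        if not any(subsumes(k, name) for k in kept):
            kept.append(name)
    return kept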
Example #7

import json

import joblib
import jsonlines

from politiquices.nlp.classifiers.entity_linking.entitly_linking_clf import EntityLinking
from politiquices.nlp.classifiers.relationship.train_clf_linear import get_text_tokens
from politiquices.nlp.data_sources.articles_db import ArticlesDB

# get_ner, clean_title_re, DirectionClassifier and the MODELS path are assumed
# to be defined elsewhere in this module or in the surrounding politiquices
# package; they are not shown in this example.

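# parse_args() is also not shown in the original example; a minimal sketch of
# the assumed command-line interface, one option per supported input source:
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--publico", help="TSV file with Público titles")
    parser.add_argument("--chave", help="JSONL file from the CHAVE collection")
    parser.add_argument("--arquivo", help="JSONL file from Arquivo.pt")
    return parser.parse_args()
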
def main():
    args = parse_args()

    if args.publico:
        f_name = args.publico

    if args.chave:
        f_name = args.chave

    if args.arquivo:
        f_name = args.arquivo

    # load the relationships classification model
    print("Loading relationship classifier...")
    relationship_clf = joblib.load(MODELS + "SVC_2021_06_19_03_12.joblib")
    tf_idf_vectorizer = joblib.load(MODELS +
                                    "tf_idf_weights_2021_06_19_03_12.joblib")

    print("Loading NER classifier")
    ner = get_ner()
    # ToDo: load named-entities that should be ignored in the NER model itself
    with open('../classifiers/ner/names_ignore.txt', 'rt') as f_in:
        ner_ignore = [line.strip() for line in f_in.readlines()]

    print("Loading relation direction classifier")
    direction_clf = DirectionClassifier()

    print("Loading Entity Linking")
    articles_db = ArticlesDB()

    # hard-coded mappings from frequent short surface forms to canonical names
    mappings = {
        "Cavaco": "Aníbal Cavaco Silva",
        "Marques Mendes": "Luís Marques Mendes",
    }

    el = EntityLinking(ner, articles_db, mappings)

    # log everything for error analysis
    ner_ignored = jsonlines.open("ner_ignored.jsonl", mode="w")
    no_entities = jsonlines.open("titles_processed_no_entities.jsonl",
                                 mode="w")
    more_entities = jsonlines.open("titles_processed_more_entities.jsonl",
                                   mode="w")
    processed = jsonlines.open("titles_processed.jsonl", mode="w")
    ner_linked = jsonlines.open("ner_linked.jsonl", mode="w")
    processing_errors = jsonlines.open("processing_errors.jsonl", mode="w")

    count = 0

    with open(f_name, 'rt') as f_in:

        for line in f_in:

            if args.publico:
                entry = line.split('\t')
                date = entry[0]
                url = entry[1]
                title = entry[2]

            elif args.arquivo or args.chave:
                entry = json.loads(line)
                title = entry["title"]
                url = entry["linkToArchive"]
                date = entry["tstamp"]

            count += 1
            if count % 1000 == 0:
                print(count)

            try:
                cleaned_title = clean_title_re(title)
            except Exception as e:
                print(e, title)
                continue

            # named-entity recognition
            persons = ner.tag(cleaned_title)

            # ignore certain 'person' entities
            # ToDo: move this to the ner object
            if any(person in persons for person in ner_ignore):
                ner_ignored.write({
                    "title": cleaned_title,
                    "entities": persons
                })
                continue

            # fewer than two persons: no relationship can be extracted
            if len(persons) <= 1:
                no_entities.write({
                    "title": cleaned_title,
                    "entities": persons
                })
                continue

            # more than two persons: only pairwise relationships are handled
            if len(persons) > 2:
                more_entities.write({
                    "title": cleaned_title,
                    "entities": persons
                })
                continue

            # entity linking
            entity1_wiki = el.entity_linking(persons[0], url)
            entity2_wiki = el.entity_linking(persons[1], url)

            # relationship extraction
            labels = ['opposes', 'other', 'supports']

            sample = {
                'title': cleaned_title,
                'ent1': persons[0],
                'ent2': persons[1]
            }

            try:
                textual_context = get_text_tokens([sample], tokenized=True)
            except TypeError:
                processing_errors.write(sample)
                continue

            tf_idf_weights = tf_idf_vectorizer.transform(textual_context)
            predicted_probs = relationship_clf.predict_proba(tf_idf_weights)
            rel_type_scores = {
                label: float(pred)
                for label, pred in zip(labels, predicted_probs[0])
            }

            pred_rel = max(rel_type_scores, key=rel_type_scores.get)

            if pred_rel != 'other':
                # detect relationship direction
                pred, pattern, context, pos_tags = direction_clf.detect_direction(
                    cleaned_title, persons[0], persons[1])
                # substitute the predicted label into the direction pattern,
                # e.g. 'ent1_rel_ent2' becomes 'ent1_opposes_ent2'
                pred_rel = pred.replace("rel", pred_rel)

            result = {
                "title": cleaned_title,
                "entities": persons,
                "ent_1": entity1_wiki,
                "ent_2": entity2_wiki,
                "scores": rel_type_scores,
                "pred_rel": pred_rel,
                "url": url,
                "date": date,
            }

            if entity1_wiki and entity2_wiki:
                processed.write(result)

            ner_linked.write({
                "ner": persons[0],
                "wiki": result['ent_1'],
                "url": url
            })
            ner_linked.write({
                "ner": persons[1],
                "wiki": result['ent_2'],
                "url": url
            })
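

# Conventional script entry point, not shown in the original snippet:
if __name__ == "__main__":
    main()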
Example #8

from collections import defaultdict

from politiquices.nlp.classifiers.entity_linking.entitly_linking_clf import EntityLinking
from politiquices.nlp.data_sources.articles_db import ArticlesDB
from politiquices.nlp.extraction_pipeline.extract_relationships import get_ner
from politiquices.nlp.utils.utils import read_ground_truth, write_iterator_to_file

from sklearn.metrics import accuracy_score

mappings = {
    "Cavaco": "Aníbal Cavaco Silva",
    "Marques Mendes": "Luís Marques Mendes",
}

articles_db = ArticlesDB()
ner = get_ner()
el = EntityLinking(ner, articles_db, mappings)

all_ent_surface_string = []
ent_surface_string_with_wiki = []
ent_surface_string_without_wiki = []
ent_true = []
ent_pred = []
freqs = defaultdict(list)


def evaluate_one(entity_str, entity_id, url):
    res = el.entity_linking(entity_str, url)
    # 'None' as a string so that accuracy_score() can compare unlinked entities
    true = entity_id.split("/")[-1] if entity_id else 'None'
    pred = res['wiki_id'] if res else 'None'
    ent_true.append(true)