Exemplo n.º 1
0
def ner_tagged_tokens(record):
    """Yield a ``(key, text, token[0])`` triple for every tagged token.

    ``record`` is unpacked as a two-item pair; only its second item (the
    payload) is used. The key comes from ``find_key(payload)``; when that
    key is the empty string the generator produces nothing. Otherwise the
    payload is passed through ``html2text`` and the resulting text through
    ``nlp_preproc``, and one triple is yielded per item it returns.

    NOTE(review): ``token[0]`` is presumably the mention surface form --
    confirm against ``nlp_preproc``.
    """
    _unused, payload = record
    record_key = find_key(payload)

    # Guard clause: records without a key contribute no output.
    if record_key == "":
        return

    plain_text = html2text(payload)
    yield from (
        (record_key, plain_text, tagged[0])
        for tagged in nlp_preproc(plain_text)
    )
Exemplo n.º 2
0
def Entities_Linking(record):
    """Yield ``key<TAB>mention<TAB>entity`` strings for linked mentions.

    ``record`` is unpacked as a two-item pair; only its second item (the
    payload) is used. When ``find_key(payload)`` returns the empty string
    nothing is yielded.

    For each distinct tagged token, every candidate returned by
    ``search_candidate(token[0])`` whose ``query_candidate_abstract`` result
    is not ``None`` is scored with ``cosine_sim(text, abstract)``. The
    candidate with the highest strictly positive score is kept for that
    mention. Finally, one output string is yielded per tagged token (in the
    original token order, duplicates included) whose mention was linked.

    NOTE(review): ``token[0]`` is presumably the mention surface form and
    the remaining items its type -- confirm against ``nlp_preproc``.
    """
    _, payload = record
    key = find_key(payload)
    if key == "":
        return

    text = html2text(payload)
    # Materialise the tokens: they are walked twice below, which would
    # silently drop all output if nlp_preproc handed back a one-shot iterator.
    tagged_tokens = list(nlp_preproc(text))

    # De-duplicate while keeping first-seen order. A list (not a set) is
    # used because the tokens are not known to be hashable.
    unique_tokens = []
    for token in tagged_tokens:
        if token not in unique_tokens:
            unique_tokens.append(token)

    linked_entities = {}
    for token in unique_tokens:
        mention = token[0]
        best_score = 0
        best_entity = ""

        for entity, _labels in search_candidate(mention):
            abstract = query_candidate_abstract(entity)
            if abstract is None:
                continue
            score = cosine_sim(text, abstract)
            if score > best_score:
                best_score = score
                best_entity = entity

        # A best score of 0 means no candidate scored above zero.
        if best_score != 0:
            linked_entities[mention] = best_entity

    for token in tagged_tokens:
        mention = token[0]
        if mention in linked_entities:
            yield key + '\t' + mention + '\t' + linked_entities[mention]
Exemplo n.º 3
0

if __name__ == '__main__':
    import sys
    try:
        _, INPUT, ELASTICSEARCH, SPARQL = sys.argv
    except Exception as e:
        print('Usage: python starter-code.py INPUT ELASTICSEARCH SPARQL')
        sys.exit(0)

    with open(INPUT, errors='ignore') as fo:
        for record in split_records(fo):
            key = find_key(record)
            if key != '':
                text = html2text(record)
                tagged_tokens = nlp_preproc(text)
                mentions_types = []
                entity_result_dict = {}
                for token in tagged_tokens:
                    if token not in mentions_types:
                        mentions_types.append(token)

                for token in mentions_types:
                    Tfidf_score_max = 0
                    entity_score_max = ""
                    entities = search_candidate(token[0])

                    for entity, labels in entities:
                        if labels['freebase_label'] == token[0]:
                            score = math.inf
                            Tfidf_score_max = score