Example #1
import numpy
import tagme


def compute_mean(text, means):
    # Collect entity_id -> score for annotations above a 0.3 threshold
    ann_dict = {}
    annotations = tagme.annotate(text)
    for ann in annotations.get_annotations(0.3):
        ann_dict[ann.entity_id] = ann.score
    final_ann = []

    if len(ann_dict) > 1:
        if len(ann_dict) > 10:
            # Keep only the ten highest-scoring entities
            ordered = sorted(ann_dict.items(),
                             key=lambda x: x[1],
                             reverse=True)
            top_ann = ordered[:10]
            final_ann = [x[0] for x in top_ann]
        else:
            final_ann = [x for x in ann_dict]
        media_temp = []
        # Pairwise relatedness between all selected entities
        for i in range(len(final_ann)):
            interest = final_ann[i]
            for j in range(i + 1, len(final_ann)):
                keyword = final_ann[j]
                if interest != keyword:
                    rels = tagme.relatedness_wid((interest, keyword))
                    media_temp.append(rels.relatedness[0].rel)
        media = numpy.mean(media_temp)
        means.append(media)
        print(media)
    else:
        print("only one annotation found for the text")
Example #2
 def process_text_append_text_annotations(input_text: str):
     # Find annotations in a text
     annotations = tagme.annotate(input_text, GCUBE_TOKEN)
     entities = " ".join(
         [word.entity_title for word in annotations.get_annotations(0.2)])
     # Convert characters to lower case
     input_text_to_lower = (input_text + " " + entities).lower()
     # Remove special characters from the string
     input_text_to_lower = re.sub('[^a-zA-Z0-9 \n]', '',
                                  input_text_to_lower)
     # Remove common words using list of stop words
     filtered_words_list = [
         word for word in input_text_to_lower.split()
         if word not in Ranking.stop_words
     ]
     # Stem the list of words
     filtered_words_list = [stem(word) for word in filtered_words_list]
     # Word ranking
     ranked_dict = dict()
     for word in filtered_words_list:
         if word in ranked_dict:
             ranked_dict[word] += 1
         else:
             ranked_dict[word] = 1
     return ranked_dict
Example #3
def tag_text(text):
    while True:
        try:
            annotations = tagme.annotate(text)
            break
        except Exception:
            # The request failed; log and try again
            print(text + " Again!")
    entities_list = set()
    # Print annotations with a score higher than 0.1
    if annotations:
        for ann in annotations.get_annotations(0.1):
            # begin = int(ann.begin)
            # end = int(ann.end)
            # score = float(ann.score)
            entity_title = str(ann.entity_title)
            # if entity_title not in Entities:
            #     print("missing: " + entity_title)
            # entity_title = entity_title.strip().replace(" ", "_").replace(";", "-COLON-").replace("(", "-LRB-").replace(")", "-RRB-")
            entity_title = re.sub(r"\s+", " ", entity_title)
            # entities_list.append([begin, end, entity_title, float(score)])

            entities_list.add(entity_title)
    entities_list = list(entities_list)
    return entities_list
Example #4
 def run(self):
     global queue
     while queue.qsize() > 0:
         # Killed
         if self._stop_event.is_set():
             break
         # Extract entities from sentences in queue
         sentence_meta = queue.get()
         try:
             if 'entities' not in sentence_meta:
                 sentence_annotations = tagme.annotate(
                     sentence_meta['sentence'])
                 entities = [{'pos_begin': ann.begin, 'pos_end': ann.end,
                              'entity_id': ann.entity_id, 'score': ann.score}
                             for ann in sentence_annotations.annotations]
                 sentence_meta['entities'] = entities
             logger.info(
                 '{}, worker: {}, jobs remain: {}.'.format(datetime.now(),
                                                           self._index,
                                                           queue.qsize()))
         except Exception as e:
             logger.warning(e)
             # Send job back to queue
             queue.put(sentence_meta)
     logger.info('Worker {} exited.'.format(self._index))
Example #5
def retrieveTopics(text, topics):
    try:
        annotations = tagme.annotate(text)
        for ann in annotations.get_annotations(0.3):
            topics.setdefault(ann.entity_title, []).append(ann.score)
    except (Timeout, ConnectionError) as exc:
        print "errore di connessione"
Example #6
    def string2ent(self, string):
        tokens = self.tokenizer(string)
        print(string)
        bpe_tokens = sum([self._bpe(t) for t in tokens], tuple())
        bpe_tokens = ' '.join(bpe_tokens).replace(BPEVocab.we, '')
        #using tagme

        ann = tagme.annotate(bpe_tokens)
        ents = []
        for a in ann.get_annotations(0.1):
            if a.entity_title not in ent_map:
                continue
            print(a)
            ents.append([ent_map[a.entity_title], a.begin, a.end, a.score])
        ent_str = self.split_str(bpe_tokens, ents)

        #using spacy
        doc = nlp(string)
        ents = []
        ann = [X.text for X in doc.ents]
        print(ann)
        for a in ann:
            if a not in ent_map:
                continue
            ents.append(ent_map[a])
        '''
        ent_str = self.spacy_str(bpe_tokens, ents)
        '''
        return ent_str
Example #7
def get_annotation(query, th):
    lunch_annotations = tagme.annotate(query)
    res = []
    # Keep annotations with a score higher than the given threshold
    for ann in lunch_annotations.get_annotations(th):
        res.append(ann.entity_title)
    return ' '.join(res)
Example #8
def getEntities(wordList):
    dictionary = {}
    annotationsText = (" ").join(wordList)
    print(annotationsText)
    annotation = tagme.annotate(annotationsText)
    for ann in annotation.get_annotations(0.08):
        dictionary[ann.entity_title] = ann.score
    return dictionary
Example #9
def get_annotation_of_noCleanedFile(user_id, score, dirX):
    lunch_annotations = tagme.annotate(
        take_text_by_id_of_noCleanedFile(user_id, dirX))
    # Keep annotations with a score higher than the given score threshold
    dictionary = {}
    for ann in lunch_annotations.get_annotations(score):
        dictionary[ann.entity_title] = ann.score
    return dictionary
Example #10
 def retrieve_titles_w_tag_me(self, question, tagme_api_key):
     import tagme
     tagme.GCUBE_TOKEN = tagme_api_key
     q_annotations = tagme.annotate(question)
     tagged_titles = []
     for ann in q_annotations.get_annotations(0.1):
         tagged_titles.append(ann.entity_title)
     return tagged_titles
Example #11
def disambig(text, min_rho=None):
    annotations = tagme.annotate(text)
    a = dict()
    for x in annotations.annotations:
        if min_rho is None or x.score > min_rho:
            a[str(x.mention)] = x.entity_title

    return a
Example #12
    def get_links(self, text: str) -> List[Pair]:
        annotations = tagme.annotate(text)

        return [
            Pair(
                f'http://dbpedia.org/resource/{ann.entity_title.replace(" ", "_")}',
                ann.mention, 'entity')
            for ann in annotations.get_annotations(0.1)
        ]
Example #13
 def tagme(text) -> list:
     """
         :text:  --> The text in english after translation
         :return:  --> return a list of entities
     """
     if text == "":
         return []
     tagme.GCUBE_TOKEN = VishvasnewsFactCheckingSiteExtractor.TAGME_API_KEY
     return tagme.annotate(text)
Example #14
def QE_LM_Dirichlet(queries, paragraphs, en=False):
    # pseudo relevance matrix
    result = np.zeros((len(paragraphs), len(queries)), dtype=float)
    mu = 1500

    collection = [w for (pid, para) in paragraphs for w in para.split(" ")]
    count_w_C = {}
    sum_w_D = {}
    for q_index, query in enumerate(queries):
        print("query: %s with %s words.." % (q_index, len(query.split(" "))))
        cwD = np.zeros((len(paragraphs), len(query.split(" "))),
                       dtype=float)
        for no, q in enumerate(query.split(" ")):
            count_w_D = {}
            for p_index, (pid, para) in enumerate(paragraphs):
                count_w_D[pid] = para.split(" ").count(q)
                cwD[p_index, no] = count_w_D.get(pid)
            if q in count_w_C.keys():
                continue
            count_w_C[q] = count_w_D
            sum_w_D[q] = sum(count_w_D.values())
        # Smoothing & i.i.d Language Model
        print(((cwD + mu * collection.count(q)) /
               (sum(count_w_D.values()) + mu)).shape)
        result[:, q_index] = np.prod(
            (cwD + mu * collection.count(q)) / (sum(count_w_D.values()) + mu),
            axis=1)

    if en == False:
        for q_index in range(result.shape[1]):
            print("expanding:%s" % q_index)
            pseudo_relevance = result[:, q_index]
            top_document_index = list(pseudo_relevance.argsort()[0:10])
            for index in top_document_index:
                print("expanding:%s; top document: %s" % (q_index, index))
                words = dict(
                    collections.Counter(paragraphs[index][1].split(" ")))
                words = sorted(words.items(),
                               key=lambda item: item[1],
                               reverse=True)
                queries[q_index] = queries[q_index] + " " + words[0][0]
    else:
        for q_index in range(result.shape[1]):
            pseudo_relevance = result[:, q_index]
            top_document_index = list(pseudo_relevance.argsort())
            count = 0
            for index in top_document_index:
                if count >= 10:
                    break
                annotations = tagme.annotate(paragraphs[index][1])
                # annotations with a score higher than 0.1
                for ann in annotations.get_annotations(0.1):
                    queries[
                        q_index] = queries[q_index] + " " + ann.entity_title
                    count += 1
    return result, queries
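For reference, a minimal sketch of the standard Dirichlet-smoothed language-model estimate named in the smoothing comment above, p(w|D) = (c(w,D) + mu * p(w|C)) / (|D| + mu), for a single word and document; the token lists below are made-up placeholders:

def dirichlet_smoothed(word, doc_tokens, collection_tokens, mu=1500):
    # c(w, D): raw count of the word in the document
    c_w_d = doc_tokens.count(word)
    # p(w | C): relative frequency of the word in the whole collection
    p_w_c = collection_tokens.count(word) / len(collection_tokens)
    # Dirichlet-smoothed probability of the word given the document
    return (c_w_d + mu * p_w_c) / (len(doc_tokens) + mu)

# Example: probability of "italy" in one toy paragraph against a toy collection
doc = "rome is the capital of italy".split()
collection = doc + "germany borders italy and france".split()
print(dirichlet_smoothed("italy", doc, collection))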
Example #15
 def extract_entities(self, question, threshold=0.1):
     if type(question) == list:
         return [self.extract_entities(_question, threshold) for _question in question]
     annotations = tagme.annotate(question)
     return [{
         'entity': ann.entity_title,
         'entity_id': ann.entity_id,
         'mention': ann.mention,
         'score': ann.score
     } for ann in sorted(annotations.get_annotations(threshold),
                         key=lambda x: -x.score)]
Example #16
def entity_list(request):
    import tagme
    # Set the authorization token for subsequent calls.
    tagme.GCUBE_TOKEN = "a5a377c1-1bd0-47b9-907a-75b1cdacb1d9-843339462"

    db_post = PostTitle.objects.filter(user_id=request.user.id,
                                       search_date_interval=val2(),
                                       search_word=val(),
                                       date=date.today().strftime("%b-%d-%Y"))
    file1 = db_post.values_list('title', flat=True)

    # file1 = open('file.txt', 'r', encoding='utf-8')
    entity_title_list = []

    # Using for loop
    for line in file1:
        lunch_annotations = tagme.annotate(line)
        # Keep annotations with a score higher than 0.35 (min_rho)
        for ann in lunch_annotations.get_annotations(min_rho=0.35):
            entity_title_list.append(ann.entity_title)

    # Closing files
    # file1.close()

    entity_title_list = list(dict.fromkeys(entity_title_list))

    relation_degree_list = []
    relation_pair_list = []
    relation_list = [
        "ALL RELATIONS BETWEEN ENTITIES ARE LISTED BELOW", "\n", "\n"
    ]

    for i in range(0, len(entity_title_list)):
        for j in range(i + 1, len(entity_title_list)):
            rels = tagme.relatedness_title(
                (entity_title_list[i], entity_title_list[j]))
            relation_degree = rels.relatedness[0].rel
            # print(f"{entity_title_list[i]} ve {entity_title_list[j]} {relation_degree}")
            # print("\n")
            if relation_degree != 0.0:
                relation_pair_list.append(
                    (entity_title_list[i], entity_title_list[j]))
                relation_degree_list.append(relation_degree)
                pairs = entity_title_list[i] + " " + entity_title_list[j]
                relation_list.append(pairs)
                relation_list.append("\t")
                relation_list.append(relation_degree)
                relation_list.append("\n")
        print(entity_title_list[i])
        print("Kalan entity sayısı:  ", len(entity_title_list) - (i + 1))
        print("done")

    return HttpResponse(relation_list, content_type="text/plain")
Example #17
def tagme_result(request):
    import tagme
    # Set the authorization token for subsequent calls.
    tagme.GCUBE_TOKEN = "a5a377c1-1bd0-47b9-907a-75b1cdacb1d9-843339462"
    '''
    with open('file.txt', 'r') as f:
        first_line = f.readline()

    lunch_annotations = tagme.annotate(first_line)

    ann_list = []
    # Print annotations with a score higher than 0.1
    for ann in lunch_annotations.get_annotations(0.1):
        ann_list.append(ann)
    '''

    # file1 = open('file.txt', 'r', encoding='utf-8')
    count = 0
    ann_list = ["Entity Results", "\n", "\n"]

    entity_title_list = []

    searched_word = val()
    selected_date = val2()

    try:
        db_post = PostTitle.objects.filter(
            user_id=request.user.id,
            search_date_interval=selected_date,
            search_word=searched_word,
            date=date.today().strftime("%b-%d-%Y"))
        file1 = db_post.values_list('title', flat=True)
        # Using for loop
        for line in file1:
            count += 1
            lunch_annotations = tagme.annotate(line)
            ann_list.append(count)
            ann_list.append("\n")

            # Keep annotations with a score higher than 0.35 (min_rho)
            for ann in lunch_annotations.get_annotations(min_rho=0.35):
                ann_list.append(ann)
                ann_list.append("\t")
                ann_list.append(ann.uri())
                ann_list.append("\n")
                entity_title_list.append(ann.entity_title)
        return HttpResponse(ann_list, content_type="text/plain")

    except AttributeError:
        error_message = "Something is wrong. Please contact to administrator or send an email to '*****@*****.**'. Thank you!"
        return HttpResponse(error_message, content_type="text/plain")
Example #18
 def get_normal_enrichment(self, comment):
     base_url = "https://en.wikipedia.org/wiki/"
     tagme.GCUBE_TOKEN = self.tagme_token
     annotations = tagme.annotate(comment)
     for annotation in annotations.get_annotations(0.4):
         response = requests.get(base_url + annotation.entity_title)
         soup = BeautifulSoup(response.text, 'html.parser')
         p = soup.find_all('p')
         wiki_text = ""
         for paragraph in p:
             wiki_text += paragraph.get_text() + " "
         comment += " " + wiki_text
     return comment
Example #19
def mytagme_ann(data):
    annotations = tagme.annotate(data)
    dic = {}
    for ann in annotations.get_annotations(0.2):
        try:
            A, B, score = str(ann).split(" -> ")[0], str(ann).split(
                " -> ")[1].split(" (score: ")[0], str(ann).split(
                    " -> ")[1].split(" (score: ")[1].split(")")[0]
            dic[A] = {"link": B, "score": score}

        except Exception:
            print('error parsing annotation: ' + str(ann))
    return dic
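For comparison, a minimal sketch that builds the same mention -> {link, score} mapping by reading the annotation attributes (entity_title, mention, score, as used in the other examples here) instead of parsing str(ann); it assumes tagme is imported and the GCUBE token is set, and the 0.2 threshold matches the function above:

def mytagme_ann_attrs(data):
    annotations = tagme.annotate(data)
    # ann.mention is the matched text, ann.entity_title the linked Wikipedia title
    return {
        ann.mention: {"link": ann.entity_title, "score": ann.score}
        for ann in annotations.get_annotations(0.2)
    }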
Example #20
def return_tags():
    req_data = request.get_json(force=True)
    text = req_data['text']
    lunch_annotations = tagme.annotate(text)
    # Keep annotations with a score higher than the configured threshold
    entities = []
    for ann in lunch_annotations.get_annotations(score_higher_than):
        s = str(re.findall(r'->(.*?)score:', str(ann)))
        entity = re.sub('[^A-Za-z0-9]+', ' ', s).strip()
        entities.append(entity)

    entities = list(dict.fromkeys(entities))
    entities_output = {"Entity-Linking-Entities": entities}
    return jsonify(entities_output)
Example #21
def get_entities(sentence: str) -> list:
    # Extract entities from a sentence, return information about entities
    res = []
    for ann in tagme.annotate(sentence).annotations:
        entity = {
            'start_pos': ann.begin,
            'end_pos': ann.end,
            'score': ann.score,
            'title': ann.entity_title,
            'tagme_id': ann.entity_id,
            'wiki_id': get_wiki_id(ann.entity_title),
            'kg_id': get_kg_id(ann.entity_title)
        }
        res.append(entity)
    return res
Example #22
def tagme_annotation(quest_path):
    print("annotating using tagme")
    tagme_entity_list = []
    with open(quest_path) as f:
        for i, line in enumerate(f):
            print(i)
            line = line.strip()
            ent_t_score = {}
            lunch_annotations = tagme.annotate(line)
            # Keep annotations with a score higher than 0.2
            for ann in lunch_annotations.get_annotations(0.2):
                #print(ann.entity_title)
                ent_t_score[ann.entity_title] = ann.score
            tagme_entity_list.append(ent_t_score)
    return tagme_entity_list
Example #23
def extract_keywords_from_tweet(text: str, filterStopwords: bool) -> set:

    extractor = extractors.ArticleExtractor()
    keywords = set()
    # print(text)
    links = get_urls_from_text(text)
    # print("Debug: number of links:"+ str(len(links)))

    # From disaster dataset
    text = clean_text(text)

    # Tokenize the cleaned text; use a set so entity titles can be added later
    keywords = set(text.split(" "))

    # Drop tokens that are bare punctuation
    keywords = {key for key in keywords if key not in string.punctuation}

    if filterStopwords == True:
        # Delete stopwords from text
        stop_words = stopwords.words('english')
        word_tokens = word_tokenize(text)
        filtered_sentence = set()

        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.add(w)
        keywords = filtered_sentence

    for url in links:
        try:
            external_content = extractor.get_content_from_url(url)
            # Debug
            # print("External content:" + external_content)
            if external_content != "":
                try:
                    annotations = tagme.annotate(external_content)
                    for ann in annotations.get_annotations(
                            annotation_score_treshold):
                        #
                        #
                        # print(ann)
                        keywords.add(ann.entity_title)
                except:
                    print("Error with tagme, skipping")
        except:
            pass

    return keywords
Example #24
    def identify_entities(self, text):
        result = []
        annotations = tagme.annotate(text)
        for ann in annotations.get_annotations(0.1):
            name = ann.entity_title
            score = ann.score
            wiki_title = tagme.normalize_title(name)
            logger.info("Wiki title: " + wiki_title)
            mid = self.wiki_url.get(wiki_title)
            if mid is None: continue

            e = KBEntity(name, mid, score)
            ie = IdentifiedEntity(name, e, score)
            result.append(ie)

        return result
Example #25
    def __call__(self, src_path, filename):
        """

        src_path - path to web-page file
        filename - name of web-page file
        """
        with open(os.path.join(src_path, filename), 'r') as file:
            doc = ' '.join(line.strip() for line in file)
            cnt = Counter(
                ent.entity_title
                for ent in tagme.annotate(doc).get_annotations(self.rho_score))
        if cnt:
            with open(
                    f'{os.path.join(self.tgt_path, os.path.splitext(filename)[0])}.json',
                    'w') as tgt:
                json.dump(dict(cnt), tgt)
Example #26
def add_europeana_node(data,
                       db_conn,
                       link_to_nodes=False,
                       annotation_threshold=0.1):
    """
    Add Europeana nodes into the MEMEX-KG

    :param data: the dictionary of key->value pairs to be saved in the KG
    :param db_conn: database connection
    :param link_to_nodes: a flag to link Europeana nodes to existing nodes in the MEMEX-KG (default FALSE)
    :param annotation_threshold: mandatory TagMe annotation score threshold (default 0.1)
    """

    wikidata_found = False
    for n in data:
        if link_to_nodes:
            # Add wikidata wids
            wiki_titles = []
            for idx, property_name in enumerate(n[0]):
                if property_name == "label" or property_name == "description" or property_name == "dcCreator":
                    value = n[1][idx]
                    europeana_annotations = tagme.annotate(value)
                    if europeana_annotations:
                        for ann in europeana_annotations.get_annotations(
                                annotation_threshold):
                            t = ann.entity_title
                            t = t[0].lower() + t[1:]
                            wiki_titles.append(t)

            wiki_titles = list(dict.fromkeys(wiki_titles))
            all_wids = []
            for title in wiki_titles:
                wids = db_conn.get_wikidata_ids_by_label(title)
                if wids:
                    all_wids.extend(wids)

            if all_wids:
                wikidata_found = True
                n[0].append("wids")
                n[1].append(all_wids)

        # Insert the node
        db_conn.queue_insert_node(n)  # , additional_class="Europeana")

    # Link to wikidata nodes
    if link_to_nodes and wikidata_found:
        db_conn.match_with_wikidata()
Example #27
def main():
    # Annotate a text.
    print("Annotating text: ", SAMPLE_TEXT)
    #resp = tagme.annotate(SAMPLE_TEXT)
    resp = tagme.annotate(SAMPLE_TEXT, include_categories=True)
    print(resp)
    for ann in resp.annotations:
        print(ann)

    # Find mentions in a text.
    print("Finding mentions in text: ", SAMPLE_TEXT)
    resp = tagme.mentions(SAMPLE_TEXT)
    print(resp)
    for mention in resp.mentions:
        print(mention)

    # Find relatedness between one pair of entities, by title.
    resp = tagme.relatedness_title(["Barack_Obama", "Italy"])
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Find relatedness between pairs of entities, by title.
    resp = tagme.relatedness_title([("Barack_Obama", "Italy"),
                                    ("Italy", "Germany"),
                                    ("Italy", "BAD ENTITY NAME")])
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Access the relatedness response as a dictionary.
    resp_dict = dict(resp)
    print("Relatedness between Italy and Germany: ",
          resp_dict[("Italy", "Germany")])

    # Find relatedness between one pair of entities, by wikipedia id
    resp = tagme.relatedness_wid((31717, 534366))
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Find relatedness between pairs of entities, by wikipedia id
    resp = tagme.relatedness_wid([(534366, 534366 + a) for a in range(1010)])
    print(resp)
    for rel in resp.relatedness:
        print(rel)
Example #28
def tagme_annotate(in_file, out_file, threshold=0.1):
    with jsonlines.open(in_file) as f_in, jsonlines.open(out_file,
                                                         "w") as f_out:
        for line in f_in:
            aliases = []
            spans = []
            qids = []
            probs = []
            text = line["sentence"]
            text_spans = text.split()
            text_span_indices = []
            total_len = 0

            # get word boundaries for converting char spans to word spans
            for i, t in enumerate(text_spans):
                text_span_indices.append(total_len)
                total_len += len(t) + 1
            lunch_annotations = tagme.annotate(text)

            # as the threshold increases, the precision increases, but the recall decreases
            for ann in lunch_annotations.get_annotations(threshold):
                mention = ann.mention
                try:
                    qid = enwiki_title_to_wikidata_id(ann.entity_title)
                except:
                    print(f"No wikidata id found for {ann.entity_title}")
                    continue
                span_start = text_span_indices.index(ann.begin)
                try:
                    span_end = text_span_indices.index(ann.end + 1)
                except:
                    span_end = len(text_spans)
                aliases.append(mention)
                spans.append([span_start, span_end])
                qids.append(qid)
                probs.append(ann.score)

            line["aliases"] = aliases
            line["qids"] = qids
            line["spans"] = spans
            line["probs"] = probs
            line["gold"] = [True for _ in aliases]
            f_out.write(line)
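A tiny worked check of the char-offset-to-word-index conversion used above, with a made-up sentence:

text = "Barack Obama visited Italy"
text_spans = text.split()
text_span_indices = []
total_len = 0
for t in text_spans:
    text_span_indices.append(total_len)  # character offset where each word starts
    total_len += len(t) + 1
# "Italy" starts at character offset 21, which is word index 3
assert text_span_indices.index(21) == 3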
Example #29
async def getEntities(body:TAGME_MODEL):
    text = body.text
    text = re.sub('[^.,a-zA-Z0-9 \n\.]', '', text)
    score = body.tagme_score
    tagme.GCUBE_TOKEN = str(body.tagme_token_api)
    lunch_annotations = tagme.annotate(text)
    # Keep annotations with a score higher than the requested threshold
    entities = []
    for ann in lunch_annotations.get_annotations(score):
        s = str(re.findall(r'->(.*?)score:',str(ann)))
        entity = re.sub('[^A-Za-z0-9]+', ' ', s).strip()
        entities.append(entity)

    entities = list(dict.fromkeys(entities))
    entities = list(set(entities))
    entities_output = {
        "Entities" : entities
    }
    return entities_output
Example #30
    def run(self):
        tagme.GCUBE_TOKEN = TAGME_GCUBE_TOKEN
        with open('output/tagme/batches.pickle', 'rb') as f:
            batch_dict = pickle.load(f)
        batch_questions = batch_dict[self.question_batch]
        dict_annotations = {}
        for q in batch_questions:
            annotated_sentences = {}
            for s, text in q.text.items():
                # This ensures that preprocessing is matched for neural models
                text = ' '.join(tokenize_question(text))
                annotation = annotation_to_dict(tagme.annotate(text))
                annotated_sentences[s] = annotation
            dict_annotations[q.qnum] = annotated_sentences

        with open(
                'output/tagme/tagged_batch_{}.pickle'.format(
                    self.question_batch), 'wb') as f:
            pickle.dump(dict_annotations, f)
Example #31
     totalBotMicroPrecS += len(trueSet) * precS
     totalBotMicroRecS += len(trueSet) * recS
     # BOT macro scores
     totalBotMacroPrecS += precS
     totalBotMacroRecS += recS
     
     totalMyMentionsS += len(resultS)
     
     if verbose:
         print('Split: ' + str(precS) + ', ' + str(recS))
     
 # get results for manually split string
 if doManual:
     # tagme has separate way to do things
     if mthd == 'tagme':
         antns = tagme.annotate(" ".join(line['text']))
         resultM = []
         for an in antns.get_annotations(0.005):
             resultM.append([an.begin,an.end,title2id(an.entity_title)])
     else:
         # unsplit string to be manually split and mentions found
         try:
             resultM = wikifyEval(" ".join(line['text']), False, hybridC = doHybrid, 
                              maxC = maxCands, method = mthd, model = mlModel, erMethod = erMethod)
         except:
             skipped += 1
             badThing.append(line)
             continue
     
     precM = precision(trueEntities, resultM) # precision of manual split
     recM = recall(trueEntities, resultM) # recall of manual split
Example #32
        logging.info("Processing {}".format(xml_file))
        for i, doc in enumerate(get_documents(xml_file)):
            if doc is None:
                logging.warning("Could not parse document {} from {}".format(i, xml_file))
                continue
            key, title, body, time = doc
            
            doc_path = "{}.csv".format(os.path.join(docs_path_base, key))
            entities_path = "{}.csv".format(os.path.join(entities_path_base, key))
            
            if (os.path.isfile(doc_path) and os.path.isfile(entities_path)):
                logging.info("Document {} already annotated, skipping.".format(key))
                continue
            
            logging.info("Annotating document key={} length={} ({})".format(key, len(body), xml_file))
            tagme_response = tagme.annotate(u'{} {}'.format(title, body), args.gcube_token, lang=args.lang)
            if not tagme_response:
                logging.warning("Could not annoate document {} from {} (key {})".format(i, xml_file, key))
                continue
            annotations = tagme_response.get_annotations(min_rho=0.2)
            logging.info("Found {} annotations".format(len(annotations)))
            
            with open(doc_path, 'wb') as csv_doc_out:
                w = csv.DictWriter(csv_doc_out, encoding='utf-8', fieldnames=DOCS_CSV_FIELDS)
                w.writerow({'key': key, 'title': title, 'body': body, 'time': time})

            with open(entities_path, 'wb') as csv_entities_out:
                w = csv.DictWriter(csv_entities_out, encoding='utf-8', fieldnames=ENTITIES_CSV_FIELDS)
                for annotation in annotations:
                    w.writerow({'key': key, 'entity': annotation.entity_title, 'score': annotation.score, 'time': time})