def _tagme_mention_texts(text):
    '''Return the surface text of every mention TagMe spots in ``text``.'''
    response = tagme.mentions(text)
    # Each mention is reduced to the part of its string form that precedes
    # the first '[' (minus the separating character before the bracket).
    return [
        str(mention).strip().split('[')[0][:-1]
        for mention in response.mentions
    ]


def _label_description_entities(label, label_desc):
    '''Return the entities found in the description of ``label`` ([] if unknown or on failure).'''
    if label not in label_desc:
        return []
    try:
        return _tagme_mention_texts(label_desc.get(label))
    except Exception as exc:
        # Best effort: a failed lookup yields an empty entity list for the label.
        print('entity extraction failed for label', label, ':', exc)
        return []


def extract_entity(filename, label_desc):
    '''
    Extract TagMe entities for every EHR row and for every label description.

    :param filename: path of the EHR CSV file. The first row is treated as a
        header and skipped; data rows are SUBJECT_ID,HADM_ID,TEXT,LABELS with
        the labels separated by ';'.
    :param label_desc: dict mapping an ICD label to its description
    :return: None. Two files are written:
        data/EHR-label-entity-kg.csv: the input rows with one extra column
            (before LABELS) holding the ';'-joined entities of the EHR text.
            Rows whose entity extraction fails are not written.
        data/label_entity.pkl: pickled dict mapping each label seen in the
            input to the list of entities found in its description.
    '''
    label_entity_EHR_related = {}
    # The input is opened first so that a missing input file does not
    # truncate an existing output file; both handles are closed on any error.
    with open(filename, 'r') as f, \
            open('data/EHR-label-entity-kg.csv', 'w', newline='') as writer_f:
        reader = csv.reader(f)
        writer = csv.writer(writer_f)
        next(reader, None)  # skip the header row
        for row in reader:
            # Source columns: SUBJECT_ID,HADM_ID,TEXT,LABELS
            print('row:', row)
            if len(row) < 4:
                # e.g. a blank line; nothing usable in it
                print('skipping malformed row:', row)
                continue
            try:
                entities = _tagme_mention_texts(row[2])
            except Exception as exc:
                # Best effort: the row is left out of the output CSV.
                print('entity extraction failed for row', row[0], row[1], ':', exc)
            else:
                writer.writerow(
                    [row[0], row[1], row[2], ';'.join(entities), row[3]])

            for label in row[3].split(';'):
                # Resolve each label only once: this avoids repeated TagMe
                # requests and keeps a later failure from wiping a good result.
                if label not in label_entity_EHR_related:
                    label_entity_EHR_related[label] = \
                        _label_description_entities(label, label_desc)
                print(label_entity_EHR_related.get(label))

    # Persist the label -> entities mapping for later use.
    with open('data/label_entity.pkl', 'wb') as f:
        pickle.dump(label_entity_EHR_related, f)
예제 #2
0
    def extract_mentions(self, question):
        """Return the TagMe mentions of ``question`` ordered by link probability.

        :param question: a text, or a list of texts (lists may be nested)
        :return: for a text, a list of ``{'mention': ..., 'score': ...}`` dicts
            sorted by descending ``linkprob``; for a list, a list with one
            such result per element
        """
        # isinstance (rather than an exact type comparison) also accepts
        # list subclasses.
        if isinstance(question, list):
            return [self.extract_mentions(_question) for _question in question]

        response = tagme.mentions(question)
        ranked = sorted(response.mentions,
                        key=lambda found: found.linkprob,
                        reverse=True)
        return [{'mention': found.mention, 'score': found.linkprob}
                for found in ranked]
예제 #3
0
 def process_text_append_text_mentions(input_text: str):
     """Count stemmed, non-stop-word terms of the text plus its TagMe mentions.

     The mentions returned by ``get_mentions(0.01)`` are appended to the
     input text, the result is lower-cased, stripped of every character
     other than letters, digits, spaces and newlines, split on whitespace,
     filtered against ``Ranking.stop_words`` and stemmed with ``stem``.

     :param input_text: the text to process
     :return: dict mapping each stemmed term to its number of occurrences
     """
     spotted = tagme.mentions(input_text, GCUBE_TOKEN)
     mention_texts = []
     for spot in spotted.get_mentions(0.01):
         mention_texts.append(spot.mention)

     # Original text followed by the mention texts, all in lower case.
     combined = input_text + " " + " ".join(mention_texts)
     cleaned = re.sub('[^a-zA-Z0-9 \n]', '', combined.lower())

     # Drop stop words first, then stem what is left.
     kept_tokens = []
     for token in cleaned.split():
         if token not in Ranking.stop_words:
             kept_tokens.append(token)
     stemmed_tokens = list(map(stem, kept_tokens))

     # Term frequencies, in order of first appearance.
     term_counts = {}
     for term in stemmed_tokens:
         term_counts[term] = term_counts.get(term, 0) + 1
     return term_counts
예제 #4
0
def Annotation_mentions(txt):
    """
    Find the spans of a text that can be Wikipedia concept entities.

    :param txt: a piece of text, str
    :return: dict whose keys are the mention texts as they appear in ``txt``
        and whose values are the text following ``"] lp="`` in the mention's
        string form (presumably the link probability, kept as a string);
        ambiguous mentions are included as well
    """
    annotation_mentions = tagme.mentions(txt)
    dic = dict()
    for mention in annotation_mentions.mentions:
        try:
            # The string form is parsed as "<text> [<...>] lp=<value>".
            rendered = str(mention)
            dic[rendered.split(" [")[0]] = rendered.split("] lp=")[1]
        except Exception:
            # Lazy %-formatting: concatenating the mention object to a str
            # would itself raise TypeError inside this handler.
            logger.error('error annotation_mention about %s', mention)
    return dic
예제 #5
0
def Annotation_mentions(txt):
    """
    Discover the spans of a text that can be Wikipedia concept entities.

    :param txt: a text object, str type
    :return: dict whose keys are the mention texts as they appear in ``txt``
        and whose values are the text following ``"] lp="`` in the mention's
        string form (presumably the link probability, kept as a string);
        mentions that are ambiguous are included too
    """
    annotation_mentions = tagme.mentions(txt)
    dic = dict()
    for mention in annotation_mentions.mentions:
        try:
            # The string form is parsed as "<text> [<...>] lp=<value>".
            mention_repr = str(mention)
            dic[mention_repr.split(" [")[0]] = mention_repr.split("] lp=")[1]
        except Exception:
            # Pass the mention as a logging argument: 'str' + mention object
            # would raise TypeError from inside the handler.
            logger.error('error annotation_mention about %s', mention)
    return dic
예제 #6
0
def main():
    """Demonstrate the TagMe calls: annotation, mention spotting and relatedness."""

    def show(response, field):
        # Print the response itself, then every item of its result list.
        print(response)
        for item in getattr(response, field):
            print(item)

    # Annotate a text.
    print("Annotating text: ", SAMPLE_TEXT)
    show(tagme.annotate(SAMPLE_TEXT, include_categories=True), "annotations")

    # Find mentions in a text.
    print("Finding mentions in text: ", SAMPLE_TEXT)
    show(tagme.mentions(SAMPLE_TEXT), "mentions")

    # Relatedness of a single pair of entities, given by title.
    show(tagme.relatedness_title(["Barack_Obama", "Italy"]), "relatedness")

    # Relatedness of several pairs of entities, given by title.
    title_pairs = [
        ("Barack_Obama", "Italy"),
        ("Italy", "Germany"),
        ("Italy", "BAD ENTITY NAME"),
    ]
    by_title = tagme.relatedness_title(title_pairs)
    show(by_title, "relatedness")

    # The relatedness response can also be read as a dictionary.
    pair_scores = dict(by_title)
    print("Relatedness between Italy and Germany: ",
          pair_scores[("Italy", "Germany")])

    # Relatedness of a single pair of entities, given by wikipedia id.
    show(tagme.relatedness_wid((31717, 534366)), "relatedness")

    # Relatedness of many pairs of entities, given by wikipedia id.
    wid_pairs = [(534366, 534366 + offset) for offset in range(1010)]
    show(tagme.relatedness_wid(wid_pairs), "relatedness")