def extract_entity(filename, label_desc):
    """Extract TagMe entities from EHR texts and from their ICD label descriptions.

    :param filename: CSV file of electronic health records with columns
        SUBJECT_ID, HADM_ID, TEXT, LABELS (the header row is skipped).
    :param label_desc: dict mapping ICD label -> textual description.
    :return: None.  Side effects:
        - writes data/EHR-label-entity-kg.csv: the input columns plus a
          ';'-joined entity column inserted before the LABELS column
        - pickles {label: [entity, ...]} to data/label_entity.pkl
    """
    label_entity_EHR_related = {}
    # Use context managers so both files are closed even on error.
    with open(filename, 'r') as f_in, \
            open('data/EHR-label-entity-kg.csv', 'w', newline='') as f_out:
        writer = csv.writer(f_out)
        reader = csv.reader(f_in)
        data = list(reader)[1:]  # drop the header row
        for row in data:
            # Original columns: SUBJECT_ID, HADM_ID, TEXT, LABELS
            print('row:', row)
            # Extract entities from the EHR free text via TagMe.
            try:
                mentions = tagme.mentions(row[2]).mentions
                # str(mention) looks like "text [start,end] lp=...";
                # keep only the surface text before the bracket.
                entities = ';'.join(
                    str(m).strip().split('[')[0][:-1] for m in mentions)
                writer.writerow([row[0], row[1], row[2], entities, row[3]])
            except Exception:
                # Best effort: skip rows TagMe cannot process.
                pass
            # Extract entities from each label's description, once per label.
            for label in row[3].split(';'):
                if label in label_entity_EHR_related:
                    # Fix: previously a TagMe failure (or a label without a
                    # description) clobbered an already-populated entry.
                    continue
                if label in label_desc:
                    try:
                        mentions = tagme.mentions(label_desc[label]).mentions
                        label_entity_EHR_related[label] = [
                            str(m).strip().split('[')[0][:-1]
                            for m in mentions]
                    except Exception:
                        label_entity_EHR_related[label] = []
                else:
                    label_entity_EHR_related[label] = []
                print(label_entity_EHR_related.get(label))
    # Persist the label -> entity mapping for later reuse.
    with open('data/label_entity.pkl', 'wb') as f:
        pickle.dump(label_entity_EHR_related, f)
def extract_mentions(self, question):
    """Return TagMe mentions for *question*, sorted by link probability.

    :param question: a string, or a list of strings (handled recursively,
        returning one result list per input string).
    :return: list of {'mention': str, 'score': float} dicts ordered by
        descending link probability.
    """
    # Fix: use isinstance rather than a type equality check, so list
    # subclasses are also handled by the batch branch.
    if isinstance(question, list):
        return [self.extract_mentions(q) for q in question]
    response = tagme.mentions(question)
    ranked = sorted(response.mentions, key=lambda m: m.linkprob, reverse=True)
    return [{'mention': m.mention, 'score': m.linkprob} for m in ranked]
def process_text_append_text_mentions(input_text: str):
    """Rank word frequencies in *input_text* augmented with TagMe entities.

    The text is extended with entity mentions found by TagMe (link
    probability > 0.01), lower-cased, stripped of special characters,
    filtered against the stop-word list, stemmed, and counted.

    :param input_text: raw text to process.
    :return: dict mapping stemmed word -> occurrence count.
    """
    from collections import Counter

    # Find entity mentions (spots) in the text via TagMe.
    mentions = tagme.mentions(input_text, GCUBE_TOKEN)
    entities = " ".join(word.mention for word in mentions.get_mentions(0.01))
    # Lower-case text + entities, then keep only alphanumerics and spaces.
    input_text_to_lower = (input_text + " " + entities).lower()
    input_text_to_lower = re.sub('[^a-zA-Z0-9 \n]', '', input_text_to_lower)
    # Remove stop words first, then stem the remaining words
    # (order preserved from the original implementation).
    filtered_words_list = [
        stem(word)
        for word in input_text_to_lower.split()
        if word not in Ranking.stop_words
    ]
    # Word frequency ranking; Counter replaces the manual counting loop.
    return dict(Counter(filtered_words_list))
def Annotation_mentions(txt):
    """Find phrases in a text that could be Wikipedia concept entities.

    :param txt: a piece of text, str.
    :return: dict whose keys are the surface mentions found in the text
        and whose values are the link probability ("lp") TagMe assigned
        to each mention; mentions that map to ambiguous wiki concepts
        are included as well.
    """
    annotation_mentions = tagme.mentions(txt)
    dic = dict()
    for mention in annotation_mentions.mentions:
        try:
            # str(mention) looks like "text [start,end] lp=0.123"
            dic[str(mention).split(" [")[0]] = str(mention).split("] lp=")[1]
        except IndexError:
            # Fix: concatenating the Mention object to a str raised
            # TypeError inside the handler; convert explicitly.
            logger.error('error annotation_mention about ' + str(mention))
    return dic
def Annotation_mentions(txt):
    """Discover the concepts in *txt* that could be wiki concept entities.

    :param txt: a text object, str type.
    :return: key-value pairs where the key is the original entity mention
        in the text and the value is TagMe's link probability ("lp") for
        that mention; mentions belonging to ambiguous wiki concepts are
        also included.
    """
    annotation_mentions = tagme.mentions(txt)
    dic = dict()
    for mention in annotation_mentions.mentions:
        try:
            # str(mention) has the form "text [start,end] lp=0.123"
            dic[str(mention).split(" [")[0]] = str(mention).split("] lp=")[1]
        except IndexError:
            # Fix: 'str' + Mention raised TypeError in the original
            # handler; format the mention via str() before logging.
            logger.error('error annotation_mention about ' + str(mention))
    return dic
def main():
    """Demo of the tagme API: annotation, mention spotting, relatedness."""
    # Annotate a text, asking for category information as well.
    print("Annotating text: ", SAMPLE_TEXT)
    annotation_resp = tagme.annotate(SAMPLE_TEXT, include_categories=True)
    print(annotation_resp)
    for annotation in annotation_resp.annotations:
        print(annotation)

    # Find mentions in a text.
    print("Finding mentions in text: ", SAMPLE_TEXT)
    mention_resp = tagme.mentions(SAMPLE_TEXT)
    print(mention_resp)
    for found_mention in mention_resp.mentions:
        print(found_mention)

    # Relatedness between a single pair of entities, by title.
    single_title_resp = tagme.relatedness_title(["Barack_Obama", "Italy"])
    print(single_title_resp)
    for rel in single_title_resp.relatedness:
        print(rel)

    # Relatedness between several pairs of entities, by title
    # (the last pair deliberately contains an invalid entity name).
    title_pairs = [("Barack_Obama", "Italy"),
                   ("Italy", "Germany"),
                   ("Italy", "BAD ENTITY NAME")]
    multi_title_resp = tagme.relatedness_title(title_pairs)
    print(multi_title_resp)
    for rel in multi_title_resp.relatedness:
        print(rel)

    # The relatedness response can also be viewed as a dictionary.
    resp_as_dict = dict(multi_title_resp)
    print("Relatedness between Italy and Germany: ",
          resp_as_dict[("Italy", "Germany")])

    # Relatedness between one pair of entities, by wikipedia id.
    single_wid_resp = tagme.relatedness_wid((31717, 534366))
    print(single_wid_resp)
    for rel in single_wid_resp.relatedness:
        print(rel)

    # Relatedness between many pairs of entities, by wikipedia id.
    wid_pairs = [(534366, 534366 + offset) for offset in range(1010)]
    multi_wid_resp = tagme.relatedness_wid(wid_pairs)
    print(multi_wid_resp)
    for rel in multi_wid_resp.relatedness:
        print(rel)