import nltk


def GetGoldAnwer(SentencesList):
    """Collect the gold (annotated) entity spans and relation spans/labels of y_test."""
    gold_entity = []    # entity spans of every sentence
    gold_relation = []  # entity spans plus label of every relation
    for sentence in SentencesList:
        text = sentence['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        if len(text) == 0 or len(text) == 1:  # skip sentences that are empty or a lone period
            continue
        entity_s = []    # entities of this sentence
        relation_s = []  # relations of this sentence
        for entity in sentence['entity']:  # keep the span of every entity in the sentence
            entity_s.append(entity)
        for pair in sentence['pair']:
            e1_index = pair[0]  # index of entity 1 within the sentence
            e2_index = pair[1]  # index of entity 2 within the sentence
            label = pair[2]     # relation label
            e1_dir = sentence['entity'][e1_index][:2]  # left/right boundaries of entity 1 (whitespace ignored)
            e2_dir = sentence['entity'][e2_index][:2]  # left/right boundaries of entity 2 (whitespace ignored)
            labelAbbr = RelationAbbr[label]            # abbreviation of the relation label
            relation_s.append([e1_dir[0], e1_dir[1], e2_dir[0], e2_dir[1], labelAbbr])
        gold_entity.append(entity_s)
        gold_relation.append(relation_s)
    print(len(gold_entity))
    return gold_entity, gold_relation
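# ---------------------------------------------------------------------------
# Hedged usage sketch for GetGoldAnwer above. Everything in this block is an
# assumption made so the excerpt can run standalone: sample_token4 is stubbed
# as the identity function and RelationAbbr as a one-entry table; the real
# definitions live elsewhere in the repo (delete the stubs once those are
# imported). Entity offsets count characters of the text with all whitespace
# removed, both boundaries inclusive. Requires the NLTK 'punkt' models
# (nltk.download('punkt')).
# ---------------------------------------------------------------------------
def sample_token4(text):  # hypothetical stand-in: the real helper normalizes raw text
    return text

RelationAbbr = {'INHIBITOR': 'C1'}  # hypothetical abbreviation table

_GetGoldAnwer_with_relations = GetGoldAnwer  # capture this version; a later def reuses the name

def _demo_GetGoldAnwer():
    sentences = [{
        'text': 'Aspirin inhibits COX-2.',
        'entity': [[0, 6, 'CHEMICAL'], [15, 19, 'GENE-Y']],  # [left, right, label]
        'pair': [[0, 1, 'INHIBITOR']],                       # [e1 index, e2 index, label]
    }]
    gold_entity, gold_relation = _GetGoldAnwer_with_relations(sentences)
    # gold_entity   -> [[[0, 6, 'CHEMICAL'], [15, 19, 'GENE-Y']]]
    # gold_relation -> [[[0, 6, 15, 19, 'C1']]]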
def GenarateBIO(senslist, schema, num_toolong):
    """Convert the annotated sentences into a 2-D list of BIO/BIOES token labels."""
    article = []  # article-level list
    for i in range(len(senslist)):
        text = senslist[i]['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        entitys = senslist[i]['entity']
        word_label = []  # sentence-level list
        left = -1
        right = -1
        for token in text:
            left = right + 1            # boundaries of the current token (both ends inclusive)
            right = right + len(token)
            ifBI = 0
            if schema == 'BIO':
                for entity in entitys:  # each entity is a list [left, right, label]
                    if left == entity[0] and right <= entity[1]:
                        word_label.append([token, 'B'])
                        ifBI = 1
                        break
                    elif left > entity[0] and right <= entity[1]:
                        word_label.append([token, 'I'])
                        ifBI = 1
                        break
                    elif left == entity[0] and right > entity[1]:
                        num_toolong += 1  # token runs past the entity boundary; falls through to 'O'
                if ifBI == 0:
                    word_label.append([token, 'O'])
            if schema == 'BIOES':
                for entity in entitys:  # each entity is a list [left, right, label]
                    if left == entity[0] and right == entity[1]:
                        word_label.append([token, 'S'])
                        ifBI = 1
                        break
                    elif left == entity[0] and right < entity[1]:
                        word_label.append([token, 'B'])
                        ifBI = 1
                        break
                    elif left > entity[0] and right < entity[1]:
                        word_label.append([token, 'I'])
                        ifBI = 1
                        break
                    elif left > entity[0] and right == entity[1]:
                        word_label.append([token, 'E'])
                        ifBI = 1
                        break
                    elif left == entity[0] and right > entity[1]:
                        num_toolong += 1  # token runs past the entity boundary; falls through to 'O'
                if ifBI == 0:
                    word_label.append([token, 'O'])
        article.append(word_label)
    return article, num_toolong
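# ---------------------------------------------------------------------------
# Sketch of the boundary arithmetic used above (an illustrative example, not
# repo code). Because entity offsets ignore whitespace, concatenating the
# tokens end to end reproduces the offsets: each token spans the inclusive
# range [left, right] in the whitespace-free text.
# ---------------------------------------------------------------------------
def _demo_token_boundaries():
    tokens = nltk.tokenize.word_tokenize('Aspirin inhibits COX-2.')
    left, right = -1, -1
    for token in tokens:
        left = right + 1            # inclusive left boundary
        right = right + len(token)  # inclusive right boundary
        print(token, left, right)
    # Aspirin  0  6
    # inhibits 7  14
    # COX-2    15 19
    # .        20 20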
def GetGoldAnwer(SentencesList):
    """Collect the gold (annotated) entity spans of y_test."""
    gold_entity = []  # entity spans of every sentence
    for sentence in SentencesList:
        text = sentence['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        if len(text) == 0 or len(text) == 1:  # skip sentences that are empty or a lone period
            continue
        entity_s = []  # entities of this sentence
        for entity in sentence['entity']:  # keep the span of every entity in the sentence
            entity_s.append([entity[0], entity[1]])
        gold_entity.append(entity_s)
    return gold_entity
def GetGoldAnwer(SentencesList):
    """Collect the gold (annotated) CHEMICAL entity spans of y_test, with abbreviated labels."""
    gold_entity = []  # entity spans of every sentence
    for sentence in SentencesList:
        text = sentence['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        entity_s = []  # entities of this sentence
        for entity in sentence['entity']:  # keep the span of every CHEMICAL entity
            if entity[2] == 'CHEMICAL':
                entity_s.append([entity[0], entity[1], eAbbr[entity[2]]])
        gold_entity.append(entity_s)
    return gold_entity
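# eAbbr is indexed here and in GenarateBIO below but never defined in this
# excerpt; a hypothetical shape consistent with that usage (an assumption,
# not the repo's real table):
eAbbr = {'CHEMICAL': 'C', 'GENE-Y': 'GY', 'GENE-N': 'GN'}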
def GetMap(self):
    alltokens = []      # 2-D list: the tokens of every sentence
    eindex2tindex = []  # per sentence: mapping from entity index to token indices
    rindex2eindex = []  # per sentence: mapping from relation index to entity indices
    for i in range(len(self.SentencesList)):
        # extract the tokens
        text = self.SentencesList[i]['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        if len(text) == 0 or len(text) == 1:  # skip sentences that are empty or a lone period
            continue
        tokens = []
        for token in text:
            tokens.append(token.lower())
        alltokens.append(tokens)
        # extract the entity-to-token mapping
        entitys = self.SentencesList[i]['entity']
        e2t = {}  # sentence-level entity-to-token mapping
        for j in range(len(entitys)):
            # every entity must map to at least one token, otherwise no position index can be generated
            e2t[j] = []
            entity = entitys[j]
            left = -1
            right = -1
            for k in range(len(text)):
                token = text[k]
                left = right + 1            # boundaries of the current token (both ends inclusive)
                right = right + len(token)
                if (left == entity[0]
                        or (left > entity[0] and right <= entity[1])
                        or (left < entity[0] and right > entity[0])):
                    e2t[j].append(k)
            if len(e2t[j]) == 0:
                print(text)
                print(entitys[j])
        eindex2tindex.append(e2t)
        # extract the relation-to-entity mapping
        relations = self.SentencesList[i]['pair']
        rindex2eindex.append(relations)
    return alltokens, eindex2tindex, rindex2eindex
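# ---------------------------------------------------------------------------
# Standalone check of the overlap test in GetMap above (illustrative,
# hypothetical span): it selects tokens that start at the entity's left edge,
# lie inside the span, or straddle the left edge.
# ---------------------------------------------------------------------------
def _demo_entity_to_tokens():
    tokens = nltk.tokenize.word_tokenize('Aspirin inhibits COX-2.')
    entity = [15, 19, 'GENE-Y']  # hypothetical span covering 'COX-2'
    matched = []
    left, right = -1, -1
    for k, token in enumerate(tokens):
        left = right + 1
        right = right + len(token)
        if (left == entity[0]
                or (left > entity[0] and right <= entity[1])
                or (left < entity[0] and right > entity[0])):
            matched.append(k)
    print(matched)  # -> [2], i.e. 'COX-2'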
def GenarateBIO(senslist, schema):
    """Generate a 2-D list of BIOES token labels (CHEMICAL pass first, then GENE overwrites)."""
    article = []  # article-level list
    token_more_than_entity = 0
    for i in range(len(senslist)):
        text = senslist[i]['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        entitys = senslist[i]['entity']
        word_label = []  # sentence-level list
        left = -1
        right = -1
        for token in text:
            left = right + 1            # boundaries of the current token (both ends inclusive)
            right = right + len(token)
            ifBI = 0
            if schema == 'BIOES':
                # First tag the token against CHEMICAL entities, then let GENE
                # entities overwrite; neither loop breaks, so a later match
                # replaces an earlier one.
                for entity in entitys:
                    if entity[2] == 'CHEMICAL':
                        if left == entity[0] and right == entity[1]:
                            now_token = [token, 'S-' + eAbbr[entity[2]]]
                            ifBI = 1
                        elif left == entity[0] and right < entity[1]:
                            now_token = [token, 'B-' + eAbbr[entity[2]]]
                            ifBI = 1
                        elif left > entity[0] and right < entity[1]:
                            now_token = [token, 'I-' + eAbbr[entity[2]]]
                            ifBI = 1
                        elif left > entity[0] and right == entity[1]:
                            now_token = [token, 'E-' + eAbbr[entity[2]]]
                            ifBI = 1
                        elif left == entity[0] and right > entity[1]:
                            # the token runs past the entity boundary: tag as single and count it
                            now_token = [token, 'S-' + eAbbr[entity[2]]]
                            ifBI = 1
                            token_more_than_entity += 1
                for entity in entitys:
                    if entity[2] == 'GENE-Y' or entity[2] == 'GENE-N':
                        if left == entity[0] and right == entity[1]:
                            now_token = [token, 'S-' + eAbbr[entity[2]]]
                            ifBI = 1
                        elif left == entity[0] and right < entity[1]:
                            now_token = [token, 'B-' + eAbbr[entity[2]]]
                            ifBI = 1
                        elif left > entity[0] and right < entity[1]:
                            now_token = [token, 'I-' + eAbbr[entity[2]]]
                            ifBI = 1
                        elif left > entity[0] and right == entity[1]:
                            now_token = [token, 'E-' + eAbbr[entity[2]]]
                            ifBI = 1
                        elif left == entity[0] and right > entity[1]:
                            # the token runs past the entity boundary: tag as single and count it
                            now_token = [token, 'S-' + eAbbr[entity[2]]]
                            ifBI = 1
                            token_more_than_entity += 1
            if ifBI == 0:
                now_token = [token, 'O']
            word_label.append(now_token)
        article.append(word_label)
    print('there are %s tokens longer than their entity span' % token_more_than_entity)
    return article
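# ---------------------------------------------------------------------------
# Sketch of the overwrite behaviour: because neither inner loop breaks, a
# token covered by both a CHEMICAL and a GENE annotation keeps the GENE tag
# from the second pass. The overlapping spans below are hypothetical; the
# sample_token4 stub and eAbbr table from the sketches above are reused.
# ---------------------------------------------------------------------------
def _demo_gene_overwrites_chemical():
    sentence = {
        'text': 'Aspirin binds.',
        'entity': [[0, 6, 'CHEMICAL'], [0, 6, 'GENE-Y']],  # same span, two labels
        'pair': [],
    }
    article = GenarateBIO([sentence], 'BIOES')
    print(article[0])
    # -> [['Aspirin', 'S-GY'], ['binds', 'O'], ['.', 'O']]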
def GetMap(self):
    '''
    For every sentence, generate:
    tokens: [[token11, token12, ...], ...]
    eindex2tindex = [{0: [2, 3, 4], 1: [7], ...}, ...]
    rindex2eindex = [[[0, 1, 'C1'], ...], ...]
    '''
    alltokens = []        # 2-D list: the tokens of every sentence
    eindex2tindex_C = []  # per sentence: entity index -> token indices, chemical entities only
    eindex2tindex_G = []  # per sentence: entity index -> token indices, gene entities only
    rindex2eindex = []    # per sentence: relation index -> entity indices
    for i in range(len(self.SentencesList)):
        # extract the tokens of this sentence
        text = self.SentencesList[i]['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        tokens = []
        for token in text:
            tokens.append(token.lower())
        alltokens.append(tokens)
        # extract the entity-to-token mapping
        entitys = self.SentencesList[i]['entity']
        e2t_c = {}  # sentence-level entity-to-token mapping (chemicals)
        e2t_g = {}  # sentence-level entity-to-token mapping (genes)
        for j in range(len(entitys)):
            # every entity must map to at least one token, otherwise no position index can be generated!
            entity = entitys[j]
            e2t = e2t_c if entity[2] == 'CHEMICAL' else e2t_g  # route to the chemical or gene map
            e2t[j] = []
            left = -1
            right = -1
            for k in range(len(text)):
                token = text[k]
                left = right + 1            # boundaries of the current token (both ends inclusive)
                right = right + len(token)
                if (left == entity[0]
                        or (left > entity[0] and left <= entity[1])
                        or (left < entity[0] and right >= entity[0])):
                    e2t[j].append(k)
            if len(e2t[j]) == 0:
                print(text)
                print('this entity cannot find tokens %s' % entitys[j])
        eindex2tindex_C.append(e2t_c)
        eindex2tindex_G.append(e2t_g)
        # extract the relation-to-entity mapping
        relations = self.SentencesList[i]['pair']
        rindex2eindex.append(relations)
    return alltokens, eindex2tindex_C, eindex2tindex_G, rindex2eindex
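# ---------------------------------------------------------------------------
# Hedged harness for GetMap above: the method only reads self.SentencesList,
# so a throwaway holder class is enough to drive it. The holder and the
# sentence are assumptions for the demo, not repo code.
# ---------------------------------------------------------------------------
class _MapHolder(object):
    GetMap = GetMap  # bind the function above as a method

    def __init__(self, sentences):
        self.SentencesList = sentences

def _demo_GetMap():
    holder = _MapHolder([{
        'text': 'Aspirin inhibits COX-2.',
        'entity': [[0, 6, 'CHEMICAL'], [15, 19, 'GENE-Y']],
        'pair': [[0, 1, 'C1']],
    }])
    alltokens, e2t_c, e2t_g, r2e = holder.GetMap()
    # alltokens -> [['aspirin', 'inhibits', 'cox-2', '.']]
    # e2t_c     -> [{0: [0]}]   entity 0 ('Aspirin') -> token 0
    # e2t_g     -> [{1: [2]}]   entity 1 ('COX-2')   -> token 2
    # r2e       -> [[[0, 1, 'C1']]]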