Пример #1
0
def GetGoldAnwer(SentencesList):  #保存标注的ytest的实体位置和关系位置/类别
    gold_entity = []  #所有实体的位置
    gold_relation = []  #所有关系的实体位置及类别
    for sentence in SentencesList:  #每句话
        text = sentence['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        if len(text) == 0 or len(text) == 1:  #如果这句话是空白或只有句号则跳过
            continue
        entity_s = []  #一句话中的实体
        relation_s = []  #一句话中的关系
        for entity in sentence['entity']:  #将这句话中实体们的位置保存
            entity_s.append(entity)
        for pair in sentence['pair']:
            e1_index = pair[0]  #实体1在句子中的索引
            e2_index = pair[1]  #实体2在句子中的索引
            label = pair[2]  #关系
            e1_dir = sentence['entity'][e1_index][:2]  #实体1的左右边界(忽略空格)
            e2_dir = sentence['entity'][e2_index][:2]  #实体2的左右边界(忽略空格)
            labelAbbr = RelationAbbr[label]  #该关系的缩写
            relation_s.append(
                [e1_dir[0], e1_dir[1], e2_dir[0], e2_dir[1], labelAbbr])
        gold_entity.append(entity_s)
        gold_relation.append(relation_s)
    print len(gold_entity)
    return gold_entity, gold_relation
Пример #2
0
def GenarateBIO(senslist, schema, num_toolong):  # convert sentences to a BIO/BIOES-tagged 2-D list
    """Tag every token of every sentence with a BIO or BIOES label.

    Args:
        senslist: list of sentence dicts with 'text' and 'entity', where each
            entity annotation looks like [left, right, label] (char offsets,
            inclusive, spaces ignored).
        schema: 'BIO' or 'BIOES'.
        num_toolong: running counter of tokens that start at an entity's left
            edge but extend past its right edge.

    Returns:
        (article, num_toolong): article is one [token, tag] list per sentence;
        num_toolong is the updated counter.
    """
    article = []  # article-level list, one entry per sentence
    for i in range(len(senslist)):
        text = senslist[i]['text']
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        entitys = senslist[i]['entity']
        word_label = []  # sentence-level list of [token, tag]
        left = -1
        right = -1
        for token in text:
            left = right + 1  # inclusive character span of this token
            right = right + len(token)
            ifBI = 0
            if schema == 'BIO':
                for entity in entitys:  # entity = [left, right, label]
                    if left == entity[0]:
                        word_label.append([token, 'B'])
                        # BUGFIX: this count was previously unreachable -- the
                        # bare `left == entity[0]` branch always matched and
                        # broke before the old
                        # `elif left == entity[0] and right > entity[1]` arm
                        # could run, so too-long tokens were never tallied in
                        # BIO mode.  The emitted tag is unchanged.
                        if right > entity[1]:
                            num_toolong += 1
                        ifBI = 1
                        break
                    elif left > entity[0] and right <= entity[1]:
                        word_label.append([token, 'I'])
                        ifBI = 1
                        break
                if ifBI == 0:
                    word_label.append([token, 'O'])
            if schema == 'BIOES':
                for entity in entitys:  # entity = [left, right, label]
                    if left == entity[0] and right == entity[1]:
                        word_label.append([token, 'S'])
                        ifBI = 1
                        break
                    elif left == entity[0] and right < entity[1]:
                        word_label.append([token, 'B'])
                        ifBI = 1
                        break
                    elif left > entity[0] and right < entity[1]:
                        word_label.append([token, 'I'])
                        ifBI = 1
                        break
                    elif left > entity[0] and right == entity[1]:
                        word_label.append([token, 'E'])
                        ifBI = 1
                        break
                    elif left == entity[0] and right > entity[1]:
                        # token spills past the entity: count it, keep scanning
                        num_toolong += 1
                if ifBI == 0:
                    word_label.append([token, 'O'])
        article.append(word_label)

    return article, num_toolong
Пример #3
0
def GetGoldAnwer(SentencesList):  # collect gold entity spans only
    """Return, per kept sentence, the [left, right] span of every entity.

    Sentences tokenizing to zero or one token (blank, or a lone period)
    are skipped.
    """
    gold_entity = []  # entity spans of every kept sentence
    for sentence in SentencesList:
        tokens = nltk.tokenize.word_tokenize(sample_token4(sentence['text']))
        if len(tokens) <= 1:  # blank or single-token sentence: skip
            continue
        gold_entity.append([[ent[0], ent[1]] for ent in sentence['entity']])
    return gold_entity
Пример #4
0
def GetGoldAnwer(SentencesList):  # collect gold CHEMICAL entity spans
    """Return, per sentence, [left, right, abbreviation] of CHEMICAL entities.

    Unlike the sibling variants, no sentence is skipped here: every
    sentence contributes one (possibly empty) list.
    """
    gold_entity = []  # per-sentence CHEMICAL entity spans + type abbreviation
    for sentence in SentencesList:
        # tokenization kept for parity with the sibling variants
        # (its result is unused in this variant)
        tokenized = nltk.tokenize.word_tokenize(sample_token4(sentence['text']))
        chem_spans = [[ent[0], ent[1], eAbbr[ent[2]]]
                      for ent in sentence['entity']
                      if ent[2] == 'CHEMICAL']
        gold_entity.append(chem_spans)
    return gold_entity
Пример #5
0
 def GetMap(self):
     """Build token lists and entity/relation index maps for every sentence.

     Returns:
         alltokens: 2-D list -- lower-cased tokens of each kept sentence.
         eindex2tindex: list of dicts -- entity index -> indices of the
             tokens overlapping that entity's character span, one dict per
             kept sentence.
         rindex2eindex: list -- the 'pair' annotation of each kept sentence
             (relation -> entity-index mapping, taken verbatim).

     NOTE(review): sentences with <= 1 token are skipped entirely, so the
     indices of these three lists align with each other but NOT with
     self.SentencesList.
     """
     alltokens = []  # tokens of all sentences, 2-D list
     eindex2tindex = []  # per sentence: entity index -> token index mapping
     rindex2eindex = []  # per sentence: relation index -> entity index mapping
     for i in range(len(self.SentencesList)):
         # tokenize the sentence text
         text = self.SentencesList[i]['text']  # raw text
         text = sample_token4(text)
         text = nltk.tokenize.word_tokenize(text)
         if len(text) == 0 or len(text) == 1:  # skip blank / single-token (e.g. lone period) sentences
             continue
         tokens = []
         for token in text:
             tokens.append(token.lower())
         alltokens.append(tokens)
         # build the entity -> token index mapping
         entitys = self.SentencesList[i]['entity']
         e2t = {}  # sentence-level entity-to-token map
         for j in range(len(entitys)):  # every entity must map to >= 1 token, otherwise no position index can be generated
             e2t[j] = []
             entity = entitys[j]
             left = -1
             right = -1
             for k in range(len(text)):
                 token = text[k]
                 left = right + 1  # inclusive char boundaries of token k (spaces ignored)
                 right = right + len(token)
                 # overlap test, three disjuncts: token starts exactly at the
                 # entity's left edge; token starts after it and ends within
                 # it; token starts before it and reaches past its start
                 if left == entity[
                         0] or left > entity[0] and right <= entity[
                             1] or left < entity[0] and right > entity[0]:
                     e2t[j].append(k)
             if len(e2t[j]) == 0:
                 print text
                 print entitys[j]
         eindex2tindex.append(e2t)
         # relation -> entity mapping is taken verbatim from the annotation
         relations = self.SentencesList[i]['pair']
         rindex2eindex.append(relations)
     return alltokens, eindex2tindex, rindex2eindex
Пример #6
0
def GenarateBIO(senslist, schema):  #生成BIO标注的二维list
    article = []  #文章级别的列表
    token_more_than_entity = 0
    for i in range(len(senslist)):
        text = senslist[i]['text']  #取出文本
        text = sample_token4(text)
        text = nltk.tokenize.word_tokenize(text)
        entitys = senslist[i]['entity']
        word_label = []  #句子级别的列表
        left = -1
        right = -1
        for token in text:
            left = right + 1  #当前token的左右边界(含头含尾)
            right = right + len(token)
            ifBI = 0
            if schema == 'BIOES':
                for entity in entitys:
                    #先使用类别为chemical的标记token,再使用gene覆盖掉
                    if entity[2] == 'CHEMICAL':
                        if left == entity[0] and right == entity[1]:
                            now_token = [token, 'S-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #break
                        elif left == entity[0] and right < entity[1]:
                            now_token = [token, 'B-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #break
                        elif left > entity[0] and right < entity[1]:
                            now_token = [token, 'I-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #break
                        elif left > entity[0] and right == entity[1]:
                            now_token = [token, 'E-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #break
                        elif left == entity[0] and right > entity[1]:
                            now_token = [token, 'S-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #                        print token
                            #                        print entity[0],entity[1]
                            #                        print left,right
                            #                        print entitys
                            #                        print senslist[i]['text']
                            #                        print senindex2file[i] +'\n'
                            token_more_than_entity += 1
                            #print 'token length more than entity length'
                            #break
                for entity in entitys:
                    if entity[2] == 'GENE-Y' or entity[2] == 'GENE-N':
                        if left == entity[0] and right == entity[1]:
                            now_token = [token, 'S-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #break
                        elif left == entity[0] and right < entity[1]:
                            now_token = [token, 'B-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #break
                        elif left > entity[0] and right < entity[1]:
                            now_token = [token, 'I-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #break
                        elif left > entity[0] and right == entity[1]:
                            now_token = [token, 'E-' + eAbbr[entity[2]]]
                            ifBI = 1
                            #break
                        elif left == entity[0] and right > entity[1]:
                            now_token = [token, 'S-' + eAbbr[entity[2]]]
                            ifBI = 1
                            token_more_than_entity += 1
                if ifBI == 0:
                    now_token = [token, 'O']
                word_label.append(now_token)
        article.append(word_label)
    print 'there is %s token length > entity' % token_more_than_entity
    return article
Пример #7
0
    def GetMap(self):
        '''
        Build, per sentence:
            tokens: [[token11, token12, ...], ...]  (lower-cased)
            eindex2tindex_C = [{0: [2, 3, 4], 1: [7], ...}, ...]  chemical entities only
            eindex2tindex_G = [{...}, ...]                        gene entities only
            rindex2eindex = [[[0, 1, 'C1'], ...], ...]            'pair' annotation, verbatim
        '''
        alltokens = []  # tokens of all sentences, 2-D list
        eindex2tindex_C = []  # per sentence: entity index -> token indices, chemicals only
        eindex2tindex_G = []  # per sentence: entity index -> token indices, genes only
        rindex2eindex = []  # per sentence: relation index -> entity index mapping
        for i in range(len(self.SentencesList)):
            # tokenize the sentence text
            text = self.SentencesList[i]['text']
            text = sample_token4(text)
            text = nltk.tokenize.word_tokenize(text)
            alltokens.append([token.lower() for token in text])

            # map each entity to the tokens overlapping its character span
            entitys = self.SentencesList[i]['entity']
            e2t_c = {}  # chemical entity index -> token indices
            e2t_g = {}  # gene entity index -> token indices
            for j in range(len(entitys)):  # every entity must map to >= 1 token, otherwise no position index can be generated
                entity = entitys[j]
                # route into the chemical or gene map; this deduplicates the
                # original copy-pasted CHEMICAL / gene branches, which were
                # identical except for the destination dict
                target = e2t_c if entity[2] == 'CHEMICAL' else e2t_g
                target[j] = []
                right = -1
                for k in range(len(text)):
                    token = text[k]
                    left = right + 1  # inclusive char boundaries of token k (spaces ignored)
                    right = right + len(token)
                    # overlap test, three disjuncts: token starts exactly at
                    # the entity's left edge; token starts inside the entity;
                    # token starts before it and reaches its start
                    if (left == entity[0]
                            or left > entity[0] and left <= entity[1]
                            or left < entity[0] and right >= entity[0]):
                        target[j].append(k)
                if len(target[j]) == 0:
                    print(text)
                    print(u'this entity cannot find tokens %s' % entitys[j])
            eindex2tindex_C.append(e2t_c)
            eindex2tindex_G.append(e2t_g)
            # relation -> entity mapping is taken verbatim from the annotation
            rindex2eindex.append(self.SentencesList[i]['pair'])
        return alltokens, eindex2tindex_C, eindex2tindex_G, rindex2eindex