示例#1
0
def translateNCRFPPintoEntities(doc_token, predict_results, doc_name):
    """Turn per-sentence label sequences into a flat list of ``Entity`` objects.

    Args:
        doc_token: table of tokens that supports boolean-mask selection on a
            ``'sent_idx'`` column, ``.shape`` and ``.iloc``; each row is read
            through its ``'start'``, ``'end'`` and ``'text'`` fields.
        predict_results: one item per sentence; element ``[0]`` of each item
            is the label sequence that gets decoded.
        doc_name: only used in the assertion message.

    Returns:
        The entities in creation order. Ids are the strings ``"1"``, ``"2"``,
        ... and keep counting across sentences.

    Raises:
        AssertionError: when the number of token rows selected for a sentence
            differs from the number of labels for that sentence.
    """
    entities = []
    next_id = 1

    for sent_no, candidates in enumerate(predict_results):
        tags = candidates[0]
        rows = doc_token[doc_token['sent_idx'] == sent_no]

        assert rows.shape[0] == len(tags), \
            "file {}, sent {}".format(doc_name, sent_no)

        # Labels consumed so far in the current sentence; handed to
        # checkWrongState whenever a continuation label is met.
        seen_tags = []

        for tok_no, tag in enumerate(tags):
            row = rows.iloc[tok_no]
            seen_tags.append(tag)
            prefix = tag[0]

            if prefix in ('S', 'B'):
                # 'S' and 'B' both open a new entity; its type is whatever
                # follows the two-character prefix of the label.
                fresh = Entity()
                fresh.create(str(next_id), tag[2:], row['start'], row['end'],
                             row['text'], sent_no, tok_no, tok_no)
                entities.append(fresh)
                next_id += 1
            elif prefix in ('M', 'E') and checkWrongState(seen_tags):
                # Despite the helper's name, a truthy result is what allows
                # the most recently created entity to be extended.
                entities[-1].append(row['start'], row['end'], row['text'],
                                    tok_no)

    return entities
示例#2
0
def read_one_file(fileName, annotation_dir, entities_overlapped_types):
    """Collect the annotations of one file as ``Entity`` objects and tally overlaps.

    The file at ``join(annotation_dir, fileName)`` is loaded with
    ``get_bioc_file``; only ``[0].passages[0]`` of the loaded object is read.
    Each annotation becomes an ``Entity`` that is compared against every
    entity collected before it, and each overlapping pair is logged at debug
    level and counted per pair of types.

    Args:
        fileName: name of the annotation file inside ``annotation_dir``; also
            used in the debug log message.
        annotation_dir: directory that contains the annotation file.
        entities_overlapped_types: dict mapping ``"<typeA>_<typeB>"`` to the
            number of overlapping pairs seen so far. It is updated in place.

    NOTE(review): no ``return`` is visible in this excerpt; the function may
    continue beyond the lines shown here - confirm against the full file.
    """
    annotation_file = get_bioc_file(join(annotation_dir, fileName))

    # Only the first passage of the first element is consulted.
    bioc_passage = annotation_file[0].passages[0]

    entities = []

    for entity in bioc_passage.annotations:
        # Only the first location of an annotation is used for its span; the
        # three trailing create() arguments are passed as None.
        entity_ = Entity()
        entity_.create(entity.id, entity.infons['type'],
                       entity.locations[0].offset, entity.locations[0].end,
                       entity.text, None, None, None)

        # Compare against every entity already collected from this file.
        for old_entity in entities:
            if is_overlapped(entity_, old_entity):

                logging.debug(
                    "entity overlapped: doc:{}, entity1_id:{}, entity1_type:{}, entity1_span:{} {}, entity2_id:{}, entity2_type:{}, entity2_span:{} {}"
                    .format(fileName, old_entity.id, old_entity.type,
                            old_entity.start, old_entity.end, entity_.id,
                            entity_.type, entity_.start, entity_.end))

                # Build an order-independent key: the type that compares
                # greater is placed first, so (A, B) and (B, A) share a key.
                # NOTE(review): `cmp` is a Python 2 builtin; under Python 3 it
                # must be defined elsewhere in the project or this line raises
                # NameError - confirm.
                overlapped_types = entity_.type + "_" + old_entity.type if cmp(
                    entity_.type, old_entity.type
                ) > 0 else old_entity.type + "_" + entity_.type
                # Increment the tally for this type pair, starting at 1.
                if overlapped_types in entities_overlapped_types:
                    count = entities_overlapped_types[overlapped_types]
                    count += 1
                    entities_overlapped_types[overlapped_types] = count
                else:
                    entities_overlapped_types[overlapped_types] = 1

        entities.append(entity_)