def translateNCRFPPintoEntities(doc_token, predict_results, doc_name):
    """Convert per-sentence label predictions into a list of ``Entity`` objects.

    Args:
        doc_token: Token table for the document. It is filtered with a boolean
            mask on its ``'sent_idx'`` column and read through ``.shape`` and
            ``.iloc`` (presumably a pandas DataFrame -- confirm with caller).
            Each row must provide ``'start'``, ``'end'`` and ``'text'``.
        predict_results: Sequence with one item per sentence; element ``[0]``
            of each item is the sequence of predicted labels for that sentence.
        doc_name: Document name, used only in the assertion message.

    Returns:
        The entities in the order they were opened. Ids are the strings
        "1", "2", ... assigned in that same order.

    Raises:
        AssertionError: If a sentence's token count differs from its number
            of predicted labels.
    """
    results = []
    next_id = 1

    for idx, sentence_pred in enumerate(predict_results):
        tags = sentence_pred[0]
        sent_length = len(tags)
        rows = doc_token[(doc_token['sent_idx'] == idx)]
        assert rows.shape[0] == sent_length, "file {}, sent {}".format(
            doc_name, idx)

        # Labels seen so far in this sentence; handed to checkWrongState to
        # decide whether a continuation label may extend the last entity.
        seen_labels = []
        for idy, label in enumerate(tags):
            token = rows.iloc[idy]
            seen_labels.append(label)
            head = label[0]

            if head in ('S', 'B'):
                # A single-token or begin label opens a new entity; the
                # entity type is whatever follows the two-character prefix.
                opened = Entity()
                opened.create(str(next_id), label[2:], token['start'],
                              token['end'], token['text'], idx, idy, idy)
                results.append(opened)
                next_id += 1
            elif head in ('M', 'E') and checkWrongState(seen_labels):
                # A middle/end label grows the most recently opened entity.
                results[-1].append(token['start'], token['end'],
                                   token['text'], idy)

    return results
def read_one_file(fileName, annotation_dir, entities_overlapped_types):
    """Count overlapping entity-type pairs in one annotation file.

    Loads the file with ``get_bioc_file``, reads the annotations of the first
    passage of the first document, and compares every annotation against all
    annotations that came before it. Each overlapping pair is logged at DEBUG
    level and tallied in ``entities_overlapped_types``.

    Args:
        fileName: Name of the annotation file inside ``annotation_dir``.
        annotation_dir: Directory that contains the annotation file.
        entities_overlapped_types: Mapping from ``"<typeA>_<typeB>"`` to a
            count; updated in place. The greater of the two type names (by
            ``>`` comparison) always comes first, so a pair maps to the same
            key regardless of the order in which it was encountered.

    NOTE(review): no ``return`` statement is visible in this body, so the
    collected entity list is used only for the overlap checks -- confirm that
    callers do not expect a return value.
    """
    annotation_file = get_bioc_file(join(annotation_dir, fileName))
    bioc_passage = annotation_file[0].passages[0]

    entities = []
    for entity in bioc_passage.annotations:
        entity_ = Entity()
        entity_.create(entity.id, entity.infons['type'],
                       entity.locations[0].offset, entity.locations[0].end,
                       entity.text, None, None, None)

        for old_entity in entities:
            if not is_overlapped(entity_, old_entity):
                continue

            logging.debug(
                "entity overlapped: doc:{}, entity1_id:{}, entity1_type:{}, entity1_span:{} {}, entity2_id:{}, entity2_type:{}, entity2_span:{} {}"
                .format(fileName, old_entity.id, old_entity.type,
                        old_entity.start, old_entity.end, entity_.id,
                        entity_.type, entity_.start, entity_.end))

            # Direct comparison instead of cmp(): cmp() was removed in
            # Python 3, and `a > b` is equivalent to `cmp(a, b) > 0`.
            if entity_.type > old_entity.type:
                overlapped_types = entity_.type + "_" + old_entity.type
            else:
                overlapped_types = old_entity.type + "_" + entity_.type

            entities_overlapped_types[overlapped_types] = (
                entities_overlapped_types.get(overlapped_types, 0) + 1)

        entities.append(entity_)