def test_remove_entity_overlapping_empty(self):
    # Feed two collections built from empty entity lists and compare the
    # first two output collections against freshly built empty ones.
    sentence_tokens = "There is really nothing".split()
    empty_inputs = [EntityCollection([]) for _ in range(2)]

    output = remove_entity_overlapping(empty_inputs, tokens_words=sentence_tokens)

    expected = [EntityCollection([]) for _ in range(2)]
    # Index into ``output`` explicitly so a too-short result still fails loudly.
    for position, wanted in enumerate(expected):
        self.assertCountEqual(wanted.entities, output[position].entities)
def test_remove_entity_overlapping_2(self):
    # Three tagged collections (bacteria, nutrients, diseases) whose entity
    # names overlap inside one tokenized sentence. The assertions expect the
    # output in the order bacteria, diseases, nutrients, with only the
    # diseases at index 1 and index 3 of the input collection retained.
    bacteria = EntityCollection(
        [Entity("M. tuberculosis", "111", BACTERIA_TAG) for _ in range(2)],
        tag=BACTERIA_TAG,
    )
    nutrients = EntityCollection([Entity("propionic", "123", NUTRIENT_TAG)], NUTRIENT_TAG)

    disease_entities = [Entity("tuberculosis", "a", DISEASE_TAG) for _ in range(3)]
    disease_entities.append(Entity("chronic obstructive syndrome", "a1", DISEASE_TAG))
    disease_entities.append(Entity("obstructive syndrome", "b1", DISEASE_TAG))
    diseases = EntityCollection(disease_entities, DISEASE_TAG)

    # Whitespace-separated tokens; punctuation marks are tokens of their own.
    sentence_tokens = (
        "M. tuberculosis is the cause of tuberculosis and chronic obstructive syndrome , "
        "also M. tuberculosis is a propionic acid producer ."
    ).split()

    output = remove_entity_overlapping(
        [bacteria, nutrients, diseases],
        tokens_words=sentence_tokens,
    )

    # Built only after the call: the expectations are read from the input
    # collections as they are at this point.
    kept_bacteria = [bacteria.entities[0], bacteria.entities[1]]
    kept_diseases = [diseases.entities[1], diseases.entities[3]]
    kept_nutrients = [nutrients.entities[0]]
    expected = [
        EntityCollection(kept_bacteria, BACTERIA_TAG),
        EntityCollection(kept_diseases, DISEASE_TAG),
        EntityCollection(kept_nutrients, NUTRIENT_TAG),
    ]

    # Index into ``output`` explicitly so a too-short result still fails loudly.
    for position, wanted in enumerate(expected):
        self.assertCountEqual(wanted.entities, output[position].entities)
def get_sentence(self, sentence_text, article):
    """Build a ``Sentence`` for ``sentence_text``, or return ``None`` if it is filtered out.

    Steps, in order:

    1. Reject the sentence if ``check_if_title`` rejects ``article.title`` or
       if the text is longer than ``SENTENCE_LENGTH_THRESHOLD`` characters.
    2. Look the text up in every catalog of ``self.catalog_list`` and reject
       it if ``check_if_tags`` does not accept the set of tags found.
    3. Resolve overlapping entities with ``remove_entity_overlapping``.
    4. Replace spaces in entity names with underscores, both on the entity
       objects and in the sentence text.
    5. Drop entities whose tag or additional tags are listed in
       ``self.tags_to_exclude``, drop collections left empty, and re-check
       the remaining tags.
    6. Parse and analyze the rewritten sentence.

    Args:
        sentence_text: Text of the sentence.
        article: Object exposing a ``title`` attribute; stored on the result.

    Returns:
        A ``Sentence`` carrying the rewritten text, the article, the
        surviving entity collections, the parser output and the analyzed
        paths, or ``None`` if any filter above rejects the sentence.

    Note:
        Entities returned by the catalogs are modified in place (their
        ``name`` is rewritten and excluded entities are removed from their
        collection).
    """
    if not self.check_if_title(article.title):
        return None
    if len(sentence_text) > SENTENCE_LENGTH_THRESHOLD:
        return None

    entities_collections = [catalog.find(sentence_text) for catalog in self.catalog_list]
    tags_in_sentence = {
        collection.tag for collection in entities_collections if len(collection.entities) > 0
    }
    if not self.check_if_tags(tags_in_sentence):
        return None

    # Verbatim token texts of the untouched sentence, used for overlap resolution.
    tokens_words = [token.orth_ for token in self.nlp(sentence_text)]

    # TODO: if a bacteria is found in both gut_catalog and all_bacteria_catalog,
    # which one should be kept?
    # The log messages are formatted eagerly on purpose: the entities are
    # renamed and pruned in place further down, so a record formatted later
    # (e.g. by a buffering handler) could show that later state instead.
    logger.info("entities before remove overlapping: %s" % str(entities_collections))
    entities_collections = remove_entity_overlapping(entities_collections, tokens_words)
    logger.info("entities after remove overlapping: %s" % str(entities_collections))

    # Join multi-word entity names with underscores (_), in the sentence text
    # as well as on the entity itself.
    for collection in entities_collections:
        for entity in collection.entities:
            underscored_name = entity.name.replace(' ', '_')
            sentence_text = sentence_text.replace(entity.name, underscored_name)
            entity.name = underscored_name

    def is_excluded(entity):
        # Same test and evaluation order as before: additional tags first, then the main tag.
        return (
            any(tag in self.tags_to_exclude for tag in entity.additional_tags)
            or entity.tag in self.tags_to_exclude
        )

    # Remove excluded entities by predicate, in place. ``list.remove`` deletes
    # the first element that compares *equal*, which is not necessarily the
    # object that failed the check, and costs a full scan per removal.
    for collection in entities_collections:
        collection.entities[:] = [
            entity for entity in collection.entities if not is_excluded(entity)
        ]

    entities_collections = [
        collection for collection in entities_collections if len(collection.entities) > 0
    ]
    # Every remaining collection is non-empty, so no further filtering is needed.
    tags_in_sentence = {collection.tag for collection in entities_collections}
    logger.info("entities after excluding: %s" % str(entities_collections))
    if not self.check_if_tags(tags_in_sentence):
        return None

    # Tokenize again: the sentence text now contains the underscored names.
    tokens = self.nlp(sentence_text)

    # Flat entity list for the parser.
    all_entities_list = []
    for collection in entities_collections:
        all_entities_list.extend(collection.entities)

    parser_output = self.sentence_parser.parse_sentence(sentence_text, all_entities_list, tokens)
    paths = self.sentence_analyzer.analyze_sentence(parser_output, tags_in_sentence)
    return Sentence(
        text=sentence_text,
        article=article,
        entities_collections=entities_collections,
        parser_output=parser_output,
        shortest_paths=paths,
    )