def test_something(self):
    """Check which keys of ``parser_output.tags`` carry each entity tag.

    A sentence mentioning the same bacterium twice, two diseases and one
    nutrient is parsed together with a hand-picked, non-overlapping subset
    of entities. The keys of ``parser_output.tags`` (presumably token
    positions -- confirm against the parser) are then grouped by tag and
    compared, order-insensitively, with the expected values.
    """
    sentence_parser = SpacySentenceParser()
    text = ('M._tuberculosis is the cause of tuberculosis and chronic_obstructive_syndrome, '
            'also M._tuberculosis is a propionic acid producer.')

    # Two separate Entity instances for the two mentions of the bacterium.
    bacteria = EntityCollection(
        [Entity('M._tuberculosis', '111', BACTERIA_TAG) for _ in range(2)],
        tag=BACTERIA_TAG,
    )
    nutrients = EntityCollection([Entity('propionic', '123', NUTRIENT_TAG)])

    # The disease fixture deliberately contains repeated entries and an
    # entity whose name is a substring of another one.
    disease_specs = [
        ('tuberculosis', 'a'),
        ('tuberculosis', 'a'),
        ('tuberculosis', 'a'),
        ('chronic_obstructive_syndrome', 'a1'),
        ('obstructive_syndrome', 'b1'),
    ]
    diseases = EntityCollection(
        [Entity(name, code, DISEASE_TAG) for name, code in disease_specs]
    )

    # Only the selected, non-overlapping entities are handed to the parser.
    bacteria_part = EntityCollection([bacteria.entities[0], bacteria.entities[1]])
    disease_part = EntityCollection([diseases.entities[1], diseases.entities[3]])
    nutrient_part = EntityCollection([nutrients.entities[0]])
    not_overlapped_collections = [bacteria_part, disease_part, nutrient_part]

    first, second, third = not_overlapped_collections
    parser_output = sentence_parser.parse_sentence(
        text,
        first.entities + second.entities + third.entities,
    )

    def keys_tagged(wanted_tag):
        """Return the keys of ``parser_output.tags`` whose value equals ``wanted_tag``."""
        return [key for key, tag in parser_output.tags.items() if tag == wanted_tag]

    # Collect all actual results before asserting anything.
    actual_bact_tags = keys_tagged(BACTERIA_TAG)
    actual_nut_tags = keys_tagged(NUTRIENT_TAG)
    actual_dis_tags = keys_tagged(DISEASE_TAG)

    self.assertCountEqual([0, 10], actual_bact_tags)
    self.assertCountEqual([13], actual_nut_tags)
    self.assertCountEqual([5, 7], actual_dis_tags)
from nltk import StanfordTokenizer

from ohmygut.core.sentence_processing import SpacySentenceParser
from ohmygut.paths import stanford_jar_path

"""
Helper script: parse a single example sentence and plot its sentence graph.

TODO: turn this into a command-line utility?
"""

# Example input; swap in the alternative below to plot a different sentence.
sentence = "B.-vulgatus protects against Escherichia coli-induced colitis in gnotobiotic interleukin-2 deficient mice."
# sentence = "Intriguingly, others have previously reported that pediatric IBD patients " \
#            "have increased serum antibody titers against a TonB-dependent receptor from " \
#            "human commensal Bacteroides caccae named OmpW."

# NOTE(review): the tokenizer is constructed here but is not referenced by
# any of the statements visible in this script.
stanford_tokenizer = StanfordTokenizer(path_to_jar=stanford_jar_path)

# Parse the sentence without any entities (an empty string is passed as
# ``entities``) and hand the file name "path.png" to ``draw``.
parser = SpacySentenceParser()
parser_output = parser.parse_sentence(sentence, entities='')
parser_output.draw("path.png")