def test_dir_reader2(self):
    """Read the eHOST test corpus recursively with overlap support enabled.

    Expects two documents; each must expose exactly seven concept types in
    ``doc._.concepts``, and the first span of every type must have the
    expected text.
    """
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = EhostDirReader(
        nlp=English(),
        support_overlap=True,
        recursive=True,
        schema_file='data/ehost_test_corpus/config/projectschema.xml',
    )
    docs = reader.read(txt_dir='data/ehost_test_corpus/')
    assert len(docs) == 2

    # Concept type -> expected text of its first span (checked in this order).
    first_span_text = {
        'Doc_Level_Purulence_Assessment': 'CHIEF',
        'Purulent': 'Abdominal pain',
        'Non-Purulent': 'PRESENT',
        'Incision_and_Drainage': 'patient',
        'PreAnnotated': '71-year-old',
        'Nonspecific_SSTI': 'X. The patient',
        'Exclusions': 'presented',
    }
    for doc in docs:
        assert len(doc._.concepts) == 7
        for concept_type, expected_text in first_span_text.items():
            assert concept_type in doc._.concepts
            assert str(doc._.concepts[concept_type][0]) == expected_text
def test_read(self):
    """Read a single eHOST document and hand it to ``self.eval`` for checking."""
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml',
    )
    parsed_doc = reader.read('data/ehost_test_corpus/corpus/doc1.txt')
    self.eval(parsed_doc)

def test_read_doc_name(self):
    """Check ``doc._.doc_name`` for three ``doc_name_depth`` settings.

    The expected names are the bare file name for a reader built without
    ``doc_name_depth``, then one and two parent directories prepended for
    depths 1 and 2 respectively.
    """
    schema_path = 'data/ehost_test_corpus/config/projectschema.xml'
    txt_path = 'data/ehost_test_corpus/corpus/doc1.txt'

    # Reader constructed without an explicit doc_name_depth.
    reader = EhostDocReader(nlp=English(), schema_file=schema_path)
    first = reader.read(txt_path)
    assert first._.doc_name == 'doc1.txt'

    # Same reader, depth changed after construction.
    reader.doc_name_depth = 1
    second = reader.read(txt_path)
    assert second._.doc_name == r'corpus/doc1.txt'

    # Fresh reader with the depth supplied to the constructor.
    deep_reader = EhostDocReader(
        nlp=English(),
        schema_file=schema_path,
        doc_name_depth=2,
    )
    third = deep_reader.read(txt_path)
    assert third._.doc_name == r'ehost_test_corpus/corpus/doc1.txt'
def test_parse_to_dicts(self):
    """Parse an eHOST knowtator XML file and check the sizes of the results."""
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = EhostDocReader(nlp=English())
    parsed = reader.parse_to_dicts(
        'data/ehost_test_corpus/saved/doc1.txt.knowtator.xml')
    # parse_to_dicts is expected to yield exactly four collections;
    # the relations are unpacked but not inspected here.
    spans, classes, attributes, _relations = parsed

    assert len(spans) == 7
    assert len(classes) == 7
    assert len(attributes) == 6
def test_set_attributes(self):
    """Check that spans carry a ``status`` attribute equal to ``'present'``.

    An ``EhostDocReader`` is built from the project schema first and then
    discarded; the span checked afterwards comes from a separate pipeline.
    """
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    # NOTE(review): the instance is not kept -- presumably the constructor
    # registers the span extensions declared in the schema; confirm.
    EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml',
    )

    pipeline = English()
    doc = pipeline('test status attribute')
    assert hasattr(doc[1:2]._, 'status')
    assert doc[1:2]._.status == 'present'
def test_set_attributes(self):
    """Check that spans carry ``Negation`` and ``Confidence`` attributes.

    A ``BratDocReader`` is built from the brat annotation config first and
    then discarded; the span checked afterwards comes from a separate
    pipeline.
    """
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    # NOTE(review): the instance is not kept -- presumably the constructor
    # registers the span extensions declared in the config; confirm.
    BratDocReader(
        nlp=English(),
        schema_file='data/brat_test_corpus/annotation.conf',
    )

    pipeline = English()
    doc = pipeline('test status attribute')
    middle_token = doc[1:2]
    for attr_name in ('Negation', 'Confidence'):
        assert hasattr(middle_token._, attr_name)
def test_parse_to_dicts(self):
    """Parse the text of a brat ``.ann`` file and check the result sizes."""
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = BratDocReader(nlp=English())
    # The reader is given the file's contents, not its path.
    ann_text = Path('data/brat_test_corpus/000-introduction.ann').read_text()
    spans, classes, attributes, relations = reader.parse_to_dicts(ann_text)

    assert len(spans) == 12
    assert len(classes) == 17
    assert len(attributes) == 6
    assert len(relations) == 5
def test_dir_reader(self):
    """Read the eHOST test corpus recursively and evaluate both documents."""
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = EhostDirReader(
        nlp=English(),
        recursive=True,
        schema_file='data/ehost_test_corpus/config/projectschema.xml',
    )
    loaded = reader.read(txt_dir='data/ehost_test_corpus/')
    assert len(loaded) == 2

    # Every document goes through the shared per-document checks.
    for parsed_doc in loaded:
        self.eval(parsed_doc)
def test_check_spans(self):
    """Check that each entity's stored annotation text lies within its span.

    Reads ``doc2.txt`` with overlap support disabled and annotation strings
    stored, then compares ``span._.span_txt`` against the span's own text
    with newlines replaced by spaces on both sides.
    """
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = EhostDocReader(
        nlp=English(),
        schema_file='data/ehost_test_corpus/config/projectschema.xml',
        support_overlap=False,
        store_anno_string=True,
        encoding='UTF8',
        log_level=logging.DEBUG,
    )
    doc = reader.read('data/ehost_test_corpus/corpus/doc2.txt')

    for ent in doc.ents:
        # Printed for every entity, before the check, to aid debugging.
        print(ent._.span_txt, '<>', ent)
        annotated = ent._.span_txt.replace('\n', ' ')
        covered = str(ent).replace('\n', ' ')
        assert annotated in covered
def remove_doc_extensions():
    """
    Unregister from the global :class:`spacy.tokens.Doc` every custom doc
    extension (property or method) listed by ``get_doc_extensions()`` for
    :mod:`textacy.extract`.
    """
    registered = get_doc_extensions()
    for ext_name in registered.keys():
        # The value returned by remove_extension is intentionally discarded.
        Doc.remove_extension(ext_name)
def test_check_spans(self):
    """Check that each entity's stored annotation text lies within its span.

    Reads ``000-introduction.txt`` with overlap support enabled. Newlines are
    replaced by spaces on both sides before comparing. An entity whose stored
    text is exactly ``'complicated panic'`` is exempt from the containment
    check.
    """
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = BratDocReader(
        nlp=English(),
        schema_file='data/brat_test_corpus/annotation.conf',
        support_overlap=True,
        store_anno_string=True,
        encoding='UTF8',
        log_level=logging.DEBUG,
    )
    doc = reader.read('data/brat_test_corpus/000-introduction.txt')

    for ent in doc.ents:
        annotated = ent._.span_txt.replace('\n', ' ')
        covered = str(ent).replace('\n', ' ')
        contained = annotated in covered
        if not contained:
            # Only mismatches are printed, to aid debugging.
            print(ent._.span_txt, '<>', ent)
        assert ent._.span_txt == 'complicated panic' or contained
def test_read(self):
    """Read a brat document and check the label of each of its 12 entities."""
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = BratDocReader(
        nlp=English(),
        schema_file='data/brat_test_corpus/annotation.conf',
    )
    doc = reader.read('data/brat_test_corpus/000-introduction.txt')
    assert len(doc.ents) == 12

    # Expected entity labels, in the order the entities appear in doc.ents.
    expected_labels = (
        'Gene_expression',
        'Protein',
        'Negative_regulation',
        'Positive_regulation',
        'Protein',
        'Gene_expression',
        'Protein',
        'Complex',
        'Protein',
        'Positive_regulation',
        'Simple_chemical',
        'Protein',
    )
    for position, label in enumerate(expected_labels):
        assert str(doc.ents[position].label_) == label
def test_check_spans2(self):
    """Check stored annotation text against span text with overlap disabled.

    Reads ``040-text_span_annotation.txt`` and requires every entity's
    ``span_txt`` to be contained in the span's own text, with newlines
    replaced by spaces on both sides.
    """
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = BratDocReader(
        nlp=English(),
        schema_file='data/brat_test_corpus/annotation.conf',
        support_overlap=False,
        store_anno_string=True,
        encoding='UTF8',
        log_level=logging.DEBUG,
    )
    doc = reader.read('data/brat_test_corpus/040-text_span_annotation.txt')

    for ent in doc.ents:
        annotated = ent._.span_txt.replace('\n', ' ')
        covered = str(ent).replace('\n', ' ')
        assert annotated in covered

def test_dir_reader(self):
    """Read the brat test corpus recursively and check the first document.

    Expects two documents; the first must expose exactly six concept types
    in ``doc._.concepts``, each with the expected number of spans.
    """
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = BratDirReader(
        nlp=English(),
        support_overlap=True,
        recursive=True,
        schema_file='data/brat_test_corpus/annotation.conf',
    )
    loaded = reader.read(txt_dir='data/brat_test_corpus/')
    assert len(loaded) == 2

    first_doc = loaded[0]
    # Concept type -> expected number of spans of that type.
    span_counts = {
        'Gene_expression': 2,
        'Protein': 5,
        'Negative_regulation': 1,
        'Positive_regulation': 2,
        'Complex': 1,
        'Simple_chemical': 1,
    }
    assert len(first_doc._.concepts) == 6
    # All membership checks run before any count check.
    for concept_type in span_counts:
        assert concept_type in first_doc._.concepts
    for concept_type, count in span_counts.items():
        assert len(first_doc._.concepts[concept_type]) == count
def test_dir_reader(self):
    """Read the brat test corpus recursively and check the first document.

    Expects two documents; the first must expose exactly six concept types
    in ``doc._.concepts``, each with the expected number of spans.
    """
    # Remove the "concepts" Doc extension if it is currently registered.
    if Doc.has_extension("concepts"):
        Doc.remove_extension("concepts")

    reader = BratDirReader(
        nlp=English(),
        support_overlap=True,
        recursive=True,
        schema_file='data/brat_test_corpus/annotation.conf',
    )
    loaded = reader.read(txt_dir='data/brat_test_corpus/')
    assert len(loaded) == 2

    first_doc = loaded[0]
    # Concept type -> expected number of spans of that type.
    span_counts = {
        'Gene_expression': 2,
        'Protein': 5,
        'Negative_regulation': 1,
        'Positive_regulation': 2,
        'Complex': 1,
        'Simple_chemical': 1,
    }
    assert len(first_doc._.concepts) == 6
    # All membership checks run before any count check.
    for concept_type in span_counts:
        assert concept_type in first_doc._.concepts
    for concept_type, count in span_counts.items():
        assert len(first_doc._.concepts[concept_type]) == count