def test_callable_args(self, spacy_doc):
    """Passing callables for each extractor kind yields only ``Span`` results."""
    extractors = dict(
        ngs=lambda doc: extract.ngrams(doc, n=2),
        ents=extract.entities,
        ncs=extract.noun_chunks,
    )
    results = list(extract.terms(spacy_doc, **extractors))
    assert results
    for result in results:
        assert isinstance(result, Span)
def test_dedupe(self, dedupe, spacy_doc):
    """With ``dedupe=True`` no two results share a (start, end) span; otherwise duplicates remain."""
    results = list(
        extract.terms(spacy_doc, ngs=2, ents=True, ncs=True, dedupe=dedupe)
    )
    assert results
    unique_spans = {(result.start, result.end) for result in results}
    if dedupe is True:
        assert len(results) == len(unique_spans)
    else:
        # overlapping extractors without dedupe must produce repeated spans
        assert len(results) > len(unique_spans)
def tokenized_docs():
    """Tokenize a small fixed corpus into per-doc lists of lowercase unigrams.

    Returns:
        list[list[str]]: one list of lowercased token strings per input text,
        produced by ``extract.terms(doc, ngs=1)``.
    """
    texts = [
        "Mary had a little lamb. Its fleece was white as snow.",
        "Everywhere that Mary went the lamb was sure to go.",
        "It followed her to school one day, which was against the rule.",
        "It made the children laugh and play to see a lamb at school.",
        "And so the teacher turned it out, but still it lingered near.",
        "It waited patiently about until Mary did appear.",
        "Why does the lamb love Mary so? The eager children cry.",
        "Mary loves the lamb, you know, the teacher did reply.",
    ]
    nlp = textacy.load_spacy_lang("en_core_web_sm")
    tokenized = []
    for doc in nlp.pipe(texts):
        tokenized.append(
            [token.text.lower() for token in extract.terms(doc, ngs=1)]
        )
    return tokenized
def test_simple_args(self, spacy_doc):
    """Int/bool shorthand extractor args yield a non-empty list of ``Span``s."""
    results = list(extract.terms(spacy_doc, ngs=2, ents=True, ncs=True))
    assert results
    for result in results:
        assert isinstance(result, Span)
def test_default(self, spacy_doc):
    """Calling ``extract.terms`` without any extractor specified raises ``ValueError``."""
    with pytest.raises(ValueError):
        list(extract.terms(spacy_doc))