# NOTE: imports added for self-containment; module paths assume the Fonduer
# package layout.
from fonduer.candidates import MentionNgrams
from fonduer.candidates.matchers import (
    DateMatcher,
    LocationMatcher,
    MiscMatcher,
    NumberMatcher,
    OrganizationMatcher,
    PersonMatcher,
)
from fonduer.parser.lingual_parser import SpacyParser
from fonduer.parser.lingual_parser.spacy_parser import (
    TokenPreservingTokenizer,
    set_custom_boundary,
)
from fonduer.parser.models import Document, Sentence


def test_split_sentences_by_char_limit():
    """Unit test of splitting sentences by char limit."""
    lingual_parser = SpacyParser("en")
    text = "This is a text. This is another text."

    all_sentences = [
        Sentence(**parts) for parts in lingual_parser.split_sentences(text)
    ]
    assert len(all_sentences) == 2
    assert [len(sentence.text) for sentence in all_sentences] == [15, 21]

    lingual_parser.model.remove_pipe("sentencizer")
    lingual_parser.model.add_pipe(
        set_custom_boundary, before="parser", name="sentence_boundary_detector"
    )

    sentence_batches = lingual_parser._split_sentences_by_char_limit(
        all_sentences, 20
    )
    assert len(sentence_batches) == 2
    sentence_batches = lingual_parser._split_sentences_by_char_limit(
        all_sentences, 100
    )
    assert len(sentence_batches) == 1

    sentence_batch = sentence_batches[0]
    custom_tokenizer = TokenPreservingTokenizer(lingual_parser.model.vocab)
    doc = custom_tokenizer(sentence_batch)
    doc.user_data = sentence_batch
    for name, proc in lingual_parser.model.pipeline:  # iterate over components in order
        doc = proc(doc)
    assert doc.is_parsed
    # Check that the number of parsed spaCy sentences matches that of input sentences.
    assert len(list(doc.sents)) == len(sentence_batch)
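# --- Illustrative sketch (not part of the original suite) --------------------
# split_sentences() yields one dict of Sentence constructor arguments per
# sentence; only the "text" key is relied on here, and the exact set of other
# keys is an implementation detail of SpacyParser.
def example_split_sentences_yields_dicts():
    lingual_parser = SpacyParser("en")
    parts_list = list(
        lingual_parser.split_sentences("This is a text. This is another text.")
    )
    assert [parts["text"] for parts in parts_list] == [
        "This is a text.",
        "This is another text.",
    ]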
def doc_setup():
    """Set up a document with a single parsed sentence."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple"
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    return doc
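# --- Illustrative sketch (not part of the original suite) --------------------
# A hypothetical consumer of the fixture above: each parsed sentence is
# attached to the document through its "document" field, so it is reachable
# via doc.sentences.
def example_doc_setup_usage():
    doc = doc_setup()
    assert len(doc.sentences) == 1
    assert doc.sentences[0].text == "This is apple"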
def doc_setup(): """Set up document.""" doc = Document(id=1, name="test", stable_id="1::document:0:0") doc.text = "This is apple. That is orange. Where is banaba? I like Apple." lingual_parser = SpacyParser("en") # Split sentences for parts in lingual_parser.split_sentences(doc.text): parts["document"] = doc Sentence(**parts) # Enrich sentences for _ in lingual_parser.enrich_sentences_with_NLP(doc.sentences): pass # Pick one sentence and add visual information # so that all the words get aligned horizontally. sentence: Sentence = doc.sentences[0] sentence.page = [1, 1, 1, 1] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] # Assume the 2nd sentence is horizontally aligned with 1st. sentence: Sentence = doc.sentences[1] sentence.page = [1, 1, 1, 1] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [40, 50, 60, 70] sentence.right = [50, 60, 70, 80] # Assume the 3rd sentence is vertically aligned with 1st. sentence: Sentence = doc.sentences[2] sentence.page = [1, 1, 1, 1] sentence.top = [10, 10, 10, 10] sentence.bottom = [20, 20, 20, 20] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] # Assume the 4th sentence is in 2nd page. sentence: Sentence = doc.sentences[3] sentence.page = [2, 2, 2, 2] sentence.top = [0, 0, 0, 0] sentence.bottom = [10, 10, 10, 10] sentence.left = [0, 10, 20, 30] sentence.right = [10, 20, 30, 40] return doc
def test_ner_matchers():
    """Test different NER type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join(
        [
            "Tim Cook was born in USA in 1960.",
            "He is the CEO of Apple.",
            "He sold 100 million of iPhone.",
        ]
    )
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Manually attach ner_tags as the result from spaCy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON",
        "PERSON",
        "O",
        "O",
        "O",
        "GPE",
        "O",
        "DATE",
        "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = ["O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"]
    # The length of words should match that of ner_tags.
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)

    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert set(tc.get_span() for tc in matcher.apply(space.apply(doc))) == {"iPhone"}
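# --- Illustrative sketch (not part of the original suite) --------------------
# Matchers compose. This sketch assumes a Union combinator is exported by
# fonduer.candidates.matchers, accepting a span if any child matcher does;
# the expected spans follow directly from the assertions above.
def example_union_matcher():
    from fonduer.candidates.matchers import Union

    doc = doc_setup()
    space = MentionNgrams(n_min=1, n_max=2)
    matcher = Union(PersonMatcher(), OrganizationMatcher())
    spans = {tc.get_span() for tc in matcher.apply(space.apply(doc))}
    assert spans == {"Tim Cook", "Apple"}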