Пример #1
0
def ts():
    """Return a ``text_stats.TextStats`` object built from a fixed sample speech.

    The sample is a three-line excerpt parsed with ``make_spacy_doc`` using
    ``lang="en"``. Only the outer whitespace of the literal is stripped, so
    the second and third lines keep their leading indentation.
    """
    speech = """
    Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, children to feed, and financial obligations to meet.
    Mr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately. Newt Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they push their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the entire Senate and pass a continuing resolution now, now to reopen Government.
    Mr. Speaker, that is what the American people want, that is what they need, and that is what this body must do.
    """.strip()
    return text_stats.TextStats(make_spacy_doc(speech, lang="en"))
Пример #2
0
 def test_invalid_data(self):
     """Check that ``make_spacy_doc`` raises ``TypeError`` for unsupported data.

     The inputs tried are a bytes object, a dict, and a bool.
     """
     bad_inputs = (
         b"This is an English sentence in bytes.",
         {"content": "This is an English sentence as dict value."},
         True,
     )
     for bad_input in bad_inputs:
         # Each input must be rejected on its own; no lang is supplied.
         with pytest.raises(TypeError):
             make_spacy_doc(bad_input)
Пример #3
0
 def test_invalid_data_lang_combo(self):
     """Check that mismatched (data, lang) pairs are rejected.

     Each pair must make ``make_spacy_doc`` raise ``ValueError`` or
     ``TypeError``. The data side covers an already-parsed doc, a plain
     string, and a (text, metadata) record.
     """
     nlp = cache.load_spacy_lang("en")
     greeting = "Hello, how are you my friend?"
     sentence = "This is an English sentence."
     cases = [
         (nlp(greeting), "es"),
         (nlp(greeting), True),
         (sentence, True),
         ((sentence, {"foo": "bar"}), True),
     ]
     expected_errors = (ValueError, TypeError)
     for data, lang in cases:
         with pytest.raises(expected_errors):
             make_spacy_doc(data, lang=lang)
Пример #4
0
 def test_to_bag_of_words_values(self):
     """Check bag-of-words values for the "count", "freq" and "binary" weightings.

     The doc is built from a short string of repeated names and every call
     uses ``normalize="lower"`` and ``as_strings=True``. For "freq" the
     expected values are the raw counts divided by ``len(doc)``.
     """
     doc = make_spacy_doc("Burton Jacob DeWilde, Burton Jacob, Burton.",
                          lang="en")
     n_tokens = len(doc)
     # (weighting, expected value per term), checked in this order.
     expectations = (
         ("count", (("burton", 3), ("jacob", 2), ("dewilde", 1))),
         ("freq", (("burton", 3 / n_tokens),
                   ("jacob", 2 / n_tokens),
                   ("dewilde", 1 / n_tokens))),
         ("binary", (("burton", 1), ("jacob", 1), ("dewilde", 1))),
     )
     for weighting, expected_terms in expectations:
         bow = doc._.to_bag_of_words(
             weighting=weighting,
             normalize="lower",
             as_strings=True,
         )
         assert len(bow) == 3
         for term, expected_value in expected_terms:
             assert bow[term] == expected_value
Пример #5
0
 def test_invalid_lang(self):
     """Check that ``make_spacy_doc`` raises ``TypeError`` for bad ``lang`` values.

     The values tried are a bytes object, a list of strings, and a bool.
     """
     sentence = "This is an English sentence."
     for bad_lang in (b"en", ["en", "en_core_web_sm"], True):
         with pytest.raises(TypeError):
             make_spacy_doc(sentence, lang=bad_lang)
Пример #6
0
 def test_doc_data(self, langs):
     """Check that an already-parsed doc passes through ``make_spacy_doc``.

     The result must be a ``spacy.tokens.Doc`` both without a ``lang``
     argument and for every entry in ``langs``.
     """
     nlp = cache.load_spacy_lang("en")
     parsed = nlp("This is an English sentence.")
     result = make_spacy_doc(parsed)
     assert isinstance(result, spacy.tokens.Doc)
     for lang in langs:
         result = make_spacy_doc(parsed, lang=lang)
         assert isinstance(result, spacy.tokens.Doc)
Пример #7
0
 def test_record_data(self, langs):
     """Check that a (text, metadata) record yields a ``spacy.tokens.Doc``.

     The record is tried without a ``lang`` argument and then once for every
     entry in ``langs``.
     """
     sentence = "This is an English sentence."
     metadata = {"foo": "bar"}
     record = (sentence, metadata)
     result = make_spacy_doc(record)
     assert isinstance(result, spacy.tokens.Doc)
     for lang in langs:
         result = make_spacy_doc(record, lang=lang)
         assert isinstance(result, spacy.tokens.Doc)
Пример #8
0
 def test_text_data(self, langs):
     """Check that a plain string yields a ``spacy.tokens.Doc``.

     The string is tried without a ``lang`` argument and then once for every
     entry in ``langs``.
     """
     sentence = "This is an English sentence."
     result = make_spacy_doc(sentence)
     assert isinstance(result, spacy.tokens.Doc)
     for lang in langs:
         result = make_spacy_doc(sentence, lang=lang)
         assert isinstance(result, spacy.tokens.Doc)
Пример #9
0
def doc(request):
    """Return a doc built from ``TEXT`` paired with a small metadata dict.

    ``request`` is accepted but not used by the body.
    """
    record = (TEXT, {"foo": "bar!"})
    return make_spacy_doc(record, lang="en")
Пример #10
0
def doc2(text2):
    """Return the doc produced by ``make_spacy_doc`` for ``text2`` with ``lang="en"``."""
    parsed = make_spacy_doc(text2, lang="en")
    return parsed
Пример #11
0
def doc1(text1):
    """Return the doc produced by ``make_spacy_doc`` for ``text1`` with ``lang="en"``."""
    parsed = make_spacy_doc(text1, lang="en")
    return parsed
Пример #12
0
def doc(text):
    """Return a doc for ``text`` using a language object loaded via ``cache``.

    The object returned by ``cache.load_spacy_lang("en")`` is passed as
    ``lang``, not the string ``"en"``.
    """
    return make_spacy_doc(text, lang=cache.load_spacy_lang("en"))
Пример #13
0
def doc_pairs(text_pairs):
    """Convert each pair of texts into a pair of docs.

    Returns a list of 2-tuples in the same order as ``text_pairs``, with both
    members of each pair built by ``make_spacy_doc`` using ``lang="en"``.
    """

    def _to_en_doc(text):
        # Single place for the shared lang argument.
        return make_spacy_doc(text, lang="en")

    return [(_to_en_doc(first), _to_en_doc(second))
            for first, second in text_pairs]