示例#1
0
def ts():
    """Return a ``text_stats.TextStats`` built from a fixed English speech excerpt.

    The excerpt is parsed with ``make_spacy_doc(..., lang="en")``. Only the
    outer whitespace of the literal is stripped, so the interior lines keep
    their leading indentation.
    """
    speech = """
    Mr. Speaker, 480,000 Federal employees are working without pay, a form of involuntary servitude; 280,000 Federal employees are not working, and they will be paid. Virtually all of these workers have mortgages to pay, children to feed, and financial obligations to meet.
    Mr. Speaker, what is happening to these workers is immoral, is wrong, and must be rectified immediately. Newt Gingrich and the Republican leadership must not continue to hold the House and the American people hostage while they push their disastrous 7-year balanced budget plan. The gentleman from Georgia, Mr. Gingrich, and the Republican leadership must join Senator Dole and the entire Senate and pass a continuing resolution now, now to reopen Government.
    Mr. Speaker, that is what the American people want, that is what they need, and that is what this body must do.
    """.strip()
    return text_stats.TextStats(make_spacy_doc(speech, lang="en"))
示例#2
0
 def test_invalid_data(self):
     """Check that ``make_spacy_doc`` raises ``TypeError`` for bytes, dict, and bool data."""
     bad_inputs = (
         b"This is an English sentence in bytes.",
         {"content": "This is an English sentence as dict value."},
         True,
     )
     for bad_input in bad_inputs:
         # Each input is checked in its own context so every case must raise.
         with pytest.raises(TypeError):
             make_spacy_doc(bad_input)
示例#3
0
 def test_invalid_data_lang_combo(self):
     """Check that each invalid (data, lang) pairing raises ``ValueError`` or ``TypeError``.

     Covers a doc produced by the "en" pipeline paired with ``"es"`` or a
     bool, plus a plain text and a (text, metadata) record paired with a bool.
     """
     en_pipeline = cache.load_spacy_lang("en")
     greeting = "Hello, how are you my friend?"
     sentence = "This is an English sentence."
     bad_pairs = [
         # Two separate docs are built on purpose, one per case.
         (en_pipeline(greeting), "es"),
         (en_pipeline(greeting), True),
         (sentence, True),
         ((sentence, {"foo": "bar"}), True),
     ]
     for data, lang in bad_pairs:
         with pytest.raises((ValueError, TypeError)):
             make_spacy_doc(data, lang=lang)
示例#4
0
 def test_to_bag_of_words_values(self):
     """Check bag-of-words values under "count", "freq", and "binary" weighting.

     The doc is built from a short text of repeated names; terms are
     lower-cased and returned as strings, and exactly three distinct terms
     are expected for every weighting.
     """
     doc = make_spacy_doc("Burton Jacob DeWilde, Burton Jacob, Burton.",
                          lang="en")
     n_tokens = len(doc)
     # (weighting, expected value per term), checked in this order.
     cases = (
         ("count", {"burton": 3, "jacob": 2, "dewilde": 1}),
         ("freq", {
             "burton": 3 / n_tokens,
             "jacob": 2 / n_tokens,
             "dewilde": 1 / n_tokens,
         }),
         ("binary", {"burton": 1, "jacob": 1, "dewilde": 1}),
     )
     for weighting, expected in cases:
         bow = doc._.to_bag_of_words(weighting=weighting,
                                     normalize="lower",
                                     as_strings=True)
         assert len(bow) == 3
         for term, value in expected.items():
             assert bow[term] == value
示例#5
0
 def test_invalid_lang(self):
     """Check that ``make_spacy_doc`` raises ``TypeError`` for bytes, list, and bool ``lang`` values."""
     sentence = "This is an English sentence."
     bad_langs = (b"en", ["en", "en_core_web_sm"], True)
     for bad_lang in bad_langs:
         with pytest.raises(TypeError):
             make_spacy_doc(sentence, lang=bad_lang)
示例#6
0
 def test_doc_data(self, langs):
     """Check that an existing spaCy doc is accepted, with and without ``lang``.

     ``langs`` is an iterable of ``lang`` values supplied by the caller; each
     one must yield a ``spacy.tokens.Doc``.
     """
     en_doc = cache.load_spacy_lang("en")("This is an English sentence.")
     # First without an explicit lang, then once per provided lang.
     result = make_spacy_doc(en_doc)
     assert isinstance(result, spacy.tokens.Doc)
     for lang in langs:
         result = make_spacy_doc(en_doc, lang=lang)
         assert isinstance(result, spacy.tokens.Doc)
示例#7
0
 def test_record_data(self, langs):
     """Check that a (text, metadata) record is accepted, with and without ``lang``.

     ``langs`` is an iterable of ``lang`` values supplied by the caller; each
     one must yield a ``spacy.tokens.Doc``.
     """
     text_and_meta = ("This is an English sentence.", {"foo": "bar"})
     result = make_spacy_doc(text_and_meta)
     assert isinstance(result, spacy.tokens.Doc)
     for lang in langs:
         result = make_spacy_doc(text_and_meta, lang=lang)
         assert isinstance(result, spacy.tokens.Doc)
示例#8
0
 def test_text_data(self, langs):
     """Check that a plain text string is accepted, with and without ``lang``.

     ``langs`` is an iterable of ``lang`` values supplied by the caller; each
     one must yield a ``spacy.tokens.Doc``.
     """
     sentence = "This is an English sentence."
     result = make_spacy_doc(sentence)
     assert isinstance(result, spacy.tokens.Doc)
     for lang in langs:
         result = make_spacy_doc(sentence, lang=lang)
         assert isinstance(result, spacy.tokens.Doc)
示例#9
0
def doc(request):
    """Return a spaCy doc built from ``TEXT`` with ``{"foo": "bar!"}`` as metadata.

    ``request`` is accepted but not used in the body.
    """
    record = (TEXT, {"foo": "bar!"})
    return make_spacy_doc(record, lang="en")
示例#10
0
def doc2(text2):
    """Return the result of ``make_spacy_doc`` for ``text2`` with ``lang="en"``."""
    parsed = make_spacy_doc(text2, lang="en")
    return parsed
示例#11
0
def doc1(text1):
    """Return the result of ``make_spacy_doc`` for ``text1`` with ``lang="en"``."""
    parsed = make_spacy_doc(text1, lang="en")
    return parsed
示例#12
0
def doc(text):
    """Return a spaCy doc for ``text``, passing the loaded "en" pipeline object as ``lang``."""
    return make_spacy_doc(text, lang=cache.load_spacy_lang("en"))
示例#13
0
def doc_pairs(text_pairs):
    """Return a list of ``(doc1, doc2)`` tuples, one per ``(text1, text2)`` pair.

    Each text is converted with ``make_spacy_doc(..., lang="en")``; within a
    pair the first text is processed before the second.
    """

    def _parse(text):
        # Single place for the shared conversion settings.
        return make_spacy_doc(text, lang="en")

    return [(_parse(first), _parse(second)) for first, second in text_pairs]