def evaluate_unit_overlap(evaluated_sentences, reference_sentences):
    evaluated_words = tuple(chain(*(s.words for s in evaluated_sentences)))
    reference_words = tuple(chain(*(s.words for s in reference_sentences)))
    evaluated_model = TfDocumentModel(evaluated_words)
    reference_model = TfDocumentModel(reference_words)
    return unit_overlap(evaluated_model, reference_model)
def evaluate_cosine_similarity(evaluated_sentences, reference_sentences):
    evaluated_words = tuple(chain(*(s.words for s in evaluated_sentences)))
    reference_words = tuple(chain(*(s.words for s in reference_sentences)))
    evaluated_model = TfDocumentModel(evaluated_words)
    reference_model = TfDocumentModel(reference_words)
    return cosine_similarity(evaluated_model, reference_model)
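# Usage sketch for the two helpers above (illustrative, not from the original
# source). It assumes sumy's PlaintextParser/Tokenizer API, where
# parser.document.sentences yields Sentence objects exposing a .words tuple,
# which is exactly what the helpers iterate over. Import paths follow sumy's
# public package layout.
from itertools import chain

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.models import TfDocumentModel
from sumy.evaluation import cosine_similarity, unit_overlap

def example_content_based_scores():
    tokenizer = Tokenizer("english")
    evaluated = PlaintextParser.from_string("The cat sat on the mat.", tokenizer)
    reference = PlaintextParser.from_string("A cat sat on a mat.", tokenizer)
    cos = evaluate_cosine_similarity(evaluated.document.sentences,
                                     reference.document.sentences)
    overlap = evaluate_unit_overlap(evaluated.document.sentences,
                                    reference.document.sentences)
    return cos, overlap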
def test_cosine_half_match(self):
    tokenizer = Tokenizer("czech")
    # The sentences share four of their eight words (sa, tu, veľmi, hľadá),
    # so the cosine similarity of the two TF vectors is 4 / (√8 · √8) = 0.5.
    model1 = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", tokenizer)
    model2 = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", tokenizer)
    self.assertAlmostEqual(cosine_similarity(model1, model2), 0.5)
def test_unit_overlap_no_match():
    tokenizer = Tokenizer("czech")
    model1 = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", tokenizer)
    model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", tokenizer)
    assert unit_overlap(model1, model2) == approx(0.0)
def test_cosine_no_match():
    tokenizer = Tokenizer("czech")
    model1 = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", tokenizer)
    model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", tokenizer)
    assert cosine_similarity(model1, model2) == approx(0.0)
def test_unit_overlap_half_match(self):
    tokenizer = Tokenizer("czech")
    # Four shared words, eight distinct terms per model: unit overlap is
    # |A ∩ B| / (|A| + |B| - |A ∩ B|) = 4 / (8 + 8 - 4) = 1/3.
    model1 = TfDocumentModel("Veta aká sa tu len veľmi ťažko hľadá", tokenizer)
    model2 = TfDocumentModel("Teta ktorá sa tu iba veľmi zle hľadá", tokenizer)
    self.assertAlmostEqual(unit_overlap(model1, model2), 1 / 3)
def test_unit_overlap_no_match(self):
    tokenizer = Tokenizer("czech")
    model1 = TfDocumentModel("Toto je moja veta. To sa nedá poprieť!", tokenizer)
    model2 = TfDocumentModel("Hento bolo jeho slovo, ale možno klame.", tokenizer)
    self.assertAlmostEqual(unit_overlap(model1, model2), 0.0)
def test_pretokenized_words_frequencies(self):
    model = TfDocumentModel(("wC", "wC", "WC", "wA", "WB", "wB"))
    self.assertEqual(model.term_frequency("wa"), 1)
    self.assertEqual(model.term_frequency("wb"), 2)
    self.assertEqual(model.term_frequency("wc"), 3)
    self.assertEqual(model.term_frequency("wd"), 0)
    self.assertEqual(model.most_frequent_terms(), ("wc", "wb", "wa"))
def test_pretokenized_words_frequencies():
    model = TfDocumentModel(("wC", "wC", "WC", "wA", "WB", "wB"))
    assert model.term_frequency("wa") == 1
    assert model.term_frequency("wb") == 2
    assert model.term_frequency("wc") == 3
    assert model.term_frequency("wd") == 0
    assert model.most_frequent_terms() == ("wc", "wb", "wa")
def test_empty_model(self):
    text = "Toto je moja veta, to sa nedá poprieť."
    model = TfDocumentModel(text, Tokenizer("czech"))
    empty_model = TfDocumentModel([])
    self.assertRaises(ValueError, cosine_similarity, empty_model, empty_model)
    self.assertRaises(ValueError, cosine_similarity, empty_model, model)
    self.assertRaises(ValueError, cosine_similarity, model, empty_model)
def test_most_frequent_terms(self):
    tokenizer = Tokenizer("english")
    text = "wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE"
    model = TfDocumentModel(text, tokenizer)
    self.assertEqual(model.most_frequent_terms(1), ("we",))
    self.assertEqual(model.most_frequent_terms(2), ("we", "wd"))
    self.assertEqual(model.most_frequent_terms(3), ("we", "wd", "wc"))
    self.assertEqual(model.most_frequent_terms(4), ("we", "wd", "wc", "wb"))
    self.assertEqual(model.most_frequent_terms(5), ("we", "wd", "wc", "wb", "wa"))
    self.assertEqual(model.most_frequent_terms(), ("we", "wd", "wc", "wb", "wa"))
def test_most_frequent_terms():
    tokenizer = Tokenizer("english")
    text = "wE wD wC wB wA wE WD wC wB wE wD WE wC wD wE"
    model = TfDocumentModel(text, tokenizer)
    assert model.most_frequent_terms(1) == ("we",)
    assert model.most_frequent_terms(2) == ("we", "wd")
    assert model.most_frequent_terms(3) == ("we", "wd", "wc")
    assert model.most_frequent_terms(4) == ("we", "wd", "wc", "wb")
    assert model.most_frequent_terms(5) == ("we", "wd", "wc", "wb", "wa")
    assert model.most_frequent_terms() == ("we", "wd", "wc", "wb", "wa")
def test_empty_model():
    text = "Toto je moja veta, to sa nedá poprieť."
    model = TfDocumentModel(text, Tokenizer("czech"))
    empty_model = TfDocumentModel([])
    with pytest.raises(ValueError):
        cosine_similarity(empty_model, empty_model)
    with pytest.raises(ValueError):
        cosine_similarity(empty_model, model)
    with pytest.raises(ValueError):
        cosine_similarity(model, empty_model)
def test_normalized_words_frequencies():
    words = "a b c d e c b d c e e d e d e".split()
    model = TfDocumentModel(tuple(words))
    assert model.normalized_term_frequency("a") == pytest.approx(1 / 5)
    assert model.normalized_term_frequency("b") == pytest.approx(2 / 5)
    assert model.normalized_term_frequency("c") == pytest.approx(3 / 5)
    assert model.normalized_term_frequency("d") == pytest.approx(4 / 5)
    assert model.normalized_term_frequency("e") == pytest.approx(5 / 5)
    assert model.normalized_term_frequency("z") == pytest.approx(0.0)
    assert model.most_frequent_terms() == ("e", "d", "c", "b", "a")
def test_normalized_words_frequencies(self):
    words = "a b c d e c b d c e e d e d e".split()
    model = TfDocumentModel(tuple(words))
    self.assertAlmostEqual(model.normalized_term_frequency("a"), 1 / 5)
    self.assertAlmostEqual(model.normalized_term_frequency("b"), 2 / 5)
    self.assertAlmostEqual(model.normalized_term_frequency("c"), 3 / 5)
    self.assertAlmostEqual(model.normalized_term_frequency("d"), 4 / 5)
    self.assertAlmostEqual(model.normalized_term_frequency("e"), 5 / 5)
    self.assertAlmostEqual(model.normalized_term_frequency("z"), 0.0)
    self.assertEqual(model.most_frequent_terms(), ("e", "d", "c", "b", "a"))
def test_normalized_words_frequencies_with_smoothing_term():
    words = "a b c d e c b d c e e d e d e".split()
    model = TfDocumentModel(tuple(words))
    assert model.normalized_term_frequency("a", 0.5) == pytest.approx(0.5 + 1 / 10)
    assert model.normalized_term_frequency("b", 0.5) == pytest.approx(0.5 + 2 / 10)
    assert model.normalized_term_frequency("c", 0.5) == pytest.approx(0.5 + 3 / 10)
    assert model.normalized_term_frequency("d", 0.5) == pytest.approx(0.5 + 4 / 10)
    assert model.normalized_term_frequency("e", 0.5) == pytest.approx(0.5 + 5 / 10)
    assert model.normalized_term_frequency("z", 0.5) == pytest.approx(0.5)
    assert model.most_frequent_terms() == ("e", "d", "c", "b", "a")
def test_normalized_words_frequencies_with_smoothing_term(self):
    words = "a b c d e c b d c e e d e d e".split()
    model = TfDocumentModel(tuple(words))
    self.assertAlmostEqual(model.normalized_term_frequency("a", 0.5), 0.5 + 1 / 10)
    self.assertAlmostEqual(model.normalized_term_frequency("b", 0.5), 0.5 + 2 / 10)
    self.assertAlmostEqual(model.normalized_term_frequency("c", 0.5), 0.5 + 3 / 10)
    self.assertAlmostEqual(model.normalized_term_frequency("d", 0.5), 0.5 + 4 / 10)
    self.assertAlmostEqual(model.normalized_term_frequency("e", 0.5), 0.5 + 5 / 10)
    self.assertAlmostEqual(model.normalized_term_frequency("z", 0.5), 0.5)
    self.assertEqual(model.most_frequent_terms(), ("e", "d", "c", "b", "a"))
def test_unit_overlap_wrong_arguments(self):
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("", tokenizer)
    self.assertRaises(ValueError, unit_overlap, "model", "model")
    self.assertRaises(ValueError, unit_overlap, "model", model)
    self.assertRaises(ValueError, unit_overlap, model, "model")
def test_terms(self):
    tokenizer = Tokenizer("english")
    text = "wA wB wC wD wB wD wE"
    model = TfDocumentModel(text, tokenizer)
    terms = tuple(sorted(model.terms))
    self.assertEqual(terms, ("wa", "wb", "wc", "wd", "we"))
def load_summary(filename):
    """
    Load the summary for analysis.

    Parameters:
        filename: the filename of the summary text file

    Returns:
        spaCy-processed text and sumy-processed text for analysis.
    """
    spacy_available = True
    try:
        nlp = load('en_core_web_lg')
    except OSError:  # spaCy raises OSError when the model is not installed
        spacy_available = False
    if not isfile(filename):
        return '', ''
    if spacy_available:
        with open(filename, 'r') as summary_file:
            summary_text = ' '.join(line.strip() for line in summary_file)
        summary_doc = nlp(summary_text)
    else:
        summary_doc = ''
    summary_parser = PlaintextParser.from_file(filename, Tokenizer("english"))
    summary_model = TfDocumentModel(str(summary_parser.document.sentences),
                                    Tokenizer("english"))
    return summary_doc, summary_model
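# Usage sketch (the filename is hypothetical, not from the original source).
# Both sentinel returns need guarding: a missing file yields ('', ''), and a
# missing spaCy model yields ('', <TfDocumentModel>).
#
#     summary_doc, summary_model = load_summary("summary.txt")
#     if summary_model == '':
#         print("summary file not found")
#     elif summary_doc == '':
#         # spaCy unavailable; fall back to the sumy model only
#         print(summary_model.most_frequent_terms(5))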
def test_wrong_arguments(self):
    text = "Toto je moja veta, to sa nedá poprieť."
    model = TfDocumentModel(text, Tokenizer("czech"))
    self.assertRaises(ValueError, cosine_similarity, text, text)
    self.assertRaises(ValueError, cosine_similarity, text, model)
    self.assertRaises(ValueError, cosine_similarity, model, text)
def test_unit_overlap_wrong_arguments():
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("", tokenizer)
    with pytest.raises(ValueError):
        unit_overlap("model", "model")
    with pytest.raises(ValueError):
        unit_overlap("model", model)
    with pytest.raises(ValueError):
        unit_overlap(model, "model")
def test_wrong_arguments():
    text = "Toto je moja veta, to sa nedá poprieť."
    model = TfDocumentModel(text, Tokenizer("czech"))
    with pytest.raises(ValueError):
        cosine_similarity(text, text)
    with pytest.raises(ValueError):
        cosine_similarity(text, model)
    with pytest.raises(ValueError):
        cosine_similarity(model, text)
def evaluate(summary, sumref, debug=False):
    sumstring = sumtostr(summary)
    sumtuple = sumtotup(summary)
    refstring = sumtostr(sumref)
    reftuple = sumtotup(sumref)
    summodel = TfDocumentModel(sumstring, Tokenizer("english"))
    refmodel = TfDocumentModel(refstring, Tokenizer("english"))
    if debug:
        print(reftuple)
        print(sumtuple)
    # Content-based measures compare term-frequency vectors; the co-selection
    # measures (precision/recall/F) compare the selected sentences directly.
    cos_val = cosine_similarity(summodel, refmodel)
    unit_val = unit_overlap(summodel, refmodel)
    precision_val = precision(sumtuple, reftuple)
    recall_val = recall(sumtuple, reftuple)
    f_val = f_score(sumtuple, reftuple)
    return cos_val, unit_val, precision_val, recall_val, f_val
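# sumtostr() and sumtotup() are not shown in this listing. Below is a minimal
# sketch of plausible implementations, assuming `summary` and `sumref` are
# sequences of sumy Sentence objects; the bodies are guesses, not the original
# code.
def sumtostr(summary):
    # Space-joined string of sentence texts, suitable as TfDocumentModel input.
    return " ".join(str(sentence) for sentence in summary)

def sumtotup(summary):
    # Tuple of sentences, as sumy's precision/recall/f_score expect.
    return tuple(summary)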
def test_term_frequency(self):
    tokenizer = Tokenizer("english")
    text = "wA wB wC wA wA wC wD wCwB"
    model = TfDocumentModel(text, tokenizer)
    self.assertEqual(model.term_frequency("wa"), 3)
    self.assertEqual(model.term_frequency("wb"), 1)
    self.assertEqual(model.term_frequency("wc"), 2)
    self.assertEqual(model.term_frequency("wd"), 1)
    self.assertEqual(model.term_frequency("wcwb"), 1)
    self.assertEqual(model.term_frequency("we"), 0)
    self.assertEqual(model.term_frequency("missing"), 0)
def test_term_frequency():
    tokenizer = Tokenizer("english")
    text = "wA wB wC wA wA wC wD wCwB"
    model = TfDocumentModel(text, tokenizer)
    assert model.term_frequency("wa") == 3
    assert model.term_frequency("wb") == 1
    assert model.term_frequency("wc") == 2
    assert model.term_frequency("wd") == 1
    assert model.term_frequency("wcwb") == 1
    assert model.term_frequency("we") == 0
    assert model.term_frequency("missing") == 0
def test_most_frequent_terms_empty(self):
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("", tokenizer)
    self.assertEqual(model.most_frequent_terms(), ())
    self.assertEqual(model.most_frequent_terms(10), ())
def test_magnitude(self):
    tokenizer = Tokenizer("english")
    text = "wA wB wC wD"
    model = TfDocumentModel(text, tokenizer)
    # Four terms, each with frequency 1: magnitude is sqrt(4 * 1^2) = 2.
    self.assertAlmostEqual(model.magnitude, 2.0)
def test_most_frequent_terms_empty():
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("", tokenizer)
    assert model.most_frequent_terms() == ()
    assert model.most_frequent_terms(10) == ()
def test_most_frequent_terms_negative_count(self):
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("text", tokenizer)
    self.assertRaises(ValueError, model.most_frequent_terms, -1)
def test_pretokenized_words(self):
    model = TfDocumentModel(("wA", "WB", "wB", "WA"))
    terms = tuple(sorted(model.terms))
    self.assertEqual(terms, ("wa", "wb"))
def test_most_frequent_terms_negative_count():
    tokenizer = Tokenizer("english")
    model = TfDocumentModel("text", tokenizer)
    with pytest.raises(ValueError):
        model.most_frequent_terms(-1)