def test_tf_idf_5(self):
    """Check tf_idf on two documents that have the term "b" in common.

    Each expected weight is spelled out as
    (occurrences in the document / document length) *
    log(number of documents / number of documents containing the term).
    """
    documents = [["a", "b", "a", "c"], ["b", "d"]]
    result = tf_idf(documents)

    expected_terms = ["a", "b", "c", "d"]
    first_doc_weights = [
        2/4.0 * log(2/1.0),  # a
        1/4.0 * log(2/2.0),  # b
        1/4.0 * log(2/1.0),  # c
        0/4.0 * log(2/1.0),  # d
    ]
    second_doc_weights = [
        0/2.0 * log(2/1.0),  # a
        1/2.0 * log(2/2.0),  # b
        0/2.0 * log(2/1.0),  # c
        1/2.0 * log(2/1.0),  # d
    ]
    self.assertEqual(
        result,
        (expected_terms, [first_doc_weights, second_doc_weights]),
    )
def test_tf_idf_5(self):
    """Verify the terms and weights tf_idf returns for a two-document input.

    The first document holds "a" twice plus "b" and "c"; the second holds
    "b" and "d", so "b" is the only term present in both documents.
    """
    corpus = [["a", "b", "a", "c"], ["b", "d"]]
    actual = tf_idf(corpus)

    # The expected values only ever use two distinct idf factors.
    idf_in_one_doc = log(2 / 1.0)
    idf_in_both_docs = log(2 / 2.0)

    expected = (
        ["a", "b", "c", "d"],
        [
            [
                2 / 4.0 * idf_in_one_doc,    # a
                1 / 4.0 * idf_in_both_docs,  # b
                1 / 4.0 * idf_in_one_doc,    # c
                0 / 4.0 * idf_in_one_doc,    # d
            ],
            [
                0 / 2.0 * idf_in_one_doc,    # a
                1 / 2.0 * idf_in_both_docs,  # b
                0 / 2.0 * idf_in_one_doc,    # c
                1 / 2.0 * idf_in_one_doc,    # d
            ],
        ],
    )
    self.assertEqual(actual, expected)
def tf_idf_indicator_weight(terms_per_elem):
    """Return only the weights from tf_idf run in normalized, indicator mode.

    The pair of documents is considered as one combined collection of N
    single-sentence documents. The term frequency is either 1 or 0,
    depending on whether the term occurs in the sentence.

    tf_idf is called with ``normalized=True`` and ``indicator=True``; the
    term list it returns alongside the weights is discarded.
    """
    result = tf_idf(terms_per_elem, normalized=True, indicator=True)
    # Unpack as a pair so an unexpected result shape still fails loudly.
    _ignored_terms, weights = result
    return weights
def test_tf_idf_empty_3(self):
    """Check tf_idf when a one-term document is followed by an empty one.

    The empty document is expected to get a weight of 0.0 for the term "a".
    """
    documents = [["a"], []]
    actual = tf_idf(documents)

    expected_terms = ["a"]
    expected_weights = [
        [1 / 1 * log(2 / 1)],  # document containing "a"
        [0.0],                 # empty document
    ]
    self.assertEqual(actual, (expected_terms, expected_weights))
def test_tf_idf_1(self):
    """Check tf_idf on a single document holding the single term "a".

    The expected weight is written as 1/1.0 * log(1/1.0).
    """
    documents = [["a"]]
    actual = tf_idf(documents)

    only_weight = 1 / 1.0 * log(1 / 1.0)
    expected = (["a"], [[only_weight]])
    self.assertEqual(actual, expected)
def test_tf_idf_empty_2(self):
    """Check tf_idf on one empty document: no terms, one empty weight row."""
    single_empty_document = [[]]
    actual = tf_idf(single_empty_document)

    no_terms = []
    one_empty_row = [[]]
    self.assertEqual(actual, (no_terms, one_empty_row))
def test_tf_idf_empty_1(self):
    """Check tf_idf on an empty collection: no terms and no weight rows."""
    no_documents = []
    actual = tf_idf(no_documents)

    expected = ([], [])
    self.assertEqual(actual, expected)
def test_tf_idf_4(self):
    """Check tf_idf on two single-term documents with no term in common.

    Each document is expected to weight its own term with 1/1 * log(2/1.0)
    and the other document's term with 0.0.
    """
    documents = [["a"], ["b"]]
    actual = tf_idf(documents)

    expected_terms = ["a", "b"]
    weights_for_first = [1 / 1 * log(2 / 1.0), 0.0]
    weights_for_second = [0.0, 1 / 1 * log(2 / 1.0)]
    self.assertEqual(
        actual,
        (expected_terms, [weights_for_first, weights_for_second]),
    )
def test_tf_idf_1(self):
    """Verify the tf_idf result for a collection of one one-term document."""
    corpus = [["a"]]
    result = tf_idf(corpus)

    expected_terms = ["a"]
    # Single document, single term: weight is written as 1/1.0 * log(1/1.0).
    expected_weights = [[1/1.0 * log(1/1.0)]]
    self.assertEqual(result, (expected_terms, expected_weights))
def test_tf_idf_empty_3(self):
    """Verify tf_idf handles an empty document alongside a non-empty one."""
    corpus = [["a"], []]
    result = tf_idf(corpus)

    weight_in_nonempty_doc = 1/1 * log(2/1)
    expected = (
        ["a"],
        [
            [weight_in_nonempty_doc],
            [0.0],  # the empty document
        ],
    )
    self.assertEqual(result, expected)
def test_tf_idf_4(self):
    """Verify tf_idf for two documents that each hold one distinct term."""
    corpus = [["a"], ["b"]]
    result = tf_idf(corpus)

    expected = (
        ["a", "b"],
        [
            [1/1 * log(2/1.0), 0.0],  # document ["a"]
            [0.0, 1/1 * log(2/1.0)],  # document ["b"]
        ],
    )
    self.assertEqual(result, expected)