class TestMetrics(TestCase):
    """Checks for the Jaccard, Euclidean and cosine helpers of ``Metrics``."""

    def setUp(self):
        # Every test method gets its own Metrics instance.
        self.m = Metrics()

    def test_jaccard_same_len(self):
        # Vectors of different lengths are expected to be rejected.
        short_vec = [0, 1]
        long_vec = [0, 1, 2, 3]
        with self.assertRaises(ValueError):
            self.m.jaccard_vectors(short_vec, long_vec)

    def test_jaccard_empty(self):
        # Two empty vectors are expected to score 1.
        score = self.m.jaccard_vectors([], [])
        self.assertEqual(score, 1)

    def test_jaccard_int(self):
        # Integer vectors; the expected score for this pair is 0.75.
        left = [0, 2, 1, 3]
        right = [0, 1, 2, 3]
        score = self.m.jaccard_vectors(left, right)
        self.assertEqual(score, 0.75)

    def test_jaccard_bool(self):
        # Boolean vectors; the expected score for this pair is 0.4.
        left = [False, False, True, True, True]
        right = [False, True, True, True, False]
        score = self.m.jaccard_vectors(left, right)
        self.assertEqual(score, 0.4)

    def test_euclid_same_len(self):
        # Vectors of different lengths are expected to be rejected.
        long_vec = [0, 1, 2, 3]
        short_vec = [0, 1]
        with self.assertRaises(ValueError):
            self.m.euclid_vectors(long_vec, short_vec)

    def test_euclid(self):
        # (1, 1) -> (4, 5) is a 3-4-5 triangle, so the distance is 5.
        distance = self.m.euclid_vectors([1, 1], [4, 5])
        self.assertEqual(distance, 5)

    def test_cos_same_len(self):
        # Vectors of different lengths are expected to be rejected.
        long_vec = [0, 1, 2]
        short_vec = [1, 1]
        with self.assertRaises(ValueError):
            self.m.cos_vectors(long_vec, short_vec)

    def test_cos_0(self):
        # Vectors with no shared non-zero position: expected value 0
        # after rounding to five decimals.
        value = self.m.cos_vectors([1, 0, 1], [0, 1, 0])
        rounded = round(value, 5)
        self.assertEqual(rounded, float(0))

    def test_cos_1(self):
        # Identical vectors: expected value 1 after rounding to five decimals.
        value = self.m.cos_vectors([1, 1, 1], [1, 1, 1])
        rounded = round(value, 5)
        self.assertEqual(rounded, float(1))
class TestMetrics(unittest.TestCase):
    """Single combined check of the Euclidean and cosine helpers of ``Metrics``."""

    def setUp(self):
        # A new Metrics instance is built before the test runs.
        self.m = Metrics()

    def test_metrics(self):
        # Euclidean: (1, 1) -> (4, 5) is a 3-4-5 triangle.
        origin = [1, 1]
        target = [4, 5]
        distance = self.m.euclid_vectors(origin, target)
        self.assertEqual(distance, 5)

        # Cosine of two identical vectors, rounded to five decimals.
        identical = self.m.cos_vectors([1, 1, 1], [1, 1, 1])
        self.assertEqual(round(identical, 5), float(1))

        # Cosine of two vectors with no shared non-zero position.
        disjoint = self.m.cos_vectors([1, 0, 1], [0, 1, 0])
        self.assertEqual(round(disjoint, 5), float(0))
class TestMetrics(unittest.TestCase):
    """Exercises ``Metrics.euclid_vectors`` and ``Metrics.cos_vectors`` in one test."""

    def setUp(self):
        self.m = Metrics()

    def test_metrics(self):
        # Distance check first, exactly as before: a failure here stops the test.
        euclid = self.m.euclid_vectors([1, 1], [4, 5])
        self.assertEqual(euclid, 5)

        # (left vector, right vector, expected cosine rounded to 5 decimals),
        # evaluated in this order.
        cosine_cases = (
            ([1, 1, 1], [1, 1, 1], float(1)),
            ([1, 0, 1], [0, 1, 0], float(0)),
        )
        for left, right, expected in cosine_cases:
            cosine = self.m.cos_vectors(left, right)
            self.assertEqual(round(cosine, 5), expected)
class QA:
    """Answer questions by finding the closest stored question.

    Question/answer pairs are loaded from ``self.file_name`` by
    ``readfile``. ``ask`` turns a query into a vector and prints the
    answer of the stored document with the smallest distance reported by
    ``Metrics.euclid_vectors``.
    """

    def __init__(self):
        self.file_name = 'qa.txt'   # path of the Q/A source file
        self.qa_list = {}           # qa_id -> {'q': question, 'a': answer}
        self.qa_id = 0              # id of the most recently read question
        self.prep = Preprocessor()
        self.mx = Matrix()
        self.metric = Metrics()

    def randomize(self, a):
        """Overwrite every element of *a*, in place, with a random 0 or 1."""
        for index, _ in enumerate(a):
            a[index] = random.randint(0, 1)

    def readfile(self):
        """Load question/answer pairs from ``self.file_name``.

        Each line is stripped, lower-cased and split on ``':'``. Only
        lines that split into exactly two parts are used; the first part
        selects the kind of line:

        * ``q`` starts a new entry and registers the question's n-gram
          terms with the matrix.
        * ``a`` stores the answer for the most recently read question.
          An answer that appears before any question is skipped instead
          of raising ``KeyError``.
        """
        # The context manager closes the file even if a line fails to process.
        with open(self.file_name, 'r') as fd:
            for raw_line in fd:
                parts = raw_line.strip().lower().split(':')
                if len(parts) != 2:
                    continue
                tag, text = parts
                if tag == 'q':
                    self.qa_id += 1
                    self.qa_list[self.qa_id] = {'q': text, 'a': ''}
                    terms = self.prep.ngram_tokenizer(text=text)
                    self.mx.add_doc(doc_id=self.qa_id,
                                    doc_terms=terms,
                                    frequency=True,
                                    do_padding=True)
                elif tag == 'a' and self.qa_id in self.qa_list:
                    self.qa_list[self.qa_id]['a'] = text

    def ask(self, q=''):
        """Print the stored answer whose question is closest to *q*.

        Raises:
            KeyError: if no document is selected (for example
                ``self.mx.docs`` is empty), because the lookup then uses
                id 0, which ``readfile`` never assigns.
        """
        q_id = 0
        # Infinity rather than a large constant, so the nearest document
        # is always chosen however far away it is.
        q_distance = float('inf')
        terms = self.prep.ngram_tokenizer(text=q)
        q_vector = self.mx.query_to_vector(terms, frequency=False)
        if sum(q_vector) == 0:
            # The query vector sums to zero: replace it with random 0/1 values.
            self.randomize(q_vector)
        for doc in self.mx.docs:
            distance = self.metric.euclid_vectors(doc['terms'], q_vector)
            if distance < q_distance:
                q_distance = distance
                q_id = doc['id']
        # Single-argument form prints the same text on Python 2 and 3.
        print('Tarek: %s' % (self.qa_list[q_id]['a'],))