예제 #1
0
파일: qa.py 프로젝트: GersonBastos/irlib
class QA:
    """Minimal question/answer lookup built on a term-document matrix.

    Questions read from ``self.file_name`` are tokenized with
    ``self.prep.ngram_tokenizer`` and stored as documents in ``self.mx``.
    ``ask`` prints the answer attached to the stored question whose term
    vector has the smallest ``self.metric.euclid_vectors`` distance to the
    query vector.
    """

    def __init__(self):
        self.file_name = 'qa.txt'
        # Maps question id -> {'q': question text, 'a': answer text}.
        self.qa_list = {}
        # Id of the most recently read question; 0 means none read yet.
        self.qa_id = 0
        self.prep = Preprocessor()
        self.mx = Matrix()
        self.metric = Metrics()

    def randomize(self, a):
        """Overwrite every element of the sequence *a*, in place, with a
        random 0 or 1."""
        for i, _ in enumerate(a):
            a[i] = random.randint(0, 1)

    def readfile(self):
        """Load questions and answers from ``self.file_name``.

        Each line is stripped, lower-cased and split on ``':'``. Only lines
        that split into exactly two parts are used, so a line containing
        more than one colon is skipped. A ``q`` line starts a new entry and
        adds its tokens to the matrix; an ``a`` line sets the answer of the
        most recently read question. An ``a`` line that appears before any
        ``q`` line is ignored instead of raising ``KeyError``.
        """
        # The context manager closes the file even if tokenizing or
        # add_doc raises part-way through.
        with open(self.file_name, 'r') as fd:
            for raw_line in fd:
                parts = raw_line.strip().lower().split(':')
                if len(parts) != 2:
                    continue
                tag, text = parts
                if tag == 'q':
                    self.qa_id += 1
                    self.qa_list[self.qa_id] = {'q': text, 'a': ''}
                    terms = self.prep.ngram_tokenizer(text=text)
                    self.mx.add_doc(doc_id=self.qa_id, doc_terms=terms,
                                    frequency=True, do_padding=True)
                elif tag == 'a' and self.qa_id in self.qa_list:
                    # Guard: there must be a question to attach this to.
                    self.qa_list[self.qa_id]['a'] = text

    def ask(self, q=''):
        """Print the answer of the stored question closest to *q*.

        The query is tokenized and turned into a vector with
        ``self.mx.query_to_vector``. If that vector sums to zero it is
        filled with random 0/1 values before comparison, so some answer is
        still chosen.

        Raises:
            KeyError: if no document was selected (for example, the matrix
                holds no documents), because id 0 is not in ``qa_list``.
        """
        q_id = 0
        # Start from infinity so any finite distance wins; a fixed cap
        # would reject every document lying farther away than the cap.
        q_distance = float('inf')

        terms = self.prep.ngram_tokenizer(text=q)
        q_vector = self.mx.query_to_vector(terms, frequency=False)

        if sum(q_vector) == 0:
            self.randomize(q_vector)

        for doc in self.mx.docs:
            distance = self.metric.euclid_vectors(doc['terms'], q_vector)
            if distance < q_distance:
                q_distance = distance
                q_id = doc['id']

        # One parenthesised string prints identically on Python 2 and 3.
        print('Tarek: %s' % self.qa_list[q_id]['a'])
예제 #2
0
class TestMatrix(TestCase):
    """Tests for ``Matrix``: adding documents, padding, meta data, unique
    ids, lookup by id, query vectors, tf-idf and white/black lists."""

    def setUp(self):
        # Every test gets its own empty matrix.
        self.m = Matrix()

    def _email_matrix(self, sentences):
        """Return a new Matrix with one padded, frequency-counted 'Email'
        document per sentence, using ids '0', '1', ... in order."""
        matrix = Matrix()
        for position, sentence in enumerate(sentences):
            matrix.add_doc(doc_id=str(position),
                           doc_class='Email',
                           doc_terms=sentence.split(),
                           do_padding=True,
                           frequency=True)
        return matrix

    def test_add_doc(self):
        """add_doc handles presence-only, frequency and padded documents."""
        self.assertEqual(len(self.m), 0)

        # Presence only: the repeated 'buy' still yields a 1.
        presence_terms = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt', doc_class='Spam',
                       doc_terms=presence_terms, frequency=False)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later'])
        self.assertEqual(self.m.docs[0]['terms'], [1, 1, 1, 1])

        # Frequencies: 'buy' occurs twice, the new term 'today' is appended.
        counted_terms = ['buy', 'today', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt', doc_class='Spam',
                       doc_terms=counted_terms, frequency=True)
        self.assertEqual(self.m.terms,
                         ['buy', 'now', 'or', 'later', 'today'])
        self.assertEqual(self.m.docs[1]['terms'], [2, 0, 1, 1, 1])

        # Padding: the first document must become as wide as the term list.
        padded_terms = ['buy', 'now']
        self.m.add_doc(doc_id='file_spam.txt', doc_class='Ham',
                       doc_terms=padded_terms, frequency=True,
                       do_padding=True)
        vocabulary_size = len(self.m.terms)
        self.assertEqual(vocabulary_size, len(self.m.docs[0]['terms']))
        self.assertEqual(len(self.m), 3)
        self.assertTrue('buy' in self.m)
        self.assertFalse('shopping' in self.m)

    def test_add_doc_empty(self):
        """A document without terms is rejected with ValueError."""
        self.assertRaises(ValueError, self.m.add_doc,
                          doc_id='doc1', doc_class='Spam', doc_terms=[])

    def test_meta_data(self):
        """Keys passed through meta_data are readable on the stored doc."""
        mx = Matrix()
        for position, text in enumerate(['hello', 'world']):
            extra = {
                'original_text': text,
                'original_text_len': len(text)
            }
            mx.add_doc(doc_id=str(position), doc_class='Email',
                       doc_terms=text.split(), do_padding=True,
                       frequency=True, meta_data=extra)
        second = mx.docs[1]
        self.assertEqual(second['original_text'], 'world')
        self.assertEqual(second['original_text_len'], 5)

    def test_docs_unique_ids(self):
        """With unique_ids=True a repeated id does not grow the matrix."""
        mx = self._email_matrix(['hello', 'how are you', 'fine thank you'])
        mx.add_doc(doc_id='1', doc_class='Email',
                   doc_terms='goodbye'.split(), do_padding=True,
                   frequency=True, unique_ids=True)
        self.assertEqual(len(mx), 3)

    def test_get_doc_by_id(self):
        """docs.index(doc_id) gives the position of that document."""
        mx = self._email_matrix(['hello', 'how are you', 'fine thank you'])
        position = mx.docs.index('1')
        self.assertEqual(mx.docs[position]['id'], '1')

    def test_query_alignment(self):
        """Query vectors follow the matrix term order; unknown terms are
        left out."""
        self.m.add_doc(doc_id='file_spam.txt', doc_class='Spam',
                       doc_terms=['buy', 'now', 'or', 'buy', 'later'],
                       frequency=False)
        aligned = self.m.query_to_vector(['best', 'buy'], frequency=False)
        self.assertEqual(aligned, [1, 0, 0, 0])

    def test_tf_idf(self):
        """tf_idf(log_base=2) reweights the first document as expected."""
        corpus = [
            ('doc1', ['new', 'york', 'times']),
            ('doc2', ['new', 'york', 'post']),
            ('doc3', ['los', 'angeles', 'times']),
        ]
        for name, words in corpus:
            self.m.add_doc(doc_id=name, doc_class='Spam', doc_terms=words,
                           do_padding=True, frequency=True)
        self.m.tf_idf(log_base=2)
        # Round so the float weights compare equal to the literals below.
        rounded = [round(weight, 3) for weight in self.m.docs[0]['terms']]
        self.assertEqual(rounded, [0.585, 0.585, 0.585, 0, 0, 0])

    def test_white_and_black_lists(self):
        """Only the whitelisted term ends up in the vocabulary."""
        mx = Matrix(whitelist=['test'], blacklist=['this', 'is', 'a'])
        mx.add_doc(doc_id='doc1', doc_class='TestClass',
                   doc_terms=['this', 'is', 'a', 'new', 'test'],
                   do_padding=True, frequency=True)
        self.assertItemsEqual(mx.vocabulary(), ['test'])
예제 #3
0
class TestMatrix(TestCase):
    """Exercises the public behaviour of ``Matrix`` one feature per test."""

    def setUp(self):
        # A fresh matrix shared by the tests that use self.m.
        self.m = Matrix()

    def test_add_doc(self):
        """Add three documents: presence-only, counted, then padded."""
        self.assertEqual(len(self.m), 0)

        # frequency=False: each distinct term is recorded as 1.
        spam_terms = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Spam',
                       doc_terms=spam_terms,
                       frequency=False)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later'])
        self.assertEqual(self.m.docs[0]['terms'], [1, 1, 1, 1])

        # frequency=True: repeated terms are counted.
        more_spam_terms = ['buy', 'today', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Spam',
                       doc_terms=more_spam_terms,
                       frequency=True)
        expected_vocabulary = ['buy', 'now', 'or', 'later', 'today']
        self.assertEqual(self.m.terms, expected_vocabulary)
        self.assertEqual(self.m.docs[1]['terms'], [2, 0, 1, 1, 1])

        # do_padding=True: earlier rows are widened to the vocabulary size.
        ham_terms = ['buy', 'now']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Ham',
                       doc_terms=ham_terms,
                       frequency=True,
                       do_padding=True)
        first_row = self.m.docs[0]['terms']
        self.assertEqual(len(self.m.terms), len(first_row))
        self.assertEqual(len(self.m), 3)
        self.assertIn('buy', self.m)
        self.assertNotIn('shopping', self.m)

    def test_add_doc_empty(self):
        """Adding a document with no terms raises ValueError."""
        no_terms = []
        with self.assertRaises(ValueError):
            self.m.add_doc(doc_id='doc1',
                           doc_class='Spam',
                           doc_terms=no_terms)

    def test_meta_data(self):
        """meta_data entries are stored on the document itself."""
        mx = Matrix()
        for idx, word in enumerate(['hello', 'world']):
            mx.add_doc(doc_id=str(idx),
                       doc_class='Email',
                       doc_terms=word.split(),
                       do_padding=True,
                       frequency=True,
                       meta_data={
                           'original_text': word,
                           'original_text_len': len(word)
                       })
        stored = mx.docs[1]
        self.assertEqual(stored['original_text'], 'world')
        self.assertEqual(stored['original_text_len'], 5)

    def test_docs_unique_ids(self):
        """Re-adding id '1' with unique_ids=True keeps the count at 3."""
        mx = Matrix()
        messages = ['hello', 'how are you', 'fine thank you']
        for idx, message in enumerate(messages):
            mx.add_doc(doc_id=str(idx),
                       doc_class='Email',
                       doc_terms=message.split(),
                       do_padding=True,
                       frequency=True)
        mx.add_doc(doc_id='1',
                   doc_class='Email',
                   doc_terms='goodbye'.split(),
                   do_padding=True,
                   frequency=True,
                   unique_ids=True)
        self.assertEqual(len(mx), 3)

    def test_get_doc_by_id(self):
        """docs.index('1') locates the document whose id is '1'."""
        mx = Matrix()
        messages = ['hello', 'how are you', 'fine thank you']
        for idx, message in enumerate(messages):
            mx.add_doc(doc_id=str(idx),
                       doc_class='Email',
                       doc_terms=message.split(),
                       do_padding=True,
                       frequency=True)
        found_at = mx.docs.index('1')
        self.assertEqual(mx.docs[found_at]['id'], '1')

    def test_query_alignment(self):
        """A query is mapped onto the matrix's existing term columns."""
        known_terms = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Spam',
                       doc_terms=known_terms,
                       frequency=False)
        query = ['best', 'buy']
        q_vector = self.m.query_to_vector(query, frequency=False)
        self.assertEqual(q_vector, [1, 0, 0, 0])

    def test_tf_idf(self):
        """Check the tf-idf weights of the first of three documents."""
        self.m.add_doc(doc_id='doc1',
                       doc_class='Spam',
                       doc_terms=['new', 'york', 'times'],
                       do_padding=True,
                       frequency=True)
        self.m.add_doc(doc_id='doc2',
                       doc_class='Spam',
                       doc_terms=['new', 'york', 'post'],
                       do_padding=True,
                       frequency=True)
        self.m.add_doc(doc_id='doc3',
                       doc_class='Spam',
                       doc_terms=['los', 'angeles', 'times'],
                       do_padding=True,
                       frequency=True)
        self.m.tf_idf(log_base=2)
        # Compare at three decimals to avoid float noise.
        actual = [round(value, 3) for value in self.m.docs[0]['terms']]
        expected = [0.585, 0.585, 0.585, 0, 0, 0]
        self.assertEqual(actual, expected)

    def test_white_and_black_lists(self):
        """vocabulary() contains just the whitelisted term 'test'."""
        allowed = ['test']
        blocked = ['this', 'is', 'a']
        mx = Matrix(whitelist=allowed, blacklist=blocked)
        mx.add_doc(doc_id='doc1',
                   doc_class='TestClass',
                   doc_terms=['this', 'is', 'a', 'new', 'test'],
                   do_padding=True,
                   frequency=True)
        self.assertItemsEqual(mx.vocabulary(), ['test'])