예제 #1
0
    def __validateResponse(self, response, queryTerms):
        """Tokenize the HTML of a 200 response, index it into an irlib
        Matrix, and print a Texttable of per-term frequencies for terms
        that appear in *queryTerms* (compared case-insensitively).

        response   -- requests-style response (.status_code, .text, .url)
        queryTerms -- whitespace-separated query string
        Non-200 responses only get a status line printed.
        """
        from bs4 import BeautifulSoup

        if response.status_code == 200:
            # NOTE(review): no explicit parser passed to BeautifulSoup;
            # newer bs4 versions warn and pick a platform default -- confirm.
            soup = BeautifulSoup(response.text)
            from irlib.preprocessor import Preprocessor
            from irlib.matrix import Matrix
            from irlib.metrics import Metrics
            prep = Preprocessor()
            mx = Matrix()
            metric = Metrics()  # instantiated but never used below
            terms = prep.ngram_tokenizer(text=soup.get_text())
            # Index the page under its URL as the document id.
            mx.add_doc( doc_id=response.url,doc_terms=terms,frequency=True,do_padding=True)
            cnt = Counter()

            # Manual frequency count over the tokenized terms.
            for word in terms:
                cnt[word] += 1
            # Two-column layout: term | frequency.
            table = Texttable()
            table.set_cols_align(["l", "l"])
            table.set_cols_valign(["m", "m"])
            table.set_cols_width([40,55])

            rows = [["Term", "Frequency"]]
            # Most frequent first; keep only words present in the query.
            for word in sorted(cnt, key=cnt.get, reverse=True):
                if word.lower() in queryTerms.lower().split():
                    rows.append([word, cnt[word]])
            table.add_rows(rows)
            print table.draw() + "\n"

        else:
            print "[-] Response for %s is %s " %(response.url, response.status_code)
예제 #2
0
    def __validateResponse(self, response, queryTerms):
        """Index the HTML text of a 200 response into an irlib Matrix and
        print a Texttable of frequencies for the terms present in
        *queryTerms* (case-insensitive).  Non-200: print a status line.
        """
        from bs4 import BeautifulSoup

        if response.status_code == 200:
            # NOTE(review): BeautifulSoup called without an explicit parser;
            # newer bs4 versions warn and pick a platform-dependent default.
            soup = BeautifulSoup(response.text)
            from irlib.preprocessor import Preprocessor
            from irlib.matrix import Matrix
            from irlib.metrics import Metrics
            prep = Preprocessor()
            mx = Matrix()
            metric = Metrics()  # instantiated but never used below
            terms = prep.ngram_tokenizer(text=soup.get_text())
            # Index the page under its URL as the document id.
            mx.add_doc(doc_id=response.url,
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)
            cnt = Counter()

            # Manual frequency count over the tokenized terms.
            for word in terms:
                cnt[word] += 1
            # Two-column layout: term | frequency.
            table = Texttable()
            table.set_cols_align(["l", "l"])
            table.set_cols_valign(["m", "m"])
            table.set_cols_width([40, 55])

            rows = [["Term", "Frequency"]]
            # Most frequent first; keep only words present in the query.
            for word in sorted(cnt, key=cnt.get, reverse=True):
                if word.lower() in queryTerms.lower().split():
                    rows.append([word, cnt[word]])
            table.add_rows(rows)
            print table.draw() + "\n"

        else:
            print "[-] Response for %s is %s " % (response.url,
                                                  response.status_code)
예제 #3
0
    def validateResponse(self, response, queryTerms):
        """Index a 200 response's text into an irlib Matrix and print a
        PrettyTable of the frequency of each term found in *queryTerms*.
        Non-200 responses only get a status line printed.
        """
        from bs4 import BeautifulSoup

        if response.status_code == 200:
            # NOTE(review): no explicit parser given to BeautifulSoup.
            soup = BeautifulSoup(response.text)
            from irlib.preprocessor import Preprocessor
            from irlib.matrix import Matrix
            from irlib.metrics import Metrics
            prep = Preprocessor()
            mx = Matrix()
            metric = Metrics()  # unused below (only referenced in dead code)
            terms = prep.ngram_tokenizer(text=soup.get_text())
            mx.add_doc( doc_id=response.url,doc_terms=terms,frequency=True,do_padding=True)
            '''for doc in mx.docs:
                distance = metric.euclid_vectors(doc['terms'], q_vector)
                print distance
            '''
            cnt = Counter()
            for word in terms:
                cnt[word] += 1
            tableTerms = PrettyTable(["Term", "Frequency"])
            # encode('ascii') raises UnicodeEncodeError on non-ASCII terms.
            for word in sorted(cnt, key=cnt.get, reverse=True):
                if word.encode('ascii').lower() in queryTerms.encode('ascii').lower().split():
                    tableTerms.add_row([word, cnt[word]])
            print tableTerms
        else:
            print "[-] Response for %s is %s " %(response.url, response.status_code)
예제 #4
0
    def validateResponse(self, response, queryTerms):
        """Index a 200 response's text into an irlib Matrix and print a
        PrettyTable of the frequency of each query term found in the page.
        """
        from bs4 import BeautifulSoup

        if response.status_code == 200:
            # NOTE(review): no explicit parser given to BeautifulSoup.
            soup = BeautifulSoup(response.text)
            from irlib.preprocessor import Preprocessor
            from irlib.matrix import Matrix
            from irlib.metrics import Metrics
            prep = Preprocessor()
            mx = Matrix()
            metric = Metrics()  # unused below (only referenced in dead code)
            terms = prep.ngram_tokenizer(text=soup.get_text())
            mx.add_doc(doc_id=response.url,
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)
            '''for doc in mx.docs:
                distance = metric.euclid_vectors(doc['terms'], q_vector)
                print distance
            '''
            cnt = Counter()
            for word in terms:
                cnt[word] += 1
            tableTerms = PrettyTable(["Term", "Frequency"])
            # encode('ascii') raises UnicodeEncodeError on non-ASCII input.
            for word in sorted(cnt, key=cnt.get, reverse=True):
                if word.encode('ascii').lower() in queryTerms.encode(
                        'ascii').lower().split():
                    tableTerms.add_row([word, cnt[word]])
            print tableTerms
        else:
            print "[-] Response for %s is %s " % (response.url,
                                                  response.status_code)
예제 #5
0
 def test_get_doc_by_id(self):
     """A document added under a string id is retrievable by that id."""
     matrix = Matrix()
     samples = ['hello', 'how are you', 'fine thank you']
     for idx, text in enumerate(samples):
         matrix.add_doc(doc_id=str(idx),
                        doc_class='Email',
                        doc_terms=text.split(),
                        frequency=True,
                        do_padding=True)
     position = matrix.docs.index('1')
     self.assertEqual(matrix.docs[position]['id'], '1')
예제 #6
0
 def test_get_doc_by_id(self):
     """Looking up a doc id via docs.index() yields the matching record."""
     mat = Matrix()
     for doc_id, sentence in enumerate(('hello', 'how are you',
                                        'fine thank you')):
         mat.add_doc(doc_id=str(doc_id), doc_class='Email',
                     doc_terms=sentence.split(), do_padding=True,
                     frequency=True)
     idx = mat.docs.index('1')
     self.assertEqual(mat.docs[idx]['id'], '1')
예제 #7
0
파일: qa.py 프로젝트: GersonBastos/irlib
class QA:
    
    def __init__(self):
        self.file_name = 'qa.txt'
        self.qa_list = {}
        self.qa_id = 0
        self.prep = Preprocessor()
        self.mx = Matrix()
        self.metric = Metrics()
        
    def randomize(self, a):
        for i in range(len(a)):
            a[i] = random.randint(0,1)

    def readfile(self):

        fd = open(self.file_name,'r')
        for line in fd.readlines():
            line = line.strip().lower().split(':')
            if len(line) != 2:  
                continue
            elif line[0] == 'q':
                q_line = ' '.join(line[1:])
                self.qa_id += 1
                self.qa_list[self.qa_id] = {'q': q_line, 'a': ''}
                terms = self.prep.ngram_tokenizer(text=q_line)
                self.mx.add_doc(doc_id=self.qa_id, doc_terms=terms, 
                        frequency=True, do_padding=True)
            elif line[0] == 'a': 
                a_line = ' '.join(line[1:])
                self.qa_list[self.qa_id]['a'] = a_line
        
        #print 'Number of read questions and answers:', len(self.mx.docs)
        #print 'Number of read terms', len(self.mx.terms)
               
    def ask(self, q=''):

        q_id = 0
        q_distance = 99999

        terms = self.prep.ngram_tokenizer(text=q)
        q_vector = self.mx.query_to_vector(terms, frequency=False)

        if sum(q_vector) == 0:
            self.randomize(q_vector)

        for doc in self.mx.docs:
            distance = self.metric.euclid_vectors(doc['terms'], q_vector)
            if distance < q_distance:
                q_distance = distance
                q_id = doc['id']
    
        print 'Tarek:', self.qa_list[q_id]['a']
예제 #8
0
 def test_white_and_black_lists(self):
     """Blacklisted terms are dropped; only whitelisted ones survive."""
     mx = Matrix(whitelist=['test'], blacklist=['this', 'is', 'a'])
     mx.add_doc(doc_id='doc1',
                doc_class='TestClass',
                doc_terms=['this', 'is', 'a', 'new', 'test'],
                do_padding=True,
                frequency=True)
     self.assertItemsEqual(mx.vocabulary(), ['test'])
예제 #9
0
 def test_white_and_black_lists(self):
     """vocabulary() keeps whitelisted terms and drops blacklisted ones."""
     terms = ['this', 'is', 'a', 'new', 'test']
     matrix = Matrix(whitelist=['test'], blacklist=terms[:3])
     matrix.add_doc(doc_id='doc1', doc_class='TestClass', doc_terms=terms,
                    do_padding=True, frequency=True)
     expected = ['test']
     self.assertItemsEqual(matrix.vocabulary(), expected)
예제 #10
0
 def test_meta_data(self):
     """meta_data keys are exposed directly on the stored doc record."""
     matrix = Matrix()
     for idx, text in enumerate(['hello', 'world']):
         matrix.add_doc(doc_id=str(idx),
                        doc_class='Email',
                        doc_terms=text.split(),
                        do_padding=True,
                        frequency=True,
                        meta_data={'original_text': text,
                                   'original_text_len': len(text)})
     self.assertEqual(matrix.docs[1]['original_text'], 'world')
     self.assertEqual(matrix.docs[1]['original_text_len'], 5)
예제 #11
0
 def test_meta_data(self):
     """Per-document meta_data is copied onto the document entry."""
     mx = Matrix()
     words = ('hello', 'world')
     for i, w in enumerate(words):
         meta = {'original_text': w, 'original_text_len': len(w)}
         mx.add_doc(doc_id=str(i), doc_class='Email', doc_terms=w.split(),
                    do_padding=True, frequency=True, meta_data=meta)
     self.assertEqual(mx.docs[1]['original_text'], 'world')
     self.assertEqual(mx.docs[1]['original_text_len'], 5)
예제 #12
0
 def test_docs_unique_ids(self):
     """With unique_ids=True a duplicate doc_id is not added again."""
     matrix = Matrix()
     for num, phrase in enumerate(['hello', 'how are you',
                                   'fine thank you']):
         matrix.add_doc(doc_id=str(num),
                        doc_class='Email',
                        doc_terms=phrase.split(),
                        do_padding=True,
                        frequency=True)
     # Re-adding id '1' must be rejected, leaving three documents.
     matrix.add_doc(doc_id='1',
                    doc_class='Email',
                    doc_terms='goodbye'.split(),
                    do_padding=True,
                    frequency=True,
                    unique_ids=True)
     self.assertEqual(len(matrix), 3)
예제 #13
0
파일: qa.py 프로젝트: GersonBastos/irlib
 def __init__(self):
     """Set up the preprocessing pipeline and an empty QA store."""
     self.qa_list = {}        # qa_id -> {'q': question, 'a': answer}
     self.qa_id = 0
     self.file_name = 'qa.txt'
     self.prep = Preprocessor()
     self.mx = Matrix()
     self.metric = Metrics()
예제 #14
0
def readfiles(fold_path='all-folds/fold1/'):

    prep = Preprocessor()
    mx = Matrix()

    files = os.listdir(fold_path)
    for filename in files:
        fd = open('%s/%s' % (fold_path, filename), 'r')
        file_data = fd.read()
        terms = prep.ngram_tokenizer(text=file_data)
        mx.add_doc(doc_id=filename, doc_terms=terms, 
                frequency=True, do_padding=True)


    print 'Number of read documents:', len(mx.docs)
    print 'Number of read terms', len(mx.terms)
    #print mx.terms[0:5], mx.terms[-5:-1]
    print mx.terms
    print mx.docs
예제 #15
0
파일: search.py 프로젝트: vatsrahul/irlib
def readfiles(fold_path='all-folds/fold1/'):

    prep = Preprocessor()
    mx = Matrix()

    files = os.listdir(fold_path)
    for filename in files:
        fd = open('%s/%s' % (fold_path, filename), 'r')
        file_data = fd.read()
        terms = prep.ngram_tokenizer(text=file_data)
        mx.add_doc(doc_id=filename,
                   doc_terms=terms,
                   frequency=True,
                   do_padding=True)

    print 'Number of read documents:', len(mx.docs)
    print 'Number of read terms', len(mx.terms)
    #print mx.terms[0:5], mx.terms[-5:-1]
    print mx.terms
    print mx.docs
예제 #16
0
파일: tests.py 프로젝트: ljc0753/irlib
class TestMatrix(unittest.TestCase):
    """Exercise Matrix.add_doc: binary vs. frequency counts and padding."""

    def setUp(self):
        # Fresh matrix per test so term/doc state never leaks between tests.
        self.m = Matrix()

    def test_add_doc(self):
        # Try without frequency: duplicates collapse to a binary 1.
        self.assertEqual(len(self.m), 0)
        doc1_terms = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Spam',
                       doc_terms=doc1_terms,
                       frequency=False)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later'])
        self.assertEqual(self.m.docs[0]['terms'], [1, 1, 1, 1])
        # Now try with frequency: repeated 'buy' counts twice.
        doc2_terms = ['buy', 'today', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Spam',
                       doc_terms=doc2_terms,
                       frequency=True)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later', 'today'])
        self.assertEqual(self.m.docs[1]['terms'], [2, 0, 1, 1, 1])
        # Now check padding keeps earlier vectors aligned with new terms.
        doc2_terms = ['buy', 'now']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Ham',
                       doc_terms=doc2_terms,
                       frequency=True,
                       do_padding=True)
        # Fix: stray debug print removed; the other copies of this test
        # keep it commented out, and it polluted test output.
        self.assertEqual(len(self.m.terms), len(self.m.docs[0]['terms']))
        self.assertEqual(len(self.m), 3)
        self.assertEqual('buy' in self.m, True)
        self.assertEqual('shopping' in self.m, False)
예제 #17
0
파일: tests.py 프로젝트: GersonBastos/irlib
class TestMatrix(unittest.TestCase):
    """Behaviour of Matrix.add_doc in binary, frequency, and padded modes."""

    def setUp(self):
        self.m = Matrix()

    def test_add_doc(self):
        # Binary mode: duplicate terms collapse to a single count of 1.
        self.assertEqual(len(self.m), 0)
        first = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt', doc_class='Spam',
                       doc_terms=first, frequency=False)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later'])
        self.assertEqual(self.m.docs[0]['terms'], [1, 1, 1, 1])

        # Frequency mode: 'buy' appears twice, absent 'now' is zero.
        second = ['buy', 'today', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt', doc_class='Spam',
                       doc_terms=second, frequency=True)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later', 'today'])
        self.assertEqual(self.m.docs[1]['terms'], [2, 0, 1, 1, 1])

        # Padding: earlier vectors grow to match the vocabulary size.
        third = ['buy', 'now']
        self.m.add_doc(doc_id='file_spam.txt', doc_class='Ham',
                       doc_terms=third, frequency=True, do_padding=True)
        self.assertEqual(len(self.m.terms), len(self.m.docs[0]['terms']))
        self.assertEqual(len(self.m), 3)
        self.assertEqual('buy' in self.m, True)
        self.assertEqual('shopping' in self.m, False)
예제 #18
0
 def test_docs_unique_ids(self):
     """A duplicate doc_id is rejected when unique_ids=True."""
     mx = Matrix()
     for i, sentence in enumerate(['hello', 'how are you',
                                   'fine thank you']):
         mx.add_doc(doc_id=str(i), doc_class='Email',
                    doc_terms=sentence.split(), do_padding=True,
                    frequency=True)
     mx.add_doc(doc_id='1', doc_class='Email', doc_terms='goodbye'.split(),
                do_padding=True, frequency=True, unique_ids=True)
     self.assertEqual(len(mx), 3)
예제 #19
0
class TestMatrix(TestCase):
    """Unit tests for irlib's Matrix term/document index.

    Several tests build on the exact order Matrix appends terms and pads
    vectors, so assertion values below are order-sensitive.
    """

    def setUp(self):
        # Shared fixture; some tests ignore it and build their own Matrix.
        self.m = Matrix()
    
    def test_add_doc(self):
        """Binary counts, then frequency counts, then vector padding."""
        # Try without frequency
        self.assertEqual(len(self.m),0)
        doc1_terms = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc( doc_id = 'file_spam.txt', 
                        doc_class='Spam', 
                        doc_terms= doc1_terms,
                        frequency=False)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later'])
        self.assertEqual(self.m.docs[0]['terms'], [1,1,1,1])
   
        # Now try with frequency
        doc2_terms = ['buy', 'today', 'or', 'buy', 'later']
        self.m.add_doc( doc_id = 'file_spam.txt', 
                        doc_class='Spam', 
                        doc_terms= doc2_terms,
                        frequency=True)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later', 'today'])
        self.assertEqual(self.m.docs[1]['terms'], [2,0,1,1,1])

        # Now let's see if padding is working
        doc2_terms = ['buy', 'now']
        self.m.add_doc( doc_id = 'file_spam.txt', 
                        doc_class='Ham', 
                        doc_terms= doc2_terms,
                        frequency=True,
                        do_padding=True)
        #print self.m.terms, self.m.docs[0]['terms']
        self.assertEqual(len(self.m.terms), len(self.m.docs[0]['terms'])) 
        self.assertEqual(len(self.m),3)
        self.assertEqual('buy' in self.m, True)
        self.assertEqual('shopping' in self.m, False)

    def test_add_doc_empty(self):
        """add_doc must reject an empty term list with ValueError."""
        doc1_terms = []
        with self.assertRaises(ValueError):
            self.m.add_doc( doc_id = 'doc1', 
                            doc_class='Spam', 
                            doc_terms= doc1_terms)

    def test_meta_data(self):
        """meta_data entries are readable straight off the doc record."""
        mx = Matrix()
        for i,s in enumerate(['hello', 'world']):
            mx.add_doc( doc_id = str(i), 
                        doc_class='Email', 
                        doc_terms= s.split(),
                        do_padding=True,
                        frequency=True,
                        meta_data={
                            'original_text': s,
                            'original_text_len': len(s)
                        })
        self.assertEqual(mx.docs[1]['original_text'], 'world')
        self.assertEqual(mx.docs[1]['original_text_len'], 5)

    def test_docs_unique_ids(self):
        """unique_ids=True stops a duplicate doc_id from being re-added."""
        mx = Matrix()
        for i,s in enumerate(['hello', 'how are you', 'fine thank you']):
            mx.add_doc( doc_id = str(i), 
                        doc_class='Email', 
                        doc_terms= s.split(),
                        do_padding=True,
                        frequency=True)
        mx.add_doc(doc_id = '1', 
                   doc_class='Email', 
                   doc_terms= 'goodbye'.split(),
                   do_padding=True,
                   frequency=True,
                   unique_ids=True)
        self.assertEqual(len(mx), 3)

    def test_get_doc_by_id(self):
        """docs.index() maps a doc id to its position in docs."""
        mx = Matrix()
        for i,s in enumerate(['hello', 'how are you', 'fine thank you']):
            mx.add_doc( doc_id = str(i), 
                        doc_class='Email', 
                        doc_terms= s.split(),
                        do_padding=True,
                        frequency=True)
        doc1_id = mx.docs.index('1')
        self.assertEqual(mx.docs[doc1_id]['id'], '1')

    def test_query_alignment(self):
        """Query vectors align with the matrix vocabulary order."""
        doc1_terms = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc( doc_id = 'file_spam.txt', 
                        doc_class='Spam', 
                        doc_terms= doc1_terms,
                        frequency=False)
        # 'best' is out of vocabulary; 'buy' occupies vocabulary slot 0.
        q_vector = self.m.query_to_vector(['best', 'buy'], frequency=False)
        self.assertEqual(q_vector, [1,0,0,0]) 

    def test_tf_idf(self):
        """tf-idf (log base 2) weights for doc1 of a three-doc corpus."""
        doc1_terms = ['new', 'york', 'times']
        self.m.add_doc( doc_id = 'doc1', 
                        doc_class='Spam', 
                        doc_terms= doc1_terms,
                        do_padding=True,
                        frequency=True)
        doc2_terms = ['new', 'york', 'post']
        self.m.add_doc( doc_id = 'doc2', 
                        doc_class='Spam', 
                        doc_terms= doc2_terms,
                        do_padding=True,
                        frequency=True)
        doc3_terms = ['los', 'angeles', 'times']
        self.m.add_doc( doc_id = 'doc3', 
                        doc_class='Spam', 
                        doc_terms= doc3_terms,
                        do_padding=True,
                        frequency=True)
        self.m.tf_idf(log_base=2)
        doc1_tfidf_retval = self.m.docs[0]['terms']
        # Round to 3 places so float noise cannot break the comparison.
        doc1_tfidf_retval = [round(item, 3) for item in doc1_tfidf_retval]
        doc1_tfidf_expval = [0.585, 0.585, 0.585, 0, 0, 0]
        self.assertEqual(doc1_tfidf_retval, doc1_tfidf_expval)

    def test_white_and_black_lists(self):
        """Only whitelisted terms survive; blacklisted ones are dropped."""
        doc_terms = ['this', 'is', 'a', 'new', 'test']
        white_list = ['test']
        black_list = ['this', 'is', 'a']
        mx = Matrix(whitelist=white_list, blacklist=black_list)
        mx.add_doc( doc_id = 'doc1', 
                    doc_class='TestClass', 
                    doc_terms= doc_terms,
                    do_padding=True,
                    frequency=True)
        returned_terms = mx.vocabulary()
        expected_terms = ['test']
        self.assertItemsEqual(returned_terms, expected_terms)
예제 #20
0
파일: tests.py 프로젝트: ljc0753/irlib
 def setUp(self):
     # Fresh Matrix for every test so no state leaks between cases.
     self.m = Matrix()
예제 #21
0
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics
import difflib

# Create module-level instances shared by the functions below:
prep = Preprocessor()
mx = Matrix()        # filled by generateMatrix()
metric = Metrics()   # NOTE(review): unused in this snippet
q_vector = []        # placeholder query vector; never populated here


def generateMatrix():
    fd = open('./content_transfer_data/roto-sent-data.train-ONLYTAGS.src', 'r')
    count = 1
    for line in fd.readlines():
        terms = line.split(' ')
        terms = [x.strip() for x in terms]
        mx.add_doc(doc_id=str(count),
                   doc_terms=terms,
                   frequency=True,
                   do_padding=True)

        count += 1
        if count % 1000 == 0:
            print count

    mx.dump('IRmatrix.train.src.csv', delimiter='\t', header=True)

예제 #22
0
 def setUp(self):
     # Each test gets its own empty Matrix instance.
     self.m = Matrix()
예제 #23
0
class TestMatrix(TestCase):
    """Unit tests for irlib's Matrix term/document index.

    Assertion values are order-sensitive: they rely on the exact order
    Matrix appends terms and pads earlier document vectors.
    """

    def setUp(self):
        # Shared fixture; some tests ignore it and build their own Matrix.
        self.m = Matrix()

    def test_add_doc(self):
        """Binary counts, then frequency counts, then vector padding."""
        # Try without frequency
        self.assertEqual(len(self.m), 0)
        doc1_terms = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Spam',
                       doc_terms=doc1_terms,
                       frequency=False)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later'])
        self.assertEqual(self.m.docs[0]['terms'], [1, 1, 1, 1])

        # Now try with frequency
        doc2_terms = ['buy', 'today', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Spam',
                       doc_terms=doc2_terms,
                       frequency=True)
        self.assertEqual(self.m.terms, ['buy', 'now', 'or', 'later', 'today'])
        self.assertEqual(self.m.docs[1]['terms'], [2, 0, 1, 1, 1])

        # Now let's see if padding is working
        doc2_terms = ['buy', 'now']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Ham',
                       doc_terms=doc2_terms,
                       frequency=True,
                       do_padding=True)
        #print self.m.terms, self.m.docs[0]['terms']
        self.assertEqual(len(self.m.terms), len(self.m.docs[0]['terms']))
        self.assertEqual(len(self.m), 3)
        self.assertEqual('buy' in self.m, True)
        self.assertEqual('shopping' in self.m, False)

    def test_add_doc_empty(self):
        """add_doc must reject an empty term list with ValueError."""
        doc1_terms = []
        with self.assertRaises(ValueError):
            self.m.add_doc(doc_id='doc1',
                           doc_class='Spam',
                           doc_terms=doc1_terms)

    def test_meta_data(self):
        """meta_data entries are readable straight off the doc record."""
        mx = Matrix()
        for i, s in enumerate(['hello', 'world']):
            mx.add_doc(doc_id=str(i),
                       doc_class='Email',
                       doc_terms=s.split(),
                       do_padding=True,
                       frequency=True,
                       meta_data={
                           'original_text': s,
                           'original_text_len': len(s)
                       })
        self.assertEqual(mx.docs[1]['original_text'], 'world')
        self.assertEqual(mx.docs[1]['original_text_len'], 5)

    def test_docs_unique_ids(self):
        """unique_ids=True stops a duplicate doc_id from being re-added."""
        mx = Matrix()
        for i, s in enumerate(['hello', 'how are you', 'fine thank you']):
            mx.add_doc(doc_id=str(i),
                       doc_class='Email',
                       doc_terms=s.split(),
                       do_padding=True,
                       frequency=True)
        mx.add_doc(doc_id='1',
                   doc_class='Email',
                   doc_terms='goodbye'.split(),
                   do_padding=True,
                   frequency=True,
                   unique_ids=True)
        self.assertEqual(len(mx), 3)

    def test_get_doc_by_id(self):
        """docs.index() maps a doc id to its position in docs."""
        mx = Matrix()
        for i, s in enumerate(['hello', 'how are you', 'fine thank you']):
            mx.add_doc(doc_id=str(i),
                       doc_class='Email',
                       doc_terms=s.split(),
                       do_padding=True,
                       frequency=True)
        doc1_id = mx.docs.index('1')
        self.assertEqual(mx.docs[doc1_id]['id'], '1')

    def test_query_alignment(self):
        """Query vectors align with the matrix vocabulary order."""
        doc1_terms = ['buy', 'now', 'or', 'buy', 'later']
        self.m.add_doc(doc_id='file_spam.txt',
                       doc_class='Spam',
                       doc_terms=doc1_terms,
                       frequency=False)
        # 'best' is out of vocabulary; 'buy' occupies vocabulary slot 0.
        q_vector = self.m.query_to_vector(['best', 'buy'], frequency=False)
        self.assertEqual(q_vector, [1, 0, 0, 0])

    def test_tf_idf(self):
        """tf-idf (log base 2) weights for doc1 of a three-doc corpus."""
        doc1_terms = ['new', 'york', 'times']
        self.m.add_doc(doc_id='doc1',
                       doc_class='Spam',
                       doc_terms=doc1_terms,
                       do_padding=True,
                       frequency=True)
        doc2_terms = ['new', 'york', 'post']
        self.m.add_doc(doc_id='doc2',
                       doc_class='Spam',
                       doc_terms=doc2_terms,
                       do_padding=True,
                       frequency=True)
        doc3_terms = ['los', 'angeles', 'times']
        self.m.add_doc(doc_id='doc3',
                       doc_class='Spam',
                       doc_terms=doc3_terms,
                       do_padding=True,
                       frequency=True)
        self.m.tf_idf(log_base=2)
        doc1_tfidf_retval = self.m.docs[0]['terms']
        # Round to 3 places so float noise cannot break the comparison.
        doc1_tfidf_retval = [round(item, 3) for item in doc1_tfidf_retval]
        doc1_tfidf_expval = [0.585, 0.585, 0.585, 0, 0, 0]
        self.assertEqual(doc1_tfidf_retval, doc1_tfidf_expval)

    def test_white_and_black_lists(self):
        """Only whitelisted terms survive; blacklisted ones are dropped."""
        doc_terms = ['this', 'is', 'a', 'new', 'test']
        white_list = ['test']
        black_list = ['this', 'is', 'a']
        mx = Matrix(whitelist=white_list, blacklist=black_list)
        mx.add_doc(doc_id='doc1',
                   doc_class='TestClass',
                   doc_terms=doc_terms,
                   do_padding=True,
                   frequency=True)
        returned_terms = mx.vocabulary()
        expected_terms = ['test']
        self.assertItemsEqual(returned_terms, expected_terms)
예제 #24
0
파일: search.py 프로젝트: Rogfel/buscador
class Search:

    def __init__(self):
        self._mx = Matrix()
        self._prep = Preprocessor(pattern='\W+', lower=True, stem=True)

    def readfiles(self, fold_path='all-folds/fold1/'):
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], fold_path)
        files = os.listdir(abs)
        for filename in files:
            abs_arch = os.path.join(abs, filename)
            fd = open(abs_arch, 'r')
            file_data = fd.read()

            self.createMX(filename, file_data)

        print 'Number of read documents:', len(self._mx.docs)
        print 'Number of read terms', len(self._mx.terms)
        #print mx.terms[0:5], mx.terms[-5:-1]
        '''print mx.terms
        for doc in mx.docs:
            print doc'''
        self.saveMX(self._mx)
        print 'proceso culminado'

    def saveMX(self, mx):
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], "db/matrix.mx")
        filemx = open(abs, 'w')
        serializer = Pickler(filemx)
        serializer.dump(mx)
        print 'matrix salvada'

    def createMX(self, file_id, file_data, lenguaje = 'english'):
        stop = stopwords.words(lenguaje)
        file = file_data.split(" ")
        content = [w for w in file if w.lower() not in stop]
        data = content.__str__()
        terms = self._prep.ngram_tokenizer(text=data)
        if len(terms) > 0:
            self._mx.add_doc(doc_id=file_id, doc_terms=terms,
                        frequency=True, do_padding=True)

    def search(self):
        ruta = os.path.split(sys.argv[0])
        abs = os.path.join(ruta[0], "db/matrix.mx")

        filemx = open(abs, 'r')
        serializer = Unpickler(filemx)
        self._mx = serializer.load()

        cadena = sys.argv
        del cadena[0]
        del cadena[0]
        cade = cadena.__str__()
        cade = cade.lower()
        cad = self._prep.ngram_tokenizer(text=cade)
        resultado = list()
        for doc in self._mx.docs:
            vector = list()
            for q in cad:
                if q in self._mx.terms:
                    pos = self._mx.terms.index(q)
                    vector.append(doc['terms'][pos])
            resultado.append((doc['id'],vector))
        resultado.sort(lambda a,b: self.__Deuclidiana(a[1]) - self.__Deuclidiana(b[1]), reverse = True)
        print resultado

    def __Deuclidiana(self, vector):
        dist = 0
        for v in vector:
            dist+=v**2
        return dist.__int__()


    def main(self):
        #self.readfiles()
        self.search()
예제 #25
0
파일: search.py 프로젝트: Rogfel/buscador
 def __init__(self):
     # Private matrix plus a lower-casing, stemming \W+ tokenizer.
     self._mx = Matrix()
     self._prep = Preprocessor(pattern='\W+', lower=True, stem=True)