예제 #1
0
 def test_passing_stopwords_should_remove_these_words_from_token_list(self):
     """Tokens listed in ``stopwords`` must be left out of the index."""
     index = Index(stopwords=['yes', 'no', ',', '.', '!'])
     index.add_document('coffee', 'Yes, sir! No, Joyce.')
     # Only the two non-stopword tokens are expected, in lowercase.
     expected_index = {
         'sir': set(['coffee']),
         'joyce': set(['coffee']),
     }
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index._index, expected_index)
예제 #2
0
 def test_should_store_tokens_lowercase(self):
     """Mixed-case input must be indexed under lowercase tokens."""
     index = Index()
     index.add_document('doc', 'This IS mY firsT DoCuMeNt')
     expected_tokens = set(['this', 'is', 'my', 'first', 'document'])
     expected_index = {
         'this': set(['doc']),
         'is': set(['doc']),
         'my': set(['doc']),
         'first': set(['doc']),
         'document': set(['doc']),
     }
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index.tokens(), expected_tokens)
     # The internal mapping is copied into a plain dict before comparing.
     self.assertEqual(dict(index._index), expected_index)
예제 #3
0
 def test_calling_method_load_should_retrieve_object_from_pickle_file(self):
     """``Index.load`` must rebuild an index previously saved with ``dump``."""
     # Only the name of the temporary file is needed, so the handle is
     # closed immediately.
     fp = NamedTemporaryFile(delete=False)
     fp.close()
     # NOTE(review): the file is created with delete=False; presumably
     # tearDown removes self.filename - confirm.
     self.filename = fp.name
     index = Index()
     index.add_document('coffee', 'I liked it')
     index.add_document('water', 'I need it')
     index.dump(self.filename)
     retrieved_index = Index.load(self.filename)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(retrieved_index), 2)
     self.assertEqual(set(retrieved_index._index.keys()),
                      set(['i', 'liked', 'need', 'it']))
예제 #4
0
 def test_passing_a_stemmer_should_index_tokens_stemmed(self):
     """Index stemmed tokens when a stemmer is given, raw tokens otherwise."""
     porter_stemmer = PorterStemmer()
     index = Index(stemmer=porter_stemmer)
     index.add_document('coffee', 'I liked it')
     # With the stemmer, 'liked' is expected to be stored as 'like'.
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index._index, {'i': set(['coffee']),
                                     'like': set(['coffee']),
                                     'it': set(['coffee'])})
     # Without a stemmer the token is expected to be stored unchanged.
     index = Index(stemmer=None)
     index.add_document('coffee', 'I liked it')
     self.assertEqual(index._index, {'i': set(['coffee']),
                                     'liked': set(['coffee']),
                                     'it': set(['coffee'])})
예제 #5
0
 def test_calling_method_load_should_retrieve_object_from_pickle_file(self):
     """A dumped index must come back intact through ``Index.load``."""
     # Reserve a file name; the handle itself is not used, so close it.
     fp = NamedTemporaryFile(delete=False)
     fp.close()
     # NOTE(review): delete=False leaves the file on disk; presumably
     # tearDown cleans up self.filename - confirm.
     self.filename = fp.name
     index = Index()
     index.add_document('coffee', 'I liked it')
     index.add_document('water', 'I need it')
     index.dump(self.filename)
     retrieved_index = Index.load(self.filename)
     expected_terms = set(['i', 'liked', 'need', 'it'])
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(retrieved_index), 2)
     self.assertEqual(set(retrieved_index._index.keys()), expected_terms)
예제 #6
0
 def test_should_store_tokens_lowercase(self):
     """Tokens must be lowercased both in ``tokens()`` and in the index."""
     index = Index()
     index.add_document('doc', 'This IS mY firsT DoCuMeNt')
     expected_tokens = set(['this', 'is', 'my', 'first', 'document'])
     expected_index = {
         'this': set(['doc']),
         'is': set(['doc']),
         'my': set(['doc']),
         'first': set(['doc']),
         'document': set(['doc']),
     }
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index.tokens(), expected_tokens)
     # The internal mapping is copied into a plain dict before comparing.
     self.assertEqual(dict(index._index), expected_index)
예제 #7
0
 def test_calling_method_dump_should_pickle_the_index_object(self):
     """``dump`` must write a pickle from which the index can be restored."""
     # Only the name of the temporary file is needed, so the handle is
     # closed immediately.
     fp = NamedTemporaryFile(delete=False)
     fp.close()
     # NOTE(review): the file is created with delete=False; presumably
     # tearDown removes self.filename - confirm.
     self.filename = fp.name
     index = Index()
     index.add_document('coffee', 'I liked it')
     index.add_document('water', 'I need it')
     index.dump(self.filename)
     self.assertTrue(file_exists(self.filename))
     # Pickle data is binary: read it in 'rb' mode and let the context
     # manager close the handle, which was previously leaked.
     with open(self.filename, 'rb') as fp:
         retrieved_index = cPickle.load(fp)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(retrieved_index), 2)
     self.assertEqual(set(retrieved_index._index.keys()),
                      set(['i', 'liked', 'need', 'it']))
예제 #8
0
 def test_calling_method_dump_should_pickle_the_index_object(self):
     """The file written by ``dump`` must unpickle to an equivalent index."""
     # Reserve a file name; the handle itself is not used, so close it.
     fp = NamedTemporaryFile(delete=False)
     fp.close()
     # NOTE(review): delete=False leaves the file on disk; presumably
     # tearDown cleans up self.filename - confirm.
     self.filename = fp.name
     index = Index()
     index.add_document('coffee', 'I liked it')
     index.add_document('water', 'I need it')
     index.dump(self.filename)
     self.assertTrue(file_exists(self.filename))
     # Open in binary mode (pickles are byte streams) inside a ``with``
     # block so the handle is always closed.
     with open(self.filename, 'rb') as pickled:
         retrieved_index = cPickle.load(pickled)
     expected_terms = set(['i', 'liked', 'need', 'it'])
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(retrieved_index), 2)
     self.assertEqual(set(retrieved_index._index.keys()), expected_terms)
예제 #9
0
 def test_should_automatically_index_when_add_documents(self):
     """Adding documents must update tokens and index without extra calls."""
     index = Index()
     index.add_document('test', 'this is my first document')
     index.add_document('test2', 'this is my second document')
     expected_tokens = set(['this', 'is', 'my', 'first', 'second',
                            'document'])
     # Terms shared by both documents map to both names; 'first' and
     # 'second' each map to a single document.
     expected_index = {'this': set(['test', 'test2']),
                       'is': set(['test', 'test2']),
                       'my': set(['test', 'test2']),
                       'first': set(['test']),
                       'second': set(['test2']),
                       'document': set(['test', 'test2'])}
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index.tokens(), expected_tokens)
     self.assertEqual(dict(index._index), expected_index)
예제 #10
0
 def test_should_automatically_index_when_add_documents(self):
     """``add_document`` alone must be enough to populate the index."""
     index = Index()
     index.add_document('test', 'this is my first document')
     index.add_document('test2', 'this is my second document')
     expected_tokens = set(
         ['this', 'is', 'my', 'first', 'second', 'document'])
     # Each term maps to the set of document names that contain it.
     expected_index = {
         'this': set(['test', 'test2']),
         'is': set(['test', 'test2']),
         'my': set(['test', 'test2']),
         'first': set(['test']),
         'second': set(['test2']),
         'document': set(['test', 'test2']),
     }
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index.tokens(), expected_tokens)
     self.assertEqual(dict(index._index), expected_index)
예제 #11
0
 def test_passing_a_stemmer_should_index_tokens_stemmed(self):
     """A stemmer passed to ``Index`` must be applied to indexed tokens."""
     porter_stemmer = PorterStemmer()
     index = Index(stemmer=porter_stemmer)
     index.add_document('coffee', 'I liked it')
     # With the stemmer, 'liked' is expected to be stored as 'like'.
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(
         index._index,
         {
             'i': set(['coffee']),
             'like': set(['coffee']),
             'it': set(['coffee']),
         },
     )
     # With stemmer=None the token is expected to be stored unchanged.
     index = Index(stemmer=None)
     index.add_document('coffee', 'I liked it')
     self.assertEqual(
         index._index,
         {
             'i': set(['coffee']),
             'liked': set(['coffee']),
             'it': set(['coffee']),
         },
     )
예제 #12
0
 def test_should_be_able_to_find_using_AND_OR_and_NOT(self):
     """``find`` must return only documents containing every query term.

     NOTE(review): despite the test name, no OR or NOT query is exercised
     here; all assertions check multi-term queries treated as a conjunction.
     """
     index = Index()
     index.add_document('doc1', 'this is my first document')
     index.add_document('doc2', 'this is my second document')
     index.add_document('doc3', 'another document')
     # assertEqual replaces the deprecated assertEquals alias.
     # doc3 also contains 'document' but lacks 'this', so it is excluded.
     self.assertEqual(index.find('this document'), set(['doc1', 'doc2']))
     # No single document holds both terms.
     self.assertEqual(index.find('this another'), set())
     # Terms that were never indexed yield an empty result.
     self.assertEqual(index.find('a b'), set())
     self.assertEqual(index.find('another'), set(['doc3']))
     self.assertEqual(index.find('first another'), set([]))
예제 #13
0
 def test_should_be_able_to_find_using_AND_OR_and_NOT(self):
     """Multi-term ``find`` queries must match documents holding all terms.

     NOTE(review): the name mentions OR and NOT, but the assertions below
     only cover conjunction-like behaviour - confirm whether more cases
     were intended.
     """
     index = Index()
     index.add_document('doc1', 'this is my first document')
     index.add_document('doc2', 'this is my second document')
     index.add_document('doc3', 'another document')
     nothing = set()
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index.find('this document'), set(['doc1', 'doc2']))
     self.assertEqual(index.find('this another'), nothing)
     self.assertEqual(index.find('a b'), nothing)
     self.assertEqual(index.find('another'), set(['doc3']))
     self.assertEqual(index.find('first another'), nothing)
예제 #14
0
 def test_should_be_able_to_find_by_term(self):
     """``find_by_term`` must return the documents containing one term."""
     index = Index()
     index.add_document('doc1', 'this is my first document')
     index.add_document('doc2', 'this is my second document')
     index.add_document('doc3', 'another document')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index.find_by_term('document'),
                      set(['doc1', 'doc2', 'doc3']))
     # The lookup is expected to ignore the case of the search term.
     self.assertEqual(index.find_by_term('DOCUMENT'),
                      set(['doc1', 'doc2', 'doc3']))
     self.assertEqual(index.find_by_term('this'), set(['doc1', 'doc2']))
     self.assertEqual(index.find_by_term('is'), set(['doc1', 'doc2']))
     self.assertEqual(index.find_by_term('my'), set(['doc1', 'doc2']))
     self.assertEqual(index.find_by_term('first'), set(['doc1']))
     self.assertEqual(index.find_by_term('second'), set(['doc2']))
     self.assertEqual(index.find_by_term('another'), set(['doc3']))
예제 #15
0
 def test_should_be_able_to_find_by_term(self):
     """Each single term must resolve to exactly the documents holding it."""
     index = Index()
     index.add_document('doc1', 'this is my first document')
     index.add_document('doc2', 'this is my second document')
     index.add_document('doc3', 'another document')
     all_docs = set(['doc1', 'doc2', 'doc3'])
     first_two = set(['doc1', 'doc2'])
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index.find_by_term('document'), all_docs)
     # An upper-case search term is expected to match as well.
     self.assertEqual(index.find_by_term('DOCUMENT'), all_docs)
     self.assertEqual(index.find_by_term('this'), first_two)
     self.assertEqual(index.find_by_term('is'), first_two)
     self.assertEqual(index.find_by_term('my'), first_two)
     self.assertEqual(index.find_by_term('first'), set(['doc1']))
     self.assertEqual(index.find_by_term('second'), set(['doc2']))
     self.assertEqual(index.find_by_term('another'), set(['doc3']))
예제 #16
0
 def test_should_add_documents_with_name_and_content(self):
     """Added documents must be counted and remembered by name."""
     index = Index()
     index.add_document('test', 'this is my first document')
     index.add_document('test2', 'this is my second document')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(index), 2)
     # Checks the private name registry directly.
     self.assertEqual(index._documents, set(['test', 'test2']))
예제 #17
0
#!/usr/bin/env python
# coding: utf-8
# Tip: run this script with `python -i example.py`
# (or `ipython -i example.py`), so you can interactively do searches by
# executing: `my_index.find('...search terms...')`

from nltk.corpus import machado
from index import Index


# Build an index over the NLTK `machado` corpus; the slice below skips the
# first 50 file ids. Single-argument print(...) calls are used because they
# behave the same on Python 2 and Python 3, whereas the print statement is a
# SyntaxError on Python 3.
print('Creating index...')
my_index = Index()
filenames = machado.fileids()[50:]
for filename in filenames:
    my_index.add_document(filename, machado.raw(filename))

print('Searching...')
print(my_index.find('brasil azul'))
예제 #18
0
 def test_should_add_documents_with_name_and_content(self):
     """``len(index)`` and ``_documents`` must reflect every added document."""
     index = Index()
     index.add_document('test', 'this is my first document')
     index.add_document('test2', 'this is my second document')
     expected_names = set(['test', 'test2'])
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(len(index), 2)
     self.assertEqual(index._documents, expected_names)
예제 #19
0
 def test_passing_stopwords_should_remove_these_words_from_token_list(self):
     """Words and punctuation given as stopwords must not be indexed."""
     index = Index(stopwords=['yes', 'no', ',', '.', '!'])
     index.add_document('coffee', 'Yes, sir! No, Joyce.')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index._index, {'sir': set(['coffee']),
                                     'joyce': set(['coffee'])})
예제 #20
0
 def test_passing_a_stemmer_should_stem_search_term_before_matching(self):
     """With a stemmer set, the unstemmed search term must still match."""
     porter_stemmer = PorterStemmer()
     index = Index(stemmer=porter_stemmer)
     index.add_document('coffee', 'I liked it')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(index.find_by_term('liked'), set(['coffee']))
예제 #21
0
# coding: utf-8
from index import Index
import os

def iterate_over_dir(dir):
    """Yield the path of every file found under ``dir``, recursively.

    Directories are traversed with ``os.walk``; each yielded path is the
    containing directory joined with the file name.
    """
    for parent, _subdirs, names in os.walk(dir):
        for name in names:
            yield os.path.join(parent, name)


# Index every file found under the book directory, then save the index.
ind = Index()
for s in iterate_over_dir('../../books_search_b/not_sort_book/T'):
    try:
        ind.add_document(s)
    except Exception:
        # Best effort: a file that cannot be indexed is skipped. Catching
        # Exception instead of using a bare ``except:`` lets
        # KeyboardInterrupt and SystemExit still stop the script.
        pass

ind.save('test.ind')



예제 #22
0
 def test_passing_a_stemmer_should_stem_search_term_before_matching(self):
     """Searching for 'liked' must find a document indexed with a stemmer."""
     porter_stemmer = PorterStemmer()
     index = Index(stemmer=porter_stemmer)
     index.add_document('coffee', 'I liked it')
     matches = index.find_by_term('liked')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(matches, set(['coffee']))