Example #1
    def index_words(self, all_words):
        # Locations are the indices in all_words at which a particular word appears.
        for i in xrange(len(all_words)):
            current_word = all_words[i]
            if current_word not in self.cache:
                self.cache[current_word] = WordFromIndexedPage(
                    indexed_page=self.indexed_page, word=current_word)
            self.word_location_cache[current_word].append(i)

        for key in self.cache:
            self.cache[key].set_offsets(self.word_location_cache[key])
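
This method is lifted out of a larger indexer class, so self.cache and self.word_location_cache are never defined in the snippet. A minimal sketch of the surrounding state it appears to assume (the class name PageIndexer and the defaultdict choice are guesses, mirroring Example #6, not the original code):

from collections import defaultdict

class PageIndexer(object):  # hypothetical name; the real class is not shown
    def __init__(self, indexed_page):
        self.indexed_page = indexed_page
        # word -> WordFromIndexedPage instance, built once per distinct word
        self.cache = {}
        # word -> list of positions in all_words; a defaultdict(list) lets
        # index_words() call .append() without checking for the key first
        self.word_location_cache = defaultdict(list)
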
Example #2
    def test_get_words_with_repetition(self):
        url = "http://www.google.com"
        google_page = IndexedPage(url=url)
        google_page.save()
        word_list = ["google", "feeling", "lucky", "search"] * 3
        for word in word_list:
            WordFromIndexedPage(word=word,
                                indexed_page=google_page,
                                offsets_in_indexedPage=str([1])).save()
        self.assertEqual(len(google_page.words.all()), len(set(word_list)))
        all_words = map(lambda element: element.word, google_page.get_words())
        for word in word_list:
            self.assertIn(word, all_words)
Example #3
    def test_get_words_no_words(self):
        url = "http://www.google.com"
        google_page = IndexedPage(url=url)
        google_page.save()
        word_list = []
        for word in word_list:
            WordFromIndexedPage(word=word,
                                indexed_page=google_page,
                                offsets_in_indexedPage=str([1])).save()
        self.assertEqual(len(google_page.words.all()), len(word_list))
        all_words = map(lambda element: element.word, google_page.get_words())
        for word in word_list:
            self.assertIn(word, all_words)
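
The tests above rely on google_page.words.all() and google_page.get_words(), but the IndexedPage model itself never appears in these snippets. A minimal sketch of what they seem to assume, with the related_name "words" and the get_words() body inferred from those calls (assumptions, not the project's real models.py):

from django.db import models

class IndexedPage(models.Model):
    # Example #6 fetches a page with pk="http://www.google.com",
    # so the URL is assumed to be the primary key
    url = models.URLField(primary_key=True)

    def get_words(self):
        # "words" would be the reverse relation from WordFromIndexedPage
        return self.words.all()
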
Example #4
    def test_set_and_check_attributes_duplicated_locations(self):
        url = "http://www.google.com"
        google_page = IndexedPage(url=url)
        google_page.save()
        our_word = WordFromIndexedPage(word="google", indexed_page=google_page)
        locations_list = [2, 5, 7, 7, 2, 2, 11]
        our_word.set_offsets(locations_list)
        our_word.save()

        # checking if the locations are retrievable
        retrieved_offsets = our_word.get_offsets()
        for location in locations_list:
            self.assertIn(location, retrieved_offsets)
        self.assertEqual(len(locations_list), len(retrieved_offsets))
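
Note that the duplicated locations in this test are preserved: get_offsets() must hand back all seven values, repeats included. The WordFromIndexedPage model is not shown either, so here is one plausible sketch in which set_offsets()/get_offsets() simply serialize the list into the offsets_in_indexedPage text field named in the tests (field names and types are assumptions; the snippets themselves are inconsistent, using both indexed_page and indexedPage as the foreign-key keyword):

import json

from django.db import models

class WordFromIndexedPage(models.Model):
    word = models.CharField(max_length=255)
    indexed_page = models.ForeignKey(IndexedPage, related_name="words",
                                     on_delete=models.CASCADE)
    offsets_in_indexedPage = models.TextField(default="[]")

    def set_offsets(self, offsets):
        # keep order and duplicates exactly as given
        self.offsets_in_indexedPage = json.dumps(list(offsets))

    def get_offsets(self):
        return json.loads(self.offsets_in_indexedPage)
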
Example #6
from collections import defaultdict

from searchEngine.models import IndexedPage, WordFromIndexedPage

# In the real pipeline these words come from the nltk-cleaned HTML
list_of_words = ["new", "google", "googl3", "google4"]

# Database queries (e.g. save(), get(), filter()) do disk I/O, so they are much
# slower than in-memory access. We can therefore cache the objects and batch-save
# them all at the end with a function like bulk_save() below.

cache = {}  # word (str) -> WordFromIndexedPage instance
cacheWordLocation = defaultdict(list)  # word (str) -> list of offsets

# This query is OK: we only do it once per page (more or less)
url = IndexedPage.objects.get(pk="http://www.google.com")
for word in list_of_words:
    if word not in cache:
        cache[word] = WordFromIndexedPage(indexedPage=url, word=word)

    # 32 is just a dummy offset here; the real word position would go in its place
    cacheWordLocation[word].append(32)

for key in cache.keys():
    cache[key].set_offsets(cacheWordLocation[key])
    # note: we deliberately do NOT call save() inside this loop

all_models_to_save = cache.values()
# Don't forget the page itself. Question: why must url be the
# *last* model in the list?
all_models_to_save.append(url)

bulk_save(all_models_to_save)

raw_input("Inspect database (F5=refresh), then press Enter to continue.")
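
bulk_save() itself is only alluded to above ("a function like"); it is not defined in the snippet. One plausible shape, assuming it simply saves each cached model in order inside a single transaction so the whole batch hits the database together:

from django.db import transaction

def bulk_save(models_to_save):
    # Sketch only: one transaction for the whole batch, saving the models
    # in the order the caller built the list (hence the question above
    # about which model should come last).
    with transaction.atomic():
        for model in models_to_save:
            model.save()
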
Example #7
from searchEngine.models import WordFromIndexedPage, IndexedPage

googlePage = IndexedPage(url="http://www.google.com")
googlePage.save()
print googlePage

googleWord = "google"
googleWord2 = "google2"

googleWordLocation = WordFromIndexedPage(indexedPage=googlePage,
                                         word=googleWord)
googleWordLocation.set_offsets([1])
googleWordLocation.save()

googleWord2Location = WordFromIndexedPage(indexedPage=googlePage,
                                          word=googleWord2)
googleWord2Location.set_offsets([1])
googleWord2Location.save()

print "--" * 100

print "googlePage.words:", googlePage.get_words()
print len(googlePage.words.all())