def index_words(self, all_words):
    """Index every word of *all_words* by the positions it occurs at.

    Locations are the indices in all_words at which a particular word
    appears.  Each distinct word gets one WordFromIndexedPage model in
    self.cache; its accumulated locations are pushed onto the model via
    set_offsets() once the whole list has been scanned.
    """
    # enumerate() replaces the index-based xrange(len(...)) loop: same
    # behavior, idiomatic, and it works on any iterable, not just lists.
    for position, current_word in enumerate(all_words):
        if current_word not in self.cache:
            self.cache[current_word] = WordFromIndexedPage(
                indexed_page=self.indexed_page, word=current_word)
        # NOTE(review): assumes self.word_location_cache is a
        # defaultdict(list) -- confirm against the constructor.
        self.word_location_cache[current_word].append(position)
    # Flush the accumulated locations onto every cached word model.
    for word in self.cache:
        self.cache[word].set_offsets(self.word_location_cache[word])
def test_get_words_with_repetition(self):
    """A repeated word is stored once per distinct word, yet every
    input word remains retrievable through get_words()."""
    page_url = "http://www.google.com"
    google_page = IndexedPage(url=page_url)
    google_page.save()

    word_list = ["google", "feeling", "lucky", "search"] * 3
    for word in word_list:
        WordFromIndexedPage(word=word,
                            indexed_page=google_page,
                            offsets_in_indexedPage=str([1])).save()

    # One row per distinct word, despite the triplicated input.
    self.assertEqual(len(google_page.words.all()), len(set(word_list)))

    # Every word from the (repetitive) input must come back.
    all_words = [entry.word for entry in google_page.get_words()]
    for word in word_list:
        self.assertIn(word, all_words)
def test_get_words_no_words(self):
    """A page with no indexed words reports an empty word set."""
    page_url = "http://www.google.com"
    google_page = IndexedPage(url=page_url)
    google_page.save()

    word_list = []
    for word in word_list:  # intentionally empty: nothing is saved
        WordFromIndexedPage(word=word,
                            indexed_page=google_page,
                            offsets_in_indexedPage=str([1])).save()

    self.assertEqual(len(google_page.words.all()), len(word_list))

    all_words = [entry.word for entry in google_page.get_words()]
    for word in word_list:  # vacuously true for the empty list
        self.assertIn(word, all_words)
def test_set_and_check_attributes_duplicated_locations(self):
    """set_offsets()/get_offsets() must round-trip duplicate locations."""
    # NOTE(review): an identically-named test is defined again later in
    # this file; inside a single TestCase the later definition shadows
    # this one -- confirm whether both copies are intentional.
    page = IndexedPage(url="http://www.google.com")
    page.save()
    word_model = WordFromIndexedPage(word="google", indexed_page=page)

    offsets = [2, 5, 7, 7, 2, 2, 11]
    word_model.set_offsets(offsets)
    word_model.save()

    # Duplicates must be preserved, not collapsed.
    round_tripped = word_model.get_offsets()
    for offset in offsets:
        self.assertIn(offset, round_tripped)
    self.assertEqual(len(offsets), len(round_tripped))
def test_set_and_check_attributes_duplicated_locations(self):
    """Duplicated offsets survive a set_offsets/get_offsets round trip."""
    indexed = IndexedPage(url="http://www.google.com")
    indexed.save()
    google_word = WordFromIndexedPage(word="google", indexed_page=indexed)

    expected_offsets = [2, 5, 7, 7, 2, 2, 11]
    google_word.set_offsets(expected_offsets)
    google_word.save()

    # checking if the locations are retrievable, duplicates included
    stored = google_word.get_offsets()
    for offset in expected_offsets:
        self.assertIn(offset, stored)
    self.assertEqual(len(expected_offsets), len(stored))
# get these words from nltk clean html list_of_words = ["new", "google", "googl3", "google4"] # Database queries (eg, save(), get, filter) etc do disk-IO op, # hence are slower than mem access, # we could cache stuffs, and batch save all objects at the end :) # with a function like: cache = {} # word<String> : wordL<WordFromIndexedPage> cacheWordLocation = defaultdict(list) # it's ok, we do this just once...-ish url = IndexedPage.objects.get(pk="http://www.google.com") for word in list_of_words: if word not in cache: cache[word] = WordFromIndexedPage(indexedPage=url, word=word) cacheWordLocation[word].append(32) for key in cache.keys(): cache[key].set_offsets(cacheWordLocation[key]) # DO NOT SAVE() each time ;) all_models_to_save = cache.values() # you might not wanna forget this dude. question: why mush URL be the # *last model*? all_models_to_save.append(url) bulk_save(all_models_to_save) raw_input("Inspect database (F5=refresh),\
from searchEngine.models import WordFromIndexedPage, IndexedPage googlePage = IndexedPage(url="http://www.google.com") googlePage.save() print googlePage googleWord = "google" googleWord2 = "google2" googleWordLocation = WordFromIndexedPage(indexedPage=googlePage, word=googleWord) googleWordLocation.set_offsets([1]) googleWordLocation.save() googleWord2Location = WordFromIndexedPage(indexedPage=googlePage, word=googleWord2) googleWord2Location.set_offsets([1]) googleWord2Location.save() print "--" * 100 print "googlePage.words:", googlePage.get_words() print len(googlePage.words.all())
from searchEngine.models import WordFromIndexedPage, IndexedPage googlePage = IndexedPage(url="http://www.google.com") googlePage.save() print googlePage googleWord = "google" googleWord2 = "google2" googleWordLocation = WordFromIndexedPage(indexedPage=googlePage, word=googleWord) googleWordLocation.set_offsets([1]) googleWordLocation.save() googleWord2Location = WordFromIndexedPage(indexedPage=googlePage, word=googleWord2) googleWord2Location.set_offsets([1]) googleWord2Location.save() print "--"* 100 print "googlePage.words:", googlePage.get_words() print len(googlePage.words.all())