def run(self, max_pages=10000):
    """Crawl pages starting at ``self.base_url`` and save a word count for each.

    The base page is parsed first and handed to ``self.get_links``. After
    that, URLs are popped one at a time from the ``self.links_to_search``
    heap; each one is recorded in ``self.links_searched``, fetched, scanned
    for further links, cleaned, counted and saved through ``Document``.

    Args:
        max_pages: Stop once ``self.links_searched`` holds at least this
            many entries. Defaults to 10000, the cap previously hard-coded
            here. Pass ``None`` to crawl until the queue is empty.
    """
    # Find links on the first run.
    # NOTE(review): presumably get_links fills self.links_to_search -- confirm.
    self.get_links(self.parse_webpage(self.base_url))

    # Calculate and save the word count of every queued page.
    while self.links_to_search:
        url = heapq.heappop(self.links_to_search)  # next url in line
        heapq.heappush(self.links_searched, url)  # remember it as searched

        soup = self.parse_webpage(self.base_url + url)  # fetch the page
        self.get_links(soup)  # harvest further urls to search
        clean = self.clean_text(soup)  # clean the page content

        doc = Document(url)
        doc.count_words(clean)  # count the words on the page
        doc.save_word_count(self.base_url)  # persist the document

        # ">=" rather than "==" so the cap still triggers when the searched
        # list already exceeds it (e.g. run() called a second time).
        if max_pages is not None and len(self.links_searched) >= max_pages:
            break
def create_new_document(content):
    """Build a ``Document`` named ``"doc"`` from ``content``.

    The value returned by ``Document.count_words(content)`` is stored on the
    new document's ``dictionary`` attribute, and the document is returned.
    """
    # NOTE(review): this relies on count_words returning the word counts;
    # confirm against Document.count_words.
    document = Document("doc")
    word_counts = document.count_words(content)
    document.dictionary = word_counts
    return document