        # Start listening for commands.
        while self.number_of_non_trivial_indexes <= self.max_links_to_crawl:
            write_cmd = self.main_thread_cmd_queue.pop(
                timeout=Crawler.POP_TIMEOUT_IN_SECONDS)
            if isinstance(write_cmd, RunOnMainThread):
                write_cmd.run()
            else:
                logger.warning(
                    "Main thread received a command it couldn't parse: %s",
                    write_cmd)

        # Crawling complete. Tell the team!
        logger.info("Crawling complete. Logged: {n_urls}".format(
            n_urls=len(self.finished_indexers_list)))


def main():
    # [x.delete() for x in list(IndexedPage.objects.all()) + list(WordFromIndexedPage.objects.all())]
    crawler = Crawler(
        links_queue=TQueue(["https://www.facebook.com/moforjohn"]),
        max_active_indexers=10,
        max_links_to_crawl=50)
    # crawler.start()
    # crawler.join()
    crawler.run()
    logger.info("Crawling complete!")


if __name__ == '__main__':
    profile_main()
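# Neither RunOnMainThread nor TQueue is defined in this listing. The sketches
# below are assumptions: they show the minimal interface the crawler loop and
# main() above rely on, not the real classes, which may carry more state and
# error handling.
import queue


class RunOnMainThread:
    """Hypothetical command wrapper: lets worker threads schedule a callable
    to be executed on the main thread when popped from the command queue."""

    def __init__(self, fn, *args, **kwargs):
        self.fn = fn
        self.args = args
        self.kwargs = kwargs

    def run(self):
        # Invoked by the main loop once the isinstance() check passes.
        self.fn(*self.args, **self.kwargs)


class TQueue:
    """Hypothetical thread-safe queue exposing the pop(timeout=...) and
    extend(...) calls the crawler and indexers use."""

    def __init__(self, items=None):
        self._q = queue.Queue()
        for item in items or []:
            self._q.put(item)

    def pop(self, timeout=None):
        # Blocks up to `timeout` seconds; raises queue.Empty on timeout.
        return self._q.get(timeout=timeout)

    def extend(self, items):
        for item in items:
            self._q.put(item)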
        # Resolve relative hrefs against the final (post-redirect) URL.
        all_links = [urljoin(self.final_url, link)
                     for link in lxml.html.fromstring(self.raw_html).xpath('//a/@href')]
        if hasattr(self.links_queue, "extend"):
            self.links_queue.extend(all_links)
        logger.info("finished indexing url={url}".format(url=self.url))
        self.done()

    def run(self):
        logger.info("starting to index page url={url}".format(url=self.url))
        self.final_url = final_url_after_redirects(self.url)
        if self.final_url is not None:
            if self.links_queue is not None:
                self.populate_indexedPage(self.final_url)
            else:
                logger.warning("links_queue is None! Aborting")
                self.done()
        else:
            logger.warning("final_url is None. No content picked up. Aborting")
            self.done()


def main():
    # t = Indexer(indexed_page="http://example.com", on_finished_indexing=None, links_queue=[])
    # t.start()
    # t.join()
    pass


if __name__ == '__main__':
    profile_main('main()')
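# Neither final_url_after_redirects() nor profile_main() is defined in this
# listing. The sketches below are assumptions showing plausible
# implementations, not the real helpers.
import cProfile

import requests


def final_url_after_redirects(url, timeout=10):
    """Follow any redirects and return the final URL, or None on failure.
    A minimal sketch assuming the `requests` library; the real helper may
    resolve redirects differently and handle more failure modes."""
    try:
        response = requests.get(url, timeout=timeout, allow_redirects=True)
        response.raise_for_status()
        # response.url reflects the URL after all redirects were followed.
        return response.url
    except requests.RequestException:
        return None


def profile_main(statement='main()'):
    """Hypothetical profiling entry point: executes `statement` under
    cProfile. The default argument keeps both call sites in these listings,
    profile_main() and profile_main('main()'), compatible."""
    cProfile.run(statement)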