import logging
import time
import uuid

# Mongo, Daemon, check_match, get_page_urls and type_document are defined
# elsewhere in this package and must be imported from their actual modules.

logger = logging.getLogger(__name__)


class Dispatcher(object):

    def __init__(self, uri='mongodb://localhost:27017/', db='_bo',
                 url_collection='_urls', documents_collection='_docs'):
        self._mongo = Mongo(uri, db, url_collection, documents_collection)
        self._job = None
        self.idle = None
        self.reset()

    def reset(self):
        # Each job gets a unique identifier so concurrent jobs cannot
        # collide in the shared URL and document collections.
        self._job = '{0}-{1}'.format(str(uuid.uuid4()), str(uuid.uuid4()))
        self.idle = False

    def add_url(self, url):
        url['text'] = '<root>'
        url['job'] = self._job
        url['url'] = url['target_url']
        url['level'] = 0
        self._mongo.add_url(url)

    def load_urls_at_level(self, level):
        # Promote the documents found at this link level back into the
        # URL queue so the workers scrape them next.
        documents = self._mongo.get_documents_at_level(self._job, level)
        url_count = 0
        for doc in documents:
            if '_id' in doc:
                del doc['_id']
            keys = ['target_url', 'job', 'allowed_domains', 'url', 'level']
            url = {key: doc[key] for key in keys}
            self._mongo.add_url(url)
            url_count += 1
        return url_count

    def dispatch(self, url, clean_job=True):
        '''
        Dispatches the URLs to the workers and blocks until every URL in
        the job has been scraped and typed.

        url = {
            'target_url': '',
            'link_level': 0,
            'allowed_domains': [],
        }
        '''
        reqkeys = ['target_url', 'link_level', 'allowed_domains']
        for key in reqkeys:
            if key not in url:
                raise Exception('Missing key in URL: %s' % key)
        self.idle = False
        self.add_url(url)
        link_level = url['link_level']
        level = 0
        while level < link_level + 1:
            self.load_urls_at_level(level)
            working = True
            while working:
                scraped, not_scraped, typed, not_typed = \
                    self._mongo.get_counts(self._job)
                # Compare with == rather than is: identity checks against
                # integer literals are unreliable.
                if not_scraped == 0 and not_typed == 0:
                    working = False
                else:
                    time.sleep(1)
                    logger.info(('Level: {0} / {1}, Not Scraped: {2},'
                                 ' Not Typed: {3}').format(
                                     level, link_level,
                                     not_scraped, not_typed))
            level += 1
        if clean_job:
            self._mongo.clean_job(self._job)
        self.idle = True
        logger.info('All URLs processed.')

    def get_documents(self, doc_types=['*']):
        docs = []
        for doc_type in doc_types:
            if doc_type == '*':
                docs = self._mongo.get_all_documents(self._job)
                break
            else:
                for doc in self._mongo.get_documents(self._job, doc_type):
                    docs.append(doc)
        return docs

    def clean_job(self):
        self._mongo.clean_job(self._job)
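
# A minimal usage sketch (hypothetical values throughout): crawl a site two
# link levels deep and collect the typed documents. This assumes MongoDB is
# reachable at the given URI and that at least one Worker is polling the
# same database, since dispatch() blocks until the workers finish the job.
def _example_dispatch():
    dispatcher = Dispatcher(uri='mongodb://localhost:27017/')
    # clean_job=False keeps the job's documents around so they can be
    # fetched afterwards; clean up manually once done.
    dispatcher.dispatch({
        'target_url': 'http://example.com/',
        'link_level': 2,
        'allowed_domains': ['example.com'],
    }, clean_job=False)
    for doc in dispatcher.get_documents(doc_types=['*']):
        logger.info('Found: {0}'.format(doc['url']))
    dispatcher.clean_job()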
class Worker(Daemon):

    def __init__(self, pidfile='/tmp/worker.pid',
                 uri='mongodb://localhost:27017/', db='_bo',
                 url_collection='_urls', documents_collection='_docs',
                 sleep_time=0.1):
        super(Worker, self).__init__(pidfile)
        self._mongo = Mongo(uri, db, url_collection, documents_collection)
        self.sleep_time = sleep_time
        self._callback = None
        self.bandwidth = 0

    def register_callback(self, callback):
        self._callback = callback

    def run(self):
        try:
            self.bandwidth = 0
            self.do_work()
        except Exception as e:
            # A daemonized process has no terminal, so log instead of print.
            logger.error(str(e))

    def do_work(self):
        '''
        This function loops until it is told to exit by setting
        self._running to False

        1) try to get a URL to scrape
            1a) check that it is in an allowed domain
            1b) scrape it and collect all of the URLs it links to
            1c) go through all found URLs
                1cI) check that each is in an allowed domain
                1cII) check whether the document already exists in the
                      collection
                1cIII) add the document at level + 1
            Note: we loop until there are no more URLs to scrape

        2) try to get a document to type
            2a) check that it is in an allowed domain
            2b) type the link
            2c) update the URL with the new type data
            Note: we loop until there are no more documents to type
        '''
        no_work_count = 0
        self._running = True
        while self._running:
            time.sleep(self.sleep_time)

            # Scraping loop: drain the URL queue for the current job.
            url = self._mongo.get_url()
            while url is not None:
                logger.info('Scrape: {0}'.format(url['url']))
                no_work_count = 0
                if check_match(url, url['url']):
                    page_urls, bandwidth, time_taken = get_page_urls(url)
                    self.bandwidth += bandwidth
                    for pu in page_urls:
                        if check_match(url, pu['url']):
                            document = pu
                            if not self._mongo.check_document_exists(
                                    url, document, use_job=True):
                                self._mongo.add_document(url, document)
                self._mongo.set_url_scraped(url)
                url = self._mongo.get_url()

            # Typing loop: classify every document the scrapers found.
            document = self._mongo.get_document()
            while document is not None:
                no_work_count = 0
                logger.info('Type: {0}'.format(document['url']))
                if check_match(document, document['url']):
                    doc_type, bad_url, bandwidth, time_taken, count = \
                        type_document(document)
                    self.bandwidth += bandwidth
                    self._mongo.set_document_type(
                        document, doc_type, bad_url, bandwidth, time_taken
                    )
                    if self._callback is not None:
                        self._callback(document)
                document = self._mongo.get_document()

            # Back off after ten consecutive idle polls (== rather than is:
            # identity checks against integer literals are unreliable).
            if no_work_count == 10:
                logger.info('No work.')
                time.sleep(1)
                no_work_count = 0
            else:
                no_work_count += 1
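
# A minimal usage sketch (hypothetical callback): register a callback that
# fires once per typed document, then run the worker in the foreground.
# Daemon is assumed to provide start()/stop() for detached operation;
# run() simply blocks in the current process.
def _example_worker():
    def on_document(document):
        # Called after set_document_type() has stored the type data.
        logger.info('Typed: {0}'.format(document['url']))

    worker = Worker(pidfile='/tmp/worker.pid', sleep_time=0.1)
    worker.register_callback(on_document)
    worker.run()  # or worker.start() to daemonize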