def __init__(self, settings):
    """Build the orchestrator's queues, processors, and worker processes.

    Processes are created here but not started; ``start()`` launches them.

    Args:
        settings: configuration object; ``settings.thread_count`` sets the
            number of worker threads for the url processor.
    """
    self.settings = settings
    self.is_stopped = Event()
    # Remember the creating pid so stop() can tell parent from child.
    self.pid = os.getpid()

    # Inter-process channels.
    self.url_queue = Queue()
    self.content_queue = Queue()
    self.orch_queue = Queue()

    # The content processor is single-threaded; the url processor uses
    # the configured thread count.
    self.content_processor = ContentProcessor(
        "content_processor", self.content_queue, self.orch_queue, 1,
        self.settings)
    self.url_processor = UrlProcessor(
        "url_processor", self.url_queue, self.orch_queue,
        self.settings.thread_count, self.settings)
    self.content_process = Process(target=self.content_processor.run)
    self.url_process = Process(target=self.url_processor.run)
class Orchestrator(object):
    """Orchestrator

    Orchestrator creates and starts the control flow. It builds a
    multiprocess framework, in which there are three objects: the
    orchestrator, the url processor, and the content processor. The url
    processor and the content processor run in a subprocess, receive jobs
    from the orchestrator, execute the jobs, and send back the results to
    the orchestrator. They communicate with queues.

    Attributes:
        settings: a Settings instance for the orchestrator
        url_queue: queue, the channel for the orchestrator sending jobs
            (materials) to the url processor
        content_queue: queue, the channel for the orchestrator sending jobs
            (materials) to the content processor
        orch_queue: queue, the channel for the orchestrator receiving the
            executing result from the url/content processor
    """

    def __init__(self, settings):
        """Create queues, processors, and (unstarted) worker processes.

        Args:
            settings: configuration; ``settings.thread_count`` gives the
                url processor's worker-thread count.
        """
        self.settings = settings
        self.is_stopped = Event()
        # Record the creating pid so stop() only runs teardown in the parent.
        self.pid = os.getpid()
        # channels
        self.url_queue = Queue()
        self.content_queue = Queue()
        self.orch_queue = Queue()
        thread_count = self.settings.thread_count
        # create sub processes (not started until start())
        self.content_processor = ContentProcessor(
            "content_processor", self.content_queue, self.orch_queue,
            1, self.settings)
        self.content_process = Process(target=self.content_processor.run)
        self.url_processor = UrlProcessor(
            "url_processor", self.url_queue, self.orch_queue,
            thread_count, self.settings)
        self.url_process = Process(target=self.url_processor.run)

    def _feed_seed_urls(self):
        """Read urls from url_list_file and put them into url_queue."""
        url_list_file = self.settings.url_list_file
        # open() instead of the py2-only file() builtin; works on 2 and 3.
        with open(url_list_file) as urls_file:
            urls = [line.strip() for line in urls_file]
        # put the urls to url_queue as depth-0 materials
        for url in urls:
            material = UrlMaterial(url=url, depth=0)
            self.url_queue.put(material)
            logging.debug("feed seed url %s", material.url)

    def _loop(self):
        """Main loop of this instance.

        The loop reads products from `orch_queue`. For every product, if it is
        1) a UrlProduct instance, then build a content material and send it
           to content_queue
        2) a ContentProduct instance, then build url materials at depth + 1
           and send each of them to url_queue (the url processor is assumed
           to enforce any max-depth policy -- not checked here)

        The loop also breaks if no product arrives within 5 seconds.
        """
        while not self.is_stopped.is_set():
            try:
                product = self.orch_queue.get(timeout=5)
            except Empty:
                # Idle for the full timeout: assume the crawl has drained.
                logging.info("orchestrator loop broken for timeout")
                self.stop()
                break
            # dispatching
            if isinstance(product, UrlProduct):
                material = ContentMaterial(url=product.url,
                                           depth=product.depth,
                                           filepath=product.filepath,
                                           encoding=product.encoding)
                self.content_queue.put(material)
            elif isinstance(product, ContentProduct):
                for url in product.urls:
                    material = UrlMaterial(url=url, depth=product.depth + 1)
                    self.url_queue.put(material)

    def start(self):
        """Start the orchestrator: seed urls, launch workers, run the loop."""
        # feed seed urls
        self._feed_seed_urls()
        # start processors and looping
        self.content_process.start()
        self.url_process.start()
        logging.info("orchestrator loop")
        self._loop()
        # stop() is idempotent (guarded by is_stopped), so this is safe even
        # when _loop already called it on timeout.
        self.stop()

    def stop(self):
        """Stop the orchestrator and wait for the worker processes to exit.

        Only the process that created this instance performs teardown, and
        only once (guarded by is_stopped and the recorded pid).
        """
        if not self.is_stopped.is_set() and self.pid == os.getpid():
            self.is_stopped.set()
            # stop processors
            self.url_processor.stop()
            self.content_processor.stop()
            # wait for processes to exit, but don't hang forever
            self.content_process.join(timeout=3)
            self.url_process.join(timeout=3)
            logging.info("orchestrator exit")