def _treewalk_stop(self, data: Any) -> communication.Response:
    """Abort the TreeWalk execution that is currently in progress.

    If the state reports ready, nothing is stopped and only an explanatory
    message is returned. Otherwise every worker is sent a stop command and
    its reply is awaited, the workers are joined, the crawl is marked as
    aborted in the database and the manager is reset.

    Args:
        data (Any): unused; present only to satisfy the callback signature

    Returns:
        communication.Response: response for the MANAGER_STOP command; the
            success flag is True in both cases
    """
    if self._state.is_ready():
        return communication.Response(
            success=True,
            message='Attempted to stop when TreeWalk was ready.',
            command=communication.MANAGER_STOP)

    stop_command = communication.Command(
        command=communication.WORKER_STOP, data=None)
    for control in self._workers:
        control.queue_input.put(stop_command)
        # Wait for this worker's reply on its output queue; the reply
        # itself is discarded.
        control.queue_output.get()

    # The exit event is set only while the workers are being joined and
    # is cleared again afterwards.
    self._workers_can_exit.set()
    for control in self._workers:
        control.me.join()
    self._workers_can_exit.clear()

    self._db_connection.set_crawl_state(
        tree_walk_id=self._tree_walk_id,
        status=communication.CRAWL_STATUS_ABORTED)
    self._reset()
    return communication.Response(
        success=True,
        message=communication.MANAGER_OK,
        command=communication.MANAGER_STOP)
def _treewalk_pause(self, data: Any) -> communication.Response:
    """Put the running TreeWalk into the paused state.

    The state is switched to paused, every worker is sent a pause command
    and the crawl is marked as paused in the database. A
    ``treewalk.StateException`` raised along the way is reported through an
    unsuccessful response instead of being propagated.

    Args:
        data (Any): unused; present only to satisfy the callback signature

    Returns:
        communication.Response: response for the MANAGER_PAUSE command
    """
    try:
        self._state.set_paused()
        pause_command = communication.Command(
            command=communication.WORKER_PAUSE, data=None)
        for control in self._workers:
            control.queue_input.put(pause_command)
        self._db_connection.set_crawl_state(
            tree_walk_id=self._tree_walk_id,
            status=communication.CRAWL_STATUS_PAUSED)
    except treewalk.StateException as err:
        return communication.Response(
            success=False,
            message=f'Attempted to pause. {str(err)}',
            command=communication.MANAGER_PAUSE)
    return communication.Response(
        success=True,
        message=communication.MANAGER_OK,
        command=communication.MANAGER_PAUSE)
def done() -> None:
    """Tell every worker to finish and record the crawl as finished.

    A single WORKER_FINISH command is put on each worker's input queue;
    afterwards the crawl state in the database is set to finished.
    """
    finish_command = communication.Command(
        command=communication.WORKER_FINISH, data=None)
    for control in self._workers:
        control.queue_input.put(finish_command)
    self._db_connection.set_crawl_state(
        tree_walk_id=self._tree_walk_id,
        status=communication.CRAWL_STATUS_FINISHED)
def _update_workers(self, num_workers: int, reduce: bool) -> None:
    """Grow or shrink the set of workers to ``num_workers``.

    When reducing, the surplus workers are taken from the front of the
    worker list, sent a stop command, awaited and joined. When increasing,
    new workers are created with their own input/output queues, appended
    to the worker list and started. In both cases the work packages are
    resized for the new worker count and the bookkeeping is updated.

    Args:
        num_workers (int): new number of workers
        reduce (bool): if True reduce, otherwise increase
    """
    current = self._num_workers.value

    if reduce:
        diff = current - num_workers
        stop_command = communication.Command(
            command=communication.WORKER_STOP, data=None)

        # Detach the surplus workers before talking to them so the
        # remaining list only holds workers that stay alive.
        retiring = self._workers[:diff]
        del self._workers[:diff]

        for control in retiring:
            control.queue_input.put(stop_command)
            # Wait for the worker's reply; its content is not used.
            control.queue_output.get()

        # The exit event is set only for the duration of the joins.
        self._workers_can_exit.set()
        for control in retiring:
            control.me.join()
        self._workers_can_exit.clear()

        logging.info(
            f'TWManager: reduced the number of workers by {diff}.')
    else:
        diff = num_workers - current
        for _ in range(diff):
            inbox = multiprocessing.Queue()
            outbox = multiprocessing.Queue()
            debug_enabled = environment.env.CRAWLER_LOGGING_LEVEL == 'DEBUG'
            worker = Worker(
                queue_input=inbox,
                queue_output=outbox,
                config=self._config,
                connection_data=self._connection_data,
                tree_walk_id=self._tree_walk_id,
                lock=self._worker_lock,
                counter=self._worker_counter,
                finished=self._workers_finished,
                num_workers=self._num_workers,
                measure_time=self._measure_time,
                event_can_exit=self._workers_can_exit,
                debug=debug_enabled)
            control = WorkerControl(
                worker=worker,
                queue_input=inbox,
                queue_output=outbox,
                event_finished=self._workers_can_exit)
            self._workers.append(control)
            worker.start()

        logging.info(
            f'TWManager: increased the number of workers by {diff}.')

    # Common tail: redistribute the pending work and publish the new count.
    self._work_packages = treewalk.resize_work_packages(
        work_packages=self._work_packages, num_workers=num_workers)
    self._state.set_running_workers(num_workers)
    self._num_workers.value = num_workers
def shutdown() -> communication.Response:
    """Send the shutdown command to the TreeWalk manager.

    The command is put on the manager's input queue and the call then
    blocks until a response is available on the manager's output queue.

    Returns:
        communication.Response: response object
    """
    communication.manager_queue_input.put(
        communication.Command(
            command=communication.MANAGER_SHUTDOWN, data=None))
    response = communication.manager_queue_output.get()
    return response
def stop() -> communication.Response:
    """Send the stop command to the TreeWalk manager.

    The command is put on the manager's input queue and the call then
    blocks until a response is available on the manager's output queue.

    Returns:
        communication.Response: response object
    """
    communication.manager_queue_input.put(
        communication.Command(
            command=communication.MANAGER_STOP, data=None))
    response = communication.manager_queue_output.get()
    return response
def unpause() -> communication.Response:
    """Send the unpause command to the TreeWalk manager.

    The command is put on the manager's input queue and the call then
    blocks until a response is available on the manager's output queue.

    Returns:
        communication.Response: response object
    """
    communication.manager_queue_input.put(
        communication.Command(
            command=communication.MANAGER_UNPAUSE, data=None))
    response = communication.manager_queue_output.get()
    return response
def start(config: Config) -> communication.Response:
    """Send the start command, carrying ``config``, to the TreeWalk manager.

    If the configuration has force-update set, a stop command is sent
    first and its response is read and discarded before the start command
    is issued.

    Args:
        config (Config): new configuration

    Returns:
        communication.Response: response to the start command
    """
    inbox = communication.manager_queue_input
    outbox = communication.manager_queue_output

    if config.get_force_update():
        logging.info('TWManagerInterface: force-update was set. stopping.')
        inbox.put(
            communication.Command(
                command=communication.MANAGER_STOP, data=None))
        # The reply to the stop command must be consumed so that the next
        # read returns the reply to the start command; its content is
        # irrelevant here.
        outbox.get()

    inbox.put(
        communication.Command(
            command=communication.MANAGER_START, data=config))
    return outbox.get()
def _do_command(command: str, data: Any = None) -> communication.Response:
    """Helper for passing a command to the scheduler.

    A ``communication.Command`` is built from the arguments, put on the
    scheduler's input queue, and the call then blocks until a response is
    available on the scheduler's output queue.

    Args:
        command (str): command type
        data (Any, optional): the data required for the command.
            Defaults to None.

    Returns:
        communication.Response: response
    """
    # Use a separate name for the message object: rebinding ``command``
    # would contradict its ``str`` annotation and hide the argument.
    message = communication.Command(command=command, data=data)
    communication.scheduler_queue_input.put(message)
    return communication.scheduler_queue_output.get()
def work_single() -> None:
    """Work on the small work packages.

    For each worker index one package is popped from the corresponding
    entry of the work packages; an empty package is used when there is
    nothing to pop. Exhausted entries are then dropped, every worker gets
    its package on its input queue, and the call waits for the
    workers-finished event before clearing it again.
    """
    packages = []
    for index in range(self._num_workers.value):
        # IndexError covers both a missing entry for this index and an
        # entry that is already empty; either way the worker still gets
        # an (empty) package.
        try:
            package = self._work_packages[index].pop()
        except IndexError:
            package = []
        packages.append(package)

    # Drop entries that have been emptied by the pops above.
    self._work_packages = [items for items in self._work_packages if items]

    for index, package in enumerate(packages):
        command = communication.Command(
            command=communication.WORKER_PACKAGE, data=package)
        self._workers[index].queue_input.put(command)

    self._workers_finished.wait()
    self._workers_finished.clear()
def work_split() -> None:
    """Process one directory whose files are split across all workers.

    A directory is popped from the split work packages, its regular files
    are chunked into packages of the configured size, and the packages are
    handed out one per worker per round. After each round the call waits
    for the workers-finished event and clears it.
    """
    directory = self._work_packages_split.pop()
    files = [
        path
        for path in (
            os.path.join(directory, name) for name in os.listdir(directory)
        )
        if os.path.isfile(path)
    ]
    work_packages = treewalk.chunkify_files(
        files=files, size=self._config.get_package_size())

    # In each iteration, all workers must retrieve a work package.
    # Otherwise, the finish mechanism won't work. Pad with empty packages
    # until the count is a multiple of the number of workers.
    while len(work_packages) % self._num_workers.value:
        work_packages.append([])

    while work_packages:
        for control in self._workers:
            command = communication.Command(
                command=communication.WORKER_PACKAGE,
                data=work_packages.pop())
            control.queue_input.put(command)
        self._workers_finished.wait()
        self._workers_finished.clear()
def shutdown() -> None:
    """Send the shutdown command to the database updater.

    The command is put on the database updater's input queue; no response
    is read.
    """
    communication.database_updater_input.put(
        communication.Command(
            command=communication.DATABASE_UPDATER_SHUTDOWN, data=None))
def shutdown() -> None:
    """Send the shutdown command to the TreeWalk scheduler.

    The command is put on the scheduler's input queue; no response is
    read.
    """
    communication.scheduler_queue_input.put(
        communication.Command(
            command=communication.SCHEDULER_SHUTDOWN, data=None))