def launch(self) -> None:
    """Run the storage controller in a daemon process and record its address.

    The object currently held in ``self.storage_controller`` is handed to
    ``StorageController.run`` as the sole process argument, after which the
    attribute is rebound to the ``Process`` that runs it. The first item
    taken from ``self.status_queue`` is stored as ``self.listener_address``.
    """
    controller = self.storage_controller
    worker = Process(
        name="StorageController",
        target=StorageController.run,
        args=(controller,),
    )
    # From here on the attribute refers to the process, not the controller.
    self.storage_controller = worker
    worker.daemon = True
    worker.start()
    # NOTE(review): presumably the controller publishes its listening
    # address as the first status item - confirm against StorageController.
    self.listener_address = self.status_queue.get()
def test_child_process_logging(self, tmpdir):
    """Check that an exception logged by a child process ends up in the log file.

    A logger is started on a log file below ``tmpdir``, a daemon child
    process runs ``child_proc_logging_exception``, and the resulting file
    content is searched for the expected message.
    """
    log_file = self.get_logfile_path(str(tmpdir))
    openwpm_logger = mp_logger.MPLogger(log_file)
    # Pass the callable itself. Calling it here would execute it in the
    # parent process and hand its return value to Process as the target,
    # so the child would never run the logging code.
    child_process = Process(target=child_proc_logging_exception)
    child_process.daemon = True
    child_process.start()
    # Wait for the child to finish before closing the logger, so that the
    # child is not still emitting records after the logger was closed.
    child_process.join()
    openwpm_logger.close()
    log_content = self.get_logfile_contents(log_file)
    assert "I'm logging an exception" in log_content
def __init__(
    self,
    structured_storage: StructuredStorageProvider,
    unstructured_storage: Optional[UnstructuredStorageProvider],
) -> None:
    """Create the queues and the ``StorageController`` this handle talks to.

    Args:
        structured_storage: Forwarded as the first positional argument to
            ``StorageController``.
        unstructured_storage: Forwarded as the second positional argument
            to ``StorageController``; may be ``None``.
    """
    self.logger = logging.getLogger("openwpm")

    # Nothing is known about the listener until it has been started.
    self.listener_address: Optional[Tuple[str, int]] = None
    self.listener_process: Optional[Process] = None

    # No status has been received yet.
    self._last_status = None
    self._last_status_received: Optional[float] = None

    # The same three queues are kept on the handle and given to the
    # controller.
    status_queue = Queue()
    completion_queue = Queue()
    shutdown_queue = Queue()
    self.status_queue = status_queue
    self.completion_queue = completion_queue
    self.shutdown_queue = shutdown_queue

    self.storage_controller = StorageController(
        structured_storage,
        unstructured_storage,
        status_queue=status_queue,
        completion_queue=completion_queue,
        shutdown_queue=shutdown_queue,
    )
def test_multiprocess(self, tmpdir):
    """Log from the parent and two child processes and check the log file.

    Every expected parent and child message must occur exactly once in the
    content read back from the log file.
    """
    # Start the logger on a file below tmpdir
    log_file = self.get_logfile_path(str(tmpdir))
    openwpm_logger = mp_logger.MPLogger(log_file)

    # Spawn the two children (ids 0 and 1), one after the other
    children = []
    for child_id in range(2):
        proc = Process(target=child_proc, args=(child_id,))
        proc.daemon = True
        proc.start()
        children.append(proc)

    # Emit one parent record per level, in a fixed order
    parent_records = (
        (logger.info, PARENT_INFO_STR_1),
        (logger.error, PARENT_ERROR_STR),
        (logger.critical, PARENT_CRITICAL_STR),
        (logger.debug, PARENT_DEBUG_STR),
        (logger.warning, PARENT_WARNING_STR),
    )
    for emit, message in parent_records:
        emit(message)

    # Records sent through two differently named loggers
    first_named = logging.getLogger("test1")
    second_named = logging.getLogger("test2")
    first_named.info(NAMED_LOGGER_INFO_1)
    second_named.info(NAMED_LOGGER_INFO_2)

    # Give the records some time to be sent, then close the logger
    time.sleep(2)
    openwpm_logger.close()

    for proc in children:
        proc.join()
    print("Child processes joined...")

    log_content = self.get_logfile_contents(log_file)

    child_templates = (
        CHILD_INFO_STR_1,
        CHILD_INFO_STR_2,
        CHILD_ERROR_STR,
        CHILD_CRITICAL_STR,
        CHILD_DEBUG_STR,
        CHILD_WARNING_STR,
    )
    for child in range(2):
        for template in child_templates:
            assert log_content.count(template % child) == 1

    for _, message in parent_records:
        assert log_content.count(message) == 1
def test_child_process_with_exception(self, tmpdir):
    """Run two children with ``child_proc_with_exception`` and check the log.

    Both info messages and the exception message of each child must occur
    exactly once in the content read back from the log file.
    """
    log_file = self.get_logfile_path(str(tmpdir))
    openwpm_logger = mp_logger.MPLogger(log_file)

    # Spawn the two children (ids 0 and 1), one after the other
    children = []
    for child_id in range(2):
        proc = Process(target=child_proc_with_exception, args=(child_id,))
        proc.daemon = True
        proc.start()
        children.append(proc)

    # Give the records some time to be sent before waiting for the children
    time.sleep(2)
    for proc in children:
        proc.join()
    print("Child processes joined...")

    # The logger is only closed once both children have been joined
    openwpm_logger.close()

    log_content = self.get_logfile_contents(log_file)
    expected_templates = (
        CHILD_INFO_STR_1,
        CHILD_INFO_STR_2,
        CHILD_EXCEPTION_STR,
    )
    for child in range(2):
        for template in expected_templates:
            assert log_content.count(template % child) == 1
class StorageControllerHandle:
    """This class contains all methods relevant for the TaskManager
    to interact with the StorageController
    """

    def __init__(
        self,
        structured_storage: StructuredStorageProvider,
        unstructured_storage: Optional[UnstructuredStorageProvider],
    ) -> None:
        """Create the queues and the ``StorageController`` behind this handle.

        Args:
            structured_storage: Forwarded as the first positional argument
                to ``StorageController``.
            unstructured_storage: Forwarded as the second positional
                argument to ``StorageController``; may be ``None``.
        """
        # Set by launch() from the first item on the status queue.
        self.listener_address: Optional[Tuple[str, int]] = None
        # Not assigned anywhere else in this class.
        self.listener_process: Optional[Process] = None
        self.status_queue = Queue()
        self.completion_queue = Queue()
        self.shutdown_queue = Queue()
        # Most recent status value and the time.time() it was received at.
        self._last_status = None
        self._last_status_received: Optional[float] = None
        self.logger = logging.getLogger("openwpm")
        # Holds the controller object until launch() rebinds this attribute
        # to the Process running it.
        self.storage_controller = StorageController(
            structured_storage,
            unstructured_storage,
            status_queue=self.status_queue,
            completion_queue=self.completion_queue,
            shutdown_queue=self.shutdown_queue,
        )

    def get_next_visit_id(self) -> VisitId:
        """Generate visit id as randomly generated positive integer less than 2^53.

        Parquet can support integers up to 64 bits, but Javascript can only
        represent integers up to 53 bits:
        https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Number/MAX_SAFE_INTEGER
        Thus, we cap these values at 53 bits.
        """
        return VisitId(random.getrandbits(53))

    def get_next_browser_id(self) -> BrowserId:
        """Generate crawl id as randomly generated positive 32bit integer

        Note: Parquet's partitioned dataset reader only supports integer
        partition columns up to 32 bits.
        """
        return BrowserId(random.getrandbits(32))

    def save_configuration(
        self,
        manager_params: ManagerParamsInternal,
        browser_params: List[BrowserParamsInternal],
        openwpm_version: str,
        browser_version: str,
    ) -> None:
        """Store the task record and one crawl record per browser.

        All records are sent through a ``DataSocket`` connected to
        ``self.listener_address`` under ``INVALID_VISIT_ID``, which is
        finalized as successful at the end.

        Args:
            manager_params: Stored in the task record as ``to_json()``.
            browser_params: One crawl record is stored per entry, holding
                its ``browser_id`` and ``to_json()``.
            openwpm_version: Stored in the task record.
            browser_version: Stored in the task record.

        Raises:
            AssertionError: If ``self.listener_address`` is still ``None``.
        """
        assert self.listener_address is not None
        # NOTE(review): the socket is not explicitly closed here - confirm
        # whether DataSocket requires it.
        sock = DataSocket(self.listener_address)
        # Random 32 bit id shared by the task record and its crawl records
        task_id = random.getrandbits(32)
        sock.store_record(
            TableName("task"),
            INVALID_VISIT_ID,
            {
                "task_id": task_id,
                "manager_params": manager_params.to_json(),
                "openwpm_version": openwpm_version,
                "browser_version": browser_version,
            },
        )
        # Record browser details for each browser
        for browser_param in browser_params:
            sock.store_record(
                TableName("crawl"),
                INVALID_VISIT_ID,
                {
                    "browser_id": browser_param.browser_id,
                    "task_id": task_id,
                    "browser_params": browser_param.to_json(),
                },
            )
        sock.finalize_visit_id(INVALID_VISIT_ID, success=True)

    def launch(self) -> None:
        """Starts the storage controller

        The controller object held in ``self.storage_controller`` is passed
        to ``StorageController.run`` inside a daemon process, and the
        attribute is rebound to that process. The first item taken from the
        status queue is stored as ``self.listener_address``.
        """
        self.storage_controller = Process(
            name="StorageController",
            target=StorageController.run,
            args=(self.storage_controller,),
        )
        self.storage_controller.daemon = True
        self.storage_controller.start()
        self.listener_address = self.status_queue.get()

    def get_new_completed_visits(self) -> List[Tuple[int, bool]]:
        """
        Returns a list of all visit ids that have been processed since
        the last time the method was called and whether or not they ran
        successfully.

        This method will return an empty list in case no visit ids have
        been processed since the last time this method was called
        """
        finished_visit_ids = []
        while not self.completion_queue.empty():
            finished_visit_ids.append(self.completion_queue.get())
        return finished_visit_ids

    def shutdown(self, relaxed: bool = True) -> None:
        """Terminate the storage controller process

        Puts ``(SHUTDOWN_SIGNAL, relaxed)`` on the shutdown queue and waits
        up to 300 seconds for the process to exit.

        Args:
            relaxed: Sent along with the shutdown signal.

        Raises:
            AssertionError: If ``self.storage_controller`` is not a
                ``Process``, i.e. ``launch()`` has not rebound it.
        """
        assert isinstance(self.storage_controller, Process)
        join_timeout = 300
        self.logger.debug(
            "Sending the shutdown signal to the Storage Controller...")
        self.shutdown_queue.put((SHUTDOWN_SIGNAL, relaxed))
        start_time = time.time()
        self.storage_controller.join(join_timeout)
        if self.storage_controller.is_alive():
            # join() returns silently on timeout; make that case visible.
            self.logger.warning(
                "The Storage Controller process is still alive after "
                "waiting %d seconds for it to exit.",
                join_timeout,
            )
        self.logger.debug(
            "%s took %s seconds to close.",
            type(self).__name__,
            str(time.time() - start_time),
        )

    def get_most_recent_status(self) -> int:
        """Return the most recent queue size sent from the Storage Controller process

        Raises:
            RuntimeError: If the last status was received more than
                ``STATUS_TIMEOUT`` seconds ago, or if the first status does
                not arrive in time (see ``get_status``).
        """
        # Block until we receive the first status update
        if self._last_status is None:
            return self.get_status()

        # Drain status queue until we receive most recent update
        while not self.status_queue.empty():
            self._last_status = self.status_queue.get()
            self._last_status_received = time.time()

        # Check last status signal
        silence = time.time() - self._last_status_received
        if silence > STATUS_TIMEOUT:
            raise RuntimeError(
                "No status update from the storage controller process "
                "for %d seconds." % silence)

        return self._last_status

    def get_status(self) -> int:
        """Get listener process status. If the status queue is empty, block.

        Raises:
            RuntimeError: If no status arrives within ``STATUS_TIMEOUT``
                seconds.
        """
        try:
            self._last_status = self.status_queue.get(block=True,
                                                      timeout=STATUS_TIMEOUT)
            self._last_status_received = time.time()
        except queue.Empty:
            if self._last_status_received is None:
                # No status was ever received, so the only silence we can
                # report is the wait that just expired.
                silence = STATUS_TIMEOUT
            else:
                silence = time.time() - self._last_status_received
            raise RuntimeError(
                "No status update from the storage controller process "
                "for %d seconds." % silence)
        assert isinstance(self._last_status, int)
        return self._last_status