class DataFetcher(DataFetcherBase):
    """Data fetcher reacting on data sent by another hidra instance.

    Receives data chunks via a hidra Transfer stream and passes them on
    to the configured targets and/or stores them locally.
    """

    def __init__(self, datafetcher_base_config):
        """Initial setup.

        Checks if all required parameters are set in the configuration.

        Args:
            datafetcher_base_config (dict): Configuration handed to the
                base class (contains general and module specific parts).
        """

        # open file descriptors of partially written files,
        # keyed by target file path
        self.f_descriptors = dict()
        self.transfer = None

        DataFetcherBase.__init__(self, datafetcher_base_config, name=__name__)

        # base class sets
        #   self.config_all - all configurations
        #   self.config_df - the config of the datafetcher
        #   self.config - the module specific config
        #   self.df_type - the name of the datafetcher module
        #   self.log_queue
        #   self.log

        # metadata and payload of the most recently received chunk
        self.metadata_r = None
        self.data_r = None

        self.set_required_params()

        # check that the required_params are set inside of module specific
        # config
        self.check_config()
        self._setup()

    def set_required_params(self):
        """Defines the parameters to be in configuration to run this data
        fetcher.

        Depending if on Linux or Windows other parameters are required.
        """

        self.required_params = {
            "network": ["ext_ip"],
        }

        df_params = [
            "status_check_resp_port",
            "confirmation_resp_port",
            "context"
        ]

        if utils.is_windows():
            # Windows has no IPC sockets -> a TCP port is needed instead
            df_params += ["datafetcher_port"]
        else:
            self.required_params["network"] += ["ipc_dir", "main_pid"]

        self.required_params["datafetcher"] = [
            "store_data",
            {self.df_type: df_params}
        ]

    def _setup(self):
        """Sets up and configures the transfer.
        """

        self.transfer = Transfer("STREAM", use_log=self.log_queue)

        config_net = self.config_all["network"]

        endpoint = "{}_{}".format(config_net["main_pid"], "out")

        self.transfer.start([config_net["ipc_dir"], endpoint],
                            protocol="ipc",
                            data_con_style="connect")

        # enable status check requests from any sender
        self.transfer.setopt(
            option="status_check",
            value=[
                config_net["ext_ip"],
                self.config["status_check_resp_port"]
            ]
        )

        # enable confirmation reply if this is requested in a received data
        # packet
        self.transfer.setopt(
            option="confirmation",
            value=[
                config_net["ext_ip"],
                self.config["confirmation_resp_port"]
            ]
        )

    def get_metadata(self, targets, metadata):
        """Implementation of the abstract method get_metadata.

        Args:
            targets (list): The target list this file is supposed to go.
            metadata (dict): The dictionary with the metadata to extend.
        """

        # receive timeout in ms
        timeout = 10000

        # Get new data
        self.metadata_r, self.data_r = self.transfer.get(timeout)

        if (metadata["relative_path"] != self.metadata_r["relative_path"]
                or metadata["source_path"] != self.metadata_r["source_path"]
                or metadata["filename"] != self.metadata_r["filename"]):
            self.log.error("Received metadata do not match data")

        # Use received data to prevent mismatch of metadata and data
        # TODO handle case if file type requested by target does not match

        # pylint: disable=attribute-defined-outside-init

        # Build source file
        self.source_file = generate_filepath(self.metadata_r["source_path"],
                                             self.metadata_r)

        # Build target file
        # if local_target is not set (== None) generate_filepath returns None
        self.target_file = generate_filepath(self.config_df["local_target"],
                                             self.metadata_r)

        # Extends metadata; missing entries are filled with local fallbacks
        if targets:
            if "filesize" not in self.metadata_r:
                self.log.error("Received metadata do not contain 'filesize'")
            if "file_mod_time" not in self.metadata_r:
                self.log.error("Received metadata do not contain "
                               "'file_mod_time'. Setting it to current time")
                self.metadata_r["file_mod_time"] = time.time()
            if "file_create_time" not in self.metadata_r:
                self.log.error("Received metadata do not contain "
                               "'file_create_time'. Setting it to current "
                               "time")
                self.metadata_r["file_create_time"] = time.time()
            if "chunksize" not in self.metadata_r:
                self.log.error("Received metadata do not contain 'chunksize'. "
                               "Setting it to locally configured one")
                self.metadata_r["chunksize"] = self.config_df["chunksize"]

    def send_data(self, targets, metadata, open_connections):
        """Implementation of the abstract method send_data.

        Args:
            targets (list): The target list this file is supposed to go.
            metadata (dict): The dictionary with the metadata of the file
            open_connections (dict): The dictionary containing all open zmq
                                     connections.
        """
        # pylint: disable=unused-argument

        if not targets:
            return

        # targets are of the form [[<host:port>, <prio>, <metadata|data>], ...]
        targets_data = [i for i in targets if i[2] == "data"]

        if not targets_data:
            return

        self.log.debug("Received data for file %s (chunknumber %s)",
                       self.source_file, self.metadata_r["chunk_number"])

        self.log.debug("Passing multipart-message for file '%s'...",
                       self.source_file)

        try:
            chunk_payload = [
                json.dumps(self.metadata_r).encode("utf-8"),
                self.data_r
            ]
        except Exception:
            self.log.error("Unable to pack multipart-message for file "
                           "'%s'", self.source_file, exc_info=True)
            return

        # send message to data targets
        # NOTE: the previous code had separate, byte-identical handlers for
        # DataError and Exception; a single broad handler covers both
        try:
            self.send_to_targets(targets=targets_data,
                                 open_connections=open_connections,
                                 metadata=None,
                                 payload=chunk_payload,
                                 chunk_number=self.metadata_r["chunk_number"])
        except Exception:
            self.log.error(
                "Unable to send multipart-message for file '%s' (chunk %s)",
                self.source_file, self.metadata_r["chunk_number"],
                exc_info=True)

    def finish(self, targets, metadata, open_connections):
        """Implementation of the abstract method finish.

        Args:
            targets (list): The target list this file is supposed to go.
            metadata (dict): The dictionary with the metadata of the file
            open_connections (dict): The dictionary containing all open zmq
                                     connections.
        """

        # targets are of the form [[<host:port>, <prio>, <metadata|data>], ...]
        targets_metadata = [i for i in targets if i[2] == "metadata"]

        # send message to metadata targets
        if targets_metadata:
            try:
                self.send_to_targets(targets=targets_metadata,
                                     open_connections=open_connections,
                                     metadata=metadata,
                                     payload=None,
                                     chunk_number=None,
                                     timeout=self.config["send_timeout"])
                self.log.debug(
                    "Passing metadata multipart-message for file "
                    "%s...done.", self.source_file)
            except Exception:
                # BUG FIX: the format string was missing a space between
                # "file" and "'%s'"
                self.log.error(
                    "Unable to send metadata multipart-message for file "
                    "'%s' to '%s'", self.source_file, targets_metadata,
                    exc_info=True)

        # store data
        if self.config_df["store_data"]:
            try:
                # TODO: save message to file using a thread (avoids blocking)
                self.transfer.store_chunk(
                    descriptors=self.f_descriptors,
                    filepath=self.target_file,
                    payload=self.data_r,
                    base_path=self.config_df["local_target"],
                    metadata=self.metadata_r
                )
            except Exception:
                self.log.error(
                    "Storing multipart message for file '%s' failed",
                    self.source_file, exc_info=True)

    def stop(self):
        """Implementation of the abstract method stop.
        """

        # Close base class zmq sockets
        self.close_socket()

        # Close open file handler to prevent file corruption
        for target_file in list(self.f_descriptors):
            self.f_descriptors[target_file].close()
            del self.f_descriptors[target_file]

        # Close zmq sockets
        if self.transfer is not None:
            self.transfer.stop()
class DataReceiver(object):
    """Receives data and stores it to disc using the hidra API.
    """

    def __init__(self):
        # NOTE: the previous code initialized self.transfer and
        # self.checking_thread twice; the duplicates were removed
        self.transfer = None
        self.checking_thread = None
        self.timeout = None

        self.config = None

        self.log = None
        self.dirs_not_to_create = None
        self.lock = None
        self.target_dir = None
        self.data_ip = None
        self.data_port = None
        self.plugin_handler = None

        self.run_loop = True

        self.setup()

        self.exec_run()

    def setup(self):
        """Initializes parameters, logging and transfer object.
        """

        global _whitelist

        try:
            self.config = argument_parsing()
        except Exception:
            # make sure a logger exists so the caller can report the failure
            self.log = logging.getLogger("DataReceiver")
            raise

        config_gen = self.config["general"]
        config_recv = self.config["datareceiver"]

        # change user
        user_info, user_was_changed = utils.change_user(config_gen)

        # set up logging
        utils.check_writable(config_gen["log_file"])
        self._setup_logging()

        utils.log_user_change(self.log, user_was_changed, user_info)

        # set process name
        # pylint: disable=no-member
        setproctitle.setproctitle(config_gen["procname"])

        self.log.info("Version: %s", __version__)

        self.dirs_not_to_create = config_gen["dirs_not_to_create"]

        # for proper clean up if kill is called
        signal.signal(signal.SIGTERM, self.signal_term_handler)

        self.timeout = 2000
        self.lock = threading.Lock()

        # optional settings fall back to defaults when not configured
        ldap_retry_time = config_gen.get("ldap_retry_time", 10)
        check_time = config_gen.get("netgroup_check_time", 2)

        if config_gen["whitelist"] is not None:
            self.log.debug("config_gen['whitelist']=%s",
                           config_gen["whitelist"])

            with self.lock:
                _whitelist = utils.extend_whitelist(config_gen["whitelist"],
                                                    config_gen["ldapuri"],
                                                    self.log)
            self.log.info("Configured whitelist: %s", _whitelist)
        else:
            _whitelist = None

        # only start the thread if a netgroup was configured
        if (config_gen["whitelist"] is not None
                and isinstance(config_gen["whitelist"], str)):
            self.log.debug("Starting checking thread")
            try:
                self.checking_thread = CheckNetgroup(config_gen["whitelist"],
                                                     self.lock,
                                                     config_gen["ldapuri"],
                                                     ldap_retry_time,
                                                     check_time)
                self.checking_thread.start()
            except Exception:
                self.log.error("Could not start checking thread",
                               exc_info=True)
        else:
            self.log.debug("Checking thread not started: %s",
                           config_gen["whitelist"])

        self.target_dir = os.path.normpath(config_recv["target_dir"])
        self.data_ip = config_recv["data_stream_ip"]
        self.data_port = config_recv["data_stream_port"]

        self.log.info("Writing to directory '%s'", self.target_dir)

        self.transfer = Transfer(connection_type="STREAM",
                                 use_log=True,
                                 dirs_not_to_create=self.dirs_not_to_create)

        self._load_plugin()

    def _setup_logging(self):
        """Configures the root logger and creates the module logger."""

        config_gen = self.config["general"]

        # enable logging
        root = logging.getLogger()
        root.setLevel(logging.DEBUG)

        handlers = utils.get_log_handlers(config_gen["log_file"],
                                          config_gen["log_size"],
                                          config_gen["verbose"],
                                          config_gen["onscreen"])

        # get_log_handlers returns either one handler or a tuple of them
        if isinstance(handlers, tuple):
            for hdl in handlers:
                root.addHandler(hdl)
        else:
            root.addHandler(handlers)

        self.log = logging.getLogger("DataReceiver")

    def _load_plugin(self):
        """Instantiates the plugin handler when a plugin is configured."""

        try:
            plugin_name = self.config["datareceiver"]["plugin"]
            plugin_config = self.config[plugin_name]
        except KeyError:
            self.log.debug("No plugin specified")
            return

        self.plugin_handler = PluginHandler(plugin_name,
                                            plugin_config,
                                            self.target_dir,
                                            self.log)

    def exec_run(self):
        """Wrapper around run to react to exceptions.
        """

        try:
            self.run()
        except KeyboardInterrupt:
            pass
        except Exception:
            self.log.error("Stopping due to unknown error condition",
                           exc_info=True)
            raise
        finally:
            self.stop()

    def run(self):
        """Start the transfer and store the data.
        """

        global _whitelist  # pylint: disable=global-variable-not-assigned
        global _changed_netgroup

        if self.plugin_handler is not None:
            plugin_type = self.plugin_handler.get_data_type()
            self.plugin_handler.start()
        else:
            plugin_type = None

        try:
            self.transfer.start([self.data_ip, self.data_port], _whitelist)
        except Exception:
            self.log.error("Could not initiate stream", exc_info=True)
            # no data was received yet, nothing is left to be stored
            self.stop(store=False)
            raise

        # enable status check requests from any sender
        self.transfer.setopt("status_check")
        # enable confirmation reply if this is requested in a received data
        # packet
        self.transfer.setopt("confirmation")

        self.log.debug("Waiting for new messages...")
        self.run_loop = True

        # run loop, and wait for incoming messages
        while self.run_loop:
            if _changed_netgroup:
                self.log.debug("Re-registering whitelist")
                self.transfer.register(_whitelist)

                # reset flag
                with self.lock:
                    _changed_netgroup = False

            try:
                ret_val = self.transfer.store(
                    target_base_path=self.target_dir,
                    timeout=self.timeout,
                    return_type=plugin_type
                )
            except KeyboardInterrupt:
                break
            except Exception:
                self.log.error("Storing data...failed.", exc_info=True)
                raise

            if self.plugin_handler is None or ret_val is None:
                continue

            try:
                self.plugin_handler.put(ret_val)
                # ret_val might have been mutated by the plugin and therefore
                # should only be reused if this is acceptable
            except Exception:
                # include the traceback so plugin failures are debuggable
                self.log.error("Cannot submit message to plugin",
                               exc_info=True)

    def stop(self, store=True):
        """Stop threads, close sockets and cleans up.

        Args:
            store (optional, bool): Run a little longer to store remaining
                                    data.
        """

        self.run_loop = False

        if self.transfer is not None:
            self.transfer.status = [b"ERROR", "receiver is shutting down"]

            if store:
                # grace period (in seconds) to collect remaining data
                stop_timeout = 0.5
                start_time = time.time()
                # BUG FIX: the elapsed time was previously converted to
                # milliseconds but compared to a threshold in seconds, so
                # the loop terminated ~1000x too early
                diff_time = time.time() - start_time

                self.log.debug("Storing remaining data.")
                while diff_time < stop_timeout:
                    try:
                        self.log.debug("Storing remaining data...")
                        self.transfer.store(self.target_dir, self.timeout)
                    except Exception:
                        self.log.error("Storing data...failed.",
                                       exc_info=True)
                    diff_time = time.time() - start_time

            self.log.info("Shutting down receiver...")
            self.transfer.stop()
            self.transfer = None

        if self.plugin_handler is not None:
            self.plugin_handler.stop()

        if self.checking_thread is not None:
            self.checking_thread.stop()
            self.checking_thread.join()
            self.log.debug("checking_thread stopped")
            self.checking_thread = None

    # pylint: disable=unused-argument
    def signal_term_handler(self, signal_to_react, frame):
        """React on external SIGTERM signal.
        """

        self.log.debug('got SIGTERM')
        self.stop()

    def __exit__(self, exception_type, exception_value, traceback):
        self.stop()

    def __del__(self):
        self.stop()