def __init__(self, engine_conn, processors=4, buffer_roll=0,
             buffer_max_batch=50, buffer_max_seconds=1, test_mode=False,
             test_outfile='engine_test_output/engine_test_output'):
    """
    Initializes with empty buffers & queue and sets the number of processors.

    :param processors: number of processors to start
    :type processors: int
    """
    logger.info("Initializing EngineThread")
    super().__init__()
    self.test_run = test_mode
    self.test_outfile = test_outfile
    self.test_batches = {}
    self.pipe_conn = engine_conn
    self.buffers_out_q = JoinableQueue()
    self.number_of_processors = processors
    self.processors = []
    self.run_engine = False
    self.buffer_record_limit = int(buffer_max_batch)
    self.buffer_time_limit_s = float(buffer_max_seconds)
    self.buffers = {}
    self.buffer_in_qs = {}
    self.buffer_workers = {}
    self.data_pullers = {}
    self.buffer_roll = -buffer_roll
    if buffer_roll > 0:
        self.buffer_roll_index = -buffer_roll
    else:
        self.buffer_roll_index = None
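# Hedged usage sketch for the constructor above. The class name EngineThread and
# the threading.Thread base class are assumptions inferred from the log messages
# and the run() method below; the Pipe wiring is assumed from pipe_conn.poll()/
# recv() usage, and dstream_dict is a hypothetical template dict.
#
#     from multiprocessing import Pipe
#
#     server_conn, engine_conn = Pipe()
#     engine = EngineThread(engine_conn, processors=2, buffer_roll=1,
#                           buffer_max_batch=10, buffer_max_seconds=0.5)
#     engine.start()                             # Thread.start() -> run()
#     server_conn.send((dstream_dict, "new"))    # register a template (see run())
#     server_conn.send((dstream_dict, "load"))   # push a record into its buffer
#     server_conn.send("stop_poison_pill")       # ask the engine loop to stop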
def parse_args(self, args=None):
    logger.info("Parsing args for configuration params. args = %s"
                % (args if args is not None else sysargs[1:]))
    try:
        parser = ArgumentParser()
        parser.add_argument("-f", "--config_file", action="append", dest="files")
        parser.add_argument("-o", "--option", action="append", dest="options")
        parser.add_argument("-D", "--param", action="append", dest="options")
        if args is None:
            args, unknown = parser.parse_known_args()
        else:
            args, unknown = parser.parse_known_args(args)
        files = args.files
        options = args.options
        if files is not None and isinstance(files, list):
            for file in files:
                try:
                    self.load(file)
                except Exception:
                    logger.warn("Error in loading configuration file: %s" % file)
        if options is not None and isinstance(options, list):
            for option in options:
                try:
                    key, value = self.__read_option_keyvalue(option)
                    self.set(key, value)
                except Exception:
                    logger.warn("Could not load configuration argument: %s" % option)
    except (ArgumentError, ArgumentTypeError) as err:
        logger.warn("Bad configuration arguments. %s" % str(err))
        raise
    except Exception:
        logger.warn("Unknown configuration argument error.")
        raise
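# Hedged usage sketch for parse_args above. The Configuration class name and the
# "key=value" form of -o/-D values are assumptions (the parsing is done by
# __read_option_keyvalue, not shown here); the flag names are confirmed by the code.
#
#     cfg = Configuration()
#     cfg.parse_args(["-f", "defaults.conf",
#                     "-o", "server.port=8080",
#                     "-D", "engine.processors=2"])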
def define():
    """
    Collect template for DStream init and return stream_token.
    Expects 'template' argument containing user-formatted template.
    """
    tk['define'].start()
    args = srv.parse()
    template = args['template']  # dstream template
    cur_dstream = srv._dstream_new()
    logger.info("stream token is: {}".format(str(cur_dstream['stream_token'])))
    try:
        tk['define : try (template loading/processing)'].start()
        json_template = json.loads(template)
        logger.debug("define: json.loads done")
        cur_dstream.load_from_json(json_template)
        # sends template to engine to init buffer stuff + data puller, if applicable
        srv.server_conn.send((cur_dstream, "new"))
        logger.debug("define: dstream.load_from_json done")
        template_df = srv.coordinator.process_template(cur_dstream)
        srv.storage_queue.put(('template', template_df))
        logger.debug("define: coordinator.process_template done")
        tk['define : try (template loading/processing)'].stop()
    except Exception as ex:
        logger.warning(
            "Server Error in define: Template loading/processing - {}".format(ex))
        # bad_resp = Response(ex, 400)
        # bad_resp.headers['Access-Control-Allow-Origin'] = '*'
        # return bad_resp
        return '{}'.format(ex), 400
    else:
        print(f"Created template for stream {cur_dstream['stream_token']}")
        resp = Response(str(cur_dstream['stream_token']), 200)
        resp.headers['Access-Control-Allow-Origin'] = '*'
        tk['define'].stop()
        return resp
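# Hedged client-side sketch for the define endpoint above. The route path
# "/api/define" is a placeholder (the route decorator is not shown here); only
# the 'template' argument and the stream_token/error responses are confirmed by
# the code, and the template fields shown are illustrative.
#
#     import json
#     import requests
#
#     template = {"stream_name": "demo_stream"}   # illustrative template content
#     resp = requests.post("http://localhost:5000/api/define",
#                          data={"template": json.dumps(template)})
#     stream_token = resp.text    # 200 -> token string, 400 -> error message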
def get(self, option, section=None, default=None):
    try:
        if section is None:
            option, section = self.__get_option_name(option)
        value = self._cfg.get(section, option)
    except (NoSectionError, NoOptionError, ConfigParserGeneralError) as err:
        logger.info("Configuration parameter didn't exist, returning the default value. %s"
                    % str(err))
        return default
    logger.debug("Read configuration parameter: (section=%s) %s=%s"
                 % (section, option, value))
    return value
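# Hedged usage sketch for get() above. The dotted "section.option" addressing is
# an assumption based on __get_option_name (not shown here); the key names are
# illustrative.
#
#     port = cfg.get("server.port", default=8080)                    # section parsed from the name
#     host = cfg.get("host", section="server", default="localhost")  # section given explicitly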
def stop_engine(self):
    self.pipe_conn.close()
    if self.run_engine is True:
        self.run_engine = False
        for p in self.data_pullers.keys():
            self.data_pullers[p].pulling = False
        logger.info(self.buffers_out_q.qsize())
        self.buffers_out_q.join()
        logger.info("Queue joined")
        for p in self.processors:
            logger.info("Putting poison pills in Q")
            self.buffers_out_q.put("666_kIlL_thE_pROCess_666")
        logger.info("Poison pills done")
        for p in self.processors:
            p.join()
            logger.info("Engine shutdown- processor joined")
        print("done")
def run(self):
    """
    Main engine loop. Initializes the processors and numpy array buffers, then
    polls the pipe and routes incoming items into per-stream buffer queues
    until stopped.
    """
    self._init_processors()
    self.run_engine = True

    while self.run_engine:
        if self.pipe_conn.poll():
            item = self.pipe_conn.recv()
            # branch 2 - stop engine
            if item == "stop_poison_pill":
                for q in self.buffer_in_qs.keys():
                    self.buffer_in_qs[q].put("stop_buffer_worker")
                self.run_engine = False
                break
            # branch 1 - engine running, good data
            elif type(item) is tuple:
                partition_key = item[0]['stream_token']
                new_buffer = self._new_buffer(partition_key)
                if new_buffer:
                    logger.info(f"Initialized buffer for stream {partition_key}")
                if item[1] == "new":
                    if item[0]["data_rules"]["pull"] is True:
                        new_puller = self._new_data_puller(partition_key, item[0])
                        if new_puller:
                            print(f"Initialized data puller for stream {partition_key}")
                        else:
                            logger.warn(
                                f"Attempting to initialize data puller for stream "
                                f"{partition_key} - puller already exists")
                elif item[1] == "load":
                    self.buffer_in_qs[partition_key].put(item[0])
                else:
                    raise TypeError(
                        "Invalid tuple in pipe - index 1 must be str 'load' or str 'new'")
            else:
                raise TypeError("Invalid item in pipe")

    logger.info("Terminating Engine Thread")
    self.stop_engine()
def __init__(self, *args, **kwargs):
    logger.debug("Initialize RuleDict")
    self.update(*args, **kwargs)
    if "expected_keys" in kwargs:
        self.expected_keys = kwargs["expected_keys"]
    else:
        raise KeyError("no expected keys set")
    bad_keys = []
    for key in self.keys():
        if key not in self.expected_keys:
            bad_keys.append(key)
            logger.warning("non expected key found: %s" % (key))
    for key in bad_keys:
        del self[key]
    for key in self.expected_keys:
        if key not in self:
            logger.info("No value supplied for %s, setting to None" % (key))
            self[key] = None
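# Hedged usage sketch for RuleDict above (the key names are illustrative; only
# the required expected_keys argument and the drop/fill behavior are confirmed
# by the code):
#
#     rules = RuleDict({"pull": True, "unexpected": 1},
#                      expected_keys=["pull", "date_format"])
#     # "unexpected" is dropped with a warning; "date_format" is filled with None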
def __load(self, file_name):
    logger.info("Loading configuration file: %s" % file_name)
    try:
        with open(file_name, 'r') as fp:
            try:
                self._cfg.read_file(fp)
                self._loaded_files.append(file_name)
                logger.info("Configuration loaded successfully.")
            except (ParsingError, NoSectionError, MissingSectionHeaderError,
                    ConfigParserGeneralError) as err:
                logger.warn("Bad configuration file. File: %s. Error: %s"
                            % (file_name, str(err)))
    except (PermissionError, FileNotFoundError) as err:
        logger.info("Could not read configuration file: %s. %s" % (file_name, str(err)))
def __init__(self, *args, **kwargs): """ Intializes an event dict. Can either be created empty or from an existing dict. Empty creation creates all necessary keys in the dict with 0 values Creating from an existing dict keeps all expected key value pair from the input dict and discards the rest. """ self.update(*args, **kwargs) logger.debug("initializing event") expected_keys = [ "event_name", "event_rules", "timestamp", "stream_token", "event_context" ] bad_keys = [] for key in self.keys(): if not key in expected_keys: bad_keys.append(key) logger.debug("non-expected key found: %s" % (key)) for key in bad_keys: del self[key] if not "event_name" in self.keys(): logger.info("No event_name found") self["event_name"] = "" if not "event_rules" in self.keys(): logger.info("No event_rules found") self["event_rules"] = {} if not "timestamp" in self.keys(): logger.debug("no timestamp supplied") self["timestamp"] = 0 if not "stream_token" in self.keys(): logger.info("no stream_token supllied") self["stream_token"] = "" if not "event_context" in self.keys(): logger.debug("No context") self["event_context"] = {}
def store(self, file_name=DEFAULT_FILE):
    logger.info("Saving configuration to file: %s" % file_name)
    with open(file_name, 'w') as f:
        self._cfg.write(f)
def _validate_context(self):
    if not isinstance(self.context, Context):
        raise TypeError("Invalid data puller context")
    else:
        logger.info("Validated data puller context")
def _validate_reader(self):
    if not isinstance(self.source_reader, SourceReader):
        raise TypeError("Invalid source reader")
    else:
        logger.info("Validated source reader")
def run(self):
    self.pulling = True
    self.pulling_start = datetime.now()
    while self.pulling:
        self.source_reader.read_input()
    logger.info("Quitting puller")
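# Note: the loop above stops cooperatively. stop_engine() sets pulling = False
# on every registered puller, so the while loop exits after the current
# read_input() call returns.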
def run_buffer(self, partition_key):
    last_col = self.buffer_record_limit - 1
    last_row = self.number_of_processors - 1
    cur_row = 0
    cur_col = 0
    batch_tracker = {'start_time': time(), 'leftos_collected': False}

    while self.run_engine:
        try:
            item = self.buffer_in_qs[partition_key].get(
                timeout=self.buffer_time_limit_s)
            # branch 2 - stop engine
            if item == "stop_buffer_worker":
                break
            # branch 1 - engine running, good data
            elif isinstance(item, DStream) or (
                    type(item) is dict and "stream_token" in item.keys()):
                if "data_rules" in item.keys():  # some unit test data doesn't have this field
                    if "date_format" in item["data_rules"].keys():
                        if item["data_rules"]["date_format"] is not None:
                            item["timestamp"] = datetime.strptime(
                                item["timestamp"],
                                item["data_rules"]["date_format"]).timestamp()
                # branch 1.1 - not last row
                if cur_row < last_row:
                    # branch 1.1a - not last column, continue row
                    if cur_col < last_col:
                        logger.info("Buffering- row {}".format(cur_row))
                        self.buffers[partition_key][cur_row, cur_col] = item
                        cur_col += 1
                    # branch 1.1b - last column, start new row
                    else:
                        self.buffers[partition_key][cur_row, cur_col] = item
                        if self.test_run:
                            self.buffers_out_q.put((
                                self.buffers[partition_key][cur_row].copy(),
                                f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt"))
                            self.test_batches[partition_key] += 1
                        else:
                            self.buffers_out_q.put(
                                self.buffers[partition_key][cur_row].copy())
                        logger.info("New batch queued")
                        # copy the roll window (last buffer_roll items) into the
                        # first columns of the next row
                        roll_window = self.buffers[partition_key][
                            cur_row, self.buffer_roll_index:]
                        cur_row += 1
                        for i in range(abs(self.buffer_roll)):
                            self.buffers[partition_key][cur_row, i] = roll_window[i]
                        cur_col -= cur_col + self.buffer_roll  # REMOVE
                        batch_tracker['start_time'] = time()
                # branch 1.2 - last row
                else:
                    # branch 1.2a - not last column, continue row
                    if cur_col < last_col:
                        self.buffers[partition_key][cur_row, cur_col] = item
                        cur_col += 1
                    # branch 1.2b - last column, return to first row in new cycle
                    else:
                        self.buffers[partition_key][cur_row, cur_col] = item
                        if self.test_run:
                            self.buffers_out_q.put((
                                self.buffers[partition_key][cur_row].copy(),
                                f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt"))
                            self.test_batches[partition_key] += 1
                        else:
                            self.buffers_out_q.put(
                                self.buffers[partition_key][cur_row].copy())
                        roll_window = self.buffers[partition_key][
                            cur_row, self.buffer_roll_index:]
                        cur_row -= cur_row
                        for i in range(abs(self.buffer_roll)):
                            self.buffers[partition_key][cur_row, i] = roll_window[i]
                        cur_col -= cur_col + self.buffer_roll
                        batch_tracker['start_time'] = time()
                        batch_tracker['leftos_collected'] = False
            # branch 3 - bad data
            else:
                raise TypeError("Queued item is not valid dictionary.")
        except Exception:
            # buffer time max reached, engine still running
            logger.info("Buffer batch timeout exceeded")
            if self.run_engine is True:
                # engine running, batch timeout with new buffer data (partial row)
                if cur_col > abs(self.buffer_roll) and batch_tracker['leftos_collected'] is False:
                    logger.info(
                        "Collecting leftovers- pushing partial batch to queue after batch timeout")
                    if self.test_run:
                        self.buffers_out_q.put((
                            self.buffers[partition_key][cur_row, :cur_col].copy(),
                            f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt"))
                        self.test_batches[partition_key] += 1
                    else:
                        self.buffers_out_q.put(
                            self.buffers[partition_key][cur_row, :cur_col].copy())
                    if cur_row < last_row:
                        cur_row += 1
                    else:
                        cur_row -= cur_row
                    cur_col -= cur_col
                    batch_tracker['start_time'] = time()
                    batch_tracker['leftos_collected'] = True
                # leftovers already collected
                else:
                    logger.info("No new data- resetting batch timer")
                    batch_tracker['start_time'] = time()
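# Worked example of the roll arithmetic in run_buffer above (values are
# illustrative). With buffer_max_batch=5 and buffer_roll=2, __init__ stores
# self.buffer_roll = -2 and self.buffer_roll_index = -2. After a full row
# [a, b, c, d, e] is queued, its last two items (d, e) are copied into
# columns 0 and 1 of the next row, and
#
#     cur_col -= cur_col + self.buffer_roll    # cur_col = 4 - (4 + (-2)) = 2
#
# resets cur_col to the roll size, so the next incoming item lands in column 2
# and the new batch overlaps the previous one by two records.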