Example #1
 def __init__(self,
              engine_conn,
              processors=4,
              buffer_roll=0,
              buffer_max_batch=50,
              buffer_max_seconds=1,
              test_mode=False,
              test_outfile='engine_test_output/engine_test_output'):
     """
     Initializes with empty buffer & queue,
      set # of processors...
     :param processors: number of processors to start
     :type processors: int
     """
     logger.info("Initializing EngineThread")
     super().__init__()
     self.test_run = test_mode
     self.test_outfile = test_outfile
     self.test_batches = {}
     self.pipe_conn = engine_conn
     self.buffers_out_q = JoinableQueue()
     self.number_of_processors = processors
     self.processors = []
     self.run_engine = False
     self.buffer_record_limit = int(buffer_max_batch)
     self.buffer_time_limit_s = float(buffer_max_seconds)
     self.buffers = {}
     self.buffer_in_qs = {}
     self.buffer_workers = {}
     self.data_pullers = {}
     self.buffer_roll = -buffer_roll
     if buffer_roll > 0:
         self.buffer_roll_index = -buffer_roll
     else:
         self.buffer_roll_index = None
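
The roll settings stored at the end of the constructor drive the batch overlap used later by the buffer workers. A minimal standalone sketch (array contents and sizes are illustrative, not from the original module) of how a negative buffer_roll_index carries the tail of one batch into the next:

import numpy as np

buffer_roll = 2
buffer_roll_index = -buffer_roll              # same negative-index trick as above

batch = np.array([10, 11, 12, 13, 14])        # a filled buffer row
roll_window = batch[buffer_roll_index:]       # last 2 records: [13, 14]
next_row = np.zeros(5, dtype=int)
next_row[:buffer_roll] = roll_window          # overlap carried into the next batch
print(next_row)                               # [13 14  0  0  0]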
Example #2
 def parse_args(self, args=None):
     logger.info("Parsing args for configuration params. args = %s" % (args if args is not None else sysargs[1:]))
     try:
         parser = ArgumentParser()
         parser.add_argument("-f", "--config_file", action="append", dest="files")
         parser.add_argument("-o", "--option", action="append", dest="options")
         parser.add_argument("-D", "--param", action="append", dest="options")
         if args is None:
             args, unknown = parser.parse_known_args()
         else:
             args, unknown = parser.parse_known_args(args)
         files = args.files
         options = args.options
         if files is not None and isinstance(files, list):
             for file in files:
                 try:
                     self.load(file)
                 except Exception:
                     logger.warning("Error in loading configuration file: %s" % file)
         if options is not None and isinstance(options, list):
             for option in options:
                 try:
                     key, value = self.__read_option_keyvalue(option)
                     self.set(key, value)
                 except Exception:
                     logger.warning("Could not load configuration argument: %s" % option)
     except (ArgumentError, ArgumentTypeError) as err:
         logger.warn("Bad configuration arguments. %s" % err.message)
         raise
     except:
         logger.warn("Unknown configuration argument error.")
         raise
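
For reference, a self-contained sketch of the argparse pattern this method relies on: "append" actions collect repeated flags into lists, and parse_known_args() tolerates flags the parser does not know about (the sample values, including the key=value option format, are invented for illustration):

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("-f", "--config_file", action="append", dest="files")
parser.add_argument("-o", "--option", action="append", dest="options")
parser.add_argument("-D", "--param", action="append", dest="options")

args, unknown = parser.parse_known_args(
    ["-f", "a.cfg", "-f", "b.cfg", "-o", "log.level=DEBUG", "--something-else"])
print(args.files)    # ['a.cfg', 'b.cfg']
print(args.options)  # ['log.level=DEBUG']
print(unknown)       # ['--something-else']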
Example #3
def define():
    """ Collect template for DStream init and return stream_token.
    Expects 'template' argument containing user-formatted template.
    """
    tk['define'].start()
    args = srv.parse()
    template = args['template']  #   dstream template
    cur_dstream = srv._dstream_new()
    logger.info("stream token is: {}".format(str(cur_dstream['stream_token'])))
    try:
        tk['define : try (template loading/processing)'].start()
        json_template = json.loads(template)
        logger.debug("define: json.loads done")
        cur_dstream.load_from_json(json_template)
        # sends template to engine to init buffer stuff + data puller, if applicable
        srv.server_conn.send((cur_dstream, "new"))
        logger.debug("define: dstream.load_from_json done")
        template_df = srv.coordinator.process_template(cur_dstream)
        srv.storage_queue.put(('template', template_df))
        logger.debug("define: coordinator.process-template done")
        tk['define : try (template loading/processing)'].stop()
    except Exception as ex:
        logger.warning(
            "Server Error in define: Template loading/processing - {}".format(
                ex))
        # bad_resp = Response(ex, 400)
        # bad_resp.headers['Access-Control-Allow-Origin']='*'
        # return bad_resp
        return '{}'.format(ex), 400
    else:
        print(f"Created template for stream {cur_dstream['stream_token']}")
        resp = Response(str(cur_dstream['stream_token']), 200)
        resp.headers['Access-Control-Allow-Origin'] = '*'
        tk['define'].stop()
        return resp
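
The endpoint returns either a Response object with a CORS header or a (body, status) tuple on failure, which matches the Flask convention. A minimal sketch assuming Flask, where the route, app object, and build_template helper are hypothetical stand-ins:

from flask import Flask, Response

app = Flask(__name__)

def build_template():
    return "0123-abcd"                            # placeholder stream token

@app.route("/define", methods=["POST"])
def define_sketch():
    try:
        token = build_template()                  # stand-in for template loading/processing
    except Exception as ex:
        return '{}'.format(ex), 400               # (body, status) tuple -> error response
    resp = Response(token, 200)                   # success: token in the body
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp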
Example #4
 def get(self, option, section=None, default=None):
     try:
         if section is None:
             option, section = self.__get_option_name(option)
         value = self._cfg.get(section, option)
     except (NoSectionError, NoOptionError, ConfigParserGeneralError) as err:
         logger.info("Configuration parameter didn't exist, returning the default value." % err.message)
         return default
     logger.debug("Read configuration parameter: (section=%s) %s=%s" % (section, option, value))
     return value
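
A self-contained sketch of the fallback behaviour this wrapper adds on top of configparser (section and option names are invented for the example):

from configparser import ConfigParser, NoSectionError, NoOptionError

cfg = ConfigParser()
cfg.read_string("[server]\nport = 8000\n")

def get_with_default(option, section, default=None):
    try:
        return cfg.get(section, option)
    except (NoSectionError, NoOptionError):
        return default                            # missing parameter -> default value

print(get_with_default("port", "server"))              # '8000'
print(get_with_default("host", "server", "0.0.0.0"))   # '0.0.0.0'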
Example #5
 def stop_engine(self):
     self.pipe_conn.close()
     if self.run_engine is True:
         self.run_engine = False
     for p in self.data_pullers.keys():
         self.data_pullers[p].pulling = False
     logger.info(self.buffers_out_q.qsize())
     self.buffers_out_q.join()
     logger.info("Queue joined")
     for p in self.processors:
         logger.info("Putting poison pills in Q")
         self.buffers_out_q.put("666_kIlL_thE_pROCess_666")
     logger.info("Poison pills done")
     for p in self.processors:
         p.join()
         logger.info("Engine shutdown- processor joined")
     print("done")
Example #6
    def run(self):
        """
        Sets up numpy array buffer and puts stuff in and gets stuff out
        """
        self._init_processors()
        self.run_engine = True

        while self.run_engine:
            if self.pipe_conn.poll():
                item = self.pipe_conn.recv()
                # branch 2 - stop engine
                if item == "stop_poison_pill":
                    for q in self.buffer_in_qs.keys():
                        self.buffer_in_qs[q].put("stop_buffer_worker")
                    self.run_engine = False
                    break
                # branch 1 - engine running, good data
                elif type(item) is tuple:
                    partition_key = item[0]['stream_token']
                    new_buffer = self._new_buffer(partition_key)
                    if new_buffer:
                        logger.info(
                            f"Initialized buffer for stream {partition_key}")
                    if item[1] == "new":
                        if item[0]["data_rules"]["pull"] is True:
                            new_puller = self._new_data_puller(
                                partition_key, item[0])
                            if new_puller:
                                print(
                                    f"Initialized data puller for stream {partition_key}"
                                )
                            else:
                                logger.warning(
                                    f"Attempting to initialize data puller for stream {partition_key} - puller already exists"
                                )
                    elif item[1] == "load":
                        self.buffer_in_qs[partition_key].put(item[0])
                    else:
                        raise TypeError(
                            "Invalid tuple in pipe - index 1 must be str 'load' or str 'new'"
                        )
                else:
                    raise TypeError("Invalid item in pipe")
        logger.info("Terminating Engine Thread")
        self.stop_engine()
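
A standalone sketch of the poll()/recv() control loop: one end of a Pipe delivers (payload, action) tuples plus a stop sentinel, and the loop dispatches on the action (the payload dicts are simplified stand-ins for DStream objects):

from multiprocessing import Pipe

engine_conn, client_conn = Pipe()

client_conn.send(({"stream_token": "abc", "data_rules": {"pull": False}}, "new"))
client_conn.send(({"stream_token": "abc", "measurements": [1, 2]}, "load"))
client_conn.send("stop_poison_pill")

running = True
while running:
    if engine_conn.poll():
        item = engine_conn.recv()
        if item == "stop_poison_pill":          # stop sentinel
            running = False
        elif type(item) is tuple:               # control message: (payload, action)
            print("dispatching", item[1], "for stream", item[0]["stream_token"])
        else:
            raise TypeError("Invalid item in pipe")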
Example #7
    def __init__(self, *args, **kwargs):
        logger.debug("Initialize RuleDict")
        self.update(*args, **kwargs)
        if "expected_keys" in kwargs:
            self.expected_keys = kwargs["expected_keys"]
        else:
            raise KeyError("no expected keys set")

        bad_keys = []
        for key in self.keys():
            if key not in self.expected_keys:
                bad_keys.append(key)
                logger.warning("non expected key found: %s" % (key))
        for key in bad_keys:
            del self[key]
        for key in self.expected_keys:
            if key not in self:
                logger.info("No value supplied for %s, setting to None" %
                            (key))
                self[key] = None
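
A compact standalone sketch of the same filter-and-default pattern on a dict subclass (the class name, keys, and sample data are invented for illustration):

class ExpectedKeysDict(dict):
    def __init__(self, expected_keys, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.expected_keys = expected_keys
        for key in [k for k in self if k not in expected_keys]:
            del self[key]                     # drop unexpected keys
        for key in expected_keys:
            self.setdefault(key, None)        # default missing keys to None

d = ExpectedKeysDict(["name", "threshold"], {"name": "rule1", "junk": 42})
print(d)   # {'name': 'rule1', 'threshold': None}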
Example #8
 def __load(self, file_name):
     logger.info("Loading configuration file: %s" % file_name)
     try:
         with open(file_name, 'r') as fp:
             try:
                 self._cfg.read_file(fp)
                 self._loaded_files.append(file_name)
                 logger.info("Configuration loaded successfully.")
             except (ParsingError, NoSectionError, MissingSectionHeaderError, ConfigParserGeneralError) as err:
                 logger.warn("Bad configuration file. File: %s. Error: %s" % (file_name, err.message))
     except (PermissionError, FileNotFoundError) as err:
         logger.info("Could not read configuration file: %s. %s" % (file_name, str(err)))
Example #9
    def __init__(self, *args, **kwargs):
        """
        Intializes an event dict. Can either be created empty or from an existing dict.
        Empty creation creates all necessary keys in the dict with 0 values
        Creating from an existing dict keeps all expected key value pair from the input dict and
        discards the rest.
        """
        self.update(*args, **kwargs)
        logger.debug("initializing event")
        expected_keys = [
            "event_name", "event_rules", "timestamp", "stream_token",
            "event_context"
        ]
        bad_keys = []
        for key in self.keys():
            if key not in expected_keys:
                bad_keys.append(key)
                logger.debug("non-expected key found: %s" % (key))
        for key in bad_keys:
            del self[key]

        if not "event_name" in self.keys():
            logger.info("No event_name found")
            self["event_name"] = ""
        if not "event_rules" in self.keys():
            logger.info("No event_rules found")
            self["event_rules"] = {}
        if not "timestamp" in self.keys():
            logger.debug("no timestamp supplied")
            self["timestamp"] = 0
        if not "stream_token" in self.keys():
            logger.info("no stream_token supllied")
            self["stream_token"] = ""
        if not "event_context" in self.keys():
            logger.debug("No context")
            self["event_context"] = {}
Example #10
 def store(self, file_name=DEFAULT_FILE):
     logger.info("Saving configuration to file: %s" % file_name)
     with open(file_name, 'w') as f:
         self._cfg.write(f)
Example #11
 def _validate_context(self):
     if not isinstance(self.context, Context):
         raise TypeError("Invalid data puller context")
     else:
         logger.info("Validated data puller context")
Example #12
 def _validate_reader(self):
     if not isinstance(self.source_reader, SourceReader):
         raise TypeError("Invalid source reader")
     else:
         logger.info("Validated source reader")
Example #13
 def run(self):
     self.pulling = True
     self.pulling_start = datetime.now()
     while self.pulling:
         self.source_reader.read_input()
     logger.info("Quitting puller")
Example #14
    def run_buffer(self, partition_key):
        last_col = self.buffer_record_limit - 1
        last_row = self.number_of_processors - 1
        cur_row = 0
        cur_col = 0
        batch_tracker = {'start_time': time(), 'leftovers_collected': False}

        while self.run_engine:
            try:
                item = self.buffer_in_qs[partition_key].get(
                    timeout=self.buffer_time_limit_s)
                # branch 2 - stop engine
                if item == "stop_buffer_worker":
                    break
                # branch 1 - engine running, good data
                elif isinstance(
                        item, DStream) or (type(item) is dict
                                           and "stream_token" in item.keys()):
                    if "data_rules" in item.keys(
                    ):  # some unit test data doesnt have this field
                        if "date_format" in item["data_rules"].keys():
                            if item["data_rules"]["date_format"] is not None:
                                item["timestamp"] = datetime.strptime(
                                    item["timestamp"], item["data_rules"]
                                    ["date_format"]).timestamp()
                    # branch 1.1 - not last row
                    if cur_row < last_row:
                        # branch 1.1a - not last column, continue row
                        if cur_col < last_col:
                            logger.info("Buffering- row {}".format(cur_row))
                            self.buffers[partition_key][cur_row,
                                                        cur_col] = item
                            cur_col += 1
                        # branch 1.1b - last column, start new row
                        else:
                            self.buffers[partition_key][cur_row,
                                                        cur_col] = item
                            if self.test_run:
                                self.buffers_out_q.put((
                                    self.buffers[partition_key]
                                    [cur_row].copy(),
                                    f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt"
                                ))
                                self.test_batches[partition_key] += 1
                            else:
                                self.buffers_out_q.put(
                                    self.buffers[partition_key]
                                    [cur_row].copy())
                            logger.info("New batch queued")
                            roll_window = self.buffers[partition_key][
                                cur_row, self.buffer_roll_index:]
                            cur_row += 1
                            for n in roll_window:
                                for i in range(abs(self.buffer_roll)):
                                    self.buffers[partition_key][cur_row, i] = n
                            cur_col -= cur_col + self.buffer_roll
                            # REMOVE
                            batch_tracker['start_time'] = time()
                    # branch 1.2 - last row
                    else:
                        # branch 1.2a - not last column, continue row
                        if cur_col < last_col:
                            self.buffers[partition_key][cur_row,
                                                        cur_col] = item
                            cur_col += 1
                        # branch 1.2b - last column, start return to first row in new cycle
                        else:
                            self.buffers[partition_key][cur_row,
                                                        cur_col] = item
                            if self.test_run:
                                self.buffers_out_q.put((
                                    self.buffers[partition_key]
                                    [cur_row].copy(),
                                    f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt"
                                ))
                                self.test_batches[partition_key] += 1
                            else:
                                self.buffers_out_q.put(
                                    self.buffers[partition_key]
                                    [cur_row].copy())

                            roll_window = self.buffers[partition_key][
                                cur_row, self.buffer_roll_index:]
                            cur_row -= cur_row
                            for n in roll_window:
                                for i in range(abs(self.buffer_roll)):
                                    self.buffers[partition_key][cur_row, i] = n
                            cur_col -= cur_col + self.buffer_roll
                            batch_tracker['start_time'] = time()
                    batch_tracker['leftovers_collected'] = False
                # branch 3 bad data
                else:
                    raise TypeError("Queued item is not valid dictionary.")
            except:  # expected here: queue.Empty raised when get() times out
                # buffer time max reached, engine still running
                logger.info("Buffer batch timeout exceeded")
                if self.run_engine is True:
                    # engine running, batch timeout with new buffer data (partial row)
                    if cur_col > abs(self.buffer_roll) and batch_tracker['leftovers_collected'] is False:
                        logger.info(
                            "Collecting leftovers- pushing partial batch to queue after batch timeout"
                        )
                        if self.test_run:
                            self.buffers_out_q.put((
                                self.buffers[partition_key][
                                    cur_row, :cur_col].copy(),
                                f"{self.test_outfile}_{partition_key}_{self.test_batches[partition_key]}.txt"
                            ))
                            self.test_batches[partition_key] += 1
                        else:
                            self.buffers_out_q.put(self.buffers[partition_key][
                                cur_row, :cur_col].copy())
                        if cur_row < last_row:
                            cur_row += 1
                        else:
                            cur_row -= cur_row

                        cur_col -= cur_col
                        batch_tracker['start_time'] = time()
                        batch_tracker['leftovers_collected'] = True
                    # leftovers already collected
                    else:
                        logger.info("No new data- resetting batch timer")
                        batch_tracker['start_time'] = time()
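
A much-reduced standalone sketch of the batching skeleton in run_buffer: a blocking get() with a timeout, where queue.Empty triggers a flush of the partial batch, a full batch is handed off as soon as it fills, and a sentinel stops the worker (a plain list stands in for the numpy row buffer, and the payloads are illustrative):

from multiprocessing import JoinableQueue
from queue import Empty   # multiprocessing queues raise queue.Empty on get() timeout

buffer_in_q = JoinableQueue()
for record in ({"v": 1}, {"v": 2}, {"v": 3}):
    buffer_in_q.put(record)
buffer_in_q.put("stop_buffer_worker")

batch, batch_size, running = [], 2, True
while running:
    try:
        item = buffer_in_q.get(timeout=1.0)
        buffer_in_q.task_done()
        if item == "stop_buffer_worker":          # sentinel -> stop the worker
            running = False
        else:
            batch.append(item)
            if len(batch) == batch_size:          # full batch -> hand it off
                print("batch queued:", batch)
                batch = []
    except Empty:                                 # timeout -> flush leftovers as a partial batch
        if batch:
            print("partial batch queued:", batch)
            batch = []

if batch:                                         # flush anything left on shutdown
    print("partial batch queued:", batch)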