Пример #1
0
 def run(self):
     """Start one runner thread per configured model and arm the job.

     For each entry in ``self.api_models_config``: create a model-instance
     record in the store, instantiate the model class and launch its
     ``run`` on a Thread, tracking runner and thread handles for later
     shutdown.  On any failure the job status becomes JOB_ERROR; on
     success JOB_RUNNING.  The job record is persisted in every case.
     """
     with self.lock:
         logger.info("[job:{job}] start to run".format(job=self.to_dict()))
         try:
             for md in self.api_models_config:
                 model = Models[md.get("name")]
                 md_instance = self.store.set_model_instance({
                     'job_id': self.id,
                     'model': md.get("name"),
                     'report': '',
                     'status': MODEL_NEW,
                 })
                 model_runner = model(md_instance, self.store, md,
                                      self.data_source, self.slack_channel,
                                      self.timeout)
                 t = Thread(target=model_runner.run)
                 t.start()
                 self.running_models.append(model_runner)
                 self.threads[md.get("name")] = t
         except Exception as e:
             # Fixed typo in the log message: "star" -> "start".
             logger.info("[job:{job}] fail to start: {err}".format(
                 job=self.to_dict(), err=str(e)))
             logger.exception("Exception Logged")
             self.status = JOB_ERROR
         else:
             # NOTE(review): the Timer is created but .start() is never
             # called here — confirm it is started elsewhere, otherwise
             # timeout_action will never fire.
             self.timer = Timer(timeparse(self.timeout),
                                self.timeout_action)
             self.status = JOB_RUNNING
         finally:
             self.save_job()
Пример #2
0
    def close(self):
        """Stop this Rules model early: run the base-class shutdown,
        cancel the timeout timer, mark the model stopped and persist it."""
        logger.info("{log_prefix} closing".format(log_prefix=self.log_prefix))
        super(Rules, self).close()
        self.timer.cancel()

        self.status = MODEL_STOP
        self.save_model()
Пример #3
0
    def timeout_action(self):
        """Timer callback: end the IForest model normally when its
        configured run time elapses."""
        logger.info(
            "{log_prefix} finish the model".format(log_prefix=self.log_prefix))
        super(IForest, self).close()

        # Unlike close(), the timer is not cancelled (this IS the timer
        # firing) and the model ends as MODEL_FINISH rather than MODEL_STOP.
        self.status = MODEL_FINISH
        self.save_model()
Пример #4
0
    def close(self):
        """Stop the collector loop and close every tracked job.

        Sets the exit flag, wakes the scheduler's interruptible wait,
        then closes each running job in turn.
        """
        logger.info("stop collector !")
        self._exit = True
        self.event.set()

        # Idiom fix: job_id was unused, so iterate the values directly.
        # NOTE(review): assumes job.close() does not remove entries from
        # self.jobs while we iterate — confirm.
        for job in self.jobs.values():
            job.close()
Пример #5
0
    def run(self):
        """Launch one worker thread per supported, configured metric.

        Metrics absent from the global ``Metrics`` registry or from the
        model config are skipped with an error log.  Each launched metric
        gets an empty report list and a thread running ``run_action``.
        """
        # Dropped the unused ``model=`` kwarg — the format string only
        # references {log_prefix}.
        logger.info("{log_prefix} start to run".format(
            log_prefix=self.log_prefix))
        self.timer.start()
        for metric in self.metrics:
            if metric not in Metrics:
                logger.error(
                    "{log_prefix}[metric:{metric}] is not supported".format(
                        log_prefix=self.log_prefix, metric=metric))
                continue

            val = Metrics[metric]
            if metric not in self.cfg.metrics:
                logger.error(
                    "{log_prefix}[metric:{metric}] can't found the config of this metric"
                    .format(log_prefix=self.log_prefix, metric=metric))
                continue

            self.report.metrics_report[metric] = []

            t = Thread(target=self.run_action,
                       args=(metric, val, self.cfg.metrics[metric]))
            t.start()
            self.threads[metric] = t

            # NOTE(review): status is set and saved once per launched
            # metric; hoisting it after the loop would mark the model
            # RUNNING even when nothing started, so the per-iteration
            # placement is kept.
            self.status = MODEL_RUNNING
            self.save_model()
Пример #6
0
 def shutdown():
     """Gracefully stop the HTTP server, then drain the IO loop.

     Closes the manager (stopping its jobs), stops accepting new
     connections, and gives in-flight work up to
     MAX_WAIT_SECONDS_BEFORE_SHUTDOWN seconds via stop_loop().
     """
     logger.info('Stopping http server')
     manager.close()
     server.stop()
     logger.info('Will shutdown in %s seconds ...',
                 MAX_WAIT_SECONDS_BEFORE_SHUTDOWN)
     stop_loop(time.time() + MAX_WAIT_SECONDS_BEFORE_SHUTDOWN)
Пример #7
0
    def close(self):
        """Stop the IForest model early: run the base-class shutdown,
        cancel the timeout timer, mark the model stopped and persist it."""
        # TODO: close this job
        logger.info("{log_prefix} closing".format(log_prefix=self.log_prefix))
        super(IForest, self).close()
        self.timer.cancel()

        self.status = MODEL_STOP
        self.save_model()
Пример #8
0
 def stop_loop(deadline):
     """Stop the tornado IO loop once idle, or when *deadline* (epoch
     seconds) passes — whichever comes first.

     Re-schedules itself every second while pending work remains.
     """
     now = time.time()
     # NOTE(review): _callbacks/_timeouts are private IOLoop attributes;
     # newer tornado releases removed them — confirm the pinned tornado
     # version still exposes both.
     if now < deadline and (io_loop._callbacks or io_loop._timeouts):
         logger.info('Waiting for next tick')
         io_loop.add_timeout(now + 1, stop_loop, deadline)
     else:
         io_loop.stop()
         logger.info('Shutdown finally')
Пример #9
0
    def run(self):
        """Collector main loop: schedule queued tasks and clean up until
        close() sets the exit flag.

        The event wait doubles as an interruptible sleep — close() sets
        the event to wake the loop immediately.
        """
        logger.info("start to run collector")

        while not self._exit:
            self.schedule_task()
            self.clean()

            self.event.wait(timeparse(JOB_SCHEDULE_INTERVAL))
Пример #10
0
 def run_job(self, job):
     """Enqueue *job* as a START_JOB task.

     Raises:
         NewJobException: when the task cannot be queued; the original
             error text is carried along.
     """
     logger.info(
         "add new job: job-{id} to tasks queue".format(id=job.get('id')))
     try:
         self.new_task(Task(START_JOB, job))
     except Exception as e:
         logger.error("add new job: {job} failed: {err}".format(job=job,
                                                                err=str(e)))
         raise NewJobException(job, str(e))
Пример #11
0
    def init_model_template(self):
        """Register the bundled template of every known model in storage."""
        logger.info("init model templates")

        for model_name, model_cls in Models.items():
            template = model_cls.get_model_template(model_name, MODELS_PATH)
            record = {"name": model_name, "config": str(template)}
            self.storage.set_model_template(record)
Пример #12
0
    def stop_job_handler(self, job):
        """Close a running job on a background thread.

        Raises:
            JobNotRunningException: when no instance for the id is tracked.
        """
        logger.info("stop job: {job}".format(job=job))

        running = self.jobs.get(job.get('id'))
        if running is None:
            raise JobNotRunningException(job)

        closer = Thread(target=running.close)
        closer.start()
Пример #13
0
        def get(self, job_id):
            """Stop the running job *job_id* and report the outcome.

            Responds HTTP_MISS_ARGS for an empty id, HTTP_NOT_FOUND when
            the job does not exist, HTTP_FAIL when it is not running or
            stopping fails, HTTP_OK otherwise.
            """
            if job_id == "":
                logger.error("job id is required")
                self.finish({
                    "code": HTTP_MISS_ARGS,
                    "message": "job id is required"
                })
                return

            try:
                logger.info("close running job:{job_id}".format(job_id=job_id))
                manager.stop_job(int(job_id))
            except JobNotExistsException:
                logger.error(
                    "close job:{job_id} failed: job is not exist".format(
                        job_id=job_id))
                logger.exception("Exception Logged")

                # Consistency fix: a missing job is a not-found condition;
                # the job-detail handler already answers HTTP_NOT_FOUND for
                # this exception (this branch previously used HTTP_FAIL).
                data = {
                    "code": HTTP_NOT_FOUND,
                    "message":
                    "close job:{job_id} failed: job is not exist".format(
                        job_id=job_id)
                }
            except JobNotRunningException:
                logger.error(
                    "close job:{job_id} failed: job is not running".format(
                        job_id=job_id))
                logger.exception("Exception Logged")

                data = {
                    "code": HTTP_FAIL,
                    "message":
                    "close job:{job_id} failed: job is not running".format(
                        job_id=job_id)
                }
            except Exception as e:
                logger.error("close job:{job_id} failed:{err}".format(
                    job_id=job_id, err=str(e)))
                logger.exception("Exception Logged")

                data = {
                    "code": HTTP_FAIL,
                    "message": "close job:{job_id} failed:{err}".format(
                        job_id=job_id, err=str(e))
                }
            else:
                data = {"code": HTTP_OK, "message": "OK"}
            finally:
                self.finish(data)
Пример #14
0
    def timeout_action(self):
        """Timer callback: mark the job finished, persist it, and post a
        completion notice with the report link to slack."""
        logger.info("[job:{job}] finish".format(job=self.to_dict()))

        self.status = JOB_FINISHED
        self.save_job()

        # Bug fix: the original template "{job:{job} finish ..." contained
        # an unterminated replacement field, so .format() raised ValueError
        # and the slack notification was never sent.
        send_to_slack("[job:{job}] finish \n. report detail: {url}".format(
            job=self.to_dict(),
            url=urlparse.urljoin(
                REPORT_ADDRESS,
                "api/v1/job/{job_id}/report".format(job_id=self.id))))
Пример #15
0
 def on_error(self, metric, predict_data):
     """Log a failed prediction for *metric* and alert the configured
     slack channel with the offending data."""
     logger.info(
         "{log_prefix}[metric:{metric}] Predict Error, predict data:{predict_data}"
         .format(log_prefix=self.log_prefix,
                 metric=metric,
                 predict_data=predict_data))
     send_to_slack(
         "{log_prefix}[model:{model}], predict metric {metric} error, "
         "predict data:{predict_data}".format(log_prefix=self.log_prefix,
                                              model=self.name,
                                              metric=metric,
                                              predict_data=predict_data),
         self.slack_channel)
Пример #16
0
    def start_job_handler(self, job):
        """Validate *job*, register it and run it on a new thread.

        Returns silently (after logging) when the job is already running
        or fails validation.
        """
        logger.info("run job: {job}".format(job=job))
        # logger.warn is a deprecated alias of warning(); also dropped the
        # redundant .keys() — dict membership tests keys directly.
        if job.get('id') in self.jobs:
            logger.warning("the job is running")
            return

        job_instance = Job(self.storage, job)
        if not job_instance.valid():
            logger.error("the job is invalid")
            return

        self.jobs[job.get('id')] = job_instance

        t = Thread(target=job_instance.run)
        t.start()
Пример #17
0
 def schedule_task(self):
     """Pop one task from the queue and dispatch it by type.

     Does nothing when the running-job limit is reached or the queue is
     empty (both logged).

     Raises:
         TaskTypeNotSupportedException: for an unknown task type.
     """
     if len(self.jobs) >= RUNNING_SIZE_LIMIT:
         # logger.warn is a deprecated alias of warning().
         logger.warning(
             "the number of tasks allowed to run exceeded the limit.")
         return
     try:
         task = self.queue.get_nowait()
     except Empty:
         logger.info("the queue of tasks is empty, skip schedule")
         return
     else:
         if task.typ == START_JOB:
             self.start_job_handler(task.job)
         elif task.typ == STOP_JOB:
             self.stop_job_handler(task.job)
         else:
             raise TaskTypeNotSupportedException(task)
Пример #18
0
    def train_task(self, metric, query, config):
        """Fetch one sample from the data source and append its feature
        row to the metric's training DataFrame.  Empty query results are
        ignored."""
        data_set = self.api.query(query)
        if not data_set:
            return

        samples = [float(v) for v in data_set.values()]
        row = {
            name: Features[name](samples)
            for name in config["features"] if name in Features
        }

        logger.info(
            "{log_prefix}[metric:{metric}] append data to train df:{df_one}"
            .format(log_prefix=self.log_prefix, metric=metric, df_one=row))
        self.df[metric] = self.df[metric].append(row, ignore_index=True)
Пример #19
0
    def predict_task(self, metric, query, config):
        """Build a feature vector from fresh metric data and run the
        fitted isolation forest on it.

        Returns a (prediction, feature_vector) pair.
        """
        data_set = self.api.query(query)
        samples = [float(v) for v in data_set.values()]

        features = [
            Features[name](samples)
            for name in config["features"] if name in Features
        ]

        batch = np.array([features])

        logger.info(
            "{log_prefix}[metric:{metric}] predict data:{predict_data}".format(
                log_prefix=self.log_prefix,
                metric=metric,
                predict_data=batch))

        return self.ilf[metric].predict(batch), features
Пример #20
0
        def get(self, job_id):
            """Return the detail of job *job_id*.

            Responds HTTP_MISS_ARGS for an empty id, HTTP_NOT_FOUND when
            the job does not exist, HTTP_FAIL on unexpected errors, and
            HTTP_OK with the job payload on success.
            """
            if job_id == "":
                logger.error("job id is required")
                self.finish({
                    "code": HTTP_MISS_ARGS,
                    "message": "job id is required"
                })
                return

            try:
                logger.info("get job:{job_id} detail".format(job_id=job_id))
                job = manager.get_job(int(job_id))
            except JobNotExistsException:
                logger.error(
                    "get job:{job_id} detail failed: job is not exist".format(
                        job_id=job_id))
                logger.exception("Exception Logged")

                data = {
                    "code":
                    HTTP_NOT_FOUND,
                    "message":
                    "get job:{job_id} failed: job is not exist".format(
                        job_id=job_id)
                }
            except Exception as e:
                logger.error("get job:{job_id} detail failed:{err}".format(
                    job_id=job_id, err=str(e)))
                logger.exception("Exception Logged")

                data = {
                    "code":
                    HTTP_FAIL,
                    "message":
                    "get job:{job_id} detail failed:{err}".format(
                        job_id=job_id, err=str(e))
                }
            else:
                data = {"code": HTTP_OK, "message": "OK", "data": job}
            finally:
                # data is bound on every path that reaches here (every
                # except branch and the else), so finish() is always fed.
                self.finish(data)
Пример #21
0
def main():
    """Entry point: parse config, wire the manager into the handler
    module, start the tornado HTTP server and run the IO loop until a
    signal shuts it down."""
    parse_config()

    manager = Manager(options.db_path)
    hd.manager = manager

    handler = OasisHandler()

    app = make_app(handler)
    server = tornado.httpserver.HTTPServer(app)
    logger.info(
        "oasis server start to listen {port}...".format(port=options.port))
    server.listen(options.port)

    # Graceful shutdown on SIGTERM / SIGHUP / SIGINT.
    for sig in ('TERM', 'HUP', 'INT'):
        signal.signal(getattr(signal, 'SIG' + sig),
                      partial(sig_handler, server, manager))

    tornado.ioloop.IOLoop.current().start()

    logger.info("Exit...")
Пример #22
0
    def load(self):
        """Populate the in-memory caches (model templates by name, model
        instances by id, jobs by id) from the storage client, under the
        lock."""
        with self.lock:
            logger.info("start to load data")

            # model templates, keyed by name
            for tpl in self.client.model_template.list():
                self.model_templates[tpl.get('name')] = tpl

            # model instances, keyed by id
            for inst in self.client.model_instance.list():
                self.model_instances[inst.get('id')] = inst

            # jobs, keyed by id
            for loaded_job in self.client.job.list():
                self.jobs[loaded_job.get('id')] = loaded_job
Пример #23
0
def parse_config():
    """Parse command line (and optional config file) options, validate
    the required ones, and publish them to the modules that need them.

    Exits the process (status 1) when slack_token or models_path is
    missing.
    """
    options.parse_command_line()
    if options.config != "":
        logger.info("parse config from config file: {config}".format(
            config=options.config))
        options.parse_config_file(options.config)

    if options.slack_token == "":
        logger.error("slack token is required!!")
        sys.exit(1)

    if options.models_path == "":
        logger.error("models path is required!!")
        sys.exit(1)

    if options.db_path == "":
        logger.info(
            "path of database is not set, use the default db path: {path}".
            format(path=DEFAULT_DB_PATH))
        options.db_path = DEFAULT_DB_PATH

    # Push validated settings into the consuming modules' globals.
    mg.MODELS_PATH = options.models_path
    alert.SLACK_TOKEN = options.slack_token
    job.REPORT_ADDRESS = options.address

    logger.info("config: {config}".format(config=options.items()))
Пример #24
0
    def close(self):
        """Stop the job: close every running model, join their threads
        (bounded by THREAD_JOIN_TIMEOUT each), cancel the job timer and
        persist the final status (JOB_STOPPED, or JOB_ERROR on failure)."""
        with self.lock:
            logger.info(
                "[job-id:{job_id}] start to stop".format(job_id=self.id))
            try:
                self._exit = True
                self.event.set()

                for model in self.running_models:
                    model.close()

                # Idiom fix: iterate the thread objects directly instead
                # of iterating keys and re-indexing the dict.
                for thread in self.threads.values():
                    thread.join(THREAD_JOIN_TIMEOUT)

                self.timer.cancel()
            except Exception as e:
                logger.info("[job-id:{job_id}] fail to stop: {err}".format(
                    job_id=self.id, err=str(e)))
                self.status = JOB_ERROR
            else:
                self.status = JOB_STOPPED
            finally:
                self.save_job()
Пример #25
0
    def train(self, metric, query_expr, config):
        """Collect ``train_count`` samples for *metric* and fit its
        isolation forest.

        Returns False if the model was asked to exit mid-collection,
        True after a completed fit.
        """
        logger.info(
            "{log_prefix}[metric:{metric}] starting to get sample data".format(
                log_prefix=self.log_prefix, metric=metric))
        self.df[metric] = pd.DataFrame(columns=config["features"])
        self.ilf[metric] = IsolationForest(n_estimators=100, verbose=2)
        for index in range(0, self.cfg.model["train_count"], 1):
            if self._exit:
                logger.info("{log_prefix}[metric:{metric}] stop".format(
                    log_prefix=self.log_prefix, metric=metric))
                return False

            # Query the trailing 15 minutes at 15s resolution.
            now = datetime.datetime.now()
            query = PrometheusQuery(
                query_expr,
                time.mktime(
                    (now - datetime.timedelta(minutes=15)).timetuple()),
                time.mktime(now.timetuple()), "15s")
            self.train_task(metric, query, config)

            # Every 10th round a random row is injected — presumably to
            # seed the forest with outlier examples; TODO confirm intent.
            if index % 10 == 0:
                df_one = {}
                for key in config["features"]:
                    if key in Features:
                        df_one[key] = float(random.randint(0, 10000))
                self.df[metric] = self.df[metric].append(df_one,
                                                         ignore_index=True)

                logger.info(
                    "{log_prefix}[metric:{metric}] append data to train df:{df_one}"
                    .format(log_prefix=self.log_prefix,
                            metric=metric,
                            df_one=df_one))

            # Interruptible sleep between samples.
            self.event.wait(timeparse(self.cfg.model["train_interval"]))
        logger.info(
            "{log_prefix}[metric:{metric}] starting to train sample data".
            format(log_prefix=self.log_prefix, metric=metric))
        self.ilf[metric].fit(self.df[metric][config["features"]])
        return True
Пример #26
0
    def compute(self, metric, query_expr, config, rules):
        """Extraction features and match with rules

        First: get metric from data source
        Second: extraction features from metrics
        Third: use features to match with all rules about this metric,
               if not match, will send a alert to slack

        Loops until the model's exit flag is set, sleeping
        ``predict_interval`` between rounds; empty query results skip a
        round entirely.
        """
        while not self._exit:
            data_set = self.query_data(query_expr)
            if len(data_set) > 0:
                features_value = self.extraction_features(data_set, config)
                logger.info(
                    "{log_prefix}[metric:{metric}] extraction features {value}, "
                    "start to match with rule".format(
                        log_prefix=self.log_prefix,
                        metric=metric,
                        value=features_value))

                report = {
                    "metric": metric,
                    "time": datetime.datetime.now(),
                    "predict_data": features_value,
                }

                is_match, not_match_rule = self.match_rules(
                    features_value, rules)
                if is_match:
                    logger.info(
                        "{log_prefix}[metric:{metric}] predict OK".format(
                            log_prefix=self.log_prefix, metric=metric))
                else:
                    report["is_match"] = False
                    report["not_match_rule"] = not_match_rule

                    # Alert on the first rule that failed to match.
                    self.on_error(metric, not_match_rule)

                # The report list is shared with other metric threads.
                with self.lock:
                    self.report.metrics_report[metric].append(report)

            self.save_model()
            self.event.wait(timeparse(self.cfg.model["predict_interval"]))

        logger.info("{log_prefix}[metric:{metric}] stop".format(
            log_prefix=self.log_prefix, metric=metric))
Пример #27
0
    def predict(self, metric, query_expr, config):
        """Prediction loop for *metric*: query the trailing 10 minutes of
        data, run the fitted model on it, record a report entry and alert
        on mismatches, until the exit flag is set."""
        logger.info("{log_prefix}[metric:{metric}] starting to predict".format(
            log_prefix=self.log_prefix, metric=metric))
        while not self._exit:
            now = datetime.datetime.now()
            query = PrometheusQuery(
                query_expr,
                time.mktime(
                    (now - datetime.timedelta(minutes=10)).timetuple()),
                time.mktime(now.timetuple()), "15s")

            report = {
                "metric": metric,
                "time": now,
            }

            # IsolationForest convention: 1 means inlier (normal).
            is_match, predict_data = self.predict_task(metric, query, config)
            if is_match == 1:
                logger.info("{log_prefix}[metric:{metric}] predict OK".format(
                    log_prefix=self.log_prefix, metric=metric))
                report["is_match"] = True
            else:
                report["is_match"] = False

                self.on_error(metric, predict_data)

            report["predict_data"] = predict_data

            # The report list is shared with other metric threads.
            with self.lock:
                self.report.metrics_report[metric].append(report)

            self.save_model()
            self.event.wait(timeparse(self.cfg.model["predict_interval"]))

        logger.info("{log_prefix}[metric:{metric}] stop".format(
            log_prefix=self.log_prefix, metric=metric))
Пример #28
0
        def get(self):
            """Return every stored model template."""
            logger.info("list all model templates")

            payload = {
                "code": HTTP_OK,
                "message": "OK",
                "data": manager.list_all_model_templates(),
            }
            self.finish(payload)
Пример #29
0
 def close(self):
     """Shut the server down by closing its controller under the lock."""
     logger.info("closing the server")
     with self.lock:
         self.controller.close()
Пример #30
0
 def stop_job(self, job):
     """Enqueue *job* as a STOP_JOB task for the scheduler.

     Unlike run_job, queueing errors propagate to the caller unwrapped.
     """
     logger.info(
         "add stop job: job-{id} to tasks queue".format(id=job.get('id')))
     self.new_task(Task(STOP_JOB, job))