def run(self):
    """Start every configured model for this job, one worker thread each.

    On success the job is marked RUNNING and its timeout Timer is
    created; on any startup failure the job is marked ERROR.  The final
    state is always persisted via save_job().
    """
    with self.lock:
        logger.info("[job:{job}] start to run".format(job=self.to_dict()))
        try:
            for md in self.api_models_config:
                model = Models[md.get("name")]
                # Persist a fresh instance record before the model starts.
                md_instance = self.store.set_model_instance({
                    'job_id': self.id,
                    'model': md.get("name"),
                    'report': '',
                    'status': MODEL_NEW
                })
                model_runner = model(md_instance, self.store, md,
                                     self.data_source, self.slack_channel,
                                     self.timeout)
                t = Thread(target=model_runner.run)
                t.start()
                self.running_models.append(model_runner)
                self.threads[md.get("name")] = t
        except Exception as e:
            # Fixed log-message typo ("star" -> "start") and raised the
            # level to error: a failed job start is not informational.
            logger.error("[job:{job}] fail to start: {err}".format(
                job=self.to_dict(), err=str(e)))
            logger.exception("Exception Logged")
            self.status = JOB_ERROR
        else:
            # NOTE(review): the Timer is created but .start() is never
            # called here — confirm it is started elsewhere, otherwise
            # timeout_action will never fire.
            self.timer = Timer(timeparse(self.timeout), self.timeout_action)
            self.status = JOB_RUNNING
        finally:
            self.save_job()
def close(self):
    """Stop the Rules model: cancel its timer, mark it stopped, persist."""
    logger.info("{log_prefix} closing".format(log_prefix=self.log_prefix))
    # Parent close() presumably signals the worker threads to exit —
    # confirm against the base model class.
    super(Rules, self).close()
    self.timer.cancel()
    self.status = MODEL_STOP
    self.save_model()
def timeout_action(self):
    """Timer callback: close the IForest model and mark it FINISHED.

    Unlike close(), this records MODEL_FINISH (a normal completion)
    rather than MODEL_STOP.
    """
    logger.info(
        "{log_prefix} finish the model".format(log_prefix=self.log_prefix))
    super(IForest, self).close()
    self.status = MODEL_FINISH
    self.save_model()
def close(self):
    """Stop the collector: signal exit and close every running job."""
    logger.info("stop collector !")
    self._exit = True
    # Wake the run() loop out of its interval wait.
    self.event.set()
    # The job ids were unused in the original loop; iterate the job
    # objects directly.
    for job in self.jobs.values():
        job.close()
def run(self):
    """Start one worker thread per supported and configured metric.

    Metrics absent from the Metrics registry or from this model's
    config are logged and skipped.  Ends by marking the model RUNNING
    and persisting it.
    """
    # Dropped the dead `model=self.name` kwarg: the format string only
    # references {log_prefix}.
    logger.info("{log_prefix} start to run".format(
        log_prefix=self.log_prefix))
    self.timer.start()
    for metric in self.metrics:
        if metric not in Metrics:
            logger.error(
                "{log_prefix}[metric:{metric}] is not supported".format(
                    log_prefix=self.log_prefix, metric=metric))
            continue
        val = Metrics[metric]
        if metric not in self.cfg.metrics:
            logger.error(
                "{log_prefix}[metric:{metric}] can't found the config of this metric"
                .format(log_prefix=self.log_prefix, metric=metric))
            continue
        self.report.metrics_report[metric] = []
        t = Thread(target=self.run_action,
                   args=(metric, val, self.cfg.metrics[metric]))
        t.start()
        self.threads[metric] = t
    self.status = MODEL_RUNNING
    self.save_model()
def shutdown():
    """Gracefully stop the HTTP server, then drain the IO loop.

    Closes the manager, stops accepting connections, and hands off to
    stop_loop() with a hard deadline.
    """
    logger.info('Stopping http server')
    manager.close()
    server.stop()
    logger.info('Will shutdown in %s seconds ...',
                MAX_WAIT_SECONDS_BEFORE_SHUTDOWN)
    stop_loop(time.time() + MAX_WAIT_SECONDS_BEFORE_SHUTDOWN)
def close(self):
    """Stop the IForest model: cancel its timer, mark it stopped, persist."""
    # TODO: close this job
    logger.info("{log_prefix} closing".format(log_prefix=self.log_prefix))
    # Parent close() presumably signals the worker threads to exit —
    # confirm against the base model class.
    super(IForest, self).close()
    self.timer.cancel()
    self.status = MODEL_STOP
    self.save_model()
def stop_loop(deadline):
    """Stop io_loop once pending work drains or *deadline* passes.

    Re-schedules itself once per second while callbacks or timeouts
    remain on the loop.
    """
    now = time.time()
    # NOTE(review): _callbacks/_timeouts are private IOLoop attributes
    # and were removed in later Tornado releases — confirm the pinned
    # Tornado version still exposes them.
    if now < deadline and (io_loop._callbacks or io_loop._timeouts):
        logger.info('Waiting for next tick')
        io_loop.add_timeout(now + 1, stop_loop, deadline)
    else:
        io_loop.stop()
        logger.info('Shutdown finally')
def run(self):
    """Collector main loop: schedule tasks and clean up until exit.

    The event wait doubles as an interruptible sleep between rounds —
    close() sets the event to wake it immediately.
    """
    logger.info("start to run collector")
    while not self._exit:
        self.schedule_task()
        self.clean()
        self.event.wait(timeparse(JOB_SCHEDULE_INTERVAL))
def run_job(self, job):
    """Queue a START_JOB task for *job*.

    Raises NewJobException when the task cannot be enqueued.
    """
    job_id = job.get('id')
    logger.info(
        "add new job: job-{id} to tasks queue".format(id=job_id))
    try:
        start_task = Task(START_JOB, job)
        self.new_task(start_task)
    except Exception as e:
        err = str(e)
        logger.error("add new job: {job} failed: {err}".format(job=job,
                                                               err=err))
        raise NewJobException(job, err)
def init_model_template(self):
    """Persist a config template for every registered model."""
    logger.info("init model templates")
    for name, model in Models.items():
        template_cfg = model.get_model_template(name, MODELS_PATH)
        record = {
            "name": name,
            "config": str(template_cfg)
        }
        self.storage.set_model_template(record)
def stop_job_handler(self, job):
    """Close a running job on a background thread.

    Raises JobNotRunningException when the job id is not among the
    running jobs.
    """
    logger.info("stop job: {job}".format(job=job))
    job_instance = self.jobs.get(job.get('id'))
    if job_instance is None:
        raise JobNotRunningException(job)
    worker = Thread(target=job_instance.close)
    worker.start()
def get(self, job_id):
    """HTTP handler: stop the running job identified by *job_id*.

    Responds HTTP_MISS_ARGS for an empty id, HTTP_FAIL when the job
    does not exist / is not running / any other error occurs, and
    HTTP_OK on success.
    """
    if job_id == "":
        logger.error("job id is required")
        self.finish({
            "code": HTTP_MISS_ARGS,
            "message": "job id is required"
        })
        return
    try:
        logger.info("close running job:{job_id}".format(job_id=job_id))
        manager.stop_job(int(job_id))
    except JobNotExistsException:
        logger.error(
            "close job:{job_id} failed: job is not exist".format(
                job_id=job_id))
        logger.exception("Exception Logged")
        data = {
            "code": HTTP_FAIL,
            "message": "close job:{job_id} failed: job is not exist".format(
                job_id=job_id)
        }
    except JobNotRunningException:
        logger.error(
            "close job:{job_id} failed: job is not running".format(
                job_id=job_id))
        logger.exception("Exception Logged")
        data = {
            "code": HTTP_FAIL,
            "message": "close job:{job_id} failed: job is not running".format(
                job_id=job_id)
        }
    except Exception as e:
        # Catch-all boundary for unexpected errors (e.g. non-numeric id).
        logger.error("close job:{job_id} failed:{err}".format(
            job_id=job_id, err=str(e)))
        logger.exception("Exception Logged")
        data = {
            "code": HTTP_FAIL,
            "message": "close job:{job_id} failed:{err}".format(job_id=job_id,
                                                                err=str(e))
        }
    else:
        data = {"code": HTTP_OK, "message": "OK"}
    finally:
        # Every path through the try/except ladder assigns `data`.
        self.finish(data)
def timeout_action(self):
    """Mark the job FINISHED when its timer fires and notify slack."""
    logger.info("[job:{job}] finish".format(job=self.to_dict()))
    self.status = JOB_FINISHED
    self.save_job()
    # BUG FIX: the original template began with "{job:{job} ..." which
    # contains an unmatched '{', so str.format raised ValueError and the
    # slack notification was never sent.
    send_to_slack("job:{job} finish \n. report detail: {url}".format(
        job=self.to_dict(),
        url=urlparse.urljoin(
            REPORT_ADDRESS,
            "api/v1/job/{job_id}/report".format(job_id=self.id))))
def on_error(self, metric, predict_data):
    """Log a failed prediction and mirror the alert to slack."""
    log_msg = (
        "{log_prefix}[metric:{metric}] Predict Error, predict data:{predict_data}"
        .format(log_prefix=self.log_prefix, metric=metric,
                predict_data=predict_data))
    logger.info(log_msg)
    slack_msg = (
        "{log_prefix}[model:{model}], predict metric {metric} error, "
        "predict data:{predict_data}".format(log_prefix=self.log_prefix,
                                             model=self.name,
                                             metric=metric,
                                             predict_data=predict_data))
    send_to_slack(slack_msg, self.slack_channel)
def start_job_handler(self, job):
    """Validate *job* and launch it on a background thread.

    Skips (with a warning) jobs that are already running, and rejects
    jobs that fail validation.
    """
    logger.info("run job: {job}".format(job=job))
    # `in self.jobs` avoids the needless .keys() view; logger.warning
    # replaces the deprecated logger.warn alias.
    if job.get('id') in self.jobs:
        logger.warning("the job is running")
        return
    job_instance = Job(self.storage, job)
    if not job_instance.valid():
        logger.error("the job is invalid")
        return
    self.jobs[job.get('id')] = job_instance
    t = Thread(target=job_instance.run)
    t.start()
def schedule_task(self):
    """Pop one task from the queue and dispatch it by type.

    Skips scheduling when the running-job limit is reached or the
    queue is empty.  Raises TaskTypeNotSupportedException for unknown
    task types.
    """
    if len(self.jobs) >= RUNNING_SIZE_LIMIT:
        # logger.warning replaces the deprecated logger.warn alias.
        logger.warning(
            "the number of tasks allowed to run exceeded the limit.")
        return
    try:
        task = self.queue.get_nowait()
    except Empty:
        logger.info("the queue of tasks is empty, skip schedule")
        return
    else:
        if task.typ == START_JOB:
            self.start_job_handler(task.job)
        elif task.typ == STOP_JOB:
            self.stop_job_handler(task.job)
        else:
            raise TaskTypeNotSupportedException(task)
def train_task(self, metric, query, config):
    """Query one sample window and append its feature row to the
    training DataFrame for *metric*.

    Does nothing when the query returns no data points.
    """
    data_set = self.api.query(query)
    if len(data_set) > 0:
        values = [float(data) for data in data_set.values()]
        # One row: each configured feature that has a registered
        # extraction function.
        df_one = {key: Features[key](values)
                  for key in config["features"] if key in Features}
        logger.info(
            "{log_prefix}[metric:{metric}] append data to train df:{df_one}"
            .format(log_prefix=self.log_prefix, metric=metric,
                    df_one=df_one))
        # DataFrame.append was deprecated in pandas 1.4 and removed in
        # 2.0; pd.concat with a single-row frame is the supported
        # equivalent of append(dict, ignore_index=True).
        self.df[metric] = pd.concat(
            [self.df[metric], pd.DataFrame([df_one])], ignore_index=True)
def predict_task(self, metric, query, config):
    """Build one feature vector from a fresh query and classify it
    with the trained isolation forest for *metric*.

    Returns (prediction, feature_vector).
    """
    data_set = self.api.query(query)
    values = [float(sample) for sample in data_set.values()]
    df_one = [Features[key](values)
              for key in config["features"] if key in Features]
    predict_data = np.array([df_one])
    logger.info(
        "{log_prefix}[metric:{metric}] predict data:{predict_data}".format(
            log_prefix=self.log_prefix, metric=metric,
            predict_data=predict_data))
    return self.ilf[metric].predict(predict_data), df_one
def get(self, job_id):
    """HTTP handler: return the detail of job *job_id*.

    Responds HTTP_MISS_ARGS for an empty id, HTTP_NOT_FOUND for an
    unknown job, HTTP_FAIL on unexpected errors, and HTTP_OK with the
    job payload on success.
    """
    if job_id == "":
        logger.error("job id is required")
        self.finish({
            "code": HTTP_MISS_ARGS,
            "message": "job id is required"
        })
        return
    try:
        logger.info("get job:{job_id} detail".format(job_id=job_id))
        job = manager.get_job(int(job_id))
    except JobNotExistsException:
        logger.error(
            "get job:{job_id} detail failed: job is not exist".format(
                job_id=job_id))
        logger.exception("Exception Logged")
        data = {
            "code": HTTP_NOT_FOUND,
            "message": "get job:{job_id} failed: job is not exist".format(
                job_id=job_id)
        }
    except Exception as e:
        # Catch-all boundary for unexpected errors (e.g. non-numeric id).
        logger.error("get job:{job_id} detail failed:{err}".format(
            job_id=job_id, err=str(e)))
        logger.exception("Exception Logged")
        data = {
            "code": HTTP_FAIL,
            "message": "get job:{job_id} detail failed:{err}".format(
                job_id=job_id, err=str(e))
        }
    else:
        data = {"code": HTTP_OK, "message": "OK", "data": job}
    finally:
        # Every path through the try/except ladder assigns `data`.
        self.finish(data)
def main():
    """Entry point: parse config, build the manager and HTTP server,
    install signal handlers, and run the Tornado IO loop until exit.
    """
    parse_config()
    manager = Manager(options.db_path)
    # Expose the manager to the handler module (hd) so request
    # handlers can reach it.
    hd.manager = manager
    handler = OasisHandler()
    app = make_app(handler)
    server = tornado.httpserver.HTTPServer(app)
    logger.info(
        "oasis server start to listen {port}...".format(port=options.port))
    server.listen(options.port)
    # Graceful shutdown on SIGTERM / SIGHUP / SIGINT.
    for sig in ('TERM', 'HUP', 'INT'):
        signal.signal(getattr(signal, 'SIG' + sig),
                      partial(sig_handler, server, manager))
    tornado.ioloop.IOLoop.current().start()
    logger.info("Exit...")
def load(self):
    """Populate the in-memory caches from persistent storage."""
    with self.lock:
        logger.info("start to load data")
        # Model templates, keyed by name.
        self.model_templates.update(
            (mt.get('name'), mt)
            for mt in self.client.model_template.list())
        # Model instances, keyed by id.
        self.model_instances.update(
            (mi.get('id'), mi)
            for mi in self.client.model_instance.list())
        # Jobs, keyed by id.
        self.jobs.update(
            (j.get('id'), j) for j in self.client.job.list())
def parse_config():
    """Parse CLI options (and an optional config file) and push the
    resulting values into the modules that consume them.

    Exits the process when the slack token or models path is missing.
    """
    options.parse_command_line()
    if options.config != "":
        logger.info("parse config from config file: {config}".format(
            config=options.config))
        # Values from the file are applied on top of the command line.
        options.parse_config_file(options.config)
    if options.slack_token == "":
        logger.error("slack token is required!!")
        sys.exit(1)
    if options.models_path == "":
        logger.error("models path is required!!")
        sys.exit(1)
    if options.db_path == "":
        logger.info(
            "path of database is not set, use the default db path: {path}".
            format(path=DEFAULT_DB_PATH))
        options.db_path = DEFAULT_DB_PATH
    # Propagate the parsed settings to the consuming modules.
    mg.MODELS_PATH = options.models_path
    alert.SLACK_TOKEN = options.slack_token
    job.REPORT_ADDRESS = options.address
    logger.info("config: {config}".format(config=options.items()))
def close(self):
    """Stop the job: signal exit, close running models, join worker
    threads, and cancel the timeout timer.

    Records JOB_STOPPED on success or JOB_ERROR on failure; the final
    state is always persisted.
    """
    with self.lock:
        logger.info(
            "[job-id:{job_id}] start to stop".format(job_id=self.id))
        try:
            self._exit = True
            self.event.set()
            for model in self.running_models:
                model.close()
            # Iterate the Thread objects directly rather than looking
            # each one up by key.
            for worker in self.threads.values():
                worker.join(THREAD_JOIN_TIMEOUT)
            self.timer.cancel()
        except Exception as e:
            logger.info("[job-id:{job_id}] fail to stop: {err}".format(
                job_id=self.id, err=str(e)))
            self.status = JOB_ERROR
        else:
            self.status = JOB_STOPPED
        finally:
            self.save_job()
def train(self, metric, query_expr, config):
    """Collect train_count sample windows for *metric*, then fit an
    IsolationForest on the extracted features.

    Returns False when the model is asked to exit mid-collection,
    True after a successful fit.
    """
    logger.info(
        "{log_prefix}[metric:{metric}] starting to get sample data".format(
            log_prefix=self.log_prefix, metric=metric))
    self.df[metric] = pd.DataFrame(columns=config["features"])
    self.ilf[metric] = IsolationForest(n_estimators=100, verbose=2)
    # range(0, n, 1) simplified to range(n).
    for index in range(self.cfg.model["train_count"]):
        if self._exit:
            logger.info("{log_prefix}[metric:{metric}] stop".format(
                log_prefix=self.log_prefix, metric=metric))
            return False
        now = datetime.datetime.now()
        # 15-minute window at 15s resolution ending now.
        query = PrometheusQuery(
            query_expr,
            time.mktime(
                (now - datetime.timedelta(minutes=15)).timetuple()),
            time.mktime(now.timetuple()), "15s")
        self.train_task(metric, query, config)
        if index % 10 == 0:
            # Every 10th round a random-valued row is injected —
            # presumably to diversify the training data; confirm intent.
            df_one = {}
            for key in config["features"]:
                if key in Features:
                    df_one[key] = float(random.randint(0, 10000))
            # DataFrame.append was removed in pandas 2.0; pd.concat
            # with a single-row frame is the supported equivalent.
            self.df[metric] = pd.concat(
                [self.df[metric], pd.DataFrame([df_one])],
                ignore_index=True)
            logger.info(
                "{log_prefix}[metric:{metric}] append data to train df:{df_one}"
                .format(log_prefix=self.log_prefix, metric=metric,
                        df_one=df_one))
        self.event.wait(timeparse(self.cfg.model["train_interval"]))
    logger.info(
        "{log_prefix}[metric:{metric}] starting to train sample data".
        format(log_prefix=self.log_prefix, metric=metric))
    self.ilf[metric].fit(self.df[metric][config["features"]])
    return True
def compute(self, metric, query_expr, config, rules):
    """Extraction features and match with rules

    First: get metric from data source
    Second: extraction features from metrics
    Third: use features to match with all rules about this metric,
    if not match, will send a alert to slack
    """
    while not self._exit:
        data_set = self.query_data(query_expr)
        if len(data_set) > 0:
            features_value = self.extraction_features(data_set, config)
            logger.info(
                "{log_prefix}[metric:{metric}] extraction features {value}, "
                "start to match with rule".format(
                    log_prefix=self.log_prefix, metric=metric,
                    value=features_value))
            report = {
                "metric": metric,
                "time": datetime.datetime.now(),
                "predict_data": features_value,
            }
            is_match, not_match_rule = self.match_rules(
                features_value, rules)
            if is_match:
                # Consistency fix: record the outcome on the success
                # path too, matching the report shape produced by the
                # IForest predict() loop; previously matched reports
                # lacked the "is_match" key entirely.
                report["is_match"] = True
                logger.info(
                    "{log_prefix}[metric:{metric}] predict OK".format(
                        log_prefix=self.log_prefix, metric=metric))
            else:
                report["is_match"] = False
                report["not_match_rule"] = not_match_rule
                self.on_error(metric, not_match_rule)
            # metrics_report is shared across metric threads; mutate
            # and persist under the lock.
            with self.lock:
                self.report.metrics_report[metric].append(report)
                self.save_model()
        self.event.wait(timeparse(self.cfg.model["predict_interval"]))
    logger.info("{log_prefix}[metric:{metric}] stop".format(
        log_prefix=self.log_prefix, metric=metric))
def predict(self, metric, query_expr, config):
    """Prediction loop for *metric*: repeatedly query the last 10
    minutes, classify the feature vector with the trained isolation
    forest, record a report, and alert on anomalies, until the model
    is asked to exit.
    """
    logger.info("{log_prefix}[metric:{metric}] starting to predict".format(
        log_prefix=self.log_prefix, metric=metric))
    while not self._exit:
        now = datetime.datetime.now()
        # 10-minute window at 15s resolution ending now.
        query = PrometheusQuery(
            query_expr,
            time.mktime(
                (now - datetime.timedelta(minutes=10)).timetuple()),
            time.mktime(now.timetuple()), "15s")
        report = {
            "metric": metric,
            "time": now,
        }
        is_match, predict_data = self.predict_task(metric, query, config)
        # NOTE(review): predict_task returns the classifier's output,
        # which appears to be a 1-element array; `== 1` then relies on
        # single-element truthiness — confirm.
        if is_match == 1:
            logger.info("{log_prefix}[metric:{metric}] predict OK".format(
                log_prefix=self.log_prefix, metric=metric))
            report["is_match"] = True
        else:
            report["is_match"] = False
            self.on_error(metric, predict_data)
        report["predict_data"] = predict_data
        # metrics_report is shared across metric threads; mutate and
        # persist under the lock.
        with self.lock:
            self.report.metrics_report[metric].append(report)
            self.save_model()
        self.event.wait(timeparse(self.cfg.model["predict_interval"]))
    logger.info("{log_prefix}[metric:{metric}] stop".format(
        log_prefix=self.log_prefix, metric=metric))
def get(self):
    """HTTP handler: return every stored model template."""
    logger.info("list all model templates")
    payload = {
        "code": HTTP_OK,
        "message": "OK",
        "data": manager.list_all_model_templates(),
    }
    self.finish(payload)
def close(self):
    """Close the server's controller under the lock."""
    logger.info("closing the server")
    with self.lock:
        self.controller.close()
def stop_job(self, job):
    """Queue a STOP_JOB task for *job*."""
    job_id = job.get('id')
    logger.info(
        "add stop job: job-{id} to tasks queue".format(id=job_id))
    stop_task = Task(STOP_JOB, job)
    self.new_task(stop_task)