def start_antinex_core_worker():
    """start_antinex_core_worker

    This is the main function handler for starting the AntiNex Workers.

    Set these environment variables as needed:

    ``BROKER_URL=redis://localhost:6379/6`` - Celery will connect
    to this broker
    ``TRAIN_QUEUE=webapp.train.requests`` - consume ``Train a New DNN``
    requests from this queue in the broker
    ``PREDICT_QUEUE=webapp.predict.requests`` - consume ``Prediction``
    requests from this queue in the broker
    ``MAX_MSGS=100`` - how many historical messages are saved for replay
    ``MAX_MODELS=100`` - max pre-trained dnns per worker
    """
    broker_url = ev("BROKER_URL", "redis://localhost:6379/6")
    train_queue_name = ev("TRAIN_QUEUE", "webapp.train.requests")
    predict_queue_name = ev("PREDICT_QUEUE", "webapp.predict.requests")
    max_msgs = int(ev("MAX_MSGS", "100"))
    max_models = int(ev("MAX_MODELS", "100"))

    log.info("Creating antinex core")
    core = AntiNexCore(
        name="core",
        broker_url=broker_url,
        train_queue_name=train_queue_name,
        predict_queue_name=predict_queue_name,
        max_msgs=max_msgs,
        max_models=max_models)

    try:
        log.info("Starting antinex core")
        core.start(app=app)
    except Exception as e:
        log.info(("Core hit exception={} shutting down").format(e))

    core.shutdown()

    log.info(("canceling consumer to queue={}").format(train_queue_name))
    app.control.cancel_consumer(train_queue_name)
    log.info(("canceling consumer to queue={}").format(predict_queue_name))
    app.control.cancel_consumer(predict_queue_name)
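# Usage sketch (not part of the module). The import path
# ``antinex_core.antinex_worker`` is an assumption based on the project
# layout - adjust it to wherever ``start_antinex_core_worker`` lives in your
# install. It shows how the environment variables documented above select
# the broker, queues, and per-worker limits:
#
#     import os
#
#     os.environ["BROKER_URL"] = "redis://localhost:6379/9"
#     os.environ["TRAIN_QUEUE"] = "webapp.train.requests"
#     os.environ["PREDICT_QUEUE"] = "webapp.predict.requests"
#     os.environ["MAX_MSGS"] = "50"     # keep the last 50 messages for replay
#     os.environ["MAX_MODELS"] = "25"   # keep at most 25 pre-trained DNNs
#
#     from antinex_core.antinex_worker import start_antinex_core_worker
#     start_antinex_core_worker()  # blocks until the worker shuts down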
def __init__(self,
             name="",
             broker_url=ev("BROKER_URL", "redis://localhost:6379/6"),
             train_queue_name=ev("TRAIN_QUEUE", "webapp.train.requests"),
             predict_queue_name=ev("PREDICT_QUEUE", "webapp.predict.requests"),
             max_msgs=100,
             max_models=100):
    """__init__

    :param name: worker name
    :param broker_url: connection string to broker
    :param train_queue_name: queue name for training requests
    :param predict_queue_name: queue name for predict requests
    :param max_msgs: num msgs to save for replay debugging (FIFO)
    :param max_models: num pre-trained models to keep in memory (FIFO)
    """
    self.name = name
    log.info(("{} - INIT").format(self.name))

    self.state = "INIT"
    self.broker_url = broker_url

    # Setup queues:
    self.train_queue_name = train_queue_name
    self.predict_queue_name = predict_queue_name
    self.queues = [
        self.train_queue_name,
        self.predict_queue_name
    ]

    # Subscribers
    self.all_queues_sub = None

    # SSL Celery options dict
    self.ssl_options = {}

    # http://docs.celeryproject.org/en/latest/userguide/calling.html#calling-retry  # noqa
    # allow publishes to retry for a time
    self.task_publish_retry_policy = {
        "interval_max": 1,
        "max_retries": 120,     # None - forever
        "interval_start": 0.1,
        "interval_step": 0.2
    }

    # Confirm publishes with Celery
    # https://github.com/celery/kombu/issues/572
    self.transport_options = {
        "confirm_publish": True
    }

    self.conn_attrs = {
        "task_default_queue": "antinex.worker.control",
        "task_default_exchange": "antinex.worker.control",
        # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_hijack_root_logger
        "worker_hijack_root_logger": False,
        # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_prefetch_multiplier
        "worker_prefetch_multiplier": 1,  # consume 1 message at a time
        # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_prefetch_multiplier
        "prefetch_count": 3,  # consume 1 message at a time per worker (3 workers)
        # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_heartbeat
        "broker_heartbeat": 240,  # in seconds
        # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_connection_max_retries
        "broker_connection_max_retries": None,  # None is forever
        # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-task_acks_late
        "task_acks_late": True,  # on consume do not send an immediate ack back
        "task_publish_retry_policy": self.task_publish_retry_policy
    }

    self.processor = AntiNexProcessor(
        name="{}.prc".format(self.name),
        max_msgs=max_msgs,
        max_models=max_models)
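# Constructor sketch (assumptions: ``AntiNexCore`` is importable from the
# antinex_core package and ``app`` is the Celery application that owns the
# consumers - both depend on your install). Keyword arguments override the
# BROKER_URL / TRAIN_QUEUE / PREDICT_QUEUE environment defaults:
#
#     core = AntiNexCore(
#         name="core-1",
#         broker_url="redis://localhost:6379/6",
#         train_queue_name="webapp.train.requests",
#         predict_queue_name="webapp.predict.requests",
#         max_msgs=100,    # replay buffer size (FIFO)
#         max_models=100)  # pre-trained DNNs kept in memory (FIFO)
#     core.start(app=app)
#     ...
#     core.shutdown()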
def task_ml_prepare(self=None, req_node=None):
    """task_ml_prepare

    :param self: parent task object for bind=True
    :param req_node: job utils dictionary for passing a dictionary
    """

    log.info(("task - {} - start "
              "req_node={}").format(req_node["task_name"], ppj(req_node)))

    ml_prepare_data = req_node["data"].get("ml_prepare_data", None)

    user_obj = None
    ml_prepare_obj = None
    if req_node["use_cache"]:
        ml_prepare_obj = MLPrepare.objects.select_related().filter(
            Q(id=int(ml_prepare_data["id"]))).cache().first()
    else:
        ml_prepare_obj = MLPrepare.objects.select_related().filter(
            Q(id=int(ml_prepare_data["id"]))).first()
    # end of finding the MLPrepare record

    create_new_record = False

    # create the response node from request
    res = build_task_response(
        use_cache=req_node["use_cache"],
        celery_enabled=req_node["celery_enabled"],
        cache_key=req_node["cache_key"])

    try:

        if create_new_record:
            create_res = create_ml_prepare_record(req_node=req_node)
            user_obj = create_res.get("user_obj", None)
            ml_prepare_obj = create_res.get("ml_prepare_obj", None)
            if not user_obj:
                res["error"] = ("{} - Failed to find User").format(
                    req_node["task_name"])
                res["status"] = ERR
                res["error"] = create_res.get("err", "error not set")
                res["data"] = None
                log.error(res["error"])
                return res
            if not ml_prepare_obj:
                res["error"] = ("{} - Failed to create MLPrepare").format(
                    req_node["task_name"])
                res["status"] = ERR
                res["error"] = create_res.get("err", "error not set")
                res["data"] = None
                log.error(res["error"])
                return res
        # end of create_new_record

        last_step = ("starting user={} prepare={} "
                     "pipeline={} clean={} full={} "
                     "post={} label={} tracking={}").format(
                        ml_prepare_obj.user_id,
                        ml_prepare_obj.id,
                        ml_prepare_obj.pipeline_files,
                        ml_prepare_obj.clean_file,
                        ml_prepare_obj.full_file,
                        ml_prepare_obj.post_proc,
                        ml_prepare_obj.label_rules,
                        ml_prepare_obj.tracking_id)
        log.info(last_step)

        log_id = "job={}".format(ml_prepare_obj.id)

        log.info(("prepare={} csvs={}").format(
            ml_prepare_obj.id,
            ml_prepare_obj.ds_glob_path))

        ml_prepare_obj.pipeline_files = find_all_pipeline_csvs(
            use_log_id=log_id,
            csv_glob_path=ml_prepare_obj.ds_glob_path)

        log.info(("preparing={} clean={} full={} "
                  "meta_suffix={} files={}").format(
                    ml_prepare_obj.id,
                    ml_prepare_obj.clean_file,
                    ml_prepare_obj.full_file,
                    ml_prepare_obj.meta_suffix,
                    ml_prepare_obj.pipeline_files))

        save_node = build_csv(
            use_log_id=log_id,
            pipeline_files=ml_prepare_obj.pipeline_files,
            fulldata_file=ml_prepare_obj.full_file,
            clean_file=ml_prepare_obj.clean_file,
            post_proc_rules=ml_prepare_obj.post_proc,
            label_rules=ml_prepare_obj.label_rules,
            meta_suffix=ml_prepare_obj.meta_suffix)

        if save_node["status"] == VALID:
            log.info("successfully processed datasets:")

            ml_prepare_obj.post_proc = save_node["post_proc_rules"]
            ml_prepare_obj.post_proc["features_to_process"] = \
                save_node["features_to_process"]
            ml_prepare_obj.post_proc["ignore_features"] = \
                save_node["ignore_features"]
            ml_prepare_obj.post_proc["feature_to_predict"] = \
                save_node["feature_to_predict"]
            ml_prepare_obj.label_rules = save_node["label_rules"]
            ml_prepare_obj.pipeline_files = save_node["pipeline_files"]
            ml_prepare_obj.full_file = save_node["fulldata_file"]
            ml_prepare_obj.clean_file = save_node["clean_file"]
            ml_prepare_obj.status = "finished"
            ml_prepare_obj.control_state = "finished"
            ml_prepare_obj.save()
            log.info(("saved prepare={}").format(ml_prepare_obj.id))

            if ev("SHOW_SUMMARY", "0") == "1":
                log.info(("Full csv: {}").format(
                    save_node["fulldata_file"]))
                log.info(("Full meta: {}").format(
                    save_node["fulldata_metadata_file"]))
                log.info(("Clean csv: {}").format(
                    save_node["clean_file"]))
                log.info(("Clean meta: {}").format(
                    save_node["clean_metadata_file"]))
                log.info("------------------------------------------")
                log.info(("Predicting Feature: {}").format(
                    save_node["feature_to_predict"]))
                log.info(("Features to Process: {}").format(
                    ppj(save_node["features_to_process"])))
                log.info(("Ignored Features: {}").format(
                    ppj(save_node["ignore_features"])))
                log.info("------------------------------------------")
            # end of show summary

            log.info("Full: {}".format(save_node["fulldata_file"]))
            log.info("Cleaned (no-NaNs in columns): {}".format(
                save_node["clean_file"]))

            data = ml_prepare_obj.get_public()
            res["status"] = SUCCESS
            res["err"] = ""
            res["data"] = data
        else:
            last_step = ("failed to prepare csv status={} "
                         "errors: {}").format(
                            save_node["status"],
                            save_node["err"])
            log.error(last_step)
            ml_prepare_obj.status = "error"
            ml_prepare_obj.control_state = "error"
            ml_prepare_obj.save()
            data = {}  # build the error payload for the caller
            data["prepare"] = ml_prepare_obj.get_public()
            data["ready"] = {}
            res["status"] = ERR
            res["error"] = last_step
            res["data"] = data
            return res
        # end of checking it started

    except Exception as e:
        res["status"] = ERR
        res["err"] = ("Failed task={} with "
                      "ex={}").format(req_node["task_name"], e)
        res["data"] = None
        log.error(res["err"])
    # end of try/ex

    log.info(("task - {} - done").format(req_node["task_name"]))

    return res
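# Request sketch - the dictionary below only lists the keys task_ml_prepare
# actually reads above (task_name, use_cache, celery_enabled, cache_key, and
# data.ml_prepare_data.id); the values are illustrative, not defaults from
# the project.
#
#     req_node = {
#         "task_name": "task_ml_prepare",
#         "use_cache": False,      # True uses the cached queryset path
#         "celery_enabled": True,
#         "cache_key": None,
#         "data": {
#             "ml_prepare_data": {
#                 "id": 1          # existing MLPrepare record to process
#             }
#         }
#     }
#     res = task_ml_prepare(req_node=req_node)
#     # res["status"] is SUCCESS or ERR; res["data"] holds the public record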
def build_training_request(
        csv_file=ev("CSV_FILE", "/tmp/cleaned_attack_scans.csv"),
        meta_file=ev("CSV_META_FILE", "/tmp/cleaned_metadata.json"),
        predict_feature=ev("PREDICT_FEATURE", "label_value"),
        ignore_features=[
            "label_name",
            "ip_src",   # need to make this an int
            "ip_dst",   # need to make this an int
            "eth_src",  # need to make this an int
            "eth_dst"   # need to make this an int
        ],
        seed=None,
        test_size=float(ev("TEST_SIZE", "0.20")),
        preproc_rules=None):
    """build_training_request

    :param csv_file: csv file built with prepare_dataset.py
    :param meta_file: metadata file built with prepare_dataset.py
    :param predict_feature: feature (column) to predict
    :param ignore_features: features to remove from the csv
                            before the split of test + train data
    :param seed: integer used to seed the random number generator
    :param test_size: percent of records to split into test vs train
    :param preproc_rules: future preprocessing rules hooks
    """

    last_step = "not started"
    res = {
        "status": INVALID,
        "err": "",
        "csv_file": csv_file,
        "meta_file": meta_file,
        "meta_data": None,
        "seed": None,
        "test_size": test_size,
        "predict_feature": predict_feature,
        "features_to_process": [],
        "ignore_features": ignore_features,
        "X_train": None,
        "X_test": None,
        "Y_train": None,
        "Y_test": None
    }

    try:

        last_step = ("building seed={}").format(seed)
        log.debug(last_step)

        use_seed = seed
        if not use_seed:
            use_seed = 9

        # seed numpy and record the seed used so the split below is
        # reproducible (np.random.seed returns None, so store use_seed)
        np.random.seed(use_seed)
        res["seed"] = use_seed

        last_step = ("Loading csv={}").format(csv_file)
        log.info(last_step)

        if not os.path.exists(csv_file):
            res["status"] = ERROR
            res["err"] = ("Unable to find csv_file={}").format(csv_file)
            log.error(res["err"])
            return res
        # end of checking for a valid csv file on disk

        if not os.path.exists(meta_file):
            res["status"] = ERROR
            res["err"] = ("Unable to find meta_file={}").format(meta_file)
            log.error(res["err"])
            return res
        # end of checking for a valid metadata file on disk

        # load csv file into pandas dataframe
        df = pd.read_csv(csv_file)

        features_to_process = []
        meta_data = {}

        try:
            last_step = ("opening metadata={}").format(meta_file)
            log.debug(last_step)
            meta_data = json.loads(open(meta_file, "r").read())
            res["meta_data"] = meta_data
            if "post_proc_rules" in meta_data:
                if "drop_columns" in meta_data["post_proc_rules"]:
                    log.debug(("Found drop_columns={}").format(
                        meta_data["post_proc_rules"]["drop_columns"]))
                    for ign in meta_data["post_proc_rules"]["drop_columns"]:
                        ignore_features.append(ign)
        except Exception as e:
            res["error"] = ("Failed building ignore_features: "
                            "ignore_features={} meta={} meta_data={} "
                            "last_step='{}' ex='{}'").format(
                                ignore_features,
                                meta_file,
                                meta_data,
                                last_step,
                                e)
            log.error(res["error"])
            res["status"] = ERROR
            return res
        # end of trying to lookup the meta data file
        # for non-int/float features to ignore

        last_step = ("metadata={} df has "
                     "columns={} ignore={}").format(
                        meta_file,
                        df.columns.values,
                        ignore_features)
        log.info(last_step)

        for feature in df.columns.values:
            keep_it = True
            for ign in ignore_features:
                if feature == ign:
                    keep_it = False
            if keep_it:
                if feature != predict_feature:
                    features_to_process.append(feature)
        # end of for all features to process

        last_step = ("Done post-processing "
                     "Predicting={} with features={} "
                     "ignore_features={} records={}").format(
                        predict_feature,
                        features_to_process,
                        ignore_features,
                        len(df.index))
        log.info(last_step)

        res["predict_feature"] = predict_feature
        res["ignore_features"] = []
        for k in ignore_features:
            if k not in res["ignore_features"]:
                res["ignore_features"].append(k)
        res["features_to_process"] = []
        for k in features_to_process:
            if k not in res["features_to_process"]:
                if k != predict_feature:
                    res["features_to_process"].append(k)

        # split the data into training and test sets
        (res["X_train"],
         res["X_test"],
         res["Y_train"],
         res["Y_test"]) = train_test_split(
            df[features_to_process],
            df[predict_feature],
            test_size=test_size,
            random_state=res["seed"])

        last_step = ("Done splitting rows={} into "
                     "X_train={} X_test={} "
                     "Y_train={} Y_test={}").format(
                        len(df.index),
                        len(res["X_train"]),
                        len(res["X_test"]),
                        len(res["Y_train"]),
                        len(res["Y_test"]))
        log.info(("Success: {}").format(last_step))

        res["err"] = ""
        res["status"] = VALID
    except Exception as e:
        res["status"] = ERROR
        res["err"] = ("Failed build_training_request "
                      "step='{}' with ex='{}'").format(
                        last_step,
                        e)
        log.error(("build_training_request: {}").format(res["err"]))
    # end of try/ex

    return res
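# A minimal, self-contained sketch of the feature selection and split that
# build_training_request performs above, using only pandas and scikit-learn.
# ``demo_split`` and its arguments are illustrative helpers, not part of
# antinex_utils; the metadata-driven drop_columns handling is omitted.
import pandas as pd
from sklearn.model_selection import train_test_split


def demo_split(csv_file,
               predict_feature="label_value",
               ignore_features=None,
               test_size=0.2,
               seed=9):
    """Split a prepared csv into train/test sets like the function above."""
    ignore_features = ignore_features or []
    df = pd.read_csv(csv_file)
    # keep every column that is neither ignored nor the prediction target
    features = [c for c in df.columns.values
                if c not in ignore_features and c != predict_feature]
    return train_test_split(df[features],
                            df[predict_feature],
                            test_size=test_size,
                            random_state=seed)


# example:
# x_train, x_test, y_train, y_test = demo_split(
#     "/tmp/cleaned_attack_scans.csv",
#     ignore_features=["label_name"])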
def build_csv(
        pipeline_files=[],
        fulldata_file=None,
        clean_file=None,
        post_proc_rules=None,
        label_rules=None,
        use_log_id=None,
        meta_suffix="metadata.json"):
    """build_csv

    :param pipeline_files: list of files to process
    :param fulldata_file: output of non-edited merged data
    :param clean_file: cleaned csv file should be ready for training
    :param post_proc_rules: apply these rules to post processing (clean)
    :param label_rules: apply labeling rules (classification only)
    :param use_log_id: label for tracking the job in the logs
    :param meta_suffix: file suffix
    """

    save_node = {
        "status": INVALID,
        "pipeline_files": pipeline_files,
        "post_proc_rules": post_proc_rules,
        "label_rules": label_rules,
        "fulldata_file": fulldata_file,
        "fulldata_metadata_file": None,
        "clean_file": clean_file,
        "clean_metadata_file": None,
        "features_to_process": [],
        "feature_to_predict": None,
        "ignore_features": [],
        "full_headers": [],
        "clean_headers": [],
        "df_json": {},
        "version": 1
    }

    log_id = ""
    if use_log_id:
        log_id = use_log_id

    if not fulldata_file:
        log.error("missing fulldata_file - stopping")
        save_node["status"] = INVALID
        return save_node
    if not clean_file:
        log.error("missing clean_file - stopping")
        save_node["status"] = INVALID
        return save_node

    fulldata_metadata_file = "{}/fulldata_{}".format(
        "/".join(fulldata_file.split("/")[:-1]),
        meta_suffix)
    clean_metadata_file = "{}/cleaned_{}".format(
        "/".join(clean_file.split("/")[:-1]),
        meta_suffix)

    log.info(("{} build_csv - START").format(log_id))

    common_headers, \
        headers_dict = find_all_headers(
            use_log_id=log_id,
            pipeline_files=pipeline_files)

    log.info(("{} num common_headers={} headers={}").format(
        log_id, len(common_headers), common_headers))

    # since the headers can be different we rebuild a new one:

    mark_default_value = None
    if post_proc_rules and "mark_empty" in post_proc_rules:
        mark_default_value = post_proc_rules["mark_empty"]
        log.info(("{} using mark_empty={}").format(
            log_id, mark_default_value))

    hdrs = {}
    for h in common_headers:
        hdrs[h] = mark_default_value

    features_to_process = []
    feature_to_predict = None
    ignore_features = []

    set_if_above = None
    labels = []
    label_values = []
    if label_rules:
        set_if_above = label_rules["set_if_above"]
        labels = label_rules["labels"]
        label_values = label_rules["label_values"]

    if post_proc_rules:
        if "predict_feature" in post_proc_rules:
            feature_to_predict = post_proc_rules["predict_feature"]
    if not feature_to_predict:
        if "label_name" in hdrs:
            feature_to_predict = "label_name"

    all_rows = []

    num_done = 1
    total_files = len(pipeline_files)
    for c in pipeline_files:
        log.info(("{} merging={}/{} csv={}").format(
            log_id, num_done, total_files, c))
        cf = pd.read_csv(c)
        if mark_default_value:
            log.info(("{} filling nan with value={}").format(
                log_id, mark_default_value))
            cf.fillna(value=mark_default_value, inplace=True)
        # end of making sure fillna is done if requested
        log.info(("{} processing rows={}").format(log_id, len(cf.index)))

        for index, row in cf.iterrows():
            valid_row = True
            new_row = copy.deepcopy(hdrs)
            new_row["src_file"] = c
            for k in hdrs:
                if k in row:
                    new_row[k] = row[k]
                else:
                    if mark_default_value:
                        new_row[k] = mark_default_value
            # end of for all headers to copy in

            if label_rules:
                test_rand = random.randint(0, 100)
                if test_rand > set_if_above:
                    new_row["label_value"] = label_values[1]
                    new_row["label_name"] = labels[1]
                    # if you make the "set above" greater than 100
                    # it will tag the entire dataset with just 1 label
                    # nice if your data is the same
                else:
                    new_row["label_value"] = label_values[0]
                    new_row["label_name"] = labels[0]
            # end of applying label rules

            if valid_row:
                all_rows.append(new_row)
        # end of for all rows in this file

        num_done += 1
    # end of building all files into one list

    log.info(("{} fulldata rows={} generating df").format(
        log_id, len(all_rows)))

    df = pd.DataFrame(all_rows)
    log.info(("{} df rows={} headers={}").format(
        log_id, len(df.index), df.columns.values))

    if ev("CONVERT_DF", "0") == "1":
        log.info(("{} converting df to json").format(log_id))
        save_node["df_json"] = df.to_json()

    if clean_file:
        log.info(("{} writing fulldata_file={}").format(
            log_id, fulldata_file))
        df.to_csv(fulldata_file,
                  sep=',',
                  encoding='utf-8',
                  index=False)
        log.info(("{} done writing fulldata_file={}").format(
            log_id, fulldata_file))

        if post_proc_rules:

            features_to_process = []
            ignore_features = []
            if label_rules:
                if feature_to_predict:
                    ignore_features = [feature_to_predict]

            if "drop_columns" in post_proc_rules:
                for p in post_proc_rules["drop_columns"]:
                    if p in headers_dict:
                        ignore_features.append(p)
                # post proc filter more features out
                # for non-int/float types

                for d in df.columns.values:
                    add_this_one = True
                    for i in ignore_features:
                        if d == i:
                            add_this_one = False
                            break
                    if add_this_one:
                        features_to_process.append(d)
                # for all df columns we're not ignoring...
                # add them as features to process

                log.info(("{} writing fulldata metadata file={}").format(
                    log_id, fulldata_metadata_file))

                header_data = {
                    "headers": list(df.columns.values),
                    "output_type": "fulldata",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": features_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": ignore_features,
                    "created": rnow()
                }

                save_node["full_headers"] = list(df.columns.values)
                with open(fulldata_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

                keep_these = features_to_process
                if feature_to_predict:
                    keep_these.append(feature_to_predict)

                log.info(("{} creating new clean_file={} "
                          "keep_these={} "
                          "predict={}").format(
                            log_id,
                            clean_file,
                            keep_these,
                            feature_to_predict))

                # need to remove all columns that are all nan
                clean_df = None
                if "keep_nans" not in post_proc_rules:
                    clean_df = df[keep_these].dropna(
                        axis=1, how='all').dropna()
                else:
                    clean_df = df[keep_these].dropna(
                        axis=1, how='all')
                # allow keeping empty columns

                log.info(("{} clean_df columns={} rows={}").format(
                    log_id,
                    clean_df.columns.values,
                    len(clean_df.index)))

                if len(clean_df.columns.values) == 0:
                    log.error("Postproc clean df has no columns")
                if len(clean_df.index) == 0:
                    log.error("Postproc clean df has no rows")

                cleaned_features = clean_df.columns.values
                cleaned_to_process = []
                cleaned_ignore_features = []
                for c in cleaned_features:
                    if feature_to_predict:
                        if c == feature_to_predict:
                            cleaned_ignore_features.append(c)
                        else:
                            keep_it = True
                            for ign in ignore_features:
                                if c == ign:
                                    cleaned_ignore_features.append(c)
                                    keep_it = False
                                    break
                            # end of for all features to remove
                            if keep_it:
                                cleaned_to_process.append(c)
                # end of new feature columns

                log.info(("{} writing DROPPED clean_file={} "
                          "features_to_process={} "
                          "ignore_features={} "
                          "predict={}").format(
                            log_id,
                            clean_file,
                            cleaned_to_process,
                            cleaned_ignore_features,
                            feature_to_predict))

                write_clean_df = clean_df.drop(
                    columns=cleaned_ignore_features)
                log.info(("cleaned_df rows={}").format(
                    len(write_clean_df.index)))
                write_clean_df.to_csv(clean_file,
                                      sep=',',
                                      encoding='utf-8',
                                      index=False)

                clean_metadata_file = "{}/cleaned_{}".format(
                    "/".join(clean_file.split("/")[:-1]),
                    meta_suffix)
                log.info(("{} writing clean metadata file={}").format(
                    log_id, clean_metadata_file))

                header_data = {
                    "headers": list(write_clean_df.columns.values),
                    "output_type": "clean",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": cleaned_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": cleaned_ignore_features,
                    "created": rnow()
                }

                save_node["clean_headers"] = list(
                    write_clean_df.columns.values)
                with open(clean_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))
            else:
                for d in df.columns.values:
                    add_this_one = True
                    for i in ignore_features:
                        if d == i:
                            add_this_one = False
                            break
                    if add_this_one:
                        features_to_process.append(d)
                # for all df columns we're not ignoring...
                # add them as features to process

                log.info(("{} writing fulldata metadata file={}").format(
                    log_id, fulldata_metadata_file))

                header_data = {
                    "headers": list(df.columns.values),
                    "output_type": "fulldata",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": features_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": ignore_features,
                    "created": rnow()
                }

                save_node["full_headers"] = list(df.columns.values)
                with open(fulldata_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

                keep_these = features_to_process
                if feature_to_predict:
                    keep_these.append(feature_to_predict)

                log.info(("{} creating new clean_file={} "
                          "keep_these={} "
                          "predict={}").format(
                            log_id,
                            clean_file,
                            keep_these,
                            feature_to_predict))

                # need to remove all columns that are all nan
                clean_df = None
                if "keep_nans" not in post_proc_rules:
                    clean_df = df[keep_these].dropna(
                        axis=1, how='all').dropna()
                else:
                    clean_df = df[keep_these].dropna(
                        axis=1, how='all')
                # allow keeping empty columns

                log.info(("{} clean_df columns={} rows={}").format(
                    log_id,
                    clean_df.columns.values,
                    len(clean_df.index)))

                if len(clean_df.columns.values) == 0:
                    log.error(
                        ("{} The clean df has no columns").format(log_id))
                if len(clean_df.index) == 0:
                    log.error(
                        ("{} The clean df has no rows").format(log_id))

                cleaned_features = clean_df.columns.values
                cleaned_to_process = []
                cleaned_ignore_features = []
                for c in cleaned_features:
                    if feature_to_predict:
                        if c == feature_to_predict:
                            cleaned_ignore_features.append(c)
                        else:
                            keep_it = True
                            for ign in ignore_features:
                                if c == ign:
                                    cleaned_ignore_features.append(c)
                                    keep_it = False
                                    break
                            # end of for all features to remove
                            if keep_it:
                                cleaned_to_process.append(c)
                # end of new feature columns

                log.info(("{} writing DROPPED clean_file={} "
                          "features_to_process={} "
                          "ignore_features={} "
                          "predict={}").format(
                            log_id,
                            clean_file,
                            cleaned_to_process,
                            cleaned_ignore_features,
                            feature_to_predict))

                write_clean_df = clean_df.drop(
                    columns=cleaned_ignore_features)
                log.info(("{} cleaned_df rows={}").format(
                    log_id, len(write_clean_df.index)))
                write_clean_df.to_csv(clean_file,
                                      sep=',',
                                      encoding='utf-8',
                                      index=False)

                log.info(("{} writing clean metadata file={}").format(
                    log_id, clean_metadata_file))

                header_data = {
                    "headers": list(write_clean_df.columns.values),
                    "output_type": "clean",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": cleaned_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": cleaned_ignore_features,
                    "created": rnow()
                }

                save_node["clean_headers"] = list(
                    write_clean_df.columns.values)
                with open(clean_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))
            # end of if/else

            save_node["clean_file"] = clean_file
            save_node["clean_metadata_file"] = clean_metadata_file

            log.info(("{} done writing clean_file={}").format(
                log_id, clean_file))
        # end of post_proc_rules

        save_node["fulldata_file"] = fulldata_file
        save_node["fulldata_metadata_file"] = fulldata_metadata_file

        save_node["status"] = VALID
    # end of writing the file

    save_node["features_to_process"] = features_to_process
    save_node["feature_to_predict"] = feature_to_predict
    save_node["ignore_features"] = ignore_features

    log.info(("{} build_csv - END").format(log_id))

    return save_node
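# Rules sketch - example label_rules / post_proc_rules dictionaries built
# only from the keys build_csv reads above (set_if_above, labels,
# label_values, predict_feature, drop_columns, mark_empty, keep_nans). The
# file paths and label names are illustrative, not project defaults.
#
#     label_rules = {
#         "set_if_above": 85,      # rows drawing rand > 85 get labels[1]
#         "labels": ["not_attack", "attack"],
#         "label_values": [0, 1]
#     }
#     post_proc_rules = {
#         "drop_columns": ["src_file", "idx"],
#         "predict_feature": "label_name",
#         "mark_empty": -1.0       # fillna value; omit to leave NaNs alone
#         # add "keep_nans": True to skip the final row-level dropna()
#     }
#     save_node = build_csv(
#         pipeline_files=["/tmp/netdata_1.csv", "/tmp/netdata_2.csv"],
#         fulldata_file="/tmp/fulldata_attack_scans.csv",
#         clean_file="/tmp/cleaned_attack_scans.csv",
#         post_proc_rules=post_proc_rules,
#         label_rules=label_rules)
#     assert save_node["status"] == VALID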