def test_get_ml_job_result(self):
    """Create an ML Job over the API, then fetch its result record
    and verify the returned payload matches what was stored."""
    job_payload = self.build_ml_job_dataset()

    # validate jwt tokens work
    self.login_user()

    # phase 1 - create the job + result over the REST API
    create_req = self.factory.post(
        self.ml_run_url,
        job_payload,
        HTTP_AUTHORIZATION=self.jwt_auth,
        format="json")
    create_view = MLJobViewSet.as_view({"post": "create"})
    create_res = create_view(create_req)

    # exactly one job and one result should now exist
    self.assertEqual(MLJob.objects.count(), 1)
    self.assertEqual(MLJobResult.objects.count(), 1)
    self.assertEqual(create_res.status_code, status.HTTP_201_CREATED)
    self.assertEqual(create_res.data["job"]["user_name"],
                     self.test_username)
    self.assertEqual(create_res.data["job"]["title"],
                     job_payload["title"])

    job_id = int(create_res.data["job"]["id"])
    result_node = create_res.data["results"]
    result_id = int(result_node["id"])
    error_data = result_node["error_data"]

    # phase 2 - fetch the stored result back over the REST API
    fetch_req = self.factory.get(
        self.ml_get_result_url,
        HTTP_AUTHORIZATION=self.jwt_auth,
        format="json")
    fetch_view = MLJobResultViewSet.as_view({"get": "retrieve"})
    fetch_res = fetch_view(fetch_req, pk=result_id)
    print(ppj(fetch_res.data))

    self.assertEqual(fetch_res.data["user_name"], self.test_username)
    self.assertEqual(fetch_res.data["job_id"], job_id)
    self.assertEqual(fetch_res.data["id"], result_id)
    self.assertEqual(fetch_res.data["error_data"], error_data)
    self.assertContains(fetch_res, "model_json")
    self.assertContains(fetch_res, "model_weights")
    self.assertTrue(
        len(json.dumps(fetch_res.data["model_json"])) > 0)
    self.assertTrue(
        len(json.dumps(fetch_res.data["model_weights"])) > 0)
def build_dataset_regression_request(
        self,
        data_file=("./tests/datasets/regression/"
                   "dataset_prediction.json"),
        predict_rows_file=("./tests/datasets/regression/"
                           "stock.csv")):
    """Build a regression prediction request dictionary.

    Loads the dataset manifest json from ``data_file``, attaches the
    prediction rows built from ``predict_rows_file`` and tags the
    request with a unique ``label``.

    :param data_file: path to the dataset manifest json
    :param predict_rows_file: csv file used to build prediction rows
    """
    rows = self.build_prediction_rows(data_file=predict_rows_file)

    # the manifest becomes the request body
    with open(data_file) as manifest_fh:
        request_body = json.loads(manifest_fh.read())

    # unique label so repeated test runs do not collide
    request_body["label"] = "testing_{}".format(str(uuid.uuid4()))
    request_body["predict_rows"] = rows

    if bool(os.getenv("TEST_DEBUG", "0") == "1"):
        print(ppj(request_body))

    return request_body
# handle the ML Job POST response; "post_response", "log",
# "using_named_files" and "custom_output_dir" come from earlier in
# this script (not visible in this chunk)
if post_response.status_code != 201 \
        and post_response.status_code != 200:
    # anything other than 200/201 is fatal for this CLI run
    log.error(("Failed with Post response status={} reason={}")
              .format(post_response.status_code,
                      post_response.reason))
    log.error("Details:\n{}".format(post_response.text))
    sys.exit(1)
else:
    log.info(("SUCCESS - Post Response status={} reason={}")
             .format(post_response.status_code,
                     post_response.reason))

as_json = True
record = {}
if as_json:
    record = json.loads(post_response.text)
    log.info(ppj(record))
    # NOTE(review): this block reads record["clean_file"] etc., so it
    # only makes sense inside the as_json branch - confirm nesting
    # against the original script
    if using_named_files or custom_output_dir:
        print("")
        print("Train a Neural Network with:")
        print(("./create-keras-dnn.py "
               "{} {}cleaned_{}").format(
                   record["clean_file"],
                   record["output_dir"],
                   record["meta_suffix"]))
        print("")
# end of post for running an ML Job

sys.exit(0)
# wire the Celery app ("app" is created earlier in this script) to the
# Django settings and discover the project's task modules
app.config_from_object(
    "django.conf:settings",
    namespace="CELERY")
app.autodiscover_tasks(
    lambda: settings.INSTALLED_APPS)

# load the sample payload to publish to the core worker
datafile = "./drf_network_pipeline/tests/pubsub/publish-to-core.json"
data = {}
with open(datafile, "r") as f:
    data = json.loads(f.read())

# Celery task routing and queue
parent_route = "drf_network_pipeline.pipeline.tasks"
task_name = ("{}.task_publish_to_core").format(
    parent_route)
queue_name = ("{}.task_publish_to_core").format(
    parent_route)

log.info(("sending args={} to broker={} task={}")
         .format(
             ppj(data),
             app.conf["BROKER_URL"],
             task_name))

# fire-and-forget: the AsyncResult is not awaited here
app.send_task(
    task_name,
    args=[data],
    queue=queue_name)
# standalone script: build a Celery app against the Django settings,
# send the get-user task to the broker, then block on the result
os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                      "drf_network_pipeline.settings")

log.info("creating celery app")
app = Celery("test-app")
app.config_from_object("django.conf:settings",
                       namespace="CELERY")
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)

# load the sample request payload
datafile = "./drf_network_pipeline/tests/pubsub/get-user.json"
data = {}
with open(datafile, "r") as f:
    data = json.loads(f.read())

# Celery task routing and queue
parent_route = "drf_network_pipeline.users.tasks"
task_name = ("{}.task_get_user").format(parent_route)
queue_name = ("{}.task_get_user").format(parent_route)

log.info(
    ("sending args={} to broker={} task={}").format(
        ppj(data),
        app.conf["BROKER_URL"],
        task_name))

task_res = app.send_task(task_name,
                         args=[data],
                         queue=queue_name)

# task_res.get() blocks until the worker replies
log.info(("task={} task.id={} result={}").format(task_name,
                                                 task_res.id,
                                                 ppj(task_res.get())))
def show_diagnostics(self):
    """Log the loaded models and every message received so far."""
    log.info(("{} - models={}").format(self.name, self.models))
    for msg_idx, msg in enumerate(self.recv_msgs):
        log.info(("msg={} contents={}").format(msg_idx, ppj(msg)))
def run_task(task_method=None,
             task_name="please-set-name",
             req_data=None,
             get_result=False,
             delay_timeout=settings.CELERY_GET_RESULT_TIMEOUT,
             use_cache=settings.CACHEOPS_ENABLED,
             cache_record=False,
             cache_key=None):
    """run_task

    Handles Celery sync/async task processing

    :param task_method: requested method
    :param task_name: name of the task for logging
    :param req_data: requested data
    :param get_result: get the result from task
    :param delay_timeout: seconds to wait for the task to finish
    :param use_cache: use the cached record if available
    :param cache_record: cache the result in redis after done
    :param cache_key: cache the result in this redis key
    """
    req_node = build_task_request(task_name=task_name,
                                  use_cache=use_cache,
                                  cache_record=cache_record,
                                  cache_key=cache_key,
                                  data=req_data)
    # seed the response so the except-handler below always has a
    # well-formed node to inspect
    res_node = build_task_response(status=NOTRUN,
                                   data=None,
                                   err="not-run")
    try:
        res_node = handle_task_method(req_node=req_node,
                                      get_result=get_result,
                                      delay_timeout=delay_timeout,
                                      task_method=task_method)
        if "celery_enabled" not in res_node:
            log.error(("Invalid return node from task={} "
                       "task_method={} with req_node={} "
                       "returned data={}").format(task_name,
                                                  task_method,
                                                  ppj(req_node),
                                                  ppj(res_node)))
        # SUCCESS, or a fire-and-forget task that is still running
        # (NOTDONE without get_result), both count as a good hand-off.
        # (Was two identical branches; also dropped the no-op
        # res_node["status"] = res_node["status"] self-assignment.)
        if (res_node["status"] == SUCCESS
                or (not get_result and res_node["status"] == NOTDONE)):
            log.info(("celery={} - running task with data={}").format(
                res_node["celery_enabled"],
                str(res_node["data"])[0:32]))
        else:
            res_node["data"] = None
            res_node["err"] = ("task={} method={} "
                               "status={} err={}").format(
                                   task_name,
                                   task_method,
                                   res_node["status"],
                                   res_node["err"])
            log.error(("Failed {}").format(res_node["err"]))
        # end of handling success/failure
    except Exception as e:
        res_node = build_task_response(
            status=ERR,
            data=None,
            err=("Failed to run {} celery={} "
                 "with ex={}").format(
                     task_name,
                     res_node.get("celery_enabled", None),
                     e))
        log.error(res_node["err"])
    # try/ex handling Celery task

    return res_node
def run_train_and_predict(self, req):
    """run_train_and_predict

    Train (or reuse) a model for the message consumed off the queue,
    log every sample prediction, optionally publish the final results
    back to the REST API broker, and cache them under the model name.

    :param req: message dict consumed from a queue
    """
    log.info(("{} loading predict_rows into a df").format(
        req["use_model_name"]))

    # the REST API can ask for the worker to publish
    # results to the broker details from the manifest
    # which is stored in the REST API db
    worker_result_node = None
    if "manifest" in req:
        worker_result_node = req["manifest"].get("worker_result_node",
                                                 None)
    # end of getting 'where to send the results' from
    # the manifest

    # predict_rows arrives as a json string; the df is only used for
    # row counts in the log lines below
    predict_df = pd.read_json(req["predict_rows"])

    # "show_model_json" arrives as "0"/"1" - fall back to False on
    # anything that does not parse as an int
    show_model_json = False
    try:
        show_model_json = bool(int(req.get("show_model_json", "0")) == 1)
    except Exception as e:
        show_model_json = False
        log.error(("{} - Set show_model_json to 0 or 1 ex={}").format(
            req["label"],
            e))

    ml_type = req["ml_type"].lower()
    predict_feature = req["predict_feature"]
    predictions = []

    # hand off to antinex-utils for the actual train + predict
    res = antinex_utils.make_predictions.make_predictions(req)

    if res["status"] == SUCCESS:
        log.info(("{} - processing results").format(req["label"]))
        res_data = res["data"]
        model = res_data["model"]
        acc_data = res_data["acc"]
        are_predictions_merged = res_data["are_predicts_merged"]
        predictions = res_data["sample_predictions"]
        accuracy = acc_data.get("accuracy", None)

        if are_predictions_merged:
            # merged predictions: one dataframe with a
            # "_predicted_<feature>" column alongside the originals
            log.info(("{} - processing merged predictions").format(
                req["label"]))
            merge_df = res_data["merge_df"]
            model_predict_feature = "_predicted_{}".format(
                predict_feature)
            if model_predict_feature not in merge_df:
                log.error(
                    ("{} missing predicted feature={} "
                     "from res={}").format(req["label"],
                                           model_predict_feature,
                                           res))
                return res
            for idx, row in merge_df.iterrows():
                log.info(("cur_sample={} - {}={} predicted={}").format(
                    idx,
                    predict_feature,
                    float(row[predict_feature]),
                    float(row[model_predict_feature])))
            # same as the merge method in antinex-utils
        else:
            # per-sample prediction dicts - log each one, with
            # formatting depending on the ml_type
            for idx, node in enumerate(predictions):
                label_name = None
                if "label_name" in node:
                    label_name = node["label_name"]
                org_feature = "_original_{}".format(predict_feature)
                original_value = None
                if org_feature in node:
                    original_value = node[org_feature]
                if "regression" in ml_type:
                    log.info(("sample={} - {}={} predicted={}").format(
                        node["_row_idx"],
                        predict_feature,
                        float(original_value),
                        float(node[predict_feature])))
                elif "classification" in ml_type:
                    log.info(
                        ("sample={} - {}={} predicted={} "
                         "label={}").format(
                            node["_row_idx"],
                            predict_feature,
                            original_value,
                            node[predict_feature],
                            label_name))
                else:
                    log.info(("sample={} - {}={} predicted={}").format(
                        node["_row_idx"],
                        predict_feature,
                        original_value,
                        node[predict_feature]))
            # end of predicting predictions

        if show_model_json:
            log.info(("{} made predictions={} model={} ").format(
                req["label"],
                len(predict_df.index),
                ppj(json.loads(model.model.to_json()))))

        log.info(
            ("{} made predictions={} found={} "
             "accuracy={}").format(req["label"],
                                   len(predict_df.index),
                                   len(res_data["sample_predictions"]),
                                   accuracy))

        final_results = {
            "data": res_data,
            "created": datetime.datetime.now().strftime(
                "%Y-%m-%d %H:%M:%S")
        }

        if worker_result_node:
            log.info(("CORERES {} publishing results back "
                      "to api").format(
                req["label"]))
            # NOTE(review): local name shadows any module-level
            # "status" import within this scope
            status = send_results_to_broker(loc=worker_result_node,
                                            final_results=final_results)
            log.info(("CORERES {} publishing results success={}").format(
                req["label"],
                bool(status == SUCCESS)))
        # end of sending the results back

        log.info(("{} - model={} finished processing").format(
            req["label"],
            req["use_model_name"]))

        # cache so later requests can reuse the pre-trained model
        self.models[req["use_model_name"]] = final_results
    else:
        log.info(
            ("{} failed predictions={}").format(req["label"],
                                                len(predict_df.index)))
    # end of if good train and predict

    return res
def task_ml_prepare(self=None, req_node=None):
    """task_ml_prepare

    Run the dataset prepare workflow for an existing MLPrepare record:
    find the pipeline csvs, merge + clean them with build_csv, then
    persist the outcome back onto the record.

    :param self: parent task object for bind=True
    :param req_node: job utils dictionary for passing a dictionary
    """
    log.info(("task - {} - start "
              "req_node={}").format(req_node["task_name"],
                                    ppj(req_node)))

    ml_prepare_data = req_node["data"].get("ml_prepare_data", None)

    user_obj = None
    ml_prepare_obj = None
    if req_node["use_cache"]:
        ml_prepare_obj = MLPrepare.objects.select_related().filter(
            Q(id=int(ml_prepare_data["id"]))).cache().first()
    else:
        ml_prepare_obj = MLPrepare.objects.select_related().filter(
            Q(id=int(ml_prepare_data["id"]))).first()
    # end of finding the MLPrepare record

    create_new_record = False

    # create the response node from request
    res = build_task_response(use_cache=req_node["use_cache"],
                              celery_enabled=req_node["celery_enabled"],
                              cache_key=req_node["cache_key"])

    try:
        if create_new_record:
            create_res = create_ml_prepare_record(req_node=req_node)
            user_obj = create_res.get("user_obj", None)
            ml_prepare_obj = create_res.get("ml_prepare_obj", None)
            if not user_obj:
                res["error"] = ("{} - Failed to find User").format(
                    req_node["task_name"])
                res["status"] = ERR
                res["error"] = create_res.get("err", "error not set")
                res["data"] = None
                log.error(res["error"])
                return res
            if not ml_prepare_obj:
                res["error"] = ("{} - Failed to create MLPrepare").format(
                    req_node["task_name"])
                res["status"] = ERR
                res["error"] = create_res.get("err", "error not set")
                res["data"] = None
                log.error(res["error"])
                return res
        # end of create_new_record

        last_step = ("starting user={} prepare={} "
                     "pipeline={} clean={} full={} "
                     "post={} label={} tracking={}").format(
                         ml_prepare_obj.user_id,
                         ml_prepare_obj.id,
                         ml_prepare_obj.pipeline_files,
                         ml_prepare_obj.clean_file,
                         ml_prepare_obj.full_file,
                         ml_prepare_obj.post_proc,
                         ml_prepare_obj.label_rules,
                         ml_prepare_obj.tracking_id)
        log.info(last_step)

        log_id = "job={}".format(ml_prepare_obj.id)

        log.info(("prepare={} csvs={}").format(
            ml_prepare_obj.id,
            ml_prepare_obj.ds_glob_path))

        # discover all csvs matching the record's glob path
        ml_prepare_obj.pipeline_files = find_all_pipeline_csvs(
            use_log_id=log_id,
            csv_glob_path=ml_prepare_obj.ds_glob_path)

        log.info(
            ("preparing={} clean={} full={} "
             "meta_suffix={} files={}").format(
                 ml_prepare_obj.id,
                 ml_prepare_obj.clean_file,
                 ml_prepare_obj.full_file,
                 ml_prepare_obj.meta_suffix,
                 ml_prepare_obj.pipeline_files))

        # merge + clean the csvs into the full/clean dataset files
        save_node = build_csv(
            use_log_id=log_id,
            pipeline_files=ml_prepare_obj.pipeline_files,
            fulldata_file=ml_prepare_obj.full_file,
            clean_file=ml_prepare_obj.clean_file,
            post_proc_rules=ml_prepare_obj.post_proc,
            label_rules=ml_prepare_obj.label_rules,
            meta_suffix=ml_prepare_obj.meta_suffix)

        if save_node["status"] == VALID:
            log.info("successfully processed datasets:")

            # persist the build results back onto the record
            ml_prepare_obj.post_proc = save_node["post_proc_rules"]
            ml_prepare_obj.post_proc["features_to_process"] = \
                save_node["features_to_process"]
            ml_prepare_obj.post_proc["ignore_features"] = \
                save_node["ignore_features"]
            ml_prepare_obj.post_proc["feature_to_predict"] = \
                save_node["feature_to_predict"]
            ml_prepare_obj.label_rules = save_node["label_rules"]
            ml_prepare_obj.pipeline_files = save_node["pipeline_files"]
            ml_prepare_obj.full_file = save_node["fulldata_file"]
            ml_prepare_obj.clean_file = save_node["clean_file"]
            ml_prepare_obj.status = "finished"
            ml_prepare_obj.control_state = "finished"
            ml_prepare_obj.save()
            log.info(("saved prepare={}").format(ml_prepare_obj.id))

            if ev("SHOW_SUMMARY", "0") == "1":
                log.info(("Full csv: {}").format(
                    save_node["fulldata_file"]))
                log.info(("Full meta: {}").format(
                    save_node["fulldata_metadata_file"]))
                log.info(("Clean csv: {}").format(
                    save_node["clean_file"]))
                log.info(("Clean meta: {}").format(
                    save_node["clean_metadata_file"]))
                log.info("------------------------------------------")
                log.info(("Predicting Feature: {}").format(
                    save_node["feature_to_predict"]))
                log.info(("Features to Process: {}").format(
                    ppj(save_node["features_to_process"])))
                log.info(("Ignored Features: {}").format(
                    ppj(save_node["ignore_features"])))
                log.info("------------------------------------------")
            # end of show summary

            log.info("Full: {}".format(save_node["fulldata_file"]))
            log.info("Cleaned (no-NaNs in columns): {}".format(
                save_node["clean_file"]))

            data = ml_prepare_obj.get_public()
            res["status"] = SUCCESS
            res["err"] = ""
            res["data"] = data
        else:
            last_step = ("failed to prepare csv status={} "
                         "errors: {}").format(save_node["status"],
                                              save_node["err"])
            log.error(last_step)
            ml_prepare_obj.status = "error"
            ml_prepare_obj.control_state = "error"
            ml_prepare_obj.save()
            # bug fix: "data" was only assigned on the success path,
            # so this branch raised a NameError that was swallowed by
            # the broad except below and masked the real error
            data = {}
            data["prepare"] = ml_prepare_obj.get_public()
            data["ready"] = {}
            res["status"] = ERR
            res["error"] = last_step
            res["data"] = data

        return res
        # end of checking it started
    except Exception as e:
        res["status"] = ERR
        res["err"] = ("Failed task={} with "
                      "ex={}").format(req_node["task_name"],
                                      e)
        res["data"] = None
        log.error(res["err"])
    # end of try/ex

    log.info(("task - {} - done").format(req_node["task_name"]))

    return res
def task_ml_job(self=None, req_node=None):
    """task_ml_job

    Run (or hand off) training + prediction for an existing
    MLJob/MLJobResult pair, persist the model json, weights, accuracy
    and predictions back to the result record, and optionally publish
    the request to the core worker.

    :param self: parent task object for bind=True
    :param req_node: job utils dictionary for passing a dictionary
    """
    log.info(("task - {} - start "
              "req_node={}").format(req_node["task_name"],
                                    ppj(req_node)))

    user_data = req_node["data"].get("user_data", None)
    ml_job = req_node["data"].get("ml_job_data", None)
    ml_result = req_node["data"].get("ml_result_data", None)
    model_desc = req_node["data"].get("model_desc", None)
    label_rules = req_node["data"].get("label_rules", None)
    predict_rows = req_node["data"].get("predict_rows", None)

    user_res = db_lookup_user(user_id=user_data["id"])
    user_obj = user_res.get("user_obj", None)

    ml_job_id = None
    ml_result_id = None
    ml_job_obj = None
    found_predictions = []
    found_accuracy = None

    if req_node["use_cache"]:
        ml_job_obj = MLJob.objects.select_related().filter(
            Q(id=int(ml_job["id"])) & Q(user=user_obj)).cache().first()
    else:
        ml_job_obj = MLJob.objects.select_related().filter(
            Q(id=int(ml_job["id"])) & Q(user=user_obj)).first()
    # end of finding the MLJob record

    ml_result_obj = None
    if req_node["use_cache"]:
        ml_result_obj = MLJobResult.objects.select_related().filter(
            Q(id=int(ml_result["id"])) & Q(user=user_obj)).cache().first()
    else:
        ml_result_obj = MLJobResult.objects.select_related().filter(
            Q(id=int(ml_result["id"])) & Q(user=user_obj)).first()
    # end of finding the MLJobResult record

    res = build_task_response(use_cache=req_node["use_cache"],
                              celery_enabled=req_node["celery_enabled"],
                              cache_key=req_node["cache_key"])

    last_step = "not started"
    data = {}
    data["job"] = {}
    data["results"] = {}
    try:
        res["status"] = ERR
        res["error"] = ""

        # pull all tunables off the job's manifest; numeric values
        # arrive as strings so they are coerced here
        predict_manifest = ml_job_obj.predict_manifest
        csv_file = predict_manifest.get("csv_file", None)
        meta_file = predict_manifest.get("meta_file", None)
        epochs = int(predict_manifest.get("epochs", "5"))
        test_size = float(predict_manifest.get("test_size", "0.2"))
        batch_size = int(predict_manifest.get("batch_size", "32"))
        verbose = int(predict_manifest.get("verbose", "1"))
        # use pre-trained models in memory by label
        use_model_name = ml_job_obj.predict_manifest.get(
            "use_model_name", None)
        dataset = ml_job_obj.predict_manifest.get("dataset", None)
        predict_rows = ml_job_obj.predict_manifest.get(
            "predict_rows", None)
        predict_feature = ml_job_obj.predict_manifest.get(
            "predict_feature", None)
        features_to_process = ml_job_obj.predict_manifest.get(
            "features_to_process", None)
        ignore_features = ml_job_obj.predict_manifest.get(
            "ignore_features", None)
        publish_to_core = ml_job_obj.predict_manifest.get(
            "publish_to_core", None)
        apply_scaler = ml_job_obj.predict_manifest.get(
            "apply_scaler", True)
        sort_values = ml_job_obj.predict_manifest.get(
            "sort_values", None)
        max_records = int(
            ml_job_obj.predict_manifest.get("max_records", "100000"))
        loss = ml_job_obj.predict_manifest.get(
            "loss", "binary_crossentropy")
        metrics = ml_job_obj.predict_manifest.get(
            "metrics", ["accuracy"])
        optimizer = ml_job_obj.predict_manifest.get("optimizer", "adam")
        histories = ml_job_obj.predict_manifest.get(
            "histories", ["val_loss", "val_acc", "loss", "acc"])

        # if the manifest already carries a dataset (or rows) plus the
        # feature list, antinex-utils can build everything itself
        needs_local_builder = True
        if ((dataset or predict_rows) and features_to_process):
            log.info(("using antinex builder dataset={} predict_rows={} "
                      "features_to_process={}").format(
                          dataset,
                          predict_rows,
                          features_to_process))
            needs_local_builder = False
        # flag for bypassing build inside django instead of antinex-utils

        image_file = ml_result_obj.acc_image_file
        version = ml_job_obj.version
        ml_job_id = ml_job_obj.id
        ml_result_id = ml_result_obj.id

        last_step = ("starting user={} "
                     "job.id={} result.id={} predict={} "
                     "model_desc={} "
                     "csv={} meta={}").format(ml_job_obj.user.id,
                                              ml_job_id,
                                              ml_result_id,
                                              ml_job_obj.predict_feature,
                                              model_desc,
                                              csv_file,
                                              meta_file)
        log.info(last_step)

        ml_job_obj.status = "analyzing"
        ml_job_obj.save()

        if needs_local_builder:
            # build the training request in-process from the csv +
            # metadata files
            log.info("starting local build_training_request")
            ml_req = build_training_request(
                csv_file=csv_file,
                meta_file=meta_file,
                predict_feature=ml_job_obj.predict_feature,
                test_size=test_size)

            if ml_req["status"] != VALID:
                last_step = ("Stopping for status={} "
                             "errors: {}").format(ml_req["status"],
                                                  ml_req["err"])
                log.error(last_step)
                ml_job_obj.status = "error"
                ml_job_obj.control_state = "error"
                log.info(("saving job={}").format(ml_job_id))
                ml_job_obj.save()
                data["job"] = ml_job_obj.get_public()
                error_data = {"status": ml_req["status"],
                              "err": ml_req["err"]}
                data["results"] = error_data
                res["status"] = ERR
                res["error"] = last_step
                res["data"] = data
                return res
            else:
                # fold the built request back into the manifest
                predict_manifest["ignore_features"] = \
                    ml_req.get("ignore_features", [])
                predict_manifest["features_to_process"] = \
                    ml_req.get("features_to_process", [])
                if label_rules:
                    predict_manifest["label_rules"] = \
                        label_rules
                else:
                    predict_manifest["label_rules"] = \
                        ml_req["meta_data"]["label_rules"]
                predict_manifest["post_proc_rules"] = \
                    ml_req["meta_data"]["post_proc_rules"]
                predict_manifest["version"] = version
                last_step = ("job.id={} built_training_request={} "
                             "predict={} features={} ignore={} "
                             "label_rules={} post_proc={}").format(
                                 ml_job_obj.id,
                                 ml_req["status"],
                                 predict_manifest["predict_feature"],
                                 predict_manifest["features_to_process"],
                                 predict_manifest["ignore_features"],
                                 predict_manifest["label_rules"],
                                 predict_manifest["post_proc_rules"])
                log.info(last_step)

                # regression jobs override the default loss/metrics
                if ml_job_obj.ml_type == "regression":
                    log.info(("using Keras - regression - "
                              "sequential model ml_type={}").format(
                                  ml_job_obj.ml_type))
                    loss = "mse"
                    metrics = ["mse", "mae", "mape", "cosine"]
                    histories = [
                        "mean_squared_error",
                        "mean_absolute_error",
                        "mean_absolute_percentage_error",
                        "cosine_proximity"
                    ]
                else:
                    log.info(("using Keras - sequential model "
                              "ml_type={}").format(ml_job_obj.ml_type))
                # end of classification vs regression

                ml_job_obj.predict_manifest["epochs"] = epochs
                ml_job_obj.predict_manifest["batch_size"] = batch_size
                ml_job_obj.predict_manifest["verbose"] = verbose
                ml_job_obj.predict_manifest["loss"] = loss
                ml_job_obj.predict_manifest["metrics"] = metrics
                ml_job_obj.predict_manifest["optimizer"] = optimizer
                ml_job_obj.predict_manifest["histories"] = histories
                ml_job_obj.predict_manifest = predict_manifest
            # end of updating without antinex-utils
        # end of if needs_local_builder:

        ml_job_obj.status = "started"
        ml_job_obj.save()

        scores = None

        # the request handed to make_predictions / the core worker
        prediction_req = {
            "label": "job_{}_result_{}".format(ml_job_id,
                                               ml_result_id),
            "manifest": ml_job_obj.predict_manifest,
            "model_json": ml_result_obj.model_json,
            "model_desc": model_desc,
            "weights_json": ml_result_obj.model_weights,
        }
        # only attach optional keys that are actually set
        if dataset:
            prediction_req["dataset"] = dataset
        if max_records:
            prediction_req["max_records"] = max_records
        if predict_rows:
            prediction_req["predict_rows"] = json.dumps(predict_rows)
        if features_to_process:
            prediction_req["features_to_process"] = features_to_process
        if ignore_features:
            prediction_req["ignore_features"] = ignore_features
        if apply_scaler:
            prediction_req["apply_scaler"] = apply_scaler
        if sort_values:
            prediction_req["sort_values"] = sort_values
        if loss:
            prediction_req["loss"] = loss
        if metrics:
            prediction_req["metrics"] = metrics
        if optimizer:
            prediction_req["optimizer"] = optimizer
        if histories:
            prediction_req["histories"] = histories
        if predict_feature:
            prediction_req["predict_feature"] = predict_feature
        if csv_file:
            prediction_req["csv_file"] = csv_file
        if meta_file:
            prediction_req["meta_file"] = meta_file

        already_predicted = False

        # if you just want to use the core without django training:
        if publish_to_core or settings.ANTINEX_WORKER_ONLY:
            log.info(("model_name={} only publish={} worker={}").format(
                use_model_name,
                publish_to_core,
                settings.ANTINEX_WORKER_ONLY))
            ml_job_obj.status = "launched"
            ml_job_obj.control_state = "launched"
            ml_job_obj.save()
            ml_result_obj.status = "launched"
            ml_result_obj.control_state = "launched"
            ml_result_obj.save()
        else:
            # run training + prediction in-process
            log.info(
                ("start make_predictions req={}").format(
                    ppj(prediction_req)))
            prediction_res = make_predictions(req=prediction_req)
            if prediction_res["status"] != SUCCESS:
                last_step = ("Stopping for prediction_status={} "
                             "errors: {}").format(
                                 prediction_res["status"],
                                 prediction_res["err"])
                log.error(last_step)
                ml_job_obj.status = "error"
                ml_job_obj.control_state = "error"
                log.info(("saving job={}").format(ml_job_id))
                ml_job_obj.save()
                data["job"] = ml_job_obj.get_public()
                error_data = {
                    "status": prediction_res["status"],
                    "err": prediction_res["err"]
                }
                data["results"] = error_data
                res["status"] = ERR
                res["error"] = last_step
                res["data"] = data
                return res
            already_predicted = True
            res_data = prediction_res["data"]
            model = res_data["model"]
            model_weights = res_data["weights"]
            scores = res_data["scores"]
            acc_data = res_data["acc"]
            # NOTE(review): error_data is read here then overwritten
            # with None below before it is persisted
            error_data = res_data["err"]
            predictions_json = {
                "predictions": json.loads(
                    pd.Series(res_data["sample_predictions"]).to_json(
                        orient="records"))
            }
            found_predictions = res_data["sample_predictions"]
            found_accuracy = acc_data.get("accuracy", None)

            last_step = ("job={} accuracy={}").format(
                ml_job_id,
                scores[1] * 100)
            log.info(last_step)

            ml_job_obj.status = "finished"
            ml_job_obj.control_state = "finished"
            ml_job_obj.save()
            log.info(("saved job={}").format(ml_job_id))

            data["job"] = ml_job_obj.get_public()
            acc_data = {"accuracy": scores[1] * 100}
            error_data = None

            log.info(("converting job={} model to json").format(
                ml_job_id))
            model_json = json.loads(model.to_json())

            log.info(("saving job={} weights_file={}").format(
                ml_job_id,
                ml_result_obj.model_weights_file))

            log.info(("building job={} results").format(ml_job_id))

            ml_result_obj.status = "finished"
            ml_result_obj.acc_data = acc_data
            ml_result_obj.error_data = error_data
            ml_result_obj.model_json = model_json
            ml_result_obj.model_weights = model_weights
            ml_result_obj.acc_image_file = image_file
            ml_result_obj.predictions_json = predictions_json
            ml_result_obj.version = version
        # end of handing off to core worker without a database connection

        log.info(("saving job={} results").format(ml_job_id))

        # OpenShift 9.6 Postgres container killed the worker here.
        # Interested to see if this is a jsonb/jsonfield problem
        # 2018-05-20
        try:
            ml_result_obj.save()
        except Exception as e:
            res["error"] = ("Failed saving model "
                            "job.id={} with ex={}").format(
                                ml_job_id,
                                e)
            res["status"] = ERR
            res["data"] = data
            log.error(res["error"])
            return res
        # end try/ex

        log.info(("done saving job={} results").format(ml_job_id))

        data["job"] = ml_job_obj.get_public()
        data["results"] = ml_result_obj.get_public()
        res["status"] = SUCCESS
        res["error"] = ""
        res["data"] = data

        # forward the request to the core worker unless this process
        # already made the predictions itself
        if settings.ANTINEX_WORKER_ENABLED and not already_predicted:
            if use_model_name:
                prediction_req["label"] = use_model_name
            log.info(("publishing to core use_model_name={} "
                      "worker={} already_predicted={}").format(
                          use_model_name,
                          settings.ANTINEX_WORKER_ENABLED,
                          already_predicted))
            publish_req = {"body": prediction_req}
            if settings.CELERY_ENABLED:
                task_publish_to_core.delay(publish_node=publish_req)
            else:
                task_publish_to_core(publish_node=publish_req)
        else:
            log.info(("skip - worker={} already_predicted={}").format(
                settings.ANTINEX_WORKER_ENABLED,
                already_predicted))
        # send to core
    except Exception as e:
        res["status"] = ERR
        res["err"] = ("Failed task={} with "
                      "ex={}").format(req_node["task_name"],
                                      e)
        if ml_job_obj:
            data["job"] = ml_job_obj.get_public()
        else:
            data["job"] = None
        if ml_result_obj:
            data["results"] = ml_result_obj.get_public()
        else:
            data["results"] = None
        log.error(res["err"])
    # end of try/ex

    log.info(
        ("task - {} - done - "
         "ml_job.id={} ml_result.id={} "
         "accuracy={} predictions={}").format(req_node["task_name"],
                                              ml_job_id,
                                              ml_result_id,
                                              found_accuracy,
                                              len(found_predictions)))

    return res
def build_csv(
        pipeline_files=None,
        fulldata_file=None,
        clean_file=None,
        post_proc_rules=None,
        label_rules=None,
        use_log_id=None,
        meta_suffix="metadata.json"):
    """Merge pipeline csv files into one training-ready dataset.

    Reads every csv in ``pipeline_files``, merges the rows under a
    unified header set, optionally applies labeling rules, then writes:
    a merged ``fulldata_file`` csv, a cleaned ``clean_file`` csv, and a
    metadata json file next to each of them.

    :param pipeline_files: list of csv files to process
    :param fulldata_file: output path for the non-edited merged data
    :param clean_file: output path for the cleaned csv
        (should be ready for training)
    :param post_proc_rules: apply these rules to post processing (clean);
        keys used here: ``mark_empty``, ``predict_feature``,
        ``drop_columns``, ``keep_nans``
    :param label_rules: apply labeling rules (classification only);
        keys: ``set_if_above``, ``labels``, ``label_values``
    :param use_log_id: label for tracking the job in the logs
    :param meta_suffix: file suffix for the metadata files
    :return: dict describing the generated files, headers and features;
        ``status`` is ``VALID`` on success, ``INVALID`` on bad input
    """
    # FIX: was pipeline_files=[] - a mutable default argument that is
    # shared across calls; normalize None to a fresh list instead
    if pipeline_files is None:
        pipeline_files = []

    save_node = {
        "status": INVALID,
        "pipeline_files": pipeline_files,
        "post_proc_rules": post_proc_rules,
        "label_rules": label_rules,
        "fulldata_file": fulldata_file,
        "fulldata_metadata_file": None,
        "clean_file": clean_file,
        "clean_metadata_file": None,
        "features_to_process": [],
        "feature_to_predict": None,
        "ignore_features": [],
        "full_headers": [],
        "clean_headers": [],
        "df_json": {},
        "version": 1
    }

    log_id = ""
    if use_log_id:
        log_id = use_log_id

    # both output paths are required
    if not fulldata_file:
        log.error("missing fulldata_file - stopping")
        save_node["status"] = INVALID
        return save_node
    if not clean_file:
        log.error("missing clean_file - stopping")
        save_node["status"] = INVALID
        return save_node

    # metadata json files live in the same directory as their csvs
    fulldata_metadata_file = "{}/fulldata_{}".format(
        "/".join(fulldata_file.split("/")[:-1]),
        meta_suffix)
    clean_metadata_file = "{}/cleaned_{}".format(
        "/".join(clean_file.split("/")[:-1]),
        meta_suffix)

    log.info(("{} build_csv - START").format(log_id))

    common_headers, \
        headers_dict = find_all_headers(
            use_log_id=log_id,
            pipeline_files=pipeline_files)

    log.info(("{} num common_headers={} headers={}").format(
        log_id,
        len(common_headers),
        common_headers))

    # since the headers can be different we rebuild a new one:
    mark_default_value = None
    # FIX: guard post_proc_rules - it defaults to None and
    # '"mark_empty" in None' raises TypeError
    if post_proc_rules and "mark_empty" in post_proc_rules:
        mark_default_value = post_proc_rules["mark_empty"]
        log.info(("{} using mark_empty={}").format(
            log_id,
            mark_default_value))

    # template row holding every known header with the default value
    hdrs = {}
    for h in common_headers:
        hdrs[h] = mark_default_value

    features_to_process = []
    feature_to_predict = None
    ignore_features = []

    set_if_above = None
    labels = []
    label_values = []
    if label_rules:
        set_if_above = label_rules["set_if_above"]
        labels = label_rules["labels"]
        label_values = label_rules["label_values"]

    if post_proc_rules:
        if "predict_feature" in post_proc_rules:
            feature_to_predict = post_proc_rules["predict_feature"]
    if not feature_to_predict:
        # fall back to the label column when one exists
        if "label_name" in hdrs:
            feature_to_predict = "label_name"

    all_rows = []
    num_done = 1
    total_files = len(pipeline_files)
    for c in pipeline_files:
        log.info(("{} merging={}/{} csv={}").format(
            log_id,
            num_done,
            total_files,
            c))
        cf = pd.read_csv(c)
        if mark_default_value:
            log.info(("{} filling nan with value={}").format(
                log_id,
                mark_default_value))
            cf.fillna(
                value=mark_default_value,
                inplace=True)
        # end of making sure fillna is done if requested
        log.info(("{} processing rows={}").format(
            log_id,
            len(cf.index)))
        for index, row in cf.iterrows():
            # placeholder hook for future per-row filtering
            valid_row = True
            new_row = copy.deepcopy(hdrs)
            new_row["src_file"] = c
            for k in hdrs:
                if k in row:
                    new_row[k] = row[k]
                else:
                    if mark_default_value:
                        new_row[k] = mark_default_value
            # end of for all headers to copy in

            if label_rules:
                test_rand = random.randint(0, 100)
                if test_rand > set_if_above:
                    new_row["label_value"] = label_values[1]
                    new_row["label_name"] = labels[1]
                    # if you make the "set above" greater than 100
                    # it will tag the entire dataset with just 1 label
                    # nice if your data is the same
                else:
                    new_row["label_value"] = label_values[0]
                    new_row["label_name"] = labels[0]
            # end of applying label rules

            if valid_row:
                all_rows.append(new_row)
        # end of for all rows in this file
        num_done += 1
    # end of building all files into one list

    log.info(("{} fulldata rows={} generating df").format(
        log_id,
        len(all_rows)))
    df = pd.DataFrame(all_rows)
    log.info(("{} df rows={} headers={}").format(
        log_id,
        len(df.index),
        df.columns.values))

    # optional debug export of the merged frame
    if ev("CONVERT_DF", "0") == "1":
        log.info(("{} converting df to json").format(log_id))
        save_node["df_json"] = df.to_json()

    # NOTE(review): always true here - clean_file was validated above;
    # kept for structural fidelity with the original flow
    if clean_file:
        log.info(("{} writing fulldata_file={}").format(
            log_id,
            fulldata_file))
        df.to_csv(
            fulldata_file,
            sep=',',
            encoding='utf-8',
            index=False)
        log.info(("{} done writing fulldata_file={}").format(
            log_id,
            fulldata_file))

        if post_proc_rules:
            features_to_process = []
            ignore_features = []
            if label_rules:
                if feature_to_predict:
                    ignore_features = [feature_to_predict]
                if "drop_columns" in post_proc_rules:
                    for p in post_proc_rules["drop_columns"]:
                        if p in headers_dict:
                            ignore_features.append(p)
                # post proc filter more features out
                # for non-int/float types
                for d in df.columns.values:
                    add_this_one = True
                    for i in ignore_features:
                        if d == i:
                            add_this_one = False
                            break
                    if add_this_one:
                        features_to_process.append(d)
                # for all df columns we're not ignoring...
                # add them as features to process

                log.info(("{} writing fulldata metadata file={}").format(
                    log_id,
                    fulldata_metadata_file))
                header_data = {
                    "headers": list(df.columns.values),
                    "output_type": "fulldata",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": features_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": ignore_features,
                    "created": rnow()
                }
                save_node["full_headers"] = list(df.columns.values)
                with open(fulldata_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

                # FIX: copy instead of alias - appending the predict
                # feature below must not leak it back into
                # features_to_process (the training feature list)
                keep_these = list(features_to_process)
                if feature_to_predict:
                    keep_these.append(feature_to_predict)
                log.info(("{} creating new clean_file={} "
                          "keep_these={} "
                          "predict={}").format(
                              log_id,
                              clean_file,
                              keep_these,
                              feature_to_predict))
                # need to remove all columns that are all nan
                clean_df = None
                if "keep_nans" not in post_proc_rules:
                    clean_df = df[keep_these].dropna(
                        axis=1,
                        how='all').dropna()
                else:
                    clean_df = df[keep_these].dropna(
                        axis=1,
                        how='all')
                # allow keeping empty columns

                log.info(("{} clean_df colums={} rows={}").format(
                    log_id,
                    clean_df.columns.values,
                    len(clean_df.index)))
                if len(clean_df.columns.values) == 0:
                    log.error("Postproc clean df has no columns")
                if len(clean_df.index) == 0:
                    log.error("Postproc clean df has no rows")

                # split the surviving columns into features vs ignored
                cleaned_features = clean_df.columns.values
                cleaned_to_process = []
                cleaned_ignore_features = []
                for c in cleaned_features:
                    if feature_to_predict:
                        if c == feature_to_predict:
                            cleaned_ignore_features.append(c)
                        else:
                            keep_it = True
                            for ign in ignore_features:
                                if c == ign:
                                    cleaned_ignore_features.append(c)
                                    keep_it = False
                                    break
                            # end of for all features to remove
                            if keep_it:
                                cleaned_to_process.append(c)
                # end of new feature columns

                log.info(("{} writing DROPPED clean_file={} "
                          "features_to_process={} "
                          "ignore_features={} "
                          "predict={}").format(
                              log_id,
                              clean_file,
                              cleaned_to_process,
                              cleaned_ignore_features,
                              feature_to_predict))
                write_clean_df = clean_df.drop(
                    columns=cleaned_ignore_features)
                log.info(("cleaned_df rows={}").format(
                    len(write_clean_df.index)))
                write_clean_df.to_csv(
                    clean_file,
                    sep=',',
                    encoding='utf-8',
                    index=False)

                # NOTE(review): redundant recompute - same value as the
                # clean_metadata_file derived at the top of the function
                clean_metadata_file = "{}/cleaned_{}".format(
                    "/".join(clean_file.split("/")[:-1]),
                    meta_suffix)
                log.info(("{} writing clean metadata file={}").format(
                    log_id,
                    clean_metadata_file))
                header_data = {
                    "headers": list(write_clean_df.columns.values),
                    "output_type": "clean",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": cleaned_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": cleaned_ignore_features,
                    "created": rnow()
                }
                save_node["clean_headers"] = list(
                    write_clean_df.columns.values)
                with open(clean_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))
            else:
                # same flow without labeling rules
                for d in df.columns.values:
                    add_this_one = True
                    for i in ignore_features:
                        if d == i:
                            add_this_one = False
                            break
                    if add_this_one:
                        features_to_process.append(d)
                # for all df columns we're not ignoring...
                # add them as features to process

                log.info(("{} writing fulldata metadata file={}").format(
                    log_id,
                    fulldata_metadata_file))
                header_data = {
                    "headers": list(df.columns.values),
                    "output_type": "fulldata",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": features_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": ignore_features,
                    "created": rnow()
                }
                save_node["full_headers"] = list(df.columns.values)
                with open(fulldata_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

                # FIX: copy instead of alias (see label_rules branch)
                keep_these = list(features_to_process)
                if feature_to_predict:
                    keep_these.append(feature_to_predict)
                log.info(("{} creating new clean_file={} "
                          "keep_these={} "
                          "predict={}").format(
                              log_id,
                              clean_file,
                              keep_these,
                              feature_to_predict))
                # need to remove all columns that are all nan
                clean_df = None
                if "keep_nans" not in post_proc_rules:
                    clean_df = df[keep_these].dropna(
                        axis=1,
                        how='all').dropna()
                else:
                    clean_df = df[keep_these].dropna(
                        axis=1,
                        how='all')
                # allow keeping empty columns

                log.info(("{} clean_df colums={} rows={}").format(
                    log_id,
                    clean_df.columns.values,
                    len(clean_df.index)))
                if len(clean_df.columns.values) == 0:
                    log.error(
                        ("{} The clean df has no columns").format(log_id))
                if len(clean_df.index) == 0:
                    log.error(
                        ("{} The clean df has no rows").format(log_id))

                # split the surviving columns into features vs ignored
                cleaned_features = clean_df.columns.values
                cleaned_to_process = []
                cleaned_ignore_features = []
                for c in cleaned_features:
                    if feature_to_predict:
                        if c == feature_to_predict:
                            cleaned_ignore_features.append(c)
                        else:
                            keep_it = True
                            for ign in ignore_features:
                                if c == ign:
                                    cleaned_ignore_features.append(c)
                                    keep_it = False
                                    break
                            # end of for all features to remove
                            if keep_it:
                                cleaned_to_process.append(c)
                # end of new feature columns

                log.info(("{} writing DROPPED clean_file={} "
                          "features_to_process={} "
                          "ignore_features={} "
                          "predict={}").format(
                              log_id,
                              clean_file,
                              cleaned_to_process,
                              cleaned_ignore_features,
                              feature_to_predict))
                write_clean_df = clean_df.drop(
                    columns=cleaned_ignore_features)
                log.info(("{} cleaned_df rows={}").format(
                    log_id,
                    len(write_clean_df.index)))
                write_clean_df.to_csv(
                    clean_file,
                    sep=',',
                    encoding='utf-8',
                    index=False)

                log.info(("{} writing clean metadata file={}").format(
                    log_id,
                    clean_metadata_file))
                header_data = {
                    "headers": list(write_clean_df.columns.values),
                    "output_type": "clean",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": cleaned_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": cleaned_ignore_features,
                    "created": rnow()
                }
                save_node["clean_headers"] = list(
                    write_clean_df.columns.values)
                with open(clean_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))
            # end of if/else

            save_node["clean_file"] = clean_file
            save_node["clean_metadata_file"] = clean_metadata_file
            log.info(("{} done writing clean_file={}").format(
                log_id,
                clean_file))
        # end of post_proc_rules

        save_node["fulldata_file"] = fulldata_file
        save_node["fulldata_metadata_file"] = fulldata_metadata_file
        save_node["status"] = VALID
    # end of writing the file

    save_node["features_to_process"] = features_to_process
    save_node["feature_to_predict"] = feature_to_predict
    save_node["ignore_features"] = ignore_features

    log.info(("{} build_csv - END").format(log_id))
    return save_node
from antinex_utils.utils import ppj

# Standalone debug publisher: sends one publish-to-core request onto
# the REST API's celery broker without going through the web app.
name = 'send-worker-publish-to-core'
log = build_colorized_logger(name=name)

log.info("creating celery app")
app = Celery("test-decoupled-app")
# broker comes from the environment with a local-redis fallback
broker_settings = {
    "broker_url": os.getenv(
        "ANTINEX_REST_API_BROKER_URL",
        "redis://localhost:6379/9")
}
app.conf.update(**broker_settings)

# canned request payload checked into the webapp test fixtures
datafile = "../webapp/drf_network_pipeline/tests/pubsub/publish-to-core.json"
data = {}
with open(datafile, "r") as f:
    data = json.loads(f.read())

# Celery task routing and queue
parent_route = "drf_network_pipeline.pipeline.tasks"
task_name = ("{}.task_publish_to_core").format(parent_route)
queue_name = ("{}.task_publish_to_core").format(parent_route)
log.info(("sending args={} to broker={} task={}").format(
    ppj(data),
    # FIX: read the new-style lowercase setting that was set above;
    # app.conf["BROKER_URL"] relies on Celery's deprecated old-style
    # uppercase alias
    app.conf.broker_url,
    task_name))
app.send_task(
    task_name,
    args=[data],
    queue=queue_name)