Example #1
def start_antinex_core_worker():
    """start_antinex_core_worker


    This is the main function handler for starting the AntiNex Workers.

    Set these environment variables as needed:

    ``BROKER_URL=redis://localhost:6379/6`` - Celery will connect
    to this broker

    ``TRAIN_QUEUE=webapp.train.requests`` - consume ``Train a New DNN``
    requests from this queue in the broker

    ``PREDICT_QUEUE=webapp.predict.requests`` - consume ``Prediction``
    requests from this queue in the broker

    ``MAX_MSGS=100`` - how many historical messages are saved for replay

    ``MAX_MODELS=100`` - max pre-trained DNNs kept per worker
    """
    broker_url = ev("BROKER_URL", "redis://localhost:6379/6")
    train_queue_name = ev("TRAIN_QUEUE", "webapp.train.requests")
    predict_queue_name = ev("PREDICT_QUEUE", "webapp.predict.requests")
    max_msgs = int(ev("MAX_MSGS", "100"))
    max_models = int(ev("MAX_MODELS", "100"))

    log.info("Creating antinex core")
    core = AntiNexCore(name="core",
                       broker_url=broker_url,
                       train_queue_name=train_queue_name,
                       predict_queue_name=predict_queue_name,
                       max_msgs=max_msgs,
                       max_models=max_models)
    try:
        log.info("Starting antinex core")
        core.start(app=app)
    except Exception as e:
        log.info(("Core hit exception={} shutting down").format(e))
        core.shutdown()
        log.info(("canceling consumer to queue={}").format(train_queue_name))
        app.control.cancel_consumer(train_queue_name)
        log.info(("canceling consumer to queue={}").format(predict_queue_name))
        app.control.cancel_consumer(predict_queue_name)
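
# Hedged usage sketch (added for illustration, not part of the original
# example): it assumes this module exposes start_antinex_core_worker and
# that ev() reads the environment variables documented in the docstring.
import os

os.environ.setdefault("BROKER_URL", "redis://localhost:6379/6")
os.environ.setdefault("TRAIN_QUEUE", "webapp.train.requests")
os.environ.setdefault("PREDICT_QUEUE", "webapp.predict.requests")
os.environ.setdefault("MAX_MSGS", "100")
os.environ.setdefault("MAX_MODELS", "100")

start_antinex_core_worker()
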
Example #2
    def __init__(self,
                 name="",
                 broker_url=ev("BROKER_URL", "redis://localhost:6379/6"),
                 train_queue_name=ev("TRAIN_QUEUE", "webapp.train.requests"),
                 predict_queue_name=ev("PREDICT_QUEUE",
                                       "webapp.predict.requests"),
                 max_msgs=100,
                 max_models=100):
        """__init__

        :param name: worker name
        :param broker_url: connection string to broker
        :param train_queue_name: queue name for training requests
        :param predict_queue_name: queue name for predict requests
        :param max_msgs: num msgs to save for replay debugging (FIFO)
        :param max_models: num pre-trained models to keep in memory (FIFO)
        """

        self.name = name
        log.info(("{} - INIT").format(self.name))

        self.state = "INIT"
        self.broker_url = broker_url

        # Setup queues:
        self.train_queue_name = train_queue_name
        self.predict_queue_name = predict_queue_name

        self.queues = [self.train_queue_name, self.predict_queue_name]

        # Subscribers
        self.all_queues_sub = None

        # SSL Celery options dict
        self.ssl_options = {}

        # http://docs.celeryproject.org/en/latest/userguide/calling.html#calling-retry  # noqa
        # allow publishes to retry for a time
        self.task_publish_retry_policy = {
            "interval_max": 1,
            "max_retries": 120,  # None - forever
            "interval_start": 0.1,
            "interval_step": 0.2
        }

        # Confirm publishes with Celery
        # https://github.com/celery/kombu/issues/572
        self.transport_options = {"confirm_publish": True}

        self.conn_attrs = {
            "task_default_queue": "antinex.worker.control",
            "task_default_exchange": "antinex.worker.control",
            # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_hijack_root_logger
            "worker_hijack_root_logger": False,
            # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_prefetch_multiplier
            "worker_prefetch_multiplier": 1,  # consume 1 message at a time
            # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-worker_prefetch_multiplier
            # consume 1 message at a time per worker (3 workers)
            "prefetch_count": 3,
            # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_heartbeat
            "broker_heartbeat": 240,  # in seconds
            # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_connection_max_retries
            "broker_connection_max_retries": None,  # None is forever
            # noqa http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-task_acks_late
            "task_acks_late":
            True,  # noqa on consume do not send an immediate ack back
            "task_publish_retry_policy": self.task_publish_retry_policy
        }

        self.processor = AntiNexProcessor(name="{}.prc".format(self.name),
                                          max_msgs=max_msgs,
                                          max_models=max_models)
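
# Hedged construction sketch (added, not part of the original example): the
# keyword arguments mirror the defaults documented in __init__ above, the
# instance name "example-core" is a placeholder, and it assumes AntiNexCore
# is importable from the antinex-core package.
example_core = AntiNexCore(
    name="example-core",
    broker_url="redis://localhost:6379/6",
    train_queue_name="webapp.train.requests",
    predict_queue_name="webapp.predict.requests",
    max_msgs=100,    # number of messages kept for replay debugging (FIFO)
    max_models=100)  # number of pre-trained models kept in memory (FIFO)
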
def task_ml_prepare(self=None, req_node=None):
    """task_ml_prepare

    :param self: parent task object for bind=True
    :param req_node: request dictionary describing the job
    """

    log.info(("task - {} - start "
              "req_node={}").format(req_node["task_name"], ppj(req_node)))

    ml_prepare_data = req_node["data"].get("ml_prepare_data", None)

    user_obj = None
    ml_prepare_obj = None
    if req_node["use_cache"]:
        ml_prepare_obj = MLPrepare.objects.select_related().filter(
            Q(id=int(ml_prepare_data["id"]))).cache().first()
    else:
        ml_prepare_obj = MLPrepare.objects.select_related().filter(
            Q(id=int(ml_prepare_data["id"]))).first()
    # end of finding the MLPrepare record

    create_new_record = False

    # create the response node from request
    res = build_task_response(use_cache=req_node["use_cache"],
                              celery_enabled=req_node["celery_enabled"],
                              cache_key=req_node["cache_key"])

    try:

        if create_new_record:
            create_res = create_ml_prepare_record(req_node=req_node)
            user_obj = create_res.get("user_obj", None)
            ml_prepare_obj = create_res.get("ml_prepare_obj", None)
            if not user_obj:
                res["error"] = create_res.get(
                    "err",
                    ("{} - Failed to find User").format(
                        req_node["task_name"]))
                res["status"] = ERR
                res["data"] = None
                log.error(res["error"])
                return res
            if not ml_prepare_obj:
                res["error"] = create_res.get(
                    "err",
                    ("{} - Failed to create MLPrepare").format(
                        req_node["task_name"]))
                res["status"] = ERR
                res["data"] = None
                log.error(res["error"])
                return res
        # end of create_new_record

        last_step = ("starting user={} prepare={} "
                     "pipeline={} clean={} full={} "
                     "post={} label={} tracking={}").format(
                         ml_prepare_obj.user_id, ml_prepare_obj.id,
                         ml_prepare_obj.pipeline_files,
                         ml_prepare_obj.clean_file, ml_prepare_obj.full_file,
                         ml_prepare_obj.post_proc, ml_prepare_obj.label_rules,
                         ml_prepare_obj.tracking_id)
        log.info(last_step)

        log_id = "job={}".format(ml_prepare_obj.id)

        log.info(("prepare={} csvs={}").format(ml_prepare_obj.id,
                                               ml_prepare_obj.ds_glob_path))

        ml_prepare_obj.pipeline_files = find_all_pipeline_csvs(
            use_log_id=log_id, csv_glob_path=ml_prepare_obj.ds_glob_path)

        log.info(
            ("preparing={} clean={} full={} "
             "meta_suffix={} files={}").format(ml_prepare_obj.id,
                                               ml_prepare_obj.clean_file,
                                               ml_prepare_obj.full_file,
                                               ml_prepare_obj.meta_suffix,
                                               ml_prepare_obj.pipeline_files))

        save_node = build_csv(use_log_id=log_id,
                              pipeline_files=ml_prepare_obj.pipeline_files,
                              fulldata_file=ml_prepare_obj.full_file,
                              clean_file=ml_prepare_obj.clean_file,
                              post_proc_rules=ml_prepare_obj.post_proc,
                              label_rules=ml_prepare_obj.label_rules,
                              meta_suffix=ml_prepare_obj.meta_suffix)

        if save_node["status"] == VALID:

            log.info("successfully processed datasets:")

            ml_prepare_obj.post_proc = save_node["post_proc_rules"]
            ml_prepare_obj.post_proc["features_to_process"] = \
                save_node["features_to_process"]
            ml_prepare_obj.post_proc["ignore_features"] = \
                save_node["ignore_features"]
            ml_prepare_obj.post_proc["feature_to_predict"] = \
                save_node["feature_to_predict"]
            ml_prepare_obj.label_rules = save_node["label_rules"]
            ml_prepare_obj.pipeline_files = save_node["pipeline_files"]
            ml_prepare_obj.full_file = save_node["fulldata_file"]
            ml_prepare_obj.clean_file = save_node["clean_file"]
            ml_prepare_obj.status = "finished"
            ml_prepare_obj.control_state = "finished"
            ml_prepare_obj.save()
            log.info(("saved prepare={}").format(ml_prepare_obj.id))

            if ev("SHOW_SUMMARY", "0") == "1":
                log.info(("Full csv: {}").format(save_node["fulldata_file"]))
                log.info(("Full meta: {}").format(
                    save_node["fulldata_metadata_file"]))
                log.info(("Clean csv: {}").format(save_node["clean_file"]))
                log.info(("Clean meta: {}").format(
                    save_node["clean_metadata_file"]))
                log.info("------------------------------------------")
                log.info(("Predicting Feature: {}").format(
                    save_node["feature_to_predict"]))
                log.info(("Features to Process: {}").format(
                    ppj(save_node["features_to_process"])))
                log.info(("Ignored Features: {}").format(
                    ppj(save_node["ignore_features"])))
                log.info("------------------------------------------")
            # end of show summary

            log.info("Full: {}".format(save_node["fulldata_file"]))
            log.info("Cleaned (no-NaNs in columns): {}".format(
                save_node["clean_file"]))
            data = ml_prepare_obj.get_public()
            res["status"] = SUCCESS
            res["err"] = ""
            res["data"] = data
        else:
            last_step = ("failed to prepare csv status={} "
                         "errors: {}").format(save_node["status"],
                                              save_node["err"])
            log.error(last_step)
            ml_prepare_obj.status = "error"
            ml_prepare_obj.control_state = "error"
            ml_prepare_obj.save()
            data["prepare"] = ml_prepare_obj.get_public()
            data["ready"] = {}
            res["status"] = ERR
            res["error"] = last_step
            res["data"] = data
            return res
        # end of checking it started

    except Exception as e:
        res["status"] = ERR
        res["err"] = ("Failed task={} with "
                      "ex={}").format(req_node["task_name"], e)
        res["data"] = None
        log.error(res["err"])
    # end of try/ex

    log.info(("task - {} - done").format(req_node["task_name"]))

    return res
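
# Hedged request sketch (added, not part of the original example): the keys
# are the ones task_ml_prepare reads above; the MLPrepare id value 1 is a
# hypothetical placeholder for an existing record.
example_req_node = {
    "task_name": "task_ml_prepare",
    "use_cache": False,
    "celery_enabled": True,
    "cache_key": None,
    "data": {
        "ml_prepare_data": {
            "id": 1
        }
    }
}
# inside a Django/Celery context the task could then be invoked with:
# res = task_ml_prepare(req_node=example_req_node)
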
def build_training_request(
        csv_file=ev("CSV_FILE", "/tmp/cleaned_attack_scans.csv"),
        meta_file=ev("CSV_META_FILE", "/tmp/cleaned_metadata.json"),
        predict_feature=ev("PREDICT_FEATURE", "label_value"),
        ignore_features=[
            "label_name",
            "ip_src",  # need to make this an int
            "ip_dst",  # need to make this an int
            "eth_src",  # need to make this an int
            "eth_dst"  # need to make this an int
        ],
        seed=None,
        test_size=float(ev("TEST_SIZE", "0.20")),
        preproc_rules=None):
    """build_training_request

    :param csv_file: csv file built with prepare_dataset.py
    :param meta_file: metadata file built with prepare_dataset.py
    :param predict_feature: feature (column) to predict
    :param ignore_features: features to remove from the csv
                            before the split of test + train
                            data
    :param seed: integer to seed
    :param test_size: percent of records to split into test
                      vs train
    :param preproc_rules: future preprocessing rules hooks
    """

    last_step = "not started"
    res = {
        "status": INVALID,
        "err": "",
        "csv_file": csv_file,
        "meta_file": meta_file,
        "meta_data": None,
        "seed": None,
        "test_size": test_size,
        "predict_feature": predict_feature,
        "features_to_process": [],
        "ignore_features": ignore_features,
        "X_train": None,
        "X_test": None,
        "Y_train": None,
        "Y_test": None
    }

    try:

        last_step = ("building seed={}").format(seed)

        log.debug(last_step)

        use_seed = seed
        if not use_seed:
            use_seed = 9

        # np.random.seed returns None, so store the seed value itself
        # for the train_test_split random_state below
        np.random.seed(use_seed)
        res["seed"] = use_seed

        last_step = ("Loading csv={}").format(csv_file)

        log.info(last_step)

        if not os.path.exists(csv_file):
            res["status"] = ERROR
            res["err"] = ("Unable to find csv_file={}").format(csv_file)
            log.error(res["err"])
            return res
        # end of checking for a valid csv file on disk

        if not os.path.exists(meta_file):
            res["status"] = ERROR
            res["err"] = ("Unable to find meta_file={}").format(meta_file)
            log.error(res["err"])
            return res
        # end of checking for a valid metadata file on disk

        # load csv file into pandas dataframe
        df = pd.read_csv(csv_file)

        features_to_process = []
        meta_data = {}

        try:
            last_step = ("opening metadata={}").format(meta_file)
            log.debug(last_step)
            with open(meta_file, "r") as mfile:
                meta_data = json.loads(mfile.read())
            res["meta_data"] = meta_data
            if "post_proc_rules" in meta_data:
                if "drop_columns" in meta_data["post_proc_rules"]:
                    log.debug(("Found drop_columns={}").format(
                        meta_data["post_proc_rules"]["drop_columns"]))
                    for ign in meta_data["post_proc_rules"]["drop_columns"]:
                        ignore_features.append(ign)
        except Exception as e:
            res["error"] = ("Failed building ignore_features: "
                            "ignore_features={} meta={} meta_data={} "
                            "last_step='{}' ex='{}'").format(
                                ignore_features, meta_file, meta_data,
                                last_step, e)
            log.error(res["error"])
            res["status"] = ERROR
            return res
        # end of trying to lookup the meta data file
        # for non-int/float features to ignore

        last_step = ("metadata={} df has "
                     "columns={} ignore={}").format(meta_file,
                                                    df.columns.values,
                                                    ignore_features)

        log.info(last_step)

        for feature in df.columns.values:
            keep_it = True
            for ign in ignore_features:
                if feature == ign:
                    keep_it = False
            if keep_it:
                if feature != predict_feature:
                    features_to_process.append(feature)
        # end of for all features to process

        last_step = ("Done post-procecessing "
                     "Predicting={} with features={} "
                     "ignore_features={} records={}").format(
                         predict_feature, features_to_process, ignore_features,
                         len(df.index))

        log.info(last_step)

        res["predict_feature"] = predict_feature

        res["ignore_features"] = []
        for k in ignore_features:
            if k not in res["ignore_features"]:
                res["ignore_features"].append(k)
        res["features_to_process"] = []
        for k in features_to_process:
            if k not in res["features_to_process"]:
                if k != predict_feature:
                    res["features_to_process"].append(k)

        # split the data into training
        (res["X_train"], res["X_test"], res["Y_train"],
         res["Y_test"]) = train_test_split(df[features_to_process],
                                           df[predict_feature],
                                           test_size=test_size,
                                           random_state=res["seed"])

        last_step = ("Done splitting rows={} into "
                     "X_train={} X_test={} "
                     "Y_train={} Y_test={}").format(len(df.index),
                                                    len(res["X_train"]),
                                                    len(res["X_test"]),
                                                    len(res["Y_train"]),
                                                    len(res["Y_test"]))

        log.info(("Success: {}").format(last_step))

        res["err"] = ""
        res["status"] = VALID
    except Exception as e:
        res["status"] = ERROR
        res["err"] = ("Failed build_training_request "
                      "step='{}' with ex='{}'").format(last_step, e)
        log.error(("build_training_request: {}").format(res["err"]))
    # end of try/ex

    return res
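
# Hedged usage sketch (added, not part of the original example): it assumes
# the cleaned csv and metadata files produced by a prepare run exist at these
# paths and that VALID and log are the same module-level names used above.
train_req = build_training_request(
    csv_file="/tmp/cleaned_attack_scans.csv",
    meta_file="/tmp/cleaned_metadata.json",
    predict_feature="label_value",
    seed=9,
    test_size=0.20)
if train_req["status"] == VALID:
    log.info(("rows train={} test={}").format(
        len(train_req["X_train"]), len(train_req["X_test"])))
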
def build_csv(pipeline_files=[],
              fulldata_file=None,
              clean_file=None,
              post_proc_rules=None,
              label_rules=None,
              use_log_id=None,
              meta_suffix="metadata.json"):
    """build_csv

    :param pipeline_files: list of files to process
    :param fulldata_file: output of non-edited merged data
    :param clean_file: cleaned csv file should be ready for training
    :param post_proc_rules: apply these rules to post processing (clean)
    :param label_rules: apply labeling rules (classification only)
    :param use_log_id: label for tracking the job in the logs
    :param meta_suffix: file suffix
    """

    save_node = {
        "status": INVALID,
        "pipeline_files": pipeline_files,
        "post_proc_rules": post_proc_rules,
        "label_rules": label_rules,
        "fulldata_file": fulldata_file,
        "fulldata_metadata_file": None,
        "clean_file": clean_file,
        "clean_metadata_file": None,
        "features_to_process": [],
        "feature_to_predict": None,
        "ignore_features": [],
        "full_headers": [],
        "clean_headers": [],
        "df_json": {},
        "version": 1
    }

    log_id = ""
    if use_log_id:
        log_id = use_log_id

    if not fulldata_file:
        log.error("missing fulldata_file - stopping")
        save_node["status"] = INVALID
        return save_node
    if not clean_file:
        log.error("missing clean_file - stopping")
        save_node["status"] = INVALID
        return save_node

    fulldata_metadata_file = "{}/fulldata_{}".format(
        "/".join(fulldata_file.split("/")[:-1]), meta_suffix)

    clean_metadata_file = "{}/cleaned_{}".format(
        "/".join(clean_file.split("/")[:-1]), meta_suffix)

    log.info(("{} build_csv - START").format(log_id))

    common_headers, headers_dict = find_all_headers(
        use_log_id=log_id,
        pipeline_files=pipeline_files)

    log.info(
        ("{} num common_headers={} headers={}").format(log_id,
                                                       len(common_headers),
                                                       common_headers))

    # since the headers can be different we rebuild a new one:

    mark_default_value = None
    if post_proc_rules and "mark_empty" in post_proc_rules:
        mark_default_value = post_proc_rules["mark_empty"]
        log.info(("{} using mark_empty={}").format(log_id, mark_default_value))

    hdrs = {}
    for h in common_headers:
        hdrs[h] = mark_default_value

    features_to_process = []
    feature_to_predict = None
    ignore_features = []

    set_if_above = None
    labels = []
    label_values = []
    if label_rules:
        set_if_above = label_rules["set_if_above"]
        labels = label_rules["labels"]
        label_values = label_rules["label_values"]
    if post_proc_rules:
        if "predict_feature" in post_proc_rules:
            feature_to_predict = post_proc_rules["predict_feature"]
    if not feature_to_predict:
        if "label_name" in hdrs:
            feature_to_predict = "label_name"

    all_rows = []
    num_done = 1
    total_files = len(pipeline_files)
    for c in pipeline_files:
        log.info(("{} merging={}/{} csv={}").format(log_id, num_done,
                                                    total_files, c))
        cf = pd.read_csv(c)
        if mark_default_value:
            log.info(("{} filling nan with value={}").format(
                log_id, mark_default_value))
            cf.fillna(value=mark_default_value, inplace=True)
        # end of making sure fillna is done if requested

        log.info(("{} processing rows={}").format(log_id, len(cf.index)))
        for index, row in cf.iterrows():
            valid_row = True
            new_row = copy.deepcopy(hdrs)
            new_row["src_file"] = c
            for k in hdrs:
                if k in row:
                    new_row[k] = row[k]
                else:
                    if mark_default_value:
                        new_row[k] = mark_default_value
            # end of for all headers to copy in

            if label_rules:
                test_rand = random.randint(0, 100)
                if test_rand > set_if_above:
                    new_row["label_value"] = label_values[1]
                    new_row["label_name"] = labels[1]

                # if you make the "set above" greater than 100
                # it will tag the entire dataset with just 1 label
                # nice if your data is the same
                else:
                    new_row["label_value"] = label_values[0]
                    new_row["label_name"] = labels[0]
            # end of applying label rules

            if valid_row:
                all_rows.append(new_row)
        # end of for all rows in this file

        num_done += 1
    # end of building all files into one list

    log.info(
        ("{} fulldata rows={} generating df").format(log_id, len(all_rows)))

    df = pd.DataFrame(all_rows)
    log.info(("{} df rows={} headers={}").format(log_id, len(df.index),
                                                 df.columns.values))

    if ev("CONVERT_DF", "0") == "1":
        log.info(("{} converting df to json").format(log_id))
        save_node["df_json"] = df.to_json()

    if clean_file:
        log.info(("{} writing fulldata_file={}").format(log_id, fulldata_file))
        df.to_csv(fulldata_file, sep=',', encoding='utf-8', index=False)
        log.info(
            ("{} done writing fulldata_file={}").format(log_id, fulldata_file))

        if post_proc_rules:

            features_to_process = []
            ignore_features = []
            if label_rules:
                if feature_to_predict:
                    ignore_features = [feature_to_predict]

            if "drop_columns" in post_proc_rules:
                for p in post_proc_rules["drop_columns"]:
                    if p in headers_dict:
                        ignore_features.append(p)
                # post proce filter more features out
                # for non-int/float types

                for d in df.columns.values:
                    add_this_one = True
                    for i in ignore_features:
                        if d == i:
                            add_this_one = False
                            break
                    if add_this_one:
                        features_to_process.append(d)
                # for all df columns we're not ignoring...
                # add them as features to process

                log.info(("{} writing fulldata metadata file={}").format(
                    log_id, fulldata_metadata_file))
                header_data = {
                    "headers": list(df.columns.values),
                    "output_type": "fulldata",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": features_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": ignore_features,
                    "created": rnow()
                }

                save_node["full_headers"] = list(df.columns.values)
                with open(fulldata_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

                keep_these = list(features_to_process)
                if feature_to_predict:
                    keep_these.append(feature_to_predict)

                log.info(("{} creating new clean_file={} "
                          "keep_these={} "
                          "predict={}").format(log_id, clean_file, keep_these,
                                               feature_to_predict))

                # need to remove all columns that are all nan
                clean_df = None
                if "keep_nans" not in post_proc_rules:
                    clean_df = df[keep_these].dropna(axis=1,
                                                     how='all').dropna()
                else:
                    clean_df = df[keep_these].dropna(axis=1, how='all')
                # allow keeping empty columns

                log.info(("{} clean_df colums={} rows={}").format(
                    log_id, clean_df.columns.values, len(clean_df.index)))

                if len(clean_df.columns.values) == 0:
                    log.error("Postproc clean df has no columns")
                if len(clean_df.index) == 0:
                    log.error("Postproc clean df has no rows")

                cleaned_features = clean_df.columns.values
                cleaned_to_process = []
                cleaned_ignore_features = []
                for c in cleaned_features:
                    if feature_to_predict:
                        if c == feature_to_predict:
                            cleaned_ignore_features.append(c)
                    else:
                        keep_it = True
                        for ign in ignore_features:
                            if c == ign:
                                cleaned_ignore_features.append(c)
                                keep_it = False
                                break
                        # end of for all features to remove
                        if keep_it:
                            cleaned_to_process.append(c)
                # end of new feature columns

                log.info(("{} writing DROPPED clean_file={} "
                          "features_to_process={} "
                          "ignore_features={} "
                          "predict={}").format(log_id, clean_file,
                                               cleaned_to_process,
                                               cleaned_ignore_features,
                                               feature_to_predict))

                write_clean_df = clean_df.drop(columns=cleaned_ignore_features)
                log.info(
                    ("cleaned_df rows={}").format(len(write_clean_df.index)))
                write_clean_df.to_csv(clean_file,
                                      sep=',',
                                      encoding='utf-8',
                                      index=False)

                clean_metadata_file = "{}/cleaned_{}".format(
                    "/".join(clean_file.split("/")[:-1]), meta_suffix)
                log.info(("{} writing clean metadata file={}").format(
                    log_id, clean_metadata_file))
                header_data = {
                    "headers": list(write_clean_df.columns.values),
                    "output_type": "clean",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": cleaned_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": cleaned_ignore_features,
                    "created": rnow()
                }

                save_node["clean_headers"] = list(
                    write_clean_df.columns.values)
                with open(clean_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))
            else:

                for d in df.columns.values:
                    add_this_one = True
                    for i in ignore_features:
                        if d == i:
                            add_this_one = False
                            break
                    if add_this_one:
                        features_to_process.append(d)
                # for all df columns we're not ignoring...
                # add them as features to process

                log.info(("{} writing fulldata metadata file={}").format(
                    log_id, fulldata_metadata_file))
                header_data = {
                    "headers": list(df.columns.values),
                    "output_type": "fulldata",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": features_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": ignore_features,
                    "created": rnow()
                }

                save_node["full_headers"] = list(df.columns.values)
                with open(fulldata_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

                keep_these = list(features_to_process)
                if feature_to_predict:
                    keep_these.append(feature_to_predict)

                log.info(("{} creating new clean_file={} "
                          "keep_these={} "
                          "predict={}").format(log_id, clean_file, keep_these,
                                               feature_to_predict))

                # need to remove all columns that are all nan
                clean_df = None
                if "keep_nans" not in post_proc_rules:
                    clean_df = df[keep_these].dropna(axis=1,
                                                     how='all').dropna()
                else:
                    clean_df = df[keep_these].dropna(axis=1, how='all')
                # allow keeping empty columns

                log.info(("{} clean_df colums={} rows={}").format(
                    log_id, clean_df.columns.values, len(clean_df.index)))

                if len(clean_df.columns.values) == 0:
                    log.error(
                        ("{} The clean df has no columns").format(log_id))
                if len(clean_df.index) == 0:
                    log.error(("{} The clean df has no rows").format(log_id))

                cleaned_features = clean_df.columns.values
                cleaned_to_process = []
                cleaned_ignore_features = []
                for c in cleaned_features:
                    if feature_to_predict:
                        if c == feature_to_predict:
                            cleaned_ignore_features.append(c)
                    else:
                        keep_it = True
                        for ign in ignore_features:
                            if c == ign:
                                cleaned_ignore_features.append(c)
                                keep_it = False
                                break
                        # end of for all features to remove
                        if keep_it:
                            cleaned_to_process.append(c)
                # end of new feature columns

                log.info(("{} writing DROPPED clean_file={} "
                          "features_to_process={} "
                          "ignore_features={} "
                          "predict={}").format(log_id, clean_file,
                                               cleaned_to_process,
                                               cleaned_ignore_features,
                                               feature_to_predict))

                write_clean_df = clean_df.drop(columns=cleaned_ignore_features)
                log.info(("{} cleaned_df rows={}").format(
                    log_id, len(write_clean_df.index)))
                write_clean_df.to_csv(clean_file,
                                      sep=',',
                                      encoding='utf-8',
                                      index=False)

                log.info(("{} writing clean metadata file={}").format(
                    log_id, clean_metadata_file))
                header_data = {
                    "headers": list(write_clean_df.columns.values),
                    "output_type": "clean",
                    "pipeline_files": pipeline_files,
                    "post_proc_rules": post_proc_rules,
                    "label_rules": label_rules,
                    "features_to_process": cleaned_to_process,
                    "feature_to_predict": feature_to_predict,
                    "ignore_features": cleaned_ignore_features,
                    "created": rnow()
                }

                save_node["clean_headers"] = list(
                    write_clean_df.columns.values)
                with open(clean_metadata_file, "w") as otfile:
                    otfile.write(str(ppj(header_data)))

            # end of if/else

            save_node["clean_file"] = clean_file
            save_node["clean_metadata_file"] = clean_metadata_file

            log.info(
                ("{} done writing clean_file={}").format(log_id, clean_file))
        # end of post_proc_rules

        save_node["fulldata_file"] = fulldata_file
        save_node["fulldata_metadata_file"] = fulldata_metadata_file

        save_node["status"] = VALID
    # end of writing the file

    save_node["features_to_process"] = features_to_process
    save_node["feature_to_predict"] = feature_to_predict
    save_node["ignore_features"] = ignore_features

    log.info(("{} build_csv - END").format(log_id))

    return save_node
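
# Hedged usage sketch (added, not part of the original example): the
# label_rules and post_proc_rules keys are the ones build_csv reads above
# ("set_if_above", "labels", "label_values", "predict_feature",
# "drop_columns"); the csv paths are placeholders.
example_save_node = build_csv(
    pipeline_files=[
        "/tmp/packets_day_1.csv",
        "/tmp/packets_day_2.csv"
    ],
    fulldata_file="/tmp/fulldata_attack_scans.csv",
    clean_file="/tmp/cleaned_attack_scans.csv",
    post_proc_rules={
        "drop_columns": ["src_file"],
        "predict_feature": "label_name"
    },
    label_rules={
        "set_if_above": 85,
        "labels": ["not_attack", "attack"],
        "label_values": [0, 1]
    },
    use_log_id="example-job",
    meta_suffix="metadata.json")
if example_save_node["status"] == VALID:
    log.info(("clean csv ready: {}").format(
        example_save_node["clean_file"]))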