def output_generator():
    logging.info("Start output generator ...")
    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"],
        parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema, preprocessing_params["per_feature"])
    logging.info("Reading with dtypes: %s" % dtypes)

    for input_df in input_dataset.iter_dataframes_forced_types(
            names, dtypes, parse_date_columns, chunksize=100000):
        input_df.index = range(input_df.shape[0])
        input_df_orig = input_df.copy()
        if recipe_desc.get("filterInputColumns", False):
            input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]

        logging.info("Got a dataframe: %s" % str(input_df.shape))
        normalize_dataframe(input_df, preprocessing_params['per_feature'])
        for col in input_df:
            logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

        logging.info("Processing it")
        transformed = pipeline.process(input_df)

        logging.info("Applying it")
        (labels_arr, additional_columns) = clustering_predict(modeling_params, clf, transformed)
        cluster_labels = pd.Series(labels_arr, name="cluster_labels").map(naming)
        cluster_labels.index = transformed["TRAIN"].index

        final_df = pd.concat([
            input_df_orig.join(cluster_labels, how='left'),
            additional_columns
        ], axis=1)

        if preprocessing_params["outliers"]["method"] == "CLUSTER":
            outliers_cluster_name = cluster_name_map.get(
                constants.CLUSTER_OUTLIERS, constants.CLUSTER_OUTLIERS)
            final_df['cluster_labels'].fillna(outliers_cluster_name, inplace=True)

        logging.info("Done predicting it")
        yield final_df
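
# A minimal, self-contained sketch (illustrative data and names, not part of
# the recipe) of the labeling pattern used above: cluster labels are joined
# back onto the original rows by index, so rows the pipeline dropped (e.g.
# outliers) come back as NaN and can be relabeled with fillna.
def _example_label_join():
    import pandas as pd
    orig = pd.DataFrame({"x": [1.0, 2.0, 3.0]})  # three input rows
    # Labels only exist for rows 0 and 2: row 1 was dropped by preprocessing
    labels = pd.Series(["cluster_0", "cluster_1"],
                       name="cluster_labels", index=[0, 2])
    out = orig.join(labels, how='left')  # row 1 gets NaN
    out['cluster_labels'].fillna("outliers", inplace=True)  # relabel dropped rows
    return out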
def df_from_split_desc(split_desc, split, feature_params, prediction_type=None):
    df = df_from_split_desc_no_normalization(split_desc, split, feature_params,
                                             prediction_type)
    return utils.normalize_dataframe(df, feature_params)
def main(exec_folder, output_dataset, keptInputColumns):
    start = unix_time_millis()
    listener = ProgressListener()

    split_desc = json.load(open(osp.join(exec_folder, "_esplit.json")))
    preprocessing_params = json.load(open(osp.join(exec_folder, "rpreprocessing_params.json")))
    modeling_params = json.load(open(osp.join(exec_folder, "rmodeling_params.json")))

    with listener.push_state(constants.STATE_LOADING_SRC):
        input_df = df_from_split_desc_no_normalization(split_desc, "full",
                                                       preprocessing_params["per_feature"])
        logging.info("Loaded full df: shape=(%d,%d)" % input_df.shape)
        input_df_orig = input_df.copy()
        input_df = utils.normalize_dataframe(input_df, preprocessing_params["per_feature"])

    with listener.push_state("Collecting preprocessing data"):
        collector = ClusteringPreprocessingDataCollector(input_df, preprocessing_params)
        collector_data = collector.build()

    preproc_handler = ClusteringPreprocessingHandler({}, preprocessing_params, exec_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline()

    with listener.push_state("Preprocessing data"):
        transformed_train = pipeline.fit_and_process(input_df)

    start_train = unix_time_millis()
    (clf, actual_params, cluster_labels, additional_columns) = clustering_fit(modeling_params,
                                                                              transformed_train)

    # If the model has custom labels, use them
    try:
        cluster_names = clf.get_cluster_labels()
    except AttributeError:
        cluster_names = ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]

    cl = pd.Series(data=cluster_labels, name="cluster_labels").map(lambda i: cluster_names[i])
    cl.index = transformed_train["TRAIN"].index
    final_df = pd.concat([input_df_orig.join(cl, how='left'), additional_columns], axis=1)

    if keptInputColumns is not None:
        final_df = final_df[keptInputColumns + ['cluster_labels']]

    if preprocessing_params["outliers"]["method"] == "CLUSTER":
        final_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)

    dataiku.Dataset(output_dataset).write_from_dataframe(final_df)

    end = unix_time_millis()
    utils.write_done_traininfo(exec_folder, start, start_train, end, listener)
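
# Illustrative sketch of the duck-typing fallback used above: a custom
# clustering model may expose get_cluster_labels(); any other estimator falls
# back to generated "cluster_<i>" names. The helper name is hypothetical.
def _example_cluster_names(clf, cluster_labels):
    import numpy as np
    try:
        return clf.get_cluster_labels()  # model-provided label names
    except AttributeError:
        # One generated name per distinct label id
        return ["cluster_%s" % i for i in range(len(np.unique(cluster_labels)))]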
def _renormalize_dates(df, schema, prep):
    from dataiku.doctor.utils import normalize_dataframe
    df = df.copy(deep=False)
    (names, dtypes, parse_dates) = Dataset.get_dataframe_schema_st(
        schema["columns"], infer_with_pandas=False, bool_as_str=True)
    # For columns that the preparation output schema declares as dates, parse
    # them explicitly, because the pandas CSV parser does not do it
    if parse_dates is not False:
        for col_idx in parse_dates:
            col = schema["columns"][col_idx]["name"]
            if col in df:
                df[col] = pd.to_datetime(df[col])
    return normalize_dataframe(df, prep["per_feature"])
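
# Illustrative sketch (hypothetical schema and data) of the parsing loop
# above: string columns flagged as dates in the schema are converted to
# datetime columns with pd.to_datetime.
def _example_renormalize_dates():
    import pandas as pd
    schema = {"columns": [{"name": "x", "type": "double"},
                          {"name": "when", "type": "date"}]}
    parse_dates = [1]  # column indices the schema flags as dates
    df = pd.DataFrame({"x": [1.0], "when": ["2015-03-14T00:00:00Z"]})
    for col_idx in parse_dates:
        col = schema["columns"][col_idx]["name"]
        if col in df:
            df[col] = pd.to_datetime(df[col])  # parse ISO date strings
    return df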
def output_generator():
    logging.info("Start output generator ...")
    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"],
        parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema, preprocessing_params["per_feature"],
        prediction_type=core_params["prediction_type"])
    logging.info("Reading with dtypes: %s" % dtypes)
    for i in xrange(0, len(names)):
        logging.info("Column %s = %s (dtype=%s)" % (i, names[i], dtypes.get(names[i], None)))

    for input_df in input_dataset.iter_dataframes_forced_types(
            names, dtypes, parse_date_columns, chunksize=batch_size,
            float_precision="round_trip"):
        input_df.index = range(input_df.shape[0])
        input_df_orig = input_df.copy()
        logging.info("Got a dataframe: %s" % str(input_df.shape))
        normalize_dataframe(input_df, preprocessing_params['per_feature'])
        for col in input_df:
            logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

        logging.info("Predicting it")
        if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
            pred_df = binary_classification_predict(
                clf, pipeline, modeling_params, preprocessing_params,
                preprocessing_handler.target_map,
                recipe_desc["forcedClassifierThreshold"],
                input_df,
                output_probas=recipe_desc["outputProbabilities"])
            # Probability percentile & conditional outputs
            pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                pred_df, recipe_desc, model_folder, cond_outputs,
                preprocessing_handler.target_map)
        elif core_params["prediction_type"] == constants.MULTICLASS:
            pred_df = multiclass_predict(
                clf, pipeline, modeling_params, preprocessing_params,
                preprocessing_handler.target_map, input_df,
                output_probas=recipe_desc["outputProbabilities"])
        elif core_params["prediction_type"] == constants.REGRESSION:
            pred_df = regression_predict(clf, pipeline, modeling_params, input_df)
        else:
            raise ValueError("bad prediction type %s" % core_params["prediction_type"])

        logging.info("pred df debug:")
        logging.info(pred_df)
        logging.info("Done predicting it")

        if recipe_desc.get("filterInputColumns", False):
            clean_kept_columns = [c for c in recipe_desc["keptInputColumns"]
                                  if c not in pred_df.columns]
        else:
            clean_kept_columns = [c for c in input_df_orig.columns
                                  if c not in pred_df.columns]
        yield pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)
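
# Hypothetical usage sketch: the generator above yields one scored DataFrame
# per chunk, so a caller can stream chunks to the output dataset without
# holding the full scored table in memory. `dataset` and `generator` are
# illustrative parameter names, not names from the recipe.
def _example_stream_scored_chunks(dataset, generator):
    # dataset: a dataiku.Dataset; generator: e.g. output_generator()
    with dataset.get_writer() as writer:
        for scored_chunk in generator:
            writer.write_dataframe(scored_chunk)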
def main(model_folder, input_dataset_smartname, output_dataset_smartname,
         metrics_dataset_smartname, recipe_desc, script,
         preparation_output_schema, cond_outputs=None):
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    core_params = dkujson.load_from_filepath(osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    collector_data = dkujson.load_from_filepath(osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline(with_target=True)

    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    logging.info("Scoring data")
    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"],
        parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema, preprocessing_params["per_feature"],
        prediction_type=core_params["prediction_type"])
    logging.info("Reading with dtypes: %s" % dtypes)
    for i in xrange(0, len(names)):
        logging.info("Column %s = %s (dtype=%s)" % (i, names[i], dtypes.get(names[i], None)))

    with input_dataset._stream(infer_with_pandas=True,
                               sampling='head',
                               sampling_column=None,
                               limit=None,
                               ratio=None,
                               columns=names) as stream:
        input_df = pd.read_table(stream,
                                 names=names,
                                 dtype=dtypes,
                                 header=None,
                                 sep='\t',
                                 doublequote=True,
                                 quotechar='"',
                                 parse_dates=parse_date_columns,
                                 float_precision="round_trip")

    input_df_orig = input_df.copy()
    logging.info("Got a dataframe: %s" % str(input_df.shape))
    normalize_dataframe(input_df, preprocessing_params['per_feature'])
    for col in input_df:
        logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

    logging.info("Processing it")
    transformed = pipeline.process(input_df)

    logging.info("Predicting it")
    if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
        pred_df = binary_classification_predict(
            clf, pipeline, modeling_params, preprocessing_params,
            preprocessing_handler.target_map,
            recipe_desc["forcedClassifierThreshold"],
            input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble models, we need to indicate that we have a target,
            # so that a target-aware pipeline is selected.
            # See 0c87605 for more information
            ensemble_has_target=True)

        # Probability percentile & conditional outputs
        has_cond_output = recipe_desc["outputProbabilities"] and cond_outputs
        has_percentiles = recipe_desc["outputProbaPercentiles"] or (
            has_cond_output and len([co for co in cond_outputs
                                     if co["input"] == "proba_percentile"]))
        if has_percentiles:
            model_perf = dkujson.load_from_filepath(osp.join(model_folder, "perf.json"))
            if "probaPercentiles" in model_perf and model_perf["probaPercentiles"]:
                percentile = pd.Series(model_perf["probaPercentiles"])
                proba_1 = "proba_" + str(next(
                    k for k, v in preprocessing_handler.target_map.items() if v == 1))
                pred_df["proba_percentile"] = pred_df[proba_1].apply(
                    lambda p: percentile.where(percentile <= p).count() + 1)
            else:
                raise Exception("Probability percentiles are missing from model.")

        if has_cond_output:
            for co in cond_outputs:
                inp = pred_df[co["input"]]
                acc = inp.notnull()  # condition accumulator
                for r in co["rules"]:
                    if r["operation"] == 'GT':
                        cond = inp > r["operand"]
                    elif r["operation"] == 'GE':
                        cond = inp >= r["operand"]
                    elif r["operation"] == 'LT':
                        cond = inp < r["operand"]
                    elif r["operation"] == 'LE':
                        cond = inp <= r["operand"]
                    pred_df.loc[acc & cond, co["name"]] = r["output"]
                    acc = acc & (~cond)
                pred_df.loc[acc, co["name"]] = co.get("defaultOutput", "")

        if has_percentiles and not recipe_desc["outputProbaPercentiles"]:
            # proba_percentile was computed only for conditional outputs
            pred_df.drop("proba_percentile", axis=1, inplace=True)

    elif core_params["prediction_type"] == constants.MULTICLASS:
        pred_df = multiclass_predict(
            clf, pipeline, modeling_params, preprocessing_params,
            preprocessing_handler.target_map, input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble models, we need to indicate that we have a target,
            # so that a target-aware pipeline is selected.
            # See 0c87605 for more information
            ensemble_has_target=True)
    elif core_params["prediction_type"] == constants.REGRESSION:
        pred_df = regression_predict(
            clf, pipeline, modeling_params, input_df,
            # For ensemble models, we need to indicate that we have a target,
            # so that a target-aware pipeline is selected.
            # See 0c87605 for more information
            ensemble_has_target=True)
    else:
        raise ValueError("bad prediction type %s" % core_params["prediction_type"])

    # Add error information to pred_df
    y = transformed["target"]
    target_mapping = {}
    if core_params["prediction_type"] in [constants.BINARY_CLASSIFICATION,
                                          constants.MULTICLASS]:
        target_mapping = {
            label: int(class_id)
            for label, class_id in preprocessing_handler.target_map.items()
        }
    pred_df = add_evaluation_columns(core_params["prediction_type"],
                                     pred_df, y, target_mapping)
    logging.info("Done predicting it")

    if recipe_desc.get("filterInputColumns", False):
        clean_kept_columns = [c for c in recipe_desc["keptInputColumns"]
                              if c not in pred_df.columns]
    else:
        clean_kept_columns = [c for c in input_df_orig.columns
                              if c not in pred_df.columns]
    output_df = pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    # Write scored data
    output_dataset = dataiku.Dataset(output_dataset_smartname)
    # logging.info("writing scored schema")
    # output_dataset.write_schema_from_dataframe(output_df)  # backend should do this
    logging.info("writing scored data")
    output_dataset.write_from_dataframe(output_df)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    if with_sample_weight:
        sample_weight = transformed["weight"]
    else:
        sample_weight = None

    metrics_df = compute_metrics_df(core_params["prediction_type"],
                                    target_mapping, modeling_params,
                                    output_df, recipe_desc, y,
                                    transformed["UNPROCESSED"], sample_weight)

    # Write metrics dataset
    if metrics_dataset_smartname:
        metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
        # logging.info("writing metrics schema")
        # metrics_dataset.write_schema_from_dataframe(metrics_df)  # backend should maybe do this?
        logging.info("writing metrics data")
        metrics_dataset.write_from_dataframe(metrics_df)
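
# Illustrative sketch (hypothetical rule set) of the conditional-output loop
# above: rules apply first-match-wins. `acc` tracks rows not yet matched;
# each rule claims the rows it matches, and whatever is left at the end gets
# the default output. Null inputs never match and stay unset.
def _example_conditional_outputs():
    import pandas as pd
    pred_df = pd.DataFrame({"proba_1": [0.1, 0.55, 0.9, None]})
    co = {"input": "proba_1", "name": "decision", "defaultOutput": "review",
          "rules": [{"operation": "GE", "operand": 0.8, "output": "accept"},
                    {"operation": "LT", "operand": 0.3, "output": "reject"}]}
    inp = pred_df[co["input"]]
    acc = inp.notnull()  # condition accumulator: rows still unmatched
    for r in co["rules"]:
        cond = inp >= r["operand"] if r["operation"] == "GE" else inp < r["operand"]
        pred_df.loc[acc & cond, co["name"]] = r["output"]
        acc = acc & (~cond)  # matched rows leave the accumulator
    pred_df.loc[acc, co["name"]] = co["defaultOutput"]
    return pred_df  # 0.1 -> reject, 0.55 -> review, 0.9 -> accept, NaN -> NaN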
def scored_dataset_generator(model_folder, input_dataset_smartname, recipe_desc,
                             script, preparation_output_schema, cond_outputs,
                             output_y=False, output_input_df=False,
                             should_add_evaluation_columns=False):
    from keras.models import load_model
    from dataiku.doctor.deep_learning import gpu
    from dataiku.doctor.deep_learning.keras_utils import tag_special_features, split_train_per_input

    # Load GPU options
    if recipe_desc["useGPU"]:
        gpu.load_gpu_options(recipe_desc["gpuList"],
                             allow_growth=recipe_desc["gpuAllowGrowth"],
                             per_process_gpu_memory_fraction=float(recipe_desc["perGPUMemoryFraction"]))
    else:
        gpu.deactivate_gpu()

    batch_size = recipe_desc.get("batchSize", 100)

    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    core_params = dkujson.load_from_filepath(osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(osp.join(model_folder, "rpreprocessing_params.json"))
    collector_data = dkujson.load_from_filepath(osp.join(model_folder, "collector_data.json"))
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "actual_params.json"))["resolved"]
    prediction_type = core_params["prediction_type"]

    # Tag special features so that they are taken into account only in the
    # special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    preproc_handler = PreprocessingHandler.build(core_params, preprocessing_params, model_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline(with_target=output_y)
    target_map = preproc_handler.target_map

    logging.info("Loading model")
    model = load_model(osp.join(model_folder, constants.KERAS_MODEL_FILENAME))

    logging.info("Start output generator")
    (names, dtypes, parse_date_columns) = dataiku.Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"],
        parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema, preprocessing_params["per_feature"],
        prediction_type=prediction_type)
    logging.info("Reading with dtypes: %s" % dtypes)
    for i in xrange(0, len(names)):
        logging.info("Column %s = %s (dtype=%s)" % (i, names[i], dtypes.get(names[i], None)))

    for input_df in input_dataset.iter_dataframes_forced_types(
            names, dtypes, parse_date_columns, chunksize=batch_size):
        input_df.index = range(input_df.shape[0])
        input_df_orig = input_df.copy()
        logging.info("Got a dataframe chunk: %s" % str(input_df.shape))
        normalize_dataframe(input_df, preprocessing_params['per_feature'])
        for col in input_df:
            logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

        logging.info("Processing chunk")
        transformed = pipeline.process(input_df)
        features_X_orig = transformed["TRAIN"]
        transformed_X_mf = transformed["TRAIN"]
        inputs_dict = split_train_per_input(transformed_X_mf, per_feature,
                                            pipeline.generated_features_mapping)

        if prediction_type in [constants.MULTICLASS, constants.BINARY_CLASSIFICATION]:
            inv_map = {int(class_id): label for label, class_id in target_map.items()}
            classes = [class_label for (_, class_label) in sorted(inv_map.items())]

            if prediction_type == constants.MULTICLASS:
                probas_raw = model.predict(inputs_dict)
                preds = np.argmax(probas_raw, axis=1)

            if prediction_type == constants.BINARY_CLASSIFICATION:
                if modeling_params["keras"]["oneDimensionalOutput"]:
                    # The model outputs a single sigmoid column: expand it
                    # into two columns of class probabilities
                    probas_one = np.squeeze(model.predict(inputs_dict), axis=1)
                    probas_raw = np.zeros((probas_one.shape[0], 2))
                    probas_raw[:, 1] = probas_one
                    probas_raw[:, 0] = 1 - probas_one
                else:
                    probas_raw = model.predict(inputs_dict)
                    probas_one = probas_raw[:, 1]
                threshold = recipe_desc["forcedClassifierThreshold"]
                preds = (probas_one > threshold).astype(np.int)

            (nb_rows, nb_present_classes) = probas_raw.shape
            logging.info("Probas raw shape %s/%s target_map=%s",
                         nb_rows, nb_present_classes, len(target_map))

            preds_remapped = np.zeros(preds.shape, dtype="object")
            for (mapped_value, original_value) in inv_map.items():
                idx = (preds == mapped_value)
                preds_remapped[idx] = original_value
            pred_df = pd.DataFrame({"prediction": preds_remapped})
            pred_df.index = features_X_orig.index

            proba_cols = ["proba_{}".format(c) for c in classes]
            # For binary classification: probas must be computed if there are
            # conditional outputs that use them. They are deleted afterwards
            # (if outputProbabilities is False) in
            # binary_classif_scoring_add_percentile_and_cond_outputs
            probas_in_cond_outputs = (cond_outputs and len(
                [co for co in cond_outputs if co["input"] in proba_cols]) > 0)
            use_probas = recipe_desc["outputProbabilities"] or probas_in_cond_outputs
            if use_probas:
                proba_df = pd.DataFrame(probas_raw, columns=proba_cols)
                proba_df.index = features_X_orig.index
                pred_df = pd.concat([proba_df, pred_df], axis=1)

            if prediction_type == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs, target_map)

        elif prediction_type == constants.REGRESSION:
            preds = model.predict(inputs_dict)
            pred_df = pd.DataFrame({"prediction": np.squeeze(preds, axis=1)})
            pred_df.index = features_X_orig.index

        if should_add_evaluation_columns:
            if not output_y:
                raise ValueError("Cannot add evaluation columns if not outputting Y")
            else:
                target_mapping = {}
                if core_params["prediction_type"] in [constants.BINARY_CLASSIFICATION,
                                                      constants.MULTICLASS]:
                    target_mapping = {
                        label: int(class_id)
                        for label, class_id in preproc_handler.target_map.items()
                    }
                add_evaluation_columns(prediction_type, pred_df,
                                       transformed["target"], target_mapping)

        logging.info("Done predicting it")
        if recipe_desc.get("filterInputColumns", False):
            clean_kept_columns = [c for c in recipe_desc["keptInputColumns"]
                                  if c not in pred_df.columns]
        else:
            clean_kept_columns = [c for c in input_df_orig.columns
                                  if c not in pred_df.columns]

        res = {
            "scored": pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)
        }
        if output_y:
            res["y"] = transformed["target"]
        if output_input_df:
            res["input_df"] = input_df_orig
        yield res
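
# Illustrative sketch (hypothetical values) of the binary post-processing
# above: a one-column sigmoid output is expanded into two-class
# probabilities, thresholded, and remapped back to the original class labels.
def _example_binary_postprocess():
    import numpy as np
    probas_one = np.array([0.2, 0.7, 0.9])         # P(class 1) from the model
    probas_raw = np.zeros((probas_one.shape[0], 2))
    probas_raw[:, 1] = probas_one
    probas_raw[:, 0] = 1 - probas_one               # P(class 0) = 1 - P(class 1)
    preds = (probas_one > 0.5).astype(int)          # apply decision threshold
    inv_map = {0: "no", 1: "yes"}                   # class id -> original label
    preds_remapped = np.zeros(preds.shape, dtype="object")
    for mapped_value, original_value in inv_map.items():
        preds_remapped[preds == mapped_value] = original_value
    return probas_raw, preds_remapped               # ['no', 'yes', 'yes']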