def __init__(self, split_desc, core_params, preprocessing_folder, model_folder):
    """Keep the model locations/parameters and build the predictor.

    Reads ``rpreprocessing_params.json`` from ``preprocessing_folder`` and
    ``user_meta.json`` / ``rmodeling_params.json`` from ``model_folder``,
    then builds a "PREDICTION" predictor through ``build_predictor``.
    The remaining attributes are initialised to None.
    """
    def read_json(folder, filename):
        # Small local shortcut: load <folder>/<filename> through dkujson.
        return dkujson.load_from_filepath(osp.join(folder, filename))

    # Constructor arguments, stored untouched.
    self._split_desc = split_desc
    self._core_params = core_params
    self._preprocessing_folder = preprocessing_folder
    self._model_folder = model_folder

    # Parameter files saved next to the model.
    self._preprocessing_params = read_json(preprocessing_folder,
                                           "rpreprocessing_params.json")
    self._user_meta = read_json(model_folder, "user_meta.json")
    self._modeling_params = read_json(model_folder, "rmodeling_params.json")

    self._keras_scoring_batches = 100

    # no need for conditional outputs in this case
    conditional_outputs = []
    self._predictor = build_predictor("PREDICTION",
                                      self._model_folder,
                                      self._preprocessing_folder,
                                      conditional_outputs,
                                      self._core_params,
                                      self._split_desc)

    # Not populated by the constructor.
    self._collector_data = None
    self._preproc_handler = None
    self._pipeline = None
    self._clf = None
    self._train_df = None
    self._test_df = None
    self._full_df = None
def build_predictor_for_saved_model(model_folder, model_type, conditional_outputs):
    """Build a predictor for a saved model stored entirely in ``model_folder``.

    ``core_params.json`` is only read when ``is_model_prediction(model_type)``
    is true; otherwise ``core_params`` is passed as None. The split
    description is read from ``split/split.json``. The same folder is used as
    both the model folder and the preprocessing folder.
    """
    core_params = None
    if is_model_prediction(model_type):
        core_params_path = osp.join(model_folder, "core_params.json")
        core_params = dkujson.load_from_filepath(core_params_path)

    split_path = osp.join(model_folder, "split", "split.json")
    split_desc = dkujson.load_from_filepath(split_path)

    return build_predictor(model_type, model_folder, model_folder,
                           conditional_outputs, core_params, split_desc)
def save(self, pd_result):
    """Store ``pd_result`` in ``iperf.json`` and return the updated content.

    An existing partial dependence entry for the same feature is replaced
    (only the first matching entry is removed). The file is read from and
    written back to ``self.folder``.
    """
    iperf_path = os.path.join(self.folder, "iperf.json")
    iperf = dkujson.load_from_filepath(iperf_path)
    dependencies = iperf.setdefault("partialDependencies", [])

    # Drop the previous entry for this feature, if any.
    for position, existing in enumerate(dependencies):
        if existing.get('feature') == pd_result.feature.name:
            del dependencies[position]
            break

    entry = {
        "data": list(pd_result.partial_dependence),
        "feature": pd_result.feature.name,
        "distribution": pd_result.distribution,
        "computedPostTraining": True,
        "isDate": self.dtypes[pd_result.feature.name] == "date",
        "unrepresentedModalities": pd_result.unrepresented_modalities,
    }

    if pd_result.indices_to_drop is not None:
        entry["indicesToDrop"] = pd_result.indices_to_drop

    # The scale is stored under a key that depends on the feature type.
    feature_type = pd_result.feature.type
    if feature_type == 'CATEGORY':
        entry["categories"] = list(pd_result.scale)
    elif feature_type == 'NUMERIC':
        entry["featureBins"] = list(pd_result.scale)

    dependencies.append(entry)
    dkujson.dump_to_filepath(os.path.join(self.folder, "iperf.json"), iperf)
    return iperf
def load_relfilepath(basepath, relative_filepath):
    """Load ``relative_filepath`` (resolved against ``basepath``) through dkujson.

    Returns None if the file does not exist.
    """
    target = osp.join(basepath, relative_filepath)
    if not osp.exists(target):
        return None
    return dkujson.load_from_filepath(target)
def write_running_traininfo(folder, start_time, listener):
    """Mark the training in ``folder`` as RUNNING in ``train_info.json``.

    The existing file content is kept when the file is present; the state,
    start time and progress (``listener.to_jsonifiable()``) are overwritten.
    """
    status_filepath = osp.join(folder, "train_info.json")
    status = dkujson.load_from_filepath(status_filepath) if osp.exists(status_filepath) else {}

    status["state"] = "RUNNING"
    status["startTime"] = start_time
    status["progress"] = listener.to_jsonifiable()

    dkujson.dump_to_filepath(status_filepath, status)
def clustering_rescore(split_desc, preprocessing_folder, model_folder):
    """Re-run the scoring of a saved clustering model on the "full" split.

    ``split_desc`` is a serialized description decoded with ``dkujson.loads``.
    The preprocessing pipeline is rebuilt from the saved parameters and
    collector data, the pickled clusterer is loaded from ``model_folder``,
    and a ``ClusteringModelScorer`` is run. Returns "ok".
    """
    # Saved parameters.
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(preprocessing_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rmodeling_params.json"))
    user_meta = dkujson.load_from_filepath(
        osp.join(model_folder, "user_meta.json"))

    # Source data.
    split_desc = dkujson.loads(split_desc)
    source_df = df_from_split_desc(split_desc, "full",
                                   preprocessing_params["per_feature"])
    logging.info("Loaded source df: shape=(%d,%d)" % source_df.shape)

    # Preprocessing pipeline; we're not saving the data, hence the empty
    # data folder given to the handler.
    collector_data = dkujson.load_from_filepath(
        osp.join(preprocessing_folder, "collector_data.json"))
    handler = ClusteringPreprocessingHandler({}, preprocessing_params, "")
    handler.collector_data = collector_data
    pipeline = handler.build_preprocessing_pipeline()

    # Copy the index before the pipeline runs on the dataframe.
    source_df_index = source_df.index.copy()
    transformed_source = pipeline.fit_and_process(source_df)

    logging.info("Loading the clustering model")
    clusterer_path = osp.join(model_folder, "clusterer.pkl")
    with open(clusterer_path, "rb") as model_file:
        clf = pickle.load(model_file)

    try:
        logging.info("Post-processing the model")
        clf.post_process(user_meta)
    except AttributeError:
        # An AttributeError during post-processing is ignored.
        pass

    train_np, _ = prepare_multiframe(transformed_source["TRAIN"], modeling_params)
    cluster_labels = clf.predict(train_np)

    logging.info("Rescoring the clustering model")
    scorer = ClusteringModelScorer(clf, transformed_source, source_df_index,
                                   cluster_labels, preprocessing_params,
                                   modeling_params, pipeline, model_folder)
    scorer.score()
    return "ok"
def make_running_traininfo(folder, start_time, listener):
    """Build (without writing it) the RUNNING status for ``folder``.

    Starts from the content of ``train_info.json`` when the file exists.
    ``listener`` is either a ``ProgressListener`` or an iterable that is
    folded with ``merge_listeners``. Returns the status.
    """
    status_filepath = osp.join(folder, "train_info.json")
    status = dkujson.load_from_filepath(status_filepath) if osp.exists(status_filepath) else {}

    status["state"] = "RUNNING"
    status["startTime"] = start_time

    single_listener = isinstance(listener, ProgressListener)
    status["progress"] = (listener.to_jsonifiable() if single_listener
                          else reduce(merge_listeners, listener))
    return status
def write_done_traininfo(folder, start_time, start_training_time, end_time, listener, end_preprocessing_time=None):
    """Mark the training in ``folder`` as DONE in ``train_info.json``.

    Records the start/end times, the preprocessing and training durations
    and the progress. ``listener`` is either a ``ProgressListener`` or an
    iterable that is folded with ``merge_listeners``.
    """
    status_filepath = osp.join(folder, "train_info.json")
    status = dkujson.load_from_filepath(status_filepath) if osp.exists(status_filepath) else {}

    status["state"] = "DONE"
    status["startTime"] = start_time
    status["endTime"] = end_time

    # Preprocessing is considered to end when training starts unless a
    # (truthy) explicit end time was provided.
    preprocessing_end = end_preprocessing_time or start_training_time
    status["preprocessingTime"] = preprocessing_end - start_time
    status["trainingTime"] = end_time - start_training_time

    single_listener = isinstance(listener, ProgressListener)
    status["progress"] = (listener.to_jsonifiable() if single_listener
                          else reduce(merge_listeners, listener))

    dkujson.dump_to_filepath(status_filepath, status)
def binary_classif_scoring_add_percentile_and_cond_outputs(pred_df, recipe_desc, model_folder, cond_outputs, target_map):
    """Add the probability percentile and conditional output columns to ``pred_df``.

    ``pred_df`` is modified in place and also returned.

    :param pred_df: prediction dataframe; the ``proba_<class>`` columns must be
        present when percentiles or probability-based rules are requested
    :param recipe_desc: recipe description; ``outputProbabilities`` and
        ``outputProbaPercentiles`` are read
    :param model_folder: folder holding ``perf.json`` (only read when
        percentiles are needed)
    :param cond_outputs: list of conditional outputs (``name``, ``input``,
        ``rules``, optional ``defaultOutput``), or None
    :param target_map: mapping from class label to class id
    :raises Exception: when percentiles are needed but missing from ``perf.json``
    :raises ValueError: when a rule uses an operation other than GT/GE/LT/LE
    """
    inv_map = {int(class_id): label for label, class_id in target_map.items()}
    classes = [class_label for (_, class_label) in sorted(inv_map.items())]
    proba_cols = ["proba_{}".format(c) for c in classes]

    conditional_outputs = cond_outputs or []
    has_probas = recipe_desc["outputProbabilities"] or any(
        co["input"] in proba_cols for co in conditional_outputs)
    has_percentiles = recipe_desc["outputProbaPercentiles"] or any(
        co["input"] == "proba_percentile" for co in conditional_outputs)

    if has_percentiles:
        model_perf = dkujson.load_from_filepath(osp.join(model_folder, "perf.json"))
        # `.get` instead of the Python-2-only `has_key`; a missing key and an
        # empty value are both treated as "no percentiles".
        proba_percentiles = model_perf.get("probaPercentiles")
        if not proba_percentiles:
            raise Exception("Probability percentiles are missing from model.")
        percentile = pd.Series(proba_percentiles)
        proba_1 = "proba_" + str(inv_map[1])
        # Percentile = 1 + number of stored thresholds that are <= the probability.
        pred_df["proba_percentile"] = pred_df[proba_1].apply(
            lambda p: percentile.where(percentile <= p).count() + 1)

    comparisons = {
        'GT': lambda series, operand: series > operand,
        'GE': lambda series, operand: series >= operand,
        'LT': lambda series, operand: series < operand,
        'LE': lambda series, operand: series <= operand,
    }

    for co in conditional_outputs:
        inp = pred_df[co["input"]]
        acc = inp.notnull()  # condition accumulator: rows not matched by any rule yet
        for r in co["rules"]:
            compare = comparisons.get(r["operation"])
            if compare is None:
                # Previously an unknown operation either crashed with an
                # unbound `cond` or silently reused the previous rule's mask.
                raise ValueError(
                    "Unsupported conditional output operation: %s" % r["operation"])
            cond = compare(inp, r["operand"])
            # First matching rule wins: later rules only see unmatched rows.
            pred_df.loc[acc & cond, co["name"]] = r["output"]
            acc = acc & (~cond)
        pred_df.loc[acc, co["name"]] = co.get("defaultOutput", "")

    if has_percentiles and not recipe_desc["outputProbaPercentiles"]:
        # was only for conditional outputs
        pred_df.drop("proba_percentile", axis=1, inplace=True)
    if has_probas and not recipe_desc["outputProbabilities"]:
        # was only for conditional outputs
        pred_df.drop(proba_cols, axis=1, inplace=True)

    return pred_df
def __init__(self, parallel, m_folder=None, n_splits=None, n_candidates=None, timeout=None, n_jobs=None, evaluation_metric=None, metric_sign=1):
    """Set up the watcher state, reloading saved grid points when watching.

    Watching is enabled when ``m_folder`` is given: the grid folders are
    derived from it, the interrupt folder is registered, and the grid
    points saved in ``grid_search_done_py.json`` (if present) are reloaded.
    ``timeout`` is expressed in minutes; None means no deadline.
    """
    self.parallel = parallel
    self.m_folder = m_folder
    self.n_splits = n_splits
    self.n_candidates = n_candidates
    self._watching = m_folder is not None
    self.grid_search_summary = []

    # timeout in minutes, converted to an absolute deadline in seconds
    if timeout is not None:
        self.end_time = time.time() + timeout * 60
    else:
        self.end_time = None

    self.initial_grid_points = []
    self.initial_grid_point_ids = []
    self.n_jobs = n_jobs
    self.evaluation_metric = evaluation_metric
    self.metric_sign = metric_sign
    self.start_time = unix_time_millis()
    self.is_interrupted = False

    if self._watching:
        self.grid_folder = os.path.join(m_folder, 'grid')
        self.grid_tmp_folder = os.path.join(m_folder, 'grid.tmp')
        interrupt_optimization.set_interrupt_folder(m_folder)

        # Reload the grid points already scored, if a summary file exists.
        self.grid_search_file = os.path.join(m_folder, 'grid_search_done_py.json')
        if os.path.exists(self.grid_search_file):
            self.grid_search_summary = dkujson.load_from_filepath(self.grid_search_file)
        else:
            self.grid_search_summary = []

        self.initial_grid_point_ids = [
            point['grid_point_id'] for point in self.grid_search_summary
        ]
        self.initial_grid_points = self.grid_search_summary[:]
        for known_id in self.initial_grid_point_ids:
            logging.info("Using precomputed score for Grid point {}".format(known_id))

    super(CVInterruptWatcherThread, self).__init__()
def main(model_folder, input_dataset_smartname, output_dataset_smartname, recipe_desc, script, preparation_output_schema, cond_outputs=None):
    """Score the prepared input dataset with a saved prediction model.

    The input dataset is read in batches (``pythonBatchSize`` from
    ``recipe_desc``, 100000 by default), each batch is normalized and
    predicted according to ``core_params["prediction_type"]``, and the kept
    input columns plus the prediction columns are written to the output
    dataset.

    :param model_folder: folder holding the saved parameters and ``clf.pkl``
    :param input_dataset_smartname: name of the dataset to score
    :param output_dataset_smartname: name of the dataset to write
    :param recipe_desc: recipe description (output options, threshold, ...)
    :param script: preparation script; its ``steps`` are applied to the input
    :param preparation_output_schema: schema of the prepared data
    :param cond_outputs: optional conditional outputs (binary classification)
    :raises ValueError: when the prediction type is not supported
    """
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    # NOTE(review): this listener is never used below; kept because the
    # constructor's side effects cannot be checked from this function.
    listener = ProgressListener()

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    preprocessing_handler = PreprocessingHandler.build(core_params, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    batch_size = recipe_desc.get("pythonBatchSize", 100000)
    logging.info("Scoring with batch size: {}".format(batch_size))

    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    def output_generator():
        """Yield one scored dataframe per input batch."""
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema,
            preprocessing_params["per_feature"],
            prediction_type=core_params["prediction_type"])
        logging.info("Reading with dtypes: %s" % dtypes)

        # enumerate() instead of the Python-2-only xrange() index loop.
        for i, name in enumerate(names):
            logging.info("Column %s = %s (dtype=%s)" % (i, name, dtypes.get(name, None)))

        for input_df in input_dataset.iter_dataframes_forced_types(
                names, dtypes, parse_date_columns, chunksize=batch_size,
                float_precision="round_trip"):
            # Give each batch a 0-based index.
            input_df.index = range(input_df.shape[0])
            # Keep an untouched copy: normalization modifies input_df.
            input_df_orig = input_df.copy()
            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

            logging.info("Processing it")
            logging.info("Predicting it")
            if core_params["prediction_type"] == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classification_predict(
                    clf, pipeline, modeling_params, preprocessing_params,
                    preprocessing_handler.target_map,
                    recipe_desc["forcedClassifierThreshold"],
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])
                # Probability percentile & Conditional outputs
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs,
                    preprocessing_handler.target_map)
            elif core_params["prediction_type"] == constants.MULTICLASS:
                pred_df = multiclass_predict(
                    clf, pipeline, modeling_params, preprocessing_params,
                    preprocessing_handler.target_map,
                    input_df,
                    output_probas=recipe_desc["outputProbabilities"])
            elif core_params["prediction_type"] == constants.REGRESSION:
                pred_df = regression_predict(clf, pipeline, modeling_params, input_df)
            else:
                raise ValueError("bad prediction type %s" % core_params["prediction_type"])

            logging.info("pred df debug :")
            logging.info(pred_df)
            logging.info("Done predicting it")

            # Input columns that clash with a prediction column are dropped
            # in favour of the prediction.
            if recipe_desc.get("filterInputColumns", False):
                clean_kept_columns = [
                    c for c in recipe_desc["keptInputColumns"]
                    if c not in pred_df.columns
                ]
            else:
                clean_kept_columns = [
                    c for c in input_df_orig.columns
                    if c not in pred_df.columns
                ]
            yield pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            writer.write_dataframe(output_df)
            logging.info("Output df written")
def get_collector_data(self):
    """Return the collector data, loading ``collector_data.json`` on first use.

    The file is read from ``self._preprocessing_folder`` and the result is
    cached in ``self._collector_data``.
    """
    if self._collector_data is not None:
        return self._collector_data

    collector_path = osp.join(self._preprocessing_folder, "collector_data.json")
    self._collector_data = dkujson.load_from_filepath(collector_path)
    return self._collector_data
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders, preprocessing_folders):
    """Build, save and score an ensemble made of already fitted models.

    ``split_desc``, ``core_params``, ``model_folders`` and
    ``preprocessing_folders`` are serialized values decoded with
    ``dkujson.loads``. The ensemble is fitted on the "train" split, pickled
    to ``clf.pkl`` in ``model_folder`` and scored on the "test" split.
    Progress is reported through ``train_info.json``. Returns "ok".
    """
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def publish_progress():
        # Persist the current listener state as the RUNNING train info.
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}

    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]

    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        publish_progress()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted

        train = df_from_split_desc(
            split_desc, "train",
            ensemble_params["preprocessing_params"][0]["per_feature"],
            core_params["prediction_type"])

        iperf = {
            # todo : not the right count as may have dropped ...
            "modelInputNRows": train.shape[0],
            # makes no sense for an ensemble as may have different preprocessings
            "modelInputNCols": -1,
            "modelInputIsSparse": False,
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)

        clf = ensemble_from_fitted(core_params, ensemble_params,
                                   preprocessing_folders, model_folders,
                                   train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        publish_progress()
        model_path = osp.join(model_folder, "clf.pkl")
        with open(model_path, dku_write_mode_for_pickling()) as model_file:
            pickle.dump(clf, model_file, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        publish_progress()
        test = df_from_split_desc(
            split_desc, "test",
            ensemble_params["preprocessing_params"][0]["per_feature"],
            core_params["prediction_type"])

        # this is annoying, but we have to use one of the previous
        # preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(
            osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(
            osp.join(prep_folder, "collector_data.json"))
        handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        handler.collector_data = collector_data
        pipe = handler.build_preprocessing_pipeline(with_target=True)

        transformed = pipe.process(test)
        y = transformed["target"]
        sample_weight = transformed["weight"] if with_sample_weight else None

        # Now that the CLF with scorable pipelines has been saved, set it in
        # "pipelines with target" mode to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        prediction_type = core_params["prediction_type"]
        pred = clf.predict(test)
        if prediction_type == "REGRESSION":
            probas = None
            target_map = None
        else:
            probas = clf.predict_proba(test)
            remapping = ensemble_params["preprocessing_params"][0]["target_remapping"]
            target_map = {t["sourceValue"]: t["mappedValue"] for t in remapping}

        if prediction_type == "REGRESSION":
            scorer = RegressionModelScorer(
                modeling_params, clf, pred, y, model_folder, transformed,
                test.index.copy(), sample_weight)
        elif prediction_type == "BINARY_CLASSIFICATION":
            scorer = BinaryClassificationModelScorer(
                modeling_params, clf, model_folder, pred, probas, y,
                target_map, transformed, test.index.copy(), sample_weight)
        else:
            scorer = MulticlassModelScorer(
                modeling_params, clf, model_folder, pred, probas,
                y.astype(int), target_map, transformed, test.index.copy(),
                sample_weight)
        scorer.score()

    publish_progress()

    end = unix_time_millis()

    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"),
                             {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener,
                               end_preprocessing_time=start)

    return "ok"
def get_deep_learning_model_info(folder):
    """Return the content of ``keras_model_training_info.json`` found in ``folder``."""
    return dkujson.load_from_filepath(
        osp.join(folder, "keras_model_training_info.json"))
def build_predictor(model_type, model_folder, preprocessing_folder, conditional_outputs, core_params, split_desc):
    """Build the predictor matching a saved model.

    Loads the saved parameters, the model itself (Keras ``keras_model.h5``
    when the algorithm is "KERAS_CODE", otherwise the pickled ``clf.pkl`` /
    ``clusterer.pkl``) and the preprocessing handler, then returns an
    ``EnsemblePredictor``, a ``KerasPredictor`` or a ``Predictor``.

    Raises NotImplementedError when loading the model fails with an IOError.
    """
    is_prediction = is_model_prediction(model_type)

    # import various parameters
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(preprocessing_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(preprocessing_folder, "collector_data.json"))
    user_meta = dkujson.load_from_filepath(
        osp.join(model_folder, "user_meta.json"))
    schema = split_desc["schema"]

    algorithm = modeling_params["algorithm"]
    is_keras_backend = algorithm == "KERAS_CODE"

    # load model
    if is_keras_backend:
        try:
            # If model was trained on GPU, the prediction will always use GPU as well
            # In order for one model not to take all the GPU capabilities, we force TensorFlow
            # to "allow_growth" on each GPU, i.e. it will take only the required resources
            from dataiku.doctor.deep_learning import gpu
            gpu.load_gpu_options_only_allow_growth()

            from keras.models import load_model
            model = load_model(osp.join(model_folder, "keras_model.h5"))
        except IOError:
            raise NotImplementedError(
                "Using saved models in python recipes is limited to models trained using the Keras engine"
            )
    else:
        try:
            pkl_name = "clf.pkl" if is_prediction else "clusterer.pkl"
            with open(osp.join(model_folder, pkl_name), "rb") as model_file:
                clf = pickle.load(model_file)
            try:
                logging.info("Post-processing model")
                clf.post_process(user_meta)
            except AttributeError:
                # method does not exist if model cannot be post-processed, just pass
                pass
        except IOError:
            raise NotImplementedError(
                "Using saved models in python recipes is limited to models trained using the python engine"
            )

    # Only prediction has perf.json
    perf_path = osp.join(model_folder, "perf.json")
    model_perf = dkujson.load_from_filepath(perf_path) if osp.isfile(perf_path) else {}

    # Cluster id -> user-facing name (clustering models only).
    if is_prediction:
        cluster_name_map = None
    else:
        cluster_name_map = {}
        if "clusterMetas" in user_meta:
            cluster_name_map = {
                cluster_id: cluster_data["name"]
                for cluster_id, cluster_data in user_meta["clusterMetas"].items()
            }

    # create preprocessing
    from dataiku.doctor.preprocessing_handler import PreprocessingHandler
    from dataiku.doctor.preprocessing_handler import ClusteringPreprocessingHandler
    if is_prediction:
        preprocessing_handler = PreprocessingHandler.build(
            core_params, preprocessing_params, preprocessing_folder)
    else:
        preprocessing_handler = ClusteringPreprocessingHandler(
            {}, preprocessing_params, preprocessing_folder)
    preprocessing_handler.collector_data = collector_data

    params = ModelParams(model_type, modeling_params, preprocessing_params,
                         core_params, schema, user_meta, model_perf,
                         conditional_outputs, cluster_name_map)

    if algorithm == "PYTHON_ENSEMBLE":
        return EnsemblePredictor(params, clf)

    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    if is_keras_backend:
        from dataiku.doctor.deep_learning.keras_utils import tag_special_features
        per_feature = preprocessing_params["per_feature"]
        tag_special_features(per_feature)
        preprocessing = KerasPreprocessing(pipeline, modeling_params, per_feature)
        return KerasPredictor(params, preprocessing, model, modeling_params,
                              batch_size=100)

    preprocessing = Preprocessing(pipeline, modeling_params)
    features = _generate_features(collector_data, pipeline)
    return Predictor(params, preprocessing, features, clf)
def ensemble_from_fitted(core_params, ensemble_params, prep_folders, model_folders, train, with_sample_weight=False, with_class_weight=False):
    """Fit an ensembler on the predictions of already fitted models.

    For each model id, the matching preprocessing pipelines are built once
    per preprocessing hash (with and without target), the pickled model is
    loaded and its predictions (or probabilities when ``proba_inputs`` is
    set) on ``train`` are collected. The target and, if requested, the
    sample weights are taken from the first processed pipeline. Returns an
    ``EnsembleModel``.
    """
    logging.debug("creating ensemble for doctor")

    model_ids = ensemble_params["model_ids"]
    prep_hashes = ensemble_params["preprocessing_hashes"]
    rppp_map = dict(zip(ensemble_params["ordered_hashes"],
                        ensemble_params["preprocessing_params"]))
    proba_inputs = ensemble_params["proba_inputs"]
    prediction_type = core_params["prediction_type"]

    first_prep_params = ensemble_params["preprocessing_params"][0]
    if "target_remapping" not in first_prep_params:
        target_map = None
    else:
        target_map = {x["sourceValue"]: x["mappedValue"]
                      for x in first_prep_params["target_remapping"]}

    pipe_map = {}
    preds = []
    clfs = []
    y = None
    sample_weight = None

    for i, model_id in enumerate(model_ids):
        prep_hash = prep_hashes[model_id]
        prep = rppp_map[prep_hash]

        if prep_hash in pipe_map:
            # prep pipeline was already cached
            pipe_with_target = pipe_map[prep_hash]["with_target"]
        else:
            # load the preparation pipeline
            from dataiku.doctor.preprocessing_handler import PredictionPreprocessingHandler
            prep_folder = prep_folders[i]
            collector_data = dkujson.load_from_filepath(
                osp.join(prep_folder, "collector_data.json"))

            # Build a pipe with target for fitting the ensemble
            handler = PredictionPreprocessingHandler.build(core_params, prep, prep_folder)
            handler.collector_data = collector_data
            pipe_with_target = handler.build_preprocessing_pipeline(with_target=True)

            # Also build a pipe without target for scoring
            handler = PredictionPreprocessingHandler.build(core_params, prep, prep_folder)
            handler.collector_data = collector_data
            scorable_pipe = handler.build_preprocessing_pipeline(with_target=False)

            pipe_map[prep_hash] = {
                "with_target": pipe_with_target,
                "scorable": scorable_pipe
            }

        with open(osp.join(model_folders[i], "clf.pkl"), "rb") as clf_file:
            clf = pickle.load(clf_file)
        clfs.append(clf)

        if y is None:
            # because some rows might be dropped, we have to recover the target here
            transformed = pipe_with_target.process(train)
            y = transformed["target"]
            # because some rows might be dropped, we have to recover the sample weights here
            if with_sample_weight:
                sample_weight = transformed["weight"]

        # todo : group this to avoid multiple preprocessings.
        modeling_params = ensemble_params["modeling_params"][i]
        if prediction_type == "REGRESSION":
            p = regression_predict(clf, pipe_with_target, modeling_params, train)["prediction"]
        elif prediction_type == "BINARY_CLASSIFICATION":
            if "thresholds" not in ensemble_params:
                threshold = 0.5
            else:
                threshold = ensemble_params["thresholds"][i]
            from dataiku.doctor.prediction import binary_classification_predict
            p_df = binary_classification_predict(clf, pipe_with_target, modeling_params,
                                                 prep, target_map, threshold, train)
            p = extract_probas(p_df, prep["target_remapping"]) if proba_inputs else p_df["prediction"]
        else:
            from dataiku.doctor.prediction import multiclass_predict
            p_df = multiclass_predict(clf, pipe_with_target, modeling_params,
                                      prep, target_map, train)
            p = extract_probas(p_df, prep["target_remapping"]) if proba_inputs else p_df["prediction"]
        preds.append(p)

    # fit the ensemble
    # (for classification, `prep` is the preprocessing of the last model handled above)
    if prediction_type == "REGRESSION":
        ensembler = get_regression_ensembler(ensemble_params, preds, y, sample_weight)
    elif proba_inputs:
        ensembler = get_probabilistic_ensembler(len(prep["target_remapping"]), ensemble_params,
                                                preds, y, sample_weight, with_class_weight)
    else:
        ensembler = get_classifier_ensembler(len(prep["target_remapping"]), ensemble_params,
                                             preds, y, sample_weight, with_class_weight)

    ordered_hashes = ensemble_params["ordered_hashes"]
    scorable_pipes = [pipe_map[h]["scorable"] for h in ordered_hashes]
    pipes_with_target = [pipe_map[h]["with_target"] for h in ordered_hashes]

    return EnsembleModel(core_params, ensemble_params, scorable_pipes,
                         pipes_with_target, clfs, ensembler)
if __name__ == "__main__":
    # Bootstrap logging and environment, then read the execution descriptor.
    setup_log()
    read_dku_env_and_set()
    execution = read_execution()
    execution_id = execution['id']

    with ErrorMonitoringWrapper():
        load_libs()
        logging.info("Launching doctor main")

        recipe_type = execution['type']
        if recipe_type in ('RECIPE_PREDICTION_SCORE_PYTHON',
                           'RECIPE_PREDICTION_SCORE_KERAS'):
            # Both recipe types are launched the same way; only the module
            # providing `main` differs.
            if recipe_type == 'RECIPE_PREDICTION_SCORE_PYTHON':
                from dataiku.doctor.prediction.reg_scoring_recipe import main
            else:
                from dataiku.doctor.prediction.keras_scoring_recipe import main

            names = json.loads(execution['payload'])
            main(
                'model',
                names['inputDatasetSmartName'],
                names['outputDatasetSmartName'],
                dkujson.load_from_filepath('work/desc.json'),
                dkujson.load_from_filepath('work/script.json'),
                dkujson.load_from_filepath('work/preparation_output_schema.json'),
                dkujson.load_from_filepath('work/conditional_outputs.json'))
def main(model_folder, input_dataset_smartname, output_dataset_smartname, recipe_desc, script, preparation_output_schema):
    """Assign a cluster label to every row of the prepared input dataset.

    The input dataset is read in batches of 100000 rows, each batch is
    normalized, preprocessed and given a ``cluster_labels`` column (plus any
    additional columns returned by ``clustering_predict``), then written to
    the output dataset.

    :param model_folder: folder holding the saved parameters and ``clusterer.pkl``
    :param input_dataset_smartname: name of the dataset to score
    :param output_dataset_smartname: name of the dataset to write
    :param recipe_desc: recipe description (``filterInputColumns``, ``keptInputColumns``)
    :param script: preparation script; its ``steps`` are applied to the input
    :param preparation_output_schema: schema of the prepared data
    """
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    # NOTE(review): this listener is never used below; kept because the
    # constructor's side effects cannot be checked from this function.
    listener = ProgressListener()

    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))

    # Name remapping
    user_meta = dkujson.load_from_filepath(
        osp.join(model_folder, "user_meta.json"))
    cluster_name_map = {}
    if "clusterMetas" in user_meta:
        logging.info("Cluster metas: %s" % user_meta["clusterMetas"])
        for (cluster_id, cluster_data) in user_meta["clusterMetas"].items():
            cluster_name_map[cluster_id] = cluster_data["name"]

    preprocessing_handler = ClusteringPreprocessingHandler(
        {}, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline()

    with open(osp.join(model_folder, "clusterer.pkl"), "rb") as f:
        clf = pickle.load(f)

    try:
        logging.info("Post-processing model")
        clf.post_process(user_meta)
    except AttributeError:
        # method does not exist if model cannot be post-processed, just pass
        pass

    # Pick how a predicted label index becomes a cluster name: through the
    # model's own labels when it provides them, "cluster_<i>" otherwise.
    # In both cases the user-defined name map is applied on top.
    try:
        custom_labels = clf.get_cluster_labels()

        def map_fun_custom(i):
            name = custom_labels[i]
            return cluster_name_map.get(name, name)

        naming = map_fun_custom
    except AttributeError:

        def map_fun(i):
            name = "cluster_%i" % i
            return cluster_name_map.get(name, name)

        naming = map_fun

    def output_generator():
        """Yield one labelled dataframe per input batch."""
        logging.info("Start output generator ...")

        (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
            preparation_output_schema["columns"], parse_dates=True, infer_with_pandas=False)
        logging.info("Reading with INITIAL dtypes: %s" % dtypes)
        dtypes = utils.ml_dtypes_from_dss_schema(
            preparation_output_schema, preprocessing_params["per_feature"])
        logging.info("Reading with dtypes: %s" % dtypes)

        for input_df in input_dataset.iter_dataframes_forced_types(
                names, dtypes, parse_date_columns, chunksize=100000):
            # Give each batch a 0-based index.
            input_df.index = range(input_df.shape[0])
            # Keep an untouched copy: normalization modifies input_df.
            input_df_orig = input_df.copy()
            if recipe_desc.get("filterInputColumns", False):
                input_df_orig = input_df_orig[recipe_desc["keptInputColumns"]]

            logging.info("Got a dataframe : %s" % str(input_df.shape))
            normalize_dataframe(input_df, preprocessing_params['per_feature'])

            for col in input_df:
                logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

            logging.info("Processing it")
            transformed = pipeline.process(input_df)

            logging.info("Applying it")
            (labels_arr, additional_columns) = clustering_predict(modeling_params, clf, transformed)
            cluster_labels = pd.Series(labels_arr, name="cluster_labels").map(naming)
            # Re-align the labels on the rows that went through the pipeline.
            cluster_labels.index = transformed["TRAIN"].index

            # Left join: rows without a label get a missing value.
            final_df = pd.concat([
                input_df_orig.join(cluster_labels, how='left'),
                additional_columns
            ], axis=1)

            if preprocessing_params["outliers"]["method"] == "CLUSTER":
                outliers_cluster_name = cluster_name_map.get(
                    constants.CLUSTER_OUTLIERS, constants.CLUSTER_OUTLIERS)
                # Assign the filled column back: an in-place fillna on
                # final_df['cluster_labels'] acts on a column selection and
                # is not guaranteed to update final_df itself.
                final_df['cluster_labels'] = final_df['cluster_labels'].fillna(
                    outliers_cluster_name)

            logging.info("Done predicting it")
            yield final_df

    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("Starting writer")
    with output_dataset.get_writer() as writer:
        logging.info("Starting to iterate")
        for output_df in output_generator():
            logging.info("Generator generated a df %s" % str(output_df.shape))
            writer.write_dataframe(output_df)
            logging.info("Output df written")
def main(model_folder, input_dataset_smartname, output_dataset_smartname,
         metrics_dataset_smartname, recipe_desc, script,
         preparation_output_schema, cond_outputs=None):
    """Evaluate a saved prediction model on a dataset.

    The whole prepared input is read into one dataframe, preprocessed with
    the target-aware pipeline, scored, enriched with evaluation columns and
    written to the output dataset. Metrics are then computed and, when a
    metrics dataset name is given, written as well.

    :param model_folder: folder holding ``core_params.json``,
        ``rpreprocessing_params.json``, ``rmodeling_params.json``,
        ``collector_data.json``, ``clf.pkl`` and (for percentiles)
        ``perf.json``.
    :param input_dataset_smartname: name of the dataset to evaluate on.
    :param output_dataset_smartname: name of the dataset receiving scored rows.
    :param metrics_dataset_smartname: name of the metrics dataset; nothing is
        written when it is empty or ``None``.
    :param recipe_desc: recipe settings (``outputProbabilities``,
        ``outputProbaPercentiles``, ``forcedClassifierThreshold``,
        ``filterInputColumns``, ``keptInputColumns``).
    :param script: preparation script; its ``"steps"`` are applied to the input.
    :param preparation_output_schema: schema of the prepared data.
    :param cond_outputs: optional list of conditional outputs, each with
        ``input``, ``name``, ``rules`` and optionally ``defaultOutput``.
    :raises ValueError: if the model's prediction type is not supported.
    :raises Exception: if percentiles are requested but missing from the model.
    """
    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rmodeling_params.json"))
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))
    prediction_type = core_params["prediction_type"]

    preprocessing_handler = PreprocessingHandler.build(
        core_params, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data
    pipeline = preprocessing_handler.build_preprocessing_pipeline(
        with_target=True)

    # NOTE(review): pickle.load runs code embedded in the file; model_folder
    # must come from a trusted location.
    with open(osp.join(model_folder, "clf.pkl"), "rb") as f:
        clf = pickle.load(f)

    logging.info("Scoring data")
    (names, dtypes, parse_date_columns) = Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"], parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema, preprocessing_params["per_feature"],
        prediction_type=prediction_type)
    logging.info("Reading with dtypes: %s" % dtypes)

    # enumerate works on both Python 2 and 3 (xrange is Python 2 only).
    for i, name in enumerate(names):
        logging.info("Column %s = %s (dtype=%s)" % (i, name, dtypes.get(name, None)))

    with input_dataset._stream(infer_with_pandas=True, sampling='head',
                               sampling_column=None, limit=None, ratio=None,
                               columns=names) as stream:
        input_df = pd.read_table(stream, names=names, dtype=dtypes,
                                 header=None, sep='\t', doublequote=True,
                                 quotechar='"',
                                 parse_dates=parse_date_columns,
                                 float_precision="round_trip")

    input_df_orig = input_df.copy()
    logging.info("Got a dataframe : %s" % str(input_df.shape))
    normalize_dataframe(input_df, preprocessing_params['per_feature'])

    for col in input_df:
        logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

    logging.info("Processing it")
    transformed = pipeline.process(input_df)

    logging.info("Predicting it")
    if prediction_type == constants.BINARY_CLASSIFICATION:
        pred_df = binary_classification_predict(
            clf, pipeline, modeling_params, preprocessing_params,
            preprocessing_handler.target_map,
            recipe_desc["forcedClassifierThreshold"], input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

        # Probability percentile & Conditional outputs
        has_cond_output = recipe_desc["outputProbabilities"] and cond_outputs
        has_percentiles = recipe_desc["outputProbaPercentiles"] or (
            has_cond_output and
            any(co["input"] == "proba_percentile" for co in cond_outputs))

        if has_percentiles:
            model_perf = dkujson.load_from_filepath(
                osp.join(model_folder, "perf.json"))
            # .get() covers both "key absent" and "key present but empty".
            if model_perf.get("probaPercentiles"):
                percentile = pd.Series(model_perf["probaPercentiles"])
                # Label of the class mapped to 1 in the target map.
                positive_label = next(
                    k for k, v in preprocessing_handler.target_map.items()
                    if v == 1)
                proba_1 = "proba_" + str(positive_label)
                pred_df["proba_percentile"] = pred_df[proba_1].apply(
                    lambda p: percentile.where(percentile <= p).count() + 1)
            else:
                raise Exception(
                    "Probability percentiles are missing from model.")

        if has_cond_output:
            for co in cond_outputs:
                inp = pred_df[co["input"]]
                acc = inp.notnull()  # condition accumulator
                for r in co["rules"]:
                    if r["operation"] == 'GT':
                        cond = inp > r["operand"]
                    elif r["operation"] == 'GE':
                        cond = inp >= r["operand"]
                    elif r["operation"] == 'LT':
                        cond = inp < r["operand"]
                    elif r["operation"] == 'LE':
                        cond = inp <= r["operand"]
                    else:
                        # An unknown operation used to leave `cond` unbound
                        # (NameError on a first rule) or stale from the
                        # previous rule; skip such a rule explicitly.
                        logging.warning(
                            "Ignoring conditional output rule with unknown operation: %s"
                            % r["operation"])
                        continue
                    pred_df.loc[acc & cond, co["name"]] = r["output"]
                    # Rows matched by this rule are excluded from later ones.
                    acc = acc & (~cond)
                pred_df.loc[acc, co["name"]] = co.get("defaultOutput", "")

        if has_percentiles and not recipe_desc["outputProbaPercentiles"]:
            # was only for conditional outputs
            pred_df.drop("proba_percentile", axis=1, inplace=True)

    elif prediction_type == constants.MULTICLASS:
        pred_df = multiclass_predict(
            clf, pipeline, modeling_params, preprocessing_params,
            preprocessing_handler.target_map, input_df,
            output_probas=recipe_desc["outputProbabilities"],
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

    elif prediction_type == constants.REGRESSION:
        pred_df = regression_predict(
            clf, pipeline, modeling_params, input_df,
            # For ensemble model, we need to indicate that we have target, so that a target-aware pipeline is
            # selected. See 0c87605 for more information
            ensemble_has_target=True)

    else:
        raise ValueError("bad prediction type %s" % prediction_type)

    # add error information to pred_df
    y = transformed["target"]
    target_mapping = {}
    if prediction_type in [constants.BINARY_CLASSIFICATION, constants.MULTICLASS]:
        target_mapping = {
            label: int(class_id)
            for label, class_id in preprocessing_handler.target_map.items()
        }
    pred_df = add_evaluation_columns(prediction_type, pred_df, y, target_mapping)

    logging.info("Done predicting it")
    # Input columns that also exist in pred_df are dropped from the input side.
    if recipe_desc.get("filterInputColumns", False):
        candidate_columns = recipe_desc["keptInputColumns"]
    else:
        candidate_columns = input_df_orig.columns
    clean_kept_columns = [c for c in candidate_columns if c not in pred_df.columns]
    output_df = pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)

    # write scored data
    output_dataset = dataiku.Dataset(output_dataset_smartname)
    logging.info("writing scored data")
    output_dataset.write_from_dataframe(output_df)

    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    sample_weight = transformed["weight"] if with_sample_weight else None

    metrics_df = compute_metrics_df(prediction_type, target_mapping,
                                    modeling_params, output_df, recipe_desc, y,
                                    transformed["UNPROCESSED"], sample_weight)

    # write metrics dataset
    if metrics_dataset_smartname:
        metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
        logging.info("writing metrics data")
        metrics_dataset.write_from_dataframe(metrics_df)
# NOTE(review): the statements down to the metrics write use names that are
# not assigned anywhere in this span (output_list, input_df_list, y,
# prediction_type, target_mapping, modeling_params, recipe_desc and the
# *_smartname values). They look like the tail of a function whose header is
# not visible here -- confirm against the full file before relying on them.

# Merge the per-chunk results into single dataframes.
output_df = pd.concat(output_list)
input_df = pd.concat(input_df_list)

logging.info("writing scored data")
output_dataset = dataiku.Dataset(output_dataset_smartname)
output_dataset.write_from_dataframe(output_df)

# Compute and write Metrics Dataset
# Don't need to provide sample weight because not supported by KERAS backend
metrics_df = compute_metrics_df(prediction_type, target_mapping, modeling_params, output_df, recipe_desc, y, unprocessed=input_df, sample_weight=None)

logging.info("writing metrics data")
metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
metrics_dataset.write_from_dataframe(metrics_df)

# Script entry point: arguments 1-4 are passed to main() as given, arguments
# 5-8 are file paths whose content is loaded with dkujson before the call.
if __name__ == "__main__":
    read_dku_env_and_set()
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], dkujson.load_from_filepath(sys.argv[5]), dkujson.load_from_filepath(sys.argv[6]), dkujson.load_from_filepath(sys.argv[7]), dkujson.load_from_filepath(sys.argv[8]))
def main(model_folder, input_dataset_smartname, output_dataset_smartname,
         metrics_dataset_smartname, recipe_desc, script,
         preparation_output_schema, cond_outputs=None):
    """Evaluate a saved Keras model on a dataset.

    Scored chunks are pulled from ``scored_dataset_generator``, concatenated,
    written to the output dataset, and used to compute the metrics dataframe,
    which is written when a metrics dataset name is given.

    :param model_folder: folder holding ``core_params.json``,
        ``rpreprocessing_params.json``, ``actual_params.json`` and
        ``collector_data.json``.
    :param input_dataset_smartname: name of the dataset to evaluate on.
    :param output_dataset_smartname: name of the dataset receiving scored rows.
    :param metrics_dataset_smartname: name of the metrics dataset; nothing is
        written when it is empty or ``None``.
    :param recipe_desc: recipe settings, forwarded to the generator and to
        ``compute_metrics_df``.
    :param script: preparation script, forwarded to the generator.
    :param preparation_output_schema: schema of the prepared data.
    :param cond_outputs: optional conditional outputs, forwarded to the
        generator.
    """
    # Fetching information about the model
    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))
    prediction_type = core_params["prediction_type"]

    # The handler is only built here to obtain the target map.
    preprocessing_handler = PreprocessingHandler.build(
        core_params, preprocessing_params, model_folder)
    preprocessing_handler.collector_data = collector_data

    target_mapping = {}
    if prediction_type in [constants.BINARY_CLASSIFICATION, constants.MULTICLASS]:
        target_mapping = {
            label: int(class_id)
            for label, class_id in preprocessing_handler.target_map.items()
        }

    # Retrieving scored data with generator (in order to prevent from out of memory errors with
    # big preprocessing)
    output_generator = scored_dataset_generator(
        model_folder, input_dataset_smartname, recipe_desc, script,
        preparation_output_schema, cond_outputs,
        output_y=True, output_input_df=True,
        should_add_evaluation_columns=True)

    logging.info("Starting to iterate")
    y_list = []
    output_list = []
    input_df_list = []
    for output_dict in output_generator:
        output_list.append(output_dict["scored"])
        y_list.append(output_dict["y"])
        input_df_list.append(output_dict["input_df"])
        logging.info("Generator generated a df {}".format(
            str(output_dict["scored"].shape)))

    y = pd.concat(y_list)
    output_df = pd.concat(output_list)
    input_df = pd.concat(input_df_list)

    logging.info("writing scored data")
    output_dataset = dataiku.Dataset(output_dataset_smartname)
    output_dataset.write_from_dataframe(output_df)

    # Compute and write Metrics Dataset
    # Don't need to provide sample weight because not supported by KERAS backend
    metrics_df = compute_metrics_df(prediction_type, target_mapping,
                                    modeling_params, output_df, recipe_desc, y,
                                    unprocessed=input_df, sample_weight=None)

    # Same guard as the non-Keras evaluation main: only write metrics when a
    # metrics dataset name was actually given.
    if metrics_dataset_smartname:
        logging.info("writing metrics data")
        metrics_dataset = dataiku.Dataset(metrics_dataset_smartname)
        metrics_dataset.write_from_dataframe(metrics_df)
def aggregate_grid_dir(self):
    """Process every file found in ``self.grid_folder``, then delete it.

    Each entry is loaded with ``dkujson.load_from_filepath`` and handed to
    ``self.process_line``; the file is removed once that call has returned.
    """
    entries = os.listdir(self.grid_folder)
    for entry_name in entries:
        entry_path = os.path.join(self.grid_folder, entry_name)
        grid_point = dkujson.load_from_filepath(entry_path)
        self.process_line(grid_point)
        # Only reached when loading and processing did not raise.
        os.remove(entry_path)
def scored_dataset_generator(model_folder, input_dataset_smartname, recipe_desc,
                             script, preparation_output_schema, cond_outputs,
                             output_y=False, output_input_df=False,
                             should_add_evaluation_columns=False):
    """Yield scored chunks of a dataset using a saved Keras model.

    The input dataset is read in chunks of ``recipe_desc["batchSize"]`` rows
    (100 when absent), preprocessed, split per model input and passed to the
    Keras model.

    :param model_folder: folder holding ``core_params.json``,
        ``rpreprocessing_params.json``, ``collector_data.json``,
        ``actual_params.json`` and the Keras model file.
    :param input_dataset_smartname: name of the dataset to score.
    :param recipe_desc: recipe settings (``useGPU`` and GPU options,
        ``batchSize``, ``forcedClassifierThreshold``, ``outputProbabilities``,
        ``filterInputColumns``, ``keptInputColumns``).
    :param script: preparation script; its ``"steps"`` are applied to the input.
    :param preparation_output_schema: schema of the prepared data.
    :param cond_outputs: optional list of conditional outputs.
    :param output_y: build the pipeline with the target and add the target
        to each yielded dict under ``"y"``.
    :param output_input_df: add the unmodified input chunk under ``"input_df"``.
    :param should_add_evaluation_columns: add evaluation columns to the
        predictions; requires ``output_y``.
    :returns: a generator of dicts with key ``"scored"`` (kept input columns
        joined with the predictions) and, when requested, ``"y"`` and
        ``"input_df"``.
    :raises ValueError: if evaluation columns are requested without
        ``output_y``, or if the prediction type is not supported.
    """
    from keras.models import load_model
    from dataiku.doctor.deep_learning import gpu
    from dataiku.doctor.deep_learning.keras_utils import tag_special_features, split_train_per_input

    # Argument validation does not depend on the data: fail before loading
    # the model rather than on the first chunk.
    if should_add_evaluation_columns and not output_y:
        raise ValueError("Cannot add evaluation columns if not outputing Y")

    # Load GPU Options
    if recipe_desc["useGPU"]:
        gpu.load_gpu_options(
            recipe_desc["gpuList"],
            allow_growth=recipe_desc["gpuAllowGrowth"],
            per_process_gpu_memory_fraction=float(recipe_desc["perGPUMemoryFraction"]))
    else:
        gpu.deactivate_gpu()

    batch_size = recipe_desc.get("batchSize", 100)

    # Obtain a streamed result of the preparation
    input_dataset = dataiku.Dataset(input_dataset_smartname)
    logging.info("Will do preparation, output schema: %s" % preparation_output_schema)
    input_dataset.set_preparation_steps(script["steps"], preparation_output_schema)

    core_params = dkujson.load_from_filepath(
        osp.join(model_folder, "core_params.json"))
    preprocessing_params = dkujson.load_from_filepath(
        osp.join(model_folder, "rpreprocessing_params.json"))
    collector_data = dkujson.load_from_filepath(
        osp.join(model_folder, "collector_data.json"))
    modeling_params = dkujson.load_from_filepath(
        osp.join(model_folder, "actual_params.json"))["resolved"]
    prediction_type = core_params["prediction_type"]

    # Tagging special features to take them into account only in special_preproc_handler/special_pipeline
    per_feature = preprocessing_params["per_feature"]
    tag_special_features(per_feature)

    preproc_handler = PreprocessingHandler.build(
        core_params, preprocessing_params, model_folder)
    preproc_handler.collector_data = collector_data
    pipeline = preproc_handler.build_preprocessing_pipeline(with_target=output_y)
    target_map = preproc_handler.target_map

    logging.info("Loading model")
    model = load_model(osp.join(model_folder, constants.KERAS_MODEL_FILENAME))

    logging.info("Start output generator")

    (names, dtypes, parse_date_columns) = dataiku.Dataset.get_dataframe_schema_st(
        preparation_output_schema["columns"], parse_dates=True,
        infer_with_pandas=False)
    logging.info("Reading with INITIAL dtypes: %s" % dtypes)
    dtypes = utils.ml_dtypes_from_dss_schema(
        preparation_output_schema, preprocessing_params["per_feature"],
        prediction_type=prediction_type)
    logging.info("Reading with dtypes: %s" % dtypes)

    # enumerate works on both Python 2 and 3 (xrange is Python 2 only).
    for i, name in enumerate(names):
        logging.info("Column %s = %s (dtype=%s)" % (i, name, dtypes.get(name, None)))

    # Everything below depends only on the model and the recipe, not on the
    # chunk, so it is computed once instead of once per chunk.
    is_classification = prediction_type in [
        constants.MULTICLASS, constants.BINARY_CLASSIFICATION
    ]
    target_mapping = {}
    if is_classification:
        target_mapping = {
            label: int(class_id) for label, class_id in target_map.items()
        }
        inv_map = {
            int(class_id): label for label, class_id in target_map.items()
        }
        # Class labels ordered by their mapped integer id.
        classes = [class_label for (_, class_label) in sorted(inv_map.items())]
        proba_cols = ["proba_{}".format(c) for c in classes]

        # For Binary Classification: Must compute probas if conditional there are outputs that use them
        # Will be deleted afterwards (if outputProbabilities if False)
        # in binary_classif_scoring_add_percentile_and_cond_outputs
        probas_in_cond_outputs = bool(cond_outputs) and any(
            co["input"] in proba_cols for co in cond_outputs)
        use_probas = recipe_desc["outputProbabilities"] or probas_in_cond_outputs

    for input_df in input_dataset.iter_dataframes_forced_types(
            names, dtypes, parse_date_columns, chunksize=batch_size):
        input_df.index = range(input_df.shape[0])
        input_df_orig = input_df.copy()
        logging.info("Got a dataframe chunk : %s" % str(input_df.shape))
        normalize_dataframe(input_df, preprocessing_params['per_feature'])

        for col in input_df:
            logging.info("NORMALIZED: %s -> %s" % (col, input_df[col].dtype))

        logging.info("Processing chunk")
        transformed = pipeline.process(input_df)
        features_X = transformed["TRAIN"]

        inputs_dict = split_train_per_input(
            features_X, per_feature, pipeline.generated_features_mapping)

        if is_classification:
            if prediction_type == constants.MULTICLASS:
                probas_raw = model.predict(inputs_dict)
                preds = np.argmax(probas_raw, axis=1)
            else:
                if modeling_params["keras"]["oneDimensionalOutput"]:
                    # The model outputs a single column, used as the
                    # probability of class 1; rebuild the two-column layout.
                    probas_one = np.squeeze(model.predict(inputs_dict), axis=1)
                    probas_raw = np.zeros((probas_one.shape[0], 2))
                    probas_raw[:, 1] = probas_one
                    probas_raw[:, 0] = 1 - probas_one
                else:
                    probas_raw = model.predict(inputs_dict)
                    probas_one = probas_raw[:, 1]

                threshold = recipe_desc["forcedClassifierThreshold"]
                # Builtin int instead of np.int: the alias was removed from
                # recent NumPy releases and always meant the builtin.
                preds = (probas_one > threshold).astype(int)

            (nb_rows, nb_present_classes) = probas_raw.shape
            logging.info("Probas raw shape %s/%s target_map=%s",
                         nb_rows, nb_present_classes, len(target_map))

            # Map integer class ids back to the original labels.
            preds_remapped = np.zeros(preds.shape, dtype="object")
            for (mapped_value, original_value) in inv_map.items():
                idx = (preds == mapped_value)
                preds_remapped[idx] = original_value

            pred_df = pd.DataFrame({"prediction": preds_remapped})
            pred_df.index = features_X.index

            if use_probas:
                proba_df = pd.DataFrame(probas_raw, columns=proba_cols)
                proba_df.index = features_X.index
                pred_df = pd.concat([proba_df, pred_df], axis=1)

            if prediction_type == constants.BINARY_CLASSIFICATION:
                pred_df = binary_classif_scoring_add_percentile_and_cond_outputs(
                    pred_df, recipe_desc, model_folder, cond_outputs, target_map)

        elif prediction_type == constants.REGRESSION:
            preds = model.predict(inputs_dict)
            pred_df = pd.DataFrame({"prediction": np.squeeze(preds, axis=1)})
            pred_df.index = features_X.index

        else:
            # Previously fell through to a NameError on pred_df.
            raise ValueError("bad prediction type %s" % prediction_type)

        if should_add_evaluation_columns:
            # Keep the returned frame, as the non-Keras evaluation main does,
            # so the columns are present even if the helper returns a new
            # frame instead of modifying pred_df in place.
            pred_df = add_evaluation_columns(
                prediction_type, pred_df, transformed["target"], target_mapping)

        logging.info("Done predicting it")
        # Input columns that also exist in pred_df are dropped from the input side.
        if recipe_desc.get("filterInputColumns", False):
            candidate_columns = recipe_desc["keptInputColumns"]
        else:
            candidate_columns = input_df_orig.columns
        clean_kept_columns = [c for c in candidate_columns if c not in pred_df.columns]

        res = {
            "scored": pd.concat([input_df_orig[clean_kept_columns], pred_df], axis=1)
        }
        if output_y:
            res["y"] = transformed["target"]
        if output_input_df:
            res["input_df"] = input_df_orig

        yield res