Example #1
    def save(self, pd_result):
        iperf = dkujson.load_from_filepath(
            os.path.join(self.folder, "iperf.json"))

        if "partialDependencies" not in iperf:
            iperf["partialDependencies"] = []

        for partial_dep in iperf["partialDependencies"]:
            if partial_dep.get('feature') == pd_result.feature.name:
                iperf["partialDependencies"].remove(partial_dep)
                break

        new_partial_dependence = {
            "data": list(pd_result.partial_dependence),
            "feature": pd_result.feature.name,
            "distribution": pd_result.distribution,
            "computedPostTraining": True,
            "isDate": self.dtypes[pd_result.feature.name] == "date",
            "unrepresentedModalities": pd_result.unrepresented_modalities,
        }

        if pd_result.indices_to_drop is not None:
            new_partial_dependence["indicesToDrop"] = pd_result.indices_to_drop

        if pd_result.feature.type == 'CATEGORY':
            new_partial_dependence["categories"] = list(pd_result.scale)
        elif pd_result.feature.type == 'NUMERIC':
            new_partial_dependence["featureBins"] = list(pd_result.scale)

        iperf["partialDependencies"].append(new_partial_dependence)
        dkujson.dump_to_filepath(os.path.join(self.folder, "iperf.json"),
                                 iperf)

        return iperf
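
Note: every example on this page funnels through the same pair of helpers, dkujson.load_from_filepath(path) and dkujson.dump_to_filepath(path, obj) (plus dkujson.loads for strings). As a rough mental model only — a minimal sketch assuming they are thin wrappers around the standard json module; the real dataiku.core.dkujson presumably also coerces numpy scalars/arrays and other non-JSON-native values that appear in these examples — they can be pictured as:

import json

def load_from_filepath(path):
    # Parse a JSON file into plain Python dicts/lists (the apparent contract).
    with open(path, "r") as f:
        return json.load(f)

def dump_to_filepath(path, obj):
    # Serialize obj to a JSON file. Sketch only: the real helper presumably
    # also handles numpy types (histogram counts, means, ...) that plain
    # json.dump would reject with a TypeError.
    with open(path, "w") as f:
        json.dump(obj, f)
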
Example #2
def command(job_id, split_desc, core_params, preprocessing_folder, model_folder, computation_parameters):

    # LOADING INFO #

    model_handler = PredictionModelInformationHandler(split_desc, core_params, preprocessing_folder, model_folder)

    test_df = model_handler.get_test_df()

    # COMPUTING SUBPOPULATION #

    col_name = get_computation_parameter("column", computation_parameters)
    col_type = get_type_of_column(col_name, model_handler)

    if col_type == "CATEGORY":
        value = get_computation_parameter("value", computation_parameters)
        subpop_df = test_df[test_df[col_name] == value]
    else:
        raise NotImplementedError("Not implemented yet :-(")

    # COMPUTING NEW METRICS ON SUBPOP #

    prediction_type = model_handler.get_prediction_type()

    if prediction_type == constants.BINARY_CLASSIFICATION:
        results = compute_binary_subpopulation_metrics(subpop_df, model_handler)
    else:
        raise NotImplementedError("Not implemented yet :-(")

    dkujson.dump_to_filepath(osp.join(model_folder, "subpop.json"), results)

    return "ok"
Example #3
    def report(self, pipeline):
        report = {}
        if hasattr(self, "core_params"):
            pipeline.report_fit(report, self.core_params)
        else:
            pipeline.report_fit(report, {})
        dkujson.dump_to_filepath(
            osp.join(self.data_path, "preprocessing_report.json"), report)
Example #4
    def __exit__(self, typ, val, tb):
        self._watching = False
        self.join()

        if self.m_folder is not None:
            self.aggregate_grid_dir()
            dkujson.dump_to_filepath(self.grid_search_file,
                                     self.grid_search_summary)
        self.cleanup()
Example #5
def write_running_traininfo(folder, start_time, listener):
    status_filepath = osp.join(folder, "train_info.json")
    if osp.exists(status_filepath):
        status = dkujson.load_from_filepath(status_filepath)
    else:
        status = {}

    status["state"] = "RUNNING"
    status["startTime"] = start_time
    status["progress"] = listener.to_jsonifiable()
    dkujson.dump_to_filepath(status_filepath, status)
Example #6
    def score(self):
        logging.info("Intrinsic scoring of clustering model")
        if self.modeling_params['algorithm'] in ['PY_TWO_STEP']:
            dkujson.dump_to_filepath(self.pk_path('hierarchy.json'),
                                     self.clf.to_json(self.train_X, self._extract_rescalers()))

        # anomaly detection
        if self.modeling_params['algorithm'] in ['PY_ISOLATION_FOREST']:
            columns_to_keep = [s for s in list(set(self.profiling_df.columns) - (set(self.train_X.columns) | set(["cluster_labels"]))) if s[:6]!="dummy:"]
            extra_columns_df = self.profiling_df[columns_to_keep]
            # if there are actually two clusters (regular and anomaly)
            if self.profiling_df["cluster_labels"].nunique() > 1:
                dkujson.dump_to_filepath(self.pk_path('anomalies.json'), self.clf.get_top_outliers(self.train_X, self._extract_rescalers(), extra_columns_df))
Example #7
    def _serialize_pipeline_meta(self, name):
        meta = {
            "backend": "KERAS" if self.modeling_params.get("algorithm") == "KERAS_CODE" else "PY_MEMORY",
            "algorithm_name": name,
            "columns": self.columns
        }
        if self.target_mapping is not None:
            # Because scikit-learn does its own class mapping, we have to remap here. So the final classes
            # will differ from the target_mapping if some classes were missing from the training set.
            inv_mapping = {v: k for k, v in self.target_mapping.items()}
            meta["classes"] = [inv_mapping[i] for i in self.clf.classes_]
        dkujson.dump_to_filepath(
            osp.join(self.model_folder, "dss_pipeline_meta.json"), meta)
Example #8
def write_done_traininfo(folder, start_time, start_training_time, end_time, listener, end_preprocessing_time=None):
    status_filepath = osp.join(folder, "train_info.json")
    if osp.exists(status_filepath):
        status = dkujson.load_from_filepath(status_filepath)
    else:
        status = {}

    status["state"] = "DONE"
    status["startTime"] = start_time
    status["endTime"] = end_time
    status["preprocessingTime"] = (end_preprocessing_time or start_training_time) - start_time
    status["trainingTime"] = end_time - start_training_time
    if isinstance(listener, ProgressListener):
        status["progress"] = listener.to_jsonifiable()
    else:
        status["progress"] = reduce(merge_listeners, listener)

    dkujson.dump_to_filepath(status_filepath, status)
Example #9
def save_prediction_model(clf, out_params, listener, update_fn, folder):
    import dataiku.doctor.constants as constants
    from dataiku.core import dkujson
    import os.path as osp
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

    with listener.push_state(constants.STATE_SAVING):
        update_fn()
        # UGLY
        if hasattr(clf, "scorer"):
            clf.scorer = None
            if "scorer" in clf.params:
                del clf.params["scorer"]
        with open(osp.join(folder, "clf.pkl"),
                  dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)
        dkujson.dump_to_filepath(osp.join(folder, "actual_params.json"),
                                 out_params)
Example #10
    def score(self):
        logging.info("Computing regression performance on %s\n", self.preds)
        self.ret["regression_performance"] = self.get_regression_performance(self.valid_Y, self.preds, self.sample_weight)

        # Scatter plot
        both = pd.DataFrame({
            "predicted": self.preds,
            "actual": self.valid_Y
        })
        nb_records = len(both.index)
        if nb_records < 1000:
            proba = 1
        else:
            proba = 1000.0 / nb_records

        s, m = pdu.split_train_valid(both, prop=proba, seed=42)
        self.ret["scatterPlotData"] = {"x": [], "y": []}
        for record in s.itertuples():
            self.ret["scatterPlotData"]["x"].append(float(record[1]))
            self.ret["scatterPlotData"]["y"].append(float("%.4f" % record[2]))

        # Metrics
        self.ret["metrics"] = compute_metrics(self.valid_Y, self.preds, self.sample_weight)

        if self.modeling_params["metrics"]["evaluationMetric"] == "CUSTOM":
            custom_scorefunc = get_custom_scorefunc(self.modeling_params, self.valid_unprocessed)
            self.ret["metrics"]["customScore"] = custom_scorefunc(self.valid_Y, self.preds, sample_weight=self.sample_weight)

        # Dump the predicted set
        if self.valid_X_index is not None:
            self.compute_predicted_data(self.preds, self.valid_X_index)

        # Dump the perf
        dkujson.dump_to_filepath(osp.join(self.out_folder, "perf.json"), self.ret)

        self.perf_data = self.ret

        return self.ret
Example #11
def clustering_train_score_save(transformed_src, src_index,
                                preprocessing_params, modeling_params,
                                run_folder, listener, update_fn, pipeline):
    """Trains one model and saves results to run_folder"""

    with listener.push_state(constants.STATE_FITTING):
        update_fn()
        (clf, out_params, cluster_labels,
         additional_columns) = clustering_fit(modeling_params, transformed_src)

    with listener.push_state(constants.STATE_SAVING):
        update_fn()
        with open(osp.join(run_folder, "clusterer.pkl"),
                  dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)
        dkujson.dump_to_filepath(osp.join(run_folder, "actual_params.json"),
                                 out_params)

    with listener.push_state(constants.STATE_SCORING):
        update_fn()
        ClusteringModelScorer(clf, transformed_src, src_index, cluster_labels,
                              preprocessing_params, modeling_params, pipeline,
                              run_folder).score()
Example #12
    def score(self):
        ret = self.iipd
        logging.info("Intrinsic scoring")

        if self.calibrate_proba:
            uncalibrated_clf = self.clf.base_estimator
        else:
            uncalibrated_clf = self.clf

        if self.modeling_params['algorithm'] in ['XGBOOST_CLASSIFICATION']:
            max_iterations = self.modeling_params['xgboost_grid']['n_estimators']
            best_iteration = uncalibrated_clf._Booster.best_iteration
            early_stopping_rounds = self.modeling_params['xgboost_grid']['early_stopping_rounds']
            ret["nBoostedEstimators"] = min(best_iteration + early_stopping_rounds, max_iterations)

        if 'feature_importances_' in dir(uncalibrated_clf):
            self.get_rf_raw_importance(uncalibrated_clf, ret)

            if self.modeling_params['algorithm'] in ['SCIKIT_MODEL']:
                # Make sure variable importances are normalized
                ri = ret["rawImportance"]
                weights_sum = sum(ri["importances"])
                if weights_sum != 0:
                    ri["importances"] = np.array(ri["importances"]) / float(weights_sum)

        # Regression coefficients for logit (binary only and only if not too many non-zero coef TODO @analysis)
        if self.modeling_params['algorithm'] in ['LOGISTIC_REGRESSION', 'SGD_CLASSIFICATION', 'LARS'] and uncalibrated_clf.coef_.shape[0] == 1:
            ret["lmCoefficients"] = _compute_coefs(uncalibrated_clf, self.train_X, self.prepared_X, self.train_y,
                                                   self._extract_rescalers())

        # Decision tree summary, not dumped in iperf json but separate file
        if self.modeling_params['algorithm'] in ['DECISION_TREE_CLASSIFICATION']:
            if not self.modeling_params.get("skipExpensiveReports"):
                logging.info("Creating decision tree summary")
                tree_summary = TreeSummaryBuilder(uncalibrated_clf, self.train_X.columns(), self._extract_rescalers(), False).build()
                dkujson.dump_to_filepath(osp.join(self.out_folder, "tree.json"), tree_summary)

        if self.modeling_params['algorithm'] == 'GBT_CLASSIFICATION':
            rescalers = self._extract_rescalers()
            # create tree summaries
            if not self.modeling_params.get("skipExpensiveReports"):
                logging.info("Creating gradient boosting trees summary")
                summary = GradientBoostingSummaryBuilder(uncalibrated_clf, self.train_X.columns(), rescalers, False,
                                                         self.modeling_params["max_ensemble_nodes_serialized"]).build()
                dkujson.dump_to_filepath(osp.join(self.out_folder, "trees.json"), summary)

            # Compute partial dependencies
            ret["partialDependencies"] = PartialDependencyPlotBuilder(uncalibrated_clf, self.train_X, self.train_y, rescalers).build()

        if self.modeling_params['algorithm'] == 'RANDOM_FOREST_CLASSIFICATION':
            if not self.modeling_params.get("skipExpensiveReports"):
                logging.info("Creating random forest trees summary")
                summary = RandomForestSummaryBuilder(uncalibrated_clf, self.train_X.columns(), self._extract_rescalers(), False,
                                                     self.modeling_params["max_ensemble_nodes_serialized"]).build()
                dkujson.dump_to_filepath(osp.join(self.out_folder, "trees.json"), summary)

        if self.modeling_params['algorithm'] == 'LARS':
            dkujson.dump_to_filepath(osp.join(self.out_folder, "coef_path.json"), {
                "path": [[[t for t in x] for x in c] for c in uncalibrated_clf.coef_path_],
                "features": self.train_X.columns(),
                "currentIndex": uncalibrated_clf.current_index
            })

        # Learning curve if requested
        if self.modeling_params["computeLearningCurves"]:
            logging.info("Computing learning curves")
            train_X, is_sparse = prepare_multiframe(self.train_X, self.modeling_params)
            train_nbsamples = train_X.shape[0]
            train_y = self.train_y.astype(int)

            train_sizes, train_scores, valid_scores = learning_curve(uncalibrated_clf, train_X, train_y)
            ret["learningCurve"] = {
                "samples" : train_sizes,
                "trainScoreMean" : np.mean(train_scores, axis=1),
                "trainScoreStd": np.std(train_scores, axis=1),
                "cvScoreMean" : np.mean(valid_scores, axis=1),
                "cvScoreStd":  np.std(valid_scores, axis=1)
            }

        ret["probaAware"] = is_proba_aware(self.modeling_params['algorithm'], uncalibrated_clf)

        # Dump the perf
        dkujson.dump_to_filepath(osp.join(self.out_folder, "iperf.json"), ret)
Example #13
def _dku_fit_and_score(estimator,
                       X,
                       y,
                       scorer,
                       train,
                       test,
                       verbose,
                       is_interruptible,
                       parameters,
                       cvwatcher,
                       fit_params,
                       error_score='raise',
                       m_folder=None,
                       split_id=None,
                       parameter_id=None,
                       sample_weight=None,
                       algo_supports_weight=True):
    if cvwatcher.is_interrupted and is_interruptible:
        return None

    current_thread = threading.current_thread()
    current_thread.name = "GS-%s" % (current_thread.ident)

    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        logging.info("Fit  p=%s s=%s: %s %s" % (parameter_id, split_id, msg,
                                                (64 - len(msg)) * '.'))

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = unix_time_millis()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}

    # XGBoost early stopping
    if fit_params.get("early_stopping_rounds") is not None:
        if fit_params.get("eval_set") is None:
            # log the train and test objective but optimize on the test (last tuple used for early stopping eval)
            fit_params["eval_set"] = [(X_train, y_train), (X_test, y_test)]
        else:
            pass  # still keep the possibility to use a fixed eval_set

    if sample_weight is not None:
        w_train, _ = _safe_split(estimator, sample_weight, y, train)
        w_test, _ = _safe_split(estimator, sample_weight, y, test)
        if algo_supports_weight:
            # fit with sample weights whenever they are enabled AND the algorithm supports them
            fit_params["sample_weight"] = np.array(w_train)

    fit_params = dict([(k, _dku_index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = unix_time_millis() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            train_score = error_score
            warnings.warn(
                "Classifier fit failed. The score on this train-test"
                " partition for these parameters will be set to %f. "
                "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = unix_time_millis() - start_time
        if sample_weight is not None:
            # score with sample weights whenever they are enabled, regardless of the support by the algorithm
            test_score = _dku_score(estimator,
                                    X_test,
                                    y_test,
                                    scorer,
                                    sample_weight=w_test,
                                    indices=test)
            train_score = _dku_score(estimator,
                                     X_train,
                                     y_train,
                                     scorer,
                                     sample_weight=w_train,
                                     indices=train)
        else:
            test_score = _dku_score(estimator,
                                    X_test,
                                    y_test,
                                    scorer,
                                    indices=test)
            train_score = _dku_score(estimator,
                                     X_train,
                                     y_train,
                                     scorer,
                                     indices=train)
        score_time = unix_time_millis() - start_time - fit_time
    if verbose > 1:
        end_msg = "%s (ft=%.1fs st=%.1fs sc=%s)" % (
            msg, fit_time / 1000, score_time / 1000, test_score)
        logging.info("Done p=%s s=%s: %s" % (parameter_id, split_id, end_msg))
    num_samples = _num_samples(X_test)
    best_iteration = getattr(estimator, 'best_iteration', None)
    ret = {
        "train_score": train_score,
        "test_score": test_score,
        "num_samples": num_samples,
        "fit_time": fit_time,
        "score_time": score_time,
        "time": fit_time + score_time,
        "parameters": parameters,
        "parameter_id": parameter_id,
        "grid_point_id": get_grid_point_id(parameters, split_id),
        "best_iteration": best_iteration,
        "done_at": unix_time_millis()
    }
    if m_folder is not None:
        tmp_file = os.path.join(
            m_folder, 'grid.tmp/grid_search_{}.{}.gridpoint'.format(
                parameter_id, split_id))
        dest_file = os.path.join(
            m_folder,
            'grid/grid_search_{}.{}.gridpoint'.format(parameter_id, split_id))
        dkujson.dump_to_filepath(tmp_file, ret)
        os.rename(tmp_file, dest_file)
    return ret
Example #14
def create_ensemble(split_desc, core_params, model_folder, preprocessing_folder, model_folders, preprocessing_folders):
    listener = ProgressListener()
    listener.add_future_steps(constants.ENSEMBLE_STATES)
    start = unix_time_millis()

    def update_preprocessing_state():
        utils.write_running_traininfo(model_folder, start, listener)

    split_desc = dkujson.loads(split_desc)
    core_params = dkujson.loads(core_params)
    weight_method = core_params.get("weight", {}).get("weightMethod", None)
    with_sample_weight = weight_method in {"SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    # TODO: update downstream
    with_class_weight = weight_method in {"CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"}
    preprocessing_folders = dkujson.loads(preprocessing_folders)
    model_folders = dkujson.loads(model_folders)
    modeling_params = dkujson.load_from_filepath(osp.join(model_folder, "rmodeling_params.json"))
    ensemble_params = modeling_params["ensemble_params"]
    logging.info("creating ensemble")
    with listener.push_state(constants.STATE_ENSEMBLING):
        update_preprocessing_state()
        from dataiku.doctor.prediction.ensembles import ensemble_from_fitted
        train = df_from_split_desc(split_desc, "train", ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        iperf = {
            "modelInputNRows" : train.shape[0], #todo : not the right count as may have dropped ...
            "modelInputNCols" : -1, # makes no sense for an ensemble as may have different preprocessings
            "modelInputIsSparse" : False
        }
        dkujson.dump_to_filepath(osp.join(model_folder, "iperf.json"), iperf)
        clf = ensemble_from_fitted(core_params, ensemble_params, preprocessing_folders, model_folders, train, with_sample_weight, with_class_weight)

    logging.info("saving model")
    with listener.push_state(constants.STATE_SAVING):
        update_preprocessing_state()
        with open(osp.join(model_folder, "clf.pkl"), dku_write_mode_for_pickling()) as f:
            pickle.dump(clf, f, 2)

    logging.info("scoring model")
    with listener.push_state(constants.STATE_SCORING):
        update_preprocessing_state()
        test = df_from_split_desc(split_desc, "test", ensemble_params["preprocessing_params"][0]["per_feature"],
                                   core_params["prediction_type"])
        # this is annoying, but we have to use one of the previous preprocessings in order to get the target
        prep_folder = preprocessing_folders[0]
        rppp = dkujson.load_from_filepath(osp.join(prep_folder, "rpreprocessing_params.json"))
        collector_data = dkujson.load_from_filepath(osp.join(prep_folder, "collector_data.json"))
        preprocessing_handler = PreprocessingHandler.build(core_params, rppp, prep_folder)
        preprocessing_handler.collector_data = collector_data
        pipe = preprocessing_handler.build_preprocessing_pipeline(with_target=True)
        transformed = pipe.process(test)
        y = transformed["target"]

        if with_sample_weight:
            sample_weight = transformed["weight"]
        else:
            sample_weight = None

        # Now that the CLF with scorable pipelines has been saved, set it in "pipelines with target" mode
        # to be able to compute metrics
        clf.set_with_target_pipelines_mode(True)

        pred = clf.predict(test)
        probas = None if core_params["prediction_type"] == "REGRESSION" else clf.predict_proba(test)
        target_map = None if core_params["prediction_type"] == "REGRESSION" else \
            {t["sourceValue"]: t["mappedValue"] for t in ensemble_params["preprocessing_params"][0]["target_remapping"]}
        prediction_type = core_params["prediction_type"]
        if prediction_type == "REGRESSION":
            RegressionModelScorer(modeling_params, clf, pred, y, model_folder, transformed, test.index.copy(), sample_weight).score()
        elif prediction_type == "BINARY_CLASSIFICATION":
            BinaryClassificationModelScorer(modeling_params, clf, model_folder, pred, probas, y, target_map, transformed, test.index.copy(), sample_weight).score()
        else:
            MulticlassModelScorer(modeling_params, clf, model_folder, pred, probas, y.astype(int), target_map, transformed, test.index.copy(), sample_weight).score()

    update_preprocessing_state()
    end = unix_time_millis()
    dkujson.dump_to_filepath(osp.join(model_folder, "actual_params.json"), {"resolved": modeling_params})
    dkujson.dump_to_filepath(osp.join(preprocessing_folder, "preprocessing_report.json"), {})
    utils.write_done_traininfo(model_folder, start, end, end, listener, end_preprocessing_time=start)

    return "ok"
Example #15
def write_preproc_file(run_folder, filename, obj):
    dkujson.dump_to_filepath(osp.join(run_folder, filename), obj)
Example #16
def write_model_status(modeling_set, status):
    status_filepath = osp.join(modeling_set["run_folder"], "train_info.json")
    dkujson.dump_to_filepath(status_filepath, status)
Example #17
    def score(self):
        ret = self.iipd

        logging.info("Intrinsic scoring")
        if self.modeling_params['algorithm'] == 'DECISION_TREE_REGRESSION':
            if not self.modeling_params.get("skipExpensiveReports"):
                logging.info("Creating decision tree summary")
                tree_summary = TreeSummaryBuilder(self.clf, self.train_X.columns(), self._extract_rescalers(), True).build()
                dkujson.dump_to_filepath(osp.join(self.out_folder, "tree.json"), tree_summary)

            rescalers = self._extract_rescalers()
            ret["partialDependencies"] = _dt_pdp(self.clf, self.train_X, self.train_y, rescalers)

        if self.modeling_params['algorithm'] == 'GBT_REGRESSION':
            rescalers = self._extract_rescalers()

            # Create decision tree summary
            if not self.modeling_params.get("skipExpensiveReports"):
                logging.info("Creating gradient boosting trees summary")
                summary = GradientBoostingSummaryBuilder(self.clf, self.train_X.columns(), rescalers, True,
                                                         self.modeling_params["max_ensemble_nodes_serialized"]).build()
                dkujson.dump_to_filepath(osp.join(self.out_folder, "trees.json"), summary)

            # Compute partial dependencies
            ret["partialDependencies"] = PartialDependencyPlotBuilder(self.clf, self.train_X, self.train_y, rescalers) \
                .build()

        if self.modeling_params['algorithm'] == 'RANDOM_FOREST_REGRESSION':
            rescalers = self._extract_rescalers()
            ret["partialDependencies"] = _rf_pdp(self.clf, self.train_X, self.train_y, rescalers)
            if not self.modeling_params.get("skipExpensiveReports"):
                logging.info("Creating random forest trees summary")
                summary = RandomForestSummaryBuilder(self.clf, self.train_X.columns(), self._extract_rescalers(), True,
                                                     self.modeling_params["max_ensemble_nodes_serialized"]).build()
                dkujson.dump_to_filepath(osp.join(self.out_folder, "trees.json"), summary)

        if self.modeling_params['algorithm'] in ['XGBOOST_REGRESSION']:
            max_iterations = self.modeling_params['xgboost_grid']['n_estimators']
            best_iteration = self.clf._Booster.best_iteration
            early_stopping_rounds = self.modeling_params['xgboost_grid']['early_stopping_rounds']
            ret["nBoostedEstimators"] = min(best_iteration + early_stopping_rounds, max_iterations)

        if self.modeling_params['algorithm'] == 'LARS':
            dkujson.dump_to_filepath(osp.join(self.out_folder, "coef_path.json"), {
                "path": [[[t] for t in x] for x in self.clf.coef_path_],
                "features": self.train_X.columns(),
                "currentIndex": self.clf.current_index
            })

        if 'feature_importances_' in dir(self.clf):
            self.get_rf_raw_importance(self.clf, ret)

            if self.modeling_params['algorithm'] in ['SCIKIT_MODEL']:
                # Make sure variable importances are normalized
                ri = ret["rawImportance"]
                weights_sum = sum(ri["importances"])
                if weights_sum != 0:
                    ri["importances"] = np.array(ri["importances"]) / float(weights_sum)

        # compute coefs if model has any, except for SVM and XGBOOST where _coef can be missing
        if 'coef_' in dir(self.clf) and self.modeling_params['algorithm'] not in {"SVM_REGRESSION", "XGBOOST_REGRESSION"}:
            ret["lmCoefficients"] = _compute_coefs(self.clf, self.train_X, self.prepared_X, self.train_y,
                                                   self._extract_rescalers())

        dkujson.dump_to_filepath(osp.join(self.out_folder, "iperf.json"), ret)
Example #18
def update_gridsearch_info(folder, grid_search_scores):
    status_filepath_tmp = osp.join(folder, "grid_search_scores.json.tmp")
    status_filepath = osp.join(folder, "grid_search_scores.json")
    dkujson.dump_to_filepath(status_filepath_tmp, grid_search_scores)
    os.rename(status_filepath_tmp, status_filepath)
Example #19
    def score(self, optimize_threshold = False):

        self.use_probas = is_proba_aware(self.modeling_params["algorithm"], self.clf)

        check_test_set_ok_for_classification(self.valid_y)

        # Not clear whether this is good or not ...
        # all_classes_in_test_set = np.unique(self.valid_y)
        # all_classes_in_pred = np.unique(self.preds)
        # logging.info("  IN TEST: %s" % all_classes_in_test_set)
        # logging.info("  IN PRED: %s" % all_classes_in_pred)
        # for cls in all_classes_in_pred:
        #     if not cls in all_classes_in_test_set:
        #         raise Exception("One of the classes predicted by the model (%s) is not in the test set. Cannot proceed." % (cls))


        # Compute unmapped preds
        if self.target_map:
            self.mapped_preds = np.zeros(self.preds.shape, dtype=object)
            for k, v in self.target_map.items():
                self.mapped_preds[self.preds == v] = k
        else:
            self.mapped_preds = self.preds

        # Confusion matrix
        self.ret["classes"] = self.classes
        self.ret["confusion"] = self.get_multiclass_confusion_matrix()
        logging.info("Calculated confusion matrix")

        # 1-vs-all ROC for proba-aware classifiers
        if self.use_probas:
            self.ret["oneVsAllRocAUC"] = {}
            self.ret["oneVsAllRocCurves"] = {}
            self.ret["oneVsAllCalibrationCurves"] = {}
            self.ret["oneVsAllCalibrationLoss"] = {}
            for class_selected in self.classes:
                class_selected_id = int(self.target_map[class_selected])
                logging.info("Make ROC, valid_y=%s" %  self.valid_y)
                logging.info("Make ROC, probas=%s"  % self.probas[:,class_selected_id])

                try:
                    false_positive_rates, true_positive_rates, thresholds = \
                        roc_curve(self.valid_y, self.probas[:, class_selected_id], class_selected_id, self.sample_weight)
                    roc_data = zip(false_positive_rates, true_positive_rates, thresholds)
                    logging.info("AUC %s %s" % (false_positive_rates, true_positive_rates))
                    self.ret["oneVsAllRocCurves"][class_selected] = [{"x": x, "y": y, "p": p}
                                                                     for (x, y, p) in trim_curve(roc_data)]
                    self.ret["oneVsAllRocAUC"][class_selected] = auc(false_positive_rates, true_positive_rates)
                except Exception as e:
                    logging.error(e)
                    continue
                finally:
                    try:
                        y_bin = (self.valid_y.values == int(class_selected_id)).astype(int)
                        freqs, avg_preds, weights = dku_calibration_curve(y_bin, self.probas[:,int(class_selected_id)], n_bins=10, sample_weight=self.sample_weight)
                        zipped = [(t, p, n) for (t, p, n) in zip(freqs, avg_preds, weights) if not np.isnan(t + p + n)]
                        curve = [{"y": 0, "x": 0, "n": 0}] + [{"y": t, "x": p, "n": n} for (t, p, n) in zipped] + [{"y": 1, "x": 1, "n": 0}]
                        self.ret["oneVsAllCalibrationCurves"][class_selected] = curve
                        self.ret["oneVsAllCalibrationLoss"][class_selected] = dku_calibration_loss([x[0] for x in zipped], [x[1] for x in zipped], [x[2] for x in zipped])
                    except Exception as e:
                        logging.error(e)

            self.ret["densityData"] = format_all_proba_density(self.classes,
                        self.target_map, self.probas, self.valid_y, self.sample_weight)

        self.ret["metrics"] = {}

        if self.use_probas:
            self.ret["metrics"]["mrocAUC"] = mroc_auc_score(self.valid_y, self.probas, self.sample_weight)
            self.ret["metrics"]["mcalibrationLoss"] = sum(self.ret["oneVsAllCalibrationLoss"].values()) / len(self.classes)

        if self.modeling_params["metrics"]["evaluationMetric"] == "CUSTOM":
            custom_scorefunc = get_custom_scorefunc(self.modeling_params, self.valid_unprocessed)
            if self.modeling_params["metrics"]["customEvaluationMetricNeedsProba"]:
                self.ret["metrics"]["customScore"] = custom_scorefunc(self.valid_y, self.probas, sample_weight=self.sample_weight)
            else:
                self.ret["metrics"]["customScore"] = custom_scorefunc(self.valid_y, self.preds, sample_weight=self.sample_weight)

        self.ret["metrics"]["precision"] = precision_score(self.valid_y, self.preds, average='macro', pos_label=None, sample_weight=self.sample_weight)
        self.ret["metrics"]["recall"] = recall_score(self.valid_y, self.preds, average='macro', pos_label=None, sample_weight=self.sample_weight)
        self.ret["metrics"]["f1"] = f1_score(self.valid_y, self.preds, average='macro', pos_label=None, sample_weight=self.sample_weight)
        self.ret["metrics"]["accuracy"] = accuracy_score(self.valid_y, self.preds, sample_weight=self.sample_weight)
        self.ret["metrics"]["hammingLoss"] = hamming_loss(self.valid_y, self.preds, sample_weight=self.sample_weight)

        try:
            self.ret["metrics"]["logLoss"] = log_loss(self.valid_y, self.probas, sample_weight=self.sample_weight)
        except Exception:
            # log loss only possible if all classes found, not always the case ...
            pass

        # Dump the predicted set
        if self.valid_X_index is not None:
            if self.use_probas:
                proba_df = pd.DataFrame(self.probas, columns = ["proba_%s" %x for x in self.classes])
                pred_df = pd.DataFrame({"prediction": self.mapped_preds})
                out_df = pd.concat([proba_df, pred_df], axis=1)
                # Realign
                out_df.index = self.valid_X_index
                full = pd.DataFrame(index = self.test_df_index)
                out_df = full.join(out_df, how="left")
                out_df.to_csv(self.out_folder +"/predicted.csv", sep="\t", header=True, index=False, encoding='utf-8')
                self.predicted_df = out_df
            else:
                pred_df = pd.DataFrame({"prediction": self.mapped_preds})
                # Realign
                pred_df.index = self.valid_X_index
                full = pd.DataFrame(index = self.test_df_index)
                pred_df = full.join(pred_df, how="left")
                pred_df.to_csv(self.out_folder +"/predicted.csv", sep="\t", header=True, index=False, encoding='utf-8')
                self.predicted_df = pred_df

        # Dump the perf
        self.ret = remove_all_nan(self.ret)
        self.perf_data = self.ret
        dkujson.dump_to_filepath(osp.join(self.out_folder, "perf.json"), self.ret)

        return self.ret
Example #20
    def score(self):
        if self.use_probas:
            optimize_threshold = self.modeling_params["autoOptimizeThreshold"]
            forced_threshold = self.modeling_params["forcedClassifierThreshold"]

            # Compute probas on classifier and create cut data
            (nb_rows, nb_present_classes) = self.probas.shape
            logging.info("Probas raw shape %s/%s target_map=%s", nb_rows, nb_present_classes, len(self.target_map))
            new_probas = np.zeros((nb_rows, len(self.target_map)))
            if not self.ignore_num_classes:
                for j in range(nb_present_classes):
                    actual_class_id = self.clf.classes_[j]
                    new_probas[:, actual_class_id] = self.probas[:, j]
                self.probas = new_probas

            # Compute all per-cut data
            probas_one = pd.Series(data=self.probas[:, 1], name='predicted')
            pcd = { "cut" : [], "tp" : [], "tn" : [], "fp":[], "fn":[],
                "precision":[], "recall": [], "accuracy": [], "f1" :[], "mcc" :[], "hammingLoss" :[]}

            # np.sort shouldn't be necessary but works around a microbug leading to non-monotonous percentiles.
            # See https://github.com/numpy/numpy/issues/10373
            # Percentiles could include [..., a, b, a, ...] with b < a at the 15 or 16th decimal place,
            # which could lead to different probaPercentile results at prediction time.
            self.ret["probaPercentiles"] = np.sort(probas_one.quantile([float(x + 1) / 100 for x in range(99)]).values)

            custom_scorefunc = None

            if self.modeling_params["metrics"]["evaluationMetric"] == "CUSTOM":
                pcd["customScore"] = []
                custom_scorefunc = get_custom_scorefunc(self.modeling_params, self.valid_unprocessed)
                custom_needsproba = self.modeling_params["metrics"]["customEvaluationMetricNeedsProba"]

            for cut in np.arange(0.0, 1.0, 0.025):
                decision = probas_one > cut
                pcd["cut"].append(cut)
                conf = confusion_matrix(self.valid_y, decision, sample_weight=self.sample_weight)
                pcd["tp"].append(conf[1,1])
                pcd["tn"].append(conf[0,0])
                pcd["fp"].append(conf[0,1])
                pcd["fn"].append(conf[1,0])

                pcd["precision"].append(1.0 if conf[1,1] == 0 and conf[0,1] == 0 \
                        else precision_score(self.valid_y, decision, sample_weight=self.sample_weight))
                pcd["recall"].append(recall_score(self.valid_y, decision, sample_weight=self.sample_weight))
                pcd["f1"].append(f1_score(self.valid_y, decision, sample_weight=self.sample_weight))
                pcd["accuracy"].append(accuracy_score(self.valid_y, decision, sample_weight=self.sample_weight))
                pcd["mcc"].append(matthews_corrcoef(self.valid_y, decision, sample_weight=self.sample_weight))
                pcd["hammingLoss"].append(hamming_loss(self.valid_y, decision, sample_weight=self.sample_weight))

                if custom_scorefunc is not None and not custom_needsproba:
                    decision_with_valid_index = decision.copy()
                    decision_with_valid_index.index = self.valid_y.index
                    ret = custom_scorefunc(self.valid_y, decision_with_valid_index, sample_weight=self.sample_weight)
                    if ret is None:
                        pcd["customScore"].append(0)
                    else:
                        pcd["customScore"].append(ret)

            self.ret["perCutData"] = pcd

            if optimize_threshold:
                best_cut = compute_otimized_threshold(self.valid_y, self.probas, self.modeling_params,
                                                      self.sample_weight)
                self.ret["optimalThreshold"] = best_cut
                used_threshold = best_cut
            else:
                used_threshold = forced_threshold

            self.ret["usedThreshold"] = used_threshold

            # Compute predictions based on the threshold
            probas_one = pd.Series(data=self.probas[:, 1], name='predicted')
            self.preds = (probas_one > used_threshold).astype(int)

        else:
            pass #todo : remove branching if we don't need the pandas series cast
            # No probas on clf, compute predictions directly
            # self.preds = pd.Series(self.clf.predict(self.valid_X).astype(np.int))

        if self.target_map:
            self.mapped_preds = np.zeros(self.preds.shape, dtype=object)
            logging.info("preds %s" % self.preds)
            logging.info("MAPPED SHAPE %s" % self.mapped_preds.shape)

            for k, v in self.target_map.items():
                v = int(v)
                logging.info("k=%s v=%s" % (k,v))
                mask = self.preds == v
                logging.info("Mask data %s", mask.values)
                logging.info("mapped pred %s" % self.mapped_preds.__class__)
                self.mapped_preds[mask.values] = k
        else:
            self.mapped_preds = self.preds

        logging.info("MAPPED PREDS %s" % self.mapped_preds)

        if self.use_probas:
            # Threshold-independent metrics
            self.ret["tiMetrics"] = {}
            self.ret["tiMetrics"]["auc"] = mroc_auc_score(self.valid_y, self.probas, sample_weight=self.sample_weight)
            self.ret["tiMetrics"]["logLoss"] = log_loss(self.valid_y, self.probas, sample_weight=self.sample_weight)

            self.ret["tiMetrics"]["lift"] = make_lift_score(self.modeling_params["metrics"])(self.valid_y, self.probas, sample_weight=self.sample_weight)

            if custom_scorefunc is not None and custom_needsproba:
                ret = custom_scorefunc(self.valid_y, self.probas, sample_weight=self.sample_weight)
                if ret is None:
                    ret = 0
                self.ret["tiMetrics"]["customScore"] = ret

            # ROC and Lift for proba-aware classifiers
            false_positive_rates, true_positive_rates, thresholds = roc_curve(self.valid_y, self.probas[:, 1],
                                                                              sample_weight=self.sample_weight)
            # full roc curve data
            roc_data = zip(false_positive_rates, true_positive_rates, thresholds)
            # trim the data as we don't need all points for visualization
            # in a single-element array for k-fold compatibility
            self.ret["rocVizData"] = [[{"x": x, "y": y, "p": p} for (x, y, p) in trim_curve(roc_data)]]

            predicted = pd.Series(data=self.probas[:, 1], name='predicted')
            with_weight = self.sample_weight is not None
            if with_weight:
                results = pd.DataFrame({"__target__": self.valid_y, "sample_weight": self.sample_weight}).join(predicted)
            else:
                results = pd.DataFrame({"__target__": self.valid_y}).join(predicted)

            lb = LiftBuilder(results, '__target__', 'predicted', with_weight)
            try:
                self.ret["liftVizData"] = lb.build()
            except Exception:
                logging.exception("Cannot compute Lift curve")

            # Probability density per actual class
            self.ret["densityData"] = format_all_proba_density(self.classes,
                        self.target_map, self.probas, self.valid_y, self.sample_weight)

            freqs, avg_preds, weights = dku_calibration_curve(self.valid_y.values, self.probas[:,1], sample_weight=self.sample_weight, n_bins=10)
            zipped = [(t, p, n) for (t, p, n) in zip(freqs, avg_preds, weights) if not np.isnan(t + p + n)]
            self.ret["calibrationData"] = [{"y": 0, "x": 0, "n": 0}] + [{"y": t, "x": p, "n": n} for (t, p, n) in zipped ] + [{"y": 1, "x": 1, "n": 0}]
            self.ret["tiMetrics"]["calibrationLoss"] = dku_nonan(dku_calibration_loss([x[0] for x in zipped], [x[1] for x in zipped], [x[2] for x in zipped]))

        # if self.probas is not None:
        #     self.add_metric("ROC - AUC Score", mroc_auc_score(self.valid_Y, self.probas), "From 0.5 (random model) to 1 (perfect model).")
        # if not self.multiclass and self.probas is not None:
        #     self.add_metric('Average Precision Score', average_precision_score(self.valid_Y, self.probas[:, 1]), "Average precision for all classes")
        # self.add_metric('Accuracy Score', accuracy_score(self.valid_Y, self.preds), "Proportion of correct predictions (positive and negative) in the sample")
        # self.add_metric('F1 Score', f1_score(self.valid_Y, self.preds), "Harmonic mean of Precision and Recall")
        # self.add_metric('Precision Score', precision_score(self.valid_Y, self.preds), "Proportion of correct 'positive' predictions in the sample")
        # self.add_metric('Recall Score', recall_score(self.valid_Y, self.preds), "Proportion of catched 'positive' actual records in the predictions")
        # #self.add_metric('Hinge Loss', hinge_loss(self.valid_Y, self.preds))
        # if not self.multiclass:
        #     self.add_metric('Matthews Correlation Coefficient', matthews_corrcoef(self.valid_Y, self.preds), "The MCC is a correlation coefficient between actual and predicted classifications; +1 is perfect, -1 means no correlation")
        # self.add_metric('Hamming Loss', hamming_loss(self.valid_Y, self.preds), "The Hamming loss is the fraction of labels that are incorrectly predicted. (The lower the better)")
        # #self.add_metric('Jaccard Similarity Score', jaccard_similarity_score(self.valid_Y, self.preds))
        # #self.add_metric('Zero One Loss', zero_one_loss(self.valid_Y, self.preds))
        # if self.probas is not None:
        #     self.add_metric('Log Loss', log_loss(self.valid_Y.values, self.probas), "Error metric that takes into account the predicted probabilities")

        # Dump the predicted set
        if self.valid_X_index is not None:
            if self.use_probas:
                proba_df = pd.DataFrame(self.probas, columns = ["proba_%s" %x for x in self.classes])
                # Realign
                proba_df.index = self.valid_X_index
                full = pd.DataFrame(index = self.test_df_index)
                proba_df = full.join(proba_df, how="left")

                proba_df.to_csv(self.out_folder +"/predicted.csv", sep="\t", header=True, index=False, encoding='utf-8')
                self.predicted_df = proba_df
            else:
                preds_remapped = np.zeros(self.preds.shape, dtype="object")
                for (mapped_value, original_value) in self.inv_map.items():
                    idx = (self.preds.values == mapped_value)
                    preds_remapped[idx] = original_value
                pred_df = pd.DataFrame({"prediction": preds_remapped})
                # Realign
                pred_df.index = self.valid_X_index
                full = pd.DataFrame(index = self.test_df_index)
                pred_df = full.join(pred_df, how="left")
                pred_df.to_csv(self.out_folder +"/predicted.csv", sep="\t", header=True, index=False, encoding='utf-8')
                self.predicted_df = pred_df

        # Dump the perf
        self.ret = remove_all_nan(self.ret)
        dkujson.dump_to_filepath(osp.join(self.out_folder, "perf.json"), self.ret)

        self.perf_data = self.ret
        return self.ret
Example #21
    def score(self):
        logging.info("Clustering scoring: Starting work")
        nb_clusters = len(np.unique(self.cluster_labels))

        # Metrics
        if hasattr(self.cluster_model, "inertia_"):
            self.ret["metrics"]["inertia"] = dku_nonan(self.cluster_model.inertia_)
        if nb_clusters > 1:
            self.ret["metrics"]["silhouette"] = self.silhouette_score()
        self.ret["metrics"]["nbClusters"] = dku_nonan(nb_clusters)

        # Importance
        self.ret["variables_importance"] = self.variables_importance()

        # Build profiling_df
        logging.info("Clustering scoring: building final profiling_df")

        cluster_labels = self.cluster_labels.map(lambda x: self.cluster_names[x])

        #Keep only cluster_names that actually appear in cluster_labels
        self.cluster_names = [cn for cn in self.cluster_names if cn in cluster_labels.unique()]

        self.profiling_df = self.profiling_df.join(cluster_labels)
        if set(self.train.columns).intersection(self.profiling_df.columns):
            # There was no PCA, so we append all columns from train to profiling
            # to get the dummies
            self.ret["reduce_vars"] = []
            train_with_suffixed = self.train.copy(False)
            train_with_suffixed.columns = [u"%s__fromtrain" % x for x in train_with_suffixed.columns]
            self.profiling_df = self.profiling_df.join(train_with_suffixed)
        else:
            # There was a PCA, so train only contains the PCA columns.
            self.ret["reduce_vars"] = list(self.train.columns)
            # We append train to get the factors in scatter plot
            self.profiling_df = self.profiling_df.join(self.train)
            # We append the PREPCA to profiling for the dummies
            train_with_suffixed = self.train_prepca.copy(False)
            train_with_suffixed.columns = [u"%s__fromtrain" % x for x in train_with_suffixed.columns]
            self.profiling_df = self.profiling_df.join(train_with_suffixed)

        # Dedup ...
        # I find it very stupid to have to do that while I just wanted to add some columns ...
        self.profiling_df = self.profiling_df[
            list(filter(lambda x: not x.endswith("__fromtrain"), self.profiling_df.columns))]

        self.nfact = self.profiling_df.columns
        nb_outliers = self.profiling_df.shape[0] - self.train.shape[0]
        self.fact = ['cluster_labels']
        logging.info("shape of train : %i,%i" % self.train.shape)
        logging.info("shape of global dataframe : %i,%i" % self.profiling_df.shape)

        add_cluster_outliers_label = False
        if self.preprocessing_params["outliers"]["method"] == "DROP":
            pass
            #self.profiling_df['cluster_labels'].dropna(inplace=True)
        elif self.preprocessing_params["outliers"]["method"] == "CLUSTER" and self.profiling_df['cluster_labels'].isnull().sum() > 0:
            self.profiling_df['cluster_labels'].fillna(constants.CLUSTER_OUTLIERS, inplace=True)
            add_cluster_outliers_label = True

        self.ret.update({
            "train_nb_records": self.train.shape[0],
            "train_nb_features": self.train.shape[1],
            "train_nb_outliers": nb_outliers
        })
        logging.info("Clustering scorer: final profiling_df %s" % str(self.profiling_df.shape))

        labels_df = pd.DataFrame({"cluster_labels": self.profiling_df["cluster_labels"]})
        #logging.info("Clustering scorer: labels_df: %s" % labels_df)
        # Realign
        # labels_df.index = self.transformed_source.index
        full = pd.DataFrame(index=self.source_index)
        labels_df = full.join(labels_df, how="left")
        # If model has additional scoring columns, fetch them
        if hasattr(self.cluster_model, "get_additional_scoring_columns"):
            additional_scoring_columns = self.cluster_model.get_additional_scoring_columns(self.train)
            labels_df = labels_df.join(additional_scoring_columns, how="left")

        labels_df.to_csv(self.results_path + "/clustered.csv", sep="\t", header=True, index=False, encoding='utf-8')

        self.cluster_labels = self.cluster_names  # this was and remains awful
        if add_cluster_outliers_label:
            self.cluster_labels.append(constants.CLUSTER_OUTLIERS)
        self.cluster_description()
        self.cluster_profiling()
        self.cluster_summary()
        logging.info("Done cluster desc/profiling/summary")
        self.build_scatter()
        self.build_numerical_cluster_stats()
        #If there is only one cluster, the heatmap is irrelevant
        if len(self.cluster_names) > 1:
            self.build_heatmap()
        self.build_facts()

        dkujson.dump_to_filepath(self.pk_path('results.json'), self.ret)

        # intrinsic scoring
        IntrinsicClusteringModelScorer(self.modeling_params, self.cluster_model, self.train,
                                       self.pipeline, self.results_path, self.profiling_df).score()
Example #22
def update_deep_learning_model_info(folder, model_info):
    status_filepath_tmp = osp.join(folder,
                                   "keras_model_training_info.json.tmp")
    status_filepath = osp.join(folder, "keras_model_training_info.json")
    dkujson.dump_to_filepath(status_filepath_tmp, model_info)
    os.rename(status_filepath_tmp, status_filepath)
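
Note: Examples #18 and #22 both write to a "*.tmp" file first and then os.rename() it onto the final path, so a concurrent reader of the JSON file never sees a partially written document (the rename is atomic on POSIX filesystems). A generic version of that pattern, as a hedged sketch (atomic_dump is not an actual dkujson helper, just an illustration of the idiom):

import os
import os.path as osp

from dataiku.core import dkujson

def atomic_dump(folder, filename, obj):
    # Write next to the destination, then atomically swap the file into place.
    tmp_path = osp.join(folder, filename + ".tmp")
    final_path = osp.join(folder, filename)
    dkujson.dump_to_filepath(tmp_path, obj)
    os.rename(tmp_path, final_path)
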
Example #23
    def cluster_profiling(self):
        cluster_profiling = []

        # aggs = [np.min, np.max, np.median, percentile(25), percentile(75)]
        def profile_numerical(vals, scale):
            vals = np.array(vals)
            vals_no_nan = vals[~np.isnan(vals)]
            nb_rows = vals_no_nan.shape[0]
            if nb_rows < 2:
                return {
                    "min": None,
                    "max": None,
                    "median": None,
                    "percentile25": None,
                    "percentile75": None,
                    "percentile9": None,
                    "percentile91": None,
                    "std": None,
                    "distribution": None,
                    "total_no_nan": nb_rows,
                    "max_ratio": 0.0,
                    "total": vals.shape[0]
                }
            else:
                percentile = make_percentile(vals_no_nan)
                distribution = np.histogram(vals_no_nan, scale)[0]
                max_ratio = distribution.max() / float(nb_rows)
                # TODO use the interpolation option in numpy 1.9
                return {
                    "min": np.min(vals_no_nan),
                    "max": np.max(vals_no_nan),
                    "median": float(percentile(50)),
                    "percentile25": float(percentile(25)),
                    "percentile75": float(percentile(75)),
                    "percentile9": float(percentile(9)),
                    "percentile91": float(percentile(91)),
                    "std": np.std(vals_no_nan),
                    "distribution": distribution,
                    "max_ratio": max_ratio,
                    "total_no_nan": nb_rows,
                    "total": vals.shape[0]
                }

        def profile_categorical(vals, categories):
            nb_rows = vals.shape[0]
            if nb_rows == 0:
                return {
                    "distribution": None,
                    "max_ratio": 0.0,
                    "total_no_nan": nb_rows,
                    "total": nb_rows
                }
            else:
                counts = value_counts(vals, n_most_common=30)
                distribution = [
                    {
                        "label": category,
                        "total_no_nan": counts.get(category, 0),
                        "ratio": counts.get(category, 0) / float(nb_rows)
                    }
                    for category in categories
                    ]
                max_ratio = max(counts.values()) / float(nb_rows)
                return {
                    "distribution": distribution,
                    "max_ratio": max_ratio,
                    "total": nb_rows,
                    "total_no_nan": nb_rows
                }

        # add source variables
        if len(self.nfact) >= 2:  # cause 'cluster' in it anyway.
            profiling_df = self.profiling_df[self.nfact]
            cluster_labels = profiling_df["cluster_labels"]
            cluster_names = self.cluster_labels  # sorted(np.unique(cluster_labels))
            for col in profiling_df.columns:
                logging.info("Study profiling column: %s dtype=%s" % (col, profiling_df[col].dtype))
                if col == "cluster_labels":
                    continue
                if col.startswith("factor_"):
                    continue
                if col.startswith("dummy:"):
                    continue
                # if col.endswith("")

                col_profiling = {"variable": col}
                per_cluster = []
                col_profiling["per_cluster"] = per_cluster
                if float in profiling_df[col].dtype.type.mro() or int in profiling_df[col].dtype.type.mro():
                    logging.info("  It's a float")
                    col_profiling["type"] = "numerical"
                    cluster_profiling.append(col_profiling)
                    col_vals = profiling_df[col]
                    col_vals_no_na = no_nan(col_vals)
                    percentile = make_percentile(col_vals_no_na)
                    scale_start = percentile(0)
                    scale_stop = percentile(100)
                    max_ratio = 0.01
                    col_profiling["scale"] = {
                        "min": scale_start,
                        "max": scale_stop,
                    }
                    if scale_stop - scale_start == 0:
                        logging.info("This variable has no variance")
                        col_profiling["no_variance"] = True
                        continue
                    scale = np.linspace(scale_start, scale_stop, num=61)
                    col_profiling["global"] = profile_numerical(col_vals, scale)
                    max_ratio = max(max_ratio, col_profiling["global"]["max_ratio"])
                    for cluster_label in cluster_names:
                        filtered_col_vals = np.array(col_vals[cluster_labels == cluster_label])
                        cluster_profile = profile_numerical(filtered_col_vals, scale)
                        max_ratio = max(max_ratio, cluster_profile["max_ratio"])
                        cluster_profile["cluster_name"] = cluster_label
                        per_cluster.append(cluster_profile)
                    col_profiling["scale"]["max_ratio"] = max_ratio
                else:
                    col_profiling["type"] = "categorical"
                    logging.info("  It's a cat")
                    # categorical stuff.
                    col_vals = profiling_df[col]
                    global_counts = value_counts(col_vals, n_most_common=30)
                    # global_counts contains the counts for the category values we break down on
                    mask = col_vals.isin(global_counts.keys())
                    if None in global_counts:
                        mask |= col_vals.isnull()
                    col_vals = col_vals[mask]
                    cluster_profiling.append(col_profiling)
                    col_profiling["global"] = profile_categorical(col_vals, global_counts.keys())
                    max_ratio = 0.0
                    for cluster_label in cluster_names:
                        filtered_col_vals = col_vals[cluster_labels == cluster_label]
                        cluster_profile = profile_categorical(filtered_col_vals, global_counts.keys())
                        cluster_profile["cluster_name"] = cluster_label
                        max_ratio = max(max_ratio, cluster_profile["max_ratio"])
                        per_cluster.append(cluster_profile)
                    scale = {"max_ratio": max_ratio}
                    col_profiling["scale"] = scale
                    scale["categories"] = list(global_counts.keys())

        dkujson.dump_to_filepath(self.pk_path('profiling.json'), cluster_profiling)
        logging.info("DONE cluster profiling")