Example #1
def command(job_id, split_desc, core_params, preprocessing_folder, model_folder, computation_parameters):

    # LOADING INFO #

    model_handler = PredictionModelInformationHandler(split_desc, core_params, preprocessing_folder, model_folder)

    test_df = model_handler.get_test_df()

    # COMPUTING SUBPOPULATION #

    col_name = get_computation_parameter("column", computation_parameters)
    col_type = get_type_of_column(col_name, model_handler)

    if col_type == "CATEGORY":
        value = get_computation_parameter("value", computation_parameters)
        subpop_df = test_df[test_df[col_name] == value]
    else:
        raise NotImplementedError("Not implemented yet :-(")

    # COMPUTING NEW METRICS ON SUBPOP #

    prediction_type = model_handler.get_prediction_type()

    if prediction_type == constants.BINARY_CLASSIFICATION:
        results = compute_binary_subpopulation_metrics(subpop_df, model_handler)
    else:
        raise NotImplementedError("Not implemented yet :-(")

    dkujson.dump_to_filepath(osp.join(model_folder, "subpop.json"), results)

    return "ok"
Example #2
def get_drift_metrics():
    try:
        model_id = request.args.get('model_id')
        version_id = request.args.get('version_id')
        test_set = request.args.get('test_set')
        new_test_df = dataiku.Dataset(test_set).get_dataframe(
            bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)

        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        drifter = DriftAnalyzer()
        drifter.fit(new_test_df, model_accessor=model_accessor)
        return json.dumps(drifter.get_drift_metrics_for_webapp(),
                          allow_nan=False,
                          default=convert_numpy_int64_to_int)
    except:
        logger.error(traceback.format_exc())
        return traceback.format_exc(), 500
Example #3
def get_histograms(model_id, version_id, advantageous_outcome,
                   sensitive_column):

    fmi = get_webapp_config().get("trainedModelFullModelId")
    if fmi is None:
        model = dataiku.Model(model_id)
        model_handler = get_model_handler(model, version_id=version_id)
        model_accessor = ModelAccessor(model_handler)
    else:
        original_model_handler = PredictionModelInformationHandler.from_full_model_id(
            fmi)
        model_accessor = ModelAccessor(original_model_handler)

    raw_test_df = model_accessor.get_original_test_df()
    test_df = raw_test_df.dropna(subset=[sensitive_column])
    target_variable = model_accessor.get_target_variable()

    y_true = test_df.loc[:, target_variable]
    pred_df = model_accessor.predict(test_df)
    y_pred = pred_df.loc[:, DkuWebappConstants.PREDICTION]

    advantageous_outcome_proba_col = 'proba_{}'.format(advantageous_outcome)
    y_pred_proba = pred_df.loc[:, advantageous_outcome_proba_col]
    sensitive_feature_values = test_df[sensitive_column]

    return get_histogram_data(y_true, y_pred, y_pred_proba,
                              advantageous_outcome, sensitive_feature_values)
Example #4
def get_value_list(model_id, version_id, column):
    try:
        if column == 'undefined' or column == 'null':
            raise ValueError('Please choose a column.')

        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        test_df = model_accessor.get_original_test_df()
        value_list = test_df[column].unique().tolist()  # should check for categorical variables?
        filtered_value_list = remove_nan_from_list(value_list)

        if len(filtered_value_list) > DkuWebappConstants.MAX_NUM_CATEGORIES:
            raise ValueError(
                'Column "{2}" is either of numerical type or has too many categories ({0}). Max {1} are allowed.'
                .format(len(filtered_value_list),
                        DkuWebappConstants.MAX_NUM_CATEGORIES, column))

        return simplejson.dumps(filtered_value_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except:
        logger.error("When trying to call get-value-list endpoint: {}.".format(
            traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
Example #5
def check_model_type(model_id, version_id):
    try:
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        if model_accessor.get_prediction_type() in [
                DkuModelAccessorConstants.REGRESSION_TYPE,
                DkuModelAccessorConstants.CLUSTERING_TYPE
        ]:
            raise ValueError(
                'Model Fairness Report only supports binary classification models.'
            )
        return 'ok'
    except:
        logger.error(
            "When trying to call check-model-type endpoint: {}.".format(
                traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
Example #6
def get_outcome_list(model_id, version_id):
    try:
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        # note: sometimes when the dataset is very unbalanced, the original_test_df does not have all the target values
        test_df = model_accessor.get_original_test_df()
        target = model_accessor.get_target_variable()
        outcome_list = test_df[target].unique().tolist()
        filtered_outcome_list = remove_nan_from_list(outcome_list)
        return simplejson.dumps(filtered_outcome_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except:
        logger.error(
            "When trying to call get-outcome-list endpoint: {}.".format(
                traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
Example #7
def get_model_info():
    try:
        logger.info('Retrieving model data...')
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = Model(get_webapp_config()["modelId"])
            version_id = get_webapp_config().get("versionId")
            original_model_handler = get_model_handler(model, version_id)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
        stressor.model_accessor = ModelAccessor(original_model_handler)

        return jsonify(
            target_classes=stressor.model_accessor.get_target_classes(),
            pred_type=stressor.model_accessor.get_prediction_type(),
            features={
                feature: preprocessing["type"]
                for (feature, preprocessing
                     ) in stressor.model_accessor.get_per_feature().items()
                if preprocessing["role"] == "INPUT"
            },
            metric=stressor.model_accessor.get_evaluation_metric())
    except:
        logger.error(traceback.format_exc())
        return traceback.format_exc(), 500
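For orientation, here is a hedged sketch of the payload get_model_info returns: only the keys mirror the jsonify(...) call above, and every value below is made up for illustration.

# All values are hypothetical; only the keys follow the jsonify call in get_model_info.
{
    "target_classes": ["no", "yes"],          # e.g. the two classes of a binary target
    "pred_type": "BINARY_CLASSIFICATION",     # model_accessor.get_prediction_type()
    "features": {"age": "NUMERIC", "gender": "CATEGORY"},  # INPUT features and their types
    "metric": "ROC_AUC"                       # model_accessor.get_evaluation_metric()
}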
Example #8
def get_model_handler(model, version_id=None):
    try:
        params = model.get_predictor(version_id).params
        return PredictionModelInformationHandler(
            params.split_desc, params.core_params, params.model_folder, params.model_folder
        )
    except Exception as e:
        if "ordinal not in range(128)" in safe_str(e):
            raise Exception("Model stress test only supports models built with Python 3. This one was built with Python 2.") from None
        else:
            raise e
Example #9
def _get_model_info_handler(saved_model_version_id):
    infos = saved_model_version_id.split("-")
    if len(infos) != 4 or infos[0] != "S":
        raise ValueError("Invalid saved model id")
    pkey = infos[1]
    model_id = infos[2]
    version_id = infos[3]

    datadir_path = os.environ['DIP_HOME']
    version_folder = os.path.join(datadir_path, "saved_models", pkey, model_id,
                                  "versions", version_id)

    # Loading and resolving paths in split_desc
    split_folder = os.path.join(version_folder, "split")
    with open(os.path.join(split_folder, "split.json")) as split_file:
        split_desc = json.load(split_file)

    path_field_names = ["trainPath", "testPath", "fullPath"]
    for field_name in path_field_names:
        if split_desc.get(field_name, None) is not None:
            split_desc[field_name] = os.path.join(split_folder,
                                                  split_desc[field_name])

    with open(os.path.join(version_folder,
                           "core_params.json")) as core_params_file:
        core_params = json.load(core_params_file)

    try:
        return PredictionModelInformationHandler(split_desc, core_params,
                                                 version_folder,
                                                 version_folder)
    except Exception as e:
        from future.utils import raise_
        if "ordinal not in range(128)" in str(e):
            raise_(
                Exception,
                "The plugin is using a python3 code-env, cannot load a python2 model.",
                sys.exc_info()[2])
        elif str(e) == "non-string names in Numpy dtype unpickling":
            raise_(
                Exception,
                "The plugin is using a python2 code-env, cannot load a python3 model.",
                sys.exc_info()[2])
        else:
            raise_(Exception, "Fail to load saved model: {}".format(e),
                   sys.exc_info()[2])
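For reference, a minimal sketch of the saved-model version ID layout that _get_model_info_handler parses; the format is inferred from the split and validation logic above, and the identifier values are made up.

# Hypothetical ID; the "S-<projectKey>-<modelId>-<versionId>" layout is inferred
# from the len(infos) != 4 / infos[0] != "S" checks in _get_model_info_handler.
saved_model_version_id = "S-MYPROJECT-abc123-initial"
kind, pkey, model_id, version_id = saved_model_version_id.split("-")
assert kind == "S"  # anything else is rejected as "Invalid saved model id"
# Data is then read from <DIP_HOME>/saved_models/<pkey>/<model_id>/versions/<version_id>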
Example #10
def get_feature_list(model_id, version_id):
    try:
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            model_accessor = ModelAccessor(original_model_handler)

        column_list = model_accessor.get_selected_and_rejected_features()
        return simplejson.dumps(column_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except:
        logger.error(
            "When trying to call get-feature-list endpoint: {}.".format(
                traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
Example #11
def get_original_model_info():
    try:
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = Model(get_webapp_config()["modelId"])
            version_id = get_webapp_config().get("versionId")
            original_model_handler = get_model_handler(model, version_id)
            name = model.get_name()
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(
                fmi)
            name = DSSMLTask.from_full_model_id(
                api_client(),
                fmi).get_trained_model_snippet(fmi).get("userMeta",
                                                        {}).get("name", fmi)
        handler.set_error_analyzer(original_model_handler)
        return jsonify(modelName=name,
                       isRegression='REGRESSION'
                       in original_model_handler.get_prediction_type())
    except:
        LOGGER.error(traceback.format_exc())
        return traceback.format_exc(), 500
Example #12
def get_metrics(model_id, version_id, advantageous_outcome, sensitive_column,
                reference_group):

    fmi = get_webapp_config().get("trainedModelFullModelId")
    if fmi is None:
        model = dataiku.Model(model_id)
        model_handler = get_model_handler(model, version_id=version_id)
        model_accessor = ModelAccessor(model_handler)
    else:
        original_model_handler = PredictionModelInformationHandler.from_full_model_id(
            fmi)
        model_accessor = ModelAccessor(original_model_handler)

    test_df = model_accessor.get_original_test_df()
    target_variable = model_accessor.get_target_variable()
    test_df.dropna(subset=[sensitive_column, target_variable],
                   how='any',
                   inplace=True)

    y_true = test_df.loc[:, target_variable]
    pred_df = model_accessor.predict(test_df)
    y_pred = pred_df.loc[:, DkuWebappConstants.PREDICTION]

    try:  # check whether the column can be cast to int
        if np.array_equal(test_df[sensitive_column],
                          test_df[sensitive_column].astype(int)):
            test_df[sensitive_column] = test_df[sensitive_column].astype(int)
        if test_df[sensitive_column].dtypes == int:
            reference_group = int(reference_group)
        if test_df[sensitive_column].dtypes == float:
            reference_group = float(reference_group)
    except Exception as e:
        logger.info('Sensitive column cannot be cast to int: %s', e)

    sensitive_feature_values = test_df[sensitive_column]
    model_report = ModelFairnessMetricReport(y_true, y_pred,
                                             sensitive_feature_values,
                                             advantageous_outcome)
    population_names = sensitive_feature_values.unique()

    metric_dct = {}
    disparity_dct = {}
    for metric_func in ModelFairnessMetric.get_available_metric_functions():
        metric_summary = model_report.compute_metric_per_group(
            metric_function=metric_func)
        metric_dct[metric_func.__name__] = metric_summary.get(
            DkuFairnessConstants.BY_GROUP)
        metric_diff = model_report.compute_group_difference_from_summary(
            metric_summary, reference_group=reference_group)
        v = np.array(
            list(metric_diff.get(
                DkuFairnessConstants.BY_GROUP).values())).reshape(
                    1, -1).squeeze()
        v_without_nan = [x for x in v if not np.isnan(x)]
        if len(v_without_nan) > 0:
            max_disparity = max(v_without_nan, key=abs)
            disparity_dct[metric_func.__name__] = max_disparity
        else:
            disparity_dct[metric_func.__name__] = 'N/A'  # for display purposes

    populations = []
    for name in population_names:
        dct = {
            DkuWebappConstants.NAME: name,
            DkuWebappConstants.SIZE: len(test_df[test_df[sensitive_column] == name])
        }
        for m, v in metric_dct.items():
            # the following strings are only used here, so they are not defined as constants
            if m == 'demographic_parity':
                dct['positive_rate'] = v[name]
            if m == 'equalized_odds':
                dct['true_positive_rate'], dct['false_positive_rate'] = v[name]
            if m == 'predictive_rate_parity':
                dct['positive_predictive_value'] = v[name]

        # make sure that NaN is replaced by a string (a dot here), for display purposes
        for k, v in dct.items():
            if not isinstance(v, str) and np.isnan(v):
                dct[k] = '.'
        populations.append(dct)

    label_list = model_report.get_label_list()

    sorted_populations = sorted(
        populations,
        key=lambda population: population[DkuWebappConstants.SIZE],
        reverse=True)

    return sorted_populations, disparity_dct, label_list
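The dtype-matching step in get_metrics can be tried in isolation; the sketch below uses a made-up column and reference value and assumes only pandas and numpy.

import numpy as np
import pandas as pd

df = pd.DataFrame({"age_group": [20.0, 30.0, 40.0]})   # hypothetical sensitive column
reference_group = "30"                                  # request parameters arrive as strings

# If the float column actually holds whole numbers, switch it to int so the
# string reference value can be cast to the same dtype before comparisons.
if np.array_equal(df["age_group"], df["age_group"].astype(int)):
    df["age_group"] = df["age_group"].astype(int)
if df["age_group"].dtypes == int:
    reference_group = int(reference_group)
elif df["age_group"].dtypes == float:
    reference_group = float(reference_group)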
Example #13
def compute(job_id, split_desc, core_params, preprocessing_folder,
            model_folder, computation_params):
    if computation_params is None or "features_to_compute" not in computation_params:
        raise Exception(
            "'computation_params' should contain the key 'features_to_compute'")

    model_handler = PredictionModelInformationHandler(split_desc, core_params,
                                                      preprocessing_folder,
                                                      model_folder)
    features_to_compute = computation_params["features_to_compute"]

    if model_handler.is_kfolding():
        df = model_handler.get_full_df()
    else:
        df = model_handler.get_test_df()
    progress = PartialDependenciesProgress(job_id, len(features_to_compute))
    saver = PartialDependenciesSaver(model_folder, split_desc["schema"])
    computer = PartialDependencyComputer(
        df, model_handler.get_prediction_type(), model_handler.predict,
        progress, model_handler.get_sample_weight_variable())

    for index, feature_name in enumerate(features_to_compute):
        drop_missing = model_handler.get_per_feature_col(feature_name).get(
            "missing_handling") == "DROP_ROW"
        feature_type = model_handler.get_type_of_column(feature_name)
        is_dummified = False
        category_possible_value = None
        if feature_type == 'CATEGORY':
            # nan values are replaced by a fake one because neither a scale nor a distribution can be computed with nan
            feature_values = df[feature_name].fillna(
                constants.FILL_NA_VALUE).values
            is_dummified = model_handler.is_column_dummified(feature_name)
            category_possible_value = model_handler.category_possible_values(
                feature_name)
        else:
            feature_values = df[feature_name].values
        pd_feature = PartialDependencyFeature(feature_type, feature_values,
                                              feature_name, is_dummified,
                                              category_possible_value,
                                              drop_missing)
        result = computer.compute(pd_feature)
        saver.save(result)
        progress.set_percentage((index + 1) * 100 / len(features_to_compute),
                                no_fail=False)