def get_outcome_list(model_id, version_id):
    """Webapp endpoint: return the distinct non-NaN target values of the test set.

    Resolves the model either from the webapp config ("trainedModelFullModelId")
    or from the (model_id, version_id) pair, then serializes the unique target
    values as JSON.

    :param model_id: saved model id (used when no full model id is configured)
    :param version_id: model version id
    :return: JSON string of outcome values, or (error text, 500) on failure
    """
    try:
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(fmi)
            model_accessor = ModelAccessor(original_model_handler)
        # note: sometimes when the dataset is very unbalanced, the original_test_df
        # does not have all the target values
        test_df = model_accessor.get_original_test_df()
        target = model_accessor.get_target_variable()
        outcome_list = test_df[target].unique().tolist()
        filtered_outcome_list = remove_nan_from_list(outcome_list)
        return simplejson.dumps(filtered_outcome_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        logger.error("When trying to call get-outcome-list endpoint: {}.".format(
            traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
def check_model_type(model_id, version_id):
    """Webapp endpoint: verify the model is a binary classification model.

    Regression and clustering models are rejected with a ValueError (surfaced
    as a 500 with the traceback); any other prediction type returns 'ok'.

    :param model_id: saved model id (used when no full model id is configured)
    :param version_id: model version id
    :return: 'ok', or (error text, 500) on failure
    """
    try:
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(fmi)
            model_accessor = ModelAccessor(original_model_handler)
        # NOTE: REGRRSSION_TYPE is the constant's actual (misspelled) name in the
        # project; renaming it here would break the lookup.
        if model_accessor.get_prediction_type() in [
                DkuModelAccessorConstants.REGRRSSION_TYPE,
                DkuModelAccessorConstants.CLUSTERING_TYPE
        ]:
            raise ValueError(
                'Model Fairness Report only supports binary classification model.'
            )
        return 'ok'
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        logger.error("When trying to call check-model-type endpoint: {}.".format(
            traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
def get_params(config):
    """Resolve the reference dataframe and columns to check from the recipe config.

    In 'dataset' input mode the reference is a dataset; otherwise it is the
    train set of a reference model, restricted to the model's selected features
    (or a user-chosen subset of them).

    :param config: recipe configuration dict
    :return: (df_ref, columns, range_mode)
    :raises ValueError: when a chosen column is absent from the reference, or
        when no reference model is configured in model mode
    """
    range_mode = config.get('range_mode')
    if config.get('input_mode') == 'dataset':
        df_ref = dataiku.Dataset(config.get("ds_ref")).get_dataframe(bool_as_str=True)
        columns = [col for col in config.get("columns_dataset") if col != '']
        columns_not_in_df_ref = set(columns) - set(df_ref.columns)
        if len(columns_not_in_df_ref) > 0:
            # fixed separator: was ' ,' which rendered as "a ,b"
            raise ValueError(
                'The following chosen columns are not in the reference dataset: {}. Please remove them from the list of columns to check.'
                .format(', '.join(list(columns_not_in_df_ref))))
    else:
        model_ref = config.get('model_ref')
        if model_ref is None:
            raise ValueError('Please choose a reference model.')
        model = dataiku.Model(model_ref)
        model_handler = get_model_handler(model)
        model_accessor = ModelAccessor(model_handler)
        df_ref = model_accessor.get_train_df()
        selected_features = model_accessor.get_selected_features()
        chosen_columns = [col for col in config.get("columns_model") if col != '']
        if len(chosen_columns) > 0:
            columns = chosen_columns
            features_not_in_model = list(set(columns) - set(selected_features))
            if len(features_not_in_model) > 0:
                # fixed separator: was ' ,' which rendered as "a ,b"
                raise ValueError(
                    'The following chosen columns are not used in the model: {}. Please remove them from the list of columns to check.'
                    .format(', '.join(features_not_in_model)))
        else:
            columns = selected_features
    return df_ref, columns, range_mode
def get_input_output(has_model_as_second_input=False):
    """Resolve the recipe's input/output roles into dataiku objects.

    :param has_model_as_second_input: when True, the second input is the
        'model' role; otherwise it is the 'original' dataset role.
    :return: (new_dataset, model_or_original_dataset, output_dataset)
    :raises ValueError: when a required role has no attached item
    """
    # Validate required roles up front, before constructing any object.
    if not get_input_names_for_role('new'):
        raise ValueError('No new dataset.')
    if not get_output_names_for_role('output_dataset'):
        raise ValueError('No output dataset.')

    new_dataset = dataiku.Dataset(get_input_names_for_role('new')[0])
    output_dataset = dataiku.Dataset(get_output_names_for_role('output_dataset')[0])

    if has_model_as_second_input:
        if not get_input_names_for_role('model'):
            raise ValueError('No input model.')
        model = dataiku.Model(get_input_names_for_role('model')[0])
        return (new_dataset, model, output_dataset)

    if not get_input_names_for_role('original'):
        raise ValueError('No original dataset.')
    original_dataset = dataiku.Dataset(get_input_names_for_role('original')[0])
    return (new_dataset, original_dataset, output_dataset)
def get_value_list(model_id, version_id, column):
    """Webapp endpoint: return the distinct non-NaN values of a test-set column.

    Rejects columns with more than MAX_NUM_CATEGORIES distinct values (which
    also filters out most numerical columns).

    :param model_id: saved model id (used when no full model id is configured)
    :param version_id: model version id
    :param column: column name ('undefined'/'null' means nothing was chosen yet)
    :return: JSON string of values, or (error text, 500) on failure
    """
    try:
        if column == 'undefined' or column == 'null':
            raise ValueError('Please choose a column.')
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(fmi)
            model_accessor = ModelAccessor(original_model_handler)
        test_df = model_accessor.get_original_test_df()
        value_list = test_df[column].unique().tolist()  # should check for categorical variables ?
        filtered_value_list = remove_nan_from_list(value_list)
        if len(filtered_value_list) > DkuWebappConstants.MAX_NUM_CATEGORIES:
            raise ValueError(
                'Column "{2}" is either of numerical type or has too many categories ({0}). Max {1} are allowed.'
                .format(len(filtered_value_list),
                        DkuWebappConstants.MAX_NUM_CATEGORIES, column))
        return simplejson.dumps(filtered_value_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        logger.error("When trying to call get-value-list endpoint: {}.".format(
            traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
def get_drift_metrics():
    """Webapp endpoint: compute drift metrics between the model and a new test set.

    Reads model_id, version_id and test_set from the request query string,
    fits a DriftAnalyzer on the new test dataframe (capped at MAX_NUM_ROW
    rows), and returns the metrics as JSON.

    :return: JSON string of drift metrics, or (traceback text, 500) on failure
    """
    try:
        model_id = request.args.get('model_id')
        version_id = request.args.get('version_id')
        test_set = request.args.get('test_set')
        new_test_df = dataiku.Dataset(test_set).get_dataframe(
            bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(fmi)
            model_accessor = ModelAccessor(original_model_handler)
        drifter = DriftAnalyzer()
        drifter.fit(new_test_df, model_accessor=model_accessor)
        return json.dumps(drifter.get_drift_metrics_for_webapp(),
                          allow_nan=False,
                          default=convert_numpy_int64_to_int)
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        logger.error(traceback.format_exc())
        return traceback.format_exc(), 500
def get_histograms(model_id, version_id, advantageous_outcome, sensitive_column):
    """Build histogram data of predictions split by a sensitive column.

    Rows with a missing sensitive value are dropped before predicting.

    :param model_id: saved model id (used when no full model id is configured)
    :param version_id: model version id
    :param advantageous_outcome: target class treated as the favourable outcome
    :param sensitive_column: column whose groups are compared
    :return: whatever get_histogram_data produces for (y_true, y_pred,
        y_pred_proba, advantageous_outcome, sensitive values)
    """
    fmi = get_webapp_config().get("trainedModelFullModelId")
    if fmi is None:
        handler = get_model_handler(dataiku.Model(model_id), version_id=version_id)
        model_accessor = ModelAccessor(handler)
    else:
        model_accessor = ModelAccessor(
            PredictionModelInformationHandler.from_full_model_id(fmi))

    test_df = model_accessor.get_original_test_df().dropna(subset=[sensitive_column])
    y_true = test_df.loc[:, model_accessor.get_target_variable()]
    pred_df = model_accessor.predict(test_df)
    y_pred = pred_df.loc[:, DkuWebappConstants.PREDICTION]
    y_pred_proba = pred_df.loc[:, 'proba_{}'.format(advantageous_outcome)]

    return get_histogram_data(y_true, y_pred, y_pred_proba,
                              advantageous_outcome, test_df[sensitive_column])
def do(payload, config, plugin_config, inputs):
    """
    DSS built-in interface for param loading in the form.
    Retrieve the available versions of a pretrained model in DSS and return
    them as form choices, sorted by train date (ascending).

    :param payload: form payload (unused)
    :param config: form config (unused)
    :param plugin_config: plugin config (unused)
    :param inputs: recipe inputs; must contain one with role 'model'
    :return: {"choices": [...]} mapping for the DSS form
    """
    model = None
    for input_ in inputs:
        if input_['role'] == 'model':
            model = str(input_['fullName'])
    if model is None:
        raise Exception("Did not catch the right input model")
    model_id = model.split('.')[-1]
    model = dataiku.Model(model_id)
    model_type = model.get_info().get('type')
    if model_type != 'PREDICTION':
        # fixed typo in user-facing message: "classifcation" -> "classification"
        raise ValueError('Model type {} is not supported. Please choose a regression or classification model.'.format(model_type))
    choice_list = []
    for version in model.list_versions():
        version_detail = version.get('snippet', {})
        algorithm = version_detail.get('algorithm', '').lower().replace('_', ' ')
        train_date = process_timestamp(version_detail.get('trainDate'))
        version_id = version.get('versionId')
        if version.get('active') is True:
            label = 'active version, trained on {1}, {0}'.format(algorithm, train_date)
        else:
            label = 'trained on {1}, {0}'.format(algorithm, train_date)
        choice_list.append(({'value': version_id, 'label': label}, train_date))
    # oldest first, matching the original sort order
    choice_list.sort(key=lambda pair: pair[1])
    return {"choices": [choice for choice, _ in choice_list]}
def get_drift_metrics():
    """Webapp endpoint: compute drift metrics for a (model, version, test set) triple.

    All three identifiers come from the request query string; the new test
    dataframe is capped at MAX_NUM_ROW rows.

    :return: JSON string of drift metrics, or (traceback text, 500) on failure
    """
    try:
        model_id = request.args.get('model_id')
        version_id = request.args.get('version_id')
        test_set = request.args.get('test_set')
        new_test_df = dataiku.Dataset(test_set).get_dataframe(
            bool_as_str=True, limit=ModelDriftConstants.MAX_NUM_ROW)
        model = dataiku.Model(model_id)
        model_handler = get_model_handler(model, version_id=version_id)
        model_accessor = ModelAccessor(model_handler)
        drifter = DriftAnalyzer()
        drifter.fit(new_test_df, model_accessor=model_accessor)
        return json.dumps(drifter.get_drift_metrics_for_webapp(),
                          allow_nan=False,
                          default=convert_numpy_int64_to_int)
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        logger.error(traceback.format_exc())
        return traceback.format_exc(), 500
def get_feature_list(model_id, version_id):
    """Webapp endpoint: return the model's selected and rejected features as JSON.

    :param model_id: saved model id (used when no full model id is configured)
    :param version_id: model version id
    :return: JSON string of column names, or (error text, 500) on failure
    """
    try:
        fmi = get_webapp_config().get("trainedModelFullModelId")
        if fmi is None:
            model = dataiku.Model(model_id)
            model_handler = get_model_handler(model, version_id=version_id)
            model_accessor = ModelAccessor(model_handler)
        else:
            original_model_handler = PredictionModelInformationHandler.from_full_model_id(fmi)
            model_accessor = ModelAccessor(original_model_handler)
        column_list = model_accessor.get_selected_and_rejected_features()
        return simplejson.dumps(column_list,
                                ignore_nan=True,
                                default=convert_numpy_int64_to_int)
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        logger.error("When trying to call get-feature-list endpoint: {}.".format(
            traceback.format_exc()))
        return "{}Check backend log for more details.".format(
            traceback.format_exc()), 500
def can_use_gpu(inputs):
    """Check that the system supports GPU training/inference.

    Requires 'tensorflow-gpu' in the current code-env. In the classification
    query-sampler case (a SAVED_MODEL input with role 'saved_model'), the
    model must additionally be a Keras model; otherwise GPU use is allowed
    only when a MANAGED_FOLDER with role 'saved_model' is among the inputs.

    :param inputs: list of recipe input descriptors (dicts with 'type',
        'role', 'fullName')
    :return: True when GPU use is possible, False otherwise
    """
    # Check that 'tensorflow-gpu' is installed on the current code-env.
    # NOTE(review): pkg_resources is deprecated in recent setuptools;
    # importlib.metadata would be the modern equivalent.
    import pkg_resources
    has_tf_gpu = "tensorflow-gpu" in [d.key for d in pkg_resources.working_set]
    if not has_tf_gpu:
        return False
    # In the case of classification query sampler, check if the model is keras
    saved_models = [
        inp for inp in inputs
        if inp["type"] == 'SAVED_MODEL' and inp["role"] == 'saved_model'
    ]
    if saved_models:
        # We found a saved model, we are in the query sampling case
        model = dataiku.Model(saved_models[0]['fullName'])
        return model.get_definition().get('contentType') == 'prediction/keras'
    # idiom fix: any(generator) instead of any([list-comprehension])
    return any(inp["type"] == 'MANAGED_FOLDER' and inp["role"] == 'saved_model'
               for inp in inputs)
# from dataiku.customwebapp import * import dataiku import pandas as pd from dataiku.apinode.predict.server import handle_predict from flask import request import json SAMPLE_SIZE = 10000 THRESHOLD_CARDINALITY = 100 model_id = 'wLe7LGbH' #make this an input visually later dataset_name = 'webapp_input' model = dataiku.Model(model_id) predictor = model.get_predictor() def get_categoricals(dataset, schema): """ Detects low cardinality features and consider them as categoricals Returns the dataset schema enriched with the values of its categorical features """ df = dataset.get_dataframe(limit=SAMPLE_SIZE) for column in schema: values = df[column['name']].unique() if len(values) < THRESHOLD_CARDINALITY: column['computedType'] = 'categorical' column['values'] = [None if pd.isnull(x) else x for x in values] else: column['computedType'] = column['type'] return schema
from dataiku.customrecipe import *
from cardinal import uncertainty
from lal import utils, gpu_utils

config = get_recipe_config()

# GPU set up — options are read from the recipe form.
gpu_opts = gpu_utils.load_gpu_options(config.get('should_use_gpu', False),
                                      config.get('list_gpu', ''),
                                      config.get('gpu_allocation', 0.))

# Load configuration: the unlabeled-samples input, the saved model, and the
# queries output dataset (ignore_flow lets the recipe write it outside the
# normal flow dependency check).
unlabeled_samples_container = get_input_names_for_role('unlabeled_samples')[0]
saved_model_id = get_input_names_for_role('saved_model')[0]
model = dataiku.Model(saved_model_id)
queries_ds = dataiku.Dataset(get_output_names_for_role('queries')[0],
                             ignore_flow=True)

# Maps the strategy name from the form to the cardinal sampling function.
strategy_mapper = {
    'confidence': uncertainty.confidence_sampling,
    'margin': uncertainty.margin_sampling,
    'entropy': uncertainty.entropy_sampling
}

clf = utils.load_classifier(model)

#################
# Active learning
# NOTE(review): this call is truncated in the visible source — the argument
# list of utils.load_data continues beyond this chunk.
unlabeled_df, unlabeled_is_folder = utils.load_data(
# -*- coding: utf-8 -*- import dataiku import pandas as pd, numpy as np, json from dataiku import pandasutils as pdu # Read recipe inputs in_events = dataiku.StreamingEndpoint("in-events") in_events_messages = in_events.get_native_kafka_consumer( ) # use as a generator logreg_model = dataiku.Model('mIkUmJfL') predictor = logreg_model.get_predictor() # Write recipe outputs out_events = dataiku.StreamingEndpoint("out-events") out_events.set_schema(in_events.get_schema()) with out_events.get_native_kafka_producer() as out_events_writer: for f_event in in_events_messages: # Extract the event data print('Receiving event:') print(f_event.value) f_event_data = json.loads(f_event.value) df = pd.DataFrame.from_records([f_event_data]) # Make the prediction pred = predictor.predict(df) print('Prediction result:') print(pred) # Add the prediction result to the event f_event_data['prediction'] = pred['prediction'][0] f_event_data['proba_0'] = pred['proba_0'][0]
import dataiku

# Look up the saved model in the given project and print each version's
# algorithm, flagging the currently active one.
model = dataiku.Model('model_name_or_id', 'project')
for version in model.list_versions():
    active_suffix = ' (active)' if version['active'] else ''
    print('Algorithm ' + version['snippet']['algorithm'] + active_suffix)
def get_model(self):
    """Return the dataiku Model targeted by this report item."""
    target = self.report_item["target"]
    return dataiku.Model(target["modelId"], project_key=target["projectKey"])
## Loading inputs input_A_names = get_input_names_for_role('train') # The dataset objects themselves can then be created like this: input_A_datasets = [dataiku.Dataset(name) for name in input_A_names] input_A_datasets = input_A_datasets[0] # For outputs 1: output_A_names = get_output_names_for_role('main_output') output_A_datasets = [dataiku.Dataset(name) for name in output_A_names] target1 = get_recipe_config()['target1'] target2 = get_recipe_config()['target2'] ## Loading and importing dataiku ml model model_name = get_input_names_for_role('input_model')[0] model = dataiku.Model(model_name) my_predictor = model.get_predictor() my_clf = my_predictor._clf # Loading Training Data data = input_A_datasets.get_dataframe() ### Loading model parameeter from URL data_dir = os.environ['DIP_HOME'] data_dir ### Getting information from model url1 = get_recipe_config()['url'] #url1="https://dss-amer.pfizer.com/projects/GBSUSVACCINEMODEL/analysis/RYrvXZCT/ml/p/XJ7pRh5L/A-GBSUSVACCINEMODEL-RYrvXZCT-XJ7pRh5L-s88-pp3-m1/report/#summary" split = url1.split('projects/')[1].split('/') project_key = split[0]
# Read recipe inputs input_dataset = dataiku.Dataset(get_input_names_for_role('input_dataset')[0]) input_df = input_dataset.get_dataframe() metrics_folder_path = dataiku.Folder(get_output_names_for_role('metrics_folder')[0]).get_path() resource_path = get_recipe_resource() book = openpyxl.load_workbook(resource_path + "/confusion_matrix_TEMPLATE.xlsx") sheet = book.worksheets[0] #Get metrics from training training_metrics_list = dataiku.Model(get_input_names_for_role('trained_model')[0]) training_metrics_list.get_predictor() for version in training_metrics_list.versions: if version['active']==True: training_metrics = version training_metrics = training_metrics['snippet'] if training_metrics['trainInfo']['kfold'] == True: accuracystd = training_metrics['accuracystd'] recallstd = training_metrics['recallstd'] precisionstd = training_metrics['precisionstd'] prediction_type = get_recipe_config()['prediction_type'] now = datetime.datetime.now()
def get_metrics(model_id, version_id, advantageous_outcome, sensitive_column,
                reference_group):
    """Compute fairness metrics per sensitive group plus max disparities.

    Drops rows missing the sensitive column or the target, predicts on the
    remaining test set, then for each available fairness metric computes the
    per-group values and the largest absolute difference from the reference
    group.

    :param model_id: saved model id (used when no full model id is configured)
    :param version_id: model version id
    :param advantageous_outcome: target class treated as the favourable outcome
    :param sensitive_column: column whose groups are compared
    :param reference_group: group the disparities are measured against
    :return: (populations sorted by size desc, disparity dict, label list)
    """
    fmi = get_webapp_config().get("trainedModelFullModelId")
    if fmi is None:
        model = dataiku.Model(model_id)
        model_handler = get_model_handler(model, version_id=version_id)
        model_accessor = ModelAccessor(model_handler)
    else:
        original_model_handler = PredictionModelInformationHandler.from_full_model_id(fmi)
        model_accessor = ModelAccessor(original_model_handler)
    test_df = model_accessor.get_original_test_df()
    target_variable = model_accessor.get_target_variable()
    test_df.dropna(subset=[sensitive_column, target_variable],
                   how='any',
                   inplace=True)
    y_true = test_df.loc[:, target_variable]
    pred_df = model_accessor.predict(test_df)
    y_pred = pred_df.loc[:, DkuWebappConstants.PREDICTION]
    try:
        # check whether or not the column can be casted to int, so that
        # reference_group (a string from the UI) matches the column's dtype
        if np.array_equal(test_df[sensitive_column],
                          test_df[sensitive_column].astype(int)):
            test_df[sensitive_column] = test_df[sensitive_column].astype(int)
        if test_df[sensitive_column].dtypes == int:
            reference_group = int(reference_group)
        if test_df[sensitive_column].dtypes == float:
            reference_group = float(reference_group)
    except Exception as e:
        # fixed: the exception was passed without a %s placeholder, so it was
        # never rendered into the log message
        logger.info('Sensitive column can not be casted to int: %s', e)
    sensitive_feature_values = test_df[sensitive_column]
    model_report = ModelFairnessMetricReport(y_true, y_pred,
                                             sensitive_feature_values,
                                             advantageous_outcome)
    population_names = sensitive_feature_values.unique()
    metric_dct = {}
    disparity_dct = {}
    for metric_func in ModelFairnessMetric.get_available_metric_functions():
        metric_summary = model_report.compute_metric_per_group(
            metric_function=metric_func)
        metric_dct[metric_func.__name__] = metric_summary.get(
            DkuFairnessConstants.BY_GROUP)
        metric_diff = model_report.compute_group_difference_from_summary(
            metric_summary, reference_group=reference_group)
        diffs = np.array(
            list(metric_diff.get(
                DkuFairnessConstants.BY_GROUP).values())).reshape(
                    1, -1).squeeze()
        diffs_without_nan = [x for x in diffs if not np.isnan(x)]
        if len(diffs_without_nan) > 0:
            # largest disparity by magnitude, sign preserved
            disparity_dct[metric_func.__name__] = max(diffs_without_nan, key=abs)
        else:
            disparity_dct[metric_func.__name__] = 'N/A'

    # for display purpose
    populations = []
    for name in population_names:
        dct = {
            DkuWebappConstants.NAME: name,
            DkuWebappConstants.SIZE:
                len(test_df[test_df[sensitive_column] == name])
        }
        for m, v in metric_dct.items():
            # the following strings are used only here, too lazy to turn them into constant variables
            if m == 'demographic_parity':
                dct['positive_rate'] = v[name]
            if m == 'equalized_odds':
                dct['true_positive_rate'], dct['false_positive_rate'] = v[name]
            if m == 'predictive_rate_parity':
                dct['positive_predictive_value'] = v[name]
        # make sure that NaN is replaced by a string (a dot here), for display purpose
        for k, val in dct.items():
            if not isinstance(val, str) and np.isnan(val):
                dct[k] = '.'
        populations.append(dct)

    label_list = model_report.get_label_list()
    sorted_populations = sorted(
        populations,
        key=lambda population: population[DkuWebappConstants.SIZE],
        reverse=True)
    return sorted_populations, disparity_dct, label_list
# Configuration
conf = get_recipe_config()
model_version = conf.get('model_version', 'active').lower()
n_samples = int(conf.get('n_samples', -1))
idx_variables = conf.get('copy_cols', None)
# Truthy when the optional 'Shap_imp' output is connected.
compute_importance = len(get_output_names_for_role('Shap_imp'))

# Outputs
out_dataset_name = get_output_names_for_role('Shap_values')[0].split('.')[1]
if compute_importance:
    out_imp_name = get_output_names_for_role('Shap_imp')[0].split('.')[1]

#############################
# Load inputs
#############################

# Load model
model = dataiku.Model(lookup=model_name[1], project_key=model_name[0])

# Get version_id of the 'active' version when model_version is 'active' or
# empty; otherwise use the explicitly requested version.
# (fixed comment: the original stated the condition backwards)
version_id = ([
    version['versionId'] for version in model.list_versions()
    if version['active']
][0]) if model_version in (u'active', u'') else model_version

# Get predictor from selected version
predictor = model.get_predictor(version_id=version_id)

# Load the dataset
limit = None if n_samples < 0 else n_samples
dku_dataset = dataiku.Dataset(dataset_name)
shap_values_output = dataiku.Dataset(out_dataset_name)
# BUG FIX: out_imp_name is only defined when the 'Shap_imp' output exists;
# creating the dataset unconditionally raised NameError without it.
shap_imp_output = dataiku.Dataset(out_imp_name) if compute_importance else None
n_rows = 0

# Is classification or regression?
is_regression = len(predictor.classes) == 0
logging.basicConfig(level=logging.INFO,
                    format='%(name)s %(levelname)s - %(message)s')

step_config = get_step_config()
client = dataiku.api_client()
project = client.get_project(dataiku.Project().project_key)

# GPU set up — options come from the scenario step form.
gpu_opts = gpu_utils.load_gpu_options(
    step_config.get('should_use_gpu', False), step_config.get('list_gpu', ''),
    float(step_config.get('gpu_allocation', 0.)))

# Resolve the saved model: prefer the configured id; if it does not exist in
# this project, fall back to the conventional name used by the DKU App.
if step_config['model'] in [m['id'] for m in project.list_saved_models()]:
    model = dataiku.Model(step_config['model'])
else:
    # model_id could be set in a master project of a DKU APP, but the saved
    # model was then recreated in an App
    logging.info(
        'Model {} was not found in project, trying to find a model by "Classifier" name'
        .format(step_config['model']))
    model = dataiku.Model(
        'Classifier')  # default name for ML Assisted labeling plugin DKU Apps

# Unlabeled data can come from either a dataset or a managed folder.
if step_config['unlabeled_select'] == 'dataset':
    unlabeled = step_config['unlabeled_dataset']
else:
    unlabeled = step_config['unlabeled_folder']
metadata = step_config['metadata']
n_samples = int(step_config['n_samples'])
def get_train_date(model_version, version_id):
    """Return the processed train date of the requested model version.

    Returns None when no version with the given id exists.
    """
    for version in dataiku.Model(model_version, ignore_flow=True).list_versions():
        if version.get('versionId') != version_id:
            continue
        return process_timestamp(version.get('snippet').get('trainDate'))
    return None