class BuildFairlearnDashboard():
    def __init__(self):
        self.__parser = argparse.ArgumentParser("fairlearn")
        self.__parser.add_argument("--dataset_name", type=str,
                                   default="heart_disease_preprocessed_train",
                                   help="Name of the dataset")
        self.__parser.add_argument("--output_fairness_dict", type=str,
                                   help="Path to the directory containing the fairness predictions dictionary")
        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun
        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'fairlearn')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

    def main(self):
        fairlearn_dict_path = os.path.join(self.__args.output_fairness_dict,
                                           'fairlean_predictions_values.pkl')
        fairlearn_values = joblib.load(fairlearn_dict_path)
        dash_dict = self.__get_dashboard_dict(fairlearn_values['A_test'],
                                              fairlearn_values['Y_test'],
                                              fairlearn_values['Y_pred'],
                                              fairlearn_values['model_id'])
        self.__upload_dashboard_dict(dash_dict)

    def __get_dashboard_dict(self, A_test, Y_test, Y_pred, model_id):
        sf = {
            'diabetic': A_test.diabetic,
            'asthmatic': A_test.asthmatic,
            'smoker': A_test.smoker
        }
        return _create_group_metric_set(y_true=Y_test,
                                        predictions={model_id: Y_pred},
                                        sensitive_features=sf,
                                        prediction_type='binary_classification')

    def __upload_dashboard_dict(self, dash_dict):
        run = self.__exp.start_logging()
        try:
            dashboard_title = "Fairness insights of Logistic Regression Classifier with heart-disease data"
            upload_id = upload_dashboard_dictionary(run,
                                                    dash_dict,
                                                    dataset_name=self.__args.dataset_name,
                                                    dashboard_name=dashboard_title)
        finally:
            run.complete()
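# A minimal usage sketch (not part of the original script): round-tripping an
# uploaded dashboard to verify it, assuming the azureml-contrib-fairness package
# is installed and `run` / `dash_dict` are built as in the class above.
from azureml.contrib.fairness import upload_dashboard_dictionary, download_dashboard_by_upload_id

upload_id = upload_dashboard_dictionary(run, dash_dict,
                                        dashboard_name="Fairness insights")
downloaded = download_dashboard_by_upload_id(run, upload_id)
assert downloaded == dash_dict  # the stored dictionary should match what was uploaded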
class TrackedAzureMLEvaluation:
    """
    Class to automatically track parameters, metrics and artifacts for a single model with azureml-sdk
    """
    def __init__(self, experimentName: str, workspace: Workspace, evaluator: MetricsDictProvider):
        """
        :param experimentName: name of the experiment in the Azure ML workspace
        :param workspace: the Azure ML workspace to log to
        :param evaluator: provider used to compute the metrics dictionary for a model
        """
        self.experimentName = experimentName
        self.evaluator = evaluator
        self.experiment = Experiment(workspace=workspace, name=experimentName)

    def evalModel(self, model: VectorModel, additionalLoggingValuesDict: dict = None,
                  **startLoggingKwargs):
        with self.experiment.start_logging(**startLoggingKwargs) as run:
            valuesDict = self.evaluator.computeMetrics(model)
            valuesDict['str(model)'] = str(model)
            if additionalLoggingValuesDict is not None:
                valuesDict.update(additionalLoggingValuesDict)
            for name, value in valuesDict.items():
                run.log(name, value)
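# A hypothetical usage sketch for the class above, assuming a local workspace
# config file and a `model` / `evaluator` pair from the surrounding library
# (its VectorModel / MetricsDictProvider types); both are assumptions here.
from azureml.core import Workspace

ws = Workspace.from_config()
tracked_eval = TrackedAzureMLEvaluation(experimentName="model-evaluation",
                                        workspace=ws, evaluator=evaluator)
tracked_eval.evalModel(model, additionalLoggingValuesDict={"dataset": "v1"})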
class AmlExperimentation(Experimentation):
    def __init__(self, ws):
        super().__init__()
        self.aml_ws = ws
        self.aml_experiment = None
        self.aml_run = None
        self.is_running_flag = False

    def set_experiment(self, name, artifact_location=None):
        logging.info("Connecting to Azure ML")
        self.aml_experiment = Experiment(workspace=self.aml_ws, name=name)

    def start_run(self):
        self.aml_run = self.aml_experiment.start_logging()
        self.is_running_flag = True

    def end_run(self):
        self.aml_run.complete()
        self.is_running_flag = False

    def log_param(self, key, value):
        self.aml_run.log(key, value)

    def log_params(self, params):
        # Run.log expects a name/value pair, so log each dict entry separately
        for key, value in params.items():
            self.aml_run.log(key, value)

    def log_metric(self, key, value, step=None):
        self.aml_run.log(key, value)

    def log_metrics(self, metrics, step=None):
        for key, value in metrics.items():
            self.aml_run.log(key, value)

    def search_runs(
        self,
        experiment_ids=None,
        filter_string="",
        run_view_type=1,
        max_results=100000,
        order_by=None,
    ):
        raise NotImplementedError()

    def log_image(self, title, fig):
        self.aml_run.log_image(name=title, plot=fig)

    def log_artifact(self, local_path, name=None, artifact_path=None):
        # upload the artifact to the run (the original body was a dangling no-op statement)
        self.aml_run.upload_file(name=name or local_path, path_or_stream=local_path)

    def log_artifacts(self, local_path, name=None, artifact_path=None):
        pass
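# Hypothetical usage of the wrapper above against an existing workspace;
# the experiment name and logged values are placeholders.
from azureml.core import Workspace

exp_tracker = AmlExperimentation(Workspace.from_config())
exp_tracker.set_experiment("demo-experiment")
exp_tracker.start_run()
exp_tracker.log_param("learning_rate", 0.01)
exp_tracker.log_metric("accuracy", 0.93)
exp_tracker.end_run()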
def toAzure():
    import azureml.core
    from azureml.core import Workspace
    from azureml.core import Experiment
    import shutil, os, glob
    from azureml.core.authentication import InteractiveLoginAuthentication

    with open("outputs/_experiment-name_.txt", "r", encoding="utf-8") as file:
        experiment_name = file.readline()

    try:
        ws = Workspace.get(
            name="sparknlp",
            subscription_id="bc5674c1-2f09-4eff-8497-b97f5466158f",
            resource_group="datascientists")
    except Exception:
        interactive_auth = InteractiveLoginAuthentication(
            tenant_id="55574e46-daf5-45bd-8659-de00e36fb97c", force=True)
        ws = Workspace.get(
            name="sparknlp",
            subscription_id="bc5674c1-2f09-4eff-8497-b97f5466158f",
            resource_group="datascientists",
            auth=interactive_auth)

    experiment = Experiment(workspace=ws, name=experiment_name)

    os.makedirs("outputs/_notebooks", exist_ok=True)
    notebooks = glob.glob("*.ipynb")
    for nb in notebooks:
        shutil.copy(nb, "outputs/_notebooks/CopyOf_" + nb)

    run = experiment.start_logging()
    print(f"Uploading the content of your '{experiment_name}' experiment to Azure Cloud...")
    run.complete()

    runs = experiment.get_runs()
    print(f"Run number {len(list(runs))} was uploaded.")
    print("""You can view your logs on Microsoft Azure Machine Learning Studio.
To view the details of your last run, click the link below :""")
    # get_runs() returns a generator, so fetch it again before indexing
    runs = experiment.get_runs()
    return list(runs)[0]
def test_run(expname, ws):
    # create a new experiment
    exp = Experiment(workspace=ws, name=expname)
    # start a run
    run = exp.start_logging()
    # log a number
    run.log('my magic number', 42)
    # log a list (Fibonacci numbers)
    run.log_list('my list', [1, 1, 2, 3, 5, 8, 13, 21, 34, 55])
    # finish the run
    run.complete()
    print(run.get_portal_url())
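# A quick smoke test of test_run(), assuming a workspace config file is
# available locally (an assumption, not shown in the snippet above).
from azureml.core import Workspace

ws = Workspace.from_config()
test_run("smoke-test", ws)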
class OfflineRunInitializer(RunInitializer):
    """Offline run. Download the sample dataset and run locally. Still push results to Azure."""

    def __init__(self, data_config: Bunch, eval_config: Bunch):
        super().__init__(data_config, eval_config)

    def run_azureml_setup(self):
        logger.info("Running in offline mode...")
        logger.info("Accessing workspace...")
        self.workspace = Workspace.from_config()
        self.experiment = Experiment(self.workspace, EVAL_EXPERIMENT_NAME)
        self.run = self.experiment.start_logging(outputs=None, snapshot_directory=None)

    def get_dataset(self):
        logger.info("Accessing dataset...")
        dataset_name = self._data_config.NAME
        self.dataset_path = str(REPO_DIR / "data" / "datasets" / dataset_name)
        if not os.path.exists(self.dataset_path):
            dataset = self.workspace.datasets[dataset_name]
            dataset.download(target_path=self.dataset_path, overwrite=False)
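# Hypothetical driver for the initializer above. DATA_CONFIG and EVAL_CONFIG are
# placeholder Bunch objects; depending on the RunInitializer base class, these
# setup hooks may already be invoked from its constructor.
initializer = OfflineRunInitializer(DATA_CONFIG, EVAL_CONFIG)
initializer.run_azureml_setup()
initializer.get_dataset()
initializer.run.log("dataset_path", initializer.dataset_path)
initializer.run.complete()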
class TrackedAzureMLExperiment(TrackedExperiment):
    def __init__(self, experimentName: str, workspace: Workspace, additionalLoggingValuesDict=None):
        """
        :param experimentName: name of experiment for tracking in workspace
        :param workspace: Azure workspace object
        :param additionalLoggingValuesDict: additional values to be logged for each run
        """
        self.experimentName = experimentName
        self.experiment = Experiment(workspace=workspace, name=experimentName)
        super().__init__(additionalLoggingValuesDict=additionalLoggingValuesDict)

    def _trackValues(self, valuesDict: Dict[str, Any]):
        with self.experiment.start_logging() as run:
            for name, value in valuesDict.items():
                run.log(name, value)
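# Illustrative only: driving the protected hook directly. In the surrounding
# library the TrackedExperiment base class would normally call _trackValues;
# the metric values below are placeholders.
from azureml.core import Workspace

tracked = TrackedAzureMLExperiment("metrics-demo", Workspace.from_config())
tracked._trackValues({"rmse": 3.2, "r2": 0.87, "str(model)": "Ridge(alpha=0.5)"})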
class SpacyRetrainer: def __init__(self, original_model_name=None, experiment_name=None, n_iter=100, dropout=0.5, aml_config='config.json', output_dir='../../model-outputs', train_pickle='../data/train.pickle', test_pickle='../data/test.pickle'): self.experiment_name = experiment_name if aml_config: self.ws = Workspace.from_config(aml_config) self.experiment = Experiment(workspace=self.ws, name=experiment_name) self.aml_run = self.experiment.start_logging() self.has_aml = True else: self.has_aml = False self.model = original_model_name self.n_iter = n_iter self.output_dir = output_dir self.train_file = train_pickle self.test_file = test_pickle self.dropout = dropout def run(self): if self.has_aml: self.aml_run.log("model", self.model) self.aml_run.log("n_iter", self.n_iter) self.aml_run.log("train_file", self.train_file) self.aml_run.log("test_file", self.test_file) self.aml_run.log("dropout rate", self.dropout) model_path = self._train(self.model, self.output_dir, self.n_iter, self.train_file, self.experiment_name) self._score_validate(model_path, self.test_file) if self.has_aml: self.aml_run.complete() def print_scores(self, split, evaluation_result): """ Logs results into experiment run. :param split: Name of this split. For ex 'train' or 'valid' :param evaluation_result: EvaluationResult containing various metrics :return: None. Writes to experiment runner and logs locally. """ logging.info('SPLIT: {0}. PII_precision: {1}, PII_recall: {2},' 'Person_precision: {3}, Person_recall: {4}'. \ format(split, evaluation_result.pii_precision, evaluation_result.pii_recall, evaluation_result.entity_precision_dict['PERSON'], evaluation_result.entity_recall_dict['PERSON'])) if self.has_aml: self.aml_run.log('Precision', evaluation_result.pii_precision, split) self.aml_run.log('Recall', evaluation_result.pii_recall, split) @staticmethod def _score(model, data): """ Score the model against the data :param model: Trained model :param data: Data split which is being scored. :return: An EvaluationResult containing various metrics """ spacy_evaluator = SpacyEvaluator(model=model) results = [] for text, ground_truth_annotations in data: ground_truth_entities = ground_truth_annotations['entities'] input_sample = InputSample.from_spacy(text, ground_truth_entities) results.append(spacy_evaluator.evaluate_sample(input_sample)) return spacy_evaluator.calculate_score(evaluation_results=results) def _score_validate(self, model_path, test_data_file): """ Validation step for the model. Also prints the scores. :param model_path: Path to trained model. :param test_data_file: Data file which has the dataset for this split. :return: None. Prints the scores. """ with open(test_data_file, 'rb') as f: valid_data = pickle.load(f) nlp = spacy.load(model_path) self.print_scores('Valid', self._score(nlp, valid_data)) # @plac.annotations( # model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), # output_dir=("Optional output directory", "option", "o", Path), # n_iter=("Number of training iterations", "option", "n", int), # train_file=("File containing pickled training Spacy NER formatted data", "option", "d", Path), # test_file=("File containing pickled test Spacy NER formatted data", "option", "d", Path), # exp_name=("Name of this experiment", "option", "e") # ) def _train(self, model, output_dir, n_iter, train_file, exp_name): """Load the model, set up the pipeline and train the entity recognizer.""" nlp = self.load_or_create_empty_model(model) if "ner" not in nlp.pipe_names: ner = nlp.create_pipe("ner") nlp.add_pipe(ner, last=True) else: ner = nlp.get_pipe("ner") with open(train_file, 'rb') as f: train_data = pickle.load(f) # DEBUG train_data = train_data[:50] # add labels for _, annotations in train_data: for ent in annotations.get("entities"): ner.add_label(ent[2]) # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"] with nlp.disable_pipes(*other_pipes): # only train NER # reset and initialize the weights randomly – but only if we're # training a new model if model is None: nlp.begin_training() for itn in range(n_iter): random.shuffle(train_data) losses = {} # batch up the examples using spaCy's minibatch batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) for batch in batches: texts, annotations = zip(*batch) nlp.update( texts, annotations, drop=self.dropout, losses=losses, ) logging.debug("Losses", losses) if self.has_aml: self.aml_run.log('Losses', losses['ner']) self.print_scores('Itn {}'.format(itn), self._score(nlp, train_data)) self.print_scores('Train', self._score(nlp, train_data)) saved_model_path = self.save_model(exp_name, nlp, output_dir) return saved_model_path @staticmethod def save_model(exp_name, model, output_dir): """ Saves model to disk for later use. :param exp_name: Name of the running experiment. This is used as folder name for storing the model. :param model: Model being saved :param output_dir: Directory where to save the model. :return: Full path to saved model. """ saved_model_path = Path(output_dir, exp_name) if not saved_model_path.exists(): saved_model_path.mkdir(parents=True) model.to_disk(saved_model_path) logging.info("Saved model to {}".format(output_dir)) return saved_model_path @staticmethod def load_model(exp_name, model_dir): """ Loads a spacy model from disk :param exp_name: Name of experiment under which the model was saved :param model_dir: path to saved model :return: spacy model """ saved_model_path = Path(model_dir, exp_name) return spacy.load(saved_model_path) @staticmethod def load_or_create_empty_model(model=None): """ Loads a given model or creates a blank english model. :param model: Optional Model to load. :return: Loaded or blank model. """ if model: nlp = spacy.load(model) logging.debug("Loaded model {}".format(model)) else: nlp = spacy.blank("en") logging.debug("Created blank 'en' model") return nlp
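# Hypothetical invocation of the retrainer above; paths and the experiment name
# are placeholders, and aml_config=None skips Azure ML tracking entirely.
retrainer = SpacyRetrainer(original_model_name="en_core_web_lg",
                           experiment_name="spacy_ner_retrain",
                           n_iter=30,
                           aml_config=None,  # set to 'config.json' to log to Azure ML
                           train_pickle="../data/train.pickle",
                           test_pickle="../data/test.pickle")
retrainer.run()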
class ExploratoryAnalysis(): def __init__(self): self.__parser = argparse.ArgumentParser("preprocessing") self.__parser.add_argument("--datastore", type=str, help="Name of the datastore", default="workspaceblobstore") self.__parser.add_argument("--dataset_name", type=str, help="Name of the dataset") self.__parser.add_argument("--dataset_preprocessed_name", type=str, help="Standard preprocessed dataset") self.__parser.add_argument("--output_preprocess_dataset", type=str, help="Name of the PipelineData reference") self.__args = self.__parser.parse_args() self.__run = Run.get_context() self.__local_run = type(self.__run) == _OfflineRun if self.__local_run: self.__ws = Workspace.from_config('../../notebooks-settings') self.__exp = Experiment(self.__ws, 'exploratory_analysis') self.__run = self.__exp.start_logging() else: self.__ws = self.__run.experiment.workspace self.__exp = self.__run.experiment self.__datastore = Datastore.get(self.__ws, datastore_name=self.__args.datastore) def main(self): df, df_eda = self.__preprocess_dataset( schema_path="./schema_dataset.json") self.__make_exploratory_analysis(df_eda) self.__upload_datasets(df, df.columns) def __preprocess_dataset(self, schema_path): with open(schema_path) as f: schema = json.load(f) df, df_eda = self.__get_dataset(self.__args.dataset_name) columns_names = schema.keys() df.columns = columns_names return df, df_eda def __make_exploratory_analysis(self, df): self.__frequency_tremor(df) self.__tremor_acceleration_energy(df) self.__execute_tsne(df) def __get_dataset(self, dataset_name): acc0 = self.__ws.datasets.get( f"{dataset_name}0_dataset").to_pandas_dataframe() acc1 = self.__ws.datasets.get( f"{dataset_name}1_dataset").to_pandas_dataframe() acc2 = self.__ws.datasets.get( f"{dataset_name}2_dataset").to_pandas_dataframe() acc3 = self.__ws.datasets.get( f"{dataset_name}3_dataset").to_pandas_dataframe() df_eda = pd.concat([acc0, acc1, acc2, acc3], axis=0) df_eda['Tremor'] = df_eda['Tremor'].replace(2, 1) df = pd.concat([acc0, acc1, acc2, acc3], axis=0) df['Tremor'] = df['Tremor'].replace(2, 1) return df, df_eda def __upload_datasets(self, df, columns): dataset_name, preprocess_filepath, datastore_path = self.__get_dataset_metadata( df, "train") self.__upload_dataset(self.__ws, self.__datastore, dataset_name, datastore_path, preprocess_filepath, use_datadrift=False, type_dataset="standard") def __get_dataset_metadata(self, df, extension): dataset_name = f'{self.__args.dataset_preprocessed_name}_{extension}' output_preprocessed_directory = self.__args.output_preprocess_dataset if extension == "train" else f'{self.__args.output_preprocess_dataset}_{extension}' preprocess_filepath = os.path.join(output_preprocessed_directory, f'{dataset_name}.csv') datastore_path = f"parkinson/{dataset_name}.csv" os.makedirs(output_preprocessed_directory, exist_ok=True) df.to_csv(preprocess_filepath, index=False) return dataset_name, preprocess_filepath, datastore_path def __upload_dataset(self, ws, def_blob_store, dataset_name, datastore_path, filepath, use_datadrift, type_dataset): def_blob_store.upload_files([filepath], target_path="parkinson", overwrite=True) tab_data_set = Dataset.Tabular.from_delimited_files( path=(def_blob_store, datastore_path)) try: tab_data_set.register(workspace=ws, name=f'{dataset_name}', description=f'{dataset_name} data', tags={ 'format': 'CSV', 'use_datadrift': use_datadrift, 'type_dataset': type_dataset }, create_new_version=True) except Exception as ex: print(ex) def __frequency_tremor(self, df): axis = ["accZ_mean", 
"accX_mean"] for axi in axis: sns.set_style('whitegrid') plt.rcParams['font.family'] = 'Dejavu Sans' plt.figure(figsize=(16, 8)) sns.set_palette("Set1", desat=0.80) facetgrid = sns.FacetGrid(df, hue='Tremor', size=6, aspect=2) facetgrid.map(sns.distplot, f"{axi}", hist=False)\ .add_legend() self.__run.log_image(f"Parkinson Tremor - {axi}", plot=plt) def __tremor_acceleration_energy(self, df): plt.figure(figsize=(6, 8)) sns.boxplot(x='Tremor', y='accX_energy', data=df, showfliers=False, saturation=1) plt.ylabel('Acceleration Energy X') self.__run.log_image(f"Parkinson Tremor - Acceleration Energy X", plot=plt) def __perform_tsne(self, X_data, y_data, perplexities, n_iter=1000, img_name_prefix='t-sne'): for index, perplexity in enumerate(perplexities): print( '\nperforming tsne with perplexity {} and with {} iterations at max' .format(perplexity, n_iter)) X_reduced = TSNE(verbose=2, perplexity=perplexity).fit_transform(X_data) print('Done..') print('Creating plot for this t-sne visualization..') df = pd.DataFrame({ 'x': X_reduced[:, 0], 'y': X_reduced[:, 1], 'label': y_data }) sns.lmplot(data=df, x='x', y='y', hue='label', fit_reg=False, size=8,\ palette="Set1",markers=['*', 'o']) plt.title("perplexity : {} and max_iter : {}".format( perplexity, n_iter)) img_name = img_name_prefix + '_perp_{}_iter_{}.png'.format( perplexity, n_iter) print('saving this plot as image in present working directory...') self.__run.log_image(f"Parkinson Tremor - {img_name}", plot=plt) def __execute_tsne(self, df): X_norm = normalize(df.drop(['Tremor'], axis=1), norm='l2') X_new2 = MinMaxScaler().fit_transform(X_norm) X_pre_tsne = X_new2 y_pre_tsne = df['Tremor'] self.__perform_tsne(X_data=X_pre_tsne, y_data=y_pre_tsne, perplexities=[2, 5, 10, 20, 50])
def main(): config = utils.load_yaml(args.config) task = config['task'] EPOCHS = config['epoch'] N_FOLDS = 5 BATCH_SIZE = config['batchsize'] IMAGE_SIZE = config['image_size'] model_name = config['model'] optimizer_name = config['optimizer'] loss = config['loss'] lr = float(config['lr']) n_class = config['n_class'] lr_scheduler = config.get('lr_scheduler') azure_run = None tb_writer = None num_workers = 64 experiment_name = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S') print(f'found {torch.cuda.device_count()} gpus !!') try: if args.debug: print('running in debug mode') EPOCHS = 1 N_FOLDS = 2 if args.debug: result_dir = Path(utils.RESULT_DIR) / ('debug-' + experiment_name) else: result_dir = Path(utils.RESULT_DIR) / experiment_name ws = Workspace.from_config('.aml_config/config.json') exp = Experiment(workspace=ws, name='kaggle-aptos2019') azure_run = exp.start_logging() azure_run.log('experiment name', experiment_name) azure_run.log('epoch', EPOCHS) azure_run.log('batch size', BATCH_SIZE) azure_run.log('image size', IMAGE_SIZE) azure_run.log('model', model_name) azure_run.log('optimizer', optimizer_name) azure_run.log('loss_name', loss['name']) azure_run.log('lr', lr) azure_run.log('lr_scheduler', lr_scheduler) azure_run.log('task', task) if args.cv: azure_run.log('cv', N_FOLDS) else: azure_run.log('cv', 0) if args.multi: print('use multi gpu !!') os.mkdir(result_dir) print(f'created: {result_dir}') utils.save_yaml(result_dir / Path(args.config).name, config) # if not args.debug: # tb_writer = SummaryWriter(log_dir=result_dir) device = torch.device("cuda:0") config = { 'epochs': EPOCHS, 'multi': args.multi, 'batch_size': BATCH_SIZE, 'image_size': IMAGE_SIZE, 'model_name': model_name, 'n_class': n_class, 'optimizer_name': optimizer_name, 'loss': loss, 'lr': lr, 'lr_scheduler': lr_scheduler, 'task': task, 'device': device, 'num_workers': num_workers, } print(config) if not args.debug: slack.notify_start(experiment_name, config) train_df = pd.read_csv(utils.TRAIN_CSV_PATH) if args.debug: train_df = train_df[:1000] config['df'] = train_df skf = StratifiedKFold(n_splits=N_FOLDS, random_state=41, shuffle=True) indices = list(skf.split(train_df, train_df['diagnosis'])) if not args.cv: print('do not use cross validation') indices = [indices[0]] # cross validation oof_preds = np.zeros((len(train_df), n_class)) for i_fold, (train_index, valid_index) in tqdm(enumerate(indices)): model_path = result_dir / f'model_fold{i_fold}' config['train_index'] = train_index config['valid_index'] = valid_index config['model_path'] = str(model_path) if azure_run: if i_fold == 0: config['azure_run'] = azure_run y_pred, y_true = utils.run_model(**config) else: with azure_run.child_run() as child: config['azure_run'] = child y_pred, y_true = utils.run_model(**config) else: y_pred, y_true = utils.run_model(**config) if args.cv: oof_preds[valid_index] = y_pred if args.cv: valid_preds = oof_preds valid_true = train_df['diagnosis'] else: valid_preds = y_pred valid_true = y_true if task == 'class': round_valid_preds = np.argmax(valid_preds, axis=1) elif task == 'reg': print('optimizing threshold ...') optR = utils.OptimizedRounder() optR.fit(valid_preds, valid_true) coef = optR.coefficients() print(f'best coef: {coef}') if azure_run: azure_run.log('coef', coef) round_valid_preds = optR.predict(valid_preds, coef) val_kappa = cohen_kappa_score(round_valid_preds, valid_true, weights='quadratic') print(f'best val kappa: {val_kappa}') if azure_run: azure_run.log('best val kappa', val_kappa) test_csv = 
pd.read_csv(utils.TEST_CSV_PATH) #test_tfms = utils.build_transform(size=IMAGE_SIZE, mode='test') test_tfms = utils.build_transform(size=IMAGE_SIZE, mode='val') test_dataset = RetinopathyDataset(df=test_csv, mode='test', transform=test_tfms, auto_crop=True, add_blur=True) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, pin_memory=True, num_workers=num_workers) test_preds = np.zeros((len(test_csv), n_class)) for i in range(len(indices)): model = utils.load_pytorch_model(model_name, result_dir / f'model_fold{i}', n_class) test_preds += utils.predict(model, test_loader, n_class=n_class, device=device, tta=1) test_preds /= len(indices) if task == 'class': round_test_preds = np.argmax(test_preds, axis=1) elif task == 'reg': round_test_preds = optR.predict(test_preds, coef) submission_csv = pd.read_csv(utils.SAMPLE_SUBMISSION_PATH) submission_csv['diagnosis'] = round_test_preds submission_csv.to_csv(result_dir / 'submission.csv', index=False) print('finish!!!') if not args.debug: slack.notify_finish(experiment_name, config, val_kappa) except KeyboardInterrupt as e: if not args.debug: slack.notify_fail(experiment_name, config, e.__class__.__name__, str(e)) except Exception as e: if azure_run: azure_run.fail(e) if not args.debug: slack.notify_fail(experiment_name, config, e.__class__.__name__, str(e)) raise finally: if azure_run: azure_run.complete() print('close azure_run') if tb_writer: tb_writer.export_scalars_to_json( os.path.join(result_dir, 'all_scalars.json')) tb_writer.close() print('close tb_writer')
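# A stripped-down sketch of the parent/child run pattern used above for
# cross-validation folds; the experiment name and metric values are illustrative.
from azureml.core import Experiment, Workspace

ws = Workspace.from_config()
parent = Experiment(ws, "cv-demo").start_logging()
for fold in range(3):
    with parent.child_run(name=f"fold_{fold}") as child:
        child.log("fold", fold)
        child.log("val_kappa", 0.9)  # placeholder metric value
parent.complete()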
print('The joblib version is {}.'.format(joblib.__version__))
print('The pandas version is {}.'.format(pd.__version__))
#print('The sklearn_pandas version is {}.'.format(sklearn_pandas.__version__))

# +
from azureml.core import Dataset

run = Run.get_context()
if run.id.startswith('OfflineRun'):
    ws = Workspace.from_config()
    experiment_name = 'heart-failure-clinical-data'
    experiment = Experiment(ws, experiment_name)
    interactive_run = experiment.start_logging()
else:
    ws = run.experiment.workspace

ds = Dataset.get_by_name(ws, name='Heart Failure Prediction')
# -

x, y = get_x_y(ds)

# +
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y,
"train":{"X": X_train, "y": y_train}, "test":{"X": X_test, "y": y_test} } # In[2]: # Get an experiment object from Azure Machine Learning from sklearn.linear_model import Ridge from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split from sklearn.externals import joblib experiment = Experiment(workspace=ws, name="train-within-notebook") run = experiment.start_logging(snapshot_directory=None) # Create a run object in the experiment # Log the algorithm parameter alpha to the run run.log('alpha', 0.03) # Create, fit, and test the scikit-learn Ridge regression model regression_model = Ridge(alpha=0.03) regression_model.fit(data['train']['X'], data['train']['y']) preds = regression_model.predict(data['test']['X']) # Output the Mean Squared Error to the notebook and to the run print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds)) run.log('mse', mean_squared_error(data['test']['y'], preds)) # Save the model to the outputs directory for capture
def analyse_with_gordo(): ws = Workspace.from_config() # Azure ML # Get an experiment object from Azure Machine Learning experiment_name = "dummy_test" experiment = Experiment(workspace=ws, name=experiment_name) # Azure ML mlflow.set_experiment(experiment_name) # MLFlow resamples_for_model = ["1T", "1H"] aggregation_methods = ["max", "mean"] batch_sizes = [1, 10, 100] epochs = [1, 10] number_of_permutations = len( list( itertools.product( aggregation_methods, resamples_for_model, batch_sizes, epochs ) ) ) resampled_original_data = read_and_resample("2nd_test.hdf", "1S") if PLOTTING: plotnum = 0 f, axarr = plt.subplots(number_of_permutations + 1, sharex=True) axarr[plotnum].plot( resampled_original_data, linewidth=1, label="sensor_data_1S_mean" ) axarr[plotnum].legend(loc="upper left") plotnum += 1 for aggregation_method, interval, batch_size, epoch in itertools.product( aggregation_methods, resamples_for_model, batch_sizes, epochs ): run = experiment.start_logging() with mlflow.start_run(): mlflow.log_param("interval", interval) # MLFlow mlflow.log_param("aggregation_method", aggregation_method) # MLFlow mlflow.log_param("batch_size", batch_size) # MLFlow mlflow.log_param("epochs", epoch) # MLFlow run.log("interval", interval) # Azure ML run.log("aggregation_method", aggregation_method) # Azure ML run.log("batch_size", batch_size) # Azure ML run.log("epochs", epoch) # Azure ML print( f"Build model for data resampled with {interval} resolution, method {aggregation_method}, batch size {batch_size} and number of epochs {epoch}" ) resampled = read_and_resample( "2nd_test.hdf", interval, aggregation_method=aggregation_method ) anomalies, avg_train_anomaly, predicted_data, train_until_index = build_model( resampled, epoch, batch_size ) r2_train, expl_train, r2_test, expl_test = calc_scores( resampled, predicted_data, train_until_index ) run.log("r2_train", r2_train) # Azure ML run.log("explained_variance_train", expl_train) # Azure ML run.log("r2_test", r2_test) # Azure ML run.log("explained_variance_test", expl_test) # Azure ML mlflow.log_metric("r2_train", r2_train) # MLFlow mlflow.log_metric("explained_variance_train", expl_train) # MLFlow mlflow.log_metric("r2_test", r2_test) # MLFlow mlflow.log_metric("explained_variance_test", expl_test) # MLFlow anomalies = anomalies.rolling( resamples_for_model[-1] ).mean() # Use the last of the experiment resamples as the anomaly resample if PLOTTING: axarr[plotnum].plot( anomalies, label=interval + "-" + aggregation_method + "-model" ) axarr[plotnum].axhline(avg_train_anomaly, color="r") axarr[plotnum].legend(loc="upper left") plotnum += 1 run.complete() # Azure ML if PLOTTING: plt.show()
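# Optional alternative to the double-logging above (a sketch, assuming the
# azureml-mlflow package is installed): point MLflow's tracking at the Azure ML
# workspace so a single mlflow.log_* call lands in both backends.
import mlflow
from azureml.core import Workspace

ws = Workspace.from_config()
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment("dummy_test")
with mlflow.start_run():
    mlflow.log_param("interval", "1T")
    mlflow.log_metric("r2_test", 0.91)  # placeholder value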
# define compile to minimize categorical loss, use the Adadelta optimizer, and optimize for accuracy
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

experiment_name = 'fashion-mnist'
experiment = Experiment(workspace=ws, name=experiment_name)

# Define early stopping callback
my_callbacks = [EarlyStopping(monitor='val_acc', patience=5, mode='max')]

os.makedirs('./outputs', exist_ok=True)

# Train the model and test/validate the model with the test data after each cycle (epoch) through the training data
# Return history of loss and accuracy for each epoch
with experiment.start_logging() as run:
    run.tag("Description", "Locally trained Fashion MNIST model")
    hist = model.fit(x_train, y_train,
                     batch_size=batch_size,
                     epochs=epochs,
                     verbose=1,
                     callbacks=my_callbacks,
                     validation_data=(x_test, y_test))
    run.log_list('Training Accuracy', hist.history['acc'])
    run.log_list('Validation Accuracy', hist.history['val_acc'])

# Evaluate the model with the test data to get the scores on "real" data.
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
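# A possible follow-up (not part of the original snippet): persist the trained
# Keras model and attach it to the run. This is best placed inside the
# with-block above so the artifact is added before the run completes;
# the file and model names are placeholders.
model_path = './outputs/fashion_mnist.h5'
model.save(model_path)
run.upload_file(name='fashion_mnist.h5', path_or_stream=model_path)
run.register_model(model_name='fashion-mnist-cnn', model_path='fashion_mnist.h5')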
class Deployment(): def __init__(self): self.__parser = argparse.ArgumentParser("deploy") self.__parser.add_argument( "--update_deployment", type=distutils.util.strtobool, help= "Deployment Flag. False=Generate deploy from scratch, True=Update Service" ) self.__parser.add_argument("--dataset_name", type=str, help="Dataset name") self.__parser.add_argument("--model_name", type=str, help="Model name") self.__parser.add_argument("--explainer_model_name", type=str, help="Explainer model name") self.__parser.add_argument("--service_name", type=str, help="Service name") self.__args = self.__parser.parse_args() self.__run = Run.get_context() self.__local_run = type(self.__run) == _OfflineRun if self.__local_run: self.__ws = Workspace.from_config('../notebooks-settings') self.__exp = Experiment(self.__ws, 'deploy_service') self.__run = self.__exp.start_logging() else: self.__ws = self.__run.experiment.workspace self.__exp = self.__run.experiment self.__config = configparser.ConfigParser() self.__config.read("./config.ini") def main(self): dataset = Dataset.get_by_name(self.__ws, self.__args.dataset_name) df = dataset.to_pandas_dataframe() df = df.drop(['target'], axis=1) columns = [*df.columns] parameters = { "model_name": self.__args.model_name, "explainer_model_name": self.__args.explainer_model_name, "dataset_columns": columns } joblib.dump( parameters, os.path.join(self.__config.get('DEPLOY', 'DEPENDENCIES_DIRECTORY'), "deploy_parameters.pkl")) self.__deploy_model() def __deploy_model(self): service_name = self.__args.service_name model = Model(self.__ws, self.__args.model_name) explainer_model = Model(self.__ws, self.__args.explainer_model_name) myenv = Environment.from_conda_specification( name=self.__config.get('DEPLOY', 'ENV_NAME'), file_path=self.__config.get('DEPLOY', 'ENV_FILE_PATH')) inference_config = InferenceConfig( entry_script=self.__config.get('DEPLOY', 'SCORE_PATH'), environment=myenv, source_directory=self.__config.get('DEPLOY', 'DEPENDENCIES_DIRECTORY')) if not self.__args.update_deployment: deployment_config = AciWebservice.deploy_configuration( cpu_cores=self.__config.getint('DEPLOY', 'ACI_CPU'), memory_gb=self.__config.getint('DEPLOY', 'ACI_MEM'), collect_model_data=True, enable_app_insights=True) service = Model.deploy(self.__ws, service_name, [model, explainer_model], inference_config, deployment_config) else: service = AciWebservice(self.__ws, service_name) service.update(models=[model, explainer_model], inference_config=inference_config) service.wait_for_deployment(show_output=True) print(service.state) print(service.get_logs())
experiment = Experiment(workspace=ws, name="diabetes-experiment") x_df = Diabetes.get_tabular_dataset().to_pandas_dataframe().dropna() y_df = x_df.pop("Y") X_train, X_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=66) # with pd.option_context('display.max_rows', None, 'display.max_columns', None): # print(X_train) alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] for alpha in alphas: run = experiment.start_logging() run.log("alpha_value", alpha) model = Ridge(alpha=alpha) model.fit(X=X_train, y=y_train) y_pred = model.predict(X=X_test) rmse = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)) run.log("rmse", rmse) model_name = "model_alpha_" + str(alpha) + ".pkl" filename = "outputs/" + model_name joblib.dump(value=model, filename=filename) run.upload_file(name=model_name, path_or_stream=filename) run.complete()
workspace_name = "" workspace_region = "" ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name) # COMMAND ---------- # create experiment experiment_name = 'bikeSharingDemand' exp = Experiment(workspace=ws, name=experiment_name) # COMMAND ---------- run = exp.start_logging(snapshot_directory=None) # COMMAND ---------- df = (spark.read.format("csv").option("inferSchema", "True").option( "header", "True").load("/databricks-datasets/bikeSharing/data-001/day.csv")) # split data train_df, test_df = df.randomSplit([0.7, 0.3]) # One Hot Encoding mnth_encoder = OneHotEncoder(inputCol="mnth", outputCol="encoded_mnth") weekday_encoder = OneHotEncoder(inputCol="weekday", outputCol="encoded_weekday")
PARSER.add_argument('--AZUREML_ARM_WORKSPACE_NAME')
PARSER.add_argument('--TENANT_ID')
PARSER.add_argument('--APP_ID')
PARSER.add_argument('--APP_SECRET')
ARGS = PARSER.parse_args()

WORKSPACE_NAME = ARGS.AZUREML_ARM_WORKSPACE_NAME
RESOURCE_GROUP = ARGS.AZUREML_ARM_RESOURCEGROUP
SUBSCRIPTION_ID = ARGS.AZUREML_ARM_SUBSCRIPTION
TENANT_ID = ARGS.TENANT_ID
APP_ID = ARGS.APP_ID
APP_SECRET = ARGS.APP_SECRET

SP_AUTH = ServicePrincipalAuthentication(tenant_id=TENANT_ID,
                                         service_principal_id=APP_ID,
                                         service_principal_password=APP_SECRET)

WORKSPACE = Workspace.get(WORKSPACE_NAME, SP_AUTH, SUBSCRIPTION_ID, RESOURCE_GROUP)

EXPERIMENT = Experiment(workspace=WORKSPACE, name="trainpipeline")
print(EXPERIMENT.name, EXPERIMENT.workspace.name, sep='\n')

EXPERIMENT_RUN = EXPERIMENT.start_logging()
EXPERIMENT_RUN.log('my magic number', 45)
EXPERIMENT_RUN.complete()
class BuildFairnLearnModels(): def __init__(self): self.__parser = argparse.ArgumentParser("fairlearn") self.__parser.add_argument("--dataset_name", type=str, default="heart_disease_preprocessed_train", help="Name of the dataset") self.__args = self.__parser.parse_args() self.__run = Run.get_context() self.__local_run = type(self.__run) == _OfflineRun if self.__local_run: self.__ws = Workspace.from_config('../../notebooks-settings') self.__exp = Experiment(self.__ws, 'fairlearn') self.__run = self.__exp.start_logging() else: self.__ws = self.__run.experiment.workspace self.__exp = self.__run.experiment self.__sensitive_features = ['asthmatic', 'diabetic', 'smoker'] def main(self): dataset = self.__get_dataset(self.__args.dataset_name) df = dataset.to_pandas_dataframe() X_raw, Y, A, X = self.__transform_df(df) X_train, X_test, Y_train, Y_test, A_train, A_test = self.__df_train_split( X_raw, Y, A, X) clf = Pipeline(steps=[('classifier', LogisticRegression( solver='liblinear', fit_intercept=True))]) model = clf.fit(X_train, Y_train) predictors = self.__mitigation_with_gridsearch( X_train, A_train, Y_train, model) all_results = self.__remove_predictors_dominated_error_disparity_by_sweep( predictors, X_train, Y_train, A_train) dominant_models_dict, all_models_dict = self.__generate_dominant_models( model, all_results) models_all = self.__build_predictions_all_models( all_models_dict, X_test) dominant_all = self.__build_predictions_dominant_models( dominant_models_dict, X_test) os.makedirs('models', exist_ok=True) model_name_id_mapping = self.__get_dominant_models_names(dominant_all) dominant_all_ids = self.__get_dominant_models_id( dominant_all, model_name_id_mapping) dash_dict_all = self.__get_dashboard_dict( A_test, Y_test, dominant_all_ids) self.__plot_all_multimodel_by_feature(dash_dict_all) self.__upload_best_disparity_model_by_feature( dash_dict_all['precomputedMetrics'], dominant_all) self.__upload_dashboard_dict(dash_dict_all) def __get_dataset(self, dataset_name): return self.__ws.datasets.get(dataset_name) def __transform_df(self, df): X_raw = df.drop(['target'], axis=1) Y = df['target'] A = X_raw[self.__sensitive_features] X = X_raw.drop(labels=self.__sensitive_features, axis=1) return X_raw, Y, A, X def __df_train_split(self, X_raw, Y, A, X): X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split(X_raw, Y, A, test_size=0.3, random_state=123, stratify=Y, shuffle=True) X_train = X_train.reset_index(drop=True) A_train = A_train.reset_index(drop=True) X_test = X_test.reset_index(drop=True) A_test = A_test.reset_index(drop=True) A_test.diabetic.loc[(A_test['diabetic'] == 0)] = 'not diabetic' A_test.diabetic.loc[(A_test['diabetic'] == 1)] = 'diabetic' A_test.asthmatic.loc[(A_test['asthmatic'] == 0)] = 'not asthmatic' A_test.asthmatic.loc[(A_test['asthmatic'] == 1)] = 'asthmatic' A_test.smoker.loc[(A_test['smoker'] == 0)] = 'not smoker' A_test.smoker.loc[(A_test['smoker'] == 1)] = 'smoker' return X_train, X_test, Y_train, Y_test, A_train, A_test def __mitigation_with_gridsearch(self, X_train, A_train, Y_train, fitted_model): sweep = GridSearch(LogisticRegression(solver='liblinear', fit_intercept=True), constraints=DemographicParity(), grid_size=70) sweep.fit(X_train, Y_train, sensitive_features=A_train.diabetic) predictors = sweep._predictors return predictors def __remove_predictors_dominated_error_disparity_by_sweep(self, predictors, X_train, Y_train, A_train): errors, disparities = [], [] for m in predictors: def classifier(X): return m.predict(X) error = ErrorRate() 
error.load_data(X_train, pd.Series(Y_train), sensitive_features=A_train.diabetic) disparity = DemographicParity() disparity.load_data(X_train, pd.Series( Y_train), sensitive_features=A_train.diabetic) errors.append(error.gamma(classifier)[0]) disparities.append(disparity.gamma(classifier).max()) return pd.DataFrame({"predictor": predictors, "error": errors, "disparity": disparities}) def __generate_dominant_models(self, model, all_results): all_models_dict = {"heart_disease_unmitigated": model} dominant_models_dict = {"heart_disease_unmitigated": model} base_name_format = "heart_disease_grid_model_{0}" row_id = 0 for row in all_results.itertuples(): model_name = base_name_format.format(row_id) all_models_dict[model_name] = row.predictor errors_for_lower_or_eq_disparity = all_results[ "error"][all_results["disparity"] <= row.disparity] if row.error <= errors_for_lower_or_eq_disparity.min(): dominant_models_dict[model_name] = row.predictor row_id = row_id + 1 return dominant_models_dict, all_models_dict def __build_predictions_all_models(self, all_models_dict, X_test): dashboard_all = dict() models_all = dict() for name, predictor in all_models_dict.items(): value = predictor.predict(X_test) dashboard_all[name] = value models_all[name] = predictor return models_all def __build_predictions_dominant_models(self, dominant_models_dict, X_test): dominant_all = dict() for n, p in dominant_models_dict.items(): dominant_all[n] = p.predict(X_test) return dominant_all def __get_dominant_models_id(self, dominant_all, model_name_id_mapping): dominant_all_ids = dict() for name, y_pred in dominant_all.items(): dominant_all_ids[model_name_id_mapping[name]] = y_pred return dominant_all_ids def __get_dashboard_dict(self, A_test, Y_test, dominant_all_ids): sf = {'diabetic': A_test.diabetic, 'asthmatic': A_test.asthmatic, 'smoker': A_test.smoker} return _create_group_metric_set(y_true=Y_test, predictions=dominant_all_ids, sensitive_features=sf, prediction_type='binary_classification') def __register_model(self, name, model, disparity=""): print("Registering ", name) model_path = "models/{0}.pkl".format(name) joblib.dump(value=model, filename=model_path) registered_model = Model.register(model_path=model_path, model_name=name, workspace=self.__ws, properties={ "root_run_id": self.__run._root_run_id, "child_run_id": self.__run.id, "experiment": self.__run.experiment.name}, tags={"disparity": f'{disparity}%'}) print("Registered ", registered_model.id) return registered_model.id def __get_dominant_models_names(self, dominant_all): model_name_id_mapping = dict() for name, model in dominant_all.items(): m_id = self.__register_model(name, model) model_name_id_mapping[name] = m_id return model_name_id_mapping def __upload_dashboard_dict(self, dash_dict_all): run = self.__exp.start_logging() try: dashboard_title = "Upload MultiAsset from Grid Search with heart-disease data" upload_id = upload_dashboard_dictionary(run, dash_dict_all, dataset_name=self.__args.dataset_name, dashboard_name=dashboard_title) finally: run.complete() def __difference_selection_rate(self, selection_rate): return abs(selection_rate[0]-selection_rate[1]) def __build_models_metrics(self, tags, feature_models, feature): tags[feature]['disparity'].append(self.__difference_selection_rate( feature_models['selection_rate']['bins'])) def __upload_best_disparity_model_by_feature(self, dash_dict_all, dominant_all): tags = {} for i, feature in enumerate(self.__sensitive_features): tags[feature] = {} tags[feature]['disparity'] = [] list(map(lambda 
feature_models: self.__build_models_metrics( tags, feature_models, feature), dash_dict_all[i])) model_info = tuple(dominant_all.items())[ tags[feature]['disparity'].index(min(tags[feature]['disparity']))] self.__register_model( f'{feature}', model_info[1], min(tags[feature]['disparity'])) def __scatterplot(self, disparities, accuracy_scores, legend, feature): plt.figure(figsize=(12, 7), dpi=80) colors = np.random.rand(len(accuracy_scores), 4) for accuracy, disparity, model_name, color in zip(accuracy_scores, disparities, legend, colors): plt.scatter(accuracy, disparity, c=[ color], s=170, label=model_name, alpha=0.3) plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left") plt.title('Multi model view - Models Comparison') plt.xlabel("Accuracy") plt.ylabel("Disparity in predictions") plt.grid() self.__run.log_image( f'Multi model view - Models Comparison of {feature}', plot=plt) def __get_models_metrics(self, feature_models, disparities, accuracy_scores): disparities.append(self.__difference_selection_rate( feature_models['selection_rate']['bins'])) accuracy_scores.append(feature_models['accuracy_score']['global']) def __plot_all_multimodel_by_feature(self, dash_dict_all): for feature in self.__sensitive_features: self.__plot_multimodel_view_by_feature(feature, dash_dict_all) def __plot_multimodel_view_by_feature(self, feature, dash_dict_all): disparities = [] accuracy_scores = [] list(map(lambda feature_models: self.__get_models_metrics(feature_models, disparities, accuracy_scores), dash_dict_all['precomputedMetrics'][self.__sensitive_features.index(feature)])) self.__scatterplot(disparities, accuracy_scores, dash_dict_all['modelNames'], feature)
class DifferentialPrivacy(): def __init__(self): self.__parser = argparse.ArgumentParser("differential_privacy") self.__parser.add_argument("--datastore", type=str, help="Name of the datastore", default="workspaceblobstore") self.__parser.add_argument( "--dataset_name", type=str, help="Name of the dataset") self.__parser.add_argument( "--retrain_status", type=distutils.util.strtobool, help="Retrain status") self.__args = self.__parser.parse_args() self.__run = Run.get_context() self.__local_run = type(self.__run) == _OfflineRun if self.__local_run: self.__ws = Workspace.from_config('../../notebooks-settings') self.__exp = Experiment(self.__ws, 'differential_privacy') self.__run = self.__exp.start_logging() else: self.__ws = self.__run.experiment.workspace self.__exp = self.__run.experiment self.__datastore = Datastore.get( self.__ws, datastore_name=self.__args.datastore) def main(self): if not self.__args.retrain_status: self.__main_execution() else: self.__run.add_properties( {'status': "The following step have been skipped because a retraining pipeline have been launched"}) def __main_execution(self): with wn.Analysis() as analysis: data, self.__nsize = self.__get_dp_noise_dataset() sex_histogram_geometric, sex_histogram_laplace = self.__create_sex_histograms( data) state_histogram_geometric, state_histogram_laplace = self.__create_state_histograms( data) age_histogram_geometric, age_histogram_laplace = self.__create_age_histograms( data) analysis.release() n_sex, n_state, n_age = self.__create_and_upload_real_data_histograms() self.__show_dp_and_real_histogram( "Sex", ['female', 'male'], n_sex, sex_histogram_geometric, sex_histogram_laplace) self.__show_dp_and_real_histogram("State", self.get_states( ), n_state, state_histogram_geometric, state_histogram_laplace) self.__show_dp_and_real_histogram("Age", list( range(20, 80, 10)), n_age, age_histogram_geometric, age_histogram_laplace) if self.__local_run: self.__run.complete() def get_columns(self): df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe() return [*df.columns] def get_states(self): df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe() return [*df['state'].unique()] def __get_dp_noise_dataset(self): df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe() df.to_csv('tmp.csv', index=False) return wn.Dataset(path='tmp.csv', column_names=self.get_columns()), len(df.index) def __get_dataset(self, dataset_name): return self.__ws.datasets.get(dataset_name) def __create_sex_histograms(self, data): sex_histogram_geometric = wn.dp_histogram( wn.to_bool(data['sex'], true_label="0"), upper=self.__nsize, privacy_usage={'epsilon': .5, 'delta': 0.00001} ) sex_prep = wn.histogram(wn.to_bool( data['sex'], true_label="0"), null_value=True) sex_histogram_laplace = wn.laplace_mechanism( sex_prep, privacy_usage={"epsilon": 0.4, "delta": .000001}) return sex_histogram_geometric, sex_histogram_laplace def __create_state_histograms(self, data): states = self.get_states() state_histogram_geometric = wn.dp_histogram( data['state'], categories=states, null_value=states[0], privacy_usage={'epsilon': 0.2} ) state_prep = wn.histogram(data['state'], categories=states, null_value=states[0]) state_histogram_laplace = wn.laplace_mechanism(state_prep, privacy_usage={"epsilon": 0.5, "delta": .000001}) return state_histogram_geometric, state_histogram_laplace def __create_age_histograms(self, data): age_edges = list(range(20, 80, 10)) age_histogram_geometric = wn.dp_histogram( wn.to_int(data['age'], lower=20, 
upper=80), edges=age_edges, upper=self.__nsize, null_value=20, privacy_usage={'epsilon': 0.5} ) age_prep = wn.histogram(wn.to_int(data['age'], lower=20, upper=80), edges=age_edges, null_value=20) age_histogram_laplace = wn.laplace_mechanism( age_prep, privacy_usage={"epsilon": 0.5, "delta": .000001}) return age_histogram_geometric, age_histogram_laplace def __create_and_upload_real_data_histograms(self): df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe() sex = list(df[:]['sex']) state = list(df[:]['state']) age = list(df[:]['age']) n_sex = self.__upload_real_data_histogram(sex, [-0.5, 0.5, 1.5], "Sex") n_state = self.__upload_real_data_histogram( state, list(range(6)), "State") n_age = self.__upload_real_data_histogram( age, list(range(20, 90, 10)), "Age") return n_sex, n_state, n_age def __upload_real_data_histogram(self, data, bins, title): n_data, bins, _ = plt.hist(data, bins=bins, color='#0504aa', alpha=0.7, rwidth=0.85) plt.grid(axis='y', alpha=0.75) plt.xlabel(title) plt.ylabel('Frequency') plt.title(f'True Dataset {title} Distribution') self.__run.log_image( f'Differential Privacy Noise - True Dataset {title} Distribution', plot=plt) plt.clf() return n_data def __plot(self, ax, data, title, colors, xlabels, legend_names, width=0.2): positions = [ [i+width*column for column in range(len(data[0]))] for i in range(len(data))] for position, value in zip(positions, data): ax.bar(position, value, width, alpha=0.75, color=colors ) ax.set_title(title) ax.set_xticks([p[0] + 1.5 * width for p in positions]) ax.set_xticklabels(xlabels) proxies = [ax.bar([0], [0], width=0, color=c, alpha=0.75)[0] for c in colors] ax.legend((proxies), legend_names, loc='upper left') ax.set_xlim(positions[0][0]-width, positions[-1][0]+width*len(data[0])) ax.set_ylim([0, max(max(l) for l in data)*1.2]) plt.grid() self.__run.log_image( f'Differential Privacy - Histograms for {title} Distribution', plot=plt) plt.clf() def __show_dp_and_real_histogram(self, title, labels, n_data, geometric_histogram, laplace_histogram): colorseq = ["forestgreen", "indianred", "orange", "orangered", "orchid"] legend = ['True Value', 'DP Geometric', 'DP Laplace'] fig = plt.figure() ax = fig.add_subplot(111) data = [n_data, geometric_histogram.value, laplace_histogram.value] self.__plot(ax, list(map(list, zip(*data))), title, colorseq, labels, legend)
for compute in ws.compute_targets:
    print(compute)

# Now let's define an experiment.
# An Experiment is nothing but the process used for running a script. An experiment consists of many runs, and we can track all of them.

# let's create the Experiment
from azureml.core import Experiment

experiment = Experiment(ws, "my_exp")
new_run = experiment.start_logging()

# my code.

new_run.complete()

# Now you know we can also log all the metrics and save an entire folder or any file.
import numpy as np
import pandas as pd
import json
from azureml.core import Workspace

ws = Workspace.from_config()
exp = Experiment(ws, "my_exp")
run = exp.start_logging()
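# A minimal sketch of the metric-logging and file-capture calls mentioned above;
# the metric names and file paths are placeholders.
run.log("accuracy", 0.95)
run.log_list("losses", [0.9, 0.5, 0.3])
run.upload_file(name="outputs/metrics.json", path_or_stream="metrics.json")
run.upload_folder(name="outputs/plots", path="./plots")
run.complete()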
class AzureMLTrainer(trainer.Trainer): is_connected: bool = False __config_file: str = '.azureml/config.json' __workspace: Workspace = None __experiment: Experiment = None __current_experiment_name: str __current_run: Run = None __logger: Logger = None __vm_size_list: list = None def __init__(self, experiment_name: str, aml_workspace: Workspace, aml_run: Run = None): ''' Initializes a new connected Trainer that will persist and log all runs on AzureML workspace Args: experiment_name (str): The name of the experiment that will be seen on AzureML aml_workspace (Workspace): The connected workspace on AzureML ''' self.__workspace = aml_workspace self.__logger = logging.getLogger() if aml_run is not None: self.__current_run = aml_run self.__experiment = aml_run.experiment self.__current_experiment_name = aml_run.experiment.name else: self.__current_experiment_name = experiment_name self.__experiment = Experiment(workspace=self.__workspace, name=experiment_name) @classmethod def CreateFromContext(cls): ''' Creates a Trainer, based on the current Run context. This will only work when used in an Estimator Returns: AzureMLTrainer: an instance of AzureMLTrainer allowing the user to work connected. ''' run = Run.get_context() return cls(run.experiment.name, run.experiment.workspace, run) def new_run(self, description: str = None, copy_folder: bool = True, metrics: dict = None) -> Run: ''' This will begin a new interactive run on the existing AzureML Experiment. When a previous run was still active, it will be completed. Args: description (str): An optional description that will be added to the run metadata copy_folder (bool): Indicates if the output folder should be snapshotted and persisted metrics (dict): The metrics that should be logged in the run already Returns: Run: the AzureML Run object that can be used for further access and custom logic ''' if(self.__current_run is not None): self.__current_run.complete() if(copy_folder): self.__current_run = self.__experiment.start_logging() else: self.__current_run = self.__experiment.start_logging(snapshot_directory = None) if(metrics is not None): for k, v in metrics.items(): self.__current_run.log(k, v) if(description is not None): self.__current_run.log('Description', description) return self.__current_run def add_tuning_result(self, run_index: int, train_score: float, test_score: float, sample_count: int, durations:np.array, parameters: dict, estimator): ''' This add results of a cross validation fold to the child run in a Grid Search Args: train_score (float): The given score of the training data test_score (float): The given score of the test data sample_count (int): The number of samples that were part of a fold durations (np.array): The different durations of the Grid Search parameters (dict): The parameter combinations that have been tested in this cross validation fold estimate (model): The actual fitted estimator / model that was trained in this fold ''' _child_run = self.__current_run.child_run('Gridsearch' + str(run_index)) self.__current_run.log_row('Trainscore', score = train_score) self.__current_run.log_row('Testscore', score = test_score) _table = { 'Testing score': test_score, 'Training score': train_score } for k in parameters.keys(): v = parameters[k] if(v is None): v = 'None' _child_run.log(k, v) _table[k] = v self.__current_run.log_row('Results', '', **_table) _child_run.complete() def get_best_model(self, metric_name:str, take_highest:bool = True): ''' Tags and returns the best model of the experiment, based on the given metric 
Args: metric_name (str): The name of the metric, such as accuracy take_highest (bool): In case of accuracy and score, this is typically True. In case you want to get the model based on the lowest error, you can use False Returns: Run: the best run, which will be labeled as best run ''' runs = {} run_metrics = {} for r in tqdm(self.__experiment.get_runs()): metrics = r.get_metrics() if metric_name in metrics.keys(): runs[r.id] = r run_metrics[r.id] = metrics best_run_id = min(run_metrics, key = lambda k: run_metrics[k][metric_name]) best_run = runs[best_run_id] best_run.tag('Best run') return best_run def get_azureml_experiment(self): ''' Gives access to the AzureML experiment object Returns: Experiment: the existing experiment ''' return self.__experiment def complete_run(self, fitted_model, metrics_to_log: dict = None, upload_model: bool = True): ''' Saves all results of the active Run and completes it Args: fitted_model (model): The already fitted model to be tested. Sklearn and Keras models have been tested metrics_to_log (dict): The metrics that should be logged with the model to the run upload_model (bool): This will upload the model (pkl file or json) to AzureML run (defaults to True) ''' is_keras = 'keras' in str(type(fitted_model)) if(metrics_to_log is not None): for k, v in metrics_to_log.items(): self._log_metrics(k, v) if upload_model: # Save the model to the outputs directory for capture if(is_keras): model_folder_name = 'outputs/model' fitted_model.save(model_folder_name) files_to_upload = dict() else: model_file_name = 'outputs/model.pkl' joblib.dump(value = fitted_model, filename = model_file_name) self._complete_run() def evaluate_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, save_curves_as_image: bool = False, class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array: ''' Will predict and evaluate a model against a test set and save all results to the active Run on AzureML Args: fitted_model (model): The already fitted model to be tested. Sklearn and Keras models have been tested X_test (np.array): The test set to calculate the predictions with y_test (np.array): The output test set to evaluate the predictions against show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier save_curves_as_image (bool): This will save the training & loss curves as images class_names (np.array): The class names that will be linked to the Confusion Matrix. 
If not provided, the unique values of the y_test matrix will be used finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True) upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True) return_predictions (bool): If true, the y_pred values will be returned Returns: np.array: The predicted (y_pred) values against the model ''' is_keras = 'keras' in str(type(fitted_model)) # Predict X_test with model if(is_keras): if 'predict_classes' in dir(fitted_model): y_pred = fitted_model.predict_classes(X_test) else: y_pred = fitted_model.predict(X_test) y_pred = np.argmax(y_pred, axis=1) self.add_training_plots(fitted_model, save_image=save_curves_as_image) else: y_pred = fitted_model.predict(X_test) if class_names is None: class_names = np.char.mod('%d', sorted(np.unique(y_test))) # Print classification report print(metrics.classification_report(y_test, y_pred)) # Confusion matrix cf = metrics.confusion_matrix(y_test, y_pred) self._log_confmatrix(cf, class_names) # Accuracy accuracy = metrics.accuracy_score(y_test, y_pred) * 100 self._log_metrics('accuracy', accuracy, description='') if(show_roc == True): # Verify that we are having a binary classifier if(len(class_names)!=2): raise AttributeError('Showing a ROC curve is only possible for binary classifier, not for multi class') self.__log_roc_curve(y_test, y_pred) if (finish_existing_run): self.complete_run(fitted_model, upload_model = upload_model) if return_predictions: return y_pred def add_training_plots(self, fitted_model, metrics=None, save_image: bool = False): ''' Add the training plots to the Run history Args: fitted_model (Keras model): the fitted model that contains the training history metrics (list): the metrics that should be tracked to the run. If None, all available metrics will be taken ''' history = fitted_model.history if metrics is None: metrics = history.history.keys() for metric in metrics: if(metric in history.history.keys()): self.__current_run.log_table(f'Plot {metric}', {metric: history.history[metric]}) if(save_image and not metric.startswith('val_') and metric in history.history.keys()): plt.plot(history.history[metric]) plt.plot(history.history[f'val_{metric}']) plt.title(f'model {metric}') plt.ylabel(metric) plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') #plt.show() self.__current_run.log_image(f'model {metric}', plot=plt) plt.close() def evaluate_image_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, failed_classifications_to_save: int = 0, image_shape = None, save_curves_as_image: bool = False, class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array: ''' Will predict and evaluate a model against a test set and save all results to the active Run on AzureML Args: fitted_model (model): The already fitted model to be tested. Sklearn and Keras models have been tested X_test (np.array): The test set to calculate the predictions with y_test (np.array): The output test set to evaluate the predictions against show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier failed_classifications_to_save (int): If greather than 0, this amount of incorrectly classified images will be tracked to the Run image_shape ((int, int, int)): Indicates if images should be reshaped before saving them class_names (np.array): The class names that will be used in the description. 
If not provided, the unique values of the y_test matrix will be used finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True) upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True) Returns: np.array: The predicted (y_pred) values against the model ''' from arcus.ml.images import explorer y_pred = self.evaluate_classifier(fitted_model, X_test, y_test, show_roc=show_roc, save_curves_as_image=save_curves_as_image, class_names= class_names, finish_existing_run=False, upload_model=upload_model, return_predictions=True) if failed_classifications_to_save > 0: # Take incorrect classified images and save import random incorrect_predictions = [i for i, item in enumerate(y_pred) if item != y_test[i]] total_images = min(len(incorrect_predictions), failed_classifications_to_save) for i in random.sample(incorrect_predictions, total_images): pred_class = y_pred[i] act_class = y_test[i] if class_names is not None: pred_class = class_names[pred_class] act_class = class_names[act_class] if image_shape is not None: # Reshape image before saving it imgplot = explorer.show_image(X_test[i].reshape(image_shape), silent_mode=True) else: imgplot = explorer.show_image(X_test[i], silent_mode=True) description = f'Predicted {pred_class} - Actual {act_class}' self.__current_run.log_image(description, plot=imgplot) if return_predictions: return y_pred def __stack_images(self, img1: np.array, img2: np.array): ha,wa = img1.shape[:2] hb,wb = img2.shape[:2] max_width = np.max([wa, wb]) total_height = ha+hb new_img = np.zeros(shape=(total_height, max_width, 3)) new_img[:ha,:wa]=img1 new_img[ha:hb+ha,:wb]=img2 return new_img def __concat_images(self, image_list: np.array) -> np.array: output = None for i, img in enumerate(image_list): if i==0: output = img else: output = self.__stack_images(output, img) return output def save_image_outputs(self, X_test: np.array, y_test: np.array, y_pred: np.array, samples_to_save: int = 1) -> np.array: ''' Will save image outputs to the run Args: X_test (np.array): The input images for the model y_test (np.array): The actual expected output images of the model y_pred (np.array): The predicted or calculated output images of the model samples_to_save (int): If greather than 0, this amount of input, output and generated image combinations will be tracked to the Run ''' from arcus.ml.images import explorer if samples_to_save > 0: import random total_images = min(len(y_pred), samples_to_save) for i in random.sample(range(len(y_pred)), total_images): newimg = self.__concat_images([X_test[i], y_test[i], y_pred[i]]) imgplot = explorer.show_image(newimg, silent_mode=True) self.__current_run.log_image(f'Image combo sample {i}', plot=imgplot) imgplot.close() def setup_training(self, training_name: str, overwrite: bool = False): ''' Will initialize a new directory (using the given training_name) and add a training script and requirements file to run training Args: training_name (str): The name of a training. This will be used to create a directory. 
Can contain subdirectory overwrite (bool): Defines if the existing training files should be overwritten ''' if not os.path.exists(training_name): os.makedirs(training_name) # Take default training script and copy to the new folder default_training_script_file = os.path.join(str(os.path.dirname(__file__)), 'resources/train.py') default_requirements_file = os.path.join(str(os.path.dirname(__file__)), 'resources/requirements.txt') dest_training_script_file = os.path.join(training_name, 'train.py') dest_requirements_file = os.path.join(training_name, 'requirements.txt') if overwrite or not(os.path.isfile(dest_training_script_file)): shutil.copy2(default_training_script_file, training_name) if overwrite or not(os.path.isfile(dest_requirements_file)): shutil.copy2(default_requirements_file, training_name) def start_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, use_estimator: bool = False, **kwargs): ''' Will start a new training, taking the training name as the folder of the run Args: training_name (str): The name of a training. This will be used to create a directory. Can contain subdirectory environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch). input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run Returns: Run : the submitted run ''' if use_estimator: print('Scheduling Estimator training') self._start_estimator_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs) else: print('Scheduling ScriptRunConfig training') self._start_environment_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs) if script_parameters is not None: for arg in script_parameters.keys(): self.__current_run.log(arg.replace('--', ''), script_parameters[arg]) print(self.__current_run.get_portal_url()) if(show_widget): from azureml.widgets import RunDetails RunDetails(self.__current_run).show() return self.__current_run def _start_environment_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs): ''' Will start a new training using ScriptRunConfig, taking the training name as the folder of the run Args: training_name (str): The name of a training. This will be used to create a directory. 
Can contain subdirectory environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch). input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run ''' from azureml.train.estimator import Estimator from azureml.core import Environment, ScriptRunConfig from azureml.core.runconfig import RunConfiguration from azureml.core.runconfig import DataReferenceConfiguration from azureml.core.runconfig import CondaDependencies from arcus.azureml.experimenting import train_environment as te # Check if directory exists if not(os.path.exists(training_name) and os.path.isdir(training_name)): raise FileNotFoundError(training_name) # Check compute target if compute_target != 'local': self.__check_compute_target(compute_target, gpu_compute) training_env = te.get_training_environment(self.__workspace, training_name, os.path.join(training_name, 'requirements.txt'), use_gpu=gpu_compute, include_prerelease=True, environment_type=environment_type) runconfig = RunConfiguration() # Add datasets datarefs = dict() scriptargs = list() if script_parameters is not None: for key in script_parameters.keys(): scriptargs.append(key) scriptargs.append(script_parameters[key]) if(input_datasets is not None): for ds in input_datasets: print(f'Adding mounting data reference for dataset {ds}') # scriptargs.append(ds) scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute = ds)) # datastore, path = self._get_data_reference(self.__workspace.datasets[ds]) # datarefs[ds] = DataReferenceConfiguration(datastore_name=datastore, path_on_datastore = path, path_on_compute = '/' + ds, mode = 'mount', overwrite = False) if(input_datasets_to_download is not None): for ds in input_datasets_to_download: print(f'Adding download data reference for dataset {ds}') # scriptargs.append(ds) scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute = ds)) scriptrunconfig = ScriptRunConfig(source_directory='./' + training_name, script="train.py", run_config=runconfig, arguments=scriptargs) scriptrunconfig.run_config.target = compute_target scriptrunconfig.run_config.environment = training_env #scriptrunconfig.run_config.data_references = datarefs # Submit training self.__current_run = self.__experiment.submit(scriptrunconfig) def _get_data_reference(self, dataset: Dataset): import json j = json.loads(str(dataset).replace('FileDataset\n', '')) source = j['source'][0] sections = source.split("'") return sections[1], sections[3] def _start_estimator_training(self, training_name: str, estimator_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs): ''' Will start a new training using an Estimator, taking the training name as the folder of the run Args: training_name 
(str): The name of a training. This will be used to create a directory. Can contain subdirectory environment_type (str): one of these values (tensorflow, sklearn, pytorch). input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run ''' from azureml.train.estimator import Estimator # Check if directory exists if not(os.path.exists(training_name) and os.path.isdir(training_name)): raise FileNotFoundError(training_name) # Check compute target if compute_target != 'local': self.__check_compute_target(compute_target, gpu_compute) # Add datasets datasets = list() if(input_datasets is not None): for ds in input_datasets: datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds)) if(input_datasets_to_download is not None): for ds in input_datasets_to_download: datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds)) # as mount - as download constructor_parameters = { 'source_directory':training_name, 'script_params':script_parameters, 'inputs':datasets, 'compute_target':compute_target, 'entry_script':'train.py', 'pip_requirements_file':'requirements.txt', 'use_gpu':gpu_compute, 'use_docker':True} print('Creating estimator of type', estimator_type) if(estimator_type is None): # Using default Estimator estimator = Estimator(**constructor_parameters) elif(estimator_type == 'tensorflow'): from azureml.train.dnn import TensorFlow version_par = 'framework_version' if(not version_par in constructor_parameters.keys()): print('Defaulting to version 2.0 for TensorFlow') constructor_parameters[version_par] = '2.0' estimator = TensorFlow(**constructor_parameters) elif(estimator_type == 'sklearn'): from azureml.train.sklearn import SKLearn estimator = SKLearn(**constructor_parameters) elif(estimator_type == 'pytorch'): from azureml.train.dnn import PyTorch estimator = PyTorch(**constructor_parameters) # Submit training self.__current_run = self.__experiment.submit(estimator) # protected implementation methods def _log_metrics(self, metric_name: str, metric_value: float, description:str = None): print(metric_name, metric_value) self.__current_run.log(metric_name, metric_value, description=description) def _complete_run(self): ''' Completes the current run ''' self.__current_run.complete() def _log_confmatrix(self, confusion_matrix: np.array, class_names: np.array): data = {} data['schema_type'] = 'confusion_matrix' data['schema_version'] = 'v1' data['data'] = {} data['data']['class_labels'] = class_names.tolist() data['data']['matrix'] = confusion_matrix.tolist() print(confusion_matrix) json_data = json.dumps(data) self.__current_run.log_confusion_matrix('Confusion matrix', json_data, description='') def _save_roc_curve(self, roc_auc: float, roc_plot: plt): self._log_metrics('roc_auc', roc_auc) self.__current_run.log_image('ROC Curve', plot=plt) def __check_compute_target(self, compute_target, use_gpu: bool): __vm_size = '' if isinstance(compute_target, AmlCompute): 
            __vm_size = compute_target.vm_size
        elif isinstance(compute_target, str):
            compute = ComputeTarget(workspace=self.__workspace, name=compute_target)
            __vm_size = compute.vm_size

        if self.__vm_size_list is None:
            self.__vm_size_list = AmlCompute.supported_vmsizes(self.__workspace)

        vm_description = list(filter(lambda vmsize: str.upper(vmsize['name']) == str.upper(__vm_size), self.__vm_size_list))[0]
        if use_gpu and vm_description['gpus'] == 0:
            raise errors.TrainingComputeException(f'gpu_compute was specified, but the target does not have GPUs: {vm_description}')
        if not use_gpu and vm_description['vCPUs'] == 0:
            raise errors.TrainingComputeException(f'cpu_compute was specified, but the target does not have CPUs: {vm_description}')

    def __log_roc_curve(self, y_test: np.array, y_pred: np.array):
        '''Will upload the Receiver Operating Characteristic (ROC) curve for binary classifiers

        Args:
            y_test (np.array): The actual outputs of the test set
            y_pred (np.array): The predicted values of the test set
        Returns:
            float: The ROC_AUC value
        '''
        # calculate the fpr and tpr for all thresholds of the classification
        fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred)
        roc_auc = metrics.auc(fpr, tpr)
        plt.cla()
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        self._save_roc_curve(roc_auc, plt)
        plt.show(block=False)
        plt.close()
        return roc_auc
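# Usage sketch for the trainer class above. The class name `AzureMLTrainer`, its
# constructor arguments, the workspace variable `ws`, and the compute/dataset names
# are assumptions for illustration; only setup_training, start_training and
# evaluate_classifier (and their parameters) are taken from the code above.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

trainer = AzureMLTrainer(experiment_name='heart-disease', workspace=ws)  # assumed signature

# Scaffold a training folder with the default train.py and requirements.txt
trainer.setup_training('trainings/logreg', overwrite=False)

# Submit the training script to a compute target, mounting a registered dataset
run = trainer.start_training('trainings/logreg',
                             environment_type='sklearn',
                             input_datasets=['heart_disease_preprocessed_train'],
                             compute_target='cpu-cluster',        # assumed cluster name
                             script_parameters={'--C': 1.0})

# Or evaluate a locally fitted binary classifier against the active run
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = trainer.evaluate_classifier(model, X_test, y_test,
                                     show_roc=True, return_predictions=True)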
    tags = {
        "mlflow.source.type": "JOB",
        "mlflow.source.name": "train.py",
        "mlflow.user": "******"
    }
    run.set_tags(tags)

    # log environment variables
    env_dictionary["MLFLOW_EXPERIMENT_ID"] = exp._id
    env_dictionary["MLFLOW_RUN_ID"] = run_id
    env_dictionary["MLFLOW_TRACKING_URI"] = _get_mlflow_tracking_uri(ws)
    env_dictionary["HOME"] = "~/"
else:
    # start run
    ws = get_ws()
    exp = Experiment(workspace=ws, name=experiment_name)
    run = exp.start_logging(snapshot_directory="/scripts")
    run.child_run(name=run_name)  # TODO: add the step's name
    tags = {
        "mlflow.source.type": "JOB",
        "mlflow.source.name": "train.py",
        "mlflow.user": "******"
    }
    run.set_tags(tags)

    job_info_dict = {
        "run_id": run._run_id,
        "experiment_name": exp.name,
        "experiment_id": exp._id
    }
    json_dict = json.dumps(job_info_dict)
    with open(job_info_path, "w") as f:
        f.write(json_dict)
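# Sketch of how a later step could read the job info file back and re-attach to
# the same run; it assumes job_info_path points to the file written above and
# that the workspace config is available locally.
import json
from azureml.core import Experiment, Run, Workspace

with open(job_info_path) as f:
    job_info = json.load(f)

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name=job_info["experiment_name"])
run = Run(exp, run_id=job_info["run_id"])  # re-attach to the existing run
run.log("resumed_step", 1)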
class ExploratoryAnalysis(): def __init__(self): self.__parser = argparse.ArgumentParser("preprocessing") self.__parser.add_argument("--datastore", type=str, help="Name of the datastore", default="workspaceblobstore") self.__parser.add_argument("--dataset_name", type=str, help="Name of the dataset") self.__parser.add_argument("--dataset_preprocessed_name", type=str, help="Standard preprocessed dataset") self.__parser.add_argument("--output_preprocess_dataset", type=str, help="Name of the PipelineData reference") self.__parser.add_argument( "--use_datadrift", type=distutils.util.strtobool, help= "Use datadrift(True/False). If true, we split the original datset by sex" ) self.__parser.add_argument("--retrain_status", type=distutils.util.strtobool, help="Retrain status") self.__args = self.__parser.parse_args() self.__run = Run.get_context() self.__local_run = type(self.__run) == _OfflineRun if self.__local_run: self.__ws = Workspace.from_config('../../notebooks-settings') self.__exp = Experiment(self.__ws, 'exploratory_analysis') self.__run = self.__exp.start_logging() else: self.__ws = self.__run.experiment.workspace self.__exp = self.__run.experiment self.__datastore = Datastore.get(self.__ws, datastore_name=self.__args.datastore) def main(self): df = self.__preprocess_dataset(schema_path="./schema_dataset.json") if not self.__args.retrain_status: self.__make_exploratory_analysis(df) else: self.__run.add_properties({ 'status': "The following step have been skipped because a retraining pipeline have been launched" }) self.__upload_datasets(df, df.columns) def __preprocess_dataset(self, schema_path): with open(schema_path) as f: schema = json.load(f) df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe() df = df.drop([ 'address', 'city', 'state', 'postalCode', 'name', 'ssn', 'observation' ], axis=1) columns_names = schema.keys() df.columns = columns_names return df def __make_exploratory_analysis(self, df): self.__get_profiling(df) self.__generate_count_target_plot(df) self.__count_target_variable(df) self.__generate_counts_with_target(df) self.__relation_plot("age", df) self.__relation_plot("cholesterol", df) self.__relation_plot("st_slope", df) self.__relation_plot("num_major_vessels", df) plt.rcParams['figure.figsize'] = (15, 5) sns.distplot(df['age']) plt.title('Distribution of Age', fontsize=20) self.__run.log_image('Distribution of Age', plot=plt) self.__count_sex_variable(df) size = df['sex'].value_counts() colors = ['lightblue', 'lightgreen'] labels = "Male", "Female" explode = [0, 0.01] my_circle = plt.Circle((0, 0), 0.7, color='white') plt.rcParams['figure.figsize'] = (9, 9) plt.pie(size, colors=colors, labels=labels, shadow=True, explode=explode, autopct='%.2f') plt.title('Distribution of Gender', fontsize=20) p = plt.gcf() p.gca().add_artist(my_circle) plt.legend() self.__run.log_image('Distribution of Gender', plot=plt) self.__generate_frequency_plot(df) plt.scatter(x=df.age[df.target == 1], y=df.max_heart_rate_achieved[(df.target == 1)]) plt.scatter(x=df.age[df.target == 0], y=df.max_heart_rate_achieved[(df.target == 0)]) plt.legend(["Disease", "Not Disease"]) plt.xlabel("Age") plt.ylabel("Maximum Heart Rate") self.__run.log_image('Disease/Not Disease', plot=plt) self.__get_outliers(df) self.__get_correlation_matrix(df) self.__get_mutual_info(df) self.__get_principal_components_analysis(df) def __get_dataset(self, dataset_name): return self.__ws.datasets.get(dataset_name) def __upload_datasets(self, df, columns): if self.__args.use_datadrift: splitted_datasets 
= self.__split_dataset(df) for dataset_type in splitted_datasets: dataset_name, preprocess_filepath, datastore_path = self.__get_dataset_metadata( splitted_datasets[dataset_type], dataset_type) self.__upload_dataset(self.__ws, self.__datastore, dataset_name, datastore_path, preprocess_filepath, use_datadrift=True, type_dataset=dataset_type) else: dataset_name, preprocess_filepath, datastore_path = self.__get_dataset_metadata( df, "train") self.__upload_dataset(self.__ws, self.__datastore, dataset_name, datastore_path, preprocess_filepath, use_datadrift=False, type_dataset="standard") def __split_dataset(self, df): df_female = df.drop(['target'], axis=1) df_female = df_female.loc[df_female['sex'] == 0] df_male = df.loc[df['sex'] == 1] return {"train": df_male, "inference": df_female} def __get_dataset_metadata(self, df, extension): dataset_name = f'{self.__args.dataset_preprocessed_name}_{extension}' output_preprocessed_directory = self.__args.output_preprocess_dataset if extension == "train" else f'{self.__args.output_preprocess_dataset}_{extension}' preprocess_filepath = os.path.join(output_preprocessed_directory, f'{dataset_name}.csv') datastore_path = f"heart-disease/{dataset_name}.csv" os.makedirs(output_preprocessed_directory, exist_ok=True) df.to_csv(preprocess_filepath, index=False) return dataset_name, preprocess_filepath, datastore_path def __upload_dataset(self, ws, def_blob_store, dataset_name, datastore_path, filepath, use_datadrift, type_dataset): def_blob_store.upload_files([filepath], target_path="heart-disease", overwrite=True) tab_data_set = Dataset.Tabular.from_delimited_files( path=(def_blob_store, datastore_path)) try: tab_data_set.register(workspace=ws, name=f'{dataset_name}', description=f'{dataset_name} data', tags={ 'format': 'CSV', 'use_datadrift': use_datadrift, 'type_dataset': type_dataset }, create_new_version=True) except Exception as ex: print(ex) def __get_profiling(self, df): profile = ProfileReport( df, title="Exploratory Analysis Report - Heart Disease") profile.to_file("heart-disease-report.html") self.__run.upload_file("heart-disease-report.html", "heart-disease-report.html") def __generate_count_target_plot(self, df): plt.figure(figsize=(20, 10)) df["target"].value_counts().plot.bar(figsize=(20, 10)) self.__run.log_image(f'Count target', plot=plt) def __count_target_variable(self, df): countNoDisease = len(df[df.target == 0]) countHaveDisease = len(df[df.target == 1]) self.__run.log( 'Percentage of Havent Heart Disease', "{:.2f}%".format( (countNoDisease / (len(df.target)) * 100))) self.__run.log( 'Percentage of Have Heart Disease', "{:.2f}%".format( (countHaveDisease / (len(df.target)) * 100))) def __generate_counts_with_target(self, df): columns = [ 'fasting_blood_sugar', 'exercise_induced_angina', 'rest_ecg' ] for column in columns: plt.figure(figsize=(20, 10)) sns.catplot(x="target", col=column, kind="count", data=df) self.__run.log_image(f'Count {column} over target', plot=plt) def __count_sex_variable(self, df): countFemale = len(df[df.sex == 0]) countMale = len(df[df.sex == 1]) self.__run.log('Percentage of Female Patients', "{:.2f}%".format( (countFemale / (len(df.sex)) * 100))) self.__run.log('Percentage of Male Patients', "{:.2f}%".format( (countMale / (len(df.sex)) * 100))) def __generate_frequency_plot(self, df): columns = [ 'age', 'sex', 'st_slope', 'fasting_blood_sugar', 'chest_pain_type' ] for column in columns: pd.crosstab(f'df.{column}', df.target).plot(kind="bar", figsize=(20, 6)) plt.title(f'Heart Disease Frequency for {column}') 
plt.xlabel(column) plt.xticks(rotation=0) plt.legend(["Haven't Disease", "Have Disease"]) plt.ylabel('Frequency of Disease or Not') self.__run.log_image(f'Heart Disease Frequency for {column}', plot=plt) def __get_outliers(self, df): outliers_columns = [ 'age', 'resting_blood_pressure', 'cholesterol', 'max_heart_rate_achieved', 'st_depression' ] for column in outliers_columns: f, ax = plt.subplots(figsize=(8, 6)) sns.boxplot(x=df[column]) self.__run.log_image(column, plot=plt) def __relation_plot(self, attribute, df): plt.rcParams['figure.figsize'] = (12, 9) sns.violinplot(x=df["target"], y=df[attribute], data=df, palette="muted") plt.title(f'Relation of target with {attribute}', fontsize=20, fontweight=30) self.__run.log_image(f'Relation of target with {attribute}', plot=plt) def __get_mutual_info(self, df): X = df.drop(['target'], axis=1) y = df['target'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123) X_train.shape, y_train.shape, X_test.shape, y_test.shape mutual_info = mutual_info_classif(X_train.fillna(0), y_train) mi_series = pd.Series(mutual_info) mi_series.index = X_train.columns mi_series.sort_values(ascending=False) plt.figure(figsize=(20, 10)) mi_series.sort_values(ascending=False).plot.bar(figsize=(20, 8)) self.__run.log_image('Mutual Information features scores', plot=plt) k_best_features = SelectKBest(mutual_info_classif, k=10).fit(X_train.fillna(0), y_train) self.__run.log('Selected top 10 features', X_train.columns[k_best_features.get_support()]) def __get_correlation_matrix(self, df): plt.rcParams['figure.figsize'] = (20, 15) plt.style.use('ggplot') sns.heatmap(df.corr(), annot=True) plt.title('Correlation Matrix', fontsize=20) self.__run.log_image('Correlation Matrix', plot=plt) def __get_principal_components_analysis(self, df): x_data = df.drop(['target'], axis=1) y = df.target.values pca_exp = PCA(n_components=5) pca_exp.fit_transform(x_data) plt.figure(figsize=(10, 10)) plt.plot(np.cumsum(pca_exp.explained_variance_ratio_), 'ro-') plt.grid() self.__run.log_image('Explained_variance_ratio', plot=plt) pca = PCA(n_components=2) principalComponents = pca.fit_transform(x_data) self.__run.log('Total PCA Components', pca.n_components_) self.__run.log('Total explained variance', round(pca.explained_variance_ratio_.sum(), 5)) principal_df = pd.DataFrame( data=principalComponents, columns=['principal component 1', 'principal component 2']) plt.figure() plt.figure(figsize=(10, 10)) plt.xticks(fontsize=12) plt.yticks(fontsize=14) plt.xlabel('Principal Component - 1', fontsize=20) plt.ylabel('Principal Component - 2', fontsize=20) plt.title("Principal Component Analysis of Heart Disease Dataset", fontsize=20) targets = [0, 1] colors = ['r', 'g'] for target, color in zip(targets, colors): indicesToKeep = df['target'] == target plt.scatter(principal_df.loc[indicesToKeep, 'principal component 1'], principal_df.loc[indicesToKeep, 'principal component 2'], c=color, s=50) plt.legend(targets, prop={'size': 15}) self.__run.log_image( 'Principal Component Analysis of Heart Disease Dataset', plot=plt)
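# Hypothetical entry point for running the exploratory analysis step locally.
# The script file name and the argument values are examples, not taken from the
# pipeline definition; the argument names come from the parser above.
if __name__ == "__main__":
    # e.g. invoked as:
    # python exploratory_analysis.py --datastore workspaceblobstore \
    #     --dataset_name heart_disease --dataset_preprocessed_name heart_disease_preprocessed \
    #     --output_preprocess_dataset outputs/preprocessed \
    #     --use_datadrift False --retrain_status False
    ExploratoryAnalysis().main()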
import os, json

from azureml.core import Experiment, Workspace
from azureml.core.authentication import ServicePrincipalAuthentication

root_dir = os.path.abspath(__file__ + "/../../../")
script_dir = os.path.join(root_dir, "aml_config/config.json")

with open(script_dir) as f:
    config = json.load(f)

workspace_name = config['workspace_name']
resource_group = config['resource_group']
subscription_id = config['subscription_id']

ws = Workspace.get(name=workspace_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group)

exp = Experiment(workspace=ws, name="trainpipeline")
print(exp.name, exp.workspace.name, sep='\n')

run = exp.start_logging()
run.log('my magic number', 45)
run.complete()
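# Sketch: non-interactive authentication with the ServicePrincipalAuthentication
# class imported above, for running the same script in CI. The environment
# variable names are assumptions; tenant and client details are not part of
# aml_config/config.json in the script above.
sp_auth = ServicePrincipalAuthentication(
    tenant_id=os.environ["AZ_TENANT_ID"],
    service_principal_id=os.environ["AZ_CLIENT_ID"],
    service_principal_password=os.environ["AZ_CLIENT_SECRET"])

ws = Workspace.get(name=workspace_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group,
                   auth=sp_auth)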
class DetectFairness(): def __init__(self): self.__parser = argparse.ArgumentParser("fairlearn") self.__parser.add_argument("--fitted_model_name", type=str, default="heart_disease_model_automl", help="Name of fitted model") self.__parser.add_argument("--model_data", type=str, help="Path of the model") self.__parser.add_argument("--dataset_name", type=str, default="heart_disease_preprocessed_train", help="Name of the dataset") self.__parser.add_argument("--output_fairness_dict", type=str, help="Name of the dataset") self.__args = self.__parser.parse_args() self.__run = Run.get_context() self.__local_run = type(self.__run) == _OfflineRun if self.__local_run: self.__ws = Workspace.from_config('../../notebooks-settings') self.__exp = Experiment(self.__ws, 'fairlearn') self.__run = self.__exp.start_logging() else: self.__ws = self.__run.experiment.workspace self.__exp = self.__run.experiment self.__sensitive_features = ['asthmatic', 'diabetic', 'smoker'] def main(self): dataset = self.__get_dataset(self.__args.dataset_name) model = self.__load_model() df = dataset.to_pandas_dataframe() X_raw, Y, A, X = self.__transform_df(df) X_train, X_test, Y_train, Y_test, A_train, A_test = self.__df_train_split( X_raw, Y, A, X) Y_pred = model.predict(X_test) content = { "Y_pred": Y_pred, "Y_test": Y_test, "A_test": A_test, "model_id": Model(self.__ws, self.__args.fitted_model_name).id } self.__set_fairlearn_dict_as_pipeline_output(content) def __get_dataset(self, dataset_name): return self.__ws.datasets.get(dataset_name) def __load_model(self): Model(self.__ws, self.__args.fitted_model_name).download(".") with open(self.__args.model_data, "rb") as f: return joblib.load(f) def __transform_df(self, df): X_raw = df.drop(['target'], axis=1) Y = df['target'] A = X_raw[self.__sensitive_features] X = X_raw.drop(labels=self.__sensitive_features, axis=1) return X_raw, Y, A, X def __df_train_split(self, X_raw, Y, A, X): X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split( X_raw, Y, A, test_size=0.3, random_state=123, stratify=Y) X_train = X_train.reset_index(drop=True) A_train = A_train.reset_index(drop=True) X_test = X_test.reset_index(drop=True) A_test = A_test.reset_index(drop=True) A_test.diabetic.loc[(A_test['diabetic'] == 0)] = 'not diabetic' A_test.diabetic.loc[(A_test['diabetic'] == 1)] = 'diabetic' A_test.asthmatic.loc[(A_test['asthmatic'] == 0)] = 'not asthmatic' A_test.asthmatic.loc[(A_test['asthmatic'] == 1)] = 'asthmatic' A_test.smoker.loc[(A_test['smoker'] == 0)] = 'not smoker' A_test.smoker.loc[(A_test['smoker'] == 1)] = 'smoker' return X_train, X_test, Y_train, Y_test, A_train, A_test def __set_fairlearn_dict_as_pipeline_output(self, content): os.makedirs(self.__args.output_fairness_dict, exist_ok=True) fairlearn_dict_path = os.path.join(self.__args.output_fairness_dict, 'fairlean_predictions_values.pkl') joblib.dump(value=content, filename=fairlearn_dict_path)
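# Sketch: a quick local sanity check of the group metrics on the values saved by
# __set_fairlearn_dict_as_pipeline_output, using fairlearn's MetricFrame
# (assuming fairlearn>=0.6 is installed). The pickle file name mirrors the one
# written above; loading it from the current directory is an assumption.
import joblib
from sklearn.metrics import accuracy_score
from fairlearn.metrics import MetricFrame, selection_rate

content = joblib.load("fairlean_predictions_values.pkl")

mf = MetricFrame(metrics={"accuracy": accuracy_score, "selection_rate": selection_rate},
                 y_true=content["Y_test"],
                 y_pred=content["Y_pred"],
                 sensitive_features=content["A_test"])

print(mf.overall)
print(mf.by_group)      # metrics per diabetic/asthmatic/smoker group
print(mf.difference())  # largest between-group gap per metric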
import glob
import os
import pickle
import random

from azureml.core import Experiment, Run, Workspace

from preprocessing import preprocess_depthmap, preprocess_targets

# Get the current run.
run = Run.get_context()

# Offline run. Download the sample dataset and run locally. Still push results to Azure.
if run.id.startswith("OfflineRun"):
    print("Running in offline mode...")

    # Access workspace.
    print("Accessing workspace...")
    workspace = Workspace.from_config()
    experiment = Experiment(workspace, "s4-cnndepthmap-height-offline")
    run = experiment.start_logging(outputs=None, snapshot_directory=".")

    # Get dataset.
    print("Accessing dataset...")
    assert os.path.exists("premiumfileshare"), "Requires small size dataset"
    dataset_name = "cgmmldevpremium-SampleDataset-Example"
    dataset = workspace.datasets[dataset_name]
    dataset.download(target_path='.', overwrite=False)
    dataset_path = glob.glob(os.path.join("premiumfileshare"))[0]

# Online run. Use dataset provided by training notebook.
else:
    print("Running in online mode...")
    experiment = run.experiment
    workspace = experiment.workspace
import os
import random

import tensorflow as tf
from azureml.core import Experiment, Run, Workspace

# EVAL_CONFIG, DATA_CONFIG and REPO_DIR are assumed to come from the project's
# config module (not shown in this excerpt).

# Make experiment reproducible
tf.random.set_seed(EVAL_CONFIG.SPLIT_SEED)
random.seed(EVAL_CONFIG.SPLIT_SEED)

# Get the current run.
run = Run.get_context()

# Offline run. Download the sample dataset and run locally. Still push results to Azure.
if run.id.startswith("OfflineRun"):
    print("Running in offline mode...")

    # Access workspace.
    print("Accessing workspace...")
    workspace = Workspace.from_config()
    experiment = Experiment(workspace, EVAL_CONFIG.EXPERIMENT_NAME)
    run = experiment.start_logging(outputs=None, snapshot_directory=".")

    # Get dataset.
    print("Accessing dataset...")
    dataset_name = DATA_CONFIG.NAME
    dataset_path = str(REPO_DIR / "data" / dataset_name)
    if not os.path.exists(dataset_path):
        dataset = workspace.datasets[dataset_name]
        dataset.download(target_path=dataset_path, overwrite=False)

# Online run. Use dataset provided by training notebook.
else:
    print("Running in online mode...")
    experiment = run.experiment
    workspace = experiment.workspace
    dataset_path = run.input_datasets["dataset"]
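# The offline/online boilerplate above repeats across these scripts, so it could
# be factored into a small helper. A sketch under the assumption that an
# "offline-<name>" experiment naming scheme is acceptable; the function name is
# hypothetical.
from azureml.core import Experiment, Run, Workspace


def get_run_and_workspace(experiment_name: str):
    """Return (run, workspace); starts a local logging run when executed outside AzureML."""
    run = Run.get_context()
    if run.id.startswith("OfflineRun"):
        workspace = Workspace.from_config()
        experiment = Experiment(workspace, f"offline-{experiment_name}")
        run = experiment.start_logging(outputs=None, snapshot_directory=".")
    else:
        workspace = run.experiment.workspace
    return run, workspace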