Example No. 1
class BuildFairlearnDashboard():
    def __init__(self):
        self.__parser = argparse.ArgumentParser("fairlearn")
        self.__parser.add_argument("--dataset_name",
                                   type=str,
                                   default="heart_disease_preprocessed_train",
                                   help="Name of the dataset")
        self.__parser.add_argument("--output_fairness_dict",
                                   type=str,
                                   help="Name of the dataset")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'fairlearn')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

    def main(self):
        fairlearn_dict_path = os.path.join(self.__args.output_fairness_dict,
                                           'fairlean_predictions_values.pkl')
        fairlearn_values = joblib.load(fairlearn_dict_path)
        dash_dict = self.__get_dashboard_dict(fairlearn_values['A_test'],
                                              fairlearn_values['Y_test'],
                                              fairlearn_values['Y_pred'],
                                              fairlearn_values['model_id'])
        self.__upload_dashboard_dict(dash_dict)

    def __get_dashboard_dict(self, A_test, Y_test, Y_pred, model_id):
        sf = {
            'diabetic': A_test.diabetic,
            'asthmatic': A_test.asthmatic,
            'smoker': A_test.smoker
        }

        return _create_group_metric_set(
            y_true=Y_test,
            predictions={model_id: Y_pred},
            sensitive_features=sf,
            prediction_type='binary_classification')

    def __upload_dashboard_dict(self, dash_dict):
        run = self.__exp.start_logging()
        try:
            dashboard_title = "Fairness insights of Logistic Regression Classifier with heart-disease data"
            upload_id = upload_dashboard_dictionary(
                run,
                dash_dict,
                dataset_name=self.__args.dataset_name,
                dashboard_name=dashboard_title)
        finally:
            run.complete()
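
The original snippet stops at the class definition; a minimal entry-point guard along these lines (assumed, not part of the source) would let the step run as a script:

if __name__ == '__main__':
    BuildFairlearnDashboard().main()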
Example No. 2
class TrackedAzureMLEvaluation:
    """
    Class to automatically track parameters, metrics and artifacts for a single model with azureml-sdk
    """
    def __init__(self, experimentName: str, workspace: Workspace,
                 evaluator: MetricsDictProvider):
        """
        :param experimentName:
        :param workspace:
        :param evaluator:
        """
        self.experimentName = experimentName
        self.evaluator = evaluator
        self.experiment = Experiment(workspace=workspace, name=experimentName)

    def evalModel(self,
                  model: VectorModel,
                  additionalLoggingValuesDict: dict = None,
                  **startLoggingKwargs):
        with self.experiment.start_logging(**startLoggingKwargs) as run:
            valuesDict = self.evaluator.computeMetrics(model)
            valuesDict['str(model)'] = str(model)
            if additionalLoggingValuesDict is not None:
                valuesDict.update(additionalLoggingValuesDict)
            for name, value in valuesDict.items():
                run.log(name, value)
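
A hedged usage sketch for this tracker; the workspace config, the experiment name, and the `evaluator`/`model` objects below are placeholders assumed for illustration, not part of the source:

from azureml.core import Workspace

ws = Workspace.from_config()  # assumes a local workspace config.json
tracked_eval = TrackedAzureMLEvaluation("demo-experiment", ws, evaluator)  # evaluator: any MetricsDictProvider
tracked_eval.evalModel(model, additionalLoggingValuesDict={"dataset": "train_v1"})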
class AmlExperimentation(Experimentation):
    def __init__(self, ws):
        super().__init__()
        self.aml_ws = ws
        self.aml_experiment = None
        self.aml_run = None
        self.is_running_flag = False

    def set_experiment(self, name, artifact_location=None):
        logging.info("Connecting to Azure ML")
        self.aml_experiment = Experiment(workspace=self.aml_ws, name=name)

    def start_run(self):
        self.aml_run = self.aml_experiment.start_logging()
        self.is_running_flag = True

    def end_run(self):
        self.aml_run.complete()
        self.is_running_flag = False

    def log_param(self, key, value):
        self.aml_run.log(key, value)

    def log_params(self, params):
        # Run.log expects a single name/value pair, so log each parameter separately
        for key, value in params.items():
            self.aml_run.log(key, value)

    def log_metric(self, key, value, step=None):
        self.aml_run.log(key, value)

    def log_metrics(self, metrics, step=None):
        # Run.log expects a single name/value pair, so log each metric separately
        for key, value in metrics.items():
            self.aml_run.log(key, value)

    def search_runs(
        self,
        experiment_ids=None,
        filter_string="",
        run_view_type=1,
        max_results=100000,
        order_by=None,
    ):
        raise NotImplementedError()

    def log_image(self, title, fig):
        self.aml_run.log_image(name=title, plot=fig)

    def log_artifact(self, local_path, name=None, artifact_path=None):
        # Upload a single artifact file to the run (the bare attribute access did nothing)
        self.aml_run.upload_file(name=name or local_path, path_or_stream=local_path)

    def log_artifacts(self, local_path, name=None, artifact_path=None):
        # Upload a whole directory of artifacts to the run
        self.aml_run.upload_folder(name=name or local_path, path=local_path)
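
For clarity, a short sketch of how this wrapper is typically driven; the workspace config and the parameter/metric names are illustrative assumptions:

from azureml.core import Workspace

tracker = AmlExperimentation(Workspace.from_config())
tracker.set_experiment("demo-experiment")
tracker.start_run()
tracker.log_param("lr", 0.01)
tracker.log_metrics({"rmse": 0.42, "mae": 0.31})
tracker.end_run()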
Example No. 4
def toAzure():
    import azureml.core
    from azureml.core import Workspace
    from azureml.core import Experiment
    import shutil, os, glob
    from azureml.core.authentication import InteractiveLoginAuthentication

    with open("outputs/_experiment-name_.txt", "r", encoding="utf-8") as file:
        experiment_name = file.readline()

    try:
        ws = Workspace.get(
            name="sparknlp",
            subscription_id="bc5674c1-2f09-4eff-8497-b97f5466158f",
            resource_group="datascientists")

    except Exception:

        interactive_auth = InteractiveLoginAuthentication(
            tenant_id="55574e46-daf5-45bd-8659-de00e36fb97c", force=True)
        ws = Workspace.get(
            name="sparknlp",
            subscription_id="bc5674c1-2f09-4eff-8497-b97f5466158f",
            resource_group="datascientists",
            auth=interactive_auth)

    experiment = Experiment(workspace=ws, name=experiment_name)

    notebooks = glob.glob("*.ipynb")
    for nb in notebooks:
        shutil.copy(nb, "outputs/_notebooks/CopyOf_" + nb)

    run = experiment.start_logging()
    print(
        f"Uploading the content of your '{experiment_name}' experiment to the Azure cloud...")

    run.complete()
    runs = experiment.get_runs()

    print(f"Your {len(list(runs))}. run was uploaded.")
    print(
        """You can view your logs on Microsoft Azure Machine Learning Studio. To view the 
details of your last run, click the link below :""")

    runs = experiment.get_runs()
    return list(runs)[0]
Example No. 5
def test_run(expname, ws):
    # create a new experiment
    exp = Experiment(workspace=ws, name=expname)

    # start a run
    run = exp.start_logging()

    # log a number
    run.log('my magic number', 42)

    # log a list (Fibonacci numbers)
    run.log_list('my list', [1, 1, 2, 3, 5, 8, 13, 21, 34, 55])

    # finish the run
    run.complete()

    print(run.get_portal_url())
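
Called, for example, like this (assuming a workspace config.json is available locally; the experiment name is arbitrary):

from azureml.core import Workspace

ws = Workspace.from_config()
test_run('logging-smoke-test', ws)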
Example No. 6
class OfflineRunInitializer(RunInitializer):
    """Offline run. Download the sample dataset and run locally. Still push results to Azure"""
    def __init__(self, data_config: Bunch, eval_config: Bunch):
        super().__init__(data_config, eval_config)

    def run_azureml_setup(self):
        logger.info("Running in offline mode...")
        logger.info("Accessing workspace...")
        self.workspace = Workspace.from_config()
        self.experiment = Experiment(self.workspace, EVAL_EXPERIMENT_NAME)
        self.run = self.experiment.start_logging(outputs=None,
                                                 snapshot_directory=None)

    def get_dataset(self):
        logger.info("Accessing dataset...")
        dataset_name = self._data_config.NAME
        self.dataset_path = str(REPO_DIR / "data" / "datasets" / dataset_name)
        if not os.path.exists(self.dataset_path):
            dataset = self.workspace.datasets[dataset_name]
            dataset.download(target_path=self.dataset_path, overwrite=False)
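
A hedged sketch of how this initializer might be driven; the Bunch configs and the base-class behaviour come from the surrounding project and are assumed here:

initializer = OfflineRunInitializer(data_config, eval_config)  # data_config/eval_config: Bunch objects
initializer.run_azureml_setup()  # connects to the workspace and starts an offline-backed run
initializer.get_dataset()        # downloads the dataset locally if it is not already present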
Example No. 7
class TrackedAzureMLExperiment(TrackedExperiment):
    def __init__(self,
                 experimentName: str,
                 workspace: Workspace,
                 additionalLoggingValuesDict=None):
        """

        :param experimentName: name of experiment for tracking in workspace
        :param workspace: Azure workspace object
        :param additionalLoggingValuesDict: additional values to be logged for each run
        """
        self.experimentName = experimentName
        self.experiment = Experiment(workspace=workspace, name=experimentName)
        super().__init__(
            additionalLoggingValuesDict=additionalLoggingValuesDict)

    def _trackValues(self, valuesDict: Dict[str, Any]):
        with self.experiment.start_logging() as run:
            for name, value in valuesDict.items():
                run.log(name, value)
Example No. 8
class SpacyRetrainer:
    def __init__(self,
                 original_model_name=None,
                 experiment_name=None,
                 n_iter=100,
                 dropout=0.5,
                 aml_config='config.json',
                 output_dir='../../model-outputs',
                 train_pickle='../data/train.pickle',
                 test_pickle='../data/test.pickle'):
        self.experiment_name = experiment_name
        if aml_config:
            self.ws = Workspace.from_config(aml_config)
            self.experiment = Experiment(workspace=self.ws,
                                         name=experiment_name)
            self.aml_run = self.experiment.start_logging()
            self.has_aml = True
        else:
            self.has_aml = False

        self.model = original_model_name
        self.n_iter = n_iter
        self.output_dir = output_dir
        self.train_file = train_pickle
        self.test_file = test_pickle
        self.dropout = dropout

    def run(self):
        if self.has_aml:
            self.aml_run.log("model", self.model)
            self.aml_run.log("n_iter", self.n_iter)
            self.aml_run.log("train_file", self.train_file)
            self.aml_run.log("test_file", self.test_file)
            self.aml_run.log("dropout rate", self.dropout)
        model_path = self._train(self.model, self.output_dir, self.n_iter,
                                 self.train_file, self.experiment_name)
        self._score_validate(model_path, self.test_file)
        if self.has_aml:
            self.aml_run.complete()

    def print_scores(self, split, evaluation_result):
        """
        Logs results into experiment run.
        :param split: Name of this split. For ex 'train' or 'valid'
        :param evaluation_result: EvaluationResult containing various metrics
        :return: None. Writes to experiment runner and logs locally.
        """
        logging.info('SPLIT: {0}. PII_precision: {1}, PII_recall: {2}, '
                     'Person_precision: {3}, Person_recall: {4}'. \
                     format(split, evaluation_result.pii_precision, evaluation_result.pii_recall,
                            evaluation_result.entity_precision_dict['PERSON'],
                            evaluation_result.entity_recall_dict['PERSON']))
        if self.has_aml:
            self.aml_run.log('Precision', evaluation_result.pii_precision,
                             split)
            self.aml_run.log('Recall', evaluation_result.pii_recall, split)

    @staticmethod
    def _score(model, data):
        """
        Score the model against the data
        :param model: Trained model
        :param data: Data split which is being scored.
        :return: An EvaluationResult containing various metrics
        """

        spacy_evaluator = SpacyEvaluator(model=model)

        results = []
        for text, ground_truth_annotations in data:
            ground_truth_entities = ground_truth_annotations['entities']
            input_sample = InputSample.from_spacy(text, ground_truth_entities)
            results.append(spacy_evaluator.evaluate_sample(input_sample))

        return spacy_evaluator.calculate_score(evaluation_results=results)

    def _score_validate(self, model_path, test_data_file):
        """
        Validation step for the model. Also prints the scores.
        :param model_path: Path to trained model.
        :param test_data_file: Data file which has the dataset for this split.
        :return: None. Prints the scores.
        """
        with open(test_data_file, 'rb') as f:
            valid_data = pickle.load(f)
        nlp = spacy.load(model_path)
        self.print_scores('Valid', self._score(nlp, valid_data))

    # @plac.annotations(
    #     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    #     output_dir=("Optional output directory", "option", "o", Path),
    #     n_iter=("Number of training iterations", "option", "n", int),
    #     train_file=("File containing pickled training Spacy NER formatted data", "option", "d", Path),
    #     test_file=("File containing pickled test Spacy NER formatted data", "option", "d", Path),
    #     exp_name=("Name of this experiment", "option", "e")
    # )

    def _train(self, model, output_dir, n_iter, train_file, exp_name):
        """Load the model, set up the pipeline and train the entity recognizer."""
        nlp = self.load_or_create_empty_model(model)

        if "ner" not in nlp.pipe_names:
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner, last=True)
        else:
            ner = nlp.get_pipe("ner")

        with open(train_file, 'rb') as f:
            train_data = pickle.load(f)

        # DEBUG
        train_data = train_data[:50]

        # add labels
        for _, annotations in train_data:
            for ent in annotations.get("entities"):
                ner.add_label(ent[2])

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        with nlp.disable_pipes(*other_pipes):  # only train NER
            # reset and initialize the weights randomly – but only if we're
            # training a new model
            if model is None:
                nlp.begin_training()
            for itn in range(n_iter):
                random.shuffle(train_data)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(train_data,
                                    size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(
                        texts,
                        annotations,
                        drop=self.dropout,
                        losses=losses,
                    )
                logging.debug("Losses", losses)
                if self.has_aml:
                    self.aml_run.log('Losses', losses['ner'])
                self.print_scores('Itn {}'.format(itn),
                                  self._score(nlp, train_data))

        self.print_scores('Train', self._score(nlp, train_data))

        saved_model_path = self.save_model(exp_name, nlp, output_dir)
        return saved_model_path

    @staticmethod
    def save_model(exp_name, model, output_dir):
        """
        Saves model to disk for later use.
        :param exp_name: Name of the running experiment. This is used as folder name for storing the model.
        :param model: Model being saved
        :param output_dir: Directory where to save the model.
        :return: Full path to saved model.
        """
        saved_model_path = Path(output_dir, exp_name)
        if not saved_model_path.exists():
            saved_model_path.mkdir(parents=True)
        model.to_disk(saved_model_path)
        logging.info("Saved model to {}".format(output_dir))
        return saved_model_path

    @staticmethod
    def load_model(exp_name, model_dir):
        """
        Loads a spacy model from disk

        :param exp_name: Name of experiment under which the model was saved
        :param model_dir: path to saved model
        :return: spacy model
        """
        saved_model_path = Path(model_dir, exp_name)
        return spacy.load(saved_model_path)

    @staticmethod
    def load_or_create_empty_model(model=None):
        """
        Loads a given model or creates a blank english model.
        :param model: Optional Model to load.
        :return: Loaded or blank model.
        """
        if model:
            nlp = spacy.load(model)
            logging.debug("Loaded model {}".format(model))
        else:
            nlp = spacy.blank("en")
            logging.debug("Created blank 'en' model")
        return nlp
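
A hedged usage sketch for the retrainer; the model name, experiment name, and pickle paths are illustrative assumptions:

retrainer = SpacyRetrainer(original_model_name='en_core_web_lg',
                           experiment_name='spacy_ner_retrain',
                           n_iter=30,
                           aml_config='config.json',
                           train_pickle='../data/train.pickle',
                           test_pickle='../data/test.pickle')
retrainer.run()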
Example No. 9
class ExploratoryAnalysis():
    def __init__(self):
        self.__parser = argparse.ArgumentParser("preprocessing")
        self.__parser.add_argument("--datastore",
                                   type=str,
                                   help="Name of the datastore",
                                   default="workspaceblobstore")
        self.__parser.add_argument("--dataset_name",
                                   type=str,
                                   help="Name of the dataset")
        self.__parser.add_argument("--dataset_preprocessed_name",
                                   type=str,
                                   help="Standard preprocessed dataset")
        self.__parser.add_argument("--output_preprocess_dataset",
                                   type=str,
                                   help="Name of the PipelineData reference")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'exploratory_analysis')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

        self.__datastore = Datastore.get(self.__ws,
                                         datastore_name=self.__args.datastore)

    def main(self):
        df, df_eda = self.__preprocess_dataset(
            schema_path="./schema_dataset.json")
        self.__make_exploratory_analysis(df_eda)
        self.__upload_datasets(df, df.columns)

    def __preprocess_dataset(self, schema_path):
        with open(schema_path) as f:
            schema = json.load(f)

        df, df_eda = self.__get_dataset(self.__args.dataset_name)
        columns_names = schema.keys()
        df.columns = columns_names

        return df, df_eda

    def __make_exploratory_analysis(self, df):
        self.__frequency_tremor(df)
        self.__tremor_acceleration_energy(df)
        self.__execute_tsne(df)

    def __get_dataset(self, dataset_name):
        acc0 = self.__ws.datasets.get(
            f"{dataset_name}0_dataset").to_pandas_dataframe()
        acc1 = self.__ws.datasets.get(
            f"{dataset_name}1_dataset").to_pandas_dataframe()
        acc2 = self.__ws.datasets.get(
            f"{dataset_name}2_dataset").to_pandas_dataframe()
        acc3 = self.__ws.datasets.get(
            f"{dataset_name}3_dataset").to_pandas_dataframe()

        df_eda = pd.concat([acc0, acc1, acc2, acc3], axis=0)
        df_eda['Tremor'] = df_eda['Tremor'].replace(2, 1)

        df = pd.concat([acc0, acc1, acc2, acc3], axis=0)
        df['Tremor'] = df['Tremor'].replace(2, 1)

        return df, df_eda

    def __upload_datasets(self, df, columns):
        dataset_name, preprocess_filepath, datastore_path = self.__get_dataset_metadata(
            df, "train")
        self.__upload_dataset(self.__ws,
                              self.__datastore,
                              dataset_name,
                              datastore_path,
                              preprocess_filepath,
                              use_datadrift=False,
                              type_dataset="standard")

    def __get_dataset_metadata(self, df, extension):
        dataset_name = f'{self.__args.dataset_preprocessed_name}_{extension}'
        output_preprocessed_directory = self.__args.output_preprocess_dataset if extension == "train" else f'{self.__args.output_preprocess_dataset}_{extension}'
        preprocess_filepath = os.path.join(output_preprocessed_directory,
                                           f'{dataset_name}.csv')
        datastore_path = f"parkinson/{dataset_name}.csv"

        os.makedirs(output_preprocessed_directory, exist_ok=True)
        df.to_csv(preprocess_filepath, index=False)

        return dataset_name, preprocess_filepath, datastore_path

    def __upload_dataset(self, ws, def_blob_store, dataset_name,
                         datastore_path, filepath, use_datadrift,
                         type_dataset):
        def_blob_store.upload_files([filepath],
                                    target_path="parkinson",
                                    overwrite=True)
        tab_data_set = Dataset.Tabular.from_delimited_files(
            path=(def_blob_store, datastore_path))
        try:
            tab_data_set.register(workspace=ws,
                                  name=f'{dataset_name}',
                                  description=f'{dataset_name} data',
                                  tags={
                                      'format': 'CSV',
                                      'use_datadrift': use_datadrift,
                                      'type_dataset': type_dataset
                                  },
                                  create_new_version=True)
        except Exception as ex:
            print(ex)

    def __frequency_tremor(self, df):
        axis = ["accZ_mean", "accX_mean"]
        for axi in axis:
            sns.set_style('whitegrid')
            plt.rcParams['font.family'] = 'Dejavu Sans'
            plt.figure(figsize=(16, 8))

            sns.set_palette("Set1", desat=0.80)
            facetgrid = sns.FacetGrid(df, hue='Tremor', size=6, aspect=2)
            facetgrid.map(sns.distplot, f"{axi}", hist=False)\
                .add_legend()
            self.__run.log_image(f"Parkinson Tremor - {axi}", plot=plt)

    def __tremor_acceleration_energy(self, df):
        plt.figure(figsize=(6, 8))
        sns.boxplot(x='Tremor',
                    y='accX_energy',
                    data=df,
                    showfliers=False,
                    saturation=1)
        plt.ylabel('Acceleration Energy X')
        self.__run.log_image(f"Parkinson Tremor - Acceleration Energy X",
                             plot=plt)

    def __perform_tsne(self,
                       X_data,
                       y_data,
                       perplexities,
                       n_iter=1000,
                       img_name_prefix='t-sne'):
        for index, perplexity in enumerate(perplexities):
            print(
                '\nperforming tsne with perplexity {} and with {} iterations at max'
                .format(perplexity, n_iter))
            X_reduced = TSNE(verbose=2,
                             perplexity=perplexity).fit_transform(X_data)
            print('Done..')

            print('Creating plot for this t-sne visualization..')
            df = pd.DataFrame({
                'x': X_reduced[:, 0],
                'y': X_reduced[:, 1],
                'label': y_data
            })

            sns.lmplot(data=df, x='x', y='y', hue='label', fit_reg=False, size=8,\
                    palette="Set1",markers=['*', 'o'])
            plt.title("perplexity : {} and max_iter : {}".format(
                perplexity, n_iter))
            img_name = img_name_prefix + '_perp_{}_iter_{}.png'.format(
                perplexity, n_iter)
            print('saving this plot as image in present working directory...')
            self.__run.log_image(f"Parkinson Tremor - {img_name}", plot=plt)

    def __execute_tsne(self, df):
        X_norm = normalize(df.drop(['Tremor'], axis=1), norm='l2')
        X_new2 = MinMaxScaler().fit_transform(X_norm)
        X_pre_tsne = X_new2
        y_pre_tsne = df['Tremor']
        self.__perform_tsne(X_data=X_pre_tsne,
                            y_data=y_pre_tsne,
                            perplexities=[2, 5, 10, 20, 50])
Example No. 10
def main():
    config = utils.load_yaml(args.config)
    task = config['task']
    EPOCHS = config['epoch']
    N_FOLDS = 5
    BATCH_SIZE = config['batchsize']
    IMAGE_SIZE = config['image_size']
    model_name = config['model']
    optimizer_name = config['optimizer']
    loss = config['loss']
    lr = float(config['lr'])
    n_class = config['n_class']
    lr_scheduler = config.get('lr_scheduler')
    azure_run = None
    tb_writer = None
    num_workers = 64
    experiment_name = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')

    print(f'found {torch.cuda.device_count()} gpus !!')
    try:

        if args.debug:
            print('running in debug mode')
            EPOCHS = 1
            N_FOLDS = 2
        if args.debug:
            result_dir = Path(utils.RESULT_DIR) / ('debug-' + experiment_name)
        else:
            result_dir = Path(utils.RESULT_DIR) / experiment_name
            ws = Workspace.from_config('.aml_config/config.json')
            exp = Experiment(workspace=ws, name='kaggle-aptos2019')
            azure_run = exp.start_logging()
            azure_run.log('experiment name', experiment_name)
            azure_run.log('epoch', EPOCHS)
            azure_run.log('batch size', BATCH_SIZE)
            azure_run.log('image size', IMAGE_SIZE)
            azure_run.log('model', model_name)
            azure_run.log('optimizer', optimizer_name)
            azure_run.log('loss_name', loss['name'])
            azure_run.log('lr', lr)
            azure_run.log('lr_scheduler', lr_scheduler)
            azure_run.log('task', task)
            if args.cv:
                azure_run.log('cv', N_FOLDS)
            else:
                azure_run.log('cv', 0)

        if args.multi:
            print('use multi gpu !!')

        os.mkdir(result_dir)
        print(f'created: {result_dir}')
        utils.save_yaml(result_dir / Path(args.config).name, config)

        #         if not args.debug:
        #             tb_writer = SummaryWriter(log_dir=result_dir)

        device = torch.device("cuda:0")
        config = {
            'epochs': EPOCHS,
            'multi': args.multi,
            'batch_size': BATCH_SIZE,
            'image_size': IMAGE_SIZE,
            'model_name': model_name,
            'n_class': n_class,
            'optimizer_name': optimizer_name,
            'loss': loss,
            'lr': lr,
            'lr_scheduler': lr_scheduler,
            'task': task,
            'device': device,
            'num_workers': num_workers,
        }

        print(config)

        if not args.debug:
            slack.notify_start(experiment_name, config)
        train_df = pd.read_csv(utils.TRAIN_CSV_PATH)
        if args.debug:
            train_df = train_df[:1000]
        config['df'] = train_df

        skf = StratifiedKFold(n_splits=N_FOLDS, random_state=41, shuffle=True)
        indices = list(skf.split(train_df, train_df['diagnosis']))
        if not args.cv:
            print('do not use cross validation')
            indices = [indices[0]]

        # cross validation
        oof_preds = np.zeros((len(train_df), n_class))
        for i_fold, (train_index, valid_index) in tqdm(enumerate(indices)):
            model_path = result_dir / f'model_fold{i_fold}'
            config['train_index'] = train_index
            config['valid_index'] = valid_index
            config['model_path'] = str(model_path)
            if azure_run:
                if i_fold == 0:
                    config['azure_run'] = azure_run
                    y_pred, y_true = utils.run_model(**config)
                else:
                    with azure_run.child_run() as child:
                        config['azure_run'] = child
                        y_pred, y_true = utils.run_model(**config)
            else:
                y_pred, y_true = utils.run_model(**config)
            if args.cv:
                oof_preds[valid_index] = y_pred
        if args.cv:
            valid_preds = oof_preds
            valid_true = train_df['diagnosis']
        else:
            valid_preds = y_pred
            valid_true = y_true
        if task == 'class':
            round_valid_preds = np.argmax(valid_preds, axis=1)
        elif task == 'reg':
            print('optimizing threshold ...')
            optR = utils.OptimizedRounder()
            optR.fit(valid_preds, valid_true)
            coef = optR.coefficients()
            print(f'best coef: {coef}')
            if azure_run:
                azure_run.log('coef', coef)
            round_valid_preds = optR.predict(valid_preds, coef)
        val_kappa = cohen_kappa_score(round_valid_preds,
                                      valid_true,
                                      weights='quadratic')

        print(f'best val kappa: {val_kappa}')
        if azure_run:
            azure_run.log('best val kappa', val_kappa)

        test_csv = pd.read_csv(utils.TEST_CSV_PATH)
        #test_tfms = utils.build_transform(size=IMAGE_SIZE, mode='test')
        test_tfms = utils.build_transform(size=IMAGE_SIZE, mode='val')
        test_dataset = RetinopathyDataset(df=test_csv,
                                          mode='test',
                                          transform=test_tfms,
                                          auto_crop=True,
                                          add_blur=True)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=BATCH_SIZE,
                                                  shuffle=False,
                                                  pin_memory=True,
                                                  num_workers=num_workers)

        test_preds = np.zeros((len(test_csv), n_class))
        for i in range(len(indices)):
            model = utils.load_pytorch_model(model_name,
                                             result_dir / f'model_fold{i}',
                                             n_class)
            test_preds += utils.predict(model,
                                        test_loader,
                                        n_class=n_class,
                                        device=device,
                                        tta=1)
        test_preds /= len(indices)
        if task == 'class':
            round_test_preds = np.argmax(test_preds, axis=1)
        elif task == 'reg':
            round_test_preds = optR.predict(test_preds, coef)
        submission_csv = pd.read_csv(utils.SAMPLE_SUBMISSION_PATH)
        submission_csv['diagnosis'] = round_test_preds
        submission_csv.to_csv(result_dir / 'submission.csv', index=False)

        print('finish!!!')
        if not args.debug:
            slack.notify_finish(experiment_name, config, val_kappa)

    except KeyboardInterrupt as e:
        if not args.debug:
            slack.notify_fail(experiment_name, config, e.__class__.__name__,
                              str(e))
    except Exception as e:
        if azure_run:
            azure_run.fail(e)
        if not args.debug:
            slack.notify_fail(experiment_name, config, e.__class__.__name__,
                              str(e))
        raise
    finally:
        if azure_run:
            azure_run.complete()
            print('close azure_run')
        if tb_writer:
            tb_writer.export_scalars_to_json(
                os.path.join(result_dir, 'all_scalars.json'))
            tb_writer.close()
            print('close tb_writer')
Example No. 11
print('The joblib version is {}.'.format(joblib.__version__))
print('The pandas version is {}.'.format(pd.__version__))
#print('The sklearn_pandas version is {}.'.format(sklearn_pandas.__version__))

# +
from azureml.core import Dataset

run = Run.get_context()

if run.id.startswith('OfflineRun'):
    ws = Workspace.from_config()

    experiment_name = 'heart-failure-clinical-data'
    experiment = Experiment(ws, experiment_name)

    interactive_run = experiment.start_logging()
else:
    ws = run.experiment.workspace

ds = Dataset.get_by_name(ws, name='Heart Failure Prediction')
# -

x, y = get_x_y(ds)

# +
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y)

# Bundle the splits in the shape the training code below expects
data = {
    "train": {"X": x_train, "y": y_train},
    "test": {"X": x_test, "y": y_test}
}


# In[2]:


# Get an experiment object from Azure Machine Learning
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

experiment = Experiment(workspace=ws, name="train-within-notebook")
run = experiment.start_logging(snapshot_directory=None)
# Create a run object in the experiment

# Log the algorithm parameter alpha to the run
run.log('alpha', 0.03)

# Create, fit, and test the scikit-learn Ridge regression model
regression_model = Ridge(alpha=0.03)
regression_model.fit(data['train']['X'], data['train']['y'])
preds = regression_model.predict(data['test']['X'])

# Output the Mean Squared Error to the notebook and to the run
print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds))
run.log('mse', mean_squared_error(data['test']['y'], preds))

# Save the model to the outputs directory for capture
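# Assumed completion (not part of the original excerpt): persist the fitted model under
# ./outputs so Azure ML captures it with the run.
import os
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=regression_model, filename='outputs/model.pkl')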
Example No. 13
def analyse_with_gordo():
    ws = Workspace.from_config()  # Azure ML
    # Get an experiment object from Azure Machine Learning
    experiment_name = "dummy_test"
    experiment = Experiment(workspace=ws, name=experiment_name)  # Azure ML
    mlflow.set_experiment(experiment_name)  # MLFlow

    resamples_for_model = ["1T", "1H"]
    aggregation_methods = ["max", "mean"]
    batch_sizes = [1, 10, 100]
    epochs = [1, 10]
    number_of_permutations = len(
        list(
            itertools.product(
                aggregation_methods, resamples_for_model, batch_sizes, epochs
            )
        )
    )

    resampled_original_data = read_and_resample("2nd_test.hdf", "1S")

    if PLOTTING:
        plotnum = 0
        f, axarr = plt.subplots(number_of_permutations + 1, sharex=True)
        axarr[plotnum].plot(
            resampled_original_data, linewidth=1, label="sensor_data_1S_mean"
        )
        axarr[plotnum].legend(loc="upper left")
        plotnum += 1

    for aggregation_method, interval, batch_size, epoch in itertools.product(
        aggregation_methods, resamples_for_model, batch_sizes, epochs
    ):
        run = experiment.start_logging()
        with mlflow.start_run():
            mlflow.log_param("interval", interval)  # MLFlow
            mlflow.log_param("aggregation_method", aggregation_method)  # MLFlow
            mlflow.log_param("batch_size", batch_size)  # MLFlow
            mlflow.log_param("epochs", epoch)  # MLFlow

            run.log("interval", interval)  # Azure ML
            run.log("aggregation_method", aggregation_method)  # Azure ML
            run.log("batch_size", batch_size)  # Azure ML
            run.log("epochs", epoch)  # Azure ML

            print(
                f"Build model for data resampled with {interval} resolution,  method {aggregation_method}, batch size {batch_size} and number of epochs {epoch}"
            )
            resampled = read_and_resample(
                "2nd_test.hdf", interval, aggregation_method=aggregation_method
            )
            anomalies, avg_train_anomaly, predicted_data, train_until_index = build_model(
                resampled, epoch, batch_size
            )

            r2_train, expl_train, r2_test, expl_test = calc_scores(
                resampled, predicted_data, train_until_index
            )
            run.log("r2_train", r2_train)  # Azure ML
            run.log("explained_variance_train", expl_train)  # Azure ML
            run.log("r2_test", r2_test)  # Azure ML
            run.log("explained_variance_test", expl_test)  # Azure ML

            mlflow.log_metric("r2_train", r2_train)  # MLFlow
            mlflow.log_metric("explained_variance_train", expl_train)  # MLFlow
            mlflow.log_metric("r2_test", r2_test)  # MLFlow
            mlflow.log_metric("explained_variance_test", expl_test)  # MLFlow

            anomalies = anomalies.rolling(
                resamples_for_model[-1]
            ).mean()  # Use the last of the experiment resamples as the anomaly resample
            if PLOTTING:
                axarr[plotnum].plot(
                    anomalies, label=interval + "-" + aggregation_method + "-model"
                )
                axarr[plotnum].axhline(avg_train_anomaly, color="r")
                axarr[plotnum].legend(loc="upper left")
                plotnum += 1

        run.complete()  # Azure ML

    if PLOTTING:
        plt.show()
#   Compile the model: minimize categorical cross-entropy with the Adadelta optimizer and track accuracy
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

experiment_name = 'fashion-mnist'
experiment = Experiment(workspace=ws, name=experiment_name)

#   Define early stopping callback
my_callbacks = [EarlyStopping(monitor='val_acc', patience=5, mode='max')]

os.makedirs('./outputs', exist_ok=True)

#   Train the model and test/validate the mode with the test data after each cycle (epoch) through the training data
#   Return history of loss and accuracy for each epoch
with experiment.start_logging() as run:
    run.tag("Description","Locally trained Fashion MNIST model")
    
    hist = model.fit(x_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        callbacks=my_callbacks,
        validation_data=(x_test, y_test))
    run.log_list('Training Accuracy', hist.history['acc'])
    run.log_list('Validation Accuracy', hist.history['val_acc'])
    
    #   Evaluate the model with the test data to get the scores on "real" data.
    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
Example No. 15
class Deployment():
    def __init__(self):
        self.__parser = argparse.ArgumentParser("deploy")

        self.__parser.add_argument(
            "--update_deployment",
            type=distutils.util.strtobool,
            help="Deployment flag. False=deploy from scratch, True=update the existing service"
        )
        self.__parser.add_argument("--dataset_name",
                                   type=str,
                                   help="Dataset name")
        self.__parser.add_argument("--model_name", type=str, help="Model name")
        self.__parser.add_argument("--explainer_model_name",
                                   type=str,
                                   help="Explainer model name")
        self.__parser.add_argument("--service_name",
                                   type=str,
                                   help="Service name")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'deploy_service')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

        self.__config = configparser.ConfigParser()
        self.__config.read("./config.ini")

    def main(self):
        dataset = Dataset.get_by_name(self.__ws, self.__args.dataset_name)
        df = dataset.to_pandas_dataframe()
        df = df.drop(['target'], axis=1)
        columns = [*df.columns]
        parameters = {
            "model_name": self.__args.model_name,
            "explainer_model_name": self.__args.explainer_model_name,
            "dataset_columns": columns
        }

        joblib.dump(
            parameters,
            os.path.join(self.__config.get('DEPLOY', 'DEPENDENCIES_DIRECTORY'),
                         "deploy_parameters.pkl"))
        self.__deploy_model()

    def __deploy_model(self):
        service_name = self.__args.service_name

        model = Model(self.__ws, self.__args.model_name)
        explainer_model = Model(self.__ws, self.__args.explainer_model_name)
        myenv = Environment.from_conda_specification(
            name=self.__config.get('DEPLOY', 'ENV_NAME'),
            file_path=self.__config.get('DEPLOY', 'ENV_FILE_PATH'))
        inference_config = InferenceConfig(
            entry_script=self.__config.get('DEPLOY', 'SCORE_PATH'),
            environment=myenv,
            source_directory=self.__config.get('DEPLOY',
                                               'DEPENDENCIES_DIRECTORY'))

        if not self.__args.update_deployment:
            deployment_config = AciWebservice.deploy_configuration(
                cpu_cores=self.__config.getint('DEPLOY', 'ACI_CPU'),
                memory_gb=self.__config.getint('DEPLOY', 'ACI_MEM'),
                collect_model_data=True,
                enable_app_insights=True)
            service = Model.deploy(self.__ws, service_name,
                                   [model, explainer_model], inference_config,
                                   deployment_config)
        else:
            service = AciWebservice(self.__ws, service_name)
            service.update(models=[model, explainer_model],
                           inference_config=inference_config)

        service.wait_for_deployment(show_output=True)
        print(service.state)
        print(service.get_logs())
Example No. 16
experiment = Experiment(workspace=ws, name="diabetes-experiment")
x_df = Diabetes.get_tabular_dataset().to_pandas_dataframe().dropna()
y_df = x_df.pop("Y")

X_train, X_test, y_train, y_test = train_test_split(x_df,
                                                    y_df,
                                                    test_size=0.2,
                                                    random_state=66)

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(X_train)

alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

for alpha in alphas:
    run = experiment.start_logging()
    run.log("alpha_value", alpha)

    model = Ridge(alpha=alpha)
    model.fit(X=X_train, y=y_train)
    y_pred = model.predict(X=X_test)
    rmse = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
    run.log("rmse", rmse)

    model_name = "model_alpha_" + str(alpha) + ".pkl"
    filename = "outputs/" + model_name

    joblib.dump(value=model, filename=filename)
    run.upload_file(name=model_name, path_or_stream=filename)
    run.complete()
Example No. 17
workspace_name = ""
workspace_region = ""

ws = Workspace(subscription_id=subscription_id,
               resource_group=resource_group,
               workspace_name=workspace_name)

# COMMAND ----------

# create experiment
experiment_name = 'bikeSharingDemand'
exp = Experiment(workspace=ws, name=experiment_name)

# COMMAND ----------

run = exp.start_logging(snapshot_directory=None)

# COMMAND ----------

df = (spark.read.format("csv").option("inferSchema", "True").option(
    "header",
    "True").load("/databricks-datasets/bikeSharing/data-001/day.csv"))

# split data
train_df, test_df = df.randomSplit([0.7, 0.3])

# One Hot Encoding
mnth_encoder = OneHotEncoder(inputCol="mnth", outputCol="encoded_mnth")
weekday_encoder = OneHotEncoder(inputCol="weekday",
                                outputCol="encoded_weekday")
Example No. 18
PARSER.add_argument('--AZUREML_ARM_WORKSPACE_NAME')
PARSER.add_argument('--TENANT_ID')
PARSER.add_argument('--APP_ID')
PARSER.add_argument('--APP_SECRET')

ARGS = PARSER.parse_args()

WORKSPACE_NAME = ARGS.AZUREML_ARM_WORKSPACE_NAME
RESOURCE_GROUP = ARGS.AZUREML_ARM_RESOURCEGROUP
SUBSCRIPTION_ID = ARGS.AZUREML_ARM_SUBSCRIPTION
TENANT_ID = ARGS.TENANT_ID
APP_ID = ARGS.APP_ID
APP_SECRET = ARGS.APP_SECRET

SP_AUTH = ServicePrincipalAuthentication(tenant_id=TENANT_ID,
                                         service_principal_id=APP_ID,
                                         service_principal_password=APP_SECRET)

WORKSPACE = Workspace.get(WORKSPACE_NAME, SP_AUTH, SUBSCRIPTION_ID,
                          RESOURCE_GROUP)

EXPERIMENT = Experiment(workspace=WORKSPACE, name="trainpipeline")

print(EXPERIMENT.name, EXPERIMENT.workspace.name, sep='\n')

EXPERIMENT_RUN = EXPERIMENT.start_logging()

EXPERIMENT_RUN.log('my magic number', 45)

EXPERIMENT_RUN.complete()
class BuildFairnLearnModels():

    def __init__(self):
        self.__parser = argparse.ArgumentParser("fairlearn")
        self.__parser.add_argument("--dataset_name", type=str,
                                   default="heart_disease_preprocessed_train",
                                   help="Name of the dataset")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'fairlearn')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

        self.__sensitive_features = ['asthmatic', 'diabetic', 'smoker']

    def main(self):
        dataset = self.__get_dataset(self.__args.dataset_name)
        df = dataset.to_pandas_dataframe()

        X_raw, Y, A, X = self.__transform_df(df)
        X_train, X_test, Y_train, Y_test, A_train, A_test = self.__df_train_split(
            X_raw, Y, A, X)

        clf = Pipeline(steps=[('classifier', LogisticRegression(
            solver='liblinear', fit_intercept=True))])
        model = clf.fit(X_train, Y_train)

        predictors = self.__mitigation_with_gridsearch(
            X_train, A_train, Y_train, model)
        all_results = self.__remove_predictors_dominated_error_disparity_by_sweep(
            predictors, X_train, Y_train, A_train)
        dominant_models_dict, all_models_dict = self.__generate_dominant_models(
            model, all_results)
        models_all = self.__build_predictions_all_models(
            all_models_dict, X_test)
        dominant_all = self.__build_predictions_dominant_models(
            dominant_models_dict, X_test)

        os.makedirs('models', exist_ok=True)

        model_name_id_mapping = self.__get_dominant_models_names(dominant_all)
        dominant_all_ids = self.__get_dominant_models_id(
            dominant_all, model_name_id_mapping)
        dash_dict_all = self.__get_dashboard_dict(
            A_test, Y_test, dominant_all_ids)
        self.__plot_all_multimodel_by_feature(dash_dict_all)
        self.__upload_best_disparity_model_by_feature(
            dash_dict_all['precomputedMetrics'], dominant_all)
        self.__upload_dashboard_dict(dash_dict_all)

    def __get_dataset(self, dataset_name):
        return self.__ws.datasets.get(dataset_name)

    def __transform_df(self, df):
        X_raw = df.drop(['target'], axis=1)
        Y = df['target']

        A = X_raw[self.__sensitive_features]
        X = X_raw.drop(labels=self.__sensitive_features, axis=1)

        return X_raw, Y, A, X

    def __df_train_split(self, X_raw, Y, A, X):
        X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split(X_raw, Y, A,
                                                                             test_size=0.3,
                                                                             random_state=123,
                                                                             stratify=Y,
                                                                             shuffle=True)
        X_train = X_train.reset_index(drop=True)
        A_train = A_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        A_test = A_test.reset_index(drop=True)

        # Use .loc indexing (rather than chained assignment) to relabel the sensitive features
        A_test.loc[A_test['diabetic'] == 0, 'diabetic'] = 'not diabetic'
        A_test.loc[A_test['diabetic'] == 1, 'diabetic'] = 'diabetic'

        A_test.loc[A_test['asthmatic'] == 0, 'asthmatic'] = 'not asthmatic'
        A_test.loc[A_test['asthmatic'] == 1, 'asthmatic'] = 'asthmatic'

        A_test.loc[A_test['smoker'] == 0, 'smoker'] = 'not smoker'
        A_test.loc[A_test['smoker'] == 1, 'smoker'] = 'smoker'

        return X_train, X_test, Y_train, Y_test, A_train, A_test

    def __mitigation_with_gridsearch(self, X_train, A_train, Y_train, fitted_model):
        sweep = GridSearch(LogisticRegression(solver='liblinear', fit_intercept=True),
                           constraints=DemographicParity(),
                           grid_size=70)
        sweep.fit(X_train, Y_train, sensitive_features=A_train.diabetic)
        predictors = sweep._predictors

        return predictors

    def __remove_predictors_dominated_error_disparity_by_sweep(self, predictors, X_train, Y_train, A_train):
        errors, disparities = [], []
        for m in predictors:
            def classifier(X): return m.predict(X)

            error = ErrorRate()
            error.load_data(X_train, pd.Series(Y_train),
                            sensitive_features=A_train.diabetic)
            disparity = DemographicParity()
            disparity.load_data(X_train, pd.Series(
                Y_train), sensitive_features=A_train.diabetic)

            errors.append(error.gamma(classifier)[0])
            disparities.append(disparity.gamma(classifier).max())

        return pd.DataFrame({"predictor": predictors, "error": errors, "disparity": disparities})

    def __generate_dominant_models(self, model, all_results):
        all_models_dict = {"heart_disease_unmitigated": model}
        dominant_models_dict = {"heart_disease_unmitigated": model}
        base_name_format = "heart_disease_grid_model_{0}"

        row_id = 0
        for row in all_results.itertuples():
            model_name = base_name_format.format(row_id)
            all_models_dict[model_name] = row.predictor
            errors_for_lower_or_eq_disparity = all_results[
                "error"][all_results["disparity"] <= row.disparity]
            if row.error <= errors_for_lower_or_eq_disparity.min():
                dominant_models_dict[model_name] = row.predictor
            row_id = row_id + 1

        return dominant_models_dict, all_models_dict

    def __build_predictions_all_models(self, all_models_dict, X_test):
        dashboard_all = dict()
        models_all = dict()
        for name, predictor in all_models_dict.items():
            value = predictor.predict(X_test)
            dashboard_all[name] = value
            models_all[name] = predictor

        return models_all

    def __build_predictions_dominant_models(self, dominant_models_dict, X_test):
        dominant_all = dict()
        for n, p in dominant_models_dict.items():
            dominant_all[n] = p.predict(X_test)

        return dominant_all

    def __get_dominant_models_id(self, dominant_all, model_name_id_mapping):
        dominant_all_ids = dict()
        for name, y_pred in dominant_all.items():
            dominant_all_ids[model_name_id_mapping[name]] = y_pred

        return dominant_all_ids

    def __get_dashboard_dict(self, A_test, Y_test, dominant_all_ids):
        sf = {'diabetic': A_test.diabetic,
              'asthmatic': A_test.asthmatic, 'smoker': A_test.smoker}
        return _create_group_metric_set(y_true=Y_test,
                                        predictions=dominant_all_ids,
                                        sensitive_features=sf,
                                        prediction_type='binary_classification')

    def __register_model(self, name, model, disparity=""):
        print("Registering ", name)
        model_path = "models/{0}.pkl".format(name)
        joblib.dump(value=model, filename=model_path)
        registered_model = Model.register(model_path=model_path,
                                          model_name=name,
                                          workspace=self.__ws,
                                          properties={
                                              "root_run_id": self.__run._root_run_id,
                                              "child_run_id": self.__run.id,
                                              "experiment": self.__run.experiment.name},
                                          tags={"disparity": f'{disparity}%'})
        print("Registered ", registered_model.id)
        return registered_model.id

    def __get_dominant_models_names(self, dominant_all):
        model_name_id_mapping = dict()
        for name, model in dominant_all.items():
            m_id = self.__register_model(name, model)
            model_name_id_mapping[name] = m_id

        return model_name_id_mapping

    def __upload_dashboard_dict(self, dash_dict_all):
        run = self.__exp.start_logging()
        try:
            dashboard_title = "Upload MultiAsset from Grid Search with heart-disease data"
            upload_id = upload_dashboard_dictionary(run,
                                                    dash_dict_all,
                                                    dataset_name=self.__args.dataset_name,
                                                    dashboard_name=dashboard_title)
        finally:
            run.complete()

    def __difference_selection_rate(self, selection_rate):
        return abs(selection_rate[0]-selection_rate[1])

    def __build_models_metrics(self, tags, feature_models, feature):
        tags[feature]['disparity'].append(self.__difference_selection_rate(
            feature_models['selection_rate']['bins']))

    def __upload_best_disparity_model_by_feature(self, dash_dict_all, dominant_all):
        tags = {}
        for i, feature in enumerate(self.__sensitive_features):
            tags[feature] = {}
            tags[feature]['disparity'] = []
            list(map(lambda feature_models: self.__build_models_metrics(
                tags, feature_models, feature), dash_dict_all[i]))
            model_info = tuple(dominant_all.items())[
                tags[feature]['disparity'].index(min(tags[feature]['disparity']))]
            self.__register_model(
                f'{feature}', model_info[1], min(tags[feature]['disparity']))

    def __scatterplot(self, disparities, accuracy_scores, legend, feature):
        plt.figure(figsize=(12, 7), dpi=80)
        colors = np.random.rand(len(accuracy_scores), 4)
        for accuracy, disparity, model_name, color in zip(accuracy_scores, disparities, legend, colors):
            plt.scatter(accuracy, disparity, c=[
                        color], s=170, label=model_name, alpha=0.3)
        plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left")
        plt.title('Multi model view - Models Comparison')
        plt.xlabel("Accuracy")
        plt.ylabel("Disparity in predictions")
        plt.grid()
        self.__run.log_image(
            f'Multi model view - Models Comparison of {feature}', plot=plt)

    def __get_models_metrics(self, feature_models, disparities, accuracy_scores):
        disparities.append(self.__difference_selection_rate(
            feature_models['selection_rate']['bins']))
        accuracy_scores.append(feature_models['accuracy_score']['global'])

    def __plot_all_multimodel_by_feature(self, dash_dict_all):
        for feature in self.__sensitive_features:
            self.__plot_multimodel_view_by_feature(feature, dash_dict_all)

    def __plot_multimodel_view_by_feature(self, feature, dash_dict_all):
        disparities = []
        accuracy_scores = []
        list(map(lambda feature_models: self.__get_models_metrics(feature_models, disparities,
                                                                  accuracy_scores), dash_dict_all['precomputedMetrics'][self.__sensitive_features.index(feature)]))
        self.__scatterplot(disparities, accuracy_scores,
                           dash_dict_all['modelNames'], feature)
class DifferentialPrivacy():

    def __init__(self):
        self.__parser = argparse.ArgumentParser("differential_privacy")
        self.__parser.add_argument("--datastore", type=str, help="Name of the datastore",
                                   default="workspaceblobstore")
        self.__parser.add_argument(
            "--dataset_name", type=str, help="Name of the dataset")
        self.__parser.add_argument(
            "--retrain_status", type=distutils.util.strtobool, help="Retrain status")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'differential_privacy')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

        self.__datastore = Datastore.get(
            self.__ws, datastore_name=self.__args.datastore)

    def main(self):
        if not self.__args.retrain_status:
            self.__main_execution()
        else:
            self.__run.add_properties(
                {'status': "The following step has been skipped because a retraining pipeline has been launched"})

    def __main_execution(self):
        with wn.Analysis() as analysis:
            data, self.__nsize = self.__get_dp_noise_dataset()
            sex_histogram_geometric, sex_histogram_laplace = self.__create_sex_histograms(
                data)
            state_histogram_geometric, state_histogram_laplace = self.__create_state_histograms(
                data)
            age_histogram_geometric, age_histogram_laplace = self.__create_age_histograms(
                data)
        analysis.release()

        n_sex, n_state, n_age = self.__create_and_upload_real_data_histograms()

        self.__show_dp_and_real_histogram(
            "Sex", ['female', 'male'], n_sex, sex_histogram_geometric, sex_histogram_laplace)
        self.__show_dp_and_real_histogram(
            "State", self.get_states(), n_state, state_histogram_geometric, state_histogram_laplace)
        self.__show_dp_and_real_histogram(
            "Age", list(range(20, 80, 10)), n_age, age_histogram_geometric, age_histogram_laplace)

        if self.__local_run:
            self.__run.complete()

    def get_columns(self):
        df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe()
        return [*df.columns]

    def get_states(self):
        df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe()
        return [*df['state'].unique()]

    def __get_dp_noise_dataset(self):
        df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe()
        df.to_csv('tmp.csv', index=False)
        return wn.Dataset(path='tmp.csv', column_names=self.get_columns()), len(df.index)

    def __get_dataset(self, dataset_name):
        return self.__ws.datasets.get(dataset_name)

    def __create_sex_histograms(self, data):
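        # Two differentially private histograms for 'sex': dp_histogram applies the geometric
        # mechanism directly, while laplace_mechanism adds Laplace noise to the exact counts
        # computed by wn.histogram.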
        sex_histogram_geometric = wn.dp_histogram(
            wn.to_bool(data['sex'], true_label="0"),
            upper=self.__nsize,
            privacy_usage={'epsilon': .5, 'delta': 0.00001}
        )
        sex_prep = wn.histogram(wn.to_bool(
            data['sex'], true_label="0"), null_value=True)
        sex_histogram_laplace = wn.laplace_mechanism(
            sex_prep, privacy_usage={"epsilon": 0.4, "delta": .000001})

        return sex_histogram_geometric, sex_histogram_laplace

    def __create_state_histograms(self, data):
        states = self.get_states()
        state_histogram_geometric = wn.dp_histogram(
            data['state'],
            categories=states,
            null_value=states[0],
            privacy_usage={'epsilon': 0.2}
        )

        state_prep = wn.histogram(data['state'], categories=states,
                                  null_value=states[0])
        state_histogram_laplace = wn.laplace_mechanism(state_prep,
                                                       privacy_usage={"epsilon": 0.5, "delta": .000001})
        return state_histogram_geometric, state_histogram_laplace

    def __create_age_histograms(self, data):
        age_edges = list(range(20, 80, 10))
        age_histogram_geometric = wn.dp_histogram(
            wn.to_int(data['age'], lower=20, upper=80),
            edges=age_edges,
            upper=self.__nsize,
            null_value=20,
            privacy_usage={'epsilon': 0.5}
        )

        age_prep = wn.histogram(wn.to_int(data['age'], lower=20, upper=80),
                                edges=age_edges, null_value=20)
        age_histogram_laplace = wn.laplace_mechanism(
            age_prep, privacy_usage={"epsilon": 0.5, "delta": .000001})

        return age_histogram_geometric, age_histogram_laplace

    def __create_and_upload_real_data_histograms(self):
        df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe()

        sex = list(df['sex'])
        state = list(df['state'])
        age = list(df['age'])

        n_sex = self.__upload_real_data_histogram(sex, [-0.5, 0.5, 1.5], "Sex")
        n_state = self.__upload_real_data_histogram(
            state, list(range(6)), "State")
        n_age = self.__upload_real_data_histogram(
            age, list(range(20, 90, 10)), "Age")

        return n_sex, n_state, n_age

    def __upload_real_data_histogram(self, data, bins, title):
        n_data, bins, _ = plt.hist(data, bins=bins, color='#0504aa',
                                   alpha=0.7, rwidth=0.85)
        plt.grid(axis='y', alpha=0.75)
        plt.xlabel(title)
        plt.ylabel('Frequency')
        plt.title(f'True Dataset {title} Distribution')
        self.__run.log_image(
            f'Differential Privacy Noise - True Dataset {title} Distribution', plot=plt)
        plt.clf()

        return n_data

    def __plot(self, ax, data, title, colors, xlabels, legend_names, width=0.2):
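        # Grouped bar chart: for each histogram bin, draw the true count next to the
        # geometric-DP and Laplace-DP counts so the added noise can be compared per bin.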
        positions = [
            [i+width*column for column in range(len(data[0]))] for i in range(len(data))]

        for position, value in zip(positions, data):
            ax.bar(position,
                   value,
                   width,
                   alpha=0.75,
                   color=colors
                   )

        ax.set_title(title)

        ax.set_xticks([p[0] + 1.5 * width for p in positions])
        ax.set_xticklabels(xlabels)

        proxies = [ax.bar([0], [0], width=0, color=c, alpha=0.75)[0]
                   for c in colors]
        ax.legend((proxies), legend_names, loc='upper left')

        ax.set_xlim(positions[0][0]-width, positions[-1][0]+width*len(data[0]))
        ax.set_ylim([0, max(max(l) for l in data)*1.2])

        plt.grid()
        self.__run.log_image(
            f'Differential Privacy - Histograms for {title} Distribution', plot=plt)
        plt.clf()

    def __show_dp_and_real_histogram(self, title, labels, n_data, geometric_histogram, laplace_histogram):
        colorseq = ["forestgreen", "indianred",
                    "orange", "orangered", "orchid"]
        legend = ['True Value', 'DP Geometric', 'DP Laplace']
        fig = plt.figure()
        ax = fig.add_subplot(111)
        data = [n_data, geometric_histogram.value, laplace_histogram.value]
        self.__plot(ax, list(map(list, zip(*data))),
                    title, colorseq, labels, legend)
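
# A minimal entry point sketch (an assumption, not shown in the original listing): these
# argparse-based pipeline steps are usually executed as scripts that delegate to main().
if __name__ == '__main__':
    DifferentialPrivacy().main()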
Exemplo n.º 21
0
for compute in ws.compute_targets:
    print(compute)


# Now let's define an experiment.

# An Experiment is nothing but the process used for running a script. An Experiment consists of many runs and we can track all of them.

# Let's create the Experiment

from azureml.core import Experiment

experiment = Experiment(ws, "my_exp")

new_run = experiment.start_logging()

#my code.

new_run.complete()

# Now you know we can also log all the metrics and save the entire folder or any file.

import numpy as np
import pandas as pd
import json
from azureml.core import Workspace
ws = Workspace.from_config()

exp = Experiment(ws, "my_exp")
run = exp.start_logging()
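
# For example (a minimal sketch, not part of the original snippet; the metric names, values
# and file path below are placeholders):
run.log("my metric", 42)                 # log a single value
run.log_list("my list", [1, 2, 3])       # log a list of values
# run.upload_file("outputs/model.pkl", "outputs/model.pkl")  # upload any file to the run
run.complete()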
Exemplo n.º 22
0
class AzureMLTrainer(trainer.Trainer):
    is_connected: bool = False
    __config_file: str = '.azureml/config.json'
    __workspace: Workspace = None
    __experiment: Experiment = None
    __current_experiment_name: str
    __current_run: Run = None
    __logger: Logger = None
    __vm_size_list: list = None

    def __init__(self, experiment_name: str, aml_workspace: Workspace, aml_run: Run = None):
        '''
        Initializes a new connected Trainer that will persist and log all runs on the AzureML workspace
        Args:
            experiment_name (str): The name of the experiment that will be seen on AzureML
            aml_workspace (Workspace): The connected workspace on AzureML
            aml_run (Run): An optional, already active Run to attach to instead of creating a new Experiment run
        '''
        self.__workspace = aml_workspace
        self.__logger = logging.getLogger()
        if aml_run is not None:
            self.__current_run = aml_run
            self.__experiment = aml_run.experiment
            self.__current_experiment_name = aml_run.experiment.name
        else:
            self.__current_experiment_name = experiment_name
            self.__experiment = Experiment(workspace=self.__workspace, name=experiment_name)


    @classmethod
    def CreateFromContext(cls):
        '''
        Creates a Trainer, based on the current Run context.  This will only work when used in an Estimator
        Returns: 
            AzureMLTrainer: an instance of AzureMLTrainer allowing the user to work connected.
        '''   
        run = Run.get_context()
        return cls(run.experiment.name, run.experiment.workspace, run)


    def new_run(self, description: str = None, copy_folder: bool = True, metrics: dict = None) -> Run:
        '''
        This will begin a new interactive run on the existing AzureML Experiment.  When a previous run was still active, it will be completed.
        Args:
            description (str): An optional description that will be added to the run metadata
            copy_folder (bool): Indicates if the output folder should be snapshotted and persisted
            metrics (dict): The metrics that should be logged in the run already
        Returns:
            Run: the AzureML Run object that can be used for further access and custom logic
        '''
        if(self.__current_run is not None):
            self.__current_run.complete()
        if(copy_folder):
            self.__current_run = self.__experiment.start_logging()
        else:
            self.__current_run = self.__experiment.start_logging(snapshot_directory = None)

        if(metrics is not None):
            for k, v in metrics.items():
                self.__current_run.log(k, v)

        if(description is not None):
            self.__current_run.log('Description', description)
        
        return self.__current_run

    def add_tuning_result(self, run_index: int, train_score: float, test_score: float, sample_count: int, durations:np.array, parameters: dict, estimator):
        '''
        This adds the results of a cross validation fold as a child run of a Grid Search
        Args:
            run_index (int): The index of this cross validation fold within the Grid Search
            train_score (float): The given score of the training data
            test_score (float): The given score of the test data
            sample_count (int): The number of samples that were part of a fold
            durations (np.array): The different durations of the Grid Search
            parameters (dict): The parameter combinations that have been tested in this cross validation fold
            estimator (model): The actual fitted estimator / model that was trained in this fold
        '''
        _child_run = self.__current_run.child_run('Gridsearch' + str(run_index))
        self.__current_run.log_row('Trainscore', score = train_score)
        self.__current_run.log_row('Testscore', score = test_score)

        _table = {
            'Testing score': test_score,
            'Training score': train_score
            }

        for k in parameters.keys():
            v = parameters[k]
            if(v is None):
                v = 'None'
            _child_run.log(k, v)
            _table[k] = v
        
        self.__current_run.log_row('Results', '', **_table)
        _child_run.complete()


    def get_best_model(self, metric_name:str, take_highest:bool = True):
        '''
        Tags and returns the best model of the experiment, based on the given metric
        Args:
            metric_name (str): The name of the metric, such as accuracy
            take_highest (bool): In case of accuracy and score, this is typically True.  In case you want to get the model based on the lowest error, you can use False
        Returns:
            Run: the best run, which will be labeled as best run
        '''
        runs = {}
        run_metrics = {}
        for r in tqdm(self.__experiment.get_runs()):
            metrics = r.get_metrics()
            if metric_name in metrics.keys():
                runs[r.id] = r
                run_metrics[r.id] = metrics
        if take_highest:
            best_run_id = max(run_metrics, key=lambda k: run_metrics[k][metric_name])
        else:
            best_run_id = min(run_metrics, key=lambda k: run_metrics[k][metric_name])
        best_run = runs[best_run_id]
        best_run.tag('Best run')
        return best_run

    def get_azureml_experiment(self):
        '''
        Gives access to the AzureML experiment object
        Returns:
            Experiment: the existing experiment
        '''
        return self.__experiment
        
    def complete_run(self, fitted_model, metrics_to_log: dict = None, upload_model: bool = True):
        '''
        Saves all results of the active Run and completes it
        Args:
            fitted_model (model): The already fitted model to be tested.  Sklearn and Keras models have been tested
            metrics_to_log (dict): The metrics that should be logged with the model to the run
            upload_model (bool): This will upload the model (pkl file or json) to AzureML run (defaults to True)
        '''
        is_keras = 'keras' in str(type(fitted_model))

        if(metrics_to_log is not None):
            for k, v in metrics_to_log.items():
                self._log_metrics(k, v)
        
        if upload_model:
            # Save the model to the outputs directory for capture
            if(is_keras):
                model_folder_name = 'outputs/model'
                fitted_model.save(model_folder_name)
                files_to_upload = dict()
            else:
                model_file_name = 'outputs/model.pkl'
                joblib.dump(value = fitted_model, filename = model_file_name)

        self._complete_run()

    def evaluate_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, save_curves_as_image: bool = False,
                             class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array:

        '''
        Will predict and evaluate a model against a test set and save all results to the active Run on AzureML
        Args:
            fitted_model (model): The already fitted model to be tested.  Sklearn and Keras models have been tested
            X_test (np.array): The test set to calculate the predictions with
            y_test (np.array): The output test set to evaluate the predictions against
            show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier
            save_curves_as_image (bool): This will save the training & loss curves as images
            class_names (np.array): The class names that will be linked to the Confusion Matrix.  If not provided, the unique values of the y_test matrix will be used
            finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True)
            upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True)
            return_predictions (bool): If true, the y_pred values will be returned
        Returns: 
            np.array: The predicted (y_pred) values against the model
        '''
        is_keras = 'keras' in str(type(fitted_model))
        
        # Predict X_test with model
        if(is_keras):
            if 'predict_classes' in dir(fitted_model):
                y_pred = fitted_model.predict_classes(X_test)
            else:
                y_pred = fitted_model.predict(X_test)
                y_pred = np.argmax(y_pred, axis=1)
            self.add_training_plots(fitted_model, save_image=save_curves_as_image)
        else:
            y_pred = fitted_model.predict(X_test)

        if class_names is None:
            class_names = np.char.mod('%d', sorted(np.unique(y_test)))

        # Print classification report
        print(metrics.classification_report(y_test, y_pred))

        # Confusion matrix
        cf = metrics.confusion_matrix(y_test, y_pred)
        self._log_confmatrix(cf, class_names)

        # Accuracy
        accuracy = metrics.accuracy_score(y_test, y_pred) * 100
        self._log_metrics('accuracy', accuracy, description='')

        if(show_roc == True):
            # Verify that we are having a binary classifier
            if(len(class_names)!=2):
                raise AttributeError('Showing a ROC curve is only possible for binary classifier, not for multi class')
            self.__log_roc_curve(y_pred, y_test)

        if (finish_existing_run):
            self.complete_run(fitted_model, upload_model = upload_model)

        if return_predictions:  
            return y_pred

    def add_training_plots(self, fitted_model, metrics=None, save_image: bool = False):
        '''
        Add the training plots to the Run history
        Args:
            fitted_model (Keras model): the fitted model that contains the training history
            metrics (list): the metrics that should be tracked to the run.  If None, all available metrics will be taken
        
        '''
        history = fitted_model.history
        if metrics is None:
            metrics = history.history.keys()

        for metric in metrics:
            if(metric in history.history.keys()):
                self.__current_run.log_table(f'Plot {metric}', {metric: history.history[metric]})

                if(save_image and not metric.startswith('val_') and f'val_{metric}' in history.history.keys()):
                    plt.plot(history.history[metric])
                    plt.plot(history.history[f'val_{metric}'])
                    plt.title(f'model {metric}')
                    plt.ylabel(metric)
                    plt.xlabel('epoch')
                    plt.legend(['train', 'test'], loc='upper left')
                    #plt.show()
                    self.__current_run.log_image(f'model {metric}', plot=plt)
                    plt.close()

    def evaluate_image_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, failed_classifications_to_save: int = 0, image_shape = None, save_curves_as_image: bool = False,
                                class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array:

        '''
        Will predict and evaluate a model against a test set and save all results to the active Run on AzureML
        Args:
            fitted_model (model): The already fitted model to be tested.  Sklearn and Keras models have been tested
            X_test (np.array): The test set to calculate the predictions with
            y_test (np.array): The output test set to evaluate the predictions against
            show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier
            failed_classifications_to_save (int): If greater than 0, this amount of incorrectly classified images will be tracked to the Run
            image_shape ((int, int, int)): Indicates if images should be reshaped before saving them
            class_names (np.array): The class names that will be used in the description.  If not provided, the unique values of the y_test matrix will be used
            finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True)
            upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True)
        Returns: 
            np.array: The predicted (y_pred) values against the model
        ''' 
        from arcus.ml.images import explorer
        
        y_pred = self.evaluate_classifier(fitted_model, X_test, y_test, show_roc=show_roc, save_curves_as_image=save_curves_as_image, class_names= class_names, finish_existing_run=False, upload_model=upload_model, return_predictions=True)
        if failed_classifications_to_save > 0:
            # Take incorrect classified images and save
            import random
            incorrect_predictions = [i for i, item in enumerate(y_pred) if item != y_test[i]]
            total_images = min(len(incorrect_predictions), failed_classifications_to_save)

            for i in random.sample(incorrect_predictions, total_images):
                pred_class = y_pred[i]
                act_class = y_test[i]
                if class_names is not None:
                    pred_class = class_names[pred_class]
                    act_class = class_names[act_class]
                if image_shape is not None:
                    # Reshape image before saving it
                    imgplot = explorer.show_image(X_test[i].reshape(image_shape), silent_mode=True)
                else:
                    imgplot = explorer.show_image(X_test[i], silent_mode=True)
                description = f'Predicted {pred_class} - Actual {act_class}'
                self.__current_run.log_image(description, plot=imgplot)

        if return_predictions:  
            return y_pred




    def __stack_images(self, img1: np.array, img2: np.array):
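        # Stack img2 below img1 on a zero-filled (black) canvas as wide as the wider of the two images.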
        ha,wa = img1.shape[:2]
        hb,wb = img2.shape[:2]
        max_width = np.max([wa, wb])
        total_height = ha+hb
        new_img = np.zeros(shape=(total_height, max_width, 3))
        new_img[:ha,:wa]=img1
        new_img[ha:hb+ha,:wb]=img2
        return new_img

    def __concat_images(self, image_list: np.array) -> np.array:
        output = None
        for i, img in enumerate(image_list):
            if i==0:
                output = img
            else:
                output = self.__stack_images(output, img)
        return output

 

    def save_image_outputs(self, X_test: np.array, y_test: np.array, y_pred: np.array, samples_to_save: int = 1) -> np.array:
        '''
        Will save image outputs to the run
        Args:
            X_test (np.array): The input images for the model
            y_test (np.array): The actual expected output images of the model
            y_pred (np.array): The predicted or calculated output images of the model
            samples_to_save (int): If greater than 0, this amount of input, output and generated image combinations will be tracked to the Run
        ''' 
        from arcus.ml.images import explorer

        if samples_to_save > 0:
            import random
            total_images = min(len(y_pred), samples_to_save)

            for i in random.sample(range(len(y_pred)), total_images):
                newimg = self.__concat_images([X_test[i], y_test[i], y_pred[i]])
                imgplot = explorer.show_image(newimg, silent_mode=True)
                self.__current_run.log_image(f'Image combo sample {i}', plot=imgplot)
                imgplot.close()

    def setup_training(self, training_name: str, overwrite: bool = False):
        '''
        Will initialize a new directory (using the given training_name) and add a training script and requirements file to run training
        Args:
            training_name (str): The name of a training.  This will be used to create a directory.  Can contain subdirectory
            overwrite (bool): Defines if the existing training files should be overwritten
        '''
        if not os.path.exists(training_name):
            os.makedirs(training_name)
        # Take default training script and copy to the new folder
        default_training_script_file = os.path.join(str(os.path.dirname(__file__)), 'resources/train.py')
        default_requirements_file = os.path.join(str(os.path.dirname(__file__)), 'resources/requirements.txt')
        dest_training_script_file = os.path.join(training_name, 'train.py')
        dest_requirements_file = os.path.join(training_name, 'requirements.txt')

        if overwrite or not(os.path.isfile(dest_training_script_file)):
            shutil.copy2(default_training_script_file, training_name)

        if overwrite or not(os.path.isfile(dest_requirements_file)):
            shutil.copy2(default_requirements_file, training_name)
        
    def start_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, 
                        input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, 
                        script_parameters: dict = None, show_widget: bool = True, use_estimator: bool = False, **kwargs):
        ''' 
        Will start a new training, taking the training name as the folder of the run
        Args:
            training_name (str): The name of a training.  This will be used to create a directory.  Can contain subdirectory
            environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch).  
            input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Will display the live tracking of the submitted Run
            use_estimator (bool): Indicates if the training should be submitted with an Estimator instead of a ScriptRunConfig
        Returns:
            Run : the submitted run
        '''
        
        if use_estimator:
            print('Scheduling Estimator training')
            self._start_estimator_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs)
        else:
            print('Scheduling ScriptRunConfig training')
            self._start_environment_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs)
        
        if script_parameters is not None:
            for arg in script_parameters.keys():
                self.__current_run.log(arg.replace('--', ''), script_parameters[arg])

        print(self.__current_run.get_portal_url())

        if(show_widget):
            from azureml.widgets import RunDetails
            RunDetails(self.__current_run).show()
        return self.__current_run

    def _start_environment_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, 
                                    input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, 
                                    script_parameters: dict = None, show_widget: bool = True, **kwargs):
        ''' 
        Will start a new training using ScriptRunConfig, taking the training name as the folder of the run
        Args:
            training_name (str): The name of a training.  This will be used to create a directory.  Can contain subdirectory
            environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch).  
            input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Will display the live tracking of the submitted Run
        '''
        from azureml.train.estimator import Estimator
        from azureml.core import Environment, ScriptRunConfig
        from azureml.core.runconfig import RunConfiguration
        from azureml.core.runconfig import DataReferenceConfiguration
        from azureml.core.runconfig import CondaDependencies
        from arcus.azureml.experimenting import train_environment as te

        # Check if directory exists
        if not(os.path.exists(training_name) and os.path.isdir(training_name)):
            raise FileNotFoundError(training_name)

        # Check compute target
        if compute_target != 'local':
            self.__check_compute_target(compute_target, gpu_compute)

        training_env = te.get_training_environment(self.__workspace, training_name, os.path.join(training_name, 'requirements.txt'), use_gpu=gpu_compute, include_prerelease=True, environment_type=environment_type)
        runconfig = RunConfiguration()

        # Add datasets
        datarefs = dict()
        
        scriptargs = list()
        if script_parameters is not None:
            for key in script_parameters.keys():
                scriptargs.append(key)
                scriptargs.append(script_parameters[key])

        if(input_datasets is not None):
            for ds in input_datasets:
                print(f'Adding mounting data reference for dataset {ds}')
                # scriptargs.append(ds)
                scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute = ds))
#                datastore, path = self._get_data_reference(self.__workspace.datasets[ds])
#                datarefs[ds] = DataReferenceConfiguration(datastore_name=datastore, path_on_datastore = path, path_on_compute = '/' + ds, mode = 'mount', overwrite = False)
        if(input_datasets_to_download is not None):
            for ds in input_datasets_to_download:
                print(f'Adding download data reference for dataset {ds}')
                # scriptargs.append(ds)
                scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute = ds))



        scriptrunconfig = ScriptRunConfig(source_directory='./' + training_name, script="train.py", run_config=runconfig, 
                                            arguments=scriptargs)
        scriptrunconfig.run_config.target = compute_target
        scriptrunconfig.run_config.environment = training_env
        #scriptrunconfig.run_config.data_references = datarefs

        # Submit training
        self.__current_run = self.__experiment.submit(scriptrunconfig)
        


    def _get_data_reference(self, dataset: Dataset):
        import json
        j = json.loads(str(dataset).replace('FileDataset\n', ''))
        source = j['source'][0]
        sections = source.split("'")
        return sections[1], sections[3]

    def _start_estimator_training(self, training_name: str, estimator_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs):
        ''' 
        Will start a new training using an Estimator, taking the training name as the folder of the run
        Args:
            training_name (str): The name of a training.  This will be used to create a directory.  Can contain subdirectory
            estimator_type (str): one of these values (tensorflow, sklearn, pytorch).
            input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Will display the live tracking of the submitted Run
        '''
        from azureml.train.estimator import Estimator

        # Check if directory exists
        if not(os.path.exists(training_name) and os.path.isdir(training_name)):
            raise FileNotFoundError(training_name)

        # Check compute target
        if compute_target != 'local':
            self.__check_compute_target(compute_target, gpu_compute)
            

        # Add datasets
        datasets = list()
        if(input_datasets is not None):
            for ds in input_datasets:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds))
        if(input_datasets_to_download is not None):
            for ds in input_datasets_to_download:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds))

        # as mount - as download
        constructor_parameters = {
            'source_directory':training_name,
            'script_params':script_parameters,
            'inputs':datasets,
            'compute_target':compute_target,
            'entry_script':'train.py',
            'pip_requirements_file':'requirements.txt', 
            'use_gpu':gpu_compute,
            'use_docker':True}
        
        print('Creating estimator of type', estimator_type)

        if(estimator_type is None):
            # Using default Estimator
            estimator = Estimator(**constructor_parameters)
        elif(estimator_type == 'tensorflow'):
            from azureml.train.dnn import TensorFlow
            version_par = 'framework_version'
            if(not version_par in constructor_parameters.keys()):
                print('Defaulting to version 2.0 for TensorFlow')
                constructor_parameters[version_par] = '2.0'
            estimator = TensorFlow(**constructor_parameters)
        elif(estimator_type == 'sklearn'):
            from azureml.train.sklearn import SKLearn
            estimator = SKLearn(**constructor_parameters)
        elif(estimator_type == 'pytorch'):
            from azureml.train.dnn import PyTorch
            estimator = PyTorch(**constructor_parameters)

        # Submit training
        self.__current_run = self.__experiment.submit(estimator)

    # protected implementation methods
    def _log_metrics(self, metric_name: str, metric_value: float, description:str = None):
        print(metric_name, metric_value) 

        self.__current_run.log(metric_name, metric_value, description=description)

    
    def _complete_run(self):
        '''
        Completes the current run
        '''
        self.__current_run.complete()

    def _log_confmatrix(self, confusion_matrix: np.array, class_names: np.array):
        data = {}
        data['schema_type'] = 'confusion_matrix'
        data['schema_version'] = 'v1'
        data['data'] = {}
        data['data']['class_labels'] = class_names.tolist()
        data['data']['matrix'] = confusion_matrix.tolist()
        
        print(confusion_matrix)

        json_data = json.dumps(data)
        self.__current_run.log_confusion_matrix('Confusion matrix', json_data, description='')

    def _save_roc_curve(self, roc_auc: float, roc_plot: plt):
        self._log_metrics('roc_auc', roc_auc)
        self.__current_run.log_image('ROC Curve', plot=roc_plot)

    def __check_compute_target(self, compute_target, use_gpu: bool):
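        # Resolve the VM size of the requested compute target and verify it offers the required CPU/GPU capability.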
        __vm_size = ''
        if isinstance(compute_target, AmlCompute):
            __vm_size = compute_target.vm_size
        elif isinstance(compute_target, str):
            compute = ComputeTarget(workspace=self.__workspace, name=compute_target)
            __vm_size = compute.vm_size

        if self.__vm_size_list is None:
            self.__vm_size_list = AmlCompute.supported_vmsizes(self.__workspace)
        
        vm_description = list(filter(lambda vmsize: str.upper(vmsize['name']) == str.upper(__vm_size), self.__vm_size_list))[0]
        if(use_gpu and vm_description['gpus'] == 0):
            raise errors.TrainingComputeException(f'gpu_compute was specified, but the target does not have GPUs: {vm_description} ')
        if(not (use_gpu) and vm_description['vCPUs'] == 0):
            raise errors.TrainingComputeException(f'cpu_compute was specified, but the target does not have CPUs: {vm_description} ')


    def __log_roc_curve(self, y_pred: np.array, y_test: np.array):
        '''Will upload the Receiver Operating Characteristic (ROC) Curve for binary classifiers

        Args:
            y_pred (np.array): The predicted values of the test set 
            y_test (np.array): The actual outputs of the test set

        Returns: 
            float: The ROC_AUC value
        '''
        # calculate the fpr and tpr for all thresholds of the classification
        fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred)
        roc_auc = metrics.auc(fpr, tpr)
        plt.cla()
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        plt.legend(loc = 'lower right')
        plt.plot([0, 1], [0, 1],'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        self._save_roc_curve(roc_auc, plt)
        plt.show(block=False)
        plt.close()
        return roc_auc
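
# A minimal usage sketch (an assumption, not part of the original listing). It presumes a local
# AzureML config file plus a fitted scikit-learn model `my_model` and test arrays `X_test` / `y_test`.
from azureml.core import Workspace

ws = Workspace.from_config()
trainer = AzureMLTrainer('my-experiment', ws)
run = trainer.new_run(description='baseline run', metrics={'n_samples': len(X_test)})
trainer.evaluate_classifier(my_model, X_test, y_test, show_roc=False, upload_model=True)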
Exemplo n.º 23
0
     tags = {
         "mlflow.source.type": "JOB",
         "mlflow.source.name": "train.py",
         "mlflow.user": "******"
     }
     run.set_tags(tags)
     # log environment variables
     env_dictionary["MLFLOW_EXPERIMENT_ID"] = exp._id
     env_dictionary["MLFLOW_RUN_ID"] = run_id
     env_dictionary["MLFLOW_TRACKING_URI"] = _get_mlflow_tracking_uri(ws)
     env_dictionary["HOME"] = "~/"
 else:
     # start run
     ws = get_ws()
     exp = Experiment(workspace=ws, name=experiment_name)
     run = exp.start_logging(snapshot_directory="/scripts")
     run.child_run(name=run_name)  # TODO: add the step's name
     tags = {
         "mlflow.source.type": "JOB",
         "mlflow.source.name": "train.py",
         "mlflow.user": "******"
     }
     run.set_tags(tags)
     job_info_dict = {
         "run_id": run._run_id,
         "experiment_name": exp.name,
         "experiment_id": exp._id
     }
     json_dict = json.dumps(job_info_dict)
     with open(job_info_path, "w") as f:
         f.write(json_dict)
Exemplo n.º 24
0
class ExploratoryAnalysis():
    def __init__(self):
        self.__parser = argparse.ArgumentParser("preprocessing")
        self.__parser.add_argument("--datastore",
                                   type=str,
                                   help="Name of the datastore",
                                   default="workspaceblobstore")
        self.__parser.add_argument("--dataset_name",
                                   type=str,
                                   help="Name of the dataset")
        self.__parser.add_argument("--dataset_preprocessed_name",
                                   type=str,
                                   help="Standard preprocessed dataset")
        self.__parser.add_argument("--output_preprocess_dataset",
                                   type=str,
                                   help="Name of the PipelineData reference")
        self.__parser.add_argument(
            "--use_datadrift",
            type=distutils.util.strtobool,
            help="Use datadrift (True/False). If true, we split the original dataset by sex"
        )
        self.__parser.add_argument("--retrain_status",
                                   type=distutils.util.strtobool,
                                   help="Retrain status")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'exploratory_analysis')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

        self.__datastore = Datastore.get(self.__ws,
                                         datastore_name=self.__args.datastore)

    def main(self):
        df = self.__preprocess_dataset(schema_path="./schema_dataset.json")
        if not self.__args.retrain_status:
            self.__make_exploratory_analysis(df)
        else:
            self.__run.add_properties({
                'status':
                "The following step has been skipped because a retraining pipeline has been launched"
            })
        self.__upload_datasets(df, df.columns)

    def __preprocess_dataset(self, schema_path):
        with open(schema_path) as f:
            schema = json.load(f)

        df = self.__get_dataset(self.__args.dataset_name).to_pandas_dataframe()

        df = df.drop([
            'address', 'city', 'state', 'postalCode', 'name', 'ssn',
            'observation'
        ],
                     axis=1)

        columns_names = schema.keys()
        df.columns = columns_names

        return df

    def __make_exploratory_analysis(self, df):
        self.__get_profiling(df)
        self.__generate_count_target_plot(df)
        self.__count_target_variable(df)
        self.__generate_counts_with_target(df)

        self.__relation_plot("age", df)
        self.__relation_plot("cholesterol", df)
        self.__relation_plot("st_slope", df)
        self.__relation_plot("num_major_vessels", df)

        plt.rcParams['figure.figsize'] = (15, 5)
        sns.distplot(df['age'])
        plt.title('Distribution of Age', fontsize=20)
        self.__run.log_image('Distribution of Age', plot=plt)

        self.__count_sex_variable(df)

        size = df['sex'].value_counts()
        colors = ['lightblue', 'lightgreen']
        labels = "Male", "Female"
        explode = [0, 0.01]

        my_circle = plt.Circle((0, 0), 0.7, color='white')

        plt.rcParams['figure.figsize'] = (9, 9)
        plt.pie(size,
                colors=colors,
                labels=labels,
                shadow=True,
                explode=explode,
                autopct='%.2f')
        plt.title('Distribution of Gender', fontsize=20)
        p = plt.gcf()
        p.gca().add_artist(my_circle)
        plt.legend()
        self.__run.log_image('Distribution of Gender', plot=plt)

        self.__generate_frequency_plot(df)

        plt.scatter(x=df.age[df.target == 1],
                    y=df.max_heart_rate_achieved[(df.target == 1)])
        plt.scatter(x=df.age[df.target == 0],
                    y=df.max_heart_rate_achieved[(df.target == 0)])
        plt.legend(["Disease", "Not Disease"])
        plt.xlabel("Age")
        plt.ylabel("Maximum Heart Rate")
        self.__run.log_image('Disease/Not Disease', plot=plt)

        self.__get_outliers(df)
        self.__get_correlation_matrix(df)
        self.__get_mutual_info(df)
        self.__get_principal_components_analysis(df)

    def __get_dataset(self, dataset_name):
        return self.__ws.datasets.get(dataset_name)

    def __upload_datasets(self, df, columns):
        if self.__args.use_datadrift:
            splitted_datasets = self.__split_dataset(df)

            for dataset_type in splitted_datasets:
                dataset_name, preprocess_filepath, datastore_path = self.__get_dataset_metadata(
                    splitted_datasets[dataset_type], dataset_type)

                self.__upload_dataset(self.__ws,
                                      self.__datastore,
                                      dataset_name,
                                      datastore_path,
                                      preprocess_filepath,
                                      use_datadrift=True,
                                      type_dataset=dataset_type)
        else:
            dataset_name, preprocess_filepath, datastore_path = self.__get_dataset_metadata(
                df, "train")
            self.__upload_dataset(self.__ws,
                                  self.__datastore,
                                  dataset_name,
                                  datastore_path,
                                  preprocess_filepath,
                                  use_datadrift=False,
                                  type_dataset="standard")

    def __split_dataset(self, df):
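        # Data-drift scenario: male records (sex == 1, with target) form the training set and
        # female records (sex == 0, without target) form a separate inference set.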
        df_female = df.drop(['target'], axis=1)
        df_female = df_female.loc[df_female['sex'] == 0]
        df_male = df.loc[df['sex'] == 1]

        return {"train": df_male, "inference": df_female}

    def __get_dataset_metadata(self, df, extension):
        dataset_name = f'{self.__args.dataset_preprocessed_name}_{extension}'
        output_preprocessed_directory = self.__args.output_preprocess_dataset if extension == "train" else f'{self.__args.output_preprocess_dataset}_{extension}'
        preprocess_filepath = os.path.join(output_preprocessed_directory,
                                           f'{dataset_name}.csv')
        datastore_path = f"heart-disease/{dataset_name}.csv"

        os.makedirs(output_preprocessed_directory, exist_ok=True)
        df.to_csv(preprocess_filepath, index=False)

        return dataset_name, preprocess_filepath, datastore_path

    def __upload_dataset(self, ws, def_blob_store, dataset_name,
                         datastore_path, filepath, use_datadrift,
                         type_dataset):
        def_blob_store.upload_files([filepath],
                                    target_path="heart-disease",
                                    overwrite=True)
        tab_data_set = Dataset.Tabular.from_delimited_files(
            path=(def_blob_store, datastore_path))
        try:
            tab_data_set.register(workspace=ws,
                                  name=f'{dataset_name}',
                                  description=f'{dataset_name} data',
                                  tags={
                                      'format': 'CSV',
                                      'use_datadrift': use_datadrift,
                                      'type_dataset': type_dataset
                                  },
                                  create_new_version=True)
        except Exception as ex:
            print(ex)

    def __get_profiling(self, df):
        profile = ProfileReport(
            df, title="Exploratory Analysis Report - Heart Disease")
        profile.to_file("heart-disease-report.html")
        self.__run.upload_file("heart-disease-report.html",
                               "heart-disease-report.html")

    def __generate_count_target_plot(self, df):
        plt.figure(figsize=(20, 10))
        df["target"].value_counts().plot.bar(figsize=(20, 10))
        self.__run.log_image(f'Count target', plot=plt)

    def __count_target_variable(self, df):
        countNoDisease = len(df[df.target == 0])
        countHaveDisease = len(df[df.target == 1])
        self.__run.log(
            'Percentage of Havent Heart Disease', "{:.2f}%".format(
                (countNoDisease / (len(df.target)) * 100)))
        self.__run.log(
            'Percentage of Have Heart Disease', "{:.2f}%".format(
                (countHaveDisease / (len(df.target)) * 100)))

    def __generate_counts_with_target(self, df):
        columns = [
            'fasting_blood_sugar', 'exercise_induced_angina', 'rest_ecg'
        ]
        for column in columns:
            plt.figure(figsize=(20, 10))
            sns.catplot(x="target", col=column, kind="count", data=df)
            self.__run.log_image(f'Count {column} over target', plot=plt)

    def __count_sex_variable(self, df):
        countFemale = len(df[df.sex == 0])
        countMale = len(df[df.sex == 1])
        self.__run.log('Percentage of Female Patients', "{:.2f}%".format(
            (countFemale / (len(df.sex)) * 100)))
        self.__run.log('Percentage of Male Patients', "{:.2f}%".format(
            (countMale / (len(df.sex)) * 100)))

    def __generate_frequency_plot(self, df):
        columns = [
            'age', 'sex', 'st_slope', 'fasting_blood_sugar', 'chest_pain_type'
        ]
        for column in columns:
            pd.crosstab(df[column], df.target).plot(kind="bar",
                                                    figsize=(20, 6))
            plt.title(f'Heart Disease Frequency for {column}')
            plt.xlabel(column)
            plt.xticks(rotation=0)
            plt.legend(["Haven't Disease", "Have Disease"])
            plt.ylabel('Frequency of Disease or Not')
            self.__run.log_image(f'Heart Disease Frequency for {column}',
                                 plot=plt)

    def __get_outliers(self, df):
        outliers_columns = [
            'age', 'resting_blood_pressure', 'cholesterol',
            'max_heart_rate_achieved', 'st_depression'
        ]
        for column in outliers_columns:
            f, ax = plt.subplots(figsize=(8, 6))
            sns.boxplot(x=df[column])
            self.__run.log_image(column, plot=plt)

    def __relation_plot(self, attribute, df):
        plt.rcParams['figure.figsize'] = (12, 9)
        sns.violinplot(x=df["target"],
                       y=df[attribute],
                       data=df,
                       palette="muted")
        plt.title(f'Relation of target with {attribute}',
                  fontsize=20,
                  fontweight=30)
        self.__run.log_image(f'Relation of target with {attribute}', plot=plt)

    def __get_mutual_info(self, df):
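        # Estimate the mutual information between each feature and the target on a train split,
        # plot the scores and log the 10 best features chosen with SelectKBest.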
        X = df.drop(['target'], axis=1)
        y = df['target']
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=123)
        X_train.shape, y_train.shape, X_test.shape, y_test.shape

        mutual_info = mutual_info_classif(X_train.fillna(0), y_train)

        mi_series = pd.Series(mutual_info)
        mi_series.index = X_train.columns
        mi_series.sort_values(ascending=False)
        plt.figure(figsize=(20, 10))
        mi_series.sort_values(ascending=False).plot.bar(figsize=(20, 8))
        self.__run.log_image('Mutual Information features scores', plot=plt)

        k_best_features = SelectKBest(mutual_info_classif,
                                      k=10).fit(X_train.fillna(0), y_train)
        self.__run.log_list('Selected top 10 features',
                            list(X_train.columns[k_best_features.get_support()]))

    def __get_correlation_matrix(self, df):
        plt.rcParams['figure.figsize'] = (20, 15)
        plt.style.use('ggplot')

        sns.heatmap(df.corr(), annot=True)
        plt.title('Correlation Matrix', fontsize=20)
        self.__run.log_image('Correlation Matrix', plot=plt)

    def __get_principal_components_analysis(self, df):
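        # Fit a 5-component PCA to inspect the cumulative explained variance, then project onto the
        # first two components for a 2D scatter plot coloured by target class.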
        x_data = df.drop(['target'], axis=1)
        y = df.target.values

        pca_exp = PCA(n_components=5)
        pca_exp.fit_transform(x_data)

        plt.figure(figsize=(10, 10))
        plt.plot(np.cumsum(pca_exp.explained_variance_ratio_), 'ro-')
        plt.grid()
        self.__run.log_image('Explained_variance_ratio', plot=plt)

        pca = PCA(n_components=2)
        principalComponents = pca.fit_transform(x_data)

        self.__run.log('Total PCA Components', pca.n_components_)
        self.__run.log('Total explained variance',
                       round(pca.explained_variance_ratio_.sum(), 5))

        principal_df = pd.DataFrame(
            data=principalComponents,
            columns=['principal component 1', 'principal component 2'])

        plt.figure()
        plt.figure(figsize=(10, 10))
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=14)
        plt.xlabel('Principal Component - 1', fontsize=20)
        plt.ylabel('Principal Component - 2', fontsize=20)
        plt.title("Principal Component Analysis of Heart Disease Dataset",
                  fontsize=20)
        targets = [0, 1]
        colors = ['r', 'g']
        for target, color in zip(targets, colors):
            indicesToKeep = df['target'] == target
            plt.scatter(principal_df.loc[indicesToKeep,
                                         'principal component 1'],
                        principal_df.loc[indicesToKeep,
                                         'principal component 2'],
                        c=color,
                        s=50)

        plt.legend(targets, prop={'size': 15})
        self.__run.log_image(
            'Principal Component Analysis of Heart Disease Dataset', plot=plt)
Exemplo n.º 25
0
import os, json
from azureml.core import Experiment, Workspace
from azureml.core.authentication import ServicePrincipalAuthentication

root_dir = os.path.abspath(__file__ + "/../../../")
script_dir = os.path.join(root_dir, "aml_config/config.json")

with open(script_dir) as f:
    config = json.load(f)

workspace_name = config['workspace_name']
resource_group = config['resource_group']
subscription_id = config['subscription_id']

ws = Workspace.get(name=workspace_name,
                   subscription_id=subscription_id,
                   resource_group=resource_group)

exp = Experiment(workspace=ws, name="trainpipeline")

print(exp.name, exp.workspace.name, sep='\n')

run = exp.start_logging()

run.log('my magic number', 45)

run.complete()
Exemplo n.º 26
0
class DetectFairness():
    def __init__(self):
        self.__parser = argparse.ArgumentParser("fairlearn")
        self.__parser.add_argument("--fitted_model_name",
                                   type=str,
                                   default="heart_disease_model_automl",
                                   help="Name of fitted model")
        self.__parser.add_argument("--model_data",
                                   type=str,
                                   help="Path of the model")
        self.__parser.add_argument("--dataset_name",
                                   type=str,
                                   default="heart_disease_preprocessed_train",
                                   help="Name of the dataset")
        self.__parser.add_argument("--output_fairness_dict",
                                   type=str,
                                   help="Name of the dataset")

        self.__args = self.__parser.parse_args()
        self.__run = Run.get_context()
        self.__local_run = type(self.__run) == _OfflineRun

        if self.__local_run:
            self.__ws = Workspace.from_config('../../notebooks-settings')
            self.__exp = Experiment(self.__ws, 'fairlearn')
            self.__run = self.__exp.start_logging()
        else:
            self.__ws = self.__run.experiment.workspace
            self.__exp = self.__run.experiment

        self.__sensitive_features = ['asthmatic', 'diabetic', 'smoker']

    def main(self):
        dataset = self.__get_dataset(self.__args.dataset_name)
        model = self.__load_model()
        df = dataset.to_pandas_dataframe()

        X_raw, Y, A, X = self.__transform_df(df)
        X_train, X_test, Y_train, Y_test, A_train, A_test = self.__df_train_split(
            X_raw, Y, A, X)

        Y_pred = model.predict(X_test)

        content = {
            "Y_pred": Y_pred,
            "Y_test": Y_test,
            "A_test": A_test,
            "model_id": Model(self.__ws, self.__args.fitted_model_name).id
        }

        self.__set_fairlearn_dict_as_pipeline_output(content)

    def __get_dataset(self, dataset_name):
        return self.__ws.datasets.get(dataset_name)

    def __load_model(self):
        Model(self.__ws, self.__args.fitted_model_name).download(".")
        with open(self.__args.model_data, "rb") as f:
            return joblib.load(f)

    def __transform_df(self, df):
        X_raw = df.drop(['target'], axis=1)
        Y = df['target']

        A = X_raw[self.__sensitive_features]
        X = X_raw.drop(labels=self.__sensitive_features, axis=1)

        return X_raw, Y, A, X

    def __df_train_split(self, X_raw, Y, A, X):
        X_train, X_test, Y_train, Y_test, A_train, A_test = train_test_split(
            X_raw, Y, A, test_size=0.3, random_state=123, stratify=Y)
        X_train = X_train.reset_index(drop=True)
        A_train = A_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        A_test = A_test.reset_index(drop=True)

        # Replace the binary flags with readable labels; index the frame with
        # .loc directly to avoid pandas chained-assignment issues.
        A_test.loc[A_test['diabetic'] == 0, 'diabetic'] = 'not diabetic'
        A_test.loc[A_test['diabetic'] == 1, 'diabetic'] = 'diabetic'

        A_test.loc[A_test['asthmatic'] == 0, 'asthmatic'] = 'not asthmatic'
        A_test.loc[A_test['asthmatic'] == 1, 'asthmatic'] = 'asthmatic'

        A_test.loc[A_test['smoker'] == 0, 'smoker'] = 'not smoker'
        A_test.loc[A_test['smoker'] == 1, 'smoker'] = 'smoker'

        return X_train, X_test, Y_train, Y_test, A_train, A_test

    def __set_fairlearn_dict_as_pipeline_output(self, content):
        os.makedirs(self.__args.output_fairness_dict, exist_ok=True)
        fairlearn_dict_path = os.path.join(self.__args.output_fairness_dict,
                                           'fairlean_predictions_values.pkl')
        joblib.dump(value=content, filename=fairlearn_dict_path)
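The pickled dictionary produced by this step can also be inspected locally with fairlearn's MetricFrame before any dashboard is built; a minimal sketch, assuming the .pkl written above is available on disk (the output directory used below is an assumption).

# Hedged sketch: load the pickled predictions and compute per-group accuracy.
import joblib
from fairlearn.metrics import MetricFrame
from sklearn.metrics import accuracy_score

# The directory is an assumption; the file name matches the dump above.
content = joblib.load("outputs/fairness/fairlean_predictions_values.pkl")

metric_frame = MetricFrame(metrics=accuracy_score,
                           y_true=content["Y_test"],
                           y_pred=content["Y_pred"],
                           sensitive_features=content["A_test"])
print(metric_frame.overall)    # overall accuracy
print(metric_frame.by_group)   # accuracy per sensitive-feature group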
Example No. 27
0
import glob
import os
import pickle
import random

from azureml.core import Experiment, Run, Workspace
from preprocessing import preprocess_depthmap, preprocess_targets

# Get the current run.
run = Run.get_context()

# Offline run. Download the sample dataset and run locally. Still push results to Azure.
if run.id.startswith("OfflineRun"):
    print("Running in offline mode...")

    # Access workspace.
    print("Accessing workspace...")
    workspace = Workspace.from_config()
    experiment = Experiment(workspace, "s4-cnndepthmap-height-offline")
    run = experiment.start_logging(outputs=None, snapshot_directory=".")

    # Get dataset.
    print("Accessing dataset...")
    if os.path.exists("premiumfileshare") == False:
        assert False, "Requires small size dataset"
        dataset_name = "cgmmldevpremium-SampleDataset-Example"
        dataset = workspace.datasets[dataset_name]
        dataset.download(target_path='.', overwrite=False)
    dataset_path = glob.glob(os.path.join("premiumfileshare"))[0]

# Online run. Use dataset provided by training notebook.
else:
    print("Running in online mode...")
    experiment = run.experiment
    workspace = experiment.workspace
Example No. 28
0
    # Make experiment reproducible
    tf.random.set_seed(EVAL_CONFIG.SPLIT_SEED)
    random.seed(EVAL_CONFIG.SPLIT_SEED)

    # Get the current run.
    run = Run.get_context()

    # Offline run. Download the sample dataset and run locally. Still push results to Azure.
    if run.id.startswith("OfflineRun"):
        print("Running in offline mode...")

        # Access workspace.
        print("Accessing workspace...")
        workspace = Workspace.from_config()
        experiment = Experiment(workspace, EVAL_CONFIG.EXPERIMENT_NAME)
        run = experiment.start_logging(outputs=None, snapshot_directory=".")

        # Get dataset.
        print("Accessing dataset...")
        dataset_name = DATA_CONFIG.NAME
        dataset_path = str(REPO_DIR / "data" / dataset_name)
        if not os.path.exists(dataset_path):
            dataset = workspace.datasets[dataset_name]
            dataset.download(target_path=dataset_path, overwrite=False)

    # Online run. Use dataset provided by training notebook.
    else:
        print("Running in online mode...")
        experiment = run.experiment
        workspace = experiment.workspace
        dataset_path = run.input_datasets["dataset"]
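For run.input_datasets["dataset"] to be populated in the online branch, the submitting notebook has to attach the dataset as a named input. A minimal sketch using ScriptRunConfig; the dataset, script and experiment names are assumptions, not taken from the original training notebook.

from azureml.core import Dataset, Experiment, ScriptRunConfig, Workspace

ws = Workspace.from_config()
# Dataset, script and experiment names below are illustrative assumptions.
dataset = Dataset.get_by_name(ws, name="anon-depthmap-testset")

config = ScriptRunConfig(
    source_directory=".",
    script="evaluate.py",
    # The input name "dataset" is what run.input_datasets["dataset"] resolves to.
    arguments=[dataset.as_named_input("dataset").as_mount()])

run = Experiment(ws, "cnndepthmap-evaluation").submit(config)
run.wait_for_completion(show_output=True)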