Example #1
def auto_train_model(ws, experiment_name, model_name, full_X, full_Y, training_set_percentage, training_target_accuracy):

    # start a training run by defining an experiment
    experiment = Experiment(ws, experiment_name)
    
    train_X, test_X, train_Y, test_Y = train_test_split(full_X, full_Y, train_size=training_set_percentage, random_state=42)

    train_Y_array = train_Y.values.flatten()

    # Configure the automated ML job
    # The model training is configured to run on the local machine
    # The values for all settings are documented at https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train
    # Notice we no longer have to scale the input values, as Auto ML will try various data scaling approaches automatically
    Automl_config = AutoMLConfig(task = 'classification',
                                 primary_metric = 'accuracy',
                                 max_time_sec = 12000,
                                 iterations = 20,
                                 n_cross_validations = 3,
                                 exit_score = training_target_accuracy,
                                 blacklist_algos = ['kNN','LinearSVM'],
                                 X = train_X,
                                 y = train_Y_array,
                                 path='./04-automl/outputs')

    # Execute the job
    run = experiment.submit(Automl_config, show_output=True)

    # Get the run with the highest accuracy value.
    best_run, best_model = run.get_output()

    return (best_model, run, best_run)
Example #2
def RunAutoML():   
    automl_settings = {
        "name": "AutoML_Demo_Experiment",
        "iteration_timeout_minutes": 15,
        "iterations": 3,
        "n_cross_validations": 5,
        # Note: r2_score is a regression metric; for task="classification" a metric
        # such as 'accuracy' or 'AUC_weighted' would normally be used.
        "primary_metric": 'r2_score',
        # "preprocess" is passed explicitly to AutoMLConfig below, so it is omitted
        # here to avoid a duplicate keyword argument error.
        "max_concurrent_iterations": 8,
        "verbosity": logging.INFO
    }
    subscription_id = request.json['subscription_id']
    print(userData)
    print(userData[subscription_id])
    #return "ok"
    try:
        automl_config = AutoMLConfig(task="classification",
                        X=userData[subscription_id][1],
                        y=userData[subscription_id][2],
                        debug_log='automl_errors.log',
                        preprocess=True,
                        **automl_settings,
                        )
        experiment=Experiment(userData[subscription_id][0], 'automl_remote')
        run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = run.get_output()

        return 'ok'
    except:
        return 'error'  
def RunAutoML():
        subscription_id = request.json['subscription_id']
        resource_group = request.json['resource_group']
        workspace_name = request.json['workspace_name']
        file_name = request.json['file_name']
        #location = request.json['location']
    
        ws = Workspace(subscription_id=subscription_id,
                                  resource_group=resource_group,
                                  workspace_name=workspace_name)
                                            
        print("Found workspace {} at location {}".format(ws.name, ws.location))
        print('Found existing Workspace.')
            
        dataset_name = file_name

        # Get a dataset by name
        df = Dataset.get_by_name(workspace=ws, name=dataset_name)
        stock_dataset_df = df.to_pandas_dataframe()
        print('file successfully received.')
        stock_dataset_df.head()
        #stock_dataset_json = stock_dataset_df.to_json(orient='split')
        #print(stock_dataset_json)
        y_df = stock_dataset_df['ActionTaken'].values
        x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)
        
        ExperimentName = request.json['ExperimentName']       
        tasks = request.json['tasks']
        iterations = request.json['iterations']
        iteration_timeout_minutes = request.json['iteration_timeout_minutes']
        primary_metric = request.json['primary_metric']
        
        #n_cross_validations = request.json['n_cross_validations']
        
        try:
            automl_config = AutoMLConfig(
                task=tasks,
                X=x_df,
                y=y_df,
                iterations=iterations,
                iteration_timeout_minutes=iteration_timeout_minutes,
                primary_metric=primary_metric,
                #n_cross_validations=n_cross_validations,
                preprocess=True,
                )
            experiment = Experiment(ws, ExperimentName)
            run = experiment.submit(config=automl_config, show_output=True)
    
            best_run, fitted_model = run.get_output()

            return 'ok'
        except:

            return 'error'
def main(train_path, pred_path, n_pred, dt, target, time_limit_min):
    df_train = pd.read_csv(train_path)
    df_train[dt] = pd.to_datetime(df_train[dt])

    time_series_settings = {
        "time_column_name": dt,
        "max_horizon": n_pred,
        "target_lags": "auto",
        "target_rolling_window_size": "auto"
    }
    automl_config = AutoMLConfig(task="forecasting",
                                 training_data=df_train,
                                 label_column_name=target,
                                 n_cross_validations=5,
                                 max_cores_per_iteration=-1,
                                 path=os.environ["SCRATCH"],
                                 experiment_timeout_minutes=time_limit_min,
                                 ensemble_download_models_timeout_sec=3600,
                                 **time_series_settings)
    ws = Workspace.from_config()
    experiment = Experiment(ws, "experiment")
    best_run, fitted_model = experiment.submit(automl_config,
                                               show_output=True).get_output()

    print("Best pipeline:")
    try:
        ensemble = vars(fitted_model.steps[1][1])["_wrappedEnsemble"]
        print(ensemble.__class__)
        steps = ensemble.estimators_
    except:
        steps = fitted_model.steps
    best_pipeline = ""
    for i, step in enumerate(steps):
        best_pipeline += f"{i}. {str(step)}\n"
    print(best_pipeline)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)
    print(fitted_model.named_steps["timeseriestransformer"].
          get_engineered_feature_names())
    featurization_summary = fitted_model.named_steps[
        "timeseriestransformer"].get_featurization_summary()
    print(pd.DataFrame.from_records(featurization_summary))

    x_pred = pd.date_range(df_train[dt].iloc[-1],
                           periods=n_pred + 1,
                           freq=pd.infer_freq(df_train[dt]))[1:]
    y_pred = fitted_model.forecast(forecast_destination=x_pred[-1])[0]
    #     y_pred = fitted_model.forecast(pd.DataFrame({dt: x_pred}))[0]

    df_pred = pd.DataFrame({dt: x_pred, target: y_pred})
    df_pred.to_csv(pred_path, index=False)
def train_model(data_file, random_seed):
    """Train the automl model."""
    target = "utilization"
    df = pd.read_parquet(data_file)

    x = df.loc[:, [c for c in df if c != target]].values
    y = df[target].values
    project_folder = "./automl"

    automl_config = AutoMLConfig(
        task="regression",
        iteration_timeout_minutes=5,
        iterations=10,
        primary_metric="spearman_correlation",
        n_cross_validations=5,
        debug_log="automl.log",
        verbosity=logging.INFO,
        X=x,
        y=y,
        path=project_folder,
    )

    load_dotenv(find_dotenv())
    ws = Workspace(
        workspace_name=getenv("AML_WORKSPACE_NAME"),
        subscription_id=getenv("AML_SUBSCRIPTION_ID"),
        resource_group=getenv("AML_RESOURCE_GROUP"),
    )
    experiment = Experiment(ws, getenv("AML_EXPERIMENT_NAME"))

    local_run = experiment.submit(automl_config, show_output=True)

    sub_runs = list(local_run.get_children())

    best_run = None
    best_score = 0

    for sub_run in sub_runs:
        props = sub_run.get_properties()
        if props["run_algorithm"] != "Ensemble":
            if float(props["score"]) > best_score:
                best_score = float(props["score"])
                best_run = sub_run

    model_name = "Automl{}".format(str(uuid.uuid4()).replace("-", ""))[:20]
    best_run.register_model(model_name=model_name,
                            model_path="outputs/model.pkl")

    # best_run, fitted_model = local_run.get_output()
    # local_run.register_model(
    #     description="automl meetup best model"
    # )
    print("Model name is {}".format(model_name))
Example #6
    def submit(self,
               dispatcher: CollectingDispatcher,
               tracker: Tracker,
               domain: Dict[Text, Any],) -> List[Dict]:
        """Define what the form has to do
        after all required slots are filled"""
        task=tracker.get_slot('task')
        data=tracker.get_slot('data')
        column_name=tracker.get_slot('column_name')
        dispatcher.utter_message(template="utter_doing_task", task=task, data=data,
                                 column_name=column_name)
        # Load the workspace from the saved config file
        ws = Workspace.from_config()
        print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

        
        df = pd.read_csv(data)
        train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)
        label = column_name
        automl_config = AutoMLConfig(name='Automated ML Experiment',
                             task= task,
                             compute_target='local',
                             training_data = train_data,
                             validation_data = test_data,
                             label_column_name= label,
                             experiment_timeout_minutes=30,
                             iterations=6,
                             primary_metric = 'AUC_weighted',
                             featurization='auto',
                             )
        automl_experiment = Experiment(ws, 'mslearn-diabetes-automl')
        automl_run = automl_experiment.submit(automl_config)
        best_run, fitted_model = automl_run.get_output()
        best_run_metrics = best_run.get_metrics()
        metric_list = []
        for metric_name in best_run_metrics:
            metric = best_run_metrics[metric_name]
            metric_list.append((metric_name, metric))
        print("The best model pipeline for the data is")
        dispatcher.utter_message(text="The best model pipeline for the data is")
        print(fitted_model)
        dispatcher.utter_message(text=str(fitted_model))
        print("The different metrics are")
        dispatcher.utter_message(text="The different metrics are")
        print(metric_list)
        dispatcher.utter_message(text=str(metric_list))
        return []
Example #7
def main(force_model_register: bool, skip_model_register: bool,
         submit_pipeline: bool, publish_pipeline: bool, experiment_name: str,
         debug_run: bool, dbx_cluster_name: str, aml_compute_name: str,
         input_dataset_name: str, validation_dataset_name: str):
    pipeline: Pipeline = create_pipeline(
        debug_run=debug_run,
        dbx_compute=dbx_cluster_name,
        aml_compute=aml_compute_name,
        input_dataset=input_dataset_name,
        validation_dataset=validation_dataset_name)
    pipeline.validate()

    if submit_pipeline and not publish_pipeline:
        exp = Experiment(WS, experiment_name)
        exp.submit(pipeline,
                   pipeline_parameters={
                       "force_registration": str(force_model_register),
                       "skip_registration": str(skip_model_register)
                   })

    if publish_pipeline:
        published_pipeline: PublishedPipeline = pipeline.publish(
            name="Driver Safety Pipeline",
            description="Training Pipeline for new driver safety model")

        if submit_pipeline:
            published_pipeline.submit(workspace=WS,
                                      experiment_name=experiment_name,
                                      pipeline_parameters={
                                          "force_registration":
                                          str(force_model_register),
                                          "skip_registration":
                                          str(skip_model_register)
                                      })

        sys.stdout.write(published_pipeline.id)
    def run_pipeline(self, params):
        """
        run_pipeline - Submit a pipeline job.

        Uses the instance's workspace (self.ws), pipeline (self.pipeline)
        and experiment name (self.pipeline_name).

        :param dict params: Pipeline parameters.

        :returns:                               The submitted pipeline run
        :rtype:                                 Run
        """
        # Submit the pipeline to be run
        exp = Experiment(self.ws, self.pipeline_name)
        pipeline_run = exp.submit(self.pipeline, pipeline_parameters=params)
        return pipeline_run
Example #9
class _InnerAutomatedMLModel():
    # Inner single model to be passed that wrapper can use to pass into MultiOutputRegressor
    def __init__(self,
                 automl_config,
                 workspace,
                 experiment_name_prefix="aml_experiment"):
        self._show_output = automl_config._show_output
        self._workspace = workspace
        self._automl_config = automl_config
        self._experiment_name_prefix = experiment_name_prefix

    def get_params(self, deep=True):
        # Must be implemented for MultiOutputRegressor to view _InnerAutomatedMLModel
        # as an sklearn estimator
        return {
            'workspace': self._workspace,
            'automl_config': self._automl_config,
            'experiment_name_prefix': self._experiment_name_prefix
        }

    def fit(self, X, y, sample_weight=None):
        # fit implementation for a single output model.
        # Create experiment for specified workspace
        automl_config = copy.deepcopy(self._automl_config)
        current_time = time.localtime()
        current_time_string = time.strftime('%y_%m_%d-%H_%M_%S', current_time)
        experiment_name = self._experiment_name_prefix + "_" + current_time_string
        self._experiment = Experiment(self._workspace, experiment_name)
        # Configure automl_config with training set information.
        automl_config.user_settings['X'] = X
        automl_config.user_settings['y'] = y
        automl_config.user_settings['sample_weight'] = sample_weight
        # Wait for the remote run to complete, then set the model
        print("Experiment " + experiment_name + " has started.")
        local_run = self._experiment.submit(automl_config,
                                            show_output=self._show_output)
        print("Experiment " + experiment_name + " completed.")
        _, self._model = local_run.get_output()

    def predict(self, X):
        return self._model.predict(X)

    def predict_proba(self, X):
        return self._model.predict_proba(X)
Example #10
    "preprocess": True,
    "verbosity": logging.INFO,
    "n_cross_validations": 10
}

# AutoML object for running experiment
automated_ml_config = aml.AutoMLConfig(task='regression',
                                       debug_log='automated_ml_errors.log',
                                       path='./automated-ml-regression',
                                       X=trainingx_df.values,
                                       y=trainingy_df.values.flatten(),
                                       model_explainability=True,
                                       **automl_settings)

# Submit experiment to get AutoMLRun object
local_run = experiment.submit(automated_ml_config, show_output=True)

# Best pipeline, Model from the best pipeline, in the bunch of runs(experiment)
best_run, fitted_model = local_run.get_output()
# this tells which algorithm was used in the model from best pipeline
print(best_run.get_details())

# Predicting values in a list for test data
y_predict = fitted_model.predict(testx_df.values)

# Printing the predictions to csv
f = open('predict2014.csv', 'w')
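# (Hedged completion -- the original snippet opens the file but never writes to it.)
# Write one prediction per line, then close the file.
f.write('\n'.join(str(v) for v in y_predict))
f.close()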

# Mean Absolute Error
mae = mean_absolute_error(testy_df, y_predict)
# Range of y in training data
with open("config/aml_config.json") as f:
    config = json.load(f)

workspace_name = config["workspace_name"]
resource_group = config["resource_group"]
subscription_id = config["subscription_id"]
workspace_region = config["location"]

#Interactive Authentication
ws = Workspace(workspace_name=workspace_name,
               subscription_id=subscription_id,
               resource_group=resource_group,
               auth=cli_auth)

local_run = RunConfiguration()

local_run.environment.python.user_managed_dependencies = True

############# Experiment local-gbr-turbofan ######################
experiment_name = 'gbr-turbofan'

exp = Experiment(workspace=ws, name=experiment_name)
src = ScriptRunConfig(source_directory='compute/',
                      script='01-train.py',
                      run_config=local_run)

run = exp.submit(src, tags={"build number": sys.argv[1]})

run.wait_for_completion(show_output=True)
Example #12
from azureml.pipeline.core import PublishedPipeline
from azureml.core.experiment import Experiment
from azureml.core import Workspace

workspace = Workspace.from_config()

published_pipeline_id = ""
is_debug = True
debug_relay_connection_name = "test"

if published_pipeline_id is None or published_pipeline_id == "":
    raise ValueError("Initialize published_pipeline_id")

pipeline_parameters = {"is_debug": is_debug}
if is_debug:
    if debug_relay_connection_name == "":
        raise ValueError("Hybrid connection name cannot be empty!")

    pipeline_parameters.update(
        {"debug_relay_connection_name": debug_relay_connection_name})

experiment = Experiment(workspace, "Pipeline_debug_experiment")
published_pipeline = PublishedPipeline.get(workspace=workspace,
                                           id=published_pipeline_id)
experiment.submit(published_pipeline, pipeline_parameters=pipeline_parameters)
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name

    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df[target_var].values
    x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y_df)
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']
    best_model = request.json['best_model']

    #n_cross_validations = request.json['n_cross_validations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }

        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path=
            'D:\\Stock_Prediction\\AutoML_Azure\\python\\Flask_API_Azure\\log',
            #compute_target = 'Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings,
        )

        experiment = Experiment(ws, ExperimentName)
        remote_run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = remote_run.get_output()
        #print(best_run)
        print(best_run.get_file_names())
        #Register the model
        from datetime import date
        model = best_run.register_model(model_name=best_model +
                                        str(date.today()),
                                        model_path='outputs/model.pkl')
        print(model.name, model.id, model.version, sep='\t')
        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata.rename(columns={
            0: "one",
            1: "two",
            2: "three",
            3: "four",
            4: "five",
            5: "six",
            6: "seven",
            7: "eight",
            8: "nine",
            9: "ten",
        },
                       inplace=True)
        rundata_toJson = rundata.to_json(orient='columns')
        print(rundata_toJson)
        return rundata_toJson
    except:

        return 'error'
Example #14
print("Training the model...")
# configure Auto ML
automl_config = AutoMLConfig(task='classification',
                             debug_log='automl_errors.log',
                             primary_metric='AUC_weighted',
                             iteration_timeout_minutes=2,
                             iterations=20,
                             n_cross_validations=5,
                             preprocess=False,
                             max_concurrent_iterations=5,
                             verbosity=logging.INFO,
                             path=project_folder,
                             compute_target=batch_ai_compute,
                             data_script=project_folder + "/get_data.py")
remote_run = experiment.submit(automl_config, show_output=False)
remote_run.wait_for_completion(show_output=True)

# Retrieve All Child Runs
print("Retrieving All Child Runs")
children = list(remote_run.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    metrics = {
        k: v
        for k, v in run.get_metrics().items() if isinstance(v, float)
    }
    metricslist[int(properties['iteration'])] = metrics

rundata = pd.DataFrame(metricslist).sort_index(axis=1)
Example #15
    compute_target=aml_compute,
    path=os.path.realpath(scripts_folder),
    data_script='get_data.py',
    **automl_settings)

train_step = AutoMLStep(name='AutoML_Classification',
                        automl_config=automl_config,
                        inputs=[output_split_train_x, output_split_train_y],
                        allow_reuse=True)

print("Building pipeline")
pipeline_steps = [train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

print("Submitting pipeline")
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)

print("Waiting for pipeline completion")
pipeline_run.wait_for_completion()


def get_download_path(download_path, output_name):
    output_folder = os.listdir(download_path + '/azureml')[0]
    path = download_path + '/azureml/' + output_folder + '/' + output_name
    return path


def fetch_df(step, output_name):
    output_data = step.get_output_data(output_name)

    download_path = './outputs/' + output_name
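    # (Hedged completion -- the original snippet ends here.) Download the step's
    # intermediate output and read it back with pandas; 'data.csv' is a hypothetical
    # file name, and output_data is assumed to expose a download() method.
    output_data.download(download_path, overwrite=True)
    df_path = get_download_path(download_path, output_name) + '/data.csv'
    return pd.read_csv(df_path)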
Example #16
def run(workspace, config, args):
    compute_target_name = config['train']['compute_target_name']
    data_folder = config['train']['data_folder']

    try:
        compute_target = ComputeTarget(workspace=workspace,
                                       name=compute_target_name)
        print('found existing:', compute_target.name)
    except ComputeTargetException:
        print('creating new.')
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=config['train']['vm_size'], min_nodes=0, max_nodes=1)
        compute_target = ComputeTarget.create(workspace, compute_target_name,
                                              compute_config)
        compute_target.wait_for_completion(show_output=True)

    # ds = Datastore.register_azure_blob_container(
    #     workspace,
    #     datastore_name=config['train']['datastore_name'],
    #     account_name=config['train']['account_name'],
    #     account_key=config['train']['account_key'],
    #     container_name=config['train']['container_name'],
    #     overwrite=True)
    #
    # # # Upload local "data" folder (incl. files) as "tfdata" folder
    # ds.upload(
    #     src_dir=config['train']['local_directory'],
    #     target_path=data_folder,
    #     overwrite=True)

    ds = Datastore.get(workspace,
                       datastore_name=config['train']['datastore_name'])

    # generate data reference configuration
    dr_conf = DataReferenceConfiguration(
        datastore_name=ds.name, path_on_datastore=data_folder, mode='mount'
    )  # set 'download' if you copy all files instead of mounting

    run_config = RunConfiguration(framework="python",
                                  conda_dependencies=CondaDependencies.create(
                                      conda_packages=ast.literal_eval(
                                          config['train']['conda_packages'])))
    run_config.target = compute_target.name
    run_config.data_references = {ds.name: dr_conf}
    run_config.environment.docker.enabled = True
    # run_config.environment.docker.gpu_support = True
    run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE

    src = ScriptRunConfig(
        source_directory='./script',
        script='train.py',
        run_config=run_config,
        arguments=[
            '--datadir',
            str(ds.as_mount()), '--step', args.step, '--train_on',
            args.train_on, '--fold', args.fold, '--epochs', args.epochs,
            '--experiment', args.experiment, '--reference', args.reference,
            '--batchsize', args.batchsize, '--optimizertype',
            args.optimizertype, '--convrnn_filters', args.convrnn_filters,
            '--learning_rate', args.learning_rate, '--pix250m', args.pix250m
        ])
    # exp = Experiment(workspace=ws, name='test20181210-09')
    exp = Experiment(workspace=workspace,
                     name=config['train']['experiment_name'])
    run = exp.submit(config=src)
    run.wait_for_completion(show_output=True)
Example #17
#
# - Create an experiment to run.
# - Submit the experiment.
# - Wait for the run to complete.

# ### Create the experiment

# In[ ]:

experiment = Experiment(ws, experiment_name)

# ### Submit the experiment

# In[ ]:

run = experiment.submit(keras_est)

# Wait for the run to complete by executing the following cell. Note that this process will perform the following:
# - Build and deploy the container to Azure Machine Learning compute (~8 minutes)
# - Execute the training script (~2 minutes)
#
# If you change only the training script and re-submit, it will run faster the second time because the necessary container is already prepared, so the time required is just that of executing the training script.

# In[ ]:

run.wait_for_completion(show_output=True)

# ## Download the model files from the run

# In the training script, the Keras model is saved into two files, model.json and model.h5, in the outputs/model folder on the GPU cluster AmlCompute node. Azure ML automatically uploads anything written to the ./outputs folder into the run history file store. Subsequently, we can use the run object to download the model files. They are under the outputs/model folder in the run history file store, and are downloaded into a local folder named model.
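# A minimal sketch (not part of the original notebook) of downloading the two model files;
# the exact source paths under outputs/model are assumptions based on the description above.

# In[ ]:

import os

os.makedirs('./model', exist_ok=True)
run.download_file('outputs/model/model.json', output_file_path='./model/model.json')
run.download_file('outputs/model/model.h5', output_file_path='./model/model.h5')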
Example #18
def peptide_identification(args):
    print(datetime.now(), ': Peptide identification starts...')
    print('Settings: ')
    print(args)

    # PLATO setting
    subclusterCount = args.subclusterCount
    spy = args.spy
    spy_portion = args.spy_portion
    RN = args.RN
    rnd_all = args.rnd_all  # If random method, include all decoys
    rnd_portion = args.rnd_portion  # If random method, include rnd.portion of positive set, default 1: pos set = neg set
    replicates_cnt = args.replicates_cnt
    include_label = args.include_label
    AML_preprocess = args.AML_preprocess
    output_folder = args.output_folder

    # AutoML parameter setting
    autoML_best_model_selection = args.autoML_best_model_selection
    autoML_iterations = args.autoML_iterations

    metric = args.metric  # Other metrics: azureml.train.automl.utilities.get_primary_metrics('classification')
    cv_fold = args.cv_fold

    # Input, output
    file_name = args.sample_name
    input_path = args.input_folder
    output_path = output_folder + '/' + file_name
    log_file = output_path + '_autoML_errors_log.html'

    # Instantiate AutoML config and create an experiment in autoML workspace
    ws = Workspace.from_config()
    experiment_name = file_name
    experiment = Experiment(ws, experiment_name)
    print(datetime.now(),
          ': Assigned experiment ' + experiment_name + ' on Azure portal ')

    output = {}
    output['SDK version'] = azureml.core.VERSION
    output['Workspace Name'] = ws.name
    output['Resource Group'] = ws.resource_group
    output['Location'] = ws.location
    outputDf = pd.DataFrame(data=output, index=[''])
    print(outputDf)

    print(datetime.now(), ': Reading inputs')
    # Read POSITIVES and ALL inputs
    positives_path = glob.glob(input_path + file_name + '*POSITIVES*')
    raw_positives = pd.read_csv(positives_path[0], sep='\t')

    if AML_preprocess == True:
        all_path = glob.glob(input_path + file_name + '-ALL.txt')
        raw_all = pd.read_csv(all_path[0], sep='\t')
        # Extract new features
        # First and last three amino acids of peptide sequences as features - If NA then B category
        raw_all['Peptide'] = raw_all.Peptide.str.replace(
            r'([\(\[]).*?([\)\]])', r'B', regex=True)
        raw_all['P1'] = raw_all['Peptide'].str[0]
        raw_all['P2'] = raw_all['Peptide'].str[2]
        raw_all['P3'] = raw_all['Peptide'].str[3]
        raw_all['P4'] = raw_all['Peptide'].str[-4]
        raw_all['P5'] = raw_all['Peptide'].str[-3]
        raw_all['P6'] = raw_all['Peptide'].str[-1]

    else:
        all_path = glob.glob(input_path + file_name +
                             '_percolator_feature.txt')
        raw_all = pd.read_csv(all_path[0], sep='\t')

    raw_all['Class'] = 0

    # Make positive and test set
    test_data = raw_all.drop(['ScanNr', 'Proteins'], axis=1)
    positive_set = pd.merge(left=pd.DataFrame(raw_positives['SpecId']),
                            right=pd.DataFrame(test_data),
                            how='left',
                            left_on='SpecId',
                            right_on='SpecId')
    positive_set['Class'] = 1

    # Remove decoys in positive set, if there is any
    decoys_in_positive_idx = positive_set.index[positive_set['Label'] ==
                                                -1].tolist()
    positive_set = positive_set[positive_set['Label'] != -1]

    # Dataframe to store predictions
    all_predictions = pd.DataFrame({
        'SpecId': list(test_data['SpecId']),
        'Peptide': list(test_data['Peptide']),
        'Label': list(test_data['Label'])
    })
    prediction_summary = all_predictions

    # Prepare test set for modeling
    y_test = test_data['Class']
    if include_label == True:
        X_test = test_data.drop(['SpecId', 'Peptide', 'Class'], axis=1)
    else:
        X_test = test_data.drop(['SpecId', 'Peptide', 'Label', 'Class'],
                                axis=1)

    # Prepare positive set for modeling
    positive_set_idx = [
        test_data['SpecId'].tolist().index(x)
        for x in positive_set['SpecId'].tolist()
        if x in test_data['SpecId'].tolist()
    ]

    # Used to create the negative set
    decoys_idx = np.setdiff1d(
        test_data.index[test_data['Label'] == -1].tolist(),
        decoys_in_positive_idx).tolist()

    global gower_dist_avg
    if RN == True:
        if os.path.exists(input_path + file_name +
                          'gower_dist_avg.npy') == False:
            print(datetime.now(), ': Calculating Gower distance')
            gower_dist = gower.gower_matrix(test_data)
            selected_rows = gower_dist[positive_set_idx]
            gower_dist_avg = np.mean(selected_rows, axis=0)
            print(datetime.now(), ': Saving Gower distance matrix')
            np.save(input_path + '/' + file_name + 'gower_dist_avg.npy',
                    gower_dist_avg)  # save
        else:
            print(datetime.now(), ': Loading Gower distance matrix from ',
                  input_path + file_name + 'gower_dist_avg.npy')
            gower_dist_avg = np.load(input_path + file_name +
                                     'gower_dist_avg.npy')  # load

    if spy == True:
        all_spies = pd.DataFrame()
    '''
    Create train set by concatenating positive and negative set, build model(s) using autoML
    and store predictions based on the best model
    '''
    for rep in range(0, replicates_cnt):
        print(datetime.now(), ': Replicate #', rep + 1)
        if spy == True:
            # Exclude spy_portion of training data to be the spies
            positive_set = positive_set.sample(n=len(positive_set),
                                               random_state=rep *
                                               100).reset_index(drop=True)
            spySet_size = round(len(positive_set) * spy_portion)
            spies_ID = positive_set.loc[1:spySet_size, ['SpecId']]
            positive_set_wSpy = positive_set.iloc[spySet_size +
                                                  1:len(positive_set)]

        if RN == False:
            if rnd_all == True:
                # Negative set includes all decoys
                negative_set_idx = decoys_idx
            else:
                # Negative set idx includes rnd_portion times of |positive_set| indices
                random.seed(rep)
                random.shuffle(decoys_idx)
                negative_set_idx = decoys_idx[0:rnd_portion *
                                              len(positive_set)]
        else:
            print(datetime.now(), ': Starts estimating RNs')
            negative_set_idx = reliable_negative(test_data, positive_set,
                                                 subclusterCount, rep)
            print(datetime.now(), ': Ends estimating RNs')

        negative_set = test_data.iloc[negative_set_idx]

        if spy == True:
            train_data = pd.concat([positive_set_wSpy, negative_set], axis=0)
        else:
            train_data = pd.concat([positive_set, negative_set], axis=0)

        y_train = train_data['Class']
        if include_label == True:
            X_train = train_data.drop(['SpecId', 'Peptide', 'Class'], axis=1)
        else:
            X_train = train_data.drop(['SpecId', 'Peptide', 'Class', 'Label'],
                                      axis=1)

        print('Training set size:', len(y_train), '\nTest set size:',
              len(y_test))

        automl_config = AutoMLConfig(task='classification',
                                     debug_log=log_file,
                                     primary_metric=metric,
                                     iteration_timeout_minutes=200,
                                     iterations=autoML_iterations,
                                     verbosity=logging.INFO,
                                     preprocess=AML_preprocess,
                                     X=X_train,
                                     y=y_train,
                                     n_cross_validations=cv_fold,
                                     model_explainability=True)

        print(datetime.now(), ': modeling replicate #' + str(rep + 1) + '...')
        local_run = experiment.submit(automl_config, show_output=True)

        if autoML_best_model_selection == False:
            # Retrieve the Best Model based on bunch of metrics
            children = list(local_run.get_children())
            metricslist = {}
            for run in children:
                properties = run.get_properties()
                metrics = {
                    k: v
                    for k, v in run.get_metrics().items()
                    if isinstance(v, float)
                }
                metricslist[int(properties['iteration'])] = metrics

            rundata = pd.DataFrame(metricslist).sort_index(axis=1)
            tmp = rundata.T.sort_values([
                'AUC_weighted', 'f1_score_weighted',
                'precision_score_weighted', 'recall_score_weighted',
                'weighted_accuracy'
            ],
                                        ascending=False)
            rundata = tmp.sort_values('log_loss', ascending=True).T
            best_run_iteration = rundata.columns.values[0]
            rundata.to_csv(output_path + '_metrics_list_' + str(rep) + '.txt')
            best_run, fitted_model = local_run.get_output(
                iteration=best_run_iteration)
        else:
            best_run, fitted_model = local_run.get_output()

        print('Best run: ', best_run)
        print(datetime.now(), ': Saving best model and predictions')
        # Save the best model, prediction value and probability
        modelname = output_path + '_model_' + str(rep) + '.sav'
        joblib.dump(fitted_model, modelname)
        y_pred_val = fitted_model.predict(X_test)
        y_pred_prob = fitted_model.predict_proba(X_test)

        # Add the results of the replicate to all predictions table
        all_predictions['pred_rep' + str(rep)] = list(y_pred_val)
        all_predictions['prob_rep' + str(rep)] = list(
            [item[1] for item in y_pred_prob])

        # Overwrite prediction values based on the spies cutoff
        if spy == True:
            threshold = min(
                pd.merge(spies_ID, all_predictions,
                         on='SpecId')['prob_rep' + str(rep)])
            all_predictions['pred_rep' + str(rep)] = np.where(
                all_predictions['prob_rep' + str(rep)] >= threshold, 1, 0)
            all_spies['SpecId' + str(rep)] = spies_ID['SpecId']
            all_spies['Prob_rep' + str(rep)] = list(
                pd.merge(spies_ID, all_predictions,
                         on=['SpecId'])['prob_rep' + str(rep)])

        print(datetime.now(), ': Replicate #' + str(rep + 1) + ' processed!')
        all_predictions.to_csv(output_path + '_all_predictions.csv',
                               index=False)

    if spy == True:
        all_spies.to_csv(output_path + '_all_spies.csv', index=False)

    print(datetime.now(), ': Generate prediction summary of all replicates')
    pred_col_indecies = [
        col for col in all_predictions.columns if 'pred' in col
    ]
    prob_col_indecies = [
        col for col in all_predictions.columns if 'prob' in col
    ]

    prediction_summary['Std'] = all_predictions[prob_col_indecies].std(
        skipna=True, axis=1)
    prediction_summary['Min'] = all_predictions[prob_col_indecies].min(
        skipna=True, axis=1)
    prediction_summary['Max'] = all_predictions[prob_col_indecies].max(
        skipna=True, axis=1)
    prediction_summary['Avg'] = all_predictions[prob_col_indecies].mean(
        skipna=True, axis=1)
    prediction_summary['Median'] = all_predictions[prob_col_indecies].median(
        skipna=True, axis=1)
    prediction_summary['Vote'] = all_predictions[pred_col_indecies].sum(
        skipna=True, axis=1)
    prediction_summary.to_csv(output_path + '_prediction_summary.txt',
                              sep='\t',
                              index=False)

    # Feature importance
    print(datetime.now(), ': Output feature importance of the best run')
    client = ExplanationClient.from_run(best_run)
    raw_explanations = client.download_model_explanation(
        top_k=len(X_test.columns))
    print('Raw feature importance')
    print(raw_explanations.get_feature_importance_dict())
    d = raw_explanations.get_feature_importance_dict()
    raw_feature_importance = pd.DataFrame(list(d.items()))
    raw_feature_importance.to_csv(output_path + '_raw_feature_importance.csv',
                                  index=False)
    # Engineered
    engineered_explanations = client.download_model_explanation(
        top_k=len(X_test.columns))
    print('Engineered feature importance')
    print(engineered_explanations.get_feature_importance_dict())
    d = engineered_explanations.get_feature_importance_dict()
    engineered_feature_importance = pd.DataFrame(list(d.items()))
    engineered_feature_importance.to_csv(output_path +
                                         '_engineered_feature_importance.csv',
                                         index=False)

    now = datetime.now()
    print(datetime.now(), ': Program end')
################################

#%%
# Step 14 - Create estimator
#############################
from azureml.train.estimator import Estimator

script_params = {
    '--data-folder': ds.as_mount(),
    '--training-set-percentage': 0.3
}

est_config = Estimator(source_directory='./training',
                       script_params=script_params,
                       compute_target=compute_target,
                       entry_script='train.py',
                       conda_packages=['scikit-learn', 'pandas'])

#%%
# Step 15 - Execute the estimator job
#####################################
run = exp.submit(config=est_config)
run

# Poll for job status
run.wait_for_completion(
    show_output=True)  # value of True will display a verbose, streaming log

# Examine the recorded metrics from the run
print(run.get_metrics())
Example #20
    automl_settings = {
        "iteration_timeout_minutes": 60,
        "iterations": 100,
        "n_cross_validations": 5,
        "primary_metric": 'AUC_weighted',
        "preprocess": True,
        "max_cores_per_iteration": 2
    }

    automl_config = AutoMLConfig(task='classification',
                                 path=project_folder,
                                 run_configuration=conda_run_config,
                                 data_script=project_folder + "/get_data.py",
                                 **automl_settings)

    remote_run = experiment.submit(automl_config)

    # Canceling runs
    #
    # You can cancel ongoing remote runs using the *cancel()* and *cancel_iteration()* functions

    print(remote_run.id)

    time.sleep(180)

    # Cancel the ongoing experiment and stop scheduling new iterations
    remote_run.cancel()
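    # Hedged alternative (not in the original snippet): cancel a single iteration
    # instead of the whole run, assuming remote_run is an AutoMLRun.
    # remote_run.cancel_iteration(1)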

    print('run cancelled')

    # Wait for the run to complete.  It should complete soon because it has been canceled.
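    # A minimal sketch of the wait described above, assuming the same remote_run object.
    remote_run.wait_for_completion(show_output=True)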
Example #21
def RunAutoMLForecast():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']
    cluster_name = request.json['cluster_name']
    best_model = request.json['best_model']
    time_column_name = request.json['time_column_name']
    max_horizon = request.json['max_horizon']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')
    compute_target = AmlCompute(ws, cluster_name)
    print('Found existing AML compute context.')
    dataset_name = file_name
    time_column_name = time_column_name
    # Get a dataset by name
    dataset = Dataset.get_by_name(workspace=ws,
                                  name=dataset_name).with_timestamp_columns(
                                      fine_grain_timestamp=time_column_name)
    print(dataset)
    #df_ts = Dataset.Tabular.from_delimited_files(df_ts)
    dataset.to_pandas_dataframe().describe()
    dataset.take(3).to_pandas_dataframe()
    print(dataset)
    #y_df = df_ts[target_var].values
    #x_df = df_ts.drop([target_var], axis=1)
    print('file successfully received.')
    #stock_dataset_df.head()
    # create a new RunConfig object
    conda_run_config = RunConfiguration(framework="python")
    conda_run_config.environment.docker.enabled = True
    conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
    cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
                                  conda_packages=['numpy', 'py-xgboost<=0.80'])
    conda_run_config.environment.python.conda_dependencies = cd
    print('run config is ready')
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    #max_concurrent_iterations = request.json['max_concurrent_iterations']

    automl_settings = {
        'time_column_name': time_column_name,
        'max_horizon': max_horizon,
        "iterations": iterations,
    }

    automl_config = AutoMLConfig(
        task=tasks,
        primary_metric=primary_metric,
        #blacklist_models = ['ExtremeRandomTrees', 'AutoArima', 'Prophet'],
        experiment_timeout_minutes=iteration_timeout_minutes,
        training_data=dataset,
        label_column_name=target_var,
        compute_target=compute_target,
        enable_early_stopping=True,
        n_cross_validations=n_cross_validations,
        #verbosity=logging.INFO,
        **automl_settings)
    print("AutoML config created.")
    experiment = Experiment(ws, ExperimentName)
    remote_run = experiment.submit(automl_config, show_output=True)
    children = list(remote_run.get_children())
    metricslist = {}
    for run in children:
        properties = run.get_properties()
        metrics = {
            k: v
            for k, v in run.get_metrics().items() if isinstance(v, float)
        }
        metricslist[int(properties['iteration'])] = metrics

    rundata = pd.DataFrame(metricslist).sort_values(by=primary_metric, axis=1)
    rundata.rename(columns={
        0: "one",
        1: "two",
        2: "three",
        3: "four",
        4: "five",
        5: "six",
        6: "seven",
        7: "eight",
        8: "nine",
        9: "ten",
    },
                   inplace=True)
    iterations_toJson = rundata.to_json(orient='columns')
    print(iterations_toJson)
    best_run, fitted_model = remote_run.get_output()
    #best_run_toJson = best_run.get_metrics()
    #dict = {}
    #dict['iterations_toJson'] = iterations_toJson
    #dict['best_run_toJson'] = best_run_toJson
    #print(best_run.get_file_names())
    #Register the model
    #from datetime import date
    model = remote_run.register_model(model_name=best_model,
                                      description='AutoML Model')
    print(model.name, model.id, model.version, sep='\t')
    best_model = model.name
    var1 = "@"
    var2 = var1 + best_model
    return '{} {}'.format(iterations_toJson, var2)
Example #22
def RunAutoMLReg():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']
    cluster_name = request.json['cluster_name']
    best_model = request.json['best_model']
    #best_model = request.json['best_model']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')
    #compute_target = AmlCompute(ws, cluster_name)
    compute_target = ws.compute_targets[cluster_name]
    print('Found existing AML compute context.')
    dataset_name = file_name

    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    #stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    #stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    X = df.drop_columns(columns=[target_var])
    y = df.keep_columns(columns=[target_var], validate=True)
    #y_df = stock_dataset_df[target_var].values
    #x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y)
    # create a new RunConfig object
    conda_run_config = RunConfiguration(framework="python")
    conda_run_config.environment.docker.enabled = True
    conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
    cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
                                  conda_packages=['numpy', 'py-xgboost<=0.90'])
    conda_run_config.environment.python.conda_dependencies = cd
    print('run config is ready')
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "featurization": 'auto',
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations
            #"verbosity": logging.INFO
        }

        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            blacklist_models=['XGBoost'],
            #path=os.getcwd(),
            compute_target=compute_target,
            #run_configuration=conda_run_config,
            X=X,
            y=y,
            **automl_settings,
        )

        experiment = Experiment(ws, ExperimentName)
        remote_run = experiment.submit(automl_config, show_output=True)
        remote_run.flush(timeout_seconds=400)
        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_values(by=primary_metric, axis=1)
        rundata = rundata.drop([
            'mean_absolute_percentage_error',
            'normalized_median_absolute_error',
            'normalized_root_mean_squared_log_error',
            'root_mean_squared_log_error'
        ])
        rundata.rename(columns={
            0: "one",
            1: "two",
            2: "three",
            3: "four",
            4: "five",
            5: "six",
            6: "seven",
            7: "eight",
            8: "nine",
            9: "ten",
        },
                       inplace=True)
        iterations_toJson = rundata.to_json(orient='columns')
        print(iterations_toJson)
        best_run, fitted_model = remote_run.get_output()
        best_run_toJson = best_run.get_metrics()
        cwd = 'D:/DCSAIAUTOML/BestModels/Azure'
        best_model_name = best_run.name
        model = remote_run.register_model(description=best_model)
        print(model.name, model.id, model.version, sep='\t')
        model_path = os.path.join(cwd, best_model, best_model_name)
        print(model_path)
        #print("Model DownLoad Complete")
        #model = Model(workspace=ws, name=model.name)
        #model.download_files(target_dir=model_path)
        #dict = {}
        #dict['iterations_toJson'] = iterations_toJson
        #dict['best_run_toJson'] = best_run_toJson
        #print(best_run.get_file_names())
        #Register the model
        #from datetime import date

        best_model_id = best_run.name

        var1 = "@"
        var2 = var1 + best_model_id

        Reg_model_name = model.name
        var4 = var1 + Reg_model_name

        best_run.flush(timeout_seconds=3600)
        best_run.download_files(output_directory=model_path)
        # importing required modules
        #import shutil
        #output_path = os.path.join(model_path, best_model_id)
        #dir_name1 = "D:\\DCSAIAUTOML\\BestModels\\Azure\\my_azure_best"
        #dir_name1 = "D:\\DCSAIAUTOML\\BestModels\\Azure\\my_azure_best\\my_azure_best"
        #shutil.make_archive(model_path,'zip',model_path)

        #zipf = zipfile.ZipFile(best_model_id+'.zip', 'w', zipfile.ZIP_DEFLATED)
        #for root, dirs, files in os.walk(model_path):
        #for file in files:
        #zipf.write(os.path.join(root, file))

        #def zipdir(path, ziph):
        # ziph is zipfile handle
        #import os
        #for root, dirs, files in os.walk(path):
        #for file in files:
        #ziph.write(os.path.join(root, file))

        #zipdir(model_path, zipf)
        #remote_run.clean_preprocessor_cache()
        print("ready to return")
        var5 = "no exception"
        return '{} {} {} {} {}'.format(iterations_toJson, var2, var4, var1,
                                       var5)
        #return iterations_toJson
    except Exception as e:
        error_statement = str(e)
        print("Error statement: ", error_statement)
        model_path1 = os.path.join(model_path, 'outputs')
        file_name = 'model.pkl'
        print("in exception: ", model_path1)
        src = 'D:\\Final Script_dev'
        full_file_name = os.path.join(src, file_name)
        import shutil
        #remote_run.download_file('model.pkl', output_file_path=model_path1)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, model_path1)
        return '{} {} {} {} {}'.format(iterations_toJson, var2, var4, var1,
                                       error_statement)
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    #location = request.json['location']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name

    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df['ActionTaken'].values
    x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)
    print(y_df)
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']

    #n_cross_validations = request.json['n_cross_validations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }

        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path=os.getcwd(),
            #compute_target = 'Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings,
        )

        experiment = Experiment(ws, 'automl_local_v2')
        remote_run = experiment.submit(automl_config, show_output=True)
        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata_toJson = rundata.to_json(orient='columns')

        return rundata_toJson
    except Exception as e:
        print("Error statement: ", str(e))
        return 'error'
Example #24
0
def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')

    # interactive_auth = InteractiveLoginAuthentication(tenant_id="b88f1ff4-e3ab-4adb-83e6-4ea99d41c665")

    sp = ServicePrincipalAuthentication(tenant_id='b88f1ff4-e3ab-4adb-83e6-4ea99d41c665',
                                        service_principal_id='2e90efa1-d53f-45d4-96d8-7adde8a02cdc',
                                        service_principal_password='******')
    query = req.params.get('query')

    if not query:
        try:
            req_body = req.get_json()
        except ValueError:
            pass
        else:
            query = req_body.get('query')

    if query == 'run':
        try:
            ws = Workspace.get(name="vrd-ml",
                               subscription_id="b9301f45-7da5-41f6-9125-1331de94f262",
                               resource_group="vrd-dev-asia",
                               auth=sp)
            
            compute_name = 'automl-compute'

            if compute_name in ws.compute_targets:
                compute_target = ws.compute_targets[compute_name]
                if compute_target and isinstance(compute_target, AmlCompute):
                    print('Found existing compute target: ' + compute_name)
            else:
                print('creating a new compute target...')
                provisioning_config = AmlCompute.provisioning_configuration(vm_size = 'STANDARD_D2_V2',
                                                                            min_nodes = 0, 
                                                                            max_nodes = 4)
                compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
                compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

            dataset = Dataset.get_by_name(ws, name='datasetfunc')

            train_data, test_data = dataset.random_split(percentage=0.8, seed=223)
            label = "ERP"

            automl_config = AutoMLConfig(task = 'regression',
                            compute_target = compute_name,
                            training_data = train_data,
                            label_column_name = label,
                            validation_data = test_data,
                            # n_cross_validations= 3,
                            primary_metric= 'r2_score',
                            enable_early_stopping= True, 
                            experiment_timeout_hours= 0.3,
                            max_concurrent_iterations= 4,
                            max_cores_per_iteration= -1,
                            verbosity= logging.INFO
                            )

            experiment_name = 'expfunc'
            experiment = Experiment(workspace = ws, name = experiment_name)

            run = experiment.submit(automl_config, show_output=True)

            run.wait_for_completion()
        except ValueError:
            pass
        return func.HttpResponse("AutoML Run Completed")
    else:
        return func.HttpResponse(
             "This HTTP triggered function executed successfully. Pass a name in the query string or in the request body for a personalized response.",
             status_code=200
        )
Example #25
0
    max_concurrent_iterations=9,
    max_cores_per_iteration=-1,
    forecasting_parameters=forecasting_parameters,
)

# COMMAND ----------

# DBTITLE 1,Train
# submit a new training run
from azureml.train.automl.run import AutoMLRun

try:
    if new_training == "True":
        print("New Training Run")
        remote_run = experiment.submit(
            automl_config,
            show_output=False)  # Story No. 3018 modified Mukesh Dutta 9/3/2021
    else:
        # If you need to retrieve a run that already started, use the following code
        print("Existing Training Run")
        remote_run = AutoMLRun(experiment=experiment, run_id=runid)
except Exception as error:
    print(error)
    log_error("{} {}".format(notebook, error))  #log error in sentry
    #raise dbutils.notebook.exit(error) #raise the exception
    raise error  #raise the exception

remote_run

# COMMAND ----------
Example #26
0
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

aml_run_config = RunConfiguration()

aml_run_config.target = cpu_cluster_name

# AmlCompute is created in the same region as your workspace
# Set the VM size for AmlCompute from the list of supported_vmsizes
aml_run_config.amlcompute.vm_size = 'STANDARD_D2_V2'
aml_run_config.amlcompute._cluster_max_node_count = 2

# Specify CondaDependencies obj, add necessary packages
aml_run_config.environment.python.conda_dependencies = CondaDependencies(
    "./../localscripts/turbofan.yml")

#CondaDependencies.create(conda_packages=['scikit-learn'])

############# Experiment remote-gbr-turbofan ######################
experiment_name = 'gbr-turbofan'

exp = Experiment(workspace=ws, name=experiment_name)
src = ScriptRunConfig(source_directory='./',
                      script='01-train.py',
                      run_config=aml_run_config)

run = exp.submit(src, tags={"python version": sys.version[0:6]})

run.wait_for_completion(show_output=True)
#
# Automated machine learning trains multiple machine learning pipelines. Each pipeline's training is known as an iteration.
# * You can specify a maximum number of iterations using the `iterations` parameter.
# * You can specify a maximum time for the run using the `experiment_timeout_minutes` parameter.
# * If you specify neither `iterations` nor `experiment_timeout_minutes`, automated ML keeps running iterations while the scores continue to improve.
#
# The following example specifies neither `iterations` nor `experiment_timeout_minutes`, so it runs until the scores stop improving.
#
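# The `automl_config` submitted in the next cell is not shown in this snippet. A
# minimal sketch, assuming `X_train`/`y_train` are already prepared, omits both
# `iterations` and `experiment_timeout_minutes` so the run stops only when scores
# plateau; add either parameter (e.g. iterations=20) to cap the run explicitly.
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task='classification',
                             primary_metric='AUC_weighted',
                             X=X_train,
                             y=y_train,
                             n_cross_validations=3)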

# In[43]:

## run remote train
from azureml.core.experiment import Experiment
experiment = Experiment(ws, 'automl_remote')
remote_run = experiment.submit(automl_config, show_output=True)

# In[4]:

# configure the experiment to run on local compute
automl_config = AutoMLConfig(task='classification',
                             primary_metric='AUC_weighted',
                             X=X_train,
                             y=y_train,
                             n_cross_validations=3)

# Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations, this can run for a while.
# In this example, we specify `show_output=True` to print currently running iterations to the console.
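# The submit call described above is not included in this snippet; assuming the
# local `automl_config` and an `experiment` object as defined earlier, a sketch
# of that cell would be:
local_run = experiment.submit(automl_config, show_output=True)
best_run, fitted_model = local_run.get_output()  # best child run and its fitted model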

# In[5]:
import azureml.core
from azureml.core import Workspace
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.train.dnn import PyTorch

subscription_id = "" # The ID of the Azure Subscription
resource_group = "AdvanceAnalytics.Aml.Experiments" # Name of a logical resource group
workspace_name = "aa-ml-aml-workspace" # The name of the workspace to look for or to create
workspace_region = 'eastus' # Location of the workspace
computetarget_vm= 'Standard_NC6' # Size of the VM to use
experiment_name = 'azureml-gpubenchmark'
train_script = 'train_and_track.py'

ws = Workspace.create(
    name = workspace_name,
    subscription_id = subscription_id,
    resource_group = resource_group, 
    location = workspace_region,
    exist_ok = True)

src = PyTorch(source_directory=r'.\fastai',
              compute_target='amlcompute',
              vm_size=computetarget_vm,
              entry_script=train_script,
              use_gpu=True,
              pip_packages=['fastai', 'azureml-sdk'])
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(src)

run.wait_for_completion(show_output = True)




    whitelist_models=[
        "GradientBoosting",
        "DecisionTree",
        "RandomForest",
        "ExtremeRandomTrees",
        "LightGBM",
    ],
    blacklist_models=["ensemble"],
    X=x,
    y=y,
    path=project_folder,
)

experiment = Experiment(ws, "host-ml-nt-ai-meetup")

db_run = experiment.submit(automl_config, show_output=True)

sub_runs = list(db_run.get_children())

best_run = None
best_score = 0

for sub_run in sub_runs:
    props = sub_run.get_properties()
    # skip ensemble iterations and keep the child run with the highest score
    if props["run_algorithm"] != "Ensemble":
        if float(props["score"]) > best_score:
            best_score = float(props["score"])
            best_run = sub_run

model_name = "Automl{}".format(str(uuid.uuid4()).replace("-", ""))[:20]
best_run.register_model(model_name=model_name, model_path="outputs/model.pkl")
# best_run, fitted_model = local_run.get_output()
    #spark_context=sc,
    training_data=training_data,
    label_column_name=label,
    **automl_settings,
    featurization='auto',
    experiment_exit_score=.98)

# COMMAND ----------

# MAGIC %md Submit the experiment to the Automated ML service. This step can take a while depending on the settings. AutoML will give us updates as models are trained and evaluated against the metric we specified above. The information from each model training iteration is stored in the Experiment section of the Azure ML workspace in the Azure portal.

# COMMAND ----------

# DBTITLE 1,Submit run to your Databricks cluster
local_run = experiment.submit(
    automl_config, show_output=True
)  # for longer runs, set show_output=False and monitor progress via the portal link below

# COMMAND ----------

# DBTITLE 1,Monitor progress in the portal
displayHTML(
    "<a href={} target='_blank'>Your experiment in Azure Portal: {}</a>".
    format(local_run.get_portal_url(), local_run.id))

# COMMAND ----------

# MAGIC %md **Run After AutoML Experiment Is Complete**

# COMMAND ----------
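# DBTITLE 1,Retrieve the best run (illustrative sketch)
# The cell that originally followed here is not part of this snippet. A minimal,
# hypothetical follow-up using the `local_run` object from the submit cell above
# could wait for the run to finish and pull the best child run and fitted model:
local_run.wait_for_completion(show_output=False)
best_run, fitted_model = local_run.get_output()
print("Best run:", best_run.id)
print(best_run.get_metrics())

# COMMAND ----------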