Example #1
    def upload_dataset(self, dataset_name: str, local_folder: str, datastore_name: str = None, overwrite: bool = False, tags: dict = None) -> FileDataset:
        '''
        Uploads data from a local directory into an AzureML Datastore that points to Azure Data Lake
        Args:
            dataset_name (str): The name of the dataset to register
            local_folder (str): The location of the local directory to take files from
            datastore_name (str): The name of the Datastore that will contain the dataset
            overwrite (bool): Whether existing files on the Datastore should be overwritten
            tags (dict): Tags to attach to the registered dataset
        Returns:
            FileDataset: The registered dataset, containing the files
        '''
        if not datastore_name:
            # No datastore name is given, so we'll take the default one
            datastore_name = self.__datastore_path

        # Connecting data store
        datastore = Datastore(self.__workspace, name=datastore_name)

        # TODO : check type of datastore
        datastore.upload(local_folder, dataset_name, overwrite, True)
        
        datastore_paths = [(datastore, dataset_name)]
        file_ds = Dataset.File.from_files(path=datastore_paths)

        file_ds = file_ds.register(workspace=self.__workspace,
                                   name=dataset_name,
                                   description=dataset_name,
                                   tags=tags, create_new_version=True)

        return file_ds
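
# For orientation: the same upload-and-register flow written directly against the v1 SDK, outside
# the helper class above. Workspace config, datastore name and folder names are placeholders.
from azureml.core import Workspace, Datastore, Dataset

ws = Workspace.from_config()
datastore = Datastore(ws, name="workspaceblobstore")

# Upload the local folder into the datastore under the dataset name
datastore.upload(src_dir="./data/sales", target_path="sales", overwrite=True, show_progress=True)

# Wrap the uploaded files in a FileDataset and register it
file_ds = Dataset.File.from_files(path=[(datastore, "sales")])
file_ds = file_ds.register(workspace=ws, name="sales", description="sales",
                           create_new_version=True)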
Example #2
    def __exit__(self, *exc_details):
        """Upload files for datastore.

        :param exc_details:
        :return:
        """
        from azureml.core.datastore import Datastore
        from azureml.data._dataprep_helper import dataprep

        module_logger.debug("Enter __exit__ function of datastore cmgr")
        for key, value in self._config.items():
            df_config, force_read = self._to_data_reference_config(value)
            if self._is_upload(df_config):
                self._validate_config(df_config, key)
                ds = Datastore(workspace=self._workspace,
                               name=df_config.data_store_name)
                if os.path.isdir(df_config.path_on_compute):
                    if self._is_datastore_adlsgen1(ds):
                        module_logger.debug(
                            "AzureDataLake Gen1 used as Datastore for upload dir."
                        )
                        dataprep().api.engineapi.api.get_engine_api(
                        ).upload_directory(
                            dataprep().api.engineapi.typedefinitions.
                            UploadDirectoryMessageArguments(
                                base_path=df_config.path_on_compute,
                                folder_path=df_config.path_on_compute,
                                destination=dataprep(
                                ).api._datastore_helper._to_stream_info_value(
                                    ds, df_config.path_on_data_store),
                                force_read=force_read,
                                overwrite=df_config.overwrite,
                                concurrent_task_count=1))
                    else:
                        ds.upload(src_dir=df_config.path_on_compute,
                                  target_path=df_config.path_on_data_store,
                                  overwrite=df_config.overwrite)
                elif os.path.isfile(df_config.path_on_compute):
                    if self._is_datastore_adlsgen1(ds):
                        module_logger.debug(
                            "AzureDataLake Gen1 used as Datastore for upload file."
                        )
                        dataprep().api.engineapi.api.get_engine_api(
                        ).upload_file(
                            dataprep().api.engineapi.typedefinitions.
                            UploadFileMessageArguments(
                                base_path=os.path.dirname(
                                    df_config.path_on_compute),
                                local_path=df_config.path_on_compute,
                                destination=dataprep(
                                ).api._datastore_helper._to_stream_info_value(
                                    ds, df_config.path_on_data_store),
                                force_read=force_read,
                                overwrite=df_config.overwrite))
                    else:
                        ds.upload_files(
                            files=[df_config.path_on_compute],
                            target_path=df_config.path_on_data_store,
                            overwrite=df_config.overwrite)
        module_logger.debug("Exit __exit__ function of datastore cmgr")
Example #3
    def get_by_data_reference(cls, workspace, path):
        data_store = Datastore(workspace, cls.DEFAULT_GLOBAL_DATASET_STORE)
        return DataReference(
            datastore=data_store,
            data_reference_name=cls.DEFAULT_DATA_REFERENCE_NAME,
            path_on_datastore=path,
        )
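
# A hedged sketch of how a DataReference like the one returned above is typically consumed: passed
# as a pipeline step input so it resolves to a path on the compute at run time. The class name
# DatasetHelper, the script name and the compute name are placeholders, not from the source.
from azureml.pipeline.steps import PythonScriptStep

data_ref = DatasetHelper.get_by_data_reference(ws, "global/datasets/titanic")
step = PythonScriptStep(
    name="consume_data",
    script_name="train.py",
    source_directory="./src",
    arguments=["--data_dir", data_ref],  # resolves to the mounted/downloaded path
    inputs=[data_ref],
    compute_target="cpu-cluster",
)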
Example #4
    def load_tabular_partition(self,
                               partition_name: str,
                               datastore_name: str = None,
                               columns: np.array = None,
                               first_row_header: bool = False,
                               cloud_storage: bool = True) -> pd.DataFrame:
        '''
        Loads a partition from a tabular dataset.
            The implementation will connect to the Datastore and get all delimited files matching the partition_name.
            When configured locally, the implementation will append all files in the datastore_path with the name {partition_name}.csv
        Args:
            partition_name (str): The name of the partition as a wildcard filter. Example: B* will take all files starting with B and ending with .csv
            datastore_name (str): The name of a Datastore that contains Datasets
            columns (np.array): The column names to assign to the dataframe
            first_row_header (bool): Whether the first row of each file should be used as the header
            cloud_storage (bool): When set to False, the dataset will be loaded from the local folder
        Returns:
            pd.DataFrame: The dataset, loaded as a DataFrame
        '''
        if not datastore_name:
            # No datastore name is given, so we'll take the default one
            datastore_name = self.__datastore_path

        if cloud_storage:
            # Connecting data store
            datastore = Datastore(self.__workspace, name=datastore_name)
            try:
                _header = PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS if first_row_header else False
                _aml_dataset = Dataset.Tabular.from_delimited_files(
                    header=_header,
                    path=DataPath(datastore, '/' + partition_name +
                                  '.csv'))  #, set_column_types=columns
                _df = _aml_dataset.to_pandas_dataframe()
            except DatasetValidationError as dsvalex:
                if 'provided path is not valid' in str(dsvalex):
                    return None
                else:
                    raise
        else:
            # Reading data from sub files in a folder
            _folder_path = datastore_name
            _partition_files = glob.glob(_folder_path + '/' + partition_name +
                                         '.csv')
            _record_found = False
            _df = None
            for filename in _partition_files:
                _header = 0 if first_row_header else None
                df = pd.read_csv(filename, index_col=None, header=_header)
                if not _record_found:
                    _df = df
                    _record_found = True
                else:
                    _df = _df.append(df)

            if not _record_found:
                return None

        if columns is not None:
            _df.columns = columns
        return _df
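
# Note: pandas.DataFrame.append (used in the local branch above) was deprecated in pandas 1.4 and
# removed in 2.0. A sketch of the same local-folder concatenation with pd.concat, reusing the
# variable names from the method above:
import glob
import pandas as pd

_partition_files = glob.glob(_folder_path + '/' + partition_name + '.csv')
_header = 0 if first_row_header else None
_frames = [pd.read_csv(f, index_col=None, header=_header) for f in _partition_files]
_df = pd.concat(_frames) if _frames else None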
Example #5
    def _get_datastore_and_path(self, config):
        from azureml.core import Datastore

        output_location = config["OutputLocation"]
        data_path = output_location["DataPath"]
        datastore = Datastore(self._workspace, data_path["DatastoreName"])

        return datastore, data_path["RelativePath"]
Example #6
    def __enter__(self):
        """Download files for datastore.

        :return:
        """
        module_logger.debug("Enter __enter__ function of datastore cmgr")
        from azureml.core import Datastore, Dataset
        for key, value in self._config.items():
            df_config, _ = self._to_data_reference_config(value)
            if self._is_upload(df_config):
                if df_config.path_on_compute:
                    dir_to_create = os.path.normpath(
                        os.path.dirname(df_config.path_on_compute))
                    if dir_to_create:
                        _safe_mkdirs(dir_to_create)
            else:
                target_path = df_config.data_store_name
                if df_config.path_on_compute:
                    target_path = os.path.join(df_config.data_store_name,
                                               df_config.path_on_compute)
                    # The target_path is always set using the data store name with no way
                    # for the user to overwrite this behavior. The user might attempt to use ../ in
                    # the path on compute as a solution but this throws an exception
                    # because the path is not normalized.
                    # Normalizing the path to allow the user to use up-level references.
                    target_path = os.path.normpath(target_path)
                if self._is_download(df_config):
                    self._validate_config(df_config, key)
                    ds = Datastore(workspace=self._workspace,
                                   name=df_config.data_store_name)
                    if self._is_datastore_adlsgen1(ds):
                        _log_and_print(
                            "AzureDataLake Gen1 used as Datastore for download"
                        )
                        if df_config.path_on_data_store is None:
                            df_config.path_on_data_store = ""
                        Dataset.File.from_files(
                            (ds, df_config.path_on_data_store)).download(
                                os.path.join(target_path,
                                             df_config.path_on_data_store),
                                overwrite=df_config.overwrite)
                    else:
                        count = ds.download(
                            target_path=target_path,
                            prefix=df_config.path_on_data_store,
                            overwrite=df_config.overwrite)
                        if count == 0:
                            import warnings
                            warnings.warn(
                                "Downloaded 0 files from datastore {} with path {}."
                                .format(ds.name, df_config.path_on_data_store))
                else:
                    _safe_mkdirs(target_path)

        module_logger.debug("Exit __enter__ function of datastore cmgr")
Example #7
def submit_pipeline(
        workspace=None,  # Auto populated args + object
        pipeline_id=None,
        experiment_name=None,
        pipeline_yaml=None,
        pipeline_params=None,
        datapath_params=None,
        output_file=None,
        # We enforce a logger
        logger=None):
    """
    Submit a pipeline run based on a published pipeline ID
    """

    if pipeline_id is None and pipeline_yaml is None:
        raise UserErrorException("Please specify a pipeline ID or a pipeline YAML file")

    published_pipeline = None
    pipeline = None

    if pipeline_id is not None:
        from azureml.pipeline.core import PublishedPipeline
        published_pipeline = PublishedPipeline.get(workspace, pipeline_id)
        if experiment_name is None or experiment_name == '':
            # Use the pipeline name as the experiment name
            experiment_name = published_pipeline._sanitize_name()

    else:
        from azureml.pipeline.core import Pipeline
        pipeline = Pipeline.load_yaml(workspace, pipeline_yaml)

    if experiment_name is None:
        raise UserErrorException("Please specify an experiment name")

    assigned_params = _parse_key_values(pipeline_params, 'Parameter assignment')

    datapaths = _parse_key_values(datapath_params, 'Datapath assignment')
    for datapath_param_name in datapaths:
        datastore_with_path = datapaths[datapath_param_name]
        if '/' not in datastore_with_path:
            raise UserErrorException("Datapath value %s should have format datastore/path" % datastore_with_path)
        path_tokens = datastore_with_path.split('/', 1)
        from azureml.core import Datastore
        from azureml.data.datapath import DataPath
        datastore = Datastore(workspace, path_tokens[0])
        assigned_params[datapath_param_name] = DataPath(datastore=datastore, path_on_datastore=path_tokens[1])

    dict_output = _pipeline_run_submit(experiment_name, assigned_params, published_pipeline, pipeline,
                                       workspace, output_file, logger)

    return dict_output
Example #8
def main(args):
    # Load workspace
    print("Loading Workspace")
    workspace = Workspace.from_config()
    print(
        f"Workspace name: {workspace.name}", 
        f"Azure region: {workspace.location}", 
        f"Subscription id: {workspace.subscription_id}", 
        f"Resource group: {workspace.resource_group}",
        sep="\n"
    )

    # Printing all datastores
    print("Printing all datastores")
    for name, datastore in workspace.datastores.items():
        print(name, datastore.datastore_type, sep="\t")
    
    # Load datastore
    print("Loading datastore")
    datastore = Datastore(
        workspace=workspace,
        name=args.datastore_name
    )

    # Upload dataset
    print("Uploading dataset")
    datastore.upload_files(
        files=["./train_dataset/iris.csv"],
        target_path="train_dataset",
        overwrite=True,
        show_progress=True
    )

    # Register dataset
    file_dataset = Dataset.File.from_files(
        path=[(datastore, "train_dataset/iris.csv")]
    )
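    # A hedged continuation of the registration step above; the dataset name is a placeholder,
    # not taken from the source.
    file_dataset = file_dataset.register(
        workspace=workspace,
        name="iris_train_dataset",
        create_new_version=True
    )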
Example #9
def register_dataset(path,
                     system,
                     platform,
                     environment,
                     start_date,
                     end_date,
                     secret_scope,
                     datastore_name="dataprep"):
    # TODO: move parameters to Azure Key Vault
    sp_auth = ServicePrincipalAuthentication(
        tenant_id=dbutils.secrets.get(scope=secret_scope, key="tenant_id"),
        service_principal_id=dbutils.secrets.get(scope=secret_scope,
                                                 key="service_principal_id"),
        service_principal_password=dbutils.secrets.get(
            scope=secret_scope, key="service_principal_password"))
    ws = Workspace(subscription_id=parse_arg("--AZUREML_ARM_SUBSCRIPTION"),
                   resource_group=parse_arg("--AZUREML_ARM_RESOURCEGROUP"),
                   workspace_name=parse_arg("--AZUREML_ARM_WORKSPACE_NAME"),
                   auth=sp_auth)
    datastore = Datastore(workspace=ws, name=datastore_name)
    file_dataset = Dataset.File.from_files(
        path=[(datastore, f"{path}/tag_name=*/*.parquet")])
    system_name_clean = system.replace(" ", "")
    file_dataset = file_dataset.register(
        workspace=ws,
        name=f"{system_name_clean}",
        description=f"{system_name_clean} dataset",
        tags={
            "system": system,
            "platform": platform,
            "environment": environment,
            "start_date": start_date,
            "end_date": end_date
        },
        create_new_version=True)
    return file_dataset
Example #10
ejoin = ejoin_module_func(  # assumed opening call; earlier lines of this snippet are missing
    leftcolumns='Survived;Pclass;Name',
    rightcolumns='Sex;Age;SibSp;Parch;Ticket;Fare;Cabin;Embarked',
    leftkeys='PassengerId',
    rightkeys='PassengerId',
    jointype='HashInner'
).set_inputs(
    left_input=input1,
    right_input=input2
)

# Configure inputs
ejoin.inputs.leftinput.configure(mode='mount')
print(ejoin.inputs.leftinput.mode)

# Configure outputs
ejoin.outputs.ejoin_output.configure(output_mode='mount', datastore=Datastore(ws, name="myownblob"))

print(ejoin.outputs.ejoin_output.output_mode)
print(ejoin.outputs.ejoin_output.datastore.name)

eselect = eselect_module_func(
    columns='Survived;Name;Sex;Age',
    input=ejoin.outputs.ejoin_output
)

# pipeline
pipeline = Pipeline(nodes=[ejoin, eselect], outputs=eselect.outputs, default_compute_target='aml-compute')


Example #11
def create_pipeline(workspace):
    # Retreive compute cluster
    compute_target = workspace.compute_targets[args.compute_target]

    # Setup batch scoring environment from conda dependencies
    environment = Environment.from_conda_specification(
        name=args.environment_name, file_path=args.environment_specification
    )

    # Add environment variables
    environment.environment_variables = {
        "APPLICATIONINSIGHTS_CONNECTION_STRING": args.ai_connection_string
    }

    # Enable docker run
    environment.docker.enabled = True

    # Create run config
    run_config = RunConfiguration()
    run_config.environment = environment

    # Retreive input and output datastores
    input_datastore = Datastore(workspace, args.input_datastore_name)
    output_datastore = Datastore(workspace, args.output_datastore_name)

    # Define build id parameter
    build_id_param = PipelineParameter("build_id", default_value=args.build_id)

    # Define input datapath parameter
    input_datapath = DataPath(datastore=input_datastore, path_on_datastore="")
    input_datapath_param = (
        PipelineParameter(name="input_datapath", default_value=input_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define output datapath parameter
    output_datapath = DataPath(datastore=output_datastore, path_on_datastore="")
    output_datapath_param = (
        PipelineParameter(name="output_datapath", default_value=output_datapath),
        DataPathComputeBinding(mode="mount"),
    )

    # Define score step for pipeline
    score_step = PythonScriptStep(
        name="score_data",
        compute_target=compute_target,
        source_directory="src/score",
        script_name="score.py",
        inputs=[input_datapath_param, output_datapath_param],
        runconfig=run_config,
        allow_reuse=False,
        arguments=[
            "--build_id",
            build_id_param,
            "--input_datapath",
            input_datapath_param,
            "--output_datapath",
            output_datapath_param,
        ],
    )

    # Define pipeline for batch scoring
    pipeline = Pipeline(workspace=workspace, steps=[score_step])

    return pipeline
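
# For context, a sketch of how score.py might read the mounted datapath arguments wired up above.
# The actual script is not part of the source; argument names mirror the step definition.
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--build_id", type=str)
parser.add_argument("--input_datapath", type=str)   # resolves to a mounted input folder
parser.add_argument("--output_datapath", type=str)  # resolves to a mounted output folder
args = parser.parse_args()

os.makedirs(args.output_datapath, exist_ok=True)
for name in os.listdir(args.input_datapath):
    print("scoring input:", name)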
Example #12
                     },
                     delimiter=",",
                     header=None)
df_log.columns = [
    'ModelType', 'FileName', 'ModelName', 'StartTime', 'EndTime', 'Duration',
    'Index', 'BatchSize', 'Status'
]
df_log['ModelType'] = df_log['ModelType'].apply(str).str.replace("'", '')
df_log['FileName'] = df_log['FileName'].apply(str).str.replace("'", '')
df_log['ModelName'] = df_log['ModelName'].apply(str).str.replace("'", '')
df_log['StartTime'] = df_log['StartTime'].apply(str).str.replace("'", '')
df_log['EndTime'] = df_log['EndTime'].apply(str).str.replace("'", '')
df_log['Duration'] = df_log['Duration'].apply(str).str.replace("'", '')
df_log['Status'] = df_log['Status'].apply(str).str.replace("'", '')
print(df_log.head())
print('Read and cleaned the log file')

# save the log file
output_path = os.path.join('./logs/', 'training_log')
df_log.to_csv(path_or_buf=output_path + '.csv', index=False)
print('Saved the training_log.csv')

# upload the log file
log_dstore = Datastore(ws, args.datastore)
log_dstore.upload_files(['./logs/training_log' + '.csv'],
                        target_path='training_log_' +
                        str(datetime.datetime.now().date()),
                        overwrite=args.overwrite_logs,
                        show_progress=True)
print('Uploaded the training_log.csv')
Example #13
def run(args):
    with open(args.run_spec_file, "r") as f:
        run_spec = yaml.load(f, Loader=yaml.SafeLoader)

    log = read_log_file()

    ws = get_workspace()
    experiment = Experiment(workspace=ws, name=log['aml_experiment_name'])

    experiments = log['experiments']

    # Check whether an experiment with the same name already exists and abort if it does
    if args.experiment_name in experiments:
        print("Experiment already exists. Please give a different name")
        exit(0)

    submitted_runs = []
    all = True
    if args.j:
        all = False

    source_directory = tempfile.TemporaryDirectory()
    entry_script_file = "entry.py"

    with open(os.path.join(source_directory.name, entry_script_file),
              "w") as f:
        f.write(textwrap.dedent(entry_script_content).strip() + "\n")

    script_params = {}
    environment_variables = {}
    for x in run_spec['volumes']:
        if 'path' in x:
            script_params["--{}".format(x['name'])] = Datastore(
                workspace=ws, name=x['datastore']).path(x['path']).as_mount()
            environment_variables[x['name']] = str(script_params["--{}".format(
                x['name'])])
        if x['name'] == 'OUTPUT_DIR':
            output_dir_datastore = x['datastore']

    if 'environment_variables' in run_spec:
        for x in run_spec['environment_variables']:
            environment_variables[x['name']] = x['value']

    setup_command = ""
    if 'setup' in run_spec:
        for x in run_spec['setup']:
            setup_command += x
            setup_command += '; '

    compute_target = ComputeTarget(workspace=ws, name=run_spec['compute_name'])
    description = run_spec['description']

    rtype = 'run'
    for run in run_spec['runs']:
        if all or run['name'] in args.j:
            script_params["--OUTPUT_DIR"] = Datastore(
                workspace=ws,
                name=output_dir_datastore).path("Experiments/{}/{}/{}".format(
                    log['aml_experiment_name'], args.experiment_name,
                    run['name'])).as_mount()
            environment_variables['OUTPUT_DIR'] = str(
                script_params["--OUTPUT_DIR"])
            command = setup_command + run['command']
            script_params['--command'] = command

            params = {
                'use_gpu': True,
                'custom_docker_image': run_spec['docker_image'],
                'user_managed': True,
                'source_directory': source_directory.name,
                'entry_script': entry_script_file,
                'script_params': script_params,
                'environment_variables': environment_variables,
                'compute_target': compute_target,
            }

            est = Estimator(**params)

            tags = {
                'name': run['name'],
                'experiment_name': args.experiment_name
            }
            submitted_run = experiment.submit(est, tags=tags)
            print("Submitting ", tags['name'], submitted_run.get_portal_url())

            submitted_runs.append([submitted_run.id, run['name']])

    log['experiments'][args.experiment_name] = {
        'type': rtype,
        'ids': submitted_runs,
        'modified': datetime.datetime.now().timestamp(),
        'output_dir_datastore': output_dir_datastore,
        'description': description
    }
    write_log_file(log)

    source_directory.cleanup()
Example #14
def main():
    load_dotenv()
    workspace_name = os.environ.get("BASE_NAME") + "-AML-WS"
    resource_group = os.environ.get("BASE_NAME") + "-AML-RG"
    subscription_id = os.environ.get("SUBSCRIPTION_ID")
    tenant_id = os.environ.get("TENANT_ID")
    app_id = os.environ.get("SP_APP_ID")
    app_secret = os.environ.get("SP_APP_SECRET")
    sources_directory_train = os.environ.get("SOURCES_DIR_TRAIN")
    train_script_path = os.environ.get("TRAIN_SCRIPT_PATH")
    evaluate_script_path = os.environ.get("EVALUATE_SCRIPT_PATH")
    register_script_path = os.environ.get("REGISTER_SCRIPT_PATH")
    vm_size_cpu = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
    compute_name_cpu = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
    model_name = os.environ.get("MODEL_NAME")

    # Get Azure machine learning workspace
    aml_workspace = get_workspace(workspace_name, resource_group,
                                  subscription_id, tenant_id, app_id,
                                  app_secret)
    print(aml_workspace)

    # Get Azure machine learning cluster
    aml_compute_cpu = get_compute(aml_workspace, compute_name_cpu, vm_size_cpu)
    if aml_compute_cpu is not None:
        print(aml_compute_cpu)

    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
        conda_packages=[
            'numpy', 'pandas', 'scikit-learn', 'tensorflow', 'keras'
        ]))
    run_config.environment.docker.enabled = True

    model_name = PipelineParameter(name="model_name", default_value=model_name)
    def_blob_store = Datastore(aml_workspace, "workspaceblobstore")
    jsonconfigs = PipelineData("jsonconfigs", datastore=def_blob_store)
    config_suffix = datetime.datetime.now().strftime("%Y%m%d%H")

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        # inputs=[jsonconfigs],
        outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
        ],
        runconfig=run_config,
        inputs=[jsonconfigs],
        # outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step Evaluate created")

    register_model_step = PythonScriptStep(
        name="Register New Trained Model",
        script_name=register_script_path,
        compute_target=aml_compute_cpu,
        source_directory=sources_directory_train,
        arguments=[
            "--config_suffix",
            config_suffix,
            "--json_config",
            jsonconfigs,
            "--model_name",
            model_name,
        ],
        runconfig=run_config,
        inputs=[jsonconfigs],
        # outputs=[jsonconfigs],
        allow_reuse=False,
    )
    print("Step register model created")

    evaluate_step.run_after(train_step)
    register_model_step.run_after(evaluate_step)
    steps = [register_model_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name="training-pipeline",
        description="Model training/retraining pipeline")

    train_pipeline_json = {}
    train_pipeline_json["rest_endpoint"] = published_pipeline.endpoint
    json_file_path = "ml_service/pipelines/train_pipeline.json"
    with open(json_file_path, "w") as outfile:
        json.dump(train_pipeline_json, outfile)

Example #15
# #### Create a Run Configuration
# 

# In[ ]:


# Create run config
runconfig = RunConfiguration()
runconfig.target = batchai_cluster_name
runconfig.batchai.node_count = 2
runconfig.environment.docker.enabled = True

# Set the datastore config in the runconfig
_default_datastore = Datastore(ws)
data_ref_configs = {}
data_ref = _default_datastore._get_data_reference()
data_ref_configs[data_ref.data_reference_name] = data_ref._to_config()
runconfig.data_references = data_ref_configs


# #### Run an experiment
# 

# In[ ]:


# Set AMLBatchAI as the compute backend
compute_strategy_batchai = AMLBatchAICompute(ws, runconfig)
grid_cv_rf.compute_strategy = compute_strategy_batchai
Example #16
import os
import azureml.core
from azureml.core import Workspace, Datastore
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.core import Experiment, Environment, ScriptRunConfig, RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.contrib.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.core.runconfig import DEFAULT_GPU_IMAGE

ws = Workspace.from_config()

def_blob_store = Datastore(ws, 'workspaceblobstore')
output_data = PipelineData('output_data',
                           datastore=def_blob_store,
                           output_name='output_data',
                           is_directory=True)
batch_input = output_data.as_dataset()

classification_data = PipelineData('classification_data',
                                   datastore=def_blob_store,
                                   output_name='classification_data',
                                   is_directory=True)

compute_target = ws.compute_targets['cpu-cluster']

environment_variables = {
    'POSTGRES_PASSWORD':
    os.environ['POSTGRES_PASSWORD'],
    'POSTGRES_HOSTNAME':
    'ackbar-postgres.postgres.database.azure.com',
Example #17
def main():
    e = Env()
    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group,
    )
    print(f"get_workspace:{aml_workspace}")

    # Get Azure machine learning cluster
    aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size)
    if aml_compute is not None:
        print(f"aml_compute:{aml_compute}")

    # Create a reusable Azure ML environment
    environment = get_environment(
        aml_workspace,
        e.aml_env_name,
        create_new=e.rebuild_env,
        enable_docker=True,
        dockerfile='ml_model/preprocess/Dockerfile'
    )  #
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables["DATASTORE_NAME"] = datastore_name  # NOQA: E501

    datastore = Datastore(aml_workspace, name=datastore_name)
    data_file_path_param = PipelineParameter(name="data_file_path", default_value=e.dataset_name)  # NOQA: E501

    # The version of the input/output dataset can't be determined at pipeline publish time, only run time.  # NOQA: E501
    # Options to store output data:
    # Option 1: Use blob API to write output data. Otherwise, no way to dynamically change the output dataset based on PipelineParameter, # NOQA: E501
    #     The following will not work. It generate a path like "PipelineParameter_Name:data_file_path_Default:gear_images"  # NOQA: E501
    #         output_ds = OutputFileDatasetConfig(destination=(datastore, data_file_path_param))  # NOQA: E501
    #     This option means writing a file locally and upload to the datastore. Fewer dataset, more code.  # NOQA: E501
    # Option 2: Use a dynamic path in OutputFileDatasetConfig, and register a new dataset at completion  # NOQA: E501
    #     Output dataset can be mounted, so more dataset to maintain, less code.   # NOQA: E501
    # Using Option 2 below.
    output_dataset = OutputFileDatasetConfig(
        name=e.processed_dataset_name,
        destination=(datastore, "/dataset/{output-name}/{run-id}")
    ).register_on_complete(
        name=e.processed_dataset_name)

    preprocess_step = PythonScriptStep(
        name="Preprocess Data with OS cmd",
        script_name='preprocess/preprocess_os_cmd_aml.py',
        compute_target=aml_compute,
        source_directory=e.sources_directory_train,
        arguments=[
            "--dataset_name", e.dataset_name,
            "--datastore_name", datastore_name,
            "--data_file_path", data_file_path_param,
            "--output_dataset", output_dataset,
        ],
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Preprocess OS cmd created")

    steps = [preprocess_step]
    preprocess_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    preprocess_pipeline._set_experiment_name
    preprocess_pipeline.validate()
    published_pipeline = preprocess_pipeline.publish(
        name=e.preprocessing_pipeline_name,
        description="Data preprocessing OS cmd pipeline",
        version=e.build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
Example #18
def main():
    train_file = r"EdwardFry_Microsoft_issueDataset.csv"
    ws = Workspace.from_config()

    # Default datastore
    def_data_store = ws.get_default_datastore()  # Loads config.json

    # Get the blob storage associated with the workspace
    def_blob_store = Datastore(ws, "workspaceblobstore")

    # Get file storage associated with the workspace
    def_file_store = Datastore(ws, "workspacefilestore")

    # Set data input and output
    xyz_phishing_dataset = Dataset.File.from_files([(def_blob_store,
                                                     train_file)])
    output_data1 = OutputFileDatasetConfig(
        destination=(def_blob_store, 'outputdataset/{run-id}'))
    output_data_dataset = output_data1.register_on_complete(
        name='prepared_output_data')

    # Set compute
    compute_name = "aml-compute"
    vm_size = "STANDARD_NC6"
    if compute_name in ws.compute_targets:
        compute_target = ws.compute_targets[compute_name]
        if compute_target and type(compute_target) is AmlCompute:
            print('Found compute target: ' + compute_name)
    else:
        print('Creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,  # STANDARD_NC6 is GPU-enabled
            min_nodes=0,
            max_nodes=4)
        # create the compute target
        compute_target = ComputeTarget.create(ws, compute_name,
                                              provisioning_config)

        # Can poll for a minimum number of nodes and for a specific timeout.
        # If no min node count is provided it will use the scale settings for the cluster
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

        # For a more detailed view of current cluster status, use the 'status' property
        print(compute_target.status.serialize())

    aml_run_config = RunConfiguration()
    # `compute_target` as defined in "Azure Machine Learning compute" section above
    aml_run_config.target = compute_target

    USE_CURATED_ENV = True
    if USE_CURATED_ENV:
        curated_environment = Environment.get(workspace=ws,
                                              name="AzureML-Tutorial")
        aml_run_config.environment = curated_environment
    else:
        aml_run_config.environment.python.user_managed_dependencies = False

        # Add some packages relied on by data prep step
        aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
            conda_packages=['pandas', 'scikit-learn'],
            pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'],
            pin_sdk_version=False)

    dataprep_source_dir = "./dataprep_src"
    entry_point = "prepare.py"
    # `my_dataset` as defined above
    ds_input = xyz_phishing_dataset.as_named_input('input1')

    # `output_data1`, `compute_target`, `aml_run_config` as defined above
    data_prep_step = PythonScriptStep(script_name=entry_point,
                                      source_directory=dataprep_source_dir,
                                      arguments=[
                                          "--input",
                                          ds_input.as_download(), "--output",
                                          output_data1
                                      ],
                                      compute_target=compute_target,
                                      runconfig=aml_run_config,
                                      allow_reuse=True)

    train_source_dir = "./train_src"
    train_entry_point = "train.py"

    training_results = OutputFileDatasetConfig(name="training_results",
                                               destination=def_blob_store)

    train_step = PythonScriptStep(script_name=train_entry_point,
                                  source_directory=train_source_dir,
                                  arguments=[
                                      "--prepped_data",
                                      output_data1.as_input(),
                                      "--training_results", training_results
                                  ],
                                  compute_target=compute_target,
                                  runconfig=aml_run_config,
                                  allow_reuse=True)

    # list of steps to run (`compare_step` definition not shown)
    compare_models = [data_prep_step, train_step, compare_step]

    # Build the pipeline
    pipeline1 = Pipeline(workspace=ws, steps=[compare_models])

    #dataset_consuming_step = PythonScriptStep(
    #    script_name="iris_train.py",
    #    inputs=[iris_tabular_dataset.as_named_input("iris_data")],
    #    compute_target=compute_target,
    #    source_directory=project_folder
    #)

    #run_context = Run.get_context()
    #iris_dataset = run_context.input_datasets['iris_data']
    #dataframe = iris_dataset.to_pandas_dataframe()

    ## Within a PythonScriptStep

    #ws = Run.get_context().experiment.workspace

    #step = PythonScriptStep(name="Hello World",
    #                        script_name="hello_world.py",
    #                        compute_target=aml_compute,
    #                        source_directory=source_directory,
    #                        allow_reuse=False,
    #                        hash_paths=['hello_world.ipynb'])

    # Submit the pipeline to be run
    pipeline_run1 = Experiment(ws, 'Compare_Models_Exp').submit(pipeline1)
    pipeline_run1.wait_for_completion()
Example #19
# use regular expressions to determine if the string is a decimal
import re
regex = r'^[+-]{0,1}((\d*\.)|\d*)\d+$'

# get the train size and calculate out the test size
if re.match(regex, train_size) is None:
    raise Exception("Please provide a decimal value as a string")

if Decimal(train_size) >= Decimal("0.85"):
    raise Exception('Training size cannot be equal to or larger than 0.85')

train_size = Decimal(train_size)
test_size = Decimal("1.00") - Decimal(train_size)

# get the datastore and the tabular dataset
datastore = Datastore(ws, datastore_name)
path_on_datastore = os.path.join(folder_name, file_name)

dataset = Dataset.Tabular.from_delimited_files(path=(datastore,
                                                     path_on_datastore))

# convert to pandas to split the data
data = dataset.to_pandas_dataframe()
X = data.drop(columns=label_name)
y = data[label_name]

# split the data using scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=float(test_size),
                                                    random_state=101)
Example #20
import azureml.core
from azureml.core import Workspace
from azureml.core import Datastore
from azureml.core import Experiment
from azureml.core.compute import ComputeTarget
from azureml.core.runconfig import ContainerRegistry
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails

#
# Get the workspace, compute target, and datastore we prepared previously.
#

ws = Workspace.from_config()
ct = ComputeTarget(workspace=ws, name="cpucluster-II")
ds = Datastore(workspace=ws, name="hellotfstore")

#
# Create an estimator.
#

# Single node
est_1 = Estimator(
    compute_target=ct,
    use_gpu=False,
    node_count=1,
    pip_packages=['tensorflow==1.13.1'],
    source_directory="../",
    entry_script="mnist-mlp.py",
    script_params={"--data-dir": ds.path("data/mnist").as_mount()})
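
# Note: the Estimator API is deprecated in later azureml-core releases in favor of ScriptRunConfig.
# A rough equivalent sketch, reusing ws/ct/ds from above (environment setup elided):
from azureml.core import Dataset, ScriptRunConfig

mnist_ds = Dataset.File.from_files(path=[(ds, "data/mnist")])
src = ScriptRunConfig(
    source_directory="../",
    script="mnist-mlp.py",
    arguments=["--data-dir", mnist_ds.as_mount()],
    compute_target=ct,
)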
Example #21
def main():
    # Parse command line arguments
    args = parse_args(sys.argv[1:])

    # Retreive workspace
    workspace = Workspace.get(
        subscription_id=args.subscription_id,
        resource_group=args.resource_group,
        name=args.workspace_name,
    )

    # Retreive compute cluster
    compute_target = workspace.compute_targets[args.compute_target]

    # Get baseline dataset
    baseline_dataset = Dataset.get_by_name(workspace, args.baseline_dataset_name)

    # Get model id and version
    model_name, model_version = args.model_id.split(":")

    # Get AKS Endpoint
    aks_endpoint = AksWebservice(workspace, args.endpoint_name)

    # Make call to endpoint with sample data and wait for the data to arrive in the storage account
    # [Note: this step is required to ensure a data sample is present for validation when
    # registering a new target dataset below - this can take up to 10 mins to appear]
    input_record = (
        baseline_dataset.take(1)
        .to_pandas_dataframe()
        .drop(["cardiovascular_disease", "datetime"], axis=1)
        .to_dict("records")
    )

    input_data = json.dumps({"data": input_record})

    print("Variable [input_data]:", input_data)

    aks_endpoint.run(input_data)
    time.sleep(600)

    # Define target dataset
    target_dataset_name = (
        f"inference-data-{model_name}-{model_version}-{args.endpoint_name}"
    )

    # Get current registered target dataset definition
    current_target_dataset = Dataset.get_by_name(workspace, name=target_dataset_name)
    current_target_dataset_definition = json.loads(current_target_dataset._definition)

    # Get current registered target dataset datasetore definition
    current_target_dataset_datastore_definition = current_target_dataset_definition[
        "blocks"
    ][0]["arguments"]["datastores"][0]

    # Define current registered target dataset datasetore
    target_dataset_datastore = Datastore(
        workspace, current_target_dataset_datastore_definition["datastoreName"]
    )

    # Define current registered target dataset datasetore path
    target_dataset_datastore_path = current_target_dataset_datastore_definition["path"]

    # Create updated target dataset with non-string feature data types
    target_dataset = Dataset.Tabular.from_delimited_files(
        path=(target_dataset_datastore, target_dataset_datastore_path),
        validate=False,
        infer_column_types=False,
        set_column_types={
            "age": DataType.to_float(decimal_mark="."),
            "height": DataType.to_float(decimal_mark="."),
            "weight": DataType.to_float(decimal_mark="."),
            "systolic": DataType.to_float(decimal_mark="."),
            "diastolic": DataType.to_float(decimal_mark="."),
            "gender": DataType.to_string(),
            "cholesterol": DataType.to_string(),
            "glucose": DataType.to_string(),
            "smoker": DataType.to_string(),
            "alcoholic": DataType.to_string(),
            "active": DataType.to_string(),
            "datetime": DataType.to_datetime(),
        },
    )

    # Assign timestamp column for Tabular Dataset to activate time series related APIs
    target_dataset = target_dataset.with_timestamp_columns(
        timestamp="datetime"  # assumes the datetime column typed via set_column_types above
    )

    # Register updated dataset version
    target_dataset.register(
        workspace, name=target_dataset_name, create_new_version=True
    )

    print("Variable [target_dataset]:", target_dataset)
    print("Variable [baseline_dataset]:", baseline_dataset)

    # Define features to monitor
    feature_list = args.feature_list.split(",")

    print("Variable [feature_list]:", args.feature_list)

    # List data drift detectors
    drift_detector_list = DataDriftDetector.list(workspace)

    # Delete existing data drift detector
    for drift_monitor in drift_detector_list:
        if drift_monitor.name == args.data_drift_monitor_name:
            print("Deleteing existing data drift monitor...")
            drift_monitor.delete()

    # Define data drift detector
    monitor = DataDriftDetector.create_from_datasets(
        workspace,
        args.data_drift_monitor_name,
        baseline_dataset,
        target_dataset,
        compute_target=compute_target,
        frequency=args.frequency,
        feature_list=feature_list,
    )

    print("Variable [monitor]:", monitor)

    # Enable the pipeline schedule for the data drift detector
    monitor.enable_schedule()
Example #22
daily = ScheduleRecurrence(frequency = 'Day', interval = 1)

pipeline_schedule = Schedule.create(ws, name = 'Daily training',
                                    description='trains model every day',
                                    pipeline_id = published_pipeline.id,
                                    experiment_name = 'Training_pipeline',
                                    recurrence = daily)


# We can also create a schedule so that the pipeline runs whenever the data changes

from azureml.core import Datastore
from azureml.pipeline.core import Schedule

training_ds = Datastore(workspace = ws, name = "blob_data")

pipeline_schedule = Schedule.create(ws, name = 'Reactive training',
                                    description='trains model on data change',
                                    pipeline_id = published_pipeline_id,
                                    experiment_name = 'training_pipeline',
                                    datastore=training_ds,
                                    path_on_datastore='data/training')
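
# Schedules keep firing until they are explicitly disabled; a short management sketch using the
# same workspace and schedule objects as above:
for schedule in Schedule.list(ws):
    print(schedule.id, schedule.name, schedule.status)

# Disable a schedule before retiring or republishing its pipeline
pipeline_schedule.disable(wait_for_provisioning=True)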







Example #23
def create_experiment_config(workspace):
    ########################################
    ### Creating data prep Pipeline Step ###
    ########################################

    # Load settings
    print("Loading settings")
    data_prep_step_path = os.path.join("steps", "data_prep")
    with open(os.path.join(data_prep_step_path, "step.json")) as f:
        data_prep_settings = json.load(f)

    # Setup datasets of first step
    print("Setting up datasets")
    data_prep_input = Dataset.get_by_name(workspace=workspace,
                                          name=data_prep_settings.get(
                                              "dataset_input_name",
                                              None)).as_named_input(
                                                  data_prep_settings.get(
                                                      "dataset_input_name",
                                                      None)).as_mount()
    data_prep_output = PipelineData(
        name=data_prep_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=data_prep_settings.get(
                                "datastore_output_name",
                                "workspaceblobstore")),
        output_mode="mount").as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #data_prep_output.register(
    #    name=data_prep_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    data_prep_dependencies = CondaDependencies.create(
        pip_packages=data_prep_settings.get("pip_packages", []),
        conda_packages=data_prep_settings.get("conda_packages", []),
        python_version=data_prep_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    data_prep_run_config = RunConfiguration(
        conda_dependencies=data_prep_dependencies,
        framework=data_prep_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    data_prep_compute_target = ComputeTarget(workspace=workspace,
                                             name=data_prep_settings.get(
                                                 "compute_target_name", None))

    # Create python step
    print("Creating Step")
    data_prep = PythonScriptStep(
        name=data_prep_settings.get("step_name", None),
        script_name=data_prep_settings.get("script_name", None),
        arguments=data_prep_settings.get("arguments", []),
        compute_target=data_prep_compute_target,
        runconfig=data_prep_run_config,
        inputs=[data_prep_input],
        outputs=[data_prep_output],
        params=data_prep_settings.get("parameters", []),
        source_directory=data_prep_step_path,
        allow_reuse=data_prep_settings.get("allow_reuse", True),
        version=data_prep_settings.get("version", None),
    )

    ###############################################
    ### Creating data model train Pipeline Step ###
    ###############################################

    # Load settings
    print("Loading settings")
    model_train_step_path = os.path.join("steps", "model_train")
    with open(os.path.join(model_train_step_path, "step.json")) as f:
        model_train_settings = json.load(f)
    hyperparameter_sampling_settings = model_train_settings.get(
        "hyperparameter_sampling", {})

    # Setup datasets of model train step
    print("Setting up datasets")
    model_train_input = data_prep_output.as_named_input(
        name=model_train_settings.get("dataset_input_name", None))
    model_train_output = PipelineData(
        name=model_train_settings.get("dataset_output_name", None),
        datastore=Datastore(workspace=workspace,
                            name=model_train_settings.get(
                                "datastore_output_name", None)),
        output_mode="mount",
    ).as_dataset()
    # Uncomment next lines, if you want to register intermediate dataset
    #model_train_output.register(
    #    name=model_train_settings.get("dataset_output_name", None),
    #    create_new_version=True
    #)

    # Create conda dependencies
    print("Creating conda dependencies")
    model_train_dependencies = CondaDependencies.create(
        pip_packages=model_train_settings.get("pip_packages", []),
        conda_packages=model_train_settings.get("conda_packages", []),
        python_version=model_train_settings.get("python_version", "3.6.2"))

    # Create run configuration
    print("Creating RunConfiguration")
    model_train_run_config = RunConfiguration(
        conda_dependencies=model_train_dependencies,
        framework=model_train_settings.get("framework", "Python"))

    # Loading compute target
    print("Loading ComputeTarget")
    model_train_compute_target = ComputeTarget(workspace=workspace,
                                               name=model_train_settings.get(
                                                   "compute_target_name",
                                                   None))

    # Create distributed training backend
    print("Creating distributed training backend")
    distributed_training_backend = get_distributed_backend(
        backend_name=model_train_settings.get("distributed_backend", None))

    # Create Estimator for Training
    print("Creating Estimator for training")
    model_train_estimator = Estimator(
        source_directory=model_train_step_path,
        entry_script=model_train_settings.get("script_name", None),
        environment_variables=model_train_settings.get("parameters", None),
        compute_target=model_train_compute_target,
        node_count=model_train_settings.get("node_count", None),
        distributed_training=distributed_training_backend,
        conda_packages=model_train_settings.get("conda_packages", None),
        pip_packages=model_train_settings.get("pip_packages", None),
    )

    try:
        # Create parameter sampling
        print("Creating Parameter Sampling")
        parameter_dict = {}
        parameters = hyperparameter_sampling_settings.get(
            "parameters",
            {}) if "parameters" in hyperparameter_sampling_settings else {}
        for parameter_name, parameter_details in parameters.items():
            parameter_distr = get_parameter_distribution(
                distribution=parameter_details.get("distribution", None),
                **parameter_details.get("settings", {}))
            parameter_dict[f"--{parameter_name}"] = parameter_distr
        model_train_ps = get_parameter_sampling(
            sampling_method=hyperparameter_sampling_settings.get(
                "method", None),
            parameter_dict=parameter_dict)

        # Get Policy definition
        policy_settings = hyperparameter_sampling_settings.get("policy", {})
        kwargs = {
            key: value
            for key, value in policy_settings.items() if key not in
            ["policy_method", "evaluation_interval", "delay_evaluation"]
        }

        # Create termination policy
        print("Creating early termination policy")
        model_train_policy = get_policy(
            policy_method=policy_settings.get("method", ""),
            evaluation_interval=policy_settings.get("evaluation_interval",
                                                    None),
            delay_evaluation=policy_settings.get("delay_evaluation", None),
            **kwargs)

        # Create HyperDriveConfig
        print("Creating HyperDriveConfig")
        model_train_hyperdrive_config = HyperDriveConfig(
            estimator=model_train_estimator,
            hyperparameter_sampling=model_train_ps,
            policy=model_train_policy,
            primary_metric_name=hyperparameter_sampling_settings.get(
                "primary_metric", None),
            primary_metric_goal=PrimaryMetricGoal.MINIMIZE
            if "min" in hyperparameter_sampling_settings.get(
                "primary_metric_goal", None) else PrimaryMetricGoal.MAXIMIZE,
            max_total_runs=hyperparameter_sampling_settings.get(
                "max_total_runs", 1),
            max_concurrent_runs=hyperparameter_sampling_settings.get(
                "max_concurrent_runs", 1),
            max_duration_minutes=hyperparameter_sampling_settings.get(
                "max_duration_minutes", None))

        # Create HyperDriveStep
        print("Creating HyperDriveStep")
        model_train = HyperDriveStep(
            name=model_train_settings.get("step_name", None),
            hyperdrive_config=model_train_hyperdrive_config,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))
    except:
        print("Not all required parameters specified for HyperDrive step")

        # Create EstimatorStep
        print("Creating EstimatorStep")
        model_train = EstimatorStep(
            name=model_train_settings.get("step_name", None),
            estimator=model_train_estimator,
            estimator_entry_script_arguments=model_train_settings.get(
                "arguments", None),
            inputs=[model_train_input],
            outputs=[model_train_output],
            compute_target=model_train_compute_target,
            allow_reuse=model_train_settings.get("allow_reuse", True),
            version=model_train_settings.get("version", True))

    #########################
    ### Creating Pipeline ###
    #########################

    # Create Pipeline
    print("Creating Pipeline")
    pipeline = Pipeline(
        workspace=workspace,
        steps=[model_train],
        description="Training Pipeline",
    )

    # Validate pipeline
    print("Validating pipeline")
    pipeline.validate()

    return pipeline
Example #24
from azureml.core import Workspace, Datastore, Dataset

ws = Workspace.from_config()

ds = Datastore(ws, "mydatastore")

# Creating a Dataset

# Creating a data path; we can pass multiple data paths like this
dataset_path = [(ds, "loan.csv")]
loan_dataset = Dataset.Tabular.from_delimited_files(path = dataset_path)
dataset = loan_dataset.register(workspace=ws, name = "Loan Application")


# List all registered datasets
for name in ws.datasets.keys():
    print(name)


# Get a registered dataset (note: this returns a TabularDataset, not a pandas DataFrame)
loan_ds = Dataset.get_by_name(ws, "Loan Application")
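
# A minimal sketch (not in the original snippet) of materializing the registered
# TabularDataset as a pandas DataFrame for local exploration:
loan_df = loan_ds.to_pandas_dataframe()
print(loan_df.head())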



Пример #25
0
    # Note: source_directory and entry_script are local paths; the entry script lives at source_directory/entry_script
    source_directory = "./"
    # print(sys.argv[1])
    # entry_script = sys.argv[1]
    entry_script = 'run.py'
    # entry_script = "./entry-script.py"

    # subscription_id = config.subscription_id
    # resource_group = config.resource_group
    # workspace_name = config.workspace_name
    ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)

    # cluster_name= config.cluster_name
    ct = ComputeTarget(workspace=ws, name=cluster_name)
    # datastore_name =config.datastore_name
    ds = Datastore(workspace=ws, name=datastore_name)

    workdir = os.path.realpath('.')[os.path.realpath('.').find('FixMatch-pytorch'):]
    workdir = workdir.replace('\\', '/')

    script_params = {
        "--workdir": ds.path('/projects/'+workdir).as_mount(), # REQUIRED !!!
        "--cxk_volna": ds.path('/').as_mount(),
        "--exp_name": workdir.split('/')[-1],
    }

    def make_container_registry(address, username, password):
        cr = ContainerRegistry()
        cr.address = address
        cr.username = username
        cr.password = password
        return cr
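
    # A minimal sketch (not from the original script) of wiring the pieces above
    # into a PyTorch estimator and submitting it; the docker image, registry
    # credentials and experiment name below are placeholders/assumptions.
    from azureml.core import Experiment
    from azureml.train.dnn import PyTorch

    registry = make_container_registry(address="<registry>.azurecr.io",
                                       username="<username>",
                                       password="<password>")

    estimator = PyTorch(source_directory=source_directory,
                        entry_script=entry_script,
                        script_params=script_params,
                        compute_target=ct,
                        use_gpu=True,
                        custom_docker_image="<repository>/<image>:<tag>",
                        image_registry_details=registry)

    run = Experiment(workspace=ws, name="fixmatch").submit(estimator)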
def load(quick_run, data_path, cache_path, model_name, num_gpus, random_seed):

	# Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.
	QUICK_RUN = quick_run

	# Wikigold dataset
	DATA_URL = (
		"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
		"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt"
	)

	# fraction of the dataset used for testing
	TEST_DATA_FRACTION = 0.3

	# sub-sampling ratio
	SAMPLE_RATIO = 1

	# the data path used to save the downloaded data file
	DATA_PATH = data_path

	# the cache data path used during fine-tuning
	CACHE_DIR = cache_path

	if not os.path.exists(os.path.dirname(DATA_PATH)):
		os.mkdir(os.path.dirname(DATA_PATH))
	if not os.path.exists(DATA_PATH):
		os.mkdir(DATA_PATH)
	if not os.path.exists(CACHE_DIR):
		os.mkdir(CACHE_DIR)

	# set random seeds
	RANDOM_SEED = random_seed
	torch.manual_seed(RANDOM_SEED)


	MODEL_NAME = model_name
	# MODEL_NAME = "distilbert"
	DO_LOWER_CASE = False
	MAX_SEQ_LENGTH = 200
	TRAILING_PIECE_TAG = "X"
	NUM_GPUS = num_gpus
	BATCH_SIZE = 16


	# update variables for quick run option
	if QUICK_RUN:
		SAMPLE_RATIO = 0.1
		NUM_TRAIN_EPOCHS = 1


	# download data
	file_name = DATA_URL.split("/")[-1]  # a name for the downloaded file
	maybe_download(DATA_URL, file_name, DATA_PATH)
	data_file = os.path.join(DATA_PATH, file_name)

	# parse CoNll file
	sentence_list, labels_list = read_conll_file(data_file, sep=" ", encoding='utf-8')

	# sub-sample (optional)
	random.seed(RANDOM_SEED)
	sample_size = int(SAMPLE_RATIO * len(sentence_list))
	sentence_list, labels_list = list(
		zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))
	)

	# train-test split
	train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(
		sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED
	)

	processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR)


	label_map = TokenClassificationProcessor.create_label_map(
		label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG
	)

	train_dataset = processor.preprocess(
		text=train_sentence_list,
		max_len=MAX_SEQ_LENGTH,
		labels=train_labels_list,
		label_map=label_map,
		trailing_piece_tag=TRAILING_PIECE_TAG,
	)

	# train_data_loader = DataLoader(train_dataset)
	test_dataset = processor.preprocess(
		text=test_sentence_list,
		max_len=MAX_SEQ_LENGTH,
		labels=test_labels_list,
		label_map=label_map,
		trailing_piece_tag=TRAILING_PIECE_TAG,
	)

	torch.save(train_dataset, os.path.join(DATA_PATH, 'train.pt'))
	torch.save(test_dataset, os.path.join(DATA_PATH, 'test.pt'))
	torch.save(label_map, os.path.join(DATA_PATH, 'label_map.pt'))

	# Default datastore (NOTE: `ws` is not defined in this excerpt; an AzureML
	# Workspace handle is assumed here, loaded from the local config)
	from azureml.core import Workspace
	ws = Workspace.from_config()
	def_data_store = ws.get_default_datastore()

	# Get the blob storage associated with the workspace
	def_blob_store = Datastore(ws, "workspaceblobstore")

	# Get the file storage associated with the workspace
	def_file_store = Datastore(ws, "workspacefilestore")

	try:
		def_blob_store.upload_files(
			[os.path.join(DATA_PATH, 'train.pt')],
			target_path="nerdata", overwrite=True, show_progress=True)
	except Exception as e:
		print(f"Failed to upload -> {e}")

	try:
		def_blob_store.upload_files(
			[os.path.join(DATA_PATH, 'test.pt')],
			target_path="nerdata", overwrite=True, show_progress=True)
	except Exception as e:
		print(f"Failed to upload -> {e}")

	try:
		def_blob_store.upload_files(
			[os.path.join(DATA_PATH, 'label_map.pt')],
			target_path="nerdata", overwrite=True, show_progress=True)
	except Exception as e:
		print(f"Failed to upload -> {e}")

	train_datastore_paths = [(def_blob_store, 'nerdata/train.pt')]
	test_datastore_paths = [(def_blob_store, 'nerdata/test.pt')]
	label_map_datastore_paths = [(def_blob_store, 'nerdata/label_map.pt')]

	# def_blob_store.upload(src_dir=DATA_PATH, target_path="nerdata", overwrite=True, show_progress=True)

	train_ds = Dataset.File.from_files(path=train_datastore_paths)
	test_ds = Dataset.File.from_files(path=test_datastore_paths)
	label_map_ds = Dataset.File.from_files(path=label_map_datastore_paths)

	train_ds = train_ds.register(workspace=ws,
                                  name='ner_bert_train_ds',
                                  description='Named Entity Recognition with BERT (Training set)',
                                  create_new_version=False)

	test_ds = test_ds.register(workspace=ws,
                                  name='ner_bert_test_ds',
                                  description='Named Entity Recognition with BERT (Testing set)',
                                  create_new_version=False)

	label_map_ds = label_map_ds.register(workspace=ws,
                                  name='ner_bert_label_map_ds',
                                  description='Named Entity Recognition with BERT (Label map)',
                                  create_new_version=False)

	train_dataloader = dataloader_from_dataset(
		train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False
	)

	test_dataloader = dataloader_from_dataset(
		test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False
	)

	return (train_dataloader, test_dataloader, label_map)
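
# A minimal sketch (not part of the original function) of calling load(); the
# paths, model name and seed below are illustrative assumptions.
train_dataloader, test_dataloader, label_map = load(
    quick_run=True,
    data_path="./ner_data",
    cache_path="./ner_cache",
    model_name="bert-base-cased",
    num_gpus=1,
    random_seed=42,
)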
Пример #27
0
parser = argparse.ArgumentParser("Pipeline")
parser.add_argument(
    "--pipeline_action",
    type=str,
    choices=["pipeline-test", "publish"],
    help="Determines if pipeline needs to run on small data set \
                                        or pipeline needs to be republished",
    #default="pipeline-test",
)

args = parser.parse_args()

# Get workspace
ws = Workspace.from_config(path="aml_config/config.json", auth=cli_auth)
def_blob_store = Datastore(ws, "workspaceblobstore")

# Get AML Compute name and Experiment Name
with open("aml_config/security_config.json") as f:
    config = json.load(f)

experiment_name = config["experiment_name"]
aml_cluster_name = config["aml_cluster_name"]
aml_pipeline_name = "training-pipeline"

source_directory = "code"

# Run Config
# Declare packages dependencies required in the pipeline (these can also be expressed as a YML file)
# cd = CondaDependencies.create(pip_packages=["azureml-defaults", 'tensorflow==1.8.0'])
cd = CondaDependencies("aml_config/conda_dependencies.yml")
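
# A minimal sketch (an addition, not from the original snippet) of wrapping these
# dependencies in a RunConfiguration so the pipeline steps can reuse them:
from azureml.core.runconfig import RunConfiguration

run_config = RunConfiguration(conda_dependencies=cd)
run_config.environment.docker.enabled = True  # run steps inside a Docker container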
Пример #28
0
# In[ ]:


# Module
select_columns_in_dataset = Module.load(ws, namespace='azureml', name='Select Columns in Dataset')
clean_missing_data = Module.load(ws, namespace='azureml', name='Clean Missing Data')
split_data = Module.load(ws, namespace='azureml', name='Split Data')
join_data = Module.load(ws, namespace='azureml', name='Join Data')


# Dataset
try:
    dset = Dataset.get_by_name(ws, 'Automobile_price_data_(Raw)')
except Exception:
    global_datastore = Datastore(ws, name="azureml_globaldatasets")
    dset = Dataset.File.from_files(global_datastore.path('GenericCSV/Automobile_price_data_(Raw)'))
    dset.register(workspace=ws,
                  name='Automobile_price_data_(Raw)',
                  create_new_version=True)
blob_input_data = dset


# In[ ]:


# sub pipeline: TODO improve this experience
@dsl.pipeline(name='sub sub', description='sub')
def sub_sub_pipeline(minimum_missing_value_ratio):
    module1 = select_columns_in_dataset(
        dataset=blob_input_data,
Пример #29
0
    def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
        current_run = Run.get_submitted_run()
        parent_run_id = _get_parent_run_id(current_run._run_id)
        print("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        logger.info("[ParentRunId:{}]: Start getting data using dataprep.".format(parent_run_id))
        try:
            import azureml.train.automl._dataprep_utilities as dataprep_utilities
        except Exception as e:
            e.error_type = ErrorTypes.Unclassified
            log_traceback(e, logger)
            logger.error(e)
            raise e

        fit_iteration_parameters_dict = dict()

        class RetrieveNumpyArrayError(Exception):
            def __init__(self):
                super().__init__()

        try:
            print("Resolving Dataflows...")
            logger.info("Resolving Dataflows...")
            dataprep_json_obj = json.loads(dataprep_json)
            if 'activities' in dataprep_json_obj: # json is serialized dataflows
                dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                    dataprep_json)
                for k in ['X', 'X_valid', 'sample_weight', 'sample_weight_valid']:
                    fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_pandas_dataframe(dataflow_dict.get(k))
                for k in ['y', 'y_valid']:
                    try:
                        fit_iteration_parameters_dict[k] = dataprep_utilities.try_retrieve_numpy_array(dataflow_dict.get(k))
                    except IndexError:
                        raise RetrieveNumpyArrayError()

                cv_splits_dataflows = []
                i = 0
                while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                    cv_splits_dataflows.append(
                        dataflow_dict['cv_splits_indices_{0}'.format(i)])
                    i = i + 1
                fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                    else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
            else: # json is dataprep options
                print('Creating Dataflow from options...\r\nOptions:')
                logger.info('Creating Dataflow from options...')
                print(dataprep_json_obj)
                datastore_name = dataprep_json_obj['datastoreName'] # mandatory
                data_path = dataprep_json_obj['dataPath'] # mandatory
                label_column = dataprep_json_obj['label'] # mandatory
                separator = dataprep_json_obj.get('columnSeparator', ',')
                header = dataprep_json_obj.get('promoteHeader', True)
                encoding = dataprep_json_obj.get('encoding', None)
                quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
                skip_rows = dataprep_json_obj.get('skipRows', 0)
                feature_columns = dataprep_json_obj.get('features', [])

                from azureml.core import Datastore
                import azureml.dataprep as dprep
                if header:
                    header = dprep.PromoteHeadersMode.CONSTANTGROUPED
                else:
                    header = dprep.PromoteHeadersMode.NONE
                try:
                    encoding = dprep.FileEncoding[encoding]
                except Exception:
                    encoding = dprep.FileEncoding.UTF8

                ws = Run.get_context().experiment.workspace
                datastore = Datastore(ws, datastore_name)
                dflow = dprep.read_csv(path=datastore.path(data_path),
                                        separator=separator,
                                        header=header,
                                        encoding=encoding,
                                        quoting=quoting,
                                        skip_rows=skip_rows)

                if len(feature_columns) == 0:
                    X = dflow.drop_columns(label_column)
                else:
                    X = dflow.keep_columns(feature_columns)

                print('Inferring types for feature columns...')
                logger.info('Inferring types for feature columns...')
                sct = X.builders.set_column_types()
                sct.learn()
                sct.ambiguous_date_conversions_drop()
                X = sct.to_dataflow()

                y = dflow.keep_columns(label_column)
                if automl_settings_obj.task_type.lower() == 'regression':
                    y = y.to_number(label_column)

                print('X:')
                print(X)
                logger.info('X:')
                logger.info(X)

                print('y:')
                print(y)
                logger.info('y:')
                logger.info(y)

                try:
                    from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                    _X = try_retrieve_pandas_dataframe_adb(X)
                    fit_iteration_parameters_dict['X'] = _X.values
                    fit_iteration_parameters_dict['x_raw_column_names'] = _X.columns.values
                except ImportError:
                    logger.info("SDK version does not support column names extraction, fallback to old path")
                    fit_iteration_parameters_dict['X'] = dataprep_utilities.try_retrieve_pandas_dataframe(X)

                try:
                    fit_iteration_parameters_dict['y'] = dataprep_utilities.try_retrieve_numpy_array(y)
                except IndexError:
                    raise RetrieveNumpyArrayError()

            logger.info("Finish getting data using dataprep.")
            return fit_iteration_parameters_dict
        except Exception as e:
            print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            logger.error("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".format(parent_run_id, e.__class__, e))
            if isinstance(e, RetrieveNumpyArrayError):
                logger.debug("Label column (y) does not exist in user's data.")
                e.error_type = ErrorTypes.User
            elif "The provided path is not valid." in str(e):
                logger.debug("User's data is not accessible from remote run.")
                e.error_type = ErrorTypes.User
            elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(e):
                logger.debug("User should use Datastore to data that requires secrets.")
                e.error_type = ErrorTypes.User
            else:
                e.error_type = ErrorTypes.Client
            log_traceback(e, logger)
            raise RuntimeError("Error while extracting Dataflows") from e

# In[121]:


from azureml.core import Workspace, Datastore


# In[122]:


# Default datastore 
def_data_store = ws.get_default_datastore()

# Get the blob storage associated with the workspace
def_blob_store = Datastore(ws, "workspaceblobstore")

# Get file storage associated with the workspace
def_file_store = Datastore(ws, "workspacefilestore")


# In[123]:


def_blob_store.upload_files(
    ["Downloads/005930.KS.csv"],
    target_path="xyz",
    overwrite=True)
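
# A minimal sketch (an addition, mirroring the earlier dataset examples) of
# registering the uploaded CSV as a TabularDataset; the dataset name is an assumption.
from azureml.core import Dataset

stock_dataset = Dataset.Tabular.from_delimited_files(
    path=[(def_blob_store, "xyz/005930.KS.csv")])
stock_dataset = stock_dataset.register(workspace=ws,
                                       name="samsung_stock_prices",
                                       create_new_version=True)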


# In[ ]: