Example No. 1
def upload_files_to_azure(cfg, ws):
    '''Look in the cfg object for the directories and files to upload to Azure File Storage (AFS) and Azure Blob Storage (ABS).
    Input params:
    cfg : configuration object whose DataReference section lists local directories, target datastores, and upload flags
    ws  : Azure ML workspace object (azureml.core.workspace.Workspace)
    '''
    for ref in cfg.DataReference.localDirectoryBlobList:
        uploadContentBeforeRun = ref.uploadContentBeforeRun
        if uploadContentBeforeRun:
            overwriteOnUpload = ref.overwriteOnUpload
            remoteBlobContainer = ref.remoteBlobContainer
            localDirectoryName  = ref.localDirectoryName
            remoteMountPath = ref.remoteMountPath
            ds = Datastore(workspace = ws, name = remoteBlobContainer)
            ds.upload(src_dir=localDirectoryName, target_path=remoteMountPath, overwrite=overwriteOnUpload, show_progress=True)

    for ref in cfg.DataReference.localDirectoryFilesList:
        uploadContentBeforeRun = ref.uploadContentBeforeRun
        if uploadContentBeforeRun:
            overwriteOnUpload = ref.overwriteOnUpload
            remoteFileShare = ref.remoteFileShare
            localDirectoryName = ref.localDirectoryName
            remoteMountPath = ref.remoteMountPath
            ds = Datastore(workspace = ws, name = remoteFileShare)
            ds.upload(src_dir = localDirectoryName, target_path=remoteMountPath, overwrite=overwriteOnUpload, show_progress=True)
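
A minimal invocation sketch for upload_files_to_azure, assuming a hypothetical cfg built from types.SimpleNamespace whose attribute names mirror the fields the function reads; in a real project the cfg object comes from the configuration file.

from types import SimpleNamespace
from azureml.core import Workspace

# Hypothetical data-reference entry; values are placeholders, not the project's real config.
blob_ref = SimpleNamespace(uploadContentBeforeRun=True,
                           overwriteOnUpload=True,
                           remoteBlobContainer="myblobdatastore",   # name of a registered blob datastore (assumed)
                           localDirectoryName="./data",
                           remoteMountPath="training/raw")
cfg = SimpleNamespace(DataReference=SimpleNamespace(localDirectoryBlobList=[blob_ref],
                                                    localDirectoryFilesList=[]))

ws = Workspace.from_config()   # assumes a local config.json for the workspace
upload_files_to_azure(cfg, ws)
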
Example No. 2
def create_and_attach_file_storage(cfg, ws):
    """ If required, creates the file shares in the data references of cfg and registers them as AML file-share datastores """
    if len(cfg.DataReference.localDirectoryFilesList) > 0:
        for ref in cfg.DataReference.localDirectoryFilesList:
            log.info("Attempting to create file share '%s' on storage account '%s'.", ref.remoteFileShare, ref.storageAccountName)
            file_service = FileService(ref.storageAccountName, ref.storageAccountKey)
            created = file_service.create_share(ref.remoteFileShare, fail_on_exist=False)
            if created:
                log.info("File Share '%s' on storage account '%s' created.", ref.remoteFileShare, ref.storageAccountName)
            else:
                log.info("File Share '%s' on storage account '%s' already existed.", ref.remoteFileShare, ref.storageAccountName)
            # Get the most recent list of datastores linked to the current workspace
            datastores = ws.datastores
            # Check whether a datastore with this id is already registered
            ds = None if ref.dataref_id not in datastores else Datastore(workspace=ws, name=ref.dataref_id)
            # Re-register only if it is missing or points at a different account/share
            if ds:
                if ds.account_name == ref.storageAccountName and ds.container_name == ref.remoteFileShare:
                    recreate = False
                else:
                    recreate = True
                    # also remove the existing reference
                    ds.unregister()
            else:
                recreate = True
            if recreate:
                log.info('Registering file share "{}" to AML datastore for AML workspace "{}" under datastore id "{}".'.format(ref.remoteFileShare, ws.name, ref.dataref_id))
                ds = Datastore.register_azure_file_share(workspace = ws,
                                                    datastore_name = ref.dataref_id, 
                                                    file_share_name = ref.remoteFileShare, 
                                                    account_name = ref.storageAccountName, 
                                                    account_key= ref.storageAccountKey,
                                                    overwrite=True,
                                                    )
            else:
                log.info('File share "{}" under AML workspace "{}" already registered under datastore id "{}".'.format(ref.remoteFileShare, ws.name, ref.dataref_id))
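
Once the file share is registered, later code can fetch the datastore by its id and mount a folder from it on a compute target; a small sketch, with "my_file_ds" and the folder path standing in for the values from cfg.

from azureml.core import Datastore

# ws is an azureml.core.Workspace handle, as in the function above.
file_ds = Datastore.get(ws, datastore_name="my_file_ds")      # placeholder for ref.dataref_id
mounted_input = file_ds.path("training/raw").as_mount()       # DataReference usable as a run input
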
Example No. 3
def create_dataset(ws):  
    kaggle_api.dataset_download_file('divg07/malware-analysis-dataset','data.csv')

    data = pd.read_csv(
            './data.csv.zip',
            compression='zip',
            sep='|'
        )

    # Clean dataset 
    data = clean_data(data)

    # Register Dataset in Workspace; Datastore(ws) with no name resolves to the workspace's default datastore
    datastore = Datastore(ws)
    name = "Malware Dataset"
    description_text = "Malware DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                               datastore,
                               name,
                               description=description_text)
    
    return dataset
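
After registration, the dataset can be pulled back by name from any script that has a handle to the same workspace; a short sketch.

from azureml.core import Dataset

# ws is the same azureml.core.Workspace used for registration.
malware_ds = Dataset.get_by_name(ws, name="Malware Dataset")
df = malware_ds.to_pandas_dataframe()
print(df.shape)
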
Example No. 4
def create_and_attach_blob_storage(cfg, ws):
    """ If required, creates the blob storage containers in the datareferences of cfg """
    if len(cfg.DataReference.localDirectoryBlobList) > 0:
        for ref in cfg.DataReference.localDirectoryBlobList:
            log.info("Attempting to create Blob Container '%s' on storage account '%s'.", ref.remoteBlobContainer, ref.storageAccountName)
            blob_service = BlockBlobService(ref.storageAccountName, ref.storageAccountKey)
            created = blob_service.create_container(ref.remoteBlobContainer, fail_on_exist=False)
            if created:
                log.info("Blob Container '%s' on storage account '%s' created.", ref.remoteBlobContainer, ref.storageAccountName)
            else:
                log.info("Blob Container '%s' on storage account '%s' already existed.", ref.remoteBlobContainer, ref.storageAccountName)
            # Get the most recent list of datastores linked to the current workspace
            datastores = ws.datastores
            # Check whether a datastore with this id is already registered
            ds = None if ref.dataref_id not in datastores else Datastore(workspace=ws, name=ref.dataref_id)
            # If DS exists and isn't mapped to the right place
            if ds:
                if ds.account_name == ref.storageAccountName and ds.container_name == ref.remoteBlobContainer:
                    recreate = False
                else:
                    recreate = True
                    # also remove the existing reference
                    ds.unregister()
            else:
                recreate = True
            if recreate:
                log.info('Registering blob "{}" to AML datastore for AML workspace "{}" under datastore id "{}".'.format(ref.remoteBlobContainer, ws.name, ref.dataref_id))
                ds = Datastore.register_azure_blob_container(workspace = ws,
                                                    datastore_name = ref.dataref_id, 
                                                    container_name = ref.remoteBlobContainer, 
                                                    account_name = ref.storageAccountName, 
                                                    account_key = ref.storageAccountKey,
                                                    overwrite = True,  # Overwrites the datastore (not the data itself, the object) if it already is part of this workspace
                                                    )
            else:
                log.info('Blob "{}" under AML workspace "{}" already registered under datastore id "{}".'.format(ref.remoteBlobContainer, ws.name, ref.dataref_id))
Example No. 5
def main():
    """
    Builds the Azure ML pipeline for data engineering and model training.
    """
    databricks_workspace_name = os.environ['DATABRICKS_WORKSPACE_NAME']
    training_data_account_name = os.environ['TRAINING_DATA_ACCOUNT_NAME']
    build_id = os.getenv('BUILD_BUILDID', '0')

    # Get Azure machine learning workspace
    aml_workspace = Workspace.get(
        name=os.environ['AML_WORKSPACE_NAME'],
        subscription_id=os.environ['SUBSCRIPTION_ID'],
        resource_group=os.environ['RESOURCE_GROUP'],
    )
    print(aml_workspace)

    # Generate Databricks credentials, see https://aka.ms/databricks-aad
    dbricks_region = aml_workspace.location
    dbricks_api = f"https://{dbricks_region}.azuredatabricks.net/api/2.0"

    dbricks_client = databricks_client.create(dbricks_api)
    dbricks_client.auth_azuread(resource_group=aml_workspace.resource_group,
                                workspace_name=databricks_workspace_name)
    dbricks_client.ensure_available()

    # Attach Databricks as Azure ML training compute
    dbricks_compute_name = "databricks"
    dbricks_compute = get_databricks_compute(
        aml_workspace,
        dbricks_compute_name,
    )
    if dbricks_compute is None:
        pat_token = dbricks_client.post(
            'token/create',
            json={"comment": "Azure ML Token generated by Build " + build_id},
        )['token_value']
        dbricks_compute = create_databricks_compute(
            aml_workspace,
            databricks_workspace_name,
            dbricks_compute_name,
            pat_token,
        )

    print("dbricks_compute:")
    print(dbricks_compute)

    # Create Databricks instance pool
    pool_name = "azureml_training"
    instance_pool_id = get_instance_pool(dbricks_client, pool_name)
    if not instance_pool_id:
        dbricks_client.post('instance-pools/create',
                            json={
                                "instance_pool_name": pool_name,
                                "node_type_id": "Standard_D3_v2",
                                "idle_instance_autotermination_minutes": 10,
                                "preloaded_spark_versions": [DATABRICKS_RUNTIME_VERSION],
                            })
        instance_pool_id = get_instance_pool(dbricks_client, pool_name)

    notebook_folder = "/Shared/AzureMLDeployed"
    workspace_datastore = Datastore(aml_workspace, "workspaceblobstore")

    # Create a datastore for the training data container
    credentials, subscription = get_azure_cli_credentials()
    storage_client = StorageManagementClient(credentials, subscription)
    training_storage_keys = storage_client.storage_accounts.list_keys(
        aml_workspace.resource_group, training_data_account_name)
    training_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name="trainingdata",
        container_name="trainingdata",
        account_name=training_data_account_name,
        account_key=training_storage_keys.keys[0].value,
    )

    # FEATURE ENGINEERING STEP (DATABRICKS)
    # Create feature engineering pipeline step

    training_data_input = DataReference(datastore=training_datastore,
                                        path_on_datastore="/",
                                        data_reference_name="training")

    feature_eng_output = PipelineData("feature_engineered",
                                      datastore=workspace_datastore)

    notebook_path = upload_notebook(dbricks_client, notebook_folder,
                                    "code/prepare", "feature_engineering")

    training_dataprep_step = DatabricksStep(
        name="FeatureEngineering",
        inputs=[training_data_input],
        outputs=[feature_eng_output],
        spark_version=DATABRICKS_RUNTIME_VERSION,
        instance_pool_id=instance_pool_id,
        num_workers=3,
        notebook_path=notebook_path,
        run_name="FeatureEngineering",
        compute_target=dbricks_compute,
        allow_reuse=True,
    )

    # You can add Azure ML model training tasks using
    #   feature_eng_output as input.
    # ...

    # Create Azure ML Pipeline
    steps = [training_dataprep_step]

    ml_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    ml_pipeline.validate()
    published_pipeline = ml_pipeline.publish(
        name="Feature Engineering",
        description="Feature engineering pipeline",
        version=build_id,
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")

    # When running in Azure DevOps, set AMLPIPELINE_ID variable
    # for AML Pipeline task in next job
    print("Setting Azure DevOps variable")
    print(f"##vso[task.setvariable variable=AMLPIPELINE_ID;isOutput=true]"
          f"{published_pipeline.id}")
Example No. 6
from azureml.core import Workspace
from azureml.core.datastore import Datastore
from azureml.pipeline.core import Schedule

# The original snippet passed the workspace name "Playground" as a string; the
# Datastore and Schedule APIs need a Workspace object, so Workspace.from_config()
# is used here as a stand-in for however that workspace is actually obtained.
workspace = Workspace.from_config()
datastore = Datastore(workspace=workspace, name="workspaceblobstore")

schedule = Schedule.create(workspace, name="TestSchedule",
                           pipeline_id="3100e87c-3300-400b-a5a5-470e85a100b3",
                           experiment_name="working version", datastore=datastore,
                           polling_interval=25, path_on_datastore="file/path")
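
Schedules keep firing until they are explicitly disabled, so the management calls are worth keeping at hand; a short sketch.

# List existing schedules for the workspace and disable the one created above.
for s in Schedule.list(workspace):
    print(s.id, s.name, s.status)

schedule.disable(wait_for_provisioning=True)
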
Example No. 7
def InitAML(model_name, env, svcpw, interactive=False, create_ws=False):

    print("Environment is  ", env)

    configFilePath = "./environment_setup/Config/config_" + env + ".ini"
    configFile = ConfigParser()
    configFile.read(configFilePath)

    svc_pr_pd = svcpw
    tenant_id = configFile.get('PARAMS', 'tenant_id')
    service_principal_id = configFile.get('PARAMS', 'service_principal_id')
    subscription_id = configFile.get('PARAMS', 'subscription_id')
    resource_group = configFile.get('PARAMS', 'resource_group')
    blobname = configFile.get('PARAMS', 'BlobName')
    workspace_name = configFile.get('PARAMS', 'WorkSpace')
    data_factory_name = configFile.get('PARAMS', 'Data_factory_name')
    location = configFile.get('PARAMS', 'location')

    fp = './' + model_name + '/aml_service/setup.ini'
    conf = ConfigParser()
    conf.read(fp)

    AML_COMPUTE_CLUSTER_NAME = conf.get('PARAMS', 'AML_COMPUTE_CLUSTER_NAME')
    AML_COMPUTE_CLUSTER_MIN_NODES = conf.get('PARAMS',
                                             'AML_COMPUTE_CLUSTER_MIN_NODES')
    AML_COMPUTE_CLUSTER_MAX_NODES = conf.get('PARAMS',
                                             'AML_COMPUTE_CLUSTER_MAX_NODES')
    AML_COMPUTE_CLUSTER_SKU = conf.get('PARAMS', 'AML_COMPUTE_CLUSTER_SKU')

    if interactive:
        auth = InteractiveLoginAuthentication(tenant_id=tenant_id)

    else:
        auth = ServicePrincipalAuthentication(
            tenant_id=tenant_id,
            service_principal_id=service_principal_id,
            service_principal_password=svc_pr_pd)

    try:
        ws = Workspace(subscription_id=subscription_id,
                       resource_group=resource_group,
                       workspace_name=workspace_name,
                       auth=auth)
        print('Library configuration succeeded')
    except Exception:
        if create_ws:
            ws = Workspace.create(name=workspace_name,
                                  auth=auth,
                                  subscription_id=subscription_id,
                                  resource_group=resource_group,
                                  create_resource_group=False,
                                  location=location)
            print('Workspace not found; a new one was created')
        else:
            print('Workspace not found and not created')
            sys.exit(-1)

    print('workspace_name:',
          ws.name,
          '\nworkspace_location:',
          ws.location,
          '\nworkspace_resource_group:',
          ws.resource_group,
          sep='\t')

    # choose a name for your cluster
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME",
                                  AML_COMPUTE_CLUSTER_NAME)

    if compute_name in ws.compute_targets:
        compute_target = ws.compute_targets[compute_name]
        if compute_target and isinstance(compute_target, AmlCompute):
            print('Found existing compute target: ' + compute_name)
    else:
        print('creating a new compute target...')
        compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES",
                                           AML_COMPUTE_CLUSTER_MIN_NODES)
        compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES",
                                           AML_COMPUTE_CLUSTER_MAX_NODES)
        # This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
        vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU",
                                 AML_COMPUTE_CLUSTER_SKU)
        provisioning_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            min_nodes=int(compute_min_nodes),   # config values arrive as strings
            max_nodes=int(compute_max_nodes))

        # create the cluster
        compute_target = ComputeTarget.create(ws, compute_name,
                                              provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it will use the scale settings for the cluster
        compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

        # For a more detailed view of current AmlCompute status, use get_status()
        print(compute_target.get_status().serialize())

    try:
        datastore = Datastore(ws, name=blobname)
        print("Found Blob Datastore with name: %s" % datastore)
    except:
        print("No datastore with name: %s" % blobname)
        sys.exit(-1)

    try:
        data_factory = DataFactoryCompute(ws, data_factory_name)
        print('data_factory ', data_factory)
    except ComputeTargetException as e:
        if 'ComputeTargetNotFound' in e.message:
            print('Data factory Compute not found, creating...')
            provisioning_config = DataFactoryCompute.provisioning_configuration(
            )
            data_factory = ComputeTarget.create(ws, data_factory_name,
                                                provisioning_config)
            data_factory.wait_for_completion()
        else:
            print('Data Factory compute lookup failed for a reason other than not-found; re-raising...')
            raise e

    return datastore, compute_target, ws, data_factory
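
A hedged sketch of a call site for InitAML; the model name, environment tag, and service-principal secret are placeholders that would normally come from the CI/CD pipeline.

import os

datastore, compute_target, ws, data_factory = InitAML(
    model_name="my_model",              # expects ./my_model/aml_service/setup.ini (placeholder)
    env="dev",                          # resolves ./environment_setup/Config/config_dev.ini (placeholder)
    svcpw=os.environ["SP_PASSWORD"],    # service principal secret, assumed to be injected as an env var
    interactive=False,
    create_ws=False)
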
Example No. 8
    workspace_name="<your workspace name>"

    if subscription_id.startswith("<"):
        raise ValueError("Replace the subscription_id, resource_group and workspace_name placeholders before running.")
    else: # write and reload from config file
        config = {"Scope": "/subscriptions/" + subscription_id + "/resourceGroups/" + resource_group + "/providers/Microsoft.MachineLearningServices/workspaces/" + workspace_name +"/projects/samples"}
        import json
        import os
        os.makedirs(os.path.dirname(config_path), exist_ok=True)
        with open(config_path, "w") as fo:
            fo.write(json.dumps(config))
        ws = Workspace.from_config(path=config_path)

from azureml.core.datastore import Datastore

relevance_datastore = Datastore(ws, 'adls_relevance09')

# Retrieve or create the compute target

from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

cluster_name = "Cmpt-112GB-16Cr"
# cluster_name = "Cmpt-512GB-64Cr"
if cluster_name not in ws.compute_targets:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size="Standard_D2_v2",
                                                           max_nodes=4)
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)
else:
    compute_target = ws.compute_targets[cluster_name]


def get_ws(auth_params):
    # The lines below appear to be the tail of the workspace helper used in the
    # __main__ block further down; workspace_name, subscription_id, resource_group
    # and service_principal are assumed to be resolved from auth_params in the
    # original source.
    ws = Workspace.get(name=workspace_name,
                       subscription_id=subscription_id,
                       resource_group=resource_group,
                       auth=service_principal)
    return ws


if __name__ == '__main__':
    global run
    run = Run.get_context()
    auth_params = get_args()
    ws = get_ws(auth_params)

    datastore_names = list(ws.datastores.keys())
    def_data_store = ws.get_default_datastore()
    def_blob_store = Datastore(ws, "workspaceblobstore")

    data_temp_folder = os.path.join(cwd, "data_temp")
    create_folders([data_temp_folder])

    dataset = {
        'dataset':
        "https://github.com/rouzbeh-afrasiabi/PublicDatasets/raw/master/train.csv.zip"
    }
    word_vectors = {
        "en_vectors_web_lg":
        "https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz"
    }

    toDownload = [dataset, word_vectors]
    download_files(toDownload, data_temp_folder)

def create_DDoS_datasets(ws):
    dtypes = {
        'Src IP': 'category',
        'Src Port': 'uint16',
        'Dst IP': 'category',
        'Dst Port': 'uint16',
        'Protocol': 'category',
        'Flow Duration': 'uint32',
        'Tot Fwd Pkts': 'uint32',
        'Tot Bwd Pkts': 'uint32',
        'TotLen Fwd Pkts': 'float32',
        'TotLen Bwd Pkts': 'float32',
        'Fwd Pkt Len Max': 'float32',
        'Fwd Pkt Len Min': 'float32',
        'Fwd Pkt Len Mean': 'float32',
        'Fwd Pkt Len Std': 'float32',
        'Bwd Pkt Len Max': 'float32',
        'Bwd Pkt Len Min': 'float32',
        'Bwd Pkt Len Mean': 'float32',
        'Bwd Pkt Len Std': 'float32',
        'Flow Byts/s': 'float32',
        'Flow Pkts/s': 'float32',
        'Flow IAT Mean': 'float32',
        'Flow IAT Std': 'float32',
        'Flow IAT Max': 'float32',
        'Flow IAT Min': 'float32',
        'Fwd IAT Tot': 'float32',
        'Fwd IAT Mean': 'float32',
        'Fwd IAT Std': 'float32',
        'Fwd IAT Max': 'float32',
        'Fwd IAT Min': 'float32',
        'Bwd IAT Tot': 'float32',
        'Bwd IAT Mean': 'float32',
        'Bwd IAT Std': 'float32',
        'Bwd IAT Max': 'float32',
        'Bwd IAT Min': 'float32',
        'Fwd PSH Flags': 'category',
        'Bwd PSH Flags': 'category',
        'Fwd URG Flags': 'category',
        'Bwd URG Flags': 'category',
        'Fwd Header Len': 'uint32',
        'Bwd Header Len': 'uint32',
        'Fwd Pkts/s': 'float32',
        'Bwd Pkts/s': 'float32',
        'Pkt Len Min': 'float32',
        'Pkt Len Max': 'float32',
        'Pkt Len Mean': 'float32',
        'Pkt Len Std': 'float32',
        'Pkt Len Var': 'float32',
        'FIN Flag Cnt': 'category',
        'SYN Flag Cnt': 'category',
        'RST Flag Cnt': 'category',
        'PSH Flag Cnt': 'category',
        'ACK Flag Cnt': 'category',
        'URG Flag Cnt': 'category',
        'CWE Flag Count': 'category',
        'ECE Flag Cnt': 'category',
        'Down/Up Ratio': 'float32',
        'Pkt Size Avg': 'float32',
        'Fwd Seg Size Avg': 'float32',
        'Bwd Seg Size Avg': 'float32',
        'Fwd Byts/b Avg': 'uint32',
        'Fwd Pkts/b Avg': 'uint32',
        'Fwd Blk Rate Avg': 'uint32',
        'Bwd Byts/b Avg': 'uint32',
        'Bwd Pkts/b Avg': 'uint32',
        'Bwd Blk Rate Avg': 'uint32',
        'Subflow Fwd Pkts': 'uint32',
        'Subflow Fwd Byts': 'uint32',
        'Subflow Bwd Pkts': 'uint32',
        'Subflow Bwd Byts': 'uint32',
        'Init Fwd Win Byts': 'uint32',
        'Init Bwd Win Byts': 'uint32',
        'Fwd Act Data Pkts': 'uint32',
        'Fwd Seg Size Min': 'uint32',
        'Active Mean': 'float32',
        'Active Std': 'float32',
        'Active Max': 'float32',
        'Active Min': 'float32',
        'Idle Mean': 'float32',
        'Idle Std': 'float32',
        'Idle Max': 'float32',
        'Idle Min': 'float32',
        'Label': 'category'
    }

    data = pd.read_csv(
            './final_dataset.csv',
            dtype=dtypes,                # apply the declared column dtypes
            parse_dates=['Timestamp'],
            usecols=[*dtypes.keys(), 'Timestamp'],
            engine='c',
            low_memory=True,
            na_values=np.inf
        )

    # There are over 12 million rows in the original dataset. For this project that much data takes far too long, so only a random 0.5% sample is used.
    data = data.sample(frac=0.005)

    # Register Base Dataset in Workspace
    datastore = Datastore(ws)
    name = "DDoS Dataset"
    description_text = "DDoS DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                               datastore,
                               name,
                               description=description_text)
    
    # Clean dataset and register the clean version
    cleaned_data = clean_data(data)
    
    clean_dataset_name = "Clean DDoS Dataset"
    clean_description_text = description_text + " that has been cleaned"
    clean_dataset = TabularDatasetFactory.register_pandas_dataframe(cleaned_data,
                               datastore,
                               clean_dataset_name,
                               description=clean_description_text)
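
A later training script can pull the clean dataset back by name and split it into train and validation sets; a small sketch.

from azureml.core import Dataset

# ws is the same azureml.core.Workspace used for registration.
clean_ds = Dataset.get_by_name(ws, name="Clean DDoS Dataset")
train_ds, valid_ds = clean_ds.random_split(percentage=0.8, seed=42)
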
Example No. 11
    cv_results['n_features'] = X.shape[1]
    cv_results['y_0'] = y.tolist().count(0)
    cv_results['y_1'] = y.tolist().count(1)

    print(cv_results["mean_test_pr_auc"].to_string(index=False))
    run.log(name="mean_test_pr_auc",
            value=cv_results["mean_test_pr_auc"].to_string(index=False))

    if not os.path.isdir(args.train_model):
        os.makedirs(args.train_model, exist_ok=True)

    timestamp_id = datetime.datetime.now()
    time = timestamp_id.strftime("%m-%d-%Y_%H%M")

    model_name = "{}__{}.json".format(args.repo_owner, args.repo_name)
    output_path = os.path.join(args.train_model, model_name)

    with open(output_path, 'w') as outfile:
        cv_results.to_json(outfile, orient='table', index=False)

    # Get the blob storage associated with the workspace
    pipeline_datastore = Datastore(ws, "datastore_pipeline")

    # Upload the production model to the main blob folder
    pipeline_datastore.upload_files([args.train_model + '/' + model_name],
                                    target_path="train_model" + '/' +
                                    args.repo_owner + '/' + args.repo_name +
                                    '/' + time,
                                    overwrite=True)

print("Model is trained!")