Example #1
def run(input_dataset_name: str, waves_dataset_name: str):
    logger.info('start the process')
    logger.info(f"input dataset name: {input_dataset_name}")
    logger.info(f"waves dataset name: {waves_dataset_name}")

    run = Run.get_context()
    workspace = run.experiment.workspace
    input_dataset = Dataset.get_by_name(workspace=workspace,
                                        name=input_dataset_name)
    input_dataset.download(target_path='input', overwrite=True)
    wave_dataset = Dataset.get_by_name(workspace=workspace,
                                       name=waves_dataset_name)
    wave_dataset.download(target_path='waves_yesno', overwrite=True)

    # Run run.sh
    results = subprocess.run(['bash', 'run.sh'],
                             capture_output=True,
                             encoding='utf-8')

    logger.info(results.stdout)
    logger.info(results.stderr)

    # Get generated model and register it to Azure ML model store
    model_name = 'yesno_model'
    upload_folder_name = 'models'

    run.upload_folder(name=upload_folder_name, path='exp/mono0a')
    model = run.register_model(model_name=model_name,
                               model_path=upload_folder_name,
                               description='Generated model in Azure ML')
    logger.info(f'Model registered: {model.name}')
    logger.info(f'Model version: {model.version}')
def get_registered_dataset(dataset_name):
    run = Run.get_context()
    dset = None
    mount_folder = ''

    if isinstance(run, _OfflineRun):
        ws = Workspace.from_config()
        dset = Dataset.get_by_name(ws, dataset_name)

        if isinstance(dset, FileDataset):
            mount_folder = tempfile.mkdtemp()
            print('This is a file dataset and therefore mounting to ' +
                  mount_folder)
            mount_context = dset.mount(mount_folder)
            mount_context.start()
    else:
        ws = run.experiment.workspace
        print("dataset name " + dataset_name)
        dset = run.input_datasets[dataset_name]
        print(dset)
        if isinstance(dset, str):
            mount_folder = dset
            print(
                'This is a file dataset and therefore it has already been mounted to '
                + mount_folder)
            print('contents of folder')
            print(os.listdir(mount_folder))

    return dset, mount_folder
def main(name, input_folder, output_folder):
    ws = Workspace.from_config()
    datastore = ws.get_default_datastore()
    
    datastore.upload(input_folder, output_folder, show_progress=True, overwrite=True)

    try:
        # A dataset with this name already exists, so register a new version
        Dataset.get_by_name(ws, name)
        create_new_version = True
    except Exception:
        # First registration of this dataset
        create_new_version = False

    ds = Dataset.Tabular.from_delimited_files([(datastore, output_folder)])
    ds.register(ws, name, create_new_version=create_new_version)
Example #4
def experiment(reg_rate, ds_name):
    # Get the experiment run context and ws
    run = Run.get_context()
    ws = run.experiment.workspace

    # Prepare the dataset
    dataset = Dataset.get_by_name(workspace=ws, name=ds_name)
    data = dataset.to_pandas_dataframe()
    X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

    # Train a logistic regression model
    model = LogisticRegression(C=1 / reg_rate,
                               solver="liblinear").fit(X_train, y_train)

    # calculate accuracy
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    run.log('Accuracy', float(acc))

    # Save the trained model
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=model, filename='outputs/diabetes.pkl')

    run.complete()
def train_mlp():
    os.makedirs(os.path.dirname('./outputs/'), exist_ok=True)
    precalced = Dataset.get_by_name(workspace, name='distilbert-base-uncased_pack')
    precalced.download(target_path='./outputs/', overwrite=False)
    train_test_bert('./media.csv', './dist.dat', './models/768dBertModel',
                    10, result_path='./result768dBert.txt', check=1,
                    pretrained_weights='distilbert-base-uncased')
Example #6
def main():

    model_name, dataset_name = getRuntimeArgs()
    dotenv.load_dotenv()

    run = Run.get_context()

    if run._run_id.startswith("_OfflineRun"):
        run = None

    credit_data_df = None

    # Load data from the registered Dataset, or from a local file for offline runs
    if run is None:
        dataset_filename = os.environ.get("DATASET_FILE_NAME")
        credit_data_df = pd.read_csv("dataset/" + dataset_filename)
    else:
        dataset = Dataset.get_by_name(workspace=run.experiment.workspace,
                                      name=dataset_name)
        #dataset = run.input_datasets[dataset_name]
        credit_data_df = dataset.to_pandas_dataframe()

    clf = model_train(credit_data_df, run)

    # Copying to the "outputs" directory automatically uploads the model to Azure ML
    output_dir = './outputs/'
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(value=clf, filename=output_dir + model_name)
Example #7
def main(args):
    # create the outputs folder
    os.makedirs('outputs', exist_ok=True)
    
    # Get the run context before logging against it
    run = Run.get_context()

    # Log arguments
    run.log('Model Name', str(args.model_name))
    run.log('Version', float(args.dataset_version))
    run.log('DataFile Path', str(args.data_file_path))
    run.log('DataSet Name', str(args.dataset_name))

    model_name = args.model_name
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name
    
    if (dataset_name):
        if (data_file_path == 'none'):
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name, dataset_version)
            columns_to_ignore = ['sha', 'source_x', 'title', 'doi', 'pmcid',
                                 'pubmed_id', 'license', 'abstract',
                                 'publish_time', 'authors', 'journal', 'mag_id',
                                 'who_covidence_id', 'arxiv_id',
                                 'pdf_json_files', 'pmc_json_files', 'url',
                                 's2_id']
            dataset = dataset.drop_columns(columns_to_ignore)
        else:
            dataset = register_dataset(run.experiment.workspace,
                                       dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)
        
    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    train_args = {"alpha": 0.5}
    # Train the model
    model = train_model(data, train_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # files saved in the "outputs" folder are automatically uploaded into run history
    model_file_name = "COVID19Articles_model.pkl"
    joblib.dump(model, os.path.join('outputs', model_file_name))
    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
def run(mini_batch):

    # Define parameters
    pad_len = 20
    skill_filter_prob = 0.8
    master_match_prob = 80

    skill_final_df = pd.DataFrame(columns=[
        'employee_number', 'skill', 'match_skill', 'match_skill_prob'
    ])

    # Call text splitter
    skill_df = skill_splitter(mini_batch)

    # Call skill filter
    skill_filter_df = skill_filter(skill_tokenizer, skill_classifier, skill_df,
                                   skill_filter_prob, pad_len)

    # Find closest matching skill in master skill set based on threshold
    ds_master = 'skill_master'  # AML master skill set registered dataset
    df = Dataset.get_by_name(
        workspace=ws,
        name=ds_master)  # Read master skill set registered in AML Datastore
    masterskill_df = df.to_pandas_dataframe()
    master_list = masterskill_df['NAME'].to_list(
    )  # Convert master skill to list

    skill_match_df = skill_matcher(skill_filter_df, master_list,
                                   master_match_prob)

    return skill_match_df
Example #9
def get_dataset(workspace, name):
    try:
        dataset = Dataset.get_by_name(workspace=workspace,
                                      name=name,
                                      version="latest")
    except Exception:
        dataset = None
    return dataset
Example #10
def main():
    try:
        dataset = Dataset.get_by_name(ws, dataset_name)
        print("Dataset found: ", dataset_name)
    except Exception:
        print("Dataset not found: ", dataset_name)
        raise

    all_df = dataset.to_pandas_dataframe()
    train_scaled, test_scaled = split_normalize_data(all_df)
    x_dim = train_scaled.shape[1]
    print("train and test data shape after scaling: ", train_scaled.shape,
          test_scaled.shape)

    train_X = reshape(train_scaled)
    test_X = reshape(test_scaled)

    opt = keras.optimizers.Adam(learning_rate=0.001,
                                epsilon=1e-6,
                                amsgrad=True)

    if mode == "train":
        model = LSTM_VAE(time_step, x_dim, lstm_h_dim, z_dim, dtype='float32')
        model.compile(optimizer=opt)
        train_dataset = data.Dataset.from_tensor_slices(train_X)
        train_dataset = train_dataset.shuffle(buffer_size=1024).batch(
            batch_size, drop_remainder=True)

        history = model.fit(train_dataset, epochs=epoch_num,
                            shuffle=False).history
        model.summary()
        plot_loss_moment(history)
        save_model(model)
    elif mode == "infer":
        model = load_model()
        model.compile(optimizer=opt)
    else:
        print("Unknown mode: ", mode)
        exit(1)

    _, _, train_log_px = model.predict(train_X, batch_size=1)
    train_log_px = train_log_px.reshape(train_log_px.shape[0],
                                        train_log_px.shape[2])
    df_train_log_px = pd.DataFrame()
    df_train_log_px['log_px'] = np.mean(train_log_px, axis=1)
    plot_log_likelihood(df_train_log_px)

    _, _, test_log_px = model.predict(test_X, batch_size=1)
    test_log_px = test_log_px.reshape(test_log_px.shape[0],
                                      test_log_px.shape[2])
    df_log_px = pd.DataFrame()
    df_log_px['log_px'] = np.mean(test_log_px, axis=1)
    df_log_px = pd.concat([df_train_log_px, df_log_px])
    df_log_px['threshold'] = 0.65
    df_log_px['anomaly'] = df_log_px['log_px'] > df_log_px['threshold']
    df_log_px.index = np.array(all_df)[:, 0]

    df_log_px.plot(logy=True, figsize=(16, 9), color=['blue', 'red'])
    plt.savefig(image_dir + 'anomaly_lstm_vae_' + mode + '.png')
Example #11
def getMountContext(path):
    workspace = Workspace.from_config()
    dataset = Dataset.get_by_name(workspace, name=path)

    mount_context = dataset.mount()
    mount_context.start()  # this will mount the file streams
    print(mount_context.mount_point)
    return mount_context
Example #12
    def delete(self, name=None):
        ws = self._get_ws()
        if name is None:
            name = self.ctx.config.get('dataset', None)
        if name is None:
            raise AzureException('Please specify dataset name...')
        ds = Dataset.get_by_name(ws, name)
        ds.unregister_all_versions()
        self._select(None, False)
        self.ctx.log('Deleted dataset %s' % name)
        return {'deleted': name}
Example #13
def split_data(ds_name, test_size):

    dataset = Dataset.get_by_name(workspace, name=ds_name)
    df = dataset.to_pandas_dataframe()
    y = df.iloc[:, -1].values  # output variable
    X = df.iloc[:, :-1].values  # feature variables
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=42)
    return X_train, X_test, y_train, y_test  # return the train and test split data
Example #14
    def _columns(self, dataset=None, name=None):
        if not dataset:
            ws = self._get_ws()
            if name is None:
                name = self.ctx.config.get('dataset', None)
            if name is None:
                raise AzureException('Please specify dataset name...')
            dataset = Dataset.get_by_name(ws, name)

        df = dataset.take(1).to_pandas_dataframe()
        return df.columns.tolist()
def test_pipeline_functionally_works(pipeline):
    training_dataset = Dataset.get_by_name(ws, training_dataset_name)

    run = pipeline.submit(ws,
        experiment_name="training-pipeline-acceptance-test",
        pipeline_parameters={'training_dataset': training_dataset})
    run.wait_for_completion()
    assert run.status == "Completed"
    # Add more asserts

# Add more tests, e.g., to make sure the produced model works functionally, etc.
Example #16
def main():
    # Retrieve argument configured through script_params in estimator
    parser = argparse.ArgumentParser()
    parser.add_argument('--new_model_folder',
                        dest='new_model_folder',
                        type=str,
                        help='input folder path for reading the new \
                        model .pkl file')
    parser.add_argument('--new_model_file',
                        dest='new_model_file',
                        type=str,
                        help='name of the model .pkl file')
    parser.add_argument("--model_name",
                        dest='model_name',
                        type=str,
                        help="Name of the model to register into \
                        Azure ML Workspace")
    args = parser.parse_args()

    # Get the current run
    run = Run.get_context()
    # Add metrics to tags so that this information can
    # be used for model comparison.
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
    tags = {}
    for key in metrics:
        tags[key] = run.parent.get_metrics(key).get(key)

    # Store BuildId
    parent_tags = run.parent.get_tags()
    build_id = 'BuildId'
    try:
        build_id = parent_tags["BuildId"]
        tags['BuildId'] = build_id
    except KeyError:
        print("BuildId tag not found on parent run.")
        print(f"Tags present: {parent_tags}")

    # Register the new model, note the metric values are stored in "tags".
    model_pkl_file = args.new_model_folder + args.new_model_file
    workspace = run.experiment.workspace
    dataset_name = 'predict-employee-retention-training-data'
    model = Model.register(workspace=workspace,
                           model_name=args.model_name,
                           model_path=model_pkl_file,
                           tags=tags,
                           datasets=[
                               ('training data',
                                Dataset.get_by_name(workspace, dataset_name))
                           ])

    run.log(
        'Model registered', 'New model ' + model.name + ' version ' +
        str(model.version) + ' BuildId ' + build_id)
Example #17
    def get_dataset(self, config):
        name = config.get("name")
        version = config.get("version")
        if version == "latest":
            version = None

        dataset = Dataset.get_by_name(
            workspace=self.workspace,
            name=name,
            version=version)

        return dataset
Example #18
def get_data(run, fitted_model, target_column_name, test_dataset_name):

    # get input dataset by name
    test_dataset = Dataset.get_by_name(run.experiment.workspace,
                                       test_dataset_name)
    test_df = test_dataset.to_pandas_dataframe()
    if target_column_name in test_df:
        y_test = test_df.pop(target_column_name).values
    else:
        y_test = np.full(test_df.shape[0], np.nan)

    return test_df, y_test
Example #19
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help='Number of estimators')
    parser.add_argument('--max_features',
                        type=str,
                        default='sqrt',
                        help='Max features')
    parser.add_argument('--dataset_name',
                        type=str,
                        default='ISDWeatherDS',
                        help='Name of the dataset')

    args = parser.parse_args()
    run.log('max_features', str(args.max_features))
    run.log('n_estimators', float(args.n_estimators))

    # Load the registered weather dataset
    dataset = Dataset.get_by_name(run.experiment.workspace,
                                  args.dataset_name)  # NOQA: E402, E501
    dataset = dataset.to_pandas_dataframe()
    dataset = dataset.fillna(0)

    # X -> features, y -> label
    cols = list(dataset.columns)
    cols = [
        col for col in cols if dataset.dtypes[col] != 'object'
        and col not in ['version', 'datetime']
    ]
    X = dataset[[col for col in cols if col not in ['temperature']]]
    y = dataset.temperature

    # dividing X, y into train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    rf = RandomForestRegressor(n_estimators=args.n_estimators,
                               max_features=args.max_features).fit(
                                   X_train, y_train)
    rf_predictions = rf.predict(X_test)
    mse = mean_squared_error(y_test, rf_predictions)
    print('MSE  on test set: {:.2f}'.format(mse))

    # model accuracy for X_test
    run.log('MSE', float(mse))

    os.makedirs('outputs', exist_ok=True)
    # files saved in the "outputs" folder are automatically uploaded into run history
    joblib.dump(rf, 'outputs/model.joblib')
Example #20
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--n_estimators', type=int, default=100, help="Number of trees in the forest")
    parser.add_argument('--max_depth', type=int, default=None, help="The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.")
    parser.add_argument('--min_samples_split', type=int, default=2, help="The minimum number of samples required to split an internal node.")
    parser.add_argument('--min_samples_leaf', type=int, default=1, help="The minimum number of samples required to be at a leaf node.")
    
    args = parser.parse_args()
    if args.max_depth == 0:
        max_depth = None
    else:
        max_depth = args.max_depth

    run.log("Num Estimators:", np.float(args.n_estimators))
    run.log("Max Depth:", max_depth)
    run.log("Min Samples Split:", np.int(args.min_samples_split))
    run.log("Min Samples Leaf:", np.int(args.min_samples_leaf))

    workspace = run.experiment.workspace
    dataset_name = 'Malware Dataset'
    dataset = Dataset.get_by_name(workspace=workspace, name=dataset_name)

    df = dataset.to_pandas_dataframe()

    y = df.pop("legitimate")

    x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.33)

    model = RandomForestClassifier(n_estimators=args.n_estimators, 
                                   max_depth=max_depth, 
                                   min_samples_split=args.min_samples_split, 
                                   min_samples_leaf=args.min_samples_leaf, 
                                   )
    
    model = model.fit(x_train, y_train)

    os.makedirs('./outputs', exist_ok=True)
    joblib.dump(model, './outputs/model.joblib')

    accuracy = model.score(x_test, y_test)
    y_pred = model.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    run.log("Accuracy", np.float(accuracy))
    run.log("F1", np.float(f1))
    run.log("Precision", np.float(precision))
    run.log("Recall", np.float(recall))
Example #21
def init():
    global meta_df
    run = Run.get_context()
    # get the input dataset by name
    workspace = run.experiment.workspace
    covid_meta = Dataset.get_by_name(workspace,
                                     'covid-19 metadata').download()[0]
    # load the downloaded metadata CSV into a pandas DataFrame
    meta_df = pd.read_csv(covid_meta,
                          dtype={
                              'pubmed_id': str,
                              'Microsoft Academic Paper ID': str,
                              'doi': str
                          })
Example #22
def main():
    ws = Workspace.from_config()
    callback = AzureLossCallback(Run.get_context())
    dataset = Dataset.get_by_name(ws, name='TravelInsurance')
    data_path = dataset.download()
    data = pd.read_feather(data_path[0])
    label = data["label"]
    data.drop(["label"], axis=1, inplace=True)

    train_model_func = get_trained_model((data.shape[1],), learning_rate=0.001, callback=callback)
    metrics = cross_validate(train_model_func=train_model_func,
                             data=data, label=label,
                             data_processing_func=process_data)
    print(metrics)
Example #23
def train():
    
    # Get variable hyperparameters as arguments
    parser = argparse.ArgumentParser(description='Training with specified Hyperparameters')
    
    parser.add_argument('--n_estimators',
                        help='the number of trees in the forest',
                        type=int, default=100)
    
    parser.add_argument('--min_weight_fraction_leaf',
                        help='The minimum weighted fraction of the sum total of weights required to be at a leaf node',
                        type=float, default=0.0)
    
    args = parser.parse_args()
    
    n_estimators = args.n_estimators
    min_weight_fraction_leaf = args.min_weight_fraction_leaf
    
    # Get run context from AzureML
    run = Run.get_context()
    ws = run.experiment.workspace
    
    # Read Data
    df = Dataset.get_by_name(ws, 'heart-disease-uci').to_pandas_dataframe()
    
    X, y = df.drop('target', axis=1), df['target']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define Classifier and Train
    clf = ExtraTreesClassifier(n_estimators = n_estimators, min_weight_fraction_leaf=min_weight_fraction_leaf)
    
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    
    # Log metrics
    accuracy = accuracy_score(y_test, y_pred)
    
    run.log('Accuracy', accuracy)
    run.log('n_estimators', n_estimators)
    run.log('leaf_frac', min_weight_fraction_leaf)
    
    # Dump the model
    
    os.makedirs('./outputs', exist_ok=True)
    
    model_path = "outputs/hp-heart-disease_{}.joblib".format(accuracy)
    
    joblib.dump(clf, model_path)
Example #24
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--C',
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument('--data', type=str, help="Loading dataset")
    parser.add_argument('--max_iter',
                        type=int,
                        default=100,
                        help="Maximum number of iterations to converge")

    args = parser.parse_args()

    # Load the registered diabetes dataset
    dataset = Dataset.get_by_name(ws, name='diabetes')
    dataset = dataset.to_pandas_dataframe()

    # Scaling data
    scaler = StandardScaler()
    scaler.fit(dataset.drop('Outcome', axis=1))
    scaler_features = scaler.transform(dataset.drop('Outcome', axis=1))
    df_feat = pd.DataFrame(scaler_features, columns=dataset.columns[:-1])
    # appending the outcome feature
    df_feat['Outcome'] = dataset['Outcome'].astype(int)
    dataset = df_feat.copy()

    x = dataset.drop(columns=['Outcome'])
    y = dataset['Outcome']
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C,
                               max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)

    os.makedirs('outputs', exist_ok=True)

    joblib.dump(model, 'outputs/model.joblib')

    run.log("Accuracy", np.float(accuracy))
Example #25
def main():
    model_name, dataset_name = getRuntimeArgs()

    run = Run.get_context()

    dataset = Dataset.get_by_name(workspace=run.experiment.workspace,
                                  name=dataset_name)
    credit_data_df = dataset.to_pandas_dataframe()

    clf = model_train(credit_data_df, run)

    # Copying the model to the "outputs" directory automatically uploads it to Azure ML
    output_dir = './outputs/'
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(value=clf, filename=output_dir + model_name)
Example #26
def register_dataset(aml_workspace: Workspace, dataset_name: str,
                     datastore_name: str, file_path: str) -> Dataset:
    if datastore_name:
        datastore = Datastore.get(aml_workspace, datastore_name)
    else:
        datastore = aml_workspace.get_default_datastore()
    # if the path is same as the latest version, no new version will be registered  # NOQA: E501
    # however, run.input_datasets['name'] = dataset will not log the dataset in the run  # NOQA: E501
    # in this case, the dataset returned from Dataset.get_by_name does get logged  # NOQA: E501
    dataset = Dataset.File.from_files(path=(datastore, file_path))
    dataset = dataset.register(workspace=aml_workspace,
                               name=dataset_name,
                               create_new_version=True)

    return Dataset.get_by_name(aml_workspace, dataset_name)
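A minimal usage sketch for the helper above (the dataset, datastore, and file names are assumptions for illustration, not taken from the original repository); it mirrors the comments in the function: the Dataset returned via Dataset.get_by_name is the one that actually gets logged when linked to a run.

from azureml.core import Run

run = Run.get_context()
ws = run.experiment.workspace

# hypothetical names for illustration only
dataset = register_dataset(ws, 'credit-data', 'workspaceblobstore', 'credit/train.csv')

# because `dataset` came from Dataset.get_by_name, this link shows up in the run UI
run.input_datasets['training_data'] = dataset
run.tag('dataset_id', value=dataset.id)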
Example #27
def main():
    ws = Run.get_context().experiment.workspace
    os.makedirs('./outputs', exist_ok=True)
    df = Dataset.get_by_name(ws, 'telcochurn').to_pandas_dataframe()
    df = df.dropna(how="all")  # remove samples with all missing values
    df["Churn_numerical"] = df["Churn"]
    target = df["Churn"]
    total_charges_filter = df.TotalCharges == " "
    df = df[~total_charges_filter]
    df.TotalCharges = pd.to_numeric(df.TotalCharges)
    df = df.drop(['Churn_numerical', 'Churn'], axis=1)
    train_model(df, target)
    run = Run.get_context()
    model = run.register_model(model_name='Churn_model',
                               model_path='outputs/classifier.pkl')
Example #28
def get_or_create_dataset(
    workspace: Workspace, blob_storage_paths: List[str], dataset_name: str
) -> Dataset:

    try:
        return Dataset.get_by_name(workspace=workspace, name=dataset_name)
    except Exception:
        logger.info(f"Registering {dataset_name} with {len(blob_storage_paths)} files.")
        dataset = Dataset.File.from_files(path=blob_storage_paths)
        return dataset.register(
            workspace=workspace,
            name=dataset_name,
            description="training and test dataset",
            create_new_version=True,
        )
Example #29
    def __init__(self, name, dataset, mode=DIRECT_MODE, path_on_compute=None):
        """Represent how to deliver the dataset to the compute target.

        :param name: The name of the dataset in the run, which can be different to the registered name.
            The name will be registered as environment variable and can be used in data plane.
        :type name: str
        :param dataset: The dataset to be delivered, as a Dataset object, Pipeline Parameter that ingests a Dataset,
            a tuple of (workspace, Dataset name), or a tuple of (workspace, Dataset name, Dataset version).
            If only a name is provided, the DatasetConsumptionConfig will use the latest version of the Dataset.
        :type dataset: azureml.core.dataset.Dataset
            or azureml.pipeline.core.PipelineParameter
            or tuple(azureml.core.workspace.Workspace, str)
            or tuple(azureml.core.workspace.Workspace, str, str)
            or azureml.data.output_dataset_config.OutputDatasetConfig
        :param mode: Defines how the dataset should be delivered to the compute target. There are four modes:
            1. 'direct': consume the dataset as dataset.
            2. 'download': download the dataset and consume the dataset as a downloaded path.
            3. 'mount': mount the dataset and consume the dataset as a mount path.
            4. 'hdfs': consume the dataset from a resolved hdfs path (currently only supported on SynapseSpark compute).
        :type mode: str
        :param path_on_compute: The target path on the compute to make the data available at. The folder structure
            of the source data will be kept; however, prefixes may be added to this folder structure to avoid
            collisions. We recommend calling `tabular_dataset.to_path` to see the output folder structure.
        :type path_on_compute: str
        """
        mode = mode.lower()
        DatasetConsumptionConfig._validate_mode(dataset, mode)

        from azureml.core import Dataset
        if isinstance(dataset, tuple):
            ws, ds_name = dataset[0], dataset[1]
            try:
                ds_version = dataset[2]
            except IndexError:
                # No version specified, use latest
                ds_version = 'latest'
            dataset = Dataset.get_by_name(ws, ds_name, ds_version)

            if ds_version == 'latest':
                dataset._consume_latest = True

        self.dataset = self._validate_if_pipeline_parameter(dataset)
        self._name = _validate_name(name, 'input') \
            if name else (self.__class__._generate_pipeline_name_by_dataset_id('input_', self.dataset.id)
                          if self.dataset.id else self.__class__._generate_random_name('input_'))
        self.arg_val = _DATASET_ARGUMENT_TEMPLATE.format(self._name)
        self.mode = mode
        self.path_on_compute = path_on_compute
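A hedged sketch of how the delivery modes described in the docstring above are typically requested (the dataset and input names here are assumptions for illustration); as_named_input returns a DatasetConsumptionConfig, and as_download / as_mount switch a FileDataset input to the 'download' and 'mount' modes.

from azureml.core import Dataset, Workspace

ws = Workspace.from_config()
tab_ds = Dataset.get_by_name(ws, 'labels')      # hypothetical TabularDataset
file_ds = Dataset.get_by_name(ws, 'images')     # hypothetical FileDataset

direct_cfg = tab_ds.as_named_input('labels')                    # 'direct' mode (default)
download_cfg = file_ds.as_named_input('images').as_download()   # 'download' mode
mount_cfg = file_ds.as_named_input('images_mnt').as_mount()     # 'mount' mode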
def get_or_create_dataset(azure_config: AzureConfig,
                          azure_dataset_id: str) -> Dataset:
    """
    Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is created
    and registered, assuming that the files are in a folder that has the same name as the dataset. For example, if
    azure_dataset_id is 'foo', then the 'foo' dataset is pointing to <container_root>/datasets/foo folder.

    WARNING: the behaviour of Dataset.File.from_files, used below, is idiosyncratic. For example,
    if "mydataset" storage has two "foo..." subdirectories each containing
    a file dataset.csv and a directory ABC,

    datastore = Datastore.get(workspace, "mydataset")
    # This dataset has the file(s) in foo-bar01 at top level, e.g. dataset.csv
    ds1 = Dataset.File.from_files([(datastore, "foo-bar01/*")])
    # This dataset has two directories at top level, each with a name matching foo-bar*, and each
    # containing dataset.csv.
    ds2 = Dataset.File.from_files([(datastore, "foo-bar*/*")])
    # This dataset contains a single directory "mydataset" at top level, containing a subdirectory
    # foo-bar01, containing dataset.csv and (part of) ABC.
    ds3 = Dataset.File.from_files([(datastore, "foo-bar01/*"),
                                   (datastore, "foo-bar01/ABC/abc_files/*/*.nii.gz")])

    These behaviours can be verified by calling "ds.download()" on each dataset ds.
    """
    if not azure_config.azureml_datastore:
        raise ValueError(
            "No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)"
        )
    logging.info(
        f"Retrieving datastore '{azure_config.azureml_datastore}' from AzureML workspace"
    )
    workspace = azure_config.get_workspace()
    datastore = Datastore.get(workspace, azure_config.azureml_datastore)
    try:
        logging.info(
            f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
        azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
        logging.info("Dataset found.")
    except Exception:
        logging.info(
            f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'"
        )
        # See WARNING above before changing the from_files call!
        azureml_dataset = Dataset.File.from_files([(datastore,
                                                    azure_dataset_id)])
        logging.info("Registering the dataset for future use.")
        azureml_dataset.register(workspace, name=azure_dataset_id)
    return azureml_dataset
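A small verification sketch in the spirit of the docstring's last sentence (the datastore and folder names are the hypothetical ones from the WARNING comment above); FileDataset.to_path() lists the relative paths each dataset resolves to, which makes the flattening or nesting visible without downloading anything.

from azureml.core import Dataset, Datastore, Workspace

workspace = Workspace.from_config()
datastore = Datastore.get(workspace, "mydataset")   # hypothetical datastore name

# Files from foo-bar01 end up at the top level of the dataset.
ds1 = Dataset.File.from_files([(datastore, "foo-bar01/*")])
# Each directory matching foo-bar* becomes a top-level folder in the dataset.
ds2 = Dataset.File.from_files([(datastore, "foo-bar*/*")])

print(ds1.to_path())
print(ds2.to_path())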