def test_get_model_metrics():
    class MockModel:

        @staticmethod
        def predict(valid_data):
            return np.array([0, 0])

    train_data, valid_data = __get_test_datasets()

    metrics = get_model_metrics(MockModel(), train_data, valid_data)

    # Verify that metrics is a dictionary containing the AUC value.
    assert "auc" in metrics
    auc = metrics["auc"]
    np.testing.assert_almost_equal(auc, 0.5)
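# A hedged sketch of the get_model_metrics signature this test exercises; the
# real implementation lives in the training module. Assumes valid_data is a
# (features, labels) pair — an illustrative assumption, not necessarily the
# repo's actual data structure. A constant predictor like MockModel's yields
# an AUC of exactly 0.5, which is what the assertion checks.
from sklearn.metrics import roc_auc_score


def get_model_metrics(model, train_data, valid_data):
    features, labels = valid_data
    predictions = model.predict(features)
    # roc_auc_score is 0.5 for a constant predictor, matching the test above
    auc = roc_auc_score(labels, predictions)
    return {"auc": auc}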
def test_get_model_metrics():
    class MockHistory:
        history = {
            'loss': [1.5012110471725464, 0.6115774512290955],
            'accuracy': [0.5195071697235107, 0.7885010242462158],
            'val_loss': [0.6773713827133179, 0.5661255717277527],
            'val_accuracy': [0.7746031880378723, 0.8095238208770752]
        }

    metrics = get_model_metrics(MockHistory())

    # Verify that metrics contains the loss from the final training epoch.
    assert 'loss' in metrics
    loss = metrics['loss']
    np.testing.assert_almost_equal(loss, 0.6115774512290955)
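# A minimal sketch of the history-based get_model_metrics variant the test
# above targets: it reads the last epoch's loss from a Keras-style History
# object. This mirrors exactly what the assertion verifies.
def get_model_metrics(history):
    # Take the loss recorded for the final training epoch
    return {"loss": history.history["loss"][-1]}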
def main(): print("Running train_aml.py") parser = argparse.ArgumentParser("train") parser.add_argument( "--model_name", type=str, help="Name of the Model", default="insure_model_model.pkl", ) parser.add_argument("--step_output", type=str, help=("output for passing data to next step")) parser.add_argument("--dataset_version", type=str, help=("dataset version")) parser.add_argument("--data_file_path", type=str, help=("data file path, if specified,\ a new version of the dataset will be registered")) parser.add_argument( "--caller_run_id", type=str, help=("caller run id, for example ADF pipeline run id")) parser.add_argument("--dataset_name", type=str, help=("Dataset name. Dataset must be passed by name\ to always get the desired dataset version\ rather than the one used while the pipeline creation")) args = parser.parse_args() print("Argument [model_name]: %s" % args.model_name) print("Argument [step_output]: %s" % args.step_output) print("Argument [dataset_version]: %s" % args.dataset_version) print("Argument [data_file_path]: %s" % args.data_file_path) print("Argument [caller_run_id]: %s" % args.caller_run_id) print("Argument [dataset_name]: %s" % args.dataset_name) model_name = args.model_name step_output_path = args.step_output dataset_version = args.dataset_version data_file_path = args.data_file_path dataset_name = args.dataset_name run = Run.get_context() print("Getting training parameters") # Load the training parameters from the parameters file with open("parameters.json") as f: pars = json.load(f) try: train_args = pars["training"] except KeyError: print("Could not load training values from file") train_args = {} # Log the training parameters print(f"Parameters: {train_args}") for (k, v) in train_args.items(): run.log(k, v) run.parent.log(k, v) # Get the dataset if (dataset_name): if (data_file_path == 'none'): dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version) # NOQA: E402, E501 else: dataset = register_dataset(run.experiment.workspace, dataset_name, os.environ.get("DATASTORE_NAME"), data_file_path) else: e = ("No dataset provided") print(e) raise Exception(e) # Link dataset to the step run so it is trackable in the UI run.input_datasets['training_data'] = dataset run.parent.tag("dataset_id", value=dataset.id) # Split the data into test/train df = dataset.to_pandas_dataframe() data = split_data(df) # Train the model model = train_model(data, train_args) # Evaluate and log the metrics returned from the train function metrics = get_model_metrics(model, data[1]) for (k, v) in metrics.items(): run.log(k, v) run.parent.log(k, v) # Pass model file to next step os.makedirs(step_output_path, exist_ok=True) model_output_path = os.path.join(step_output_path, model_name) joblib.dump(value=model, filename=model_output_path) # Also upload model file to run outputs for history os.makedirs('outputs', exist_ok=True) output_path = os.path.join('outputs', model_name) joblib.dump(value=model, filename=output_path) run.tag("run_type", value="train") print(f"tags now present for run: {run.tags}") run.complete()
def main(): print("Running train_aml.py") parser = argparse.ArgumentParser("train") parser.add_argument("--model_name", type=str, help="Name of the Model") parser.add_argument("--step_output", type=str, help=("output for passing data to next step")) parser.add_argument("--data_file_path", type=str, help=("data file path, if specified,\ a new version of the dataset will be registered")) parser.add_argument("--dataset_name", type=str, help=("Dataset name. Dataset must be passed by name\ to always get the desired dataset version\ rather than the one used while the pipeline creation")) parser.add_argument("--datastore_name", type=str, help=("Datastore name.")) parser.add_argument( "--ml_params", type=str, help= "Parameters for ML pipelne in json format with defaults defined in parameters.json", # NOQA: E501 ) args = parser.parse_args() print("Argument [model_name]: %s" % args.model_name) print("Argument [step_output]: %s" % args.step_output) print("Argument [data_file_path]: %s" % args.data_file_path) print("Argument [dataset_name]: %s" % args.dataset_name) print("Argument [datastore_name]: %s" % args.datastore_name) print("Argument [ml_params]: %s" % args.ml_params) model_name = args.model_name step_output_path = args.step_output data_file_path = args.data_file_path dataset_name = args.dataset_name datastore_name = args.datastore_name run = Run.get_context() training_args, preprocessing_args = parse_ml_params(run, args.ml_params) # Get the dataset dataset = get_or_register_dataset(dataset_name=dataset_name, datastore_name=datastore_name, data_file_path=data_file_path, aml_workspace=run.experiment.workspace) # Link dataset to the step run so it is trackable in the UI run.input_datasets['training_data'] = dataset run.parent.tag("dataset_id", value=dataset.id) # Train the model # mount the dynamic version of the dataset, which can't be determined at pipeline publish time # NOQA: E501 mount_context = dataset.mount() mount_context.start() print(f"mount_point is: {mount_context.mount_point}") data = split_data(mount_context.mount_point, preprocessing_args) model, history = train_model(data, training_args, preprocessing_args) mount_context.stop() # Evaluate and log the metrics returned from the train function metrics = get_model_metrics(history) for (k, v) in metrics.items(): run.log(k, v) run.parent.log(k, v) # Pass model file to next step os.makedirs(step_output_path, exist_ok=True) model_output_path = os.path.join(step_output_path, model_name) model.save(model_output_path) with open(os.path.join(step_output_path, "run_id.txt"), "w") as text_file: print(f"{run.id}", file=text_file) # Also upload model file to run outputs for history os.makedirs('outputs', exist_ok=True) output_path = os.path.join('outputs', model_name) model.save(output_path) run.tag("run_type", value="train") print(f"tags now present for run: {run.tags}") run.complete()
def main(): print("Running train_aml.py") parser = argparse.ArgumentParser("train") parser.add_argument( "--model_name", type=str, help="Name of the Model", default="sales_model.h5", ) parser.add_argument( "--step_output", type=str, help=("output for passing data to next step") ) parser.add_argument( "--dataset_version", type=str, help=("dataset version") ) parser.add_argument( "--data_file_path", type=str, help=("data file path, if specified,\ a new version of the dataset will be registered") ) parser.add_argument( "--caller_run_id", type=str, help=("caller run id, for example ADF pipeline run id") ) parser.add_argument( "--dataset_name", type=str, help=("Dataset name. Dataset must be passed by name\ to always get the desired dataset version\ rather than the one used while the pipeline creation") ) args = parser.parse_args() print("Argument [model_name]: %s" % args.model_name) print("Argument [step_output]: %s" % args.step_output) print("Argument [dataset_version]: %s" % args.dataset_version) print("Argument [data_file_path]: %s" % args.data_file_path) print("Argument [caller_run_id]: %s" % args.caller_run_id) print("Argument [dataset_name]: %s" % args.dataset_name) model_name = args.model_name step_output_path = args.step_output dataset_version = args.dataset_version data_file_path = args.data_file_path dataset_name = args.dataset_name run = Run.get_context() print("Getting training parameters") # Load the training parameters from the parameters file with open("parameters.json") as f: pars = json.load(f) try: train_args = pars["training"] except KeyError: print("Could not load training values from file") train_args = {} # Log the training parameters print(f"Parameters: {train_args}") for (k, v) in train_args.items(): run.log(k, v) run.parent.log(k, v) # Get the dataset if (dataset_name): if (data_file_path == 'none'): dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version) # NOQA: E402, E501 else: dataset = register_dataset(run.experiment.workspace, dataset_name, os.environ.get("DATASTORE_NAME"), data_file_path) else: e = ("No dataset provided") print(e) raise Exception(e) # Link dataset to the step run so it is trackable in the UI run.input_datasets['training_data'] = dataset run.parent.tag("dataset_id", value=dataset.id) # Split the data into test/train df = dataset.to_pandas_dataframe() (train, test) = tts(df) # Train the model model = lstm_model(train, test) #Saving the model # model.save("sales_forecast_model.h5") # Evaluate and log the metrics returned from the train function metrics = get_model_metrics(model, train, test) for (k, v) in metrics.items(): run.log(k, v) run.parent.log(k, v) # Pass model file to next step # model_output_path = "outputs/sales_forecast_model.pkl" os.makedirs(step_output_path, exist_ok=True) model_output_path = os.path.join(step_output_path, model_name) keras.models.save_model(model,model_output_path) print("Saved model in model_output_path") #print("printing output path: ") #print(model_output_path) #print("printing model name: ") #print(model_name) #joblib.dump(value=model, filename=model_output_path) #checkpoints = ModelCheckpoint(model_output_path, verbose=1, # save_best_only=False, # save_weights_only=True, mode='auto', period=0) #callbacks_list = [checkpoints] #model.save(model) #model.save(model_output_path) #model.save('sales_model.pb') # new lines added ---------------------------- # serialize model to JSON # model_json = model.to_json() #with open("model.json", "w") as json_file: # json_file.write(model_json) # 
serialize weights to HDF5 #model_output_path= model.save_weights("model.h5") #print("Saved model to disk") #-------------------------------------- # Also upload model file to run outputs for history os.makedirs('outputs', exist_ok=True) output_path = os.path.join('outputs', model_name) keras.models.save_model(model,output_path) print("Model saved") #print("printing output path: ") #print(output_path) #checkpoints = ModelCheckpoint(output_path, verbose=1, # save_best_only=False, # save_weights_only=True, mode='auto', period=0) # serialize model to JSON #model_json = model.to_json() #with open("model.json", "w") as json_file: # json_file.write(model_json) # serialize weights to HDF5 #model.save_weights("model.h5") #print("Saved model to disk") #callbacks_list = [checkpoints] #model.save('output_path') #model.save('sales_model.pb') #model.save(model) # joblib.dump(value=model, filename=output_path) run.tag("run_type", value="train") print(f"tags now present for run: {run.tags}") run.complete()
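# A minimal sketch of the tts (train/test split) helper used by the sales
# variant above: a simple chronological split, the usual choice for
# time-series sales data so the test set follows the training period.
# The 0.8 ratio is an illustrative assumption, not the repo's actual value.
def tts(df, train_fraction=0.8):
    split_index = int(len(df) * train_fraction)
    # Preserve time ordering: earlier rows train, later rows test
    train = df.iloc[:split_index]
    test = df.iloc[split_index:]
    return train, test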
def main(): print("Running train_aml.py") parser = argparse.ArgumentParser("train") parser.add_argument( "--model_name", type=str, help="Name of the Model", default="insurance_model.pkl", ) parser.add_argument( "--data_file_path", type=str, help= ("data file path, if specified,a new version of the dataset will be registered" ), default="insurance", ) parser.add_argument( "--dataset_name", type=str, help="Dataset name", default="insurance_dataset", ) args = parser.parse_args() print("Argument [model_name]: %s" % args.model_name) print("Argument [data_file_path]: %s" % args.data_file_path) print("Argument [dataset_name]: %s" % args.dataset_name) model_name = args.model_name data_file_path = args.data_file_path dataset_name = args.dataset_name run = Run.get_context() print("Getting training parameters") # Load the training parameters from the parameters file with open("parameters.json") as f: pars = json.load(f) try: train_args = pars["training"] except KeyError: print("Could not load training values from file") train_args = {} # Log the training parameters print(f"Parameters: {train_args}") for (k, v) in train_args.items(): run.log(k, v) #run.parent.log(k, v) # Get the dataset if (dataset_name): if (data_file_path == 'none'): dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name) # NOQA: E402, E501 else: dataset = register_dataset(run.experiment.workspace, dataset_name, "workspaceblobstore", data_file_path) else: e = ("No dataset provided") print(e) raise Exception(e) # Link dataset to the step run so it is trackable in the UI run.input_datasets['training_data'] = dataset #run.parent.tag("dataset_id", value=dataset.id) # Split the data into test/train df = dataset.to_pandas_dataframe() data = split_data(df) # Train the model model = train_model(data, train_args) # Evaluate and log the metrics returned from the train function metrics = get_model_metrics(model, data) for (k, v) in metrics.items(): run.log(k, v) #run.parent.log(k, v) # Also upload model file to run outputs for history os.makedirs('outputs', exist_ok=True) output_path = os.path.join('outputs', model_name) joblib.dump(value=model, filename=output_path) run.tag("run_type", value="train") print(f"tags now present for run: {run.tags}") # upload the model file explicitly into artifacts print("Uploading the model into run artifacts...") run.upload_file(name="./outputs/models/" + model_name, path_or_stream=output_path) print("Uploaded the model {} to experiment {}".format( model_name, run.experiment.name)) dirpath = os.getcwd() print(dirpath) print("Following files are uploaded ") print(run.get_file_names()) run.complete()