def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    # Fetch the data
    url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(url)

    # Separate features and target
    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Set up the run
    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    # Set up the model
    model = LogisticRegression(C=args.C, max_iter=args.max_iter, solver='liblinear').fit(x_train, y_train)

    # Log the accuracy
    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    # Save model
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.pkl')
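# The training functions in this file reference module-level names (argparse,
# np, joblib, os, Run, TabularDatasetFactory, LogisticRegression,
# train_test_split) without showing their imports. A minimal import header
# that would make the function above runnable, assuming azureml-core and
# scikit-learn are installed:
import argparse
import os

import joblib
import numpy as np
from azureml.core.run import Run
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split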
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    # URL and split_data are assumed to be defined at module level.
    ds = TabularDatasetFactory.from_delimited_files(path=URL)
    x, y = split_data(ds)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(X_train, y_train)

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')

    accuracy = model.score(X_test, y_test)
    run.log("Accuracy", float(accuracy))
def main():
    data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(data_url)

    run = Run.get_context()

    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)

    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
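# Many of these scripts call a clean_data helper that is defined elsewhere in
# their repositories. Its implementation varies per project; the sketch below
# is a hypothetical minimal version for the bank-marketing CSV, assuming a
# "y" column holding yes/no labels and one-hot encoding for the remaining
# categorical columns.
import pandas as pd

def clean_data(ds):
    """Hypothetical cleaner; the real helper differs per project."""
    df = ds.to_pandas_dataframe().dropna()
    # Assumed target column: 'y' with yes/no values in the bank-marketing data.
    y = df.pop("y").apply(lambda v: 1 if v == "yes" else 0)
    # One-hot encode whatever categorical feature columns remain.
    x = pd.get_dummies(df)
    return x, y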
def main():
    url = 'https://raw.githubusercontent.com/AnshuTrivedi/Capstone-Project---Azure-Machine-Learning-Engineer/main/mobile_sales_data.csv'
    data = TabularDatasetFactory.from_delimited_files(url)
    x = data.to_pandas_dataframe()
    y = x.pop("price_range")

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    parser.add_argument('--solver', type=str, default='lbfgs',
                        help="Choose the algorithm used to train the model")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))
    run.log("Algorithm: ", args.solver)

    model = LogisticRegression(solver=args.solver, C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    # Add arguments to the script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    # Create TabularDataset using TabularDatasetFactory
    path_file = "https://gist.githubusercontent.com/Nwaneto/0d1477bd10c92f8b16ab19306d21a17f/raw/0af3078c0d174e26039ab31525487ceaceda77b0/parkinson-classification-data.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=path_file)

    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=data_path)

    # get_labels_and_data is assumed to be defined at module level.
    x, y = get_labels_and_data(ds)
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    os.makedirs('./outputs', exist_ok=True)
    joblib.dump(model, './outputs/model.joblib')

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
def get_cleaned_dataset(ws):
    found = False
    ds_key = "machine-cpu"
    description_text = "CPU performance dataset (UCI)."

    if ds_key in ws.datasets.keys():
        found = True
        ds_cleaned = ws.datasets[ds_key]

    # Otherwise, create it from the file
    if not found:
        with zipfile.ZipFile("./data/machine.zip", "r") as zip_ref:
            zip_ref.extractall("data")

        # Read the extracted CSV file into a DataFrame
        data = pd.read_csv('./data/machine.csv')

        # DataFrame with cleaned data
        cleaned_data = clean_data(data)
        exported_df = 'cleaned-machine-cpu.parquet'
        cleaned_data.to_parquet(exported_df)

        # Register the dataset in the workspace, using the experimental
        # functionality that uploads and registers a pandas DataFrame at once
        ds_cleaned = TabularDatasetFactory.register_pandas_dataframe(
            dataframe=cleaned_data,
            target=(ws.get_default_datastore(), exported_df),
            name=ds_key,
            description=description_text,
            show_progress=True)

    return ds_cleaned
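# A short usage sketch for the helper above, assuming an Azure ML workspace
# config.json is available locally:
from azureml.core import Workspace

ws = Workspace.from_config()

# Returns the registered "machine-cpu" dataset, creating it on first use.
dataset = get_cleaned_dataset(ws)
print(dataset.to_pandas_dataframe().head())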
def main():
    # Create TabularDataset using TabularDatasetFactory
    # (csv_path is assumed to be defined at module level.)
    ds = TabularDatasetFactory.from_delimited_files(path=csv_path)

    # Call clean_data to preprocess the dataset
    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

    run = Run.get_context()

    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    ds = TabularDatasetFactory.from_delimited_files(
        path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")

    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    # Files saved in the "outputs" folder are automatically uploaded into run history.
    # Dump the fitted model, not the LogisticRegression class.
    joblib.dump(model, 'outputs/model.joblib')
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    # Create TabularDataset using the TabularDatasetFactory class
    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=train_data_path)

    # Clean the data
    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Fit a logistic regression model; C is the inverse regularization strength.
    # On the importance of regularization in logistic regression:
    # https://stackoverflow.com/questions/22851316/what-is-the-inverse-of-regularization-strength-in-logistic-regression-how-shoul
    # https://www.coursera.org/lecture/machine-learning/regularized-logistic-regression-4BHEy
    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    # Save the model using the joblib library
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(path)

    X, y = clean_data(ds)

    # Split data into train and test sets.
    train_data, test_data, train_label, test_label = train_test_split(
        X, y, test_size=0.3, random_state=42)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(train_data, train_label)

    accuracy = model.score(test_data, test_label)
    run.log("Accuracy", float(accuracy))

    # Save model
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    # Create TabularDataset using TabularDatasetFactory from a web URL
    wurl = 'https://raw.githubusercontent.com/atan4583/datasets/master/train.csv'
    ds = TabularDatasetFactory.from_delimited_files(wurl)

    x, y = clean_data(ds)
    print(f'x null chk: \n{x.isnull().sum()}\n \ny null chk: \n{y.isnull().sum()}\n')

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)
    print(f'x_train null chk: \n{x_train.isnull().sum()}\n \ny_train null chk: \n{y_train.isnull().sum()}\n')
    print(f'x_test null chk: \n{x_test.isnull().sum()}\n \ny_test null chk: \n{y_test.isnull().sum()}\n')

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.pkl')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_estimators', type=int, default=100,
                        help="Number of trees in the forest")
    parser.add_argument('--min_samples_split', type=int, default=2,
                        help="Minimum number of samples required to split an internal node")
    parser.add_argument('--max_features', type=str, default='auto',
                        help="{'auto', 'sqrt', 'log2'}")
    # Note: argparse's type=bool treats any non-empty string as True,
    # so passing --bootstrap False would still evaluate to True.
    parser.add_argument('--bootstrap', type=bool, default=True,
                        help="Whether bootstrap samples are used or not")
    args = parser.parse_args()

    # web_path and split_data are assumed to be defined at module level.
    ds = TabularDatasetFactory.from_delimited_files(path=web_path)
    x, y = split_data(ds)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

    run = Run.get_context()
    run.log("No of Estimators:", int(args.n_estimators))
    run.log("Min No of Samples to Split:", int(args.min_samples_split))
    run.log("No of Features Considered:", str(args.max_features))
    run.log("Bootstrap:", bool(args.bootstrap))

    model = RandomForestClassifier(n_estimators=args.n_estimators,
                                   min_samples_split=args.min_samples_split,
                                   bootstrap=args.bootstrap,
                                   max_features=args.max_features).fit(X_train, y_train)

    accuracy = model.score(X_test, y_test)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=model, filename='outputs/model.pkl')
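# These scripts are written to be driven by Azure ML HyperDrive, which supplies
# the argparse values and reads back the logged primary metric. The sketch below
# shows one plausible way to wire up the random-forest script above, assuming
# it is saved as train.py and a compute target named "cpu-cluster" exists
# (both names are assumptions, not taken from the scripts).
from azureml.core import Experiment, ScriptRunConfig, Workspace
from azureml.train.hyperdrive import (BanditPolicy, HyperDriveConfig,
                                      PrimaryMetricGoal, RandomParameterSampling,
                                      choice)

ws = Workspace.from_config()

# Sample over the same hyperparameters the script exposes via argparse.
param_sampling = RandomParameterSampling({
    "--n_estimators": choice(50, 100, 200),
    "--min_samples_split": choice(2, 5, 10),
})

src = ScriptRunConfig(source_directory=".", script="train.py",
                      compute_target="cpu-cluster")

hd_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=param_sampling,
    policy=BanditPolicy(evaluation_interval=2, slack_factor=0.1),
    primary_metric_name="Accuracy",  # must match the key passed to run.log
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20)

run = Experiment(ws, "rf-hyperdrive").submit(hd_config)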
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    train_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    valid_data_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv"
    train_ds = TabularDatasetFactory.from_delimited_files(train_data_path)
    valid_ds = TabularDatasetFactory.from_delimited_files(valid_data_path)

    X_train, y_train = clean_data(train_ds)
    X_valid, y_valid = clean_data(valid_ds)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(X_train, y_train)

    accuracy = model.score(X_valid, y_valid)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/bankmarketing-logit-model.joblib')
def main(): # Add arguments to script parser = argparse.ArgumentParser() parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization") parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge") args = parser.parse_args() # 1. Create TabularDataset using TabularDatasetFactory # Data is located at: # "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv" # Useful reference: # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.dataset_factory.tabulardatasetfactory raw_data_url = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv" ds = TabularDatasetFactory.from_delimited_files(raw_data_url) x, y = clean_data(ds) # 2. Split data into train and test sets. # Useful reference which explains how this works and can guide parameter choice: # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=8, stratify=y) # Note: Using the stratify parameter ensures that the split contains the same distribution of target values in the # training and test sets as the proportion of values in the entire dataset when the target data for the entire # dataset (in this case: y) is passed to the stratify parameter. # Useful reference: # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.run(class)?view=azure-ml-py run = Run.get_context() run.log("Regularization Strength:", np.float(args.C)) run.log("Max iterations:", np.int(args.max_iter)) # Note: It may also be worth investigating the 'class_weight' parameter in the LogisticRegression model to deal with # the dataset imbalance: # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train) # Note: could be worth trying out a different performance metric, e.g. AUC, due to dataset imbalance # (88% target outputs: 'no') accuracy = model.score(x_test, y_test) run.log("Accuracy", np.float(accuracy)) # Save the model # See these links for useful information: # https://knowledge.udacity.com/questions/424266 # https://www.kaggle.com/pankaj1234/azure-machine-learning-model-training # https://towardsdatascience.com/azure-machine-learning-service-train-a-model-df72c6b5dc os.makedirs("outputs", exist_ok=True) # Precautionary, creation should be automatic joblib.dump(value=model, filename="./outputs/my_model.joblib")
def main(): # Add arguments to script parser = argparse.ArgumentParser() parser.add_argument( "--C", type=float, default=1.0, help= "Inverse of regularization strength. Smaller values cause stronger regularization", ) parser.add_argument( "--max_iter", type=int, default=100, help="Maximum number of iterations to converge", ) args = parser.parse_args() run.log("Regularization Strength:", np.float(args.C)) run.log("Max iterations:", np.int(args.max_iter)) ds = TabularDatasetFactory().from_delimited_files( path= "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv" ) x, y = clean_data(ds) # Split data into train and test sets. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=7) model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train) accuracy = model.score(x_test, y_test) run.log("Accuracy", np.float(accuracy)) pred_prob = model.predict_proba(x_test) auc_score = roc_auc_score(y_test, pred_prob[:, 1], average="weighted") run.log("AUC", np.float(auc_score)) # files saved in the "outputs" folder are automatically uploaded into run history os.makedirs("outputs", exist_ok=True) joblib.dump(model, "./outputs/model.joblib")
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    # Create TabularDataset using TabularDatasetFactory
    # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.dataset_factory.tabulardatasetfactory?view=azure-ml-py
    dataset_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=dataset_path)

    x, y = clean_data(ds)

    # Split data into train and test sets.
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=53)

    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    model = LogisticRegression(C=args.C, max_iter=args.max_iter, solver='lbfgs').fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    # Save model
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    # Create TabularDataset using TabularDatasetFactory
    ds = TabularDatasetFactory.from_delimited_files(
        path='https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv')

    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    # Create TabularDataset using TabularDatasetFactory
    path_train = "https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=path_train)

    data = ds.to_pandas_dataframe().dropna()
    y = data['Classification']
    x = data.drop("Classification", axis=1)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.261)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    predictions = model.predict(x_test)
    avg_prec_sc = average_precision_score(y_test, predictions, average='weighted')
    run.log("average_precision_score_weighted", float(avg_prec_sc))
def main():
    # Create TabularDataset using TabularDatasetFactory
    datapath = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(datapath)

    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y)

    run = Run.get_context()

    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    # Save model
    filename = "bankmarketing_model.pkl"
    output_dir = './outputs/model'
    os.makedirs(output_dir, exist_ok=True)
    full_path = os.path.join(output_dir, filename)
    joblib.dump(value=model, filename=full_path)
    print("model saved in {}".format(full_path))
def main():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00451/dataR2.csv'
    data = TabularDatasetFactory.from_delimited_files(url)
    x = data.to_pandas_dataframe()
    y = x.pop("Classification")

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=200)

    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    parser.add_argument('--solver', type=str, default='lbfgs',
                        help="Choose the algorithm used to train the model")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))
    run.log("Algorithm: ", args.solver)

    model = LogisticRegression(solver=args.solver, C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    pred_prob = model.predict_proba(x_test)
    AUC = roc_auc_score(y_test, pred_prob[:, 1])
    run.log("AUC", float(AUC))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    url_path = "https://raw.githubusercontent.com/maulingogri/Azure-Udacity-MLE-ND-Capstone/master/data/heart_failure_clinical_records_dataset.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=url_path)

    # Alternative: split the dataset directly into train and score sets
    # train, score = ds.random_split(percentage=0.75, seed=121)
    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=121)

    run = Run.get_context()

    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, os.path.join('outputs', 'hd_model.joblib'))

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    # Create TabularDataset using TabularDatasetFactory
    path_train = "https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=path_train)

    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    predictions = model.predict(x_test)
    avg_prec_sc = average_precision_score(y_test, predictions, average='weighted')
    run.log("average_precision_score_weighted", float(avg_prec_sc))
def main():
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv'
    data = TabularDatasetFactory.from_delimited_files(url)
    x = data.to_pandas_dataframe()
    y = x.pop("DEATH_EVENT")

    x_train, x_test, y_train, y_test = train_test_split(x, y)

    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    parser.add_argument('--solver', type=str, default='lbfgs',
                        help="Choose the algorithm used to train the model")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))
    run.log("Algorithm: ", args.solver)

    model = LogisticRegression(solver=args.solver, C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    web_path = "https://raw.githubusercontent.com/MonishkaDas/nd00333-capstone/master/starter_file/cardio_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(path=web_path, separator=";")

    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42, shuffle=True)

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
def infer_forecasting_dataset_tcn(X_test, y_test, model, output_path, output_dataset_name="results"):
    y_pred, df_all = model.forecast(X_test, y_test)

    run = Run.get_context()

    registered_train = TabularDatasetFactory.register_pandas_dataframe(
        df_all,
        target=(
            run.experiment.workspace.get_default_datastore(),
            datetime.now().strftime("%Y-%m-%d-") + str(uuid.uuid4())[:6],
        ),
        name=output_dataset_name,
    )

    df_all.to_csv(os.path.join(output_path, output_dataset_name + ".csv"), index=False)
def main():
    # Add arguments to the script
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    args = parser.parse_args()

    # Create TabularDataset using TabularDatasetFactory
    path_file = "https://raw.githubusercontent.com/hananeouhammouch/Parkinsons-detection/master/parkinsons.data"
    ds = TabularDatasetFactory.from_delimited_files(path=path_file)

    x, y = clean_data(ds)

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

    run = Run.get_context()
    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))
def create_dataset(ws):
    # Download the dataset from Kaggle (kaggle_api is assumed to be an
    # authenticated KaggleApi instance defined at module level).
    kaggle_api.dataset_download_file('divg07/malware-analysis-dataset', 'data.csv')
    data = pd.read_csv('./data.csv.zip', compression='zip', sep='|')

    # Clean dataset
    data = clean_data(data)

    # Register Dataset in Workspace
    datastore = Datastore(ws)
    name = "Malware Dataset"
    description_text = "Malware DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                                                              datastore,
                                                              name,
                                                              description=description_text)
    return dataset
def dataset_register_tabular(args):
    """
    Register a tabular dataset into the workspace
    """
    workspace = package_utils.get_workspace()
    datastore_path, target_path = datastore_upload_files(args)

    kwargs = {"path": datastore_path, "set_column_types": DATA_TYPES}
    logger.info(msg="TabularDatasetFactory.from_delimited_files",
                extra={"kwargs": kwargs})
    if not args.dry_run:
        tabular = TabularDatasetFactory.from_delimited_files(**kwargs)

    kwargs = {
        "workspace": workspace,
        "name": target_path,
        "create_new_version": False,
    }
    logger.info(msg="tabular.register", extra={"kwargs": kwargs})
    if not args.dry_run:
        _ = tabular.register(**kwargs)
def main():
    dataset_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
    ds = TabularDatasetFactory.from_delimited_files(dataset_path)

    x, y = clean_data(ds)

    run = Run.get_context()

    # Split data into train and test sets.
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=625, shuffle=True)

    # Add arguments to script: regularization strength, max iterations, and solver
    parser = argparse.ArgumentParser()
    parser.add_argument('--C', type=float, default=1.0,
                        help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100,
                        help="Maximum number of iterations to converge")
    parser.add_argument('--solver', type=str, default='lbfgs',
                        help="Choose the algorithm used to train the model")
    args = parser.parse_args()

    run.log("Regularization Strength:", float(args.C))
    run.log("Max iterations:", int(args.max_iter))

    model = LogisticRegression(solver=args.solver, C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", float(accuracy))

    os.makedirs('outputs', exist_ok=True)
    joblib.dump(model, 'outputs/model.joblib')


if __name__ == '__main__':
    main()
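# Most of these scripts persist the fitted model with joblib; loading it back
# for local inspection or scoring is symmetric. A minimal sketch, assuming a
# model was saved to outputs/model.joblib by one of the runs above:
import joblib

model = joblib.load("outputs/model.joblib")

# The loaded object is the fitted scikit-learn estimator and can be used
# directly, provided inputs carry the same feature columns used in training.
print(type(model))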