def train(self, model_name, hyperparams):
    """End-to-end training: open MLflow run, prep data, build pipeline, fit, evaluate.

    Returns self so calls can be chained.
    """
    # Open an MLflow run first so subsequent params/metrics attach to it.
    self.mlflow_create_run()
    # Load and clean the training data.
    data = clean_data(get_data())
    # Record which estimator this run trains.
    self.mlflow_log_param("estimator", model_name)
    # Assemble the model pipeline around the requested estimator.
    self.pipeline = MyPipeline(self)
    estimator = self.create_estimator(model_name)
    self.model = self.pipeline.create_pipeline(estimator)
    # Holdout split, then fit and score.
    self.X_train, self.X_val, self.y_train, self.y_val = holdout(data)
    self.fit_model(model_name, hyperparams)
    self.evaluate()
    return self
def generate_submission_csv(nrows, kaggle_upload=False):
    """Predict fares for the Kaggle test set and write a submission CSV.

    nrows: number of test rows to load.
    kaggle_upload: when True, submit the CSV via the kaggle CLI
        (requires the CLI to be installed; defaults to False).
    """
    df_test = get_test_data(nrows)
    df_test = clean_data(df_test)
    # The model is fetched from cloud storage (not a local joblib file).
    pipeline = download_model()
    # A fitted GridSearchCV predicts through its refitted best_estimator_.
    if hasattr(pipeline, "best_estimator_"):
        y_pred = pipeline.best_estimator_.predict(df_test)
    else:
        y_pred = pipeline.predict(df_test)
    df_test["fare_amount"] = y_pred
    df_sample = df_test[["key", "fare_amount"]]
    # FIX: was an f-string with no placeholders (F541).
    name = "predictions_test_ex_10k.csv"
    df_sample.to_csv(name, index=False)
    print("prediction saved under kaggle format")
    # Set kaggle_upload to False unless you install kaggle cli
    if kaggle_upload:
        kaggle_message_submission = name[:-4]
        # Command is built from constants only, so shell injection is not a
        # concern here; os.system kept to preserve original behavior.
        command = f'kaggle competitions submit -c new-york-city-taxi-fare-prediction -f {name} -m "{kaggle_message_submission}"'
        os.system(command)
def run_grid_search(self):
    """Cross-validate the trainer's pipeline with GridSearchCV and return the search.

    NOTE(review): the parameter grid is empty, so this only cross-validates
    the default pipeline — confirm whether a real grid was intended.
    """
    param_grid = {}
    # Load, clean and split the data.
    dataset = clean_data(get_data())
    X, y = get_Xy(dataset)
    X_train, X_val, y_train, y_val = hold_out(X, y)
    # Borrow a Trainer to build the pipeline under search.
    pipeline_builder = Trainer(X_train, y_train)
    self.pipeline = pipeline_builder.set_pipeline()
    search = GridSearchCV(
        self.pipeline,
        param_grid,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    return search
def mlflow_log_metric(self, key, value):
    """Log one metric against the active MLflow run (presumably a Trainer method — confirm enclosing class)."""
    self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

# Candidate estimators. Only `rf` ends up in `models`, so the others are
# instantiated but never trained — NOTE(review): confirm this is intentional.
knn = KNeighborsRegressor()
rf = RandomForestRegressor()
svr = SVR()
lasso = Lasso()
ridge = Ridge()
models = [rf]

if __name__ == "__main__":
    # get data
    df = get_data()
    # clean data
    df = clean_data(df)
    # set X and y
    X = df.drop(columns=['fare_amount'])
    y = df['fare_amount']
    # hold out: 30% test split (no fixed random_state, so splits vary per run)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    for estimator in models:
        # train a fresh Trainer per estimator
        trainer = Trainer(X_train, y_train)
        trainer.run(estimator)
        # evaluation was left disabled in the original
        #trainer.evaluate(X_test, y_test, estimator)
        # save the trained model
        trainer.save_model(estimator)
# (continuation of set_pipeline — the ColumnTransformer construction begins
# outside this chunk; dist_pipe/time_pipe/preproc_pipe are built above)
            ('distance', dist_pipe, ["pickup_latitude", "pickup_longitude",
                                     'dropoff_latitude', 'dropoff_longitude']),
            ('time', time_pipe, ['pickup_datetime'])
        ], remainder="drop")  # any column not listed above is dropped
        # Preprocessing + linear model as one sklearn Pipeline.
        pipe = Pipeline([
            ('preproc', preproc_pipe),
            ('linear_model', LinearRegression())
        ])
        self.pipeline = pipe

    def run(self):
        '''Build the pipeline and fit it on the training data held on self (returns None).'''
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        '''returns the value of the RMSE'''
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        return rmse

if __name__ == "__main__":
    # 10k-row sample keeps this demo run fast.
    df = get_data(nrows=10_000)
    df = clean_data(df, test=False)
    y = df["fare_amount"]
    X = df.drop("fare_amount", axis=1)
    # 15% validation split (no fixed random_state: results vary per run).
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15)
    trainer = Trainer(X_train, y_train)
    trainer.run()
    rmse = trainer.evaluate(X_val, y_val)
    print(rmse)
joblib.dump(reg, 'model.joblib') # Implement here print("saved model.joblib locally") client = storage.Client() bucket = client.bucket(BUCKET_NAME) STORAGE_LOCATION = f'{MODEL_NAME}_{MODEL_VERSION}' blob = bucket.blob(STORAGE_LOCATION) blob.upload_from_filename('model.joblib') # Implement here print( f"uploaded model.joblib to gcp cloud storage under \n => {STORAGE_LOCATION}" ) if __name__ == "__main__": # get data df = get_data() # clean data df_cleaned = clean_data(df) # set X and y X = df_cleaned.drop(columns='fare_amount') y = df_cleaned.fare_amount # hold out X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # # train trainer = Trainer(X_train, y_train) trainer.run() # # evaluate print(trainer.evaluate(X_test, y_test)) print('TODO')
def test_cleaned_data():
    """Cleaning must never add rows: cleaned row count <= raw row count."""
    raw = get_data(nrows=100)
    cleaned = clean_data(raw)
    assert cleaned.shape[0] <= raw.shape[0]
# (continuation of set_pipeline — the final pipeline step is built from a
# `model` dict with "name" and "_class" keys defined outside this chunk)
            (model["name"], model["_class"]())])
        return self.pipeline

    def run(self):
        """Fit the already-built pipeline on self.X / self.y and return the fitted pipeline.

        NOTE(review): does not call set_pipeline itself — callers must build
        the pipeline first (the __main__ block below does).
        """
        return self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """Score the pipeline on a held-out set.

        NOTE(review): docstring originally claimed RMSE, but this calls
        `pipeline.score`, which for sklearn regressors is R^2 — confirm
        which metric was intended.
        """
        return self.pipeline.score(X_test, y_test)

if __name__ == "__main__":
    # get data
    raw_data = data.get_data()
    # clean data
    df = data.clean_data(raw_data)
    # set X and y
    target = "fare_amount"
    X = df.drop(columns=target)
    y = df[target]
    # hold out: 30% test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # train: build the pipeline, then fit it
    trainer = Trainer(X_train, y_train)
    model_ready = trainer.set_pipeline()
    model_trained = trainer.run()
    # evaluate
    model_eval = trainer.evaluate(X_test, y_test)
joblib.dump(model, f"model.joblib") #joblib.dump(model,'model.joblib') print("saved model.joblib locally") # Implement here self.upload_model_to_gcp() print( f"uploaded model.joblib to gcp cloud storage under \n => {self.STORAGE_LOCATION}" ) return self if __name__ == "__main__": # get data data = get_data() # clean data data = clean_data(data) # set X and y X, y = getXy(data, col_target="fare_amount") # hold out X_train, X_val, y_train, y_val = getholdout(X, y) # train trainer = Trainer(X_train, y_train) trainer.run() # evaluate score = trainer.evaluate(X_val, y_val) # joblib model = trainer.best_model trainer.save_model(score) print(trainer.best_params)
class Trainer():
    """Builds, trains and evaluates the taxi-fare pipeline, logging to MLflow."""

    def __init__(self, X, y):
        """
        X: pandas DataFrame of features
        y: pandas Series target (fare_amount)
        """
        self.pipeline = None
        self.X = X
        self.y = y
        self.experiment_name = EXPERIMENT_NAME

    @memoized_property
    def mlflow_client(self):
        """Lazily-created MLflow client bound to MLFLOW_URI."""
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        # Create the experiment on first use; fall back to a lookup when it
        # already exists (create_experiment raises in that case).
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

    def set_pipeline(self):
        """defines the pipeline as a class attribute"""
        distance_pipe = make_pipeline(DistanceTransformer(), StandardScaler())
        time_pipe = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore')
        )
        preprocessor = ColumnTransformer([
            ('distance_trans', distance_pipe,
             ['pickup_latitude', 'pickup_longitude',
              'dropoff_latitude', 'dropoff_longitude']),
            ('time_trans', time_pipe, ['pickup_datetime'])])
        model_pipeline = Pipeline(steps=[('preprocessing', preprocessor),
                                         ('regressor', LinearRegression())])
        self.pipeline = model_pipeline
        return self

    def run(self):
        """set and train the pipeline"""
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)
        return self

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the RMSE"""
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        # BUG FIX: was `trainer.mlflow_experiment_id` (a module-level global),
        # which breaks whenever evaluate() is called outside this script.
        print(f'ID:{self.mlflow_experiment_id}')
        # BUG FIX: the estimator step is named 'regressor', not 'model';
        # get_params()['model'] raised KeyError in the original.
        self.mlflow_log_param('model',
                              str(self.pipeline.get_params()['regressor'])
                              .strip('()'))
        self.mlflow_log_metric('rmse', rmse)
        return rmse


if __name__ == "__main__":
    # get & clean data
    data = clean_data(get_data())
    # set X and y
    X = data.drop(columns=['fare_amount'])
    y = data['fare_amount']
    # hold out: 20% test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    trainer = Trainer(X_train, y_train)
    trainer.run()
    # Evaluate once; the original evaluated twice, logging the param/metric
    # to MLflow twice on the same run.
    print(trainer.evaluate(X_test, y_test))
def run(self):
    """set and train the pipeline"""
    self.set_pipeline()
    self.pipeline.fit(self.X, self.y)

def evaluate(self, X_test, y_test):
    """Compute and print the RMSE of the pipeline on a held-out set."""
    y_pred = self.pipeline.predict(X_test)
    rmse = compute_rmse(y_pred, y_test)
    print(f"rmse = {rmse}")

if __name__ == "__main__":
    # Get and clean data.
    df = get_data()
    # BUG FIX: clean_data returns the cleaned frame (every other call site
    # in this project assigns it: `df = clean_data(df)`); the original
    # discarded the return value and trained on uncleaned data.
    df = clean_data(df)
    # Get features; pop removes the target column from df in place.
    y = df.pop("fare_amount")
    X = df
    # Holdout: 20% test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # Train.
    trainer = Trainer(X_train, y_train)
    trainer.run()
    # Evaluate (prints the RMSE).
    trainer.evaluate(X_test, y_test)
# (continuation of set_pipeline — prepro_pipe is built above, outside
# this chunk)
        model_pipe = Pipeline([('preprocessing', prepro_pipe),
                               ('model', LinearRegression())])
        self.pipeline = model_pipe

    def run(self):
        """Fit the pipeline on the training split held on self."""
        # NOTE(review): fits on self.X_train/self.y_train, but the script
        # below constructs Trainer(X, y) with no visible holdout — confirm
        # __init__ performs the split and defines X_train/X_test/y_test.
        self.pipeline.fit(self.X_train, self.y_train)

    def evaluate(self):
        """Print (not return) the RMSE of the pipeline on the held-out split."""
        print(compute_rmse(self.pipeline.predict(self.X_test), self.y_test))

if __name__ == "__main__":
    # get data and clean data
    df = clean_data(get_data())
    # set X and y; 'key' is kept in the features — presumably dropped by
    # the preprocessor's remainder handling — confirm
    features = [
        'key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude', 'passenger_count'
    ]
    X, y = df[features], df['fare_amount']
    # train
    trainer = Trainer(X, y)
    trainer.set_pipeline()
    trainer.run()
    # evaluate
    trainer.evaluate()
return self.mlflow_client.create_experiment(self.experiment_name) except BaseException: return self.mlflow_client.get_experiment_by_name( self.experiment_name).experiment_id @memoized_property def mlflow_run(self): return self.mlflow_client.create_run(self.mlflow_experiment_id) def mlflow_log_param(self, key, value): self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value) def mlflow_log_metric(self, key, value): self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value) if __name__ == "__main__": # get data df = get_data() # clean data df_clean = clean_data(df) # set X and y y = df.fare_amount X = df.drop(columns='fare_amount') # hold out X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # train model = Trainer(X_train, y_train) model.run() print(f"RMSE is {model.evaluate(X_test, y_test)}")