def train(self, model_name, hyperparams):
    """Run the full training workflow for *model_name* and return self (fluent)."""
    # Register a fresh MLflow run before anything gets logged.
    self.mlflow_create_run()
    # Fetch and clean the raw dataset.
    dataset = clean_data(get_data())
    # Record which estimator family this run uses.
    self.mlflow_log_param("estimator", model_name)
    # Build the sklearn pipeline around a dynamically created estimator.
    self.pipeline = MyPipeline(self)
    estimator = self.create_estimator(model_name)
    self.model = self.pipeline.create_pipeline(estimator)
    # Split into train / validation sets, then fit and score.
    self.X_train, self.X_val, self.y_train, self.y_val = holdout(dataset)
    self.fit_model(model_name, hyperparams)
    self.evaluate()
    return self
def test_time_encoder():
    """TimeFeaturesEncoder must expand pickup_datetime into exactly 4 features."""
    sample = get_data(nrows=1)
    features = sample.drop(columns='fare_amount')
    target = sample.fare_amount
    encoded = TimeFeaturesEncoder('pickup_datetime').fit_transform(features, target)
    assert encoded.shape[1] == 4, "shape[1] is not 4"
def test_distance_transformer():
    """DistanceTransformer must emit 'distance' as its first output column."""
    sample = get_data(nrows=1)
    features = sample.drop(columns='fare_amount')
    target = sample.fare_amount
    transformed = DistanceTransformer().fit_transform(features, target)
    assert transformed.columns[0] == 'distance', "column name is not distance"
def test_pipeline():
    """Trainer.set_pipeline must build a pipeline with exactly two steps."""
    sample = get_data(nrows=1)
    features = sample.drop(columns='fare_amount')
    target = sample.fare_amount
    trainer = Trainer(features, target)
    trainer.set_pipeline()
    assert len(trainer.pipeline.get_params()['steps']) == 2
def load_data(params):
    """Fetch and clean the taxi-fare dataset and return the (X, y) pair."""
    print("############ Loading Data ############")
    frame = clean_df(get_data(**params))
    target = frame["fare_amount"]
    features = frame.drop("fare_amount", axis=1)
    # Report feature-matrix size so callers can sanity-check the load.
    print("shape: {}".format(features.shape))
    print("size: {} Mb".format(features.memory_usage().sum() / 1e6))
    return features, target
def df_ny_box():
    """Return a cleaned 10k-row sample restricted to the NYC bounding box."""
    frame = clean_df(get_data(nrows=10000))
    # Keep only rides whose pickup and dropoff fall inside the NYC area.
    # NOTE(review): the dropoff_longitude lower bound (-74) differs from the
    # pickup_longitude one (-74.3) — confirm whether that asymmetry is intended.
    frame = frame[frame["pickup_latitude"].between(left=40, right=42)]
    frame = frame[frame["pickup_longitude"].between(left=-74.3, right=-72.9)]
    frame = frame[frame["dropoff_latitude"].between(left=40, right=42)]
    frame = frame[frame["dropoff_longitude"].between(left=-74, right=-72.9)]
    return frame
def train(self):
    """Fit the pipeline; optionally hold out 15% of rows for validation."""
    # Lazily pull rows from BigQuery when no DataFrame was injected up front.
    if not isinstance(self.df, pd.DataFrame):
        self.df = get_data(nrows=100000, bq=True)
    target = self.df["fare_amount"]
    features = self.df.drop("fare_amount", axis=1)
    # cf doc above: a truthy "split" kwarg triggers a validation hold-out.
    self.split = self.kwargs.get("split", self.TRAINING_NROWS)
    if self.split:
        features, val_features, target, val_target = train_test_split(
            features, target, test_size=0.15)
    print(features.shape)
    self.set_pipeline()
    self.pipeline.fit(features, target)
    # Score on the held-out slice when one exists.
    if self.split:
        rmse = self.evaluate(val_features, val_target)
def run_grid_search(self): grid = {} df = get_data() df = clean_data(df) X,y = get_Xy(df) X_train, X_val, y_train, y_val = hold_out(X,y) trainer = Trainer(X_train,y_train) self.pipeline = trainer.set_pipeline() search = GridSearchCV(self.pipeline, grid, scoring = 'neg_mean_squared_error', cv = 5, n_jobs=-1) search.fit(X_train,y_train) return search
def test_number_of_columns():
    """A 5-row sample must come back with exactly 8 columns."""
    sample = get_data(nrows=5)
    assert sample.shape == (5, 8)
    # NOTE(review): this chunk starts mid-method — the opening of the enclosing
    # pipeline-builder (and the class header) lies outside the visible lines.
            (model["name"], model["_class"]())])
        return self.pipeline

    def run(self):
        """set and train the pipeline"""
        # Fit the already-built pipeline on the training data held by self.
        return self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the RMSE"""
        # NOTE(review): Pipeline.score returns the estimator's default score
        # (R^2 for a regressor), not an RMSE — docstring and code disagree; confirm.
        return self.pipeline.score(X_test, y_test)


if __name__ == "__main__":
    # get data
    raw_data = data.get_data()
    # clean data
    df = data.clean_data(raw_data)
    # set X and y
    target = "fare_amount"
    X = df.drop(columns=target)
    y = df[target]
    # hold out: 70/30 train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # train
    trainer = Trainer(X_train, y_train)
    model_ready = trainer.set_pipeline()
    model_trained = trainer.run()
    # evaluate on the held-out split
    model_eval = trainer.evaluate(X_test, y_test)
if __name__ == "__main__":
    warnings.simplefilter(action='ignore', category=FutureWarning)
    # Run configuration: data volume, data source, estimator and MLflow logging.
    experiment = "taxifare_test_jean"
    params = {
        "nrows": 100000,
        "upload": True,
        "local": False,        # False => fetch data from GCP (Storage or BigQuery)
        "gridsearch": False,
        "optimize": False,
        "estimator": "xgboost",
        "mlflow": True,        # True => log params to mlflow
        "experiment_name": experiment,
    }
    print("############ Loading Data ############")
    df = clean_df(get_data(**params))
    y_train = df["fare_amount"]
    X_train = df.drop("fare_amount", axis=1)
    del df  # free the raw frame before training
    print("shape: {}".format(X_train.shape))
    print("size: {} Mb".format(X_train.memory_usage().sum() / 1e6))
    # Train, evaluate and persist the model (locally and remotely).
    trainer = Trainer(X=X_train, y=y_train, **params)
    del X_train, y_train  # the Trainer now owns the data
    print(colored("############ Training model ############", "red"))
    trainer.train()
    print(colored("############ Evaluating model ############", "blue"))
    trainer.evaluate()
    print(colored("############ Saving model ############", "green"))
    trainer.save_model()
    # NOTE(review): this chunk starts mid-method — the ColumnTransformer being
    # closed on the first line is opened outside the visible lines.
            ]), ('time', time_pipe, ['pickup_datetime'])], remainder="drop")
        self.pipeline = Pipeline([('preproc', preproc_pipe),
                                  ('linear_model', LinearRegression())])

    def run(self):
        """set and train the pipeline"""
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the RMSE"""
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        return rmse


if __name__ == "__main__":
    # Train on a 10k-row sample and report hold-out RMSE.
    N = 10_000
    df = get_data(nrows=N)
    df = clean_data(df)
    X = df.drop(columns=["fare_amount"])
    y = df["fare_amount"]
    from sklearn.model_selection import train_test_split
    # 70/30 hold-out split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    train = Trainer(X_train, y_train)
    train.run()
    rmse = train.evaluate(X_test, y_test)
    print(f"rmse: {rmse}")
def test_haversine():
    """The first sample row's haversine distance should round to 1.03."""
    sample = get_data(nrows=1)
    assert round(haversine_vectorized(sample)[0], 2) == 1.03, "Distance not right"
    # NOTE(review): this chunk starts mid-method — the pipeline assembly above
    # this point (and the class header) lies outside the visible lines.
        model_pipe = Pipeline([('preprocessing', prepro_pipe),
                               ('model', LinearRegression())])
        self.pipeline = model_pipe

    def run(self):
        """set and train the pipeline"""
        self.pipeline.fit(self.X_train, self.y_train)

    def evaluate(self):
        """evaluates the pipeline on df_test and return the RMSE"""
        # NOTE(review): relies on self.X_test / self.y_test being set elsewhere
        # (presumably in __init__) — not visible from this chunk; confirm.
        print(compute_rmse(self.pipeline.predict(self.X_test), self.y_test))


if __name__ == "__main__":
    # get data and clean data
    df = clean_data(get_data())
    # set X and y: explicit raw-feature whitelist
    features = [
        'key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
        'dropoff_longitude', 'dropoff_latitude', 'passenger_count'
    ]
    X, y = df[features], df['fare_amount']
    # train
    trainer = Trainer(X, y)
    trainer.set_pipeline()
    trainer.run()
    # evaluate
    trainer.evaluate()
    # NOTE(review): chunk starts mid-expression — the mlflow_experiment_id
    # property that the first line closes begins outside the visible lines.
            self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        # One MLflow run is created (and cached) per instance.
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)


if __name__ == "__main__":
    # get data
    df = get_data(10000)
    # clean data
    df_cleaned = clean_data(df)
    # set X and y
    target = "fare_amount"
    # NOTE(review): the feature list is derived from the *raw* df but applied
    # to df_cleaned — fine only if cleaning never drops columns; confirm.
    features = list(df.drop(columns=[target]).columns)
    X = df_cleaned[features]
    y = df_cleaned[target]
    # hold out (80/20, fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=0)
    # train XGBoost or Lasso
    model = Trainer('XGBoost', X_train, y_train)
    model.run()
def test_cleaned_data():
    """Cleaning must never add rows."""
    raw = get_data(nrows=100)
    assert clean_data(raw).shape[0] <= raw.shape[0]
    # NOTE(review): chunk starts mid-method — the def line of this run()-style
    # method lies outside the visible lines.
        self.set_pipeline()
        pipe_trained = self.pipe.fit(self.X, self.y)
        return pipe_trained

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test and return the RMSE"""
        # NOTE(review): calling self.run() here retrains the pipeline on every
        # evaluation — confirm this is intended rather than reusing a fit model.
        pipe_trained = self.run()
        y_pred = pipe_trained.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        return rmse


if __name__ == "__main__":
    n_rows = 10000
    df = get_data(n_rows)
    df = clean_data(df)
    y = df['fare_amount']
    X = df.drop(columns='fare_amount')
    # 70/30 hold-out split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    trainer = Trainer(X_train, y_train)
    trainer.run()
    rmse = trainer.evaluate(X_test, y_test)
    print(rmse)
    print('OK model trained')
class Trainer():
    """Builds, fits and evaluates the taxi-fare pipeline, logging to MLflow."""

    def __init__(self, X, y):
        """
        X: pandas DataFrame
        y: pandas Series
        """
        self.pipeline = None
        self.X = X
        self.y = y
        self.experiment_name = EXPERIMENT_NAME

    @memoized_property
    def mlflow_client(self):
        # Point the client at the shared tracking server before first use.
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        # Create the experiment, falling back to lookup when it already exists.
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        # One run is created (and cached) per Trainer instance.
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

    def set_pipeline(self):
        """defines the pipeline as a class attribute"""
        distance_pipe = make_pipeline(DistanceTransformer(), StandardScaler())
        time_pipe = make_pipeline(
            TimeFeaturesEncoder(time_column='pickup_datetime'),
            OneHotEncoder(handle_unknown='ignore'))
        preprocessor = ColumnTransformer([
            ('distance_trans', distance_pipe,
             ['pickup_latitude', 'pickup_longitude',
              'dropoff_latitude', 'dropoff_longitude']),
            ('time_trans', time_pipe, ['pickup_datetime'])])
        model_pipeline = Pipeline(steps=[('preprocessing', preprocessor),
                                         ('regressor', LinearRegression())])
        self.pipeline = model_pipeline
        return self

    def run(self):
        """set and train the pipeline"""
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)
        return self

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on df_test, logs to MLflow, returns the RMSE"""
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        # FIX: was `trainer.mlflow_experiment_id` — referencing the module-level
        # `trainer` global instead of self; broke any use outside the script below.
        print(f'ID:{self.mlflow_experiment_id}')
        # FIX: the estimator step is named 'regressor' (see set_pipeline), so
        # get_params()['model'] raised KeyError; log the actual estimator repr.
        self.mlflow_log_param(
            'model',
            str(self.pipeline.get_params()['regressor']).strip('()'))
        self.mlflow_log_metric('rmse', rmse)
        return rmse


if __name__ == "__main__":
    # get & clean data
    data = clean_data(get_data())
    # set X and y
    X = data.drop(columns=['fare_amount'])
    y = data['fare_amount']
    # hold out (80/20)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    # train
    trainer = Trainer(X_train, y_train)
    trainer.run()
    # FIX: evaluate() was called twice, double-logging params/metrics to MLflow;
    # evaluate once and print the returned RMSE.
    rmse = trainer.evaluate(X_test, y_test)
    print(rmse)
    # --- MLflow helpers (class interior; the class header is outside this chunk) ---

    @memoized_property
    def mlflow_run(self):
        # One run is created (and cached) per instance.
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)


if __name__ == "__main__":
    # get data
    df = get_data(nrows=10_000)
    # clean data
    df = clean_data(df)
    # set X and y
    X = df.drop(columns='fare_amount')
    y = df['fare_amount']
    # hold out (70/30)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # train
    pipe = Trainer(X_train, y_train)
    pipe.run()
    # NOTE(review): chunk starts mid-method — the def line of save_model (and
    # where the global `model` is expected to come from) is outside these lines.
        """ Save the trained model into a model.joblib file """
        # NOTE(review): f-string has no placeholder; reads the module-level
        # `model` set in the script block below — confirm for non-script use.
        joblib.dump(model, f"model.joblib")
        #joblib.dump(model,'model.joblib')
        print("saved model.joblib locally")
        # Implement here
        self.upload_model_to_gcp()
        print(
            f"uploaded model.joblib to gcp cloud storage under \n => {self.STORAGE_LOCATION}"
        )
        return self


if __name__ == "__main__":
    # get data
    data = get_data()
    # clean data
    data = clean_data(data)
    # set X and y
    X, y = getXy(data, col_target="fare_amount")
    # hold out
    X_train, X_val, y_train, y_val = getholdout(X, y)
    # train
    trainer = Trainer(X_train, y_train)
    trainer.run()
    # evaluate
    score = trainer.evaluate(X_val, y_val)
    # joblib: persist the best model found during training
    model = trainer.best_model
    trainer.save_model(score)
    print(trainer.best_params)
    def mlflow_log_metric(self, key, value):
        # Log a metric against this trainer's cached MLflow run.
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)


# Candidate estimators; only RandomForest is enabled in `models` right now.
knn = KNeighborsRegressor()
rf = RandomForestRegressor()
svr = SVR()
lasso = Lasso()
ridge = Ridge()
models = [rf]

if __name__ == "__main__":
    # get data
    df = get_data()
    # clean data
    df = clean_data(df)
    # set X and y
    X = df.drop(columns=['fare_amount'])
    y = df['fare_amount']
    # hold out (70/30)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    for estimator in models:
        # train a fresh Trainer per estimator
        trainer = Trainer(X_train, y_train)
        trainer.run(estimator)
        # evaluate
        #trainer.evaluate(X_test, y_test, estimator)
        # save
        trainer.save_model(estimator)
def read_data(n_rows=10000):
    """Fetch *n_rows* rows of the dataset from the remote (non-local) source."""
    # NOTE(review): this helper passes `n_rows=` while other call sites in the
    # file use `nrows=` — confirm the actual keyword of this file's get_data.
    return get_data(n_rows=n_rows, local=False)