示例#1
0
    def train(self, model_name, hyperparams):
        """Run one complete training cycle for the estimator named ``model_name``.

        Opens an MLflow run, loads and cleans the data, builds the pipeline
        around the requested estimator, splits the data with ``holdout``,
        fits with ``hyperparams`` and finally calls ``evaluate``.

        Args:
            model_name: name handed to ``create_estimator`` and ``fit_model``;
                also logged to MLflow under the key "estimator".
            hyperparams: forwarded unchanged to ``fit_model``.

        Returns:
            This instance, so calls can be chained.
        """
        # Open the MLflow run before anything gets logged.
        self.mlflow_create_run()

        # Load the data and clean it.
        df = clean_data(get_data())

        self.mlflow_log_param("estimator", model_name)

        # Build the pipeline around the requested estimator.
        self.pipeline = MyPipeline(self)
        estimator = self.create_estimator(model_name)
        self.model = self.pipeline.create_pipeline(estimator)

        # Keep the train/validation split on the instance for later steps.
        split = holdout(df)
        self.X_train, self.X_val, self.y_train, self.y_val = split

        self.fit_model(model_name, hyperparams)
        self.evaluate()
        return self
示例#2
0
def test_time_encoder():
    """TimeFeaturesEncoder on 'pickup_datetime' must produce exactly four columns."""
    sample = get_data(nrows=1)
    target = sample.fare_amount
    features = sample.drop(columns='fare_amount')
    encoder = TimeFeaturesEncoder('pickup_datetime')
    encoded = encoder.fit_transform(features, target)
    n_columns = encoded.shape[1]
    assert n_columns == 4, "shape[1] is not 4"
示例#3
0
def test_distance_transformer():
    """The first column produced by DistanceTransformer must be named 'distance'."""
    sample = get_data(nrows=1)
    target = sample.fare_amount
    features = sample.drop(columns='fare_amount')
    transformed = DistanceTransformer().fit_transform(features, target)
    first_column = transformed.columns[0]
    assert first_column == 'distance', "column name is not distance"
def test_pipeline():
    """After ``set_pipeline`` the trainer's pipeline must have exactly two steps."""
    sample = get_data(nrows=1)
    features = sample.drop(columns='fare_amount')
    target = sample.fare_amount
    trainer = Trainer(features, target)
    trainer.set_pipeline()
    steps = trainer.pipeline.get_params()['steps']
    assert len(steps) == 2
示例#5
0
def load_data(params):
    """Load and clean the dataset, then separate features from the target.

    Args:
        params: mapping of keyword arguments forwarded unchanged to ``get_data``.

    Returns:
        ``(X, y)`` where ``y`` is the "fare_amount" column and ``X`` is the
        cleaned frame without that column.
    """
    print("############   Loading Data   ############")
    cleaned = clean_df(get_data(**params))
    y = cleaned["fare_amount"]
    X = cleaned.drop("fare_amount", axis=1)
    # Report the dimensions and memory footprint of the feature frame.
    print("shape: {}".format(X.shape))
    megabytes = X.memory_usage().sum() / 1e6
    print("size: {} Mb".format(megabytes))
    return X, y
示例#6
0
def df_ny_box():
    """Return cleaned data (loaded with ``nrows=10000``) restricted to a coordinate box.

    Rows are kept only when each pickup/dropoff coordinate lies between the
    bounds listed below, applied one column after another.
    """
    df = clean_df(get_data(nrows=10000))
    # (column, lower bound, upper bound) passed to ``Series.between``.
    bounds = (
        ("pickup_latitude", 40, 42),
        ("pickup_longitude", -74.3, -72.9),
        ("dropoff_latitude", 40, 42),
        # NOTE(review): lower bound is -74 here but -74.3 for pickup - confirm intended.
        ("dropoff_longitude", -74, -72.9),
    )
    for column, low, high in bounds:
        df = df[df[column].between(left=low, right=high)]
    return df
示例#7
0
 def train(self):
     """Fit the pipeline on ``self.df``, optionally holding out 15% for validation.

     When ``self.df`` is not a DataFrame, data is first loaded through
     ``get_data(nrows=100000, bq=True)``. If ``self.split`` is truthy the
     pipeline is fitted on the training part only and ``evaluate`` is then
     called on the validation part.
     """
     if not isinstance(self.df, pd.DataFrame):
         self.df = get_data(nrows=100000, bq=True)
     target = self.df["fare_amount"]
     features = self.df.drop("fare_amount", axis=1)
     # Falls back to self.TRAINING_NROWS when no "split" kwarg was supplied.
     self.split = self.kwargs.get("split", self.TRAINING_NROWS)
     if self.split:
         features, X_val, target, y_val = train_test_split(
             features, target, test_size=0.15)
     print(features.shape)
     self.set_pipeline()
     self.pipeline.fit(features, target)
     if self.split:
         # The returned score is not used by this method.
         self.evaluate(X_val, y_val)
示例#8
0
 def run_grid_search(self):
     """Run a 5-fold grid search over the Trainer pipeline and return the fitted search.

     The parameter grid is currently empty. The pipeline built by
     ``Trainer.set_pipeline`` is stored on ``self.pipeline`` and searched with
     'neg_mean_squared_error' scoring on the training part of the hold-out split.

     Returns:
         The ``GridSearchCV`` object after ``fit``.
     """
     cleaned = clean_data(get_data())
     X, y = get_Xy(cleaned)

     # Only the training part of the hold-out split is used by the search.
     X_train, _, y_train, _ = hold_out(X, y)
     self.pipeline = Trainer(X_train, y_train).set_pipeline()

     param_grid = {}
     search = GridSearchCV(
         self.pipeline,
         param_grid,
         scoring='neg_mean_squared_error',
         cv=5,
         n_jobs=-1,
     )
     search.fit(X_train, y_train)
     return search
def test_number_of_columns():
    """Loading five rows must give a frame of five rows by eight columns."""
    sample = get_data(nrows=5)
    expected_shape = (5, 8)
    assert sample.shape == expected_shape
示例#10
0
                                  (model["name"], model["_class"]())])

        return self.pipeline

    def run(self):
        """Fit ``self.pipeline`` on the stored ``X``/``y`` and return what ``fit`` returns.

        The pipeline is not created here; it must already be set on the instance.
        """
        features, target = self.X, self.y
        fitted = self.pipeline.fit(features, target)
        return fitted

    def evaluate(self, X_test, y_test):
        """Evaluate the fitted pipeline on a test set and return the RMSE.

        The previous implementation returned ``self.pipeline.score(...)``,
        which is the estimator's default score rather than the RMSE promised
        here, so the error is now computed from the predictions.

        Args:
            X_test: features accepted by ``self.pipeline.predict``.
            y_test: true target values, one per row of ``X_test``.

        Returns:
            The root mean squared error as a float.
        """
        import numpy as np  # local import keeps this method self-contained

        y_pred = np.asarray(self.pipeline.predict(X_test), dtype=float).ravel()
        y_true = np.asarray(y_test, dtype=float).ravel()
        return float(np.sqrt(np.mean((y_pred - y_true) ** 2)))


if __name__ == "__main__":
    # Load the raw dataset, then clean it.
    raw_data = data.get_data()
    df = data.clean_data(raw_data)

    # Separate the target column from the features.
    target = "fare_amount"
    y = df[target]
    X = df.drop(columns=target)

    # Keep 30% of the rows aside for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3)

    # Build the pipeline and fit it on the training part.
    trainer = Trainer(X_train, y_train)
    model_ready = trainer.set_pipeline()
    model_trained = trainer.run()

    # Score the fitted pipeline on the held-out part.
    model_eval = trainer.evaluate(X_test, y_test)
示例#11
0
if __name__ == "__main__":
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # Run configuration; the same mapping is passed to get_data and to Trainer.
    experiment = "taxifare_test_jean"
    params = {
        "nrows": 100000,
        "upload": True,
        "local": False,  # False: fetch the data from GCP (Storage or BigQuery)
        "gridsearch": False,
        "optimize": False,
        "estimator": "xgboost",
        "mlflow": True,  # True: log params to mlflow
        "experiment_name": experiment,
    }

    print("############   Loading Data   ############")
    df = clean_df(get_data(**params))
    X_train = df.drop("fare_amount", axis=1)
    y_train = df["fare_amount"]
    del df  # the full frame is no longer needed once X/y are split out
    print("shape: {}".format(X_train.shape))
    print("size: {} Mb".format(X_train.memory_usage().sum() / 1e6))

    # Hand the data to the trainer, then drop the script-level references.
    t = Trainer(X=X_train, y=y_train, **params)
    del X_train, y_train

    print(colored("############  Training model   ############", "red"))
    t.train()

    print(colored("############  Evaluating model ############", "blue"))
    t.evaluate()

    print(colored("############   Saving model    ############", "green"))
    t.save_model()
示例#12
0
        ]), ('time', time_pipe, ['pickup_datetime'])],
                                         remainder="drop")

        self.pipeline = Pipeline([('preproc', preproc_pipe),
                                  ('linear_model', LinearRegression())])

    def run(self):
        """Create the pipeline via ``set_pipeline`` and fit it on ``self.X`` / ``self.y``."""
        self.set_pipeline()
        features, target = self.X, self.y
        self.pipeline.fit(features, target)

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and return ``compute_rmse`` of the predictions vs ``y_test``."""
        predictions = self.pipeline.predict(X_test)
        return compute_rmse(predictions, y_test)


if __name__ == "__main__":
    from sklearn.model_selection import train_test_split

    # Load N rows and clean them.
    N = 10_000
    df = clean_data(get_data(nrows=N))

    # Target column versus everything else.
    y = df["fare_amount"]
    X = df.drop(columns=["fare_amount"])

    # 70/30 hold-out, then train and score.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3)
    train = Trainer(X_train, y_train)
    train.run()
    rmse = train.evaluate(X_test, y_test)
    print(f"rmse: {rmse}")
示例#13
0
def test_haversine():
    """The first haversine distance of a one-row sample must round to 1.03."""
    sample = get_data(nrows=1)
    first_distance = haversine_vectorized(sample)[0]
    assert round(first_distance, 2) == 1.03, "Distance not right"
示例#14
0
        model_pipe = Pipeline([('preprocessing', prepro_pipe),
                               ('model', LinearRegression())])

        self.pipeline = model_pipe

    def run(self):
        """Fit ``self.pipeline`` on ``self.X_train`` / ``self.y_train``.

        The pipeline is not created here; it must already be set on the instance.
        """
        features, target = self.X_train, self.y_train
        self.pipeline.fit(features, target)

    def evaluate(self):
        """Evaluate the pipeline on ``self.X_test`` / ``self.y_test``.

        The RMSE is printed, as before, and is now also returned so callers
        can use the value (previously the method returned None despite
        documenting a return value).

        Returns:
            The value computed by ``compute_rmse`` for the predictions on
            ``self.X_test`` against ``self.y_test``.
        """
        y_pred = self.pipeline.predict(self.X_test)
        rmse = compute_rmse(y_pred, self.y_test)
        print(rmse)
        return rmse


if __name__ == "__main__":
    # Load the data and clean it.
    df = get_data()
    df = clean_data(df)

    # Feature columns handed to the trainer; the target is 'fare_amount'.
    features = [
        'key',
        'pickup_datetime',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
        'passenger_count',
    ]
    X = df[features]
    y = df['fare_amount']

    # Build the pipeline, fit it, then print the evaluation.
    trainer = Trainer(X, y)
    trainer.set_pipeline()
    trainer.run()
    trainer.evaluate()
示例#15
0
                self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        """Create a run under this trainer's MLflow experiment and return it."""
        experiment_id = self.mlflow_experiment_id
        return self.mlflow_client.create_run(experiment_id)

    def mlflow_log_param(self, key, value):
        """Log the parameter ``key`` = ``value`` on this trainer's MLflow run."""
        client = self.mlflow_client
        run_id = self.mlflow_run.info.run_id
        client.log_param(run_id, key, value)

    def mlflow_log_metric(self, key, value):
        """Log the metric ``key`` = ``value`` on this trainer's MLflow run."""
        client = self.mlflow_client
        run_id = self.mlflow_run.info.run_id
        client.log_metric(run_id, key, value)


if __name__ == "__main__":
    # Load the data, then produce a cleaned copy.
    df = get_data(10000)
    df_cleaned = clean_data(df)

    # Every column except the target is a feature.
    # NOTE(review): the column list is taken from the raw frame, not the
    # cleaned one - fine as long as cleaning keeps the same columns; confirm.
    target = "fare_amount"
    features = list(df.drop(columns=[target]).columns)
    y = df_cleaned[target]
    X = df_cleaned[features]

    # 80/20 hold-out with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # Train; the first argument names the estimator (XGBoost or Lasso).
    model = Trainer('XGBoost', X_train, y_train)
    model.run()
示例#16
0
def test_cleaned_data():
    """Cleaning must never leave more rows than the input frame has."""
    raw = get_data(nrows=100)
    rows_after = clean_data(raw).shape[0]
    rows_before = raw.shape[0]
    assert rows_after <= rows_before
示例#17
0
        self.set_pipeline()
        pipe_trained = self.pipe.fit(self.X, self.y)
        return pipe_trained

    def evaluate(self, X_test, y_test):
        """Return ``compute_rmse`` of the pipeline's predictions on ``X_test`` vs ``y_test``.

        NOTE(review): the pipeline is obtained from ``self.run()`` on every
        call, so each evaluation appears to rebuild and refit it - confirm
        this is intended.
        """
        predictions = self.run().predict(X_test)
        return compute_rmse(predictions, y_test)


if __name__ == "__main__":
    # Load and clean the requested number of rows.
    n_rows = 10000
    df = clean_data(get_data(n_rows))

    # Features versus target.
    X = df.drop(columns='fare_amount')
    y = df['fare_amount']

    # 70/30 hold-out.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3)

    # Train, then score on the held-out part.
    trainer = Trainer(X_train, y_train)
    trainer.run()
    rmse = trainer.evaluate(X_test, y_test)

    print(rmse)
    print('OK model trained')
示例#18
0
class Trainer():
    """Build, fit and evaluate the fare regression pipeline, logging results to MLflow."""

    def __init__(self, X, y):
        """
            X: pandas DataFrame
            y: pandas Series
        """
        self.pipeline = None
        self.X = X
        self.y = y
        self.experiment_name = EXPERIMENT_NAME

    @memoized_property
    def mlflow_client(self):
        """Point MLflow at ``MLFLOW_URI`` and return a client."""
        mlflow.set_tracking_uri(MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        """Return the id of the experiment named ``self.experiment_name``.

        Creation is attempted first; if it fails, the experiment is looked up
        by name instead.
        """
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except Exception:
            # Presumably the experiment already exists - fall back to a lookup.
            # ``Exception`` (not ``BaseException``) so Ctrl-C still interrupts.
            return self.mlflow_client.get_experiment_by_name(self.experiment_name).experiment_id

    @memoized_property
    def mlflow_run(self):
        """Create a run under this trainer's experiment and return it."""
        return self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        """Log the parameter ``key`` = ``value`` on this trainer's run."""
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        """Log the metric ``key`` = ``value`` on this trainer's run."""
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

    def set_pipeline(self):
        """Define the pipeline as an instance attribute and return ``self``.

        The pipeline has a 'preprocessing' step (distance and time features)
        followed by a 'regressor' step (``LinearRegression``).
        """
        distance_pipe = make_pipeline(DistanceTransformer(), StandardScaler())

        time_pipe = make_pipeline(
            TimeFeaturesEncoder(time_column = 'pickup_datetime'),
            OneHotEncoder(handle_unknown = 'ignore')
            )

        preprocessor = ColumnTransformer([
            ('distance_trans', distance_pipe, ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']),
            ('time_trans', time_pipe, ['pickup_datetime'])])

        model_pipeline = Pipeline(steps = [('preprocessing', preprocessor),
                                            ('regressor', LinearRegression())])

        self.pipeline = model_pipeline

        return self

    def run(self):
        """Set the pipeline, fit it on ``self.X`` / ``self.y`` and return ``self``."""
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)
        return self

    def evaluate(self, X_test, y_test):
        """Evaluate the pipeline on the test data and return the RMSE.

        Also prints the MLflow experiment id and logs the regressor name
        (param 'model') and the score (metric 'rmse') to MLflow.
        """
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        # Use ``self`` here: the method must not depend on a global ``trainer``.
        print(f'ID:{self.mlflow_experiment_id}')
        # The final step is registered as 'regressor' in set_pipeline.
        regressor = self.pipeline.get_params()['regressor']
        self.mlflow_log_param('model', str(regressor).strip('()'))
        self.mlflow_log_metric('rmse', rmse)
        return rmse


if __name__ == "__main__":
    # get & clean data
    data = clean_data(get_data())

    # set X and y
    X = data.drop(columns = ['fare_amount'])
    y = data['fare_amount']

    # hold out
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

    # train
    trainer = Trainer(X_train, y_train)
    trainer.run()

    # Evaluate once and reuse the result: each evaluate() call logs to
    # MLflow, so calling it a second time just to print duplicated the logs.
    rmse = trainer.evaluate(X_test, y_test)
    print(rmse)
示例#19
0
    @memoized_property
    def mlflow_run(self):
        """Create a run under this trainer's MLflow experiment and return it."""
        experiment_id = self.mlflow_experiment_id
        return self.mlflow_client.create_run(experiment_id)

    def mlflow_log_param(self, key, value):
        """Log the parameter ``key`` = ``value`` on this trainer's MLflow run."""
        client = self.mlflow_client
        run_id = self.mlflow_run.info.run_id
        client.log_param(run_id, key, value)

    def mlflow_log_metric(self, key, value):
        """Log the metric ``key`` = ``value`` on this trainer's MLflow run."""
        client = self.mlflow_client
        run_id = self.mlflow_run.info.run_id
        client.log_metric(run_id, key, value)


if __name__ == "__main__":

    # Load 10,000 rows and clean them.
    df = clean_data(get_data(nrows=10_000))

    # Target column versus the remaining columns.
    y = df['fare_amount']
    X = df.drop(columns='fare_amount')

    # 70/30 hold-out.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3)

    # Fit the trainer on the training part.
    pipe = Trainer(X_train, y_train)
    pipe.run()
示例#20
0
        """ Save the trained model into a model.joblib file """
        joblib.dump(model, f"model.joblib")
        #joblib.dump(model,'model.joblib')
        print("saved model.joblib locally")

        # Implement here
        self.upload_model_to_gcp()
        print(
            f"uploaded model.joblib to gcp cloud storage under \n => {self.STORAGE_LOCATION}"
        )
        return self


if __name__ == "__main__":
    # Load the data and clean it.
    data = clean_data(get_data())

    # Features / target, then the hold-out split.
    X, y = getXy(data, col_target="fare_amount")
    X_train, X_val, y_train, y_val = getholdout(X, y)

    # Train on the training part and score on the validation part.
    trainer = Trainer(X_train, y_train)
    trainer.run()
    score = trainer.evaluate(X_val, y_val)

    # Persist the result.
    # NOTE(review): ``model`` is bound at module level and ``save_model`` is
    # given ``score`` - verify which object save_model actually dumps.
    model = trainer.best_model
    trainer.save_model(score)

    print(trainer.best_params)
示例#21
0
    def mlflow_log_metric(self, key, value):
        """Log the metric ``key`` = ``value`` on this trainer's MLflow run."""
        client = self.mlflow_client
        run_id = self.mlflow_run.info.run_id
        client.log_metric(run_id, key, value)


# One default-configured instance of each candidate regressor.
knn = KNeighborsRegressor()
rf = RandomForestRegressor()
svr = SVR()
lasso = Lasso()
ridge = Ridge()

# Estimators iterated over by the __main__ block; only the random forest is enabled.
models = [rf]

if __name__ == "__main__":
    # Load the data and clean it.
    df = clean_data(get_data())

    # Target column versus the remaining columns.
    y = df['fare_amount']
    X = df.drop(columns=['fare_amount'])

    # 70/30 hold-out shared by every estimator.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3)

    for estimator in models:
        # A fresh trainer per estimator: fit, then save.
        # Evaluation on the held-out part is currently disabled.
        trainer = Trainer(X_train, y_train)
        trainer.run(estimator)
        trainer.save_model(estimator)
def read_data(n_rows=10000):
    """Return the result of ``get_data`` called with ``n_rows`` rows and ``local=False``.

    NOTE(review): the keyword passed on is ``n_rows``; verify it matches the
    parameter name that ``get_data`` actually declares.
    """
    return get_data(n_rows=n_rows, local=False)