def model_predict(year, month, day, country='all'):
    """
    Make predictions based on a country.
    """
    ## start timer for runtime
    time_start = time.time()

    ## load all data and engineer features for every country
    ts = load_ts()
    eng_datasets = {c: engineer_features(ts[c], training=False)
                    for c in ts.keys()}

    ## load all models
    models = model_load()

    ## check that a model exists for this country
    if country not in models:
        raise Exception(
            f"ERROR: (model_predict) model for country '{country}' is unavailable.")

    ## check that a dataset exists for this country
    if country not in eng_datasets:
        raise Exception(
            f"ERROR: (model_predict) dataset for country '{country}' is unavailable.")

    ## select the data and model for this country
    model = models[country]
    eng_dataset = eng_datasets[country]

    ## build the target date string (zero-padded month and day)
    target_date = f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}"
    print(target_date)

    ## data to predict on
    X_pred = eng_dataset[eng_dataset['dates'] == target_date].drop(
        ['target', 'dates'], axis=1)

    ## prediction
    y_pred = model.predict(X_pred)
    _update_predict_log(tag=country, y_pred=y_pred, target_date=target_date,
                        MODEL_VERSION=MODEL_VERSION,
                        MODEL_VERSION_NOTE=MODEL_VERSION_NOTE)

    return y_pred
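
## Usage sketch (hedged): predict revenue for a single day. This assumes a
## model has been trained and saved for 'all' and that the engineered
## dataset contains the target date; the date below is illustrative only.
def example_predict():
    y_pred = model_predict(2019, 7, 1, country='all')
    print(f"predicted value(s): {y_pred}")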
def model_load(prefix='sl', data_dir=None, training=True, save_pickle=False):
    """
    example function to load model

    The prefix allows the loading of different models
    """
    if not data_dir:
        data_dir = os.path.join(".", "data", "cs-train")

    ## find saved models matching the prefix
    models = [f for f in os.listdir(os.path.join(".", "models"))
              if re.search(prefix, f)]

    if len(models) == 0:
        raise Exception(
            f"Models with prefix '{prefix}' cannot be found; did you train?")

    all_models = {}
    for model in models:
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(".", "models", model))

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    if save_pickle:
        version_ = re.sub(r"\.", "_", str(MODEL_VERSION))
        pickle_path = os.path.join("models", f"all_data_model-{version_}.pickle")
        with open(pickle_path, "wb") as f:
            pickle.dump((all_data, all_models), f)
        print('Pickle file saved.')

    return (all_data, all_models)
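
## Usage sketch (hedged): load data and models once at start-up so request
## handlers can reuse them. The function name is an illustrative assumption,
## not part of the original module.
def warm_start_cache(prefix='sl', data_dir=None):
    all_data, all_models = model_load(prefix=prefix, data_dir=data_dir)
    print(f"loaded {len(all_models)} models for: {sorted(all_models.keys())}")
    return all_data, all_models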
def model_compare(data_dir, country='United Kingdom'):
    '''
    Train all models for one country using grid search and return a
    DataFrame that compares the performance of the models.
    '''
    print('Ingesting data')
    df = fetch_data(data_dir)
    df_country = convert_to_ts(df, country)
    X, y, dates = engineer_features(df_country)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, shuffle=True, random_state=42)

    ## define all pipelines and the param grids
    pipe_lr = Pipeline(steps=[('scaler', StandardScaler()), ('lr', ElasticNet())])
    pipe_sgd = Pipeline(steps=[('scaler', StandardScaler()), ('sgd', SGDRegressor())])
    pipe_svr = Pipeline(steps=[('scaler', StandardScaler()), ('svr', SVR())])
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])
    pipe_gbt = Pipeline(steps=[('scaler', StandardScaler()),
                               ('gbt', GradientBoostingRegressor())])

    param_grid_lr = {
        'lr__max_iter': [10000],
        'lr__alpha': np.logspace(-3, 0, 5),
        'lr__l1_ratio': np.linspace(0, 1, 5)
    }
    param_grid_sgd = {
        'sgd__penalty': ['elasticnet'],
        'sgd__alpha': np.logspace(-4, 1, 5),
        'sgd__l1_ratio': np.linspace(0, 1, 5),
        'sgd__max_iter': np.linspace(50, 250, 5, dtype='int'),
        'sgd__learning_rate': ['optimal', 'invscaling']
    }
    param_grid_svr = {
        'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'svr__C': np.logspace(-2, 2, 5),
        'svr__gamma': np.logspace(-3, 0, 4),
    }
    param_grid_rf = {
        'rf__n_estimators': np.linspace(25, 100, 4, dtype='int'),
        'rf__max_depth': np.linspace(6, 15, 4, dtype='int'),
        'rf__min_samples_split': np.linspace(2, 8, 4, dtype='int')
    }
    param_grid_gbt = {
        'gbt__learning_rate': np.logspace(-3, -1.5, 5),
        'gbt__n_estimators': np.linspace(25, 100, 4, dtype='int'),
        'gbt__max_depth': np.linspace(6, 15, 4, dtype='int'),
        'gbt__min_samples_split': np.linspace(2, 8, 4, dtype='int'),
    }

    all_pipes = {
        pipe_lr: param_grid_lr,
        pipe_sgd: param_grid_sgd,
        pipe_svr: param_grid_svr,
        pipe_rf: param_grid_rf,
        pipe_gbt: param_grid_gbt
    }

    ## train each model
    time_start_all = time.time()
    results = []
    for pipe in all_pipes:
        time_start = time.time()
        pipe_name = '->'.join([step[0] for step in pipe.steps])
        print(f'Training {pipe_name}')
        grid = GridSearchCV(pipe, param_grid=all_pipes[pipe], cv=5,
                            n_jobs=-1, verbose=0)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)
        run_time = time.time() - time_start
        rmse = mse(y_test, y_pred, squared=False)

        ## divide the total run time by the number of possible param combinations
        comb = 1
        for param in all_pipes[pipe].values():
            comb = comb * len(param)
        avg_time = run_time / comb

        results.append([pipe_name, rmse, run_time, avg_time, grid.best_params_])

    run_time_all = time.time() - time_start_all
    print(f'Training finished! Total training time {round(run_time_all)}s')

    df = pd.DataFrame(results, columns=[
        'pipeline', 'test_rmse', 'total_time', 'avg_time', 'best_params'
    ])
    return df
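
## Usage sketch (hedged): run the comparison and rank pipelines by hold-out
## RMSE. The data directory path is an illustrative assumption.
def example_compare(data_dir=os.path.join('.', 'data', 'cs-train')):
    comparison = model_compare(data_dir, country='United Kingdom')
    print(comparison.sort_values('test_rmse').to_string(index=False))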
def _model_train(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, shuffle=True, random_state=42)

    ## train a random forest model
    param_grid_rf = {
        'rf__n_estimators': np.linspace(25, 100, 4, dtype='int'),
        'rf__max_depth': np.linspace(6, 15, 4, dtype='int'),
        'rf__min_samples_split': np.linspace(2, 8, 4, dtype='int')
    }
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])
    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mse(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR, f"test-{tag}-{model_name}.joblib")
        print(f"... saving test version of model: {saved_model}")
    else:
        saved_model = os.path.join(MODEL_DIR, f"sl-{tag}-{model_name}.joblib")
        print(f"... saving model: {saved_model}")
    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update the train log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
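
## Driver sketch (hedged): train one model per country so that model_load
## can later discover them by country tag. `fetch_ts` and `MODEL_DIR` are
## assumed to exist as used elsewhere in this module; this wrapper is an
## illustration, not the original training entry point.
def model_train(data_dir, test=False):
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)
    for country, df in fetch_ts(data_dir).items():
        _model_train(df, country, test=test)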
def _model_train(dataset, tag, test=TEST):
    """
    Train models and select the best one out of DecisionTreeRegressor,
    RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
    and XGBRegressor, using the engineered time-series datasets.
    """
    ## start timer for runtime
    time_start = time.time()

    dataset = engineer_features(dataset, training=True)
    X = dataset.drop(['target', 'dates'], axis=1)
    y = dataset.target

    ## train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=0)

    ## parameter grids, one per regressor
    params = {
        'DTR_P': {'criterion': ['mse', 'mae', 'friedman_mse'],
                  'max_depth': [None, 10, 20, 50],
                  'max_features': ['auto', 'sqrt', 'log2']},
        'RFR_P': {'criterion': ['mse', 'mae'],
                  'max_features': ['auto', 'sqrt']},
        'GBR_P': {'loss': ['ls', 'lad', 'huber', 'quantile'],
                  'learning_rate': [0.1, 0.01, 0.001]},
        'ADA_P': {'loss': ['linear', 'square'],
                  'learning_rate': [0.05, 0.1, 0.01]},
        'XGB_P': {'learning_rate': [0.05, 0.1, 0.01],
                  'max_depth': [1, 5, 50],
                  'n_estimators': [100, 500, 1000]}
    }

    regressor_dict = {
        'DTR': DecisionTreeRegressor(random_state=42),
        'RFR': RandomForestRegressor(random_state=42),
        'GBR': GradientBoostingRegressor(random_state=42),
        'ADA': AdaBoostRegressor(random_state=42),
        'XGB': xgb.XGBRegressor(random_state=42)
    }

    ## grid-search each regressor
    models = {}
    for model_name in regressor_dict:
        grid = GridSearchCV(regressor_dict[model_name],
                            param_grid=params[model_name + '_P'],
                            cv=5)
        grid.fit(X_train, y_train)
        models[model_name] = grid

    ## evaluate each model on the hold-out set to find the optimal one
    model_scores = []
    for model in models:
        y_pred = models[model].predict(X_test)
        rmse = np.sqrt(mse(y_test, y_pred))
        model_scores.append(rmse)

    model_index = np.argmin(model_scores)
    model_score = min(model_scores)
    model_name = list(models.keys())[model_index]
    best_model = list(models.values())[model_index]
    print(f'The best model for {tag} is {model_name}.')

    ## retrain the best model on all data
    best_model.fit(X, y)

    ## save the model
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)
    if test:
        saved_model = os.path.join(MODEL_DIR, f'test-{tag}-{model_name}.joblib')
    else:
        saved_model = os.path.join(MODEL_DIR, f'sl-{tag}-{model_name}.joblib')
    joblib.dump(best_model, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update the train log
    _update_train_log(tag, best_model, model_index, model_score, dataset.shape,
                      runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test)
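
## Driver sketch (hedged): train and select a best model for every country.
## `load_ts` is assumed to return a dict of country -> DataFrame, as it is
## used in model_predict above; illustrative only.
def model_train_all(test=TEST):
    for country, df in load_ts().items():
        _model_train(df, country, test=test)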