def model_load(prefix='sl', data_dir=None, training=True):
    """
    example function to load model
    
    The prefix allows the loading of different models
    """

    if not data_dir:
        data_dir = os.path.join(os.getcwd(), "data", "cs-train")

    models = [
        f for f in os.listdir(os.path.join(".", "models"))
        if re.search(prefix, f)
    ]

    if len(models) == 0:
        raise Exception(
            "Models with prefix '{}' cannot be found; did you train?".format(
                prefix))

    all_models = {}
    for model in models:
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(".", "models", model))

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
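
A minimal usage sketch for the loader above, assuming the project's fetch_ts and engineer_features helpers are importable and that an 'all'-countries aggregate was trained (both assumptions):

## hypothetical usage sketch (not part of the original listing)
all_data, all_models = model_load(prefix='sl', training=False)
print(sorted(all_models.keys()))       # one entry per trained country tag
print(all_data['all']['X'].shape)      # engineered feature matrix for the 'all' aggregate
print(all_data['all']['dates'][:3])    # ISO date strings aligned with the rows of X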
Example #2
def _model_train(df=None, prefix='sl', country=None, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    """


    ## start timer for runtime
    time_start = time.time()
    
    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]
        
    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True, random_state=42)
    ## train a random forest model
    param_grid_rf = {
        'rf__criterion': ['squared_error', 'absolute_error'],  # 'mse'/'mae' in older scikit-learn
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])

    ## 'iid' was removed from GridSearchCV in scikit-learn 0.24
    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)  # fit on the full training split, not just 5 rows
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))
    
    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(country, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "{}-{}-{}.joblib".format(prefix, country, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log (the original snippet was truncated here; this call mirrors
    ## the pattern used by the sibling examples below)
    update_train_log(country, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime, MODEL_VERSION, test=test)
Example #3
def model_load(prefix='sl', data_dir=None, training=True):

    if not data_dir:
        data_dir = os.path.join("..", "capstone-w", "cs-train")

    models = [
        f for f in os.listdir(os.path.join(".", "models"))
        if re.search(prefix, f)
    ]

    if len(models) == 0:
        raise Exception(
            "Models with prefix '{}' cannot be found; did you train?".format(
                prefix))

    all_models = {}
    for model in models:
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(".", "models", model))

    # load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        df = clean_data(df)
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
Example #4
def _model_train(df, tag, pipe, param_grid, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime,
                     MODEL_VERSION,
                     test=test)
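
Because this variant takes the pipeline and grid as arguments, a short calling sketch may help; it is an assumption (df stands in for one country's time-series frame, and the 'rf' step name must match the grid's 'rf__' prefixes):

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## hypothetical usage sketch (not part of the original listing)
pipe = Pipeline(steps=[('scaler', StandardScaler()),
                       ('rf', RandomForestRegressor())])
param_grid = {'rf__n_estimators': [10, 15, 20, 25]}
_model_train(df, tag='all', pipe=pipe, param_grid=param_grid, test=True)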
Example #5
def model_monitor(country="all", dev=DEV, training=True):
    """
    performance monitoring
    """
    print("Monitor Model")
    
    ## import data
    datasets = engineer_features(training=training)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)
    
    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)
    
    ## monitor RMSE
    samples = [10, 20, 30, 50, 60]

    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        queries = [(str(d.year), str(d.month), str(d.day), country)
                   for d in dates_new]
        y_pred = [model_predict(year=query[0], month=query[1], day=query[2],
                                country=query[3], verbose=False,
                                dev=dev)["y_pred"][0].round(2)
                  for query in queries]
        rmse = np.sqrt(mean_squared_error(y_new.tolist(), y_pred))
        print("sample size: {}, RMSE: {}".format(n, rmse.round(2)))
        
    ## monitor performance
    ## scaling
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    samples = [25, 50, 75, 90]

    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)

    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size, 1))

    results = defaultdict(list)
    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        results["sample_size"].append(n)
        results['wasserstein_X'].append(
            np.round(wasserstein_distance(X.flatten(), X_new.flatten()), 2))
        results['wasserstein_y'].append(
            np.round(wasserstein_distance(y, y_new), 2))
        test1 = clf_X.predict(X_new)
        test2 = clf_y.predict(y_new.reshape(y_new.size, 1))
        results["outlier_percent_X"].append(
            np.round(1.0 - (test1[test1 == 1].size / test1.size), 2))
        results["outlier_percent_y"].append(
            np.round(1.0 - (test2[test2 == 1].size / test2.size), 2))
    
    return pd.DataFrame(results)
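
model_monitor depends on a simulate_samples helper that is not shown; a plausible sketch, assuming it simply bootstraps n rows (the real helper may add noise or resample differently):

import numpy as np

## hypothetical helper (an assumption -- not part of the original listing)
def simulate_samples(n, X, y, dates):
    # bootstrap n rows so drift metrics can be computed against a "new" sample
    idx = np.random.choice(np.arange(X.shape[0]), n, replace=True)
    return X[idx], y[idx], dates[idx]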
Example #6
def _model_train_gradient_boost(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)
    ## train a gradient boosting regressor model
    param_grid_gb = {
        'gb__criterion': ['friedman_mse', 'squared_error'],  # 'mse'/'mae' in older scikit-learn
        'gb__n_estimators': [10, 15, 20, 25]
    }

    pipe_gb = Pipeline(
        steps=[('scaler', StandardScaler()), ('gb',
                                              GradientBoostingRegressor())])

    ## 'iid' was removed from GridSearchCV in scikit-learn 0.24
    grid = GridSearchCV(pipe_gb, param_grid=param_grid_gb, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_mse = round(mean_squared_error(y_test, y_pred))
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))
    print("country", tag, "--  gradient boosting regressor eval  --")
    print("mse:", eval_mse)
    print("rmse:", eval_rmse)
Example #7
File: model.py  Project: tdody/RevenueModel
def model_load(prefix='sl', data_dir=None, training=True, country=None):
    """
    example function to load model
    
    The prefix allows the loading of different models
    """
    ## if data path not specified, use generic
    if not data_dir:
        data_dir = os.path.join(".", "data", "cs-train")

    ## load all models (or filter for country)
    if country is None:
        models = [
            f for f in os.listdir(os.path.join(".", "models"))
            if re.search(prefix, f)
        ]
    else:
        country_id = re.sub(r"\s+", "_", country.lower())
        models = [
            f for f in os.listdir(os.path.join(".", "models"))
            if (re.search(prefix, f) and re.search(country_id, f))
        ]

    if len(models) == 0:
        if country is None:
            raise Exception(
                "Models with prefix '{}' cannot be found, did you train?".
                format(prefix))
        else:
            raise Exception(
                "Model for '{0}' with prefix '{1}' cannot be found, did you train it?"
                .format(country, prefix))

    ## store model in dictionary
    ## key = model name
    ## value = model
    all_models = {}
    for model in models:
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(".", "models", model))

    ## load data
    ts_data = fetch_ts(data_dir, country=country)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
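
A short calling sketch for the country-filtered loader (hypothetical; note that country names are lower-cased and spaces become underscores before matching saved file names):

## hypothetical usage sketch (not part of the original listing)
all_data, all_models = model_load(prefix='sl', country='United Kingdom')
print(list(all_models.keys()))   # only the matching country's model is loaded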
Example #8
def get_latest_train_data():
    """
    load the data used in the latest training
    """

    prod_files = r"C:\Users\AshwiniShitole\Desktop\Ashwini\Personal\Data Science\AI Academy\AI Enterprise Workflow Certification\AI in Production\Capstone_Project\case-study-soln\data\cs-production"

    if not os.path.exists(prod_files):
        raise Exception(
            "cannot find {} -- did you train the model?".format(prod_files))

    prod_data = fetch_data(prod_files)
    prod_data = prod_data.drop(columns=['customer_id'])
    prod_ts = convert_to_ts(prod_data)
    X, y, dates = engineer_features(prod_ts, training=False)
    return (X, y)
Example #9
def model_load(prefix='sl', data_dir=None, training=True):
    """
    example function to load model
    
    The prefix allows the loading of different models
    """

    if not data_dir:
        data_dir = os.path.join(PARENT_DIR, "Final_Capstone/cs-train")

    all_models = model_load_only(prefix=prefix)

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
Example #10
def model_load(country, prefix='sl', data_dir=None, training=True):
    """
    example function to load model
    
    The prefix allows the loading of different models
    """
    warnings.filterwarnings("ignore")

    if not data_dir:
        data_dir = DATA_DIR

    # When a country is passed, load that country's model; 'all' loads every model
    model_name = prefix + '-' + country
    models = [
        f for f in os.listdir(os.path.join(MODEL_DIR))
        if re.search(model_name, f)
    ]
    if len(models) == 0:
        raise Exception(
            "Models with prefix '{}' cannot be found; did you train?".format(
                prefix))

    all_models = {}
    for model in models:
        all_models[re.split("-", model)[1]] = joblib.load(
            os.path.join(MODEL_DIR, model))

    # load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
Example #11
def _model_train(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    rs = 42

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=rs)

    ## build models
    regressor_names = [
        "SGDRegressor", "RandomForestRegressor", "GradientBoostingRegressor",
        "AdaBoostRegressor"
    ]

    regressors = (SGDRegressor(random_state=rs),
                  RandomForestRegressor(random_state=rs),
                  GradientBoostingRegressor(random_state=rs),
                  AdaBoostRegressor(random_state=rs))

    params = [{
        "reg__penalty": ["l1", "l2", "elasticnet"],
        "reg__learning_rate": ["constant", "optimal", "invscaling"]
    }, {
        "reg__n_estimators": [10, 30, 50],
        "reg__max_features": [3, 4, 5],
        "reg__bootstrap": [True, False]
    }, {
        "reg__n_estimators": [10, 30, 50],
        "reg__max_features": [3, 4, 5],
        "reg__learning_rate": [1, 0.1, 0.01, 0.001]
    }, {
        "reg__n_estimators": [10, 30, 50],
        "reg__learning_rate": [1, 0.1, 0.01, 0.001]
    }]

    ## train models
    models = {}
    total = len(regressor_names)
    for iteration, (name, regressor, param) in enumerate(
            zip(regressor_names, regressors, params)):

        pipe = Pipeline(steps=[('scaler', StandardScaler()), ("reg",
                                                              regressor)])

        grid = GridSearchCV(pipe,
                            param_grid=param,
                            scoring="neg_mean_squared_error",
                            cv=5,
                            n_jobs=-1,
                            return_train_score=True)
        grid.fit(X_train, y_train)
        models[name] = grid, grid.best_estimator_["reg"].get_params()

    ## evaluation on the validation set
    val_scores = []
    for key, model in models.items():
        y_pred = model[0].predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_pred, y_test))
        val_scores.append(rmse)

    ## select best model
    bm = regressor_names[np.argmin(val_scores)]
    opt_model, params = models[bm]

    print("cuurent optimal model is: ", bm)

    ## retrain best model using all data
    opt_model.fit(X, y)

    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

        print("... saving latest data")
        data_file = os.path.join("models", 'latest-train.pickle')
        with open(data_file, 'wb') as tmp:
            pickle.dump({'y': y, 'X': X}, tmp)

    joblib.dump(opt_model, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## plot the figure of rmse
    model_names = ['SGD', 'RF', 'GBM', 'ADA']
    model_rmses = val_scores

    fig = plt.figure(figsize=(10, 5))

    # creating the bar plot
    plt.bar(model_names, model_rmses, width=0.4)

    plt.xlabel("Model Names")
    plt.ylabel("Model Errors")
    plt.title("Model Training RMSE Comparisons")
    plt.show()

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': min(val_scores)},
                     runtime,
                     MODEL_VERSION,
                     MODEL_VERSION_NOTE,
                     test=test)
Example #12
def _model_train(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    """


    ## start timer for runtime
    time_start = time.time()
    
    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True, random_state=42)
                                                       
    ###################################################################################
    ## train a random forest model  
    ###################################################################################
    print("\nTRAINING MODELS: RANDOM FOREST MODEL")
    param_grid_rf = {
        'rf__criterion': ['squared_error', 'absolute_error'],  # 'mse'/'mae' in older scikit-learn
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])

    ## 'iid' was removed from GridSearchCV in scikit-learn 0.24
    grid_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1,
                           return_train_score=True)
    grid_rf.fit(X_train, y_train)
    scores_df_rf = pd.DataFrame(grid_rf.cv_results_).sort_values(by='rank_test_score')
    scores_df_rf['model'] = grid_rf
    scores_df_rf = scores_df_rf[scores_df_rf['rank_test_score'] == 1]
    
    print(grid_rf.best_params_)
    print(grid_rf.best_score_)

    y_pred = grid_rf.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    scores_df_rf['eval_rmse'] = eval_rmse
    print("eval_rmse: {}".format(eval_rmse))
    print("\nEND OF TRAINING MODELS: RANDOM FOREST MODEL")
    
    ###################################################################################
    ## train a bagging model  
    ###################################################################################
    print("\nTRAINING MODELS: BAGGING MODEL")
    ## train a bagging model
    pipe_bag = Pipeline(steps=[('scaler', StandardScaler()),
                               ('bag', BaggingRegressor(base_estimator=SVR(), random_state=0))])
    
    param_grid_bag = {
        'bag__n_estimators': [10, 15, 20, 25]
    }
    
    ## 'iid' was removed from GridSearchCV in scikit-learn 0.24
    grid_bag = GridSearchCV(pipe_bag, param_grid=param_grid_bag, cv=5, n_jobs=-1)
    grid_bag.fit(X_train, y_train)
    scores_df_bag = pd.DataFrame(grid_bag.cv_results_).sort_values(by='rank_test_score')
    scores_df_bag['model'] = grid_bag
    scores_df_bag = scores_df_bag[scores_df_bag['rank_test_score'] == 1]
    print(grid_bag.best_params_)
    print(grid_bag.best_score_)
        
    y_pred = grid_bag.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    scores_df_bag['eval_rmse'] = eval_rmse
    print("eval_rmse: {}".format(eval_rmse))
    print("\nEND OF TRAINING MODELS: BAGGING MODEL")

    ## Compare models
    ## DataFrame.append was removed in pandas 2.0; use pd.concat instead
    results_df = pd.concat([scores_df_rf, scores_df_bag],
                           ignore_index=True).sort_values(by='mean_test_score',
                                                          ascending=False)
    print(results_df[['model', 'params', 'mean_test_score', 'mean_fit_time',
                      'mean_score_time', 'eval_rmse']])

    ## use positional indexing: after the sort, label 0 need not be the top row
    best_model = results_df['model'].iloc[0]
    print("best_model: {}".format(best_model))
    
    ## retrain the best model using all data
    best_model.fit(X, y)
     
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(best_model, saved_model)

    m, s = divmod(time.time()-time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d"%(h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
Example #13
def _model_train(prefix,
                 df,
                 tag,
                 test=False,
                 model=DEFAULT_MODEL,
                 model_param_grid=DEFAULT_PARAM_GRID,
                 scaler=DEFAULT_SCALER):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    pipe_rf = Pipeline(steps=[('scaler', scaler), ('rf', model)])

    grid = GridSearchCV(pipe_rf, param_grid=model_param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))

    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(
            MODEL_DIR, "{}-{}-{}.joblib".format(prefix, tag, model_name))
        print("... saving model: {}".format(saved_model))
        data_file = os.path.join(
            MODEL_DIR, '{}-{}-{}-train.pickle'.format(prefix, tag, model_name))
        with open(data_file, 'wb') as tmp:
            pickle.dump({'y': y, 'X': X}, tmp)
        print("... saving latest data")

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime,
                     MODEL_VERSION,
                     MODEL_VERSION_NOTE,
                     test=test)
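
This variant references module-level defaults that are not shown; a plausible set of definitions, chosen as assumptions to match the 'rf' step name used in the pipeline:

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

## hypothetical defaults (assumptions -- not part of the original listing)
DEFAULT_SCALER = StandardScaler()
DEFAULT_MODEL = RandomForestRegressor()
DEFAULT_PARAM_GRID = {'rf__n_estimators': [10, 15, 20, 25]}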
Example #14
def _model_train(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies that the use of the 'test' log file

    """

    # start timer for runtime
    time_start = time.time()

    x, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * x.shape[0]))
        subset_indices = np.random.choice(np.arange(x.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        x = x[mask]
        dates = dates[mask]

    # Perform a train-test split
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    print("... training random forest for {}".format(tag))
    # train a random forest model
    param_grid_rf = {
        'rf__criterion': ['squared_error', 'absolute_error'],  # 'mse'/'mae' in older scikit-learn
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(
        steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

    grid_rf = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=1)
    grid_rf.fit(x_train, y_train)
    y_pred_rf = grid_rf.predict(x_test)
    eval_rmse_rf = round(np.sqrt(mean_squared_error(y_test, y_pred_rf)))

    # retrain using all data
    grid_rf.fit(x, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "rf-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid_rf, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    # update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': eval_rmse_rf}, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE)

    print("... training XGBRegressor for {}".format(tag))
    # training an XGBoost model
    pipe_xgb = Pipeline(
        steps=[('scaler', StandardScaler()), ('xgb_model',
                                              xgb.XGBRegressor())])

    param_grid_xgb = {
        'xgb_model__subsample': np.arange(.05, 1, .05),
        'xgb_model__max_depth': np.arange(3, 20, 1),
        'xgb_model__colsample_bytree': np.arange(.1, 1.05, .05)
    }

    grid_xgb = RandomizedSearchCV(estimator=pipe_xgb,
                                  param_distributions=param_grid_xgb,
                                  n_iter=10,
                                  scoring='neg_mean_squared_error',
                                  cv=4)

    grid_xgb.fit(x_train, y_train)

    y_pred_xgb = grid_xgb.predict(x_test)
    eval_rmse_xgb = round(np.sqrt(mean_squared_error(y_test, y_pred_xgb)))

    # retrain using all data
    grid_xgb.fit(x, y)

    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "xgb-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid_xgb, saved_model)

    ## recompute runtime so the XGBoost log entry does not reuse the RF figure
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    update_train_log(tag, (str(dates[0]), str(dates[-1])),
                     {'rmse': eval_rmse_xgb}, "%03d:%02d:%02d" % (h, m, s),
                     MODEL_VERSION, MODEL_VERSION_NOTE)
Example #15
def _model_train(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)
    ## train candidate regressors (random forest and SVR) via grid search
    param_grid_rf = [{
        'regr': [RandomForestRegressor()],
        'regr__criterion': ['squared_error', 'absolute_error'],  # 'mse'/'mae' in older scikit-learn
        'regr__n_estimators': [10, 15, 20, 25]
    }, {
        'regr': [SVR()],
        'regr__kernel': ['rbf', 'linear'],
        'regr__C': [1.0, 1.5]
    }]

    pipe_rf = Pipeline(
        steps=[('scaler', StandardScaler()), ('regr',
                                              RandomForestRegressor())])

    ## 'iid' was removed from GridSearchCV in scikit-learn 0.24
    grid = GridSearchCV(pipe_rf,
                        param_grid=param_grid_rf,
                        cv=5,
                        n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime,
                     MODEL_VERSION,
                     MODEL_VERSION_NOTE,
                     test=test)
Example #16
def _model_train(df, tag, test=False):

    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    param_grid_rf = {
        'rf__criterion': ['squared_error', 'absolute_error'],  # 'mse'/'mae' in older scikit-learn
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(
        steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

    ## 'iid' was removed from GridSearchCV in scikit-learn 0.24
    grid = GridSearchCV(pipe_rf,
                        param_grid=param_grid_rf,
                        cv=5,
                        n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime,
                     MODEL_VERSION,
                     MODEL_VERSION_NOTE,
                     test=test)
Example #17
def _model_train(df, tag, test=False, regressor=None):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    """
    # Models available for training
    regressorsList = {
        'randomforest': RandomForestRegressor(),
        'extratrees': ExtraTreesRegressor()
    }
    if regressor is None:
        regressor = 'randomforest'  # default; calling .lower() on None would crash
    regressor = regressor.lower()  # match is case-insensitive
    if regressor not in regressorsList:
        raise Exception(
            "Regressor with name '{}' not found (available: {})".format(
                regressor, ', '.join(regressorsList.keys())))

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)
    ## train a regression model
    param_grid_rf = {
        'reg__criterion': ['squared_error', 'absolute_error'],  # 'mse'/'mae' in older scikit-learn
        'reg__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(
        steps=[('scaler', StandardScaler()), ('reg',
                                              regressorsList[regressor])])

    ## 'iid' was removed from GridSearchCV in scikit-learn 0.24
    grid = GridSearchCV(pipe_rf,
                        param_grid=param_grid_rf,
                        cv=5,
                        n_jobs=-1)
    print("... using model: {}".format(regressor))
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)