Example #1
    data = csv.reader(f, delimiter=";")  # f is an open file handle; the opening code is truncated from this snippet
    power_data = []
    cycle_data = []
    for line in data:
        try:
            power_data.append(float(line[2]))
            # strptime format reference: datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')
            d = datetime.strptime(line[0] + " " + line[1], '%d/%m/%Y %H:%M:%S')
            cycle_data.append([d.month, d.weekday(), d.hour])
        except ValueError:
            pass

normalizing_factor = l2norm(power_data)

power_data, parmsFromNormalization = preprocess1DtoZeroMeanUnit(power_data)

window_size = 7

#build rolling window dataset. It has row_count - window_size rows: the first
#window_size - 1 positions lack a full history, and the last value lacks a training (Y) target.
X_all, Y_all = build_rolling_window_dataset(power_data, window_size)

#X_all = X_all[:10000,]
#Y_all = Y_all[:10000]

row_count = X_all.shape[0]
training_set_size = int(0.7*row_count)
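
The mltools helpers used above are not shown on this page. As a rough sketch of what build_rolling_window_dataset presumably does (an assumption, consistent with the size noted in the comment: each X row holds window_size consecutive values and Y holds the value that follows):

import numpy as np

def build_rolling_window_dataset_sketch(series, window_size):
    # one row per position that has both a full history window and a next value
    X = np.array([series[i:i + window_size]
                  for i in range(len(series) - window_size)])
    Y = np.array(series[window_size:])
    return X, Y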
Example #2
#http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano/
#http://danielhnyk.cz/predicting-sequences-vectors-keras-using-rnn-lstm/

import pandas as pd
from sklearn import preprocessing

from mltools import rolling_univariate_window, build_rolling_window_dataset
from mltools import train_test_split, print_graph_test, almost_correct_based_accuracy
from mltools import regression_with_dl, print_regression_model_summary, preprocess1DtoZeroMeanUnit

df = pd.read_csv("data/rossmann300stores.csv")

print("data frame Shape", df.shape)

#take the Sales column out of the data frame as a numpy array
sales_data = df['Sales'].values
row_count = len(df.index)

sales_data, parmsFromNormalization = preprocess1DtoZeroMeanUnit(sales_data)
df.Sales = sales_data
lb2IntEncoder = preprocessing.LabelEncoder()
df.StateHoliday = lb2IntEncoder.fit_transform(df.StateHoliday)

df = df.drop('Date', axis=1)
df = df.drop('Customers', axis=1)
#df = df.drop('SchoolHoliday', axis=1)
#df = df.drop('StateHoliday', axis=1)
#df = df.drop('Sales', axis=1)

#remove last row from df and remove first from sales data
#df = df.drop(df.index[[row_count-1]])
print(df.head(10))
X_all = df.values.copy()
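
preprocessing.LabelEncoder comes from scikit-learn and maps each distinct StateHoliday value to an integer. A quick illustration (the letter codes are typical of the Rossmann data and are shown only as an example):

from sklearn import preprocessing

enc = preprocessing.LabelEncoder()
print(enc.fit_transform(['0', 'a', 'b', '0', 'c']))  # -> [0 1 2 0 3]
print(enc.classes_)                                  # -> ['0' 'a' 'b' 'c']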
Example #4
import itertools
import numpy as np
from mltools import preprocess1DtoZeroMeanUnit, undoPreprocessing

a = np.arange(20)  # placeholder value: the original definition of 'a' is truncated from this snippet
print(a)
print(a[10:-1])
print(a[0:0])

xx = np.array([[0, 1, 2, 3],
               [10, 11, 12, 13],
               [20, 21, 22, 23],
               [30, 31, 32, 33],
               [40, 41, 42, 43]])
print(xx)
print(xx - [1, 1, 1, 1])

print(xx - np.mean(xx, axis=0))

print("mean", np.mean(xx, axis=0))


xx = np.random.rand(1000)

normalized, parmsFromNormalization = preprocess1DtoZeroMeanUnit(xx)
new_xx = undoPreprocessing(normalized, parmsFromNormalization)
print("xx", xx)
print("mean,std,sqrt", parmsFromNormalization.mean, parmsFromNormalization.std, parmsFromNormalization.sqrtx2)
print("newxx", new_xx)
print(np.allclose(xx, new_xx))


for i in itertools.product([1, 2, 3], ['a', 'b'], [4, 5]):
    print(i)
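
The snippet above checks that undoPreprocessing inverts preprocess1DtoZeroMeanUnit. mltools is not shown on this page, so the exact formulas are an assumption; a minimal sketch consistent with the round trip and the mean/std/sqrtx2 attributes printed above:

import numpy as np

class NormParamsSketch(object):
    def __init__(self, mean, std, sqrtx2):
        self.mean, self.std, self.sqrtx2 = mean, std, sqrtx2

def preprocess1DtoZeroMeanUnit_sketch(x):
    # zero-mean, unit-variance scaling; sqrtx2 kept only for reporting
    params = NormParamsSketch(np.mean(x), np.std(x), np.sqrt(np.mean(x ** 2)))
    return (x - params.mean) / params.std, params

def undoPreprocessing_sketch(x, params):
    return x * params.std + params.mean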
Example #5
df = drop_feilds_1df(
    df, ['Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima', 'Dev_proxima'])
y_actual = df['Demanda_uni_equil'].values

training_set_size = int(0.7 * df.shape[0])
test_set_size = df.shape[0] - training_set_size

y_actual_test = y_actual[-1 * test_set_size:]  # kept for validation later

y_actual_log = transfrom_to_log(df['Demanda_uni_equil'].values)

if do_normalize_data:
    #normalization is done here to avoid the warning about overwriting a copied data frame
    y_actual_log_train = y_actual[:training_set_size]
    y_actual_log_test = y_actual[-1 * test_set_size:]
    y_actual_log_train_norm, parmsFromNormalization = preprocess1DtoZeroMeanUnit(
        y_actual_log_train)
    y_actual_log_test_norm = apply_zeroMeanUnit2D(y_actual_log_test,
                                                  parmsFromNormalization)
    df['Demanda_uni_equil'] = np.concatenate(
        (y_actual_log_train_norm, y_actual_log_test_norm))
else:
    df['Demanda_uni_equil'] = y_actual_log

#split into training and test sets
train_df = df[:training_set_size]
test_df = df[-1 * test_set_size:]

default_sales = train_df['Demanda_uni_equil'].median()

stat_df = calculate_feild_stats(train_df, 'Agencia_ID', 'Demanda_uni_equil')
train_df = merge_stats_with_df(train_df, stat_df, 'Agencia_ID',
                               default_mean=default_sales, default_stddev=None)
Example #6
    data = csv.reader(f, delimiter=";")
    power_data = []
    cycle_data = []
    for line in data:
        try:
            power_data.append(float(line[2]))
            # strptime format reference: datetime.strptime('Jun 1 2005  1:33PM', '%b %d %Y %I:%M%p')
            d = datetime.strptime(line[0] + " " + line[1], '%d/%m/%Y %H:%M:%S')
            cycle_data.append([d.month, d.weekday(), d.hour])
        except ValueError:
            pass

normalizing_factor = l2norm(power_data)

power_data, parmsFromNormalization = preprocess1DtoZeroMeanUnit(power_data)

window_size = 7

#build rolling window dataset. It has row_count - window_size rows: the first
#window_size - 1 positions lack a full history, and the last value lacks a training (Y) target.
X_all, Y_all = build_rolling_window_dataset(power_data, window_size)

#X_all = X_all[:10000,]
#Y_all = Y_all[:10000]

row_count = X_all.shape[0]
training_set_size = int(0.7 * row_count)

print("X_all.shape", X_all.shape)
Example #7

df = drop_feilds_1df(df, ['Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima', 'Dev_proxima'])
y_actual = df['Demanda_uni_equil'].values

training_set_size = int(0.7*df.shape[0])
test_set_size = df.shape[0] - training_set_size

y_actual_test = y_actual[-1*test_set_size:] # kept for validation later

y_actual_log = transfrom_to_log(df['Demanda_uni_equil'].values)


if do_normalize_data:
    #normalization is done here to avoid the warning about overwriting a copied data frame
    y_actual_log_train = y_actual[:training_set_size]
    y_actual_log_test = y_actual[-1*test_set_size:]
    y_actual_log_train_norm, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_actual_log_train)
    y_actual_log_test_norm = apply_zeroMeanUnit2D(y_actual_log_test, parmsFromNormalization)
    df['Demanda_uni_equil'] = np.concatenate((y_actual_log_train_norm,y_actual_log_test_norm))
else:
    df['Demanda_uni_equil'] = y_actual_log

#split into training and test sets
train_df = df[:training_set_size]
test_df = df[-1*test_set_size:]

default_sales = train_df['Demanda_uni_equil'].median()

stat_df = calculate_feild_stats(train_df, 'Agencia_ID', 'Demanda_uni_equil')
train_df = merge_stats_with_df(train_df, stat_df, 'Agencia_ID', default_mean=default_sales, default_stddev=None)
test_df = merge_stats_with_df(test_df, stat_df, 'Agencia_ID', default_mean=default_sales, default_stddev=None)
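
calculate_feild_stats and merge_stats_with_df (spellings as in the project) are not defined on this page. A plausible sketch, assuming they compute per-group mean/stddev of the target and left-join the result back, filling unseen groups with the supplied defaults:

import pandas as pd

def calculate_feild_stats_sketch(df, group_col, target_col):
    stats = df.groupby(group_col)[target_col].agg(['mean', 'std']).reset_index()
    return stats.rename(columns={'mean': group_col + '_mean',
                                 'std': group_col + '_std'})

def merge_stats_with_df_sketch(df, stat_df, group_col, default_mean=None, default_stddev=None):
    merged = df.merge(stat_df, on=group_col, how='left')
    merged[group_col + '_mean'] = merged[group_col + '_mean'].fillna(default_mean)
    if default_stddev is not None:
        merged[group_col + '_std'] = merged[group_col + '_std'].fillna(default_stddev)
    return merged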
Example #8
def blend_models(conf, forecasts, model_index_by_acc, y_actual,
                 submissions_ids, submissions, blend_data,
                 blend_data_submission):
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(
            forecasts, model_index_by_acc)
    else:
        X_all, forecasting_feilds = forecasts, [
            "f" + str(f) for f in range(forecasts.shape[1])
        ]

    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()

    #remove NaN and inf values if any are present
    y_actual_saved = y_actual
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)

    X_all = fillna_and_inf(X_all)
    y_actual = fillna_and_inf(y_actual)

    #use half of the data to train the ensemble and the rest for evaluation
    no_of_training_instances = int(round(len(y_actual) * 0.50))
    X_train, X_test, y_train, y_test = train_test_split(
        no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]
    '''
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast_as_log = rfr.predict(X_test)
    rfr_forecast = retransfrom_from_log(rfr_forecast_as_log)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, rfr_forecast)


    lr_model =linear_model.Lasso(alpha = 0.1)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    '''

    xgb_params = {
        "objective": "reg:linear",
        "booster": "gbtree",
        "eta": 0.1,
        "nthread": 4,
        'min_child_weight': 5
    }
    model, y_pred = regression_with_xgboost(X_train,
                                            y_train,
                                            X_test,
                                            y_test,
                                            features=forecasting_feilds,
                                            use_cv=True,
                                            use_sklean=False,
                                            xgb_params=xgb_params)
    #model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
    #                                                  xgb_params=xgb_params,num_rounds=100)
    xgb_forecast = model.predict(X_test)
    xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)

    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(
                submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        submissions = np.where(
            np.isnan(submissions), 0,
            np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(
                rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids,
                             rfr_ensamble_forecasts)
    else:
        print "submissions not found"

    #randomly sample at most 5 million training rows
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0],
                                       min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]

    dlconf = MLConfigs(nodes_in_layer=10,
                       number_of_hidden_layers=2,
                       dropout=0.3,
                       activation_fn='relu',
                       loss="mse",
                       epoch_count=4,
                       optimizer=Adam(lr=0.0001),
                       regularization=0.2)
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)

    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test,
                                           dlconf)

    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)
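
transfrom_to_log and retransfrom_from_log (spellings as in the project) are not shown either. Given that the targets include zeros and the accuracy variable above is named rmsle, a natural assumption is a log1p/expm1 pair:

import numpy as np

def transfrom_to_log_sketch(x):
    # log(1 + x) keeps zero targets finite
    return np.log1p(x)

def retransfrom_from_log_sketch(x):
    return np.expm1(x)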
Example #9

window_size = 14

configs = [
    #lr=0.01
    MLConfigs(nodes_in_layer=20,
              number_of_hidden_layers=3,
              dropout=0,
              activation_fn='relu',
              loss="mse",
              epoch_count=200,
              optimizer=Adam(lr=0.001)),
    #MLConfigs(nodes_in_layer=20, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse",
    #          epoch_count=200, optimizer=Adam(lr=0.001), regularization=0.005),
]

datasetIndex = 0
for dataset in datasets:
    #check for ascending order
    dataset = np.array(dataset)
    dataset, parmsFromNormalization = preprocess1DtoZeroMeanUnit(dataset)
    X_all, Y_all = build_rolling_window_dataset(dataset, window_size)

    row_count = X_all.shape[0]
    training_set_size = int(0.7 * row_count)
    #print("X_all.shape", X_all.shape)

    #X_all = np.column_stack((X_all, cycle_data, zscore_vals, entropy_vals, mavg1_vals, mavg2_vals, mavg4_vals, mavg8_vals, mavg16_vals))
    X_all, Y_all = shuffle_data(X_all, Y_all)

    X_train, X_test, y_train, y_test = train_test_split(
        training_set_size, X_all, Y_all)

    print ">> Dataset %s" % (datasetsNames[datasetIndex])
    #run_timeseries_froecasts(X_train, y_train, X_test, y_test, window_size, epoch_count=10, parmsFromNormalization=parmsFromNormalization)
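
Note that train_test_split here is the mltools helper, not sklearn's: it takes the number of training rows as its first argument, and Example #8 above indexes the test targets as y_actual_saved[no_of_training_instances:], which suggests a sequential split. A sketch under that assumption:

def train_test_split_sketch(no_of_training_instances, X_all, Y_all):
    # sequential split: first n rows train, the remainder test
    X_train = X_all[:no_of_training_instances]
    X_test = X_all[no_of_training_instances:]
    y_train = Y_all[:no_of_training_instances]
    y_test = Y_all[no_of_training_instances:]
    return X_train, X_test, y_train, y_test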
Example #10
def blend_models(conf, forecasts, model_index_by_acc, y_actual, submissions_ids, submissions,
                 blend_data, blend_data_submission):
    use_complex_features = True
    if use_complex_features:
        X_all, forecasting_feilds = generate_forecast_features(forecasts, model_index_by_acc)
    else:
        X_all,forecasting_feilds = forecasts, ["f"+str(f) for f in range(forecasts.shape[1])]

    X_all = np.column_stack([X_all, blend_data])
    forecasting_feilds = forecasting_feilds + get_blend_features()

    #remove NaN and inf values if any are present
    y_actual_saved = y_actual
    if conf.target_as_log:
        X_all = transfrom_to_log2d(X_all)
        y_actual = transfrom_to_log(y_actual)

    X_all = fillna_and_inf(X_all)
    y_actual = fillna_and_inf(y_actual)

    #use half of the data to train the ensemble and the rest for evaluation
    no_of_training_instances = int(round(len(y_actual)*0.50))
    X_train, X_test, y_train, y_test = train_test_split(no_of_training_instances, X_all, y_actual)
    y_actual_test = y_actual_saved[no_of_training_instances:]

    '''
    rfr = RandomForestRegressor(n_jobs=4, oob_score=True)
    rfr.fit(X_train, y_train)
    print_feature_importance(rfr.feature_importances_, forecasting_feilds)
    rfr_forecast_as_log = rfr.predict(X_test)
    rfr_forecast = retransfrom_from_log(rfr_forecast_as_log)
    rmsle = calculate_accuracy("rfr_forecast", y_actual_test, rfr_forecast)


    lr_model =linear_model.Lasso(alpha = 0.1)
    lr_model.fit(X_train, y_train)
    lr_forecast = lr_model.predict(X_test)
    lr_forcast_revered = retransfrom_from_log(lr_forecast)
    calculate_accuracy("vote__lr_forecast " + str(conf.command), y_actual_test, lr_forcast_revered)
    '''

    xgb_params = {"objective": "reg:linear", "booster":"gbtree", "eta":0.1, "nthread":4, 'min_child_weight':5}
    model, y_pred = regression_with_xgboost(X_train, y_train, X_test, y_test, features=forecasting_feilds, use_cv=True,
                            use_sklean=False, xgb_params=xgb_params)
    #model, y_pred = regression_with_xgboost_no_cv(X_train, y_train, X_test, y_test, features=forecasting_feilds,
    #                                                  xgb_params=xgb_params,num_rounds=100)
    xgb_forecast = model.predict(X_test)
    xgb_forecast = retransfrom_from_log(xgb_forecast)
    calculate_accuracy("xgb_forecast", y_actual_test, xgb_forecast)

    if submissions_ids is not None and submissions is not None:
        if use_complex_features:
            submissions, _ = generate_forecast_features(submissions, model_index_by_acc)
        submissions = np.column_stack([submissions, blend_data_submission])
        submissions = np.where(np.isnan(submissions), 0, np.where(np.isinf(submissions), 10000, submissions))
        rfr_ensamble_forecasts = model.predict(submissions)
        if conf.target_as_log:
            rfr_ensamble_forecasts = retransfrom_from_log(rfr_ensamble_forecasts)
        save_submission_file("rfr_blend_submission.csv", submissions_ids, rfr_ensamble_forecasts)
    else:
        print "submissions not found"

    #randomly sample at most 5 million training rows
    x_size = X_train.shape[0]
    sample_indexes = np.random.randint(0, X_train.shape[0], min(5000000, x_size))
    X_train = X_train[sample_indexes]
    y_train = y_train[sample_indexes]

    dlconf = MLConfigs(nodes_in_layer=10, number_of_hidden_layers=2, dropout=0.3, activation_fn='relu', loss="mse",
                epoch_count=4, optimizer=Adam(lr=0.0001), regularization=0.2)
    y_train, parmsFromNormalization = preprocess1DtoZeroMeanUnit(y_train)
    y_test = apply_zeroMeanUnit(y_test, parmsFromNormalization)
    X_train, parmsFromNormalization2D = preprocess2DtoZeroMeanUnit(X_train)
    X_test = apply_zeroMeanUnit2D(X_test, parmsFromNormalization2D)

    model, y_forecast = regression_with_dl(X_train, y_train, X_test, y_test, dlconf)

    y_forecast = undoPreprocessing(y_forecast, parmsFromNormalization)
    y_forecast = retransfrom_from_log(y_forecast)
    rmsle = calculate_accuracy("ml_forecast", y_actual_test, y_forecast)
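
fillna_and_inf is also project-local; the inline submissions cleanup above (NaN -> 0, inf -> 10000) suggests it performs a similar replacement. A sketch under that assumption:

import numpy as np

def fillna_and_inf_sketch(x, nan_value=0.0, inf_value=10000.0):
    x = np.where(np.isnan(x), nan_value, x)
    return np.where(np.isinf(x), inf_value, x)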
Example #11

window_size = 14

configs = [
    #lr=0.01
    MLConfigs(nodes_in_layer=20, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse",
              epoch_count=200, optimizer=Adam(lr=0.001)),
    #MLConfigs(nodes_in_layer=20, number_of_hidden_layers=3, dropout=0, activation_fn='relu', loss="mse",
    #          epoch_count=200, optimizer=Adam(lr=0.001), regularization=0.005),
    ]

datasetIndex = 0
for dataset in datasets:
    #check for ascending order
    dataset = np.array(dataset)
    dataset, parmsFromNormalization = preprocess1DtoZeroMeanUnit(dataset)
    X_all, Y_all = build_rolling_window_dataset(dataset, window_size)

    row_count = X_all.shape[0]
    training_set_size = int(0.7*row_count)
    #print("X_all.shape", X_all.shape)

    #X_all = np.column_stack((X_all, cycle_data, zscore_vals, entropy_vals, mavg1_vals, mavg2_vals, mavg4_vals, mavg8_vals, mavg16_vals))
    X_all, Y_all = shuffle_data(X_all, Y_all)

    X_train, X_test, y_train, y_test = train_test_split(training_set_size, X_all, Y_all)

    print ">> Dataset %s" %(datasetsNames[datasetIndex])
    #run_timeseries_froecasts(X_train, y_train, X_test, y_test, window_size, epoch_count=10, parmsFromNormalization=parmsFromNormalization)

    index = 0
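
shuffle_data, called before the split above, is presumably a paired shuffle that applies one random permutation to X and Y together, for example:

import numpy as np

def shuffle_data_sketch(X, Y):
    perm = np.random.permutation(len(Y))
    return X[perm], Y[perm]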