# Overview of the statements collapsed onto the next line:
#   1. Tail of an evaluation routine (its header is not part of this span):
#      negative predictions are clamped to 0, up to `number_print`
#      prediction/target pairs are printed, and the value returned by
#      eva.r2(ds, preds, 'NumberOfSales') is printed under the label "R2: ".
#   2. Script entry point: sets the TRAIN / LOAD / SAVE_DF flags and the run
#      name, loads the imputed one-hot dataset, runs prepare_ds on it and saves
#      the result as "fully_preprocessed_ds.csv".
#   3. Splits the data with utils.get_frame_in_range (arguments 3, 2016, 12, 2017
#      for training and 1, 2018, 2, 2018 for test -- presumably begin month/year
#      and end month/year; confirm against the helper), builds targets with
#      prepare_out and inputs with drop_useless, and optionally saves the test
#      frame as "dataset_to_predict_sales.csv" when SAVE_DF is set.
#   4. Starts building `number_of_model` models: a fresh
#      m.nonsequentialNNDropout(x.shape[1], i == 0) when LOAD is false; the
#      `else:` branch continues past the end of this span.
# NOTE(review): `real_y` and `dy` are assigned here but not read in the visible
# statements -- confirm they are used further down.
preds[preds < 0] = 0 for i in range(min(len(preds), number_print)): print("PRED: ", preds[i], " y: ", y[i]) print("R2: ", eva.r2(ds, preds, 'NumberOfSales')) if __name__ == '__main__': TRAIN = True LOAD = False SAVE_DF = False name = "test" ds = d.read_imputed_onehot_dataset() ds = prepare_ds(ds) d.save_dataset(ds, "fully_preprocessed_ds.csv") ds_train = utils.get_frame_in_range(ds, 3, 2016, 12, 2017) ds_test = utils.get_frame_in_range(ds, 1, 2018, 2, 2018) y = prepare_out(ds_train) real_y = np.array(y) dy = np.zeros(y.shape) x = drop_useless(ds_train) y_test = prepare_out(ds_test) if SAVE_DF: d.save_dataset(ds_test, "dataset_to_predict_sales.csv") x_test = drop_useless(ds_test) models = [] for i in range(number_of_model): if not LOAD: models.append(m.nonsequentialNNDropout(x.shape[1], i == 0)) else:
# Overview of the statements collapsed onto the next line:
#   1. Builds the train/test sets with sb.SetBuilder, targeting
#      'NumberOfCustomers' and excluding 'NumberOfSales' and 'Month'; the
#      resulting object exposes xtr / ytr / xts / yts.
#   2. For each of `number_of_model` models: creates m.single_relu(...) or, when
#      LOAD is set, loads "mod" + name + str(i) + ".h5"; compiles it with
#      k.optimizers.adam(lr=2e-4), mean-squared-error loss and an 'mae' metric;
#      prints its summary; fits it (batch size 20000, 80 epochs, validated on
#      the test split) under the TRAIN flag; saves it; predicts on the training
#      inputs and replaces `y` with `y - prediction`, so the following model is
#      trained on what the previous ones left unexplained.
#   3. Calls evaluate(...) once on the test frame and once on the train frame,
#      passing `real_y` (the untouched copy of the original target) for train.
# NOTE(review): `y.reshape([len(y), 1])` discards its return value, so `y` keeps
# the shape produced by the subtraction; assign the result back if a column
# vector is intended.
# NOTE(review): the train evaluation frame starts at month 2 of 2016 -- confirm
# this matches the range the SetBuilder split actually used.
ds = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True, df=dts)\ .exclude('NumberOfSales', 'Month')\ .build() models = [] x, y = ds.xtr, ds.ytr real_y = np.array(y) x_test, y_test = ds.xts, ds.yts for i in range(number_of_model): if not LOAD: models.append(m.single_relu(x.shape[1], i == 0)) else: models.append(k.models.load_model("mod" + name + str(i) + ".h5")) opt = k.optimizers.adam(lr=2e-4) models[i].compile(optimizer=opt, loss='mean_squared_error', metrics=['mae']) models[i].summary() if TRAIN: models[i].fit(x=x, y=y, batch_size=20000, epochs=80, verbose=2, validation_data=(x_test, y_test)) models[i].save("mod" + name + str(i) + ".h5") dy = models[i].predict(x, 500) print(dy.shape, y.shape) y = y.squeeze() - dy.squeeze() print(y) y.reshape([len(y), 1]) print("################################################") print("EVALUATION ON TEST:") evaluate(models, number_of_model, utils.get_frame_in_range(dts, 1, 2018, 2, 2018), y_test, x_test, 1000) print("################################################") print("EVALUATION ON TRAIN:") evaluate(models, number_of_model, utils.get_frame_in_range(dts, 2, 2016, 12, 2017), real_y, x, 0)
def regtree():
    """Return a new tree.DecisionTreeRegressor limited to depth 9."""
    depth_limit = 9
    return tree.DecisionTreeRegressor(max_depth=depth_limit)


def gradboostreg():
    """Return a new ensemble.GradientBoostingRegressor (depth 8, 5 estimators)."""
    settings = {"max_depth": 8, "n_estimators": 5}
    return ensemble.GradientBoostingRegressor(**settings)


if __name__ == '__main__':
    datas = ds.read_dataset("mean_var_on_customers_from_tain.csv")

    # 'Month' is the second "-"-separated field of the 'Date' string.
    datas['Month'] = datas['Date'].apply(lambda date: date.split("-")[1])

    # One-hot encode the month and the region.
    datas = imp.one_hot_numeric(datas, 'Month', 'Month_')
    datas = imp.one_hot_numeric(datas, 'Region', 'Region_')

    # Both aggregate helpers receive the same date window; the window is cut
    # from the current `datas` each time because the first helper replaces it.
    train_window = (3, 2016, 12, 2017)
    datas = preprocessing_utils.mean_cust_per_month_per_region(
        datas, utils.get_frame_in_range(datas, *train_window))
    datas = preprocessing_utils.mean_cust_per_month_per_shop(
        datas, utils.get_frame_in_range(datas, *train_window))

    # Columns removed from the feature set in addition to the auto-excluded ones.
    excluded_columns = (
        'NumberOfSales',
        'Month',
        'Max_Humidity',
        'Max_Sea_Level_PressurehPa',
        'Max_TemperatureC',
        'Max_VisibilityKm',
        'Max_Wind_SpeedKm_h',
        'Mean_Humidity',
        'Mean_Sea_Level_PressurehPa',
        'Mean_VisibilityKm',
        'Mean_Wind_SpeedKm_h',
        'Min_Dew_PointC',
        'Min_Humidity',
        'Min_Sea_Level_PressurehPa',
        'Min_TemperatureC',
        'Min_VisibilitykM',
    )
    builder = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True, df=datas)
    datas = builder.exclude(*excluded_columns).build()

    # Factories for the candidate models, plus the factory kept in `final`.
    model = [ridge, linear_model.LinearRegression, lasso, regtree, gradboostreg]
    final = ridge
    n = len(model)
    mods, modpreds = [], []
# Overview of the statements collapsed onto the next line:
#   1. Tail of a prediction method (its header and the start of this expression
#      are not part of this span): the cleaned row is squeezed, reshaped to a
#      single row with reshape([1, -1]), then passed to self.predict_sc when
#      `sc` is truthy and to self.predict_oth otherwise; the squeezed result is
#      returned.
#   2. model1() returns linear_model.Ridge(alpha=10); model2() returns
#      tree.DecisionTreeRegressor(max_depth=9).
#   3. Module-level script: reads "best_for_customers.csv", builds
#      CustomersPredoctorSeparateShopCenters(model1, model1, 10, 10), calls its
#      train and test methods on the full data, narrows the data with
#      utils.get_frame_in_range(data, 1, 2018, 2, 2018), predicts one row at a
#      time via data.iloc[[i]], rebuilds the set with sb.SetBuilder and prints
#      eval.evaluate(data.yts, preds) under the label "FINAL R2: ".
# NOTE(review): model2 is defined but not used in the visible statements; both
# factory arguments are model1.
# NOTE(review): `eval` must be a project module bound to that name, which hides
# the builtin eval in this file.
# NOTE(review): `preds` covers every row of the narrowed frame while the score
# uses data.yts -- confirm both have the same length and order.
axis=1)).squeeze() clean_row = clean_row.reshape([1, -1]) if sc: return self.predict_sc(clean_row).squeeze() else: return self.predict_oth(clean_row).squeeze() def model1(): return linear_model.Ridge(alpha=10) def model2(): return tree.DecisionTreeRegressor(max_depth=9) data = data_manager.read_dataset("best_for_customers.csv") model = CustomersPredoctorSeparateShopCenters(model1, model1, 10, 10) model.train(data) model.test(data) data = utils.get_frame_in_range(data, 1, 2018, 2, 2018) preds = [] for i in range(len(data)): irow = data.iloc[[i]] preds.append(model.predict(irow)) preds = np.array(preds) data = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True, df=data) \ .exclude('NumberOfSales', 'Month') \ .build() print("FINAL R2: ", eval.evaluate(data.yts, preds))
# Overview of the statements collapsed onto the next line:
#   1. Tail of a helper (its header and the start of this list are not part of
#      this span): the list of column names ends here, is passed together with
#      axis=axis to a call that is not visible, and `x` is returned.
#   2. model() returns ensemble.GradientBoostingRegressor(max_depth=6,
#      learning_rate=0.1).
#   3. Module-level script: reads "mean_var_on_customers_from_tain.csv", sets
#      'Month' to the second "-"-separated field of 'Date', cuts train and test
#      frames with utils.get_frame_in_range, creates
#      PolynomialFeatures(degree=1), and then loops over the three
#      'AssortmentType_*' column names: it prints the type and the combined
#      train + test sample count, and skips a type that has no training rows.
#      The loop body continues past the end of this span.
# NOTE(review): `sum = 0` rebinds the builtin sum for the rest of the module.
'CloudCover', 'Max_Sea_Level_PressurehPa', 'WindDirDegrees', 'Max_Dew_PointC', 'NumberOfCustomers', 'Day', 'Mean_Sea_Level_PressurehPa', 'Min_Sea_Level_PressurehPa' ], axis=axis)) return x def model(): return ensemble.GradientBoostingRegressor(max_depth=6, learning_rate=0.1) datas = ds.read_dataset("mean_var_on_customers_from_tain.csv") datas['Month'] = datas['Date'] datas['Month'] = datas['Month'].apply(lambda x: x.split("-")[1]) train = utils.get_frame_in_range(datas, 3, 2016, 12, 2017) test = utils.get_frame_in_range(datas, 1, 2018, 2, 2018) poly = PolynomialFeatures(degree=1) sum = 0 types = [ 'AssortmentType_General', 'AssortmentType_With Fish Department', 'AssortmentType_With Non-Food Department' ] models = {} for i in types: print("TYPE " + str(i)) d_reg = utils.get_frames_per_assortmenttype(train, i) d_reg_t = utils.get_frames_per_assortmenttype(test, i) print("N_SAMPLES: ", len(d_reg) + len(d_reg_t)) if len(d_reg) == 0: continue
def monthlyplot(df,
                bm=3,
                by=2016,
                em=2,
                ey=2018,
                regions=None,
                storetype=False,
                perstore=False,
                target="NumberOfSales",
                show=True,
                save=False):
    """Plot a monthly aggregate of ``target`` between two months (inclusive).

    Exactly one plot is drawn, chosen in this order:

    1. no ``regions``, ``storetype`` or ``perstore``: bar plot of the monthly
       *sum* of ``target`` over all rows;
    2. ``storetype``: point plot of the monthly *mean* of ``target`` for each
       of the four ``StoreType_*`` one-hot columns;
    3. ``perstore``: point plot of the monthly mean for each value returned by
       ``values_of(df, "StoreID")``;
    4. otherwise: point plot of the monthly mean for each entry of ``regions``.

    ``regions`` is therefore ignored whenever ``storetype`` or ``perstore`` is
    set.

    Args:
        df: frame holding ``target`` plus the columns needed by the selected
            mode (``StoreType_*``, ``StoreID`` or ``Region``); it is sliced per
            month with ``get_frame_in_range``.
        bm, by: first month and year of the range.
        em, ey: last month and year of the range (that month is included).
        regions: iterable of values matched against ``df["Region"]``.
        storetype: plot one line per store type.
        perstore: plot one line per store.
        target: name of the column to aggregate.
        show: call ``plt.show()`` when done.
        save: write the figure to "monthly_plot.png".

    Raises:
        ValueError: if a month bound is out of range or the start lies after
            the end (the month walk could otherwise never terminate).
    """
    # Convert the inclusive end month into an exclusive upper bound.
    if em == 12:
        end_month, end_year = 1, ey + 1
    else:
        end_month, end_year = em + 1, ey

    if not 1 <= bm <= 12 or not 1 <= end_month <= 12:
        raise ValueError("months must be in 1..12, got bm={!r}, em={!r}".format(
            bm, em))
    if (by, bm) > (end_year, end_month):
        raise ValueError("start {}-{} is after end {}-{}".format(bm, by, em, ey))

    months = list(_month_range(bm, by, end_month, end_year))
    labels = [str(month) + "-" + str(year) for month, year in months]

    if regions is None and not storetype and not perstore:
        totals = [
            get_frame_in_range(df, month, year, month, year)[target].sum()
            for month, year in months
        ]
        sb.barplot(x=labels,
                   y=totals).set_title("Monthly " + target + " (All shops)")
    elif storetype:
        store_types = ("Hyper Market", "Super Market", "Standard Market",
                       "Shopping Center")
        means = {name: [] for name in store_types}
        for month, year in months:
            # Slice each month once and reuse it for all four store types.
            frame = get_frame_in_range(df, month, year, month, year)
            for name in store_types:
                selected = frame[frame["StoreType_" + name] == 1]
                means[name].append(selected[target].mean())
        pdf = pd.concat([
            pd.DataFrame(data={
                "Month": labels,
                target: means[name],
                "Type": name
            }) for name in store_types
        ])
        sb.pointplot(x="Month", y=target, data=pdf, hue="Type").set_title(
            "Monthly " + target + " per StoreType")
    elif perstore:
        pdf = _monthly_means_by_group(df, "StoreID", values_of(df, "StoreID"),
                                      months, labels, target)
        # The title names the plotted column instead of a fixed one.
        sb.pointplot(x="Month", y=target, data=pdf, hue="StoreID").set_title(
            "Monthly " + target + " per Store")
    else:
        pdf = _monthly_means_by_group(df, "Region", regions, months, labels,
                                      target)
        sb.pointplot(x="Month", y=target, data=pdf, hue="Region").set_title(
            "Monthly " + target + " per Region")

    plt.legend()
    fig = plt.gcf()
    fig.set_size_inches(18, 9)
    if show:
        plt.show()
    if save:
        fig.savefig("monthly_plot.png")


def _month_range(bm, by, em, ey):
    """Yield (month, year) pairs from (bm, by) up to, but excluding, (em, ey)."""
    month, year = bm, by
    while (year, month) < (ey, em):
        yield month, year
        month += 1
        if month == 13:
            month, year = 1, year + 1


def _monthly_means_by_group(df, column, groups, months, labels, target):
    """Return one long frame of monthly ``target`` means per value of ``column``."""
    frames = []
    for group in groups:
        # Filter once per group rather than once per month.
        subset = df[df[column] == group]
        means = [
            get_frame_in_range(subset, month, year, month, year)[target].mean()
            for month, year in months
        ]
        frames.append(
            pd.DataFrame(data={
                "Month": labels,
                target: means,
                column: group
            }))
    return pd.concat(frames)
# Overview of the statements collapsed onto the next line:
#   1. Tail of a plotting routine (its header is not part of this span): reads
#      "best_for_customers.csv", draws sb.lmplot of "meancustshop" against
#      "meancust_std_shop" coloured by "StoreType_Shopping Center", resizes the
#      current figure to 18 x 9 inches, then shows it when `show` is truthy and
#      saves it as "meanstdscatterpershop.png" when `save` is truthy.
#   2. Script entry point: imports the dataset helpers, loads the imputed
#      one-hot dataset, calls monthlyplot(ds) with its defaults, and then walks
#      month by month from 3/2016 up to (not including) 3/2018, printing
#      "<month>/<year>: " followed by the sum of 'NumberOfSales' for that month.
# NOTE(review): the routine in (1) calls ds.read_dataset(...), while the script
# in (2) rebinds the module-level name `ds` to a dataset; if both refer to the
# same global, calling that routine after the rebinding would fail -- confirm.
df = ds.read_dataset("best_for_customers.csv") sb.lmplot(x="meancustshop", y="meancust_std_shop", data=df, hue="StoreType_Shopping Center") fig = plt.gcf() fig.set_size_inches(18, 9) if show: plt.show() if save: fig.savefig("meanstdscatterpershop.png") if __name__ == '__main__': import dataset.dataset as d import dataset.utility as utils import pandas as pd ds = d.read_imputed_onehot_dataset() monthlyplot(ds) y = 2016 m = 3 while y != 2018 or m != 3: sub_ds = utils.get_frame_in_range(ds, m, y, m, y) expected_out = d.to_numpy(sub_ds[['NumberOfSales']]).squeeze() print(str(m) + "/" + str(y) + ": ", expected_out.sum()) m += 1 if m == 13: m = 1 y += 1