def __prepare_sales_train_ds(ds):
    """Build the sales-training feature frame from the raw dataset.

    The 'Date' column is parsed as day/month/year text and rewritten as
    'YYYY-MM-DD' strings. Two columns are derived from it: 'Day' (English
    weekday name) and 'Month' (two-digit month string). 'Month', 'Region'
    and 'Day' are then one-hot encoded through the ``imp`` helpers, and the
    ``pre_u`` helpers are applied in a fixed order.

    Args:
        ds: DataFrame with at least 'Date' (formatted '%d/%m/%Y') and
            'Region' columns. It is modified in place by the column
            assignments below.

    Returns:
        The frame returned by the last ``pre_u`` helper.
    """
    parsed_dates = p.to_datetime(ds['Date'], format='%d/%m/%Y')

    # `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` is the
    # supported replacement and yields the same English weekday names.
    ds['Day'] = parsed_dates.dt.day_name()
    ds['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')
    # Zero-padded month ('01'..'12'), i.e. the middle field of 'YYYY-MM-DD'.
    ds['Month'] = parsed_dates.dt.strftime('%m')

    ds = imp.one_hot_numeric(ds, 'Month', 'Month_')
    ds = imp.one_hot_numeric(ds, 'Region', 'Region_')
    ds = imp.one_hot(ds, 'Day', header='Day_')

    # Order is kept exactly as before: the closed-shop filter runs first and
    # every later helper receives the output of the previous one.
    feature_steps = (
        pre_u.eliminate_IsOpen_zeros,
        pre_u.mean_std_sales_per_shop_per_day,
        pre_u.add_avg_per_shop,
        pre_u.add_std_per_shop,
        pre_u.add_max_per_shop,
        pre_u.add_min_per_shop,
        pre_u.mean_sales_per_month_per_region,
    )
    for step in feature_steps:
        ds = step(ds)
    return ds
def __prepare_customers_train_ds(das, m1, a1, m2, a2):
    """Build the customers-training feature frame from the raw dataset.

    The 'Date' column is parsed as day/month/year text and rewritten as
    'YYYY-MM-DD' strings. Two columns are derived from it: 'Day' (English
    weekday name) and 'Month' (two-digit month string). 'Day', 'Month' and
    'Region' are one-hot encoded through the ``imp`` helpers. A reference
    frame is then taken with ``utils.get_frame_out_of_range`` and passed to
    each customer-statistics helper in ``preu``.

    Args:
        das: DataFrame with at least 'Date' (formatted '%d/%m/%Y') and
            'Region' columns. It is modified in place by the column
            assignments below.
        m1, a1, m2, a2: Forwarded unchanged to
            ``utils.get_frame_out_of_range``; presumably start month/year
            and end month/year of the excluded range (confirm against
            that helper).

    Returns:
        The frame returned by the last ``preu`` helper.
    """
    parsed_dates = pandas.to_datetime(das['Date'], format='%d/%m/%Y')

    # `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` is the
    # supported replacement and yields the same English weekday names.
    das['Day'] = parsed_dates.dt.day_name()
    das['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')
    # Zero-padded month ('01'..'12'), i.e. the middle field of 'YYYY-MM-DD'.
    das['Month'] = parsed_dates.dt.strftime('%m')

    das = imp.one_hot(das, 'Day', header='Day_')
    das = imp.one_hot_numeric(das, 'Month', 'Month_')
    das = imp.one_hot_numeric(das, 'Region', 'Region_')

    # The reference frame is taken BEFORE the closed-shop rows are removed,
    # exactly as in the original statement order.
    dfrom = utils.get_frame_out_of_range(das, m1, a1, m2, a2)
    das = preu.eliminate_IsOpen_zeros(das)

    customer_steps = (
        preu.mean_std_cust_per_shop_per_day,
        preu.add_avg_cust_per_shop,
        preu.add_std_cust_per_shop,
        preu.add_max_cust_per_shop,
        preu.add_min_cust_per_shop,
        preu.mean_cust_per_month_per_shop,
        preu.mean_cust_per_month_per_region,
    )
    for step in customer_steps:
        das = step(das, dfrom)
    return das
def __prepare_customers_test_ds(ds, dfrom):
    """Build the customers-test feature frame from the raw dataset.

    Zero-filled 'NumberOfSales' and 'NumberOfCustomers' columns are added
    first. The 'Date' column is then parsed as day/month/year text and
    rewritten as 'YYYY-MM-DD' strings, with 'Day' (English weekday name)
    and 'Month' (two-digit month string) derived from it. 'Day', 'Month'
    and 'Region' are one-hot encoded through the ``imp`` helpers, and the
    ``pre_u`` customer-statistics helpers are applied with ``dfrom``.

    Args:
        ds: DataFrame with at least 'Date' (formatted '%d/%m/%Y') and
            'Region' columns. It is modified in place by the column
            assignments below.
        dfrom: Reference frame handed to every customer-statistics helper;
            presumably the training rows the statistics are computed from
            (confirm against the caller).

    Returns:
        The frame returned by the last ``pre_u`` helper.
    """
    # Float zero placeholders for the target columns; presumably the shared
    # helpers expect these columns to exist (TODO confirm).
    row_count = len(ds)
    ds['NumberOfSales'] = np.zeros(row_count)
    ds['NumberOfCustomers'] = np.zeros(row_count)

    parsed_dates = p.to_datetime(ds['Date'], format='%d/%m/%Y')

    # `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` is the
    # supported replacement and yields the same English weekday names.
    ds['Day'] = parsed_dates.dt.day_name()
    ds['Date'] = parsed_dates.dt.strftime('%Y-%m-%d')
    # Zero-padded month ('01'..'12'), i.e. the middle field of 'YYYY-MM-DD'.
    ds['Month'] = parsed_dates.dt.strftime('%m')

    ds = imp.one_hot(ds, 'Day', header='Day_')
    ds = imp.one_hot_numeric(ds, 'Month', 'Month_')
    ds = imp.one_hot_numeric(ds, 'Region', 'Region_')

    ds = pre_u.eliminate_IsOpen_zeros(ds)

    customer_steps = (
        pre_u.mean_std_cust_per_shop_per_day,
        pre_u.add_avg_cust_per_shop,
        pre_u.add_std_cust_per_shop,
        pre_u.add_max_cust_per_shop,
        pre_u.add_min_cust_per_shop,
        pre_u.mean_cust_per_month_per_shop,
        pre_u.mean_cust_per_month_per_region,
    )
    for step in customer_steps:
        ds = step(ds, dfrom)
    return ds
# NOTE(review): the code below opens with the tail of a function whose `def`
# header is outside this view; that tail returns linear_model.Lasso(alpha=5).
#
# regtree():      returns tree.DecisionTreeRegressor(max_depth=9).
# gradboostreg(): returns ensemble.GradientBoostingRegressor(max_depth=8,
#                 n_estimators=5).
#
# Script entry point (`if __name__ == '__main__':`):
#   1. Reads "mean_var_on_customers_from_tain.csv" through ds.read_dataset.
#      NOTE(review): "tain" looks like a typo for "train" -- confirm against
#      the file actually on disk before renaming anything.
#   2. Sets 'Month' to the second '-'-separated field of 'Date', then one-hot
#      encodes 'Month' and 'Region' with imp.one_hot_numeric.
#   3. Calls preprocessing_utils.mean_cust_per_month_per_region and
#      preprocessing_utils.mean_cust_per_month_per_shop, each with the frame
#      returned by utils.get_frame_in_range(datas, 3, 2016, 12, 2017)
#      (presumably March 2016 to December 2017 -- confirm against the helper).
#   4. Builds the set with sb.SetBuilder(target='NumberOfCustomers',
#      autoexclude=True), excluding 'NumberOfSales', 'Month' and the listed
#      weather columns.
#      NOTE(review): 'Min_VisibilitykM' is cased differently from
#      'Max_VisibilityKm' / 'Mean_VisibilityKm' -- verify the real column name.
#   5. `model` lists the candidate model factories and `final` selects
#      `ridge`; `ridge` and `lasso` are defined outside this view.
return linear_model.Lasso(alpha=5) def regtree(): return tree.DecisionTreeRegressor(max_depth=9) def gradboostreg(): return ensemble.GradientBoostingRegressor(max_depth=8, n_estimators=5) if __name__ == '__main__': datas = ds.read_dataset("mean_var_on_customers_from_tain.csv") datas['Month'] = datas['Date'] datas['Month'] = datas['Month'].apply(lambda x: x.split("-")[1]) datas = imp.one_hot_numeric(datas, 'Month', 'Month_') datas = imp.one_hot_numeric(datas, 'Region', 'Region_') datas = preprocessing_utils.mean_cust_per_month_per_region( datas, utils.get_frame_in_range(datas, 3, 2016, 12, 2017)) datas = preprocessing_utils.mean_cust_per_month_per_shop( datas, utils.get_frame_in_range(datas, 3, 2016, 12, 2017)) datas = sb.SetBuilder(target='NumberOfCustomers', autoexclude=True, df=datas)\ .exclude('NumberOfSales', 'Month', 'Max_Humidity', 'Max_Sea_Level_PressurehPa', 'Max_TemperatureC', 'Max_VisibilityKm', 'Max_Wind_SpeedKm_h', 'Mean_Humidity', 'Mean_Sea_Level_PressurehPa', 'Mean_VisibilityKm', 'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity', 'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Min_VisibilitykM')\ .build() model = [ ridge, linear_model.LinearRegression, lasso, regtree, gradboostreg ] final = ridge