def one_hot_numeric(ds, attr, header):
    """Add one-hot indicator columns for a numeric attribute.

    For each value ``v`` returned by ``d.values_of(ds, attr)`` (presumably
    the distinct values of the column -- confirm against that helper), a
    column named ``header + str(v)`` is added to ``ds``. It holds 1 on the
    rows where ``d.content_of(ds, attr, i)`` equals ``v`` and 0 elsewhere.

    ``ds`` is modified in place and is also returned.
    """
    for value in d.values_of(ds, attr):
        column = header + str(value)
        # Start from an all-zero column aligned on the frame's index.
        ds[column] = p.Series(np.zeros(len(ds)), ds.index)
        for i in ds.index.tolist():
            if d.content_of(ds, attr, i) == value:
                # DataFrame.set_value was removed in pandas 1.0; .at is the
                # supported label-based scalar setter.
                ds.at[i, column] = 1
    return ds
def frequencystatplot(df, show=True, save=False):
    """Bar-plot a per-region count of "StoreID" values.

    For every region id from 0 to 10 the rows with that "Region" are
    selected and the bar height is the length of what ``values_of`` returns
    for their "StoreID" column.

    The figure is resized to 18x9 inches, displayed when ``show`` is true
    and written to "frequencystatplot.png" when ``save`` is true.
    """
    region_ids = list(range(11))
    store_counts = [
        len(values_of(df[df["Region"] == region], "StoreID"))
        for region in region_ids
    ]
    sb.barplot(x=region_ids, y=store_counts)

    figure = plt.gcf()
    figure.set_size_inches(18, 9)
    if show:
        plt.show()
    if save:
        figure.savefig("frequencystatplot.png")
def add_min_per_shop(df, data_from=None):
    """Add a 'min_shop' column with the per-store value of min_per_shop().

    Args:
        df: frame that receives the new column; modified in place.
        data_from: frame the per-store values are computed from. Defaults
            to ``df`` itself.

    Returns:
        ``df``, with 'min_shop' set on every row to the result of
        ``min_per_shop(data_from, store_id)`` for that row's 'StoreID'.

    Raises:
        KeyError: if a row of ``df`` has a 'StoreID' that does not occur
            among ``d.values_of(data_from, 'StoreID')``.
    """
    if data_from is None:
        data_from = df

    # Compute each store's value once, keyed by the stringified ID.
    minima = {}
    for store_id in d.values_of(data_from, 'StoreID'):
        key = str(store_id)
        if key not in minima:
            minima[key] = min_per_shop(data_from, store_id)

    df['min_shop'] = p.Series(np.zeros(len(df)), df.index)
    for i in df.index.tolist():
        # DataFrame.set_value was removed in pandas 1.0; use .at instead.
        df.at[i, 'min_shop'] = minima[str(d.content_of(df, 'StoreID', i))]
    return df
def add_avg_cust_per_shop(df, data_from=None):
    """Add a 'meancustshop' column with average_cust_per_shop() per store.

    Args:
        df: frame that receives the new column; modified in place.
        data_from: frame the per-store values are computed from. Defaults
            to ``df`` itself.

    Returns:
        ``df``, with 'meancustshop' set on every row to the result of
        ``average_cust_per_shop(data_from, store_id)`` for that row's
        'StoreID'.

    Raises:
        KeyError: if a row of ``df`` has a 'StoreID' that does not occur
            among ``d.values_of(data_from, 'StoreID')``.
    """
    if data_from is None:
        data_from = df

    # Compute each store's value once, keyed by the stringified ID.
    averages = {}
    for store_id in d.values_of(data_from, 'StoreID'):
        key = str(store_id)
        if key not in averages:
            averages[key] = average_cust_per_shop(data_from, store_id)

    df['meancustshop'] = p.Series(np.zeros(len(df)), df.index)
    for i in df.index.tolist():
        # DataFrame.set_value was removed in pandas 1.0; use .at instead.
        df.at[i, 'meancustshop'] = averages[
            str(d.content_of(df, 'StoreID', i))]
    return df
def one_hot(ds, attr, header, split=False):
    """Add one-hot indicator columns for a string attribute.

    Args:
        ds: frame to encode; modified in place.
        attr: name of the column to encode.
        header: prefix of every generated column name.
        split: when true, each value returned by ``d.values_of(ds, attr)``
            is split on "-" and one column is created per distinct token
            (in order of first appearance). When false, one column is
            created per returned value.

    Returns:
        ``ds``, with one ``header + name`` column per value/token. A cell is
        1 when ``name`` occurs as a substring of the row's ``attr`` value
        (so "A" also flags a row holding "A-B") and 0 otherwise.
    """
    vals = d.values_of(ds, attr)
    if split:
        new_cols = []
        for v in vals:
            # A separate loop name keeps the `split` flag from being
            # overwritten by the token list.
            for token in v.split("-"):
                if token not in new_cols:
                    new_cols.append(token)
    else:
        new_cols = vals

    for new in new_cols:
        column = header + new
        ds[column] = p.Series(np.zeros(len(ds)), ds.index)
        for i in ds.index.tolist():
            if new in d.content_of(ds, attr, i):
                # DataFrame.set_value was removed in pandas 1.0; use .at.
                ds.at[i, column] = 1
    return ds
# Pearson correlation matrix of the merged frame, drawn as a clustermap.
print("merged dataframe:", new_df.shape)
new_df_corr = new_df.corr(method="pearson")
numeric_clustermap = sb.clustermap(
    new_df_corr, square="True", cmap="Blues", annot=True)
pl.figure(figsize=(20, 20))
pl.show()

#ax = sb.jointplot(x="NearestCompetitor", y="NumberOfSales", data=data_numeric, marker='+')
#pl.show()
#ax = sb.jointplot(x="NearestCompetitor", y="NumberOfCustomers", data=data_numeric, marker='+')
#pl.show()

# Per region (0..10): length of values_of(...) for the "StoreID" column of
# the rows whose "StoreType" is "Hyper Market".
df = datasetfun.read_dataset()
num_hyperm = []  # the index identifies the region
for regionId in range(11):
    # Single combined mask; selects the same rows as filtering by store
    # type first and by region second.
    ds = df[(df["StoreType"] == "Hyper Market") & (df["Region"] == regionId)]
    num_hyperm.append(len(datasetfun.values_of(ds, "StoreID")))
print(num_hyperm)
sb.barplot(x=list(range(11)), y=num_hyperm)
pl.show()
sb.pointplot()
def _months_between(bm, by, em, ey):
    """Return the (month, year) pairs from bm/by through em/ey, inclusive."""
    months = []
    month, year = bm, by
    # Ordered tuple comparison terminates even when the start lies after the
    # end (empty result), where a `!=` loop would never stop.
    while (year, month) <= (ey, em):
        months.append((month, year))
        month += 1
        if month == 13:
            month = 1
            year += 1
    return months


def _month_label(month, year):
    """Format a month as the "<month>-<year>" x-axis label."""
    return str(month) + "-" + str(year)


def _monthly_means_by_group(df, months, target, column, groups):
    """Return a long frame of monthly means of `target` per `column` value."""
    labels = [_month_label(month, year) for month, year in months]
    parts = []
    for group in groups:
        subset = df[df[column] == group]
        means = [
            get_frame_in_range(subset, month, year, month, year)[target].mean()
            for month, year in months
        ]
        parts.append(
            pd.DataFrame(data={
                "Month": labels,
                target: means,
                column: group
            }))
    return pd.concat(parts)


def monthlyplot(df,
                bm=3,
                by=2016,
                em=2,
                ey=2018,
                regions=None,
                storetype=False,
                perstore=False,
                target="NumberOfSales",
                show=True,
                save=False):
    """Plot a monthly aggregate of ``target`` between two months.

    The range runs from month ``bm`` of year ``by`` through month ``em`` of
    year ``ey``, both inclusive. Each month's rows are obtained with
    ``get_frame_in_range(frame, month, year, month, year)`` (presumably the
    rows dated within that month -- confirm against that helper).

    Exactly one plot is drawn, selected in this order:

    * neither ``regions``, ``storetype`` nor ``perstore`` given: bar plot
      of the monthly sum of ``target`` over all rows;
    * ``storetype``: point plot of the monthly mean per store type, read
      from the one-hot columns "StoreType_<type>";
    * ``perstore``: point plot of the monthly mean for every "StoreID"
      returned by ``values_of(df, "StoreID")``;
    * otherwise: point plot of the monthly mean for each id in ``regions``.

    The figure is resized to 18x9 inches, displayed when ``show`` is true
    and written to "monthly_plot.png" when ``save`` is true.

    Raises:
        ValueError: from ``pd.concat`` when ``perstore`` or ``regions``
            yields no groups at all.
    """
    months = _months_between(bm, by, em, ey)

    if regions is None and not storetype and not perstore:
        labels = [_month_label(month, year) for month, year in months]
        totals = [
            get_frame_in_range(df, month, year, month, year)[target].sum()
            for month, year in months
        ]
        sb.barplot(
            x=labels, y=totals).set_title("Monthly " + target + " (All shops)")
    elif storetype:
        store_types = ("Hyper Market", "Super Market", "Standard Market",
                       "Shopping Center")
        labels = [_month_label(month, year) for month, year in months]
        means = {store_type: [] for store_type in store_types}
        for month, year in months:
            # Fetch each month once and reuse it for all four store types.
            frame = get_frame_in_range(df, month, year, month, year)
            for store_type in store_types:
                of_type = frame[frame["StoreType_" + store_type] == 1]
                means[store_type].append(of_type[target].mean())
        pdf = pd.concat([
            pd.DataFrame(data={
                "Month": labels,
                target: means[store_type],
                "Type": store_type
            }) for store_type in store_types
        ])
        sb.pointplot(
            x="Month", y=target, data=pdf,
            hue="Type").set_title("Monthly " + target + " per StoreType")
    elif perstore:
        pdf = _monthly_means_by_group(df, months, target, "StoreID",
                                      values_of(df, "StoreID"))
        # The title follows `target`; it used to always say
        # "NumberOfCustomers" regardless of what was plotted.
        sb.pointplot(
            x="Month", y=target, data=pdf,
            hue="StoreID").set_title("Monthly " + target + " per Store")
    else:
        pdf = _monthly_means_by_group(df, months, target, "Region", regions)
        sb.pointplot(
            x="Month", y=target, data=pdf,
            hue="Region").set_title("Monthly " + target + " per Region")

    plt.legend()
    fig = plt.gcf()
    fig.set_size_inches(18, 9)
    if show:
        plt.show()
    if save:
        fig.savefig("monthly_plot.png")
def _overlay_distplots(labelled_series, title, **distplot_kws):
    """Draw one labelled distplot per (label, series) pair, then a legend."""
    for label, series in labelled_series:
        sb.distplot(a=series, label=label, **distplot_kws).set_title(title)
    plt.legend()


def frequencypershop(df,
                     storeID,
                     target="NumberOfSales",
                     daily=False,
                     shoptype=False,
                     cloudcover=False,
                     events=False,
                     region=False,
                     show=True,
                     save=False):
    """Plot the distribution of ``target``, optionally split by a factor.

    Args:
        df: input frame; it is not modified.
        storeID: a positive value restricts the plot to that "StoreID".
            Non-positive values keep all rows; when no split flag is set,
            -1 draws a box plot per store, -2 overlays one curve per store
            whose "StoreType_Shopping Center" is 1, -3 overlays the sales
            and customers curves of stores 1307 and 1330, and any other
            value draws a single distribution of ``target``.
        target: column whose distribution is plotted.
        daily, shoptype, cloudcover, events, region: split flags, checked
            in this order; the first true one wins. They overlay one curve
            per weekday (from "Date", parsed as day/month/year), per
            "StoreType_*" column, per "CloudCover" level 0-8, per
            "Events_*" column, or per "Region" 0-10 respectively.
        show: display the figure.
        save: write the figure to "frequencypershop.png".
    """
    title = target + " Distribution"
    if storeID > 0:
        df = df[df["StoreID"] == storeID]
        title = title + " - Shop" + str(storeID)

    if daily:
        # Keep the weekday index in a local Series rather than writing
        # 'Date'/'Day' back into the caller's frame. `.dt.weekday_name` was
        # removed in pandas 1.0; `.dt.dayofweek` (Monday == 0) exists in
        # every pandas version.
        weekdays = pd.to_datetime(df['Date'], format='%d/%m/%Y').dt.dayofweek
        day_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                     "Saturday", "Sunday")
        _overlay_distplots(((name, df[weekdays == number][target])
                            for number, name in enumerate(day_names)), title)
    elif shoptype:
        store_types = ("Hyper Market", "Super Market", "Standard Market",
                       "Shopping Center")
        _overlay_distplots(((name, df[df["StoreType_" + name] == 1][target])
                            for name in store_types), title)
    elif cloudcover:
        _overlay_distplots(((str(level), df[df["CloudCover"] == level][target])
                            for level in range(9)), title)
    elif events:
        event_columns = (
            ("Events_none", "None"),
            ("Events_Fog", "Fog"),
            ("Events_Rain", "Rain"),
            ("Events_Thunderstorm", "Thunderstorm"),
            ("Events_Snow", "Snow"),
            ("Events_Hail", "Hail"),
        )
        _overlay_distplots(((label, df[df[column] == 1][target])
                            for column, label in event_columns), title)
    elif region:
        _overlay_distplots(((str(number), df[df["Region"] == number][target])
                            for number in range(11)), title)
    else:
        if storeID == -1:
            sb.boxplot(x="StoreID", y=target, data=df).set_title(title)
            plt.legend()
        elif storeID == -2:
            df = df[df["StoreType_Shopping Center"] == 1]
            _overlay_distplots(
                ((str(store), df[df["StoreID"] == store][target])
                 for store in values_of(df, "StoreID")),
                title,
                hist=False)
        elif storeID == -3:
            measures = (("NumberOfSales", "Sales"),
                        ("NumberOfCustomers", "Customers"))
            _overlay_distplots(
                ((str(store) + " - " + label,
                  df[df["StoreID"] == store][column])
                 for store in (1307, 1330)
                 for column, label in measures),
                title,
                hist=False)
        else:
            sb.distplot(a=df[target]).set_title(title)

    fig = plt.gcf()
    fig.set_size_inches(18, 9)
    if show:
        plt.show()
    if save:
        fig.savefig("frequencypershop.png")
# 'Min_VisibilitykM>CloudCover': 0.8677897336390897,
# 'Min_VisibilitykM>Events': 0.9578408890456871,
# 'Min_VisibilitykM>Max_Gust_SpeedKm_h': 0.8377138825189627,
# 'Min_VisibilitykM>Max_VisibilityKm': 1.0,
# 'Min_VisibilitykM>Mean_VisibilityKm': 1.0}

# Horizontal rule printed between the sections of the report.
_rule = "###################################################################"

print("MISS CORRELATIONS: ")
print(misses)
print(_rule)

ds = imp.full_preprocess(ds)
print("PREPROCESSING COMPLETE")
print(_rule)

# Missing-value report over every column of the preprocessed frame.
st.missing(ds, list(ds), printa=True)
print(_rule)

# Values of each nominal attribute.
for attr in d.nominal_only(ds):
    print("VALUES OF {}: {}".format(attr, d.values_of(ds, attr)))
print(_rule)

# How many values each numeric attribute takes.
for attr in d.numeric_only(ds):
    print("DISTINCT NUMBER OF {}: {}".format(attr, len(d.values_of(ds, attr))))

# One scatter plot per numeric attribute against 'NumberOfSales'.
for attr in list(d.numeric_only(ds)):
    gr.scatterplot(ds, attr, 'NumberOfSales')