示例#1
0
def one_hot_numeric(ds, attr, header):
    """Add one indicator column per value of a numeric attribute.

    For every value returned by ``d.values_of(ds, attr)`` a column named
    ``header + str(value)`` is added to ``ds``. The column is initialised to
    zeros and set to 1 on each row for which ``d.content_of(ds, attr, row)``
    equals that value. The ``attr`` column itself is not removed.

    Args:
        ds: DataFrame to encode; it is modified in place.
        attr: Name of the column to encode.
        header: Prefix used to build the names of the generated columns.

    Returns:
        The same DataFrame object ``ds``, with the indicator columns added.
    """
    for value in d.values_of(ds, attr):
        column = header + str(value)
        ds[column] = p.Series(np.zeros(len(ds)), ds.index)
        for i in ds.index.tolist():
            if d.content_of(ds, attr, i) == value:
                # DataFrame.set_value was removed in pandas 1.0; .at is the
                # supported label-based scalar setter.
                ds.at[i, column] = 1
    return ds
示例#2
0
def frequencystatplot(df, show=True, save=False):
    """Bar-plot, for each region 0..10, how many "StoreID" values it has.

    The bar height for a region is ``len(values_of(rows, "StoreID"))`` where
    ``rows`` are the rows of ``df`` whose "Region" equals that region.

    Args:
        df: DataFrame with "Region" and "StoreID" columns.
        show: When true, display the figure with ``plt.show()``.
        save: When true, write the figure to "frequencystatplot.png".
    """
    region_ids = list(range(11))
    stores_per_region = [
        len(values_of(df[df["Region"] == region_id], "StoreID"))
        for region_id in region_ids
    ]

    sb.barplot(x=region_ids, y=stores_per_region)
    figure = plt.gcf()
    figure.set_size_inches(18, 9)

    if show:
        plt.show()
    if save:
        figure.savefig("frequencystatplot.png")
示例#3
0
def add_min_per_shop(df, data_from=None):
    """Add a 'min_shop' column holding each row's per-store ``min_per_shop`` value.

    ``min_per_shop(data_from, store_id)`` is evaluated once for every store
    returned by ``d.values_of(data_from, 'StoreID')``; each row of ``df`` then
    receives the value computed for its own 'StoreID'.

    Args:
        df: DataFrame to extend; it is modified in place.
        data_from: DataFrame the per-store values are computed from.
            Defaults to ``df`` itself.

    Returns:
        The same DataFrame object ``df``, with the 'min_shop' column added.

    Raises:
        KeyError: If a row of ``df`` refers to a store that does not appear
            in ``data_from``.
    """
    if data_from is None:
        data_from = df

    # Keyed by str(StoreID), matching the lookup below; the membership test
    # keeps min_per_shop from being evaluated twice for the same key.
    minima = {}
    for store_id in d.values_of(data_from, 'StoreID'):
        key = str(store_id)
        if key not in minima:
            minima[key] = min_per_shop(data_from, store_id)

    df['min_shop'] = p.Series(np.zeros(len(df)), df.index)
    for i in df.index.tolist():
        # DataFrame.set_value was removed in pandas 1.0; use .at instead.
        df.at[i, 'min_shop'] = minima[str(d.content_of(df, 'StoreID', i))]
    return df
示例#4
0
def add_avg_cust_per_shop(df, data_from=None):
    """Add a 'meancustshop' column holding each row's per-store
    ``average_cust_per_shop`` value.

    ``average_cust_per_shop(data_from, store_id)`` is evaluated once for every
    store returned by ``d.values_of(data_from, 'StoreID')``; each row of ``df``
    then receives the value computed for its own 'StoreID'.

    Args:
        df: DataFrame to extend; it is modified in place.
        data_from: DataFrame the per-store values are computed from.
            Defaults to ``df`` itself.

    Returns:
        The same DataFrame object ``df``, with the 'meancustshop' column added.

    Raises:
        KeyError: If a row of ``df`` refers to a store that does not appear
            in ``data_from``.
    """
    if data_from is None:
        data_from = df

    # Keyed by str(StoreID), matching the lookup below; the membership test
    # keeps average_cust_per_shop from being evaluated twice for the same key.
    averages = {}
    for store_id in d.values_of(data_from, 'StoreID'):
        key = str(store_id)
        if key not in averages:
            averages[key] = average_cust_per_shop(data_from, store_id)

    df['meancustshop'] = p.Series(np.zeros(len(df)), df.index)
    for i in df.index.tolist():
        # DataFrame.set_value was removed in pandas 1.0; use .at instead.
        df.at[i, 'meancustshop'] = averages[str(d.content_of(df, 'StoreID', i))]
    return df
示例#5
0
def one_hot(ds, attr, header, split=False):
    """Add one indicator column per (sub)value of a string attribute.

    The candidate values come from ``d.values_of(ds, attr)``. With
    ``split=True`` each value is first broken on "-" and the distinct parts,
    in order of first appearance, become the candidates. For every candidate
    a column named ``header + candidate`` is added, initialised to zeros and
    set to 1 on each row whose ``d.content_of(ds, attr, row)`` contains the
    candidate as a substring (``.find(candidate) != -1``). Because the match
    is a substring test, a candidate that is part of a longer value also
    flags the rows holding the longer value.

    Args:
        ds: DataFrame to encode; it is modified in place.
        attr: Name of the column to encode.
        header: Prefix used to build the names of the generated columns.
        split: When true, treat "-" as a separator between several values
            stored in one cell.

    Returns:
        The same DataFrame object ``ds``, with the indicator columns added.
    """
    vals = d.values_of(ds, attr)
    if split:
        new_cols = []
        for v in vals:
            # Use a separate name for the parts so the ``split`` flag is not
            # rebound while it is still in scope.
            for part in v.split("-"):
                if part not in new_cols:
                    new_cols.append(part)
    else:
        new_cols = vals

    for new in new_cols:
        column = header + new
        ds[column] = p.Series(np.zeros(len(ds)), ds.index)
        for i in ds.index.tolist():
            if d.content_of(ds, attr, i).find(new) != -1:
                # DataFrame.set_value was removed in pandas 1.0; use .at.
                ds.at[i, column] = 1
    return ds
示例#6
0
            print("merged dataframe:", new_df.shape)

            new_df_corr = new_df.corr(method="pearson")

            numeric_clustermap = sb.clustermap(new_df_corr,
                                               square="True",
                                               cmap="Blues",
                                               annot=True)
            pl.figure(figsize=(20, 20))
            pl.show()

    #ax = sb.jointplot(x="NearestCompetitor", y="NumberOfSales", data=data_numeric, marker='+')
    #pl.show()
    #ax = sb.jointplot(x="NearestCompetitor", y="NumberOfCustomers", data=data_numeric, marker='+')
    #pl.show()

    df = datasetfun.read_dataset()
    num_hyperm = []  # the index identifies the region
    for regionId in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]:
        ds = df[df["StoreType"] == "Hyper Market"]
        ds = ds[ds["Region"] == regionId]
        num_hyperm.append(len(datasetfun.values_of(ds, "StoreID")))

    print(num_hyperm)

    sb.barplot(x=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], y=num_hyperm)
    pl.show()

    sb.pointplot()
示例#7
0
def _months_between(month, year, stop_month, stop_year):
    """Yield (month, year) pairs from the start up to, but excluding, the stop month."""
    # Tuple comparison (instead of "!=") guarantees termination even when the
    # start lies after the stop; such a range simply yields nothing.
    while (year, month) < (stop_year, stop_month):
        yield month, year
        month += 1
        if month > 12:
            month = 1
            year += 1


def _plot_monthly_means_by(df, column, keys, months, labels, target, title):
    """Point-plot the monthly mean of ``target`` with one hue per value of ``column``."""
    frames = []
    for key in keys:
        # The row subset does not depend on the month, so build it once.
        subset = df[df[column] == key]
        means = [
            get_frame_in_range(subset, m, y, m, y)[target].mean()
            for m, y in months
        ]
        frames.append(
            pd.DataFrame(data={
                "Month": labels,
                target: means,
                column: key
            }))
    pdf = pd.concat(frames)
    sb.pointplot(x="Month", y=target, data=pdf, hue=column).set_title(title)


def monthlyplot(df,
                bm=3,
                by=2016,
                em=2,
                ey=2018,
                regions=None,
                storetype=False,
                perstore=False,
                target="NumberOfSales",
                show=True,
                save=False):
    """Plot a month-by-month aggregate of ``target`` between two months.

    Each month's rows are obtained with
    ``get_frame_in_range(df, month, year, month, year)``. Exactly one plot is
    drawn, chosen in this order:

    * no ``regions``, ``storetype`` or ``perstore``: bar plot of the monthly
      sum of ``target`` over all rows;
    * ``storetype``: point plot of the monthly mean of ``target`` for each of
      the four "StoreType_*" indicator columns;
    * ``perstore``: point plot of the monthly mean of ``target`` for every
      store returned by ``values_of(df, "StoreID")``;
    * otherwise: point plot of the monthly mean of ``target`` for every
      region listed in ``regions``.

    Args:
        df: DataFrame holding the data to plot.
        bm: First month (1-12) of the range.
        by: Year of the first month.
        em: Last month of the range, inclusive.
        ey: Year of the last month.
        regions: Iterable of "Region" values to plot, or None.
        storetype: When true, plot one line per store type.
        perstore: When true, plot one line per store.
        target: Name of the column to aggregate.
        show: When true, display the figure with ``plt.show()``.
        save: When true, write the figure to "monthly_plot.png".
    """
    # Convert the inclusive end month into an exclusive stop month.
    if em == 12:
        stop_month, stop_year = 1, ey + 1
    else:
        stop_month, stop_year = em + 1, ey

    months = list(_months_between(bm, by, stop_month, stop_year))
    labels = [str(m) + "-" + str(y) for m, y in months]

    if regions is None and not storetype and not perstore:
        totals = [
            get_frame_in_range(df, m, y, m, y)[target].sum()
            for m, y in months
        ]
        sb.barplot(x=labels,
                   y=totals).set_title("Monthly " + target + " (All shops)")
    elif storetype:
        store_types = ("Hyper Market", "Super Market", "Standard Market",
                       "Shopping Center")
        means = {name: [] for name in store_types}
        for m, y in months:
            # One frame per month, shared by the four store types.
            frame = get_frame_in_range(df, m, y, m, y)
            for name in store_types:
                means[name].append(
                    frame[frame["StoreType_" + name] == 1][target].mean())
        pdf = pd.concat([
            pd.DataFrame(data={
                "Month": labels,
                target: means[name],
                "Type": name
            }) for name in store_types
        ])
        sb.pointplot(x="Month", y=target, data=pdf, hue="Type").set_title(
            "Monthly " + target + " per StoreType")
    elif perstore:
        # The title names the plotted column; it used to read
        # "NumberOfCustomers" whatever ``target`` was.
        _plot_monthly_means_by(df, "StoreID", values_of(df, "StoreID"),
                               months, labels, target,
                               "Monthly " + target + " per Store")
    else:
        _plot_monthly_means_by(df, "Region", regions, months, labels, target,
                               "Monthly " + target + " per Region")

    plt.legend()
    fig = plt.gcf()
    fig.set_size_inches(18, 9)

    if show:
        plt.show()
    if save:
        fig.savefig("monthly_plot.png")
示例#8
0
def _overlay_distplots(df, target, title, groups, **distplot_kwargs):
    """Overlay one distplot of ``target`` per (label, row selector) pair, then add the legend."""
    for label, selector in groups:
        sb.distplot(a=df[selector][target], label=label,
                    **distplot_kwargs).set_title(title)
    plt.legend()


def frequencypershop(df,
                     storeID,
                     target="NumberOfSales",
                     daily=False,
                     shoptype=False,
                     cloudcover=False,
                     events=False,
                     region=False,
                     show=True,
                     save=False):
    """Plot the distribution of ``target``, optionally split into groups.

    When ``storeID`` is positive only the rows of that store are used and the
    store number is appended to the title. The first true flag among
    ``daily``, ``shoptype``, ``cloudcover``, ``events`` and ``region`` selects
    how the rows are grouped; one distribution is overlaid per group. When no
    flag is set, ``storeID`` selects the plot:

    * ``-1``: box plot of ``target`` per "StoreID";
    * ``-2``: one density curve per store among the rows whose
      "StoreType_Shopping Center" is 1;
    * ``-3``: density curves of "NumberOfSales" and "NumberOfCustomers" for
      stores 1307 and 1330;
    * anything else: a single distribution of ``target``.

    Args:
        df: DataFrame holding the data to plot. It is not modified.
        storeID: Store to restrict to when positive, otherwise one of the
            special codes described above.
        target: Name of the column whose distribution is plotted.
        daily: Group by weekday name derived from the "Date" column
            (parsed as day/month/year).
        shoptype: Group by the four "StoreType_*" indicator columns.
        cloudcover: Group by "CloudCover" values 0..8.
        events: Group by the "Events_*" indicator columns.
        region: Group by "Region" values 0..10.
        show: When true, display the figure with ``plt.show()``.
        save: When true, write the figure to "frequencypershop.png".
    """
    title = target + " Distribution"

    if storeID > 0:
        df = df[df["StoreID"] == storeID]
        title = title + " - Shop" + str(storeID)

    if daily:
        # Keep the weekday names in a local Series instead of writing "Date"
        # and "Day" columns into the caller's frame (or into a filtered
        # slice of it). Series.dt.weekday_name was removed in pandas 1.0;
        # day_name() is its replacement.
        weekday = pd.to_datetime(df['Date'], format='%d/%m/%Y').dt.day_name()
        day_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                     "Saturday", "Sunday")
        _overlay_distplots(df, target, title,
                           ((day, weekday == day) for day in day_names))
    elif shoptype:
        store_types = ("Hyper Market", "Super Market", "Standard Market",
                       "Shopping Center")
        _overlay_distplots(df, target, title,
                           ((name, df["StoreType_" + name] == 1)
                            for name in store_types))
    elif cloudcover:
        _overlay_distplots(df, target, title,
                           ((str(level), df["CloudCover"] == level)
                            for level in range(9)))
    elif events:
        # (legend label, indicator column); only "none" differs in case.
        event_columns = (("None", "Events_none"), ("Fog", "Events_Fog"),
                         ("Rain", "Events_Rain"),
                         ("Thunderstorm", "Events_Thunderstorm"),
                         ("Snow", "Events_Snow"), ("Hail", "Events_Hail"))
        _overlay_distplots(df, target, title,
                           ((label, df[column] == 1)
                            for label, column in event_columns))
    elif region:
        _overlay_distplots(df, target, title,
                           ((str(region_id), df["Region"] == region_id)
                            for region_id in range(11)))
    else:
        if storeID == -1:
            sb.boxplot(x="StoreID", y=target, data=df).set_title(title)
            plt.legend()
        elif storeID == -2:
            centers = df[df["StoreType_Shopping Center"] == 1]
            _overlay_distplots(centers, target, title,
                               ((str(store), centers["StoreID"] == store)
                                for store in values_of(centers, "StoreID")),
                               hist=False)
        elif storeID == -3:
            # Fixed comparison of two stores on both measures, regardless of
            # ``target`` (which only appears in the title).
            for store in (1307, 1330):
                rows = df[df["StoreID"] == store]
                for column, suffix in (("NumberOfSales", "Sales"),
                                       ("NumberOfCustomers", "Customers")):
                    sb.distplot(a=rows[column],
                                hist=False,
                                label=str(store) + " - " +
                                suffix).set_title(title)
            plt.legend()
        else:
            sb.distplot(a=df[target]).set_title(title)
    fig = plt.gcf()
    fig.set_size_inches(18, 9)

    if show:
        plt.show()
    if save:
        fig.savefig("frequencypershop.png")
        # 'Min_VisibilitykM>CloudCover': 0.8677897336390897,
        # 'Min_VisibilitykM>Events': 0.9578408890456871,
        # 'Min_VisibilitykM>Max_Gust_SpeedKm_h': 0.8377138825189627,
        # 'Min_VisibilitykM>Max_VisibilityKm': 1.0,
        # 'Min_VisibilitykM>Mean_VisibilityKm': 1.0}

        print("MISS CORRELATIONS: ")
        print(misses)
    print(
        "###################################################################")
    ds = imp.full_preprocess(ds)
    print("PREPROCESSING COMPLETE")

    print(
        "###################################################################")
    st.missing(ds, list(ds), printa=True)

    print(
        "###################################################################")
    for attr in d.nominal_only(ds):
        print("VALUES OF " + attr + ": " + str(d.values_of(ds, attr)))

    print(
        "###################################################################")
    for attr in d.numeric_only(ds):
        print("DISTINCT NUMBER OF " + attr + ": " +
              str(len(d.values_of(ds, attr))))

    for attr in list(d.numeric_only(ds)):
        gr.scatterplot(ds, attr, 'NumberOfSales')