예제 #1
0
def barchart(dataframe, numerical_column, is_percent=False):
    """
    Render a horizontal bar chart for one column of a frequency table.

    dataframe: table to plot; its index supplies the bar labels and its
        index name is used as the y-axis label
    numerical_column: name of the column holding the bar lengths
    is_percent: when truthy, the x-axis is pinned to the 0-100 range
    """
    rcParams['figure.figsize'] = 8, 5

    # The bars are drawn the same way in both modes; only the x-axis range
    # differs when the values are percentages.
    ordered_values = dataframe.sort_values(
        numerical_column, ascending=True)[numerical_column]
    ordered_values.plot.barh(color="#337ab7")
    if is_percent:
        plt.xlim(0, 100)

    plt.xticks(fontsize=12, rotation=0)
    plt.yticks(fontsize=12, rotation=0)
    plt.xlabel(numerical_column, fontsize=15)
    plt.ylabel(dataframe.index.name, fontsize=15)
    plt.tight_layout(rect=(0.1, 0.1, 0.9, 0.9))

    return plot_360_n0sc0pe(plt)
예제 #2
0
def stackbarchart(df, cat_a, cat_b):
    """Plot two categorical columns as a pair of stacked bar charts.

    The left chart stacks the row counts of ``cat_b`` inside each ``cat_a``
    bar; the right chart shows the same data normalised so every ``cat_a``
    bar sums to 1.

    A column with more than 10 distinct values keeps its 9 most frequent
    values and has every other value relabelled ``"Others"``.

    Args:
        df: DataFrame containing both columns.
        cat_a: Name of the column placed on the x-axis.
        cat_b: Name of the column used for the stacked segments.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = (10, 4)

    # Explicit copy: the relabelling below assigns into this frame, which
    # on a plain column subset triggers pandas' SettingWithCopyWarning.
    temp_df = df[[cat_a, cat_b]].copy()
    sorted_order = list(temp_df[cat_a].value_counts().index)

    # Keep only the top categories of cat_a (9 named + "Others").
    if df[cat_a].nunique() > 10:
        top_a = sorted_order[:9]
        temp_df.loc[~temp_df[cat_a].isin(top_a), cat_a] = "Others"
        sorted_order = top_a + ["Others"]

    # Same reduction for cat_b.
    if df[cat_b].nunique() > 10:
        top_b = list(temp_df[cat_b].value_counts().index[:9])
        temp_df.loc[~temp_df[cat_b].isin(top_b), cat_b] = "Others"

    fig, (ax1, ax2) = plt.subplots(1, 2)

    # Left: absolute counts.
    count_table = temp_df.groupby([cat_a, cat_b]).size().unstack()
    count_table.reindex(index=sorted_order).plot(
        kind='bar', stacked=True, ax=ax1, legend=False)
    ax1.set_title("Bar Chart", weight="bold", fontsize=12)

    # Right: per-cat_a proportions (each row divided by its own total).
    share_table = pd.crosstab(temp_df[cat_a], temp_df[cat_b], normalize="index")
    share_table.reindex(index=sorted_order).plot(
        kind='bar', stacked=True, ax=ax2)
    ax2.set_title("Stacked Chart", weight="bold", fontsize=12)

    # The legend sits outside the right-hand axes so it does not cover bars.
    ax2.legend(title=cat_b, loc='center left', bbox_to_anchor=(1, 0.5))

    plt.tight_layout(rect=(0.1, 0.05, 0.9, 0.95))
    return plot_360_n0sc0pe(plt)
예제 #3
0
def missing_count_row_wise(df):
    """Plot how many rows contain 0, 1, 2, ... missing cells.

    Each bar corresponds to one distinct per-row count of missing cells and
    is labelled ``"<count> (<percent>%)"``, where the percentage is that
    count relative to the number of columns. Bar length is the number of
    rows with that count.

    Args:
        df: DataFrame to inspect.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = 10, 4

    column_count = len(df.columns)

    # missing: row wise -> {missing cells in a row: number of such rows}
    rows_per_count = df.isnull().sum(axis=1).value_counts()

    # Build the frame from explicit columns. Wrapping the value_counts()
    # result in pd.DataFrame(..., columns=[...]) depends on the Series being
    # unnamed, which no longer holds on pandas >= 2.0 and yields all-NaN.
    missing_row_df = pd.DataFrame({
        "index": rows_per_count.index,
        "Number of rows": rows_per_count.values,
    })

    # Round to the nearest whole percent *before* casting to int; casting
    # 100 * round(ratio, 2) truncates values such as 28.999999999999996.
    if column_count:
        missing_row_df["percent"] = (
            100 * missing_row_df["index"] / column_count).round().astype(int)
    else:
        missing_row_df["percent"] = 0

    missing_row_df["Missing cells %"] = (
        missing_row_df["index"].astype(str)
        + " (" + missing_row_df["percent"].astype(str) + "%)")

    # Rows with the most missing cells are listed first.
    missing_row_df.sort_values("index", ascending=False, inplace=True)

    sns.barplot(x="Number of rows",
                y="Missing cells %",
                data=missing_row_df,
                orient="h",
                color="#337ab7")

    plt.xlabel('Number of rows', fontsize=12)
    plt.ylabel('Number of missing cells', fontsize=12)
    plt.tight_layout(rect=(0.1, 0, 0.9, 1))
    return plot_360_n0sc0pe(plt)
예제 #4
0
def scatterplot(df, num_a, num_b):
    """Plot two numeric columns: overlaid distributions and a regression plot.

    The left axes overlay the distributions of both columns (missing values
    dropped). The right axes show ``num_b`` against ``num_a`` with a fitted
    regression line, titled with their correlation rounded to two decimals.

    Args:
        df: DataFrame containing both columns.
        num_a: Name of the column used on the x-axis of the regression plot.
        num_b: Name of the column used on the y-axis of the regression plot.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2)

    # NOTE(review): labels are set on both curves but no legend is drawn,
    # so they are currently invisible -- confirm whether that is intended.
    sns.distplot(df[num_a].dropna(), color="skyblue", label=num_a, ax=ax1)
    sns.distplot(df[num_b].dropna(), color="red", label=num_b, ax=ax1)
    ax1.set_title("", weight="bold", fontsize=12)

    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, while older releases accept the keywords too.
    sns.regplot(x=df[num_a], y=df[num_b], ax=ax2)
    reg_coeff = round(df[num_a].corr(df[num_b]), 2)
    ax2.set_title("R: " + str(reg_coeff), weight="bold", fontsize=12)

    return plot_360_n0sc0pe(plt)
예제 #5
0
def wordcloud(series):
    """Draw a word cloud from the text values of a pandas Series.

    Args:
        series: Series whose non-missing values are converted to strings and
            concatenated into the text the cloud is generated from.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    # Join the values explicitly. str(series.values) is the numpy repr,
    # which is abbreviated with "..." for long arrays (so most of the text
    # would be dropped) and renders missing values as the word "nan".
    text = " ".join(series.dropna().astype(str))

    cloud = WordCloud(
        background_color='white',
        stopwords=set(STOPWORDS),
        max_words=200,
        max_font_size=40,
        random_state=42,  # fixed seed so the layout is reproducible
    ).generate(text)

    plt.figure(1)
    plt.imshow(cloud)
    plt.axis('off')
    return plot_360_n0sc0pe(plt)
예제 #6
0
def missing_matrix(data: pd.DataFrame) -> str:
    """Draw the missingno nullity matrix for a DataFrame.

    Args:
      data: Pandas DataFrame whose missing values are visualised.
    Returns:
      The value produced by ``plot_360_n0sc0pe`` for the drawn figure.
    """
    matrix_options = dict(
        figsize=(10, 4),
        color=hex_to_rgb("#337ab7"),
        fontsize=10,
        sparkline=False,
        labels=True,
    )
    missingno.matrix(data, **matrix_options)

    # Margins around the matrix inside the figure.
    plt.subplots_adjust(bottom=0.2, top=0.7, left=0.1, right=0.9)
    return plot_360_n0sc0pe(plt)
예제 #7
0
def missing_count_column_wise(df):
    """Plot the percentage of missing values in each column.

    Every column gets a full-height (100 %) bar with a red bar for its
    missing-value percentage drawn on top. Columns are ordered from most to
    least missing.

    Args:
        df: DataFrame to inspect.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = 10, 4

    row_count = df.shape[0]

    ## missing: column wise
    missing_counts = df.isnull().sum()
    if row_count:
        # Round to the nearest whole percent before the int cast; casting
        # 100 * round(ratio, 2) truncates (e.g. 28.999999999999996 -> 28).
        missing_percent = (100 * missing_counts / row_count).round().astype(int)
    else:
        # No rows means nothing can be missing; avoids a 0 / 0 division.
        missing_percent = missing_counts.astype(int)

    missing_column_df = missing_percent.to_frame("Missing values")
    # Background series: every column spans the full 100 %.
    missing_column_df["values"] = 100
    missing_column_df = missing_column_df.sort_values("Missing values",
                                                      ascending=False)

    ## bar plot
    sns.set_style("darkgrid")
    ## plot 1 - "total" - (top) series
    sns.barplot(x=missing_column_df.index,
                y="values",
                data=missing_column_df,
                color="#337ab7",
                label="Values")

    ## plot 2 - overlay - "bottom" series
    sns.barplot(x=missing_column_df.index,
                y="Missing values",
                data=missing_column_df,
                color="red",
                label="Missing")

    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # Numeric rotation: the string "90" is not a valid rotation value in
    # recent matplotlib releases.
    plt.xticks(rotation=90)
    plt.ylabel('(%)', fontsize=12)
    plt.title("Missing values % in each column", weight="bold", fontsize=12)
    plt.tight_layout()
    return plot_360_n0sc0pe(plt)
예제 #8
0
def boxplot(df, num, cat):
    """Draw box plots of a numeric column grouped by a categorical column.

    Categories are ordered by the mean of ``num``, highest first. When there
    are more than 10 categories, only the 9 with the highest mean keep their
    name and all remaining rows are grouped under ``"Others"``.

    Args:
        df: DataFrame containing both columns.
        num: Name of the numeric column (y-axis).
        cat: Name of the categorical column (x-axis).

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    rcParams['figure.figsize'] = 10, 5

    # Explicit copy: the "Others" relabelling below assigns into this frame,
    # which on a plain column subset triggers SettingWithCopyWarning.
    temp_df = df[[num, cat]].copy()
    sorted_order = (temp_df.groupby(cat, as_index=False)[num]
                    .mean()
                    .sort_values(num, ascending=False))

    if sorted_order.shape[0] > 10:
        # Positional slice: the first 9 rows of the mean-sorted table.
        top_categories = list(sorted_order[cat].iloc[:9])
        temp_df.loc[~temp_df[cat].isin(top_categories), cat] = "Others"
        category_order = top_categories + ["Others"]
    else:
        category_order = list(sorted_order[cat])

    sns.boxplot(x=cat, y=num, data=temp_df, order=category_order)

    plt.xticks(fontsize=12, rotation=0)
    plt.yticks(fontsize=12, rotation=0)
    plt.xlabel(cat, fontsize=15)
    plt.ylabel(num, fontsize=15)
    plt.tight_layout(rect=(0.1, 0.1, 0.9, 0.9))

    return plot_360_n0sc0pe(plt)
예제 #9
0
def histogram(series: pd.Series, col_name):
    """Draw a box plot stacked above a distribution plot of a numeric Series.

    Both plots share the x-axis; missing values are dropped first.

    Args:
        series: Values to plot.
        col_name: Not used by this function; kept so existing callers keep
            working.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    ## - fixing the size of the figure
    rcParams['figure.figsize'] = 10, 5

    x = np.array(series.dropna())

    # Split the figure into a thin box-plot strip above the histogram.
    _, (ax_box, ax_hist) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

    # The data is passed as x= by keyword: recent seaborn releases treat a
    # first positional argument as `data`, not as the x variable.
    sns.boxplot(x=x, ax=ax_box, color="#337ab7")
    sns.distplot(x, ax=ax_hist, color="#337ab7")

    ax_box.set(yticks=[])
    sns.despine(ax=ax_hist)
    sns.despine(ax=ax_box, left=True)

    plt.xticks(fontsize=12, rotation=0)
    plt.yticks(fontsize=12, rotation=0)
    plt.tight_layout()
    return plot_360_n0sc0pe(plt)
예제 #10
0
def networkplot(column_types, associations):
    """Draw the column-association network.

    Each column is a node and each association with absolute value of at
    least 0.01 is a directed edge from ``col_a`` to ``col_b``. Edge width is
    proportional to the absolute association. Node size is proportional to
    the node's summed outgoing association, scaled by the largest such sum.
    Node colour encodes the column type.

    Args:
        column_types: Lookup from column name to a type tag. ``"BOOL"``,
            ``"CAT"`` and ``"NUM"`` have dedicated colours; any other tag is
            drawn in blue. Indexed as ``column_types[name]``.
        associations: DataFrame with the columns ``col_a``, ``col_b``,
            ``association`` and ``type_``.

    Returns:
        The result of ``plot_360_n0sc0pe(plt)``.
    """
    ## preprocessing assns: edge weight is the magnitude of the association
    assn_df = associations.copy()
    assn_df["assn"] = assn_df["association"].abs()
    ## - filter out negligible associations
    assn_df = assn_df[assn_df["assn"] >= .01]

    ## creating graph
    G = nx.from_pandas_edgelist(assn_df,
                                source='col_a',
                                target='col_b',
                                edge_attr=["assn", "type_"],
                                create_using=nx.DiGraph())

    ## size attribute of nodes: summed outgoing association, normalised by
    ## the largest sum
    outgoing_total = assn_df.groupby("col_a")["assn"].sum()
    size_by_node = (outgoing_total / outgoing_total.max()).to_dict()

    color_by_type = {
        "BOOL": "#c03d3e",
        "CAT": "#3a923a",
        "NUM": "#337ab7",
    }

    for node in G:
        # A node that only appears as an edge target (col_b) has no outgoing
        # total; give it size 0 instead of failing on the lookup.
        G.nodes[node]['size'] = size_by_node.get(node, 0.0)
        G.nodes[node]['color'] = color_by_type.get(column_types[node], "blue")

    ## drawing
    ## - fixing the size of the figure
    plt.figure(figsize=(10, 4))

    ## color, size, width -- listed in the graph's own node/edge order
    node_color = [G.nodes[v]['color'] for v in G]
    node_size = [1000 * G.nodes[v]['size'] for v in G]
    edge_width = [20 * assn for _, _, assn in G.edges(data='assn')]

    ## layout
    pos = nx.spring_layout(G, iterations=50)

    ## node labels (`with_labels` is not a draw_networkx_labels parameter
    ## and is rejected by recent networkx releases, so it is not passed)
    nx.draw_networkx_labels(G,
                            pos,
                            font_size=15,
                            font_family='sans-serif',
                            font_color="#000000",
                            font_weight="bold")

    ## nodes
    nx.draw_networkx_nodes(G,
                           pos,
                           node_color=node_color,
                           node_size=node_size,
                           node_shape="o",
                           alpha=0.9,
                           linewidths=10)

    ## edges: one uniform style for every association type
    nx.draw_networkx_edges(G,
                           pos,
                           width=edge_width,
                           alpha=0.15,
                           style='solid',
                           edge_color="grey",
                           arrows=False)

    plt.axis('off')
    plt.tight_layout()
    return plot_360_n0sc0pe(plt)