예제 #1
0
 def test_boundaries_get_set_correctly_by_default(self):
     """Default boundaries split [self.bmin, self.bmax] into equal thirds."""
     span = self.bmax - self.bmin
     expected = [
         self.bmin,
         self.bmin + span / 3.,
         self.bmin + 2. * span / 3.,
         self.bmax,
     ]
     mapper = ContinuousMapper(self.df["age"])
     assert np.allclose(mapper.boundaries, expected)
예제 #2
0
 def test_map_works_correctly(self):
     """A single-bin map accepts the bin midpoint and rejects bmin and bmax."""
     edges = [self.bmin + 2, self.bmax - 2]
     mapper = ContinuousMapper(self.df["age"], n_out=1, boundaries=edges)

     # The map is looked up by a name built from the two formatted edges.
     bin_name = "{:2f}_{:2f}".format(edges[0], edges[1])
     in_bin = mapper._map[bin_name]

     # Midpoint of the interval; np.diff makes this a one-element array.
     midpoint = edges[0] + np.diff(edges) / 2.0
     assert in_bin(midpoint) == True

     # self.bmin and self.bmax both lie outside the narrowed interval.
     for outside in (self.bmin, self.bmax):
         assert in_bin(outside) == False
예제 #3
0
    def test_keys_set_correctly(self):
        """Every key in the mapper's targets is one of the expected bin names."""
        n_out = 3
        span = self.bmax - self.bmin
        edges = [self.bmin + k * span / n_out for k in range(n_out + 1)]

        mapper = ContinuousMapper(self.df["age"],
                                  n_out=n_out,
                                  boundaries=edges)

        # One name per pair of neighbouring edges: "<lower>_<upper>".
        expected_names = [
            "{:2f}_{:2f}".format(lower, upper)
            for lower, upper in zip(edges, edges[1:])
        ]

        for key in mapper.targets.keys():
            assert key in expected_names
예제 #4
0
 def test_default_targets(self):
     """For n_out in 1..5, the n_out-th target value is close to 1 / n_out."""
     for n_out in range(1, 6):
         mapper = ContinuousMapper(self.df["age"], n_out=n_out)
         names = list(mapper.targets.keys())
         # Only the key at position n_out - 1 is inspected.
         probe = names[n_out - 1]
         assert np.isclose(mapper.targets[probe], 1. / n_out)
예제 #5
0
 def test_user_boundaries_get_set_correctly_when_n_out_is_one(self):
     """Boundaries passed in by the caller are reported back unchanged."""
     requested = [self.bmin + 10.0, self.bmax - 5.0]
     mapper = ContinuousMapper(self.df["age"], 1, boundaries=requested)
     assert np.allclose(mapper.boundaries, requested)
예제 #6
0
 def test_number_of_boundaries_get_set_correctly_by_user(self):
     """Requesting n_out bins yields n_out + 1 boundaries."""
     requested_bins = 3
     mapper = ContinuousMapper(self.df["age"], n_out=requested_bins)
     n_boundaries = len(mapper.boundaries)
     assert n_boundaries == requested_bins + 1
예제 #7
0
 def test_number_of_boundaries_set_correctly_by_default(self):
     """A mapper built without n_out exposes exactly four boundaries."""
     mapper = ContinuousMapper(self.df["age"])
     n_boundaries = len(mapper.boundaries)
     assert n_boundaries == 4
예제 #8
0
 def test_runs_with_default_values(self):
     """Smoke test: construction from just the data column must not raise."""
     age_column = self.df["age"]
     ContinuousMapper(age_column)
예제 #9
0
def plot_distribution(df, xlabel, xmapper=None, xtype="categorical", ax=None,
                      cmap="YlGnBu", bins=30):
    """
    Plot the distribution of a single variable in the DataFrame.

    Categorical data is drawn as a bar plot of the per-category summary;
    continuous data is drawn as a histogram with a kernel density estimate
    on top, after dropping non-finite entries.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    xmapper : entrofy.mappers.BaseMapper subclass object
        A mapper object to use for the data on the x-axis.
        If None, the object is created within this function using some defaults.

    xtype : {"categorical" | "continuous"}
        The type of the data in df[xlabel]

    ax : matplotlib.Axes object
        The matplotlib.Axes object to plot into. If None, a new figure
        with a single Axes is created.

    cmap : matplotlib.cm.colormap
        A matplotlib colormap; the middle shade of a five-colour palette
        built from it is used for the bars / histogram.

    bins : int
        The number of bins for the histogram (continuous data only).

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    Raises
    ------
    Exception
        If `xtype` is neither "categorical" nor "continuous".

    """
    # Validate before doing any work: previously an unknown xtype was only
    # rejected when no mapper was supplied, and otherwise produced an empty
    # plot without any warning.
    if xtype not in ("categorical", "continuous"):
        raise Exception("xtype not valid.")

    if xmapper is None:
        if xtype == "categorical":
            xmapper = ObjectMapper(df[xlabel])
        else:
            # NOTE(review): the mapper is not used by the continuous branch
            # below; it is still constructed to keep the original behaviour.
            xmapper = ContinuousMapper(df[xlabel])

    # Middle colour of a five-step palette derived from `cmap`.
    c = sns.color_palette(cmap, 5)[2]

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(8, 6))

    if xtype == "categorical":
        summary = _make_counts_summary(df[xlabel], xlabel,
                                       xmapper, datatype="all")

        summary = summary.sort_values(by=xlabel)

        sns.barplot(x=xlabel, y="counts", data=summary, ax=ax, color=c)
        ax.set_ylabel("Fraction of sample")

    else:
        column = df[xlabel]
        # Drop NaN / +-inf entries before building the histogram.
        c_clean = column[np.isfinite(column)]
        # `hist` is a boolean switch in seaborn.distplot; matplotlib
        # histogram options such as `histtype` must go through `hist_kws`.
        sns.distplot(c_clean, bins=bins, hist=True,
                     hist_kws={"histtype": "stepfilled"},
                     color=c, kde=True, ax=ax)
        ax.set_xlabel(xlabel)
        # Label the axes we actually drew on, not pyplot's "current" axes,
        # which can be a different one when the caller passed `ax`.
        ax.set_ylabel("Number of occurrences")

    return ax
예제 #10
0
def plot_correlation(df, xlabel, ylabel, xmapper=None, ymapper=None,
                      ax = None, xtype="categorical", ytype="categorical",
                      cmap="YlGnBu", prefac=10., cat_type="box",
                      cont_type="kde", s=2):

    """
    Plot two variables against each other. Produces different types of
    Figures depending on the type of data being plotted.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    ylabel : str
        The column name for the variable on the y-axis

    xmapper : entrofy.mappers.BaseMapper subclass object
        A mapper object to use for the data on the x-axis.
        If None, the object is created within this function using some defaults.

    ymapper : entrofy.mappers.BaseMapper subclass object
        A mapper object to use for the data on the y-axis.
        If None, the object is created within this function using some defaults.

    ax : matplotlib.Axes object
        The matplotlib.Axes object to plot into. If None, a new figure
        with a single Axes is created.

    xtype : {"categorical" | "continuous"}
        The type of the data in df[xlabel]

    ytype : {"categorical" | "continuous"}
        The type of the data in df[ylabel]

    cmap : matplotlib.cm.colormap
        A matplotlib colormap to use for shading the plot

    prefac : float
        A pre-factor steering the shading of the bubbles

    cat_type : {"box" | "strip" | "swarm" | "violin" | "categorical"}
        The type of plot for any plot including both categorical and continuous
        data. With "categorical", the continuous variable is first binned
        through its mapper and both variables are plotted as categorical.

    cont_type : {"kde" | "scatter"}
        The type of plot to produce. Either a kernel density estimate ("kde")
        or a scatter plot ("scatter").

    s : float
        A pre-factor changing the overall size of the bubbles

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    Raises
    ------
    Exception
        If `xtype` or `ytype` is neither "categorical" nor "continuous".

    """
    # Resolve mapper, category count and category keys for the x-axis.
    if xtype == "categorical":
        if xmapper is None:
            xmapper = ObjectMapper(df[xlabel])
        x_fields = len(xmapper.targets)
        x_keys = np.sort(list(xmapper.targets.keys()))
    elif xtype == "continuous":
        if xmapper is None:
            xmapper = ContinuousMapper(df[xlabel], n_out=4)
        x_fields = None
        x_keys = xlabel
    else:
        raise Exception("Type of data in xcolumn is not recognized!")

    # Same for the y-axis; an unknown ytype is rejected explicitly instead
    # of falling through to a generic error further down.
    if ytype == "categorical":
        if ymapper is None:
            ymapper = ObjectMapper(df[ylabel])
        y_fields = len(ymapper.targets)
        y_keys = np.sort(list(ymapper.targets.keys()))
    elif ytype == "continuous":
        if ymapper is None:
            ymapper = ContinuousMapper(df[ylabel], n_out=4)
        y_fields = None
        y_keys = ylabel
    else:
        raise Exception("Type of data in ycolumn is not recognized!")

    # Create the figure only once the inputs are known to be usable, so a
    # bad argument does not leave an empty figure behind.
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(9, 7))

    if xtype == "categorical" and ytype == "categorical":
        ax = _plot_categorical(df, xlabel, ylabel,
                               x_keys, y_keys, prefac,
                               ax, cmap, s)

    elif xtype == "categorical" and ytype == "continuous":
        if cat_type == "categorical":
            # Bin the continuous y column and plot both as categorical.
            cat_column = _convert_continuous_to_categorical(df[ylabel],
                                                            ymapper)
            cat_column.name = ylabel
            y_keys = np.sort(list(ymapper.targets.keys()))
            df_temp = pd.DataFrame([df[xlabel], cat_column]).transpose()

            # Forward `s` so the caller's size factor is honoured here too.
            ax = _plot_categorical(df_temp, xlabel, ylabel,
                                   x_keys, y_keys, prefac,
                                   ax, cmap, s)
        else:
            ax = _plot_categorical_and_continuous(df, xlabel, ylabel, x_keys,
                                                  y_keys, ax, cmap,
                                                  n_cat=x_fields,
                                                  plottype=cat_type)

    elif xtype == "continuous" and ytype == "categorical":
        if cat_type == "categorical":
            # Mirror of the branch above with the roles of x and y swapped.
            cat_column = _convert_continuous_to_categorical(df[xlabel],
                                                            xmapper)
            # Name the binned column and transpose: passing `columns=` to a
            # DataFrame built from a list of Series selects (missing)
            # columns rather than naming the rows, yielding all-NaN data.
            cat_column.name = xlabel
            x_keys = np.sort(list(xmapper.targets.keys()))
            df_temp = pd.DataFrame([cat_column, df[ylabel]]).transpose()

            # Same argument list as the other _plot_categorical calls; the
            # field counts are not part of it.
            ax = _plot_categorical(df_temp, xlabel, ylabel,
                                   x_keys, y_keys, prefac,
                                   ax, cmap, s)
        else:
            ax = _plot_categorical_and_continuous(df, xlabel, ylabel, x_keys,
                                                  y_keys, ax, cmap,
                                                  n_cat=y_fields,
                                                  plottype=cat_type)

    else:
        # Both continuous. Use the caller's colormap rather than a
        # hard-coded one (the default value is unchanged).
        ax = _plot_continuous(df, xlabel, ylabel, ax, plottype=cont_type,
                              n_levels=10, cmap=cmap, shade=True)

    return ax