def test_boundaries_get_set_correctly_by_default(self):
    """Default construction yields four boundaries evenly spaced over [bmin, bmax]."""
    mapper = ContinuousMapper(self.df["age"])
    span = self.bmax - self.bmin
    # Three equal-width bins: interior edges at 1/3 and 2/3 of the span.
    expected = [self.bmin + k * span / 3. for k in range(3)]
    expected.append(self.bmax)
    assert np.allclose(mapper.boundaries, expected)
def test_map_works_correctly(self):
    """The single-bin map accepts the bin midpoint and rejects both data extremes."""
    lower = self.bmin + 2
    upper = self.bmax - 2
    boundaries = [lower, upper]
    mapper = ContinuousMapper(self.df["age"], n_out=1, boundaries=boundaries)

    # The map is looked up under the "<lower>_<upper>" key built with this format.
    bin_name = "{:2f}_{:2f}".format(lower, upper)
    in_bin = mapper._map[bin_name]

    # np.diff returns a length-1 array here, so the midpoint is an array too.
    midpoint = lower + np.diff(boundaries) / 2.0
    assert in_bin(midpoint) == True

    # Both extremes of the data lie outside the narrowed bin.
    assert in_bin(self.bmin) == False
    assert in_bin(self.bmax) == False
def test_keys_set_correctly(self):
    """Every target key is one of the "<lower>_<upper>" names of the supplied bins."""
    n_out = 3
    span = self.bmax - self.bmin
    edges = [self.bmin + i * span / n_out for i in range(n_out + 1)]
    mapper = ContinuousMapper(self.df["age"], n_out=n_out, boundaries=edges)

    # One expected name per pair of adjacent edges.
    expected_names = [
        "{:2f}_{:2f}".format(lo, hi) for lo, hi in zip(edges[:-1], edges[1:])
    ]
    for key in mapper.targets.keys():
        assert key in expected_names
def test_default_targets(self):
    """For n_out = 1..5 the target at index n_out - 1 is close to 1 / n_out."""
    for n_out in range(1, 6):
        mapper = ContinuousMapper(self.df["age"], n_out=n_out)
        keys = list(mapper.targets.keys())
        checked_key = keys[n_out - 1]
        assert np.isclose(mapper.targets[checked_key], 1. / n_out)
def test_user_boundaries_get_set_correctly_when_n_out_is_one(self):
    """User-supplied boundaries are stored unchanged when a single bin is requested."""
    expected = [self.bmin + 10.0, self.bmax - 5.0]
    # n_out (= 1) is deliberately passed positionally, as the second argument.
    mapper = ContinuousMapper(self.df["age"], 1, boundaries=expected)
    assert np.allclose(mapper.boundaries, expected)
def test_number_of_boundaries_get_set_correctly_by_user(self):
    """Requesting n_out bins produces n_out + 1 boundaries."""
    requested_bins = 3
    mapper = ContinuousMapper(self.df["age"], n_out=requested_bins)
    n_boundaries = len(mapper.boundaries)
    assert n_boundaries == requested_bins + 1
def test_number_of_boundaries_set_correctly_by_default(self):
    """A mapper built with default arguments has exactly four boundaries."""
    mapper = ContinuousMapper(self.df["age"])
    n_boundaries = len(mapper.boundaries)
    assert n_boundaries == 4
def test_runs_with_default_values(self):
    """Smoke test: constructing a mapper with only the data column must not raise."""
    age_column = self.df["age"]
    ContinuousMapper(age_column)
def plot_distribution(df, xlabel, xmapper=None, xtype="categorical", ax=None,
                      cmap="YlGnBu", bins=30):
    """
    Plot the distribution of a single variable in the DataFrame.

    Categorical data is drawn as a bar plot of the per-category summary,
    continuous data as a histogram with a kernel density estimate on top.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    xmapper : entrofy.mappers.BaseMapper subclass object
        A mapper object to use for the data on the x-axis. If None, the
        object is created within this function using some defaults.

    xtype : {"categorical" | "continuous"}
        The type of the data in df[xlabel]

    ax : matplotlib.Axes object
        The matplotlib.Axes object to plot into. If None, a new figure
        with a single Axes is created.

    cmap : matplotlib.cm.colormap
        A matplotlib colormap; the plot colour is the middle entry of a
        five-colour palette drawn from it.

    bins : int
        The number of bins for the histogram (continuous data only).

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    Raises
    ------
    ValueError
        If `xtype` is neither "categorical" nor "continuous".
    """
    # Validate before doing any work so that an unknown type never yields a
    # silently empty plot (previously this was only checked when no mapper
    # was supplied).
    if xtype not in ("categorical", "continuous"):
        raise ValueError("xtype not valid.")

    if xmapper is None:
        if xtype == "categorical":
            xmapper = ObjectMapper(df[xlabel])
        else:
            xmapper = ContinuousMapper(df[xlabel])

    # Middle shade of a five-step palette.
    c = sns.color_palette(cmap, 5)[2]

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(8, 6))

    if xtype == "categorical":
        summary = _make_counts_summary(df[xlabel], xlabel, xmapper,
                                       datatype="all")
        summary = summary.sort_values(by=xlabel)

        sns.barplot(x=xlabel, y="counts", data=summary, ax=ax, color=c)
        ax.set_ylabel("Fraction of sample")
    else:
        column = df[xlabel]
        # Non-finite entries (NaN, +/-inf) cannot be histogrammed.
        c_clean = column[np.isfinite(column)]
        # Histogram styling belongs in `hist_kws`; `hist` is only an on/off
        # flag, so a dict passed there is merely truthy and its contents
        # are ignored.
        sns.distplot(c_clean, bins=bins, hist=True,
                     hist_kws={"histtype": "stepfilled"},
                     color=c, kde=True, ax=ax)
        ax.set_xlabel(xlabel)
        # Label the Axes we drew on, not whichever Axes pyplot considers
        # current.
        ax.set_ylabel("Number of occurrences")

    return ax
def plot_correlation(df, xlabel, ylabel, xmapper=None, ymapper=None, ax=None,
                     xtype="categorical", ytype="categorical", cmap="YlGnBu",
                     prefac=10., cat_type="box", cont_type="kde", s=2):
    """
    Plot two variables against each other. Produces different types of
    Figures depending on the type of data being plotted.

    Parameters
    ----------
    df : pd.DataFrame
        A pandas DataFrame with the data

    xlabel : str
        The column name for the variable on the x-axis

    ylabel : str
        The column name for the variable on the y-axis

    xmapper : entrofy.mappers.BaseMapper subclass object
        A mapper object to use for the data on the x-axis. If None, the
        object is created within this function using some defaults.

    ymapper : entrofy.mappers.BaseMapper subclass object
        A mapper object to use for the data on the y-axis. If None, the
        object is created within this function using some defaults.

    ax : matplotlib.Axes object
        The matplotlib.Axes object to plot into. If None, a new figure
        with a single Axes is created.

    xtype : {"categorical" | "continuous"}
        The type of the data in df[xlabel]

    ytype : {"categorical" | "continuous"}
        The type of the data in df[ylabel]

    cmap : matplotlib.cm.colormap
        A matplotlib colormap to use for shading the plot

    prefac : float
        A pre-factor steering the shading of the bubbles

    cat_type : {"box" | "strip" | "swarm" | "violin" | "categorical"}
        The type of plot for any plot including both categorical and
        continuous data. With "categorical" the continuous variable is
        first converted to categories using its mapper and the result is
        drawn like a categorical/categorical plot.

    cont_type : {"kde" | "scatter"}
        The type of plot to produce for two continuous variables. Either
        a kernel density estimate ("kde") or a scatter plot ("scatter").

    s : float
        A pre-factor changing the overall size of the bubbles

    Returns
    -------
    ax : matplotlib.Axes object
        The same matplotlib.Axes object for further manipulation

    Raises
    ------
    ValueError
        If `xtype` or `ytype` is neither "categorical" nor "continuous".
    """
    # Validate both axis types up front, before a figure is created, so an
    # invalid call neither leaks an empty figure nor fails later with an
    # unrelated message.
    valid_types = ("categorical", "continuous")
    if xtype not in valid_types:
        raise ValueError("Type of data in xcolumn is not recognized!")
    if ytype not in valid_types:
        raise ValueError("Type of data in ycolumn is not recognized!")

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(9, 7))

    x_is_cat = xtype == "categorical"
    y_is_cat = ytype == "categorical"

    # For a categorical axis the "keys" are the sorted category names; for a
    # continuous axis the column label itself is passed along instead.
    if x_is_cat:
        if xmapper is None:
            xmapper = ObjectMapper(df[xlabel])
        x_fields = len(xmapper.targets)
        x_keys = np.sort(list(xmapper.targets.keys()))
    else:
        if xmapper is None:
            xmapper = ContinuousMapper(df[xlabel], n_out=4)
        x_fields = None
        x_keys = xlabel

    if y_is_cat:
        if ymapper is None:
            ymapper = ObjectMapper(df[ylabel])
        y_fields = len(ymapper.targets)
        y_keys = np.sort(list(ymapper.targets.keys()))
    else:
        if ymapper is None:
            ymapper = ContinuousMapper(df[ylabel], n_out=4)
        y_fields = None
        y_keys = ylabel

    if x_is_cat and y_is_cat:
        return _plot_categorical(df, xlabel, ylabel, x_keys, y_keys,
                                 prefac, ax, cmap, s)

    if not x_is_cat and not y_is_cat:
        # Honour the caller's colormap rather than a hard-coded one.
        return _plot_continuous(df, xlabel, ylabel, ax, plottype=cont_type,
                                n_levels=10, cmap=cmap, shade=True)

    # From here on exactly one of the two axes is categorical.
    if cat_type != "categorical":
        n_cat = x_fields if x_is_cat else y_fields
        return _plot_categorical_and_continuous(df, xlabel, ylabel,
                                                x_keys, y_keys, ax, cmap,
                                                n_cat=n_cat,
                                                plottype=cat_type)

    # cat_type == "categorical": convert the continuous column to categories
    # and draw the result like a categorical/categorical plot.
    if x_is_cat:
        cat_column = _convert_continuous_to_categorical(df[ylabel], ymapper)
        cat_column.name = ylabel
        y_keys = np.sort(list(ymapper.targets.keys()))
        columns = [df[xlabel], cat_column]
    else:
        cat_column = _convert_continuous_to_categorical(df[xlabel], xmapper)
        cat_column.name = xlabel
        x_keys = np.sort(list(xmapper.targets.keys()))
        columns = [cat_column, df[ylabel]]

    # A list of Series becomes one *row* per Series; transpose so that each
    # Series ends up as a column named after its `.name`.
    df_temp = pd.DataFrame(columns).transpose()

    return _plot_categorical(df_temp, xlabel, ylabel, x_keys, y_keys,
                             prefac, ax, cmap, s)