def test_visualize_two_fields(self):
    """Smoke test: ``visualize_two_fields`` runs for each pairing of field kinds.

    Nothing is asserted about the produced figures; the test passes as long
    as none of the calls raises.
    """
    np.random.seed(1)
    iris = load_iris(target="species")
    # Add a second categorical column drawn at random from three labels.
    iris["cat"] = np.random.choice(["a", "b", "c"], size=iris.shape[0], replace=True)
    inspector = Inspector(iris)

    field_pairs = (
        ("sepal_width", "sepal_length"),  # continuous x continuous
        ("sepal_length", "species"),      # continuous x categorical
        ("species", "petal_width"),       # categorical x continuous
        ("species", "cat"),               # categorical x categorical
    )
    for first_field, second_field in field_pairs:
        inspector.visualize_two_fields(first_field, second_field)
# Or there must be a (relatively) clear relation between `workclass` and `label`. Looking at the crosstab of the two fields, we can find that `Never-worked` and `Without-pay` imply "low income". This is obvious, but important to check.

pd.crosstab(index=df["workclass"], columns=df["label"])

# Anyway, it is very important to see the relation between two fields, because it is part of understanding the data.
#
# |feature\target|categorical|continuous|
# |-|-|-|
# |categorical|Bar chart or heatmap|violin plot|
# |continuous|KDE or histogram|joint plot|
#
# #### categorical vs categorical
#
# The following bar chart shows the distributions of `workclass` by `label`. That is, the (lengths/areas of the) blue bars sum to 1, and so do the red bars.

inspector.visualize_two_fields("workclass", "label")

# We can show the same information using a heatmap. Since we compute the distribution of `workclass` by `label`, the sum of each row in the heatmap is 1.

inspector.visualize_two_fields("workclass", "label", heatmap=True, rotation=10)

# If one or both categorical fields can take lots of values, then it is better to use just a crosstab or its heatmap. A visualization is not always the best solution.

inspector.visualize_two_fields("workclass", "occupation")

inspector.visualize_two_fields("workclass", "occupation", heatmap=True, rotation=10)

# #### categorical vs continuous
# + np.random.seed(1) df = load_iris(target="species") df["cat1"] = np.random.choice(["a", "b", "c"], size=df.shape[0], replace=True) df["cat2"] = (df.iloc[:, 0] * df.iloc[:, 1] - df.iloc[:, 2] * df.iloc[:, 3] > 11).map({ True: 1, False: 0 }) inspector = Inspector(df) inspector ## 4 continuous variables and 3 categorical variables # - inspector.visualize_two_fields("sepal_width", "sepal_length") ## continuous x continuous inspector.visualize_two_fields("petal_width", "species") ## continuous x categorical inspector.visualize_two_fields("species", "petal_width") ## categorical x continuous inspector.visualize_two_fields("species", "cat2") inspector.visualize_two_fields("species", "cat2", heatmap=True) # + from sklearn.tree import DecisionTreeClassifier features = [ "sepal_length", "sepal_width", "petal_length", "petal_width", "cat2"