def euclidean_dist(self):
        """ This metric measures the preservation of intrinsic patterns occurring between the attributes
        of the original dataset in the corresponding synthetic dataset. The lower the value is the better the data generation
        tool preserves the patterns.
        The threshold limit for this metric is a value below 14."""

        real_cat, synth_cat = self.to_cat(self.origdst, self.synthdst)

        real_cat_dem = self.get_demographics(real_cat)
        synth_cat_dem = self.get_demographics(synth_cat)

        corr_real_obj = associations(real_cat_dem,
                                     theil_u=True,
                                     bias_correction=False,
                                     plot=False)
        corr_synth_obj = associations(synth_cat_dem,
                                      theil_u=True,
                                      bias_correction=False,
                                      plot=False)

        corr_real = corr_real_obj['corr']
        corr_synth = corr_synth_obj['corr']

        # Pairwise Euclidean distances between the rows of the two association matrices
        eucl_matr = distance.cdist(corr_real, corr_synth, 'euclidean')

        # Collapse the distance matrix into a single score via the Frobenius norm
        eucl = LA.norm(eucl_matr)

        return eucl, eucl_matr

    def pairwise_correlation_difference(self):
        """ PCD measures the difference in terms of Frobenius norm of the correlation matrices computed from real and synthetic
        datasets. The smaller the PCD, the closer the synthetic data is to the real data in terms of linear correlations across
        the variables.
        The threshold limit for this metric is a value below 2.4 """

        real_cat, synth_cat = self.to_cat(self.origdst, self.synthdst)

        real_cat_dem = self.get_demographics(real_cat)
        synth_cat_dem = self.get_demographics(synth_cat)

        corr_real_obj = associations(real_cat_dem,
                                     theil_u=True,
                                     bias_correction=False,
                                     plot=False)
        corr_synth_obj = associations(synth_cat_dem,
                                      theil_u=True,
                                      bias_correction=False,
                                      plot=False)

        corr_real = corr_real_obj['corr']
        corr_synth = corr_synth_obj['corr']

        # PCD: Frobenius norm of the element-wise difference of the association matrices
        subtract_m = np.subtract(corr_real, corr_synth)
        prwcrdst = LA.norm(subtract_m)

        return prwcrdst, subtract_m
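A minimal standalone sketch of the PCD computation above, assuming two toy DataFrames with identical columns (the data, and which dython keyword applies, are illustrative):

import numpy as np
import pandas as pd
from numpy import linalg as LA
from dython.nominal import associations

real = pd.DataFrame({'age': [23, 45, 31, 52, 36],
                     'sex': ['M', 'F', 'F', 'M', 'F']})
synth = pd.DataFrame({'age': [25, 44, 30, 50, 35],
                      'sex': ['M', 'F', 'M', 'M', 'F']})

# Older dython releases take theil_u=True; newer ones take nom_nom_assoc='theil'
corr_real = associations(real, theil_u=True, plot=False)['corr']
corr_synth = associations(synth, theil_u=True, plot=False)['corr']

# PCD is the Frobenius norm of the difference of the two association matrices
pcd = LA.norm(np.subtract(corr_real, corr_synth))
print(pcd)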
Example #3
def correlationMatrix(data, encoded):

    if encoded:
        correlation_matrix = data.corr(method='spearman')
        plt.figure(figsize=(8, 8))
        ax = sns.heatmap(correlation_matrix,
                         vmax=1,
                         square=True,
                         annot=True,
                         fmt='.2f',
                         cmap='GnBu',
                         cbar_kws={"shrink": .5},
                         robust=True)
        plt.title('Correlation matrix between features', fontsize=20)
        plt.show()
    else:
        nominal.associations(data,
                             theil_u=True,
                             nominal_columns=[
                                 'SendingCountry', 'ReceivingCountry',
                                 'MobilityType', 'SpecialNeeds',
                                 'SubjectAreaName', 'LevelOfStudy',
                                 'ParticipantGender', 'Language',
                                 'SendingPartnerErasmusID',
                                 'HostingPartnerCity'
                             ])
Example #4
def registration_correlations(passed_df=None,
                              save_path=None,
                              columns=None,
                              prediction_window=None,
                              scaled=False,
                              cmap='coolwarm'):
    """
    registration_correlations(save_path = None, columns = None, prediction_window=None, 
                              scaled=False, drop_course=False, cmap='coolwarm')
    ---
    Loads registrations according to giving prediction window, and creates a 
    dython.associations() correlation plot between all or listed columns.
    ---
    save_path: (optional) path to save figure to.
    columns: (optional) columns to plot correlations between
    prediction_window: (int) how far into the course the dataframe should include
    scaled: (boolean) whether to use CourseScaler to scale data by course.
    cmap: (default is 'coolwarm') colormap for plotted correlations
    """
    import matplotlib.pyplot as plt
    from dython.nominal import associations
    if passed_df is None:
        df = load_OU_data(prediction_window=prediction_window)
    else:
        df = passed_df.copy()
    if 'final_result' in df.columns:
        df.loc[df['final_result'] == 'Withdrawn', 'final_result'] = 0
        df.loc[df['final_result'] == 'Fail', 'final_result'] = 1
        df.loc[df['final_result'] == 'Pass', 'final_result'] = 2
        df.loc[df['final_result'] == 'Distinction', 'final_result'] = 3
    if 'age_band' in df.columns:
        df.loc[df['age_band'] == '0-35', 'age_band'] = 0
        df.loc[df['age_band'] == '35-55', 'age_band'] = 1
        df.loc[df['age_band'] == '55<=', 'age_band'] = 2

    to_drop = ['code_presentation', 'id_student', 'module_presentation_length']
    if prediction_window is None:
        to_drop.append('date_unregistration')
    for column in to_drop:
        if column in df.columns:
            df = df.drop(column, axis=1)
    if scaled:
        if 'code_module' in df.columns:
            cs = CourseScaler(drop_course=False)
            df = cs.fit_transform(df)
        else:
            print('cannot scale, code_module not found in columns')
    if isinstance(columns, list):
        df = df[columns]
    fig, ax = plt.subplots(1,
                           1,
                           figsize=(len(df.columns) * 2**1.2,
                                    len(df.columns) * 1.5**1.2))
    fig.suptitle('Variable Correlations', fontsize=len(df.columns) * 2 + 5)

    associations(df, ax=ax, mark_columns=False, cmap=cmap)
    if isinstance(save_path, str):
        fig.savefig(save_path, dpi=250)
    plt.show()
Example #5
def associations_example():
    """
    Plot an example of an associations heat-map of the Iris dataset features
    """
    iris = datasets.load_iris()
    X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    y = pd.DataFrame(data=iris.target, columns=['target'])
    df = pd.concat([X, y], axis=1)
    associations(df, nominal_columns=['target'])
Example #6
    def fit(self, data_original, data_synthetic):
        data_original, data_synthetic = self._check_input_data(data_original, data_synthetic)

        self.stats_original_ = associations(data_original, nom_nom_assoc=self.nom_nom_assoc,
                                            nominal_columns=self.nominal_columns, nan_replace_value='nan',
                                            compute_only=True)['corr']
        self.stats_synthetic_ = associations(data_synthetic, nom_nom_assoc=self.nom_nom_assoc,
                                             nominal_columns=self.nominal_columns, nan_replace_value='nan',
                                             compute_only=True)['corr']
        return self
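Once fitted, the two stored matrices can be compared directly; a hedged sketch of a follow-up score (the method name and formula are illustrative, not part of the class above):

import numpy as np

def score(self):
    # Frobenius norm of the difference between the association matrices
    # computed from the original and the synthetic data
    diff = self.stats_original_.values - self.stats_synthetic_.values
    return np.linalg.norm(diff)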
Example #7
def associations_example():
    """
    Plot an example of an associations heat-map of the Iris dataset features
    """
    iris = datasets.load_iris()

    # Convert int classes to strings so the associations method can automatically recognize categorical columns
    target = ['C{}'.format(i) for i in iris.target]

    X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    y = pd.DataFrame(data=target, columns=['target'])
    df = pd.concat([X, y], axis=1)
    associations(df)
Example #8
def main():

    # Data import -------------------------------------------------------------
    df = pd.read_csv("agaricus-lepiota.csv")

    panda_rules()
    # Construct barplot ===================================================
    make_barplot(df, "odor")
    make_barplot(df, "spore-print-color")

    # Make heatmap to show correlation between attribute
    associations(df, nominal_columns="all")  # Use Cramer V nominal association
    associations(df, nominal_columns="all",
                 theil_u=True)  # Use Theil's U nominal association

    # Construct Pie chart ===================================================
    make_pie_chart(df, "odor")

    # Show the number of mushrooms for each odor =============================
    count_value_for_attribute(df, "odor")

    # Find the number of instances in each class ==============================
    find_nb_instance_class(df)
    print("==============================")

    # Count the number of edible mush by odor ==============================
    count_edible_mush_by_odor(df)
    print("==============================")

    # Count the number of toxic mush by odor ==============================
    count_toxic_mush_by_odor(df)
    print("==============================")

    # Count the number of toxic mush by spore color ==============================
    count_toxic_mush_by_spore_color(df)
    print("==============================")

    # Count the number of edible mush by spore-color ==============================
    count_edible_mush_by_spore_color(df)
    print("==============================")

    # All mush with spore-color green are poisonous
    show_rules_two(df)
    print("==============================")

    # All mush with habitat = leaves and cap-color = white are poisonous
    show_rules_four(df)
    print("==============================")
    return
Example #9
def categorical_matrix(dataframe, theil_u=True, return_results=False):
    """Displays a kind of "correlation matrix" including categorical features."""

    # loading library
    from dython.nominal import associations

    # Get the categorical and boolean columns
    categorical_columns = list(
        dataframe.select_dtypes(include='category').columns)
    categorical_columns += list(
        dataframe.select_dtypes(include='bool').columns)

    # Drop NaN values to avoid errors
    df_for_correlations = dataframe.dropna()

    # Drop 'object', 'datetime' and 'timedelta' columns
    df_for_correlations = df_for_correlations.select_dtypes(
        exclude=['object', 'datetime', 'timedelta'])

    # Calculate associations and display the graph (returns None unless return_results=True)
    corr = associations(
        df_for_correlations,
        figsize=(15, 7),
        theil_u=theil_u,  # asymmetric measure of association for nominal features
        nominal_columns=categorical_columns,
        return_results=return_results)

    # Returns correlation matrix, if requested
    if return_results:
        return corr
Example #10
def categorical_matrix(dataframe):
    """Displays a kind of "correlation matrix" including categorical features."""

    # loading library
    from dython.nominal import associations, cluster_correlations

    # Get the categorical and boolean columns
    categorical_columns = list(
        dataframe.select_dtypes(include='category').columns)
    categorical_columns += list(
        dataframe.select_dtypes(include='bool').columns)

    # Drop NaN values to avoid errors
    df_for_correlations = dataframe.dropna()

    # Drop 'object', 'datetime' and 'timedelta' columns
    df_for_correlations = df_for_correlations.select_dtypes(
        exclude=['object', 'datetime', 'timedelta'])

    # Calculate associations without plotting
    assoc = associations(df_for_correlations.select_dtypes(
        exclude=['object', 'timedelta64[ns]', 'datetime64[ns]']),
                         nan_strategy='drop_samples',
                         figsize=(30, 30),
                         plot=False)

    # Cluster-sort the correlation matrix and return it
    correlations_matrix = assoc['corr']
    correlations_matrix, _ = cluster_correlations(correlations_matrix)
    return correlations_matrix
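A possible follow-up usage sketch, assuming the function above is called on a toy DataFrame with category/bool columns (data and heatmap styling are arbitrary):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.DataFrame({'flag': pd.Series([True, False, True, False]),
                   'grade': pd.Series(['a', 'b', 'a', 'c'], dtype='category')})

# Plot the cluster-sorted association matrix returned by categorical_matrix
corr = categorical_matrix(df)
sns.heatmap(corr, square=True, cmap='GnBu', vmax=1)
plt.title('Clustered association matrix')
plt.show()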
Example #11
def plot_correlation_difference(real: pd.DataFrame, fake: pd.DataFrame, plot_diff: bool = True, cat_cols: list = None, annot=False, model_dir_path="~"):
    """
    Plot the association matrices for the `real` dataframe and the `fake` dataframe, and plot the difference between them. Has support for continuous and categorical
    (e.g. Male, Female) data types. All object and category dtypes are considered to be categorical columns if `cat_cols` is not passed.

    - Continuous - Continuous: Uses Pearson's correlation coefficient
    - Continuous - Categorical: Uses the so-called correlation ratio (https://en.wikipedia.org/wiki/Correlation_ratio) for both continuous - categorical and categorical - continuous.
    - Categorical - Categorical: Uses Theil's U, an asymmetric correlation metric for Categorical associations

    :param real: DataFrame with real data
    :param fake: DataFrame with synthetic data
    :param plot_diff: Plot difference if True, else not
    :param cat_cols: List of Categorical columns
    :param boolean annot: Whether to annotate the plot with numbers indicating the associations.
    """
    assert isinstance(real, pd.DataFrame), '`real` parameter must be a Pandas DataFrame'
    assert isinstance(fake, pd.DataFrame), '`fake` parameter must be a Pandas DataFrame'
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    if cat_cols is None:
        # associations expects column names, not a sub-DataFrame
        cat_cols = real.select_dtypes(['object', 'category']).columns.tolist()
    if plot_diff:
        fig, ax = plt.subplots(1, 3, figsize=(24, 7))
    else:
        fig, ax = plt.subplots(1, 2, figsize=(20, 8))

    real_corr = associations(real, nominal_columns=cat_cols, plot=False, theil_u=True,
                             mark_columns=True, annot=annot, ax=ax[0], cmap=cmap)['corr']
    fake_corr = associations(fake, nominal_columns=cat_cols, plot=False, theil_u=True,
                             mark_columns=True, annot=annot, ax=ax[1], cmap=cmap)['corr']

    if plot_diff:
        diff = abs(real_corr - fake_corr)
        sns.set(style="white")
        sns.heatmap(diff, ax=ax[2], cmap=cmap, vmax=.3, square=True, annot=annot, center=0,
                    linewidths=.5, cbar_kws={"shrink": .5}, fmt='.2f')

    titles = ['Real', 'Synthetic', 'Difference'] if plot_diff else ['Real', 'Synthetic']
    for i, label in enumerate(titles):
        title_font = {'size': '18'}
        ax[i].set_title(label, **title_font)
    plt.tight_layout()
    plt.savefig(model_dir_path + "/correlation.jpg")
    plt.show()
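The three measures listed in the docstring are also exposed individually by dython.nominal; a minimal sketch on toy Series (the data is illustrative):

import pandas as pd
from dython.nominal import theils_u, correlation_ratio

gender = pd.Series(['M', 'F', 'F', 'M', 'F', 'M'])
smoker = pd.Series(['y', 'n', 'n', 'y', 'n', 'n'])
height = pd.Series([180.0, 165.0, 170.0, 178.0, 162.0, 175.0])

# Theil's U is asymmetric: U(x, y) generally differs from U(y, x)
print(theils_u(gender, smoker), theils_u(smoker, gender))

# Correlation ratio for a categorical-continuous pair
print(correlation_ratio(categories=gender, measurements=height))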
Example #12
def plot_correlation_comparison(evaluators: List, annot=False):
    """
    Plot the correlation differences of multiple TableEvaluator objects.

    :param evaluators: list of TableEvaluator objects
    :param boolean annot: Whether to annotate the plots with numbers.
    """
    nr_plots = len(evaluators) + 1
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    fig, ax = plt.subplots(2, nr_plots, figsize=(4 * nr_plots, 7))
    flat_ax = ax.flatten()
    flat_ax[nr_plots + 1].clear()
    fake_corr = []
    real_corr = associations(evaluators[0].real, nominal_columns=evaluators[0].categorical_columns, plot=False, theil_u=True,
                             mark_columns=True, annot=False, cmap=cmap, cbar=False, ax=flat_ax[0])['corr']
    for i in range(1, nr_plots):
        cbar = i % (nr_plots - 1) == 0
        fake_corr.append(
            associations(evaluators[i - 1].fake, nominal_columns=evaluators[0].categorical_columns, plot=False, theil_u=True,
                         mark_columns=True, annot=False, cmap=cmap, cbar=cbar, ax=flat_ax[i])['corr']
        )
        if i % (nr_plots - 1) == 0:
            cbar = flat_ax[i].collections[0].colorbar
            cbar.ax.tick_params(labelsize=20)

    for i in range(1, nr_plots):
        cbar = i % (nr_plots - 1) == 0
        diff = abs(real_corr - fake_corr[i - 1])
        sns.set(style="white")
        az = sns.heatmap(diff, ax=flat_ax[i + nr_plots], cmap=cmap, vmax=.3, square=True, annot=annot, center=0,
                         linewidths=0, cbar=cbar, fmt='.2f')
        if i % (nr_plots - 1) == 0:
            cbar = az.collections[0].colorbar
            cbar.ax.tick_params(labelsize=20)
    titles = ['Real'] + [e.name if e.name is not None else idx for idx, e in enumerate(evaluators)]
    for i, label in enumerate(titles):
        flat_ax[i].set_yticklabels([])
        flat_ax[i].set_xticklabels([])
        flat_ax[i + nr_plots].set_yticklabels([])
        flat_ax[i + nr_plots].set_xticklabels([])
        title_font = {'size': '28'}
        flat_ax[i].set_title(label, **title_font)
    plt.tight_layout()
Example #13
def associations(dataset,
                 nominal_columns=None,
                 mark_columns=False,
                 theil_u=False,
                 plot=True,
                 return_results=False,
                 **kwargs):
    """
    See 'associations' in the 'nominal' module.
    """
    return nominal.associations(dataset, nominal_columns, mark_columns,
                                theil_u, plot, return_results, **kwargs)
Example #14
def associations_iris_example():
    """
    Plot an example of an associations heat-map of the Iris dataset features.
    All features of this dataset are numerical (except for the target).
    """

    # Load data
    iris = datasets.load_iris()

    # Convert int classes to strings to allow associations method
    # to automatically recognize categorical columns
    target = ['C{}'.format(i) for i in iris.target]

    # Prepare data
    X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    y = pd.DataFrame(data=target, columns=['target'])
    df = pd.concat([X, y], axis=1)

    # Plot features associations
    return associations(df)
Example #15
def associations_mushrooms_example():
    """
    Plot an example of an associations heat-map of the UCI Mushrooms dataset features.
    All features of this dataset are categorical. This example will use Theil's U.
    """

    # Download and load data from UCI
    df = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
    )
    df.columns = [
        'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
        'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
        'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
        'stalk-surface-below-ring', 'stalk-color-above-ring',
        'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
        'ring-type', 'spore-print-color', 'population', 'habitat'
    ]

    # Plot features associations
    return associations(df, theil_u=True, figsize=(15, 15))
# %%
correlation_ratio(categories=t1['topic'], measurements=t1['score'])

# %%
t2 = pd.DataFrame({
    'topic': [
        'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Geometry',
        'Geometry', 'Geometry', 'Geometry', 'Statistics', 'Statistics',
        'Statistics', 'Statistics', 'Statistics', 'Statistics'
    ],
    'score': [36, 36, 36, 36, 36, 33, 33, 33, 33, 78, 78, 78, 78, 78, 78]
})

violinPlot(data=t2, varx='topic', vary='score', title='', xlab='', ylab='')

# %%
correlation_ratio(categories=t2['topic'], measurements=t2['score'])
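# Every score in t2 is constant within its topic, so the within-category
# variance is zero and the correlation ratio comes out exactly 1.0.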

# %%
# Calculate correlation coefficients for a Pandas dataframe regardless of column data types
ass = associations(df,
                   nom_nom_assoc='theil',
                   num_num_assoc='pearson',
                   figsize=(10, 10),
                   clustering=True)

# %%
ass['corr']

# %%
Example #17
plt.show()

#Relation between attributes
plt.figure(figsize=(14, 12))
foo = sns.heatmap(train.corr(), vmax=0.6, square=True, annot=True)
plt.show()
'''I found a new package! Although its tutorial showed its performance on
mixed categorical-numerical datasets, I use only the categorical columns because
the computation is so heavy!
'''
from dython.nominal import associations
train_categorical = train[[
    'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain',
    'app_category', 'device_id', 'device_ip', 'device_model'
]]
associations(train_categorical, theil_u=True, figsize=(30, 30))
'''
It seems that 'Unnamed: 0', 'click', 'hour', 'C14', 'C17' are highly correlated with the click rate.
In the next section we will plot their interconnections.
'''

#Pairplot
warnings.filterwarnings(action="ignore")
cols = ['Unnamed: 0', 'click', 'hour', 'C14', 'C17']
g = sns.pairplot(data=train, vars=cols, height=1.5, hue='click')  # 'size' was renamed to 'height' in newer seaborn
g.set(xticklabels=[])
plt.tight_layout()
plt.show()

#jointplot of correlated
sns.jointplot("C14",
def main():
    # Read csv file and prepare dataframe
    data = pd.read_csv("data/auto-mpg.data-original",
                       sep=r"\s+",
                       header=None,
                       names=[
                           "mpg", "cylinders", "displacement", "horsepower",
                           "weight", "acceleration", "model_year", "origin",
                           "car_name"
                       ])

    print(data.info())
    print(data.shape)

    # Find how many null entries there are
    print(data.isnull().sum())

    # Drop rows with missing values
    data = data.dropna()

    # Drop car_name feature since it has too many unique values
    # TODO: Create etl to map car names to model
    data = data.drop(["car_name"], axis=1)

    # Need to look at the target variable
    print(data.mpg.describe())

    # Print Distribution of the target variable
    sns.distplot(data.mpg)
    plt.savefig("plots/mpg_distribution.jpg")
    plt.show()

    # Plot all numerical features against each other
    sns.pairplot(data,
                 vars=["displacement", "horsepower", "weight", "acceleration"],
                 hue="cylinders")
    plt.savefig("plots/pair_plot_cylinders.jpg")
    sns.pairplot(data,
                 vars=["displacement", "horsepower", "weight", "acceleration"],
                 hue="origin")
    plt.savefig("plots/pair_plot_origin.jpg")
    sns.pairplot(data,
                 vars=["displacement", "horsepower", "weight", "acceleration"],
                 hue="model_year")
    plt.savefig("plots/pair_plot_model_year.jpg")
    plt.show()

    # Plot categorical values against the target variable
    sns.boxplot(x="cylinders", y="mpg", data=data)
    plt.axhline(data.mpg.mean(), color='r', linestyle='dashed', linewidth=2)
    plt.savefig("plots/box_plot_cylinders.jpg")
    plt.show()
    sns.boxplot(x="origin", y="mpg", data=data)
    plt.axhline(data.mpg.mean(), color='r', linestyle='dashed', linewidth=2)
    plt.savefig("plots/box_plot_origin.jpg")
    plt.show()
    sns.boxplot(x="model_year", y="mpg", data=data)
    plt.axhline(data.mpg.mean(), color='r', linestyle='dashed', linewidth=2)
    plt.savefig("plots/box_plot_model_year.jpg")
    plt.show()

    # Get correlation plot
    associations(data, figsize=(15, 15), cmap="viridis")
    plt.show()
Example #19
]

train['capital.loss'] = [
    float(re.sub(',', '.', aa)) for aa in train['capital.loss']
]
train['capital.gain'] = [
    float(re.sub(',', '.', aa)) for aa in train['capital.gain']
]

#=====================================
# Select the test set
#=====================================
competitors = os.listdir(res_folder + design + '/' + sub_design)

assocs = {}
assoc_test = associations(test.astype(dtype), nominal_columns = list(test.columns[cat_features]),\
                          plot = False)['corr']

#ax = axs[0], cbar = False, annot = False
assocs['Test'] = assoc_test

for c_idx, competitor in enumerate(competitors):
    try:
        preds = pd.read_csv(res_folder + design + '/' + sub_design +  '/' + competitor\
                           + '/preds' + str(filenum) + '.csv', sep = ',')
        if preds.shape[1] == 1:
            preds = pd.read_csv(res_folder + design + '/' + sub_design +  '/' + competitor\
                               + '/preds' + str(filenum) + '.csv', sep = ';')
        #if preds.shape[1] == p + 1:
        #preds = preds.iloc[:, 1:]
    except ParserError:
        preds = pd.read_csv(res_folder + design + '/' + sub_design +  '/' + competitor\
Example #20
def correlations(data):
    associations(data, figsize=(15, 15), cmap="viridis")
    plt.show()
Example #21
def associations_example():
    iris = datasets.load_iris()
    X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    y = pd.DataFrame(data=iris.target, columns=['target'])
    df = pd.concat([X, y], axis=1)
    associations(df, nominal_columns=['target'])
Example #22
cate_col = [
    'payment_plan', 'program_name', 'application_type_name', 'referrer',
    'gender', 'home_country', 'work_country', 'practice_type',
    'professional_assoc', 'home_state', 'work_state', 'Orientation'
]

nume_col = [
    "response", "hours_online", 'preprobation', 'currentafterpreprbation',
    "Unit 1", "Unit 2", "Unit 3", "Unit 4"
]

# Numerical Heatmap
#nominal.associations(x[nume_col])
# Total Heatmap
nominal.associations(x, nominal_columns=cate_col)

# Drop the unnecessary column
drop_column = [
    "Unit 1", "Unit 2", "Unit 3", "Unit 4", "status_id_binary", "user_id",
    "application_id", "gender", "application_type_name", "home_country",
    "home_state", "professional_assoc"
]
tmp = x.drop(columns=drop_column)
# One Hot-Encoding
cate_col_new = [c for c in cate_col if c not in drop_column]
x_train = pd.get_dummies(data=tmp, columns=cate_col_new)
columns = list(x.columns)

# Data Split
train_x, test_x, train_y, test_y = train_test_split(x_train,
Example #23
def associations_example():
    df = pd.DataFrame(data=data, columns=data.columns.values)
    associations(df, nominal_columns=data.columns.values)
Example #24
                    gow.append((true.astype(int) != imputed.astype(int)).mean())
            error.loc[method, 'gow'] = np.mean(gow)
        
        #error.T[['full']].T.to_csv(res + 'Run' + str(run_idx) + '/res' + dataset_name + '.csv', index = False)

#=============================
# Comparing associations structure
#=============================

import seaborn as sns
from dython.nominal import compute_associations, associations
from sklearn.metrics.pairwise import cosine_similarity

original_assoc = compute_associations(full_pima, nominal_columns = cat_features)

associations(full_pima, nominal_columns = cat_features)

Ez = out2['Ez.y']
vc = vars_contributions(completed_y2, Ez, assoc_thr = 0.0, \
                       title = 'Contribution of the variables to the latent dimensions',\
                       storage_path = None)

assoc = cosine_similarity(vc, dense_output=True)

labels = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'D.P. Function', 'Age', 'Outcome']

fig, axn = plt.subplots(1, 2, sharex=True, sharey=True, figsize = (12,10)) 
cbar_ax = fig.add_axes([.91, .3, .03, .4])

Example #25
s_full = cosine_similarity(vc_full, dense_output=True)

# Compare the representation between full and complete
idx = 0
fig, ax = plt.subplots(figsize = (4,4))
ax.scatter(s2[idx], s_full[idx])
plt.title(full_contra.columns[idx])

for i, txt in enumerate(full_contra.columns):
    ax.annotate(txt, (s2[idx][i], s_full[idx][i]))
ax.set_xlim([-1,1])
ax.set_ylim([-1,1])

# Compare the representation between completed cosine similarity and original associations
associations(complete_y.astype(float), nominal_columns = cat_features)
associations(completed_y.astype(float), nominal_columns = cat_features)
associations(full_contra.astype(float), nominal_columns = cat_features)

assoc = compute_associations(full_contra.astype(float), nominal_columns = cat_features).values     


idx = 0
fig, ax = plt.subplots(figsize = (4,4))
ax.scatter(assoc[idx], s_full[idx])
plt.title(full_contra.columns[idx])

for i, txt in enumerate(full_contra.columns):
    ax.annotate(txt, (assoc[idx][i], s_full[idx][i]))
ax.set_xlim([-1,1])
ax.set_ylim([-1,1])
Example #26
base_col = "accommodates"

for col in ["log_bedrooms", "log_bathrooms"]:
    new_col_name = f"{col}_per_{base_col}"
    df[new_col_name] = df[col] / df[base_col]
    num_attribs.append(new_col_name)

# ### Feature Elimination

# We can also examine the correlation heatmap, which would serve as guidance for further data elimination.

# In[970]:

fig, ax = plt.subplots(figsize=(30, 30))
associations(df[num_attribs], theil_u=True, ax=ax)
plt.xticks(rotation=45)
plt.suptitle("Initial Correlation Heatmap")
plt.savefig("../imgs/correlation_heatmap_before.png")

# Check the saved correlation heatmap [here](../imgs/correlation_heatmap_before.png).
#
# We now proceed to examine the highly correlated ones.

# In[971]:

review_col_names = [
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value'
]
Example #27
    def ml3(self):
        global df
        nominal.associations(df, nominal_columns=['Process', 'Library'], theil_u=True)
        return
Example #28
def correlations(data):
    associations(data, figsize=(15, 15), cmap="viridis")
    plt.savefig("plots/correlations.png")
    plt.show()
Example #29
def exploratory_data_analysis(data,
                              categoricals,
                              numericals,
                              plot_with_target=False,
                              plot_corr_mat=False,
                              save=False):

    if plot_corr_mat:
        associations(
            data[["tenure", "MonthlyCharges", "TotalCharges", "Churn"]])

    for cat in categoricals:
        sns.countplot(x=cat, data=data)
        plt.title("Distribution of " + str(cat))
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.xlabel(None)
        if save:
            plt.savefig(fname=str(cat) + "_count.png")
        plt.show()

        if plot_with_target:
            if cat != 'Churn':
                splot = sns.countplot(x=cat, data=data, hue='Churn')
                plt.title("Distribution of " + str(cat) +
                          " dependent to target variable")
                plt.xticks(rotation=90)
                plt.tight_layout()
                plt.xlabel(None)
                for p in splot.patches:
                    height = p.get_height()
                    splot.text(
                        p.get_x() + p.get_width() / 2.,
                        height + 3,
                        '{:1.2f}'.format(height / float(len(data)) * 100) +
                        "%",
                        ha="center")
                if save:
                    plt.savefig(fname=str(cat) + "_count_target.png")
                plt.show()

    for num in numericals:

        sns.distplot(data[num])
        plt.tight_layout()
        plt.title("Distribution of " + str(num))
        plt.xlabel(None)
        if save:
            plt.savefig(fname=str(num) + "_dist.png")
        plt.show()

        if plot_with_target:
            data.groupby("Churn")[num].apply(
                lambda x: sns.distplot(x, label=x.name))
            plt.tight_layout()
            plt.title("Distribution of " + str(num) +
                      " dependent to target variable")
            plt.xlabel(None)
            plt.legend()
            if save:
                plt.savefig(fname=str(num) + "_dist_target.png")
            plt.show()

        sns.boxplot(x=num, data=data)
        plt.tight_layout()
        plt.title(str(num) + " boxplot")
        plt.xlabel(None)
        if save:
            plt.savefig(fname=str(num) + "_box.png")
        plt.show()
    df_pvals[column1] = p_list

# In[66]:

df_p.index = df_p.columns
p_vals = df_p.values
print(p_vals[p_vals > .01])
print(np.nonzero(p_vals > .01))

# Running the $\chi^2$ test on all pairs of variables returns only four unique pairs that fail to reject the null hypothesis on a selected $\alpha = 0.01$, the null hypothesis being that the two selected variables are independent of each other. I opted for a lower value of $\alpha$ because I believed the relatively large sample size I'm using would otherwise make it easier to detect false positives. The pairs that fail to reject the null hypothesis ((7, 20), (7, 21), (7, 22), and (9, 16)) correspond to (`flushot`, `any_exercise`), (`flushot`, `blindness`), (`flushot`, `trouble_concentrating`), and (`sex`, `kidney_disease`). Ultimately, I am most interested in the significance of values related to the target variable, `diabetes`, and all variables related to `diabetes` successfully reject the null hypothesis.
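# As a hedged illustration of one such pairwise test (the column pair comes from the discussion above, and `df_diabetes` is assumed to hold the categorical survey data), `scipy.stats.chi2_contingency` can be run on a contingency table:

# In[ ]:

from scipy.stats import chi2_contingency

contingency = pd.crosstab(df_diabetes['flushot'], df_diabetes['any_exercise'])
chi2, p_value, dof, expected = chi2_contingency(contingency)
print(chi2, p_value)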
#
# An additional test I can run after computing the $\chi^2$ statistics and their p-values is Cramér's V, which uses the value of $\chi^2$ to compute the strength of association between two variables. This measure of association ranges from 0 to 1 and serves a similar purpose to measuring the correlation between two continuous variables, so it seems like a potentially useful tool to use on data that is entirely categorical (whether ordinal or nominal). Again, I am most interested in the strength of association between `diabetes` and all other variables, but I have also computed Cramér's V for all other pairs, since it may be useful to know how strongly associated two features are with each other when it comes time to build the machine learning model.
#
# The bias correction operation is included in the computation of Cramér's V.
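# A minimal sketch of a single bias-corrected pairwise computation with dython's `cramers_v` (the column pair is illustrative):

# In[ ]:

from dython.nominal import cramers_v

# bias_correction=True applies the Bergsma-Wicher correction
print(cramers_v(df_diabetes['diabetes'], df_diabetes['general_health'], bias_correction=True))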

# In[4]:

from dython.nominal import associations

associations(df_diabetes,
             figsize=(15, 15))  #Bias correction set to True by default
plt.show()

# Although most pairs of features were found to likely have some association by their low p-values on the $\chi^2$ test, very few pairs of features have a strength of association higher than $0.25$. Perhaps the large sample size made it possible to be reasonably confident that even weak associations were statistically significant (recall that this is why I selected a very low $\alpha$). Three of the four pairs of features that failed to reject the null hypothesis on the chi-squared test were also found to have a Cramér's V of $0.00$. The fourth, `flushot` and `blindness`, has a Cramér's V of $0.01$.
#
# On `diabetes`, the strongest association, $0.30$, is to `general_health`, and the lowest, $0.02$, is to `mscode`. The next-highest associations to `diabetes` are to `bmi_category` at $0.23$, `employed` at $0.20$, `income` at $0.18$, and `age_category` at $0.17$.
#
# Some pairs of features, such as `mental_health_days_per_month` and `depressive_disorder`, have a moderate degree of association to each other. However, no pair of features has an association higher than $0.44$, so at this point it does not seem especially likely that any of the predictive variables will be too strongly associated with each other; all of the features recommended by the scientific literature on diabetes will therefore likely be retained by the model.

# In[ ]: