def test_binary_binary():
    """
    Runs every measure listed by ``BinaryTable.measures()`` through
    ``binary_binary`` on a fixed sample of 0/1 pairs and prints each result.

    :return: None.
    """
    # Each entry is ((a value, b value), number of repetitions in the sample).
    cell_counts = (
        ((1, 1), 207),
        ((1, 0), 282),
        ((0, 1), 231),
        ((0, 0), 242),
    )
    pairs = [pair for pair, count in cell_counts for _ in range(count)]

    # Split the pairs into the two parallel lists the measure expects.
    a, b = (list(column) for column in zip(*pairs))

    for measure in BinaryTable.measures():
        result = binary_binary(a, b, measure)
        print(f'{result}: {measure}')
from pypair.association import binary_binary
from pypair.contingency import BinaryTable


def get_data(x, y, n):
    """Return a list containing the pair ``(x, y)`` repeated ``n`` times."""
    return [(x, y)] * n


# Fixed sample: four kinds of 0/1 pairs, concatenated in this order.
data = [
    *get_data(1, 1, 207),
    *get_data(1, 0, 282),
    *get_data(0, 1, 231),
    *get_data(0, 0, 242),
]
a = [first for first, _ in data]
b = [second for _, second in data]

# Pass 1: compute each measure through the functional entry point.
for measure in BinaryTable.measures():
    value = binary_binary(a, b, measure)
    print(f'{value}: {measure}')

print('-' * 15)

# Pass 2: build the table once and look each measure up on it.
table = BinaryTable(a, b)
for measure in table.measures():
    value = table.get(measure)
    print(f'{value}: {measure}')
from random import randint

import pandas as pd

from pypair.association import binary_binary
from pypair.util import corr


def get_data(n_rows=1000, n_cols=5):
    """
    Build a DataFrame filled with random 0/1 integers.

    :param n_rows: Number of rows to generate.
    :param n_cols: Number of columns to generate; they are named ``x0``,
        ``x1``, ... in order.
    :return: DataFrame with ``n_rows`` rows and ``n_cols`` columns.
    """
    column_names = [f'x{idx}' for idx in range(n_cols)]
    rows = [
        tuple(randint(0, 1) for _ in range(n_cols))
        for _ in range(n_rows)
    ]
    return pd.DataFrame(rows, columns=column_names)


if __name__ == '__main__':
    def jaccard(a, b):
        """Apply ``binary_binary`` with the 'jaccard' measure."""
        return binary_binary(a, b, measure='jaccard')

    def tanimoto(a, b):
        """Apply ``binary_binary`` with the 'tanimoto_i' measure."""
        return binary_binary(a, b, measure='tanimoto_i')

    df = get_data()

    # Compute both results before printing anything.
    jaccard_corr = corr(df, jaccard)
    tanimoto_corr = corr(df, tanimoto)

    print(jaccard_corr, '-' * 15, tanimoto_corr, sep='\n')
def plot_colinearity_variations(df):
    """
    Draw a 3x3 grid of heatmaps, one per pairwise column-association method.

    ``df`` is first passed through ``pd_scale_norm_df``. Each panel then shows
    ``DataFrame.corr`` of that result under a different ``method``: Pearson,
    Spearman, ``histogram_intersection``, and six ``binary_binary`` measures.

    :param df: Input frame handed to ``pd_scale_norm_df``.
    :return: The figure created by ``plt.figure`` holding the nine heatmaps.
    """
    from pypair.association import binary_binary

    def pairwise(measure):
        """Wrap a ``binary_binary`` measure as a two-argument callable for ``corr``."""
        def compute(a, b):
            return binary_binary(a, b, measure=measure)
        return compute

    # (corr method, panel title) in subplot order, left-to-right, top-to-bottom.
    panels = [
        ("pearson", "collinearity, Pearson similarity measure: "),
        ("spearman", "Spearman similarity measure: "),
        (
            histogram_intersection,
            "collinearity, Histogram Intersection similarity measure: ",
        ),
        (pairwise("jaccard"), "Jaccard similarity measure: "),
        (
            pairwise("tanimoto_i"),
            "Tanimoto similarity measure (Jaccard Index): ",
        ),
        # Typically used to judge the similarity between two clusters.
        # NOTE(review): "ochia_i" is kept exactly as the original passed it;
        # confirm it matches the measure name the library expects.
        (
            pairwise("ochia_i"),
            "Ochiai similarity measure (cosine similarity): ",
        ),
        # Yule's Q is based off of the odds ratio (cross-product ratio), a
        # measure of proportional reduction in error (PRE). The title used to
        # carry a copy-pasted "(cosine similarity)" suffix, which was wrong.
        (pairwise("yule_q"), "Yule's Q measure: "),
        # A higher mutual information value implies stronger association.
        (pairwise("mutual_information"), "Mutual Information: "),
        # Tetrachoric correlation ranges over [-1, 1]: 0 indicates no
        # agreement, 1 perfect agreement and -1 perfect disagreement.
        (pairwise("tetrachoric"), "tetrachoric correlation: "),
    ]

    train_df = pd_scale_norm_df(df)

    # Identify collinearity between columns, one heatmap per method.
    fig = plt.figure(figsize=(20, 15))
    for position, (method, title) in enumerate(panels, start=1):
        corr_df = train_df.corr(method=method)
        ax = fig.add_subplot(3, 3, position)
        ax.title.set_text(title)
        sns.heatmap(corr_df, ax=ax, cmap="RdYlBu")

    fig.tight_layout()
    return fig
def compute(a, b, df):
    """
    Compute the 'jaccard' measure between two entries of ``df``.

    :param a: Key used to select the first series from ``df``.
    :param b: Key used to select the second series from ``df``.
    :param df: Container indexed by ``a`` and ``b`` (presumably a pandas
        DataFrame; verify against the caller).
    :return: Tuple of the label ``'<a>_<b>'`` and the value returned by
        ``binary_binary`` with ``measure='jaccard'``.
    """
    first, second = df[a], df[b]
    return f'{a}_{b}', binary_binary(first, second, measure='jaccard')