Пример #1
0
def test_binary_binary():
    """
    Smoke-tests every binary-binary association measure.

    Builds two paired binary sequences from fixed counts for each (x, y)
    combination and prints the value returned by ``binary_binary`` for every
    measure listed by ``BinaryTable.measures()``.

    :return: None.
    """
    # (x, y, number of repetitions); order matters for the generated sequences.
    cells = [(1, 1, 207), (1, 0, 282), (0, 1, 231), (0, 0, 242)]
    pairs = [(x, y) for x, y, count in cells for _ in range(count)]
    a = [first for first, _ in pairs]
    b = [second for _, second in pairs]

    for measure in BinaryTable.measures():
        result = binary_binary(a, b, measure)
        print(f'{result}: {measure}')
Пример #2
0
from pypair.association import binary_binary
from pypair.contingency import BinaryTable

def get_data(x, y, n):
    """Return a list holding the pair ``(x, y)`` repeated ``n`` times."""
    return [(x, y)] * n


# Paired binary observations built from fixed counts per (x, y) combination.
data = [
    *get_data(1, 1, 207),
    *get_data(1, 0, 282),
    *get_data(0, 1, 231),
    *get_data(0, 0, 242),
]
a = [first for first, _ in data]
b = [second for _, second in data]

# Functional API: one call per measure name.
for m in BinaryTable.measures():
    print(f'{binary_binary(a, b, m)}: {m}')

print('-' * 15)

# Table API: build the table once, then look each measure up on it.
table = BinaryTable(a, b)
for m in table.measures():
    print(f'{table.get(m)}: {m}')
Пример #3
0
from random import randint

import pandas as pd

from pypair.association import binary_binary
from pypair.util import corr


def get_data(n_rows=1000, n_cols=5):
    """
    Builds a data frame of random binary values.

    :param n_rows: Number of rows to generate.
    :param n_cols: Number of columns to generate; columns are named
        ``x0``, ``x1``, ... in order.
    :return: ``pandas.DataFrame`` whose cells are drawn with ``randint(0, 1)``.
    """
    def random_row():
        # One row: n_cols independent draws, taken left to right.
        return tuple(randint(0, 1) for _ in range(n_cols))

    rows = [random_row() for _ in range(n_rows)]
    column_names = [f'x{idx}' for idx in range(n_cols)]
    return pd.DataFrame(rows, columns=column_names)


if __name__ == '__main__':
    # Two-argument callables in the form that ``corr`` is given below.
    def jaccard(a, b):
        return binary_binary(a, b, measure='jaccard')

    def tanimoto(a, b):
        return binary_binary(a, b, measure='tanimoto_i')

    df = get_data()
    jaccard_corr = corr(df, jaccard)
    tanimoto_corr = corr(df, tanimoto)

    # Both results, separated by a dashed rule, one item per line.
    print(jaccard_corr, '-' * 15, tanimoto_corr, sep='\n')
Пример #4
0
def plot_colinearity_variations(df):
    """
    Plots a 3x3 grid of heatmaps, one per pairwise column-association measure.

    The frame is first passed through ``pd_scale_norm_df``. ``DataFrame.corr``
    is then evaluated once per measure (Pearson, Spearman, histogram
    intersection and six ``pypair`` binary-binary measures) and each resulting
    matrix is drawn with ``sns.heatmap`` on its own subplot.

    :param df: Frame handed to ``pd_scale_norm_df``. The binary-binary
        measures presumably expect binary-valued columns -- TODO confirm.
    :return: The figure created by ``plt.figure`` holding the nine heatmaps.
    """
    from pypair.association import binary_binary

    def binary_measure(name):
        """Return a two-argument callable evaluating the named pypair measure."""
        def association(a, b):
            return binary_binary(a, b, measure=name)
        return association

    train_df = pd_scale_norm_df(df)
    # Identify collinearity between columns.
    fig = plt.figure(figsize=(20, 15))

    # (method accepted by DataFrame.corr, subplot title), in subplot order.
    panels = [
        ("pearson", "collinearity, Pearson similarity measure: "),
        ("spearman", "Spearman similarity measure: "),
        (histogram_intersection,
         "collinearity, Histogram Intersection similarity measure: "),
        (binary_measure("jaccard"), "Jaccard similarity measure: "),
        (binary_measure("tanimoto_i"),
         "Tanimoto similarity measure (Jaccard Index): "),
        # This measure is typically used to judge the similarity between
        # two clusters.
        (binary_measure("ochia_i"),
         "Ochiai similarity measure (cosine similarity): "),
        # Yule's Q is based off of the odds ratio or cross-product ratio, a
        # measure of proportional reduction in error (PRE). The previous title
        # labelled it "cosine similarity", copied from the Ochiai panel.
        (binary_measure("yule_q"), "Yule's Q measure (odds-ratio based): "),
        # A higher mutual information value implies strong association.
        (binary_measure("mutual_information"), "Mutual Information: "),
        # Tetrachoric correlation ranges over [-1, 1], where 0 indicates no
        # agreement, 1 perfect agreement and -1 perfect disagreement.
        (binary_measure("tetrachoric"), "tetrachoric correlation: "),
    ]

    for position, (method, title) in enumerate(panels, start=1):
        corr_df = train_df.corr(method=method)
        ax = fig.add_subplot(3, 3, position)
        ax.title.set_text(title)
        sns.heatmap(corr_df, ax=ax, cmap="RdYlBu")

    fig.tight_layout()
    return fig
Пример #5
0
def compute(a, b, df):
    """
    Computes the Jaccard binary-binary association between two columns.

    :param a: Key of the first column in ``df``.
    :param b: Key of the second column in ``df``.
    :param df: Container indexed by ``a`` and ``b`` (e.g. a data frame).
    :return: Tuple of the label ``'<a>_<b>'`` and the value returned by
        ``binary_binary`` with ``measure='jaccard'``.
    """
    columns = (df[a], df[b])
    label = f'{a}_{b}'
    return label, binary_binary(*columns, measure='jaccard')