示例#1
0
def test_engine_with_mapper_stl(gendf):
    """Smoke-test an Engine whose mapper is built on the builtin ``map``."""

    def serial_mapper(f, args):
        # Materialise the mapped results eagerly as a list.
        return list(map(f, args))

    eng = Engine(gendf(), n_models=4, mapper=serial_mapper)
    eng.init_models()
    eng.run(2)

    n_built = len(eng.models)
    assert n_built == 4
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    """Generate all-continuous data and run an Engine on it.

    Returns a ``(DataGenerator, Engine)`` pair; the engine has been
    initialised with ``n_models`` and run for ``n_iter`` iterations.
    """
    col_types = ['continuous'] * n_cols
    generator = DataGenerator(n_rows, col_types, cat_weights=n_cats,
                              cat_sep=cat_sep, seed=1337)

    fitted = Engine(generator.df, use_mp=False)
    fitted.init_models(n_models)
    fitted.run(n_iter)

    return generator, fitted
示例#3
0
def test_engine_with_mapper_stl(gendf):
    """Smoke-test an Engine whose mapper is built on the builtin ``map``."""

    def serial_mapper(f, args):
        # Materialise the mapped results eagerly as a list.
        return list(map(f, args))

    eng = Engine(gendf(), n_models=4, mapper=serial_mapper)
    eng.init_models()
    eng.run(2)

    n_built = len(eng.models)
    assert n_built == 4
示例#4
0
def test_engine_run_smoke_multiple(gendf):
    """Smoke-test calling ``run`` twice in a row on a ten-model engine."""
    eng = Engine(gendf(), n_models=10, use_mp=False)
    eng.init_models()

    # First with the default iteration count, then with an explicit one.
    eng.run()
    eng.run(10)

    n_built = len(eng.models)
    assert n_built == 10
示例#5
0
def test_engine_with_mapper_ipyparallel(gendf):
    """Smoke-test an Engine whose mapper is an ipyparallel view's ``map``.

    The client is closed in a ``finally`` clause so its connections are
    released even when the engine raises or the assertion fails.
    """
    client = ipp.Client()
    try:
        view = client[:]

        engine = Engine(gendf(), n_models=4, mapper=view.map)
        engine.init_models()
        engine.run(2)

        assert len(engine.models) == 4
    finally:
        client.close()
示例#6
0
def test_engine_with_mapper_mp(gendf):
    """Smoke-test an Engine whose mapper is a multiprocessing ``Pool.map``.

    Exactly one pool is created, and the ``with`` block owns it; no second,
    unmanaged pool is left behind with live worker processes.
    """
    with Pool() as pool:
        engine = Engine(gendf(), n_models=4, mapper=pool.map)
        engine.init_models()
        engine.run(2)

        assert len(engine.models) == 4
示例#7
0
def test_engine_with_mapper_ipyparallel(gendf):
    """Smoke-test an Engine whose mapper is an ipyparallel view's ``map``.

    The client is closed in a ``finally`` clause so its connections are
    released even when the engine raises or the assertion fails.
    """
    client = ipp.Client()
    try:
        view = client[:]

        engine = Engine(gendf(), n_models=4, mapper=view.map)
        engine.init_models()
        engine.run(2)

        assert len(engine.models) == 4
    finally:
        client.close()
示例#8
0
def test_engine_run_smoke_multiple(gendf):
    """Smoke-test calling ``run`` twice in a row on a ten-model engine."""
    eng = Engine(gendf(), n_models=10, use_mp=False)
    eng.init_models()

    # First with the default iteration count, then with an explicit one.
    eng.run()
    eng.run(10)

    n_built = len(eng.models)
    assert n_built == 10
示例#9
0
def test_engine_with_mapper_mp(gendf):
    """Smoke-test an Engine whose mapper is a multiprocessing ``Pool.map``.

    Exactly one pool is created, and the ``with`` block owns it; no second,
    unmanaged pool is left behind with live worker processes.
    """
    with Pool() as pool:
        engine = Engine(gendf(), n_models=4, mapper=pool.map)
        engine.init_models()
        engine.run(2)

        assert len(engine.models) == 4
示例#10
0
def gen_comp_engines(df, subsample_size):
    """Build two comparable engines on ``df``: one default, one subsampled.

    Both use 8 models without multiprocessing and are run for 100
    iterations. Returns ``(engine_full, engine_mod)``.
    """

    def _fit(**init_kwargs):
        # Construct, initialise and run one engine with shared settings.
        eng = Engine(df, n_models=8, use_mp=False)
        eng.init_models(**init_kwargs)
        eng.run(100)
        return eng

    full = _fit()
    subsampled = _fit(subsample_size=subsample_size)
    return full, subsampled
示例#11
0
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    """Generate all-continuous data and run an Engine on it.

    Returns a ``(DataGenerator, Engine)`` pair; the engine has been
    initialised with ``n_models`` and run for ``n_iter`` iterations.
    """
    col_types = ['continuous'] * n_cols
    generator = DataGenerator(n_rows, col_types, cat_weights=n_cats,
                              cat_sep=cat_sep, seed=1337)

    fitted = Engine(generator.df, use_mp=False)
    fitted.init_models(n_models)
    fitted.run(n_iter)

    return generator, fitted
示例#12
0
def gen_comp_engines(df, subsample_size):
    """Build two comparable engines on ``df``: one default, one subsampled.

    Both use 8 models without multiprocessing and are run for 100
    iterations. Returns ``(engine_full, engine_mod)``.
    """

    def _fit(**init_kwargs):
        # Construct, initialise and run one engine with shared settings.
        eng = Engine(df, n_models=8, use_mp=False)
        eng.init_models(**init_kwargs)
        eng.run(100)
        return eng

    full = _fit()
    subsampled = _fit(subsample_size=subsample_size)
    return full, subsampled
示例#13
0
def run(n_models=10, n_iter=200, iter_step=10, n_needles=2, n_distractors=8,
        n_rows=100, pairtype=None, pair_kws=None):
    """Plot dependence probability over inference for needle/distractor pairs.

    Data comes from ``_gen_data``; an Engine with ``n_models`` models is run
    in chunks of ``iter_step`` iterations. After each chunk the dependence
    probability is recorded for every needle pair (columns ``2*i`` and
    ``2*i+1``) and for up to 32 randomly sampled (needle column, distractor
    column) pairs. The traces are plotted, followed by a
    dependence-probability heatmap. Nothing is returned.
    """

    # Needle pairs are adjacent columns (0, 1), (2, 3), ...
    needle_idxs = [(2*i, 2*i+1,) for i in range(n_needles)]
    needle_cols = list(range(n_needles*2))
    # Distractor columns follow directly after the needle columns.
    distractor_cols = list(range(n_needles*2, n_needles*2+n_distractors))
    combs = list(it.product(needle_cols, distractor_cols))
    # Cap the number of tracked needle/distractor pairs at 32.
    distractor_idxs = random.sample(combs, min(len(combs), 32))

    df = _gen_data(n_needles, n_distractors, n_rows, pairtype, pair_kws)

    engine = Engine(df, n_models=n_models)
    engine.init_models()
    # for model in engine._models:
    #     # XXX: emulates the log grid expected alpha
    #     # e.g. mean(exp(linspace(log(1/n_rows), log(rows))))
    #     # model['state_alpha'] = .5*(n_needles*2. + n_distractors)
    #     model['state_alpha'] = 100.

    # no column_alpha transition
    tlist = [b'row_assignment', b'column_assignment', b'row_alpha',
             b'column_hypers']

    n_steps = int(n_iter/iter_step)
    # One row per tracked pair, one column per measurement (n_steps + 1).
    needle_dps = np.zeros((n_needles, n_steps+1,))
    distractor_dps = np.zeros((len(distractor_idxs), n_steps+1,))
    for i in range(n_steps+1):
        # The engine is advanced before every measurement, including i == 0.
        engine.run(iter_step, trans_kwargs={'transition_list': tlist})
        # engine.run(iter_step)

        for nidx, (a, b) in enumerate(needle_idxs):
            # Translate positional indices into column labels.
            a = df.columns[a]
            b = df.columns[b]
            needle_dps[nidx, i] = engine.dependence_probability(a, b)

        for didx, (a, b) in enumerate(distractor_idxs):
            a = df.columns[a]
            b = df.columns[b]
            distractor_dps[didx, i] = engine.dependence_probability(a, b)

    # NOTE(review): x values are 1, 1+iter_step, ..., but measurement i is
    # taken after (i+1)*iter_step iterations -- confirm this is intended.
    iter_count = np.cumsum([1]+[iter_step]*n_steps)

    for y in distractor_dps:
        plt.plot(iter_count, y, color='gray', alpha=.3)

    for y in needle_dps:
        plt.plot(iter_count, y, color='crimson')

    # plt.gca().set_xscale('log')
    plt.ylim([-.05, 1.05])
    plt.xlim([1, iter_count[-1]])
    plt.show()

    engine.heatmap('dependence_probability')
    plt.show()
示例#14
0
def test_view_alpha_should_change_if_transition(gendf):
    """Check that ``view_alphas`` differs after ten default iterations.

    NOTE(review): the "before" value is read without copying; if the engine
    updates that object in place the comparison is between one object and
    itself -- confirm against the Engine implementation.
    """
    eng = Engine(gendf(), n_models=1, use_mp=False)
    eng.init_models()

    before = eng._models[0]['view_alphas']
    eng.run(10)
    after = eng._models[0]['view_alphas']

    assert before != after
示例#15
0
def test_view_alpha_should_change_if_transition(gendf):
    """Check that ``view_alphas`` differs after ten default iterations.

    NOTE(review): the "before" value is read without copying; if the engine
    updates that object in place the comparison is between one object and
    itself -- confirm against the Engine implementation.
    """
    eng = Engine(gendf(), n_models=1, use_mp=False)
    eng.init_models()

    before = eng._models[0]['view_alphas']
    eng.run(10)
    after = eng._models[0]['view_alphas']

    assert before != after
示例#16
0
def engine():
    """Return an Engine fitted on a 12-column random frame.

    Columns ``c_0`` and ``c_1`` are a normal sample and that sample shifted
    by 1.0; the other ten columns are independent uniform draws. The engine
    is initialised with 8 models and run for 20 iterations.
    """
    base = np.random.randn(30)

    series = [pd.Series(base), pd.Series(base + 1.0)]
    series.extend(pd.Series(np.random.rand(30)) for _ in range(10))

    frame = pd.concat(series, axis=1)
    frame.columns = ["c_%d" % idx for idx in range(12)]

    fitted = Engine(frame)
    fitted.init_models(8)
    fitted.run(20)

    return fitted
示例#17
0
def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype="continuous", ax=None):
    """Compare engine mutual-information estimates with true values.

    For each correlation in a fixed list, ``n_times`` data sets of ``n`` rows
    are drawn, a one-model Engine is run for ``n_iter`` iterations on each,
    and the un-normed mutual information between ``x_1`` and ``x_2`` is
    recorded.

    Returns ``(mis, true_mis)`` when ``ax`` is None. Otherwise the results
    are drawn on ``ax`` and nothing is returned.

    Raises ``ValueError`` when ``vartype`` is neither ``"categorical"`` nor
    ``"continuous"``.
    """
    rhos = [0.1, 0.25, 0.4, 0.5, 0.75, 0.9]
    n_rhos = len(rhos)

    true_mis = np.zeros(n_rhos)
    mis = np.zeros((n_times, n_rhos))

    for rho_idx, rho in enumerate(rhos):
        print("Rho: %1.1f" % (rho,))

        if vartype == "categorical":
            p, true_mi = _gen_categorical_joint_dist(rho, n_grid)
            metadata = {
                name: {"dtype": "categorical", "values": list(range(n_grid))}
                for name in ("x_1", "x_2")
            }
        elif vartype == "continuous":
            true_mi = -0.5 * log(1.0 - rho ** 2.0)
            metadata = {}
        else:
            raise ValueError("invalid vartype")

        for rep in range(n_times):
            # vartype was validated above, so only two cases remain here.
            if vartype == "categorical":
                x = _sample_from_bivariate_discrete(p, n)
            else:
                cov = np.array([[1, rho], [rho, 1]])
                mean = np.zeros(2)
                x = np.random.multivariate_normal(mean, cov, size=n)

            frame = pd.DataFrame(x, columns=["x_1", "x_2"])

            eng = Engine(frame, n_models=1, metadata=metadata, use_mp=False)
            eng.init_models()
            eng.run(n_iter)

            true_mis[rho_idx] = true_mi
            mis[rep, rho_idx] = eng.mutual_information(
                "x_1", "x_2", n_samples=500, normed=False)

    if ax is None:
        return mis, true_mis

    estimate_mean = np.mean(mis, axis=0)
    estimate_std = np.std(mis, axis=0)
    ax.errorbar(rhos, y=estimate_mean, yerr=estimate_std, label="BaxCat")
    ax.plot(rhos, true_mis, label="True")

    ax.set_xlabel("rho")
    ax.set_ylabel("Mutual Information")
    ax.set_title(vartype)
    ax.legend(loc=0)
示例#18
0
def test_logp_scaling(df):
    """Check the peak of p(t | x=1, y=2) against a Normal(5, 0.5) density.

    The model density is evaluated on 200 grid points in [3, 7], plotted via
    ``inftest_plot``, and its maximum must be within 0.05 of the reference.
    """
    eng = Engine(df)
    eng.init_models(8)
    eng.run(500)

    grid = np.linspace(3, 7, 200)
    reference_pdf = norm.pdf(grid, loc=5., scale=.5)

    queries = grid[:, np.newaxis]
    conditions = [('x', 1), ('y', 2)]
    log_density = eng.probability(queries, ['t'], given=conditions)

    inftest_plot(grid, reference_pdf, np.exp(log_density), 'p_t-xy', RESDIR)

    peak_gap = abs(max(reference_pdf) - max(np.exp(log_density)))
    assert peak_gap < .05
def test_logp_scaling(df):
    """Check the peak of p(t | x=1, y=2) against a Normal(5, 0.5) density.

    The model density is evaluated on 200 grid points in [3, 7], plotted via
    ``inftest_plot``, and its maximum must be within 0.05 of the reference.
    """
    eng = Engine(df)
    eng.init_models(8)
    eng.run(500)

    grid = np.linspace(3, 7, 200)
    reference_pdf = norm.pdf(grid, loc=5., scale=.5)

    queries = grid[:, np.newaxis]
    conditions = [('x', 1), ('y', 2)]
    log_density = eng.probability(queries, ['t'], given=conditions)

    inftest_plot(grid, reference_pdf, np.exp(log_density), 'p_t-xy', RESDIR)

    peak_gap = abs(max(reference_pdf) - max(np.exp(log_density)))
    assert peak_gap < .05
示例#20
0
def test_view_alpha_should_not_change_if_no_transition(gendf):
    """Check ``view_alphas`` is unchanged under a restricted transition list.

    Only ``row_assignment`` and ``column_alpha`` are run.

    NOTE(review): the "before" value is read without copying; if the engine
    updates that object in place this equality holds trivially -- confirm.
    """
    eng = Engine(gendf(), n_models=1, use_mp=False)
    eng.init_models()

    before = eng._models[0]['view_alphas']

    transitions = [b'row_assignment', b'column_alpha']
    eng.run(10, trans_kwargs={'transition_list': transitions})

    after = eng._models[0]['view_alphas']

    assert before == after
示例#21
0
def test_view_alpha_should_not_change_if_no_transition(gendf):
    """Check ``view_alphas`` is unchanged under a restricted transition list.

    Only ``row_assignment`` and ``column_alpha`` are run.

    NOTE(review): the "before" value is read without copying; if the engine
    updates that object in place this equality holds trivially -- confirm.
    """
    eng = Engine(gendf(), n_models=1, use_mp=False)
    eng.init_models()

    before = eng._models[0]['view_alphas']

    transitions = [b'row_assignment', b'column_alpha']
    eng.run(10, trans_kwargs={'transition_list': transitions})

    after = eng._models[0]['view_alphas']

    assert before == after
示例#22
0
def gen_data_and_engine(n_rows, n_cols, n_cats, cat_sep, n_models, n_iter):
    """Generate all-categorical data and run an Engine on it.

    Every column index in ``range(n_cols)`` is given the same categorical
    metadata with values 0..4. Returns a ``(DataGenerator, Engine)`` pair.
    """
    col_types = ['categorical'] * n_cols
    generator = DataGenerator(n_rows, col_types, cat_weights=n_cats,
                              cat_sep=cat_sep, seed=1337)

    # One metadata dict object is shared by all columns, as before.
    shared_md = {'dtype': 'categorical', 'values': [0, 1, 2, 3, 4]}
    metadata = {col: shared_md for col in range(n_cols)}

    fitted = Engine(generator.df, metadata=metadata, use_mp=False)
    fitted.init_models(n_models)
    fitted.run(n_iter)

    return generator, fitted
示例#23
0
def engine():
    """Return an Engine fitted on a 12-column random frame.

    Columns ``c_0`` and ``c_1`` are a normal sample and that sample shifted
    by 1.0; the other ten columns are independent uniform draws. The engine
    is initialised with 8 models and run for 20 iterations.
    """
    base = np.random.randn(30)

    series = [pd.Series(base), pd.Series(base + 1.0)]
    series.extend(pd.Series(np.random.rand(30)) for _ in range(10))

    frame = pd.concat(series, axis=1)
    frame.columns = ['c_%d' % idx for idx in range(12)]

    fitted = Engine(frame)
    fitted.init_models(8)
    fitted.run(20)

    return fitted
示例#24
0
def onerun(shapefunc, n=250, n_iter=100, n_models=8, subsample_size=None):
    """Fit an Engine to points from ``shapefunc`` and sample from it.

    ``shapefunc(n)`` supplies the original x and y values. Returns
    ``(xo, yo, xe, ye)``: the originals followed by the two columns of ``n``
    samples drawn from the fitted engine.
    """
    xo, yo = shapefunc(n)

    frame = pd.concat([pd.Series(xo), pd.Series(yo)], axis=1)
    frame.columns = ['x', 'y']

    eng = Engine(frame, n_models=n_models, use_mp=True)
    eng.init_models(subsample_size=subsample_size)
    eng.run(n_iter)

    drawn = eng.sample(['x', 'y'], n=n)

    return xo, yo, drawn[:, 0], drawn[:, 1]
示例#25
0
def test_run_with_checkpoint_valid_diagnostic_output(gendf):
    """Check the diagnostic tables produced by ``run(10, checkpoint=5)``.

    Expects one table per model (5), three entries per table, and the keys
    ``log_score``, ``iters`` and ``time`` in every entry.
    """
    eng = Engine(gendf(), n_models=5, use_mp=False)
    eng.init_models()
    eng.run(10, checkpoint=5)

    diagnostics = eng._diagnostic_tables
    assert len(diagnostics) == 5

    required_keys = ('log_score', 'iters', 'time')
    for model_table in diagnostics:
        assert len(model_table) == 3
        for record in model_table:
            for key in required_keys:
                assert key in record
示例#26
0
def onerun(shapefunc, n=250, n_iter=100, n_models=8, subsample_size=None):
    """Fit an Engine to points from ``shapefunc`` and sample from it.

    ``shapefunc(n)`` supplies the original x and y values. Returns
    ``(xo, yo, xe, ye)``: the originals followed by the two columns of ``n``
    samples drawn from the fitted engine.
    """
    xo, yo = shapefunc(n)

    frame = pd.concat([pd.Series(xo), pd.Series(yo)], axis=1)
    frame.columns = ['x', 'y']

    eng = Engine(frame, n_models=n_models, use_mp=True)
    eng.init_models(subsample_size=subsample_size)
    eng.run(n_iter)

    drawn = eng.sample(['x', 'y'], n=n)

    return xo, yo, drawn[:, 0], drawn[:, 1]
示例#27
0
def test_run_on_model_subset_should_only_run_those_models(gendf):
    """Check that ``model_idxs`` limits a run to the selected models.

    After a full run and a second run restricted to models 1 and 2, only
    those two diagnostic tables are expected to have grown (5 entries
    instead of 3).
    """
    eng = Engine(gendf(), n_models=5, use_mp=False)
    eng.init_models()
    eng.run(10, checkpoint=5)
    eng.run(10, checkpoint=5, model_idxs=[1, 2])

    diagnostics = eng._diagnostic_tables
    assert len(diagnostics) == 5

    expected_lengths = [3, 5, 5, 3, 3]
    for model_idx, expected in enumerate(expected_lengths):
        assert len(diagnostics[model_idx]) == expected
示例#28
0
def test_run_with_checkpoint_valid_diagnostic_output(gendf):
    """Check the diagnostic tables produced by ``run(10, checkpoint=5)``.

    Expects one table per model (5), three entries per table, and the keys
    ``log_score``, ``iters`` and ``time`` in every entry.
    """
    eng = Engine(gendf(), n_models=5, use_mp=False)
    eng.init_models()
    eng.run(10, checkpoint=5)

    diagnostics = eng._diagnostic_tables
    assert len(diagnostics) == 5

    required_keys = ('log_score', 'iters', 'time')
    for model_table in diagnostics:
        assert len(model_table) == 3
        for record in model_table:
            for key in required_keys:
                assert key in record
示例#29
0
def test_run_on_model_subset_should_only_run_those_models(gendf):
    """Check that ``model_idxs`` limits a run to the selected models.

    After a full run and a second run restricted to models 1 and 2, only
    those two diagnostic tables are expected to have grown (5 entries
    instead of 3).
    """
    eng = Engine(gendf(), n_models=5, use_mp=False)
    eng.init_models()
    eng.run(10, checkpoint=5)
    eng.run(10, checkpoint=5, model_idxs=[1, 2])

    diagnostics = eng._diagnostic_tables
    assert len(diagnostics) == 5

    expected_lengths = [3, 5, 5, 3, 3]
    for model_idx, expected in enumerate(expected_lengths):
        assert len(diagnostics[model_idx]) == expected
示例#30
0
def test_dependence_probability():
    """Check that a shifted copy is judged more dependent than noise.

    ``c1`` is ``c0 + 1.0`` and ``c2`` is an independent uniform sample; the
    (c0, c1) dependence probability must exceed both pairs involving ``c2``.
    """
    base = np.random.randn(30)
    noise = np.random.rand(30)

    frame = pd.concat(
        [pd.Series(base), pd.Series(base + 1.0), pd.Series(noise)], axis=1)
    frame.columns = ['c0', 'c1', 'c2']

    eng = Engine(frame, n_models=20, use_mp=False)
    eng.init_models()
    eng.run(10)

    dp_linked = eng.dependence_probability('c0', 'c1')
    dp_c0_noise = eng.dependence_probability('c0', 'c2')
    dp_c1_noise = eng.dependence_probability('c1', 'c2')

    assert dp_linked > dp_c0_noise
    assert dp_linked > dp_c1_noise
示例#31
0
def test_dependence_probability():
    """Check that a shifted copy is judged more dependent than noise.

    ``c1`` is ``c0 + 1.0`` and ``c2`` is an independent uniform sample; the
    (c0, c1) dependence probability must exceed both pairs involving ``c2``.
    """
    base = np.random.randn(30)
    noise = np.random.rand(30)

    frame = pd.concat(
        [pd.Series(base), pd.Series(base + 1.0), pd.Series(noise)], axis=1)
    frame.columns = ['c0', 'c1', 'c2']

    eng = Engine(frame, n_models=20, use_mp=False)
    eng.init_models()
    eng.run(10)

    dp_linked = eng.dependence_probability('c0', 'c1')
    dp_c0_noise = eng.dependence_probability('c0', 'c2')
    dp_c1_noise = eng.dependence_probability('c1', 'c2')

    assert dp_linked > dp_c0_noise
    assert dp_linked > dp_c1_noise
示例#32
0
def test_pairwise_dependence_probability():
    """Check the pairwise dependence-probability table is well formed.

    The diagonal must be exactly 1 and the table must be symmetric. Cells
    are read positionally with ``.iloc``; the ``.ix`` indexer used before
    no longer exists in current pandas releases.
    """
    x = np.random.randn(30)

    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=10, use_mp=False)
    engine.init_models()
    engine.run(5)

    depprob = engine.pairwise_func('dependence_probability')

    # A column is always dependent on itself.
    for i in range(3):
        assert depprob.iloc[i, i] == 1.

    # Dependence probability does not depend on argument order.
    for i, j in [(0, 1), (0, 2), (1, 2)]:
        assert depprob.iloc[i, j] == depprob.iloc[j, i]
示例#33
0
def test_pairwise_dependence_probability():
    """Check the pairwise dependence-probability table is well formed.

    The diagonal must be exactly 1 and the table must be symmetric. Cells
    are read positionally with ``.iloc``; the ``.ix`` indexer used before
    no longer exists in current pandas releases.
    """
    x = np.random.randn(30)

    s1 = pd.Series(x)
    s2 = pd.Series(x + 1.0)
    s3 = pd.Series(np.random.rand(30))

    df = pd.concat([s1, s2, s3], axis=1)
    df.columns = ['c0', 'c1', 'c2']

    engine = Engine(df, n_models=10, use_mp=False)
    engine.init_models()
    engine.run(5)

    depprob = engine.pairwise_func('dependence_probability')

    # A column is always dependent on itself.
    for i in range(3):
        assert depprob.iloc[i, i] == 1.

    # Dependence probability does not depend on argument order.
    for i, j in [(0, 1), (0, 2), (1, 2)]:
        assert depprob.iloc[i, j] == depprob.iloc[j, i]
示例#34
0
def gen_engine_half(df):
    """Return a 4-model Engine initialised with ``subsample_size=0.5``.

    The engine is run for 10 iterations without multiprocessing.
    """
    half = Engine(df, n_models=4, use_mp=False)
    half.init_models(subsample_size=0.5)
    half.run(10)
    return half
示例#35
0
        fname = func.__name__
        s = pd.Series([fname]*n)
        df = pd.concat([s, pd.Series(xo), pd.Series(yo)], axis=1)
        df.columns = ['func', 'x', 'y']
        dfs.append(df)

        ax = axes[0, i]
        ax.scatter(xo, yo, color='crimson', alpha=.3)

        ax = axes[1, i]
        ax.scatter(xe, ye, color='gray', alpha=.3)
        ax.set_xlim(axes[0, i].get_xlim())
        ax.set_ylim(axes[0, i].get_ylim())

    df = pd.concat(dfs, ignore_index=True)
    engine = Engine(df, n_models=8)
    engine.init_models()
    engine.run(1000, checkpoint=20)

    dfs = []
    for i, func in enumerate(funcs):
        func_name = func.__name__
        x = engine.sample(['x', 'y'], given=[('func', func_name)], n=n)

        ax = axes[2, i]
        ax.scatter(x[:, 0], x[:, 1], color='navy', alpha=.3)
        ax.set_xlim(axes[0, i].get_xlim())
        ax.set_ylim(axes[0, i].get_ylim())
    plt.show()
示例#36
0
def gen_engine(df):
    """Return a 2-model Engine on ``df`` after 10 iterations (no mp)."""
    fitted = Engine(df, n_models=2, use_mp=False)
    fitted.init_models()
    fitted.run(10)
    return fitted
示例#37
0
def gen_engine_full(df):
    """Return a 4-model Engine on ``df`` with default model initialisation.

    The engine is run for 10 iterations without multiprocessing.
    """
    full = Engine(df, n_models=4, use_mp=False)
    full.init_models()
    full.run(10)
    return full
示例#38
0
        x = np.random.randn() * std + mu

        data.append([x, a, b])

    return pd.DataFrame(data)


# Script: fit an Engine to phenotype data padded with random integer columns,
# print the inverse conditional entropy of each column given 'T', then show
# the dependence-probability heatmap.

# Number of rows, and number of extra random columns appended to the data.
n_rows = 100
n_cols = 32

da = gen_phenotype_data(n_rows)
# Extra columns of random integers drawn from {0, 1, 2}.
db = pd.DataFrame(np.random.randint(3, size=(
    n_rows,
    n_cols,
)))
df = pd.concat([da, db], axis=1)

# The generated columns are named T, A, B; the random ones x_0 .. x_{n-1}.
df.columns = ['T', 'A', 'B'] + ['x_%d' % i for i in range(n_cols)]

engine = Engine(df, n_models=32)
engine.init_models()
engine.run(100)

# NOTE(review): a conditional entropy of exactly 0 would raise
# ZeroDivisionError below -- confirm whether that can occur.
for col in df.columns:
    if col != 'T':
        print("1/H(%s|T) = %f" %
              (col, 1 / engine.conditional_entropy(col, 'T')))

engine.heatmap('dependence_probability')
plt.show()
示例#39
0
        fname = func.__name__
        s = pd.Series([fname] * n)
        df = pd.concat([s, pd.Series(xo), pd.Series(yo)], axis=1)
        df.columns = ['func', 'x', 'y']
        dfs.append(df)

        ax = axes[0, i]
        ax.scatter(xo, yo, color='crimson', alpha=.3)

        ax = axes[1, i]
        ax.scatter(xe, ye, color='gray', alpha=.3)
        ax.set_xlim(axes[0, i].get_xlim())
        ax.set_ylim(axes[0, i].get_ylim())

    df = pd.concat(dfs, ignore_index=True)
    engine = Engine(df, n_models=8)
    engine.init_models()
    engine.run(1000, checkpoint=20)

    dfs = []
    for i, func in enumerate(funcs):
        func_name = func.__name__
        x = engine.sample(['x', 'y'], given=[('func', func_name)], n=n)

        ax = axes[2, i]
        ax.scatter(x[:, 0], x[:, 1], color='navy', alpha=.3)
        ax.set_xlim(axes[0, i].get_xlim())
        ax.set_ylim(axes[0, i].get_ylim())
    plt.show()
示例#40
0
def run(n_times=5, n_grid=5, n=200, n_iter=200, vartype='continuous', ax=None):
    """Compare engine mutual-information estimates with true values.

    For each correlation in a fixed list, ``n_times`` data sets of ``n`` rows
    are drawn, a one-model Engine is run for ``n_iter`` iterations on each,
    and the un-normed mutual information between ``x_1`` and ``x_2`` is
    recorded.

    Returns ``(mis, true_mis)`` when ``ax`` is None. Otherwise the results
    are drawn on ``ax`` and nothing is returned.

    Raises ``ValueError`` when ``vartype`` is neither ``'categorical'`` nor
    ``'continuous'``.
    """
    rhos = [.1, .25, .4, .5, .75, .9]
    n_rhos = len(rhos)

    true_mis = np.zeros(n_rhos)
    mis = np.zeros((n_times, n_rhos))

    for rho_idx, rho in enumerate(rhos):
        print('Rho: %1.1f' % (rho, ))

        if vartype == 'categorical':
            p, true_mi = _gen_categorical_joint_dist(rho, n_grid)
            metadata = {
                name: {'dtype': 'categorical', 'values': list(range(n_grid))}
                for name in ('x_1', 'x_2')
            }
        elif vartype == 'continuous':
            true_mi = -.5 * log(1. - rho**2.)
            metadata = {}
        else:
            raise ValueError('invalid vartype')

        for rep in range(n_times):
            # vartype was validated above, so only two cases remain here.
            if vartype == 'categorical':
                x = _sample_from_bivariate_discrete(p, n)
            else:
                cov = np.array([[1, rho], [rho, 1]])
                mean = np.zeros(2)
                x = np.random.multivariate_normal(mean, cov, size=n)

            frame = pd.DataFrame(x, columns=['x_1', 'x_2'])

            eng = Engine(frame, n_models=1, metadata=metadata, use_mp=False)
            eng.init_models()
            eng.run(n_iter)

            true_mis[rho_idx] = true_mi
            mis[rep, rho_idx] = eng.mutual_information(
                'x_1', 'x_2', n_samples=500, normed=False)

    if ax is None:
        return mis, true_mis

    estimate_mean = np.mean(mis, axis=0)
    estimate_std = np.std(mis, axis=0)
    ax.errorbar(rhos, y=estimate_mean, yerr=estimate_std, label='BaxCat')
    ax.plot(rhos, true_mis, label='True')

    ax.set_xlabel('rho')
    ax.set_ylabel('Mutual Information')
    ax.set_title(vartype)
    ax.legend(loc=0)
示例#41
0
def gen_engine(df):
    """Return a 2-model Engine on ``df`` after 10 iterations (no mp)."""
    fitted = Engine(df, n_models=2, use_mp=False)
    fitted.init_models()
    fitted.run(10)
    return fitted
示例#42
0
import seaborn as sns

from baxcat.engine import Engine

# Script: fit two Engines to a three-component sample and compare histograms
# of the original data with samples drawn from each fitted engine.

# 300 points: normal draws shifted by -6, scaled by 3, and shifted by +6.
x = np.hstack((
    np.random.randn(100) - 6,
    np.random.randn(100) * 3,
    np.random.randn(100) + 6,
))

s1 = pd.Series(x)
df = pd.DataFrame(s1, columns=['x'])

# First engine: default model initialisation.
engine = Engine(df, n_models=8)
engine.init_models()
engine.run(100)
y = engine.sample('x', n=300)

# Left panel: original data versus samples from the first engine.
# NOTE(review): seaborn has deprecated `distplot` -- confirm the installed
# version still provides it.
plt.subplot(1, 2, 1)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
plt.xlim([-10, 10])

# Second engine.
# NOTE(review): .5 is passed positionally; confirm it binds to the
# subsample-size parameter of `init_models` rather than a model count.
engine_sub = Engine(df, n_models=8)
engine_sub.init_models(.5)
engine_sub.run(100)
y = engine_sub.sample('x', n=300)

# Right panel: original data versus samples from the second engine.
plt.subplot(1, 2, 2)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
示例#43
0
# Script fragment: model a two-group sample and plot the marginal density
# of x together with the densities conditioned on each label. `n` and `s_a1`
# are defined earlier in the file.

# Group A values: normal draws shifted by -2.
s_a2 = pd.Series(np.random.randn(n) - 2.)

# Group B: label column of ones, values shifted by +2.
s_b1 = pd.Series(np.ones(n, dtype=int))
s_b2 = pd.Series(np.random.randn(n) + 2.)


# Stack the two (label, value) groups on top of each other.
df = pd.concat([pd.concat([s_a1, s_a2], axis=1),
                pd.concat([s_b1, s_b2], axis=1)], axis=0)
assert df.shape == (2*n, 2,)

df.columns = ['label', 'x']


engine = Engine(df, n_models=8)
engine.init_models()
engine.run(200)

# Column vector of 200 evaluation points in [-6, 6].
x = np.linspace(-6., 6., 200)[np.newaxis].T

# `engine.probability` results are exponentiated before plotting; the
# conditional curves are additionally scaled by .5.
p_01 = np.exp(engine.probability(x, ['x']))
p_0 = .5*np.exp(engine.probability(x, ['x'], given=[('label', 0,)]))
p_1 = .5*np.exp(engine.probability(x, ['x'], given=[('label', 1,)]))

plt.figure(figsize=(4, 4,))
# `density=True` replaces the `normed=True` keyword, which matplotlib removed.
plt.hist(df['x'], 31, histtype='stepfilled', color='#aaaaaa', edgecolor='None',
         density=True)
plt.plot(x.flatten(), p_0, label='p(x|label=0)')
plt.plot(x.flatten(), p_1, label='p(x|label=1)')
plt.plot(x.flatten(), p_01, ls='--', label='p(x)')
plt.xlabel('x')
plt.ylabel('PDF')
示例#44
0
    std = 1.
    for i in range(n_rows):
        a = np.random.randint(3)
        b = np.random.randint(3)
        mu = mus[a, b]
        x = np.random.randn()*std + mu

        data.append([x, a, b])

    return pd.DataFrame(data)

# Script: fit an Engine to phenotype data padded with random integer columns,
# print the inverse conditional entropy of each column given 'T', then show
# the dependence-probability heatmap.

# Number of rows, and number of extra random columns appended to the data.
n_rows = 100
n_cols = 32

da = gen_phenotype_data(n_rows)
# Extra columns of random integers drawn from {0, 1, 2}.
db = pd.DataFrame(np.random.randint(3, size=(n_rows, n_cols,)))
df = pd.concat([da, db], axis=1)

# The generated columns are named T, A, B; the random ones x_0 .. x_{n-1}.
df.columns = ['T', 'A', 'B'] + ['x_%d' % i for i in range(n_cols)]

engine = Engine(df, n_models=32)
engine.init_models()
engine.run(100)

# NOTE(review): a conditional entropy of exactly 0 would raise
# ZeroDivisionError below -- confirm whether that can occur.
for col in df.columns:
    if col != 'T':
        print("1/H(%s|T) = %f" % (col, 1/engine.conditional_entropy(col, 'T')))

engine.heatmap('dependence_probability')
plt.show()
示例#45
0
# Script fragment: model a two-group sample and evaluate the marginal
# density of x and the densities conditioned on each label. `n`, `s_a1` and
# `s_a2` are defined earlier in the file.

# Group B: label column of ones, values shifted by +2.
s_b1 = pd.Series(np.ones(n, dtype=int))
s_b2 = pd.Series(np.random.randn(n) + 2.)

# Stack the two (label, value) groups on top of each other.
df = pd.concat(
    [pd.concat([s_a1, s_a2], axis=1),
     pd.concat([s_b1, s_b2], axis=1)], axis=0)
assert df.shape == (
    2 * n,
    2,
)

df.columns = ['label', 'x']

engine = Engine(df, n_models=8)
engine.init_models()
engine.run(200)

# Column vector of 200 evaluation points in [-6, 6].
x = np.linspace(-6., 6., 200)[np.newaxis].T

# `engine.probability` results are exponentiated; the conditional curves
# are additionally scaled by .5.
p_01 = np.exp(engine.probability(x, ['x']))
p_0 = .5 * np.exp(engine.probability(x, ['x'], given=[(
    'label',
    0,
)]))
p_1 = .5 * np.exp(engine.probability(x, ['x'], given=[(
    'label',
    1,
)]))

plt.figure(figsize=(
    4,
示例#46
0
def gen_engine_half(df):
    """Return a 4-model Engine initialised with ``subsample_size=0.5``.

    The engine is run for 10 iterations without multiprocessing.
    """
    half = Engine(df, n_models=4, use_mp=False)
    half.init_models(subsample_size=0.5)
    half.run(10)
    return half
示例#47
0
def gen_engine_full(df):
    """Return a 4-model Engine on ``df`` with default model initialisation.

    The engine is run for 10 iterations without multiprocessing.
    """
    full = Engine(df, n_models=4, use_mp=False)
    full.init_models()
    full.run(10)
    return full
示例#48
0
# how to model each column.
engine = Engine(df, n_models=32)

# We can see how baxcat decided to model each column by checking `col_info`.
col_info = engine.col_info()
print(col_info)

# To do inference, we initialize some cross-categorization states with
# `init_models` and then `run` the inference. We initialize many models to
# hedge the inferences we make. Every model is a draw from the posterior. We
# want to make inferences about the data given the posterior distribution of
# states, so we take several models.
print('Initializing 32 models...')
engine.init_models()
print('Running models for 200 iterations...')
engine.run(200, checkpoint=5)

# To check whether inference has converged, we plot the log score for each
# model as a function of time and make sure they have all leveled out.
engine.convergence_plot()
plt.show()

# We can view which columns are dependent on which other columns by plotting
# an n_cols by n_cols matrix where each cell is the dependence probability
# between two columns. Note that the dependence probability is simply the
# probability that a dependence exists, not the strength of the dependence.
engine.heatmap('dependence_probability', plot_kwargs={'figsize': (
    10,
    10,
)})
plt.show()
示例#49
0
import matplotlib.pyplot as plt
import seaborn as sns

from baxcat.engine import Engine

# Script: fit two Engines to a three-component sample and compare histograms
# of the original data with samples drawn from each fitted engine.

# 300 points: normal draws shifted by -6, scaled by 3, and shifted by +6.
x = np.hstack((
    np.random.randn(100) - 6,
    np.random.randn(100)*3,
    np.random.randn(100) + 6,))

s1 = pd.Series(x)
df = pd.DataFrame(s1, columns=['x'])

# First engine: default model initialisation.
engine = Engine(df, n_models=8)
engine.init_models()
engine.run(100)
y = engine.sample('x', n=300)

# Left panel: original data versus samples from the first engine.
# NOTE(review): seaborn has deprecated `distplot` -- confirm the installed
# version still provides it.
plt.subplot(1, 2, 1)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
plt.xlim([-10, 10])

# Second engine.
# NOTE(review): .5 is passed positionally; confirm it binds to the
# subsample-size parameter of `init_models` rather than a model count.
engine_sub = Engine(df, n_models=8)
engine_sub.init_models(.5)
engine_sub.run(100)
y = engine_sub.sample('x', n=300)

# Right panel: original data versus samples from the second engine.
plt.subplot(1, 2, 2)
sns.distplot(x, bins=30, label='original')
sns.distplot(y, bins=30, label='model')
示例#50
0
# how to model each column.
engine = Engine(df, n_models=32)

# We can see how baxcat decided to model each column by checking `col_info`.
col_info = engine.col_info()
print(col_info)

# To do inference, we initialize some cross-categorization states with
# `init_models` and then `run` the inference. We initialize many models to
# hedge the inferences we make. Every model is a draw from the posterior. We
# want to make inferences about the data given the posterior distribution of
# states, so we take several models.
print('Initializing 32 models...')
engine.init_models()
print('Running models for 200 iterations...')
engine.run(200, checkpoint=5)

# To check whether inference has converged, we plot the log score for each
# model as a function of time and make sure they have all leveled out.
engine.convergence_plot()
plt.show()

# We can view which columns are dependent on which other columns by plotting
# an n_cols by n_cols matrix where each cell is the dependence probability
# between two columns. Note that the dependence probability is simply the
# probability that a dependence exists, not the strength of the dependence.
engine.heatmap('dependence_probability', plot_kwargs={'figsize': (10, 10,)})
plt.show()

# The same kind of heatmap, computed with the 'row_similarity' function.
engine.heatmap('row_similarity', plot_kwargs={'figsize': (10, 10,)})
plt.show()
示例#51
0
def row_to_img(df, row_idx):
    """Return one row of ``df`` as a 28x28 array.

    The first column is skipped; the remaining values of the row at
    position ``row_idx`` are reshaped to ``(28, 28)``.
    """
    side = 28
    flat = df.iloc[row_idx, 1:].values
    return flat.reshape((side, side))


# Script: fit an Engine to a 2000-row MNIST sample with the last 500 labels
# hidden, then evaluate on the held-out labels.

# This file is meant to be executed as a script, not imported.
assert __name__ == "__main__"

exdir = os.path.dirname(os.path.realpath(__file__))

df = pd.read_csv(os.path.join(exdir, "mnist.csv.gz"), compression="gzip")
df = df.sample(2000)

# Keep an independent copy of the held-out labels so that blanking them in
# `df` below cannot change `testdata`.
testdata = df["label"].iloc[1500:].copy()

# Blank the held-out labels with one positional assignment on the frame
# itself. Chained assignment (`df["label"][1500:] = ...`) may write to a
# temporary object and leave `df` unchanged. The column is cast to float
# first so that it can hold NaN.
df["label"] = df["label"].astype(float)
df.iloc[1500:, df.columns.get_loc("label")] = float("NaN")

engine = Engine(df)
engine.init_models(4)
engine.run(1000, checkpoint=4, verbose=True)

engine.convergence_plot()
plt.show()

_, m = engine.eval(testdata, metric=Accuracy())
print("Acuracy = %f" % (m,))

# engine.heatmap('row_similarity')
# plt.show()

# engine.heatmap('dependence_probability')
# plt.show()
示例#52
0
        28,
        28,
    ))
    return pixels


# Script: fit an Engine to a 2000-row MNIST sample with the last 500 labels
# hidden, then evaluate on the held-out labels.

# This file is meant to be executed as a script, not imported.
assert __name__ == "__main__"

exdir = os.path.dirname(os.path.realpath(__file__))

df = pd.read_csv(os.path.join(exdir, 'mnist.csv.gz'), compression='gzip')
df = df.sample(2000)

# Keep an independent copy of the held-out labels so that blanking them in
# `df` below cannot change `testdata`.
testdata = df['label'].iloc[1500:].copy()

# Blank the held-out labels with one positional assignment on the frame
# itself. Chained assignment (`df['label'][1500:] = ...`) may write to a
# temporary object and leave `df` unchanged. The column is cast to float
# first so that it can hold NaN.
df['label'] = df['label'].astype(float)
df.iloc[1500:, df.columns.get_loc('label')] = float('NaN')

engine = Engine(df)
engine.init_models(4)
engine.run(1000, checkpoint=4, verbose=True)

engine.convergence_plot()
plt.show()

_, m = engine.eval(testdata, metric=Accuracy())
print('Acuracy = %f' % (m, ))

# engine.heatmap('row_similarity')
# plt.show()

# engine.heatmap('dependence_probability')
# plt.show()