예제 #1
0
def test_near_shannon_limit():
    """Check single-factor recovery above and at the channel capacity.

    Noisy data is generated for one group of 200 variables at two erasure
    probabilities, and a one-factor CorEx model is fit to each. The score is
    the larger of the label agreement and its complement, so an inverted
    labelling counts the same as a direct one.
    """

    def best_agreement(erasure_p):
        # Generate, fit and score one configuration.
        X, Y_true, clusters, tcs = generate_noisy_data(n_samples=1000,
                                                       group_sizes=[200],
                                                       erasure_p=erasure_p)
        out = corex.Corex(n_hidden=1, seed=seed, verbose=verbose).fit(X)
        agreement = np.mean(Y_true == out.labels.T)
        return max(agreement, 1 - agreement)

    # rate = 3*capacity, near perfect
    assert best_agreement(1. - 3. / 200) > 0.95

    # rate=capacity, not perfect
    assert best_agreement(1. - 1. / 200) < 0.9
예제 #2
0
def test_missing_values():
    """Yield ``check_correct`` cases for CorEx fit on data with missing entries.

    Data is generated with a ``missing`` fraction of 0.1 and the model is
    told that ``-1`` marks a missing value. Each fitted model is bound into
    a ``check_correct`` partial and yielded as a one-element tuple, with a
    human-readable ``description`` attached.
    """
    n_samples = 100
    dim_hidden = 2
    missing = 0.1
    # Chance of entire row missing smaller than missing^n
    group_sizes = [10, 7]
    method_names = ['base', 'gaussian', 'discrete', 'discrete NT',
                    'gaussian NT']

    np.random.seed(seed)
    X, Y_true, clusters, tcs = generate_data(n_samples=n_samples,
                                             group_sizes=group_sizes,
                                             dim_hidden=dim_hidden,
                                             missing=missing)
    fitted = corex.Corex(n_hidden=len(group_sizes),
                         dim_hidden=dim_hidden,
                         missing_values=-1,
                         seed=seed,
                         verbose=verbose).fit(X)
    methods = [fitted]

    for name, method in zip(method_names, methods):
        # The check uses the fitted model's own tcs, not the generated ones.
        f = partial(check_correct, clusters, method.tcs, Y_true, X, method)
        update_wrapper(f, check_correct)
        f.description = 'missing values, %s, seed: %s' % (name, seed)
        yield (f, )
예제 #3
0
def test_no_tc_in_random():
    """Check that CorEx reports near-zero total correlation on random bits.

    Two uniformly random binary matrices are generated and a default CorEx
    model is fit to each; every resulting ``tc`` must be within 0.15 of 0.
    """
    # Seed the global RNG so the random matrices are reproducible between
    # runs, as the other data-generating tests in this file do.
    np.random.seed(seed)
    sizes = [(100, 10), (200, 20)]
    tcs = [
        ce.Corex(seed=seed).fit(np.random.randint(0, 2, size)).tc
        for size in sizes
    ]
    # list(...) so the failure message shows the pairs, not a zip object.
    assert np.allclose(tcs, 0, atol=0.15), list(zip(sizes, tcs))
예제 #4
0
def test_mi():
    """Check the per-variable mutual information on a small fixed data set.

    In the eight base rows the first two columns are always equal and the
    third alternates 0/1; each row is repeated three times. The fitted
    model's ``mis`` must be close to ``[log 2, log 2, 0]``.
    """
    patterns = np.array(
        [[0, 0, 0], [0, 0, 1], [0, 0, 0], [0, 0, 1],
         [1, 1, 0], [1, 1, 1], [1, 1, 0], [1, 1, 1]],
        dtype=int)
    test_data = np.repeat(patterns, 3, axis=0)

    model = ce.Corex(seed=seed).fit(test_data)
    mis = model.mis

    expected = np.array([np.log(2), np.log(2), 0])
    assert np.allclose(mis, expected, atol=0.05), mis
예제 #5
0
def test_stable_solution_with_many_starting_points():
    """Check that ten different seeds all reach a TC above 0.64.

    The data is two perfectly correlated binary columns (rows ``[0, 0]`` and
    ``[1, 1]``, each repeated ten times). A model is fit once per seed with
    ``smooth_marginals=False``; every run must exceed the threshold.
    """
    test_data = np.repeat(np.array([[0, 0], [1, 1]], dtype=int), 10, axis=0)
    n_correct = []
    for i in range(10):
        this_tc = ce.Corex(seed=i, verbose=verbose,
                           smooth_marginals=False).fit(test_data).tc
        # Function-call form prints the same text on Python 2 and Python 3.
        print(this_tc)
        n_correct.append(this_tc > 0.64)
    assert np.all(n_correct), "number correct %d / %d" % (np.sum(n_correct),
                                                          len(n_correct))
예제 #6
0
def test_near_shannon_limit():
    """Check single-factor recovery above and at the channel capacity.

    For each erasure probability, noisy data is generated for one group of
    200 variables and a one-factor model is fit. The fitted labels are
    rounded to integers and compared against ``Y_true[0]``; the score is the
    larger of the agreement and its complement.
    """
    # --- rate = 3*capacity, near perfect ---
    high_rate = 1. - 3. / 200
    counts, Y_true, clusters, tcs = generate_noisy_data(n_samples=1000,
                                                        group_sizes=[200],
                                                        erasure_p=high_rate)
    out = ce.Corex(n_hidden=1, seed=seed, verbose=verbose).fit(counts)
    out_labels = np.rint(out.labels).astype(int)
    # NOTE(review): only the complement term compares against the transposed
    # labels; the second check below uses untransposed labels in both terms.
    # Kept as in the original -- confirm which shape is intended.
    direct = np.mean(Y_true[0] == out_labels)
    flipped = 1 - np.mean(Y_true[0] == out_labels.T)
    frac_correct = max(direct, flipped)
    assert frac_correct > 0.94, 'fraction correct should be high: %f' % frac_correct

    # --- rate=capacity, not perfect ---
    low_rate = 1. - 1. / 200
    counts, Y_true, clusters, tcs = generate_noisy_data(n_samples=1000,
                                                        group_sizes=[200],
                                                        erasure_p=low_rate)
    out = ce.Corex(n_hidden=1, seed=seed, verbose=verbose).fit(counts)
    out_labels = np.rint(out.labels).astype(int)
    agreement = np.mean(Y_true[0] == out_labels)
    assert max(agreement, 1 - agreement) < 0.9
예제 #7
0
def test_BinaryGaussianCorEx():
    """Yield one ``check_correct`` case per group-size configuration.

    For each of the configurations ``[2]`` and ``[3, 2]`` the RNG is
    re-seeded, data is generated, a default CorEx model is fit, and a
    ``check_correct`` partial bound to the generated ground truth is yielded
    as a one-element tuple.
    """
    n_samples = 100
    configurations = [[2], [3, 2]]
    for group_sizes in configurations:
        np.random.seed(seed)
        generated = generate_data(n_samples=n_samples,
                                  group_sizes=group_sizes)
        counts, Y_true, clusters, tcs = generated
        method = ce.Corex(seed=seed, verbose=verbose).fit(counts)

        f = partial(check_correct, clusters, tcs, Y_true, counts, method)
        update_wrapper(f, check_correct)
        f.description = 'groups:%s seed: %s' % (group_sizes, seed)
        yield (f, )
예제 #8
0
 def __init__(self, x, **kwargs):
     """Fit a CorEx model on ``x`` and build one Remainder per column.

     Args:
         x: data array; it is passed to ``Corex.fit`` and iterated
             column-wise via ``x.T``. Entries below 0 are excluded when
             building each Remainder.
         **kwargs: forwarded to ``ce.Corex`` after ``k_max`` is removed.
             ``k_max`` (default 2) is passed to each ``re.Remainder``;
             ``verbose`` (default False) is also stored on the instance.
     """
     # Sets max cardinality for Remainder objects
     k_max = kwargs.pop('k_max', 2)
     self.verbose = kwargs.get('verbose', False)
     self.corex = ce.Corex(**kwargs).fit(x)
     self.labels = self.corex.labels

     remainders = []
     for xs in x.T:
         # Compute the mask once; it selects both the column's kept entries
         # and the matching label rows.
         observed = xs >= 0
         remainders.append(
             re.Remainder(xs[observed], self.labels[observed], k_max=k_max))
     self.remainders = remainders

     if self.verbose:
         # Single formatted argument prints identically on Python 2 and 3.
         print('z cardinalities %s' %
               ([r.pz_xy.shape[0] for r in self.remainders], ))
예제 #9
0
def main():
    """Cluster the 'jeu_hota4' data with CorEx and append rows to per-cluster files.

    A model with two hidden factors of cardinality 4 is fit on the features.
    Each entry of ``datalist`` is then appended, newline-normalised, to the
    file matching the first factor's label for that row. Rows whose label is
    not 0, 1, 2 or 3 are not written anywhere.
    """
    features, datalist = data('jeu_hota4')

    layer1 = ce.Corex(n_hidden=2, dim_hidden=4, verbose=1, seed=123)
    layer1.fit(features)

    # Label value of the first hidden factor -> output file.
    outputs = (
        (0, 'JH4_CL0'),
        (1, 'JH4_CL1'),
        (2, 'JH4_CL2'),
        (3, 'JH4_CL3'),
    )

    for row, line in enumerate(datalist):
        label = layer1.labels[row][0]
        for cluster, path in outputs:
            if label == cluster:
                # Opened in append mode, so existing content is kept.
                with open(path, 'a') as out:
                    out.write(line.strip('\n') + '\n')
                break
예제 #10
0
def test_large_k():
    """Yield a ``check_correct`` case for a hidden factor with cardinality 10.

    Data for one group of three variables is generated with
    ``dim_hidden=10``, then five columns of random integers in ``[0, 10)``
    are appended before fitting. The fitted model is checked against the
    generated ground truth.
    """
    n_samples = 300
    d = 10
    np.random.seed(seed)
    counts, Y_true, clusters, tcs = generate_data(n_samples=n_samples,
                                                  group_sizes=[3],
                                                  dim_hidden=d)
    # Append five columns of random integers to the generated data.
    counts_n = np.hstack([counts, np.random.randint(0, d, (n_samples, 5))])
    method = ce.Corex(seed=seed,
                      n_repeat=10,
                      dim_hidden=d,
                      verbose=verbose,
                      smooth_marginals=True).fit(counts_n)
    # Function-call form prints the same text on Python 2 and Python 3.
    print(method.mis)

    f = partial(check_correct, clusters, tcs, Y_true, counts_n, method)
    update_wrapper(f, check_correct)
    f.description = 'large k'
    yield (f, )
예제 #11
0
def test_constant():
    """Check CorEx on data whose first eight columns are constant.

    For seeds 0-9 a model is fit on a 4x10 binary matrix in which only the
    last two columns vary. Two properties are asserted: ``transform`` on the
    training data reproduces ``labels``, and the total correlation is
    approximately zero.
    """
    # TODO: labels are kind of random if there is no signal... it might be nice to get that out somehow.
    for i in range(10):
        test_data = np.repeat(np.array(
            [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]],
            dtype=int),
                              1,
                              axis=0)
        method = ce.Corex(seed=i).fit(test_data)
        # Single formatted arguments print identically on Python 2 and 3.
        print('Constant data, seed %d' % i)
        # Values are printed in the order the label announces (tc, then mi).
        print('tc, mi %s %s' % (method.tc, method.mis))
        print('labels %s' % (method.labels, ))
        print('labels %s' % (method.transform(test_data), ))
        assert np.array_equal(method.transform(test_data),
                              method.labels)  # Correctness of transform
        # The message uses only names defined here, so a failing assertion
        # reports the offending TC instead of raising NameError.
        assert np.allclose(
            method.tc, 0, atol=0.001,
            rtol=0.1), "TC error: %s" % (method.tc, )
예제 #12
0
def test_corex_all():
    """Yield ``check_correct`` cases over group sizes and hidden cardinalities.

    For every combination of group sizes (``[2]``, ``[3, 2]``) and
    ``dim_hidden`` (2, 3) the RNG is re-seeded, data is generated and a
    CorEx model is fit with ``-1`` as the missing-value marker. Each fitted
    model is yielded as a described ``check_correct`` partial.
    """
    n_samples = 100
    method_names = ['base', 'gaussian', 'discrete', 'discrete NT',
                    'gaussian NT', 'beta NT']

    for group_sizes in [[2], [3, 2]]:
        for dim_hidden in [2, 3]:
            np.random.seed(seed)
            X, Y_true, clusters, tcs = generate_data(n_samples=n_samples,
                                                     group_sizes=group_sizes,
                                                     dim_hidden=dim_hidden)
            fitted = corex.Corex(n_hidden=len(group_sizes),
                                 dim_hidden=dim_hidden,
                                 missing_values=-1,
                                 seed=seed,
                                 verbose=verbose).fit(X)
            methods = [fitted]

            for name, method in zip(method_names, methods):
                # The check uses the fitted model's own tcs.
                f = partial(check_correct, clusters, method.tcs, Y_true, X,
                            method)
                update_wrapper(f, check_correct)
                f.description = (
                    'method: %s, groups:%s, dim_hidden:%s, seed: %s' %
                    (name, group_sizes, dim_hidden, seed))
                yield (f, )
예제 #13
0
                        fill_value=-1)

# "We included only the 388 companies which were on the S&P 500 for the entire period."
# Columns that never contain the -1 fill value; computed once and reused for
# both the data matrix and its column labels.
complete_columns = ~(data == -1).any(axis=0)
# `.values` replaces DataFrame.as_matrix(), which was deprecated and then
# removed in pandas 1.0; it returns the same ndarray.
X = data.loc[:, complete_columns].values
labels = data.loc[:, complete_columns].columns

# "We use a representation with m1 = 20, m2 = 3, m3 = 1 and Yj were discrete trinary variables."
layers = []
for n in (20, 3, 1):
    # Every layer shares the same settings; only the number of hidden
    # factors differs.
    layer = ce.Corex(n_hidden=n,
                     dim_hidden=3,
                     max_iter=500,
                     n_repeat=10,
                     ram=16.,
                     max_samples=1000,
                     n_cpu=4,
                     eps=1e-5,
                     marginal_description='gaussian',
                     smooth_marginals=True,
                     missing_values=-1,
                     seed=1,
                     verbose=True)
    layers.append(layer)

# Fit the model: each layer is fit on the previous layer's transformed output.
start = default_timer()
Y1 = layers[0].fit_transform(X)
Y2 = layers[1].fit_transform(Y1)
Y3 = layers[2].fit_transform(Y2)
end = default_timer()
print("Start: {0}\tEnd: {1}\tElapsed: {2}".format(start, end,
예제 #14
0
 # Run CorEx on data
 if verbose:
     print('Getting CorEx results')
     corexes = []
 if not options.regraph:
     for l, layer in enumerate(layers):
         if verbose:
             print("Layer ", l)
         if l == 0:
             t0 = time()
             corexes = [
                 ce.Corex(n_hidden=layer,
                          dim_hidden=options.dim_hidden,
                          verbose=verbose,
                          marginal_description=marg,
                          smooth_marginals=options.smooth,
                          missing_values=options.missing,
                          n_repeat=options.repeat,
                          max_iter=options.max_iter,
                          n_cpu=options.cpu,
                          ram=options.ram).fit(X)
             ]
             print('Time for first layer: %0.2f' % (time() - t0))
         else:
             X_prev = corexes[-1].labels
             corexes.append(
                 ce.Corex(n_hidden=layer,
                          dim_hidden=options.dim_hidden,
                          verbose=verbose,
                          marginal_description='discrete',
                          smooth_marginals=options.smooth,
                          n_repeat=options.repeat,