def test_near_shannon_limit():
    """Recovering the planted binary signal from noisy copies.

    With rate = 3 * channel capacity recovery should be near perfect;
    with rate = capacity it should not be.
    """
    # Well above capacity: expect (almost) perfect label recovery, up to
    # an arbitrary global flip of the binary labels (hence max(p, 1 - p)).
    X, Y_true, clusters, tcs = generate_noisy_data(
        n_samples=1000, group_sizes=[200], erasure_p=1. - 3. / 200)
    out = corex.Corex(n_hidden=1, seed=seed, verbose=verbose).fit(X)
    agreement = np.mean(Y_true == out.labels.T)
    assert max(agreement, 1 - agreement) > 0.95  # rate = 3*capacity, near perfect

    # Exactly at capacity: recovery should be clearly imperfect.
    X, Y_true, clusters, tcs = generate_noisy_data(
        n_samples=1000, group_sizes=[200], erasure_p=1. - 1. / 200)
    out = corex.Corex(n_hidden=1, seed=seed, verbose=verbose).fit(X)
    agreement = np.mean(Y_true == out.labels.T)
    assert max(agreement, 1 - agreement) < 0.9  # rate=capacity, not perfect
def test_missing_values():
    """Generator test: cluster recovery must still work with entries missing (coded -1)."""
    n_samples = 100
    dim_hidden = 2
    missing = 0.1
    group_sizes = [10, 7]  # Chance of entire row missing smaller than missing^n
    np.random.seed(seed)
    X, Y_true, clusters, tcs = generate_data(
        n_samples=n_samples, group_sizes=group_sizes,
        dim_hidden=dim_hidden, missing=missing)
    methods = [
        corex.Corex(n_hidden=len(group_sizes), dim_hidden=dim_hidden,
                    missing_values=-1, seed=seed, verbose=verbose).fit(X)
    ]
    method_names = ['base', 'gaussian', 'discrete', 'discrete NT', 'gaussian NT']
    for i, method in enumerate(methods):
        f = partial(check_correct, clusters, method.tcs, Y_true, X, method)
        update_wrapper(f, check_correct)
        f.description = 'missing values, ' + method_names[i] + ', seed: ' + str(seed)
        yield (f, )
def test_no_tc_in_random():
    """Independent uniform random bits should carry (almost) no total correlation.

    Fits CorEx on random binary matrices of two sizes and checks the
    estimated TC is close to zero for both.
    """
    sizes = [(100, 10), (200, 20)]
    tcs = []
    for size in sizes:
        test_data = np.random.randint(0, 2, size)
        tcs.append(ce.Corex(seed=seed).fit(test_data).tc)
    # list(...) so a failing assert shows the (size, tc) pairs instead of an
    # opaque iterator: zip() is lazy under Python 3.
    assert np.allclose(tcs, 0, atol=0.15), list(zip(sizes, tcs))
def test_mi():
    """Mutual information estimates: log(2) for the two informative columns, ~0 for the noise column."""
    base_rows = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 0], [0, 0, 1],
                          [1, 1, 0], [1, 1, 1], [1, 1, 0], [1, 1, 1]],
                         dtype=int)
    test_data = np.repeat(base_rows, 3, axis=0)
    mis = ce.Corex(seed=seed).fit(test_data).mis
    expected = np.array([np.log(2), np.log(2), 0])
    assert np.allclose(mis, expected, atol=0.05), mis
def test_stable_solution_with_many_starting_points():
    """The optimum (TC close to log 2 ~ 0.69) should be reached from every random restart."""
    test_data = np.repeat(np.array([[0, 0], [1, 1]], dtype=int), 10, axis=0)
    n_correct = []
    for i in range(10):
        this_tc = ce.Corex(seed=i, verbose=verbose,
                           smooth_marginals=False).fit(test_data).tc
        # print() function form: with a single argument the output is the same
        # on Python 2, and it is consistent with the print() calls used
        # elsewhere in this code base (the old `print this_tc` statement is a
        # syntax error on Python 3).
        print(this_tc)
        n_correct.append(this_tc > 0.64)
    assert np.all(n_correct), "number correct %d / %d" % (np.sum(n_correct),
                                                          len(n_correct))
def test_near_shannon_limit():
    """Label recovery near the Shannon limit for the erasure-channel construction.

    Above capacity (rate = 3 * capacity) recovery should be near perfect;
    at capacity it should not be. Labels are compared up to a global flip.
    """
    # rate = 3 * capacity: near perfect recovery expected.
    counts, Y_true, clusters, tcs = generate_noisy_data(
        n_samples=1000, group_sizes=[200], erasure_p=1. - 3. / 200)
    out = ce.Corex(n_hidden=1, seed=seed, verbose=verbose).fit(counts)
    out_labels = np.rint(out.labels).astype(int)
    # BUG FIX: previously the first mean compared against out_labels while the
    # second compared against out_labels.T; both sides of max(p, 1 - p) must
    # use the same comparison or the two fractions are computed over
    # differently-shaped (broadcast) arrays.
    frac_agree = np.mean(Y_true[0] == out_labels)
    frac_correct = max(frac_agree, 1 - frac_agree)
    assert frac_correct > 0.94, \
        'fraction correct should be high: %f' % frac_correct  # rate = 3*capacity, near perfect

    # rate = capacity: recovery should be clearly imperfect.
    counts, Y_true, clusters, tcs = generate_noisy_data(
        n_samples=1000, group_sizes=[200], erasure_p=1. - 1. / 200)
    out = ce.Corex(n_hidden=1, seed=seed, verbose=verbose).fit(counts)
    out_labels = np.rint(out.labels).astype(int)
    frac_agree = np.mean(Y_true[0] == out_labels)
    assert max(frac_agree, 1 - frac_agree) < 0.9  # rate=capacity, not perfect
def test_BinaryGaussianCorEx():
    """Generator test: CorEx recovers the planted clusters for two group layouts."""
    n_samples = 100
    for group_sizes in ([2], [3, 2]):
        np.random.seed(seed)
        counts, Y_true, clusters, tcs = generate_data(
            n_samples=n_samples, group_sizes=group_sizes)
        method = ce.Corex(seed=seed, verbose=verbose).fit(counts)
        checker = partial(check_correct, clusters, tcs, Y_true, counts, method)
        update_wrapper(checker, check_correct)
        checker.description = 'groups:' + str(group_sizes) + ' seed: ' + str(seed)
        yield (checker, )
def __init__(self, x, **kwargs):
    """Fit CorEx on x, then fit one Remainder per column given the learned labels.

    Parameters
    ----------
    x : 2-D array of discrete data. Negative entries are treated as missing
        and excluded from that column's Remainder fit.
    **kwargs : forwarded to ce.Corex, except 'k_max', which is consumed here.
    """
    k_max = kwargs.pop('k_max', 2)  # Sets max cardinality for Remainder objects
    self.verbose = kwargs.get('verbose', False)
    self.corex = ce.Corex(**kwargs).fit(x)
    self.labels = self.corex.labels
    # One Remainder per column, fit only on rows where that column is observed
    # (xs >= 0 masks out the missing entries).
    self.remainders = [
        re.Remainder(xs[xs >= 0], self.labels[xs >= 0], k_max=k_max)
        for xs in x.T
    ]
    if self.verbose:
        # print() function form (the old two-argument `print a, b` statement is
        # a syntax error on Python 3); '%s' keeps the output byte-identical to
        # the original statement on Python 2 as well.
        print('z cardinalities %s' %
              [r.pz_xy.shape[0] for r in self.remainders])
def main():
    """Fit a single CorEx layer with 4-state hidden factors and split the raw
    data lines into files JH4_CL0..JH4_CL3 by the first factor's label.

    Output files are opened in append mode, so repeated runs accumulate lines.
    """
    features, datalist = data('jeu_hota4')
    layer1 = ce.Corex(n_hidden=2, dim_hidden=4, verbose=1, seed=123)
    layer1.fit(features)
    for i in range(len(datalist)):
        label = layer1.labels[i][0]
        # Replaces the four duplicated if/elif branches that differed only in
        # the label value and the file suffix. Labels outside 0..3 are
        # ignored, exactly as the original chain did.
        if label in (0, 1, 2, 3):
            with open('JH4_CL%d' % label, 'a') as f:
                f.write(datalist[i].strip('\n') + '\n')
def test_large_k():
    """Generator test: recovery with a large hidden cardinality (d=10) plus pure-noise columns."""
    n_samples = 300
    d = 10
    np.random.seed(seed)
    counts, Y_true, clusters, tcs = generate_data(
        n_samples=n_samples, group_sizes=[3], dim_hidden=d)
    # Append 5 pure-noise columns that should not end up in the cluster.
    counts_n = np.hstack([counts, np.random.randint(0, d, (n_samples, 5))])
    method = ce.Corex(seed=seed, n_repeat=10, dim_hidden=d, verbose=verbose,
                      smooth_marginals=True).fit(counts_n)
    # print() function form: with a single argument the output is unchanged on
    # Python 2, and the old `print method.mis` statement is a syntax error on
    # Python 3.
    print(method.mis)
    f = partial(check_correct, clusters, tcs, Y_true, counts_n, method)
    update_wrapper(f, check_correct)
    f.description = 'large k'
    yield (f, )
def test_constant():
    """Near-constant data should give TC ~= 0 and a transform consistent with fit labels."""
    # TODO: labels are kind of random if there is no signal... it might be
    # nice to get that out somehow.
    for i in range(10):
        test_data = np.repeat(np.array(
            [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
             [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int), 1, axis=0)
        method = ce.Corex(seed=i).fit(test_data)
        # print() function form with % formatting: output identical on
        # Python 2 and 3 (the old multi-argument print statements would print
        # tuples if merely parenthesized on Python 2).
        print('Constant data, seed %d' % i)
        print('tc, mi %s %s' % (method.mis, method.tc))
        print('labels %s' % method.labels)
        print('labels %s' % method.transform(test_data))
        assert np.array_equal(method.transform(test_data),
                              method.labels)  # Correctness of transform
        # BUG FIX: the failure message previously read
        # "TC error: %f, %f" % (corex.tc, np.max(tcs)) — `corex` is a module
        # with no `tc` attribute and `tcs` is undefined here, so a failing
        # assert raised NameError/AttributeError instead of reporting the TC.
        assert np.allclose(method.tc, 0, atol=0.001, rtol=0.1), \
            "TC error: %f" % method.tc
def test_corex_all():
    """Generator test over group layouts and hidden cardinalities."""
    n_samples = 100
    method_names = ['base', 'gaussian', 'discrete', 'discrete NT',
                    'gaussian NT', 'beta NT']
    for group_sizes in ([2], [3, 2]):
        for dim_hidden in (2, 3):
            np.random.seed(seed)
            X, Y_true, clusters, tcs = generate_data(
                n_samples=n_samples, group_sizes=group_sizes,
                dim_hidden=dim_hidden)
            methods = [
                corex.Corex(n_hidden=len(group_sizes), dim_hidden=dim_hidden,
                            missing_values=-1, seed=seed,
                            verbose=verbose).fit(X)
            ]
            for i, method in enumerate(methods):
                f = partial(check_correct, clusters, method.tcs, Y_true, X,
                            method)
                update_wrapper(f, check_correct)
                f.description = ('method: ' + method_names[i] +
                                 ', groups:' + str(group_sizes) +
                                 ', dim_hidden:' + str(dim_hidden) +
                                 ', seed: ' + str(seed))
                yield (f, )
# NOTE(review): incomplete script fragment — `fill_value=-1)` closes a call whose
# opening is outside this view, and the line ends inside an unfinished
# "Start/End/Elapsed" .format(...) call, so it cannot be rewritten safely here.
# What the visible code does: drops columns containing -1 (missing data), then
# fits a 3-layer hierarchical CorEx (20 -> 3 -> 1 hidden units, trinary factors)
# on the remaining matrix, timing the fit with default_timer.
# NOTE(review): DataFrame.as_matrix() was removed in modern pandas — presumably
# .values / .to_numpy() is the replacement; confirm the pandas version in use.
fill_value=-1) # "We included only the 388 companies which were on the S&P 500 for the entire period." X = data.loc[:, ~(data == -1).any(axis=0)].as_matrix() labels = data.loc[:, ~(data == -1).any(axis=0)].columns # "We use a representation with m1 = 20, m2 = 3, m3 = 1 and Yj were discrete trinary variables." layers = [] for n in (20, 3, 1): layer = ce.Corex(n_hidden=n, dim_hidden=3, max_iter=500, n_repeat=10, ram=16., max_samples=1000, n_cpu=4, eps=1e-5, marginal_description='gaussian', smooth_marginals=True, missing_values=-1, seed=1, verbose=True) layers.append(layer) # Fit the model start = default_timer() Y1 = layers[0].fit_transform(X) Y2 = layers[1].fit_transform(Y1) Y3 = layers[2].fit_transform(Y2) end = default_timer() print("Start: {0}\tEnd: {1}\tElapsed: {2}".format(start, end,
# NOTE(review): incomplete script fragment — the line ends inside an unfinished
# ce.Corex(...) call (the `else:` branch's constructor is cut off), so it cannot
# be rewritten safely here. What the visible code does: unless options.regraph
# is set, it fits one CorEx model per entry of `layers`; the first layer fits on
# X with a caller-chosen marginal description `marg` (and is timed), while each
# subsequent layer is fit on the previous layer's labels with
# marginal_description='discrete'.
# Run CorEx on data if verbose: print('Getting CorEx results') corexes = [] if not options.regraph: for l, layer in enumerate(layers): if verbose: print("Layer ", l) if l == 0: t0 = time() corexes = [ ce.Corex(n_hidden=layer, dim_hidden=options.dim_hidden, verbose=verbose, marginal_description=marg, smooth_marginals=options.smooth, missing_values=options.missing, n_repeat=options.repeat, max_iter=options.max_iter, n_cpu=options.cpu, ram=options.ram).fit(X) ] print('Time for first layer: %0.2f' % (time() - t0)) else: X_prev = corexes[-1].labels corexes.append( ce.Corex(n_hidden=layer, dim_hidden=options.dim_hidden, verbose=verbose, marginal_description='discrete', smooth_marginals=options.smooth, n_repeat=options.repeat,