def forward_sample(X, n_iters, Zv=None, Zrcv=None, n_grid=30, n_chains=1,
                   ct_kernel=0):
    """Resample data from freshly built states, ``n_iters`` times per chain.

    Every draw constructs a new ``cc_state`` from ``X`` (all columns typed
    ``'normal'``, no distargs), resamples its data with
    ``su.resample_data`` and summarises the result with ``get_data_stats``.
    A running percentage is written to stdout after each draw.

    Returns:
        ``(stats, forward_samples)`` where ``stats`` is a flat list holding
        one ``get_data_stats`` result per draw (all chains concatenated) and
        ``forward_samples`` maps each chain index to the list of data sets
        resampled for that chain.
    """
    column_count = len(X)
    column_types = ['normal'] * column_count
    column_args = [None] * column_count
    draws_total = n_chains * n_iters

    stats = []
    forward_samples = {}
    draws_done = 0
    for chain in range(n_chains):
        chain_samples = forward_samples[chain] = []
        for _ in range(n_iters):
            draws_done += 1
            state = cc_state.cc_state(X, column_types, column_args, Zv=Zv,
                                      Zrcv=Zrcv, n_grid=n_grid,
                                      ct_kernel=ct_kernel)
            resampled = su.resample_data(state)
            chain_samples.append(resampled)
            stats.append(get_data_stats(resampled, state))

            # Carriage return keeps the progress read-out on a single line.
            progress = "\r%1.2f " % (draws_done * 100.0 / float(draws_total))
            sys.stdout.write(progress)
            sys.stdout.flush()

    return stats, forward_samples
def construct_state_from_legacy_metadata(T, M_c, X_L, X_D):
    """Generate a state from CrossCat-formatted data, T, and metadata.

    Args:
        T: Row-major data table; converted with ``numpy.array`` and split
            into one 1-D array per column.
        M_c: Column metadata; only
            ``M_c['column_metadata'][index]['modeltype']`` is read.
        X_L: Latent-state metadata; supplies the column partition
            assignments and alpha, each view's row-partition alpha, and the
            per-column hyperparameters.
        X_D: Per-view row assignments, passed to ``cc_state`` as ``Zrcv``.

    Returns:
        The ``cc_state`` built from the data with every column typed
        ``'normal'`` and its alphas and dimension hyperparameters
        overwritten from the metadata.
    """
    # ignores suffstats, calculates them manually
    Zv = X_L['column_partition']['assignments']
    Zrcv = list(X_D)

    T_array = numpy.array(T)
    # 'F' is the explicit spelling of the legacy ``flatten(1)`` call, whose
    # non-string order argument current NumPy no longer accepts.
    X = [T_array[:, col].flatten('F') for col in range(T_array.shape[1])]

    cctypes = ['normal'] * len(X)
    distargs = [None] * len(X)

    state = cc_state.cc_state(X, cctypes, distargs, Zv=Zv, Zrcv=Zrcv)

    # set Column alpha
    state.alpha = X_L['column_partition']['hypers']['alpha']

    for v in range(state.V):
        view_state = X_L['view_state'][v]
        state.views[v].alpha = (
            view_state['row_partition_model']['hypers']['alpha'])
        # ``items()`` exists on both Python 2 and 3 dicts, unlike
        # ``iteritems()``.
        for index, dim in state.views[v].dims.items():
            hypers = X_L['column_hypers'][index]
            model_type = M_c['column_metadata'][index]['modeltype']
            _set_dim_hypers_from_legacy(dim, hypers, model_type)

    return state
def construct_state_from_legacy_metadata(T, M_c, X_L, X_D):
    """Generate a state from CrossCat-formatted data, T, and metadata.

    The data table ``T`` is split into columns, a ``cc_state`` is built with
    the column partition from ``X_L`` and the row partitions from ``X_D``,
    and then the column alpha, each view's row alpha and each dimension's
    hyperparameters are copied over from the metadata.

    Args:
        T: Row-major data table (anything ``numpy.array`` accepts).
        M_c: Column metadata; only the ``'modeltype'`` of each entry in
            ``M_c['column_metadata']`` is read.
        X_L: Latent-state metadata (``'column_partition'``, ``'view_state'``
            and ``'column_hypers'`` are read).
        X_D: Per-view row assignments.

    Returns:
        The configured ``cc_state`` instance.
    """
    # ignores suffstats, calculates them manually
    column_assignments = X_L['column_partition']['assignments']
    row_assignments = list(X_D)

    T_array = numpy.array(T)
    n_cols = T_array.shape[1]
    # ``flatten('F')`` replaces the legacy ``flatten(1)``: same Fortran-order
    # copy, but with an order argument that current NumPy accepts.
    X = [T_array[:, col].flatten('F') for col in range(n_cols)]

    cctypes = ['normal'] * len(X)
    distargs = [None] * len(X)
    state = cc_state.cc_state(X, cctypes, distargs, Zv=column_assignments,
                              Zrcv=row_assignments)

    # set Column alpha
    state.alpha = X_L['column_partition']['hypers']['alpha']

    column_hypers = X_L['column_hypers']
    column_metadata = M_c['column_metadata']
    for v in range(state.V):
        view = state.views[v]
        row_model = X_L['view_state'][v]['row_partition_model']
        view.alpha = row_model['hypers']['alpha']
        # ``dict.items()`` works on Python 2 and 3; ``iteritems()`` is
        # Python 2 only.
        for index, dim in view.dims.items():
            _set_dim_hypers_from_legacy(
                dim, column_hypers[index],
                column_metadata[index]['modeltype'])

    return state
def _do_intialize(args):
    """Build a ``cc_state`` from a packed argument sequence; return its metadata.

    ``args`` is read positionally: ``args[0]`` is the data, ``args[1]`` the
    column types, ``args[2]`` the distargs and ``args[3]`` an initialization
    mode. The mode is looked up but not otherwise used.
    """
    data, column_types, column_args = args[0], args[1], args[2]
    # Still indexed so that a too-short ``args`` raises exactly as before.
    _unused_init_mode = args[3]
    state = cc_state.cc_state(data, column_types, distargs=column_args)
    return state.get_metadata()
def run_test(argsin):
    """Plot each shape's data next to predictive samples from every chain.

    The top row of the figure shows the generated data for each entry of
    the module-level ``shapes`` (blue). Each further row belongs to one
    chain: a ``cc_state`` is fitted to the shape's data for ``num_iters``
    transitions and ``num_rows`` predictive samples of columns 0 and 1 are
    scattered in red, on the axis limits of the original plot.

    Args:
        argsin: Mapping with the keys ``"num_rows"``, ``"num_iters"``,
            ``"num_chains"`` and ``"ct_kernel"``.

    The subplot grid is fixed at four columns, so at most four shapes fit.
    Relies on the module-level ``shapes``, ``gen_function``, ``cctypes`` and
    ``distargs``.
    """
    # Read the settings from the parameter; the body previously looked up a
    # global named ``args`` and ignored what the caller passed in.
    n_rows = argsin["num_rows"]
    n_iters = argsin["num_iters"]
    n_chains = argsin["num_chains"]
    ct_kernel = argsin["ct_kernel"]

    pylab.figure(num=None, facecolor='w', edgecolor='k', frameon=False,
                 tight_layout=True)

    data = {'x': [], 'sin': [], 'ring': [], 'dots': []}
    xlims = dict()
    ylims = dict()

    # Top row: the original data for every shape.
    for plt, shape in enumerate(shapes, 1):
        data[shape] = gen_function[shape](n_rows)

        ax = pylab.subplot(n_chains + 1, 4, plt)
        pylab.scatter(data[shape][0], data[shape][1], s=10, color='blue',
                      edgecolor='none', alpha=.2)
        ax.set_xticks([])
        ax.set_yticks([])

        pylab.suptitle("Kernel %i" % ct_kernel)

        # Remembered so the simulated plots share the original's limits.
        xlims[shape] = ax.get_xlim()
        ylims[shape] = ax.get_ylim()

    # One further row per chain: samples simulated from a fitted state.
    for chain in range(n_chains):
        print("chain %i of %i." % (chain + 1, n_chains))
        for plt, shape in enumerate(shapes, 1):
            print("\tWorking on %s." % shape)
            T = data[shape]
            S = cc_state.cc_state(T, cctypes, ct_kernel=ct_kernel,
                                  distargs=distargs)
            S.transition(N=n_iters)
            T_chain = numpy.array(
                su.simple_predictive_sample(S, n_rows, [0, 1], N=n_rows))

            ax = pylab.subplot(n_chains + 1, 4, chain * 4 + 4 + plt)
            ax.set_xticks([])
            ax.set_yticks([])
            pylab.scatter(T_chain[:, 0], T_chain[:, 1], s=10, color='red',
                          edgecolor='none', alpha=.2)
            pylab.xlim(xlims[shape])
            pylab.ylim(ylims[shape])

    print("Done.")
    pylab.show()
def initialize(self, M_c, M_r, T, initialization='from_the_prior',
               specified_s_grid=None, specified_mu_grid=None,
               row_initialization=-1, n_chains=1):
    """Create ``n_chains`` fresh states for ``T`` and return legacy metadata.

    All columns are assumed to be normal data. ``M_c``, ``M_r``,
    ``initialization`` and ``row_initialization`` are accepted but not used.

    When ``specified_mu_grid`` (hyperparameter ``'m'``) or
    ``specified_s_grid`` (hyperparameter ``'s'``) is given and non-empty,
    every dim of each new state gets that grid, a hyperparameter value drawn
    from it with ``random.sample``, and its clusters are updated with the
    dim's hypers.

    Returns:
        ``(X_L, X_D)`` for a single chain, otherwise
        ``(X_L_list, X_D_list)`` with one entry per chain.
    """
    table = numpy.array(T)
    _, n_cols = table.shape
    columns = [table[:, c] for c in range(n_cols)]

    cctypes = ['normal'] * n_cols
    distargs = [None] * n_cols

    # The 'm' pass runs over every dim before the 's' pass starts, which
    # keeps the sequence of random draws the same as two separate loops.
    grid_overrides = (('m', specified_mu_grid), ('s', specified_s_grid))

    X_L_list = []
    X_D_list = []
    for chain in range(n_chains):
        state = cc_state.cc_state(columns, cctypes, distargs)

        for hyper_name, grid in grid_overrides:
            if grid is None or len(grid) == 0:
                continue
            for dim in state.dims:
                dim.hypers_grids[hyper_name] = numpy.array(grid)
                dim.hypers[hyper_name] = random.sample(grid, 1)[0]
                for cluster in dim.clusters:
                    cluster.set_hypers(dim.hypers)

        _, X_L, X_D = get_legacy_metadata(state)
        X_L_list.append(X_L)
        X_D_list.append(X_D)

    if n_chains == 1:
        # A single chain is returned unwrapped rather than as 1-item lists.
        return X_L_list[0], X_D_list[0]
    return X_L_list, X_D_list
def initialize(self, M_c, M_r, T, initialization='from_the_prior',
               specified_s_grid=None, specified_mu_grid=None,
               row_initialization=-1, n_chains=1):
    """Initialize ``n_chains`` states on ``T`` and export legacy metadata.

    Assumes all columns are normal data. ``M_c``, ``M_r``,
    ``initialization`` and ``row_initialization`` are not used.

    Args:
        T: 2-D data table (rows by columns).
        specified_s_grid: Optional grid for the ``'s'`` hyperparameter.
        specified_mu_grid: Optional grid for the ``'m'`` hyperparameter.
        n_chains: Number of independent states to create.

    Returns:
        ``(X_L, X_D)`` when ``n_chains == 1``; otherwise two lists holding
        one ``X_L`` and one ``X_D`` per chain.
    """

    def _install_grid(state, hyper_name, grid):
        # Give every dim the supplied grid, pick one grid value at random as
        # the current hyperparameter, and push the hypers to its clusters.
        if grid is None:
            return
        if not len(grid) > 0:
            return
        for dim in state.dims:
            dim.hypers_grids[hyper_name] = numpy.array(grid)
            dim.hypers[hyper_name] = random.sample(grid, 1)[0]
            for cluster in dim.clusters:
                cluster.set_hypers(dim.hypers)

    T = numpy.array(T)
    n_rows, n_cols = T.shape
    X = [T[:, c] for c in range(n_cols)]
    cctypes = ['normal'] * n_cols
    distargs = [None] * n_cols

    X_L_list, X_D_list = [], []
    for chain in range(n_chains):
        state = cc_state.cc_state(X, cctypes, distargs)
        # Mean grid first, then scale grid: the order of the random draws
        # depends on it.
        _install_grid(state, 'm', specified_mu_grid)
        _install_grid(state, 's', specified_s_grid)

        legacy = get_legacy_metadata(state)
        X_L_list.append(legacy[1])
        X_D_list.append(legacy[2])

    if n_chains == 1:
        X_L_list, X_D_list = X_L_list[0], X_D_list[0]

    return X_L_list, X_D_list
def forward_sample(X, n_iters, Zv=None, Zrcv=None, n_grid=30, n_chains=1,
                   ct_kernel=0):
    """Collect resampled data sets from newly constructed states.

    For each chain, ``n_iters`` independent ``cc_state`` objects are created
    from ``X`` (every column typed ``'normal'``); each one has its data
    resampled via ``su.resample_data`` and summarised by ``get_data_stats``.
    Progress is reported on stdout as a percentage of all draws.

    Returns:
        ``(stats, forward_samples)``: the list of per-draw statistics and a
        dict mapping chain index to that chain's list of resampled data.
    """
    type_per_column = ['normal'] * len(X)
    args_per_column = [None] * len(X)
    n_draws = n_chains * n_iters

    forward_samples = dict((chain, []) for chain in range(n_chains))
    stats = []
    for chain in range(n_chains):
        for step in range(n_iters):
            state = cc_state.cc_state(X, type_per_column, args_per_column,
                                      Zv=Zv, Zrcv=Zrcv, n_grid=n_grid,
                                      ct_kernel=ct_kernel)
            Y = su.resample_data(state)
            forward_samples[chain].append(Y)
            stats.append(get_data_stats(Y, state))

            # Number of draws finished so far, across all chains.
            finished = chain * n_iters + step + 1
            sys.stdout.write("\r%1.2f " % (finished * 100.0 / float(n_draws)))
            sys.stdout.flush()

    return stats, forward_samples
def run_test(argsin):
    """Plot original shape data above samples pooled from several chains.

    For every entry of the module-level ``shapes``, data are generated with
    ``gen_function[shape]``, ``num_chains`` states are fitted for
    ``num_iters`` transitions each (with ``ct_kernel=1``), and
    ``num_rows / num_chains`` predictive samples per chain are pooled. The
    original data are drawn in the top row (blue) and the pooled samples
    directly below (red), on the same axis limits.

    Args:
        argsin: Mapping with the keys ``"num_rows"``, ``"num_iters"`` and
            ``"num_chains"``. ``"num_chains"`` must be non-zero because it
            is used as a divisor.

    The grid is fixed at 2 x 4 subplots, so at most four shapes fit.
    """
    # Settings come from the parameter; the body previously read a global
    # named ``args`` and ignored its argument.
    n_rows = argsin["num_rows"]
    n_iters = argsin["num_iters"]
    n_chains = argsin["num_chains"]

    n_per_chain = int(float(n_rows) / n_chains)

    for plt, shape in enumerate(shapes, 1):
        # Single-argument print() calls behave the same on Python 2 and 3.
        print("Shape: %s" % shape)

        T_o = gen_function[shape](n_rows)
        T_i = []
        for chain in range(n_chains):
            print("chain %i of %i" % (chain + 1, n_chains))
            S = cc_state.cc_state(T_o, cctypes, ct_kernel=1,
                                  distargs=distargs)
            S.transition(N=n_iters)
            T_i.extend(
                su.simple_predictive_sample(S, n_rows, [0, 1],
                                            N=n_per_chain))
        T_i = numpy.array(T_i)

        ax = pylab.subplot(2, 4, plt)
        pylab.scatter(T_o[0], T_o[1], color='blue', edgecolor='none')
        # The x-axis label used to be set through ylabel() and was then
        # overwritten by "Y".
        pylab.xlabel("X")
        pylab.ylabel("Y")
        pylab.title("%s original" % shape)

        pylab.subplot(2, 4, plt + 4)
        pylab.scatter(T_i[:, 0], T_i[:, 1], color='red', edgecolor='none')
        pylab.xlabel("X")
        pylab.ylabel("Y")
        # Match the simulated plot's limits to the original's.
        pylab.xlim(ax.get_xlim())
        pylab.ylim(ax.get_ylim())
        pylab.title("%s simulated" % shape)

    print("Done.")
    pylab.show()
def run_test(argsin):
    """Compare generated shape data with samples simulated from fitted states.

    Each shape in the module-level ``shapes`` gets two subplots in a 2 x 4
    grid: the generated data on top (blue) and, below it (red), predictive
    samples of columns 0 and 1 pooled over ``num_chains`` independently
    fitted states, each run for ``num_iters`` transitions with
    ``ct_kernel=1``.

    Args:
        argsin: Mapping providing ``"num_rows"``, ``"num_iters"`` and
            ``"num_chains"`` (non-zero; the per-chain sample count is
            ``num_rows / num_chains`` truncated to an int).
    """
    # Use the caller's settings rather than a global ``args``.
    n_rows = argsin["num_rows"]
    n_iters = argsin["num_iters"]
    n_chains = argsin["num_chains"]
    n_per_chain = int(float(n_rows) / n_chains)

    plt = 0
    for shape in shapes:
        # print() with one pre-formatted string is valid on Python 2 and 3.
        print("Shape: %s" % shape)
        plt += 1

        T_o = gen_function[shape](n_rows)

        pooled = []
        for chain in range(n_chains):
            print("chain %i of %i" % (chain + 1, n_chains))
            S = cc_state.cc_state(T_o, cctypes, ct_kernel=1,
                                  distargs=distargs)
            S.transition(N=n_iters)
            pooled.extend(su.simple_predictive_sample(
                S, n_rows, [0, 1], N=n_per_chain))
        T_i = numpy.array(pooled)

        # Top row: original data.
        ax = pylab.subplot(2, 4, plt)
        pylab.scatter(T_o[0], T_o[1], color="blue", edgecolor="none")
        # xlabel, not a second ylabel: the "X" label was being overwritten.
        pylab.xlabel("X")
        pylab.ylabel("Y")
        pylab.title("%s original" % shape)

        # Bottom row: simulated data on the original's axis limits.
        pylab.subplot(2, 4, plt + 4)
        pylab.scatter(T_i[:, 0], T_i[:, 1], color="red", edgecolor="none")
        pylab.xlabel("X")
        pylab.ylabel("Y")
        pylab.xlim(ax.get_xlim())
        pylab.ylim(ax.get_ylim())
        pylab.title("%s simulated" % shape)

    print("Done.")
    pylab.show()
def posterior_sample(X, n_iters, kernels=_all_kernels, Zv=None, Zrcv=None,
                     n_grid=30, n_chains=1, ct_kernel=0):
    """Alternate state transitions and data resampling, chain by chain.

    Each chain builds one ``cc_state`` from ``X`` (all columns typed
    ``'normal'``), resamples its data once, and then repeats ``n_iters``
    times: run the transition ``kernels``, resample the data, and record the
    data and its ``get_data_stats`` summary. A running percentage is written
    to stdout.

    Returns:
        ``(stats, posterior_samples)``: ``stats`` is a flat list with one
        entry per recorded draw (all chains concatenated);
        ``posterior_samples`` maps each chain index to the list of data sets
        recorded for that chain, in step with ``stats``.
    """
    n_cols = len(X)
    cctypes = ['normal'] * n_cols
    distargs = [None] * n_cols

    stats = []
    posterior_samples = dict()
    i = 0.0
    total_iters = n_chains * n_iters

    for chain in range(n_chains):
        state = cc_state.cc_state(X, cctypes, distargs, Zv=Zv, Zrcv=Zrcv,
                                  n_grid=n_grid, ct_kernel=ct_kernel)

        # The initial resample is still performed, but its result is no
        # longer used as the container: later samples used to be appended
        # onto the first sample object itself, mixing them into its
        # contents. NOTE(review): presumably resample_data also updates
        # ``state`` in place -- confirm.
        su.resample_data(state)
        chain_samples = posterior_samples[chain] = []

        for _ in range(n_iters):
            state.transition(kernel_list=kernels)
            Y = su.resample_data(state)
            stats.append(get_data_stats(Y, state))
            chain_samples.append(Y)

            i += 1.0
            string = "\r%1.2f " % (i * 100.0 / float(total_iters))
            sys.stdout.write(string)
            sys.stdout.flush()

    return stats, posterior_samples
def posterior_sample(X, n_iters, kernels=_all_kernels, Zv=None, Zrcv=None,
                     n_grid=30, n_chains=1, ct_kernel=0):
    """Record resampled data after each transition sweep of every chain.

    For each of ``n_chains`` chains a single ``cc_state`` is created from
    ``X`` (every column typed ``'normal'``) and its data resampled once.
    Then, ``n_iters`` times, the state is advanced with ``kernels``, the
    data are resampled, and both the data and ``get_data_stats`` of them are
    stored. Progress is printed to stdout as a percentage.

    Args:
        X: Sequence of data columns.
        n_iters: Recorded draws per chain.
        kernels: Kernel list forwarded to ``state.transition``.
        Zv, Zrcv, n_grid, ct_kernel: Forwarded to ``cc_state``.
        n_chains: Number of independent chains.

    Returns:
        ``(stats, posterior_samples)``; ``posterior_samples[chain]`` is the
        list of that chain's recorded data sets and ``stats`` holds the
        matching summaries for all chains in order.
    """
    column_count = len(X)
    cctypes = ['normal'] * column_count
    distargs = [None] * column_count
    total_iters = n_chains * n_iters

    stats = []
    posterior_samples = {}
    completed = 0.0
    for chain in range(n_chains):
        state = cc_state.cc_state(X, cctypes, distargs, Zv=Zv, Zrcv=Zrcv,
                                  n_grid=n_grid, ct_kernel=ct_kernel)
        # Keep the initial resample call but start from an empty list:
        # storing the first sample as the container made every later
        # ``append`` write into that sample instead of into a list of
        # samples. NOTE(review): assumes callers want one entry per
        # recorded draw, as ``stats`` has -- confirm.
        su.resample_data(state)
        posterior_samples[chain] = []

        for _ in range(n_iters):
            state.transition(kernel_list=kernels)
            Y = su.resample_data(state)
            stats.append(get_data_stats(Y, state))
            posterior_samples[chain].append(Y)

            completed += 1.0
            sys.stdout.write(
                "\r%1.2f " % (completed * 100.0 / float(total_iters)))
            sys.stdout.flush()

    return stats, posterior_samples
# Estimate the mutual information between the two columns of ring-shaped data
# for each ring parameter in W_list, once per ct_kernel value (0 and 1).
# N, W_list, n_data_sets, n_samples, _gen_ring and the progress counter i are
# defined outside this excerpt.
# NOTE(review): the nesting below was reconstructed from how r, c and i are
# used -- confirm against the original layout.
for kernel in range(2):
    # One row per (data set, sample) pair, one column per entry of W_list.
    MI = numpy.zeros((n_data_sets * n_samples, len(W_list)))
    c = 0  # column of MI currently being filled
    for w in W_list:
        r = 0  # row of MI currently being filled
        for ds in range(n_data_sets):
            # seed control so that data is always the same
            numpy.random.seed(r + ds)
            random.seed(r + ds)
            X = _gen_ring(N, w)
            for _ in range(n_samples):
                S = cc_state.cc_state([X[:, 0], X[:, 1]], ["normal"] * 2,
                                      ct_kernel=kernel, distargs=[None] * 2)
                S.transition(N=200)
                mi = iu.mutual_information(S, 0, 1)
                # linfoot = iu.mutual_information_to_linfoot(MI)
                MI[r, c] = mi
                print("w: %1.2f, MI: %1.6f" % (w, mi))
                # Progress out of all runs; the factor 2 is the two kernels.
                print("%i of %i" % (i + 1, len(W_list) * n_data_sets * n_samples * 2))
                del S
                i += 1
                r += 1
        c += 1
# for kernel in range(2): for kernel in range(2): L = numpy.zeros((n_data_sets*n_samples, len(rho_list))) c = 0 for rho in rho_list: r = 0 for ds in range(n_data_sets): # seed control so that data is always the same numpy.random.seed(r+ds) random.seed(r+ds) sigma = numpy.array([[1,rho],[rho,1]]) X = numpy.random.multivariate_normal(mu,sigma,N) for _ in range(n_samples): S = cc_state.cc_state([X[:,0], X[:,1]], ['normal']*2, Zv=[0,0], ct_kernel=kernel, distargs=distargs) S.transition(N=100) MI = iu.mutual_information(S, 0, 1) linfoot = iu.mutual_information_to_linfoot(MI) # del S L[r,c] = linfoot print("rho: %1.2f, MI: %1.6f, Linfoot: %1.6f" %(rho, MI, linfoot)) print("%i of %i" % (i+1, len(rho_list)*n_data_sets*n_samples*2)) del S
# Round-trip check: generate a five-column normal table, fit a state, export
# its legacy metadata, and rebuild a second state from that metadata.
# n_rows and view_weights are defined outside this excerpt.

# Two weight vectors: three equally weighted clusters and two equally
# weighted clusters.
cluster_weights = [numpy.ones(3)/3.0, numpy.ones(2)/2.0]
cctypes = ['normal']*5
distargs = [None]*5
separation = [.7, .9]
T, Zv, Zc, dims = tu.gen_data_table(
    n_rows, view_weights, cluster_weights, cctypes, distargs,
    separation, return_dims=True)

state = cc_state.cc_state(T, cctypes, distargs)
state.transition(N=10)
M_c, X_L, X_D = lu.get_legacy_metadata(state)

# T holds one array per column. Stack them as rows in a single call (rather
# than re-copying a growing matrix once per column) and transpose, so that
# Tcc has one row per observation.
Tcc = numpy.transpose(numpy.vstack(T))

# make sure the data came out right
for i in range(len(T)):
    assert numpy.all(T[i] == Tcc[:, i])

state_b = lu.construct_state_from_legacy_metadata(Tcc, M_c, X_L, X_D)
# Estimate the mutual information between the two columns of ring-shaped data
# for each ring parameter in W_list, once per ct_kernel value (0 and 1).
# N, W_list, n_data_sets, n_samples, _gen_ring and the progress counter i are
# defined outside this excerpt.
# NOTE(review): the nesting below was reconstructed from how r, c and i are
# used, and the excerpt ends before r or c is advanced -- confirm against the
# original layout.
for kernel in range(2):
    # One row per (data set, sample) pair, one column per entry of W_list.
    MI = numpy.zeros((n_data_sets * n_samples, len(W_list)))
    c = 0  # column of MI currently being filled
    for w in W_list:
        r = 0  # row of MI currently being filled
        for ds in range(n_data_sets):
            # seed control so that data is always the same
            numpy.random.seed(r + ds)
            random.seed(r + ds)
            X = _gen_ring(N, w)
            for _ in range(n_samples):
                S = cc_state.cc_state([X[:, 0], X[:, 1]], ['normal'] * 2,
                                      ct_kernel=kernel, distargs=[None] * 2)
                S.transition(N=200)
                mi = iu.mutual_information(S, 0, 1)
                # linfoot = iu.mutual_information_to_linfoot(MI)
                MI[r, c] = mi
                print("w: %1.2f, MI: %1.6f" % (w, mi))
                # Progress out of all runs; the factor 2 is the two kernels.
                print("%i of %i" % (i + 1, len(W_list) * n_data_sets * n_samples * 2))
                del S
                i += 1
def run_test(argsin):
    """Draw original shape data and per-chain simulated data in one figure.

    Row one of the figure holds the data generated for each entry of the
    module-level ``shapes`` (blue). Every following row corresponds to one
    chain: for each shape a ``cc_state`` is created on that shape's data,
    transitioned ``num_iters`` times, and ``num_rows`` predictive samples of
    columns 0 and 1 are plotted in red using the original plot's axis limits.

    Args:
        argsin: Mapping with the keys ``"num_rows"``, ``"num_iters"``,
            ``"num_chains"`` and ``"ct_kernel"``.

    Uses the module-level ``shapes``, ``gen_function``, ``cctypes`` and
    ``distargs``. The grid has four columns, so at most four shapes fit.
    """
    # Take the settings from the argument; the body previously read a global
    # called ``args``, leaving the parameter unused.
    n_rows = argsin["num_rows"]
    n_iters = argsin["num_iters"]
    n_chains = argsin["num_chains"]
    ct_kernel = argsin["ct_kernel"]

    n_plot_rows = n_chains + 1

    pylab.figure(num=None, facecolor='w', edgecolor='k', frameon=False,
                 tight_layout=True)

    data = {'x': [], 'sin': [], 'ring': [], 'dots': []}
    xlims = dict()
    ylims = dict()

    plt = 0
    for shape in shapes:
        plt += 1
        data[shape] = gen_function[shape](n_rows)

        ax = pylab.subplot(n_plot_rows, 4, plt)
        pylab.scatter(data[shape][0], data[shape][1], s=10, color='blue',
                      edgecolor='none', alpha=.2)
        ax.set_xticks([])
        ax.set_yticks([])

        pylab.suptitle("Kernel %i" % ct_kernel)

        # Saved so each simulated panel can reuse its original's limits.
        xlims[shape] = ax.get_xlim()
        ylims[shape] = ax.get_ylim()

    for chain in range(n_chains):
        print("chain %i of %i." % (chain + 1, n_chains))
        plt = 0
        for shape in shapes:
            print("\tWorking on %s." % shape)
            plt += 1

            S = cc_state.cc_state(data[shape], cctypes, ct_kernel=ct_kernel,
                                  distargs=distargs)
            S.transition(N=n_iters)
            T_chain = numpy.array(
                su.simple_predictive_sample(S, n_rows, [0, 1], N=n_rows))

            # The first four panels hold the originals; chain rows follow.
            ax = pylab.subplot(n_plot_rows, 4, chain * 4 + 4 + plt)
            ax.set_xticks([])
            ax.set_yticks([])
            pylab.scatter(T_chain[:, 0], T_chain[:, 1], s=10, color='red',
                          edgecolor='none', alpha=.2)
            pylab.xlim(xlims[shape])
            pylab.ylim(ylims[shape])

    print("Done.")
    pylab.show()
# Generate one data table, then, for every ct_kernel value, track the adjusted
# Rand index (ARI) between the generating partitions and the state's
# partitions after each single transition.
# n_rows, n_cols, cctypes, distargs, n_kernels, n_data_sets, n_transitions,
# itr and total_itr are defined outside this excerpt.
# NOTE(review): the weight arguments look like two equally weighted views with
# 2 and 5 equally weighted clusters -- confirm against tu.gen_data_table.
# NOTE(review): the nesting below was reconstructed from how r, c and itr are
# used -- confirm against the original layout.
Ts, Zv, Zc = tu.gen_data_table(n_rows, numpy.array([.5,.5]), [numpy.array([1./2]*2), numpy.array([1./5]*5)], cctypes, distargs, [1.0]*n_cols)

for kernel in range(n_kernels):
    # for a set number of chains
    # Rows index the independent runs, columns the transition number.
    ARI_view = numpy.zeros((n_data_sets, n_transitions))
    ARI_cols = numpy.zeros((n_data_sets, n_transitions))
    for r in range(n_data_sets):
        S = cc_state.cc_state(Ts, cctypes, ct_kernel=kernel, distargs=distargs)
        for c in range(n_transitions):
            S.transition(N=1)
            # calculate ARI
            ari_view = adjusted_rand_score(Zv, S.Zv.tolist())
            ari_cols = tu.column_average_ari(Zv, Zc, S)
            ARI_view[r,c] = ari_view
            ARI_cols[r,c] = ari_cols
            itr += 1
            print("itr %i of %i." % (itr, total_itr))
    ###
    # First-row subplot for this kernel; the plotting itself lies beyond this
    # excerpt.
    pylab.subplot(2,n_kernels,kernel+1)
# four cols of for _ in range(4): x_clustered = [] for i in range(n_rows): x_clustered.append( numpy.random.randn()+Z[i]*4 ) X.append( numpy.array( x_clustered) ) # S = cc_state.cc_state(X, cctypes, distargs, ct_kernel=0, seed=random.randrange(200000)) # S.transition(N=200, do_plot=True) states = [] for s in range(n_states): states.append( cc_state.cc_state(X, cctypes, distargs, ct_kernel=1, seed=random.randrange(200000)) ) num_iters = 200 i = 0 for state in states: i += 1 state.transition(N=num_iters) print("state %i of %i" % (i, n_states) ) Zvs = [] for state in states: Zvs.append(state.Zv.tolist())
# Append four columns to X; each value is a standard-normal draw shifted by
# 4 * Z[i]. X, Z, n_rows, n_states, cctypes and distargs are defined outside
# this excerpt.
for _ in range(4):
    x_clustered = []
    for i in range(n_rows):
        x_clustered.append(numpy.random.randn() + Z[i] * 4)
    X.append(numpy.array(x_clustered))

# S = cc_state.cc_state(X, cctypes, distargs, ct_kernel=0, seed=random.randrange(200000))
# S.transition(N=200, do_plot=True)

# Build n_states states on the same data, each with its own random seed.
states = []
for s in range(n_states):
    states.append(
        cc_state.cc_state(X, cctypes, distargs, ct_kernel=1,
                          seed=random.randrange(200000)))

# Run every state for num_iters transitions, reporting progress per state.
num_iters = 200
i = 0
for state in states:
    i += 1
    state.transition(N=num_iters)
    print("state %i of %i" % (i, n_states))

# Collect each state's Zv attribute as a plain list.
Zvs = []
for state in states:
    Zvs.append(state.Zv.tolist())