def quick_le(seed, n_chains=1):
    # Specify synthetic dataset structure.
    cctypes = ['continuous', 'continuous', 'multinomial', 'multinomial',
               'continuous']
    distargs = [None, None, dict(K=9), dict(K=7), None]
    cols_to_views = [0, 0, 0, 1, 1]
    separation = [0.6, 0.9]
    cluster_weights = [[.2, .3, .5], [.9, .1]]

    # Obtain the generated dataset and metadata.
    T, M_c, M_r = sdg.gen_data(cctypes, N_ROWS, cols_to_views,
                               cluster_weights, separation, seed=seed,
                               distargs=distargs, return_structure=True)

    # Create, initialize, and analyze the engine.
    engine = LocalEngine()
    X_L, X_D = engine.initialize(M_c, M_r, T, seed, n_chains=n_chains)

    return T, M_r, M_c, X_L, X_D, engine
def test_proper_set_up_all_continuous(self):
    T, M_c = sdg.gen_data(self.cctypes_all_contiuous, self.n_rows,
                          self.cols_to_views_good, self.cluster_weights_good,
                          self.separation_good, seed=0, distargs=None)
    assert len(T) == self.n_rows
    assert len(T[0]) == len(self.cols_to_views_good)
def test_proper_set_up_mixed(self):
    distargs = [None, None, dict(K=5), None, dict(K=5)]
    T, M_c = sdg.gen_data(self.cctypes_mixed, self.n_rows,
                          self.cols_to_views_good, self.cluster_weights_good,
                          self.separation_good, seed=0, distargs=distargs)
    assert len(T) == self.n_rows
    assert len(T[0]) == len(self.cols_to_views_good)
def test_different_seeds_should_produce_different_data(self):
    distargs = [None] * 5
    T1, M_c = sdg.gen_data(self.cctypes_all_contiuous, self.n_rows,
                           self.cols_to_views_good, self.cluster_weights_good,
                           self.separation_good, seed=0, distargs=distargs)
    T2, M_c = sdg.gen_data(self.cctypes_all_contiuous, self.n_rows,
                           self.cols_to_views_good, self.cluster_weights_good,
                           self.separation_good, seed=12345, distargs=distargs)
    A1 = numpy.array(T1)
    A2 = numpy.array(T2)
    assert not numpy.all(A1 == A2)
def quick_le(seed, n_chains=1):
    # Specify synthetic dataset structure.
    cctypes = ['continuous', 'continuous', 'multinomial', 'multinomial',
               'continuous']
    distargs = [None, None, dict(K=9), dict(K=7), None]
    cols_to_views = [0, 0, 0, 1, 1]
    separation = [0.6, 0.9]
    cluster_weights = [[.2, .3, .5], [.9, .1]]

    # Obtain the generated dataset and metadata.
    T, M_c, M_r = sdg.gen_data(cctypes, N_ROWS, cols_to_views,
                               cluster_weights, separation, seed=seed,
                               distargs=distargs, return_structure=True)

    # Create, initialize, and analyze the engine.
    engine = LocalEngine(seed=seed)
    X_L, X_D = engine.initialize(M_c, M_r, T, n_chains=n_chains)

    return T, M_r, M_c, X_L, X_D, engine
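# A minimal usage sketch for quick_le (an illustration added here, not part
# of the original tests). It assumes N_ROWS is defined at module level, as in
# the surrounding code, and that initialize returns one latent state pair per
# chain when n_chains > 1.
def _example_quick_le_usage():
    T, M_r, M_c, X_L, X_D, engine = quick_le(seed=0, n_chains=4)
    assert len(T) == N_ROWS
    # One (X_L, X_D) latent state pair per chain.
    assert len(X_L) == 4 and len(X_D) == 4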
def get_synthetic_data(n_sample, seed=438):
    cols_to_views = [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 4]
    colnames = ['m1', 'c1', 'm2', 'c2', 'm3', 'm4', 'c3', 'c4', 'm5', 'c5',
                'c6', 'c7', 'c8', 'c9']
    cctypes = ['multinomial', 'continuous', 'multinomial', 'continuous',
               'multinomial', 'multinomial', 'continuous', 'continuous',
               'multinomial', 'continuous', 'continuous', 'continuous',
               'continuous', 'continuous']
    distargs = [dict(K=9), None, dict(K=9), None, dict(K=7), dict(K=4),
                None, None, dict(K=9), None, None, None, None, None]
    component_weights = [[.2, .3, .5], [.9, .1], [.4, .4, .2], [.8, .2],
                         [.4, .5, .1]]
    separation = [0.8, 0.9, 0.65, 0.7, 0.75]

    synthetic_data = sdg.gen_data(cctypes, n_sample, cols_to_views,
                                  component_weights, separation, seed,
                                  distargs=distargs)

    data = pd.DataFrame(synthetic_data[0])
    data.columns = colnames

    return df_to_table(data)
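# A self-contained consistency sketch (illustrative, not from the original
# module): every view index in cols_to_views needs a matching component-weight
# list and separation value, and each view's weights must sum to 1. The names
# below mirror get_synthetic_data's locals.
def _example_check_view_metadata():
    cols_to_views = [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 4]
    component_weights = [[.2, .3, .5], [.9, .1], [.4, .4, .2], [.8, .2],
                         [.4, .5, .1]]
    separation = [0.8, 0.9, 0.65, 0.7, 0.75]
    n_views = max(cols_to_views) + 1
    assert len(component_weights) == n_views
    assert len(separation) == n_views
    # Each view's component weights form a distribution.
    assert all(abs(sum(w) - 1.0) < 1e-9 for w in component_weights)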
def check_predictive_sample_improvement(component_model_type, seed=0,
                                        show_plot=True):
    """ Shows the error of predictive sample over iterations.
    """
    num_transitions = 100
    num_samples = 10
    num_clusters = 2
    separation = .9  # cluster separation
    N = 150
    random.seed(seed)
    get_next_seed = lambda: random.randrange(2147483647)

    # generate a single column of data from the component_model
    cctype = component_model_type.cctype
    T, M_c, struc = sdg.gen_data([cctype], N, [0], [[.5, .5]], [separation],
                                 seed=get_next_seed(),
                                 distargs=[distargs[cctype]],
                                 return_structure=True)
    T_array = numpy.array(T)

    X = numpy.zeros((N, num_transitions))
    KL = numpy.zeros((num_samples, num_transitions))

    support = qtu.get_mixture_support(cctype, component_model_type,
                                      struc['component_params'][0],
                                      nbins=1000, support=.995)
    true_log_pdf = qtu.get_mixture_pdf(support, component_model_type,
                                       struc['component_params'][0], [.5, .5])

    for s in range(num_samples):
        # generate the state
        state = State.p_State(M_c, T, SEED=get_next_seed())

        for i in range(num_transitions):
            # transition
            state.transition()

            # get partitions and generate a predictive column
            X_L = state.get_X_L()
            X_D = state.get_X_D()

            T_inf = sdg.predictive_columns(M_c, X_L, X_D, [0],
                                           seed=get_next_seed())

            if cctype == 'multinomial':
                K = distargs[cctype]['K']
                weights = numpy.zeros(K)
                for params in struc['component_params'][0]:
                    weights += numpy.array(params['weights']) * (1.0 / num_clusters)
                weights *= float(N)
                inf_hist = qtu.bincount(T_inf, bins=list(range(K)))
                err, _ = stats.power_divergence(inf_hist, weights,
                                                lambda_='pearson')
                err = numpy.ones(N) * err
            else:
                err = (T_array - T_inf) ** 2.0

            KL[s, i] = qtu.KL_divergence(component_model_type,
                                         struc['component_params'][0],
                                         [.5, .5], M_c, X_L, X_D,
                                         true_log_pdf=true_log_pdf,
                                         support=support)

            for j in range(N):
                X[j, i] += err[j]

    X /= num_samples

    # mean and standard error
    X_mean = numpy.mean(X, axis=0)
    X_err = numpy.std(X, axis=0) / float(num_samples) ** .5

    KL_mean = numpy.mean(KL, axis=0)
    KL_err = numpy.std(KL, axis=0) / float(num_samples) ** .5

    if show_plot:
        pylab.subplot(1, 2, 1)
        pylab.errorbar(list(range(num_transitions)), X_mean, yerr=X_err)
        pylab.xlabel('iteration')
        pylab.ylabel('error across each data point')
        pylab.title('error of predictive sample over iterations, N=%i' % N)

        pylab.subplot(1, 2, 2)
        pylab.errorbar(list(range(num_transitions)), KL_mean, yerr=KL_err)
        pylab.xlabel('iteration')
        pylab.ylabel('KL divergence')
        pylab.title('KL divergence, N=%i' % N)

        pylab.show()

    # error should decrease over time
    return X_mean[0] > X_mean[-1] and KL_mean[0] > KL_mean[-1]
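# Hedged usage sketch: run the check headlessly with the continuous component
# model used by the KL-divergence test below. This assumes ccmext and the
# module-level distargs dict are in scope here; the check returns True when
# both the per-datum error and the KL divergence fall from the first
# transition to the last.
def _example_check_predictive_sample_improvement():
    improved = check_predictive_sample_improvement(
        ccmext.p_ContinuousComponentModel, seed=0, show_plot=False)
    assert improved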
def test_kl_divergence_as_a_function_of_N_and_transitions():
    n_clusters = 3
    n_chains = 8
    do_times = 4

    # N_list = [25, 50, 100, 250, 500, 1000, 2000]
    N_list = [25, 50, 100, 175, 250, 400, 500]

    max_transitions = 500
    transition_interval = 50
    t_iterations = max_transitions // transition_interval

    cctype = "continuous"
    cluster_weights = [1.0 / float(n_clusters)] * n_clusters
    separation = 0.5

    get_next_seed = lambda: random.randrange(2147483647)

    # data grid
    KLD = numpy.zeros((len(N_list), t_iterations + 1))

    for _ in range(do_times):
        for n in range(len(N_list)):
            N = N_list[n]
            T, M_c, struc = sdg.gen_data(
                [cctype], N, [0], [cluster_weights], [separation],
                seed=get_next_seed(), distargs=[None], return_structure=True)
            M_r = du.gen_M_r_from_T(T)

            # precompute the support and pdf to speed up calculation of KL
            # divergence
            support = qtu.get_mixture_support(
                cctype, ccmext.p_ContinuousComponentModel,
                struc["component_params"][0], nbins=1000, support=0.995)
            true_log_pdf = qtu.get_mixture_pdf(
                support, ccmext.p_ContinuousComponentModel,
                struc["component_params"][0], cluster_weights)

            # initialize a multiprocessing engine
            mstate = mpe.MultiprocessingEngine(cpu_count=8)
            X_L_list, X_D_list = mstate.initialize(M_c, M_r, T,
                                                   n_chains=n_chains)

            # accumulate the KL divergence of each chain's initial state
            for i in range(len(X_L_list)):
                X_L = X_L_list[i]
                X_D = X_D_list[i]
                KLD[n, 0] += qtu.KL_divergence(
                    ccmext.p_ContinuousComponentModel,
                    struc["component_params"][0], cluster_weights, M_c,
                    X_L, X_D, n_samples=1000, support=support,
                    true_log_pdf=true_log_pdf)

            # run transition_interval transitions, then take a reading.
            # Rinse and repeat.
            for t in range(t_iterations):
                X_L_list, X_D_list = mstate.analyze(
                    M_c, T, X_L_list, X_D_list, n_steps=transition_interval)
                for i in range(len(X_L_list)):
                    X_L = X_L_list[i]
                    X_D = X_D_list[i]
                    KLD[n, t + 1] += qtu.KL_divergence(
                        ccmext.p_ContinuousComponentModel,
                        struc["component_params"][0], cluster_weights, M_c,
                        X_L, X_D, n_samples=1000, support=support,
                        true_log_pdf=true_log_pdf)

    KLD /= float(n_chains * do_times)

    pylab.subplot(1, 3, 1)
    pylab.contourf(list(range(0, max_transitions + 1, transition_interval)),
                   N_list, KLD)
    pylab.title("KL divergence")
    pylab.ylabel("N")
    pylab.xlabel("# transitions")

    pylab.subplot(1, 3, 2)
    m_N = numpy.mean(KLD, axis=1)
    e_N = numpy.std(KLD, axis=1) / float(KLD.shape[1]) ** 0.5
    pylab.errorbar(N_list, m_N, yerr=e_N)
    pylab.title("KL divergence by N")
    pylab.xlabel("N")
    pylab.ylabel("KL divergence")

    pylab.subplot(1, 3, 3)
    m_t = numpy.mean(KLD, axis=0)
    e_t = numpy.std(KLD, axis=0) / float(KLD.shape[0]) ** 0.5
    pylab.errorbar(list(range(0, max_transitions + 1, transition_interval)),
                   m_t, yerr=e_t)
    pylab.title("KL divergence by transitions")
    pylab.xlabel("transition")
    pylab.ylabel("KL divergence")

    pylab.show()

    return KLD
def check_one_feature_mixture(component_model_type, num_clusters=3,
                              show_plot=False, seed=None):
    """ Tests predictive samples of a single-feature mixture against the
    data-generating distribution and returns the p-value of a
    goodness-of-fit test.
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                                     [separation], seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    state = State.p_State(M_c, T_list)

    # get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype, component_model_type, structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten()

    # no conditioning observations
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    # get histogram. Different behavior for discrete and continuous types:
    # the discrete histograms come back as raw counts, so normalize them
    # by hand.
    if is_discrete[component_model_type.model_type]:
        bins = list(range(len(discrete_support)))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(50, len(discrete_support)),
                                        density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # goodness-of-fit tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness-of-fit test.
        # This function gives a lot of flexibility in the method <lambda_>
        # used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
                                         lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])

        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
                      edgecolor="none", s=100, label="true pdf", alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
                      edgecolor="none", s=100,
                      label="predictive probability", alpha=1, zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = ("%i samples drawn from %i %s components: \n"
                        "inference after 200 crosscat transitions\n"
                        "%s test: p = %f"
                        % (N, num_clusters, component_model_type.cctype,
                           test_str, round(p, 4)))
        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixture.png"
        pylab.savefig(filename)
        pylab.close()

    return p
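# A self-contained sketch of the chi-square goodness-of-fit call used above
# (the counts are illustrative values, not from the tests): observed and
# expected frequencies should sum to approximately the same total, or scipy
# raises an error.
def _example_power_divergence():
    import numpy
    from scipy import stats
    freq_obs = numpy.array([30., 50., 20.])
    freq_exp = numpy.array([33., 47., 20.])
    stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
    # A large p-value means the observed counts are consistent with the
    # expected counts; the check above returns p as its result.
    return stat, p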