def test_behavior_X_array(self):
    X = numpy.array([0, 1, 2, 3])
    counts = qtu.bincount(X)
    assert counts == [1, 1, 1, 1]

    X = numpy.array([1, 2, 2, 4, 6])
    counts = qtu.bincount(X)
    assert counts == [1, 2, 0, 1, 0, 1]

    bins = list(range(7))
    counts = qtu.bincount(X, bins)
    assert counts == [0, 1, 2, 0, 1, 0, 1]

    bins = [1, 2, 4, 6]
    counts = qtu.bincount(X, bins)
    assert counts == [1, 2, 1, 1]
def test_behavior_X_list(self):
    X = [0, 1, 2, 3]
    counts = qtu.bincount(X)
    assert counts == [1, 1, 1, 1]

    X = [1, 2, 2, 4, 6]
    counts = qtu.bincount(X)
    assert counts == [1, 2, 0, 1, 0, 1]

    bins = list(range(7))
    counts = qtu.bincount(X, bins)
    assert counts == [0, 1, 2, 0, 1, 0, 1]

    bins = [1, 2, 4, 6]
    counts = qtu.bincount(X, bins)
    assert counts == [1, 2, 1, 1]
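# The two tests above pin down the contract of qtu.bincount. The following is a
# minimal reference sketch of that contract (a hypothetical helper, not used by
# the tests), assuming bincount counts occurrences of each integer value: with
# no bins argument it covers range(min(X), max(X) + 1), and with an explicit
# bins list it returns one count per listed value.
def _reference_bincount(X, bins=None):
    # accept lists or numpy arrays, as in the tests above
    values = [int(x) for x in numpy.asarray(X).ravel()]
    if bins is None:
        bins = list(range(min(values), max(values) + 1))
    return [values.count(b) for b in bins]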
def check_predictive_sample_improvement(component_model_type, seed=0, show_plot=True):
    """ Shows the error of predictive sample over iterations.
    """
    num_transitions = 100
    num_samples = 10
    num_clusters = 2
    separation = .9  # cluster separation
    N = 150
    random.seed(seed)
    get_next_seed = lambda: random.randrange(2147483647)

    # generate a single column of data from the component_model
    cctype = component_model_type.cctype
    T, M_c, struc = sdg.gen_data([cctype], N, [0], [[.5, .5]], [separation],
                                 seed=get_next_seed(),
                                 distargs=[distargs[cctype]],
                                 return_structure=True)
    T_array = numpy.array(T)

    X = numpy.zeros((N, num_transitions))
    KL = numpy.zeros((num_samples, num_transitions))

    support = qtu.get_mixture_support(cctype, component_model_type,
                                      struc['component_params'][0],
                                      nbins=1000, support=.995)
    true_log_pdf = qtu.get_mixture_pdf(support, component_model_type,
                                       struc['component_params'][0], [.5, .5])

    for s in range(num_samples):
        # generate the state
        state = State.p_State(M_c, T, SEED=get_next_seed())

        for i in range(num_transitions):
            # transition
            state.transition()

            # get partitions and generate a predictive column
            X_L = state.get_X_L()
            X_D = state.get_X_D()

            T_inf = sdg.predictive_columns(M_c, X_L, X_D, [0],
                                           seed=get_next_seed())

            if cctype == 'multinomial':
                K = distargs[cctype]['K']
                weights = numpy.zeros(numpy.array(K))
                for params in struc['component_params'][0]:
                    weights += numpy.array(params['weights']) * (1.0 / num_clusters)
                weights *= float(N)
                inf_hist = qtu.bincount(T_inf, bins=list(range(K)))
                err, _ = stats.power_divergence(inf_hist, weights,
                                                lambda_='pearson')
                err = numpy.ones(N) * err
            else:
                err = (T_array - T_inf) ** 2.0

            KL[s, i] = qtu.KL_divergence(component_model_type,
                                         struc['component_params'][0],
                                         [.5, .5], M_c, X_L, X_D,
                                         true_log_pdf=true_log_pdf,
                                         support=support)

            for j in range(N):
                X[j, i] += err[j]

    X /= num_samples

    # mean and standard error
    X_mean = numpy.mean(X, axis=0)
    X_err = numpy.std(X, axis=0) / float(num_samples) ** .5

    KL_mean = numpy.mean(KL, axis=0)
    KL_err = numpy.std(KL, axis=0) / float(num_samples) ** .5

    if show_plot:
        pylab.subplot(1, 2, 1)
        pylab.errorbar(list(range(num_transitions)), X_mean, yerr=X_err)
        pylab.xlabel('iteration')
        pylab.ylabel('error across each data point')
        pylab.title('error of predictive sample over iterations, N=%i' % N)

        pylab.subplot(1, 2, 2)
        pylab.errorbar(list(range(num_transitions)), KL_mean, yerr=KL_err)
        pylab.xlabel('iteration')
        pylab.ylabel('KL divergence')
        pylab.title('KL divergence, N=%i' % N)

        pylab.show()

    # error should decrease over time
    return X_mean[0] > X_mean[-1] and KL_mean[0] > KL_mean[-1]
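# A hedged usage sketch for the check above: it returns True only if both the
# mean squared error and the KL divergence decrease from the first to the last
# transition, so a test can assert directly on its return value. The alias
# ccmext (a continuous component model extension module) is an assumption about
# imports made elsewhere in this file, not something defined in this section.
def test_predictive_sample_improvement_continuous():
    assert check_predictive_sample_improvement(
        ccmext.p_ContinuousComponentModel, seed=0, show_plot=False)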
def check_one_feature_mixture(component_model_type, num_clusters=3,
                              show_plot=False, seed=None):
    """ Checks that CrossCat recovers data generated from a single-feature
    mixture: draws predictive samples after 200 transitions, compares them to
    the original data with a goodness-of-fit test, and returns the p-value.
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                                     [separation], seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    state = State.p_State(M_c, T_list)

    # get support over all component models
    discrete_support = qtu.get_mixture_support(cctype, component_model_type,
                                               structure['component_params'][0],
                                               nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest does not compute the same answer for row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(M_c, X_L, X_D, [0],
                                                seed=get_next_seed()).flatten(1)

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    # get histogram. Different behavior for discrete and continuous types.
    # For some reason the normed property isn't normalizing the multinomial
    # histogram to 1, so normalize by hand.
    if is_discrete[component_model_type.model_type]:
        bins = list(range(len(discrete_support)))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(50, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # goodness-of-fit tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness-of-fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()

        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])

        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
                  label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
                  label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
                      edgecolor="none", s=100, label="true pdf", alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
                      edgecolor="none", s=100, label="predictive probability",
                      alpha=1, zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \n" \
            "inference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str,
               round(p, 4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p
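# A hedged usage sketch for check_one_feature_mixture: it returns the
# goodness-of-fit p-value, so a wrapper test can require that the predictive
# samples are not obviously inconsistent with the original data. The 0.05
# threshold and the ccmext alias are assumptions, not part of the original file.
def test_one_feature_mixture_continuous():
    p = check_one_feature_mixture(ccmext.p_ContinuousComponentModel,
                                  num_clusters=3, show_plot=False, seed=0)
    assert p > 0.05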