def test_should_have_nan_entries_if_specified(self):
    """Columns requested with missing_data=1.0 must come back entirely NaN;
    columns without a missing-data setting must contain no NaNs at all."""

    def draw(cols, settings):
        # Thin wrapper so each scenario below is a one-liner.
        return sdg.predictive_columns(self.M_c, self.X_L, self.X_D, cols,
                                      optional_settings=settings)

    all_missing = dict(missing_data=1.0)

    # One column: every entry will be missing (NaN).
    X = draw([0], [all_missing])
    assert numpy.all(numpy.isnan(X))

    # Two columns, both fully missing.
    X = draw([0, 1], [all_missing, all_missing])
    assert numpy.all(numpy.isnan(X))

    # Only the first of two columns missing (None means no missing data).
    X = draw([0, 1], [all_missing, None])
    assert numpy.all(numpy.isnan(X[:, 0]))
    assert not numpy.any(numpy.isnan(X[:, 1]))

    # First column missing; second explicitly given zero missing data.
    X = draw([0, 1], [all_missing, dict(missing_data=0.0)])
    assert numpy.all(numpy.isnan(X[:, 0]))
    assert not numpy.any(numpy.isnan(X[:, 1]))
def test_should_return_array_of_proper_size(self):
    """The result must be a numpy.ndarray shaped (num_rows, len(columns_list))."""
    # Exercise both a single-column and a two-column request.
    for cols in ([0], [0, 1]):
        X = sdg.predictive_columns(self.M_c, self.X_L, self.X_D, cols)
        assert isinstance(X, numpy.ndarray)
        assert X.shape[0] == self.num_rows
        assert X.shape[1] == len(cols)
def check_predictive_sample_improvement(component_model_type, seed=0, show_plot=True):
    """ Shows the error of predictive sample over iterations.

    Runs `num_samples` independent crosscat chains on a single synthetic
    column and, after every transition, measures (a) the per-datum error of a
    predictive column drawn from the current state and (b) the KL divergence
    of the inferred mixture from the true one.

    Inputs:
        - component_model_type: main class from datatype, e.g.
          ccmext.p_ContinuousComponentModel
        - seed: (optional) int to seed the RNG
        - show_plot: (optional) if True, plot error and KL divergence curves

    Returns:
        - True if both the mean error and the mean KL divergence decreased
          from the first to the last transition, else False
    """
    num_transitions = 100
    num_samples = 10
    num_clusters = 2
    separation = .9  # cluster separation
    N = 150
    random.seed(seed)
    get_next_seed = lambda : random.randrange(2147483647)

    # generate a single column of data from the component_model
    cctype = component_model_type.cctype
    T, M_c, struc = sdg.gen_data([cctype], N, [0], [[.5,.5]], [separation],
        seed=get_next_seed(), distargs=[distargs[cctype]],
        return_structure=True)

    T_array = numpy.array(T)

    # X[j, i]: accumulated error of datum j at transition i (across chains);
    # KL[s, i]: KL divergence of chain s at transition i.
    X = numpy.zeros((N,num_transitions))
    KL = numpy.zeros((num_samples, num_transitions))

    support = qtu.get_mixture_support(cctype, component_model_type,
        struc['component_params'][0], nbins=1000, support=.995)
    true_log_pdf = qtu.get_mixture_pdf(support, component_model_type,
        struc['component_params'][0],[.5,.5])

    for s in range(num_samples):
        # generate the state
        state = State.p_State(M_c, T, SEED=get_next_seed())

        for i in range(num_transitions):
            # transition
            state.transition()

            # get partitions and generate a predictive column
            X_L = state.get_X_L()
            X_D = state.get_X_D()
            T_inf = sdg.predictive_columns(M_c, X_L, X_D, [0],
                seed=get_next_seed())

            if cctype == 'multinomial':
                # Discrete data: compare inferred category counts against the
                # expected (true mixture) counts with a chi-square statistic,
                # broadcast to every row so the error shape matches the
                # continuous branch.
                K = distargs[cctype]['K']
                weights = numpy.zeros(numpy.array(K))
                for params in struc['component_params'][0]:
                    weights += numpy.array(params['weights'])*(1.0/num_clusters)
                weights *= float(N)
                inf_hist = qtu.bincount(T_inf, bins=list(range(K)))
                err, _ = stats.power_divergence(inf_hist, weights,
                    lambda_='pearson')
                err = numpy.ones(N)*err
            else:
                # Continuous data: per-datum squared error.
                err = (T_array-T_inf)**2.0

            KL[s,i] = qtu.KL_divergence(component_model_type,
                struc['component_params'][0], [.5,.5], M_c, X_L, X_D,
                true_log_pdf=true_log_pdf, support=support)

            for j in range(N):
                X[j,i] += err[j]

    # Average the accumulated error over the independent chains.
    X /= num_samples

    # mean and standard error
    X_mean = numpy.mean(X,axis=0)
    X_err = numpy.std(X,axis=0)/float(num_samples)**.5

    KL_mean = numpy.mean(KL, axis=0)
    KL_err = numpy.std(KL, axis=0)/float(num_samples)**.5

    if show_plot:
        pylab.subplot(1,2,1)
        pylab.errorbar(list(range(num_transitions)), X_mean, yerr=X_err)
        pylab.xlabel('iteration')
        pylab.ylabel('error across each data point')
        pylab.title('error of predictive sample over iterations, N=%i' % N)

        pylab.subplot(1,2,2)
        pylab.errorbar(list(range(num_transitions)), KL_mean, yerr=KL_err)
        pylab.xlabel('iteration')
        pylab.ylabel('KL divergence')
        pylab.title('KL divergence, N=%i' % N)

        pylab.show()

    # error should decrease over time
    return X_mean[0] > X_mean[-1] and KL_mean[0] > KL_mean[-1]
def check_impute_vs_column_average_single(component_model_type, num_clusters, seed=0):
    """ tests predictive row generation vs column average
    Note: This test does not make sense for categorical data

    Inputs:
        - component_model_type: main class from datatype. Ex:
            ccmext.p_ContinuousComponentModel
        - num_clusters: the number of clusters in the data
        - seed: (optional) int to seed the RNG

    Returns:
        - the mean square error of the predictive sample column
        - the mean square error of the column average column
    """
    random.seed(seed)
    N = 100
    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted)
    # BUG FIX: on Python 3, range() returns an immutable range object, so the
    # subsequent Z.append(...) and random.shuffle(Z) would raise; materialize
    # it as a list first.
    Z = list(range(num_clusters))
    for z in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))
    random.shuffle(Z)

    # generate the data, one row per entry of the partition
    T = numpy.array([[0]] * N, dtype=float)
    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a column from the sample
    T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0],
        seed=get_next_seed())

    # generate a column of column averages
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error
    err_sample = numpy.mean((T_generated - T)**2.0)
    err_colave = numpy.mean((T_colave - T)**2.0)

    return err_sample, err_colave
def check_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None):
    """ Goodness-of-fit test of predictive samples for a single mixture column.

    Generates N data points from a `num_clusters`-component mixture, runs 200
    crosscat transitions, draws a predictive column, and compares it to the
    original data (KS test for continuous types, chi-square for discrete).

    Inputs:
        - component_model_type: main class from datatype. Ex:
            ccmext.p_ContinuousComponentModel
        - num_clusters: (optional) number of mixture components
        - show_plot: (optional) if True, save a histogram/pdf comparison plot
        - seed: (optional) int to seed the RNG

    Returns:
        - the p-value of the goodness-of-fit test
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda : random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
        [separation], seed=get_next_seed(), distargs=[distargs[cctype]],
        return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(cctype, component_model_type,
        structure['component_params'][0], nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    # BUG FIX: .flatten(1) used the legacy numeric-era "fortran" flag; modern
    # numpy requires an order string, and for an (N, 1) column the flattening
    # order is irrelevant, so plain .flatten() is equivalent.
    predictive_samples = sdg.predictive_columns(M_c, X_L, X_D, [0],
        seed=get_next_seed()).flatten()

    # NOTE(review): []*len(Q) is always the empty list — presumably an empty
    # set of conditioning constraints Y; confirm against
    # su.simple_predictive_probability's signature.
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
        []*len(Q), Q)

    # get histogram. Different behavior for discrete and continuous types.
    # The density/normed property doesn't normalize the multinomial histogram
    # to 1, so discrete histograms are normalized by hand.
    if is_discrete[component_model_type.model_type]:
        bins = list(range(len(discrete_support)))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # BUG FIX: the `normed` keyword was deprecated and then removed from
        # numpy.histogram (numpy >= 1.24); `density=True` is the replacement.
        T_hist, edges = numpy.histogram(T, bins=min(50, len(discrete_support)),
            density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
            density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
            lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
            structure['component_params'][0],
            [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])

        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
            label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
            label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
            edgecolor="none", s=100, label="true pdf", alpha=1, zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
            edgecolor="none", s=100, label="predictive probability", alpha=1,
            zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def check_impute_vs_column_average_single(component_model_type, num_clusters, seed=0):
    """ tests predictive row generation vs column average
    Note: This test does not make sense for categorical data

    Inputs:
        - component_model_type: main class from datatype. Ex:
            ccmext.p_ContinuousComponentModel
        - num_clusters: the number of clusters in the data
        - seed: (optional) int to seed the RNG

    Returns:
        - the mean square error of the predictive sample column
        - the mean square error of the column average column
    """
    random.seed(seed)
    N = 100
    get_next_seed = lambda : random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted)
    # BUG FIX: on Python 3, range() returns an immutable range object, so the
    # subsequent Z.append(...) and random.shuffle(Z) would raise; materialize
    # it as a list first.
    Z = list(range(num_clusters))
    for z in range(N-num_clusters):
        Z.append(random.randrange(num_clusters))
    random.shuffle(Z)

    # generate the data, one row per entry of the partition
    T = numpy.array([[0]]*N, dtype=float)
    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a column from the sample
    T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0],
        seed=get_next_seed())

    # generate a column of column averages
    T_colave = numpy.ones(T.shape)*numpy.mean(T)

    # get the mean squared error
    err_sample = numpy.mean( (T_generated-T)**2.0 )
    err_colave = numpy.mean( (T_colave-T)**2.0 )

    return err_sample, err_colave
def check_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None):
    """ Goodness-of-fit test of predictive samples for a single mixture column.

    Generates N data points from a `num_clusters`-component mixture, runs 200
    crosscat transitions, draws a predictive column, and compares it to the
    original data (KS test for continuous types, chi-square for discrete).

    Inputs:
        - component_model_type: main class from datatype. Ex:
            ccmext.p_ContinuousComponentModel
        - num_clusters: (optional) number of mixture components
        - show_plot: (optional) if True, save a histogram/pdf comparison plot
        - seed: (optional) int to seed the RNG

    Returns:
        - the p-value of the goodness-of-fit test
    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
        [separation], seed=get_next_seed(), distargs=[distargs[cctype]],
        return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype, component_model_type, structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    # BUG FIX: .flatten(1) used the legacy numeric-era "fortran" flag; modern
    # numpy requires an order string, and for an (N, 1) column the flattening
    # order is irrelevant, so plain .flatten() is equivalent.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten()

    # NOTE(review): [] * len(Q) is always the empty list — presumably an empty
    # set of conditioning constraints Y; confirm against
    # su.simple_predictive_probability's signature.
    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
        [] * len(Q), Q)

    # get histogram. Different behavior for discrete and continuous types.
    # The density/normed property doesn't normalize the multinomial histogram
    # to 1, so discrete histograms are normalized by hand.
    if is_discrete[component_model_type.model_type]:
        bins = list(range(len(discrete_support)))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        # BUG FIX: the `normed` keyword was deprecated and then removed from
        # numpy.histogram (numpy >= 1.24); `density=True` is the replacement.
        T_hist, edges = numpy.histogram(T,
            bins=min(50, len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges,
            density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp,
            lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
            structure['component_params'][0],
            [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])

        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width,
            label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width,
            label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue",
            edgecolor="none", s=100, label="true pdf", alpha=1, zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support, numpy.exp(probabilities), c="red",
            edgecolor="none", s=100, label="predictive probability", alpha=1,
            zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p