def test_should_return_list_of_params(self):
    """generate_separated_model_parameters returns one param dict per cluster.

    Continuous clusters carry exactly {'mu', 'rho'}; multinomial clusters
    carry exactly {'weights'}.
    """
    # Continuous case: each cluster gets a dict with exactly mu and rho.
    params = sdg.generate_separated_model_parameters(
        'continuous', .5, self.num_clusters, self.get_next_seed)
    assert isinstance(params, list)
    assert len(params) == self.num_clusters
    for cluster_params in params:
        assert isinstance(cluster_params, dict)
        assert len(cluster_params.keys()) == 2
        for name in cluster_params.keys():
            assert name in ['mu', 'rho']

    # Multinomial case: each cluster gets a dict with only weights.
    params = sdg.generate_separated_model_parameters(
        'multinomial', .5, self.num_clusters, self.get_next_seed,
        distargs=self.distargs_multinomial)
    assert isinstance(params, list)
    assert len(params) == self.num_clusters
    for cluster_params in params:
        assert isinstance(cluster_params, dict)
        assert len(cluster_params.keys()) == 1
        for name in cluster_params.keys():
            assert name in ['weights']
def test_normal_means_should_be_farther_apart_if_they_have_higer_separation(self):
    """Standardized squared distance between two normal means grows with C.

    For two generated continuous clusters, ((mu0-mu1)/(std0+std1))**2 must
    increase monotonically as the separation parameter goes .1 -> .5 -> 1.0.
    """
    def standardized_sq_distance(separation):
        # Reseed before every call so the three draws are comparable.
        random.seed(0)
        params = sdg.generate_separated_model_parameters(
            'continuous', separation, 2, self.get_next_seed)
        # rho is a precision; rho**(-.5) is the standard deviation.
        summed_std = params[0]['rho'] ** (-.5) + params[1]['rho'] ** (-.5)
        return ((params[0]['mu'] - params[1]['mu']) / summed_std) ** 2.0

    distance_close = standardized_sq_distance(.1)
    distance_far = standardized_sq_distance(.5)
    distance_farthest = standardized_sq_distance(1.0)

    assert distance_far > distance_close
    assert distance_farthest > distance_far
def check_impute_vs_column_average_single(component_model_type, num_clusters, seed=0):
    """ tests predictive row generation vs column average
    Note: This test does not make sense for categorical data
    Inputs:
        - component_model_type: main class from datatype. Ex:
            ccmext.p_ContinuousComponentModel
        - num_clusters: the number of clusters in the data
        - seed: (optional) int to seed the RNG
    Returns:
        - the mean square error of the predictive sample column
        - the mean square error of the column average column
    """
    random.seed(seed)

    N = 100

    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted)
    # list() is required: under Python 3 range() is a lazy object with no
    # append(), and random.shuffle needs a mutable sequence.
    Z = list(range(num_clusters))
    for _ in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))

    random.shuffle(Z)

    # generate the data: one column, one row per partition entry
    T = numpy.array([[0]] * N, dtype=float)
    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a row from the sample
    T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0], seed=get_next_seed())

    # generate a row of column averages
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error
    err_sample = numpy.mean((T_generated - T)**2.0)
    err_colave = numpy.mean((T_colave - T)**2.0)

    return err_sample, err_colave
# NOTE(review): this is a duplicate definition of
# check_impute_vs_column_average_single — it silently shadows the earlier one
# with the same name. Confirm which copy is intended and delete the other.
def check_impute_vs_column_average_single(component_model_type, num_clusters, seed=0):
    """ tests predictive row generation vs column average
    Note: This test does not make sense for categorical data
    Inputs:
        - component_model_type: main class from datatype. Ex:
            ccmext.p_ContinuousComponentModel
        - num_clusters: the number of clusters in the data
        - seed: (optional) int to seed the RNG
    Returns:
        - the mean square error of the predictive sample column
        - the mean square error of the column average column
    """
    random.seed(seed)

    N = 100

    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted)
    # list() is required: under Python 3 range() is a lazy object with no
    # append(), and random.shuffle needs a mutable sequence.
    Z = list(range(num_clusters))
    for _ in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))

    random.shuffle(Z)

    # generate the data: one column, one row per partition entry
    T = numpy.array([[0]] * N, dtype=float)
    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a row from the sample
    T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0], seed=get_next_seed())

    # generate a row of column averages
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error
    err_sample = numpy.mean((T_generated - T)**2.0)
    err_colave = numpy.mean((T_colave - T)**2.0)

    return err_sample, err_colave