def quick_le(seed, n_chains=1):
    random.seed(seed)
    numpy.random.seed(seed)
    T, M_r, M_c = du.gen_factorial_data_objects(seed, 2, N_COLS, N_ROWS, 2)
    engine = LE.LocalEngine(seed=seed)
    X_L, X_D = engine.initialize(M_c, M_r, T, n_chains=n_chains)
    return T, M_r, M_c, X_L, X_D, engine

def quick_le(seed, n_chains=1):
    rng = random.Random(seed)
    T, M_r, M_c = du.gen_factorial_data_objects(
        get_next_seed(rng), 2, N_COLS, N_ROWS, 2)
    engine = LE.LocalEngine(seed=get_next_seed(rng))
    X_L, X_D = engine.initialize(
        M_c, M_r, T, seed=get_next_seed(rng), n_chains=n_chains)
    return T, M_r, M_c, X_L, X_D, engine

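# The snippets here call a get_next_seed helper (and, in a later snippet, a
# make_get_next_seed factory) defined elsewhere in crosscat's test
# utilities. Below is a minimal sketch consistent with both calling styles
# seen in this file; the 31-bit seed range is an assumption, not
# necessarily the library's actual choice.
import random

def get_next_seed(rng):
    # Draw a fresh seed from an explicit random.Random instance.
    return rng.randint(1, 2**31 - 1)

def make_get_next_seed(gen_seed):
    # Build a zero-argument seed generator seeded once with gen_seed,
    # matching the get_next_seed() call style used in later snippets.
    rng = random.Random(gen_seed)
    return lambda: get_next_seed(rng)
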
def generate_clean_state(gen_seed, num_clusters, num_cols, num_rows,
                         num_splits, max_mean=10, max_std=1, plot=False):
    # generate the data, forwarding max_mean and max_std rather than
    # hard-coding their default values
    T, M_r, M_c, data_inverse_permutation_indices = \
        du.gen_factorial_data_objects(
            gen_seed, num_clusters, num_cols, num_rows, num_splits,
            max_mean=max_mean, max_std=max_std,
            send_data_inverse_permutation_indices=True)
    # recover the generative clustering
    X_L, X_D = get_generative_clustering(
        M_c, M_r, T, data_inverse_permutation_indices, num_clusters,
        num_splits)
    return T, M_c, M_r, X_L, X_D

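# A hedged usage sketch of generate_clean_state; the argument values are
# illustrative only, and the helpers above (du, get_generative_clustering)
# are assumed to be in scope.
T, M_c, M_r, X_L, X_D = generate_clean_state(
    gen_seed=0, num_clusters=4, num_cols=8, num_rows=100, num_splits=2)
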
def setUp(self):
    # generate a crosscat state and pull the metadata
    gen_seed = 0
    num_clusters = 2
    self.num_rows = 10
    self.num_cols = 2
    num_splits = 1
    self.T, self.M_r, self.M_c = du.gen_factorial_data_objects(
        gen_seed, num_clusters, self.num_cols, self.num_rows, num_splits)
    state = State.p_State(self.M_c, self.T)
    self.X_L = state.get_X_L()
    self.X_D = state.get_X_D()

def test_two_dependent_one_dependent():
    rng = random.Random(PASS_SEED)
    T, M_r, M_c = du.gen_factorial_data_objects(
        get_next_seed(rng), 2, 3, 10, 1)
    engine = LE.LocalEngine(seed=get_next_seed(rng))
    # These dependency constraints target the computation of
    # unorm_crp_logps_avg in the case that one variable (1) is independent
    # of another variable (2), which should ensure the CRP probability of
    # the view of (2) is -INFINITY when doing a block proposal of (0,1)
    # into the view of (2). Refer to the comment about computing the CRP
    # probabilities in State::sample_insert_features.
    dep_constraints = [(0, 1, True), (1, 2, False)]
    X_L, X_D = engine.initialize(
        M_c, M_r, T, seed=get_next_seed(rng), n_chains=1)
    X_L, X_D = engine.ensure_col_dep_constraints(
        M_c, M_r, T, X_L, X_D, dep_constraints, get_next_seed(rng))
    for col1, col2, dep in dep_constraints:
        assert engine.assert_col_dep_constraints(
            X_L, X_D, col1, col2, dep, True)
    X_L, X_D = engine.analyze(
        M_c, T, X_L, X_D, get_next_seed(rng), n_steps=1000)
    for col1, col2, dep in dep_constraints:
        assert engine.assert_col_dep_constraints(
            X_L, X_D, col1, col2, dep, True)

num_clusters = args.num_clusters
num_cols = args.num_cols
num_rows = args.num_rows
num_splits = args.num_splits
max_mean = args.max_mean
max_std = args.max_std
num_transitions = args.num_transitions
N_GRID = args.N_GRID
URI = args.URI

# create the data
T, M_r, M_c = du.gen_factorial_data_objects(
    gen_seed, num_clusters, num_cols, num_rows, num_splits,
    max_mean=max_mean, max_std=max_std,
)
# engine = JSONRPCEngine(inf_seed, URI=URI)

# initialize
X_L, X_D = engine.initialize(M_c, M_r, T)

# analyze without do_diagnostics or do_timing
X_L, X_D = engine.analyze(M_c, T, X_L, X_D, n_steps=num_transitions)

# analyze with do_diagnostics

gen_seed = args.gen_seed
inf_seed = args.inf_seed
num_clusters = args.num_clusters
num_cols = args.num_cols
num_rows = args.num_rows
num_splits = args.num_splits
max_mean = args.max_mean
max_std = args.max_std
num_transitions = args.num_transitions
N_GRID = args.N_GRID

# create the data
if True:
    T, M_r, M_c = du.gen_factorial_data_objects(
        gen_seed, num_clusters, num_cols, num_rows, num_splits,
        max_mean=max_mean, max_std=max_std,
    )
else:
    with open('SynData2.csv') as fh:
        import numpy
        import csv
        T = numpy.array([row for row in csv.reader(fh)],
                        dtype=float).tolist()
        M_r = du.gen_M_r_from_T(T)
        M_c = du.gen_M_c_from_T(T)

# create the state
p_State = State.p_State(M_c, T, N_GRID=N_GRID, SEED=inf_seed)

num_views = args.num_views
num_cols = args.num_cols
numChains = args.numChains
block_size = args.block_size

engine = ccc.get_CrossCatClient('hadoop', seed=inf_seed)

if filename is not None:
    # Load the data from the table and sub-sample entities to max_rows
    T, M_r, M_c = du.read_model_data_from_csv(filename, max_rows, gen_seed)
    truth_flag = 0
else:
    T, M_r, M_c, data_inverse_permutation_indices = \
        du.gen_factorial_data_objects(
            gen_seed, num_clusters, num_cols, max_rows, num_views,
            max_mean=100, max_std=1,
            send_data_inverse_permutation_indices=True)
    view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
        data_inverse_permutation_indices, max_rows, num_cols, num_views,
        num_clusters)
    truth_flag = 1

num_rows = len(T)
num_cols = len(T[0])
ari_table = []
ari_views = []

print('Initializing ...')
# Call Initialize and Analyze
M_c, M_r, X_L_list, X_D_list = engine.initialize(
    M_c, M_r, T, n_chains=numChains)
if truth_flag:

inf_seed = 0
num_clusters = 4
num_cols = 32
num_rows = 400
num_views = 2
n_steps = 1
n_times = 5
n_chains = 3
n_test = 100
CT_KERNEL = 1

get_next_seed = make_get_next_seed(gen_seed)

# generate some data
T, M_r, M_c, data_inverse_permutation_indices = du.gen_factorial_data_objects(
    get_next_seed(), num_clusters, num_cols, num_rows, num_views,
    max_mean=100, max_std=1, send_data_inverse_permutation_indices=True)
view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
    data_inverse_permutation_indices, num_rows, num_cols, num_views,
    num_clusters)

# run some tests
engine = LocalEngine()
multi_state_ARIs = []
multi_state_mean_test_lls = []
X_L_list, X_D_list = engine.initialize(
    M_c, M_r, T, get_next_seed(), n_chains=n_chains)
multi_state_ARIs.append(
    ctu.get_column_ARIs(X_L_list, view_assignment_truth))

for time_i in range(n_times):
    X_L_list, X_D_list = engine.analyze(

def convergence_analyze_helper(table_data, data_dict, command_dict):
    gen_seed = data_dict['SEED']
    num_clusters = data_dict['num_clusters']
    num_cols = data_dict['num_cols']
    num_rows = data_dict['num_rows']
    num_views = data_dict['num_views']
    max_mean = data_dict['max_mean']
    n_test = data_dict['n_test']
    num_transitions = data_dict['n_steps']
    block_size = data_dict['block_size']
    init_seed = data_dict['init_seed']

    # generate some data
    T, M_r, M_c, data_inverse_permutation_indices = \
        du.gen_factorial_data_objects(
            gen_seed, num_clusters, num_cols, num_rows, num_views,
            max_mean=max_mean, max_std=1,
            send_data_inverse_permutation_indices=True)
    view_assignment_ground_truth = \
        ctu.determine_synthetic_column_ground_truth_assignments(num_cols,
                                                                num_views)
    X_L_gen, X_D_gen = ttu.get_generative_clustering(
        M_c, M_r, T, data_inverse_permutation_indices, num_clusters,
        num_views)
    T_test = ctu.create_test_set(M_c, T, X_L_gen, X_D_gen, n_test,
                                 seed_seed=0)
    generative_mean_test_log_likelihood = \
        ctu.calc_mean_test_log_likelihood(M_c, T, X_L_gen, X_D_gen, T_test)

    # additional set up
    engine = LE.LocalEngine(init_seed)
    column_ari_list = []
    mean_test_ll_list = []
    elapsed_seconds_list = []

    # get initial ARI, test_ll
    with gu.Timer('initialize', verbose=False) as timer:
        X_L, X_D = engine.initialize(M_c, M_r, T,
                                     initialization='from_the_prior')
    column_ari = ctu.get_column_ARI(X_L, view_assignment_ground_truth)
    column_ari_list.append(column_ari)
    mean_test_ll = ctu.calc_mean_test_log_likelihood(M_c, T, X_L, X_D,
                                                     T_test)
    mean_test_ll_list.append(mean_test_ll)
    elapsed_seconds_list.append(timer.elapsed_secs)

    # run blocks of transitions, recording ARI, test_ll progression
    completed_transitions = 0
    n_steps = min(block_size, num_transitions)
    while completed_transitions < num_transitions:
        # We won't be limiting by time in the convergence runs
        with gu.Timer('analyze', verbose=False) as timer:
            X_L, X_D = engine.analyze(M_c, T, X_L, X_D, kernel_list=(),
                                      n_steps=n_steps, max_time=-1)
        completed_transitions = completed_transitions + block_size
        #
        column_ari = ctu.get_column_ARI(X_L, view_assignment_ground_truth)
        column_ari_list.append(column_ari)
        mean_test_ll = ctu.calc_mean_test_log_likelihood(M_c, T, X_L, X_D,
                                                         T_test)
        mean_test_ll_list.append(mean_test_ll)
        elapsed_seconds_list.append(timer.elapsed_secs)

    ret_dict = dict(
        num_rows=num_rows,
        num_cols=num_cols,
        num_views=num_views,
        num_clusters=num_clusters,
        max_mean=max_mean,
        column_ari_list=column_ari_list,
        mean_test_ll_list=mean_test_ll_list,
        generative_mean_test_log_likelihood=generative_mean_test_log_likelihood,
        elapsed_seconds_list=elapsed_seconds_list,
        n_steps=num_transitions,
        block_size=block_size,
    )
    return ret_dict

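# A hedged sketch of driving convergence_analyze_helper directly; the key
# names mirror the data_dict lookups at the top of the function, but these
# particular values, and passing None for table_data and command_dict
# (which the body above never reads), are assumptions for illustration.
data_dict = dict(
    SEED=0, num_clusters=4, num_cols=16, num_rows=200, num_views=2,
    max_mean=10, n_test=20, n_steps=100, block_size=10, init_seed=0)
ret_dict = convergence_analyze_helper(None, data_dict, None)
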
n_test = 40
data_max_mean = 1
data_max_std = 1.
#
#num_rows = 800
#n_chains = 16
#config_filename = os.path.expanduser('~/.config/ipython/profile_ssh/security/ipcontroller-client.json')
#
num_rows = 100
n_chains = 2
config_filename = None

# generate some data
T, M_r, M_c, data_inverse_permutation_indices = du.gen_factorial_data_objects(
    gen_seed, num_clusters, num_cols, num_rows, num_views,
    max_mean=data_max_mean, max_std=data_max_std,
    send_data_inverse_permutation_indices=True)
view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
    data_inverse_permutation_indices, num_rows, num_cols, num_views,
    num_clusters)
X_L_gen, X_D_gen = ttu.get_generative_clustering(
    M_c, M_r, T, data_inverse_permutation_indices, num_clusters, num_views)
T_test = ctu.create_test_set(M_c, T, X_L_gen, X_D_gen, n_test, seed_seed=0)
#
generative_mean_test_log_likelihood = ctu.calc_mean_test_log_likelihood(
    M_c, T, X_L_gen, X_D_gen, T_test)

ground_truth_lookup = dict(
    ARI=1.0,
    mean_test_ll=generative_mean_test_log_likelihood,
    num_views=num_views,
)

def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40
    if observed:
        query_row = 10
    else:
        query_row = n_rows
    query_column = 1
    Q = [(query_row, query_column)]

    # do the test with continuous data
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2,
                                                n_rows, 1)
    state = State.p_State(M_c, T)
    T_array = numpy.array(T)
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q,
                                          get_next_seed, n=n)
    X_array = numpy.sort(numpy.array(samples))
    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X - 2. * std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X + 2. * std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)
    X = X_array.tolist()

    # build the queries
    Qs = []
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under the pdf is about 1,
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i + 1] - X[i]) * \
            (densities[i + 1] + densities[i]) / 2.0
    print("Area of PDF (should be close to, but not greater than, 1): "
          + str(area_density))
    print("*Note: The area will be less than one because the range "
          "(integral) is truncated.")

    pylab.figure(facecolor='white')
    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X, 100, normed=1, histtype='stepfilled',
                                    label='samples', alpha=.5,
                                    color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')
    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')
    pylab.show()
    fd, fig_filename = tempfile.mkstemp(prefix='run_test_continuous_',
                                        suffix='.png', dir='.')
    pylab.savefig(fig_filename)

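# Hypothetical invocations of run_test_continuous: n predictive samples
# for an observed cell (row 10), then for a hypothetical row (row index
# n_rows). The sample count is illustrative, and the get_next_seed helper
# sketched earlier is assumed to be in scope.
run_test_continuous(1000, observed=True)
run_test_continuous(1000, observed=False)
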