def run_experiment(argin):
    """Run the needles dependence experiment and collect query results.

    A table is generated (structured "needle" data or noise), written to
    ``needles_exp.csv`` and loaded into the ``needles_exp`` btable. Models
    are then analyzed one iteration at a time, and after each iteration
    every dependence query is re-run and its value recorded.

    Args:
        argin: dict of settings. Keys read: ``num_iters``, ``num_chains``,
            ``num_rows``, ``num_cols``, ``with_id``, ``needles``,
            ``mixed_types``, ``multinomial_categories``, ``separation``,
            ``num_indep_queries``, ``independent_clusters``, ``ct_kernel``
            and ``seed``. ``with_id`` is currently unused, and the supplied
            ``separation`` is replaced by fixed values when ``needles`` is
            set. The dict is not modified.

    Returns:
        dict with keys ``query_col1``, ``query_col2`` (first and second
        element of every query pair), ``dependence_probs`` (array of shape
        ``(num_iters, num_queries)``), ``cols_independent`` (one bool per
        query) and ``config`` (a copy of ``argin`` plus ``data_mode``).
    """
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    with_id = argin["with_id"]  # unused: row-ID prepending is disabled
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    separation = argin["separation"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]

    seed = argin["seed"]
    if seed > 0:
        random.seed(seed)
        # Seed numpy's global generator as well, so that any numpy-based
        # sampling during this run is covered by the same seed (the
        # haystack variant of this experiment does the same).
        numpy.random.seed(seed)

    # generate column indices and header
    col_names = ["col_%i" % i for i in range(num_cols)]

    if mixed_types and multinomial_categories > 0:
        data_mode = 'mixed'
    elif multinomial_categories > 0:
        data_mode = 'multinomial'
    else:
        data_mode = 'continuous'

    if needles:
        # Column-to-view assignment: columns 0-1 share view 0 and columns
        # 2-3 share view 1 (the needles); every other column gets its own
        # view.
        Zv = [0, 0, 1, 1]
        Zv.extend(range(2, num_cols - 2))

        num_views = max(Zv) + 1

        # The two needle views are well separated, the rest are not.
        separation = [.95] * 2
        separation.extend([0.0] * (num_views - 2))

        # NOTE: randrange excludes its upper bound, so with these limits
        # the draw below always yields 4.
        min_clusters = 4
        max_clusters = 5

        # generate uniform cluster weights for every view
        cluster_weights = []
        for v in range(num_views):
            if v < 2 or independent_clusters:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        cctypes, distargs = eu.get_column_types(data_mode, num_cols, multinomial_categories)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights, separation, distargs=distargs)
    else:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)

    # NOTE: support for prepending a row-ID column (with_id) is disabled;
    # the needles are always the first four columns.
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    # save file to .csv (header row first)
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols, col_names, table, num_indep_queries)
    num_queries = len(queries)

    dependence_probs = numpy.zeros((num_iters, num_queries))

    client = Client()

    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    print(init_string)
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    # do the analyses: one ANALYZE step, then re-run every query
    if ct_kernel == 1:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table
    else:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WAIT;' % table

    for i in range(num_iters):
        client(analyze_string)
        for q, query in enumerate(queries):
            out = client(query, pretty=False, pandas_output=False)
            dependence_probs[i, q] = out[0]['data'][0][1]

    result = dict()
    # store the queries in result
    result['query_col1'] = [pair[0] for pair in pairs]
    result['query_col2'] = [pair[1] for pair in pairs]
    result['dependence_probs'] = dependence_probs

    # for each query, record whether those columns were actually independent.
    # Two columns are dependent only when they share a view; the pair
    # entries are used directly as indices into Zv.
    if needles:
        independent = [Zv[pairs[q][0]] != Zv[pairs[q][1]] for q in range(num_queries)]
    else:
        independent = [True] * num_queries

    result['cols_independent'] = independent

    # Store a copy so that adding data_mode does not modify the caller's dict.
    result['config'] = dict(argin)
    result['config']['data_mode'] = data_mode

    client('SHOW DIAGNOSTICS FOR %s;' % table)

    return result
def run_experiment(argin):
    """Run the haystack-break experiment over a growing set of distractors.

    A table with four needle columns (0-1 and 2-3 are the dependent pairs)
    followed by ``max_cols`` distractor columns is generated and written to
    ``expdata/hb/haystack_break_exp.csv``. For each distractor count in the
    step schedule, the leading columns are written to their own CSV and
    loaded into the ``haystack_break_exp`` btable. Every dependence query
    is evaluated once before any analysis and again after each
    single-iteration ``ANALYZE`` step.

    Args:
        argin: dict of settings. Keys read: ``num_iters``, ``num_chains``,
            ``num_rows``, ``max_cols``, ``rho``, ``num_indep_queries``,
            ``independent_clusters`` (read but unused), ``ct_kernel``,
            ``multimodal``, ``separation`` and ``seed``. ``separation`` is
            only used when ``multimodal`` is set; the other branch uses a
            fixed value of 0.5.

    Returns:
        dict with keys ``steps`` (one sub-result dict per distractor
        count, holding ``query_col1``, ``query_col2``,
        ``dependence_probs`` of shape ``(num_iters + 1, num_queries)``,
        ``cols_independent`` and ``distractor_cols``), ``config`` (the
        ``argin`` dict) and ``data`` (the full generated array).

    Raises:
        ValueError: if ``max_cols`` is smaller than 4.
    """
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    max_cols = argin["max_cols"]
    rho = argin["rho"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]  # read but unused
    ct_kernel = argin["ct_kernel"]
    multimodal = argin["multimodal"]
    separation = argin["separation"]

    # Work out the schedule first so a bad max_cols fails before any data
    # is generated or written to disk.
    step_size = _distractor_step_sizes(max_cols)

    all_cols = max_cols + 4  # max_cols plus number of dependent columns

    seed = argin["seed"]
    if seed > 0:
        random.seed(seed)
        numpy.random.seed(seed)

    # build full data file
    # generate column indices and header
    col_names = ["col_%i" % i for i in range(all_cols)]

    # Column-to-view assignment: columns 0-1 share view 0 and columns 2-3
    # share view 1 (the needles); every distractor gets its own view.
    Zv = [0, 0, 1, 1]
    Zv.extend(range(2, all_cols - 2))

    if multimodal:
        # Zv is deliberately NOT shuffled: the queries and the column
        # subsetting below assume the needles are columns 0-3.
        num_views = max(Zv) + 1

        # one scalar separation per view
        view_separation = [separation] * num_views

        # NOTE: randrange excludes its upper bound, so with these limits
        # the draw below always yields 4.
        min_clusters = 4
        max_clusters = 5

        # generate uniform cluster weights; only the needle views get
        # more than one cluster
        cluster_weights = []
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        # All columns in this experiment are continuous, so no multinomial
        # categories are requested.
        cctypes, distargs = eu.get_column_types('continuous', all_cols, 0)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights, view_separation, distargs=distargs)
        T_array = numpy.array(T)
    else:
        min_clusters = 3
        max_clusters = 10

        T_array = numpy.zeros((num_rows, all_cols))

        # Each needle pair is a correlated bivariate normal.
        Sigma = numpy.array([[1.0, rho], [rho, 1.0]])
        mu = numpy.array([0, 0])
        T_array[:, 0:2] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        T_array[:, 2:4] = numpy.random.multivariate_normal(mu, Sigma, num_rows)

        # Distractors: each column is a mixture of unit-variance normals
        # whose means are spaced 6 * mode_separation apart.
        mode_separation = .5
        for col in range(4, all_cols):
            num_clusters = random.randrange(min_clusters, max_clusters) + 1
            for row in range(num_rows):
                k = random.randrange(num_clusters)
                T_array[row, col] = numpy.random.randn() + k * 6 * mode_separation

    T = T_array.tolist()

    # save file to .csv (header row first)
    exp_path = 'expdata/hb/'
    eu.make_folder(exp_path)
    filename = exp_path + "haystack_break_exp.csv"
    table = "haystack_break_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)
    # done building data file

    # the needle column names
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    if ct_kernel == 1:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table
    else:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WAIT;' % table

    result = dict()
    result['steps'] = []

    for num_distractor_columns in step_size:
        # create subdata: the four needles plus the leading distractors
        T_sub = take_T_column_subset(T, range(4 + num_distractor_columns))
        subpath = exp_path + 'd_' + str(num_distractor_columns) + '/'
        eu.make_folder(subpath)
        subfilename = subpath + "haystack_break_exp_" + str(num_distractor_columns) + ".csv"
        eu.list_to_csv(subfilename, T_sub)

        col_names_sub = T_sub[0]

        # generate queries
        queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols, col_names_sub, table, num_indep_queries)
        num_queries = len(queries)

        # row 0 holds the values before any ANALYZE step
        dependence_probs = numpy.zeros((num_iters + 1, num_queries))

        client = Client()

        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, subfilename))
        init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
        print(init_string)
        client(init_string)
        client('SHOW DIAGNOSTICS FOR %s;' % table)

        # do the analyses
        for i in range(num_iters + 1):
            if i > 0:
                client(analyze_string)

            for q, query in enumerate(queries):
                out = client(query, pretty=False, pandas_output=False)
                dependence_probs[i, q] = out[0]['data'][0][1]

        subresult = dict()
        # store the queries in subresult
        subresult['query_col1'] = [pair[0] for pair in pairs]
        subresult['query_col2'] = [pair[1] for pair in pairs]
        subresult['dependence_probs'] = dependence_probs

        # for each query, record whether those columns were actually
        # independent, i.e. whether they sit in different views. The pair
        # entries are used directly as indices into Zv.
        subresult['cols_independent'] = [
            Zv[pairs[q][0]] != Zv[pairs[q][1]] for q in range(num_queries)
        ]
        subresult['distractor_cols'] = num_distractor_columns

        result['steps'].append(subresult)

    result['config'] = argin
    result['data'] = T_array

    return result


def _distractor_step_sizes(max_cols):
    """Return the increasing distractor counts to test, ending at max_cols.

    The schedule starts at 4, doubles while the doubled value is at most
    ``max_cols / 2``, and always finishes with ``max_cols`` itself, e.g.
    ``16 -> [4, 8, 16]`` and ``12 -> [4, 12]``.
    """
    if max_cols < 4:
        raise ValueError("max_cols must be at least 4, got %r" % (max_cols,))

    steps = [4]
    while steps[-1] * 4 <= max_cols:
        steps.append(steps[-1] * 2)
    if steps[-1] < max_cols:
        steps.append(max_cols)
    return steps
def run_experiment(argin):
    """Measure how dependence-query answers evolve during analysis.

    Builds a synthetic table (needle data when ``needles`` is set, noise
    otherwise), saves it as ``needles_exp.csv``, creates the
    ``needles_exp`` btable from it, and then alternates one ``ANALYZE``
    iteration with a full pass over the dependence queries, storing each
    returned value.

    Args:
        argin: settings dict; it is left unmodified. The keys
            ``num_iters``, ``num_chains``, ``num_rows``, ``num_cols``,
            ``with_id``, ``needles``, ``mixed_types``,
            ``multinomial_categories``, ``separation``,
            ``num_indep_queries``, ``independent_clusters``, ``ct_kernel``
            and ``seed`` are all read. ``with_id`` has no effect at
            present, and ``separation`` is overridden with fixed per-view
            values when ``needles`` is set.

    Returns:
        dict containing ``query_col1`` / ``query_col2`` (the two members
        of each query pair), ``dependence_probs`` (a
        ``(num_iters, num_queries)`` array), ``cols_independent`` (list of
        bools, one per query) and ``config`` (copy of ``argin`` extended
        with ``data_mode``).
    """
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    with_id = argin["with_id"]  # no effect: the row-ID code path is disabled
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    separation = argin["separation"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        # Seed both generators; the haystack variant of this experiment
        # seeds numpy too, and leaving it out makes numpy-side sampling
        # differ from run to run.
        random.seed(seed)
        numpy.random.seed(seed)

    # generate column indices and header
    col_names = ["col_%i" % i for i in range(num_cols)]

    if multinomial_categories > 0:
        data_mode = 'mixed' if mixed_types else 'multinomial'
    else:
        data_mode = 'continuous'

    if not needles:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)
    else:
        # Views: columns 0-1 -> view 0, columns 2-3 -> view 1 (our
        # needles), and one singleton view per remaining column.
        Zv = [0, 0, 1, 1]
        Zv.extend(range(2, num_cols - 2))
        num_views = max(Zv) + 1

        # high separation for the needle views, none for the others
        separation = [.95] * 2 + [0.0] * (num_views - 2)

        # NOTE: the upper bound of randrange is exclusive, so these limits
        # currently always give 4 clusters.
        min_clusters = 4
        max_clusters = 5

        # generate weights: uniform over the clusters of each view
        cluster_weights = []
        for view in range(num_views):
            multi_cluster = view < 2 or independent_clusters
            if multi_cluster:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        cctypes, distargs = eu.get_column_types(data_mode, num_cols, multinomial_categories)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights, separation, distargs=distargs)

    # NOTE: prepending a row-ID column (with_id) is not supported right
    # now, so the needle pairs are always the first four columns.
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    # save file to .csv, with the column names as the first row
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols, col_names, table, num_indep_queries)
    num_queries = len(queries)

    dependence_probs = numpy.zeros((num_iters, num_queries))

    client = Client()

    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    print(init_string)
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    # do the analyses
    for i in range(num_iters):
        if ct_kernel == 1:
            client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
        else:
            client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

        # re-evaluate every query after this iteration
        for q, query in enumerate(queries):
            out = client(query, pretty=False, pandas_output=False)
            dependence_probs[i, q] = out[0]['data'][0][1]

    # for each query, get whether those columns were actually independent;
    # only needle data has known dependencies (columns sharing a view).
    # The pair entries are used directly as indices into Zv.
    independent = [True] * num_queries
    if needles:
        for q in range(num_queries):
            first, second = pairs[q][0], pairs[q][1]
            if Zv[first] == Zv[second]:
                independent[q] = False

    # Copy the settings so that recording data_mode leaves the caller's
    # dict untouched.
    config = dict(argin)
    config['data_mode'] = data_mode

    result = {
        'query_col1': [pair[0] for pair in pairs],
        'query_col2': [pair[1] for pair in pairs],
        'dependence_probs': dependence_probs,
        'cols_independent': independent,
        'config': config,
    }

    client('SHOW DIAGNOSTICS FOR %s;' % table)

    return result