def test_flights():
    client = Client()
    cmds = []
    cmds.append('drop ptable jayt;')
    cmds.append('create ptable jayt from /home/sgeadmin/tabular_predDB/Examples/flight_data_subset.csv;')
    cmds.append('create 2 models for jayt;')
    cmds.append('analyze jayt for 1 iterations;')
    cmds.append('select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20;')
    cmds.append('select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0;')
    cmds.append('select dayofweek, deptime, crsdeptime, actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0 with respect to actualelapsedtime;')
    cmds.append('select dayofweek, actualelapsedtime, similarity to 0 with respect to actualelapsedtime from jayt where distance > 800 limit 20 order by similarity to 0 with respect to actualelapsedtime, dayofweek;')
    cmds.append('select dayofweek, actualelapsedtime, similarity to 0 from jayt where distance > 800 limit 5;')
    cmds.append('select dayofweek, actualelapsedtime, arrtime, similarity to 0 with respect to arrtime from jayt where distance > 800 limit 5 order by similarity to 0 with respect to arrtime, dayofweek;')
    cmds.append('select probability(actualelapsedtime=200) from jayt where distance > 800 limit 20;')
    # cmds.append('select * from jayt limit 5;')
    # cmds.append('infer actualelapsedtime from jayt with confidence 0.8 limit 20;')
    cmds.append('simulate dayofweek, deptime, crsdeptime FROM jayt where dayofweek = 7 TIMES 3;')
    cmds.append('estimate dependence probabilities from jayt;')
    cmds.append('estimate dependence probabilities from jayt referencing actualelapsedtime limit 6 save to fz;')
    cmds.append('estimate dependence probabilities from jayt referencing actualelapsedtime with confidence 0.5;')
    # cmds.append('drop ptable jayt;')
    # cmds.append('estimate dependence probabilities from dan_kiva referencing activity limit 10 save to activity_z;')
    # cmds.append('select * from dha_small;')
    # cmds.append('select probability(mdcr_spnd_outp=1), probability(mdcr_spnd_outp=2), probability(mdcr_spnd_outp=3) from dha_small;')
    for cmd in cmds:
        print '>>> %s' % cmd
        result = client.execute(cmd, timing=True)
        print result
def test_dha_story_demo():
    client = Client()
    tests_dir = os.path.split(os.path.realpath(__file__))[0]
    dha_csv_path = os.path.join(tests_dir, 'data/dha.csv')
    dha_samples_path = os.path.join(tests_dir, 'samples/dha_samples.pkl.gz')
    test_results_path = os.path.join(tests_dir, 'regression_test_output/dha_story_results_record.pkl')
    cmd_list = [
        'DROP BTABLE dha_demo;',
        'CREATE BTABLE dha_demo FROM %s;' % dha_csv_path,
        'IMPORT SAMPLES %s INTO dha_demo;' % dha_samples_path,
        'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo LIMIT 10;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING qual_score LIMIT 6;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING qual_score WITH CONFIDENCE 0.9;',
        'ESTIMATE DEPENDENCE PROBABILITIES FROM dha_demo REFERENCING pymt_p_md_visit LIMIT 6;',
        # 'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\') LIMIT 10;',
        'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\', qual_score), ami_score LIMIT 10;',
        'SELECT name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo ORDER BY similarity_to(name=\'Albany NY\', pymt_p_visit_ratio), ttl_mdcr_spnd LIMIT 10;',
        # 'SIMULATE name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo WHERE ami_score=95.0 TIMES 10;',
        # 'SIMULATE name, qual_score, ami_score, pymt_p_visit_ratio, ttl_mdcr_spnd, hosp_reimb_ratio, hosp_reimb_p_dcd, md_copay_p_dcd, ttl_copay_p_dcd FROM dha_demo WHERE ttl_mdcr_spnd=50000 TIMES 10;',
    ]
    dha_story_results = []
    if len(sys.argv) > 1 and sys.argv[1] == 'record':
        print 'Recording new dha_story_results to %s' % test_results_path
        record = True
    else:
        ## Testing: compare each result against the recorded baseline.
        dha_story_results = pickle.load(open(test_results_path, 'r'))
        record = False
    for i, cmd in enumerate(cmd_list):
        print cmd
        result = client.execute(cmd, timing=False, pretty=True)
        if record:
            dha_story_results.append(result)
        elif type(result) == dict:
            for k, v in result.iteritems():
                if isinstance(v, numpy.ndarray):
                    assert (v == dha_story_results[i][k]).all(), (v, dha_story_results[i][k])
                else:
                    assert v == dha_story_results[i][k], (v, dha_story_results[i][k])
        else:
            # assert result == dha_story_results[i], (result, dha_story_results[i])
            pass
    if record:
        pickle.dump(dha_story_results, open(test_results_path, 'w'))
def setup_function(function):
    global test_tablenames, client, test_filenames
    test_tablenames = []
    test_filenames = []
    # The default upgrade_key_column is None, which prompts the user to choose;
    # user input must be avoided during testing, so just create a new key column.
    client = Client(testing=True)
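# A minimal teardown sketch to pair with setup_function above. It assumes the
# tests register the tables and files they create in the test_tablenames and
# test_filenames globals, and it reuses the callable 'drop btable' client
# interface seen in the tests below; the cleanup behavior itself is an
# assumption, not part of the original suite.
def teardown_function(function):
    global test_tablenames, client, test_filenames
    for tablename in test_tablenames:
        # drop any btable a test created
        client('drop btable %s' % tablename, yes=True, pretty=False)
    for filename in test_filenames:
        # remove any temporary csv a test wrote
        if os.path.exists(filename):
            os.remove(filename)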
def test_btable_list():
    global client, test_filenames
    out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
    init_btable_count = len(out)

    test_tablename1 = create_dha()
    out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
    assert len(out) == 1 + init_btable_count
    assert test_tablename1 in out

    test_tablename2 = create_dha()
    out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
    assert len(out) == 2 + init_btable_count
    assert test_tablename1 in out
    assert test_tablename2 in out

    client('drop btable %s' % test_tablename1, yes=True, debug=True, pretty=False)
    out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
    assert len(out) == 1 + init_btable_count
    assert test_tablename1 not in out
    assert test_tablename2 in out

    ## test to make sure the btable list is persisted across client instances
    del client
    client = Client()
    out = set(client('list btables', pretty=False, debug=True)[0]['btable'])
    assert len(out) == 1 + init_btable_count
    assert test_tablename1 not in out
    assert test_tablename2 in out
def run_experiment(argin):
    num_rows = argin["num_rows"]
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    ct_kernel = argin["ct_kernel"]
    datatype = argin["datatype"]

    # generate the data
    datasets = gen_shapetest_csvs(num_rows)

    client = Client()

    # drop tables
    print "Dropping tables."
    client('DROP BTABLE exp_sinwave;', yes=True)
    client('DROP BTABLE exp_x;', yes=True)
    client('DROP BTABLE exp_ring;', yes=True)
    client('DROP BTABLE exp_dots;', yes=True)

    data_out = dict()
    data_out['config'] = argin

    # for each shape: create the table, initialize models, analyze, then
    # simulate the same number of rows back out for comparison
    for shape in ["sinwave", "x", "ring", "dots"]:
        query_list = gen_base_queries(num_iters, num_chains, num_rows, shape, ct_kernel, datatype)
        for query in query_list:
            print query
            client(query)

        table = table_string[shape]
        datafile = table + '.csv'

        out = client('SIMULATE x,y FROM %s TIMES %i;' % (table, num_rows), pretty=False)

        X_original = datasets[shape]
        X_inferred = numpy.array(out[0])

        # get the logps
        # latent_states = client.engine.persistence_layer.get_latent_states(table)
        # X_L_list = latent_states[0]
        # logps = [X_L['logp'] for X_L in X_L_list]
        # data_out[shape + "_logps"] = logps

        data_out[shape + "_inferred"] = X_inferred
        data_out[shape + "_original"] = X_original

    return data_out
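# A hypothetical invocation of the shape experiment above. The keys mirror the
# argin lookups at the top of run_experiment, but these particular values are
# illustrative assumptions, not settings from the original experiments.
if __name__ == '__main__':
    example_argin = {
        "num_rows": 500,            # rows per shape dataset (assumed)
        "num_iters": 200,           # ANALYZE iterations (assumed)
        "num_chains": 8,            # models (chains) to initialize (assumed)
        "ct_kernel": 0,             # 1 selects the MH kernel variant
        "datatype": 'continuous',
    }
    data_out = run_experiment(example_argin)
    print data_out.keys()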
def run_example(name):
    # Default upgrade_key_column is None, to let the user choose, but user input
    # must be avoided during testing, so default to creating a new key column.
    client = Client(testing=True)
    file_path = os.path.join('../../examples/%s/%s_analysis.bql' % (name, name))
    results = client(open(file_path, 'r'), yes=True, pretty=False, plots=False, key_column=0)
    for r in results:
        if 'Error' in r or ('error' in r and r['error']):
            raise Exception(str(r))
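# Hypothetical usage: run one of the bundled example analyses by name. The
# 'dha' name is an assumption, borrowed from the dha example used elsewhere
# in these tests.
if __name__ == '__main__':
    run_example('dha')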
def run_command_line():
    # Get command line arguments to specify hostname and port
    hostname = None
    port = None
    if len(sys.argv) > 1:
        # Treat the first argument as hostname[:port]
        input = sys.argv[1].split(':')
        hostname = input[0]
        if len(input) == 1:
            client = Client(hostname)
            print "Using hostname %s." % hostname
        if len(input) == 2:
            port = int(input[1])
            client = Client(hostname, port)
            print "Using hostname %s, port %d" % (hostname, port)
        elif len(input) > 2:
            print "Run with 'python bql [hostname[:port]]'"
            return
    else:
        client = Client()
    print """Welcome to BayesDB. You may enter BQL commands directly into this prompt.
Type 'help' for help, and 'quit' to quit."""
    app = BayesDBApp(client)
    app.cmdloop()
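# Entry-point sketch: start the interactive BQL prompt when this module is
# executed directly. This assumes run_command_line lives in the top-level
# script; the guard itself is an assumed addition.
if __name__ == '__main__':
    run_command_line()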
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    max_cols = argin["max_cols"]
    rho = argin["rho"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    multimodal = argin["multimodal"]
    separation = argin["separation"]

    all_cols = max_cols + 4  # max_cols plus the number of dependent (needle) columns

    seed = argin["seed"]
    if seed > 0:
        random.seed(seed)
        numpy.random.seed(seed)

    # build the full data file
    # generate column indices and header
    col_names = ["col_%i" % i for i in range(all_cols)]

    Zv = [0, 0, 1, 1]  # our needles
    Zv.extend(range(2, all_cols - 2))

    min_clusters = 3
    max_clusters = 10

    T_array = numpy.zeros((num_rows, all_cols))
    Sigma = numpy.array([[1.0, rho], [rho, 1.0]])
    mu = numpy.array([0, 0])

    if multimodal:
        Zv = [0, 0, 1, 1]  # our needles
        Zv.extend(range(2, all_cols - 2))
        random.shuffle(Zv)
        num_views = max(Zv) + 1

        separation = [separation] * num_views  # one separation value per view

        min_clusters = 4
        max_clusters = 5

        cluster_weights = []
        # generate weights
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        # NOTE: data_mode and multinomial_categories are not defined in this
        # function; the continuous, no-multinomial setup below is an assumed
        # completion borrowed from the companion needles experiment.
        data_mode = 'continuous'
        multinomial_categories = 0
        cctypes, distargs = eu.get_column_types(data_mode, all_cols, multinomial_categories)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights, separation, distargs=distargs)
        T_array = numpy.array(T)
    else:
        T_array[:, 0:2] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        T_array[:, 2:4] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        separation = .5
        for col in range(4, all_cols):
            num_clusters = random.randrange(min_clusters, max_clusters) + 1
            for row in range(num_rows):
                k = random.randrange(num_clusters)
                T_array[row, col] = numpy.random.randn() + k * 6 * separation

    T = T_array.tolist()

    # save file to .csv
    exp_path = 'expdata/hb/'
    eu.make_folder(exp_path)
    filename = exp_path + "haystack_break_exp.csv"
    table = "haystack_break_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)
    # done building the data file

    # get the column step sizes (powers of two)
    num_steps = int(math.log(max_cols, 2)) - 1
    step_size = [2 ** t for t in range(2, num_steps + 1)]
    assert step_size[-1] <= max_cols
    if step_size[-1] < max_cols:
        step_size.append(max_cols)
    assert step_size[0] == 4 and step_size[-1] == max_cols

    # the needle column names
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    result = dict()
    result['steps'] = []

    for num_distractor_columns in step_size:
        # create the column subset for this step
        T_sub = take_T_column_subset(T, range(4 + num_distractor_columns))
        subpath = exp_path + 'd_' + str(num_distractor_columns) + '/'
        eu.make_folder(subpath)
        subfilename = subpath + "haystack_break_exp_" + str(num_distractor_columns) + ".csv"
        eu.list_to_csv(subfilename, T_sub)
        col_names_sub = T_sub[0]

        # generate queries
        queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols,
                                                     col_names_sub, table, num_indep_queries)
        num_queries = len(queries)
        dependence_probs = numpy.zeros((num_iters + 1, num_queries))

        client = Client()
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, subfilename))
        init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
        print init_string
        client(init_string)
        client('SHOW DIAGNOSTICS FOR %s;' % table)

        # do the analyses; i = 0 records the dependence probabilities before
        # any analysis has been run
        for i in range(0, num_iters + 1):
            if i > 0:
                if ct_kernel == 1:
                    client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
                else:
                    client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)
            for q in range(num_queries):
                query = queries[q]
                out = client(query, pretty=False, pandas_output=False)
                dependence_probs[i, q] = out[0]['data'][0][1]

        subresult = dict()
        # store the queries in subresult
        subresult['query_col1'] = []
        subresult['query_col2'] = []
        subresult['dependence_probs'] = dependence_probs
        for pair in pairs:
            subresult['query_col1'].append(pair[0])
            subresult['query_col2'].append(pair[1])

        # for each query, record whether those columns were actually independent
        independent = [True] * num_queries
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False
        subresult['cols_independent'] = independent
        subresult['distractor_cols'] = num_distractor_columns
        result['steps'].append(subresult)

    result['config'] = argin
    result['data'] = T_array
    return result
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    separation = argin["separation"]
    seed = argin["seed"]
    ct_kernel = argin["ct_kernel"]

    if seed > 0:
        random.seed(seed)

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    # generate synthetic data
    filename = "exp_estimate_joint_ofile.csv"
    table_name = 'exp_estimate_joint'

    # generate the starting data
    T_o, structure = eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv with the bottom row removed (held-out data)
    data_filename = 'exp_estimate_joint.csv'
    T_h = eu.gen_held_out_data(filename, data_filename, 1)

    # get the column names
    with open(filename, 'r') as f:
        csv_header = f.readline()
    col_names = csv_header.split(',')
    col_names[-1] = col_names[-1].strip()

    # set up a dict for the results of the different configurations
    result = dict()

    # true probabilities of the held-out values
    true_held_out_p = []
    for col in range(num_cols):
        x = T_o[-1, col]
        logp = eu.get_true_logp(numpy.array([x]), col, structure)
        true_held_out_p.append(numpy.exp(logp))

    # start a client
    client = Client()

    # do the analyses
    for config in ['cc', 'crp', 'nb']:
        config_string = eu.config_map[config]
        table = table_name + '-' + config

        # drop the old btable, create a new one with the new data, and init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, data_filename))
        client('INITIALIZE %i MODELS FOR %s %s;' % (num_chains, table, config_string))

        these_ps = numpy.zeros(num_iters)
        these_ps_errors = numpy.zeros(num_iters)
        for i in range(num_iters):
            if ct_kernel == 1:
                client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
            else:
                client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

            # query the probability of each held-out value and accumulate the
            # squared error against the true probability
            mean_p = []
            mean_p_error = []
            for col in range(0, num_cols):
                col_name = col_names[col]
                x = T_o[-1, col]
                out = client('SELECT PROBABILITY OF %s=%f from %s;' % (col_name, x, table),
                             pretty=False, pandas_output=False)
                p = out[0]['data'][0][1]
                mean_p.append(p)
                mean_p_error.append((true_held_out_p[col] - p) ** 2.0)

            these_ps[i] = numpy.mean(mean_p)
            these_ps_errors[i] = numpy.mean(mean_p_error)

        result['mean_held_out_p_' + config] = these_ps
        result['mean_error_' + config] = these_ps_errors

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['mean_error_nb']
    retval['MSE_crp_mixture_indexer'] = result['mean_error_crp']
    retval['MSE_crosscat_indexer'] = result['mean_error_cc']
    retval['MEAN_P_naive_bayes_indexer'] = result['mean_held_out_p_nb']
    retval['MEAN_P_crp_mixture_indexer'] = result['mean_held_out_p_crp']
    retval['MEAN_P_crosscat_indexer'] = result['mean_held_out_p_cc']
    retval['config'] = argin
    return retval
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    with_id = argin["with_id"]
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    separation = argin["separation"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    # generate column indices and header
    col_names = ["col_%i" % i for i in range(num_cols)]

    if mixed_types and multinomial_categories > 0:
        data_mode = 'mixed'
    elif multinomial_categories > 0:
        data_mode = 'multinomial'
    else:
        data_mode = 'continuous'

    if needles:
        T = [[0] * num_cols] * num_rows
        Zv = [0, 0, 1, 1]  # our needles
        Zv.extend(range(2, num_cols - 2))
        # random.shuffle(Zv)
        num_views = max(Zv) + 1

        separation = [.95] * 2
        separation.extend([0.0] * (num_views - 2))

        min_clusters = 4
        max_clusters = 5

        cluster_weights = []
        # generate weights
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            elif independent_clusters:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        cctypes, distargs = eu.get_column_types(data_mode, num_cols, multinomial_categories)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights, separation, distargs=distargs)
    else:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)

    # # prepend the row_id
    # if with_id:
    #     needle_a_cols = (1, 2)
    #     needle_b_cols = (3, 4)
    #     col_names.insert(0, 'ID')
    #     # TODO: ID type
    #     cctypes.insert(0, 'continuous')
    #     # header = "ID,%s" % header
    #     if needles:
    #         Zv.insert(0, num_views)
    #     for row in range(num_rows):
    #         T[row].insert(0, row)
    # else:
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    # save file to .csv
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols,
                                                 col_names, table, num_indep_queries)
    num_queries = len(queries)
    dependence_probs = numpy.zeros((num_iters, num_queries))

    client = Client()
    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    print init_string
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    # do the analyses
    for i in range(num_iters):
        if ct_kernel == 1:
            client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
        else:
            client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)
        for q in range(num_queries):
            query = queries[q]
            out = client(query, pretty=False, pandas_output=False)
            dependence_probs[i, q] = out[0]['data'][0][1]

    result = dict()
    # store the queries in the result
    result['query_col1'] = []
    result['query_col2'] = []
    result['dependence_probs'] = dependence_probs
    for pair in pairs:
        result['query_col1'].append(pair[0])
        result['query_col2'].append(pair[1])

    # for each query, record whether those columns were actually independent
    independent = [True] * num_queries
    if needles:
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False
    result['cols_independent'] = independent

    result['config'] = argin
    result['config']['data_mode'] = data_mode
    client('SHOW DIAGNOSTICS FOR %s;' % table)
    return result
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_runs = argin["num_runs"]
    prop_missing = argin["prop_missing"]
    confidence = argin["confidence"]
    seed = argin["seed"]

    n_queries = 2

    # random.seed(seed)

    # using dha, for now
    start_filename = "../data/dha.csv"
    table = 'exp_shrinks_with_iters'

    filename, indices, col_names = eu.gen_missing_data_csv(start_filename, prop_missing, [0])

    # get some random column pairs for DEPENDENCE PROBABILITY queries;
    # don't do queries on the first column
    columns = range(1, len(col_names))
    column_queries = [random.sample(columns, 2) for _ in range(n_queries)]

    dependence_queries = []
    for q in column_queries:
        col_1 = col_names[q[0]].lower()
        col_2 = col_names[q[1]].lower()
        this_query = "SELECT DEPENDENCE PROBABILITY OF %s WITH %s FROM %s;" % (col_1, col_2, table)
        dependence_queries.append(this_query)

    # get some inference queries
    column_queries = random.sample(columns, n_queries)
    infer_queries = []
    for q in column_queries:
        col = col_names[q].lower()
        this_query = 'INFER %s FROM %s WITH CONFIDENCE %f;' % (col, table, confidence)
        infer_queries.append(this_query)

    # create a client
    client = Client()

    dependence_results = []
    inference_results = []
    for _ in range(num_runs):
        # drop the old table, create a new table, init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, filename))
        client('INITIALIZE %i MODELS FOR %s;' % (num_chains, table))

        dependence_results_run = numpy.zeros((n_queries, num_iters))
        inference_results_run = numpy.zeros((n_queries, num_iters))

        for i in range(num_iters):
            # analyze
            client('ANALYZE %s FOR 1 ITERATIONS;' % table)
            # dependence
            for q in range(n_queries):
                out_dep = client(dependence_queries[q], pretty=False, pandas_output=False)
                dep = out_dep[0]['data'][0][1]
                dependence_results_run[q, i] = dep
            # infer
            for q in range(n_queries):
                out_inf = client(infer_queries[q], pretty=False, pandas_output=False)
                prop = _get_prop_inferred(out_inf[0]['data'], indices, column_queries[q])
                inference_results_run[q, i] = prop

        dependence_results.append(dependence_results_run)
        inference_results.append(inference_results_run)

    # calculate means and standard errors (dependence)
    dep_means = numpy.zeros((n_queries, num_iters))
    dep_error = numpy.zeros((n_queries, num_iters))
    for i in range(num_iters):
        X = numpy.zeros((n_queries, num_runs))
        for r in range(num_runs):
            X[:, r] = dependence_results[r][:, i]
        dep_means[:, i] = numpy.mean(X, axis=1)
        dep_error[:, i] = numpy.std(X, axis=1) / float(num_runs) ** .5

    # calculate means and standard errors (infer)
    inf_means = numpy.zeros((n_queries, num_iters))
    inf_error = numpy.zeros((n_queries, num_iters))
    for i in range(num_iters):
        X = numpy.zeros((n_queries, num_runs))
        for r in range(num_runs):
            X[:, r] = inference_results[r][:, i]
        inf_means[:, i] = numpy.mean(X, axis=1)
        inf_error[:, i] = numpy.std(X, axis=1) / float(num_runs) ** .5

    result = dict()
    result['config'] = argin
    result['num_queries'] = n_queries
    result['iteration'] = range(1, num_iters + 1)
    result['dependence_probability_mean'] = dep_means
    result['dependence_probability_error'] = dep_error
    result['infer_means'] = inf_means
    result['infer_stderr'] = inf_error
    return result
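# _get_prop_inferred is referenced above but not defined in this section. The
# sketch below is a hypothetical reconstruction: it assumes `data` holds the
# INFER result rows as (row_id, value) pairs, `indices` is a (rows, cols) pair
# of missing-cell coordinates, and `column_query` is the queried column index.
# It returns the proportion of that column's missing cells that received an
# inferred value at the given confidence.
def _get_prop_inferred(data, indices, column_query):
    missing = 0.0
    inferred = 0.0
    for row, col in zip(indices[0], indices[1]):
        if col == column_query:
            missing += 1.0
            if data[row][1] is not None:  # assumed marker for a still-missing cell
                inferred += 1.0
    return inferred / missing if missing > 0 else 0.0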
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    impute_samples = argin["impute_samples"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    filename = "exp_fills_in_ofile.csv"
    table_name = 'exp_fills_in'

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv for each proportion of missing data
    all_filenames = []
    all_indices = []
    for p in prop_missing:
        data_filename, indices, col_names, extra = eu.gen_missing_data_csv(filename, p, [], True)
        all_indices.append(indices)
        all_filenames.append(data_filename)

    # get the full starting table so we can calculate errors
    T_array = extra['array_filled']
    num_rows, num_cols = T_array.shape

    # create a client
    client = Client()

    # set up a dict for the results of the different configurations
    result = dict()
    result['cc'] = numpy.zeros(len(prop_missing))
    result['crp'] = numpy.zeros(len(prop_missing))
    result['nb'] = numpy.zeros(len(prop_missing))

    # do the analyses
    for p in range(len(prop_missing)):
        this_indices = all_indices[p]
        this_filename = all_filenames[p]
        for config in ['cc', 'crp', 'nb']:
            config_string = eu.config_map[config]
            table = table_name + '-' + config

            # drop the old btable, create a new one with the new data, and init models
            client('DROP BTABLE %s;' % table, yes=True)
            client('CREATE BTABLE %s FROM %s;' % (table, this_filename))
            client('INITIALIZE %i MODELS FOR %s %s;' % (num_chains, table, config_string))

            if ct_kernel == 1:
                client('ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;' % (table, num_iters))
            else:
                client('ANALYZE %s FOR %i ITERATIONS WAIT;' % (table, num_iters))

            MSE = 0.0
            count = 0.0

            # impute each missing index and calculate the squared error
            for col in range(0, num_cols):
                col_name = col_names[col]
                # confidence is set to zero so that a value is always returned
                out = client('INFER %s from %s WITH CONFIDENCE %f WITH %i SAMPLES;' %
                             (col_name, table, 0, impute_samples),
                             pretty=False, pandas_output=False)
                data = out[0]['data']

                # calculate the MSE over the cells that were missing
                for row, tcol in zip(this_indices[0], this_indices[1]):
                    if tcol == col:
                        MSE += (T_array[row, col] - data[row][1]) ** 2.0
                        count += 1.0

            result[config][p] = MSE / count
            print "error = %f" % result[config][p]

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['nb']
    retval['MSE_crp_mixture_indexer'] = result['crp']
    retval['MSE_crosscat_indexer'] = result['cc']
    retval['prop_missing'] = prop_missing
    retval['config'] = argin
    return retval
from bayesdb.client import Client

client = Client()
client('DROP BTABLE dialysisai;')
client('CREATE BTABLE dialysisai FROM learn_data.csv;')
client('UPDATE DATATYPES FROM dialysisai SET PROG_DURATION=continuous, BLOOD_VOLUME=continuous, REAL_SYMPTOM_ID=ignore;')
client('CREATE 20 MODELS FOR dialysisai;')
client('ANALYZE dialysisai FOR 100 ITERATIONS;')
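# A hypothetical follow-up once the ANALYZE above completes: infer one of the
# columns declared in the UPDATE DATATYPES statement. The INFER ... WITH
# CONFIDENCE form mirrors the experiment scripts in this repo; the 0.9
# threshold is an illustrative assumption.
print client('INFER PROG_DURATION FROM dialysisai WITH CONFIDENCE 0.9;', pretty=False)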
def run_example():
    client = Client()
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, 'flights_analysis.bql')
    print "\nA series of BQL commands will be displayed. Hit <Enter> to execute the displayed command.\n"
    client(open(file_path, 'r'), wait=True)
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    multinomial_categories = argin["multinomial_categories"]
    seed = argin["seed"]

    random.seed(seed)

    # TODO: use dha.csv
    ofilename = "reasonably_calibrated_ofile.csv"
    table_name = 'reasonably_calibrated'

    argin['distargs'] = [{"K": multinomial_categories}] * num_cols
    argin['cctypes'] = ['multinomial'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    T_array, structure = eu.gen_data(ofilename, argin, save_csv=True)
    filename, indices, col_names = eu.gen_missing_data_csv(ofilename, prop_missing, [])

    # create a client
    client = Client()

    # calculate the empirical frequency of each category among the missing cells
    frequencies = []
    for col in range(num_cols):
        frequencies.append(numpy.zeros(multinomial_categories))
    T_int = numpy.array(T_array, dtype=int)
    n_indices = len(indices[0])
    for i in range(n_indices):
        r = indices[0][i]
        c = indices[1][i]
        x = T_int[r, c]
        frequencies[c][x] += 1.0
    frequencies = [f / numpy.sum(f) for f in frequencies]

    # set up a dict for the results of the different configurations
    result = dict()

    # do the analyses
    for config in ['cc', 'crp', 'nb']:
        config_string = eu.config_map[config]
        table = table_name + '-' + config

        # drop the old btable, create a new one with the new data, and init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, filename))
        client('INITIALIZE %i MODELS FOR %s %s;' % (num_chains, table, config_string))

        if ct_kernel == 1:
            client('ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;' % (table, num_iters))
        else:
            client('ANALYZE %s FOR %i ITERATIONS WAIT;' % (table, num_iters))

        # infer each missing cell and tally the inferred category frequencies
        results_config = []
        for col in range(num_cols):
            results_config.append(numpy.zeros(multinomial_categories))
        for col in range(num_cols):
            col_name = col_names[col]
            out = client("INFER %s FROM %s WITH CONFIDENCE .95 WITH 1 SAMPLES;" % (col_name, table),
                         pretty=False, pandas_output=False)
            for i in range(n_indices):
                r = indices[0][i]
                c = indices[1][i]
                if c == col:
                    x = out[0]['data'][r][1]
                    results_config[c][int(x)] += 1.0
        results_config = [f / sum(f) for f in results_config]
        result[config] = results_config

    retval = dict()
    retval['actual_frequencies'] = frequencies
    retval['inferred_P_cc'] = result['cc']
    retval['inferred_P_crp'] = result['crp']
    retval['inferred_P_nb'] = result['nb']
    retval['config'] = argin
    return retval
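# A hypothetical configuration for the calibration experiment above. The keys
# match the argin lookups in run_experiment; the values are illustrative
# assumptions only, not settings from the original experiments.
if __name__ == '__main__':
    example_argin = {
        "num_iters": 100,
        "num_chains": 8,
        "num_rows": 300,
        "num_cols": 4,
        "num_views": 2,
        "num_clusters": 4,
        "prop_missing": 0.2,
        "separation": 0.8,
        "ct_kernel": 0,               # 1 selects the MH kernel
        "multinomial_categories": 5,  # K for each multinomial column
        "seed": 0,
    }
    retval = run_experiment(example_argin)
    print retval['actual_frequencies']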