# NOTE: the import paths below are assumptions; these functions come from the
# legacy BayesDB experiment scripts, which provided a Client and an
# experiment_utils module alongside them.
import random

import numpy

from bayesdb.client import Client
import experiment_utils as eu


# Experiment: is imputation reasonably calibrated? Generates multinomial data,
# holds out a fraction of cells, and compares the empirical category
# frequencies at the held-out cells against the frequencies inferred under
# each model configuration.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    multinomial_categories = argin["multinomial_categories"]
    seed = argin["seed"]

    random.seed(seed)

    # TODO: use dha.csv
    ofilename = "reasonably_calibrated_ofile.csv"
    table_name = "reasonably_calibrated"

    argin["distargs"] = [{"K": multinomial_categories}] * num_cols
    argin["cctypes"] = ["multinomial"] * num_cols
    argin["separation"] = [argin["separation"]] * num_views

    T_array, structure = eu.gen_data(ofilename, argin, save_csv=True)
    filename, indices, col_names = eu.gen_missing_data_csv(
        ofilename, prop_missing, [])

    # create a client
    client = Client()

    # calculate the empirical frequency of each category at the held-out cells
    frequencies = []
    for col in range(num_cols):
        frequencies.append(numpy.zeros(multinomial_categories))
    T_int = numpy.array(T_array, dtype=int)
    n_indices = len(indices[0])
    for i in range(n_indices):
        r = indices[0][i]
        c = indices[1][i]
        x = T_int[r, c]
        frequencies[c][x] += 1.0
    frequencies = [f / numpy.sum(f) for f in frequencies]

    # set up a dict for the results of each configuration
    result = dict()

    # do analyses
    for config in ["cc", "crp", "nb"]:
        config_string = eu.config_map[config]
        table = table_name + "-" + config

        # drop old btable, create a new one with the new data, and init models
        client("DROP BTABLE %s;" % table, yes=True)
        client("CREATE BTABLE %s FROM %s;" % (table, filename))
        client("INITIALIZE %i MODELS FOR %s %s;"
               % (num_chains, table, config_string))

        if ct_kernel == 1:
            client("ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;"
                   % (table, num_iters))
        else:
            client("ANALYZE %s FOR %i ITERATIONS WAIT;" % (table, num_iters))

        # infer each held-out cell and tally the inferred categories
        results_config = []
        for col in range(num_cols):
            results_config.append(numpy.zeros(multinomial_categories))
        for col in range(num_cols):
            col_name = col_names[col]
            out = client(
                "INFER %s FROM %s WITH CONFIDENCE .95 WITH 1 SAMPLES;"
                % (col_name, table),
                pretty=False, pandas_output=False)
            for i in range(n_indices):
                r = indices[0][i]
                c = indices[1][i]
                if c == col:
                    x = out[0]["data"][r][1]
                    results_config[c][int(x)] += 1.0
        results_config = [f / sum(f) for f in results_config]

        result[config] = results_config

    retval = dict()
    retval["actual_frequencies"] = frequencies
    retval["inferred_P_cc"] = result["cc"]
    retval["inferred_P_crp"] = result["crp"]
    retval["inferred_P_nb"] = result["nb"]
    retval["config"] = argin

    return retval
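
# A minimal invocation sketch for the calibration experiment above. The keys
# mirror the argin lookups at the top of run_experiment; the values are
# illustrative assumptions, not settings from the original experiments.
#
#   argin = {
#       "num_iters": 200, "num_chains": 8,
#       "num_rows": 300, "num_cols": 4,
#       "num_views": 2, "num_clusters": 4,
#       "prop_missing": 0.25, "separation": 0.9,
#       "ct_kernel": 0,  # 1 selects the MH kernel
#       "multinomial_categories": 5, "seed": 448,
#   }
#   retval = run_experiment(argin)
#   # compare empirical vs. inferred category frequencies for column 0
#   print(retval["actual_frequencies"][0])
#   print(retval["inferred_P_cc"][0])
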
# Experiment: how accurately does imputation fill in missing continuous
# values? Holds out increasing proportions of cells and reports the MSE of
# the imputed values under each model configuration.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    impute_samples = argin["impute_samples"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    filename = "exp_fills_in_ofile.csv"
    table_name = 'exp_fills_in'

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv for each proportion of missing data
    all_filenames = []
    all_indices = []
    for p in prop_missing:
        data_filename, indices, col_names, extra = eu.gen_missing_data_csv(
            filename, p, [], True)
        all_indices.append(indices)
        all_filenames.append(data_filename)

    # get the starting table so we can calculate errors
    T_array = extra['array_filled']
    num_rows, num_cols = T_array.shape

    # create a client
    client = Client()

    # set up a dict for the results of each configuration
    result = dict()
    result['cc'] = numpy.zeros(len(prop_missing))
    result['crp'] = numpy.zeros(len(prop_missing))
    result['nb'] = numpy.zeros(len(prop_missing))

    # do analyses
    for p in range(len(prop_missing)):
        this_indices = all_indices[p]
        this_filename = all_filenames[p]
        for config in ['cc', 'crp', 'nb']:
            config_string = eu.config_map[config]
            table = table_name + '-' + config

            # drop old btable, create a new one with the new data, and init models
            client('DROP BTABLE %s;' % table, yes=True)
            client('CREATE BTABLE %s FROM %s;' % (table, this_filename))
            client('INITIALIZE %i MODELS FOR %s %s;'
                   % (num_chains, table, config_string))

            if ct_kernel == 1:
                client('ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;'
                       % (table, num_iters))
            else:
                client('ANALYZE %s FOR %i ITERATIONS WAIT;'
                       % (table, num_iters))

            MSE = 0.0
            count = 0.0

            # impute each index in indices and calculate the squared error
            for col in range(num_cols):
                col_name = col_names[col]
                # confidence is set to zero so that a value is always returned
                out = client('INFER %s FROM %s WITH CONFIDENCE %f WITH %i SAMPLES;'
                             % (col_name, table, 0, impute_samples),
                             pretty=False, pandas_output=False)
                data = out[0]['data']

                # accumulate the squared error for the MSE
                for row, tcol in zip(this_indices[0], this_indices[1]):
                    if tcol == col:
                        MSE += (T_array[row, col] - data[row][1]) ** 2.0
                        count += 1.0

            result[config][p] = MSE / count
            print("error = %f" % result[config][p])

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['nb']
    retval['MSE_crp_mixture_indexer'] = result['crp']
    retval['MSE_crosscat_indexer'] = result['cc']
    retval['prop_missing'] = prop_missing
    retval['config'] = argin

    return retval
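
# A minimal invocation sketch for the fills-in experiment above; the values
# are illustrative assumptions. Note that prop_missing is a list here: one
# MSE is reported per missing-data proportion and per configuration.
#
#   argin = {
#       "num_iters": 200, "num_chains": 8,
#       "num_rows": 300, "num_cols": 4,
#       "num_views": 2, "num_clusters": 4,
#       "prop_missing": [0.1, 0.25, 0.5],
#       "impute_samples": 100, "separation": 0.9,
#       "ct_kernel": 0, "seed": 448,
#   }
#   retval = run_experiment(argin)
#   print(retval["MSE_crosscat_indexer"])  # one entry per prop_missing value
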
# Experiment: how well is the joint distribution estimated? Holds out the
# bottom row, then tracks, after each ANALYZE iteration, the predictive
# probability of the held-out values and its squared error against the true
# probability under the generating model.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    separation = argin["separation"]
    seed = argin["seed"]
    ct_kernel = argin["ct_kernel"]

    if seed > 0:
        random.seed(seed)

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    # have to generate synthetic data
    filename = "exp_estimate_joint_ofile.csv"
    table_name = 'exp_estimate_joint'

    # generate starting data
    T_o, structure = eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv with the bottom row removed (held-out data)
    data_filename = 'exp_estimate_joint.csv'
    T_h = eu.gen_held_out_data(filename, data_filename, 1)

    # get the column names
    with open(filename, 'r') as f:
        csv_header = f.readline()
    col_names = csv_header.split(',')
    col_names[-1] = col_names[-1].strip()

    # set up a dict for the results of each configuration
    result = dict()

    # true probability of each held-out value under the generating model
    true_held_out_p = []
    for col in range(num_cols):
        x = T_o[-1, col]
        logp = eu.get_true_logp(numpy.array([x]), col, structure)
        true_held_out_p.append(numpy.exp(logp))

    # start a client
    client = Client()

    # do analyses
    for config in ['cc', 'crp', 'nb']:
        config_string = eu.config_map[config]
        table = table_name + '-' + config

        # drop old btable, create a new one with the new data, and init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, data_filename))
        client('INITIALIZE %i MODELS FOR %s %s;'
               % (num_chains, table, config_string))

        these_ps = numpy.zeros(num_iters)
        these_ps_errors = numpy.zeros(num_iters)
        for i in range(num_iters):
            if ct_kernel == 1:
                client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
            else:
                client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

            # query the predictive probability of each held-out value and
            # calculate the squared error against the true probability
            mean_p = []
            mean_p_error = []
            for col in range(num_cols):
                col_name = col_names[col]
                x = T_o[-1, col]
                out = client('SELECT PROBABILITY OF %s=%f FROM %s;'
                             % (col_name, x, table),
                             pretty=False, pandas_output=False)
                p = out[0]['data'][0][1]
                mean_p.append(p)
                mean_p_error.append((true_held_out_p[col] - p) ** 2.0)

            these_ps[i] = numpy.mean(mean_p)
            these_ps_errors[i] = numpy.mean(mean_p_error)

        key_str_p = 'mean_held_out_p_' + config
        key_str_error = 'mean_error_' + config
        result[key_str_p] = these_ps
        result[key_str_error] = these_ps_errors

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['mean_error_nb']
    retval['MSE_crp_mixture_indexer'] = result['mean_error_crp']
    retval['MSE_crosscat_indexer'] = result['mean_error_cc']
    retval['MEAN_P_naive_bayes_indexer'] = result['mean_held_out_p_nb']
    retval['MEAN_P_crp_mixture_indexer'] = result['mean_held_out_p_crp']
    retval['MEAN_P_crosscat_indexer'] = result['mean_held_out_p_cc']
    retval['config'] = argin

    return retval
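
# A minimal invocation sketch for the joint-estimation experiment above; the
# values are illustrative assumptions. Each MEAN_P_* / MSE_* entry in the
# returned dict is a length-num_iters trace, one value per ANALYZE iteration.
#
#   argin = {
#       "num_iters": 200, "num_chains": 8,
#       "num_rows": 300, "num_cols": 4,
#       "num_views": 2, "num_clusters": 4,
#       "separation": 0.9, "ct_kernel": 0, "seed": 448,
#   }
#   retval = run_experiment(argin)
#   # mean held-out probability after the final sweep
#   print(retval["MEAN_P_crosscat_indexer"][-1])
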