# NOTE: the import paths below are assumptions; these functions come from the
# legacy BayesDB experiment scripts, which provided a Client and an
# experiment_utils module alongside them.
import random

import numpy

from bayesdb.client import Client
import experiment_utils as eu


# Experiment: is imputation reasonably calibrated? Generates multinomial data,
# holds out a fraction of cells, and compares the empirical category
# frequencies at the held-out cells against the frequencies inferred under
# each model configuration.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    multinomial_categories = argin["multinomial_categories"]
    seed = argin["seed"]

    random.seed(seed)

    # TODO: use dha.csv
    ofilename = "reasonably_calibrated_ofile.csv"
    table_name = "reasonably_calibrated"

    argin["distargs"] = [{"K": multinomial_categories}] * num_cols
    argin["cctypes"] = ["multinomial"] * num_cols
    argin["separation"] = [argin["separation"]] * num_views

    T_array, structure = eu.gen_data(ofilename, argin, save_csv=True)
    filename, indices, col_names = eu.gen_missing_data_csv(
        ofilename, prop_missing, [])

    # create a client
    client = Client()

    # calculate the empirical frequency of each category at the held-out cells
    frequencies = []
    for col in range(num_cols):
        frequencies.append(numpy.zeros(multinomial_categories))
    T_int = numpy.array(T_array, dtype=int)
    n_indices = len(indices[0])
    for i in range(n_indices):
        r = indices[0][i]
        c = indices[1][i]
        x = T_int[r, c]
        frequencies[c][x] += 1.0
    frequencies = [f / numpy.sum(f) for f in frequencies]

    # set up a dict for the results of each configuration
    result = dict()

    # do analyses
    for config in ["cc", "crp", "nb"]:
        config_string = eu.config_map[config]
        table = table_name + "-" + config

        # drop old btable, create a new one with the new data, and init models
        client("DROP BTABLE %s;" % table, yes=True)
        client("CREATE BTABLE %s FROM %s;" % (table, filename))
        client("INITIALIZE %i MODELS FOR %s %s;"
               % (num_chains, table, config_string))

        if ct_kernel == 1:
            client("ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;"
                   % (table, num_iters))
        else:
            client("ANALYZE %s FOR %i ITERATIONS WAIT;" % (table, num_iters))

        # infer each held-out cell and tally the inferred categories
        results_config = []
        for col in range(num_cols):
            results_config.append(numpy.zeros(multinomial_categories))
        for col in range(num_cols):
            col_name = col_names[col]
            out = client(
                "INFER %s FROM %s WITH CONFIDENCE .95 WITH 1 SAMPLES;"
                % (col_name, table),
                pretty=False, pandas_output=False)
            for i in range(n_indices):
                r = indices[0][i]
                c = indices[1][i]
                if c == col:
                    x = out[0]["data"][r][1]
                    results_config[c][int(x)] += 1.0
        results_config = [f / sum(f) for f in results_config]

        result[config] = results_config

    retval = dict()
    retval["actual_frequencies"] = frequencies
    retval["inferred_P_cc"] = result["cc"]
    retval["inferred_P_crp"] = result["crp"]
    retval["inferred_P_nb"] = result["nb"]
    retval["config"] = argin

    return retval
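
# A minimal invocation sketch for the calibration experiment above. The keys
# mirror the argin lookups at the top of run_experiment; the values are
# illustrative assumptions, not settings from the original experiments.
#
#   argin = {
#       "num_iters": 200, "num_chains": 8,
#       "num_rows": 300, "num_cols": 4,
#       "num_views": 2, "num_clusters": 4,
#       "prop_missing": 0.25, "separation": 0.9,
#       "ct_kernel": 0,  # 1 selects the MH kernel
#       "multinomial_categories": 5, "seed": 448,
#   }
#   retval = run_experiment(argin)
#   # compare empirical vs. inferred category frequencies for column 0
#   print(retval["actual_frequencies"][0])
#   print(retval["inferred_P_cc"][0])
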
# Experiment: how accurately does imputation fill in missing continuous
# values? Holds out increasing proportions of cells and reports the MSE of
# the imputed values under each model configuration.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    prop_missing = argin["prop_missing"]
    impute_samples = argin["impute_samples"]
    separation = argin["separation"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    filename = "exp_fills_in_ofile.csv"
    table_name = 'exp_fills_in'

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv for each proportion of missing data
    all_filenames = []
    all_indices = []
    for p in prop_missing:
        data_filename, indices, col_names, extra = eu.gen_missing_data_csv(
            filename, p, [], True)
        all_indices.append(indices)
        all_filenames.append(data_filename)

    # get the starting table so we can calculate errors
    T_array = extra['array_filled']
    num_rows, num_cols = T_array.shape

    # create a client
    client = Client()

    # set up a dict for the results of each configuration
    result = dict()
    result['cc'] = numpy.zeros(len(prop_missing))
    result['crp'] = numpy.zeros(len(prop_missing))
    result['nb'] = numpy.zeros(len(prop_missing))

    # do analyses
    for p in range(len(prop_missing)):
        this_indices = all_indices[p]
        this_filename = all_filenames[p]
        for config in ['cc', 'crp', 'nb']:
            config_string = eu.config_map[config]
            table = table_name + '-' + config

            # drop old btable, create a new one with the new data, and init models
            client('DROP BTABLE %s;' % table, yes=True)
            client('CREATE BTABLE %s FROM %s;' % (table, this_filename))
            client('INITIALIZE %i MODELS FOR %s %s;'
                   % (num_chains, table, config_string))

            if ct_kernel == 1:
                client('ANALYZE %s FOR %i ITERATIONS WITH MH KERNEL WAIT;'
                       % (table, num_iters))
            else:
                client('ANALYZE %s FOR %i ITERATIONS WAIT;'
                       % (table, num_iters))

            MSE = 0.0
            count = 0.0

            # impute each index in indices and calculate the squared error
            for col in range(num_cols):
                col_name = col_names[col]
                # confidence is set to zero so that a value is always returned
                out = client('INFER %s FROM %s WITH CONFIDENCE %f WITH %i SAMPLES;'
                             % (col_name, table, 0, impute_samples),
                             pretty=False, pandas_output=False)
                data = out[0]['data']

                # accumulate the squared error for the MSE
                for row, tcol in zip(this_indices[0], this_indices[1]):
                    if tcol == col:
                        MSE += (T_array[row, col] - data[row][1]) ** 2.0
                        count += 1.0

            result[config][p] = MSE / count
            print("error = %f" % result[config][p])

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['nb']
    retval['MSE_crp_mixture_indexer'] = result['crp']
    retval['MSE_crosscat_indexer'] = result['cc']
    retval['prop_missing'] = prop_missing
    retval['config'] = argin

    return retval
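
# A minimal invocation sketch for the fills-in experiment above; the values
# are illustrative assumptions. Note that prop_missing is a list here: one
# MSE is reported per missing-data proportion and per configuration.
#
#   argin = {
#       "num_iters": 200, "num_chains": 8,
#       "num_rows": 300, "num_cols": 4,
#       "num_views": 2, "num_clusters": 4,
#       "prop_missing": [0.1, 0.25, 0.5],
#       "impute_samples": 100, "separation": 0.9,
#       "ct_kernel": 0, "seed": 448,
#   }
#   retval = run_experiment(argin)
#   print(retval["MSE_crosscat_indexer"])  # one entry per prop_missing value
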
# Experiment: how well is the joint distribution estimated? Holds out the
# bottom row, then tracks, after each ANALYZE iteration, the predictive
# probability of the held-out values and its squared error against the true
# probability under the generating model.
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    num_views = argin["num_views"]
    num_clusters = argin["num_clusters"]
    separation = argin["separation"]
    seed = argin["seed"]
    ct_kernel = argin["ct_kernel"]

    if seed > 0:
        random.seed(seed)

    argin['cctypes'] = ['continuous'] * num_cols
    argin['separation'] = [argin['separation']] * num_views

    # have to generate synthetic data
    filename = "exp_estimate_joint_ofile.csv"
    table_name = 'exp_estimate_joint'

    # generate starting data
    T_o, structure = eu.gen_data(filename, argin, save_csv=True)

    # generate a new csv with the bottom row removed (held-out data)
    data_filename = 'exp_estimate_joint.csv'
    T_h = eu.gen_held_out_data(filename, data_filename, 1)

    # get the column names
    with open(filename, 'r') as f:
        csv_header = f.readline()
    col_names = csv_header.split(',')
    col_names[-1] = col_names[-1].strip()

    # set up a dict for the results of each configuration
    result = dict()

    # true probability of each held-out value under the generating model
    true_held_out_p = []
    for col in range(num_cols):
        x = T_o[-1, col]
        logp = eu.get_true_logp(numpy.array([x]), col, structure)
        true_held_out_p.append(numpy.exp(logp))

    # start a client
    client = Client()

    # do analyses
    for config in ['cc', 'crp', 'nb']:
        config_string = eu.config_map[config]
        table = table_name + '-' + config

        # drop old btable, create a new one with the new data, and init models
        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, data_filename))
        client('INITIALIZE %i MODELS FOR %s %s;'
               % (num_chains, table, config_string))

        these_ps = numpy.zeros(num_iters)
        these_ps_errors = numpy.zeros(num_iters)
        for i in range(num_iters):
            if ct_kernel == 1:
                client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
            else:
                client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

            # query the predictive probability of each held-out value and
            # calculate the squared error against the true probability
            mean_p = []
            mean_p_error = []
            for col in range(num_cols):
                col_name = col_names[col]
                x = T_o[-1, col]
                out = client('SELECT PROBABILITY OF %s=%f FROM %s;'
                             % (col_name, x, table),
                             pretty=False, pandas_output=False)
                p = out[0]['data'][0][1]
                mean_p.append(p)
                mean_p_error.append((true_held_out_p[col] - p) ** 2.0)

            these_ps[i] = numpy.mean(mean_p)
            these_ps_errors[i] = numpy.mean(mean_p_error)

        key_str_p = 'mean_held_out_p_' + config
        key_str_error = 'mean_error_' + config
        result[key_str_p] = these_ps
        result[key_str_error] = these_ps_errors

    retval = dict()
    retval['MSE_naive_bayes_indexer'] = result['mean_error_nb']
    retval['MSE_crp_mixture_indexer'] = result['mean_error_crp']
    retval['MSE_crosscat_indexer'] = result['mean_error_cc']
    retval['MEAN_P_naive_bayes_indexer'] = result['mean_held_out_p_nb']
    retval['MEAN_P_crp_mixture_indexer'] = result['mean_held_out_p_crp']
    retval['MEAN_P_crosscat_indexer'] = result['mean_held_out_p_cc']
    retval['config'] = argin

    return retval
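
# A minimal invocation sketch for the joint-estimation experiment above; the
# values are illustrative assumptions. Each MEAN_P_* / MSE_* entry in the
# returned dict is a length-num_iters trace, one value per ANALYZE iteration.
#
#   argin = {
#       "num_iters": 200, "num_chains": 8,
#       "num_rows": 300, "num_cols": 4,
#       "num_views": 2, "num_clusters": 4,
#       "separation": 0.9, "ct_kernel": 0, "seed": 448,
#   }
#   retval = run_experiment(argin)
#   # mean held-out probability after the final sweep
#   print(retval["MEAN_P_crosscat_indexer"][-1])
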