Exemplo n.º 1
0
def run_experiment(argin):
    """Run one "needles" dependence-probability experiment.

    Builds a data table (synthetic data with two dependent column pairs when
    ``needles`` is set, noise otherwise), writes it to ``needles_exp.csv``,
    loads it as the btable ``needles_exp`` and, after every single ANALYZE
    iteration, stores the answer of each dependence query.

    Args:
        argin: dict of experiment settings. Keys read: ``num_iters``,
            ``num_chains``, ``num_rows``, ``num_cols`` (at least 4, because
            the first four columns are used as needles), ``needles``,
            ``mixed_types``, ``multinomial_categories``,
            ``num_indep_queries``, ``independent_clusters``, ``ct_kernel``
            and ``seed``. ``with_id`` and ``separation`` may be present but
            do not influence the run. ``argin`` itself is not modified.

    Returns:
        dict with the keys
        ``query_col1`` / ``query_col2`` (first / second entry of every
        queried pair), ``dependence_probs`` (``num_iters`` x number of
        queries array), ``cols_independent`` (one bool per query, all True
        when ``needles`` is false) and ``config`` (a copy of ``argin`` with
        ``data_mode`` added).
    """
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    # A non-positive seed leaves the generator unseeded.
    if seed > 0:
        random.seed(seed)

    # generate column header
    col_names = ["col_%i" % i for i in range(num_cols)]

    if multinomial_categories > 0:
        data_mode = 'mixed' if mixed_types else 'multinomial'
    else:
        data_mode = 'continuous'

    if needles:
        # Column -> view assignment: columns 0,1 share view 0 and columns 2,3
        # share view 1 (the needles); every other column gets its own view.
        Zv = [0, 0, 1, 1]
        Zv.extend(range(2, num_cols - 2))
        num_views = max(Zv) + 1

        # The configured "separation" is not used here: the two needle views
        # are fixed at 0.95 and all remaining views at 0.0.
        separation = [.95] * 2 + [0.0] * (num_views - 2)

        # NOTE: randrange excludes its upper bound, so with these limits the
        # draw always yields 4 clusters.
        min_clusters = 4
        max_clusters = 5

        # Uniform cluster weights per view.
        cluster_weights = []
        for view in range(num_views):
            if view < 2 or independent_clusters:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        cctypes, distargs = eu.get_column_types(
            data_mode, num_cols, multinomial_categories)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights,
                            separation, distargs=distargs)
    else:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)

    # Prepending a row-id column is not supported; the needles are always
    # the first four columns.
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    # save the table, header row first, to .csv
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(
        needle_a_cols, needle_b_cols, col_names, table, num_indep_queries)
    num_queries = len(queries)

    dependence_probs = numpy.zeros((num_iters, num_queries))

    client = Client()

    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(init_string)
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    if ct_kernel == 1:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table
    else:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WAIT;' % table

    # do the analyses: one iteration at a time, querying after each
    for i in range(num_iters):
        client(analyze_string)

        for q, query in enumerate(queries):
            out = client(query, pretty=False, pandas_output=False)
            # presumably the dependence probability of the queried pair;
            # verify against the client's result layout
            dependence_probs[i, q] = out[0]['data'][0][1]

    result = dict()
    # store the queries in result
    result['query_col1'] = [pair[0] for pair in pairs]
    result['query_col2'] = [pair[1] for pair in pairs]
    result['dependence_probs'] = dependence_probs

    # for each query, record whether those columns were actually independent
    # (columns assigned to the same view are dependent)
    independent = [True] * num_queries
    if needles:
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False

    result['cols_independent'] = independent

    # Store a copy so the caller's settings dict is left untouched.
    config = dict(argin)
    config['data_mode'] = data_mode
    result['config'] = config

    client('SHOW DIAGNOSTICS FOR %s;' % table)

    return result
Exemplo n.º 2
0
def run_experiment(argin):
    """Run the "haystack break" experiment over growing numbers of distractors.

    Generates one data table whose first four columns are the needles
    (columns 0,1 and columns 2,3) followed by ``max_cols`` distractor
    columns, writes it to ``expdata/hb/haystack_break_exp.csv``, and then,
    for each distractor count in the step list, writes the column subset to
    its own csv, loads it as the btable ``haystack_break_exp`` and records
    the answer of every dependence query before any analysis and after each
    single ANALYZE iteration.

    Args:
        argin: dict of experiment settings. Keys read: ``num_iters``,
            ``num_chains``, ``num_rows``, ``max_cols`` (number of distractor
            columns, at least 4), ``rho`` (off-diagonal entry of the 2x2
            covariance used for each needle pair), ``num_indep_queries``,
            ``ct_kernel``, ``multimodal``, ``separation`` (only used when
            ``multimodal`` is set) and ``seed``.

    Returns:
        dict with the keys ``steps`` (one dict per distractor count holding
        ``query_col1``, ``query_col2``, ``dependence_probs`` of shape
        ``(num_iters + 1, num_queries)``, ``cols_independent`` and
        ``distractor_cols``), ``config`` (``argin``) and ``data`` (the
        generated table as a numpy array, without the header row).

    Raises:
        ValueError: if ``max_cols`` is smaller than 4.
    """
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    max_cols = argin["max_cols"]
    rho = argin["rho"]
    num_indep_queries = argin["num_indep_queries"]
    ct_kernel = argin["ct_kernel"]
    multimodal = argin["multimodal"]
    separation = argin["separation"]
    seed = argin["seed"]

    # Fail before generating data or touching the file system: the smallest
    # step is always 4 distractor columns.
    if max_cols < 4:
        raise ValueError("max_cols must be at least 4, got %r" % (max_cols,))

    all_cols = max_cols + 4  # max_cols plus number of dependent columns

    # A non-positive seed leaves both generators unseeded.
    if seed > 0:
        random.seed(seed)
        numpy.random.seed(seed)

    # build full data file
    # generate column header
    col_names = ["col_%i" % i for i in range(all_cols)]

    # Column -> view assignment: columns 0,1 share view 0 and columns 2,3
    # share view 1 (the needles); every other column gets its own view.
    Zv = [0, 0, 1, 1]
    Zv.extend(range(2, all_cols - 2))

    if multimodal:
        # NOTE(review): shuffling moves the two-column views away from the
        # first four columns, while the needle queries below still use
        # col_0..col_3 -- confirm this is intended.
        random.shuffle(Zv)

        num_views = max(Zv) + 1

        # One separation value per view, all equal to the configured one.
        view_separation = [separation] * num_views

        # NOTE: randrange excludes its upper bound, so with these limits the
        # draw always yields 4 clusters.
        min_clusters = 4
        max_clusters = 5

        # Uniform cluster weights per view; only views 0 and 1 get several
        # clusters.
        cluster_weights = []
        for view in range(num_views):
            if view < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        # NOTE(review): the column types are assumed to be continuous, to
        # match the data produced by the non-multimodal branch -- confirm.
        cctypes, distargs = eu.get_column_types('continuous', all_cols, 0)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights,
                            view_separation, distargs=distargs)
        T_array = numpy.array(T)
    else:
        T_array = numpy.zeros((num_rows, all_cols))

        # Each needle pair is a draw from a zero-mean bivariate normal with
        # correlation rho.
        Sigma = numpy.array([[1.0, rho], [rho, 1.0]])
        mu = numpy.array([0, 0])
        T_array[:, 0:1 + 1] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        T_array[:, 2:3 + 1] = numpy.random.multivariate_normal(mu, Sigma, num_rows)

        # The configured separation is not used in this branch; the mode
        # spacing of the distractor columns is fixed.
        separation = .5
        min_clusters = 3
        max_clusters = 10
        for col in range(4, all_cols):
            num_clusters = random.randrange(min_clusters, max_clusters) + 1
            for row in range(num_rows):
                # unit-variance normal around one of num_clusters modes
                k = random.randrange(num_clusters)
                T_array[row, col] = numpy.random.randn() + k * 6 * separation

        T = T_array.tolist()

    # save file to .csv
    exp_path = 'expdata/hb/'
    eu.make_folder(exp_path)
    filename = exp_path + "haystack_break_exp.csv"
    table = "haystack_break_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)
    # done building data file

    # get column step sizes: powers of two starting at 4, then max_cols
    num_steps = int(math.log(max_cols, 2)) - 1
    step_size = [2 ** t for t in range(2, num_steps + 1)]
    if not step_size:
        # max_cols in 4..7 produces no power-of-two step; start at 4 anyway
        step_size = [4]
    if step_size[-1] < max_cols:
        step_size.append(max_cols)

    assert step_size[0] == 4 and step_size[-1] == max_cols

    # the needle column names
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    if ct_kernel == 1:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table
    else:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WAIT;' % table

    result = dict()
    result['steps'] = []

    for num_distractor_columns in step_size:
        # create subdata: the four needle columns plus the first distractors
        T_sub = take_T_column_subset(T, range(4 + num_distractor_columns))
        subpath = exp_path + 'd_' + str(num_distractor_columns) + '/'
        eu.make_folder(subpath)
        subfilename = subpath + "haystack_break_exp_" + str(num_distractor_columns) + ".csv"
        eu.list_to_csv(subfilename, T_sub)

        col_names_sub = T_sub[0]

        # generate queries
        queries, pairs = generate_dependence_queries(
            needle_a_cols, needle_b_cols, col_names_sub, table,
            num_indep_queries)
        num_queries = len(queries)

        # row 0 holds the answers before any analysis
        dependence_probs = numpy.zeros((num_iters + 1, num_queries))

        client = Client()

        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, subfilename))
        init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(init_string)
        client(init_string)
        client('SHOW DIAGNOSTICS FOR %s;' % table)

        # do the analyses
        for i in range(0, num_iters + 1):
            if i > 0:
                client(analyze_string)

            for q, query in enumerate(queries):
                out = client(query, pretty=False, pandas_output=False)
                # presumably the dependence probability of the queried pair;
                # verify against the client's result layout
                dependence_probs[i, q] = out[0]['data'][0][1]

        subresult = dict()
        # store the queries in subresult
        subresult['query_col1'] = [pair[0] for pair in pairs]
        subresult['query_col2'] = [pair[1] for pair in pairs]
        subresult['dependence_probs'] = dependence_probs

        # for each query, record whether those columns were actually
        # independent (columns assigned to the same view are dependent)
        independent = [True] * num_queries
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False

        subresult['cols_independent'] = independent
        subresult['distractor_cols'] = num_distractor_columns
        result['steps'].append(subresult)

    result['config'] = argin
    result['data'] = T_array

    return result
Exemplo n.º 3
0
def run_experiment(argin):
    """Run one "needles" dependence-probability experiment.

    Builds a data table (synthetic data with two dependent column pairs when
    ``needles`` is set, noise otherwise), writes it to ``needles_exp.csv``,
    loads it as the btable ``needles_exp`` and, after every single ANALYZE
    iteration, stores the answer of each dependence query.

    Args:
        argin: dict of experiment settings. Keys read: ``num_iters``,
            ``num_chains``, ``num_rows``, ``num_cols`` (at least 4, because
            the first four columns are used as needles), ``needles``,
            ``mixed_types``, ``multinomial_categories``,
            ``num_indep_queries``, ``independent_clusters``, ``ct_kernel``
            and ``seed``. ``with_id`` and ``separation`` may be present but
            do not influence the run. ``argin`` itself is not modified.

    Returns:
        dict with the keys
        ``query_col1`` / ``query_col2`` (first / second entry of every
        queried pair), ``dependence_probs`` (``num_iters`` x number of
        queries array), ``cols_independent`` (one bool per query, all True
        when ``needles`` is false) and ``config`` (a copy of ``argin`` with
        ``data_mode`` added).
    """
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    seed = argin["seed"]

    # A non-positive seed leaves the generator unseeded.
    if seed > 0:
        random.seed(seed)

    # generate column header
    col_names = ["col_%i" % i for i in range(num_cols)]

    if multinomial_categories > 0:
        data_mode = 'mixed' if mixed_types else 'multinomial'
    else:
        data_mode = 'continuous'

    if needles:
        # Column -> view assignment: columns 0,1 share view 0 and columns 2,3
        # share view 1 (the needles); every other column gets its own view.
        Zv = [0, 0, 1, 1]
        Zv.extend(range(2, num_cols - 2))
        num_views = max(Zv) + 1

        # The configured "separation" is not used here: the two needle views
        # are fixed at 0.95 and all remaining views at 0.0.
        separation = [.95] * 2 + [0.0] * (num_views - 2)

        # NOTE: randrange excludes its upper bound, so with these limits the
        # draw always yields 4 clusters.
        min_clusters = 4
        max_clusters = 5

        # Uniform cluster weights per view.
        cluster_weights = []
        for view in range(num_views):
            if view < 2 or independent_clusters:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        cctypes, distargs = eu.get_column_types(data_mode, num_cols,
                                                multinomial_categories)
        T, _ = sdg.gen_data(cctypes,
                            num_rows,
                            Zv,
                            cluster_weights,
                            separation,
                            distargs=distargs)
    else:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)

    # Prepending a row-id column is not supported; the needles are always
    # the first four columns.
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    # save the table, header row first, to .csv
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols,
                                                 col_names, table,
                                                 num_indep_queries)
    num_queries = len(queries)

    dependence_probs = numpy.zeros((num_iters, num_queries))

    client = Client()

    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(init_string)
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    if ct_kernel == 1:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table
    else:
        analyze_string = 'ANALYZE %s FOR 1 ITERATIONS WAIT;' % table

    # do the analyses: one iteration at a time, querying after each
    for i in range(num_iters):
        client(analyze_string)

        for q, query in enumerate(queries):
            out = client(query, pretty=False, pandas_output=False)
            # presumably the dependence probability of the queried pair;
            # verify against the client's result layout
            dependence_probs[i, q] = out[0]['data'][0][1]

    result = dict()
    # store the queries in result
    result['query_col1'] = [pair[0] for pair in pairs]
    result['query_col2'] = [pair[1] for pair in pairs]
    result['dependence_probs'] = dependence_probs

    # for each query, record whether those columns were actually independent
    # (columns assigned to the same view are dependent)
    independent = [True] * num_queries
    if needles:
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False

    result['cols_independent'] = independent

    # Store a copy so the caller's settings dict is left untouched.
    config = dict(argin)
    config['data_mode'] = data_mode
    result['config'] = config

    client('SHOW DIAGNOSTICS FOR %s;' % table)

    return result