Example #1
def quick_le(seed, n_chains=1):
    random.seed(seed)
    numpy.random.seed(seed)
    T, M_r, M_c = du.gen_factorial_data_objects(seed, 2, N_COLS, N_ROWS, 2)
    engine = LE.LocalEngine(seed=seed)
    X_L, X_D = engine.initialize(M_c, M_r, T, n_chains=n_chains)
    return T, M_r, M_c, X_L, X_D, engine
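These helpers appear to come from the CrossCat test suite, where du is the data_utils module and LE the LocalEngine. For orientation, a minimal driver sketch, assuming the module-level N_COLS/N_ROWS constants and imports that quick_le relies on; the unseeded analyze call mirrors the older seed-at-construction API used in this example:

# Hypothetical driver for the helper above (not part of the original snippet).
T, M_r, M_c, X_L, X_D, engine = quick_le(seed=0)
# Run a handful of MCMC transitions on the freshly initialized latent state;
# this analyze signature appears verbatim in the later examples.
X_L, X_D = engine.analyze(M_c, T, X_L, X_D, n_steps=10)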
Example #3
def quick_le(seed, n_chains=1):
    rng = random.Random(seed)
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(rng), 2,
        N_COLS, N_ROWS, 2)
    engine = LE.LocalEngine(seed=get_next_seed(rng))
    X_L, X_D = engine.initialize(M_c, M_r, T, seed=get_next_seed(rng),
        n_chains=n_chains)
    return T, M_r, M_c, X_L, X_D, engine
Example #4
def quick_le(seed, n_chains=1):
    rng = random.Random(seed)
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(rng), 2, N_COLS,
                                                N_ROWS, 2)
    engine = LE.LocalEngine(seed=get_next_seed(rng))
    X_L, X_D = engine.initialize(M_c,
                                 M_r,
                                 T,
                                 seed=get_next_seed(rng),
                                 n_chains=n_chains)
    return T, M_r, M_c, X_L, X_D, engine
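Examples #3 and #4 thread an explicit random.Random instance through get_next_seed instead of reseeding the global random/numpy generators as example #1 does. The helper itself is never shown in these snippets; below is a plausible stand-in, labeled as an assumption rather than the library's actual definition:

# Hypothetical sketch of the get_next_seed(rng) helper used above; the real
# helper lives in the test utilities and may differ in name, range, or API.
def get_next_seed(rng):
    # Draw a fresh 31-bit seed from the caller-owned RNG so data generation,
    # engine construction, and initialize each get an independent seed.
    return rng.randint(1, 2**31 - 1)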
Example #5
def generate_clean_state(gen_seed, num_clusters,
                         num_cols, num_rows, num_splits,
                         max_mean=10, max_std=1,
                         plot=False):
    # generate the data
    T, M_r, M_c, data_inverse_permutation_indices = \
        du.gen_factorial_data_objects(gen_seed, num_clusters,
                                      num_cols, num_rows, num_splits,
                                      max_mean=max_mean, max_std=max_std,
                                      send_data_inverse_permutation_indices=True)
    # recover generative clustering
    X_L, X_D = get_generative_clustering(M_c, M_r, T,
                                         data_inverse_permutation_indices,
                                         num_clusters, num_splits)
    return T, M_c, M_r, X_L, X_D
Example #6
    def setUp(self):
        # generate a crosscat state and pull the metadata
        gen_seed = 0
        num_clusters = 2
        self.num_rows = 10
        self.num_cols = 2
        num_splits = 1

        self.T, self.M_r, self.M_c = du.gen_factorial_data_objects(
            gen_seed, num_clusters, self.num_cols, self.num_rows, num_splits)

        state = State.p_State(self.M_c, self.T)
        self.X_L = state.get_X_L()
        self.X_D = state.get_X_D()
Example #7
def generate_clean_state(gen_seed,
                         num_clusters,
                         num_cols,
                         num_rows,
                         num_splits,
                         max_mean=10,
                         max_std=1,
                         plot=False):
    # generate the data
    T, M_r, M_c, data_inverse_permutation_indices = \
        du.gen_factorial_data_objects(gen_seed, num_clusters,
                                      num_cols, num_rows, num_splits,
                                      max_mean=max_mean, max_std=max_std,
                                      send_data_inverse_permutation_indices=True)
    # recover generative clustering
    X_L, X_D = get_generative_clustering(M_c, M_r, T,
                                         data_inverse_permutation_indices,
                                         num_clusters, num_splits)
    return T, M_c, M_r, X_L, X_D
Example #8
def test_two_dependent_one_dependent():
    rng = random.Random(PASS_SEED)
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(rng), 2, 3, 10,
                                                1)
    engine = LE.LocalEngine(seed=get_next_seed(rng))

    # These dependency constraints target the computation of unorm_crp_logps_avg
    # in the case that one variable (1) is independent of another variable (2),
    # which should ensure the CRP probability of the view of (2) is -INFINITY
    # when doing a block proposal of (0,1) into the view of (2). Refer to the
    # comment about computing the CRP probabilities in
    # State::sample_insert_features.
    dep_constraints = [(0, 1, True), (1, 2, False)]

    X_L, X_D = engine.initialize(M_c,
                                 M_r,
                                 T,
                                 seed=get_next_seed(rng),
                                 n_chains=1)

    X_L, X_D = engine.ensure_col_dep_constraints(M_c, M_r, T, X_L, X_D,
                                                 dep_constraints,
                                                 get_next_seed(rng))

    for col1, col2, dep in dep_constraints:
        assert engine.assert_col_dep_constraints(X_L, X_D, col1, col2, dep,
                                                 True)

    X_L, X_D = engine.analyze(M_c,
                              T,
                              X_L,
                              X_D,
                              get_next_seed(rng),
                              n_steps=1000)

    for col1, col2, dep in dep_constraints:
        assert engine.assert_col_dep_constraints(X_L, X_D, col1, col2, dep,
                                                 True)
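The dep_constraints triples read as (column_a, column_b, must_be_dependent). Note that the two constraints in the test already pin down the remaining pair: if columns 0 and 1 must share a view while 1 and 2 must not, then 0 and 2 cannot share a view either. A sketch of that implied constraint in the same convention (hypothetical, not part of the original test):

# Implied by [(0, 1, True), (1, 2, False)] under view semantics:
# 0 sits with 1, and 1 sits apart from 2, so 0 sits apart from 2.
implied_constraint = (0, 2, False)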
Example #9
num_clusters = args.num_clusters
num_cols = args.num_cols
num_rows = args.num_rows
num_splits = args.num_splits
max_mean = args.max_mean
max_std = args.max_std
num_transitions = args.num_transitions
N_GRID = args.N_GRID
URI = args.URI

# create the data
T, M_r, M_c = du.gen_factorial_data_objects(
    gen_seed,
    num_clusters,
    num_cols,
    num_rows,
    num_splits,
    max_mean=max_mean,
    max_std=max_std,
)

#
engine = JSONRPCEngine(inf_seed, URI=URI)

# initialize
X_L, X_D = engine.initialize(M_c, M_r, T)

# analyze without do_diagnostics or do_timing
X_L, X_D = engine.analyze(M_c, T, X_L, X_D, n_steps=num_transitions)

# analyze with do_diagnostics
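Example #9 is truncated at this point. Judging from the comments, the continuation exercised the do_diagnostics path of analyze. The following is a hypothetical sketch only: the do_diagnostics keyword is taken on faith from the comments above, and whether a diagnostics structure is returned alongside X_L/X_D is an assumption:

# Hypothetical continuation (not from the original snippet).
ret = engine.analyze(M_c, T, X_L, X_D, n_steps=num_transitions,
                     do_diagnostics=True)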
Example #10
gen_seed = args.gen_seed
inf_seed = args.inf_seed
num_clusters = args.num_clusters
num_cols = args.num_cols
num_rows = args.num_rows
num_splits = args.num_splits
max_mean = args.max_mean
max_std = args.max_std
num_transitions = args.num_transitions
N_GRID = args.N_GRID

# create the data
if True:
    T, M_r, M_c = du.gen_factorial_data_objects(
        gen_seed, num_clusters,
        num_cols, num_rows, num_splits,
        max_mean=max_mean, max_std=max_std,
        )
else:
    with open('SynData2.csv') as fh:
        import numpy
        import csv
        T = numpy.array([
                row for row in csv.reader(fh)
                ], dtype=float).tolist()
        M_r = du.gen_M_r_from_T(T)
        M_c = du.gen_M_c_from_T(T)


# create the state
p_State = State.p_State(M_c, T, N_GRID=N_GRID, SEED=inf_seed)
Example #11
num_views = args.num_views
num_cols = args.num_cols
numChains = args.numChains
block_size = args.block_size


engine = ccc.get_CrossCatClient('hadoop', seed=inf_seed)

if filename is not None:
    # Load the data from table and sub-sample entities to max_rows
    T, M_r, M_c = du.read_model_data_from_csv(filename, max_rows, gen_seed)
    truth_flag = 0
else:
    T, M_r, M_c, data_inverse_permutation_indices = \
        du.gen_factorial_data_objects(gen_seed, num_clusters,
                                      num_cols, max_rows, num_views,
                                      max_mean=100, max_std=1,
                                      send_data_inverse_permutation_indices=True)
    view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
        data_inverse_permutation_indices, max_rows, num_cols, num_views,
        num_clusters)
    truth_flag = 1

num_rows = len(T)
num_cols = len(T[0])

ari_table = []
ari_views = []

print('Initializing ...')
# Call Initialize and Analyze
M_c, M_r, X_L_list, X_D_list = engine.initialize(M_c, M_r, T, n_chains=numChains)
if truth_flag:
Example #12
    inf_seed = 0
    num_clusters = 4
    num_cols = 32
    num_rows = 400
    num_views = 2
    n_steps = 1
    n_times = 5
    n_chains = 3
    n_test = 100
    CT_KERNEL = 1

    get_next_seed = make_get_next_seed(gen_seed)

    # generate some data
    T, M_r, M_c, data_inverse_permutation_indices = du.gen_factorial_data_objects(
        get_next_seed(), num_clusters, num_cols, num_rows, num_views,
        max_mean=100, max_std=1, send_data_inverse_permutation_indices=True)
    view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
        data_inverse_permutation_indices, num_rows, num_cols, num_views, num_clusters)

    # run some tests
    engine = LocalEngine()
    multi_state_ARIs = []
    multi_state_mean_test_lls = []
    X_L_list, X_D_list = engine.initialize(M_c, M_r, T, get_next_seed(),
        n_chains=n_chains)
    multi_state_ARIs.append(
        ctu.get_column_ARIs(X_L_list, view_assignment_truth))

    for time_i in range(n_times):
        X_L_list, X_D_list = engine.analyze(
Example #13
def convergence_analyze_helper(table_data, data_dict, command_dict):
    gen_seed = data_dict['SEED']
    num_clusters = data_dict['num_clusters']
    num_cols = data_dict['num_cols']
    num_rows = data_dict['num_rows']
    num_views = data_dict['num_views']
    max_mean = data_dict['max_mean']
    n_test = data_dict['n_test']
    num_transitions = data_dict['n_steps']
    block_size = data_dict['block_size']
    init_seed = data_dict['init_seed']

    # generate some data
    T, M_r, M_c, data_inverse_permutation_indices = \
            du.gen_factorial_data_objects(gen_seed, num_clusters,
                    num_cols, num_rows, num_views,
                    max_mean=max_mean, max_std=1,
                    send_data_inverse_permutation_indices=True)
    view_assignment_ground_truth = \
            ctu.determine_synthetic_column_ground_truth_assignments(num_cols,
                    num_views)
    X_L_gen, X_D_gen = ttu.get_generative_clustering(
        M_c, M_r, T, data_inverse_permutation_indices, num_clusters, num_views)
    T_test = ctu.create_test_set(M_c, T, X_L_gen, X_D_gen, n_test, seed_seed=0)
    generative_mean_test_log_likelihood = \
            ctu.calc_mean_test_log_likelihood(M_c, T, X_L_gen, X_D_gen, T_test)

    # additional set up
    engine = LE.LocalEngine(init_seed)
    column_ari_list = []
    mean_test_ll_list = []
    elapsed_seconds_list = []

    # get initial ARI, test_ll
    with gu.Timer('initialize', verbose=False) as timer:
        X_L, X_D = engine.initialize(M_c,
                                     M_r,
                                     T,
                                     initialization='from_the_prior')
    column_ari = ctu.get_column_ARI(X_L, view_assignment_ground_truth)
    column_ari_list.append(column_ari)
    mean_test_ll = ctu.calc_mean_test_log_likelihood(M_c, T, X_L, X_D, T_test)
    mean_test_ll_list.append(mean_test_ll)
    elapsed_seconds_list.append(timer.elapsed_secs)

    # run blocks of transitions, recording ARI, test_ll progression
    completed_transitions = 0
    n_steps = min(block_size, num_transitions)
    while (completed_transitions < num_transitions):
        # We won't be limiting by time in the convergence runs
        with gu.Timer('initialize', verbose=False) as timer:
            X_L, X_D = engine.analyze(M_c,
                                      T,
                                      X_L,
                                      X_D,
                                      kernel_list=(),
                                      n_steps=n_steps,
                                      max_time=-1)
        completed_transitions = completed_transitions + block_size
        #
        column_ari = ctu.get_column_ARI(X_L, view_assignment_ground_truth)
        column_ari_list.append(column_ari)
        mean_test_ll = ctu.calc_mean_test_log_likelihood(
            M_c, T, X_L, X_D, T_test)
        mean_test_ll_list.append(mean_test_ll)
        elapsed_seconds_list.append(timer.elapsed_secs)

    ret_dict = dict(
        num_rows=num_rows,
        num_cols=num_cols,
        num_views=num_views,
        num_clusters=num_clusters,
        max_mean=max_mean,
        column_ari_list=column_ari_list,
        mean_test_ll_list=mean_test_ll_list,
        generative_mean_test_log_likelihood=generative_mean_test_log_likelihood,
        elapsed_seconds_list=elapsed_seconds_list,
        n_steps=num_transitions,
        block_size=block_size,
    )
    return ret_dict
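The helper above returns everything a convergence plot needs in ret_dict. A minimal consumer sketch follows; the data_dict keys mirror the reads at the top of the function (the values here are illustrative), and table_data/command_dict can be empty since the body shown never touches them:

# Illustrative driver for convergence_analyze_helper.
data_dict = dict(SEED=0, num_clusters=4, num_cols=16, num_rows=300,
                 num_views=2, max_mean=10, n_test=40, n_steps=100,
                 block_size=20, init_seed=0)
result = convergence_analyze_helper({}, data_dict, {})
print(result['column_ari_list'])    # ARI after each block of transitions
print(result['mean_test_ll_list'])  # held-out log-likelihood progression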
Example #14
def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with continuous data
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2,
                                                n_rows, 1)

    state = State.p_State(M_c, T)

    T_array = numpy.array(T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = [] # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,
                                          n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X-2.*std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X+2.*std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries
    Qs = []
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under the pdf is about 1,
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i+1] - X[i]) * (densities[i+1] + densities[i]) / 2.0

    print("Area of PDF (should be close to, but not greater than, 1): " +
          str(area_density))
    print("*Note: The area will be less than one because the range (integral) is truncated.")

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X, 100, normed=1, histtype='stepfilled',
                                    label='samples', alpha=.5,
                                    color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    input("Press Enter when finished...")
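As an aside, the hand-rolled loop above implements the trapezoid rule that numpy ships directly; a drop-in equivalent over the same X and densities arrays:

# Equivalent to the explicit accumulation loop: numpy.trapz applies the
# trapezoid rule over the sorted sample points X with pdf values densities.
area_density = numpy.trapz(densities, X)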
Example #15
def convergence_analyze_helper(table_data, data_dict, command_dict):
    gen_seed = data_dict['SEED']
    num_clusters = data_dict['num_clusters']
    num_cols = data_dict['num_cols']
    num_rows = data_dict['num_rows']
    num_views = data_dict['num_views']
    max_mean = data_dict['max_mean']
    n_test = data_dict['n_test']
    num_transitions = data_dict['n_steps']
    block_size = data_dict['block_size']
    init_seed = data_dict['init_seed']


    # generate some data
    T, M_r, M_c, data_inverse_permutation_indices = \
            du.gen_factorial_data_objects(gen_seed, num_clusters,
                    num_cols, num_rows, num_views,
                    max_mean=max_mean, max_std=1,
                    send_data_inverse_permutation_indices=True)
    view_assignment_ground_truth = \
            ctu.determine_synthetic_column_ground_truth_assignments(num_cols,
                    num_views)
    X_L_gen, X_D_gen = ttu.get_generative_clustering(M_c, M_r, T,
            data_inverse_permutation_indices, num_clusters, num_views)
    T_test = ctu.create_test_set(M_c, T, X_L_gen, X_D_gen, n_test, seed_seed=0)
    generative_mean_test_log_likelihood = \
            ctu.calc_mean_test_log_likelihood(M_c, T, X_L_gen, X_D_gen, T_test)

    # additional set up
    engine = LE.LocalEngine(init_seed)
    column_ari_list = []
    mean_test_ll_list = []
    elapsed_seconds_list = []

    # get initial ARI, test_ll
    with gu.Timer('initialize', verbose=False) as timer:
        X_L, X_D = engine.initialize(M_c, M_r, T, initialization='from_the_prior')
    column_ari = ctu.get_column_ARI(X_L, view_assignment_ground_truth)
    column_ari_list.append(column_ari)
    mean_test_ll = ctu.calc_mean_test_log_likelihood(M_c, T, X_L, X_D,
            T_test)
    mean_test_ll_list.append(mean_test_ll)
    elapsed_seconds_list.append(timer.elapsed_secs)

    # run blocks of transitions, recording ARI, test_ll progression
    completed_transitions = 0
    n_steps = min(block_size, num_transitions)
    while (completed_transitions < num_transitions):
        # We won't be limiting by time in the convergence runs
        with gu.Timer('initialize', verbose=False) as timer:
            X_L, X_D = engine.analyze(M_c, T, X_L, X_D, kernel_list=(),
                     n_steps=n_steps, max_time=-1)
        completed_transitions = completed_transitions + block_size
        #
        column_ari = ctu.get_column_ARI(X_L, view_assignment_ground_truth)
        column_ari_list.append(column_ari)
        mean_test_ll = ctu.calc_mean_test_log_likelihood(M_c, T, X_L, X_D,
                T_test)
        mean_test_ll_list.append(mean_test_ll)
        elapsed_seconds_list.append(timer.elapsed_secs)

    ret_dict = dict(
        num_rows=num_rows,
        num_cols=num_cols,
        num_views=num_views,
        num_clusters=num_clusters,
        max_mean=max_mean,
        column_ari_list=column_ari_list,
        mean_test_ll_list=mean_test_ll_list,
        generative_mean_test_log_likelihood=generative_mean_test_log_likelihood,
        elapsed_seconds_list=elapsed_seconds_list,
        n_steps=num_transitions,
        block_size=block_size,
        )
    return ret_dict
Example #16
n_test = 40
data_max_mean = 1
data_max_std = 1.
#
#num_rows = 800
#n_chains = 16
#config_filename = os.path.expanduser('~/.config/ipython/profile_ssh/security/ipcontroller-client.json')
#
num_rows = 100
n_chains = 2
config_filename = None


# generate some data
T, M_r, M_c, data_inverse_permutation_indices = du.gen_factorial_data_objects(
        gen_seed, num_clusters, num_cols, num_rows, num_views,
        max_mean=data_max_mean, max_std=data_max_std,
        send_data_inverse_permutation_indices=True)
view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
        data_inverse_permutation_indices, num_rows, num_cols, num_views, num_clusters)
X_L_gen, X_D_gen = ttu.get_generative_clustering(M_c, M_r, T,
        data_inverse_permutation_indices, num_clusters, num_views)
T_test = ctu.create_test_set(M_c, T, X_L_gen, X_D_gen, n_test, seed_seed=0)
#
generative_mean_test_log_likelihood = ctu.calc_mean_test_log_likelihood(M_c, T,
        X_L_gen, X_D_gen, T_test)
ground_truth_lookup = dict(
        ARI=1.0,
        mean_test_ll=generative_mean_test_log_likelihood,
        num_views=num_views,
        )
Example #17
def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with continuous data
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2, n_rows,
                                                1)

    state = State.p_State(M_c, T)

    T_array = numpy.array(T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c,
                                          X_L,
                                          X_D,
                                          Y,
                                          Q,
                                          get_next_seed,
                                          n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X - 2. * std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X + 2. * std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries
    Qs = []
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under the pdf is about 1,
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i + 1] - X[i]) * (densities[i + 1] +
                                             densities[i]) / 2.0

    print("Area of PDF (should be close to, but not greater than, 1): " +
          str(area_density))
    print(
        "*Note: The area will be less than one because the range (integral) is truncated."
    )

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X,
                                    100,
                                    normed=1,
                                    histtype='stepfilled',
                                    label='samples',
                                    alpha=.5,
                                    color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()
    fd, fig_filename = tempfile.mkstemp(prefix='run_test_continuous_',
                                        suffix='.png',
                                        dir='.')
    pylab.savefig(fig_filename)
Example #18
data_max_std = 1.
#
#num_rows = 800
#n_chains = 16
#config_filename = os.path.expanduser('~/.config/ipython/profile_ssh/security/ipcontroller-client.json')
#
num_rows = 100
n_chains = 2
config_filename = None

# generate some data
T, M_r, M_c, data_inverse_permutation_indices = du.gen_factorial_data_objects(
    gen_seed,
    num_clusters,
    num_cols,
    num_rows,
    num_views,
    max_mean=data_max_mean,
    max_std=data_max_std,
    send_data_inverse_permutation_indices=True)
view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
    data_inverse_permutation_indices, num_rows, num_cols, num_views,
    num_clusters)
X_L_gen, X_D_gen = ttu.get_generative_clustering(
    M_c, M_r, T, data_inverse_permutation_indices, num_clusters, num_views)
T_test = ctu.create_test_set(M_c, T, X_L_gen, X_D_gen, n_test, seed_seed=0)
#
generative_mean_test_log_likelihood = ctu.calc_mean_test_log_likelihood(
    M_c, T, X_L_gen, X_D_gen, T_test)
ground_truth_lookup = dict(
    ARI=1.0,