Example #1
def GenerateStateFromPartitions(col_parts,
                                row_parts,
                                seed,
                                mean_gen=0.0,
                                std_gen=1.0,
                                std_data=0.1):
    rng = np.random.RandomState(seed)
    T, M_r, M_c = GenDataFromPartitions(col_parts,
                                        row_parts,
                                        seed=get_next_seed(rng),
                                        mean_gen=mean_gen,
                                        std_gen=std_gen,
                                        std_data=std_data)
    state = State.p_State(M_c, T, N_GRID=100, SEED=get_next_seed(rng))

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    if type(col_parts) is not list:
        X_L['column_partition']['assignments'] = col_parts.tolist()

    if type(row_parts) is not list:
        X_D = row_parts.tolist()

    # create a new state with the updated X_D and X_L
    state = State.p_State(M_c,
                          T,
                          X_L=X_L,
                          X_D=X_D,
                          N_GRID=100,
                          SEED=get_next_seed(rng))

    return state, T, M_c, M_r, X_L, X_D
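
A minimal call sketch (not part of the original listing) showing how the helper above might be used; it assumes the module defining GenerateStateFromPartitions, GenDataFromPartitions and get_next_seed is importable, and the partition values are purely hypothetical.

import numpy as np

# hypothetical layout: two columns in one view, ten rows split into two clusters of five
col_parts = np.array([0, 0])
row_parts = np.array([[0] * 5 + [1] * 5])

state, T, M_c, M_r, X_L, X_D = GenerateStateFromPartitions(col_parts, row_parts, seed=0)
state.transition(n_steps=10)  # a few sweeps starting from the implanted partition
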
Example #2
def GenerateRandomState(n_rows,
                        n_cols,
                        mean_gen=0.0,
                        std_gen=1.0,
                        std_data=0.1,
                        alpha_col=1.0,
                        alpha_rows=1.0):

    # check the inputs
    assert (type(n_rows) is int)
    assert (type(n_cols) is int)
    assert (type(mean_gen) is float)
    assert (type(std_gen) is float)
    assert (type(std_data) is float)
    assert (type(alpha_col) is float)
    assert (type(alpha_rows) is float)
    assert (n_rows > 0)
    assert (n_cols > 0)
    assert (std_gen > 0.0)
    assert (std_data > 0.0)
    assert (alpha_col > 0.0)
    assert (alpha_rows > 0.0)

    # generate the partitioning
    part = GenerateRandomPartition(n_rows, n_cols, alpha_col, alpha_rows)

    # fill it with data
    T, M_r, M_c = GenDataFromPartitions(part['col_parts'], part['row_parts'],
                                        mean_gen, std_gen, std_data)

    # this part is kind of hacky:
    # generate a state from the prior
    state = State.p_State(M_c, T, N_GRID=100)
    # get the X_L and X_D and implant part['col_parts'], part['row_parts'], then
    # create a new state with the new X_L and X_D defined
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # this should be all we need to change for
    # State.transform_latent_state_to_constructor_args(X_L, X_D) to be able
    # to construct the arguments to initialize a state
    X_L['column_partition']['assignments'] = part['col_parts'].tolist()
    X_D = part['row_parts'].tolist()

    # hack in the alpha values supplied (or not) by the user
    X_L['column_partition']['hypers']['alpha'] = alpha_col
    for i in range(len(X_L['view_state'])):
        X_L['view_state'][i]['row_partition_model']['hypers'][
            'alpha'] = alpha_col
    for i in range(n_cols):
        X_L['column_hypers'][i]['alpha'] = alpha_rows

    # create a new state with the updated X_D and X_L
    state = State.p_State(M_c, T, X_L=X_L, X_D=X_D, N_GRID=100)

    return state, T, M_r, M_c
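
A hypothetical invocation of the helper above (argument values chosen purely for illustration; GenerateRandomPartition and GenDataFromPartitions are assumed to be defined in the same module):

state, T, M_r, M_c = GenerateRandomState(n_rows=50, n_cols=4,
                                         mean_gen=0.0, std_gen=1.0, std_data=0.1,
                                         alpha_col=1.0, alpha_rows=1.0)
state.transition(n_steps=10)  # run a few transitions from the implanted partition
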
def do_test(which_plot,
            max_plots,
            n,
            burn_in,
            cc_samples,
            which_test,
            correlation=0,
            do_plot=False):
    if which_test == "correlated":
        X = correlated(correlation, n=n)
    elif which_test == "square":
        X = square(n=n)
    elif which_test == "ring":
        X = ring(n=n)
    elif which_test == "circle":
        X = circle(n=n)
    elif which_test == "diamond":
        X = diamond(n=n)
    elif which_test == "blob":
        X = correlated(0.0, n=n)
    elif which_test == "dots":
        X = four_dots(n=n)
    elif which_test == "mixed":
        X = numpy.vstack((correlated(.95, n=n / 2), correlated(0, n=n / 2)))

    get_next_seed = lambda: random.randrange(32000)

    # Build a state
    M_c = du.gen_M_c_from_T(X.tolist())
    state = State.p_State(M_c, X.tolist())
    X_Ls = []
    X_Ds = []

    # collect crosscat samples
    for _ in range(cc_samples):
        state = State.p_State(M_c, X.tolist())
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    SX = sample_data_from_crosscat(M_c, X_Ls, X_Ds, get_next_seed, n)

    if do_plot:
        pl.subplot(2, max_plots, which_plot)
        pl.scatter(X[:, 0], X[:, 1], c='blue', alpha=.5)
        pl.title("Original data")
        pl.subplot(2, max_plots, max_plots + which_plot)
        pl.scatter(SX[:, 0], SX[:, 1], c='red', alpha=.5)
        pl.title("Sampled data")
        pl.show()

    return M_c, X_Ls, X_Ds
def generate_X_L_and_X_D(T, M_c, cols_to_views, row_to_clusters, seed=0):
    state = State.p_State(M_c, T, SEED=seed)
    X_L = state.get_X_L()

    # insert assignment into X_L (this is not a valid X_L because the counts and
    # suffstats will be wrong)
    X_L['column_partition']['assignments'] = cols_to_views
    state = State.p_State(M_c, T, X_L=X_L, X_D=row_to_clusters, SEED=seed)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    return X_L, X_D
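
A hypothetical sketch of calling the helper above (not from the original listing); it assumes T and M_c were built as in the other examples, e.g. with du.gen_M_c_from_T:

cols_to_views = [0, 0]                # both columns pinned to view 0 (hypothetical)
row_to_clusters = [[0] * len(T)]      # one row-partition list per view
X_L, X_D = generate_X_L_and_X_D(T, M_c, cols_to_views, row_to_clusters, seed=0)
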
Example #6
def GenerateRandomState(n_rows, n_cols, seed, mean_gen=0.0, std_gen=1.0, std_data=0.1, alpha_col=1.0, alpha_rows=1.0):

	# check the inputs 
	assert(type(n_rows) is int)
	assert(type(n_cols) is int)
	assert(type(mean_gen) is float)
	assert(type(std_gen) is float)
	assert(type(std_data) is float)
	assert(type(alpha_col) is float)
	assert(type(alpha_rows) is float)
	assert(n_rows > 0)
	assert(n_cols > 0)
	assert(std_gen > 0.0)
	assert(std_data > 0.0)
	assert(alpha_col > 0.0)
	assert(alpha_rows > 0.0)

	rng = np.random.RandomState(seed)

	# generate the partitioning
	part = GenerateRandomPartition(n_rows, n_cols, alpha_col, alpha_rows, seed=seed)

	# fill it with data
	T, M_r, M_c = GenDataFromPartitions(part['col_parts'], part['row_parts'], mean_gen, std_gen, std_data)

	# this part is kind of hacky:
	# generate a state from the prior 
	state = State.p_State(M_c, T, N_GRID=100)
	# get the X_L and X_D and implant part['col_parts'], part['row_parts'], then 
	# create a new state with the new X_L and X_D defined
	X_L = state.get_X_L()
	X_D = state.get_X_D()

	# this should be all we need to change for 
	# State.transform_latent_state_to_constructor_args(X_L, X_D) to be able
	# to construct the arguments to initialize a state
	X_L['column_partition']['assignments'] = part['col_parts'].tolist()
	X_D = part['row_parts'].tolist()

	# hack in the alpha values supplied (or not) by the user
	X_L['column_partition']['hypers']['alpha'] = alpha_col
	for i in range(len(X_L['view_state'])):
		X_L['view_state'][i]['row_partition_model']['hypers']['alpha'] = alpha_col
	for i in range(n_cols):
		X_L['column_hypers'][i]['alpha'] = alpha_rows

	# create a new state with the updated X_D and X_L
	state = State.p_State(M_c, T, X_L=X_L, X_D=X_D, N_GRID=100)

	return state, T, M_r, M_c
Example #7
def gen_data_crosscat(mode, T):
    # edit transition list according to

    all_transitions = []

    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'] * 2)

    state = State.p_State(M_c, T)
    if mode == 'crp_mixture':
        # fix the views
        X_D = state.get_X_D()
        X_L = state.get_X_L()
        X_D = [X_D[0]]
        X_L['column_partition']['assignments'] = [1, 1]
        state = State.p_State(M_c, T, X_L=X_L, X_D=X_D)
Example #8
def gen_data_crosscat(mode, T):
    # edit transition list according to 
    
    all_transitions = []

    M_c = du.gen_M_c_from_T(T, cctypes=['continuous']*2)

    state = State.p_State(M_c, T)
    if mode == 'crp_mixture':
        # fix the views
        X_D = state.get_X_D()
        X_L = state.get_X_L()
        X_D = [X_D[0]]
        X_L['column_partition']['assignments'] = [1, 1]
        state = State.p_State(M_c, T, X_L=X_L, X_D=X_D)
Example #9
def _do_analyze(M_c, T, X_L, X_D, kernel_list, n_steps, c, r, max_iterations,
                max_time, SEED):
    p_State = State.p_State(M_c, T, X_L, X_D, SEED=SEED)
    p_State.transition(kernel_list, n_steps, c, r, max_iterations, max_time)
    X_L_prime = p_State.get_X_L()
    X_D_prime = p_State.get_X_D()
    return X_L_prime, X_D_prime
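
A hypothetical driver loop for the helper above. The X_L/X_D inputs are assumed to come from an earlier initialization (see Example #10), and the sentinel values passed for kernel_list, c, r, max_iterations and max_time are assumptions rather than values taken from this listing:

X_L_i, X_D_i = X_L, X_D
for step in range(10):
    # one transition step per call, re-seeded each iteration (hypothetical schedule)
    X_L_i, X_D_i = _do_analyze(M_c, T, X_L_i, X_D_i,
                               kernel_list=(), n_steps=1, c=(), r=(),
                               max_iterations=-1, max_time=-1, SEED=step)
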
Example #10
def _do_initialize(
    SEED,
    M_c,
    M_r,
    T,
    initialization,
    row_initialization,
    ROW_CRP_ALPHA_GRID,
    COLUMN_CRP_ALPHA_GRID,
    S_GRID,
    MU_GRID,
    N_GRID,
):
    p_State = State.p_State(
        M_c,
        T,
        initialization=initialization,
        row_initialization=row_initialization,
        SEED=SEED,
        ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
        S_GRID=S_GRID,
        MU_GRID=MU_GRID,
        N_GRID=N_GRID,
    )
    X_L = p_State.get_X_L()
    X_D = p_State.get_X_D()
    return X_L, X_D
Example #11
def _do_analyze(
    SEED,
    X_L,
    X_D,
    M_c,
    T,
    kernel_list,
    n_steps,
    c,
    r,
    max_iterations,
    max_time,
    ROW_CRP_ALPHA_GRID,
    COLUMN_CRP_ALPHA_GRID,
    S_GRID,
    MU_GRID,
    N_GRID,
    CT_KERNEL,
):
    p_State = State.p_State(M_c,
                            T,
                            X_L,
                            X_D,
                            SEED=SEED,
                            ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
                            COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
                            S_GRID=S_GRID,
                            MU_GRID=MU_GRID,
                            N_GRID=N_GRID,
                            CT_KERNEL=CT_KERNEL)
    p_State.transition(kernel_list, n_steps, c, r, max_iterations, max_time)
    X_L_prime = p_State.get_X_L()
    X_D_prime = p_State.get_X_D()
    return X_L_prime, X_D_prime
Example #12
def _do_analyze_with_diagnostic(
    SEED,
    X_L,
    X_D,
    M_c,
    T,
    kernel_list,
    n_steps,
    c,
    r,
    max_iterations,
    max_time,
    diagnostic_func_dict,
    every_N,
    ROW_CRP_ALPHA_GRID,
    COLUMN_CRP_ALPHA_GRID,
    S_GRID,
    MU_GRID,
    N_GRID,
    do_timing,
    CT_KERNEL,
):
    diagnostics_dict = collections.defaultdict(list)
    if diagnostic_func_dict is None:
        diagnostic_func_dict = dict()
        every_N = None
    child_n_steps_list = get_child_n_steps_list(n_steps, every_N)
    #
    p_State = State.p_State(
        M_c,
        T,
        X_L,
        X_D,
        SEED=SEED,
        ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
        S_GRID=S_GRID,
        MU_GRID=MU_GRID,
        N_GRID=N_GRID,
        CT_KERNEL=CT_KERNEL,
    )
    with gu.Timer('all transitions', verbose=False) as timer:
        for child_n_steps in child_n_steps_list:
            p_State.transition(kernel_list, child_n_steps, c, r,
                               max_iterations, max_time)
            for diagnostic_name, diagnostic_func in diagnostic_func_dict.iteritems(
            ):
                diagnostic_value = diagnostic_func(p_State)
                diagnostics_dict[diagnostic_name].append(diagnostic_value)
                pass
            pass
        pass
    X_L_prime = p_State.get_X_L()
    X_D_prime = p_State.get_X_D()
    #
    if do_timing:
        # diagnostics and timing are exclusive
        diagnostics_dict = timer.elapsed_secs
        pass
    return X_L_prime, X_D_prime, diagnostics_dict
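
The diagnostic_func_dict argument maps names to callables that are evaluated on the p_State after each chunk of transitions. A hypothetical dictionary (not part of the original code), built only from accessors that appear elsewhere in this listing (get_marginal_logp in Example #30, the 'view_state' key in Example #2):

diagnostic_func_dict = {
    # overall log score of the current latent state
    'logscore': lambda p_State: p_State.get_marginal_logp(),
    # number of views in the current column partition
    'num_views': lambda p_State: len(p_State.get_X_L()['view_state']),
}
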
Example #13
def get_generative_clustering(M_c, M_r, T,
                              data_inverse_permutation_indices,
                              num_clusters, num_views):
    from crosscat.LocalEngine import LocalEngine
    import crosscat.cython_code.State as State
    # NOTE: this function only works because State.p_State doesn't use
    #       column_component_suffstats
    num_rows = len(T)
    num_cols = len(T[0])
    X_D_helper = numpy.repeat(range(num_clusters), (num_rows // num_clusters))
    gen_X_D = [
        X_D_helper[numpy.argsort(data_inverse_permutation_index)]
        for data_inverse_permutation_index in data_inverse_permutation_indices
        ]
    gen_X_L_assignments = numpy.repeat(range(num_views), (num_cols // num_views))
    # initialize to generate an X_L to manipulate
    local_engine = LocalEngine()
    bad_X_L, bad_X_D = local_engine.initialize(M_c, M_r, T,
                                                         initialization='apart')
    bad_X_L['column_partition']['assignments'] = gen_X_L_assignments
    # manually construct state in generative configuration
    state = State.p_State(M_c, T, bad_X_L, gen_X_D)
    gen_X_L = state.get_X_L()
    gen_X_D = state.get_X_D()
    # run inference on hyperparameters to leave them in a reasonable state
    kernel_list = (
        'row_partition_hyperparameters',
        'column_hyperparameters',
        'column_partition_hyperparameter',
        )
    gen_X_L, gen_X_D = local_engine.analyze(M_c, T, gen_X_L, gen_X_D, n_steps=1,
                                            kernel_list=kernel_list)
    #
    return gen_X_L, gen_X_D
Example #14
def _do_analyze_with_diagnostic(
        SEED, X_L, X_D, M_c, T, kernel_list, n_steps, c, r, max_iterations,
        max_time, diagnostic_func_dict, every_N, ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID, S_GRID, MU_GRID, N_GRID, do_timing, CT_KERNEL,
        progress,):

    diagnostics_dict = collections.defaultdict(list)

    if diagnostic_func_dict is None:
        diagnostic_func_dict = dict()
        every_N = None

    p_State = State.p_State(
        M_c, T, X_L, X_D, SEED=SEED, ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID, S_GRID=S_GRID,
        MU_GRID=MU_GRID, N_GRID=N_GRID, CT_KERNEL=CT_KERNEL)

    with gu.Timer('all transitions', verbose=False) as timer:
        p_State.transition(
            kernel_list, n_steps, c, r, max_iterations, max_time,
            progress=progress,
            diagnostic_func_dict=diagnostic_func_dict,
            diagnostics_dict=diagnostics_dict,
            diagnostics_every_N=every_N)

    X_L_prime = p_State.get_X_L()
    X_D_prime = p_State.get_X_D()

    if do_timing:
        # Diagnostics and timing are exclusive.
        diagnostics_dict = timer.elapsed_secs

    return X_L_prime, X_D_prime, diagnostics_dict
Example #15
def get_generative_clustering(M_c, M_r, T,
                              data_inverse_permutation_indices,
                              num_clusters, num_views):
    # NOTE: this function only works because State.p_State doesn't use
    #       column_component_suffstats
    num_rows = len(T)
    num_cols = len(T[0])
    X_D_helper = numpy.repeat(range(num_clusters), (num_rows // num_clusters))
    gen_X_D = [
        X_D_helper[numpy.argsort(data_inverse_permutation_index)]
        for data_inverse_permutation_index in data_inverse_permutation_indices
        ]
    gen_X_L_assignments = numpy.repeat(range(num_views), (num_cols // num_views))
    # initialize to generate an X_L to manipulate
    local_engine = LE.LocalEngine()
    bad_X_L, bad_X_D = local_engine.initialize(M_c, M_r, T,
                                                         initialization='apart')
    bad_X_L['column_partition']['assignments'] = gen_X_L_assignments
    # manually construct state in generative configuration
    state = State.p_State(M_c, T, bad_X_L, gen_X_D)
    gen_X_L = state.get_X_L()
    gen_X_D = state.get_X_D()
    # run inference on hyperparameters to leave them in a reasonable state
    kernel_list = (
        'row_partition_hyperparameters',
        'column_hyperparameters',
        'column_partition_hyperparameter',
        )
    gen_X_L, gen_X_D = local_engine.analyze(M_c, T, gen_X_L, gen_X_D, n_steps=1,
                                            kernel_list=kernel_list)
    #
    return gen_X_L, gen_X_D
def _do_analyze2((M_c, T, X_L, X_D, kernel_list, n_steps, c, r,
               max_iterations, max_time, SEED)):
    p_State = State.p_State(M_c, T, X_L, X_D, SEED=SEED)
    p_State.transition(kernel_list, n_steps, c, r,
                       max_iterations, max_time)
    X_L_prime = p_State.get_X_L()
    X_D_prime = p_State.get_X_D()
    return X_L_prime, X_D_prime
Example #17
def do_test(which_plot, max_plots, n, burn_in, cc_samples, which_test, correlation=0, do_plot=False):
    if which_test == "correlated":
        X = correlated(correlation, n=n)
    elif which_test == "square":
        X = square(n=n)
    elif which_test == "ring":
        X = ring(n=n)
    elif which_test == "circle":
        X = circle(n=n)
    elif which_test == "diamond":
        X = diamond(n=n)
    elif which_test == "blob":
        X = correlated(0.0, n=n)
    elif which_test == "dots":
        X = four_dots(n=n)
    elif which_test == "mixed":
        X = numpy.vstack((correlated(0.95, n=n / 2), correlated(0, n=n / 2)))

    get_next_seed = lambda: random.randrange(32000)

    # Build a state
    M_c = du.gen_M_c_from_T(X.tolist())
    state = State.p_State(M_c, X.tolist())
    X_Ls = []
    X_Ds = []

    # collect crosscat samples
    for _ in range(cc_samples):
        state = State.p_State(M_c, X.tolist())
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    SX = sample_data_from_crosscat(M_c, X_Ls, X_Ds, get_next_seed, n)

    if do_plot:
        pl.subplot(2, max_plots, which_plot)
        pl.scatter(X[:, 0], X[:, 1], c="blue", alpha=0.5)
        pl.title("Original data")
        pl.subplot(2, max_plots, max_plots + which_plot)
        pl.scatter(SX[:, 0], SX[:, 1], c="red", alpha=0.5)
        pl.title("Sampled data")
        pl.show()

    return M_c, X_Ls, X_Ds
Example #18
def GenerateStateFromPartitions(col_parts, row_parts, mean_gen=0.0, std_gen=1.0, std_data=0.1):
	
	T, M_r, M_c = GenDataFromPartitions(col_parts, row_parts, mean_gen=mean_gen, std_gen=std_gen, std_data=std_data)
	state = State.p_State(M_c, T, N_GRID=100)

	X_L = state.get_X_L()
	X_D = state.get_X_D()

	if type(col_parts) is not list:
		X_L['column_partition']['assignments'] = col_parts.tolist()

	if type(row_parts) is not list:
		X_D = row_parts.tolist()

	# create a new state with the updated X_D and X_L
	state = State.p_State(M_c, T, X_L=X_L, X_D=X_D, N_GRID=100)

	return state, T, M_c, M_r, X_L, X_D
Example #19
 def _sample_and_insert(self, M_c, T, X_L, X_D, matching_row_indices):
     p_State = State.p_State(M_c, T, X_L, X_D)
     draws = []
     for matching_row_idx in matching_row_indices:
         random_seed = self.get_next_seed()
         draw = p_State.get_draw(matching_row_idx, random_seed)
         p_State.insert_row(draw, matching_row_idx)
         draws.append(draw)
         T.append(draw)
     X_L, X_D = p_State.get_X_L(), p_State.get_X_D()
     return draws, T, X_L, X_D
Example #21
def _do_insert(M_c, T, X_L, X_D, new_rows, N_GRID, CT_KERNEL):
    p_State = State.p_State(M_c, T, X_L=X_L, X_D=X_D,
                            N_GRID=N_GRID,
                            CT_KERNEL=CT_KERNEL)

    row_idx = len(T)
    for row_data in new_rows:
        p_State.insert_row(row_data, row_idx)
        p_State.transition(which_transitions=['row_partition_assignments'], r=[row_idx])
        row_idx += 1

    X_L_prime = p_State.get_X_L()
    X_D_prime = p_State.get_X_D()
    return X_L_prime, X_D_prime
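
A hypothetical call sketch for the row-insertion helper above; the N_GRID and CT_KERNEL values are assumptions, and the new rows must match the column layout described by M_c:

new_rows = [[0.7, -1.2], [0.1, 0.4]]  # two hypothetical rows for a two-column table
X_L, X_D = _do_insert(M_c, T, X_L, X_D, new_rows, N_GRID=31, CT_KERNEL=0)
T.extend(new_rows)                    # keep the raw data table in sync with the state
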
Example #22
def _do_initialize(
        SEED, M_c, M_r, T, initialization, row_initialization,
         ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID, S_GRID, MU_GRID, N_GRID,):

    p_State = State.p_State(
        M_c, T, initialization=initialization,
        row_initialization=row_initialization, SEED=SEED,
        ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID, S_GRID=S_GRID,
        MU_GRID=MU_GRID, N_GRID=N_GRID,)

    X_L = p_State.get_X_L()
    X_D = p_State.get_X_D()
    return X_L, X_D
	def setUp(self):
		# generate a crosscat state and pull the metadata
		gen_seed = 0
		num_clusters = 2
		self.num_rows = 10
		self.num_cols = 2
		num_splits = 1

		self.T, self.M_r, self.M_c = du.gen_factorial_data_objects(gen_seed,
							   num_clusters, self.num_cols, 
							   self.num_rows, num_splits)

		state = State.p_State(self.M_c, self.T)
		self.X_L = state.get_X_L()
		self.X_D = state.get_X_D()
Example #24
def _do_analyze(
        SEED, X_L, X_D, M_c, T, kernel_list, n_steps, c, r,
        max_iterations, max_time, ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID,
        S_GRID, MU_GRID, N_GRID, CT_KERNEL, progress):

    p_State = State.p_State(
        M_c, T, X_L, X_D, SEED=SEED, ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
        COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID, S_GRID=S_GRID,
        MU_GRID=MU_GRID, N_GRID=N_GRID, CT_KERNEL=CT_KERNEL)

    p_State.transition(
        kernel_list, n_steps, c, r, max_iterations, max_time, progress)

    X_L_prime = p_State.get_X_L()
    X_D_prime = p_State.get_X_D()
    return X_L_prime, X_D_prime
Example #25
def _do_analyze_with_diagnostic(SEED, X_L, X_D, M_c, T, kernel_list, n_steps, c, r,
                                max_iterations, max_time, diagnostic_func_dict, every_N,
                                ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID,
                                S_GRID, MU_GRID,
                                N_GRID,
                                do_timing,
                                CT_KERNEL,
                                ):
    diagnostics_dict = collections.defaultdict(list)
    if diagnostic_func_dict is None:
        diagnostic_func_dict = dict()
        every_N = None
    child_n_steps_list = get_child_n_steps_list(n_steps, every_N)
    # import ipdb; ipdb.set_trace()
    p_State = State.p_State(M_c, T, X_L, X_D, SEED=SEED,
                            ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
                            COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
                            S_GRID=S_GRID,
                            MU_GRID=MU_GRID,
                            N_GRID=N_GRID,
                            CT_KERNEL=CT_KERNEL,
                            )
    with gu.Timer('all transitions', verbose=False) as timer:
        for child_n_steps in child_n_steps_list:
            p_State.transition(kernel_list, child_n_steps, c, r,
                               max_iterations, max_time)
            for diagnostic_name, diagnostic_func in six.iteritems(diagnostic_func_dict):
                diagnostic_value = diagnostic_func(p_State)
                diagnostics_dict[diagnostic_name].append(diagnostic_value)
                pass
            pass
        pass
    X_L_prime = p_State.get_X_L()
    X_D_prime = p_State.get_X_D()
    #
    if do_timing:
        # diagnostics and timing are exclusive
        diagnostics_dict = timer.elapsed_secs
        pass
    return X_L_prime, X_D_prime, diagnostics_dict
def calc_mean_test_log_likelihood(M_c, T, X_L, X_D, T_test):
    state = State.p_State(M_c, T, X_L, X_D)
    test_log_likelihoods = list(map(state.calc_row_predictive_logp, T_test))
    mean_test_log_likelihood = numpy.mean(test_log_likelihoods)
    return mean_test_log_likelihood
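
A hypothetical held-out evaluation sketch using the helper above; it assumes X_L and X_D were obtained by analyzing the training rows T_train (the split is chosen purely for illustration):

T_train, T_test = T[:-10], T[-10:]
score = calc_mean_test_log_likelihood(M_c, T_train, X_L, X_D, T_test)
print("mean held-out log likelihood: %f" % score)
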
Example #27
def run_test(args):

    rho = args.rho
    num_times = args.num_times
    min_num_rows = args.min_num_rows
    max_num_rows = args.max_num_rows
    n_grid = args.n_grid
    filename = args.filename
    discrete = args.discrete

    num_samples = []
    for ns in log_linspace(min_num_rows, max_num_rows, n_grid).tolist():
        num_samples.append(int(ns))

    variances = []

    burn_in = 200

    MIs = numpy.zeros((num_times, len(num_samples)))

    mi_diff = numpy.zeros((len(num_samples), num_times))

    if not discrete:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ["continuous"] * 2
    else:
        T, true_mi, external_mi = gen_correlated_data_discrete(num_samples[-1], rho)
        cctypes = ["multinomial"] * 2

    data_subs = []

    n_index = 0
    for n in num_samples:
        T_sub = numpy.copy(T[0 : n - 1, :])

        data = []

        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()

            MI, Linfoot = iu.mutual_information(M_c, [X_L], [X_D], [(0, 1)], n_samples=5000)

            mi_diff[n_index, t] = true_mi - MI[0][0]

            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, MI[0][0]))

            MIs[t, n_index] = MI[0][0]

        n_index += 1

    if discrete:
        dtype_str = "discrete"
    else:
        dtype_str = "continuous"

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"

    pl.figure

    # plot data
    # pl.subplot(1,2,1)
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    i = 0
    for T_s in data_subs:
        pl.subplot(1, len(data_subs), i + 1)
        num_rows = num_samples[i]
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0], T_s[:, 1], bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=0.3, s=81)
        pl.title("#r: " + str(num_rows))

        i += 1

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(datname)
    pl.clf()

    pl.figure(tight_layout=True, figsize=(5, 4))
    # plot convergence
    # pl.subplot(1,2,2)
    # standard deviation
    stderr = numpy.std(MIs, axis=0)  # /(float(num_times)**.5)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=stderr, c="blue")
    pl.plot(num_samples, mean, c="blue", alpha=0.8, label="mean MI")
    pl.plot(num_samples, [true_mi] * len(num_samples), color="red", alpha=0.8, label="true MI")
    pl.plot(num_samples, [external_mi] * len(num_samples), color=(0, 0.5, 0.5), alpha=0.8, label="external MI")
    pl.title("convergence")
    pl.xlabel("#rows in X (log)")
    pl.ylabel("CrossCat MI - true MI")

    pl.legend(loc=0, prop={"size": 8})
    pl.gca().set_xscale("log")

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(figname)
Example #28
    # Generate data from this state partition
    T, M_r, M_c = eu.GenDataFromPartitions(state['col_parts'],
                                           state['row_parts'], 0, 10, .5,
                                           get_next_seed(rng))
    # calculate the probability of the data under each state
    P = np.exp(eu.CCML(state_partitions, T, mu, r, nu, s, alpha, alpha))
    # print "done."

    # initialize state samples counter
    state_count = np.zeros(NS)

    # print "Sampling..."
    # start collecting samples
    # initialize the sampler
    p_State = State.p_State(M_c, T, N_GRID=100, SEED=get_next_seed(rng))
    X_L = eu.FixPriors(p_State.get_X_L(), alpha, mu, s, r, nu)
    X_D = p_State.get_X_D()
    p_State = State.p_State(M_c,
                            T,
                            N_GRID=100,
                            X_L=X_L,
                            X_D=X_D,
                            SEED=get_next_seed(rng))

    for b in range(200):
        p_State.transition(which_transitions=[
            'column_partition_assignments', 'row_partition_assignments'
        ])

    mlen = 0
def check_predictive_sample_improvement(component_model_type, seed=0, show_plot=True):
	""" Shows the error of predictive sample over iterations.
	"""

	num_transitions = 100
	num_samples = 10	
	num_clusters = 2
	separation = .9	# cluster separation
	N = 150
	
	random.seed(seed)
	get_next_seed = lambda : random.randrange(2147483647)

	# generate a single column of data from the component_model 
	cctype = component_model_type.cctype
	T, M_c, struc = sdg.gen_data([cctype], N, [0], [[.5,.5]], [separation], 
				seed=get_next_seed(), distargs=[distargs[cctype]], 
				return_structure=True)

	T_array = numpy.array(T)

	X = numpy.zeros((N,num_transitions))
	KL = numpy.zeros((num_samples, num_transitions))


	support = qtu.get_mixture_support(cctype, component_model_type, 
					struc['component_params'][0], nbins=1000, support=.995)
	true_log_pdf = qtu.get_mixture_pdf(support, component_model_type, 
					struc['component_params'][0],[.5,.5])

	for s in range(num_samples):
		# generate the state
		state = State.p_State(M_c, T, SEED=get_next_seed())

		for i in range(num_transitions):
			# transition
			state.transition()

			# get partitions and generate a predictive column
			X_L = state.get_X_L()
			X_D = state.get_X_D()

			T_inf = sdg.predictive_columns(M_c, X_L, X_D, [0], 
					seed=get_next_seed())

			if cctype == 'multinomial':
				K = distargs[cctype]['K']
				weights = numpy.zeros(numpy.array(K))
				for params in struc['component_params'][0]:
					weights += numpy.array(params['weights'])*(1.0/num_clusters)
				weights *= float(N)
				inf_hist = qtu.bincount(T_inf, bins=list(range(K)))
				err, _ = stats.power_divergence(inf_hist, weights, lambda_='pearson')
				err = numpy.ones(N)*err
			else:
				err = (T_array-T_inf)**2.0

			KL[s,i] = qtu.KL_divergence(component_model_type, 
						struc['component_params'][0], [.5,.5], M_c, X_L, X_D,
						true_log_pdf=true_log_pdf, support=support)

			for j in range(N):
				X[j,i] += err[j]

	X /= num_samples

	# mean and standard error
	X_mean = numpy.mean(X,axis=0)
	X_err = numpy.std(X,axis=0)/float(num_samples)**.5

	KL_mean = numpy.mean(KL, axis=0)
	KL_err = numpy.std(KL, axis=0)/float(num_samples)**.5

	if show_plot:
		pylab.subplot(1,2,1)
		pylab.errorbar(list(range(num_transitions)), X_mean, yerr=X_err)
		pylab.xlabel('iteration')
		pylab.ylabel('error across each data point')
		pylab.title('error of predictive sample over iterations, N=%i' % N)

		pylab.subplot(1,2,2)
		pylab.errorbar(list(range(num_transitions)), KL_mean, yerr=KL_err)
		pylab.xlabel('iteration')
		pylab.ylabel('KL divergence')
		pylab.title('KL divergence, N=%i' % N)

		pylab.show()

	# error should decrease over time
	return X_mean[0] > X_mean[-1] and KL_mean[0] > KL_mean[-1]
Example #30
        num_cols, num_rows, num_splits,
        max_mean=max_mean, max_std=max_std,
        )
else:
    with open('SynData2.csv') as fh:
        import numpy
        import csv
        T = numpy.array([
                row for row in csv.reader(fh)
                ], dtype=float).tolist()
        M_r = du.gen_M_r_from_T(T)
        M_c = du.gen_M_c_from_T(T)


# create the state
p_State = State.p_State(M_c, T, N_GRID=N_GRID, SEED=inf_seed)
p_State.plot_T(filename='T')

# transition the sampler
print("p_State.get_marginal_logp():", p_State.get_marginal_logp())
for transition_idx in range(num_transitions):
    print("transition #: %s" % transition_idx)
    p_State.transition()
    counts = [
        view_state['row_partition_model']['counts']
        for view_state in p_State.get_X_L()['view_state']
        ]
    format_list = '; '.join([
            "s.num_views: %s",
            "cluster counts: %s",
            "s.column_crp_score: %.3f",
def run_test_multinomial(n, observed):
	n_rows = 40
	n_cols = 40

	if observed:
		query_row = 10
	else:
		query_row = n_rows

	query_column = 1

	Q = [(query_row, query_column)]

	# do the test with multinomial data
	T, M_r, M_c = generate_multinomial_data(get_next_seed(),2,n_rows,1)
	
	state = State.p_State(M_c, T)

	X_L = state.get_X_L()
	X_D = state.get_X_D()

	Y = []

	# pull n samples
	samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n)
	X_array = numpy.sort(numpy.array(samples))
	X = numpy.unique(X_array)
	X = X.tolist()

	# build the queries
	Qs = [];
	for x in X:
	    # Qtmp = (query_row, query_column, x[0])
	    Qtmp = (query_row, query_column, x)
	    Qs.append(Qtmp)

	# get pdf values
	densities = numpy.exp(su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

	print "Sum of densities (should be 1): %f" % (numpy.sum(densities))

	pylab.clf()

	# PLOT: probability vs samples distribution
	# scale all histograms to be valid PDFs (area=1)
	mbins = numpy.unique(X_array)

	mbins = numpy.append(mbins,max(mbins)+1)

	pdf, bins = numpy.histogram(X_array,mbins)

	pdf = pdf/float(numpy.sum(pdf))
	pylab.bar(mbins[0:-1],pdf,label="samples",alpha=.5)
	pylab.scatter(X,densities, c="red", label="pdf", edgecolor='none')

	pylab.legend(loc='upper left',fontsize='x-small')
	pylab.xlabel('value') 
	pylab.ylabel('frequency/density')
	pylab.title('TEST: PDF (not scaled)')

	pylab.show()

	raw_input("Press Enter when finished...")
Example #32
def run_test_multinomial(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = generate_multinomial_data(get_next_seed(), 2, n_rows, 1)

    state = State.p_State(M_c, T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []

    # pull n samples
    samples = su.simple_predictive_sample(M_c,
                                          X_L,
                                          X_D,
                                          Y,
                                          Q,
                                          get_next_seed,
                                          n=n)
    X_array = numpy.sort(numpy.array(samples))
    X = numpy.unique(X_array)
    X = X.tolist()

    # build the queries
    Qs = []
    for x in X:
        # Qtmp = (query_row, query_column, x[0])
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    print("Sum of densities (should be 1): %f" % (numpy.sum(densities)))

    pylab.clf()

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    mbins = numpy.unique(X_array)

    mbins = numpy.append(mbins, max(mbins) + 1)

    pdf, bins = numpy.histogram(X_array, mbins)

    pdf = pdf / float(numpy.sum(pdf))
    pylab.bar(mbins[0:-1], pdf, label="samples", alpha=.5)
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    fd, fig_filename = tempfile.mkstemp(prefix='run_test_multinomial_',
                                        suffix='.png',
                                        dir='.')
    pylab.savefig(fig_filename)
Example #33
	progress = "[State %i] Collecting samples..." % (state['idx'])
	sys.stdout.write(progress)

	# Generate data from this state partition
	T, M_r, M_c = eu.GenDataFromPartitions(state['col_parts'], state['row_parts'], 0, 10, .5)
	# calculate the probability of the data under each state
	P = np.exp(eu.CCML(state_partitions, T, mu, r, nu, s, alpha, alpha))
	# print "done."
	
	# initialize state samples counter
	state_count = np.zeros(NS)

	# print "Sampling..."
	# start collecting samples
	# initialize the sampler
	p_State = State.p_State(M_c, T, N_GRID=100)
	X_L = eu.FixPriors(p_State.get_X_L(), alpha, mu, s, r, nu)
	X_D = p_State.get_X_D()
	p_State = State.p_State(M_c, T, N_GRID=100, X_L=X_L, X_D=X_D)


	for b in range(200):
		p_State.transition(which_transitions=['column_partition_assignments','row_partition_assignments'])

	mlen = 0;
	for j in range(iters):
		for b in range(burns):
			p_State.transition(which_transitions=['column_partition_assignments','row_partition_assignments'])

		progress1 = "%i of %i" % (j, iters)
		progress = "%s%s" % ('\b'*mlen, progress1)
def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c= du.gen_factorial_data_objects(get_next_seed(),2,2,n_rows,1)

    state = State.p_State(M_c, T)

    T_array = numpy.array(T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = [] # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X-2.*std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X+2.*std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries
    Qs = [];
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under Ps2 and pdfs is about 1 
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X)-1):
        area_density += (X[i+1]-X[i])*(densities[i+1]+densities[i])/2.0

    print "Area of PDF (should be close to, but not greater than, 1): " + str(area_density)
    print "*Note: The area will be less than one because the range (integral) is truncated."

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X,100,normed=1, histtype='stepfilled',label='samples', alpha=.5, color=[.5,.5,.5])
    pylab.scatter(X,densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left',fontsize='x-small')
    pylab.xlabel('value') 
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    raw_input("Press Enter when finished...")
	progress = "[State %i] Collecting samples..." % (state['idx'])
	sys.stdout.write(progress)

	# Generate data from this state partition
	T, M_r, M_c = eu.GenDataFromPartitions(state['col_parts'], state['row_parts'], 0, 10, .5, get_next_seed(rng))
	# calculate the probability of the data under each state
	P = np.exp(eu.CCML(state_partitions, T, mu, r, nu, s, alpha, alpha))
	# print "done."
	
	# initialize state samples counter
	state_count = np.zeros(NS)

	# print "Sampling..."
	# start collecting samples
	# initialize the sampler
	p_State = State.p_State(M_c, T, N_GRID=100, SEED=get_next_seed(rng))
	X_L = eu.FixPriors(p_State.get_X_L(), alpha, mu, s, r, nu)
	X_D = p_State.get_X_D()
	p_State = State.p_State(M_c, T, N_GRID=100, X_L=X_L, X_D=X_D, SEED=get_next_seed(rng))


	for b in range(200):
		p_State.transition(which_transitions=['column_partition_assignments','row_partition_assignments'])

	mlen = 0;
	for j in range(iters):
		for b in range(burns):
			p_State.transition(which_transitions=['column_partition_assignments','row_partition_assignments'])

		progress1 = "%i of %i" % (j, iters)
		progress = "%s%s" % ('\b'*mlen, progress1)
def test_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None):
    """

    """
    random.seed(seed)

    N = 1000
    separation = .9
    
    get_next_seed = lambda : random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                        [separation], seed=get_next_seed(),
                        distargs=[distargs[cctype]],
                        return_structure=True)

    T = numpy.array(T)
    T_list = T
    
    # create a crosscat state 
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    
    state = State.p_State(M_c, T_list)
    
    # transitions
    state.transition(n_steps=200)
    
    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    
    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(M_c, X_L, X_D, [0],
                            seed=get_next_seed()).flatten(1)
    
    # Get support over all component models
    discrete_support = qtu.get_mixture_support(cctype, component_model_type,
                         structure['component_params'][0], nbins=500)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, []*len(Q), Q)
    
    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), normed=True)
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    
    if show_plot:
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type, 
                structure['component_params'][0], [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, 
            numpy.exp(lpdf), 
            c="blue", 
            edgecolor="none",
            s=100, 
            label="true pdf", 
            alpha=1,
            zorder=3)
                
        # plot predictive probability of support points
        pylab.scatter(discrete_support, 
            numpy.exp(probabilities), 
            c="red", 
            edgecolor="none",
            s=100, 
            label="predictive probability", 
            alpha=1,
            zorder=4)
            
        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        pylab.show()

    return p
	
	for r in correlations:
		for d in range(n_data_sets): # 3 data sets
			#
			T = gen_correlated_data( n, r, SEED=get_next_seed())

			pr, p = pearsonr(T[:,0], T[:,1])

			print "num_samples: %i, R: %f, d: %i. Actual R: %f" % (n, r, d+1, pr)

			M_c = du.gen_M_c_from_T(T)
			X_Ls = []
			X_Ds = []

			for _ in range(n_samples):
				state = State.p_State(M_c, T)
				state.transition(n_steps=burn_in)
				X_Ds.append(state.get_X_D())
				X_Ls.append(state.get_X_L())
			
			MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0,1)], n_samples=200)

			if d == 0:
				data_d = numpy.transpose(Linfoot)
			else:
				data_d = numpy.vstack((data_d, numpy.transpose(Linfoot)))

		if nr == 0:
			data = data_d
		else:
			data = numpy.hstack((data, data_d))
Example #38
def test_impute_vs_column_average_single(component_model_type,
                                         num_clusters,
                                         seed=0):
    """	tests predictive row generation vs column average
		Note: This test does not make sense for categorical data
		Inputs:
			- component_model_type: main class from datatype. Ex:
				ccmext.p_ContinuousComponentModel 
			- num_clusters: the number of clusters in the data
			- seed: (optional) int to seed the RNG 
		Returns:
			- the mean square error of the predictive sample column
			- the mean square error of the column average column
	"""

    random.seed(seed)

    N = 100

    get_next_seed = lambda: random.randrange(2147483647)

    C = .9  # highly-separated clusters

    cctype = component_model_type.cctype

    component_model_parameters = sdg.generate_separated_model_parameters(
        cctype, C, num_clusters, get_next_seed, distargs=distargs[cctype])

    # generate a partition of rows to clusters (evenly-weighted)
    Z = list(range(num_clusters))
    for z in range(N - num_clusters):
        Z.append(random.randrange(num_clusters))

    random.shuffle(Z)

    # generate the data
    T = numpy.array([[0]] * N, dtype=float)

    for x in range(N):
        z = Z[x]
        T[x] = component_model_type.generate_data_from_parameters(
            component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

    T_list = T.tolist()

    # initialize the state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T)

    # transitions
    state.transition(n_steps=100)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate a row from the sample
    T_generated = sdg.predictive_columns(M_c,
                                         X_L,
                                         X_D, [0],
                                         seed=get_next_seed())

    # generate a row of column averages
    T_colave = numpy.ones(T.shape) * numpy.mean(T)

    # get the mean squared error
    err_sample = numpy.mean((T_generated - T)**2.0)
    err_colave = numpy.mean((T_colave - T)**2.0)

    return err_sample, err_colave
    sys.stdout.write(progress)

    # Generate data from this state partition
    T, M_r, M_c = eu.GenDataFromPartitions(state['col_parts'],
                                           state['row_parts'], 0, 10, .5)
    # calculate the probability of the data under each state
    P = np.exp(eu.CCML(state_partitions, T, mu, r, nu, s, alpha, alpha))
    # print "done."

    # initialize state samples counter
    state_count = np.zeros(NS)

    # print "Sampling..."
    # start collecting samples
    # initialize the sampler
    p_State = State.p_State(M_c, T, N_GRID=100)
    X_L = eu.FixPriors(p_State.get_X_L(), alpha, mu, s, r, nu)
    X_D = p_State.get_X_D()
    p_State = State.p_State(M_c, T, N_GRID=100, X_L=X_L, X_D=X_D)

    for b in range(200):
        p_State.transition(which_transitions=[
            'column_partition_assignments', 'row_partition_assignments'
        ])

    mlen = 0
    for j in range(iters):
        for b in range(burns):
            p_State.transition(which_transitions=[
                'column_partition_assignments', 'row_partition_assignments'
            ])
Example #40
def calc_mean_test_log_likelihood(M_c, T, X_L, X_D, T_test):
    state = State.p_State(M_c, T, X_L, X_D)
    test_log_likelihoods = list(map(state.calc_row_predictive_logp, T_test))
    mean_test_log_likelihood = numpy.mean(test_log_likelihoods)
    return mean_test_log_likelihood
def test_one_feature_mixture(component_model_type,
                             num_clusters=3,
                             show_plot=False,
                             seed=None):
    """

    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype],
                                     N, [0],
                                     cluster_weights, [separation],
                                     seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # pdb.set_trace()
    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype,
        component_model_type,
        structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten(1)

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    # T = T[:,0]
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(50, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data',
                  zorder=1)
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples',
                  zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(lpdf),
                      c="blue",
                      edgecolor="none",
                      s=100,
                      label="true pdf",
                      alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      edgecolor="none",
                      s=100,
                      label="predictive probability",
                      alpha=1,
                      zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixtrue.png"
        pylab.savefig(filename)
        pylab.close()

    return p
Example #42
T, M_r, M_c, header = du.all_continuous_from_file(filename, max_rows, gen_seed)
is_multinomial = [label in multinomial_labels for label in header]
multinomial_column_indices = numpy.nonzero(is_multinomial)[0]
T, M_c = du.convert_columns_to_multinomial(T, M_c, multinomial_column_indices)

burn_in = 10
lag = 10
num_samples = 10
engine = Engine()

# initialize
kernel_list = None
c, r, max_iterations, max_time = None, None, None, None
X_L, X_D = engine.initialize(M_c, M_r, T)

# burn in
X_L, X_D = engine.analyze(M_c, T, X_L, X_D, kernel_list, burn_in, c, r,
                          max_iterations, max_time)

# draw sample states
for sample_idx in range(num_samples):
    print "starting sample_idx #: %s" % sample_idx
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D, kernel_list, lag, c, r,
                              max_iterations, max_time)
    p_State = State.p_State(M_c, T, X_L, X_D, N_GRID=N_GRID)
    plot_filename = 'sample_%s_X_D' % sample_idx
    pkl_filename = 'sample_%s_pickled_state.pkl.gz' % sample_idx
    p_State.save(filename=pkl_filename, M_c=M_c, T=T)
    p_State.plot(filename=plot_filename)
def check_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Draws 100 random points from a standard normal distribution
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initialize a crosscat state with that data
    5. Get one sample after 100 transitions
    6. Draw predictive samples
    7. Calculates the 95 percent support of the continuous distribution or the
        entire support of the discrete distribution
    8. Calculate the true pdf for each point in the support
    9. Calculate the predictive probability given the sample for each point in
        the support
    10. (OPTIONAL) Plot the original data, predictive samples, pdf, and 
        predictive probabilities 
    11. Calculate goodness of fit stats (returns p value)
    """
    N = 250
    
    get_next_seed = lambda : random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]
    
    X = component_model_type.generate_data_from_parameters(data_params, N, gen_seed=get_next_seed())
    
    hyperparameters = component_model_type.draw_hyperparameters(X, gen_seed=get_next_seed())[0]
    
    component_model = component_model_type.from_data(X, hyperparameters)
    
    model_parameters = component_model.sample_parameters_given_hyper()
    
    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state 
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])
    
    state = State.p_State(M_c, T)
    
    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)
    
    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    
    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(su.simple_predictive_sample(M_c, X_L, X_D, [], [(N,0)], get_next_seed, n=N)).flatten(1)
    
    # get support
    discrete_support = component_model_type.generate_discrete_support(model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, []*len(Q), Q,)
    
    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), normed=True)
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    
    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, 
            numpy.exp(component_model_type.log_pdf(numpy.array(discrete_support), 
            model_parameters)), 
            c="blue", 
            s=100, 
            label="true pdf", 
            alpha=1)

        # pylab.ylim([0,2])
                
        # plot predictive probability of support points
        pylab.scatter(discrete_support, 
            numpy.exp(probabilities), 
            c="red", 
            s=100, 
            label="predictive probability", 
            alpha=1)
            
        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
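A minimal usage sketch for check_one_feature_sampler above; the ccmext import path and the 0.05 threshold are assumptions, not part of the original module.

# hedged usage sketch: run the one-feature sampler check a few times and
# inspect the returned p-values
import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext  # assumed path

p_values = [check_one_feature_sampler(ccmext.p_ContinuousComponentModel,
                                      show_plot=False) for _ in range(5)]

# with a well-behaved sampler, most runs should not reject at the 5% level
print("p-values: %s" % p_values)
print("fraction rejected at alpha=0.05: %f" %
      (sum(p < 0.05 for p in p_values) / float(len(p_values))))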
Example #44
0
def _do_initialize2(args):
    # unpack the argument tuple; tuple parameters in the signature are not
    # valid Python 3 syntax
    (M_c, M_r, T, initialization, SEED) = args
    p_State = State.p_State(M_c, T, initialization=initialization, SEED=SEED)
    X_L = p_State.get_X_L()
    X_D = p_State.get_X_D()
    return X_L, X_D
Example #45
0
datas = []

nr = 0
for w in widths:
    T, mi_est = gen_ring(n, w, SEED=get_next_seed())

    datas.append(T)

    print "num_samples: %i, width: %f" % (n, w)

    M_c = du.gen_M_c_from_T(T, cctypes)
    X_Ls = []
    X_Ds = []

    for ns in range(n_samples):
        state = State.p_State(M_c, T)
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    MI, Linfoot = iu.mutual_information(M_c,
                                        X_Ls,
                                        X_Ds, [(0, 1)],
                                        n_samples=5000)

    data_d = numpy.transpose(MI)

    if nr == 0:
        data = data_d
    else:
        data = numpy.hstack((data, data_d))
def test_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of a component model of component_model_type to capture
    the distribution of the data.
    1. Draws N = 250 random points from the default data parameters for the
        model type
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initializes a crosscat state with that data
    5. Gets one sample after 100 transitions
    6. Draws predictive samples
    7. Calculates the 95 percent support of the continuous distribution or the
        entire support of the discrete distribution
    8. Calculates the true pdf for each point in the support
    9. Calculates the predictive probability given the sample for each point in
        the support
    10. (OPTIONAL) Plots the original data, predictive samples, pdf, and
        predictive probabilities
    11. Calculates goodness-of-fit stats and returns a p-value
    (A usage sketch covering multiple component model types follows this function.)
    """
    N = 250

    get_next_seed = lambda: random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X)[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer with row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c,
                                    X_L,
                                    X_D, [], [(N, 0)],
                                    get_next_seed,
                                    n=N)).flatten(1)

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(
        M_c,
        X_L,
        X_D,
        [] * len(Q),
        Q,
    )

    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(20, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data')
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(
                          component_model_type.log_pdf(
                              numpy.array(discrete_support),
                              model_parameters)),
                      c="blue",
                      s=100,
                      label="true pdf",
                      alpha=1)

        # pylab.ylim([0,2])

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      s=100,
                      label="predictive probability",
                      alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
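A hedged sketch of sweeping test_one_feature_sampler over several component model types with a Bonferroni-corrected threshold; the import paths and class names below are assumptions.

# hedged sketch: run the test for each component model type and apply a
# Bonferroni correction across the tests
import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext  # assumed path
import crosscat.tests.component_model_extensions.MultinomialComponentModel as mcmext  # assumed path

component_model_types = [ccmext.p_ContinuousComponentModel,
                         mcmext.p_MultinomialComponentModel]

alpha = 0.05
corrected_alpha = alpha / len(component_model_types)

for cm_type in component_model_types:
    p = test_one_feature_sampler(cm_type, show_plot=False)
    verdict = "OK" if p >= corrected_alpha else "REJECT"
    print("%s: p = %f (%s at corrected alpha = %f)" %
          (cm_type.cctype, p, verdict, corrected_alpha))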
Example #47
0
is_multinomial = [label in multinomial_labels for label in header]
multinomial_column_indices = numpy.nonzero(is_multinomial)[0]
T, M_c = du.convert_columns_to_multinomial(T, M_c,
                                           multinomial_column_indices)


burn_in = 10
lag = 10
num_samples = 10
engine = Engine()

# initialize
kernel_list = None
c, r, max_iterations, max_time = None, None, None, None
X_L, X_D = engine.initialize(M_c, M_r, T)

# burn in 
X_L, X_D = engine.analyze(M_c, T, X_L, X_D, kernel_list, burn_in,
                          c, r, max_iterations, max_time)

# draw sample states
for sample_idx in range(num_samples):
    print "starting sample_idx #: %s" % sample_idx
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D, kernel_list, lag,
                              c, r, max_iterations, max_time)
    p_State = State.p_State(M_c, T, X_L, X_D, N_GRID=N_GRID)
    plot_filename = 'sample_%s_X_D' % sample_idx
    pkl_filename = 'sample_%s_pickled_state.pkl.gz' % sample_idx
    p_State.save(filename=pkl_filename, M_c=M_c, T=T)
    p_State.plot(filename=plot_filename)
Example #48
0
def continuous_imputation_confidence(samples,
                                     imputed,
                                     column_component_suffstats_i,
                                     n_steps=100,
                                     n_chains=1,
                                     return_metadata=False):
    # XXX: the confidence in continuous imputation is "the probability that
    # there exists a unimodal summary" which is defined as the proportion of
    # probability mass in the largest mode of a DPMM inferred from the simulated
    # samples. We use crosscat on the samples for a given number of iterations,
    # then calculate the proportion of mass in the largest mode.
    #
    # NOTE: The definition of confidence and its implementation do not agree.
    # The probability of a unimodal summary is P(k=1|X), where k is the number
    # of components in some infinite mixture model. I would describe the
    # current implementation as "Is there a mode with sufficient mass
    # that we can ignore the other modes". If this second formulation is to be
    # used, it means that we need to not use the median of all the samples as
    # the imputed value, but the median of the samples of the summary mode,
    # because the summary (the imputed value) should come from the summary
    # mode.
    #
    # There are a lot of problems with this second formulation.
    # 0. SLOW. Like, for real.
    # 1. Non-deterministic. The answer will be different given the same
    #   samples.
    # 2. Inaccurate. Approximate inference about approximate inferences.
    #   In practice, confidences on the same samples could be significantly
    #   different because the Gibbs sampler that underlies crosscat is
    #   susceptible to getting stuck in local maxima. Of course, this could be
    #   mitigated to some extent by using more chains, but things are slow
    #   enough as it is.
    # 3. Confidence (interval) has a distinct meaning to the people who will
    #   be using this software. A unimodal summary does not necessarily mean
    #   that inferences are within an acceptable range. We are going to need to
    #   be loud about this. Maybe there should be a notion of tolerance?
    #
    # An alternative: mutual predictive coverage
    # ------------------------------------------
    # Divide the number of samples in the intersection of the 90% CI's of each
    # component model by the number of samples in the union of the 90% CI's of
    # each component model. (A hedged sketch of this idea follows this function.)

    from crosscat.cython_code import State

    # XXX: assumes samples come in as a 1-D numpy.array or a 1-D list
    num_samples = float(len(samples))
    T = [[x] for x in samples]

    # XXX: This is a highly problematic consequence of the current definition of
    # confidence. If the number of samples is 1, then the confidence is always
    # 1 because there will be exactly 1 mode in the DPMM (recall the DPMM can
    # have, at maximum, as many modes as data points). I figure if we're going
    # to give a bad answer, we should give it quickly.
    if num_samples == 1:
        return 1.0

    confs = []
    tlist = [
        'column_hyperparameters', 'row_partition_hyperparameters',
        'row_partition_assignments'
    ]
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    if return_metadata:
        X_L_list = []
        X_D_list = []

    for _ in range(n_chains):
        ccstate = State.p_State(M_c, T)
        ccstate.transition(which_transitions=tlist, n_steps=n_steps)

        X_D = ccstate.get_X_D()

        assignment = X_D[0]
        num_cats = max(assignment) + 1
        props = numpy.histogram(assignment, num_cats)[0] / num_samples
        confs.append(max(props))

        if return_metadata:
            X_L_list.append(ccstate.get_X_L())
            X_D_list.append(X_D)

    conf = numpy.mean(confs)
    if return_metadata:
        return conf, X_L_list, X_D_list
    else:
        return conf
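A minimal sketch of the "mutual predictive coverage" alternative described in the comment block above; mutual_predictive_coverage is a hypothetical helper, the per-component 90% intervals are assumed to be computed elsewhere, and numpy is assumed to be imported as in the rest of this module.

def mutual_predictive_coverage(samples, intervals):
    """Fraction of samples covered by every component's 90% CI, relative to
    the number covered by at least one component's 90% CI.

    samples: 1-D sequence of predictive/imputation samples
    intervals: list of (low, high) 90% CI tuples, one per component model
    """
    samples = numpy.asarray(samples, dtype=float)

    in_any = numpy.zeros(len(samples), dtype=bool)   # union of the CI's
    in_all = numpy.ones(len(samples), dtype=bool)    # intersection of the CI's
    for low, high in intervals:
        inside = (samples >= low) & (samples <= high)
        in_any |= inside
        in_all &= inside

    n_union = in_any.sum()
    if n_union == 0:
        return 0.0
    return float(in_all.sum()) / float(n_union)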
Example #49
0
def run_test(args):

    rho = args.rho
    num_times = args.num_times
    min_num_rows = args.min_num_rows
    max_num_rows = args.max_num_rows
    n_grid = args.n_grid
    filename = args.filename
    discrete = args.discrete

    num_samples = []
    for ns in log_linspace(min_num_rows, max_num_rows, n_grid).tolist():
        num_samples.append(int(ns))

    variances = []

    burn_in = 200

    MIs = numpy.zeros((num_times, len(num_samples)))

    mi_diff = numpy.zeros((len(num_samples), num_times))

    if not discrete:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ['continuous'] * 2
    else:
        T, true_mi, external_mi = gen_correlated_data_discrete(
            num_samples[-1], rho)
        cctypes = ['multinomial'] * 2

    data_subs = []

    n_index = 0
    for n in num_samples:
        T_sub = numpy.copy(T[0:n - 1, :])

        data = []

        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()

            MI, Linfoot = iu.mutual_information(M_c, [X_L], [X_D], [(0, 1)],
                                                n_samples=5000)

            mi_diff[n_index, t] = true_mi - MI[0][0]

            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, MI[0][0]))

            MIs[t, n_index] = MI[0][0]

        n_index += 1

    if discrete:
        dtype_str = "discrete"
    else:
        dtype_str = "continuous"

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"

    # plot data
    # pl.subplot(1,2,1)
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    i = 0
    for T_s in data_subs:
        pl.subplot(1, len(data_subs), i + 1)
        num_rows = num_samples[i]
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0],
                                                        T_s[:, 1],
                                                        bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=.3, s=81)
        pl.title('#r: ' + str(num_rows))

        i += 1

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(datname)
    pl.clf()

    pl.figure(tight_layout=True, figsize=(5, 4))
    # plot convergence
    # pl.subplot(1,2,2)
    # standard deviation
    stderr = numpy.std(MIs, axis=0)  #/(float(num_times)**.5)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=stderr, c='blue')
    pl.plot(num_samples, mean, c="blue", alpha=.8, label='mean MI')
    pl.plot(num_samples, [true_mi] * len(num_samples),
            color='red',
            alpha=.8,
            label='true MI')
    pl.plot(num_samples, [external_mi] * len(num_samples),
            color=(0, .5, .5),
            alpha=.8,
            label='external MI')
    pl.title('convergence')
    pl.xlabel('#rows in X (log)')
    pl.ylabel('CrossCat MI - true MI')

    pl.legend(loc=0, prop={'size': 8})
    pl.gca().set_xscale('log')

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(figname)
Example #50
0
def continuous_imputation_confidence(
        samples, imputed, column_component_suffstats_i, n_steps=100,
        n_chains=1, return_metadata=False):
    # XXX: the confidence in continuous imputation is "the probability that
    # there exists a unimodal summary" which is defined as the proportion of
    # probability mass in the largest mode of a DPMM inferred from the simulated
    # samples. We use crosscat on the samples for a given number of iterations,
    # then calculate the proportion of mass in the largest mode.
    #
    # NOTE: The definition of confidence and its implementation do not agree.
    # The probability of a unimodal summary is P(k=1|X), where k is the number
    # of components in some infinite mixture model (a hedged sketch of
    # estimating P(k=1|X) directly follows this function). I would describe the
    # current implementation as "Is there a mode with sufficient mass
    # that we can ignore the other modes". If this second formulation is to be
    # used, it means that we need to not use the median of all the samples as
    # the imputed value, but the median of the samples of the summary mode,
    # because the summary (the imputed value) should come from the summary
    # mode.
    #
    # There are a lot of problems with this second formulation.
    # 0. SLOW. Like, for real.
    # 1. Non-deterministic. The answer will be different given the same
    #   samples.
    # 2. Inaccurate. Approximate inference about approximate inferences.
    #   In practice, confidences on the same samples could be significantly
    #   different because the Gibbs sampler that underlies crosscat is
    #   susceptible to getting stuck in local maxima. Of course, this could be
    #   mitigated to some extent by using more chains, but things are slow
    #   enough as it is.
    # 3. Confidence (interval) has a distinct meaning to the people who will
    #   be using this software. A unimodal summary does not necessarily mean
    #   that inferences are within an acceptable range. We are going to need to
    #   be loud about this. Maybe there should be a notion of tolerance?
    #
    # An alternative: mutual predictive coverage
    # ------------------------------------------
    # Divide the number of samples in the intersection of the 90% CI's of each
    # component model by the number of samples in the union of the 90% CI's of
    # each component model.

    from crosscat.cython_code import State

    # XXX: assumes samples come in as a 1-D numpy.array or a 1-D list
    num_samples = float(len(samples))
    T = [[x] for x in samples]

    # XXX: This is a highly problematic consequence of the current definition of
    # confidence. If the number of samples is 1, then the confidence is always
    # 1 because there will be exactly 1 mode in the DPMM (recall the DPMM can
    # have, at maximum, as many modes as data points). I figure if we're going
    # to give a bad answer, we should give it quickly.
    if num_samples == 1:
        return 1.0

    confs = []
    tlist = ['column_hyperparameters',
             'row_partition_hyperparameters',
             'row_partition_assignments']
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    if return_metadata:
        X_L_list = []
        X_D_list = []

    for _ in range(n_chains):
        ccstate = State.p_State(M_c, T)
        ccstate.transition(which_transitions=tlist, n_steps=n_steps)

        X_D = ccstate.get_X_D()

        assignment = X_D[0]
        num_cats = max(assignment)+1
        props = numpy.histogram(assignment, num_cats)[0]/num_samples
        confs.append(max(props))

        if return_metadata:
            X_L_list.append(ccstate.get_X_L())
            X_D_list.append(X_D)

    conf = numpy.mean(confs)
    if return_metadata:
        return conf, X_L_list, X_D_list
    else:
        return conf
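A hedged sketch of the stricter definition mentioned in the NOTE above: estimate P(k=1|X) as the fraction of chains whose inferred row partition has exactly one component. p_of_unimodal is a hypothetical helper; it reuses du and the crosscat State class as in the function above.

def p_of_unimodal(samples, n_steps=100, n_chains=10):
    from crosscat.cython_code import State

    T = [[float(x)] for x in samples]
    M_c = du.gen_M_c_from_T(T, cctypes=['continuous'])

    unimodal_count = 0
    for _ in range(n_chains):
        ccstate = State.p_State(M_c, T)
        ccstate.transition(n_steps=n_steps)
        assignment = ccstate.get_X_D()[0]
        # one distinct row-cluster label means a single DPMM component
        if len(set(assignment)) == 1:
            unimodal_count += 1

    return unimodal_count / float(n_chains)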
Example #51
0
def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2, n_rows,
                                                1)

    state = State.p_State(M_c, T)

    T_array = numpy.array(T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c,
                                          X_L,
                                          X_D,
                                          Y,
                                          Q,
                                          get_next_seed,
                                          n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X - 2. * std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X + 2. * std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries
    Qs = []
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under Ps2 and pdfs is about 1
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i + 1] - X[i]) * (densities[i + 1] +
                                             densities[i]) / 2.0

    print("Area of PDF (should be close to, but not greater than, 1): " +
          str(area_density))
    print(
        "*Note: The area will be less than one because the range (integral) is truncated."
    )

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X,
                                    100,
                                    normed=1,
                                    histtype='stepfilled',
                                    label='samples',
                                    alpha=.5,
                                    color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()
    fd, fig_filename = tempfile.mkstemp(prefix='run_test_continuous_',
                                        suffix='.png',
                                        dir='.')
    pylab.savefig(fig_filename)
Example #52
0
def test_impute_vs_column_average_single(component_model_type, num_clusters, seed=0):
	"""	tests predictive row generation vs column average
		Note: This test does not make sense for categorical data
		Inputs:
			- component_model_type: main class from datatype. Ex:
				ccmext.p_ContinuousComponentModel 
			- num_clusters: the number of clusters in the data
			- seed: (optional) int to seed the RNG 
		Returns:
			- the mean square error of the predictive sample column
			- the mean square error of the column average column
		(a usage sketch follows this function)
	"""

	random.seed(seed)

	N = 100

	get_next_seed = lambda : random.randrange(2147483647)

	C = .9 # highly-separated clusters

	cctype = component_model_type.cctype

	component_model_parameters = sdg.generate_separated_model_parameters(
						cctype, C, num_clusters, get_next_seed,
						distargs=distargs[cctype])

	# generate a partition of rows to clusters (evenly-weighted)
	Z = list(range(num_clusters))
	for z in range(N-num_clusters):
		Z.append(random.randrange(num_clusters))

	random.shuffle(Z)

	# generate the data
	T = numpy.array([[0]]*N, dtype=float)

	for x in range(N):
		z = Z[x]
		T[x] = component_model_type.generate_data_from_parameters(
				component_model_parameters[z], 1, gen_seed=get_next_seed())[0]

	T_list = T.tolist()

	# initialize the state
	M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

	state = State.p_State(M_c, T)

	# transitions
	state.transition(n_steps=100)

	# get the sample
	X_L = state.get_X_L()
	X_D = state.get_X_D()

	# generate a row from the sample
	T_generated = sdg.predictive_columns(M_c, X_L, X_D, [0], seed=get_next_seed())

	# generate a row of column averages
	T_colave = numpy.ones(T.shape)*numpy.mean(T)

	# get the mean squared error
	err_sample = numpy.mean( (T_generated-T)**2.0 )
	err_colave = numpy.mean( (T_colave-T)**2.0 )

	return err_sample, err_colave
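A minimal usage sketch comparing the two errors returned above; when clusters are well separated, the predictive-sample error should typically be lower than the column-average error. The ccmext import path is an assumption.

import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext  # assumed path

for num_clusters in [2, 4, 8]:
    err_sample, err_colave = test_impute_vs_column_average_single(
        ccmext.p_ContinuousComponentModel, num_clusters, seed=0)
    print("%i clusters: MSE(predictive sample) = %f, MSE(column average) = %f" %
          (num_clusters, err_sample, err_colave))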