Example #1
def _do_simple_predictive_sample(M_c, X_L, X_D, Y, Q, n, get_next_seed):
    is_multistate = su.get_is_multistate(X_L, X_D)
    if is_multistate:
        samples = su.simple_predictive_sample_multistate(M_c, X_L, X_D, Y, Q,
                                                         get_next_seed, n)
    else:
        samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q,
                                              get_next_seed, n)
    return samples
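A minimal sketch of how this dispatcher might be invoked; it assumes `M_c`, `X_L`, and `X_D` come from an already-analyzed crosscat state, and reuses the `gu.int_generator` seeding idiom from Example #2 (the import path is an assumption):

# Hypothetical driver; M_c, X_L, X_D are assumed to come from a fitted state.
import crosscat.utils.general_utils as gu

int_generator = gu.int_generator(0)          # deterministic stream of seeds
get_next_seed = lambda: next(int_generator)  # one fresh seed per draw
Q = [(0, 0)]                                 # query cell: row 0, column 0
samples = _do_simple_predictive_sample(M_c, X_L, X_D, [], Q, 10, get_next_seed)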
Example #2
def create_test_set(M_c, T, X_L, X_D, n_test, seed_seed=0):
    sample_row_idx = len(T) + 1
    n_cols = len(T[0])
    Y = []
    Q = [(sample_row_idx, col_idx) for col_idx in range(n_cols)]
    int_generator = gu.int_generator(seed_seed)
    get_next_seed = lambda: next(int_generator)
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed, n=n_test)
    return samples
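Because `Q` queries every column of a single out-of-range row, each returned sample is one full synthetic row. A quick sketch of how the result might be inspected (the shape check is an assumption about `simple_predictive_sample`'s list-of-lists return):

import numpy
test_rows = numpy.array(create_test_set(M_c, T, X_L, X_D, n_test=5))
assert test_rows.shape == (5, len(T[0]))  # one synthetic row per sample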
Example #4
def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2, n_rows,
                                                1)

    state = State.p_State(M_c, T)

    T_array = numpy.array(T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []  # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c,
                                          X_L,
                                          X_D,
                                          Y,
                                          Q,
                                          get_next_seed,
                                          n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X - 2. * std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X + 2. * std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries
    Qs = []
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under the sampled pdf is about 1,
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X) - 1):
        area_density += (X[i + 1] - X[i]) * (densities[i + 1] +
                                             densities[i]) / 2.0

    print("Area of PDF (should be close to, but not greater than, 1): " +
          str(area_density))
    print(
        "*Note: The area will be less than one because the range (integral) is truncated."
    )

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X,
                                    100,
                                    density=True,
                                    histtype='stepfilled',
                                    label='samples',
                                    alpha=.5,
                                    color=[.5, .5, .5])
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()
    fd, fig_filename = tempfile.mkstemp(prefix='run_test_continuous_',
                                        suffix='.png',
                                        dir='.')
    pylab.savefig(fig_filename)
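The hand-rolled loop above is the composite trapezoid rule; a one-line equivalent (a sketch, assuming the same sorted `X` and `densities`) is:

# Same composite trapezoid rule in one call (numpy.trapezoid in NumPy >= 2.0).
area_density = numpy.trapz(densities, X)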
Example #5
def run_test_multinomial(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = generate_multinomial_data(get_next_seed(), 2, n_rows, 1)

    state = State.p_State(M_c, T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = []

    # pull n samples
    samples = su.simple_predictive_sample(M_c,
                                          X_L,
                                          X_D,
                                          Y,
                                          Q,
                                          get_next_seed,
                                          n=n)
    X_array = numpy.sort(numpy.array(samples))
    X = numpy.unique(X_array)
    X = X.tolist()

    # build the queries
    Qs = []
    for x in X:
        # Qtmp = (query_row, query_column, x[0])
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(
        su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    print("Sum of densities (should be 1): %f" % (numpy.sum(densities)))

    pylab.clf()

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    mbins = numpy.unique(X_array)

    mbins = numpy.append(mbins, max(mbins) + 1)

    pdf, bins = numpy.histogram(X_array, mbins)

    pdf = pdf / float(numpy.sum(pdf))
    pylab.bar(mbins[0:-1], pdf, label="samples", alpha=.5)
    pylab.scatter(X, densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left', fontsize='x-small')
    pylab.xlabel('value')
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    fd, fig_filename = tempfile.mkstemp(prefix='run_test_multinomial_',
                                        suffix='.png',
                                        dir='.')
    pylab.savefig(fig_filename)
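For the discrete case the densities are evaluated on the unique sampled values, so they should sum to roughly 1. A tolerant check one might add (an assumption: it only holds when every category actually appears among the samples):

# Sanity check; valid only when the samples cover the whole support.
assert numpy.isclose(numpy.sum(densities), 1.0, atol=0.05)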
Example #6
def check_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Draws 100 random points from a standard normal distribution
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initialize a crosscat state with that data
    5. Get one sample after 100 transitions
    6. Draw predictive samples
    7. Calculates the 95 percent support of the continuous distribution or the
        entire support of the discrete distribution
    8. Calculate the true pdf for each point in the support
    9. Calculate the predictive probability given the sample for each point in
        the support
    10. (OPTIONAL) Plot the original data, predictive samples, pdf, and 
        predictive probabilities 
    11. Calculate goodness of fit stats (returns p value)
    """
    N = 250
    
    get_next_seed = lambda : random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]
    
    X = component_model_type.generate_data_from_parameters(data_params, N, gen_seed=get_next_seed())
    
    hyperparameters = component_model_type.draw_hyperparameters(X, gen_seed=get_next_seed())[0]
    
    component_model = component_model_type.from_data(X, hyperparameters)
    
    model_parameters = component_model.sample_parameters_given_hyper()
    
    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state 
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])
    
    state = State.p_State(M_c, T)
    
    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)
    
    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    
    # generate samples
    # kstest doesn't compute the same answer for row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(su.simple_predictive_sample(M_c, X_L, X_D, [], [(N,0)], get_next_seed, n=N)).flatten()
    
    # get support
    discrete_support = component_model_type.generate_discrete_support(model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)
    
    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types: the
    # density normalization doesn't bring the multinomial histogram to 1, so we
    # normalize it by hand.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), density=True)
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges, density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    
    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data')
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, 
            numpy.exp(component_model_type.log_pdf(numpy.array(discrete_support), 
            model_parameters)), 
            c="blue", 
            s=100, 
            label="true pdf", 
            alpha=1)

        # pylab.ylim([0,2])
                
        # plot predictive probability of support points
        pylab.scatter(discrete_support, 
            numpy.exp(probabilities), 
            c="red", 
            s=100, 
            label="predictive probability", 
            alpha=1)
            
        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p
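A hypothetical driver for this test; the component-model extension module name and import path are assumptions based on crosscat's test suite, not confirmed by the snippet:

# Run the single-feature check against a continuous component model.
import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext

p = check_one_feature_sampler(ccmext.p_ContinuousComponentModel, show_plot=False)
print("KS test p-value: %f" % p)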
Example #7
def predictive_columns(M_c, X_L, X_D, columns_list, optional_settings=False, seed=0):
	""" Generates rows of data from the inferred distributions
	Inputs:
		- M_c: crosscat metadata (See documentation)
		- X_L: crosscat metadata (See documentation)
		- X_D: crosscat metadata (See documentation)
		- columns_list: a list of columns to sample
		- optional_settings: list of dicts of optional arguments. Each column
		  in columns_list should have its own list entry which is either None
		  or a dict with possible keys:
			- missing_data: Proportion missing data
	Returns:
		- a num_rows by len(columns_list) numpy array, where num_rows is the
		  original number of rows in the crosscat table.
	"""
	# supported arguments for optional_settings
	supported_arguments = ['missing_data']

	num_rows = len(X_D[0])
	num_cols = len(M_c['column_metadata'])

	if not isinstance(columns_list, list):
		raise TypeError("columns_list should be a list")

	for col in columns_list:
		if not isinstance(col, int):
			raise TypeError("every entry in columns_list shuold be an integer")
		if col < 0 or col >= num_cols:
			raise ValueError("%i is not a valid column. Should be valid entries\
			 are 0-%i" % (col, num_cols))

	if not isinstance(seed, int):
		raise TypeError("seed should be an int")

	if seed < 0:
		raise ValueError("seed should be positive")

	if optional_settings:
		if not isinstance(optional_settings, list):
			raise TypeError("optional_settings should be a list")

		for col_setting in optional_settings:
			if isinstance(col_setting, dict):
				for key, value in col_setting.items():
					if key not in supported_arguments:
						raise KeyError("Invalid key in optional_settings, '%s'" % key)
	else:
		optional_settings = [None]*len(columns_list)

	random.seed(seed)

	X = numpy.zeros((num_rows, len(columns_list)))

	get_next_seed = lambda : random.randrange(2147483647)

	for c in range(len(columns_list)):
		col = columns_list[c]
		for row in range(num_rows):
			X[row,c] = su.simple_predictive_sample(M_c, X_L, X_D, [],
						 [(row,col)], get_next_seed, n=1)[0][0]

		# check if there are optional arguments
		if isinstance(optional_settings[c], dict):
			# missing data argument
			if 'missing_data' in optional_settings[c]:
				proportion = optional_settings[c]['missing_data']
				X = add_missing_data_to_column(X, c, proportion)

	assert X.shape[0] == num_rows
	assert X.shape[1] == len(columns_list)

	return X
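A minimal usage sketch following the docstring; `M_c`, `X_L`, and `X_D` are assumed to come from a fitted crosscat state:

# Sample columns 0 and 2; drop 10% of column 2's values as missing data.
X = predictive_columns(M_c, X_L, X_D, [0, 2],
                       optional_settings=[None, {'missing_data': 0.1}],
                       seed=0)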
Example #8
p_State.transition(which_transitions=['column_partition_assignments', 'row_partition_assignments'])

# quick test just to make sure things output what they're supposed to 
x = 0.0
query_row = len(row[0]) # tests unobserved
# query_row = 3;		# tests observed
Q = [(query_row,0,x)]


Y = []  # no constraints
# Y = [(1,0,.1),(3,0,.1),(22,0,105),(30,0,100)] # generic constraints

p = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)

n = 1000
samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n)

X = [sample[0] for sample in samples]

pylab.figure(facecolor='white')
pdf, bins, patches = pylab.hist(X, 50, density=True, histtype='bar', label='samples', edgecolor='none')
pylab.show()

pdf_max = max(pdf)

Qs = []
for i in range(n):
    Qtmp = (query_row,0,X[i])
    Qs.append(Qtmp)

Ps = su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs)
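The snippet is truncated here; a plausible continuation (an assumption, not the source's code, with numpy and pylab assumed imported) would overlay the probabilities, rescaled to the histogram peak, on the sample histogram:

# Hypothetical continuation: compare scaled probabilities to the histogram.
Ps_scaled = numpy.exp(Ps) * (pdf_max / numpy.max(numpy.exp(Ps)))
pylab.scatter(X, Ps_scaled, c='red', label='scaled probability', edgecolor='none')
pylab.legend()
pylab.show()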
Example #9
X_D = save_dict['X_D']

# FIXME: test constraints
# Y = [su.Bunch(index=2,value=2.3), su.Bunch(index=0,value=-4.)]
Y = None

# test simple_predictive_sample_observed
views_replicating_samples_params = su.determine_replicating_samples_params(
    X_L, X_D)
views_samples = []
for replicating_samples_params in views_replicating_samples_params:
    this_view_samples = []
    for replicating_sample_params in replicating_samples_params:
        this_view_this_sample = su.simple_predictive_sample(
            M_c,
            X_L,
            X_D,
            get_next_seed=get_next_seed,
            **replicating_sample_params)
        this_view_samples.extend(this_view_this_sample)
    views_samples.append(this_view_samples)
for view_idx, view_samples in enumerate(views_samples):
    data_array = numpy.array(view_samples)
    pu.plot_T(data_array)
    pylab.title('simple_predictive_sample observed, view %s on local' %
                view_idx)

# test simple_predictive_sample_observed REMOTE
# hostname = 'ec2-23-22-208-4.compute-1.amazonaws.com'
URI = 'http://' + hostname + ':8007'
method_name = 'simple_predictive_sample'
#
Example #10
random_state = numpy.random.RandomState(inf_seed)
M_c = save_dict['M_c']
X_L = save_dict['X_L']
X_D = save_dict['X_D']

# FIXME: test constraints
# Y = [su.Bunch(index=2,value=2.3), su.Bunch(index=0,value=-4.)]
Y = None

# test simple_predictive_sample_observed
views_replicating_samples_params = su.determine_replicating_samples_params(X_L, X_D)
views_samples = []
for replicating_samples_params in views_replicating_samples_params:
    this_view_samples = []
    for replicating_sample_params in replicating_samples_params:
        this_view_this_sample = su.simple_predictive_sample(
            M_c, X_L, X_D, get_next_seed=get_next_seed, **replicating_sample_params)
        this_view_samples.extend(this_view_this_sample)
    views_samples.append(this_view_samples)
for view_idx, view_samples in enumerate(views_samples):
    data_array = numpy.array(view_samples)
    pu.plot_T(data_array)
    pylab.title('simple_predictive_sample observed, view %s on local' % view_idx)

# test simple_predictive_sample_observed REMOTE
# hostname = 'ec2-23-22-208-4.compute-1.amazonaws.com'
URI = 'http://' + hostname + ':8007'
method_name = 'simple_predictive_sample'
#
views_samples = []
for replicating_samples_params in views_replicating_samples_params:
    this_view_samples = []
Example #11
def predictive_columns(M_c,
                       X_L,
                       X_D,
                       columns_list,
                       optional_settings=False,
                       seed=0):
    """ Generates rows of data from the inferred distributions
	Inputs:
		- M_c: crosscat metadata (See documentation)
		- X_L: crosscat metadata (See documentation)
		- X_D: crosscat metadata (See documentation)
		- columns_list: a list of columns to sample
		- optional_settings: list of dicts of optional arguments. Each column
		  in columns_list should have its own list entry which is either None
		  or a dict with possible keys:
			- missing_data: Proportion missing data
	Returns:
		- a num_rows by len(columns_list) numpy array, where num_rows is the
		  original number of rows in the crosscat table.
	"""
    # supported arguments for optional_settings
    supported_arguments = ['missing_data']

    num_rows = len(X_D[0])
    num_cols = len(M_c['column_metadata'])

    if not isinstance(columns_list, list):
        raise TypeError("columns_list should be a list")

    for col in columns_list:
        if not isinstance(col, int):
            raise TypeError("every entry in columns_list shuold be an integer")
        if col < 0 or col >= num_cols:
            raise ValueError("%i is not a valid column; valid columns are 0 to %i" % (col, num_cols - 1))

    if not isinstance(seed, int):
        raise TypeError("seed should be an int")

    if seed < 0:
        raise ValueError("seed should be positive")

    if optional_settings:
        if not isinstance(optional_settings, list):
            raise TypeError("optional_settings should be a list")

        for col_setting in optional_settings:
            if isinstance(col_setting, dict):
                for key, value in six.iteritems(col_setting):
                    if key not in supported_arguments:
                        raise KeyError(
                            "Invalid key in optional_settings, '%s'" % key)
    else:
        optional_settings = [None] * len(columns_list)

    random.seed(seed)

    X = numpy.zeros((num_rows, len(columns_list)))

    get_next_seed = lambda: random.randrange(2147483647)

    for c in range(len(columns_list)):
        col = columns_list[c]
        for row in range(num_rows):
            X[row, c] = su.simple_predictive_sample(M_c,
                                                    X_L,
                                                    X_D, [], [(row, col)],
                                                    get_next_seed,
                                                    n=1)[0][0]

        # check if there are optional arguments
        if isinstance(optional_settings[c], dict):
            # missing data argument
            if 'missing_data' in optional_settings[c]:
                proportion = optional_settings[c]['missing_data']
                X = add_missing_data_to_column(X, c, proportion)

    assert X.shape[0] == num_rows
    assert X.shape[1] == len(columns_list)

    return X
Example #12
def run_test_continuous(n, observed):
    n_rows = 40
    n_cols = 40

    if observed:
        query_row = 10
    else:
        query_row = n_rows

    query_column = 1

    Q = [(query_row, query_column)]

    # do the test with multinomial data
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(), 2, 2, n_rows, 1)

    state = State.p_State(M_c, T)

    T_array = numpy.array(T)

    X_L = state.get_X_L()
    X_D = state.get_X_D()

    Y = [] # no constraints

    # pull n samples
    samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n)

    X_array = numpy.sort(numpy.array(samples))

    std_X = numpy.std(X_array)
    mean_X = numpy.mean(X_array)

    # filter out extreme values
    X_filter_low = numpy.nonzero(X_array < mean_X-2.*std_X)[0]
    X_filter_high = numpy.nonzero(X_array > mean_X+2.*std_X)[0]
    X_filter = numpy.hstack((X_filter_low, X_filter_high))
    X_array = numpy.delete(X_array, X_filter)

    # sort for area calculation later on
    X_array = numpy.sort(X_array)

    X = X_array.tolist()

    # build the queries
    Qs = []
    for x in X:
        Qtmp = (query_row, query_column, x)
        Qs.append(Qtmp)

    # get pdf values
    densities = numpy.exp(su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

    # test that the area under the sampled pdf is about 1,
    # calculated using the trapezoid rule
    area_density = 0
    for i in range(len(X)-1):
        area_density += (X[i+1]-X[i])*(densities[i+1]+densities[i])/2.0

    print "Area of PDF (should be close to, but not greater than, 1): " + str(area_density)
    print "*Note: The area will be less than one because the range (integral) is truncated."

    pylab.figure(facecolor='white')

    # PLOT: probability vs samples distribution
    # scale all histograms to be valid PDFs (area=1)
    pdf, bins, patches = pylab.hist(X, 100, density=True, histtype='stepfilled', label='samples', alpha=.5, color=[.5,.5,.5])
    pylab.scatter(X,densities, c="red", label="pdf", edgecolor='none')

    pylab.legend(loc='upper left',fontsize='x-small')
    pylab.xlabel('value') 
    pylab.ylabel('frequency/density')
    pylab.title('TEST: PDF (not scaled)')

    pylab.show()

    input("Press Enter when finished...")
Example #13
def run_test_multinomial(n, observed):
	n_rows = 40
	n_cols = 40

	if observed:
		query_row = 10
	else:
		query_row = n_rows

	query_column = 1

	Q = [(query_row, query_column)]

	# do the test with multinomial data
	T, M_r, M_c = generate_multinomial_data(get_next_seed(),2,n_rows,1)
	
	state = State.p_State(M_c, T)

	X_L = state.get_X_L()
	X_D = state.get_X_D()

	Y = []

	# pull n samples
	samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed,n=n)
	X_array = numpy.sort(numpy.array(samples))
	X = numpy.unique(X_array)
	X = X.tolist()

	# build the queries
	Qs = []
	for x in X:
	    # Qtmp = (query_row, query_column, x[0])
	    Qtmp = (query_row, query_column, x)
	    Qs.append(Qtmp)

	# get pdf values
	densities = numpy.exp(su.simple_predictive_probability(M_c, X_L, X_D, Y, Qs))

	print "Sum of densities (should be 1): %f" % (numpy.sum(densities))

	pylab.clf()

	# PLOT: probability vs samples distribution
	# scale all histograms to be valid PDFs (area=1)
	mbins = numpy.unique(X_array)

	mbins = numpy.append(mbins,max(mbins)+1)

	pdf, bins = numpy.histogram(X_array,mbins)

	pdf = pdf/float(numpy.sum(pdf))
	pylab.bar(mbins[0:-1],pdf,label="samples",alpha=.5)
	pylab.scatter(X,densities, c="red", label="pdf", edgecolor='none')

	pylab.legend(loc='upper left',fontsize='x-small')
	pylab.xlabel('value') 
	pylab.ylabel('frequency/density')
	pylab.title('TEST: PDF (not scaled)')

	pylab.show()

	input("Press Enter when finished...")
Example #14
p_State.transition(which_transitions=['column_partition_assignments', 'row_partition_assignments'])

# quick test just to make sure things output what they're supposed to
x = 0.0
query_row = len(row[0])  # tests unobserved
# query_row = 3;		# tests observed
Q = [(query_row, 0, x)]

Y = []  # no constraints
# Y = [(1,0,.1),(3,0,.1),(22,0,105),(30,0,100)] # generic constraints

p = su.simple_predictive_probability(M_c, X_L, X_D, Y, Q)

n = 1000
samples = su.simple_predictive_sample(M_c, X_L, X_D, Y, Q, get_next_seed, n=n)

X = [sample[0] for sample in samples]

pylab.figure(facecolor='white')
pdf, bins, patches = pylab.hist(X,
                                50,
                                density=True,
                                histtype='bar',
                                label='samples',
                                edgecolor='none')
pylab.show()

pdf_max = max(pdf)

Qs = []
def test_one_feature_sampler(component_model_type, show_plot=False):
    """
    Tests the ability of component model of component_model_type to capture the
    distribution of the data.
    1. Draws 100 random points from a standard normal distribution
    2. Initializes a component model with that data (and random hyperparameters)
    3. Draws data from that component model
    4. Initialize a crosscat state with that data
    5. Get one sample after 100 transitions
    6. Draw predictive samples
    7. Calculates the 95 percent support of the continuous distribution or the
        entire support of the discrete distribution
    8. Calculate the true pdf for each point in the support
    9. Calculate the predictive probability given the sample for each point in
        the support
    10. (OPTIONAL) Plot the original data, predictive samples, pdf, and 
        predictive probabilities 
    11. Calculate goodness of fit stats (returns p value)
    """
    N = 250

    get_next_seed = lambda: random.randrange(2147483647)

    data_params = default_data_parameters[component_model_type.model_type]

    X = component_model_type.generate_data_from_parameters(
        data_params, N, gen_seed=get_next_seed())

    hyperparameters = component_model_type.draw_hyperparameters(X)[0]

    component_model = component_model_type.from_data(X, hyperparameters)

    model_parameters = component_model.sample_parameters_given_hyper()

    # generate data from the parameters
    T = component_model_type.generate_data_from_parameters(
        model_parameters, N, gen_seed=get_next_seed())

    # create a crosscat state
    M_c = du.gen_M_c_from_T(T, cctypes=[component_model_type.cctype])

    state = State.p_State(M_c, T)

    # transitions
    n_transitions = 100
    state.transition(n_steps=n_transitions)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer for row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = numpy.array(
        su.simple_predictive_sample(M_c,
                                    X_L,
                                    X_D, [], [(N, 0)],
                                    get_next_seed,
                                    n=N)).flatten()

    # get support
    discrete_support = component_model_type.generate_discrete_support(
        model_parameters)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [], Q)

    T = numpy.array(T)

    # get histogram. Different behavior for discrete and continuous types: the
    # density normalization doesn't bring the multinomial histogram to 1, so we
    # normalize it by hand.
    if is_discrete[component_model_type.model_type]:
        T_hist, edges = numpy.histogram(T, bins=len(discrete_support))
        S_hist, _ = numpy.histogram(predictive_samples, bins=edges)
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(20, len(discrete_support)),
                                        density=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data')
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples')

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(
                          component_model_type.log_pdf(
                              numpy.array(discrete_support),
                              model_parameters)),
                      c="blue",
                      s=100,
                      label="true pdf",
                      alpha=1)

        # pylab.ylim([0,2])

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      s=100,
                      label="predictive probability",
                      alpha=1)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %s w/ params: \n%s\ninference after %i crosscat transitions\n%s test: p = %f" \
            % (N, component_model_type.cctype, str(get_params_string(model_parameters)), n_transitions, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_single.png"
        pylab.savefig(filename)
        pylab.close()

    return p