Example #1
def gen_data(filename, argin, save_csv=True):
    """
        Generates a synthetic table with the given properties. For full
        documentation see sdg.gen_data.
    """
    cctypes = argin['cctypes']
    n_rows = argin['num_rows']
    n_cols = argin['num_cols']
    n_views = argin['num_views']
    n_clusters = argin['num_clusters']
    separation = argin['separation']
    seed = argin['seed']

    if 'distargs' in argin:
        distargs = argin['distargs']
    else:
        distargs = None

    random.seed(seed)

    # need to generate cluster_weights and cols_to_views
    cols_to_views = range(n_views)
    for c in range(n_views, n_cols):
        cols_to_views.append(random.randrange(n_views))

    cluster_weights = []
    for v in range(n_views):
        cluster_weights.append([1./n_clusters]*n_clusters)

    T, _, structure = sdg.gen_data(cctypes, n_rows, cols_to_views, 
                        cluster_weights, separation,
                        seed=seed, distargs=distargs, return_structure=True)

    T = numpy.array(T)

    if save_csv:
        header = [ 'col_'+str(col) for col in range(n_cols) ]

        # write the data to a list of lists
        out = [header]
        for row in range(n_rows):
            row_out = []
            for col in range(n_cols):
                if cctypes[col] == 'continuous':
                    value = T[row][col]
                elif cctypes[col] == 'multinomial':
                    value = int(T[row][col])
                else:
                    raise ValueError("unsupported cctype: %s" % cctypes[col])
                row_out.append(value)
            out.append(row_out)

        list_to_csv(filename, out)

    return T, structure
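
A minimal calling sketch for the wrapper above. The dictionary keys are exactly the ones the function reads; the concrete values, column types, and output filename are illustrative assumptions only.

# Hypothetical example arguments; values are illustrative.
example_args = {
    'cctypes': ['continuous', 'continuous', 'multinomial', 'continuous'],
    'num_rows': 250,
    'num_cols': 4,
    'num_views': 2,
    'num_clusters': 3,
    'separation': [0.9, 0.9],                    # one value per view
    'seed': 0,
    'distargs': [None, None, dict(K=5), None],   # dict(K=...) for multinomial columns
}
T, structure = gen_data('synthetic_example.csv', example_args, save_csv=True)
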
	def test_proper_set_up_all_continuous(self):
		T, M_c = sdg.gen_data(self.cctypes_all_contiuous,
			self.n_rows,
			self.cols_to_views_good,
			self.cluster_weights_good,
			self.separation_good,
			seed=0,
			distargs=None)

		assert(len(T) == self.n_rows)
		assert(len(T[0]) == len(self.cols_to_views_good))
	def test_proper_set_up_mixed(self):
		distargs = [ None, None, dict(K=5), None, dict(K=5)]
		T, M_c = sdg.gen_data(self.cctypes_mixed,
			self.n_rows,
			self.cols_to_views_good,
			self.cluster_weights_good,
			self.separation_good,
			seed=0,
			distargs=distargs)

		assert(len(T) == self.n_rows)
		assert(len(T[0]) == len(self.cols_to_views_good))
	def test_different_seeds_should_produce_different_data(self):
		distargs = [None]*5
		T1, M_c = sdg.gen_data(self.cctypes_all_contiuous,
			self.n_rows,
			self.cols_to_views_good,
			self.cluster_weights_good,
			self.separation_good,
			seed=0,
			distargs=distargs)

		T2, M_c = sdg.gen_data(self.cctypes_all_contiuous,
			self.n_rows,
			self.cols_to_views_good,
			self.cluster_weights_good,
			self.separation_good,
			seed=12345,
			distargs=distargs)

		A1 = numpy.array(T1)
		A2 = numpy.array(T2)
		
		assert not numpy.all(A1==A2)
Example #5
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    max_cols = argin["max_cols"]
    rho = argin["rho"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]
    multimodal = argin["multimodal"]
    separation = argin["separation"]

    all_cols = max_cols + 4 # max_cols plus number of dependent columns

    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)
        numpy.random.seed(seed)

    # build full data file
    # generate column indices and header
    col_names = [ "col_%i" % i for i in range(all_cols)]

    Zv = [0,0,1,1] # our needles
    Zv.extend(range(2,all_cols-2))

    min_clusters = 3
    max_clusters = 10

    T_array = numpy.zeros( (num_rows, all_cols) )

    Sigma = numpy.array( [[1.0,rho],[rho,1.0]])
    mu = numpy.array([0,0])

    if multimodal:
        T = [[0]*all_cols]*num_rows
        Zv = [0,0,1,1] # our needles
        Zv.extend(range(2,all_cols-2))
        random.shuffle(Zv)

        num_views = max(Zv)+1

        sep = separation
        separation = [sep]*2
        separation.extend([sep]*(num_views-2))

        min_clusters = 4
        max_clusters = 5

        cluster_weights = []
        # generate weights. 
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                num_clusters = 1
            cluster_weights.append( [1.0/num_clusters]*num_clusters ) 

        # NOTE: data_mode and multinomial_categories are not part of this
        # experiment's argin; assume all-continuous columns here, matching the
        # non-multimodal branch below.
        cctypes, distargs = eu.get_column_types('continuous', all_cols, 0)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights, separation, distargs=distargs)
        T_array = numpy.array(T)
    else:
        T_array[:, 0:1+1] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        T_array[:, 2:3+1] = numpy.random.multivariate_normal(mu, Sigma, num_rows)
        separation = .5
        for col in range(4, all_cols):
            num_clusters = random.randrange(min_clusters, max_clusters)+1
            for row in range(num_rows):
                k = random.randrange(num_clusters)
                T_array[row, col] = numpy.random.randn()+k*6*separation

        T = T_array.tolist()

    # save file to .csv
    exp_path = 'expdata/hb/'
    eu.make_folder(exp_path)
    filename = exp_path + "haystack_break_exp.csv"
    table = "haystack_break_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)
    # done building data file

    # get column step size (powers of two)
    num_steps = int( math.log(max_cols, 2) )-1
    step_size = [2**t for t in range(2, num_steps+1)]

    assert step_size[-1] <= max_cols

    if step_size[-1] < max_cols:
        step_size.append(max_cols)

    assert step_size[0] == 4 and step_size[-1] == max_cols

    # the needle column names
    needle_a_cols = (col_names[0],col_names[1])
    needle_b_cols = (col_names[2],col_names[3])

    result = dict()
    result['steps'] = []

    for num_distractor_columns in step_size:
        # create subdata
        T_sub = take_T_column_subset(T, range(4+num_distractor_columns) )
        subpath = exp_path+'d_'+str(num_distractor_columns)+'/'
        eu.make_folder(subpath)
        subfilename = subpath + "haystack_break_exp_" + str(num_distractor_columns) + ".csv"
        eu.list_to_csv(subfilename, T_sub)

        col_names_sub = T_sub[0]

        # generate queries
        queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols,
                            col_names_sub, table, num_indep_queries)
        num_queries = len(queries)

        dependence_probs = numpy.zeros( (num_iters+1, num_queries) )

        client = Client()

        client('DROP BTABLE %s;' % table, yes=True)
        client('CREATE BTABLE %s FROM %s;' % (table, subfilename))
        init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
        print init_string 
        client(init_string)
        client('SHOW DIAGNOSTICS FOR %s;' % table)

        # do the analyses
        for i in range(0,num_iters+1):
            if i > 0:
                if ct_kernel == 1:
                    client( 'ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table )
                else:
                    client( 'ANALYZE %s FOR 1 ITERATIONS WAIT;' % table )

            for q in range(num_queries):
                query = queries[q]
                out = client(query, pretty=False, pandas_output=False)
                dependence_probs[i,q] = out[0]['data'][0][1]

        subresult = dict()
        # store the queries in subresult
        subresult['query_col1'] = []
        subresult['query_col2'] = []
        subresult['dependence_probs'] = dependence_probs
        for pair in pairs:
            subresult['query_col1'].append(pair[0])
            subresult['query_col2'].append(pair[1])
        
        # for each query, get whether those columns were actually independent
        independent = [True]*num_queries
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]            
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False

        subresult['cols_independent'] = independent
        subresult['distractor_cols'] = num_distractor_columns
        result['steps'].append(subresult)
    
    result['config'] = argin
    result['data'] = T_array

    return result
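
A hedged configuration sketch for the haystack-break experiment above. The key names are taken from the reads at the top of run_experiment; every value is illustrative, and actually running it also assumes a working bayesdb Client and the eu/sdg helpers imported by the original module.

argin = {
    "num_iters": 50,            # ANALYZE readings recorded per subtable
    "num_chains": 8,            # MODELS initialized per btable
    "num_rows": 250,
    "max_cols": 32,             # maximum number of distractor columns
    "rho": 0.9,                 # correlation of each bivariate-normal needle pair
    "num_indep_queries": 20,
    "independent_clusters": True,
    "ct_kernel": 0,             # 1 selects the MH kernel in ANALYZE
    "multimodal": False,
    "separation": 0.7,
    "seed": 0,
}
result = run_experiment(argin)
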
Example #6
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    with_id = argin["with_id"]
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    separation = argin["separation"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]

    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    # generate column indices and header
    col_names = [ "col_%i" % i for i in range(num_cols)]

    if mixed_types and multinomial_categories > 0:
        data_mode = 'mixed'
    elif multinomial_categories > 0:
        data_mode = 'multinomial'
    else:
        data_mode = 'continuous'

    if needles:
        T = [[0]*num_cols]*num_rows
        Zv = [0,0,1,1] # our needles
        Zv.extend(range(2,num_cols-2))
        # random.shuffle(Zv)

        num_views = max(Zv)+1

        separation = [.95]*2
        separation.extend([0.0]*(num_views-2))

        min_clusters = 4
        max_clusters = 5

        cluster_weights = []
        # generate weights. 
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                if independent_clusters:
                    num_clusters = random.randrange(min_clusters, max_clusters)
                else:
                    num_clusters = 1

            cluster_weights.append( [1.0/num_clusters]*num_clusters ) 

        cctypes, distargs = eu.get_column_types(data_mode, num_cols, multinomial_categories)
        T, _ = sdg.gen_data(cctypes, num_rows, Zv, cluster_weights, separation, distargs=distargs)
    else:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)


    # # prepend the row_id
    # if with_id:
    #     needle_a_cols = (1,2)
    #     needle_b_cols = (3,4)
    #     col_names.insert(0, 'ID')
    #     # TODO: ID type
    #     cctypes.insert(0,'continuous')
    #     # header = "ID,%s" % header
    #     if needles:
    #         Zv.insert(0, num_views)
    #     for row in range(num_rows):
    #         T[row].insert(0, row)
    # else:
    needle_a_cols = (col_names[0],col_names[1])
    needle_b_cols = (col_names[2],col_names[3])

    # save file to .csv
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols,
                        col_names, table, num_indep_queries)
    num_queries = len(queries)

    dependence_probs = numpy.zeros( (num_iters, num_queries) )

    client = Client()

    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    print init_string 
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    # do the analyses
    for i in range(num_iters):
        if ct_kernel == 1:
            client( 'ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table )
        else:
            client( 'ANALYZE %s FOR 1 ITERATIONS WAIT;' % table )

        for q in range(num_queries):
            query = queries[q]
            out = client(query, pretty=False, pandas_output=False)
            dependence_probs[i,q] = out[0]['data'][0][1]

    result = dict()
    # store the queries in result
    result['query_col1'] = []
    result['query_col2'] = []
    result['dependence_probs'] = dependence_probs
    for pair in pairs:
        result['query_col1'].append(pair[0])
        result['query_col2'].append(pair[1])
    
    # for each query, get whether those columns were actually independent
    independent = [True]*num_queries
    if needles:
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]            
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False

    result['cols_independent'] = independent
    result['config'] = argin
    result['config']['data_mode'] = data_mode

    client('SHOW DIAGNOSTICS FOR %s;' % table)

    return result
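
A similar hedged configuration sketch for the needles experiment above (the same keys apply to the identically named function in Example #7). The key names come from the reads at the top of run_experiment; all values are illustrative assumptions.

argin = {
    "num_iters": 100,
    "num_chains": 8,
    "num_rows": 300,
    "num_cols": 16,
    "with_id": False,               # unused while the ID block stays commented out
    "needles": True,                # embed the two dependent column pairs
    "mixed_types": False,
    "multinomial_categories": 0,    # 0 => all-continuous data
    "separation": 0.9,              # overridden to [.95, 0, ...] when needles=True
    "num_indep_queries": 20,
    "independent_clusters": False,
    "ct_kernel": 0,
    "seed": 0,
}
result = run_experiment(argin)
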
Example #7
def run_experiment(argin):
    num_iters = argin["num_iters"]
    num_chains = argin["num_chains"]
    num_rows = argin["num_rows"]
    num_cols = argin["num_cols"]
    with_id = argin["with_id"]
    needles = argin["needles"]
    mixed_types = argin["mixed_types"]
    multinomial_categories = argin["multinomial_categories"]
    separation = argin["separation"]
    num_indep_queries = argin["num_indep_queries"]
    independent_clusters = argin["independent_clusters"]
    ct_kernel = argin["ct_kernel"]

    seed = argin["seed"]

    if seed > 0:
        random.seed(seed)

    # generate column indices and header
    col_names = ["col_%i" % i for i in range(num_cols)]

    if mixed_types and multinomial_categories > 0:
        data_mode = 'mixed'
    elif multinomial_categories > 0:
        data_mode = 'multinomial'
    else:
        data_mode = 'continuous'

    if needles:
        T = [[0] * num_cols] * num_rows
        Zv = [0, 0, 1, 1]  # our needles
        Zv.extend(range(2, num_cols - 2))
        # random.shuffle(Zv)

        num_views = max(Zv) + 1

        separation = [.95] * 2
        separation.extend([0.0] * (num_views - 2))

        min_clusters = 4
        max_clusters = 5

        cluster_weights = []
        # generate weights.
        for v in range(num_views):
            if v < 2:
                num_clusters = random.randrange(min_clusters, max_clusters)
            else:
                if independent_clusters:
                    num_clusters = random.randrange(min_clusters, max_clusters)
                else:
                    num_clusters = 1

            cluster_weights.append([1.0 / num_clusters] * num_clusters)

        cctypes, distargs = eu.get_column_types(data_mode, num_cols,
                                                multinomial_categories)
        T, _ = sdg.gen_data(cctypes,
                            num_rows,
                            Zv,
                            cluster_weights,
                            separation,
                            distargs=distargs)
    else:
        T, cctypes = eu.generate_noise(data_mode, num_rows, num_cols)

    # # prepend the row_id
    # if with_id:
    #     needle_a_cols = (1,2)
    #     needle_b_cols = (3,4)
    #     col_names.insert(0, 'ID')
    #     # TODO: ID type
    #     cctypes.insert(0,'continuous')
    #     # header = "ID,%s" % header
    #     if needles:
    #         Zv.insert(0, num_views)
    #     for row in range(num_rows):
    #         T[row].insert(0, row)
    # else:
    needle_a_cols = (col_names[0], col_names[1])
    needle_b_cols = (col_names[2], col_names[3])

    # save file to .csv
    filename = "needles_exp.csv"
    table = "needles_exp"
    T.insert(0, col_names)
    eu.list_to_csv(filename, T)

    # generate queries
    queries, pairs = generate_dependence_queries(needle_a_cols, needle_b_cols,
                                                 col_names, table,
                                                 num_indep_queries)
    num_queries = len(queries)

    dependence_probs = numpy.zeros((num_iters, num_queries))

    client = Client()

    client('DROP BTABLE %s;' % table, yes=True)
    client('CREATE BTABLE %s FROM %s;' % (table, filename))
    init_string = 'INITIALIZE %i MODELS FOR %s;' % (num_chains, table)
    print init_string
    client(init_string)
    client('SHOW DIAGNOSTICS FOR %s;' % table)

    # do the analyses
    for i in range(num_iters):
        if ct_kernel == 1:
            client('ANALYZE %s FOR 1 ITERATIONS WITH MH KERNEL WAIT;' % table)
        else:
            client('ANALYZE %s FOR 1 ITERATIONS WAIT;' % table)

        for q in range(num_queries):
            query = queries[q]
            out = client(query, pretty=False, pandas_output=False)
            dependence_probs[i, q] = out[0]['data'][0][1]

    result = dict()
    # store the queries in result
    result['query_col1'] = []
    result['query_col2'] = []
    result['dependence_probs'] = dependence_probs
    for pair in pairs:
        result['query_col1'].append(pair[0])
        result['query_col2'].append(pair[1])

    # for each query, get whether those columns were actually independent
    independent = [True] * num_queries
    if needles:
        for i in range(num_queries):
            col_idx_0 = pairs[i][0]
            col_idx_1 = pairs[i][1]
            if Zv[col_idx_0] == Zv[col_idx_1]:
                independent[i] = False

    result['cols_independent'] = independent
    result['config'] = argin
    result['config']['data_mode'] = data_mode

    client('SHOW DIAGNOSTICS FOR %s;' % table)

    return result
def test_predictive_sample_improvement(component_model_type, seed=0, show_plot=True):
	""" Shows the error of predictive sample over iterations.
	"""

	num_transitions = 100
	num_samples = 10	
	num_clusters = 2
	separation = .9	# cluster separation
	N = 150
	
	random.seed(seed)
	get_next_seed = lambda : random.randrange(2147483647)

	# generate a single column of data from the component_model 
	cctype = component_model_type.cctype
	T, M_c, struc = sdg.gen_data([cctype], N, [0], [[.5,.5]], [separation], 
				seed=get_next_seed(), distargs=[distargs[cctype]], 
				return_structure=True)

	T_array = numpy.array(T)

	X = numpy.zeros((N,num_transitions))
	KL = numpy.zeros((num_samples, num_transitions))


	support = qtu.get_mixture_support(cctype, component_model_type, 
					struc['component_params'][0], nbins=1000, support=.995)
	true_log_pdf = qtu.get_mixture_pdf(support, component_model_type, 
					struc['component_params'][0],[.5,.5])

	for s in range(num_samples):
		# generate the state
		state = State.p_State(M_c, T, SEED=get_next_seed())

		for i in range(num_transitions):
			# transition
			state.transition()

			# get partitions and generate a predictive column
			X_L = state.get_X_L()
			X_D = state.get_X_D()

			T_inf = sdg.predictive_columns(M_c, X_L, X_D, [0], 
					seed=get_next_seed())

			if cctype == 'multinomial':
				K = distargs[cctype]['K']
				weights = numpy.zeros(numpy.array(K))
				for params in struc['component_params'][0]:
					weights += numpy.array(params['weights'])*(1.0/num_clusters)
				weights *= float(N)
				inf_hist = qtu.bincount(T_inf, bins=range(K))
				err, _ = stats.power_divergence(inf_hist, weights, lambda_='pearson')
				err = numpy.ones(N)*err
			else:
				err = (T_array-T_inf)**2.0

			KL[s,i] = qtu.KL_divergence(component_model_type, 
						struc['component_params'][0], [.5,.5], M_c, X_L, X_D,
						true_log_pdf=true_log_pdf, support=support)

			for j in range(N):
				X[j,i] += err[j]

	X /= num_samples

	# mean and standard error
	X_mean = numpy.mean(X,axis=0)
	X_err = numpy.std(X,axis=0)/float(num_samples)**.5

	KL_mean = numpy.mean(KL, axis=0)
	KL_err = numpy.std(KL, axis=0)/float(num_samples)**.5

	if show_plot:
		pylab.subplot(1,2,1)
		pylab.errorbar(range(num_transitions), X_mean, yerr=X_err)
		pylab.xlabel('iteration')
		pylab.ylabel('error across each data point')
		pylab.title('error of predictive sample over iterations, N=%i' % N)

		pylab.subplot(1,2,2)
		pylab.errorbar(range(num_transitions), KL_mean, yerr=KL_err)
		pylab.xlabel('iteration')
		pylab.ylabel('KL divergence')
		pylab.title('KL divergence, N=%i' % N)

		pylab.show()

	# error should decrease over time
	return X_mean[0] > X_mean[-1] and KL_mean[0] > KL_mean[-1]
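
A minimal driver sketch for the test above. The import path for the continuous component model extension and the shape of the module-level distargs dict (mapping cctype strings to their distargs, e.g. 'continuous' -> None) are assumptions based on how they are used in this file.

import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext  # assumed path

improved = test_predictive_sample_improvement(
    ccmext.p_ContinuousComponentModel, seed=0, show_plot=False)
assert improved  # both the sample error and the KL divergence should shrink over iterations
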
def test_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None):
    """

    """
    random.seed(seed)

    N = 1000
    separation = .9
    
    get_next_seed = lambda : random.randrange(2147483647)

    cluster_weights = [[1.0/float(num_clusters)]*num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights,
                        [separation], seed=get_next_seed(),
                        distargs=[distargs[cctype]],
                        return_structure=True)

    T = numpy.array(T)
    T_list = T
    
    # create a crosscat state 
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])
    
    state = State.p_State(M_c, T_list)
    
    # transitions
    state.transition(n_steps=200)
    
    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()
    
    # generate samples
    # kstest doesn't compute the same answer for row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(M_c, X_L, X_D, [0],
                            seed=get_next_seed()).flatten(1)
    
    # Get support over all component models
    discrete_support = qtu.get_mixture_support(cctype, component_model_type,
                         structure['component_params'][0], nbins=500)

    # calculate simple predictive probability for each point
    Q = [(N,0,x) for x in discrete_support]

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D, []*len(Q), Q)
    
    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist/float(numpy.sum(T_hist))
        S_hist = S_hist/float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support,dtype=float)
    else:
        T_hist, edges = numpy.histogram(T, bins=min(20,len(discrete_support)), normed=True)
        S_hist, _ =  numpy.histogram(predictive_samples, bins=edges, normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:,0]) # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist*N
        freq_exp = numpy.exp(probabilities)*N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"
    
    if show_plot:
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type, 
                structure['component_params'][0], [1.0/num_clusters]*num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges)-numpy.min(edges))/len(edges)
        pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data', zorder=1)
        pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples', zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support, 
            numpy.exp(lpdf), 
            c="blue", 
            edgecolor="none",
            s=100, 
            label="true pdf", 
            alpha=1,
            zorder=3)
                
        # plot predictive probability of support points
        pylab.scatter(discrete_support, 
            numpy.exp(probabilities), 
            c="red", 
            edgecolor="none",
            s=100, 
            label="predictive probability", 
            alpha=1,
            zorder=4)
            
        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0,ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        pylab.show()

    return p
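
A sketch of how the test above might be driven for both column types it supports. Only the call signature comes from the code above; the import paths and the loop are illustrative assumptions.

import crosscat.tests.component_model_extensions.ContinuousComponentModel as ccmext    # assumed path
import crosscat.tests.component_model_extensions.MultinomialComponentModel as mcmext   # assumed path

for cm_type in [ccmext.p_ContinuousComponentModel, mcmext.p_MultinomialComponentModel]:
    p = test_one_feature_mixture(cm_type, num_clusters=3, show_plot=False, seed=0)
    # a very small p-value would flag a mismatch between the data and the predictive samples
    print "%s: p = %f" % (cm_type.cctype, p)
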
def test_one_feature_mixture(component_model_type,
                             num_clusters=3,
                             show_plot=False,
                             seed=None):
    """

    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype],
                                     N, [0],
                                     cluster_weights, [separation],
                                     seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # pdb.set_trace()
    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype,
        component_model_type,
        structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer for row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten(1)

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    # get histogram. Different behavior for discrete and continuous types. For some reason
    # the normed property isn't normalizing the multinomial histogram to 1.
    # T = T[:,0]
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(50, len(discrete_support)),
                                        normed=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    normed=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data',
                  zorder=1)
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples',
                  zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(lpdf),
                      c="blue",
                      edgecolor="none",
                      s=100,
                      label="true pdf",
                      alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      edgecolor="none",
                      s=100,
                      label="predictive probability",
                      alpha=1,
                      zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixture.png"
        pylab.savefig(filename)
        pylab.close()

    return p
def test_kl_divergence_as_a_function_of_N_and_transitions():

	n_clusters = 3
	n_chains = 8
	do_times = 4

	# N_list = [25, 50, 100, 250, 500, 1000, 2000]
	N_list = [25, 50, 100, 175, 250, 400, 500]

	# max_transitions = 500
	max_transitions = 500
	transition_interval = 50
	t_iterations = max_transitions/transition_interval

	cctype = 'continuous'
	cluster_weights = [1.0/float(n_clusters)]*n_clusters
	separation = .5

	get_next_seed = lambda : random.randrange(2147483647)

	# data grid
	KLD = numpy.zeros((len(N_list), t_iterations+1))

	for _ in range(do_times):
		for n in range(len(N_list)):
			N = N_list[n]
			T, M_c, struc = sdg.gen_data([cctype], N, [0], [cluster_weights], 
							[separation], seed=get_next_seed(), distargs=[None],
							return_structure=True)

			M_r = du.gen_M_r_from_T(T)

			# precompute the support and pdf to speed up calculation of KL divergence
			support = qtu.get_mixture_support(cctype, 
						ccmext.p_ContinuousComponentModel, 
						struc['component_params'][0], nbins=1000, support=.995)
			true_log_pdf = qtu.get_mixture_pdf(support,
						ccmext.p_ContinuousComponentModel, 
						struc['component_params'][0],cluster_weights)

			# initialize a multiprocessing engine
			mstate = mpe.MultiprocessingEngine(cpu_count=8)
			X_L_list, X_D_list = mstate.initialize(M_c, M_r, T, n_chains=n_chains)

			# kl_divergences
			klds = numpy.zeros(len(X_L_list))

			for i in range(len(X_L_list)):
				X_L = X_L_list[i]
				X_D = X_D_list[i]
				KLD[n,0] += qtu.KL_divergence(ccmext.p_ContinuousComponentModel,
						struc['component_params'][0], cluster_weights, M_c, 
						X_L, X_D, n_samples=1000, support=support, 
						true_log_pdf=true_log_pdf)


			# run transition_interval then take a reading. Rinse and repeat.
			for t in range( t_iterations ):
				X_L_list, X_D_list = mstate.analyze(M_c, T, X_L_list, X_D_list,
							n_steps=transition_interval)

				for i in range(len(X_L_list)):
					X_L = X_L_list[i]
					X_D = X_D_list[i]
					KLD[n,t+1] += qtu.KL_divergence(ccmext.p_ContinuousComponentModel,
							struc['component_params'][0], cluster_weights, M_c, 
							X_L, X_D, n_samples=1000, support=support, 
							true_log_pdf=true_log_pdf)


	KLD /= float(n_chains*do_times)

	pylab.subplot(1,3,1)
	pylab.contourf(range(0,max_transitions+1,transition_interval), N_list, KLD)
	pylab.title('KL divergence')
	pylab.ylabel('N')
	pylab.xlabel('# transitions')


	pylab.subplot(1,3,2)
	m_N = numpy.mean(KLD,axis=1)
	e_N = numpy.std(KLD,axis=1)/float(KLD.shape[1])**.5
	pylab.errorbar(N_list,  m_N, yerr=e_N)
	pylab.title('KL divergence by N')
	pylab.xlabel('N')
	pylab.ylabel('KL divergence')

	pylab.subplot(1,3,3)
	m_t = numpy.mean(KLD,axis=0)
	e_t = numpy.std(KLD,axis=0)/float(KLD.shape[0])**.5
	pylab.errorbar(range(0,max_transitions+1,transition_interval), m_t, yerr=e_t)
	pylab.title('KL divergence by transitions')
	pylab.xlabel('# transitions')
	pylab.ylabel('KL divergence')

	pylab.show()

	return KLD
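
A short usage sketch, assuming one simply wants to run the sweep and keep the averaged KL grid around for later inspection (the output filename is hypothetical).

KLD = test_kl_divergence_as_a_function_of_N_and_transitions()
numpy.save('kld_by_N_and_transitions.npy', KLD)   # persist the grid for later plotting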