def MI_test(n,
            burn_in,
            cc_samples,
            which_test,
            n_MI_samples=500,
            correlation=0):
    M_c, X_Ls, X_Ds = do_test(0,
                              0,
                              n,
                              burn_in,
                              cc_samples,
                              "correlated",
                              correlation=correlation,
                              do_plot=False)
    # query column 0 and 1
    MI, Linfoot = iu.mutual_information(M_c,
                                        X_Ls,
                                        X_Ds, [(0, 1)],
                                        n_samples=n_MI_samples)

    MI = numpy.mean(MI)
    Linfoot = numpy.mean(Linfoot)

    if which_test == "correlated":
        test_strn = "Test: correlation (%1.2f), N: %i, burn_in: %i, samples: %i, MI_samples: %i\n\tMI: %f, Linfoot %f" % (
            correlation, n, burn_in, cc_samples, n_MI_samples, MI, Linfoot)
    else:
        test_strn = "Test: %s, N: %i, burn_in: %i, samples: %i, MI_samples: %i\n\tMI: %f, Linfoot %f" % (
            which_test, n, burn_in, cc_samples, n_MI_samples, MI, Linfoot)

    print(test_strn)
    return test_strn
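
The Linfoot values reported above are derived from the MI estimates. A minimal sketch of that relationship, assuming iu.mutual_information measures MI in nats and follows Linfoot's informational coefficient of correlation (an assumption about the library; linfoot_from_mi is a hypothetical helper, not part of it):

import numpy

def linfoot_from_mi(mi_nats):
    # Linfoot's informational coefficient of correlation: maps an MI estimate
    # (in nats) onto [0, 1], where 0 means independence and 1 means a
    # deterministic relationship.
    mi_nats = numpy.asarray(mi_nats, dtype=float)
    return numpy.sqrt(1.0 - numpy.exp(-2.0 * mi_nats))

# For a bivariate Gaussian with correlation rho, the true MI is
# -0.5 * log(1 - rho**2), so the Linfoot coefficient recovers |rho|.
rho = 0.75
true_mi = -0.5 * numpy.log(1.0 - rho ** 2)
print(linfoot_from_mi(true_mi))  # approximately 0.75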
Example #2
def MI_test(n, burn_in, cc_samples, which_test, n_MI_samples=500, correlation=0):
    get_next_seed = lambda: random.randrange(32000)
    M_c, X_Ls, X_Ds = do_test(0, 0, n, burn_in, cc_samples, "correlated", correlation=correlation, do_plot=False)
    # query column 0 and 1
    MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0, 1)], get_next_seed, n_samples=n_MI_samples)

    MI = numpy.mean(MI)
    Linfoot = numpy.mean(Linfoot)

    if which_test == "correlated":
        test_strn = (
            "Test: correlation (%1.2f), N: %i, burn_in: %i, samples: %i, MI_samples: %i\n\tMI: %f, Linfoot %f"
            % (correlation, n, burn_in, cc_samples, n_MI_samples, MI, Linfoot)
        )
    else:
        test_strn = "Test: %s, N: %i, burn_in: %i, samples: %i, MI_samples: %i\n\tMI: %f, Linfoot %f" % (
            which_test,
            n,
            burn_in,
            cc_samples,
            n_MI_samples,
            MI,
            Linfoot,
        )

    print(test_strn)
    return test_strn
Example #3
def run_mi_test_local(data_dict):

    gen_seed = data_dict['SEED']
    crosscat_seed = data_dict['CCSEED']
    num_clusters = data_dict['num_clusters']
    num_cols = data_dict['num_cols']
    num_rows = data_dict['num_rows']
    num_views = data_dict['num_views']
    corr = data_dict['corr']
    burn_in = data_dict['burn_in']
    mean_range = float(num_clusters)*2.0

    # 32 bit signed int
    random.seed(gen_seed)
    get_next_seed = lambda : random.randrange(2147483647)

    # generate the stats
    T, M_c, M_r, X_L, X_D, view_assignment = mitu.generate_correlated_state(
        num_rows, num_cols, num_views, num_clusters, mean_range, corr, seed=gen_seed)

    table_data = dict(T=T,M_c=M_c)

    engine = LE.LocalEngine(crosscat_seed)
    X_L_prime, X_D_prime = engine.analyze(M_c, T, X_L, X_D, n_steps=burn_in) 

    X_L = X_L_prime
    X_D = X_D_prime

    view_assignment = numpy.array(X_L['column_partition']['assignments'])
 
    # for each view, calculate the average MI between all pairs of columns
    n_views = max(view_assignment) + 1
    queries = []
    MI = 0.0
    pairs = 0.0
    for view in range(n_views):
        columns_in_view = numpy.nonzero(view_assignment == view)[0]
        for pair in itertools.combinations(columns_in_view, 2):
            queries.append(pair)
            # only the MI estimate is accumulated; the Linfoot estimate is unused here
            MI_i, Linfoot_i = iu.mutual_information(M_c, [X_L], [X_D], [pair], n_samples=1000)
            MI += MI_i[0][0]
            pairs += 1.0

    
    if pairs > 0.0:
        MI /= pairs

    ret_dict = dict(
        id=data_dict['id'],
        dataset=data_dict['dataset'],
        sample=data_dict['sample'],
        mi=MI,
        )

    return ret_dict
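
run_mi_test_local reads everything it needs from a single data_dict. A hypothetical invocation is sketched below; the key names are taken from the attribute accesses above, but every value (including the 'correlated_demo' label) is illustrative only:

# Illustrative only: the key names are the ones run_mi_test_local reads above.
data_dict = dict(
    id=0,
    dataset='correlated_demo',   # hypothetical label, only echoed back in ret_dict
    sample=0,
    SEED=0,                      # seed for the synthetic-data generator
    CCSEED=0,                    # seed for the CrossCat LocalEngine
    num_clusters=4,
    num_cols=4,
    num_rows=500,
    num_views=2,
    corr=0.9,
    burn_in=200,
)

result = run_mi_test_local(data_dict)
print(result['mi'])  # average MI over all within-view column pairs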
Example #4
    def mutual_information(
            self, M_c, X_L_list, X_D_list, Q, seed, n_samples=1000):
        """Estimate mutual information for each pair of columns on Q given
        the set of samples.

        :param Q: List of tuples where each tuple contains the two column
            indexes to compare
        :type Q: list of two-tuples of ints
        :param n_samples: the number of simple predictive samples to use
        :type n_samples: int

        :returns: list of list -- where each sublist is a set of MIs and
            Linfoots from each crosscat sample.
        """
        get_next_seed = make_get_next_seed(seed)
        return iu.mutual_information(
            M_c, X_L_list, X_D_list, Q, get_next_seed, n_samples)
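
A hypothetical call pattern for the seeded variant above: the engine object and the M_c, X_L, X_D values are assumed to come from a setup like Example #3, the seed and query values are illustrative, and the two-value unpacking mirrors how iu.mutual_information is used elsewhere in these examples:

Q = [(0, 1), (0, 2)]  # column pairs to compare
MI, Linfoot = engine.mutual_information(
    M_c, [X_L], [X_D], Q, seed=0, n_samples=1000)

# Results are indexed as [query][crosscat_sample], matching the MI[0][0]
# usage in the surrounding examples.
print(MI[0][0], Linfoot[0][0])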
Example #5
 def mutual_information(self, M_c, X_L_list, X_D_list, Q, n_samples=1000):
     """
     Return the estimated mutual information for each pair of columns on Q given
     the set of samples.
     
     :param M_c: The column metadata
     :type M_c: dict
     :param X_L_list: list of the latent variables associated with the latent state
     :type X_L_list: list of dict
     :param X_D_list: list of the particular cluster assignments of each row in each view
     :type X_D_list: list of list of lists
     :param Q: List of tuples where each tuple contains the two column indexes to compare
     :type Q: list of two-tuples of ints
     :param n_samples: the number of simple predictive samples to use
     :type n_samples: int
     :returns: list of list, where each sublist is a set of MIs and Linfoots from each crosscat sample.
     """
     return iu.mutual_information(M_c, X_L_list, X_D_list, Q, n_samples)
Example #7
    datas.append(T)

    print "num_samples: %i, width: %f" % (n, w)

    M_c = du.gen_M_c_from_T(T, cctypes)
    X_Ls = []
    X_Ds = []

    for ns in range(n_samples):
        state = State.p_State(M_c, T)
        state.transition(n_steps=burn_in)
        X_Ds.append(state.get_X_D())
        X_Ls.append(state.get_X_L())

    MI, Linfoot = iu.mutual_information(M_c,
                                        X_Ls,
                                        X_Ds, [(0, 1)],
                                        n_samples=5000)

    data_d = numpy.transpose(MI)

    if nr == 0:
        data = data_d
    else:
        data = numpy.hstack((data, data_d))

    mi_ests[nr] = mi_est

    nr += 1

pl.figure(tight_layout=True, figsize=(len(widths) * 4, 4))
i = 0
			pr, p = pearsonr(T[:,0], T[:,1])

			print "num_samples: %i, R: %f, d: %i. Actual R: %f" % (n, r, d+1, pr)

			M_c = du.gen_M_c_from_T(T)
			X_Ls = []
			X_Ds = []

			for _ in range(n_samples):
				state = State.p_State(M_c, T)
				state.transition(n_steps=burn_in)
				X_Ds.append(state.get_X_D())
				X_Ls.append(state.get_X_L())
			
			MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0,1)], n_samples=200)

			if d == 0:
				data_d = numpy.transpose(Linfoot)
			else:
				data_d = numpy.vstack((data_d, numpy.transpose(Linfoot)))

		if nr == 0:
			data = data_d
		else:
			data = numpy.hstack((data, data_d))
		
		nr += 1


	pl.subplot(2,2,subplot)
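
The fragments above are excerpts from larger plotting scripts. Their shared core, fitting several CrossCat states to a data table and then passing all of them to iu.mutual_information at once, can be sketched on its own roughly as follows. The import paths for du, iu, and State are my best guess at the usual CrossCat modules and should be checked against the full scripts; the sizes and seeds are illustrative, and the seedless iu.mutual_information call follows the older signature used in these fragments (newer versions also take a get_next_seed argument, as in Example #2):

import numpy

# assumed module paths for the du / iu / State aliases used above
import crosscat.utils.data_utils as du
import crosscat.utils.inference_utils as iu
import crosscat.cython_code.State as State

rho, num_rows, n_samples, burn_in = 0.9, 250, 4, 200  # illustrative values

# two correlated Gaussian columns
cov = numpy.array([[1.0, rho], [rho, 1.0]])
T = numpy.random.multivariate_normal([0.0, 0.0], cov, num_rows)

M_c = du.gen_M_c_from_T(T, ['continuous'] * 2)

# draw several posterior samples of the latent state
X_Ls, X_Ds = [], []
for _ in range(n_samples):
    state = State.p_State(M_c, T)
    state.transition(n_steps=burn_in)
    X_Ls.append(state.get_X_L())
    X_Ds.append(state.get_X_D())

# estimate MI between columns 0 and 1 from every sampled state
MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0, 1)], n_samples=1000)
print(numpy.mean(MI), numpy.mean(Linfoot))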
Example #9
def run_test(args):

    rho = args.rho
    num_times = args.num_times
    min_num_rows = args.min_num_rows
    max_num_rows = args.max_num_rows
    n_grid = args.n_grid
    filename = args.filename
    discrete = args.discrete

    num_samples = []
    for ns in log_linspace(min_num_rows, max_num_rows, n_grid).tolist():
        num_samples.append(int(ns))

    variances = []

    burn_in = 200

    MIs = numpy.zeros((num_times, len(num_samples)))

    mi_diff = numpy.zeros((len(num_samples), num_times))

    if not discrete:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ['continuous'] * 2
    else:
        T, true_mi, external_mi = gen_correlated_data_discrete(
            num_samples[-1], rho)
        cctypes = ['multinomial'] * 2

    data_subs = []

    n_index = 0
    for n in num_samples:
        T_sub = numpy.copy(T[0:n - 1, :])

        data = []

        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()

            MI, Linfoot = iu.mutual_information(M_c, [X_L], [X_D], [(0, 1)],
                                                n_samples=5000)

            mi_diff[n_index, t] = true_mi - MI[0][0]

            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, MI[0][0]))

            MIs[t, n_index] = MI[0][0]

        n_index += 1

    if discrete:
        dtype_str = "discrete"
    else:
        dtype_str = "continuous"

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"


    # plot data
    # pl.subplot(1,2,1)
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    i = 0
    for T_s in data_subs:
        pl.subplot(1, len(data_subs), i + 1)
        num_rows = num_samples[i]
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0],
                                                        T_s[:, 1],
                                                        bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=.3, s=81)
        pl.title('#r: ' + str(num_rows))

        i += 1

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(datname)
    pl.clf()

    pl.figure(tight_layout=True, figsize=(5, 4))
    # plot convergence
    # pl.subplot(1,2,2)
    # standard deviation
    stderr = numpy.std(MIs, axis=0)  #/(float(num_times)**.5)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=stderr, c='blue')
    pl.plot(num_samples, mean, c="blue", alpha=.8, label='mean MI')
    pl.plot(num_samples, [true_mi] * len(num_samples),
            color='red',
            alpha=.8,
            label='true MI')
    pl.plot(num_samples, [external_mi] * len(num_samples),
            color=(0, .5, .5),
            alpha=.8,
            label='external MI')
    pl.title('convergence')
    pl.xlabel('#rows in X (log)')
    pl.ylabel('CrossCat MI - true MI')

    pl.legend(loc=0, prop={'size': 8})
    pl.gca().set_xscale('log')

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(figname)
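
run_test expects an argparse-style namespace. A hypothetical driver is sketched below; the argument names mirror the attributes read at the top of run_test, while the defaults and the output filename prefix are illustrative:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--rho', type=float, default=0.9)
    parser.add_argument('--num_times', type=int, default=5)
    parser.add_argument('--min_num_rows', type=int, default=100)
    parser.add_argument('--max_num_rows', type=int, default=10000)
    parser.add_argument('--n_grid', type=int, default=5)
    parser.add_argument('--filename', type=str, default='mi_convergence_')
    parser.add_argument('--discrete', action='store_true')
    run_test(parser.parse_args())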
Example #10
def run_test(args):

    rho = args.rho
    num_times = args.num_times
    min_num_rows = args.min_num_rows
    max_num_rows = args.max_num_rows
    n_grid = args.n_grid
    filename = args.filename
    discrete = args.discrete

    num_samples = []
    for ns in log_linspace(min_num_rows, max_num_rows, n_grid).tolist():
        num_samples.append(int(ns))

    variances = []

    burn_in = 200

    MIs = numpy.zeros((num_times, len(num_samples)))

    mi_diff = numpy.zeros((len(num_samples), num_times))

    if not discrete:
        T, true_mi, external_mi = gen_correlated_data(num_samples[-1], rho)
        cctypes = ["continuous"] * 2
    else:
        T, true_mi, external_mi = gen_correlated_data_discrete(num_samples[-1], rho)
        cctypes = ["multinomial"] * 2

    data_subs = []

    n_index = 0
    for n in num_samples:
        T_sub = numpy.copy(T[0 : n - 1, :])

        data = []

        data_subs.append(T_sub)

        print("%i: " % n)
        for t in range(num_times):
            M_c = du.gen_M_c_from_T(T_sub, cctypes)
            state = State.p_State(M_c, T_sub)
            state.transition(n_steps=burn_in)
            X_D = state.get_X_D()
            X_L = state.get_X_L()

            MI, Linfoot = iu.mutual_information(M_c, [X_L], [X_D], [(0, 1)], n_samples=5000)

            mi_diff[n_index, t] = true_mi - MI[0][0]

            print("\t%i TRUE: %e, EST: %e " % (t, true_mi, MI[0][0]))

            MIs[t, n_index] = MI[0][0]

        n_index += 1

    if discrete:
        dtype_str = "discrete"
    else:
        dtype_str = "continuous"

    basefilename = filename + str(int(time.time()))
    figname = basefilename + ".png"
    datname = basefilename + "_DATA.png"


    # plot data
    # pl.subplot(1,2,1)
    pl.figure(tight_layout=True, figsize=(len(data_subs) * 4, 4))
    i = 0
    for T_s in data_subs:
        pl.subplot(1, len(data_subs), i + 1)
        num_rows = num_samples[i]
        if discrete:
            heatmap, xedges, yedges = numpy.histogram2d(T_s[:, 0], T_s[:, 1], bins=10)
            extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
            pl.imshow(heatmap, extent=extent, interpolation="nearest")
        else:
            pl.scatter(T_s[:, 0], T_s[:, 1], alpha=0.3, s=81)
        pl.title("#r: " + str(num_rows))

        i += 1

    pl.suptitle("data for rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(datname)
    pl.clf()

    pl.figure(tight_layout=True, figsize=(5, 4))
    # plot convergence
    # pl.subplot(1,2,2)
    # standard deviation
    stderr = numpy.std(MIs, axis=0)  # /(float(num_times)**.5)
    mean = numpy.mean(MIs, axis=0)
    pl.errorbar(num_samples, mean, yerr=stderr, c="blue")
    pl.plot(num_samples, mean, c="blue", alpha=0.8, label="mean MI")
    pl.plot(num_samples, [true_mi] * len(num_samples), color="red", alpha=0.8, label="true MI")
    pl.plot(num_samples, [external_mi] * len(num_samples), color=(0, 0.5, 0.5), alpha=0.8, label="external MI")
    pl.title("convergence")
    pl.xlabel("#rows in X (log)")
    pl.ylabel("CrossCat MI - true MI")

    pl.legend(loc=0, prop={"size": 8})
    pl.gca().set_xscale("log")

    # save output
    pl.title("convergence rho: %1.2f (%s)" % (rho, dtype_str))

    pl.savefig(figname)
		pr, p = pearsonr(T[:,0], T[:,1])

		print("num_samples: %i, R: %f, d: %i. Actual R: %f" % (n, r, d+1, pr))

		M_c = du.gen_M_c_from_T(T,cctypes)
		X_Ls = []
		X_Ds = []

		for _ in range(n_samples):
			state = State.p_State(M_c, T)
			state.transition(n_steps=burn_in)
			X_Ds.append(state.get_X_D())
			X_Ls.append(state.get_X_L())
		
		MI, Linfoot = iu.mutual_information(M_c, X_Ls, X_Ds, [(0,1)],
                    get_next_seed, n_samples=5000)

		if d == 0:
			data_d = numpy.transpose(Linfoot)
		else:
			data_d = numpy.vstack((data_d, numpy.transpose(Linfoot)))

	if nr == 0:
		data = data_d
	else:
		data = numpy.hstack((data, data_d))
	
	nr += 1


if discrete: