Example #1
def quick_le(seed, n_chains=1):
    rng = random.Random(seed)
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(rng), 2, N_COLS,
                                                N_ROWS, 2)
    engine = LE.LocalEngine(seed=get_next_seed(rng))
    X_L, X_D = engine.initialize(M_c,
                                 M_r,
                                 T,
                                 seed=get_next_seed(rng),
                                 n_chains=n_chains)
    return T, M_r, M_c, X_L, X_D, engine
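
A minimal usage sketch (a hypothetical call; it assumes the module-level constants N_ROWS and N_COLS referenced above are defined):

# with n_chains > 1, initialize returns one latent state per chain
T, M_r, M_c, X_L_list, X_D_list, engine = quick_le(seed=0, n_chains=3)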
Example #2
def initialize_helper(table_data, dict_in):
    M_c = table_data['M_c']
    M_r = table_data['M_r']
    T = table_data['T']
    initialization = dict_in['initialization']
    SEED = dict_in['SEED']
    engine = LE.LocalEngine(SEED)
    M_c_prime, M_r_prime, X_L, X_D = engine.initialize(
        M_c, M_r, T, initialization=initialization)
    #
    ret_dict = dict(X_L=X_L, X_D=X_D)
    return ret_dict
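
For reference, a sketch of the inputs this helper reads; the key names come straight from the function body, and the values here are placeholders:

table_data = dict(T=T, M_r=M_r, M_c=M_c)
dict_in = dict(initialization='from_the_prior', SEED=0)
ret_dict = initialize_helper(table_data, dict_in)
X_L, X_D = ret_dict['X_L'], ret_dict['X_D']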
Example #3
def get_CrossCatClient(client_type, **kwargs):
    """Helper which instantiates the appropriate Engine and returns a Client

    """

    client = None
    if client_type == 'local':
        import crosscat.LocalEngine as LocalEngine
        le = LocalEngine.LocalEngine(**kwargs)
        client = CrossCatClient(le)
    elif client_type == 'multiprocessing':
        import crosscat.MultiprocessingEngine as MultiprocessingEngine
        me = MultiprocessingEngine.MultiprocessingEngine(**kwargs)
        client = CrossCatClient(me)
    else:
        raise Exception('unknown client_type: %s' % client_type)
    return client
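
A usage sketch; the kwargs are assumed to be forwarded to the engine constructors, matching the seed and cpu_count arguments seen in the other examples on this page:

client = get_CrossCatClient('local', seed=0)
# or, to fan work out across processes:
# client = get_CrossCatClient('multiprocessing', seed=0, cpu_count=4)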
Example #4
def run(stdin, stdout, stderr, argv):
    args = parse_args(argv[1:])
    progname = argv[0]
    slash = progname.rfind('/')
    if slash >= 0:
        progname = progname[slash + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname, ))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname, ))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
                                 builtin_metamodels=False)

    if args.jobs != 1:
        import crosscat.MultiprocessingEngine as ccme
        jobs = args.jobs if args.jobs > 0 else None
        crosscat = ccme.MultiprocessingEngine(seed=args.seed, cpu_count=jobs)
    else:
        import crosscat.LocalEngine as ccle
        crosscat = ccle.LocalEngine(seed=args.seed)
    metamodel = CrosscatMetamodel(crosscat)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)
    bdbshell = shell.Shell(bdb, 'crosscat', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.expanduser('~/.bayesliterc')
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)

        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    bdbshell.stdout.write('%s is not a file.  Aborting.\n' %
                                          (str(path), ))
                    break

        if not args.batch:
            bdbshell.cmdloop()
    return 0
Example #5
def analyze_helper(table_data, dict_in):
    M_c = table_data['M_c']
    T = table_data['T']
    X_L = dict_in['X_L']
    X_D = dict_in['X_D']
    kernel_list = dict_in['kernel_list']
    n_steps = dict_in['n_steps']
    c = dict_in['c']
    r = dict_in['r']
    SEED = dict_in['SEED']
    engine = LE.LocalEngine(SEED)
    X_L_prime, X_D_prime = engine.analyze(M_c,
                                          T,
                                          X_L,
                                          X_D,
                                          kernel_list=kernel_list,
                                          n_steps=n_steps,
                                          c=c,
                                          r=r)
    #
    ret_dict = dict(X_L=X_L_prime, X_D=X_D_prime)
    return ret_dict
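
A matching input sketch for analyze_helper. The empty tuples follow the convention visible in the _do_analyze calls elsewhere in these examples, where () appears to mean "all kernels / all columns / all rows" (an assumption, not verified here):

dict_in = dict(X_L=X_L, X_D=X_D,
               kernel_list=(),  # assumed: run every transition kernel
               n_steps=100,
               c=(), r=(),      # assumed: transition all columns and rows
               SEED=0)
ret_dict = analyze_helper(table_data, dict_in)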
Example #6
def get_generative_clustering(M_c, M_r, T, data_inverse_permutation_indices,
                              num_clusters, num_views):
    # NOTE: this function only works because State.p_State doesn't use
    #       column_component_suffstats
    num_rows = len(T)
    num_cols = len(T[0])
    X_D_helper = numpy.repeat(range(num_clusters), num_rows // num_clusters)
    gen_X_D = [
        X_D_helper[numpy.argsort(data_inverse_permutation_index)]
        for data_inverse_permutation_index in data_inverse_permutation_indices
    ]
    gen_X_L_assignments = numpy.repeat(range(num_views),
                                       num_cols // num_views)
    # initialize to generate an X_L to manipulate
    local_engine = LE.LocalEngine()
    bad_X_L, bad_X_D = local_engine.initialize(M_c,
                                               M_r,
                                               T,
                                               initialization='apart')
    bad_X_L['column_partition']['assignments'] = gen_X_L_assignments
    # manually construct state in generative configuration
    state = State.p_State(M_c, T, bad_X_L, gen_X_D)
    gen_X_L = state.get_X_L()
    gen_X_D = state.get_X_D()
    # run inference on hyperparameters to leave them in a reasonable state
    kernel_list = (
        'row_partition_hyperparameters',
        'column_hyperparameters',
        'column_partition_hyperparameter',
    )
    gen_X_L, gen_X_D = local_engine.analyze(M_c,
                                            T,
                                            gen_X_L,
                                            gen_X_D,
                                            n_steps=1,
                                            kernel_list=kernel_list)
    #
    return gen_X_L, gen_X_D
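
To make the assignment construction concrete, a small worked example of the repeat/argsort pattern used above:

import numpy
# 6 rows, 2 clusters -> balanced labels [0, 0, 0, 1, 1, 1]
X_D_helper = numpy.repeat(range(2), 6 // 2)
# argsort of an inverse permutation recovers the labels in original row order
inverse_permutation = numpy.array([3, 4, 5, 0, 1, 2])
print(X_D_helper[numpy.argsort(inverse_permutation)])  # [1 1 1 0 0 0]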
Example #7
def test_two_dependent_one_dependent():
    rng = random.Random(PASS_SEED)
    T, M_r, M_c = du.gen_factorial_data_objects(get_next_seed(rng), 2, 3, 10,
                                                1)
    engine = LE.LocalEngine(seed=get_next_seed(rng))

    # These dependency constraints target the computation of
    # unorm_crp_logps_avg in the case that one variable (1) is independent
    # of another variable (2), which should ensure the CRP probability of
    # the view of (2) is -INFINITY when doing a block proposal of (0,1)
    # into the view of (2). Refer to the comment about computing the CRP
    # probabilities in State::sample_insert_features.
    dep_constraints = [(0, 1, True), (1, 2, False)]

    X_L, X_D = engine.initialize(M_c,
                                 M_r,
                                 T,
                                 seed=get_next_seed(rng),
                                 n_chains=1)

    X_L, X_D = engine.ensure_col_dep_constraints(M_c, M_r, T, X_L, X_D,
                                                 dep_constraints,
                                                 get_next_seed(rng))

    for col1, col2, dep in dep_constraints:
        assert engine.assert_col_dep_constraints(X_L, X_D, col1, col2, dep,
                                                 True)

    X_L, X_D = engine.analyze(M_c,
                              T,
                              X_L,
                              X_D,
                              get_next_seed(rng),
                              n_steps=1000)

    for col1, col2, dep in dep_constraints:
        assert engine.assert_col_dep_constraints(X_L, X_D, col1, col2, dep,
                                                 True)
Example #8
def run_posterior_chain(seed, M_c, T, num_iters,
        probe_columns=(0,),
        ROW_CRP_ALPHA_GRID=(), COLUMN_CRP_ALPHA_GRID=(),
        S_GRID=(), MU_GRID=(),
        N_GRID=default_n_grid,
        CT_KERNEL=0,
        plot_rand_idx=None,
        ):
    plot_rand_idx = arbitrate_plot_rand_idx(plot_rand_idx, num_iters)
    engine = LE.LocalEngine(seed)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = engine.initialize(M_c, M_r, T, 'from_the_prior',
            ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
            COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
            S_GRID=S_GRID,
            MU_GRID=MU_GRID,
            N_GRID=N_GRID,
            )
    diagnostics_funcs = generate_diagnostics_funcs(X_L, probe_columns)
    diagnostics_data = collections.defaultdict(list)
    for idx in range(num_iters):
        M_c, T, X_L, X_D = run_posterior_chain_iter(engine, M_c, T, X_L, X_D, diagnostics_data,
                diagnostics_funcs,
                ROW_CRP_ALPHA_GRID,
                COLUMN_CRP_ALPHA_GRID,
                S_GRID, MU_GRID,
                N_GRID=N_GRID,
                CT_KERNEL=CT_KERNEL,
                )
        if idx == plot_rand_idx:
            # This DOESN'T work with multithreading
            filename = 'T_%s' % idx
            pu.plot_views(numpy.array(T), X_D, X_L, M_c, filename=filename,
                    dir='./', close=True, format=image_format)
    return diagnostics_data
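
A hedged invocation sketch, assuming M_c and T were built with the du helpers shown in the other examples and that the module-level names this function references (generate_diagnostics_funcs, pu, image_format) are in scope:

diagnostics_data = run_posterior_chain(0, M_c, T, num_iters=50)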
Example #9
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)

# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
# X_D_list = to_pickle['X_D_list']

# can we recreate a row given some of its values?
query_cols = [2, 6, 9]
query_names = col_names[query_cols]
Q = determine_Q(M_c, query_names, num_rows)
#
condition_cols = [3, 4, 10]
condition_names = col_names[condition_cols]
samples_list = []
engine = LE.LocalEngine(inf_seed)
for actual_row_idx in [1, 10, 100]:
    actual_row_values = T[actual_row_idx]
    condition_values = [
        actual_row_values[condition_col] for condition_col in condition_cols
    ]
    condition_tuples = zip(condition_names, condition_values)
    Y = determine_unobserved_Y(num_rows, M_c, condition_tuples)
    samples = engine.simple_predictive_sample(M_c, X_L_list, X_D_list, Y, Q,
                                              10)
    samples_list.append(samples)

round_1 = lambda value: round(value, 2)
# impute some values (as if they were missing)
for impute_row in [10, 20, 30, 40, 50, 60, 70, 80]:
    impute_cols = [31, 32, 52, 60, 62]
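    # The loop body is truncated in the source. Below is a hedged sketch of
    # how the imputation might continue via LocalEngine.impute_and_confidence,
    # whose argument order is assumed to be (M_c, X_L, X_D, Y, Q, n); the
    # call is illustrative, not the original author's code.
    for impute_col in impute_cols:
        Q = [(impute_row, impute_col)]
        imputed, confidence = engine.impute_and_confidence(
            M_c, X_L_list, X_D_list, [], Q, 100)
        print(impute_row, impute_col, imputed, confidence)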
Example #10
def do_analyze(chain_tuple_and_seed):
    # Python 3 removed tuple parameter unpacking, so unpack manually.
    (X_L, X_D), seed = chain_tuple_and_seed
    engine = LE.LocalEngine(seed)
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D, n_steps=num_transitions)
    return X_L, X_D
Example #11
def do_initialize(seed):
    engine = LE.LocalEngine(seed)
    X_L, X_D = engine.initialize(M_c, M_r, T)
    return X_L, X_D
Example #12
def do_initialize(seed):
    return LE._do_initialize(M_c, M_r, T, 'from_the_prior', seed)
Example #13
def determine_unobserved_Y(num_rows, M_c, condition_tuples):
    name_to_idx = M_c['name_to_idx']
    row_idx = num_rows + 1
    Y = []
    for col_name, col_value in condition_tuples:
        col_idx = name_to_idx[col_name]
        col_code = du.convert_value_to_code(M_c, col_idx, col_value)
        y = (row_idx, col_idx, col_code)
        Y.append(y)
    return Y

def do_initialize(seed):
    return LE._do_initialize(M_c, M_r, T, 'from_the_prior', seed)

def do_analyze(chain_tuple_and_seed):
    # Python 3 removed tuple parameter unpacking, so unpack manually.
    (X_L, X_D), seed = chain_tuple_and_seed
    return LE._do_analyze(M_c, T, X_L, X_D, (), num_transitions, (), (), -1,
                          -1, seed)

# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array([M_c['idx_to_name'][str(col_idx)] for col_idx in range(num_cols)])

# initialize and transition chains
seeds = range(num_chains)
chain_tuples = map(do_initialize, seeds)
chain_tuples = map(do_analyze, zip(chain_tuples, seeds))

# visualize the column co-occurrence matrix
X_L_list, X_D_list = map(list, zip(*chain_tuples))
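
The plotting call itself is not shown; a minimal numpy sketch of the column co-occurrence matrix the comment refers to (the fraction of chains in which two columns share a view):

import numpy
assignments = numpy.array(
    [X_L['column_partition']['assignments'] for X_L in X_L_list])
# broadcast pairwise equality over the (num_chains, num_cols) array, then
# average over chains to get a (num_cols, num_cols) co-occurrence matrix
cooccurrence = numpy.mean(
    assignments[:, :, None] == assignments[:, None, :], axis=0)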
Example #14
def run_mi_test_local(data_dict):

    gen_seed = data_dict['SEED']
    crosscat_seed = data_dict['CCSEED']
    num_clusters = data_dict['num_clusters']
    num_cols = data_dict['num_cols']
    num_rows = data_dict['num_rows']
    num_views = data_dict['num_views']
    corr = data_dict['corr']
    burn_in = data_dict['burn_in']
    mean_range = float(num_clusters) * 2.0

    random.seed(gen_seed)
    # draw crosscat seeds from the 32-bit signed int range
    get_next_seed = lambda: random.randrange(2147483647)

    # generate the stats
    T, M_c, M_r, X_L, X_D, view_assignment = mitu.generate_correlated_state(
        num_rows,
        num_cols,
        num_views,
        num_clusters,
        mean_range,
        corr,
        seed=gen_seed)

    table_data = dict(T=T, M_c=M_c)

    engine = LE.LocalEngine(crosscat_seed)
    X_L_prime, X_D_prime = engine.analyze(M_c, T, X_L, X_D, n_steps=burn_in)

    X_L = X_L_prime
    X_D = X_D_prime

    view_assignment = numpy.array(X_L['column_partition']['assignments'])

    # for each view, calculate the average MI between all pairs of columns
    n_views = max(view_assignment) + 1
    queries = []
    MI = 0.0
    pairs = 0.0
    for view in range(n_views):
        columns_in_view = numpy.nonzero(view_assignment == view)[0]
        combinations = itertools.combinations(columns_in_view, 2)
        for pair in combinations:
            queries.append(pair)
            MI_i, Linfoot_i = iu.mutual_information(M_c, [X_L], [X_D], [pair],
                                                    n_samples=1000)
            MI += MI_i[0][0]
            pairs += 1.0

    if pairs > 0.0:
        MI /= pairs

    ret_dict = dict(
        id=data_dict['id'],
        dataset=data_dict['dataset'],
        sample=data_dict['sample'],
        mi=MI,
    )

    return ret_dict
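
The expected data_dict, sketched with illustrative values; the key names are taken directly from the function body:

data_dict = dict(
    id=0, dataset='synthetic_mi', sample=0,  # bookkeeping, echoed in ret_dict
    SEED=0, CCSEED=0,                        # data-generation / crosscat seeds
    num_clusters=2, num_cols=4, num_rows=100, num_views=2,
    corr=0.9, burn_in=200)
result = run_mi_test_local(data_dict)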
Example #15
if ipython_parallel_sshserver is not None:
    ## set up parallel
    from IPython.parallel import Client
    c = Client(profile='ssh', sshserver=ipython_parallel_sshserver)
    dview = c[:]
    dview.execute('import sys')
    if sshserver_path_append is not None:
        dview.apply_sync(lambda: sys.path.append(sshserver_path_append))
    #
    with dview.sync_imports(): 
        import crosscat.LocalEngine as LE
    dview.push(dict(
            M_c=M_c,
            M_r=M_r,
            T=T))
    async_result = dview.map_async(
        lambda SEED: LE.do_initialize(M_c, M_r, T, 'from_the_prior', SEED),
        range(8))
    initialized_states = async_result.get()
    #
    # Python 3 removed tuple parameter unpacking in lambdas; index instead.
    async_result = dview.map_async(
        lambda seed_and_state: LE.do_analyze(
            M_c, T, seed_and_state[1][0], seed_and_state[1][1],
            (), 10, (), (), -1, -1, seed_and_state[0]),
        zip(range(len(initialized_states)), initialized_states))
    chain_tuples = async_result.get()
else:
    # initialize the chains    
    q = Queue()
    p_list = []
    for chain_idx in range(num_chains):
        p = Process(target=do_initialize, args=(engine, M_c, M_r, T, q))
        p.start()
        p_list.append(p)
    chain_tuples = [q.get() for idx in range(num_chains)]
    for p in p_list:
        p.join()
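
The do_initialize target handed to Process above takes (engine, M_c, M_r, T, q) and is not defined in this snippet; a hedged sketch consistent with that call:

def do_initialize(engine, M_c, M_r, T, q):
    # hypothetical worker: initialize one chain and hand its latent state
    # back through the shared multiprocessing Queue
    X_L, X_D = engine.initialize(M_c, M_r, T)
    q.put((X_L, X_D))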