def quick_le(seed, n_chains=1):
    rng = random.Random(seed)
    T, M_r, M_c = du.gen_factorial_data_objects(
        get_next_seed(rng), 2, N_COLS, N_ROWS, 2)
    engine = LE.LocalEngine(seed=get_next_seed(rng))
    X_L, X_D = engine.initialize(M_c, M_r, T, seed=get_next_seed(rng),
                                 n_chains=n_chains)
    return T, M_r, M_c, X_L, X_D, engine
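
# Usage sketch (an illustration, assuming the module-level N_COLS/N_ROWS
# constants and the get_next_seed helper that quick_le relies on are defined):
# build a single chain over synthetic factorial data and report the shape.
if __name__ == '__main__':
    T, M_r, M_c, X_L, X_D, engine = quick_le(seed=0)
    print 'T is %d rows x %d cols' % (len(T), len(T[0]))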
def initialize_helper(table_data, dict_in):
    M_c = table_data['M_c']
    M_r = table_data['M_r']
    T = table_data['T']
    initialization = dict_in['initialization']
    SEED = dict_in['SEED']
    engine = LE.LocalEngine(SEED)
    M_c_prime, M_r_prime, X_L, X_D = \
        engine.initialize(M_c, M_r, T, initialization=initialization)

    ret_dict = dict(X_L=X_L, X_D=X_D)
    return ret_dict
def get_CrossCatClient(client_type, **kwargs):
    """Helper which instantiates the appropriate Engine and returns a Client."""
    client = None
    if client_type == 'local':
        import crosscat.LocalEngine as LocalEngine
        le = LocalEngine.LocalEngine(**kwargs)
        client = CrossCatClient(le)
    elif client_type == 'multiprocessing':
        import crosscat.MultiprocessingEngine as MultiprocessingEngine
        me = MultiprocessingEngine.MultiprocessingEngine(**kwargs)
        client = CrossCatClient(me)
    else:
        raise ValueError('unknown client_type: %s' % client_type)
    return client
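
# Usage sketch: kwargs are forwarded to the engine constructor, so the same
# seed keyword used with LE.LocalEngine elsewhere in this document applies.
client = get_CrossCatClient('local', seed=0)
# client = get_CrossCatClient('multiprocessing', seed=0)  # pool-backed variant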
def run(stdin, stdout, stderr, argv):
    args = parse_args(argv[1:])
    progname = argv[0]
    slash = progname.rfind('/')
    if slash != -1:
        progname = progname[slash + 1:]
    if args.bdbpath is None and not args.memory:
        stderr.write('%s: pass filename or -m/--memory\n' % (progname,))
        return 1
    if args.bdbpath == '-':
        stderr.write('%s: missing option?\n' % (progname,))
        return 1
    bdb = bayeslite.bayesdb_open(pathname=args.bdbpath,
                                 builtin_metamodels=False)

    if args.jobs != 1:
        import crosscat.MultiprocessingEngine as ccme
        jobs = args.jobs if args.jobs > 0 else None
        crosscat = ccme.MultiprocessingEngine(seed=args.seed, cpu_count=jobs)
    else:
        import crosscat.LocalEngine as ccle
        crosscat = ccle.LocalEngine(seed=args.seed)
    metamodel = CrosscatMetamodel(crosscat)
    bayeslite.bayesdb_register_metamodel(bdb, metamodel)

    bdbshell = shell.Shell(bdb, 'crosscat', stdin, stdout, stderr)
    with hook.set_current_shell(bdbshell):
        if not args.no_init_file:
            init_file = os.path.join(os.path.expanduser('~/.bayesliterc'))
            if os.path.isfile(init_file):
                bdbshell.dot_read(init_file)

        if args.file is not None:
            for path in args.file:
                if os.path.isfile(path):
                    bdbshell.dot_read(path)
                else:
                    bdbshell.stdout.write('%s is not a file. Aborting.\n'
                                          % (str(path),))
                    break

        if not args.batch:
            bdbshell.cmdloop()
    return 0
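
# Entry-point sketch: wire run() up to the real process streams. parse_args,
# hook, and shell are defined elsewhere in bayeslite; this stub is only
# illustrative.
if __name__ == '__main__':
    import sys
    sys.exit(run(sys.stdin, sys.stdout, sys.stderr, sys.argv))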
def analyze_helper(table_data, dict_in):
    M_c = table_data['M_c']
    T = table_data['T']
    X_L = dict_in['X_L']
    X_D = dict_in['X_D']
    kernel_list = dict_in['kernel_list']
    n_steps = dict_in['n_steps']
    c = dict_in['c']
    r = dict_in['r']
    SEED = dict_in['SEED']
    engine = LE.LocalEngine(SEED)
    X_L_prime, X_D_prime = engine.analyze(
        M_c, T, X_L, X_D, kernel_list=kernel_list, n_steps=n_steps, c=c, r=r)

    # return the updated latent state, not the inputs
    ret_dict = dict(X_L=X_L_prime, X_D=X_D_prime)
    return ret_dict
def get_generative_clustering(M_c, M_r, T,
                              data_inverse_permutation_indices,
                              num_clusters, num_views):
    # NOTE: this function only works because State.p_State doesn't use
    # column_component_suffstats
    num_rows = len(T)
    num_cols = len(T[0])
    X_D_helper = numpy.repeat(range(num_clusters), (num_rows // num_clusters))
    gen_X_D = [
        X_D_helper[numpy.argsort(data_inverse_permutation_index)]
        for data_inverse_permutation_index in data_inverse_permutation_indices
    ]
    gen_X_L_assignments = numpy.repeat(range(num_views),
                                       (num_cols // num_views))
    # initialize to generate an X_L to manipulate
    local_engine = LE.LocalEngine()
    bad_X_L, bad_X_D = local_engine.initialize(M_c, M_r, T,
                                               initialization='apart')
    bad_X_L['column_partition']['assignments'] = gen_X_L_assignments
    # manually construct state in generative configuration
    state = State.p_State(M_c, T, bad_X_L, gen_X_D)
    gen_X_L = state.get_X_L()
    gen_X_D = state.get_X_D()
    # run inference on hyperparameters to leave them in a reasonable state
    kernel_list = (
        'row_partition_hyperparameters',
        'column_hyperparameters',
        'column_partition_hyperparameter',
    )
    gen_X_L, gen_X_D = local_engine.analyze(M_c, T, gen_X_L, gen_X_D,
                                            n_steps=1,
                                            kernel_list=kernel_list)

    return gen_X_L, gen_X_D
def test_two_dependent_one_dependent():
    rng = random.Random(PASS_SEED)
    T, M_r, M_c = du.gen_factorial_data_objects(
        get_next_seed(rng), 2, 3, 10, 1)
    engine = LE.LocalEngine(seed=get_next_seed(rng))
    # These dependency constraints target the computation of
    # unorm_crp_logps_avg in the case that one variable (1) is independent
    # of another variable (2), which should ensure the CRP probability of
    # the view of (2) is -INFINITY when doing a block proposal of (0,1)
    # into the view of (2). Refer to the comment about computing the CRP
    # probabilities in State::sample_insert_features.
    dep_constraints = [(0, 1, True), (1, 2, False)]
    X_L, X_D = engine.initialize(M_c, M_r, T, seed=get_next_seed(rng),
                                 n_chains=1)
    X_L, X_D = engine.ensure_col_dep_constraints(M_c, M_r, T, X_L, X_D,
                                                 dep_constraints,
                                                 get_next_seed(rng))
    for col1, col2, dep in dep_constraints:
        assert engine.assert_col_dep_constraints(X_L, X_D, col1, col2,
                                                 dep, True)
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D, get_next_seed(rng),
                              n_steps=1000)
    for col1, col2, dep in dep_constraints:
        assert engine.assert_col_dep_constraints(X_L, X_D, col1, col2,
                                                 dep, True)
def run_posterior_chain(seed, M_c, T, num_iters,
                        probe_columns=(0,),
                        ROW_CRP_ALPHA_GRID=(),
                        COLUMN_CRP_ALPHA_GRID=(),
                        S_GRID=(),
                        MU_GRID=(),
                        N_GRID=default_n_grid,
                        CT_KERNEL=0,
                        plot_rand_idx=None,
                        ):
    plot_rand_idx = arbitrate_plot_rand_idx(plot_rand_idx, num_iters)
    engine = LE.LocalEngine(seed)
    M_r = du.gen_M_r_from_T(T)
    X_L, X_D = engine.initialize(M_c, M_r, T, 'from_the_prior',
                                 ROW_CRP_ALPHA_GRID=ROW_CRP_ALPHA_GRID,
                                 COLUMN_CRP_ALPHA_GRID=COLUMN_CRP_ALPHA_GRID,
                                 S_GRID=S_GRID,
                                 MU_GRID=MU_GRID,
                                 N_GRID=N_GRID,
                                 )
    diagnostics_funcs = generate_diagnostics_funcs(X_L, probe_columns)
    diagnostics_data = collections.defaultdict(list)
    for idx in range(num_iters):
        M_c, T, X_L, X_D = run_posterior_chain_iter(
            engine, M_c, T, X_L, X_D, diagnostics_data, diagnostics_funcs,
            ROW_CRP_ALPHA_GRID, COLUMN_CRP_ALPHA_GRID, S_GRID, MU_GRID,
            N_GRID=N_GRID,
            CT_KERNEL=CT_KERNEL,
            )
        if idx == plot_rand_idx:
            # This DOESN'T work with multithreading
            filename = 'T_%s' % idx
            pu.plot_views(numpy.array(T), X_D, X_L, M_c, filename=filename,
                          dir='./', close=True, format=image_format)
    return diagnostics_data
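
# Usage sketch (assumptions: gen_factorial_data_objects' signature matches
# its uses elsewhere in this document, and five iterations is enough for a
# quick smoke test):
T, M_r, M_c = du.gen_factorial_data_objects(0, 2, 4, 100, 1)
diagnostics_data = run_posterior_chain(0, M_c, T, num_iters=5)
print sorted(diagnostics_data.keys())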
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)

# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
# X_D_list = to_pickle['X_D_list']

# can we recreate a row given some of its values?
query_cols = [2, 6, 9]
query_names = col_names[query_cols]
Q = determine_Q(M_c, query_names, num_rows)

condition_cols = [3, 4, 10]
condition_names = col_names[condition_cols]
samples_list = []
engine = LE.LocalEngine(inf_seed)
for actual_row_idx in [1, 10, 100]:
    actual_row_values = T[actual_row_idx]
    condition_values = [
        actual_row_values[condition_col]
        for condition_col in condition_cols
    ]
    condition_tuples = zip(condition_names, condition_values)
    Y = determine_unobserved_Y(num_rows, M_c, condition_tuples)
    samples = engine.simple_predictive_sample(M_c, X_L_list, X_D_list,
                                              Y, Q, 10)
    samples_list.append(samples)

round_1 = lambda value: round(value, 2)

# impute some values (as if they were missing)
for impute_row in [10, 20, 30, 40, 50, 60, 70, 80]:
    impute_cols = [31, 32, 52, 60, 62]
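
# Inspection sketch: samples come back as code values, one row per draw and
# one entry per query column. Assuming du.convert_code_to_value is the
# inverse of the convert_value_to_code helper used later in this document,
# the draws can be mapped back to raw values.
for sample in samples_list[0][:3]:
    print [du.convert_code_to_value(M_c, col_idx, code)
           for col_idx, code in zip(query_cols, sample)]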
def do_analyze((chain_tuple, seed)):
    (X_L, X_D) = chain_tuple
    engine = LE.LocalEngine(seed)
    X_L, X_D = engine.analyze(M_c, T, X_L, X_D, n_steps=num_transitions)
    return X_L, X_D
def do_initialize(seed):
    engine = LE.LocalEngine(seed)
    X_L, X_D = engine.initialize(M_c, M_r, T)
    return X_L, X_D
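
# Parallel sketch (assumptions: do_initialize/do_analyze above close over
# module-level M_c, M_r, T, and num_transitions so forked workers can see
# them, and num_chains is defined): fan the chains out over a process pool,
# mirroring the serial map() pattern used later in this document.
from multiprocessing import Pool
pool = Pool()
seeds = range(num_chains)
chain_tuples = pool.map(do_initialize, seeds)
chain_tuples = pool.map(do_analyze, zip(chain_tuples, seeds))
pool.close()
pool.join()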
def do_initialize(seed):
    return LE._do_initialize(M_c, M_r, T, 'from_the_prior', seed)
def determine_unobserved_Y(num_rows, M_c, condition_tuples):
    name_to_idx = M_c['name_to_idx']
    row_idx = num_rows + 1
    Y = []
    for col_name, col_value in condition_tuples:
        col_idx = name_to_idx[col_name]
        col_code = du.convert_value_to_code(M_c, col_idx, col_value)
        y = (row_idx, col_idx, col_code)
        Y.append(y)
    return Y

def do_initialize(seed):
    return LE._do_initialize(M_c, M_r, T, 'from_the_prior', seed)

def do_analyze(((X_L, X_D), seed)):
    return LE._do_analyze(M_c, T, X_L, X_D, (), num_transitions, (), (),
                          -1, -1, seed)

# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array([M_c['idx_to_name'][str(col_idx)]
                         for col_idx in range(num_cols)])

# initialize and transition chains
seeds = range(num_chains)
chain_tuples = map(do_initialize, seeds)
chain_tuples = map(do_analyze, zip(chain_tuples, seeds))

# visualize the column co-occurrence matrix
X_L_list, X_D_list = map(list, zip(*chain_tuples))
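
# Conditioning sketch: constrain two cells of a hypothetical new row to the
# values observed in row 0; the column choices here are arbitrary
# placeholders for whatever read_model_data_from_csv produced.
example_conditions = zip(col_names[[3, 4]], [T[0][3], T[0][4]])
Y = determine_unobserved_Y(num_rows, M_c, example_conditions)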
def run_mi_test_local(data_dict):
    gen_seed = data_dict['SEED']
    crosscat_seed = data_dict['CCSEED']
    num_clusters = data_dict['num_clusters']
    num_cols = data_dict['num_cols']
    num_rows = data_dict['num_rows']
    num_views = data_dict['num_views']
    corr = data_dict['corr']
    burn_in = data_dict['burn_in']

    mean_range = float(num_clusters) * 2.0

    random.seed(gen_seed)
    # draw seeds from the full 32-bit signed int range
    get_next_seed = lambda: random.randrange(2147483647)

    # generate the stats
    T, M_c, M_r, X_L, X_D, view_assignment = mitu.generate_correlated_state(
        num_rows, num_cols, num_views, num_clusters, mean_range, corr,
        seed=gen_seed)

    engine = LE.LocalEngine(crosscat_seed)
    X_L_prime, X_D_prime = engine.analyze(M_c, T, X_L, X_D, n_steps=burn_in)
    X_L = X_L_prime
    X_D = X_D_prime

    view_assignment = numpy.array(X_L['column_partition']['assignments'])

    # for each view, calculate the average MI between all pairs of columns
    n_views = max(view_assignment) + 1
    queries = []
    MI = 0.0
    pairs = 0.0
    for view in range(n_views):
        columns_in_view = numpy.nonzero(view_assignment == view)[0]
        combinations = itertools.combinations(columns_in_view, 2)
        for pair in combinations:
            queries.append(pair)
            MI_i, Linfoot_i = iu.mutual_information(M_c, [X_L], [X_D],
                                                    [pair], n_samples=1000)
            MI += MI_i[0][0]
            pairs += 1.0

    if pairs > 0.0:
        MI /= pairs

    ret_dict = dict(
        id=data_dict['id'],
        dataset=data_dict['dataset'],
        sample=data_dict['sample'],
        mi=MI,
    )
    return ret_dict
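
# Side note (a sketch, not part of the test): iu.mutual_information also
# returns a Linfoot estimate. Linfoot's informational coefficient of
# correlation relates to mutual information I (in nats) by
# L = sqrt(1 - exp(-2 * I)), mapping MI onto [0, 1).
import math

def linfoot_from_mi(mi):
    return math.sqrt(max(0.0, 1.0 - math.exp(-2.0 * mi)))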
if ipython_parallel_sshserver is not None:
    ## set up parallel
    from IPython.parallel import Client
    c = Client(profile='ssh', sshserver=ipython_parallel_sshserver)
    dview = c[:]
    dview.execute('import sys')
    if sshserver_path_append is not None:
        dview.apply_sync(lambda: sys.path.append(sshserver_path_append))

    with dview.sync_imports():
        import crosscat.LocalEngine as LE
    dview.push(dict(M_c=M_c, M_r=M_r, T=T))
    async_result = dview.map_async(
        lambda SEED: LE.do_initialize(M_c, M_r, T, 'from_the_prior', SEED),
        range(8))
    initialized_states = async_result.get()

    async_result = dview.map_async(
        lambda (SEED, state_tuple): LE.do_analyze(M_c, T, state_tuple[0],
                                                  state_tuple[1], (), 10,
                                                  (), (), -1, -1, SEED),
        zip(range(len(initialized_states)), initialized_states))
    chain_tuples = async_result.get()
else:
    # initialize the chains
    q = Queue()
    p_list = []
    for chain_idx in range(num_chains):
        p = Process(target=do_initialize, args=(engine, M_c, M_r, T, q))
        p.start()
        p_list.append(p)
    chain_tuples = [q.get() for idx in range(num_chains)]
    for p in p_list:
        p.join()
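
# Sketch of the queue-based worker the Process branch above assumes (a guess
# at its shape, not the original definition, which lives earlier in the
# original script): initialize one chain and put the latent state on the
# queue for the parent to collect.
def do_initialize(engine, M_c, M_r, T, q):
    X_L, X_D = engine.initialize(M_c, M_r, T)
    q.put((X_L, X_D))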