gen_seed = args.gen_seed
num_transitions = args.num_transitions
N_GRID = args.N_GRID
max_rows = args.max_rows
num_clusters = args.num_clusters
num_views = args.num_views
num_cols = args.num_cols
numChains = args.numChains
block_size = args.block_size

engine = ccc.get_CrossCatClient('hadoop', seed=inf_seed)

if filename is not None:
    # load the data from the table and sub-sample entities down to max_rows
    T, M_r, M_c = du.read_model_data_from_csv(filename, max_rows, gen_seed)
    truth_flag = 0
else:
    # generate synthetic factorial data with known view/cluster structure,
    # keeping the permutation indices so the true partition can be recovered
    T, M_r, M_c, data_inverse_permutation_indices = \
        du.gen_factorial_data_objects(gen_seed, num_clusters,
                                      num_cols, max_rows, num_views,
                                      max_mean=100, max_std=1,
                                      send_data_inverse_permutation_indices=True)
    view_assignment_truth, X_D_truth = ctu.truth_from_permute_indices(
        data_inverse_permutation_indices, max_rows, num_cols,
        num_views, num_clusters)
    truth_flag = 1

num_rows = len(T)
num_cols = len(T[0])
ari_table = []
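# Illustrative sketch (not part of the original script): ari_table presumably
# accumulates one adjusted Rand index per chain, comparing each sampled row
# partition against the known truth when truth_flag is set. sklearn's
# adjusted_rand_score stands in for whatever helper the original uses;
# X_D_list is assumed to hold one X_D per chain, with X_D[0] the row
# assignments of the first view.
from sklearn.metrics import adjusted_rand_score
if truth_flag:
    for X_D in X_D_list:
        ari_table.append(adjusted_rand_score(X_D_truth[0], X_D[0]))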
chunk_dest_dir = args.chunk_dest_dir
max_time = args.max_time
table_filename = args.table_filename
resume_filename = args.resume_filename
pkl_filename = args.pkl_filename
#
command = args.command
assert command in set(gu.get_method_names(HadoopEngine))
#
cctypes_filename = args.cctypes_filename
cctypes = None
if cctypes_filename is not None:
    cctypes = fu.unpickle(cctypes_filename)

hdfs_uri, jobtracker_uri = hu.get_uris(base_uri, hdfs_uri, jobtracker_uri)
T, M_r, M_c = du.read_model_data_from_csv(table_filename, gen_seed=0,
                                          cctypes=cctypes)
he = HadoopEngine(which_engine_binary=which_engine_binary,
                  which_hadoop_binary=which_hadoop_binary,
                  which_hadoop_jar=which_hadoop_jar,
                  hdfs_dir=hdfs_dir, hdfs_uri=hdfs_uri,
                  jobtracker_uri=jobtracker_uri)

X_L_list, X_D_list = None, None
if command == 'initialize':
    hadoop_output = he.initialize(M_c, M_r, T,
                                  initialization='from_the_prior',
                                  n_chains=n_chains)
    if hadoop_output is not None:
        X_L_list, X_D_list = hadoop_output
elif command == 'analyze':
    assert resume_filename is not None
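    # Hedged sketch of how this branch plausibly continues (the original is
    # truncated here): reload the saved chain states and hand them back to
    # the Hadoop engine. The he.analyze keyword names are an assumption
    # patterned on he.initialize above and the analyze call later in this
    # section.
    resume_dict = fu.unpickle(resume_filename)
    X_L_list = resume_dict['X_L_list']
    X_D_list = resume_dict['X_D_list']
    hadoop_output = he.analyze(M_c, T, X_L_list, X_D_list,
                               n_steps=n_steps, max_time=max_time)
    if hadoop_output is not None:
        X_L_list, X_D_list = hadoop_output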
def read_and_pickle_table_data(table_data_filename, pkl_filename):
    T, M_r, M_c = du.read_model_data_from_csv(table_data_filename, gen_seed=0)
    table_data = dict(T=T, M_r=M_r, M_c=M_c)
    fu.pickle(table_data, pkl_filename)
    return table_data
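# Example round trip (hypothetical filenames): parse and cache the table
# once, so later runs can skip the csv parse entirely via fu.unpickle.
table_data = read_and_pickle_table_data('my_table.csv', 'my_table.pkl')
# ... in a subsequent run:
table_data = fu.unpickle('my_table.pkl')
T, M_r, M_c = table_data['T'], table_data['M_r'], table_data['M_c']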
def do_initialize(SEED):
    _do_initialize = crosscat.LocalEngine._do_initialize
    return _do_initialize(M_c, M_r, T, 'from_the_prior', SEED)

def do_analyze(seed_and_state):
    # tuple unpacking in the signature is Python-2-only syntax; unpack inside
    SEED, state_tuple = seed_and_state
    X_L, X_D = state_tuple
    _do_analyze = crosscat.LocalEngine._do_analyze
    return _do_analyze(M_c, T, X_L, X_D, (), num_transitions,
                       (), (), -1, -1, SEED)

# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array(
    [M_c['idx_to_name'][str(col_idx)] for col_idx in range(num_cols)])

## set up parallel
from IPython.parallel import Client
c = Client(ipython_parallel_config)
dview = c[:]
with dview.sync_imports():
    import crosscat
    import crosscat.LocalEngine
    import sys
if path_append is not None:
    dview.apply_sync(lambda: sys.path.append(path_append))
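# Minimal usage sketch (an assumption about how the script proceeds): push
# the globals that do_initialize/do_analyze read, then fan one seed per
# chain out across the engines with map_sync. num_chains is assumed to be
# parsed from args like the other settings.
dview.push(dict(M_c=M_c, M_r=M_r, T=T, num_transitions=num_transitions))
seeds = range(num_chains)
initial_states = dview.map_sync(do_initialize, seeds)
chain_states = dview.map_sync(do_analyze, zip(seeds, initial_states))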
    Q = [(row_idx, col_idx) for col_idx in query_col_indices]
    return Q

def determine_unobserved_Y(num_rows, M_c, condition_tuples):
    name_to_idx = M_c['name_to_idx']
    row_idx = num_rows + 1
    Y = []
    for col_name, col_value in condition_tuples:
        col_idx = name_to_idx[col_name]
        col_code = du.convert_value_to_code(M_c, col_idx, col_value)
        y = (row_idx, col_idx, col_code)
        Y.append(y)
    return Y

# set everything up
T, M_r, M_c = du.read_model_data_from_csv(filename, gen_seed=gen_seed)
num_rows = len(T)
num_cols = len(T[0])
col_names = numpy.array(
    [M_c['idx_to_name'][str(col_idx)] for col_idx in range(num_cols)])

# initialize and transition the chains
engine = LE.LocalEngine(inf_seed)
X_L_list, X_D_list = engine.initialize(M_c, M_r, T, get_next_seed(),
                                       initialization='from_the_prior',
                                       n_chains=num_chains)
X_L_list, X_D_list = engine.analyze(M_c, T, X_L_list, X_D_list,
                                    get_next_seed(),
                                    n_steps=num_transitions)

# save the progress
to_pickle = dict(X_L_list=X_L_list, X_D_list=X_D_list)
fu.pickle(to_pickle, pkl_filename)
# to_pickle = fu.unpickle(pkl_filename)
# X_L_list = to_pickle['X_L_list']
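# Worked example (hypothetical column names and values): condition a brand
# new row on two observed columns and query a third, using only the helpers
# defined above. Q pairs the new row index with each queried column; Y holds
# (row, column, coded value) constraints for the conditioned columns.
condition_tuples = [('age', 30.0), ('height', 180.0)]   # hypothetical columns
Y = determine_unobserved_Y(num_rows, M_c, condition_tuples)
query_col_indices = [M_c['name_to_idx']['weight']]       # hypothetical column
Q = [(num_rows + 1, col_idx) for col_idx in query_col_indices]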