assert T["dimensions"][0] == len(mc["name_to_idx"]) def assert_other(mr, mc, xl, xd, T): # is the number of views in xd equal to their cached counts in xl? assert len(xl["column_partition"]["counts"]) == len(xd) if __name__ == '__main__': import argparse import json parser = argparse.ArgumentParser('A script to validate a json file\'s compliance with the predictive-DB spec') parser.add_argument('filename', type=str) args = parser.parse_args() filename = args.filename # if filename.endswith('.pkl.gz'): parsed_sample = fu.unpickle(filename) parsed_sample['M_r'] = strify_M_r(parsed_sample['M_r']) parsed_sample['M_c'] = strify_M_c(parsed_sample['M_c']) parsed_sample['X_L'] = convert_X_L(parsed_sample['X_L']) parsed_sample['T'] = convert_T(parsed_sample['T']) else: with open(filename) as fh: one_line = "".join(fh.readlines()).translate(None,"\n\t ") parsed_sample = json.loads(one_line) M_c = parsed_sample["M_c"] M_r = parsed_sample["M_r"] X_L = parsed_sample["X_L"] X_D = parsed_sample["X_D"] T = parsed_sample["T"]
# parse some arguments
parser = argparse.ArgumentParser()
parser.add_argument('pkl_name', type=str)
parser.add_argument('--inf_seed', default=0, type=int)
# NOTE: the pickle path is hard-coded into parse_args here rather than
# taken from the command line
args = parser.parse_args(
    ['/usr/local/crosscat/cython_code/iter_90_pickled_state.pkl.gz'])
pkl_name = args.pkl_name
inf_seed = args.inf_seed
random_state = numpy.random.RandomState(inf_seed)


# FIXME: getting weird error on conversion to int: too large from inside pyx
def get_next_seed(max_val=32767):  # sys.maxint):
    return random_state.randint(max_val)


# resume from saved name
save_dict = fu.unpickle(pkl_name)
M_c = save_dict['M_c']
X_L = save_dict['X_L']
X_D = save_dict['X_D']
T = save_dict['T']
num_cols = len(X_L['column_partition']['assignments'])

# impute a single cell and measure the confidence of the imputation
row_idx = 205
col_idx = 13
Q = [(row_idx, col_idx)]
imputed, confidence = su.impute_and_confidence(
    M_c, X_L, X_D, Y=None, Q=Q, n=400, get_next_seed=get_next_seed)

# locate the view and cluster that cell belongs to
T_array = numpy.array(T)
which_view_idx = X_L['column_partition']['assignments'][col_idx]
X_D_i = numpy.array(X_D[which_view_idx])
which_cluster_idx = X_D_i[row_idx]
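# Hypothetical follow-up, not part of the original script: assuming T's rows
# and columns line up with the (row_idx, col_idx) used in Q, report the
# imputed value next to the raw cell and the cluster that row landed in.
observed_value = T_array[row_idx, col_idx]
print 'imputed: %s (confidence: %s)' % (imputed, confidence)
print 'observed: %s, view: %s, cluster: %s' % (
    observed_value, which_view_idx, which_cluster_idx)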
n_steps = args.n_steps
chunk_size = args.chunk_size
chunk_filename_prefix = args.chunk_filename_prefix
chunk_dest_dir = args.chunk_dest_dir
max_time = args.max_time
table_filename = args.table_filename
resume_filename = args.resume_filename
pkl_filename = args.pkl_filename
#
command = args.command
# assert command in set(gu.get_method_names(HadoopEngine))
#
cctypes_filename = args.cctypes_filename
cctypes = None
if cctypes_filename is not None:
    cctypes = fu.unpickle(cctypes_filename)

# resolve the HDFS and jobtracker endpoints, read the table, and set up
# the engine
hdfs_uri, jobtracker_uri = hu.get_uris(base_uri, hdfs_uri, jobtracker_uri)
T, M_r, M_c = du.read_model_data_from_csv(table_filename, gen_seed=0,
                                          cctypes=cctypes)
he = HadoopEngine(which_engine_binary=which_engine_binary,
                  which_hadoop_binary=which_hadoop_binary,
                  which_hadoop_jar=which_hadoop_jar,
                  hdfs_dir=hdfs_dir, hdfs_uri=hdfs_uri,
                  jobtracker_uri=jobtracker_uri)

X_L_list, X_D_list = None, None
if command == 'initialize':
    hadoop_output = he.initialize(M_c, M_r, T,
                                  initialization='from_the_prior',
                                  n_chains=n_chains)
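    # Hypothetical continuation (an assumption, not taken from this fragment):
    # if the initialize job produced output, treat it as the per-chain latent
    # states and unpack it into the lists declared above.
    if hadoop_output is not None:
        X_L_list, X_D_list = hadoop_output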
parser.add_argument('pkl_name', type=str)
parser.add_argument('--inf_seed', default=0, type=int)
parser.add_argument('--hostname', default='127.0.0.1', type=str)
args = parser.parse_args()
pkl_name = args.pkl_name
inf_seed = args.inf_seed
hostname = args.hostname


# FIXME: getting weird error on conversion to int: too large from inside pyx
def get_next_seed(max_val=32767):  # sys.maxint):
    return random_state.randint(max_val)


# resume from saved name
save_dict = fu.unpickle(pkl_name)
random_state = numpy.random.RandomState(inf_seed)
M_c = save_dict['M_c']
X_L = save_dict['X_L']
X_D = save_dict['X_D']

# FIXME: test constraints
# Y = [su.Bunch(index=2,value=2.3), su.Bunch(index=0,value=-4.)]
Y = None

# test simple_predictive_sample_observed
views_replicating_samples_params = su.determine_replicating_samples_params(
    X_L, X_D)
views_samples = []
for replicating_samples_params in views_replicating_samples_params:
    this_view_samples = []
    time_analyze=time_analyze_helper,
    convergence_analyze=convergence_analyze_helper,
    chunk_analyze=chunk_analyze_helper,
    mi_analyze=mi_analyze_helper,
    )


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--table_data_filename', type=str,
                        default=hs.default_table_data_filename)
    parser.add_argument('--command_dict_filename', type=str,
                        default=hs.default_command_dict_filename)
    args = parser.parse_args()
    table_data_filename = args.table_data_filename
    command_dict_filename = args.command_dict_filename

    table_data = fu.unpickle(table_data_filename)
    command_dict = fu.unpickle(command_dict_filename)
    command = command_dict['command']
    method = method_lookup[command]
    #
    from signal import signal, SIGPIPE, SIG_DFL
    signal(SIGPIPE, SIG_DFL)
    for line in sys.stdin:
        key, data_dict = xu.parse_hadoop_line(line)
        ret_dict = method(table_data, data_dict, command_dict)
        xu.write_hadoop_line(sys.stdout, key, ret_dict)
    initialize=initialize_helper,
    analyze=analyze_helper,
    time_analyze=time_analyze_helper,
    )


def process_line(line, table_data):
    key, dict_in = xu.parse_hadoop_line(line)
    if dict_in is None:
        return None, None
    command = dict_in['command']
    method = method_lookup[command]
    ret_dict = method(table_data, dict_in)
    return key, ret_dict


if __name__ == '__main__':
    # read the files
    table_data = fu.unpickle('table_data.pkl.gz')
    with open('hadoop_input') as fh:
        lines = [line for line in fh]

    # broadcast the table data and map each input line through process_line
    sc = SparkContext("local", "Simple job")
    broadcasted_table_data = sc.broadcast(table_data)
    parallelized = sc.parallelize(lines)
    map_result = parallelized.map(
        lambda line: process_line(line, broadcasted_table_data.value)).collect()
    print map_result
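    # A minimal sketch of a possible next step, not in the original script:
    # write the collected (key, ret_dict) pairs back out in the line format
    # the streaming code above consumes, reusing xu.write_hadoop_line.
    # The output filename 'spark_output' is an assumption.
    with open('spark_output', 'w') as out_fh:
        for out_key, out_dict in map_result:
            xu.write_hadoop_line(out_fh, out_key, out_dict)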