def initialize(self, M_c, M_r, T, initialization='from_the_prior', n_chains=1):
    """Sample a latent state from the prior

    :param M_c: The column metadata
    :type M_c: dict
    :param M_r: The row metadata
    :type M_r: dict
    :param T: The data table in mapped representation (all floats, generated
        by data_utils.read_data_objects)
    :type T: list of lists
    :param initialization: the strategy used to initialize the latent state
    :type initialization: string
    :param n_chains: the number of latent states (chains) to initialize
    :type n_chains: int
    :returns: X_L, X_D -- the latent state

    """
    output_path = self.output_path
    input_filename = self.input_filename
    table_data_filename = self.table_data_filename
    initialize_args_dict_filename = self.command_dict_filename
    xu.assert_vpn_is_connected()
    #
    table_data = dict(M_c=M_c, M_r=M_r, T=T)
    initialize_args_dict = dict(command='initialize',
                                initialization=initialization)
    xu.write_initialization_files(input_filename,
                                  table_data, table_data_filename,
                                  initialize_args_dict,
                                  initialize_args_dict_filename,
                                  n_chains)
    os.system('cp %s initialize_input' % input_filename)
    self.send_hadoop_command(n_tasks=n_chains)
    was_successful = self.get_hadoop_results()
    hadoop_output = None
    if was_successful:
        hu.copy_hadoop_output(output_path, 'initialize_output')
        X_L_list, X_D_list = hu.read_hadoop_output(output_path)
        hadoop_output = X_L_list, X_D_list
    return hadoop_output
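# Illustrative usage sketch (not part of the original source): assumes M_c,
# M_r, T were built by data_utils.read_data_objects; the chain count and
# variable names are hypothetical.
#
#   engine = HadoopEngine()
#   hadoop_output = engine.initialize(M_c, M_r, T,
#                                     initialization='from_the_prior',
#                                     n_chains=4)
#   if hadoop_output is not None:
#       X_L_list, X_D_list = hadoop_output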
def analyze(self, M_c, T, X_L, X_D, kernel_list=(), n_steps=1, c=(), r=(),
            max_iterations=-1, max_time=-1, **kwargs):
    """Evolve the latent state by running MCMC transition kernels

    :param M_c: The column metadata
    :type M_c: dict
    :param T: The data table in mapped representation (all floats, generated
        by data_utils.read_data_objects)
    :type T: list of lists
    :param X_L: the latent variables associated with the latent state
    :type X_L: dict
    :param X_D: the particular cluster assignments of each row in each view
    :type X_D: list of lists
    :param kernel_list: names of the MCMC transition kernels to run
    :type kernel_list: list of strings
    :param n_steps: the number of times to run each MCMC transition kernel
    :type n_steps: int
    :param c: the (global) column indices to run MCMC transition kernels on
    :type c: list of ints
    :param r: the (global) row indices to run MCMC transition kernels on
    :type r: list of ints
    :param max_iterations: the maximum number of times to run each MCMC
        transition kernel. Applicable only if max_time != -1.
    :type max_iterations: int
    :param max_time: the maximum amount of time (seconds) to run MCMC
        transition kernels for before stopping to return progress
    :type max_time: float
    :param kwargs: optional arguments to pass to hadoop_line_processor.jar.
        Currently, passing a 'chunk_size' kwarg (along with
        'chunk_filename_prefix' and 'chunk_dest_dir') switches the command
        to 'chunk_analyze'.
    :returns: X_L, X_D -- the evolved latent state

    """
    output_path = self.output_path
    input_filename = self.input_filename
    table_data_filename = self.table_data_filename
    analyze_args_dict_filename = self.command_dict_filename
    xu.assert_vpn_is_connected()
    #
    table_data = dict(M_c=M_c, T=T)
    analyze_args_dict = dict(command='analyze', kernel_list=kernel_list,
                             n_steps=n_steps, c=c, r=r, max_time=max_time)
    # chunk_analyze is a special case of analyze
    if 'chunk_size' in kwargs:
        chunk_size = kwargs['chunk_size']
        chunk_filename_prefix = kwargs['chunk_filename_prefix']
        chunk_dest_dir = kwargs['chunk_dest_dir']
        analyze_args_dict['command'] = 'chunk_analyze'
        analyze_args_dict['chunk_size'] = chunk_size
        analyze_args_dict['chunk_filename_prefix'] = chunk_filename_prefix
        # WARNING: chunk_dest_dir MUST be writeable by hadoop user mapred
        analyze_args_dict['chunk_dest_dir'] = chunk_dest_dir
    if not su.get_is_multistate(X_L, X_D):
        X_L = [X_L]
        X_D = [X_D]
    #
    SEEDS = kwargs.get('SEEDS', None)
    xu.write_analyze_files(input_filename, X_L, X_D,
                           table_data, table_data_filename,
                           analyze_args_dict, analyze_args_dict_filename,
                           SEEDS)
    os.system('cp %s analyze_input' % input_filename)
    n_tasks = len(X_L)
    self.send_hadoop_command(n_tasks)
    was_successful = self.get_hadoop_results()
    hadoop_output = None
    if was_successful:
        hu.copy_hadoop_output(output_path, 'analyze_output')
        X_L_list, X_D_list = hu.read_hadoop_output(output_path)
        hadoop_output = X_L_list, X_D_list
    return hadoop_output
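# Illustrative usage sketch (not part of the original source): continues the
# example above. kernel_list is left at its default; the chunk_* values are
# hypothetical, but the keyword names mirror the ones this method inspects.
#
#   hadoop_output = engine.analyze(M_c, T, X_L_list, X_D_list,
#                                  n_steps=100,
#                                  chunk_size=10,
#                                  chunk_filename_prefix='chunk',
#                                  chunk_dest_dir='/path/writeable/by/mapred')
#   if hadoop_output is not None:
#       X_L_list, X_D_list = hadoop_output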
T, M_c, M_r, X_L, X_D = generate_clean_state(gen_seed, num_clusters,
                                             num_cols, num_rows, num_splits,
                                             max_mean=10, max_std=1)
# write table_data
table_data = dict(M_c=M_c, M_r=M_r, T=T)
fu.pickle(table_data, table_data_filename)
# write hadoop input
n_tasks = write_hadoop_input(input_filename, X_L, X_D, n_steps,
                             SEED=gen_seed)
# actually run
if do_local:
    xu.run_script_local(input_filename, script_filename, output_filename,
                        table_data_filename)
elif do_remote:
    hadoop_engine = HE.HadoopEngine(output_path=output_path,
                                    input_filename=input_filename,
                                    table_data_filename=table_data_filename,
                                    )
    hadoop_engine.send_hadoop_command(n_tasks)
    was_successful = hadoop_engine.get_hadoop_results()
    if was_successful:
        hu.copy_hadoop_output(output_path, output_filename)
    else:
        print('remote hadoop job NOT successful')
else:
    hadoop_engine = HE.HadoopEngine()
    # print what the command would be
    print(HE.create_hadoop_cmd_str(hadoop_engine, n_tasks=n_tasks))
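# Illustrative follow-up sketch (not part of the original source): after a
# successful remote run, the copied output could be read back the same way
# HadoopEngine.analyze does; the variable names here are hypothetical.
#
#   X_L_list, X_D_list = hu.read_hadoop_output(output_path)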
fu.pickle(table_data, table_data_filename)
if do_local:
    xu.run_script_local(input_filename, script_filename, output_filename,
                        table_data_filename)
    print('Local Engine for automated timing runs has not been completely '
          'implemented/tested')
elif do_remote:
    hadoop_engine = HE.HadoopEngine(which_engine_binary=which_engine_binary,
                                    output_path=output_path,
                                    input_filename=input_filename,
                                    table_data_filename=table_data_filename)
    xu.write_support_files(table_data, hadoop_engine.table_data_filename,
                           dict(command='time_analyze'),
                           hadoop_engine.command_dict_filename)
    hadoop_engine.send_hadoop_command(n_tasks=n_tasks)
    was_successful = hadoop_engine.get_hadoop_results()
    if was_successful:
        hu.copy_hadoop_output(hadoop_engine.output_path, output_filename)
        parse_timing.parse_timing_to_csv(output_filename,
                                         outfile=parsed_out_file)
        coeff_list = find_regression_coeff(parsed_out_file, parameter_list)
    else:
        print('remote hadoop job NOT successful')
else:
    # print what the command would be
    hadoop_engine = HE.HadoopEngine(which_engine_binary=which_engine_binary,
                                    output_path=output_path,
                                    input_filename=input_filename,
                                    table_data_filename=table_data_filename)
    cmd_str = hu.create_hadoop_cmd_str(
        hadoop_engine.hdfs_uri, hadoop_engine.hdfs_dir,
        hadoop_engine.jobtracker_uri,
        hadoop_engine.which_engine_binary,
        hadoop_engine.which_hadoop_binary,
        hadoop_engine.which_hadoop_jar,