Example #1
    def initialize(self,
                   M_c,
                   M_r,
                   T,
                   initialization='from_the_prior',
                   n_chains=1):
        """Sample a latent state from prior

        :param M_c: The column metadata
        :type M_c: dict
        :param M_r: The row metadata
        :type M_r: dict
        :param T: The data table in mapped representation (all floats, generated
                  by data_utils.read_data_objects)
        :type T: list of lists
        :param initialization: the strategy used to initialize the latent state
        :type initialization: str
        :param n_chains: the number of independent latent states (chains) to
                         sample
        :type n_chains: int
        :returns: X_L, X_D -- the sampled latent state (one entry per chain)

        """

        output_path = self.output_path
        input_filename = self.input_filename
        table_data_filename = self.table_data_filename
        initialize_args_dict_filename = self.command_dict_filename
        xu.assert_vpn_is_connected()
        #
        table_data = dict(M_c=M_c, M_r=M_r, T=T)
        initialize_args_dict = dict(command='initialize',
                                    initialization=initialization)
        xu.write_initialization_files(input_filename, table_data,
                                      table_data_filename,
                                      initialize_args_dict,
                                      initialize_args_dict_filename, n_chains)
        os.system('cp %s initialize_input' % input_filename)
        self.send_hadoop_command(n_tasks=n_chains)
        was_successful = self.get_hadoop_results()
        hadoop_output = None
        if was_successful:
            hu.copy_hadoop_output(output_path, 'initialize_output')
            X_L_list, X_D_list = hu.read_hadoop_output(output_path)
            hadoop_output = X_L_list, X_D_list
        return hadoop_output
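For orientation, a hedged usage sketch of the method above (not taken from the project's documentation): output_path, input_filename and table_data_filename appear as constructor keywords in the driver examples further down, while passing command_dict_filename to the constructor is an assumption inferred from the attribute the method reads; the file names and n_chains=4 are placeholder values.

    # Sketch only: constructor keyword names are inferred from the attributes
    # the method reads; file names and n_chains are placeholders.
    engine = HadoopEngine(
        output_path='myOutputDir',
        input_filename='hadoop_input',
        table_data_filename='table_data.pkl.gz',
        command_dict_filename='command_dict.pkl.gz')
    # M_c, M_r, T as produced by data_utils.read_data_objects (see docstring)
    result = engine.initialize(M_c, M_r, T, n_chains=4)
    if result is not None:
        X_L_list, X_D_list = result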
Example #2
    def initialize(self, M_c, M_r, T, initialization='from_the_prior',
                   n_chains=1):
        """Sample a latent state from prior

        :param M_c: The column metadata
        :type M_c: dict
        :param M_r: The row metadata
        :type M_r: dict
        :param T: The data table in mapped representation (all floats, generated
                  by data_utils.read_data_objects)
        :type T: list of lists
        :param initialization: the strategy used to initialize the latent state
        :type initialization: str
        :param n_chains: the number of independent latent states (chains) to
                         sample
        :type n_chains: int
        :returns: X_L, X_D -- the sampled latent state (one entry per chain)

        """

        output_path = self.output_path
        input_filename = self.input_filename
        table_data_filename = self.table_data_filename
        initialize_args_dict_filename = self.command_dict_filename
        xu.assert_vpn_is_connected()
        #
        table_data = dict(M_c=M_c, M_r=M_r, T=T)
        initialize_args_dict = dict(command='initialize',
                                    initialization=initialization)
        xu.write_initialization_files(input_filename,
                                      table_data, table_data_filename,
                                      initialize_args_dict,
                                      initialize_args_dict_filename,
                                      n_chains)
        os.system('cp %s initialize_input' % input_filename)
        self.send_hadoop_command(n_tasks=n_chains)
        was_successful = self.get_hadoop_results()
        hadoop_output = None
        if was_successful:
            hu.copy_hadoop_output(output_path, 'initialize_output')
            X_L_list, X_D_list = hu.read_hadoop_output(output_path)
            hadoop_output = X_L_list, X_D_list
        return hadoop_output
Example #3
    def analyze(self, M_c, T, X_L, X_D, kernel_list=(), n_steps=1, c=(), r=(),
                max_iterations=-1, max_time=-1, **kwargs):
        """Evolve the latent state by running MCMC transition kernels

        :param M_c: The column metadata
        :type M_c: dict
        :param T: The data table in mapped representation (all floats, generated
                  by data_utils.read_data_objects)
        :type T: list of lists
        :param X_L: the latent variables associated with the latent state
        :type X_L: dict
        :param X_D: the particular cluster assignments of each row in each view
        :type X_D: list of lists
        :param kernel_list: names of the MCMC transition kernels to run
        :type kernel_list: list of strings
        :param n_steps: the number of times to run each MCMC transition kernel
        :type n_steps: int
        :param c: the (global) column indices to run MCMC transition kernels on
        :type c: list of ints
        :param r: the (global) row indices to run MCMC transition kernels on
        :type r: list of ints
        :param max_iterations: the maximum number of times to run each MCMC
                               transition kernel. Applicable only if
                               max_time != -1.
        :type max_iterations: int
        :param max_time: the maximum amount of time (seconds) to run MCMC
                         transition kernels for before stopping to return
                         progress
        :type max_time: float
        :param kwargs: optional arguments to pass to hadoop_line_processor.jar.
                       Currently, a 'chunk_size' kwarg (together with
                       'chunk_filename_prefix' and 'chunk_dest_dir') switches
                       the command to 'chunk_analyze', and a 'SEEDS' kwarg
                       supplies explicit random seeds.
        :returns: X_L, X_D -- the evolved latent state
        
        """

        output_path = self.output_path
        input_filename = self.input_filename
        table_data_filename = self.table_data_filename
        analyze_args_dict_filename = self.command_dict_filename
        xu.assert_vpn_is_connected()
        #
        table_data = dict(M_c=M_c, T=T)
        analyze_args_dict = dict(command='analyze', kernel_list=kernel_list,
                                 n_steps=n_steps, c=c, r=r, max_time=max_time)
        # chunk_analyze is a special case of analyze
        if 'chunk_size' in kwargs:
            chunk_size = kwargs['chunk_size']
            chunk_filename_prefix = kwargs['chunk_filename_prefix']
            chunk_dest_dir = kwargs['chunk_dest_dir']
            analyze_args_dict['command'] = 'chunk_analyze'
            analyze_args_dict['chunk_size'] = chunk_size
            analyze_args_dict['chunk_filename_prefix'] = chunk_filename_prefix
            # WARNING: chunk_dest_dir MUST be writeable by hadoop user mapred
            analyze_args_dict['chunk_dest_dir'] = chunk_dest_dir
        if not su.get_is_multistate(X_L, X_D):
            X_L = [X_L]
            X_D = [X_D]
        #
        SEEDS = kwargs.get('SEEDS', None)
        xu.write_analyze_files(input_filename, X_L, X_D,
                               table_data, table_data_filename,
                               analyze_args_dict, analyze_args_dict_filename,
                               SEEDS)
        os.system('cp %s analyze_input' % input_filename)
        n_tasks = len(X_L)
        self.send_hadoop_command(n_tasks)
        was_successful = self.get_hadoop_results()
        hadoop_output = None
        if was_successful:
            hu.copy_hadoop_output(output_path, 'analyze_output')
            X_L_list, X_D_list = hu.read_hadoop_output(output_path)
            hadoop_output = X_L_list, X_D_list
        return hadoop_output
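In the same spirit, a hedged sketch of driving analyze on the latent states returned by initialize. It reuses the hypothetical engine object from the sketch after Example #1; n_steps, chunk_size and the chunk destination are placeholder values, and the chunked call simply supplies the three kwargs the method inspects above.

    # Sketch only: evolve the latent states returned by initialize().
    result = engine.analyze(M_c, T, X_L_list, X_D_list, n_steps=100)
    if result is not None:
        X_L_list, X_D_list = result

    # Chunked variant: 'chunk_size' plus its two companion kwargs select the
    # 'chunk_analyze' command; chunk_dest_dir must be writeable by the hadoop
    # user 'mapred', per the warning above.  All values are placeholders.
    result = engine.analyze(M_c, T, X_L_list, X_D_list, n_steps=100,
                            chunk_size=10, chunk_filename_prefix='chunk',
                            chunk_dest_dir='/user/mapred/chunks')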
Example #4
    T, M_c, M_r, X_L, X_D = generate_clean_state(gen_seed,
                                                 num_clusters,
                                                 num_cols, num_rows,
                                                 num_splits,
                                                 max_mean=10, max_std=1)

    # write table_data
    table_data = dict(M_c=M_c, M_r=M_r, T=T)
    fu.pickle(table_data, table_data_filename)
    # write hadoop input
    n_tasks = write_hadoop_input(input_filename, X_L, X_D, n_steps, SEED=gen_seed)

    # actually run
    if do_local:
        xu.run_script_local(input_filename, script_filename, output_filename, table_data_filename)
    elif do_remote:
        hadoop_engine = HE.HadoopEngine(output_path=output_path,
                                        input_filename=input_filename,
                                        table_data_filename=table_data_filename,
                                        )
        hadoop_engine.send_hadoop_command(n_tasks)
        was_successful = hadoop_engine.get_hadoop_results()
        if was_successful:
            hu.copy_hadoop_output(output_path, output_filename)
        else:
            print('remote hadoop job NOT successful')
    else:
        hadoop_engine = HE.HadoopEngine()
        # print what the command would be
        print(HE.create_hadoop_cmd_str(hadoop_engine, n_tasks=n_tasks))
Example #5
    fu.pickle(table_data, table_data_filename)

    if do_local:
        xu.run_script_local(input_filename, script_filename, output_filename, table_data_filename)
        print('Local Engine for automated timing runs has not been completely implemented/tested')
    elif do_remote:
        hadoop_engine = HE.HadoopEngine(which_engine_binary=which_engine_binary,
                output_path=output_path,
                input_filename=input_filename,
                table_data_filename=table_data_filename)
        xu.write_support_files(table_data, hadoop_engine.table_data_filename,
                               dict(command='time_analyze'),
                               hadoop_engine.command_dict_filename)
        hadoop_engine.send_hadoop_command(n_tasks=n_tasks)
        was_successful = hadoop_engine.get_hadoop_results()
        if was_successful:
            hu.copy_hadoop_output(hadoop_engine.output_path, output_filename)
            parse_timing.parse_timing_to_csv(output_filename, outfile=parsed_out_file)
            coeff_list = find_regression_coeff(parsed_out_file, parameter_list)

        else:
            print('remote hadoop job NOT successful')
    else:
        # print what the command would be
        hadoop_engine = HE.HadoopEngine(which_engine_binary=which_engine_binary,
                output_path=output_path,
                input_filename=input_filename,
                table_data_filename=table_data_filename)
        cmd_str = hu.create_hadoop_cmd_str(
                hadoop_engine.hdfs_uri, hadoop_engine.hdfs_dir, hadoop_engine.jobtracker_uri,
                hadoop_engine.which_engine_binary, hadoop_engine.which_hadoop_binary,
                hadoop_engine.which_hadoop_jar,
Example #6
    if do_local:
        xu.run_script_local(input_filename, script_filename, output_filename,
                            table_data_filename)
        print(
            'Local Engine for automated timing runs has not been completely implemented/tested'
        )
    elif do_remote:
        hadoop_engine = HE.HadoopEngine(
            which_engine_binary=which_engine_binary,
            output_path=output_path,
            input_filename=input_filename,
            table_data_filename=table_data_filename)
        xu.write_support_files(table_data, hadoop_engine.table_data_filename,
                               dict(command='time_analyze'),
                               hadoop_engine.command_dict_filename)
        hadoop_engine.send_hadoop_command(n_tasks=n_tasks)
        was_successful = hadoop_engine.get_hadoop_results()
        if was_successful:
            hu.copy_hadoop_output(hadoop_engine.output_path, output_filename)
            parse_timing.parse_timing_to_csv(output_filename,
                                             outfile=parsed_out_file)
            coeff_list = find_regression_coeff(parsed_out_file, parameter_list)

        else:
            print('remote hadoop job NOT successful')
    else:
        # print what the command would be
        hadoop_engine = HE.HadoopEngine(
            which_engine_binary=which_engine_binary,
            output_path=output_path,
            input_filename=input_filename,
            table_data_filename=table_data_filename)
        cmd_str = hu.create_hadoop_cmd_str(
            hadoop_engine.hdfs_uri, hadoop_engine.hdfs_dir,
Example #7
    # write table_data
    table_data = dict(M_c=M_c, M_r=M_r, T=T)
    fu.pickle(table_data, table_data_filename)
    # write hadoop input
    n_tasks = write_hadoop_input(input_filename,
                                 X_L,
                                 X_D,
                                 n_steps,
                                 SEED=gen_seed)

    # actually run
    if do_local:
        xu.run_script_local(input_filename, script_filename, output_filename,
                            table_data_filename)
    elif do_remote:
        hadoop_engine = HE.HadoopEngine(
            output_path=output_path,
            input_filename=input_filename,
            table_data_filename=table_data_filename,
        )
        hadoop_engine.send_hadoop_command(n_tasks)
        was_successful = hadoop_engine.get_hadoop_results()
        if was_successful:
            hu.copy_hadoop_output(output_path, output_filename)
        else:
            print('remote hadoop job NOT successful')
    else:
        hadoop_engine = HE.HadoopEngine()
        # print what the command would be
        print(HE.create_hadoop_cmd_str(hadoop_engine, n_tasks=n_tasks))
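As a closing note, a hedged sketch of what a caller might do after the remote branch above succeeds: read the latent states back with hu.read_hadoop_output, mirroring what the engine methods in Examples #1 through #3 do internally.

    # Sketch only: after a successful remote run, recover the latent states,
    # mirroring the read done inside the engine methods in Examples #1-#3.
    X_L_list, X_D_list = hu.read_hadoop_output(output_path)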