def _deserialize_output(self, task):
    """
    Deserialize the output from a task.

    Parameters
    ----------
    task
        Task definition of interest.

    Returns
    -------
    The output of the run-time task associated with the task definition.
    """
    filepath = self._task_output_paths[task]
    non_hdfs_file_path = filepath

    # The unpickler has no support for passing in an additional HADOOP_CONF_DIR,
    # so we download the HDFS folder first before calling the unpickler.
    if _file_util.is_hdfs_path(filepath):
        non_hdfs_file_path = _make_temp_directory("job_output_")
        _file_util.download_from_hdfs(
            filepath, non_hdfs_file_path,
            hadoop_conf_dir=self.environment.hadoop_conf_dir,
            is_dir=True)

    unpickler = gl_pickle.GLUnpickler(non_hdfs_file_path)

    # We cannot delete this temporary path because SFrame lazily loads
    # its content from disk. The temporary folder will be removed
    # eventually when the Python session goes away.
    return unpickler.load()
def prepare_job_files(environment, job):
    '''
    Prepare all files needed to run a job in an EC2 cluster
    '''
    # Update the job working directory
    s3_job_path_root = Ec2ExecutionEnvironment.create_job_home_dir(
        environment, job.name)
    _job.Ec2Job._update_exec_dir(job, s3_job_path_root)
    __LOGGER__.info('Job working directory: %s' % s3_job_path_root)

    # Prepare all files locally and then upload them to S3
    temp_local_folder = _make_temp_directory(prefix='ec2_job_')
    try:
        ExecutionEnvironment.prepare_job_exec_dir(job, temp_local_folder)
        _file_util.upload_to_s3(
            temp_local_folder, s3_job_path_root, is_dir=True,
            aws_credentials=environment.ec2_config.get_credentials(),
            silent=True)
    finally:
        _shutil.rmtree(temp_local_folder)
def _get_dependent_file_path(cls):
    '''
    Put all dependent files under a fixed path so that newer files can
    overwrite older ones, which guarantees that the modules can be
    reloaded correctly.
    '''
    if PredictiveObject._depedent_file_path is None:
        PredictiveObject._depedent_file_path = _make_temp_directory(
            prefix='predictive_object_files')
        sys.path.insert(1, PredictiveObject._depedent_file_path)

    return PredictiveObject._depedent_file_path
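# A minimal Python 3 sketch (the names below are illustrative, not part of the
# original code) of why a single fixed dependent-file path is used: the
# directory sits on sys.path, so a module copied there can be imported by
# name, and re-importing after a newer copy overwrites the old one picks up
# the change.
import importlib

def _reload_dependent_module(module_name):
    # Import the module from the dependent file path, then reload it so that
    # a freshly overwritten file takes effect in the current session.
    module = importlib.import_module(module_name)
    return importlib.reload(module)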
def submit_yarn_application(environment, job_working_dir, silent=False,
                            native_conda=False):
    '''
    Submit a YARN application
    '''
    # Create a temporary directory to write tmp files in for the job
    temp_local_directory = _make_temp_directory(prefix='hadoop_job_')

    req_file_path = None
    if hasattr(environment, 'additional_packages'):
        req_file_path = ExecutionEnvironment.write_required_packages(
            temp_local_directory, environment.additional_packages)

    # Build the actual hadoop command
    hadoop_cmd = HadoopExecutionEnvironment._build_hadoop_cmd(
        environment, job_working_dir, req_file_path,
        native_conda=native_conda)

    # Call the hadoop command
    if not silent:
        __LOGGER__.info(
            "Submitting job to Hadoop cluster using command= \n%s" % hadoop_cmd)

    proc = _subprocess.Popen(hadoop_cmd, shell=True,
                             stderr=_subprocess.STDOUT,
                             stdout=_subprocess.PIPE)

    app_info = HadoopExecutionEnvironment._parse_hadoop_cmd_output(proc, silent)

    if 'app_id' not in app_info or not app_info['app_id']:
        __LOGGER__.info(
            "Submitting job to Hadoop cluster using command= \n%s" % hadoop_cmd)
        raise RuntimeError(
            "Error submitting application or determining application id. Please confirm that you"
            " can correctly access the Hadoop environment specified (try running the above"
            " command from a terminal to see more diagnostic output).")
    else:
        _shutil.rmtree(temp_local_directory)

    return app_info
def prepare_job_files(environment, job):
    '''
    Upload all job related information to HDFS so that it can be executed
    remotely
    '''
    exec_dir = HadoopExecutionEnvironment.create_job_home_dir(
        environment, job.name)
    _job.HadoopJob._update_exec_dir(job, exec_dir)
    logging.info("Job working directory: %s" % job._exec_dir)

    temp_job_folder = _make_temp_directory(prefix='hadoop_job_')
    try:
        ExecutionEnvironment.prepare_job_exec_dir(job, temp_job_folder)

        # Move everything to HDFS
        _file_util.upload_folder_to_hdfs(
            temp_job_folder, exec_dir,
            hadoop_conf_dir=environment.hadoop_conf_dir)
    finally:
        _shutil.rmtree(temp_job_folder)
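# All of the helpers above create scratch space through `_make_temp_directory`.
# A minimal sketch of such a helper is shown below, assuming it simply wraps
# `tempfile.mkdtemp`; the real implementation may differ (e.g. honoring a
# configurable cache or temp root).
import tempfile

def _make_temp_directory(prefix='tmp_'):
    # Create a uniquely named directory; callers are responsible for removing
    # it, typically via shutil.rmtree in a try/finally block as above.
    return tempfile.mkdtemp(prefix=prefix)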