def _after_run(self):
    super(SparkPipeline, self)._after_run()
    # Download each registered HDFS output directory to its local destination.
    for local_uri_info in self._local_uri_infos:
        local_uri = local_uri_info['local_uri']
        hdfs_uri = local_uri_info['hdfs_uri']
        if local_uri_info['overwrite']:
            logger.info("Preparing local directory: %s" % local_uri)
            if not self._force_delete_file(local_uri):
                raise error.BigflowHDFSException(
                    "Failed to remove target path: %s" % local_uri)
        else:
            if self._path_exists(local_uri):
                raise error.BigflowHDFSException(
                    "Failed to output to target path: %s, target path already exists"
                    % local_uri)
        os.makedirs(local_uri)
        self._client.fs_get(hdfs_uri + "/*", local_uri, self._hadoop_config)
    self._local_uri_infos = []
    if SparkPipeline.output_dir_conf_key in self._config["spark_conf"]:
        del self._config["spark_conf"][SparkPipeline.output_dir_conf_key]
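# Usage sketch (an assumption, not part of the original source): each entry in
# self._local_uri_infos must carry the three keys read above. A hypothetical
# registration could look like:
#
#     self._local_uri_infos.append({
#         'local_uri': '/home/work/job_output',      # download destination
#         'hdfs_uri': 'hdfs:///app/tmp/job_output',  # fetched via fs_get(uri + "/*")
#         'overwrite': True,                         # force-delete destination first
#     })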
def fs_mkdir(self, path, args=None):
    """
    Wraps console command 'hadoop fs -mkdir -p <path>'

    Args:
        path (str): path to be created
    """
    if not self.fs_test(path, args):
        commit_args = ["fs"]
        commit_args.extend(self.__build_args(path, args))
        commit_args.extend(["-mkdir", "-p", path])
        if not self.__commit(commit_args):
            raise error.BigflowHDFSException("Error creating HDFS path %s" % path)
    return self.fs_test(path, args)
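# Usage sketch (hedged): `client` is assumed to be a constructed instance of
# this client class; construction details are outside this excerpt.
#
#     client.fs_mkdir("/app/bigflow/tmp_data")          # like -p: creates parents,
#                                                       # no-op if path exists
#     exists = client.fs_test("/app/bigflow/tmp_data")  # fs_mkdir returns this check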
def fs_mv(self, source, target, args=None):
    """
    Wraps console command 'hadoop fs -mv <source> <target>'

    Args:
        source (str): path of source
        target (str): path of target
    """
    commit_args = ["fs"]
    commit_args.extend(self.__build_args(source, args))
    commit_args.extend(["-mv", source, target])
    if not self.__commit(commit_args):
        msg = "Error moving HDFS path ['%s'] to ['%s']" % (source, target)
        raise error.BigflowHDFSException(msg)
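# Usage sketch (hedged, paths illustrative): fs_mv is what makes the
# "upload to a temp path, then rename into place" pattern used by
# _prepare_cache_archive below safe for concurrent readers:
#
#     client.fs_put("archive.tar.gz", "/app/archive.tar.gz-tmp")
#     client.fs_mv("/app/archive.tar.gz-tmp", "/app/archive.tar.gz")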
def fs_rmr(self, path, args=None):
    """
    Wraps console command 'hadoop fs -rmr <path>'

    Args:
        path (str): path to be removed
    """
    if self.fs_test(path, args):
        commit_args = ["fs"]
        commit_args.extend(self.__build_args(path, args))
        commit_args.extend(["-rmr", path])
        if not self.__commit(commit_args):
            raise error.BigflowHDFSException(
                "Error removing HDFS path %s" % path)
    return not self.fs_test(path, args)
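# Usage sketch (hedged): the fs_test guard above makes fs_rmr a no-op for an
# absent path, so callers can remove unconditionally:
#
#     if client.fs_rmr("/app/bigflow/stale_tmp"):  # True once the path is gone
#         logger.info("cleaned up stale temp dir")
#
# Note: 'hadoop fs -rmr' is deprecated in newer Hadoop releases in favor of
# 'hadoop fs -rm -r'; the wrapped command here is the legacy spelling.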
def _prepare_cache_archive(self):
    logger.info("Checking PreparedArchive for Spark Pipeline...")
    existed = self._client.fs_test(self.prepared_archive_path, self._hadoop_config)
    tmp_path = self.prepared_archive_path + '-' + str(uuid.uuid4())
    self._job_config.prepared_archive_path = self.prepared_archive_path
    self._job_config.tmp_data_path = tmp_path
    if self._config['reprepare_cache_archive'] or not existed:
        if self._config['reprepare_cache_archive']:
            if not existed:
                logger.info("Bigflow PreparedArchive does not exist")
            else:
                logger.info("Re-prepare Bigflow PreparedArchive")
                self._client.fs_rmr(self.prepared_archive_path, self._hadoop_config)
        import subprocess
        bigflow_home = self._get_bigflow_python_home()
        local_cache_archive = "bigflow_python_%s.tar.gz" % (str(uuid.uuid4()))
        # Pack the Python runtime and flume binaries into a local tarball.
        cmd = "tar czf %s -C %s --exclude=flume/worker python_runtime flume" % (
            local_cache_archive, bigflow_home)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            raise error.BigflowPlanningException(
                "Cannot make PreparedArchive file")
        try:
            # Upload to a temporary path first, then rename into place, so a
            # concurrent reader never sees a partially written archive.
            self._client.fs_put(local_cache_archive, tmp_path, self._hadoop_config)
            self._client.fs_mv(tmp_path, self.prepared_archive_path,
                               self._hadoop_config)
        except error.BigflowHDFSException:
            # only need to delete the archive path when an exception occurs.
            self._remote_temp_files.append(tmp_path)
            if not self._client.fs_test(self.prepared_archive_path,
                                        self._hadoop_config):
                msg = "Unable to upload Bigflow PreparedArchive, please " \
                      "make sure you have write permission to " \
                      "tmp_data_path['%s']" % self._config['tmp_data_path']
                raise error.BigflowHDFSException(msg)
        finally:
            ret = subprocess.call("rm %s" % local_cache_archive, shell=True)
            self._client.fs_rmr(tmp_path, self._hadoop_config)
    else:
        logger.info("Bigflow PreparedArchive exists already")
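# A minimal sketch of the publish pattern above (names are illustrative, and
# the reading of intent is an assumption): each attempt uploads to a unique
# uuid-suffixed temp path, so concurrent jobs cannot collide on upload. If
# another job wins the rename race, this job's fs_mv fails, but the
# fs_test(prepared_archive_path) check then succeeds and no error is raised.
#
#     tmp = final_path + '-' + str(uuid.uuid4())  # unique per attempt
#     client.fs_put(local_tarball, tmp)
#     client.fs_mv(tmp, final_path)               # the rename is the commit point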
def fs_put(self, source, target, args=None, need_mkdir=True):
    """
    Wraps console command 'hadoop fs -put <source> <target>'

    Args:
        source (str): path of source
        target (str): path of target
    """
    if need_mkdir:
        import os
        mk_path = os.path.dirname(target)
        self.fs_mkdir(mk_path)
    commit_args = ["fs"]
    commit_args.extend(self.__build_args(target, args))
    commit_args.extend(["-put", source, target])
    if not self.__commit(commit_args):
        msg = "Error uploading temp file from [%s] to [%s], " \
              "please make sure the source file exists on the local filesystem " \
              "and you have write permission to the target HDFS directory, " \
              "or maybe you can change your 'tmp_data_path'" % (source, target)
        raise error.BigflowHDFSException(msg)
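# Usage sketch (hedged, placeholder paths): need_mkdir=True (the default)
# creates the parent of `target` first, which is what _prepare_cache_archive
# relies on when uploading to a fresh tmp_path:
#
#     client.fs_put("bigflow_python_<uuid>.tar.gz",
#                   "/app/bigflow/prepared_archive-<uuid>")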