def read_data(self, fd, offset, size):
    if offset >= fd.size:
        return ""
    else:
        try:
            contents = self.hdfs.read_file(fd.abspath, offset=offset, length=size)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS read file: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS read file: {0}, exc={1}".format(fd.abspath, e))
        return contents

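# Usage sketch (illustrative only, not part of this module): stream a remote
# file through read_data in fixed-size pieces. `fs` and `fd` are hypothetical
# names for an already-constructed filesystem wrapper and a descriptor obtained
# from make_fd; the 64 KiB piece size is an arbitrary choice for the example.
#
#     offset = 0
#     while offset < fd.size:
#         data = fs.read_data(fd, offset, 64 * 1024)
#         offset += len(data)
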
def local_mv_file(self, src, dst):
    if self.simulateOnly:
        print("SIMULATE -> local move file: {0} -> {1} ".format(src.abspath, dst.abspath))
    else:
        try:
            self.hdfs.rename_file_dir(src.abspath, dst.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(src.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(src.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(src.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS rename file: {0}, exc={1}".format(src.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(src.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS rename file: {0}, exc={1}".format(src.abspath, e))

def truncate_file(self, fd, size):
    if self.simulateOnly:
        print("SIMULATE -> truncate file: {0}, size={1}".format(fd.abspath, size))
    else:
        try:
            self.hdfs.truncate_file(fd.abspath, size)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS truncate file: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS truncate file: {0}, exc={1}".format(fd.abspath, e))

def concat_files(self, fd, chunkFdList):
    strList = list()
    for chunkFd in chunkFdList:
        strList.append(chunkFd.abspath)
    if self.simulateOnly:
        print("SIMULATE -> concat file: {0}, sources={1}".format(fd.abspath, ",".join(strList)))
    else:
        try:
            self.hdfs.concat_files(fd.abspath, strList)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS concat file: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS concat file: {0}, exc={1}".format(fd.abspath, e))

def list_dir(self, fd):
    try:
        status = self.hdfs.list_dir(fd.abspath)
    except pywebhdfs.errors.Unauthorized as e:
        self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
        raise Errors.Unauthorized(
            "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
    except requests.exceptions.RequestException as e:
        self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
        raise Errors.BadConnection(
            "Connection error while looking for path: {0}, exc={1}".format(fd.abspath, e))
    except pywebhdfs.errors.PyWebHdfsException as e:
        self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
        raise Errors.FsException(
            "An exception happened while looking for path: {0}, exc={1}".format(fd.abspath, e))
    currentDir = status["FileStatuses"]["FileStatus"]
    for item in currentDir:
        yield HadoopFileDescriptor(self, fd.abspath, isSrc=True,
                                   needsDstDirCheck=False, fileJson=item)

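# Usage sketch (illustrative only): list_dir is a generator that yields one
# HadoopFileDescriptor per directory entry. `fs` and `dirFd` are hypothetical
# names, assumed to come from make_fd on an existing directory.
#
#     for childFd in fs.list_dir(dirFd):
#         print(childFd.name, childFd.type, childFd.size)
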
def delete_file_dir(self, fd, recursive=False, force=False):
    if self.simulateOnly:
        print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(fd.abspath, recursive))
    else:
        try:
            if not recursive or force or \
                    query_yes_no(question="Are you sure you want to delete folder recursively?",
                                 default="no"):
                status = self.hdfs.delete_file_dir(fd.abspath, recursive=recursive)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS delete directory: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS delete directory: {0}, exc={1}".format(fd.abspath, e))

def __init__(self, localFs, path, isSrc, dstDirMustExist):
    self.fs = localFs
    # Canonicalize path to "/" on descriptor creation
    path = localFs.canonicalize_path(path)
    # Local file system
    if not os.path.exists(path):
        if isSrc:
            raise Errors.FileNotFound("path: %s not found" % path)
        else:
            dstDir, dstName = localFs.get_dir_basename(path)
            if dstDirMustExist and not os.path.exists(dstDir):
                raise Errors.FileNotFound("destination directory: %s not found" % dstDir)
            self.name = dstName
            self.abspath = path
            self.type = "FILE"
            self.accessTime = datetime.datetime.fromtimestamp(86400)
            self.modificationTime = datetime.datetime.fromtimestamp(86400)
            self.exists = False
            self.replication = "1"
            self.permissions = "777"
            self.owner = "root"
            self.group = "supergroup"
            self.numChildren = "0"
            self.size = 0
        return
    if os.path.islink(path):
        self.type = "SYMLINK"
    elif os.path.isfile(path):
        self.type = "FILE"
    elif os.path.isdir(path):
        self.type = "DIRECTORY"
    self.exists = True
    self.abspath = localFs.canonicalize_path(os.path.abspath(path))
    _, self.name = localFs.get_dir_basename(self.abspath)
    self.replication = "1"
    self.numChildren = "1"
    if os.name == "nt":
        self.size = os.path.getsize(path)
        self.modificationTime = datetime.datetime.fromtimestamp(os.path.getmtime(path))
        self.accessTime = datetime.datetime.fromtimestamp(os.path.getatime(path))
        self.permissions = "777"
        self.owner = "-"
        self.group = "-"
    else:
        statinfo = os.stat(self.abspath)
        self.size = statinfo.st_size
        self.modificationTime = datetime.datetime.fromtimestamp(statinfo.st_mtime)
        self.accessTime = datetime.datetime.fromtimestamp(statinfo.st_atime)
        self.permissions = str(oct(statinfo.st_mode))[-3:]
        self.owner = pwd.getpwuid(statinfo.st_uid).pw_name
        self.group = grp.getgrgid(statinfo.st_gid).gr_name

def exists_file_dir(self, fd):
    try:
        return self.hdfs.exists_file_dir(fd.abspath)
    except pywebhdfs.errors.Unauthorized as e:
        self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
        raise Errors.Unauthorized(
            "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
    except requests.exceptions.RequestException as e:
        self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
        raise Errors.BadConnection(
            "Connection error during HDFS exists test: {0}, exc={1}".format(fd.abspath, e))
    except pywebhdfs.errors.PyWebHdfsException as e:
        self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
        raise Errors.FsException(
            "An exception happened during HDFS exists test: {0}, exc={1}".format(fd.abspath, e))

def make_dir(self, path):
    if self.simulateOnly:
        print("SIMULATE -> make dir: " + path)
    else:
        try:
            self.hdfs.make_dir(path)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(path, e))
            raise Errors.BadConnection(
                "Connection error during HDFS create directory: {0}, exc={1}".format(path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(path, e))
            raise Errors.FsException(
                "An exception happened during HDFS create directory: {0}, exc={1}".format(path, e))

def make_fd(self, path, isSrc, dstDirMustExist):
    fd = None
    try:
        fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
    except pywebhdfs.errors.FileNotFound:
        self.logger.info("DESC: does not exist: " + path)
        raise Errors.FileNotFound("Path {0} does not exist".format(path))
    except pywebhdfs.errors.Unauthorized as e:
        self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
        raise Errors.Unauthorized(
            "Unauthorized access to the path {0}: {1}".format(path, e))
    except requests.exceptions.RequestException as e:
        self.logger.info("ConnectionError for path {0}: {1}".format(path, e))
        raise Errors.BadConnection(
            "Connection error while looking for path: {0}, exc={1}".format(path, e))
    except pywebhdfs.errors.PyWebHdfsException as e:
        self.logger.info("PyWebHdfsException for path {0}: {1}".format(path, e))
        raise Errors.FsException(
            "An exception happened while looking for path: {0}, exc={1}".format(path, e))
    return fd

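# Usage sketch (illustrative only): make_fd wraps HadoopFileDescriptor
# construction and translates pywebhdfs/requests errors into this module's
# Errors hierarchy. `fs` and the paths below are hypothetical.
#
#     srcFd = fs.make_fd("/data/input.bin", isSrc=True, dstDirMustExist=False)
#     dstFd = fs.make_fd("/backup/input.bin", isSrc=False, dstDirMustExist=True)
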
def remote_cp_file(self, src, dst, dstFs):
    dstChunkList = None
    # Step 1: Perform a copy
    progressString = "- Progress: "
    self.logger.info("REMOTE COPY ({0}): {1} -> {2}".format(
        src.size, src.abspath, dst.abspath))
    if src.size <= Constants.DEFAULT_BIG_FILE_THRESHOLD:
        if self.verbose and is_normal_stdout():
            sys.stdout.write(progressString)
            sys.stdout.flush()
        if dst.exists:
            dstFs.delete_file_dir(dst)
        dstFs.touch_file(dst)
        self.cp_chunk(src, dst, dstFs, 0, 0, True, "wb")
    else:
        chunk = 0
        offset = 0
        chunkSize = Constants.DEFAULT_BIG_FILE_THRESHOLD
        # Integer division: numChunks is only used as a chunk counter below
        numChunks = (src.size // chunkSize) + 1
        dstChunkList = list()
        while offset < src.size:
            dstChunk = dstFs.make_fd_retriable(dst.abspath + ".__chunk__" + str(chunk),
                                               isSrc=False, dstDirMustExist=True)
            dstChunkList.append(dstChunk)
            self.logger.info("BIG COPY: chunk={0}, dst={1}".format(chunk, dstChunk.abspath))
            # Create an empty chunk only if it is missing; an existing chunk is
            # either skipped (already complete), rejected (too big) or resumed below.
            if not dstChunk.exists:
                dstFs.touch_file(dstChunk)
            if dstChunk.size == Constants.DEFAULT_BIG_FILE_THRESHOLD \
                    and src.modificationTime <= dstChunk.modificationTime:
                if self.verbose:
                    print("%s -> %s: skipped" % (src.abspath, dstChunk.abspath))
            elif dstChunk.size > Constants.DEFAULT_BIG_FILE_THRESHOLD:
                errMsg = ("a chunk: {0} has its size bigger than the max size, "
                          "you need to remove it before the next retry").format(dstChunk.abspath)
                self.logger.error(errMsg)
                raise Errors.FsException(errMsg)
            else:
                if self.verbose:
                    print("%s -> %s" % (src.abspath, dstChunk.abspath))
                    if is_normal_stdout():
                        progressFormatString = "Chunk ({0}/{1}) - "
                        progressString += progressFormatString.format(chunk + 1, numChunks)
                        sys.stdout.write(progressString)
                        sys.stdout.flush()
                # Resume from whatever part of this chunk was already copied
                self.cp_chunk(src, dstChunk, dstFs, offset + dstChunk.size,
                              dstChunk.size, chunk == numChunks - 1, "ab")
                if self.verbose and is_normal_stdout():
                    sys.stdout.write("\r")
                    sys.stdout.flush()
            chunk += 1
            offset = chunk * chunkSize
    # Step 2: concat all chunk files into the final file
    self.concat_chunk_files(dstFs, dst, dstChunkList)
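
# Usage sketch (illustrative only, not part of this module): how a caller might
# drive remote_cp_file. `srcFs`, `dstFs` and the paths are hypothetical and
# assume both filesystem wrappers were constructed elsewhere with valid
# credentials. Files above Constants.DEFAULT_BIG_FILE_THRESHOLD are copied as
# ".__chunk__<n>" pieces and stitched together by concat_chunk_files.
#
#     srcFd = srcFs.make_fd("/data/big_input.bin", isSrc=True, dstDirMustExist=False)
#     dstFd = dstFs.make_fd("/backup/big_input.bin", isSrc=False, dstDirMustExist=True)
#     srcFs.remote_cp_file(srcFd, dstFd, dstFs)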