def glob(path):
  """Expands a glob pattern on GCS or on the local filesystem.

  Args:
    path: Glob pattern. A pattern starting with 'gs://' is expanded with
      gcsio.GcsIO().glob(); any other pattern is expanded with the standard
      library glob module.

  Returns:
    The result of the underlying glob call (for local patterns, a list of
    matching path strings, empty when nothing matches).
  """
  if path.startswith('gs://'):
    # pylint: disable=g-import-not-at-top
    from google.cloud.dataflow.io import gcsio
    return gcsio.GcsIO().glob(path)
  # This function is itself called `glob`, so a bare `glob.glob` can resolve to
  # this function instead of the standard module. Importing the module under a
  # local alias makes the lookup unambiguous wherever the function is defined.
  # pylint: disable=g-import-not-at-top
  import glob as glob_module
  return glob_module.glob(path)
def __enter__(self):
  """Opens the source file and positions it at the first full line to read.

  Resolves missing offsets (start defaults to 0, end defaults to the file
  length), seeks to the start of the first complete line at or after
  start_offset, and creates the range tracker for the finalized offsets.

  Returns:
    This reader object.
  """
  if self.source.is_gcs_source:
    # pylint: disable=g-import-not-at-top
    from google.cloud.dataflow.io import gcsio
    opener = gcsio.GcsIO().open
  else:
    opener = open
  self._file = opener(self.source.file_path, 'rb')

  # An unspecified end offset means "read to the end of the file".
  if self.end_offset is None:
    self._file.seek(0, os.SEEK_END)
    self.end_offset = self._file.tell()
  # An unspecified start offset means "read from the beginning".
  if self.start_offset is None:
    self.start_offset = 0

  self.current_offset = self.start_offset
  if self.start_offset > 0:
    # Step back one byte and read a line. If start_offset sits at the
    # beginning of a line this only consumes the preceding newline; if it
    # sits in the middle of a line it consumes the rest of that partial line.
    # In both cases the file ends up at the beginning of the first full line.
    self._file.seek(self.start_offset - 1)
    self.current_offset = self.start_offset - 1
    skipped = self._file.readline()
    self.current_offset = self.current_offset + len(skipped)
  else:
    self._file.seek(self.start_offset)

  # The tracker is created only once both offsets have their final values.
  self.range_tracker = range_trackers.OffsetRangeTracker(
      self.start_offset, self.end_offset)
  return self
def open(path, mode, mime_type):
  """Opens a file on GCS or on the local filesystem.

  Args:
    path: File location. A path starting with 'gs://' is opened with
      gcsio.GcsIO().open(); any other path is opened as a local file.
    mode: Mode string forwarded to the underlying open call.
    mime_type: Forwarded as the mime_type keyword for GCS paths. Not used
      for local files.

  Returns:
    The file object returned by the underlying open call.
  """
  if path.startswith('gs://'):
    # pylint: disable=g-import-not-at-top
    from google.cloud.dataflow.io import gcsio
    return gcsio.GcsIO().open(path, mode, mime_type=mime_type)
  # This function is itself called `open`, so a bare `open(path, mode)` can
  # resolve to this three-argument function instead of the built-in. Fetch the
  # built-in explicitly from the interpreter's builtins module.
  # pylint: disable=g-import-not-at-top
  try:
    import __builtin__ as builtins_module  # Python 2
  except ImportError:
    import builtins as builtins_module  # Python 3
  return builtins_module.open(path, mode)
def rm(path):
  """Deletes a single file on GCS or on the local filesystem.

  Args:
    path: File to delete. A path starting with 'gs://' is deleted with
      gcsio.GcsIO().delete(); any other path is removed with os.remove().

  Raises:
    IOError: If removing a local file fails with an OSError.
  """
  if not path.startswith('gs://'):
    try:
      os.remove(path)
    except OSError as err:
      # Local failures are reported to callers as IOError.
      raise IOError(err)
    return
  # pylint: disable=g-import-not-at-top
  from google.cloud.dataflow.io import gcsio
  gcsio.GcsIO().delete(path)
def rename(src, dst):
  """Renames a file on GCS or on the local filesystem.

  Args:
    src: Existing location. A 'gs://' prefix selects gcsio.GcsIO().rename();
      anything else selects os.rename().
    dst: New location. Must also start with 'gs://' when src does.

  Raises:
    IOError: If a local rename fails with an OSError.
  """
  if not src.startswith('gs://'):
    try:
      os.rename(src, dst)
    except OSError as err:
      # Local failures are reported to callers as IOError.
      raise IOError(err)
    return
  # Source and destination must both be GCS locations.
  assert dst.startswith('gs://'), dst
  # pylint: disable=g-import-not-at-top
  from google.cloud.dataflow.io import gcsio
  gcsio.GcsIO().rename(src, dst)
def __init__(self, source):
  """Expands the source's file pattern into the list of files to read.

  Args:
    source: Object exposing is_gcs_source and file_path. The pattern in
      file_path is expanded with gcsio.GcsIO().glob() for GCS sources and
      with glob.glob() otherwise.

  Raises:
    RuntimeError: If the pattern matches no files.
  """
  self.source = source
  if source.is_gcs_source:
    # pylint: disable=g-import-not-at-top
    from google.cloud.dataflow.io import gcsio
    expand = gcsio.GcsIO().glob
  else:
    expand = glob.glob
  self.file_paths = expand(self.source.file_path)
  if not self.file_paths:
    raise RuntimeError(
        'No files found for path: %s' % self.source.file_path)
def rmdir(path):
  """Removes a directory tree on GCS or on the local filesystem.

  Args:
    path: Directory to remove. For a 'gs://' path every entry returned by
      globbing '<path>/*' is deleted; any other path is removed with
      shutil.rmtree().

  Raises:
    IOError: If removing a local tree fails with an OSError.
  """
  if not path.startswith('gs://'):
    try:
      shutil.rmtree(path)
    except OSError as err:
      # Local failures are reported to callers as IOError.
      raise IOError(err)
    return
  # pylint: disable=g-import-not-at-top
  from google.cloud.dataflow.io import gcsio
  client = gcsio.GcsIO()
  # Make sure the prefix ends with a slash before appending the wildcard.
  prefix = path if path.endswith('/') else path + '/'
  # TODO(robertwb): Threadpool?
  for entry in client.glob(prefix + '*'):
    client.delete(entry)
def copytree(src, dst):
  """Copies a directory tree on GCS or on the local filesystem.

  Args:
    src: Source directory. A 'gs://' prefix selects gcsio.GcsIO().copytree();
      anything else selects shutil.copytree().
    dst: Destination directory. For GCS copies both src and dst must end
      with '/', and dst must start with 'gs://'. For local copies an
      existing dst is removed before copying.

  Raises:
    IOError: If a local copy (or the removal of an existing dst) fails with
      an OSError.
  """
  if not src.startswith('gs://'):
    try:
      # Replace, rather than merge into, an existing destination.
      if os.path.exists(dst):
        shutil.rmtree(dst)
      shutil.copytree(src, dst)
    except OSError as err:
      # Local failures are reported to callers as IOError.
      raise IOError(err)
    return
  assert dst.startswith('gs://'), dst
  assert src.endswith('/'), src
  assert dst.endswith('/'), dst
  # pylint: disable=g-import-not-at-top
  from google.cloud.dataflow.io import gcsio
  gcsio.GcsIO().copytree(src, dst)
def setUp(self):
  """Creates a fake GCS client and a GcsIO instance built on top of it."""
  fake_client = FakeGcsClient()
  self.client = fake_client
  self.gcs = gcsio.GcsIO(fake_client)
def _open(uri, mode='rb'):
  """Opens a GCS object or a local file.

  Args:
    uri: Location to open. A 'gs://' prefix selects gcsio.GcsIO().open();
      anything else selects the built-in open().
    mode: Mode string forwarded to the underlying open call.

  Returns:
    The file object returned by the underlying open call.
  """
  opener = gcsio.GcsIO().open if uri.startswith('gs://') else open
  return opener(uri, mode)