示例#1
0
 def glob(path):
     """Expand a glob pattern against the local filesystem or GCS.

     Args:
       path: A glob pattern. Patterns starting with 'gs://' are expanded
         through GcsIO; everything else goes to the standard glob module.

     Returns:
       Whatever the selected backend's glob() returns for the pattern.

     NOTE(review): the local branch relies on the bare name `glob` resolving
     to the standard-library module rather than to this function, which holds
     when this is defined inside a class body -- confirm against the
     enclosing scope.
     """
     if not path.startswith('gs://'):
         return glob.glob(path)
     # Imported lazily so that local-only use does not need the GCS client.
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.io import gcsio
     return gcsio.GcsIO().glob(path)
示例#2
0
    def __enter__(self):
        """Open the source file and position it at the first full line.

        Opens self.source.file_path for binary reading (through GcsIO when the
        source is a GCS source, otherwise with the builtin open), fills in
        missing start/end offsets, skips any partial leading line and creates
        the offset range tracker.

        NOTE(review): when start_offset is already set, current_offset is only
        adjusted here, not initialised; presumably the constructor sets it --
        confirm.

        Returns:
          This reader, so it can be bound by the `with` statement.
        """
        source = self.source
        if source.is_gcs_source:
            # pylint: disable=g-import-not-at-top
            from google.cloud.dataflow.io import gcsio
            self._file = gcsio.GcsIO().open(source.file_path, 'rb')
        else:
            self._file = open(source.file_path, 'rb')

        # An unspecified end offset means "up to the end of the file", so
        # measure the file length by seeking to its end.
        if self.end_offset is None:
            self._file.seek(0, os.SEEK_END)
            self.end_offset = self._file.tell()

        # An unspecified start offset means "from the very beginning".
        if self.start_offset is None:
            self.start_offset = self.current_offset = 0

        if self.start_offset > 0:
            # Back up one byte and read a line. If start_offset was already at
            # the beginning of a line this only consumes the preceding newline;
            # otherwise it consumes the rest of the partial line. Either way
            # the read position lands at the beginning of the first full line.
            self._file.seek(self.start_offset - 1)
            self.current_offset -= 1
            skipped = self._file.readline()
            self.current_offset += len(skipped)
        else:
            self._file.seek(self.start_offset)

        # The tracker is built last, once both offsets are final.
        self.range_tracker = range_trackers.OffsetRangeTracker(
            self.start_offset, self.end_offset)

        return self
示例#3
0
 def open(path, mode, mime_type):
     """Open a file on the local filesystem or on GCS.

     Args:
       path: File location. Paths starting with 'gs://' are opened via GcsIO.
       mode: Mode string forwarded to the selected opener.
       mime_type: Forwarded to GcsIO.open for GCS paths; not used for local
         files.

     Returns:
       The file object produced by the selected opener.

     NOTE(review): the local branch relies on the bare name `open` resolving
     to the builtin rather than to this function, which holds when this is
     defined inside a class body -- confirm against the enclosing scope.
     """
     if not path.startswith('gs://'):
         return open(path, mode)
     # Imported lazily so that local-only use does not need the GCS client.
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.io import gcsio
     return gcsio.GcsIO().open(path, mode, mime_type=mime_type)
示例#4
0
 def rm(path):
     """Delete a single file from the local filesystem or from GCS.

     Args:
       path: File to delete. Paths starting with 'gs://' are deleted via
         GcsIO; everything else via os.remove.

     Raises:
       IOError: If os.remove fails with an OSError on a local path.
     """
     if not path.startswith('gs://'):
         try:
             os.remove(path)
         except OSError as exc:
             # Surface local failures as IOError for callers.
             raise IOError(exc)
         return
     # Imported lazily so that local-only use does not need the GCS client.
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.io import gcsio
     gcsio.GcsIO().delete(path)
示例#5
0
 def rename(src, dst):
     """Rename a file on the local filesystem or within GCS.

     Args:
       src: Existing location. A 'gs://' prefix selects the GCS backend.
       dst: New location. Must also start with 'gs://' when src does.

     Raises:
       AssertionError: If src is a GCS path but dst is not.
       IOError: If os.rename fails with an OSError on a local path.
     """
     if not src.startswith('gs://'):
         try:
             os.rename(src, dst)
         except OSError as exc:
             # Surface local failures as IOError for callers.
             raise IOError(exc)
         return
     assert dst.startswith('gs://'), dst
     # Imported lazily so that local-only use does not need the GCS client.
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.io import gcsio
     gcsio.GcsIO().rename(src, dst)
示例#6
0
 def __init__(self, source):
   """Resolve the source's file pattern into a list of concrete paths.

   Args:
     source: Object exposing `is_gcs_source` and `file_path`; the pattern in
       `file_path` is expanded via GcsIO for GCS sources and via the
       standard glob module otherwise.

   Raises:
     RuntimeError: If the pattern matches no files.
   """
   self.source = source
   if not source.is_gcs_source:
     matches = glob.glob(self.source.file_path)
   else:
     # Imported lazily so that local-only use does not need the GCS client.
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.io import gcsio
     matches = gcsio.GcsIO().glob(self.source.file_path)
   self.file_paths = matches
   if not self.file_paths:
     raise RuntimeError(
         'No files found for path: %s' % self.source.file_path)
示例#7
0
 def rmdir(path):
     """Remove a directory tree locally, or every matching object on GCS.

     Args:
       path: Directory to remove. For 'gs://' paths a trailing '/' is added
         if missing and each entry matched by `path + '*'` is deleted
         individually; local paths are removed with shutil.rmtree.

     Raises:
       IOError: If shutil.rmtree fails with an OSError on a local path.
     """
     if not path.startswith('gs://'):
         try:
             shutil.rmtree(path)
         except OSError as exc:
             # Surface local failures as IOError for callers.
             raise IOError(exc)
         return
     # Imported lazily so that local-only use does not need the GCS client.
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.io import gcsio
     client = gcsio.GcsIO()
     prefix = path if path.endswith('/') else path + '/'
     # TODO(robertwb): Threadpool?
     for obj in client.glob(prefix + '*'):
         client.delete(obj)
示例#8
0
 def copytree(src, dst):
     """Copy a directory tree locally or within GCS.

     Args:
       src: Source directory. A 'gs://' prefix selects the GCS backend, in
         which case it must end with '/'.
       dst: Destination directory. For GCS it must start with 'gs://' and end
         with '/'. A local destination that already exists is removed first.

     Raises:
       AssertionError: If the GCS path requirements above are not met.
       IOError: If a local filesystem operation fails with an OSError.
     """
     if not src.startswith('gs://'):
         try:
             # shutil.copytree needs to create dst itself, so clear it first.
             if os.path.exists(dst):
                 shutil.rmtree(dst)
             shutil.copytree(src, dst)
         except OSError as exc:
             raise IOError(exc)
         return
     assert dst.startswith('gs://'), dst
     assert src.endswith('/'), src
     assert dst.endswith('/'), dst
     # Imported lazily so that local-only use does not need the GCS client.
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.io import gcsio
     gcsio.GcsIO().copytree(src, dst)
 def setUp(self):
     """Create a fake GCS client and a GcsIO instance that wraps it."""
     fake_client = FakeGcsClient()
     self.client = fake_client
     self.gcs = gcsio.GcsIO(fake_client)
示例#10
0
def _open(uri, mode='rb'):
    """Open `uri` with GcsIO for 'gs://' locations, else with builtin open.

    Args:
      uri: File location; a 'gs://' prefix selects the GCS backend.
      mode: Mode string forwarded to the selected opener.

    Returns:
      The file object produced by the selected opener.
    """
    opener = gcsio.GcsIO().open if uri.startswith('gs://') else open
    return opener(uri, mode)