Example #1
 def test_delete_error(self):
     path1 = os.path.join(self.tmpdir, 'f1')
     with self.assertRaises(BeamIOError) as error:
         FileSystems.delete([path1])
     # str() rather than .message: exceptions have no .message on Python 3.
     self.assertTrue(
         str(error.exception).startswith('Delete operation failed'))
     # list() is needed because dict views never compare equal to lists.
     self.assertEqual(
         list(error.exception.exception_details.keys()), [path1])
Example #2
 def test_delete_error(self):
   path1 = os.path.join(self.tmpdir, 'f1')
   with self.assertRaises(BeamIOError) as error:
     FileSystems.delete([path1])
   # str() rather than .message: exceptions have no .message on Python 3.
   self.assertTrue(
       str(error.exception).startswith('Delete operation failed'))
   # list() is needed because dict views never compare equal to lists.
   self.assertEqual(list(error.exception.exception_details.keys()), [path1])
Example #3
 def test_delete_error(self):
     path1 = os.path.join(self.tmpdir, 'f1')
     with self.assertRaisesRegex(BeamIOError,
                                 r'^Delete operation failed') as error:
         FileSystems.delete([path1])
     self.assertEqual(list(error.exception.exception_details.keys()),
                      [path1])
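For context, BeamIOError (importable from apache_beam.io.filesystem) carries a
per-path exception_details mapping, which is what the assertions above inspect.
A minimal sketch of consuming it outside a test, with an illustrative missing
path:

from apache_beam.io.filesystem import BeamIOError
from apache_beam.io.filesystems import FileSystems

try:
    FileSystems.delete(['/tmp/does-not-exist'])  # illustrative missing path
except BeamIOError as err:
    # exception_details maps each failed path to the underlying exception.
    for path, exc in err.exception_details.items():
        print('delete failed for %s: %s' % (path, exc))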
Example #4
 def process(self, unused_element, signal):
     gcs_location = self.get_destination_uri()
     match_result = FileSystems.match([gcs_location])[0].metadata_list
     logging.debug("%s: matched %s files", self.__class__.__name__,
                   len(match_result))
     paths = [x.path for x in match_result]
     FileSystems.delete(paths)
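FileSystems.match takes a list of patterns and returns one MatchResult per
pattern; each entry in metadata_list is a FileMetadata carrying a path and a
size_in_bytes. The same match-then-delete pattern in isolation, with an
illustrative glob:

from apache_beam.io.filesystems import FileSystems

metadata_list = FileSystems.match(['/tmp/staging/*.json'])[0].metadata_list
for metadata in metadata_list:
    print(metadata.path, metadata.size_in_bytes)
FileSystems.delete([m.path for m in metadata_list])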
Example #5
  def test_delete(self):
    path1 = os.path.join(self.tmpdir, 'f1')

    with open(path1, 'a') as f:
      f.write('Hello')

    self.assertTrue(FileSystems.exists(path1))
    FileSystems.delete([path1])
    self.assertFalse(FileSystems.exists(path1))
Example #6
    def test_delete(self):
        path1 = os.path.join(self.tmpdir, 'f1')

        with open(path1, 'a') as f:
            f.write('Hello')

        self.assertTrue(FileSystems.exists(path1))
        FileSystems.delete([path1])
        self.assertFalse(FileSystems.exists(path1))
Example #7
  def pre_finalize(self, init_result, writer_results):
    num_shards = len(list(writer_results))
    dst_glob = self._get_final_name_glob(num_shards)
    dst_glob_files = [file_metadata.path
                      for mr in FileSystems.match([dst_glob])
                      for file_metadata in mr.metadata_list]

    if dst_glob_files:
      logging.warning('Deleting %d existing files in target path matching: %s',
                      len(dst_glob_files), self.shard_name_glob_format)
      FileSystems.delete(dst_glob_files)
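_get_final_name_glob is defined elsewhere in this sink. As a standalone sketch
of what such a helper typically computes, assuming a hypothetical %-style
shard-name template (not the snippet's actual attribute layout):

def final_name_glob(file_path_prefix, shard_name_glob_format, num_shards):
    # e.g. '-*-of-%(num_shards)05d' expands to '-*-of-00010' for 10 shards,
    # giving a glob that matches every shard of this write.
    return file_path_prefix + shard_name_glob_format % dict(
        num_shards=num_shards)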
Example #9
def delete_files(file_paths):
  """A function to clean up files or directories using ``FileSystems``.

  Globs are supported in file paths, and directories are deleted recursively.

  Args:
    file_paths: A list of strings containing file paths or directories.
  """
  if len(file_paths) == 0:
    raise RuntimeError('Clean up failed. Invalid file path: %s.' % file_paths)
  FileSystems.delete(file_paths)
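A usage sketch of delete_files with illustrative paths; per the docstring,
globs are accepted and directories are removed recursively:

delete_files([
    '/tmp/staging/',       # a directory, deleted recursively
    '/tmp/output/part-*',  # a glob over shard files
])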
Example #10
 def test_should_write_multiple_entries(self):
     dict_list = [{KEY_1: VALUE_1}, {KEY_2: VALUE_2}]
     with TestPipeline() as p:
         _ = (  # flake8: noqa
             p | beam.Create(dict_list)
             | WritePropsToTFRecord(TFRECORDS_PATH, lambda x: [x]))
     filenames = list(find_matching_filenames(TFRECORDS_PATH + '*'))
     assert len(filenames) == 1
     records = list(iter_read_tfrecord_file_as_dict_list(filenames[0]))
     assert records == dict_list
     FileSystems.delete(filenames)
Example #11
def remove_path(path):
    """
    Removes a path if it exists.
    """
    # We need to include this first if statement
    # to allow local broken symbolic links to be deleted
    # as well (which aren't matched by the Beam methods).
    if os.path.islink(path):
        os.remove(path)
    elif get_path_exists(path):
        FileSystems.delete([path])
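The islink branch above matters because a dangling symlink fails the existence
check yet still occupies the path. A quick local illustration with
hypothetical paths:

import os

os.symlink('/tmp/no-such-target', '/tmp/dangling')  # create a broken link
assert os.path.islink('/tmp/dangling')
assert not os.path.exists('/tmp/dangling')  # exists() follows the link
remove_path('/tmp/dangling')                # handled by the islink branch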
Example #12
 def pre_finalize(self, init_result, writer_results):
   writer_results = sorted(writer_results)
   num_shards = len(writer_results)
   existing_files = []
   for shard_num in range(len(writer_results)):
     final_name = self._get_final_name(shard_num, num_shards)
     if FileSystems.exists(final_name):
       existing_files.append(final_name)
   if existing_files:
     logging.info('Deleting existing files in target path: %d',
                  len(existing_files))
     FileSystems.delete(existing_files)
Example #13
 def pre_finalize(self, init_result, writer_results):
     writer_results = sorted(writer_results)
     num_shards = len(writer_results)
     existing_files = []
     for shard_num in range(len(writer_results)):
         final_name = self._get_final_name(shard_num, num_shards)
         if FileSystems.exists(final_name):
             existing_files.append(final_name)
     if existing_files:
         logging.info('Deleting existing files in target path: %d',
                      len(existing_files))
         FileSystems.delete(existing_files)
Example #14
def delete_files(file_paths):
  """A function to clean up files or directories using ``FileSystems``.

  Globs are supported in file paths, and directories are deleted recursively.

  Args:
    file_paths: A list of strings containing file paths or directories.
  """
  if len(file_paths) == 0:
    raise RuntimeError('Clean up failed. Invalid file path: %s.' %
                       file_paths)
  FileSystems.delete(file_paths)
Example #15
    def finalize_write(self, init_result, writer_results, pre_finalize_result):
        file_path_prefix = self.file_path_prefix.get()

        shard_paths = it.chain.from_iterable(writer_results)
        path_pairs = list(self._source_dest_shard_pairs(shard_paths))
        unique_dest_dirs = {pp.split(pair[1])[0] for pair in path_pairs}

        num_shards = len(path_pairs)
        min_threads = min(num_shards, FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        batch_size = FileSystems.get_chunk_size(file_path_prefix)
        batches = [
            path_pairs[i:i + batch_size]
            for i in six.moves.range(0, len(path_pairs), batch_size)
        ]

        logging.info(
            'Starting finalize_write threads with num_shards: %d, '
            'batches: %d, num_threads: %d', num_shards, len(batches),
            num_threads)
        start_time = time.time()

        if unique_dest_dirs:
            # Fix #18 run_using_threadpool raises if you pass in an empty list of inputs
            # so if we don't have any work to do, then just skip it
            util.run_using_threadpool(self._create_output_dir,
                                      unique_dest_dirs, num_threads)

            exception_batches = util.run_using_threadpool(
                self._rename_batch, batches, num_threads)

            all_exceptions = [
                e for exception_batch in exception_batches
                for e in exception_batch
            ]

            if all_exceptions:
                raise Exception('Encountered exceptions in finalize_write: %s'
                                % all_exceptions)

        for _, final_name in path_pairs:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                     time.time() - start_time)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
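FileSystems.get_chunk_size(path) returns the recommended batch size for the
filesystem that serves the path, which is why the rename pairs above are
chunked before being handed to the thread pool. The same batching in
isolation, with illustrative paths:

from apache_beam.io.filesystems import FileSystems

paths = ['gs://bucket/tmp/shard-%05d' % i for i in range(250)]  # illustrative
chunk_size = FileSystems.get_chunk_size(paths[0])
batches = [paths[i:i + chunk_size]
           for i in range(0, len(paths), chunk_size)]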
Example #16
    def finalize_write(self, init_result, writer_results):
        # writer_results is LIST[ LIST[ TUPLE[date_ts, gcs_path] ] ]
        # so first we need to flatten it to just LIST[ TUPLE[date_ts, gcs_path] ]
        shard_paths = it.chain.from_iterable(writer_results)

        client = BigQueryWrapper()

        # start an async load table job for each date
        waiting_jobs = set(self._load_tables(client, shard_paths))

        # wait for jobs to finish
        while waiting_jobs:
            logging.info('Waiting for %s bigquery tables to load...',
                         len(waiting_jobs))
            completed_jobs = set()
            for job in waiting_jobs:
                job_id, table_ref, date_ts = job
                table_str = encode_table_ref(table_ref)
                response = client.get_job_status(self.project_id, job_id)
                if response.status.state == "DONE":
                    completed_jobs.add(job)
                    if response.status.errorResult:
                        logging.error('Bigquery table load failed for %s',
                                      table_str)
                        for error in response.status.errors:
                            logging.error('%s %s %s', error.reason,
                                          error.location, error.message)

                        # raise exception
                        raise RuntimeError(
                            'Bigquery table load failed for table %s.  %s' %
                            (table_str, response.status.errorResult.message))
                    else:
                        logging.info('Bigquery table load complete for %s',
                                     table_str)
                        yield table_str  # not sure what anyone is going to do with these...
                else:
                    #  Not done yet...
                    logging.debug('Bigquery table load status %s - %s',
                                  table_str, response.status.state)

            waiting_jobs -= completed_jobs
            time.sleep(1.0)  # wait for a bit and then check again

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
Example #17
def remove(path):
    """
    Remove the given path, whether it is a directory, file, or link.
    """
    if parse_linked_bundle_url(path).uses_beam:
        from apache_beam.io.filesystems import FileSystems

        # Delete only paths that exist; deleting a missing path raises.
        if FileSystems.exists(path):
            FileSystems.delete([path])
        return
    check_isvalid(path, 'remove')
    set_write_permissions(path)  # Allow permissions
    if os.path.islink(path):
        os.unlink(path)
    elif os.path.isdir(path):
        try:
            shutil.rmtree(path)
        except shutil.Error:
            pass
    else:
        os.remove(path)
    if os.path.exists(path):
        print('Failed to remove %s' % path)
Example #18
 def _verify_data(self, pcol, init_size, data_size):
   read = pcol | 'read' >> ReadAllFromParquet()
   v1 = (
       read
       | 'get_number' >> Map(lambda x: x['number'])
       | 'sum_globally' >> CombineGlobally(sum)
       | 'validate_number' >>
       FlatMap(lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
   v2 = (
       read
       | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
       | 'count_per_key' >> Count.PerKey()
       | 'validate_name' >> FlatMap(
           lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
   _ = ((v1, v2, pcol)
        | 'flatten' >> Flatten()
        | 'reshuffle' >> Reshuffle()
        | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
Example #19
 def _verify_data(self, pcol, init_size, data_size):
   read = pcol | 'read' >> ReadAllFromParquet()
   v1 = (read
         | 'get_number' >> Map(lambda x: x['number'])
         | 'sum_globally' >> CombineGlobally(sum)
         | 'validate_number' >> FlatMap(
             lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)
         )
        )
   v2 = (read
         | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
         | 'count_per_key' >> Count.PerKey()
         | 'validate_name' >> FlatMap(
             lambda x: TestParquetIT._count_verifier(init_size, data_size, x)
         )
        )
   _ = ((v1, v2, pcol)
        | 'flatten' >> Flatten()
        | 'reshuffle' >> Reshuffle()
        | 'cleanup' >> Map(lambda x: FileSystems.delete([x]))
       )
Example #20
    def finalize_write(self, init_result, writer_results):
        file_path_prefix = self.file_path_prefix.get()
        file_name_suffix = self.file_name_suffix.get()
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)
        min_threads = min(num_shards, FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        source_files = []
        destination_files = []
        chunk_size = FileSystems.get_chunk_size(file_path_prefix)
        for shard_num, shard in enumerate(writer_results):
            final_name = ''.join([
                file_path_prefix, self.shard_name_format %
                dict(shard_num=shard_num, num_shards=num_shards),
                file_name_suffix
            ])
            source_files.append(shard)
            destination_files.append(final_name)

        source_file_batch = [
            source_files[i:i + chunk_size]
            for i in range(0, len(source_files), chunk_size)
        ]
        destination_file_batch = [
            destination_files[i:i + chunk_size]
            for i in range(0, len(destination_files), chunk_size)
        ]

        logging.info(
            'Starting finalize_write threads with num_shards: %d, '
            'batches: %d, num_threads: %d', num_shards, len(source_file_batch),
            num_threads)
        start_time = time.time()

        # Use a thread pool for renaming operations.
        def _rename_batch(batch):
            """_rename_batch executes batch rename operations."""
            source_files, destination_files = batch
            exceptions = []
            try:
                FileSystems.rename(source_files, destination_files)
                return exceptions
            except BeamIOError as exp:
                if exp.exception_details is None:
                    raise
                for (src, dest), exception in exp.exception_details.items():
                    if exception:
                        logging.warning('Rename not successful: %s -> %s, %s',
                                        src, dest, exception)
                        should_report = True
                        if isinstance(exception, IOError):
                            # May have already been copied.
                            try:
                                if FileSystems.exists(dest):
                                    should_report = False
                            except Exception as exists_e:  # pylint: disable=broad-except
                                logging.warning(
                                    'Exception when checking if file %s exists: '
                                    '%s', dest, exists_e)
                        if should_report:
                            logging.warning(
                                ('Exception in _rename_batch. src: %s, '
                                 'dest: %s, err: %s'), src, dest, exception)
                            exceptions.append(exception)
                    else:
                        logging.debug('Rename successful: %s -> %s', src, dest)
                return exceptions

        exception_batches = util.run_using_threadpool(
            _rename_batch, zip(source_file_batch, destination_file_batch),
            num_threads)

        all_exceptions = [
            e for exception_batch in exception_batches for e in exception_batch
        ]
        if all_exceptions:
            raise Exception('Encountered exceptions in finalize_write: %s' %
                            all_exceptions)

        for final_name in destination_files:
            yield final_name

        logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                     time.time() - start_time)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
Example #21
 def test_delete_error(self):
   path1 = os.path.join(self.tmpdir, 'f1')
   with self.assertRaisesRegex(BeamIOError,
                               r'^Delete operation failed') as error:
     FileSystems.delete([path1])
   self.assertEqual(list(error.exception.exception_details.keys()), [path1])
Example #22
 def process(self, unused_element, unused_signal, gcs_locations):
     FileSystems.delete(list(gcs_locations))
Example #23
 def tearDown(self):
   FileSystems.delete([self.gcs_tempdir + '/'])
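FileSystems.delete removes directories recursively, and the trailing '/' makes
it explicit that gcs_tempdir names a directory prefix rather than a single
object. A tearDown variant, sketched under the assumption that the temp dir
may never have been created, that tolerates a missing path:

from apache_beam.io.filesystem import BeamIOError

def tearDown(self):
    try:
        FileSystems.delete([self.gcs_tempdir + '/'])
    except BeamIOError:
        pass  # the temp dir was never created, nothing to clean up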
Example #24
  def finalize_write(self, init_result, writer_results):
    file_path_prefix = self.file_path_prefix.get()
    file_name_suffix = self.file_name_suffix.get()
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)
    min_threads = min(num_shards, FileSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_files = []
    destination_files = []
    chunk_size = FileSystems.get_chunk_size(file_path_prefix)
    for shard_num, shard in enumerate(writer_results):
      final_name = ''.join([
          file_path_prefix, self.shard_name_format % dict(
              shard_num=shard_num, num_shards=num_shards), file_name_suffix
      ])
      source_files.append(shard)
      destination_files.append(final_name)

    source_file_batch = [source_files[i:i + chunk_size]
                         for i in range(0, len(source_files),
                                        chunk_size)]
    destination_file_batch = [destination_files[i:i + chunk_size]
                              for i in range(0, len(destination_files),
                                             chunk_size)]

    logging.info(
        'Starting finalize_write threads with num_shards: %d, '
        'batches: %d, num_threads: %d',
        num_shards, len(source_file_batch), num_threads)
    start_time = time.time()

    # Use a thread pool for renaming operations.
    def _rename_batch(batch):
      """_rename_batch executes batch rename operations."""
      source_files, destination_files = batch
      exceptions = []
      try:
        FileSystems.rename(source_files, destination_files)
        return exceptions
      except BeamIOError as exp:
        if exp.exception_details is None:
          raise
        for (src, dest), exception in exp.exception_details.items():
          if exception:
            logging.warning('Rename not successful: %s -> %s, %s', src, dest,
                            exception)
            should_report = True
            if isinstance(exception, IOError):
              # May have already been copied.
              try:
                if FileSystems.exists(dest):
                  should_report = False
              except Exception as exists_e:  # pylint: disable=broad-except
                logging.warning('Exception when checking if file %s exists: '
                                '%s', dest, exists_e)
            if should_report:
              logging.warning(('Exception in _rename_batch. src: %s, '
                               'dest: %s, err: %s'), src, dest, exception)
              exceptions.append(exception)
          else:
            logging.debug('Rename successful: %s -> %s', src, dest)
        return exceptions

    exception_batches = util.run_using_threadpool(
        _rename_batch, zip(source_file_batch, destination_file_batch),
        num_threads)

    all_exceptions = [e for exception_batch in exception_batches
                      for e in exception_batch]
    if all_exceptions:
      raise Exception('Encountered exceptions in finalize_write: %s' %
                      all_exceptions)

    for final_name in destination_files:
      yield final_name

    logging.info('Renamed %d shards in %.2f seconds.', num_shards,
                 time.time() - start_time)

    try:
      FileSystems.delete([init_result])
    except IOError:
      # May have already been removed.
      pass
Example #26
    def finalize_write(self, init_result, writer_results,
                       unused_pre_finalize_results):
        writer_results = sorted(writer_results)
        num_shards = len(writer_results)

        src_files, dst_files, delete_files, num_skipped = (
            self._check_state_for_finalize_write(writer_results, num_shards))
        num_skipped += len(delete_files)
        FileSystems.delete(delete_files)
        num_shards_to_finalize = len(src_files)
        min_threads = min(num_shards_to_finalize,
                          FileBasedSink._MAX_RENAME_THREADS)
        num_threads = max(1, min_threads)

        chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
        source_file_batch = [
            src_files[i:i + chunk_size]
            for i in range(0, len(src_files), chunk_size)
        ]
        destination_file_batch = [
            dst_files[i:i + chunk_size]
            for i in range(0, len(dst_files), chunk_size)
        ]

        if num_shards_to_finalize:
            logging.info(
                'Starting finalize_write threads with num_shards: %d (skipped: %d), '
                'batches: %d, num_threads: %d', num_shards_to_finalize,
                num_skipped, len(source_file_batch), num_threads)
            start_time = time.time()

            # Use a thread pool for renaming operations.
            def _rename_batch(batch):
                """_rename_batch executes batch rename operations."""
                source_files, destination_files = batch
                exceptions = []
                try:
                    FileSystems.rename(source_files, destination_files)
                    return exceptions
                except BeamIOError as exp:
                    if exp.exception_details is None:
                        raise
                    for (src,
                         dst), exception in iteritems(exp.exception_details):
                        if exception:
                            logging.error(
                                ('Exception in _rename_batch. src: %s, '
                                 'dst: %s, err: %s'), src, dst, exception)
                            exceptions.append(exception)
                        else:
                            logging.debug('Rename successful: %s -> %s', src,
                                          dst)
                    return exceptions

            exception_batches = util.run_using_threadpool(
                _rename_batch,
                list(zip(source_file_batch, destination_file_batch)),
                num_threads)

            all_exceptions = [
                e for exception_batch in exception_batches
                for e in exception_batch
            ]
            if all_exceptions:
                raise Exception(
                    'Encountered exceptions in finalize_write: %s' %
                    all_exceptions)

            for final_name in dst_files:
                yield final_name

            logging.info('Renamed %d shards in %.2f seconds.',
                         num_shards_to_finalize,
                         time.time() - start_time)
        else:
            logging.warning(
                'No shards found to finalize. num_shards: %d, skipped: %d',
                num_shards, num_skipped)

        try:
            FileSystems.delete([init_result])
        except IOError:
            # May have already been removed.
            pass
Example #27
  def finalize_write(self, init_result, writer_results,
                     unused_pre_finalize_results):
    writer_results = sorted(writer_results)
    num_shards = len(writer_results)

    src_files = []
    dst_files = []
    delete_files = []
    chunk_size = FileSystems.get_chunk_size(self.file_path_prefix.get())
    num_skipped = 0
    for shard_num, shard in enumerate(writer_results):
      final_name = self._get_final_name(shard_num, num_shards)
      src = shard
      dst = final_name
      src_exists = FileSystems.exists(src)
      dst_exists = FileSystems.exists(dst)
      if not src_exists and not dst_exists:
        raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
            src, dst))
      if not src_exists and dst_exists:
        logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
        num_skipped += 1
        continue
      if (src_exists and dst_exists and
          FileSystems.checksum(src) == FileSystems.checksum(dst)):
        logging.debug('src: %s == dst: %s, deleting src', src, dst)
        delete_files.append(src)
        continue

      src_files.append(src)
      dst_files.append(dst)

    num_skipped += len(delete_files)  # keep renames skipped above in the count
    FileSystems.delete(delete_files)
    num_shards_to_finalize = len(src_files)
    min_threads = min(num_shards_to_finalize, FileBasedSink._MAX_RENAME_THREADS)
    num_threads = max(1, min_threads)

    source_file_batch = [src_files[i:i + chunk_size]
                         for i in range(0, len(src_files), chunk_size)]
    destination_file_batch = [dst_files[i:i + chunk_size]
                              for i in range(0, len(dst_files), chunk_size)]

    if num_shards_to_finalize:
      logging.info(
          'Starting finalize_write threads with num_shards: %d (skipped: %d), '
          'batches: %d, num_threads: %d',
          num_shards_to_finalize, num_skipped, len(source_file_batch),
          num_threads)
      start_time = time.time()

      # Use a thread pool for renaming operations.
      def _rename_batch(batch):
        """_rename_batch executes batch rename operations."""
        source_files, destination_files = batch
        exceptions = []
        try:
          FileSystems.rename(source_files, destination_files)
          return exceptions
        except BeamIOError as exp:
          if exp.exception_details is None:
            raise
          for (src, dst), exception in exp.exception_details.items():
            if exception:
              logging.error(('Exception in _rename_batch. src: %s, '
                             'dst: %s, err: %s'), src, dst, exception)
              exceptions.append(exception)
            else:
              logging.debug('Rename successful: %s -> %s', src, dst)
          return exceptions

      exception_batches = util.run_using_threadpool(
          _rename_batch, zip(source_file_batch, destination_file_batch),
          num_threads)

      all_exceptions = [e for exception_batch in exception_batches
                        for e in exception_batch]
      if all_exceptions:
        raise Exception(
            'Encountered exceptions in finalize_write: %s' % all_exceptions)

      for final_name in dst_files:
        yield final_name

      logging.info('Renamed %d shards in %.2f seconds.', num_shards_to_finalize,
                   time.time() - start_time)
    else:
      logging.warning(
          'No shards found to finalize. num_shards: %d, skipped: %d',
          num_shards, num_skipped)

    try:
      FileSystems.delete([init_result])
    except IOError:
      # May have already been removed.
      pass
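FileSystems.checksum returns a filesystem-specific checksum string, which is
how the snippet above recognizes a shard whose rename already completed. The
same already-copied check in isolation, with illustrative paths:

from apache_beam.io.filesystems import FileSystems

src, dst = '/tmp/shard-00000', '/tmp/final-00000'  # illustrative
if (FileSystems.exists(src) and FileSystems.exists(dst)
        and FileSystems.checksum(src) == FileSystems.checksum(dst)):
    FileSystems.delete([src])  # dst already holds identical content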
Example #28
 def tearDown(self):
     FileSystems.delete([self.outdir + '/'])
Example #29
def delete_file(path, d_pl_options, recursive=False, r_level=0, debug=False):
    fs = FileSystems.get_filesystem(path)
    if type(fs) == GCSFileSystem:
        gcs_client = get_gcs_client()
        if debug:
            print(
                f"{'-'*(r_level)} delete_file (debug): path: {path}, recursive: {recursive}"
            )
        if recursive:
            child_paths = list_dir(path, d_pl_options, exclude_subdir=False)
            for child_path in child_paths:
                if child_path != path:
                    if debug:
                        print(
                            f"{'-'*(r_level+1)} delete_file (debug): path {path} has child: {child_path}"
                        )
                    delete_file(
                        child_path,
                        d_pl_options,
                        recursive=True,
                        r_level=r_level + 1
                    )  # don't need to recurse (return, since gcsio deletes all leaves from the root)

        # not stripped, not corrected case
        blob_path = get_gcs_bucket(d_pl_options).blob(path)
        path_not_stripped_not_gcs_corrected_exists = blob_path.exists(
            gcs_client)
        if debug:
            print(
                f"{'-'*(r_level)} {path} (not stripped, not gcs corrected): {blob_path}, exists: {path_not_stripped_not_gcs_corrected_exists}"
            )
        if path_not_stripped_not_gcs_corrected_exists:
            blob_path_delete_result = blob_path.delete(gcs_client)
            if debug:
                print(
                    f"{'-'*(r_level)} {path} (not stripped, not gcs corrected): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)"
                )
            return blob_path_delete_result
        else:
            # not stripped, gcs corrected case
            path_not_stripped_gcs_corrected = gcs_correct_dir_path_form(
                path, d_pl_options, strip_prefix=False)
            blob_path = get_gcs_bucket(d_pl_options).blob(
                path_not_stripped_gcs_corrected)
            path_not_stripped_gcs_corrected_exists = blob_path.exists(
                gcs_client)
            if debug:
                print(
                    f"{'-'*(r_level)} {path_not_stripped_gcs_corrected} (not stripped, gcs corrected): {blob_path}, exists: {path_not_stripped_gcs_corrected_exists}"
                )
            if path_not_stripped_gcs_corrected_exists:
                blob_path_delete_result = blob_path.delete(gcs_client)
                if debug:
                    print(
                        f"{'-'*(r_level)} {path_not_stripped_gcs_corrected} (not stripped, gcs corrected): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)"
                    )
                return blob_path_delete_result
            else:
                # stripped, not gcs corrected case
                path_stripped_not_gcs_corrected = gcs_path_strip_prefix(
                    path, d_pl_options)
                blob_path = get_gcs_bucket(d_pl_options).blob(
                    path_stripped_not_gcs_corrected)
                path_stripped_not_gcs_corrected_exists = blob_path.exists(
                    gcs_client)
                if debug:
                    print(
                        f"{'-'*(r_level)} {path_stripped_not_gcs_corrected} (stripped, not gcs corrected): {blob_path}, exists: {path_stripped_not_gcs_corrected_exists}"
                    )
                if path_stripped_not_gcs_corrected_exists:
                    blob_path_delete_result = blob_path.delete(gcs_client)
                    if debug:
                        print(
                            f"{'-'*(r_level)} {path_stripped_not_gcs_corrected} (stripped, not gcs corrected): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)"
                        )
                    return blob_path_delete_result
                else:
                    # stripped, gcs corrected case
                    path_stripped_gcs_corrected = gcs_correct_dir_path_form(
                        path, d_pl_options, strip_prefix=True)
                    blob_path = get_gcs_bucket(d_pl_options).blob(
                        path_stripped_gcs_corrected)
                    path_stripped_gcs_corrected_exists = blob_path.exists(
                        gcs_client)
                    if debug:
                        print(
                            f"{'-'*(r_level)} {path_stripped_gcs_corrected} (stripped, gcs corrected): {blob_path}, exists: {path_stripped_gcs_corrected_exists}"
                        )
                    if path_stripped_gcs_corrected_exists:
                        blob_path_delete_result = blob_path.delete(gcs_client)
                        if debug:
                            print(
                                f"{'-'*(r_level)} {path_stripped_gcs_corrected} (stripped, gcs corrected)): {blob_path}, exists: {blob_path.exists(gcs_client)} (after delete attempt)"
                            )
                        return blob_path_delete_result
                    else:
                        if debug:
                            print(
                                f"{'-'*(r_level)} out of options trying to delete base path {path}!"
                            )
                        return False

    else:
        return FileSystems.delete([path])
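FileSystems.get_filesystem dispatches on the path's scheme, which is what the
type check at the top of delete_file relies on. A small sketch with
illustrative paths:

from apache_beam.io.filesystems import FileSystems

gcs_fs = FileSystems.get_filesystem('gs://bucket/path')  # GCSFileSystem
local_fs = FileSystems.get_filesystem('/tmp/file')       # LocalFileSystem
print(type(gcs_fs).__name__, type(local_fs).__name__)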