Exemplo n.º 1
0
Arquivo: s3io.py Projeto: xubii/beam
  def list_prefix(self, path):
    """Lists files matching the prefix.

    Pages through the client's listing by feeding each response's
    ``next_token`` back into the request as ``continuation_token`` until no
    token is returned.

    Args:
      path: S3 file path pattern in the form s3://<bucket>/[name].

    Returns:
      Dictionary of file name -> size. The dictionary holds whatever was
      gathered before a 404 was reported, so it is empty when the very first
      listing call reports a 404.

    Raises:
      messages.S3ClientError: If listing fails with any code other than 404.
    """
    bucket, prefix = parse_s3_path(path, object_optional=True)
    request = messages.ListRequest(bucket=bucket, prefix=prefix)

    file_sizes = {}
    counter = 0
    start_time = time.time()

    logging.info("Starting the size estimation of the input")

    while True:
      # Listing a nonexistent S3 path is reported by the client as a 404
      # error. For a prefix listing that simply means "nothing to list", so
      # stop paging and return what has been collected instead of failing.
      try:
        response = self.client.list(request)
      except messages.S3ClientError as e:
        if e.code == 404:
          break
        # Bare raise keeps the original traceback intact.
        raise

      for item in response.items:
        file_name = 's3://%s/%s' % (bucket, item.key)
        file_sizes[file_name] = item.size
        counter += 1
        # Periodic progress message so long listings are visible in the logs.
        if counter % 10000 == 0:
          logging.info("Finished computing size of: %s files", len(file_sizes))
      if response.next_token:
        request.continuation_token = response.next_token
      else:
        break

    logging.info("Finished listing %s files in %s seconds.",
                 counter, time.time() - start_time)

    return file_sizes
Exemplo n.º 2
0
  def list_prefix(self, path):
    """Returns the size of every S3 object found under a prefix.

    Args:
      path: S3 location in the form s3://<bucket>/[name]; the name part is
        optional.

    Returns:
      Dict mapping each 's3://<bucket>/<key>' name to that item's size.
    """
    bucket, prefix = parse_s3_path(path, object_optional=True)
    request = messages.ListRequest(bucket=bucket, prefix=prefix)

    sizes_by_name = {}
    listed = 0
    started_at = time.time()

    logging.info("Starting the size estimation of the input")

    while True:
      # A nonexistent S3 path makes the list call raise a 404 client error.
      # That is not a problem for this method: stop paging and return what
      # has been gathered so far. Any other error code goes to the caller.
      try:
        page = self.client.list(request)
      except messages.S3ClientError as e:
        if e.code != 404:
          raise e
        break

      for entry in page.items:
        object_name = 's3://%s/%s' % (bucket, entry.key)
        sizes_by_name[object_name] = entry.size
        listed += 1
        if listed % 10000 == 0:
          logging.info(
              "Finished computing size of: %s files", len(sizes_by_name))

      # No continuation token means the last page has been consumed.
      if not page.next_token:
        break
      request.continuation_token = page.next_token

    elapsed = time.time() - started_at
    logging.info("Finished listing %s files in %s seconds.", listed, elapsed)

    return sizes_by_name
Exemplo n.º 3
0
    def list_prefix(self, path, with_metadata=False):
        """Collects the S3 objects found under a prefix.

        Args:
          path: S3 file path pattern in the form s3://<bucket>/[name].
          with_metadata: Experimental. When true, each entry also carries a
            timestamp derived from the item's ``last_modified`` value.

        Returns:
          Dict keyed by 's3://<bucket>/<key>'. Values are the item size when
          ``with_metadata`` is false, otherwise a ``(size, timestamp)`` tuple
          whose timestamp comes from ``self._updated_to_seconds``.
        """
        bucket, prefix = parse_s3_path(path, object_optional=True)
        request = messages.ListRequest(bucket=bucket, prefix=prefix)

        entries = {}
        listed = 0
        started_at = time.time()

        # The two modes differ only in log wording and in the stored value,
        # so the progress wording is picked once here.
        if with_metadata:
            logging.info("Starting the file information of the input")
            progress_message = (
                "Finished computing file information of: %s files")
        else:
            logging.info("Starting the size estimation of the input")
            progress_message = "Finished computing size of: %s files"

        while True:
            # A nonexistent S3 path makes the list call raise a 404 client
            # error. That is not a problem here: stop paging and return what
            # has been gathered. Any other error code goes to the caller.
            try:
                page = self.client.list(request)
            except messages.S3ClientError as e:
                if e.code != 404:
                    raise e
                break

            for entry in page.items:
                object_name = 's3://%s/%s' % (bucket, entry.key)
                size = entry.size
                entries[object_name] = (
                    (size, self._updated_to_seconds(entry.last_modified))
                    if with_metadata else size)
                listed += 1
                if listed % 10000 == 0:
                    logging.info(progress_message, len(entries))

            # No continuation token means the last page has been consumed.
            if not page.next_token:
                break
            request.continuation_token = page.next_token

        elapsed = time.time() - started_at
        logging.info("Finished listing %s files in %s seconds.", listed,
                     elapsed)

        return entries