Пример #1
0
    def __init__(self,
                 download_dir,
                 extract_dir=None,
                 manual_dir=None,
                 manual_dir_instructions=None,
                 dataset_name=None,
                 force_download=False,
                 force_extraction=False,
                 register_checksums=False):
        """Set up the directories and bookkeeping state for downloads.

        The download and extraction directories are created during
        construction through `tf.io.gfile.makedirs`.

        Args:
          download_dir: `str`, directory in which downloads are stored. A
            leading `~` is expanded.
          extract_dir: `str`, directory in which artifacts are extracted. When
            empty or None, `<download_dir>/extracted` is used. A leading `~`
            is expanded.
          manual_dir: `str`, directory holding manually downloaded/extracted
            data. A leading `~` is expanded when a value is given; a falsy
            value is stored as-is.
          manual_dir_instructions: `str`, human readable instructions on how
            to prepare the contents of `manual_dir` for this dataset.
          dataset_name: `str`, name of the dataset this instance is used for.
            If provided, downloads will contain which datasets they were used
            for.
          force_download: `bool`, defaults to False. If True, always
            [re]download.
          force_extraction: `bool`, defaults to False. If True, always
            [re]extract.
          register_checksums: `bool`, defaults to False. If True, download
            checksums aren't checked, but stored into file.
        """
        self._dataset_name = dataset_name

        # Normalise every user-supplied path before storing it.
        download_root = os.path.expanduser(download_dir)
        if not extract_dir:
            # Default location: an `extracted` folder inside the download dir.
            extract_dir = os.path.join(download_dir, 'extracted')
        extract_root = os.path.expanduser(extract_dir)
        manual_root = os.path.expanduser(manual_dir) if manual_dir else manual_dir

        self._download_dir = download_root
        self._extract_dir = extract_root
        self._manual_dir = manual_root
        self._manual_dir_instructions = manual_dir_instructions

        # Make sure both working directories exist (download first, then
        # extraction).
        for directory in (download_root, extract_root):
            tf.io.gfile.makedirs(directory)

        self._force_download = force_download
        self._force_extraction = force_extraction
        self._register_checksums = register_checksums

        # Every URL known to the checksums registry: {url: (size, checksum)}.
        self._sizes_checksums = checksums.get_all_sizes_checksums()
        # URLs actually used through this instance: {url: (size, checksum)}.
        self._recorded_sizes_checksums = {}

        # Downloader and extractor start out unset: they have to be cleared
        # when this object is pickled for Beam and are recreated on each
        # worker, so they are only built lazily.
        self.__downloader = None
        self.__extractor = None
Пример #2
0
    def __init__(self,
                 download_dir,
                 extract_dir=None,
                 manual_dir=None,
                 dataset_name=None,
                 force_download=False,
                 force_extraction=False,
                 register_checksums=False,
                 ignore_checksums=False):
        """Set up the directories, helpers and bookkeeping state for downloads.

        The download and extraction directories are created during
        construction through `tf.io.gfile.makedirs`; the extractor and the
        downloader are obtained right afterwards.

        Args:
          download_dir: `str`, directory in which downloads are stored. A
            leading `~` is expanded.
          extract_dir: `str`, directory in which artifacts are extracted. When
            empty or None, `<download_dir>/extracted` is used. A leading `~`
            is expanded.
          manual_dir: `str`, directory holding manually downloaded/extracted
            data. A leading `~` is expanded when a value is given; a falsy
            value is stored as-is.
          dataset_name: `str`, name of the dataset this instance is used for.
            If provided, downloads will contain which datasets they were used
            for.
          force_download: `bool`, defaults to False. If True, always
            [re]download.
          force_extraction: `bool`, defaults to False. If True, always
            [re]extract.
          register_checksums: `bool`, defaults to False. If True, download
            checksums aren't checked, but stored into file.
          ignore_checksums: `bool`, defaults to False. If True, download
            checksums aren't checked or registered. Documented as overriding
            `register_checksums`; note that this constructor stores both
            flags unchanged, so any override has to happen where the flags
            are consumed.
        """
        self._dataset_name = dataset_name

        # Normalise every user-supplied path before storing it.
        download_root = os.path.expanduser(download_dir)
        if not extract_dir:
            # Default location: an `extracted` folder inside the download dir.
            extract_dir = os.path.join(download_dir, 'extracted')
        extract_root = os.path.expanduser(extract_dir)
        manual_root = os.path.expanduser(manual_dir) if manual_dir else manual_dir

        self._download_dir = download_root
        self._extract_dir = extract_root
        self._manual_dir = manual_root

        # Make sure both working directories exist (download first, then
        # extraction).
        for directory in (download_root, extract_root):
            tf.io.gfile.makedirs(directory)

        self._force_download = force_download
        self._force_extraction = force_extraction

        # Helper objects that perform the actual extraction and download.
        self._extractor = extractor.get_extractor()
        self._downloader = downloader.get_downloader()

        self._register_checksums = register_checksums
        # Every URL known to the checksums registry: {url: (size, checksum)}.
        self._sizes_checksums = checksums.get_all_sizes_checksums()
        # URLs actually used through this instance: {url: (size, checksum)}.
        self._recorded_sizes_checksums = {}
        self._ignore_checksums = ignore_checksums