def test_extract_archive_to_missing_output_directory(self): self.create_zip() self.assertTrue(os.path.isdir(self.OUTDIR)) shutil.rmtree(self.OUTDIR) self.assertFalse(os.path.isdir(self.OUTDIR)) extract_archive("/arch.zip", self.OUTDIR) self.assertTrue(os.path.isdir(self.OUTDIR))
def download_dataset(dataset_id, integrity_check): """ Downloads a dataset identified by it's dataset ID (Collection). The maybe already downloaded local copy is checked for integrity according to the specified integrity check. If the local version is up to date, then nothing is done. Otherwise, the dataset is downloaded. Returns a code (int): with the following semantics: * 1: dataset is available locally and the integrity check passed; * 2: the dataset has been downloaded (was not available locally). """ assert (isinstance(dataset_id, Collection)) if integrity_check(dataset_id): # Dataset is already downloaded. return 1 msg.info("Downloading {} ...".format(dataset_id.name)) config = load_datasets_config()[dataset_id.name] dataset_dir = os.path.join(datamine_cache_dir(), dataset_id.name) if not os.path.exists(dataset_dir): os.makedirs(dataset_dir, mode=0o755) # Download all the requirements. for requirement in config["requirements"]: url = requirement["URL"] expected_sha256 = requirement["SHA256"] # Attempt to guess the filename from the URL. In the future, # if it is required, we may have another field in the requirements. filename = url_to_filename(url) assert (filename is not None and len(filename) > 0) filepath = os.path.join(dataset_dir, filename) download_file_if_missing(url, filepath, expected_sha256=expected_sha256, desc="Downloading {}".format(filename)) assert (os.path.isfile(filepath)) # Unpack the file if it is archived or compressed. if is_archive(filepath): msg.info("Unpacking {} ...".format(filename)) extract_archive(filepath, outdir=dataset_dir) msg.info("{} has been downloaded.".format(dataset_id.name)) return 2
def test_extract_archive_for_zip(self): self.create_zip() extract_archive("/arch.zip", self.OUTDIR) self.assertEqual(self.num_extracted_files(), 21)
def test_extract_archive_for_tar_gzip(self): self.create_tar_gzip() extract_archive("/arch.tar.gz", self.OUTDIR) self.assertEqual(self.num_extracted_files(), 15)
def test_extract_archive_for_tar_bzip2(self): self.create_tar_bzip2() extract_archive("/arch.tar.bz2", self.OUTDIR) self.assertEqual(self.num_extracted_files(), 10)
def test_extract_invalid_archive(self): self.create_json() with self.assertRaises(AssertionError): extract_archive("/file.json", self.OUTDIR)
def test_extract_archive_when_file_is_missing(self): with self.assertRaises(AssertionError): extract_archive("/some/missing/file/2", "not important")