Exemplo n.º 1
0
 def download_raw_dataset(self):
     """Fetch every raw dataset file and store it in the cache location.

     Runs inside the ``upload_output_directory`` context opened for ``self.raw_dataset_path``. Each URL in
     ``self.download_url`` is copied into the context's temporary directory under the last ``/``-separated
     segment of the URL, using the filesystem object that ``get_fs_and_path`` returns for that URL.
     """
     with upload_output_directory(self.raw_dataset_path) as (staging_dir, _):
         for source_url in self.download_url:
             basename = source_url.rsplit("/", 1)[-1]
             remote_fs, _ = get_fs_and_path(source_url)
             local_target = os.path.join(staging_dir, basename)
             remote_fs.get(source_url, local_target, recursive=True)
Exemplo n.º 2
0
 def download_raw_dataset(self):
     """Download each raw dataset file and store it in the cache location.

     Runs inside the ``upload_output_directory`` context opened for ``self.raw_dataset_path``. Every URL in
     ``self.download_urls`` is retrieved into the context's temporary directory under the last ``/``-separated
     segment of the URL, with a ``TqdmUpTo`` instance supplying the ``urlretrieve`` report hook.
     """
     with upload_output_directory(self.raw_dataset_path) as (staging_dir, _):
         for source_url in self.download_urls:
             basename = source_url.rsplit("/", 1)[-1]
             progress_options = dict(unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=basename)
             with TqdmUpTo(**progress_options) as progress:
                 destination = os.path.join(staging_dir, basename)
                 urllib.request.urlretrieve(source_url, destination, progress.update_to)
Exemplo n.º 3
0
    def download_raw_dataset(self):
        """
        Download the raw dataset and extract the contents of the tar file and
        store that in the cache location.

        Runs inside the ``upload_output_directory`` context opened for
        ``self.raw_dataset_path``. For every URL in ``self.download_urls``:

        1. the archive is saved into the temporary directory under the last
           ``/``-separated segment of the URL;
        2. the archive is extracted into the temporary directory;
        3. the regular files found directly inside the folder named after the
           archive (its file name up to the first ``.``) are copied up into
           the temporary directory itself.
        """

        with upload_output_directory(self.raw_dataset_path) as (tmpdir, _):
            for url in self.download_urls:
                filename = url.split('/')[-1]
                file_path = os.path.join(tmpdir, filename)
                with TqdmUpTo(unit='B',
                              unit_scale=True,
                              unit_divisor=1024,
                              miniters=1,
                              desc=filename) as t:
                    urllib.request.urlretrieve(url, file_path, t.update_to)

                with tarfile.open(file_path) as tar_file:
                    # The archive comes from the network: where the running
                    # Python supports extraction filters, use the 'data'
                    # filter so members cannot be written outside tmpdir.
                    if hasattr(tarfile, 'data_filter'):
                        tar_file.extractall(path=tmpdir, filter='data')
                    else:
                        tar_file.extractall(path=tmpdir)

                download_folder_name = filename.split('.')[0]
                extracted_dir = os.path.join(tmpdir, download_folder_name)
                # Use scandir as a context manager so its handle is released
                # even if a copy fails part-way through.
                with os.scandir(extracted_dir) as entries:
                    for entry in entries:
                        # shutil.copyfile only handles files; sub-directories
                        # are left in place inside the extracted folder.
                        if entry.is_file():
                            shutil.copyfile(entry.path,
                                            os.path.join(tmpdir, entry.name))
Exemplo n.º 4
0
 def download_raw_dataset(self):
     """
     Retrieve every raw dataset file and store it in the cache location.

     Runs inside the ``upload_output_directory`` context opened for
     ``self.raw_dataset_path``; each URL in ``self.download_url`` is saved into
     the context's temporary directory under the last ``/``-separated segment
     of the URL.
     """
     with upload_output_directory(self.raw_dataset_path) as (staging_dir, _):
         for source_url in self.download_url:
             basename = source_url.rsplit('/', 1)[-1]
             destination = os.path.join(staging_dir, basename)
             urllib.request.urlretrieve(source_url, destination)
Exemplo n.º 5
0
    def download_raw_dataset(self):
        """Download each zip archive and unpack it into the cache location.

        Runs inside the ``upload_output_directory`` context opened for ``self.raw_dataset_path``. The response body
        of every URL in ``self.download_urls`` is read fully into memory, opened as a zip archive and extracted into
        the context's temporary directory; the archive itself is never written to disk.
        """

        with upload_output_directory(self.raw_dataset_path) as (staging_dir, _):
            for archive_url in self.download_urls:
                with urlopen(archive_url) as response, ZipFile(BytesIO(response.read())) as archive:
                    archive.extractall(staging_dir)
Exemplo n.º 6
0
 def download_raw_dataset(self):
     """Download each gzip file and store its decompressed contents in the cache location.

     Runs inside the ``upload_output_directory`` context opened for ``self.raw_dataset_path``. Every URL in
     ``self.download_urls`` is retrieved into the context's temporary directory under the last ``/``-separated
     segment of the URL, then decompressed next to it under the same name minus its final ``.``-separated
     component. The compressed download is left in place.
     """
     with upload_output_directory(self.raw_dataset_path) as (staging_dir, _):
         for source_url in self.download_urls:
             archive_name = source_url.rsplit("/", 1)[-1]
             archive_path = os.path.join(staging_dir, archive_name)
             progress_options = dict(unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=archive_name)
             with TqdmUpTo(**progress_options) as progress:
                 urllib.request.urlretrieve(source_url, archive_path, progress.update_to)

             # Strip only the last dot-separated component (e.g. the ".gz" suffix).
             name_parts = archive_name.split(".")
             decompressed_path = os.path.join(staging_dir, ".".join(name_parts[:-1]))
             with gzip.open(archive_path) as compressed, open(decompressed_path, "wb") as output:
                 shutil.copyfileobj(compressed, output)
Exemplo n.º 7
0
    def download_raw_dataset(self):
        """Download the raw Kaggle archive and unzip it into the cache location.

        The Kaggle client is created and authenticated inside the ``self.update_env`` context, which is given
        ``self.kaggle_username`` and ``self.kaggle_key`` as ``KAGGLE_USERNAME`` and ``KAGGLE_KEY``. This lets the
        passed-in username and API key be used when no credentials are present in the kaggle.json file.

        Inside the ``upload_output_directory`` context for ``self.raw_dataset_path``, all files are then downloaded
        with the competition or dataset API call, chosen by ``self.is_kaggle_competition``. Finally
        ``self.archive_filename`` is extracted into the same temporary directory.
        """
        with self.update_env(KAGGLE_USERNAME=self.kaggle_username, KAGGLE_KEY=self.kaggle_key):
            # Authenticate explicitly so that any newly supplied credentials are picked up.
            kaggle_api = create_kaggle_client()
            kaggle_api.authenticate()

        with upload_output_directory(self.raw_dataset_path) as (staging_dir, _):
            # Competitions and plain datasets are fetched through different client methods.
            fetch_all_files = (
                kaggle_api.competition_download_files
                if self.is_kaggle_competition
                else kaggle_api.dataset_download_files
            )
            fetch_all_files(self.competition_name, path=staging_dir)

            archive_path = os.path.join(staging_dir, self.archive_filename)
            with ZipFile(archive_path, "r") as archive:
                archive.extractall(staging_dir)