Exemplo n.º 1
0
    def download_dataset(self, hq_files=True):
        """
        Make sure the "carvana-image-masking-challenge" files are available
        locally and remember their locations on the instance.

        The archives are only downloaded when at least one of the expected
        extracted paths is missing. After the call, ``train_data``,
        ``test_data`` and ``train_masks_data`` hold folder paths, and
        ``train_files``, ``test_files`` and ``train_masks_files`` hold the
        sorted listings of those folders.

        Args:
            hq_files (bool): When True, the "_hq" variants of the train and
                test archives are used

        Returns:
            list: [train_data, test_data, metadata_csv, train_masks_csv, train_masks_data]

        """
        competition_name = "carvana-image-masking-challenge"

        # The input folder is resolved relative to this script: two levels up.
        here = os.path.dirname(os.path.abspath(__file__))
        destination_path = os.path.join(here, '../../input/')

        suffix = "_hq" if hq_files else ""
        train_name = "train" + suffix
        test_name = "test" + suffix

        archives = [
            train_name + ".zip",
            test_name + ".zip",
            "metadata.csv.zip",
            "train_masks.csv.zip",
            "train_masks.zip",
        ]
        extracted_names = (train_name, test_name, "metadata.csv",
                           "train_masks.csv", "train_masks")
        datasets_path = [destination_path + name for name in extracted_names]

        # If every expected path already exists we assume the archives were
        # extracted earlier. This is a bit hacky but sufficient for our needs.
        if all(os.path.exists(path) for path in datasets_path):
            print("All datasets are present.")
        else:
            # Put your Kaggle user name and password in a $KAGGLE_USER and $KAGGLE_PASSWD env vars respectively
            downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"),
                                              os.getenv("KAGGLE_PASSWD"),
                                              competition_name)

            for archive in archives:
                archive_path = downloader.download_dataset(
                    archive, destination_path)
                downloader.decompress(archive_path, destination_path)
                # The archive is no longer needed once extracted.
                os.remove(archive_path)

        train_dir, test_dir, _, _, masks_dir = datasets_path
        self.train_data = train_dir
        self.test_data = test_dir
        self.train_masks_data = masks_dir
        self.train_files = sorted(os.listdir(train_dir))
        self.test_files = sorted(os.listdir(test_dir))
        self.train_masks_files = sorted(os.listdir(masks_dir))
        return datasets_path
Exemplo n.º 2
0
    def download_dataset(url: str, output_folder: str, decompress: bool):
        """
            Download the archive at `url` into `output_folder`, optionally
            extracting it there.

            Nothing is downloaded (and nothing is extracted) when a file with
            the same name is already present in `output_folder`.

            Args:
                url (str): Http link to the archive
                output_folder (str): Path to save the downloaded files
                decompress (bool): To uncompress the downloaded archive
            Returns:
                tuple: (file_name, file_path)
        """
        file_name = os.path.split(url)[-1]
        output_file_arch = os.path.join(output_folder, file_name)

        if os.path.exists(output_file_arch):
            print("File already exists.")
            return file_name, output_file_arch

        # An empty folder means "current directory": os.makedirs("") would
        # raise. exist_ok avoids failing if the folder shows up between a
        # check and the creation.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

        print('Beginning file download...')
        with TqdmUpTo(unit='B',
                      unit_scale=True,
                      miniters=1,
                      desc=f"Downloading {file_name}") as t:
            downloaded_path, _ = urllib.request.urlretrieve(
                url, output_file_arch, reporthook=t.update_to)

        # Only announce the extraction when it is actually performed.
        if decompress:
            print("Unzipping file...")
            KaggleDataDownloader.decompress(downloaded_path, output_folder)

        return file_name, output_file_arch
    def download_dataset(self, hq_files=True):
        """
        Make sure the "RSNA_Bone_Age" files are available locally and remember
        their locations, file listings and ids on the instance.

        The archives are only downloaded when at least one of the expected
        extracted paths is missing. The ids stored in ``train_ids``,
        ``masks_ids`` and ``test_ids`` are the de-duplicated parts of each
        file name that precede the first underscore.

        Args:
            hq_files (bool): Accepted but never read by this implementation

        Returns:
            list: the five expected dataset paths, in the order
                [training folder, test folder, test csv, training csv, masks folder]

        """
        competition_name = "RSNA_Bone_Age"

        # The input folder is resolved relative to this script: two levels up.
        here = os.path.dirname(os.path.abspath(__file__))
        destination_path = os.path.join(here, '../../input/')

        archives = [
            "train.zip",
            "test.zip",
            "metadata.csv.zip",
            "train_masks.csv.zip",
            "train_masks.zip",
        ]
        extracted_names = (
            "boneage-training-dataset",
            "boneage-test-dataset",
            "boneage-test-dataset.csv",
            "boneage-training-dataset.csv",
            "masks",
        )
        datasets_path = [destination_path + name for name in extracted_names]

        # If every expected path already exists we assume the archives were
        # extracted earlier. This is a bit hacky but sufficient for our needs.
        if all(os.path.exists(path) for path in datasets_path):
            print("All datasets are present.")
        else:
            # Put your Kaggle user name and password in a $KAGGLE_USER and $KAGGLE_PASSWD env vars respectively
            downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"),
                                              os.getenv("KAGGLE_PASSWD"),
                                              competition_name)

            for archive in archives:
                archive_path = downloader.download_dataset(
                    archive, destination_path)
                downloader.decompress(archive_path, destination_path)
                # The archive is no longer needed once extracted.
                os.remove(archive_path)

        train_dir, test_dir, _, _, masks_dir = datasets_path
        self.train_data = train_dir
        self.test_data = test_dir
        self.train_masks_data = masks_dir
        self.train_files = sorted(os.listdir(train_dir))
        self.test_files = sorted(os.listdir(test_dir))
        self.train_masks_files = sorted(os.listdir(masks_dir))

        # An id is whatever precedes the first "_" in a file name.
        self.train_ids = list({name.split("_")[0] for name in self.train_files})
        self.masks_ids = list(
            {name.split("_")[0] for name in self.train_masks_files})
        self.test_ids = list({name.split("_")[0] for name in self.test_files})
        return datasets_path
Exemplo n.º 4
0
    def test_download_data(self):
        """
        Download one dataset archive and the labels archive of the
        "planet-understanding-the-amazon-from-space" competition into
        ``input/`` and decompress both.

        Credentials are read from the $KAGGLE_USER and $KAGGLE_PASSWD
        environment variables.
        """
        destination_path = "input/"

        downloader = KaggleDataDownloader(
            os.getenv("KAGGLE_USER"),
            os.getenv("KAGGLE_PASSWD"),
            "planet-understanding-the-amazon-from-space",
        )

        # The dataset archive is decompressed twice: once for the downloaded
        # .7z file, then for the .tar expected next to it.
        archive_path = downloader.download_dataset(
            "test-jpg-additional.tar.7z", destination_path)
        downloader.decompress(archive_path, destination_path)
        downloader.decompress(destination_path + "test-jpg-additional.tar",
                              destination_path)

        labels_archive_path = downloader.download_dataset(
            "train_v2.csv.zip", destination_path)
        downloader.decompress(labels_archive_path, destination_path)
Exemplo n.º 5
0
    def download_dataset(competition_name: str, competition_files: list,
                         competition_files_ext: list, output_folder: str):
        """
            Downloads the dataset and return the input paths.
            Do not download again if the data is already present.
            You need to define $KAGGLE_USER and $KAGGLE_PASSWD in your environment
            and you must accept the competition rules beforehand.

            This downloader uses https://github.com/EKami/kaggle-data-downloader
            and assumes everything is properly installed.

            Paths are built with os.path.join, so `output_folder` may be given
            with or without a trailing path separator.
        Args:
            competition_name (str): The name of the competition
            competition_files (list): List of files for the competition (in their uncompressed format)
            competition_files_ext (list): List of extensions for the competition files in the same order
            as competition_files. Ex: 'zip', '7z', 'xz'
            output_folder (str): Path to save the downloaded files

        Returns:
            tuple: (file_names, files_path)

        Raises:
            AssertionError: If the two lists do not have the same length
        """
        assert len(competition_files) == len(competition_files_ext), \
            "Length of competition_files and competition_files_ext do not match"
        datasets_path = [
            os.path.join(output_folder, name) for name in competition_files
        ]

        # Every expected file is already on disk: nothing to download.
        if all(os.path.exists(path) for path in datasets_path):
            print("All datasets are present.")
            return competition_files, datasets_path

        # Put your Kaggle user name and password in a $KAGGLE_USER and $KAGGLE_PASSWD env vars respectively
        downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"),
                                          os.getenv("KAGGLE_PASSWD"),
                                          competition_name)

        archive_names = [
            name + "." + ext
            for name, ext in zip(competition_files, competition_files_ext)
        ]
        for archive_name in archive_names:
            downloader.download_dataset(archive_name, output_folder)

        # Unzip the files, deleting each archive once it has been extracted
        for archive_name in archive_names:
            archive_path = os.path.join(output_folder, archive_name)
            downloader.decompress(archive_path, output_folder)
            os.remove(archive_path)

        return competition_files, datasets_path
def download(user_pwd,
             competition_name,
             data_file_name,
             directory=None,
             file_name=""):
    """
    Download a competition archive (unless it is already on disk) and
    decompress it.

    Args:
        user_pwd: Indexable pair; item 0 and item 1 are passed as the first
            two arguments of KaggleDataDownloader
        competition_name: Passed as the third argument of KaggleDataDownloader
        data_file_name: Name of the archive to download into `directory`
        directory: Folder that receives the archive; defaults to the current
            working directory
        file_name: Sub-folder of `directory` the archive is decompressed into;
            the default "" targets `directory` itself
    """
    base_dir = os.getcwd() if directory is None else directory

    # Make sure the extraction target exists, then report where it is.
    extract_dir = os.path.join(base_dir, file_name)
    if not os.path.exists(extract_dir):
        os.makedirs(extract_dir)
    print(os.path.abspath(extract_dir))

    # We can not download the data without user info
    downloader = KaggleDataDownloader(user_pwd[0], user_pwd[1],
                                      competition_name)

    archive_path = os.path.join(base_dir, data_file_name)
    if not os.path.exists(archive_path):
        downloader.download_dataset(data_file_name, base_dir)
    else:
        print("Data exists")

    # Decompression runs whether or not the archive was just downloaded.
    downloader.decompress(archive_path, extract_dir)
# Script fragment: downloads and unpacks the train/test archives when none of
# the expected dataset paths exist yet.
# NOTE(review): `train`, `train_u` and `competition_name` are used below but
# are not assigned in this excerpt — they must be defined earlier.
# Each pair is (downloaded .7z archive, .tar file expected after the first decompress).
test, test_u = "test-jpg.tar.7z", "test-jpg.tar"
test_additional, test_additional_u = "test-jpg-additional.tar.7z", "test-jpg-additional.tar"
test_labels = "train_v2.csv.zip"
destination_path = "../input/"
is_datasets_present = False

# If the folders already exist then the files may already be extracted.
# This is a bit hacky but it's sufficient for our needs.
# NOTE(review): a single existing path is enough to set the flag and skip the
# download below — confirm that "any" rather than "all" is intended.
datasets_path = data_helper.get_jpeg_data_files_paths()
for dir_path in datasets_path:
    if os.path.exists(dir_path):
        is_datasets_present = True

if not is_datasets_present:
    # Put your Kaggle user name and password in a $KAGGLE_USER and $KAGGLE_PASSWD env vars respectively
    downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"), os.getenv("KAGGLE_PASSWD"), competition_name)
    
    train_output_path = downloader.download_dataset(train, destination_path)
    downloader.decompress(train_output_path, destination_path) # Outputs a tar file
    downloader.decompress(destination_path + train_u, destination_path) # Extract the content of the previous tar file
    os.remove(train_output_path) # Removes the 7z file
    os.remove(destination_path + train_u) # Removes the tar file
    
    test_output_path = downloader.download_dataset(test, destination_path)
    downloader.decompress(test_output_path, destination_path) # Outputs a tar file
    downloader.decompress(destination_path + test_u, destination_path) # Extract the content of the previous tar file
    os.remove(test_output_path) # Removes the 7z file
    os.remove(destination_path + test_u) # Removes the tar file
    
    # NOTE(review): in this excerpt the additional test archive is only
    # decompressed once, and `test_additional_u` / `test_labels` are never used.
    test_add_output_path = downloader.download_dataset(test_additional, destination_path)
    downloader.decompress(test_add_output_path, destination_path) # Outputs a tar file
# Script fragment: downloads and unpacks the train/test archives when none of
# the expected dataset paths exist yet.
# NOTE(review): `train`, `train_u` and `competition_name` are used below but
# are not assigned in this excerpt — they must be defined earlier.
# Each pair is (downloaded .7z archive, .tar file expected after the first decompress).
test, test_u = "test-jpg.tar.7z", "test-jpg.tar"
test_additional, test_additional_u = "test-jpg-additional.tar.7z", "test-jpg-additional.tar"
test_labels = "train_v2.csv.zip"
destination_path = "../input/"
is_datasets_present = False

# If the folders already exist then the files may already be extracted.
# This is a bit hacky but it's sufficient for our needs.
# NOTE(review): a single existing path is enough to set the flag and skip the
# download below — confirm that "any" rather than "all" is intended.
datasets_path = data_helper.get_jpeg_data_files_paths()
for dir_path in datasets_path:
    if os.path.exists(dir_path):
        is_datasets_present = True

if not is_datasets_present:
    # Put your Kaggle user name and password in a $KAGGLE_USER and $KAGGLE_PASSWD env vars respectively
    downloader = KaggleDataDownloader(os.getenv("KAGGLE_USER"), os.getenv("KAGGLE_PASSWD"), competition_name)
    
    train_output_path = downloader.download_dataset(train, destination_path)
    downloader.decompress(train_output_path, destination_path) # Outputs a tar file
    downloader.decompress(destination_path + train_u, destination_path) # Extract the content of the previous tar file
    os.remove(train_output_path) # Removes the 7z file
    os.remove(destination_path + train_u) # Removes the tar file
    
    test_output_path = downloader.download_dataset(test, destination_path)
    downloader.decompress(test_output_path, destination_path) # Outputs a tar file
    downloader.decompress(destination_path + test_u, destination_path) # Extract the content of the previous tar file
    os.remove(test_output_path) # Removes the 7z file
    os.remove(destination_path + test_u) # Removes the tar file
    
    # NOTE(review): in this excerpt the additional test archive is only
    # decompressed once, and `test_additional_u` / `test_labels` are never used.
    test_add_output_path = downloader.download_dataset(test_additional, destination_path)
    downloader.decompress(test_add_output_path, destination_path) # Outputs a tar file