Exemplo n.º 1
0
def download_kaggle_dataset(dataset_url, data_dir, force=False, dry_run=False):
    """Download a Kaggle dataset or competition into a sub-directory of ``data_dir``.

    The id returned by ``get_kaggle_dataset_id(dataset_url)`` is split on ``/``.
    Its second component names the target directory (``data_dir/<name>``); when
    its first component is ``competitions`` or ``c`` the competition download
    API is used and the resulting zip is extracted and then deleted, otherwise
    the dataset download API is used with ``unzip=True``.

    Args:
        dataset_url: Value handed to ``get_kaggle_dataset_id`` to resolve the id.
        data_dir: Parent directory under which the target directory is placed.
        force: When false, the download is skipped if the target directory
            already exists and is non-empty. Also forwarded to the Kaggle API.
        dry_run: When true, nothing is downloaded. Credentials are still
            checked (and prompted for if missing) before the dry-run message.

    Returns:
        None.
    """
    dataset_id = get_kaggle_dataset_id(dataset_url)
    # Split once; index 1 raises IndexError when the id has no '/', exactly as
    # the original did.
    id_parts = dataset_id.split('/')
    prefix, name = id_parts[0], id_parts[1]
    target_dir = os.path.join(data_dir, name)

    already_downloaded = os.path.exists(target_dir) and len(
        os.listdir(target_dir)) > 0
    if not force and already_downloaded:
        print(
            'Skipping, found downloaded files in "{}" (use force=True to force download)'
            .format(target_dir))
        return

    if not read_kaggle_creds():
        print(
            "Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds"
        )
        os.environ['KAGGLE_USERNAME'] = click.prompt("Your Kaggle username")
        os.environ['KAGGLE_KEY'] = _get_kaggle_key()

    if dry_run:
        print("This is a dry run, skipping..")
        return

    # Imported lazily so the kaggle package is only loaded when a download
    # actually happens, i.e. after the credential handling above.
    from kaggle import api
    api.authenticate()

    if prefix in ('competitions', 'c'):
        api.competition_download_files(name,
                                       target_dir,
                                       force=force,
                                       quiet=False)
        zip_fname = os.path.join(target_dir, name + '.zip')
        extract_archive(zip_fname, target_dir)
        try:
            os.remove(zip_fname)
        except OSError as e:
            # Best effort: a leftover archive is not fatal.
            print('Could not delete zip file, got {}'.format(e))
    else:
        api.dataset_download_files(dataset_id,
                                   target_dir,
                                   force=force,
                                   quiet=False,
                                   unzip=True)
Exemplo n.º 2
0
    def download_raw_dataset(self):
        """
        Download the raw competition archive, extract it, and move it into place.

        Steps, in order: authenticate a Kaggle client, create
        ``self.raw_temp_path``, download the files of ``self.competition_name``
        into it, extract ``self.archive_filename`` inside that directory, and
        finally rename the directory to ``self.raw_dataset_path``.

        If the user has not specified creds in the kaggle.json file, the
        username and API key held on this object are supplied for
        authentication.
        """
        # The Kaggle client looks for the key under KAGGLE_KEY (paired with
        # KAGGLE_USERNAME); KAGGLE_API_KEY is not a name it reads.
        # NOTE(review): assumes update_env temporarily sets these as
        # environment variables -- confirm against its definition.
        with self.update_env(KAGGLE_USERNAME=self.kaggle_username,
                             KAGGLE_KEY=self.kaggle_api_key):
            # Call authenticate explicitly to pick up new credentials if necessary
            api = create_kaggle_client()
            api.authenticate()
        os.makedirs(self.raw_temp_path, exist_ok=True)

        # Download all files for a competition
        api.competition_download_files(self.competition_name,
                                       path=self.raw_temp_path)

        archive_path = os.path.join(self.raw_temp_path, self.archive_filename)
        with ZipFile(archive_path, 'r') as archive:
            archive.extractall(self.raw_temp_path)
        # The archive itself stays in the directory and is moved along with
        # the extracted files.
        os.rename(self.raw_temp_path, self.raw_dataset_path)
Exemplo n.º 3
0
 def work(self):
     """Download the files of ``self.competition`` to ``self.output`` via the Kaggle API."""
     competition, destination = self.competition, self.output
     api.competition_download_files(competition, destination)
Exemplo n.º 4
0
    "tourism2", "chess", "informs2010", "tourism1", "hivprogression",
    "worldcup2010", "worldcupconf", "Eurovision2010"
]

# Report how many competitions are about to be processed.
print(len(small))

# Names of competitions whose download raised an error.
errors = []

for c in small:
    path = '../data/{}'.format(c)

    # An existing directory is treated as "already handled" and skipped.
    if os.path.isdir(path):
        print("Path {} already exists".format(path))
        continue

    print("Download files for", c)
    # Presumably a pause to avoid hammering the API -- confirm before removing.
    time.sleep(1)
    try:
        api.competition_download_files(c, path=path)
    except Exception as exc:
        # Catch Exception instead of a bare ``except`` so Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the script, and
        # report what went wrong instead of discarding it.
        errors.append(c)
        print("Error downloading files for", c, "-", exc)

print("Errors", errors)

s = json.dumps(errors)

# The context manager closes the file even if the write fails.
with open("./errors.json", "w") as text_file:
    text_file.write(s)