def _fetch_higgs(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "higgs")
    data_path = join(data_dir, "HIGGS.csv.gz")
    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
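# The fetchers in this module rely on shared, module-level scaffolding. A
# minimal sketch of that scaffolding is given below, assuming sklearn-style
# conventions; the checksum-free _fetch_remote and the _mkdirp helper are
# simplified stand-ins, not the module's actual implementations.
import logging
import os
from collections import namedtuple
from os.path import exists, join
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
from sklearn.datasets import get_data_home

logger = logging.getLogger(__name__)

# (filename, url, checksum) triple describing one downloadable file, as in
# sklearn.datasets; each fetcher defines its own ARCHIVE (or ARCHIVE_TRAIN /
# ARCHIVE_TEST) instance at module level.
RemoteFileMetadata = namedtuple("RemoteFileMetadata",
                                ["filename", "url", "checksum"])


def _mkdirp(d):
    # Create directory d (and its parents) if it does not already exist.
    os.makedirs(d, exist_ok=True)


def _fetch_remote(remote, dirname=None):
    # Download remote.url into dirname/remote.filename. Checksum
    # verification is omitted in this sketch.
    path = remote.filename if dirname is None else join(dirname, remote.filename)
    urlretrieve(remote.url, path)
    return path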
def _fetch_adult(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "adult")
    data_path = join(data_dir, "adult.csv.gz")
    columns = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        ">50K?",
    ]
    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE_TRAIN.url)
        _fetch_remote(ARCHIVE_TRAIN, dirname=data_dir)
        logger.info("Downloading %s" % ARCHIVE_TEST.url)
        _fetch_remote(ARCHIVE_TEST, dirname=data_dir)
        logger.debug("Converting to a single dataframe with the correct schema")
        archive_path_train = join(data_dir, ARCHIVE_TRAIN.filename)
        # The train file has no header line, so it is read as-is.
        df_train = pd.read_csv(archive_path_train, header=None, sep=",")
        df_train.columns = columns
        archive_path_test = join(data_dir, ARCHIVE_TEST.filename)
        # skiprows=[0] skips the stray first line of the test file, which
        # contains: |1x3 Cross validator
        df_test = pd.read_csv(archive_path_test, header=None, sep=",",
                              skiprows=[0])
        df_test.columns = columns
        # Test labels carry a trailing period (" >50K.") that train labels
        # lack; normalize them so both splits use the same label strings.
        df_test = df_test.replace(" >50K.", " >50K")
        df_test = df_test.replace(" <=50K.", " <=50K")
        df = pd.concat([df_train, df_test], axis="index")
        df.to_csv(data_path, compression="gzip", index=False)
        # Remove temporary files
        os.remove(archive_path_train)
        os.remove(archive_path_test)
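# A minimal usage sketch, assuming the scaffolding above: fetch the adult
# dataset once, then read the cached gzip-compressed CSV that _fetch_adult
# writes (pandas infers the compression from the .csv.gz extension).
_fetch_adult()
adult_path = join(get_data_home(), "adult", "adult.csv.gz")
df = pd.read_csv(adult_path)
print(df[">50K?"].value_counts())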
def _fetch_bank(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "bank")
    data_path = join(data_dir, "bank.zip")
    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
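# Unlike the other fetchers, _fetch_bank keeps the raw zip without converting
# it. A minimal sketch of reading the downloaded archive, assuming the UCI
# bank.zip layout; the member name ("bank-full.csv") and the semicolon
# separator are assumptions about the archive contents.
import zipfile

_fetch_bank()
bank_zip = join(get_data_home(), "bank", "bank.zip")
with zipfile.ZipFile(bank_zip) as zf:
    with zf.open("bank-full.csv") as f:
        df_bank = pd.read_csv(f, sep=";")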
def _fetch_epsilon(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "epsilon")
    data_path1 = join(data_dir, "epsilon_normalized.bz2")
    data_path2 = join(data_dir, "epsilon_normalized.t.bz2")
    # Re-download if either the train or the test file is missing.
    if download_if_missing and not (exists(data_path1) and exists(data_path2)):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE_TRAIN.url)
        _fetch_remote(ARCHIVE_TRAIN, dirname=data_dir)
        logger.info("Downloading %s" % ARCHIVE_TEST.url)
        _fetch_remote(ARCHIVE_TEST, dirname=data_dir)
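# A minimal loading sketch: the epsilon files are distributed by LIBSVM in
# svmlight format, and sklearn's load_svmlight_file reads bz2-compressed
# files directly, so no manual decompression is needed.
from sklearn.datasets import load_svmlight_file

_fetch_epsilon()
epsilon_dir = join(get_data_home(), "epsilon")
X_train, y_train = load_svmlight_file(join(epsilon_dir, "epsilon_normalized.bz2"))
X_test, y_test = load_svmlight_file(join(epsilon_dir, "epsilon_normalized.t.bz2"))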
def download_data(
        cls,
        images_filename: str = 'lfw-funneled.tgz',
        images_url: str = 'https://ndownloader.figshare.com/files/5976015',
        images_checksum: str = 'b47c8422c8cded889dc5a13418c4bc2a'
                               'bbda121092b3533a83306f90d900100a',
        data_home: str = None,
        data_subdir: str = "lfw_home",
        image_subdir: str = "lfw_funneled",
        target_filenames: list = [],
        target_urls: list = [],
        target_checksums: list = []):
    '''Download the LFW images archive (plus any target files), extract it
    under data_home/data_subdir and delete the archive afterwards.

    This function is based on sklearn's _check_fetch_lfw function.
    '''
    Benchmark.Start()
    archive = RemoteFileMetadata(images_filename,
                                 images_url,
                                 checksum=images_checksum)
    targets = ()
    if target_filenames:
        target_attributes = zip(target_filenames, target_urls,
                                target_checksums)
        for filename, url, checksum in target_attributes:
            # Append as a one-element tuple: without the trailing comma the
            # namedtuple's own fields would be concatenated into `targets`.
            targets += (RemoteFileMetadata(filename, url, checksum),)

    data_home = get_data_home(data_home=data_home)
    lfw_home = join(data_home, data_subdir)
    if not exists(lfw_home):
        makedirs(lfw_home)

    for target in targets:
        _fetch_remote(target, dirname=lfw_home)

    # Folder the images end up in once the archive is extracted.
    data_folder_path = join(lfw_home, image_subdir)
    archive_path = join(lfw_home, archive.filename)
    _fetch_remote(archive, dirname=lfw_home)

    import tarfile
    tarfile.open(archive_path, "r:gz").extractall(path=lfw_home)
    remove(archive_path)

    Benchmark.Stop()
    return f'Data downloaded to {lfw_home}'
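# A minimal usage sketch: `cls` is not used in the body above, so for a quick
# standalone test the function can be called with a placeholder first
# argument. With the defaults this downloads and extracts the (large)
# funneled LFW archive under get_data_home()/lfw_home.
message = download_data(None)
print(message)  # "Data downloaded to .../lfw_home"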
def _fetch_default_cb(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "default_cb")
    data_path = join(data_dir, "default_cb.csv.gz")
    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
        logger.debug("Converting to a single dataframe with the correct schema")
        filepath = join(data_dir, ARCHIVE.filename)
        # skiprows=[0] drops the extra title row above the real header.
        df = pd.read_excel(filepath, skiprows=[0])
        df.to_csv(data_path, compression="gzip", index=False)
        # Remove temporary files
        os.remove(filepath)
def _fetch_car(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "car")
    data_path = join(data_dir, "car.csv.gz")
    columns = ["Buying", "Maint", "Doors", "Persons", "LugBoot", "Safety",
               "Evaluation"]
    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
        logger.debug("Converting to a single dataframe with the correct schema")
        filepath = join(data_dir, ARCHIVE.filename)
        df = pd.read_csv(filepath, header=None, sep=",")
        df.columns = columns
        df.to_csv(data_path, compression="gzip", index=False)
        # Remove temporary files
        os.remove(filepath)
def _fetch_kick(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "kick")
    data_path = join(data_dir, "kick.csv.gz")
    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
        logger.debug("Converting to a single dataframe with the correct schema")
        filepath = join(data_dir, ARCHIVE.filename)
        # The raw file marks missing numerical values with "?"; replace them
        # with NaN in every non-categorical column of the module-level
        # `dtype` mapping.
        numerical_columns = [a for a, b in dtype.items() if b != "category"]
        df = pd.read_csv(filepath)
        for nc in numerical_columns:
            df[nc] = df[nc].replace("?", np.nan)
        df.to_csv(data_path, compression="gzip", index=False)
        # Remove temporary files
        os.remove(filepath)
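# _fetch_kick reads a module-level `dtype` mapping of pandas dtypes keyed by
# column name. A hypothetical fragment is sketched below; the column names
# follow the Kaggle "Don't Get Kicked!" schema and are assumptions, since the
# actual mapping is defined elsewhere in the module.
dtype = {
    "IsBadBuy": "category",    # target: was the purchased car a bad buy?
    "Auction": "category",
    "Make": "category",
    "Transmission": "category",
    "VehYear": np.float64,     # numerical columns may contain "?" -> NaN,
    "VehicleAge": np.float64,  # so float dtypes accommodate missing values
    "VehOdo": np.float64,
}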