Example #1
def _fetch_higgs(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "higgs")
    data_path = join(data_dir, "HIGGS.csv.gz")

    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
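All of these helpers lean on the same module-level scaffolding that the snippets do not show: a logger, an ARCHIVE descriptor for the remote file, a small _mkdirp helper, and scikit-learn's private download utilities. The sketch below is one plausible version of that scaffolding, assuming a recent scikit-learn where RemoteFileMetadata and _fetch_remote live in sklearn.datasets._base; the HIGGS URL and checksum are illustrative placeholders rather than the project's actual metadata.

import logging
import os
from os.path import exists, join  # names used by the _fetch_* snippets

from sklearn.datasets import get_data_home
# Private helpers; they live in sklearn.datasets._base in recent releases.
from sklearn.datasets._base import RemoteFileMetadata, _fetch_remote

logger = logging.getLogger(__name__)

# Illustrative descriptor: the URL and checksum below are placeholders, not
# the project's actual HIGGS metadata. _fetch_remote is expected to download
# remote.url into dirname and verify its sha256 checksum.
ARCHIVE = RemoteFileMetadata(
    filename="HIGGS.csv.gz",
    url="https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz",
    checksum="<sha256 of the archive>",
)


def _mkdirp(d):
    """Create directory d if it does not already exist (like `mkdir -p`)."""
    os.makedirs(d, exist_ok=True)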
Example #2
def _fetch_adult(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "adult")
    data_path = join(data_dir, "adult.csv.gz")

    columns = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        ">50K?",
    ]

    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE_TRAIN.url)
        _fetch_remote(ARCHIVE_TRAIN, dirname=data_dir)
        logger.info("Downloading %s" % ARCHIVE_TEST.url)
        _fetch_remote(ARCHIVE_TEST, dirname=data_dir)

        logger.debug("Converting to a single dataframe with the correct schema")
        archive_path_train = join(data_dir, ARCHIVE_TRAIN.filename)
        # We use skiprows = [0] since there is a weird first line containing :
        #   |1x3 Cross validator
        df_train = pd.read_csv(archive_path_train,
                               header=None,
                               sep=",",
                               skiprows=[0])
        df_train.columns = columns
        archive_path_test = join(data_dir, ARCHIVE_TEST.filename)
        # We use skiprows = [0] since there is a weird first line containing :
        #   |1x3 Cross validator
        df_test = pd.read_csv(archive_path_test,
                              header=None,
                              sep=",",
                              skiprows=[0])
        df_test.columns = columns
        df_test = df_test.replace(" >50K.", " >50K")
        df_test = df_test.replace(" <=50K.", " <=50K")

        df = pd.concat([df_train, df_test], axis="index")
        df.to_csv(data_path, compression="gzip", index=False)
        # Remove temporary files
        os.remove(archive_path_train)
        os.remove(archive_path_test)
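After the merge, the written file can be loaded back with a single pandas call; gzip compression is inferred from the .csv.gz suffix. A minimal usage sketch, assuming the same data home layout as above:

from os.path import join

import pandas as pd
from sklearn.datasets import get_data_home

# Read the merged, gzip-compressed file written by _fetch_adult; pandas
# infers the gzip compression from the .csv.gz suffix.
adult = pd.read_csv(join(get_data_home(), "adult", "adult.csv.gz"))
print(adult.shape)
# After the replace() calls above, the test labels match the train labels.
print(adult[">50K?"].value_counts())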
Example #3
def _fetch_bank(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "bank")
    data_path = join(data_dir, "bank.zip")

    if download_if_missing and not exists(data_path):

        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
Example #4
def _fetch_epsilon(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "epsilon")
    data_path1 = join(data_dir, "epsilon_normalized.bz2")
    data_path2 = join(data_dir, "epsilon_normalized.t.bz2")

    if download_if_missing and not exists(data_path1):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE_TRAIN.url)
        _fetch_remote(ARCHIVE_TRAIN, dirname=data_dir)
        logger.info("Downloading %s" % ARCHIVE_TEST.url)
        _fetch_remote(ARCHIVE_TEST, dirname=data_dir)
Example #5
    def download_data(
            cls,
            images_filename: str = 'lfw-funneled.tgz',
            images_url: str = 'https://ndownloader.figshare.com/files/5976015',
            images_checksum: str = ('b47c8422c8cded889dc5a13418c4bc2a'
                                    'bbda121092b3533a83306f90d900100a'),
            data_home: str = None,
            data_subdir: str = "lfw_home",
            image_subdir: str = "lfw_funneled",
            target_filenames: list = [],
            target_urls: list = [],
            target_checksums: list = []):
        '''
        Download the LFW images archive (and any optional target files) into
        the scikit-learn data home, extract it, and return the destination.
        '''
        # this function is based on SKLearn's _check_fetch_lfw function.
        Benchmark.Start()
        archive = RemoteFileMetadata(images_filename,
                                     images_url,
                                     checksum=(images_checksum))

        if target_filenames != []:
            target_attributes = zip(target_filenames, target_urls,
                                    target_checksums)
            targets = ()
            for target in target_attributes:
                filename, url, checksum = target
                # Append as a one-element tuple; without the trailing comma the
                # namedtuple's fields would be concatenated instead of the object.
                targets = targets + (RemoteFileMetadata(
                    filename, url, checksum),)

        data_home = get_data_home(data_home=data_home)
        lfw_home = join(data_home, data_subdir)

        if not exists(lfw_home):
            makedirs(lfw_home)

        for target in TARGETS:
            target_filepath = join(lfw_home, target.filename)
            _fetch_remote(target, dirname=lfw_home)

        data_folder_path = join(lfw_home, image_subdir)
        archive_path = join(lfw_home, archive.filename)
        _fetch_remote(archive, dirname=lfw_home)

        import tarfile
        tarfile.open(archive_path, "r:gz").extractall(path=lfw_home)
        remove(archive_path)
        Benchmark.Stop()
        return f'Data downloaded to {lfw_home}'
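The loop over TARGETS refers to a module-level constant that the snippet does not define; in the scikit-learn code this method is based on, it is a tuple of RemoteFileMetadata entries describing the auxiliary LFW pairs files. A sketch of what such a constant could look like (the filenames are the usual LFW pairs files, but the URLs and checksums here are placeholders, not the real figshare metadata):

# Hypothetical module-level constant mirroring the TARGETS tuple used by
# scikit-learn's LFW loader: one RemoteFileMetadata entry per pairs file.
# URLs and checksums are placeholders, not the real figshare metadata.
TARGETS = (
    RemoteFileMetadata(filename="pairsDevTrain.txt",
                       url="https://example.org/lfw/pairsDevTrain.txt",
                       checksum="<sha256 of pairsDevTrain.txt>"),
    RemoteFileMetadata(filename="pairsDevTest.txt",
                       url="https://example.org/lfw/pairsDevTest.txt",
                       checksum="<sha256 of pairsDevTest.txt>"),
)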
Example #6
def _fetch_default_cb(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "default_cb")
    data_path = join(data_dir, "default_cb.csv.gz")

    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
        logger.debug("Converting to a single dataframe with the correct schema")
        filepath = join(data_dir, ARCHIVE.filename)

        df = pd.read_excel(filepath, skiprows=[0])

        df.to_csv(data_path, compression="gzip", index=False)
        # Remove temporary files
        os.remove(filepath)
Example #7
File: _car.py  Project: linlearn/linlearn
def _fetch_car(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "car")
    data_path = join(data_dir, "car.csv.gz")

    columns = ["Buying", "Maint", "Doors", "Persons", "LugBoot", "Safety", "Evaluation"]

    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
        logger.debug("Converting to a single dataframe with the correct schema")
        filepath = join(data_dir, ARCHIVE.filename)

        df = pd.read_csv(filepath, header=None, sep=",")
        df.columns = columns

        df.to_csv(data_path, compression="gzip", index=False)
        # Remove temporary files
        os.remove(filepath)
Example #8
def _fetch_kick(download_if_missing=True):
    data_home = get_data_home()
    data_dir = join(data_home, "kick")
    data_path = join(data_dir, "kick.csv.gz")

    if download_if_missing and not exists(data_path):
        _mkdirp(data_dir)
        logger.info("Downloading %s" % ARCHIVE.url)
        _fetch_remote(ARCHIVE, dirname=data_dir)
        logger.debug("Converting to a single dataframe with the correct schema")
        filepath = join(data_dir, ARCHIVE.filename)

        numerical_columns = [a for a, b in dtype.items() if b != "category"]

        df = pd.read_csv(filepath)
        for nc in numerical_columns:
            df[nc] = df[nc].replace("?", np.nan)

        df.to_csv(data_path, compression="gzip", index=False)
        # Remove temporary files
        os.remove(filepath)
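The list comprehension relies on a module-level dtype mapping (column name to pandas dtype) that is not shown: columns whose dtype is not "category" are treated as numerical, and only those get their "?" markers replaced by NaN. A sketch of the assumed shape of that mapping, with column names chosen for illustration rather than copied from the real kick schema:

import numpy as np

# Hypothetical dtype mapping: "category" marks categorical columns, everything
# else is considered numerical by _fetch_kick. Column names are illustrative.
dtype = {
    "IsBadBuy": "category",
    "Make": "category",
    "VehYear": np.int64,
    "VehOdo": np.float64,
    "MMRAcquisitionAuctionAveragePrice": np.float64,
}

numerical_columns = [name for name, kind in dtype.items() if kind != "category"]
print(numerical_columns)
# -> ['VehYear', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice']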