Example #1
def fetch_tgz(dataname, urlname, data_home=None):
    """Fetch zipped dataset.

    Fetch a tgz file from a given url, unzips and stores it in a given
    directory.

    Parameters
    ----------
    dataname: string
              Dataset name.
    urlname: string
             Dataset url.
    data_home: string, default=None
               Dataset directory.

    Returns
    -------
    data_home: string
               Directory.

    """
    # fetch file
    filename = fetch_file(dataname, urlname, data_home=data_home)
    data_home = get_data_home(data_home=data_home)
    data_home = join(data_home, dataname)
    # unzip file
    try:
        with tarfile.open(filename, 'r:gz') as tar_file:
            tar_file.extractall(data_home)
    except Exception:
        remove(filename)
        raise
    return data_home
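A minimal usage sketch for fetch_tgz above; the dataset name and URL are hypothetical placeholders.

extracted_dir = fetch_tgz('toy_dataset',
                          'https://example.com/datasets/toy_dataset.tgz')
print('Archive extracted to', extracted_dir)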
Example #2
def fetch_libsvm(collection, name, data_home=None):
    """Fetch LIBSVM dataset.

    Fetch a LIBSVM dataset by collection and name. More info at
    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets.

    Parameters
    ----------
    collection : string
        Collection name.
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By default
        all scikit-learn data is stored in ‘~/scikit_learn_data’ subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    if collection not in COLLECTIONS:
        raise Exception('Available collections are ' + str(list(COLLECTIONS)))
    dirname = os.path.join(get_data_home(data_home=data_home), 'libsvm',
                           collection, name.replace('/', '-'))
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    X, y, X_test, y_test, cv, X_remaining, y_remaining = _load(collection, name,
                                                               dirname=dirname)
    data = Bunch(data=X, target=y, data_test=X_test, target_test=y_test,
                 inner_cv=cv, outer_cv=None, data_remaining=X_remaining,
                 target_remaining=y_remaining, DESCR=name)
    return data
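A short usage sketch, assuming the module above is importable and that the collection/name pair ('binary', 'australian') is present in its COLLECTIONS configuration.

bunch = fetch_libsvm('binary', 'australian')
X, y = bunch.data, bunch.target
print(X.shape, y.shape)
if bunch.data_test is not None:
    print('separate test split:', bunch.data_test.shape)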
Example #3
def fetch_uci(name, data_home=None):
    """Fetch UCI dataset.

    Fetch a UCI dataset by name. More info at
    https://archive.ics.uci.edu/ml/datasets.html.

    Parameters
    ----------
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By default
        all scikit-learn data is stored in ‘~/scikit_learn_data’ subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    dirname = os.path.join(get_data_home(data_home=data_home), 'uci', name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    X, y, X_test, y_test, DESCR = _fetch(name, dirname=dirname)
    data = Bunch(data=X,
                 target=y,
                 data_test=X_test,
                 target_test=y_test,
                 inner_cv=None,
                 outer_cv=None,
                 DESCR=DESCR)
    return data
Example #4
def fetch_indoor_pos(data_home=None,
                     is_train=True,
                     return_X_y=False,
                     remove_dup=False):
    """Load the code rally 2019 dataset (classification).

    =================   ====================================
    Samples total       25811                             
    Dimensionality      12
    Features            continuous (float)
    =================   ====================================

    Parameters
    ----------
    data_home : string, optional
        Specify the folder for the datasets.

    is_train : bool, default=True
        Whether to load the training set (True) or the test set (False).

    return_X_y : bool, default=False
        If True, returns ``(data, target_x, target_y, orig_data)`` instead of
        a Bunch object.

    remove_dup : bool, default=False
        If True, remove duplicate rows that share the same features, keeping
        only one row per unique feature combination.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
         - 'data', the data to learn, with duplicates removed if
           ``remove_dup`` is True.
         - 'target_x', the regression target x for each sample.
         - 'target_y', the regression target y for each sample.
         - 'orig_data', the original data, including duplicates.

    (data, target_x, target_y, orig_data) : tuple if ``return_X_y`` is True

    """
    data_home = get_data_home(data_home=data_home)
    indoorpos = _fetch_brute_indoor_pos(data_home=data_home,
                                        is_train=is_train,
                                        remove_dup=remove_dup)

    data = indoorpos.data
    target_x = indoorpos.target_x
    target_y = indoorpos.target_y
    orig_data = indoorpos.orig_data

    if return_X_y:
        return data, target_x, target_y, orig_data

    return Bunch(data=data,
                 target_x=target_x,
                 target_y=target_y,
                 orig_data=orig_data)
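A usage sketch for the loader above, assuming train.csv is already present under the configured data home.

data, target_x, target_y, orig_data = fetch_indoor_pos(return_X_y=True,
                                                       remove_dup=True)
print('%d unique rows kept out of %d' % (len(data), len(orig_data)))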
Example #5
def fetch_mnist(data_home=None):
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    data_home = get_data_home(data_home=data_home)
    data_home = os.path.join(data_home, 'mldata')
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    mnist_save_path = os.path.join(data_home, "mnist-original.mat")
    if not os.path.exists(mnist_save_path):
        mnist_url = urllib.request.urlopen(mnist_alternative_url)
        with open(mnist_save_path, "wb") as matlab_file:
            copyfileobj(mnist_url, matlab_file)
Example #6
    def __init__(self,
                 data_home=None,
                 download_if_missing=True,
                 frame_scale_factor=None,
                 shuffle=False,
                 random_state=None):
        self.data_home = get_data_home(data_home=data_home)
        self.download_if_missing = download_if_missing

        self.sd15ch1_home = os.path.join(self.data_home, SD15CH1_DIRNAME)
        self.framesdir = os.path.join(self.sd15ch1_home, "frames")

        self._scale_factor = None
        if frame_scale_factor is not None:
            if not (0 < frame_scale_factor <= 20):
                self.__err("frame_scale_factor parameter but be > 0 and < 20.",
                           ValueError)
            self._scale_factor = float(frame_scale_factor)

        # Open or try download or raise an exception if dataset is unavailable
        _ensure_dataset_is_downloaded(self.sd15ch1_home,
                                      self.download_if_missing)

        # Read metadata file
        frames_metadata_path = os.path.join(self.framesdir, "metadata.csv.gz")
        self._info('Loading frames metadata from %s' %
                   (frames_metadata_path, ))
        self._rawdata = pd.read_csv(frames_metadata_path)

        if shuffle:
            self._rawdata = self._rawdata.sample(
                frac=1, axis=0,
                random_state=random_state).reset_index(drop=True)

        for _rid, rseries in self._rawdata.iterrows():
            dfi_dict = {}
            # Copy content
            for colname in self._rawdata:  # .keys()
                dfi_dict[colname] = rseries[colname]
            # Add extra entries
            dfi_dict["image_path_absolute"] = os.path.join(
                self.framesdir, rseries["image_path"])  # string (path)
            dfi_dict["_scale_factor"] = self._scale_factor  # float
            # dfi_dict["frame_uid"] = rid  # int  # does not survive shuffling
            # hint: use df.reindex(np.random.permutation(df.index)) and keep original uids
            # Store
            self.append(Frame(dfi_dict))

        self._unique_background_ids = None
        self._unique_background_names = None
        self._unique_model_ids = None
        self._unique_model_names = None
        self._unique_modeltype_ids = None
        self._unique_modeltype_names = None
Example #7
def fetch_keel(collection, name, data_home=None, nfolds=None, dobscv=False):
    """Fetch Keel dataset.

    Fetch a Keel dataset by collection and name. More info at
    http://sci2s.ugr.es/keel.

    Parameters
    ----------
    collection : string
        Collection name.
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By default
        all scikit-learn data is stored in ‘~/scikit_learn_data’ subfolders.
    nfolds : int, default=None
        Number of folds. Depending on the dataset, valid values are
        {None, 1, 5, 10}.
    dobscv : bool, default=False
        If folds are in {5, 10}, indicates that the cv folds are distribution
        optimally balanced stratified. Only available for some datasets.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    if collection not in COLLECTIONS:
        raise Exception('Available collections are ' + str(list(COLLECTIONS)))
    dirname = os.path.join(get_data_home(data_home=data_home), 'keel',
                           collection, name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    nattrs, DESCR = _load_descr(collection, name, dirname=dirname)
    X, y, cv = _load_folds(collection,
                           name,
                           nfolds,
                           dobscv,
                           nattrs,
                           dirname=dirname)
    data = Bunch(data=X,
                 target=y,
                 data_test=None,
                 target_test=None,
                 inner_cv=None,
                 outer_cv=cv,
                 DESCR=DESCR)
    return data
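A usage sketch, assuming the 'classification' collection and the 'iris' dataset name are available in the local COLLECTIONS configuration.

bunch = fetch_keel('classification', 'iris', nfolds=5)
print(bunch.data.shape, bunch.target.shape)
print('outer CV folds available:', bunch.outer_cv is not None)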
Example #8
def fetch_mnist(data_home=None):
    # where to store the data
    data_home = get_data_home(data_home=data_home)
    data_home = os.path.join(data_home, 'mldata')
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    mnist_save_path = os.path.join(data_home, "mnist-original.mat")

    # download if needed
    if not os.path.exists(mnist_save_path):
        print("Download MNIST to", mnist_save_path)
        mnist_url = urlopen(
            "http://home.htw-berlin.de/~hezel/files/data/mnist-original.mat")
        with open(mnist_save_path, "wb") as matlab_file:
            copyfileobj(mnist_url, matlab_file)
    return fetch_mldata('MNIST original')
Example #9
def fetch_file(dataname, urlname, subfolder=None, data_home=None):
    """Fetch dataset.

    Fetch a file from a given url and stores it in a given directory.

    Parameters
    ----------
    dataname: string
              Dataset name.
    urlname: string
             Dataset url.
    subfolder: string, default=None
               Subfolder of the data home in which to store the dataset
               directory.
    data_home: string, default=None
               Dataset directory.

    Returns
    -------
    filename: string
              Name of the file.

    """
    # check if this data set has been already downloaded
    data_home = pathlib.Path(get_data_home(data_home=data_home))

    if subfolder:
        data_home = data_home / subfolder

    data_home = data_home / dataname
    if not data_home.exists():
        data_home.mkdir(parents=True)
    filename = data_home / basename(normpath(urlname))
    # if the file does not exist, download it
    if not filename.exists():
        try:
            data_url = urlopen(urlname)
        except HTTPError as e:
            if e.code == 404:
                e.msg = "Dataset '%s' not found." % dataname
            raise
        # store file
        try:
            with open(filename, 'w+b') as data_file:
                copyfileobj(data_url, data_file)
        except Exception:
            filename.unlink()
            raise
        data_url.close()
    return filename
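A usage sketch with a placeholder URL; the returned value is a pathlib.Path pointing at the cached file.

cached_path = fetch_file('toy', 'https://example.com/data/toy.csv',
                         subfolder='demo')
print('cached at', cached_path)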
Example #10
def fetch_mnist(data_home=None):
    '''
    Download the original MNIST data set (mnist-original.mat) into the
    scikit-learn data home if it does not already exist locally.

    :param data_home: optional override of the scikit-learn data home directory
    :return: None; the file is stored under <data_home>/mldata/
    '''
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    data_home = get_data_home(data_home=data_home)
    data_home = os.path.join(data_home, 'mldata')
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    mnist_save_path = os.path.join(data_home, "mnist-original.mat")
    if not os.path.exists(mnist_save_path):
        mnist_url = urllib.request.urlopen(mnist_alternative_url)
        with open(mnist_save_path, "wb") as matlab_file:
            copyfileobj(mnist_url, matlab_file)
Example #11
def load_credit_data():
    sk_data_dir = get_data_home()
    archive = RemoteFileMetadata(
        filename='default of credit card clients.xls',
        url='https://archive.ics.uci.edu/ml/machine-learning-databases/'
        '00350/default%20of%20credit%20card%20clients.xls',
        checksum=('30c6be3abd8dcfd3e6096c828bad8c2f'
                  '011238620f5369220bd60cfc82700933'))

    if not exists(join(sk_data_dir, archive.filename)):
        _fetch_remote(archive, dirname=sk_data_dir)

    data = pd.read_excel(join(sk_data_dir, archive.filename),
                         sheet_name='Data',
                         header=1)

    dataset = Bunch(data=(data.drop('default payment next month', axis=1)),
                    target=np.array(data['default payment next month']))
    return dataset
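A sketch showing how the returned Bunch could be split for modelling; train_test_split is the standard scikit-learn helper and the split ratio is arbitrary.

from sklearn.model_selection import train_test_split

credit = load_credit_data()
X_train, X_test, y_train, y_test = train_test_split(
    credit.data, credit.target, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape)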
Example #12
def fetch_file(dataname, urlname, data_home=None):
    """Fetch dataset.

    Fetch a file from a given url and stores it in a given directory.

    Parameters
    ----------
    dataname: string
              Dataset name.
    urlname: string
             Dataset url.
    data_home: string, default=None
               Dataset directory.

    Returns
    -------
    filename: string
              Name of the file.

    """
    # check if this data set has been already downloaded
    data_home = get_data_home(data_home=data_home)
    data_home = join(data_home, dataname)
    if not exists(data_home):
        makedirs(data_home)
    filename = join(data_home, basename(normpath(urlname)))
    # if the file does not exist, download it
    if not exists(filename):
        try:
            data_url = urlopen(urlname)
        except HTTPError as e:
            if e.code == 404:
                e.msg = "Dataset '%s' not found." % dataname
            raise
        # store file
        try:
            with open(filename, 'w+b') as data_file:
                copyfileobj(data_url, data_file)
        except Exception:
            remove(filename)
            raise
        data_url.close()
    return filename
Example #13
def load_data(subset):
    '''
    Fetch 20newsgroup data and return it in tf-idf matrix form.

    Parameters
    ----------
    subset: str
        subset of data to fetch; valid values are 'train', 'test', and 'all'.

    Returns
    -------
    A: array, shape (number of documents, number of words)
    vocab_matrix: array, shape (number of words, number of words)
        This is the matrix of eigenwords of words in the dictionary.
    vocabulary: list
        List containing the vocabulary. Note that these are the actual word strings.
    '''
    # Data preprocessing
    # ================================================================
    # Fetch 20newsgroups data, removing its header, footer and quotes.
    docs = datasets.fetch_20newsgroups(
            subset=subset,
            remove=('headers', 'footers', 'quotes')
            )
    # ================================================================

    # Convert the raw documents data into tf-idf matrix
    # ================================================================
    data_home = get_data_home()
    vocabulary_path = os.path.join(data_home, 'vocabulary.txt')
    vocabulary = load_vocabulary(vocabulary_path)

    vectorizer = TfidfVectorizer(
            vocabulary=vocabulary,
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=True
            )
    docs_tfidf = vectorizer.fit_transform(docs.data)
    vocab_tfidf = vectorizer.transform(vocabulary)
    # ================================================================
    return docs_tfidf, vocab_tfidf, vocabulary
Example #14
def fetch_or_load_lymphography():
    global Lymphography
    data_home = get_data_home()
    if not exists(data_home):
        makedirs(data_home)
    file_path = join(data_home, 'Lymphography', 'Lymphography_withoutdupl_idf.arff')
    if not exists(file_path):
        data_archive_path = _fetch_remote(Lymphography)
        with tarfile.open(data_archive_path) as tf:
            tf.extractall(data_home)
        remove(data_archive_path)
    with open(file_path, 'r') as f_descriptor:
        dataset = arff.load(f_descriptor)
    df = pd.DataFrame(dataset['data'])
    feature = df.iloc[:,1:19].to_numpy()
    ground_truth = np.ones(148)
    for i in [43, 44, 45, 103, 132, 147]:
        ground_truth[i] = -1 
    return (feature, ground_truth)
Example #15
def check_fetch_lfw(data_home=None, funneled=True, download_if_missing=True):
    """Helper function to download any missing LFW data"""
    data_home = get_data_home(data_home=data_home)
    lfw_home = join(data_home, "lfw_home")

    if funneled:
        archive_path = join(lfw_home, FUNNELED_ARCHIVE_NAME)
        data_folder_path = join(lfw_home, "lfw_funneled")
        archive_url = BASE_URL + FUNNELED_ARCHIVE_NAME
    else:
        archive_path = join(lfw_home, ARCHIVE_NAME)
        data_folder_path = join(lfw_home, "lfw")
        archive_url = BASE_URL + ARCHIVE_NAME

    if not exists(lfw_home):
        makedirs(lfw_home)

    for target_filename in TARGET_FILENAMES:
        target_filepath = join(lfw_home, target_filename)
        if not exists(target_filepath):
            if download_if_missing:
                url = BASE_URL + target_filename
                logger.warn("Downloading LFW metadata: %s", url)
                urllib.urlretrieve(url, target_filepath)
            else:
                raise IOError("%s is missing" % target_filepath)

    if not exists(data_folder_path):

        if not exists(archive_path):
            if download_if_missing:
                logger.warn("Downloading LFW data (~200MB): %s", archive_url)
                urllib.urlretrieve(archive_url, archive_path)
            else:
                raise IOError("%s is missing" % target_filepath)

        import tarfile
        logger.info("Decompressing the data archive to %s", data_folder_path)
        tarfile.open(archive_path, "r:gz").extractall(path=lfw_home)
        remove(archive_path)

    return lfw_home, data_folder_path
Example #17
    def __init__(self,
                 data_home=None,
                 download_if_missing=True,
                 variant="05-corrected-nexus-scaled33"):

        self.data_home = get_data_home(data_home=data_home)
        self.download_if_missing = download_if_missing

        self.sd15ch1_home = os.path.join(self.data_home, SD15CH1_DIRNAME)
        self.modelsdir = os.path.join(self.sd15ch1_home, "models")

        # Open or try download or raise an exception if dataset is unavailable
        _ensure_dataset_is_downloaded(self.sd15ch1_home,
                                      self.download_if_missing)

        # Read metadata file
        models_metadata_path = os.path.join(self.modelsdir, "metadata.csv.gz")
        self._info('Loading models metadata from %s' %
                   (models_metadata_path, ))
        df = pd.read_csv(models_metadata_path)

        # Filter the variant we want to load
        if variant not in Models.VARIANTS:
            self._err("Unknown model variant: '%s'." % variant, ValueError)
        self._rawdata = df[df["model_cat"] == variant]

        for _rid, rseries in self._rawdata.iterrows():
            mdli_dict = {}
            # Copy content
            for colname in self._rawdata:  # .keys()
                mdli_dict[colname] = rseries[colname]
            # Add extra entries
            mdli_dict["image_path_absolute"] = os.path.join(
                self.modelsdir, rseries["image_path"])  # string (path)
            # Store
            self.append(Model(mdli_dict))

        self._unique_model_ids = None
        self._unique_model_names = None
        self._unique_modeltype_ids = None
        self._unique_modeltype_names = None
Example #18
def fetch_raetsch(name, data_home=None):
    """Fetch Gunnar Raetsch's dataset.

    Fetch a Gunnar Raetsch's benchmark dataset by name. Available datasets are
    'banana', 'breast_cancer', 'diabetis', 'flare_solar', 'german', 'heart',
    'image', 'ringnorm', 'splice', 'thyroid', 'titanic', 'twonorm' and
    'waveform'. More info at
    https://github.com/tdiethe/gunnar_raetsch_benchmark_datasets.

    Parameters
    ----------
    name : string
        Dataset name.
    data_home : string or None, default None
        Specify another download and cache folder for the data sets. By default
        all scikit-learn data is stored in ‘~/scikit_learn_data’ subfolders.

    Returns
    -------
    data : Bunch
        Dictionary-like object with all the data and metadata.

    """
    if name not in DATASETS:
        raise Exception('Available datasets are ' + str(list(DATASETS)))
    dirname = os.path.join(get_data_home(data_home=data_home), 'raetsch')
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    filename = _fetch_remote(ARCHIVE, dirname=dirname)
    X, y, train_splits, test_splits = loadmat(filename)[name][0][0]
    cv = ((X[tr - 1], y[tr - 1], X[ts - 1], y[ts - 1])
          for tr, ts in zip(train_splits, test_splits))
    return Bunch(data=X,
                 target=y,
                 data_test=None,
                 target_test=None,
                 inner_cv=None,
                 outer_cv=cv,
                 DESCR=name)
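A usage sketch; as constructed above, outer_cv yields one (X_train, y_train, X_test, y_test) tuple per predefined split.

bunch = fetch_raetsch('banana')
for X_train, y_train, X_test, y_test in bunch.outer_cv:
    print(X_train.shape, X_test.shape)
    break  # only inspect the first predefined split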
Example #19
def load_data(subset):
    '''
    Fetch 20newsgroup data and return it in tf-idf matrix form.

    Parameters
    ----------
    subset: str
        subset of data to fetch; valid values are 'train', 'test', and 'all'.

    Returns
    -------
    A: array, shape (number of documents, number of words)
    vocab_matrix: array, shape (number of words, number of words)
        This is the matrix of eigenwords of words in the dictionary.
    vocabulary: list
        List containing the vocabulary. Note that these are the actual word strings.
    '''
    # Data preprocessing
    # ================================================================
    # Fetch 20newsgroups data, removing its header, footer and quotes.
    docs = datasets.fetch_20newsgroups(subset=subset,
                                       remove=('headers', 'footers', 'quotes'))
    # ================================================================

    # Convert the raw documents data into tf-idf matrix
    # ================================================================
    data_home = get_data_home()
    vocabulary_path = os.path.join(data_home, 'vocabulary.txt')
    vocabulary = load_vocabulary(vocabulary_path)

    vectorizer = TfidfVectorizer(vocabulary=vocabulary,
                                 norm='l2',
                                 use_idf=True,
                                 smooth_idf=True,
                                 sublinear_tf=True)
    docs_tfidf = vectorizer.fit_transform(docs.data)
    vocab_tfidf = vectorizer.transform(vocabulary)
    # ================================================================
    return docs_tfidf, vocab_tfidf, vocabulary
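A usage sketch, assuming a vocabulary.txt file is present in the scikit-learn data home.

docs_tfidf, vocab_tfidf, vocabulary = load_data('train')
print(docs_tfidf.shape, vocab_tfidf.shape, len(vocabulary))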
Example #20
 def __init__(self, N=6, ds_size=80, ds="BAS"):
     self.ds = ds
     if ds == "BAS":
         self.BAS = BAS(N)
         self.ds_size = ds_size
         self.S = self.BAS.getSample(size=ds_size)
     elif ds == "MNIST":
         if (N != 28):
             raise ValueError("Please use N = 28 for the MNIST data set")
         try:
             mnist = fetch_openml('mnist_784')
         except Exception:
             print(
                 "Could not download MNIST data from mldata.org, trying alternative..."
             )
             mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
             data_home = get_data_home(data_home=None)
             data_home = os.path.join(data_home, 'mldata')
             if not os.path.exists(data_home):
                 os.makedirs(data_home)
             mnist_save_path = os.path.join(data_home, "mnist-original.mat")
             if not os.path.exists(mnist_save_path):
                 print("Downloading from ", mnist_alternative_url)
                 urllib.request.urlretrieve(mnist_alternative_url,
                                            mnist_save_path)
             print("Now calling fetch_mldata once more")
             mnist = fetch_openml('mnist_784')
         label = np.asarray(list(map(int, mnist['target'])))
         mnist = mnist.data
         mnist = ((mnist / 255.0) + 0.5).astype(int)
         images = []
         for i in range(ds_size):
             digit = i % 10
             u = np.where(label == digit)[0]
             images.append(mnist[u[i // 10], None, :])
         self.S = np.concatenate(images, axis=0)
         self.ds_size = ds_size
     else:
         raise ValueError("Unknown data set name")
Example #21
def _get_latest_version_offline(package_name):
    """
    Get the latest downloaded version of the package.

    Returns None if not found.

    """
    home = pathlib.Path(get_data_home())  # Should allow providing data home?

    downloaded_packages = tuple(home.glob(package_name + "_*.tar.gz"))

    if downloaded_packages:
        versions = [
            LooseVersion(p.name[(len(package_name) + 1):-len(".tar.gz")])
            for p in downloaded_packages]

        versions.sort()
        latest_version = versions[-1]

        return str(latest_version)
    else:
        return None
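distutils' LooseVersion is deprecated in recent Python releases; a roughly equivalent sketch using packaging.version (an extra dependency not used by the original code, with a hypothetical function name) could look like this.

import pathlib
from packaging.version import Version
from sklearn.datasets import get_data_home


def _get_latest_version_offline_alt(package_name):
    """Return the newest cached '<package>_<version>.tar.gz' version, or None."""
    home = pathlib.Path(get_data_home())
    downloaded = tuple(home.glob(package_name + "_*.tar.gz"))
    if not downloaded:
        return None
    versions = [Version(p.name[len(package_name) + 1:-len(".tar.gz")])
                for p in downloaded]
    return str(max(versions))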
Example #22
def fetch_classic(data_home=None, subset='all', categories=None,
                  shuffle=True, random_state=42,
                  remove=(),
                  download_if_missing=True):
    """Load the filenames and data from the 20 newsgroups dataset.

    Read more in the :ref:`User Guide <20newsgroups>`.

    Parameters
    ----------
    subset: 'train', 'test' or 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.

    data_home: optional, default: None
        Specify a download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    categories: None or collection of string or unicode
        If None (default), load all the categories.
        If not None, list of category names to load (other categories
        ignored).

    shuffle: bool, optional
        Whether or not to shuffle the data: might be important for models that
        make the assumption that the samples are independent and identically
        distributed (i.i.d.), such as stochastic gradient descent.

    random_state: numpy random number generator or seed integer
        Used to shuffle the dataset.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    remove: tuple
        May contain any subset of ('headers', 'footers', 'quotes'). Each of
        these are kinds of text that will be detected and removed from the
        newsgroup posts, preventing classifiers from overfitting on
        metadata.

        'headers' removes newsgroup headers, 'footers' removes blocks at the
        ends of posts that look like signatures, and 'quotes' removes lines
        that appear to be quoting another post.

        'headers' follows an exact standard; the other filters are not always
        correct.
    """

    data_home = get_data_home(data_home=data_home)
    cache_path = _pkl_filepath(data_home, CACHE_NAME)
    classic_home = os.path.join(data_home, CLASSIC_HOME)
    cache = None
    if os.path.exists(cache_path):
        try:
            with open(cache_path, 'rb') as f:
                compressed_content = f.read()
            uncompressed_content = codecs.decode(
                compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        except Exception as e:
            print(80 * '_')
            print('Cache loading failed')
            print(80 * '_')
            print(e)

    if cache is None:
        if download_if_missing:
            cache = download_classic(target_dir=classic_home,
                                     cache_path=cache_path)
        else:
            raise IOError('classic dataset not found')

    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)

        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

    data.description = 'the classic dataset'

    # if 'headers' in remove:
    #     data.data = [strip_newsgroup_header(text) for text in data.data]
    # if 'footers' in remove:
    #     data.data = [strip_newsgroup_footer(text) for text in data.data]
    # if 'quotes' in remove:
    #     data.data = [strip_newsgroup_quoting(text) for text in data.data]

    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()

    if shuffle:
        random_state = check_random_state(random_state)
        indices = np.arange(data.target.shape[0])
        random_state.shuffle(indices)
        data.filenames = data.filenames[indices]
        data.target = data.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[indices]
        data.data = data_lst.tolist()

    return data
Example #23
 def __init__(self,
              N=28,
              input=100,
              ds_size=80,
              ds="MNIST",
              speed_threshold=50):
     self.ds = ds
     if ds == "BAS":
         bas_size = 30
         data_home = get_data_home(data_home=None)
         data_home = os.path.join(data_home, 'BAS')
         datafile = os.path.join(data_home, 'BAS.pkl')
         if not os.path.exists(data_home):
             os.makedirs(data_home)
         if not os.path.exists(datafile):
             _BAS = BAS(N)
             BAS_data = _BAS.getSample(size=bas_size)
             f = open(datafile, "wb")
             pickle.dump(BAS_data, f)
             f.close()
         else:
             f = open(datafile, "rb")
             BAS_data = pickle.load(f)
             f.close()
         S = BAS_data[0:ds_size, None, :]
         self.S = np.concatenate(S, axis=0)
         self.ds_size = ds_size
         self.test_data = self.S
         self.ground_truth = self.S
     elif ds == "4bits":
         self.S = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0],
                            [0, 0, 0, 1]])
         self.ds_size = 4
         self.test_data = self.S
         self.ground_truth = self.S
     elif ds == "traffic":
         traffic_data = Traffic_data(input, 0, 50)
         self.S = traffic_data.train
         self.test_data = traffic_data.test
         self.ground_truth = traffic_data.ground_truth
         self.ds_size = traffic_data.ds_size
     elif ds == "small_traffic":
         pickle_out = open("small_traffic_dataset.pkl", "rb")
         data = pickle.load(pickle_out)
         pickle_out.close()
         self.S = data["train"] > speed_threshold
         self.ds_size = self.S.shape[0]
         self.test_data = data["test"] > speed_threshold
     elif ds == "MNIST":
         if (N != 28):
             raise ValueError("Please use N = 28 for the MNIST data set")
         try:
             custom_data_home = "C:/Users/Hamco/scikit_learn_data"
             mnist = fetch_mldata('MNIST original',
                                  data_home=custom_data_home)
         except Exception:
             print(
                 "Could not download MNIST data from mldata.org, trying alternative..."
             )
             mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
             data_home = get_data_home(data_home=None)
             data_home = os.path.join(data_home, 'mldata')
             if not os.path.exists(data_home):
                 os.makedirs(data_home)
             mnist_save_path = os.path.join(data_home, "mnist-original.mat")
             if not os.path.exists(mnist_save_path):
                 print("Downloading from ", mnist_alternative_url)
                 urllib.urlretrieve(mnist_alternative_url, mnist_save_path)
             print("Now calling fetch_mldata once more")
             mnist = fetch_mldata('MNIST original')
         label = mnist['target']
         mnist = mnist.data
         mnist = ((mnist / 255.0) + 0.5).astype(int)
         images = []
         for i in range(ds_size):
             digit = i % 10
             u = np.where(label == digit)[0]
             images.append(mnist[u[i // 10], None, :])
         self.S = np.concatenate(images, axis=0)
         self.resize(20, 20)
         self.ds_size = ds_size
     else:
         raise ValueError("Unknown data set name")
Example #24
import argparse

parser = argparse.ArgumentParser()  

parser.add_argument("-n", "--name", default='MNIST_A_default')
parser.add_argument("-i", "--iteration", type=int, default=1000)
tp = lambda x:list(map(int, x.split(',')))
parser.add_argument("-nl", '--number_list', type=tp, default="0,1,2,3,4,5,6,7,8,9")
parser.add_argument("-pxl", '--pos_x_list', type=tp, default="0,36")
parser.add_argument("-pyl", '--pos_y_list', type=tp, default="0,36")
parser.add_argument("-al", '--angle_list', type=tp, default="-45,0,45")
parser.add_argument("-sl", '--scale_list', type=tp, default="28,16")

args = parser.parse_args()   

mnist_path = join(get_data_home(), "mldata/mnist-original.mat")
if not exists(os.path.dirname(mnist_path)):
    os.makedirs(os.path.dirname(mnist_path))
mnist_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
urllib.request.urlretrieve(mnist_url, mnist_path)
mnist = fetch_mldata('MNIST original', data_home=get_data_home())
mnist_data = mnist["data"]/255.
mnist_target = mnist["target"].astype(int)

train_X, test_X, train_y, test_y = train_test_split(mnist_data, mnist_target, random_state=42, test_size=10000)
test_X, valid_X, test_y, valid_y = train_test_split(test_X, test_y, random_state=42, test_size=2000)


def change_scale(image, scale):
    image = resize(image, (scale, scale), mode='constant')
    pad_size = int((28-scale)/2)
Example #25
def fetch_species_distributions(data_home=None,
                                download_if_missing=True):
    """Loader for species distribution dataset from Phillips et. al. (2006)

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Notes
    ------

    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et. al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/apps/redlist/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/apps/redlist/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in
      Colombia, Ecuador, Peru, and Venezuela.

    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map grid.
        The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1623,)
        The training points for the data.  Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (619,)
        The test points for the data.  Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    References
    ----------

    * `"Maximum entropy modeling of species geographic distributions"
      <http://www.cs.princeton.edu/~schapire/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Notes
    -----

    * See examples/applications/plot_species_distribution_modeling.py
      for an example of using this dataset with scikit-learn

    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files.  These should not be changed
    # unless the data model changes.  They will be saved in the npz file
    # with the downloaded data.
    extra_params = dict(x_left_lower_corner=-94.8,
                        Nx=1212,
                        y_left_lower_corner=-56.05,
                        Ny=1592,
                        grid_size=0.05)
    dtype = np.int16

    if not exists(join(data_home, DATA_ARCHIVE_NAME)):
        print('Downloading species data from %s to %s' % (SAMPLES_URL,
                                                          data_home))
        X = np.load(BytesIO(urlopen(SAMPLES_URL).read()))

        for f in X.files:
            fhandle = BytesIO(X[f])
            if 'train' in f:
                train = _load_csv(fhandle)
            if 'test' in f:
                test = _load_csv(fhandle)

        print('Downloading coverage data from %s to %s' % (COVERAGES_URL,
                                                           data_home))

        X = np.load(BytesIO(urlopen(COVERAGES_URL).read()))

        coverages = []
        for f in X.files:
            fhandle = BytesIO(X[f])
            print(' - converting', f)
            coverages.append(_load_coverage(fhandle))
        coverages = np.asarray(coverages,
                               dtype=dtype)

        bunch = Bunch(coverages=coverages,
                      test=test,
                      train=train,
                      **extra_params)
        joblib.dump(bunch, join(data_home, DATA_ARCHIVE_NAME), compress=9)
    else:
        bunch = joblib.load(join(data_home, DATA_ARCHIVE_NAME))

    return bunch
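The Bunch attributes documented above are enough to reconstruct the map grid coordinates; a small sketch (mirroring the approach used in the scikit-learn species-distribution example):

import numpy as np

species = fetch_species_distributions()
# Cell coordinates, derived from the lower-left corner, grid spacing and size.
xgrid = species.x_left_lower_corner + np.arange(species.Nx) * species.grid_size
ygrid = species.y_left_lower_corner + np.arange(species.Ny) * species.grid_size
print('longitude range:', xgrid.min(), xgrid.max())
print('latitude range:', ygrid.min(), ygrid.max())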
Example #26
def fetch_species_distributions(data_home=None, download_if_missing=True):
    """Loader for species distribution dataset from Phillips et. al. (2006)

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit learn data is stored in '~/scikit_learn_data' subfolders.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Notes
    ------

    This dataset represents the geographic distribution of species.
    The dataset is provided by Phillips et. al. (2006).

    The two species are:

    - `"Bradypus variegatus"
      <http://www.iucnredlist.org/apps/redlist/details/3038/0>`_ ,
      the Brown-throated Sloth.

    - `"Microryzomys minutus"
      <http://www.iucnredlist.org/apps/redlist/details/13408/0>`_ ,
      also known as the Forest Small Rice Rat, a rodent that lives in
      Colombia, Ecuador, Peru, and Venezuela.

    The data is returned as a Bunch object with the following attributes:

    coverages : array, shape = [14, 1592, 1212]
        These represent the 14 features measured at each point of the map grid.
        The latitude/longitude values for the grid are discussed below.
        Missing data is represented by the value -9999.

    train : record array, shape = (1623,)
        The training points for the data.  Each point has three fields:

        - train['species'] is the species name
        - train['dd long'] is the longitude, in degrees
        - train['dd lat'] is the latitude, in degrees

    test : record array, shape = (619,)
        The test points for the data.  Same format as the training data.

    Nx, Ny : integers
        The number of longitudes (x) and latitudes (y) in the grid

    x_left_lower_corner, y_left_lower_corner : floats
        The (x,y) position of the lower-left corner, in degrees

    grid_size : float
        The spacing between points of the grid, in degrees

    References
    ----------

    * `"Maximum entropy modeling of species geographic distributions"
      <http://www.cs.princeton.edu/~schapire/papers/ecolmod.pdf>`_
      S. J. Phillips, R. P. Anderson, R. E. Schapire - Ecological Modelling,
      190:231-259, 2006.

    Notes
    -----

    * See examples/applications/plot_species_distribution_modeling.py
      for an example of using this dataset with scikit-learn

    """
    data_home = get_data_home(data_home)
    if not exists(data_home):
        makedirs(data_home)

    # Define parameters for the data files.  These should not be changed
    # unless the data model changes.  They will be saved in the npz file
    # with the downloaded data.
    extra_params = dict(x_left_lower_corner=-94.8,
                        Nx=1212,
                        y_left_lower_corner=-56.05,
                        Ny=1592,
                        grid_size=0.05)
    dtype = np.int16

    if not exists(join(data_home, DATA_ARCHIVE_NAME)):
        print 'Downloading species data from %s to %s' % (SAMPLES_URL,
                                                          data_home)
        X = np.load(StringIO(urllib2.urlopen(SAMPLES_URL).read()))

        for f in X.files:
            fhandle = StringIO(X[f])
            if 'train' in f:
                train = _load_csv(fhandle)
            if 'test' in f:
                test = _load_csv(fhandle)

        print 'Downloading coverage data from %s to %s' % (COVERAGES_URL,
                                                           data_home)

        X = np.load(StringIO(urllib2.urlopen(COVERAGES_URL).read()))

        coverages = []
        for f in X.files:
            fhandle = StringIO(X[f])
            print ' - converting', f
            coverages.append(_load_coverage(fhandle))
        coverages = np.asarray(coverages, dtype=dtype)

        bunch = Bunch(coverages=coverages,
                      test=test,
                      train=train,
                      **extra_params)
        joblib.dump(bunch, join(data_home, DATA_ARCHIVE_NAME))
    else:
        bunch = joblib.load(join(data_home, DATA_ARCHIVE_NAME))

    return bunch
Example #27
def _fetch_brute_indoor_pos(data_home=None, is_train=True, remove_dup=False):
    """Load the indoor position dataset

    Parameters
    ----------
    data_home : string, optional
        Specify the folder for the datasets.

    Returns
    -------
    dataset : dict-like object with the following attributes:
        dataset.data : numpy array of shape (xxx, 12)
            Each row corresponds to the 12 features in the dataset.
        dataset.target_x, dataset.target_y : numpy arrays of shape (xxx,)
            The x and y positions associated with each sample.

    """

    data_home = get_data_home(data_home=data_home)

    if is_train:
        indoorpos_trainfile = join(data_home, "train.csv")
    else:
        indoorpos_trainfile = join(data_home, "test.csv")

    available = exists(indoorpos_trainfile)

    if available:
        dt = [('x', int), ('y', int), ('2.1G(10)', float), ('2.1G(11)', float),
              ('2.1G(12)', float), ('2.1G(4)', float), ('2.1G(7)', float),
              ('2.1G(8)', float), ('3.5G(10)', float), ('3.5G(11)', float),
              ('3.5G(12)', float), ('3.5G(4)', float), ('3.5G(7)', float),
              ('3.5G(8)', float)]

        DT = np.dtype(dt)

        file_ = open(indoorpos_trainfile, mode='r')
        Xy = []
        linenum = 0
        for line in file_.readlines():
            if linenum > 0:
                Xy.append(line.replace('\n', '').split(','))
            linenum = linenum + 1
        file_.close()

        Xy = np.asarray(Xy, dtype=object)

        if is_train:
            for j in range(len(dt)):
                Xy[:, j] = Xy[:, j].astype(DT[j])
        else:
            for j in range(2, len(dt)):
                Xy[:, j] = Xy[:, j].astype(DT[j])

        X = Xy[:, 2:14]

        if not is_train:
            return Bunch(data=X, target_x=[], target_y=[], orig_data=X)

        seen = {}
        indexes = []
        X1 = []
        y1 = Xy[:, 0:1]
        y2 = Xy[:, 1:2]

        # duplicate entries processing
        for i in range(len(X)):
            if not remove_dup:
                indexes.append(i)
                X1.append(X[i])
                continue

            key = ''.join(str(e) for e in X[i])
            if key in seen:
                continue

            seen[key] = i
            indexes.append(i)
            X1.append(X[i])

        new_y1 = []
        new_y2 = []
        y11 = np.asarray(np.ravel(y1), dtype=int)
        y21 = np.asarray(np.ravel(y2), dtype=int)

        for i in indexes:
            new_y1 = np.append(new_y1, y11[i])
            new_y2 = np.append(new_y2, y21[i])

    elif not available:
        raise IOError("Data not found")

    return Bunch(data=X1, target_x=new_y1, target_y=new_y2, orig_data=X)
Example #28
from sklearn.datasets.base import get_data_home

get_data_home()

from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')

import pandas as pd
pixels = pd.DataFrame(mnist.data)
labels = pd.DataFrame(mnist.target)

pixels.loc[0].values

labels.loc[0].values

import numpy as np
#%matplotlib inline
import matplotlib.pyplot as plt
label = labels.loc[0]
pixel = pixels.loc[0]
pixel = np.array(pixel, dtype='uint8')
pixel = pixel.reshape((28,28))
plt.title('Label is {label}'.format(label=label))
plt.imshow(pixel, cmap='gray')
plt.show()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mnist.data, mnist.target, test_size=1/7.0)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
Example #29
def __download_open_dataset(data_home=None, download_if_missing=True):
    """Helper function to download any missing SD15-CH1 data.

    The dataset will be stored like this:
        ${data_home}/smartdoc15-ch1_home/frames:
        ├── background01
        │   ├── datasheet001
        │   │   ├── frame_0001.jpeg
        │   │   ├── [...]
        │   │   └── frame_0235.jpeg
        │   ├── [...]
        │   └── tax005
        │       └── [...]
        ├── background02
        |   └── [...]
        ├── background03
        |   └── [...]
        ├── background04
        |   └── [...]
        ├── background05
        |   └── [...]
        └── metadata.csv.gz

        ${data_home}/smartdoc15-ch1_home/models:
        ├── 01-original
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 02-edited
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 03-captured-nexus
        │   ├── datasheet001.jpg # JPG images here
        │   ├── [...]
        │   └── tax005.jpg
        ├── 04-corrected-nexus
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── 05-corrected-nexus-scaled33
        │   ├── datasheet001.png
        │   ├── [...]
        │   └── tax005.png
        ├── correct_perspective.m
        └── original_datasets_files.txt
    """
    data_home = get_data_home(data_home=data_home)
    sd15ch1_home = os.path.join(data_home, SD15CH1_DIRNAME)

    if not os.path.exists(sd15ch1_home):
        os.makedirs(sd15ch1_home)

    data_dirs = {}
    for subdir, (archive, size, description) in six.iteritems(DATASET_CONTENT):
        data_folder_path = os.path.join(sd15ch1_home, subdir)
        data_dirs[subdir] = data_folder_path

        if not os.path.exists(data_folder_path):
            archive_path = os.path.join(sd15ch1_home, archive.filename)
            # (later) FIXME this is a naive test for existing files
            if not os.path.exists(archive_path):
                if download_if_missing:
                    __info("Downloading file %s (%s): %s" %
                           (archive.filename, size, archive.url))
                    _fetch_remote(archive, dirname=sd15ch1_home)
                else:
                    __err("%s is missing" % archive_path, IOError)

            __info("Decompressing the data archive to %s" %
                   (data_folder_path, ))
            tarfile.open(archive_path,
                         "r:gz").extractall(path=data_folder_path)
            os.remove(archive_path)

    return data_dirs
Example #30
def get_sd15ch1_basedir_models(data_home=None):
    data_home = get_data_home(data_home=data_home)
    sd15ch1_home = os.path.join(data_home, SD15CH1_DIRNAME)
    basedir = os.path.join(sd15ch1_home, "models")
    return basedir
Example #31
# coding: utf-8

# In[12]:

### If fetch_mldata('MNIST original') raises a RemoteDisconnected error

# Download mnist-original.mat from
# https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat

from sklearn.datasets.base import get_data_home

print(get_data_home())  # place mnist-original.mat in the path printed here

# In[13]:

### Rewritten with fetch_openml because fetch_mldata('MNIST original') raises a RemoteDisconnected error
# from sklearn.datasets import fetch_mldata
# mnist = fetch_mldata('MNIST original', data_home=".")
# data_home is the folder where the data is saved
from sklearn.datasets import fetch_openml

mnist = fetch_openml(
    'mnist_784',
    version=1,
)

# Preprocess the MNIST data
# Scale the 0-255 value of each of the 28x28 pixels to the range 0-1
X = mnist.data / 255
# Get the target labels
y = mnist.target
Example #32
def fetch_uci_glass_outlier(data_home=None,
                            shuffle=False,
                            random_state=0,
                            download_if_missing=True):
    """Load the UCI glass data-set from AT&T (classification).
    Download it if necessary.
    =================   =====================
    Classes                                6
    Samples total                         214
    Dimensionality                       9
    Features                            real
    =================   =====================
    
    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
    shuffle : boolean, optional
        If True the order of the dataset is shuffled to avoid having
        samples of the same class grouped.
    random_state : int, RandomState instance or None (default=0)
        Determines random number generation for dataset shuffling. Pass an int
        for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.
    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.
    Returns
    -------    
    data : numpy array of shape (214, 9)
        Each row corresponds to the 9 glass features of one sample.
    target : numpy array of shape (214, )
        Label associated to each sample: -1 for the outlier class
        (original class 6) and +1 for all other classes.
    """
    global GLASS
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)
    filepath = _pkl_filepath(data_home, 'uci_glass_outlier.pkz')
    if not exists(filepath):
        if not download_if_missing:
            raise IOError("Data not found and `download_if_missing` is False")

        print('downloading UCI GLASS from %s to %s' % (GLASS.url, data_home))
        data_path = _fetch_remote(GLASS, dirname=data_home)

        glass = np.genfromtxt(data_path, delimiter=",")
        # the class 6 (minority) as outlier and all other classes as inliers
        glass[:, -1] = 2 * (glass[:, -1] != 6) - 1
        _joblib.dump(glass, filepath, compress=6)
        remove(data_path)

    else:
        glass = _joblib.load(filepath)

    feature = glass[:, 1:-1]
    target = glass[:, -1]
    if shuffle:
        random_state = check_random_state(random_state)
        order = random_state.permutation(len(glass))
        feature = feature[order]
        target = target[order]

    return (feature, target)
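A usage sketch, assuming the GLASS RemoteFileMetadata constant is configured so the download can succeed.

feature, target = fetch_uci_glass_outlier(shuffle=True, random_state=0)
print(feature.shape)                    # (214, 9)
print('outliers:', int((target == -1).sum()))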
Example #33
    max_for_C = (accuracy["C"][argmax], accuracy["avg_accuracy"][argmax])
    plt.annotate(f"C={max_for_C[0]} Avg. Accuracy={max_for_C[1]:.3f}..",
                 xy=max_for_C)
    # plt.xlim((eta_0_values[0], eta_0_values[-1]))
    plt.xticks(np.arange(len(c_values)), c_values)
    plt.xlabel("C")

    plt.xscale('log')

    plt.ylabel("Accuracy")
    plt.savefig(f"section-2-b-{label}.png")
    plt.show()
    return max_for_C[0]


if __name__ == "__main__":
    from sklearn.datasets.base import get_data_home

    print(get_data_home())
    train_data, train_labels, validation_data, validation_labels, test_data, test_labels = helper(
    )
    # section_1_a()
    # section_1_b()
    # section_1_c()
    # section_1_d()
    #
    # section_2_a()
    # section_2_b()
    # section_2_c()
    section_2_d()
Example #34
def fetch_classic(data_home=None,
                  subset='all',
                  categories=None,
                  shuffle=True,
                  random_state=42,
                  remove=(),
                  download_if_missing=True):
    """Load the filenames and data from the 20 newsgroups dataset.

    Read more in the :ref:`User Guide <20newsgroups>`.

    Parameters
    ----------
    subset: 'train', 'test' or 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.

    data_home: optional, default: None
        Specify a download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    categories: None or collection of string or unicode
        If None (default), load all the categories.
        If not None, list of category names to load (other categories
        ignored).

    shuffle: bool, optional
        Whether or not to shuffle the data: might be important for models that
        make the assumption that the samples are independent and identically
        distributed (i.i.d.), such as stochastic gradient descent.

    random_state: numpy random number generator or seed integer
        Used to shuffle the dataset.

    download_if_missing: optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    remove: tuple
        May contain any subset of ('headers', 'footers', 'quotes'). Each of
        these are kinds of text that will be detected and removed from the
        newsgroup posts, preventing classifiers from overfitting on
        metadata.

        'headers' removes newsgroup headers, 'footers' removes blocks at the
        ends of posts that look like signatures, and 'quotes' removes lines
        that appear to be quoting another post.

        'headers' follows an exact standard; the other filters are not always
        correct.
    """

    data_home = get_data_home(data_home=data_home)
    cache_path = _pkl_filepath(data_home, CACHE_NAME)
    classic_home = os.path.join(data_home, CLASSIC_HOME)
    cache = None
    if os.path.exists(cache_path):
        try:
            with open(cache_path, 'rb') as f:
                compressed_content = f.read()
            uncompressed_content = codecs.decode(compressed_content,
                                                 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        except Exception as e:
            print(80 * '_')
            print('Cache loading failed')
            print(80 * '_')
            print(e)

    if cache is None:
        if download_if_missing:
            cache = download_classic(target_dir=classic_home,
                                     cache_path=cache_path)
        else:
            raise IOError('classic dataset not found')

    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)

        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

    data.description = 'the classic dataset'

    # if 'headers' in remove:
    #     data.data = [strip_newsgroup_header(text) for text in data.data]
    # if 'footers' in remove:
    #     data.data = [strip_newsgroup_footer(text) for text in data.data]
    # if 'quotes' in remove:
    #     data.data = [strip_newsgroup_quoting(text) for text in data.data]

    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()

    if shuffle:
        random_state = check_random_state(random_state)
        indices = np.arange(data.target.shape[0])
        random_state.shuffle(indices)
        data.filenames = data.filenames[indices]
        data.target = data.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[indices]
        data.data = data_lst.tolist()

    return data