def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))

    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))

    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
def get_unclassified_data(self):
    source_path = os.path.join(get_data_home(), 'tweets_unclassified', self.disease)
    file_paths = []
    for root, directories, files in os.walk(source_path):
        for filename in files:
            file_path = os.path.join(root, filename)
            file_paths.append(file_path)
    print 'unclassified data loaded from ' + str(file_paths)

    tweets = []
    for file_path in file_paths:
        line_num = 0
        with codecs.open(file_path, 'r') as f:
            for line in f:
                if line_num > 0:
                    try:
                        tweets.append(Tweet(line))
                        line_num += 1
                    except:
                        # line_num is an int, so convert before concatenating;
                        # sys.exc_info() needs `import sys` at module level
                        print "Unexpected error in line " + str(line_num) + ":", sys.exc_info()[0]
                else:
                    # skip the header line
                    line_num += 1
    print 'unclassified tweets loaded ' + str(len(tweets))
    return tweets
def setup_module():
    check_skip_network()

    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
def setup_working_with_text_data():
    if IS_PYPY and os.environ.get('CI', None):
        raise SkipTest('Skipping too slow test with PyPy on CI')
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
def fetch_vega_spectrum(data_home=None):
    data_home = get_data_home(data_home)
    refspec_file = os.path.join(data_home, REFSPEC_URL.split('/')[-1])

    if not os.path.exists(refspec_file):
        print "downloading from %s" % REFSPEC_URL
        F = urllib2.urlopen(REFSPEC_URL)
        open(refspec_file, 'w').write(F.read())

    F = open(refspec_file)
    data = np.loadtxt(F)
    return data
def fetch_filter(filter, data_home=None):
    data_home = get_data_home(data_home)
    assert filter in 'ugriz'
    url = URL % filter
    loc = os.path.join(data_home, '%s.dat' % filter)

    if not os.path.exists(loc):
        print "downloading from %s" % url
        F = urllib2.urlopen(url)
        open(loc, 'w').write(F.read())

    F = open(loc)
    data = np.loadtxt(F)
    return data
def fetch_sdss_spec_data(data_home=None):
    data_home = get_data_home(data_home)
    local_file = os.path.join(data_home, os.path.basename(DATA_URL))

    # data directory is password protected so the public can't access it
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, DATA_URL, 'pydata', 'astroML')
    handler = urllib2.HTTPBasicAuthHandler(password_mgr)
    opener = urllib2.build_opener(handler)

    # download training data
    if not os.path.exists(local_file):
        fhandle = opener.open(DATA_URL)
        open(local_file, 'w').write(fhandle.read())

    return np.load(local_file)
def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.
    """
    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        # Download the dataset.
        print("downloading dataset (once and for all) into %s" % data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            # print(doc)
            yield doc
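# A minimal sketch of how the generator above might be consumed; it assumes
# the surrounding example's ReutersParser and helpers are importable and that
# each document carries the 'topics' key described in the docstring.
n_docs = n_labelled = 0
for doc in stream_reuters_documents():
    n_docs += 1
    if doc.get('topics'):
        n_labelled += 1
print("streamed %d documents, %d with topics" % (n_docs, n_labelled))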
def _fetch_drug_protein(data_home=None):
    """Fetch drug-protein dataset from the server"""
    base_url = "http://cbio.ensmp.fr/~yyamanishi/substr-domain/"

    # check if this data set has already been downloaded
    data_home = get_data_home(data_home)
    data_home = os.path.join(data_home, 'drug-protein')
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    for base_name in ["drug_repmat.txt", "target_repmat.txt",
                      "inter_admat.txt"]:
        filename = os.path.join(data_home, base_name)

        if not os.path.exists(filename):
            urlname = base_url + base_name
            print("Download data at {}".format(urlname))
            try:
                url = urlopen(urlname)
            except HTTPError as e:
                if e.code == 404:
                    e.msg = "Dataset drug-protein '%s' not found." % base_name
                raise

            try:
                with open(filename, 'w+b') as fhandle:
                    shutil.copyfileobj(url, fhandle)
            except:
                # remove the partially written file before re-raising
                os.remove(filename)
                raise
            url.close()

    return data_home
def create_data(self):
    data_home = get_data_home()
    # build the path with os.path.join instead of hard-coded backslashes
    cache_path = os.path.join(data_home, 'cache', self.disease + self._cl_cut,
                              self.cache_name)
    if os.path.exists(cache_path):
        return

    # e.g. C:\Users\[user]\scikit_learn_data\hiv
    # disease_path = os.path.join(data_home, self.disease)

    # e.g. C:\Users\[user]\scikit_learn_data\tweets\hiv
    tweets_path = os.path.join(data_home, 'tweets', self.disease + self._cl_cut)
    if not os.path.exists(tweets_path):
        return

    # *** Manual process:
    # Save annotation files as 'Text (MS-DOS)(*.txt)', e.g. tweets1.txt
    # (all annotation files should keep the same format)
    # *** Automated process:
    # 1. Get file names from C:\Users\[user]\scikit_learn_data\tweets\hiv
    # 2. For each file read all tweets line by line (only those where the
    #    category is not empty)
    # 3. For each tweet generate a unique file

    train_path = os.path.join(tweets_path, self.train_folder)
    train_output_path = os.path.join(data_home, self.train_folder,
                                     self.disease + self._cl_cut)
    if not os.path.exists(train_output_path):
        os.makedirs(train_output_path)

    test_path = os.path.join(tweets_path, self.test_folder)
    test_output_path = os.path.join(data_home, self.test_folder,
                                    self.disease + self._cl_cut)
    if not os.path.exists(test_output_path):
        os.makedirs(test_output_path)

    train_tweets = self._load_tweets(train_path)
    self._generate_singular_tweet_files(train_tweets, train_output_path)

    test_tweets = self._load_tweets(test_path)
    self._generate_singular_tweet_files(test_tweets, test_output_path)
else:
    # make prediction
    testData = dataAdapter.get_unclassified_data(categories=categories)

    # predicted = clf.classifier.predict(testData.data)
    predicted_prob = clf.classifier.predict_proba(testData.data)
    print('predict done')

    # for i in range(len(testData.data)):
    #     probabilities = predicted_prob[i]
    #     zero_prob = probabilities[0]
    #     one_prob = probabilities[1]
    #
    #     if one_prob > 0.1:
    #         print one_prob
    #
    #     # testData.data[i].append(str(clf.labels[predicted_prob[i]]))

    file_dir = os.path.join(get_data_home(), 'output')
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    # np.savetxt(os.path.join(file_dir, "predicted.csv"), predicted, delimiter=",")
    np.savetxt(os.path.join(file_dir, "predicted_prob.csv"), predicted_prob,
               delimiter=",")
    print("done")

print('done!')
import os

import numpy as np
from joblib import Memory  # Memory lives in joblib, not sklearn.utils

from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(int)  # np.int is deprecated; use int

    # Create train-test split (as [Joachims, 2006])
chunk_size = 1000
data_chunks = list(partition(chunk_size, testData))

print('start prediction')
for i, chunk in enumerate(data_chunks):
    t0 = time()
    predicted = clf.classifier.predict(list(chunk))
    ranTime = time() - t0
    print('progress ' + str(round((i + 1) / float(len(data_chunks)) * 100, 2)) +
          '% last_predict_time=' + str(ranTime))
    for j in range(len(chunk)):
        testData[i * chunk_size + j].talk_about = str(clf.labels[predicted[j]])

print('predict done')

file_dir = os.path.join(get_data_home(), 'output', disease, cl_cut)
if not os.path.exists(file_dir):
    os.makedirs(file_dir)

file_path = os.path.join(file_dir, 'output.txt')
with codecs.open(file_path, "w", "utf-8") as text_file:
    for i in range(len(testData)):
        try:
            tweet = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\t{15}\n". \
                format(testData[i].tweet_id,
                       testData[i].query,
                       testData[i].disease,
                       testData[i].created_at,
                       testData[i].screen_name,
from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_lfw_people
from sklearn.datasets import get_data_home


if __name__ == "__main__":
    fetch_olivetti_faces()

    print("Loading Labeled Faces Data (~200MB)")
    fetch_lfw_people(min_faces_per_person=70, resize=0.4)
    print("=> Success!")
    print("Data saved in %s" % get_data_home())
def setup_rcv1():
    check_skip_network()
    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = join(get_data_home(), "RCV1")
    if not exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
def setup_twenty_newsgroups():
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
def fetch_jrcacquis(langs=None, data_path=None, years=None,
                    ignore_unclassified=True, cat_filter=None,
                    cat_threshold=0, parallel=None, most_frequent=-1,
                    DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):

    assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'

    if not langs:
        langs = JRC_LANGS
    else:
        if isinstance(langs, str):
            langs = [langs]
        for l in langs:
            if l not in JRC_LANGS:
                raise ValueError(
                    'Language %s is not among the valid languages in JRC-Acquis v3' % l)

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    for l in langs:
        file_name = 'jrc-' + l + '.tgz'
        archive_path = join(data_path, file_name)

        if not os.path.exists(archive_path):
            print("downloading language-specific dataset (once and for all) into %s" % data_path)
            DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
            download_file(DOWNLOAD_URL, archive_path)
            print("untarring dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        documents_dir = join(data_path, l)

        print("Reading documents...")
        read = 0
        for dir in list_dirs(documents_dir):
            year = int(dir)
            if years is None or year in years:
                year_dir = join(documents_dir, dir)
                pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
                if os.path.exists(pickle_name):
                    print("loading from file %s" % pickle_name)
                    l_y_documents = pickle.load(open(pickle_name, "rb"))
                    read += len(l_y_documents)
                else:
                    l_y_documents = []
                    all_documents = list_files(year_dir)
                    empty = 0
                    for i, doc_file in enumerate(all_documents):
                        try:
                            jrc_doc = parse_document(join(year_dir, doc_file), year)
                        except ValueError:
                            jrc_doc = None

                        if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                            l_y_documents.append(jrc_doc)
                        else:
                            empty += 1
                        # integer division so the modulo progress check works
                        # under Python 3 as well
                        if len(all_documents) > 50 and (
                                (i + 1) % (len(all_documents) // 50) == 0):
                            print('\r\tfrom %s: completed %d%%'
                                  % (year_dir, int((i + 1) * 100.0 / len(all_documents))),
                                  end='')
                        read += 1
                    print('\r\tfrom %s: completed 100%% read %d documents '
                          '(discarded %d without categories or empty fields)\n'
                          % (year_dir, i + 1, empty), end='')
                    print("\t\t(Pickling object for future runs in %s)" % pickle_name)
                    pickle.dump(l_y_documents, open(pickle_name, 'wb'),
                                pickle.HIGHEST_PROTOCOL)
                request += l_y_documents
        print("Read %d documents for language %s\n" % (read, l))
        total_read += read
    print("Read %d documents in total" % (total_read))

    if parallel == 'force':
        request = _force_parallel(request, langs)
    elif parallel == 'avoid':
        request = random_sampling_avoiding_parallel(request)

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats
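# A hedged usage sketch of the fetcher above. The language code, year filter
# and threshold are illustrative values only ('en' is assumed to be in
# JRC_LANGS); the two-tuple return follows the last line of the function.
docs, cats = fetch_jrcacquis(langs='en', years=[2005, 2006], cat_threshold=10)
print('%d documents across %d categories' % (len(docs), len(cats)))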
def setup_twenty_newsgroups():
    data_home = get_data_home()
    cache_path = _pkl_filepath(data_home, CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
import os

import numpy as np

from sklearn.datasets import fetch_covtype, get_data_home
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import zero_one_loss
from sklearn.externals.joblib import Memory
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(np.int)
def fetch_mnist(data_home=None):
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    data_home = get_data_home(data_home=data_home)
    data_home = os.path.join(data_home, 'mldata')
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    mnist_save_path = os.path.join(data_home, "mnist-original.mat")
    if not os.path.exists(mnist_save_path):
        mnist_url = urllib.request.urlopen(mnist_alternative_url)
        with open(mnist_save_path, "wb") as matlab_file:
            copyfileobj(mnist_url, matlab_file)


# Step 1. Downloading the Data (MNIST)
print(get_data_home())
fetch_mnist()
mnist = fetch_mldata('MNIST original')

# These are the images
# There are 70,000 images (28 by 28 images for a dimensionality of 784)
print("Number of images: ", mnist.data.shape)

# These are the labels
print("Labels: ", mnist.target.shape)

# Step 2. Splitting Data into Training and Test Sets (MNIST)
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data, mnist.target, test_size=1 / 7.0, random_state=0)
from sklearn.metrics import confusion_matrix, accuracy_score
from skimage.transform import rotate
import missinglink

project = missinglink.SkLearnProject()
# Optional: Name this experiment. `display_name` is always visible in the
# experiments table, while the `description` is accessible by clicking the
# note icon.
project.set_properties(display_name="MNIST", description="Using scikit-learn")

print(__doc__)

# Load data from https://www.openml.org/d/554
print("Loading data")
print("Data home: {}".format(get_data_home()))
data, target = fetch_openml('mnist_784', version=1, return_X_y=True)

# note: this flag shadows skimage.transform.rotate imported above
rotate = False
model_type = "forest"
# model_type = "mlp"

# rescale the data, use the traditional train/test split
print("Rescaling {} datapoints".format(data.shape))
data = data / 255.
split = 10000  # out of 70000
data_train, data_test = data[:split], data[split:]
target_train, target_test = target[:split], target[split:]

if rotate:
    print("Adding rotation")
    data_train = np.append(
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")
def setup_labeled_faces():
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")
def fetch_datasets(data_home=None, filter_data=None, download_if_missing=True,
                   random_state=None, shuffle=False, verbose=False):
    """Load the benchmark datasets from Zenodo, downloading them if necessary.

    Parameters
    ----------
    data_home : string, optional (default=None)
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    filter_data : tuple of str/int or None, optional (default=None)
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the above table to get the ID and name of the datasets.

    download_if_missing : boolean, optional (default=True)
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, optional (default=None)
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, optional (default=False)
        Whether to shuffle dataset.

    verbose : bool, optional (default=False)
        Show information regarding the fetching.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The order is defined by ``filter_data``. Each Bunch object ---
        referred to as dataset --- has the following attributes:

    dataset.data : ndarray, shape (n_samples, n_features)

    dataset.target : ndarray, shape (n_samples, )

    dataset.DESCR : string
        Description of each dataset.

    Notes
    -----
    This collection of datasets has been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    """
    data_home = get_data_home(data_home=data_home)
    zenodo_dir = join(data_home, "zenodo")
    datasets = OrderedDict()

    if filter_data is None:
        filter_data_ = MAP_NAME_ID.keys()
    else:
        list_data = MAP_NAME_ID.keys()
        filter_data_ = []
        for it in filter_data:
            if isinstance(it, six.string_types):
                if it not in list_data:
                    raise ValueError('{} is not a dataset available. '
                                     'The available datasets are {}'.format(
                                         it, list_data))
                else:
                    filter_data_.append(it)
            elif isinstance(it, int):
                if it < 1 or it > 27:
                    raise ValueError('The dataset with the ID={} is not an '
                                     'available dataset. The IDs are '
                                     '{}'.format(it, range(1, 28)))
                else:
                    # The index starts at one, so we need to subtract one
                    # to avoid an indexing issue.
                    filter_data_.append(MAP_ID_NAME[it])
            else:
                raise ValueError('The value in the tuple should be str or int.'
                                 ' Got {} instead.'.format(type(it)))

    # go through the list and check if the data are available
    for it in filter_data_:
        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
        filename = join(zenodo_dir, filename)
        available = isfile(filename)

        if download_if_missing and not available:
            makedirs(zenodo_dir, exist_ok=True)
            if verbose:
                print("Downloading %s" % URL)
            f = BytesIO(urlopen(URL).read())
            tar = tarfile.open(fileobj=f)
            tar.extractall(path=zenodo_dir)
        elif not download_if_missing and not available:
            raise IOError("Data not found and `download_if_missing` is False")

        data = np.load(filename)
        X, y = data['data'], data['label']
        if shuffle:
            ind = np.arange(X.shape[0])
            rng = check_random_state(random_state)
            rng.shuffle(ind)
            X = X[ind]
            y = y[ind]
        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets
def get_data(self, subset='train', categories=None, shuffle=True, random_state=42):
    data_home = get_data_home()
    # build the cache path with os.path.join instead of hard-coded backslashes
    cache_path = os.path.join(data_home, 'cache', self.disease + self._cl_cut,
                              self.cache_name)
    train_path = os.path.join(data_home, self.train_folder, self.disease + self._cl_cut)
    test_path = os.path.join(data_home, self.test_folder, self.disease + self._cl_cut)

    cache = None
    if os.path.exists(cache_path):
        try:
            with open(cache_path, 'rb') as f:
                compressed_content = f.read()
            uncompressed_content = codecs.decode(
                compressed_content, 'zlib_codec')
            cache = pickle.loads(uncompressed_content)
        except Exception as e:
            print(80 * '_')
            print('Cache loading failed')
            print(80 * '_')
            print(e)

    if cache is None:
        cache = self.get_cache(train_path=train_path, test_path=test_path,
                               cache_path=cache_path)

    if subset in ('train', 'test'):
        data = cache[subset]
    elif subset == 'all':
        data_lst = list()
        target = list()
        filenames = list()
        for subset in ('train', 'test'):
            data = cache[subset]
            data_lst.extend(data.data)
            target.extend(data.target)
            filenames.extend(data.filenames)

        data.data = data_lst
        data.target = np.array(target)
        data.filenames = np.array(filenames)
    else:
        raise ValueError(
            "subset can only be 'train', 'test' or 'all', got '%s'" % subset)

    data.description = 'The HIV dataset'

    if categories is not None:
        labels = [(data.target_names.index(cat), cat) for cat in categories]
        # Sort the categories to have the ordering of the labels
        labels.sort()
        labels, categories = zip(*labels)
        mask = np.in1d(data.target, labels)
        data.filenames = data.filenames[mask]
        data.target = data.target[mask]
        # searchsorted to have continuous labels
        data.target = np.searchsorted(labels, data.target)
        data.target_names = list(categories)
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[mask]
        data.data = data_lst.tolist()

    if shuffle:
        random_state = validation.check_random_state(random_state)
        indices = np.arange(data.target.shape[0])
        random_state.shuffle(indices)
        data.filenames = data.filenames[indices]
        data.target = data.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.data, dtype=object)
        data_lst = data_lst[indices]
        data.data = data_lst.tolist()

    return data
import io

from scipy.io.arff import loadarff
import matplotlib.pyplot as plt

from sklearn.datasets import get_data_home
from sklearn.externals.joblib import Memory
from sklearn.neural_network import MLPClassifier

try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen

memory = Memory(get_data_home())


@memory.cache()
def fetch_mnist():
    content = urlopen('https://www.openml.org/data/download/52667/mnist_784.arff').read()
    data, meta = loadarff(io.StringIO(content.decode('utf8')))
    data = data.view([('pixels', '<f8', 784), ('class', '|S1')])
    return data['pixels'], data['class']


x, y = fetch_mnist()
x_train, x_test = x[:6000], x[6000:]
y_train, y_test = y[:6000], y[6000:]

mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.1)
mlp.fit(x_train, y_train)
print("Training set score: %f" % mlp.score(x_train, y_train))
print("Test set score: %f" % mlp.score(x_test, y_test))

fig, axes = plt.subplots(4, 4)
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray,
               vmin=.5 * vmin, vmax=.5 * vmax)
    ax.set_xticks(())
def setup_working_with_text_data():
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
plt.legend(loc='upper right')
plt.show()

'''
x_index = 0
y_index = 3
'''

'''
for label, color in zip(range(len(d1.target_names)), colors):
    plt.scatter(d1.data[d1.target == label, x_index],
                d1.data[d1.target == label, y_index],
                label=d1.target_names[label], color=color)  # scatter plot

plt.xlabel(d1.feature_names[x_index])
plt.xlabel(d1.feature_names[y_index])
plt.legend(loc='upper left')
plt.show()
'''

'''
fig = plt.figure(figsize=(6, 6))
fig.subplotpars(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(d3.images[i], cmap=plt.cm.binary, interpolation="nearest")
    ax.text(0, 7, str(d3.target[i]))
plt.show()
'''

# china = datasets.load_sample_image('china.jpg')
print(datasets.get_data_home())
# Each number generator uses the same seed to avoid coupling issues between
# estimators.
op.add_option("--random-seed", dest="random_seed", default=13, type=int,
              help="Common seed used by random number generator.")

op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
joblib_cache_folder = os.path.join(get_data_home(), 'covertype_benchmark_data')
m = Memory(joblib_cache_folder, mmap_mode='r')


# Load the data, then cache and memmap the train/test split
@m.cache
def load_data(dtype=np.float32, order='C'):
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=opts.random_seed)
    X, y = data['data'], data['target']
    X = np.asarray(X, dtype=dtype)

    if order.lower() == 'f':
from sklearn.datasets import get_data_home
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.externals.joblib import Memory
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_mldata('MNIST original')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
# In [4]: boston.
# boston.DESCR          boston.items        boston.target
# boston.clear          boston.iteritems    boston.update
# boston.copy           boston.iterkeys     boston.values
# boston.data           boston.itervalues   boston.viewitems
# boston.feature_names  boston.keys         boston.viewkeys
# boston.fromkeys       boston.pop          boston.viewvalues
# boston.get            boston.popitem
# boston.has_key        boston.setdefault

# housing = datasets.fetch_california_housing()
# downloading Cal. housing from http://lib.stat.cmu.edu/modules.php?op=modload&name=Downloads&file=index&req=getit&lid=83
# to C:\Users\c01843\scikit_learn_data

datasets.get_data_home()
# Out[8]: 'C:\\Users\\c01843\\scikit_learn_data'

X, y = boston.data, boston.target

# datasets.make_biclusters    datasets.make_blobs
# datasets.make_checkerboard  datasets.make_circles
# datasets.make_classification

# datasets.make_biclusters      datasets.make_friedman3                  datasets.make_s_curve
# datasets.make_blobs           datasets.make_gaussian_quantiles         datasets.make_sparse_coded_signal
# datasets.make_checkerboard    datasets.make_hastie_10_2                datasets.make_sparse_spd_matrix
# datasets.make_circles         datasets.make_low_rank_matrix            datasets.make_sparse_uncorrelated
# datasets.make_classification  datasets.make_moons                      datasets.make_spd_matrix
# datasets.make_friedman1       datasets.make_multilabel_classification  datasets.make_swiss_roll
def fetch_datasets(
    *,
    data_home=None,
    filter_data=None,
    download_if_missing=True,
    random_state=None,
    shuffle=False,
    verbose=False,
):
    """Load the benchmark datasets from Zenodo, downloading them if necessary.

    .. versionadded:: 0.3

    Parameters
    ----------
    data_home : str, default=None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    filter_data : tuple of str/int, default=None
        A tuple containing the ID or the name of the datasets to be returned.
        Refer to the above table to get the ID and name of the datasets.

    download_if_missing : bool, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None, default=None
        Random state for shuffling the dataset.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    verbose : bool, default=False
        Show information regarding the fetching.

    Returns
    -------
    datasets : OrderedDict of Bunch object,
        The order is defined by ``filter_data``. Each Bunch object ---
        referred to as dataset --- has the following attributes:

    dataset.data : ndarray of shape (n_samples, n_features)

    dataset.target : ndarray of shape (n_samples,)

    dataset.DESCR : str
        Description of each dataset.

    Notes
    -----
    This collection of datasets has been proposed in [1]_. The
    characteristics of the available datasets are presented in the table
    below.

    +--+--------------+-------------------------------+-------+---------+-----+
    |ID|Name          | Repository & Target           | Ratio | #S      | #F  |
    +==+==============+===============================+=======+=========+=====+
    |1 |ecoli         | UCI, target: imU              | 8.6:1 | 336     | 7   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |2 |optical_digits| UCI, target: 8                | 9.1:1 | 5,620   | 64  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |3 |satimage      | UCI, target: 4                | 9.3:1 | 6,435   | 36  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |4 |pen_digits    | UCI, target: 5                | 9.4:1 | 10,992  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |5 |abalone       | UCI, target: 7                | 9.7:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |6 |sick_euthyroid| UCI, target: sick euthyroid   | 9.8:1 | 3,163   | 42  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |7 |spectrometer  | UCI, target: >=44             | 11:1  | 531     | 93  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |8 |car_eval_34   | UCI, target: good, v good     | 12:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |9 |isolet        | UCI, target: A, B             | 12:1  | 7,797   | 617 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |10|us_crime      | UCI, target: >0.65            | 12:1  | 1,994   | 100 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |11|yeast_ml8     | LIBSVM, target: 8             | 13:1  | 2,417   | 103 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |12|scene         | LIBSVM, target: >one label    | 13:1  | 2,407   | 294 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |13|libras_move   | UCI, target: 1                | 14:1  | 360     | 90  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |14|thyroid_sick  | UCI, target: sick             | 15:1  | 3,772   | 52  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |15|coil_2000     | KDD, CoIL, target: minority   | 16:1  | 9,822   | 85  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |16|arrhythmia    | UCI, target: 06               | 17:1  | 452     | 278 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |17|solar_flare_m0| UCI, target: M->0             | 19:1  | 1,389   | 32  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |18|oil           | UCI, target: minority         | 22:1  | 937     | 49  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |19|car_eval_4    | UCI, target: vgood            | 26:1  | 1,728   | 21  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |20|wine_quality  | UCI, wine, target: <=4        | 26:1  | 4,898   | 11  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |21|letter_img    | UCI, target: Z                | 26:1  | 20,000  | 16  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |22|yeast_me2     | UCI, target: ME2              | 28:1  | 1,484   | 8   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |23|webpage       | LIBSVM, w7a, target: minority | 33:1  | 34,780  | 300 |
    +--+--------------+-------------------------------+-------+---------+-----+
    |24|ozone_level   | UCI, ozone, data              | 34:1  | 2,536   | 72  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |25|mammography   | UCI, target: minority         | 42:1  | 11,183  | 6   |
    +--+--------------+-------------------------------+-------+---------+-----+
    |26|protein_homo  | KDD CUP 2004, minority        | 111:1 | 145,751 | 74  |
    +--+--------------+-------------------------------+-------+---------+-----+
    |27|abalone_19    | UCI, target: 19               | 130:1 | 4,177   | 10  |
    +--+--------------+-------------------------------+-------+---------+-----+

    References
    ----------
    .. [1] Ding, Zejin, "Diversified Ensemble Classifiers for Highly
       Imbalanced Data Learning and their Application in Bioinformatics."
       Dissertation, Georgia State University, (2011).
    """
    data_home = get_data_home(data_home=data_home)
    zenodo_dir = join(data_home, "zenodo")
    datasets = OrderedDict()

    if filter_data is None:
        filter_data_ = MAP_NAME_ID.keys()
    else:
        list_data = MAP_NAME_ID.keys()
        filter_data_ = []
        for it in filter_data:
            if isinstance(it, str):
                if it not in list_data:
                    raise ValueError(
                        f"{it} is not a dataset available. "
                        f"The available datasets are {list_data}"
                    )
                else:
                    filter_data_.append(it)
            elif isinstance(it, int):
                if it < 1 or it > 27:
                    raise ValueError(
                        f"The dataset with the ID={it} is not an "
                        f"available dataset. The IDs are "
                        f"{range(1, 28)}"
                    )
                else:
                    # The index starts at one, so we need to subtract one
                    # to avoid an indexing issue.
                    filter_data_.append(MAP_ID_NAME[it])
            else:
                raise ValueError(
                    f"The value in the tuple should be str or int."
                    f" Got {type(it)} instead."
                )

    # go through the list and check if the data are available
    for it in filter_data_:
        filename = PRE_FILENAME + str(MAP_NAME_ID[it]) + POST_FILENAME
        filename = join(zenodo_dir, filename)
        available = isfile(filename)

        if download_if_missing and not available:
            makedirs(zenodo_dir, exist_ok=True)
            if verbose:
                print("Downloading %s" % URL)
            f = BytesIO(urlopen(URL).read())
            tar = tarfile.open(fileobj=f)
            tar.extractall(path=zenodo_dir)
        elif not download_if_missing and not available:
            raise IOError("Data not found and `download_if_missing` is False")

        data = np.load(filename)
        X, y = data["data"], data["label"]
        if shuffle:
            ind = np.arange(X.shape[0])
            rng = check_random_state(random_state)
            rng.shuffle(ind)
            X = X[ind]
            y = y[ind]
        datasets[it] = Bunch(data=X, target=y, DESCR=it)

    return datasets
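# A minimal usage sketch of the fetcher above. The dataset names and shapes
# come from the table in its docstring; the keyword values are illustrative.
datasets = fetch_datasets(filter_data=("ecoli", "abalone_19"), shuffle=True,
                          random_state=0, verbose=True)
ecoli = datasets["ecoli"]
print(ecoli.data.shape, ecoli.target.shape)  # (336, 7) (336,) per the table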