def load_data(shuffle=True, n_cols=None):
    train_path = get_file('P1B1.train.csv',
                          origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv')
    test_path = get_file('P1B1.test.csv',
                         origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv')

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    df_train = df_train.drop('case_id', 1).astype(np.float32)
    df_test = df_test.drop('case_id', 1).astype(np.float32)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.as_matrix()
    X_test = df_test.as_matrix()

    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    return X_train, X_test
def load_data(shuffle=True, n_cols=None):
    train_path = get_file('P1B2.train.csv',
                          origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.train.csv')
    test_path = get_file('P1B2.test.csv',
                         origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B2/P1B2.test.csv')

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.iloc[:, 2:].as_matrix()
    X_test = df_test.iloc[:, 2:].as_matrix()

    y_train = pd.get_dummies(df_train[['cancer_type']]).as_matrix()
    y_test = pd.get_dummies(df_test[['cancer_type']]).as_matrix()

    return (X_train, y_train), (X_test, y_test)
def load_cell_proteome(ncols=None, scaling='std', add_prefix=True):
    """Load cell line proteome and kinome data, sub-select columns randomly if
    specified, scale the selected data and return a pandas dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """
    path1 = get_file(P1B3_URL + 'nci60_proteome_log2.transposed.tsv')
    path2 = get_file(P1B3_URL + 'nci60_kinome_log2.transposed.tsv')

    df = global_cache.get(path1)
    if df is None:
        df = pd.read_csv(path1, sep='\t', engine='c')
        global_cache[path1] = df

    df_k = global_cache.get(path2)
    if df_k is None:
        df_k = pd.read_csv(path2, sep='\t', engine='c')
        global_cache[path2] = df_k

    df = df.set_index('CellLine')
    df_k = df_k.set_index('CellLine')

    if add_prefix:
        df = df.add_prefix('prot.')
        df_k = df_k.add_prefix('kino.')
    else:
        df_k = df_k.add_suffix('.K')

    df = df.merge(df_k, left_index=True, right_index=True)

    index = df.index.map(lambda x: x.replace('.', ':'))

    total = df.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df = df.iloc[:, usecols]

    df = impute_and_scale(df, scaling)
    df = df.astype(np.float32)
    df.index = index
    df.index.names = ['CELLNAME']
    df = df.reset_index()

    return df
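# Usage sketch (illustrative, not part of the original source): assuming P1B3_URL,
# get_file, impute_and_scale and global_cache are defined in this module and the
# FTP mirror is reachable, a random 500-column slice scaled to [-1, 1] might be
# requested like this:
df_prot = load_cell_proteome(ncols=500, scaling='maxabs', add_prefix=True)
print(df_prot.shape)               # (number of cell lines, 1 + 500 feature columns)
print(df_prot['CELLNAME'].head())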
def _load_fashion_mnist():
    dirname = os.path.join('datasets', 'fashion-mnist')
    base = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
    files = ['train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
             't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz']

    paths = []
    for fname in files:
        paths.append(get_file(fname, origin=base + fname, cache_subdir=dirname))

    with gzip.open(paths[0], 'rb') as lbpath:
        y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[1], 'rb') as imgpath:
        x_train = np.frombuffer(imgpath.read(), np.uint8, offset=16).reshape(len(y_train), 28, 28)

    with gzip.open(paths[2], 'rb') as lbpath:
        y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[3], 'rb') as imgpath:
        x_test = np.frombuffer(imgpath.read(), np.uint8, offset=16).reshape(len(y_test), 28, 28)

    # prevent compatibility issues
    x_train = np.expand_dims(x_train, axis=-1)
    y_train = np.expand_dims(y_train, axis=-1)
    x_test = np.expand_dims(x_test, axis=-1)
    y_test = np.expand_dims(y_test, axis=-1)

    return (x_train, y_train), (x_test, y_test)
def _load_cifar10():
    dirname = 'cifar-10-batches-py'
    origin = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    path = get_file(dirname, origin=origin, untar=True)

    num_train_samples = 50000

    x_train = np.empty((num_train_samples, 3, 32, 32), dtype='uint8')
    y_train = np.empty((num_train_samples,), dtype='uint8')

    for i in range(1, 6):
        fpath = os.path.join(path, 'data_batch_' + str(i))
        (x_train[(i - 1) * 10000:i * 10000, :, :, :],
         y_train[(i - 1) * 10000:i * 10000]) = _load_batch(fpath)

    fpath = os.path.join(path, 'test_batch')
    x_test, y_test = _load_batch(fpath)

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    # make channels last dimension
    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)

    return (x_train, y_train), (x_test, y_test)
def load_data(path="data/imdb.pkl", n_words=100000, maxlen=None, test_split=0.2, seed=113): path = get_file(path, origin="https://s3.amazonaws.com/text-datasets/imdb.pkl") if path.endswith(".gz"): f = gzip.open(path, 'rb') else: f = open(path, 'rb') X, labels = cPickle.load(f) random.seed(seed) random.shuffle(X) random.seed(seed) random.shuffle(labels) f.close() if maxlen: new_X = [] new_labels = [] for x, y in zip(X, labels): if len(x) < maxlen: new_X.append(x) new_labels.append(y) X = new_X labels = new_labels X = [[1 if w >= n_words else w for w in x] for x in X] X_train = X[:int(len(X)*(1-test_split))] y_train = labels[:int(len(X)*(1-test_split))] X_test = X[int(len(X)*(1-test_split)):] y_test = labels[int(len(X)*(1-test_split)):] return (X_train, y_train), (X_test, y_test)
def download_data(self, file_dir, download_dir):
    # Open file path
    imdb_root = os.path.join(file_dir, "aclImdb")

    if not os.path.isdir(imdb_root):
        logger.info("Downloading IMDB dataset")

        if download_dir is None:
            download_dir = os.path.dirname(os.path.normpath(file_dir))

        # ensure directories exist
        if not os.path.isdir(download_dir):
            mkdir_p(download_dir)
        if not os.path.isdir(file_dir):
            mkdir_p(file_dir)

        # download file
        downloaded_file_path = get_file(
            "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
            download_dir)

        # then extract it
        if not os.path.isdir(os.path.join(file_dir, 'aclImdb')):
            logger.info("Extracting IMDB dataset")
            tar = tarfile.open(downloaded_file_path, mode="r:gz")
            tar.extractall(path=file_dir)
            tar.close()

    # output data location
    return imdb_root
def load_data(): """Loads CIFAR10 dataset. # Returns Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. """ dirname = 'cifar-10-batches-py' origin = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' path = get_file(dirname, origin=origin, untar=True) num_train_samples = 50000 x_train = np.zeros((num_train_samples, 3, 32, 32), dtype='uint8') y_train = np.zeros((num_train_samples,), dtype='uint8') for i in range(1, 6): fpath = os.path.join(path, 'data_batch_' + str(i)) data, labels = load_batch(fpath) x_train[(i - 1) * 10000: i * 10000, :, :, :] = data y_train[(i - 1) * 10000: i * 10000] = labels fpath = os.path.join(path, 'test_batch') x_test, y_test = load_batch(fpath) y_train = np.reshape(y_train, (len(y_train), 1)) y_test = np.reshape(y_test, (len(y_test), 1)) if K.image_data_format() == 'channels_last': x_train = x_train.transpose(0, 2, 3, 1) x_test = x_test.transpose(0, 2, 3, 1) return (x_train, y_train), (x_test, y_test)
def load_data(path="data/reuters.pkl", n_words=100000, maxlen=None, test_split=0.2, seed=113): path = get_file( path, origin="https://s3.amazonaws.com/text-datasets/reuters.pkl") if path.endswith(".gz"): f = gzip.open(path, 'rb') else: f = open(path, 'rb') X, labels = cPickle.load(f) f.close() if maxlen: new_X = [] new_labels = [] for x, y in zip(X, labels): if len(x) < maxlen: new_X.append(x) new_labels.append(y) X = new_X labels = new_labels X = [[1 if w >= n_words else w for w in x] for x in X] X_train = X[:int(len(X) * (1 - test_split))] y_train = labels[:int(len(X) * (1 - test_split))] X_test = X[int(len(X) * (1 - test_split)):] y_test = labels[int(len(X) * (1 - test_split)):] return (X_train, y_train), (X_test, y_test)
def load_data(test_split=0.1, seed=113): dirname = "cifar-10-batches-py" origin = "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" path = get_file(dirname, origin=origin, untar=True) nb_samples = 50000 X = np.zeros((nb_samples, 3, 32, 32), dtype="uint8") y = np.zeros((nb_samples, ), dtype="uint8") for i in range(1, 6): fpath = path + '/data_batch_' + str(i) f = open(fpath, 'rb') d = cPickle.load(f) f.close() data = d["data"] labels = d["labels"] data = data.reshape(data.shape[0], 3, 32, 32) X[(i - 1) * 10000:i * 10000, :, :, :] = data y[(i - 1) * 10000:i * 10000] = labels np.random.seed(seed) np.random.shuffle(X) np.random.seed(seed) np.random.shuffle(y) y = np.reshape(y, (len(y), 1)) X_train = X[:int(len(X) * (1 - test_split))] y_train = y[:int(len(X) * (1 - test_split))] X_test = X[int(len(X) * (1 - test_split)):] y_test = y[int(len(X) * (1 - test_split)):] return (X_train, y_train), (X_test, y_test)
def load_data(file_path=None):
    '''
    Function that takes in a path to the Google questions-words.txt
    word analogy file, opens it, removes topic tags and
    returns a list of the analogies

    @Arguments:
        file_path -- (optional) personal system file path to the
            questions-words.txt data set (or others of a similar structure)

        The Questions-Words Dataset is of the following format per row:
            'WordA WordB WordC, WordD'

    @Return:
        A list of strings representing analogies
    '''
    word_analogies = list()

    # Open file path
    if not file_path:
        file_path = get_file("https://word2vec.googlecode.com/svn/trunk/questions-words.txt")

    # Questions word file
    try:
        qw = open(file_path, 'r')
    except IOError, e:
        print "IO Error" + e.code + file_path
def load_data(path="reuters.pkl", nb_words=None, skip_top=0, maxlen=None, test_split=0.2, seed=113): path = get_file(path, origin="https://s3.amazonaws.com/text-datasets/reuters.pkl") f = open(path, 'rb') X, labels = cPickle.load(f) f.close() random.seed(seed) random.shuffle(X) random.seed(seed) random.shuffle(labels) if maxlen: new_X = [] new_labels = [] for x, y in zip(X, labels): if len(x) < maxlen: new_X.append(x) new_labels.append(y) X = new_X labels = new_labels if not nb_words: nb_words = max([max(x) for x in X]) X = [[0 if (w >= nb_words or w < skip_top) else w for w in x] for x in X] X_train = X[:int(len(X)*(1-test_split))] y_train = labels[:int(len(X)*(1-test_split))] X_test = X[int(len(X)*(1-test_split)):] y_test = labels[int(len(X)*(1-test_split)):] return (X_train, y_train), (X_test, y_test)
def load_data(file_path='/data/amazon/reviews_Home_and_Kitchen.json.gz',
              amazon_url="http://snap.stanford.edu/data/amazon/"
                         "productGraph/categoryFiles/"
                         "reviews_Home_and_Kitchen.json.gz"):
    '''
    Function that takes in a path to the Stanford SNAP Amazon review
    data, opens it, and yields the text and sentiment for each review

    @Arguments:
        file_path -- (optional) personal system file path to the
            SNAP Stanford data set (or others of a similar structure)

        amazon_url -- (optional) URI of data set, in case it needs to be
            downloaded. Defaults to Home and Kitchen reviews

    @Return:
        A generator over (review text, sentiment) tuples, one per Amazon review
    '''
    # Open file path
    if not os.path.isfile(file_path):
        file_path = get_file(amazon_url, os.path.dirname(file_path))

    # Parse Amazon Reviews GZip file
    with gzip.open(file_path, 'r') as f:
        for l in f:
            try:
                review_text, sentiment = process_amazon_json(l)
                yield review_text.decode("latin1"), sentiment
            except BoringException as e:
                logger.info(e)
                continue
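# Usage sketch (illustrative, not part of the original source): assuming the helpers
# referenced above (get_file, process_amazon_json, BoringException, logger) are
# available in this module, the generator can be consumed lazily, e.g. to peek at a
# few reviews without reading the whole corpus:
import itertools

for review_text, sentiment in itertools.islice(load_data(), 5):
    print(sentiment, review_text[:80])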
def load_data(test_split=0.1, seed=113): dirname = "cifar-10-batches-py" origin = "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" path = get_file(dirname, origin=origin, untar=True) nb_samples = 50000 X = np.zeros((nb_samples, 3, 32, 32), dtype="uint8") y = np.zeros((nb_samples,)) for i in range(1, 6): fpath = path + '/data_batch_' + str(i) f = open(fpath, 'rb') d = cPickle.load(f) f.close() data = d["data"] labels = d["labels"] data = data.reshape(data.shape[0], 3, 32, 32) X[(i-1)*10000:i*10000, :, :, :] = data y[(i-1)*10000:i*10000] = labels np.random.seed(seed) np.random.shuffle(X) np.random.seed(seed) np.random.shuffle(y) y = np.reshape(y, (len(y), 1)) X_train = X[:int(len(X)*(1-test_split))] y_train = y[:int(len(X)*(1-test_split))] X_test = X[int(len(X)*(1-test_split)):] y_test = y[int(len(X)*(1-test_split)):] return (X_train, y_train), (X_test, y_test)
def load_data(file_dir="./.downloads", download_dir="./.downloads"): ''' Function that yields records from the IMDB reviews dataset @Arguments: file_dir -- personal system file path to the unzipped IMDB data set (so, a directory). If this does not exist, the archive will be downloaded and unzipped here download_dir -- what directory to download the actual archive to? Can be None, in which case it defaults to the parent directory of file_path. The archive will only be downloaded if necessary @Return: A generator over a tuples of Movie reviews and their sentiment ''' # Open file path imdb_root = os.path.join(file_dir, "aclImdb") if not os.path.isdir(imdb_root): logger.info("Downloading IMDB dataset") if download_dir is None: download_dir = os.path.dirname(os.path.normpath(file_dir)) downloaded_file_path = get_file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", download_dir) # then extract it if not os.path.isdir(os.path.join(file_dir, 'aclImdb')): logger.info("Extracting IMDB dataset") tar = tarfile.open(downloaded_file_path, mode="r:gz") tar.extractall(path=file_dir) tar.close() imdb_train = os.path.join(imdb_root, "train") imdb_test = os.path.join(imdb_root, "test") imdb_train_pos = os.path.join(imdb_train, "pos") imdb_train_neg = os.path.join(imdb_train, "neg") imdb_test_pos = os.path.join(imdb_test, "pos") imdb_test_neg = os.path.join(imdb_test, "neg") # Specifies positive and negative files pos_train = os.listdir(imdb_train_pos) pos_train = [(os.path.join(imdb_train_pos, file_name), pos_label) for file_name in pos_train] pos_test = os.listdir(imdb_test_pos) pos_test = [(os.path.join(imdb_test_pos, file_name), pos_label) for file_name in pos_test] neg_train = os.listdir(imdb_train_neg) neg_train = [(os.path.join(imdb_train_neg, file_name), neg_label) for file_name in neg_train] neg_test = os.listdir(imdb_test_neg) neg_test = [(os.path.join(imdb_test_neg, file_name), neg_label) for file_name in neg_test] all_data = pos_train + pos_test + neg_train + neg_test # Combines data and shuffles it. random.shuffle(all_data) for (file_path, sentiment) in all_data: # Open the movie review f = open(file_path, 'r') yield (f.read().decode('utf-8'), sentiment) # Closes f on the following next() call by user f.close()
def load_dose_response(min_logconc=-4., max_logconc=-4., subsample=None, fraction=False):
    """Load cell line response to different drug compounds, sub-select response for a specific
    drug log concentration range and return a pandas dataframe.

    Parameters
    ----------
    min_logconc : -3, -4, -5, -6, -7, optional (default -4)
        min log concentration of drug to return cell line growth
    max_logconc : -3, -4, -5, -6, -7, optional (default -4)
        max log concentration of drug to return cell line growth
    subsample: None, 'naive_balancing' (default None)
        subsampling strategy to use to balance the data based on growth
    fraction: bool (default False)
        divide growth percentage by 100
    """
    path = get_file(P1B3_URL + 'NCI60_dose_response_with_missing_z5_avg.csv')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep=',', engine='c',
                         na_values=['na', '-', ''],
                         dtype={'NSC': object, 'CELLNAME': str,
                                'LOG_CONCENTRATION': np.float32,
                                'GROWTH': np.float32})
        global_cache[path] = df

    df = df[(df['LOG_CONCENTRATION'] >= min_logconc) & (df['LOG_CONCENTRATION'] <= max_logconc)]
    df = df[['NSC', 'CELLNAME', 'GROWTH', 'LOG_CONCENTRATION']]

    if subsample and subsample == 'naive_balancing':
        df1 = df[df['GROWTH'] <= 0]
        df2 = df[(df['GROWTH'] > 0) & (df['GROWTH'] < 50)].sample(frac=0.7, random_state=SEED)
        df3 = df[(df['GROWTH'] >= 50) & (df['GROWTH'] <= 100)].sample(frac=0.18, random_state=SEED)
        df4 = df[df['GROWTH'] > 100].sample(frac=0.01, random_state=SEED)
        df = pd.concat([df1, df2, df3, df4])

    if fraction:
        df['GROWTH'] /= 100

    df = df.set_index(['NSC'])

    return df
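# Usage sketch (illustrative, not part of the original source): assuming P1B3_URL,
# get_file, global_cache and SEED are defined in this module, a wider concentration
# window with naive balancing and fractional growth could be requested like this:
df_response = load_dose_response(min_logconc=-5., max_logconc=-4.,
                                 subsample='naive_balancing', fraction=True)
print(df_response.shape)
print(df_response[['CELLNAME', 'GROWTH', 'LOG_CONCENTRATION']].head())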
def download_data(self,
                  file_path='/data/amazon/reviews_Home_and_Kitchen.json.gz',
                  amazon_url="http://snap.stanford.edu/data/amazon/"
                             "productGraph/categoryFiles/"
                             "reviews_Home_and_Kitchen.json.gz"):
    # download data if necessary
    filename_url = os.path.basename(amazon_url)
    dir_data = os.path.dirname(file_path)

    if not os.path.isfile(file_path):
        file_downloaded = get_file(amazon_url, dir_data)
        shutil.move(os.path.join(dir_data, filename_url), file_path)

    # return parent data directory
    return dir_data
def download_data(self, file_path):
    # download file
    if not os.path.isfile(file_path):
        # download and save file from internet
        logger.info("Downloading {}...".format(file_path))
        file_downloaded = get_file("http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip")

        # extract csv
        filename = 'training.1600000.processed.noemoticon.csv'
        file_dir = os.path.dirname(file_path)
        with ZipFile(file_downloaded, 'r') as zp:
            zp.extract(filename, path=file_dir)
        shutil.move(os.path.join(file_dir, filename), file_path)
def load_data(file_path=None, download_path="./.downloads", dest_path="./.downloads"):
    """
    Function that takes in a path to the IMDB movie review dataset
    archive, downloads and extracts it if necessary, and yields the
    individual reviews with their sentiment labels

    @Arguments:
        file_path -- (optional) personal system file path to the
            IMDB data set in gzip form (or others of a similar structure)

    @Return:
        A generator over tuples of movie reviews and their sentiment
    """
    # Open file path
    if not file_path:
        print "Downloading IMDB dataset"
        file_path = get_file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                             download_path)

    # If file has not been extracted, then extract it
    # to the downloads folder. This will save a lot of time
    if not os.path.isdir(os.path.join(dest_path, "aclImdb")):
        print ("Extracting IMDB dataset")
        tar = tarfile.open(file_path, mode="r:gz")
        tar.extractall(path=dest_path)
        tar.close()

    # Specifies positive and negative files
    pos_train = os.listdir("./.downloads/aclImdb/train/pos")
    pos_train = [(os.path.join("./.downloads/aclImdb/train/pos", file_name), pos_label)
                 for file_name in pos_train]

    pos_test = os.listdir("./.downloads/aclImdb/test/pos")
    pos_test = [(os.path.join("./.downloads/aclImdb/test/pos", file_name), pos_label)
                for file_name in pos_test]

    neg_train = os.listdir("./.downloads/aclImdb/train/neg")
    neg_train = [(os.path.join("./.downloads/aclImdb/train/neg", file_name), neg_label)
                 for file_name in neg_train]

    neg_test = os.listdir("./.downloads/aclImdb/test/neg")
    neg_test = [(os.path.join("./.downloads/aclImdb/test/neg", file_name), neg_label)
                for file_name in neg_test]

    all_data = pos_train + pos_test + neg_train + neg_test

    # Combines data and shuffles it.
    random.shuffle(all_data)

    for (file_path, sentiment) in all_data:
        # Open the movie review
        f = open(file_path, "r")
        yield (f.read(), sentiment)
        # Closes f on the following next() call by user
        f.close()
def load_data(file_path=None):
    '''
    Function that takes in a path to the IMDB movie review dataset
    archive, downloads and extracts it if necessary, and yields the
    individual reviews with their sentiment labels

    @Arguments:
        file_path -- (optional) personal system file path to the
            IMDB data set in gzip form (or others of a similar structure)

    @Return:
        A generator over tuples of movie reviews and their sentiment
    '''
    # Open file path
    if not file_path:
        print "Downloading IMDB dataset"
        file_path = get_file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz")

    # If file has not been extracted, then extract it
    # to the downloads folder. This will save a lot of time
    if not os.path.isdir('./.downloads/aclImdb'):
        print("Extracting IMDB dataset")
        tar = tarfile.open(file_path, mode="r:gz")
        tar.extractall(path="./.downloads")
        tar.close()

    # Specifies positive and negative files
    pos_train = os.listdir('./.downloads/aclImdb/train/pos')
    pos_train = [(os.path.join('./.downloads/aclImdb/train/pos', file_name), 'pos')
                 for file_name in pos_train]

    pos_test = os.listdir('./.downloads/aclImdb/test/pos')
    pos_test = [(os.path.join('./.downloads/aclImdb/test/pos', file_name), 'pos')
                for file_name in pos_test]

    neg_train = os.listdir('./.downloads/aclImdb/train/neg')
    neg_train = [(os.path.join('./.downloads/aclImdb/train/neg', file_name), 'neg')
                 for file_name in neg_train]

    neg_test = os.listdir('./.downloads/aclImdb/test/neg')
    neg_test = [(os.path.join('./.downloads/aclImdb/test/neg', file_name), 'neg')
                for file_name in neg_test]

    all_data = pos_train + pos_test + neg_train + neg_test

    # Combines data and shuffles it.
    random.shuffle(all_data)

    for (file_path, sentiment) in all_data:
        # Open the movie review
        f = open(file_path, 'r')
        yield (f.read(), sentiment)
        # Closes f on the following next() call by user
        f.close()
def load_data(path="mnist.pkl.gz"): path = get_file(path, origin="https://s3.amazonaws.com/img-datasets/mnist.pkl.gz") if path.endswith(".gz"): f = gzip.open(path, 'rb') else: f = open(path, 'rb') if sys.version_info < (3,): data = cPickle.load(f) else: data = cPickle.load(f, encoding="bytes") f.close() return data # (X_train, y_train), (X_test, y_test)
def get_word_index(path='imdb_word_index.json'):
    """Retrieves the dictionary mapping words to their integer indices.

    Arguments:
        path: where to cache the data (relative to `~/.keras/dataset`).

    Returns:
        The word index dictionary.
    """
    path = get_file(
        path,
        origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json')
    f = open(path)
    data = json.load(f)
    f.close()
    return data
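# Usage sketch (illustrative, not part of the original source): the JSON maps each
# word to an integer index, so decoding an encoded review requires inverting the
# dict. Whether the lowest indices are reserved for padding/start/unknown tokens
# depends on how the reviews were encoded, so no offset is assumed here:
word_index = get_word_index()
index_to_word = {index: word for word, index in word_index.items()}

encoded_review = [14, 22, 16, 43, 530]          # hypothetical encoded review
decoded = ' '.join(index_to_word.get(i, '?') for i in encoded_review)
print(decoded)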
def load_data(path="mnist.pkl.gz"): path = get_file( path, origin="https://s3.amazonaws.com/img-datasets/mnist.pkl.gz") if path.endswith(".gz"): f = gzip.open(path, 'rb') else: f = open(path, 'rb') if sys.version_info < (3, ): data = cPickle.load(f) else: data = cPickle.load(f, encoding="bytes") f.close() return data # (X_train, y_train), (X_test, y_test)
def _load_mnist(path='mnist.npz'):
    path = get_file(path,
                    origin='https://s3.amazonaws.com/img-datasets/mnist.npz',
                    file_hash='8a61469f7ea1b51cbae51d4f78837e45')
    f = np.load(path)
    x_train, y_train = f['x_train'], f['y_train']
    x_test, y_test = f['x_test'], f['y_test']
    f.close()

    # prevent compatibility issues
    x_train = np.expand_dims(x_train, axis=-1)
    y_train = np.expand_dims(y_train, axis=-1)
    x_test = np.expand_dims(x_test, axis=-1)
    y_test = np.expand_dims(y_test, axis=-1)

    return (x_train, y_train), (x_test, y_test)
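# Usage sketch (illustrative, not part of the original source): assuming get_file
# and numpy are imported as in the loaders above, a common follow-up is to scale
# the images to [0, 1] before training:
(x_train, y_train), (x_test, y_test) = _load_mnist()
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0
print(x_train.shape, y_train.shape)   # (60000, 28, 28, 1) (60000, 1)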
def download_data(file_path):
    url_weibo = "http://weiboscope.jmsc.hku.hk/datazip/week{}.zip"

    if not os.path.exists(file_path) or not check_for_csvs(file_path):
        # download repository files and unzip them
        try:
            os.makedirs(file_path)
        except OSError as e:
            logger.debug(e)
            if not os.path.isdir(file_path):
                raise

        for remote_path in [url_weibo.format(a) for a in [str(b) for b in range(1, 52)]]:
            local_zip = get_file(remote_path, file_path)
            with ZipFile(local_zip) as zf:
                zf.extractall(file_path)
def load_data(): """Loads the MNIST dataset. # Arguments path: path where to cache the dataset locally (relative to ~/.evolutionary-learning/datasets). # Returns Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`. """ path = get_file('mnist.npz', origin='https://s3.amazonaws.com/img-datasets/mnist.npz', file_hash='8a61469f7ea1b51cbae51d4f78837e45') f = np.load(path) x_train, y_train = f['x_train'], f['y_train'] x_test, y_test = f['x_test'], f['y_test'] f.close() return (x_train, y_train), (x_test, y_test)
def load_data(path='/home/inorganic-bandstructure/band-inversion/band_inv-01.jpg'):
    """Loads the MNIST dataset.

    # Arguments
        path: path where to cache the dataset locally
            (relative to ~/.keras/datasets).

    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    """
    path = get_file(path,
                    '/home/inorganic-bandstructure/band-inversion/band_inv-01.jpg',
                    file_hash='adh340')
    f = np.load(path)
    x_train, y_train = f['x_train'], f['y_train']
    x_test, y_test = f['x_test'], f['y_test']
    f.close()
    return (x_train, y_train), (x_test, y_test)
def load_drug_descriptors(ncols=None, scaling='std', add_prefix=True):
    """Load drug descriptor data, sub-select columns of drugs descriptors
    randomly if specified, impute and scale the selected data, and return a
    pandas dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns (drugs descriptors) to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """
    path = get_file(P1B3_URL + 'descriptors.2D-NSC.5dose.filtered.txt')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep='\t', engine='c',
                         na_values=['na', '-', ''],
                         dtype=np.float32)
        global_cache[path] = df

    df1 = pd.DataFrame(df.loc[:, 'NAME'].astype(int).astype(str))
    df1.rename(columns={'NAME': 'NSC'}, inplace=True)

    df2 = df.drop('NAME', 1)
    if add_prefix:
        df2 = df2.add_prefix('dragon7.')

    total = df2.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df2 = df2.iloc[:, usecols]

    df2 = impute_and_scale(df2, scaling)
    df2 = df2.astype(np.float32)

    df_dg = pd.concat([df1, df2], axis=1)

    return df_dg
def get_list_of_data_files(GP):
    import pilot2_datasets as p2
    reload(p2)
    print ('Reading Data...')

    ## Identify the data set selected
    data_set = p2.data_sets[GP['set_sel']][0]

    ## Get the MD5 hash for the proper data set
    data_hash = p2.data_sets[GP['set_sel']][1]
    print ('Reading Data Files... %s->%s' % (GP['set_sel'], data_set))

    ## Check if the data files are in the data directory, otherwise fetch from FTP
    data_file = get_file(data_set,
                         origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot2/' + data_set + '.tar.gz',
                         untar=True, md5_hash=data_hash)
    data_dir = os.path.join(os.path.dirname(data_file), data_set)

    ## Make a list of all of the data files in the data set
    data_files = glob.glob('%s/*.npz' % data_dir)

    fields = p2.gen_data_set_dict()

    return (data_files, fields)
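# Usage sketch (illustrative, not part of the original source): assuming the
# pilot2_datasets module and a GP dictionary with 'set_sel' are set up as in the
# Pilot2 scripts, the returned file list can be iterated and each .npz archive
# loaded with numpy:
data_files, fields = get_list_of_data_files(GP)
for npz_path in data_files[:2]:
    with np.load(npz_path) as archive:
        print(npz_path, list(archive.keys()))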
def _load_freyface(path='frey_rawface.mat'):
    img_dims = [28, 20]
    train_size = 1685  # ???

    path = get_file(path, origin='https://cs.nyu.edu/~roweis/data/frey_rawface.mat')
    f = loadmat(path)

    x_train = f['ff'][:, :train_size]
    x_test = f['ff'][:, train_size:]

    # reformat data to match expected format
    x_train = x_train.transpose()
    x_train = np.reshape(x_train, tuple([train_size] + img_dims), order='C')
    x_train = np.expand_dims(x_train, axis=-1)

    x_test = x_test.transpose()
    x_test = np.reshape(x_test, tuple([x_test.shape[0]] + img_dims), order='C')
    x_test = np.expand_dims(x_test, axis=-1)

    return (x_train, np.zeros(shape=(train_size, 1), dtype=np.uint8)), \
           (x_test, np.zeros(shape=(x_test.shape[0], 1), dtype=np.uint8))
def load_cell_expression_5platform(ncols=None, scaling='std', add_prefix=True):
    """Load 5-platform averaged cell line expression data, sub-select
    columns of gene expression randomly if specified, scale the selected
    data and return a pandas dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns (gene expression) to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """
    path = get_file(P1B3_URL + 'RNA_5_Platform_Gene_Transcript_Averaged_intensities.transposed.txt')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep='\t', engine='c',
                         na_values=['na', '-', ''])
        global_cache[path] = df

    df1 = df['CellLine']
    df1 = df1.map(lambda x: x.replace('.', ':'))
    df1.name = 'CELLNAME'

    df2 = df.drop('CellLine', 1)
    if add_prefix:
        df2 = df2.add_prefix('expr_5p.')

    total = df2.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df2 = df2.iloc[:, usecols]

    df2 = impute_and_scale(df2, scaling)
    df2 = df2.astype(np.float32)

    df = pd.concat([df1, df2], axis=1)

    return df
def load_smiles(verbose=False):
    """(ap) Load SMILES data (Simplified Molecular-Input Line-Entry System).

    Args:

    Returns:
    """
    path = get_file(P1B3_URL + 'ChemStructures_Consistent.smiles')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep='\t', engine='c', dtype=np.str)  # (ap) update this command
        global_cache[path] = df

    # TODO maybe do some processing (data augmentation; check if strings are valid)
    df_smiles = df

    if verbose:
        print('SMILES shape {}'.format(df_smiles.shape))
        print('SMILES columns {}'.format(df.columns))

    return df_smiles
def load_data(file_path=None, verbose=False):
    '''
    Function that takes in a path to the Stanford SNAP Amazon review
    data, opens it, and yields a dictionary of information for each review

    @Arguments:
        file_path -- (optional) personal system file path to the
            SNAP Stanford data set (or others of a similar structure)

    @Return:
        A generator over dictionaries of each Amazon review
    '''
    # Open file path
    if not file_path:
        file_path = get_file("https://snap.stanford.edu/data/amazon/all.txt.gz")

    # Parse Amazon Reviews GZip file -- taken from Stanford SNAP page
    try:
        f = gzip.open(file_path, 'r')
    except IOError, e:
        print "IO Error", e.code, file_path
def _load_cifar_100(label_mode='fine'):
    if label_mode not in ['fine', 'coarse']:
        raise ValueError('`label_mode` must be one of `"fine"`, `"coarse"`.')

    dirname = 'cifar-100-python'
    origin = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
    path = get_file(dirname, origin=origin, untar=True)

    fpath = os.path.join(path, 'train')
    x_train, y_train = _load_batch(fpath, label_key=label_mode + '_labels')

    fpath = os.path.join(path, 'test')
    x_test, y_test = _load_batch(fpath, label_key=label_mode + '_labels')

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    # make channels last dimension
    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)

    return (x_train, y_train), (x_test, y_test)
def load_cell_expression_u133p2(ncols=None, scaling='std', add_prefix=True):
    """Load U133_Plus2 cell line expression data prepared by Judith,
    sub-select columns of gene expression randomly if specified,
    scale the selected data and return a pandas dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns (gene expression) to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """
    path = get_file('http://bioseed.mcs.anl.gov/~fangfang/p1h/GSE32474_U133Plus2_GCRMA_gene_median.txt')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, sep='\t', engine='c')
        global_cache[path] = df

    df1 = df['CELLNAME']
    df2 = df.drop('CELLNAME', 1)
    if add_prefix:
        df2 = df2.add_prefix('expr.')

    total = df2.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df2 = df2.iloc[:, usecols]

    df2 = impute_and_scale(df2, scaling)
    df2 = df2.astype(np.float32)

    df = pd.concat([df1, df2], axis=1)

    return df
def load_drug_autoencoded_AG(ncols=None, scaling='std', add_prefix=True):
    """Load drug latent representation from Aspuru-Guzik's variational
    autoencoder, sub-select columns of drugs randomly if specified, impute
    and scale the selected data, and return a pandas dataframe.

    Parameters
    ----------
    ncols : int or None
        number of columns (drug latent representations) to randomly subselect (default None : use all data)
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    add_prefix: True or False
        add feature namespace prefix
    """
    path = get_file(P1B3_URL + 'Aspuru-Guzik_NSC_latent_representation_292D.csv')

    df = global_cache.get(path)
    if df is None:
        df = pd.read_csv(path, engine='c', dtype=np.float32)
        global_cache[path] = df

    df1 = pd.DataFrame(df.loc[:, 'NSC'].astype(int).astype(str))
    df2 = df.drop('NSC', 1)
    if add_prefix:
        df2 = df2.add_prefix('smiles_latent_AG.')

    total = df2.shape[1]
    if ncols and ncols < total:
        usecols = np.random.choice(total, size=ncols, replace=False)
        df2 = df2.iloc[:, usecols]

    df2 = impute_and_scale(df2, scaling)
    df2 = df2.astype(np.float32)

    df = pd.concat([df1, df2], axis=1)

    return df
def load_data(file_path, which_set='train', form='pinyin', train_pct=1.0,
              nr_records=None, rng_seed=None, min_length=None, max_length=None,
              pad_out=False):
    """
    Load data from Open Weiboscope corpus of Sina Weibo posts. Options are
    available for encoding of returned text data.

    @Arguments:
        file_path -- path to downloaded, unzipped Open Weiboscope
            data (a directory). If this path does not exist or is not given,
            load_data will create the path and download the data (string)
        which_set -- whether to iterate over train or testing set. You should
            also set train_pct and rng_seed to non-default values if you
            specify this (string)
        form -- return results in hanzi or pinyin romanization?
            can take values of 'hanzi', 'pinyin' (string)
        train_pct -- what percent of dataset should go to training
            (remainder goes to test)? (float)
        nr_records -- if not None, gives the maximum number of records this
            generator should yield. will yield fewer records if the corpus is
            exhausted before nr_records records are yielded
        rng_seed -- value for seeding random number generator
        min_length -- enforce a minimum length, in characters, for the
            dataset? Counted in hanzi for form='hanzi' and in roman characters
            for form='pinyin'. Texts that are too short will be excluded. (int)
        max_length -- enforce a maximum length, in characters, for the
            dataset? Counted in hanzi or roman characters as appropriate
            (see above). Texts that are too long will be truncated at the
            end. (int)
        pad_out -- for texts shorter than max_length, should they be padded
            out at the end with spaces?

    @Return:
        a generator over tuples of post text (unicode or numpy array)
        and whether or not the post was deleted (bool)
    """
    if not os.path.exists(file_path):
        # download repository files and unzip them
        os.makedirs(file_path)
        for remote_path in ["http://weiboscope.jmsc.hku.hk/datazip/week{}.zip".format(a)
                            for a in [str(b) for b in range(1, 52)]]:
            local_zip = get_file(remote_path, file_path)
            with ZipFile(local_zip) as zf:
                zf.extractall(file_path)

    # get list of weekNN.csv files at file_path
    ow_files = [os.path.join(file_path, f) for f in os.listdir(file_path)
                if re.match(r"week[0-9]{,2}\.csv", f) is not None]
    assert ow_files

    # strategy: randomize order of weeks (individual files),
    # sample in order from each week.
    try:
        random.seed(rng_seed)
    except:
        pass
    random.shuffle(ow_files)

    split_on = int(len(ow_files) * train_pct)
    data_sets = {}
    logger.debug("Shuffle order: {}, split on {}".format(ow_files, split_on))
    data_sets['train'], data_sets['test'] = ow_files[:split_on], ow_files[split_on:]
    logger.debug(data_sets)

    nr_yielded = 0
    for table_path in data_sets[which_set]:
        with codecs.open(table_path, "r", encoding="utf-8") as f:
            logging.debug("In file {}".format(table_path))
            for line in unicode_csv_reader(f):
                try:
                    records_split = line
                    post_id = records_split[0]
                    if len(records_split) != 11:
                        raise BadRecordException("Comma split error on mid={} in "
                                                 "file {} (len of record: {})".format(
                                                     post_id,
                                                     os.path.basename(table_path),
                                                     len(records_split)))

                    # different fields of post record
                    post_text = records_split[6]
                    post_retweeted = records_split[1] != ''
                    post_deleted = records_split[9] != ''

                    if not post_retweeted:
                        if form == 'hanzi':
                            record_txt, sentiment = enforce_length(
                                post_text, min_length, max_length, pad_out), post_deleted
                            yield record_txt, sentiment
                        elif form == 'pinyin':
                            record_txt, sentiment = enforce_length(
                                romanize_tweet(post_text), min_length, max_length,
                                pad_out), post_deleted
                            yield record_txt, sentiment
                        else:
                            raise Exception("Unknown form '{}' (should be 'hanzi' "
                                            "or 'pinyin')".format(form))

                    # limit number of records retrieved?
                    nr_yielded += 1
                    if nr_records is not None and nr_yielded >= nr_records:
                        raise StopIteration()

                # log various exception cases from loop body
                except TextTooShortException:
                    logger.info("Record {} thrown out (too short)".format(post_id))
                except BadRecordException as e:
                    logger.info(e)
                except IndexError as e:
                    logger.info(e)
                except UnicodeEncodeError as e:
                    logger.info(e)
                except GeneratorExit:
                    return
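# Usage sketch (illustrative, not part of the original source): assuming the helper
# functions referenced above (unicode_csv_reader, enforce_length, romanize_tweet,
# the custom exceptions and logger) are defined in this module, the generator can be
# consumed like any other iterator; the directory below is only a placeholder:
import itertools

weibo_train = load_data('/path/to/weiboscope', which_set='train', form='pinyin',
                        train_pct=0.8, rng_seed=42, nr_records=1000,
                        min_length=10, max_length=200, pad_out=True)

for text, was_deleted in itertools.islice(weibo_train, 3):
    print(was_deleted, text[:60])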
def get_p1_file(link):
    fname = os.path.basename(link)
    return get_file(fname, origin=link, cache_subdir='Pilot1')
    @Return:
        A list of tuples of the following format:
            (tweets/features, sentiment label)
    '''
    tweet_to_sentiment = list()

    # Open file path
    if file_path:
        try:
            twitter_csv = open(file_path, 'r')
        except IOError, e:
            print "IO Error:", e.code, file_path
    else:
        # Downloads and saves locally the zip file from internet
        file_path = get_file("http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip")
        with ZipFile(file_path, 'r') as zp:
            twitter_csv = zp.open('training.1600000.processed.noemoticon.csv')

    # Perform parsing of CSV file
    reader = latin_csv_reader(twitter_csv, delimiter=',')
    for i, tweet in enumerate(reader):
        # Prints progress every 10000 words read
        if verbose and i % 10000 == 0:
            logging.info("PROGRESS: at tweet #%s", i)

        # Gets tweets string from line in csv
        tweet_string = tweet[5]
        # Gets feature from Sentiment dictionary
        sent = Sentiment[int(tweet[0])]
## Import keras modules
from keras.optimizers import SGD, RMSprop, Adam
from keras.datasets import mnist
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
from keras import callbacks
from keras.layers.advanced_activations import ELU
from keras.preprocessing.image import ImageDataGenerator

batch_size = GP['batch_size']

##### Read Data ########
print ('Reading Data...')

datagen = hf.ImageNoiseDataGenerator(corruption_level=GP['noise_factor'])

data_set = p2.data_sets[opts.set_sel][0]
data_hash = p2.data_sets[opts.set_sel][1]

print ('Reading Data Files... %s->%s' % (opts.set_sel, data_set))

data_file = get_file(data_set,
                     origin='http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/Pilot2/' + data_set + '.tar.gz',
                     untar=True, md5_hash=data_hash)
data_dir = os.path.join(os.path.dirname(data_file), data_set)

data_files = glob.glob('%s/*.npy' % data_dir)

X = np.load(data_files[0])
data = hf.get_data(X, case=opts.case)
X_train, y_train = hf.create_dataset(data, GP['look_back'], look_forward=GP['look_forward'])

## convert data to a sequence
temporal_dim = X_train.shape[1]
input_dim = X_train.shape[2]

print('X_train type and shape:', X_train.dtype, X_train.shape)
print('X_train.min():', X_train.min())
print('X_train.max():', X_train.max())

### Define Model, Solver and Compile ##########
print ('Define the model and compile')
def do_10_fold():
    shared_nnet_spec = [1200]
    individual_nnet_spec0 = [1200, 1200]
    individual_nnet_spec1 = [1200, 1200]
    individual_nnet_spec2 = [1200, 1200]
    individual_nnet_spec = [individual_nnet_spec0, individual_nnet_spec1, individual_nnet_spec2]

    learning_rate = 0.01
    batch_size = 10
    n_epochs = 10
    dropout = 0.0

    truth0 = []
    pred0 = []
    truth1 = []
    pred1 = []
    truth2 = []
    pred2 = []

    ## Read files
    file_path = os.path.dirname(os.path.realpath(__file__))
    print file_path

    lib_path = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
    sys.path.append(lib_path)

    from data_utils import get_file

    origin = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P3B1/P3B1_data.tgz'
    data_loc = get_file('P3B1_data.tgz', origin, untar=True, md5_hash=None, cache_subdir='P3B1')
    print 'Data downloaded and stored at: ' + data_loc

    data_path = os.path.dirname(data_loc)
    print data_path

    for fold in range(1):
        feature_train_0 = np.genfromtxt(data_path + '/task0_' + str(fold) + '_train_feature.csv', delimiter=',')
        truth_train_0 = np.genfromtxt(data_path + '/task0_' + str(fold) + '_train_label.csv', delimiter=',')
        feature_test_0 = np.genfromtxt(data_path + '/task0_' + str(fold) + '_test_feature.csv', delimiter=',')
        truth_test_0 = np.genfromtxt(data_path + '/task0_' + str(fold) + '_test_label.csv', delimiter=',')

        feature_train_1 = np.genfromtxt(data_path + '/task1_' + str(fold) + '_train_feature.csv', delimiter=',')
        truth_train_1 = np.genfromtxt(data_path + '/task1_' + str(fold) + '_train_label.csv', delimiter=',')
        feature_test_1 = np.genfromtxt(data_path + '/task1_' + str(fold) + '_test_feature.csv', delimiter=',')
        truth_test_1 = np.genfromtxt(data_path + '/task1_' + str(fold) + '_test_label.csv', delimiter=',')

        feature_train_2 = np.genfromtxt(data_path + '/task2_' + str(fold) + '_train_feature.csv', delimiter=',')
        truth_train_2 = np.genfromtxt(data_path + '/task2_' + str(fold) + '_train_label.csv', delimiter=',')
        feature_test_2 = np.genfromtxt(data_path + '/task2_' + str(fold) + '_test_feature.csv', delimiter=',')
        truth_test_2 = np.genfromtxt(data_path + '/task2_' + str(fold) + '_test_label.csv', delimiter=',')

        features_train = [feature_train_0, feature_train_1, feature_train_2]
        truths_train = [truth_train_0, truth_train_1, truth_train_2]
        features_test = [feature_test_0, feature_test_1, feature_test_2]
        truths_test = [truth_test_0, truth_test_1, truth_test_2]

        ret = run_mtl(features_train=features_train,
                      truths_train=truths_train,
                      features_test=features_test,
                      truths_test=truths_test,
                      shared_nnet_spec=shared_nnet_spec,
                      individual_nnet_spec=individual_nnet_spec,
                      learning_rate=learning_rate,
                      batch_size=batch_size,
                      n_epochs=n_epochs,
                      dropout=dropout)

        truth0.extend(ret[0][0])
        pred0.extend(ret[0][1])
        truth1.extend(ret[1][0])
        pred1.extend(ret[1][1])
        truth2.extend(ret[2][0])
        pred2.extend(ret[2][1])

    print 'Task 1: Primary site - Macro F1 score', f1_score(truth0, pred0, average='macro')
    print 'Task 1: Primary site - Micro F1 score', f1_score(truth0, pred0, average='micro')
    print 'Task 2: Tumor laterality - Macro F1 score', f1_score(truth1, pred1, average='macro')
    print 'Task 2: Tumor laterality - Micro F1 score', f1_score(truth1, pred1, average='micro')
    print 'Task 3: Histological grade - Macro F1 score', f1_score(truth2, pred2, average='macro')
    print 'Task 3: Histological grade - Micro F1 score', f1_score(truth2, pred2, average='micro')
def get_word_index(path="reuters_word_index.pkl"): path = get_file(path, origin="https://s3.amazonaws.com/text-datasets/reuters_word_index.pkl") f = open(path, 'rb') return cPickle.load(f)