def _flatten_samples(samples):
    """Return *samples* as a 2-D array holding one flattened row per sample."""
    if len(samples.shape) > 2:
        # -1 lets numpy derive the row length, whatever the sample rank is.
        return np.reshape(samples, (samples.shape[0], -1))
    # Already one row per sample: hand the array back untouched.
    return samples


def dispBanner(stage):
    """Print the banner for an experiment stage and load that stage's data.

    Parameters
    ----------
    stage : str
        ``'MNIST'`` loads the MNIST digits through ``ms.MNISTcontrol``.
        Any other value loads four categories of the 20 newsgroups corpus
        and waits for the user to press enter before returning.

    Returns
    -------
    tuple
        ``(trX_images, trX, trY, deX, deY)``.

        * MNIST: ``trX_images`` is the training array exactly as returned by
          ``load_mnist('training')``; ``trX`` / ``deX`` are the training /
          test arrays flattened to one row per sample; ``trY`` / ``deY`` are
          the labels as returned by the loader.
        * 20 newsgroups: ``trX_images`` is the list of raw training
          documents; ``trX`` / ``deX`` are TF-IDF matrices that share one
          vocabulary; ``trY`` / ``deY`` are the targets reshaped to
          ``(n_samples, 1)``.
    """
    if stage == 'MNIST':
        print('************************************************************************')
        print('** MNIST DIGITS EXPERIMENTS')
        print('************************************************************************')
        print('First we load the data ')
        dataset = ms.MNISTcontrol("../MNIST/")
        trX_images, trY = dataset.load_mnist('training')
        # The learners need one flat feature vector per sample.  Using the
        # helper also guarantees trX is bound when the data is already 2-D.
        trX = _flatten_samples(trX_images)
        # Read in the test data and flatten it the same way.
        deX, deY = dataset.load_mnist('test')
        deX = _flatten_samples(deX)
        return trX_images, trX, trY, deX, deY

    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    print('********************************************************************')
    print('** 20 NEWSGROUPS EXPERIMENTS')
    print('********************************************************************\n')
    print('First we load the data \n')
    cats = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    news_train = fetch_20newsgroups(subset='train', categories=cats,
                                    shuffle=True, random_state=42)
    news_test = fetch_20newsgroups(subset='test', categories=cats,
                                   shuffle=True, random_state=42)
    for i, target_name in enumerate(news_train.target_names):
        print("Target number " + str(i) + " is " + target_name)

    vectorizer = TfidfVectorizer()
    # The matrices are kept in the form the vectorizer returns them
    # (no .toarray() conversion).
    trX = vectorizer.fit_transform(news_train.data)
    trX_images = news_train.data
    trY = news_train.target
    trY = np.reshape(trY, (trY.shape[0], 1))
    # Reuse the vocabulary and IDF weights learned on the training set.
    # Refitting on the test documents would produce a different feature
    # space, so test columns would not correspond to training columns.
    deX = vectorizer.transform(news_test.data)
    deY = news_test.target
    deY = np.reshape(deY, (deY.shape[0], 1))
    # raw_input is the Python 2 builtin used by the original code.
    raw_input('\nPress enter to continue...')
    return trX_images, trX, trY, deX, deY
def fetch_20newsgroups_bows(subset='all', data_home="/home/ian/School/CS5950/datasets/"):
    """
    Load the 20 newsgroups dataset and transform it into bag-of-words.

    This is a wrapper around sklearn.fetch_20newsgroups

    Parameters:
    -----------
    subset: ['train'|'test'|'all'], optional
        Select the dataset to load.
        NOTE(review): the visible part of this function always fetches both
        the 'train' and the 'test' subset and never reads ``subset`` --
        confirm against the remainder of the function.

    data_home: optional, default: "/home/ian/School/CS5950/datasets/"
        Specify a download and cache folder for the datasets.

    Returns
    -------
    bunch : Bunch object
        bunch.data: sparse matrix, shape [n_samples, n_features]
        bunch.target: array, shape [n_samples]
        bunch.target_names: list, length [n_classes]

        NOTE(review): the visible code ends before any ``return``; this
        section repeats the original documentation -- confirm.
    """
    data_home = get_data_home(data_home=data_home)
    # Cache file for the already-vectorised train/test matrices.
    target_file = os.path.join(data_home, "20newsgroup_bow.pk")

    # The keyword is ``categories``; the previous spelling ``catagories`` is
    # not a parameter of fetch_20newsgroups and raised a TypeError.
    data_train = fetch_20newsgroups(data_home=data_home,
                                    subset='train',
                                    categories=None,
                                    shuffle=True,
                                    random_state=12)
    data_test = fetch_20newsgroups(data_home=data_home,
                                   subset='test',
                                   categories=None,
                                   shuffle=True,
                                   random_state=12)

    if os.path.exists(target_file):
        X_train, X_test = joblib.load(target_file)
    else:
        # NOTE(review): ``vocab_path`` is not defined in the visible code;
        # presumably a module-level constant -- confirm.
        # Each key is a raw line of the file (trailing newline included),
        # mapped to its zero-based line number.  The ``with`` block closes
        # the file handle that was previously leaked.
        with open(vocab_path) as vocab_file:
            vocabulary = dict((t, i) for i, t in enumerate(vocab_file))
def check_ci(crit_val=95):
    """Train a two-newsgroup classifier and evaluate it with probability bounds.

    The z-score for ``crit_val`` is looked up in ``CRIT_VALS``.  If the key
    is missing, the accepted keys are printed and the function returns
    ``None`` without loading any data.

    Otherwise the 'alt.atheism' / 'talk.religion.misc' documents are split
    into train, validation and test parts.  They are turned into features
    (TF-IDF, then ``SelectFromModel`` around a linear SVM, then standard
    scaling), and a logistic regression is fitted on the training part.  The
    value returned by ``get_se`` on the validation part is handed to
    ``get_probs`` together with the z-score.  The plain test accuracy is
    printed, and ``evaluate_using_heuristic`` receives the probabilities and
    their upper/lower bounds.

    Parameters
    ----------
    crit_val : int, optional
        Key into ``CRIT_VALS`` (default 95).
    """
    try:
        z_score = CRIT_VALS[crit_val]
    except KeyError:
        print('Provide a value one of %s' % list(CRIT_VALS.keys()))
        return
    print('Using threshold %d%%' % crit_val)

    # Feature pipeline pieces and the final classifier.
    tfidf = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    selector = SelectFromModel(LinearSVC(C=0.3, max_iter=1000), prefit=False)
    scaler = StandardScaler()
    classifier = LogisticRegression(C=1, dual=False, solver='lbfgs',
                                    max_iter=1000)

    bunch = twenty_newsgroups.fetch_20newsgroups(
        categories=['alt.atheism', 'talk.religion.misc'])
    docs, labels = bunch['data'], bunch['target']

    # Hold out 20% for testing, then halve the remainder into train/validation.
    docs_rest, docs_tst, y_rest, y_tst = train_test_split(
        docs, labels, test_size=0.2, random_state=42)
    docs_trn, docs_val, y_trn, y_val = train_test_split(
        docs_rest, y_rest, test_size=0.5, random_state=42)

    # Every stage is fitted on the training documents only.
    selected_trn = selector.fit_transform(tfidf.fit_transform(docs_trn), y_trn)
    X_trn = scaler.fit_transform(selected_trn.todense())

    def featurize(documents):
        """Push documents through the already-fitted feature stages."""
        selected = selector.transform(tfidf.transform(documents))
        return scaler.transform(selected.todense())

    X_val = featurize(docs_val)
    X_tst = featurize(docs_tst)

    classifier.fit(X_trn, y_trn)
    # The standard error could be estimated from cross-validation instead.
    se_estimate = get_se(X_val, y_val, classifier)
    prob, up, lo = get_probs(classifier, X_tst, se_estimate, z=z_score)
    # The original author recorded 90.6% here.
    print('Accuracy normally:', classifier.score(X_tst, y_tst))
    evaluate_using_heuristic(prob, up, lo, y_tst)
# NOTE(review): the first four statements are the tail of a function whose
# `def` line lies outside this view; it takes `doc` and is presumably the
# `number_aware_tokenizer` passed to TfidfVectorizer below -- confirm.
# The pattern matches runs of two or more word characters.
token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
tokens = token_pattern.findall(doc)
# Any token that starts with a digit or an underscore is replaced by the
# single placeholder "#NUMBER".
tokens = ["#NUMBER" if token[0] in "0123456789_" else token
          for token in tokens]
return tokens
# exclude 'comp.os.ms-windows.misc'
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target
# TF-IDF with English stop words, min_df=5 and the custom tokenizer.
vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
# Both clustering models are asked for one cluster per selected category.
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)
print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)
print("Coclustering...")
# `time` is called directly, so it is presumably `from time import time`
# elsewhere in the file -- confirm.
start_time = time()
cocluster.fit(X)
# NOTE(review): the first three statements are the tail of a function whose
# `def` line and `token_pattern` definition lie outside this view; it takes
# `doc` and is presumably the `number_aware_tokenizer` used below -- confirm.
tokens = token_pattern.findall(doc)
# Any token that starts with a digit or an underscore is replaced by the
# single placeholder "#NUMBER".
tokens = ["#NUMBER" if token[0] in "0123456789_" else token
          for token in tokens]
return tokens
# exclude 'comp.os.ms-windows.misc'
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(categories=categories)
y_true = newsgroups.target
# TF-IDF with English stop words, min_df=5 and the custom tokenizer.
vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
# Both clustering models are asked for one cluster per selected category.
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)
print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)
print("Coclustering...")
# `time` is called directly, so it is presumably `from time import time`
# elsewhere in the file -- confirm.
start_time = time()
cocluster.fit(X)
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.svm.classes import SVC, LinearSVC
import matplotlib.pyplot as plt
# NOTE(review): `mpl`, `time` and `fetch_20newsgroups` are not imported in
# the visible part of the file; presumably imported above -- confirm.
# Use the simHei font for sans-serif text and disable the unicode minus sign
# (presumably so Chinese plot labels render correctly -- confirm).
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False
# Load the data
load_start_time = time()
remove = ('headers', 'footers', 'quotes')
# Categories
categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
data_train = fetch_20newsgroups(data_home='../../data/', subset='train', categories=categories, shuffle=True, random_state=0, remove=remove)
data_test = fetch_20newsgroups(data_home='../../data/', subset='test', categories=categories, shuffle=True, random_state=0, remove=remove)
# Report how long loading took (Python 2 print statement).
print u"完成数据加载过程耗时:%.3fs" % (time() - load_start_time)
# Inspect the size of the test data and the training data
def size_mb(docs):
    """Return the total UTF-8 encoded length of the strings in *docs*, divided by 1e6."""
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6
# Use the public ``sklearn.datasets`` namespace.  The private module path
# ``sklearn.datasets.twenty_newsgroups`` was deprecated in scikit-learn 0.22
# and removed in 0.24.  The alias ``news`` is kept so that
# ``news.fetch_20newsgroups`` keeps resolving.
import sklearn.datasets as news
from sklearn.feature_extraction.text import CountVectorizer

# The four newsgroups used for this experiment.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

twenty_train = news.fetch_20newsgroups(subset='train', categories=categories,
                                       shuffle=True, random_state=42)

# Token counts for the training documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Single-argument print calls produce the same output on Python 2 and 3.
print(X_train_counts.shape)
print("hello!")
# Import from the public ``sklearn.datasets`` namespace.  The private module
# path ``sklearn.datasets.twenty_newsgroups`` was deprecated in scikit-learn
# 0.22 and removed in 0.24, while the public name is available in both old
# and new releases.
from sklearn.datasets import fetch_20newsgroups

# Call the loader with its default arguments; the returned value is discarded.
fetch_20newsgroups()
""" import numpy as np from sklearn.naive_bayes import MultinomialNB from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer from sklearn.pipeline import Pipeline from sklearn import metrics from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups from global_variables import stop, topic_mapping, inverse_mapping, leaf_to_topic, inverse_leaf_to_topic, cats from functions import build_hierarchy, train_classifiers, build_classifier_map, predict_class import time start = time.clock() train_dataset = fetch_20newsgroups(subset='train', categories=cats, shuffle=True, random_state=42) # Adjacency list represents the hieararchy tree # node_int_map maps the node label to the adjacency list index # node_int_inverse_map represents the inverse of node_int_map # parent_nos represents the number of nodes which are not leaves [adjacency_list, node_int_map, node_int_inverse_map, parent_nos] = build_hierarchy() start_time = time.process_time() count_vectorizer = CountVectorizer(stop_words=stop, ngram_range=(1, 2)) tfidf_transformer = TfidfTransformer() features = count_vectorizer.fit_transform(train_dataset.data) features = tfidf_transformer.fit_transform(features) print("----Built Features----")
# Datasets served by the project's loader scripts.  Each entry maps the
# dataset name to (module under ``benchmark.utils.dataset_scripts``, loader
# function name, number of classes).  Module and function names are spelled
# exactly as the original code referenced them.
# NOTE(review): 'epsilon' is loaded from the ``usps`` module, the 'phishing'
# module is spelled ``phishinig`` and the talkingdata loader is spelled
# ``load_talkinigdata`` -- confirm these against the project tree.
_SCRIPT_DATASETS = {
    'covtype': ('covtype', 'load_covtype', 7),
    'fall_detection': ('fall_detection', 'load_fall_detection', 6),
    'banana': ('banana', 'load_banana', 2),
    'talkingdata': ('talkingdata', 'load_talkinigdata', 2),
    'biomechanical2C': ('biomechanical', 'load_biomechanical2C', 2),
    'biomechanical3C': ('biomechanical', 'load_biomechanical3C', 3),
    'susy': ('susy', 'load_susy', 2),
    'higgs': ('higgs', 'load_higgs', 2),
    'hepmass': ('hepmass', 'load_hepmass', 2),
    'letter': ('letter', 'load_letter', 26),
    'usps': ('usps', 'load_usps', 10),
    'epsilon': ('usps', 'load_epsilon', 2),
    'dermatology': ('dermatology', 'load_dermatology', 4),
    'poker': ('poker', 'load_poker', 10),
    'sensorless': ('sensorless', 'load_sensorless', 11),
    'phishing': ('phishinig', 'load_phishing', 2),
    'w8a': ('w8a', 'load_w8a', 2),
    'a8a': ('a8a', 'load_a8a', 2),
    'sector': ('sector', 'load_sector', 105),
    'protein': ('protein', 'load_protein', 3),
    'shuttle': ('shuttle', 'load_shuttle', 7),
    'vowel': ('vowel', 'load_vowel', 11),
    'splice': ('splice', 'load_splice', 2),
    'codrna': ('codrna', 'load_codrna', 2),
    'australian': ('australian', 'load_australian', 2),
}


def load_data(dataset_name):
    """Load a named dataset and split it 80/20 into train and test parts.

    Parameters
    ----------
    dataset_name : str
        One of the scikit-learn datasets handled explicitly below ('iris',
        'digits', 'wine', 'breast_cancer', 'olivetti_faces', '20newsgroups',
        '20newsgroups_vectorized', 'lfw_people', 'rcv1', 'kddcup99-sa',
        'kddcup99-smtp') or a key of ``_SCRIPT_DATASETS``.

    Returns
    -------
    tuple
        ``((x_train, y_train), (x_test, y_test), num_cls)`` where the split
        comes from ``train_test_split(X, y, test_size=0.2)`` and ``num_cls``
        is the number of classes recorded for the dataset.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not a known dataset.
    AssertionError
        If the class count of 'lfw_people' or 'rcv1' differs from the
        expected value (12 and 37 respectively).
    """
    try:
        script_entry = _SCRIPT_DATASETS.get(dataset_name)
    except TypeError:
        # Unhashable names cannot be table keys; treat them as unknown so
        # they end in the ValueError below, as with the if/elif chain.
        script_entry = None

    if script_entry is not None:
        # Import the loader module lazily, as the per-branch imports did.
        import importlib
        module_name, loader_name, num_cls = script_entry
        module = importlib.import_module(
            'benchmark.utils.dataset_scripts.' + module_name)
        X, y = getattr(module, loader_name)()
    elif dataset_name == 'iris':
        from sklearn.datasets import load_iris
        iris = load_iris()
        X, y = iris.data, iris.target
        num_cls = 3
    elif dataset_name == 'digits':
        from sklearn.datasets import load_digits
        digits = load_digits()
        X, y = digits.data, digits.target
        num_cls = 10
    elif dataset_name == 'wine':
        from sklearn.datasets import load_wine
        wine = load_wine()
        X, y = wine.data, wine.target
        num_cls = 3
    elif dataset_name == 'breast_cancer':
        from sklearn.datasets import load_breast_cancer
        data = load_breast_cancer()
        X, y = data.data, data.target
        num_cls = 2
    # The fetch_* loaders below are imported from the public
    # ``sklearn.datasets`` namespace.  The private per-dataset module paths
    # (``sklearn.datasets.olivetti_faces`` etc.) were deprecated in
    # scikit-learn 0.22 and removed in 0.24.
    elif dataset_name == 'olivetti_faces':
        from sklearn.datasets import fetch_olivetti_faces
        data = fetch_olivetti_faces()
        X, y = data.data, data.target
        num_cls = 40
    elif dataset_name == '20newsgroups':
        from sklearn.datasets import fetch_20newsgroups
        from sklearn.feature_extraction.text import TfidfVectorizer
        data = fetch_20newsgroups()
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(data.data)
        y = data.target
        num_cls = 20
    elif dataset_name == '20newsgroups_vectorized':
        from sklearn.datasets import fetch_20newsgroups_vectorized
        data = fetch_20newsgroups_vectorized()
        X, y = data.data, data.target
        num_cls = 20
    elif dataset_name == 'lfw_people':
        from sklearn.datasets import fetch_lfw_people
        # Only the subset of people with at least 50 faces is used; the
        # original author noted 5749 classes for the full dataset.
        data = fetch_lfw_people(min_faces_per_person=50)
        X, y = data.data, data.target
        num_cls = 12
        assert num_cls == len(set(y))
    elif dataset_name == 'rcv1':
        from sklearn.datasets import fetch_rcv1
        data = fetch_rcv1(subset='train')
        # Each sample is assigned the index of the first maximum of its
        # target row, re-encoded by the project's trans_label helper.
        # NOTE(review): ``.A`` is the dense-array shorthand of SciPy sparse
        # matrices; recent SciPy releases dropped it in favour of
        # ``.toarray()`` -- confirm against the pinned SciPy version.
        X, y = data.data, trans_label(np.argmax(data.target.A, axis=1))
        num_cls = len(set(y))
        assert num_cls == 37
    elif dataset_name == 'kddcup99-sa':
        from sklearn.datasets import fetch_kddcup99
        data = fetch_kddcup99(subset='SA')
        y = trans_label(data.target)
        # Columns 1, 2 and 3 are re-encoded with trans_label on a copy of
        # the data.  The original author considered one-hot encoding columns
        # 1 and 3, and noted that column 2 has too many categories for that.
        X = data.data.copy()
        for column in (2, 1, 3):
            X[:, column] = trans_label(data.data[:, column])
        num_cls = len(set(y))
    elif dataset_name == 'kddcup99-smtp':
        from sklearn import preprocessing
        from sklearn.datasets import fetch_kddcup99
        data = fetch_kddcup99(subset='smtp')
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(data.target)
        X = data.data
        num_cls = 3
    else:
        raise ValueError('Invalid dataset name!')

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return (x_train, y_train), (x_test, y_test), num_cls
###########
### The following file generates bags of instances for the 20Newsgroups data set (see description in paper)
##########
# Public import location; the private ``sklearn.datasets.twenty_newsgroups``
# module path was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

class_pair_list = [("sci.space", "sci.med"),
                   ("comp.sys.mac.hardware", "comp.sys.ibm.pc.hardware"),
                   ("rec.sport.hockey", 'rec.sport.baseball')]

### Select binary classification task (category pair)
c1, c2 = class_pair_list[2]

# Load train and test data, represented with TF-IDF.
newsgroups_valid = fetch_20newsgroups(subset="test", categories=[c1, c2])
y_valid = newsgroups_valid.target
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
newsgroups_train = fetch_20newsgroups(subset="train", categories=[c1, c2])
# np.array() copies its input, so no intermediate list() copy is needed.
dd = np.array(newsgroups_train.data)
y_train = np.array(newsgroups_train.target)

# The vectorizer is fitted on the training documents only and then reused
# for the validation documents.
X_train = vectorizer.fit_transform(dd)
X_valid = vectorizer.transform(newsgroups_valid.data)

# Recode the labels: 0 becomes -1, every other value becomes +1.
y_train = np.where(y_train == 0, -1, 1)
y_valid = np.where(y_valid == 0, -1, 1)
# NOTE(review): this private module path was removed in scikit-learn 0.24;
# `from sklearn.datasets import fetch_20newsgroups` is the public spelling.
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer  # Using inverse document frequency to filter the noise
from sklearn.pipeline import Pipeline
from sklearn import metrics
from global_variables import stop, cats
import time
# Training subset restricted to `cats`, with headers, footers and quotes
# removed from every post.
train_dataset = fetch_20newsgroups(subset='train', categories=cats, remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)
# Pipeline: unigram and bigram counts (with the project's stop-word list),
# then TF-IDF weighting, then an SGD-trained linear classifier.
pipeline = Pipeline([('count_vectorizer', CountVectorizer(stop_words=stop, ngram_range=(1, 2))),
                     ('tfidf_transformer', TfidfTransformer()),
                     ('classifier', SGDClassifier(alpha=1e-3, random_state=42))])
# Time the fit with time.process_time().
start_time = time.process_time()
pipeline.fit(train_dataset.data, train_dataset.target)
print("Training Time : " + str(time.process_time() - start_time))
# NOTE(review): the statement below is cut off in this view; its remaining
# arguments and closing parenthesis are outside the visible source.
test_dataset = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),