def test_lime_text_tabular_not_equal_random_state(self):
    categories = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    class_names = ['atheism', 'christian']
    vectorizer = TfidfVectorizer(lowercase=False)
    train_vectors = vectorizer.fit_transform(newsgroups_train.data)
    test_vectors = vectorizer.transform(newsgroups_test.data)
    nb = MultinomialNB(alpha=.01)
    nb.fit(train_vectors, newsgroups_train.target)
    pred = nb.predict(test_vectors)
    f1_score(newsgroups_test.target, pred, average='weighted')
    c = make_pipeline(vectorizer, nb)
    explainer = LimeTextExplainer(class_names=class_names, random_state=10)
    exp_1 = explainer.explain_instance(newsgroups_test.data[83],
                                       c.predict_proba, num_features=6)
    explainer = LimeTextExplainer(class_names=class_names, random_state=20)
    exp_2 = explainer.explain_instance(newsgroups_test.data[83],
                                       c.predict_proba, num_features=6)
    self.assertFalse(exp_1.as_map() == exp_2.as_map())
def get_data():
    categories = ['talk.politics.guns', 'talk.politics.mideast', 'alt.atheism',
                  'talk.politics.misc', 'talk.religion.misc']
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'),
                                          categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         remove=('headers', 'footers', 'quotes'),
                                         categories=categories)
    # texts_train, target_train = newsgroups_train.data, newsgroups_train.target
    # texts_test, target_test = newsgroups_test.data, newsgroups_test.target
    # return texts_train, target_train, newsgroups_train.filenames, texts_test, target_test, newsgroups_test.filenames
    return newsgroups_train, newsgroups_test
def file():
    cats = ["alt.atheism", "sci.electronics"]
    newsgroups_train = fetch_20newsgroups(subset="train", categories=cats)
    newsgroups_test = fetch_20newsgroups(subset="test", categories=cats)
    vectorizer = TfidfVectorizer()  # tokenize every document and build term statistics
    vectors_train = vectorizer.fit_transform(newsgroups_train.data)
    vectors = vectorizer.transform(newsgroups_test.data)
    print vectors.shape[1]
    # f = open('test_all.txt', 'wb')
    for j in range(0, vectors.shape[0]):
        item_id = list()
        tokens = vectorizer.build_tokenizer()(newsgroups_test.data[j])  # tokenization of this document
        # print tokens
        word_sort = np.argsort(-vectors[j].data)
        print "vertex " + str(j)
        for i in range(0, len(word_sort)):
            word = vectorizer.get_feature_names()[vectors[j].indices[word_sort[i]]]  # the tf-idf term
            for line in range(0, len(tokens)):
                if tokens[line].lower() == word:
                    item_id.append((line, word_sort[i]))
        pos_item = sorted(item_id, key=lambda jj: jj[0], reverse=True)  # tf-idf terms ordered by position
        word_word = np.zeros([len(word_sort), len(word_sort)])
        for p in range(0, len(pos_item)):
            if p < (len(pos_item) - 1):
                ki = word_sort[pos_item[p][1]]
                kj = word_sort[pos_item[p + 1][1]]
                word_word[ki, kj] = word_word[ki, kj] + 1
def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names, data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
def News():
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    newsgroups_train = datasets.fetch_20newsgroups(subset='train')
    vectorizer = CountVectorizer(encoding='latin-1', max_features=30000)
    # vectorizer = HashingVectorizer(encoding='latin-1')
    x_train = vectorizer.fit_transform(newsgroups_train.data)
    x_train = numpy.asarray(x_train.todense(), dtype='float32')
    y_train = numpy.asarray(newsgroups_train.target, dtype='int32')
    newsgroups_test = datasets.fetch_20newsgroups(subset='test')
    x_test = vectorizer.transform(newsgroups_test.data)
    x_test = numpy.asarray(x_test.todense(), dtype='float32')
    y_test = numpy.asarray(newsgroups_test.target, dtype='int32')
    dnn = RegularizedNet(numpy_rng=numpy.random.RandomState(123),
                         theano_rng=None,
                         n_ins=x_train.shape[1],
                         layers_types=[ReLU, ReLU, LogisticRegression],
                         layers_sizes=[1000, 1000],
                         n_outs=len(set(y_train)),
                         rho=0.95, eps=1.E-6,
                         max_norm=0., debugprint=False,
                         L1_reg=0., L2_reg=1. / x_train.shape[0])
    print len(set(y_train))
    dnn.fit(x_train, y_train, max_epochs=30, method='adadelta_nesterov',
            verbose=True, plot=False)
    test_error = dnn.score(x_test, y_test)
    print("score: %f" % (1. - test_error))
def train_20_news(n_jobs, n_folds):
    from sklearn.datasets import fetch_20newsgroups
    train = fetch_20newsgroups(subset='train', shuffle=False, random_state=100,
                               remove=('headers', 'footers', 'quotes'))
    test = fetch_20newsgroups(subset='test', shuffle=False, random_state=100,
                              remove=('headers', 'footers', 'quotes'))
    x_train = map(dt.clean_str, train.data)
    x_test = map(dt.clean_str, test.data)
    text_clf = Pipeline([
        # ('clean', Cleaner()),
        ('vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
        ('clf', SGDClassifier(fit_intercept=True, random_state=0))
    ])
    SGDClassifier_params = {
        'clf__alpha': np.arange(4e-5, 2e-3, 2e-5),
        'clf__loss': ('squared_loss', 'hinge', 'squared_hinge'),
        'clf__penalty': ('l2', 'elasticnet'),
    }
    gs_clf = GridSearchCV(text_clf, SGDClassifier_params, n_jobs=n_jobs,
                          cv=n_folds, refit=True, verbose=3)
    gs_clf.fit(x_train, train.target)
    result_str = list()
    result_str.append('\n')
    result_str.append('best params:')
    result_str.append(str(gs_clf.best_params_))
    result_str.append('best score = %f' % gs_clf.best_score_)
    result_str = '\n'.join(result_str)
    print result_str
    print "test score = %f" % gs_clf.score(x_test, test.target)
def load_sklearn_data(self, name):
    if name == "digits":
        training = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
        testing = fetch_20newsgroups(subset='test', shuffle=True, random_state=100)
        validation = fetch_20newsgroups(subset='test', shuffle=True, random_state=200)
        categories = training.target_names
        data_train_size_mb = size_mb(training.data)
        data_test_size_mb = size_mb(testing.data)
        data_validation_size_mb = size_mb(validation.data)
        print("%d documents - %0.3fMB (training set)" % (
            len(training.data), data_train_size_mb))
        print("%d documents - %0.3fMB (test set)" % (
            len(testing.data), data_test_size_mb))
        print("%d documents - %0.3fMB (validation set)" % (
            len(validation.data), data_validation_size_mb))
        print("%d categories" % len(categories))
        print()
        training = [training.data, training.target_names]
        testing = [testing.data, testing.target_names]
        validation = [validation.data, validation.target_names]
        return [training, testing, validation]
def export_20ng(remove_headers=False, remove_footers=False, remove_quotes=False,
                categories=None):
    output_dir = os.path.join('..', 'datasets', '20ng', 'data')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    remove = []
    if remove_headers:
        remove.append('headers')
    if remove_footers:
        remove.append('footers')
    if remove_quotes:
        remove.append('quotes')
    print categories

    ng_train = fetch_20newsgroups(subset='train', remove=remove, categories=categories)
    keys = ['train' + str(i) for i in range(len(ng_train.data))]
    print len(keys)
    train_text = dict(zip(keys, ng_train.data))
    fh.write_to_json(train_text, os.path.join(output_dir, 'train.json'))
    train_labels = pd.DataFrame(ng_train.target, columns=['target'], index=keys)
    train_labels.to_csv(os.path.join(output_dir, 'train.csv'))
    print train_labels.shape

    ng_test = fetch_20newsgroups(subset='test', remove=remove, categories=categories)
    keys = ['test' + str(i) for i in range(len(ng_test.data))]
    test_text = dict(zip(keys, ng_test.data))
    fh.write_to_json(test_text, os.path.join(output_dir, 'test.json'))
    test_labels = pd.DataFrame(ng_test.target, columns=['target'], index=keys)
    test_labels.to_csv(os.path.join(output_dir, 'test.csv'))
def load_20newsgroups(category=None, shuffle=True, rnd=1):
    categories = {'religion': ['alt.atheism', 'talk.religion.misc'],
                  'graphics': ['comp.graphics', 'comp.windows.x'],
                  'hardware': ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  'baseball': ['rec.sport.baseball', 'sci.crypt']}
    cat = None
    if category is not None:
        cat = categories[category]
    data = bunch.Bunch()
    data.train = fetch_20newsgroups(subset='train', categories=cat,
                                    remove=('headers', 'footers', 'quotes'),
                                    shuffle=shuffle, random_state=rnd)
    # data.train.data = np.array([keep_header_subject(text) for text in data.train.data], dtype=object)
    data.train.data = np.array(data.train.data, dtype=object)
    data.test = fetch_20newsgroups(subset='test', categories=cat,
                                   remove=('headers', 'footers', 'quotes'),
                                   shuffle=shuffle, random_state=rnd)
    # data.test.data = np.array([keep_header_subject(text) for text in data.test.data], dtype=object)
    data.test.data = np.array(data.test.data, dtype=object)
    data = minimum_size(data)
    if shuffle:
        random_state = np.random.RandomState(rnd)
        indices = np.arange(data.train.target.shape[0])
        random_state.shuffle(indices)
        data.train.filenames = data.train.filenames[indices]
        data.train.target = data.train.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.train.data, dtype=object)
        data_lst = data_lst[indices]
        data.train.data = data_lst
    return data
def Load20NG():
    cats = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
    train, train_labels = newsgroups_train.data, newsgroups_train.target
    test, test_labels = newsgroups_test.data, newsgroups_test.target
    return train, train_labels, test, test_labels
def loadData(self, opts):
    if opts.all_categories:
        categories = None
    else:
        categories = ['alt.atheism', 'talk.religion.misc',
                      'comp.graphics', 'sci.space']
    if opts.filtered:
        remove = ('headers', 'footers', 'quotes')
    else:
        remove = ()
    print('Loading 20 newsgroups dataset for categories:')
    print(categories if categories else 'all')
    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    shuffle=True, random_state=42, remove=remove)
    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   shuffle=True, random_state=42, remove=remove)
    categories = data_train.target_names  # for the case categories == None
    # print(len(data_train))
    print('data loaded')
    return data_train, data_test, categories
def uai(params):  # , **kwargs):
    print 'Params: ', params, '\n'
    # y = benchmark_functions.save_svm_on_grid(params, opt_time=ret_time, **kwargs)
    logreg = linear_model.LogisticRegression(penalty=params['penalty'],
                                             tol=float(params['tol']),
                                             C=float(params['strength']))
    if params['n_min'] > params['n_max']:
        z = params['n_min']
        params['n_min'] = params['n_max']
        params['n_max'] = z
    if params['stop_words'] == True:
        st = 'english'
    else:
        st = None
    vectorizer = TfidfVectorizer(ngram_range=(int(params['n_min']), int(params['n_max'])),
                                 binary=params['binary'], use_idf=params['idf'],
                                 smooth_idf=True, stop_words=st)
    if params['cats'] == 'all':
        cats = None
    elif params['cats'] == 'science':
        cats = ['sci.med', 'sci.space', 'sci.crypt', 'sci.electronics']
    elif params['cats'] == 'religion':
        cats = ['alt.atheism', 'talk.religion.misc']
    elif params['cats'] == 'graphics':
        cats = ['comp.windows.x', 'comp.graphics']
    # cats = ['sci.med', 'sci.space']
    # cats = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
    print 'preprocess data'
    # newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=cats)
    # vectors = vectorizer.fit_transform(newsgroups_train.data)
    # print vectors.shape
    # newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=cats)
    # print 'preprocess test data'
    # vectors_test = vectorizer.fit_transform(newsgroups_test.data)
    if params['rm_footers']:
        to_remove = ('headers', 'footers')
    else:
        to_remove = ('headers',)
    print_20n(to_remove, cats, params)
    newsgroups_all = fetch_20newsgroups(subset='all', remove=to_remove, categories=cats)
    vectors_all = vectorizer.fit_transform(newsgroups_all.data)
    # nrow = round(7.0/10.0*vectors_all.shape[0])
    newsgroups_train = fetch_20newsgroups(subset='train', remove=to_remove, categories=cats)
    nrow = newsgroups_train.target.shape[0]
    # print nrow
    # print vectors_all.shape
    vectors = vectors_all[0:nrow, :]
    vectors_test = vectors_all[nrow:, :]
    # print vectors.shape
    # print vectors_test.shape
    print 'fit model'
    logreg.fit(vectors, newsgroups_all.target[0:nrow])
    print 'predict model'
    pred = logreg.predict(vectors_test)
    print 'evaluate'
    y = metrics.accuracy_score(newsgroups_all.target[nrow:], pred)
    print 'Result: ', y
    print('idf: ', params['idf'], 'rm_footers: ', params['rm_footers'], 'cats: ', params['cats'])
    return -y
def exercise():
    groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
    train_data = fetch_20newsgroups(subset='train', categories=groups)
    clusterizer = DocumentClusterizer()
    clusterizer.train(train_data.data)
    test_data = fetch_20newsgroups(subset='test', categories=groups)
    for i in range(10):
        sample = test_data.data[np.random.randint(len(test_data.data))]
        clusterizer.find_most_similar(sample)
def load_data():
    twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
    twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
    x_train = twenty_train.data
    y_train = twenty_train.target
    x_test = twenty_test.data
    y_test = twenty_test.target
    print 'data loaded!'
    return (x_train, y_train, x_test, y_test)
def get_login_pages(keywords):
    from sklearn.datasets import fetch_20newsgroups
    import gensim
    import re
    """
    newsgroups_train = fetch_20newsgroups(subset='train')
    for news in newsgroups_train.target_names:
        print news

    alt.atheism
    comp.graphics
    comp.os.ms-windows.misc
    comp.sys.ibm.pc.hardware
    comp.sys.mac.hardware
    comp.windows.x
    misc.forsale
    rec.autos
    rec.motorcycles
    rec.sport.baseball
    rec.sport.hockey
    sci.crypt
    sci.electronics
    sci.med
    sci.space
    soc.religion.christian
    talk.politics.guns
    talk.politics.mideast
    talk.politics.misc
    talk.religion.misc
    """
    # cats = ['sci.crypt']
    # newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')
    newsgroups = []
    newsgroups.append(newsgroups_train.data)
    newsgroups.append(newsgroups_test.data)
    # newsgroups_train = fetch_20newsgroups()
    # print len(newsgroups_train.data)
    print newsgroups_train.data
    sentences = [re.findall("[a-z\-]+", s.lower()) for s in newsgroups_train.data]
    # sentences = [s.lower().split() for s in newsgroups_train.data]
    # print sentences
    model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=1,
                                   workers=4, iter=20)
    # print len(sentences)
    for key in keywords:
        print "[%s] most_similar:" % key
        results = model.most_similar(positive=[key], topn=10)
        for i in results:
            print i
def testNaiveBayesSK2(self):
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    vectorizer = TfidfVectorizer()
    # Both vectors and vectors_test are SciPy CSR matrices
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)
    nb = NaiveBayes(sqlCtx)
    nb.fit(vectors, newsgroups_train.target)
    pred = nb.predict(vectors_test)
    score = metrics.f1_score(newsgroups_test.target, pred, average='weighted')
    self.failUnless(score > 0.8)
def load_dataset(category_list):
    """
    Load the 20 newsgroups dataset restricted to category_list.
    If an empty list is provided, return every category.
    """
    if category_list == []:
        # read all categories from the 20 newsgroups dataset
        train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
        test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
    else:
        # read only the requested categories
        train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42,
                                   categories=category_list)
        test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42,
                                  categories=category_list)
    return train, test
def test_naive_bayes1(self):
    categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
    vectorizer = TfidfVectorizer()
    # Both vectors and vectors_test are SciPy CSR matrices
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)
    nb = NaiveBayes(sparkSession)
    mllearn_predicted = nb.fit(vectors, newsgroups_train.target).predict(vectors_test)
    from sklearn.naive_bayes import MultinomialNB
    clf = MultinomialNB()
    sklearn_predicted = clf.fit(vectors, newsgroups_train.target).predict(vectors_test)
    self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95)
def __init__(self):
    data_train = fetch_20newsgroups(subset='train', categories=None,
                                    shuffle=True, random_state=42)
    data_test = fetch_20newsgroups(subset='test', categories=None,
                                   shuffle=True, random_state=42)
    self.train_data = data_train.data
    self.train_target = data_train.target
    self.alltest_data = data_test.data
    self.alltest_target = data_test.target
    self.categories = data_train.target_names
    self.num_classes = 20
    DataGatherer.__init__(self)
def test_20news_length_consistency():
    """Checks the length consistencies within the bunch

    This is a non-regression test for a bug present in 0.16.1.
    """
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")
    # Extract the full dataset
    data = datasets.fetch_20newsgroups(subset='all')
    assert_equal(len(data['data']), len(data.data))
    assert_equal(len(data['target']), len(data.target))
    assert_equal(len(data['filenames']), len(data.filenames))
def load_20_news_data(data_path=None, max_size=None):
    newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=100,
                                    remove=('headers', 'footers', 'quotes'))
    data = pd.DataFrame({"text": newsgroups.data, "label": newsgroups.target})
    if max_size is not None:
        data = data[0:max_size]
    return data
def _download_20newsgroup():
    """
    Download the 20 newsgroups dataset from scikit-learn.

    :return: The train and test sets.
    """
    from sklearn.datasets import fetch_20newsgroups
    print "downloading 20 newsgroup train data...."
    newsgroups_train = fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'))
    print "downloading 20 newsgroup test data...."
    newsgroups_test = fetch_20newsgroups(
        subset='test', remove=('headers', 'footers', 'quotes'))
    train_set = (newsgroups_train.data, newsgroups_train.target)
    test_set = (newsgroups_test.data, newsgroups_test.target)
    return train_set, test_set
def get_train_data():
    try:
        # pickle.load expects a file object, not a file name
        with open("twenty_train.p", "rb") as f:
            twenty_train = pickle.load(f)
    except:
        twenty_train = fetch_20newsgroups(subset='train', categories=CATEGORIES,
                                          shuffle=True, random_state=42)
        pickle.dump(twenty_train, open("twenty_train.p", "wb"),
                    protocol=pickle.HIGHEST_PROTOCOL)
    return twenty_train
def main():
    newsgroups = fetch_20newsgroups(subset='train',
                                    categories=['sci.crypt', 'talk.politics.guns'])
    vectorizer = CountVectorizer()
    vector = vectorizer.fit_transform(newsgroups.data, newsgroups.target)
    vocab = np.array(vectorizer.get_feature_names())
    print "number of positive examples:", np.sum(newsgroups.target)

    t0 = time.time()
    ig_scores, _ = ig(vector, newsgroups.target)
    print "Information Gain top 50 scored terms:"
    print vocab[np.argsort(ig_scores)][-50:]
    print "time: %.4f secs" % (time.time() - t0)

    t0 = time.time()
    bns_scores, _ = bns(vector, newsgroups.target)
    print "Bi-Normal Separation top 50 scored terms:"
    print vocab[np.argsort(bns_scores)][-50:]
    print "time: %.4f secs" % (time.time() - t0)

    t0 = time.time()
    chi2_scores, _ = chi2(vector, newsgroups.target)
    print "Chi Squared top 50 scored terms:"
    print vocab[np.argsort(chi2_scores)][-50:]
    print "time: %.4f secs" % (time.time() - t0)
def fetch_and_save(dirpath, vocpath=None, min_df=6, tokenizer=None):
    """
    Fetches the 20 newsgroups corpus, vectorizes the documents, stores them
    as a list-of-lists representation and saves it to file.
    """
    # Loading data
    newsgroups_dataset = fetch_20newsgroups(subset="all",
                                            remove=("headers", "footers", "quotes"),
                                            random_state=123)
    # uses a predefined vocabulary list if available
    if vocpath:
        vocabulary = load_vocabulary(vocpath)
        newsgroups_counter = CountVectorizer(
            stop_words="english", tokenizer=tokenizer,
            vocabulary=vocabulary, min_df=min_df)
    else:
        newsgroups_counter = CountVectorizer(stop_words="english",
                                             tokenizer=tokenizer, min_df=min_df)
    # generates a csr matrix with the vectors of term frequencies
    newsgroups_mat = newsgroups_counter.fit_transform(newsgroups_dataset.data)
    # converts the csr matrix to a list-of-lists representation
    num_of_docs, vocab_size = newsgroups_mat.shape
    newsgroups_list = [[] for i in xrange(num_of_docs)]
    newsgroups_coo = newsgroups_mat.tocoo()
    for i, j, v in itertools.izip(newsgroups_coo.row, newsgroups_coo.col,
                                  newsgroups_coo.data):
        newsgroups_list[i].append([j, v])
    # saves corpus, vocabulary and indices
    save_corpus_to_file(dirpath + "/20newsgroups.corpus", newsgroups_list)
    save_vocabulary_to_file(dirpath + "/20newsgroups.vocab", newsgroups_list,
                            newsgroups_counter)
    save_idx_to_file(dirpath + "/20newsgroups.idx", newsgroups_dataset)
def load_20news():
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers', 'quotes'))
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         remove=('headers', 'footers', 'quotes'))
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    X_test = vectorizer.transform(newsgroups_test.data)
    y_train = newsgroups_train.target
    y_test = newsgroups_test.target
    X_train, X_val = X_train[:-1000], X_train[-1000:]
    y_train, y_val = y_train[:-1000], y_train[-1000:]
    return X_train, y_train, X_val, y_val, X_test, y_test
def retrieve_data():
    graphics_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
    categories = graphics_train.target_names
    # cluster the data from one class
    all_data = graphics_train.data
    filenames = graphics_train.filenames
    return all_data, filenames, categories
def category_docs_frequency_count(category):
    category_train = fetch_20newsgroups(subset='train', categories=category,
                                        shuffle=True, random_state=42)
    frequency = collections.Counter(category_train.target)  # count frequency of category ids
    docs_count = sum(frequency.values())  # sum up frequencies of docs of a category
    return docs_count
def load_newsgroup_data(V, cats, sort_data=True):
    from sklearn.datasets import fetch_20newsgroups
    print("Downloading newsgroups data...")
    print('cats = %s' % cats)
    newsgroups = fetch_20newsgroups(
        subset="train", categories=cats, remove=('headers', 'footers', 'quotes'))
    return get_sparse_repr(newsgroups.data, V, sort_data)
stop_words = text.ENGLISH_STOP_WORDS

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        new_doc = re.sub('[,.-:/()?{}*$#&]', ' ', doc)  # remove symbols
        new_doc = ''.join([ch for ch in new_doc if ch not in string.punctuation])  # remove all punctuation
        new_doc = "".join(ch for ch in new_doc if ord(ch) < 128)  # remove all non-ascii characters
        new_doc = new_doc.lower()  # convert to lowercase
        return [self.wnl.lemmatize(t) for t in word_tokenize(new_doc)]  # lemmatized tokens

# Load the eight categories
categories_8 = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
                'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
                'rec.sport.baseball', 'rec.sport.hockey']
eight_train = fetch_20newsgroups(subset='train', categories=categories_8,
                                 shuffle=True, random_state=42)
eight_test = fetch_20newsgroups(subset='test', categories=categories_8,
                                shuffle=True, random_state=42)

# Tokenize each document into words,
# get rid of stop words, and keep the lemmatized form of each word.
# Words appearing in fewer than 5 (or 2, if min_df=2) documents are ignored.
vectorizer = CountVectorizer(min_df=5, stop_words=stop_words,
                             tokenizer=LemmaTokenizer())
X_train_counts = vectorizer.fit_transform(eight_train.data)
X_test_counts = vectorizer.transform(eight_test.data)

# TF-IDF
# We set smooth_idf=False so we use the equation idf(d, t) = log [ n / df(d, t) ] + 1
tfidf_transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
'''
Source codes for Python Machine Learning By Example 2nd Edition (Packt Publishing)
Chapter 2: Exploring the 20 Newsgroups Dataset with Text Analysis Techniques
Author: Yuxi (Hayden) Liu
'''

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

categories_3 = ['talk.religion.misc', 'comp.graphics', 'sci.space']
groups_3 = fetch_20newsgroups(categories=categories_3)

def is_letter_only(word):
    for char in word:
        if not char.isalpha():
            return False
    return True

from nltk.corpus import names
all_names = set(names.words())

count_vector_sw = CountVectorizer(stop_words="english", max_features=500)

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

data_cleaned = []
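# A minimal sketch (assumed, not part of the original excerpt) of the cleaning
# loop that data_cleaned, is_letter_only, all_names and lemmatizer set up:
# keep letter-only tokens that are not person names, lemmatize them, and
# vectorize the cleaned documents with count_vector_sw.
for doc in groups_3.data:
    doc_cleaned = ' '.join(
        lemmatizer.lemmatize(word.lower())
        for word in doc.split()
        if is_letter_only(word) and word not in all_names)
    data_cleaned.append(doc_cleaned)

data_cleaned_count_sw = count_vector_sw.fit_transform(data_cleaned)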
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

emails = fetch_20newsgroups(
    categories=['rec.sport.baseball', 'rec.sport.hockey'])
#print(emails.target_names)
#print(emails.data[5])
#print(emails.target[5])

train_emails = fetch_20newsgroups(
    categories=['rec.sport.baseball', 'rec.sport.hockey'],
    subset='train', shuffle=True, random_state=108)
test_emails = fetch_20newsgroups(
    categories=['rec.sport.baseball', 'rec.sport.hockey'],
    subset='test', shuffle=True, random_state=108)

counter = CountVectorizer()
counter.fit(test_emails.data + train_emails.data)
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

classifier = MultinomialNB()
classifier.fit(train_counts, train_emails.target)
print(classifier.score(test_counts, test_emails.target))
import scipy as sp
from sklearn import (datasets, svm, metrics)
from nlp02_onehot_word import build_vocab
from nlp02_bow_hand import build_idf, doc2bow_hit, doc2bow_count, doc2bow_tfidf

# Load the 20 newsgroup dataset
remove = ('headers', 'footers', 'quotes')
train = datasets.fetch_20newsgroups(subset='train', remove=remove)
test = datasets.fetch_20newsgroups(subset='test', remove=remove)

# Build a vocabulary and its document frequency
vocab = build_vocab(train.data)
idf = build_idf(train.data, vocab)

# Vectorize training and test data
dataset_vectors = [
    {'name' : 'Hit',
     # Stack document vectors vertically for the whole dataset
     'train': sp.sparse.vstack([doc2bow_hit(doc, vocab) for doc in train.data]),
     'test' : sp.sparse.vstack([doc2bow_hit(doc, vocab) for doc in test.data])},
    {'name' : 'Count',
     'train': sp.sparse.vstack([doc2bow_count(doc, vocab) for doc in train.data]),
     'test' : sp.sparse.vstack([doc2bow_count(doc, vocab) for doc in test.data])},
    {'name' : 'TF-IDF',
     'train': sp.sparse.vstack([doc2bow_tfidf(doc, vocab, idf) for doc in train.data]),
     'test' : sp.sparse.vstack([doc2bow_tfidf(doc, vocab, idf) for doc in test.data])},
]

# Test with the SVM classifier
print('### Classification test (accuracy)')
for vector in dataset_vectors:
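    # Hedged completion: the original loop body is cut off at this point. A
    # plausible body under that assumption is to train an SVM on each
    # representation and report its test accuracy; `svm` and `metrics` come
    # from the imports at the top of this snippet.
    model = svm.SVC()
    model.fit(vector['train'], train.target)
    predict = model.predict(vector['test'])
    accuracy = metrics.accuracy_score(test.target, predict)
    print(f"* {vector['name']}: {accuracy:.3f}")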
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords

# problem 1
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
tfidf_Vect = TfidfVectorizer()
X_train_tfidf = tfidf_Vect.fit_transform(twenty_train.data)

clf = MultinomialNB()
classifier = SVC(kernel='linear', random_state=0)
clf.fit(X_train_tfidf, twenty_train.target)
classifier.fit(X_train_tfidf, twenty_train.target)

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
X_test_tfidf = tfidf_Vect.transform(twenty_test.data)
predicted = clf.predict(X_test_tfidf)
predicted1 = classifier.predict(X_test_tfidf)

score = metrics.accuracy_score(twenty_test.target, predicted)
score1 = metrics.accuracy_score(twenty_test.target, predicted1)
print("accuracy score with MultinomialNB", score)
print("accuracy score after applying SVM", score1)
def main3():
    newsgroups = fetch_20newsgroups(subset='all')
    count_vec = CountVectorizer(analyzer='word', stop_words='english')
    vec = count_vec.fit_transform(newsgroups.data)
    lab = newsgroups.target
    newvec = SelectKBest(chi2, k=features_num).fit_transform(vec, lab).todense()
    print(numpy.shape(newvec))
    # print(newvec)

    def add_layer(inputs, in_size, out_size, activation_function=None):
        weights = {
            'h1': tf.Variable(tf.random_normal([features_num, 100])),
            'h2': tf.Variable(tf.random_normal([100, 100])),
            'out': tf.Variable(tf.random_normal([100, 20]))
        }
        biases = {
            'b1': tf.Variable(tf.zeros([1, 100]) + 0.1),
            'b2': tf.Variable(tf.zeros([1, 100]) + 0.1),
            'out': tf.Variable(tf.zeros([1, 20]) + 0.1)
        }
        layer_1_multiplication = tf.matmul(inputs, weights['h1'])
        layer_1_addition = tf.add(layer_1_multiplication, biases['b1'])
        layer_1 = tf.nn.relu(layer_1_addition)
        layer_2_multiplication = tf.matmul(layer_1, weights['h2'])
        layer_2_addition = tf.add(layer_2_multiplication, biases['b2'])
        layer_2 = tf.nn.relu(layer_2_addition)
        out_layer_multiplication = tf.matmul(layer_2, weights['out'])
        out_layer_addition = out_layer_multiplication + biases['out']
        return out_layer_addition

    def compute_accuracy(v_xs, v_ys):
        y_pre = sess.run(prediction, feed_dict={xs: v_xs})
        v_ys = numpy.array(v_ys)
        correct_prediction = tf.equal(tf.argmax(y_pre, 1), tf.argmax(v_ys, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        result = sess.run(accuracy, feed_dict={xs: v_xs, ys: v_ys})
        return result

    xs = tf.placeholder(tf.float32, [None, features_num])
    ys = tf.placeholder(tf.float32, [None, 20])
    prediction = add_layer(xs, features_num, 20, activation_function=tf.nn.softmax)
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=ys))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)

    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    batchsize = 100

    def get_batch(vec, lab, i):
        batches = []
        results = []
        texts = vec[i * batchsize:i * batchsize + batchsize]
        categories = lab[i * batchsize:i * batchsize + batchsize]
        for text in texts:
            features = numpy.zeros((features_num), dtype=float)
            for i in range(features_num):
                features[i] = text[0, i]
            batches.append(features)
        for category in categories:
            y = numpy.zeros((20), dtype=int)
            y[category] = 1
            results.append(y)
        return batches, results

    def getall(vec, lab):
        batches = []
        results = []
        texts = vec[0:8000]
        categories = lab[0:8000]
        for text in texts:
            features = numpy.zeros((features_num), dtype=float)
            for i in range(features_num):
                features[i] = text[0, i]
            batches.append(features)
        for category in categories:
            y = numpy.zeros((20), dtype=int)
            y[category] = 1
            results.append(y)
        return batches, results

    for i in range(1000):
        print(i)
        batch_xs, batch_ys = get_batch(newvec, lab, i)
        if (len(batch_xs) == 0):
            break
        sess.run(train_step, feed_dict={xs: batch_xs, ys: batch_ys})
    all_xs, all_ys = getall(newvec, lab)
    print(compute_accuracy(all_xs, all_ys))
# first extract the 20 news_group dataset to /scikit_learn_data
from sklearn.datasets import fetch_20newsgroups

# all categories
# newsgroup_train = fetch_20newsgroups(subset='train')
# selected categories
categories = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware', 'comp.windows.x']
newsgroup_train = fetch_20newsgroups(subset='train', categories=categories)

def calculate_result(actual, pred):
    m_precision = metrics.precision_score(actual, pred, average='macro')
    m_recall = metrics.recall_score(actual, pred, average='macro')
    print 'predict info:'
    print 'precision:{0:.3f}'.format(m_precision)
    print 'recall:{0:0.3f}'.format(m_recall)
    print 'f1-score:{0:.3f}'.format(metrics.f1_score(actual, pred, average='macro'))

# print category names
from pprint import pprint
pprint(list(newsgroup_train.target_names))

# newsgroup_train.data is the original documents, but we need to extract the
# TF-IDF vectors in order to model the text data
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
# vectorizer = TfidfVectorizer(sublinear_tf = True,
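# Hedged sketch (assumed, not part of the original file) of the step the
# truncated vectorizer line above is heading towards: build TF-IDF features
# for the train and test splits and evaluate a Multinomial Naive Bayes
# baseline with calculate_result(). The original file appears to import
# `metrics` elsewhere; it is imported here explicitly so the sketch is
# self-contained. All parameter values are illustrative.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

newsgroup_test = fetch_20newsgroups(subset='test', categories=categories)
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
fea_train = vectorizer.fit_transform(newsgroup_train.data)
fea_test = vectorizer.transform(newsgroup_test.data)

clf = MultinomialNB(alpha=0.01)
clf.fit(fea_train, newsgroup_train.target)
pred = clf.predict(fea_test)
calculate_result(newsgroup_test.target, pred)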
# TextBlob is a wrapper over NLTK and provides easy-to-use built-in functions
# and methods
from nltk.corpus import names
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# download the dataset
from sklearn.datasets import fetch_20newsgroups
# see the unique
import numpy as np
from sklearn.decomposition import NMF

# download the dataset
groups = fetch_20newsgroups()

ps = PorterStemmer()
lm = WordNetLemmatizer()

# let's check a few names
print(names.words()[:20])

# the difference between stemming and lemmatization is that lemmatization is a
# more cautious version of stemming
# examples
ps.stem("machines")
ps.stem('learning')
# the lemmatization algorithm is based on the built-in WordNet corpus
lm.lemmatize('machines')
lm.lemmatize('learning')
# lemmatize() defaults to nouns, so verbs need an explicit POS argument
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_20newsgroups
from time import time
import matplotlib.pyplot as plt

# #############################################################################
# Load data set
print("Loading 20 newsgroups dataset:")
data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
data_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
print('data loaded')

target_names = data_train.target_names
print("%d documents (training set)" % len(data_train.data))
print("%d documents (test set)" % len(data_test.data))
print("%d categories" % len(data_train.target_names))
print()

# #############################################################################
# split into train set and test set
y_train, y_test = data_train.target, data_test.target

# #############################################################################
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()

print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42, remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42, remove=remove)
print('data loaded')

target_names = data_train.target_names

def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6
# coding: utf-8

# In[2]:

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import numpy as np
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn.datasets import fetch_20newsgroups

data = fetch_20newsgroups()
data.target_names

# In[3]:

categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x',
              'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
              'sci.space', 'soc.religion.christian', 'talk.politics.guns',
              'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

# In[4]:

## Training the data on these categories
train = fetch_20newsgroups(subset="train", categories=categories)
## Testing the data for these categories
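# Hedged sketch of what the comment above sets up (the next notebook cell is
# not included here): fetch the matching test split and fit a simple
# TF-IDF + Multinomial Naive Bayes pipeline on the training data. The
# pipeline choice is an assumption, not taken from this notebook.
test = fetch_20newsgroups(subset="test", categories=categories)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(train.data, train.target)
labels = model.predict(test.data)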
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

if __name__ == '__main__':
    # Fetch the dataset
    data = fetch_20newsgroups(subset="all")
    texts = np.array(data.data)
    labels = np.array(data.target)
    df = pd.DataFrame(data={'texts': texts, 'labels': labels})
    df.to_csv('20newsgroups.csv', index=False)

    # Read split indices
    with open('splits/test', 'r') as f:
        test_idx = np.array(list(map(int, f.read().splitlines())))
    with open('splits/validation', 'r') as f:
        validation_idx = np.array(list(map(int, f.read().splitlines())))
    assert not set(test_idx).intersection(set(validation_idx))

    test_texts, test_labels = texts[test_idx], labels[test_idx]
    val_texts, val_labels = texts[validation_idx], labels[validation_idx]
    concat_idx = np.append(test_idx, validation_idx)
    texts, labels = np.delete(texts, concat_idx), np.delete(labels, concat_idx)

    df_test = pd.DataFrame(data={'texts': test_texts, 'labels': test_labels})
    df_val = pd.DataFrame(data={'texts': val_texts, 'labels': val_labels})
    # the test set is also used for training because this is unsupervised learning
    df_test.to_csv('20newsgroups_train.csv', index=False)
    df_val.to_csv('20newsgroups_val.csv', index=False)
from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')
X, y = news.data, news.target

from bs4 import BeautifulSoup
import nltk, re

def news_to_sentences(news):
    news_text = BeautifulSoup(news).get_text()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(news_text)
    sentences = []
    for sent in raw_sentences:
        sentences.append(re.sub('[^a-zA-Z]', ' ', sent.lower().strip()).split())
    return sentences

sentences = []
for x in X:
    sentences += news_to_sentences(x)

from gensim.models import word2vec
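# Hedged continuation (the original snippet stops at the import above): train
# a Word2Vec model on the collected sentences. The parameter values are
# illustrative assumptions, and the `size`/`most_similar` spelling follows the
# pre-4.0 gensim API used elsewhere in these snippets.
model = word2vec.Word2Vec(sentences, workers=2, size=300, min_count=20, window=5)
print(model.most_similar('email'))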
    loss = torch.mul(x, weight)
    return loss.mean()

def train_model(data: GloveDataset):
    optimizer = torch.optim.Adam(data.all_params, weight_decay=1e-8)
    optimizer.zero_grad()
    for epoch in tqdm(range(NUM_EPOCH)):
        logging.info("Start epoch %i", epoch)
        num_batches = int(len(data) / BATCH_SIZE)
        avg_loss = 0.0
        n_batch = int(len(data) / BATCH_SIZE)
        for batch in tqdm(gen_batchs(data), total=n_batch, mininterval=1):
            optimizer.zero_grad()
            loss = get_loss(*batch)
            avg_loss += loss.data.item() / num_batches
            loss.backward()
            optimizer.step()
        logging.info("Average loss for epoch %i: %.5f", epoch + 1, avg_loss)

if __name__ == "__main__":
    logging.info("Fetching data")
    newsgroup = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
    logging.info("Build dataset")
    glove_data = GloveDataset(newsgroup.data, right_window=RIGHT_WINDOW)
    logging.info("#Words: %s", glove_data.indexer.n_words)
    logging.info("#Ngrams: %s", len(glove_data))
    logging.info("Start training")
    train_model(glove_data)
# coding: utf-8

# # Part 1 - Clustering of Text Data

# ## Question 1: Build TF-IDF Matrix

# In[1]:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

categories = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
              'rec.sport.baseball', 'rec.sport.hockey']
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

vectorizer = CountVectorizer(min_df=3, stop_words='english')
X_counts = vectorizer.fit_transform(dataset.data)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)
labels = [0 if label < 4 else 1 for label in dataset.target]
print('X_tfidf Shape:', X_tfidf.shape)

# ## Question 2: Contingency Table of K-means Clustering

# In[2]:
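# Hedged sketch for the Question 2 cell (the original cell body is not
# included in this excerpt): cluster the TF-IDF matrix with k=2 and compare
# the cluster assignments against the two super-class labels via a
# contingency table. The KMeans parameters are assumptions.
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import contingency_matrix

km = KMeans(n_clusters=2, random_state=0, max_iter=1000)
km.fit(X_tfidf)
print(contingency_matrix(labels, km.labels_))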
from time import time

from sklearn import datasets
from sklearn.cross_validation import KFold
from sklearn.feature_extraction import text
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

'''
First you need to load the data. In this assignment we use one of the datasets
available in scikit-learn: 20 newsgroups. Use the datasets module for this.

1. Load the objects from the 20 newsgroups dataset that belong to the "space"
   and "atheism" categories (see the instructions above). Note that
   downloading the data may take several minutes.
'''
print("Loading dataset...")
t0 = time()
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
    download_if_missing=True)
data_samples = newsgroups.data
print("done in %0.3fs." % (time() - t0))

'''
After this code runs, the array of texts is in newsgroups.data and the class
labels are in newsgroups.target.

One difficulty of working with text data is that a numeric representation has
to be built for it. One way to obtain such a representation is to compute
TF-IDF. In scikit-learn this is implemented in the class
sklearn.feature_extraction.text.TfidfVectorizer. The training set should be
transformed with fit_transform, the test set with transform.

The SVM classifier is implemented in the class sklearn.svm.SVC. The weight of
each feature of a trained classifier is stored in the coef_ field. To find out
which word the i-th feature corresponds to, you can use the
get_feature_names() method of TfidfVectorizer.
'''
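# Hedged sketch of the steps the instructions above describe (not part of this
# fragment): build TF-IDF features, fit a linear SVM, and map the weights in
# coef_ back to words with get_feature_names(). Variable names are assumptions.
vectorizer = text.TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)

clf = SVC(kernel='linear', random_state=241)
clf.fit(X, newsgroups.target)

words = vectorizer.get_feature_names()
top = sorted(zip(abs(clf.coef_.toarray()[0]), words), reverse=True)[:10]
print([w for _, w in top])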
print "This run will use min_df=" + str(this_df) print "numpy version: " + np.__version__ print "sklearn version: " + skl.__version__ print "matplotlib version: " + mpl.__version__ print "nltk version: " + nltk.__version__ categories = [ 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey' ] trainset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) testset = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42) #plot the histogram for part a #plt.hist(trainset.target, bins=range(min(trainset.target), (max(trainset.target) + 2))) #plt.xlabel('Article target number') #plt.ylabel('Number of articles target number') #plt.title('Histogram of Article distribution, training set') #plt.show() # #plt.hist(testset.target, bins=range(min(testset.target), (max(testset.target) + 2))) #plt.xlabel('Article target number')
#=================================Obtain data==================================
comp_tech_subclasses = ['comp.graphics', 'comp.os.ms-windows.misc',
                        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
rec_act_subclasses = ['rec.autos', 'rec.motorcycles',
                      'rec.sport.baseball', 'rec.sport.hockey']
dataset = fetch_20newsgroups(subset='all',
                             categories=comp_tech_subclasses + rec_act_subclasses,
                             shuffle=True, random_state=42,
                             remove=('headers', 'footers', 'quotes'))
labels = [1] * len(dataset.data)
for i in range(len(dataset.data)):
    if dataset.target[i] > 3:
        labels[i] = 0
#==============================================================================

#===================Remove Punctuation & Stem & Stop Words=====================
from sklearn.datasets import fetch_20newsgroups

Training_data = fetch_20newsgroups(subset='train', shuffle=True)
Training_data.target_names

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

svm_classification = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              n_iter=5, random_state=42)),
])
svm_classification = svm_classification.fit(Training_data.data, Training_data.target)

import numpy as np
Testing_data = fetch_20newsgroups(subset='test', shuffle=True)
svm_prediction = svm_classification.predict(Testing_data.data)
print("Accuracy of Support Vector Machine in percentage :",
      np.mean(svm_prediction == Testing_data.target) * 100)
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
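# Hedged continuation of the comment above, modelled on the standard
# scikit-learn topic-extraction example this fragment resembles (exact
# parameters are assumptions): extract raw term counts for LDA.
from sklearn.feature_extraction.text import CountVectorizer

print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))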
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100)),
                     ])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import GridSearchCV, KFold
# TF-IDF is a statistic equal to the product of two numbers:
# TF (term frequency) and IDF (inverse document frequency).
# The first is the ratio of the number of occurrences of a word in a document
# to the total length of the document.
# The second depends on how many documents of the corpus contain the word.
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# the array of texts is in newsgroups.data,
# the class labels are in newsgroups.target
newsgroups = datasets.fetch_20newsgroups(
    subset='all', categories=['alt.atheism', 'sci.space'])

tfid_newsgroups = TfidfVectorizer()
data_params = tfid_newsgroups.fit_transform(newsgroups.data)
# To find out which word the i-th feature corresponds to,
# use the get_feature_names() method of TfidfVectorizer:
feature_mapping = tfid_newsgroups.get_feature_names()

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = svm.SVC(kernel='linear', random_state=241)
gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
gs.fit(data_params, newsgroups.target)

clf = svm.SVC(kernel='linear', C=gs.best_estimator_.C, random_state=241)
clf.fit(data_params, newsgroups.target)
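# Hedged follow-up (not in the original snippet): list the ten words with the
# largest absolute SVM weights, using the feature_mapping built above.
weights = np.abs(clf.coef_.toarray()[0])
top10 = [feature_mapping[i] for i in np.argsort(weights)[-10:]]
print(sorted(top10))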
categories = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware', 'comp.windows.x', 'rec.autos', 'rec.motorcycles',
              'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'misc.forsale', 'talk.politics.misc',
              'talk.politics.guns', 'talk.politics.mideast', 'talk.religion.misc',
              'alt.atheism', 'soc.religion.christian']

# Print information
print("Loading 20 newsgroups dataset for categories...")
print(categories)
print()

# Load dataset and map the fine-grained labels onto coarse groups
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True,
                             random_state=42, remove=('headers', 'footers', 'quotes'))
size = dataset.target.shape[0]
for i in range(0, size):
    if (dataset.target[i] <= 4):
        dataset.target[i] = 0
    if (5 <= dataset.target[i] and dataset.target[i] <= 8):
        dataset.target[i] = 1
    if (9 <= dataset.target[i] and dataset.target[i] <= 12):
        dataset.target[i] = 2
    if (dataset.target[i] == 13):
        dataset.target[i] = 3
    if (14 <= dataset.target[i] and dataset.target[i] <= 16):
        dataset.target[i] = 4
    if (17 <= dataset.target[i] and dataset.target[i] <= 19):
        dataset.target[i] = 5
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import matplotlib.pyplot as plt

categories = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
              'rec.sport.baseball', 'rec.sport.hockey']
train_data = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42)

length = []
data = []
index = []
for m in range(8):
    temp_index = []
    temp_index.append(list(np.where(train_data.target == m))[0])
    index.append(temp_index)
    temp_data = []
    for n in index[m][0]:
        temp_data.append(train_data.data[n])
    data.append(temp_data)
    length.append(len(temp_data))

plt.figure()
plt_index = range(8)
width = 1
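# Hedged sketch of the plot this snippet is setting up (the plotting calls are
# cut off above): a bar chart of document counts per category, using the
# plt_index, width and length values already computed.
plt.bar(plt_index, length, width)
plt.xticks(plt_index, categories, rotation=45, ha='right')
plt.xlabel('Category')
plt.ylabel('Number of training documents')
plt.tight_layout()
plt.show()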
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
from sklearn.datasets import fetch_20newsgroups
import re
import matplotlib.pyplot as plt

# download example data (may take a while)
train = fetch_20newsgroups()

def clean(text):
    """Remove posting header, split by sentences and words, keep only letters"""
    lines = re.split('[?!.:]\s', re.sub('^.*Lines: \d+', '', re.sub('\n', ' ', text)))
    return [re.sub('[^a-zA-Z]', ' ', line).lower().split() for line in lines]

sentences = [line for text in train.data for line in clean(text)]

model = Word2Vec(sentences, workers=4, size=100, min_count=50,
                 window=10, sample=1e-3)

print(model.most_similar('memory'))

X = model[model.wv.vocab]
tsne = TSNE(n_components=2)
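# Hedged continuation (the original stops after constructing TSNE): project a
# subset of the word vectors to 2-D and scatter-plot them with their labels.
# The subset size and plotting details are assumptions.
words = list(model.wv.vocab)[:200]
X_tsne = tsne.fit_transform(X[:200])

plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
for word, (x, y) in zip(words, X_tsne):
    plt.annotate(word, xy=(x, y))
plt.show()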
def getData(categories, subset, shuffle, random_state):
    return fetch_20newsgroups(subset=subset, categories=categories,
                              shuffle=shuffle, random_state=random_state)
        ],
        # weight components in ColumnTransformer
        transformer_weights={
            'subject': 0.8,
            'body_bow': 0.5,
            'body_stats': 1.0,
        })),

    # Use a SVC classifier on the combined features
    ('svc', LinearSVC()),
], verbose=True)

# limit the list of categories to make running this example faster.
categories = ['alt.atheism', 'talk.religion.misc']
train = fetch_20newsgroups(random_state=1,
                           subset='train',
                           categories=categories,
                           )
test = fetch_20newsgroups(random_state=1,
                          subset='test',
                          categories=categories,
                          )

pipeline.fit(train.data, train.target)
y = pipeline.predict(test.data)
print(classification_report(y, test.target))
            pred = self.w @ vector
            # print(pred)
            pred = self.sigmoid(pred)
            labels.append(np.argmax(pred))
        return labels

    def sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def loss(self, label, pred):
        return np.mean(np.square(label - pred))


newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42,
                                     remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
vectorizer.fit(newsgroups_train.data)
train_vector = vectorizer.transform(newsgroups_train.data)
test_vector = vectorizer.transform(newsgroups_test.data)

decomp = decomposition.TruncatedSVD(n_components=300)
train_pca = decomp.fit_transform(train_vector)
test_pca = decomp.transform(test_vector)
from sklearn.datasets import fetch_20newsgroups
from gensim.models import Word2Vec
from nltk import sent_tokenize, word_tokenize

newsgroups_data = fetch_20newsgroups(subset='all')

sentences = []
for doc in newsgroups_data.data:
    for sent in sent_tokenize(doc):
        word_list = word_tokenize(sent)
        sentences.append(word_list)

# reference https://radimrehurek.com/gensim/models/word2vec.html
print('Start training!')
model = Word2Vec(sentences, sg=1, hs=0, size=100, min_count=5,
                 max_vocab_size=50000)  # skip-gram with negative sampling
model.save('20news-vectors-negative100.model')
model.wv.save_word2vec_format('20news-vectors-negative100.bin', binary=True)
print('Done training!')
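# Hedged usage example (not part of the original script): reload the exported
# vectors with KeyedVectors and query nearest neighbours. The query word is
# illustrative and assumes it survived the vocabulary cut.
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('20news-vectors-negative100.bin', binary=True)
print(wv.most_similar('computer', topn=5))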
    return int(newsgroups_counts[:, count_vectorizer.vocabulary_["phone"]]
               .toarray().sum())

# ## Question 7
#
# Apply `TfidfVectorizer` to the `newsgroups` dataset and find the TF-IDF of the
# word _phone_. Answer with a single scalar rounded to three decimal places.

# In[28]:

tfidf_vectorizer = TfidfVectorizer()

categories = ['sci.electronics', 'comp.graphics', 'rec.motorcycles']
newsgroup = fetch_20newsgroups(subset="train", categories=categories,
                               shuffle=True, random_state=42)

tfidf_vectorizer.fit(newsgroup.data)
newsgroups_tfidf_vectorized = tfidf_vectorizer.transform(newsgroup.data)

# In[29]:

def q7():
    idf_value = newsgroups_tfidf_vectorized[:, tfidf_vectorizer
                                            .vocabulary_["phone"]].toarray().sum()
def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all', download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)