def loadTrain():
    print("Loading 20 newsgroups training set...")
    news_train = load_mlcomp('20news-18828', 'train')
    print(news_train.DESCR)
    print("%d documents" % len(news_train.filenames))
    print("%d categories" % len(news_train.target_names))
    return news_train
def loadTest():
    print("Loading 20 newsgroups test set...")
    t0 = time()  # start the timer before loading so the elapsed time is meaningful
    news_test = load_mlcomp('20news-18828', 'test')
    print("done in %fs" % (time() - t0))
    print("%d documents" % len(news_test.filenames))
    print("%d categories" % len(news_test.target_names))
    return news_test
def load_data(self):
    """Load the news_train file to see how it's structured."""
    # Load the training set
    print("Loading 20 newsgroups training set... ")
    self.news_train = load_mlcomp('20news-18828', 'train', mlcomp_root=MLCOMP)
    print(self.news_train.DESCR)
    print("%d documents" % len(self.news_train.filenames))
    print("%d categories" % len(self.news_train.target_names))
    return self.news_train
def load_texts(dataset_type='train', groups=None):
    """Load the dataset into a list of byte strings.

    :return: train_dataset_bunch.data, a list of bytes
    """
    if groups == 'small':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc']  # small subset for quick tests, #1368 posts
    elif groups == 'medium':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
                  'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']  # medium-sized subset, #3414 posts
    train_dataset_bunch = datasets.load_mlcomp('20news-18828', dataset_type,
                                               mlcomp_root='./datasets',
                                               categories=groups)  # 13180 posts in total
    return train_dataset_bunch.data
def load_texts(dataset_type='train', groups='small'):
    """Load the dataset and preprocess it into a list of texts.

    :return: preprocessed texts (list of text bytes)
    """
    if groups == 'small':
        groups = ['comp.graphics', 'rec.motorcycles', 'talk.politics.guns']  # small subset for quick tests, #1368 posts
    elif groups == 'medium':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
                  'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']  # medium-sized subset, #3414 posts
    train_dataset_bunch = datasets.load_mlcomp('20news-18828', dataset_type,
                                               mlcomp_root='../datasets',
                                               categories=groups)  # 13180 posts in total
    texts = preprocess_texts(train_dataset_bunch.data)  # bunch.data is a list of text bytes
    return texts
def extract_test_features(self):
    print("Loading 20 newsgroups test set... ")
    t0 = time()  # start the timer before loading so the elapsed time is meaningful
    self.news_test = load_mlcomp('20news-18828', 'test', mlcomp_root=MLCOMP)
    print("done in %fs" % (time() - t0))
    print("Predicting the labels of the test set...")
    print("%d documents" % len(self.news_test.filenames))
    print("%d categories" % len(self.news_test.target_names))

    print("Extracting features from the dataset using the same vectorizer")
    t0 = time()
    # Read in the test data and transform it with the already-fitted vectorizer
    test_input_strings = (open(f, encoding='latin-1').read()
                          for f in self.news_test.filenames)
    self.X_test = self.vectorizer.transform(test_input_strings)
    self.y_test = self.news_test.target
    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % self.X_test.shape)
    return
if not os.path.exists(DATA_DIR):
    print("""\
It seems that you have not yet downloaded the MLCOMP data set.
Please do so and place it into %s.""" % DATA_DIR)
    sys.exit(1)

# groups = [
#     'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
#     'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
# dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
#                                        mlcomp_root=DATA_DIR,
#                                        categories=groups)

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train', mlcomp_root=DATA_DIR)
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

print("Loading 20 newsgroups test set... ")
#posts = load_data_from_dir("Building_ML_Systems_with_Python/chapter_03_Codes/data/toy", "\n")
#X_train = vectorizer.fit_transform(posts)
#n_samples, n_features = X_train.shape
#print n_samples
#post = "how does machine learning work?"
#post_vec, found_post, distance = get_similar_posts(X_train, post, posts)
#
#print "\n"
#print "The most similar post to '%s' is: '%s' with distance= %.2f" % (post, found_post, distance)

MLCOMP_DIR = "Building_ML_Systems_with_Python/chapter_03_Codes/data"
categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = datasets.load_mlcomp("20news-18828", "train",
                                  mlcomp_root=MLCOMP_DIR, categories=categories)
test_data = datasets.load_mlcomp("20news-18828", "test",
                                 mlcomp_root=MLCOMP_DIR, categories=categories)
print("Number of training data posts:", len(train_data.filenames))
print("Number of test data posts:", len(test_data.filenames))

vectorizer = StemmedTfIdfCountVectorizer(min_df=10, max_df=0.5,
                                         stop_words='english', decode_error='ignore')
X_train = vectorizer.fit_transform(train_data.data)
X_test = vectorizer.transform(test_data.data)
num_train_samples, num_train_features = X_train.shape
num_test_samples, num_test_features = X_test.shape
labels = train_data.target
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

print(__doc__)

# if 'MLCOMP_DATASETS_HOME' not in os.environ:
#     print("MLCOMP_DATASETS_HOME not set; please follow the above instructions")
#     sys.exit(0)

# updated by Ivy
MLCOMP_DIR = r"E:\ctvit\lab\Code\Pycharm Project\cctv_news_content"

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_DIR)
# news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform(
    (open(f).read() for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target
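# --- Hedged sketch, not part of the original snippet: the excerpt stops after building
# X_train / y_train, so this shows one conventional next step using the classifiers and
# metrics imported above. Loading the 'test' split from the same MLCOMP_DIR, the alpha
# value, and the names news_test / clf / pred are illustrative assumptions.
news_test = load_mlcomp('20news-18828', 'test', mlcomp_root=MLCOMP_DIR)
X_test = vectorizer.transform(open(f).read() for f in news_test.filenames)
y_test = news_test.target

clf = MultinomialNB(alpha=0.01).fit(X_train, y_train)  # SGDClassifier would be used the same way
pred = clf.predict(X_test)
print(classification_report(y_test, pred, target_names=news_test.target_names))
print(confusion_matrix(y_test, pred))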
"""

import pdb

from sklearn import datasets
from sklearn.cluster import KMeans

from RelatedPosts import StemmedTfidfVectorizer, norm_euclidDist

""" loading datasets into train_dataset_bunch """
MLCOMP_DIR = r'./datasets'
groups = ['comp.graphics', 'comp.os.ms-windows.misc']  # small subset for quick tests
# groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
#           'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']  # medium-sized subset
train_dataset_bunch = datasets.load_mlcomp(name_or_id='20news-18828', set_='train',
                                           mlcomp_root=MLCOMP_DIR,
                                           categories=groups)  # <class 'sklearn.datasets.base.Bunch'>
# print(type(train_dataset_bunch.data))      # list
# print(type(train_dataset_bunch.data[0]))   # bytes
# print(len(train_dataset_bunch.filenames))  # posts=3414

""" fit_transform dataset.data into train_dataset_mats """
Vectorizer = StemmedTfidfVectorizer
vectorizer = Vectorizer(decode_error='ignore', stop_words='english',
                        min_df=10, max_df=0.5)
# scipy.sparse.csr.csr_matrix
train_dataset_mats = vectorizer.fit_transform(train_dataset_bunch.data)
num_samples, num_features = train_dataset_mats.shape
print("#samples:%d, #features:%d" % (num_samples, num_features))
from flask import Flask
from flask import render_template
from flask import request
import dill
from unicode import convertunicode
from sklearn.datasets import load_mlcomp

MLCOMPDIR = r'NewsData'
trainNews = load_mlcomp('NewsData', 'train', mlcomp_root=MLCOMPDIR)

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/classify', methods=['GET', 'POST'])
def classify():
    if request.method == 'POST':
        result = request.form
        newsText = result.get('inputText')
        newsText = [newsText]
        tfidf = dill.load(open("tfidf.pkl", 'rb'))
        # use transform (not fit_transform) so the loaded vectorizer keeps its fitted vocabulary
        newsText = tfidf.transform(newsText)
        model = dill.load(open("model.pkl", 'rb'))
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.naive_bayes import MultinomialNB

print(__doc__)

#if 'MLCOMP_DATASETS_HOME' not in os.environ:
#    print("DATASETS_HOME not set; please follow the above instructions")
#    sys.exit(0)

# Load the training set
dataset_name = '20w_nospec_may'
print("Loading twitter training set... ")
twitter_train = load_mlcomp(dataset_name, 'train')
print(twitter_train.DESCR)
print("%d documents" % len(twitter_train.filenames))
print("%d categories" % len(twitter_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform((open(f).read() for f in twitter_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = twitter_train.target
import sklearn
import nltk
import scipy as sp
from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC

MLCOMPDIR = r'LOCATION OF CORPUS'
trainNews = load_mlcomp('16NepaliNews', 'train', mlcomp_root=MLCOMPDIR)
testNews = load_mlcomp('16NepaliNews', 'test', mlcomp_root=MLCOMPDIR)

''' Nepali Stop Words '''
# The stop words file is copied into the stopwords directory of the nltk.data\corpora folder
stopWords = set(nltk.corpus.stopwords.words('nepali'))

''' Testing and Training Data '''
xTrain = trainNews.data
xTest = testNews.data
yTrain = trainNews.target
yTest = testNews.target

''' Vectorizer '''
tfidfVectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "),
                                  sublinear_tf=True,
                                  encoding='utf-8',
                                  decode_error='ignore',
                                  stop_words=stopWords)
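# --- Hedged sketch, not in the original excerpt: the imports above (Pipeline, SelectKBest,
# chi2, MultinomialNB/BernoulliNB/SVC) suggest a feature-selection + classifier pipeline,
# so here is one minimal way those pieces are commonly wired together. The k=1000 value and
# the name newsPipeline are illustrative assumptions only.
newsPipeline = Pipeline([
    ('tfidf', tfidfVectorizer),            # raw Nepali text -> TF-IDF matrix
    ('chi2', SelectKBest(chi2, k=1000)),   # keep the k terms most correlated with the labels
    ('clf', MultinomialNB()),              # could be swapped for BernoulliNB or SVC
])
newsPipeline.fit(xTrain, yTrain)
print(metrics.classification_report(yTest, newsPipeline.predict(xTest),
                                    target_names=testNews.target_names))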
def testBaseLine(self):
    #return  # disable slow test for now
    logger.info("Running 20NG NB baseline...")
    logger.info("Calculating TF-IDF on 20ng data set...")
    news_train = load_mlcomp('20news-18828', 'train')
    news_test = load_mlcomp('20news-18828', 'test')
    target_names = news_test.target_names
    vectorizer = TfidfVectorizer(encoding='latin1')
    X_train = vectorizer.fit_transform(
        (open(f).read() for f in news_train.filenames))
    y_train = news_train.target
    X_test = vectorizer.transform(
        (open(f).read() for f in news_test.filenames))
    y_test = news_test.target
    del news_train, news_test

    logger.info("Running MultinomialNB...")
    clf = MultinomialNB().fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test),
                                target_names=target_names))
    del clf

    logger.info("Running pybrain...")
    from pybrain.datasets import ClassificationDataSet
    from pybrain.utilities import percentError
    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.supervised.trainers import BackpropTrainer
    from pybrain.structure.modules import SoftmaxLayer
    from pybrain.tools.xml.networkwriter import NetworkWriter
    from pybrain.tools.xml.networkreader import NetworkReader

    trndata = ClassificationDataSet(len(vectorizer.get_feature_names()), 1,
                                    nb_classes=len(target_names),
                                    class_labels=target_names)
    for i, x in enumerate(X_train):
        #print x, y_train[i]
        trndata.addSample(x.toarray(), y_train[i])
    trndata._convertToOneOfMany()
    del X_train, y_train

    tstdata = ClassificationDataSet(len(vectorizer.get_feature_names()), 1,
                                    nb_classes=len(target_names),
                                    class_labels=target_names)
    for i, x in enumerate(X_test):
        tstdata.addSample(x.toarray(), y_test[i])
    tstdata._convertToOneOfMany()
    del X_test  # keep y_test for the final classification_report below

    logger.info("Building network...")
    fnn = buildNetwork(trndata.indim, 100, trndata.outdim, outclass=SoftmaxLayer)
    trainer = BackpropTrainer(fnn, dataset=trndata, momentum=0.1,
                              learningrate=0.01, verbose=True, weightdecay=0.01)
    logger.info("Training pybrain for 50 epochs...")
    trainer.trainEpochs(50)
    pred = fnn.activateOnDataset(tstdata)
    pred = np.argmax(pred, axis=1)  # argmax gives the class
    print(classification_report(y_test, pred, target_names=target_names))
#coding:utf-8
'''
Created on 2015-07-05

@author: Administrator
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_mlcomp
import nltk.stem

MLCOMP_DIR = r'data'
data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_DIR)
#print (data.filenames)
#print len(data.filenames)
#print len(data.target_names)

groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
          'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = load_mlcomp('20news-18828', 'train', mlcomp_root=MLCOMP_DIR, categories=groups)
test_data = load_mlcomp('20news-18828', 'test', mlcomp_root=MLCOMP_DIR)
#print len(test_data.filenames)

english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
                                    stop_words='english', decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)  # vectorize the category-filtered training posts
from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import Vectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
english_stemmer = SnowballStemmer('english')
vectorizer = StemmedTfidfVectorizer(min_df=10, stop_words='english', decode_error='ignore')

MLCOMP_DIR = os.path.join(os.getcwd())
print(MLCOMP_DIR)
groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'
]

#data = load_mlcomp("20news-18828", mlcomp_root=MLCOMP_DIR)
#train_data = fetch_20newsgroups(subset='train')
#test_data = fetch_20newsgroups(subset='test')
test_data = load_mlcomp("20news-18828", "test", mlcomp_root=MLCOMP_DIR)
train_data = load_mlcomp("20news-18828", "train", mlcomp_root=MLCOMP_DIR)

vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print(num_samples, num_features)

num_clusters = 50
km = KMeans(n_clusters=num_clusters, init='random', n_init=1, verbose=1)
km.fit(vectorized)

new_post = ("Disk drive problems. Hi, I have a problem with my hard disk. "
            "After 1 year it is working only sporadically now. "
            "I tried to format it, but now it doesn't boot any more. Any ideas? Thanks.")
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]
print(new_post_label)

similar_indices = (km.labels_ == new_post_label).nonzero()[0]
similar = []
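# --- Hedged sketch, not in the original excerpt: the snippet stops right after
# `similar = []`, so the following shows one common way to finish the idea, in the spirit
# of the Building ML Systems with Python clustering example: rank the posts that share the
# new post's cluster by their distance to the new post. The Euclidean-norm distance and the
# loop below are assumptions, not the source author's code.
import scipy as sp

for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))
similar.sort(key=lambda pair: pair[0])
print("Count of similar posts: %d" % len(similar))
print("Most similar post (distance=%.2f):\n%s" % (similar[0][0], similar[0][1]))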
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.naive_bayes import MultinomialNB

print(__doc__)

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
dataset_name = 'hashtagging-tweets'
print("Loading twitter training set... ")
twitter_train = load_mlcomp(dataset_name, 'train')
print(twitter_train.DESCR)
print("%d documents" % len(twitter_train.filenames))
print("%d categories" % len(twitter_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform((open(f).read() for f in twitter_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = twitter_train.target

##################################################
def testBaseLine(self):
    #return  # disable slow test for now
    logger.info("Running 20NG NB baseline...")
    logger.info("Calculating TF-IDF on 20ng data set...")
    news_train = load_mlcomp('20news-18828', 'train')
    news_test = load_mlcomp('20news-18828', 'test')
    target_names = news_test.target_names
    vectorizer = TfidfVectorizer(encoding='latin1')
    X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames))
    y_train = news_train.target
    X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
    y_test = news_test.target
    del news_train, news_test

    logger.info("Running MultinomialNB...")
    clf = MultinomialNB().fit(X_train, y_train)
    print(classification_report(y_test, clf.predict(X_test), target_names=target_names))
    del clf

    logger.info("Running pybrain...")
    from pybrain.datasets import ClassificationDataSet
    from pybrain.utilities import percentError
    from pybrain.tools.shortcuts import buildNetwork
    from pybrain.supervised.trainers import BackpropTrainer
    from pybrain.structure.modules import SoftmaxLayer
    from pybrain.tools.xml.networkwriter import NetworkWriter
    from pybrain.tools.xml.networkreader import NetworkReader

    trndata = ClassificationDataSet(len(vectorizer.get_feature_names()), 1,
                                    nb_classes=len(target_names),
                                    class_labels=target_names)
    for i, x in enumerate(X_train):
        #print x, y_train[i]
        trndata.addSample(x.toarray(), y_train[i])
    trndata._convertToOneOfMany()
    del X_train, y_train

    tstdata = ClassificationDataSet(len(vectorizer.get_feature_names()), 1,
                                    nb_classes=len(target_names),
                                    class_labels=target_names)
    for i, x in enumerate(X_test):
        tstdata.addSample(x.toarray(), y_test[i])
    tstdata._convertToOneOfMany()
    del X_test  # keep y_test for the final classification_report below

    logger.info("Building network...")
    fnn = buildNetwork(trndata.indim, 100, trndata.outdim, outclass=SoftmaxLayer)
    trainer = BackpropTrainer(fnn, dataset=trndata, momentum=0.1,
                              learningrate=0.01, verbose=True, weightdecay=0.01)
    logger.info("Training pybrain for 50 epochs...")
    trainer.trainEpochs(50)
    pred = fnn.activateOnDataset(tstdata)
    pred = np.argmax(pred, axis=1)  # argmax gives the class
    print(classification_report(y_test, pred, target_names=target_names))
import dill
import nltk
import scipy as sp
from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

MLCOMPDIR = r'16NepaliNews/16NepaliNews'
trainNews = load_mlcomp('NepaliData', 'train', mlcomp_root=MLCOMPDIR)
testNews = load_mlcomp('NepaliData', 'test', mlcomp_root=MLCOMPDIR)

stopWords = set(nltk.corpus.stopwords.words('nepali'))

''' Testing and Training Data '''
xTrain = trainNews.data
xTest = testNews.data
yTrain = trainNews.target
yTest = testNews.target

tfidfVectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "),
                                  sublinear_tf=True,
                                  encoding='utf-8',
                                  decode_error='ignore',
                                  stop_words=stopWords)
vectorised = tfidfVectorizer.fit_transform(xTrain)
with open('tfidf.pkl', 'wb') as f:
    dill.dump(tfidfVectorizer, f)  # persist the fitted vectorizer (reloaded elsewhere with dill.load)
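# --- Hedged sketch, not in the original excerpt: the Flask snippet elsewhere in this
# collection loads both "tfidf.pkl" and "model.pkl" with dill, so presumably a classifier
# is trained and pickled next. The choice of MultinomialNB and the name nbModel are
# illustrative assumptions only.
nbModel = MultinomialNB()
nbModel.fit(vectorised, yTrain)  # train on the TF-IDF features
print(metrics.accuracy_score(yTest, nbModel.predict(tfidfVectorizer.transform(xTest))))
with open('model.pkl', 'wb') as f:
    dill.dump(nbModel, f)  # persist the trained classifier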
#coding:utf-8
'''
Created on 2015-07-05

@author: Administrator
'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_mlcomp
import nltk.stem

MLCOMP_DIR = r'data'
data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_DIR)
#print (data.filenames)
#print len(data.filenames)
#print len(data.target_names)

groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'
]
train_data = load_mlcomp('20news-18828', 'train', mlcomp_root=MLCOMP_DIR, categories=groups)
test_data = load_mlcomp('20news-18828', 'test', mlcomp_root=MLCOMP_DIR)
#print len(test_data.filenames)

english_stemmer = nltk.stem.SnowballStemmer('english')
from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

if "MLCOMP_DATASETS_HOME" not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp("20news-18828", "train")
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = TfidfVectorizer(charset="latin1")
X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp("20news-18828", "test")
#X_train = vectorizer.fit_transform(posts)
#n_samples, n_features = X_train.shape
#print n_samples
#post = "how does machine learning work?"
#post_vec, found_post, distance = get_similar_posts(X_train, post, posts)
#
#print "\n"
#print "The most similar post to '%s' is: '%s' with distance= %.2f" % (post, found_post, distance)

MLCOMP_DIR = "Building_ML_Systems_with_Python/chapter_03_Codes/data"
categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'
]
train_data = datasets.load_mlcomp("20news-18828", "train",
                                  mlcomp_root=MLCOMP_DIR, categories=categories)
test_data = datasets.load_mlcomp("20news-18828", "test",
                                 mlcomp_root=MLCOMP_DIR, categories=categories)
print("Number of training data posts:", len(train_data.filenames))
print("Number of test data posts:", len(test_data.filenames))

vectorizer = StemmedTfIdfCountVectorizer(min_df=10, max_df=0.5,
                                         stop_words='english', decode_error='ignore')
X_train = vectorizer.fit_transform(train_data.data)
X_test = vectorizer.transform(test_data.data)
from sklearn.datasets import load_mlcomp
from aiml.learnig.my_stemmer import StemmedTfidfVectorizer
from sklearn.cluster import KMeans
from aiml.learnig.utils import dis_raw

MLCOMP_ROOT = '/home/ring/datasets/mlcomp'
groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
          'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_ROOT, set_='train', categories=groups)
#test_data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_ROOT, set_='test')

vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
                                    stop_words='english', decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

new_posts = """Disk drive problems. Hi, I have a problem with my hard disk. After 1 year it is working only sporadically now. I tried to format it, but now it doesn't boot any more. Any ideas? Thanks."""
new_posts_vec = vectorizer.transform([new_posts])

# K-Means clustering
num_clusters = train_data.target.max() + 1
km = KMeans(n_clusters=num_clusters, n_init=1, init='random', verbose=1)
km.fit(vectorized)
print('#K-Means total matched %i' % (train_data.target == km.labels_).sum())

# Predict the cluster of the new post
new_post_label = km.predict(new_posts_vec)
# Get the indices of the posts in the same cluster
    metadata = 'relig'
elif (entry == 4):
    path = path + 'T\\ti'
    metadata = 'ti'
elif (entry == 5):
    path = path + 'C\\course'
    metadata = 'course'
else:
    print('Try a number between 1 - 5')
    print('1-age, 2-gender, 3-relig, 4-ti, 5-course')
    exit(0)

print("Loading database")
age_train = load_mlcomp(metadata, metadata)
print(age_train.DESCR)
print("%d documents" % len(age_train.filenames))
print("%d categories" % len(age_train.target_names))

I = {}
T = {}
i = 0
for f in age_train.filenames:
    I[i] = open(f, encoding='utf-8').read()
    i = i + 1

i = 0
for f in age_train.target:
    T[i] = f
    i = i + 1
data = datasets.load_iris()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))

# data = datasets.load_linnerud()
# import pdb
# pdb.set_trace()
# print "Features: ", len(data["data"][0])
# print "Instances: ", len(data["data"])
# print len(set(data["target"]))

# load_mlcomp requires a dataset name; '20news-18828' (the set used throughout these snippets)
# is assumed here, and MLCOMP_DATASETS_HOME must point at the downloaded data
data = datasets.load_mlcomp('20news-18828')
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
import pdb
pdb.set_trace()

# Note: load_sample_image expects an image name and returns a plain array, not a Bunch,
# so the dict-style indexing below does not apply to its return value
data = datasets.load_sample_image()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))

data = datasets.load_sample_images()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

print(__doc__)

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("MLCOMP_DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(charset='latin1')
X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

print("Loading 20 newsgroups test set... ")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.naive_bayes import MultinomialNB

print(__doc__)

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
dataset_name = 'hashtagging-tweets'
print("Loading twitter training set... ")
twitter_train = load_mlcomp(dataset_name, 'train')
print(twitter_train.DESCR)
print("%d documents" % len(twitter_train.filenames))
print("%d categories" % len(twitter_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform(
    (open(f).read() for f in twitter_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = twitter_train.target

##################################################