def loadTrain():
    print("Loading 20 newsgroups training set...")
    news_train = load_mlcomp('20news-18828', 'train')
    
    print(news_train.DESCR)
    print("%d documents" % len(news_train.filenames))
    print("%d categories" % len(news_train.target_names))

    return news_train
Example #2
def loadTrain():
    print("Loading 20 newsgroups training set...")
    news_train = load_mlcomp('20news-18828', 'train')

    print(news_train.DESCR)
    print("%d documents" % len(news_train.filenames))
    print("%d categories" % len(news_train.target_names))

    return news_train
Example #3
def loadTest():

    print("Loading 20 newsgroups test set...")
    t0 = time()
    news_test = load_mlcomp('20news-18828', 'test')
    print("done in %fs" % (time() - t0))
    print("%d documents" % len(news_test.filenames))
    print("%d categories" % len(news_test.target_names))

    return news_test
    def load_data(self):
        """Load the news_train file to see how it's structured"""
        # Load the training set
        print("Loading 20 newsgroups training set... ")
        self.news_train = load_mlcomp('20news-18828',
                                      'train',
                                      mlcomp_root=MLCOMP)
        print(self.news_train.DESCR)
        print("%d documents" % len(self.news_train.filenames))
        print("%d categories" % len(self.news_train.target_names))

        return self.news_train
def load_texts(dataset_type='train', groups=None):
    """
    load datasets to bytes list
    :return:train_dataset_bunch.data bytes list
    """
    if groups == 'small':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc']  # used only for small-data tests, #1368
    elif groups == 'medium':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'sci.space']  # used for a medium-sized subset    #3414
    train_dataset_bunch = datasets.load_mlcomp('20news-18828', dataset_type, mlcomp_root='./datasets',
                                               categories=groups)  # 13180
    return train_dataset_bunch.data
Example #7
File: LDA.py  Project: wdw110/ML
def load_texts(dataset_type='train', groups=None):
    """
    load datasets to bytes list
    :return: train_dataset_bunch.data bytes list
    """
    if groups == 'small':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc']  # used only for small-data tests, #1368
    elif groups == 'medium':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'sci.space']  # used for a medium-sized subset    #3414
    train_dataset_bunch = datasets.load_mlcomp('20news-18828', dataset_type, mlcomp_root='./datasets',
                                               categories=groups)  # 13180
    return train_dataset_bunch.data
Example #8
def load_texts(dataset_type='train', groups='small'):
    """
    load datasets to bytes list
    :return:train_dataset_bunch.items bytes list
    """
    if groups == 'small':
        groups = ['comp.graphics', 'rec.motorcycles', 'talk.politics.guns']  # used only for small-data tests, #1368
    elif groups == 'medium':
        groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'sci.space']  # used for a medium-sized subset    #3414
    train_dataset_bunch = datasets.load_mlcomp('20news-18828', dataset_type, mlcomp_root='../datasets',
                                               categories=groups)  # 13180
    texts = preprocess_texts(train_dataset_bunch.data)  # bunch.items list of text bytes
    return texts
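
A minimal usage sketch for the loader above, assuming the 20news-18828 archive has been unpacked under ../datasets and that preprocess_texts is defined alongside this function (both assumptions):

if __name__ == '__main__':
    # the small subset keeps the run short; see the 'groups' handling above
    texts = load_texts(dataset_type='train', groups='small')
    print("%d documents loaded" % len(texts))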
    def extract_test_features(self):

        print("Loading 20 newsgroups test set... ")
        t0 = time()
        self.news_test = load_mlcomp('20news-18828',
                                     'test',
                                     mlcomp_root=MLCOMP)
        print("done in %fs" % (time() - t0))

        print("Predicting the labels of the test set...")
        print("%d documents" % len(self.news_test.filenames))
        print("%d categories" % len(self.news_test.target_names))

        print("Extracting features from the dataset using the same vectorizer")
        t0 = time()

        # Read in test data
        test_input_strings = (open(f, encoding='latin-1').read()
                              for f in self.news_test.filenames)
        self.X_test = self.vectorizer.transform(test_input_strings)
        self.y_test = self.news_test.target
        print("done in %fs" % (time() - t0))
        print("n_samples: %d, n_features: %d" % self.X_test.shape)
        return
if not os.path.exists(DATA_DIR):
    print("""\
It seems that you have not yet downloaded the MLCOMP data set.
Please do so and place it into %s."""%DATA_DIR)
    sys.exit(1)

# 
# groups = [
#     'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
#     'comp.sys.ma c.hardware', 'comp.windows.x', 'sci.space']
# dataset = sklearn.datasets.load_mlcomp("20news-18828", "train",
#                                        mlcomp_root=DATA_DIR,
#                                        categories=groups)
# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train', mlcomp_root=DATA_DIR)
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

print("Loading 20 newsgroups test set... ")
#posts = load_data_from_dir("Building_ML_Systems_with_Python/chapter_03_Codes/data/toy", "\n")
#X_train = vectorizer.fit_transform(posts)
#n_samples, n_features = X_train.shape
#print n_samples
#post = "how does machine learning work?"
#post_vec, found_post, distance = get_similar_posts(X_train, post, posts)
#
#print "\n"
#print "The most similar post to '%s' is: '%s' with distance= %.2f" % (post, found_post, distance)

MLCOMP_DIR = "Building_ML_Systems_with_Python/chapter_03_Codes/data"
categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = datasets.load_mlcomp("20news-18828", "train",
                                  mlcomp_root=MLCOMP_DIR,
                                  categories=categories)
test_data = datasets.load_mlcomp("20news-18828", "test",
                                 mlcomp_root=MLCOMP_DIR,
                                 categories=categories)
print("Number of training data posts:", len(train_data.filenames))
print("Number of test data posts:", len(test_data.filenames))

vectorizer = StemmedTfIdfCountVectorizer(min_df=10, max_df=0.5, stop_words='english', decode_error='ignore')

X_train = vectorizer.fit_transform(train_data.data)
X_test = vectorizer.transform(test_data.data)

num_train_samples, num_train_features = X_train.shape
num_test_samples, num_test_features = X_test.shape
labels = train_data.target
Example #12
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

print(__doc__)

# if 'MLCOMP_DATASETS_HOME' not in os.environ:
#     print("MLCOMP_DATASETS_HOME not set; please follow the above instructions")
#     sys.exit(0)
#/ updated by Ivy
MLCOMP_DIR = r"E:\ctvit\lab\Code\Pycharm Project\cctv_news_content"
# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_DIR)
# news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform(
    (open(f).read() for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target
Example #13
"""
import pdb
from sklearn import datasets
from sklearn.cluster import KMeans

from RelatedPosts import StemmedTfidfVectorizer, norm_euclidDist

"""
loading datasets into train_dataset_bunch
"""
MLCOMP_DIR = r'./datasets'
groups = ['comp.graphics', 'comp.os.ms-windows.misc']   # used only for small-data tests
# groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']  # used for a medium-sized subset
train_dataset_bunch = datasets.load_mlcomp(name_or_id='20news-18828', set_='train', mlcomp_root=MLCOMP_DIR,
                                           categories=groups)  # <class 'sklearn.datasets.base.Bunch'>
# print(type(train_dataset_bunch.items)) #list
# print(type(train_dataset_bunch.items[0]))  #bytes
# print(len(train_dataset_bunch.filenames)) # #posts=3414


"""
fit_transform dataset.items into train_dataset_mats
"""
Vectorizer = StemmedTfidfVectorizer
vectorizer = Vectorizer(decode_error='ignore', stop_words='english', min_df=10, max_df=0.5)
# scipy.sparse.csr.csr_matrix
train_dataset_mats = vectorizer.fit_transform(train_dataset_bunch.data)
num_samples, num_features = train_dataset_mats.shape
print("#samples:%d, #features:%d" % (num_samples, num_features))
Example #14
from flask import Flask
from flask import render_template
from flask import request
import dill
from unicode import convertunicode

from sklearn.datasets import load_mlcomp


MLCOMPDIR = r'NewsData'

trainNews = load_mlcomp('NewsData', 'train', mlcomp_root=MLCOMPDIR)

app = Flask(__name__)


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/classify', methods=['GET', 'POST'])
def classify():
    if request.method == 'POST':
        result = request.form
        newsText = result.get('inputText')
        newsText = [newsText]
        tfidf = dill.load(open("tfidf.pkl", 'rb'))
        # use transform (not fit_transform): the vectorizer was already fitted before pickling
        newsVec = tfidf.transform(newsText)
        model = dill.load(open("model.pkl", 'rb'))
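
The route breaks off after the model is unpickled; a minimal sketch of how it might finish, assuming model is a fitted scikit-learn classifier, trainNews.target_names maps label indices to names, and index.html can display a prediction variable (all assumptions):

        # assumed continuation: predict a label for the submitted text and render it
        prediction = model.predict(newsVec)[0]
        label = trainNews.target_names[prediction]
        return render_template('index.html', prediction=label)
    return render_template('index.html')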
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.naive_bayes import MultinomialNB


print(__doc__)

#if 'MLCOMP_DATASETS_HOME' not in os.environ:
#    print("DATASETS_HOME not set; please follow the above instructions")
#    sys.exit(0)

# Load the training set
dataset_name = '20w_nospec_may'
print("Loading twitter training set... ")
twitter_train = load_mlcomp(dataset_name, 'train')
print(twitter_train.DESCR)
print("%d documents" % len(twitter_train.filenames))
print("%d categories" % len(twitter_train.target_names))


print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()

vectorizer = TfidfVectorizer(encoding='latin1')

X_train = vectorizer.fit_transform((open(f).read() for f in twitter_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = twitter_train.target
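
The example ends after building X_train and y_train; a minimal sketch of the training and evaluation step it seems headed for, assuming the MLCOMP dataset also provides a 'test' split and using the tree module imported at the top of this snippet (both assumptions):

print("Loading twitter test set... ")
twitter_test = load_mlcomp(dataset_name, 'test')
X_test = vectorizer.transform((open(f).read() for f in twitter_test.filenames))
y_test = twitter_test.target

# a decision tree matches the `from sklearn import tree` import above;
# any classifier that accepts sparse input would do here
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred, target_names=twitter_test.target_names))
print(confusion_matrix(y_test, pred))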
import sklearn
import nltk
import scipy as sp

from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC

MLCOMPDIR = r'LOCATION OF CORPUS'

trainNews = load_mlcomp('16NepaliNews', 'train', mlcomp_root=MLCOMPDIR)
testNews = load_mlcomp('16NepaliNews', 'test', mlcomp_root=MLCOMPDIR)
''' Nepali Stop Words '''
# The stop words file is copied into the stopwords directory of nltk.data\corpora folder

stopWords = set(nltk.corpus.stopwords.words('nepali'))
''' Testing and Training Data '''
xTrain = trainNews.data
xTest = testNews.data
yTrain = trainNews.target
yTest = testNews.target
''' Vectorizer '''

tfidfVectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "),
                                  sublinear_tf=True,
                                  encoding='utf-8',
                                  decode_error='ignore',
                                  stop_words=stopWords)
Example #17
    def testBaseLine(self):
        #return # disable slow test for now
        logger.info("Running 20NG NB baseline...")

        logger.info("Calculating TF-IDF on 20ng data set...")
        news_train = load_mlcomp('20news-18828', 'train')
        news_test = load_mlcomp('20news-18828', 'test')
        target_names = news_test.target_names
        vectorizer = TfidfVectorizer(encoding='latin1')
        X_train = vectorizer.fit_transform(
            (open(f).read() for f in news_train.filenames))
        y_train = news_train.target
        X_test = vectorizer.transform(
            (open(f).read() for f in news_test.filenames))
        y_test = news_test.target

        del news_train, news_test

        logger.info("Running MultinomialNB...")
        clf = MultinomialNB().fit(X_train, y_train)
        print(
            classification_report(y_test,
                                  clf.predict(X_test),
                                  target_names=target_names))

        del clf

        logger.info("Running pybrain...")

        from pybrain.datasets import ClassificationDataSet
        from pybrain.utilities import percentError
        from pybrain.tools.shortcuts import buildNetwork
        from pybrain.supervised.trainers import BackpropTrainer
        from pybrain.structure.modules import SoftmaxLayer
        from pybrain.tools.xml.networkwriter import NetworkWriter
        from pybrain.tools.xml.networkreader import NetworkReader

        trndata = ClassificationDataSet(len(vectorizer.get_feature_names()),
                                        1,
                                        nb_classes=len(target_names),
                                        class_labels=target_names)
        for i, x in enumerate(X_train):
            #print x, y_train[i]
            trndata.addSample(x.toarray(), y_train[i])
        trndata._convertToOneOfMany()
        del X_train, y_train

        tstdata = ClassificationDataSet(len(vectorizer.get_feature_names()),
                                        1,
                                        nb_classes=len(target_names),
                                        class_labels=target_names)
        for i, x in enumerate(X_test):
            tstdata.addSample(x.toarray(), y_test[i])
        tstdata._convertToOneOfMany()
        del X_test, y_test

        logger.info("Building network...")
        fnn = buildNetwork(trndata.indim,
                           100,
                           trndata.outdim,
                           outclass=SoftmaxLayer)
        trainer = BackpropTrainer(fnn,
                                  dataset=trndata,
                                  momentum=0.1,
                                  learningrate=0.01,
                                  verbose=True,
                                  weightdecay=0.01)

        logger.info("Training pybrain for 50 epochs...")
        trainer.trainEpochs(50)
        pred = fnn.activateOnDataset(tstdata)
        pred = np.argmax(pred, axis=1)  # argmax gives the class

        print(classification_report(y_test, pred, target_names=target_names))
Example #18
#coding:utf-8
'''
Created on July 5, 2015

@author: Administrator
'''

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_mlcomp
import nltk.stem

MLCOMP_DIR = r'data'
data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_DIR)
#print (data.filenames)
#print len(data.filenames)
#print len(data.target_names)

groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = load_mlcomp('20news-18828', 'train', mlcomp_root=MLCOMP_DIR, categories=groups)
test_data = load_mlcomp('20news-18828', 'test', mlcomp_root=MLCOMP_DIR)

#print len(test_data.filenames)

english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        # the hook must be named build_analyzer for scikit-learn to pick up the stemmer
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, stop_words='english', decode_error='ignore')
vectorized = vectorizer.fit_transform(data.data)  # data is a Bunch; vectorize its .data list of documents
Example #19
from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import Vectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB


if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = Vectorizer()
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
Example #20
english_stemmer = SnowballStemmer('english')

vectorizer = StemmedTfidfVectorizer(min_df=10,
                                    stop_words='english',
                                    decode_error='ignore')
MLCOMP_DIR = os.path.join(os.getcwd())
print(MLCOMP_DIR)
groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'
]
#data = load_mlcomp("20news-18828",mlcomp_root=MLCOMP_DIR)
#train_data = fetch_20newsgroups(subset='train')
#test_data = fetch_20newsgroups(subset='test')
test_data = load_mlcomp("20news-18828", "test", mlcomp_root=MLCOMP_DIR)
train_data = load_mlcomp("20news-18828", "train", mlcomp_root=MLCOMP_DIR)

vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print(num_samples, num_features)
num_clusters = 50
km = KMeans(n_clusters=num_clusters, init='random', n_init=1, verbose=1)

km.fit(vectorized)
new_post = "Disk	drive	problems.	Hi,	I	have	a	problem	with	my hard	disk. After	1	year	it	is	working	only	sporadically	now. I	tried	to	format	it,	but	now	it	doesn't	boot	any more. Any	ideas?	Thanks."
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]
print(new_post_label)
similar_indices = (km.labels_ == new_post_label).nonzero()[0]
similar = []
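
The listing is cut off right after similar = []; a minimal sketch of the usual ranking step (compare each post in the chosen cluster to the new post and keep the closest), with numpy imported here because the visible part of the snippet does not show it:

import numpy as np

for i in similar_indices:
    dist = np.linalg.norm((vectorized[i] - new_post_vec).toarray())
    similar.append((dist, train_data.data[i]))

similar.sort(key=lambda pair: pair[0])
print("closest post is at distance %.2f" % similar[0][0])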
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.naive_bayes import MultinomialNB


print(__doc__)

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
dataset_name = 'hashtagging-tweets'
print("Loading twitter training set... ")
twitter_train = load_mlcomp(dataset_name, 'train')
print(twitter_train.DESCR)
print("%d documents" % len(twitter_train.filenames))
print("%d categories" % len(twitter_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in twitter_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = twitter_train.target

##################################################
Example #22
    def testBaseLine(self):
        #return # disable slow test for now
        logger.info("Running 20NG NB baseline...")

        logger.info("Calculating TF-IDF on 20ng data set...")
        news_train = load_mlcomp('20news-18828', 'train')
        news_test = load_mlcomp('20news-18828', 'test')
        target_names = news_test.target_names
        vectorizer = TfidfVectorizer(encoding='latin1')
        X_train = vectorizer.fit_transform((open(f).read()
                                        for f in news_train.filenames))
        y_train = news_train.target
        X_test = vectorizer.transform((open(f).read() 
                                        for f in news_test.filenames))
        y_test = news_test.target

        del news_train, news_test

        logger.info("Running MultinomialNB...")
        clf = MultinomialNB().fit(X_train, y_train)
        print(classification_report(y_test, clf.predict(X_test),
                                    target_names=target_names))

        del clf

        logger.info("Running pybrain...")

        from pybrain.datasets            import ClassificationDataSet
        from pybrain.utilities           import percentError
        from pybrain.tools.shortcuts     import buildNetwork
        from pybrain.supervised.trainers import BackpropTrainer
        from pybrain.structure.modules   import SoftmaxLayer
        from pybrain.tools.xml.networkwriter import NetworkWriter
        from pybrain.tools.xml.networkreader import NetworkReader

        trndata = ClassificationDataSet(len(vectorizer.get_feature_names()), 1,
                                        nb_classes = len(target_names),
                                        class_labels = target_names)
        for i, x in enumerate(X_train):
            #print x, y_train[i]
            trndata.addSample(x.toarray(), y_train[i])
        trndata._convertToOneOfMany( )
        del X_train, y_train

        tstdata = ClassificationDataSet(len(vectorizer.get_feature_names()), 1,
                                        nb_classes = len(target_names),
                                        class_labels = target_names)
        for i, x in enumerate(X_test):
            tstdata.addSample(x.toarray(), y_test[i])
        tstdata._convertToOneOfMany( )
        del X_test, y_test

        logger.info("Building network...")
        fnn = buildNetwork(trndata.indim, 100, trndata.outdim, outclass=SoftmaxLayer)
        trainer = BackpropTrainer(fnn, dataset=trndata, momentum=0.1, learningrate=0.01,
                                  verbose=True, weightdecay=0.01)

        logger.info("Training pybrain for 50 epochs...")
        trainer.trainEpochs(50)
        pred = fnn.activateOnDataset(tstdata)
        pred = np.argmax(pred, axis=1) # argmax gives the class

        print(classification_report(y_test, pred,
                                    target_names=target_names))
import dill
import nltk
import scipy as sp

from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

MLCOMPDIR = r'16NepaliNews/16NepaliNews'

trainNews = load_mlcomp('NepaliData', 'train', mlcomp_root=MLCOMPDIR)
testNews = load_mlcomp('NepaliData', 'test', mlcomp_root=MLCOMPDIR)

stopWords = set(nltk.corpus.stopwords.words('nepali'))
''' Testing and Training Data '''
xTrain = trainNews.data
xTest = testNews.data
yTrain = trainNews.target
yTest = testNews.target

tfidfVectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "),
                                  sublinear_tf=True,
                                  encoding='utf-8',
                                  decode_error='ignore',
                                  stop_words=stopWords)

vectorised = tfidfVectorizer.fit_transform(xTrain)

with open('tfidf.pkl', 'wb') as f:
    dill.dump(tfidfVectorizer, f)  # assumed completion: persist the fitted vectorizer for reuse (e.g. by the Flask app above)
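
The companion Flask app earlier in this collection loads both tfidf.pkl and model.pkl; a minimal sketch of the classifier half, assuming MultinomialNB (imported above) is the intended model:

classifier = MultinomialNB()
classifier.fit(vectorised, yTrain)

with open('model.pkl', 'wb') as f:
    dill.dump(classifier, f)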
Example #24
#coding:utf-8
'''
Created on July 5, 2015

@author: Administrator
'''

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_mlcomp
import nltk.stem

MLCOMP_DIR = r'data'
data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_DIR)
#print (data.filenames)
#print len(data.filenames)
#print len(data.target_names)

groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'
]
train_data = load_mlcomp('20news-18828',
                         'train',
                         mlcomp_root=MLCOMP_DIR,
                         categories=groups)
test_data = load_mlcomp('20news-18828', 'test', mlcomp_root=MLCOMP_DIR)

#print len(test_data.filenames)

english_stemmer = nltk.stem.SnowballStemmer('english')
from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB


if "MLCOMP_DATASETS_HOME" not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp("20news-18828", "train")
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = TfidfVectorizer(charset="latin1")
X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print "Loading 20 newsgroups test set... "
news_test = load_mlcomp("20news-18828", "test")
#X_train = vectorizer.fit_transform(posts)
#n_samples, n_features = X_train.shape
#print n_samples
#post = "how does machine learning work?"
#post_vec, found_post, distance = get_similar_posts(X_train, post, posts)
#
#print "\n"
#print "The most similar post to '%s' is: '%s' with distance= %.2f" % (post, found_post, distance)

MLCOMP_DIR = "Building_ML_Systems_with_Python/chapter_03_Codes/data"
categories = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space'
]
train_data = datasets.load_mlcomp("20news-18828",
                                  "train",
                                  mlcomp_root=MLCOMP_DIR,
                                  categories=categories)
test_data = datasets.load_mlcomp("20news-18828",
                                 "test",
                                 mlcomp_root=MLCOMP_DIR,
                                 categories=categories)
print("Number of training data posts:", len(train_data.filenames))
print("Number of test data posts:", len(test_data.filenames))

vectorizer = StemmedTfIdfCountVectorizer(min_df=10,
                                         max_df=0.5,
                                         stop_words='english',
                                         decode_error='ignore')

X_train = vectorizer.fit_transform(train_data.data)
X_test = vectorizer.transform(test_data.data)
Example #27
from sklearn.datasets import load_mlcomp
from aiml.learnig.my_stemmer import StemmedTfidfVectorizer
from sklearn.cluster import KMeans
from aiml.learnig.utils import dis_raw

MLCOMP_ROOT = '/home/ring/datasets/mlcomp'
groups = ['comp.graphics', 'comp.os.ms-windows.misc',
          'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_ROOT, set_='train', categories=groups)
#test_data = load_mlcomp('20news-18828', mlcomp_root=MLCOMP_ROOT, set_='test')

vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, stop_words='english', decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)

num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

new_posts = """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks."""
new_posts_vec = vectorizer.transform([new_posts])

# K-Means clustering
num_clusters = train_data.target.max() + 1
km = KMeans(n_clusters=num_clusters, n_init=1, init='random', verbose=1)
km.fit(vectorized)
print('#K-Means labels matching the original targets: %i' % (train_data.target == km.labels_).sum())
# predict the cluster of the new post
new_post_label = km.predict(new_posts_vec)
# get the indices of posts in the same cluster
    metadata = 'relig'

elif (entry == 4):
    path = path + 'T\\ti'
    metadata = 'ti'
elif (entry == 5):
    path = path + 'C\\course'
    metadata = 'course'

else:
    print('Try a number between 1 and 5')
    print('1-age, 2-gender, 3-relig, 4-ti, 5-course')
    exit(0)

print("Loading database")
age_train = load_mlcomp(metadata, metadata)
print(age_train.DESCR)
print("%d documents" % len(age_train.filenames))
print("%d categories" % len(age_train.target_names))

I = {}
T = {}
for i, f in enumerate(age_train.filenames):
    I[i] = open(f, encoding='utf-8').read()
for i, t in enumerate(age_train.target):
    T[i] = t
Example #29

data = datasets.load_iris()
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
print len(set(data["target"]))


# data = datasets.load_linnerud()
# import pdb
# pdb.set_trace()
# print "Features: ", len(data["data"][0])
# print "Instances: ", len(data["data"])
# print len(set(data["target"]))

# load_mlcomp requires a dataset name (e.g. '20news-18828', as in the other
# examples here) and the data set available locally
data = datasets.load_mlcomp("20news-18828")
print "Features: ", len(data["data"][0])
print "Instances: ", len(data["data"])
import pdb
pdb.set_trace()

# load_sample_image requires an image name and returns a bare ndarray
data = datasets.load_sample_image("china.jpg")
print "Shape: ", data.shape

# load_sample_images returns a Bunch with 'images' (it has no 'data' or 'target')
data = datasets.load_sample_images()
print "Instances: ", len(data.images)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB


print(__doc__)

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("MLCOMP_DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

print("Loading 20 newsgroups test set... ")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#from sklearn.naive_bayes import MultinomialNB

print(__doc__)

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
dataset_name = 'hashtagging-tweets'
print("Loading twitter training set... ")
twitter_train = load_mlcomp(dataset_name, 'train')
print(twitter_train.DESCR)
print("%d documents" % len(twitter_train.filenames))
print("%d categories" % len(twitter_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform(
    (open(f).read() for f in twitter_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = twitter_train.target

##################################################