Example #1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction import text
from matplotlib.ticker import FormatStrFormatter
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize.regexp import RegexpTokenizer
# stem each token when calculating TF-IDF
class Tokenizer(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'\b([a-zA-Z]+)\b')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]

# select the eight categories of interest
cat = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
       'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey']
train = f20(subset='train', categories=cat, shuffle=True, random_state=42)

# plot the number of documents per target (class)
bin_edges = np.arange(0, 9, 1)  # one unit-width bin per class label 0..7
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(train.target, bin_edges, facecolor='red', edgecolor='gray')
ax.set_xticks(bins)
ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
ax.set_xlabel('Targets', x=1)
ax.set_ylabel('Numbers')
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for count, center in zip(counts, bin_centers):
    # label each bar with its raw count, just below the axis
    ax.annotate(str(int(count)), xy=(center, 0), xycoords=('data', 'axes fraction'),
                xytext=(0, -18), textcoords='offset points', va='top', ha='center')
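
# The same per-class totals can be read off without a plot: np.bincount over
# the integer targets gives the document count for each of the 8 classes.
for name, n in zip(train.target_names, np.bincount(train.target)):
    print(name, n)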
Example #2
# The snippet opens mid-class; the missing imports and class header are
# reconstructed below. `stemWords` is PyStemmer's API, so english_stemmer
# is assumed to be a Stemmer.Stemmer('english') instance.
from time import time
import Stemmer  # PyStemmer
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction.text import TfidfVectorizer

english_stemmer = Stemmer.Stemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))


cats = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
    'rec.sport.baseball', 'rec.sport.hockey'
]

print("Loading 20 newsgroups dataset for categories:")
print()
print(list(cats))
print()

traindata = f20(subset='all', categories=cats)

print "%d documents" % len(traindata.data)
print "%d categories" % len(traindata.target_names)
print '\n'

print "Creating stemmed TFxIDF representation..."
t0 = time()

vect = StemmedTfidfVectorizer(stop_words='english')
vectors = vect.fit_transform(traindata.data)  # TFxIDF representation

print "Done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % vectors.shape
print '\n'
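
# A quick check of the stemming hook: the analyzer can be called directly on
# a toy string (the exact stems depend on the stemmer implementation):
analyzer = vect.build_analyzer()
print(analyzer("computers computing computation"))
# all three words should collapse to a common stem such as 'comput'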
Example #3
import numpy as np
import nltk.stem
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as TSVD
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed; KFold now lives in model_selection

english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
        
cat = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
       'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey']
train = f20(subset='train', categories=cat, shuffle=True, random_state=42)
##train = f20(subset='train',shuffle = True, random_state = 42)

stopwords = text.ENGLISH_STOP_WORDS
vectorizer = StemmedTfidfVectorizer(
    min_df=1, stop_words=stopwords, decode_error='ignore')
vector_train = vectorizer.fit_transform(train.data)
tfidf_train = vector_train.toarray()  # dense copy; TSVD also accepts the sparse matrix directly
svd = TSVD(n_components=50, n_iter=10, random_state=42)
tfidf_train_reduced = svd.fit_transform(tfidf_train)
#print(tfidf_train.shape)
#print(tfidf_train_reduced.shape)

svm_train_data = tfidf_train_reduced
#svm_train_tag = np.concatenate((-np.ones(len(train_comp.data)), np.ones(len(train_rect.data))))
svm_train_tag = []
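
# The commented-out line above hints at the intended labels: the four comp.*
# classes map to -1 and the four rec.* classes to +1. A minimal sketch of
# filling svm_train_tag from train.target (this grouping is an assumption
# based on that comment, not the original author's code):
comp_ids = [i for i, name in enumerate(train.target_names) if name.startswith('comp.')]
svm_train_tag = np.array([-1 if t in comp_ids else 1 for t in train.target])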
Example #4
# As in Example #2, the snippet opens mid-class; the enclosing
# StemmedTfidfVectorizer definition is reconstructed here.
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

# list the full set of 20 newsgroup categories
cats = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware','comp.sys.mac.hardware', 'comp.windows.x', 
        'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
        'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
        'misc.forsale',
        'talk.politics.misc','talk.politics.guns','talk.politics.mideast',
        'talk.religion.misc','alt.atheism','soc.religion.christian']

print("Loading 20 newsgroups dataset for categories:")
print()
print(list(cats))
print()

traindata = f20(subset='all')  # cats lists every group, so no categories filter is needed

print "%d documents" % len(traindata.data)
print "%d categories" % len(traindata.target_names)
print '\n'

print"Creating stemmed TFxIDF representation..."
t0 = time()

vect = StemmedTfidfVectorizer(stop_words='english')
vectors = vect.fit_transform(traindata.data) # TFxIDF representation

print"Done in %fs" % (time() - t0)
print"n_samples: %d, n_features: %d" % vectors.shape
print'\n'
Example #5
"""
import numpy as np
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize.regexp import RegexpTokenizer
import scipy
class Tokenizer(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'\b([a-zA-Z]+)\b')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]

# concatenate the documents of each class into a single document, separated by ' '
train = f20(subset='train', shuffle=True, random_state=42)
datalist = [''] * 20  # one merged document per class
for i in range(len(train.data)):
    datalist[train.target[i]] += ' ' + train.data[i]
    
# build the per-class term-count matrix
stopwords = text.ENGLISH_STOP_WORDS
vectorizer = CountVectorizer(tokenizer=Tokenizer(),
                             stop_words=stopwords,
                             min_df=1)
vector = vectorizer.fit_transform(datalist)
count = vector.toarray()
# compute TF and ICF (inverse class frequency)
# map the four classes of interest to their target ids among the 20 groups
index = {0: 3, 1: 4, 2: 6, 3: 15}
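
# The snippet ends before the TF-ICF computation itself. A minimal sketch of
# the usual definition, tficf(t, c) = tf(t, c) * log(C / cf(t)), applied to
# the per-class count matrix built above (the exact formula the original
# author used is an assumption):
n_classes = count.shape[0]                     # 20 merged class-documents
tf = count / count.sum(axis=1, keepdims=True)  # term frequency within each class
cf = (count > 0).sum(axis=0)                   # classes containing each term
tficf = tf * np.log(n_classes / cf)

# 10 most significant terms for each class of interest
# (get_feature_names_out needs scikit-learn >= 1.0; older versions use get_feature_names)
terms = np.array(vectorizer.get_feature_names_out())
for target in index.values():
    top = np.argsort(tficf[target])[::-1][:10]
    print(train.target_names[target], terms[top])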
Example #6
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# @Author: Yang Xiaojun
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfTransformer as TF
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
tr_data = f20(
    subset='train',
    categories=['comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale'])
tr_data_x = tr_data.data
tr_data_y = tr_data.target
te_data = f20(
    subset='test',
    categories=['comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale'])
te_data_x = te_data.data
te_data_y = te_data.target
target_name = tr_data.target_names
# def feature_work(data=None, vb=None, stop_words=None, max_df=1):
#     cv = CV(stop_words=stop_words, max_df=max_df, vocabulary=vb)
#     #print(cv.vocabulary)
#     tr_vb = cv.vocabulary_
#
#     tf = TF()
#     tf_idf = tf.fit_transform(cv.fit_transform(data))  # term counts and TF-IDF values
#     print('0:', cv.fit_transform(data).shape)
#     print('1:', tf_idf.shape)
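
# The imports above (CV, TF, MultinomialNB, accuracy_score,
# classification_report) are never exercised in this fragment. A minimal
# sketch of how they would typically be wired together on the loaded splits
# (the pipeline details are an assumption, not the original author's code):
cv = CV(stop_words='english')
tf = TF()
tr_tfidf = tf.fit_transform(cv.fit_transform(tr_data_x))  # counts -> TF-IDF
te_tfidf = tf.transform(cv.transform(te_data_x))          # reuse the fitted vocabulary

clf = MultinomialNB()
clf.fit(tr_tfidf, tr_data_y)
pred = clf.predict(te_tfidf)
print('accuracy:', accuracy_score(te_data_y, pred))
print(classification_report(te_data_y, pred, target_names=target_name))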