示例#1
0
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.

# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

# Time a 10-fold cross-validation; [-1] is the last score in the returned tuple.
t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

# Re-run the same benchmark on the reduced model for comparison.
t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
示例#2
0
文件: 03-lsa.py 项目: Abhishek-1/temp
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.

# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

# Benchmark the full-feature model: 10-fold cross-validation,
# reporting the last score in the tuple that KNN.test() returns.
started = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - started)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

# Benchmark again on the reduced model so the two runs can be compared.
started = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - started)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
示例#3
0
# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or
# negative.

# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

# First pass: cross-validate on the unreduced model and time it.
t0 = time.time()
accuracy = KNN.test(m, folds=10)[-1]
print("accuracy:", accuracy)
print("time:", time.time() - t0)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

# Second pass: the same cross-validation after the LSA reduction.
t0 = time.time()
accuracy = KNN.test(m, folds=10)[-1]
print("accuracy:", accuracy)
print("time:", time.time() - t0)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
示例#4
0
文件: 05-bayes.py 项目: mlyne/Scripts
        x += 1

print "ERROR"
print x / n
print t1
print t2

#print xxx


print len(corpus)
print len(corpus.features)
print len(corpus.documents[0].vector)
from time import time
t = time()
print KNN.test(corpus, folds=10)
print time()-t

print "filter..."

from time import time
t = time()
f = corpus.feature_selection(150, verbose=False)
print f
print time()-t
corpus = corpus.filter(f)

#corpus.reduce(300)
#print len(corpus.lsa.vectors[corpus.documents[0].id])
#print corpus.lsa.vectors[corpus.documents[0].id]
#print len(corpus)
示例#5
0
corpus = Corpus(documents)

# Corpus statistics: document count, vocabulary size, and average words
# per document (float division is deliberate).
print("number of documents:", len(corpus))
print("number of words:", len(corpus.vector))
print("number of words (average):",
      sum(len(d.terms) for d in corpus.documents) / float(len(corpus)))
print()

# This may be too much words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.

# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print("accuracy:", KNN.test(corpus, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words).
print("LSA reduction...")
print()
corpus.reduce(4)

# Re-run the same benchmark on the reduced corpus for comparison.
t = time.time()
print("accuracy:", KNN.test(corpus, folds=10)[-1])
print("time:", time.time() - t)
print()

# Not bad, accuracy is about the same but performance is 3x faster,
# because each document is now a "4-word summary" of the original review.
示例#6
0
print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(
    len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# This may be too much words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.

# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words).
print "LSA reduction..."
print
corpus.reduce(4)

t = time.time()
print "accuracy:", KNN.test(corpus, folds=10)[-1]
print "time:", time.time() - t
print

# Not bad, accuracy is about the same but performance is 3x faster,
# because each document is now a "4-word summary" of the original review.
print('Number of Negative Tweets:', len(neg_lines))
print('Number of Positive Tweets:', len(pos_lines))

# Build one labeled Document per tweet: type '0' = negative, '1' = positive.
documents = [
    Document(line, stopword=True, stemmer=PORTER, type=label)
    for label, lines in (('0', neg_lines), ('1', pos_lines))
    for line in lines
]

corpus = Corpus(documents, weight=TFIDF)
print("number of documents:", len(corpus))
print("number of words:", len(corpus.vector))
print("number of words (average):",
      sum(len(d.terms) for d in corpus.documents) / float(len(corpus)))
print()

# Filtering top 1000 features using Information Gain Criterion
corpus = corpus.filter(features=corpus.feature_selection(top=1000, method=IG))

# To test the accuracy of a classifier, Using 10-fold crossvalidation
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print('classifying using KNN')
print('-------------------------')
# Fixed "REcall" capitalization typo in the output header.
print('(Accuracy, Precision,Recall,F-Measure)')
print(KNN.test(corpus, k=100, folds=10, distance=COSINE))

f_neg.close()
f_pos.close()