# We'll reduce the document vectors to 10 concepts.
# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.
# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

# Re-run the same 10-fold cross-validation on the reduced model.
t = time.time()
print("accuracy:", KNN.test(m, folds=10)[-1])
print("time:", time.time() - t)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
# We'll reduce the document vectors to 10 concepts.
# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or negative.
# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.

def _report_knn_accuracy():
    # Print 10-fold cross-validated KNN accuracy on model m,
    # followed by the wall-clock time the evaluation took.
    started = time.time()
    print("accuracy:", KNN.test(m, folds=10)[-1])
    print("time:", time.time() - started)
    print()

_report_knn_accuracy()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

# Same evaluation on the reduced model, for comparison.
_report_knn_accuracy()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
# In this case, it can predict whether a new movie review is positive or
# negative.
# The details are not that important right now, just observe the accuracy.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
clock = time.time()
score = KNN.test(m, folds=10)[-1]
print("accuracy:", score)
print("time:", time.time() - clock)
print()

# Reduce the documents to vectors of 10 concepts (= 1/4 of 40 features).
print("LSA reduction...")
print()
m.reduce(10)

# Re-evaluate on the reduced vectors.
clock = time.time()
score = KNN.test(m, folds=10)[-1]
print("accuracy:", score)
print("time:", time.time() - clock)
print()

# Accuracy is about the same, but the performance is better: 2x-3x faster,
# because each document is now a "10-word summary" of the original review.
# Debugging/benchmark scratch script: dumps corpus statistics, times a 10-fold
# KNN cross-validation, then times information-gain feature selection and
# filters the corpus down to the top 150 features.
from time import time

x += 1
print("ERROR")
# NOTE(review): under Python 2 this floor-divided when x and n were both ints;
# under Python 3 it is true division — confirm which result is intended.
print(x / n)
print(t1)
print(t2)
#print(xxx)

# Corpus statistics: document count, feature count, vector length of doc 0.
print(len(corpus))
print(len(corpus.features))
print(len(corpus.documents[0].vector))

# Time a 10-fold cross-validation of the KNN classifier.
t = time()
print(KNN.test(corpus, folds=10))
print(time() - t)

# Time feature selection and keep only the 150 best features.
print("filter...")
t = time()
f = corpus.feature_selection(150, verbose=False)
print(f)
print(time() - t)
corpus = corpus.filter(f)
#corpus.reduce(300)
#print(len(corpus.lsa.vectors[corpus.documents[0].id]))
#print(corpus.lsa.vectors[corpus.documents[0].id])
#print(len(corpus))
# Build a corpus from the documents and print basic statistics.
corpus = Corpus(documents)
print("number of documents:", len(corpus))
print("number of words:", len(corpus.vector))
# float() kept so the average is true division on any Python version.
print("number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus)))
print()

# This may be too much words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.
# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print("accuracy:", KNN.test(corpus, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words).
print("LSA reduction...")
print()
corpus.reduce(4)

# Re-run the same cross-validation on the reduced corpus.
t = time.time()
print("accuracy:", KNN.test(corpus, folds=10)[-1])
print("time:", time.time() - t)
print()

# Not bad, accuracy is about the same but performance is 3x faster,
# because each document is now a "4-word summary" of the original review.
# Print basic corpus statistics: document count, vocabulary size,
# and the average number of terms per document.
print("number of documents:", len(corpus))
print("number of words:", len(corpus.vector))
# float() kept so the average is true division on any Python version.
print("number of words (average):", sum(
    len(d.terms) for d in corpus.documents) / float(len(corpus)))
print()

# This may be too much words for some clustering algorithms (e.g., hierarchical).
# We'll reduce the documents to vectors of 4 concepts.
# First, let's test how the corpus would perform as a classifier.
# The details of KNN are not that important right now, just observe the numbers.
# Naturally, we want accuracy to stay the same after LSA reduction,
# and hopefully decrease the time needed to run.
t = time.time()
print("accuracy:", KNN.test(corpus, folds=10)[-1])
print("time:", time.time() - t)
print()

# Reduce the documents to vectors of 4 concepts (= 1/7 of 30 words).
print("LSA reduction...")
print()
corpus.reduce(4)

# Re-run the same cross-validation on the reduced corpus.
t = time.time()
print("accuracy:", KNN.test(corpus, folds=10)[-1])
print("time:", time.time() - t)
print()

# Not bad, accuracy is about the same but performance is 3x faster,
# because each document is now a "4-word summary" of the original review.
# Tweet sentiment classification: build a TF-IDF corpus of labeled tweets,
# keep the top 1000 features by information gain, and cross-validate a
# KNN classifier with cosine distance.
print('Number of Negative Tweets:', len(neg_lines))
print('Number of Positive Tweets:', len(pos_lines))

# Build labeled documents: type '0' = negative, type '1' = positive.
# Stopwords are removed and terms are stemmed with the Porter stemmer.
documents = []
for line in neg_lines:
    documents.append(Document(line, stopword=True, stemmer=PORTER, type='0'))
for line in pos_lines:
    documents.append(Document(line, stopword=True, stemmer=PORTER, type='1'))

corpus = Corpus(documents, weight=TFIDF)
print("number of documents:", len(corpus))
print("number of words:", len(corpus.vector))
# float() kept so the average is true division on any Python version.
print("number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus)))
print()

# Filtering top 1000 features using Information Gain Criterion.
corpus = corpus.filter(features=(corpus.feature_selection(top=1000, method=IG)))

# To test the accuracy of a classifier, using 10-fold cross-validation.
# This yields 4 scores: Accuracy, Precision, Recall and F-score.
print('classifying using KNN')
print('-------------------------')
# Fixed typo in the printed header ("REcall" -> "Recall").
print('(Accuracy, Precision,Recall,F-Measure)')
print(KNN.test(corpus, k=100, folds=10, distance=COSINE))

f_neg.close()
f_pos.close()