def __init__(self, username, password, method='kmeans'):
    # Log in to Google Reader (ClientAuthMethod and GoogleReader are presumably
    # from the libgreader package) and fetch the user's subscriptions and categories.
    auth = ClientAuthMethod(username, password)
    self.reader = GoogleReader(auth)
    self.reader.buildSubscriptionList()
    self.categories = self.reader.getCategories()
    self.corpus = Corpus()    # pattern.vector corpus that will hold the feed documents
    self.method = method      # clustering method, 'kmeans' by default
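# --- Hedged sketch (not part of the original class): one way the 'method' ---
# parameter could drive clustering with pattern.vector. The method name
# cluster_feeds() and the value of k are assumptions for illustration only.
def cluster_feeds(self, k=10):
    from pattern.vector import KMEANS, HIERARCHICAL
    method = KMEANS if self.method == 'kmeans' else HIERARCHICAL
    # Group the collected feed documents into k clusters of related items.
    return self.corpus.cluster(method=method, k=k)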
import os, sys; sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.vector import Document, Corpus, Vectorspace

# Latent Semantic Analysis (LSA) is a statistical machine learning method
# based on singular value decomposition (SVD).
# It discovers semantically related words across documents.
# The idea is to group the document vectors in a matrix
# (each document is a row, each word in the corpus is a column),
# and then to reduce the number of dimensions, filtering out "noise".

D1 = Document("The dog wags his tail.", threshold=0, name="dog")
D2 = Document("Curiosity killed the cat.", threshold=0, name="cat")
D3 = Document("Cats and dogs make good pets.", threshold=0, name="pet")
D4 = Document("Curiosity drives science.", threshold=0, name="science")

corpus = Corpus([D1, D2, D3, D4])

lsa = corpus.lsa()

print lsa.keywords(D4)
print
print lsa.search("curiosity")

# Document D4 now yields "kill" as a keyword, although this word was not in D4's description.
# However, documents D2 and D4 share "curiosity" as a keyword,
# so D4 inherits some of the keywords from D2.
# Performing a search on "curiosity" now also yields document D3 as a result.
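# --- Hedged sketch (not part of Pattern): what LSA does under the hood. ---
# The truncated SVD below is the same idea as corpus.lsa(): documents are rows,
# words are columns, and only the k strongest "concepts" are kept.
# The matrix values and the choice of k are illustrative assumptions.
import numpy as np

A = np.array([
    [2.0, 0.0, 0.0],    # roughly: the "dog" document
    [0.0, 1.0, 1.0],    # the "cat" document
    [1.0, 1.0, 0.0],    # the "pet" document
    [0.0, 0.0, 1.0],    # the "science" document
])

U, s, Vt = np.linalg.svd(A, full_matrices=False)
k = 2                              # number of dimensions (concepts) to keep
A_reduced = U[:, :k] * s[:k]       # each document projected into concept space
print A_reduced                    # related documents end up close together in this space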
'''
-----------
This program trains a kNN classifier to recognize adjectives taken from Twitter.
Adjectives are collected from tweets tagged #win or #fail and labeled accordingly.
The adjective vectors are put in a corpus to train the classifier,
so that e.g. "damn" and "sucks" end up classified as FAIL,
and "awesome" and "cool" as WIN.
Results vary according to real-time tweets.
'''
from pattern.web    import Twitter
from pattern.en     import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Corpus, KNN

corpus = Corpus()  # collection of texts

for i in range(1, 15):
    # Page through 14*100 = 1,400 tweets containing either hashtag.
    for tweet in Twitter().search('#win OR #fail', start=i, count=100):
        p = '#win' in tweet.description.lower() and 'WIN' or 'FAIL'
        s = tweet.description.lower()
        s = Sentence(parse(s))              # parse() annotates each word with its part-of-speech tag
        s = search('JJ', s)                 # keep only the adjectives (JJ = adjective)
        s = [match[0].string for match in s]
        s = ' '.join(s)
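        # --- Hedged sketch: assumed continuation (the excerpt above is truncated). ---
        # Keep only tweets that contain at least one adjective, label the document
        # with its hashtag class, then train a k-nearest neighbor classifier.
        if len(s) > 0:
            corpus.append(Document(s, type=p))

classifier = KNN()
for document in corpus:
    classifier.train(document)

# The words below are illustrative; the output depends on the live tweets retrieved.
print classifier.classify('awesome')    # likely 'WIN'
print classifier.classify('damn')       # likely 'FAIL'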
# Assumed imports for this excerpt (the top of the original file is not shown):
import os
from pattern.vector import Document, Corpus, Bayes
from pattern.db import Datasheet    # from pattern.table in older versions of Pattern

# Naive Bayes is one of the oldest classifiers,
# but it is still popular because it is fast for corpora
# that have many documents and many words.
# It is outperformed by KNN and SVM, but useful for running tests.
# We'll test it with a corpus of spam e-mail messages
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
data = Datasheet.load(os.path.join("..", "..", "test", "corpora", "apache-spam.csv"))

documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
corpus = Corpus(documents)

print "number of documents:", len(corpus)
print "number of words:", len(corpus.vector)
print "number of words (average):", sum(len(d.terms) for d in corpus.documents) / float(len(corpus))
print

# Train Naive Bayes on all documents.
# Each document has a type: True for real e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = Bayes()
for document in corpus:
    classifier.train(document)
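# --- Hedged sketch (not in the original excerpt): using the trained classifier. ---
# classify() returns the learned type (True or False here); the message text is
# purely illustrative.
message = Document("Click here to claim your free prize")
print "classified as:", classifier.classify(message)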
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

import glob

from pattern.vector import Document, Corpus, Bayes, KNN, features, distance, Vector, _distance, COSINE, kdtree

#from pattern.web import PDF
##pdf = PDF(open("/users/tom/downloads/10-1.1.1.61.7217.pdf", "rb").read())
#pdf = PDF(open("/users/tom/downloads/10-1.1.1.14.8422.pdf", "rb").read())
#print Document(unicode(pdf), threshold=1).keywords(30)
#print xxx

# Build a corpus of product reviews: each review becomes a document whose type
# is True if "yes" occurs in the file path (i.e. a positive review).
corpus = Corpus()
for product in glob.glob(os.path.join("reviews", "*")):
    for review in glob.glob(os.path.join(product, "*.txt")):
        polarity = "yes" in review
        s = open(review).read()
        corpus.append(Document(s, type=polarity, top=50, threshold=2))

#print "testtree"
#V = lambda x: Vector(dict(enumerate(x)))
#v = [(2,3), (5,4), (9,6), (4,7), (8,1), (7,2)]
#v = [V(x) for x in v]
#t = kdtree(v)
#print t.nn(V((9,5)))
#print xxx

n = 10
x = 0
t1 = 0
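# --- Hedged sketch (not in the original file): one way the n / x / t1 counters ---
# above might be used, e.g. averaging KNN accuracy and training time over n runs.
# The 80/20 split and the use of x and t1 as accumulators are assumptions.
import time
import random

for iteration in range(n):
    t0 = time.time()
    documents = list(corpus.documents)
    random.shuffle(documents)
    i = int(len(documents) * 0.8)
    train, test = documents[:i], documents[i:]
    knn = KNN()
    for document in train:
        knn.train(document)
    correct = sum(1 for document in test if knn.classify(document) == document.type)
    x += float(correct) / max(len(test), 1)
    t1 += time.time() - t0

print "average accuracy:", x / n
print "average time per run:", t1 / n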