# Count word frequencies in UTF-8 text read from stdin (one document per
# line) and print them as "word<TAB>count", most frequent first.
import sys
import operator

import featureset

words = {}
for line in sys.stdin:
    for word in featureset.wordlist(line.decode('utf-8')):
        words[word] = words[word] + 1 if word in words else 1

wordsSorted = sorted(words.items(), key=operator.itemgetter(1), reverse=True)
for w in wordsSorted:
    sys.stdout.write("{0}\t{1}\n".format(w[0].encode('utf-8'), w[1]))
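
Each of these scripts imports a local featureset module that the listings use but do not show. As a point of reference, here is a minimal sketch of what its wordlist tokenizer might look like, assuming nltk.word_tokenize with lowercasing and an alphabetic filter; the real module's normalization may well differ.

# featureset.py -- hypothetical sketch, not the module from the source
import nltk

def wordlist(text):
    # Yield lowercased alphabetic tokens from one line of text.
    # nltk.word_tokenize is an assumed choice; any tokenizer with the
    # same generator interface would work here.
    for token in nltk.word_tokenize(text):
        if token.isalpha():
            yield token.lower()
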
# Evaluate a pickled classifier against labelled test data on stdin, one
# "label<TAB>text" record per line. Each misclassified record is printed
# together with the feature words it contained.
import sys
import pickle

import featureset

# The original listing starts after the argument parsing; the order below
# is an assumption, following the convention of the training script.
classifierFilename = sys.argv[1]
wordlistFilename = sys.argv[2]
rangeSpec = sys.argv[3].split(",")
wordStart = int(rangeSpec[0])
wordEnd = int(rangeSpec[1])

f = open(classifierFilename, "rb")
classifier = pickle.load(f)
f.close()

featureWords = featureset.load(wordlistFilename, wordStart, wordEnd)
extractFeatures = featureset.makeExtractor(featureWords)

count = 0
missed = 0
for line in sys.stdin:
    parts = line.decode('utf-8').split("\n")[0].split("\t")
    wordlist = list(featureset.wordlist(parts[1]))
    c = classifier.classify(extractFeatures(wordlist))  # predicted label
    a = parts[0]                                        # actual label
    count += 1
    if c != a:
        missed += 1
        print str(count) + "\t" + a + "\t" + c + "\t" + (",".join(
            reduce(lambda l, w: l + [w] if w in featureWords else l,
                   wordlist, [])))

if count > 0:
    print "{0} % correct, {1}/{2}".format(
        100 * ((count - missed) * 1.0 / count), (count - missed), count)
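
classifier.classify consumes the usual NLTK feature-dict convention: a mapping from feature names to values. Under that assumption, featureset.makeExtractor plausibly looks something like the sketch below, with the "contains(...)" naming borrowed from the NLTK book's document-classification example.

# Hypothetical sketch of featureset.makeExtractor: return a closure that
# turns a token list into the {feature_name: value} dict that NLTK
# classifiers expect, with one boolean feature per candidate word.
def makeExtractor(featureWords):
    featureWords = set(featureWords)
    def extractFeatures(wordlist):
        present = set(wordlist)
        return dict(("contains({0})".format(w), w in present)
                    for w in featureWords)
    return extractFeatures
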
# Train an NLTK naive Bayes classifier from labelled data on stdin, one
# "label<TAB>text" record per line, and pickle the result to a file.
import sys
import pickle

import nltk
import featureset

wordlistFilename = sys.argv[1]
rangeSpec = sys.argv[2].split(",")
wordStart = int(rangeSpec[0])
wordEnd = int(rangeSpec[1])
outputFilename = sys.argv[3]

featureWords = featureset.load(wordlistFilename, wordStart, wordEnd)
print featureWords

sys.stderr.write("Loading training data...")
texts = []
for line in sys.stdin:
    parts = line.decode("utf-8").split("\n")[0].split("\t")
    wordlist = list(featureset.wordlist(parts[1]))
    texts.append((wordlist, parts[0]))

extractFeatures = featureset.makeExtractor(featureWords)

sys.stderr.write(" applying features ...")
# apply_features builds the (features, label) pairs lazily, so the whole
# feature matrix never has to sit in memory at once.
trainingSet = nltk.classify.apply_features(extractFeatures, texts)

sys.stderr.write(" training classifier ...")
classifier = nltk.NaiveBayesClassifier.train(trainingSet)
sys.stderr.write(" done\n")

f = open(outputFilename, "wb")
pickle.dump(classifier, f)
f.close()
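
The remaining piece of the assumed featureset module is load, which both the training and evaluation scripts use to pick their feature words. Given that the counting script writes "word<TAB>count" sorted by descending frequency, a plausible sketch follows; taking a rank range [wordStart, wordEnd) rather than simply the top N lets the caller skip the very highest-frequency words, which are mostly stopwords with little discriminative value.

# Hypothetical sketch of featureset.load: read the ranked word list
# written by the counting script and keep the words whose frequency rank
# falls in [wordStart, wordEnd) as feature words.
import codecs

def load(filename, wordStart, wordEnd):
    f = codecs.open(filename, "r", "utf-8")
    words = [line.split("\t")[0] for line in f]
    f.close()
    return set(words[wordStart:wordEnd])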