Example #1
import sys
import nltk
import operator

import featureset

# Count how often each word occurs in the text read from stdin.
words = {}

for line in sys.stdin:
    for word in featureset.wordlist(line.decode('utf-8')):
        words[word] = words[word] + 1 if word in words else 1

# Sort by frequency, most common first.
wordsSorted = sorted(words.items(), key=operator.itemgetter(1), reverse=True)

# Emit one "word<TAB>count" line per word.
for w in wordsSorted:
    sys.stdout.write("{0}\t{1}\n".format(w[0].encode('utf-8'), w[1]))
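The listings in this section import a helper module named featureset whose source is not shown here. As a rough illustration only, and assuming its wordlist function simply tokenizes the decoded text with NLTK and keeps lowercase alphabetic tokens (the real helper may do more), it could look like this:

# Hypothetical sketch of featureset.wordlist (not the actual module).
import nltk

def wordlist(text):
    # Tokenize the already-decoded text and yield lowercase alphabetic tokens.
    for token in nltk.word_tokenize(text):
        token = token.lower()
        if token.isalpha():
            yield token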
   
Example #3
import sys
import pickle

import featureset

# classifierFilename, wordlistFilename, and rangeSpec are assumed to be
# parsed from sys.argv earlier in the script (omitted here), as in Example #4.
wordStart = int(rangeSpec[0])
wordEnd = int(rangeSpec[1])

# Load the previously trained classifier (pickled as in Example #4).
f = open(classifierFilename, "rb")
classifier = pickle.load(f)
f.close()

featureWords = featureset.load(wordlistFilename, wordStart, wordEnd)

extractFeatures = featureset.makeExtractor(featureWords)

count = 0
missed = 0
for line in sys.stdin:
    # Each evaluation line is "label<TAB>text".
    parts = line.decode('utf-8').split("\n")[0].split("\t")
    wordlist = list(featureset.wordlist(parts[1]))
    c = classifier.classify(extractFeatures(wordlist))  # predicted label
    a = parts[0]                                        # actual label
    count += 1
    if c != a:
        missed += 1
    # Print: sample number, actual label, predicted label, and the
    # feature words that occurred in this text.
    print str(count) + "\t" + a + "\t" + c + "\t" + (",".join(
        reduce(lambda l, w: l + [w]
               if w in featureWords else l, wordlist, [])))

if count > 0:
    # Overall accuracy on everything read from stdin.
    print "{0} % correct, {1}/{2}".format(
        100 * ((count - missed) * 1.0 / count), (count - missed), count)
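Examples #3 and #4 also call featureset.makeExtractor, which is likewise not shown. NLTK's NaiveBayesClassifier consumes dictionaries of features, so a plausible sketch, assuming the usual one-boolean-feature-per-word recipe (again illustrative, not the actual module), would be:

# Hypothetical sketch of featureset.makeExtractor (not the actual module).
def makeExtractor(featureWords):
    featureWords = set(featureWords)

    def extractFeatures(wordlist):
        present = set(wordlist)
        # One boolean "contains(word)" feature per candidate feature word.
        return dict(("contains({0})".format(w), w in present)
                    for w in featureWords)

    return extractFeatures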
Example #4
import sys
import pickle
import nltk

import featureset

wordlistFilename = sys.argv[1]
rangeSpec = sys.argv[2].split(",")
wordStart = int(rangeSpec[0])
wordEnd = int(rangeSpec[1])
outputFilename = sys.argv[3]

# The feature words are the given slice of the ranked word list.
featureWords = featureset.load(wordlistFilename, wordStart, wordEnd)
print featureWords

sys.stderr.write("Loading training data...")

texts = []

# Each training line is "label<TAB>text"; keep (wordlist, label) pairs.
for line in sys.stdin:
    parts = line.decode("utf-8").split("\n")[0].split("\t")
    wordlist = list(featureset.wordlist(parts[1]))
    texts.append((wordlist, parts[0]))

extractFeatures = featureset.makeExtractor(featureWords)

sys.stderr.write(" applying features ...")
trainingSet = nltk.classify.apply_features(extractFeatures, texts)

sys.stderr.write(" training classifier ...")
classifier = nltk.NaiveBayesClassifier.train(trainingSet)
sys.stderr.write(" done\n")

# Persist the trained classifier (loaded again in Example #3).
f = open(outputFilename, "wb")
pickle.dump(classifier, f)
f.close()
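The last helper, featureset.load, is not shown either. Since Example #1 writes one word<TAB>count line per word in descending frequency order, load presumably reads that file back and returns the words whose rank falls in the requested range. A minimal sketch under that assumption:

# Hypothetical sketch of featureset.load (not the actual module).
import codecs

def load(filename, wordStart, wordEnd):
    words = []
    with codecs.open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Each line is "word<TAB>count"; keep only the word.
            words.append(line.rstrip("\n").split("\t")[0])
    return words[wordStart:wordEnd]

Read together, the pipeline is: Example #1 builds the ranked word list from raw text, Example #4 trains a Naive Bayes classifier on label<TAB>text lines and pickles it, and Example #3 loads that pickle and reports its accuracy on held-out labelled data.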