import datetime

def update_wordcount(word_fd, label_word_fd, handle, label):
    # Count every word in every tweet, both overall and per label.
    print "Counting '%s'" % (label)
    print datetime.datetime.now()
    for line in handle:
        for word in words_in_tweet(line):
            word_fd.inc(word)
            label_word_fd[label].inc(word)
    handle.seek(0)  # rewind so the file can be read again later
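# A minimal sketch of how update_wordcount might be driven, assuming the
# NLTK 2.x FreqDist API (with its .inc() method) and the words_in_tweet()
# tokenizer defined elsewhere in this code; the file name is hypothetical.
import nltk

word_fd = nltk.FreqDist()                   # overall word counts
label_word_fd = nltk.ConditionalFreqDist()  # per-label word counts

with open('positive_tweets.txt') as pos_handle:
    update_wordcount(word_fd, label_word_fd, pos_handle, 'positive')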
import random

def build_csv(vocab, pos_tweets, negative_tweets, output_csv_file):
    nFeature = len(vocab)
    dataset = []
    for (label, tweets) in [(1, pos_tweets), (0, negative_tweets)]:
        for line in tweets:
            features = [0] * nFeature
            for word in words_in_tweet(line):
                if word in vocab:  # it may not be in the vocab because of its low overall frequency
                    features[vocab[word]] = features[vocab[word]] + 1
            dataset.append((label, line, features))
    random.shuffle(dataset)
    fd = open(output_csv_file + '.vis', 'w')
    fdet = open(output_csv_file + '.details', 'w')
    fdesc = open(output_csv_file + '.desc', 'w')
    fdict = open(output_csv_file + '.dict', 'w')
    fdesc.write('We transformed ' + str(len(dataset)) + ' tweets, '
                + str(len(pos_tweets)) + ' positive and '
                + str(len(negative_tweets)) + ' negative tweets. Total number of features = '
                + str(nFeature) + '\n\n')
    fdesc.write('Format of ' + output_csv_file + '.vis file is as follows:\n')
    fdesc.write('PrimaryKey, realLabel, feature1, ..., feature' + str(nFeature) + '\n\n')
    fdesc.write('Format of ' + output_csv_file + '.details file is as follows:\n')
    fdesc.write('PrimaryKey, realLabel, actual_tweet\n\n')
    fdesc.write('Format of ' + output_csv_file + '.dict file is as follows:\n')
    fdesc.write('NumberOfTheFeature, CorrespondingWord\n\n')
    fdesc.close()
    for word, index in vocab.iteritems():
        fdict.write(str(index) + ':' + word + '\n')
    fdict.close()
    for pk in range(len(dataset)):
        (label, tweet, features) = dataset[pk]
        entries = map(str, features)
        fd.write(','.join([str(pk + 1), str(label)]) + ',' + ','.join(entries) + '\n')
        fdet.write(str(pk + 1) + ',' + str(label) + ',' + tweet)  # tweet keeps its trailing newline
    fd.close()
    fdet.close()
    print len(dataset), ' total lines were transformed into ', nFeature, ' features'
def build_vocab(min_word_freq, positive_file, negative_file, nPos, nNeg):
    positive_handle = open(positive_file, 'r')
    negative_handle = open(negative_file, 'r')
    groups = [(1, positive_handle, nPos), (0, negative_handle, nNeg)]
    vocab = {}
    for (curLabel, handle, limit) in groups:
        tweetsRead = 0
        tweets = []
        for line in handle:
            if tweetsRead >= limit:
                break
            tweetsRead = tweetsRead + 1
            tweets.append(line)
            for word in words_in_tweet(line):
                if word in vocab:
                    vocab[word] = vocab[word] + 1
                else:
                    vocab[word] = 1
        if tweetsRead != limit:
            print '***Warning: you requested', limit, 'instances for label', curLabel, \
                'but we only found', tweetsRead, 'tweets with that label'
        handle.close()
        if curLabel == 0:
            negative_tweets = tweets
        elif curLabel == 1:
            positive_tweets = tweets
    # Assign a feature index to every word that occurs often enough.
    wordId = {}
    nextId = 0
    for word in vocab.keys():
        if vocab[word] < min_word_freq:
            continue
        wordId[word] = nextId
        nextId = nextId + 1
    return (wordId, positive_tweets, negative_tweets)
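# A minimal sketch of wiring build_vocab and build_csv together; the input
# file names, the frequency cutoff, and the per-label caps are hypothetical.
(vocab, pos_tweets, neg_tweets) = build_vocab(
    5,                       # drop words seen fewer than 5 times overall
    'positive_tweets.txt',   # one tweet per line
    'negative_tweets.txt',
    10000, 10000)            # cap on tweets read per label
build_csv(vocab, pos_tweets, neg_tweets, 'tweets')  # writes tweets.vis/.details/.desc/.dict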
def features(feat_func, handle, label):
    print "Generating features for '%s'" % (label)
    print datetime.datetime.now()
    return [(feat_func(words_in_tweet(line)), label) for line in handle]
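# A minimal sketch of a bag-of-words feature function and of training NLTK's
# Naive Bayes classifier on the output of features(). word_feats is an assumed
# implementation (the real one is not shown here), POSITIVE and NEGATIVE are
# assumed to be label constants defined elsewhere in this code, and the file
# names are hypothetical.
import nltk

def word_feats(words):
    # Mark each word as present; NLTK featuresets are dicts.
    return dict((word, True) for word in words)

pos_handle = open('positive_tweets.txt')
neg_handle = open('negative_tweets.txt')
train_set = features(word_feats, pos_handle, POSITIVE) + \
            features(word_feats, neg_handle, NEGATIVE)
classifier = nltk.NaiveBayesClassifier.train(train_set)
pos_handle.close()
neg_handle.close()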
def drange(start, stop, step):
    # Like range(), but yields floats; used below to sweep probability cutoffs.
    r = start
    while r <= stop:
        yield r
        r += step
import collections

testfile = open('testdata.manual.2009.05.25')
print "Loading classifier"
classifier = load_classifier()
print "Running test"
print "prob, pos prec, pos rec, neg prec, neg rec"
for prob in drange(0.5, 1.0, .01):
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    testfile.seek(0)
    count = 0
    for line in testfile:
        parts = line.split(";;")
        dist = classifier.prob_classify(word_feats(words_in_tweet(parts[5])))
        # Accept the classifier's answer only if it is confident enough;
        # otherwise fall back to NEUTRAL.
        if dist.prob(dist.max()) > prob:
            realguess = dist.max()
        else:
            realguess = NEUTRAL
        refsets[parts[0]].add(count)
        testsets[realguess].add(count)
        count += 1
    print "%f, %f, %f, %f, %f" % (
        prob,
        nltk.metrics.precision(refsets[POSITIVE], testsets[POSITIVE]),
        nltk.metrics.recall(refsets[POSITIVE], testsets[POSITIVE]),
        nltk.metrics.precision(refsets[NEGATIVE], testsets[NEGATIVE]),
        nltk.metrics.recall(refsets[NEGATIVE], testsets[NEGATIVE]))
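# load_classifier() is referenced above but not defined in this code. A
# minimal sketch using pickle, assuming the trained classifier was serialized
# the same way; the file name is hypothetical.
import pickle

def load_classifier():
    f = open('classifier.pickle', 'rb')
    classifier = pickle.load(f)
    f.close()
    return classifier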