def select_bigram():
    """Run ``__filter_bigram`` over the bigram dictionary file.

    Configures ``dts`` with three paths (presumably input, output and log,
    in that order -- confirm against ``dealTweets.setFile``), then drives
    ``__filter_bigram`` through ``dts.loop_with_param`` with the parameter
    list ``[100, 100000]``.
    """
    source_path = '../output/BiDict.txt'
    target_path = '../output/select_bigram.txt'
    log_path = '../log/select_bigram'
    # NOTE(review): looks like lower/upper bounds used by the filter --
    # confirm against __filter_bigram.
    thresholds = [100, 100000]

    dts.setFile(source_path, target_path, log_path)
    dts.setSize(389920)
    dts.openFiles()
    dts.loop_with_param(__filter_bigram, thresholds, 'filter_bigram')
    dts.closeFiles()
def __preProcess():
    """Load tweets via ``__readin`` and write each one out as a JSON line.

    ``tweets`` is a name defined outside this function; it is handed to
    ``dts.loop_with_param`` as the callback parameter and iterated
    afterwards.  Every element is wrapped as ``{"text": <tweet>}``,
    serialised with ``json.dumps`` and written through ``dts.writeO``.
    The number of elements is finally printed and logged.
    """
    dts.loop_with_param(__readin, tweets, 'loading files')

    for text in tweets:
        record = json.dumps({u'text': text})
        dts.writeO(record + '\n')

    # Same message goes to stdout and to the dts log (no trailing newline
    # is added for the log, matching the established behaviour).
    summary = '%d tweets remaining' % len(tweets)
    print(summary)
    dts.writeL(summary)
def select_dict():
    """Run ``__filter_range`` over the raw dictionary file.

    Points ``dts`` at ``Dict_raw.txt`` / ``Dict_select.txt`` /
    ``idf_select.log`` and drives ``__filter_range`` through
    ``dts.loop_with_param`` with the parameter list ``[1000, 34400]``.
    """
    # Presumably (input, output, log) -- confirm against dealTweets.setFile.
    paths = (
        '../output/Dict_raw.txt',
        '../output/Dict_select.txt',
        '../log/idf_select.log',
    )
    dts.setFile(*paths)
    dts.setSize(214884)
    dts.openFiles()

    # NOTE(review): looks like a [low, high] range for the filter --
    # confirm against __filter_range.
    limits = [1000, 34400]
    dts.loop_with_param(__filter_range, limits, 'filter Dict_raw')
    dts.closeFiles()
def featureUnigram():
    """Run ``__dealLine`` once per hashtag over the topic-tweet file.

    For every hashtag in the fixed list below, ``dts`` is re-configured so
    that the second path is ``topicTwitter_<hashtag without '#'>``, the
    files are opened, ``__dealLine`` is driven with ``[hashtag]`` as its
    parameter, and the files are closed again.
    """
    hashtags = [
        u"#emabiggestfans1d",
        u"#emabiggestfansjustinbieber",
        u"#p**n",
        u"#ipad",
        u"#halloween",
        u"#emabiggestfans5sos",
        u"#stealmygirl",
        u"#thewalkingdead",
        u"#ebola",
        u"#emabiggestfansarianagrande",
        u"#lol",
    ]

    for tag in hashtags:
        # Strip the leading '#' to get the suffix used in paths and labels.
        name = tag[1:]
        target_path = '../entityOutput/topicTwitter_' + name
        label = 'Generating Unigram With Tag:' + name

        dts.setSize(50000)
        dts.setFile('../entityOutput/topictwitter', target_path,
                    '../log/topicTwitterFeatvect')
        dts.openFiles()
        dts.loop_with_param(__dealLine, [tag], label)
        dts.closeFiles()
def divideEmoticons():
    """Split the pre-processed emoji tweets into one file per emotion.

    For every entry of the module-level ``Emotions`` sequence a UTF-8
    output file ``outputDir + emo['filename']`` is opened and stored under
    ``emo['fileptr']``.  ``__divide`` is then driven through
    ``dts.loop_with_param`` with the parameter list ``[3000]``
    (NOTE(review): presumably a per-emotion cap -- confirm against
    ``__divide``).  Afterwards ``filename : cnt`` is printed for each
    emotion.

    All per-emotion files and the ``dts`` files are closed even when
    opening a later file or the processing loop raises; previously an
    exception left every handle open.
    """
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/EafterPre.txt', '', '../log/divideEmoticons')
    dts.openFiles()
    try:
        # Track what was actually opened so a failure half-way through the
        # opening loop still closes the earlier handles.
        opened = []
        try:
            for emo in Emotions:
                handle = codecs.open(outputDir + emo['filename'], 'w', 'utf-8')
                emo['fileptr'] = handle
                opened.append(handle)

            dts.loop_with_param(__divide, [3000, ], 'divide Emotions')

            for emo in Emotions:
                print('%s\t:\t%d' % (emo['filename'], emo['cnt']))
        finally:
            for handle in opened:
                handle.close()
    finally:
        dts.closeFiles()
def featureGenerator():
    """Generate features from each per-emotion tweet file.

    After configuring and opening the ``dts`` files and calling
    ``__featureGenerator_init``, every entry of ``devideEmotion.Emotions``
    is processed: its file ``devideEmotion.outputDir + emo['filename']`` is
    opened as UTF-8 text and ``__g_each_tweet`` is driven through
    ``dts.loop_with_param`` with ``[emo['label'], <open file>]`` as the
    parameter and the file name as the progress label.

    Each per-emotion file is closed by a ``with`` block and the ``dts``
    files by ``finally``, so nothing stays open when processing raises.
    """
    dts.setSize(5000)
    dts.setFile(
        '../emojiOutput/afterPre.txt',
        '../emojiOutput/feature5000.txt',
        '../log/emojiFeatureGenerator.log',
    )
    dts.openFiles()
    try:
        __featureGenerator_init()
        for emo in devideEmotion.Emotions:
            filename = devideEmotion.outputDir + emo['filename']
            # The callback receives the open handle itself, so the handle
            # must stay open for the whole loop_with_param call.
            with codecs.open(filename, 'r', 'utf-8') as ifile:
                dts.loop_with_param(
                    __g_each_tweet, [emo['label'], ifile], emo['filename']
                )
    finally:
        dts.closeFiles()
def make_dict():
    """Collect per-key counts with ``__calcIDF`` and write the raw dictionary.

    Steps:
      1. Drive ``__calcIDF`` through ``dts.loop_with_param`` with an empty
         mapping, which the callback is expected to fill with
         ``key -> integer count`` (assumed from the ``%d`` formatting and
         the integer bucketing below -- confirm against ``__calcIDF``).
      2. In sorted key order, write ``key:count`` via ``dts.writeO`` for
         every key whose count is strictly between ``MIN_COUNT`` and
         ``MAX_COUNT``; print keys whose count exceeds ``MAX_COUNT``.
      3. Bucket all counts by ``count // 10`` (everything above
         ``CNT_MAX * 10`` shares the last bucket) and log, for each bucket
         ``x``, how many keys have a count of at least ``(x + 1) * 10``.

    Changes from the previous version: the local mapping no longer shadows
    the builtin ``dict``, bucket indices use explicit floor division
    instead of relying on Python 2 ``/`` semantics, the histogram is a list
    rather than a pre-filled million-entry dict, and the ``dts`` files are
    closed even if processing raises.
    """
    MIN_COUNT = 10
    MAX_COUNT = 364600
    CNT_MAX = 1000000

    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt', '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()
    try:
        counts = {}
        dts.loop_with_param(__calcIDF, counts, 'calc the Idf')

        print('start sort and print')
        total = 0
        written = 0
        histogram = [0] * (CNT_MAX + 1)
        for key in sorted(counts):
            value = counts[key]
            # NOTE(review): a count exactly equal to MAX_COUNT is neither
            # written nor printed; kept as-is -- confirm this is intended.
            if MIN_COUNT < value < MAX_COUNT:
                dts.writeO('%s:%d\n' % (key, value))
                written += 1
            total += 1
            if value > MAX_COUNT:
                print(key)
            # Counts above CNT_MAX * 10 all land in the overflow bucket.
            histogram[min(value // 10, CNT_MAX)] += 1

        print('%d words output' % written)
        dts.writeL('%d words output\n' % written)

        print('printing range log')
        seen = 0
        for bucket in range(CNT_MAX):
            seen += histogram[bucket]
            # total - seen == number of keys whose count is >= (bucket+1)*10
            dts.writeL('%7d~%7d:\t%d\n'
                       % (bucket * 10, (bucket + 1) * 10, total - seen))
    finally:
        dts.closeFiles()
""" find emoji in tweets """ import io import os import re import codecs import dealTweets as dts dts.setSize(5000000) dts.setFile("../data/tweet_noRT_noDup.txt", "../tmp/b.out", "../tmp/c.out") dts.openFiles() def findemoji(str): line = dts.readlineI() if str in line: print(line) dts.writeO(line) dts.loop_with_param(findemoji, b'\xf0\x9f\x98\x80'.decode('utf-8'), 'try to find Emoji :😀') #dts.writeL( u'\xe2\x98\xba\xef\xb8\x8f with hay!' ) #smile = '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8') #dts.loop_with_param( findemoji, smile, u'try to find Emoji :' + smile) #print '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8').encode('utf-8') dts.closeFiles()
""" find emoji in tweets """ import io import os import re import codecs import dealTweets as dts dts.setSize(50000) dts.setFile("../data/tweet_noRT_noDup.txt", "../tmp/b.out", "../tmp/c.out") dts.openFiles() def findemoji(str): line = dts.readlineI() if str in line: print line dts.writeO(line) #dts.loop_with_param( findemoji, u'☺️', u'try to find Emoji :☺️' ) #dts.writeL( '0001F612'.decode('hex').encode('utf-8') ) dts.writeL(u'\xe2\x98\xba\xef\xb8\x8f with hay!') smile = '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8') dts.loop_with_param(findemoji, smile, u'try to find Emoji :' + smile) print '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8').encode('utf-8') dts.closeFiles()