def filterHashtags():
    """Emit tweets whose text matches one of the regexes in topicList.

    Reads one JSON line from dts; on the first matching topic, writes
    {"text": ..., "hashtag": topic} to the output stream.  Non-JSON lines
    and non-matching tweets are silently skipped.
    """
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:  # fix: bare except also hid KeyboardInterrupt etc.
        return
    text = obj["text"]
    # fix: 'matchs' was referenced after the loop without being defined,
    # so an empty topicList raised NameError.
    matched_topic = None
    for topic in topicList:
        if re.search(topic, text):
            matched_topic = topic
            break
    if matched_topic is None:
        return
    nobj = {"text": text, "hashtag": matched_topic}
    dts.writeO(json.dumps(nobj) + '\n')
def __io():
    """Copy the 'text' field of one JSON-encoded tweet to the output stream."""
    raw = dts.readlineI()
    if not raw:
        return
    record = json.loads(raw)
    dts.writeO(record['text'] + '\n')
def __divide():
    """Label one tweet by its emoticons and balance per-emotion output files.

    Reads one JSON line and scans the text against every emotion's icon
    list.  The tweet is kept only when exactly one emotion matched
    (ans > 0; 0 = no icon found, -1 = icons from different emotions).
    Kept tweets go to the main output and to each emotion's own file as a
    1 / -1 sample, with negatives capped at the positive count so each
    file stays balanced.
    """
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads( line )
    except:
        return
    text = obj['text']
    ans = 0
    # ans accumulates the single matching label, or -1 on conflict
    for emo in Emotions:
        label = int ( emo['label'] )
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1
    if ans > 0 :
        # collect every hashtag of a kept tweet into the global set
        htList = re.findall( '#\w+', text )
        global hashtagSet
        for ht in htList:
            hashtagSet.add( ht[1:].lower() )
        #text = preprocess_func.preprocess( text ).lower()
        #text = text.lower()
        #text = re.sub( '[!?]', '', text )
        #text = re.sub( '[^\d\w\ ]', '', text )
        #text = re.sub( 'username', '', text )
        #text = re.sub( 'twhashtag', '', text )
        #text = re.sub( 'url', '', text )
        text = re.sub( '\s+', ' ', text )
        # drop tweets that are too short once preprocessed/tokenized
        if len( tokenizer.tokenize( preprocess_func.preprocess(text ).lower() )) <= 3:
            return
        tmp = {u'text':text, u'label':ans}
        dts.writeO( json.dumps(tmp) + '\n' )
        for emo in Emotions:
            label = 0
            #print '%d:%d\n' % ( ans, int(emo['label'] ))
            if ans == int(emo['label']):
                # positive sample for the matching emotion
                label = 1
                Emotions[ ans - 1 ][ 'cnt' ] += 1
                tmp = {u'text':text, u'label':label}
                emo['fileptr'].write( json.dumps(tmp) + '\n' )
            else :
                # NOTE(review): counters are indexed with ans, not this
                # emo -- looks intentional (cap negatives at the matched
                # emotion's positive count) but worth confirming
                if Emotions[ans-1]['ncnt'] < Emotions[ans-1]['cnt']:
                    label = -1
                    Emotions[ ans - 1 ][ 'ncnt' ] += 1
                    tmp = {u'text':text, u'label':label}
                    emo['fileptr'].write( json.dumps(tmp) + '\n' )
    pass
def __preProcess(): dts.loop_with_param(__readin, tweets, 'loading files') for tweet in tweets: #dts.writeO( tmp + '\n' ) tmp = {u'text': tweet} dts.writeO(json.dumps(tmp) + '\n') setlen = len(tweets) print '%d tweets remaining' % setlen dts.writeL('%d tweets remaining' % setlen)
def __filter_bigram(ran):
    """Pass through bigram-count lines whose count lies inside ran = [lo, hi]."""
    record = dts.readlineI()
    if not record:
        return
    lo, hi = ran[0], ran[1]
    count = int(record.split(':')[1])
    if lo <= count <= hi:
        dts.writeO(record)
def __filter_range(border):
    """Write the current input line only if its ':'-separated count is within border."""
    lo, hi = border[0], border[1]
    raw = dts.readlineI()
    if not raw:
        return
    occurrences = int(raw.split(':')[1])
    if lo <= occurrences <= hi:
        dts.writeO(raw)
def __divide():
    """Attach a positive label (computed by __label) to a tweet and re-emit it."""
    raw = dts.readlineI()
    if not raw:
        return
    record = json.loads(raw)
    tag = __label(record['text'])
    if tag > 0:
        record['label'] = tag
        dts.writeO(json.dumps(record) + '\n')
def __divide():
    """Icon-label one tweet, skipping the first 1,000,000 input lines.

    Same balancing scheme as the other __divide variants: ans holds the
    single matching emotion label (0 = none, -1 = ambiguous); kept tweets
    go to the main output and to every emotion file as a 1 / -1 sample,
    with negatives capped at the positive count.
    """
    line = dts.readlineI()
    if not line:
        return
    # skip the first million lines -- presumably already handled by an
    # earlier run; TODO confirm this resume offset is still wanted
    global GC
    if GC < 1000000:
        GC += 1
        return
    try:
        obj = json.loads(line)
    except:
        return
    text = obj['text']
    ans = 0
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1
    if ans > 0:
        # strip punctuation and the USERNAME/URL placeholder tokens
        text = re.sub('[!?]', '', text)
        text = re.sub('[^\d\w\ ]', '', text)
        text = re.sub('USERNAME', '', text)
        text = re.sub('URL', '', text)
        # drop tweets that are too short after cleaning
        if len(tokenizer.tokenize(text)) <= 3:
            return
        tmp = {u'text': text, u'label': ans}
        dts.writeO(json.dumps(tmp) + '\n')
        for emo in Emotions:
            label = 0
            #print '%d:%d\n' % ( ans, int(emo['label'] ))
            if ans == int(emo['label']):
                # positive sample for the matching emotion
                label = 1
                Emotions[ans - 1]['cnt'] += 1
                tmp = {u'text': text, u'label': label}
                emo['fileptr'].write(json.dumps(tmp) + '\n')
            else:
                # NOTE(review): indexes with ans, not this emo's label --
                # looks intentional for balancing; confirm
                if Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
                    label = -1
                    Emotions[ans - 1]['ncnt'] += 1
                    tmp = {u'text': text, u'label': label}
                    emo['fileptr'].write(json.dumps(tmp) + '\n')
    pass
def __lineParse():
    """Convert one "(features_dict, label)" line into libSVM 'label idx:val' form.

    Only features with a positive value are kept, and the line is emitted
    only when the label is greater than 1.
    """
    raw = dts.readlineI()
    if not raw:
        return
    # NOTE(review): eval() on file input -- safe only if the input file is
    # produced by this pipeline and fully trusted.
    parsed = eval(raw)
    fields = str(parsed[1]) + ' '
    indexed = {}
    for word, value in parsed[0].iteritems():
        if value > 0:
            indexed[__getIndex(word)] = value
    for idx in sorted(indexed.keys()):
        fields += str(idx) + ':' + str(indexed[idx]) + ' '
    if parsed[1] > 1:
        dts.writeO(fields + '\n')
def __cleanDup(): dts.openFiles() tw = set() def __push(): text = dts.readlineI() tw.add(text) dts.loop(__push, 'push into set') print 'start write to file %s' % dts.ofileName cnt = 0 for text in tw: dts.writeO(text) cnt += 1 print 'write finished, tot tweet left: %d' % cnt dts.closeFiles()
def parse_line():
    """Turn one labelled JSON tweet into a '<label> <features>' libSVM line.

    Counts positive (label == 1) samples in the global PC and echoes the
    label stream to the log via writeL.
    """
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:  # fix: bare except also swallowed KeyboardInterrupt etc.
        return
    label = obj[u'label']
    text = obj[u'text']
    global PC
    if int(label) == 1:
        PC += 1
    output = '%d %s\n' % (label, gen_Feature(text))
    dts.writeO(output)
    dts.writeL(str(label) + '\n')
def __g_each_tweet(param):
    """Write one '<label> idx:val ...' line for the next tweet of a file.

    param[0] is the class label, param[1] an open file object that is read
    one line at a time (not via dts.readlineI).
    """
    raw = param[1].readline()
    if not raw:
        return
    pieces = ['%s ' % param[0]]
    features = __g_each_feature(preprocess_func.preprocess(raw))
    for idx in sorted(features.keys()):
        pieces.append(str(idx) + ':' + features[idx] + ' ')
    dts.writeO(''.join(pieces) + '\n')
def __dealLine(param):
    """Emit a (unigram-presence dict, 0) tuple for tweets tagged param[0].

    Reads one JSON line; tweets whose 'hashtag' field differs from param[0]
    are skipped.  The dict maps every known unigram to 1 when it occurs in
    the tokenized, preprocessed text, else 0.
    """
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    hashtag = obj['hashtag']
    if hashtag != param[0]:
        return
    # fix: the original nested word-by-word comparison was O(n*m);
    # set membership gives the same presence flags in O(n + m)
    words = set(tokenizer.tokenize(preprocess(text)))
    features = {}  # renamed from 'dict' -- don't shadow the builtin
    for uni in getUnigramWords.getUnigramWords():
        features[uni] = 1 if uni in words else 0
    nobj = (features, 0)
    dts.writeO(str(nobj) + '\n')
def make_bigram(): dict = {} dts.loop_with_param_clean(__bigram, __bigram_clean, dict, 'Make the dict of bigram') print 'start to output' cntList = {} for x in range(100000): cntList.update({x: 0}) for k, v in dict.iteritems(): if v > 10: dts.writeO(k + ':' + str(v) + '\n') if v >= 100000: cntList.update({100000: cntList.get(100000, 0)}) else: cntList.update({v: cntList.get(v, 0) + 1}) for k, v in cntList.iteritems(): dts.writeL(str(k) + ':' + str(v) + '\n')
def make_dict():
    """Build the unigram IDF dictionary from the preprocessed tweet file.

    Words occurring between 11 and 364599 times are written to
    Dict_raw.txt; a bucketed (width-10) count distribution and summary
    figures are written to the idf log.
    """
    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt', '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()
    dict = {}
    dts.loop_with_param(__calcIDF, dict, 'calc the Idf')
    print 'start sort and print'
    cnt = 0
    pcnt = 0
    CntDistribution = {}
    CNT_MAX = 1000000
    for x in range(CNT_MAX + 1):
        CntDistribution[x] = 0
    for key, value in [(k, dict[k]) for k in sorted(dict.keys())]:
        # keep words that are neither too rare (<= 10) nor too common
        if value > 10 and value < 364600:
            dts.writeO('%s:%d\n' % (key, value))
            pcnt += 1
        cnt += 1
        if (value > 364600):
            # echo over-frequent words for manual inspection
            print key
        if (value > CNT_MAX * 10):
            CntDistribution[CNT_MAX] += 1
        else:
            CntDistribution[value / 10] += 1
    print '%d words output' % pcnt
    dts.writeL('%d words output\n' % pcnt)
    print 'printing range log'
    ncnt = 0
    for x in range(CNT_MAX):
        ncnt += CntDistribution[x]
        # NOTE(review): logs cnt - ncnt (words with counts above this
        # bucket), not the bucket size itself -- confirm this cumulative
        # tail is the intended statistic
        dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, cnt - ncnt))
    dts.closeFiles()
def findemoji(str):
    """Echo input lines containing the given substring (e.g. an emoji).

    NOTE: the parameter name shadows the builtin 'str'; kept unchanged for
    compatibility with existing callers.
    """
    line = dts.readlineI()
    # fix: every sibling per-line handler guards against EOF; without this,
    # a falsy/None line from readlineI would raise on the 'in' test
    if not line:
        return
    if str in line:
        print(line)
        dts.writeO(line)
def __clean(param): for key, cnt in [(k, v) for k, v in topicDict.iteritems()]: if cnt < param[0]: topicDict.pop(key) if __name__ == "__main__": dts.setSize(13000000) dts.setFile('/home/server2103/dump/twitter.tweet.json', '../emojiOutput/topics', '../log/topics.emoji') dts.openFiles() dts.loop_with_param_clean(__dealLine, __clean, [ 3, ], 'find hashtags') cnt = 0 sum = 0 print 'start output' for key, value in topicDict.iteritems(): dts.writeO('%s\t:%d\n' % (key, value)) cnt += 1 sum += value dts.writeL('%d hashtags with %d displays' % (cnt, sum)) print '%d hashtags with %d displays' % (cnt, sum) dts.closeFiles() pass
def __preprocess():
    """Preprocess the current input line and write the result.

    Adds the empty-line guard used by every other per-line handler in this
    file, so a falsy value from readlineI (EOF) is no longer passed into
    preprocess_func.preprocess.
    """
    line = dts.readlineI()
    if not line:
        return
    dts.writeO(preprocess_func.preprocess(line))
def __cleanTweet():
    """Strip banned words from the current input line and emit the result."""
    cleaned = __bandWords(dts.readlineI())
    dts.writeO(cleaned)