def __readin(tweetSet):
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # a record was split across lines; consume the continuation and skip it
        dts.readlineI()
        return
    text = __cleanRT(obj['text'])
    if text:
        tweetSet.add(text)
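# Everything in this module reads and writes through a shared stream wrapper
# `dts`, whose definition is not part of this section. A minimal sketch of the
# interface the call sites imply (readlineI / writeO / writeL are the observed
# names; the class itself and its constructor are assumptions):
class _DTSSketch(object):
    def __init__(self, in_path, out_path, label_path=None):
        self._in = open(in_path)
        self._out = open(out_path, 'w')
        self._label = open(label_path, 'w') if label_path else None

    def readlineI(self):
        # returns '' at EOF, which is why every handler starts with `if not line`
        return self._in.readline()

    def writeO(self, s):
        self._out.write(s)

    def writeL(self, s):
        self._label.write(s)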
def __divide(parList):
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    ans = 0
    # ans holds the single matching emotion label, or -1 on a conflict
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1
    if ans <= 0:
        return
    # strip punctuation and the USERNAME/URL placeholders before length filtering
    ttext = re.sub(r'[!?]', '', text)
    ttext = re.sub(r'[^\d\w\ ]', '', ttext)
    ttext = re.sub('USERNAME', '', ttext)
    ttext = re.sub('URL', '', ttext)
    if len(tokenizer.tokenize(ttext)) <= 3:
        return
    if Emotions[ans - 1]['cnt'] < parList[0]:
        Emotions[ans - 1]['cnt'] += 1
        Emotions[ans - 1]['fileptr'].write(line)
def filterHashtags():
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return
    text = obj["text"]
    pt = "space"
    matchs = None
    for topic in topicList:
        matchs = re.search(topic, text)
        if matchs:
            pt = topic
            break
    if not matchs:
        return
    nobj = {"text": text, "hashtag": pt}
    dts.writeO(json.dumps(nobj) + '\n')
def __io():
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    dts.writeO(text + '\n')
def __testEmo():
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    # U+1F608 is the "smiling face with horns" emoji
    if u'\U0001F608' in obj['text']:
        print obj['text']
def __divide():
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return
    text = obj['text']
    ans = 0
    # ans holds the single matching emotion label, or -1 on a conflict
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1
    if ans <= 0:
        return
    # record every hashtag seen in a labeled tweet
    global hashtagSet
    for ht in re.findall(r'#\w+', text):
        hashtagSet.add(ht[1:].lower())
    # collapse whitespace, then drop tweets of 3 or fewer preprocessed tokens
    text = re.sub(r'\s+', ' ', text)
    if len(tokenizer.tokenize(preprocess_func.preprocess(text).lower())) <= 3:
        return
    tmp = {u'text': text, u'label': ans}
    dts.writeO(json.dumps(tmp) + '\n')
    # write a positive sample to the matching emotion's file; write a negative
    # sample to each other file only while the matched emotion's negative
    # count still trails its positive count
    for emo in Emotions:
        if ans == int(emo['label']):
            Emotions[ans - 1]['cnt'] += 1
            tmp = {u'text': text, u'label': 1}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
        elif Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
            Emotions[ans - 1]['ncnt'] += 1
            tmp = {u'text': text, u'label': -1}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
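# The `Emotions` table is defined elsewhere; the fields touched here are
# 'label', 'Icons', 'cnt', 'ncnt', and 'fileptr', and the `Emotions[ans - 1]`
# indexing implies labels are 1-based list positions. A hypothetical layout
# (the icons and file names are illustrative, not the project's actual data):
def _make_emotions_example():
    return [
        {'label': '1', 'Icons': [u':)', u':-)', u'\U0001F60A'],
         'cnt': 0, 'ncnt': 0, 'fileptr': open('emo_1.json', 'w')},
        {'label': '2', 'Icons': [u':(', u':-(', u'\U0001F622'],
         'cnt': 0, 'ncnt': 0, 'fileptr': open('emo_2.json', 'w')},
    ]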
def __calcIDF(df):
    # accumulate raw per-term counts in df; the log weighting is applied elsewhere
    line = dts.readlineI()
    if not line:
        return
    line = re.sub(r'[!?]', '', line)
    line = re.sub(r'[^\d\w\ ]', '', line)
    for term in tokenizer.tokenize(line):
        df[term] = df.get(term, 0) + 1
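# __calcIDF only accumulates counts; turning them into IDF weights is left to
# the caller. A minimal sketch, assuming the standard idf = log(N / df) with N
# the number of lines processed (both N and the +1 smoothing are assumptions,
# not taken from this module):
import math

def _idf_from_counts(df, n_lines):
    # +1 in the denominator is a common smoothing convention
    return dict((term, math.log(float(n_lines) / (1 + cnt)))
                for term, cnt in df.iteritems())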
def __filter_bigram(ran):
    line = dts.readlineI()
    if not line:
        return
    left, right = ran[0], ran[1]
    # input lines look like "<bigram>:<count>"; keep counts within [left, right]
    num = int(line.split(':')[1])
    if left <= num <= right:
        dts.writeO(line)
def __filter_range(border):
    line = dts.readlineI()
    if not line:
        return
    left, right = border[0], border[1]
    cnt = int(line.split(':')[1])
    if left <= cnt <= right:
        dts.writeO(line)
def __divide():
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    label = __label(text)
    if label > 0:
        obj['label'] = label
        dts.writeO(json.dumps(obj) + '\n')
def __dealLine(param):
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # a record was split across lines; consume the continuation and skip it
        dts.readlineI()
        return
    text = obj['text']
    matchs = re.findall(r'#\w+', text)
    if not matchs:
        return
    # tally each hashtag, case-insensitively
    for words in matchs:
        words = words.lower()
        topicDict[words] = topicDict.get(words, 0) + 1
def __divide():
    line = dts.readlineI()
    if not line:
        return
    # skip the first 1,000,000 lines of input
    global GC
    if GC < 1000000:
        GC += 1
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return
    text = obj['text']
    ans = 0
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1
    if ans <= 0:
        return
    text = re.sub(r'[!?]', '', text)
    text = re.sub(r'[^\d\w\ ]', '', text)
    text = re.sub('USERNAME', '', text)
    text = re.sub('URL', '', text)
    if len(tokenizer.tokenize(text)) <= 3:
        return
    tmp = {u'text': text, u'label': ans}
    dts.writeO(json.dumps(tmp) + '\n')
    for emo in Emotions:
        if ans == int(emo['label']):
            Emotions[ans - 1]['cnt'] += 1
            tmp = {u'text': text, u'label': 1}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
        elif Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
            Emotions[ans - 1]['ncnt'] += 1
            tmp = {u'text': text, u'label': -1}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
def dealLine():
    line = dts.readlineI()
    if not line:
        return
    for emo in Emotions:
        flag = -2
        for eicon in emo['Icons']:
            if eicon in line:
                print line
                flag = 0
                break
        if flag >= 0:
            emo['cnt'] = emo['cnt'] + 1
def __bigram(counts):
    text = dts.readlineI()
    if not text:
        return
    text = re.sub(r'[!?]', '', text)
    text = re.sub(r'[^\d\w\ ]', '', text)
    tokens = tokenizer.tokenize(text)
    # count every ordered token pair in the line (full co-occurrence, not
    # just adjacent bigrams), keyed as "wx#wy"
    for wx in tokens:
        for wy in tokens:
            key = wx + '#' + wy
            counts[key] = counts.get(key, 0) + 1
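# __filter_bigram above consumes "<pair>:<count>" lines, so the dict built by
# __bigram presumably gets flushed in that format between the two passes. A
# hypothetical dump helper (not part of the original module):
def _dump_counts(counts, out):
    for key, cnt in counts.iteritems():
        out.write('%s:%d\n' % (key, cnt))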
def dealLine():
    line = dts.readlineI()
    if not line:
        return
    for emo in Emotions:
        # cap each emotion's output file at MaxEmotionSize lines
        if emo['cnt'] > MaxEmotionSize:
            continue
        flag = -2
        for eicon in emo['Icons']:
            flag = line.find(eicon)
            if flag != -1:
                emo['fileptr'].write(line)
                break
        if flag >= 0:
            emo['cnt'] = emo['cnt'] + 1
def __lineParse():
    line = dts.readlineI()
    if not line:
        return
    # each line is a (feature_dict, label) tuple serialized with str()
    obj = eval(line)
    output = str(obj[1]) + ' '
    wordlist = {}
    for word, value in obj[0].iteritems():
        if value > 0:
            wordlist[__getIndex(word)] = value
    # emit libSVM-style "label index:value ..." with indices ascending
    for key in sorted(wordlist.keys()):
        output += str(key) + ':' + str(wordlist[key]) + ' '
    if obj[1] > 1:
        dts.writeO(output + '\n')
def parse_line():
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return
    label = obj[u'label']
    text = obj[u'text']
    # PC counts the positive samples seen so far
    global PC
    if int(label) == 1:
        PC += 1
    output = '%d %s\n' % (label, gen_Feature(text))
    dts.writeO(output)
    dts.writeL(str(label) + '\n')
def __dealLine(param):
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    if obj['hashtag'] != param[0]:
        return
    wordList = tokenizer.tokenize(preprocess(text))
    # binary unigram features over the fixed vocabulary
    feats = {}
    for uni in getUnigramWords.getUnigramWords():
        feats[uni] = 1 if uni in wordList else 0
    nobj = (feats, 0)
    dts.writeO(str(nobj) + '\n')
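# __dealLine serializes each sample as str((feature_dict, label)), and
# __lineParse above recovers it with eval(). eval() works on this file format
# but executes arbitrary expressions; ast.literal_eval is the safe equivalent
# for these literal tuples (a suggestion, not what this module uses):
import ast

def _load_sample(line):
    # e.g. _load_sample("({'good': 1, 'bad': 0}, 0)") -> ({'good': 1, 'bad': 0}, 0)
    return ast.literal_eval(line)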
def __push():
    text = dts.readlineI()
    if not text:
        return
    tw.add(text)
def findemoji(emoji):
    line = dts.readlineI()
    if not line:
        return
    if emoji in line:
        print line
        dts.writeO(line)
def __line():
    line = dts.readlineI()
    if not line:
        return
    # each line is a (feature_dict, label) tuple; tally the labels
    obj = eval(line)
    counter[obj[1]] += 1
def __cleanTweet():
    line = dts.readlineI()
    if not line:
        return
    dts.writeO(__bandWords(line))
def __preprocess():
    line = dts.readlineI()
    if not line:
        return
    dts.writeO(preprocess_func.preprocess(line))
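# Every handler above consumes exactly one input line per call and returns
# early on a falsy line, so the missing main loop presumably just invokes a
# handler repeatedly. A hypothetical driver (the fixed call count is an
# assumption; the real scripts may detect end-of-input differently):
def _run(handler, args=(), n_calls=1000000):
    for _ in xrange(n_calls):
        handler(*args)

# e.g. df = {}; _run(__calcIDF, (df,), n_calls=500000)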