예제 #1
0
def __readin(tweetSet):
    """Read one input line and add its cleaned tweet text to ``tweetSet``.

    The line is fetched with ``dts.readlineI()`` and parsed as JSON.  The
    record's ``'text'`` field is passed through ``__cleanRT``; the result is
    added to ``tweetSet`` only when it is truthy.

    Args:
        tweetSet: collection exposing ``add`` that receives the cleaned text.

    Returns:
        None.  Nothing is added when the line is falsy, is not valid JSON,
        or cleans down to a falsy value.
    """
    line = dts.readlineI()
    if not line:
        return

    try:
        obj = json.loads(line)
    except ValueError:
        # json.loads signals malformed input with ValueError (JSONDecodeError
        # is a subclass).  The original code also consumed the following line
        # here -- presumably because a broken record spills onto the next
        # line (TODO confirm) -- so that skip is preserved.
        dts.readlineI()
        return

    text = __cleanRT(obj['text'])
    if text:
        tweetSet.add(text)
예제 #2
0
def __divide(parList):
    """Route one tweet line to the file of the single emotion it matches.

    Reads a line via ``dts.readlineI()``, parses it as JSON and looks for the
    icons of every entry of ``Emotions`` in the ``'text'`` field.  When the
    icons of exactly one label are present, the cleaned text has more than
    three tokens, and that emotion's ``'cnt'`` is still below ``parList[0]``,
    the raw line is written to the emotion's ``'fileptr'`` and ``'cnt'`` is
    incremented.

    Args:
        parList: sequence whose first element is the per-emotion quota.

    Returns:
        None.
    """
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']

    # ans: 0 = no icon seen yet, >0 = the only label seen, -1 = icons of
    # several different labels were found (ambiguous tweet).
    ans = 0
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if ans == 0 or label == ans:
                    ans = label
                else:
                    ans = -1

    if ans <= 0:
        return

    # Strip punctuation and placeholders before counting tokens.  Each step
    # must build on the previous result; the earlier version restarted from
    # the raw text in the last substitution and then tokenized the raw text,
    # so none of the cleaning had any effect on the length check.
    ttext = re.sub(r'[!?]', '', text)
    ttext = re.sub(r'[^\d\w\ ]', '', ttext)
    ttext = re.sub('USERNAME', '', ttext)
    ttext = re.sub('URL', '', ttext)
    if len(tokenizer.tokenize(ttext)) <= 3:
        return

    # Labels are used as 1-based indexes into Emotions.
    bucket = Emotions[ans - 1]
    if bucket['cnt'] < parList[0]:
        bucket['cnt'] += 1
        bucket['fileptr'].write(line)
예제 #3
0
def filterHashtags():
    """Tag one tweet line with the first topic pattern that matches it.

    Reads a line via ``dts.readlineI()`` and parses it as JSON.  Each entry
    of ``topicList`` is used as a regular expression against the ``'text'``
    field; for the first one that matches, a JSON object with the keys
    ``"text"`` and ``"hashtag"`` (the matching pattern) is written through
    ``dts.writeO``.  Lines that are falsy, are not valid JSON, or match no
    topic produce no output.

    Returns:
        None.
    """
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # Malformed JSON line: skip it.
        return

    text = obj["text"]

    # Track the match explicitly so that an empty topicList simply yields
    # "no match" instead of a NameError on an unbound loop variable.
    matched = None
    for topic in topicList:
        if re.search(topic, text):
            matched = topic
            break

    if matched is None:
        return

    nobj = {"text": text, "hashtag": matched}
    dts.writeO(json.dumps(nobj) + '\n')
예제 #4
0
def __io():
    """Copy the ``'text'`` field of one JSON input line to the output.

    The line comes from ``dts.readlineI()``; a falsy line is ignored.  The
    text is written through ``dts.writeO`` followed by a newline.
    """
    raw = dts.readlineI()
    if raw:
        record = json.loads(raw)
        dts.writeO(record['text'] + '\n')
예제 #5
0
def __testEmo():
    """Print the text of one JSON input line if it contains U+1F608.

    The line comes from ``dts.readlineI()``; a falsy line is ignored.
    """
    raw = dts.readlineI()
    if not raw:
        return
    tweet = json.loads(raw)['text']
    if u'\U0001F608' in tweet:
        # Single-argument form prints the same under the Python 2 print
        # statement and the print function.
        print(tweet)
예제 #6
0
def __divide():
    """Label one tweet line by emotion and write training samples.

    Reads a line via ``dts.readlineI()`` and parses it as JSON.  When the
    ``'text'`` field contains icons of exactly one entry of ``Emotions``:

    * every ``#hashtag`` in the text is added (lower-cased, without ``#``)
      to the module-level ``hashtagSet``;
    * whitespace runs in the text are collapsed to a single space, and the
      tweet is dropped if its preprocessed, lower-cased form has three
      tokens or fewer;
    * ``{'text', 'label'}`` with the matched label is written via
      ``dts.writeO``;
    * the matching emotion's file receives the text with label ``1``; the
      other emotions' files receive it with label ``-1`` as long as the
      matched emotion's ``'ncnt'`` is below its ``'cnt'``.

    Returns:
        None.
    """
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # Malformed JSON line: skip it.
        return

    text = obj['text']

    # ans: 0 = no icon seen yet, >0 = the only label seen, -1 = icons of
    # several different labels were found (ambiguous tweet).
    ans = 0
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if ans == 0 or label == ans:
                    ans = label
                else:
                    ans = -1

    if ans <= 0:
        return

    # Collect hashtags before any normalisation.  hashtagSet is only
    # mutated, never rebound, so no ``global`` declaration is required.
    for ht in re.findall(r'#\w+', text):
        hashtagSet.add(ht[1:].lower())

    text = re.sub(r'\s+', ' ', text)
    if len(tokenizer.tokenize(preprocess_func.preprocess(text).lower())) <= 3:
        return

    dts.writeO(json.dumps({u'text': text, u'label': ans}) + '\n')

    # Labels are used as 1-based indexes into Emotions; all counters live on
    # the matched emotion's entry.
    matched = Emotions[ans - 1]
    for emo in Emotions:
        if ans == int(emo['label']):
            matched['cnt'] += 1
            sample = {u'text': text, u'label': 1}
            emo['fileptr'].write(json.dumps(sample) + '\n')
        elif matched['ncnt'] < matched['cnt']:
            # Negative sample for another emotion's file, capped by the
            # number of positives recorded so far for the matched emotion.
            matched['ncnt'] += 1
            sample = {u'text': text, u'label': -1}
            emo['fileptr'].write(json.dumps(sample) + '\n')
예제 #7
0
def __calcIDF(dict):
    """Count the tokens of one input line into ``dict``.

    The line comes from ``dts.readlineI()``; a falsy line is ignored.  The
    characters ``!`` and ``?`` and then everything that is not a digit, word
    character or space are removed before tokenizing.  Each token's entry in
    ``dict`` is raised by one (starting from 0).

    Args:
        dict: mapping from term to count, updated in place.
    """
    raw = dts.readlineI()
    if not raw:
        return
    stripped = re.sub(r'[^\d\w\ ]', '', re.sub('[!?]', '', raw))
    for term in tokenizer.tokenize(stripped):
        seen = dict.get(term, 0)
        dict.update({term: seen + 1})
예제 #8
0
def __filter_bigram(ran):
    """Forward one input line if its count lies within ``ran``.

    The line comes from ``dts.readlineI()``; a falsy line is ignored.  The
    count is the integer found in the second ``':'``-separated field.

    Args:
        ran: sequence whose first two items are the inclusive lower and
            upper bounds.
    """
    entry = dts.readlineI()
    if not entry:
        return
    low, high = ran[0], ran[1]
    count = int(entry.split(':')[1])
    if low <= count <= high:
        dts.writeO(entry)
예제 #9
0
def __filter_range(border):
    """Forward one input line if its count lies within ``border``.

    The bounds are read from ``border`` first, then a line is fetched with
    ``dts.readlineI()``; a falsy line is ignored.  The count is the integer
    found in the second ``':'``-separated field.

    Args:
        border: sequence whose first two items are the inclusive lower and
            upper bounds.
    """
    lower, upper = border[0], border[1]
    record = dts.readlineI()
    if record:
        occurrences = int(record.split(':')[1])
        if lower <= occurrences <= upper:
            dts.writeO(record)
예제 #10
0
def __divide():
    """Attach a label to one JSON input line and write it out if positive.

    The line comes from ``dts.readlineI()``; a falsy line is ignored.  The
    label is whatever ``__label`` returns for the record's ``'text'``; only
    records with a label greater than zero are written via ``dts.writeO``,
    with the label stored under the ``'label'`` key.
    """
    raw = dts.readlineI()
    if not raw:
        return
    record = json.loads(raw)
    tag = __label(record['text'])
    if tag > 0:
        record['label'] = tag
        dts.writeO(json.dumps(record) + '\n')
예제 #11
0
def __dealLine(param):
    """Count the hashtags of one tweet line into ``topicDict``.

    Reads a line via ``dts.readlineI()`` and parses it as JSON.  Every
    ``#word`` occurrence in the ``'text'`` field is lower-cased and its
    entry in the module-level ``topicDict`` is raised by one.

    Args:
        param: unused; kept for the caller's signature.

    Returns:
        None.
    """
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # json.loads signals malformed input with ValueError.  The original
        # code also consumed the following line here -- presumably because a
        # broken record spills onto the next line (TODO confirm) -- so that
        # skip is preserved.
        dts.readlineI()
        return

    # An empty findall() result simply makes the loop a no-op.
    for tag in re.findall(r'#\w+', obj['text']):
        tag = tag.lower()
        topicDict.update({tag: topicDict.get(tag, 0) + 1})
예제 #12
0
def __divide():
    """Label one tweet line by emotion, skipping the first million lines.

    Reads a line via ``dts.readlineI()``.  While the module-level counter
    ``GC`` is below 1000000 the line is only counted and discarded.
    Afterwards the line is parsed as JSON and, when the ``'text'`` field
    contains icons of exactly one entry of ``Emotions``:

    * ``!``/``?``, every character that is not a digit, word character or
      space, and the placeholders ``USERNAME`` and ``URL`` are removed; the
      tweet is dropped if three tokens or fewer remain;
    * ``{'text', 'label'}`` with the matched label is written via
      ``dts.writeO``;
    * the matching emotion's file receives the text with label ``1``; the
      other emotions' files receive it with label ``-1`` as long as the
      matched emotion's ``'ncnt'`` is below its ``'cnt'``.

    Returns:
        None.
    """
    global GC

    line = dts.readlineI()
    if not line:
        return

    # Skip (but count) the first 1,000,000 non-empty lines.
    if GC < 1000000:
        GC += 1
        return

    try:
        obj = json.loads(line)
    except ValueError:
        # Malformed JSON line: skip it.
        return

    text = obj['text']

    # ans: 0 = no icon seen yet, >0 = the only label seen, -1 = icons of
    # several different labels were found (ambiguous tweet).
    ans = 0
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if ans == 0 or label == ans:
                    ans = label
                else:
                    ans = -1

    if ans <= 0:
        return

    text = re.sub(r'[!?]', '', text)
    text = re.sub(r'[^\d\w\ ]', '', text)
    text = re.sub('USERNAME', '', text)
    text = re.sub('URL', '', text)
    if len(tokenizer.tokenize(text)) <= 3:
        return

    dts.writeO(json.dumps({u'text': text, u'label': ans}) + '\n')

    # Labels are used as 1-based indexes into Emotions; all counters live on
    # the matched emotion's entry.
    matched = Emotions[ans - 1]
    for emo in Emotions:
        if ans == int(emo['label']):
            matched['cnt'] += 1
            sample = {u'text': text, u'label': 1}
            emo['fileptr'].write(json.dumps(sample) + '\n')
        elif matched['ncnt'] < matched['cnt']:
            # Negative sample for another emotion's file, capped by the
            # number of positives recorded so far for the matched emotion.
            matched['ncnt'] += 1
            sample = {u'text': text, u'label': -1}
            emo['fileptr'].write(json.dumps(sample) + '\n')
예제 #13
0
def dealLine():
    """Count, per emotion, whether one input line contains any of its icons.

    The line comes from ``dts.readlineI()``.  For every entry of
    ``Emotions`` whose ``'Icons'`` include a substring of the line, the line
    is printed and that entry's ``'cnt'`` is raised by one.
    """
    line = dts.readlineI()
    for emo in Emotions:
        # any() stops at the first icon found, like the original break.
        if any(eicon in line for eicon in emo['Icons']):
            print(line)
            emo['cnt'] = emo['cnt'] + 1
예제 #14
0
def __bigram(dict):
    """Count every ordered token pair of one input line into ``dict``.

    The line comes from ``dts.readlineI()``; a falsy line is ignored.  The
    characters ``!`` and ``?`` and then everything that is not a digit, word
    character or space are removed before tokenizing.  Each pair of tokens
    (including a token with itself) is keyed as ``first#second``.

    Args:
        dict: mapping from pair key to count, updated in place.
    """
    raw = dts.readlineI()
    if not raw:
        return
    cleaned = re.sub(r'[^\d\w\ ]', '', re.sub('[!?]', '', raw))

    tokens = tokenizer.tokenize(cleaned)

    for first in tokens:
        for second in tokens:
            pair = first + '#' + second
            seen = dict.get(pair, 0)
            dict.update({pair: seen + 1})
예제 #15
0
 def dealLine():
     """Write one input line to the file of each emotion whose icon it has.

     The line comes from ``dts.readlineI()``.  Entries of ``Emotions`` whose
     ``'cnt'`` already exceeds ``MaxEmotionSize`` are skipped.  For the
     others, the first icon found in the line causes the line to be written
     to the entry's ``'fileptr'`` and its ``'cnt'`` to be raised by one.
     """
     line = dts.readlineI()
     for emo in Emotions:
         if emo['cnt'] > MaxEmotionSize:
             continue
         for eicon in emo['Icons']:
             if line.find(eicon) != -1:
                 emo['fileptr'].write(line)
                 emo['cnt'] = emo['cnt'] + 1
                 # One hit per emotion is enough.
                 break
예제 #16
0
def __lineParse():
    """Convert one ``(features, label)`` line into ``label idx:value ...``.

    The line comes from ``dts.readlineI()``; a falsy line is ignored.  Every
    feature with a value greater than zero is mapped through ``__getIndex``;
    the resulting ``index:value`` pairs are emitted in ascending index
    order after the label, each followed by a space.  The formatted line is
    written via ``dts.writeO`` only when the label is greater than 1.
    """
    raw = dts.readlineI()
    if not raw:
        return

    # NOTE(review): eval() executes whatever the input line contains; this
    # is only safe if the input file is trusted.
    parsed = eval(raw)
    features, label = parsed[0], parsed[1]

    # The index lookup runs for every positive feature even when the line
    # ends up not being written, exactly as before.
    indexed = {}
    for word, value in features.iteritems():
        if value > 0:
            indexed[__getIndex(word)] = value

    pieces = [str(label) + ' ']
    for key in sorted(indexed):
        pieces.append(str(key) + ':' + str(indexed[key]) + ' ')

    if label > 1:
        dts.writeO(''.join(pieces) + '\n')
예제 #17
0
def parse_line():
    """Turn one labelled JSON line into a feature line and a label line.

    Reads a line via ``dts.readlineI()`` and parses it as JSON.  The
    record's ``'label'`` and the result of ``gen_Feature`` on its ``'text'``
    are written as ``"<label> <features>"`` via ``dts.writeO``, and the
    label alone via ``dts.writeL``.  The module-level counter ``PC`` is
    raised by one for every record whose label equals 1.

    Returns:
        None.  Falsy or malformed lines are skipped.
    """
    global PC

    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # Malformed JSON line: skip it.
        return

    label = obj[u'label']
    text = obj[u'text']

    # Convert once and reuse: the comparison already accepted labels stored
    # as strings, but '%d' % label raised TypeError for them.
    label_value = int(label)
    if label_value == 1:
        PC += 1
    output = '%d %s\n' % (label_value, gen_Feature(text))
    dts.writeO(output)
    dts.writeL(str(label) + '\n')
예제 #18
0
def __dealLine(param):
    """Write a binary unigram feature vector for one matching tweet line.

    Reads a line via ``dts.readlineI()`` and parses it as JSON.  Records
    whose ``'hashtag'`` differs from ``param[0]`` are ignored.  For the
    others, the preprocessed text is tokenized and every word returned by
    ``getUnigramWords.getUnigramWords()`` is mapped to 1 if it occurs among
    the tokens and to 0 otherwise.  The tuple ``(features, 0)`` is written
    in its ``str()`` form via ``dts.writeO``.

    Args:
        param: sequence whose first element is the hashtag to keep.

    Returns:
        None.
    """
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    hashtag = obj['hashtag']
    if hashtag != param[0]:
        return

    # A set gives constant-time membership tests, so the vocabulary is
    # scanned once instead of once per token.
    present = set(tokenizer.tokenize(preprocess(text)))
    features = {}
    for uni in getUnigramWords.getUnigramWords():
        features[uni] = 1 if uni in present else 0

    nobj = (features, 0)
    dts.writeO(str(nobj) + '\n')
예제 #19
0
 def __push():
     """Add the next line from ``dts.readlineI()`` to ``tw``."""
     tw.add(dts.readlineI())
예제 #20
0
def findemoji(str):
    """Print and forward one input line if it contains ``str``.

    The line comes from ``dts.readlineI()``; a matching line is printed and
    then written via ``dts.writeO``.

    Args:
        str: substring to look for (the name shadows the builtin; it is
            kept because it is part of the signature).
    """
    candidate = dts.readlineI()
    if str not in candidate:
        return
    print(candidate)
    dts.writeO(candidate)
예제 #21
0
def __line():
    """Count the label of one input line in ``counter``.

    The line comes from ``dts.readlineI()``; a falsy line is ignored.  The
    label is the second element of the evaluated line.
    """
    raw = dts.readlineI()
    if raw:
        # NOTE(review): eval() executes whatever the input line contains;
        # this is only safe if the input file is trusted.
        label = eval(raw)[1]
        counter[label] += 1
예제 #22
0
def __cleanTweet():
    """Pass one input line through ``__bandWords`` and write the result."""
    tweet = dts.readlineI()
    cleaned = __bandWords(tweet)
    dts.writeO(cleaned)
예제 #23
0
def __preprocess():
    """Write the preprocessed form of one input line to the output.

    The line from ``dts.readlineI()`` is handed to
    ``preprocess_func.preprocess`` and the result goes to ``dts.writeO``.
    """
    raw = dts.readlineI()
    dts.writeO(preprocess_func.preprocess(raw))