Пример #1
0
def __preProcess():
    """Run the loading pass, then dump every tweet as one JSON object per line.

    ``dts.loop_with_param`` is invoked with ``__readin`` and the module-level
    ``tweets`` collection (presumably ``__readin`` fills it -- confirm against
    its definition).  Each element of ``tweets`` is then written through
    ``dts.writeO`` as ``{"text": <tweet>}`` followed by a newline, and the
    final element count is reported on stdout and through ``dts.writeL``.
    """
    dts.loop_with_param(__readin, tweets, 'loading files')

    for text in tweets:
        record = json.dumps({u'text': text})
        dts.writeO(record + '\n')

    # Same message goes to the console and to the log (no trailing newline
    # is added for the log entry).
    summary = '%d tweets remaining' % len(tweets)
    print(summary)
    dts.writeL(summary)
Пример #2
0
def featureVectorParse():
    """Parse ``../data/featvect`` line by line and log the resulting name map.

    Configures ``dts`` with a size of 10000 and the three paths below, opens
    the files, runs ``__lineParse`` through ``dts.loop`` and finally writes
    ``str(name_dict)`` via ``dts.writeL``.

    The files are closed in a ``finally`` clause so that a failure inside the
    parsing loop no longer leaves them open.
    """
    dts.setSize(10000)
    dts.setFile('../data/featvect', '../emojiOutput/featureWang10000_no01',
                '../log/featureWang')
    dts.openFiles()
    try:
        dts.loop(__lineParse, 'parse featvect')

        dts.writeL(str(name_dict))
    finally:
        # Always release the handles, even if parsing raised part-way.
        dts.closeFiles()
Пример #3
0
def divideHashtag():
    """Run ``__divide`` over the pre-processed tweets and report label counts.

    Configures ``dts`` with a size of 1000000 and the three paths below, opens
    the files and runs ``__divide`` through ``dts.loop``.  Afterwards the
    ``label`` and ``cnt`` fields of every entry in ``EmoList`` are printed and
    written via ``dts.writeL``.

    The files are closed in a ``finally`` clause so that an exception during
    the loop or the reporting does not leak the open handles.
    """
    dts.setSize(1000000)
    dts.setFile('../hashOutput/afterPre.txt',
                '../hashOutput/divideHashtag.txt', '../log/divideHashtag.log')
    dts.openFiles()
    try:
        dts.loop(__divide, 'divide by Hashtag')

        for emo in EmoList:
            # Format once; the log line is the console line plus a newline.
            summary = 'label %d \t: %d' % (emo['label'], emo['cnt'])
            print(summary)
            dts.writeL(summary + '\n')
    finally:
        dts.closeFiles()
Пример #4
0
def parse_line():
    """Read one input line, convert it to a feature line and log its label.

    The line is read with ``dts.readlineI`` and decoded as JSON.  Empty lines
    and lines that are not valid JSON are skipped silently.  For a valid
    record, the ``label`` and ``text`` fields are read, the module-level
    counter ``PC`` is incremented when the label equals 1, the line
    ``"<label> <gen_Feature(text)>\\n"`` is written with ``dts.writeO`` and the
    label followed by a newline is written with ``dts.writeL``.

    Raises:
        KeyError: if the decoded object lacks ``label`` or ``text``.
        ValueError/TypeError: if ``label`` cannot be converted with ``int``.
    """
    global PC

    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # Only malformed JSON is skipped.  A bare ``except`` here used to
        # swallow KeyboardInterrupt and unrelated programming errors too.
        return

    label = obj[u'label']
    text = obj[u'text']

    # Convert once and reuse: the comparison already went through int(), but
    # the '%d' formatting used the raw value and failed for labels stored as
    # numeric strings.
    label_value = int(label)
    if label_value == 1:
        PC += 1
    output = '%d %s\n' % (label_value, gen_Feature(text))
    dts.writeO(output)
    dts.writeL(str(label) + '\n')
Пример #5
0
def make_bigram():
    """Build the bigram count table, dump frequent bigrams and a histogram.

    ``dts.loop_with_param_clean`` is run with ``__bigram`` / ``__bigram_clean``
    and a fresh dictionary that those callbacks populate (bigram -> count,
    judging by how the result is consumed here).

    Output:
      * every bigram whose count is greater than 10 is written with
        ``dts.writeO`` as ``"<bigram>:<count>\\n"``;
      * a histogram "count value -> number of bigrams with that count" is
        written with ``dts.writeL``; all counts of 100000 or more share the
        single bucket ``100000``.
    """
    # Named ``bigram_counts`` rather than ``dict`` so the builtin stays usable.
    bigram_counts = {}
    dts.loop_with_param_clean(__bigram, __bigram_clean, bigram_counts,
                              'Make the dict of bigram')

    print('start to output')

    overflow_bucket = 100000
    # Pre-seed buckets 0..99999 with zero so that empty buckets are logged too.
    histogram = dict.fromkeys(range(overflow_bucket), 0)
    for bigram, count in bigram_counts.iteritems():
        if count > 10:
            dts.writeO(bigram + ':' + str(count) + '\n')
        if count >= overflow_bucket:
            # Previously this branch stored the old value back without adding
            # one, so the overflow bucket was never incremented.
            histogram[overflow_bucket] = histogram.get(overflow_bucket, 0) + 1
        else:
            histogram[count] = histogram.get(count, 0) + 1

    for bucket, total in histogram.iteritems():
        dts.writeL(str(bucket) + ':' + str(total) + '\n')
0
def labelCounter():
    """Count labels 0..8 in ``../data/featvect`` and report their shares.

    Resets entries 0..8 of the module-level ``counter``, runs ``__line``
    through ``dts.loop`` (which is expected to update ``counter`` -- confirm
    against its definition), then prints and logs one line per label with its
    count and percentage of the total, and prints the total.

    When nothing was counted, every percentage is reported as 0.00 rather
    than raising ``ZeroDivisionError``.  The files are closed in a
    ``finally`` clause so a failure in the loop does not leak them.
    """
    global counter

    labels = range(9)

    dts.setSize(100000)
    dts.setFile('../data/featvect', '', '../log/featvectLabelCount')
    dts.openFiles()
    try:
        for label in labels:
            counter[label] = 0
        dts.loop(__line, 'parse featvect')

        # ``total`` instead of ``sum`` so the builtin is not shadowed.
        total = 0
        for label in labels:
            total += counter[label]

        for label in labels:
            # Guard the empty-input case; otherwise same float arithmetic.
            share = counter[label] * 100.0 / total if total else 0.0
            row = 'Label\t%d\t:%d (%.2f%%)' % (label, counter[label], share)
            print(row)
            dts.writeL(row + '\n')

        print('Sum\t\t:%d' % total)
    finally:
        dts.closeFiles()
Пример #7
0
def make_dict():
    """Build the word-count dictionary and write the filtered vocabulary.

    ``__calcIDF`` is run through ``dts.loop_with_param`` with a fresh
    dictionary that it populates (word -> integer count, judging by the
    ``%d`` formatting below).  Then, in sorted key order:

      * words with ``10 < count < 364600`` are written with ``dts.writeO`` as
        ``"<word>:<count>\\n"``;
      * words with ``count > 364600`` are printed to stdout;
      * every word is added to a histogram with buckets of width 10; counts
        above ``CNT_MAX * 10`` share the last bucket.

    Finally, for each bucket ``x`` the log receives the number of words that
    fall in a later bucket, i.e. ``cnt`` minus the cumulative bucket total.

    The files are closed in a ``finally`` clause so an exception during
    processing does not leak the open handles.
    """
    LOWER_CUTOFF = 10
    UPPER_CUTOFF = 364600
    CNT_MAX = 1000000

    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt',
                '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()
    try:
        # Named ``word_counts`` rather than ``dict`` to keep the builtin usable.
        word_counts = {}
        dts.loop_with_param(__calcIDF, word_counts, 'calc the Idf')

        print('start sort and print')
        cnt = 0
        pcnt = 0
        CntDistribution = dict.fromkeys(range(CNT_MAX + 1), 0)
        for key in sorted(word_counts):
            value = word_counts[key]
            if LOWER_CUTOFF < value < UPPER_CUTOFF:
                dts.writeO('%s:%d\n' % (key, value))
                pcnt += 1
            cnt += 1
            # NOTE(review): a count of exactly UPPER_CUTOFF is neither written
            # nor printed -- confirm whether one comparison should be
            # inclusive.
            if value > UPPER_CUTOFF:
                print(key)
            if value > CNT_MAX * 10:
                CntDistribution[CNT_MAX] += 1
            else:
                # Floor division keeps the bucket index an integer regardless
                # of the interpreter's ``/`` semantics.
                CntDistribution[value // 10] += 1

        print('%d words output' % pcnt)
        dts.writeL('%d words output\n' % pcnt)

        print('printing range log')
        ncnt = 0
        for x in range(CNT_MAX):
            ncnt += CntDistribution[x]
            dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, cnt - ncnt))
    finally:
        dts.closeFiles()
Пример #8
0
                if flag != -1 :
                    emo['fileptr'].write( line )
                    break
            if flag >= 0:
                emo['cnt'] = emo['cnt'] + 1

    dts.loop( dealLine, 'check Emoticons' )

    for emo in Emotions:
        emo['fileptr'].close()

    print '============='
    print 'processed Tweets:' + str( dts.processSize )
    for emo in Emotions:
        print emo['filename'] + ':' + str( emo['cnt'] )
        dts.writeL( emo['filename'] + ':' + str( emo['cnt'] ) + '\n' )

    dts.closeFiles()

#tfile = open( '../data/tweets_small.txt', 'r' )
#
#for x in range( processTweetSize + 1 ):
#    line = tfile.readline()
#    for emo in Emotions:
#        flag = -2
#        for eicon in emo['Icons']:
#            flag = line.find( eicon )
#            if flag != -1 :
#                break
#        if flag >= 0:
#            emo['cnt'] = emo['cnt'] + 1
Пример #9
0
            if ans == int(emo['label']):
                label = 1
                Emotions[ans - 1]['cnt'] += 1
                tmp = {u'text': text, u'label': label}
                emo['fileptr'].write(json.dumps(tmp) + '\n')
            else:
                if Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
                    label = -1
                    Emotions[ans - 1]['ncnt'] += 1
                    tmp = {u'text': text, u'label': label}
                    emo['fileptr'].write(json.dumps(tmp) + '\n')

    pass


if __name__ == "__main__":
    # Script entry point: configure the dts helper and open its files.
    dts.setSize(2000000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/test_featre_all',
                '../log/test_labeled_by_emoji_log')
    dts.openFiles()

    # One UTF-8 output file per emotion, named after the emotion.
    for emo in Emotions:
        emo['fileptr'] = codecs.open(OutputDir + emo['name'], 'w', 'utf-8')

    dts.loop(__divide, 'divide and label twiiters')

    # Report the per-emotion counters on stdout and in the log, closing each
    # emotion file as it is reported.
    for emo in Emotions:
        summary = '%s\t:\t%d' % (emo['name'], emo['cnt'])
        print(summary)
        dts.writeL(summary + '\n')
        emo['fileptr'].close()

    dts.closeFiles()
Пример #10
0

def __clean(param):
    """Drop every entry of ``topicDict`` whose count is below ``param[0]``.

    Args:
        param: sequence whose first element is the minimum count to keep.
    """
    # Collect the keys first so the dictionary is not mutated while iterating.
    stale_keys = [key for key, cnt in topicDict.items() if cnt < param[0]]
    for key in stale_keys:
        del topicDict[key]


if __name__ == "__main__":
    # Script entry point: configure the dts helper and open its files.
    dts.setSize(13000000)
    dts.setFile('/home/server2103/dump/twitter.tweet.json',
                '../emojiOutput/topics', '../log/topics.emoji')
    dts.openFiles()

    # __clean receives [3] as its parameter, i.e. a minimum count of 3.
    dts.loop_with_param_clean(__dealLine, __clean, [3], 'find hashtags')

    # Dump every remaining hashtag with its count while accumulating totals.
    cnt = 0
    sum = 0
    print('start output')
    for key, value in topicDict.iteritems():
        dts.writeO('%s\t:%d\n' % (key, value))
        cnt += 1
        sum += value

    # Same summary goes to the log first, then to the console.
    summary = '%d hashtags with %d displays' % (cnt, sum)
    dts.writeL(summary)
    print(summary)

    dts.closeFiles()
Пример #11
0
"""
find emoji in tweets
"""
import io
import os
import re
import codecs
import dealTweets as dts

# Module-level setup, executed on import: configure the shared dealTweets
# helper with a size of 50000 (presumably the number of input lines to
# process -- confirm in dealTweets) and three paths (presumably input, output
# and log, matching the readlineI / writeO / writeL calls in this script --
# confirm), then open the files.
dts.setSize(50000)
dts.setFile("../data/tweet_noRT_noDup.txt", "../tmp/b.out", "../tmp/c.out")
dts.openFiles()


def findemoji(str):
    """Read the next input line and echo it if it contains ``str``.

    Args:
        str: the substring (here, an emoji) to look for.  The name shadows
            the builtin but is kept because it is part of the signature.

    A matching line is printed to stdout and written with ``dts.writeO``;
    a non-matching line is ignored.
    """
    line = dts.readlineI()
    if str not in line:
        return
    print(line)
    dts.writeO(line)


#dts.loop_with_param( findemoji, u'☺️', u'try to find Emoji :☺️' )
#dts.writeL( '0001F612'.decode('hex').encode('utf-8') )

# The UTF-8 byte sequence E2 98 BA EF B8 8F decodes to U+263A U+FE0F
# (the smiling-face emoji followed by a variation selector).
smile = '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8')

# Log the decoded emoji.  The previous literal u'\xe2\x98\xba\xef\xb8\x8f'
# placed the six byte values into a unicode string as six separate Latin-1
# code points, so mojibake was logged instead of the emoji.
dts.writeL(smile + u' with hay!')

# Scan the input for lines containing the emoji.
dts.loop_with_param(findemoji, smile, u'try to find Emoji :' + smile)

# Round-trip the emoji back to UTF-8 bytes for the console.
print(smile.encode('utf-8'))

dts.closeFiles()