Example #1
def preAll():
    dts.setSize(7000000)
    dts.setFile('../data/twitter.tweets.json', '../emojiOutput/afterPre.txt',
                '../log/EmojiPre.log')
    dts.openFiles()
    __preProcess()
    dts.closeFiles()
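All of the examples below drive the same dts helper: declare an expected line count, bind input/output/log files, open them, run a per-line callback via loop or loop_with_param, and close. The module itself is not part of any snippet, so the following is only a minimal sketch of the interface these calls assume; the method names come from the examples, but the internals (progress reporting, EOF handling, how the parameter reaches the callback) are guesses and may differ from the real implementation.

# -*- coding: utf-8 -*-
# Minimal sketch (Python 2) of the dts helper assumed by these examples.
# Only the methods actually called in the snippets are stubbed.
import codecs


class _Dts(object):
    def setSize(self, size):
        # expected number of input lines; used here only to bound the loop
        self.size = size

    def setFile(self, ifileName, ofileName, lfileName):
        self.ifileName = ifileName
        self.ofileName = ofileName
        self.lfileName = lfileName

    def openFiles(self):
        self.ifile = codecs.open(self.ifileName, 'r', 'utf-8')
        # an empty output name (see divideEmoticons) means no main output file
        self.ofile = (codecs.open(self.ofileName, 'w', 'utf-8')
                      if self.ofileName else None)
        self.lfile = codecs.open(self.lfileName, 'w', 'utf-8')

    def readlineI(self):
        return self.ifile.readline()

    def writeO(self, text):
        self.ofile.write(text)

    def writeL(self, text):
        self.lfile.write(text)

    def loop(self, func, desc):
        # call func() once per expected line; callbacks pull input via readlineI()
        print desc
        for _ in xrange(self.size):
            func()

    def loop_with_param(self, func, param, desc):
        # same as loop(), but the callback also receives param
        print desc
        for _ in xrange(self.size):
            func(param)

    def closeFiles(self):
        self.ifile.close()
        if self.ofile is not None:
            self.ofile.close()
        self.lfile.close()


dts = _Dts()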
Example #2
def __testfind():
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/afterPre.txt', '../emojiOutput/EmoAll.txt',
                '../log/divideEmoticons')
    dts.openFiles()
    dts.loop(__testEmo, 'test emoji')
    dts.closeFiles()
Example #3
def bigram():
    dts.setFile('../output/afterPre.txt', '../output/BiDict.txt',
                '../log/bigram.txt')
    dts.setSize(25000000)
    dts.openFiles()
    make_bigram()
    dts.closeFiles()
Example #4
def select_bigram():
    dts.setFile('../output/BiDict.txt', '../output/select_bigram.txt',
                '../log/select_bigram')
    dts.setSize(389920)
    dts.openFiles()
    dts.loop_with_param(__filter_bigram, [100, 100000], 'filter_bigram')
    dts.closeFiles()
Example #5
def topicFilter():
    dts.setSize(14000000)
    dts.setFile("/home/server2103/dump/twitter.tweet.json",
                "../entityOutput/topictwitter", "../log/matchtwitter")
    dts.openFiles()
    dts.loop(filterHashtags, 'filterHashtags')
    dts.closeFiles()
Example #6
def select_dict():
    dts.setFile('../output/Dict_raw.txt', '../output/Dict_select.txt',
                '../log/idf_select.log')
    dts.setSize(214884)
    dts.openFiles()

    dts.loop_with_param(__filter_range, [1000, 34400], 'filter Dict_raw')

    dts.closeFiles()
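The __filter_range callback is not shown here. Given that make_dict() (Example #15) writes Dict_raw.txt as word:count lines, the [1000, 34400] parameters read like a frequency window, and a hypothetical callback of the right shape might look like this (illustrative only; the real helper may differ):

# Hypothetical callback for dts.loop_with_param(__filter_range, [low, high], ...):
# keep only "word:count" lines whose count lies strictly inside the window.
def __filter_range(param):
    low, high = param
    line = dts.readlineI()
    if not line:
        return
    word, sep, cnt = line.strip().rpartition(':')
    if not sep or not cnt.isdigit():
        return
    if low < int(cnt) < high:
        dts.writeO(line)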
Example #7
def featureUnigram():
    topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n",
                 ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl",
                 ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande",
                 ur"#lol"]

    for hashtag in topicList:
        topic = hashtag[1:]
        dts.setSize(50000)
        dts.setFile('../entityOutput/topictwitter',
                    '../entityOutput/topicTwitter_' + topic,
                    '../log/topicTwitterFeatvect')
        dts.openFiles()
        dts.loop_with_param(__dealLine, [hashtag],
                            'Generating Unigram With Tag:' + topic)
        dts.closeFiles()
Example #8
def featureVectorParse():
    dts.setSize(10000)
    dts.setFile('../data/featvect', '../emojiOutput/featureWang10000_no01',
                '../log/featureWang')
    dts.openFiles()
    dts.loop(__lineParse, 'parse featvect')

    dts.writeL(str(name_dict))

    dts.closeFiles()
Example #9
def divideHashtag():
    dts.setSize(1000000)
    dts.setFile('../hashOutput/afterPre.txt',
                '../hashOutput/divideHashtag.txt', '../log/divideHashtag.log')
    dts.openFiles()

    dts.loop(__divide, 'divide by Hashtag')
    for emo in EmoList:
        print 'label %d \t: %d' % (emo['label'], emo['cnt'])
        dts.writeL('label %d \t: %d\n' % (emo['label'], emo['cnt']))

    dts.closeFiles()
Example #10
def divideEmoticons():
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/EafterPre.txt', '', '../log/divideEmoticons')
    dts.openFiles()

    for emo in Emotions:
        emo['fileptr'] = codecs.open(outputDir + emo['filename'], 'w', 'utf-8')

    dts.loop_with_param(__divide, [
        3000,
    ], 'divide Emotions')

    for emo in Emotions:
        print '%s\t:\t%d' % (emo['filename'], emo['cnt'])
        emo['fileptr'].close()
    dts.closeFiles()
Example #11
def __cleanDup():
    dts.openFiles()
    tw = set()

    def __push():
        text = dts.readlineI()
        tw.add(text)

    dts.loop(__push, 'push into set')
    print 'start write to file %s' % dts.ofileName
    cnt = 0
    for text in tw:
        dts.writeO(text)
        cnt += 1
    print 'write finished, tot tweet left: %d' % cnt

    dts.closeFiles()
Example #12
def featureGenerator():
    dts.setSize(5000)
    dts.setFile('../emojiOutput/afterPre.txt', '../emojiOutput/feature5000.txt',
                '../log/emojiFeatureGenerator.log')
    dts.openFiles()

    __featureGenerator_init()

    for emo in devideEmotion.Emotions:
        filename = devideEmotion.outputDir + emo['filename']
        ifile = codecs.open(filename, 'r', 'utf-8')
        # print 'Processing %s:' % emo['filename']
        dts.loop_with_param(__g_each_tweet, [emo['label'], ifile],
                            emo['filename'])
        ifile.close()

    #dts.loop( __g_each_tweet, 'feature Generator' )

    dts.closeFiles()
Example #13
def featureVectorParse():
    topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n",
                 ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl",
                 ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande",
                 ur"#lol"]

    dfile = codecs.open('../log/featureWang', 'r', 'utf-8')
    line = dfile.readline()
    global name_dict
    name_dict = eval(line)
    dfile.close()

    for topic in topicList:
        ifilename = '../entityOutput/topicTwitter_' + topic[1:]
        ofilename = '../entityOutput/topicFeat_' + topic[1:]
        lfilename = '../log/featureVectorParse_entity'

        dts.setSize(50000)
        dts.setFile(ifilename, ofilename, lfilename)
        dts.openFiles()
        dts.loop(__lineParse, 'parse featvect:' + topic)
        dts.closeFiles()
Example #14
def labelCounter():
    dts.setSize(100000)
    dts.setFile('../data/featvect', '', '../log/featvectLabelCount')
    dts.openFiles()
    global counter
    for x in range(9):
        counter[x] = 0
    dts.loop(__line, 'parse featvect')

    sum = 0
    for x in range(9):
        sum += counter[x]

    for x in range(9):
        print 'Label\t%d\t:%d (%.2f%%)' % (
            x, counter[x], float(counter[x] * 100.0) / float(sum))
        dts.writeL('Label\t%d\t:%d (%.2f%%)\n' %
                   (x, counter[x], float(counter[x] * 100.0) / float(sum)))

    print 'Sum\t\t:%d' % sum

    dts.closeFiles()
Example #15
def make_dict():
    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt',
                '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()

    dict = {}
    dts.loop_with_param(__calcIDF, dict, 'calc the Idf')

    print 'start sort and print'
    cnt = 0
    pcnt = 0
    CntDistribution = {}
    CNT_MAX = 1000000
    for x in range(CNT_MAX + 1):
        CntDistribution[x] = 0
    for key, value in [(k, dict[k]) for k in sorted(dict.keys())]:
        if value > 10 and value < 364600:
            dts.writeO('%s:%d\n' % (key, value))
            pcnt += 1
        cnt += 1
        if (value > 364600):
            print key
        if (value > CNT_MAX * 10):
            CntDistribution[CNT_MAX] += 1
        else:
            CntDistribution[value / 10] += 1

    print '%d words output' % pcnt
    dts.writeL('%d words output\n' % pcnt)

    print 'printing range log'
    ncnt = 0
    for x in range(CNT_MAX):
        ncnt += CntDistribution[x]
        dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, cnt - ncnt))

    dts.closeFiles()
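The __calcIDF callback is also not shown. The "idf" naming and the later per-word count filtering suggest the dict maps each token to the number of tweets it occurs in; a plausible sketch under that assumption (hypothetical, not the author's code):

# Hypothetical __calcIDF callback: treat each input line as one tweet and
# count, per token, how many tweets contain it. The dict passed through
# loop_with_param is mutated in place.
def __calcIDF(word_df):
    line = dts.readlineI()
    if not line:
        return
    for token in set(line.split()):
        word_df[token] = word_df.get(token, 0) + 1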
Example #16
def filterEmoticons():
    dts.setSize(310000)
    dts.setFile('../data/tweet_noRT_noDup.txt', '../tmp/filter.out',
                '../log/filterEmoticons.log')
    dts.openFiles()
    dts.loop(__cleanTweet, 'clean Tweets')
    dts.closeFiles()
Example #17
            loop_lfilename = '../Compare_Output/ans_unihash_'
            all_ofilename = '../emojiOutput/feautre_unihash_all'
            all_lfilename = '../Compare_Output/ans_unihash_all'
        elif __type == 'UnigramEmoticon_run':
            DictDir = '../emojiOutput/UnigramEmoticonDict'
            loop_ofilename = '../emojiOutput/feautre_uniemo_'
            loop_lfilename = '../Compare_Output/ans_uniemo_'
            all_ofilename = '../emojiOutput/feautre_uniemo_all'
            all_lfilename = '../Compare_Output/ans_uniemo_all'
        load_Index()

        for Emo in divideByEmoji.Emotions:
            ifilename = divideByEmoji.OutputDir + Emo['name']
            ofilename = loop_ofilename + Emo['name']
            lfilename = loop_lfilename + Emo['name']
            dts.setSize(100000)
            dts.setFile(ifilename, ofilename, lfilename)
            dts.openFiles()
            PC = 0
            dts.loop(parse_line, 'generating ' + Emo['name'])
            dts.closeFiles()

        ifilename = '../emojiOutput/featre_all'
        dts.setSize(100000)
        dts.setFile(ifilename, all_ofilename, all_lfilename)
        dts.openFiles()
        dts.loop(parse_line, 'generating all')
        dts.closeFiles()

    pass