Exemplo n.º 1
0
def __testfind():
    """Run the __testEmo callback over the preprocessed emoji corpus."""
    src = '../emojiOutput/afterPre.txt'
    dst = '../emojiOutput/EmoAll.txt'
    log = '../log/divideEmoticons'
    dts.setSize(3830000)
    dts.setFile(src, dst, log)
    dts.openFiles()
    dts.loop(__testEmo, 'test emoji')
    dts.closeFiles()
Exemplo n.º 2
0
def bigram():
    """Build the bigram dictionary from the preprocessed tweet stream."""
    src = '../output/afterPre.txt'
    dst = '../output/BiDict.txt'
    log = '../log/bigram.txt'
    dts.setFile(src, dst, log)
    dts.setSize(25000000)
    dts.openFiles()
    make_bigram()
    dts.closeFiles()
Exemplo n.º 3
0
def preAll():
    """Preprocess the raw twitter JSON dump into the emoji pipeline's input."""
    src = '../data/twitter.tweets.json'
    dst = '../emojiOutput/afterPre.txt'
    log = '../log/EmojiPre.log'
    dts.setSize(7000000)
    dts.setFile(src, dst, log)
    dts.openFiles()
    __preProcess()
    dts.closeFiles()
Exemplo n.º 4
0
def select_bigram():
    """Filter the raw bigram dictionary through __filter_bigram's count bounds."""
    lower, upper = 100, 100000  # count range handed to __filter_bigram
    dts.setFile('../output/BiDict.txt', '../output/select_bigram.txt',
                '../log/select_bigram')
    dts.setSize(389920)
    dts.openFiles()
    dts.loop_with_param(__filter_bigram, [lower, upper], 'filter_bigram')
    dts.closeFiles()
Exemplo n.º 5
0
def topicFilter():
    """Run filterHashtags over the raw twitter dump into the entity output."""
    src = "/home/server2103/dump/twitter.tweet.json"
    dst = "../entityOutput/topictwitter"
    log = "../log/matchtwitter"
    dts.setSize(14000000)
    dts.setFile(src, dst, log)
    dts.openFiles()
    dts.loop(filterHashtags, 'filterHashtags')
    dts.closeFiles()
Exemplo n.º 6
0
def select_dict():
    """Filter Dict_raw entries through __filter_range's fixed count window."""
    lower, upper = 1000, 34400  # bounds handed to __filter_range
    dts.setFile('../output/Dict_raw.txt', '../output/Dict_select.txt',
                '../log/idf_select.log')
    dts.setSize(214884)
    dts.openFiles()
    dts.loop_with_param(__filter_range, [lower, upper], 'filter Dict_raw')
    dts.closeFiles()
Exemplo n.º 7
0
def featureVectorParse():
    """Parse featvect lines via __lineParse, then dump name_dict to the log."""
    src = '../data/featvect'
    dst = '../emojiOutput/featureWang10000_no01'
    log = '../log/featureWang'
    dts.setSize(10000)
    dts.setFile(src, dst, log)
    dts.openFiles()
    dts.loop(__lineParse, 'parse featvect')
    # Persist the accumulated feature-name dictionary into the log stream.
    dts.writeL(str(name_dict))
    dts.closeFiles()
Exemplo n.º 8
0
def featureUnigram():
    topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n", ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl", ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande", ur"#lol"]
    
    for hashtag in topicList:
        topic = hashtag[1:]
        dts.setSize( 50000 )
        dts.setFile( '../entityOutput/topictwitter', '../entityOutput/topicTwitter_'+topic, '../log/topicTwitterFeatvect')
        dts.openFiles()
        dts.loop_with_param( __dealLine, [ hashtag, ],'Generating Unigram With Tag:'+topic )
        dts.closeFiles()
Exemplo n.º 9
0
def divideHashtag():
    dts.setSize(1000000)
    dts.setFile('../hashOutput/afterPre.txt',
                '../hashOutput/divideHashtag.txt', '../log/divideHashtag.log')
    dts.openFiles()

    dts.loop(__divide, 'divide by Hashtag')
    for emo in EmoList:
        print 'label %d \t: %d' % (emo['label'], emo['cnt'])
        dts.writeL('label %d \t: %d\n' % (emo['label'], emo['cnt']))

    dts.closeFiles()
Exemplo n.º 10
0
def divideEmoticons():
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/EafterPre.txt', '', '../log/divideEmoticons')
    dts.openFiles()

    for emo in Emotions:
        emo['fileptr'] = codecs.open(outputDir + emo['filename'], 'w', 'utf-8')

    dts.loop_with_param(__divide, [
        3000,
    ], 'divide Emotions')

    for emo in Emotions:
        print '%s\t:\t%d' % (emo['filename'], emo['cnt'])
        emo['fileptr'].close()
    dts.closeFiles()
Exemplo n.º 11
0
def featureGenerator():
    """Stream each emotion's tweet file through __g_each_tweet with its label."""
    dts.setSize(5000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/feature5000.txt',
                '../log/emojiFeatureGenerator.log')
    dts.openFiles()

    __featureGenerator_init()

    for bucket in devideEmotion.Emotions:
        path = devideEmotion.outputDir + bucket['filename']
        reader = codecs.open(path, 'r', 'utf-8')
        dts.loop_with_param(__g_each_tweet, [bucket['label'], reader],
                            bucket['filename'])
        reader.close()

    dts.closeFiles()
Exemplo n.º 12
0
def featureVectorParse():
    topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n", ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl", ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande", ur"#lol"]
    
    dfile = codecs.open( '../log/featureWang', 'r', 'utf-8' )
    line = dfile.readline()
    global name_dict
    name_dict = eval( line )
    dfile.close()

    for topic in topicList:
        ifilename = '../entityOutput/topicTwitter_' + topic[1:]
        ofilename = '../entityOutput/topicFeat_' + topic[1:]
        lfilename = '../log/featureVectorParse_entity'

        dts.setSize( 50000 )
        dts.setFile( ifilename, ofilename, lfilename )
        dts.openFiles()
        dts.loop( __lineParse, 'parse featvect:' + topic )
        dts.closeFiles()
Exemplo n.º 13
0
def labelCounter():
    dts.setSize(100000)
    dts.setFile('../data/featvect', '', '../log/featvectLabelCount')
    dts.openFiles()
    global counter
    for x in range(9):
        counter[x] = 0
    dts.loop(__line, 'parse featvect')

    sum = 0
    for x in range(9):
        sum += counter[x]

    for x in range(9):
        print 'Label\t%d\t:%d (%.2f%%)' % (
            x, counter[x], float(counter[x] * 100.0) / float(sum))
        dts.writeL('Label\t%d\t:%d (%.2f%%)\n' %
                   (x, counter[x], float(counter[x] * 100.0) / float(sum)))

    print 'Sum\t\t:%d' % sum

    dts.closeFiles()
Exemplo n.º 14
0
def make_dict():
    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt',
                '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()

    dict = {}
    dts.loop_with_param(__calcIDF, dict, 'calc the Idf')

    print 'start sort and print'
    cnt = 0
    pcnt = 0
    CntDistribution = {}
    CNT_MAX = 1000000
    for x in range(CNT_MAX + 1):
        CntDistribution[x] = 0
    for key, value in [(k, dict[k]) for k in sorted(dict.keys())]:
        if value > 10 and value < 364600:
            dts.writeO('%s:%d\n' % (key, value))
            pcnt += 1
        cnt += 1
        if (value > 364600):
            print key
        if (value > CNT_MAX * 10):
            CntDistribution[CNT_MAX] += 1
        else:
            CntDistribution[value / 10] += 1

    print '%d words output' % pcnt
    dts.writeL('%d words output\n' % pcnt)

    print 'printing range log'
    ncnt = 0
    for x in range(CNT_MAX):
        ncnt += CntDistribution[x]
        dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, cnt - ncnt))

    dts.closeFiles()
Exemplo n.º 15
0
# Emoticon tables for three of the mood categories; the remaining icon lists
# (angerIcons, depreIcons, fatigIcons, vigorIcons) are defined outside this
# view — TODO confirm against the full file.
frindIcons=[ 'x-<', ':-)', '(-:', ':)', '(:',u'☺️']
tensnIcons=[ ':-0', ':-o', ':-()',':-O', ':O','o_O', 'O_o','O_O','o_o']
confuIcons=[ '?_?', '@_@', '<@_@>']

# Per-category records: output filename, the icon list to match, and a
# running match counter that dealLine() below increments.
angerEmo = {'filename': 'angerEmo.txt', 'Icons': angerIcons, 'cnt':0}
depreEmo = {'filename': 'depreEmo.txt', 'Icons': depreIcons, 'cnt':0}
fatigEmo = {'filename': 'fatigEmo.txt', 'Icons': fatigIcons, 'cnt':0}
vigorEmo = {'filename': 'vigorEmo.txt', 'Icons': vigorIcons, 'cnt':0}
frindEmo = {'filename': 'frindEmo.txt', 'Icons': frindIcons, 'cnt':0}
tensnEmo = {'filename': 'tensnEmo.txt', 'Icons': tensnIcons, 'cnt':0}
confuEmo = {'filename': 'confuEmo.txt', 'Icons': confuIcons, 'cnt':0}

# NOTE(review): this assignment is commented out, yet dealLine() below
# iterates Emotions — presumably it is defined elsewhere; verify.
#Emotions = [angerEmo, depreEmo, fatigEmo, vigorEmo, frindEmo, tensnEmo, confuEmo];

dts.setSize( 5000000 )
dts.setFile( '../data/tweet_noRT_noDup.txt' )
dts.openFiles()
def dealLine():
    line = dts.readlineI()
    for emo in Emotions:
        flag = -2
        for eicon in emo['Icons']:
            if eicon in line:
                print line
                #flag = line.find( eicon )
                flag = 0 
                break
        if flag >= 0:
            emo['cnt'] = emo['cnt'] + 1

dts.loop( dealLine, 'check Emoticons' )
Exemplo n.º 16
0
# vim:fenc=utf-8
#
# Copyright © 2014 Carwest Sung <*****@*****.**>
#
# Distributed under terms of the MIT license.
"""
find emoji in tweets
"""
import io
import os
import re
import codecs
import dealTweets as dts

# Configure the shared dealTweets session: scan the deduplicated tweet file,
# writing matches to ../tmp/b.out with a log in ../tmp/c.out.
dts.setSize(5000000)
dts.setFile("../data/tweet_noRT_noDup.txt", "../tmp/b.out", "../tmp/c.out")
dts.openFiles()


def findemoji(needle):
    """Read the next input line; print and write it out if it contains *needle*.

    FIX: the parameter was previously named ``str``, shadowing the builtin.
    The only caller (dts.loop_with_param below) passes it positionally, so
    the rename is call-compatible.
    """
    line = dts.readlineI()
    if needle in line:
        print(line)
        dts.writeO(line)


# Search for GRINNING FACE (U+1F600), decoded from its UTF-8 byte sequence.
dts.loop_with_param(findemoji, b'\xf0\x9f\x98\x80'.decode('utf-8'),
                    'try to find Emoji :😀')
#dts.writeL( u'\xe2\x98\xba\xef\xb8\x8f with hay!' )
#smile = '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8')
Exemplo n.º 17
0
    # NOTE(review): fragment — the enclosing function's header lies outside
    # this view. This tail lower-cases each regex match and bumps its tally
    # in the module-level topicDict.
    for words in matchs:
        words = words.lower()
        topicDict.update({words: topicDict.get(words, 0) + 1})
        #print words


def __clean(param):
    """Drop hashtags seen fewer than param[0] times from topicDict."""
    threshold = param[0]
    # Snapshot the doomed keys first — the dict can't be mutated mid-iteration.
    doomed = [key for key, count in topicDict.iteritems() if count < threshold]
    for key in doomed:
        topicDict.pop(key)


if __name__ == "__main__":
    # Scan the raw twitter dump for hashtags; __clean periodically prunes
    # tags seen fewer than 3 times while __dealLine tallies into topicDict.
    dts.setSize(13000000)

    dts.setFile('/home/server2103/dump/twitter.tweet.json',
                '../emojiOutput/topics', '../log/topics.emoji')
    dts.openFiles()
    dts.loop_with_param_clean(__dealLine, __clean, [
        3,
    ], 'find hashtags')

    # Dump the surviving hashtag tallies and a summary line.
    cnt = 0
    sum = 0  # NOTE(review): shadows the builtin `sum` for this script body
    print 'start output'
    for key, value in topicDict.iteritems():
        dts.writeO('%s\t:%d\n' % (key, value))
        cnt += 1
        sum += value
    dts.writeL('%d hashtags with %d displays' % (cnt, sum))
    print '%d hashtags with %d displays' % (cnt, sum)
Exemplo n.º 18
0
    # NOTE(review): fragment — the enclosing block's header is outside this
    # view, and dealLine below appears truncated at the chunk boundary.
    # NOTE(review): len(sys.argv) == 0 can never occur (argv[0] is always the
    # script name); the intended guard was probably `== 1`.
    if len( sys.argv ) == 0:
        print 'no argv given'
        pass
    elif len(sys.argv) != 4:
        print 'error argvs'
    else:
        ProcessSize = int(sys.argv[1])
        MaxEmotionSize = int(sys.argv[2])
        outputDir=sys.argv[3]


    # NOTE(review): if the argv count is wrong, ProcessSize/MaxEmotionSize/
    # outputDir may be unbound here unless defaults exist elsewhere — verify.
    print 'ProcessSize set to %d, MaxEmotionSize set to %d' % ( ProcessSize, MaxEmotionSize )
    print 'outputDir = %s' % outputDir

    dts.setSize( ProcessSize )
    dts.setFile( '../data/tweet_noRT_noDup.txt', '', '../log/dividedByEmoticons_'+str(ProcessSize) + '.log' )
    dts.openFiles()

    # One UTF-8 writer per emotion bucket.
    for emo in Emotions:
        emo['fileptr'] = codecs.open( outputDir + emo['filename'], 'w', 'utf-8' )

    def dealLine():
        # Route each tweet into every emotion file whose icon it contains,
        # skipping buckets already past MaxEmotionSize.
        line = dts.readlineI()
        for emo in Emotions:
            if emo['cnt'] > MaxEmotionSize:
                continue
            flag = -2
            for eicon in emo['Icons']:
                flag = line.find( eicon )
                if flag != -1 :
                    emo['fileptr'].write( line )
Exemplo n.º 19
0
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Carwest Sung <*****@*****.**>
#
# Distributed under terms of the MIT license.
"""

"""

import dealTweets as dts
import json


def __io():
    """Copy the 'text' field of the next JSON tweet to the output stream."""
    raw = dts.readlineI()
    if not raw:  # end of input / blank read
        return
    record = json.loads(raw)
    dts.writeO(record['text'] + '\n')


# Spot-check run: extract the text of the first 300 preprocessed tweets.
dts.setSize(300)
dts.setFile('../emojiOutput/afterPre.txt', '../emojiOutput/checkAfterPre.txt')
dts.openFiles()
dts.loop(__io, 'io')
dts.closeFiles()
Exemplo n.º 20
0
def filterEmoticons():
    """Run the __cleanTweet pass over the deduplicated tweet corpus."""
    src = '../data/tweet_noRT_noDup.txt'
    dst = '../tmp/filter.out'
    log = '../log/filterEmoticons.log'
    dts.setSize(310000)
    dts.setFile(src, dst, log)
    dts.openFiles()
    dts.loop(__cleanTweet, 'clean Tweets')
    dts.closeFiles()
Exemplo n.º 21
0
            # NOTE(review): fragment — this elif-chain's opening branches are
            # outside this view; each branch selects the dictionary directory
            # and output/answer filename prefixes for one feature type.
            loop_lfilename = '../Compare_Output/ans_unihash_'
            all_ofilename = '../emojiOutput/feautre_unihash_all'
            all_lfilename = '../Compare_Output/ans_unihash_all'
        elif __type == 'UnigramEmoticon_run':
            DictDir = '../emojiOutput/UnigramEmoticonDict'
            loop_ofilename = '../emojiOutput/feautre_uniemo_'
            loop_lfilename = '../Compare_Output/ans_uniemo_'
            all_ofilename = '../emojiOutput/feautre_uniemo_all'
            all_lfilename = '../Compare_Output/ans_uniemo_all'
        load_Index()

        # One parse_line pass per emotion file, then one over the combined file.
        for Emo in divideByEmoji.Emotions:
            ifilename = divideByEmoji.OutputDir + Emo['name']
            ofilename = loop_ofilename + Emo['name']
            lfilename = loop_lfilename + Emo['name']
            dts.setSize(100000)
            dts.setFile(ifilename, ofilename, lfilename)
            dts.openFiles()
            PC = 0
            dts.loop(parse_line, 'generating ' + Emo['name'])
            dts.closeFiles()

        # NOTE(review): 'featre' looks like a typo but other code writes this
        # exact path — do not "fix" it in isolation.
        ifilename = '../emojiOutput/featre_all'
        dts.setSize(100000)
        dts.setFile(ifilename, all_ofilename, all_lfilename)
        dts.openFiles()
        dts.loop(parse_line, 'generating all')
        dts.closeFiles()

    pass
Exemplo n.º 22
0
Preprocess for tweet file

take out RT, url addresses, punctuations, ans paramiters
"""

import re
import sys
import codecs
import HTMLParser
import preprocess_func
import dealTweets as dts
import utilities
#from nltk.stem.wordnet import WordNetLemmatizer

# Session setup for the preprocessing pass over the full deduplicated corpus.
dts.setSize(25770000)
dts.setFile('../data/tweet_noRT_noDup.txt', '../output/afterPre.txt',
            '../log/pre.log')
dts.openFiles()

# Tokenizer is constructed here but only used by the commented-out
# experiment below — presumably kept for ad-hoc debugging.
tokenizer = utilities.Tokenizer()


def __preprocess():
    """Clean the next raw tweet line and emit it to the output stream."""
    cleaned = preprocess_func.preprocess(dts.readlineI())
    dts.writeO(cleaned)


#    terms = [term for term in tokenizer.tokenize(line)]
#    print terms

dts.loop(__preprocess, 'preprocess symbols')
Exemplo n.º 23
0
            # NOTE(review): fragment — the enclosing function's header is
            # outside this view. Positive examples (label 1) are written
            # unconditionally; negatives (label -1) only while the bucket's
            # negative count trails its positive count, keeping them balanced.
            if ans == int(emo['label']):
                label = 1
                Emotions[ans - 1]['cnt'] += 1
                tmp = {u'text': text, u'label': label}
                emo['fileptr'].write(json.dumps(tmp) + '\n')
            else:
                if Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
                    label = -1
                    Emotions[ans - 1]['ncnt'] += 1
                    tmp = {u'text': text, u'label': label}
                    emo['fileptr'].write(json.dumps(tmp) + '\n')

    pass


if __name__ == "__main__":
    dts.setSize(2000000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/test_featre_all',
                '../log/test_labeled_by_emoji_log')
    dts.openFiles()
    for emo in Emotions:
        emo['fileptr'] = codecs.open(OutputDir + emo['name'], 'w', 'utf-8')
    dts.loop(__divide, 'divide and label twiiters')
    for emo in Emotions:
        print '%s\t:\t%d' % (emo['name'], emo['cnt'])
        dts.writeL('%s\t:\t%d\n' % (emo['name'], emo['cnt']))
        emo['fileptr'].close()
    dts.closeFiles()
    pass
Exemplo n.º 24
0
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Carwest Sung <*****@*****.**>
#
# Distributed under terms of the MIT license.
"""
clean duplicate tweet , with simple set method
"""

import dealTweets as dts

# Session setup for the set-based deduplication pass; files are opened
# later inside __cleanDup().
dts.setSize(5000000)
dts.setFile('../data/tweet_noRT.txt', '../tmp/noDup.txt',
            '../log/checkNoDup.log')


def __cleanDup():
    """Deduplicate tweets by pushing every line into a set, then rewriting it.

    NOTE(review): this definition is truncated at the chunk boundary — the
    write loop's tail (and any closeFiles call) is outside this view.
    """
    dts.openFiles()
    tw = set()  # set membership makes duplicate lines collapse for free

    def __push():
        # Consume one input line into the dedup set.
        text = dts.readlineI()
        tw.add(text)

    dts.loop(__push, 'push into set')
    print 'start write to file %s' % dts.ofileName
    cnt = 0
    for text in tw:
        dts.writeO(text)