示例#1
0
def filterHashtags():
    """Read one JSON tweet from the input stream and, when its text matches
    any pattern in the module-level ``topicList``, write it back out tagged
    with the first matching topic.

    Blank lines and lines that fail to parse as JSON are silently skipped.
    """
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:  # narrow except: json.loads raises ValueError on bad input
        return

    text = obj["text"]

    # Find the first topic pattern occurring in the text.
    # BUG FIX: the original checked the loop-local search result after the
    # loop, which raised NameError whenever topicList was empty.
    matched = None
    for topic in topicList:
        if re.search(topic, text):
            matched = topic
            break

    if matched is None:
        return
    nobj = {"text": text, "hashtag": matched}
    dts.writeO(json.dumps(nobj) + '\n')
示例#2
0
def __io():
    """Copy the 'text' field of one JSON input line straight to the output."""
    raw = dts.readlineI()
    if not raw:
        return
    record = json.loads(raw)
    dts.writeO(record['text'] + '\n')
示例#3
0
def __divide():
    """Classify one tweet by emoticon and emit labelled training examples.

    Reads a JSON line, scans its text for the icons of each entry in the
    module-level ``Emotions`` table, and keeps the tweet only when exactly
    one emotion label matched (conflicts are marked -1 and dropped).  Kept
    tweets have their hashtags collected into the global ``hashtagSet``,
    are whitespace-normalised and length-filtered, written to the main
    output, and also appended as +1 / -1 examples to each emotion's own
    ``fileptr``.
    """
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads( line )
    except:
        return

    text = obj['text']

    # ans: 0 = no emotion matched yet; >0 = the single matched label;
    # -1 = two different emotions matched (ambiguous, discarded below).
    ans = 0
    for emo in Emotions:
        label = int ( emo['label'] )
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1

    if ans > 0 :

        # for add the hashtag to HashtagSet
        htList = re.findall( '#\w+', text )
        global hashtagSet
        for ht in htList:
            hashtagSet.add( ht[1:].lower() )

        #text = preprocess_func.preprocess( text ).lower()
        #text = text.lower()
        #text = re.sub( '[!?]', '', text )
        #text = re.sub( '[^\d\w\ ]', '', text )
        #text = re.sub( 'username', '', text )
        #text = re.sub( 'twhashtag', '', text )
        #text = re.sub( 'url', '', text )
        # Collapse runs of whitespace; drop tweets of <= 3 tokens.
        text = re.sub( '\s+', ' ', text  )
        if len( tokenizer.tokenize( preprocess_func.preprocess(text ).lower() )) <= 3:
            return

        tmp = {u'text':text, u'label':ans}
        dts.writeO( json.dumps(tmp) + '\n' )

        # Write a +1 example to the matched emotion's file, and a -1 example
        # to each other emotion's file while its negatives do not yet
        # outnumber the matched emotion's positives (class balancing).
        # NOTE(review): the cnt/ncnt counters index Emotions[ans-1] (the
        # matched emotion) even though the file written in the else branch
        # belongs to a different emo — confirm this cross-indexing is intended.
        for emo in Emotions:
            label = 0
            #print '%d:%d\n' % ( ans, int(emo['label'] ))
            if ans == int(emo['label']):
                label = 1
                Emotions[ ans - 1 ][ 'cnt' ] += 1
                tmp = {u'text':text, u'label':label}
                emo['fileptr'].write( json.dumps(tmp) + '\n' )
            else :
                if Emotions[ans-1]['ncnt'] < Emotions[ans-1]['cnt']:
                    label = -1
                    Emotions[ ans - 1 ][ 'ncnt' ] += 1
                    tmp = {u'text':text, u'label':label}
                    emo['fileptr'].write( json.dumps(tmp) + '\n' )

    pass
示例#4
0
def __preProcess():
    dts.loop_with_param(__readin, tweets, 'loading files')
    for tweet in tweets:
        #dts.writeO( tmp + '\n' )
        tmp = {u'text': tweet}
        dts.writeO(json.dumps(tmp) + '\n')
    setlen = len(tweets)
    print '%d tweets remaining' % setlen
    dts.writeL('%d tweets remaining' % setlen)
示例#5
0
def __filter_bigram(ran):
    """Pass the current input line through when its ':'-separated count
    lies within the inclusive range (ran[0], ran[1])."""
    entry = dts.readlineI()
    if not entry:
        return
    lo, hi = ran[0], ran[1]
    count = int(entry.split(':')[1])
    if lo <= count <= hi:
        dts.writeO(entry)
示例#6
0
def __filter_range(border):
    """Emit the current input line if its ':'-separated count falls inside
    the inclusive (border[0], border[1]) window."""
    line = dts.readlineI()
    if not line:
        return
    count = int(line.split(':')[1])
    if border[0] <= count <= border[1]:
        dts.writeO(line)
示例#7
0
def __divide():
    """Label one JSON tweet via __label and forward it when the label is positive."""
    raw = dts.readlineI()
    if not raw:
        return
    record = json.loads(raw)
    tag = __label(record['text'])
    if tag > 0:
        record['label'] = tag
        dts.writeO(json.dumps(record) + '\n')
def __divide():
    """Classify one tweet by emoticon (skipping the first million lines)
    and emit labelled training examples.

    NOTE(review): this redefines the ``__divide`` declared just above it in
    this file; the earlier definition is shadowed.

    Reads a JSON line, scans its text for the icons of each ``Emotions``
    entry, keeps tweets matching exactly one emotion, cleans the text,
    writes it to the main output, and appends +1 / -1 examples to each
    emotion's ``fileptr``.
    """
    line = dts.readlineI()
    if not line:
        return

    # Consume (without processing) the first 1,000,000 input lines.
    global GC
    if GC < 1000000:
        GC += 1
        return

    try:
        obj = json.loads(line)
    except:
        return

    text = obj['text']

    # ans: 0 = no emotion matched yet; >0 = the single matched label;
    # -1 = conflicting emotions matched (discarded below).
    ans = 0
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1

    if ans > 0:
        # Strip punctuation/markers, then drop tweets of <= 3 tokens.
        text = re.sub('[!?]', '', text)
        text = re.sub('[^\d\w\ ]', '', text)
        text = re.sub('USERNAME', '', text)
        text = re.sub('URL', '', text)
        if len(tokenizer.tokenize(text)) <= 3:
            return

        tmp = {u'text': text, u'label': ans}
        dts.writeO(json.dumps(tmp) + '\n')

        # +1 example to the matched emotion's file; -1 examples to the other
        # files only while negatives trail positives (class balancing).
        # NOTE(review): cnt/ncnt index Emotions[ans-1] while writing to each
        # emo's own file — confirm the cross-indexing is intended.
        for emo in Emotions:
            label = 0
            #print '%d:%d\n' % ( ans, int(emo['label'] ))
            if ans == int(emo['label']):
                label = 1
                Emotions[ans - 1]['cnt'] += 1
                tmp = {u'text': text, u'label': label}
                emo['fileptr'].write(json.dumps(tmp) + '\n')
            else:
                if Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
                    label = -1
                    Emotions[ans - 1]['ncnt'] += 1
                    tmp = {u'text': text, u'label': label}
                    emo['fileptr'].write(json.dumps(tmp) + '\n')

    pass
def __lineParse():
    """Convert one "(featuredict, label)" input line into svm-light format.

    Each input line is the ``str()`` of a 2-tuple ``({word: value}, label)``
    as written by the feature-extraction pass (see ``__dealLine``).  Words
    with a positive value are mapped through ``__getIndex`` and emitted as
    'index:value' pairs sorted by index; lines whose label is <= 1 are
    dropped.
    """
    import ast  # local import: needed only for the safe parse below

    line = dts.readlineI()
    if not line:
        return
    # SECURITY FIX: the original ran eval() on the raw line, which executes
    # arbitrary expressions; literal_eval only parses Python literals and
    # accepts exactly the str((dict, int)) tuples this pipeline writes.
    obj = ast.literal_eval(line)

    output = str(obj[1]) + ' '
    wordlist = {}
    for word, value in obj[0].iteritems():
        if value > 0:
            wordlist[__getIndex(word)] = value

    # Emit feature pairs in ascending index order, as svm-light requires.
    for key in sorted(wordlist.keys()):
        output += str(key) + ':' + str(wordlist[key]) + ' '

    if obj[1] > 1:
        dts.writeO(output + '\n')
示例#10
0
def __cleanDup():
    dts.openFiles()
    tw = set()

    def __push():
        text = dts.readlineI()
        tw.add(text)

    dts.loop(__push, 'push into set')
    print 'start write to file %s' % dts.ofileName
    cnt = 0
    for text in tw:
        dts.writeO(text)
        cnt += 1
    print 'write finished, tot tweet left: %d' % cnt

    dts.closeFiles()
示例#11
0
def parse_line():
    """Turn one labelled JSON tweet into a '<label> <features>' output line,
    counting positives in the global PC and logging each label."""
    raw = dts.readlineI()
    if not raw:
        return
    try:
        record = json.loads(raw)
    except:
        return

    lab = record[u'label']
    body = record[u'text']

    global PC
    if int(lab) == 1:
        PC += 1
    dts.writeO('%d %s\n' % (lab, gen_Feature(body)))
    dts.writeL(str(lab) + '\n')
示例#12
0
def __g_each_tweet(param):
    """Write one tweet from the file param[1] as a '<label> idx:val ...' line.

    param[0] is the class label; param[1] is an open file of raw tweets.
    Features come from __g_each_feature over the preprocessed line, emitted
    in ascending key order.
    """
    raw = param[1].readline()
    if not raw:
        return

    features = __g_each_feature(preprocess_func.preprocess(raw))

    pieces = ['%s ' % param[0]]
    for idx in sorted(features.keys()):
        pieces.append(str(idx) + ':' + features[idx] + ' ')

    dts.writeO(''.join(pieces) + '\n')
示例#13
0
def __dealLine(param):
    """Emit one tweet's unigram feature vector when its hashtag equals param[0].

    Reads a JSON line with 'text' and 'hashtag' fields; when the hashtag
    matches, writes ``str((features, 0))`` where ``features`` maps every
    known unigram to 1 if it occurs in the tokenized, preprocessed text,
    else 0.
    """
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    if obj['hashtag'] != param[0]:
        return

    # PERF: one set-membership test per unigram instead of the original
    # O(|unigrams| * |words|) nested scan; also avoids shadowing builtin dict.
    words = set(tokenizer.tokenize(preprocess(obj['text'])))
    features = {}
    for uni in getUnigramWords.getUnigramWords():
        features[uni] = 1 if uni in words else 0

    dts.writeO(str((features, 0)) + '\n')
示例#14
0
def make_bigram():

    dict = {}
    dts.loop_with_param_clean(__bigram, __bigram_clean, dict,
                              'Make the dict of bigram')

    print 'start to output'

    cntList = {}
    for x in range(100000):
        cntList.update({x: 0})
    for k, v in dict.iteritems():
        if v > 10:
            dts.writeO(k + ':' + str(v) + '\n')
        if v >= 100000:
            cntList.update({100000: cntList.get(100000, 0)})
        else:
            cntList.update({v: cntList.get(v, 0) + 1})

    for k, v in cntList.iteritems():
        dts.writeL(str(k) + ':' + str(v) + '\n')
示例#15
0
def make_dict():
    """Build the raw word dictionary from the preprocessed tweets.

    Runs __calcIDF over every input line to fill a word->count map, writes
    words whose count is in (10, 364600) to the output file, and logs a
    count-distribution table (bucketed by tens, capped at CNT_MAX).
    """
    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt',
                '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()

    # word -> occurrence count (name shadows builtin `dict`; left as-is here).
    dict = {}
    dts.loop_with_param(__calcIDF, dict, 'calc the Idf')

    print 'start sort and print'
    cnt = 0    # total distinct words seen
    pcnt = 0   # words actually written to the output
    CntDistribution = {}
    CNT_MAX = 1000000
    for x in range(CNT_MAX + 1):
        CntDistribution[x] = 0
    for key, value in [(k, dict[k]) for k in sorted(dict.keys())]:
        # Keep mid-frequency words only; 364600 is an empirical stop-word cap.
        if value > 10 and value < 364600:
            dts.writeO('%s:%d\n' % (key, value))
            pcnt += 1
        cnt += 1
        if (value > 364600):
            print key
        # Bucket counts by tens (Python 2 integer division), overflow into CNT_MAX.
        if (value > CNT_MAX * 10):
            CntDistribution[CNT_MAX] += 1
        else:
            CntDistribution[value / 10] += 1

    print '%d words output' % pcnt
    dts.writeL('%d words output\n' % pcnt)

    print 'printing range log'
    ncnt = 0
    for x in range(CNT_MAX):
        ncnt += CntDistribution[x]
        # NOTE(review): the label names the bucket x*10~(x+1)*10 but the value
        # written is cnt - ncnt, i.e. the number of words ABOVE this bucket
        # (a survival curve, not a per-bucket count) — confirm intended.
        dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, cnt - ncnt))

    dts.closeFiles()
示例#16
0
def findemoji(str):
    """Echo the current input line to stdout and the output file when it
    contains the substring ``str``.

    NOTE: the parameter shadows the builtin ``str``; the name is kept for
    interface compatibility with existing callers.
    """
    line = dts.readlineI()
    # FIX: guard against end-of-input like every sibling per-line reader;
    # the original applied `in` to whatever sentinel readlineI returns.
    if not line:
        return
    if str in line:
        print(line)
        dts.writeO(line)
示例#17
0

def __clean(param):
    """Drop every topic whose display count is below the threshold param[0]."""
    doomed = [key for key, cnt in topicDict.iteritems() if cnt < param[0]]
    for key in doomed:
        topicDict.pop(key)


if __name__ == "__main__":
    dts.setSize(13000000)

    dts.setFile('/home/server2103/dump/twitter.tweet.json',
                '../emojiOutput/topics', '../log/topics.emoji')
    dts.openFiles()
    dts.loop_with_param_clean(__dealLine, __clean, [
        3,
    ], 'find hashtags')

    cnt = 0
    sum = 0
    print 'start output'
    for key, value in topicDict.iteritems():
        dts.writeO('%s\t:%d\n' % (key, value))
        cnt += 1
        sum += value
    dts.writeL('%d hashtags with %d displays' % (cnt, sum))
    print '%d hashtags with %d displays' % (cnt, sum)

    dts.closeFiles()
    pass
示例#18
0
def __preprocess():
    """Run the shared text preprocessor over one input line and write it out."""
    line = dts.readlineI()
    # FIX: skip end-of-input, matching the other per-line readers; the
    # original fed whatever falsy sentinel readlineI returns straight into
    # preprocess_func.preprocess().
    if not line:
        return
    dts.writeO(preprocess_func.preprocess(line))
示例#19
0
def __cleanTweet():
    """Strip banned words from one input line and write the result."""
    line = dts.readlineI()
    # FIX: guard end-of-input like the sibling readers do; the original
    # passed the raw readlineI() result to __bandWords unconditionally.
    if not line:
        return
    dts.writeO(__bandWords(line))