def __readin(tweetSet):
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # a record was split across lines; consume the continuation and skip it
        dts.readlineI()
        return
    text = __cleanRT(obj['text'])
    if text:
        tweetSet.add(text)
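# Everything in this module reads and writes through a shared stream wrapper
# `dts`, whose definition is not part of this section. A minimal sketch of the
# interface the call sites imply (readlineI / writeO / writeL are the observed
# names; the class itself and its constructor are assumptions):
class _DTSSketch(object):
    def __init__(self, in_path, out_path, label_path=None):
        self._in = open(in_path)
        self._out = open(out_path, 'w')
        self._label = open(label_path, 'w') if label_path else None

    def readlineI(self):
        # returns '' at EOF, which is why every handler starts with `if not line`
        return self._in.readline()

    def writeO(self, s):
        self._out.write(s)

    def writeL(self, s):
        self._label.write(s)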
def __divide(parList):
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    ans = 0
    # ans holds the single matching emotion label, or -1 on a conflict
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1
    if ans <= 0:
        return
    # strip punctuation and the USERNAME/URL placeholders before length filtering
    ttext = re.sub(r'[!?]', '', text)
    ttext = re.sub(r'[^\d\w\ ]', '', ttext)
    ttext = re.sub('USERNAME', '', ttext)
    ttext = re.sub('URL', '', ttext)
    if len(tokenizer.tokenize(ttext)) <= 3:
        return
    if Emotions[ans - 1]['cnt'] < parList[0]:
        Emotions[ans - 1]['cnt'] += 1
        Emotions[ans - 1]['fileptr'].write(line)
def filterHashtags():
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return
    text = obj["text"]
    pt = "space"
    matchs = None
    for topic in topicList:
        matchs = re.search(topic, text)
        if matchs:
            pt = topic
            break
    if not matchs:
        return
    nobj = {"text": text, "hashtag": pt}
    dts.writeO(json.dumps(nobj) + '\n')
def __io():
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    dts.writeO(text + '\n')
def __testEmo():
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    # U+1F608 is the "smiling face with horns" emoji
    if u'\U0001F608' in obj['text']:
        print obj['text']
def __divide():
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return
    text = obj['text']
    ans = 0
    # ans holds the single matching emotion label, or -1 on a conflict
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1
    if ans <= 0:
        return
    # record every hashtag seen in a labeled tweet
    global hashtagSet
    for ht in re.findall(r'#\w+', text):
        hashtagSet.add(ht[1:].lower())
    # collapse whitespace, then drop tweets of 3 or fewer preprocessed tokens
    text = re.sub(r'\s+', ' ', text)
    if len(tokenizer.tokenize(preprocess_func.preprocess(text).lower())) <= 3:
        return
    tmp = {u'text': text, u'label': ans}
    dts.writeO(json.dumps(tmp) + '\n')
    # write a positive sample to the matching emotion's file; write a negative
    # sample to each other file only while the matched emotion's negative
    # count still trails its positive count
    for emo in Emotions:
        if ans == int(emo['label']):
            Emotions[ans - 1]['cnt'] += 1
            tmp = {u'text': text, u'label': 1}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
        elif Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
            Emotions[ans - 1]['ncnt'] += 1
            tmp = {u'text': text, u'label': -1}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
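# The `Emotions` table is defined elsewhere; the fields touched here are
# 'label', 'Icons', 'cnt', 'ncnt', and 'fileptr', and the `Emotions[ans - 1]`
# indexing implies labels are 1-based list positions. A hypothetical layout
# (the icons and file names are illustrative, not the project's actual data):
def _make_emotions_example():
    return [
        {'label': '1', 'Icons': [u':)', u':-)', u'\U0001F60A'],
         'cnt': 0, 'ncnt': 0, 'fileptr': open('emo_1.json', 'w')},
        {'label': '2', 'Icons': [u':(', u':-(', u'\U0001F622'],
         'cnt': 0, 'ncnt': 0, 'fileptr': open('emo_2.json', 'w')},
    ]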
def __calcIDF(df):
    # accumulate raw per-term counts in df; the log weighting is applied elsewhere
    line = dts.readlineI()
    if not line:
        return
    line = re.sub(r'[!?]', '', line)
    line = re.sub(r'[^\d\w\ ]', '', line)
    for term in tokenizer.tokenize(line):
        df[term] = df.get(term, 0) + 1
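# __calcIDF only accumulates counts; turning them into IDF weights is left to
# the caller. A minimal sketch, assuming the standard idf = log(N / df) with N
# the number of lines processed (both N and the +1 smoothing are assumptions,
# not taken from this module):
import math

def _idf_from_counts(df, n_lines):
    # +1 in the denominator is a common smoothing convention
    return dict((term, math.log(float(n_lines) / (1 + cnt)))
                for term, cnt in df.iteritems())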
def __filter_bigram(ran):
    line = dts.readlineI()
    if not line:
        return
    left, right = ran[0], ran[1]
    # input lines look like "<bigram>:<count>"; keep counts within [left, right]
    num = int(line.split(':')[1])
    if left <= num <= right:
        dts.writeO(line)
def __filter_range(border):
    line = dts.readlineI()
    if not line:
        return
    left, right = border[0], border[1]
    cnt = int(line.split(':')[1])
    if left <= cnt <= right:
        dts.writeO(line)
def __divide():
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    label = __label(text)
    if label > 0:
        obj['label'] = label
        dts.writeO(json.dumps(obj) + '\n')
def __dealLine(param):
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        # a record was split across lines; consume the continuation and skip it
        dts.readlineI()
        return
    text = obj['text']
    matchs = re.findall(r'#\w+', text)
    if not matchs:
        return
    # tally each hashtag, case-insensitively
    for words in matchs:
        words = words.lower()
        topicDict[words] = topicDict.get(words, 0) + 1
def __divide():
    line = dts.readlineI()
    if not line:
        return
    # skip the first 1,000,000 lines of input
    global GC
    if GC < 1000000:
        GC += 1
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return
    text = obj['text']
    ans = 0
    for emo in Emotions:
        label = int(emo['label'])
        for icon in emo['Icons']:
            if icon in text:
                if 0 == ans or label == ans:
                    ans = label
                else:
                    ans = -1
    if ans <= 0:
        return
    text = re.sub(r'[!?]', '', text)
    text = re.sub(r'[^\d\w\ ]', '', text)
    text = re.sub('USERNAME', '', text)
    text = re.sub('URL', '', text)
    if len(tokenizer.tokenize(text)) <= 3:
        return
    tmp = {u'text': text, u'label': ans}
    dts.writeO(json.dumps(tmp) + '\n')
    for emo in Emotions:
        if ans == int(emo['label']):
            Emotions[ans - 1]['cnt'] += 1
            tmp = {u'text': text, u'label': 1}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
        elif Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
            Emotions[ans - 1]['ncnt'] += 1
            tmp = {u'text': text, u'label': -1}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
def dealLine():
    line = dts.readlineI()
    if not line:
        return
    for emo in Emotions:
        flag = -2
        for eicon in emo['Icons']:
            if eicon in line:
                print line
                flag = 0
                break
        if flag >= 0:
            emo['cnt'] = emo['cnt'] + 1
def __bigram(counts):
    text = dts.readlineI()
    if not text:
        return
    text = re.sub(r'[!?]', '', text)
    text = re.sub(r'[^\d\w\ ]', '', text)
    tokens = tokenizer.tokenize(text)
    # count every ordered token pair in the line (full co-occurrence, not
    # just adjacent bigrams), keyed as "wx#wy"
    for wx in tokens:
        for wy in tokens:
            key = wx + '#' + wy
            counts[key] = counts.get(key, 0) + 1
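# __filter_bigram above consumes "<pair>:<count>" lines, so the dict built by
# __bigram presumably gets flushed in that format between the two passes. A
# hypothetical dump helper (not part of the original module):
def _dump_counts(counts, out):
    for key, cnt in counts.iteritems():
        out.write('%s:%d\n' % (key, cnt))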
def dealLine():
    line = dts.readlineI()
    if not line:
        return
    for emo in Emotions:
        # cap each emotion's output file at MaxEmotionSize lines
        if emo['cnt'] > MaxEmotionSize:
            continue
        flag = -2
        for eicon in emo['Icons']:
            flag = line.find(eicon)
            if flag != -1:
                emo['fileptr'].write(line)
                break
        if flag >= 0:
            emo['cnt'] = emo['cnt'] + 1
def __lineParse():
    line = dts.readlineI()
    if not line:
        return
    # each line is a (feature_dict, label) tuple serialized with str()
    obj = eval(line)
    output = str(obj[1]) + ' '
    wordlist = {}
    for word, value in obj[0].iteritems():
        if value > 0:
            wordlist[__getIndex(word)] = value
    # emit libSVM-style "label index:value ..." with indices ascending
    for key in sorted(wordlist.keys()):
        output += str(key) + ':' + str(wordlist[key]) + ' '
    if obj[1] > 1:
        dts.writeO(output + '\n')
def parse_line():
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return
    label = obj[u'label']
    text = obj[u'text']
    # PC counts the positive samples seen so far
    global PC
    if int(label) == 1:
        PC += 1
    output = '%d %s\n' % (label, gen_Feature(text))
    dts.writeO(output)
    dts.writeL(str(label) + '\n')
def __dealLine(param):
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    if obj['hashtag'] != param[0]:
        return
    wordList = tokenizer.tokenize(preprocess(text))
    # binary unigram features over the fixed vocabulary
    feats = {}
    for uni in getUnigramWords.getUnigramWords():
        feats[uni] = 1 if uni in wordList else 0
    nobj = (feats, 0)
    dts.writeO(str(nobj) + '\n')
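# __dealLine serializes each sample as str((feature_dict, label)), and
# __lineParse above recovers it with eval(). eval() works on this file format
# but executes arbitrary expressions; ast.literal_eval is the safe equivalent
# for these literal tuples (a suggestion, not what this module uses):
import ast

def _load_sample(line):
    # e.g. _load_sample("({'good': 1, 'bad': 0}, 0)") -> ({'good': 1, 'bad': 0}, 0)
    return ast.literal_eval(line)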
def __push():
    text = dts.readlineI()
    if not text:
        return
    tw.add(text)
def findemoji(emoji):
    line = dts.readlineI()
    if not line:
        return
    if emoji in line:
        print line
        dts.writeO(line)
def __line():
    line = dts.readlineI()
    if not line:
        return
    # each line is a (feature_dict, label) tuple; tally the labels
    obj = eval(line)
    counter[obj[1]] += 1
def __cleanTweet():
    line = dts.readlineI()
    if not line:
        return
    dts.writeO(__bandWords(line))
def __preprocess():
    line = dts.readlineI()
    if not line:
        return
    dts.writeO(preprocess_func.preprocess(line))
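# Every handler above consumes exactly one input line per call and returns
# early on a falsy line, so the missing main loop presumably just invokes a
# handler repeatedly. A hypothetical driver (the fixed call count is an
# assumption; the real scripts may detect end-of-input differently):
def _run(handler, args=(), n_calls=1000000):
    for _ in xrange(n_calls):
        handler(*args)

# e.g. df = {}; _run(__calcIDF, (df,), n_calls=500000)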