def mapTweet(tweet, afinn, emoDict, positive, negative, neutral, slangs):
    """Map a raw tweet to its feature vector using the AFINN lexicon.

    Returns a list of floats: [AFINN polarity, emoticon score,
    hashtag-word ratio, length ratio, uppercase flag, exclamation flag,
    '!' density, question flag, '?' density, capital-letter frequency]
    followed by the unigram scores from features.scoreUnigram.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    # Clean the tweet: strip unneeded words and expand internet slang.
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    out.append(polarity.afinnPolarity(line, afinn))           # AFINN affinity score
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon score
    out.append(float(features.hashtagWordsRatio(line)))       # ratio of hashtagged words
    # BUGFIX: use float divisors — `len(line) / 140` truncated to 0 under
    # Python 2 integer division for any tweet shorter than 140 characters.
    out.append(len(line) / 140.0)                             # fraction of the 140-char budget used
    out.append(float(features.upperCase(line)))               # 1 if tweet contains uppercase, else 0
    out.append(float(features.exclamationTest(line)))         # 1 if '!' present, else 0
    out.append(line.count("!") / 140.0)                       # '!' density (float-division fix)
    out.append(float(features.questionTest(line)))            # 1 if '?' present, else 0
    out.append(line.count('?') / 140.0)                       # '?' density (float-division fix)
    out.append(float(features.freqCapital(line)))             # frequency of capital letters
    # Unigram scores over the positive/negative/neutral training documents.
    u = features.scoreUnigram(line, positive, negative, neutral)
    out.extend(u)
    return out
def mapTweet(tweet, sentiWordnet, emoDict, unigram, slangs):
    """Map a raw tweet to its feature vector using SentiWordNet polarities.

    Returns a list of floats: [pos, neg, neutral aggregate polarities,
    POS frequencies (p[7:]), emoticon score, hashtag-word ratio, length
    ratio, uppercase flag, exclamation flag, '!' density, question flag,
    '?' density, capital-letter frequency], then one 0/1 presence column
    per word in *unigram*.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    p = polarity.posPolarity(line, sentiWordnet)
    # Aggregate polarity for positive, negative and neutral.
    out.extend([float(p[0]), float(p[1]), float(p[2])])
    out.extend(p[7:])  # part-of-speech frequencies
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon aggregate score
    # BUGFIX: float divisors — the original `... / 40` and `... / 140`
    # truncated to 0 under Python 2 integer division.
    out.append(len(features.hashtagWords(line)) / 40.0)       # hashtagged-word ratio
    out.append(len(line) / 140.0)                             # length ratio
    out.append(float(features.upperCase(line)))               # uppercase existence: 0 or 1
    out.append(float(features.exclamationTest(line)))
    out.append(line.count("!") / 140.0)
    out.append(float(features.questionTest(line)))
    out.append(line.count('?') / 140.0)
    out.append(float(features.freqCapital(line)))
    # Binary unigram presence features; split once, test against a set
    # instead of re-splitting inside the loop.
    words = set(line.split())
    for w in unigram:
        out.append(1.0 if w in words else 0.0)
    return out
def classify(self, items):
    """Classify each tweet in *items* and return the list of predicted labels."""
    # n = 10 gave the best result for the bigram/word feature extractor.
    feats = [
        self.best_feats.best_bigram_word_feats(
            preprocessing.processTweet(raw).split(), 10)
        for raw in items
    ]
    return self.classif.classify_many(feats)
def classify(self, items):
    """Classify each raw tweet in *items*.

    Each tweet is preprocessed, tokenized, and mapped to its best
    bigram/word feature set before the whole batch is handed to the
    underlying classifier. Returns the list of predicted labels.
    """
    tweet_features = list()
    for tweet_raw in items:
        tweet = preprocessing.processTweet(tweet_raw)
        tokens = tweet.split()
        # n = 10 gave the best result in evaluation.
        single_f = self.best_feats.best_bigram_word_feats(tokens, 10)
        tweet_features.append(single_f)
    label = self.classif.classify_many(tweet_features)
    return label
def mapTweet(tweet, afinn, emoDict, positive, negative, neutral, slangs):
    """Map a raw tweet to its feature vector using the AFINN lexicon.

    Returns [AFINN polarity, emoticon score, hashtag-word ratio, length
    ratio, uppercase flag, exclamation flag, '!' density, question flag,
    '?' density, capital-letter frequency] followed by the unigram scores.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    p = polarity.afinnPolarity(line, afinn)
    out.append(p)
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon aggregate score
    # BUGFIX: float divisors — `... / 40` and `... / 140` truncated to 0
    # under Python 2 integer division for typical tweet lengths.
    out.append(len(features.hashtagWords(line)) / 40.0)       # number of hashtagged words, scaled
    out.append(len(line) / 140.0)                             # length ratio
    out.append(float(features.upperCase(line)))               # uppercase existence: 0 or 1
    out.append(float(features.exclamationTest(line)))
    out.append(line.count("!") / 140.0)
    out.append(float(features.questionTest(line)))
    out.append(line.count('?') / 140.0)
    out.append(float(features.freqCapital(line)))
    u = features.scoreUnigram(line, positive, negative, neutral)
    out.extend(u)
    return out
def mapTweet(tweet, afinn, emoDict, positive, negative, neutral, slangs):
    """Map a raw tweet to its feature vector using the AFINN lexicon.

    Returns [AFINN polarity, emoticon score, hashtag-word ratio, length
    ratio, uppercase flag, exclamation flag, '!' density, question flag,
    '?' density, capital-letter frequency] followed by the unigram scores.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    p = polarity.afinnPolarity(line, afinn)
    out.append(p)
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon aggregate score
    # BUGFIX: float divisors — `... / 40` and `... / 140` truncated to 0
    # under Python 2 integer division for typical tweet lengths.
    out.append(len(features.hashtagWords(line)) / 40.0)       # number of hashtagged words, scaled
    out.append(len(line) / 140.0)                             # length ratio
    out.append(float(features.upperCase(line)))               # uppercase existence: 0 or 1
    out.append(float(features.exclamationTest(line)))
    out.append(line.count("!") / 140.0)
    out.append(float(features.questionTest(line)))
    out.append(line.count('?') / 140.0)
    out.append(float(features.freqCapital(line)))
    u = features.scoreUnigram(line, positive, negative, neutral)
    out.extend(u)
    return out
def mapper(filename, label):
    """Map every tweet in *filename* to a tab-separated feature line.

    For each input line, writes to the module-level output handle ``fo``:
    label, 3 aggregate polarities, 4 POS counts (verb/noun/adj/adverb),
    emoticon score, length, uppercase flag, exclamation flag, '!' count,
    question flag, '?' count, capital-letter frequency, then one 0/1
    column per unigram in the global ``total`` vocabulary. Every field
    (including the last) is followed by a tab, matching the original
    output format.

    Relies on module-level globals: stopWords, slangs, sentiWordnet,
    emoticonDict, total, fo.
    """
    # BUGFIX: `with` guarantees the file is closed even if a feature
    # extractor raises; the original leaked the handle on error.
    with open(filename, 'r') as f:
        for raw in f:
            line = preprocessing.processTweet(raw, stopWords, slangs)
            # Collect fields in a list and join once instead of the
            # original quadratic `out = out + ...` string building.
            fields = [label]
            p = polarity.polarity(line, sentiWordnet)
            fields.extend([str(p[0]), str(p[1]), str(p[2])])  # aggregate pos/neg/neutral polarity
            pos = polarity.posFreq(line, sentiWordnet)
            # POS counts inside the tweet.
            fields.extend([str(pos['v']), str(pos['n']), str(pos['a']), str(pos['r'])])
            fields.append(str(features.emoticonScore(line, emoticonDict)))  # emoticon aggregate score
            fields.append(str(len(line)))                       # tweet length
            fields.append(str(features.upperCase(line)))        # uppercase existence: 0 or 1
            fields.append(str(features.exclamationTest(line)))
            fields.append(str(line.count("!")))
            fields.append(str(features.questionTest(line)))
            fields.append(str(line.count('?')))
            fields.append(str(features.freqCapital(line)))
            # NOTE(review): substring membership test, unlike the
            # split()-based check in the other mappers — 'cat' would match
            # inside 'category'. Preserved as-is; confirm intended.
            for w in total:
                fields.append('1' if w in line else '0')
            fo.write('\t'.join(fields) + '\t\n')
    return None
def mapTweet(tweet,sentiWordnet,emoDict,positive,negative,neutral,slangs):
    # Map a tweet to a reduced feature vector: aggregate polarities plus
    # unigram scores. All other surface features are disabled (commented
    # out) in this variant.
    out=[]
    line=preprocessing.processTweet(tweet,stopWords,slangs)
    p=polarity.posPolarity(line,sentiWordnet)
    # Aggregate polarity: positive, negative, and halved neutral.
    # NOTE(review): if p[2] is an int, `/2` truncates under Python 2 —
    # confirm posPolarity returns floats.
    out.extend([p[0],p[1],p[2]/2]) # aggregate polarity pos - negative
    # out.extend(p[7:]) # frequencies of pos
    # out.append(float(features.emoticonScore(line,emoDict))) # emo aggregate score be careful to modify weights
    # out.append(float(len(features.hashtagWords(line))/40)) # number of hashtagged words
    # out.append(float(len(line)/140)) # for the length
    # out.append(float(features.upperCase(line))) # uppercase existence : 0 or 1
    # out.append(float(features.exclamationTest(line)))
    # out.append(float(line.count("!")/140))
    # out.append(float((features.questionTest(line))))
    # out.append(float(line.count('?')/140))
    # out.append(float(features.freqCapital(line)))
    # NOTE(review): scoreUnigram receives the RAW `tweet` here, not the
    # preprocessed `line` used by the sibling mappers — confirm intended.
    u=features.scoreUnigram(tweet,positive,negative,neutral)
    out.extend(u)
    return out
def mapTweet(tweet, sentiWordnet, emoDict, unigram, slangs):
    """Map a raw tweet to its feature vector using SentiWordNet polarities.

    Returns [pos, neg, neutral aggregate polarities, POS frequencies
    (p[7:]), emoticon score, hashtag-word ratio, length ratio, uppercase
    flag, exclamation flag, '!' density, question flag, '?' density,
    capital-letter frequency], then one 0/1 presence column per word in
    *unigram*.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    p = polarity.posPolarity(line, sentiWordnet)
    out.extend([p[0], p[1], p[2]])  # aggregate polarity: positive, negative, neutral
    out.extend(p[7:])               # part-of-speech frequencies
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon aggregate score
    # BUGFIX: float divisors — `... / 40` and `... / 140` truncated to 0
    # under Python 2 integer division for typical tweet lengths.
    out.append(len(features.hashtagWords(line)) / 40.0)       # hashtagged-word ratio
    out.append(len(line) / 140.0)                             # length ratio
    out.append(float(features.upperCase(line)))               # uppercase existence: 0 or 1
    out.append(float(features.exclamationTest(line)))
    out.append(line.count("!") / 140.0)
    out.append(float(features.questionTest(line)))
    out.append(line.count('?') / 140.0)
    out.append(float(features.freqCapital(line)))
    # Binary unigram presence features; split once, membership via set.
    words = set(line.split())
    for w in unigram:
        out.append(1.0 if w in words else 0.0)
    return out
f=open('../data/positive_sample.csv','r') fo=open('../data/positive_processed.csv','w') import preprocessing stopWords = preprocessing.getStopWordList('../resources/stopWords.txt') slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt') f=open('../data/positive_sample.csv','r') fo=open('../data/positive_processed.csv','w') line=f.readline() while line: a=line.split(r'","') b=a[5][:-1] c=preprocessing.processTweet(b,stopWords,slangs) d=preprocessing.removeStopWords(c,stopWords) fo.write(d+'\n') line = f.readline() f.close() fo.close() print "positive samples processed" f=open('../data/negative_sample.csv','r') fo=open('../data/negative_processed.csv','w') line=f.readline() while line: a=line.split(r'","') b=a[5][:-1]
import preprocessing stopWords = preprocessing.getStopWordList('../resources/stopWords.txt') slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt') f = open('../data/sts/positive_sample.csv', 'r') fo = open('../data/positive_processed.csv', 'w') line = f.readline() while line: a = line.split(r'","') b = a[5][:-1] c = preprocessing.processTweet(b, stopWords, slangs) d = preprocessing.removeStopWords(c, stopWords) fo.write(d + '\n') line = f.readline() f.close() fo.close() print("positive samples processed") f = open('../data/sts/negative_sample.csv', 'r') fo = open('../data/negative_processed.csv', 'w') line = f.readline() while line: a = line.split(r'","') b = a[5][:-1] c = preprocessing.processTweet(b, stopWords, slangs) d = preprocessing.removeStopWords(c, stopWords)