def mapTweet(tweet, afinn, emoDict, positive, negative, neutral, slangs):
    """Map a raw tweet to its feature vector using the AFINN lexicon.

    Returns a list of floats: [AFINN polarity, emoticon score,
    hashtag-word ratio, length ratio, uppercase flag, exclamation flag,
    '!' density, question flag, '?' density, capital-letter frequency]
    followed by the unigram scores from features.scoreUnigram.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    # Clean the tweet: strip unneeded words and expand internet slang.
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    out.append(polarity.afinnPolarity(line, afinn))           # AFINN affinity score
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon score
    out.append(float(features.hashtagWordsRatio(line)))       # ratio of hashtagged words
    # BUGFIX: use float divisors — `len(line) / 140` truncated to 0 under
    # Python 2 integer division for any tweet shorter than 140 characters.
    out.append(len(line) / 140.0)                             # fraction of the 140-char budget used
    out.append(float(features.upperCase(line)))               # 1 if tweet contains uppercase, else 0
    out.append(float(features.exclamationTest(line)))         # 1 if '!' present, else 0
    out.append(line.count("!") / 140.0)                       # '!' density (float-division fix)
    out.append(float(features.questionTest(line)))            # 1 if '?' present, else 0
    out.append(line.count('?') / 140.0)                       # '?' density (float-division fix)
    out.append(float(features.freqCapital(line)))             # frequency of capital letters
    # Unigram scores over the positive/negative/neutral training documents.
    u = features.scoreUnigram(line, positive, negative, neutral)
    out.extend(u)
    return out
def mapTweet(tweet, sentiWordnet, emoDict, unigram, slangs):
    """Map a raw tweet to its feature vector using SentiWordNet polarities.

    Returns a list of floats: [pos, neg, neutral aggregate polarities,
    POS frequencies (p[7:]), emoticon score, hashtag-word ratio, length
    ratio, uppercase flag, exclamation flag, '!' density, question flag,
    '?' density, capital-letter frequency], then one 0/1 presence column
    per word in *unigram*.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    p = polarity.posPolarity(line, sentiWordnet)
    # Aggregate polarity for positive, negative and neutral.
    out.extend([float(p[0]), float(p[1]), float(p[2])])
    out.extend(p[7:])  # part-of-speech frequencies
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon aggregate score
    # BUGFIX: float divisors — the original `... / 40` and `... / 140`
    # truncated to 0 under Python 2 integer division.
    out.append(len(features.hashtagWords(line)) / 40.0)       # hashtagged-word ratio
    out.append(len(line) / 140.0)                             # length ratio
    out.append(float(features.upperCase(line)))               # uppercase existence: 0 or 1
    out.append(float(features.exclamationTest(line)))
    out.append(line.count("!") / 140.0)
    out.append(float(features.questionTest(line)))
    out.append(line.count('?') / 140.0)
    out.append(float(features.freqCapital(line)))
    # Binary unigram presence features; split once, test against a set
    # instead of re-splitting inside the loop.
    words = set(line.split())
    for w in unigram:
        out.append(1.0 if w in words else 0.0)
    return out
def classify(self, items):
    """Classify each tweet in *items* and return the list of predicted labels."""
    # n = 10 gave the best result for the bigram/word feature extractor.
    feats = [
        self.best_feats.best_bigram_word_feats(
            preprocessing.processTweet(raw).split(), 10)
        for raw in items
    ]
    return self.classif.classify_many(feats)
def classify(self, items):
    """Classify each raw tweet in *items*.

    Each tweet is preprocessed, tokenized, and mapped to its best
    bigram/word feature set before the whole batch is handed to the
    underlying classifier. Returns the list of predicted labels.
    """
    tweet_features = list()
    for tweet_raw in items:
        tweet = preprocessing.processTweet(tweet_raw)
        tokens = tweet.split()
        # n = 10 gave the best result in evaluation.
        single_f = self.best_feats.best_bigram_word_feats(tokens, 10)
        tweet_features.append(single_f)
    label = self.classif.classify_many(tweet_features)
    return label
def mapTweet(tweet, afinn, emoDict, positive, negative, neutral, slangs):
    """Map a raw tweet to its feature vector using the AFINN lexicon.

    Returns [AFINN polarity, emoticon score, hashtag-word ratio, length
    ratio, uppercase flag, exclamation flag, '!' density, question flag,
    '?' density, capital-letter frequency] followed by the unigram scores.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    p = polarity.afinnPolarity(line, afinn)
    out.append(p)
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon aggregate score
    # BUGFIX: float divisors — `... / 40` and `... / 140` truncated to 0
    # under Python 2 integer division for typical tweet lengths.
    out.append(len(features.hashtagWords(line)) / 40.0)       # number of hashtagged words, scaled
    out.append(len(line) / 140.0)                             # length ratio
    out.append(float(features.upperCase(line)))               # uppercase existence: 0 or 1
    out.append(float(features.exclamationTest(line)))
    out.append(line.count("!") / 140.0)
    out.append(float(features.questionTest(line)))
    out.append(line.count('?') / 140.0)
    out.append(float(features.freqCapital(line)))
    u = features.scoreUnigram(line, positive, negative, neutral)
    out.extend(u)
    return out
def mapTweet(tweet, afinn, emoDict, positive, negative, neutral, slangs):
    """Map a raw tweet to its feature vector using the AFINN lexicon.

    Returns [AFINN polarity, emoticon score, hashtag-word ratio, length
    ratio, uppercase flag, exclamation flag, '!' density, question flag,
    '?' density, capital-letter frequency] followed by the unigram scores.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    p = polarity.afinnPolarity(line, afinn)
    out.append(p)
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon aggregate score
    # BUGFIX: float divisors — `... / 40` and `... / 140` truncated to 0
    # under Python 2 integer division for typical tweet lengths.
    out.append(len(features.hashtagWords(line)) / 40.0)       # number of hashtagged words, scaled
    out.append(len(line) / 140.0)                             # length ratio
    out.append(float(features.upperCase(line)))               # uppercase existence: 0 or 1
    out.append(float(features.exclamationTest(line)))
    out.append(line.count("!") / 140.0)
    out.append(float(features.questionTest(line)))
    out.append(line.count('?') / 140.0)
    out.append(float(features.freqCapital(line)))
    u = features.scoreUnigram(line, positive, negative, neutral)
    out.extend(u)
    return out
def mapper(filename, label):
    """Map every tweet in *filename* to a tab-separated feature line.

    For each input line, writes to the module-level output handle ``fo``:
    label, 3 aggregate polarities, 4 POS counts (verb/noun/adj/adverb),
    emoticon score, length, uppercase flag, exclamation flag, '!' count,
    question flag, '?' count, capital-letter frequency, then one 0/1
    column per unigram in the global ``total`` vocabulary. Every field
    (including the last) is followed by a tab, matching the original
    output format.

    Relies on module-level globals: stopWords, slangs, sentiWordnet,
    emoticonDict, total, fo.
    """
    # BUGFIX: `with` guarantees the file is closed even if a feature
    # extractor raises; the original leaked the handle on error.
    with open(filename, 'r') as f:
        for raw in f:
            line = preprocessing.processTweet(raw, stopWords, slangs)
            # Collect fields in a list and join once instead of the
            # original quadratic `out = out + ...` string building.
            fields = [label]
            p = polarity.polarity(line, sentiWordnet)
            fields.extend([str(p[0]), str(p[1]), str(p[2])])  # aggregate pos/neg/neutral polarity
            pos = polarity.posFreq(line, sentiWordnet)
            # POS counts inside the tweet.
            fields.extend([str(pos['v']), str(pos['n']), str(pos['a']), str(pos['r'])])
            fields.append(str(features.emoticonScore(line, emoticonDict)))  # emoticon aggregate score
            fields.append(str(len(line)))                       # tweet length
            fields.append(str(features.upperCase(line)))        # uppercase existence: 0 or 1
            fields.append(str(features.exclamationTest(line)))
            fields.append(str(line.count("!")))
            fields.append(str(features.questionTest(line)))
            fields.append(str(line.count('?')))
            fields.append(str(features.freqCapital(line)))
            # NOTE(review): substring membership test, unlike the
            # split()-based check in the other mappers — 'cat' would match
            # inside 'category'. Preserved as-is; confirm intended.
            for w in total:
                fields.append('1' if w in line else '0')
            fo.write('\t'.join(fields) + '\t\n')
    return None
def mapTweet(tweet,sentiWordnet,emoDict,positive,negative,neutral,slangs):
    # Map a tweet to a reduced feature vector: aggregate polarities plus
    # unigram scores. All other surface features are disabled (commented
    # out) in this variant.
    out=[]
    line=preprocessing.processTweet(tweet,stopWords,slangs)
    p=polarity.posPolarity(line,sentiWordnet)
    # Aggregate polarity: positive, negative, and halved neutral.
    # NOTE(review): if p[2] is an int, `/2` truncates under Python 2 —
    # confirm posPolarity returns floats.
    out.extend([p[0],p[1],p[2]/2]) # aggregate polarity pos - negative
    # out.extend(p[7:]) # frequencies of pos
    # out.append(float(features.emoticonScore(line,emoDict))) # emo aggregate score be careful to modify weights
    # out.append(float(len(features.hashtagWords(line))/40)) # number of hashtagged words
    # out.append(float(len(line)/140)) # for the length
    # out.append(float(features.upperCase(line))) # uppercase existence : 0 or 1
    # out.append(float(features.exclamationTest(line)))
    # out.append(float(line.count("!")/140))
    # out.append(float((features.questionTest(line))))
    # out.append(float(line.count('?')/140))
    # out.append(float(features.freqCapital(line)))
    # NOTE(review): scoreUnigram receives the RAW `tweet` here, not the
    # preprocessed `line` used by the sibling mappers — confirm intended.
    u=features.scoreUnigram(tweet,positive,negative,neutral)
    out.extend(u)
    return out
def mapTweet(tweet, sentiWordnet, emoDict, unigram, slangs):
    """Map a raw tweet to its feature vector using SentiWordNet polarities.

    Returns [pos, neg, neutral aggregate polarities, POS frequencies
    (p[7:]), emoticon score, hashtag-word ratio, length ratio, uppercase
    flag, exclamation flag, '!' density, question flag, '?' density,
    capital-letter frequency], then one 0/1 presence column per word in
    *unigram*.

    Relies on the module-level ``stopWords`` list.
    """
    out = []
    line = preprocessing.processTweet(tweet, stopWords, slangs)
    p = polarity.posPolarity(line, sentiWordnet)
    out.extend([p[0], p[1], p[2]])  # aggregate polarity: positive, negative, neutral
    out.extend(p[7:])               # part-of-speech frequencies
    out.append(float(features.emoticonScore(line, emoDict)))  # emoticon aggregate score
    # BUGFIX: float divisors — `... / 40` and `... / 140` truncated to 0
    # under Python 2 integer division for typical tweet lengths.
    out.append(len(features.hashtagWords(line)) / 40.0)       # hashtagged-word ratio
    out.append(len(line) / 140.0)                             # length ratio
    out.append(float(features.upperCase(line)))               # uppercase existence: 0 or 1
    out.append(float(features.exclamationTest(line)))
    out.append(line.count("!") / 140.0)
    out.append(float(features.questionTest(line)))
    out.append(line.count('?') / 140.0)
    out.append(float(features.freqCapital(line)))
    # Binary unigram presence features; split once, membership via set.
    words = set(line.split())
    for w in unigram:
        out.append(1.0 if w in words else 0.0)
    return out
f=open('../data/positive_sample.csv','r') fo=open('../data/positive_processed.csv','w') import preprocessing stopWords = preprocessing.getStopWordList('../resources/stopWords.txt') slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt') f=open('../data/positive_sample.csv','r') fo=open('../data/positive_processed.csv','w') line=f.readline() while line: a=line.split(r'","') b=a[5][:-1] c=preprocessing.processTweet(b,stopWords,slangs) d=preprocessing.removeStopWords(c,stopWords) fo.write(d+'\n') line = f.readline() f.close() fo.close() print "positive samples processed" f=open('../data/negative_sample.csv','r') fo=open('../data/negative_processed.csv','w') line=f.readline() while line: a=line.split(r'","') b=a[5][:-1]
import preprocessing stopWords = preprocessing.getStopWordList('../resources/stopWords.txt') slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt') f = open('../data/sts/positive_sample.csv', 'r') fo = open('../data/positive_processed.csv', 'w') line = f.readline() while line: a = line.split(r'","') b = a[5][:-1] c = preprocessing.processTweet(b, stopWords, slangs) d = preprocessing.removeStopWords(c, stopWords) fo.write(d + '\n') line = f.readline() f.close() fo.close() print("positive samples processed") f = open('../data/sts/negative_sample.csv', 'r') fo = open('../data/negative_processed.csv', 'w') line = f.readline() while line: a = line.split(r'","') b = a[5][:-1] c = preprocessing.processTweet(b, stopWords, slangs) d = preprocessing.removeStopWords(c, stopWords)