Пример #1
0

import numpy as np
import nltk # for pos tags 

import features
import polarity
import ngramGenerator
import preprocessing


# Tunable settings kept as module-level constants.
# NOTE(review): the names suggest an SVM kernel type and its C
# (regularisation) value, but neither constant is consumed in the visible
# part of this file -- confirm against the code that uses them.
KERNEL_FUNCTION = 'linear'
C_PARAMETER = 0.6

# Load the lookup resources from ../resources before anything else runs.
# The parenthesised print produces the same output under Python 2.
print("Initializing dictionnaries")
stopWords = preprocessing.getStopWordList("../resources/stopWords.txt")
slangs = preprocessing.loadSlangs("../resources/internetSlangs.txt")
afinn = polarity.loadAfinn("../resources/afinn.txt")
# The SentiWordnet lexicon is currently not loaded:
# sentiWordnet = polarity.loadSentiWordnet('../resources/sentiWordnetBig.csv')
emoticonDict = features.createEmoticonDictionary("../resources/emoticon.txt")

# Fetch one list per sentiment class via ngramGenerator.mostFreqList, each
# call receiving the class's CSV path and the same limit of 3000.
# NOTE(review): presumably the 3000 most frequent n-grams -- confirm in
# ngramGenerator.
print("Bulding Bag of words ...")
positive = ngramGenerator.mostFreqList("../data/used/positive1.csv", 3000)
negative = ngramGenerator.mostFreqList("../data/used/negative1.csv", 3000)
neutral = ngramGenerator.mostFreqList("../data/used/neutral1.csv", 3000)


# Keep only the entries of `positive` that appear in neither `negative` nor
# `neutral`.
#
# The previous loop called positive.remove() while iterating over the same
# list; every removal shifts the remaining items left, so the iterator
# skipped the element right after each removed one and some shared entries
# were never checked. Filtering into a fresh list visits every element.
#
# The combined exclusion list is built once instead of on every iteration.
# It stays a list (not a set) so entries are compared with == exactly as
# before and do not need to be hashable.
#
# Slice assignment updates the existing list object in place, so any other
# reference to `positive` sees the filtered contents too.
nonPositive = negative + neutral
positive[:] = [w for w in positive if w not in nonPositive]
import preprocessing
import sys
# Silence all warnings, but only when no warning options were supplied to
# the interpreter (i.e. sys.warnoptions is empty), so an explicit request
# for warnings is still honoured. Recipe taken from:
# https://stackoverflow.com/questions/14463277/how-to-disable-python-warnings
import warnings
if not sys.warnoptions:
    warnings.simplefilter(action="ignore")

# Load the resources that the tweet pre-processing helpers take as arguments.
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')

# Read the positive samples line by line, clean each tweet and write one
# processed tweet per line to the output file.
# `with` guarantees both files are closed even if a helper raises midway,
# which the explicit close() calls at the end did not.
with open('../data/sem/positive.tsv', 'r') as f, \
        open('../data/positive_processed.csv', 'w') as fo:
    for line in f:
        # The tweet text is the last tab-separated field of the line.
        # Strip only the line terminator: unconditionally slicing off the
        # final character would eat a real character of the tweet when the
        # last line of the file has no trailing newline.
        tweet = line.split('\t')[-1].rstrip('\n')
        cleaned = preprocessing.processTweet(tweet, stopWords, slangs)
        cleaned = preprocessing.removeStopWords(cleaned, stopWords)
        fo.write(cleaned + '\n')

# The parenthesised print produces the same output under Python 2.
print("positive samples processed")

f = open('../data/sem/negative.tsv', 'r')
fo = open('../data/negative_processed.csv', 'w')