示例#1
0
    def __init__(self,
                 traing_data_fileP1='mood_traing_p1.dat',
                 traing_data_fileP2='mood_traing.dat',
                 data_file='tweets_raw.dat'):
        if self.sentiwordnet:
            print "using sentiwordnet dictionary"
        else:
            print "not using sentiwordnet dictionary"

        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        self.tweetsFile = open(
            os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
            'rb')
        self.countRows(self.tweetsFile)

        self.tweetsFile = open(
            os.path.join(os.curdir, os.path.normpath('../data/' + data_file)),
            'rb')

        self.limit['en'] = 300000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
        self.swn = SentiWordNetCorpusReader(swn_filename)
def findScoreWord(word, dType):
    """Return the average SentiWordNet score of the tokens in *word*.

    Args:
        word: a (possibly multi-word) string; punctuation is stripped and
            the remainder is lower-cased and split on whitespace.
        dType: one of the POS constants NN, ADJ, VB, ADV selecting the
            SentiWordNet part-of-speech to query.

    Returns:
        The summed per-token score divided by the token count, or 0 when
        there are no tokens or *dType* is not a recognised POS constant.
    """
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)

    # Strip punctuation and normalise case before looking tokens up.
    word = re.sub('[%s]' % re.escape(string.punctuation), ' ', word)
    word = word.lower()

    # Map the POS-tag constants onto SentiWordNet's one-letter POS codes.
    # Hoisted out of the loop: dType does not change per token.  The
    # original left `test` unbound for an unrecognised dType and relied
    # on a bare except to swallow the NameError.
    pos = {NN: 'n', ADJ: 'a', VB: 'v', ADV: 'r'}.get(dType)

    tokens = word.split()
    if pos is None or not tokens:
        # Unknown POS type or nothing left after cleaning: score is 0,
        # matching the original's fall-through behaviour.
        return 0

    wS = 0
    for w in tokens:
        try:
            first = swn.senti_synsets(w, pos)[0]
            if first.pos_score < 0.1:
                wS += -first.neg_score
            elif first.neg_score < 0.1:
                wS += first.pos_score
            else:
                # NOTE(review): plain assignment (not +=) discards any
                # earlier accumulation for mixed-polarity words -- kept
                # as in the original; confirm whether `+=` was intended.
                wS = first.pos_score
        except Exception:
            # Token has no synset entry (or unexpected data): skip it.
            continue

    return wS / len(tokens)
示例#3
0
    def __init__(self):
        """Load the MultiWordNet database and the SentiWordNet corpus,
        then pick a sentence splitter: the Tanl service when TANL_EMAIL
        is configured in the environment, otherwise the simple fallback.
        """
        data_dir = "data"
        self.mwnet = MWNet(os.path.join(data_dir, "mwnet.db"))
        self.swn = SentiWordNetCorpusReader(
            os.path.join(data_dir, "SentiWordNet_3.0.0.txt"))

        use_tanl = os.getenv('TANL_EMAIL')
        self.splitter = TanlSplitter() if use_tanl else SimpleSplitter()
    def __init__(self,
                 traing_data_fileP1='mood_traing_p1.dat',
                 traing_data_fileP2='mood_traing.dat',
                 data_file='tweets_raw.dat'):
        """Set up the two mood-detection trainers, the language
        classifier, and the raw-tweets input stream.

        Args:
            traing_data_fileP1: training data file for the phase-1 trainer.
            traing_data_fileP2: training data file for the phase-2 trainer.
            data_file: raw tweets file, resolved against self.dataDir.
        """

        self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # Open the raw tweets file once, count its rows, then rewind so
        # later processing starts from the beginning.  The original code
        # opened the same path a second time without closing the first
        # handle, leaking a file descriptor.
        self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
        self.countRows(self.tweetsFile)
        self.tweetsFile.seek(0)

        # Per-language caps on how many tweets are consumed.
        # NOTE(review): self.limit is assumed to be a dict initialised
        # elsewhere (e.g. a class attribute) -- confirm.
        self.limit['en'] = 150000
        self.limit['default'] = 10000
        self.count = 0

        swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
        self.swn = SentiWordNetCorpusReader(swn_filename)
示例#5
0
# Replace slang tokens: for each tweet's token list, drop every token that
# matches a slang key and append the expanded form instead.
words_noslang = []
for w in words:
    for k, v in dico_slang.items():
        if k in w:
            w = list(filter(lambda x: x != k, w))
            w.append(v)
    words_noslang.append(w)
print(words_noslang)

# POS-tag each tweet exactly once and reuse the result.  The original
# called nltk.pos_tag a second time over every tweet just to build the
# flattened list -- duplicate expensive work for the same output.
tag = nltk.pos_tag
tag_list = [tag(words_noslang[i]) for i in range(N)]
print(tag_list)
tag_list_tot = []
for tweet_tags in tag_list:
    tag_list_tot.extend(tweet_tags)

# Count POS tags across all tweets and total up the verb tags.
tot_pos = [pair[1] for pair in tag_list_tot]
count_VB = Counter(tot_pos)
print(count_VB)
verb_tag = ['VB', 'VBZ', 'VBP', 'VBG', 'VBN', 'VBD']
sum_vb = sum(v for k, v in count_VB.items() if k in verb_tag)
print('Le nombre total de POS verbes sur l\'ensemble des tweets est: ', sum_vb)

swn_filename = '/home/audrey/Audrey/Cours/INF344/TP/TP_sentiment/SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
swn.senti_synset('breakdown.n.03')
def _swn_pos_char(pos_tag):
    """Map a Penn Treebank POS tag to the letter passed to
    swn.senti_synsets ('n', 'a', 'p', 'v', 'r'), or None when the tag has
    no mapping (the lookup is then done without a POS argument)."""
    noun = ('NN', 'NNS', 'NP', 'NPS')
    adj = ('JJ', 'JJR', 'JJS')
    pronoun = ('PP', 'PP$', 'WP', 'WP$')
    verb = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    adverb = ('RB', 'RBR', 'RBS', 'WRB')
    if pos_tag in noun:
        return 'n'
    elif pos_tag in adj:
        return 'a'
    elif pos_tag in pronoun:
        return 'p'
    elif pos_tag in verb:
        return 'v'
    elif pos_tag in adverb:
        return 'r'
    return None


def sentimentAnalysis(filename, outputFile):
    """Compute per-line objectivity/subjectivity ratios via SentiWordNet.

    Each input line is expected to look like "<count> <text>".  For every
    line with text, the tokens are POS-tagged and looked up in
    SentiWordNet; a token counts as objective when its obj_score > 0.5 and
    as subjective when pos_score + neg_score > 0.5.  One CSV row
    [count, ratioObj, ratioSub] is written per line with text.

    Args:
        filename: UTF-8 input file of "<count> <text>" lines.
        outputFile: path of the CSV file to (over)write.
    """
    swn_filename = 'SentiWordNet_3.0.0_20100705.txt'
    swn = SentiWordNetCorpusReader(swn_filename)

    # Strips all punctuation from the tweet text.
    regex = re.compile('[%s]' % re.escape(string.punctuation))

    with codecs.open(filename, 'r', 'utf-8') as f:
        data1 = f.readlines()
    # (the original called f.close() again here; `with` already closed it)

    # POS tags that are skipped entirely (function words, symbols, ...).
    skip_tags = ('CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'TO', 'UH', 'PDT',
                 'SYM', 'RP')

    # NOTE(review): 'wb' is the Python 2 csv convention; on Python 3 this
    # should be 'w' with newline='' -- confirm the target interpreter.
    # The original never closed this file; `with` fixes the leak.
    with open(outputFile, 'wb') as out_file:
        writer = csv.writer(out_file)
        for text in data1:
            # Split once into "<count>" and the rest of the line; the
            # original recomputed this split three times per line.
            parts = re.split("\s+", text, 1)
            count = parts[0]
            if len(parts) <= 1:
                # No text after the count: the original wrote no row.
                continue
            cleaned = regex.sub(u'', parts[1])
            tokens = word_tokenize(cleaned.lower())
            tagged = nltk.pos_tag(tokens)

            objCount = 0
            subCount = 0
            for w in tagged:
                if w[1] in skip_tags:
                    continue
                pos_char = _swn_pos_char(w[1])
                # Single lookup path; the original duplicated this whole
                # try block for the with-POS and without-POS cases.
                try:
                    if pos_char is None:
                        s = swn.senti_synsets(w[0])
                    else:
                        s = swn.senti_synsets(w[0], pos_char)
                    scores = list(s)[0]
                    if scores.obj_score > 0.5:
                        objCount += 1
                    elif scores.pos_score + scores.neg_score > 0.5:
                        subCount += 1
                except Exception:
                    # Token missing from SentiWordNet (or malformed entry).
                    print('Unexpected word')

            total = objCount + subCount
            if total > 0:
                ratioObj = float(objCount) / total
                ratioSub = float(subCount) / total
            else:
                ratioObj = 0.0
                ratioSub = 0.0
            writer.writerow([count, ratioObj, ratioSub])