def __init__(self, traing_data_fileP1='mood_traing_p1.dat', traing_data_fileP2='mood_traing.dat', data_file='tweets_raw.dat'):
    """Set up the two-phase mood classifiers, language detector and tweet corpus.

    Parameters
    ----------
    traing_data_fileP1 : str
        Training-data file for the phase-1 classifier.
    traing_data_fileP2 : str
        Training-data file for the phase-2 classifier.
    data_file : str
        Raw-tweet corpus, resolved relative to ``../data/``.
    """
    # NOTE(review): assumes self.sentiwordnet and self.limit are initialised
    # elsewhere (class attribute or earlier in the class) -- confirm.
    if self.sentiwordnet:
        print("using sentiwordnet dictionary")
    else:
        print("not using sentiwordnet dictionary")
    self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
    self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)
    self.langClassifier = LangDetect(supportedLangs)
    self.training_data_p1 = MoodDetectTrainData()
    self.training_data_p2 = MoodDetectTrainData()
    tweets_path = os.path.join(os.curdir, os.path.normpath('../data/' + data_file))
    self.tweetsFile = open(tweets_path, 'rb')
    self.countRows(self.tweetsFile)
    # BUGFIX: the original reopened the file here, leaking the first handle;
    # rewinding the existing handle is equivalent for a 'rb' stream.
    self.tweetsFile.seek(0)
    self.limit['en'] = 300000
    self.limit['default'] = 10000
    self.count = 0
    swn_filename = '../dict/sentiwordnet/' + conf.SENTIWORDNET_DICT_FILENAME
    self.swn = SentiWordNetCorpusReader(swn_filename)
def findScoreWord(word, dType):
    """Return the average SentiWordNet polarity of the tokens in *word*.

    Punctuation is stripped and the text lower-cased before each token is
    looked up under the SentiWordNet POS matching *dType* (NN/ADJ/VB/ADV).
    Tokens missing from the dictionary contribute nothing. Returns 0 when
    no tokens remain after cleaning.
    """
    swn_filename = 'SentiWordNet_3.0.0_20130122.txt'
    swn = SentiWordNetCorpusReader(swn_filename)
    word = re.sub('[%s]' % re.escape(string.punctuation), ' ', word)
    tokens = word.lower().split()
    if not tokens:
        # Nothing scorable after stripping punctuation.
        return 0
    # Map the caller's POS constant to SentiWordNet's single-letter tag;
    # hoisted out of the loop since dType is invariant.
    pos_map = {NN: 'n', ADJ: 'a', VB: 'v', ADV: 'r'}
    pos = pos_map.get(dType)
    wS = 0
    if pos is not None:  # unsupported POS: original fell through via a caught NameError
        for w in tokens:
            try:
                first = swn.senti_synsets(w, pos)[0]
                if first.pos_score < 0.1:
                    wS += -first.neg_score
                elif first.neg_score < 0.1:
                    wS += first.pos_score
                else:
                    # BUGFIX: the original assigned (wS = ...) here, silently
                    # discarding the scores accumulated from earlier tokens.
                    wS += first.pos_score
            except Exception:
                # Token absent from SentiWordNet (empty synset list) -- skip it.
                continue
    return wS / len(tokens)
def __init__(self):
    """Load the MultiWordNet DB and SentiWordNet corpus, then pick a splitter."""
    data_dir = "data"
    self.mwnet = MWNet(os.path.join(data_dir, "mwnet.db"))
    self.swn = SentiWordNetCorpusReader(
        os.path.join(data_dir, "SentiWordNet_3.0.0.txt"))
    # Use the remote Tanl splitter only when credentials are configured.
    has_tanl_account = os.getenv('TANL_EMAIL')
    self.splitter = TanlSplitter() if has_tanl_account else SimpleSplitter()
def __init__(self, traing_data_fileP1='mood_traing_p1.dat', traing_data_fileP2='mood_traing.dat', data_file='tweets_raw.dat'):
    """Set up the two-phase mood classifiers, language detector and tweet corpus.

    Parameters
    ----------
    traing_data_fileP1 : str
        Training-data file for the phase-1 classifier.
    traing_data_fileP2 : str
        Training-data file for the phase-2 classifier.
    data_file : str
        Raw-tweet corpus, resolved relative to ``self.dataDir``.
    """
    # NOTE(review): assumes self.dataDir and self.limit are initialised
    # elsewhere in the class -- confirm.
    self.clsP1 = MoodDetectTrainer(data_file=traing_data_fileP1)
    self.clsP2 = MoodDetectTrainer(data_file=traing_data_fileP2)
    self.langClassifier = LangDetect(supportedLangs)
    self.training_data_p1 = MoodDetectTrainData()
    self.training_data_p2 = MoodDetectTrainData()
    self.tweetsFile = open(os.path.join(self.dataDir, data_file), 'rb')
    self.countRows(self.tweetsFile)
    # BUGFIX: the original reopened the file here, leaking the first handle;
    # rewinding the existing handle is equivalent for a 'rb' stream.
    self.tweetsFile.seek(0)
    self.limit['en'] = 150000
    self.limit['default'] = 10000
    self.count = 0
    swn_filename = '../dict/sentiwordnet/SentiWordNet_3.0.0_20100705.txt'
    self.swn = SentiWordNetCorpusReader(swn_filename)
# Replace slang tokens with their dictionary expansion in every tweet.
# NOTE(review): relies on outer names `words`, `dico_slang` and `N` -- the
# removed slang token is filtered out and its expansion appended at the end.
words_noslang = []
for w in words:
    for k, v in dico_slang.items():
        if k in w:
            w = list(filter(lambda x: x != k, w))
            w.append(v)
    words_noslang.append(w)
print(words_noslang)

tag = nltk.pos_tag
tag_list = [tag(words_noslang[i]) for i in range(N)]
print(tag_list)

# BUGFIX(perf): the original re-ran nltk.pos_tag over every tweet a second
# time to build the flat list; reuse the taggings already computed above.
tag_list_tot = []
for tweet_tags in tag_list:
    tag_list_tot.extend(tweet_tags)

# Count part-of-speech tags over all tweets.
tot_pos = [pair[1] for pair in tag_list_tot]
count_VB = Counter(tot_pos)
print(count_VB)

verb_tag = ['VB', 'VBZ', 'VBP', 'VBG', 'VBN', 'VBD']
sum_vb = sum(v for k, v in count_VB.items() if k in verb_tag)
print('Le nombre total de POS verbes sur l\'ensemble des tweets est: ', sum_vb)

swn_filename = '/home/audrey/Audrey/Cours/INF344/TP/TP_sentiment/SentiWordNet_3.0.0_20130122.txt'
swn = SentiWordNetCorpusReader(swn_filename)
swn.senti_synset('breakdown.n.03')
def sentimentAnalysis(filename, outputFile):
    """Write per-line objectivity/subjectivity ratios of *filename* as CSV.

    Each input line starts with an identifier followed by whitespace and the
    text. For every content word the first SentiWordNet synset is inspected:
    obj_score > 0.5 counts the word as objective, pos+neg > 0.5 as subjective.
    Output rows are ``[identifier, ratioObj, ratioSub]``.
    """
    swn_filename = 'SentiWordNet_3.0.0_20100705.txt'
    swn = SentiWordNetCorpusReader(swn_filename)
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    with codecs.open(filename, 'r', 'utf-8') as f:
        data1 = f.readlines()

    # Treebank tags skipped outright (function words, symbols, ...).
    skip_tags = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'TO', 'UH', 'PDT', 'SYM', 'RP']
    noun = ['NN', 'NNS', 'NP', 'NPS']
    adj = ['JJ', 'JJR', 'JJS']
    # NOTE(review): 'p' is not a valid SentiWordNet POS letter, so pronoun
    # lookups presumably always fail and are swallowed below -- confirm intent.
    pronoun = ['PP', 'PP$', 'WP', 'WP$']
    verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    adverb = ['RB', 'RBR', 'RBS', 'WRB']

    # BUGFIX: the output handle was created inline and never closed.
    with open(outputFile, 'wb') as out:
        writer = csv.writer(out)
        for text in data1:
            # First whitespace-separated field is the line identifier.
            parts = re.split(r"\s+", text, 1)
            count = parts[0]
            if len(parts) > 1:
                text = parts[1]
            words = word_tokenize(regex.sub(u'', text).lower())
            objCount = 0
            subCount = 0
            for token, treebank in nltk.pos_tag(words):
                if treebank in skip_tags:
                    continue
                if treebank in noun:
                    pos_Char = 'n'
                elif treebank in adj:
                    pos_Char = 'a'
                elif treebank in pronoun:
                    pos_Char = 'p'
                elif treebank in verb:
                    pos_Char = 'v'
                elif treebank in adverb:
                    pos_Char = 'r'
                else:
                    pos_Char = 'none'
                # Single lookup path (the original duplicated this whole
                # try/except for the 'none' and POS-hinted cases).
                try:
                    if pos_Char == 'none':
                        s = swn.senti_synsets(token)
                    else:
                        s = swn.senti_synsets(token, pos_Char)
                    scores = list(s)[0]
                    if scores.obj_score > 0.5:
                        objCount += 1
                    elif scores.pos_score + scores.neg_score > 0.5:
                        subCount += 1
                except Exception:
                    # Word missing from SentiWordNet (empty synset list).
                    print('Unexpected word')
            total = objCount + subCount
            ratioObj = float(objCount) / total if total else 0.0
            ratioSub = float(subCount) / total if total else 0.0
            writer.writerow([count, ratioObj, ratioSub])