from senticnet.senticnet import SenticNet


class Dictionary:
    def __init__(self):
        self.sn = SenticNet()

    def get_word_polarity(self, word, log=True):
        """
        Input : String
        Output : "positive" or "negative" (or "empty" if the word is unknown)
        """
        value = "empty"
        try:
            value = self.sn.polarity_value(word.lower())
        except KeyError:
            if log:
                print('An error occurred. Word: ' + word + ' is not known.')
        return value

    def get_word_polarity_numerical_value(self, word, log=True):
        """
        Input : String
        Output : Float in [-1, 1] (or "empty" if the word is unknown)
        """
        value = "empty"
        try:
            value = self.sn.polarity_intense(word.lower())
        except KeyError:
            if log:
                print('An error occurred. Word: ' + word + ' is not known.')
        return value
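# Minimal usage sketch for the Dictionary wrapper above (assumes the
# senticnet package is installed; printed values are illustrative):
d = Dictionary()
print(d.get_word_polarity("love"))                  # e.g. "positive"
print(d.get_word_polarity_numerical_value("love"))  # a value in [-1, 1]
print(d.get_word_polarity("zzxqy"))                 # unknown word: logs and returns "empty"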
import itertools

import nltk
from nltk import FreqDist
from senticnet.senticnet import SenticNet


def Terms_Chooser(data, n_of_words, polarity_threshold):
    """Pick the most frequent tokens whose SenticNet polarity intensity
    exceeds the threshold in absolute value."""
    sn = SenticNet()

    # Tokenize every review and flatten the result into one token list.
    data["Content"] = data["Content"].apply(lambda row: nltk.word_tokenize(row))
    tokens = list(itertools.chain.from_iterable(data["Content"].values.tolist()))

    # Keep the n_of_words most common tokens (fewer if the corpus is small).
    common_words = [word for word, _ in FreqDist(tokens).most_common(n_of_words)]

    # Keep only the words SenticNet knows and whose polarity is strong enough.
    words = []
    for x in common_words:
        try:
            temp = sn.polarity_intense(x)
        except KeyError:
            continue
        if abs(float(temp)) > polarity_threshold:
            words.append(x)
    return words
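# Hypothetical call to Terms_Chooser: a tiny DataFrame with a "Content"
# column, keeping frequent tokens with |polarity intensity| > 0.5.
import pandas as pd

sample = pd.DataFrame({"Content": ["i love this movie", "i hate this movie"]})
print(Terms_Chooser(sample, n_of_words=10, polarity_threshold=0.5))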
def getSentics(self, word):
    """Return [pleasantness, attention, sensitivity, aptitude,
    polarity intensity] for a word, or all zeros if SenticNet
    does not know the word."""
    sn = SenticNet('en')
    try:
        sentics = sn.sentics(word)
        polarity_intensity = sn.polarity_intense(word)
        return [
            float(sentics['pleasantness']),
            float(sentics['attention']),
            float(sentics['sensitivity']),
            float(sentics['aptitude']),
            float(polarity_intensity),
        ]
    except KeyError:
        # Word not in the lexicon: fall back to neutral sentics.
        return [0.0, 0.0, 0.0, 0.0, 0.0]


# ##TESTING AREA
# yas = SenticValuer()
# print(yas.getSentics("awkward"))
from senticnet.senticnet import SenticNet

sn = SenticNet()
print("polarity value:", sn.polarity_value("love"))
print("polarity intense:", sn.polarity_intense("love"))
print("moodtags:", ", ".join(sn.moodtags("love")))
print("semantics:", ", ".join(sn.semantics("love")))
print("\n".join([key + ": " + str(value) for key, value in sn.sentics("love").items()]))
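# Lookups on concepts missing from the lexicon raise KeyError (the snippets
# below catch exactly that), so a guarded variant can be useful;
# "not_a_concept" is a made-up example key:
try:
    print(sn.polarity_value("not_a_concept"))
except KeyError:
    print("concept not found in SenticNet")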
import threading
import time

import pandas as pd
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from senticnet.senticnet import SenticNet
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


class Sarcasm:
    def __init__(self, *args, **kwargs):
        # Load the NLP pipeline and the two sentiment resources.
        self.nlp = spacy.load("en_core_web_sm")
        self.senti = SenticNet()
        self.sid = SentimentIntensityAnalyzer()

        # Load the dataset and drop the unnecessary attribute.
        self.df = pd.read_json("./Sarcasm_Headlines_Dataset.json", lines=True)
        self.df = self.df[:15000]
        self.df.drop(columns="article_link", inplace=True)

        # Store the spaCy-processed headlines.
        self.headlines = []
        self.uni_gram = set()
        self.uni_feature = []
        self.y_ = []
        for i in self.df['headline']:
            self.headlines.append(self.nlp(i))

    def w_score(self, w):
        """
        input: word
        Word score based on nltk's vader_lexicon sentiment analysis
        and SenticNet sentiment analysis.
        """
        ss = self.sid.polarity_scores(w)['compound']
        try:
            sn = float(self.senti.polarity_intense(w))
        except KeyError:
            # Not found in SenticNet: fall back to the VADER score alone.
            return ss
        if ss == 0:
            return sn
        return (sn + ss) / 2

    def sentimentScore(self, sent):
        """
        input: sentence
        Return whether a positive/negative contradiction occurs.
        """
        sum_pos_score = 0
        sum_neg_score = 0
        for w in sent:
            if w.lemma_ == '-PRON-':
                score = self.w_score(w.text)
            else:
                score = self.w_score(w.lemma_)
            if score > 0:
                sum_pos_score += score
            else:
                sum_neg_score += score
        if sum_pos_score > 0 and sum_neg_score < 0:
            return ("contradict", sum_pos_score, sum_neg_score)
        return ("anything", sum_pos_score, sum_neg_score)

    def coherence(self, s1, s2):
        '''
        Input: sentence1, sentence2 (spaCy spans)
        Rule 1: pronoun match feature - reflexive, personal and possessive pronouns.
        Rule 2: string match feature - ignore stop words.
        Rule 3: definite noun phrase - s2's noun phrase starts with "the".
        Rule 4: demonstrative noun phrase - s2's noun phrase starts with
                "this", "that", "these" or "those".
        Rule 5: both proper names - both noun phrases are named entities.
        '''
        # Subject and object of s1.
        sub1 = ""
        obj1 = ""
        for i in s1.noun_chunks:
            if i.root.dep_ == 'nsubj':
                sub1 = i.root
            if i.root.dep_ == 'pobj':
                obj1 = i.root
        for j in s2.noun_chunks:
            if j.root.dep_ == 'nsubj':
                # Rule 1: matching subject pronouns.
                if not isinstance(sub1, str) and sub1.pos_ == 'PRON' and j.root.pos_ == 'PRON':
                    if sub1.text.lower() == j.root.text.lower():
                        return "coherent"
                # Rule 3: definite noun phrase.
                if j[0].text.lower() == 'the':
                    return "coherent"
                # Rule 4: demonstrative noun phrase.
                if j[0].text.lower() in ['this', 'that', 'these', 'those']:
                    return "coherent"
            if j.root.dep_ == 'pobj':
                # Rule 1 applied to matching object pronouns.
                if not isinstance(obj1, str) and obj1.pos_ == 'PRON' and j.root.pos_ == 'PRON':
                    if obj1.text.lower() == j.root.text.lower():
                        return "coherent"
        return "Not coherent"

    def to_string_from_list(self, l):
        st = ""
        for i in l:
            st += i + ' '
        return st.rstrip()

    def n_gram_feature(self, text, n):
        """
        Input: headline as a spaCy doc
        Return the set of n-grams of the given text.
        """
        one_list = []
        for tok in text:
            if not tok.is_punct:
                if tok.lemma_ != '-PRON-':
                    one_list.append(tok.lemma_)
                else:
                    one_list.append(tok.text)
        try:
            one_list.remove(' ')
        except ValueError:
            pass
        # Convert the token list to n-grams.
        _list = []
        for i, t in enumerate(one_list):
            if len(one_list[i:n + i]) >= n:
                _list.append(self.to_string_from_list(one_list[i:n + i]))
        return set(_list)

    def contradiction_feature(self, headline):
        '''
        Contradiction feature.
        input: spaCy-processed headline
        '''
        # Single-sentence headline.
        if len(list(headline.sents)) == 1:
            if self.sentimentScore(headline)[0] == 'contradict':
                return (1, 0)
            return (0, 0)
        # Multi-sentence headline.
        if self.sentimentScore(headline)[0] == 'contradict':
            sent = list(headline.sents)
            i = 0
            while i < len(sent) - 1:  # number of sentences
                if self.coherence(sent[i], sent[i + 1]) != "coherent":
                    return (0, 0)
                i += 1
            return (0, 1)
        return (0, 0)

    def baseline3(self):
        '''
        Use of sentiment analysis + coherence.
        '''
        predictions = []
        for i in self.headlines:
            get = self.contradiction_feature(i)
            if get == (1, 0) or get == (0, 1):
                predictions.append(1)
            else:
                predictions.append(0)
        return (confusion_matrix(self.df['is_sarcastic'], predictions),
                classification_report(self.df['is_sarcastic'], predictions),
                accuracy_score(self.df['is_sarcastic'], predictions))

    def baseline1(self):
        predictions = []
        for p in self.headlines:
            co, _, _ = self.sentimentScore(p)
            if co == 'contradict':
                predictions.append(1)
            else:
                predictions.append(0)
        return (confusion_matrix(self.df['is_sarcastic'], predictions),
                classification_report(self.df['is_sarcastic'], predictions),
                accuracy_score(self.df['is_sarcastic'], predictions))

    def uni_gram_features(self, start, end, n=1):
        self.uni_gram = list(self.uni_gram)
        self.uni_gram = sorted(self.uni_gram)
        index = start
        for p in self.headlines[start:end]:
            uni = [0 for i in range(len(self.uni_gram))]
            for i, j in enumerate(p):
                temp = []
                if len(p[i:n + i]) >= n:
                    for k in range(n):
                        if p[i + k].lemma_ != '-PRON-':
                            temp.append(p[i + k].lemma_)
                        else:
                            temp.append(p[i + k].text)
                    temp = self.to_string_from_list(temp)
                    if temp in self.uni_gram:
                        uni[self.uni_gram.index(temp)] = 1
            self.y_.append(self.df['is_sarcastic'][index])
            index += 1
            self.uni_feature.append(uni)

    def baseline2(self, n=1):
        # Build the n-gram vocabulary over all headlines.
        self.uni_gram = set()
        self.uni_feature = []
        self.y_ = []
        for p in self.headlines:
            self.uni_gram = self.uni_gram.union(self.n_gram_feature(p, n))

        # Extract the feature vectors in four worker threads.
        length = len(self.headlines)
        t1 = threading.Thread(target=self.uni_gram_features, name='t1',
                              args=(0, int(length / 4), n))
        t2 = threading.Thread(target=self.uni_gram_features, name='t2',
                              args=(int(length / 4), int(length / 2), n))
        t3 = threading.Thread(target=self.uni_gram_features, name='t3',
                              args=(int(length / 2), int(3 * length / 4), n))
        t4 = threading.Thread(target=self.uni_gram_features, name='t4',
                              args=(int(3 * length / 4), length, n))
        t1.daemon = True
        t2.daemon = True
        t3.daemon = True
        t4.daemon = True
        st = time.time()
        t1.start()
        t2.start()
        t3.start()
        t4.start()
        t1.join()
        t2.join()
        t3.join()
        t4.join()
        print(f'time taken: {time.time()-st}')

        X_train, X_test, y_train, y_test = train_test_split(
            self.uni_feature, self.y_, test_size=0.33, random_state=42)
        return self.findLINEARSVCResult(X_train, X_test, y_train, y_test)

    def findLINEARSVCResult(self, X_train, X_test, y_train, y_test):
        '''
        Train and evaluate a LinearSVC model.
        '''
        svc_model = LinearSVC()
        svc_model.fit(X_train, y_train)
        predictions = svc_model.predict(X_test)
        return (confusion_matrix(y_test, predictions),
                classification_report(y_test, predictions),
                accuracy_score(y_test, predictions))
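# Sketch of driving the Sarcasm pipeline above (assumes
# Sarcasm_Headlines_Dataset.json is present; processing 15,000 headlines
# through spaCy takes a while):
if __name__ == "__main__":
    s = Sarcasm()
    cm, report, acc = s.baseline1()     # sentiment contradiction only
    print(report, acc)
    cm, report, acc = s.baseline2(n=1)  # unigram features + LinearSVC
    print(report, acc)
    cm, report, acc = s.baseline3()     # contradiction + coherence
    print(report, acc)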
import itertools
import re
import string

import nltk
import pandas as pd
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from senticnet.senticnet import SenticNet


def data_Preprocessing(data, data_test, n_of_words, polarity_threshold):
    Reviews = data["Content"]

    # Keep only ASCII characters.
    # (If another encode/decode method is needed, string.printable is an option.)
    Reviews = Reviews.apply(lambda row: row.encode('ascii', errors='ignore').decode())

    # Set all the content to lower case.
    Reviews = Reviews.apply(lambda row: row.lower())

    # Add to the following variable the characters to delete.
    chars_to_del = "[" + string.punctuation + string.digits + "]"

    # Delete all the chars in "chars_to_del" from each row of the dataframe.
    Reviews = Reviews.apply(lambda row: re.sub(chars_to_del, '', row))

    # Tokenize every word of the data content.
    Token_Reviews = Reviews.apply(lambda row: nltk.word_tokenize(row))

    # Build the list "stop" of elements TO BE REMOVED from the sentences
    # (stopwords, numbers and punctuation).
    stop = stopwords.words("english")

    # Remove all the words in the variable "stop".
    Filtered_Review = Token_Reviews.apply(
        lambda row: [w for w in row if w not in stop])

    # Stem the filtered sentences; some stemmed words:
    # http://snowball.tartarus.org/algorithms/english/stemmer.html
    ps = PorterStemmer()
    for idx in range(0, len(Filtered_Review)):
        Filtered_Review.iloc[idx] = [ps.stem(word)
                                     for word in Filtered_Review.iloc[idx]]

    # Terms choosing: most common words.
    sn = SenticNet()
    Filtered_Review_List = list(itertools.chain.from_iterable(Filtered_Review))
    Words_Frequency = FreqDist(Filtered_Review_List)
    Most_Common_Words = [word for word, _ in Words_Frequency.most_common(n_of_words)]

    # Terms polarity: keep the words whose SenticNet polarity intensity
    # exceeds the threshold in absolute value.
    index = 1
    words_and_polarity = pd.DataFrame(columns=["Word", "Polarity"])
    Selected_Words = []
    for word in Most_Common_Words:
        try:
            temp = sn.polarity_intense(word)
        except KeyError:
            continue
        if abs(float(temp)) > polarity_threshold:
            words_and_polarity.loc[index] = [word, float(temp)]
            index = index + 1
            Selected_Words.append(word)

    # Uncomment to recompute the selected words and their polarity.
    # words_and_polarity.to_csv("Words_and_Polarity.csv", sep=",")

    return data, data_test
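# Hypothetical invocation of data_Preprocessing, assuming train/test
# DataFrames with a "Content" column (the CSV file names are illustrative):
train_df = pd.read_csv("reviews_train.csv")
test_df = pd.read_csv("reviews_test.csv")
train_df, test_df = data_Preprocessing(train_df, test_df,
                                       n_of_words=200, polarity_threshold=0.3)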
import pickle

from senticnet.senticnet import SenticNet

sn = SenticNet()
zeroSen = 0
tp = 0
tn = 0
fp = 0
fn = 0
actT = 0

with open("Dataset.pickle", "rb") as handle:
    pyDS = pickle.load(handle)

for doc in pyDS.DocList:
    totSen = 0
    for w in doc.TermList:
        try:
            sen = sn.polarity_intense(w)
        except KeyError:
            sen = 0
            # if w in slangwords:
            #     sen = slangwords[w]
            # elif w in kaggeleSentiment:
            #     sen = kaggeleSentiment[w]
            # elif w in porvalis:
            #     sen = porvalis[w]
        totSen = totSen + float(sen)
    doc.Sentiment = totSen
    # true positive
    if doc.Class == 1 and totSen < 0.5:
        tp = tp + 1
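    # Hedged completion of the truncated tally above, assuming the same 0.5
    # threshold and class encoding as the true-positive branch:
    elif doc.Class == 1:
        fn = fn + 1
    elif totSen < 0.5:
        fp = fp + 1
    else:
        tn = tn + 1

# Illustrative precision/recall from the counts (zero-division guarded):
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
print(f"precision={precision:.3f} recall={recall:.3f}")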
# Each line of the corpus must correspond to one document of the corpus.
import boc
from senticnet.senticnet import SenticNet

# boc_model = boc.BOCModel(doc_path="input corpus path")
boc_model = boc.BOCModel('text.txt')
# boc_model.context = text

# The output can be saved with the save_path parameter.
boc_matrix, word2concept_list, idx2word_converter = boc_model.fit()

# SenticNet lexicon lookup; `text` is assumed to hold the concept to look up.
sn = SenticNet()
concept_info = sn.concept(text)
polarity_value = sn.polarity_value(text)
polarity_intense = sn.polarity_intense(text)
moodtags = sn.moodtags(text)
semantics = sn.semantics(text)
sentics = sn.sentics(text)

print('==================================')
print('text: ', text)
print('concept_info: ', concept_info)
print('polarity_value: ', polarity_value)
print('polarity_intense: ', polarity_intense)
print('moodtags: ', moodtags)
print('semantics: ', semantics)
print('sentics: ', sentics)
print('==================================')
# (Fragment of a longer review-analysis script; sn = SenticNet(),
#  clean_reviews_final, WordCloud, word_tokenize, pd and plt are defined earlier.)
wordcloud = WordCloud().generate_from_frequencies(
    negative_words)  # wordcloud using frequencies (this needs a dictionary object)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# Most positive and most negative reviews.
Review_polarity_df = pd.DataFrame()
for review in clean_reviews_final.without_stopwords:
    tokens = word_tokenize(review)
    count = 0
    pol = 0
    for word in tokens:
        if word in sn.data:
            pol = pol + float(sn.polarity_intense(word))
            count = count + 1
    if count > 0:
        polarity = pol / count
    else:
        polarity = 0
    temp_df = pd.DataFrame([review, polarity]).T
    Review_polarity_df = pd.concat([Review_polarity_df, temp_df],
                                   ignore_index=True)

Review_polarity_df.columns = ['review', 'polarity']
Reviews_Sorted = Review_polarity_df.sort_values(by=['polarity'],
                                                ascending=[False])
Most_positive_Reviews = Review_polarity_df.sort_values(
# (Fragment of a feature-extraction loop; datas, count, tags, tokens, w, sn,
#  bl_sent and the POS-tag sets adj/adv/nn/vb are defined earlier.)
                    datas[idx]['ContextVector'][1] += count[f]['sent']
            if tags[tokens.index(w)][1] in nn:
                if w in count.keys():
                    datas[idx]['ContextVector'][2] = count[w]['sent']
            elif tags[tokens.index(w)][1] in vb:
                if w in count.keys():
                    datas[idx]['ContextVector'][3] = count[w]['sent']
                    # print(sentiment_feature[f][w]['sent'])

        for word, tag in tags:
            if tag in adj:
                if word in count.keys():
                    datas[idx]['DsVector'][0] += count[word]['sent']
                    datas[idx]['PmiVector'][0] += count[word]['PMI_sent']
                if word in sn.data.keys():
                    datas[idx]['SnVector'][0] += float(sn.polarity_intense(word))
                if word in bl_sent.keys():
                    datas[idx]['BlVector'][0] += bl_sent[word]
            elif tag in adv:
                if word in count.keys():
                    # The original wrote this first increment to SnVector[1],
                    # which looks like a copy-paste slip: it mirrors the
                    # adjective branch, and SenticNet is added to SnVector below.
                    datas[idx]['DsVector'][1] += count[word]['sent']
                    datas[idx]['PmiVector'][1] += count[word]['PMI_sent']
                if word in sn.data.keys():
                    datas[idx]['SnVector'][1] += float(sn.polarity_intense(word))
                if word in bl_sent.keys():
                    datas[idx]['BlVector'][1] += bl_sent[word]
            elif tag in nn:
                if word in count.keys():
                    datas[idx]['DsVector'][2] += count[word]['sent']
                    datas[idx]['PmiVector'][2] += count[word]['PMI_sent']
                if word in sn.data.keys():
import os
import pickle
import re

from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from senticnet.senticnet import SenticNet
from sentistrength import PySentiStr
from sklearn.feature_extraction.text import TfidfVectorizer

# CODE_PATH / PICKLES_PATH and the Sequence*Count helper functions are
# assumed to be defined elsewhere in the project.


def pre_process_and_predict(sentence):
    wordnet_lemmatizer = WordNetLemmatizer()

    # Replace double quotes with single quotes within the string.
    sentence = sentence.replace("\"", "\'")

    # Remove unnecessary special characters, keeping only , ! ?
    sentence = re.sub(r"[^!?,a-zA-Z0-9\ ]+", '', sentence)

    # Lemmatize verbs.
    sentence = ' '.join([
        wordnet_lemmatizer.lemmatize(word, pos='v')
        for word in word_tokenize(sentence)
    ])

    sn = SenticNet()
    senti = PySentiStr()
    senti.setSentiStrengthPath(CODE_PATH + '/sentistrength/SentiStrength.jar')
    senti.setSentiStrengthLanguageFolderPath(
        CODE_PATH + '/sentistrength/SentStrength_Data/')

    # Per-sentence positive/negative scores, averaging SentiStrength and
    # SenticNet intensities.
    sentiment_score = []
    for sen in sent_tokenize(sentence):
        senti_pos, senti_neg = senti.getSentiment(sen, score='dual')[0]
        senti_pos -= 1
        if senti_neg == -1:
            senti_neg = 0
        sum_pos_score = 0
        sum_neg_score = 0
        for word in word_tokenize(sen):
            try:
                w_score = float(sn.polarity_intense(word)) * 5
            except KeyError:
                w_score = 0
            if w_score > 0:
                sum_pos_score = sum_pos_score + w_score
            elif w_score < 0:
                sum_neg_score = sum_neg_score + w_score
        sum_pos_score = (sum_pos_score + senti_pos) / 2
        sum_neg_score = (sum_neg_score + senti_neg) / 2
        sentiment_score.append((sum_pos_score, sum_neg_score))

    # Bucket the scores into low/medium/high indicator features.
    additional_features_s = []
    additional_features_ns = []
    contra = []
    pos_low, pos_medium, pos_high = [], [], []
    neg_low, neg_medium, neg_high = [], [], []
    for sum_pos_score, sum_neg_score in sentiment_score:
        contra.append(int(sum_pos_score > 0 and abs(sum_neg_score) > 0))
        pos_low.append(int(sum_pos_score < 0))
        pos_medium.append(int(sum_pos_score >= 0 and sum_pos_score <= 1))
        pos_high.append(int(sum_pos_score >= 2))
        neg_low.append(int(sum_neg_score < 0))
        neg_medium.append(int(sum_neg_score >= 0 and sum_neg_score <= 1))
        neg_high.append(int(sum_neg_score >= 2))
    additional_features_s = additional_features_s + [
        max(pos_medium), max(pos_high), max(neg_medium), max(neg_high)
    ]
    additional_features_ns = additional_features_ns + [
        max(pos_low), max(neg_low)
    ]

    # Surface features of the raw tweet.
    tweet = sentence
    punctuation_count = SequencePunctuationCount(tweet)
    character_count = SequenceCharacterCount(tweet)
    capitalized_count = CapitalizedCount(tweet)
    exclamation_count = ExclamationCount(tweet)
    # emoji_count = EmojiCount(tweet)
    f_count = [
        punctuation_count, character_count, capitalized_count,
        exclamation_count
    ]
    for count in f_count:
        f_low = int(count == 0)
        f_medium = int(count >= 1 and count <= 3)
        f_high = int(count >= 4)
        additional_features_s = additional_features_s + [f_medium, f_high]
        additional_features_ns = additional_features_ns + [f_low]

    # TF-IDF over the stored vocabulary, then the pickled model's prediction.
    X = [sentence]
    with open(os.path.join(PICKLES_PATH, "vocab.pickle"), "rb") as in_file:
        vocab = pickle.load(in_file)
    with open(os.path.join(PICKLES_PATH, "model.pickle"), "rb") as in_file:
        model = pickle.load(in_file)
    vectorizer = TfidfVectorizer(vocabulary=vocab)
    X = vectorizer.fit_transform(X)
    ans = int(sum(model.predict(X)))

    print('Sentence : ', sentence)
    print('Sarcastic features : ', additional_features_s)
    print('Not Sarcastic features : ', additional_features_ns)
    print('Contradict : ', max(contra))
    print('Model Predict : ', ans)
    print('My obs : ',
          int((sum(additional_features_s) >= sum(additional_features_ns))
              and max(contra) == 1))
    print('Final Prd : ', end='')
    if ans == 1 or ((sum(additional_features_s) >= sum(additional_features_ns))
                    and max(contra) == 1):
        return True
    else:
        return False
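# Illustrative call (requires vocab.pickle / model.pickle and the
# SentiStrength jar to be in place; the input sentence is made up):
print(pre_process_and_predict("Oh great, another Monday morning!"))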
from senticnet.senticnet import SenticNet

teste = []
sn = SenticNet('pt')  # Portuguese lexicon; 'amor' is Portuguese for 'love'
concept_info = sn.concept('amor')
polarity_value = sn.polarity_value('amor')
polarity_intense = sn.polarity_intense('amor')
moodtags = sn.moodtags('amor')
semantics = sn.semantics('amor')
sentics = sn.sentics('amor')
teste.append(concept_info)
print(teste)
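# The same lookup against the default English lexicon, for comparison:
sn_en = SenticNet()
print(sn_en.concept('love'))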