def large_scale_visual_sentiment(vg_en_tn_prdct):
    lexicon = Empath()
    vg_en_tn_prdct_sentiments = defaultdict(int)
    for row in vg_en_tn_prdct:
        for tensorproduct in row:
            tpedges = tensorproduct.edges()
            tpnodes = tensorproduct.nodes()
            print("Edges:", tpedges)
            print("Nodes:", tpnodes)
            for tpedge in tpedges:
                # each endpoint of a tensor-product edge is itself a pair of labels
                for label in (tpedge[0][0], tpedge[0][1], tpedge[1][0], tpedge[1][1]):
                    if isinstance(label, bytes):
                        label = label.decode("utf-8")
                    sentiment = lexicon.analyze(label)
                    for k, v in sentiment.items():
                        vg_en_tn_prdct_sentiments[k] += v
    print("Sentiment Analysis of the Video:",
          sorted(vg_en_tn_prdct_sentiments.items(),
                 key=operator.itemgetter(0), reverse=True))
    return vg_en_tn_prdct_sentiments
def n_analyze_emotion():
    print("Analyzing emotions from News API")
    global CATS, CATS_DICT
    lexicon = Empath()
    descrip_list = []
    DBNAME = 'final.db'
    conn = sqlite3.connect(DBNAME)
    cur = conn.cursor()
    statement = 'SELECT text from news_api'
    cur.execute(statement)
    for row in cur:
        descrip_list.append(row[0])

    # entire corpus
    str1 = ''.join(descrip_list)
    print(str1)
    n_empath_dict = lexicon.analyze(str1, categories=CATS, normalize=True)
    n_empath_dict_new = {}
    for key in n_empath_dict:
        n_empath_dict_new[key] = n_empath_dict[key] * 1000

    # row by row
    counter = 0
    for row in descrip_list:
        n_row_empath_dict = lexicon.analyze(row, categories=CATS, normalize=True)
        # print(len(list(n_row_empath_dict.keys())))  # this is 17 long
        n_row_empath_dict_new = {}
        for key in n_row_empath_dict:
            n_row_empath_dict_new[key] = n_row_empath_dict[key] * 1000
        # print(len(list(n_row_empath_dict_new.keys())))  # this is 17 long
        counter += 1
        vals_list = list(n_row_empath_dict_new.values())
        vals_list.insert(0, None)
        vals_list.insert(1, 999)
        vals_list.insert(2, 999)
        vals_list.insert(3, counter)
        vals_list.insert(4, "news_api")
        insertion = tuple(vals_list)
        DBNAME = 'final.db'
        conn = sqlite3.connect(DBNAME)
        cur = conn.cursor()
        statement = '''
            INSERT INTO "Emotions"
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        '''
        cur.execute(statement, insertion)
        conn.commit()
    return n_empath_dict_new
def command(self, dataframe, selector, aggregate_scores):
    documents = selector.to_matrix().flatten()
    print(documents)
    import numpy as np
    from empath import Empath
    lexicon = Empath()
    to_df = []
    if aggregate_scores == "aggregate":
        out_dict = lexicon.analyze(documents.tolist(), normalize=True)
        for k, v in sorted(out_dict.items(), key=lambda x: x[1], reverse=True):
            to_df.append([k, v])
        return iris_objects.IrisDataframe(
            column_names=["category", "normalized_count"],
            column_types=["String", "Number"],
            data=to_df)
    else:
        out_scores = [order_keys(lexicon.analyze(d, normalize=True))
                      for d in documents.tolist()]
        return iris_objects.IrisDataframe(column_names=order_keys.s_keys,
                                          data=out_scores)
def get_post_metrics(df, post_file):
    # define empath
    lexicon = Empath()
    # these are the eleven categories we will use
    eleven_categories = [
        'family', 'friends', 'home', 'sexual', 'swears', 'work', 'leisure',
        'money', 'body', 'religion', 'health'
    ]
    # clean the post data
    post_df = cleaning(post_file)  # needs the cleaning function from above!
    # find the post frequency for each friend
    df['post frequency'] = 0
    df['empath'] = 0
    for friend in df.name:
        ind = df.loc[df.name == friend].index[0]
        title = post_df[post_df['title'].str.contains(friend, na=False)]
        tags = post_df[post_df['tags'].str.contains(friend, na=False)]
        friend_post = pd.concat([title, tags])
        df.at[ind, 'post frequency'] = len(friend_post.index)
        # if there is a post, run the empath analysis
        if df.loc[ind, 'post frequency'] != 0:
            # sum the empath analysis for each post over the eleven categories
            friend_post['empath'] = friend_post['post'].apply(lambda x: sum(
                lexicon.analyze(x, categories=eleven_categories).values()))
            # average the empath score across the friend's posts
            df.at[ind, 'empath'] = np.mean(friend_post['empath'])
    return df
def hateLikeMaker(tweets):
    lexicon = Empath()
    likeness = defaultdict(int)
    for i in tweets:
        sents = sent_tokenize(i)
        for j in sents:
            j = re.sub(r'[^\w\s]', '', j)
            a = lexicon.analyze(j)
            if a['negative_emotion'] == 1:
                print("TRUE")
                for k, l in a.items():
                    # skip the negative_emotion category itself when penalising
                    if l == 1 and k != 'negative_emotion':
                        likeness[k] -= 1
            else:
                print("FALSE")
                for k, l in a.items():
                    if l == 1:
                        likeness[k] += 1
    if 'hate' in likeness:
        likeness.pop('hate')
    if 'envy' in likeness:
        likeness.pop('envy')
    likeness = sorted(likeness.items(), key=operator.itemgetter(1))
    dislikes = likeness[:3]
    length = len(likeness)
    likes = likeness[length - 3:length]
    likes = dict(likes)
    dislikes = dict(dislikes)
    return likes, dislikes
def train_text_classification(file):
    import pandas as pd
    train = pd.read_csv(file)
    train = train.dropna()
    train = pd.DataFrame(train.labeldata.str.split('\r\r\n').tolist(),
                         index=train.labelname).stack()
    train = train.reset_index()[[0, 'labelname']]  # var1 variable is currently labeled 0
    train.columns = ['labeldata', 'labelname']  # renaming var1

    # unescape HTML entities in the text column
    import html
    train['labeldata'] = train['labeldata'].apply(html.unescape)

    # Shuffle data
    train = train.sample(frac=1, random_state=1).reset_index(drop=True)
    train = train.dropna()
    train = train[train['labeldata'] != '']

    from empath import Empath
    lexicon = Empath()
    train_features = []
    for data in train['labeldata']:
        feature = lexicon.analyze(data, normalize=True)
        train_features.append(feature)
    train_features = pd.DataFrame(train_features)

    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(solver='sag')
    model.fit(train_features, train['labelname'])
    return model
def empath_extraction(text):
    # You need to 'pip install empath' first
    from empath import Empath
    lexicon = Empath()

    # Get the Empath result.
    # Note: the order of categories in the Empath output is not fixed!
    result = lexicon.analyze(text, normalize=True)

    # Filter out the categories whose score is zero
    output = dict()
    for (k, v) in result.items():
        if v > 0:
            output[k] = v

    # You can either print the result directly,
    # with the format '#the class#':#the score#'
    # (change the form of the output as you need):
    # for (k, v) in output.items():
    #     print(k + "-" + str(v))

    # Or you can read the predefined classes from ./empathClass.txt and convert
    # each class name into a number, which may be more convenient and efficient
    # for other programs to process, but will slow this script down.
    class_empath = dict()
    with open(r"./empathClass.txt") as file:
        for line in file:
            tmp = line.split(",")
            for index, item in enumerate(tmp):
                class_empath[item] = index

    output_convert = []
    for (k, v) in output.items():
        output_convert.append([str(class_empath[k]), str(v)])

    # Change the form of the output as you need
    for item in output_convert:
        print("-".join(item))
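# A minimal invocation sketch for empath_extraction (an assumption, not part of
# the snippet above): it presumes empath is installed and that ./empathClass.txt
# is a comma-separated list of Empath category names. Each printed line then
# has the form "<class index>-<score>".
# empath_extraction("The soldiers fought bravely but feared the long war.")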
def subcommand_sentiment(texts, docnames, args):
    nlp = spacy.load('en')
    lexicon = Empath()

    if args.posneg_only:
        cats = ['positive_emotion', 'negative_emotion']
    else:
        cats = None  # all the categories

    analyze = lambda t: lexicon.analyze(t, categories=cats,
                                        normalize=not args.no_normalize)
    sentiments = [analyze(t) for t in texts]
    df = pd.DataFrame(sentiments, index=docnames)
    summarydf = make_summary(df)

    sheets = list()
    if args.human_readable:
        hdf = make_human_report(df)
        sheets.append(('report', hdf))
    else:
        sheets.append(('report', df))
    sheets.append(('summary', summarydf))

    final_fname = write_report(
        args.outfile,
        sheets,
        hdf_if_fail=not args.nohdfonfail and not args.human_readable,
        verbose=True,
    )
    return final_fname
def use_text_classification(file, model):
    import pandas as pd
    test = pd.read_csv(file)
    test = test.dropna()
    test = pd.DataFrame(test.labeldata.str.split('\r\r\n').tolist(),
                        index=test.labelname).stack()
    test = test.reset_index()[[0, 'labelname']]  # var1 variable is currently labeled 0
    test.columns = ['labeldata', 'labelname']  # renaming var1

    # unescape HTML entities in the text column
    import html
    test['labeldata'] = test['labeldata'].apply(html.unescape)

    test = test.dropna()
    test = test[test['labeldata'] != '']

    from empath import Empath
    lexicon = Empath()
    test_features = []
    for sentence in test['labeldata']:
        feature = lexicon.analyze(sentence, normalize=True)
        test_features.append(feature)
    test_features = pd.DataFrame(test_features)

    prediction = model.predict(test_features)
    test['prediction'] = prediction
    return test
def count_unconnect(u):
    # hopefully this is a fairly diverse group
    lexicon = Empath()
    # print(len(u))
    lexicon.create_category("support", support, model="nytimes")
    lexicon.create_category("conflict", conflict, model="nytimes")
    lexicon.create_category("conclusion", conclusion, model="nytimes")
    lexicon.create_category("complementary", complementary, model="nytimes")
    lexicon.create_category("causal_argument", causal_argument, model="nytimes")
    lexicon.create_category("verbs_hedging", verbs_hedging, model="nytimes")
    # ["because", "only", "before", "so", "if", "though", "then", "until", "once", "even", "since", "although", "so", "while", "having", "because", "already", "thus", "time", "unless", "now", "actually", "eventually"]
    # ["though", "although", "except", "yet", "but", "even", "because", "only", "Though", "Although", "Yet", "either", "nevertheless", "whereas", "though", "fact", "however", "unlike", "Furthermore", "because", "nonetheless", "And", "However", "none", "either", "still", "Even", "despite", "if", "so", "Yet", "meaning", "indeed", "consequently"]
    # []
    # ["while", "whereas", "though", "only", "yet", "While", "thus", "even", "Thus", "Instead", "although", "instead", "Though", "Moreover", "actually", "nevertheless", "sometimes", "still", "rather"]
    # ["means", "therefore", "means", "merely", "mechanism", "democratic_process", "Therefore", "simply", "free_market", "consequence", "because"]

    # cat_all = lexicon.analyze(u, categories=["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_all = lexicon.analyze(u, categories=['verbs_hedging'], normalize=True)
    # cat_all = {}
    # for arg in u:
    #     cat = lexicon.analyze(arg)
    #     if cat["children"] != 0:
    #         print(arg, cat["children"])
    return cat_all
def analyze_tokens(word_list, topk=10):
    lexicon = Empath()
    word_list_analyzed = lexicon.analyze(word_list, normalize=True)
    return sorted(word_list_analyzed.items(),
                  key=lambda kv: kv[1], reverse=True)[:topk]
def create_empath_cats(text):
    lexicon = Empath()
    try:
        cat_scores = lexicon.analyze(text, normalize=True)
    except Exception as e:
        print(e)
        return 0
    return pd.Series(cat_scores)
def executeEmpathOnISEAR(ISEAR, DATADIR):
    try:
        corpus = pd.read_csv(ISEAR, sep=',', header=None)
        if not os.path.isfile(DATADIR + "/labels_empath_on_ISEAR.txt"):
            lexicon = Empath()  # instance of the empath analyser
            emotions_list = ['fear', 'joy', 'anger', 'sadness', 'disgust']
            model = "reddit"
            res = {}
            best_em = []  # will contain the empath analysis results
            emotions_results = []
            for i in range(len(emotions_list)):
                # create a category for each emotion
                lexicon.create_category(emotions_list[i], [emotions_list[i]], model=model)
            for sentence in corpus[1]:
                for k in range(len(emotions_list)):
                    # tokenize and analyze the sentences
                    tokens = nltk.word_tokenize(sentence)
                    emotions_results = lexicon.analyze(tokens, normalize=True,
                                                       categories=[emotions_list[k]])
                    res = {**res, **emotions_results}  # merge all results into one dictionary
                max_likely_emotions_empath = max(res.items(), key=operator.itemgetter(1))[0]
                if res[max_likely_emotions_empath] != 0.0:
                    best_em.append(max_likely_emotions_empath)
                else:
                    best_em.append('no_idea')
            best_em = np.asarray(best_em)
            # save the empath detections
            np.savetxt(DATADIR + "/labels_empath_on_ISEAR.txt", best_em, fmt="%s")

        # ---------------------------- if labels already exist: ----------------------------
        ISEAR_labels = corpus[0]
        empath_labels = pd.read_csv(DATADIR + '/labels_empath_on_ISEAR.txt', sep=',', header=None)
        detected_labels = [ISEAR_labels[i] for i in range(len(ISEAR_labels))
                           if empath_labels[0][i] != 'no_idea']
        matches = [ISEAR_labels[i] for i in range(len(ISEAR_labels))
                   if empath_labels[0][i] == ISEAR_labels[i]]
        detected_percentage = len(detected_labels) / len(ISEAR_labels)
        overall_accuracy = len(matches) / len(ISEAR_labels)
        detected_accuracy = len(matches) / len(detected_labels)
        print('detected_percentage:', detected_percentage)
        print('detected_accuracy:', detected_accuracy)
        print('overall_accuracy:', overall_accuracy)
        return 0
    except Exception as e:
        print(str(e))
        return 51
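# A tiny worked example of the three metrics above (illustrative numbers only,
# not taken from ISEAR): suppose 10 sentences, Empath assigns an emotion to 8
# of them ('no_idea' for the other 2), and 6 of those 8 match the gold labels.
#   detected_percentage = 8 / 10 = 0.8   (share of sentences Empath labeled at all)
#   detected_accuracy   = 6 / 8  = 0.75  (accuracy among the labeled sentences)
#   overall_accuracy    = 6 / 10 = 0.6   (accuracy over all sentences)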
def Empath_List(List):
    lexicon = Empath()
    EMs = {}
    for text in List:
        EM = lexicon.analyze(text, normalize=True)
        # v = max(EM, key=EM.get)
        EMs[text] = [(k, v) for k, v in EM.items() if v != 0]
    return EMs
def count_connect(u):
    cat_all = {}
    lexicon = Empath()
    lexicon.create_category("support", support, model="nytimes")
    lexicon.create_category("conflict", conflict, model="nytimes")
    lexicon.create_category("conclusion", conclusion, model="nytimes")
    lexicon.create_category("complementary", complementary, model="nytimes")
    lexicon.create_category("causal_argument", causal_argument, model="nytimes")
    lexicon.create_category("verbs_hedging", verbs_hedging, model="nytimes")

    heads = []
    not_heads = []
    for (arg1, arg2) in u:
        heads.append(arg1)
        not_heads.append(arg2)
    norep_heads = list(set(heads))
    norep_not_heads = list(set(not_heads))
    args_conn = list(set(heads) | set(not_heads))

    lexicon = Empath()
    # cat_heads = lexicon.analyze(norep_heads, categories=["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_heads = lexicon.analyze(norep_heads, categories=['verbs_hedging'], normalize=True)
    # cat_heads = {}
    # for h in norep_heads:
    #     cat_heads = lexicon.analyze(h, normalize=True)
    #     if cat_heads["fun"] != 0:
    #         print(h, cat_heads["fun"])

    # cat_not_heads = lexicon.analyze(norep_not_heads, categories=["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_not_heads = lexicon.analyze(norep_not_heads, categories=['verbs_hedging'], normalize=True)

    # cat_all = lexicon.analyze(args_conn, categories=["support", "conflict", "conclusion", "complementary", "causal_argument"], normalize=True)
    cat_all = lexicon.analyze(args_conn, categories=['verbs_hedging'], normalize=True)
    return cat_heads, cat_not_heads, cat_all
def u_analyze_emotion():
    print("Analyzing emotions from SOTU")
    global CATS, CATS_DICT
    lexicon = Empath()
    descrip_list = []
    DBNAME = 'final.db'
    conn = sqlite3.connect(DBNAME)
    cur = conn.cursor()
    statement = 'SELECT text from sotu'
    cur.execute(statement)
    for row in cur:
        descrip_list.append(row[0])

    # entire corpus
    str1 = ''.join(descrip_list)
    u_empath_dict = lexicon.analyze(str1, categories=CATS, normalize=True)
    u_empath_dict_new = {}
    for key in u_empath_dict:
        # print(key, 'corresponds to', u_empath_dict[key] * 1000)
        u_empath_dict_new[key] = u_empath_dict[key] * 1000

    # row by row
    counter = 0
    for row in descrip_list:
        u_row_empath_dict = lexicon.analyze(row, categories=CATS, normalize=True)
        u_row_empath_dict_new = {}
        for key in u_row_empath_dict:
            u_row_empath_dict_new[key] = u_row_empath_dict[key] * 1000
        counter += 1
        vals_list = list(u_row_empath_dict_new.values())
        vals_list.insert(0, None)
        vals_list.insert(1, 999)
        vals_list.insert(2, counter)
        vals_list.insert(3, 999)
        vals_list.insert(4, "sotu")
        insertion = tuple(vals_list)
        statement = '''
            INSERT INTO "Emotions"
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        '''
        cur.execute(statement, insertion)
        conn.commit()
    return u_empath_dict_new
def command(self, documents):
    documents = documents.to_matrix().flatten()
    import numpy as np
    from empath import Empath
    lexicon = Empath()
    to_df = []
    out_dict = lexicon.analyze(documents.tolist(), normalize=True)
    for k, v in sorted(out_dict.items(), key=lambda x: x[1], reverse=True):
        to_df.append([k, v])
    return iris_objects.IrisDataframe(
        column_names=["category", "normalized_count"],
        column_types=["String", "Number"],
        data=to_df)
def get_raw_empath_categories_for_topics(model):
    lex = Empath()
    categories = []
    for topic in model["topics"]:
        word_categories = {}
        for word in topic:
            result = lex.analyze(word)
            word_categories[word] = [
                key for key in result.keys() if result[key] > 0
            ]
        categories.append(word_categories)
    return categories
def process_lexicon(texts):
    lexicon = Empath()
    data = {}
    for i, text in texts.items():
        data[i] = [
            k for k, v in lexicon.analyze(text, normalize=False).items() if v > 0
        ]
        print("{:<5}%".format(round(i * 100 / len(texts), 2)), end='\r')
    return data
def parseInput(self, input):
    self.history.append(input)
    # topic modeling and additional topic generation
    lexicon = Empath()
    topicVector = lexicon.analyze(input, normalize=False)
    topics = []
    for key in topicVector.keys():
        if topicVector[key] > 0:
            topics.append(key)
    self.topics = topics
    return topics
def semantics(df):
    df['Clean_Text'] = df['Text'].apply(lambda x: clean_text(x))
    df.dropna(inplace=True)
    lexicon = Empath()

    # score each article against every Empath category
    semantic = []
    for article in df['Clean_Text']:
        d = lexicon.analyze(article, normalize=False)
        x = []
        for key, value in d.items():
            x.append(value)
        x = np.asarray(x)
        semantic.append(x)
    df['Semantic'] = semantic

    # collect the category names (analyzing an empty string returns every
    # category with a zero count)
    categories = []
    a = lexicon.analyze("")
    for key, value in a.items():
        categories.append(key)

    # replace each article's text with its category names, repeated once per count
    sem = []
    for i in range(df.shape[0]):
        a = []
        for j in range(len(semantic[0])):
            for k in range(int(semantic[i][j])):
                a.append(categories[j])
        b = " ".join(a)
        sem.append(b)
    df['Semantics'] = sem
    data = df['Semantics']
    return data
def command(self, documents, top_n):
    import numpy as np
    from empath import Empath
    lexicon = Empath()
    data = np.array([
        order_keys(lexicon.analyze(doc, normalize=True)) for doc in documents
    ])
    types_ = ["Number" for _ in order_keys.s_keys]
    return top_n, iris_objects.IrisDataframe(
        column_names=order_keys.s_keys,
        column_types=types_,
        data=data,
        do_conversion=False)
def empath_vector(text):
    """
    Returns a normalised vector (list) of 15 hand-picked categories from
    Empath: http://empath.stanford.edu/
    """
    categories = [
        'hate', 'aggression', 'dispute', 'swearing_terms', 'ridicule',
        'exasperation', 'fight', 'politeness', 'disgust', 'rage', 'warmth',
        'sadness', 'shame', 'negative_emotion', 'positive_emotion'
    ]
    lex = Empath()
    d = lex.analyze(text, categories=categories, normalize=True)
    if d is None:
        return 15 * [0.0]
    return list(d.values())
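# A short usage sketch for empath_vector (the sentence is illustrative and the
# exact scores depend on the installed Empath lexicon):
vec = empath_vector("I hate this and it makes me furious")
print(len(vec))  # 15, one score per hand-picked category
print(max(vec))  # the strongest of the fifteen signals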
def empath_analytics(speech: str) -> list:
    categories_to_include = [
        'hate', 'cheerfulness', 'aggression', 'envy', 'anticipation',
        'masculine', 'pride', 'dispute', 'nervousness', 'weakness', 'horror',
        'swearing_terms', 'suffering', 'art', 'ridicule', 'optimism', 'divine',
        'fear', 'religion', 'worship', 'confusion', 'death', 'violence',
        'dominant_heirarchical', 'neglect', 'dominant_personality', 'love',
        'order', 'sympathy', 'trust', 'deception', 'politeness', 'disgust',
        'sadness', 'ugliness', 'lust', 'torment', 'politics', 'power',
        'disappointment', 'pain', 'negative_emotion', 'competing', 'friends',
        'achievement', 'feminine', 'positive_emotion'
    ]
    lexicon = Empath()
    results = lexicon.analyze(speech, categories=categories_to_include)
    output = {}
    for (key, value) in results.items():
        if value != 0:
            output[key] = value
    # return the five highest-scoring categories
    return sorted(output, key=output.get, reverse=True)[0:5]
def ts_mod(tokens):
    """
    This function implements the topic-signal approach of Empath. Empath uses a
    trained (neural-network based) word category lexicon with the aim of
    detecting topic signals in tokenized text. It then sorts the topics by
    value and shortlists the 10 highest-ranked topics.

    :param tokens: tokenized list of words, e.g. ["cheese", "fighting", "dog", "cold", "man", "war"]
    :return: dictionary (created by empath) shortlist of the 10 highest-ranked
             topics - key: detected topic / value: calculated importance
    """
    lexicon = Empath()
    lexicon = lexicon.analyze(tokens, normalize=True)
    if lexicon is None:
        # analyze() returns None for an empty token list when normalize=True
        return None
    topics = threshold_filter(lexicon)
    topics_sorted = sort_topics_by_value(topics)
    topics_shortlist = shortlist_topics(topics_sorted)
    return topics_shortlist
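# threshold_filter, sort_topics_by_value and shortlist_topics are defined
# elsewhere in the original project; below is a minimal sketch of one plausible
# implementation, assuming a zero threshold and a shortlist length of 10:
def threshold_filter(scores, threshold=0.0):
    # keep only topics whose normalized score exceeds the threshold
    return {topic: value for topic, value in scores.items() if value > threshold}


def sort_topics_by_value(topics):
    # highest-scoring topics first
    return sorted(topics.items(), key=lambda kv: kv[1], reverse=True)


def shortlist_topics(topics_sorted, n=10):
    # keep the n top-ranked topics, preserving the ranking order
    return dict(topics_sorted[:n])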
def analyze_tweets_liwc(tweets):
    """
    Uses the Empath library to gather topics found in labeled tweet data.

    Keyword arguments:
    tweets -- list of labeled tweet objects
    """
    lexicon = Empath()
    results = {CATEGORY_HATE: {}, CATEGORY_NON_HATE: {}}
    num_hate = num_non_hate = 0

    for tweet in tweets:
        category = ""
        text = clean_tweet_text(get_tweet_text(tweet))
        if tweet["hate_speech"]:
            category = CATEGORY_HATE
            num_hate += 1
        else:
            category = CATEGORY_NON_HATE
            num_non_hate += 1
        topics = lexicon.analyze(text, normalize=False)
        for topic in topics.keys():
            if topics[topic] > 0:
                if topic in results[category]:
                    results[category][topic] += topics[topic]
                else:
                    results[category][topic] = topics[topic]

    # Sort the topics by total raw counts
    results[CATEGORY_HATE] = sorted(results[CATEGORY_HATE].items(),
                                    key=lambda kv: (kv[1], kv[0]), reverse=True)
    results[CATEGORY_NON_HATE] = sorted(results[CATEGORY_NON_HATE].items(),
                                        key=lambda kv: (kv[1], kv[0]), reverse=True)

    # Normalize topic counts by dividing by the total number of tweets in each category
    results[CATEGORY_HATE] = [(x, y / num_hate) for x, y in results[CATEGORY_HATE]]
    results[CATEGORY_NON_HATE] = [(x, y / num_non_hate) for x, y in results[CATEGORY_NON_HATE]]

    return results
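# A hedged usage sketch for analyze_tweets_liwc. The tweet schema below is an
# assumption (the real objects come from the project's labeling pipeline and
# are read through get_tweet_text / clean_tweet_text):
# sample_tweets = [
#     {"text": "you are all disgusting idiots", "hate_speech": True},
#     {"text": "had a lovely walk with friends today", "hate_speech": False},
# ]
# results = analyze_tweets_liwc(sample_tweets)
# results[CATEGORY_HATE] and results[CATEGORY_NON_HATE] are lists of
# (topic, per-tweet average count) pairs, sorted by raw count.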
class LexiconFeatures:
    def __init__(self):
        self.lexicon = Empath()

    def tokenize(self, text):
        text = [str(w) for w in tokenizer(text)]
        return text

    def get_features(self, text):
        features = list(self.lexicon.analyze(text, normalize=True).values())
        features = torch.as_tensor([features])
        return features

    def parse_sentences(self, sentences):
        sent_features = []
        for sent in sentences:
            sent_features.append(self.get_features(sent))
        sent_features = torch.cat(sent_features, dim=0)
        print("Empath features: {}".format(sent_features.shape))
        return sent_features
def get_empath(self, empathCol):
    """Get empath score."""
    tweet_dict = self.convert_dict()
    lexicon = Empath()
    empath_dict = {}
    for tweetid, tweet in tweet_dict.items():
        result = lexicon.analyze(tweet, normalize=True)
        empath_dict[tweet] = result[empathCol]
    with open(self.path + 'empath.json', 'a') as f:
        json.dump(empath_dict, f)
    # empath_df = pd.DataFrame.from_dict(data, orient='index')
    # empath_df['tweet_id'] = empath_df.index
    # empath_df.columns = [empathCol, 'tweet_id']
    return empath_dict
class LexiconFeatures:
    def __init__(self):
        self.lexicon = Empath()

    def tokenize(self, text):
        text = [str(w) for w in tokenizer(text)]
        return text

    def get_features(self, text):
        features = list(self.lexicon.analyze(text, normalize=True).values())
        features = torch.as_tensor([features])
        return features

    def parse_sentences(self, sentences):
        temp = []
        for i in tqdm(range(len(sentences))):
            sent = sentences[i]
            temp.append(self.get_features(sent))
        temp = torch.cat(temp, dim=0)
        print("liwc features: {}".format(temp.shape))
        return temp
def process_csv(in_file, out_file, column):
    start_time = time.time()
    f = open(in_file, "r", encoding='utf-8')
    csv_reader = csv.reader(f, delimiter=',', quotechar='"')
    result = open(out_file, mode='w', encoding='utf-8')
    csv_writer = csv.writer(result)
    review_col = int(column)
    empath_column = 'categories'
    lexicon = Empath()
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            row.append(empath_column)
            csv_writer.writerow(row)
            line_count += 1
        else:
            content = list(row)
            review = row[review_col]
            categories = lexicon.analyze(review, normalize=True)
            trimmed_categories = dict()
            if categories:
                for category, score in categories.items():
                    if score != 0:
                        trimmed_categories[category] = score
            row.append(trimmed_categories)
            # print(row)
            csv_writer.writerow(row)
            line_count += 1
    print('Processed', line_count, 'rows in',
          "{:.2f}".format(time.time() - start_time), 'seconds.')
from empath import Empath
import sys

lexicon = Empath()

lyric_filename = sys.argv[1]
with open(lyric_filename) as f:
    content = f.readlines()

# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]

# the file is assumed to alternate between a label line and a lyric line
i = 0
while i < len(content):
    print(content[i])
    i += 1
    print(lexicon.analyze(content[i], normalize=True))
    i += 1
topN = []
# For each interest i,
for i, w in interests_and_weights.items():
    iTopN = [None] * count
    max_tweets = 500  # the number of requests consumed will be this number / 100 * the number of interests
    # Search the latest tweets t_i related to i
    searched_tweets = [status for status in tweepy.Cursor(
        api.search, rpp=100, q=i + " -filter:retweets", since=since_date,
        languages=["en"], tweet_mode='extended', count=max_tweets).items(max_tweets)]
    for idx, tweet in enumerate(searched_tweets):
        if len(tweet.full_text) < 100:
            continue
        # print(idx, tweet.text)
        # For each tweet t_i, compute the empath vector
        empath_vec = lexicon.analyze(tweet.full_text, normalize=True)
        # print(empath_vec)
        score = emotional_score(empath_vec)
        # print("Score:", score, "*", w, "=", score * w)
        score *= w
        for j, topI in enumerate(iTopN):
            if topI is None or score > topI[1]:
                tweet.persona_interest = i
                iTopN.insert(j, [tweet, score, empath_vec])
                iTopN.pop()
                break
    topN += iTopN

shuffle(topN)
topN = topN[0:count]
    # Not sure if these count. If you don't include them we have 16 emotions!
    # answer.append(lexicon.create_category("ambiguous", ["no", "different", "disagree"]))
    # answer.append(lexicon.create_category("neutral", ["whatever", "alright", "anything"]))
    return answer


# Removes the emotional categories with scores of 0.0
def removeZeros(analysisDict):
    ans = {}
    for key, value in analysisDict.items():
        if value > 0.0:
            ans[key] = value
    return ans


# Start of script
emotionalCats = getEmotionalCategories()
for filename in os.listdir("processedTweets"):
    with open('processedTweets/' + filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            row = " ".join(row)  # remove commas, and turn the token list into a string
            try:
                # Analyze the text over all set emotional categories, normalized by the number of words in each tweet:
                analysis = removeZeros(lexicon.analyze(row, categories=emotionalCats, normalize=True))
                if len(analysis) != 0:
                    print(analysis)
            except:
                pass
    break  # remove this break to keep iterating through the directory 'processedTweets'
from empath import Empath
from textblob import TextBlob
import matplotlib
import matplotlib.mlab as mlab
matplotlib.use('Agg')
import matplotlib.pyplot as plt

lexicon = Empath()

pos_txt = open('persuasiveargs.txt', 'r').read()
neg_txt = open('notpersuasiveargs.txt', 'r').read()

cat_pos = lexicon.analyze(pos_txt, normalize=True)
cat_neg = lexicon.analyze(neg_txt, normalize=True)

for k in cat_pos:
    if cat_neg[k] != 0:
        r = cat_pos[k] / cat_neg[k]
        if r > 2:
            print('1 Category: ', k, 'Pos:', cat_pos[k], 'Neg:', cat_neg[k])
    if cat_pos[k] != 0:
        r = cat_neg[k] / cat_pos[k]
        if r > 2:
            print('2 Category: ', k, 'Pos:', cat_pos[k], 'Neg:', cat_neg[k])

blob_pos = TextBlob(pos_txt)
polarity_pos = []
subjectivity_pos = []
for sentence in blob_pos.sentences:
    polarity_pos.append(sentence.sentiment.polarity)
from empath import Empath
import csv
import html
import re
import sys

lexicon = Empath()


def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext.replace("&", "and")


lyric_filename = sys.argv[1]

with open(lyric_filename, "r") as f:
    reader = csv.reader(f, delimiter=",")
    for i, line in enumerate(reader):
        if i == 0:
            line.append('empath_vec')
            print(line)
        else:
            line[3] = cleanhtml(html.unescape(line[3]))
            line.append(lexicon.analyze(line[3], normalize=True))
            print(line)