def __initialize_senti(self):
    self.senti = PySentiStr()
    self.senti.setSentiStrengthPath(
        str(Path.cwd() / 'lib' / 'SentiStrengthCom.jar'))
    self.senti.setSentiStrengthLanguageFolderPath(str(Path.cwd() / 'lang'))

    # simple test to make sure senti works
    test = self.senti.getSentiment(['You are beautiful'], 'dual')
    assert type(test) is list
    assert type(test[0]) is tuple
def main():
    # ======================================================================= #
    # create the SentiStrength object and set the paths to the auxiliary files
    # ======================================================================= #
    obj_sentistrength = PySentiStr()
    obj_sentistrength.setSentiStrengthPath(SENTISTRENGTH_JAR_PATH)
    obj_sentistrength.setSentiStrengthLanguageFolderPath(
        SENTISTRENGTH_DATA_PATH)

    # ======================================================================= #
    # read the frases.txt file and store its lines in the list file_lines
    # (file.readlines() returns that list)
    # ======================================================================= #
    with open('frases.txt', 'r') as file:
        file_lines = file.readlines()

    # ======================================================================= #
    # iterate over file_lines and run sentiment analysis on each text,
    # obtaining three similar and proportional scores (dual, trinary and
    # scale) for the same input text
    # ======================================================================= #
    for line in file_lines:
        text = line.strip()  # remove the trailing \n from the line
        result_scale = obj_sentistrength.getSentiment(text, score='scale')
        result_dual = obj_sentistrength.getSentiment(text, score='dual')
        result_trinary = obj_sentistrength.getSentiment(text, score='trinary')
        print(
            'text: {0}\nresult_scale: {1}\nresult_dual: {2}\nresult_trinary: {3}\n'
            .format(text, str(result_scale), str(result_dual),
                    str(result_trinary)))
def sentiment_analysis(tweet_sample, aggregate=True, mode='trinary'):
    senti = PySentiStr()
    senti.setSentiStrengthPath(sentistrength_jar_full_path)
    senti.setSentiStrengthLanguageFolderPath(sentistrength_lan_full_path_en)
    sentiment_dict = {}
    if type(tweet_sample) is not dict:
        return 'No matches'
    for topic in tweet_sample.keys():
        # Scores: scale, dual, binary and trinary
        sentiment = senti.getSentiment(tweet_sample[topic], score=mode)
        if aggregate:
            sentisum = 0
            summary = {}
            for sent in sentiment:
                # The trinary score returns a tuple, unlike the other modes
                sentisum += sent[2]
            summary['value'] = sentisum
            if sentisum > 0:
                summary['sentiment'] = 'positive'
            else:
                summary['sentiment'] = 'negative'
            sentiment = summary
        sentiment_dict[topic] = sentiment
    return sentiment_dict
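# A minimal usage sketch for sentiment_analysis above, assuming the module-level
# paths sentistrength_jar_full_path and sentistrength_lan_full_path_en are
# already defined; the topic/tweet data below is made up for illustration.
if __name__ == '__main__':
    sample = {
        'weather': ['What a lovely day', 'I hate this rain'],
        'food': ['The pizza was amazing'],
    }
    # aggregate=True sums the third element of each trinary tuple per topic
    print(sentiment_analysis(sample, aggregate=True, mode='trinary'))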
def tweet_word_sentiment(data):
    '''
    input: whole corpus
    output: one dict for tweet_word_sentiment,
            keys: tweet_id, values: [max, min, distance]
            max -- highest sentiment score among all words
            min -- lowest sentiment score among all words
            distance -- difference between the highest and lowest scores
    '''
    feature_dict = {}
    senti = PySentiStr()
    senti.setSentiStrengthPath('./SentiStrength.jar')
    senti.setSentiStrengthLanguageFolderPath('./SentiStrengthData/')
    for tweet in data:
        tokenized = tweet.tweet_words()
        new_words = [word for word in tokenized if word.isalnum()]
        if not new_words:
            # keep the same list layout as the non-empty case
            feature_dict[tweet.tweet_id] = [0, 0, 0]
            continue
        result = senti.getSentiment(new_words)
        max_, min_ = result[0], result[0]
        for score in result:
            max_ = max(max_, score)
            min_ = min(min_, score)
        feature_dict[tweet.tweet_id] = [max_, min_, max_ - min_]
    return feature_dict
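# Hypothetical usage of tweet_word_sentiment above: the Tweet class is a
# stand-in for whatever corpus object provides tweet_id and tweet_words() in
# the original project, and the relative jar/data paths must exist locally.
class Tweet:
    def __init__(self, tweet_id, text):
        self.tweet_id = tweet_id
        self._words = text.split()

    def tweet_words(self):
        return self._words


corpus = [Tweet(1, 'I love this great phone'), Tweet(2, 'terrible awful battery')]
print(tweet_word_sentiment(corpus))  # e.g. {1: [max, min, distance], 2: [...]}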
def sentistr(x):
    senti = PySentiStr()
    senti.setSentiStrengthPath("SentiStrength.jar")
    senti.setSentiStrengthLanguageFolderPath("SentStrength_Data")
    # trinary scoring: positive rating, negative rating and neutral rating
    result = senti.getSentiment(x, score='trinary')
    return result
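# Example call to sentistr above; it assumes SentiStrength.jar and the
# SentStrength_Data folder sit in the working directory (other snippets here
# note that absolute paths are safer).
print(sentistr(['The food here is GREAT!!', 'The service was awful']))
# trinary output is a list of (positive, negative, neutral) tuples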
def get_sentistrength(df):
    senti = PySentiStr()
    senti.setSentiStrengthPath('~/softwares/SentiStrengthCom.jar')
    senti.setSentiStrengthLanguageFolderPath(
        '~/softwares/SentStrength_Data_Sept2011/')
    # SentiStrength cannot score empty strings, so replace them with a space
    df["text"] = [t if t != "" else " " for t in df['text']]
    result = senti.getSentiment(df["text"], score='trinary')
    df["sentistrength_pos"] = [r[0] for r in result]
    df["sentistrength_neg"] = [r[1] for r in result]
    df["sentistrength_neutral"] = [r[2] for r in result]
    return df
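# Sketch of how get_sentistrength above could be called, assuming the
# hard-coded ~/softwares paths exist; the DataFrame below is illustrative.
import pandas as pd

df_example = pd.DataFrame({'text': ['What a lovely day', '', 'What a bad day']})
df_example = get_sentistrength(df_example)
print(df_example[['sentistrength_pos', 'sentistrength_neg', 'sentistrength_neutral']])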
def main():
    # change the input file here
    with open(
            './Comentarios_csv/Test/OPOVOOnline sobre escolha do novo reitor UFC.csv'
    ) as csv_file:
        csv_dict_reader = csv.DictReader(csv_file)
        senti = PySentiStr()
        senti.setSentiStrengthPath(
            "/home/caio/Documentos/Projeto Analise Comentarios Facebook/SentiStrength.jar"
        )
        senti.setSentiStrengthLanguageFolderPath(
            "/home/caio/Documentos/Projeto Analise Comentarios Facebook/SentStrength_Data/portuguese/"
        )
        # change the output file here
        with open('./Comentarios_csv/Test/teste.csv', 'w') as csvfile:
            spamwriter = csv.writer(csvfile)
            spamwriter.writerow(
                ["Comentário", "notaPositiva", "notaNegativa", "Sentimento"])
            for row in csv_dict_reader:
                # use the name of the column that holds the comment
                if row["message"]:
                    sentence = row["message"]
                    #sentence = RemoveAccent(sentence)
                    sentence = Tokenize(sentence)
                    if sentence:
                        sentence = RemoveStopWords(sentence)
                        if sentence:
                            sentence = Stemming(sentence)
                            sentence = " ".join(sentence)
                            # sentistrength
                            result = senti.getSentiment(sentence, score='binary')
                            if result[0][0] + result[0][1] <= -1:
                                sentiment = 'negativo'
                            elif result[0][0] + result[0][1] >= 1:
                                sentiment = 'positivo'
                            else:
                                sentiment = 'neutro'
                            spamwriter.writerow([
                                row["message"], result[0][0], result[0][1],
                                sentiment
                            ])
    print("finish!")
from utils import *
import pandas as pd
import ssl

ssl._create_default_https_context = ssl._create_unverified_context
config = get_config('config.yaml')

from sentistrength import PySentiStr

senti = PySentiStr()
# Rocket HPC
senti.setSentiStrengthPath(
    '/gpfs/space/home/enlik/GitRepo/master-thesis-2021/references/SentiStrengthCom.jar'
)  # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath(
    '/gpfs/space/home/enlik/GitRepo/master-thesis-2021/references/SentiStrengthData/'
)  # Note: Provide absolute path instead of relative path

df_freenow = pd.read_csv(config['csv_input_local']['freenow_apple_google_p1'],
                         index_col=0)
df_freenow = df_freenow.reset_index(drop=True)

total_reviews = len(df_freenow)
print(f'Total English reviews: {total_reviews} \n')

df_freenow.review = df_freenow.review.astype(str)
# df_freenow = df_freenow.head(10)  # testing purpose

listOfSentimentScores = []
for i in range(0, int(len(df_freenow))):
    text_input = df_freenow.review[i]
    star_rating = df_freenow.rating[i]
    result = senti.getSentiment(text_input)
from sentistrength import PySentiStr

# initializing SentiStrength
senti = PySentiStr()
senti.setSentiStrengthPath("SentiStrength.jar")
senti.setSentiStrengthLanguageFolderPath("SentiStrength_Data")

frase1 = senti.getSentiment('The food here is GREAT!!', score='dual')
frase2 = senti.getSentiment('The food here is GREAT!!', score='binary')
frase3 = senti.getSentiment('The food here is GREAT!!', score='trinary')
frase4 = senti.getSentiment('The food here is GREAT!!', score='scale')

print("Sentence 1 with dual output:", frase1)
print("Sentence 2 with binary output:", frase2)
print("Sentence 3 with trinary output:", frase3)
print("Sentence 4 with scale output:", frase4)
# Recommended: run SentiStrength on the csv containing the column that holds the full text

# The location of SentiStrength on your computer
SentiStrengthLocation = "C:/Users/ThinkPad/SpyderProjects/sentistrengthStuff/SentiStrength.jar"
# The location of the unzipped SentiStrength data files on your computer
SentiStrengthLanguageFolder = "C:/Users/ThinkPad/SpyderProjects/sentistrengthStuff/SentiStrength_Data/"

# Check that the paths are correct (if they are, nothing is printed)
if not os.path.isfile(SentiStrengthLocation):
    print("SentiStrength not found at: ", SentiStrengthLocation)
if not os.path.isdir(SentiStrengthLanguageFolder):
    print("SentiStrength data folder not found at: ", SentiStrengthLanguageFolder)

# Initiate an object
senti = PySentiStr()
# set paths
senti.setSentiStrengthPath(SentiStrengthLocation)
senti.setSentiStrengthLanguageFolderPath(SentiStrengthLanguageFolder)

# Read csv (give your path)
all_files = glob.glob(
    "C:/Users/ThinkPad/SpyderProjects/sentistrengthStuff/entropy files" + "/*.csv")

li = []
# Make a dataframe from appending lists
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, error_bad_lines=False)
    li.append(df)
def prAnalysis(
    config: Configuration,
    senti: sentistrength.PySentiStr,
    delta: relativedelta,
    batchDates: List[datetime],
):

    print("Querying PRs")
    batches = prRequest(config.pat, config.repositoryOwner,
                        config.repositoryName, delta, batchDates)

    batchParticipants = list()
    batchComments = list()

    for batchIdx, batch in enumerate(batches):
        print(f"Analyzing PR batch #{batchIdx}")

        # extract data from batch
        prCount = len(batch)
        participants = list(pr["participants"] for pr in batch
                            if len(pr["participants"]) > 0)
        batchParticipants.append(participants)

        allComments = list()
        prPositiveComments = list()
        prNegativeComments = list()
        generallyNegative = list()

        print(f" Sentiments per PR", end="")

        semaphore = threading.Semaphore(15)
        threads = []
        for pr in batch:
            comments = list(comment for comment in pr["comments"]
                            if comment and comment.strip())

            # split comments that are longer than 20KB
            splitComments = []
            for comment in comments:
                # calc number of chunks
                byteChunks = math.ceil(sys.getsizeof(comment) / (20 * 1024))
                if byteChunks > 1:
                    # calc desired max length of each chunk
                    chunkLength = math.floor(len(comment) / byteChunks)
                    # divide comment into chunks
                    chunks = [
                        comment[i * chunkLength:i * chunkLength + chunkLength]
                        for i in range(0, byteChunks)
                    ]
                    # save chunks
                    splitComments.extend(chunks)
                else:
                    # append comment as-is
                    splitComments.append(comment)

            # re-assign comments after chunking
            comments = splitComments

            if len(comments) == 0:
                prPositiveComments.append(0)
                prNegativeComments.append(0)
                continue

            allComments.extend(comments)

            thread = threading.Thread(
                target=analyzeSentiments,
                args=(
                    senti,
                    comments,
                    prPositiveComments,
                    prNegativeComments,
                    generallyNegative,
                    semaphore,
                ),
            )
            threads.append(thread)

        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()

        print("")

        # save comments
        batchComments.append(allComments)

        # get comment length stats
        commentLengths = [len(c) for c in allComments]

        generallyNegativeRatio = len(generallyNegative) / prCount

        # get pr duration stats
        durations = [(pr["closedAt"] - pr["createdAt"]).days for pr in batch]

        print(" All sentiments")

        commentSentiments = []
        commentSentimentsPositive = 0
        commentSentimentsNegative = 0

        if len(allComments) > 0:
            commentSentiments = senti.getSentiment(allComments)
            commentSentimentsPositive = sum(
                1 for _ in filter(lambda value: value >= 1, commentSentiments))
            commentSentimentsNegative = sum(
                1 for _ in filter(lambda value: value <= -1, commentSentiments))

        toxicityPercentage = getToxicityPercentage(config, allComments)

        centrality.buildGraphQlNetwork(batchIdx, participants, "PRs", config)

        print(" Writing results")
        with open(
                os.path.join(config.resultsPath, f"results_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["NumberPRs", prCount])
            w.writerow(["NumberPRComments", len(allComments)])
            w.writerow(["PRCommentsPositive", commentSentimentsPositive])
            w.writerow(["PRCommentsNegative", commentSentimentsNegative])
            w.writerow(["PRCommentsNegativeRatio", generallyNegativeRatio])
            w.writerow(["PRCommentsToxicityPercentage", toxicityPercentage])

        with open(
                os.path.join(config.metricsPath, f"PRCommits_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Commit Count"])
            for pr in batch:
                w.writerow([pr["number"], pr["commitCount"]])

        with open(
                os.path.join(config.metricsPath,
                             f"PRParticipants_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Developer Count"])
            for pr in batch:
                w.writerow([pr["number"], len(set(pr["participants"]))])

        # output statistics
        stats.outputStatistics(batchIdx, commentLengths, "PRCommentsLength",
                               config.resultsPath)

        stats.outputStatistics(batchIdx, durations, "PRDuration",
                               config.resultsPath)

        stats.outputStatistics(batchIdx,
                               [len(pr["comments"]) for pr in batch],
                               "PRCommentsCount", config.resultsPath)

        stats.outputStatistics(batchIdx,
                               [pr["commitCount"] for pr in batch],
                               "PRCommitsCount", config.resultsPath)

        stats.outputStatistics(batchIdx, commentSentiments,
                               "PRCommentSentiments", config.resultsPath)

        stats.outputStatistics(batchIdx,
                               [len(set(pr["participants"])) for pr in batch],
                               "PRParticipantsCount", config.resultsPath)

        stats.outputStatistics(batchIdx, prPositiveComments,
                               "PRCountPositiveComments", config.resultsPath)

        stats.outputStatistics(batchIdx, prNegativeComments,
                               "PRCountNegativeComments", config.resultsPath)

    return batchParticipants, batchComments
def __aggregate(_posts, _media, _comments, session, logger):
    _entries = list()

    # Initialize sentistrength variable
    senti = PySentiStr()
    setup_sentistrength_path(senti)

    for p in _posts:
        try:
            """ Id, name of a post """
            entry = [p.id, p.name]

            """ Number of version of a post """
            version = __extract_version(p.discussion_url)
            entry = entry + [version]

            """ Number of tags for a product """
            tags_number = session.query(func.count(
                Topic.name)).filter(Topic.post_id == p.id).scalar()
            entry = entry + [tags_number]

            entry = entry + [p.featured, p.votes_count, p.day, p.created_at]

            """ Time features """
            launch_day = get_day_name_from_date(p.created_at.year,
                                                p.created_at.month,
                                                p.created_at.day)
            best_launch_time = is_best_posted_time(p.created_at.hour,
                                                   p.created_at.minute,
                                                   p.created_at.second)
            best_launch_day = is_best_launched_day(p.created_at.hour,
                                                   p.created_at.minute,
                                                   p.created_at.second,
                                                   launch_day)
            max_follower = session.query(func.max(
                User.followers_count)).scalar()
            maker_id = session.query(
                Apps.maker_id).filter(Apps.post_id == p.id).one()[0]
            maker = session.query(
                User.id, User.name, User.twitter_username, User.website_url,
                User.followers_count).filter(User.id == maker_id).one()
            weekend = is_weekend(maker.followers_count, max_follower,
                                 launch_day)
            entry = entry + [
                launch_day, best_launch_time, best_launch_day, weekend
            ]

            """ Presentation features """
            entry = entry + [p.description]
            if p.description:
                """ Extraction of maker sentiment based on the description of his post """
                maker_description_sentiment = __extract_sentiment(
                    senti, p.description)
                entry = entry + [
                    maker_description_sentiment[0][0],
                    maker_description_sentiment[0][1], '', '', ''
                ]

                # Text length
                entry = entry + [len(p.description)]

                # Sentence length
                sentence = get_sentence(p.description)
                sentence_length_sum = 0
                for i in range(0, len(sentence)):
                    sentence_length_sum = sentence_length_sum + len(
                        sentence[i])
                try:
                    sentence_length_average = sentence_length_sum / len(
                        sentence)
                except ZeroDivisionError:
                    sentence_length_average = 0.0
                entry = entry + [round(sentence_length_average)]

                # Bullet points / Explicit features
                bullet_points_explicit_features = __extract_bullet_points_explicit_features(
                    sentence)
                entry = entry + [bullet_points_explicit_features]

                # Emoji in description
                emoji_description = __extract_emoji(p.description)
                entry = entry + [emoji_description]
            else:
                entry = entry + [1, -1, '', '', '', 0, 0, 'No', 'No']

            entry = entry + [p.tagline]
            if p.tagline:
                # Tagline length
                entry = entry + [len(p.tagline)]

                # Emoji in tagline
                emoji_tagline = __extract_emoji(p.tagline)
                entry = entry + [emoji_tagline]
            else:
                entry = entry + [0, 'No']

            # Video, Tweetable images, Gif and Gif's number for a post
            video = []
            tweetable_images = []
            gif = []
            index_media = 0
            while index_media < len(_media):
                # check if the current post_id is equal to the post_id of the current media
                if p.id == _media[index_media][0]:
                    # check if the media type is 'video'
                    if _media[index_media][1] == 'video':
                        # append to the list the link of the video
                        video = video + [_media[index_media][2]]
                    # calculate the image size passing its width and its height
                    roughly, ratio = calculate_aspect_ratio(
                        _media[index_media][3], _media[index_media][4])
                    # check if the image is a tweetable image
                    if (roughly == 2) and (ratio == 1):
                        # append to the list the image url
                        tweetable_images = tweetable_images + [
                            _media[index_media][5]
                        ]
                    # check if the image is a gif image passing its url
                    found = is_gif(_media[index_media][5])
                    if found:
                        # append to the list the image url
                        gif = gif + [_media[index_media][5]]
                index_media = index_media + 1

            if video:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']
            if tweetable_images:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']
            if gif:
                entry = entry + [gif, len(gif)]
            else:
                entry = entry + ['', len(gif)]

            # Offers, Promo/Discount Codes, Questions, Maker_inside, Hunter_inside in comment body for a post
            offers = []
            questions = []
            promo_codes = []
            # maker_follows_up_on_comments = 0
            hunter_follows_up_on_comments = 0
            maker_comments = []
            others_comments = []
            comm_in_thread = []
            hunter_id = session.query(
                Hunts.hunter_id).filter(Hunts.post_id == p.id).one()[0]
            index_comment = 0
            while index_comment < len(_comments):
                # check if the current post_id is equal to the post_id of the current comment
                if p.id == _comments[index_comment][3]:
                    # extract offers passing the comment body
                    offer = __extract_offers(_comments[index_comment][1])
                    if offer:
                        offers = offers + offer
                    # extract questions passing the comment body
                    question = __extract_questions(_comments[index_comment][1])
                    if question:
                        questions = questions + question
                    # extract promo_codes passing the comment body
                    promo_code = __extract_promo_codes(
                        _comments[index_comment][1])
                    if promo_code:
                        promo_codes = promo_codes + promo_code

                    # # check if the maker follows up on the current comment
                    # if _comments[index_comment][4] == maker_id:
                    #     maker_follows_up_on_comments = 1

                    # put comments in comm list (comment_id, comment_body, created_at, user_id)
                    comm_in_thread.append([
                        _comments[index_comment][0],
                        _comments[index_comment][1],
                        _comments[index_comment][2],
                        _comments[index_comment][4]
                    ])

                    # check if the hunter follows up on the current comment
                    if _comments[index_comment][4] == hunter_id:
                        hunter_follows_up_on_comments = 1

                    """ Extraction of maker sentiment based on his post comments written the day of launch """
                    if _comments[index_comment][4] == maker_id:
                        # date of maker's comment written the day the post was launched
                        comment_date = _comments[index_comment][2]
                        # cut the comments written days after the post was launched
                        if (p.created_at.year == comment_date.year) and (
                                p.created_at.month == comment_date.month) and (
                                    p.created_at.day == comment_date.day):
                            if not maker_comments:
                                maker_comments = [_comments[index_comment][1]]
                            else:
                                maker_comments = maker_comments + [
                                    _comments[index_comment][1]
                                ]

                    """ Extraction of others users sentiment based on their post comments written the day of launch """
                    if (_comments[index_comment][4] != maker_id) and (
                            maker_id != hunter_id):
                        # date of others comment written the day the post was launched
                        comment_date = _comments[index_comment][2]
                        # cut the comments written days after the post was launched
                        if (p.created_at.year == comment_date.year) and (
                                p.created_at.month == comment_date.month) and (
                                    p.created_at.day == comment_date.day):
                            if not others_comments:
                                others_comments = [_comments[index_comment][1]]
                            else:
                                others_comments = others_comments + [
                                    _comments[index_comment][1]
                                ]
                index_comment = index_comment + 1

            if offers:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']
            if promo_codes:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']
            if questions:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']

            # check if the maker writes the first comment in the thread
            maker_started_comment_thread = 0
            if comm_in_thread:
                if (p.created_at.year == comm_in_thread[0][2].year) and (
                        p.created_at.month == comm_in_thread[0][2].month) and (
                            p.created_at.day == comm_in_thread[0][2].day):
                    if comm_in_thread[0][3] == maker_id:
                        maker_started_comment_thread = 1

            # calculate maker comment ratio
            # ((number of maker comments / number of all comments) * 100)
            number_maker_comments = 0
            number_others_comments = 0
            if comm_in_thread:
                for i in range(0, len(comm_in_thread)):
                    if (p.created_at.year == comm_in_thread[i][2].year) and (
                            p.created_at.month == comm_in_thread[i][2].month
                    ) and (p.created_at.day == comm_in_thread[i][2].day):
                        if comm_in_thread[i][3] == maker_id:
                            number_maker_comments = number_maker_comments + 1
                        else:
                            number_others_comments = number_others_comments + 1
            thread_length = number_maker_comments + number_others_comments
            try:
                if maker_started_comment_thread == 1:
                    maker_comment_ratio = (number_maker_comments /
                                           thread_length) * 100
                else:
                    maker_comment_ratio = 0.0
            except ZeroDivisionError:
                maker_comment_ratio = 0.00

            # Hunter reputation
            hunter = session.query(
                User.id, User.name, User.twitter_username, User.website_url,
                User.followers_count,
                User.apps_made_count).filter(User.id == hunter_id).one()
            entry = entry + [
                hunter.id, hunter.name, hunter.twitter_username,
                hunter.website_url, hunter.followers_count,
                hunter.apps_made_count, hunter_follows_up_on_comments
            ]

            # Maker reputation
            entry = entry + [
                maker.id, maker.name, maker.twitter_username,
                maker.website_url, maker.followers_count,
                maker_started_comment_thread,
                round(maker_comment_ratio, 2), thread_length
            ]

            # check if the hunter is also the maker and append the variable hunter_is_maker to the list entry
            hunter_is_maker = 0
            if hunter_id == maker_id:
                hunter_is_maker = 1
            entry = entry + [hunter_is_maker]

            # Append to the list the maker comment sentiment
            if maker_comments:
                comment = '\n'.join(maker_comments)
                sentiment = __extract_sentiment(senti, comment)
            else:
                comment = ''
                sentiment = [[1, -1]]
            entry = entry + [
                comment, sentiment[0][0], sentiment[0][1], '', '', ''
            ]

            # Append to the list the others comment sentiment
            if others_comments:
                comment = '\n'.join(others_comments)
                sentiment = __extract_sentiment(senti, comment)
            else:
                comment = ''
                sentiment = [[1, -1]]
            entry = entry + [
                comment, sentiment[0][0], sentiment[0][1], '', '', ''
            ]

            _entries.append(entry)
        except NoResultFound as ex:
            logger.error(str(ex))
            continue
        except MultipleResultsFound as ex:
            logger.error(str(ex))
            continue

    return _entries
def main():
    with open(
            './OPOVOOnline sobre escolha do novo reitor UFC.csv') as csv_file:
        csv_dict_reader = csv.DictReader(csv_file)
        senti = PySentiStr()
        senti.setSentiStrengthPath(
            "/home/caio/Documentos/Projeto Analise Comentarios Facebook/SentiStrength.jar"
        )
        senti.setSentiStrengthLanguageFolderPath(
            "/home/caio/Documentos/Projeto Analise Comentarios Facebook/SentStrength_Data/portuguese/"
        )
        prev_message = ""
        with open(
                '/home/caio/Documentos/Projeto Analise Comentarios Facebook/Frases_Neutras.csv',
                'w') as csvfile:
            spamwriter = csv.writer(csvfile)
            spamwriter.writerow(["Frase", "notaPositiva", "notaNegativa"])
            # sentistrength
            for row in csv_dict_reader:
                if prev_message != row["message"] and row["message"]:
                    sentence = row["message"]
                    #sentence = RemoveAccent(sentence)
                    sentence = Tokenize(sentence)
                    if sentence:
                        sentence = RemoveStopWords(sentence)
                        if sentence:
                            sentence = Stemming(sentence)
                            sentence = " ".join(sentence)
                            result = senti.getSentiment(sentence, score='binary')
                            if result[0][0] + result[0][1] == 0:
                                # save the tokenized sentence
                                #spamwriter.writerow([sentence, result[0][0], result[0][1]])
                                # save the whole sentence
                                spamwriter.writerow([
                                    row["message"], result[0][0], result[0][1]
                                ])
                # post with comment replies
                if row["object_link.connections.comments.message"] != 'null' and row[
                        "object_link.connections.comments.message"]:
                    sentence = row["object_link.connections.comments.message"]
                    #sentence = RemoveAccent(sentence)
                    sentence = Tokenize(sentence)
                    if sentence:
                        sentence = RemoveStopWords(sentence)
                        if sentence:
                            sentence = Stemming(sentence)
                            sentence = " ".join(sentence)
                            result = senti.getSentiment(sentence, score='binary')
                            if result[0][0] + result[0][1] == 0:
                                # show the tokenized sentence
                                #spamwriter.writerow([sentence, result[0][0], result[0][1]])
                                # show the whole sentence
                                spamwriter.writerow([
                                    row["object_link.connections.comments.message"],
                                    result[0][0], result[0][1]
                                ])
                prev_message = row["message"]
    print("finish!")
import sys
import json
import csv
import pandas as pd
import re
import demoji
import emoji
from datetime import datetime
from sentistrength import PySentiStr
from langdetect import detect

# the caption is taken as input; it must be a string,
# e.g.: python3 txt_features.py "ciao come stai?"
caption = sys.argv[1]

senti = PySentiStr()
# set the 3 correct paths; the three files are in the SentiStrength folder:
# 1) SentiStrength.jar - 2) SentStrength_Data_EN - 3) SentStrength_Data_IT2
senti.setSentiStrengthPath('./SentiStrength/SentiStrength.jar')
eng_path = './SentiStrength/SentStrength_Data_EN'
ita_path = './SentiStrength/SentStrength_Data_IT2'


def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')


# number of hashtags
def hashtag_count(string):
    count = len([w for w in string.split() if w.startswith('#')])
    return count
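# Quick illustrative check of hashtag_count above (example strings are made up).
assert hashtag_count('loving the #sunset at the #beach') == 2
assert hashtag_count('no tags here') == 0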
def pre_process_and_predict(sentence):
    wordnet_lemmatizer = WordNetLemmatizer()

    # Replacing double quotes with single, within a string
    sentence = sentence.replace("\"", "\'")

    # Removing unnecessary special characters, keeping only , ! ?
    sentence = re.sub(r"[^!?,a-zA-Z0-9\ ]+", '', sentence)

    # Lemmatization on verbs
    sentence = ' '.join([
        wordnet_lemmatizer.lemmatize(word, pos='v')
        for word in word_tokenize(sentence)
    ])

    sn = SenticNet()
    senti = PySentiStr()
    senti.setSentiStrengthPath(CODE_PATH + '/sentistrength/SentiStrength.jar')
    senti.setSentiStrengthLanguageFolderPath(
        CODE_PATH + '/sentistrength/SentStrength_Data/')

    sentiment_score = []
    for sen in sent_tokenize(sentence):
        senti_pos, senti_neg = senti.getSentiment(sen, score='dual')[0]
        senti_pos -= 1
        if senti_neg == -1:
            senti_neg = 0

        sum_pos_score = 0
        sum_neg_score = 0
        for word in word_tokenize(sen):
            try:
                w_score = float(sn.polarity_intense(word)) * 5
            except KeyError:
                w_score = 0
            if w_score > 0:
                sum_pos_score = sum_pos_score + w_score
            elif w_score < 0:
                sum_neg_score = sum_neg_score + w_score

        sum_pos_score = (sum_pos_score + senti_pos) / 2
        sum_neg_score = (sum_neg_score + senti_neg) / 2
        sentiment_score.append((sum_pos_score, sum_neg_score))

    additional_features_s = []
    additional_features_ns = []
    contra = []
    pos_low = []
    pos_medium = []
    pos_high = []
    neg_low = []
    neg_medium = []
    neg_high = []

    for sum_pos_score, sum_neg_score in sentiment_score:
        contra.append(int(sum_pos_score > 0 and abs(sum_neg_score) > 0))
        pos_low.append(int(sum_pos_score < 0))
        pos_medium.append(int(sum_pos_score >= 0 and sum_pos_score <= 1))
        pos_high.append(int(sum_pos_score >= 2))
        neg_low.append(int(sum_neg_score < 0))
        neg_medium.append(int(sum_neg_score >= 0 and sum_neg_score <= 1))
        neg_high.append(int(sum_neg_score >= 2))

    additional_features_s = additional_features_s + [
        max(pos_medium), max(pos_high), max(neg_medium), max(neg_high)
    ]
    additional_features_ns = additional_features_ns + [
        max(pos_low), max(neg_low)
    ]

    tweet = sentence
    punctuation_count = SequencePunctuationCount(tweet)
    character_count = SequenceCharacterCount(tweet)
    capitalized_count = CapitalizedCount(tweet)
    exclamation_count = ExclamationCount(tweet)
    # emoji_count = EmojiCount(tweet)
    f_count = [
        punctuation_count, character_count, capitalized_count,
        exclamation_count
    ]

    for count in f_count:
        f_low = int(count == 0)
        f_medium = int(count >= 1 and count <= 3)
        f_high = int(count >= 4)
        additional_features_s = additional_features_s + [f_medium, f_high]
        additional_features_ns = additional_features_ns + [f_low]

    X = [sentence]

    in_file = open(os.path.join(PICKLES_PATH, "vocab.pickle"), "rb")
    vocab = pickle.load(in_file)
    in_file.close()

    in_file = open(os.path.join(PICKLES_PATH, "model.pickle"), "rb")
    model = pickle.load(in_file)
    in_file.close()

    vectorizer = TfidfVectorizer(vocabulary=vocab)
    X = vectorizer.fit_transform(X)
    ans = int(sum(model.predict(X)))

    print('Sentence : ', sentence)
    print('Sarcastic features : ', additional_features_s)
    print('Not Sarcastic features : ', additional_features_ns)
    print('Contradict : ', max(contra))
    print('Model Predict : ', ans)
    print(
        'My obs : ',
        int((sum(additional_features_s) >= sum(additional_features_ns))
            and max(contra) == 1))
    print('Final Prd : ', end='')
    if ans == 1 or ((sum(additional_features_s) >= sum(additional_features_ns))
                    and max(contra) == 1):
        return True
    else:
        return False
import itertools
#from multiprocessing.pool import ThreadPool
#pool = ThreadPool(20)  # However many you wish to run in parallel
from tqdm import tqdm
import glob
import os.path
import sys
from os import getcwd
from sentistrength import PySentiStr

senti = PySentiStr()
#senti.setSentiStrengthPath('C:\\SentiStrength\\SentiStrength.jar')  # e.g. 'C:\Documents\SentiStrength.jar'
#senti.setSentiStrengthLanguageFolderPath('C:\\SentiStrength')  # e.g. 'C:\Documents\SentiStrengthData\'
senti.setSentiStrengthPath(
    os.path.join(getcwd(), "SentiStrengthData/SentiStrength.jar"))
senti.setSentiStrengthLanguageFolderPath(
    os.path.join(getcwd(), "SentiStrengthData/"))


def preprocess_data(data):
    data_out = pd.DataFrame()
    data_out = data[['type', 'content']]
    data_out.dropna(inplace=True)
    return data_out


def count_words(text):
    try:
        return len(TextBlob(text).words)
#!/usr/bin/env python
# coding: utf-8

from utils import *
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
config = get_config('config.yaml')

from sentistrength import PySentiStr

senti = PySentiStr()
senti.setSentiStrengthPath(
    '/Users/enlik/GitRepo/master-thesis-2021/references/SentiStrengthCom.jar'
)  # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath(
    '/Users/enlik/GitRepo/master-thesis-2021/references/SentiStrengthData/'
)  # Note: Provide absolute path instead of relative path

# # Sample use case of SentiStrength

result = senti.getSentiment('What a bad day')
print(result)

str_arr = ['What a lovely day', 'What a bad day']
result = senti.getSentiment(str_arr, score='scale')
print(result)

str_arr = ['What a lovely day', 'What a bad day']
result = senti.getSentiment(str_arr, score='dual')
print(result)
from sentistrength import PySentiStr

senti = PySentiStr()
senti.setSentiStrengthPath('data/sentistrength/SentiStrength5.jar')
senti.setSentiStrengthLanguageFolderPath('data/sentistrength/SentStrength_Data')


def analyse_sentence(sentence):
    return senti.getSentiment(sentence)
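# Minimal usage sketch for analyse_sentence above, assuming the jar and data
# folder exist under data/sentistrength/ as configured.
if __name__ == '__main__':
    print(analyse_sentence('What a lovely day'))        # e.g. [3]
    print(analyse_sentence(['Great!', 'Terrible...']))  # one scale score per text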
class Maestro:
    def __init__(self, df, output_path, output_name, batch):
        # storing variables
        self.df = df
        self.filename = Path(output_path) / output_name
        self.raw_file = '{}_raw.csv'.format(self.filename)
        self.batch = batch

        # initialize tools
        self.translator = Translator()
        self.__initialize_senti()

        # collect jobs
        job_list = self.__collect_jobs()
        self.total_job = len(job_list)

        # initialize queues
        self.jobs = Queue(maxsize=self.total_job)
        for job in job_list:
            self.jobs.put(job)
        self.results = Queue(maxsize=self.total_job)

        # setup threading variables
        self.stop = threading.Event()
        self.worker_ct_lock = threading.Lock()
        self.worker_ct = 0  # num_of_spawned worker

    def __initialize_senti(self):
        self.senti = PySentiStr()
        self.senti.setSentiStrengthPath(
            str(Path.cwd() / 'lib' / 'SentiStrengthCom.jar'))
        self.senti.setSentiStrengthLanguageFolderPath(str(Path.cwd() / 'lang'))

        # simple test to make sure senti works
        test = self.senti.getSentiment(['You are beautiful'], 'dual')
        assert type(test) is list
        assert type(test[0]) is tuple

    def __collect_jobs(self):
        try:
            out_df = pd.read_csv(self.raw_file, header=None)
            processed_ser = self.df['tweetid'].isin(out_df[1])
        except FileNotFoundError:
            zeros = np.zeros((len(self.df.index), ), dtype=bool)
            processed_ser = pd.Series(zeros)

        job_list = processed_ser[~processed_ser].index
        job_list = list(grouper(job_list, self.batch))
        if len(job_list) > 0:
            job_list[-1] = tuple(job for job in job_list[-1]
                                 if job is not None)
        return job_list

    def __despawn_worker(self):
        with self.worker_ct_lock:
            self.worker_ct = self.worker_ct - 1

    def __translate(self, thread_num):
        with self.worker_ct_lock:
            self.worker_ct = self.worker_ct + 1

        while not self.stop.is_set() and not self.jobs.empty():
            job = self.jobs.get()
            try:
                mini_df = self.df.loc[job, ]  # trailing comma is needed
                ids = mini_df.iloc[:, 0]
                items = mini_df.iloc[:, -1].to_numpy().tolist()
            except Exception as e:
                print('Worker #{} got pandas error: {}'.format(thread_num, e))
                break

            try:
                if len(items) == 1:
                    translations = [self.translator.translate(items)]
                else:
                    translations = self.translator.translate(items)
            except Exception as e:
                print('Worker #{} got translation error: {}'.format(
                    thread_num, e))
                break

            self.results.put((job, ids, translations))

        self.__despawn_worker()

    def __save(self, results):
        with open(self.raw_file, 'a', encoding='utf-8', newline='') as csv_file:
            writer = csv.writer(csv_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerows(results)

    def __process(self, score='dual'):
        total_batch = int(np.ceil(len(self.df.index) / self.batch))
        pbar = tqdm(total=total_batch, initial=(total_batch - self.total_job))

        while not self.stop.is_set() or not self.results.empty():
            time.sleep(2)
            if not self.results.empty():
                # merges all results
                job_list, id_list, translation_list = ([], [], [])
                steps = 0
                while not self.results.empty():
                    job, ids, translations = self.results.get()
                    job_list.extend(job)
                    id_list.extend(ids)
                    translation_list.extend(translations)
                    steps = steps + 1

                # analyze sentiments
                texts = [tr.text for tr in translation_list]
                try:
                    sentis = self.senti.getSentiment(texts, score)
                except Exception as e:
                    print('Process got sentistrength error:', e)
                    break

                try:
                    rows = [
                        (order, i, *senti, tr.src, text)
                        for order, i, senti, tr, text in zip(
                            job_list, id_list, sentis, translation_list, texts)
                    ]
                except Exception as e:
                    print(e)
                    break

                try:
                    self.__save(rows)
                except Exception as e:
                    print('Process got on save error:', e)
                    break

                pbar.update(steps)
            time.sleep(.1)  # prevent too much loop checking

        if not self.stop.is_set():
            self.stop.set()  # force stop all threads

        print('Rebuilding...')
        self.__rebuild()
        print('Exiting...')
        pbar.close()

    def __rebuild(self):
        try:
            sf = pd.read_csv(self.raw_file,
                             header=None,
                             names=[
                                 'order', 'tweetid', '+', '-', 'src_lang',
                                 'translation'
                             ])
            sf.sort_values('order', inplace=True)
            sf.to_csv('{}.csv'.format(self.filename), index=None)
        except FileNotFoundError:
            pass
        except Exception as e:
            print(ERR_STR.format('rebuild', 'on rebuilding csv'), e)

    def play(self, n_thread=1):
        if n_thread < 1:
            return

        with ThreadPoolExecutor(max_workers=n_thread + 1) as executor:
            try:
                executor.map(self.__translate, range(n_thread))
                print('Spawning {} workers...'.format(n_thread))
                while self.worker_ct == 0:
                    pass  # waiting for any worker being spawned
                print('Aye, Sir!')

                executor.submit(self.__process)

                # as long as there is at least a worker
                while self.worker_ct > 0:
                    # wait for any keyboard interrupt
                    time.sleep(.5)  # power napping for half a second

                # either no job is left or all workers have been despawned
                self.stop.set()
                if self.jobs.empty():
                    print('All done!')
                if self.worker_ct == 0:
                    print('All workers quit their job!')
            except KeyboardInterrupt:
                print('\nKeyboard interrupt')
            except Exception as e:
                print(ERR_STR.format('play', 'something went wrong'), e)
            finally:
                self.stop.set()
                print('Byee 👋')
import xml.etree.ElementTree as xml
from sentistrength import PySentiStr

# initializing SentiStrength
sstrength = PySentiStr()
sstrength.setSentiStrengthPath("SentiStrength.jar")
sstrength.setSentiStrengthLanguageFolderPath("SentiStrength_Data")


# Given a list of answers, returns a list with the sentiment values
# produced by SentiStr
def analise_sentistr(respostas):
    return sstrength.getSentiment(respostas)
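# Illustrative call to analise_sentistr above with a made-up list of answers;
# the relative paths require SentiStrength.jar and SentiStrength_Data to be local.
respostas_exemplo = ['I really enjoyed the course', 'The lectures were boring']
print(analise_sentistr(respostas_exemplo))  # one scale score per answer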
import pandas as pd
from sentistrength import PySentiStr

senti = PySentiStr()
senti.setSentiStrengthPath(
    'SentiStrengthCom.jar'
)  # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath(
    'SentStrength_Data_Sept2011'
)  # Note: Provide absolute path instead of relative path

str_arr = ['What a lovely day', 'What a bad day']
result = senti.getSentiment(str_arr)
print(result)

result = senti.getSentiment(str_arr, score='scale')
print(result)
# OR, if you want dual scoring (a score each for positive rating and negative rating)
result = senti.getSentiment(str_arr, score='dual')
print(result)
# OR, if you want binary scoring (1 for positive sentence, -1 for negative sentence)
result = senti.getSentiment(str_arr, score='binary')
print(result)
# OR, if you want trinary scoring (a score each for positive rating, negative rating and neutral rating)
result = senti.getSentiment(str_arr, score='trinary')
print(result)
def commitBatchAnalysis(
    idx: int, senti: PySentiStr, commits: List[git.Commit], config: Configuration
):

    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    startDate = None
    if config.startDate is not None:
        startDate = datetime.strptime(config.startDate, "%Y-%m-%d")
        startDate = startDate.replace(tzinfo=pytz.UTC)

    # sort commits
    commits.sort(key=lambda o: o.committed_datetime, reverse=True)

    commitMessages = []
    commit: Commit
    lastDate = None
    firstDate = None
    realCommitCount = 0
    for commit in Bar("Processing").iter(commits):
        if startDate is not None and startDate > commit.committed_datetime:
            continue
        if lastDate is None:
            lastDate = commit.committed_date
        firstDate = commit.committed_date
        realCommitCount = realCommitCount + 1

        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save info
        timezoneInfo["authors"].add(author)

        if commit.message and commit.message.strip():
            commitMessages.append(commit.message)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest commit
        # by default GitPython orders commits from latest to earliest
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time

        # check if commit was between 9 and 5
        if not commit.author_tz_offset == 0 and time.hour >= 9 and time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing commit message sentiment")
    sentimentScores = []
    commitMessageSentimentsPositive = []
    commitMessageSentimentsNegative = []

    if len(commitMessages) > 0:
        sentimentScores = senti.getSentiment(commitMessages)
        commitMessageSentimentsPositive = list(
            result for result in filter(lambda value: value >= 1, sentimentScores)
        )
        commitMessageSentimentsNegative = list(
            result for result in filter(lambda value: value <= -1, sentimentScores)
        )

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for login, author in authorInfoDict.items():

        # check if sponsored
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        diff = sponsoredCommitCount / commitCount
        if diff >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    percentageSponsoredAuthors = sponsoredAuthorCount / len([*authorInfoDict])

    # calculate active project days
    firstCommitDate = None
    lastCommitDate = None
    if firstDate is not None:
        firstCommitDate = datetime.fromtimestamp(firstDate)
    if lastDate is not None:
        lastCommitDate = datetime.fromtimestamp(lastDate)
    daysActive = 0
    if lastCommitDate is not None:
        daysActive = (lastCommitDate - firstCommitDate).days

    print("Outputting CSVs")

    # output author days on project
    with open(
        os.path.join(config.metricsPath, f"authorDaysOnProject_{idx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(
        os.path.join(config.metricsPath, f"commitsPerAuthor_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(
        os.path.join(config.metricsPath, f"timezones_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for key, timezone in timezoneInfoDict.items():
            w.writerow([key, len(timezone["authors"]), timezone["commitCount"]])

    # output results
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["CommitCount", realCommitCount])
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", "{:%Y-%m-%d}".format(firstCommitDate)])
        w.writerow(["LastCommitDate", "{:%Y-%m-%d}".format(lastCommitDate)])
        w.writerow(["AuthorCount", len([*authorInfoDict])])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len([*timezoneInfoDict])])

    outputStatistics(
        idx,
        [author["activeDays"] for login, author in authorInfoDict.items()],
        "AuthorActiveDays",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [author["commitCount"] for login, author in authorInfoDict.items()],
        "AuthorCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [len(timezone["authors"]) for key, timezone in timezoneInfoDict.items()],
        "TimezoneAuthorCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [timezone["commitCount"] for key, timezone in timezoneInfoDict.items()],
        "TimezoneCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        sentimentScores,
        "CommitMessageSentiment",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsPositive,
        "CommitMessageSentimentsPositive",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsNegative,
        "CommitMessageSentimentsNegative",
        config.resultsPath,
    )

    return authorInfoDict, daysActive
import math

print(os.getcwd())
os.chdir("C:/Users/marcs/OneDrive/Bureaublad/Master/Thesis")

df = pd.DataFrame()
k = 0
print("Start part 1:")
until = datetime.datetime(2019, 1, 1)
since = datetime.datetime(2018, 12, 31)
init_start = datetime.datetime.now()

afinn = Afinn(emoticons=True)
senti = PySentiStr()
senti.setSentiStrengthPath(
    'C:/Users/marcs/OneDrive/Bureaublad/Master/Thesis/SentiStrength.jar'
)  # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath(
    'C:/Users/marcs/OneDrive/Bureaublad/Master/Thesis/SentiStrength_Data/'
)  # Note: Provide absolute path instead of relative path

for j in list(range(100000)):
    start = datetime.datetime.now()
    res = None
    while res is None:
        try:
            tweetCriteria = got.manager.TweetCriteria().setQuerySearch('$HAS')\
                .setSince(since.strftime('%Y-%m-%d'))\
                .setUntil(until.strftime('%Y-%m-%d'))\
                .setMaxTweets(10000)\
                .setEmoji("unicode")\
                .setLang("en")
            tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(result)
    result = [i for i in tokens if not i in stop_words]

    # stemming
    # stemmer = PorterStemmer()
    # newResult = []
    # for word in result:
    #     newResult.append(stemmer.stem(word))
    # print(newResult)
    return result


senti = PySentiStr()
senti.setSentiStrengthPath(
    'C:\ProgramData\Anaconda3\Lib\site-packages\sentistrength\SentiStrength.jar'
)
senti.setSentiStrengthLanguageFolderPath(
    'C:\ProgramData\Anaconda3\Lib\site-packages\sentistrength\\')

data = pd.read_csv("D:\senior\sentiment\Moodle_comments2.csv")
tagcomment = pd.read_csv("D:\\senior\\sentiment\\data\\tags.csv",
                         encoding='iso-8859-1')
tagcommentId = tagcomment['commentid']
commendId = []
cleanComment = []
sentiment = []
# tagger = []
    if score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'


afinn = Afinn()


def afinn_polarity(text):
    score = afinn.score(text)
    if score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'


senti = PySentiStr()
senti.setSentiStrengthPath(senti_strength_jar_filepath)
senti.setSentiStrengthLanguageFolderPath(senti_strength_data_dirname)


def sentistrength_polarity(text):
    score = senti.getSentiment([text])[0]
    if score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'


mpqa_df = pd.read_csv(mpqa_filepath)


def mpqa_polarity(text):
# tqdm loading
from datetime import datetime
from pair_score import calculatePairScore
import os
import sys

NUMTHREAD = 20

curdir = os.getcwd()
while 'filepathhelper.py' not in os.listdir(curdir):
    curdir = os.path.dirname(curdir)
sys.path.append(curdir)
import filepathhelper

from tqdm import tqdm
import multiprocessing as mp

senti = PySentiStr()
# senti.setSentiStrengthPath('C:\ProgramData\Anaconda3\Lib\site-packages\sentistrength\SentiStrength.jar')
# senti.setSentiStrengthLanguageFolderPath('C:\ProgramData\Anaconda3\Lib\site-packages\sentistrength\\')
senti.setSentiStrengthPath(
    '/home/waraleetan/ming/lib/python2.7/site-packages/sentistrength/SentiStrength.jar'
)
senti.setSentiStrengthLanguageFolderPath(
    '/home/waraleetan/ming/lib/python2.7/site-packages/sentistrength/')


def cleanData(text):
    # remove [~]
    result = re.sub("\\[~.*?\\]", "", text)
    # remove {code}
    result = re.sub(r'^{code(.+){code}', ' ', result)