def __initialize_senti(self):
        """Create ``self.senti`` and smoke-test it with one sentence.

        The jar is looked up at ``<cwd>/lib/SentiStrengthCom.jar`` and the
        language data at ``<cwd>/lang``.

        Raises:
            AssertionError: if the probe call does not return a list whose
                first element is a tuple.
        """
        self.senti = PySentiStr()
        engine = self.senti

        # Both locations are resolved against the current working directory.
        jar_file = Path.cwd() / 'lib' / 'SentiStrengthCom.jar'
        engine.setSentiStrengthPath(str(jar_file))
        language_dir = Path.cwd() / 'lang'
        engine.setSentiStrengthLanguageFolderPath(str(language_dir))

        # Probe call: 'dual' scoring of a one-element list must come back
        # as a list of tuples, otherwise the setup is considered broken.
        probe = engine.getSentiment(['You are beautiful'], 'dual')
        assert type(probe) is list
        assert type(probe[0]) is tuple
示例#2
0
def main():
    """Score every line of ``frases.txt`` and print the three score variants.

    For each line the text is analysed with the 'scale', 'dual' and
    'trinary' score modes (in that order) and the results are printed
    together with the stripped text.
    """
    # Build the SentiStrength wrapper and point it at its jar and data
    # folder (both locations come from module-level constants).
    analyzer = PySentiStr()
    analyzer.setSentiStrengthPath(SENTISTRENGTH_JAR_PATH)
    analyzer.setSentiStrengthLanguageFolderPath(SENTISTRENGTH_DATA_PATH)

    # Read the whole file up front; the handle is closed before any
    # scoring starts.
    with open('frases.txt', 'r') as handle:
        raw_lines = handle.readlines()

    report = 'text: {0}\nresult_scale: {1}\nresult_dual: {2}\nresult_trinary: {3}\n'
    score_modes = ('scale', 'dual', 'trinary')

    for raw_line in raw_lines:
        # strip() drops the trailing newline of each line
        text = raw_line.strip()
        rendered = [
            str(analyzer.getSentiment(text, score=mode))
            for mode in score_modes
        ]
        print(report.format(text, *rendered))
示例#3
0
def sentiment_analysis(tweet_sample, aggregate=True, mode='trinary'):
    """Run SentiStrength over the texts of every topic in ``tweet_sample``.

    Args:
        tweet_sample: dict mapping a topic to the text(s) handed to
            ``getSentiment``. Any non-dict value short-circuits to
            ``'No matches'``.
        aggregate: when truthy, each topic's scores are collapsed into a
            summary dict ``{'value': <sum>, 'sentiment': <label>}``.
        mode: score mode forwarded to ``getSentiment`` (the original code
            lists scale, dual, binary and trinary).

    Returns:
        ``'No matches'`` for non-dict input, otherwise a dict keyed by topic
        holding either the raw scores or the aggregated summary.
    """
    # Validate first so bad input never pays for (or depends on) the
    # SentiStrength setup.
    if not isinstance(tweet_sample, dict):
        return 'No matches'

    senti = PySentiStr()
    senti.setSentiStrengthPath(sentistrength_jar_full_path)
    senti.setSentiStrengthLanguageFolderPath(sentistrength_lan_full_path_en)

    sentiment_dict = {}
    for topic, tweets in tweet_sample.items():
        # Scores: scale, dual, binary and trinary
        sentiment = senti.getSentiment(tweets, score=mode)
        if aggregate:
            # Index 2 is read from every score, so aggregation needs a mode
            # whose scores have at least three elements; per the original
            # comment that is the trinary tuple.
            sentisum = sum(sent[2] for sent in sentiment)
            # NOTE(review): a sum of exactly 0 is labelled 'negative';
            # confirm whether a neutral label is wanted.
            sentiment = {
                'value': sentisum,
                'sentiment': 'positive' if sentisum > 0 else 'negative',
            }
        sentiment_dict[topic] = sentiment
    return sentiment_dict
示例#4
0
def tweet_word_sentiment(data):
    '''
    Compute word-level sentiment extremes for every tweet.

    input: whole corpus; each tweet must provide tweet_words() and tweet_id
    output: 1 dict for tweet_word_sentiment,
            keys: tweet_id, values: list [max, min, distance]
                  max--highest sentiment score among all words
                  min--lowest sentiment score among all words
                  distance-- difference between highest score and lowest score
            Tweets without any alphanumeric word get [0, 0, 0], so every
            value has the same shape.
    '''
    feature_dict = {}
    senti = PySentiStr()
    senti.setSentiStrengthPath('./SentiStrength.jar')
    senti.setSentiStrengthLanguageFolderPath('./SentiStrengthData/')

    for tweet in data:
        # Only purely alphanumeric tokens are scored.
        new_words = [word for word in tweet.tweet_words() if word.isalnum()]
        if not new_words:
            # Same [max, min, distance] layout as the scored branch below.
            feature_dict[tweet.tweet_id] = [0, 0, 0]
            continue
        # One score per word (default score mode of getSentiment).
        result = senti.getSentiment(new_words)
        max_, min_ = max(result), min(result)
        feature_dict[tweet.tweet_id] = [max_, min_, max_ - min_]
    return feature_dict
def sentistr(x):
    """Return the SentiStrength 'trinary' scores for ``x``.

    A fresh wrapper is configured on every call, using the relative paths
    ``SentiStrength.jar`` and ``SentStrength_Data``.
    """
    analyzer = PySentiStr()
    analyzer.setSentiStrengthPath("SentiStrength.jar")
    analyzer.setSentiStrengthLanguageFolderPath("SentStrength_Data")
    # trinary: positive rating, negative rating and neutral rating
    return analyzer.getSentiment(x, score='trinary')
示例#6
0
def get_sentistrength(df):
    """Add SentiStrength trinary scores to ``df`` and return it.

    Args:
        df: table with a ``"text"`` column (presumably a pandas DataFrame;
            verify against the caller). It is modified in place: empty
            strings in ``"text"`` are replaced by a single space and three
            score columns are added.

    Returns:
        The same ``df`` with ``sentistrength_pos``, ``sentistrength_neg`` and
        ``sentistrength_neutral`` filled from elements 0, 1 and 2 of each
        trinary score.
    """
    import os  # local import: only needed to expand "~" below

    senti = PySentiStr()
    # "~" is only a shell convention; expand it explicitly so the jar and
    # the data folder resolve to the user's home directory.
    senti.setSentiStrengthPath(
        os.path.expanduser('~/softwares/SentiStrengthCom.jar'))
    senti.setSentiStrengthLanguageFolderPath(
        os.path.expanduser('~/softwares/SentStrength_Data_Sept2011/'))

    # Empty texts are swapped for a single space before scoring.
    df["text"] = [t if t != "" else " " for t in df['text']]
    result = senti.getSentiment(df["text"], score='trinary')
    df["sentistrength_pos"] = [r[0] for r in result]
    df["sentistrength_neg"] = [r[1] for r in result]
    df["sentistrength_neutral"] = [r[2] for r in result]
    return df
示例#7
0
def main():
    """Classify each comment of the input CSV and write the result CSV.

    Every row with a non-empty ``message`` is tokenized, stripped of stop
    words, stemmed and scored with SentiStrength; the original message, the
    two scores and a label ('negativo', 'positivo' or 'neutro') are written
    to the output file. Rows that become empty at any preprocessing step
    are skipped.
    """
    # change the input file here
    with open(
            './Comentarios_csv/Test/OPOVOOnline sobre escolha do novo reitor UFC.csv'
    ) as source_file:
        reader = csv.DictReader(source_file)

        analyzer = PySentiStr()
        analyzer.setSentiStrengthPath(
            "/home/caio/Documentos/Projeto Analise Comentarios Facebook/SentiStrength.jar"
        )
        analyzer.setSentiStrengthLanguageFolderPath(
            "/home/caio/Documentos/Projeto Analise Comentarios Facebook/SentStrength_Data/portuguese/"
        )

        # change the output file here
        with open('./Comentarios_csv/Test/teste.csv', 'w') as target_file:
            writer = csv.writer(target_file)
            writer.writerow(
                ["Comentário", "notaPositiva", "notaNegativa", "Sentimento"])

            for row in reader:
                # "message" is the column holding the comment text
                message = row["message"]
                if not message:
                    continue

                # Preprocessing pipeline (accent removal is not applied).
                tokens = Tokenize(message)
                if not tokens:
                    continue
                tokens = RemoveStopWords(tokens)
                if not tokens:
                    continue
                cleaned = " ".join(Stemming(tokens))

                # sentistrength
                result = analyzer.getSentiment(cleaned, score='binary')
                first, second = result[0][0], result[0][1]
                balance = first + second
                if balance <= -1:
                    sentiment = 'negativo'
                elif balance >= 1:
                    sentiment = 'positivo'
                else:
                    sentiment = 'neutro'

                writer.writerow([message, first, second, sentiment])
            print("finish!")
示例#8
0
# Script: load the FreeNow review CSV named in config.yaml and score each
# review text with SentiStrength.
from utils import *
import pandas as pd

import ssl
# Swap the default HTTPS context factory for the unverified one, i.e. HTTPS
# connections created through the default context skip certificate checks.
# NOTE(review): security-sensitive override; confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context

# get_config is not defined in this snippet; presumably it is provided by the
# star import from utils above -- TODO confirm.
config = get_config('config.yaml')

from sentistrength import PySentiStr
senti = PySentiStr()

# Rocket HPC
senti.setSentiStrengthPath('/gpfs/space/home/enlik/GitRepo/master-thesis-2021/references/SentiStrengthCom.jar') # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath('/gpfs/space/home/enlik/GitRepo/master-thesis-2021/references/SentiStrengthData/') # Note: Provide absolute path instead of relative path



# Load the reviews; the first CSV column is used as index and then discarded
# by reset_index(drop=True), leaving positions 0..n-1 as row labels.
df_freenow = pd.read_csv(config['csv_input_local']['freenow_apple_google_p1'], index_col=0)
df_freenow = df_freenow.reset_index(drop=True)
total_reviews = len(df_freenow)

print(f'Total English reviews: {total_reviews} \n')
# Force the review column to str before it is handed to SentiStrength.
df_freenow.review = df_freenow.review.astype(str)

# df_freenow = df_freenow.head(10) # testing purpose
listOfSentimentScores = []

# Score the reviews one row at a time (one getSentiment call per review).
# NOTE(review): within the visible part of the loop, star_rating and result
# are assigned but not yet used, and listOfSentimentScores is never filled.
for i in range(0, int(len(df_freenow))):
    text_input = df_freenow.review[i]
    star_rating = df_freenow.rating[i]
    result = senti.getSentiment(text_input)
示例#9
0
# Demo script: score one sentence with each of the four score modes
# ('dual', 'binary', 'trinary', 'scale') and print the results.
from sentistrength import PySentiStr

# Initialize SentiStrength; both paths are relative to the working directory.
senti = PySentiStr()
senti.setSentiStrengthPath("SentiStrength.jar")
senti.setSentiStrengthLanguageFolderPath("SentiStrength_Data")

# Same input sentence, one call per score mode.
frase1 = senti.getSentiment('The food here is GREAT!!', score='dual')
frase2 = senti.getSentiment('The food here is GREAT!!', score='binary')
frase3 = senti.getSentiment('The food here is GREAT!!', score='trinary')
frase4 = senti.getSentiment('The food here is GREAT!!', score='scale')
# The printed labels are in Portuguese ("Frase N na saída <mode>").
print("Frase1 na saída dual:", frase1)
print("Frase2 na saída binary:", frase2)
print("Frase3 na saída trinary:", frase3)
print("Frase4 na saída scale:", frase4)





# Script: configure SentiStrength from fixed local paths and load every CSV
# in the "entropy files" folder into a list of DataFrames.
# Recommended: run SentiStrength on the CSV column that contains the full text.
# NOTE(review): os, glob, pd and PySentiStr are used below but not imported
# in this snippet; they must be imported earlier in the file.

# The location of SentiStrength on your computer
SentiStrengthLocation = "C:/Users/ThinkPad/SpyderProjects/sentistrengthStuff/SentiStrength.jar" 

# The location of the unzipped SentiStrength data files on your computer
SentiStrengthLanguageFolder = "C:/Users/ThinkPad/SpyderProjects/sentistrengthStuff/SentiStrength_Data/" 

# Check the paths: nothing is printed when both exist. A missing path only
# prints a message; the script keeps running.
if not os.path.isfile(SentiStrengthLocation):
    print("SentiStrength not found at: ", SentiStrengthLocation)
if not os.path.isdir(SentiStrengthLanguageFolder):
    print("SentiStrength data folder not found at: ", SentiStrengthLanguageFolder)

# Create the wrapper object
senti = PySentiStr()

# Set paths
senti.setSentiStrengthPath(SentiStrengthLocation) 
senti.setSentiStrengthLanguageFolderPath(SentiStrengthLanguageFolder) 


# Collect the CSV files to read (adjust the folder to your own path)
all_files = glob.glob("C:/Users/ThinkPad/SpyderProjects/sentistrengthStuff/entropy files" + "/*.csv")

# One DataFrame per input file, in glob order
li = []

# Read each file with its first row as header.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# pandas 2.0 in favour of on_bad_lines='skip'; confirm the pandas version in
# use before upgrading.
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, error_bad_lines=False)
    li.append(df)
示例#11
0
def _splitLongComment(comment: str, maxBytes: int = 20 * 1024) -> List[str]:
    """Split ``comment`` into equally sized pieces when it exceeds ``maxBytes``.

    The size is measured with ``sys.getsizeof`` (object size, an upper-bound
    approximation of the text size). Short comments are returned unchanged as
    a one-element list. Pieces consisting only of whitespace are dropped.
    """
    byteChunks = math.ceil(sys.getsizeof(comment) / maxBytes)
    if byteChunks <= 1 or not comment:
        return [comment]

    # Round UP: with floor() the last len(comment) % byteChunks characters
    # would fall outside every chunk and be lost.
    chunkLength = math.ceil(len(comment) / byteChunks)
    chunks = (comment[start:start + chunkLength]
              for start in range(0, len(comment), chunkLength))
    return [chunk for chunk in chunks if chunk.strip()]


def prAnalysis(
    config: Configuration,
    senti: sentistrength.PySentiStr,
    delta: relativedelta,
    batchDates: List[datetime],
):
    """Analyze pull requests batch by batch and write per-batch CSV metrics.

    For every batch returned by ``prRequest`` this collects participants and
    comments, runs per-PR sentiment analysis in threads, scores all comments
    of the batch at once, computes the toxicity percentage, builds the
    participant network and appends the results to CSV files under
    ``config.resultsPath`` and ``config.metricsPath``.

    Args:
        config: provides ``pat``, ``repositoryOwner``, ``repositoryName``,
            ``resultsPath`` and ``metricsPath``.
        senti: configured SentiStrength wrapper.
        delta: forwarded to ``prRequest``.
        batchDates: forwarded to ``prRequest``.

    Returns:
        Tuple ``(batchParticipants, batchComments)``: one list of participant
        lists and one list of comments per batch.
    """

    print("Querying PRs")
    batches = prRequest(config.pat, config.repositoryOwner,
                        config.repositoryName, delta, batchDates)

    batchParticipants = list()
    batchComments = list()

    for batchIdx, batch in enumerate(batches):
        print(f"Analyzing PR batch #{batchIdx}")

        # extract data from batch
        prCount = len(batch)
        participants = [
            pr["participants"] for pr in batch
            if len(pr["participants"]) > 0
        ]
        batchParticipants.append(participants)

        allComments = list()
        prPositiveComments = list()
        prNegativeComments = list()
        generallyNegative = list()

        print("    Sentiments per PR", end="")

        # The semaphore is handed to every worker; presumably
        # analyzeSentiments uses it to cap concurrent analyses at 15.
        semaphore = threading.Semaphore(15)
        threads = []
        for pr in batch:

            # drop empty/blank comments and split the ones longer than 20KB
            comments = [
                chunk
                for comment in pr["comments"]
                if comment and comment.strip()
                for chunk in _splitLongComment(comment)
            ]

            if len(comments) == 0:
                # PRs without usable comments count as 0 positive/negative
                prPositiveComments.append(0)
                prNegativeComments.append(0)
                continue

            allComments.extend(comments)

            thread = threading.Thread(
                target=analyzeSentiments,
                args=(
                    senti,
                    comments,
                    prPositiveComments,
                    prNegativeComments,
                    generallyNegative,
                    semaphore,
                ),
            )
            threads.append(thread)

        # start all workers only after every PR was prepared, then wait
        for thread in threads:
            thread.start()

        for thread in threads:
            thread.join()

        print("")

        # save comments
        batchComments.append(allComments)

        # get comment length stats
        commentLengths = [len(c) for c in allComments]

        # an empty batch has no PRs to be negative about: avoid dividing by 0
        generallyNegativeRatio = (len(generallyNegative) /
                                  prCount if prCount else 0)

        # get pr duration stats
        durations = [(pr["closedAt"] - pr["createdAt"]).days for pr in batch]

        print("    All sentiments")

        commentSentiments = []
        commentSentimentsPositive = 0
        commentSentimentsNegative = 0

        if len(allComments) > 0:
            commentSentiments = senti.getSentiment(allComments)
            commentSentimentsPositive = sum(
                1 for value in commentSentiments if value >= 1)
            commentSentimentsNegative = sum(
                1 for value in commentSentiments if value <= -1)

        toxicityPercentage = getToxicityPercentage(config, allComments)

        centrality.buildGraphQlNetwork(batchIdx, participants, "PRs", config)

        print("    Writing results")
        # all CSV files are opened in append mode
        with open(
                os.path.join(config.resultsPath, f"results_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["NumberPRs", prCount])
            w.writerow(["NumberPRComments", len(allComments)])
            w.writerow(["PRCommentsPositive", commentSentimentsPositive])
            w.writerow(["PRCommentsNegative", commentSentimentsNegative])
            w.writerow(["PRCommentsNegativeRatio", generallyNegativeRatio])
            w.writerow(["PRCommentsToxicityPercentage", toxicityPercentage])

        with open(
                os.path.join(config.metricsPath, f"PRCommits_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Commit Count"])
            for pr in batch:
                w.writerow([pr["number"], pr["commitCount"]])

        with open(
                os.path.join(config.metricsPath,
                             f"PRParticipants_{batchIdx}.csv"),
                "a",
                newline="",
        ) as f:
            w = csv.writer(f, delimiter=",")
            w.writerow(["PR Number", "Developer Count"])
            for pr in batch:
                w.writerow([pr["number"], len(set(pr["participants"]))])

        # output statistics
        stats.outputStatistics(
            batchIdx,
            commentLengths,
            "PRCommentsLength",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            durations,
            "PRDuration",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            [len(pr["comments"]) for pr in batch],
            "PRCommentsCount",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            [pr["commitCount"] for pr in batch],
            "PRCommitsCount",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            commentSentiments,
            "PRCommentSentiments",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            [len(set(pr["participants"])) for pr in batch],
            "PRParticipantsCount",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            prPositiveComments,
            "PRCountPositiveComments",
            config.resultsPath,
        )

        stats.outputStatistics(
            batchIdx,
            prNegativeComments,
            "PRCountNegativeComments",
            config.resultsPath,
        )

    return batchParticipants, batchComments
示例#12
0
def __aggregate(_posts, _media, _comments, session, logger):
    """Build one feature row per post from posts, media, comments and DB lookups.

    Args:
        _posts: iterable of post objects; the code reads id, name,
            discussion_url, featured, votes_count, day, created_at,
            description and tagline from each.
        _media: indexable sequence of media rows; index 0 is compared with the
            post id, 1 is the media type, 2 the video link, 3 and 4 are passed
            to calculate_aspect_ratio, 5 is the image url.
        _comments: indexable sequence of comment rows; per the comments in the
            code the layout is 0 comment id, 1 body, 2 created_at, 3 post id,
            4 user id.
        session: object exposing ``query(...)`` (presumably a SQLAlchemy
            session -- TODO confirm).
        logger: logger used to report posts that are skipped.

    Returns:
        List of rows (lists). A post whose lookups raise NoResultFound or
        MultipleResultsFound is logged and left out.
    """
    _entries = list()

    # Initialize sentistrength variable
    senti = PySentiStr()
    setup_sentistrength_path(senti)

    for p in _posts:
        try:
            """ Id, name of a post """
            entry = [p.id, p.name]
            """ Number of version of a post """
            version = __extract_version(p.discussion_url)
            entry = entry + [version]
            """ Number of tags for a product """
            tags_number = session.query(func.count(
                Topic.name)).filter(Topic.post_id == p.id).scalar()
            entry = entry + [tags_number]

            entry = entry + [p.featured, p.votes_count, p.day, p.created_at]
            """ Time features """
            launch_day = get_day_name_from_date(p.created_at.year,
                                                p.created_at.month,
                                                p.created_at.day)
            best_launch_time = is_best_posted_time(p.created_at.hour,
                                                   p.created_at.minute,
                                                   p.created_at.second)
            best_launch_day = is_best_launched_day(p.created_at.hour,
                                                   p.created_at.minute,
                                                   p.created_at.second,
                                                   launch_day)
            # NOTE(review): this query does not depend on the current post, so
            # it is re-run with the same arguments for every post.
            max_follower = session.query(func.max(
                User.followers_count)).scalar()
            # .one() raises NoResultFound / MultipleResultsFound, which the
            # handlers at the end of the loop body turn into "skip this post".
            maker_id = session.query(
                Apps.maker_id).filter(Apps.post_id == p.id).one()[0]
            maker = session.query(
                User.id, User.name, User.twitter_username, User.website_url,
                User.followers_count).filter(User.id == maker_id).one()
            weekend = is_weekend(maker.followers_count, max_follower,
                                 launch_day)
            entry = entry + [
                launch_day, best_launch_time, best_launch_day, weekend
            ]
            """ Presentation features """
            entry = entry + [p.description]
            if p.description:
                """ Extraction of maker sentiment based on the description of his post """
                maker_description_sentiment = __extract_sentiment(
                    senti, p.description)
                # two sentiment values followed by three empty placeholder
                # columns
                entry = entry + [
                    maker_description_sentiment[0][0],
                    maker_description_sentiment[0][1], '', '', ''
                ]

                # Text length
                entry = entry + [len(p.description)]

                # Sentence length
                sentence = get_sentence(p.description)
                sentence_length_sum = 0
                for i in range(0, len(sentence)):
                    sentence_length_sum = sentence_length_sum + len(
                        sentence[i])
                try:
                    sentence_length_average = sentence_length_sum / len(
                        sentence)
                except ZeroDivisionError:
                    sentence_length_average = 0.0
                entry = entry + [round(sentence_length_average)]

                # Bullet points / Explicit features
                bullet_points_explicit_features = __extract_bullet_points_explicit_features(
                    sentence)
                entry = entry + [bullet_points_explicit_features]

                # Emoji in description
                emoji_description = __extract_emoji(p.description)
                entry = entry + [emoji_description]
            else:
                # No description: fill the same nine columns with defaults
                # (sentiment pair 1/-1, three placeholders, zero lengths,
                # 'No' for bullet points and emoji).
                entry = entry + [1, -1, '', '', '', 0, 0, 'No', 'No']

            entry = entry + [p.tagline]
            if p.tagline:
                # Tagline length
                entry = entry + [len(p.tagline)]

                # Emoji in tagline
                emoji_tagline = __extract_emoji(p.tagline)
                entry = entry + [emoji_tagline]
            else:
                entry = entry + [0, 'No']

            # Video, Tweetable images, Gif and Gif's number for a post
            # (the whole _media sequence is scanned for every post)
            video = []
            tweetable_images = []
            gif = []
            index_media = 0
            while index_media < len(_media):
                # check if the current post_id is equal to the post_id of the current media
                if p.id == _media[index_media][0]:
                    # check if the media type is 'video'
                    if _media[index_media][1] == 'video':
                        # append to the list the link of the video
                        video = video + [_media[index_media][2]]

                    # calculate the image size passing its width and its height
                    roughly, ratio = calculate_aspect_ratio(
                        _media[index_media][3], _media[index_media][4])
                    # check if the image is a tweetable image
                    if (roughly == 2) and (ratio == 1):
                        # append to the list the image url
                        tweetable_images = tweetable_images + [
                            _media[index_media][5]
                        ]

                    # check if the image is a gif image passing its url
                    found = is_gif(_media[index_media][5])
                    if found:
                        # append to the list the image url
                        gif = gif + [_media[index_media][5]]
                index_media = index_media + 1
            if video:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']
            if tweetable_images:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']
            if gif:
                entry = entry + [gif, len(gif)]
            else:
                entry = entry + ['', len(gif)]

            # Offers, Promo/Discount Codes, Questions, Maker_inside, Hunter_inside in comment body for a post
            # (the whole _comments sequence is scanned for every post)
            offers = []
            questions = []
            promo_codes = []
            # maker_follows_up_on_comments = 0
            hunter_follows_up_on_comments = 0
            maker_comments = []
            others_comments = []
            comm_in_thread = []
            hunter_id = session.query(
                Hunts.hunter_id).filter(Hunts.post_id == p.id).one()[0]
            index_comment = 0
            while index_comment < len(_comments):
                # check if the current post_id is equal to the post_id of the current comment
                if p.id == _comments[index_comment][3]:
                    # extract offers passing the comment body
                    offer = __extract_offers(_comments[index_comment][1])
                    if offer:
                        offers = offers + offer

                    # extract questions passing the comment body
                    question = __extract_questions(_comments[index_comment][1])
                    if question:
                        questions = questions + question

                    # extract promo_codes passing the comment body
                    promo_code = __extract_promo_codes(
                        _comments[index_comment][1])
                    if promo_code:
                        promo_codes = promo_codes + promo_code

                    # # check if the maker follows up on the current comment
                    # if _comments[index_comment][4] == maker_id:
                    #    maker_follows_up_on_comments = 1

                    # put comments in comm list (comment_id, comment_body, created_at, user_id)
                    comm_in_thread.append([
                        _comments[index_comment][0],
                        _comments[index_comment][1],
                        _comments[index_comment][2],
                        _comments[index_comment][4]
                    ])

                    # check if the hunter follows up on the current comment
                    if _comments[index_comment][4] == hunter_id:
                        hunter_follows_up_on_comments = 1
                    """ Extraction of maker sentiment based on his post comments written the day of launch """
                    if _comments[index_comment][4] == maker_id:
                        # date of maker's comment written the day the post was launched
                        comment_date = _comments[index_comment][2]
                        # cut the comments written days after the post was launched
                        if (p.created_at.year == comment_date.year) and (
                                p.created_at.month == comment_date.month) and (
                                    p.created_at.day == comment_date.day):
                            if not maker_comments:
                                maker_comments = [_comments[index_comment][1]]
                            else:
                                maker_comments = maker_comments + [
                                    _comments[index_comment][1]
                                ]
                    """ Extraction of others users sentiment based on their post comments written the day of launch """
                    # NOTE(review): the second operand compares maker_id with
                    # hunter_id, not the comment author with hunter_id, so no
                    # "others" comments are collected when maker and hunter
                    # are the same user, and the hunter's own comments count
                    # as "others" otherwise -- confirm this is intended.
                    if (_comments[index_comment][4] !=
                            maker_id) and (maker_id != hunter_id):
                        # date of others comment written the day the post was launched
                        comment_date = _comments[index_comment][2]
                        # cut the comments written days after the post was launched
                        if (p.created_at.year == comment_date.year) and (
                                p.created_at.month == comment_date.month) and (
                                    p.created_at.day == comment_date.day):
                            if not others_comments:
                                others_comments = [_comments[index_comment][1]]
                            else:
                                others_comments = others_comments + [
                                    _comments[index_comment][1]
                                ]

                index_comment = index_comment + 1
            if offers:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']
            if promo_codes:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']
            if questions:
                entry = entry + ['Yes']
            else:
                entry = entry + ['No']

            # check if the maker writes the first comment in the thread
            # (comm_in_thread[0] is the first matching row in _comments order;
            # assumes _comments is sorted chronologically -- TODO confirm)
            maker_started_comment_thread = 0
            if comm_in_thread:
                if (p.created_at.year == comm_in_thread[0][2].year) and (
                        p.created_at.month == comm_in_thread[0][2].month) and (
                            p.created_at.day == comm_in_thread[0][2].day):
                    if comm_in_thread[0][3] == maker_id:
                        maker_started_comment_thread = 1

            # calculate maker comment ratio ((number of maker comments / number of all comments)*100)
            # only comments written on the launch day are counted
            number_maker_comments = 0
            number_others_comments = 0
            if comm_in_thread:
                for i in range(0, len(comm_in_thread)):
                    if (p.created_at.year == comm_in_thread[i][2].year) and (
                            p.created_at.month == comm_in_thread[i][2].month
                    ) and (p.created_at.day == comm_in_thread[i][2].day):
                        if comm_in_thread[i][3] == maker_id:
                            number_maker_comments = number_maker_comments + 1
                        else:
                            number_others_comments = number_others_comments + 1
            thread_length = number_maker_comments + number_others_comments
            # the ratio stays 0 unless the maker opened the thread
            try:
                if maker_started_comment_thread == 1:
                    maker_comment_ratio = (number_maker_comments /
                                           thread_length) * 100
                else:
                    maker_comment_ratio = 0.0
            except ZeroDivisionError:
                maker_comment_ratio = 0.00

            # Hunter reputation
            hunter = session.query(
                User.id, User.name, User.twitter_username, User.website_url,
                User.followers_count,
                User.apps_made_count).filter(User.id == hunter_id).one()
            entry = entry + [
                hunter.id, hunter.name, hunter.twitter_username,
                hunter.website_url, hunter.followers_count,
                hunter.apps_made_count, hunter_follows_up_on_comments
            ]

            # Maker reputation
            entry = entry + [
                maker.id, maker.name, maker.twitter_username,
                maker.website_url, maker.followers_count,
                maker_started_comment_thread,
                round(maker_comment_ratio, 2), thread_length
            ]

            # check if the hunter is also the maker and append the variable hunter_is_maker to the list entry
            hunter_is_maker = 0
            if hunter_id == maker_id:
                hunter_is_maker = 1
            entry = entry + [hunter_is_maker]

            # Append to the list the maker comment sentiment
            # (launch-day comments joined by newlines and scored as one text;
            # [[1, -1]] is the default pair when there is nothing to score)
            if maker_comments:
                comment = '\n'.join(maker_comments)
                sentiment = __extract_sentiment(senti, comment)
            else:
                comment = ''
                sentiment = [[1, -1]]
            entry = entry + [
                comment, sentiment[0][0], sentiment[0][1], '', '', ''
            ]

            # Append to the list the others comment sentiment
            if others_comments:
                comment = '\n'.join(others_comments)
                sentiment = __extract_sentiment(senti, comment)
            else:
                comment = ''
                sentiment = [[1, -1]]
            entry = entry + [
                comment, sentiment[0][0], sentiment[0][1], '', '', ''
            ]

            _entries.append(entry)
        except NoResultFound as ex:
            # a required lookup returned no row: log and skip this post
            logger.error(str(ex))
            continue
        except MultipleResultsFound as ex:
            # a lookup expected to be unique returned several rows: log and skip
            logger.error(str(ex))
            continue
    return _entries
def main():
    """Save every post/comment that SentiStrength scores as neutral to a CSV file.

    Reads './OPOVOOnline sobre escolha do novo reitor UFC.csv' row by row. For
    each row two texts are examined:

    * the post text (column "message"), skipped when it is empty or equal to
      the previous row's message;
    * the comment text (column "object_link.connections.comments.message"),
      skipped when it is empty or the literal string 'null'.

    Each text is tokenized, stripped of stop words and stemmed before being
    scored. When the positive and negative scores add up to zero, the
    original (unprocessed) text and both scores are written to
    'Frases_Neutras.csv'.

    Both CSV files are opened with ``newline=''`` as the ``csv`` module
    requires, so newlines embedded in quoted fields are preserved and no
    extra carriage returns are produced on platforms that translate newlines.
    """
    with open(
            './OPOVOOnline sobre escolha do novo reitor UFC.csv',
            newline='') as csv_file:
        csv_dict_reader = csv.DictReader(csv_file)
        senti = PySentiStr()
        senti.setSentiStrengthPath(
            "/home/caio/Documentos/Projeto Analise Comentarios Facebook/SentiStrength.jar"
        )
        senti.setSentiStrengthLanguageFolderPath(
            "/home/caio/Documentos/Projeto Analise Comentarios Facebook/SentStrength_Data/portuguese/"
        )

        def neutral_scores(text):
            """Return (positive, negative) when *text* scores neutral, else None."""
            tokens = Tokenize(text)
            if not tokens:
                return None
            tokens = RemoveStopWords(tokens)
            if not tokens:
                return None
            sentence = " ".join(Stemming(tokens))
            result = senti.getSentiment(sentence, score='binary')
            positive, negative = result[0][0], result[0][1]
            if positive + negative == 0:
                return positive, negative
            return None

        prev_message = ""
        comment_column = "object_link.connections.comments.message"

        with open(
                '/home/caio/Documentos/Projeto Analise Comentarios Facebook/Frases_Neutras.csv',
                'w',
                newline='') as csvfile:
            spamwriter = csv.writer(csvfile)
            spamwriter.writerow(["Frase", "notaPositiva", "notaNegativa"])
            for row in csv_dict_reader:
                # Post text: consecutive rows repeat the same post, so it is
                # scored only when it differs from the previous row.
                message = row["message"]
                if prev_message != message and message:
                    scores = neutral_scores(message)
                    if scores is not None:
                        # save the whole (unprocessed) sentence
                        spamwriter.writerow([message, scores[0], scores[1]])

                # Post that carries a comment reply
                comment = row[comment_column]
                if comment != 'null' and comment:
                    scores = neutral_scores(comment)
                    if scores is not None:
                        # save the whole (unprocessed) sentence
                        spamwriter.writerow([comment, scores[0], scores[1]])

                prev_message = message
            print("finish!")
示例#14
0
import sys
import json
import csv
import pandas as pd
import re
import demoji
import emoji
from datetime import datetime
from sentistrength import PySentiStr
from langdetect import detect

# Take the caption as input. It must be a string.
# E.g.: python3 txt_features.py "ciao come stai?"
caption = sys.argv[1]
senti = PySentiStr()

# Set the 3 correct paths; the three files live in the SentiStrength folder:
# 1) SentiStrength.jar - 2) SentStrength_Data_EN - 3) SentStrength_Data_IT2
# Only the jar path is registered here; the two language folders are kept as
# plain strings (eng_path / ita_path).
senti.setSentiStrengthPath('./SentiStrength/SentiStrength.jar')
eng_path = './SentiStrength/SentStrength_Data_EN'
ita_path = './SentiStrength/SentStrength_Data_IT2'


def deEmojify(inputString):
    """Return *inputString* with every non-ASCII character (emoji included) dropped."""
    ascii_only = inputString.encode('ascii', 'ignore')
    return ascii_only.decode('ascii')


#number of hashtag
def hashtag_count(string):
    """Return how many whitespace-separated tokens of *string* start with '#'."""
    return sum(1 for token in string.split() if token.startswith('#'))

示例#15
0
def pre_process_and_predict(sentence):
    """Clean *sentence*, derive sentiment/count features and predict sarcasm.

    Steps:
      1. Normalise the text (quote replacement, removal of every character
         except ``! ? ,``, ASCII letters, digits and spaces, verb
         lemmatisation).
      2. For each sub-sentence, average the SentiStrength dual score with the
         summed SenticNet word polarities (scaled by 5) into a
         (positive, negative) pair.
      3. Bucket those pairs and four surface counts (punctuation, characters,
         capitals, exclamations) into "sarcastic" / "not sarcastic" indicator
         features.
      4. Load the pickled vocabulary and model, vectorise the cleaned sentence
         with TF-IDF and let the model predict.

    Diagnostic values are printed along the way.

    Args:
        sentence (str): raw input text.

    Returns:
        bool: True when the model prediction equals 1, or when the sarcastic
        feature sum is at least the not-sarcastic sum and some sub-sentence
        carries both positive and negative sentiment; False otherwise.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    # Replace double quotes with single quotes
    sentence = sentence.replace("\"", "\'")
    # Remove special characters, keeping only , ! ? alphanumerics and spaces
    sentence = re.sub(r"[^!?,a-zA-Z0-9\ ]+", '', sentence)
    # Lemmatize verbs
    sentence = ' '.join([
        wordnet_lemmatizer.lemmatize(word, pos='v')
        for word in word_tokenize(sentence)
    ])

    sn = SenticNet()
    senti = PySentiStr()
    senti.setSentiStrengthPath(CODE_PATH + '/sentistrength/SentiStrength.jar')
    senti.setSentiStrengthLanguageFolderPath(
        CODE_PATH + '/sentistrength/SentStrength_Data/')

    sentiment_score = []

    for sen in sent_tokenize(sentence):
        senti_pos, senti_neg = senti.getSentiment(sen, score='dual')[0]
        # Shift the scores so that the "no sentiment" values (1 / -1) become 0.
        senti_pos -= 1
        if senti_neg == -1:
            senti_neg = 0
        sum_pos_score = 0
        sum_neg_score = 0
        for word in word_tokenize(sen):
            try:
                w_score = float(sn.polarity_intense(word)) * 5
            except KeyError:
                # word unknown to SenticNet
                w_score = 0
            if w_score > 0:
                sum_pos_score = sum_pos_score + w_score
            elif w_score < 0:
                sum_neg_score = sum_neg_score + w_score
        sum_pos_score = (sum_pos_score + senti_pos) / 2
        sum_neg_score = (sum_neg_score + senti_neg) / 2
        sentiment_score.append((sum_pos_score, sum_neg_score))

    contra = []
    pos_low = []
    pos_medium = []
    pos_high = []
    neg_low = []
    neg_medium = []
    neg_high = []

    # NOTE(review): negative scores are <= 0 here, so `neg_high` (>= 2) can
    # never fire and `neg_low` (< 0) fires for any negative sentiment; scores
    # strictly between 1 and 2 fall into no positive bucket. The thresholds
    # are kept as they were - confirm whether abs() was intended.
    for sum_pos_score, sum_neg_score in sentiment_score:
        contra.append(int(sum_pos_score > 0 and abs(sum_neg_score) > 0))
        pos_low.append(int(sum_pos_score < 0))
        pos_medium.append(int(sum_pos_score >= 0 and sum_pos_score <= 1))
        pos_high.append(int(sum_pos_score >= 2))
        neg_low.append(int(sum_neg_score < 0))
        neg_medium.append(int(sum_neg_score >= 0 and sum_neg_score <= 1))
        neg_high.append(int(sum_neg_score >= 2))

    # default=0 keeps an input without any sub-sentence (e.g. an empty string
    # after cleaning) from raising ValueError on max() of an empty list.
    additional_features_s = [
        max(pos_medium, default=0),
        max(pos_high, default=0),
        max(neg_medium, default=0),
        max(neg_high, default=0)
    ]
    additional_features_ns = [
        max(pos_low, default=0),
        max(neg_low, default=0)
    ]
    contradiction = max(contra, default=0)

    tweet = sentence
    punctuation_count = SequencePunctuationCount(tweet)
    character_count = SequenceCharacterCount(tweet)
    capitalized_count = CapitalizedCount(tweet)
    exclamation_count = ExclamationCount(tweet)
    #     emoji_count       = EmojiCount(tweet)
    f_count = [
        punctuation_count, character_count, capitalized_count,
        exclamation_count
    ]
    for count in f_count:
        f_low = int(count == 0)
        f_medium = int(count >= 1 and count <= 3)
        f_high = int(count >= 4)
        additional_features_s = additional_features_s + [f_medium, f_high]
        additional_features_ns = additional_features_ns + [f_low]
    X = [sentence]

    # NOTE: pickle.load can execute arbitrary code; these files must come from
    # a trusted source. `with` guarantees the handles are closed even when
    # unpickling fails.
    with open(os.path.join(PICKLES_PATH, "vocab.pickle"), "rb") as in_file:
        vocab = pickle.load(in_file)

    with open(os.path.join(PICKLES_PATH, "model.pickle"), "rb") as in_file:
        model = pickle.load(in_file)

    vectorizer = TfidfVectorizer(vocabulary=vocab)
    X = vectorizer.fit_transform(X)
    ans = int(sum(model.predict(X)))

    # Hand-crafted rule: sarcastic evidence outweighs (or ties) the
    # non-sarcastic evidence AND some sub-sentence mixes both polarities.
    my_obs = (sum(additional_features_s) >= sum(additional_features_ns)
              and contradiction == 1)

    print('Sentence : ', sentence)
    print('Sarcastic features : ', additional_features_s)
    print('Not Sarcastic features : ', additional_features_ns)
    print('Contradict : ', contradiction)
    print('Model Predict : ', ans)
    print('My obs : ', int(my_obs))
    print('Final Prd : ', end='')

    return bool(ans == 1 or my_obs)
示例#16
0
import itertools
#from multiprocessing.pool import ThreadPool
#pool = ThreadPool(20)  # However many you wish to run in parallel

from tqdm import tqdm

import glob

import os.path
import sys
from os import getcwd


from sentistrength import PySentiStr
# Module-level SentiStrength wrapper. The jar and the language data are both
# looked up in a "SentiStrengthData" folder under the current working
# directory, so the script must be started from the folder containing it.
senti = PySentiStr()
#senti.setSentiStrengthPath('C:\\SentiStrength\\SentiStrength.jar') # e.g. 'C:\Documents\SentiStrength.jar'
#senti.setSentiStrengthLanguageFolderPath('C:\\SentiStrength') # e.g. 'C:\Documents\SentiStrengthData\'
senti.setSentiStrengthPath(os.path.join(getcwd(),"SentiStrengthData/SentiStrength.jar"))
senti.setSentiStrengthLanguageFolderPath(os.path.join(getcwd(),"SentiStrengthData/"))

def preprocess_data(data):
    """Return the 'type' and 'content' columns of *data* without incomplete rows.

    Rows in which either of the two columns is missing (NaN/None) are dropped.
    The input frame is left untouched; the original row index labels are kept.

    Args:
        data (pandas.DataFrame): frame containing at least the columns
            'type' and 'content'.

    Returns:
        pandas.DataFrame: new frame with exactly those two columns.

    Raises:
        KeyError: if one of the two columns is absent from *data*.
    """
    # dropna() returns a new frame, which avoids the SettingWithCopyWarning
    # an in-place drop on a column selection triggers.
    return data[['type', 'content']].dropna()

def count_words(text):
    try:
        return len(TextBlob(text).words)
#!/usr/bin/env python
# coding: utf-8

from utils import *

import ssl
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, i.e. certificate verification is switched off for every
# stdlib HTTPS request that relies on the default context in this process.
ssl._create_default_https_context = ssl._create_unverified_context

# Project configuration, read through the get_config helper from `utils`.
config = get_config('config.yaml')

from sentistrength import PySentiStr
# SentiStrength wrapper pointed at the jar and the language data folder.
senti = PySentiStr()
senti.setSentiStrengthPath(
    '/Users/enlik/GitRepo/master-thesis-2021/references/SentiStrengthCom.jar'
)  # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath(
    '/Users/enlik/GitRepo/master-thesis-2021/references/SentiStrengthData/'
)  # Note: Provide absolute path instead of relative path

# # Sample use case of SentiStrength

# Single string, default scoring mode.
result = senti.getSentiment('What a bad day')
print(result)

# List of strings, 'scale' scoring mode.
str_arr = ['What a lovely day', 'What a bad day']
result = senti.getSentiment(str_arr, score='scale')
print(result)

# Same list, 'dual' scoring mode.
str_arr = ['What a lovely day', 'What a bad day']
result = senti.getSentiment(str_arr, score='dual')
print(result)
示例#18
0
from sentistrength import PySentiStr

# Module-level SentiStrength wrapper shared by the functions below. Both
# paths are relative, so they resolve against the current working directory.
senti = PySentiStr()
senti.setSentiStrengthPath('data/sentistrength/SentiStrength5.jar')
senti.setSentiStrengthLanguageFolderPath('data/sentistrength/SentStrength_Data')


def analyse_sentence(sentence):
    """Score *sentence* with the module-level SentiStrength wrapper.

    The wrapper's default scoring mode is used and its result is returned
    unchanged.
    """
    scores = senti.getSentiment(sentence)
    return scores
示例#19
0
class Maestro:
    """Translate the rows of a DataFrame in worker threads and score them.

    Worker threads (``__translate``) pull batches of row labels from
    ``self.jobs``, translate the last column of those rows and push the
    outcome to ``self.results``. One extra thread (``__process``) drains
    ``self.results``, scores the translated texts with SentiStrength and
    appends the rows to ``<output>_raw.csv``. When processing stops, the raw
    file is sorted by row order and written to ``<output>.csv``.

    Rows whose ``tweetid`` already appears in the raw file are not queued
    again, so an interrupted run can be resumed.
    """

    def __init__(self, df, output_path, output_name, batch):
        """Prepare tools, the job queue and the threading primitives.

        Args:
            df: pandas DataFrame with a 'tweetid' column; the first column is
                written out as the id and the last column is the text that
                gets translated.
            output_path: directory of the output files.
            output_name: base name (without extension) of the output files.
            batch: number of rows handed to a worker per job.
        """
        # storing variables
        self.df = df
        self.filename = Path(output_path) / output_name
        self.raw_file = '{}_raw.csv'.format(self.filename)
        self.batch = batch

        # initialize tools
        self.translator = Translator()
        self.__initialize_senti()

        # collect jobs
        job_list = self.__collect_jobs()
        self.total_job = len(job_list)

        # initialize queues
        self.jobs = Queue(maxsize=self.total_job)
        for job in job_list:
            self.jobs.put(job)
        self.results = Queue(maxsize=self.total_job)

        # setup threading variables
        self.stop = threading.Event()
        self.worker_ct_lock = threading.Lock()
        self.worker_ct = 0  # num_of_spawned worker
        # Set as soon as the first worker has registered itself; lets play()
        # wait without polling worker_ct (which may already be back to 0).
        self.worker_spawned = threading.Event()

    def __initialize_senti(self):
        """Create the SentiStrength wrapper and run a one-sentence smoke test."""
        self.senti = PySentiStr()
        self.senti.setSentiStrengthPath(
            str(Path.cwd() / 'lib' / 'SentiStrengthCom.jar'))
        self.senti.setSentiStrengthLanguageFolderPath(str(Path.cwd() / 'lang'))

        # simple test to make sure senti works
        test = self.senti.getSentiment(['You are beautiful'], 'dual')
        assert type(test) is list
        assert type(test[0]) is tuple

    def __collect_jobs(self):
        """Return the unprocessed row labels grouped into batches.

        Returns:
            list of tuples of row labels, each at most ``self.batch`` long.
        """
        try:
            out_df = pd.read_csv(self.raw_file, header=None)
            # column 1 of the raw file holds the tweet ids already processed
            processed_ser = self.df['tweetid'].isin(out_df[1])
        except FileNotFoundError:
            # Nothing processed yet. The mask must carry the frame's own
            # index, because the labels collected below are later used with
            # self.df.loc[...].
            zeros = np.zeros((len(self.df.index), ), dtype=bool)
            processed_ser = pd.Series(zeros, index=self.df.index)

        job_list = processed_ser[~processed_ser].index
        job_list = list(grouper(job_list, self.batch))
        if len(job_list) > 0:
            # drop the None values found in the last (possibly short) batch
            job_list[-1] = tuple(job for job in job_list[-1]
                                 if job is not None)

        return job_list

    def __despawn_worker(self):
        """Decrease the live-worker counter under its lock."""
        with self.worker_ct_lock:
            self.worker_ct = self.worker_ct - 1

    def __translate(self, thread_num):
        """Worker loop: translate queued batches until stopped or out of jobs.

        Args:
            thread_num: worker number, used only in error messages.
        """
        # Local import: only the Empty exception is needed and the
        # module-level imports of this file are not changed.
        from queue import Empty

        with self.worker_ct_lock:
            self.worker_ct = self.worker_ct + 1
        self.worker_spawned.set()

        try:
            while not self.stop.is_set():
                # Non-blocking get: with several workers, a blocking get()
                # after an empty() check could wait forever when another
                # worker takes the last job in between.
                try:
                    job = self.jobs.get_nowait()
                except Empty:
                    break

                try:
                    mini_df = self.df.loc[job, ]  # trailing comma is needed
                    ids = mini_df.iloc[:, 0]
                    items = mini_df.iloc[:, -1].to_numpy().tolist()
                except Exception as e:
                    print('Worker #{} got pandas error: {}'.format(
                        thread_num, e))
                    break

                try:
                    # NOTE(review): a one-item batch is wrapped in a list;
                    # presumably the translator returns a bare result for
                    # it - confirm against the translator in use.
                    if len(items) == 1:
                        translations = [self.translator.translate(items)]
                    else:
                        translations = self.translator.translate(items)
                except Exception as e:
                    print('Worker #{} got translation error: {}'.format(
                        thread_num, e))
                    break

                self.results.put((job, ids, translations))
        finally:
            # Always deregister, even on an unexpected exception; otherwise
            # play() would wait for this worker forever.
            self.__despawn_worker()

    def __save(self, results):
        """Append *results* (an iterable of rows) to the raw CSV file."""
        with open(self.raw_file, 'a', encoding='utf-8',
                  newline='') as csv_file:
            writer = csv.writer(csv_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerows(results)

    def __process(self, score='dual'):
        """Consumer loop: score translated batches and append them to disk.

        Runs until the stop event is set and the result queue is drained, or
        until scoring/saving fails. Afterwards it sets the stop event and
        rebuilds the sorted output file.

        Args:
            score: scoring mode handed to the SentiStrength wrapper.
        """
        total_batch = int(np.ceil(len(self.df.index) / self.batch))
        pbar = tqdm(total=total_batch, initial=(total_batch - self.total_job))

        while not self.stop.is_set() or not self.results.empty():
            time.sleep(2)
            if not self.results.empty():
                # merges all results
                job_list, id_list, translation_list = ([], [], [])
                steps = 0
                while not self.results.empty():
                    job, ids, translations = self.results.get()
                    job_list.extend(job)
                    id_list.extend(ids)
                    translation_list.extend(translations)
                    steps = steps + 1

                # analyze sentiments
                texts = [tr.text for tr in translation_list]
                try:
                    sentis = self.senti.getSentiment(texts, score)
                except Exception as e:
                    print('Process got sentistrength error:', e)
                    break

                try:
                    rows = [
                        (order, i, *senti, tr.src, text)
                        for order, i, senti, tr, text in zip(
                            job_list, id_list, sentis, translation_list, texts)
                    ]
                except Exception as e:
                    print(e)
                    break

                try:
                    self.__save(rows)
                except Exception as e:
                    print('Process got on save error:', e)
                    break

                pbar.update(steps)
            time.sleep(.1)  # prevent too much loop checking

        if not self.stop.is_set():
            self.stop.set()  # force stop all threads

        print('Rebuilding...')
        self.__rebuild()

        print('Exiting...')
        pbar.close()

    def __rebuild(self):
        """Sort the raw CSV by row order and write it, with a header, to '<output>.csv'."""
        try:
            sf = pd.read_csv(self.raw_file,
                             header=None,
                             names=[
                                 'order', 'tweetid', '+', '-', 'src_lang',
                                 'translation'
                             ])
            sf.sort_values('order', inplace=True)
            sf.to_csv('{}.csv'.format(self.filename), index=None)
        except FileNotFoundError:
            # nothing was saved, so there is nothing to rebuild
            pass
        except Exception as e:
            print(ERR_STR.format('rebuild', 'on rebuilding csv'), e)

    def play(self, n_thread=1):
        """Run the pipeline with *n_thread* translation workers.

        One additional thread runs the scoring/saving loop. Blocks until all
        workers have finished or a KeyboardInterrupt arrives; the stop event
        is always set on the way out.

        Args:
            n_thread: number of translation workers; values below 1 make the
                call a no-op.
        """
        if n_thread < 1:
            return
        with ThreadPoolExecutor(max_workers=n_thread + 1) as executor:
            try:
                executor.map(self.__translate, range(n_thread))
                print('Spawing {} workers...'.format(n_thread))
                # Wait on the event instead of spinning on worker_ct: a
                # worker may already have finished (count back at 0), which
                # would have made the old polling loop spin forever. The
                # timeout keeps the wait interruptible.
                while not self.worker_spawned.wait(timeout=.1):
                    pass  # waiting for any worker being spawned
                print('Aye, Sir!')
                executor.submit(self.__process)

                # as long as there are atleast a worker
                while self.worker_ct > 0:
                    # wait for any keyboard interrupt
                    time.sleep(.5)  # power napping for half second
                # either no job left or all worker has been despawned
                self.stop.set()

                if self.jobs.empty():
                    print('All done!')
                if self.worker_ct == 0:
                    print('All workers quit their job!')
            except KeyboardInterrupt:
                print('\nKeyboard interrupt')
            except Exception as e:
                print(ERR_STR.format('play', 'something went wrong'), e)
            finally:
                self.stop.set()

        print('Byee 👋')
示例#20
0
import xml.etree.ElementTree as xml
from sentistrength import PySentiStr

# Initialise SentiStrength. Both paths are relative, so they resolve against
# the current working directory.
sstrength = PySentiStr()
sstrength.setSentiStrengthPath("SentiStrength.jar")
sstrength.setSentiStrengthLanguageFolderPath("SentiStrength_Data")


# Given a list with the answers, return a list with the sentiment values
# produced by SentiStrength.
def analise_sentistr(respostas):
    """Score *respostas* with the module-level wrapper's default scoring mode."""
    sentimentos = sstrength.getSentiment(respostas)
    return sentimentos
import pandas as pd
from sentistrength import PySentiStr
# Demo script: score the same two sentences with each scoring mode of the
# SentiStrength wrapper and print the results.
senti = PySentiStr()
# NOTE(review): the trailing notes ask for absolute paths, yet both values
# below are relative and therefore depend on the current working directory.
senti.setSentiStrengthPath(
    'SentiStrengthCom.jar'
)  # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath(
    'SentStrength_Data_Sept2011'
)  # Note: Provide absolute path instead of relative path

str_arr = ['What a lovely day', 'What a bad day']

# Default scoring mode, followed by the explicit 'scale' mode.
result = senti.getSentiment(str_arr)
print(result)
result = senti.getSentiment(str_arr, score='scale')
print(result)

# OR, if you want dual scoring (a score each for positive rating and negative rating)
result = senti.getSentiment(str_arr, score='dual')
print(result)

# OR, if you want binary scoring (1 for positive sentence, -1 for negative sentence)
result = senti.getSentiment(str_arr, score='binary')
print(result)

# OR, if you want trinary scoring (a score each for positive rating, negative rating and neutral rating)
result = senti.getSentiment(str_arr, score='trinary')
print(result)
示例#22
0
def commitBatchAnalysis(
    idx: int, senti: PySentiStr, commits: List[git.Commit], config: Configuration
):
    """Compute author, timezone and sentiment metrics for one batch of commits.

    The commits are sorted in place (newest committed date first); commits
    older than ``config.startDate`` (``YYYY-MM-DD``, interpreted as UTC) are
    ignored. Results are appended to CSV files below ``config.metricsPath``
    and ``config.resultsPath`` and passed on to ``outputStatistics``.

    Args:
        idx: batch index, used in the output file names.
        senti: SentiStrength wrapper used to score the commit messages.
        commits: commits of the batch; the list is re-ordered in place.
        config: run configuration; ``startDate``, ``metricsPath`` and
            ``resultsPath`` are read.

    Returns:
        tuple: ``(authorInfoDict, daysActive)`` - the per-author statistics
        keyed by author id, and the number of days between the oldest and
        the newest analysed commit (0 when no commit was analysed).
    """

    authorInfoDict = {}
    timezoneInfoDict = {}
    experienceDays = 150

    # traverse all commits
    print("Analyzing commits")
    startDate = None
    if config.startDate is not None:
        startDate = datetime.strptime(config.startDate, "%Y-%m-%d")
        startDate = startDate.replace(tzinfo=pytz.UTC)
    # sort commits
    commits.sort(key=lambda o: o.committed_datetime, reverse=True)

    commitMessages = []
    commit: Commit
    lastDate = None
    firstDate = None
    realCommitCount = 0
    for commit in Bar("Processing").iter(commits):
        if startDate is not None and startDate > commit.committed_datetime:
            continue
        # commits arrive newest first: the first one seen is the last date,
        # the most recently seen one is the (so far) first date
        if lastDate is None:
            lastDate = commit.committed_date
        firstDate = commit.committed_date
        realCommitCount = realCommitCount + 1
        # extract info
        author = authorIdExtractor(commit.author)
        timezone = commit.author_tz_offset
        time = commit.authored_datetime

        # get timezone
        timezoneInfo = timezoneInfoDict.setdefault(
            timezone, dict(commitCount=0, authors=set())
        )

        # save info
        timezoneInfo["authors"].add(author)

        if commit.message and commit.message.strip():
            commitMessages.append(commit.message)

        # increase commit count
        timezoneInfo["commitCount"] += 1

        # get author
        authorInfo = authorInfoDict.setdefault(
            author,
            dict(
                commitCount=0,
                sponsoredCommitCount=0,
                earliestCommitDate=time,
                latestCommitDate=time,
                sponsored=False,
                activeDays=0,
                experienced=False,
            ),
        )

        # increase commit count
        authorInfo["commitCount"] += 1

        # validate earliest and latest commit
        # The loop is ordered by *committed* date while `time` is the
        # *authored* date, so neither bound can be assumed from the iteration
        # order alone (rebased or cherry-picked commits break it).
        if time < authorInfo["earliestCommitDate"]:
            authorInfo["earliestCommitDate"] = time
        if time > authorInfo["latestCommitDate"]:
            authorInfo["latestCommitDate"] = time

        # check if commit was between 9 and 5
        if timezone != 0 and 9 <= time.hour <= 17:
            authorInfo["sponsoredCommitCount"] += 1

    print("Analyzing commit message sentiment")
    sentimentScores = []
    commitMessageSentimentsPositive = []
    commitMessageSentimentsNegative = []

    if commitMessages:
        sentimentScores = senti.getSentiment(commitMessages)
        commitMessageSentimentsPositive = [
            value for value in sentimentScores if value >= 1
        ]
        commitMessageSentimentsNegative = [
            value for value in sentimentScores if value <= -1
        ]

    print("Analyzing authors")
    sponsoredAuthorCount = 0
    for author in authorInfoDict.values():

        # check if sponsored
        commitCount = int(author["commitCount"])
        sponsoredCommitCount = int(author["sponsoredCommitCount"])
        diff = sponsoredCommitCount / commitCount
        if diff >= 0.95:
            author["sponsored"] = True
            sponsoredAuthorCount += 1

        # calculate active days
        earliestDate = author["earliestCommitDate"]
        latestDate = author["latestCommitDate"]
        activeDays = (latestDate - earliestDate).days + 1
        author["activeDays"] = activeDays

        # check if experienced
        if activeDays >= experienceDays:
            author["experienced"] = True

    # calculate percentage sponsored authors
    # (guarded: no author at all means no commit passed the date filter)
    authorCount = len(authorInfoDict)
    percentageSponsoredAuthors = (
        sponsoredAuthorCount / authorCount if authorCount else 0.0
    )

    # calculate active project days
    firstCommitDate = None
    lastCommitDate = None
    if firstDate is not None:
        firstCommitDate = datetime.fromtimestamp(firstDate)
    if lastDate is not None:
        lastCommitDate = datetime.fromtimestamp(lastDate)
    daysActive = 0
    if lastCommitDate is not None:
        daysActive = (lastCommitDate - firstCommitDate).days

    # formatting None with a date format spec raises TypeError, so an empty
    # batch is reported with empty date cells instead
    firstCommitDateText = (
        "{:%Y-%m-%d}".format(firstCommitDate) if firstCommitDate is not None else ""
    )
    lastCommitDateText = (
        "{:%Y-%m-%d}".format(lastCommitDate) if lastCommitDate is not None else ""
    )

    print("Outputting CSVs")

    # NOTE(review): every file is opened in append mode and gets its header
    # row again on each call - confirm that repeated runs with the same idx
    # are not expected.

    # output author days on project
    with open(
        os.path.join(config.metricsPath, f"authorDaysOnProject_{idx}.csv"),
        "a",
        newline="",
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "# of Days"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["activeDays"]])

    # output commits per author
    with open(
        os.path.join(config.metricsPath, f"commitsPerAuthor_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Author", "Commit Count"])
        for login, author in authorInfoDict.items():
            w.writerow([login, author["commitCount"]])

    # output timezones
    with open(
        os.path.join(config.metricsPath, f"timezones_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["Timezone Offset", "Author Count", "Commit Count"])
        for key, timezone in timezoneInfoDict.items():
            w.writerow([key, len(timezone["authors"]), timezone["commitCount"]])

    # output results
    with open(
        os.path.join(config.resultsPath, f"results_{idx}.csv"), "a", newline=""
    ) as f:
        w = csv.writer(f, delimiter=",")
        w.writerow(["CommitCount", realCommitCount])
        w.writerow(["DaysActive", daysActive])
        w.writerow(["FirstCommitDate", firstCommitDateText])
        w.writerow(["LastCommitDate", lastCommitDateText])
        w.writerow(["AuthorCount", authorCount])
        w.writerow(["SponsoredAuthorCount", sponsoredAuthorCount])
        w.writerow(["PercentageSponsoredAuthors", percentageSponsoredAuthors])
        w.writerow(["TimezoneCount", len(timezoneInfoDict)])

    outputStatistics(
        idx,
        [author["activeDays"] for author in authorInfoDict.values()],
        "AuthorActiveDays",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [author["commitCount"] for author in authorInfoDict.values()],
        "AuthorCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [len(timezone["authors"]) for timezone in timezoneInfoDict.values()],
        "TimezoneAuthorCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        [timezone["commitCount"] for timezone in timezoneInfoDict.values()],
        "TimezoneCommitCount",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        sentimentScores,
        "CommitMessageSentiment",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsPositive,
        "CommitMessageSentimentsPositive",
        config.resultsPath,
    )

    outputStatistics(
        idx,
        commitMessageSentimentsNegative,
        "CommitMessageSentimentsNegative",
        config.resultsPath,
    )

    return authorInfoDict, daysActive
示例#23
0
import math


# Show the current working directory, then switch to the thesis folder.
print(os.getcwd())
os.chdir("C:/Users/marcs/OneDrive/Bureaublad/Master/Thesis")


# Script state initialised before the collection loop below.
df = pd.DataFrame()
k=0
print("Start part 1:")
# Date bounds handed to the tweet query as since/until.
until = datetime.datetime(2019,1,1) 
since =  datetime.datetime(2018,12,31)
init_start = datetime.datetime.now()

# Sentiment tools: AFINN (with emoticon support) and SentiStrength.
afinn = Afinn(emoticons=True)
senti = PySentiStr()
senti.setSentiStrengthPath('C:/Users/marcs/OneDrive/Bureaublad/Master/Thesis/SentiStrength.jar') # Note: Provide absolute path instead of relative path
senti.setSentiStrengthLanguageFolderPath('C:/Users/marcs/OneDrive/Bureaublad/Master/Thesis/SentiStrength_Data/') # Note: Provide absolute path instead of relative path

for j in list(range(100000)):
    start = datetime.datetime.now()
    res = None
    while res is None:
        try:
            tweetCriteria = got.manager.TweetCriteria().setQuerySearch('$HAS')\
                                                   .setSince(since.strftime('%Y-%m-%d'))\
                                                   .setUntil(until.strftime('%Y-%m-%d'))\
                                                   .setMaxTweets(10000)\
                                                   .setEmoji("unicode")\
                                                   .setLang("en")
            tweet = got.manager.TweetManager.getTweets(tweetCriteria)
示例#24
0
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(result)
    result = [i for i in tokens if not i in stop_words]

    # stemming
    #     stemmer= PorterStemmer()
    #     newResult = []
    #     for word in result:
    #         newResult.append(stemmer.stem(word))
    #     print(newResult)
    return result


# SentiStrength wrapper. The Windows paths are written as raw strings (or
# with doubled backslashes where the string ends in a backslash, which a raw
# string cannot express) so that no invalid escape sequence such as "\P" or
# "\s" is left in the source; the runtime values are unchanged.
senti = PySentiStr()
senti.setSentiStrengthPath(
    r'C:\ProgramData\Anaconda3\Lib\site-packages\sentistrength\SentiStrength.jar'
)
senti.setSentiStrengthLanguageFolderPath(
    'C:\\ProgramData\\Anaconda3\\Lib\\site-packages\\sentistrength\\')

# Input data: the comments to analyse and the manually tagged comments.
data = pd.read_csv(r"D:\senior\sentiment\Moodle_comments2.csv")
tagcomment = pd.read_csv(r"D:\senior\sentiment\data\tags.csv",
                         encoding='iso-8859-1')
tagcommentId = tagcomment['commentid']

# Accumulators for the results.
commendId = []
cleanComment = []
sentiment = []
# tagger = []
# tagger = []
示例#25
0
    if score > 0: return 'positive'
    elif score < 0: return 'negative'
    else: return 'neutral'


# Module-level AFINN scorer used by afinn_polarity below.
afinn = Afinn()


def afinn_polarity(text):
    """Label *text* by the sign of its AFINN score.

    Returns 'positive' for a score above zero, 'negative' for a score below
    zero and 'neutral' otherwise.
    """
    afinn_score = afinn.score(text)
    if afinn_score > 0:
        return 'positive'
    if afinn_score < 0:
        return 'negative'
    return 'neutral'


# Module-level SentiStrength wrapper used by sentistrength_polarity below;
# the jar and data locations come from variables defined elsewhere.
senti = PySentiStr()
senti.setSentiStrengthPath(senti_strength_jar_filepath)
senti.setSentiStrengthLanguageFolderPath(senti_strength_data_dirname)


def sentistrength_polarity(text):
    """Label *text* by the sign of its SentiStrength score (default scoring mode).

    Returns 'positive' for a score above zero, 'negative' for a score below
    zero and 'neutral' otherwise.
    """
    scores = senti.getSentiment([text])
    first_score = scores[0]
    if first_score > 0:
        return 'positive'
    if first_score < 0:
        return 'negative'
    return 'neutral'


# MPQA data loaded once at import time from the CSV at mpqa_filepath.
mpqa_df = pd.read_csv(mpqa_filepath)


def mpqa_polarity(text):
示例#26
0
#tqdm loading
from datetime import datetime
from pair_score import calculatePairScore
import os
import sys

NUMTHREAD = 20
# Walk up from the working directory until the folder that contains
# filepathhelper.py is found, then make that folder importable.
curdir = os.getcwd()
while 'filepathhelper.py' not in os.listdir(curdir):
    _parent = os.path.dirname(curdir)
    if _parent == curdir:
        # Reached the filesystem root: dirname() no longer changes the path,
        # so without this check the loop would never terminate.
        raise FileNotFoundError(
            'filepathhelper.py not found in {} or any parent directory'.format(
                os.getcwd()))
    curdir = _parent
sys.path.append(curdir)
import filepathhelper
from tqdm import tqdm
import multiprocessing as mp

# Module-level SentiStrength wrapper. The commented-out lines are the
# equivalent paths of a Windows/Anaconda installation; the active ones point
# at a Linux site-packages folder.
senti = PySentiStr()
#    senti.setSentiStrengthPath('C:\ProgramData\Anaconda3\Lib\site-packages\sentistrength\SentiStrength.jar')
#    senti.setSentiStrengthLanguageFolderPath('C:\ProgramData\Anaconda3\Lib\site-packages\sentistrength\\')
senti.setSentiStrengthPath(
    '/home/waraleetan/ming/lib/python2.7/site-packages/sentistrength/SentiStrength.jar'
)
senti.setSentiStrengthLanguageFolderPath(
    '/home/waraleetan/ming/lib/python2.7/site-packages/sentistrength/')


def cleanData(text):
    #remove [~]
    result = re.sub("\\[~.*?\\]", "", text)

    #remove{code}
    result = re.sub(r'^{code(.+){code}', ' ', result)