Example #1
    def _analyze_sentiment(self):
        """Compute the mean sentiment of the Message's sentences."""

        # Todo: If user mentions a known restaurant by name, treat that message as a Review. 

        self._tokenize() # populate the sentences array
        sentiments_sum = 0 # sum of vaderSentiment SentimentIntensityAnalyzer "compound" scores
        analyzer = vs.SentimentIntensityAnalyzer()
        for sentence in self._sentences:
            sentence_sentiment = analyzer.polarity_scores(sentence)["compound"]
            sentiments_sum += sentence_sentiment
            # TODO: Time complexity is needlessly bad here; the VADER analyzer already made one pass.
            #   Subclass SentimentIntensityAnalyzer into a custom MessageAnalyzer and override just
            #   the one method so that it also checks for the keywords.
            # TODO: Implement binary search.
            for word in sentence.split():
                word = word.lower().strip()  # TODO: Refactor this preprocessing into a helper.
                while word and not word[-1].isalpha():  # strip trailing punctuation
                    word = word[:-1]
                # TODO: Should we search character-wise to deal with plurals, or always strip a trailing "s"?
                if word and self._bsearch_taste_keywords(word):
                    # TODO: Improve the business logic. Right now this merely treats the sentiment of the
                    #   sentence in which the word appeared as the user's sentiment toward that taste.
                    # TODO: Update the sender user object literal's tastes data. MessageModelInterface is
                    #   responsible for writing the changes to both the message data and the user data.
                    self.sender.update_tastes(taste=word, strength=sentence_sentiment)
        self._sentiment_avg = round(sentiments_sum / len(self._sentences), SENTIMENT_DECIMAL_PLACES)
        return self._sentiment_avg

    # Todo consider an ABC for the text-based objects--reviews and messages. Lot of reusable operations. 
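The keyword lookup above relies on a _bsearch_taste_keywords helper that is not shown in this example. A minimal sketch of what such a binary search over a sorted keyword list might look like; the keyword list and function below are illustrative assumptions, not code from the original project:

# Illustrative sketch only: binary search over a sorted, lowercase keyword list.
# TASTE_KEYWORDS and bsearch_taste_keywords are assumed names, not from the original file.
TASTE_KEYWORDS = ["curry", "pizza", "ramen", "sushi", "tacos"]  # must stay sorted

def bsearch_taste_keywords(word):
    lo, hi = 0, len(TASTE_KEYWORDS) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        if TASTE_KEYWORDS[mid] == word:
            return True
        if TASTE_KEYWORDS[mid] < word:
            lo = mid + 1
        else:
            hi = mid - 1
    return False

print(bsearch_taste_keywords("ramen"))  # True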
Example #2
def func():
    st = request.form["review"]
    if (st == ''):
        return render_template('index.html')
    english = spacy.load("en_core_web_sm")
    result = english(st)
    sentences = [str(s) for s in result.sents]
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    sentiment = [analyzer.polarity_scores(str(s)) for s in sentences]

    if (sentiment[0]['compound'] >= 0.05):
        sent = "Positive "
        emoji = 128512
        address = 'https://st.depositphotos.com/1016482/2236/i/950/depositphotos_22362437-stock-photo-background-with-heap-of-yellow.jpg'

    elif (sentiment[0]['compound'] <= -0.05):
        sent = "Negative "
        emoji = 128577
        address = 'https://www.ecopetit.cat/wpic/mpic/270-2706765_sad-emoji-cover-photo-for-fb.jpg'

    else:
        sent = "Neutral "
        emoji = 128528
        address = 'https://atlas-content-cdn.pixelsquid.com/stock-images/neutral-face-facial-expression-L63Mrq1-600.jpg'

    return render_template('output.html',
                           sentence=st,
                           sent=sent,
                           emoji=emoji,
                           address=address)
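The ±0.05 compound thresholds in this view follow the classification convention recommended in the VADER documentation. A minimal, framework-free sketch that exercises the same logic outside of Flask; the classify helper name is an assumption for illustration:

from vaderSentiment import vaderSentiment

def classify(compound):
    # same thresholds as the route above
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

analyzer = vaderSentiment.SentimentIntensityAnalyzer()
score = analyzer.polarity_scores("What a lovely meal!")["compound"]
print(classify(score))  # expected: Positive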
Example #3
File: analyzer.py Project: OwenGY/twexit
class VaderAnalyzer(Analyzer):

    analyzer = vader.SentimentIntensityAnalyzer()

    def __init__(self):
        pass

    def get_value(self, word):
        """
            Get value of a single word
        """
        wordlist = self.validate(word)
        if len(wordlist) > 1:
            raise ValueError("get_value takes only a single word!")
        return self.analyzer.polarity_scores(wordlist[0])['compound']

    def get_values(self, textlist):
        """
            Get values of a list of strings
              textlist (list) : list of strings, input sentences
            Returns:
              vals (list) : list of floats, sentiment for each input sentence
        """
        vals = []
        for text in textlist:
            vals.append(self.analyzer.polarity_scores(text)['compound'])
        return vals
Example #4
def vader_analyze(twitter_input):
    analyzer = vS.SentimentIntensityAnalyzer()
    pos = []
    neg = []
    neu = []
    com = []

    # VADER analysis
    for tweet in twitter_input:
        analyzed_tweet = analyzer.polarity_scores(tweet)
        pos.append(analyzed_tweet['pos'])
        neg.append(analyzed_tweet['neg'])
        neu.append(analyzed_tweet['neu'])
        com.append(analyzed_tweet['compound'])

    # Average calculation
    ave_pos = sum(pos) / float(len(pos))
    ave_neu = sum(neu) / float(len(neu))
    ave_neg = sum(neg) / float(len(neg))

    # Setting up Plot.ly graphing
    labels = ['Positive', 'Neutral', 'Negative']
    values = [ave_pos, ave_neu, ave_neg]
    trace = Pie(labels=labels, values=values)
    data = [Histogram(x=com)]

    # Creates Plot.ly graphs and stores them in tuple as strings. The graphs are saved as HTML divs
    divs = [
        plotly.offline.plot([trace], include_plotlyjs=False,
                            output_type='div'),  # Pie Chart
        plotly.offline.plot(data, include_plotlyjs=False, output_type='div')
    ]  # Histogram

    return divs
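Because both plots are generated with include_plotlyjs=False, the returned divs assume plotly.js is loaded separately on the page. A hedged usage sketch; the CDN script tag and output file name are illustrative, not part of the original project:

# Embed the returned divs in a page that loads plotly.js from the CDN itself.
pie_div, hist_div = vader_analyze(["love this", "hate that", "it was okay"])
html = ("<html><head>"
        '<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>'
        "</head><body>" + pie_div + hist_div + "</body></html>")
with open("sentiment_report.html", "w") as out:
    out.write(html)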
Example #5
    def getVaderscores(self):
        '''gets tweets from movie_tweets, calculates vader scores and stores them back in movie_tweets'''
        logging.info("----Calculating VADER Sentiment Scores----")
        tweets = self.movie_tweets['vaderTweet']
        vader = vaderSentiment.SentimentIntensityAnalyzer()

        sentScores_neg = []
        sentScores_neu = []
        sentScores_pos = []
        sentScores_cpd = []

        for tweet in tqdm(tweets):
            doc = self.nlp(tweet)
            sentScore = vader.polarity_scores(str(doc))
            sentScores_neg.append(sentScore['neg'])
            sentScores_neu.append(sentScore['neu'])
            sentScores_pos.append(sentScore['pos'])
            sentScores_cpd.append(sentScore['compound'])

        scores_df = pd.DataFrame({
            'neg': sentScores_neg,
            'neu': sentScores_neu,
            'pos': sentScores_pos,
            'cpd': sentScores_cpd
        })

        self.movie_tweets = pd.concat([self.movie_tweets, scores_df], axis=1)
        logging.info("----Scored Appended to Processed Dataframe----")
Example #6
def main(args):
    spark = sql.SparkSession.builder.appName('update-mutator').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    sentiments_struct = types.ArrayType(
        types.MapType(types.StringType(), types.FloatType(), False))

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)

    def sentiment_generator_impl(text):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiment = [va.polarity_scores(str(s)) for s in sents]
        return sentiment

    sentiment_generator = functions.udf(sentiment_generator_impl,
                                        sentiments_struct)

    def json_converter_impl(user_id, update_id, text, sentiments):
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        return json.dumps(obj)

    json_converter = functions.udf(json_converter_impl, types.StringType())

    records = (spark.readStream.format('kafka').option(
        'kafka.bootstrap.servers',
        args.brokers).option('subscribe', args.intopic).load().select(
            functions.column('value').cast(types.StringType()).alias('value')
        ).select(
            functions.from_json(
                functions.column('value'), msg_struct).alias('json')).select(
                    functions.column('json.user_id'),
                    functions.column('json.update_id'),
                    functions.column('json.text'),
                    sentiment_generator(functions.column('json.text')).alias(
                        'sentiments')).select(
                            json_converter(functions.column('user_id'),
                                           functions.column('update_id'),
                                           functions.column('text'),
                                           functions.column('sentiments')).
                            alias('value')).writeStream.format('kafka').option(
                                'kafka.bootstrap.servers',
                                args.brokers).option('topic',
                                                     args.outtopic).option(
                                                         'checkpointLocation',
                                                         '/tmp').start())

    records.awaitTermination()
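The UDF above calls a SpacyMagic helper that is not shown. A plausible sketch, assuming its job is to load the spaCy model lazily and cache it per worker process so the model is not pickled along with the broadcast variables and UDF closures:

import spacy

class SpacyMagic:
    # Illustrative sketch only; the real helper is not included in this example.
    _models = {}

    @classmethod
    def get(cls, lang):
        if lang not in cls._models:
            cls._models[lang] = spacy.load(lang)
        return cls._models[lang]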
Example #7
File: review.py Project: B-T-D/datespot
    def _analyze_sentiment(self) -> float:
        self._tokenize()
        analyzer = vs.SentimentIntensityAnalyzer()
        sentiments_sum = 0  # sum of VADER "compound" scores
        for sentence in self._sentences:
            sentiments_sum += analyzer.polarity_scores(sentence)["compound"]
        sentiments_mean = round(sentiments_sum / len(self._sentences), SENTIMENT_DECIMAL_PLACES)  # the VADER demo rounds to 4 decimal places
        self._sentiment = sentiments_mean
        return self._sentiment
Example #8
def getNewsSentiment(source):
    b = vaderSentiment.SentimentIntensityAnalyzer()
    news_list = news2.getNews(source)
    sentiment = 0
    for each in news_list:
        scores = b.polarity_scores(each)
        sentiment += (scores['pos'] - scores['neg'])

    return sentiment, news_list
Example #9
def get_vader(video_id):
    comments = get_comments(video_id)
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    vaders = []

    for comment in comments:
        vs = comment['message'], analyzer.polarity_scores(comment['message'])
        vaders.append(vs)

    return vaders
Example #10
def sentiment(data):
    sentiment = []
    analyzer = vd.SentimentIntensityAnalyzer()
    for i in data:
        result = analyzer.polarity_scores(i)["compound"]
        sentiment.append(result)
    if len(sentiment) == 0:
        return 0
    else:
        return sum(sentiment) / len(sentiment)
Example #11
def main(args):
    spark = sql.SparkSession.builder.appName('update-analyzer').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)
    vhost_bcast = args.vhost
    vport_bcast = args.vport

    def sentiment_generator_impl(text, user_id, update_id):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiments = [va.polarity_scores(str(s)) for s in sents]
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        try:
            con = httplib.HTTPConnection(host=vhost_bcast, port=vport_bcast)
            con.request('POST', '/', body=json.dumps(obj))
            con.close()
        except Exception as e:
            logging.warn('unable to POST to visualizer, error:')
            logging.warn(e.message)

    sentiment_generator = functions.udf(sentiment_generator_impl,
                                        types.NullType())

    records = (spark.readStream.format('kafka').option(
        'kafka.bootstrap.servers',
        args.brokers).option('subscribe', args.topic).load().select(
            functions.column('value').cast(
                types.StringType()).alias('value')).select(
                    functions.from_json(
                        functions.column('value'),
                        msg_struct).alias('json')).select(
                            functions.column('json.user_id'),
                            functions.column('json.update_id'),
                            functions.column('json.text'),
                            sentiment_generator(
                                functions.column('json.text'),
                                functions.column('json.user_id'),
                                functions.column('json.update_id'))).
               writeStream.format("console").start())

    records.awaitTermination()
Example #12
def sent_analyze(sentence):
    """Analyze a given sentence/block of text."""
    english = spacy.load("en_core_web_sm")  # load spacy lang
    # nlp = en_core_web_sm.load()
    result = english(sentence)
    sentences = [str(s) for s in result.sents]  # go thru sentences
    analyzer = (vaderSentiment.SentimentIntensityAnalyzer()
                )  # create analyzer using vaderSentiment
    sentiment = [analyzer.polarity_scores(str(s))
                 for s in sentences]  # calculate sentiment rating

    return sentiment
Example #13
def analyzer_cols(row):

    # INITIALIZE VADER
    analyzer = vd.SentimentIntensityAnalyzer()

    # RETRIEVE SCORES
    scores = analyzer.polarity_scores(row['QUOTE'])

    row['neg'] = scores['neg']
    row['neu'] = scores['neu']
    row['pos'] = scores['pos']
    row['compound'] = scores['compound']

    return row
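A small usage sketch, assuming row is a pandas Series taken from a DataFrame with a 'QUOTE' column; the sample data below is made up for illustration. Note that constructing the analyzer once outside the row function would avoid re-initializing it for every row:

import pandas as pd
import vaderSentiment.vaderSentiment as vd

df = pd.DataFrame({"QUOTE": ["Great service!", "Terrible wait times."]})  # hypothetical data
scored = df.apply(analyzer_cols, axis=1)  # adds neg / neu / pos / compound columns
print(scored)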
Example #14
def sentiment(data):
    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    compound = []
    positive = []
    neutral = []
    negative = []
    for t in data['tweet']:
        vs = analyzer.polarity_scores(str(t))
        compound.append(vs['compound'])
        positive.append(vs['pos'])
        neutral.append(vs['neu'])
        negative.append(vs['neg'])
    data['compound'] = compound
    data['positive'] = positive
    data['neutral'] = neutral
    data['negative'] = negative
    return data
Example #15
def is_threat(text, lat, lon):
    """
    Checks to see if sentiment is threating, related to Merck and within range of a facility 

    :param: raw text
    :param: latitude
    :param: longitude
    :return: response body if nonthreat or null if it is a threat
    """
    if not filterText(text):
        return buildResponse(body=json.dumps(
            {"message": "Not related to Merck or its interests"}),
                             status=200)

    analyzer = vaderSentiment.SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)

    neg = False

    if sentiment['neg'] > 0.5:
        neg = True
    elif sentiment['pos'] < 0.1 and sentiment['neg'] > 0.3:
        neg = True

    # if sentiment is neutral or positive we don't want to waste
    # finite db space
    if not neg:
        return buildResponse(body=json.dumps(
            {"message": "Nonnegative sentiment"}),
                             status=200)

    # check to see if the threat is in the range of a Merck Facility
    if not inRangeOfMerckFacility(lat, lon):
        return buildResponse(body=json.dumps(
            {"message": "Not in range of a facility"}),
                             status=200)

    return None
Example #16
def sentiment_return(request):
    data = json.loads(request.body)
    print(data)
    q = data["query"]
    month = "01"
    day = "01"
    year = "2020"
    count = "2"
    # function that pulls tweets
    def get_tweets():
        # twitter dev credentials here:
        consumer_key = secrets.ck1
        consumer_secret = secrets.cs1
        access_token = secrets.at1
        access_token_secret = secrets.ats1

        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth,wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

        # open/create a file to append data
        # csvFile = open('tweets.csv', 'a')
        # # use csv Writer
        # csvWriter = csv.writer(csvFile)

        # compile tweets csv to be analyzed
        tweet_array = []
        for tweet in tweepy.Cursor(api.search,q=f"{q}",count=f"{count}",
                                lang="en",
                                since=f"{year}-{month}-{day}").items():
            # print (tweet.created_at, tweet.text)
            tweet_array.append({"created": tweet.created_at, "body": tweet.text.encode('utf-8')})
            # csvWriter.writerow([tweet.created_at, tweet.text.encode('utf-8')])
        return tweet_array

        
    # call get_tweets()
    tweet_list = get_tweets()

    # sentiment analysis function 
    analyzer = vader.SentimentIntensityAnalyzer()
    english = spacy.load("en_core_web_sm")

    # define get_sentiments function to process tweets
    def get_sentiments(text_list):
        text = "\n".join([str(tweet["body"]) for tweet in text_list])
        result = english(text)
        # print(result)
        sentences = [str(sent) for sent in result.sents]
        sentiments = [analyzer.polarity_scores(str(s)) for s in sentences]
        return sentiments    

    # define analyze_tweets function
    def analyze_tweets(tweet_list):
        # open and analyze sentiment of tweets
        # data = open('tweets.csv', 'r')
        # text = data.read()
        text = tweet_list
        sentiments = get_sentiments(text)

        # open/create a file to append data
        csvFile = open('sentiment.csv', 'a')
        fieldnames = ["neg", "neu", "pos", "compound"]
        # use csv Writer
        csvWriter = csv.DictWriter(csvFile, fieldnames=fieldnames)

        # compile tweets csv to be analyzed
        csvWriter.writeheader()
        for sent in sentiments:
            # print(sent)
            csvWriter.writerow(sent)

    analyze_tweets(tweet_list)

    def find_sent_mean():
        df = pd.read_csv("sentiment.csv")
        mean = df.mean()
        mean = mean.drop(["neu", "compound"])
        neg = float(mean[0]) * 100
        pos = float(mean[1]) * 100
        neg = format(neg,'.1f')
        pos = format(pos,'.1f')
        print("\n----------- Tweet Sentiment -----------\n")
        print(f"Negative: {neg}\nPositive: {pos}")
        os.remove("sentiment.csv")
        return neg, pos
    # call find_sent_mean function
    neg, pos = find_sent_mean()
    # define generate_wordcloud function
    def generate_wordcloud(tweet_list):
        # define now for naming wordcloud.png
        now = datetime.now()
        # create wordcloud from tweet_list
        # remove stopwords & irrelevant phrases
        WordCloud(width=800, height=400, background_color="white", max_words=5000, contour_width=3, contour_color="steelblue").generate_from_text(" ".join([r for _d in tweet_list for r in _d['body'].decode('utf-8').replace('https', "").replace('photo', '').replace('RT', '').replace('co', '').split() if r not in set(nltk.corpus.stopwords.words("english"))])).to_file("static/wordcloud.png")
        
    generate_wordcloud(tweet_list)
    
    return JsonResponse({
        "neg":neg,
        "pos":pos,
        "q":q,
    })
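For reference, the negative/positive percentages could also be computed directly from the sentiment dicts returned by get_sentiments, avoiding the temporary sentiment.csv round-trip; a small sketch under that assumption (the function name is ours, not from the view):

def mean_neg_pos(sentiments):
    # sentiments: list of VADER polarity_scores dicts, as returned by get_sentiments()
    neg = sum(s["neg"] for s in sentiments) / len(sentiments) * 100
    pos = sum(s["pos"] for s in sentiments) / len(sentiments) * 100
    return format(neg, '.1f'), format(pos, '.1f')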
Example #17
def main():

    emojiPattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F100-\U0001F1FF"
        u"\U0001F780-\U0001F999"
        u"\u2000-\u206F"
        u"\u2701-\u27BF"
        "]+",
        flags=re.UNICODE)

    fileName = "cleanedTweets.json"

    outputFileName = "finalTweets.json"

    file = open(fileName, "r")

    languages = getLanguages("Languages.txt")
    print(languages)

    #cleanTweets(fileName, outputFileName)

    translator = Translator()

    #with open("dictFile.json") as jsonData:

    #regionsToSentiments = json.load(jsonData)

    #regionsToSentiments = json.loads(file)

    regionsToSentiments = {}

    workingPath = "C:\\Users\\Taylor\\Desktop\\TEST\\Countries"

    analyzer = vaderSentiment.SentimentIntensityAnalyzer()

    usa = "US_EN.json"

    #for root, directories, files in os.walk(workingPath):

    #for file in files:

    tweet = 0

    realFile = open(usa, "r")

    for line in realFile:

        #print(file)
        tweet += 1
        print(tweet)

        tweetObject = json.loads(line)
        # print(tweetObject)

        originalTweet = tweetObject["text"]
        #originalTweet = emojiPattern.sub(r'', originalTweet)
        # print(originalTweet)
        # originalTweet = ''.join(character for character in tweetObject["text"] if character not in emoji.UNICODE_EMOJI)
        '''
        if tweetObject["lang"] != "en":

            # print("Original Tweet:" + str(tweetObject["text"]))
            # print(tweetObject["lang"])

            if tweetObject["lang"] in languages:

                # Translates the tweet from its native language to English.
                translatedTweet = translator.translate(originalTweet, dest="en", src=tweetObject["lang"]).text
                # print("Translated Tweet: " + str(translatedTweet))

            else:

                transObject = translator.translate(originalTweet, dest="en")
                translatedTweet = transObject.text
                # print("Translated Tweet: " + str(translatedTweet))

        else:

            translatedTweet = tweetObject["text"]
        '''

        sentiment = analyzer.polarity_scores(originalTweet)
        coordinates = (
            tweetObject["place"]["bounding_box"]["coordinates"][0][0][0],
            tweetObject["place"]["bounding_box"]["coordinates"][0][0][1])
        # print(sentiment)
        # print(coordinates)

        if tweetObject["place"]["country_code"] not in regionsToSentiments:

            regionsToSentiments[tweetObject["place"]["country_code"]] = {}

            if tweetObject["created_at"].split(
            )[3][0:2] not in regionsToSentiments[tweetObject["place"]
                                                 ["country_code"]]:
                regionsToSentiments[tweetObject["place"]["country_code"]][
                    tweetObject["created_at"].split()[3][0:2]] = [
                        (sentiment, coordinates)
                    ]

            else:

                regionsToSentiments[tweetObject["place"]["country_code"]][
                    tweetObject["created_at"].split()[3][0:2]].append(
                        (sentiment, coordinates))

        else:

            if tweetObject["created_at"].split(
            )[3][0:2] not in regionsToSentiments[tweetObject["place"]
                                                 ["country_code"]]:
                regionsToSentiments[tweetObject["place"]["country_code"]][
                    tweetObject["created_at"].split()[3][0:2]] = [
                        (sentiment, coordinates)
                    ]

            else:

                regionsToSentiments[tweetObject["place"]["country_code"]][
                    tweetObject["created_at"].split()[3][0:2]].append(
                        (sentiment, coordinates))

        print()

    print(regionsToSentiments)

    saveDict(regionsToSentiments, "dictFile.json")
Example #18
messages.show(2)
+--------------------+------+
|                Text|Target|
+--------------------+------+
|I have bought sev...|     1|
|"Product arrived ...|     0|
+--------------------+------+
only showing top 2 rows

In [42]:
messages.count()
Out[42]:
525814
Sentiment Analysis using VaderSentiment library
In [6]:
sentiment = messages.rdd.map(lambda x: [x[1],vaderSentiment.SentimentIntensityAnalyzer().polarity_scores(x[0][0:140])])
In [7]:
sentiment.cache()
sentiment.take(2)
Out[7]:
[[u'1', {'compound': 0.7902, 'neg': 0.0, 'neu': 0.734, 'pos': 0.266}],
 [u'0', {'compound': -0.1027, 'neg': 0.104, 'neu': 0.808, 'pos': 0.088}]]
In [9]:
sentiment_DF = sentiment.map(lambda x: (x[0],x[1]['compound'],x[1]['neg'],x[1]['neu'],x[1]['pos'])).toDF()
In [ ]:
sentiment_pandas = sentiment_DF.toPandas()
In [83]:
# importing the data
sentiment_pandas = pd.read_csv('D:/Divya/Fall/6330 Harvesting Big Data/FinalProject/Reviews.csv')

#Sampling
Example #19
    def __init__(self):
        self.analyser = vaderSentiment.SentimentIntensityAnalyzer()
Example #20
    def google_businfo(self, return_hours=False):
        """
        Parse the responses from get_details() and get_distance_time() into a dictionary for each Google Maps Place ID.
        """
        #TODO: Find a simpler way to return hours of operation data.
        dict_businfo = {
            "place_id": self.place_ids,
            "phone_formatted": [],
            "phone": [],
            "site": [],
            "distance": [],
            "trip_time": [],
            "goog_sent_pos": [],
            "goog_sent_neu": [],
            "goog_sent_neg": [],
            "goog_sent_comp": [],
            "goog_rev_avgrating": []
        }

        sent_scorer = vs.SentimentIntensityAnalyzer()

        for _id in self.place_ids:
            details = self.google_busdetails(place_id=_id)
            dist_time = self.distance_time(place_id=_id)

            dict_businfo["phone_formatted"].append(
                details["result"]["formatted_phone_number"] if
                "formatted_phone_number" in details["result"].keys() else None)
            dict_businfo["phone"].append(
                details["result"]["international_phone_number"] if
                "formatted_phone_number" in details["result"].keys() else None)
            dict_businfo["phone"] = [
                re.sub(pattern=r"[^\d\+]", repl="", string=x)
                if x is not None else None for x in dict_businfo["phone"]
            ]
            dict_businfo["site"].append(
                details["result"]["website"] if "website" in
                details["result"].keys() else None)

            dict_businfo["distance"].append(dist_time[0])
            dict_businfo["trip_time"].append(dist_time[1])

            if "reviews" not in details["result"].keys():
                for k in [
                        "goog_sent_pos", "goog_sent_neu", "goog_sent_neg",
                        "goog_sent_comp", "goog_rev_avgrating"
                ]:
                    dict_businfo[k].append(None)
            else:
                reviews = details["result"]["reviews"]
                sent_scores = [
                    sent_scorer.polarity_scores(r["text"]) for r in reviews
                ]

                dict_businfo["goog_sent_pos"].append(
                    sum(s["pos"] for s in sent_scores) / len(sent_scores))
                dict_businfo["goog_sent_neu"].append(
                    sum(s["neu"] for s in sent_scores) / len(sent_scores))
                dict_businfo["goog_sent_neg"].append(
                    sum(s["neg"] for s in sent_scores) / len(sent_scores))
                dict_businfo["goog_sent_comp"].append(
                    sum(s["compound"] for s in sent_scores) / len(sent_scores))
                dict_businfo["goog_rev_avgrating"].append(
                    sum(r["rating"] for r in reviews) / len(reviews))

            if return_hours:
                sun_open, mon_open, tue_open, wed_open, thur_open, \
                fri_open, sat_open, sun_close, mon_close, tue_close, \
                wed_close, thur_close, fri_close, sat_close = [], [], [], [], [], [], [], [], [], [], [], [], [], []

                opens = [
                    sun_open, mon_open, tue_open, wed_open, thur_open,
                    fri_open, sat_open
                ]
                closes = [
                    sun_close, mon_close, tue_close, wed_close, thur_close,
                    fri_close, sat_close
                ]

                day_idx = {
                    "sun": 0,
                    "mon": 1,
                    "tue": 2,
                    "wed": 3,
                    "thur": 4,
                    "fri": 5,
                    "sat": 6
                }

                for day in day_idx.keys():
                    try:
                        opens[day_idx[day]].append(
                            details["result"]["opening_hours"]["periods"][
                                day_idx[day]]["open"]["time"])
                    except:
                        opens[day_idx[day]].append(None)
                    try:
                        closes[day_idx[day]].append(
                            details["result"]["opening_hours"]["periods"][
                                day_idx[day]]["close"]["time"])
                    except:
                        closes[day_idx[day]].append(None)

                for day in day_idx.keys():
                    dict_businfo["opens_" + day] = opens[day_idx[day]]
                    dict_businfo["closes_" + day] = closes[day_idx[day]]

        return dict_businfo
Example #21
# -*- coding: utf-8 -*-
# encoding=utf8
import sys, json, re
import numpy as np
from vaderSentiment import vaderSentiment as vs
analyzer = vs.SentimentIntensityAnalyzer()
reload(sys)
sys.setdefaultencoding('utf8')
affect_list = ["anger", "fear", "joy", "sadness"]
import os
cwd = os.getcwd()
print cwd
with open(
        os.path.join(cwd, 'dataset', 'task1', 'train',
                     'dataset_json_task_1.txt')) as data_file:
    train_data = json.load(data_file)
with open(
        os.path.join(cwd, 'dataset', 'task1', 'development',
                     'dataset_json_development.txt')) as data_file:
    development_data = json.load(data_file)
with open(
        os.path.join(cwd, 'dataset', 'task1', 'test',
                     'dataset_json_test.txt')) as data_file:
    test_data = json.load(data_file)

test_data = development_data

import sklearn.ensemble, sklearn.metrics  #, sklearn.cross_validation
from sklearn.metrics import mean_squared_error, r2_score
import scipy
import math
Example #22
    def yelp_sentiment(self, id_match_list):

        dict_sentiment = {
            "place_id": [],
            "yelp_id": [],
            "yelp_sent_pos": [],
            "yelp_sent_neu": [],
            "yelp_sent_neg": [],
            "yelp_sent_comp": [],
            "yelp_rev_avgrating": []
        }

        sent_scorer = vs.SentimentIntensityAnalyzer()

        for _ids in id_match_list:
            if "yelp_id" in _ids.keys():
                endpoint_reviews = "/".join(
                    [self.endpoint_base, _ids["yelp_id"], "reviews"])

                getrequest = requests.get(url=endpoint_reviews,
                                          headers=self.headers)

                try:
                    getresp = getrequest.json()["reviews"]

                    # Yelp review text snippets end in a trailing ellipsis, hence the [:-3] slice.
                    texts = [r["text"][:-3] for r in getresp]

                    dict_sentiment["place_id"].append(_ids["place_id"])
                    dict_sentiment["yelp_id"].append(_ids["yelp_id"])
                    dict_sentiment["yelp_sent_pos"].append(
                        sum([
                            sent_scorer.polarity_scores(t)["pos"]
                            for t in texts
                        ]) / len(texts))
                    dict_sentiment["yelp_sent_neu"].append(
                        sum([
                            sent_scorer.polarity_scores(t)["neu"]
                            for t in texts
                        ]) / len(texts))
                    dict_sentiment["yelp_sent_neg"].append(
                        sum([
                            sent_scorer.polarity_scores(t)["neg"]
                            for t in texts
                        ]) / len(texts))
                    dict_sentiment["yelp_sent_comp"].append(
                        sum([
                            sent_scorer.polarity_scores(t)["compound"]
                            for t in texts
                        ]) / len(texts))
                    dict_sentiment["yelp_rev_avgrating"].append(
                        sum([r["rating"] for r in getresp]) / len(getresp))

                except:
                    dict_sentiment["place_id"].append(_ids["place_id"])
                    dict_sentiment["yelp_id"].append(_ids["yelp_id"])
                    [
                        dict_sentiment[key].append(None)
                        for key in list(dict_sentiment.keys())[2:]
                    ]
            else:
                dict_sentiment["place_id"].append(_ids["place_id"])
                [
                    dict_sentiment[key].append(None)
                    for key in list(dict_sentiment.keys())[1:]
                ]

        return dict_sentiment
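Each review snippet above is passed to the analyzer four times, once per field; scoring each snippet once and averaging the resulting dicts gives the same numbers. A standalone sketch with stand-in review texts:

from vaderSentiment import vaderSentiment as vs

sent_scorer = vs.SentimentIntensityAnalyzer()
texts = ["Great food, slow service", "Would not go back"]  # stand-in review snippets
scores = [sent_scorer.polarity_scores(t) for t in texts]
averages = {k: sum(s[k] for s in scores) / len(scores)
            for k in ("pos", "neu", "neg", "compound")}
print(averages)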
Example #23
import nltk, glob, csv, re
from vaderSentiment import vaderSentiment

lexicons = "data/sentiment_analysis/lexicons/*.tsv"
dataset = "data/sentiment_analysis/tweets/*.tsv"

ps = nltk.stem.PorterStemmer()
stop_words = set(nltk.corpus.stopwords.words('english'))
vader_analyzer = vaderSentiment.SentimentIntensityAnalyzer()


def pretprocess(tweet):
    tweet = re.sub(r'@\w+', '', tweet)  # remove mentions
    tweet = re.sub(r'@(\s+)\w+', '', tweet)
    tweet = re.sub(r'http\S+', '', tweet)  # remove links
    tweet = re.sub(r'\w*\\.\w*', '', tweet)
    tweet = re.sub(r'/\w*', '', tweet)
    tweet = re.sub(r'([^\s\w]|_)+', '', tweet)  # keep only alphanumeric characters and spaces
    tweet = re.sub(r'\W*\b\w{18,60}\b', '', tweet)  # remove very long words
    tokenize_tweet = nltk.word_tokenize(tweet)
    tweet = [word for word in tokenize_tweet if word not in stop_words]  # remove stopwords
    tweet = [ps.stem(word) for word in tweet]  # stem
    tweet = [word for word in tweet if len(word) > 2]  # drop very short words
    return tweet


def check_sentiment_per_lexicon(words, lexicon_path):
    sent = 0
    with open(lexicon_path, 'r') as csvfile:
        rows = [row for row in csv.reader(csvfile, delimiter='\t')]
        for word in words:
Example #24
def readData(filename, rnn_filename):
    data = []
    input = []
    output = []
    ids = []
    #sentiment  = []
    with open(filename, 'r', encoding="utf-8") as csvfile:
        csvdata = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(csvdata)
        analyzer = vaderSentiment.SentimentIntensityAnalyzer()
        for line in csvdata:
            d = []
            i = []
            ids.append(line[0])
            # VADER sentiment features for each of columns 1-6
            for col in range(1, 7):
                i.append(line[col])
                sentiment = analyzer.polarity_scores(line[col])
                d.append(sentiment['neg'])
                d.append(sentiment['neu'])
                d.append(sentiment['pos'])
                #d.append(TextBlob(line[col]).sentiment)

            input.append(i)
            data.append(d)
            e1 = line[5].split(' ')
            d.append(len(e1))
            e2 = line[6].split(' ')
            d.append(len(e2))
            if (filename != "test.csv"):
                output.append(line[7])

    # add RNN features
    with open(rnn_filename, 'r', encoding="utf-8") as f:
        csvdata = csv.reader(f, delimiter=',', quotechar='"')
        count = 0
        index = 0
        for line in csvdata:
            if count % 2 == 0:
                if rnn_filename == "predtest.csv":
                    data[index].append(float(line[0]))
                else:
                    data[index].append(float(line[2]))
            else:
                if rnn_filename == "predtest.csv":
                    data[index].append(float(line[0]))
                else:
                    data[index].append(float(line[2]))
                index += 1
            count += 1

    return data, output, ids
Example #25
# Set up AWS Database for storage
HOST = "hedgedb.c288vca6ravj.us-east-2.rds.amazonaws.com"
PORT = 3306
DB_NAME = "scores_timeseries"
DB_USER = "******"
DB_PW = "bluefootedboobie123"

AWS_RDS = dataset.connect("mysql+pymysql://{}:{}@{}/{}".format\
(DB_USER, DB_PW, HOST, DB_NAME))

db = dataset.connect("sqlite:///tweetbase.db")  # connect Dataset to Tweetbase
db2 = dataset.connect("sqlite:///scorebase.db")

printer = pprint.PrettyPrinter()  # printer object

SIA = sia.SentimentIntensityAnalyzer()  # VADER sentiment analyzer object

# Twitter Keys
CONSUMER_KEY = 'zQuVUVHVWNZd7yfMNdyXx4NgJ'
CONSUMER_SECRET = 'OBMTSJfy4UHuCDSslKzZdcgcm33NChTh1m3dJLX5OhRVY5EhUc'
AXS_TOKEN_KEY = '1005588267297853441-aYFOthzthNUwgHUvMJNDCcAMn0IfsC'
AXS_TOKEN_SECRET = 'e88p7236E3nrigW1pkvmyA6hUyUWrMDQd2D7ZThbnZvoQ'

# Twython (Twitter API) object
TWY = Twython(app_key=CONSUMER_KEY, app_secret=CONSUMER_SECRET, oauth_token=AXS_TOKEN_KEY, \
oauth_token_secret=AXS_TOKEN_SECRET)

# tweepy object
auth = tweepy.OAuthHandler(consumer_key=CONSUMER_KEY,
                           consumer_secret=CONSUMER_SECRET)
auth.set_access_token(key=AXS_TOKEN_KEY, secret=AXS_TOKEN_SECRET)
Example #26
from __future__ import print_function
import json
import time

import boto3
import vaderSentiment.vaderSentiment as vader

print('Loading function')
analyzer = vader.SentimentIntensityAnalyzer()


#This method calls lambda_handler2
def call_lambda_handler2(data):
    client = boto3.client("lambda")
    response = client.invoke(
        FunctionName='lambda_handler2',
        InvocationType='RequestResponse',
        Payload=json.dumps(data),
    )
    res = response['Payload']
    return json.load(res)


# This method takes a tweet as an argument and returns its sentiment
def get_sentiment(tweet):
    scores = analyzer.polarity_scores(tweet['text'])
    compound = scores['compound']

    if compound < -0.05:
        sentiment = "Negative"
        score = scores["neg"]
Example #27
#import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mtd
#from newsapi import newsapi_client
import vaderSentiment.vaderSentiment as sia
import IEX_API_Client as IEX_Client

# set up IEX_API_Client
IEX = IEX_Client.IEX_API_Client()

# set up NewsAPI
#NEWS_API = newsapi_client.NewsApiClient(api_key='a76f5e16666f4e66aa4514ea27d425d9')

# set up sentiment analyzer VADER
SIA = sia.SentimentIntensityAnalyzer()

###-------------------- IEX Methods -------------------###


def iex_format_data(symbol, data):
    """
    INPUT decoded JSON data from IEX news archive. Output of "get_news_data".
    RETURN {symbol : [(article text, polarity score, source, datetime)]} for SINGLE stock symbol
    """
    results_dict = {}

    results_dict[symbol] = []

    for article_dict in data:
        # combine headline and summary, get polarity score
Example #28
consumer_secret = lines[1].rstrip()
#sets up tweepy api object
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth)
geoList = {
    "UK_GEO":
    "54.364115800619615,-3.7233340937396093,505km",  #lat,long and radius of usa, using googlemaps measurerer
    "USA_GEO": "54.19653024080003,-98.03399875931424,2500km",
    "AUS_GEO": "-27.606338814377246,135.2637427077579,2000km",
    "JAMAICA_GEO": "18.13788732831686,-77.24297002881919,128km",
    "NEWZEALAND_GEO": "-41.876952864666166,173.64569158277476,854km",
    "INDIA_GEO": "20.09904976266362,79.35616263260019,1300km",
    "NIGERIA_GEO": "8.574482569303768,7.542770727835754,500km"
}

analyzer = vader.SentimentIntensityAnalyzer()  #Analyzer object from vader


def starter(input, country1, country2):
    country_1_tweets = tweepy.Cursor(api.search,
                                     q=str(input),
                                     geocode=geoList[country1],
                                     lang='en',
                                     tweet_mode='extended').items(200)
    country_2_tweets = tweepy.Cursor(api.search,
                                     q=str(input),
                                     geocode=geoList[country2],
                                     lang='en',
                                     tweet_mode='extended').items(200)

    country_1_tweets = list(map(FullTextHandler, country_1_tweets))
Example #29
def readData(filename, rnn_filename):
    data = []
    input = []
    output = []
    ids = []
    #sentiment  = []
    with open(filename, 'r', encoding="utf-8") as csvfile:
        csvdata = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(csvdata)
        analyzer = vaderSentiment.SentimentIntensityAnalyzer()
        for line in csvdata:
            d = []
            i = []
            ids.append(line[0])

            # Sentiment Features
            # i.append(line[1])
            # sentiment = analyzer.polarity_scores(line[1])
            # d.append(sentiment['neg'])
            # d.append(sentiment['neu'])
            # d.append(sentiment['pos'])

            # i.append(line[2])
            # sentiment = analyzer.polarity_scores(line[2])
            # d.append(sentiment['neg'])
            # d.append(sentiment['neu'])
            # d.append(sentiment['pos'])

            # i.append(line[3])
            # sentiment = analyzer.polarity_scores(line[3])
            # d.append(sentiment['neg'])
            # d.append(sentiment['neu'])
            # d.append(sentiment['pos'])

            # i.append(line[4])
            # sentiment = analyzer.polarity_scores(line[4])
            # d.append(sentiment['neg'])
            # d.append(sentiment['neu'])
            # d.append(sentiment['pos'])

            i.append(line[5])
            sentiment = analyzer.polarity_scores(line[5])
            d.append(sentiment['neg'])
            d.append(sentiment['neu'])
            d.append(sentiment['pos'])

            i.append(line[6])
            sentiment = analyzer.polarity_scores(line[6])
            d.append(sentiment['neg'])
            d.append(sentiment['neu'])
            d.append(sentiment['pos'])

            #avg WE
            # d.append(diffAvgWE(line[1] + " " + line[2] + " " + line[3] + " " + line[4],line[5]))
            # d.append(diffAvgWE(line[1] + " " + line[2] + " " + line[3] + " " + line[4],line[6]))
            e1 = diffAvgWE(line[5])
            for e in e1:
                d.append(e)
            e2 = diffAvgWE(line[6])
            for e in e2:
                d.append(e)

            # Ngram features
            #d.append(getBigramCount(line[5],line[6]))

            #Length of endings
            e1 = line[5].split(' ')
            d.append(len(e1))
            e2 = line[6].split(' ')
            d.append(len(e2))

            #Cosine Similarity
            d.append(cosineSim(line[1] + line[2] + line[3] + line[4], line[5]))
            d.append(cosineSim(line[1] + line[2] + line[3] + line[4], line[6]))

            d.append(
                get_jaccard_sim(line[1] + line[2] + line[3] + line[4],
                                line[5]))
            d.append(
                get_jaccard_sim(line[1] + line[2] + line[3] + line[4],
                                line[6]))

            data.append(d)
            input.append(i)
            if (filename != "test.csv"):
                output.append(line[7])

    # add RNN features
    with open(rnn_filename, 'r', encoding="utf-8") as f:
        csvdata = csv.reader(f, delimiter=',', quotechar='"')
        count = 0
        index = 0
        for line in csvdata:
            if count % 2 == 0:
                if rnn_filename == "predtest.csv":
                    data[index].append(float(line[0]))
                else:
                    data[index].append(float(line[2]))
            else:
                if rnn_filename == "predtest.csv":
                    data[index].append(float(line[0]))
                else:
                    data[index].append(float(line[2]))
                index += 1
            count += 1

    print(data[10])
    return data, output, ids