def find_happiest_state(sentiment_file, tweet_file):

    sentiment_dict = build_dict(sentiment_file)

    state_sentiment = {}

    with open(tweet_file, 'r') as tweets:

        for tweet in tweets:

            tweet = json.loads(tweet)

            # Coursera grader raises KeyError, wrap in try/except block
            try:
                if tweet['place'] != None:
                    if tweet['place']['country'] == 'United States':
                        tweet_state = tweet['place']['full_name'].split()[-1]
                        if tweet_state in states.keys():
                            # print tweet_state
                            # print tweet['text']
                            current_score = score_tweet(tweet['text'], sentiment_dict)

                            # First item is the running count, second is the avg score
                            old_count_and_score = state_sentiment.get(tweet_state, [0, 0])
                            weighted_old_score = old_count_and_score[0]*old_count_and_score[1]

                            updated_count = old_count_and_score[0] + 1

                            updated_score = (weighted_old_score + current_score) / updated_count

                            state_sentiment[tweet_state] = [updated_count, updated_score]
            except KeyError as e:
                pass


        # Sort by score (second number in the list of values)
        sorted_states =  sorted(state_sentiment.iteritems(),
                                key=lambda (k,v): (v[1], k),
                                reverse=True)

        # Print out the score of the first entry.
        # Use an OrderedDict next time. ;)
        print sorted_states[1][0]
def report_UNK_avg_sentiment(sentiments_file_name, tweet_file_name):
    '''Keeps a running dictionary of the average sentiment score for any words
        not found in the AFINN sentiment word score text file.

        Calculates based on average sentiment of tweet found in.
        Then averaged with all other occurrences found in the twitter text file
        provided.

        Could return dictionary to keep running affect dictionary between
        sessions.

    Args:
        sentiments_file_name (string): Filename of a text file containing
         Space delimited AFINN dictionary of word:sentiment-value pairs

        tweet_file_name (string): Filename of a text file containing a string
         representation of a json object from the Twitter API on each line.

    Returns:
        (None) prints output to stdout
    '''
    AFINN_dict = build_dict(sentiments_file_name)

    # Build a new dictionary of words not in AFINN
    # Update as we go along to update sentiment as we see more tweets
    # Useful if we want to save the dict in the end for later use
    # Keep separate from AFINN since those words are already scaled
    # This will have to be in the format:
    #  key: word
    #  value: [times seen, score]
    # This is to keep a running average of the word sentiment as we see more
    new_word_AFINN = {}

    with open(tweet_file_name, 'r') as tweets:
        for tweet in tweets:
            # Extract the text of tweet from json object/string
            tweet = extract_txt_from_json_string(tweet)

            tweet = strip_punct(tweet)

            # print 'SANITY CHECK'
            # print 'Original Tweet: {0}'.format(tweet.encode('utf-8'))
            tweet_words = [word.lower() for word in tweet.split()]
            # print 'Words list: {0}'.format(tweet_words)

            total_tweet_sentiment = score_tweet(tweet, AFINN_dict)

            for word in tweet_words:

                try:
                    # If word is in AFINN, print word:value
                    assert(AFINN_dict[word])
                    print word, AFINN_dict[word]
                except KeyError as e:
                    # Try to get current tuple value of word not in AFINN, or 0
                    #  if not found
                    new_value_list = new_word_AFINN.get(word, [0, 0])

                    # update times seen
                    new_value_list[0] += 1

                    # Set as average sentiment of words in tweet for unseen words
                    word_sent = total_tweet_sentiment / float(len(tweet_words))


                    # update running average value
                    new_score = (new_value_list[1] + word_sent) / new_value_list[0]
                    new_value_list[1] = new_score

                    new_word_AFINN[word] = new_value_list

                    print word, new_score
# --- Exemplo n.º 3 (stray example-page separator from scraping; commented
# out so the file parses) ---
def report_UNK_avg_sentiment(sentiments_file_name, tweet_file_name):
    '''Keeps a running dictionary of the average sentiment score for any words
        not found in the AFINN sentiment word score text file.

        Calculates based on average sentiment of tweet found in.
        Then averaged with all other occurrences found in the twitter text file
        provided.

        Could return dictionary to keep running affect dictionary between
        sessions.

    Args:
        sentiments_file_name (string): Filename of a text file containing
         Space delimited AFINN dictionary of word:sentiment-value pairs

        tweet_file_name (string): Filename of a text file containing a string
         representation of a json object from the Twitter API on each line.

    Returns:
        (None) prints output to stdout
    '''
    AFINN_dict = build_dict(sentiments_file_name)

    # Build a new dictionary of words not in AFINN
    # Update as we go along to update sentiment as we see more tweets
    # Useful if we want to save the dict in the end for later use
    # Keep separate from AFINN since those words are already scaled
    # This will have to be in the format:
    #  key: word
    #  value: [times seen, score]
    # This is to keep a running average of the word sentiment as we see more
    new_word_AFINN = {}

    with open(tweet_file_name, 'r') as tweets:
        for tweet in tweets:
            # Extract the text of tweet from json object/string
            tweet = extract_txt_from_json_string(tweet)

            tweet = strip_punct(tweet)

            # print 'SANITY CHECK'
            # print 'Original Tweet: {0}'.format(tweet.encode('utf-8'))
            tweet_words = [word.lower() for word in tweet.split()]
            # print 'Words list: {0}'.format(tweet_words)

            total_tweet_sentiment = score_tweet(tweet, AFINN_dict)

            for word in tweet_words:

                try:
                    # If word is in AFINN, print word:value
                    assert (AFINN_dict[word])
                    print word, AFINN_dict[word]
                except KeyError as e:
                    # Try to get current tuple value of word not in AFINN, or 0
                    #  if not found
                    new_value_list = new_word_AFINN.get(word, [0, 0])

                    # update times seen
                    new_value_list[0] += 1

                    # Set as average sentiment of words in tweet for unseen words
                    word_sent = total_tweet_sentiment / float(len(tweet_words))

                    # update running average value
                    new_score = (new_value_list[1] +
                                 word_sent) / new_value_list[0]
                    new_value_list[1] = new_score

                    new_word_AFINN[word] = new_value_list

                    print word, new_score