def award_names(year):
    """Guess award names from tweets that announce a winner.

    Every tweet yielded by ``tweets_i_care_about(year)`` is reduced to ASCII,
    stripped of ``#`` characters and cut into fragments at any character that
    is not whitespace, a word character or a hyphen.  Fragments containing
    "wins best" or "won best" (in any letter case) contribute one candidate:
    the text from the last "best " onwards, truncated at the first transition
    word (" at", " for", " and", ...).

    NOTE(review): ``award_names`` is defined a second time further down in
    this file; that later definition replaces this one at import time.

    Args:
        year: Passed through unchanged to ``tweets_i_care_about``.

    Returns:
        A list of lower-case candidate names that were seen more than
        ``max(AWARDS_THRESH, len(candidates) // 400)`` times and are not in
        the hard-coded list of known false positives.
    """
    best_re = re.compile(r'best ', re.IGNORECASE)
    trans_words = re.compile(r'\s(at|for|dressed|award|and|i\s|golden|http)',
                             re.IGNORECASE)
    # Compiled once here instead of once per tweet.
    fragment_splitter = re.compile(r'[^\s\w-]')

    awards = []
    for tweet in tweets_i_care_about(year):
        tweet_text = tweet['text'].encode('ascii', 'ignore')
        for fragment in fragment_splitter.split(tweet_text.replace('#', '')):
            # Lower-case BEFORE testing for the trigger phrase: the previous
            # code tested the original-case text, so "wins Best Picture" was
            # silently ignored.
            fragment = fragment.lower()
            if 'wins best' not in fragment and 'won best' not in fragment:
                continue
            award_name = 'best ' + best_re.split(fragment)[-1].rstrip()
            awards.append(
                trans_words.split(award_name)[0].encode('ascii', 'ignore'))

    # Floor division keeps the integer threshold the Python 2 ``/`` produced.
    thresh = max(AWARDS_THRESH, len(awards) // 400)

    # Frequent extractions that are not real award names.
    fake_awards = frozenset([
        'best', 'best dressed', 'best speech', 'best act', 'best actor',
        'best actress',
    ])
    # ret = delete_duplicate_names(ret)
    return [
        name for name, count in collections.Counter(awards).items()
        if count > thresh and name not in fake_awards
    ]
def award_names(year):
    """Extract likely award names from "wins best ..." / "won best ..." tweets.

    For each tweet from ``tweets_i_care_about(year)`` the text is reduced to
    ASCII, ``#`` characters are removed, and the text is split at every
    character that is not whitespace, a word character or a hyphen.  Each
    resulting piece that contains "wins best" or "won best" (compared
    case-insensitively) yields one candidate name: "best " plus whatever
    follows the last "best ", cut off at the first transition word.

    NOTE(review): ``award_names`` is also defined earlier in this file; this
    later definition is the one in effect after import.

    Args:
        year: Forwarded as-is to ``tweets_i_care_about``.

    Returns:
        List of lower-case candidate names whose count exceeds
        ``max(AWARDS_THRESH, len(candidates) // 400)``, excluding a fixed set
        of known false positives.
    """
    best_re = re.compile(r'best ', re.IGNORECASE)
    trans_words = re.compile(r'\s(at|for|dressed|award|and|i\s|golden|http)', re.IGNORECASE)
    # Hoisted out of the tweet loop; the pattern never changes.
    piece_re = re.compile(r'[^\s\w-]')

    awards = []
    for tweet in tweets_i_care_about(year):
        tweet_text = tweet['text'].encode('ascii', 'ignore')
        # Normalise case first so that "Wins Best ..." is recognised too; the
        # original test ran on the un-lowered text and missed those tweets.
        pieces = [piece.lower() for piece in piece_re.split(tweet_text.replace('#', ''))]
        for award_piece in pieces:
            if 'wins best' in award_piece or 'won best' in award_piece:
                award_name = 'best ' + best_re.split(award_piece)[-1].rstrip()
                awards.append(trans_words.split(award_name)[0].encode('ascii', 'ignore'))

    # ``//`` preserves the integer result of the Python 2 ``/`` on two ints.
    thresh = max(AWARDS_THRESH, len(awards) // 400)

    # Common extractions that are too generic to be award names.
    fake_awards = frozenset(['best', 'best dressed', 'best speech', 'best act', 'best actor', 'best actress'])
    award_counts = collections.Counter(awards)
    # ret = delete_duplicate_names(ret)
    return [name for name, count in award_counts.items()
            if count > thresh and name not in fake_awards]
# Example #3
# 0
def find_noms(year):
    """Collect nominee candidates for each official award from tweets.

    The tweets from ``tweets_i_care_about(year)`` are processed in order while
    a "current award" is tracked: the award whose simplified name (a value of
    ``get_simple_official_awards()``) was most recently found in a tweet's
    lower-cased, punctuation-stripped text.  Nominee lists are parsed from the
    text following "nominees:" or "nominees are", buffered, and handed to
    ``add_count_to_dict`` whenever a tweet names an award.  Tweets containing
    "http" are never parsed for nominees.

    NOTE(review): ``find_noms`` is defined a second time further down in this
    file; that later definition replaces this one at import time.

    Args:
        year: Passed through unchanged to ``tweets_i_care_about``.

    Returns:
        The per-award nominee structure after ``trim_nom_dict``.  It is also
        printed to stdout, one award per line followed by its nominees.
    """
    simple_official_awards = get_simple_official_awards()

    # Patterns are loop-invariant, so compile them once up front.
    presenter_regex = re.compile(
        r'(@\w*|[A-Z]\w*\s[A-Z]\w*)\s(present|introduce)s?')
    noms_regex = re.compile(r'(nominees:\s)|(nominees are\s)', re.IGNORECASE)
    hashtag_regex = re.compile(r'#\w*')
    # Non-capturing group: with a capturing group ``split`` also returns the
    # separators themselves, which were then counted as nominees.
    separator_regex = re.compile(r'(?:\s&\s|\sand\s|,\s|\n)')

    current_award = None
    unverified_noms_for_current_award = []
    verified_noms = {award: {} for award in simple_official_awards}

    for tweet in tweets_i_care_about(year):
        # Associate the tweet with a specific award.
        autoverified = False
        tweet_text = tweet['text'].encode('ascii', 'ignore')
        lc_tw = tweet_text.lower()
        puncless_tw = remove_punc(lc_tw)

        if (current_award
                and simple_official_awards[current_award] in puncless_tw):
            # The current award is mentioned again: commit what was buffered.
            verified_noms = add_count_to_dict(
                unverified_noms_for_current_award, verified_noms,
                current_award)
            unverified_noms_for_current_award = []
            autoverified = True
        else:
            for real, simple in simple_official_awards.items():
                if simple in puncless_tw:
                    # A different award is named: switch to it and discard
                    # the buffer collected for the previous one.
                    current_award = real
                    autoverified = True
                    unverified_noms_for_current_award = []
                    break

        if 'http' in lc_tw:
            continue

        # A "should have won" heuristic for finding nominees was sketched
        # here but is not implemented.

        # Nominee lists: parse whatever follows "nominees:" / "nominees are".
        tw_without_presenters = presenter_regex.sub('', tweet_text)
        if noms_regex.search(tw_without_presenters):
            nominees_text = noms_regex.split(tw_without_presenters)[-1]
            nominees_text = hashtag_regex.sub('', nominees_text)
            nominees_text = nominees_text.split('.')[0]
            # De-duplicate within the tweet and drop empty fragments, which
            # appear when the list starts or ends with a separator.
            noms = [
                nom for nom in set(separator_regex.split(nominees_text))
                if nom
            ]
            unverified_noms_for_current_award += noms

        # The tweet named an award, so its nominees count immediately.
        if autoverified:
            verified_noms = add_count_to_dict(
                unverified_noms_for_current_award, verified_noms,
                current_award)
            unverified_noms_for_current_award = []

    # Post-processing: trim, then dump the result for inspection.
    verified_noms = trim_nom_dict(verified_noms)
    for award, noms in verified_noms.items():
        print(award)
        for nom in noms:
            print('-> -> ' + nom)

    return verified_noms
def find_noms(year):
    """Gather nominee candidates per official award from the year's tweets.

    Tweets from ``tweets_i_care_about(year)`` are scanned in order.  The
    "current award" is the key of ``get_simple_official_awards()`` whose
    simplified name last appeared in a tweet (after lower-casing and
    ``remove_punc``).  Text after "nominees:" or "nominees are" is split into
    individual names, which are buffered and passed to ``add_count_to_dict``
    once a tweet mentions an award.  Tweets containing "http" are skipped
    before any nominee parsing.

    NOTE(review): ``find_noms`` is also defined earlier in this file; this
    later definition is the one in effect after import.

    Args:
        year: Forwarded as-is to ``tweets_i_care_about``.

    Returns:
        The result of ``trim_nom_dict`` applied to the accumulated per-award
        counts.  The same structure is printed to stdout before returning.
    """
    simple_official_awards = get_simple_official_awards()

    # Compile once; these used to be rebuilt for every tweet.
    presenter_regex = re.compile(r'(@\w*|[A-Z]\w*\s[A-Z]\w*)\s(present|introduce)s?')
    noms_regex = re.compile(r'(nominees:\s)|(nominees are\s)', re.IGNORECASE)
    hashtag_regex = re.compile(r'#\w*')
    # ``(?:...)`` instead of ``(...)``: a capturing group makes ``split``
    # return the delimiters (", ", " and ", ...) as if they were nominees.
    delimiter_regex = re.compile(r'(?:\s&\s|\sand\s|,\s|\n)')

    current_award = None
    pending_noms = []
    verified_noms = {award: {} for award in simple_official_awards}

    for tweet in tweets_i_care_about(year):
        # --- work out which award this tweet is about -----------------
        autoverified = False
        tweet_text = tweet['text'].encode('ascii', 'ignore')
        lc_tw = tweet_text.lower()
        puncless_tw = remove_punc(lc_tw)

        if current_award and simple_official_awards[current_award] in puncless_tw:
            # Same award mentioned again: flush the buffered nominees.
            verified_noms = add_count_to_dict(pending_noms, verified_noms, current_award)
            pending_noms = []
            autoverified = True
        else:
            for real, simple in simple_official_awards.items():
                if simple in puncless_tw:
                    # Switching awards drops whatever was still unverified.
                    current_award = real
                    autoverified = True
                    pending_noms = []
                    break

        if 'http' in lc_tw:
            continue

        # An unimplemented idea lived here: mining "should have won" tweets
        # for additional nominees.

        # --- nominee lists after "nominees:" / "nominees are" ----------
        tw_without_presenters = presenter_regex.sub('', tweet_text)
        if noms_regex.search(tw_without_presenters):
            nominees_text = noms_regex.split(tw_without_presenters)[-1]
            nominees_text = hashtag_regex.sub('', nominees_text)
            nominees_text = nominees_text.split('.')[0]
            unique_noms = set(delimiter_regex.split(nominees_text))
            # An empty fragment is never a nominee (leading/trailing
            # delimiter), so it is not counted.
            unique_noms.discard('')
            pending_noms += list(unique_noms)

        # --- commit immediately when the tweet itself named an award ---
        if autoverified:
            verified_noms = add_count_to_dict(pending_noms, verified_noms, current_award)
            pending_noms = []

    # --- post-processing ------------------------------------------------
    verified_noms = trim_nom_dict(verified_noms)
    for award, noms in verified_noms.items():
        print(award)
        for nom in noms:
            print('-> -> ' + nom)

    return verified_noms
def find_winners(year):
    """Pick the most frequently tweeted winner for each official award.

    Tweets from ``tweets_i_care_about(year)`` are scanned in order while a
    "current award" is tracked: the key of ``get_simple_official_awards()``
    for which the simplified name, or one of the variants produced by
    ``award_name_gen``, last appeared in a tweet's lower-cased,
    punctuation-stripped text.  From tweets containing "wins", "won" or
    "goes to" (and no "http") capitalised one- or two-word names are
    extracted, passed through ``filter_false_positive`` and buffered; the
    buffer is handed to ``add_count_to_dict`` whenever a tweet names an award.

    Args:
        year: Passed through unchanged to ``tweets_i_care_about``.

    Returns:
        A dict mapping every official award to the non-empty name with the
        highest count, or ``''`` when no name was counted for that award.
    """
    simple_official_awards = get_simple_official_awards()

    # All patterns are loop-invariant; compile them once.
    presenter_regex = re.compile(
        r'(@\w*|[A-Z]\w*\s[A-Z]\w*)\s(present|introduce)')
    hashtag_regex = re.compile(r'#\w*')
    wins_regex = re.compile(r'wins', re.IGNORECASE)
    won_regex = re.compile(r'won', re.IGNORECASE)
    goes_to_regex = re.compile(r'goes to', re.IGNORECASE)
    # Word boundaries keep names such as "Harrison Ford" or "Forest Whitaker"
    # intact; a bare 'for' split them in the middle.
    for_regex = re.compile(r'\bfor\b', re.IGNORECASE)
    proper_noun_regex = re.compile(
        r'([A-Z]{1}[a-z]{1,}(\s[A-Z]{1}[a-z]{1,})?)')

    current_award = None
    unverified_wins_current = []
    verified_wins = {}
    for award in simple_official_awards:
        verified_wins[award] = defaultdict(int)

    for tweet in tweets_i_care_about(year):
        autoverified = False
        lc_tw = tweet['text'].lower()
        puncless_tw = remove_punc(lc_tw)

        if (current_award
                and simple_official_awards[current_award] in puncless_tw):
            # The current award is mentioned again: commit the buffer.
            verified_wins = add_count_to_dict(
                unverified_wins_current, verified_wins, current_award)
            unverified_wins_current = []
            autoverified = True
        else:
            # No early exit: every matching variant of every award is
            # processed, so the last match ends up as the current award.
            for real, simple in simple_official_awards.items():
                for variant in award_name_gen(simple):
                    if variant in puncless_tw:
                        current_award = real
                        verified_wins = add_count_to_dict(
                            unverified_wins_current, verified_wins,
                            current_award)
                        autoverified = True
                        unverified_wins_current = []

        has_wins = 'wins' in lc_tw
        has_won = 'won' in lc_tw
        has_goes_to = 'goes to' in lc_tw
        # Only announcement-style tweets without links are mined for names.
        if 'http' in lc_tw or not (has_wins or has_won or has_goes_to):
            continue

        tw_without_presenters = presenter_regex.sub(' ', tweet['text'])

        # When several cues are present the priority is
        # "goes to" > "won" > "wins".
        if has_goes_to:
            # "<award> goes to <winner> for <work>": keep the middle part.
            winner_text = goes_to_regex.split(tw_without_presenters)[-1]
            winner_text = for_regex.split(winner_text)[0]
        elif has_won:
            winner_text = won_regex.split(tw_without_presenters)[0]
        else:
            winner_text = wins_regex.split(tw_without_presenters)[0]
        winner_text = winner_text.encode('ascii', 'ignore')
        winner_text = hashtag_regex.sub('', winner_text)

        # findall returns (full match, optional second word) tuples; only
        # the full match is the candidate name.
        matches = [groups[0]
                   for groups in proper_noun_regex.findall(winner_text)]
        unverified_wins_current += list(filter_false_positive(matches))

        # The tweet named an award, so its candidates count immediately.
        if autoverified:
            verified_wins = add_count_to_dict(
                unverified_wins_current, verified_wins, current_award)
            unverified_wins_current = []

    final_dict = {}
    for award, counts in verified_wins.items():
        best_name = ''
        best_count = 0
        for name, count in counts.items():
            # Strict '>' keeps the first name seen on ties.
            if name != '' and count > best_count:
                best_name = name
                best_count = count
        final_dict[award] = best_name
    return final_dict


# find_noms(2015)