def award_names(year):
    """Guess award names for ``year`` from tweets that announce a winner.

    Each tweet from ``tweets_i_care_about(year)`` is reduced to ASCII, has
    its ``#`` characters removed and is cut into pieces at every character
    that is not whitespace, a word character or ``-``.  A piece containing
    the literal text ``wins best`` or ``won best`` produces one candidate:
    ``'best '`` followed by whatever comes after the last ``best `` of the
    lower-cased piece, cut off at the first match of the transition-word
    pattern (``at``, ``for``, ``and``, ...).

    Args:
        year: Passed straight through to ``tweets_i_care_about``.

    Returns:
        A list of candidate names that occur more than
        ``max(AWARDS_THRESH, len(candidates) / 400)`` times and are not in
        the hard-coded set of known false positives.  The order is the
        iteration order of the underlying ``collections.Counter``.
    """
    # Compile every pattern once instead of once per tweet.
    best_re = re.compile(r'best ', re.IGNORECASE)
    trans_words = re.compile(
        r'\s(at|for|dressed|award|and|i\s|golden|http)', re.IGNORECASE)
    piece_splitter = re.compile(r'[^\s\w-]')
    fake_awards = frozenset([
        'best', 'best dressed', 'best speech', 'best act', 'best actor',
        'best actress',
    ])

    awards = []
    for tweet in tweets_i_care_about(year):
        tweet_text = tweet['text'].encode('ascii', 'ignore')
        for piece in piece_splitter.split(tweet_text.replace('#', '')):
            # NOTE(review): this test runs on the original-case piece, so
            # "Wins Best ..." is skipped -- confirm that is intended.
            if 'wins best' not in piece and 'won best' not in piece:
                continue
            award_name = 'best ' + best_re.split(piece.lower())[-1].rstrip()
            # Keep only the text in front of the first transition word.
            awards.append(
                trans_words.split(award_name)[0].encode('ascii', 'ignore'))

    thresh = max(AWARDS_THRESH, len(awards) / 400)
    return [name
            for name, count in collections.Counter(awards).items()
            if count > thresh and name not in fake_awards]
def award_names(year):
    """Guess award names for ``year`` from tweets that announce a winner.

    Each tweet from ``tweets_i_care_about(year)`` is reduced to ASCII, has
    its ``#`` characters removed and is cut into pieces at every character
    that is not whitespace, a word character or ``-``.  A piece containing
    the literal text ``wins best`` or ``won best`` produces one candidate:
    ``'best '`` followed by whatever comes after the last ``best `` of the
    lower-cased piece, cut off at the first match of the transition-word
    pattern (``at``, ``for``, ``and``, ...).

    Args:
        year: Passed straight through to ``tweets_i_care_about``.

    Returns:
        A list of candidate names that occur more than
        ``max(AWARDS_THRESH, len(candidates) / 400)`` times and are not in
        the hard-coded set of known false positives.  The order is the
        iteration order of the underlying ``collections.Counter``.
    """
    # Compile every pattern once instead of once per tweet.
    best_re = re.compile(r'best ', re.IGNORECASE)
    trans_words = re.compile(
        r'\s(at|for|dressed|award|and|i\s|golden|http)', re.IGNORECASE)
    piece_splitter = re.compile(r'[^\s\w-]')
    fake_awards = frozenset([
        'best', 'best dressed', 'best speech', 'best act', 'best actor',
        'best actress',
    ])

    awards = []
    for tweet in tweets_i_care_about(year):
        tweet_text = tweet['text'].encode('ascii', 'ignore')
        for piece in piece_splitter.split(tweet_text.replace('#', '')):
            # NOTE(review): this test runs on the original-case piece, so
            # "Wins Best ..." is skipped -- confirm that is intended.
            if 'wins best' not in piece and 'won best' not in piece:
                continue
            award_name = 'best ' + best_re.split(piece.lower())[-1].rstrip()
            # Keep only the text in front of the first transition word.
            awards.append(
                trans_words.split(award_name)[0].encode('ascii', 'ignore'))

    thresh = max(AWARDS_THRESH, len(awards) / 400)
    return [name
            for name, count in collections.Counter(awards).items()
            if count > thresh and name not in fake_awards]
def find_noms(year):
    """Collect nominee candidates per award from the tweets of ``year``.

    The tweets from ``tweets_i_care_about(year)`` are scanned in order while
    tracking a "current award":

    * If the lower-cased, punctuation-free tweet contains the simplified
      name of the current award, the pending candidates are credited to that
      award through ``add_count_to_dict``.
    * Otherwise the first award whose simplified name occurs in the tweet
      becomes the current award and the pending candidates are dropped.
    * Tweets containing ``http`` never contribute candidates.
    * Candidates are taken from the text after the last ``nominees:`` /
      ``nominees are`` (presenter phrases and hashtags removed), up to the
      first ``.``, split on ``&``, ``and``, ``,`` and newlines.
    * Candidates are credited only on a tweet that itself named an award.

    The trimmed result is also printed, one award per line followed by its
    nominees prefixed with ``-> -> ``.

    Args:
        year: Passed straight through to ``tweets_i_care_about``.

    Returns:
        The value of ``trim_nom_dict(verified_noms)``; it is iterated here as
        a mapping from award name to an iterable of nominee strings.
    """
    simple_official_awards = get_simple_official_awards()

    # Compile every pattern once instead of once per tweet.
    presenter_regex = re.compile(
        r'(@\w*|[A-Z]\w*\s[A-Z]\w*)\s(present|introduce)s?')
    noms_regex = re.compile(r'(nominees:\s)|(nominees are\s)', re.IGNORECASE)
    hashtag_regex = re.compile(r'#\w*')
    # Non-capturing group: with a capturing group ``split`` would also return
    # the separators themselves (', ', ' and ', ...), which were then counted
    # as nominee candidates.
    separator_regex = re.compile(r'(?:\s&\s|\sand\s|,\s|\n)')

    current_award = None
    unverified_noms_for_current_award = []
    verified_noms = {award: {} for award in simple_official_awards}

    for tweet in tweets_i_care_about(year):
        # Associate the tweet with a specific award.
        autoverified = False
        tweet_text = tweet['text'].encode('ascii', 'ignore')
        lc_tw = tweet_text.lower()
        puncless_tw = remove_punc(lc_tw)

        if (current_award
                and simple_official_awards[current_award] in puncless_tw):
            verified_noms = add_count_to_dict(
                unverified_noms_for_current_award, verified_noms,
                current_award)
            unverified_noms_for_current_award = []
            autoverified = True
        else:
            for real, simple in simple_official_awards.items():
                if simple in puncless_tw:
                    current_award = real
                    autoverified = True
                    unverified_noms_for_current_award = []
                    break

        if 'http' in lc_tw:
            continue

        # A disabled experiment used to sit here: mining "<name> should have
        # won" tweets for additional nominee candidates.

        # Nominee lists: names enumerated after "nominees:" / "nominees are".
        tw_without_presenters = presenter_regex.sub('', tweet_text)
        if noms_regex.search(tw_without_presenters):
            nominees_text = noms_regex.split(tw_without_presenters)[-1]
            nominees_text = hashtag_regex.sub('', nominees_text)
            nominees_text = nominees_text.split('.')[0]
            # A set so that a name repeated in one tweet counts once.
            noms = list(set(separator_regex.split(nominees_text)))
            unverified_noms_for_current_award += noms

        # Credit the pending candidates if this tweet named an award.
        if autoverified:
            verified_noms = add_count_to_dict(
                unverified_noms_for_current_award, verified_noms,
                current_award)
            unverified_noms_for_current_award = []

    # Post-processing.
    verified_noms = trim_nom_dict(verified_noms)
    for award, noms in verified_noms.items():
        print(award)
        for nom in noms:
            print('-> -> ' + nom)
    return verified_noms
def find_noms(year):
    """Collect nominee candidates per award from the tweets of ``year``.

    The tweets from ``tweets_i_care_about(year)`` are scanned in order while
    tracking a "current award":

    * If the lower-cased, punctuation-free tweet contains the simplified
      name of the current award, the pending candidates are credited to that
      award through ``add_count_to_dict``.
    * Otherwise the first award whose simplified name occurs in the tweet
      becomes the current award and the pending candidates are dropped.
    * Tweets containing ``http`` never contribute candidates.
    * Candidates are taken from the text after the last ``nominees:`` /
      ``nominees are`` (presenter phrases and hashtags removed), up to the
      first ``.``, split on ``&``, ``and``, ``,`` and newlines.
    * Candidates are credited only on a tweet that itself named an award.

    The trimmed result is also printed, one award per line followed by its
    nominees prefixed with ``-> -> ``.

    Args:
        year: Passed straight through to ``tweets_i_care_about``.

    Returns:
        The value of ``trim_nom_dict(verified_noms)``; it is iterated here as
        a mapping from award name to an iterable of nominee strings.
    """
    simple_official_awards = get_simple_official_awards()

    # Compile every pattern once instead of once per tweet.
    presenter_regex = re.compile(
        r'(@\w*|[A-Z]\w*\s[A-Z]\w*)\s(present|introduce)s?')
    noms_regex = re.compile(r'(nominees:\s)|(nominees are\s)', re.IGNORECASE)
    hashtag_regex = re.compile(r'#\w*')
    # Non-capturing group: with a capturing group ``split`` would also return
    # the separators themselves (', ', ' and ', ...), which were then counted
    # as nominee candidates.
    separator_regex = re.compile(r'(?:\s&\s|\sand\s|,\s|\n)')

    current_award = None
    unverified_noms_for_current_award = []
    verified_noms = {award: {} for award in simple_official_awards}

    for tweet in tweets_i_care_about(year):
        # Associate the tweet with a specific award.
        autoverified = False
        tweet_text = tweet['text'].encode('ascii', 'ignore')
        lc_tw = tweet_text.lower()
        puncless_tw = remove_punc(lc_tw)

        if (current_award
                and simple_official_awards[current_award] in puncless_tw):
            verified_noms = add_count_to_dict(
                unverified_noms_for_current_award, verified_noms,
                current_award)
            unverified_noms_for_current_award = []
            autoverified = True
        else:
            for real, simple in simple_official_awards.items():
                if simple in puncless_tw:
                    current_award = real
                    autoverified = True
                    unverified_noms_for_current_award = []
                    break

        if 'http' in lc_tw:
            continue

        # A disabled experiment used to sit here: mining "<name> should have
        # won" tweets for additional nominee candidates.

        # Nominee lists: names enumerated after "nominees:" / "nominees are".
        tw_without_presenters = presenter_regex.sub('', tweet_text)
        if noms_regex.search(tw_without_presenters):
            nominees_text = noms_regex.split(tw_without_presenters)[-1]
            nominees_text = hashtag_regex.sub('', nominees_text)
            nominees_text = nominees_text.split('.')[0]
            # A set so that a name repeated in one tweet counts once.
            noms = list(set(separator_regex.split(nominees_text)))
            unverified_noms_for_current_award += noms

        # Credit the pending candidates if this tweet named an award.
        if autoverified:
            verified_noms = add_count_to_dict(
                unverified_noms_for_current_award, verified_noms,
                current_award)
            unverified_noms_for_current_award = []

    # Post-processing.
    verified_noms = trim_nom_dict(verified_noms)
    for award, noms in verified_noms.items():
        print(award)
        for nom in noms:
            print('-> -> ' + nom)
    return verified_noms
def find_winners(year):
    """Pick the most frequently named winner for every official award.

    The tweets from ``tweets_i_care_about(year)`` are scanned in order while
    tracking a "current award":

    * If the lower-cased, punctuation-free tweet contains the simplified
      name of the current award, the pending winner candidates are credited
      to it through ``add_count_to_dict``.
    * Otherwise every award is tested with each variant yielded by
      ``award_name_gen``; each hit makes that award current and credits the
      pending candidates to it.  There is deliberately no ``break``: when
      several awards match, the last one stays current.
    * Only tweets without ``http`` that contain ``wins``, ``won`` or
      ``goes to`` contribute candidates.  Candidates are capitalised one- or
      two-word names found in front of ``wins``/``won`` or between
      ``goes to`` and ``for``, passed through ``filter_false_positive``.
    * Candidates are credited only on a tweet that itself named an award.

    Args:
        year: Passed straight through to ``tweets_i_care_about``.

    Returns:
        A dict mapping each award to the non-empty candidate with the highest
        positive count, or ``''`` when it has none.
    """
    simple_official_awards = get_simple_official_awards()

    # Compile every pattern once instead of once per tweet.
    presenter_regex = re.compile(
        r'(@\w*|[A-Z]\w*\s[A-Z]\w*)\s(present|introduce)')
    hashtag_regex = re.compile(r'#\w*')
    wins_regex = re.compile(r'wins', re.IGNORECASE)
    won_regex = re.compile(r'won', re.IGNORECASE)
    goes_to_regex = re.compile(r'goes to', re.IGNORECASE)
    for_regex = re.compile(r'for', re.IGNORECASE)
    proper_name_regex = re.compile(
        r'([A-Z]{1}[a-z]{1,}(\s[A-Z]{1}[a-z]{1,})?)')

    current_award = None
    unverified_wins_current = []
    verified_wins = {award: defaultdict(int)
                     for award in simple_official_awards}

    for tweet in tweets_i_care_about(year):
        autoverified = False
        lc_tw = tweet['text'].lower()
        puncless_tw = remove_punc(lc_tw)

        if (current_award
                and simple_official_awards[current_award] in puncless_tw):
            verified_wins = add_count_to_dict(
                unverified_wins_current, verified_wins, current_award)
            unverified_wins_current = []
            autoverified = True
        else:
            for real, simple in simple_official_awards.items():
                for variant in award_name_gen(simple):
                    if variant in puncless_tw:
                        current_award = real
                        verified_wins = add_count_to_dict(
                            unverified_wins_current, verified_wins,
                            current_award)
                        autoverified = True
                        unverified_wins_current = []

        has_wins = 'wins' in lc_tw
        has_won = 'won' in lc_tw
        has_goes_to = 'goes to' in lc_tw
        # Skip link tweets and tweets without any winner keyword.
        if 'http' in lc_tw or not (has_wins or has_won or has_goes_to):
            continue

        tw_without_presenters = presenter_regex.sub(' ', tweet['text'])

        # Originally three sequential ``if`` blocks, each overwriting the
        # previous result; the effective priority is goes to > won > wins.
        if has_goes_to:
            winner_text = goes_to_regex.split(tw_without_presenters)[-1]
            winner_text = for_regex.split(winner_text)[0]
        elif has_won:
            winner_text = won_regex.split(tw_without_presenters)[0]
        else:
            winner_text = wins_regex.split(tw_without_presenters)[0]
        winner_text = winner_text.encode('ascii', 'ignore')
        winner_text = hashtag_regex.sub('', winner_text)

        # ``findall`` yields one tuple per match; group 0 is the full name.
        matches = [groups[0]
                   for groups in proper_name_regex.findall(winner_text)]
        unverified_wins_current += list(filter_false_positive(matches))

        # Credit the pending candidates if this tweet named an award.
        if autoverified:
            verified_wins = add_count_to_dict(
                unverified_wins_current, verified_wins, current_award)
            unverified_wins_current = []

    final_dict = {}
    for award, counts in verified_wins.items():
        best_name = ''
        best_count = 0
        for name, count in counts.items():
            # Strict '>' keeps the first name seen on a tie.
            if name != '' and count > best_count:
                best_count = count
                best_name = name
        final_dict[award] = best_name
    return final_dict
# find_noms(2015)