# NOTE: these imports are an assumption -- `fuzz` is presumed to come from the
# fuzzywuzzy package and the tokenizer/frequency utilities from nltk.
# `pull_text`, OFFICIAL_AWARDS_1 / OFFICIAL_AWARDS_2, handle_high_score, and
# make_string_combinations are assumed to be defined elsewhere in this project.
from fuzzywuzzy import fuzz
from nltk import FreqDist, word_tokenize

import pull_text


def tweetData(year):
    # Return the raw tweet objects for the given ceremony year.
    tweets = pull_text.Data(year).tweets
    return tweets
def get_nominees(year):
    key_words = [
        'nominated', 'nominate', 'nominee', 'nominates', 'nominating',
        'nominations', 'nomination', 'nom', 'noms', 'nommed'
    ]
    tweets = pull_text.Data(year).tweets
    # The 2013 and 2015 ceremonies use a different official award list.
    if year == 2013 or year == 2015:
        awards = OFFICIAL_AWARDS_1.copy()
    else:
        awards = OFFICIAL_AWARDS_2.copy()
    # key_words.extend(awards)

    # Collect, per award, the tweets that mention a nomination keyword and
    # fuzzy-match that award's name.
    award_candidate_sents = dict()
    for tweet in tweets:
        try:
            cont_nom = False
            for word in key_words:
                # If the tweet doesn't contain one of the key_words,
                # skip to the next tweet.
                if fuzz.partial_token_sort_ratio(word, tweet.text) > 90:
                    cont_nom = True
                    break
            if not cont_nom:
                continue
        except Exception:
            continue

        # Find the award name(s) that best match this tweet.
        high_match_idx = -1
        high_match_score = -1
        equal_scores = []
        for idx, award in enumerate(awards):
            score = fuzz.token_set_ratio(tweet.text_unchanged, award)
            if score > high_match_score:
                high_match_idx = idx
                high_match_score = score
                equal_scores = []
            elif score == high_match_score:
                # Remember the previous best award before moving the pointer,
                # so every award tied at the top score gets credited.
                equal_scores.append(awards[high_match_idx])
                high_match_idx = idx
                high_match_score = score
        if high_match_score > 50:
            if len(equal_scores) > 0:
                for candidate in equal_scores:
                    award_candidate_sents = handle_high_score(
                        award_candidate_sents, candidate,
                        tweet.text_unchanged, high_match_score)
            award_candidate_sents = handle_high_score(
                award_candidate_sents, awards[high_match_idx],
                tweet.text_unchanged, high_match_score)
    # print(award_candidate_sents)

    # Turn each award's candidate tweets into candidate nominee strings.
    final_nominees = dict()
    awards_no_candidates = dict()
    for award in awards:
        bucket = []
        award_tokens = word_tokenize(award)
        # Person awards (actor/actress/director) get person-style candidates.
        if 'actor' in award_tokens or 'actress' in award_tokens or 'director' in award_tokens:
            person = True
        else:
            person = False
        if award in award_candidate_sents:
            all_candidates = award_candidate_sents[award]
        else:
            # No tweets matched this award: record the most similar award
            # name so its candidates can be reused in the final pass.
            similar_award_candidates = awards.copy()
            similar_award_candidates.remove(award)
            most_similar = ''
            high_score = 0
            for similar in similar_award_candidates:
                score = fuzz.ratio(award, similar)
                if score > high_score:
                    high_score = score
                    most_similar = similar
            awards_no_candidates[award] = most_similar
            continue
        for candidate in all_candidates:
            tokenized = word_tokenize(candidate)
            # Locate the first nomination keyword in the tweet, then build
            # candidate strings from the tokens on either side of it.
            nom_idx = -1
            for word in key_words:
                for idx, token in enumerate(tokenized):
                    if word == token:
                        nom_idx = idx
                        break
                if nom_idx != -1:
                    break
            if nom_idx == 0 or nom_idx == -1:
                bucket.extend(
                    make_string_combinations(tokenized, 'right', person))
            elif nom_idx == len(tokenized) - 1:
                bucket.extend(
                    make_string_combinations(tokenized, 'left', person))
            else:
                bucket.extend(
                    make_string_combinations(tokenized[0:nom_idx], 'left',
                                             person))
                bucket.extend(
                    make_string_combinations(tokenized[nom_idx + 1:], 'right',
                                             person))
        final_nominees[award] = bucket

    # Reduce each award's bucket to its most frequent candidate strings.
    complete_noms = dict()
    for award in awards:
        if award in awards_no_candidates:
            use_award = awards_no_candidates[award]
            # The fallback award may itself lack candidates, so default to
            # an empty bucket instead of raising a KeyError.
            bucket = final_nominees.get(use_award, [])
            dist = FreqDist(bucket)
            # Borrow the 7th-10th most common candidates of the most similar
            # award, so the fallback doesn't just duplicate its top picks.
            for noms in dist.most_common(10)[6:]:
                if award not in complete_noms:
                    complete_noms[award] = [noms[0]]
                elif noms[0] not in complete_noms[award]:
                    complete_noms[award].append(noms[0])
        else:
            bucket = final_nominees[award]
            dist = FreqDist(bucket)
            nom_list = []
            for noms in dist.most_common(5):
                nom_list.append(noms[0])
            complete_noms[award] = nom_list
    # print(complete_noms)
    return complete_noms
def get_tweets(year):
    # Return just the cleaned text of each tweet for the given year.
    tweets = pull_text.Data(year).tweets
    tweettext = []
    for tweet in tweets:
        tweettext.append(tweet.text)
    return tweettext
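
# A minimal usage sketch, assuming pull_text has tweet data loaded for the
# requested year. The year value and the print format are illustrative only,
# not part of the original module.
if __name__ == '__main__':
    nominees = get_nominees(2013)
    for award, noms in nominees.items():
        print(award, '->', noms)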