def run_legislator_query(name, state=None, office=None, year=None, congress=None):
    """Fuzzy-match ``name`` against legislator terms and return scored candidates.

    The input is parsed with ``HumanName`` and standardized as
    "<cleaned last name> <un-nicknamed first name>". Candidates come from
    ``block_by_startswith`` (blocked on the first 6 characters of the last
    name) and are compared using fuzzywuzzy ratios, a Levenshtein-based
    ratio and a longest-common-match ratio. The combined score comes from
    ``compute_scores``.

    Args:
        name: Free-text name to look up. Names shorter than 4 characters
            are not searched at all.
        state, office, year, congress: Optional filters, passed through
            unchanged to ``block_by_startswith``.

    Returns:
        A list of dicts with keys ``name`` (display string), ``id`` (the
        legislator's bioguide value), ``score``, ``type`` (always ``[]``)
        and ``match`` (always ``False``), containing only candidates whose
        score exceeds 0.8, sorted by descending score. Empty when nothing
        qualifies.
    """
    starts_with_blocklength = 6
    result_array = []

    # Don't even bother if there are fewer than 4 characters.
    if len(name) < 4:
        return result_array

    name1 = HumanName(name)
    name1_standardized = simple_clean(name1.last) + " " + unnickname(name1.first)

    # We block on the first n characters of the last name. If no last name
    # could be parsed, assume the whole input is the last name (this might
    # be a bad idea).
    blocking_name = simple_clean(name1.last) or simple_clean(name)

    possible_matches = block_by_startswith(
        blocking_name, starts_with_blocklength, state, office, year, congress)

    for match in possible_matches:
        legislator = match.legislator
        text1 = name1_standardized
        text2 = (simple_clean(legislator.last_name) + " "
                 + unnickname(legislator.first_name))

        # fuzzywuzzy scores are scaled down here to the 0..1 range.
        ratio = fuzz.ratio(text1, text2) / 100.0
        partial_ratio = fuzz.partial_ratio(text1, text2) / 100.0
        # The two token-based ratios are only reported in debug output.
        token_sort_ratio = fuzz.token_sort_ratio(text1, text2) / 100.0
        token_set_ratio = fuzz.token_set_ratio(text1, text2) / 100.0

        # Mean of the two lengths. This used to be written as
        # 1/2*len(text1)+len(text2), which operator precedence turns into
        # (1/2)*len(text1) + len(text2) -- not an average on any Python.
        avg_len = (len(text1) + len(text2)) / 2.0
        # Both strings contain at least the joining space, so this is >= 1.
        min_len = min(len(text1), len(text2))

        l_ratio = 0
        try:
            l_distance = jellyfish.levenshtein_distance(text1, text2)
        except UnicodeEncodeError:
            # Best effort: leave l_ratio at 0 when the distance can't be
            # computed for this pair.
            pass
        else:
            l_ratio = 1.0 - l_distance / avg_len

        long_match = longest_match(text1, text2)
        lng_ratio = float(long_match) / min_len

        # Only compute the combined score when at least one metric looks
        # promising; everything else keeps a score of 0.
        score = 0
        if ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6 or lng_ratio > 0.6:
            score = compute_scores([ratio, partial_ratio, l_ratio, lng_ratio])

        if debug:
            print("Candidate %s vs %s score: %s" % (text1, text2, score))
            print("ratio=%s partial_ratio=%s token_sort_ratio=%s token_set_ratio=%s, l_ratio=%s lng_ratio=%s"
                  % (ratio, partial_ratio, token_sort_ratio, token_set_ratio,
                     l_ratio, lng_ratio))

        if score > 0.8:
            # Full name is missing in early years; rebuild it from its parts,
            # skipping empty ones so no stray spaces end up in the name.
            return_name = legislator.official_full
            if not return_name:
                name_parts = (legislator.first_name, legislator.middle_name,
                              legislator.last_name, legislator.suffix)
                return_name = " ".join(part for part in name_parts if part)

            name_standardized = "%s. %s (%s) (%s) %s-%s" % (
                match.term_type.title(), return_name, match.party,
                match.state, match.start.year, match.end.year)

            result_array.append({
                'name': name_standardized,
                'id': legislator.bioguide,
                'score': score,
                'type': [],
                'match': False,
            })
            if debug:
                print("Match found: %s" % name_standardized)

    if debug and not result_array:
        print("No match for %s, which was standardized to: %s"
              % (name, name1_standardized))

    return sorted(result_array, key=itemgetter('score'), reverse=True)
def match_by_name(name, state=None, office=None, cycle=None, reverse_name_order=False):
    """Fuzzy-match ``name`` against blocked candidates and return scored matches.

    The input is parsed with ``HumanName`` and standardized as
    "<cleaned last name> <un-nicknamed first name>" (or with first and last
    swapped when ``reverse_name_order`` is set). Candidates come from
    ``block_by_startswith`` and are compared using fuzzywuzzy ratios, a
    Levenshtein-based ratio and a longest-common-match ratio. The combined
    score comes from ``compute_scores``.

    This function reads ``starts_with_blocklength``, ``debug`` and ``log``
    from the enclosing module; none of them is defined here.

    Args:
        name: Free-text name to look up.
        state, office, cycle: Optional filters, passed through unchanged to
            ``block_by_startswith``.
        reverse_name_order: When true, treat the parsed first name as the
            last name (for inputs whose name order is flipped).

    Returns:
        A list of dicts with keys ``name``, ``id`` (the candidate's
        ``fec_id``), ``score``, ``type`` (always ``[]``) and ``match``,
        containing only candidates whose score exceeds 0.8, in the order
        the candidates were yielded (not sorted). ``match`` is ``True``
        only when there is exactly one result and its score exceeds 0.9.
    """
    if debug:
        print("match_by_name = state=%s office=%s cycle=%s" % (state, office, cycle))

    result_array = []
    name1 = HumanName(name)

    # Sometimes we run into a name that's flipped.
    if reverse_name_order:
        print("Running name reversal check!")
        blocking_name = simple_clean(name1.first)
        name1_standardized = simple_clean(name1.first) + " " + unnickname(name1.last)
    else:
        name1_standardized = simple_clean(name1.last) + " " + unnickname(name1.first)
        blocking_name = simple_clean(name1.last)

    # If we can't find the last name, assume the whole input is the last
    # name. This might be a bad idea.
    if not blocking_name:
        blocking_name = simple_clean(name)

    possible_matches = block_by_startswith(
        blocking_name, starts_with_blocklength, state, office, cycle)

    for match in possible_matches:
        name2_name = HumanName(match['fec_name'])
        text1 = name1_standardized
        text2 = simple_clean(name2_name.last) + " " + unnickname(name2_name.first)

        # fuzzywuzzy scores are scaled down here to the 0..1 range.
        ratio = fuzz.ratio(text1, text2) / 100.0
        partial_ratio = fuzz.partial_ratio(text1, text2) / 100.0
        # The two token-based ratios are only reported in debug output.
        token_sort_ratio = fuzz.token_sort_ratio(text1, text2) / 100.0
        token_set_ratio = fuzz.token_set_ratio(text1, text2) / 100.0

        # Mean of the two lengths. This used to be written as
        # 1/2*len(text1)+len(text2), which operator precedence turns into
        # (1/2)*len(text1) + len(text2) -- not an average on any Python.
        avg_len = (len(text1) + len(text2)) / 2.0
        # Both strings contain at least the joining space, so this is >= 1.
        min_len = min(len(text1), len(text2))

        l_ratio = 0
        try:
            l_distance = jellyfish.levenshtein_distance(text1, text2)
        except UnicodeEncodeError:
            # Best effort: leave l_ratio at 0 when the distance can't be
            # computed for this pair.
            pass
        else:
            l_ratio = 1.0 - l_distance / avg_len

        long_match = longest_match(text1, text2)
        lng_ratio = float(long_match) / min_len

        # Only compute the combined score when at least one metric looks
        # promising; everything else keeps a score of 0.
        score = 0
        if ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6 or lng_ratio > 0.6:
            score = compute_scores([ratio, partial_ratio, l_ratio, lng_ratio])

        if debug:
            log.debug(
                "|fuzzymatchresult|%s|'%s'|'%s'|score=%s|ratio=%s|partial_ratio=%s|token_sort_ratio=%s|token_set_ratio=%s| l_ratio=%s|lng_ratio=%s"
                % (match['fec_id'], match['fec_name'], name, score, ratio,
                   partial_ratio, token_sort_ratio, token_set_ratio,
                   l_ratio, lng_ratio))

        if score > 0.8:
            name_standardized = standardize_name_from_dict(match)
            result_array.append({
                'name': name_standardized,
                'id': match['fec_id'],
                'score': score,
                'type': [],
                'match': False,
            })
            if debug:
                log.debug("Match found: %s" % name_standardized)

    if debug and not result_array:
        log.debug("No match for %s, which was standardized to: %s"
                  % (name, name1_standardized))

    # If it's a good match and there's only one, call it a definite match.
    if len(result_array) == 1 and result_array[0]['score'] > 0.9:
        result_array[0]['match'] = True

    # Surprisingly, google refine *doesn't* sort by score. Note that the
    # results are returned here in candidate order, without sorting.
    return result_array