Python longest_match примеры использования

Язык программирования: Python

Пространство имен/Пакет: utils.matchlength

Метод/Функция: longest_match

Примеров на hotexamples.com: 2

Найдено 2 примера использования longest_match в Python. Это лучшие примеры кода на Python для utils.matchlength.longest_match, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Пример #1

Показать файл

Файл: legislator_reconciler.py Проект: jsfenfen/reconcile-legislators

def run_legislator_query(name, state=None, office=None, year=None, congress=None):
    """Return fuzzy-matched legislator terms for ``name``, best score first.

    The query name is parsed with ``HumanName`` and normalised to
    "<cleaned last name> <un-nicknamed first name>".  Candidates are fetched
    with ``block_by_startswith`` (blocking on the first 6 characters of the
    last name) and each one is compared with several string-similarity
    metrics.  Candidates whose combined score exceeds 0.8 are returned.

    Args:
        name: Raw name string to reconcile. Names shorter than 4 characters
            are rejected immediately.
        state, office, year, congress: Optional filters forwarded unchanged
            to ``block_by_startswith``.

    Returns:
        A list of dicts with keys ``'name'``, ``'id'``, ``'score'``,
        ``'type'`` and ``'match'``, sorted by ``'score'`` descending.  The
        list is empty when nothing scores above 0.8.
    """
    starts_with_blocklength = 6
    result_array = []

    # Don't even bother if there are fewer than 4 letters.
    if len(name) < 4:
        return result_array

    name1 = HumanName(name)
    name1_standardized = simple_clean(name1.last) + " " + unnickname(name1.first)

    # We block with the first n characters of the last name.
    blocking_name = simple_clean(name1.last)

    # If we can't find the last name, assume the whole name is the last name.
    # This might be a bad idea.
    if not blocking_name:
        blocking_name = simple_clean(name)

    possible_matches = block_by_startswith(blocking_name, starts_with_blocklength, state, office, year, congress)

    for match in possible_matches:
        legislator = match.legislator
        name2 = simple_clean(legislator.last_name) + " " + unnickname(legislator.first_name)

        # Calculate a bunch of metrics, each rescaled from 0-100 to 0-1.
        text1 = name1_standardized
        text2 = name2
        ratio = 0.01 * fuzz.ratio(text1, text2)
        partial_ratio = 0.01 * fuzz.partial_ratio(text1, text2)
        # The token_* ratios are only reported in debug output, not scored.
        token_sort_ratio = 0.01 * fuzz.token_sort_ratio(text1, text2)
        token_set_ratio = 0.01 * fuzz.token_set_ratio(text1, text2)

        # True mean of the two lengths.  The previous expression
        # `1/2*len(text1)+len(text2)` evaluated to len(text2) under Python 2
        # integer division and to 0.5*len(text1)+len(text2) under Python 3.
        avg_len = (len(text1) + len(text2)) / 2.0
        min_len = min(len(text1), len(text2))

        l_ratio = 0
        try:
            l_distance = jellyfish.levenshtein_distance(text1, text2)
            if avg_len:
                l_ratio = 1.0 - (float(l_distance) / avg_len)
        except UnicodeEncodeError:
            # Best effort: leave l_ratio at 0 when the distance cannot be
            # computed for these strings.
            pass

        long_match = longest_match(text1, text2)
        # Guard against an empty string on either side.
        lng_ratio = float(long_match) / min_len if min_len else 0.0

        # Only compute the combined score when at least one metric is promising.
        score = 0
        if ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6 or lng_ratio > 0.6:
            score = compute_scores([ratio, partial_ratio, l_ratio, lng_ratio])

        if debug:
            print("Candidate %s vs %s score: %s" % (text1, text2, score))
            print("ratio=%s partial_ratio=%s token_sort_ratio=%s token_set_ratio=%s, l_ratio=%s lng_ratio=%s" % (ratio, partial_ratio, token_sort_ratio, token_set_ratio, l_ratio, lng_ratio))

        if score > 0.8:
            # Full name is missing in early years, so rebuild it from parts.
            return_name = legislator.official_full
            if not return_name:
                return_name = "%s %s %s %s" % (
                    legislator.first_name,
                    legislator.middle_name if legislator.middle_name else '',
                    legislator.last_name,
                    legislator.suffix if legislator.suffix else '',
                )

            name_standardized = "%s. %s (%s) (%s) %s-%s" % (match.term_type.title(), return_name, match.party, match.state, match.start.year, match.end.year)
            result_array.append({'name': name_standardized, 'id': legislator.bioguide, 'score': score, 'type': [], 'match': False})
            if debug:
                print("Match found: %s" % name_standardized)

    if debug and not result_array:
        print("No match for %s, which was standardized to: %s" % (name, name1_standardized))

    return sorted(result_array, key=itemgetter('score'), reverse=True)

Пример #2

Показать файл

Файл: fec_reconciler.py Проект: jsfenfen/reconcile-legislators

def match_by_name(name, state=None, office=None, cycle=None, reverse_name_order=False):
    """Return fuzzy-matched FEC candidates for ``name``.

    The query name is parsed with ``HumanName`` and normalised to
    "<cleaned last name> <un-nicknamed first name>" (or with first and last
    swapped when ``reverse_name_order`` is true).  Candidates are fetched
    with ``block_by_startswith`` using the module-level
    ``starts_with_blocklength`` and compared with several string-similarity
    metrics.  Candidates whose combined score exceeds 0.8 are returned.

    Args:
        name: Raw name string to reconcile.
        state, office, cycle: Optional filters forwarded unchanged to
            ``block_by_startswith``.
        reverse_name_order: Treat the parsed first name as the last name and
            vice versa, for inputs whose name order is flipped.

    Returns:
        A list of dicts with keys ``'name'``, ``'id'``, ``'score'``,
        ``'type'`` and ``'match'``, in candidate order (not sorted by score).
        When exactly one result is found and its score exceeds 0.9, its
        ``'match'`` flag is set to True.
    """
    if debug:
        print("match_by_name = state=%s office=%s cycle=%s" % (state, office, cycle))
    result_array = []
    name1 = HumanName(name)

    # Sometimes we run into a name that's flipped.
    if reverse_name_order:
        print("Running name reversal check!")
        blocking_name = simple_clean(name1.first)
        name1_standardized = simple_clean(name1.first) + " " + unnickname(name1.last)
    else:
        name1_standardized = simple_clean(name1.last) + " " + unnickname(name1.first)
        blocking_name = simple_clean(name1.last)

    # If we can't find the last name, assume the whole name is the last name.
    # This might be a bad idea.
    if not blocking_name:
        blocking_name = simple_clean(name)

    possible_matches = block_by_startswith(blocking_name, starts_with_blocklength, state, office, cycle)

    for match in possible_matches:
        name2_name = HumanName(match['fec_name'])
        name2 = simple_clean(name2_name.last) + " " + unnickname(name2_name.first)

        # Calculate a bunch of metrics, each rescaled from 0-100 to 0-1.
        text1 = name1_standardized
        text2 = name2
        ratio = 0.01 * fuzz.ratio(text1, text2)
        partial_ratio = 0.01 * fuzz.partial_ratio(text1, text2)
        # The token_* ratios are only reported in debug output, not scored.
        token_sort_ratio = 0.01 * fuzz.token_sort_ratio(text1, text2)
        token_set_ratio = 0.01 * fuzz.token_set_ratio(text1, text2)

        # True mean of the two lengths.  The previous expression
        # `1/2*len(text1)+len(text2)` evaluated to len(text2) under Python 2
        # integer division and to 0.5*len(text1)+len(text2) under Python 3.
        avg_len = (len(text1) + len(text2)) / 2.0
        min_len = min(len(text1), len(text2))

        l_ratio = 0
        try:
            l_distance = jellyfish.levenshtein_distance(text1, text2)
            if avg_len:
                l_ratio = 1.0 - (float(l_distance) / avg_len)
        except UnicodeEncodeError:
            # Best effort: leave l_ratio at 0 when the distance cannot be
            # computed for these strings.
            pass

        long_match = longest_match(text1, text2)
        # Guard against an empty string on either side.
        lng_ratio = float(long_match) / min_len if min_len else 0.0

        # Only compute the combined score when at least one metric is promising.
        score = 0
        if ratio > 0.6 or partial_ratio > 0.6 or l_ratio > 0.6 or lng_ratio > 0.6:
            score = compute_scores([ratio, partial_ratio, l_ratio, lng_ratio])

        if debug:
            log.debug("|fuzzymatchresult|%s|'%s'|'%s'|score=%s|ratio=%s|partial_ratio=%s|token_sort_ratio=%s|token_set_ratio=%s| l_ratio=%s|lng_ratio=%s" % (match['fec_id'], match['fec_name'], name, score, ratio, partial_ratio, token_sort_ratio, token_set_ratio, l_ratio, lng_ratio))

        if score > 0.8:
            name_standardized = standardize_name_from_dict(match)
            result_array.append({'name': name_standardized, 'id': match['fec_id'], 'score': score, 'type': [], 'match': False})
            if debug:
                log.debug("Match found: %s" % name_standardized)

    if debug and not result_array:
        log.debug("No match for %s, which was standardized to: %s" % (name, name1_standardized))

    # If it's a good match and there's only one, call it a definite match.
    if len(result_array) == 1 and result_array[0]['score'] > 0.9:
        result_array[0]['match'] = True

    # Surprisingly, google refine *doesn't* sort by score, so the results are
    # returned in candidate order.
    return result_array