def closest_by_sound(klass, search_string, similarity_threshold=0.8):
    ''' Finds objects of klass whose name sounds like search_string.

        For every object returned by klass.objects.all(), the longest
        token of its name (split on whitespace and periods, upper-cased)
        is taken as a surname guess. The primary dm() code of that guess
        is compared with the primary and secondary dm() codes of
        search_string.

        Arguments:
            klass -- anything exposing objects.all() whose items have a
                name attribute (presumably a Django model class --
                confirm against callers).
            search_string -- the term to look up.
            similarity_threshold -- minimum jarow() score at which two
                non-identical codes are still considered similar.

        Returns:
            - If any object's primary code equals the search term's
              primary or secondary code: a list of
              (obj, obj_sound, 1.0) tuples for those objects.
            - Otherwise, with five or fewer similar matches: a list of
              (obj, obj_sound, score) tuples sorted by ascending score.
            - Otherwise, with more than five similar matches: a 2-tuple
              (search_sound, matches) where matches holds only the
              entries scoring at or above the mean score.
              NOTE(review): this shape differs from the other two return
              paths; it is kept unchanged because callers may rely on it.

        Objects whose name yields no tokens (empty, or made only of
        whitespace and periods) are skipped.
    '''
    same = []
    similar = []

    # metaphones of search term
    search_sound = dm(search_string)
    search_primary = search_sound[0]
    search_secondary = search_sound[1]

    for obj in klass.objects.all():
        name_list = str(obj.name).upper().replace('.', ' ').split()
        if not name_list:
            # max() raises ValueError on an empty list; one blank name
            # should not abort the whole search
            continue
        surname_guess = max(name_list, key=len)

        # metaphones of obj name
        obj_sound = dm(surname_guess)
        obj_primary = obj_sound[0]

        # exact match on either the primary or the secondary metaphone of
        # the search term against the primary metaphone of obj
        if search_primary == obj_primary or (
                search_secondary is not None
                and search_secondary == obj_primary):
            same.append((obj, obj_sound, 1.0))
            continue

        # no exact match, so see if the primary metaphones are similar
        primary_sound_dist = jarow(str(search_primary), str(obj_primary))
        if primary_sound_dist >= similarity_threshold:
            similar.append((obj, obj_sound, primary_sound_dist))
            continue

        if search_secondary is not None:
            # still no good match; try the secondary metaphone of the
            # search term against the primary metaphone of obj
            secondary_sound_dist = jarow(str(search_secondary),
                                         str(obj_primary))
            if secondary_sound_dist >= similarity_threshold:
                similar.append((obj, obj_sound, secondary_sound_dist))

    if same:
        return same

    # no exact matches: fall back to similar sounding ones, ascending score
    similar.sort(key=operator.itemgetter(2))

    # with more than 5 similar matches keep only those scoring at or
    # above the mean score
    if len(similar) > 5:
        avg_jaro = sum((match[2] for match in similar), 0.0) / len(similar)
        above_avg_jaro = [match for match in similar if match[2] >= avg_jaro]
        return search_sound, above_avg_jaro

    return similar
def calc_dists(mine, theirs):
    ''' Compares two values as unicode text using three string metrics.

        Both arguments are converted with unicode() and then handed, in
        this order, to distance() (Levenshtein), dameraulevenshtein()
        (Damerau-Levenshtein) and jarow() (Jaro-Winkler).

        Returns a 3-item tuple with the three results in that order.
    '''
    left, right = unicode(mine), unicode(theirs)
    return (
        distance(left, right),            # levenshtein
        dameraulevenshtein(left, right),  # damerau-levenshtein
        jarow(left, right),               # jaro-winkler
    )
# Example no. 3
# 0
# Upper-cased country name -> three-letter country code. A value of None
# means the name is recognised but has no code in this table.
_COUNTRY_MAP = {
    u'AFGHANISTAN': u'AFG',
    u'ALBANIA': u'ALB',
    u'ALGERIA': u'DZA',
    u'AMERICAN SAMOA': u'ASM',
    u'ANDORRA': u'AND',
    u'ANGOLA': u'AGO',
    u'ANGUILLA': u'AIA',
    u'ANTARCTICA': None,
    u'ANTIGUA AND BARBUDA': u'ATG',
    u'ARGENTINA': u'ARG',
    u'ARMENIA': u'ARM',
    u'ARUBA': u'ABW',
    u'AUSTRALIA': u'AUS',
    u'AUSTRIA': u'AUT',
    u'AZERBAIJAN': u'AZE',
    u'BAHAMAS': u'BHS',
    u'BAHRAIN': u'BHR',
    u'BANGLADESH': u'BGD',
    u'BARBADOS': u'BRB',
    u'BELARUS': u'BLR',
    u'BELGIUM': u'BEL',
    u'BELIZE': u'BLZ',
    u'BENIN': u'BEN',
    u'BERMUDA': u'BMU',
    u'BHUTAN': u'BTN',
    u'BOLIVIA': u'BOL',
    u'BOSNIA AND HERZEGOVINA': u'BIH',
    u'BOTSWANA': u'BWA',
    u'BOUVET ISLAND': None,
    u'BRAZIL': u'BRA',
    u'BRITISH INDIAN OCEAN TERRITORY': None,
    u'BRUNEI DARUSSALAM': u'BRN',
    u'BULGARIA': u'BGR',
    u'BURKINA FASO': u'BFA',
    u'BURUNDI': u'BDI',
    u'CAMBODIA': u'KHM',
    u'CAMEROON': u'CMR',
    u'CANADA': u'CAN',
    u'CAPE VERDE': u'CPV',
    u'CAYMAN ISLANDS': u'CYM',
    u'CENTRAL AFRICAN REPUBLIC': u'CAF',
    u'CHAD': u'TCD',
    u'CHILE': u'CHL',
    u'CHINA': u'CHN',
    u'CHRISTMAS ISLAND': None,
    u'COCOS (KEELING) ISLANDS': None,
    u'COLOMBIA': u'COL',
    u'COMOROS': u'COM',
    u'CONGO': u'COG',
    u'CONGO, THE DEMOCRATIC REPUBLIC OF THE': u'COD',
    u'COOK ISLANDS': u'COK',
    u'COSTA RICA': u'CRI',
    u"COTE D'IVOIRE": u'CIV',
    u'CROATIA': u'HRV',
    u'CUBA': u'CUB',
    u'CYPRUS': u'CYP',
    u'CZECH REPUBLIC': u'CZE',
    u'DENMARK': u'DNK',
    u'DJIBOUTI': u'DJI',
    u'DOMINICA': u'DMA',
    u'DOMINICAN REPUBLIC': u'DOM',
    u'ECUADOR': u'ECU',
    u'EGYPT': u'EGY',
    u'EL SALVADOR': u'SLV',
    u'EQUATORIAL GUINEA': u'GNQ',
    u'ERITREA': u'ERI',
    u'ESTONIA': u'EST',
    u'ETHIOPIA': u'ETH',
    u'FALKLAND ISLANDS (MALVINAS)': u'FLK',
    u'FAROE ISLANDS': u'FRO',
    u'FIJI': u'FJI',
    u'FINLAND': u'FIN',
    u'FRANCE': u'FRA',
    u'FRENCH GUIANA': u'GUF',
    u'FRENCH POLYNESIA': u'PYF',
    u'FRENCH SOUTHERN TERRITORIES': None,
    u'GABON': u'GAB',
    u'GAMBIA': u'GMB',
    u'GEORGIA': u'GEO',
    u'GERMANY': u'DEU',
    u'GHANA': u'GHA',
    u'GIBRALTAR': u'GIB',
    u'GREECE': u'GRC',
    u'GREENLAND': u'GRL',
    u'GRENADA': u'GRD',
    u'GUADELOUPE': u'GLP',
    u'GUAM': u'GUM',
    u'GUATEMALA': u'GTM',
    u'GUINEA': u'GIN',
    u'GUINEA-BISSAU': u'GNB',
    u'GUYANA': u'GUY',
    u'HAITI': u'HTI',
    u'HEARD ISLAND AND MCDONALD ISLANDS': None,
    u'HOLY SEE (VATICAN CITY STATE)': u'VAT',
    u'HONDURAS': u'HND',
    u'HONG KONG': u'HKG',
    u'HUNGARY': u'HUN',
    u'ICELAND': u'ISL',
    u'INDIA': u'IND',
    u'INDONESIA': u'IDN',
    u'IRAN, ISLAMIC REPUBLIC OF': u'IRN',
    u'IRAQ': u'IRQ',
    u'IRELAND': u'IRL',
    u'ISRAEL': u'ISR',
    u'ITALY': u'ITA',
    u'JAMAICA': u'JAM',
    u'JAPAN': u'JPN',
    u'JORDAN': u'JOR',
    u'KAZAKHSTAN': u'KAZ',
    u'KENYA': u'KEN',
    u'KIRIBATI': u'KIR',
    u"KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF": u'PRK',
    u'KOREA, REPUBLIC OF': u'KOR',
    u'KUWAIT': u'KWT',
    u'KYRGYZSTAN': u'KGZ',
    u"LAO PEOPLE'S DEMOCRATIC REPUBLIC": u'LAO',
    u'LATVIA': u'LVA',
    u'LEBANON': u'LBN',
    u'LESOTHO': u'LSO',
    u'LIBERIA': u'LBR',
    u'LIBYAN ARAB JAMAHIRIYA': u'LBY',
    u'LIECHTENSTEIN': u'LIE',
    u'LITHUANIA': u'LTU',
    u'LUXEMBOURG': u'LUX',
    u'MACAO': u'MAC',
    u'MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF': u'MKD',
    u'MADAGASCAR': u'MDG',
    u'MALAWI': u'MWI',
    u'MALAYSIA': u'MYS',
    u'MALDIVES': u'MDV',
    u'MALI': u'MLI',
    u'MALTA': u'MLT',
    u'MARSHALL ISLANDS': u'MHL',
    u'MARTINIQUE': u'MTQ',
    u'MAURITANIA': u'MRT',
    u'MAURITIUS': u'MUS',
    u'MAYOTTE': None,
    u'MEXICO': u'MEX',
    u'MICRONESIA, FEDERATED STATES OF': u'FSM',
    u'MOLDOVA, REPUBLIC OF': u'MDA',
    u'MONACO': u'MCO',
    u'MONGOLIA': u'MNG',
    u'MONTSERRAT': u'MSR',
    u'MOROCCO': u'MAR',
    u'MOZAMBIQUE': u'MOZ',
    u'MYANMAR': u'MMR',
    u'NAMIBIA': u'NAM',
    u'NAURU': u'NRU',
    u'NEPAL': u'NPL',
    u'NETHERLANDS': u'NLD',
    u'NETHERLANDS ANTILLES': u'ANT',
    u'NEW CALEDONIA': u'NCL',
    u'NEW ZEALAND': u'NZL',
    u'NICARAGUA': u'NIC',
    u'NIGER': u'NER',
    u'NIGERIA': u'NGA',
    u'NIUE': u'NIU',
    u'NORFOLK ISLAND': u'NFK',
    u'NORTHERN MARIANA ISLANDS': u'MNP',
    u'NORWAY': u'NOR',
    u'OMAN': u'OMN',
    u'PAKISTAN': u'PAK',
    u'PALAU': u'PLW',
    u'PALESTINIAN TERRITORY, OCCUPIED': None,
    u'PANAMA': u'PAN',
    u'PAPUA NEW GUINEA': u'PNG',
    u'PARAGUAY': u'PRY',
    u'PERU': u'PER',
    u'PHILIPPINES': u'PHL',
    u'PITCAIRN': u'PCN',
    u'POLAND': u'POL',
    u'PORTUGAL': u'PRT',
    u'PUERTO RICO': u'PRI',
    u'QATAR': u'QAT',
    u'REUNION': u'REU',
    u'ROMANIA': u'ROM',
    u'RUSSIAN FEDERATION': u'RUS',
    u'RWANDA': u'RWA',
    u'SAINT HELENA': u'SHN',
    u'SAINT KITTS AND NEVIS': u'KNA',
    u'SAINT LUCIA': u'LCA',
    u'SAINT PIERRE AND MIQUELON': u'SPM',
    u'SAINT VINCENT AND THE GRENADINES': u'VCT',
    u'SAMOA': u'WSM',
    u'SAN MARINO': u'SMR',
    u'SAO TOME AND PRINCIPE': u'STP',
    u'SAUDI ARABIA': u'SAU',
    u'SENEGAL': u'SEN',
    u'SERBIA AND MONTENEGRO': None,
    u'SEYCHELLES': u'SYC',
    u'SIERRA LEONE': u'SLE',
    u'SINGAPORE': u'SGP',
    u'SLOVAKIA': u'SVK',
    u'SLOVENIA': u'SVN',
    u'SOLOMON ISLANDS': u'SLB',
    u'SOMALIA': u'SOM',
    u'SOUTH AFRICA': u'ZAF',
    u'SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS': None,
    u'SPAIN': u'ESP',
    u'SRI LANKA': u'LKA',
    u'SUDAN': u'SDN',
    u'SURINAME': u'SUR',
    u'SVALBARD AND JAN MAYEN': u'SJM',
    u'SWAZILAND': u'SWZ',
    u'SWEDEN': u'SWE',
    u'SWITZERLAND': u'CHE',
    u'SYRIAN ARAB REPUBLIC': u'SYR',
    u'TAIWAN, PROVINCE OF CHINA': u'TWN',
    u'TAJIKISTAN': u'TJK',
    u'TANZANIA, UNITED REPUBLIC OF': u'TZA',
    u'THAILAND': u'THA',
    u'TIMOR-LESTE': None,
    u'TOGO': u'TGO',
    u'TOKELAU': u'TKL',
    u'TONGA': u'TON',
    u'TRINIDAD AND TOBAGO': u'TTO',
    u'TUNISIA': u'TUN',
    u'TURKEY': u'TUR',
    u'TURKMENISTAN': u'TKM',
    u'TURKS AND CAICOS ISLANDS': u'TCA',
    u'TUVALU': u'TUV',
    u'UGANDA': u'UGA',
    u'UKRAINE': u'UKR',
    u'UNITED ARAB EMIRATES': u'ARE',
    u'UNITED KINGDOM': u'GBR',
    u'UNITED STATES': u'USA',
    u'UNITED STATES MINOR OUTLYING ISLANDS': None,
    u'URUGUAY': u'URY',
    u'UZBEKISTAN': u'UZB',
    u'VANUATU': u'VUT',
    u'VENEZUELA': u'VEN',
    u'VIET NAM': u'VNM',
    u'VIRGIN ISLANDS, BRITISH': u'VGB',
    u'VIRGIN ISLANDS, U.S.': u'VIR',
    u'WALLIS AND FUTUNA': u'WLF',
    u'WESTERN SAHARA': u'ESH',
    u'YEMEN': u'YEM',
    u'ZAMBIA': u'ZMB',
    u'ZIMBABWE': u'ZWE',
}


def reconcile_country(raw_country):
    ''' Maps a free-text country name to its three-letter country code.

        The upper-cased raw_country is first looked up directly in the
        country table. If it is not found, every country name in the
        table is compared phonetically with raw_country using dm() and
        collected as a suggestion when the codes match.

        Arguments:
            raw_country -- the country name as entered; must provide
                upper().

        Returns a 2-item tuple:
            (True, code) when the name is a key of the table. code is
                None for names the table lists without a code.
            (False, {'double-metaphone': suggestions}) otherwise, where
                suggestions is a list of
                (score, raw_country, country_name, code) tuples and may
                be empty.
    '''
    # check if term is a key in the country table
    lookup_key = raw_country.upper()
    if lookup_key in _COUNTRY_MAP:
        return True, _COUNTRY_MAP[lookup_key]

    # NOTE(review): jarow() appears to yield scores on a 0..1 scale (an
    # exact match is recorded as 1.0 and 0.8 is used as a threshold
    # elsewhere in this file), in which case a threshold of 2 can never
    # be reached and only exact metaphone matches are suggested. The
    # value is kept as found -- confirm the intended threshold.
    similarity_threshold = 2

    search_sound = dm(unicode(raw_country))
    suggestions = []
    for country_name, code in _COUNTRY_MAP.items():
        country_sound = dm(unicode(country_name))

        # exact match on either the primary or the secondary metaphone of
        # the search term against the primary metaphone of the country
        if search_sound[0] == country_sound[0] or (
                search_sound[1] is not None
                and search_sound[1] == country_sound[0]):
            suggestions.append((1.0, raw_country, country_name, code))
            continue

        # no exact match, so see if the primary metaphones are similar
        primary_sound_dist = jarow(str(search_sound[0]),
                                   str(country_sound[0]))
        if primary_sound_dist >= similarity_threshold:
            suggestions.append(
                (primary_sound_dist, raw_country, country_name, code))

    return False, {'double-metaphone': suggestions}