def chanmsg(self, conn, user, channel, message): ch = self.channels[conn.factory.network, conn.irclower(channel)] if message.strip().lower() == "!next": ch.nobodygotit() elif message.strip().lower() == "!stop": ch.on = False ch.timer.stop() elif message.strip().lower() == "!start": ch.on = True ch.doquestion() elif message.strip().lower() == "!hint": stars = [i for i, c in enumerate(ch.hint) if c == "*"] n = min(3, len(stars)-3) if n >= 1: for i in random.sample(stars, n): ch.hint = ch.hint[:i] + ch.answer[i] + ch.hint[i+1:] ch.post("Hint!!!!! \x037" + ch.hint) elif ch.on: inp = ' '.join(message.strip().lower().split()) ans = ' '.join(ch.answer.strip().lower().split()) if inp == ans: ch.post("%s got the answer! %s" % (irc.usersplit(user).group("nick"), ch.explanation)) ch.doquestion() elif dameraulevenshtein.dameraulevenshtein(inp, ans) / len(ans) <= .2: ch.post(message + "? \x033That's close!") print "levenshtein" elif metaphone.dm(inp) == metaphone.dm(ans): ch.post(message + "? \x033That's close!") print "metaphone"
def __get_values(params):
    """Flatten a voter-record dict into its ordered list of column values.

    Primary double-metaphone codes are derived for the name and street
    fields so they can be matched phonetically downstream.
    """
    name = params['name']
    voter = params['voter_info']
    contact = params['contact_info']
    addr = params['address']
    return [
        name['last'],
        name['first'],
        name['middle'],
        name['suffix'],
        name['nickname'],
        dm(name['last'])[0],
        dm(name['first'])[0],
        dm(name['nickname'])[0],
        voter['birth_year'],
        voter['gender'],
        contact['email'],
        contact['phone1'],
        contact['phone2'],
        addr['house_number'],
        addr['pre_direction'],
        addr['street_name'],
        addr['street_type'],
        addr['suf_direction'],
        addr['unit'],
        dm(addr['street_name'])[0],
        addr['city'],
        addr['zipcode'],
        voter['precinct_id'],
        voter['voter_id'],
        voter['reg_date'],
    ]
def _metaphones(query, dataset):
    """Encode every keyword in *query* and *dataset* as its primary metaphone.

    Returns the pair (encoded_query, encoded_dataset) preserving the
    original structure (a list for the query, a list of lists for the
    dataset).
    """
    encoded_query = [dm(keyword)[0] for keyword in query]
    encoded_dataset = [[dm(keyword)[0] for keyword in entry]
                       for entry in dataset]
    return encoded_query, encoded_dataset
def compute_similarity_matching_fields(self):
    """Use double metaphone values and store as 'X-Y'."""
    def _dm_pair(value):
        # metaphone.dm returns a (primary, secondary) tuple; render it
        # as a single 'primary-secondary' string.
        return '%s-%s' % metaphone.dm(unicode(value))

    # Empty/falsy source fields are stored as None.
    self.name_metaphone = _dm_pair(self.name) if self.name else None
    self.address_digits = _filter_non_digits(self.address) if self.address else None
    self.address_metaphone = _dm_pair(self.address) if self.address else None
    self.city_metaphone = _dm_pair(self.city) if self.city else None
    self.phone_normalised = _filter_non_digits(self.phone1) if self.phone1 else None
def palabras_sim(palabras1, palabras2):
    """Sum the phonetic similarity of position-aligned words of two phrases.

    Both inputs are normalised via module2.Sortable and latin-1 encoded
    before splitting; comparison stops at the shorter phrase.
    """
    lista1 = module2.Sortable(palabras1).encode('latin_1').split(' ')
    lista2 = module2.Sortable(palabras2).encode('latin_1').split(' ')
    total = 0
    # zip truncates at the shorter list, matching range(min(len, len)).
    for w1, w2 in zip(lista1, lista2):
        total += similaridad(dm(w1), dm(w2))
    return total
def greet_engine(self):
    """Trim everything up to (and including) the spoken assistant name.

    Each recognised word's primary metaphone is compared against the
    configured assistant name; on a >0.7 phonetic match only the words
    that followed it are kept in self.raw_text_array.
    """
    assistant_name = self.c.config.get('SYSTEM', 'assistant_name')
    name_code = dm(assistant_name)[0]
    for position, word in enumerate(self.raw_text_array):
        word_code = dm(word)[0]
        if sm(None, name_code, word_code).ratio() > 0.7:
            self.raw_text_array = self.raw_text_array[position + 1:]
            return
def computePhonemic(self, word):
    """Return the double-metaphone tuple for *word*, tolerating bad encodings.

    Tries the value as-is first, then a lenient UTF-8 decode, then a
    lenient latin-1 decode as the last resort.
    """
    try:
        return metaphone.dm(unicode(word))
    except Exception:
        pass
    try:
        return metaphone.dm(word.decode('utf-8', 'ignore'))
    except Exception:
        return metaphone.dm(word.decode('latin-1', 'ignore'))
def get_metaphone_level(self, first, second):
    """Score the phonetic agreement of two strings.

    4: primary codes agree; 2: one side's secondary matches the other's
    primary; 1: secondaries agree; 0: no agreement. The checks run in
    order and later matches overwrite earlier ones, mirroring the
    original sequential-if precedence.
    """
    first_pri, first_sec = metaphone.dm(first)
    second_pri, second_sec = metaphone.dm(second)
    checks = (
        (first_pri == second_pri, 4),
        (first_sec == second_pri or first_pri == second_sec, 2),
        (first_sec == second_sec, 1),
    )
    level = 0
    for matched, score in checks:
        if matched:
            level = score
    return level
def closest_by_sound(klass, search_string, similarity_threshold = 0.8):
    """Find objects of *klass* whose guessed surname sounds like *search_string*.

    The longest word of each object's name is taken as the surname and
    compared by double metaphone (exact primary/secondary match) or by
    Jaro-Winkler similarity of the phonetic codes. Returns a list of
    (obj, obj_sound, score) tuples: exact matches if any exist,
    otherwise similar matches (trimmed to the above-average half when
    there are more than 5).
    """
    same = []
    similar = []
    # metaphones of search term
    search_sound = dm(search_string)
    for obj in klass.objects.all():
        name_list = str(obj.name).upper().replace('.', ' ').split()
        surname_guess = max(name_list, key=len)
        # metaphones of obj name
        obj_sound = dm(surname_guess)
        if search_sound[0] == obj_sound[0]:
            # primary metaphones match exactly
            same.append((obj, obj_sound, 1.0))
            continue
        else:
            if search_sound[1] is not None:
                # see if secondary metaphone of search_sound matches
                # primary metaphone of obj
                if search_sound[1] == obj_sound[0]:
                    same.append((obj, obj_sound, 1.0))
                    continue
        # no exact match, so see if the primary metaphones are similar
        primary_sound_dist = jarow(str(search_sound[0]), str(obj_sound[0]))
        if primary_sound_dist >= similarity_threshold:
            similar.append((obj, obj_sound, primary_sound_dist))
            continue
        if search_sound[1] is not None:
            # still dont have a good match. see if secondary metaphone
            # of search_sound is similar to obj
            secondary_sound_dist = jarow(str(search_sound[1]), str(obj_sound[0]))
            if secondary_sound_dist >= similarity_threshold:
                similar.append((obj, obj_sound, secondary_sound_dist))
    if len(same) > 0:
        return same
    similar.sort(None, operator.itemgetter(2))
    # return similar sounding matches if there are no exact matches
    # limit to top 50 percent if there are more than 5 similar matches
    if len(similar) > 5:
        def average(values):
            return sum(values, 0.0) / len(values)
        avg_jaro = average([x[2] for x in similar])
        above_avg_jaro = [x for x in similar if (x[2] >= avg_jaro)]
        # BUG FIX: this branch previously returned the tuple
        # (search_sound, above_avg_jaro) while every other path returns
        # a plain list of match tuples; return just the list so callers
        # get a consistent type.
        return above_avg_jaro
    return similar
def compute_similarity_matching_fields(self):
    """Use double metaphone values and store as 'X-Y'."""
    def phonetic(value):
        # Join the (primary, secondary) double-metaphone pair as 'X-Y';
        # falsy inputs map to None.
        return '%s-%s' % metaphone.dm(unicode(value)) if value else None

    def digits(value):
        return _filter_non_digits(value) if value else None

    self.name_metaphone = phonetic(self.name)
    self.address_digits = digits(self.address)
    self.address_metaphone = phonetic(self.address)
    self.city_metaphone = phonetic(self.city)
    self.phone_normalised = digits(self.phone1)
def palabras_sim(palabras1, palabras2):
    """Sum phonetic similarity over all cross pairs of significant words.

    Words listed in palabras_no_consideradas (stopwords) are skipped on
    either side; every remaining word of the first phrase is compared
    against every remaining word of the second.
    """
    lista1 = module2.Sortable(palabras1).encode('latin_1').split(' ')
    lista2 = module2.Sortable(palabras2).encode('latin_1').split(' ')
    total = 0
    for w1 in lista1:
        if w1.lower() in palabras_no_consideradas:
            continue
        for w2 in lista2:
            if w2.lower() in palabras_no_consideradas:
                continue
            total += similaridad(dm(w1), dm(w2))
    return total
def plausibleWords(incorrectWord):
    """Return dictionary words whose primary metaphone matches *incorrectWord*.

    The misspelt word's primary double-metaphone code is looked up in
    pre-computed US and GB phonetic dictionaries; each matching line
    number indexes the corresponding plain word list.
    """
    target = (metaphone.dm(incorrectWord))[0]
    plausibleList = []
    # (line_number, source) pairs for every phonetic match.
    matches = []
    # BUG FIX: use context managers -- the previous version opened four
    # files and never closed any of them.
    with open("metaphonicDictUS.txt", "r") as phoneticDictUS:
        for ctr, line in enumerate(phoneticDictUS):
            if line[:-1] == target:
                matches.append((ctr, "USprimary"))
    with open("metaphonicDictGB.txt", "r") as phoneticDictGB:
        for ctr, line in enumerate(phoneticDictGB):
            if line[:-1] == target:
                matches.append((ctr, "GBprimary"))
    with open("enUS.txt", "r") as USengDict:
        linesUS = USengDict.readlines()
    with open("enGB.txt", "r") as GBengDict:
        linesGB = GBengDict.readlines()
    for line_no, source in matches:
        if source == "USprimary":
            plausibleList.append(linesUS[line_no][:-1])
        else:
            plausibleList.append(linesGB[line_no][:-1])
    return plausibleList
def plausibleWords(incorrectWord):
    """List words phonetically identical to *incorrectWord*.

    Matches the word's primary double-metaphone code against the US and
    GB phonetic dictionary files, then resolves each matching line
    number to the word at the same line of the plain dictionaries.
    """
    target = (metaphone.dm(incorrectWord))[0]
    plausibleList = []
    matches = []  # (line_number, source) for every phonetic hit
    # BUG FIX: all four file handles were previously leaked; `with`
    # guarantees they are closed.
    with open("metaphonicDictUS.txt", "r") as phoneticDictUS:
        for ctr, line in enumerate(phoneticDictUS):
            if line[:-1] == target:
                matches.append((ctr, "USprimary"))
    with open("metaphonicDictGB.txt", "r") as phoneticDictGB:
        for ctr, line in enumerate(phoneticDictGB):
            if line[:-1] == target:
                matches.append((ctr, "GBprimary"))
    with open("enUS.txt", "r") as USengDict:
        linesUS = USengDict.readlines()
    with open("enGB.txt", "r") as GBengDict:
        linesGB = GBengDict.readlines()
    for line_no, source in matches:
        words = linesUS if source == "USprimary" else linesGB
        plausibleList.append(words[line_no][:-1])
    return plausibleList
def sort_text(raw_text):
    """Split a transcript into reserved sub-command words and free keywords.

    First strips everything up to and including the first word that
    phonetically matches the configured assistant name (>0.7 ratio),
    then separates RESERVED_WORDS from the remaining keywords.
    """
    words = raw_text.lower().split()
    name_code = dm(CONFIG['assistant_name'])[0]
    for position, word in enumerate(words):
        if sm(None, name_code, dm(word)[0]).ratio() > 0.7:
            words = words[position + 1:]
            break
    key_words = words.copy()
    sub_words = []
    for word in words:
        if word in RESERVED_WORDS:
            sub_words.append(word)
            key_words.remove(word)
    return sub_words, key_words
def add_item(self, key, value):
    "add a key and associated value(s) to index."
    canon_key = self.normalize_key(key)
    self.literal.setdefault(canon_key, []).append(value)
    for word in canon_key.split():
        # Word-level and first-letter indexes.
        self.words.setdefault(word, []).append(canon_key)
        self.alpha_words.setdefault(word[0], []).append(word)
        # Phonetic index on both double-metaphone codes (secondary only
        # when non-empty).
        primary, secondary = metaphone.dm(word)
        self.phonetic_words.setdefault(primary, []).append(canon_key)
        if secondary:
            self.phonetic_words.setdefault(secondary, []).append(canon_key)
def get_results(misspelt_word, prior_frequencies, ngram_words, matrices, phonetic):
    """Rank and print the five best corrections for *misspelt_word*.

    Candidates are pruned by n-gram similarity, scored against the edit
    matrices via the trie search, then reweighted by prior frequency and
    phonetic closeness before printing the top five.
    """
    candidates = similarity_prune(ngram_words, misspelt_word, NGRAM_N)
    word_ph = metaphone.dm(misspelt_word)
    trie = TrieNode()
    for candidate in candidates:
        trie.insert(candidate)
    scored = search(misspelt_word, matrices, trie)
    rescored = [
        (word, raw, score * prior_frequencies[word] * phonetic_score(word_ph, phonetic[word]))
        for word, raw, score in scored
    ]
    rescored.sort(key=lambda entry: entry[2], reverse=True)
    print_words_from_list(misspelt_word, rescored[:5])
def insert_statement(d):
    """Build a SQL INSERT statement string for one voter-record dict *d*.

    NOTE(review): values are spliced into the SQL text with naive '"'
    quoting and no escaping -- any field containing a double quote (or
    hostile text) breaks or injects into the statement. Prefer
    parameterized queries if the execution layer allows it.
    """
    precinct_id = str(get_precinct(d))
    # Expand an abbreviated street type (via street_abbrs); the expanded
    # form is used only for the phonetic code below -- the raw
    # street_type is what gets stored.
    t = d['street_type']
    if t in street_abbrs:
        t = street_abbrs[t]
    # reg_date appears to arrive as MMDDYYYY... and is reformatted to
    # "YYYY-MM-DD" -- TODO confirm the source layout.
    reg_date = '"%s-%s-%s"' % (
        d['reg_date'][4:].strip(),
        d['reg_date'][0:2],
        d['reg_date'][2:4])
    # Column values, in the order of the module-level `fldnames` list.
    flds = [
        '"' + d['last_name'] + '"',
        '"' + d['first_name'] + '"',
        '"' + d['middle_name'] + '"',
        '"' + d['name_suffix'] + '"',
        # Primary double-metaphone codes for phonetic matching.
        '"' + dm(d['last_name'])[0] + '"',
        '"' + dm(d['first_name'])[0] + '"',
        d['birth_year'],
        '"' + d['gender'] + '"',
        d['house_number'] if d['house_number'] else 'null',
        '"' + d['pre_direction'] + '"',
        '"' + d['street_name'] + '"',
        '"' + d['street_type'] + '"',
        '"' + d['suf_direction'] + '"',
        '"' + d['unit'] + '"',
        # Street phonetic code uses the expanded street type.
        '"' + dm(d['street_name'] + ' ' + t)[0] + '"',
        '"' + d['city'] + '"',
        '"' + d['zipcode'] + '"',
        precinct_id,
        '"' + d['voter_id'] + '"',
        reg_date,
        '"' + d['permanent_absentee'] + '"',
        '"' + d['status'] + '"',
        '"' + d['uocava'] + '"'
    ]
    return ("INSERT INTO voters "
            "(%s) "
            "VALUES (%s);\n") % (
        ','.join(fldnames),
        ','.join(flds)
    )
def getPhoneticComparison(s1, s2):
    """Score the phonetic resemblance of two strings on a 0.0-1.0 scale.

    1.0 means the primary double-metaphone codes are identical; lower
    values fall off with the Levenshtein distance between the two
    phonetic codes relative to their mean length.
    """
    import metaphone
    try:
        code1 = metaphone.dm(s1)[0]
        code2 = metaphone.dm(s2)[0]
    except BaseException as err:
        # Fall back to comparing the raw strings when metaphone cannot
        # encode the input.
        print("ERR: can't metaphone '%s' or '%s': err: %s" % (s1, s2, err))
        code1 = s1
        code2 = s2
    if code1 == code2:
        return 1.
    mid_len = (len(code1) + len(code2)) / 2
    if mid_len < 1:
        return 0.
    score = 0.9 - levenshtein(code1, code2) / float(mid_len)
    return score if score > 0. else 0.
def preprocessing():
    """Load the dictionary, priors, phonetic codes and edit matrices.

    Returns (prior_frequencies, ngram_words, matrices, words, phonetic):
    word priors are add-one smoothed counts normalised to probabilities,
    phonetic maps each word to its double-metaphone pair, and matrices
    holds the five confusion matrices plus one character-count vector.
    """
    words = []
    ngram_words = {}
    prior_frequencies = {}
    total_frequencies = 0
    matrices = []
    phonetic = {}
    # Reading dictionary (BUG FIX: file handles are now closed via `with`;
    # the previous version never closed any of them).
    with open('data/unixdict.txt') as f:
        for line in f.read().splitlines():
            word = line.split('\t')[0]
            words.append(word)
            phonetic[word] = metaphone.dm(word)
            prior_frequencies[word] = 1  # Doing add one
    # Reading priors
    with open('data/count_1w.txt') as f:
        for line in f.read().splitlines():
            word = line.split('\t')[0]
            freq = line.split('\t')[1]
            if word in prior_frequencies:
                prior_frequencies[word] = int(freq)
                total_frequencies += int(freq)
    # Divide by total frequency to get probability
    prior_frequencies = {k: v / float(total_frequencies)
                         for k, v in prior_frequencies.iteritems()}
    ngram_words = ngram_index_structure(words, NGRAM_N)
    # Load matrices (was `file(f).readlines()` -- the py2 builtin, with
    # no close).
    files = ['data/addoneAddXY.txt', 'data/addoneSubXY.txt',
             'data/addoneDelXY.txt', 'data/newCharsXY.txt',
             'data/addoneRevXY.txt', 'data/sumnewCharsXY.txt']
    for fname in files[:-1]:
        with open(fname) as fh:
            matrices.append([[float(x) for x in line.split()] for line in fh])
    # Last one is a vector, not a matrix
    with open(files[-1]) as fh:
        matrices.append([float(line) for line in fh])
    return (prior_frequencies, ngram_words, matrices, words, phonetic)
def phonetic_normalizer(s):
    """Collapse a name to its primary double-metaphone code.

    Lowercases, strips characters outside [alnum/space], drops known
    suffixes, and alphabetically sorts the remaining words before
    encoding. NOTE: the surviving words are joined with NO separator
    (preserving the original behaviour) before being passed to dm().
    """
    cleaned = NOT_ALPHANUMSPACE_RE.sub('', s.lower())
    kept = [w for w in sorted(cleaned.split(' ')) if w not in SUFFIXES]
    return dm(unicode("".join(kept)))[0]
def post(self):
    """Import an uploaded CSV of Programs or People (App Engine handler).

    The row schema decides the kind: rows with an "age_group" column are
    Programs (geocoded before storage), rows with "main_focus" are
    People (deduped by name metaphone). Region entities are created for
    any region names not already stored.
    """
    csv_file = self.request.get('file')
    cr = csv.DictReader(csv_file.split('\n'))
    g = geocoders.GoogleV3()
    complete = False
    to_put = []
    regions_list=[]
    kind = None
    for row in cr:
        # Sniff the row type from which columns are present.
        # NOTE(review): the bare excepts also hide unrelated errors.
        try:
            row["age_group"]
            kind = "Program"
        except:
            kind = None
        if kind == None:
            try:
                row["main_focus"]
                kind = "People"
            except:
                kind = None
        # NOTE(review): reset on every row, so only failures from the
        # last processed row survive to the response below -- confirm
        # whether this was intended.
        failures_array = []
        if kind == "Program":
            p = program_db.Program()
            lat = None
            lng = None
            address_string = row["address"] + " " + row["city"] + " " + row["state"]
            try:
                # Skip rows whose address is too short to geocode usefully.
                if len(address_string) > 10:
                    try:
                        place, (lat, lng) = g.geocode(address_string.lower())
                        logging.debug(lat)
                        logging.debug(lng)
                        logging.debug(place)
                    except:
                        failures_array.append(address_string)
                        continue
                    #geocodes = g.geocode(address_string.lower())
                    #raise Exception(geocodes[0])
                    setattr(p, "latitude", float(lat))
                    setattr(p, "longitude", float(lng))
                    for key in row.keys():
                        initial_value = str(row[key])
                        #new_value = quoted_value = urllib.quote(initial_value.encode('utf-8'))
                        new_value = unicode(initial_value, 'utf-8')
                        setattr(p, key, new_value)
                        #if key == "name":
                        #name_metaphone = metaphone.dm(unicode(row[key]))
                        #setattr(p, "name_metaphone", str(name_metaphone[0]))
                        if key == "region":
                            regions_list.append(row[key])
                    # Dedupe: only queue the Program when no entity
                    # already exists at the same coordinates.
                    q = program_db.Program.all()
                    q.filter('latitude = ', float(lat))
                    q.filter('longitude = ', float(lng))
                    #q.filter('name_metaphone = ', str(name_metaphone[0]))
                    if not q.get():
                        to_put.append(p)
                    else:
                        pass
                    complete = True
            except:
                failures_array.append(address_string)
                continue
        #if kind == "Location":
        #p = location_db.Location()
        #lat = None
        #lng = None
        #address_string = row["address"] + " " + row["city"] + " " + row["state"]
        #place, (lat, lng) = g.geocode(address_string.lower())
        #setattr(p, "latitude", float(lat))
        #setattr(p, "longitude", float(lng))
        #for key in row.keys():
        #setattr(p, key, row[key])
        #if key == "name":
        #name_metaphone = metaphone.dm(unicode(row[key]))
        #setattr(p, key, str(name_metaphone))
        #to_put.append(p)
        #complete = True
        if kind == "People":
            p = person_db.Person()
            for key in row.keys():
                setattr(p, key, row[key])
                if key == "name":
                    name_metaphone = metaphone.dm(unicode(row[key]))
                    setattr(p, "name_metaphone", str(name_metaphone[0]))
                if key == "program":
                    program_metaphone = metaphone.dm(unicode(row[key]))
                    setattr(p, "program_metaphone", str(program_metaphone[0]))
                if key == "region":
                    regions_list.append(row[key])
            # Dedupe People by the name's primary metaphone.
            # NOTE(review): name_metaphone is unbound here when a row has
            # no "name" column -- would raise NameError; confirm schema.
            q = person_db.Person.all()
            q.filter('name_metaphone = ', str(name_metaphone[0]))
            #q.get()
            if not q.get():
                to_put.append(p)
            complete = True
    if complete:
        final_list = list(set(to_put))
        db.put(final_list)
        # Create Region entities for any region names not yet stored.
        q = region_db.Region.all()
        query = q.fetch(1000)
        saved_regions_list = []
        for q in query:
            saved_regions_list.append(q.name)
        final_regions_list = list(set(regions_list))
        to_save_regions = list(set(final_regions_list) - set(saved_regions_list))
        for region in to_save_regions:
            r = region_db.Region(name=region)
            r.put()
        self.response.write(failures_array)
        #self.redirect("/import?message=Import complete")
        return
    self.redirect("/import?message=Nothing Uploaded, the CSV was not valid")
    return
def reconcile_country(raw_country):
    """Map a free-text country name to its ISO 3166-1 alpha-3 code.

    Returns (True, code) on an exact case-insensitive match, otherwise
    (False, {'double-metaphone': suggestions}) where each suggestion is
    a (score, raw_country, candidate_name, candidate_code) tuple built
    from double-metaphone and Jaro-Winkler comparisons.
    """
    country_map = {
        u'AFGHANISTAN': u'AFG', u'ALBANIA': u'ALB', u'ALGERIA': u'DZA',
        u'AMERICAN SAMOA': u'ASM', u'ANDORRA': u'AND', u'ANGOLA': u'AGO',
        u'ANGUILLA': u'AIA', u'ANTARCTICA': None,
        u'ANTIGUA AND BARBUDA': u'ATG', u'ARGENTINA': u'ARG',
        u'ARMENIA': u'ARM', u'ARUBA': u'ABW', u'AUSTRALIA': u'AUS',
        u'AUSTRIA': u'AUT', u'AZERBAIJAN': u'AZE', u'BAHAMAS': u'BHS',
        u'BAHRAIN': u'BHR', u'BANGLADESH': u'BGD', u'BARBADOS': u'BRB',
        u'BELARUS': u'BLR', u'BELGIUM': u'BEL', u'BELIZE': u'BLZ',
        u'BENIN': u'BEN', u'BERMUDA': u'BMU', u'BHUTAN': u'BTN',
        u'BOLIVIA': u'BOL', u'BOSNIA AND HERZEGOVINA': u'BIH',
        u'BOTSWANA': u'BWA', u'BOUVET ISLAND': None, u'BRAZIL': u'BRA',
        u'BRITISH INDIAN OCEAN TERRITORY': None,
        u'BRUNEI DARUSSALAM': u'BRN', u'BULGARIA': u'BGR',
        u'BURKINA FASO': u'BFA', u'BURUNDI': u'BDI', u'CAMBODIA': u'KHM',
        u'CAMEROON': u'CMR', u'CANADA': u'CAN', u'CAPE VERDE': u'CPV',
        u'CAYMAN ISLANDS': u'CYM', u'CENTRAL AFRICAN REPUBLIC': u'CAF',
        u'CHAD': u'TCD', u'CHILE': u'CHL', u'CHINA': u'CHN',
        u'CHRISTMAS ISLAND': None, u'COCOS (KEELING) ISLANDS': None,
        u'COLOMBIA': u'COL', u'COMOROS': u'COM', u'CONGO': u'COG',
        u'CONGO, THE DEMOCRATIC REPUBLIC OF THE': u'COD',
        u'COOK ISLANDS': u'C*K', u'COSTA RICA': u'CRI',
        u"COTE D'IVOIRE": u'CIV', u'CROATIA': u'HRV', u'CUBA': u'CUB',
        u'CYPRUS': u'CYP', u'CZECH REPUBLIC': u'CZE', u'DENMARK': u'DNK',
        u'DJIBOUTI': u'DJI', u'DOMINICA': u'DMA',
        u'DOMINICAN REPUBLIC': u'DOM', u'ECUADOR': u'ECU',
        u'EGYPT': u'EGY', u'EL SALVADOR': u'SLV',
        u'EQUATORIAL GUINEA': u'GNQ', u'ERITREA': u'ERI',
        u'ESTONIA': u'EST', u'ETHIOPIA': u'ETH',
        u'FALKLAND ISLANDS (MALVINAS)': u'FLK', u'FAROE ISLANDS': u'FRO',
        u'FIJI': u'FJI', u'FINLAND': u'FIN', u'FRANCE': u'FRA',
        u'FRENCH GUIANA': u'GUF', u'FRENCH POLYNESIA': u'PYF',
        u'FRENCH SOUTHERN TERRITORIES': None, u'GABON': u'GAB',
        u'GAMBIA': u'GMB', u'GEORGIA': u'GEO', u'GERMANY': u'DEU',
        u'GHANA': u'GHA', u'GIBRALTAR': u'GIB', u'GREECE': u'GRC',
        u'GREENLAND': u'GRL', u'GRENADA': u'GRD', u'GUADELOUPE': u'GLP',
        u'GUAM': u'GUM', u'GUATEMALA': u'GTM', u'GUINEA': u'GIN',
        u'GUINEA-BISSAU': u'GNB', u'GUYANA': u'GUY', u'HAITI': u'HTI',
        u'HEARD ISLAND AND MCDONALD ISLANDS': None,
        u'HOLY SEE (VATICAN CITY STATE)': u'VAT', u'HONDURAS': u'HND',
        u'HONG KONG': u'HKG', u'HUNGARY': u'HUN', u'ICELAND': u'ISL',
        u'INDIA': u'IND', u'INDONESIA': u'IDN',
        u'IRAN, ISLAMIC REPUBLIC OF': u'IRN', u'IRAQ': u'IRQ',
        u'IRELAND': u'IRL', u'ISRAEL': u'ISR', u'ITALY': u'ITA',
        u'JAMAICA': u'JAM', u'JAPAN': u'JPN', u'JORDAN': u'JOR',
        u'KAZAKHSTAN': u'KAZ', u'KENYA': u'KEN', u'KIRIBATI': u'KIR',
        u"KOREA, DEMOCRATIC PEOPLE'S REPUBLIC OF": u'PRK',
        u'KOREA, REPUBLIC OF': u'KOR', u'KUWAIT': u'KWT',
        u'KYRGYZSTAN': u'KGZ',
        u"LAO PEOPLE'S DEMOCRATIC REPUBLIC": u'LAO', u'LATVIA': u'LVA',
        u'LEBANON': u'LBN', u'LESOTHO': u'LSO', u'LIBERIA': u'LBR',
        u'LIBYAN ARAB JAMAHIRIYA': u'LBY', u'LIECHTENSTEIN': u'LIE',
        u'LITHUANIA': u'LTU', u'LUXEMBOURG': u'LUX', u'MACAO': u'MAC',
        u'MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF': u'MKD',
        u'MADAGASCAR': u'MDG', u'MALAWI': u'MWI', u'MALAYSIA': u'MYS',
        u'MALDIVES': u'MDV', u'MALI': u'MLI', u'MALTA': u'MLT',
        u'MARSHALL ISLANDS': u'MHL', u'MARTINIQUE': u'MTQ',
        u'MAURITANIA': u'MRT', u'MAURITIUS': u'MUS', u'MAYOTTE': None,
        u'MEXICO': u'MEX', u'MICRONESIA, FEDERATED STATES OF': u'FSM',
        u'MOLDOVA, REPUBLIC OF': u'MDA', u'MONACO': u'MCO',
        u'MONGOLIA': u'MNG', u'MONTSERRAT': u'MSR', u'MOROCCO': u'MAR',
        u'MOZAMBIQUE': u'MOZ', u'MYANMAR': u'MMR', u'NAMIBIA': u'NAM',
        u'NAURU': u'NRU', u'NEPAL': u'NPL', u'NETHERLANDS': u'NLD',
        u'NETHERLANDS ANTILLES': u'ANT', u'NEW CALEDONIA': u'NCL',
        u'NEW ZEALAND': u'NZL', u'NICARAGUA': u'NIC', u'NIGER': u'NER',
        u'NIGERIA': u'NGA', u'NIUE': u'NIU', u'NORFOLK ISLAND': u'NFK',
        u'NORTHERN MARIANA ISLANDS': u'MNP', u'NORWAY': u'NOR',
        u'OMAN': u'OMN', u'PAKISTAN': u'PAK', u'PALAU': u'PLW',
        u'PALESTINIAN TERRITORY, OCCUPIED': None, u'PANAMA': u'PAN',
        u'PAPUA NEW GUINEA': u'PNG', u'PARAGUAY': u'PRY',
        u'PERU': u'PER', u'PHILIPPINES': u'PHL', u'PITCAIRN': u'PCN',
        u'POLAND': u'POL', u'PORTUGAL': u'PRT', u'PUERTO RICO': u'PRI',
        u'QATAR': u'QAT', u'REUNION': u'REU', u'ROMANIA': u'ROM',
        u'RUSSIAN FEDERATION': u'RUS', u'RWANDA': u'RWA',
        u'SAINT HELENA': u'SHN', u'SAINT KITTS AND NEVIS': u'KNA',
        u'SAINT LUCIA': u'LCA', u'SAINT PIERRE AND MIQUELON': u'SPM',
        u'SAINT VINCENT AND THE GRENADINES': u'VCT', u'SAMOA': u'WSM',
        u'SAN MARINO': u'SMR', u'SAO TOME AND PRINCIPE': u'STP',
        u'SAUDI ARABIA': u'SAU', u'SENEGAL': u'SEN',
        u'SERBIA AND MONTENEGRO': None, u'SEYCHELLES': u'SYC',
        u'SIERRA LEONE': u'SLE', u'SINGAPORE': u'SGP',
        u'SLOVAKIA': u'SVK', u'SLOVENIA': u'SVN',
        u'SOLOMON ISLANDS': u'SLB', u'SOMALIA': u'SOM',
        u'SOUTH AFRICA': u'ZAF',
        u'SOUTH GEORGIA AND THE SOUTH SANDWICH ISLANDS': None,
        u'SPAIN': u'ESP', u'SRI LANKA': u'LKA', u'SUDAN': u'SDN',
        u'SURINAME': u'SUR', u'SVALBARD AND JAN MAYEN': u'SJM',
        u'SWAZILAND': u'SWZ', u'SWEDEN': u'SWE', u'SWITZERLAND': u'CHE',
        u'SYRIAN ARAB REPUBLIC': u'SYR',
        u'TAIWAN, PROVINCE OF CHINA': u'TWN', u'TAJIKISTAN': u'TJK',
        u'TANZANIA, UNITED REPUBLIC OF': u'TZA', u'THAILAND': u'THA',
        u'TIMOR-LESTE': None, u'TOGO': u'TGO', u'TOKELAU': u'TKL',
        u'TONGA': u'TON', u'TRINIDAD AND TOBAGO': u'TTO',
        u'TUNISIA': u'TUN', u'TURKEY': u'TUR', u'TURKMENISTAN': u'TKM',
        u'TURKS AND CAICOS ISLANDS': u'TCA', u'TUVALU': u'TUV',
        u'UGANDA': u'UGA', u'UKRAINE': u'UKR',
        u'UNITED ARAB EMIRATES': u'ARE', u'UNITED KINGDOM': u'GBR',
        u'UNITED STATES': u'USA',
        u'UNITED STATES MINOR OUTLYING ISLANDS': None,
        u'URUGUAY': u'URY', u'UZBEKISTAN': u'UZB', u'VANUATU': u'VUT',
        u'VENEZUELA': u'VEN', u'VIET NAM': u'VNM',
        u'VIRGIN ISLANDS, BRITISH': u'VGB',
        u'VIRGIN ISLANDS, U.S.': u'VIR', u'WALLIS AND FUTUNA': u'WLF',
        u'WESTERN SAHARA': u'ESH', u'YEMEN': u'YEM', u'ZAMBIA': u'ZMB',
        u'ZIMBABWE': u'ZWE'}
    # check if term is a key in country_map
    if raw_country.upper() in country_map:
        return True, country_map[raw_country.upper()]
    search_sound = dm(unicode(raw_country))
    suggestions = []
    for c in country_map.keys():
        country_sound = dm(unicode(c))
        if search_sound[0] == country_sound[0]:
            # primary metaphones match exactly
            suggestions.append((1.0, raw_country, c, country_map[c]))
            continue
        if search_sound[1] is not None:
            # see if secondary metaphone of search_sound matches
            # primary metaphone of the candidate
            if search_sound[1] == country_sound[0]:
                suggestions.append((1.0, raw_country, c, country_map[c]))
                continue
        # no exact match, so see if the primary metaphones are similar
        primary_sound_dist = jarow(str(search_sound[0]), str(country_sound[0]))
        # BUG FIX: this branch previously appended to an undefined name
        # `similar` (NameError) behind an unreachable threshold --
        # jarow() yields scores in [0, 1], so `>= 2` never fired. Near
        # matches now go into `suggestions` with a 0.8 similarity cutoff
        # (matching the threshold used elsewhere in this codebase).
        if primary_sound_dist >= 0.8:
            suggestions.append(
                (primary_sound_dist, raw_country, c, country_map[c]))
    return False, {'double-metaphone': suggestions}