def map_planname(origname):
    """Return a simplified version of a plan name and log the outcome.

    Names containing 'Refugee' are reduced to a '<region> Regional' form, using
    the first rule that applies to the space-separated words of the name:

    1. everything up to and including the word 'Regional';
    2. the words following 'from', stopping at the first word containing '(',
       with ' Regional' appended;
    3. the words preceding the word 'Refugee', with ' Regional' appended.

    Every other name (including one where 'Refugee' only occurs inside a longer
    token such as 'Refugees') has bracketed text removed, is passed through
    multiple_replace with 'Intersectoral', 'Response', 'Plan' and 'Joint'
    mapped to the empty string, and has its whitespace collapsed.

    Args:
        origname (str): Original plan name

    Returns:
        str: Simplified plan name
    """
    name = None
    if 'Refugee' in origname:
        words = origname.split(' ')
        if 'Regional' in words:
            name = ' '.join(words[:words.index('Regional') + 1])
        elif 'from' in words:
            region_words = []
            for word in words[words.index('from') + 1:]:
                if '(' in word:
                    break
                region_words.append(word)
            name = '%s Regional' % ' '.join(region_words)
        elif 'Refugee' in words:
            name = '%s Regional' % ' '.join(words[:words.index('Refugee')])
        # Otherwise 'Refugee' is only a substring of some word: the substring
        # test above passed but there is no exact token to split on, so fall
        # through to the generic simplification instead of raising ValueError.
    if not name:
        # Raw string: the pattern contains backslash escapes that are not
        # valid Python string escapes.
        name = re.sub(r'[\(\[].*?[\)\]]', '', origname)
        name = multiple_replace(name, {
            'Intersectoral': '',
            'Response': '',
            'Plan': '',
            'Joint': '',
        })
        name = ' '.join(name.split())
    if origname == name:
        logger.info('Plan name %s not simplified', name)
    else:
        logger.info('Plan name %s simplified from %s', name, origname)
    return name
def test_multiple_replace(self):
    """multiple_replace should apply every entry of the mapping to the fixture text."""
    replacements = {
        "quick": "slow",
        "fast": "slow",
        "lazy": "busy",
    }
    expected = "The slow brown fox jumped over the busy dog. It was so slow!"
    actual = multiple_replace(self.a, replacements)
    assert actual == expected
def fuzzy_pcode(self, countryiso3, adm1_name, scrapername=None):
    """Find the p-code for an admin one name, falling back to fuzzy matching.

    The normalised name is tried as an exact key, then as a substring of the
    known names, and finally by Refined Soundex distance against them. The
    outcome is recorded in self.ignored, self.errors or self.matches.

    Args:
        countryiso3: Key into self.name_to_pcode; also tested against
            self.iso3s_no_pcodes.
        adm1_name: Name to match.
        scrapername: Stored as the first element of every recorded tuple.
            Defaults to None.

    Returns:
        The matched p-code, or None if the country is skipped, the name is in
        self.adm1_fuzzy_ignore, or no candidate is within match_threshold.
    """
    if countryiso3 in self.iso3s_no_pcodes:
        self.ignored.add((scrapername, countryiso3))
        return None

    known_names = self.name_to_pcode.get(countryiso3)
    if not known_names:
        self.errors.add((scrapername, countryiso3))
        return None

    if adm1_name.lower() in self.adm1_fuzzy_ignore:
        self.ignored.add((scrapername, countryiso3, adm1_name))
        return None

    # Replace accented characters with non accented ones: decompose, then
    # drop the combining marks (category 'Mn').
    decomposed = unicodedata.normalize('NFD', adm1_name)
    lookup = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')
    # NOTE(review): `ascii` is presumably a module-level regex matching
    # non-ASCII characters (it shadows the builtin) - confirm.
    lookup = re.sub(ascii, ' ', lookup)
    lookup = unidecode(lookup).strip().lower()
    alt_lookup = multiple_replace(lookup, self.adm1_name_replacements)

    pcode = known_names.get(lookup, known_names.get(alt_lookup))
    if pcode:
        return pcode

    # Substring pass. Both lookups are always tried, so a hit on the second
    # one overrides a hit on the first.
    for needle in (lookup, alt_lookup):
        for map_name in known_names:
            if needle in map_name:
                pcode = known_names[map_name]
                self.matches.add((scrapername, countryiso3, adm1_name,
                                  self.pcode_to_name[pcode], 'substring'))
                break
    if pcode:
        return pcode

    # Phonetic pass.
    names = list(known_names.keys())
    lowered = [entry.lower() for entry in names]
    soundex = pyphonetics.RefinedSoundex()

    def candidates():
        # Yields (query, candidate, index into names) in comparison order.
        for idx, candidate in enumerate(lowered):
            yield lookup, candidate, idx
        for idx, candidate in enumerate(lowered):
            if candidate[:3] == 'al ':
                # Also try the name with 'al ' swapped for 'ad ' and removed.
                yield lookup, 'ad %s' % candidate[3:], idx
                yield lookup, candidate[3:], idx
            yield alt_lookup, candidate, idx

    best_distance = None
    best_index = None
    for query, candidate, idx in candidates():
        distance = soundex.distance(query, candidate)
        # Strict comparison: the earliest candidate wins ties.
        if best_distance is None or distance < best_distance:
            best_distance = distance
            best_index = idx

    if best_distance is None or best_distance > match_threshold:
        self.errors.add((scrapername, countryiso3, adm1_name))
        return None

    pcode = known_names[names[best_index]]
    self.matches.add((scrapername, countryiso3, adm1_name,
                      self.pcode_to_name[pcode], 'fuzzy'))
    return pcode
def fuzzy_pcode(self, countryiso3, name, scrapername=None):
    # type: (str, str, Optional[str]) -> Optional[str]
    """Fuzzy match name to pcode

    The cleaned name is tried as an exact key, then as a substring of the
    known names, and finally through self.phonetics.match. Outcomes are
    recorded in self.ignored, self.errors and self.matches, but only when a
    scrapername is supplied.

    Args:
        countryiso3 (str): Iso3 country code
        name (str): Name to match
        scrapername (Optional[str]): Name of scraper for logging purposes. Defaults to None (don't log).

    Returns:
        Optional[str]: Matched P code or None if no match
    """
    # Honour the documented contract: without a scraper name nothing is
    # recorded, so the sets never receive (None, ...) entries.
    log = scrapername is not None

    if self.countries_fuzzy_try is not None and countryiso3 not in self.countries_fuzzy_try:
        if log:
            self.ignored.add((scrapername, countryiso3))
        return None
    name_to_pcode = self.name_to_pcode.get(countryiso3)
    if not name_to_pcode:
        if log:
            self.errors.add((scrapername, countryiso3))
        return None
    if name.lower() in self.admin1_fuzzy_dont:
        if log:
            self.ignored.add((scrapername, countryiso3, name))
        return None

    adm1_name_lookup = clean_name(name)
    adm1_name_lookup2 = multiple_replace(adm1_name_lookup, self.admin1_name_replacements)
    pcode = name_to_pcode.get(adm1_name_lookup, name_to_pcode.get(adm1_name_lookup2))
    if not pcode:
        # Substring pass. Both lookups are always tried, so a hit on the
        # second one overrides a hit on the first.
        for lookup in (adm1_name_lookup, adm1_name_lookup2):
            for map_name in name_to_pcode:
                if lookup in map_name:
                    pcode = name_to_pcode[map_name]
                    if log:
                        self.matches.add((scrapername, countryiso3, name,
                                          self.pcode_to_name[pcode], 'substring'))
                    break
    if not pcode:
        map_names = list(name_to_pcode.keys())
        lower_mapnames = [x.lower() for x in map_names]

        def al_transform_1(name):
            # Variant of a known name with a leading 'al ' swapped for 'ad '.
            if name[:3] == 'al ':
                return 'ad %s' % name[3:]
            return None

        def al_transform_2(name):
            # Variant of a known name with a leading 'al ' removed.
            if name[:3] == 'al ':
                return name[3:]
            return None

        matching_index = self.phonetics.match(
            lower_mapnames, adm1_name_lookup,
            alternative_name=adm1_name_lookup2,
            transform_possible_names=[al_transform_1, al_transform_2])
        if matching_index is None:
            if log:
                self.errors.add((scrapername, countryiso3, name))
            return None

        map_name = map_names[matching_index]
        pcode = name_to_pcode[map_name]
        if log:
            self.matches.add((scrapername, countryiso3, name,
                              self.pcode_to_name[pcode], 'fuzzy'))
    return pcode
def fuzzy_pcode(self, countryiso3, name, scrapername=None):
    # type: (str, str, Optional[str]) -> Optional[str]
    """Fuzzy match name to pcode

    The normalised name is tried as an exact key, then as a substring of the
    known names, and finally (not on Python 2) by Refined Soundex distance.
    Outcomes are recorded in self.ignored, self.errors and self.matches, but
    only when a scrapername is supplied.

    Args:
        countryiso3 (str): Iso3 country code
        name (str): Name to match
        scrapername (Optional[str]): Name of scraper for logging purposes. Defaults to None (don't log).

    Returns:
        Optional[str]: Matched P code or None if no match
    """
    # Honour the documented contract: without a scraper name nothing is
    # recorded, so the sets never receive (None, ...) entries.
    log = scrapername is not None

    if self.countries_fuzzy_try is not None and countryiso3 not in self.countries_fuzzy_try:
        if log:
            self.ignored.add((scrapername, countryiso3))
        return None
    name_to_pcode = self.name_to_pcode.get(countryiso3)
    if not name_to_pcode:
        if log:
            self.errors.add((scrapername, countryiso3))
        return None
    if name.lower() in self.admin1_fuzzy_dont:
        if log:
            self.ignored.add((scrapername, countryiso3, name))
        return None

    # Replace accented characters with non accented ones: decompose, then
    # drop the combining marks (category 'Mn').
    adm1_name_lookup = ''.join(
        c for c in unicodedata.normalize('NFD', six.u(name))
        if unicodedata.category(c) != 'Mn')
    # NOTE(review): `ascii` is presumably a module-level regex matching
    # non-ASCII characters (it shadows the builtin) - confirm.
    adm1_name_lookup = re.sub(ascii, ' ', adm1_name_lookup)
    adm1_name_lookup = unidecode(adm1_name_lookup)
    adm1_name_lookup = adm1_name_lookup.strip().lower()
    adm1_name_lookup2 = multiple_replace(adm1_name_lookup, self.admin1_name_replacements)
    pcode = name_to_pcode.get(adm1_name_lookup, name_to_pcode.get(adm1_name_lookup2))
    if not pcode:
        # Substring pass. Both lookups are always tried, so a hit on the
        # second one overrides a hit on the first.
        for lookup in (adm1_name_lookup, adm1_name_lookup2):
            for map_name in name_to_pcode:
                if lookup in map_name:
                    pcode = name_to_pcode[map_name]
                    if log:
                        self.matches.add((scrapername, countryiso3, name,
                                          self.pcode_to_name[pcode], 'substring'))
                    break
    if not pcode:
        # NOTE(review): phonetic matching is skipped on Python 2, presumably
        # because pyphonetics does not support it - confirm.
        if sys.version_info[0] == 2:
            if log:
                self.errors.add((scrapername, countryiso3, name))
            return None
        map_names = list(name_to_pcode.keys())
        lower_mapnames = [x.lower() for x in map_names]
        rs = pyphonetics.RefinedSoundex()
        mindistance = None
        match = None

        def check_name(mindistance, match, lookup, mapname, index):
            # State is passed in and returned (no `nonlocal`), keeping the
            # function usable on Python 2. Strict '<' means the earliest
            # candidate wins ties.
            distance = rs.distance(lookup, mapname)
            if mindistance is None or distance < mindistance:
                mindistance = distance
                match = index
            return mindistance, match

        for i, mapname in enumerate(lower_mapnames):
            mindistance, match = check_name(mindistance, match, adm1_name_lookup, mapname, i)
        for i, mapname in enumerate(lower_mapnames):
            if mapname[:3] == 'al ':
                # Also try the name with 'al ' swapped for 'ad ' and removed.
                mindistance, match = check_name(mindistance, match, adm1_name_lookup,
                                                'ad %s' % mapname[3:], i)
                mindistance, match = check_name(mindistance, match, adm1_name_lookup,
                                                mapname[3:], i)
            mindistance, match = check_name(mindistance, match, adm1_name_lookup2, mapname, i)

        if mindistance is None or mindistance > match_threshold:
            if log:
                self.errors.add((scrapername, countryiso3, name))
            return None

        map_name = map_names[match]
        pcode = name_to_pcode[map_name]
        if log:
            self.matches.add((scrapername, countryiso3, name,
                              self.pcode_to_name[pcode], 'fuzzy'))
    return pcode