Пример #1
0
def map_planname(origname):
    """Return a simplified version of a plan name and log the outcome.

    Names containing 'Refugee' are reduced to a '... Regional' label:

    * if the word 'Regional' is present, everything up to and including it;
    * otherwise, if the word 'from' is present, the words following it (up to
      the first word containing '(') followed by 'Regional';
    * otherwise, if the word 'Refugee' is present, the words preceding it
      followed by 'Regional'.

    Any other name (including one where 'Refugee' only occurs inside a longer
    word such as 'Refugees') has text in round or square brackets removed,
    is passed through multiple_replace with 'Intersectoral', 'Response',
    'Plan' and 'Joint' mapped to '', and has its whitespace collapsed.

    Args:
        origname (str): Original plan name

    Returns:
        str: Simplified plan name (may equal origname)
    """
    name = None
    if 'Refugee' in origname:
        words = origname.split(' ')
        if 'Regional' in words:
            name = ' '.join(words[:words.index('Regional') + 1])
        elif 'from' in words:
            newwords = []
            for word in words[words.index('from') + 1:]:
                if '(' in word:
                    break
                newwords.append(word)
            name = '%s Regional' % ' '.join(newwords)
        elif 'Refugee' in words:
            name = '%s Regional' % ' '.join(words[:words.index('Refugee')])
        # Otherwise 'Refugee' is only a substring of a longer word (e.g.
        # 'Refugees'). Looking it up with words.index() would raise an
        # uncaught ValueError, so leave name unset and let the generic
        # simplification below handle it.
    if not name:
        # Raw string: the pattern contains backslash escapes that are not
        # valid Python string escapes.
        name = re.sub(r'[\(\[].*?[\)\]]', '', origname)
        name = multiple_replace(name, {
            'Intersectoral': '',
            'Response': '',
            'Plan': '',
            'Joint': ''
        })
        # Collapse runs of whitespace left behind by the removals
        name = ' '.join(name.split())
    if origname == name:
        logger.info('Plan name %s not simplified', name)
    else:
        logger.info('Plan name %s simplified from %s', name, origname)
    return name
Пример #2
0
 def test_multiple_replace(self):
     """Compare multiple_replace's output for self.a with the expected sentence."""
     replacements = {"quick": "slow", "fast": "slow", "lazy": "busy"}
     expected = (
         "The slow brown fox jumped over the busy dog. It was so slow!")
     actual = multiple_replace(self.a, replacements)
     assert (actual == expected)
Пример #3
0
    def fuzzy_pcode(self, countryiso3, adm1_name, scrapername=None):
        """Match an admin 1 name to a pcode for the given country.

        Tries, in order: an exact lookup of the normalised name (or of the
        normalised name after self.adm1_name_replacements has been applied),
        a substring match against the known names, and finally a
        RefinedSoundex distance match accepted only when the best distance
        does not exceed the module-level match_threshold.

        Outcomes are recorded as tuples starting with scrapername in
        self.ignored, self.errors and self.matches.

        Args:
            countryiso3: Country code used as key into self.name_to_pcode
            adm1_name: Name to match
            scrapername: Value stored as the first element of every recorded
                tuple. Defaults to None.

        Returns:
            The matched pcode, or None if the country/name is ignored or no
            match is found.
        """
        if countryiso3 in self.iso3s_no_pcodes:
            self.ignored.add((scrapername, countryiso3))
            return None
        pcodes_by_name = self.name_to_pcode.get(countryiso3)
        if not pcodes_by_name:
            self.errors.add((scrapername, countryiso3))
            return None
        if adm1_name.lower() in self.adm1_fuzzy_ignore:
            self.ignored.add((scrapername, countryiso3, adm1_name))
            return None

        # Drop combining marks so accented characters become unaccented ones
        decomposed = unicodedata.normalize('NFD', adm1_name)
        unaccented = ''.join(
            [c for c in decomposed if unicodedata.category(c) != 'Mn'])
        # Blank out whatever the module-level `ascii` pattern matches
        # (the original comment describes this as removing non-ASCII
        # characters), then transliterate, trim and lowercase.
        cleaned = unidecode(re.sub(ascii, ' ', unaccented)).strip().lower()
        replaced = multiple_replace(cleaned, self.adm1_name_replacements)

        # Exact lookup: the cleaned name wins whenever it is a key
        if cleaned in pcodes_by_name:
            pcode = pcodes_by_name[cleaned]
        else:
            pcode = pcodes_by_name.get(replaced)

        if not pcode:
            # Substring pass, first for the cleaned name and then for the
            # replaced one. The second pass runs even if the first matched,
            # so its result takes precedence when both find something.
            for lookup in (cleaned, replaced):
                for map_name in pcodes_by_name:
                    if lookup in map_name:
                        pcode = pcodes_by_name[map_name]
                        self.matches.add((scrapername, countryiso3, adm1_name,
                                          self.pcode_to_name[pcode],
                                          'substring'))
                        break
        if pcode:
            return pcode

        map_names = list(pcodes_by_name.keys())
        lowered_names = [x.lower() for x in map_names]
        soundex = pyphonetics.RefinedSoundex()

        def candidates():
            # Yields (lookup, name to compare against, index into map_names)
            # in the order the comparisons must be made; earlier candidates
            # win ties because only a strictly smaller distance replaces
            # the current best.
            for i, mapname in enumerate(lowered_names):
                yield cleaned, mapname, i
            for i, mapname in enumerate(lowered_names):
                if mapname[:3] == 'al ':
                    yield cleaned, 'ad %s' % mapname[3:], i
                    yield cleaned, mapname[3:], i
                yield replaced, mapname, i

        best_distance = None
        best_index = None
        for lookup, mapname, i in candidates():
            distance = soundex.distance(lookup, mapname)
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_index = i

        if best_distance is None or best_distance > match_threshold:
            self.errors.add((scrapername, countryiso3, adm1_name))
            return None

        pcode = pcodes_by_name[map_names[best_index]]
        self.matches.add((scrapername, countryiso3, adm1_name,
                          self.pcode_to_name[pcode], 'fuzzy'))
        return pcode
Пример #4
0
    def fuzzy_pcode(self, countryiso3, name, scrapername=None):
        # type: (str, str, Optional[str]) ->  Optional[str]
        """Fuzzy match a name to a pcode for the given country.

        Tries an exact lookup of the cleaned name (or of the cleaned name
        after self.admin1_name_replacements has been applied), then a
        substring match against the known names, then delegates to
        self.phonetics.match. Outcomes are recorded as tuples in
        self.ignored, self.errors and self.matches.

        Args:
            countryiso3 (str): Iso3 country code
            name (str): Name to match
            scrapername (Optional[str]): Stored as the first element of every recorded tuple. Defaults to None.

        Returns:
            Optional[str]: Matched P code or None if no match
        """
        if self.countries_fuzzy_try is not None and countryiso3 not in self.countries_fuzzy_try:
            self.ignored.add((scrapername, countryiso3))
            return None
        pcodes_by_name = self.name_to_pcode.get(countryiso3)
        if not pcodes_by_name:
            self.errors.add((scrapername, countryiso3))
            return None
        if name.lower() in self.admin1_fuzzy_dont:
            self.ignored.add((scrapername, countryiso3, name))
            return None

        cleaned = clean_name(name)
        replaced = multiple_replace(cleaned, self.admin1_name_replacements)

        # Exact lookup: the cleaned name wins whenever it is a key
        if cleaned in pcodes_by_name:
            pcode = pcodes_by_name[cleaned]
        else:
            pcode = pcodes_by_name.get(replaced)

        if not pcode:
            # Substring pass, first for the cleaned name and then for the
            # replaced one. The second pass runs even if the first matched,
            # so its result takes precedence when both find something.
            for lookup in (cleaned, replaced):
                for map_name in pcodes_by_name:
                    if lookup in map_name:
                        pcode = pcodes_by_name[map_name]
                        self.matches.add((scrapername, countryiso3, name,
                                          self.pcode_to_name[pcode],
                                          'substring'))
                        break
        if pcode:
            return pcode

        map_names = list(pcodes_by_name.keys())
        lowered_names = [x.lower() for x in map_names]

        def swap_al_for_ad(name):
            # 'al xyz' -> 'ad xyz'; None when there is no 'al ' prefix
            return 'ad %s' % name[3:] if name[:3] == 'al ' else None

        def drop_al(name):
            # 'al xyz' -> 'xyz'; None when there is no 'al ' prefix
            return name[3:] if name[:3] == 'al ' else None

        matching_index = self.phonetics.match(
            lowered_names,
            cleaned,
            alternative_name=replaced,
            transform_possible_names=[swap_al_for_ad, drop_al])

        if matching_index is None:
            self.errors.add((scrapername, countryiso3, name))
            return None

        pcode = pcodes_by_name[map_names[matching_index]]
        self.matches.add((scrapername, countryiso3, name,
                          self.pcode_to_name[pcode], 'fuzzy'))
        return pcode
Пример #5
0
    def fuzzy_pcode(self, countryiso3, name, scrapername=None):
        # type: (str, str, Optional[str]) ->  Optional[str]
        """Fuzzy match a name to a pcode for the given country.

        Tries an exact lookup of the normalised name (or of the normalised
        name after self.admin1_name_replacements has been applied), then a
        substring match against the known names, then a RefinedSoundex
        distance match accepted only when the best distance does not exceed
        the module-level match_threshold. The phonetic step is skipped on
        Python 2, where an unmatched name is recorded as an error instead.
        Outcomes are recorded as tuples in self.ignored, self.errors and
        self.matches.

        Args:
            countryiso3 (str): Iso3 country code
            name (str): Name to match
            scrapername (Optional[str]): Stored as the first element of every recorded tuple. Defaults to None.

        Returns:
            Optional[str]: Matched P code or None if no match
        """
        if self.countries_fuzzy_try is not None and countryiso3 not in self.countries_fuzzy_try:
            self.ignored.add((scrapername, countryiso3))
            return None
        pcodes_by_name = self.name_to_pcode.get(countryiso3)
        if not pcodes_by_name:
            self.errors.add((scrapername, countryiso3))
            return None
        if name.lower() in self.admin1_fuzzy_dont:
            self.ignored.add((scrapername, countryiso3, name))
            return None

        # Drop combining marks so accented characters become unaccented ones
        decomposed = unicodedata.normalize('NFD', six.u(name))
        unaccented = ''.join(
            [c for c in decomposed if unicodedata.category(c) != 'Mn'])
        # Blank out whatever the module-level `ascii` pattern matches
        # (the original comment describes this as removing non-ASCII
        # characters), then transliterate, trim and lowercase.
        cleaned = unidecode(re.sub(ascii, ' ', unaccented)).strip().lower()
        replaced = multiple_replace(cleaned, self.admin1_name_replacements)

        # Exact lookup: the cleaned name wins whenever it is a key
        if cleaned in pcodes_by_name:
            pcode = pcodes_by_name[cleaned]
        else:
            pcode = pcodes_by_name.get(replaced)

        if not pcode:
            # Substring pass, first for the cleaned name and then for the
            # replaced one. The second pass runs even if the first matched,
            # so its result takes precedence when both find something.
            for lookup in (cleaned, replaced):
                for map_name in pcodes_by_name:
                    if lookup in map_name:
                        pcode = pcodes_by_name[map_name]
                        self.matches.add((scrapername, countryiso3, name,
                                          self.pcode_to_name[pcode],
                                          'substring'))
                        break
        if pcode:
            return pcode

        # No phonetic matching on Python 2: report the name as unmatched
        if sys.version_info[0] == 2:
            self.errors.add((scrapername, countryiso3, name))
            return None

        map_names = list(pcodes_by_name.keys())
        lowered_names = [x.lower() for x in map_names]
        soundex = pyphonetics.RefinedSoundex()

        def candidates():
            # Yields (lookup, name to compare against, index into map_names)
            # in the order the comparisons must be made; earlier candidates
            # win ties because only a strictly smaller distance replaces
            # the current best.
            for i, mapname in enumerate(lowered_names):
                yield cleaned, mapname, i
            for i, mapname in enumerate(lowered_names):
                if mapname[:3] == 'al ':
                    yield cleaned, 'ad %s' % mapname[3:], i
                    yield cleaned, mapname[3:], i
                yield replaced, mapname, i

        best_distance = None
        best_index = None
        for lookup, mapname, i in candidates():
            distance = soundex.distance(lookup, mapname)
            if best_distance is None or distance < best_distance:
                best_distance = distance
                best_index = i

        if best_distance is None or best_distance > match_threshold:
            self.errors.add((scrapername, countryiso3, name))
            return None

        pcode = pcodes_by_name[map_names[best_index]]
        self.matches.add((scrapername, countryiso3, name,
                          self.pcode_to_name[pcode], 'fuzzy'))
        return pcode