예제 #1
0
    def best_guess(self, city=None, admin1=None, country=None,
                   res_level=RESOLUTION_LEVEL.city):
        """Geocode a (city, admin1, country) tuple at the finest level
           that the supplied names allow.

        :param city: city name
        :param admin1: admin1, state or province name
        :param country: country name (known aliases are mapped to the
                        canonical country name first)
        :param res_level: resolution level handed on to best_guess_city
        :returns: ([canonical LT], [alias LT])
        city/admin/country level geocoded tuple with lat-lon
        LT = (city, country, admin1,
              admin2, admin3, pop, lat, lon, id, pad)
        NOTE: lat-lon corresponds to that of city/capital
              of admin1 or country
        """
        # falsy inputs (None, "") are kept as they are, everything else
        # is normalized
        ci = nstr(city) if city else city
        a1 = nstr(admin1) if admin1 else admin1
        co = nstr(country) if country else country

        # map a country alias onto its canonical name
        if co and co in self.co_aliases:
            co = nstr(self.co_aliases[co])

        # ref: hack#1 in features list
        # admin1 and city share a name: treat the admin1 as the city
        if a1 and co and not ci and (co, a1) in self.same_ci_a1_name:
            ci = nstr(self.same_ci_a1_name[(co, a1)])

        if ci:
            return self.best_guess_city(ci, co, a1, res_level)
        if not co:
            return ([], [])
        if a1:
            return self.best_guess_admin1(a1, co)
        if co in self.co_names:
            return self.best_guess_country(co)
        return ([], [])
예제 #2
0
    def best_guess_city(self, city, country=None, admin1=None,
                        res_level=RESOLUTION_LEVEL.city):
        '''Resolve a city through canonical names first, then aliases.

        :param city: city name
        :param country (optional): country name
        :param admin1 (optional): admin1 name
        :param res_level (optional): resolution level; when it is greater
                  than RESOLUTION_LEVEL.city and nothing is found at city
                  level, the lookup falls back to admin1 level (country
                  and admin1 given) or country level (country only)
        :returns: ([canonical LT], [alias LT])
                  city/admin/country level geocoded tuple with lat-lon
                  LT = (city, country, admin1,
                        admin2, admin3, pop, lat, lon, id, pad)
                  Both lists are sorted by population, largest first.
                  When only the alias lookup matched the given admin1,
                  the alias results are returned in the first slot and
                  the second list is empty.
        '''
        ci = nstr(city.strip())
        co = nstr(country.strip()) if country else None
        a1 = nstr(admin1.strip()) if admin1 else None

        canonical_name_bgs, canonical_a_match = self._best_guess_city(
            ci, co, a1)
        # only used for membership tests on hashable records, so a shallow
        # copy is sufficient
        seen_bgs = set(canonical_name_bgs)
        alias_name_bgs = set()
        alias_a_match = False

        alias_key = (ci, co)
        if alias_key in self.ci_aliases:
            by_admin1 = self.ci_aliases[alias_key]
            # (canonical city name, admin1) pairs the alias may stand for
            targets = set()
            if a1 and a1 in by_admin1:
                alias_a_match = True
                for o_ci in by_admin1[a1]:
                    targets.add((nstr(o_ci), a1))
            else:
                if self.debug and a1:
                    self.log.info("Alias key (%s) exists but not admin1 %s" %
                                  (list(alias_key), a1))
                for adm1, o_cities in by_admin1.items():
                    for o_ci in o_cities:
                        targets.add((nstr(o_ci), adm1))

            # dedicated loop names: ci/co must keep their values because
            # co is still needed for the fallback below
            for alias_ci, alias_a1 in targets:
                alias_bgs, _ = self._best_guess_city(alias_ci, co, alias_a1)
                for bg in alias_bgs:
                    if bg not in seen_bgs:
                        seen_bgs.add(bg)
                        alias_name_bgs.add(bg)

        def by_population(record):
            # None-safe sort key: records without a population sort below
            # every numeric population (the ordering Python 2 gave for
            # None) instead of raising TypeError on Python 3
            pop = record[5]
            return (pop is not None, pop or 0)

        # sort by specific priority
        # TODO sort by tweet volume
        alias_name_bgs = sorted(alias_name_bgs, key=by_population,
                                reverse=True)
        canonical_name_bgs = sorted(canonical_name_bgs, key=by_population,
                                    reverse=True)

        # if no results returned try resolving on admin1 level first
        # if that is not possible do country level
        nothing_found = not canonical_name_bgs and not alias_name_bgs
        if nothing_found and res_level > RESOLUTION_LEVEL.city:
            if co and a1:
                return self.best_guess_admin1(a1, co)
            elif co:
                return self.best_guess_country(co)

        if alias_a_match and not canonical_a_match:
            return (alias_name_bgs, [])
        return (canonical_name_bgs, alias_name_bgs)
예제 #3
0
    def match_for_code(self, loc_str, city=None, admin1=None, country=None):
        """Pull country and admin1 out of a location string by matching
           their codes, which are then mapped to full names. The input is
           typically the location string of a twitter user's profile.

        :param loc_str: location string, but could be any string containing
                        geocodable information
        :param city: name of city (returned unchanged)
        :param admin1: name of admin1; looked up only when not given
        :param country: name of country; looked up only when not given
        :returns: locstr, city, admin1, country
        NOTE: loc string returned is substring after removal
              of matched loc entities; it is None for empty input
        """
        if not loc_str:
            return None, city, admin1, country

        # punctuation clean-up; replacement order matters
        loc = loc_str
        for old, new in (('D. F.', 'DF'), ('.', ''), ('-', ' '),
                         (', ', ' '), (',', ' '), ('#', ' ')):
            loc = loc.replace(old, new)

        if not country:
            co_code = self.co_code_reg.search(loc)
            if co_code and co_code in self.co_code:
                loc = loc.replace(co_code, " ", 1)
                country = self.co_code[co_code]

        if admin1:
            return loc, city, admin1, country

        # a known country allows the full admin code set, otherwise the
        # stop-word filtered one is used
        if country:
            admin1_code = self.admin1_code_reg1.search(loc)
        else:
            admin1_code = self.admin1_code_reg2.search(loc)
        if not (admin1_code and admin1_code in self.admin_code):
            return loc, city, admin1, country

        # keep only (country, admin1) pairs known to the admin name store
        candidates = [(nstr(c), a)
                      for c, a in self.admin_code[admin1_code]
                      if nstr(a) in self.admin_name and
                      nstr(c) in self.admin_name[nstr(a)]]
        if candidates:
            if country:
                wanted = nstr(country)
                candidates = [pair for pair in candidates
                              if pair[0] == wanted]
            elif len(candidates) != 1:
                # to be safe if country is not known
                # dont make guesses
                candidates = []
            if candidates:
                matched_co, admin1 = candidates[0]
                if not country:
                    country = matched_co

        # the code is stripped from loc even when no admin1 was resolved
        loc = loc.replace(admin1_code, " ", 1) or loc
        return loc, city, admin1, country
예제 #4
0
    def get_all_cities(self, country, admin1):
        '''List the cities stored for a country & admin1 pair

        :param country: country name
        :param admin1: admin1 name
        :returns: list of city names (unordered, without duplicates)
        '''
        co = nstr(country)
        a = nstr(admin1)
        if not (co and a):
            return []
        # the first index stored for a city/country/admin1 combination
        # points at the record whose name is reported
        return list({self.data[by_country[co][a][0]][1][0]
                     for by_country in self.bguess.values()
                     if co in by_country and a in by_country[co]})
예제 #5
0
    def match_for_names(self, loc_str, city=None, admin1=None, country=None):
        """Pull city, admin1 and country names out of a string, which is
           typically the location string of a twitter user's profile.
           Matching runs twice: on the normalized string, and again once
           punctuation has been removed from what is left.

        :param loc_str: location string, but could be any string containing
                        geocodable information
        :param city: name of city
        :param admin1: name of admin1
        :param country: name of country
        :returns: locstr, city, admin1, country
        NOTE: loc string returned is substring after removal
              of matched loc entities; it is None for empty input
        """
        if not loc_str:
            return None, city, admin1, country

        # first pass on the normalized string
        loc, city, admin1, country = self._match_for_names(
            nstr(loc_str), city, admin1, country)

        # second pass after punctuation clean-up
        for old, new in (('.', ''), ('-', ' '), (', ', ' '), (',', ' '),
                         ('#', ' ')):
            loc = loc.replace(old, new)
        loc, city, admin1, country = self._match_for_names(
            loc, city, admin1, country)

        return loc, city, admin1, country
예제 #6
0
 def best_guess_admin1(self, admin1, country):
     '''Geocode an admin1 & country pair

     :param admin1: admin1 name
     :param country: country name
     :returns: ([LT], [])
               admin1 level geocoded tuple with lat-lon
               LT = (city, country, admin1,
                     admin2, admin3, pop, lat, lon, id, pad)
     NOTE: lat-lon corresponds to avg. of lat, lon points from each
           record in world-gazetteer for given country and admin pair
     '''
     a1 = nstr(admin1)
     co = nstr(country)
     if a1 not in self.admin_name or co not in self.admin_name[a1]:
         # unknown pair: fall back to country level geocoding
         return self.best_guess_country(co)

     rid, lat, lon, _code, admin_name, co_name = self.admin_name[a1][co]
     located = (None, co_name, admin_name, None, None,
                None, lat, lon, rid, 0)
     return ([located], [])
예제 #7
0
 def best_guess_country(self, country):
     '''Geocode a country

     :param country: country name
     :returns: ([LT], [])
               country level geocoded tuple with lat-lon; both lists are
               empty when the country is missing or unknown
               LT = (city, country, admin1,
                     admin2, admin3, pop, lat, lon, id, pad)
     NOTE: lat-lon corresponds to avg. of lat, lon points from each
           record in world-gazetteer for given country
     '''
     if not country:
         return ([], [])
     co = nstr(country)
     if not co or co not in self.co_names:
         return ([], [])

     rid, lat, lon, _code, co_name = self.co_names[co]
     located = (None, co_name, None, None, None,
                None, lat, lon, rid, 0)
     return ([located], [])
예제 #8
0
    def normalize_places(places):
        """Read city, admin1 and country out of the places json object
           of a tweet

        :param places: places object; 'place_type', 'name', 'full_name'
                       and 'country' are the keys consulted
        :returns: (city, admin1, country), None for whatever is missing
        """
        city = admin1 = country = None
        if not places:
            return city, admin1, country

        kind = places['place_type'] if 'place_type' in places else None
        if kind == 'admin' and 'name' in places:
            admin1 = nstr(places['name'].strip())
        elif kind in ('city', 'poi', 'neighborhood'):
            is_city = kind == 'city'
            if is_city and 'name' in places:
                city = nstr(places['name'].strip())
            if 'full_name' in places:
                pieces = places['full_name'].split(',')
                # leading piece is the city fallback, used for city
                # places that carry no usable 'name'
                head = pieces[0].strip()
                if is_city and city is None and head:
                    city = nstr(head)
                # trailing piece is the admin1, only when there is
                # more than one piece
                if len(pieces) > 1:
                    tail = pieces[-1].strip()
                    if admin1 is None and tail:
                        admin1 = nstr(tail)

        if 'country' in places:
            country = nstr(places['country'].strip())
            # fix for twitter's venezuela long name
            if "venezuela" in country:
                country = "venezuela"

        return city, admin1, country
예제 #9
0
    def _best_guess_city(self, city, country=None, admin1=None):
        '''Resolve a city against canonical city names only (no aliases)

        :param city: city name
        :param country (optional): country name
        :param admin1 (optional): admin1 name
        :returns: (set of canonical LT, canonical_admin_match)
                  city/admin/country level geocoded tuple with lat-lon
                  LT = (city, country, admin1,
                        admin2, admin3, pop, lat, lon, id, pad)
                  canonical_admin_match : True when the given admin1 was
                  found under the given country, or under the only
                  country the city is stored for
        '''
        ci = nstr(city.strip())
        co = nstr(country.strip()) if country else None
        a1 = nstr(admin1.strip()) if admin1 else None

        canonical_name_bgs = set()
        canonical_a_match = False

        if ci not in self.bguess:
            return canonical_name_bgs, canonical_a_match
        by_country = self.bguess[ci]

        if co and co in by_country:
            by_admin1 = by_country[co]
            if a1:
                # TODO add log in else
                if a1 in by_admin1:
                    canonical_a_match = True
                    canonical_name_bgs.add(self.data[by_admin1[a1][0]][1])
            else:
                # no admin1 given: one record per admin1 of the pair
                canonical_name_bgs.update(
                    self.data[indices[0]][1]
                    for indices in by_admin1.values())
        elif len(by_country) == 1:
            # pick if only one possible ci-co pair is present
            # (next(iter(...)) works on Python 2 and 3, unlike keys()[0])
            co1 = next(iter(by_country))
            by_admin1 = by_country[co1]
            if a1:  # if admin is provided
                # make sure its present in picked country
                if a1 in by_admin1:
                    canonical_a_match = True
                    canonical_name_bgs.add(self.data[by_admin1[a1][0]][1])
            elif len(by_admin1) == 1:
                canonical_name_bgs.update(
                    self.data[indices[0]][1]
                    for indices in by_admin1.values())
            else:  # do country level geocoding
                bg_co = self.best_guess_country(co1)[0]
                if bg_co:
                    canonical_name_bgs.add(bg_co[0])
                elif self.debug:
                    # self.log is only created in debug mode
                    self.log.info(
                        "Country not found CO:{0}".format(co1))
        elif a1:
            # more than one country stores this city: accept it only if
            # the admin1 singles out exactly one of them
            possible_co = [name for name, by_admin1 in by_country.items()
                           if a1 in by_admin1]
            if len(possible_co) == 1:
                co1 = possible_co[0]
                # NOTE(review): canonical_a_match stays False here even
                # though admin1 matched; behavior kept as is - confirm
                # whether that is intended
                canonical_name_bgs.add(
                    self.data[by_country[co1][a1][0]][1])

        return canonical_name_bgs, canonical_a_match
예제 #10
0
    def __init__(self, wg_data=WG_DATA, co_admin_data=CO_ADMIN_DATA,
                 priority_policy=PRIORITY_POLICY,
                 debug=False):
        """Load the geo data and build every lookup structure used for
           geocoding.

        :param wg_data: handed to get_wg_data() to open the city level
                        (world-gazetteer) data
        :param co_admin_data: handed to get_co_admin_data() to open the
                              country and admin1 level data
        :param priority_policy: stored on the instance; its items are
                                joined into the version string
        :param debug: when True logging is initialized and self.log is
                      created

        NOTE(review): self.log only exists when debug is True, so code
        that logs without checking self.debug first raises an
        AttributeError in non-debug mode.
        """
        self.priority_policy = priority_policy
        self.debug = debug
        # version string: class name, package version and md5 digests of
        # both data sources and of the priority policy
        # NOTE(review): the two objects opened here for hashing are never
        # closed explicitly; the md5 over a joined str looks Python 2
        # specific (Python 3 hashlib needs bytes) - confirm
        self.__version__ = "{0}-{1}-{2}-{3}-{4}".format(
            self.__class__.__name__,
            __version__,
            hashlib.md5(get_wg_data(wg_data).read()).hexdigest(),
            hashlib.md5(get_co_admin_data(co_admin_data).read()).hexdigest(),
            hashlib.md5(" ".join(self.priority_policy)).hexdigest())

        if self.debug:
            try:
                logs.init()
            except IOError:  # , err:
                # retry with a logfile named after the class
                logs.init(logfile=self.__class__.__name__.lower())

            self.log = logs.getLogger("{0}-{1}".format(
                                      self.__class__.__name__,
                                      __version__.replace('.', '_')))

        # 1. load country and admin1 level geo data
        f = get_co_admin_data(co_admin_data)
        # sniff the dialect from the first 10240 characters, then rewind
        dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect, fieldnames=CO_ADMIN_FIELDS)
        # NOTE:
        # Known conflicts b/w codes of countries and other admins
        # co Colombia ('Colombia', 'C\xc3\xb3rdoba')
        # cl Chile ('Colombia', 'Caldas')
        # ar Argentina ('Colombia', 'Arauca')
        # sv El Salvador ('El Salvador', 'San Vicente')

        # prep lookup dictionaries
        # key__value

        # countries
        # co_code: iso code -> country name
        self.co_code = {}
        # co_names: normalized name -> (id, lat, lon, code, name)
        self.co_names = {}
        # co_aliases: normalized alias -> country name
        self.co_aliases = {}
        # co_capital_cities: normalized capital -> (capital, country name)
        self.co_capital_cities = {}
        # admin1
        # admin_code: code -> list of (country, admin1) pairs
        self.admin_code = {}
        # admin_name: normalized admin1 -> normalized country ->
        #             (id, lat, lon, code, admin1, country)
        self.admin_name = {}
        # assumes countries appear first when reading data from
        # lac_co_admin TODO BAD!
        for r in reader:
            for k in r.keys():
                r[k] = r[k].strip()
            lat = float_or_none(r['latitude'])
            lon = float_or_none(r['longitude'])
            code = object_or_none(r['iso_3166_code'])
            rid = int_or_none(r["id"])
            if r['type'] == 'country':
                # country
                # rows without a code are not stored at all
                if code:
                    self.co_code[code] = r['name']
                    self.co_names[nstr(r['name'])] = (rid, lat, lon,
                                                      code, r['name'])
                    self.co_capital_cities[nstr(r['capital_city'])] =\
                        (r['capital_city'], r['name'])
                    aliases = r['alt_names'].split(',')
                    self.co_aliases.update({nstr(alias.strip()): r['name']
                                            for alias in aliases})
                else:
                    if self.debug:
                        self.log.error("Bad data country {0} Code {1}".format(
                                       r['name'], code))
            elif r['type'] == 'admin':
                # admin
                # full_name must hold exactly one comma ("admin, country"),
                # otherwise this unpacking raises
                admin, co = r['full_name'].split(',')
                admin, co = admin.strip(), co.strip()

                if code:
                    if code not in self.admin_code:
                        self.admin_code[code] = []
                    self.admin_code[code].append((co, admin))
                co1, a = nstr(co), nstr(admin)
                if a not in self.admin_name:
                    self.admin_name[a] = {}
                # first record seen for an admin1-country pair wins
                if co1 not in self.admin_name[a]:
                    self.admin_name[a][co1] = (rid, lat, lon, code, admin, co)

        f.close()

        # 2. load (world-gazeteer) city level geo data
        f = get_wg_data(wg_data)
        dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect, fieldnames=WG_FIELDS)
        # ci_aliases: (normalized alias, normalized country) ->
        #             normalized admin1 -> set of canonical city names
        self.ci_aliases = {}
        # main data store for geocoding
        # list of (index, record) where record is the 10-tuple built below
        self.data = []
        counter = 0
        # every normalized city name and alias, for the city regex
        ci_set = set()
        for r in reader:
            for k in r.keys():
                r[k] = r[k].strip()
            # get alias names for cities
            ci_names = [a.strip() for a in r['alt_names'].split(',')
                        if len(a.strip()) > 0]
            ci_names.extend([a.strip() for a in r['orig_names'].split(',')
                             if len(a.strip()) > 0])
            for ci in ci_names:
                k = (nstr(ci), nstr(r['country']))
                a1 = nstr(r['admin1'])
                if k not in self.ci_aliases:
                    self.ci_aliases[k] = {a1: set([r['name']])}
                elif a1 not in self.ci_aliases[k]:
                    self.ci_aliases[k][a1] = set([r['name']])
                else:
                    # Cases where different cities for same
                    # admin-country pair have the same alias
                    self.ci_aliases[k][a1].add(r['name'])
                # add ci name aliases into ci_set
                ci_set.add(nstr(ci))
            # store only cannonical cities names
            # NOTE(review): the parsed latitude/longitude are divided by
            # 100; if float_or_none can return None this division raises
            # - confirm the data always carries coordinates
            self.data.append((counter, (r['name'], r['country'],
                              r['admin1'],
                              object_or_none(r['admin2']),
                              object_or_none(r['admin3']),
                              int_or_none(r['pop']),
                              float_or_none(r['latitude']) / 100,
                              float_or_none(r['longitude']) / 100,
                              int(r['id']), int(r['padded']))))
            counter += 1

        # coordinates: (lat, lon) -> index into self.data; a later record
        # with identical coordinates overwrites an earlier one
        self.coordinates = {}
        # cases where admin1 and city share the same name
        # extended feature/hack #1 to resolve city when
        # only country and admin1 are specified
        self.same_ci_a1_name = {}
        for i, (n, c, a1, a2, a3, p, lat, lon, i_d, pad) in self.data:
            nn, nc, na1 = nstr(n),  nstr(c), nstr(a1)
            self.coordinates[(lat, lon)] = i
            if nn == na1 and pad == 0:
                self.same_ci_a1_name[(nc, na1)] = n
            ci_set.add(nn)

        # store (lat, lon)
        self.kdtree = KDTree([[i, j] for i, j in self.coordinates.keys()
                              if i is not None and j is not None])
        # build regular expr dicts
        co_set = set(self.co_names.keys())
        # add country name aliases into co_set
        co_set.update(self.co_aliases.keys())
        self.co_reg = ManyRE(co_set)
        self.ci_reg = ManyRE(ci_set)
        # add admin1 name aliases into admin1_set
        admin1_set = set(self.admin_name.keys())
        # build regular expression stores for co-admin1-ci
        self.admin1_reg = ManyRE(admin1_set)
        # add stopwords to prevent any 2-letter word in common usage
        # to be mis-interpretted as country or admin code
        two_letter_stop_words = set(
            ['BE', 'WE', '\xc3\xa0', 'YO', 'DO', 'YA', 'DE', 'DA', 'HA', 'BY',
             'HE', 'AL', 'NI', 'LE', 'NO', 'LO', 'TU', 'TO', 'TI', 'TE', 'EM',
             'EL', 'EN', 'IS', 'OS', 'AM', 'IT', 'AO', 'AN', 'AS', 'AT', 'IN',
             'EU', 'ES', 'IF', 'ME', 'ON', 'OF', 'LA', 'MI', 'UP', 'SU', 'UM',
             'UN', 'SO', 'NA', 'OU', 'MY', 'OR', 'SE', 'US'])

        # country codes: stop words always excluded
        self.co_code_reg = ManyRE([sw for sw in self.co_code.keys()
                                  if sw not in two_letter_stop_words])
        # admin1 codes: reg1 holds every code, reg2 excludes stop words
        self.admin1_code_reg1 = ManyRE(self.admin_code.keys())
        self.admin1_code_reg2 = ManyRE([sw for sw in self.admin_code.keys()
                                       if sw not in two_letter_stop_words])

        # bguess: normalized city -> normalized country ->
        #         normalized admin1 -> deque of indices into self.data
        self.bguess = {}
        for i, (city, country, admin1, a2, a3, p, la, lo, i_d, pad)\
                in self.data:

            ci, co, a = nstr(city), nstr(country), nstr(admin1)
            # value is list of admin1's that correspond to ci-co key
            # ci-co makes dictionary flatter
            # choose not to use co-admin1-ci as key to add more flexibility
            # for lookups
            if ci in self.bguess:
                if co in self.bguess[ci]:
                    if a in self.bguess[ci][co]:
                        # store original wg-records marked with pad = 0
                        # to head of the queue
                        if pad == 0:
                            self.bguess[ci][co][a].appendleft(i)
                        else:
                            self.bguess[ci][co][a].append(i)
                    else:
                        self.bguess[ci][co][a] = deque([i])
                else:
                    self.bguess[ci][co] = {a: deque([i])}
            else:
                self.bguess[ci] = {co: {a: deque([i])}}