def geocode_url(self, url, attempted=None):
    """Resolve a page URL to a ``(name, (latitude, longitude))`` tuple.

    The page at *url* is fetched and parsed, its RDF link is extracted with
    ``self.parse_rdf_link`` and the RDF document is fetched in turn.  Each
    attribute value of the described thing is passed to ``util.parse_geo``
    until one yields both a latitude and a longitude.  If none does, the
    thing's relations are followed recursively until one of them produces a
    complete result.

    :param url: URL of the page to geocode.
    :param attempted: set of URLs already visited during this lookup; used
        to avoid following cyclic relationships.  A fresh set is created
        when omitted.  The set is mutated in place.
    :returns: ``(name, (latitude, longitude))``; the coordinates are
        ``None`` when no position could be determined.
    """
    if attempted is None:
        attempted = set()
    # Record the page being resolved so a relation pointing back at it is
    # skipped instead of being fetched a second time.
    attempted.add(url)

    util.logger.debug("Fetching %s..." % url)
    page = urlopen(url)
    soup = BeautifulSoup(page)

    rdf_url = self.parse_rdf_link(soup)
    util.logger.debug("Fetching %s..." % rdf_url)
    page = urlopen(rdf_url)

    things, thing = self.parse_rdf(page)
    name = self.get_label(thing)

    # Start from "unknown" so a thing with no attributes falls through to
    # the relation search instead of raising UnboundLocalError.
    latitude = longitude = None
    for attribute, value in self.get_attributes(thing):
        latitude, longitude = util.parse_geo(value)
        if None not in (latitude, longitude):
            break

    if None in (latitude, longitude):
        for relation, resource in self.get_relations(thing):
            related_url = things.get(resource, resource)
            if related_url in attempted:
                # Avoid cyclic relationships.
                continue
            attempted.add(related_url)
            name, (latitude, longitude) = self.geocode_url(
                related_url, attempted
            )
            if None not in (name, latitude, longitude):
                break

    return (name, (latitude, longitude))
def parse_xhtml(self, page):
    """Extract a place name and position from a page's ``geo.*`` meta tags.

    *page* may be an already-parsed ``BeautifulSoup`` object or anything
    the ``BeautifulSoup`` constructor accepts.  The name is read from the
    ``geo.placename`` meta tag in the document head and the position from
    the ``geo.position`` meta tag, parsed with ``util.parse_geo``.

    :returns: ``(name, (latitude, longitude))``.  ``name`` is ``None`` when
        the tag is missing or its content is empty; both coordinates are
        ``None`` when the position tag is missing or either parsed
        coordinate equals zero.
    """
    # Reuse an existing parse tree; otherwise build one from the raw page.
    if isinstance(page, BeautifulSoup) and page:
        document = page
    else:
        document = BeautifulSoup(page)

    name = None
    name_tag = document.head.find('meta', {'name': 'geo.placename'})
    if name_tag:
        # An empty content attribute is reported as "no name".
        name = name_tag['content'] or None

    position_tag = document.head.find('meta', {'name': 'geo.position'})
    if not position_tag:
        return (name, (None, None))

    latitude, longitude = util.parse_geo(position_tag['content'])
    if latitude == 0 or longitude == 0:
        # A zero in either coordinate is treated as "no position".
        return (name, (None, None))

    return (name, (latitude, longitude))