def run():
    """Scrape brnonow.com: harvest all article URLs from the paginated
    listing, then extract Google Maps links from each article, geocode
    them, and persist articles, places and their relations."""
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://brnonow.com'})

    # walk the paginated listing, collecting article links until a page is empty
    page_url = 'http://brnonow.com/page/%s'
    article_urls = set()
    for page in range(1000):
        log('page', page)
        try:
            anchors = Downloader(page_url, page).html().findAll('a', {'rel': 'bookmark'})
        except HTMLParseError:
            log('error', 'parsing failure, skipping')
            continue
        if not anchors:
            break
        article_urls.update(anchor['href'] for anchor in anchors)

    # fetch each article, pull out its Google Maps links, store everything
    for article_url in article_urls:
        log('article', article_url)
        try:
            soup = Downloader(article_url).html()
            map_links = soup.findAll(lambda tag: tag.name == 'a' and re.match(r'http://[^\.]+\.google\.[^/]+/maps', tag.get('href', '')))

            # article title & record
            title = unicode(decode_unicode_entities(soup.find('h1', 'entry-title').string))
            article_id = db.insert_update('article', {'title': title, 'url': article_url, 'source_id': source_id})

            # geocode every referenced place and relate it to the article
            for map_link in map_links:
                # query is either coords like 49.234553,16.567812 or a place name
                query = GoogleMaps().parse_link_url(map_link['href'])
                log('link', query)
                geocoded = Geocoder(query, resolve_coords=False).fetch()
                if not geocoded:
                    continue
                geocoded['name'] = None
                place_id = db.insert_update('place', geocoded)
                db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False)
            db.commit()
        except HTMLParseError:
            log('error', 'parsing failure, skipping')
        except AttributeError:
            log('error', 'attribute error, skipping')
class Tag(object):
    """Lazy-upserting lookup of tags and their categories.

    Beware! Tags are related to PLACES, not to ARTICLES.
    """

    # slug -> {'name': display name, 'tags': [(tag slug, tag display name), ...]}
    HIERARCHY = {
        'nonsmoking': {'name': u'Nekuřácké podniky',
                       'tags': [('non-smoking', u'nekuřácký podnik')]},
        'beer': {'name': u'Pivo', 'tags': []},  # dynamic, beer brand names
        'wifi': {'name': u'Wi-Fi',
                 'tags': [('free-wifi', u'Wi-Fi zdarma')]},
        'grunge': {'name': u'Podzemní a tajuplná místa',
                   'tags': [('kafelanka', u'Kafélanka'),
                            ('former-kafelanka', u'zaniklá Kafélanka'),
                            ('underground', u'podzemí'),
                            ('agartha-research', u'průzkum Agartha')]},
        'info': {'name': u'Články a tipy', 'tags': []},
    }

    def __init__(self, slug):
        """Resolve *slug* to a DB tag id, upserting the tag and its category.

        Raises InvalidTagError when the slug is not present in HIERARCHY.
        """
        self.db = Database()
        tag_id = self._find_tag(slug)
        if not tag_id:
            raise InvalidTagError()
        self.id = tag_id

    def _find_tag(self, slug):
        """Return the DB id of the tag matching *slug*, or None if unknown.

        Upserts the owning category row first, then the tag row.
        """
        tag_id = None
        for category_slug, category in Tag.HIERARCHY.items():
            for tag_slug, tag_name in category['tags']:
                if slug == tag_slug:
                    category_id = self.db.insert_update('category', {'name': category['name'], 'slug': category_slug})
                    tag_id = self.db.insert_update('tag', {'name': tag_name, 'slug': tag_slug, 'category_id': category_id})
                    break
            if tag_id:
                # fix: the original `break` only left the inner loop, so the
                # outer loop kept scanning the remaining categories after a match
                break
        self.db.commit()
        return tag_id

    def get_id(self):
        return self.id
def run():
    """Scrape agartha.cz exploration reports for Brno, geocode each report's
    location (with manual hints and progressive shortening), and store the
    results tagged as 'agartha-research'."""
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://www.agartha.cz'})
    tag_id = Tag('agartha-research').get_id()

    # some locations need manual hinting before the geocoder can resolve them
    BIAS = {u'U Sedmi Švábů': u'Kopečná 37',
            u'Zábrdovický kostel': u'Zábrdovická 1',
            u'Kartouzský klášter': u'Božetěchova 2',
            u'Jakubská kostnice': u'Jakubské náměstí',
            u'Františkánský klášter': u'Františkánská',
            u'pod Bílou horou': u'Slatinská',
            u'Rosické nádraží': u''}

    # scrape article URLs and titles out of the index page
    index_html = Downloader('http://agartha.cz/html/pruzkumy/brno/').text('cp1250')
    for match in re.finditer(r'href="(http://agartha.cz/[^"]+/brno/[^"]+/)index.php">([^<]+)', index_html):
        url = match.group(1)
        title = match.group(2).strip()
        log('article', title)

        # derive a geocodable location from the title: drop one-char words,
        # or take everything before the dash when the title contains one
        location = re.sub(r'\s+\W?\w\W?\s+', ' ', title).strip()
        if '-' in location:
            location = title.split('-')[0].strip()

        # keep trying, chopping the last word off after each failed attempt
        geocoded = None
        while location and not geocoded:
            if location in BIAS:
                location = BIAS[location]
            log('location', location)
            geocoded = Geocoder(location).fetch()
            if not geocoded:
                location = re.sub(r'[^\s]+$', '', location).strip()

        if geocoded:
            log('geocoded', 'yes')
            place_id = db.insert_update('place', geocoded)
            article_id = db.insert_update('article', {'title': title, 'url': url, 'source_id': source_id})
            # save relations
            db.insert_update('has_tag', {'place_id': place_id, 'tag_id': tag_id}, last_id=False)
            db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False)
            db.commit()
def run():
    """Scrape the podzemi.brno.cz table of contents, geocode each entry's
    location, and store the results tagged as 'underground'."""
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://www.podzemi.brno.cz'})
    tag_id = Tag('underground').get_id()

    # manual hints: non-place menu entries map to '' (skip), others to an address
    BIAS = {u'Kolektory': u'', u'Primární': u'', u'Sekundární': u'',
            u'Kanalizace': u'', u'Historie': u'', u'Současnost': u'',
            u'Vodovody': u'', u'Historické podzemí': u'', u'Úvod': u'',
            u'Aktuality': u'', u'Fotogalerie': u'', u'Kontakt': u'',
            u'Římské náměstí': u'Františkánská'}

    # every link inside the content cell is one location article
    content = Downloader('http://www.podzemi.brno.cz').html().find(attrs={'id': 'obsahy-obsah'})
    for anchor in content.find('td').findAll('a'):
        title = re.sub(r'\s+', ' ', anchor.string).strip()
        url = 'http://www.podzemi.brno.cz/' + anchor['href']
        log('article', title)

        # the title doubles as the location, unless it needs a manual hint
        location = BIAS.get(title, title)
        geocoded = Geocoder(location).fetch()
        if geocoded:
            log('geocoded', 'yes')
            place_id = db.insert_update('place', geocoded)
            article_id = db.insert_update('article', {'title': title, 'url': url, 'source_id': source_id})
            # save relations
            db.insert_update('has_tag', {'place_id': place_id, 'tag_id': tag_id}, last_id=False)
            db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False)
            db.commit()
def run(): # connect to db db = Database() source_id = db.insert_update('source', {'url': 'http://www.kafelanka.cz'}) # prepare r_coord = re.compile(ur'mista\[\d+\]\s*=\s*new Array\(([\d\-\.]+),\s+([\d\-\.]+),([^,]+), "([^"]+)", "([^"]+)", "([^"]+)"\);') place_tag_id = Tag('kafelanka').get_id() former_place_tag_id = Tag('former-kafelanka').get_id() # fetch all article urls for match in re.findall(r'href="(mapa\.php\?ceho=[^"]+)"', Downloader('http://kafelanka.wz.cz/mista/').text('cp1250')): log('map', match) url = 'http://kafelanka.cz/mista/%s' % match html = Downloader(url).text('cp1250') for data in r_coord.finditer(html): log('place', data.group(4)) tag_ids = [place_tag_id] if data.group(3).strip(', "\'') == 'neni': tag_ids.append(former_place_tag_id) lat = str(float(data.group(2))) lng = str(float(data.group(1))) name = data.group(4) geocoded = Geocoder(','.join((lat, lng)), resolve_coords=False).fetch() geocoded['name'] = name article_url = 'http://kafelanka.cz/mista/%s' % data.group(5) photo_url = 'http://kafelanka.cz/mista/foto/%s' % data.group(6) place_id = db.insert_update('place', geocoded) article_id = db.insert_update('article', {'title': name, 'url': article_url, 'photo_url': photo_url, 'source_id': source_id}) # save relations for id in tag_ids: db.insert_update('has_tag', {'place_id': place_id, 'tag_id': id}, last_id=False) db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False) db.commit() break # apparently, all places are listed on every page