示例#1
0
def run():
    """Scrape brnonow.com: collect every article URL from the paginated
    listing, then extract Google Maps links from each article and store
    the geocoded places (and article/place relations) in the database."""
    # connect to db
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://brnonow.com'})

    # fetch all article urls; stop at the first page with no bookmark links
    url = 'http://brnonow.com/page/%s'
    articles = set()  # a set from the start deduplicates as we collect
    for i in range(1000):
        log('page', i)
        try:
            links = Downloader(url, i).html().findAll('a', {'rel': 'bookmark'})
            if not links:
                break
            articles.update(link['href'] for link in links)
        except HTMLParseError:
            log('error', 'parsing failure, skipping')

    # hoisted out of the article loop: matcher for links pointing to Google Maps
    # (the original re-ran re.match for every tag of every article)
    r_maps = re.compile(r'http://[^\.]+\.google\.[^/]+/maps')

    # process articles
    for url in articles:
        log('article', url)
        try:
            html = Downloader(url).html()
            links = html.findAll(lambda tag: tag.name == 'a' and r_maps.match(tag.get('href', '')))

            # get title & save article
            title = unicode(decode_unicode_entities(html.find('h1', 'entry-title').string))
            article_id = db.insert_update('article', {'title': title, 'url': url, 'source_id': source_id})

            # get places
            for link in links:
                # query is either coordinates ("49.234553,16.567812") or a place name
                query = GoogleMaps().parse_link_url(link['href'])
                log('link', query)

                geocoded = Geocoder(query, resolve_coords=False).fetch()
                if geocoded:
                    geocoded['name'] = None
                    place_id = db.insert_update('place', geocoded)

                    # save relations
                    db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False)

            db.commit()

        except HTMLParseError:
            log('error', 'parsing failure, skipping')

        except AttributeError:
            # e.g. html.find(...) returned None for a page without a title element
            log('error', 'attribute error, skipping')
示例#2
0
文件: tag.py 项目: honzajavorek/brno
class Tag(object):
    """Look up (and lazily create) a tag row in the database by its slug.

    Tags live in the fixed two-level HIERARCHY below; a matching slug
    upserts both the category row and the tag row.
    """

    # category slug -> {'name': display name,
    #                   'tags': [(tag slug, tag display name), ...]}
    HIERARCHY = {'nonsmoking': {'name': u'Nekuřácké podniky',
                                'tags': [('non-smoking', u'nekuřácký podnik')]},
                 
                 'beer': {'name': u'Pivo',
                          'tags': []}, # dynamic, beer brand names
                 
                 'wifi': {'name': u'Wi-Fi',
                          'tags': [('free-wifi', u'Wi-Fi zdarma')]},
                 
                 'grunge': {'name': u'Podzemní a tajuplná místa',
                            'tags': [('kafelanka', u'Kafélanka'),
                                     ('former-kafelanka', u'zaniklá Kafélanka'),
                                     ('underground', u'podzemí'),
                                     ('agartha-research', u'průzkum Agartha')]},
                 
                 'info': {'name': u'Články a tipy',
                          'tags': []}} # beware! tags are related to PLACES, not to ARTICLES
    
    def __init__(self, slug):
        """Resolve *slug* against HIERARCHY; raises InvalidTagError if unknown."""
        self.db = Database()
        tag_id = self._find_tag(slug)

        if not tag_id:
            raise InvalidTagError()
        self.id = tag_id
    
    def _find_tag(self, slug):
        """Return the DB id for *slug*, upserting category+tag rows on demand.

        Returns None when the slug does not appear in HIERARCHY.
        """
        for category_slug, category in Tag.HIERARCHY.items():
            for tag_slug, tag_name in category['tags']:
                if slug == tag_slug:
                    category_id = self.db.insert_update('category', {'name': category['name'], 'slug': category_slug})
                    tag_id = self.db.insert_update('tag', {'name': tag_name, 'slug': tag_slug, 'category_id': category_id})
                    self.db.commit()
                    # bugfix: the original only broke the inner loop, so the
                    # outer loop kept scanning the remaining categories for
                    # nothing — return as soon as the tag is resolved
                    return tag_id
        self.db.commit()  # preserve the original always-commit behaviour
        return None
    
    def get_id(self):
        return self.id
示例#3
0
def run():
    """Import Agartha underground-research articles: guess a location from
    each article title, geocode it and store place/article/tag rows."""
    # connect to db
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://www.agartha.cz'})
    
    # prepare
    tag_id = Tag('agartha-research').get_id()
    
    # titles whose guessed location needs a manual geocoding hint
    BIAS = {u'U Sedmi Švábů': u'Kopečná 37',
            u'Zábrdovický kostel': u'Zábrdovická 1',
            u'Kartouzský klášter': u'Božetěchova 2',
            u'Jakubská kostnice': u'Jakubské náměstí',
            u'Františkánský klášter': u'Františkánská',
            u'pod Bílou horou': u'Slatinská',
            u'Rosické nádraží': u''}
    
    # fetch all article urls
    for match in re.finditer(r'href="(http://agartha.cz/[^"]+/brno/[^"]+/)index.php">([^<]+)', Downloader('http://agartha.cz/html/pruzkumy/brno/').text('cp1250')):
        title = match.group(2).strip()
        url = match.group(1)
        
        log('article', title)
    
        # determining location
        location = re.sub(r'\s+\W?\w\W?\s+', ' ', title).strip() # strip one-char words 
        if '-' in location:
            location = title.split('-')[0].strip()
    
        # retry geocoding, dropping the last word each time, until it
        # succeeds or the candidate string is exhausted
        geocoded = None
        while location and not geocoded:  # idiom fix: truthiness over len()
            if location in BIAS:
                # some locations need manual hinting :(
                location = BIAS[location]
            
            log('location', location)
            geocoded = Geocoder(location).fetch()
            if not geocoded:
                # remove last word and try again
                location = re.sub(r'[^\s]+$', '', location).strip()
        
        if geocoded:
            log('geocoded', 'yes')
            
            place_id = db.insert_update('place', geocoded)
            article_id = db.insert_update('article', {'title': title, 'url': url, 'source_id': source_id})
            
            # save relations
            db.insert_update('has_tag', {'place_id': place_id, 'tag_id': tag_id}, last_id=False)
            db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False)
            
            db.commit()
示例#4
0
def run():
    """Import underground locations from podzemi.brno.cz: geocode each
    menu entry and store place/article/tag rows in the database."""
    # connect to db
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://www.podzemi.brno.cz'})

    # prepare
    tag_id = Tag('underground').get_id()

    # menu titles that need a manual geocoding hint ('' disables geocoding)
    BIAS = {u'Kolektory': u'',
            u'Primární': u'',
            u'Sekundární': u'',
            u'Kanalizace': u'',
            u'Historie': u'',
            u'Současnost': u'',
            u'Vodovody': u'',
            u'Historické podzemí': u'',
            u'Úvod': u'',
            u'Aktuality': u'',
            u'Fotogalerie': u'',
            u'Kontakt': u'',
            u'Římské náměstí': u'Františkánská'}

    # fetch all locations from the site menu
    menu = Downloader('http://www.podzemi.brno.cz').html().find(attrs={'id': 'obsahy-obsah'}).find('td')
    for anchor in menu.findAll('a'):
        title = re.sub(r'\s+', ' ', anchor.string).strip()
        url = 'http://www.podzemi.brno.cz/' + anchor['href']

        log('article', title)

        # determining location: a manual hint overrides the raw title
        geocoded = Geocoder(BIAS.get(title, title)).fetch()
        if not geocoded:
            continue

        log('geocoded', 'yes')

        place_id = db.insert_update('place', geocoded)
        article_id = db.insert_update('article', {'title': title, 'url': url, 'source_id': source_id})

        # save relations
        db.insert_update('has_tag', {'place_id': place_id, 'tag_id': tag_id}, last_id=False)
        db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False)

        db.commit()
示例#5
0
def run():
    """Import Kafélanka places: parse the JS place arrays embedded in the
    map pages and store place/article/tag rows in the database."""
    # connect to db
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://www.kafelanka.cz'})

    # prepare
    # place record regex; per the assignments below, group 1 is longitude,
    # group 2 latitude, group 3 an existence flag, then name, article path,
    # photo path. (Pattern is pure ASCII, so the plain r'' prefix is
    # equivalent to the original ur'' and also valid Python 3 syntax.)
    r_coord = re.compile(r'mista\[\d+\]\s*=\s*new Array\(([\d\-\.]+),\s+([\d\-\.]+),([^,]+), "([^"]+)", "([^"]+)", "([^"]+)"\);')

    place_tag_id = Tag('kafelanka').get_id()
    former_place_tag_id = Tag('former-kafelanka').get_id()

    # fetch all article urls
    for match in re.findall(r'href="(mapa\.php\?ceho=[^"]+)"', Downloader('http://kafelanka.wz.cz/mista/').text('cp1250')):
        log('map', match)
        url = 'http://kafelanka.cz/mista/%s' % match
        html = Downloader(url).text('cp1250')

        for data in r_coord.finditer(html):
            log('place', data.group(4))

            tag_ids = [place_tag_id]
            if data.group(3).strip(', "\'') == 'neni':
                # 'neni' ("does not exist") marks a former, now vanished place
                tag_ids.append(former_place_tag_id)

            lat = str(float(data.group(2)))
            lng = str(float(data.group(1)))

            name = data.group(4)

            geocoded = Geocoder(','.join((lat, lng)), resolve_coords=False).fetch()
            if not geocoded:
                # bugfix: the result was used unguarded, so a failed lookup
                # crashed with TypeError; sibling scrapers all guard this
                log('error', 'geocoding failure, skipping')
                continue
            geocoded['name'] = name

            article_url = 'http://kafelanka.cz/mista/%s' % data.group(5)
            photo_url = 'http://kafelanka.cz/mista/foto/%s' % data.group(6)

            place_id = db.insert_update('place', geocoded)
            article_id = db.insert_update('article', {'title': name, 'url': article_url, 'photo_url': photo_url, 'source_id': source_id})

            # save relations ('tid' avoids shadowing the builtin id())
            for tid in tag_ids:
                db.insert_update('has_tag', {'place_id': place_id, 'tag_id': tid}, last_id=False)
            db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False)

            db.commit()

        break # apparently, all places are listed on every page