# Example no. 1
# (0 votes)
def dispatcher(worker, item, links_path, crawled, maxdepth, valid_urls, link_getter=lambda doc, links_path: doc.xpath(links_path)):
    """
        Crawl one queued item: deduplicate its URL against the shared
        ``crawled`` set, fetch the document, and requeue on transient
        HTTP 500 errors.

        worker: threading.Thread (must expose .pool.mutex and .add_to_queue)
        item: {'url':url, 'referrer':some_url or None, 'depth':int}
        links_path: an xpath to look for links that matches the 'href' attribute
        crawled: a set where the crawled urls are added
        maxdepth: int
        valid_urls: [(compiled_regex, function), (compiled_regex, function), etc...], function signature is function(worker, doc, regexmatch, item)

        NOTE(review): the success path (dispatching to valid_urls handlers /
        following links via link_getter) is not visible in this chunk -- the
        body appears truncated after the error handler.
    """
    url = item['url']
    referrer = item['referrer']  # NOTE(review): unused in the visible portion
    depth = item['depth']        # NOTE(review): unused in the visible portion
    
    # Mark-and-test under the pool-wide mutex so two workers never crawl the
    # same URL concurrently.
    with worker.pool.mutex:
        if url in crawled:
            return
        crawled.add(url)
    
    try:
        doc = get_doc(url)
    except urllib2.HTTPError, e:  # Python 2 exception syntax
        print '*' * 30
        print 'Error when fetching url: %s' % url
        print 'Error was: %s' % e
        # HTTP 500 is treated as transient: un-mark the URL (so the retry is
        # not rejected by the dedupe check above) and requeue it, giving up
        # after 100 attempts.
        if e.code == 500:
            if item.setdefault('retries', 0) < 100:
                print 'Putting url back in queue, tried %s times.' % (item['retries'] + 1)
                with worker.pool.mutex:
                    crawled.discard(url)
                item['retries'] += 1
                worker.add_to_queue(item)
            else:
                print 'Url %s failed after %s attempts... discarding.' % (url, item['retries'] + 1)
        print '*' * 30
        return
# Example no. 2
# (0 votes)
    showtime_pairs = [(showtimes[i],showtimes[i+1]) for i in range(len(showtimes))[::2]]
    with local_mutex:
        for place,times in showtime_pairs:
            event_showtimes[movie_id].add((place.text_content().strip(), u'%s-%s' % (showtime_date, times.text_content().strip())))

valid_urls = [(re.compile(r'http://www\.lanacion\.com\.ar/espectaculos/cartelera-cine/peliculaFicha\.asp\?pelicula=(\d+)$'), get_movie_details),]


def save_to_db():
    """Replace all stored movies and showtimes with the freshly crawled data.

    Wipes the Movie and MovieShow tables, then recreates one Movie row per
    crawled event and one MovieShow row per (venue, showtime) pair collected
    in the module-level ``events`` / ``event_showtimes`` dicts.
    """
    # Start from a clean slate: drop everything left over from the last run.
    Movie.objects.all().delete()
    MovieShow.objects.all().delete()

    # Persist each crawled movie record.
    for info in events.values():
        Movie.objects.create(**info)

    # Persist every (venue, showtime) pair gathered for each movie.
    for title, venue_showtimes in event_showtimes.items():
        for sala, horario in venue_showtimes:
            MovieShow.objects.create(movie=title, sala=sala, horarios=horario)

if __name__ == '__main__':
    # Spin up a 25-worker crawl pool driven by spyder.dispatcher.  maxdepth
    # is None because only explicitly seeded detail URLs are queued below.
    pool = Pool(25, spyder.dispatcher, timeout=5, extra_kargs={'links_path':links_path, 'crawled':set(), 'maxdepth':None, 'valid_urls':valid_urls,})
    # Seed the queue with one detail-page URL per movie id found in the
    # listing page's <select name="pelicula"> dropdown.
    doc = get_doc('http://www.lanacion.com.ar/espectaculos/cartelera-cine/index.asp')
    for movie_id in doc.xpath(u'//div[@id="contenido"]//form//select[@name="pelicula"]/option/@value'):
        if movie_id.strip():
            pool.add_to_queue({'depth':0, 'referrer':None,
                               'url':'http://www.lanacion.com.ar/espectaculos/cartelera-cine/peliculaFicha.asp?pelicula=%s' % movie_id.strip(),})
    start = datetime.datetime.now()
    # NOTE(review): save_to_db() runs right after pool.start(), so start()
    # presumably blocks until the queue drains -- confirm against Pool.
    pool.start()
    save_to_db()
    end = datetime.datetime.now()
    print 'took', end - start, 'to crawl and save to db.'