def dispatcher(worker, item, links_path, crawled, maxdepth, valid_urls,
               link_getter=lambda doc, links_path: doc.xpath(links_path)):
    """
    Per-item crawl callback run by a pool worker thread.

    worker: threading.Thread
    item: {'url':url, 'referrer':some_url or None, 'depth':int}
    links_path: an xpath to look for links that matches the 'href' attribute
    crawled: a set where the crawled urls are added
    maxdepth: int
    valid_urls: [(compiled_regex, function), (compiled_regex, function), etc...],
        function signature is function(worker, doc, regexmatch, item)
    link_getter: callable extracting link nodes from a fetched document;
        defaults to evaluating links_path as an xpath on the document.

    NOTE(review): the body visible here only covers dedup + fetch + HTTP-error
    retry; the success path that would consume links_path/maxdepth/valid_urls/
    link_getter is not visible in this view — presumably truncated. Confirm
    against the full file.
    """
    url = item['url']
    referrer = item['referrer']  # not used in the visible portion of the body
    depth = item['depth']        # not used in the visible portion of the body

    # Deduplicate under the pool-wide lock: mark the url as crawled *before*
    # fetching so concurrent workers holding the same url bail out here.
    with worker.pool.mutex:
        if url in crawled:
            return
        crawled.add(url)

    try:
        doc = get_doc(url)
    except urllib2.HTTPError, e:  # Python 2 except syntax; file targets py2
        print '*' * 30
        print 'Error when fetching url: %s' % url
        print 'Error was: %s' % e
        # Only HTTP 500 is considered transient and worth retrying; other
        # status codes are dropped after logging.
        if e.code == 500:
            # setdefault both initialises the retry counter on first failure
            # and returns the current count for the limit check.
            if item.setdefault('retries', 0) < 100:
                print 'Putting url back in queue, tried %s times.' % (item['retries'] + 1)
                # Un-mark the url so the requeued item passes the dedup check
                # above on its next attempt.
                with worker.pool.mutex:
                    crawled.discard(url)
                item['retries'] += 1
                worker.add_to_queue(item)
            else:
                print 'Url %s failed after %s attempts... discarding.' % (url, item['retries'] + 1)
        print '*' * 30
        # Return on any fetch error: the (truncated) success path below must
        # not run without a document.
        return
# NOTE(review): the following statements are the tail of an enclosing function
# whose `def` line is not visible in this view — presumably get_movie_details,
# which valid_urls below registers as the handler. showtimes, local_mutex,
# event_showtimes, movie_id and showtime_date are bound in that missing
# portion; confirm against the full file.
#
# Pair consecutive nodes as (venue, times). In Python 2, range() returns a
# list, so the [::2] slice yields the even indices 0, 2, 4, ...
# Assumes len(showtimes) is even — an odd length would raise IndexError on
# the final showtimes[i+1]. TODO confirm the page always emits pairs.
showtime_pairs = [(showtimes[i],showtimes[i+1]) for i in range(len(showtimes))[::2]]
# event_showtimes is shared across worker threads, hence the lock around the
# set mutation.
with local_mutex:
    for place,times in showtime_pairs:
        event_showtimes[movie_id].add((place.text_content().strip(), u'%s-%s' % (showtime_date, times.text_content().strip())))

# Dispatch table consumed by spyder.dispatcher: urls matching the movie-detail
# page pattern are handed to get_movie_details with the regex match.
valid_urls = [(re.compile(r'http://www\.lanacion\.com\.ar/espectaculos/cartelera-cine/peliculaFicha\.asp\?pelicula=(\d+)$'), get_movie_details),]


def save_to_db():
    """
    Replace the entire Movie/MovieShow tables with the data accumulated in
    the module-level `events` and `event_showtimes` dicts during the crawl.

    Destructive: deletes all existing rows first, so a failed crawl followed
    by this call empties the tables.
    """
    Movie.objects.all().delete()
    MovieShow.objects.all().delete()
    # presumably each event_info dict's keys match Movie's field names —
    # verify against the model definition.
    for event_info in events.values():
        Movie.objects.create(**event_info)
    for event_title,venue_showtimes in event_showtimes.items():
        for venue,showtime in venue_showtimes:
            # NOTE(review): `movie` is passed the event_showtimes key
            # (movie_id above) — looks like Movie's primary key; confirm.
            MovieShow.objects.create(movie=event_title, sala=venue, horarios=showtime)


if __name__ == '__main__':
    # 25 worker threads, each running spyder.dispatcher with the shared
    # crawl state; maxdepth=None disables the depth limit.
    pool = Pool(25, spyder.dispatcher, timeout=5, extra_kargs={'links_path':links_path, 'crawled':set(), 'maxdepth':None, 'valid_urls':valid_urls,})
    # Seed the queue from the movie <select> on the listings index page.
    doc = get_doc('http://www.lanacion.com.ar/espectaculos/cartelera-cine/index.asp')
    for movie_id in doc.xpath(u'//div[@id="contenido"]//form//select[@name="pelicula"]/option/@value'):
        if movie_id.strip():  # skip the placeholder/empty option value
            pool.add_to_queue({'depth':0, 'referrer':None, 'url':'http://www.lanacion.com.ar/espectaculos/cartelera-cine/peliculaFicha.asp?pelicula=%s' % movie_id.strip(),})
    start = datetime.datetime.now()
    # pool.start() presumably blocks until the queue drains — save_to_db()
    # below assumes the crawl is finished. TODO confirm Pool's semantics.
    pool.start()
    save_to_db()
    end = datetime.datetime.now()
    print 'took', end - start, 'to crawl and save to db.'