Example #1

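# Excerpt: `re`, `Pool`, `spyder`, `links_path`, `events`, `get_details`, and the
# MusicShow model are assumed to be imported/defined in the elided top of the file.
# Each URL pattern the crawler may follow is paired with the handler that parses it.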
valid_urls = [
    (re.compile(r"^http://www\.vuenosairez\.com/V2_1/evento\.php\?idEvento=[0-9]+&fechaEvento=[0-9]+$"), get_details)
]


def save_to_db():
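    # Wipe the previous crawl's rows, then insert the freshly scraped events.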
    MusicShow.objects.all().delete()
    for event_info in events.values():
        MusicShow.objects.create(**event_info)


if __name__ == "__main__":
    import datetime

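    # 25 worker threads run spyder.dispatcher over the queue; extra_kargs is
    # passed through to each worker (shared `crawled` set, depth limit, URL whitelist).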
    pool = Pool(
        25,
        spyder.dispatcher,
        timeout=20,
        extra_kargs={"links_path": links_path, "crawled": set(), "maxdepth": 1, "valid_urls": valid_urls},
    )
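    # Seed the queue with the agenda listing page; event-detail links matching
    # valid_urls are followed from there, at most one level deep (maxdepth=1).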
    pool.add_to_queue(
        {"depth": 0, "referrer": None, "url": "http://www.vuenosairez.com/V2_1/resultados-agenda.php?tipoBusq=27&cat=5"}
    )
    start = datetime.datetime.now()
    pool.start()
    save_to_db()
    end = datetime.datetime.now()
    print "took", end - start, "to crawl and save to db."
Example #2
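# Excerpt: apparently the tail of the get_movie_details() handler; `re`, `Pool`,
# `spyder`, `get_doc`, `links_path`, `local_mutex`, `movie_id`, `showtime_date`,
# `showtimes`, `events`, `event_showtimes`, and the Movie/MovieShow models are
# assumed to be defined in the elided part of the file.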
    # `showtimes` alternates place and times nodes; pair them up two at a time.
    showtime_pairs = [(showtimes[i], showtimes[i + 1]) for i in range(0, len(showtimes), 2)]
    with local_mutex:
        for place,times in showtime_pairs:
            event_showtimes[movie_id].add((place.text_content().strip(), u'%s-%s' % (showtime_date, times.text_content().strip())))

valid_urls = [(re.compile(r'http://www\.lanacion\.com\.ar/espectaculos/cartelera-cine/peliculaFicha\.asp\?pelicula=(\d+)$'), get_movie_details),]


def save_to_db():
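    # Rebuild both tables from scratch: one Movie row per title, then one
    # MovieShow row per (venue, showtime) pair collected for it.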
    Movie.objects.all().delete()
    MovieShow.objects.all().delete()
    for event_info in events.values():
        Movie.objects.create(**event_info)
    
    for event_title,venue_showtimes in event_showtimes.items():
        for venue,showtime in venue_showtimes:
            MovieShow.objects.create(movie=event_title, sala=venue, horarios=showtime)

if __name__ == '__main__':
    import datetime

    pool = Pool(25, spyder.dispatcher, timeout=5,
                extra_kargs={'links_path': links_path, 'crawled': set(), 'maxdepth': None, 'valid_urls': valid_urls})
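    # Scrape every film id from the <select name="pelicula"> options on the
    # listing page and queue its detail page directly.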
    doc = get_doc('http://www.lanacion.com.ar/espectaculos/cartelera-cine/index.asp')
    for movie_id in doc.xpath(u'//div[@id="contenido"]//form//select[@name="pelicula"]/option/@value'):
        if movie_id.strip():
            pool.add_to_queue({'depth':0, 'referrer':None,
                               'url':'http://www.lanacion.com.ar/espectaculos/cartelera-cine/peliculaFicha.asp?pelicula=%s' % movie_id.strip(),})
    start = datetime.datetime.now()
    pool.start()
    save_to_db()
    end = datetime.datetime.now()
    print 'took', end - start, 'to crawl and save to db.'
Example #3
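# Excerpt: apparently the body of get_details(); `re`, `Pool`, `spyder`, `links_path`,
# `local_mutex`, `doc`, `event_id`, `fields`, `events`, and the Theater model are
# assumed to be defined in the elided part of the file.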
    for key, path in fields.items():
        # Each entry in `fields` is either a bare XPath string or an
        # (xpath, postprocess) pair; unpacking a plain string raises ValueError.
        try:
            path, func = path
        except ValueError:
            func = None
        nodes = doc.xpath(path)
        if nodes:
            value = func(nodes) if func else ', '.join([t.strip() for t in nodes if t.strip()])
            with local_mutex:
                events.setdefault(event_id, {})[key] = value

valid_urls = [(re.compile(r'http://www\.lanacion\.com\.ar/espectaculos/cartelera-teatro/obraFicha\.asp\?obra=[0-9]+&teatro_id=[0-9]+$'),
               get_details),]


def save_to_db():
    Theater.objects.all().delete()
    for event_info in events.values():
        Theater.objects.create(**event_info)

if __name__ == '__main__':
    import datetime
    pool = Pool(25, spyder.dispatcher, timeout=4,
                extra_kargs={'links_path': links_path, 'crawled': set(), 'maxdepth': 1, 'valid_urls': valid_urls})
    pool.add_to_queue({'depth':0, 'referrer':None, 'url':'http://www.lanacion.com.ar/espectaculos/cartelera-teatro/',})
    start = datetime.datetime.now()
    pool.start()
    save_to_db()
    end = datetime.datetime.now()
    print 'took', end - start, 'to crawl and save to db.'
Example #4
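# Excerpt: apparently the tail of get_synopsis(); `re`, `Pool`, `spyder`, `links_path`,
# `get_links`, `get_details`, `event_id`, `synopsis`, `events`, `events_synopses`,
# and the TvShow model are assumed to be defined in the elided part of the file.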
        events_synopses[event_id] = synopsis
    

valid_urls = [(re.compile(r'^http://www\.terra\.com\.ar/programaciontv/busqueda\.shtml.*$'), get_details),
              (re.compile(r'^http://www\.terra\.com\.ar/programaciontv/ficha\.pl\?id=.*$'), get_synopsis),]


def save_to_db(these_events):
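    # Each show name maps to a list of airings; every airing becomes its own TvShow row.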
    for event_id,occurrences in these_events.items():
        for event_info in occurrences:
            TvShow.objects.create(**event_info)
            
def save_synopses():
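    # Copy each scraped synopsis onto every airing of the matching show.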
    for event_id, synopsis in events_synopses.items():
        TvShow.objects.filter(show_name=event_id).update(sinopsis=synopsis)

if __name__ == '__main__':
    import datetime

    extra_kargs = {'links_path': links_path, 'crawled': set(), 'maxdepth': None,
                   'valid_urls': valid_urls, 'link_getter': get_links}
    pool = Pool(30, spyder.dispatcher, timeout=4, extra_kargs=extra_kargs)
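    # Queue one schedule-search URL per day for the coming week.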
    for day in range(7):
        this_date = datetime.date.today() + datetime.timedelta(days=day)
        url = this_date.strftime('http://www.terra.com.ar/programaciontv/busqueda.shtml?fe=%Y/%m/%d&o=0')
        pool.add_to_queue({'depth':0, 'referrer':None, 'url':url,})
    start = datetime.datetime.now()
    TvShow.objects.all().delete()
    pool.start()
    save_to_db(events)
    save_synopses()
    end = datetime.datetime.now()
    print 'took', end - start, 'to crawl and save to db.'