import logging
from email.utils import formatdate
from multiprocessing import Pool

# CorgiCache, PodcastFeedParser, get_data, and post_data are project-local
# dependencies; their import paths are not shown in this excerpt.


def serial_main(daemon_mode):
    """Scrape every cached feed in sequence; loop forever in daemon mode."""
    cache = CorgiCache()
    while True:
        feeds = cache.get_all_feeds()
        for feed in feeds:
            scrap_feed(feed)
        if not daemon_mode:
            break
def async_main(daemon_mode):
    """Scrape all cached feeds in parallel with a multiprocessing pool."""
    cache = CorgiCache()
    while True:
        # Create a fresh Pool on every pass: a closed pool cannot accept
        # new work, so reusing one across daemon-mode iterations would fail.
        pool = Pool()
        feeds = cache.get_all_feeds()
        for feed in feeds:
            # args must be a tuple of positional arguments; passing the
            # feed record bare would unpack it instead of sending it whole.
            pool.apply_async(func=scrap_feed, args=(feed,))
        pool.close()
        pool.join()
        if not daemon_mode:
            break
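# A minimal alternative sketch (not part of the original module): the same
# per-feed fan-out written with the standard-library concurrent.futures API,
# which ties pool shutdown to a context manager instead of explicit
# close()/join() calls. The name futures_main is hypothetical.
def futures_main(daemon_mode):
    from concurrent.futures import ProcessPoolExecutor

    cache = CorgiCache()
    while True:
        with ProcessPoolExecutor() as executor:
            for feed in cache.get_all_feeds():
                executor.submit(scrap_feed, feed)
            # Leaving the with-block waits for every submitted task,
            # mirroring the close()/join() barrier in async_main above.
        if not daemon_mode:
            break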
def scrap_feed(feed):
    """Scrape one feed record and push its publisher, podcast, and episodes
    to the API."""
    etag = ""
    last_crawled = ""
    cache = CorgiCache()
    tokens = cache.get_token(use='scraper')
    if 'URL' not in feed:
        logging.error("no URL in feed record {0}".format(feed))
        return
    if 'ETAG' in feed:
        etag = feed['ETAG']
    if 'CRAWLED' in feed:
        last_crawled = feed['CRAWLED']
    url = feed['URL']
    parser = PodcastFeedParser(url=url, etag=etag, last_request=last_crawled)
    feed['CRAWLED'] = formatdate()
    try:
        # Follow a redirect to a new feed URL before parsing anything else.
        if parser.has_new_feed():
            logging.info("new feed for {0}".format(url))
            url = parser.get_new_feed()
            logging.info("new feed is {0}".format(url))
            feed['URL'] = url
            feed.save()
            parser = PodcastFeedParser(url=url)
            feed['CRAWLED'] = formatdate()
        if parser.get_blocked():
            logging.warning("feed blocked for {0}".format(url))
        try:
            # Look up the publisher by name; create it if it does not exist.
            publisher = parser.get_owner()
            pub_filter = {'name': publisher['name']}
            data = get_data(table_name='publisher', data_filter=pub_filter,
                            token=tokens['TOKEN'])
            data = data.json()
            if not data:
                data = post_data(table_name='publisher', data=publisher,
                                 token=tokens['TOKEN'])
                data = data.json()
                if 'id' not in data:
                    raise IOError
            else:
                data = data[0]
            publisher_id = data['id']

            # Look up the podcast by publisher and title; create it if missing.
            title = parser.get_title()
            author = parser.get_author()
            summary = parser.get_summary()
            category = parser.get_category()
            explicit = parser.get_explicit()
            link = parser.get_link()
            podcast_copyright = parser.get_copyright()
            blocked = parser.get_blocked()
            complete = parser.get_complete()
            keywords = parser.get_keywords()
            pod_filter = {'publisher': publisher_id, 'title': title}
            data = get_data(table_name='podcast', data_filter=pod_filter,
                            token=tokens['TOKEN'])
            data = data.json()
            if not data:
                podcast = {'publisher': publisher_id, 'title': title,
                           'author': author, 'summary': summary,
                           'category': category, 'explicit': explicit,
                           'link': link, 'copyright': podcast_copyright,
                           'blocked': blocked, 'complete': complete,
                           'keywords': keywords}
                data = post_data(table_name='podcast', data=podcast,
                                 token=tokens['TOKEN'])
                data = data.json()
                if 'id' not in data:
                    raise IOError
            else:
                data = data[0]
            podcast_id = data['id']

            # Post only episodes we have not seen before, tracked by GUID.
            guids = []
            if 'GUIDS' in feed:
                guids = feed['GUIDS']
                episodes = parser.get_new_episodes(guids)
            else:
                episodes = parser.get_all_episodes()
            for episode in episodes:
                guids.append(episode['guid'])
                episode['podcast'] = podcast_id
                del episode['guid']
                post_data(table_name='episode', data=episode,
                          token=tokens['TOKEN'])
            feed['GUIDS'] = guids
        except IOError:
            return
        try:
            feed['ETAG'] = parser.get_etag()
        except IOError:
            pass
        feed.save()
        logging.info("finished scraping {0}".format(url))
    except IOError:
        return
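# Hypothetical entry point (not shown in the original source): picks the
# serial or pooled scraper from the command line. The --daemon and
# --parallel flag names are assumptions for illustration.
if __name__ == "__main__":
    import argparse

    arg_parser = argparse.ArgumentParser(description="Corgi podcast feed scraper")
    arg_parser.add_argument("--daemon", action="store_true",
                            help="keep re-scraping all feeds in a loop")
    arg_parser.add_argument("--parallel", action="store_true",
                            help="scrape feeds in a multiprocessing pool")
    args = arg_parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    if args.parallel:
        async_main(args.daemon)
    else:
        serial_main(args.daemon)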