def main():
    opts = parse_args()
    log_to_stderr(verbose=opts.verbose, quiet=opts.quiet)

    if opts.urls:
        all_urls = opts.urls
    elif environ.get('MORPH_URLS'):
        all_urls = filter(None, environ['MORPH_URLS'].split())
    else:
        # no URLs given explicitly; collect them from the source DBs
        all_urls = set()

        for db_name in SOURCE_DBS:
            download_db(db_name)
            db = open_db(db_name)

            for table in show_tables(db):
                if table in SKIP_TABLES:
                    continue

                urls = select_urls(db, table)
                if urls:
                    log.info('read {} urls from {}.{}'.format(
                        len(urls), db_name, table))
                    all_urls.update(urls)

    create_table_if_not_exists('url', with_scraper_id=False)
    dt = open_dt()

    failures = []  # list of (url, exception) tuples

    for i, url in enumerate(sorted(all_urls)):
        log.info('scraping {} ({} of {})'.format(
            url, i + 1, len(all_urls)))

        try:
            html = scrape(url)
            soup = BeautifulSoup(html)

            row = dict(url=url, last_scraped=iso_now())
            row['twitter_handle'] = scrape_twitter_handle(
                soup, required=False)
            row['facebook_url'] = scrape_facebook_url(
                soup, required=False)

            log.debug('`url`: {}'.format(repr(row)))
            dt.upsert(row, 'url')
        except Exception as e:
            failures.append((url, e))
            print_exc()

    # show a summary of failures
    if failures:
        log.warn('Failed to scrape {} of {} URL{}:'.format(
            len(failures), len(all_urls),
            's' if len(failures) != 1 else ''))

        for url, e in failures:
            log.warn(u'  {}: {}'.format(url, repr(e)))

        if len(failures) > len(all_urls) * MAX_PROPORTION_FAILURES:
            raise Exception('too many failures')
def main():
    opts = parse_args()
    log_to_stderr(verbose=opts.verbose, quiet=opts.quiet)

    scraper_ids = opts.scraper_ids
    if not scraper_ids and environ.get('MORPH_COMPANY_SCRAPERS'):
        scraper_ids = environ['MORPH_COMPANY_SCRAPERS'].split(',')

    # copy so we don't mutate the module-level set of disabled scrapers
    skip_scraper_ids = set(DISABLED_SCRAPERS)
    if environ.get('MORPH_SKIP_COMPANY_SCRAPERS'):
        skip_scraper_ids.update(
            environ['MORPH_SKIP_COMPANY_SCRAPERS'].split(','))

    use_decimal_type_in_sqlite()

    run_scrapers(get_records_from_company_scraper,
                 scraper_ids=scraper_ids,
                 skip_scraper_ids=skip_scraper_ids,
                 default_freq=DEFAULT_SCRAPE_FREQ)