def get_scraper(db, scraper_name):
    """Look up a scraper by name and build a ScraperDb handle for it."""
    sc = db.get_scraper_by_name(scraper_name=scraper_name)
    return scraper.ScraperDb(
        sc["scraper_name"],
        # the environment variable named in the row holds the connection URL
        os.getenv(sc["db_url_var"]),
        site_table_name=sc["site_table_name"],
        article_table_name=sc["article_table_name"],
        snapshot_table_name=sc["snapshot_table_name"],
    )
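# Hypothetical usage sketch (not from the source): it assumes the same
# queries module used in main() below, i.e. db.module("queries") with its
# connect()/disconnect() pair, and a scraper row registered under the given
# name ("example_scraper" is a made-up placeholder).
if __name__ == "__main__":
    queries = db.module("queries")
    queries.connect(os.getenv("DB_URL"))
    try:
        scraper_db = get_scraper(queries, "example_scraper")
    finally:
        queries.disconnect()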
@classmethod
def setUpClass(cls):
    # setUpClass must be decorated as a classmethod; the fixture is built
    # once and shared by every test in the class.
    cls.db = scraper.ScraperDb(
        "TestScraper",
        os.getenv("SCRAPER_DB_URL"),
        {
            "site_table_name": "Site",
            "article_table_name": "Article",
            "snapshot_table_name": "ArticleSnapshot",
        },
    )
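# Hypothetical companion test (not from the source): it assumes
# scraper.get_sites takes the ScraperDb handle and returns an iterable,
# as its use with DbGetter in main() suggests.
def test_get_sites_returns_iterable(self):
    sites = list(scraper.get_sites(self.db))
    self.assertIsInstance(sites, list)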
import os
import traceback
from functools import partial

# Project-local names (db, scraper, producer, publication, DbGetter, DbSaver,
# JsonSaver, run_one_shot, run_batch, get_all_unprocessed_articles, logger)
# are assumed to come from the surrounding package.


def main(args):
    parser_db = db.module("queries")
    parser_db.connect(os.getenv("DB_URL"))
    try:
        sc = parser_db.get_scraper_by_name(scraper_name=args.scraper_name)
        sc["data"] = db.to_json(sc["data"])
        scraper_db = scraper.ScraperDb(
            sc["scraper_name"], os.getenv(sc["data"]["db_url_var"]), sc["data"]
        )
        if args.command == "producer":
            # Choose the data source: one producer, one site, or all sites.
            if args.id is not None:
                p = db.to_producer(
                    parser_db.get_producer(producer_id=db.of_uuid(args.id))
                )
                data_getter = DbGetter(
                    scraper_db, scraper.get_site, site_id=p["site_id"]
                )
            elif args.site_id is not None:
                data_getter = DbGetter(
                    scraper_db, scraper.get_site, site_id=args.site_id
                )
            else:
                data_getter = DbGetter(scraper_db, scraper.get_sites)
            data_saver = (
                DbSaver(parser_db, producer.saver, scraper=sc)
                if not args.dump
                else JsonSaver()
            )
            run_one_shot(
                data_getter=data_getter,
                data_saver=data_saver,
                processor=producer.process_item,
            )
        elif args.command == "publication":
            # Choose which snapshots to parse; fall back to all unprocessed.
            if args.id is not None:
                raise RuntimeError("Unimplemented")
            elif args.article_id is not None:
                data_getter = DbGetter(
                    scraper_db,
                    scraper.get_snapshots,
                    article_id=args.article_id,
                    first=args.first,
                )
            elif args.url is not None:
                data_getter = DbGetter(
                    scraper_db, scraper.get_snapshots, url=args.url, first=args.first
                )
            elif args.site_id is not None:
                data_getter = DbGetter(
                    scraper_db,
                    scraper.get_snapshots,
                    site_id=args.site_id,
                    first=args.first,
                )
            elif args.update:
                raise RuntimeError("Unimplemented")
            else:
                data_getter = get_all_unprocessed_articles(
                    scraper_db, parser_db, args=args
                )
            run_batch(
                data_getter=data_getter,
                data_saver=(
                    DbSaver(parser_db, publication.saver, scraper=sc)
                    if not args.dump
                    else JsonSaver()
                ),
                processor=partial(publication.process_item, parser=args.parser),
                batch_size=1000,
                limit=args.limit,
            )
        else:
            raise RuntimeError(f"Unknown command '{args.command}'")
        return 0
    except Exception:
        # A bare `except:` would also swallow SystemExit/KeyboardInterrupt.
        logger.error(traceback.format_exc())
        return -1
    finally:
        parser_db.disconnect()
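# A possible argparse layout for the args consumed above. This is a sketch
# inferred from the attributes main() reads (scraper_name, command, id,
# site_id, article_id, url, first, update, dump, parser, limit); the
# project's real flag spellings, types, and defaults may differ.
import argparse
import sys


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("command", choices=["producer", "publication"])
    parser.add_argument("--scraper-name", required=True)
    parser.add_argument("--id")
    parser.add_argument("--site-id")
    parser.add_argument("--article-id")
    parser.add_argument("--url")
    parser.add_argument("--first")
    parser.add_argument("--update", action="store_true")
    parser.add_argument("--dump", action="store_true")
    parser.add_argument("--parser")
    parser.add_argument("--limit", type=int)
    return parser.parse_args()


if __name__ == "__main__":
    # main() returns 0 on success and -1 on error, so pass it to sys.exit.
    sys.exit(main(parse_args()))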