Example #1
def get_scraper(db, scraper_name):
    # Look up the scraper's configuration row by name, then open its
    # database using the URL stored in the named environment variable.
    sc = db.get_scraper_by_name(scraper_name=scraper_name)
    return scraper.ScraperDb(
        sc["scraper_name"],
        os.getenv(sc["db_url_var"]),
        site_table_name=sc["site_table_name"],
        article_table_name=sc["article_table_name"],
        snapshot_table_name=sc["snapshot_table_name"],
    )
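
A minimal sketch of how get_scraper might be wired up, assuming the
"queries" db module and DB_URL environment variable seen in Example #3
below; the scraper name "MyScraper" is hypothetical:

parser_db = db.module("queries")
parser_db.connect(os.getenv("DB_URL"))
scraper_db = get_scraper(parser_db, "MyScraper")
sites = scraper.get_sites(scraper_db)  # helper used in Example #3
parser_db.disconnect()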
Example #2
@classmethod
def setUpClass(cls):
    # Build one shared ScraperDb for the whole test class; here the
    # table names are passed as a single dict rather than as keywords.
    cls.db = scraper.ScraperDb(
        "TestScraper",
        os.getenv("SCRAPER_DB_URL"),
        {
            "site_table_name": "Site",
            "article_table_name": "Article",
            "snapshot_table_name": "ArticleSnapshot",
        },
    )
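
A class-level fixture like this is normally paired with a tearDownClass
that releases the connection. A sketch under that assumption; the
disconnect() method name is a guess, since ScraperDb's API is not shown
in these examples:

@classmethod
def tearDownClass(cls):
    # Assumed cleanup hook; the real ScraperDb method may be named
    # differently (e.g. close()).
    cls.db.disconnect()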
Example #3
def main(args):
    # Connect to the parser database, which stores each scraper's config.
    parser_db = db.module("queries")
    parser_db.connect(os.getenv("DB_URL"))
    try:
        # Load this scraper's config row and open its own database; the
        # JSON "data" column carries the table names and db_url_var.
        sc = parser_db.get_scraper_by_name(scraper_name=args.scraper_name)
        sc["data"] = db.to_json(sc["data"])
        scraper_db = scraper.ScraperDb(sc["scraper_name"],
                                       os.getenv(sc["data"]["db_url_var"]),
                                       sc["data"])

        if args.command == "producer":
            # Pick the site source: one producer's site, an explicit
            # site id, or every site known to this scraper.
            if args.id is not None:
                p = db.to_producer(
                    parser_db.get_producer(producer_id=db.of_uuid(args.id)))
                data_getter = DbGetter(scraper_db,
                                       scraper.get_site,
                                       site_id=p["site_id"])
            elif args.site_id is not None:
                data_getter = DbGetter(scraper_db,
                                       scraper.get_site,
                                       site_id=args.site_id)
            else:
                data_getter = DbGetter(scraper_db, scraper.get_sites)

            data_saver = (DbSaver(parser_db, producer.saver, scraper=sc)
                          if not args.dump else JsonSaver())
            run_one_shot(
                data_getter=data_getter,
                data_saver=data_saver,
                processor=producer.process_item,
            )

        elif args.command == "publication":
            # Pick the snapshot source by article id, URL, or site id;
            # otherwise fall back to all unprocessed articles.
            if args.id is not None:
                raise RuntimeError("Unimplemented")
            elif args.article_id is not None:
                data_getter = DbGetter(
                    scraper_db,
                    scraper.get_snapshots,
                    article_id=args.article_id,
                    first=args.first,
                )
            elif args.url is not None:
                data_getter = DbGetter(scraper_db,
                                       scraper.get_snapshots,
                                       url=args.url,
                                       first=args.first)
            elif args.site_id is not None:
                data_getter = DbGetter(
                    scraper_db,
                    scraper.get_snapshots,
                    site_id=args.site_id,
                    first=args.first,
                )
            elif args.update:
                raise RuntimeError("Unimplemented")
            else:
                data_getter = get_all_unprocessed_articles(scraper_db,
                                                           parser_db,
                                                           args=args)
            run_batch(
                data_getter=data_getter,
                data_saver=(DbSaver(parser_db, publication.saver, scraper=sc)
                            if not args.dump else JsonSaver()),
                processor=partial(publication.process_item,
                                  parser=args.parser),
                batch_size=1000,
                limit=args.limit,
            )
        else:
            raise RuntimeError(f"Unknown command '{args.command}'")
        return 0
    except Exception:
        # Catch only Exception (not KeyboardInterrupt/SystemExit), log
        # the full traceback, and signal failure via the return code.
        logger.error(traceback.format_exc())
        return -1
    finally:
        parser_db.disconnect()
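
main(args) expects an args object carrying scraper_name, command, id,
site_id, article_id, url, first, update, dump, parser, and limit. A
minimal argparse sketch that would supply those attributes; the flag
spellings and defaults are assumptions, not part of the example above:

import argparse
import sys

def parse_args():
    p = argparse.ArgumentParser()
    p.add_argument("scraper_name")
    p.add_argument("command", choices=["producer", "publication"])
    p.add_argument("--id")
    p.add_argument("--site-id", dest="site_id")
    p.add_argument("--article-id", dest="article_id")
    p.add_argument("--url")
    p.add_argument("--first")          # type unknown; assumed a plain string
    p.add_argument("--update", action="store_true")
    p.add_argument("--dump", action="store_true")
    p.add_argument("--parser")
    p.add_argument("--limit", type=int)
    return p.parse_args()

if __name__ == "__main__":
    sys.exit(main(parse_args()))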