Пример #1
0
def schedule_crawl(spider, workflow, **kwargs):
    """Schedule a crawl using configuration from the workflow objects.

    The scrapyd settings are built from the ``CRAWLER_SETTINGS`` application
    config, overridden by an optional ``crawler_settings`` keyword argument.
    The spider arguments are the received keyword arguments, overridden by
    the entry for ``spider`` in the ``CRAWLER_SPIDER_ARGUMENTS`` config.

    :param spider: name of the spider to schedule.
    :param workflow: workflow stored on the created ``CrawlerJob``.
    :param kwargs: extra arguments forwarded to ``crawler.schedule``; the
        optional ``crawler_settings`` mapping is also merged into the
        settings sent along with the job.
    :returns: the ``CrawlerJob`` created for the scheduled job.
    :raises CrawlerScheduleError: if the crawler returns no job id.
    """
    from inspire_crawler.utils import get_crawler_instance

    crawler = get_crawler_instance()
    project = current_app.config.get('CRAWLER_PROJECT')

    # Work on a copy: updating the dict stored in the application config in
    # place would leak the overrides of this call into every later call.
    crawler_settings = dict(current_app.config.get('CRAWLER_SETTINGS') or {})
    crawler_settings.update(kwargs.get("crawler_settings") or {})

    # ``kwargs`` is a fresh dict on every call, so updating it is safe.
    # Configured spider arguments take precedence over the caller's ones.
    crawler_arguments = kwargs
    crawler_arguments.update(
        current_app.config.get('CRAWLER_SPIDER_ARGUMENTS', {}).get(spider, {}))

    job_id = crawler.schedule(
        project=project,
        spider=spider,
        settings=crawler_settings,
        **crawler_arguments)
    if not job_id:
        raise CrawlerScheduleError(
            "Could not schedule '{0}' spider for project '{1}'".format(
                spider, project))

    crawler_job = CrawlerJob.create(
        job_id=job_id,
        spider=spider,
        workflow=workflow,
    )
    db.session.commit()
    current_app.logger.info(
        "Scheduled scrapyd job with id: {0}".format(job_id))
    current_app.logger.info("Created crawler job with id:{0}".format(
        crawler_job.id))

    return crawler_job
Пример #2
0
def schedule_crawl(spider, workflow, **kwargs):
    """Schedule a crawl using configuration from the workflow objects.

    The scrapyd settings are built from the ``CRAWLER_SETTINGS`` application
    config, overridden by an optional ``crawler_settings`` keyword argument.
    The spider arguments are the received keyword arguments, overridden by
    the entry for ``spider`` in the ``CRAWLER_SPIDER_ARGUMENTS`` config.

    :param spider: name of the spider to schedule.
    :param workflow: workflow stored on the created ``CrawlerJob``.
    :param kwargs: extra arguments forwarded to ``crawler.schedule``; the
        optional ``crawler_settings`` mapping is also merged into the
        settings sent along with the job.
    :raises CrawlerScheduleError: if the crawler returns no job id.
    """
    from inspire_crawler.utils import get_crawler_instance

    crawler = get_crawler_instance()
    project = current_app.config.get('CRAWLER_PROJECT')

    # Work on a copy: updating the dict stored in the application config in
    # place would leak the overrides of this call into every later call.
    crawler_settings = dict(current_app.config.get('CRAWLER_SETTINGS') or {})
    crawler_settings.update(kwargs.get("crawler_settings") or {})

    # ``kwargs`` is a fresh dict on every call, so updating it is safe.
    # Configured spider arguments take precedence over the caller's ones.
    crawler_arguments = kwargs
    crawler_arguments.update(
        current_app.config.get('CRAWLER_SPIDER_ARGUMENTS', {}).get(spider, {})
    )

    job_id = crawler.schedule(
        project=project,
        spider=spider,
        settings=crawler_settings,
        **crawler_arguments
    )
    if not job_id:
        raise CrawlerScheduleError(
            "Could not schedule '{0}' spider for project '{1}'".format(
                spider, project
            )
        )

    CrawlerJob.create(
        job_id=job_id,
        spider=spider,
        workflow=workflow,
    )
    db.session.commit()
    current_app.logger.info("Scheduled job {0}".format(job_id))
Пример #3
0
def test_utils(app):
    """Check that a truthy crawler instance is returned in an app context."""
    context = app.app_context()
    with context:
        crawler = get_crawler_instance()
        assert crawler
Пример #4
0
def test_utils(app):
    """Smoke test: the crawler factory yields a truthy object."""
    with app.app_context():
        # The factory is called while the application context is active.
        instance = get_crawler_instance()
        assert instance