Example #1
def handle_fault_execution(request, schedule_time, exception):
    unique_id = request["unique_id"]
    job_url = request["url"]
    project_name = request["project_name"]
    job_name = request["job_name"]
    user_id = request["user_id"]
    crawler_name = request["crawler_name"]
    schedule_category = request["schedule_category"]

    # build the failed-job record for the MongoDB jobs collection,
    # marking the task id as not generated and the status as FAILED
    data_item = {
        'unique_id': unique_id,
        'url': job_url,
        'project_name': project_name,
        'job_name': job_name,
        'user_id': user_id,
        'crawler_name': crawler_name,
        'task_id': 'Not Generated',
        'status': 'FAILED',
        'schedule_category': schedule_category,
        'schedule_time': schedule_time
    }

    mongo_connection = MongoConnection()
    if schedule_category == "Instant":
        # update the relevant MongoDB entry in the jobs collection with task_id and status
        query = {'user_id': user_id, 'url': job_url, 'project_name': project_name, 'job_name': job_name,
                 'crawler_name': crawler_name}
        mongo_connection.upsert_item(query, data_item, "jobs")
    else:
        # store job records in MongoDB database
        mongo_connection.insert_item(data_item, "jobs")
    logger.exception("Current can not schedule with invalid date or time format. "
                     "Hence, job execution failed. " + str(exception))
Example #2
def schedule_cron_job(self, **kwargs):
    json_body = ""
    schedule_time = str(time.time())
    try:
        json_body = kwargs
        if "kwargs" in json_body:
            json_body = json.loads(json_body['kwargs'])
        unique_id = json_body['unique_id']
        job_url = json_body["url"]
        project_name = json_body["project_name"]
        job_name = json_body["job_name"]
        user_id = json_body["user_id"]
        status = json_body["status"]
        crawler_name = json_body["crawler_name"]
        schedule_category = json_body["schedule_category"]

        if not unique_id or not job_url or not project_name or not user_id or not status \
                or not crawler_name or not job_name:
            raise Exception('Required parameters are missing in the consumed message')

        if check_url_gives_response(job_url):
            job_domain = urlparse(job_url).netloc
            try:
                settings = {
                    'unique_id': unique_id,
                    'user_id': user_id,
                    'job_name': job_name,
                    'project_name': project_name,
                    'schedule_time': schedule_time,
                    'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
                }

                # to list available spiders in the project
                # print(scrapyd.list_spiders("crawlerx_project"))

                # Schedule a crawl job with project and a specific spider
                task_id = \
                    scrapy_daemon.schedule("crawlerx_project", crawler_name, settings=settings,
                                           url=job_url, domain=job_domain)

                mongo_connection = MongoConnection()

                job_data = u'{ "unique_id": "' + unique_id + '", "url": "' + job_url + '", "project_name": "' \
                           + project_name + '", "job_name": "' + job_name + '", "user_id": "' + user_id \
                           + '", "crawler_name": "' + crawler_name \
                           + '", "task_id": "' + task_id + '", "status": "RUNNING" }'
                data_item = json.loads(job_data)
                data_item['schedule_category'] = schedule_category
                data_item['schedule_time'] = schedule_time

                if schedule_category == "Instant":
                    # update the relevant MongoDB entry in the jobs collection with task_id and status
                    query = {'user_id': user_id, 'url': job_url, 'project_name': project_name, 'job_name': job_name,
                             'crawler_name': crawler_name}
                    mongo_connection.upsert_item(query, data_item, "jobs")
                else:
                    # store job records in MongoDB database
                    mongo_connection.insert_item(data_item, "jobs")

                # task id of the crawl job
                logger.info("Crawling job has been started with ID - " + task_id)
            except Exception as e:
                handle_fault_execution(json_body, schedule_time, e)
        else:
            handle_fault_execution(json_body, schedule_time,
                                   Exception("Current job URL does not seem to be available. Hence, job execution failed."))
    except Exception as e:
        handle_fault_execution(json_body, schedule_time, e)
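
The check_url_gives_response helper and the scrapy_daemon client are also external to this snippet. A plausible sketch of the URL check is shown below, assuming it only needs to verify that the target answers with a non-error status; the requests-based approach, timeout, and accepted status range are assumptions.

import requests


def check_url_gives_response(url):
    # return True if the target URL answers with a non-error status code
    try:
        response = requests.head(url, allow_redirects=True, timeout=10)
        return response.status_code < 400
    except requests.RequestException:
        # connection errors, timeouts, and invalid URLs all count as "not available"
        return False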