def handle_fault_execution(request, schedule_time, exception):
    unique_id = request["unique_id"]
    job_url = request["url"]
    project_name = request["project_name"]
    job_name = request["job_name"]
    user_id = request["user_id"]
    crawler_name = request["crawler_name"]
    schedule_category = request["schedule_category"]
    # record the failed job with a placeholder task_id and a FAILED status
    data_item = {
        "unique_id": unique_id,
        "url": job_url,
        "project_name": project_name,
        "job_name": job_name,
        "user_id": user_id,
        "crawler_name": crawler_name,
        "task_id": "Not Generated",
        "status": "FAILED",
        "schedule_category": schedule_category,
        "schedule_time": schedule_time,
    }
    mongo_connection = MongoConnection()
    if schedule_category == "Instant":
        # update the relevant MongoDB entry in the jobs collection with task_id and status
        query = {'user_id': user_id, 'url': job_url, 'project_name': project_name,
                 'job_name': job_name, 'crawler_name': crawler_name}
        mongo_connection.upsert_item(query, data_item, "jobs")
    else:
        # store the job record in the MongoDB database
        mongo_connection.insert_item(data_item, "jobs")
    logger.exception("Could not schedule the crawl job; job execution failed. " + str(exception))
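
# Illustrative request body for the project_create view below (values are placeholders, not
# taken from a real deployment; the URL path is an assumption and is defined in the project's
# urls.py). The view only requires user_id and project_name; any additional fields in the
# payload are upserted onto the project document as-is.
#
#   POST /project/create
#   Token: <Firebase ID token>
#   {
#       "user_id": "firebase-uid-123",
#       "project_name": "news_crawling"
#   }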
def project_create(request):
    # read the project details sent by the client
    try:
        json_data = json.loads(request.body)
        user_id = json_data['user_id']
        project_name = json_data['project_name']
        if not user_id:
            return JsonResponse(
                {'Error': 'Request payload does not contain user_id'}, status=400)
        if not project_name:
            return JsonResponse(
                {'Error': 'Request payload does not contain project_name'}, status=400)
    except (JSONDecodeError, KeyError):
        return JsonResponse(
            {'Error': 'Request payload does not contain the required parameters or is empty'},
            status=400)
    # user authorization
    token_header = request.headers.get('Token')
    auth = FirebaseAuth(token_header, user_id)
    if not auth:
        return JsonResponse(
            {'Error': 'User authentication failed. Please try again with a valid user login'},
            status=400)
    try:
        mongo_connection = MongoConnection()
        data_item = dict(json_data)
        query = {
            'user_id': data_item['user_id'],
            'project_name': data_item['project_name']
        }
        mongo_connection.upsert_item(query, data_item, "projects")
    except Exception as e:
        return JsonResponse(
            {'Error': 'Error while connecting to the MongoDB database, ' + str(e)},
            status=400)
    return JsonResponse({
        'status': "SUCCESS",
        'Message': 'Project: ' + project_name + ' created successfully'
    })
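
# Illustrative request body for the crawl_new_job view below (field values are placeholders).
# schedule_type must match one of SCHEDULE_TASK_TYPE, INTERVAL_TASK_TYPE or HOT_TASK_TYPE; the
# exact string constants and the shape of schedule_data are defined elsewhere in this project,
# so the values shown here are assumptions.
#
#   {
#       "user_id": "firebase-uid-123",
#       "project_name": "news_crawling",
#       "job_name": "daily_headlines",
#       "urls": ["https://example.com/news"],
#       "crawler_name": "crawlerx_spider",
#       "schedule_type": "Scheduled",
#       "schedule_data": {"minute": "0", "hour": "*/6"}
#   }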
def crawl_new_job(request):
    # POST requests are for new crawling tasks
    if request.method == 'POST':
        # read the crawl job details sent by the client
        try:
            json_data = json.loads(request.body)
            url_data = json_data['urls']
            job_name = json_data['job_name']
            project_name = json_data['project_name']
            user_id = json_data['user_id']
            crawler_name = json_data['crawler_name']
            schedule_type = json_data['schedule_type']
            schedule_data = json_data['schedule_data']
        except (JSONDecodeError, KeyError) as e:
            return JsonResponse({'Error': 'Missing or empty fields in the request payload, ' + str(e)},
                                status=400)
        if not user_id:
            return JsonResponse({'Error': 'Missing user_id key in the request payload'}, status=400)
        if not job_name:
            return JsonResponse({'Error': 'Missing job_name key in the request payload'}, status=400)
        if not url_data:
            return JsonResponse({'Error': 'Missing urls key in the request payload'}, status=400)
        if not project_name:
            return JsonResponse({'Error': 'Missing project_name key in the request payload'}, status=400)
        if not crawler_name:
            return JsonResponse({'Error': 'Missing crawler_name key in the request payload'}, status=400)
        if not schedule_type:
            return JsonResponse({'Error': 'Missing schedule_type key in the request payload'}, status=400)
        if schedule_type not in (SCHEDULE_TASK_TYPE, INTERVAL_TASK_TYPE, HOT_TASK_TYPE):
            return JsonResponse({'Error': 'Requested schedule_type: ' + schedule_type + ' is not a valid type'},
                                status=400)
        publish_url_ids = []
        for url in url_data:
            if not is_valid_url(url):
                return JsonResponse({'Error': url + ' URL is invalid'}, status=400)
            unique_id = str(uuid4())  # create a unique ID for this URL's crawl job
            publish_data = {
                "unique_id": unique_id,
                "job_name": job_name,
                "url": url,
                "project_name": project_name,
                "user_id": user_id,
                "crawler_name": crawler_name,
                "task_id": ""
            }
            try:
                # schedule the job with the Celery task scheduler
                if schedule_type == SCHEDULE_TASK_TYPE:
                    publish_data['schedule_data'] = schedule_data
                    publish_data['schedule_category'] = CRON
                    publish_data['status'] = "RUNNING"
                    celery_task = schedule_job_with_cron_tab(publish_data)
                elif schedule_type == INTERVAL_TASK_TYPE:
                    publish_data['schedule_data'] = schedule_data
                    publish_data['schedule_category'] = INTERVAL
                    publish_data['status'] = "RUNNING"
                    celery_task = schedule_job_with_interval(publish_data)
                else:
                    publish_data['schedule_category'] = INSTANT
                    publish_data['status'] = "PENDING"
                    celery_task = schedule_cron_job.delay(kwargs=json.dumps(publish_data))
                if isinstance(celery_task, JsonResponse):
                    return celery_task
                publish_url_ids.append(unique_id)
                publish_data['celery_task_name'] = celery_task.name
                try:
                    # store the job record in the MongoDB database
                    query = {'user_id': user_id, 'job_name': job_name, 'url': url,
                             'project_name': project_name, 'crawler_name': crawler_name}
                    mongo_connection = MongoConnection()
                    mongo_connection.upsert_item(query, publish_data, "jobs")
                except Exception as e:
                    return JsonResponse({'Error': 'Error while connecting to the MongoDB database, ' + str(e)},
                                        status=400)
            except Exception as e:
                return JsonResponse({'Status': "400 BAD",
                                     'Error': 'Error occurred while scheduling the data with the Celery executor, '
                                              + str(e)},
                                    status=400)
        return JsonResponse({'status': "SUCCESS",
                             'Message': "Crawl job scheduled successfully.\n job_ids:" + str(publish_url_ids)})
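
# schedule_cron_job below is invoked by crawl_new_job as a Celery task, with the job details
# serialized into a single "kwargs" JSON string:
#
#   schedule_cron_job.delay(kwargs=json.dumps(publish_data))
#
# The bound `self` parameter implies it is registered with something like
# @shared_task(bind=True); the actual decorator lives elsewhere in this module, so the exact
# registration shown here is an assumption.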
def schedule_cron_job(self, **kwargs):
    json_body = ""
    schedule_time = str(time.time())
    try:
        json_body = kwargs
        if "kwargs" in json_body:
            json_body = json.loads(json_body['kwargs'])
        unique_id = json_body['unique_id']
        job_url = json_body["url"]
        project_name = json_body["project_name"]
        job_name = json_body["job_name"]
        user_id = json_body["user_id"]
        status = json_body["status"]
        crawler_name = json_body["crawler_name"]
        schedule_category = json_body["schedule_category"]
        if not unique_id or not job_url or not project_name or not user_id or not status \
                or not crawler_name or not job_name:
            raise Exception('Required parameters are missing in the consumed message')
        if check_url_gives_response(job_url):
            job_domain = urlparse(job_url).netloc
            try:
                settings = {
                    'unique_id': unique_id,
                    'user_id': user_id,
                    'job_name': job_name,
                    'project_name': project_name,
                    'schedule_time': schedule_time,
                    'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
                }
                # to list the spiders available in the project:
                # print(scrapyd.list_spiders("crawlerx_project"))
                # schedule a crawl job with the project and a specific spider
                task_id = scrapy_daemon.schedule("crawlerx_project", crawler_name,
                                                 settings=settings, url=job_url, domain=job_domain)
                mongo_connection = MongoConnection()
                data_item = {
                    "unique_id": unique_id,
                    "url": job_url,
                    "project_name": project_name,
                    "job_name": job_name,
                    "user_id": user_id,
                    "crawler_name": crawler_name,
                    "task_id": task_id,
                    "status": "RUNNING",
                    "schedule_category": schedule_category,
                    "schedule_time": schedule_time,
                }
                if schedule_category == "Instant":
                    # update the relevant MongoDB entry in the jobs collection with task_id and status
                    query = {'user_id': user_id, 'url': job_url, 'project_name': project_name,
                             'job_name': job_name, 'crawler_name': crawler_name}
                    mongo_connection.upsert_item(query, data_item, "jobs")
                else:
                    # store the job record in the MongoDB database
                    mongo_connection.insert_item(data_item, "jobs")
                # task ID of the scheduled crawl job
                logger.info("Crawling job has been started with ID - " + task_id)
            except Exception as e:
                handle_fault_execution(json_body, schedule_time, e)
        else:
            handle_fault_execution(json_body, schedule_time,
                                   Exception("Current job URL does not seem to be available. "
                                             "Hence, job execution failed."))
    except Exception as e:
        handle_fault_execution(json_body, schedule_time, e)