Example #1
def delete_crawl_job(request, job_id):
    if request.method == 'DELETE':
        try:
            mongo_connection = MongoConnection()
            json_data = mongo_connection.get_items("jobs", {'unique_id': job_id})
            if len(json_data) == 0:
                return JsonResponse({'Error': 'Requested job id ' + str(job_id) + ' does not exist'}, status=400)

            # multiple records with the same unique_id indicate an interval or cron job
            celery_task_name = ""
            if len(json_data) > 1:
                for obj in json_data:
                    if 'celery_task_name' in obj:
                        celery_task_name = obj['celery_task_name']
                        break

            delete_count = 0
            if json_data[0]['schedule_category'] == INSTANT:
                delete_count = mongo_connection.delete_items("jobs", {'unique_id': job_id})
            else:
                # delete scheduled task from django beat
                if not celery_task_name:
                    celery_task_name = json_data[0]['celery_task_name']
                delete_schedule_job(celery_task_name)
                delete_count = mongo_connection.delete_items("jobs", {'unique_id': job_id})
            if delete_count == 0:
                return JsonResponse({'Error': 'Delete action failed for the job_id: ' + str(job_id)}, status=400)

        except Exception as e:
            return JsonResponse({'Error': 'Error while deleting the job from the database, ' + str(e)}, status=400)

        return JsonResponse({'Status': "SUCCESS", 'Message': 'Crawl job deleted successfully'})
Example #2
def get_projects(request):

    try:
        json_data = json.loads(request.body)
        user_id = json_data['user_id']
        if not user_id:
            return JsonResponse(
                {'Error': 'Request payload does not contain user_id'},
                status=400)

    except (JSONDecodeError, KeyError):
        return JsonResponse(
            {
                'Error':
                'Request payload does not contain required parameters or empty'
            },
            status=400)

    try:
        mongo_connection = MongoConnection()
        json_data = mongo_connection.get_items("projects",
                                               {'user_id': user_id})
    except Exception as e:
        return JsonResponse(
            {
                'Error':
                'Error while getting project details from the database, ' +
                str(e)
            },
            status=400)

    return JsonResponse({'Status': "SUCCESS", 'data': json_data})
Example #3
def get_jobs_by_project(request):
    try:
        json_data = json.loads(request.body)
        user_id = json_data['user_id']
        project_name = json_data['project_name']
    except (JSONDecodeError, KeyError) as e:
        return JsonResponse({
            'Error':
            'Request payload does not contain required parameters or empty, ' + str(e)
        }, status=400)

    if not user_id:
        return JsonResponse(
            {'Error': 'Request payload does not contain user_id'}, status=400)

    if not project_name:
        return JsonResponse(
            {'Error': 'Request payload does not contain project_name'}, status=400)

    # fetch jobs for the given user and project.
    try:
        mongo_connection = MongoConnection()
        json_data = mongo_connection.get_items("jobs", {
            'user_id': user_id,
            'project_name': project_name
        })
    except Exception as e:
        return JsonResponse({
            'Error':
            'Error while getting job details from the database, ' + str(e)
        }, status=400)

    return JsonResponse({'status': "SUCCESS", 'data': json_data})
Example #4
def project_create(request):
    # parse project details from the client request body.
    try:
        json_data = json.loads(request.body)
        user_id = json_data['user_id']
        project_name = json_data['project_name']
        if not user_id:
            return JsonResponse(
                {'Error': 'Request payload does not contain user_id'},
                status=400)

        if not project_name:
            return JsonResponse(
                {'Error': 'Request payload does not contain project_name'},
                status=400)

    except (JSONDecodeError, KeyError):
        return JsonResponse(
            {
                'Error':
                'Request payload does not contain required parameters or empty'
            },
            status=400)

    # user authorization via Firebase token
    token_header = request.headers.get('Token')
    auth = FirebaseAuth(token_header, user_id)

    if not auth:
        return JsonResponse(
            {
                'Error':
                'User authentication failed. Please try again with a valid user login'
            },
            status=400)

    try:
        mongo_connection = MongoConnection()
        data_item = dict(json_data)
        query = {
            'user_id': data_item['user_id'],
            'project_name': data_item['project_name']
        }
        mongo_connection.upsert_item(query, data_item, "projects")
    except Exception as e:
        return JsonResponse(
            {
                'Error':
                'Error while connecting to the MongoDB database, ' + str(e)
            },
            status=400)

    return JsonResponse({
        'Status': "SUCCESS",
        'Message': 'Project: ' + project_name + ' created successfully'
    })
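FirebaseAuth is also undefined here. One plausible shape, sketched with the firebase_admin SDK: verify the ID token and check that it was issued to the claimed user. The callable name and boolean return convention are inferred from how it is used above.

from firebase_admin import auth


def FirebaseAuth(token_header, user_id):
    # Assumes firebase_admin.initialize_app() was called at startup.
    # Returns True only when the token is valid and belongs to user_id.
    try:
        decoded_token = auth.verify_id_token(token_header)
        return decoded_token.get('uid') == user_id
    except Exception:
        # Invalid, expired, or revoked token.
        return False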
Example #5
def delete_crawl_task(request, task_id):
    if request.method == 'DELETE':
        try:
            mongo_connection = MongoConnection()
            delete_count = mongo_connection.delete_items("jobs", {'task_id': task_id})
            if delete_count == 0:
                return JsonResponse({'Error': 'Delete action failed for the task_id: ' + str(task_id)}, status=400)

        except Exception as e:
            return JsonResponse({'Error': 'Error while deleting the job from the database, ' + str(e)}, status=400)

        return JsonResponse({'Status': "SUCCESS", 'Message': 'Crawl job deleted successfully'})
Example #6
def get_job_data(request):
    # parse the job lookup filters from the client request body.
    try:
        json_data = json.loads(request.body)
    except JSONDecodeError as e:
        return JsonResponse({'Error': 'Request payload is not valid JSON or empty, ' + str(e)}, status=400)

    if "user_id" not in json_data:
        return JsonResponse({'Error': 'Missing user id key in the request payload'}, status=400)

    if ("task_id" not in json_data) and ("unique_id" not in json_data):
        return JsonResponse({'Error': 'Missing unique_id or task_id key in the request payload'}, status=400)

    try:
        mongo_connection = MongoConnection()
        json_data = mongo_connection.get_items("jobs", json_data)
    except Exception as e:
        return JsonResponse({'Error': 'Error while getting job details from the database, ' + str(e)}, status=400)

    return JsonResponse({'Status': "SUCCESS", 'data': json_data})
Example #7
def disable_schedule_job(request):

    if request.method == 'POST':
        task_name = ""
        try:
            json_data = json.loads(request.body)
            task_name = json_data['celery_task_name']
            is_enabled = json_data['is_enabled']
            task = PeriodicTask.objects.filter(name=task_name).first()
            if not task:
                return JsonResponse({'Error': 'Scheduled task_name: ' + task_name + ' is invalid or does not exist'}
                                    , status=400)
            if not isinstance(is_enabled, bool):
                return JsonResponse({'Error': 'is_enabled: ' + str(is_enabled) + ' is not a valid boolean'}
                                    , status=400)
            task.enabled = is_enabled
            task.save()
            mongo_connection = MongoConnection()
            if is_enabled:
                mongo_connection.update_item({"celery_task_name": task_name, "status": "DISABLED"},
                                             {'$set': {"status": "RUNNING"}}, "jobs")
                value = "enabled"
            else:
                mongo_connection.update_item({"celery_task_name": task_name, "status": "RUNNING"},
                                             {'$set': {"status": "DISABLED"}}, "jobs")
                value = "disabled"
            return JsonResponse({'Status': "SUCCESS",
                                 'Message': 'Successfully ' + value + ' the scheduled task_name: ' + task_name})
        except Exception as e:
            return JsonResponse({'Status': "400 BAD",
                                 'Error': 'Error occurred while toggling the scheduled task task_name: '
                                          + task_name + ". " + str(e)}, status=400)

    # reject non-POST requests explicitly instead of returning None
    return JsonResponse({'Error': 'Only POST method is allowed'}, status=405)
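PeriodicTask above comes from django-celery-beat; the schedule_job_with_interval helper called in Example #10 would create those rows. A minimal sketch, assuming schedule_data carries the interval in minutes (the field name and the dotted task path are assumptions):

import json

from django_celery_beat.models import IntervalSchedule, PeriodicTask


def schedule_job_with_interval(publish_data):
    minutes = int(publish_data['schedule_data']['minutes'])  # assumed field name
    schedule, _ = IntervalSchedule.objects.get_or_create(
        every=minutes,
        period=IntervalSchedule.MINUTES,
    )
    # One beat entry per crawl job; the unique_id keeps names collision-free.
    # The kwargs payload mirrors how schedule_cron_job unpacks its 'kwargs' key.
    return PeriodicTask.objects.create(
        interval=schedule,
        name='crawl-job-' + publish_data['unique_id'],
        task='crawlerx.tasks.schedule_cron_job',  # hypothetical dotted task path
        kwargs=json.dumps({'kwargs': json.dumps(publish_data)}),
    )

The returned object exposes a .name attribute, which matches how crawl_new_job reads celery_task.name after scheduling.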
Example #8
def handle_fault_execution(request, schedule_time, exception):
    # 'request' here is the parsed job payload (a dict), not a Django HttpRequest
    unique_id = request["unique_id"]
    job_url = request["url"]
    project_name = request["project_name"]
    job_name = request["job_name"]
    user_id = request["user_id"]
    crawler_name = request["crawler_name"]
    schedule_category = request["schedule_category"]

    # update the relevant MongoDB entry in the jobs collection with task_id and status
    data_item = {
        'unique_id': unique_id,
        'url': job_url,
        'project_name': project_name,
        'job_name': job_name,
        'user_id': user_id,
        'crawler_name': crawler_name,
        'task_id': 'Not Generated',
        'status': 'FAILED',
        'schedule_category': schedule_category,
        'schedule_time': schedule_time
    }

    mongo_connection = MongoConnection()
    if schedule_category == "Instant":
        # update the relevant MongoDB entry in the jobs collection with task_id and status
        query = {'user_id': user_id, 'url': job_url, 'project_name': project_name, 'job_name': job_name,
                 'crawler_name': crawler_name}
        mongo_connection.upsert_item(query, data_item, "jobs")
    else:
        # store job records in MongoDB database
        mongo_connection.insert_item(data_item, "jobs")
    logger.exception("Current can not schedule with invalid date or time format. "
                     "Hence, job execution failed. " + str(exception))
Example #9
def get_jobs(request):
    try:
        json_data = json.loads(request.body)
    except JSONDecodeError as e:
        return JsonResponse(
            {
                'Error':
                'Request payload is not valid JSON or empty, ' + str(e)
            },
            status=400)

    # query jobs matching the client-supplied filters.
    try:
        mongo_connection = MongoConnection()
        json_data = mongo_connection.get_items("jobs", json_data)
    except Exception as e:
        return JsonResponse(
            {
                'Error':
                'Error while getting job details from the database, ' + str(e)
            },
            status=400)

    return JsonResponse({'Status': "SUCCESS", 'data': json_data})
Example #10
def crawl_new_job(request):
    # Post requests are for new crawling tasks
    if request.method == 'POST':

        # parse crawl job details from the client request body.
        try:
            json_data = json.loads(request.body)
            url_data = json_data['urls']
            job_name = json_data['job_name']
            project_name = json_data['project_name']
            user_id = json_data['user_id']
            crawler_name = json_data['crawler_name']
            schedule_type = json_data['schedule_type']
            schedule_data = json_data['schedule_data']
        except (JSONDecodeError, KeyError) as e:
            return JsonResponse({'Error': 'Missing fields in the request payload or empty, ' + str(e)}, status=400)

        if not user_id:
            return JsonResponse({'Error': 'Missing user id key in the request payload'}, status=400)

        if not job_name:
            return JsonResponse({'Error': 'Missing job name key in the request payload'}, status=400)

        if not url_data:
            return JsonResponse({'Error': 'Missing urls key in the request payload'}, status=400)

        if not project_name:
            return JsonResponse({'Error': 'Missing project_name key in the request payload'}, status=400)

        if not crawler_name:
            return JsonResponse({'Error': 'Missing crawler_name key in the request payload'}, status=400)

        if not schedule_type:
            return JsonResponse({'Error': 'Missing schedule_type key in the request payload'}, status=400)

        if schedule_type not in (SCHEDULE_TASK_TYPE, INTERVAL_TASK_TYPE, HOT_TASK_TYPE):
            return JsonResponse({'Error': 'Requested schedule_type: ' + schedule_type + ' is not a valid type'},
                                status=400)

        publish_url_ids = []
        for url in url_data:
            if not is_valid_url(url):
                return JsonResponse({'Error': url + ' URL is invalid'}, status=400)

            unique_id = str(uuid4())  # create a unique ID.
            publish_data = u'{ "unique_id": "' + unique_id + '", "job_name": "' + job_name \
                           + '", "url": "' + url + '", "project_name": "' \
                           + project_name + '", "user_id": "' + user_id + '", "crawler_name": "' + crawler_name \
                           + '", "task_id":"" }'

            publish_data = json.loads(publish_data)
            try:
                # schedule data with celery task scheduler
                if schedule_type == SCHEDULE_TASK_TYPE:
                    publish_data['schedule_data'] = schedule_data
                    publish_data['schedule_category'] = CRON
                    publish_data['status'] = "RUNNING"
                    celery_task = schedule_job_with_cron_tab(publish_data)
                elif schedule_type == INTERVAL_TASK_TYPE:
                    publish_data['schedule_data'] = schedule_data
                    publish_data['schedule_category'] = INTERVAL
                    publish_data['status'] = "RUNNING"
                    celery_task = schedule_job_with_interval(publish_data)
                else:
                    publish_data['schedule_category'] = INSTANT
                    publish_data['status'] = "PENDING"
                    celery_task = schedule_cron_job.delay(kwargs=json.dumps(publish_data))

                if isinstance(celery_task, JsonResponse):
                    return celery_task

                publish_url_ids.append(unique_id)
                publish_data['celery_task_name'] = celery_task.name
                try:
                    # store job records in MongoDB database
                    query = {'user_id': user_id, 'job_name': job_name, 'url': url,
                             'project_name': project_name, 'crawler_name': crawler_name}
                    mongo_connection = MongoConnection()
                    mongo_connection.upsert_item(query, publish_data, "jobs")
                except Exception as e:
                    return JsonResponse({'Error': 'Error while connecting to the MongoDB database, ' + str(e)},
                                        status=400)
            except Exception as e:
                return JsonResponse({'Status': "400 BAD",
                                     'Error': 'Error occurred while scheduling the data with the Celery executor, '
                                              + str(e)}, status=400)

        return JsonResponse({'status': "SUCCESS", 'Message': "Crawl job scheduled successfully.\n job_ids:"
                                                             + str(publish_url_ids)})
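is_valid_url is another helper the snippet assumes. A minimal standard-library sketch that accepts only absolute http(s) URLs with a host part:

from urllib.parse import urlparse


def is_valid_url(url):
    # Accept only absolute http/https URLs that include a network location.
    parsed = urlparse(url)
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)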
Example #11
# presumably a bound Celery task (registered with bind=True); the decorator is not shown in this snippet
def schedule_cron_job(self, **kwargs):
    json_body = ""
    schedule_time = str(time.time())
    try:
        json_body = kwargs
        if "kwargs" in json_body:
            json_body = json.loads(json_body['kwargs'])
        unique_id = json_body['unique_id']
        job_url = json_body["url"]
        project_name = json_body["project_name"]
        job_name = json_body["job_name"]
        user_id = json_body["user_id"]
        status = json_body["status"]
        crawler_name = json_body["crawler_name"]
        schedule_category = json_body["schedule_category"]

        if not unique_id or not job_url or not project_name or not user_id or not status \
                or not crawler_name or not job_name:
            raise Exception('Required parameters are missing in the consumed message')

        if check_url_gives_response(job_url):
            job_domain = urlparse(job_url).netloc
            try:
                settings = {
                    'unique_id': unique_id,
                    'user_id': user_id,
                    'job_name': job_name,
                    'project_name': project_name,
                    'schedule_time': schedule_time,
                    'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
                }

                # to list available spiders in the project
                # print(scrapyd.list_spiders("crawlerx_project"))

                # Schedule a crawl job with project and a specific spider
                task_id = \
                    scrapy_daemon.schedule("crawlerx_project", crawler_name, settings=settings,
                                           url=job_url, domain=job_domain)

                mongo_connection = MongoConnection()

                job_data = u'{ "unique_id": "' + unique_id + '", "url": "' + job_url + '", "project_name": "' \
                           + project_name + '", "job_name": "' + job_name + '", "user_id": "' + user_id \
                           + '", "crawler_name": "' + crawler_name \
                           + '", "task_id": "' + task_id + '", "status": "RUNNING" }'
                data_item = json.loads(job_data)
                data_item['schedule_category'] = schedule_category
                data_item['schedule_time'] = schedule_time

                if schedule_category == "Instant":
                    # update the relevant MongoDB entry in the jobs collection with task_id and status
                    query = {'user_id': user_id, 'url': job_url, 'project_name': project_name, 'job_name': job_name,
                             'crawler_name': crawler_name}
                    mongo_connection.upsert_item(query, data_item, "jobs")
                else:
                    # store job records in MongoDB database
                    mongo_connection.insert_item(data_item, "jobs")

                # task id of the crawl job
                logger.info("Crawling job has been started with ID - " + task_id)
            except Exception as e:
                handle_fault_execution(json_body, schedule_time, e)
        else:
            handle_fault_execution(json_body, schedule_time,
                                   Exception("Current job URL does not seem to be available. Hence, job execution failed."))
    except Exception as e:
        handle_fault_execution(json_body, schedule_time, e)
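check_url_gives_response and the scrapy_daemon client are not shown either. A sketch, assuming requests for the liveness probe and python-scrapyd-api for the Scrapyd client (its schedule() call matches the usage above); the Scrapyd endpoint is a placeholder:

import requests
from scrapyd_api import ScrapydAPI

# Scrapyd endpoint is an assumption; point it at the real daemon.
scrapy_daemon = ScrapydAPI('http://localhost:6800')


def check_url_gives_response(url):
    # Treat any HTTP status below 400 as available; HEAD avoids
    # downloading the body just to probe the URL.
    try:
        response = requests.head(url, timeout=10, allow_redirects=True)
        return response.status_code < 400
    except requests.RequestException:
        return False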