Example #1
0
    def post(self, request, *args, **kwargs):
        """Handle crawl control actions submitted via POST.

        Recognised ``action`` values: ``start``, ``stop``, ``ccadump``,
        ``dump``, ``force_stop`` and ``status``.  Any other action falls
        through to a JSON echo of the request arguments.
        """
        crawl_object = self.get_object()
        # Read the action once instead of re-indexing request.POST in
        # every branch (a missing key raises exactly as before).
        action = request.POST['action']

        # Start
        if action == "start":
            # Try to ping celery to see if it is ready. If the response is
            # an empty list, status is CELERY ERROR. If there is an error
            # connecting to redis, status is REDIS ERROR.
            try:
                celery_status = "READY" if celery.current_app.control.ping() else "CELERY ERROR"
            except ConnectionError:
                celery_status = "REDIS ERROR"
            if celery_status in ["REDIS ERROR", "CELERY ERROR"]:
                # Record the broker/worker failure on the crawl and report it.
                crawl_object.status = celery_status
                crawl_object.save()
                return HttpResponse(json.dumps(dict(
                        status=crawl_object.status,
                        )),
                    content_type="application/json")
            else:
                crawl_object.status = "STARTING"
                crawl_object.save()
                if crawl_object.crawler == "ache":
                    ache.delay(crawl_object)
                else:
                    # Nutch runs a caller-supplied number of rounds.
                    crawl_object.rounds_left = int(request.POST["rounds"])
                    crawl_object.save()
                    nutch.delay(crawl_object)
                return HttpResponse(json.dumps(dict(
                        status=crawl_object.status,
                        )),
                    content_type="application/json")

        # Stop
        elif action == "stop":
            crawl_path = crawl_object.get_crawl_path()
            if crawl_object.crawler == "ache":
                # Ache is stopped by killing its process group (SIGKILL).
                crawl_object.status = 'STOPPED'
                crawl_object.save()
                os.killpg(crawl_object.celerytask.pid, 9)
            if crawl_object.crawler == "nutch":
                # Nutch: drop to one remaining round and write a 'stop'
                # marker file into the crawl directory.
                crawl_object.rounds_left = 1
                crawl_object.save()
                touch(join(crawl_path, 'stop'))
            return HttpResponse(json.dumps(dict(
                    status="STOP SIGNAL SENT")),
                content_type="application/json")

        # Common Crawl Dump
        elif action == "ccadump":
            crawl_object.status = "DUMPING"
            crawl_object.save()
            cca_dump(self.get_object())
            return HttpResponse("Success")

        # Dump Images
        elif action == "dump":
            self.dump_images()
            return HttpResponse("Success")

        # Force Stop Nutch
        elif action == "force_stop":
            # Both write the stop marker and SIGKILL the process group.
            touch(join(crawl_object.get_crawl_path(), 'stop'))
            os.killpg(crawl_object.celerytask.pid, 9)
            crawl_object.status = "FORCE STOPPED"
            crawl_object.save()
            return HttpResponse(json.dumps(dict(
                    status="FORCE STOPPED")),
                content_type="application/json")

        # Update status, statistics
        elif action == "status":
            # Error/terminal states are left untouched; only live crawls
            # refresh their status from the backing celery task.
            if crawl_object.status not in ["REDIS ERROR", "CELERY ERROR", "NOT STARTED", "STOPPED", "FORCE STOPPED"]:
                crawl_object.status = crawl_object.celerytask.task.status
                crawl_object.save()
            if crawl_object.crawler == "ache":
                ache_log_statistics(crawl_object)
            return HttpResponse(json.dumps(dict(
                    status=crawl_object.status,
                    harvest_rate=crawl_object.harvest_rate,
                    pages_crawled=crawl_object.pages_crawled,
                    rounds_left=crawl_object.rounds_left,
                    )),
                content_type="application/json")

        # Unknown action: reflect the request back (debug aid).
        return HttpResponse(json.dumps(dict(
                args=args,
                kwargs=kwargs,
                post=request.POST)),
            content_type="application/json")
Example #2
0
    def post(self, request, *args, **kwargs):
        """Dispatch a crawl control action: start, stop, dump,
        force_stop or status.  Unrecognised actions echo the request
        back as JSON (testing aid)."""
        crawl_object = self.get_object()
        action = request.POST['action']
        json_type = "application/json"

        # Start
        if action == "start":
            crawl_object.status = "STARTING"
            crawl_object.save()
            if crawl_object.crawler == "ache":
                ache.delay(crawl_object)
            else:
                # Nutch runs a caller-supplied number of rounds.
                crawl_object.rounds_left = int(request.POST["rounds"])
                crawl_object.save()
                nutch.delay(crawl_object)
            return HttpResponse(json.dumps({"status": "STARTING"}),
                                content_type=json_type)

        # Stop
        if action == "stop":
            crawl_path = crawl_object.get_crawl_path()
            if crawl_object.crawler == "ache":
                crawl_object.status = 'STOPPED'
                crawl_object.save()
                os.killpg(crawl_object.crawltask.pid, 9)
            if crawl_object.crawler == "nutch":
                crawl_object.rounds_left = 1
                crawl_object.save()
                touch(join(crawl_path, 'stop'))
            return HttpResponse(json.dumps({"status": "STOP SIGNAL SENT"}),
                                content_type=json_type)

        # Dump Images
        if action == "dump":
            self.dump_images()
            return HttpResponse("Success")

        # Force Stop Nutch
        if action == "force_stop":
            touch(join(crawl_object.get_crawl_path(), 'stop'))
            os.killpg(crawl_object.crawltask.pid, 9)
            crawl_object.status = "FORCE STOPPED"
            crawl_object.save()
            return HttpResponse(json.dumps({"status": "FORCE STOPPED"}),
                                content_type=json_type)

        # Update status, statistics
        if action == "status":
            # Terminal states are not refreshed from the celery task.
            if crawl_object.status not in ["NOT STARTED", "STOPPED", "FORCE STOPPED"]:
                crawl_object.status = crawl_object.crawltask.task.status
                crawl_object.save()
            if crawl_object.crawler == "ache":
                ache_log_statistics(crawl_object)
            payload = {
                "status": crawl_object.status,
                "harvest_rate": crawl_object.harvest_rate,
                "pages_crawled": crawl_object.pages_crawled,
                "rounds_left": crawl_object.rounds_left,
            }
            return HttpResponse(json.dumps(payload), content_type=json_type)

        # TESTING reflect POST request
        return HttpResponse(
            json.dumps({"args": args, "kwargs": kwargs, "post": request.POST}),
            content_type=json_type)
Example #3
0
    def post(self, request, *args, **kwargs):
        """Handle crawl control actions submitted via POST.

        Recognised ``action`` values: ``start``, ``stop``, ``ccadump``,
        ``dump``, ``force_stop`` and ``status``.  Any other action falls
        through to a JSON echo of the request arguments.
        """
        crawl_object = self.get_object()
        # Read the action once instead of re-indexing request.POST in
        # every branch (a missing key raises exactly as before).
        action = request.POST['action']

        # Start
        if action == "start":
            # Try to ping celery to see if it is ready. If the response is
            # an empty list, status is CELERY ERROR. If there is an error
            # connecting to redis, status is REDIS ERROR.
            try:
                celery_status = "READY" if celery.current_app.control.ping(
                ) else "CELERY ERROR"
            except ConnectionError:
                celery_status = "REDIS ERROR"
            if celery_status in ["REDIS ERROR", "CELERY ERROR"]:
                # Record the broker/worker failure on the crawl and report it.
                crawl_object.status = celery_status
                crawl_object.save()
                return HttpResponse(json.dumps(
                    dict(status=crawl_object.status, )),
                                    content_type="application/json")
            else:
                crawl_object.status = "STARTING"
                crawl_object.save()
                if crawl_object.crawler == "ache":
                    ache.delay(crawl_object)
                else:
                    # Nutch runs a caller-supplied number of rounds.
                    crawl_object.rounds_left = int(request.POST["rounds"])
                    crawl_object.save()
                    nutch.delay(crawl_object)
                return HttpResponse(json.dumps(
                    dict(status=crawl_object.status, )),
                                    content_type="application/json")

        # Stop
        elif action == "stop":
            crawl_path = crawl_object.get_crawl_path()
            if crawl_object.crawler == "ache":
                # Ache is stopped by killing its process group (SIGKILL).
                crawl_object.status = "STOPPED"
                crawl_object.save()
                os.killpg(crawl_object.celerytask.pid, 9)
            if crawl_object.crawler == "nutch":
                # Nutch: mark FINISHING, drop to one remaining round and
                # write a 'stop' marker file into the crawl directory.
                crawl_object.status = "FINISHING"
                crawl_object.rounds_left = 1
                crawl_object.save()
                touch(join(crawl_path, 'stop'))
            return HttpResponse(json.dumps(dict(status="STOP SIGNAL SENT")),
                                content_type="application/json")

        # Common Crawl Dump
        elif action == "ccadump":
            crawl_object.status = "DUMPING"
            crawl_object.save()
            cca_dump(self.get_object())
            return HttpResponse("Success")

        # Dump Images
        elif action == "dump":
            self.dump_images()
            return HttpResponse("Success")

        # Force Stop Nutch
        elif action == "force_stop":
            # Both write the stop marker and SIGKILL the process group.
            touch(join(crawl_object.get_crawl_path(), 'stop'))
            os.killpg(crawl_object.celerytask.pid, 9)
            crawl_object.status = "FORCE STOPPED"
            crawl_object.save()
            return HttpResponse(json.dumps(dict(status="FORCE STOPPED")),
                                content_type="application/json")

        # Update status, statistics
        elif action == "status":
            # Do not update the status if the current status is any of
            # the following. This is to prevent errors or interface problems
            # when checking the status of a celery task.
            no_go_statuses = [
                "FINISHING", "STOPPING", "REDIS ERROR", "CELERY ERROR",
                "NOT STARTED", "STOPPED", "FORCE STOPPED"
            ]
            if crawl_object.status not in no_go_statuses:
                crawl_object.status = crawl_object.celerytask.task.status
                crawl_object.save()
            if crawl_object.crawler == "ache":
                ache_log_statistics(crawl_object)
            return HttpResponse(json.dumps(
                dict(
                    status=crawl_object.status,
                    harvest_rate=crawl_object.harvest_rate,
                    pages_crawled=crawl_object.pages_crawled,
                    rounds_left=crawl_object.rounds_left,
                )),
                                content_type="application/json")

        # Unknown action: reflect the request back (debug aid).
        return HttpResponse(json.dumps(
            dict(args=args, kwargs=kwargs, post=request.POST)),
                            content_type="application/json")
Example #4
0
    def post(self, request, *args, **kwargs):
        """Handle crawl control actions submitted via POST.

        Recognised ``action`` values: ``start``, ``stop``, ``ccadump``,
        ``dump``, ``force_stop`` and ``status``.  Any other action falls
        through to a JSON echo of the request arguments.
        """
        crawl_object = self.get_object()
        # Read the action once instead of re-indexing request.POST in
        # every branch (a missing key raises exactly as before).
        action = request.POST['action']

        # Start
        if action == "start":
            # Try to ping celery to see if it is ready. If the response is
            # an empty list, status is CELERY ERROR. If there is an error
            # connecting to redis, status is REDIS ERROR.
            try:
                celery_status = "READY" if celery.current_app.control.ping() else "CELERY ERROR"
            except ConnectionError:
                celery_status = "REDIS ERROR"
            if celery_status in ["REDIS ERROR", "CELERY ERROR"]:
                # Record the broker/worker failure on the crawl and report it.
                crawl_object.status = celery_status
                crawl_object.save()
                return HttpResponse(json.dumps(dict(
                        status=crawl_object.status,
                        )),
                    content_type="application/json")
            else:
                crawl_object.status = "STARTING"
                crawl_object.save()
                if crawl_object.crawler == "ache":
                    ache.delay(crawl_object)
                else:
                    # Nutch runs a caller-supplied number of rounds.
                    crawl_object.rounds_left = int(request.POST["rounds"])
                    crawl_object.save()
                    nutch.delay(crawl_object)
                return HttpResponse(json.dumps(dict(
                        status=crawl_object.status,
                        )),
                    content_type="application/json")

        # Stop
        elif action == "stop":
            crawl_path = crawl_object.get_crawl_path()
            if crawl_object.crawler == "ache":
                # Ache is stopped by killing its process group (SIGKILL).
                crawl_object.status = "STOPPED"
                crawl_object.save()
                os.killpg(crawl_object.celerytask.pid, 9)
            if crawl_object.crawler == "nutch":
                # Nutch stopping here is status-driven only: no stop file
                # is written and no process is killed in this variant.
                crawl_object.status = "STOPPING"
                crawl_object.save()
            return HttpResponse(json.dumps(dict(
                    status="STOPPING")),
                content_type="application/json")

        # Common Crawl Dump
        elif action == "ccadump":
            crawl_object.status = "DUMPING"
            crawl_object.save()
            # NOTE(review): cca_dump appears to run synchronously here,
            # since status is flipped to SUCCESS immediately after.
            cca_dump(self.get_object())
            crawl_object.status = "SUCCESS"
            crawl_object.save()
            return HttpResponse("Success")

        # Dump Images
        elif action == "dump":
            # TODO - restore dump_images
            return HttpResponse("Success")

        # Update status, statistics
        elif action == "status":
            # Do not update the status if the current status is any of
            # the following. This is to prevent errors or interface problems
            # when checking the status of a celery task.
            no_go_statuses = [
                "FINISHING",
                "STOPPING",
                "REDIS ERROR",
                "CELERY ERROR",
                "NOT STARTED",
                "STOPPED",
                "FORCE STOPPED"
            ]
            # Nutch crawls are excluded from the celery-task status pull.
            if crawl_object.status not in no_go_statuses and crawl_object.crawler != 'nutch':
                crawl_object.status = crawl_object.celerytask.task.status
                crawl_object.save()
            if crawl_object.crawler == "ache":
                ache_log_statistics(crawl_object)
            return HttpResponse(json.dumps(dict(
                    status=crawl_object.status,
                    harvest_rate=crawl_object.harvest_rate,
                    pages_crawled=crawl_object.pages_crawled,
                    rounds_left=crawl_object.rounds_left,
                    )),
                content_type="application/json")

        # Unknown action: reflect the request back (debug aid).
        return HttpResponse(json.dumps(dict(
                args=args,
                kwargs=kwargs,
                post=request.POST)),
            content_type="application/json")