Пример #1
0
def create_capture_job(user, human=True):
    """
    Persist a Link owned by `user` and a pending CaptureJob that points at it.

    The link always targets "http://example.com"; `human` is stored on the
    job unchanged. Returns the saved CaptureJob.
    """
    new_link = Link(created_by=user, submitted_url="http://example.com")
    new_link.save()

    job_fields = dict(
        created_by=user,
        link=new_link,
        human=human,
        status='pending',
    )
    job = CaptureJob(**job_fields)
    job.save()
    return job
Пример #2
0
def run_next_capture():
    """
    Reserve the next CaptureJob, run it, then queue this task again.

    The task re-queues itself after every job it processes and stops
    re-queuing as soon as get_next_job() returns nothing.
    """
    job = CaptureJob.get_next_job(reserve=True)
    if job:
        proxy_capture.apply([job.link_id])
        # look for further work
        run_task(run_next_capture.s())
Пример #3
0
    def test_hard_timeout(self):
        """
        A reserved job that never completes should be marked failed by
        clean_up_failed_captures(), but only after the hard time limit
        (settings.CELERY_TASK_TIME_LIMIT) has elapsed.
        """
        create_capture_job(self.user_one)

        # simulate a failed run_next_capture()
        job = CaptureJob.get_next_job(reserve=True)

        # capture_start_time should be set accurately on the server side.
        # Compare the absolute gap: the start time normally precedes now(), so
        # the signed difference (start - now) is negative and would satisfy
        # "< 60" no matter how stale the timestamp was.
        seconds_off = abs(
            (timezone.now() - job.capture_start_time).total_seconds())
        self.assertLess(seconds_off, 60)

        # clean_up_failed_captures shouldn't affect job, since timeout hasn't passed
        clean_up_failed_captures()
        job.refresh_from_db()
        self.assertEqual(job.status, "in_progress")

        # once job is sufficiently old, clean_up_failed_captures should mark it as failed
        job.capture_start_time -= timedelta(
            seconds=settings.CELERY_TASK_TIME_LIMIT + 60)
        job.save()
        clean_up_failed_captures()
        job.refresh_from_db()
        self.assertEqual(job.status, "failed")

        # failed jobs will have a message indicating failure reason
        self.assertEqual(
            json.loads(job.message)[api_settings.NON_FIELD_ERRORS_KEY][0],
            "Timed out.")
Пример #4
0
    def test_job_queue_order(self):
        """ Jobs should be processed round-robin, one per user. """

        # Creation order matters: positions in this list are what
        # expected_order refers to.
        jobs = [
            create_capture_job(self.user_one),
            create_capture_job(self.user_one),
            create_capture_job(self.user_one),
            create_capture_job(self.user_two),

            create_capture_job(self.user_two, human=False),

            create_capture_job(self.user_one),
            create_capture_job(self.user_one),
            create_capture_job(self.user_one),
            create_capture_job(self.user_two),
        ]

        # indexes into `jobs`, in the order the queue should hand them out
        expected_order = [
            0, 3,  # u1, u2
            1, 8,  # u1, u2
            2, 5, 6, 7,  # remaining u1 jobs
            4  # robots queue
        ]

        # test CaptureJob.queue_position
        # (1-based rank of each job index within expected_order)
        expected_rank = dict(
            (job_index, rank)
            for rank, job_index in enumerate(expected_order, 1)
        )
        for job_index, job in enumerate(jobs):
            actual = job.queue_position()
            wanted = expected_rank[job_index]
            self.assertEqual(actual, wanted, "Job %s has queue position %s, should be %s." % (job_index, actual, wanted))

        # test CaptureJob.get_next_job
        expected_next_jobs = [jobs[job_index] for job_index in expected_order]
        reserved = []
        for _ in jobs:
            reserved.append(CaptureJob.get_next_job(reserve=True))
        self.assertListEqual(reserved, expected_next_jobs)
Пример #5
0
def run_next_capture():
    """
    Reserve the next CaptureJob, run it, then queue this task again.

    Any exception raised by the capture is printed together with its traceback
    and swallowed. Whatever happens, captures of the job's link that are still
    'pending' are flipped to 'failed', and the job itself is marked failed if
    its status is still 'pending'. The task then re-queues itself; it stops
    doing so once no job is returned.
    """
    job = CaptureJob.get_next_job(reserve=True)
    if job:
        try:
            proxy_capture(job)
        except:
            # NOTE(review): bare except also traps SystemExit/KeyboardInterrupt -- confirm this is intended.
            print("Exception while processing capture job %s:" % job.link_id)
            traceback.print_exc()
        finally:
            unfinished_captures = job.link.captures.filter(status='pending')
            unfinished_captures.update(status='failed')
            if job.status == 'pending':
                job.mark_completed('failed')
        run_task(run_next_capture.s())
Пример #6
0
def run_next_capture():
    """
    Pull one CaptureJob off the queue, process it, and schedule another run.

    Returns immediately when the queue is empty. Otherwise the capture is
    attempted; failures are printed (message plus traceback) rather than
    propagated, leftover 'pending' captures on the link are set to 'failed',
    and a job still in 'pending' status is completed as 'failed'. Finally the
    task queues itself again.
    """
    next_job = CaptureJob.get_next_job(reserve=True)
    if not next_job:
        # queue is empty; do not reschedule
        return

    try:
        proxy_capture(next_job)
    except:
        failure_notice = "Exception while processing capture job %s:" % next_job.link_id
        print(failure_notice)
        traceback.print_exc()
    finally:
        # runs on success and on failure alike
        still_pending = next_job.link.captures.filter(status='pending')
        still_pending.update(status='failed')
        if next_job.status == 'pending':
            next_job.mark_completed('failed')

    run_task(run_next_capture.s())
Пример #7
0
    def obj_create(self, bundle, **kwargs):
        """
        Handle a request to archive a URL or an uploaded file.

        Unless the request is flagged 'replace', the parent obj_create() is
        called to validate and save the link. The link is then filed into the
        requested folder. An uploaded file is written straight into a
        'success' capture; otherwise 'pending' placeholder captures and a
        CaptureJob are created and the capture queue is kicked.

        Raises ImmediateHttpResponse when settings.READ_ONLY_MODE is on.
        Returns the bundle.
        """
        if settings.READ_ONLY_MODE:
            maintenance_msg = "Perma has paused archive creation for scheduled maintenance. Please try again shortly."
            error_body = {
                'archives': {'__all__': maintenance_msg},
                'reason': maintenance_msg,
            }
            raise ImmediateHttpResponse(
                response=self.error_response(bundle.request, error_body))

        # Runs validation (exception thrown if invalid), sets properties and saves the object
        if not bundle.data.get('replace'):
            parent = super(LinkResource, self)
            bundle = parent.obj_create(bundle, created_by=bundle.request.user)

        link = bundle.obj
        link.save()

        # put link in folder and handle Org settings based on folder
        folder = bundle.data.get('folder')
        organization = folder.organization
        if organization and organization.default_to_private:
            link.is_private = True
            link.save()
        # also sets link.organization
        link.move_to_folder_for_user(folder, bundle.request.user)

        uploaded_file = bundle.data.get('file')
        if not uploaded_file:
            # create primary capture placeholder
            primary_placeholder = Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            )
            primary_placeholder.save()

            # create screenshot placeholder
            screenshot_placeholder = Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            )
            screenshot_placeholder.save()

            # create CaptureJob
            capture_job = CaptureJob(link=link, human=bundle.data.get('human', False))
            capture_job.save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            run_task(run_next_capture.s())
            return bundle

        # normalize file name to upload.jpg, upload.png, upload.gif, or upload.pdf
        mime_type = get_mime_type(uploaded_file.name)
        new_extension = mime_type_lookup[mime_type]['new_extension']
        file_name = 'upload.%s' % new_extension

        base_warc_url = "file:///%s/%s" % (link.guid, file_name)

        # only append a random number to warc_url if we're replacing a file
        if bundle.data.get('replace'):
            version = str(random.random()).replace('.', '')
            warc_url = "%s?version=%s" % (base_warc_url, version)
        else:
            warc_url = base_warc_url

        capture = Capture(
            link=link,
            role='primary',
            status='success',
            record_type='resource',
            user_upload='True',
            content_type=mime_type,
            url=warc_url,
        )

        # rewind so the record is written from the start of the upload
        uploaded_file.file.seek(0)
        capture.write_warc_resource_record(uploaded_file)
        capture.save()

        return bundle
Пример #8
0
    def post(self, request, format=None):
        """
        Create new link.

        A CaptureJob is saved first so that every rejection below can be
        reported against it through raise_invalid_capture_job() (helper not
        shown here; presumably it raises). On success the link is filed into
        the target folder, then either the uploaded file is stored or pending
        placeholder captures are created and the capture queue is kicked.
        Returns a 201 Response carrying the serializer data.
        """
        data = request.data
        capture_job = CaptureJob(
            human=data.get('human', False),
            submitted_url=data.get('url', ''),
            created_by=request.user,
        )
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if settings.ENABLE_BATCH_LINKS and getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
        capture_job.save()

        # Set target folder, in order of preference:
        # - 'folder' key in data
        # - parent folder, if posting to /folders/:parent_id/archives
        # - user's personal folder
        try:
            folder = (
                self.get_folder_from_request(request)
                or request.parent
                or request.user.root_folder
            )
        except ValidationError as e:
            raise_invalid_capture_job(capture_job, e.detail)

        # Make sure a limited user has links left to create
        organization = folder.organization
        if organization:
            registrar = organization.registrar
            if not registrar.link_creation_allowed():
                error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
                if request.user.registrar:
                    contact = 'Visit your settings for subscription information.'
                else:
                    emails = [
                        registrar_user.email
                        for registrar_user in registrar.active_registrar_users()
                    ]
                    contact = 'For assistance with your subscription, contact:  {}.'.format(", ".join(emails))
                raise_invalid_capture_job(capture_job, error + contact)
        elif not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(
                    capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)

        serializer = self.serializer_class(data=data, context={'request': request})
        if serializer.is_valid():

            link = serializer.save(created_by=request.user)

            # put link in folder and handle Org settings based on folder
            if organization and organization.default_to_private:
                link.is_private = True
                link.save()
            # also sets link.organization
            link.move_to_folder_for_user(folder, request.user)

            uploaded_file = data.get('file')
            if uploaded_file:
                # handle uploaded file
                link.write_uploaded_file(uploaded_file)
                warc_path = link.warc_storage_file()
                link.warc_size = default_storage.size(warc_path)
                link.save()
            else:
                # handle submitted url

                # create primary capture placeholder
                primary_placeholder = Capture(
                    link=link,
                    role='primary',
                    status='pending',
                    record_type='response',
                    url=link.submitted_url,
                )
                primary_placeholder.save()

                # create screenshot placeholder
                screenshot_placeholder = Capture(
                    link=link,
                    role='screenshot',
                    status='pending',
                    record_type='resource',
                    url="file:///%s/cap.png" % link.guid,
                    content_type='image/png',
                )
                screenshot_placeholder.save()

                # kick off capture tasks -- no need for guid since it'll work through the queue
                capture_job.status = 'pending'
                capture_job.link = link
                capture_job.save(update_fields=['status', 'link'])
                run_task(run_next_capture.s())

            return Response(serializer.data, status=status.HTTP_201_CREATED)

        raise_invalid_capture_job(capture_job, serializer.errors)
Пример #9
0
    def post(self, request, format=None):
        """
        Create new link.

        A CaptureJob is saved up front so that every rejection below can be
        reported against it via raise_invalid_capture_job() (helper not shown
        here; presumably it raises -- the code after each call relies on
        that). The `format` argument is not used in this method.

        On success, returns a 201 Response carrying the serializer data.
        """
        data = request.data
        capture_job = CaptureJob(
            human=request.data.get('human', False),
            submitted_url=request.data.get('url', ''),
            created_by=request.user
        )
        if settings.ENABLE_BATCH_LINKS:
            # Batch is set directly on the request object by the LinkBatch api,
            # to prevent abuse of this feature by those POSTing directly to this route.
            if getattr(request, 'batch', None):
                capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
        capture_job.save()


        # Set target folder, in order of preference:
        # - 'folder' key in data
        # - parent folder, if posting to /folders/:parent_id/archives
        # - user's personal folder
        try:
            folder = self.get_folder_from_request(request) or request.parent or request.user.root_folder
        except ValidationError as e:
            raise_invalid_capture_job(capture_job, e.detail)

        # Disallow creation of links in top-level sponsored folder
        if folder.is_sponsored_root_folder:
            error = "You can't make links directly in your Sponsored Links folder. Select a folder belonging to a sponsor."
            raise_invalid_capture_job(capture_job, error)

        # Make sure a limited user has links left to create
        # (a folder with neither an organization nor a sponsor is checked against the user's own allowance)
        if not folder.organization and not folder.sponsored_by:
            if not request.user.link_creation_allowed():
                if request.user.nonpaying:
                    raise_invalid_capture_job(capture_job, "You've already reached your limit.")
                error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
                raise_invalid_capture_job(capture_job, error)
        else:
            # the sponsor, when set, takes precedence over the organization's registrar
            registrar = folder.sponsored_by if folder.sponsored_by else folder.organization.registrar

            # Both checks below may set msg; if both apply, the link-creation
            # message assigned second replaces the read-only one.
            msg = None
            if folder.read_only:
                registrar_users = [user.email for user in registrar.active_registrar_users()]
                msg = f"Your registrar has made this folder read-only. For assistance, contact: {', '.join(registrar_users)}."
            if not registrar.link_creation_allowed():
                error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
                if request.user.registrar:
                    contact = 'Visit your settings for subscription information.'
                else:
                    registrar_users = [user.email for user in registrar.active_registrar_users()]
                    contact = 'For assistance with your subscription, contact:  {}.'.format(", ".join(registrar_users))
                msg = error + contact
            if msg:
                raise_invalid_capture_job(capture_job, msg)

        serializer = self.serializer_class(data=data, context={'request': request})
        if serializer.is_valid():

            with transaction.atomic():
                # Technique from https://github.com/harvard-lil/capstone/blob/0f7fb80f26e753e36e0c7a6a199b8fdccdd318be/capstone/capapi/serializers.py#L121
                #
                # Fetch the current user data here inside a transaction, using select_for_update
                # to lock the row so we don't collide with any simultaneous requests
                user = request.user.__class__.objects.select_for_update().get(pk=request.user.pk)

                # If this is a Personal Link, and if the user only has bonus links left, decrement bonus links
                bonus_link = False
                if not folder.organization and not folder.sponsored_by:
                    # the counts are read from the freshly fetched (locked) user; the middle value is unused here
                    links_remaining, _ , bonus_links = user.get_links_remaining()
                    if bonus_links and not links_remaining:
                        # (this works because it's part of the same transaction with the select_for_update --
                        # we don't have to use the same object)
                        request.user.bonus_links = bonus_links - 1
                        request.user.save(update_fields=['bonus_links'])
                        bonus_link = True

                # the link is saved inside the same transaction as the bonus-link decrement
                link = serializer.save(created_by=request.user, bonus_link=bonus_link)

            # put link in folder and handle Org settings based on folder
            if folder.organization and folder.organization.default_to_private:
                link.is_private = True
                link.save()
            link.move_to_folder_for_user(folder, request.user)  # also sets link.organization

            # handle uploaded file
            uploaded_file = request.data.get('file')
            if uploaded_file:
                link.write_uploaded_file(uploaded_file)

            # handle submitted url
            else:
                # create primary capture placeholder
                Capture(
                    link=link,
                    role='primary',
                    status='pending',
                    record_type='response',
                    url=link.submitted_url,
                ).save()

                # create screenshot placeholder
                Capture(
                    link=link,
                    role='screenshot',
                    status='pending',
                    record_type='resource',
                    url="file:///%s/cap.png" % link.guid,
                    content_type='image/png',
                ).save()


                # kick off capture tasks -- no need for guid since it'll work through the queue
                # (only the two changed columns are written back to the job row)
                capture_job.status = 'pending'
                capture_job.link = link
                capture_job.save(update_fields=['status', 'link'])
                run_task(run_next_capture.s())

            return Response(serializer.data, status=status.HTTP_201_CREATED)

        # serializer validation failed: report the errors against the capture job
        raise_invalid_capture_job(capture_job, serializer.errors)
Пример #10
0
    def post(self, request, format=None):
        """
        Create new link.

        Checks that the user (for a folder with no organization) or the
        folder's registrar (for an organization folder) may still create
        links, validates the payload, files the new link into the target
        folder, and then either stores the uploaded file or creates pending
        placeholder captures plus a CaptureJob and kicks the capture queue.

        Returns a 201 Response with the serializer data, or a 400 Response
        with the serializer errors when validation fails.
        """
        data = request.data

        # Set target folder, in order of preference:
        # - 'folder' key in data
        # - parent folder, if posting to /folders/:parent_id/archives
        # - user's personal folder
        folder = (
            self.get_folder_from_request(request)
            or request.parent
            or request.user.root_folder
        )

        # Make sure a limited user has links left to create
        organization = folder.organization
        if organization:
            registrar = organization.registrar
            if not registrar.link_creation_allowed():
                error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
                if request.user.registrar:
                    contact = 'Visit your settings for subscription information.'
                else:
                    emails = [
                        registrar_user.email
                        for registrar_user in registrar.active_registrar_users()
                    ]
                    contact = 'For assistance with your subscription, contact:  {}.'.format(", ".join(emails))
                raise_validation_error(error + contact)
        elif request.user.get_links_remaining() < 1:
            raise_validation_error("You've already reached your limit.")

        serializer = self.serializer_class(data=data, context={'request': request})
        if not serializer.is_valid():
            return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        link = serializer.save(created_by=request.user)

        # put link in folder and handle Org settings based on folder
        if organization and organization.default_to_private:
            link.is_private = True
            link.save()
        # also sets link.organization
        link.move_to_folder_for_user(folder, request.user)

        uploaded_file = data.get('file')
        if uploaded_file:
            # handle uploaded file
            link.write_uploaded_file(uploaded_file)
        else:
            # handle submitted url

            # create primary capture placeholder
            primary_placeholder = Capture(
                link=link,
                role='primary',
                status='pending',
                record_type='response',
                url=link.submitted_url,
            )
            primary_placeholder.save()

            # create screenshot placeholder
            screenshot_placeholder = Capture(
                link=link,
                role='screenshot',
                status='pending',
                record_type='resource',
                url="file:///%s/cap.png" % link.guid,
                content_type='image/png',
            )
            screenshot_placeholder.save()

            # create CaptureJob
            capture_job = CaptureJob(link=link, human=data.get('human', False))
            capture_job.save()

            # kick off capture tasks -- no need for guid since it'll work through the queue
            run_task(run_next_capture.s())

        return Response(serializer.data, status=status.HTTP_201_CREATED)
Пример #11
0
    def post(self, request, format=None):
        """
        Create new link.

        The CaptureJob is saved before any checks so that each rejection can
        be reported against it with raise_invalid_capture_job() (helper not
        shown here; presumably it raises). A valid request yields a link filed
        into the target folder; an uploaded file is stored directly, otherwise
        pending placeholder captures are created, the job is marked 'pending'
        and attached to the link, and the capture queue is kicked.
        Returns a 201 Response carrying the serializer data.
        """
        data = request.data
        job_fields = dict(
            human=data.get('human', False),
            submitted_url=data.get('url', ''),
            created_by=request.user,
        )
        capture_job = CaptureJob(**job_fields)
        # Batch is set directly on the request object by the LinkBatch api,
        # to prevent abuse of this feature by those POSTing directly to this route.
        if settings.ENABLE_BATCH_LINKS and getattr(request, 'batch', None):
            capture_job.link_batch = LinkBatch.objects.get(id=request.batch)
        capture_job.save()

        # Set target folder, in order of preference:
        # - 'folder' key in data
        # - parent folder, if posting to /folders/:parent_id/archives
        # - user's personal folder
        try:
            folder = (
                self.get_folder_from_request(request)
                or request.parent
                or request.user.root_folder
            )
        except ValidationError as e:
            raise_invalid_capture_job(capture_job, e.detail)

        # Make sure a limited user has links left to create
        organization = folder.organization
        if organization:
            registrar = organization.registrar
            if not registrar.link_creation_allowed():
                error = 'Perma.cc cannot presently make links on behalf of {}. '.format(registrar.name)
                if request.user.registrar:
                    contact = 'Visit your settings for subscription information.'
                else:
                    emails = [member.email for member in registrar.active_registrar_users()]
                    contact = 'For assistance with your subscription, contact:  {}.'.format(", ".join(emails))
                raise_invalid_capture_job(capture_job, error + contact)
        elif not request.user.link_creation_allowed():
            if request.user.nonpaying:
                raise_invalid_capture_job(capture_job, "You've already reached your limit.")
            error = "Perma.cc cannot presently make additional Perma Links on your behalf. Visit your subscription settings page for more information."
            raise_invalid_capture_job(capture_job, error)

        serializer = self.serializer_class(data=data, context={'request': request})
        if serializer.is_valid():

            link = serializer.save(created_by=request.user)

            # put link in folder and handle Org settings based on folder
            if organization and organization.default_to_private:
                link.is_private = True
                link.save()
            # also sets link.organization
            link.move_to_folder_for_user(folder, request.user)

            uploaded_file = data.get('file')
            if uploaded_file:
                # handle uploaded file
                link.write_uploaded_file(uploaded_file)
            else:
                # handle submitted url: one pending placeholder per capture role
                placeholder_specs = (
                    # primary capture placeholder
                    dict(
                        role='primary',
                        record_type='response',
                        url=link.submitted_url,
                    ),
                    # screenshot placeholder
                    dict(
                        role='screenshot',
                        record_type='resource',
                        url="file:///%s/cap.png" % link.guid,
                        content_type='image/png',
                    ),
                )
                for spec in placeholder_specs:
                    Capture(link=link, status='pending', **spec).save()

                # kick off capture tasks -- no need for guid since it'll work through the queue
                capture_job.status = 'pending'
                capture_job.link = link
                capture_job.save(update_fields=['status', 'link'])
                run_task(run_next_capture.s())

            return Response(serializer.data, status=status.HTTP_201_CREATED)

        raise_invalid_capture_job(capture_job, serializer.errors)
Пример #12
0
 def get_next_job(i):
     """
     Return the result of CaptureJob.get_next_job(reserve=True).

     `i` is accepted but never used.
     NOTE(review): presumably present so this can be passed to map()/a worker
     pool that supplies one argument per call -- confirm against the caller.
     """
     return CaptureJob.get_next_job(reserve=True)