Пример #1
0
    def test_create_oneshot_task_dict_simple(self, mock_datetime):
        """A bare task type yields a oneshot task with empty arguments."""
        # The mocked clock supplies the value expected under "next_run".
        mock_datetime.now.return_value = "some-date"
        wanted = {
            "policy": "oneshot",
            "type": "some-task-type",
            "next_run": "some-date",
            "arguments": {"args": [], "kwargs": {}},
        }

        produced = utils.create_oneshot_task_dict("some-task-type")

        self.assertEqual(produced, wanted)
        # The current time must be requested exactly once, in UTC.
        mock_datetime.now.assert_called_once_with(tz=timezone.utc)
Пример #2
0
    def test_create_oneshot_task_dict_other_call(self, mock_datetime):
        """Positional args, extra kwargs and ``priority`` land where expected."""
        mock_datetime.now.return_value = "some-other-date"
        wanted = {
            "policy": "oneshot",
            "type": "some-task-type",
            "next_run": "some-other-date",
            # positional arguments are expected back as a tuple
            "arguments": {
                "args": ("arg0", "arg1"),
                "kwargs": {"other_stuff": "normal"},
            },
            # "priority" is expected at the top level, not among the kwargs
            "priority": "high",
        }

        produced = utils.create_oneshot_task_dict(
            "some-task-type", "arg0", "arg1", priority="high", other_stuff="normal"
        )

        self.assertEqual(produced, wanted)
        # The current time must be requested exactly once, in UTC.
        mock_datetime.now.assert_called_once_with(tz=timezone.utc)
Пример #3
0
def test_cli_admin_reschedule_nominal(cli_runner, complete_deposit, swh_scheduler):
    """Rescheduling a deposit bound to an existing load task succeeds.

    The deposit is first given a disabled scheduler task as ``load_task_id``
    plus some swhid information. After ``deposit reschedule`` runs, the swhid
    fields must be cleared, the deposit must be back in the verified status
    and the task must be in the ``next_run_not_scheduled`` status.
    """
    from swh.deposit.models import Deposit

    deposit = complete_deposit

    # create a task to keep a reference on it
    task = create_oneshot_task_dict(
        "load-deposit", url=deposit.origin_url, deposit_id=deposit.id, retries_left=3
    )
    scheduled_task = swh_scheduler.create_tasks([task])[0]
    # disable it
    swh_scheduler.set_status_tasks([scheduled_task["id"]], status="disabled")

    # Now update the deposit state with some swhid and relevant load_task_id
    deposit.load_task_id = scheduled_task["id"]
    deposit.swhid = "swh:1:dir:02ed6084fb0e8384ac58980e07548a547431cf74"
    deposit.swhid_context = f"{deposit.swhid};origin=https://url/external-id"
    deposit.save()

    # Reschedule it
    result = cli_runner.invoke(
        cli, ["deposit", "reschedule", "--deposit-id", deposit.id]
    )
    assert result.exit_code == 0

    # Now, ensure the deposit and the associated task are in the right shape;
    # reload from the database to observe what the command persisted
    deposit = Deposit.objects.get(id=deposit.id)

    # got reset to a state which allows rescheduling
    assert deposit.id
    assert deposit.swhid is None
    assert deposit.swhid_context is None
    assert deposit.status == DEPOSIT_STATUS_VERIFIED

    task = swh_scheduler.search_tasks(task_id=deposit.load_task_id)[0]
    assert task["status"] == "next_run_not_scheduled"
Пример #4
0
    def _complete_deposit(self, deposit: Deposit) -> None:
        """Flag the deposit as 'deposited' and, when checks are enabled in the
        configuration, schedule a "check-deposit" task for it.

        Args:
            deposit: the deposit to complete; it is modified and saved

        """
        deposit.complete_date = timezone.now()
        deposit.status = DEPOSIT_STATUS_DEPOSITED
        # first save: persists the completion date and the new status
        deposit.save()

        if not deposit.origin_url:
            deposit.origin_url = guess_deposit_origin_url(deposit)

        if self.config["checks"]:
            scheduler = self.scheduler
            needs_check_task = (
                deposit.status == DEPOSIT_STATUS_DEPOSITED
                and not deposit.check_task_id
            )
            if needs_check_task:
                check_task = create_oneshot_task_dict(
                    "check-deposit",
                    collection=deposit.collection.name,
                    deposit_id=deposit.id,
                    retries_left=3,
                )
                created_tasks = scheduler.create_tasks([check_task])
                deposit.check_task_id = created_tasks[0]["id"]

        # second save: persists the origin url and check task id set above
        deposit.save()
Пример #5
0
def create_save_origin_request(visit_type, origin_url):
    """
    Create a loading task to save a software origin into the archive.

    The loading task is created through the swh-scheduler component.

    The visit type and origin url are validated first, then the origin url
    is checked to decide whether the save request can be accepted. When it
    is accepted, a loading task is created unless a previous one for the
    same origin has not finished yet. Otherwise the save request is
    recorded as pending or rejected.

    Every submitted save request is logged into the swh-web database to
    keep track of it.

    Args:
        visit_type (str): the type of visit to perform; it selects the
            scheduler task type through ``_visit_type_task``
        origin_url (str): the url of the origin to save

    Raises:
        BadInputExc: the visit type or origin url is invalid (presumably
            raised by the validation helpers called first)
        ForbiddenExc: the save request has been rejected because the
            provided origin url is blacklisted

    Returns:
        dict: A dict describing the save request with the following keys:

            * **visit_type**: the type of visit to perform
            * **origin_url**: the url of the origin
            * **save_request_date**: the date the request was submitted
            * **save_request_status**: the request status, either **accepted**,
              **rejected** or **pending**
            * **save_task_status**: the origin loading task status, either
              **not created**, **not yet scheduled**, **scheduled**,
              **succeed** or **failed**

    """
    _check_visit_type_savable(visit_type)
    _check_origin_url_valid(origin_url)
    save_request_status = can_save_origin(origin_url)
    task = None

    if save_request_status == SAVE_REQUEST_ACCEPTED:
        # accepted request: a scheduler task may have to be created to load
        # the origin into the archive; such a task gets a high priority
        task_kwargs = {"priority": "high", "url": origin_url}
        sor = None
        # save requests previously submitted for the same origin
        previous_sors = list(
            SaveOriginRequest.objects.filter(
                visit_type=visit_type, origin_url=origin_url
            )
        )

        # with no previous save request, a scheduler task is always needed
        must_create_task = not previous_sors
        if previous_sors:
            # first request returned by the query, treated as the latest one
            # (assumes the model orders newest first -- TODO confirm)
            sor = previous_sors[0]
            if sor.status == SAVE_REQUEST_PENDING:
                # a pending request gets its task now and is updated below
                must_create_task = True
            elif sor.loading_task_id != -1:
                # a loading task already exists: look up its current status
                found_tasks = scheduler.get_tasks([sor.loading_task_id])
                task = found_tasks[0] if found_tasks else None
                found_runs = scheduler.get_task_runs([sor.loading_task_id])
                task_run = found_runs[0] if found_runs else None
                previous_request = _save_request_dict(sor, task, task_run)
                task_status = previous_request["save_task_status"]
                # only a finished task (failed or succeeded) may be replaced
                # by a new one, recorded as a brand new save request
                if task_status in (SAVE_TASK_FAILED, SAVE_TASK_SUCCEEDED):
                    must_create_task = True
                    sor = None

        if must_create_task:
            # effectively create the scheduler task
            new_task = create_oneshot_task_dict(
                _visit_type_task[visit_type], **task_kwargs
            )
            task = scheduler.create_tasks([new_task])[0]

            if sor:
                # the pending save request has been accepted
                sor.status = SAVE_REQUEST_ACCEPTED
                sor.loading_task_id = task["id"]
                sor.save()
            else:
                sor = SaveOriginRequest.objects.create(
                    visit_type=visit_type,
                    origin_url=origin_url,
                    status=save_request_status,
                    loading_task_id=task["id"],
                )
    elif save_request_status == SAVE_REQUEST_PENDING:
        # the save request must be manually reviewed before acceptance;
        # reuse an identical pending request when one is already recorded
        try:
            sor = SaveOriginRequest.objects.get(
                visit_type=visit_type,
                origin_url=origin_url,
                status=save_request_status,
            )
        except ObjectDoesNotExist:
            # none recorded yet: add it to the database
            sor = SaveOriginRequest.objects.create(
                visit_type=visit_type,
                origin_url=origin_url,
                status=save_request_status,
            )
    else:
        # the origin cannot be saved as its url is blacklisted,
        # log the request to the database anyway
        sor = SaveOriginRequest.objects.create(
            visit_type=visit_type,
            origin_url=origin_url,
            status=save_request_status,
        )

    if save_request_status == SAVE_REQUEST_REJECTED:
        raise ForbiddenExc(
            (
                'The "save code now" request has been rejected '
                "because the provided origin url is blacklisted."
            )
        )

    return _save_request_dict(sor, task)
Пример #6
0
    def process_get(
        self, req: Request, collection_name: str, deposit: Deposit
    ) -> Tuple[int, Dict, str]:
        """Run the checks on the deposit archives, then on the deposit metadata.

        Problems (or warnings) reported by the checks are stored as the
        deposit status detail. When every check passes, the deposit becomes
        'verified' (its details may still hold warnings) and, if checks are
        enabled in the configuration and no loading task exists yet, a
        loading task is scheduled. Otherwise the deposit is marked as
        'rejected' with the error details. The outcome of the checks is
        returned to the caller as a json response.

        Args:
            req: Client request
            collection_name: Collection owning the deposit
            deposit: Deposit concerned by the reading

        Returns:
            Tuple (status, json response, content-type)

        """
        raw_metadata = self._metadata_get(deposit)
        details_dict: Dict = {}

        # check the deposit's archives first
        archives_status_ok, details = self._check_deposit_archives(deposit)
        if not archives_status_ok:
            assert details is not None
            details_dict.update(details)

        # then check the deposit's metadata
        if raw_metadata is not None:
            metadata_tree = ElementTree.fromstring(raw_metadata)
            metadata_status_ok, details = check_metadata(metadata_tree)
            # a failed check must always come with its rejection details
            assert metadata_status_ok or details is not None
            # warnings may be reported even when the checks pass
            # (e.g. missing suggested field)
            if details:
                details_dict.update(details)
        else:
            metadata_status_ok = False
            details_dict["metadata"] = [{"summary": "Missing Atom document"}]

        # the deposit is verified only when both kinds of checks passed
        deposit_status_ok = archives_status_ok and metadata_status_ok
        if deposit_status_ok:
            deposit.status = DEPOSIT_STATUS_VERIFIED
        else:
            deposit.status = DEPOSIT_STATUS_REJECTED

        response: Dict = {"status": deposit.status}
        if details_dict:
            deposit.status_detail = details_dict
            response["details"] = details_dict

        # Deposit ok: schedule its loading task, unless one exists already
        if deposit_status_ok and not deposit.load_task_id and self.config["checks"]:
            load_task = create_oneshot_task_dict(
                "load-deposit",
                url=deposit.origin_url,
                deposit_id=deposit.id,
                retries_left=3,
            )
            created_tasks = self.scheduler.create_tasks([load_task])
            deposit.load_task_id = created_tasks[0]["id"]

        deposit.save()

        return status.HTTP_200_OK, response, "application/json"
Пример #7
0
    def batch_cook(self,
                   batch: List[Tuple[str, str]],
                   db=None,
                   cur=None) -> Dict[str, int]:
        """Register a batch of bundles and schedule a cooking task for each
        bundle that has no task yet.

        Args:
            batch: list of (bundle type, swhid) pairs; every bundle type must
                be in ``COOKER_TYPES``
            db: database handle, used here only for ``db.conn.commit()``
            cur: database cursor used for all the queries
                (NOTE(review): both default to None but are used
                unconditionally -- presumably injected by a decorator that
                is not visible here; confirm)

        Raises:
            NotFoundExc: a bundle type of the batch is not in ``COOKER_TYPES``

        Returns:
            A dict with the id of the created batch under the ``id`` key

        """
        # Import execute_values at runtime only, because it requires
        # psycopg2 >= 2.7 (only available on postgresql servers)
        from psycopg2.extras import execute_values

        # Validate every bundle type before touching the database
        for bundle_type, _ in batch:
            if bundle_type not in COOKER_TYPES:
                raise NotFoundExc(f"{bundle_type} is an unknown type.")

        # Create the batch row and retrieve its generated id
        cur.execute("""
            INSERT INTO vault_batch (id)
            VALUES (DEFAULT)
            RETURNING id""")
        batch_id = cur.fetchone()["id"]

        # Delete all failed bundles from the batch, so that the INSERT below
        # re-creates them
        # NOTE(review): an empty batch would render as `IN ()`, which the
        # psycopg2 documentation warns is not valid SQL -- confirm callers
        # never pass an empty batch
        cur.execute(
            """
            DELETE FROM vault_bundle
            WHERE task_status = 'failed'
              AND (type, swhid) IN %s""",
            (tuple(batch), ),
        )

        # Insert all the bundles, return the new ones
        # (bundles already present are left untouched by ON CONFLICT)
        execute_values(
            cur,
            """
            INSERT INTO vault_bundle (type, swhid)
            VALUES %s ON CONFLICT DO NOTHING""",
            batch,
        )

        # Get the bundle ids and task status
        cur.execute(
            """
            SELECT id, type, swhid, task_id FROM vault_bundle
            WHERE (type, swhid) IN %s""",
            (tuple(batch), ),
        )
        bundles = cur.fetchall()

        # Insert the batch-bundle entries
        batch_id_bundle_ids = [(batch_id, row["id"]) for row in bundles]
        execute_values(
            cur,
            """
            INSERT INTO vault_batch_bundle (batch_id, bundle_id)
            VALUES %s ON CONFLICT DO NOTHING""",
            batch_id_bundle_ids,
        )
        # Commit the batch and bundle rows before any task is sent to the
        # scheduler
        db.conn.commit()

        # Get the tasks to fetch: only bundles without a task id need one
        batch_new = [(row["type"], CoreSWHID.from_string(row["swhid"]))
                     for row in bundles if row["task_id"] is None]

        # Send the tasks
        args_batch = [(bundle_type, swhid) for bundle_type, swhid in batch_new]
        # TODO: change once the scheduler handles priority tasks
        tasks = [
            create_oneshot_task_dict("swh-vault-batch-cooking", *args)
            for args in args_batch
        ]

        # Pair each created task id with its bundle by position
        # (assumes create_tasks returns the tasks in submission order --
        # TODO confirm)
        added_tasks = self.scheduler.create_tasks(tasks)
        tasks_ids_bundle_ids = [
            (task_id, bundle_type, swhid)
            for task_id, (bundle_type,
                          swhid) in zip([task["id"]
                                         for task in added_tasks], batch_new)
        ]

        # Update the task ids
        # NOTE(review): this UPDATE is not followed by a commit in this
        # block -- presumably the caller commits; confirm
        execute_values(
            cur,
            """
            UPDATE vault_bundle
            SET task_id = s_task_id
            FROM (VALUES %s) AS sub (s_task_id, s_type, s_swhid)
            WHERE type = s_type::cook_type AND swhid = s_swhid """,
            tasks_ids_bundle_ids,
        )
        return {"id": batch_id}
Пример #8
0
 def _send_task(self, bundle_type: str, swhid: CoreSWHID):
     """Schedule the cooking of one bundle through the scheduler.

     Args:
         bundle_type: type of the bundle to cook
         swhid: identifier of the object to cook, sent as a string

     Returns:
         The id of the scheduler task that got created

     """
     cooking_task = create_oneshot_task_dict(
         "cook-vault-bundle", bundle_type, str(swhid)
     )
     created_tasks = self.scheduler.create_tasks([cooking_task])
     return created_tasks[0]["id"]