예제 #1
0
    def test_origin_visit_stats_upsert_cardinality_failing(
            self, swh_scheduler) -> None:
        """Upserting twice the same (origin, visit_type) key in a single
        batch must raise a CardinalityViolation error."""
        # Two rows sharing the same primary key (url, visit_type)
        conflicting_stats = [
            OriginVisitStats(
                url="foo",
                visit_type="git",
                last_eventful=None,
                last_uneventful=utcnow(),
                last_notfound=None,
                last_failed=None,
                last_snapshot=None,
            )
            for _ in range(2)
        ]
        with pytest.raises(SchedulerException, match="CardinalityViolation"):
            swh_scheduler.origin_visit_stats_upsert(conflicting_stats)
예제 #2
0
    def test_origin_visit_stats_upsert_batch(self, swh_scheduler) -> None:
        """Upserting stats for several distinct origins in one batch works."""
        all_stats = [
            OriginVisitStats(
                url=origin_url,
                visit_type="git",
                last_eventful=eventful,
                last_uneventful=uneventful,
                last_failed=None,
                last_notfound=None,
                last_snapshot=hash_to_bytes(snapshot_hex),
            )
            for origin_url, eventful, uneventful, snapshot_hex in [
                ("foo", utcnow(), None,
                 "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
                ("bar", None, utcnow(),
                 "fffcc0710eb6cf9efd5b920a8453e1e07157bfff"),
            ]
        ]

        swh_scheduler.origin_visit_stats_upsert(all_stats)

        # Every upserted entry must be retrievable
        fetched = swh_scheduler.origin_visit_stats_get(
            [(stats.url, stats.visit_type) for stats in all_stats])
        for entry in fetched:
            assert entry is not None
예제 #3
0
    def test_metrics_origins_never_visited(self, swh_scheduler,
                                           listed_origins):
        """Metrics separate visited origins from never-visited ones."""
        swh_scheduler.record_listed_origins(listed_origins)

        # Mark a single origin as visited through an upserted stats row
        first_origin = listed_origins[0]
        swh_scheduler.origin_visit_stats_upsert([
            OriginVisitStats(
                url=first_origin.url,
                visit_type=first_origin.visit_type,
                last_eventful=utcnow(),
                last_uneventful=None,
                last_failed=None,
                last_notfound=None,
                last_snapshot=hash_to_bytes(
                    "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
            ),
        ])

        metrics = swh_scheduler.update_metrics(
            lister_id=first_origin.lister_id)
        for metric in metrics:
            if metric.visit_type != first_origin.visit_type:
                # No origin of any other visit type has been visited
                assert metric.origins_known == metric.origins_never_visited
            else:
                # Exactly one origin of this visit type has been visited
                assert metric.origins_known - metric.origins_never_visited == 1
예제 #4
0
    def test_metrics_origins_with_pending_changes(self, swh_scheduler,
                                                  listed_origins):
        """An origin visited before its last update counts as pending."""
        swh_scheduler.record_listed_origins(listed_origins)

        # Record a visit dated strictly before the origin's last_update, so
        # the origin has changes pending since that visit
        stale_origin = listed_origins[0]
        assert stale_origin.last_update is not None
        visit_date = stale_origin.last_update - datetime.timedelta(days=1)
        swh_scheduler.origin_visit_stats_upsert([
            OriginVisitStats(
                url=stale_origin.url,
                visit_type=stale_origin.visit_type,
                last_eventful=visit_date,
                last_uneventful=None,
                last_failed=None,
                last_notfound=None,
                last_snapshot=hash_to_bytes(
                    "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
            ),
        ])

        metrics = swh_scheduler.update_metrics(lister_id=stale_origin.lister_id)
        for metric in metrics:
            if metric.visit_type != stale_origin.visit_type:
                # No visit recorded, so nothing can be pending
                assert metric.origins_with_pending_changes == 0
            else:
                # The single stale visit above produces one pending origin
                assert metric.origins_with_pending_changes == 1
def test_journal_client_origin_visit_statuses_same_snapshot_permutation(
        visit_statuses, swh_scheduler):
    """Any permutation of the input messages converges to the same state."""
    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    stats = swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")])[0]
    final_date = DATE1 + 2 * ONE_YEAR
    assert_visit_stats_ok(
        stats,
        OriginVisitStats(
            url="cavabarder",
            visit_type="hg",
            last_successful=final_date,
            last_visit=final_date,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
        ),
        ignore_fields=[
            "next_visit_queue_position",
            "next_position_offset",
            "successive_visits",
        ],
    )

    # Out-of-order messages are dropped, so the counters below depend on the
    # permutation; only the final dates checked above are exact. We only
    # assert they land within the known possible ranges.
    assert 4 <= stats.next_position_offset <= 6
    assert 1 <= stats.successive_visits <= 3
def test_journal_client_origin_visit_status_duplicated_messages(swh_scheduler):
    """Processing the exact same message twice must have no extra effect."""
    message = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE1,
        "type": "git",
        "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
    }

    # Deliver the identical payload twice in a row
    for _ in range(2):
        process_journal_objects({"origin_visit_status": [message]},
                                scheduler=swh_scheduler)

    stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    assert_visit_stats_ok(
        stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE1,
            last_visit=DATE1,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
            # the duplicate must not bump the successive visit counter
            successive_visits=1,
        ),
    )
def test_journal_client_origin_visit_status_from_journal_last_successful(
        swh_scheduler):
    """The latest "full" visit determines the recorded successful state."""
    def make_status(origin, visit, status, date, snapshot_hex):
        # Build one origin-visit-status journal message
        return {
            "origin": origin,
            "visit": visit,
            "status": status,
            "date": date,
            "type": "git",
            "snapshot": hash_to_bytes(snapshot_hex),
        }

    messages = [
        make_status("bar", 1, "partial", utcnow(),
                    "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        make_status("foo", 1, "full", DATE1,
                    "eeecc0710eb6cf9efd5b920a8453e1e07157bfff"),
        make_status("foo", 2, "partial", DATE2,
                    "aaacc0710eb6cf9efd5b920a8453e1e07157baaa"),
        make_status("foo", 3, "full", DATE3,
                    "dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
    ]

    process_journal_objects({"origin_visit_status": messages},
                            scheduler=swh_scheduler)

    foo_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    assert_visit_stats_ok(
        foo_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE3,
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
            next_position_offset=0,
            successive_visits=3,
        ),
    )
def test_journal_client_origin_visit_status_from_journal_last_uneventful(
        swh_scheduler):
    """A newer full visit overrides previously recorded failed-visit stats."""
    incoming = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE3 + ONE_DAY,
        "type": "git",
        "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    }
    origin, visit_type = incoming["origin"], incoming["type"]

    # Seed the scheduler with pre-existing stats for this origin (last visit
    # failed, with an older successful visit and the same snapshot)
    swh_scheduler.origin_visit_stats_upsert([
        OriginVisitStats(
            url=origin,
            visit_type=visit_type,
            last_successful=DATE2,
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.failed,
            last_snapshot=incoming["snapshot"],
            next_visit_queue_position=None,
            next_position_offset=4,
            successive_visits=1,
        )
    ])

    process_journal_objects({"origin_visit_status": [incoming]},
                            scheduler=swh_scheduler)

    stats = swh_scheduler.origin_visit_stats_get([(origin, visit_type)])

    assert_visit_stats_ok(
        stats[0],
        OriginVisitStats(
            url=origin,
            visit_type=visit_type,
            last_visit=DATE3 + ONE_DAY,
            last_successful=DATE3 + ONE_DAY,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=incoming["snapshot"],
            next_visit_queue_position=None,
            next_position_offset=5,
            successive_visits=1,
        ),
    )
예제 #9
0
    def test_grab_next_visits_already_visited_order_by_lag(
        self,
        swh_scheduler,
        listed_origins_by_type,
    ):
        """Visited origins updated since their last visit come back ordered
        by lag (oldest visit relative to the update first)."""
        visit_type, origins = self._grab_next_visits_setup(
            swh_scheduler, listed_origins_by_type)

        # Assign strictly decreasing, controlled `last_update` values
        reference = datetime.datetime(
            2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
        controlled = [
            attr.evolve(
                origin,
                last_update=reference - datetime.timedelta(seconds=rank))
            for rank, origin in enumerate(origins)
        ]
        controlled = swh_scheduler.record_listed_origins(controlled)

        # Record, for every other origin, a visit dated at the middle of the
        # `last_update` range
        midpoint_date = controlled[len(controlled) // 2].last_update
        already_visited = controlled[::2]
        stats_rows = [
            OriginVisitStats(
                url=origin.url,
                visit_type=origin.visit_type,
                last_snapshot=hash_to_bytes(
                    "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
                last_eventful=midpoint_date,
                last_uneventful=None,
                last_failed=None,
                last_notfound=None,
            )
            for origin in already_visited
        ]
        swh_scheduler.origin_visit_stats_upsert(stats_rows)

        # Only the visited origins whose last_update postdates the visit
        # qualify; the largest lag comes first
        expected_origins = sorted(
            (origin for origin in already_visited
             if origin.last_update > midpoint_date),
            key=lambda origin: midpoint_date - origin.last_update,
        )

        self._check_grab_next_visit(
            swh_scheduler,
            visit_type=visit_type,
            policy="already_visited_order_by_lag",
            expected=expected_origins,
        )
def test_journal_client_origin_visit_status_from_journal_last_failed(
        swh_scheduler):
    """Final visit statuses without a snapshot are accounted as failures."""
    def snapshotless(origin, visit, status, date):
        # Build a journal message carrying no snapshot
        return {
            "origin": origin,
            "visit": visit,
            "status": status,
            "date": date,
            "type": "git",
            "snapshot": None,
        }

    messages = [
        snapshotless("foo", 1, "partial", utcnow()),
        snapshotless("bar", 1, "full", DATE1),
        snapshotless("bar", 2, "full", DATE2),
        snapshotless("bar", 3, "full", DATE3),
    ]

    process_journal_objects({"origin_visit_status": messages},
                            scheduler=swh_scheduler)

    bar_stats = swh_scheduler.origin_visit_stats_get([("bar", "git")])
    assert_visit_stats_ok(
        bar_stats[0],
        OriginVisitStats(
            url="bar",
            visit_type="git",
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.failed,
            next_position_offset=6,
            successive_visits=3,
        ),
    )
def test_journal_client_origin_visit_status_several_upsert(swh_scheduler):
    """Receiving an old message after a newer one keeps the newest state."""
    shared_snapshot = hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd")
    older_message = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE1,
        "type": "git",
        "snapshot": shared_snapshot,
    }
    newer_message = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE2,
        "type": "git",
        "snapshot": shared_snapshot,
    }

    # Process the newer message first, then the older (out-of-order) one
    for message in (newer_message, older_message):
        process_journal_objects({"origin_visit_status": [message]},
                                scheduler=swh_scheduler)

    stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    assert_visit_stats_ok(
        stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE2,
            last_visit=DATE2,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=shared_snapshot,
            next_position_offset=4,
            successive_visits=1,
        ),
    )
예제 #12
0
    def test_origin_visit_stats_upsert_with_snapshot(self,
                                                     swh_scheduler) -> None:
        """An upserted stats row with a snapshot is retrievable unchanged."""
        visit_date = utcnow()
        origin_url = "https://github.com/666/test"

        stats = OriginVisitStats(
            url=origin_url,
            visit_type="git",
            last_eventful=visit_date,
            last_uneventful=None,
            last_failed=None,
            last_notfound=None,
            last_snapshot=hash_to_bytes(
                "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        )
        swh_scheduler.origin_visit_stats_upsert([stats])

        # Retrieval keys on (url, visit_type): "git" matches, "svn" does not
        assert swh_scheduler.origin_visit_stats_get(
            [(origin_url, "git")]) == [stats]
        assert swh_scheduler.origin_visit_stats_get(
            [(origin_url, "svn")]) == []
예제 #13
0
    def test_grab_next_visits_oldest_scheduled_first(
        self,
        swh_scheduler,
        listed_origins_by_type,
    ):
        """Never-scheduled origins come first, then oldest-scheduled ones."""
        visit_type, origins = self._grab_next_visits_setup(
            swh_scheduler, listed_origins_by_type)

        # Every origin except the first gets a controlled last_scheduled
        # date, strictly decreasing along the list
        reference = datetime.datetime(
            2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
        stats_rows = [
            OriginVisitStats(
                url=origin.url,
                visit_type=origin.visit_type,
                last_snapshot=None,
                last_eventful=None,
                last_uneventful=None,
                last_failed=None,
                last_notfound=None,
                last_scheduled=reference - datetime.timedelta(seconds=rank),
            )
            for rank, origin in enumerate(origins[1:])
        ]
        swh_scheduler.origin_visit_stats_upsert(stats_rows)

        # The origin with a NULL last_scheduled comes first, then the others
        # from the oldest last_scheduled (end of the list) to the newest
        expected = [origins[0], *reversed(origins[1:])]

        self._check_grab_next_visit(
            swh_scheduler,
            visit_type=visit_type,
            policy="oldest_scheduled_first",
            expected=expected,
        )
def test_journal_client_origin_visit_status_from_journal_last_failed2(
        swh_scheduler):
    """Consecutive explicit failures accumulate in the failure counters."""
    messages = [
        {
            "origin": "bar",
            "visit": 2,
            "status": "failed",
            "date": DATE1,
            "type": "git",
            "snapshot": hash_to_bytes(
                "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        },
        {
            "origin": "bar",
            "visit": 3,
            "status": "failed",
            "date": DATE2,
            "type": "git",
            "snapshot": None,
        },
    ]

    process_journal_objects({"origin_visit_status": messages},
                            scheduler=swh_scheduler)

    stats = swh_scheduler.origin_visit_stats_get([("bar", "git")])
    assert_visit_stats_ok(
        stats[0],
        OriginVisitStats(
            url="bar",
            visit_type="git",
            last_visit=DATE2,
            last_visit_status=LastVisitStatus.failed,
            next_position_offset=5,
            successive_visits=2,
        ),
    )
예제 #15
0
    def test_origin_visit_stats_get_pagination(self, swh_scheduler) -> None:
        """Reads spanning more than one execute_values page return all rows."""
        # Default page size used by psycopg2.extras.execute_values
        default_page_size = inspect.signature(
            execute_values).parameters["page_size"].default

        # One entry more than a single page holds, to force pagination
        all_stats = [
            OriginVisitStats(
                url=f"https://example.com/origin-{index:03d}",
                visit_type="git",
                last_eventful=utcnow(),
                last_uneventful=None,
                last_failed=None,
                last_notfound=None,
            )
            for index in range(default_page_size + 1)
        ]

        swh_scheduler.origin_visit_stats_upsert(all_stats)

        fetched = swh_scheduler.origin_visit_stats_get(
            [(entry.url, entry.visit_type) for entry in all_stats])
        assert set(fetched) == set(all_stats)
예제 #16
0
    def test_origin_visit_stats_upsert(self, swh_scheduler) -> None:
        """Successive upserts merge per-field: previously set dates persist."""
        origin_url = "https://github.com/test"

        def upsert(**fields):
            # Upsert a stats row for origin_url/git with only the given
            # date fields set, and return the row that was sent
            stats = OriginVisitStats(
                url=origin_url,
                visit_type="git",
                last_eventful=fields.get("last_eventful"),
                last_uneventful=fields.get("last_uneventful"),
                last_failed=fields.get("last_failed"),
                last_notfound=fields.get("last_notfound"),
            )
            swh_scheduler.origin_visit_stats_upsert([stats])
            return stats

        eventful_date = utcnow()
        initial = upsert(last_eventful=eventful_date)
        # Re-upserting the identical row is a no-op
        swh_scheduler.origin_visit_stats_upsert([initial])

        assert swh_scheduler.origin_visit_stats_get(
            [(origin_url, "git")]) == [initial]
        # A different visit type is a different key entirely
        assert swh_scheduler.origin_visit_stats_get(
            [(origin_url, "svn")]) == []

        # Upserting only last_uneventful keeps the earlier last_eventful
        uneventful_date = utcnow()
        upsert(last_uneventful=uneventful_date)

        assert swh_scheduler.origin_visit_stats_get([(origin_url, "git")]) == [
            OriginVisitStats(
                url=origin_url,
                visit_type="git",
                last_eventful=eventful_date,
                last_uneventful=uneventful_date,
                last_failed=None,
                last_notfound=None,
            )
        ]

        # Upserting only last_failed keeps both earlier dates
        failed_date = utcnow()
        upsert(last_failed=failed_date)

        assert swh_scheduler.origin_visit_stats_get([(origin_url, "git")]) == [
            OriginVisitStats(
                url=origin_url,
                visit_type="git",
                last_eventful=eventful_date,
                last_uneventful=uneventful_date,
                last_failed=failed_date,
                last_notfound=None,
            )
        ]
def test_disable_failing_origins(swh_scheduler):
    """Origins failing too many times in a row get disabled in the scheduler."""

    # Register the origin in the scheduler so we can later observe its
    # deactivation
    lister = swh_scheduler.get_or_create_lister(name="something",
                                                instance_name="something")
    listed = ListedOrigin(url="bar",
                          enabled=True,
                          visit_type="svn",
                          lister_id=lister.id)
    swh_scheduler.record_listed_origins([listed])

    # Three consecutive failures for the same (origin, visit type)
    failures = [
        {
            "origin": "bar",
            "visit": visit_id,
            "status": "failed",
            "date": date,
            "type": "svn",
            "snapshot": None,
        }
        for visit_id, date in [(2, DATE1), (3, DATE2), (3, DATE3)]
    ]

    process_journal_objects({"origin_visit_status": failures},
                            scheduler=swh_scheduler)

    stats = swh_scheduler.origin_visit_stats_get([("bar", "svn")])
    assert_visit_stats_ok(
        stats[0],
        OriginVisitStats(
            url="bar",
            visit_type="svn",
            last_successful=None,
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.failed,
            next_position_offset=6,
            successive_visits=3,
        ),
    )

    # The failing origin must now show up in the disabled listing
    page = swh_scheduler.get_listed_origins(url="bar", enabled=False)

    assert len(page.results) == 1
    assert page.next_page_token is None

    for result in page.results:
        assert result.enabled is False
        assert result.lister_id == lister.id
        assert result.url == "bar"
        assert result.visit_type == "svn"
예제 #18
0
def process_journal_objects(messages: Dict[str, List[Dict]], *,
                            scheduler: SchedulerInterface) -> None:
    """Read messages from origin_visit_status journal topics, then inserts them in the
    scheduler "origin_visit_stats" table.

    Only final visit statuses are considered: messages whose status is
    "created" or "ongoing", or which lack a "type" field, are dropped.
    A final status without a snapshot is accounted as a failed visit.
    Out-of-order messages (dated before the latest recorded visit of the
    origin) are ignored when they would regress the recorded state.

    Worker function for `JournalClient.process(worker_fn)`, after
    currification of `scheduler` and `task_names`.

    Args:
        messages: mapping of journal topic name (must be ``msg_type``) to
            the list of message dicts read from that topic
        scheduler: scheduler backend whose visit stats get upserted

    """
    # Only the origin_visit_status topic is supported
    assert set(messages) <= {
        msg_type
    }, f"Got unexpected {', '.join(set(messages) - set([msg_type]))} message types"
    assert msg_type in messages, f"Expected {msg_type} messages"

    # Drop non-final statuses and malformed messages without a visit type
    interesting_messages = [
        msg for msg in messages[msg_type]
        if "type" in msg and msg["status"] not in ("created", "ongoing")
    ]

    if not interesting_messages:
        return

    # Load the currently recorded stats for every (origin, visit_type) pair
    # touched by this batch, keyed by that pair
    origin_visit_stats: Dict[Tuple[str, str], Dict] = {
        (visit_stats.url, visit_stats.visit_type): attr.asdict(visit_stats)
        for visit_stats in scheduler.origin_visit_stats_get(
            list(set(
                (vs["origin"], vs["type"]) for vs in interesting_messages)))
    }

    for msg_dict in interesting_messages:
        origin = msg_dict["origin"]
        visit_type = msg_dict["type"]
        # Blank stats to start from when the origin was never seen before
        empty_object = {
            "url": origin,
            "visit_type": visit_type,
            "last_uneventful": None,
            "last_eventful": None,
            "last_failed": None,
            "last_notfound": None,
            "last_snapshot": None,
        }
        pk = origin, visit_type
        if pk not in origin_visit_stats:
            origin_visit_stats[pk] = empty_object
        visit_stats_d = origin_visit_stats[pk]

        if msg_dict["status"] == "not_found":
            visit_stats_d["last_notfound"] = max_date(
                msg_dict["date"], visit_stats_d.get("last_notfound"))
        elif msg_dict["status"] == "failed" or msg_dict["snapshot"] is None:
            # An explicit failure, or a final visit that produced no
            # snapshot, both count as a failed visit (the two branches were
            # previously duplicated; merged via short-circuit `or`)
            visit_stats_d["last_failed"] = max_date(
                msg_dict["date"], visit_stats_d.get("last_failed"))
        else:  # visit with snapshot, something happened
            if visit_stats_d["last_snapshot"] is None:
                # first time visit with snapshot, we keep relevant information
                visit_stats_d["last_eventful"] = msg_dict["date"]
                visit_stats_d["last_snapshot"] = msg_dict["snapshot"]
            else:
                # visit with snapshot already stored, last_eventful should already be
                # stored
                assert visit_stats_d["last_eventful"] is not None
                latest_recorded_visit_date = max_date(
                    visit_stats_d["last_eventful"],
                    visit_stats_d["last_uneventful"])
                current_status_date = msg_dict["date"]
                previous_snapshot = visit_stats_d["last_snapshot"]
                if msg_dict["snapshot"] != previous_snapshot:
                    if (latest_recorded_visit_date and
                            current_status_date < latest_recorded_visit_date):
                        # out of order message so ignored
                        continue
                    # new eventful visit (new snapshot)
                    visit_stats_d["last_eventful"] = current_status_date
                    visit_stats_d["last_snapshot"] = msg_dict["snapshot"]
                else:
                    # same snapshot as before
                    if (latest_recorded_visit_date and
                            current_status_date < latest_recorded_visit_date):
                        # we receive an old message which is an earlier "eventful" event
                        # than what we had, we consider the last_eventful event as
                        # actually an uneventful event.
                        # The last uneventful visit remains the most recent:
                        # max, previously computed
                        visit_stats_d[
                            "last_uneventful"] = latest_recorded_visit_date
                        # The eventful visit remains the oldest one: min
                        visit_stats_d["last_eventful"] = min(
                            visit_stats_d["last_eventful"],
                            current_status_date)
                    elif (latest_recorded_visit_date and current_status_date
                          == latest_recorded_visit_date):
                        # A duplicated message must be ignored to avoid
                        # populating the last_uneventful message
                        continue
                    else:
                        # uneventful event
                        visit_stats_d["last_uneventful"] = current_status_date

    scheduler.origin_visit_stats_upsert(
        OriginVisitStats(**ovs) for ovs in origin_visit_stats.values())
예제 #19
0
def process_journal_objects(
    messages: Dict[str, List[Dict]], *, scheduler: SchedulerInterface
) -> None:
    """Read messages from origin_visit_status journal topic to update "origin_visit_stats"
    information on (origin, visit_type). The goal is to compute visit stats information
    per origin and visit_type: `last_successful`, `last_visit`, `last_visit_status`, ...

    Details:

        - This journal consumes origin visit status information for final visit
          status (`"full"`, `"partial"`, `"failed"`, `"not_found"`). It drops
          the information of non final visit statuses (`"ongoing"`,
          `"created"`).

        - This journal client only considers messages that arrive in
          chronological order. Messages that arrive out of order (i.e. with a
          date field smaller than the latest recorded visit of the origin) are
          ignored. This is a tradeoff between correctness and simplicity of
          implementation [1]_.

        - The snapshot is used to determine the eventful or uneventful nature of
          the origin visit.

        - When no snapshot is provided, the visit is considered as failed.

        - Finally, the `next_visit_queue_position` (position in the global per-origin
          type queue at which some new objects are expected to be added for the origin),
          and `next_position_offset` (duration that we expect to wait between visits of
          this origin) are updated.

        - When visits fail at least ``DISABLE_ORIGIN_THRESHOLD`` times in a row, the
          origins are disabled in the scheduler table. It's up to the lister to activate
          those back when they are listed again.

    This is a worker function to be used with `JournalClient.process(worker_fn)`, after
    currying of `scheduler` and `task_names`.

    .. [1] Ignoring out of order messages makes the initialization of the
      origin_visit_status table (from a full journal) less deterministic: only the
      `last_visit`, `last_visit_state` and `last_successful` fields are guaranteed
      to be exact, the `next_position_offset` field is a best effort estimate
      (which should converge once the client has run for a while on in-order
      messages).

    """
    # NOTE: the docstring above is deliberately a plain string literal. The
    # previous f-string version was evaluated and thrown away at definition
    # time instead of being stored in `__doc__` (flake8-bugbear B021).
    assert set(messages) <= {
        msg_type
    }, f"Got unexpected {', '.join(set(messages) - set([msg_type]))} message types"
    assert msg_type in messages, f"Expected {msg_type} messages"

    # Keep only final visit statuses; "created"/"ongoing" carry no stats
    # information (see docstring).
    interesting_messages = [
        msg
        for msg in messages[msg_type]
        if "type" in msg and msg["status"] not in ("created", "ongoing")
    ]

    if not interesting_messages:
        return

    # Load the currently known stats for every (origin, visit_type) pair
    # touched by this batch, keyed by that pair.
    origin_visit_stats: Dict[Tuple[str, str], Dict] = {
        (visit_stats.url, visit_stats.visit_type): attr.asdict(visit_stats)
        for visit_stats in scheduler.origin_visit_stats_get(
            list(set((vs["origin"], vs["type"]) for vs in interesting_messages))
        )
    }
    # Snapshot of the initial state so we can upsert only what changed.
    existing_origin_visit_stats = copy.deepcopy(origin_visit_stats)

    # Use the default values from the model object
    empty_object = {
        field.name: field.default if field.default != attr.NOTHING else None
        for field in attr.fields(OriginVisitStats)
    }

    disabled_urls: List[str] = []

    # Retrieve the global queue state
    queue_position_per_visit_type = scheduler.visit_scheduler_queue_position_get()

    for msg_dict in interesting_messages:
        origin = msg_dict["origin"]
        visit_type = msg_dict["type"]
        pk = origin, visit_type
        if pk not in origin_visit_stats:
            # First time we see this (origin, visit_type): start from defaults.
            origin_visit_stats[pk] = {
                **empty_object,
                "url": origin,
                "visit_type": visit_type,
            }

        visit_stats_d = origin_visit_stats[pk]

        if (
            visit_stats_d.get("last_visit")
            and msg_dict["date"] <= visit_stats_d["last_visit"]
        ):
            # message received out of order, ignore
            continue

        # Compare incoming message to known status of the origin, to determine
        # eventfulness
        last_visit_status, eventful = get_last_status(msg_dict, visit_stats_d)

        # Update the position offset according to the visit status,
        # if we had already visited this origin before.

        if visit_stats_d.get("last_visit"):
            # Update the next position offset according to the existing value and the
            # eventfulness of the visit: eventful visits shorten the wait (-2),
            # others lengthen it (+1).
            increment = -2 if eventful else 1
            # Limit the next_position_offset for acceptable date computations
            current_offset = min(
                visit_stats_d["next_position_offset"] + increment,
                MAX_NEXT_POSITION_OFFSET,
            )
            # Clamp to [0, MAX_NEXT_POSITION_OFFSET]
            visit_stats_d["next_position_offset"] = max(0, current_offset)
            # increment the counter when last_visit_status is the same
            same_visit_status = last_visit_status == visit_stats_d["last_visit_status"]
        else:
            same_visit_status = False

        # Record current visit date as highest known date (we've rejected out of order
        # messages earlier).
        visit_stats_d["last_visit"] = msg_dict["date"]
        visit_stats_d["last_visit_status"] = last_visit_status

        # Record last successful visit date
        if last_visit_status == LastVisitStatus.successful:
            visit_stats_d["last_successful"] = max_date(
                msg_dict["date"], visit_stats_d.get("last_successful")
            )
            visit_stats_d["last_snapshot"] = msg_dict["snapshot"]

        # Update the next visit queue position (which will be used solely for origin
        # without any last_update, cf. the dedicated scheduling policy
        # "origins_without_last_update")
        visit_stats_d["next_visit_queue_position"] = next_visit_queue_position(
            queue_position_per_visit_type, visit_stats_d
        )

        # Count successive visits with the same status; reset on any change.
        visit_stats_d["successive_visits"] = (
            visit_stats_d["successive_visits"] + 1 if same_visit_status else 1
        )

        # Disable recurring failing/not-found origins
        if (
            visit_stats_d["last_visit_status"]
            in [LastVisitStatus.not_found, LastVisitStatus.failed]
        ) and visit_stats_d["successive_visits"] >= DISABLE_ORIGIN_THRESHOLD:
            disabled_urls.append(visit_stats_d["url"])

    # Only upsert changed values
    to_upsert = []
    for key, ovs in origin_visit_stats.items():
        if (
            key not in existing_origin_visit_stats
            or ovs != existing_origin_visit_stats[key]
        ):
            to_upsert.append(OriginVisitStats(**ovs))

    if to_upsert:
        scheduler.origin_visit_stats_upsert(to_upsert)

    # Disable any origins if any
    if disabled_urls:
        disabled_origins = []
        # dict.fromkeys dedupes while preserving insertion order: an origin
        # appended several times within this batch is only looked up and
        # disabled once.
        for url in dict.fromkeys(disabled_urls):
            origins = scheduler.get_listed_origins(url=url).results
            if len(origins) > 0:
                origin = attr.evolve(origins[0], enabled=False)
                disabled_origins.append(origin)

        if disabled_origins:
            scheduler.record_listed_origins(disabled_origins)
def test_journal_client_origin_visit_status_from_journal_last_not_found(
        swh_scheduler):
    """Successive "not_found" final statuses accumulate: last_visit tracks the
    most recent date, successive_visits counts the streak and
    next_position_offset grows by one per uneventful visit.

    """

    def _not_found_message(visit, date):
        # Build a final "not_found" origin-visit-status journal message for
        # origin "foo"/"git" with the given visit number and date.
        return {
            "origin": "foo",
            "visit": visit,
            "status": "not_found",
            "date": date,
            "type": "git",
            "snapshot": None,
        }

    first_message = _not_found_message(1, DATE1)

    process_journal_objects(
        {"origin_visit_status": [first_message]}, scheduler=swh_scheduler
    )

    stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    assert_visit_stats_ok(
        stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_visit=first_message["date"],
            last_visit_status=LastVisitStatus.not_found,
            next_position_offset=4,
            successive_visits=1,
        ),
    )

    follow_up_messages = [
        _not_found_message(3, DATE2),
        _not_found_message(4, DATE3),
    ]

    process_journal_objects(
        {"origin_visit_status": follow_up_messages}, scheduler=swh_scheduler
    )

    stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    assert_visit_stats_ok(
        stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.not_found,
            next_position_offset=6,
            successive_visits=3,
        ),
    )