def test_origin_visit_stats_upsert_cardinality_failing(self, swh_scheduler) -> None:
    """A single batch upsert must not target the same origin visit stats twice.

    Two entries sharing the same ``(url, visit_type)`` key are submitted in one
    call; the scheduler is expected to raise a ``SchedulerException`` whose
    message matches ``CardinalityViolation``.
    """
    # Two distinct objects (each gets its own timestamp) for the very same key.
    conflicting_stats = [
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_eventful=None,
            last_uneventful=utcnow(),
            last_notfound=None,
            last_failed=None,
            last_snapshot=None,
        )
        for _ in range(2)
    ]

    with pytest.raises(SchedulerException, match="CardinalityViolation"):
        swh_scheduler.origin_visit_stats_upsert(conflicting_stats)
def test_origin_visit_stats_upsert_batch(self, swh_scheduler) -> None:
    """Batch upsert of entries for distinct origins is ok.

    Two visit stats with different urls are upserted in a single call, then
    read back through ``origin_visit_stats_get``.
    """
    visit_stats = [
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_eventful=utcnow(),
            last_uneventful=None,
            last_failed=None,
            last_notfound=None,
            last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        ),
        OriginVisitStats(
            url="bar",
            visit_type="git",
            last_eventful=None,
            last_uneventful=utcnow(),
            last_notfound=None,
            last_failed=None,
            last_snapshot=hash_to_bytes("fffcc0710eb6cf9efd5b920a8453e1e07157bfff"),
        ),
    ]

    swh_scheduler.origin_visit_stats_upsert(visit_stats)

    stored_stats = list(
        swh_scheduler.origin_visit_stats_get(
            [(vs.url, vs.visit_type) for vs in visit_stats]
        )
    )

    # Without this check the loop below would pass vacuously if the scheduler
    # returned nothing at all for the keys we have just written.
    assert len(stored_stats) == len(visit_stats)

    for visit_stat in stored_stats:
        assert visit_stat is not None
def test_metrics_origins_never_visited(self, swh_scheduler, listed_origins):
    """Check the "never visited" metric once a single origin has visit stats.

    After recording the listed origins and upserting visit stats for the first
    one, the metrics returned by ``update_metrics`` must report exactly one
    visited origin for that origin's visit type, and none for the others.
    """
    swh_scheduler.record_listed_origins(listed_origins)

    # Pretend that we've recorded a visit on one origin
    visited = listed_origins[0]
    recorded_visit = OriginVisitStats(
        url=visited.url,
        visit_type=visited.visit_type,
        last_eventful=utcnow(),
        last_uneventful=None,
        last_failed=None,
        last_notfound=None,
        last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    )
    swh_scheduler.origin_visit_stats_upsert([recorded_visit])

    metrics = swh_scheduler.update_metrics(lister_id=visited.lister_id)
    for metric in metrics:
        if metric.visit_type != visited.visit_type:
            # None of the origins of the other visit types have been visited
            assert metric.origins_known == metric.origins_never_visited
            continue
        # We visited exactly one of the origins of this visit type
        assert metric.origins_known - metric.origins_never_visited == 1
def test_metrics_origins_with_pending_changes(self, swh_scheduler, listed_origins):
    """Check the "pending changes" metric for an origin visited before its update.

    One origin gets visit stats dated one day before its ``last_update``; the
    metrics for its visit type must then count one origin with pending
    changes, and the metrics for every other visit type must count none.
    """
    swh_scheduler.record_listed_origins(listed_origins)

    # Pretend that we've recorded a visit on one origin, in the past with
    # respect to the "last update" time for the origin
    visited = listed_origins[0]
    assert visited.last_update is not None
    past_visit_date = visited.last_update - datetime.timedelta(days=1)

    stale_visit = OriginVisitStats(
        url=visited.url,
        visit_type=visited.visit_type,
        last_eventful=past_visit_date,
        last_uneventful=None,
        last_failed=None,
        last_notfound=None,
        last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    )
    swh_scheduler.origin_visit_stats_upsert([stale_visit])

    metrics = swh_scheduler.update_metrics(lister_id=visited.lister_id)
    for metric in metrics:
        if metric.visit_type != visited.visit_type:
            # None of the origins of the other visit types have been visited
            assert metric.origins_with_pending_changes == 0
            continue
        # We visited one of these origins, in the past
        assert metric.origins_with_pending_changes == 1
def test_journal_client_origin_visit_statuses_same_snapshot_permutation(
    visit_statuses, swh_scheduler
):
    """Ensure out of order topic subscription ends up in the same final state"""
    process_journal_objects(
        {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler
    )

    stored_stats = swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")])
    visit_stats = stored_stats[0]

    latest_date = DATE1 + 2 * ONE_YEAR
    expected_stats = OriginVisitStats(
        url="cavabarder",
        visit_type="hg",
        last_successful=latest_date,
        last_visit=latest_date,
        last_visit_status=LastVisitStatus.successful,
        last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
    )
    # Fields whose exact value depends on the order the messages arrived in;
    # they are checked against a range below instead.
    order_dependent_fields = [
        "next_visit_queue_position",
        "next_position_offset",
        "successive_visits",
    ]
    assert_visit_stats_ok(
        visit_stats, expected_stats, ignore_fields=order_dependent_fields
    )

    # We ignore out of order messages, so the next_position_offset isn't exact
    # depending on the permutation. What matters is consistency of the final
    # dates (last_visit and last_successful).
    assert 4 <= visit_stats.next_position_offset <= 6
    # same goes for successive_visits
    assert 1 <= visit_stats.successive_visits <= 3
def test_journal_client_origin_visit_status_duplicated_messages(swh_scheduler):
    """A duplicated message must be ignored"""
    snapshot_id = hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd")
    visit_status = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE1,
        "type": "git",
        "snapshot": snapshot_id,
    }

    # Deliver the very same message twice, in two separate batches.
    for _ in range(2):
        process_journal_objects(
            {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler
        )

    stored_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    expected_stats = OriginVisitStats(
        url="foo",
        visit_type="git",
        last_successful=DATE1,
        last_visit=DATE1,
        last_visit_status=LastVisitStatus.successful,
        last_snapshot=snapshot_id,
        successive_visits=1,
    )
    assert_visit_stats_ok(stored_stats[0], expected_stats)
def test_journal_client_origin_visit_status_from_journal_last_successful(
    swh_scheduler,
):
    """Successive visits with snapshots end up recorded as a successful visit.

    Three visits of ``foo`` (full, partial, full) are processed in one batch
    together with an unrelated visit of ``bar``; the stats stored for ``foo``
    must reflect the last visit.
    """

    def git_visit(origin, visit, status, date, snapshot_hex):
        # Build one origin_visit_status message for a "git" origin.
        return {
            "origin": origin,
            "visit": visit,
            "status": status,
            "date": date,
            "type": "git",
            "snapshot": hash_to_bytes(snapshot_hex),
        }

    last_snapshot_hex = "dddcc0710eb6cf9efd5b920a8453e1e07157bddd"
    visit_statuses = [
        git_visit(
            "bar", 1, "partial", utcnow(), "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"
        ),
        git_visit("foo", 1, "full", DATE1, "eeecc0710eb6cf9efd5b920a8453e1e07157bfff"),
        git_visit(
            "foo", 2, "partial", DATE2, "aaacc0710eb6cf9efd5b920a8453e1e07157baaa"
        ),
        git_visit("foo", 3, "full", DATE3, last_snapshot_hex),
    ]

    process_journal_objects(
        {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler
    )

    stored_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    expected_stats = OriginVisitStats(
        url="foo",
        visit_type="git",
        last_successful=DATE3,
        last_visit=DATE3,
        last_visit_status=LastVisitStatus.successful,
        last_snapshot=hash_to_bytes(last_snapshot_hex),
        next_position_offset=0,
        successive_visits=3,
    )
    assert_visit_stats_ok(stored_stats[0], expected_stats)
def test_journal_client_origin_visit_status_from_journal_last_uneventful(
    swh_scheduler,
):
    """A visit yielding the already-known snapshot bumps the position offset.

    Visit stats with a previous (failed) visit and a known snapshot are stored
    first; a later "full" visit with that same snapshot is then processed.
    """
    origin_url = "foo"
    visit_type = "git"
    known_snapshot = hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd")
    new_visit_date = DATE3 + ONE_DAY

    visit_status = {
        "origin": origin_url,
        "visit": 1,
        "status": "full",
        "date": new_visit_date,
        "type": visit_type,
        "snapshot": known_snapshot,
    }

    # Let's insert some visit stats with some previous visit information
    previous_stats = OriginVisitStats(
        url=origin_url,
        visit_type=visit_type,
        last_successful=DATE2,
        last_visit=DATE3,
        last_visit_status=LastVisitStatus.failed,
        last_snapshot=known_snapshot,
        next_visit_queue_position=None,
        next_position_offset=4,
        successive_visits=1,
    )
    swh_scheduler.origin_visit_stats_upsert([previous_stats])

    process_journal_objects(
        {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler
    )

    stored_stats = swh_scheduler.origin_visit_stats_get([(origin_url, visit_type)])
    expected_stats = OriginVisitStats(
        url=origin_url,
        visit_type=visit_type,
        last_visit=new_visit_date,
        last_successful=new_visit_date,
        last_visit_status=LastVisitStatus.successful,
        last_snapshot=known_snapshot,
        next_visit_queue_position=None,
        next_position_offset=5,
        successive_visits=1,
    )
    assert_visit_stats_ok(stored_stats[0], expected_stats)
def test_grab_next_visits_already_visited_order_by_lag(
    self, swh_scheduler, listed_origins_by_type,
):
    """Check the ``already_visited_order_by_lag`` scheduling policy.

    Only origins that were visited *before* their last update are expected
    back, ordered by the key ``visit_date - last_update`` (ascending).
    """
    visit_type, origins = self._grab_next_visits_setup(
        swh_scheduler, listed_origins_by_type
    )

    # Update known origins with a `last_update` field that we control
    base_date = datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    updated_origins = []
    for offset, origin in enumerate(origins):
        last_update = base_date - datetime.timedelta(seconds=offset)
        updated_origins.append(attr.evolve(origin, last_update=last_update))
    updated_origins = swh_scheduler.record_listed_origins(updated_origins)

    # Update the visit stats with a known visit at a controlled date for
    # half the origins. Pick the date in the middle of the
    # updated_origins' `last_update` range
    middle_index = len(updated_origins) // 2
    visit_date = updated_origins[middle_index].last_update
    visited_origins = updated_origins[::2]

    snapshot_id_hex = "d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"
    visit_stats = [
        OriginVisitStats(
            url=origin.url,
            visit_type=origin.visit_type,
            last_snapshot=hash_to_bytes(snapshot_id_hex),
            last_eventful=visit_date,
            last_uneventful=None,
            last_failed=None,
            last_notfound=None,
        )
        for origin in visited_origins
    ]
    swh_scheduler.origin_visit_stats_upsert(visit_stats)

    def sort_key(origin):
        # Negative for the origins kept below; ascending sort therefore puts
        # the origin updated longest after the visit first.
        return visit_date - origin.last_update

    # We expect to retrieve visited origins with the largest lag, but only
    # those which haven't been visited since their last update
    updated_since_visit = [
        origin for origin in visited_origins if origin.last_update > visit_date
    ]
    expected_origins = sorted(updated_since_visit, key=sort_key)

    self._check_grab_next_visit(
        swh_scheduler,
        visit_type=visit_type,
        policy="already_visited_order_by_lag",
        expected=expected_origins,
    )
def test_journal_client_origin_visit_status_from_journal_last_failed(swh_scheduler):
    """Visits without any snapshot are recorded as failed visits.

    Three successive "full" visits of ``bar`` carrying no snapshot are
    processed (along with an unrelated visit of ``foo``); the stats stored for
    ``bar`` must show a failed last visit and three successive visits.
    """

    def visit_without_snapshot(origin, visit, status, date):
        # Build one "git" origin_visit_status message that has no snapshot.
        return {
            "origin": origin,
            "visit": visit,
            "status": status,
            "date": date,
            "type": "git",
            "snapshot": None,
        }

    visit_statuses = [
        visit_without_snapshot("foo", 1, "partial", utcnow()),
        visit_without_snapshot("bar", 1, "full", DATE1),
        visit_without_snapshot("bar", 2, "full", DATE2),
        visit_without_snapshot("bar", 3, "full", DATE3),
    ]

    process_journal_objects(
        {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler
    )

    stored_stats = swh_scheduler.origin_visit_stats_get([("bar", "git")])
    expected_stats = OriginVisitStats(
        url="bar",
        visit_type="git",
        last_visit=DATE3,
        last_visit_status=LastVisitStatus.failed,
        next_position_offset=6,
        successive_visits=3,
    )
    assert_visit_stats_ok(stored_stats[0], expected_stats)
def test_journal_client_origin_visit_status_several_upsert(swh_scheduler):
    """A message older than the recorded visit leaves the stored stats as is.

    The DATE2 message is processed first, then the DATE1 one; the stored
    stats must still be the ones derived from the DATE2 message.
    """
    snapshot_id = hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd")

    def full_visit_at(date):
        # Both messages only differ by their date.
        return {
            "origin": "foo",
            "visit": 1,
            "status": "full",
            "date": date,
            "type": "git",
            "snapshot": snapshot_id,
        }

    older_status = full_visit_at(DATE1)
    newer_status = full_visit_at(DATE2)

    # Newest first, then the older one, each in its own batch.
    for status in (newer_status, older_status):
        process_journal_objects(
            {"origin_visit_status": [status]}, scheduler=swh_scheduler
        )

    stored_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    expected_stats = OriginVisitStats(
        url="foo",
        visit_type="git",
        last_successful=DATE2,
        last_visit=DATE2,
        last_visit_status=LastVisitStatus.successful,
        last_snapshot=snapshot_id,
        next_position_offset=4,
        successive_visits=1,
    )
    assert_visit_stats_ok(stored_stats[0], expected_stats)
def test_origin_visit_stats_upsert_with_snapshot(self, swh_scheduler) -> None:
    """Visit stats carrying a snapshot id are stored and read back unchanged.

    The entry must be returned for its own ``(url, visit_type)`` key and must
    not be returned when asking for the same url with another visit type.
    """
    url = "https://github.com/666/test"
    eventful_date = utcnow()
    snapshot_id = hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd")

    visit_stats = OriginVisitStats(
        url=url,
        visit_type="git",
        last_eventful=eventful_date,
        last_uneventful=None,
        last_failed=None,
        last_notfound=None,
        last_snapshot=snapshot_id,
    )
    swh_scheduler.origin_visit_stats_upsert([visit_stats])

    git_stats = swh_scheduler.origin_visit_stats_get([(url, "git")])
    assert git_stats == [visit_stats]

    svn_stats = swh_scheduler.origin_visit_stats_get([(url, "svn")])
    assert svn_stats == []
def test_grab_next_visits_oldest_scheduled_first(
    self, swh_scheduler, listed_origins_by_type,
):
    """Check the ``oldest_scheduled_first`` scheduling policy.

    Every origin but the first gets a ``last_scheduled`` date, each one second
    older than the previous; the policy is expected to yield the unscheduled
    origin first, then the others from oldest to newest ``last_scheduled``.
    """
    visit_type, origins = self._grab_next_visits_setup(
        swh_scheduler, listed_origins_by_type
    )
    never_scheduled = origins[0]
    already_scheduled = origins[1:]

    # Give all origins but one a last_scheduled date
    base_date = datetime.datetime(2020, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
    visit_stats = []
    for age, origin in enumerate(already_scheduled):
        visit_stats.append(
            OriginVisitStats(
                url=origin.url,
                visit_type=origin.visit_type,
                last_snapshot=None,
                last_eventful=None,
                last_uneventful=None,
                last_failed=None,
                last_notfound=None,
                last_scheduled=base_date - datetime.timedelta(seconds=age),
            )
        )
    swh_scheduler.origin_visit_stats_upsert(visit_stats)

    # We expect to retrieve the origin with a NULL last_scheduled
    # as well as those with the oldest values (i.e. the last ones), in order.
    expected = [never_scheduled] + already_scheduled[::-1]

    self._check_grab_next_visit(
        swh_scheduler,
        visit_type=visit_type,
        policy="oldest_scheduled_first",
        expected=expected,
    )
def test_journal_client_origin_visit_status_from_journal_last_failed2(swh_scheduler):
    """Two successive "failed" visits are counted as two failed visits.

    The first failed visit carries a snapshot, the second does not; both are
    processed in the same batch.
    """
    first_failure = {
        "origin": "bar",
        "visit": 2,
        "status": "failed",
        "date": DATE1,
        "type": "git",
        "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    }
    second_failure = {
        "origin": "bar",
        "visit": 3,
        "status": "failed",
        "date": DATE2,
        "type": "git",
        "snapshot": None,
    }

    process_journal_objects(
        {"origin_visit_status": [first_failure, second_failure]},
        scheduler=swh_scheduler,
    )

    stored_stats = swh_scheduler.origin_visit_stats_get([("bar", "git")])
    expected_stats = OriginVisitStats(
        url="bar",
        visit_type="git",
        last_visit=DATE2,
        last_visit_status=LastVisitStatus.failed,
        next_position_offset=5,
        successive_visits=2,
    )
    assert_visit_stats_ok(stored_stats[0], expected_stats)
def test_origin_visit_stats_get_pagination(self, swh_scheduler) -> None:
    """Reading back more entries than one ``execute_values`` page returns them all.

    The number of entries is derived from the default ``page_size`` of
    ``execute_values`` so that the lookup needs more than a single page.
    """
    signature = inspect.signature(execute_values)
    page_size = signature.parameters["page_size"].default

    # Ensure overflow of the psycopg2.extras.execute_values page_size
    origin_count = page_size + 1

    visit_stats = []
    for index in range(origin_count):
        visit_stats.append(
            OriginVisitStats(
                url=f"https://example.com/origin-{index:03d}",
                visit_type="git",
                last_eventful=utcnow(),
                last_uneventful=None,
                last_failed=None,
                last_notfound=None,
            )
        )

    swh_scheduler.origin_visit_stats_upsert(visit_stats)

    lookup_keys = [(ovs.url, ovs.visit_type) for ovs in visit_stats]
    stored_stats = swh_scheduler.origin_visit_stats_get(lookup_keys)
    assert set(stored_stats) == set(visit_stats)
def test_origin_visit_stats_upsert(self, swh_scheduler) -> None:
    """Successive upserts for one origin accumulate the dates they carry.

    An eventful, then an uneventful, then a failed visit date are upserted for
    the same ``(url, "git")`` key; after each step the stored entry must hold
    every date recorded so far, even though each upsert only sets one of them.
    """
    url = "https://github.com/test"

    def git_stats(last_eventful=None, last_uneventful=None, last_failed=None):
        # Visit stats for `url`; any date not given is left to None.
        return OriginVisitStats(
            url=url,
            visit_type="git",
            last_eventful=last_eventful,
            last_uneventful=last_uneventful,
            last_failed=last_failed,
            last_notfound=None,
        )

    def stored_git_stats():
        return swh_scheduler.origin_visit_stats_get([(url, "git")])

    # Step 1: eventful visit, upserted twice in a row
    eventful_date = utcnow()
    visit_stats = git_stats(last_eventful=eventful_date)
    for _ in range(2):
        swh_scheduler.origin_visit_stats_upsert([visit_stats])

    assert stored_git_stats() == [visit_stats]
    assert swh_scheduler.origin_visit_stats_get([(url, "svn")]) == []

    # Step 2: uneventful visit, the eventful date must be kept
    uneventful_date = utcnow()
    swh_scheduler.origin_visit_stats_upsert(
        [git_stats(last_uneventful=uneventful_date)]
    )

    uneventful_visits = stored_git_stats()
    assert uneventful_visits == [
        git_stats(last_eventful=eventful_date, last_uneventful=uneventful_date)
    ]

    # Step 3: failed visit, both previous dates must be kept
    failed_date = utcnow()
    swh_scheduler.origin_visit_stats_upsert([git_stats(last_failed=failed_date)])

    failed_visits = stored_git_stats()
    assert failed_visits == [
        git_stats(
            last_eventful=eventful_date,
            last_uneventful=uneventful_date,
            last_failed=failed_date,
        )
    ]
def test_disable_failing_origins(swh_scheduler):
    """Origin with too many failed attempts ends up being deactivated in the
    scheduler.

    """
    # actually store the origin in the scheduler so we can check it's deactivated in the
    # end.
    lister = swh_scheduler.get_or_create_lister(
        name="something", instance_name="something"
    )
    listed_origin = ListedOrigin(
        url="bar", enabled=True, visit_type="svn", lister_id=lister.id
    )
    swh_scheduler.record_listed_origins([listed_origin])

    def failed_svn_visit(visit, date):
        # Build one failed "svn" visit status for origin "bar".
        return {
            "origin": "bar",
            "visit": visit,
            "status": "failed",
            "date": date,
            "type": "svn",
            "snapshot": None,
        }

    visit_statuses = [
        failed_svn_visit(2, DATE1),
        failed_svn_visit(3, DATE2),
        failed_svn_visit(3, DATE3),
    ]

    process_journal_objects(
        {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler
    )

    stored_stats = swh_scheduler.origin_visit_stats_get([("bar", "svn")])
    expected_stats = OriginVisitStats(
        url="bar",
        visit_type="svn",
        last_successful=None,
        last_visit=DATE3,
        last_visit_status=LastVisitStatus.failed,
        next_position_offset=6,
        successive_visits=3,
    )
    assert_visit_stats_ok(stored_stats[0], expected_stats)

    # Now check that the origin in question is disabled
    disabled_page = swh_scheduler.get_listed_origins(url="bar", enabled=False)

    assert len(disabled_page.results) == 1
    assert disabled_page.next_page_token is None

    for disabled_origin in disabled_page.results:
        assert disabled_origin.enabled is False
        assert disabled_origin.lister_id == lister.id
        assert disabled_origin.url == "bar"
        assert disabled_origin.visit_type == "svn"
def process_journal_objects(
    messages: Dict[str, List[Dict]], *, scheduler: SchedulerInterface
) -> None:
    """Read messages from the origin_visit_status journal topic, then upsert the
    derived information in the scheduler "origin_visit_stats" table.

    Messages without a "type" key, or whose status is "created" or "ongoing",
    are dropped. For each remaining message, the visit stats of its
    (origin, type) pair are updated as follows:

    - status "not_found": `last_notfound` becomes the most recent of the
      message date and the stored value;
    - status "failed", or no snapshot: `last_failed` becomes the most recent
      of the message date and the stored value;
    - otherwise (visit with a snapshot): `last_eventful`, `last_uneventful`
      and `last_snapshot` are updated depending on whether the snapshot
      differs from the stored one and on how the message date compares to the
      latest recorded eventful/uneventful date.

    Worker function for `JournalClient.process(worker_fn)`, after
    currification of `scheduler`.

    Args:
        messages: journal objects, indexed by message type; only the
          `msg_type` key is accepted and it must be present
        scheduler: scheduler whose visit stats are read
          (`origin_visit_stats_get`) and written (`origin_visit_stats_upsert`)

    """
    assert set(messages) <= {
        msg_type
    }, f"Got unexpected {', '.join(set(messages) - {msg_type})} message types"
    assert msg_type in messages, f"Expected {msg_type} messages"

    interesting_messages = [
        msg
        for msg in messages[msg_type]
        if "type" in msg and msg["status"] not in ("created", "ongoing")
    ]

    if not interesting_messages:
        return

    # Current state of the stats for every (origin, type) seen in this batch
    known_keys = list({(msg["origin"], msg["type"]) for msg in interesting_messages})
    origin_visit_stats: Dict[Tuple[str, str], Dict] = {
        (visit_stats.url, visit_stats.visit_type): attr.asdict(visit_stats)
        for visit_stats in scheduler.origin_visit_stats_get(known_keys)
    }

    for msg_dict in interesting_messages:
        origin = msg_dict["origin"]
        visit_type = msg_dict["type"]
        pk = origin, visit_type
        if pk not in origin_visit_stats:
            # First time we hear about this (origin, type): start from scratch
            origin_visit_stats[pk] = {
                "url": origin,
                "visit_type": visit_type,
                "last_uneventful": None,
                "last_eventful": None,
                "last_failed": None,
                "last_notfound": None,
                "last_snapshot": None,
            }
        visit_stats_d = origin_visit_stats[pk]

        status = msg_dict["status"]
        current_status_date = msg_dict["date"]

        if status == "not_found":
            visit_stats_d["last_notfound"] = max_date(
                current_status_date, visit_stats_d.get("last_notfound")
            )
            continue

        if status == "failed" or msg_dict["snapshot"] is None:
            # An explicit failure and a visit without snapshot are recorded
            # the same way
            visit_stats_d["last_failed"] = max_date(
                current_status_date, visit_stats_d.get("last_failed")
            )
            continue

        # From here on: visit with snapshot, something happened
        snapshot = msg_dict["snapshot"]

        if visit_stats_d["last_snapshot"] is None:
            # first time visit with snapshot, we keep relevant information
            visit_stats_d["last_eventful"] = current_status_date
            visit_stats_d["last_snapshot"] = snapshot
            continue

        # visit with snapshot already stored, last_eventful should already be
        # stored
        assert visit_stats_d["last_eventful"] is not None
        latest_recorded_visit_date = max_date(
            visit_stats_d["last_eventful"], visit_stats_d["last_uneventful"]
        )
        # Computed once, used by both the "new snapshot" and "same snapshot"
        # cases below
        out_of_order = bool(
            latest_recorded_visit_date
            and current_status_date < latest_recorded_visit_date
        )

        if snapshot != visit_stats_d["last_snapshot"]:
            if out_of_order:
                # out of order message so ignored
                continue
            # new eventful visit (new snapshot)
            visit_stats_d["last_eventful"] = current_status_date
            visit_stats_d["last_snapshot"] = snapshot
        elif out_of_order:
            # same snapshot as before, but we receive an old message which is
            # an earlier "eventful" event than what we had, we consider the
            # last_eventful event as actually an uneventful event.
            # The last uneventful visit remains the most recent:
            # max, previously computed
            visit_stats_d["last_uneventful"] = latest_recorded_visit_date
            # The eventful visit remains the oldest one: min
            visit_stats_d["last_eventful"] = min(
                visit_stats_d["last_eventful"], current_status_date
            )
        elif (
            latest_recorded_visit_date
            and current_status_date == latest_recorded_visit_date
        ):
            # A duplicated message must be ignored to avoid
            # populating the last_uneventful message
            continue
        else:
            # uneventful event
            visit_stats_d["last_uneventful"] = current_status_date

    # Hand over a concrete list rather than a one-shot generator, as every
    # other caller of origin_visit_stats_upsert in this code base does.
    scheduler.origin_visit_stats_upsert(
        [OriginVisitStats(**ovs) for ovs in origin_visit_stats.values()]
    )
def process_journal_objects(
    messages: Dict[str, List[Dict]], *, scheduler: SchedulerInterface
) -> None:
    """Read messages from origin_visit_status journal topic to update
    "origin_visit_stats" information on (origin, visit_type). The goal is to
    compute visit stats information per origin and visit_type:
    `last_successful`, `last_visit`, `last_visit_status`, ...

    Details:

        - This journal consumes origin visit status information for final visit
          status (`"full"`, `"partial"`, `"failed"`, `"not_found"`). It drops
          the information of non final visit statuses (`"ongoing"`,
          `"created"`).

        - This journal client only considers messages that arrive in
          chronological order. Messages that arrive out of order (i.e. with a
          date field smaller than or equal to the latest recorded visit of the
          origin) are ignored. This is a tradeoff between correctness and
          simplicity of implementation [1]_.

        - The snapshot is used to determine the eventful or uneventful nature
          of the origin visit.

        - When no snapshot is provided, the visit is considered as failed.

        - Finally, the `next_visit_queue_position` (position in the global
          per-origin type queue at which some new objects are expected to be
          added for the origin), and `next_position_offset` (duration that we
          expect to wait between visits of this origin) are updated.

        - When visits fail (or the origin is not found) at least
          `DISABLE_ORIGIN_THRESHOLD` times in a row, the origins are disabled
          in the scheduler table. It's up to the lister to activate those back
          when they are listed again.

    This is a worker function to be used with
    `JournalClient.process(worker_fn)`, after currification of `scheduler`.

    Args:
        messages: journal objects, indexed by message type; only the
          `msg_type` key is accepted and it must be present
        scheduler: scheduler instance whose visit stats, queue positions and
          listed origins are read and updated

    .. [1] Ignoring out of order messages makes the initialization of the
      origin_visit_stats table (from a full journal) less deterministic: only
      the `last_visit`, `last_visit_status` and `last_successful` fields are
      guaranteed to be exact, the `next_position_offset` field is a best
      effort estimate (which should converge once the client has run for a
      while on in-order messages).

    """
    # NOTE: the docstring above is deliberately a plain string literal. An
    # f-string is never stored as ``__doc__``, so interpolating the threshold
    # value there would leave this function undocumented at runtime.
    assert set(messages) <= {
        msg_type
    }, f"Got unexpected {', '.join(set(messages) - {msg_type})} message types"
    assert msg_type in messages, f"Expected {msg_type} messages"

    interesting_messages = [
        msg
        for msg in messages[msg_type]
        if "type" in msg and msg["status"] not in ("created", "ongoing")
    ]

    if not interesting_messages:
        return

    origin_visit_stats: Dict[Tuple[str, str], Dict] = {
        (visit_stats.url, visit_stats.visit_type): attr.asdict(visit_stats)
        for visit_stats in scheduler.origin_visit_stats_get(
            list({(vs["origin"], vs["type"]) for vs in interesting_messages})
        )
    }
    # Snapshot of the stored state, used at the end to upsert only what changed
    existing_origin_visit_stats = copy.deepcopy(origin_visit_stats)

    # Use the default values from the model object
    empty_object = {
        field.name: field.default if field.default != attr.NOTHING else None
        for field in attr.fields(OriginVisitStats)
    }

    disabled_urls: List[str] = []

    # Retrieve the global queue state
    queue_position_per_visit_type = scheduler.visit_scheduler_queue_position_get()

    for msg_dict in interesting_messages:
        origin = msg_dict["origin"]
        visit_type = msg_dict["type"]
        pk = origin, visit_type
        if pk not in origin_visit_stats:
            origin_visit_stats[pk] = {
                **empty_object,
                "url": origin,
                "visit_type": visit_type,
            }

        visit_stats_d = origin_visit_stats[pk]

        if (
            visit_stats_d.get("last_visit")
            and msg_dict["date"] <= visit_stats_d["last_visit"]
        ):
            # message received out of order, ignore
            continue

        # Compare incoming message to known status of the origin, to determine
        # eventfulness
        last_visit_status, eventful = get_last_status(msg_dict, visit_stats_d)

        # Update the position offset according to the visit status,
        # if we had already visited this origin before.
        if visit_stats_d.get("last_visit"):
            # Update the next position offset according to the existing value and the
            # eventfulness of the visit.
            increment = -2 if eventful else 1
            # Limit the next_position_offset for acceptable date computations
            current_offset = min(
                visit_stats_d["next_position_offset"] + increment,
                MAX_NEXT_POSITION_OFFSET,
            )
            visit_stats_d["next_position_offset"] = max(0, current_offset)
            # increment the counter when last_visit_status is the same
            same_visit_status = last_visit_status == visit_stats_d["last_visit_status"]
        else:
            same_visit_status = False

        # Record current visit date as highest known date (we've rejected out of order
        # messages earlier).
        visit_stats_d["last_visit"] = msg_dict["date"]
        visit_stats_d["last_visit_status"] = last_visit_status

        # Record last successful visit date
        if last_visit_status == LastVisitStatus.successful:
            visit_stats_d["last_successful"] = max_date(
                msg_dict["date"], visit_stats_d.get("last_successful")
            )
            visit_stats_d["last_snapshot"] = msg_dict["snapshot"]

        # Update the next visit queue position (which will be used solely for origin
        # without any last_update, cf. the dedicated scheduling policy
        # "origins_without_last_update")
        visit_stats_d["next_visit_queue_position"] = next_visit_queue_position(
            queue_position_per_visit_type, visit_stats_d
        )

        visit_stats_d["successive_visits"] = (
            visit_stats_d["successive_visits"] + 1 if same_visit_status else 1
        )

        # Disable recurring failing/not-found origins
        if (
            visit_stats_d["last_visit_status"]
            in [LastVisitStatus.not_found, LastVisitStatus.failed]
        ) and visit_stats_d["successive_visits"] >= DISABLE_ORIGIN_THRESHOLD:
            disabled_urls.append(visit_stats_d["url"])

    # Only upsert changed values
    to_upsert = [
        OriginVisitStats(**ovs)
        for key, ovs in origin_visit_stats.items()
        if key not in existing_origin_visit_stats
        or ovs != existing_origin_visit_stats[key]
    ]

    if to_upsert:
        scheduler.origin_visit_stats_upsert(to_upsert)

    # Disable any origins if any
    if disabled_urls:
        disabled_origins = []
        # A url goes over the threshold once per extra failure in the batch:
        # dict.fromkeys() drops those duplicates (keeping first-seen order) so
        # each origin is looked up and recorded only once.
        for url in dict.fromkeys(disabled_urls):
            origins = scheduler.get_listed_origins(url=url).results
            if len(origins) > 0:
                # NOTE(review): only the first listed origin returned for the
                # url is disabled, whatever its visit type -- confirm this is
                # the intended behavior.
                origin = attr.evolve(origins[0], enabled=False)
                disabled_origins.append(origin)

        if disabled_origins:
            scheduler.record_listed_origins(disabled_origins)
def test_journal_client_origin_visit_status_from_journal_last_not_found(
    swh_scheduler,
):
    """Successive "not_found" visits are tracked in the origin visit stats.

    A first "not_found" visit is processed alone, then two more in a second
    batch; the stored stats are checked after each batch.
    """

    def not_found_visit(visit, date):
        # Build one "not_found" visit status for the "foo" git origin.
        return {
            "origin": "foo",
            "visit": visit,
            "status": "not_found",
            "date": date,
            "type": "git",
            "snapshot": None,
        }

    def stored_foo_stats():
        return swh_scheduler.origin_visit_stats_get([("foo", "git")])[0]

    # First batch: a single not_found visit
    process_journal_objects(
        {"origin_visit_status": [not_found_visit(1, DATE1)]},
        scheduler=swh_scheduler,
    )

    assert_visit_stats_ok(
        stored_foo_stats(),
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_visit=DATE1,
            last_visit_status=LastVisitStatus.not_found,
            next_position_offset=4,
            successive_visits=1,
        ),
    )

    # Second batch: two more not_found visits, at later dates
    later_visits = [not_found_visit(3, DATE2), not_found_visit(4, DATE3)]
    process_journal_objects(
        {"origin_visit_status": later_visits}, scheduler=swh_scheduler
    )

    assert_visit_stats_ok(
        stored_foo_stats(),
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.not_found,
            next_position_offset=6,
            successive_visits=3,
        ),
    )