def _get_visit_info_for_save_request(save_request):
    visit_date = None
    visit_status = None
    time_now = datetime.now(tz=timezone.utc)
    time_delta = time_now - save_request.request_date
    # stop trying to find a visit date one month after save request
    # submission, as those storage requests are expensive and the associated
    # loading task has most likely ended in error
    if time_delta.days <= 30:
        try:
            origin = {"url": save_request.origin_url}
            origin_info = archive.lookup_origin(origin)
            origin_visits = get_origin_visits(origin_info)
            visit_dates = [
                parse_iso8601_date_to_utc(v["date"]) for v in origin_visits
            ]
            i = bisect_right(visit_dates, save_request.request_date)
            if i != len(visit_dates):
                visit_date = visit_dates[i]
                visit_status = origin_visits[i]["status"]
                if origin_visits[i]["status"] not in ("full", "partial", "not_found"):
                    visit_date = None
        except Exception as exc:
            sentry_sdk.capture_exception(exc)
    return visit_date, visit_status
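# A minimal sketch (not part of the module above, made-up dates) of the
# bisection step in _get_visit_info_for_save_request: bisect_right returns
# the index of the first visit strictly after the request date, or
# len(visit_dates) when every visit predates the request.
def _bisect_right_visit_sketch():
    from bisect import bisect_right
    from datetime import datetime, timezone

    visit_dates = [
        datetime(2021, 1, 1, tzinfo=timezone.utc),
        datetime(2021, 2, 1, tzinfo=timezone.utc),
    ]
    request_date = datetime(2021, 1, 15, tzinfo=timezone.utc)
    # the first visit made after the save request is the one at index 1
    assert bisect_right(visit_dates, request_date) == 1
    # a request newer than every visit yields len(visit_dates)
    assert bisect_right(visit_dates, datetime(2021, 3, 1, tzinfo=timezone.utc)) == 2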
def _origin_visits_browse(request, origin_url):
    if origin_url is None:
        raise BadInputExc("An origin URL must be provided as a query parameter.")

    origin_info = archive.lookup_origin({"url": origin_url})
    origin_visits = get_origin_visits(origin_info)
    snapshot_context = get_snapshot_context(origin_url=origin_url)

    for i, visit in enumerate(origin_visits):
        url_date = format_utc_iso_date(visit["date"], "%Y-%m-%dT%H:%M:%SZ")
        visit["formatted_date"] = format_utc_iso_date(visit["date"])
        query_params = {"origin_url": origin_url, "timestamp": url_date}
        # if the visit date is shared with the previous or next visit, a
        # timestamp is ambiguous, so browse the visit through its id instead
        if i < len(origin_visits) - 1:
            if visit["date"] == origin_visits[i + 1]["date"]:
                query_params = {"visit_id": visit["visit"]}
        if i > 0:
            if visit["date"] == origin_visits[i - 1]["date"]:
                query_params = {"visit_id": visit["visit"]}
        snapshot = visit["snapshot"] if visit["snapshot"] else ""
        visit["url"] = reverse(
            "browse-origin-directory",
            query_params=query_params,
        )
        if not snapshot:
            visit["snapshot"] = ""
        visit["date"] = parse_iso8601_date_to_utc(visit["date"]).timestamp()

    heading = "Origin visits - %s" % origin_url

    return render(
        request,
        "browse/origin-visits.html",
        {
            "heading": heading,
            "swh_object_name": "Visits",
            "swh_object_metadata": origin_info,
            "origin_visits": origin_visits,
            "origin_info": origin_info,
            "snapshot_context": snapshot_context,
            "vault_cooking": None,
            "show_actions": False,
        },
    )
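# A minimal sketch (hypothetical dates and visit ids) of the disambiguation
# performed in _origin_visits_browse: when two visits share the same date, a
# timestamp query parameter is ambiguous, so the visit id is used instead.
def _visit_query_params_sketch():
    visits = [
        {"visit": 1, "date": "2021-01-01"},
        {"visit": 2, "date": "2021-01-01"},
        {"visit": 3, "date": "2021-02-01"},
    ]
    params = []
    for i, v in enumerate(visits):
        shares_date = (i > 0 and v["date"] == visits[i - 1]["date"]) or (
            i < len(visits) - 1 and v["date"] == visits[i + 1]["date"]
        )
        params.append(
            {"visit_id": v["visit"]} if shares_date else {"timestamp": v["date"]}
        )
    assert params == [
        {"visit_id": 1},
        {"visit_id": 2},
        {"timestamp": "2021-02-01"},
    ]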
def _stat_counters(request):
    stat_counters = archive.stat_counters()
    url = get_config()["history_counters_url"]
    stat_counters_history = {}
    try:
        response = requests.get(url, timeout=5)
        stat_counters_history = json.loads(response.text)
        for d, object_counts in _stat_counters_backfill.items():
            # convert date to javascript timestamp (in ms)
            timestamp = int(parse_iso8601_date_to_utc(d).timestamp()) * 1000
            for object_type, object_count in object_counts.items():
                stat_counters_history[object_type].append([timestamp, object_count])
    except Exception as exc:
        sentry_sdk.capture_exception(exc)

    counters = {
        "stat_counters": stat_counters,
        "stat_counters_history": stat_counters_history,
    }

    return JsonResponse(counters)
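# A minimal sketch (arbitrary example date) of the backfill conversion in
# _stat_counters: JavaScript timestamps are expressed in milliseconds, hence
# the UNIX timestamp in seconds is multiplied by 1000.
def _js_timestamp_sketch():
    from datetime import datetime, timezone

    d = datetime(2015, 1, 1, tzinfo=timezone.utc)
    assert int(d.timestamp()) * 1000 == 1420070400000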
def get_origin_visit(
    origin_info: OriginInfo,
    visit_ts: Optional[str] = None,
    visit_id: Optional[int] = None,
    snapshot_id: Optional[str] = None,
) -> OriginVisitInfo:
    """Return information about a visit for a given origin.

    If a timestamp is provided, the visit closest to that timestamp is
    returned.

    If a snapshot identifier is provided, the first visit with that snapshot
    is returned.

    If no search hints are provided, return the most recent full visit with
    a valid snapshot, or the most recent partial visit with a valid snapshot
    otherwise.

    Args:
        origin_info: a dict filled with origin information
        visit_ts: an ISO 8601 datetime string to parse
        visit_id: a visit identifier
        snapshot_id: a snapshot identifier

    Returns:
        A dict containing the visit info.

    Raises:
        swh.web.common.exc.NotFoundExc: if no visit can be found
    """
    # returns the latest full visit with a valid snapshot
    visit = archive.lookup_origin_visit_latest(
        origin_info["url"], allowed_statuses=["full"], require_snapshot=True
    )
    if not visit:
        # or the latest partial visit with a valid snapshot otherwise
        visit = archive.lookup_origin_visit_latest(
            origin_info["url"], allowed_statuses=["partial"], require_snapshot=True
        )

    if not visit_ts and not visit_id and not snapshot_id:
        if visit:
            return visit
        else:
            raise NotFoundExc(
                f"No valid visit for origin with url {origin_info['url']} found!"
            )

    # no need to fetch all visits list and search in it if the latest
    # visit matches some criteria
    if visit and (visit["snapshot"] == snapshot_id or visit["visit"] == visit_id):
        return visit

    visits = get_origin_visits(origin_info)

    if not visits:
        raise NotFoundExc(
            f"No visits associated to origin with url {origin_info['url']}!"
        )

    if snapshot_id:
        visits = [v for v in visits if v["snapshot"] == snapshot_id]
        if len(visits) == 0:
            raise NotFoundExc(
                "Visit for snapshot with id %s for origin with"
                " url %s not found!" % (snapshot_id, origin_info["url"])
            )
        return visits[0]

    if visit_id:
        visits = [v for v in visits if v["visit"] == int(visit_id)]
        if len(visits) == 0:
            raise NotFoundExc(
                "Visit with id %s for origin with"
                " url %s not found!" % (visit_id, origin_info["url"])
            )
        return visits[0]

    if visit_ts:
        target_visit_ts = math.floor(parse_iso8601_date_to_utc(visit_ts).timestamp())

        # Find the visit with date closest to the target (in absolute value);
        # min yields a (timestamp, index) pair, only the index is needed
        (_, visit_idx) = min(
            (
                (math.floor(parse_iso8601_date_to_utc(visit["date"]).timestamp()), i)
                for (i, visit) in enumerate(visits)
            ),
            key=lambda ts_and_i: abs(ts_and_i[0] - target_visit_ts),
        )

        if visit_idx is not None:
            visit = visits[visit_idx]
            # If multiple visits have the same date, select the one with
            # the largest id.
            while (
                visit_idx < len(visits) - 1
                and visit["date"] == visits[visit_idx + 1]["date"]
            ):
                visit_idx = visit_idx + 1
                visit = visits[visit_idx]
            return visit
        else:
            raise NotFoundExc(
                "Visit with timestamp %s for origin with "
                "url %s not found!" % (visit_ts, origin_info["url"])
            )
    return visits[-1]
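# A minimal sketch (plain integer timestamps) of the closest-visit selection
# in get_origin_visit: min() over (timestamp, index) pairs, keyed by the
# absolute distance to the target, yields the index of the closest visit.
def _closest_visit_sketch():
    visit_timestamps = [100, 200, 400]
    target_visit_ts = 260
    (_, visit_idx) = min(
        ((ts, i) for (i, ts) in enumerate(visit_timestamps)),
        key=lambda ts_and_i: abs(ts_and_i[0] - target_visit_ts),
    )
    assert visit_idx == 1  # |200 - 260| = 60 beats 160 and 140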
def _visit_sort_key(visit):
    ts = parse_iso8601_date_to_utc(visit["date"]).timestamp()
    # break ties between visits sharing the same date by adding a
    # sub-second fraction derived from the visit id
    return ts + (float(visit["visit"]) / 10e3)
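# A minimal sketch (hypothetical visit ids and epoch) of the tie-breaking in
# _visit_sort_key: visits sharing the same date sort by visit id, since
# id / 10e3 adds a fraction well below one second for realistic ids.
def _visit_sort_key_sketch():
    ts = 1_600_000_000.0  # the same visit date for both visits
    visits = [
        {"visit": 2, "key": ts + 2 / 10e3},
        {"visit": 1, "key": ts + 1 / 10e3},
    ]
    ordered = sorted(visits, key=lambda v: v["key"])
    assert [v["visit"] for v in ordered] == [1, 2]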
def _origin_directory_view_test_helper(
    client,
    archive_data,
    origin_info,
    origin_visit,
    snapshot_sizes,
    origin_branches,
    origin_releases,
    root_directory_sha1,
    directory_entries,
    visit_id=None,
    timestamp=None,
    snapshot_id=None,
    path=None,
):
    dirs = [e for e in directory_entries if e["type"] in ("dir", "rev")]
    files = [e for e in directory_entries if e["type"] == "file"]

    if not visit_id and not snapshot_id:
        visit_id = origin_visit["visit"]

    query_params = {"origin_url": origin_info["url"]}

    if timestamp:
        query_params["timestamp"] = timestamp
    elif visit_id:
        query_params["visit_id"] = visit_id
    else:
        query_params["snapshot"] = snapshot_id

    if path:
        query_params["path"] = path

    url = reverse("browse-origin-directory", query_params=query_params)

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )

    assert_contains(resp, '<td class="swh-directory">', count=len(dirs))
    assert_contains(resp, '<td class="swh-content">', count=len(files))

    if timestamp:
        query_params["timestamp"] = format_utc_iso_date(
            parse_iso8601_date_to_utc(timestamp).isoformat(), "%Y-%m-%dT%H:%M:%SZ"
        )

    for d in dirs:
        if d["type"] == "rev":
            dir_url = reverse("browse-revision", url_args={"sha1_git": d["target"]})
        else:
            dir_path = d["name"]
            if path:
                dir_path = "%s/%s" % (path, d["name"])
            query_params["path"] = dir_path
            dir_url = reverse("browse-origin-directory", query_params=query_params)
        assert_contains(resp, dir_url)

    for f in files:
        file_path = f["name"]
        if path:
            file_path = "%s/%s" % (path, f["name"])
        query_params["path"] = file_path
        file_url = reverse("browse-origin-content", query_params=query_params)
        assert_contains(resp, file_url)

    if "path" in query_params:
        del query_params["path"]

    root_dir_branch_url = reverse("browse-origin-directory", query_params=query_params)

    nb_bc_paths = 1
    if path:
        nb_bc_paths = len(path.split("/")) + 1

    assert_contains(resp, '<li class="swh-path">', count=nb_bc_paths)
    assert_contains(
        resp, '<a href="%s">%s</a>' % (root_dir_branch_url, root_directory_sha1[:7])
    )

    origin_branches_url = reverse("browse-origin-branches", query_params=query_params)

    assert_contains(resp, f'href="{escape(origin_branches_url)}"')
    assert_contains(resp, f"Branches ({snapshot_sizes['revision']})")

    origin_releases_url = reverse("browse-origin-releases", query_params=query_params)

    nb_releases = len(origin_releases)
    if nb_releases > 0:
        assert_contains(resp, f'href="{escape(origin_releases_url)}"')
        assert_contains(resp, f"Releases ({snapshot_sizes['release']})")

    if path:
        query_params["path"] = path

    assert_contains(resp, '<li class="swh-branch">', count=len(origin_branches))

    for branch in origin_branches:
        query_params["branch"] = branch["name"]
        root_dir_branch_url = reverse(
            "browse-origin-directory", query_params=query_params
        )
        assert_contains(resp, '<a href="%s">' % root_dir_branch_url)

    assert_contains(resp, '<li class="swh-release">', count=len(origin_releases))

    query_params["branch"] = None
    for release in origin_releases:
        query_params["release"] = release["name"]
        root_dir_release_url = reverse(
            "browse-origin-directory", query_params=query_params
        )
        assert_contains(resp, 'href="%s"' % root_dir_release_url)

    assert_contains(resp, "vault-cook-directory")
    assert_contains(resp, "vault-cook-revision")

    snapshot = archive_data.snapshot_get(origin_visit["snapshot"])
    head_rev_id = archive_data.snapshot_get_head(snapshot)

    swhid_context = {
        "origin": origin_info["url"],
        "visit": gen_swhid(SNAPSHOT, snapshot["id"]),
        "anchor": gen_swhid(REVISION, head_rev_id),
        "path": f"/{path}" if path else None,
    }

    swh_dir_id = gen_swhid(
        DIRECTORY, directory_entries[0]["dir_id"], metadata=swhid_context
    )
    swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id})
    assert_contains(resp, swh_dir_id)
    assert_contains(resp, swh_dir_id_url)

    assert_contains(resp, "swh-take-new-snapshot")

    _check_origin_link(resp, origin_info["url"])

    assert_not_contains(resp, "swh-metadata-popover")
def _origin_content_view_test_helper(
    client,
    archive_data,
    origin_info,
    origin_visit,
    snapshot_sizes,
    origin_branches,
    origin_releases,
    root_dir_sha1,
    content,
    visit_id=None,
    timestamp=None,
    snapshot_id=None,
):
    content_path = "/".join(content["path"].split("/")[1:])

    if not visit_id and not snapshot_id:
        visit_id = origin_visit["visit"]

    query_params = {"origin_url": origin_info["url"], "path": content_path}

    if timestamp:
        query_params["timestamp"] = timestamp

    if visit_id:
        query_params["visit_id"] = visit_id
    elif snapshot_id:
        query_params["snapshot"] = snapshot_id

    url = reverse("browse-origin-content", query_params=query_params)

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/content.html"
    )

    assert isinstance(content["data"], str)

    assert_contains(resp, '<code class="%s">' % content["hljs_language"])
    assert_contains(resp, escape(content["data"]))

    split_path = content_path.split("/")

    filename = split_path[-1]
    path = content_path.replace(filename, "")[:-1]

    path_info = gen_path_info(path)

    del query_params["path"]

    if timestamp:
        query_params["timestamp"] = format_utc_iso_date(
            parse_iso8601_date_to_utc(timestamp).isoformat(), "%Y-%m-%dT%H:%M:%SZ"
        )

    root_dir_url = reverse("browse-origin-directory", query_params=query_params)

    assert_contains(resp, '<li class="swh-path">', count=len(path_info) + 1)

    assert_contains(resp, '<a href="%s">%s</a>' % (root_dir_url, root_dir_sha1[:7]))

    for p in path_info:
        query_params["path"] = p["path"]
        dir_url = reverse("browse-origin-directory", query_params=query_params)
        assert_contains(resp, '<a href="%s">%s</a>' % (dir_url, p["name"]))

    assert_contains(resp, "<li>%s</li>" % filename)

    query_string = "sha1_git:" + content["sha1_git"]

    url_raw = reverse(
        "browse-content-raw",
        url_args={"query_string": query_string},
        query_params={"filename": filename},
    )
    assert_contains(resp, url_raw)

    if "path" in query_params:
        del query_params["path"]

    origin_branches_url = reverse("browse-origin-branches", query_params=query_params)

    assert_contains(resp, f'href="{escape(origin_branches_url)}"')
    assert_contains(resp, f"Branches ({snapshot_sizes['revision']})")

    origin_releases_url = reverse("browse-origin-releases", query_params=query_params)

    assert_contains(resp, f'href="{escape(origin_releases_url)}">')
    assert_contains(resp, f"Releases ({snapshot_sizes['release']})")

    assert_contains(resp, '<li class="swh-branch">', count=len(origin_branches))

    query_params["path"] = content_path

    for branch in origin_branches:
        root_dir_branch_url = reverse(
            "browse-origin-content",
            query_params={"branch": branch["name"], **query_params},
        )
        assert_contains(resp, '<a href="%s">' % root_dir_branch_url)

    assert_contains(resp, '<li class="swh-release">', count=len(origin_releases))

    query_params["branch"] = None
    for release in origin_releases:
        root_dir_release_url = reverse(
            "browse-origin-content",
            query_params={"release": release["name"], **query_params},
        )
        assert_contains(resp, '<a href="%s">' % root_dir_release_url)

    url = reverse("browse-origin-content", query_params=query_params)

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/content.html"
    )

    snapshot = archive_data.snapshot_get(origin_visit["snapshot"])
    head_rev_id = archive_data.snapshot_get_head(snapshot)

    swhid_context = {
        "origin": origin_info["url"],
        "visit": gen_swhid(SNAPSHOT, snapshot["id"]),
        "anchor": gen_swhid(REVISION, head_rev_id),
        "path": f"/{content_path}",
    }

    swh_cnt_id = gen_swhid(CONTENT, content["sha1_git"], metadata=swhid_context)
    swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id})
    assert_contains(resp, swh_cnt_id)
    assert_contains(resp, swh_cnt_id_url)

    assert_contains(resp, "swh-take-new-snapshot")

    _check_origin_link(resp, origin_info["url"])

    assert_not_contains(resp, "swh-metadata-popover")
def test_revision_log_browse(client, archive_data, revision):
    per_page = 10

    revision_log = archive_data.revision_log(revision)

    revision_log_sorted = sorted(
        revision_log,
        key=lambda rev: -parse_iso8601_date_to_utc(rev["committer_date"]).timestamp(),
    )

    url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={"per_page": per_page},
    )

    next_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={"offset": per_page, "per_page": per_page},
    )

    nb_log_entries = per_page
    if len(revision_log_sorted) < per_page:
        nb_log_entries = len(revision_log_sorted)

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/revision-log.html"
    )
    assert_contains(resp, '<tr class="swh-revision-log-entry', count=nb_log_entries)
    assert_contains(resp, '<a class="page-link">Newer</a>')

    if len(revision_log_sorted) > per_page:
        assert_contains(
            resp,
            '<a class="page-link" href="%s">Older</a>' % escape(next_page_url),
        )

    for log in revision_log_sorted[:per_page]:
        revision_url = reverse("browse-revision", url_args={"sha1_git": log["id"]})
        assert_contains(resp, log["id"][:7])
        assert_contains(resp, log["author"]["name"])
        assert_contains(resp, format_utc_iso_date(log["date"]))
        assert_contains(resp, escape(log["message"]))
        assert_contains(resp, format_utc_iso_date(log["committer_date"]))
        assert_contains(resp, revision_url)

    if len(revision_log_sorted) <= per_page:
        return

    resp = check_html_get_response(
        client, next_page_url, status_code=200, template_used="browse/revision-log.html"
    )

    prev_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={"offset": 0, "per_page": per_page},
    )
    next_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={"offset": 2 * per_page, "per_page": per_page},
    )

    nb_log_entries = len(revision_log_sorted) - per_page
    if nb_log_entries > per_page:
        nb_log_entries = per_page

    assert_contains(resp, '<tr class="swh-revision-log-entry', count=nb_log_entries)
    assert_contains(
        resp, '<a class="page-link" href="%s">Newer</a>' % escape(prev_page_url)
    )

    if len(revision_log_sorted) > 2 * per_page:
        assert_contains(
            resp,
            '<a class="page-link" href="%s">Older</a>' % escape(next_page_url),
        )

    if len(revision_log_sorted) <= 2 * per_page:
        return

    resp = check_html_get_response(
        client, next_page_url, status_code=200, template_used="browse/revision-log.html"
    )

    prev_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={"offset": per_page, "per_page": per_page},
    )
    next_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={"offset": 3 * per_page, "per_page": per_page},
    )

    nb_log_entries = len(revision_log_sorted) - 2 * per_page
    if nb_log_entries > per_page:
        nb_log_entries = per_page

    assert_contains(resp, '<tr class="swh-revision-log-entry', count=nb_log_entries)
    assert_contains(
        resp, '<a class="page-link" href="%s">Newer</a>' % escape(prev_page_url)
    )

    if len(revision_log_sorted) > 3 * per_page:
        assert_contains(
            resp,
            '<a class="page-link" href="%s">Older</a>' % escape(next_page_url),
        )
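# A minimal sketch (assumed totals) of the per-page entry count asserted in
# test_revision_log_browse: each page holds at most per_page entries, and the
# last page holds whatever remains.
def _page_entry_count_sketch():
    total, per_page = 25, 10
    counts = [
        min(per_page, total - offset) for offset in range(0, total, per_page)
    ]
    assert counts == [10, 10, 5]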
def test_parse_iso8601_date_to_utc_ko(invalid_iso8601_timestamp):
    with pytest.raises(BadInputExc):
        utils.parse_iso8601_date_to_utc(invalid_iso8601_timestamp)
def test_parse_iso8601_date_to_utc_ok(input_timestamp, output_date):
    assert utils.parse_iso8601_date_to_utc(input_timestamp) == output_date