Exemplo n.º 1
0
def _get_visit_info_for_save_request(save_request):
    """Return the (date, status) of the first origin visit that happened
    after the given save request was submitted, or ``(None, None)`` when
    no suitable visit is found.
    """
    visit_date = None
    visit_status = None
    elapsed = datetime.now(tz=timezone.utc) - save_request.request_date
    # stop trying to find a visit date one month after save request
    # submission, as those requests to storage are expensive and the
    # associated loading task surely ended up with errors
    if elapsed.days <= 30:
        try:
            origin_info = archive.lookup_origin(
                {"url": save_request.origin_url}
            )
            visits = get_origin_visits(origin_info)
            dates = [parse_iso8601_date_to_utc(v["date"]) for v in visits]
            # first visit strictly ordered after the save request date
            idx = bisect_right(dates, save_request.request_date)
            if idx < len(dates):
                visit_date = dates[idx]
                visit_status = visits[idx]["status"]
                # only keep the date for terminal visit statuses
                if visit_status not in ("full", "partial", "not_found"):
                    visit_date = None
        except Exception as exc:
            sentry_sdk.capture_exception(exc)
    return visit_date, visit_status
Exemplo n.º 2
0
def _origin_visits_browse(request, origin_url):
    """Django view rendering the list of archive visits for an origin.

    Args:
        request: input HTTP request
        origin_url: URL of the origin whose visits should be listed

    Returns:
        The rendered ``browse/origin-visits.html`` template

    Raises:
        BadInputExc: if no origin URL was provided
    """
    if origin_url is None:
        raise BadInputExc("An origin URL must be provided as query parameter.")

    origin_info = archive.lookup_origin({"url": origin_url})
    origin_visits = get_origin_visits(origin_info)
    snapshot_context = get_snapshot_context(origin_url=origin_url)

    # Snapshot the raw visit dates before the loop below mutates
    # visit["date"]: the original code compared against the previous
    # visit's already-processed date, i.e. a float timestamp against an
    # ISO string, so the duplicate-date check silently never matched.
    visit_dates = [visit["date"] for visit in origin_visits]

    for i, visit in enumerate(origin_visits):
        url_date = format_utc_iso_date(visit["date"], "%Y-%m-%dT%H:%M:%SZ")
        visit["formatted_date"] = format_utc_iso_date(visit["date"])
        query_params = {"origin_url": origin_url, "timestamp": url_date}
        # several visits can share the same date: disambiguate their
        # browse URLs with the visit id instead of the timestamp
        same_date_as_next = (
            i < len(origin_visits) - 1 and visit_dates[i] == visit_dates[i + 1]
        )
        same_date_as_previous = i > 0 and visit_dates[i] == visit_dates[i - 1]
        if same_date_as_next or same_date_as_previous:
            query_params = {"visit_id": visit["visit"]}

        visit["url"] = reverse(
            "browse-origin-directory",
            query_params=query_params,
        )
        # the template expects an empty string, not None, for missing snapshots
        if not visit["snapshot"]:
            visit["snapshot"] = ""
        visit["date"] = parse_iso8601_date_to_utc(visit["date"]).timestamp()

    heading = "Origin visits - %s" % origin_url

    return render(
        request,
        "browse/origin-visits.html",
        {
            "heading": heading,
            "swh_object_name": "Visits",
            "swh_object_metadata": origin_info,
            "origin_visits": origin_visits,
            "origin_info": origin_info,
            "snapshot_context": snapshot_context,
            "vault_cooking": None,
            "show_actions": False,
        },
    )
Exemplo n.º 3
0
def get_origin_visits(origin_info: OriginInfo) -> List[OriginVisitInfo]:
    """Return the list of visits for a swh origin.

    The resulting list is cached to speed up navigation in the swh web
    browse ui; the cache entry is reused as long as no new visit or new
    snapshot exists for the origin.

    The returned visits are sorted by date in ascending order.

    Args:
        origin_info: dict describing the origin to fetch visits from

    Returns:
        A list of dict describing the origin visits

    Raises:
        swh.web.common.exc.NotFoundExc: if the origin is not found
    """

    from swh.web.common import archive

    if "url" in origin_info:
        origin_url = origin_info["url"]
    else:
        origin_url = archive.lookup_origin(origin_info)["url"]

    cache_key = "origin_visits_%s" % origin_url
    cached_visits = cache.get(cache_key)

    if cached_visits:
        latest_cached = cached_visits[-1]
        newer_visits = list(
            archive.lookup_origin_visits(
                origin_url, last_visit=latest_cached["visit"]
            )
        )
        if not newer_visits:
            # no new visit: the cache is still valid unless a new snapshot
            # appeared for the latest known visit
            last_snapshot = archive.lookup_latest_origin_snapshot(origin_url)
            if not last_snapshot or last_snapshot["id"] == latest_cached["snapshot"]:
                return cached_visits

    # (re)fetch the full visit list, one page at a time
    all_visits: List[OriginVisitInfo] = []
    per_page = archive.MAX_LIMIT
    last_visit = None
    while True:
        page = list(
            archive.lookup_origin_visits(
                origin_url, last_visit=last_visit, per_page=per_page
            )
        )
        all_visits.extend(page)
        if len(page) < per_page:
            break
        last_visit = per_page if not last_visit else last_visit + per_page

    def _visit_sort_key(visit):
        # order by date, tie-broken by visit id
        ts = parse_iso8601_date_to_utc(visit["date"]).timestamp()
        return ts + (float(visit["visit"]) / 10e3)

    all_visits.sort(key=_visit_sort_key)

    cache.set(cache_key, all_visits)

    return all_visits
Exemplo n.º 4
0
def test_lookup_origin_single_slash_after_protocol(archive_data):
    """Lookup canonicalizes a URL missing one slash after the protocol."""
    good_url = "http://snapshot.debian.org/package/r-base/"
    bad_url = "http:/snapshot.debian.org/package/r-base/"
    archive_data.origin_add([Origin(url=good_url)])
    looked_up = archive.lookup_origin({"url": bad_url})
    assert looked_up["url"] == good_url
Exemplo n.º 5
0
def test_lookup_origin_missing_trailing_slash(archive_data):
    """A URL lacking its trailing slash still resolves to the stored origin."""
    stored = Origin(url="http://snapshot.debian.org/package/r-base/")
    archive_data.origin_add([stored])
    looked_up = archive.lookup_origin({"url": stored.url[:-1]})
    assert looked_up["url"] == stored.url
Exemplo n.º 6
0
def test_lookup_origin_extra_trailing_slash(origin):
    """A URL with an extra trailing slash still resolves to the stored origin."""
    looked_up = archive.lookup_origin({"url": f"{origin['url']}/"})
    assert looked_up["url"] == origin["url"]
Exemplo n.º 7
0
def test_lookup_origin(archive_data, new_origin):
    """Looking up an existing origin returns the stored origin dict."""
    archive_data.origin_add([new_origin])

    expected_origin = archive_data.origin_get([new_origin.url])[0]
    actual_origin = archive.lookup_origin({"url": new_origin.url})
    assert actual_origin == expected_origin
Exemplo n.º 8
0
def resolve_swhid(
    swhid: str, query_params: Optional[QueryParameters] = None
) -> ResolvedSWHID:
    """
    Try to resolve a SoftWare Heritage persistent IDentifier into an url for
    browsing the targeted object.

    Args:
        swhid: a SoftWare Heritage persistent IDentifier
        query_params: optional dict filled with
            query parameters to append to the browse url

    Returns:
        a dict with the following keys:

            * **swhid_parsed**: the parsed identifier
            * **browse_url**: the url for browsing the targeted object
              (``None`` if no browse URL could be computed)

    Raises:
        BadInputExc: if the SWHID carries a ``visit`` qualifier that is
            not a snapshot SWHID
    """
    swhid_parsed = get_swhid(swhid)
    object_type = swhid_parsed.object_type
    object_id = swhid_parsed.object_id
    browse_url = None
    url_args = {}
    query_dict = QueryDict("", mutable=True)
    fragment = ""
    # the "lines" qualifier is only meaningful when browsing a content object
    process_lines = object_type == ObjectType.CONTENT

    # forward caller-supplied query parameters, in sorted key order
    if query_params and len(query_params) > 0:
        for k in sorted(query_params.keys()):
            query_dict[k] = query_params[k]

    # origin qualifier: canonicalize the origin URL through the archive
    if swhid_parsed.origin:
        origin_url = unquote(swhid_parsed.origin)
        origin_url = archive.lookup_origin({"url": origin_url})["url"]
        query_dict["origin_url"] = origin_url

    # path qualifier: resolve the root directory of the anchor object
    # so the browse view can build proper breadcrumbs
    if swhid_parsed.path and swhid_parsed.path != b"/":
        query_dict["path"] = swhid_parsed.path.decode("utf8", errors="replace")
        if swhid_parsed.anchor:
            directory = b""
            if swhid_parsed.anchor.object_type == ObjectType.DIRECTORY:
                directory = swhid_parsed.anchor.object_id
            elif swhid_parsed.anchor.object_type == ObjectType.REVISION:
                # the anchor revision points to its root directory
                revision = archive.lookup_revision(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                directory = revision["directory"]
            elif swhid_parsed.anchor.object_type == ObjectType.RELEASE:
                # follow the release to its target revision's root directory
                release = archive.lookup_release(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                if release["target_type"] == REVISION:
                    revision = archive.lookup_revision(release["target"])
                    directory = revision["directory"]
            if object_type == ObjectType.CONTENT:
                if not swhid_parsed.origin:
                    # when no origin context, content objects need to have their
                    # path prefixed by root directory id for proper breadcrumbs display
                    query_dict["path"] = hash_to_hex(directory) + query_dict["path"]
                else:
                    # remove leading slash from SWHID content path
                    query_dict["path"] = query_dict["path"][1:]
            elif object_type == ObjectType.DIRECTORY:
                object_id = directory
                # remove leading and trailing slashes from SWHID directory path
                if query_dict["path"].endswith("/"):
                    query_dict["path"] = query_dict["path"][1:-1]
                else:
                    query_dict["path"] = query_dict["path"][1:]

    # snapshot context
    if swhid_parsed.visit:
        if swhid_parsed.visit.object_type != ObjectType.SNAPSHOT:
            raise BadInputExc("Visit must be a snapshot SWHID.")
        query_dict["snapshot"] = hash_to_hex(swhid_parsed.visit.object_id)

        if swhid_parsed.anchor:
            if swhid_parsed.anchor.object_type == ObjectType.REVISION:
                # check if the anchor revision is the tip of a branch
                branch_name = archive.lookup_snapshot_branch_name_from_tip_revision(
                    hash_to_hex(swhid_parsed.visit.object_id),
                    hash_to_hex(swhid_parsed.anchor.object_id),
                )
                if branch_name:
                    query_dict["branch"] = branch_name
                elif object_type != ObjectType.REVISION:
                    # not a branch tip: reference the anchor revision directly
                    query_dict["revision"] = hash_to_hex(swhid_parsed.anchor.object_id)

            elif swhid_parsed.anchor.object_type == ObjectType.RELEASE:
                release = archive.lookup_release(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                if release:
                    query_dict["release"] = release["name"]

        # revision object with no release context: try to label it with
        # the branch it is the tip of
        if object_type == ObjectType.REVISION and "release" not in query_dict:
            branch_name = archive.lookup_snapshot_branch_name_from_tip_revision(
                hash_to_hex(swhid_parsed.visit.object_id), hash_to_hex(object_id)
            )
            if branch_name:
                query_dict["branch"] = branch_name

    # browsing content or directory without snapshot context
    elif (
        object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY)
        and swhid_parsed.anchor
    ):
        if swhid_parsed.anchor.object_type == ObjectType.REVISION:
            # anchor revision, objects are browsed from its view
            object_type = ObjectType.REVISION
            object_id = swhid_parsed.anchor.object_id
        elif (
            object_type == ObjectType.DIRECTORY
            and swhid_parsed.anchor.object_type == ObjectType.DIRECTORY
        ):
            # a directory is browsed from its root
            object_id = swhid_parsed.anchor.object_id

    # build the URL arguments expected by the browse view of each object type
    if object_type == ObjectType.CONTENT:
        url_args["query_string"] = f"sha1_git:{hash_to_hex(object_id)}"
    elif object_type in (ObjectType.DIRECTORY, ObjectType.RELEASE, ObjectType.REVISION):
        url_args["sha1_git"] = hash_to_hex(object_id)
    elif object_type == ObjectType.SNAPSHOT:
        url_args["snapshot_id"] = hash_to_hex(object_id)

    # lines qualifier: encode the highlighted line range as a URL fragment
    if swhid_parsed.lines and process_lines:
        lines = swhid_parsed.lines
        fragment += "#L" + str(lines[0])
        if lines[1]:
            fragment += "-L" + str(lines[1])

    # no url_args means the object type had no matching browse view above,
    # in which case browse_url stays None
    if url_args:
        browse_url = (
            reverse(
                f"browse-{object_type.name.lower()}",
                url_args=url_args,
                query_params=query_dict,
            )
            + fragment
        )

    return ResolvedSWHID(swhid_parsed=swhid_parsed, browse_url=browse_url)
Exemplo n.º 9
0
def _swh_badge(
    request: HttpRequest,
    object_type: str,
    object_id: str,
    object_swhid: Optional[str] = "",
) -> HttpResponse:
    """
    Generate a Software Heritage badge for a given object type and id.

    Args:
        request: input http request
        object_type: The type of swh object to generate a badge for,
            either *content*, *directory*, *revision*, *release*, *origin*
            or *snapshot*
        object_id: The id of the swh object, either an url for origin
            type or a *sha1* for other object types
        object_swhid: If provided, the object SWHID will not be recomputed

    Returns:
        HTTP response with content type *image/svg+xml* containing the SVG
        badge data. If the provided parameters are invalid, HTTP 400 status
        code will be returned. If the object can not be found in the archive,
        HTTP 404 status code will be returned.

    """
    # defaults overwritten below on successful lookup; any failure keeps
    # "error" as the badge's left text
    left_text = "error"
    whole_link = None

    try:
        if object_type == "origin":
            # raises NotFoundExc if the origin is not archived
            archive.lookup_origin({"url": object_id})
            right_text = "repository"
            whole_link = reverse(
                "browse-origin", query_params={"origin_url": object_id}
            )
        else:
            # when SWHID is provided, object type and id will be parsed
            # from it
            if object_swhid:
                parsed_swhid = QualifiedSWHID.from_string(object_swhid)
                object_type = parsed_swhid.object_type.name.lower()
                object_id = hash_to_hex(parsed_swhid.object_id)
                swh_object = archive.lookup_object(object_type, object_id)
                # remove SWHID qualified if any for badge text
                right_text = str(
                    CoreSWHID(
                        object_type=parsed_swhid.object_type,
                        object_id=parsed_swhid.object_id,
                    )
                )
            else:
                # build the SWHID from the explicit type and id parameters;
                # CoreSWHID construction validates both
                right_text = str(
                    CoreSWHID(
                        object_type=ObjectType[object_type.upper()],
                        object_id=hash_to_bytes(object_id),
                    )
                )
                swh_object = archive.lookup_object(object_type, object_id)

            # badge links to the browse view of the resolved object
            whole_link = resolve_swhid(str(right_text))["browse_url"]
            # use release name for badge text
            if object_type == RELEASE:
                right_text = "release %s" % swh_object["name"]
        left_text = "archived"
    except (BadInputExc, ValidationError):
        # malformed SWHID / object id: switch to the error badge style
        right_text = f'invalid {object_type if object_type else "object"} id'
        object_type = "error"
    except NotFoundExc:
        # valid id but not in the archive: error badge style as well
        right_text = f'{object_type if object_type else "object"} not found'
        object_type = "error"

    # object_type (possibly rewritten to "error" above) selects the badge
    # color and title from the configuration mapping
    badge_data = badge(
        left_text=left_text,
        right_text=right_text,
        right_color=_badge_config[object_type]["color"],
        whole_link=request.build_absolute_uri(whole_link),
        whole_title=_badge_config[object_type]["title"],
        logo=_get_logo_data(),
        embed_logo=True,
    )

    return HttpResponse(badge_data, content_type="image/svg+xml")