Exemplo n.º 1
0
    def matching(cls, sockets, annotation):
        """Yield every socket whose filter accepts the given annotation.

        Only sockets that expose a `filter_rows` attribute (added by
        `set_filter()`) are considered; all others are silently skipped.

        :param sockets: Iterable of sockets to check
        :param annotation: Annotation to match
        :return: A generator of matching socket objects
        """
        # Values the annotation offers for each filterable field, computed
        # once rather than per socket.
        candidates = {
            "/id": [annotation.id],
            "/uri": [normalize_uri(annotation.target_uri)],
            "/references": set(annotation.references),
        }

        for socket in sockets:
            # A socket without `filter_rows` has no usable filter yet (it was
            # never set, or could not be parsed), so it cannot match.
            if not hasattr(socket, "filter_rows"):
                continue

            # One matching row is enough; rows naming a field we hold no
            # values for are ignored. `any()` stops at the first hit, so each
            # socket is yielded at most once.
            if any(
                field in candidates and value in candidates[field]
                for field, value in socket.filter_rows
            ):
                yield socket
Exemplo n.º 2
0
        def normalize(term):
            """Return the normalized form of `term` for the current clause.

            `clause` is taken from the enclosing scope.
            """
            # The generic normalization always runs, even when its result
            # ends up unused for URI clauses.
            folded = uni_fold(term)

            # Any field other than "/uri" only gets the generic treatment.
            if clause["field"] != "/uri":
                return folded

            # URI clauses are normalized from the raw term instead.
            return normalize_uri(term)
Exemplo n.º 3
0
        def normalize(term):
            """Normalize `term` according to the field of the enclosing `clause`.

            :param term: The value to normalize
            :return: `normalize_uri(term)` for "/uri" clauses, otherwise
                `uni_fold(term)`
            """
            # Generic normalization is evaluated unconditionally first.
            generic = uni_fold(term)

            # Field-specific normalization replaces the generic result and is
            # applied to the original term.
            is_uri_clause = clause["field"] == "/uri"
            return normalize_uri(term) if is_uri_clause else generic
Exemplo n.º 4
0
    def _rows_for(cls, filter):
        """Convert a filter to field value pairs."""
        for clause in filter["clauses"]:
            field = clause["field"]
            if field not in cls.KNOWN_FIELDS:
                continue

            values = clause["value"]

            # Normalize to an iterable of distinct values
            values = set(values) if isinstance(values, list) else [values]

            for value in values:
                if field == "/uri":
                    value = normalize_uri(value)

                yield field, value
Exemplo n.º 5
0
def expand_uri(session, uri, normalized=False):
    """
    Return every URI known to refer to the same document as `uri`.

    Looks for an existing "document URI" record matching the normalized form
    of `uri`. When one is found, the URIs recorded against that document are
    returned; when none is found, or when `uri` is the document's canonical
    link, only the passed URI itself is returned.

    :param session: Database session
    :param uri: URI associated with the document
    :param normalized: Return normalized URIs instead of the raw value

    :returns: a list of equivalent URIs
    """

    uri_normalized = normalize_uri(uri)

    # Sub-select for the id of one document that carries this normalized URI.
    doc_id_subquery = (
        session.query(models.DocumentURI.document_id)
        .filter(models.DocumentURI.uri_normalized == uri_normalized)
        .limit(1)
        .subquery()
    )

    # Selecting individual columns rather than whole model objects avoids
    # object creation, which was measured to speed this function up
    # noticeably (roughly 40%).
    rows = list(
        session.query(
            models.DocumentURI.type,
            models.DocumentURI.uri,
            models.DocumentURI.uri_normalized,
        ).filter(models.DocumentURI.document_id == doc_id_subquery)
    )

    # Two cases need no expansion:
    #  * nothing is recorded for this URI, or
    #  * the URI is a "canonical" link. Every annotation created on that page
    #    is guaranteed to have it as its target.source, so expanding to other
    #    URIs would only risk false positives.
    if not rows or any(
        kind == "rel-canonical" and raw == uri for kind, raw, _ in rows
    ):
        return [uri_normalized if normalized else uri]

    if normalized:
        return [row_normalized for _, _, row_normalized in rows]

    return [raw for _, raw, _ in rows]