def _normalize_document_meta_window(session, window):
    """Re-normalize DocumentMeta rows whose `updated` falls inside *window*.

    For each row: if another DocumentMeta of the same type already claims
    the normalized claimant URI, delete this row as a duplicate; otherwise
    refresh its normalized claimant. Flushes after each row so deletions
    are visible to subsequent duplicate checks.
    """
    rows = (
        session.query(models.DocumentMeta)
        .filter(models.DocumentMeta.updated.between(window.start, window.end))
        .order_by(models.DocumentMeta.updated.asc())
    )

    for meta in rows:
        normalized_claimant = uri.normalize(meta.claimant)
        duplicates = session.query(models.DocumentMeta).filter(
            models.DocumentMeta.id != meta.id,
            models.DocumentMeta.claimant_normalized == normalized_claimant,
            models.DocumentMeta.type == meta.type,
        )

        if duplicates.count() > 0:
            session.delete(meta)
        else:
            # pylint: disable=protected-access
            meta._claimant_normalized = normalized_claimant

        session.flush()
def _normalize_document_uris_window(session, window):
    """Re-normalize DocumentURI rows whose `updated` falls inside *window*.

    Merges documents that now share a URI, deletes rows that would collide
    with an existing normalized row, and refreshes the normalized columns
    on the rest. Flushes after each row.
    """
    rows = (session.query(models.DocumentURI)
            .filter(models.DocumentURI.updated.between(window.start, window.end))
            .order_by(models.DocumentURI.updated.asc()))

    for row in rows:
        matching_docs = models.Document.find_by_uris(session, [row.uri])
        if matching_docs.count() > 1:
            merge_documents(session, matching_docs)

        duplicates = session.query(models.DocumentURI).filter(
            models.DocumentURI.id != row.id,
            models.DocumentURI.document_id == row.document_id,
            models.DocumentURI.claimant_normalized == uri.normalize(row.claimant),
            models.DocumentURI.uri_normalized == uri.normalize(row.uri),
            models.DocumentURI.type == row.type,
            models.DocumentURI.content_type == row.content_type,
        )

        if duplicates.count() > 0:
            session.delete(row)
        else:
            # pylint: disable=protected-access
            row._claimant_normalized = uri.normalize(row.claimant)
            row._uri_normalized = uri.normalize(row.uri)

        session.flush()
def _normalize_document_uris_window(session, window):
    """Re-normalize DocumentURI rows updated within *window*.

    If a row's URI now maps to more than one Document the documents are
    merged first. Rows whose normalized columns would duplicate another
    row are deleted; otherwise the normalized columns are refreshed.
    """
    updated_in_window = models.DocumentURI.updated.between(window.start, window.end)
    rows = (
        session.query(models.DocumentURI)
        .filter(updated_in_window)
        .order_by(models.DocumentURI.updated.asc())
    )

    for docuri in rows:
        docs = models.Document.find_by_uris(session, [docuri.uri])
        if docs.count() > 1:
            merge_documents(session, docs)

        claimant_norm = uri.normalize(docuri.claimant)
        uri_norm = uri.normalize(docuri.uri)

        collisions = session.query(models.DocumentURI).filter(
            models.DocumentURI.id != docuri.id,
            models.DocumentURI.document_id == docuri.document_id,
            models.DocumentURI.claimant_normalized == claimant_norm,
            models.DocumentURI.uri_normalized == uri_norm,
            models.DocumentURI.type == docuri.type,
            models.DocumentURI.content_type == docuri.content_type,
        )

        if collisions.count() > 0:
            session.delete(docuri)
        else:
            # pylint: disable=protected-access
            docuri._claimant_normalized = claimant_norm
            docuri._uri_normalized = uri_norm

        session.flush()
def _normalize_document_meta_window(session, window):
    """Re-normalize DocumentMeta rows updated within *window*.

    Deletes rows that would duplicate another row's (normalized claimant,
    type) pair; otherwise refreshes the normalized claimant column.
    """
    rows = (
        session.query(models.DocumentMeta)
        .filter(models.DocumentMeta.updated.between(window.start, window.end))
        .order_by(models.DocumentMeta.updated.asc())
    )

    for docmeta in rows:
        normalized = uri.normalize(docmeta.claimant)
        collisions = session.query(models.DocumentMeta).filter(
            models.DocumentMeta.id != docmeta.id,
            models.DocumentMeta.claimant_normalized == normalized,
            models.DocumentMeta.type == docmeta.type,
        )

        if collisions.count() > 0:
            session.delete(docmeta)
        else:
            # pylint: disable=protected-access
            docmeta._claimant_normalized = normalized

        session.flush()
def _fetch_document_uri_canonical_self_claim(session, uri_):
    """Return all self-claim / rel-canonical DocumentURIs matching *uri_*.

    Matching is done against the normalized form of *uri_*.
    """
    normalized = uri.normalize(uri_)
    query = session.query(models.DocumentURI).filter(
        models.DocumentURI.uri_normalized == normalized,
        models.DocumentURI.type.in_([u"self-claim", u"rel-canonical"]),
    )
    return query.all()
def _has_uri_ever_been_annotated(db, uri):
    """Return `True` if a given URI has ever been annotated."""
    # Raw SQL keeps this check cheap: we query
    # `document_uri.uri_normalized` rather than
    # `annotation.target_uri_normalized` because only the former
    # has an index.
    sql = 'SELECT EXISTS(SELECT 1 FROM document_uri WHERE uri_normalized = :uri)'
    row = db.execute(sql, {'uri': normalize(uri)}).first()
    return row[0] is True
def _has_uri_ever_been_annotated(db, uri):
    """Return `True` if a given URI has ever been annotated."""
    # Raw SQL keeps this check cheap: `document_uri.uri_normalized` is
    # queried instead of `annotation.target_uri_normalized` because only
    # the former is indexed.
    sql = "SELECT EXISTS(SELECT 1 FROM document_uri WHERE uri_normalized = :uri)"
    row = db.execute(sql, {"uri": normalize(uri)}).first()
    return row[0] is True
def __call__(self, search, params):
    """Filter *search* down to the normalized scopes of any uri/url params.

    Consumes every 'uri' and 'url' value from *params*, expands each via
    storage, and filters the search on the normalized result set. Returns
    *search* unchanged when neither param is present.
    """
    if 'uri' not in params and 'url' not in params:
        return search

    requested = popall(params, 'uri') + popall(params, 'url')

    scopes = set()
    for value in requested:
        for expanded in storage.expand_uri(self.request.db, value):
            scopes.add(uri.normalize(expanded))

    return search.filter('terms', **{'target.scope': list(scopes)})
def _normalize_annotations_window(session, window):
    """Refresh normalized target URIs for annotations updated in *window*.

    Returns the set of ids of the annotations that were changed.
    """
    rows = (
        session.query(models.Annotation)
        .filter(models.Annotation.updated.between(window.start, window.end))
        .order_by(models.Annotation.updated.asc())
    )

    changed = set()
    for annotation in rows:
        candidate = uri.normalize(annotation.target_uri)
        if candidate != annotation.target_uri_normalized:
            # pylint: disable=protected-access
            annotation._target_uri_normalized = candidate
            changed.add(annotation.id)
    return changed
def _normalize_annotations_window(session, window):
    """Refresh normalized target URIs for annotations updated in *window*.

    Returns the ids of the annotations whose normalized URI changed.
    """
    updated_in_window = models.Annotation.updated.between(window.start, window.end)
    rows = (
        session.query(models.Annotation)
        .filter(updated_in_window)
        .order_by(models.Annotation.updated.asc())
    )

    touched = set()
    for annotation in rows:
        fresh = uri.normalize(annotation.target_uri)
        if fresh == annotation.target_uri_normalized:
            continue
        # pylint: disable=protected-access
        annotation._target_uri_normalized = fresh
        touched.add(annotation.id)
    return touched
def __call__(self, params):
    """Build a 'terms' clause over normalized uri/url params, or None.

    Collects every 'uri'/'url' value, removes both keys from *params*,
    expands each value via storage, and returns an Elasticsearch terms
    filter on the normalized results. Returns None when neither param
    is present.
    """
    if 'uri' not in params and 'url' not in params:
        return None

    requested = [value for key, value in params.items() if key in ['uri', 'url']]

    # `del` (not pop) so that multidict params drop every value for the key.
    if 'uri' in params:
        del params['uri']
    if 'url' in params:
        del params['url']

    scopes = set()
    for raw in requested:
        expanded = storage.expand_uri(self.request.db, raw)
        scopes.update(uri.normalize(item) for item in expanded)

    return {"terms": {"target.scope": list(scopes)}}
def _normalize_annotations_window(session, window):
    """Refresh normalized target URIs for annotations updated in *window*.

    Returns the set of annotation ids that were modified.
    """
    rows = (
        session.query(models.Annotation)
        .filter(models.Annotation.updated.between(window.start, window.end))
        .order_by(models.Annotation.updated.asc())
    )

    modified_ids = set()
    for row in rows:
        fresh = uri.normalize(row.target_uri)
        if fresh == row.target_uri_normalized:
            continue
        # pylint: disable=protected-access
        row._target_uri_normalized = fresh
        modified_ids.add(row.id)
    return modified_ids
def _wildcard_uri_normalized(self, wildcard_uri):
    """
    Same as uri.normalized but it replaces _'s with ?'s after normalization.

    Although elasticsearch uses ? we use _ since ? is a special reserved
    url character and this means we can avoid dealing with normalization
    headaches.

    While it's possible to escape wildcards using \\, the uri.normalize
    converts \\ to encoded url format which does not behave the same in
    elasticsearch. Thus, escaping wildcard characters is not currently
    supported.
    """
    # A trailing "*" would be mangled by normalization (e.g.
    # http://example.com/* should become http://example.com* so it
    # matches all urls including the base url), so strip it first and
    # re-append it afterwards.
    suffix = ""
    if wildcard_uri.endswith("*"):
        suffix = "*"
        wildcard_uri = wildcard_uri[:-1]

    normalized = uri.normalize(wildcard_uri) + suffix
    return normalized.replace("_", "?")
def _wildcard_uri_normalized(self, wildcard_uri):
    """
    Same as uri.normalized but it doesn't strip ending `?` from uri's.

    It's possible to have a wildcard at the end of a uri, however
    uri.normalize strips `?`s from the end of uris and something like
    http://foo.com/* will not be normalized to http://foo.com* without
    removing the `*` before normalization. To compensate for this, we
    check for an ending wildcard and add it back after normalization.

    While it's possible to escape `?` and `*` using \\, the uri.normalize
    converts \\ to encoded url format which does not behave the same in
    elasticsearch. Thus, escaping wildcard characters is not currently
    supported.
    """
    suffix = ""
    if wildcard_uri.endswith(("?", "*")):
        suffix = wildcard_uri[-1]
        wildcard_uri = wildcard_uri[:-1]

    return uri.normalize(wildcard_uri) + suffix
def test_it_strips_fragments(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_it_leaves_invalid_urls_alone(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_it_normalises_url_casing(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_it_black_lists_invalid_params(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_it_strips_via_urls(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_it_sorts_params(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def _fetch_document_uri_claimants(session, uri_):
    """Return all DocumentURIs whose normalized claimant matches *uri_*."""
    normalized = uri.normalize(uri_)
    return (
        session.query(models.DocumentURI)
        .filter(models.DocumentURI.claimant_normalized == normalized)
        .all()
    )
def _fetch_annotations(session, uri_):
    """Return all Annotations whose normalized target URI matches *uri_*."""
    normalized = uri.normalize(uri_)
    return (
        session.query(models.Annotation)
        .filter(models.Annotation.target_uri_normalized == normalized)
        .all()
    )
def test_it_removes_ports(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_normalize_returns_unicode(url, _):
    result = uri.normalize(url)
    assert isinstance(result, str)
def _fetch_document_uri_canonical_self_claim(session, uri_):
    """Return all self-claim / rel-canonical DocumentURIs matching *uri_*."""
    normalized = uri.normalize(uri_)
    query = session.query(models.DocumentURI).filter(
        models.DocumentURI.uri_normalized == normalized,
        models.DocumentURI.type.in_([u'self-claim', u'rel-canonical']),
    )
    return query.all()
def test_normalize_returns_unicode(url, _):
    result = uri.normalize(url)
    assert isinstance(result, text_type)
def test_it_removes_trailing_slashes(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def target_uri(self, value):
    """Set the raw target URI and keep its normalized companion in sync."""
    self._target_uri = value
    normalized = uri.normalize(value)
    self._target_uri_normalized = normalized
def test_it_translates_scheme_correctly(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_it_handles_invalid_params(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_normalize(url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out
def test_it_decodes_params_correctly(self, url_in, url_out):
    normalized = uri.normalize(url_in)
    assert normalized == url_out