예제 #1
0
def test_normalize_host():
    '''
    Checks urlcanon.normalize_host against a table of raw hosts and the
    exact bytes value expected back for each.
    '''
    expectations = [
        # mixed-case ascii comes back lowercased
        ('EXAMPLE.Com', b'example.com'),
        # non-ascii labels come back in xn-- form
        ('₹.com', b'xn--yzg.com'),
        # upper-case xn-- label with trailing dots
        ('XN--fa-Hia.de..', b'xn--fa-hia.de'),
        ('☕.de', b'xn--53h.de'),
        # this host is expected to come back percent-encoded instead
        ('日本⒈co.jp', b'%E6%97%A5%E6%9C%AC%E2%92%88co%EF%BC%8Ejp'),
        # literal, percent-encoded and doubly percent-encoded spellings
        # of the same host are all expected to give the same result
        ('☃.net', b'xn--n3h.net'),
        ('%e2%98%83.n%45t', b'xn--n3h.net'),
        ('%25e2%98%%383.N%45t', b'xn--n3h.net'),
    ]
    for raw_host, normalized in expectations:
        assert urlcanon.normalize_host(raw_host) == normalized
예제 #2
0
def test_host_matches_domain():
    '''
    Checks urlcanon.host_matches_domain with str/bytes arguments,
    subdomains, and hosts that have or have not been passed through
    urlcanon.normalize_host first.
    '''
    matches = urlcanon.host_matches_domain
    normalize = urlcanon.normalize_host

    # every str/bytes combination of the same ip address matches
    for domain in ('1.2.3.4', b'1.2.3.4'):
        for host in ('1.2.3.4', b'1.2.3.4'):
            assert matches(host, domain)

    # a subdomain matches its parent, but not the other way around
    assert matches('foo.example.com', 'example.com')
    assert not matches('example.com', 'foo.example.com')

    # differing case only matches once the host has been normalized
    assert not matches('foo.EXAMPLE.COM', 'example.com')
    assert matches(normalize('foo.EXAMPLE.COM'), 'example.com')

    # unicode and xn-- spellings only match when both sides use the
    # same form
    assert not matches('☃.net', 'xn--n3h.net')
    assert matches('☃.net', '☃.net')
    assert matches('😬.☃.net', '☃.net')
    assert not matches('😬.☃.net', normalize('☃.net'))
    assert matches(normalize('😬.☃.net'), normalize('☃.net'))
예제 #3
0
def test_url_matches_domain():
    '''
    Checks urlcanon.url_matches_domain with str/bytes arguments,
    subdomains, and urls/domains that have or have not been canonicalized
    with urlcanon.whatwg / urlcanon.normalize_host first.
    '''
    matches = urlcanon.url_matches_domain
    normalize = urlcanon.normalize_host
    whatwg = urlcanon.whatwg

    # ip address urls, mixing str and bytes and various schemes/paths
    ip_cases = [
        ('http://1.2.3.4/', '1.2.3.4'),
        (b'scheme://1.2.3.4', '1.2.3.4'),
        ('ftp://1.2.3.4/a/b/c/d', b'1.2.3.4'),
        (b'http://1.2.3.4', b'1.2.3.4'),
    ]
    for url, domain in ip_cases:
        assert matches(url, domain)

    # a subdomain url matches the parent domain, but not the reverse
    assert matches('http://foo.example.com', 'example.com')
    assert not matches('http://example.com', 'foo.example.com')

    # differing case only matches once the url has been canonicalized
    assert not matches('http://foo.EXAMPLE.COM', 'example.com')
    assert matches(whatwg('http://foo.EXAMPLE.COM'), 'example.com')

    # unicode and xn-- spellings only match when both sides use the
    # same form
    assert not matches('http://☃.net', 'xn--n3h.net')
    assert matches('http://☃.net', '☃.net')
    assert matches('http://😬.☃.net', '☃.net')
    assert not matches('http://😬.☃.net', normalize('☃.net'))
    assert matches(whatwg('https://😬.☃.net'), normalize('☃.net'))
예제 #4
0
    def _enforce_limit(self, limit_key, limit_value, soft=False):
        '''
        Rejects the current request if the stat named by `limit_key` has
        reached `limit_value`.

        `limit_key` is split from the right into three '/'-separated
        parts, e.g. 'job1:foo.com/total/urls'. If the first part has the
        form '{bucket}:{domain}', the rule only applies when self.hostname
        matches the normalized domain, and the normalized domain replaces
        the raw one in the stats lookup and in all reported keys.

        Returns without doing anything if there is no stats db, if the
        domain does not match, or if the limit has not been reached.

        Raises:
            warcprox.RequestBlockedByRule: after a 420 response (430 if
                `soft`) has been sent and the connection closed
        '''
        stats_db = self.server.stats_db
        if not stats_db:
            return

        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
        reported_key = limit_key

        # a bucket of the form 'job1:foo.com' restricts the rule to
        # requests whose host is within that domain
        pieces = bucket0.split(':')
        if len(pieces) == 2:
            domain = urlcanon.normalize_host(pieces[1])
            if not urlcanon.host_matches_domain(self.hostname, domain):
                return
            bucket0 = '%s:%s' % (pieces[0], domain.decode('ascii'))
            reported_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)

        current = stats_db.value(bucket0, bucket1, bucket2)
        if not (current and limit_value and limit_value > 0
                and current >= limit_value):
            return

        # everything that differs between a soft and a hard limit
        if soft:
            status, reason = 430, "Reached soft limit"
            label, meta_key = "soft limit", "reached-soft-limit"
        else:
            status, reason = 420, "Reached limit"
            label, meta_key = "limit", "reached-limit"

        message = "request rejected by warcprox: reached %s %s=%s\n" % (
                label, reported_key, limit_value)
        body = message.encode("utf-8")

        self.send_response(status, reason)
        self.send_header("Content-Type", "text/plain;charset=utf-8")
        self.send_header("Connection", "close")
        self.send_header("Content-Length", len(body))

        response_meta = {"stats": {bucket0: stats_db.value(bucket0)}}
        response_meta[meta_key] = {reported_key: limit_value}
        self.send_header(
                "Warcprox-Meta",
                json.dumps(response_meta, separators=(",", ":")))
        self.end_headers()

        # a HEAD response carries headers only
        if self.command != "HEAD":
            self.wfile.write(body)
        self.connection.close()

        raise warcprox.RequestBlockedByRule(
                "%s %s %s %s -- reached %s %s=%s" % (
                    self.client_address[0], status, self.command, self.url,
                    label, reported_key, limit_value))
예제 #5
0
    def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
        '''
        Rejects the current request if the stat named by `limit_key` has
        reached `limit_value`.

        `limit_key` is split from the right into three '/'-separated
        parts. If the first part contains ':', the text after the first
        ':' is treated as a domain and replaced by its normalized form,
        both for the stats lookup and in all reported keys. The rule is
        only enforced when the resulting first part is one of `buckets`.

        Returns without doing anything if there is no stats db, if the
        bucket is not in `buckets`, or if the limit has not been reached.

        Raises:
            warcprox.RequestBlockedByRule: after a 420 response (430 if
                `soft`) has been sent and the connection closed
        '''
        if not self.server.stats_db:
            return

        # parse limit key
        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
        # normalize domain if part of bucket
        if ":" in bucket0:
            b, raw_domain = bucket0.split(":", 1)
            domain = urlcanon.normalize_host(raw_domain).decode("ascii")
            bucket0 = "%s:%s" % (b, domain)
            limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)

        if bucket0 not in buckets:
            return

        value = self.server.stats_db.value(bucket0, bucket1, bucket2)
        if not (value and limit_value and limit_value > 0
                and value >= limit_value):
            return

        # everything that differs between a soft and a hard limit
        if soft:
            status, reason = 430, "Reached soft limit"
            label, meta_key = "soft limit", "reached-soft-limit"
        else:
            status, reason = 420, "Reached limit"
            label, meta_key = "limit", "reached-limit"

        body = ("request rejected by warcprox: reached %s %s=%s\n" %
                (label, limit_key, limit_value)).encode("utf-8")
        self.send_response(status, reason)
        self.send_header("Content-Type", "text/plain;charset=utf-8")
        self.send_header("Connection", "close")
        self.send_header("Content-Length", len(body))
        response_meta = {
            "stats": {
                bucket0: self.server.stats_db.value(bucket0)
            },
            meta_key: {limit_key: limit_value},
        }
        # separators is documented as an (item_separator, key_separator)
        # tuple; pass one explicitly rather than a two-character string
        self.send_header("Warcprox-Meta",
                         json.dumps(response_meta, separators=(",", ":")))
        self.end_headers()
        # a HEAD response carries headers only
        if self.command != "HEAD":
            self.wfile.write(body)
        self.connection.close()
        raise warcprox.RequestBlockedByRule(
            "%s %s %s %s -- reached %s %s=%s" %
            (self.client_address[0], status, self.command, self.url,
             label, limit_key, limit_value))
예제 #6
0
    def _enforce_limit(self, buckets, limit_key, limit_value, soft=False):
        '''
        Blocks the current request when the stat addressed by `limit_key`
        is at or above `limit_value`.

        The key is split from the right on '/' into three parts. When the
        first part contains ':', whatever follows the first ':' is
        normalized as a host name and the key is rebuilt with the
        normalized value. Nothing is enforced unless that first part is
        present in `buckets`.

        Returns silently when there is no stats db, when the bucket is not
        in `buckets`, or when the limit has not been reached.

        Raises:
            warcprox.RequestBlockedByRule: once the 420 (or, if `soft`,
                430) response has been written and the connection closed
        '''
        if not self.server.stats_db:
            return

        # parse limit key
        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
        # normalize domain if part of bucket
        if ":" in bucket0:
            b, raw_domain = bucket0.split(":", 1)
            domain = urlcanon.normalize_host(raw_domain).decode("ascii")
            bucket0 = "%s:%s" % (b, domain)
            limit_key = "%s/%s/%s" % (bucket0, bucket1, bucket2)

        if bucket0 not in buckets:
            return

        value = self.server.stats_db.value(bucket0, bucket1, bucket2)
        if value and limit_value and limit_value > 0 and value >= limit_value:
            kind = "soft limit" if soft else "limit"
            status = 430 if soft else 420
            body = ("request rejected by warcprox: reached %s %s=%s\n" % (
                        kind, limit_key, limit_value)).encode("utf-8")
            if soft:
                self.send_response(430, "Reached soft limit")
            else:
                self.send_response(420, "Reached limit")
            self.send_header("Content-Type", "text/plain;charset=utf-8")
            self.send_header("Connection", "close")
            self.send_header("Content-Length", len(body))
            response_meta = {
                "stats": {bucket0: self.server.stats_db.value(bucket0)}
            }
            if soft:
                response_meta["reached-soft-limit"] = {limit_key: limit_value}
            else:
                response_meta["reached-limit"] = {limit_key: limit_value}
            # json.dumps documents separators as a 2-tuple, so pass a
            # tuple instead of relying on a 2-character str unpacking
            self.send_header(
                    "Warcprox-Meta",
                    json.dumps(response_meta, separators=(",", ":")))
            self.end_headers()
            # no body is written in response to HEAD
            if self.command != "HEAD":
                self.wfile.write(body)
            self.connection.close()
            raise warcprox.RequestBlockedByRule(
                    "%s %s %s %s -- reached %s %s=%s" % (
                        self.client_address[0], status,
                        self.command, self.url, kind,
                        limit_key, limit_value))
예제 #7
0
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket; a dict
    without that key is logged as a warning and ignored. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    Args:
        url: the url being tallied; only consulted when a bucket supplies
            'tally-domains'
        warcprox_meta: parsed Warcprox-Meta header (dict), or a falsy value

    Returns:
        list of strings. The first entry is always "__all__". If
        warcprox_meta has no stats buckets section, the only other entry
        is "__unspecified__".

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case the return value would be
    ["__all__","bucket1","bucket2"], followed by "bucket2:foo.bar.com"
    and/or "bucket2:192.168.10.20" for whichever of those domains `url`
    matches.
    '''
    # local import so this function does not depend on the importing
    # module already having `logging` in scope
    import logging

    buckets = ["__all__"]
    if not (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        buckets.append("__unspecified__")
        return buckets

    for bucket in warcprox_meta["stats"]["buckets"]:
        if not isinstance(bucket, dict):
            buckets.append(bucket)
            continue

        if 'bucket' not in bucket:
            # this is a module-level function, so there is no `self` to
            # take a logger from
            logging.getLogger(__name__).warning(
                'ignoring invalid stats bucket in '
                'warcprox-meta header %s', bucket)
            continue

        buckets.append(bucket['bucket'])
        if bucket.get('tally-domains'):
            canon_url = urlcanon.semantic(url)
            for domain in bucket['tally-domains']:
                domain = urlcanon.normalize_host(domain).decode('ascii')
                if urlcanon.url_matches_domain(canon_url, domain):
                    buckets.append('%s:%s' % (bucket['bucket'], domain))

    return buckets
예제 #8
0
    def _enforce_limit(self, limit_key, limit_value, soft=False):
        '''
        Blocks the current request when the stat addressed by `limit_key`
        is at or above `limit_value`.

        The key is split from the right on '/' into three parts, e.g.
        'job1:foo.com/total/urls'. A first part of the form
        '{bucket}:{domain}' makes the rule apply only when self.hostname
        matches the normalized domain; the key is then rebuilt with the
        normalized domain for the stats lookup and for reporting.

        Returns silently when the domain does not match or the limit has
        not been reached.

        Raises:
            warcprox.RequestBlockedByRule: once the 420 (or, if `soft`,
                430) response has been written and the connection closed
        '''
        bucket0, bucket1, bucket2 = limit_key.rsplit("/", 2)
        effective_key = limit_key

        # 'job1:foo.com'-style buckets scope the rule to one domain
        fields = bucket0.split(':')
        if len(fields) == 2:
            normalized = urlcanon.normalize_host(fields[1])
            if not urlcanon.host_matches_domain(self.hostname, normalized):
                return
            bucket0 = '%s:%s' % (fields[0], normalized.decode('ascii'))
            effective_key = '%s/%s/%s' % (bucket0, bucket1, bucket2)

        tally = self.server.stats_db.value(bucket0, bucket1, bucket2)
        limit_reached = (
                tally and limit_value and limit_value > 0
                and tally >= limit_value)
        if not limit_reached:
            return

        # (status code, reason phrase, wording, response-meta key)
        if soft:
            variant = (
                430, "Reached soft limit", "soft limit", "reached-soft-limit")
        else:
            variant = (420, "Reached limit", "limit", "reached-limit")
        code, phrase, wording, meta_field = variant

        text = "request rejected by warcprox: reached %s %s=%s\n" % (
                wording, effective_key, limit_value)
        payload = text.encode("utf-8")

        self.send_response(code, phrase)
        for header_name, header_value in (
                ("Content-Type", "text/plain;charset=utf-8"),
                ("Connection", "close"),
                ("Content-Length", len(payload))):
            self.send_header(header_name, header_value)

        response_meta = {
            "stats": {bucket0: self.server.stats_db.value(bucket0)}
        }
        response_meta[meta_field] = {effective_key: limit_value}
        self.send_header(
                "Warcprox-Meta",
                json.dumps(response_meta, separators=(",", ":")))
        self.end_headers()

        # no body is written in response to HEAD
        if self.command != "HEAD":
            self.wfile.write(payload)
        self.connection.close()

        raise warcprox.RequestBlockedByRule(
                "%s %s %s %s -- reached %s %s=%s" % (
                    self.client_address[0], code, self.command, self.url,
                    wording, effective_key, limit_value))
예제 #9
0
파일: stats.py 프로젝트: ukwa/warcprox
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header. Each bucket
    definition can either be a string, which signifies the name of the
    bucket, or a dict. If a dict it is expected to have at least an item
    with key 'bucket' whose value is the name of the bucket; a dict
    lacking that key is skipped with a logged warning. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain. Host stats are stored in the
    stats table under the key '{parent-bucket}:{domain(normalized)}'.

    Args:
        url: url of the request being tallied; used only to decide which
            'tally-domains' entries apply
        warcprox_meta: dict parsed from the Warcprox-Meta header, or a
            falsy value if there is none

    Returns:
        list of strings, always starting with "__all__"; when
        warcprox_meta carries no stats buckets, "__unspecified__" is the
        only other entry

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case the return value would be
    ["__all__","bucket1","bucket2"] plus "bucket2:foo.bar.com" and/or
    "bucket2:192.168.10.20", depending on which of those domains `url`
    falls within.
    '''
    # imported here so the function works regardless of what the
    # surrounding module has imported
    import logging

    buckets = ["__all__"]
    if (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        for bucket in warcprox_meta["stats"]["buckets"]:
            if isinstance(bucket, dict):
                if 'bucket' not in bucket:
                    # no `self` exists in a module-level function, and
                    # Logger.warn is a deprecated alias of warning
                    logging.getLogger(__name__).warning(
                            'ignoring invalid stats bucket in '
                            'warcprox-meta header %s', bucket)
                    continue
                buckets.append(bucket['bucket'])
                if bucket.get('tally-domains'):
                    canon_url = urlcanon.semantic(url)
                    for domain in bucket['tally-domains']:
                        domain = urlcanon.normalize_host(domain).decode('ascii')
                        if urlcanon.url_matches_domain(canon_url, domain):
                            buckets.append(
                                    '%s:%s' % (bucket['bucket'], domain))
            else:
                buckets.append(bucket)
    else:
        buckets.append("__unspecified__")

    return buckets
예제 #10
0
 def _determine_host_port(self):
     '''
     Works out the host name and port to connect to and stores them in
     self.hostname and self.port.

     For a CONNECT request self.path is split on ':' into host and port
     (the port is kept as the string taken from the path). Otherwise
     self.path is treated as an absolute http url: it is saved in
     self.url, the port defaults to 80, and self.path is rewritten to
     hold only the path, params, query and fragment.

     Raises:
         Exception: if a non-CONNECT request url's scheme is not 'http'
     '''
     if not self.is_connect:
         self.url = self.path
         parsed = urllib_parse.urlparse(self.url)
         if parsed.scheme != 'http':
             raise Exception(
                     'unable to parse request %r as a proxy request' % (
                         self.requestline))
         host = parsed.hostname
         self.port = parsed.port or 80
         # same url with scheme and netloc blanked out
         without_origin = urllib_parse.ParseResult(
             scheme='', netloc='', params=parsed.params,
             path=parsed.path or '/', query=parsed.query,
             fragment=parsed.fragment)
         self.path = urllib_parse.urlunparse(without_origin)
     else:
         # NOTE(review): this unpacking needs exactly one ':' in the
         # target, anything else raises ValueError -- confirm whether
         # bracketed ipv6 targets need to be supported
         host, self.port = self.path.split(':')
     self.hostname = urlcanon.normalize_host(host).decode('ascii')
예제 #11
0
 def _determine_host_port(self):
     '''
     Sets self.hostname and self.port from the request target.

     CONNECT requests: self.path is split on ':' and the two pieces are
     used as host and port (the port remains a string). Any other
     request: self.path is copied to self.url and parsed as a url whose
     scheme must be 'http'; the port falls back to 80 and self.path is
     replaced by the url stripped of its scheme and netloc.

     Raises:
         Exception: if the url of a non-CONNECT request does not have
             scheme 'http'
     '''
     if self.is_connect:
         # NOTE(review): raises ValueError unless the target contains
         # exactly one ':' -- verify that is acceptable for all clients
         target_parts = self.path.split(':')
         host, self.port = target_parts
     else:
         self.url = self.path
         pieces = urllib_parse.urlparse(self.url)
         if pieces.scheme != 'http':
             problem = 'unable to parse request %r as a proxy request' % (
                     self.requestline)
             raise Exception(problem)
         host = pieces.hostname
         self.port = pieces.port or 80
         # keep only what comes after the authority; an empty path
         # becomes '/'
         self.path = urllib_parse.urlunparse(
             urllib_parse.ParseResult(
                 scheme='', netloc='', params=pieces.params,
                 path=pieces.path or '/', query=pieces.query,
                 fragment=pieces.fragment))
     normalized_host = urlcanon.normalize_host(host)
     self.hostname = normalized_host.decode('ascii')