def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header.

    Each bucket definition can either be a string, which signifies the name
    of the bucket, or a dict. If a dict it is expected to have at least an
    item with key 'bucket' whose value is the name of the bucket. A dict
    without a 'bucket' item is logged as a warning and skipped. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain, for each listed domain that
    `url` matches. Host stats are stored in the stats table under the key
    '{parent-bucket}:{domain(normalized)}'.

    Args:
        url: url of the request being tallied; only consulted when a bucket
            supplies 'tally-domains'
        warcprox_meta: parsed Warcprox-Meta header (dict), or None

    Returns:
        list of strings. The first element is always "__all__". If
        warcprox_meta has no "stats"/"buckets" section, the only other
        element is "__unspecified__".

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

        Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case, for a url on host foo.bar.com, the return value would be
    ["__all__","bucket1","bucket2","bucket2:foo.bar.com"]
    '''
    # Imported locally so this function does not depend on a module-level
    # import being present.
    import logging

    buckets = ["__all__"]
    if not (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        buckets.append("__unspecified__")
        return buckets

    for bucket in warcprox_meta["stats"]["buckets"]:
        if not isinstance(bucket, dict):
            # plain string: the bucket name itself
            buckets.append(bucket)
            continue

        if 'bucket' not in bucket:
            # This is a module-level function, so there is no `self` to
            # log through; use a module logger instead.
            logging.getLogger(__name__).warning(
                    'ignoring invalid stats bucket in '
                    'warcprox-meta header %s', bucket)
            continue

        buckets.append(bucket['bucket'])
        if bucket.get('tally-domains'):
            canon_url = urlcanon.semantic(url)
            for domain in bucket['tally-domains']:
                domain = urlcanon.normalize_host(domain).decode('ascii')
                # only tally under domains the url actually belongs to
                if urlcanon.url_matches_domain(canon_url, domain):
                    buckets.append('%s:%s' % (bucket['bucket'], domain))

    return buckets
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header.

    Each bucket definition can either be a string, which signifies the name
    of the bucket, or a dict. If a dict it is expected to have at least an
    item with key 'bucket' whose value is the name of the bucket. A dict
    without a 'bucket' item is logged as a warning and skipped. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain, for each listed domain that
    `url` matches. Host stats are stored in the stats table under the key
    '{parent-bucket}:{domain(normalized)}'.

    Args:
        url: url of the request being tallied; only consulted when a bucket
            supplies 'tally-domains'
        warcprox_meta: parsed Warcprox-Meta header (dict), or None

    Returns:
        list of strings. The first element is always "__all__". If
        warcprox_meta has no "stats"/"buckets" section, the only other
        element is "__unspecified__".

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

        Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case, for a url on host foo.bar.com, the return value would be
    ["__all__","bucket1","bucket2","bucket2:foo.bar.com"]
    '''
    # Imported locally so this function does not depend on a module-level
    # import being present.
    import logging

    buckets = ["__all__"]
    has_buckets = bool(
            warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"])
    if not has_buckets:
        buckets.append("__unspecified__")
        return buckets

    for bucket in warcprox_meta["stats"]["buckets"]:
        if not isinstance(bucket, dict):
            # plain string: the bucket name itself
            buckets.append(bucket)
            continue

        if 'bucket' not in bucket:
            # This is a module-level function, so there is no `self` to
            # log through; use a module logger. `warning` replaces the
            # deprecated `warn` alias.
            logging.getLogger(__name__).warning(
                    'ignoring invalid stats bucket in '
                    'warcprox-meta header %s', bucket)
            continue

        buckets.append(bucket['bucket'])
        if bucket.get('tally-domains'):
            canon_url = urlcanon.semantic(url)
            for domain in bucket['tally-domains']:
                domain = urlcanon.normalize_host(domain).decode('ascii')
                # only tally under domains the url actually belongs to
                if urlcanon.url_matches_domain(canon_url, domain):
                    buckets.append('%s:%s' % (bucket['bucket'], domain))

    return buckets
def test_url_matches_domain():
    '''
    Checks urlcanon.url_matches_domain against ip address hosts, subdomains,
    mixed-case hosts and non-ascii hosts, with urls and domains supplied as
    str, bytes, or the results of urlcanon.whatwg / urlcanon.normalize_host.
    '''
    matches = urlcanon.url_matches_domain

    # ip address hosts, with url and domain in every str/bytes combination
    assert matches('http://1.2.3.4/', '1.2.3.4')
    assert matches(b'scheme://1.2.3.4', '1.2.3.4')
    assert matches('ftp://1.2.3.4/a/b/c/d', b'1.2.3.4')
    assert matches(b'http://1.2.3.4', b'1.2.3.4')

    # a url on a subdomain matches the parent domain, not the reverse
    assert matches('http://foo.example.com', 'example.com')
    assert not matches('http://example.com', 'foo.example.com')

    # an upper-case host only matches once the url has been run through
    # urlcanon.whatwg
    assert not matches('http://foo.EXAMPLE.COM', 'example.com')
    lowercased_url = urlcanon.whatwg('http://foo.EXAMPLE.COM')
    assert matches(lowercased_url, 'example.com')

    # raw non-ascii url: matches the same non-ascii domain, but not its
    # 'xn--' form
    assert not matches('http://☃.net', 'xn--n3h.net')
    assert matches('http://☃.net', '☃.net')
    assert matches('http://😬.☃.net', '☃.net')

    # against a normalize_host()'d domain, the url has to be run through
    # urlcanon.whatwg as well
    normalized_domain = urlcanon.normalize_host('☃.net')
    assert not matches('http://😬.☃.net', normalized_domain)
    canonicalized_url = urlcanon.whatwg('https://😬.☃.net')
    assert matches(canonicalized_url, urlcanon.normalize_host('☃.net'))