예제 #1
0
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header.

    Each bucket definition can either be a string, which signifies the name
    of the bucket, or a dict. If a dict it is expected to have at least an
    item with key 'bucket' whose value is the name of the bucket; a dict
    without that key is logged as a warning and skipped. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain, for each listed domain that
    `url` matches. Host stats are stored in the stats table under the key
    '{parent-bucket}:{domain(normalized)}'.

    Args:
        url: url being tallied; only consulted when a bucket definition
            carries a non-empty 'tally-domains' list
        warcprox_meta: parsed Warcprox-Meta header (dict-like), may be
            None or empty

    Returns:
        list of strings. The first element is always "__all__". If the
        header has no stats.buckets section, "__unspecified__" is the only
        other element.

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case, for a url that matches the domain foo.bar.com (and not
    192.168.10.20), the return value would be
    ["__all__","bucket1","bucket2","bucket2:foo.bar.com"]
    '''
    # Imported locally: this is a plain function (there is no `self`), so
    # it must obtain a logger on its own.
    import logging

    buckets = ["__all__"]
    if not (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        buckets.append("__unspecified__")
        return buckets

    for bucket in warcprox_meta["stats"]["buckets"]:
        if not isinstance(bucket, dict):
            # plain bucket name
            buckets.append(bucket)
            continue

        if 'bucket' not in bucket:
            logging.getLogger(__name__).warning(
                'ignoring invalid stats bucket in '
                'warcprox-meta header %s', bucket)
            continue

        buckets.append(bucket['bucket'])
        if bucket.get('tally-domains'):
            canon_url = urlcanon.semantic(url)
            for domain in bucket['tally-domains']:
                domain = urlcanon.normalize_host(domain).decode('ascii')
                # only tally the per-domain sub-bucket for domains this
                # url actually falls under
                if urlcanon.url_matches_domain(canon_url, domain):
                    buckets.append('%s:%s' % (bucket['bucket'], domain))

    return buckets
예제 #2
0
파일: stats.py 프로젝트: ukwa/warcprox
def unravel_buckets(url, warcprox_meta):
    '''
    Unravels bucket definitions in Warcprox-Meta header.

    Each bucket definition can either be a string, which signifies the name
    of the bucket, or a dict. If a dict it is expected to have at least an
    item with key 'bucket' whose value is the name of the bucket; a dict
    without that key is logged as a warning and skipped. The other
    currently recognized item is 'tally-domains', which if supplied should
    be a list of domains. This instructs warcprox to additionally tally
    substats of the given bucket by domain, for each listed domain that
    `url` matches. Host stats are stored in the stats table under the key
    '{parent-bucket}:{domain(normalized)}'.

    Args:
        url: url being tallied; only consulted when a bucket definition
            carries a non-empty 'tally-domains' list
        warcprox_meta: parsed Warcprox-Meta header (dict-like), may be
            None or empty

    Returns:
        list of strings. The first element is always "__all__". If the
        header has no stats.buckets section, "__unspecified__" is the only
        other element.

    Example Warcprox-Meta header (a real one will likely have other
    sections besides 'stats'):

    Warcprox-Meta: {"stats":{"buckets":["bucket1",{"bucket":"bucket2","tally-domains":["foo.bar.com","192.168.10.20"]}]}}

    In this case, for a url that matches the domain foo.bar.com (and not
    192.168.10.20), the return value would be
    ["__all__","bucket1","bucket2","bucket2:foo.bar.com"]
    '''
    # Imported locally: this is a plain function (there is no `self`), so
    # it must obtain a logger on its own.
    import logging

    buckets = ["__all__"]
    if not (warcprox_meta and "stats" in warcprox_meta
            and "buckets" in warcprox_meta["stats"]):
        buckets.append("__unspecified__")
        return buckets

    for bucket in warcprox_meta["stats"]["buckets"]:
        if not isinstance(bucket, dict):
            # plain bucket name
            buckets.append(bucket)
            continue

        if 'bucket' not in bucket:
            # Logger.warning rather than the deprecated Logger.warn alias
            logging.getLogger(__name__).warning(
                    'ignoring invalid stats bucket in '
                    'warcprox-meta header %s', bucket)
            continue

        buckets.append(bucket['bucket'])
        if bucket.get('tally-domains'):
            canon_url = urlcanon.semantic(url)
            for domain in bucket['tally-domains']:
                domain = urlcanon.normalize_host(domain).decode('ascii')
                # only tally the per-domain sub-bucket for domains this
                # url actually falls under
                if urlcanon.url_matches_domain(canon_url, domain):
                    buckets.append(
                            '%s:%s' % (bucket['bucket'], domain))

    return buckets
예제 #3
0
def test_url_matches_domain():
    '''
    Exercises urlcanon.url_matches_domain with str and bytes arguments,
    subdomains, mixed-case hosts and non-ascii hosts.
    '''
    matches = urlcanon.url_matches_domain

    # ip address hosts, with every str/bytes combination of url and domain
    assert matches('http://1.2.3.4/', '1.2.3.4')
    assert matches(b'scheme://1.2.3.4', '1.2.3.4')
    assert matches('ftp://1.2.3.4/a/b/c/d', b'1.2.3.4')
    assert matches(b'http://1.2.3.4', b'1.2.3.4')

    # a subdomain url matches its parent domain, but not the other way
    # around
    assert matches('http://foo.example.com', 'example.com')
    assert not matches('http://example.com', 'foo.example.com')

    # an upper-case host is expected to match only after the url has been
    # canonicalized with urlcanon.whatwg
    assert not matches('http://foo.EXAMPLE.COM', 'example.com')
    canonicalized = urlcanon.whatwg('http://foo.EXAMPLE.COM')
    assert matches(canonicalized, 'example.com')

    # non-ascii hosts: the raw url is expected to match the raw domain
    # only, not its 'xn--' spelling
    assert not matches('http://☃.net', 'xn--n3h.net')
    assert matches('http://☃.net', '☃.net')
    assert matches('http://😬.☃.net', '☃.net')

    # a normalized domain is expected to match only a canonicalized url
    normalized_domain = urlcanon.normalize_host('☃.net')
    assert not matches('http://😬.☃.net', normalized_domain)
    canonicalized = urlcanon.whatwg('https://😬.☃.net')
    normalized_domain = urlcanon.normalize_host('☃.net')
    assert matches(canonicalized, normalized_domain)