Exemplo n.º 1
0
def link_to_features(link):
    """Build the feature dict describing a single link.

    Features are derived from the normalized link text, the href (query
    parameter names, path and a year-like pattern), the ``target`` and
    ``rel`` attributes of the underlying element, and the CSS classes of
    the link, its descendants and its parent.

    Args:
        link: selector-like object for the link. It must support
            ``xpath(...).extract()`` and be accepted by ``get_link_text``,
            ``get_link_href`` and ``get_selector_root``.

    Returns:
        dict mapping feature names to feature values.
    """
    text = normalize(get_link_text(link))

    href = get_link_href(link)
    url_parts = urlsplit(href)
    path_lower = url_parts.path.lower()

    # Only the query parameter *names* are used; their values are ignored.
    query_param_names = [
        name.lower() for name, _ in parse_qsl(url_parts.query)
    ]
    query_param_names_ngrams = _as_list(
        ngrams_wb(" ".join([normalize(name) for name in query_param_names]),
                  3, 5, True))

    elem = get_selector_root(link)
    elem_target = _elem_attr(elem, 'target')
    elem_rel = _elem_attr(elem, 'rel')

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)

    return {
        'bias': 3.0,
        'isdigit': text.isdigit(),
        'isalpha': text.isalpha(),
        'elem-target': elem_target,
        'elem-rel': elem_rel,
        'num-tokens%s' % _num_tokens_feature(text): 1.0,
        'text': _as_list(ngrams_wb(replace_digits(text), 2, 5),
                         AUTOPAGER_LIMITS.max_text_features),
        # At most the first 20 characters of the stripped text are kept.
        'text-exact': replace_digits(text.strip()[:20].strip()),
        'class': _as_list(ngrams_wb(css_classes, 4, 5),
                          AUTOPAGER_LIMITS.max_css_features),
        'query': query_param_names_ngrams,
        'path-has-page': 'page' in path_lower,
        # "/" or "-", then "p" or "page" (plus an optional word character),
        # an optional "/", then at least one digit.
        'path-has-pageXX':
            re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None,
        'path-has-number':
            any(part.isdigit() for part in url_parts.path.split('/')),
        # Raw string: "\d" in a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        'href-has-year': re.search(r'20\d\d', href) is not None,
    }
Exemplo n.º 2
0
def page_to_features(xseq):
    """Return one feature dict per link in ``xseq``, with context n-grams.

    Each link is converted with ``link_to_features`` and then extended with
    ``'text-before'`` and ``'text-after'`` entries built from the text
    returned by ``get_text_around_selector_list`` for that link.

    Args:
        xseq: sequence of link selectors.

    Returns:
        list of feature dicts, in the same order as ``xseq``.
    """
    link_features = [link_to_features(link) for link in xseq]
    surrounding_text = get_text_around_selector_list(xseq, max_length=15)

    # The weight is below 1 because these n-grams carry a lot of duplicate
    # information, so they should be regularized more strongly
    # (as if they were a single feature rather than many features).
    ngram_weight = 0.2
    for link_feats, context in zip(link_features, surrounding_text):
        text_before, text_after = context
        link_feats['text-before'] = dict.fromkeys(
            ngrams_wb(normalize(text_before), 5, 5), ngram_weight)
        link_feats['text-after'] = dict.fromkeys(
            ngrams_wb(normalize(text_after), 5, 5), ngram_weight)
    return link_features
Exemplo n.º 3
0
def link_to_features(link):
    """Build the feature dict describing a single link.

    Features are derived from the normalized link text, the href (query
    parameter names, path and a year-like pattern), the ``target`` and
    ``rel`` attributes of ``link.root``, and the CSS classes of the link,
    its descendants and its parent.

    Args:
        link: selector-like object for the link. It must expose ``root``,
            support ``xpath(...).extract()`` and be accepted by
            ``get_link_text`` and ``get_link_href``.

    Returns:
        dict mapping feature names to feature values.
    """
    text = normalize(get_link_text(link))

    href = get_link_href(link)
    url_parts = urlsplit(href)
    path_lower = url_parts.path.lower()

    # Only the query parameter *names* are used; their values are ignored.
    query_param_names = [
        name.lower() for name, _ in parse_qsl(url_parts.query)
    ]
    query_param_names_ngrams = ngrams_wb(
        " ".join([normalize(name) for name in query_param_names]), 3, 5, True
    )

    elem = link.root
    elem_target = _elem_attr(elem, 'target')
    elem_rel = _elem_attr(elem, 'rel')

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(self_and_children_classes + ' ' + parent_classes)

    return {
        'bias': 3.0,
        'isdigit': text.isdigit(),
        'isalpha': text.isalpha(),
        'elem-target': elem_target,
        'elem-rel': elem_rel,
        'num-tokens%s' % _num_tokens_feature(text): 1.0,

        'text': ngrams_wb(replace_digits(text), 2, 5),
        # At most the first 20 characters of the stripped text are kept.
        'text-exact': replace_digits(text.strip()[:20].strip()),
        'class': ngrams_wb(css_classes, 4, 5),
        'query': query_param_names_ngrams,

        'path-has-page': 'page' in path_lower,
        # "/" or "-", then "p" or "page" (plus an optional word character),
        # an optional "/", then at least one digit.
        'path-has-pageXX':
            re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None,
        'path-has-number':
            any(part.isdigit() for part in url_parts.path.split('/')),

        # Raw string: "\d" in a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        'href-has-year': re.search(r'20\d\d', href) is not None,
    }
Exemplo n.º 4
0
def test_normalize():
    """Check ``normalize`` on mixed-case text containing a newline and spaces."""
    raw = "Hello,\n  world!"
    expected = "hello, world!"
    assert normalize(raw) == expected
Exemplo n.º 5
0
def _elem_attr(elem, attr):
    """Return the normalized value of attribute ``attr`` on ``elem``.

    ``''`` is passed to ``elem.get`` as the fallback value, and the result
    is run through ``normalize``.
    """
    raw_value = elem.get(attr, '')
    return normalize(raw_value)
Exemplo n.º 6
0
def _elem_attr(elem, attr):
    """Look up ``attr`` on ``elem`` and return it in normalized form.

    ``elem.get`` is called with ``''`` as the fallback, so ``normalize``
    always receives the result of that lookup.
    """
    attr_value = elem.get(attr, '')
    normalized = normalize(attr_value)
    return normalized
Exemplo n.º 7
0
def test_normalize():
    """Verify the output of ``normalize`` for a two-line greeting string."""
    result = normalize("Hello,\n  world!")
    assert result == "hello, world!"