예제 #1
0
def _elem_features(elem):
    """Build the feature dict for a single form element.

    The dict always contains the element tag plus token / n-gram features
    computed from ``elem.name`` and from the ``value``, ``class``,
    ``title``, ``placeholder`` and ``id`` attributes (read through
    ``_elem_attr``).  Extra keys are added conditionally:

    * ``label`` and ``label-ngrams-3-5`` when ``elem.label`` is not None;
    * ``input-type`` for ``input`` tags (lower-cased ``type`` attribute,
      ``"text"`` when the attribute is absent);
    * ``option-text``, ``option-value`` and ``option-num-pattern`` for
      ``select`` tags.
    """
    name_text = normalize(elem.name)
    value_text = _elem_attr(elem, "value")
    placeholder_text = _elem_attr(elem, "placeholder")
    class_text = _elem_attr(elem, "class")
    id_text = _elem_attr(elem, "id")
    title_text = _elem_attr(elem, "title")

    features = {"tag": elem.tag}
    features["name"] = tokenize(name_text)
    features["name-ngrams-3-5"] = ngrams(name_text, 3, 5)
    # NOTE(review): "value" and "value-ngrams" are produced by the same
    # ngrams(..., 5, 5) call, unlike "name"/"id" which are tokenized --
    # confirm this is intended before changing it.
    features["value"] = ngrams(value_text, 5, 5)
    features["value-ngrams"] = ngrams(value_text, 5, 5)
    features["css-class-ngrams"] = ngrams(class_text, 5, 5)
    features["help"] = tokenize(title_text + " " + placeholder_text)
    features["id-ngrams"] = ngrams(id_text, 4, 4)
    features["id"] = tokenize(id_text)

    label_elem = elem.label
    if label_elem is not None:
        label_str = normalize(label_elem.text_content())
        features["label"] = tokenize(label_str)
        features["label-ngrams-3-5"] = ngrams(label_str, 3, 5)

    if elem.tag == "input":
        input_type = elem.get("type", "text")
        features["input-type"] = input_type.lower()

    if elem.tag == "select":
        option_texts = [normalize(text) for text in elem.xpath("option//text()")]
        option_values = [
            normalize(option.get("value", "")) for option in elem.xpath("option")
        ]
        features["option-text"] = option_texts
        features["option-value"] = option_values

        # Collect the distinct number patterns over option texts and values.
        patterns = set()
        for item in option_texts + option_values:
            patterns.add(number_pattern(item))
        features["option-num-pattern"] = list(patterns)

    return features
예제 #2
0
def _elem_features(elem):
    """Return a dict of features extracted from the form element ``elem``.

    Base features come from the element tag, ``elem.name`` and the
    ``value``, ``placeholder``, ``class``, ``id`` and ``title`` attributes
    (fetched with ``_elem_attr``).  Label features are added when
    ``elem.label`` is not None, ``input-type`` is added for ``input``
    tags, and the ``option-*`` features are added for ``select`` tags.
    """
    name = normalize(elem.name)
    attr_value = _elem_attr(elem, 'value')
    attr_placeholder = _elem_attr(elem, 'placeholder')
    attr_class = _elem_attr(elem, 'class')
    attr_id = _elem_attr(elem, 'id')
    attr_title = _elem_attr(elem, 'title')

    # NOTE(review): 'value' and 'value-ngrams' both hold the result of
    # ngrams(attr_value, 5, 5) -- confirm the duplication is intended.
    result = {
        'tag': elem.tag,
        'name': tokenize(name),
        'name-ngrams-3-5': ngrams(name, 3, 5),
        'value': ngrams(attr_value, 5, 5),
        'value-ngrams': ngrams(attr_value, 5, 5),
        'css-class-ngrams': ngrams(attr_class, 5, 5),
        'help': tokenize(attr_title + " " + attr_placeholder),
        'id-ngrams': ngrams(attr_id, 4, 4),
        'id': tokenize(attr_id),
    }

    label_node = elem.label
    if label_node is not None:
        label_str = normalize(label_node.text_content())
        result.update([
            ('label', tokenize(label_str)),
            ('label-ngrams-3-5', ngrams(label_str, 3, 5)),
        ])

    tag = elem.tag
    if tag == 'input':
        # A missing "type" attribute is treated as a text input.
        result['input-type'] = elem.get('type', 'text').lower()
    elif tag == 'select':
        texts = []
        for text in elem.xpath('option//text()'):
            texts.append(normalize(text))
        values = []
        for option in elem.xpath('option'):
            values.append(normalize(option.get('value', '')))

        result['option-text'] = texts
        result['option-value'] = values
        # De-duplicate the number patterns seen in option texts and values.
        result['option-num-pattern'] = list(
            set(number_pattern(item) for item in texts + values)
        )

    return result
예제 #3
0
def _elem_features(elem):
    """Extract features of one form element into a dict.

    Always present: ``tag``, ``name``, ``name-ngrams-3-5``, ``value``,
    ``value-ngrams``, ``css-class-ngrams``, ``help``, ``id-ngrams`` and
    ``id``.  Present only when applicable: ``label`` and
    ``label-ngrams-3-5`` (``elem.label`` is not None), ``input-type``
    (``input`` tags), and ``option-text`` / ``option-value`` /
    ``option-num-pattern`` (``select`` tags).
    """
    norm_name = normalize(elem.name)
    raw_value = _elem_attr(elem, 'value')
    raw_placeholder = _elem_attr(elem, 'placeholder')
    raw_class = _elem_attr(elem, 'class')
    raw_id = _elem_attr(elem, 'id')
    raw_title = _elem_attr(elem, 'title')

    out = dict(tag=elem.tag)
    out.update({
        'name': tokenize(norm_name),
        'name-ngrams-3-5': ngrams(norm_name, 3, 5),
        # NOTE(review): the next two entries use the same ngrams(..., 5, 5)
        # call -- confirm that 'value' is not meant to be tokenized.
        'value': ngrams(raw_value, 5, 5),
        'value-ngrams': ngrams(raw_value, 5, 5),
        'css-class-ngrams': ngrams(raw_class, 5, 5),
        'help': tokenize(raw_title + " " + raw_placeholder),
        'id-ngrams': ngrams(raw_id, 4, 4),
        'id': tokenize(raw_id),
    })

    label_el = elem.label
    if label_el is not None:
        label_norm = normalize(label_el.text_content())
        out['label'] = tokenize(label_norm)
        out['label-ngrams-3-5'] = ngrams(label_norm, 3, 5)

    if elem.tag == 'input':
        # Inputs without an explicit "type" attribute count as "text".
        out['input-type'] = elem.get('type', 'text').lower()

    if elem.tag == 'select':
        option_nodes = elem.xpath('option')
        out['option-text'] = [normalize(t) for t in elem.xpath('option//text()')]
        out['option-value'] = [normalize(node.get('value', '')) for node in option_nodes]
        combined = out['option-text'] + out['option-value']
        # Keep one entry per distinct number pattern.
        out['option-num-pattern'] = list({number_pattern(entry) for entry in combined})

    return out
예제 #4
0
def test_ngrams(seq, min_n, max_n, result):
    """Check that ``ngrams(seq, min_n, max_n)`` equals the expected ``result``."""
    produced = ngrams(seq, min_n, max_n)
    assert produced == result