def _elem_features(elem):
    """Collect the raw feature dict describing one form field element.

    ``elem`` must expose ``name``, ``tag``, ``label``, ``get()`` and
    ``xpath()``; its attributes are read through ``_elem_attr``.

    The returned dict always holds the tag, name, value, css-class, help
    and id features.  Label features are added when ``elem.label`` is not
    None, ``input-type`` is added for ``input`` tags, and the ``option-*``
    features are added for ``select`` tags.
    """
    name_text = normalize(elem.name)
    value_text = _elem_attr(elem, "value")
    placeholder_text = _elem_attr(elem, "placeholder")
    css_class_text = _elem_attr(elem, "class")
    id_text = _elem_attr(elem, "id")
    title_text = _elem_attr(elem, "title")

    # NOTE: "value" and "value-ngrams" are produced by the same ngrams()
    # call; both keys are kept so the emitted feature set is unchanged.
    feat = {
        "tag": elem.tag,
        "name": tokenize(name_text),
        "name-ngrams-3-5": ngrams(name_text, 3, 5),
        "value": ngrams(value_text, 5, 5),
        "value-ngrams": ngrams(value_text, 5, 5),
        "css-class-ngrams": ngrams(css_class_text, 5, 5),
        "help": tokenize(title_text + " " + placeholder_text),
        "id-ngrams": ngrams(id_text, 4, 4),
        "id": tokenize(id_text),
    }

    label_elem = elem.label
    if label_elem is not None:
        label_str = normalize(label_elem.text_content())
        feat["label"] = tokenize(label_str)
        feat["label-ngrams-3-5"] = ngrams(label_str, 3, 5)

    tag = elem.tag
    if tag == "input":
        # A missing type attribute is treated as "text".
        feat["input-type"] = elem.get("type", "text").lower()
    elif tag == "select":
        option_texts = [normalize(text) for text in elem.xpath("option//text()")]
        option_values = [
            normalize(option.get("value", "")) for option in elem.xpath("option")
        ]
        feat["option-text"] = option_texts
        feat["option-value"] = option_values
        # De-duplicate the number patterns seen across texts and values.
        feat["option-num-pattern"] = list(
            {number_pattern(item) for item in option_texts + option_values}
        )

    return feat
def _elem_features(elem):
    """Return a dict of features extracted from a single form field.

    Features are derived from the element's tag and name, from its
    ``value``, ``placeholder``, ``class``, ``id`` and ``title`` attributes
    (read via ``_elem_attr``), from its label when ``elem.label`` is not
    None, and from tag-specific data for ``input`` and ``select`` tags.
    """
    norm_name = normalize(elem.name)
    attr_value = _elem_attr(elem, 'value')
    attr_placeholder = _elem_attr(elem, 'placeholder')
    attr_class = _elem_attr(elem, 'class')
    attr_id = _elem_attr(elem, 'id')
    attr_title = _elem_attr(elem, 'title')

    # Populate the features one at a time, in a fixed key order.
    feat = {}
    feat['tag'] = elem.tag
    feat['name'] = tokenize(norm_name)
    feat['name-ngrams-3-5'] = ngrams(norm_name, 3, 5)
    # NOTE: 'value' and 'value-ngrams' come from the same ngrams() call;
    # both keys are kept so the emitted feature set is unchanged.
    feat['value'] = ngrams(attr_value, 5, 5)
    feat['value-ngrams'] = ngrams(attr_value, 5, 5)
    feat['css-class-ngrams'] = ngrams(attr_class, 5, 5)
    feat['help'] = tokenize(attr_title + " " + attr_placeholder)
    feat['id-ngrams'] = ngrams(attr_id, 4, 4)
    feat['id'] = tokenize(attr_id)

    field_label = elem.label
    if field_label is not None:
        label_norm = normalize(field_label.text_content())
        feat['label'] = tokenize(label_norm)
        feat['label-ngrams-3-5'] = ngrams(label_norm, 3, 5)

    if elem.tag == 'input':
        # Inputs without an explicit type fall back to 'text'.
        input_type = elem.get('type', 'text')
        feat['input-type'] = input_type.lower()

    if elem.tag == 'select':
        texts = []
        for text_node in elem.xpath('option//text()'):
            texts.append(normalize(text_node))
        feat['option-text'] = texts

        values = []
        for option in elem.xpath('option'):
            values.append(normalize(option.get('value', '')))
        feat['option-value'] = values

        # Unique number patterns over option texts followed by values.
        patterns = set()
        for candidate in texts + values:
            patterns.add(number_pattern(candidate))
        feat['option-num-pattern'] = list(patterns)

    return feat
def _elem_features(elem):
    """Extract the feature mapping for one form field element.

    The base features cover the tag, the normalized name and the
    ``value``/``placeholder``/``class``/``id``/``title`` attributes as
    returned by ``_elem_attr``.  Label features are merged in when
    ``elem.label`` is not None; ``input`` tags gain ``input-type`` and
    ``select`` tags gain the ``option-*`` features.
    """
    field_name = normalize(elem.name)
    field_value = _elem_attr(elem, 'value')
    field_placeholder = _elem_attr(elem, 'placeholder')
    field_class = _elem_attr(elem, 'class')
    field_id = _elem_attr(elem, 'id')
    field_title = _elem_attr(elem, 'title')
    help_text = field_title + " " + field_placeholder

    # NOTE: 'value' and 'value-ngrams' use the same ngrams() arguments;
    # both are retained so the emitted feature set is unchanged.
    feat = {
        'tag': elem.tag,
        'name': tokenize(field_name),
        'name-ngrams-3-5': ngrams(field_name, 3, 5),
        'value': ngrams(field_value, 5, 5),
        'value-ngrams': ngrams(field_value, 5, 5),
        'css-class-ngrams': ngrams(field_class, 5, 5),
        'help': tokenize(help_text),
        'id-ngrams': ngrams(field_id, 4, 4),
        'id': tokenize(field_id),
    }

    label = elem.label
    if label is not None:
        text = normalize(label.text_content())
        feat.update({
            'label': tokenize(text),
            'label-ngrams-3-5': ngrams(text, 3, 5),
        })

    if elem.tag == 'input':
        # Default to 'text' when the type attribute is absent.
        feat.update({'input-type': elem.get('type', 'text').lower()})

    if elem.tag == 'select':
        opt_text = list(map(normalize, elem.xpath('option//text()')))
        opt_value = [normalize(opt.get('value', '')) for opt in elem.xpath('option')]
        feat.update({
            'option-text': opt_text,
            'option-value': opt_value,
            # Distinct number patterns across option texts and values.
            'option-num-pattern': list(set(map(number_pattern, opt_text + opt_value))),
        })

    return feat
def test_ngrams(seq, min_n, max_n, result):
    """Assert that ``ngrams(seq, min_n, max_n)`` equals ``result``.

    The arguments are presumably supplied by a parametrization that is
    outside the visible source -- TODO confirm.
    """
    produced = ngrams(seq, min_n, max_n)
    assert produced == result