def test_get_cleaned_form_html_human_readable():
    """
    Human-readable cleaned HTML must not contain the 'style', 'script'
    or 'div' substrings, and must expose the same (name, value) pairs
    for the fields to annotate as the original form does.
    """
    source_form = load_html(FORM1)
    cleaned = get_cleaned_form_html(source_form, human_readable=True)

    for unwanted in ('style', 'script', 'div'):
        assert unwanted not in cleaned

    def name_value_pairs(root):
        # (name, value) of every field selected for annotation, in order.
        return [
            (field.name, field.value)
            for field in get_fields_to_annotate(root)
        ]

    expected = name_value_pairs(source_form)
    reparsed = name_value_pairs(load_html(cleaned))
    assert expected == reparsed
def classify_proba(self, form, threshold=0.0):
    """
    Return dict with probabilities of ``form`` and its fields belonging
    to various form and field classes::

        {
            'form': {'type1': prob1, 'type2': prob2, ...},
            'fields': {
                'name': {'type1': prob1, 'type2': prob2, ...},
                ...
            }
        }

    ``form`` should be an lxml HTML <form> element.
    Only classes with probability >= ``threshold`` are preserved.

    The field model is conditioned on the most probable form type.
    If ``threshold`` filters out every form class, the form type
    predicted by ``self.form_classifier.classify`` is used instead,
    so that field probabilities can still be computed.
    """
    form_types_proba = self.form_classifier.classify_proba(form, threshold)

    if form_types_proba:
        form_type = max(form_types_proba, key=lambda tp: form_types_proba[tp])
    else:
        # max() raises ValueError on an empty mapping; this happens when
        # no form class reaches ``threshold``.
        form_type = self.form_classifier.classify(form)

    field_elems = get_fields_to_annotate(form)
    xseq = fieldtype_model.get_form_features(form, form_type, field_elems)
    yseq = self._field_model.predict_marginals_single(xseq)

    return {
        'form': form_types_proba,
        'fields': {
            elem.name: thresholded(probs, threshold)
            for elem, probs in zip(field_elems, yseq)
        },
    }
def get_form_features(form, form_type, field_elems=None):
    """
    Build one feature dict per field element of a <form>.

    When ``field_elems`` is None the elements are obtained with
    ``get_fields_to_annotate(form)``. Each dict starts from
    ``_elem_features(elem)`` and is extended with position flags,
    the form type, n-grams of the surrounding text and a bias term.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)

    text_before, text_after = get_text_around_elems(form, field_elems)
    features = [_elem_features(elem) for elem in field_elems]
    last_pos = len(features) - 1

    for pos, (elem, feat) in enumerate(zip(field_elems, features)):
        if pos == 0:
            feat['is-first'] = True
        if pos == last_pos:
            feat['is-last'] = True

        feat['form-type'] = form_type

        # up to 6 trailing tokens of the text preceding the element
        preceding = tokenize(normalize(text_before[elem]))[-6:]
        feat['text-before'] = token_ngrams(preceding, 1, 2)

        # up to 5 leading tokens of the text following the element
        following = tokenize(normalize(text_after[elem]))[:5]
        feat['text-after'] = token_ngrams(following, 1, 2)

        feat['bias'] = 1

    return features
def check(self, verbose=True):
    """
    Validate every entry of the storage index against the stored file.

    Problems are printed as they are found; the total number of errors
    is returned. With ``verbose`` the entries are iterated through a
    ``tqdm`` progress bar.
    """
    entries = list(self.get_index().items())
    if verbose:
        entries = tqdm(entries, "Checking", leave=True, mininterval=0,
                       ascii=True, ncols=80, unit=' files')

    errors = 0
    for fn, info in entries:
        path = os.path.join(self.folder, fn)
        if not os.path.exists(path):
            print("\nFile not found: %r" % path)
            errors += 1
            continue

        with open(path, 'rb') as fp:
            raw = fp.read()

        doc = load_html(raw, info['url'])
        doc_forms = doc.xpath("//form")

        # number of <form> elements vs. number of form annotations
        if len(doc_forms) != len(info["forms"]):
            errors += 1
            print("\nInvalid form count for entry %r: expected %d, got %d" % (
                fn, len(doc_forms), len(info["forms"])))

        if 'visible_html_fields' not in info:
            errors += 1
            print("No fields data for entry {!r}".format(fn))
            continue

        fields = info['visible_html_fields']
        if len(fields) != len(doc_forms):
            errors += 1
            print(
                "Invalid number of form field annotations for entry {!r}"
                .format(fn))
            continue

        # field names in the document vs. annotated field names, per form
        for idx, (form, fields_info) in enumerate(zip(doc_forms, fields)):
            names = {elem.name for elem in get_fields_to_annotate(form)}
            annotated = set(fields_info.keys())
            if names != annotated:
                errors += 1
                print("Invalid field names for form #{}, "
                      "entry {!r}. Expected: {}, found: {}".format(
                          idx, fn, names, annotated))

    if errors:
        print("Status: %d error(s) found" % errors)
    else:
        print("Status: OK")

    return errors
def get_form_features(form, form_type, field_elems=None):
    """
    Return a list with one feature dict for each field element.

    ``field_elems`` defaults to ``get_fields_to_annotate(form)``.
    Besides the per-element features, every dict records whether the
    field is first/last, the form type, 1-2 token n-grams of the text
    around the field, and a constant bias.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)

    text_before, text_after = get_text_around_elems(form, field_elems)
    res = [_elem_features(elem) for elem in field_elems]
    total = len(res)

    for idx, elem in enumerate(field_elems):
        feat = res[idx]

        if idx == 0:
            feat['is-first'] = True
        if idx == total - 1:
            feat['is-last'] = True

        feat['form-type'] = form_type

        # text before the element: keep the 6 closest tokens
        before_text = normalize(text_before[elem])
        before_tokens = tokenize(before_text)[-6:]
        feat['text-before'] = token_ngrams(before_tokens, 1, 2)

        # text after the element: keep the 5 closest tokens
        after_text = normalize(text_after[elem])
        after_tokens = tokenize(after_text)[:5]
        feat['text-after'] = token_ngrams(after_tokens, 1, 2)

        feat['bias'] = 1

    return res
def field_elems(self):
    """
    Return a list of lxml Elements for the annotated fields of
    ``self.form``, as selected by ``get_fields_to_annotate``.

    Fields are returned in the order they appear in the form;
    only visible submittable fields are considered.
    """
    form = self.form
    return get_fields_to_annotate(form)
def test_get_fields_to_annotate():
    """
    Fields selected for annotation in the first form of FORM1 all have
    a non-empty name, and their names come in the expected order.
    """
    first_form = get_forms(load_html(FORM1))[0]
    elems = get_fields_to_annotate(first_form)

    # every selected element must carry a truthy ``name``
    for el in elems:
        assert getattr(el, 'name', None)

    names = get_field_names(elems)
    assert names == ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']

    element_names = {el.name for el in elems}
    assert set(names) == element_names
def add_result(self, html, url, form_answers=None,
               visible_html_fields=None, index=None, add_empty=True):
    """
    Store an HTML page together with annotations for its forms and
    form fields; return the index key (path relative to the storage
    folder) of the saved file.

    Missing ``form_answers`` / ``visible_html_fields`` are filled with
    the schemas' ``na_value``. When ``add_empty`` is False, pages with
    no forms, or whose forms have no fields to annotate, are skipped
    and None is returned.
    """
    forms = get_forms(load_html(html))

    # Nothing to annotate: no forms at all, or no form has any field.
    if not add_empty and not any(
            len(get_fields_to_annotate(form)) for form in forms):
        return

    if form_answers is not None:
        assert len(form_answers) == len(forms)
    else:
        form_schema = self.get_form_schema()
        form_answers = [form_schema.na_value for _ in forms]

    if visible_html_fields is None:
        field_schema = self.get_field_schema()
        visible_html_fields = []
        for form in forms:
            field_names = get_field_names(get_fields_to_annotate(form))
            visible_html_fields.append(
                {name: field_schema.na_value for name in field_names})

    filename = self.generate_filename(url)
    path = os.path.relpath(filename, self.folder)

    if index is None:
        index = self.get_index()

    index[path] = {
        "url": url,
        "forms": form_answers,
        "visible_html_fields": visible_html_fields,
    }

    with open(filename, 'wb') as out:
        # the file is binary; text input is stored as UTF-8
        payload = html if isinstance(html, bytes) else html.encode('utf8')
        out.write(payload)

    self.write_index(index)
    return path
def FormAnnotator(ann, annotate_fields=True, annotate_types=True,
                  max_fields=80):
    """
    Build the widget used to annotate one HTML form.

    The result is a ``widgets.VBox`` holding an optional form type
    selector, a header, and either one tab per field (when
    ``annotate_fields`` is set) or a plain view of the form. Forms
    with more than ``max_fields`` fields get a notice instead of tabs.
    """
    assert annotate_fields or annotate_types
    form_types_inv = ann.form_schema.types_inv

    children = []
    if annotate_types:
        children.append(FormTypeSelect(ann))

    tpl = """ <h4> {tp} <a href='{url}'>{url}</a> <small>{key} #{index}</small> </h4> """
    header_html = tpl.format(
        url=ann.url,
        index=ann.index,
        key=ann.key,
        tp=form_types_inv.get(ann.type, '?'),
    )
    children.append(widgets.HTML(header_html))

    if not annotate_fields:
        children.append(HtmlView(ann.form))
    else:
        names = get_field_names(get_fields_to_annotate(ann.form))
        if len(names) > max_fields:
            notice = "<h4>Too many fields ({})</h4>".format(len(names))
            children.append(widgets.HTML(notice))
        else:
            # one page per field: type selector + highlighted form view
            pages = [
                widgets.Box(children=[
                    FieldTypeSelect(ann, name),
                    HtmlView(ann.form, name),
                ])
                for name in names
            ]
            field_tabs = widgets.Tab(children=pages, padding=4)
            for idx, name in enumerate(names):
                field_tabs.set_title(idx, name)
            children.append(field_tabs)

    return widgets.VBox(children, padding=8)
def check(self):
    """
    Verify that the storage index and the stored files agree.

    Each problem is printed when it is detected (progress is shown
    with ``tqdm``); the number of detected errors is returned.
    """
    index = self.get_index()
    progress = tqdm(list(index.items()), "Checking", leave=True,
                    mininterval=0, ascii=True, ncols=80, unit=' files')

    error_count = 0
    for fn, info in progress:
        full_name = os.path.join(self.folder, fn)
        if not os.path.exists(full_name):
            print("\nFile not found: %r" % full_name)
            error_count += 1
            continue

        with open(full_name, 'rb') as f:
            content = f.read()

        doc = load_html(content, info['url'])
        forms = doc.xpath("//form")
        n_forms = len(forms)

        if n_forms != len(info["forms"]):
            error_count += 1
            msg = "\nInvalid form count for entry %r: expected %d, got %d" % (
                fn, n_forms, len(info["forms"])
            )
            print(msg)

        if 'visible_html_fields' in info:
            fields = info['visible_html_fields']
            if len(fields) == n_forms:
                # compare field names form by form
                for idx, (form, fields_info) in enumerate(zip(forms, fields)):
                    found_names = {
                        elem.name for elem in get_fields_to_annotate(form)
                    }
                    stored_names = set(fields_info.keys())
                    if found_names != stored_names:
                        error_count += 1
                        print("Invalid field names for form #{}, "
                              "entry {!r}. Expected: {}, found: {}".format(
                                  idx, fn, found_names, stored_names
                              ))
            else:
                error_count += 1
                print("Invalid number of form field annotations for entry {!r}".format(fn))
        else:
            error_count += 1
            print("No fields data for entry {!r}".format(fn))

    if error_count:
        print("Status: %d error(s) found" % error_count)
    else:
        print("Status: OK")

    return error_count
def FormAnnotator(ann, annotate_fields=True, annotate_types=True,
                  max_fields=80):
    """
    Widget for annotating a single HTML form.

    Returns a ``widgets.VBox``. At least one of ``annotate_fields`` and
    ``annotate_types`` must be true. Field tabs are only built when the
    form has at most ``max_fields`` fields.
    """
    assert annotate_fields or annotate_types

    form_types_inv = ann.form_schema.types_inv
    type_selector = [FormTypeSelect(ann)] if annotate_types else []

    tpl = """ <h4> {tp} <a href='{url}'>{url}</a> <small>{key} #{index}</small> </h4> """
    header = widgets.HTML(tpl.format(
        url=ann.url,
        index=ann.index,
        key=ann.key,
        tp=form_types_inv.get(ann.type, '?')
    ))
    children = type_selector + [header]

    if annotate_fields:
        names = get_field_names(get_fields_to_annotate(ann.form))
        n_names = len(names)

        if n_names > max_fields:
            children += [
                widgets.HTML("<h4>Too many fields ({})</h4>".format(n_names))
            ]
        else:
            pages = []
            for field_name in names:
                selector = FieldTypeSelect(ann, field_name)
                view = HtmlView(ann.form, field_name)
                pages.append(widgets.Box(children=[selector, view]))

            tabs = widgets.Tab(children=pages, padding=4)
            # tab titles are the field names, in the same order as pages
            for tab_idx, field_name in enumerate(names):
                tabs.set_title(tab_idx, field_name)

            children += [tabs]
    else:
        children += [HtmlView(ann.form)]

    return widgets.VBox(children, padding=8)
def classify(self, form):
    """
    Classify ``form`` and its fields.

    Returns ``{'form': 'type', 'fields': {'name': 'type', ...}}``:
    the form type predicted by ``self.form_classifier`` and, for every
    field selected by ``get_fields_to_annotate``, the type predicted by
    the field model.
    """
    form_type = self.form_classifier.classify(form)

    field_elems = get_fields_to_annotate(form)
    features = fieldtype_model.get_form_features(form, form_type, field_elems)
    labels = self._field_model.predict_single(features)

    field_types = {}
    for elem, label in zip(field_elems, labels):
        field_types[elem.name] = label

    return {'form': form_type, 'fields': field_types}
def classify(self, form, fields=True):
    """
    Classify ``form`` and, optionally, its fields.

    Returns ``{'form': 'type', 'fields': {'name': 'type', ...}}``.
    When ``fields`` is False the field model is not run and only
    ``{'form': 'type'}`` is returned.
    """
    form_type = self.form_classifier.classify(form)
    result = {'form': form_type}

    if not fields:
        return result

    field_elems = get_fields_to_annotate(form)
    features = fieldtype_model.get_form_features(form, form_type, field_elems)
    labels = self._field_model.predict_single(features)

    field_types = {}
    for elem, label in zip(field_elems, labels):
        field_types[elem.name] = label
    result['fields'] = field_types

    return result
def classify_proba(self, form, threshold=0.0, fields=True):
    """
    Return dict with probabilities of ``form`` and its fields belonging
    to various form and field classes::

        {
            'form': {'type1': prob1, 'type2': prob2, ...},
            'fields': {
                'name': {'type1': prob1, 'type2': prob2, ...},
                ...
            }
        }

    ``form`` should be an lxml HTML <form> element.
    Only classes with probability >= ``threshold`` are preserved.

    If ``fields`` is False, only information about the form
    is returned::

        {
            'form': {'type1': prob1, 'type2': prob2, ...}
        }

    The field model is conditioned on the most probable form type.
    If ``threshold`` filters out every form class, the form type
    predicted by ``self.form_classifier.classify`` is used instead,
    so that field probabilities can still be computed.
    """
    form_types_proba = self.form_classifier.classify_proba(form, threshold)
    res = {'form': form_types_proba}

    if fields:
        if form_types_proba:
            form_type = max(form_types_proba,
                            key=lambda tp: form_types_proba[tp])
        else:
            # max() raises ValueError on an empty mapping; this happens
            # when no form class reaches ``threshold``.
            form_type = self.form_classifier.classify(form)

        field_elems = get_fields_to_annotate(form)
        xseq = fieldtype_model.get_form_features(form, form_type, field_elems)
        yseq = self._field_model.predict_marginals_single(xseq)
        res['fields'] = {
            elem.name: thresholded(probs, threshold)
            for elem, probs in zip(field_elems, yseq)
        }

    return res
def test_get_text_around_elems():
    """
    ``get_text_around_elems`` returns two mappings keyed by element:
    the text found before each field and the text found after it.
    """
    tree = load_html(""" <form> <h1>Login</h1> Please <b>enter</b> your details <p> Username: <input name='username'/> required <div>Email:</div> <input type='text' name='email'> * </p> Thanks! </form> """)

    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)

    assert len(before) == 2
    assert before[user] == 'Login Please enter your details Username:'
    assert before[email] == 'required Email:'

    assert len(after) == 2
    assert after[user] == 'required Email:'
    assert after[email] == '* Thanks!'

    # With no elements both mappings must be empty. This has to be a
    # real assertion: a bare ``x == {}, {}`` expression checks nothing.
    empty_before, empty_after = get_text_around_elems(tree, [])
    assert empty_before == {}
    assert empty_after == {}