def test_get_fields_to_annotate():
    """Check which fields of the first FORM1 form are selected for annotation."""
    first_form = get_forms(load_html(FORM1))[0]
    annotated = get_fields_to_annotate(first_form)

    # Every selected element must expose a non-empty ``name``.
    assert all(getattr(field, 'name', None) for field in annotated)

    field_names = get_field_names(annotated)
    assert field_names == ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']

    # The extracted names must match the elements' own names.
    element_names = {field.name for field in annotated}
    assert set(field_names) == element_names
def test_get_forms():
    """Both <form> elements are found regardless of tag case."""
    markup = """ <p>some text</p> <form action="/go">hi</form> <FORM method='post'><input name='foo'></FORM> """
    tree = load_html(markup)
    forms = get_forms(tree)

    assert len(forms) == 2
    assert forms[0].action == "/go"
    assert forms[1].method == "POST"
def test_classify(tree):
    """The first form of ``tree`` is classified as a login form."""
    first_form = get_forms(tree)[0]
    result = formasaurus.classify(first_form)

    expected = {
        'form': 'login',
        'fields': {
            'password': '******',
            'username': '******',
        },
    }
    assert result == expected
def test_get_forms():
    """get_forms returns every <form> element, upper- or lower-case."""
    html = """ <p>some text</p> <form action="/go">hi</form> <FORM method='post'><input name='foo'></FORM> """
    found = get_forms(load_html(html))

    assert len(found) == 2

    go_form, post_form = found
    assert go_form.action == "/go"
    assert post_form.method == "POST"
def iter_annotations(self, index=None, drop_duplicates=True, drop_na=True,
                     drop_skipped=True, simplify_form_types=False,
                     simplify_field_types=False, verbose=False, leave=False):
    """
    Return an iterator over :class:`FormAnnotation` objects.

    Forms found in each tree produced by ``self.iter_trees(index=index)``
    are paired, in order, with the form types stored in ``info["forms"]``.

    :param index: passed through to :meth:`iter_trees`.
    :param drop_duplicates: skip forms whose ``self.get_fingerprint(form)``
        value has already been seen during this iteration.
    :param drop_na: skip forms whose type equals the form schema's
        ``na_value``.
    :param drop_skipped: skip forms whose type equals the form schema's
        ``skip_value``.
    :param simplify_form_types: map form types through the form schema's
        ``simplify_map`` before the drop checks are applied.
    :param simplify_field_types: yield a deep copy of ``info`` whose
        ``visible_html_fields`` values are mapped through the field
        schema's ``simplify_map``; the stored ``info`` is not modified.
    :param verbose: wrap the tree iterator in a ``tqdm`` progress bar.
    :param leave: passed to ``tqdm``; when ``verbose`` is also set, an
        empty line is printed after the iteration finishes.
    """
    form_schema = self.get_form_schema()
    field_schema = self.get_field_schema()

    trees = self.iter_trees(index=index)
    if verbose:
        trees = tqdm(trees, "Loading", mininterval=0, leave=leave,
                     ascii=True, ncols=80, unit=' files')

    seen = set()
    for path, tree, info in trees:
        # Built lazily, at most once per tree: copying and re-simplifying
        # for every form is wasteful and would apply ``simplify_map``
        # repeatedly to values that were already simplified.
        simplified_info = None

        for idx, (form, tp) in enumerate(zip(get_forms(tree), info["forms"])):
            if simplify_form_types:
                tp = form_schema.simplify_map.get(tp, tp)

            if drop_na and tp == form_schema.na_value:
                continue

            if drop_skipped and tp == form_schema.skip_value:
                continue

            if drop_duplicates:
                fp = self.get_fingerprint(form)
                if fp in seen:
                    continue
                seen.add(fp)

            form_info = info
            if simplify_field_types:
                if simplified_info is None:
                    # Deep copy so the caller-visible ``info`` stays intact.
                    simplified_info = copy.deepcopy(info)
                    for fields in simplified_info['visible_html_fields']:
                        # Only values are replaced, so updating the mapping
                        # while iterating over its items is safe.
                        for k, v in fields.items():
                            fields[k] = field_schema.simplify_map.get(v, v)
                form_info = simplified_info

            yield FormAnnotation(form, tp, idx, form_info, path,
                                 form_schema, field_schema)

    if verbose and leave:
        print("")
def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
    """
    Given a lxml tree or HTML source code, return a list of
    ``(form_elem, form_info)`` tuples.

    Each ``form_info`` is the result of :meth:`classify_proba` (called
    with ``threshold``) when ``proba`` is true, and of :meth:`classify`
    otherwise.
    """
    results = []
    for form in get_forms(load_html(tree_or_html)):
        if proba:
            form_info = self.classify_proba(form, threshold)
        else:
            form_info = self.classify(form)
        results.append((form, form_info))
    return results
def add_result(self, html, url, form_answers=None,
               visible_html_fields=None, index=None, add_empty=True):
    """
    Save HTML source and its <form> and form field types.

    Returns the path of the stored file relative to ``self.folder``, or
    ``None`` when ``add_empty`` is false and the page has no forms or no
    fields to annotate. Missing ``form_answers`` / ``visible_html_fields``
    are filled with the corresponding schema's ``na_value``.
    """
    page_forms = get_forms(load_html(html))

    if not add_empty:
        # Nothing worth storing: no forms at all ...
        if len(page_forms) == 0:
            return
        # ... or none of the forms has a field to annotate.
        has_fields = any(
            len(get_fields_to_annotate(form)) != 0 for form in page_forms
        )
        if not has_fields:
            return

    if form_answers is not None:
        assert len(form_answers) == len(page_forms)
    else:
        form_schema = self.get_form_schema()
        form_answers = [form_schema.na_value for _form in page_forms]

    if visible_html_fields is None:
        field_schema = self.get_field_schema()
        visible_html_fields = []
        for form in page_forms:
            field_names = get_field_names(get_fields_to_annotate(form))
            visible_html_fields.append(
                {name: field_schema.na_value for name in field_names}
            )

    filename = self.generate_filename(url)
    path = os.path.relpath(filename, self.folder)

    if index is None:
        index = self.get_index()
    index[path] = {
        "url": url,
        "forms": form_answers,
        "visible_html_fields": visible_html_fields,
    }

    with open(filename, 'wb') as out:
        # The file is binary; text input is stored as UTF-8.
        payload = html if isinstance(html, bytes) else html.encode('utf8')
        out.write(payload)

    self.write_index(index)
    return path
def extract_forms(self, tree_or_html, proba=False, threshold=0.05,
                  fields=True):
    """
    Given a lxml tree or HTML source code, return a list of
    ``(form_elem, form_info)`` tuples.

    Each ``form_info`` is the result of :meth:`classify_proba` (called
    with ``threshold``) when ``proba`` is true, and of :meth:`classify`
    otherwise. ``fields`` is forwarded to both; when it is False, field
    type information is not computed.
    """
    # Text/bytes input is parsed; anything else is used as the tree as-is.
    tree = tree_or_html
    if isinstance(tree_or_html, (six.string_types, bytes)):
        tree = load_html(tree_or_html)

    results = []
    for form in get_forms(tree):
        if proba:
            form_info = self.classify_proba(form, threshold, fields)
        else:
            form_info = self.classify(form, fields)
        results.append((form, form_info))
    return results
def test_classify_proba(tree):
    """classify_proba and extract_forms(proba=True) agree on the first form."""
    first_form = get_forms(tree)[0]

    direct = formasaurus.classify_proba(first_form, threshold=0.05)
    extracted = formasaurus.extract_forms(tree, proba=True, threshold=0.05)
    via_extract = extracted[0][1]

    assert direct == via_extract
def test_classify(tree):
    """Classifying the first form yields the login type and its fields."""
    login_form = get_forms(tree)[0]
    expected_fields = {'password': '******', 'username': '******'}
    expected = {'form': 'login', 'fields': expected_fields}

    assert formasaurus.classify(login_form) == expected