示例#1
0
def test_get_fields_to_annotate():
    """Check which fields of the first FORM1 form are picked for annotation."""
    first_form = get_forms(load_html(FORM1))[0]
    fields = get_fields_to_annotate(first_form)

    # Every selected element must expose a non-empty ``name``.
    assert all(getattr(field, 'name', None) for field in fields)

    # Names are expected in exactly this order.
    expected_names = ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']
    field_names = get_field_names(fields)
    assert field_names == expected_names

    # The helper must report the same names the elements themselves carry.
    assert set(field_names) == {field.name for field in fields}
示例#2
0
def test_get_fields_to_annotate():
    """Fields selected for annotation are all named and listed in order."""
    parsed = load_html(FORM1)
    target_form = get_forms(parsed)[0]
    annotatable = get_fields_to_annotate(target_form)

    # No element without a usable ``name`` attribute may be selected.
    assert all(getattr(elem, 'name', None) for elem in annotatable)

    collected = get_field_names(annotatable)
    assert collected == ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']

    # Cross-check the helper output against the elements' own names.
    own_names = {elem.name for elem in annotatable}
    assert set(collected) == own_names
示例#3
0
def test_get_forms():
    """Both lower-case and upper-case <form> tags are found, in page order."""
    page = """
    <p>some text</p>
    <form action="/go">hi</form>
    <FORM method='post'><input name='foo'></FORM>
    """
    found = get_forms(load_html(page))

    assert len(found) == 2
    first, second = found[0], found[1]
    assert first.action == "/go"
    # The method is written as 'post' in the markup but is expected
    # back upper-cased.
    assert second.method == "POST"
def test_classify(tree):
    """Classifying the first form of ``tree`` gives a login form result."""
    first_form = get_forms(tree)[0]
    # NOTE(review): the '******' values look like masked placeholders rather
    # than real field type labels - confirm against the classifier output.
    expected = {
        'form': 'login',
        'fields': {
            'password': '******',
            'username': '******',
        },
    }
    assert formasaurus.classify(first_form) == expected
示例#5
0
def test_get_forms():
    """``get_forms`` returns every <form> of the page regardless of tag case."""
    markup = """
    <p>some text</p>
    <form action="/go">hi</form>
    <FORM method='post'><input name='foo'></FORM>
    """
    tree = load_html(markup)
    page_forms = get_forms(tree)

    # Two forms are expected: the plain one and the upper-case <FORM>.
    assert len(page_forms) == 2
    assert page_forms[0].action == "/go"
    assert page_forms[1].method == "POST"
示例#6
0
    def iter_annotations(self,
                         index=None,
                         drop_duplicates=True,
                         drop_na=True,
                         drop_skipped=True,
                         simplify_form_types=False,
                         simplify_field_types=False,
                         verbose=False,
                         leave=False):
        """
        Return an iterator over :class:`FormAnnotation` objects.

        :param index: passed through to :meth:`iter_trees`.
        :param drop_duplicates: skip forms whose fingerprint
            (:meth:`get_fingerprint`) has already been seen.
        :param drop_na: skip forms whose type equals the form schema's
            ``na_value``.
        :param drop_skipped: skip forms whose type equals the form schema's
            ``skip_value``.
        :param simplify_form_types: map form types through the form schema's
            ``simplify_map`` before the checks above are applied.
        :param simplify_field_types: yield annotations with a copy of the
            file info in which field types are mapped through the field
            schema's ``simplify_map``; the info coming from
            :meth:`iter_trees` is left unmodified.
        :param verbose: wrap the iteration in a ``tqdm`` progress bar.
        :param leave: passed to ``tqdm``; together with ``verbose`` it also
            makes an empty line be printed once iteration is finished.

        ``idx`` of a yielded annotation is the position of the form among the
        forms of its page; dropped forms do not shift it.
        """
        form_schema = self.get_form_schema()
        field_schema = self.get_field_schema()
        trees = self.iter_trees(index=index)

        if verbose:
            trees = tqdm(trees,
                         "Loading",
                         mininterval=0,
                         leave=leave,
                         ascii=True,
                         ncols=80,
                         unit=' files')

        seen = set()
        for path, tree, info in trees:
            # Simplified copy of ``info``; built lazily, at most once per
            # file. Copying inside the form loop would deep-copy the whole
            # file info for every form and re-apply ``simplify_map`` to
            # values which are already simplified.
            simplified_info = None

            # zip() stops at the shorter of the two sequences.
            for idx, (form,
                      tp) in enumerate(zip(get_forms(tree), info["forms"])):
                if simplify_form_types:
                    tp = form_schema.simplify_map.get(tp, tp)

                if drop_na and tp == form_schema.na_value:
                    continue

                if drop_skipped and tp == form_schema.skip_value:
                    continue

                if drop_duplicates:
                    fp = self.get_fingerprint(form)
                    if fp in seen:
                        continue
                    seen.add(fp)

                form_info = info
                if simplify_field_types:
                    if simplified_info is None:
                        simplified_info = copy.deepcopy(info)
                        for fields in simplified_info['visible_html_fields']:
                            # Only values of existing keys are replaced, so
                            # the dict size does not change during iteration.
                            for k, v in fields.items():
                                fields[k] = field_schema.simplify_map.get(v, v)
                    form_info = simplified_info

                yield FormAnnotation(form, tp, idx, form_info, path,
                                     form_schema, field_schema)

        if verbose and leave:
            print("")
示例#7
0
 def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
     """
     Find all forms in a lxml tree or in HTML source code and classify them.

     Return a list of ``(form_elem, form_info)`` tuples, one per form.
     ``form_info`` is the result of :meth:`classify_proba` (called with
     ``threshold``) when ``proba`` is true, and of :meth:`classify`
     otherwise.
     """
     tree = load_html(tree_or_html)
     results = []
     for form_elem in get_forms(tree):
         if proba:
             form_info = self.classify_proba(form_elem, threshold)
         else:
             form_info = self.classify(form_elem)
         results.append((form_elem, form_info))
     return results
示例#8
0
 def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
     """
     Classify every form found in a lxml tree or HTML source code.

     The result is a list of ``(form_elem, form_info)`` tuples.
     With ``proba`` set, ``form_info`` comes from :meth:`classify_proba`
     called with ``threshold``; otherwise it comes from :meth:`classify`.
     """
     found_forms = get_forms(load_html(tree_or_html))
     pairs = []
     for elem in found_forms:
         # ``threshold`` is only used by the probabilistic variant.
         info = (self.classify_proba(elem, threshold)
                 if proba else self.classify(elem))
         pairs.append((elem, info))
     return pairs
示例#9
0
    def add_result(self, html, url, form_answers=None,
                   visible_html_fields=None, index=None, add_empty=True):
        """
        Store a page's HTML together with its form and form field types.

        The HTML is written to a file whose name is generated from ``url``,
        an entry for it is put into the index and the index is written back.

        When ``add_empty`` is false, nothing is stored (and ``None`` is
        returned) for pages without forms or whose forms have no fields to
        annotate. Missing ``form_answers`` / ``visible_html_fields`` are
        filled in with the ``na_value`` of the form / field schema.
        If ``index`` is not given it is obtained from :meth:`get_index`.

        Return the path of the stored file relative to ``self.folder``.
        """
        forms = get_forms(load_html(html))

        nothing_to_annotate = not add_empty and (
            not len(forms)
            or all(len(get_fields_to_annotate(form)) == 0 for form in forms)
        )
        if nothing_to_annotate:
            return

        if form_answers is not None:
            # One answer per form is required.
            assert len(form_answers) == len(forms)
        else:
            schema = self.get_form_schema()
            form_answers = [schema.na_value for _ in forms]

        if visible_html_fields is None:
            field_schema = self.get_field_schema()
            visible_html_fields = []
            for form in forms:
                field_names = get_field_names(get_fields_to_annotate(form))
                visible_html_fields.append(
                    {name: field_schema.na_value for name in field_names}
                )

        filename = self.generate_filename(url)
        path = os.path.relpath(filename, self.folder)

        if index is None:
            index = self.get_index()
        entry = {
            "url": url,
            "forms": form_answers,
            "visible_html_fields": visible_html_fields,
        }
        index[path] = entry

        with open(filename, 'wb') as out:
            # The file is opened in binary mode, so text is stored as UTF-8.
            payload = html if isinstance(html, bytes) else html.encode('utf8')
            out.write(payload)

        self.write_index(index)
        return path
示例#10
0
    def iter_annotations(self, index=None,
                         drop_duplicates=True, drop_na=True, drop_skipped=True,
                         simplify_form_types=False, simplify_field_types=False,
                         verbose=False, leave=False):
        """
        Return an iterator over :class:`FormAnnotation` objects.

        :param index: passed through to :meth:`iter_trees`.
        :param drop_duplicates: skip forms whose fingerprint
            (:meth:`get_fingerprint`) has already been seen.
        :param drop_na: skip forms whose type equals the form schema's
            ``na_value``.
        :param drop_skipped: skip forms whose type equals the form schema's
            ``skip_value``.
        :param simplify_form_types: map form types through the form schema's
            ``simplify_map`` before the checks above are applied.
        :param simplify_field_types: yield annotations with a copy of the
            file info in which field types are mapped through the field
            schema's ``simplify_map``; the info coming from
            :meth:`iter_trees` is left unmodified.
        :param verbose: wrap the iteration in a ``tqdm`` progress bar.
        :param leave: passed to ``tqdm``; together with ``verbose`` it also
            makes an empty line be printed once iteration is finished.

        ``idx`` of a yielded annotation is the position of the form among the
        forms of its page; dropped forms do not shift it.
        """
        form_schema = self.get_form_schema()
        field_schema = self.get_field_schema()
        trees = self.iter_trees(index=index)

        if verbose:
            trees = tqdm(trees, "Loading", mininterval=0,
                         leave=leave, ascii=True, ncols=80, unit=' files')

        seen = set()
        for path, tree, info in trees:
            # Simplified copy of ``info``; built lazily, at most once per
            # file. Copying inside the form loop would deep-copy the whole
            # file info for every form and re-apply ``simplify_map`` to
            # values which are already simplified.
            simplified_info = None

            # zip() stops at the shorter of the two sequences.
            for idx, (form, tp) in enumerate(zip(get_forms(tree), info["forms"])):
                if simplify_form_types:
                    tp = form_schema.simplify_map.get(tp, tp)

                if drop_na and tp == form_schema.na_value:
                    continue

                if drop_skipped and tp == form_schema.skip_value:
                    continue

                if drop_duplicates:
                    fp = self.get_fingerprint(form)
                    if fp in seen:
                        continue
                    seen.add(fp)

                form_info = info
                if simplify_field_types:
                    if simplified_info is None:
                        simplified_info = copy.deepcopy(info)
                        for fields in simplified_info['visible_html_fields']:
                            # Only values of existing keys are replaced, so
                            # the dict size does not change during iteration.
                            for k, v in fields.items():
                                fields[k] = field_schema.simplify_map.get(v, v)
                    form_info = simplified_info

                yield FormAnnotation(form, tp, idx, form_info, path,
                                     form_schema, field_schema)

        if verbose and leave:
            print("")
示例#11
0
    def add_result(self,
                   html,
                   url,
                   form_answers=None,
                   visible_html_fields=None,
                   index=None,
                   add_empty=True):
        """
        Save HTML source along with the types of its <form> elements and
        of their fields.

        A file name is generated from ``url``; the HTML is written there and
        a record with ``url``, ``form_answers`` and ``visible_html_fields``
        is added to the index, which is then written back. The index is taken
        from :meth:`get_index` unless ``index`` is passed.

        With ``add_empty`` set to false, pages that have no forms, or only
        forms without fields to annotate, are not saved and ``None`` is
        returned. Answers which are not passed default to the ``na_value``
        of the corresponding schema.

        Return the saved file's path relative to ``self.folder``.
        """
        page_forms = get_forms(load_html(html))

        if not add_empty:
            if not len(page_forms):
                return
            has_fields = any(
                len(get_fields_to_annotate(frm)) != 0 for frm in page_forms
            )
            if not has_fields:
                return

        if form_answers is None:
            form_schema = self.get_form_schema()
            form_answers = [form_schema.na_value for _ in page_forms]
        else:
            # Answers must line up one-to-one with the forms of the page.
            assert len(form_answers) == len(page_forms)

        if visible_html_fields is None:
            field_schema = self.get_field_schema()
            visible_html_fields = [
                {
                    field_name: field_schema.na_value
                    for field_name in get_field_names(get_fields_to_annotate(frm))
                }
                for frm in page_forms
            ]

        filename = self.generate_filename(url)
        rel_path = os.path.relpath(filename, self.folder)

        if index is None:
            index = self.get_index()
        index[rel_path] = {
            "url": url,
            "forms": form_answers,
            "visible_html_fields": visible_html_fields,
        }

        with open(filename, 'wb') as fp:
            # Binary mode: text input is encoded to UTF-8 before writing.
            if isinstance(html, bytes):
                fp.write(html)
            else:
                fp.write(html.encode('utf8'))

        self.write_index(index)
        return rel_path
示例#12
0
    def extract_forms(self, tree_or_html, proba=False, threshold=0.05,
                      fields=True):
        """
        Find all forms in a lxml tree or in HTML source code and classify
        them; return a list of ``(form_elem, form_info)`` tuples.

        String and bytes input is parsed with ``load_html``; anything else
        is used as a tree as-is.

        ``form_info`` is the result of :meth:`classify_proba` (called with
        ``threshold``) when ``proba`` is true, and of :meth:`classify`
        otherwise.

        ``fields`` is forwarded to the classification call; pass False
        to skip computing field type information.
        """
        tree = tree_or_html
        if isinstance(tree_or_html, (six.string_types, bytes)):
            tree = load_html(tree_or_html)

        results = []
        for form_elem in get_forms(tree):
            if proba:
                form_info = self.classify_proba(form_elem, threshold, fields)
            else:
                form_info = self.classify(form_elem, fields)
            results.append((form_elem, form_info))
        return results
示例#13
0
def test_classify_proba(tree):
    """``classify_proba`` and ``extract_forms(proba=True)`` must agree."""
    first_form = get_forms(tree)[0]
    direct = formasaurus.classify_proba(first_form, threshold=0.05)

    extracted = formasaurus.extract_forms(tree, proba=True, threshold=0.05)
    # Each item is a (form, info) pair; take the info of the first form.
    via_extract = extracted[0][1]

    assert direct == via_extract
def test_classify(tree):
    """The first form of ``tree`` is classified as a login form."""
    target = get_forms(tree)[0]
    result = formasaurus.classify(target)
    # NOTE(review): the '******' values look like masked placeholders rather
    # than real field type labels - confirm against the classifier output.
    expected_fields = {'password': '******', 'username': '******'}
    assert result == {
        'form': 'login',
        'fields': expected_fields,
    }