Exemplo n.º 1
0
def test_load_html():
    """``load_html`` accepts bytes, text, and already-parsed trees."""
    markup = b"<div><b></b><b></b></div>"

    # Raw bytes are parsed into a tree.
    from_bytes = load_html(markup)
    bold_from_bytes = from_bytes.xpath('//b')
    assert len(bold_from_bytes) == 2

    # The decoded text contains the same two <b> elements.
    from_text = load_html(markup.decode('ascii'))
    bold_from_text = from_text.xpath('//b')
    assert len(bold_from_text) == 2

    # A parsed tree is handed back as the very same object.
    assert load_html(from_bytes) is from_bytes
Exemplo n.º 2
0
def test_load_html():
    """``load_html`` accepts bytes, text, and already-parsed trees."""
    markup = b"<div><b></b><b></b></div>"

    # Raw bytes are parsed into a tree.
    from_bytes = load_html(markup)
    bold_from_bytes = from_bytes.xpath('//b')
    assert len(bold_from_bytes) == 2

    # The decoded text contains the same two <b> elements.
    from_text = load_html(markup.decode('ascii'))
    bold_from_text = from_text.xpath('//b')
    assert len(bold_from_text) == 2

    # A parsed tree is handed back as the very same object.
    assert load_html(from_bytes) is from_bytes
Exemplo n.º 3
0
def test_get_cleaned_form_html_human_readable():
    """Human-readable cleaning drops style/script/div but keeps the fields."""
    original_form = load_html(FORM1)
    cleaned = get_cleaned_form_html(original_form, human_readable=True)
    for unwanted in ('style', 'script', 'div'):
        assert unwanted not in cleaned

    def name_value_pairs(tree):
        # (name, value) of every field to annotate, in the returned order.
        return [(field.name, field.value)
                for field in get_fields_to_annotate(tree)]

    # Re-parsing the cleaned markup must yield the same fields.
    pairs_before = name_value_pairs(original_form)
    pairs_after = name_value_pairs(load_html(cleaned))
    assert pairs_before == pairs_after
Exemplo n.º 4
0
def test_get_cleaned_form_html_human_readable():
    """Human-readable cleaning drops style/script/div but keeps the fields."""
    original_form = load_html(FORM1)
    cleaned = get_cleaned_form_html(original_form, human_readable=True)
    for unwanted in ('style', 'script', 'div'):
        assert unwanted not in cleaned

    def name_value_pairs(tree):
        # (name, value) of every field to annotate, in the returned order.
        return [(field.name, field.value)
                for field in get_fields_to_annotate(tree)]

    # Re-parsing the cleaned markup must yield the same fields.
    pairs_before = name_value_pairs(original_form)
    pairs_after = name_value_pairs(load_html(cleaned))
    assert pairs_before == pairs_after
Exemplo n.º 5
0
def test_add_text_before():
    """Text is inserted immediately before each targeted element."""
    root = load_html("<div><p>hello<br/>world</p><i>X</i></div>")
    insertions = [('//br', ","), ('//p', "!"), ('//i', "1")]
    for query, text in insertions:
        add_text_before(root.xpath(query)[0], text)

    expected = "<div>!<p>hello,<br>world</p>1<i>X</i>\n</div>"
    assert html_tostring(root).strip() == expected
Exemplo n.º 6
0
    def check(self, verbose=True):
        """
        Check that items in storage are correct; print the problems found.
        Return the number of errors found.

        For every entry of the index the following is verified:

        * the file exists inside ``self.folder``;
        * the number of ``<form>`` elements in the document equals the
          number of items in ``info["forms"]``;
        * ``info["visible_html_fields"]`` is present, has one item per
          ``<form>`` element, and the keys of each item equal the names of
          the elements returned by ``get_fields_to_annotate`` for that form.

        When ``verbose`` is true a tqdm progress bar is shown while the
        entries are processed.
        """
        index = self.get_index()
        items = list(index.items())
        errors = 0
        if verbose:
            items = tqdm(items,
                         "Checking",
                         leave=True,
                         mininterval=0,
                         ascii=True,
                         ncols=80,
                         unit=' files')
        for fn, info in items:
            fn_full = os.path.join(self.folder, fn)
            if not os.path.exists(fn_full):
                print("\nFile not found: %r" % fn_full)
                errors += 1
                continue

            with open(fn_full, 'rb') as f:
                data = f.read()

            doc = load_html(data, info['url'])
            # Run the XPath query once per document; every check below
            # reuses this list instead of querying the tree again.
            forms = doc.xpath("//form")
            n_forms = len(forms)
            n_answers = len(info["forms"])
            if n_forms != n_answers:
                errors += 1
                print("\nInvalid form count for entry %r: expected %d, got %d"
                      % (fn, n_forms, n_answers))

            if 'visible_html_fields' not in info:
                errors += 1
                print("No fields data for entry {!r}".format(fn))
                continue

            fields = info['visible_html_fields']
            if len(fields) != n_forms:
                errors += 1
                print(
                    "Invalid number of form field annotations for entry {!r}"
                    .format(fn))
                continue

            # Compare field names found in the HTML with the annotated ones.
            for idx, (form, fields_info) in enumerate(zip(forms, fields)):
                names = {elem.name for elem in get_fields_to_annotate(form)}
                annotated = set(fields_info.keys())
                if names != annotated:
                    errors += 1
                    print("Invalid field names for form #{}, "
                          "entry {!r}. Expected: {}, found: {}".format(
                              idx, fn, names, annotated))

        if not errors:
            print("Status: OK")
        else:
            print("Status: %d error(s) found" % errors)

        return errors
Exemplo n.º 7
0
def test_get_fields_to_annotate():
    """Fields to annotate in FORM1 all have names, matching the expected list."""
    form = get_forms(load_html(FORM1))[0]
    fields = get_fields_to_annotate(form)

    # Each returned element must expose a truthy ``name``.
    for field in fields:
        assert getattr(field, 'name', None)

    field_names = get_field_names(fields)
    assert field_names == ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']
    assert {field.name for field in fields} == set(field_names)
Exemplo n.º 8
0
def test_get_fields_to_annotate():
    """Fields to annotate in FORM1 all have names, matching the expected list."""
    form = get_forms(load_html(FORM1))[0]
    fields = get_fields_to_annotate(form)

    # Each returned element must expose a truthy ``name``.
    for field in fields:
        assert getattr(field, 'name', None)

    field_names = get_field_names(fields)
    assert field_names == ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']
    assert {field.name for field in fields} == set(field_names)
Exemplo n.º 9
0
def test_get_forms():
    """Forms are found regardless of the tag's letter case."""
    markup = """
    <p>some text</p>
    <form action="/go">hi</form>
    <FORM method='post'><input name='foo'></FORM>
    """
    found = get_forms(load_html(markup))
    assert len(found) == 2

    lower_case_form, upper_case_form = found[0], found[1]
    assert lower_case_form.action == "/go"
    assert upper_case_form.method == "POST"
Exemplo n.º 10
0
 def get_tree(self, path, info=None):
     """
     Parse one stored HTML file into a tree.

     ``path`` is the file path relative to ``self.folder`` (a key of the
     index.json file); ``info`` is the matching annotation data (a value of
     the index.json file). When ``info`` is omitted it is looked up in the
     index by ``path``.
     """
     record = self.get_index()[path] if info is None else info
     full_path = os.path.join(self.folder, path)
     with open(full_path, "rb") as stream:
         raw = stream.read()
     return load_html(raw, record["url"])
Exemplo n.º 11
0
 def get_tree(self, path, info=None):
     """
     Parse one stored HTML file into a tree.

     ``path`` is the file path relative to ``self.folder`` (a key of the
     index.json file); ``info`` is the matching annotation data (a value of
     the index.json file). When ``info`` is omitted it is looked up in the
     index by ``path``.
     """
     record = self.get_index()[path] if info is None else info
     full_path = os.path.join(self.folder, path)
     with open(full_path, "rb") as stream:
         raw = stream.read()
     return load_html(raw, record["url"])
Exemplo n.º 12
0
def test_get_forms():
    """Forms are found regardless of the tag's letter case."""
    markup = """
    <p>some text</p>
    <form action="/go">hi</form>
    <FORM method='post'><input name='foo'></FORM>
    """
    found = get_forms(load_html(markup))
    assert len(found) == 2

    lower_case_form, upper_case_form = found[0], found[1]
    assert lower_case_form.action == "/go"
    assert upper_case_form.method == "POST"
Exemplo n.º 13
0
    def check(self):
        """
        Check that items in storage are correct; print the problems found.
        Return the number of errors found.

        For every entry of the index the following is verified:

        * the file exists inside ``self.folder``;
        * the number of ``<form>`` elements in the document equals the
          number of items in ``info["forms"]``;
        * ``info["visible_html_fields"]`` is present, has one item per
          ``<form>`` element, and the keys of each item equal the names of
          the elements returned by ``get_fields_to_annotate`` for that form.

        A tqdm progress bar is shown while the entries are processed.
        """
        index = self.get_index()
        items = list(index.items())
        errors = 0
        for fn, info in tqdm(items, "Checking", leave=True, mininterval=0,
                             ascii=True, ncols=80, unit=' files'):
            fn_full = os.path.join(self.folder, fn)
            if not os.path.exists(fn_full):
                print("\nFile not found: %r" % fn_full)
                errors += 1
                continue

            with open(fn_full, 'rb') as f:
                data = f.read()

            doc = load_html(data, info['url'])
            # Run the XPath query once per document; every check below
            # reuses this list instead of querying the tree again.
            forms = doc.xpath("//form")
            n_forms = len(forms)
            n_answers = len(info["forms"])
            if n_forms != n_answers:
                errors += 1
                print("\nInvalid form count for entry %r: expected %d, got %d"
                      % (fn, n_forms, n_answers))

            if 'visible_html_fields' not in info:
                errors += 1
                print("No fields data for entry {!r}".format(fn))
                continue

            fields = info['visible_html_fields']
            if len(fields) != n_forms:
                errors += 1
                print(
                    "Invalid number of form field annotations for entry {!r}"
                    .format(fn))
                continue

            # Compare field names found in the HTML with the annotated ones.
            for idx, (form, fields_info) in enumerate(zip(forms, fields)):
                names = {elem.name for elem in get_fields_to_annotate(form)}
                annotated = set(fields_info.keys())
                if names != annotated:
                    errors += 1
                    print("Invalid field names for form #{}, "
                          "entry {!r}. Expected: {}, found: {}".format(
                              idx, fn, names, annotated))

        if not errors:
            print("Status: OK")
        else:
            print("Status: %d error(s) found" % errors)

        return errors
Exemplo n.º 14
0
 def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
     """
     Find all forms in a document and classify each of them.

     ``tree_or_html`` is a lxml tree or HTML source code. The result is a
     list of ``(form_elem, form_info)`` tuples, where ``form_info`` comes
     from :meth:`classify_proba` (called with ``threshold``) when ``proba``
     is true, and from :meth:`classify` otherwise.
     """
     results = []
     for form in get_forms(load_html(tree_or_html)):
         if proba:
             form_info = self.classify_proba(form, threshold)
         else:
             form_info = self.classify(form)
         results.append((form, form_info))
     return results
Exemplo n.º 15
0
 def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
     """
     Find all forms in a document and classify each of them.

     ``tree_or_html`` is a lxml tree or HTML source code. The result is a
     list of ``(form_elem, form_info)`` tuples, where ``form_info`` comes
     from :meth:`classify_proba` (called with ``threshold``) when ``proba``
     is true, and from :meth:`classify` otherwise.
     """
     results = []
     for form in get_forms(load_html(tree_or_html)):
         if proba:
             form_info = self.classify_proba(form, threshold)
         else:
             form_info = self.classify(form)
         results.append((form, form_info))
     return results
Exemplo n.º 16
0
    def add_result(self, html, url, form_answers=None,
                   visible_html_fields=None, index=None, add_empty=True):
        """
        Store an HTML page together with its <form> and form field types.

        The page is written to the file named by ``generate_filename(url)``
        and an entry with ``url``, ``forms`` and ``visible_html_fields`` is
        put into the index, which is then saved with ``write_index``.

        * ``form_answers`` - one item per <form> (lengths must match);
          when omitted, the form schema's ``na_value`` is used for each form.
        * ``visible_html_fields`` - when omitted, a ``{field name: na_value}``
          dict (``na_value`` of the field schema) is built for each form.
        * ``index`` - index to update; loaded with ``get_index`` when omitted.
        * ``add_empty`` - when false, pages without forms, or whose forms
          have no fields to annotate, are not saved and ``None`` is returned.

        Return the path of the saved file relative to ``self.folder``.
        """
        forms = get_forms(load_html(html))
        if not add_empty:
            nothing_to_annotate = len(forms) == 0 or all(
                len(get_fields_to_annotate(form)) == 0 for form in forms)
            if nothing_to_annotate:
                return

        if form_answers is not None:
            assert len(form_answers) == len(forms)
        else:
            form_schema = self.get_form_schema()
            form_answers = [form_schema.na_value for _ in forms]

        if visible_html_fields is None:
            field_schema = self.get_field_schema()
            visible_html_fields = []
            for form in forms:
                names = get_field_names(get_fields_to_annotate(form))
                visible_html_fields.append(
                    {name: field_schema.na_value for name in names})

        filename = self.generate_filename(url)
        path = os.path.relpath(filename, self.folder)
        if index is None:
            index = self.get_index()
        entry = {
            "url": url,
            "forms": form_answers,
            "visible_html_fields": visible_html_fields,
        }
        index[path] = entry

        with open(filename, 'wb') as out:
            # Text input is stored UTF-8 encoded; bytes are stored as is.
            payload = html if isinstance(html, bytes) else html.encode('utf8')
            out.write(payload)
        self.write_index(index)
        return path
Exemplo n.º 17
0
    def add_result(self,
                   html,
                   url,
                   form_answers=None,
                   visible_html_fields=None,
                   index=None,
                   add_empty=True):
        """
        Store an HTML page together with its <form> and form field types.

        The page is written to the file named by ``generate_filename(url)``
        and an entry with ``url``, ``forms`` and ``visible_html_fields`` is
        put into the index, which is then saved with ``write_index``.

        * ``form_answers`` - one item per <form> (lengths must match);
          when omitted, the form schema's ``na_value`` is used for each form.
        * ``visible_html_fields`` - when omitted, a ``{field name: na_value}``
          dict (``na_value`` of the field schema) is built for each form.
        * ``index`` - index to update; loaded with ``get_index`` when omitted.
        * ``add_empty`` - when false, pages without forms, or whose forms
          have no fields to annotate, are not saved and ``None`` is returned.

        Return the path of the saved file relative to ``self.folder``.
        """
        forms = get_forms(load_html(html))
        if not add_empty:
            nothing_to_annotate = len(forms) == 0 or all(
                len(get_fields_to_annotate(form)) == 0 for form in forms)
            if nothing_to_annotate:
                return

        if form_answers is not None:
            assert len(form_answers) == len(forms)
        else:
            form_schema = self.get_form_schema()
            form_answers = [form_schema.na_value for _ in forms]

        if visible_html_fields is None:
            field_schema = self.get_field_schema()
            visible_html_fields = []
            for form in forms:
                names = get_field_names(get_fields_to_annotate(form))
                visible_html_fields.append(
                    {name: field_schema.na_value for name in names})

        filename = self.generate_filename(url)
        path = os.path.relpath(filename, self.folder)
        if index is None:
            index = self.get_index()
        entry = {
            "url": url,
            "forms": form_answers,
            "visible_html_fields": visible_html_fields,
        }
        index[path] = entry

        with open(filename, 'wb') as out:
            # Text input is stored UTF-8 encoded; bytes are stored as is.
            payload = html if isinstance(html, bytes) else html.encode('utf8')
            out.write(payload)
        self.write_index(index)
        return path
Exemplo n.º 18
0
    def extract_forms(self, tree_or_html, proba=False, threshold=0.05,
                      fields=True):
        """
        Find all forms in a document and classify each of them.

        ``tree_or_html`` is a lxml tree or HTML source code; text and bytes
        are parsed with ``load_html``, anything else is used as the tree.

        Return a list of ``(form_elem, form_info)`` tuples. ``form_info``
        comes from :meth:`classify_proba` (called with ``threshold``) when
        ``proba`` is true, and from :meth:`classify` otherwise.

        When ``fields`` is False, field type information is not computed.
        """
        is_markup = isinstance(tree_or_html, (six.string_types, bytes))
        tree = load_html(tree_or_html) if is_markup else tree_or_html

        results = []
        for form in get_forms(tree):
            if proba:
                form_info = self.classify_proba(form, threshold, fields)
            else:
                form_info = self.classify(form, fields)
            results.append((form, form_info))
        return results
Exemplo n.º 19
0
def test_get_text_around_elems():
    """Check the text collected before and after each form field."""
    tree = load_html("""
        <form>
            <h1>Login</h1>
            Please <b>enter</b> your details
            <p>
                Username: <input name='username'/> required
                <div>Email:</div> <input type='text' name='email'> *
            </p>
            Thanks!
        </form>
    """)
    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)
    assert len(before) == 2
    assert before[user] == 'Login  Please  enter  your details  Username:'
    # The text between the two inputs is both "after user" and
    # "before email".
    assert before[email] == 'required  Email:'

    assert len(after) == 2
    assert after[user] == 'required  Email:'
    assert after[email] == '* Thanks!'

    # With no elements both mappings are expected to be empty; a bare
    # comparison expression here would not verify anything.
    no_before, no_after = get_text_around_elems(tree, [])
    assert no_before == {}
    assert no_after == {}
Exemplo n.º 20
0
def test_get_text_around_elems():
    """Check the text collected before and after each form field."""
    tree = load_html("""
        <form>
            <h1>Login</h1>
            Please <b>enter</b> your details
            <p>
                Username: <input name='username'/> required
                <div>Email:</div> <input type='text' name='email'> *
            </p>
            Thanks!
        </form>
    """)
    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)
    assert len(before) == 2
    assert before[user] == 'Login  Please  enter  your details  Username:'
    # The text between the two inputs is both "after user" and
    # "before email".
    assert before[email] == 'required  Email:'

    assert len(after) == 2
    assert after[user] == 'required  Email:'
    assert after[email] == '* Thanks!'

    # With no elements both mappings are expected to be empty; a bare
    # comparison expression here would not verify anything.
    no_before, no_after = get_text_around_elems(tree, [])
    assert no_before == {}
    assert no_after == {}
Exemplo n.º 21
0
def main():
    """
    Command-line entry point.

    Parses the arguments with docopt and runs the selected command:
    ``check-data``, ``train``, ``run`` or ``evaluate``.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    folder_arg = args['--data-folder']
    data_folder = DEFAULT_DATA_PATH if folder_arg is None else folder_arg
    storage = Storage(data_folder)

    def print_ranked(probabilities, separator):
        # Print "<type> <percent>%" pairs, most probable first.
        for label, prob in Counter(probabilities).most_common():
            print("%s %0.1f%%" % (label, prob * 100), end=separator)

    if args['check-data']:
        # Validate the stored data; exit with status 1 if errors were found.
        error_count = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", error_count)
        if error_count:
            sys.exit(1)
        return

    if args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading data...")
        url = args["<url>"]
        page = download(url)
        tree = load_html(page, url)

        extracted = classifier.extract_forms(tree, proba=True,
                                             threshold=threshold)
        if not extracted:
            print("No forms found.")
            return

        for form, info in extracted:
            print("\n")
            print("=" * 60)
            print(get_cleaned_form_html(form))
            print("-" * 60)
            print("Form type:    ", end="")
            print_ranked(info['form'], '    ')

            print("\n\nField types:")
            for field_name, probs in info['fields'].items():
                print(field_name, end=':  ')
                print_ranked(probs, '  ')
                print("")

            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        annotations = list(storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_folds=n_folds)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_folds=n_folds)
Exemplo n.º 22
0
def main():
    """
    Command-line entry point.

    Parses the arguments with docopt and runs the selected command:
    ``check-data``, ``train``, ``init``, ``run`` or ``evaluate``.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    folder_arg = args['--data-folder']
    data_folder = DEFAULT_DATA_PATH if folder_arg is None else folder_arg
    storage = Storage(data_folder)

    def print_ranked(probabilities, separator):
        # Print "<type> <percent>%" pairs, most probable first.
        for label, prob in Counter(probabilities).most_common():
            print("%s %0.1f%%" % (label, prob * 100), end=separator)

    if args['check-data']:
        # Validate the stored data; exit with status 1 if errors were found.
        error_count = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", error_count)
        if error_count:
            sys.exit(1)
        return

    if args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])
        return

    if args['init']:
        # Only loads the classifier; the returned object is not used.
        formasaurus.FormFieldClassifier.load()
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        url = args["<url>"]
        print("Downloading {}...".format(url))
        page = download(url)
        tree = load_html(page, url)

        extracted = classifier.extract_forms(tree, proba=True,
                                             threshold=threshold)
        if not extracted:
            print("No forms found.")
            return

        for form, info in extracted:
            print("\n")
            print("=" * 60)
            print(get_cleaned_form_html(form))
            print("-" * 60)
            print("Form type:    ", end="")
            print_ranked(info['form'], '    ')

            print("\n\nField types:")
            for field_name, probs in info['fields'].items():
                print(field_name, end=':  ')
                print_ranked(probs, '  ')
                print("")

            print("")
        return

    if args['evaluate']:
        n_splits = int(args["--cv"])
        annotations = list(storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_splits=n_splits)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_splits=n_splits)
Exemplo n.º 23
0
def test_add_text_before_root():
    """Text can be inserted before the outermost ``<p>`` element."""
    root = load_html("<p>hello<br/>world</p>")
    paragraph = root.xpath('//p')[0]
    add_text_before(paragraph, "!")

    rendered = html_tostring(root).strip()
    assert rendered == "!<p>hello<br>world</p>"
Exemplo n.º 24
0
def test_html_tostring():
    src = "<form><input value='hello'><input type='submit'></form>"
    tree = load_html(src)
    assert html_tostring(tree) == """<form>
Exemplo n.º 25
0
def test_add_text_after():
    """Text is inserted immediately after each targeted element."""
    root = load_html("<p>hello,<br/>world</p>")
    insertions = [('//br', "brave new "), ('//p', "!")]
    for query, text in insertions:
        add_text_after(root.xpath(query)[0], text)

    rendered = html_tostring(root).strip()
    assert rendered == "<p>hello,<br>brave new world</p>!"
Exemplo n.º 26
0
def test_add_text_before():
    """Text is inserted immediately before each targeted element."""
    root = load_html("<div><p>hello<br/>world</p><i>X</i></div>")
    insertions = [('//br', ","), ('//p', "!"), ('//i', "1")]
    for query, text in insertions:
        add_text_before(root.xpath(query)[0], text)

    expected = "<div>!<p>hello,<br>world</p>1<i>X</i>\n</div>"
    assert html_tostring(root).strip() == expected
Exemplo n.º 27
0
def test_add_text_after():
    """Text is inserted immediately after each targeted element."""
    root = load_html("<p>hello,<br/>world</p>")
    insertions = [('//br', "brave new "), ('//p', "!")]
    for query, text in insertions:
        add_text_after(root.xpath(query)[0], text)

    rendered = html_tostring(root).strip()
    assert rendered == "<p>hello,<br>brave new world</p>!"
Exemplo n.º 28
0
def test_add_text_before_root():
    """Text can be inserted before the outermost ``<p>`` element."""
    root = load_html("<p>hello<br/>world</p>")
    paragraph = root.xpath('//p')[0]
    add_text_before(paragraph, "!")

    rendered = html_tostring(root).strip()
    assert rendered == "!<p>hello<br>world</p>"
Exemplo n.º 29
0
def test_html_tostring():
    src = "<form><input value='hello'><input type='submit'></form>"
    tree = load_html(src)
    assert html_tostring(tree) == """<form>