예제 #1
0
def HtmlView(form, field_name=None):
    """
    Build a vertical box with a rendered form and its simplified source.

    The form is passed through ``get_cleaned_form_html`` twice: the
    ``human_readable=False`` result feeds a ``RawHtml`` widget (limited to
    a height of 600) and the ``human_readable=True`` result feeds an
    ``HtmlCode`` widget (no height limit). Both widgets receive
    ``field_name`` and are stacked in a ``widgets.VBox``, rendered view
    first.
    """
    def cleaned(readable):
        return get_cleaned_form_html(form, human_readable=readable)

    # Same call order as before: readable source first, compact markup second.
    readable_markup = cleaned(True)
    compact_markup = cleaned(False)

    rendered_view = RawHtml(compact_markup, field_name, max_height=600)
    source_view = HtmlCode(readable_markup, field_name, max_height=None)
    return widgets.VBox([rendered_view, source_view])
예제 #2
0
def HtmlView(form, field_name=None):
    """
    Return a ``widgets.VBox`` showing a form twice.

    The first child is a ``RawHtml`` widget built from the cleaned,
    non-human-readable form HTML (``max_height=600``); the second is an
    ``HtmlCode`` widget built from the cleaned, human-readable HTML
    (``max_height=None``). ``field_name`` is forwarded to both widgets.
    """
    source_markup = get_cleaned_form_html(form, human_readable=True)
    display_markup = get_cleaned_form_html(form, human_readable=False)

    children = [
        RawHtml(display_markup, field_name, max_height=600),
        HtmlCode(source_markup, field_name, max_height=None),
    ]
    return widgets.VBox(children)
예제 #3
0
def test_get_cleaned_form_html_human_readable():
    """
    Human-readable cleaned HTML must not contain the substrings
    'style', 'script' or 'div', and re-parsing it must yield the same
    (name, value) pairs for the fields to annotate as the original form.
    """
    original_form = load_html(FORM1)
    cleaned = get_cleaned_form_html(original_form, human_readable=True)

    # Plain substring checks against the serialized markup.
    for unwanted in ('style', 'script', 'div'):
        assert unwanted not in cleaned

    def name_value_pairs(tree):
        return [(field.name, field.value)
                for field in get_fields_to_annotate(tree)]

    fields_before = name_value_pairs(original_form)
    fields_after = name_value_pairs(load_html(cleaned))
    assert fields_before == fields_after
예제 #4
0
def test_get_cleaned_form_html_human_readable():
    """
    Check the ``human_readable=True`` output of ``get_cleaned_form_html``:
    it contains none of 'style', 'script', 'div' as substrings, and the
    fields to annotate keep their names and values after a round trip
    through ``load_html``.
    """
    source_form = load_html(FORM1)
    readable_html = get_cleaned_form_html(source_form, human_readable=True)

    banned_words = ('style', 'script', 'div')
    assert not any(word in readable_html for word in banned_words)

    expected_pairs = [
        (fld.name, fld.value) for fld in get_fields_to_annotate(source_form)
    ]
    reparsed_form = load_html(readable_html)
    actual_pairs = [
        (fld.name, fld.value) for fld in get_fields_to_annotate(reparsed_form)
    ]
    assert expected_pairs == actual_pairs
예제 #5
0
def main():
    """
    Command-line entry point.

    Arguments are parsed by docopt from the module docstring. The data
    folder comes from ``--data-folder`` and falls back to
    ``DEFAULT_DATA_PATH`` when the option is not given. Exactly one of the
    following commands is executed, checked in this order:

    * ``check-data`` -- run ``storage.check()``, print form type counts
      (unsimplified, then simplified) and the errors; exit with status 1
      when errors are reported.
    * ``train`` -- train a ``FormFieldClassifier`` on the data folder and
      save it to ``<modelfile>``.
    * ``init`` -- call ``FormFieldClassifier.load()`` without arguments.
    * ``run`` -- load a classifier from ``<modelfile>``, download
      ``<url>``, extract forms and print the form/field type
      probabilities for each of them.
    * ``evaluate`` -- print classification reports for the form and/or
      field classifiers, using ``--cv`` as the number of splits.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    folder_option = args['--data-folder']
    data_folder = DEFAULT_DATA_PATH if folder_option is None else folder_option
    storage = Storage(data_folder)

    def print_ranked(probabilities, separator):
        # Most probable label first, each one shown as a percentage.
        for label, probability in Counter(probabilities).most_common():
            print("%s %0.1f%%" % (label, probability * 100), end=separator)

    if args['check-data']:
        errors = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", errors)
        if errors:
            sys.exit(1)
        return

    if args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])
        return

    if args['init']:
        formasaurus.FormFieldClassifier.load()
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading {}...".format(args["<url>"]))
        page = download(args["<url>"])
        tree = load_html(page, args['<url>'])

        extracted = classifier.extract_forms(tree, proba=True,
                                             threshold=threshold)
        if not extracted:
            print("No forms found.")
            return

        rule_width = 60
        for form, info in extracted:
            print("\n")
            print("=" * rule_width)
            print(get_cleaned_form_html(form))
            print("-" * rule_width)
            print("Form type:    ", end="")
            print_ranked(info['form'], '    ')

            print("\n\nField types:")
            for field_name, probs in info['fields'].items():
                print(field_name, end=':  ')
                print_ranked(probs, '  ')
                print("")

            print("")
        return

    if args['evaluate']:
        n_splits = int(args["--cv"])
        iteration_options = dict(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        )
        annotations = list(storage.iter_annotations(**iteration_options))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_splits=n_splits)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_splits=n_splits)
예제 #6
0
def main():
    """
    Command-line entry point.

    Parses arguments with docopt (usage text is the module docstring),
    resolves the data folder (``--data-folder`` or ``DEFAULT_DATA_PATH``),
    creates a ``Storage`` for it and dispatches to the first matching
    command among ``check-data``, ``train``, ``run`` and ``evaluate``.
    Each command is implemented by a local helper below.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    data_folder = args['--data-folder']
    if data_folder is None:
        data_folder = DEFAULT_DATA_PATH

    storage = Storage(data_folder)

    def check_data():
        # Print counts both ways, then the errors; a truthy error value
        # terminates the process with exit status 1.
        problems = storage.check()
        storage.print_form_type_counts(simplify=False)
        storage.print_form_type_counts(simplify=True)
        print("Errors:", problems)
        if problems:
            sys.exit(1)

    def train():
        # Train on the data folder and save the model to <modelfile>.
        extractor = formasaurus.FormFieldClassifier.trained_on(data_folder)
        extractor.save(args["<modelfile>"])

    def describe(form, info):
        # Print one form: cleaned HTML, then form and field type
        # probabilities, most probable first.
        print("\n")
        print("=" * 60)
        print(get_cleaned_form_html(form))
        print("-" * 60)
        print("Form type:    ", end="")
        for form_type, probability in Counter(info['form']).most_common():
            print("%s %0.1f%%" % (form_type, probability * 100), end='    ')

        print("\n\nField types:")
        for name, field_probs in info['fields'].items():
            print(name, end=':  ')
            for field_type, probability in Counter(field_probs).most_common():
                print("%s %0.1f%%" % (field_type, probability * 100),
                      end='  ')
            print("")

        print("")

    def run():
        # Load the model, fetch <url>, extract the forms and print them.
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        extractor = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading data...")
        body = download(args["<url>"])
        tree = load_html(body, args['<url>'])

        found = extractor.extract_forms(tree, proba=True, threshold=threshold)
        if not found:
            print("No forms found.")
            return

        for form, info in found:
            describe(form, info)

    def evaluate():
        # --cv is passed to the reports as n_folds.
        n_folds = int(args["--cv"])
        annotation_iter = storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        )
        annotations = list(annotation_iter)

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(annotations,
                                                       n_folds=n_folds)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(annotations,
                                                        n_folds=n_folds)

    if args['check-data']:
        check_data()
    elif args['train']:
        train()
    elif args['run']:
        run()
    elif args['evaluate']:
        evaluate()