def HtmlView(form, field_name=None):
    """
    Build a widget showing a form both rendered and as simplified source.

    The form is cleaned with ``get_cleaned_form_html`` twice: once with
    ``human_readable=True`` and once with ``human_readable=False``.
    The non-human-readable markup goes to ``RawHtml`` (``max_height=600``),
    the human-readable markup goes to ``HtmlCode`` (``max_height=None``).
    ``field_name`` is forwarded unchanged to both.

    Returns a ``widgets.VBox`` containing the ``RawHtml`` item followed by
    the ``HtmlCode`` item.
    """
    readable_markup = get_cleaned_form_html(form, human_readable=True)
    compact_markup = get_cleaned_form_html(form, human_readable=False)

    children = [
        # Rendered view of the cleaned form.
        RawHtml(compact_markup, field_name, max_height=600),
        # Source listing of the cleaned form, with no height limit passed.
        HtmlCode(readable_markup, field_name, max_height=None),
    ]
    return widgets.VBox(children)
def HtmlView(form, field_name=None):
    """
    Return a ``widgets.VBox`` with the rendered form and its source code.

    :param form: form passed to ``get_cleaned_form_html``.
    :param field_name: forwarded as the second positional argument to
        both ``RawHtml`` and ``HtmlCode``; defaults to ``None``.
    :return: ``widgets.VBox`` whose children are, in order, a ``RawHtml``
        built from the ``human_readable=False`` markup (``max_height=600``)
        and an ``HtmlCode`` built from the ``human_readable=True`` markup
        (``max_height=None``).
    """
    # The generator is consumed left to right, so the human-readable
    # variant is produced first and the compact one second.
    pretty_html, plain_html = (
        get_cleaned_form_html(form, human_readable=flag)
        for flag in (True, False)
    )
    rendered = RawHtml(plain_html, field_name, max_height=600)
    listing = HtmlCode(pretty_html, field_name, max_height=None)
    return widgets.VBox([rendered, listing])
def test_get_cleaned_form_html_human_readable():
    """
    Check the human-readable output of ``get_cleaned_form_html`` on FORM1.

    The output must not contain the words 'style', 'script' or 'div', and
    re-parsing it must give the same (name, value) pairs from
    ``get_fields_to_annotate`` as the original form.
    """
    def name_value_pairs(tree):
        # (name, value) for every field get_fields_to_annotate yields.
        return [(f.name, f.value) for f in get_fields_to_annotate(tree)]

    original_form = load_html(FORM1)
    cleaned = get_cleaned_form_html(original_form, human_readable=True)

    for unwanted in ('style', 'script', 'div'):
        assert unwanted not in cleaned

    # Round trip: the cleaned markup must keep the annotatable fields.
    assert name_value_pairs(original_form) == name_value_pairs(load_html(cleaned))
def test_get_cleaned_form_html_human_readable():
    """
    Human-readable cleaning of FORM1 drops unwanted markup, keeps fields.

    Asserts that none of 'style', 'script', 'div' occurs in the cleaned
    output, and that the (name, value) pairs reported by
    ``get_fields_to_annotate`` are the same before cleaning and after
    loading the cleaned output again.
    """
    source_form = load_html(FORM1)
    markup = get_cleaned_form_html(source_form, human_readable=True)

    forbidden = ('style', 'script', 'div')
    assert not any(word in markup for word in forbidden)

    expected = [
        (field.name, field.value)
        for field in get_fields_to_annotate(source_form)
    ]
    reparsed_form = load_html(markup)
    actual = [
        (field.name, field.value)
        for field in get_fields_to_annotate(reparsed_form)
    ]
    assert expected == actual
def main():
    """
    Command-line entry point.

    Parses the command line with docopt, using the module docstring as
    the usage text, then runs the first matching command:

    * ``check-data`` -- run ``storage.check()``, print the form type
      counts (unsimplified, then simplified) and the errors; exit with
      status 1 when the errors value is truthy.
    * ``train`` -- train a ``FormFieldClassifier`` on the data folder and
      save it to ``<modelfile>``.
    * ``init`` -- call ``FormFieldClassifier.load()`` without arguments.
    * ``run`` -- load the classifier from ``<modelfile>``, download
      ``<url>``, extract forms and print, for every form, its cleaned
      HTML and the form / field type probabilities as percentages.
    * ``evaluate`` -- print classification reports for the form and/or
      field classifiers using ``--cv`` splits.

    ``--data-folder`` falls back to ``DEFAULT_DATA_PATH`` when it is None.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    data_folder = args['--data-folder']
    data_folder = DEFAULT_DATA_PATH if data_folder is None else data_folder
    storage = Storage(data_folder)

    if args['check-data']:
        problems = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", problems)
        if problems:
            sys.exit(1)
        return

    if args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])
        return

    if args['init']:
        formasaurus.FormFieldClassifier.load()
        return

    if args['run']:
        def show_ranked(probabilities):
            # One "<label> <percent>%" entry per label, in most_common()
            # order, all on the current output line.
            for label, p in Counter(probabilities).most_common():
                print("%s %0.1f%%" % (label, p * 100), end=' ')

        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading {}...".format(args["<url>"]))
        page = download(args["<url>"])
        tree = load_html(page, args['<url>'])
        extracted = classifier.extract_forms(
            tree, proba=True, threshold=threshold)
        if not extracted:
            print("No forms found.")
            return

        heavy_rule, light_rule = "=" * 60, "-" * 60
        for form, info in extracted:
            print("\n")
            print(heavy_rule)
            print(get_cleaned_form_html(form))
            print(light_rule)
            print("Form type: ", end="")
            show_ranked(info['form'])
            print("\n\nField types:")
            for field_name, field_probs in info['fields'].items():
                print(field_name, end=': ')
                show_ranked(field_probs)
                print("")
            print("")
        return

    if args['evaluate']:
        n_splits = int(args["--cv"])
        # Materialize the annotations once; both reports use the same list.
        annotations = list(storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        ))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_splits=n_splits)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_splits=n_splits)
def main():
    """
    Command-line entry point.

    Parses the command line with docopt, using the module docstring as
    the usage text, then runs the first matching command:

    * ``check-data`` -- run ``storage.check()``, print the form type
      counts (unsimplified, then simplified) and the errors; exit with
      status 1 when the errors value is truthy.
    * ``train`` -- train a ``FormFieldClassifier`` on the data folder and
      save it to ``<modelfile>``.
    * ``run`` -- load the classifier from ``<modelfile>``, download
      ``<url>``, extract forms and print, for every form, its cleaned
      HTML and the form / field type probabilities as percentages.
    * ``evaluate`` -- print classification reports for the form and/or
      field classifiers using ``--cv`` folds.

    ``--data-folder`` falls back to ``DEFAULT_DATA_PATH`` when it is None.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    data_folder = args['--data-folder']
    data_folder = DEFAULT_DATA_PATH if data_folder is None else data_folder
    storage = Storage(data_folder)

    if args['check-data']:
        found_errors = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", found_errors)
        if found_errors:
            sys.exit(1)
        return

    if args['train']:
        model = formasaurus.FormFieldClassifier.trained_on(data_folder)
        model.save(args["<modelfile>"])
        return

    if args['run']:
        def print_percentages(probabilities):
            # One "<label> <percent>%" entry per label, in most_common()
            # order, all on the current output line.
            for label, p in Counter(probabilities).most_common():
                print("%s %0.1f%%" % (label, p * 100), end=' ')

        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        model = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading data...")
        body = download(args["<url>"])
        tree = load_html(body, args['<url>'])
        forms_found = model.extract_forms(
            tree, proba=True, threshold=threshold)
        if not forms_found:
            print("No forms found.")
            return

        for form, info in forms_found:
            print("\n")
            print("=" * 60)
            print(get_cleaned_form_html(form))
            print("-" * 60)
            print("Form type: ", end="")
            print_percentages(info['form'])
            print("\n\nField types:")
            for field_name, field_probs in info['fields'].items():
                print(field_name, end=': ')
                print_percentages(field_probs)
                print("")
            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        # Materialize the annotations once; both reports use the same list.
        annotations = list(storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        ))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_folds=n_folds)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_folds=n_folds)