예제 #1
0
 def on_submit(_):
     """Handle a submit event from the URL input field.

     Reads the stripped text of ``url_field``, downloads that URL, and hands
     the page to ``storage.add_result`` with ``add_empty=False``. The outcome
     is printed to stdout and the input field is then cleared. The event
     argument is ignored.
     """
     address = url_field.value.strip()
     page = download(address)
     saved_to = storage.add_result(page, address, add_empty=False)
     if saved_to is not None:
         print("Added:", saved_to, address)
     else:
         # add_result returned None; this is reported as "no forms".
         print("No forms at ", address)
     # Clear the input field.
     url_field.value = ""
예제 #2
0
 def on_submit(_):
     """Handle a submit event from the URL input field.

     Downloads the URL currently entered in ``url_field`` (whitespace
     stripped), passes the page to ``storage.add_result`` with
     ``add_empty=False``, prints the outcome to stdout and clears the
     input field. The event argument is ignored.
     """
     url = url_field.value.strip()
     html = download(url)
     path = storage.add_result(html, url, add_empty=False)
     if path is None:
         # Pass the parts as separate arguments. Wrapping them in a tuple
         # makes Python 3 print the tuple's repr, e.g.
         # ``('No forms at ', 'http://...')``, instead of the message.
         print("No forms at ", url)
     else:
         print("Added:", path, url)
     url_field.value = ""
예제 #3
0
def main():
    """Command-line entry point.

    Parses the command line with docopt (the module docstring is the usage
    text) and runs the selected command:

    * ``check-data`` -- run ``storage.check()``, print the form type counts
      (unsimplified, then simplified) and the check result; exit with
      status 1 when that result is truthy.
    * ``train`` -- train a ``FormFieldClassifier`` on the data folder and
      save it to ``<modelfile>``.
    * ``init`` -- call ``FormFieldClassifier.load()`` without arguments.
    * ``run`` -- download ``<url>``, extract forms with the classifier
      loaded from ``<modelfile>`` and print form / field type
      probabilities as percentages.
    * ``evaluate`` -- print classification reports for the form and/or
      field type models; ``--cv`` is passed on as ``n_splits``.

    ``--data-folder`` falls back to ``DEFAULT_DATA_PATH`` when it is None.
    """
    opts = docopt.docopt(__doc__, version=formasaurus.__version__)

    folder_opt = opts['--data-folder']
    data_folder = DEFAULT_DATA_PATH if folder_opt is None else folder_opt
    storage = Storage(data_folder)

    if opts['check-data']:
        problems = storage.check()
        for simplified in (False, True):
            storage.print_form_type_counts(simplify=simplified)
        print("Errors:", problems)
        if problems:
            sys.exit(1)
        return

    if opts['train']:
        trained = formasaurus.FormFieldClassifier.trained_on(data_folder)
        trained.save(opts["<modelfile>"])
        return

    if opts['init']:
        formasaurus.FormFieldClassifier.load()
        return

    if opts['run']:
        min_proba = float(opts['--threshold'])
        print("Loading the extractor..")
        extractor = formasaurus.FormFieldClassifier.load(opts["<modelfile>"])
        page_url = opts["<url>"]
        print("Downloading {}...".format(page_url))
        raw_page = download(page_url)
        tree = load_html(raw_page, page_url)

        found = extractor.extract_forms(tree, proba=True, threshold=min_proba)
        if not found:
            print("No forms found.")
            return

        # Shared "<label> <percent>%" template for form and field types.
        entry_fmt = "%s %0.1f%%"
        rule_width = 60
        for form, info in found:
            print("\n")
            print("=" * rule_width)
            print(get_cleaned_form_html(form))
            print("-" * rule_width)

            print("Form type:    ", end="")
            for label, score in Counter(info['form']).most_common():
                print(entry_fmt % (label, score * 100), end='    ')

            print("\n\nField types:")
            for name, scores in info['fields'].items():
                print(name, end=':  ')
                for label, score in Counter(scores).most_common():
                    print(entry_fmt % (label, score * 100), end='  ')
                print("")

            print("")
        return

    if not opts['evaluate']:
        return

    n_splits = int(opts["--cv"])
    annotations = list(storage.iter_annotations(
        verbose=True,
        leave=True,
        simplify_form_types=True,
        simplify_field_types=True,
    ))

    if opts['forms'] or opts['all']:
        print("Evaluating form classifier...\n")
        formtype_model.print_classification_report(
            annotations, n_splits=n_splits)
        print("")

    if opts['fields'] or opts['all']:
        print("Evaluating form field classifier...\n")
        fieldtype_model.print_classification_report(
            annotations, n_splits=n_splits)
예제 #4
0
def main():
    """Command-line entry point.

    Parses the command line with docopt (the module docstring is the usage
    text), opens a ``Storage`` on ``--data-folder`` (``DEFAULT_DATA_PATH``
    when the option is None) and dispatches on the chosen command:

    * ``check-data`` -- run ``storage.check()``, print the form type counts
      (unsimplified, then simplified) and the check result; exit with
      status 1 when that result is truthy.
    * ``train`` -- train a ``FormFieldClassifier`` on the data folder and
      save it to ``<modelfile>``.
    * ``run`` -- download ``<url>``, extract forms with the classifier
      loaded from ``<modelfile>`` and print form / field type
      probabilities as percentages.
    * ``evaluate`` -- print classification reports for the form and/or
      field type models; ``--cv`` is passed on as ``n_folds``.
    """
    def print_ranked(scores, end):
        # Print "<label> <percent>%" for each entry in most_common() order.
        for label, score in Counter(scores).most_common():
            print("%s %0.1f%%" % (label, score * 100), end=end)

    cli = docopt.docopt(__doc__, version=formasaurus.__version__)

    requested_folder = cli['--data-folder']
    data_folder = (
        DEFAULT_DATA_PATH if requested_folder is None else requested_folder
    )
    storage = Storage(data_folder)

    if cli['check-data']:
        check_result = storage.check()
        storage.print_form_type_counts(simplify=False)
        storage.print_form_type_counts(simplify=True)
        print("Errors:", check_result)
        if check_result:
            sys.exit(1)

    elif cli['train']:
        model = formasaurus.FormFieldClassifier.trained_on(data_folder)
        model.save(cli["<modelfile>"])

    elif cli['run']:
        cutoff = float(cli['--threshold'])
        print("Loading the extractor..")
        model = formasaurus.FormFieldClassifier.load(cli["<modelfile>"])
        print("Downloading data...")
        target = cli["<url>"]
        body = download(target)
        tree = load_html(body, target)

        extracted = model.extract_forms(tree, proba=True, threshold=cutoff)
        if not extracted:
            print("No forms found.")
            return

        for form, info in extracted:
            print("\n")
            print("="*60)
            print(get_cleaned_form_html(form))
            print("-"*60)

            print("Form type:    ", end="")
            print_ranked(info['form'], '    ')

            print("\n\nField types:")
            for field_name, field_scores in info['fields'].items():
                print(field_name, end=':  ')
                print_ranked(field_scores, '  ')
                print("")

            print("")

    elif cli['evaluate']:
        n_folds = int(cli["--cv"])
        annotation_iter = storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        )
        annotations = list(annotation_iter)

        if cli['forms'] or cli['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_folds=n_folds)
            print("")

        if cli['fields'] or cli['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_folds=n_folds)