Exemplo n.º 1
0
def empty_storage(tmpdir):
    """
    Build a ``Storage`` located at ``tmpdir`` and initialize it with a
    small, fixed annotation schema.

    The schema has two sections, ``form_types`` and ``field_types``; each
    lists short/full type names, a ``simplify_map``, the short code used
    for not-annotated items (``NA_value``) and a ``skip_value``.
    The initialized storage is returned.
    """
    form_pairs = (
        ("s", "search"),
        ("l", "login"),
        ("o", "other"),
        ("X", "NOT ANNOTATED"),
    )
    field_pairs = (
        ("us", "username"),
        ("p1", "password"),
        ("qq", "search query"),
        ("XX", "NOT ANNOTATED"),
    )

    form_schema = {
        "types": [{"short": short, "full": full} for short, full in form_pairs],
        "simplify_map": {"l": "o"},
        "NA_value": "X",
        "skip_value": "-",
    }
    field_schema = {
        "types": [{"short": short, "full": full} for short, full in field_pairs],
        "simplify_map": {},
        "NA_value": "XX",
        "skip_value": "--",
    }

    result = Storage(str(tmpdir))
    result.initialize({"form_types": form_schema, "field_types": field_schema})
    return result
def main():
    """
    Command-line entry point.

    Parses the arguments with docopt (using the module docstring as the
    usage text) and runs the first selected command among ``add``,
    ``check-data``, ``train``, ``run`` and ``evaluate``.
    """
    args = docopt.docopt(__doc__)

    if args['--data-folder'] is None:
        # no folder given: fall back to the 'data' folder next to this file
        here = os.path.dirname(__file__)
        args['--data-folder'] = os.path.join(here, 'data')

    if args['add']:
        annotate_forms(
            data_folder=args['--data-folder'],
            url_argument=args["<url>"],
        )
        return

    if args['check-data']:
        check_annotated_data(args['--data-folder'])
        return

    if args['train']:
        extractor = FormExtractor.trained_on(
            data_folder=args["--data-folder"],
            train_ratio=1.0,
        )
        extractor.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        extractor = FormExtractor.load(args["<modelfile>"])
        print("Downloading data...")
        data, url = load_data(args["<url>"])
        tree = load_html(data, url)

        found = extractor.extract_forms_proba(tree, threshold)
        if not found:
            print("No forms found.")
            return

        separator = "-" * 40
        for form, probs in found:
            print(separator)
            print_form_html(form)
            print("")
            # list form types from the most to the least probable
            ranked = Counter(probs).most_common()
            for tp, prob in ranked:
                print("%s %0.1f%%" % (FORM_TYPES_INV[tp], prob * 100), end='    ')

            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        ratio = float(args['--test-size'])

        store = Storage(args["--data-folder"])
        model = get_model()
        X, y = store.get_Xy(drop_duplicates=True, verbose=True, leave=True)

        # the last ``ratio`` share of the examples is held out for testing
        test_size = int(len(y) * ratio)
        train_size = len(y) - test_size
        X_train, X_test = X[:train_size], X[train_size:]
        y_train, y_test = y[:train_size], y[train_size:]

        evaluation.print_metrics(
            model, X, y, X_train, X_test, y_train, y_test,
            ipython=False, cv=n_folds, short_matrix=True,
        )
def check_annotated_data(data_folder):
    """
    Validate the annotated data stored in ``data_folder``.

    Type counts are printed in every case; if the storage check reports
    a falsy result the process exits with status code 1.
    """
    store = Storage(data_folder)
    is_valid = store.check()
    store.print_type_counts()
    if is_valid:
        return
    sys.exit(1)
Exemplo n.º 4
0
def check_annotated_data(data_folder):
    """
    Validate the annotated data stored in ``data_folder``.

    Type counts are printed in every case; if the storage check reports
    a falsy result the process exits with status code 1.
    """
    store = Storage(data_folder)
    is_valid = store.check()
    store.print_type_counts()
    if is_valid:
        return
    sys.exit(1)
Exemplo n.º 5
0
def check_annotated_data(data_folder):
    """
    Validate the annotated data stored in ``data_folder``.

    Prints the type counts and the value returned by the storage check;
    exits with status code 1 when that value is truthy.
    """
    store = Storage(data_folder)
    problems = store.check()
    store.print_type_counts()
    print("Errors:", problems)
    if not problems:
        return
    sys.exit(1)
Exemplo n.º 6
0
    def train(self, data_folder, train_ratio=1.0):
        """
        Fit a fresh model on the data found in ``data_folder``.

        Only the leading ``train_ratio`` share of the loaded examples is
        used; the fitted model is stored in ``self.model``.
        """
        features, labels = Storage(data_folder).get_Xy(
            drop_duplicates=True,
            verbose=True,
            leave=True,
        )
        n_train = int(len(labels) * train_ratio)
        features = features[:n_train]
        labels = labels[:n_train]

        classifier = get_model()
        print("Training on %d example(s)..." % len(labels))
        classifier.fit(features, labels)
        self.model = classifier
Exemplo n.º 7
0
    def train(self, data_folder, train_ratio=1.0):
        """
        Fit a fresh model on the data found in ``data_folder``.

        Only the leading ``train_ratio`` share of the loaded examples is
        used; the fitted model is stored in ``self.model``.
        """
        features, labels = Storage(data_folder).get_Xy(
            drop_duplicates=True,
            verbose=True,
            leave=True,
        )
        n_train = int(len(labels) * train_ratio)
        features = features[:n_train]
        labels = labels[:n_train]

        classifier = get_model()
        print("Training on %d example(s)..." % len(labels))
        classifier.fit(features, labels)
        self.model = classifier
Exemplo n.º 8
0
 def trained_on(cls, data_folder):
     """
     Create a new ``cls`` instance and train it on the annotations
     stored in ``data_folder``.

     Annotations are loaded with both form types and field types
     simplified; the trained instance is returned.
     """
     storage = Storage(data_folder)
     print("Loading training data...")
     annotation_stream = storage.iter_annotations(
         simplify_form_types=True,
         simplify_field_types=True,
         verbose=True,
         leave=True,
     )
     # materialize everything before training
     annotations = list(annotation_stream)
     instance = cls()
     instance.train(annotations)
     return instance
Exemplo n.º 9
0
def annotate_forms(data_folder, url_argument):
    """
    Interactively annotate the HTML forms of a single web page.

    The page referenced by ``url_argument`` is loaded, its forms are shown
    to the user and the user is asked for the type of each form. When any
    answers were given, the result is saved to the storage in
    ``data_folder``: the page is kept as an html file, and the URL plus
    the annotations are added to index.json.
    """
    store = Storage(data_folder)
    page_html, page_url = load_data(url_argument)
    tree = load_html(page_html, page_url)
    answers = _annotate_forms(store, tree)
    if not answers:
        # nothing was annotated, so there is nothing to save
        return
    store.store_result(page_html, answers, page_url)
Exemplo n.º 10
0
def annotate_forms(data_folder, url_argument):
    """
    Interactively annotate the HTML forms of a single web page.

    The page referenced by ``url_argument`` is loaded, its forms are shown
    to the user and the user is asked for the type of each form. When any
    answers were given, the result is saved to the storage in
    ``data_folder``: the page is kept as an html file, and the URL plus
    the annotations are added to index.json.
    """
    store = Storage(data_folder)
    page_html, page_url = load_data(url_argument)
    tree = load_html(page_html, page_url)
    answers = _annotate_forms(store, tree)
    if not answers:
        # nothing was annotated, so there is nothing to save
        return
    store.store_result(page_html, answers, page_url)
Exemplo n.º 11
0
def main():
    """
    Command-line entry point.

    Parses the arguments with docopt (using the module docstring as the
    usage text) and runs the first selected command among ``add``,
    ``check-data``, ``train``, ``run`` and ``evaluate``.
    """
    args = docopt.docopt(__doc__)

    if args['--data-folder'] is None:
        # no folder given: fall back to the 'data' folder next to this file
        module_dir = os.path.dirname(__file__)
        args['--data-folder'] = os.path.join(module_dir, 'data')

    if args['add']:
        annotate_forms(
            data_folder=args['--data-folder'],
            url_argument=args["<url>"],
        )
        return

    if args['check-data']:
        check_annotated_data(args['--data-folder'])
        return

    if args['train']:
        extractor = FormExtractor.trained_on(
            data_folder=args["--data-folder"],
            train_ratio=1.0,
        )
        extractor.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        extractor = FormExtractor.load(args["<modelfile>"])
        print("Downloading data...")
        data, url = load_data(args["<url>"])
        tree = load_html(data, url)

        found = extractor.extract_forms_proba(tree, threshold)
        if not found:
            print("No forms found.")
            return

        rule = "-" * 40
        for form, probs in found:
            print(rule)
            print_form_html(form)
            print("")
            # list form types from the most to the least probable
            for tp, prob in Counter(probs).most_common():
                full_name = FORM_TYPES_INV[tp]
                percent = prob * 100
                print("%s %0.1f%%" % (full_name, percent), end='    ')

            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        ratio = float(args['--test-size'])

        store = Storage(args["--data-folder"])
        model = get_model()
        X, y = store.get_Xy(drop_duplicates=True, verbose=True, leave=True)

        # the last ``ratio`` share of the examples is held out for testing
        test_size = int(len(y) * ratio)
        train_size = len(y) - test_size
        X_train, X_test = X[:train_size], X[train_size:]
        y_train, y_test = y[:train_size], y[train_size:]

        evaluation.print_metrics(
            model, X, y, X_train, X_test, y_train, y_test,
            ipython=False, cv=n_folds, short_matrix=True,
        )
Exemplo n.º 12
0
def main():
    """
    Command-line entry point.

    Parses the arguments with docopt and runs the first selected command
    among ``check-data``, ``train``, ``init``, ``run`` and ``evaluate``.
    ``DEFAULT_DATA_PATH`` is used when no ``--data-folder`` is given.
    """
    def show_ranked(probs, end):
        # print "<type> <percent>%" pairs, most probable first
        for label, prob in Counter(probs).most_common():
            print("%s %0.1f%%" % (label, prob * 100), end=end)

    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    requested_folder = args['--data-folder']
    data_folder = (
        DEFAULT_DATA_PATH if requested_folder is None else requested_folder
    )

    storage = Storage(data_folder)

    if args['check-data']:
        errors = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", errors)
        if errors:
            sys.exit(1)
        return

    if args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])
        return

    if args['init']:
        formasaurus.FormFieldClassifier.load()
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        url = args["<url>"]
        print("Downloading {}...".format(url))
        data = download(url)
        tree = load_html(data, url)

        found = classifier.extract_forms(tree, proba=True, threshold=threshold)
        if not found:
            print("No forms found.")
            return

        for form, info in found:
            print("\n")
            print("=" * 60)
            print(get_cleaned_form_html(form))
            print("-" * 60)
            print("Form type:    ", end="")
            show_ranked(info['form'], '    ')

            print("\n\nField types:")
            for field_name, probs in info['fields'].items():
                print(field_name, end=':  ')
                show_ranked(probs, '  ')
                print("")

            print("")
        return

    if args['evaluate']:
        n_splits = int(args["--cv"])
        annotation_stream = storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        )
        annotations = list(annotation_stream)

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_splits=n_splits)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_splits=n_splits)
Exemplo n.º 13
0
def storage():
    """Return a ``Storage`` opened on ``DEFAULT_DATA_PATH``."""
    default_storage = Storage(DEFAULT_DATA_PATH)
    return default_storage
Exemplo n.º 14
0
def main():
    """
    Command-line entry point.

    Parses the arguments with docopt and runs the first selected command
    among ``check-data``, ``train``, ``run`` and ``evaluate``.
    ``DEFAULT_DATA_PATH`` is used when no ``--data-folder`` is given.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    requested_folder = args['--data-folder']
    data_folder = (
        DEFAULT_DATA_PATH if requested_folder is None else requested_folder
    )

    storage = Storage(data_folder)

    if args['check-data']:
        errors = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", errors)
        if errors:
            sys.exit(1)
        return

    if args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading data...")
        url = args["<url>"]
        data = download(url)
        tree = load_html(data, url)

        found = classifier.extract_forms(tree, proba=True, threshold=threshold)
        if not found:
            print("No forms found.")
            return

        heavy_rule = "=" * 60
        light_rule = "-" * 60
        for form, info in found:
            print("\n")
            print(heavy_rule)
            print(get_cleaned_form_html(form))
            print(light_rule)
            print("Form type:    ", end="")
            # form types, most probable first
            form_ranking = Counter(info['form']).most_common()
            for form_tp, prob in form_ranking:
                print("%s %0.1f%%" % (form_tp, prob * 100), end='    ')

            print("\n\nField types:")
            for field_name, probs in info['fields'].items():
                print(field_name, end=':  ')
                # field types for this field, most probable first
                field_ranking = Counter(probs).most_common()
                for field_tp, prob in field_ranking:
                    print("%s %0.1f%%" % (field_tp, prob * 100), end='  ')
                print("")

            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        annotation_stream = storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        )
        annotations = list(annotation_stream)

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_folds=n_folds)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_folds=n_folds)