def main():
    """
    Command-line entry point.

    Parses the command line with docopt (using the module docstring as the
    usage text) and runs the first selected command among ``add``,
    ``check-data``, ``train``, ``run`` and ``evaluate``. When
    ``--data-folder`` is not given, the ``data`` directory next to this
    file is used.
    """
    args = docopt.docopt(__doc__)

    data_folder = args['--data-folder']
    if data_folder is None:
        # Fall back to the 'data' folder that lives next to this file.
        data_folder = os.path.join(os.path.dirname(__file__), 'data')

    if args['add']:
        annotate_forms(data_folder=data_folder, url_argument=args["<url>"])
        return

    if args['check-data']:
        check_annotated_data(data_folder)
        return

    if args['train']:
        extractor = FormExtractor.trained_on(
            data_folder=data_folder,
            train_ratio=1.0,
        )
        extractor.save(args["<modelfile>"])
        return

    if args['run']:
        # Parse the threshold first so a malformed value fails before any
        # loading or downloading starts.
        threshold = float(args['--threshold'])

        print("Loading the extractor..")
        extractor = FormExtractor.load(args["<modelfile>"])

        print("Downloading data...")
        data, url = load_data(args["<url>"])
        tree = load_html(data, url)

        found = extractor.extract_forms_proba(tree, threshold)
        if not found:
            print("No forms found.")
            return

        separator = "-" * 40
        for form, probs in found:
            print(separator)
            print_form_html(form)
            print("")
            # Most probable form types are printed first, all on one line.
            for form_type, probability in Counter(probs).most_common():
                print(
                    "%s %0.1f%%" % (FORM_TYPES_INV[form_type], probability * 100),
                    end=' ',
                )
            print("")
        return

    if not args['evaluate']:
        return

    n_folds = int(args["--cv"])
    ratio = float(args['--test-size'])
    store = Storage(data_folder)
    model = get_model()
    X, y = store.get_Xy(drop_duplicates=True, verbose=True, leave=True)

    # The trailing `ratio` fraction of the examples is held out for testing.
    train_size = len(y) - int(len(y) * ratio)
    X_train = X[:train_size]
    X_test = X[train_size:]
    y_train = y[:train_size]
    y_test = y[train_size:]

    evaluation.print_metrics(
        model, X, y, X_train, X_test, y_train, y_test,
        ipython=False,
        cv=n_folds,
        short_matrix=True,
    )
def train(self, data_folder, train_ratio=1.0):
    """
    Fit a new model on the annotated data found in ``data_folder``.

    The examples are loaded through ``Storage.get_Xy`` and only the leading
    ``train_ratio`` fraction of them (rounded down) is used for fitting.
    The fitted model is stored in ``self.model``.
    """
    features, labels = Storage(data_folder).get_Xy(
        drop_duplicates=True,
        verbose=True,
        leave=True,
    )

    # Keep only the first `train_ratio` share of the examples.
    n_train = int(len(labels) * train_ratio)
    features = features[:n_train]
    labels = labels[:n_train]

    estimator = get_model()
    print("Training on %d example(s)..." % len(labels))
    estimator.fit(features, labels)
    self.model = estimator
def main():
    """
    Run the command-line interface.

    Arguments are parsed by docopt from the module docstring. Exactly one
    command handler is executed: the first of ``add``, ``check-data``,
    ``train``, ``run`` and ``evaluate`` whose flag is set in the parsed
    arguments. A missing ``--data-folder`` defaults to the ``data``
    directory located beside this file.
    """
    args = docopt.docopt(__doc__)
    if args['--data-folder'] is None:
        # Default: the 'data' folder relative to this file.
        here = os.path.dirname(__file__)
        args['--data-folder'] = os.path.join(here, 'data')

    def add_command():
        # Annotate forms from the given URL argument.
        annotate_forms(
            data_folder=args['--data-folder'],
            url_argument=args["<url>"],
        )

    def check_data_command():
        # Check the annotated data in the data folder.
        check_annotated_data(args['--data-folder'])

    def train_command():
        # Train on all available data and save the extractor to a file.
        extractor = FormExtractor.trained_on(
            data_folder=args["--data-folder"],
            train_ratio=1.0,
        )
        extractor.save(args["<modelfile>"])

    def run_command():
        # Classify the forms of a single page and print per-type probabilities.
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        extractor = FormExtractor.load(args["<modelfile>"])
        print("Downloading data...")
        data, url = load_data(args["<url>"])
        tree = load_html(data, url)
        found = extractor.extract_forms_proba(tree, threshold)
        if not found:
            print("No forms found.")
            return
        for form, probs in found:
            print("-" * 40)
            print_form_html(form)
            print("")
            ranked = Counter(probs).most_common()
            for form_type, probability in ranked:
                full_name = FORM_TYPES_INV[form_type]
                print("%s %0.1f%%" % (full_name, probability * 100), end=' ')
            print("")

    def evaluate_command():
        # Print metrics using a head/tail train/test split plus cross-validation.
        n_folds = int(args["--cv"])
        ratio = float(args['--test-size'])
        store = Storage(args["--data-folder"])
        model = get_model()
        X, y = store.get_Xy(drop_duplicates=True, verbose=True, leave=True)
        n_test = int(len(y) * ratio)
        split = len(y) - n_test
        X_train, X_test = X[:split], X[split:]
        y_train, y_test = y[:split], y[split:]
        evaluation.print_metrics(
            model, X, y, X_train, X_test, y_train, y_test,
            ipython=False, cv=n_folds, short_matrix=True,
        )

    # Flags are checked in this order; only the first one that is set runs.
    handlers = (
        ('add', add_command),
        ('check-data', check_data_command),
        ('train', train_command),
        ('run', run_command),
        ('evaluate', evaluate_command),
    )
    for flag, handler in handlers:
        if args[flag]:
            handler()
            break