Exemplo n.º 1
0
 def _cached_model_path(cls):
     """
     Return the location of the cached model file.

     A non-empty ``FORMASAURUS_MODEL`` environment variable takes
     precedence and is returned with ``~`` expanded. Otherwise the file
     name ``formasaurus-<dependencies_string()>.joblib`` is passed
     through ``at_root`` and that result is returned.
     """
     override = os.environ.get("FORMASAURUS_MODEL")
     if not override:
         # No explicit override: fall back to the dependency-specific name.
         filename = "formasaurus-%s.joblib" % dependencies_string()
         return at_root(filename)
     return os.path.expanduser(override)
Exemplo n.º 2
0
 def _cached_model_path(cls):
     """
     Work out which file the model is cached in.

     When the ``FORMASAURUS_MODEL`` environment variable is set to a
     non-empty value, that value (after ``~`` expansion) is the answer;
     otherwise a name derived from ``dependencies_string()`` is resolved
     with ``at_root``.
     """
     custom_location = os.environ.get("FORMASAURUS_MODEL")
     # The conditional expression keeps the default branch lazy, so
     # dependencies_string() is only called when no override is present.
     return (
         os.path.expanduser(custom_location)
         if custom_location
         else at_root("formasaurus-%s.joblib" % dependencies_string())
     )
Exemplo n.º 3
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os

import six
import joblib

from formasaurus import formtype_model, fieldtype_model
from formasaurus.html import get_forms, get_fields_to_annotate, load_html
from formasaurus.storage import Storage
from formasaurus.utils import dependencies_string, at_root, thresholded

# Default data location, computed once at import time by passing 'data' to
# at_root(). Presumably a path under the package root -- confirm against
# formasaurus.utils.at_root.
DEFAULT_DATA_PATH = at_root('data')


def extract_forms(tree_or_html, proba=False, threshold=0.05, fields=True):
    """
    Extract forms from an lxml tree or from HTML source code.

    This is a module-level convenience wrapper: it obtains the extractor
    via ``get_instance()`` and forwards all arguments to its
    ``extract_forms`` method unchanged.

    The result is a list of ``(form_elem, form_info)`` tuples, where
    ``form_info`` dicts hold the output of :meth:`classify` or
    :meth:`classify_proba`, depending on the ``proba`` parameter.

    Field type information is not computed when ``fields`` is False.
    """
    extractor = get_instance()
    return extractor.extract_forms(
        tree_or_html=tree_or_html,
        proba=proba,
        threshold=threshold,
        fields=fields,
    )
Exemplo n.º 4
0
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os

import six
try:
    # joblib is a standalone package; scikit-learn deprecated its vendored
    # copy in 0.21 and removed sklearn.externals.joblib in 0.23, so the
    # standalone import must be tried first.
    import joblib
except ImportError:
    # Old environments without a standalone joblib install can still use
    # the copy bundled with an old scikit-learn.
    from sklearn.externals import joblib

from formasaurus import formtype_model, fieldtype_model
from formasaurus.html import get_forms, get_fields_to_annotate, load_html
from formasaurus.storage import Storage
from formasaurus.utils import dependencies_string, at_root, thresholded

# Default data location, computed once at import time by passing 'data' to
# at_root(). Presumably a path under the package root -- confirm against
# formasaurus.utils.at_root.
DEFAULT_DATA_PATH = at_root('data')


def extract_forms(tree_or_html, proba=False, threshold=0.05, fields=True):
    """
    Return a list of ``(form_elem, form_info)`` tuples for the given
    lxml tree or HTML source code.

    The call is delegated to the ``extract_forms`` method of the object
    returned by ``get_instance()``.

    Each ``form_info`` dict contains the result of :meth:`classify` or
    :meth:`classify_proba`, depending on the ``proba`` parameter.

    When ``fields`` is False, field type information is not computed.
    """
    # Collect the tuning options once and forward them as keywords.
    options = dict(proba=proba, threshold=threshold, fields=fields)
    return get_instance().extract_forms(tree_or_html=tree_or_html, **options)