Example #1
    def from_disk(self, path, exclude=tuple(), disable=None):
        """Loads state from a directory. Modifies the object in place and
        returns it. If the saved `Language` object contains a model, the
        model will be loaded.

        path (unicode or Path): A path to a directory.
        exclude (list): Names of components or serialization fields to exclude.
        RETURNS (Language): The modified `Language` object.

        DOCS: https://spacy.io/api/language#from_disk
        """
        if disable is not None:
            deprecation_warning(Warnings.W014)
            exclude = disable
        path = util.ensure_path(path)
        deserializers = OrderedDict()
        deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
        deserializers["vocab"] = lambda p: self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(p, exclude=["vocab"])
        for name, proc in self.pipeline:
            if name in exclude:
                continue
            if not hasattr(proc, "from_disk"):
                continue
            deserializers[name] = lambda p, proc=proc: proc.from_disk(p, exclude=["vocab"])
        if not (path / "vocab").exists() and "vocab" not in exclude:
            # Convert to list here in case exclude is (default) tuple
            exclude = list(exclude) + ["vocab"]
        util.from_disk(path, deserializers, exclude)
        self._path = path
        return self
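
A minimal usage sketch for this method (assuming a spaCy v2 install and a pipeline previously saved with `nlp.to_disk`; the path is a placeholder):

import spacy

nlp = spacy.blank("en")
# restore everything from disk except the NER component
nlp = nlp.from_disk("/path/to/model", exclude=["ner"])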
Example #2
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over. If --create-meta is
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.
    """
    msg = Printer()
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

    meta_path = meta_path or input_path / "meta.json"
    if meta_path.is_file():
        meta = srsly.read_json(meta_path)
        if not create_meta:  # only print if user doesn't want to overwrite
            msg.good("Loaded meta.json from file", meta_path)
        else:
            meta = generate_meta(input_dir, meta, msg)
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing "
                "directories.".format(path=path2str(package_path)),
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
    msg.text("To build the package, run `python setup.py sdist` in this directory.")
Example #3
def _find_best(experiment_dir, component):
    accuracies = []
    for epoch_model in experiment_dir.iterdir():
        if epoch_model.is_dir() and epoch_model.parts[-1] != "model-final":
            accs = srsly.read_json(epoch_model / "accuracy.json")
            scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
            accuracies.append((scores, epoch_model))
    if accuracies:
        return max(accuracies)[1]
    else:
        return None
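
A short sketch of why `max(accuracies)` picks the best epoch: the (scores, path) tuples are compared lexicographically, so the entry with the highest score list wins and its path is returned.

accuracies = [([0.81], "model0"), ([0.85], "model3"), ([0.83], "model7")]
assert max(accuracies)[1] == "model3"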
Example #4
def _collate_best_model(meta, output_path, components):
    bests = {}
    for component in components:
        bests[component] = _find_best(output_path, component)
    best_dest = output_path / "model-best"
    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
    for component, best_component_src in bests.items():
        shutil.rmtree(path2str(best_dest / component))
        shutil.copytree(
            path2str(best_component_src / component), path2str(best_dest / component)
        )
        accs = srsly.read_json(best_component_src / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
Example #5
def _load_file(file_path, msg):
    file_name = file_path.parts[-1]
    if file_path.suffix == ".json":
        with msg.loading("Loading {}...".format(file_name)):
            data = srsly.read_json(file_path)
        msg.good("Loaded {}".format(file_name))
        return data
    elif file_path.suffix == ".jsonl":
        with msg.loading("Loading {}...".format(file_name)):
            data = srsly.read_jsonl(file_path)
        msg.good("Loaded {}".format(file_name))
        return data
    msg.fail(
        "Can't load file extension {}".format(file_path.suffix),
        "Expected .json or .jsonl",
        exits=1,
    )
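
One caveat when using this helper: `srsly.read_json` returns the parsed object directly, while `srsly.read_jsonl` returns a lazy generator, so callers that iterate more than once should materialize it first:

data = _load_file(Path("examples.jsonl"), msg)
examples = list(data)  # read_jsonl yields entries lazily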
Example #6
def get_model_meta(path):
    """Get model meta.json from a directory path and validate its contents.

    path (unicode or Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    """
    model_path = ensure_path(path)
    if not model_path.exists():
        raise IOError(Errors.E052.format(path=path2str(model_path)))
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        raise IOError(Errors.E053.format(path=meta_path))
    meta = srsly.read_json(meta_path)
    for setting in ["lang", "name", "version"]:
        if setting not in meta or not meta[setting]:
            raise ValueError(Errors.E054.format(setting=setting))
    return meta
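
Typical usage (the path is a placeholder for an installed model directory):

meta = get_model_meta("/path/to/en_core_web_sm")
print(meta["lang"], meta["name"], meta["version"])  # e.g. en core_web_sm 2.3.1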
Example #7
def info(model=None, markdown=False, silent=False):
    """
    Print info about spaCy installation. If a model shortcut link is
    specified as an argument, print model information. The --markdown flag
    prints details in Markdown for easy copy-pasting to GitHub issues.
    """
    msg = Printer()
    if model:
        if util.is_package(model):
            model_path = util.get_package_path(model)
        else:
            model_path = util.get_data_path() / model
        meta_path = model_path / "meta.json"
        if not meta_path.is_file():
            msg.fail("Can't find model meta.json", meta_path, exits=1)
        meta = srsly.read_json(meta_path)
        if model_path.resolve() != model_path:
            meta["link"] = path2str(model_path)
            meta["source"] = path2str(model_path.resolve())
        else:
            meta["source"] = path2str(model_path)
        if not silent:
            title = "Info about model '{}'".format(model)
            model_meta = {
                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
            }
            if markdown:
                print_markdown(model_meta, title=title)
            else:
                msg.table(model_meta, title=title)
        return meta
    data = {
        "spaCy version": about.__version__,
        "Location": path2str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": list_models(),
    }
    if not silent:
        title = "Info about spaCy"
        if markdown:
            print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data
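
The CLI equivalent for a model is `python -m spacy info en_core_web_sm --markdown`; called from Python, the function also returns the meta dict:

meta = info(model="en_core_web_sm", markdown=True)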
Example #8
def get_model_links(compat):
    links = {}
    data_path = get_data_path()
    if data_path:
        models = [p for p in data_path.iterdir() if is_model_path(p)]
        for model in models:
            meta_path = Path(model) / "meta.json"
            if not meta_path.exists():
                continue
            meta = srsly.read_json(meta_path)
            link = model.parts[-1]
            name = meta["lang"] + "_" + meta["name"]
            links[link] = {
                "name": name,
                "version": meta["version"],
                "compat": is_compat(compat, name, meta["version"]),
            }
    return links
Example #9
def main(file_path: str):
    data = srsly.read_json(file_path)

    strat: str
    for strat in data["strategies"]:
        print()
        print(f"EVALUATION RESULTS FOR {strat}")
        print()

        for attr in data["strategies"][strat]:
            if attr.endswith("labels") or "micro" not in attr:
                continue
            print(f"{attr}: {data['strategies'][strat][attr]}")
Example #10
def get_model_meta(path):
    """Get model meta.json from a directory path and validate its contents.

    path (unicode or Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    """
    model_path = ensure_path(path)
    if not model_path.exists():
        raise IOError(Errors.E052.format(path=path2str(model_path)))
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        raise IOError(Errors.E053.format(path=meta_path))
    meta = srsly.read_json(meta_path)
    for setting in ["lang", "name", "version"]:
        if setting not in meta or not meta[setting]:
            raise ValueError(Errors.E054.format(setting=setting))
    if "spacy_version" in meta:
        about_major_minor = ".".join(about.__version__.split(".")[:2])
        if not meta["spacy_version"].startswith(">=" + about_major_minor):
            # try to simplify version requirements from model meta to vx.x
            # for warning message
            meta_spacy_version = "v" + ".".join(
                meta["spacy_version"].replace(">=", "").split(".")[:2]
            )
            # if the format is unexpected, supply the full version
            if not re.match(r"v\d+\.\d+", meta_spacy_version):
                meta_spacy_version = meta["spacy_version"]
            warn_msg = Warnings.W031.format(
                model=meta["lang"] + "_" + meta["name"],
                model_version=meta["version"],
                version=meta_spacy_version,
                current=about.__version__,
            )
            warnings.warn(warn_msg)
    else:
        warn_msg = Warnings.W032.format(
            model=meta["lang"] + "_" + meta["name"],
            model_version=meta["version"],
            current=about.__version__,
        )
        warnings.warn(warn_msg)
    return meta
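
To make the compatibility check concrete, a quick walkthrough with made-up version numbers:

about_version = "2.3.5"
about_major_minor = ".".join(about_version.split(".")[:2])  # "2.3"
meta_spacy_version = ">=2.2.0"
# does not start with ">=2.3", so W031 is emitted with version "v2.2"
assert not meta_spacy_version.startswith(">=" + about_major_minor)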
Example #11
    def get_model_meta(self, model_path):
        """ Get model's meta.json from the directory path of the model, and
        validate its contents. This method is ported from spaCy.
        `https://github.com/explosion/spaCy/blob/master/spacy/util.py#L231`

        Parameters
        ----------
        model_path: `pathlib.Path`
            Path to model directory.

        Returns
        -------
        dict
            The model's meta data.

        Raises
        ------
        FileNotFoundError
            If the model is not found (i.e., it has not been downloaded) or
            the model is missing its meta.json file.
        ValueError
            If the metafile of the model `meta.json` is malformed.
        """
        if not model_path.exists():
            raise FileNotFoundError(
                'Model not found at path %s. Verify it is installed.' %
                str(model_path))

        meta_path = model_path / 'meta.json'
        if not meta_path.is_file():
            raise FileNotFoundError(
                'It seems that model %s is missing its meta.json file. '
                'Contact the maintainers.' % model_path.name)

        meta = srsly.read_json(meta_path)
        for setting in ['name', 'version']:
            if setting not in meta or not meta[setting]:
                raise ValueError(
                    'Malformed meta.json file, value %s is missing. '
                    'Contact the maintainers.' % setting)
        return meta
Example #12
def read_file(path: Union[Path, str], **kwargs) -> List[Dict]:
    """Read train/dev examples from file, either JSON, MD or ConLL format.

    Args:
        path: file path.

    Returns:
        list of examples
    """
    if not isinstance(path, Path):
        path = Path(path)
    assert isinstance(path, Path)

    ext = path.suffix.lower()

    if ext == ".json":
        # JSON format is the GOLD standard ...
        return list(srsly.read_json(path))

    elif ext == ".jsonl":
        # same here ..
        return list(srsly.read_jsonl(path))

    elif ext in (".md", ".markdown"):
        from spacy_crfsuite.markdown import MarkdownReader

        # With markdown, we can easily convert to JSON
        with path.open("r", encoding="utf-8") as f:
            md_reader = MarkdownReader()
            return md_reader(f.read(), **kwargs)

    elif ext in (".txt", ".conll"):
        from spacy_crfsuite.conll import read_conll

        # CoNLL-02, CoNLL-03
        return list(read_conll(path, **kwargs))

    else:
        raise ValueError(
            f"Can't read examples from file with extension: ({ext}). "
            f"spacy_crfsuite accepts .json, .jsonl, .txt, .conll files.")
Example #13
def pytest_sessionstart(session):
    test_dir = Path(TESTS_DIR)
    if test_dir.exists():
        shutil.rmtree(str(test_dir))
        msg.info("Deleted existing test directory {}".format(TESTS_DIR))
    test_dir.mkdir()
    msg.good("Created test directory {}".format(TESTS_DIR))
    meta = srsly.read_json(META_FILE)
    n_files = 0
    for test_file, solution_file in get_source_files():
        with test_file.open("r", encoding="utf8") as f:
            test_code = f.read()
        with solution_file.open("r", encoding="utf8") as f:
            solution_code = f.read()
        full_code = format_test(test_file.stem, meta[PYTEST_TEMPLATE],
                                test_code, solution_code)
        test_path = test_dir / test_file.name
        with test_path.open("w", encoding="utf8") as f:
            f.write(full_code)
        n_files += 1
    msg.good("Created {} files for pytest in {}".format(n_files, TESTS_DIR))
Example #14
def test_link(trained_linker):
    @app.middleware("http")
    async def add_nlp_to_state(request: Request, call_next):
        request.state.nlp = trained_linker
        response = await call_next(request)
        return response

    client = TestClient(app)

    example_request = srsly.read_json(
        Path(__file__).parent.parent / "spacy_ann/api/example_request.json"
    )

    res = client.post("/link", json=example_request)
    assert res.status_code == 200

    data = res.json()

    for doc in data["documents"]:
        for span in doc["spans"]:
            assert "id" in span
Example #15
    def from_disk(
        self,
        path: Union[str, Path],
        *,
        exclude: Iterable[str] = SimpleFrozenList()) -> "EntityRuler":
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (str / Path): The JSONL file to load.
        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        self.clear()
        depr_patterns_path = path.with_suffix(".jsonl")
        if depr_patterns_path.is_file():
            patterns = srsly.read_jsonl(depr_patterns_path)
            self.add_patterns(patterns)
        else:
            cfg = {}
            deserializers_patterns = {
                "patterns":
                lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix(".jsonl")))
            }
            deserializers_cfg = {
                "cfg": lambda p: cfg.update(srsly.read_json(p))
            }
            from_disk(path, deserializers_cfg, {})
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

            if self.phrase_matcher_attr is not None:
                self.phrase_matcher = PhraseMatcher(
                    self.nlp.vocab, attr=self.phrase_matcher_attr)
            from_disk(path, deserializers_patterns, {})
        return self
Example #16
def _collate_best_model(meta, output_path, components):
    bests = {}
    for component in components:
        bests[component] = _find_best(output_path, component)
    best_dest = output_path / "model-best"
    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
    for component, best_component_src in bests.items():
        shutil.rmtree(path2str(best_dest / component))
        if component == "ner":
            shutil.copytree(
                path2str(best_component_src / component), path2str(best_dest / component)
            )
            accs = srsly.read_json(best_component_src / "accuracy.json")
            for metric in _get_metrics(component):
                meta["accuracy"][metric] = accs[metric]
        else:
            best_component_src = output_path / "model-final"
            shutil.copytree(
                path2str(best_component_src / component), path2str(best_dest / component)
            )
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
Example #17
    def from_disk(self, path: Path, **kwargs):
        """Deserialize saved AnnLinker from disk.
        
        path (Path): directory to deserialize from
        
        RETURNS (AnnLinker): Initialized AnnLinker
        """        
        path = util.ensure_path(path)

        kb = KnowledgeBase(self.nlp.vocab, 300)
        kb.load_bulk(path / "kb")
        self.set_kb(kb)

        cg = CandidateGenerator().from_disk(path)
        self.set_cg(cg)

        cfg = srsly.read_json(path / "cfg")
        
        self.threshold = cfg.get("threshold", 0.7)
        self.no_description_threshold = cfg.get("no_description_threshold", 0.95)
        self.disambiguate = cfg.get("disambiguate", True)

        return self
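
The `cfg` file read above is plain JSON; a minimal example matching the defaults (an assumption based on the keys accessed, not a documented format):

# {"threshold": 0.7, "no_description_threshold": 0.95, "disambiguate": true}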
Example #18
def main(in_file, model_file=None, config_file=None, spacy_model=None):
    """Train CRF entity tagger."""
    if config_file:
        msg.info(f"Loading config: {config_file}")
        component_config = srsly.read_json(config_file)
    else:
        component_config = None

    model_file = model_file or "model.pkl"
    msg.info("Loading model from file", model_file)
    crf_extractor = CRFExtractor(
        component_config=component_config).from_disk(model_file)
    msg.good("Successfully loaded CRF tagger", crf_extractor)

    msg.info("Loading dev dataset from file", in_file)
    dev_examples = read_file(in_file)
    msg.good(f"Successfully loaded {len(dev_examples)} dev examples.")

    if spacy_model is not None:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")
    else:
        nlp = spacy.blank("en")
        msg.info(f"Using spaCy blank: 'en'")

    tokenizer = SpacyTokenizer(nlp=nlp)
    use_dense_features = crf_extractor.use_dense_features()
    dev_crf_examples = [
        gold_example_to_crf_tokens(ex,
                                   tokenizer=tokenizer,
                                   use_dense_features=use_dense_features)
        for ex in dev_examples
    ]

    f1_score, classification_report = crf_extractor.eval(dev_crf_examples)
    msg.warn(f"f1 score: {f1_score}")
    print(classification_report)
Example #19
    def from_disk(self, path, exclude=tuple(), disable=None):
        """Loads state from a directory. Modifies the object in place and
        returns it. If the saved `Language` object contains a model, the
        model will be loaded.

        path (unicode or Path): A path to a directory.
        exclude (list): Names of components or serialization fields to exclude.
        RETURNS (Language): The modified `Language` object.

        DOCS: https://spacy.io/api/language#from_disk
        """
        if disable is not None:
            deprecation_warning(Warnings.W014)
            exclude = disable
        path = util.ensure_path(path)
        deserializers = OrderedDict()
        deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
        deserializers["vocab"] = lambda p: self.vocab.from_disk(
            p
        ) and _fix_pretrained_vectors_name(self)
        deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
            p, exclude=["vocab"]
        )
        for name, proc in self.pipeline:
            if name in exclude:
                continue
            if not hasattr(proc, "from_disk"):
                continue
            deserializers[name] = lambda p, proc=proc: proc.from_disk(
                p, exclude=["vocab"]
            )
        if not (path / "vocab").exists() and "vocab" not in exclude:
            # Convert to list here in case exclude is (default) tuple
            exclude = list(exclude) + ["vocab"]
        util.from_disk(path, deserializers, exclude)
        self._path = path
        return self
Example #20
    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config parameters, mostly for consistency.

        RETURNS (EntityRuler): The loaded entity ruler.

        DOCS: https://spacy.io/api/entityruler#from_disk
        """
        path = ensure_path(path)
        depr_patterns_path = path.with_suffix(".jsonl")
        if depr_patterns_path.is_file():
            patterns = srsly.read_jsonl(depr_patterns_path)
            self.add_patterns(patterns)
        else:
            cfg = {}
            deserializers_patterns = {
                "patterns":
                lambda p: self.add_patterns(
                    srsly.read_jsonl(p.with_suffix(".jsonl")))
            }
            deserializers_cfg = {
                "cfg": lambda p: cfg.update(srsly.read_json(p))
            }
            from_disk(path, deserializers_cfg, {})
            self.overwrite = cfg.get("overwrite", False)
            self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
            self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)

            if self.phrase_matcher_attr is not None:
                self.phrase_matcher = PhraseMatcher(
                    self.nlp.vocab, attr=self.phrase_matcher_attr)
            from_disk(path, deserializers_patterns, {})
        return self
Example #21
load_dotenv(find_dotenv())
PREFIX: str = os.getenv('CLUSTER_ROUTE_PREFIX', '').rstrip('/')

# Path to `saved_model.pb`
MODEL_DIR: str = os.getenv('MODEL_DIR', FS.SAVED_MODELS)

# App object.
app = FastAPI(
    title='heart-disease',
    version='1.0',
    description='Predict heart disease with different ML algorithms.',
    openapi_prefix=PREFIX,
)

# Request example.
single_example = srsly.read_json('app/data/single_request_sample.json')
batch_example = srsly.read_json('app/data/batch_request_sample.json')

# Loaded saved model object.
model = SavedModel(model_dir=MODEL_DIR)


@app.get('/', include_in_schema=False)
async def docs_redirect():
    return RedirectResponse(f'{PREFIX}/docs')


@app.get('/models',
         response_model=AvailableModels,
         response_description='List of available models',
         summary='Return available models',
Example #22
def package(
    input_dir: Path,
    output_dir: Path,
    meta_path: Optional[Path] = None,
    code_paths: List[Path] = [],
    name: Optional[str] = None,
    version: Optional[str] = None,
    create_meta: bool = False,
    create_sdist: bool = True,
    create_wheel: bool = False,
    force: bool = False,
    silent: bool = True,
) -> None:
    msg = Printer(no_print=silent, pretty=not silent)
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if create_wheel and not has_wheel():
        err = "Generating a binary .whl file requires wheel to be installed"
        msg.fail(err, "pip install wheel", exits=1)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate pipeline data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if create_sdist or create_wheel:
        opts = ["sdist" if create_sdist else "", "wheel" if create_wheel else ""]
        msg.info(f"Building package artifacts: {', '.join(opt for opt in opts if opt)}")
    for code_path in code_paths:
        if not code_path.exists():
            msg.fail("Can't find code file", code_path, exits=1)
        # Import the code here so it's available when model is loaded (via
        # get_meta helper). Also verifies that everything works
        util.import_file(code_path.stem, code_path)
    if code_paths:
        msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
    if meta_path and not meta_path.exists():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    meta_path = meta_path or input_dir / "meta.json"
    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    meta = get_meta(input_dir, meta)
    if meta["requirements"]:
        msg.good(
            f"Including {len(meta['requirements'])} package requirement(s) from "
            f"meta and config",
            ", ".join(meta["requirements"]),
        )
    if name is not None:
        if not name.isidentifier():
            msg.fail(
                f"Model name ('{name}') is not a valid module name. "
                "This is required so it can be imported as a module.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
                exits=1,
            )
        if not _is_permitted_package_name(name):
            msg.fail(
                f"Model name ('{name}') is not a permitted package name. "
                "This is required to correctly load the model with spacy.load.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
                exits=1,
            )
        meta["name"] = name
    if version is not None:
        meta["version"] = version
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(meta, msg)
    errors = validate(ModelMetaSchema, meta)
    if errors:
        msg.fail("Invalid pipeline meta.json")
        print("\n".join(errors))
        sys.exit(1)
    model_name = meta["name"]
    if not model_name.startswith(meta["lang"] + "_"):
        model_name = f"{meta['lang']}_{model_name}"
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_dir / model_name_v
    package_path = main_path / model_name
    if package_path.exists():
        if force:
            shutil.rmtree(str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(str(input_dir), str(package_path / model_name_v))
    for file_name in FILENAMES_DOCS:
        file_path = package_path / model_name_v / file_name
        if file_path.exists():
            shutil.copy(str(file_path), str(main_path))
    readme_path = main_path / "README.md"
    if not readme_path.exists():
        readme = generate_readme(meta)
        create_file(readme_path, readme)
        create_file(package_path / model_name_v / "README.md", readme)
        msg.good("Generated README.md from meta.json")
    else:
        msg.info("Using existing README.md from pipeline directory")
    imports = []
    for code_path in code_paths:
        imports.append(code_path.stem)
        shutil.copy(str(code_path), str(package_path))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    init_py = TEMPLATE_INIT.format(
        imports="\n".join(f"from . import {m}" for m in imports)
    )
    create_file(package_path / "__init__.py", init_py)
    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
        msg.good(f"Successfully created zipped Python package", zip_file)
    if create_wheel:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
        wheel_name_squashed = re.sub("_+", "_", model_name_v)
        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
        msg.good(f"Successfully created binary wheel", wheel)
    if "__" in model_name:
        msg.warn(
            f"Model name ('{model_name}') contains a run of underscores. "
            "Runs of underscores are not significant in installed package names.",
        )
Example #23
    def from_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
        )
        util.from_disk(path, serializers, [])
        self.tokenizer = try_sudachi_import(self.split_mode)
Example #24
from spacy_ann import __version__
from spacy_ann.api.types import LinkingRecord, LinkingRequest, LinkingResponse
from starlette.requests import Request
from starlette.responses import RedirectResponse

load_dotenv(find_dotenv())
openapi_prefix = os.getenv("CLUSTER_ROUTE_PREFIX", "").rstrip("/")

app = FastAPI(
    title="spacy-ann-linker",
    version=__version__,
    description=
    "Remote Entity Linking with Approximate Nearest Neighbors index lookup for Aliases",
    openapi_prefix=openapi_prefix,
)
example_request = srsly.read_json(
    Path(__file__).parent / "example_request.json")

security = APIKeyHeader(name="api-key")


@app.get("/", include_in_schema=False)
def docs_redirect():
    return RedirectResponse(f"{openapi_prefix}/docs")


@app.post("/link", response_model=LinkingResponse)
async def link(
        request: Request,
        #    api_key = Depends(security),
        similarity_threshold: float = 0.65,
        body: LinkingRequest = Body(..., example=example_request),
Example #25
    def from_disk(self, path, **kwargs):
        path = util.ensure_path(path)
        serializers = {
            "cfg": lambda p: self._set_config(srsly.read_json(p)),
        }
        util.from_disk(path, serializers, [])
Example #26
def main(output_dir, evaluate=False, sort_metric="ents_f"):
    output_dir = Path(output_dir)
    reports = []

    for work_dir in output_dir.glob("*"):
        report = {"path": str(work_dir), "training": []}
        report["vectors"] = srsly.read_json(work_dir.joinpath("meta.json"))

        model_dirs = list(work_dir.joinpath("training").glob("model[0-9]*"))

        for model_dir in model_dirs:
            model = {
                "meta": srsly.read_json(model_dir.joinpath("meta.json")),
                "path": str(model_dir),
                "size": get_size(model_dir),
            }
            report["training"].append(model)

        best_dir = work_dir.joinpath("training/model-best")

        if best_dir.exists():
            report["best"] = {
                "meta": srsly.read_json(model_dir.joinpath("meta.json")),
                "path": str(best_dir),
                "size": get_size(best_dir),
            }

        reports.append(report)

    reports.sort(
        key=lambda r: r["best"]["meta"]["accuracy"][sort_metric]
        if "best" in r else 0,
        reverse=True,
    )

    for (idx, report) in enumerate(reports):
        head = "Model {:>3}".format(idx)
        print("=" * len(head))
        print(head)
        print("=" * len(head))
        print("\tPath: {}".format(report["path"]))

        vec = report["vectors"]
        corp_desc = []

        for corp in vec["corpus"]:
            corp_desc.append(
                "{} (lemmatized={}, case preserved={}, tokens={})".format(
                    corp["description"],
                    corp["lemmatized"],
                    corp["case preserved"],
                    corp["tokens"],
                ))

        print()
        print("Vectors")
        print("-------")

        print("\tAlgorithm: {}".format(vec["algorithm"]["name"]))
        print("\tCorpus   : {}".format(" + ".join(corp_desc)))
        print(
            "\tURL      : http://vectors.nlpl.eu/repository/11/{}.zip".format(
                vec['id']))

        print(
            "\tVectors  : dimensions={}, window={}, iterations={}, vocab size={}"
            .format(
                vec["dimensions"],
                vec["window"],
                vec["iterations"],
                vec["vocabulary size"],
            ))

        print()
        print("Training")
        print("--------")

        for (idx, training) in enumerate(report["training"]):
            if "accuracy" in training["meta"]:
                print_accuracy(training["meta"]["accuracy"],
                               header=idx == 0,
                               indent=1)

        print()
        print("Best")
        print("----")

        if not "best" in report:
            print("\n\tNone saved.")
            return

        print("\tPath: {}".format(report["best"]["path"]))
        print("\tSize: {} MB".format(round(report["best"]["size"] / 1024**2)))
        print()

        print_accuracy(report["best"]["meta"]["accuracy"], indent=1)

        if evaluate:
            print()
            print("Evaluate")
            print("--------")

            res = subprocess.run(
                [
                    sys.executable,
                    "-m",
                    "spacy",
                    "evaluate",
                    "-g",
                    "1",
                    report["best"]["path"],
                    "data/norne-spacy/ud/nob/no-ud-test-ner.json",
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding="utf8",
            )

            if res.returncode != 0:
                print("Evaluation failed!")
                print(res.stderr)

            for line in res.stdout.split("\n"):
                if not "===" in line:
                    print("\t", line)

            report["evaluation"] = res.stdout

        print("\n")
        srsly.write_json("nlpl-report.json", reports)
Example #27
def convert(
    input_path: Union[str, Path],
    output_dir: Union[str, Path],
    *,
    file_type: str = "json",
    n_sents: int = 1,
    seg_sents: bool = False,
    model: Optional[str] = None,
    morphology: bool = False,
    merge_subtokens: bool = False,
    converter: str = "auto",
    ner_map: Optional[Path] = None,
    lang: Optional[str] = None,
    concatenate: bool = False,
    silent: bool = True,
    msg: Optional[Printer] = None,
) -> None:
    if not msg:
        msg = Printer(no_print=silent)
    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
    doc_files = []
    for input_loc in walk_directory(Path(input_path), converter):
        input_data = input_loc.open("r", encoding="utf-8").read()
        # Use converter function to convert data
        func = CONVERTERS[converter]
        docs = func(
            input_data,
            n_sents=n_sents,
            seg_sents=seg_sents,
            append_morphology=morphology,
            merge_subtokens=merge_subtokens,
            lang=lang,
            model=model,
            no_print=silent,
            ner_map=ner_map,
        )
        doc_files.append((input_loc, docs))
    if concatenate:
        all_docs = itertools.chain.from_iterable(
            [docs for _, docs in doc_files])
        doc_files = [(input_path, all_docs)]
    for input_loc, docs in doc_files:
        if file_type == "json":
            data = [docs_to_json(docs)]
            len_docs = len(data)
        else:
            db = DocBin(docs=docs, store_user_data=True)
            len_docs = len(db)
            data = db.to_bytes()
        if output_dir == "-":
            _print_docs_to_stdout(data, file_type)
        else:
            if input_loc != input_path:
                subpath = input_loc.relative_to(input_path)
                output_file = Path(output_dir) / subpath.with_suffix(
                    f".{file_type}")
            else:
                output_file = Path(output_dir) / input_loc.parts[-1]
                output_file = output_file.with_suffix(f".{file_type}")
            _write_docs_to_file(data, output_file, file_type)
            msg.good(
                f"Generated output file ({len_docs} documents): {output_file}")
Example #28
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples,
                                       device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = [
        "Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F",
        "Tag %", "Token %", "CPU WPS", "GPU WPS"
    ]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(nlp,
                                           noise_level=noise_level,
                                           gold_preproc=gold_preproc,
                                           max_length=0)
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(
                                epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded,
                                                gold_preproc=gold_preproc))
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(i - iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path,
                                                  nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #29
import spacy_streamlit
from pathlib import Path
import srsly
import importlib

MODELS = srsly.read_json(Path(__file__).parent / "models.json")
DEFAULT_MODEL = "en_core_web_sm"
DEFAULT_TEXT = "David Bowie moved to the US in 1974, initially staying in New York City before settling in Los Angeles."
DESCRIPTION = """**Explore trained [spaCy v3.0](https://nightly.spacy.io) pipelines**"""


def get_default_text(nlp):
    # Check if spaCy has built-in example texts for the language
    try:
        examples = importlib.import_module(f".lang.{nlp.lang}.examples",
                                           "spacy")
        return examples.sentences[0]
    except (ModuleNotFoundError, ImportError):
        return ""


spacy_streamlit.visualize(
    MODELS,
    default_model=DEFAULT_MODEL,
    visualizers=["parser", "ner", "similarity", "tokens"],
    show_visualizer_select=True,
    sidebar_description=DESCRIPTION,
    get_default_text=get_default_text)
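
The `models.json` loaded above lists the pipelines offered in the sidebar; a minimal hypothetical file, assuming a plain list of package names is accepted:

# ["en_core_web_sm", "en_core_web_md"]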
Example #30
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe(pipe))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            nlp.add_pipe(nlp.create_pipe(pipe))

    if learn_tokens:
        nlp.add_pipe(nlp.create_pipe("merge_subtokens"))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name),
                    exits=1,
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)
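    # e.g. parser_multitasks="dep,tag" attaches two auxiliary objectives to the
    # parser, so its token vectors are also trained to predict dependency
    # labels and coarse tags.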

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
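                    # Illustrative shape of the meta.json written above:
                    # {"lang": ..., "pipeline": [...], "spacy_version": ">=...",
                    #  "speed": {...} or "beam_speed": {...}, "accuracy": {...},
                    #  "vectors": {...}, "name": ..., "version": ...}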
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(i - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #31
from app.models import (  # module path assumed; top of this snippet is truncated
    RecordsRequest,
    RecordsResponse,
    RecordsEntitiesByTypeResponse,
)
from app.spacy_extractor import SpacyExtractor

load_dotenv(find_dotenv())
prefix = os.getenv("CLUSTER_ROUTE_PREFIX", "").rstrip("/")

app = FastAPI(
    title="{{cookiecutter.project_name}}",
    version="1.0",
    description="{{cookiecutter.project_short_description}}",
    openapi_prefix=prefix,
)

example_request = srsly.read_json("app/data/example_request.json")

nlp = spacy.load("{{cookiecutter.project_language}}")
extractor = SpacyExtractor(nlp)


@app.get("/", include_in_schema=False)
def docs_redirect():
    return RedirectResponse(f"{prefix}/docs")


@app.post("/entities", response_model=RecordsResponse, tags=["NER"])
async def extract_entities(body: RecordsRequest = Body(
    ..., example=example_request)):
    """Extract Named Entities from a batch of Records."""
def main(
    model="./zh_vectors_web_ud_lg/model-final",
    new_model_name="zh_vectors_web_ud_clue_lg",
    output_dir="./zh_vectors_web_ud_clue_lg",
    train_path="./clue_spacy_train.jsonl",
    dev_path="./clue_spacy_dev.jsonl",
    meta_path="./meta.json",
    use_gpu=0,
    n_iter=50
):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    import tqdm
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        # Assumed fallback: the original script never handles model=None,
        # which would crash below; a blank Chinese model is a safe default here.
        nlp = spacy.blank("zh")
        print("Created blank 'zh' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    for label in LABEL:
        if label not in ner.labels:
            ner.add_label(label)  # add new entity label to entity recognizer

    train_path = ensure_path(train_path)
    dev_path = ensure_path(dev_path)
    output_dir = ensure_path(output_dir)  # needed below for .exists()/.mkdir()

    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    if output_dir.exists() and [p for p in output_dir.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()

    meta = srsly.read_json(meta_path) if meta_path else {}

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(0))
    corpus = GoldCorpus(train_path, dev_path, limit=0)
    n_train_words = corpus.count_train()
    
    if model is None:
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    else:
        optimizer = create_default_optimizer(Model.ops)
        # TODO: GPU training?

    dropout_rates = decaying(0.2, 0.2, 0.0)
    batch_sizes = compounding(100.0, 1000.0, 1.001)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Define the beam-width settings up front; otherwise has_beam_widths is
    # referenced before assignment when configuring the output table below.
    # fmt: off
    eval_beam_widths = ""
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]
    row_head, output_stats = _configure_training_output(["ner"], use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        noise_level = 0.0
        orth_variant_level = 0.0
        gold_preproc = False
        verbose = False

        best_score = 0.0
        with nlp.disable_pipes(*other_pipes):  # only train NER
            for itn in range(n_iter):
                train_docs = corpus.train_docs(
                    nlp,
                    noise_level=noise_level,
                    orth_variant_level=orth_variant_level,
                    gold_preproc=gold_preproc,
                    max_length=0,
                    ignore_misaligned=True,
                )
                words_seen = 0
                with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                    losses = {}
                    for batch in minibatch_by_words(train_docs, size=batch_sizes):
                        if not batch:
                            continue
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                        if not int(os.environ.get("LOG_FRIENDLY", 0)):
                            pbar.update(sum(len(doc) for doc in docs))
                        words_seen += sum(len(doc) for doc in docs)
                with nlp.use_params(optimizer.averages):
                    set_env_log(False)
                    epoch_model_path = output_dir / ("model%d" % itn)
                    nlp.to_disk(epoch_model_path)
                    nlp_loaded = load_model_from_path(epoch_model_path)
                    for beam_width in eval_beam_widths:
                        for name, component in nlp_loaded.pipeline:
                            if hasattr(component, "cfg"):
                                component.cfg["beam_width"] = beam_width
                        dev_docs = list(
                            corpus.dev_docs(
                                nlp_loaded,
                                gold_preproc=gold_preproc,
                                ignore_misaligned=True,
                            )
                        )
                        nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                        end_time = timer()
                        if use_gpu < 0:
                            gpu_wps = None
                            cpu_wps = nwords / (end_time - start_time)
                        else:
                            gpu_wps = nwords / (end_time - start_time)
                            with Model.use_device("cpu"):
                                nlp_loaded = load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                        acc_loc = output_dir / ("model%d" % itn) / "accuracy.json"
                        srsly.write_json(acc_loc, scorer.scores)

                        # Update model meta.json
                        meta["lang"] = nlp.lang
                        meta["pipeline"] = nlp.pipe_names
                        meta["spacy_version"] = ">=%s" % spacy.__version__
                        if beam_width == 1:
                            meta["speed"] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                            meta["accuracy"] = scorer.scores
                        else:
                            meta.setdefault("beam_accuracy", {})
                            meta.setdefault("beam_speed", {})
                            meta["beam_accuracy"][beam_width] = scorer.scores
                            meta["beam_speed"][beam_width] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                        meta["vectors"] = {
                            "width": nlp.vocab.vectors_length,
                            "vectors": len(nlp.vocab.vectors),
                            "keys": nlp.vocab.vectors.n_keys,
                            "name": nlp.vocab.vectors.name,
                        }
                        meta.setdefault("name", "model%d" % itn)
                        meta.setdefault("version", "0.0.1")
                        meta["labels"] = nlp.meta["labels"]
                        meta_loc = output_dir / ("model%d" % itn) / "meta.json"
                        srsly.write_json(meta_loc, meta)
                        set_env_log(verbose)

                        progress = _get_progress(
                            itn,
                            losses,
                            scorer.scores,
                            output_stats,
                            beam_width=beam_width if has_beam_widths else None,
                            cpu_wps=cpu_wps,
                            gpu_wps=gpu_wps,
                        )

                        msg.row(progress, **row_settings)

    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_dir / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        meta["pipeline"] = nlp.pipe_names
        meta["labels"] = nlp.meta["labels"]
        meta["factories"] = nlp.meta["factories"]
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_dir, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #33
def debug_data(
    lang,
    train_path,
    dev_path,
    tag_map_path=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    verbose=False,
    no_format=False,
):
    """
    Analyze, debug and validate your training and development data, get useful
    stats, and find problems like invalid entity annotations, cyclic
    dependencies, low data labels and more.
    """
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()
    # Update tag map with provided mapping
    nlp.vocab.morphology.tag_map.update(tag_map)

    msg.divider("Data format validation")

    # TODO: Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # TODO: move validation to GoldCorpus in order to be able to load from dir

    # Create the gold corpus to be able to better analyze data
    loading_train_error_message = ""
    loading_dev_error_message = ""
    with msg.loading("Loading corpus..."):
        corpus = GoldCorpus(train_path, dev_path)
        try:
            train_docs = list(corpus.train_docs(nlp))
            train_docs_unpreprocessed = list(
                corpus.train_docs_without_preprocessing(nlp))
        except ValueError as e:
            loading_train_error_message = "Training data cannot be loaded: {}".format(
                str(e))
        try:
            dev_docs = list(corpus.dev_docs(nlp))
        except ValueError as e:
            loading_dev_error_message = "Development data cannot be loaded: {}".format(
                str(e))
    if loading_train_error_message or loading_dev_error_message:
        if loading_train_error_message:
            msg.fail(loading_train_error_message)
        if loading_dev_error_message:
            msg.fail(loading_dev_error_message)
        sys.exit(1)
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed,
                                                   pipeline, nlp)
    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail(
            "Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    if not len(dev_docs):
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn(
            "{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs))
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info("{} total {} in the data ({} unique)".format(
        n_words, "word" if n_words == 1 else "words",
        len(gold_train_data["words"])))
    if gold_train_data["n_misaligned_words"] > 0:
        msg.warn("{} misaligned tokens in the training data".format(
            gold_train_data["n_misaligned_words"]))
    if gold_dev_data["n_misaligned_words"] > 0:
        msg.warn("{} misaligned tokens in the dev data".format(
            gold_dev_data["n_misaligned_words"]))
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info("{} vectors ({} unique keys, {} dimensions)".format(
            len(nlp.vocab.vectors),
            nlp.vocab.vectors.n_keys,
            nlp.vocab.vectors_length,
        ))
        n_missing_vectors = sum(
            gold_train_data["words_missing_vectors"].values())
        msg.warn(
            "{} words in training data without vectors ({:0.2f}%)".format(
                n_missing_vectors,
                100 * n_missing_vectors / gold_train_data["n_words"],
            ))
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_train_data["ner"]
                     if label not in ("O", "-"))
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_punct_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info("{} new {}, {} existing {}".format(
            len(new_labels),
            "label" if len(new_labels) == 1 else "labels",
            len(existing_labels),
            "label" if len(existing_labels) == 1 else "labels",
        ))
        missing_values = label_counts["-"]
        msg.text("{} missing {} (tokens with '-' label)".format(
            missing_values, "value" if missing_values == 1 else "values"))
        for label in new_labels:
            if len(label) == 0:
                msg.fail("Empty label found in new labels")
        if new_labels:
            labels_with_counts = [
                (label, count) for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts,
                                                counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text("Existing: {}".format(_format_labels(existing_labels)),
                     show=verbose)

        if gold_train_data["ws_ents"]:
            msg.fail("{} invalid whitespace entity span(s)".format(
                gold_train_data["ws_ents"]))
            has_ws_ents_error = True

        if gold_train_data["punct_ents"]:
            msg.warn("{} entity span(s) with punctuation".format(
                gold_train_data["punct_ents"]))
            has_punct_ents_warning = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]))
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(
                            label))
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good(
                "No entities consisting of or starting/ending with whitespace")
        if not has_punct_ents_warning:
            msg.good(
                "No entities consisting of or starting/ending with punctuation"
            )

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} instances of the new label".format(
                    NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid.")

        if has_punct_ents_warning:
            msg.text(
                "Entity spans consisting of or starting/ending "
                "with punctuation can not be trained with a noise level > 0.")

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_train_data["cats"]]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info("Text Classification: {} new label(s), {} existing label(s)".
                 format(len(new_labels), len(existing_labels)))
        if new_labels:
            labels_with_counts = _format_labels(
                gold_train_data["cats"].most_common(), counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text("Existing: {}".format(_format_labels(existing_labels)),
                     show=verbose)
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.fail("The train and dev labels are not the same. "
                     "Train labels: {}. "
                     "Dev labels: {}.".format(
                         _format_labels(gold_train_data["cats"]),
                         _format_labels(gold_dev_data["cats"]),
                     ))
        if gold_train_data["n_cats_multilabel"] > 0:
            msg.info("The train data contains instances without "
                     "mutually-exclusive classes. Use '--textcat-multilabel' "
                     "when training.")
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data does not.")
        else:
            msg.info("The train data contains only instances with "
                     "mutually-exclusive classes.")
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes.")

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
        tag_map = nlp.vocab.morphology.tag_map
        msg.info("{} {} in data ({} {} in tag map)".format(
            len(labels),
            "label" if len(labels) == 1 else "labels",
            len(tag_map),
            "label" if len(tag_map) == 1 else "labels",
        ))
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(
                nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang))

    if "parser" in pipeline:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length
        msg.info("Found {} sentence{} with an average length of {:.1f} words.".
                 format(
                     gold_train_data["n_sents"],
                     "s" if len(train_docs) > 1 else "",
                     gold_train_data["n_words"] / gold_train_data["n_sents"],
                 ))

        # check for documents with multiple sentences
        sents_per_doc = gold_train_data["n_sents"] / len(
            gold_train_data["texts"])
        if sents_per_doc < 1.1:
            msg.warn(
                "The training data contains {:.2f} sentences per "
                "document. When there are very few documents containing more "
                "than one sentence, the parser will not learn how to segment "
                "longer texts into sentences.".format(sents_per_doc))

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            msg.info("Found {} nonprojective train sentence{}".format(
                gold_train_unpreprocessed_data["n_nonproj"],
                "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
            ))
        if gold_dev_data["n_nonproj"] > 0:
            msg.info("Found {} nonprojective dev sentence{}".format(
                gold_dev_data["n_nonproj"],
                "s" if gold_dev_data["n_nonproj"] > 1 else "",
            ))

        msg.info("{} {} in train data".format(
            len(labels_train_unpreprocessed),
            "label" if len(labels_train) == 1 else "labels",
        ))
        msg.info("{} {} in projectivized train data".format(
            len(labels_train),
            "label" if len(labels_train) == 1 else "labels"))

        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True)
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][
                    label] <= DEP_LABEL_THRESHOLD:
                msg.warn("Low number of examples for label '{}' ({})".format(
                    label, gold_train_unpreprocessed_data["deps"][label]))
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if gold_train_data["deps"][
                    label] <= DEP_LABEL_THRESHOLD and "||" in label:
                rare_projectivized_labels.append("{}: {}".format(
                    label, str(gold_train_data["deps"][label])))

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                "Low number of examples for {} label{} in the "
                "projectivized dependency trees used for training. You may "
                "want to projectivize labels such as punct before "
                "training in order to improve parser performance.".format(
                    len(rare_projectivized_labels),
                    "s" if len(rare_projectivized_labels) > 1 else "",
                ))
            msg.warn(
                "Projectivized labels with low numbers of examples: "
                "{}".format("\n".join(rare_projectivized_labels)),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data: "
                "{}".format(", ".join(set(labels_train) - set(labels_dev))),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data: " +
                ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                "To train a parser, your data should include at "
                "least {} instances of each label.".format(
                    DEP_LABEL_THRESHOLD),
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                "Multiple root labels ({}) ".format(", ".join(
                    gold_train_unpreprocessed_data["roots"])) +
                "found in training data. spaCy's parser uses a single root "
                "label ROOT so this distinction will not be available.")

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                "Found {} nonprojective projectivized train sentence{}".format(
                    gold_train_data["n_nonproj"],
                    "s" if gold_train_data["n_nonproj"] > 1 else "",
                ))
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                "Found {} projectivized train sentence{} with cycles".format(
                    gold_train_data["n_cycles"],
                    "s" if gold_train_data["n_cycles"] > 1 else "",
                ))

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good("{} {} passed".format(
            good_counts, "check" if good_counts == 1 else "checks"))
    if warn_counts:
        msg.warn("{} {}".format(warn_counts,
                                "warning" if warn_counts == 1 else "warnings"))
    if fail_counts:
        msg.fail("{} {}".format(fail_counts,
                                "error" if fail_counts == 1 else "errors"))

    if fail_counts:
        sys.exit(1)
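
A minimal sketch of calling debug_data() directly. Note that it expects pathlib.Path objects, since .exists() is called on the paths; the file names here are placeholders:

from pathlib import Path

debug_data(
    "en",
    Path("./train.json"),
    Path("./dev.json"),
    pipeline="tagger,parser,ner",
    verbose=True,
)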
Example #34
def package(
    input_dir: Path,
    output_dir: Path,
    meta_path: Optional[Path] = None,
    code_paths: List[Path] = [],
    name: Optional[str] = None,
    version: Optional[str] = None,
    create_meta: bool = False,
    create_sdist: bool = True,
    create_wheel: bool = False,
    force: bool = False,
    silent: bool = True,
) -> None:
    msg = Printer(no_print=silent, pretty=not silent)
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if create_wheel and not has_wheel():
        err = "Generating a binary .whl file requires wheel to be installed"
        msg.fail(err, "pip install wheel", exits=1)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate pipeline data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if create_sdist or create_wheel:
        opts = [
            "sdist" if create_sdist else "", "wheel" if create_wheel else ""
        ]
        msg.info(
            f"Building package artifacts: {', '.join(opt for opt in opts if opt)}"
        )
    for code_path in code_paths:
        if not code_path.exists():
            msg.fail("Can't find code file", code_path, exits=1)
        # Import the code here so it's available when model is loaded (via
        # get_meta helper). Also verifies that everything works
        util.import_file(code_path.stem, code_path)
    if code_paths:
        msg.good(
            f"Including {len(code_paths)} Python module(s) with custom code")
    if meta_path and not meta_path.exists():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    meta_path = meta_path or input_dir / "meta.json"
    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    meta = get_meta(input_dir, meta)
    if name is not None:
        meta["name"] = name
    if version is not None:
        meta["version"] = version
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(meta, msg)
    errors = validate(ModelMetaSchema, meta)
    if errors:
        msg.fail("Invalid pipeline meta.json")
        print("\n".join(errors))
        sys.exit(1)
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_dir / model_name_v
    package_path = main_path / model_name
    if package_path.exists():
        if force:
            shutil.rmtree(str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(str(input_dir), str(package_path / model_name_v))
    license_path = package_path / model_name_v / "LICENSE"
    if license_path.exists():
        shutil.move(str(license_path), str(main_path))
    imports = []
    for code_path in code_paths:
        imports.append(code_path.stem)
        shutil.copy(str(code_path), str(package_path))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    init_py = TEMPLATE_INIT.format(imports="\n".join(f"from . import {m}"
                                                     for m in imports))
    create_file(package_path / "__init__.py", init_py)
    msg.good(f"Successfully created package '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "sdist"],
                             capture=False)
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
        msg.good(f"Successfully created zipped Python package", zip_file)
    if create_wheel:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "bdist_wheel"],
                             capture=False)
        wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
        msg.good(f"Successfully created binary wheel", wheel)
Example #35
def read_labels(path: Path, *, require: bool = False):
    # I decided not to give this a generic name, because I don't want people to
    # use it for arbitrary stuff, as I want this require arg with default False.
    if not require and not path.exists():
        return None
    return srsly.read_json(path)
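
A short usage sketch (hypothetical path; with the default require=False a missing file yields None instead of raising):

from pathlib import Path

labels = read_labels(Path("corpus/labels/ner.json"))
if labels is None:
    print("no precomputed labels available")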
Example #36
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    omit_extra_lookups=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exist if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts small and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    base_components = []
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model: '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text(
                    "Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
                msg.text(
                    "Extending component from base model '{}'".format(pipe))
                base_components.append(pipe)
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline])
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
        # Replace tag map with provided mapping
        nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = nlp.resume_training(device=use_gpu)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec,
                                              base_components)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err,
                                 "Original error message: {}".format(e),
                                 exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
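            # End of epoch: save a checkpoint with the averaged parameters and
            # evaluate a freshly loaded copy on the dev set, once per beam width.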
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        ))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    ))
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        iter_current = i + 1
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(iter_current -
                                                 iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
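        # Whether training completed or was aborted, restore any disabled
        # pipes and save the final model with its merged metadata.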
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
            meta["pipeline"] = nlp.pipe_names
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            srsly.write_json(final_model_path / "meta.json", meta)

            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (final_meta["speed"]["cpu"] is None
                    and final_meta["speed"]["gpu"] is None):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(
                    meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path,
                                                  best_pipes)
        msg.good("Created best model", best_model_path)
Exemplo n.º 37
    def _get_skills(self):
        """Query skills from skills collection"""
        skills_path = self.data_path / "skills.json"
        skills = srsly.read_json(skills_path)
        return skills
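_get_skills assumes its owner exposes a data_path attribute (a pathlib.Path) whose directory contains skills.json. A minimal hypothetical container, just to make the method above runnable in isolation:

from pathlib import Path

import srsly


class SkillStore:
    """Hypothetical owner of _get_skills; only data_path is required."""

    def __init__(self, data_path):
        self.data_path = Path(data_path)

    def _get_skills(self):
        """Query skills from skills collection"""
        skills_path = self.data_path / "skills.json"
        return srsly.read_json(skills_path)


# skills = SkillStore("data")._get_skills()  # loads data/skills.json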
Exemplo n.º 38
    def from_disk(self, path, **_kwargs):
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
        )
        util.from_disk(path, serializers, [])
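The matching to_disk counterpart is not shown here. Under the same pattern it would register one writer per serialization field; the sketch below assumes the component keeps its configuration in a self.cfg dict and reuses spaCy's util.to_disk helper:

    def to_disk(self, path, **_kwargs):
        # Sketch only: assumes the config deserialized above lives in self.cfg.
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: srsly.write_json(p, self.cfg)),)
        )
        util.to_disk(path, serializers, [])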
Exemplo n.º 39
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import json
import logging

import azure.functions as func
import srsly


skills = srsly.read_json("data/skills.json")


def main(req: func.HttpRequest) -> func.HttpResponse:
    skill_id = req.route_params.get("skill_id")
    logging.info(f"Fetching skill by id {skill_id}")

    if skill_id:
        if skill_id not in skills:
            res = func.HttpResponse(
                f"Not Found: Skill with id {skill_id} does not exist", status_code=404
            )
        else:
            res = func.HttpResponse(json.dumps(skills[skill_id]))
    else:
        res = func.HttpResponse(
            "Please pass a skill_id on the query string or in the request body",
            status_code=400,
        )

    return res
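For a quick check outside the Functions host, main can be called with a hand-built request. The constructor arguments below follow azure.functions.HttpRequest; the skill id is hypothetical:

import azure.functions as func

# Simulate the route binding the Functions host would normally provide.
req = func.HttpRequest(
    method="GET",
    url="/api/skills/python",
    body=None,
    route_params={"skill_id": "python"},  # hypothetical skill id
)
res = main(req)
print(res.status_code, res.get_body())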