def from_disk(self, path, exclude=tuple(), disable=None):
    """Restore this `Language` object from a directory, in place.

    Readers are registered for "meta.json", "vocab", "tokenizer" and for
    every pipeline component that exposes a `from_disk` method, and are then
    handed to `util.from_disk` together with the exclusion list.

    path (unicode or Path): A path to a directory.
    exclude (list): Names of components or serialization fields to exclude.
    disable: Deprecated; when not None it replaces `exclude` and warning
        W014 is emitted.
    RETURNS (Language): The modified `Language` object.

    DOCS: https://spacy.io/api/language#from_disk
    """
    if disable is not None:
        deprecation_warning(Warnings.W014)
        exclude = disable
    path = util.ensure_path(path)

    def read_meta(p):
        return self.meta.update(srsly.read_json(p))

    def read_vocab(p):
        return self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)

    def read_tokenizer(p):
        return self.tokenizer.from_disk(p, exclude=["vocab"])

    deserializers = OrderedDict(
        [
            ("meta.json", read_meta),
            ("vocab", read_vocab),
            ("tokenizer", read_tokenizer),
        ]
    )
    for name, proc in self.pipeline:
        if name not in exclude and hasattr(proc, "from_disk"):
            # Bind `proc` as a default so each reader keeps its own component.
            deserializers[name] = lambda p, proc=proc: proc.from_disk(
                p, exclude=["vocab"]
            )
    vocab_missing = not (path / "vocab").exists()
    if vocab_missing and "vocab" not in exclude:
        # `exclude` may be the default tuple, so build a list before extending.
        exclude = list(exclude) + ["vocab"]
    util.from_disk(path, deserializers, exclude)
    self._path = path
    return self
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over. If --create-meta is
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.

    input_dir (unicode or Path): Directory containing the model data.
    output_dir (unicode or Path): Existing directory to create the package in.
    meta_path (unicode or Path): Optional meta.json location; defaults to
        `meta.json` inside the input directory.
    create_meta (bool): Pass the loaded meta through `generate_meta`.
    force (bool): Remove an already existing package directory first.
    """
    msg = Printer()
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

    meta_path = meta_path or input_path / "meta.json"
    if not meta_path.is_file():
        # Without this guard `meta` would never be bound and the key check
        # below would die with a NameError instead of a readable message.
        msg.fail("Can't load model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(input_dir, meta, msg)
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )

    # Layout: <output>/<lang>_<name>-<version>/<lang>_<name>/<lang>_<name>-<version>
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing "
                "directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
    msg.text("To build the package, run `python setup.py sdist` in this directory.")
def _find_best(experiment_dir, component):
    """Return the epoch directory with the highest scores for `component`.

    Every sub-directory of `experiment_dir` except "model-final" is a
    candidate. Its "accuracy.json" is read and the metrics returned by
    `_get_metrics(component)` are collected (0.0 when a metric is absent).
    Candidates are ranked by comparing (scores, path) tuples.

    RETURNS: The winning directory, or None when there is no candidate.
    """
    candidates = []
    for entry in experiment_dir.iterdir():
        if not entry.is_dir() or entry.parts[-1] == "model-final":
            continue
        accs = srsly.read_json(entry / "accuracy.json")
        scores = [accs.get(metric, 0.0) for metric in _get_metrics(component)]
        candidates.append((scores, entry))
    if not candidates:
        return None
    _, best_entry = max(candidates)
    return best_entry
def _collate_best_model(meta, output_path, components):
    """Assemble "model-best" from the best epoch of each component.

    "model-final" is copied to "model-best"; then each component directory
    in the copy is replaced by the one from the epoch chosen by `_find_best`,
    and that epoch's metrics are written into `meta["accuracy"]`. The updated
    meta is saved as "meta.json" inside "model-best".

    RETURNS: Path of the "model-best" directory.
    """
    bests = {
        component: _find_best(output_path, component) for component in components
    }
    best_dest = output_path / "model-best"
    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
    for component, source_dir in bests.items():
        component_dest = path2str(best_dest / component)
        shutil.rmtree(component_dest)
        shutil.copytree(path2str(source_dir / component), component_dest)
        accs = srsly.read_json(source_dir / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
def _load_file(file_path, msg): file_name = file_path.parts[-1] if file_path.suffix == ".json": with msg.loading("Loading {}...".format(file_name)): data = srsly.read_json(file_path) msg.good("Loaded {}".format(file_name)) return data elif file_path.suffix == ".jsonl": with msg.loading("Loading {}...".format(file_name)): data = srsly.read_jsonl(file_path) msg.good("Loaded {}".format(file_name)) return data msg.fail( "Can't load file extension {}".format(file_path.suffix), "Expected .json or .jsonl", exits=1, )
def get_model_meta(path):
    """Read a model directory's meta.json and check its required settings.

    path (unicode or Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    RAISES: IOError (E052) if the directory does not exist, IOError (E053) if
        it has no meta.json file, ValueError (E054) if "lang", "name" or
        "version" is missing or empty.
    """
    model_path = ensure_path(path)
    if not model_path.exists():
        raise IOError(Errors.E052.format(path=path2str(model_path)))
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        raise IOError(Errors.E053.format(path=meta_path))
    meta = srsly.read_json(meta_path)
    required = ("lang", "name", "version")
    unset = next((key for key in required if key not in meta or not meta[key]), None)
    if unset is not None:
        raise ValueError(Errors.E054.format(setting=unset))
    return meta
def info(model=None, markdown=False, silent=False):
    """
    Print info about spaCy installation. If a model shortcut link is
    specified as an argument, print model information. Flag --markdown
    prints details in Markdown for easy copy-pasting to GitHub issues.

    RETURNS (dict): The model's meta (with "source" and, for links, "link"
        added) when `model` is given, otherwise the installation details.
    """
    msg = Printer()

    def _show(table, title):
        # One place for the Markdown vs. table output decision.
        if markdown:
            print_markdown(table, title=title)
        else:
            msg.table(table, title=title)

    if not model:
        data = {
            "spaCy version": about.__version__,
            "Location": path2str(Path(__file__).parent.parent),
            "Platform": platform.platform(),
            "Python version": platform.python_version(),
            "Models": list_models(),
        }
        if not silent:
            _show(data, "Info about spaCy")
        return data

    if util.is_package(model):
        model_path = util.get_package_path(model)
    else:
        model_path = util.get_data_path() / model
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    resolved_path = model_path.resolve()
    if resolved_path != model_path:
        # The given path resolves elsewhere: record both ends.
        meta["link"] = path2str(model_path)
        meta["source"] = path2str(resolved_path)
    else:
        meta["source"] = path2str(model_path)
    if not silent:
        hidden = ("accuracy", "speed")
        model_meta = {k: v for k, v in meta.items() if k not in hidden}
        _show(model_meta, "Info about model '{}'".format(model))
    return meta
def get_model_links(compat):
    """Collect name, version and compatibility for models in the data path.

    compat: Compatibility data forwarded unchanged to `is_compat`.
    RETURNS (dict): Maps each model directory name to a dict with the keys
        "name", "version" and "compat". Directories without a meta.json are
        skipped; the result is empty when there is no data path.
    """
    links = {}
    data_path = get_data_path()
    if not data_path:
        return links
    model_dirs = [p for p in data_path.iterdir() if is_model_path(p)]
    for model_dir in model_dirs:
        meta_path = Path(model_dir) / "meta.json"
        if meta_path.exists():
            meta = srsly.read_json(meta_path)
            full_name = meta["lang"] + "_" + meta["name"]
            links[model_dir.parts[-1]] = {
                "name": full_name,
                "version": meta["version"],
                "compat": is_compat(compat, full_name, meta["version"]),
            }
    return links
def main(file_path: str):
    """Print the "micro" results of every strategy in a JSON results file.

    For each key under "strategies" a header is printed, followed by every
    attribute whose name contains "micro" and does not end with "labels".
    """
    data = srsly.read_json(file_path)
    strategies = data["strategies"]
    strat: str
    for strat in strategies:
        print()
        print(f"EVALUATION RESULTS FOR {strat}")
        print()
        results = strategies[strat]
        for attr in results:
            if not attr.endswith("labels") and "micro" in attr:
                print(f"{attr}: {results[attr]}")
def get_model_meta(path):
    """Read a model's meta.json, validate it and warn on version mismatch.

    path (unicode or Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    RAISES: IOError (E052 / E053) when the directory or its meta.json is
        missing, ValueError (E054) when "lang", "name" or "version" is missing
        or empty.

    Warning W032 is issued when the meta has no "spacy_version" entry, and
    W031 when that entry does not start with ">=" plus the running
    major.minor version.
    """
    model_path = ensure_path(path)
    if not model_path.exists():
        raise IOError(Errors.E052.format(path=path2str(model_path)))
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        raise IOError(Errors.E053.format(path=meta_path))
    meta = srsly.read_json(meta_path)
    for setting in ("lang", "name", "version"):
        if setting not in meta or not meta[setting]:
            raise ValueError(Errors.E054.format(setting=setting))

    if "spacy_version" not in meta:
        warnings.warn(
            Warnings.W032.format(
                model=meta["lang"] + "_" + meta["name"],
                model_version=meta["version"],
                current=about.__version__,
            )
        )
        return meta

    required = meta["spacy_version"]
    current_major_minor = ".".join(about.__version__.split(".")[:2])
    if required.startswith(">=" + current_major_minor):
        return meta

    # Shorten the requirement to "vX.Y" for the message; keep the full value
    # when it does not have the expected shape.
    shown_version = "v" + ".".join(required.replace(">=", "").split(".")[:2])
    if not re.match(r"v\d+\.\d+", shown_version):
        shown_version = required
    warnings.warn(
        Warnings.W031.format(
            model=meta["lang"] + "_" + meta["name"],
            model_version=meta["version"],
            version=shown_version,
            current=about.__version__,
        )
    )
    return meta
def get_model_meta(self, model_path):
    """
    Get model's meta.json from the directory path of the model,
    and validate its contents.

    This method is ported from spaCy.
    `https://github.com/explosion/spaCy/blob/master/spacy/util.py#L231`

    Parameters
    ----------
    model_path: `pathlib.Path`
        Path to model directory.

    Returns
    -------
    dict
        The model's meta data.

    Raises
    ------
    FileNotFoundError
        If the model is not found (i.e., it has not been downloaded)
        or if the model misses the metafile.
    ValueError
        If `meta.json` lacks a non-empty `name` or `version` entry.
    """
    if not model_path.exists():
        raise FileNotFoundError(
            'Module not found at path %s. Verify it is installed.'
            % str(model_path))
    meta_path = model_path / 'meta.json'
    if not meta_path.is_file():
        # The two literals are concatenated, so the first one needs the
        # trailing space to keep the sentences apart.
        raise FileNotFoundError(
            'It seems that model %s is missing meta.json file. '
            'Contact maintainers' % model_path.name)
    meta = srsly.read_json(meta_path)
    for setting in ['name', 'version']:
        if setting not in meta or not meta[setting]:
            raise ValueError(
                'Malformed meta.json file, value %s is missing. '
                'Contact maintainers' % setting)
    return meta
def read_file(path: Union[Path, str], **kwargs) -> List[Dict]:
    """Read train/dev examples from a JSON, JSONL, Markdown or CoNLL file.

    The reader is selected by the lower-cased file extension.

    Args:
        path: file path.
        **kwargs: forwarded to the Markdown and CoNLL readers.

    Returns:
        list of examples

    Raises:
        ValueError: if the extension is not one of the supported ones.
    """
    if not isinstance(path, Path):
        path = Path(path)

    ext = path.suffix.lower()
    if ext == ".json":
        # JSON format is the GOLD standard ...
        return list(srsly.read_json(path))
    if ext == ".jsonl":
        # same here ..
        return list(srsly.read_jsonl(path))
    if ext in (".md", ".markdown"):
        from spacy_crfsuite.markdown import MarkdownReader

        # With markdown, we can easily convert to JSON
        with path.open("r", encoding="utf-8") as f:
            md_reader = MarkdownReader()
            return md_reader(f.read(), **kwargs)
    if ext in (".txt", ".conll"):
        from spacy_crfsuite.conll import read_conll

        # CoNLL-02, CoNLL-03
        return list(read_conll(path, **kwargs))
    # Name every extension handled above so the hint matches the code.
    raise ValueError(
        f"Can't read examples from file with extension: ({ext}). "
        f"spacy_crfsuite accepts .json, .jsonl, .md, .markdown, .txt, "
        f".conll files.")
def pytest_sessionstart(session):
    """Regenerate the pytest test directory before the session starts.

    Any existing TESTS_DIR is deleted and recreated. For every
    (test file, solution file) pair from `get_source_files`, a combined test
    module is built with `format_test` using the PYTEST_TEMPLATE entry of the
    meta read from META_FILE, and written under the test file's name.
    """
    test_dir = Path(TESTS_DIR)
    if test_dir.exists():
        shutil.rmtree(str(test_dir))
        msg.info("Deleted existing test directory {}".format(TESTS_DIR))
    test_dir.mkdir()
    msg.good("Created test directory {}".format(TESTS_DIR))
    meta = srsly.read_json(META_FILE)
    n_files = 0
    for n_files, (test_file, solution_file) in enumerate(get_source_files(), 1):
        test_code = test_file.read_text(encoding="utf8")
        solution_code = solution_file.read_text(encoding="utf8")
        full_code = format_test(
            test_file.stem, meta[PYTEST_TEMPLATE], test_code, solution_code
        )
        (test_dir / test_file.name).write_text(full_code, encoding="utf8")
    msg.good("Created {} files for pytest in {}".format(n_files, TESTS_DIR))
def test_link(trained_linker):
    """POST the bundled example request to /link and check the response.

    A middleware puts `trained_linker` on `request.state.nlp`. The response
    must have status 200 and every span of every document must carry an "id".
    """

    @app.middleware("http")
    async def add_nlp_to_state(request: Request, call_next):
        request.state.nlp = trained_linker
        return await call_next(request)

    client = TestClient(app)
    example_path = Path(__file__).parent.parent / "spacy_ann/api/example_request.json"
    example_request = srsly.read_json(example_path)

    res = client.post("/link", json=example_request)
    assert res.status_code == 200

    documents = res.json()["documents"]
    spans = (span for doc in documents for span in doc["spans"])
    for span in spans:
        assert "id" in span
def from_disk(
    self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":
    """Load the entity ruler from disk, replacing its current state.

    If `path` with a ".jsonl" suffix is a file, it is read as
    newline-delimited JSON patterns. Otherwise `path` is treated as a
    directory: its "cfg" entry is read first to restore the settings, then
    its "patterns" entry is loaded.

    path (str / Path): The JSONL file or directory to load.
    exclude (Iterable[str]): Accepted but not used by this method.
    RETURNS (EntityRuler): The loaded entity ruler.

    DOCS: https://spacy.io/api/entityruler#from_disk
    """
    path = ensure_path(path)
    self.clear()
    depr_patterns_path = path.with_suffix(".jsonl")
    if depr_patterns_path.is_file():
        self.add_patterns(srsly.read_jsonl(depr_patterns_path))
        return self

    cfg = {}

    def read_cfg(p):
        return cfg.update(srsly.read_json(p))

    def read_patterns(p):
        return self.add_patterns(srsly.read_jsonl(p.with_suffix(".jsonl")))

    # Settings are restored before the patterns are added.
    from_disk(path, {"cfg": read_cfg}, {})
    self.overwrite = cfg.get("overwrite", False)
    self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
    self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
    if self.phrase_matcher_attr is not None:
        self.phrase_matcher = PhraseMatcher(
            self.nlp.vocab, attr=self.phrase_matcher_attr
        )
    from_disk(path, {"patterns": read_patterns}, {})
    return self
def _collate_best_model(meta, output_path, components):
    """Assemble "model-best", taking only "ner" from its best epoch.

    "model-final" is copied to "model-best" and each component directory in
    the copy is replaced. "ner" comes from the epoch chosen by `_find_best`,
    whose metrics are also written into `meta["accuracy"]`; every other
    component is copied from "model-final". The meta is then saved as
    "meta.json" inside "model-best".

    RETURNS: Path of the "model-best" directory.
    """
    bests = {
        component: _find_best(output_path, component) for component in components
    }
    final_dir = output_path / "model-final"
    best_dest = output_path / "model-best"
    shutil.copytree(path2str(final_dir), path2str(best_dest))
    for component, source_dir in bests.items():
        component_dest = path2str(best_dest / component)
        shutil.rmtree(component_dest)
        if component != "ner":
            shutil.copytree(path2str(final_dir / component), component_dest)
            continue
        shutil.copytree(path2str(source_dir / component), component_dest)
        accs = srsly.read_json(source_dir / "accuracy.json")
        for metric in _get_metrics(component):
            meta["accuracy"][metric] = accs[metric]
    srsly.write_json(best_dest / "meta.json", meta)
    return best_dest
def from_disk(self, path: Path, **kwargs):
    """Deserialize a saved AnnLinker from disk.

    Loads the knowledge base from `path / "kb"`, the candidate generator
    from `path`, and the thresholds and disambiguation flag from
    `path / "cfg"`, falling back to defaults for absent keys.

    path (Path): directory to deserialize from
    RETURNS (AnnLinker): Initialized AnnLinker
    """
    path = util.ensure_path(path)

    knowledge_base = KnowledgeBase(self.nlp.vocab, 300)
    knowledge_base.load_bulk(path / "kb")
    self.set_kb(knowledge_base)

    self.set_cg(CandidateGenerator().from_disk(path))

    cfg = srsly.read_json(path / "cfg")
    defaults = (
        ("threshold", 0.7),
        ("no_description_threshold", 0.95),
        ("disambiguate", True),
    )
    for attr, default in defaults:
        setattr(self, attr, cfg.get(attr, default))
    return self
def main(in_file, model_file=None, config_file=None, spacy_model=None):
    """Evaluate a saved CRF entity tagger on a dev dataset.

    in_file: File with dev examples, read with `read_file`.
    model_file: Saved CRF model; "model.pkl" is used when not given.
    config_file: Optional JSON file with the component config.
    spacy_model: Optional spaCy model to load for tokenization; a blank
        "en" pipeline is used otherwise.

    The f1 score and the classification report are printed.
    """
    component_config = None
    if config_file:
        msg.info(f"Loading config: {config_file}")
        component_config = srsly.read_json(config_file)

    if not model_file:
        model_file = "model.pkl"
    msg.info("Loading model from file", model_file)
    extractor = CRFExtractor(component_config=component_config)
    crf_extractor = extractor.from_disk(model_file)
    msg.good("Successfully loaded CRF tagger", crf_extractor)

    msg.info("Loading dev dataset from file", in_file)
    dev_examples = read_file(in_file)
    msg.good(f"Successfully loaded {len(dev_examples)} dev examples.")

    if spacy_model is None:
        nlp = spacy.blank("en")
        msg.info("Using spaCy blank: 'en'")
    else:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")

    tokenizer = SpacyTokenizer(nlp=nlp)
    use_dense_features = crf_extractor.use_dense_features()
    dev_crf_examples = []
    for example in dev_examples:
        dev_crf_examples.append(
            gold_example_to_crf_tokens(
                example, tokenizer=tokenizer, use_dense_features=use_dense_features
            )
        )

    f1_score, classification_report = crf_extractor.eval(dev_crf_examples)
    msg.warn(f"f1 score: {f1_score}")
    print(classification_report)
def from_disk(self, path, exclude=tuple(), disable=None):
    """Load this object's state from a directory and return the object.

    One reader each is set up for "meta.json", "vocab" and "tokenizer", plus
    one per pipeline component that has a `from_disk` method and is not
    excluded. `util.from_disk` then runs them against `path`.

    path (unicode or Path): A path to a directory.
    exclude (list): Names of components or serialization fields to exclude.
    disable: Deprecated; a non-None value overrides `exclude` and emits
        warning W014.
    RETURNS (Language): The modified `Language` object.

    DOCS: https://spacy.io/api/language#from_disk
    """
    if disable is not None:
        deprecation_warning(Warnings.W014)
        exclude = disable
    path = util.ensure_path(path)

    def component_reader(component):
        # A factory gives every reader its own component binding.
        return lambda p, proc=component: proc.from_disk(p, exclude=["vocab"])

    deserializers = OrderedDict()
    deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
    deserializers["vocab"] = lambda p: (
        self.vocab.from_disk(p) and _fix_pretrained_vectors_name(self)
    )
    deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
        p, exclude=["vocab"]
    )
    for name, proc in self.pipeline:
        if name in exclude or not hasattr(proc, "from_disk"):
            continue
        deserializers[name] = component_reader(proc)

    if "vocab" not in exclude and not (path / "vocab").exists():
        # The default `exclude` is a tuple; build a list so it can grow.
        exclude = list(exclude) + ["vocab"]
    util.from_disk(path, deserializers, exclude)
    self._path = path
    return self
def from_disk(self, path, **kwargs):
    """Load the entity ruler from disk.

    If `path` with a ".jsonl" suffix is a file, it is read as
    newline-delimited JSON patterns. Otherwise `path` is treated as a
    directory: its "cfg" entry restores the settings, after which its
    "patterns" entry is loaded.

    path (unicode / Path): The JSONL file or directory to load.
    **kwargs: Other config parameters, mostly for consistency.
    RETURNS (EntityRuler): The loaded entity ruler.

    DOCS: https://spacy.io/api/entityruler#from_disk
    """
    path = ensure_path(path)
    depr_patterns_path = path.with_suffix(".jsonl")
    if depr_patterns_path.is_file():
        self.add_patterns(srsly.read_jsonl(depr_patterns_path))
        return self

    cfg = {}

    def load_cfg(p):
        return cfg.update(srsly.read_json(p))

    def load_patterns(p):
        return self.add_patterns(srsly.read_jsonl(p.with_suffix(".jsonl")))

    # The config must be applied before any pattern is added.
    from_disk(path, {"cfg": load_cfg}, {})
    self.overwrite = cfg.get("overwrite", False)
    self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
    self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
    if self.phrase_matcher_attr is not None:
        self.phrase_matcher = PhraseMatcher(
            self.nlp.vocab, attr=self.phrase_matcher_attr
        )
    from_disk(path, {"patterns": load_patterns}, {})
    return self
load_dotenv(find_dotenv()) PREFIX: str = os.getenv('CLUSTER_ROUTE_PREFIX', '').rstrip('/') # Path to `saved_model.pb` MODEL_DIR: str = os.getenv('MODEL_DIR', FS.SAVED_MODELS) # App object. app = FastAPI( title='heart-disease', version='1.0', description='Predict heart disease with different ML algorithms.', openapi_prefix=PREFIX, ) # Request example. single_example = srsly.read_json('app/data/single_request_sample.json') batch_example = srsly.read_json('app/data/batch_request_sample.json') # Loaded saved model object. model = SavedModel(model_dir=MODEL_DIR) @app.get('/', include_in_schema=False) async def docs_redirect(): return RedirectResponse(f'{PREFIX}/docs') @app.get('/models', response_model=AvailableModels, response_description='List of available models', summary='Return available models',
def package(
    input_dir: Path,
    output_dir: Path,
    meta_path: Optional[Path] = None,
    code_paths: List[Path] = [],
    name: Optional[str] = None,
    version: Optional[str] = None,
    create_meta: bool = False,
    create_sdist: bool = True,
    create_wheel: bool = False,
    force: bool = False,
    silent: bool = True,
) -> None:
    """Generate an installable Python package directory for a pipeline.

    input_dir (Path): Directory containing the pipeline data.
    output_dir (Path): Existing directory in which the package is created.
    meta_path (Optional[Path]): meta.json to use; defaults to `meta.json`
        inside the input directory.
    code_paths (List[Path]): Python files that are imported up front and
        copied into the package.
    name (Optional[str]): Overrides the name in the meta; must be a valid
        identifier and a permitted package name.
    version (Optional[str]): Overrides the version in the meta.
    create_meta (bool): Pass the loaded meta through `generate_meta`.
    create_sdist (bool): Run `setup.py sdist` in the package directory.
    create_wheel (bool): Run `setup.py bdist_wheel`; requires `wheel`.
    force (bool): Remove an already existing package directory first.
    silent (bool): Suppress printed output.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if create_wheel and not has_wheel():
        err = "Generating a binary .whl file requires wheel to be installed"
        msg.fail(err, "pip install wheel", exits=1)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate pipeline data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if create_sdist or create_wheel:
        opts = ["sdist" if create_sdist else "", "wheel" if create_wheel else ""]
        msg.info(f"Building package artifacts: {', '.join(opt for opt in opts if opt)}")
    for code_path in code_paths:
        if not code_path.exists():
            msg.fail("Can't find code file", code_path, exits=1)
        # Import the code here so it's available when model is loaded (via
        # get_meta helper). Also verifies that everything works
        util.import_file(code_path.stem, code_path)
    if code_paths:
        msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
    if meta_path and not meta_path.exists():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    # From here on only the normalised `input_path` / `output_path` are used,
    # so plain string arguments work for path arithmetic as well.
    meta_path = meta_path or input_path / "meta.json"
    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    meta = get_meta(input_path, meta)
    if meta["requirements"]:
        msg.good(
            f"Including {len(meta['requirements'])} package requirement(s) from "
            f"meta and config",
            ", ".join(meta["requirements"]),
        )
    if name is not None:
        if not name.isidentifier():
            msg.fail(
                f"Model name ('{name}') is not a valid module name. "
                "This is required so it can be imported as a module.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://docs.python.org/3/reference/lexical_analysis.html#identifiers",
                exits=1,
            )
        if not _is_permitted_package_name(name):
            msg.fail(
                f"Model name ('{name}') is not a permitted package name. "
                "This is required to correctly load the model with spacy.load.",
                "We recommend names that use ASCII A-Z, a-z, _ (underscore), "
                "and 0-9. "
                "For specific details see: https://www.python.org/dev/peps/pep-0426/#name",
                exits=1,
            )
        meta["name"] = name
    if version is not None:
        meta["version"] = version
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(meta, msg)
    errors = validate(ModelMetaSchema, meta)
    if errors:
        msg.fail("Invalid pipeline meta.json")
        print("\n".join(errors))
        sys.exit(1)
    model_name = meta["name"]
    if not model_name.startswith(meta["lang"] + "_"):
        model_name = f"{meta['lang']}_{model_name}"
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name
    if package_path.exists():
        if force:
            shutil.rmtree(str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(str(input_path), str(package_path / model_name_v))
    # Surface documentation files shipped with the pipeline at package root.
    for file_name in FILENAMES_DOCS:
        file_path = package_path / model_name_v / file_name
        if file_path.exists():
            shutil.copy(str(file_path), str(main_path))
    readme_path = main_path / "README.md"
    if not readme_path.exists():
        readme = generate_readme(meta)
        create_file(readme_path, readme)
        create_file(package_path / model_name_v / "README.md", readme)
        msg.good("Generated README.md from meta.json")
    else:
        msg.info("Using existing README.md from pipeline directory")
    imports = []
    for code_path in code_paths:
        imports.append(code_path.stem)
        shutil.copy(str(code_path), str(package_path))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    init_py = TEMPLATE_INIT.format(
        imports="\n".join(f"from . import {m}" for m in imports)
    )
    create_file(package_path / "__init__.py", init_py)
    msg.good(f"Successfully created package directory '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
        msg.good(f"Successfully created zipped Python package", zip_file)
    if create_wheel:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "bdist_wheel"], capture=False)
        # Runs of underscores are collapsed to build the expected wheel name.
        wheel_name_squashed = re.sub("_+", "_", model_name_v)
        wheel = main_path / "dist" / f"{wheel_name_squashed}{WHEEL_SUFFIX}"
        msg.good(f"Successfully created binary wheel", wheel)
    if "__" in model_name:
        msg.warn(
            f"Model name ('{model_name}') contains a run of underscores. "
            "Runs of underscores are not significant in installed package names.",
        )
def from_disk(self, path, **kwargs):
    """Restore the config from `path` and rebuild the tokenizer.

    The "cfg" entry under `path` is read as JSON and passed to
    `_set_config`; afterwards `self.tokenizer` is recreated by
    `try_sudachi_import` from the current `split_mode`.
    """

    def read_cfg(p):
        return self._set_config(srsly.read_json(p))

    path = util.ensure_path(path)
    serializers = OrderedDict()
    serializers["cfg"] = read_cfg
    util.from_disk(path, serializers, [])
    self.tokenizer = try_sudachi_import(self.split_mode)
from spacy_ann import __version__ from spacy_ann.api.types import LinkingRecord, LinkingRequest, LinkingResponse from starlette.requests import Request from starlette.responses import RedirectResponse load_dotenv(find_dotenv()) openapi_prefix = os.getenv("CLUSTER_ROUTE_PREFIX", "").rstrip("/") app = FastAPI( title="spacy-ann-linker", version=__version__, description= "Remote Entity Linking with Approximate Nearest Neighbors index lookup for Aliases", openapi_prefix=openapi_prefix, ) example_request = srsly.read_json( Path(__file__).parent / "example_request.json") security = APIKeyHeader(name="api-key") @app.get("/", include_in_schema=False) def docs_redirect(): return RedirectResponse(f"{openapi_prefix}/docs") @app.post("/link", response_model=LinkingResponse) async def link( request: Request, # api_key = Depends(security), similarity_threshold: float = 0.65, body: LinkingRequest = Body(..., example=example_request),
def from_disk(self, path, **kwargs):
    """Restore the config from the "cfg" entry under `path`.

    The entry is read as JSON and passed to `_set_config`.
    """

    def read_cfg(p):
        return self._set_config(srsly.read_json(p))

    util.from_disk(util.ensure_path(path), {"cfg": read_cfg}, [])
def main(output_dir, evaluate=False, sort_metric="ents_f"):
    """Print a report for every training run found under `output_dir`.

    Each entry of `output_dir` is treated as one run: its "meta.json"
    describes the vectors, "training/model[0-9]*" holds the per-epoch models
    and "training/model-best" the best model. Runs are sorted by the best
    model's `sort_metric` accuracy (runs without a best model count as 0),
    printed, and finally written to "nlpl-report.json".

    output_dir: Directory containing one sub-directory per run.
    evaluate (bool): Also run `spacy evaluate` on each best model and print
        and record its output.
    sort_metric (str): Key in the best model's "accuracy" used for sorting.
    """
    output_dir = Path(output_dir)
    reports = []
    for work_dir in output_dir.glob("*"):
        report = {"path": str(work_dir), "training": []}
        report["vectors"] = srsly.read_json(work_dir.joinpath("meta.json"))
        model_dirs = list(work_dir.joinpath("training").glob("model[0-9]*"))
        for model_dir in model_dirs:
            model = {
                "meta": srsly.read_json(model_dir.joinpath("meta.json")),
                "path": str(model_dir),
                "size": get_size(model_dir),
            }
            report["training"].append(model)
        best_dir = work_dir.joinpath("training/model-best")
        if best_dir.exists():
            report["best"] = {
                # Read the best model's own meta, not the meta of whichever
                # epoch directory the loop above happened to visit last.
                "meta": srsly.read_json(best_dir.joinpath("meta.json")),
                "path": str(best_dir),
                "size": get_size(best_dir),
            }
        reports.append(report)

    reports.sort(
        key=lambda r: r["best"]["meta"]["accuracy"][sort_metric] if "best" in r else 0,
        reverse=True,
    )

    for (idx, report) in enumerate(reports):
        head = "Model {:>3}".format(idx)
        print("=" * len(head))
        print(head)
        print("=" * len(head))
        print("\tPath: {}".format(report["path"]))

        vec = report["vectors"]
        corp_desc = []
        for corp in vec["corpus"]:
            corp_desc.append(
                "{} (lemmatized={}, case preserved={}, tokens={})".format(
                    corp["description"],
                    corp["lemmatized"],
                    corp["case preserved"],
                    corp["tokens"],
                ))
        print()
        print("Vectors")
        print("-------")
        print("\tAlgorithm: {}".format(vec["algorithm"]["name"]))
        print("\tCorpus   : {}".format(" + ".join(corp_desc)))
        print(
            "\tURL      : http://vectors.nlpl.eu/repository/11/{}.zip".format(
                vec['id']))
        print(
            "\tVectors  : dimensions={}, window={}, iterations={}, vocab size={}"
            .format(
                vec["dimensions"],
                vec["window"],
                vec["iterations"],
                vec["vocabulary size"],
            ))

        print()
        print("Training")
        print("--------")
        for (epoch_idx, training) in enumerate(report["training"]):
            if "accuracy" in training["meta"]:
                print_accuracy(training["meta"]["accuracy"],
                               header=epoch_idx == 0,
                               indent=1)

        print()
        print("Best")
        print("----")
        if "best" not in report:
            print("\n\tNone saved.")
            # NOTE(review): this ends the whole report, so later runs are not
            # printed and "nlpl-report.json" is not written. Kept as in the
            # original; confirm whether `continue` was intended.
            return
        print("\tPath: {}".format(report["best"]["path"]))
        print("\tSize: {} MB".format(round(report["best"]["size"] / 1024**2)))
        print()
        print_accuracy(report["best"]["meta"]["accuracy"], indent=1)

        if evaluate:
            print()
            print("Evaluate")
            print("--------")
            res = subprocess.run(
                [
                    sys.executable,
                    "-m",
                    "spacy",
                    "evaluate",
                    "-g",
                    "1",
                    report["best"]["path"],
                    "data/norne-spacy/ud/nob/no-ud-test-ner.json",
                ],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding="utf8",
            )
            if res.returncode != 0:
                print("Evaluation failed!")
                print(res.stderr)
            for line in res.stdout.split("\n"):
                if "===" not in line:
                    print("\t", line)
            report["evaluation"] = res.stdout
        print("\n")

    srsly.write_json("nlpl-report.json", reports)
def convert(
    input_path: Union[str, Path],
    output_dir: Union[str, Path],
    *,
    file_type: str = "json",
    n_sents: int = 1,
    seg_sents: bool = False,
    model: Optional[str] = None,
    morphology: bool = False,
    merge_subtokens: bool = False,
    converter: str = "auto",
    ner_map: Optional[Path] = None,
    lang: Optional[str] = None,
    concatenate: bool = False,
    silent: bool = True,
    msg: Optional[Printer],
) -> None:
    """Convert input files and write or print the converted documents.

    input_path (Union[str, Path]): File or directory to convert; walked with
        `walk_directory`.
    output_dir (Union[str, Path]): Output directory, or "-" to print the
        result to stdout instead of writing files.
    file_type (str): "json" produces JSON via `docs_to_json`; any other value
        serializes a `DocBin` to bytes.
    converter (str): Key of the conversion function in `CONVERTERS`.
    ner_map (Optional[Path]): JSON file that is read and handed to the
        converter.
    concatenate (bool): Merge the documents of all inputs into one output.
    silent (bool): Suppress output of a Printer created here and of the
        converter.
    msg (Optional[Printer]): Printer to use; one is created when falsy.
    The remaining keyword arguments are forwarded to the converter function.
    """
    if not msg:
        msg = Printer(no_print=silent)
    # Normalise once so the comparisons and `.parts` access below also work
    # when the caller passes a plain string.
    input_path = Path(input_path)
    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
    doc_files = []
    for input_loc in walk_directory(input_path, converter):
        # Close the input file deterministically instead of leaking the handle.
        with input_loc.open("r", encoding="utf-8") as file_:
            input_data = file_.read()
        # Use converter function to convert data
        func = CONVERTERS[converter]
        docs = func(
            input_data,
            n_sents=n_sents,
            seg_sents=seg_sents,
            append_morphology=morphology,
            merge_subtokens=merge_subtokens,
            lang=lang,
            model=model,
            no_print=silent,
            ner_map=ner_map,
        )
        doc_files.append((input_loc, docs))
    if concatenate:
        all_docs = itertools.chain.from_iterable(
            [docs for _, docs in doc_files])
        doc_files = [(input_path, all_docs)]
    for input_loc, docs in doc_files:
        if file_type == "json":
            data = [docs_to_json(docs)]
            len_docs = len(data)
        else:
            db = DocBin(docs=docs, store_user_data=True)
            len_docs = len(db)
            data = db.to_bytes()
        if output_dir == "-":
            _print_docs_to_stdout(data, file_type)
        else:
            if input_loc != input_path:
                # Mirror the input's sub-directory layout in the output.
                subpath = input_loc.relative_to(input_path)
                output_file = Path(output_dir) / subpath.with_suffix(
                    f".{file_type}")
            else:
                output_file = Path(output_dir) / input_loc.parts[-1]
                output_file = output_file.with_suffix(f".{file_type}")
            _write_docs_to_file(data, output_file, file_type)
            msg.good(
                f"Generated output file ({len_docs} documents): {output_file}")
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.

    lang (unicode): Language of the blank model; must equal the base model's
        language when `base_model` is given.
    output_path (unicode / Path): Directory receiving one `model<i>` directory
        per iteration, `model-final` and the collated best model.
    train_path, dev_path (unicode / Path): Training / development data.
    raw_text (unicode / Path): Optional JSONL file whose "text" fields are
        used for rehearsal updates.
    base_model (unicode): Optional model to load and update.
    pipeline (unicode): Comma-separated component names to train.
    vectors (unicode): Optional model to load vectors from.
    n_iter (int): Number of training iterations.
    n_early_stopping (int): Stop after this many iterations without a score
        improvement; None disables early stopping.
    n_examples (int): Passed to `GoldCorpus` as `limit`.
    use_gpu (int): Device passed to `begin_training`; a negative value skips
        the GPU speed measurement.
    version (unicode): Model version used when the meta has none.
    meta_path (unicode / Path): Optional meta.json to start from.
    init_tok2vec (unicode / Path): Optional pretrained tok2vec weights.
    parser_multitasks, entity_multitasks (unicode): Comma-separated multitask
        objectives for the parser / entity recognizer.
    noise_level (float), gold_preproc (bool): Passed to the corpus loaders.
    eval_beam_widths (unicode): Comma-separated beam widths to evaluate with;
        width 1 is always included.
    learn_tokens (bool): Passed to the parser as its `learn_tokens` config.
    verbose (bool): Passed to `util.set_env_log`.
    debug (bool): Passed as second positional argument to `evaluate`.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    # Only sub-directories count as "not empty" here; plain files are ignored.
    if output_path.exists() and any(p.is_dir() for p in output_path.iterdir()):
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        # Create missing parents as well, so the promise in the warning above
        # also holds for nested output paths.
        output_path.mkdir(parents=True)

    # Dropout and batch size are drawn from generators so that they can
    # change over the course of training. The schedules are read from
    # environment options, with the defaults given below.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                # Exit here: without the component, the `get_pipe` call below
                # cannot succeed.
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name),
                    exits=1,
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        # NOTE(review): `next` raises StopIteration if the raw
                        # batches run out before the training batches do --
                        # confirm whether that can happen with real data.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        # The first run measured the GPU speed; reload the
                        # model on CPU and evaluate again for the CPU speed.
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
            # Early stopping
            if n_early_stopping is not None:
                current_score = _score_for_model(meta)
                if current_score < best_score:
                    iter_since_best += 1
                else:
                    iter_since_best = 0
                    best_score = current_score
                if iter_since_best >= n_early_stopping:
                    msg.text(
                        "Early stopping, best iteration "
                        "is: {}".format(i - iter_since_best)
                    )
                    msg.text(
                        "Best score = {}; Final iteration "
                        "score = {}".format(best_score, current_score)
                    )
                    break
    finally:
        # Runs on normal completion, early stopping and on errors alike, so
        # the latest weights are always saved.
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
import importlib
from pathlib import Path

import spacy_streamlit
import srsly

# Mapping of selectable models, stored next to this script.
MODELS = srsly.read_json(Path(__file__).parent / "models.json")
DEFAULT_MODEL = "en_core_web_sm"
DEFAULT_TEXT = "David Bowie moved to the US in 1974, initially staying in New York City before settling in Los Angeles."
DESCRIPTION = """**Explore trained [spaCy v3.0](https://nightly.spacy.io) pipelines**"""


def get_default_text(nlp):
    """Return the first built-in example sentence for the pipeline's language.

    Falls back to an empty string when spaCy has no `examples` module for
    `nlp.lang`.
    """
    try:
        lang_examples = importlib.import_module(
            f".lang.{nlp.lang}.examples", "spacy"
        )
    except ImportError:
        # Also covers ModuleNotFoundError, which is a subclass of ImportError.
        return ""
    else:
        return lang_examples.sentences[0]


spacy_streamlit.visualize(
    MODELS,
    default_model=DEFAULT_MODEL,
    visualizers=["parser", "ner", "similarity", "tokens"],
    show_visualizer_select=True,
    sidebar_description=DESCRIPTION,
    get_default_text=get_default_text,
)
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.

    lang (unicode): Language of the blank model; must equal the base model's
        language when `base_model` is given.
    output_path (unicode / Path): Directory receiving one `model<i>` directory
        per iteration, `model-final` and the collated best model.
    train_path, dev_path (unicode / Path): Training / development data.
    raw_text (unicode / Path): Optional JSONL file whose "text" fields are
        used for rehearsal updates.
    base_model (unicode): Optional model to load and update.
    pipeline (unicode): Comma-separated component names to train.
    vectors (unicode): Optional model to load vectors from.
    n_iter (int): Number of training iterations.
    n_early_stopping (int): Stop after this many iterations without a score
        improvement; None disables early stopping.
    n_examples (int): Passed to `GoldCorpus` as `limit`.
    use_gpu (int): Device passed to `begin_training`; a negative value skips
        the GPU speed measurement.
    version (unicode): Model version used when the meta has none.
    meta_path (unicode / Path): Optional meta.json to start from.
    init_tok2vec (unicode / Path): Optional pretrained tok2vec weights.
    parser_multitasks, entity_multitasks (unicode): Comma-separated multitask
        objectives for the parser / entity recognizer.
    noise_level (float), gold_preproc (bool): Passed to the corpus loaders.
    eval_beam_widths (unicode): Comma-separated beam widths to evaluate with;
        width 1 is always included.
    learn_tokens (bool): Append a "merge_subtokens" component to the pipeline.
    verbose (bool): Passed to `util.set_env_log`.
    debug (bool): Passed as second positional argument to `evaluate`.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    # Only sub-directories count as "not empty" here; plain files are ignored.
    if output_path.exists() and any(p.is_dir() for p in output_path.iterdir()):
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        # Create missing parents as well, so the promise in the warning above
        # also holds for nested output paths.
        output_path.mkdir(parents=True)

    # Dropout and batch size are drawn from generators so that they can
    # change over the course of training. The schedules are read from
    # environment options, with the defaults given below.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe(pipe))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            nlp.add_pipe(nlp.create_pipe(pipe))

    if learn_tokens:
        nlp.add_pipe(nlp.create_pipe("merge_subtokens"))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                # Exit here: without the component, the `get_pipe` call below
                # cannot succeed.
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name),
                    exits=1,
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        # NOTE(review): `next` raises StopIteration if the raw
                        # batches run out before the training batches do --
                        # confirm whether that can happen with real data.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        # The first run measured the GPU speed; reload the
                        # model on CPU and evaluate again for the CPU speed.
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
            # Early stopping
            if n_early_stopping is not None:
                current_score = _score_for_model(meta)
                if current_score < best_score:
                    iter_since_best += 1
                else:
                    iter_since_best = 0
                    best_score = current_score
                if iter_since_best >= n_early_stopping:
                    msg.text(
                        "Early stopping, best iteration "
                        "is: {}".format(i - iter_since_best)
                    )
                    msg.text(
                        "Best score = {}; Final iteration "
                        "score = {}".format(best_score, current_score)
                    )
                    break
    finally:
        # Runs on normal completion, early stopping and on errors alike, so
        # the latest weights are always saved.
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
RecordsResponse, RecordsEntitiesByTypeResponse, ) from app.spacy_extractor import SpacyExtractor load_dotenv(find_dotenv()) prefix = os.getenv("CLUSTER_ROUTE_PREFIX", "").rstrip("/") app = FastAPI( title="{{cookiecutter.project_name}}", version="1.0", description="{{cookiecutter.project_short_description}}", openapi_prefix=prefix, ) example_request = srsly.read_json("app/data/example_request.json") nlp = spacy.load("{{cookiecutter.project_language}}") extractor = SpacyExtractor(nlp) @app.get("/", include_in_schema=False) def docs_redirect(): return RedirectResponse(f"{prefix}/docs") @app.post("/entities", response_model=RecordsResponse, tags=["NER"]) async def extract_entities(body: RecordsRequest = Body( ..., example=example_request)): """Extract Named Entities from a batch of Records."""
def main(
    model="./zh_vectors_web_ud_lg/model-final",
    new_model_name="zh_vectors_web_ud_clue_lg",
    output_dir="./zh_vectors_web_ud_clue_lg",
    train_path="./clue_spacy_train.jsonl",
    dev_path="./clue_spacy_dev.jsonl",
    meta_path="./meta.json",
    use_gpu=0,
    n_iter=50,
):
    """Set up the pipeline and entity recognizer, and train the new entity.

    model (unicode): Existing spaCy model to load and update. Required.
    new_model_name (unicode): Not used by the function body at present.
    output_dir (unicode / Path): Directory receiving one `model<itn>`
        directory per iteration, `model-final` and the collated best model.
    train_path, dev_path (unicode / Path): Training / development data.
    meta_path (unicode / Path): meta.json used as the starting point for the
        written model meta; a falsy value starts from an empty meta.
    use_gpu (int): A negative value skips the GPU speed measurement.
    n_iter (int): Number of training iterations.
    """
    import tqdm

    random.seed(0)
    if model is None:
        # Every step below needs a loaded pipeline. Previously this case
        # surfaced later as an unbound `nlp` variable.
        msg.fail(
            "No base model specified",
            "Pass the name or path of an existing spaCy model as `model`.",
            exits=1,
        )
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    for label in LABEL:
        if label not in ner.labels:
            ner.add_label(label)  # add new entity label to entity recognizer

    train_path = ensure_path(train_path)
    dev_path = ensure_path(dev_path)
    # The default is a plain string, but the code below relies on Path
    # methods (`exists`, `iterdir`, `mkdir`, `/`).
    output_dir = ensure_path(output_dir)

    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    if output_dir.exists() and [p for p in output_dir.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_dir.exists():
        output_dir.mkdir()

    meta = srsly.read_json(meta_path) if meta_path else {}

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(0))
    corpus = GoldCorpus(train_path, dev_path, limit=0)
    n_train_words = corpus.count_train()

    # A base model is always loaded above, so use the default optimizer.
    optimizer = create_default_optimizer(Model.ops)  # Todo: gpu train?

    dropout_rates = decaying(0.2, 0.2, 0.0)
    batch_sizes = compounding(100.0, 1000.0, 1.001)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Beam widths are not configurable here: only width 1 is evaluated.
    eval_beam_widths = [1]
    has_beam_widths = eval_beam_widths != [1]

    # fmt: off
    row_head, output_stats = _configure_training_output(["ner"], use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        noise_level = 0.0
        orth_variant_level = 0.0
        gold_preproc = False
        verbose = False

        with nlp.disable_pipes(*other_pipes):  # only train NER
            for itn in range(n_iter):
                train_docs = corpus.train_docs(
                    nlp,
                    noise_level=noise_level,
                    orth_variant_level=orth_variant_level,
                    gold_preproc=gold_preproc,
                    max_length=0,
                    ignore_misaligned=True,
                )
                with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                    losses = {}
                    for batch in minibatch_by_words(train_docs, size=batch_sizes):
                        if not batch:
                            continue
                        docs, golds = zip(*batch)
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                        if not int(os.environ.get("LOG_FRIENDLY", 0)):
                            pbar.update(sum(len(doc) for doc in docs))
                with nlp.use_params(optimizer.averages):
                    set_env_log(False)
                    epoch_model_path = output_dir / ("model%d" % itn)
                    nlp.to_disk(epoch_model_path)
                    nlp_loaded = load_model_from_path(epoch_model_path)
                    for beam_width in eval_beam_widths:
                        for name, component in nlp_loaded.pipeline:
                            if hasattr(component, "cfg"):
                                component.cfg["beam_width"] = beam_width
                        dev_docs = list(
                            corpus.dev_docs(
                                nlp_loaded,
                                gold_preproc=gold_preproc,
                                ignore_misaligned=True,
                            )
                        )
                        nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                        end_time = timer()
                        if use_gpu < 0:
                            gpu_wps = None
                            cpu_wps = nwords / (end_time - start_time)
                        else:
                            # The first run measured the GPU speed; reload the
                            # model on CPU and evaluate again for CPU speed.
                            gpu_wps = nwords / (end_time - start_time)
                            with Model.use_device("cpu"):
                                nlp_loaded = load_model_from_path(epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                        acc_loc = output_dir / ("model%d" % itn) / "accuracy.json"
                        srsly.write_json(acc_loc, scorer.scores)

                        # Update model meta.json
                        meta["lang"] = nlp.lang
                        meta["pipeline"] = nlp.pipe_names
                        meta["spacy_version"] = ">=%s" % spacy.__version__
                        if beam_width == 1:
                            meta["speed"] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                            meta["accuracy"] = scorer.scores
                        else:
                            meta.setdefault("beam_accuracy", {})
                            meta.setdefault("beam_speed", {})
                            meta["beam_accuracy"][beam_width] = scorer.scores
                            meta["beam_speed"][beam_width] = {
                                "nwords": nwords,
                                "cpu": cpu_wps,
                                "gpu": gpu_wps,
                            }
                        meta["vectors"] = {
                            "width": nlp.vocab.vectors_length,
                            "vectors": len(nlp.vocab.vectors),
                            "keys": nlp.vocab.vectors.n_keys,
                            "name": nlp.vocab.vectors.name,
                        }
                        meta.setdefault("name", "model%d" % itn)
                        meta.setdefault("version", "0.0.1")
                        meta["labels"] = nlp.meta["labels"]
                        meta_loc = output_dir / ("model%d" % itn) / "meta.json"
                        srsly.write_json(meta_loc, meta)
                        set_env_log(verbose)

                        progress = _get_progress(
                            itn,
                            losses,
                            scorer.scores,
                            output_stats,
                            beam_width=beam_width if has_beam_widths else None,
                            cpu_wps=cpu_wps,
                            gpu_wps=gpu_wps,
                        )
                        msg.row(progress, **row_settings)
    finally:
        # Runs on normal completion and on errors alike, so the latest
        # weights are always saved.
        with nlp.use_params(optimizer.averages):
            final_model_path = output_dir / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        meta["pipeline"] = nlp.pipe_names
        meta["labels"] = nlp.meta["labels"]
        meta["factories"] = nlp.meta["factories"]
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_dir, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
def debug_data(
    lang,
    train_path,
    dev_path,
    tag_map_path=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    verbose=False,
    no_format=False,
):
    """
    Analyze, debug and validate your training and development data, get useful
    stats, and find problems like invalid entity annotations, cyclic
    dependencies, low data labels and more.

    lang (unicode): Language used to create a blank model when no
        `base_model` is given.
    train_path (Path): Location of the training data (must support
        `.exists()`).
    dev_path (Path): Location of the development data (must support
        `.exists()`).
    tag_map_path (unicode or Path): Optional JSON file whose content is merged
        into the model's tag map.
    base_model (unicode): Optional model to load instead of a blank one.
    pipeline (unicode): Comma-separated names of the components to check.
    ignore_warnings (bool): Forwarded to the printer.
    verbose (bool): Show the additional detail messages.
    no_format (bool): Disable pretty printing.

    The process exits with status 1 if the data is missing, if the corpus
    cannot be loaded, or if any check reported a failure.
    """
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    tag_map = {}
    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()
    # Update tag map with provided mapping
    nlp.vocab.morphology.tag_map.update(tag_map)

    msg.divider("Data format validation")

    # TODO: Validate data format using the JSON schema
    # TODO: update once the new format is ready
    # TODO: move validation to GoldCorpus in order to be able to load from dir

    # Create the gold corpus to be able to better analyze data. Loading errors
    # for both data sets are collected first so that both can be reported
    # before exiting.
    loading_train_error_message = ""
    loading_dev_error_message = ""
    with msg.loading("Loading corpus..."):
        corpus = GoldCorpus(train_path, dev_path)
        try:
            train_docs = list(corpus.train_docs(nlp))
            train_docs_unpreprocessed = list(
                corpus.train_docs_without_preprocessing(nlp)
            )
        except ValueError as e:
            loading_train_error_message = "Training data cannot be loaded: {}".format(
                str(e)
            )
        try:
            dev_docs = list(corpus.dev_docs(nlp))
        except ValueError as e:
            loading_dev_error_message = "Development data cannot be loaded: {}".format(
                str(e)
            )
    if loading_train_error_message or loading_dev_error_message:
        if loading_train_error_message:
            msg.fail(loading_train_error_message)
        if loading_dev_error_message:
            msg.fail(loading_dev_error_message)
        sys.exit(1)
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_train_data = _compile_gold(train_docs, pipeline, nlp)
    gold_train_unpreprocessed_data = _compile_gold(
        train_docs_unpreprocessed, pipeline, nlp
    )
    gold_dev_data = _compile_gold(dev_docs, pipeline, nlp)

    train_texts = gold_train_data["texts"]
    dev_texts = gold_dev_data["texts"]

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    if not len(dev_docs):
        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn("{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs)
        )
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
            ),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_train_data["n_words"]
    msg.info(
        "{} total {} in the data ({} unique)".format(
            n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
        )
    )
    if gold_train_data["n_misaligned_words"] > 0:
        msg.warn(
            "{} misaligned tokens in the training data".format(
                gold_train_data["n_misaligned_words"]
            )
        )
    if gold_dev_data["n_misaligned_words"] > 0:
        msg.warn(
            "{} misaligned tokens in the dev data".format(
                gold_dev_data["n_misaligned_words"]
            )
        )
    most_common_words = gold_train_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)
        ),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            "{} vectors ({} unique keys, {} dimensions)".format(
                len(nlp.vocab.vectors),
                nlp.vocab.vectors.n_keys,
                nlp.vocab.vectors_length,
            )
        )
        n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
        # The message appends "%", so scale the share to a real percentage
        # (a bare fraction would show 0.25 for 25%). An empty corpus reports
        # 0 instead of dividing by zero.
        if n_words:
            missing_pct = 100.0 * n_missing_vectors / n_words
        else:
            missing_pct = 0.0
        msg.warn(
            "{} words in training data without vectors ({:0.2f}%)".format(
                n_missing_vectors, missing_pct
            ),
        )
        msg.text(
            "10 most common words without vectors: {}".format(
                _format_labels(
                    gold_train_data["words_missing_vectors"].most_common(10),
                    counts=True,
                )
            ),
            show=verbose,
        )
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(
            label for label in gold_train_data["ner"] if label not in ("O", "-")
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False
        has_punct_ents_warning = False

        msg.divider("Named Entity Recognition")
        msg.info(
            "{} new {}, {} existing {}".format(
                len(new_labels),
                "label" if len(new_labels) == 1 else "labels",
                len(existing_labels),
                "label" if len(existing_labels) == 1 else "labels",
            )
        )
        missing_values = label_counts["-"]
        msg.text(
            "{} missing {} (tokens with '-' label)".format(
                missing_values, "value" if missing_values == 1 else "values"
            )
        )
        for label in new_labels:
            if len(label) == 0:
                msg.fail("Empty label found in new labels")
        if new_labels:
            labels_with_counts = [
                (label, count)
                for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

        if gold_train_data["ws_ents"]:
            msg.fail(
                "{} invalid whitespace entity span(s)".format(
                    gold_train_data["ws_ents"]
                )
            )
            has_ws_ents_error = True

        if gold_train_data["punct_ents"]:
            msg.warn(
                "{} entity span(s) with punctuation".format(
                    gold_train_data["punct_ents"]
                )
            )
            has_punct_ents_warning = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]
                    )
                )
                has_low_data_warning = True

                # Only labels that are already low on data are checked for
                # negative examples.
                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(label)
                    )
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurrences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")
        if not has_punct_ents_warning:
            msg.good("No entities consisting of or starting/ending with punctuation")

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

        if has_punct_ents_warning:
            msg.text(
                "Entity spans consisting of or starting/ending "
                "with punctuation can not be trained with a noise level > 0."
            )

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_train_data["cats"]]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info(
            "Text Classification: {} new label(s), {} existing label(s)".format(
                len(new_labels), len(existing_labels)
            )
        )
        if new_labels:
            labels_with_counts = _format_labels(
                gold_train_data["cats"].most_common(), counts=True
            )
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )
        if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
            msg.fail(
                "The train and dev labels are not the same. "
                "Train labels: {}. "
                "Dev labels: {}.".format(
                    _format_labels(gold_train_data["cats"]),
                    _format_labels(gold_dev_data["cats"]),
                )
            )
        if gold_train_data["n_cats_multilabel"] > 0:
            msg.info(
                "The train data contains instances without "
                "mutually-exclusive classes. Use '--textcat-multilabel' "
                "when training."
            )
            if gold_dev_data["n_cats_multilabel"] == 0:
                msg.warn(
                    "Potential train/dev mismatch: the train data contains "
                    "instances without mutually-exclusive classes while the "
                    "dev data does not."
                )
        else:
            msg.info(
                "The train data contains only instances with "
                "mutually-exclusive classes."
            )
            if gold_dev_data["n_cats_multilabel"] > 0:
                msg.fail(
                    "Train/dev mismatch: the dev data contains instances "
                    "without mutually-exclusive classes while the train data "
                    "contains only instances with mutually-exclusive classes."
                )

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_train_data["tags"]]
        tag_map = nlp.vocab.morphology.tag_map
        msg.info(
            "{} {} in data ({} {} in tag map)".format(
                len(labels),
                "label" if len(labels) == 1 else "labels",
                len(tag_map),
                "label" if len(tag_map) == 1 else "labels",
            )
        )
        labels_with_counts = _format_labels(
            gold_train_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang
                )
            )

    if "parser" in pipeline:
        has_low_data_warning = False
        msg.divider("Dependency Parsing")

        # profile sentence length; the plural follows the sentence count that
        # is printed, and empty data no longer divides by zero
        n_sents = gold_train_data["n_sents"]
        if n_sents:
            avg_sent_length = gold_train_data["n_words"] / n_sents
        else:
            avg_sent_length = 0.0
        msg.info(
            "Found {} sentence{} with an average length of {:.1f} words.".format(
                n_sents, "s" if n_sents != 1 else "", avg_sent_length
            )
        )

        # check for documents with multiple sentences
        n_texts = len(gold_train_data["texts"])
        sents_per_doc = n_sents / n_texts if n_texts else 0.0
        if sents_per_doc < 1.1:
            msg.warn(
                "The training data contains {:.2f} sentences per "
                "document. When there are very few documents containing more "
                "than one sentence, the parser will not learn how to segment "
                "longer texts into sentences.".format(sents_per_doc)
            )

        # profile labels
        labels_train = [label for label in gold_train_data["deps"]]
        labels_train_unpreprocessed = [
            label for label in gold_train_unpreprocessed_data["deps"]
        ]
        labels_dev = [label for label in gold_dev_data["deps"]]

        if gold_train_unpreprocessed_data["n_nonproj"] > 0:
            msg.info(
                "Found {} nonprojective train sentence{}".format(
                    gold_train_unpreprocessed_data["n_nonproj"],
                    "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
                )
            )
        if gold_dev_data["n_nonproj"] > 0:
            msg.info(
                "Found {} nonprojective dev sentence{}".format(
                    gold_dev_data["n_nonproj"],
                    "s" if gold_dev_data["n_nonproj"] > 1 else "",
                )
            )

        # The noun agrees with the count that is actually printed.
        msg.info(
            "{} {} in train data".format(
                len(labels_train_unpreprocessed),
                "label" if len(labels_train_unpreprocessed) == 1 else "labels",
            )
        )
        msg.info(
            "{} {} in projectivized train data".format(
                len(labels_train), "label" if len(labels_train) == 1 else "labels"
            )
        )

        labels_with_counts = _format_labels(
            gold_train_unpreprocessed_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

        # rare labels in train
        for label in gold_train_unpreprocessed_data["deps"]:
            if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for label '{}' ({})".format(
                        label, gold_train_unpreprocessed_data["deps"][label]
                    )
                )
                has_low_data_warning = True

        # rare labels in projectivized train
        rare_projectivized_labels = []
        for label in gold_train_data["deps"]:
            if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
                rare_projectivized_labels.append(
                    "{}: {}".format(label, str(gold_train_data["deps"][label]))
                )

        if len(rare_projectivized_labels) > 0:
            msg.warn(
                "Low number of examples for {} label{} in the "
                "projectivized dependency trees used for training. You may "
                "want to projectivize labels such as punct before "
                "training in order to improve parser performance.".format(
                    len(rare_projectivized_labels),
                    "s" if len(rare_projectivized_labels) > 1 else "",
                )
            )
            msg.warn(
                "Projectivized labels with low numbers of examples: "
                "{}".format("\n".join(rare_projectivized_labels)),
                show=verbose,
            )
            has_low_data_warning = True

        # labels only in train
        if set(labels_train) - set(labels_dev):
            msg.warn(
                "The following labels were found only in the train data: "
                "{}".format(", ".join(set(labels_train) - set(labels_dev))),
                show=verbose,
            )

        # labels only in dev
        if set(labels_dev) - set(labels_train):
            msg.warn(
                "The following labels were found only in the dev data: "
                + ", ".join(set(labels_dev) - set(labels_train)),
                show=verbose,
            )

        if has_low_data_warning:
            msg.text(
                "To train a parser, your data should include at "
                "least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
                show=verbose,
            )

        # multiple root labels
        if len(gold_train_unpreprocessed_data["roots"]) > 1:
            msg.warn(
                "Multiple root labels ({}) ".format(
                    ", ".join(gold_train_unpreprocessed_data["roots"])
                )
                + "found in training data. spaCy's parser uses a single root "
                "label ROOT so this distinction will not be available."
            )

        # these should not happen, but just in case
        if gold_train_data["n_nonproj"] > 0:
            msg.fail(
                "Found {} nonprojective projectivized train sentence{}".format(
                    gold_train_data["n_nonproj"],
                    "s" if gold_train_data["n_nonproj"] > 1 else "",
                )
            )
        if gold_train_data["n_cycles"] > 0:
            msg.fail(
                "Found {} projectivized train sentence{} with cycles".format(
                    gold_train_data["n_cycles"],
                    "s" if gold_train_data["n_cycles"] > 1 else "",
                )
            )

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            "{} {} passed".format(
                good_counts, "check" if good_counts == 1 else "checks"
            )
        )
    if warn_counts:
        msg.warn(
            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
        )
    if fail_counts:
        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))

    # The counts are read before the summary lines above are printed, so the
    # exit status reflects only the checks themselves.
    if fail_counts:
        sys.exit(1)
def package(
    input_dir: Path,
    output_dir: Path,
    meta_path: Optional[Path] = None,
    code_paths: List[Path] = [],
    name: Optional[str] = None,
    version: Optional[str] = None,
    create_meta: bool = False,
    create_sdist: bool = True,
    create_wheel: bool = False,
    force: bool = False,
    silent: bool = True,
) -> None:
    """Generate an installable Python package directory for pipeline data.

    A directory `{lang}_{name}-{version}` is created inside `output_dir`. It
    receives a copy of the pipeline data, the meta.json, a setup.py, a
    MANIFEST.in and a package `__init__.py`. Optionally an sdist and/or a
    wheel is built from it afterwards.

    input_dir (Path): Directory containing the pipeline data.
    output_dir (Path): Existing directory to create the package in.
    meta_path (Optional[Path]): meta.json to use. Defaults to the meta.json
        inside `input_dir`.
    code_paths (List[Path]): Python files that are imported up front and
        copied into the package. The default list is only read here, never
        mutated.
    name (Optional[str]): Overrides the "name" entry of the meta.
    version (Optional[str]): Overrides the "version" entry of the meta.
    create_meta (bool): Build the meta via `generate_meta` instead of using
        the loaded one as-is.
    create_sdist (bool): Run `setup.py sdist` in the package directory.
    create_wheel (bool): Run `setup.py bdist_wheel` in the package directory.
    force (bool): Remove an already existing package directory first.
    silent (bool): Suppress printer output.

    The process exits with status 1 on missing inputs, an invalid meta, or
    an existing package directory without `force`.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if create_wheel and not has_wheel():
        err = "Generating a binary .whl file requires wheel to be installed"
        msg.fail(err, "pip install wheel", exits=1)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate pipeline data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if create_sdist or create_wheel:
        opts = ["sdist" if create_sdist else "", "wheel" if create_wheel else ""]
        msg.info(f"Building package artifacts: {', '.join(opt for opt in opts if opt)}")
    for code_path in code_paths:
        if not code_path.exists():
            msg.fail("Can't find code file", code_path, exits=1)
        # Import the code here so it's available when model is loaded (via
        # get_meta helper). Also verifies that everything works
        util.import_file(code_path.stem, code_path)
    if code_paths:
        msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
    if meta_path and not meta_path.exists():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    # Path arithmetic goes through the normalised `input_path`/`output_path`
    # so that a plain string argument works just like a Path.
    meta_path = meta_path or input_path / "meta.json"
    if not meta_path.exists() or not meta_path.is_file():
        msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    meta = get_meta(input_dir, meta)
    if name is not None:
        meta["name"] = name
    if version is not None:
        meta["version"] = version
    if not create_meta:  # only print if user doesn't want to overwrite
        msg.good("Loaded meta.json from file", meta_path)
    else:
        meta = generate_meta(meta, msg)
    errors = validate(ModelMetaSchema, meta)
    if errors:
        msg.fail("Invalid pipeline meta.json")
        print("\n".join(errors))
        sys.exit(1)
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name
    if package_path.exists():
        if force:
            shutil.rmtree(str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories.",
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(str(input_path), str(package_path / model_name_v))
    # A LICENSE shipped with the data is moved up next to setup.py.
    license_path = package_path / model_name_v / "LICENSE"
    if license_path.exists():
        shutil.move(str(license_path), str(main_path))
    # Custom code modules are copied next to __init__.py, which imports them.
    imports = []
    for code_path in code_paths:
        imports.append(code_path.stem)
        shutil.copy(str(code_path), str(package_path))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    init_py = TEMPLATE_INIT.format(
        imports="\n".join(f"from . import {m}" for m in imports)
    )
    create_file(package_path / "__init__.py", init_py)
    msg.good(f"Successfully created package '{model_name_v}'", main_path)
    if create_sdist:
        with util.working_dir(main_path):
            util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
        zip_file = main_path / "dist" / f"{model_name_v}{SDIST_SUFFIX}"
        msg.good("Successfully created zipped Python package", zip_file)
    if create_wheel:
        with util.working_dir(main_path):
            util.run_command(
                [sys.executable, "setup.py", "bdist_wheel"], capture=False
            )
        wheel = main_path / "dist" / f"{model_name_v}{WHEEL_SUFFIX}"
        msg.good("Successfully created binary wheel", wheel)
def read_labels(path: Path, *, require: bool = False):
    """Read a labels file as JSON, optionally tolerating a missing file.

    The name is deliberately specific: this helper is not meant as a general
    purpose JSON reader, because of its `require` argument defaulting to False.

    path (Path): Location of the JSON file.
    require (bool): If False (the default), a path that does not exist yields
        None. If True, the existence check is skipped and the file is read
        unconditionally.
    RETURNS: The content returned by `srsly.read_json`, or None.
    """
    may_skip = not (require or path.exists())
    return None if may_skip else srsly.read_json(path)
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    omit_extra_lookups=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.

    lang (unicode): Language of the model. Must match the base model's
        language if a base model is used.
    output_path (unicode or Path): Directory that receives one `model{i}`
        directory per iteration plus `model-final`. Created if missing.
    train_path (unicode or Path): Location of the training data.
    dev_path (unicode or Path): Location of the development data.
    raw_text (unicode or Path): Optional JSONL file whose entries provide a
        "text" key, used for rehearsal updates.
    base_model (unicode): Optional model to start from.
    pipeline (unicode): Comma-separated names of the components to train.
    replace_components (bool): Replace components that already exist in the
        base model instead of extending them.
    vectors (unicode): Optional model to load vectors from.
    width, conv_depth, cnn_window, cnn_pieces, bilstm_depth, embed_rows:
        Forwarded to `nlp.begin_training` as `token_vector_width`,
        `conv_depth`, `conv_window`, `cnn_maxout_pieces`, `bilstm_depth` and
        `embed_size`.
    n_iter (int): Number of training iterations.
    n_early_stopping (int): Stop after this many iterations without an
        improved score. None disables early stopping.
    n_examples (int): Passed to the corpus as `limit`.
    use_gpu (int): GPU to activate. Negative values train on CPU.
    version (unicode): Default for the "version" entry of the meta.
    meta_path (unicode or Path): Optional meta.json used as the initial meta.
    init_tok2vec: Optional pretrained tok2vec weights to load.
    parser_multitasks (unicode): Comma-separated multitask objectives for
        the parser.
    entity_multitasks (unicode): Comma-separated multitask objectives for
        the entity recognizer.
    noise_level, orth_variant_level (float): Passed to `corpus.train_docs`.
    eval_beam_widths (unicode): Comma-separated beam widths to evaluate
        with. Width 1 is always included.
    gold_preproc (bool): Passed to the corpus when creating docs.
    learn_tokens (bool): Parser configuration value.
    textcat_multilabel (bool): Train the text classifier with
        non-exclusive classes.
    textcat_arch (unicode): Text classifier architecture.
    textcat_positive_label (unicode): Positive label of a binary text
        classifier.
    tag_map_path (unicode or Path): Optional JSON tag map to load.
    omit_extra_lookups (bool): Install empty extra lexeme tables on the
        vocab.
    verbose (bool): Passed to the logging setup and to evaluation.
    debug (bool): Not referenced by this function.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))

    # Take dropout and batch size as generators of values. Both schedules are
    # read via util.env_opt, with the values below as defaults.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    # Beam width 1 is always evaluated; extra widths are optional.
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            # Fall back to CPU instead of aborting.
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    base_components = []
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model: '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text("Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                if pipe == "textcat":
                    # An existing textcat can only be extended with the
                    # configuration it was created with.
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does "
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg
                            ),
                            exits=1,
                        )
                msg.text("Extending component from base model '{}'".format(pipe))
                base_components.append(pipe)
        # Components of the base model that are not trained are disabled
        # during training and restored before the final model is saved.
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline]
        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
        # Replace tag map with provided mapping
        nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                # Exit here: the component lookup below cannot succeed for a
                # component that is not part of the pipeline.
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name),
                    exits=1,
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = nlp.resume_training(device=use_gpu)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
                    "The textcat training instances look like they have "
                    "mutually-exclusive classes. Remove the flag "
                    "'--textcat-multilabel' to train a classifier with "
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    # Switch to multilabel training on the first instance
                    # that does not have exactly one positive label.
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive."
                    )
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text(
                "Textcat evaluation score: F1-score for the "
                "label '{}'".format(textcat_positive_label)
            )
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class."
                )
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels))
            )
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information."
            )

    # fmt: off
    row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err, "Original error message: {}".format(e), exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            # Save and evaluate with the optimizer's averaged parameters.
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        )
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(
                                    epoch_model_path
                                )
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg["beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    )
                                )
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(cat)
                                )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        iter_current = i + 1
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(iter_current - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        # Runs on normal completion, early stopping and on abort. Remember
        # the trained pipes before the disabled ones are restored.
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
            meta["pipeline"] = nlp.pipe_names
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            srsly.write_json(final_model_path / "meta.json", meta)

            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
                )
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
                )
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (
                final_meta["speed"]["cpu"] is None
                and final_meta["speed"]["gpu"] is None
            ):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
        msg.good("Created best model", best_model_path)
def _get_skills(self):
    """Read the skills collection from `skills.json` under `self.data_path`."""
    return srsly.read_json(self.data_path / "skills.json")
def from_disk(self, path, **_kwargs):
    """Load this object's configuration from a directory.

    path (unicode or Path): Directory to load from; normalised with
        `util.ensure_path`.
    **_kwargs: Accepted but not used.

    The "cfg" entry is read as JSON and handed to `self._set_config`.
    """

    def _read_cfg(cfg_path):
        return self._set_config(srsly.read_json(cfg_path))

    path = util.ensure_path(path)
    serializers = OrderedDict()
    serializers["cfg"] = _read_cfg
    util.from_disk(path, serializers, [])
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import json
import logging

import azure.functions as func
import srsly

# Loaded once at import time and shared by every invocation of `main`.
skills = srsly.read_json("data/skills.json")


def main(req: func.HttpRequest) -> func.HttpResponse:
    """Return the skill identified by the `skill_id` route parameter.

    Responds with status 400 when no `skill_id` is given, with status 404
    when the id is not present in `skills`, and otherwise with the skill
    serialized as JSON.
    """
    skill_id = req.route_params.get("skill_id")
    logging.info(f"Fetching skill by id {skill_id}")

    # Guard clauses: handle the two error cases first.
    if not skill_id:
        return func.HttpResponse(
            "Please pass a skill_id on the query string or in the request body",
            status_code=400,
        )
    if skill_id not in skills:
        return func.HttpResponse(
            f"Not Found: Skill with id {skill_id} does not exist", status_code=404
        )
    return func.HttpResponse(json.dumps(skills[skill_id]))