def debug_data_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
    ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
    verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
    no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),
    # fmt: on
):
    """
    Analyze, debug and validate your training and development data. Outputs
    useful stats, and can help you find problems like invalid entity annotations,
    cyclic dependencies, low data labels and more.

    DOCS: https://spacy.io/api/cli#debug-data
    """
    # The hyphenated command name is the legacy entry point; point users at
    # the 'debug data' subcommand but keep running.
    called_via_legacy_name = ctx.command.name == "debug-data"
    if called_via_legacy_name:
        deprecation_note = (
            "The debug-data command is now available via the 'debug data' "
            "subcommand (without the hyphen). You can run python -m spacy debug "
            "--help for an overview of the other available debugging commands."
        )
        msg.warn(deprecation_note)
    # Extra CLI arguments are turned into config overrides before the user
    # code is imported, matching the original call order.
    cli_overrides = parse_config_overrides(ctx.args)
    import_code(code_path)
    debug_data(
        config_path,
        silent=False,
        no_format=no_format,
        verbose=verbose,
        ignore_warnings=ignore_warnings,
        config_overrides=cli_overrides,
    )
def collect_cmd(
    spec_file: typer.FileText,
    jobs: int = typer.Option(1, "--jobs", "-j"),
    commit_in: str = typer.Option(
        get_current_commit().hash, "--commit", show_default=True
    ),
    branch: str = typer.Option(get_branch(), "--branch", show_default=True),
) -> None:
    """Run the collectors from a spec file and store the results as a run.

    spec_file: Open spec file, handed to load_spec.
    jobs: Number of jobs passed to run_collectors. Must be at least 1.
    commit_in: Hash of the commit the run is recorded against.
    branch: Branch name recorded on the stored run.
    """
    # Validate before doing any work. This used to be an `assert` placed
    # after `jobs` had already been used, and asserts vanish under `-O`.
    if jobs < 1:
        msg.fail("Jobs value must be positive", exits=1)

    spec = load_spec(spec_file)
    storage = Storage(spec.storage_dir)
    commit = Commit(
        hash=str(commit_in),
        date=get_commit_date(commit_in),
        message=get_commit_message(commit_in),
    )
    # NOTE(review): the parent is looked up on the current Git branch, not on
    # the `branch` option the run is stored under - confirm this is intended.
    parent = storage.get_branch_tip(get_branch())
    if commit == parent:
        msg.fail("We ran on this commit before it seems", exits=1)

    msg.info(f"#jobs: {jobs}")
    msg.info(f"on commit: {commit}")
    msg.info(f"parent commit: {parent}")
    if jobs > 1:
        msg.warn(
            "If you're running benchmarks from the collect call,"
            " concurrency can affect results"
        )
    msg.good("Spec loaded successfully")
    msg.divider()

    try:
        results = run_collectors(spec.collectors, jobs=jobs)
    except CollectorError as e:
        # Best effort: report the collector's own error and stop without
        # storing a run.
        msg.fail("Collector returned invalid format")
        typer.echo(str(e.exc))
        return
    msg.good("Collection completed")

    run = Run(
        commit=commit,
        parent=parent,
        branch=branch,
        date=datetime.now(),
        # Flatten the per-collector metric lists in one pass.
        results=[metric for result in results for metric in result.metrics],
        context={},
    )
    storage.store_run(run)
def print_pipe_analysis(
    analysis: Dict[str, Dict[str, Union[List[str], Dict]]],
    *,
    keys: List[str] = DEFAULT_KEYS,
) -> None:
    """Print a formatted version of the pipe analysis produced by analyze_pipes.

    analysis (Dict[str, Union[List[str], Dict[str, List[str]]]]): The analysis.
        Its "summary" and "problems" entries are read.
    keys (List[str]): The meta keys to show in the table.
    """
    msg.divider("Pipeline Overview")
    columns = ["#", "Component"] + [key.capitalize() for key in keys]
    rows = []
    for index, (pipe_name, meta) in enumerate(analysis["summary"].items()):
        rows.append([index, pipe_name, *meta.values()])
    msg.table(rows, header=columns, divider=True, multiline=True)

    problems = analysis["problems"]
    n_problems = sum(len(unmet) for unmet in problems.values())
    if not any(unmet for unmet in problems.values()):
        msg.good("No problems found.")
        return
    msg.divider(f"Problems ({n_problems})")
    for name, problem in problems.items():
        if not problem:
            continue
        msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
def eval_dataset(set_id):
    """Summarize which option the annotator preferred in dataset `set_id`.

    Counts how often each option ID was selected in accepted examples and
    reports the most frequent one, together with the number of rejected and
    ignored examples. Exits with status 1 if nothing was accepted or rejected.
    """
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)

    counts = Counter()
    for eg in accepted:
        counts.update(eg["accept"])

    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(
        f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")

    if not counts:
        # Only rejections were collected, so there is no option to prefer.
        # most_common(1)[0] used to raise IndexError here.
        msg.warn("No preference (0 / 0)")
        return

    total = sum(counts.values())
    preference, preferred_count = counts.most_common(1)[0]
    ratio = f"{preferred_count} / {total - preferred_count}"
    if counts["A"] == counts["B"]:
        msg.warn(f"No preference ({ratio})")
    else:
        pc = preferred_count / total
        msg.good(
            f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
        # `mapping` comes from the enclosing scope.
        msg.text(mapping[preference])
def git_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master", sparse: bool = False):
    """Check out `subpath` of a Git repository into `dest`.

    repo (str): Address of the repository to clone.
    subpath (str): Path inside the repository to copy to `dest`.
    dest (Path): Destination; must not exist, but its parent must.
    branch (str): Branch to clone.
    sparse (bool): Use git_sparse_checkout when Git is v2.22 or newer.
    RETURNS: The result of git_sparse_checkout for a sparse checkout,
        otherwise None.
    """
    git_version = get_git_version()
    if dest.exists():
        msg.fail("Destination of checkout must not exist", exits=1)
    if not dest.parent.exists():
        msg.fail("Parent of destination of checkout must exist", exits=1)

    if sparse:
        if git_version >= (2, 22):
            return git_sparse_checkout(repo, subpath, dest, branch)
        # Only show warnings if the user explicitly wants sparse checkout but
        # the Git version doesn't support it
        if git_version == (0, 0):
            reason = "You're running an unknown version of Git, so sparse checkout has been disabled."
        else:
            reason = (
                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
                f"that doesn't fully support sparse checkout yet."
            )
        msg.warn(
            f"{reason} "
            f"This means that more files than necessary may be downloaded "
            f"temporarily. To only download the files needed, make sure "
            f"you're using Git v2.22 or above."
        )

    # Fallback: clone everything into a temp dir, then copy the wanted part.
    with make_tempdir() as tmp_dir:
        clone_cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
        run_command(clone_cmd, capture=True)
        # We need Path(name) to make sure we also support subdirectories
        source_dir = tmp_dir / Path(subpath)
        shutil.copytree(str(source_dir), str(dest))
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
    """Validate the CLI arguments and report problems via `msg`.

    config_path: Path to the config file, or "-" (accepted without a check).
    output_dir: Output directory; a non-empty one only triggers a warning.
    resume_path: Optional path to a weights file to resume from.
    epoch_resume: Optional epoch to resume from. Required when the file name
        of resume_path doesn't match "model<N>.bin"; must be >= 0.
    """
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
    # any() stops at the first entry instead of listing the whole directory.
    if output_dir.exists() and any(output_dir.iterdir()):
        if resume_path:
            msg.warn(
                "Output directory is not empty.",
                "If you're resuming a run in this directory, the old weights "
                "for the consecutive epochs will be overwritten with the new ones.",
            )
        else:
            msg.warn(
                "Output directory is not empty. ",
                "It is better to use an empty directory or refer to a new output path, "
                "then the new directory will be created for you.",
            )
    if resume_path is not None:
        if resume_path.is_dir():
            # This is necessary because Windows gives a Permission Denied when we
            # try to open the directory later, which is confusing. See #7878
            msg.fail(
                # The f prefix was missing, so the path was never shown.
                f"--resume-path should be a weights file, but {resume_path} is a directory.",
                exits=True,
            )
        model_name = re.search(r"model\d+\.bin", str(resume_path))
        # Compare against None: 0 is a valid epoch and must not be treated as
        # "not provided".
        if not model_name and epoch_resume is None:
            msg.fail(
                "You have to use the --epoch-resume setting when using a renamed weight file for --resume-path",
                exits=True,
            )
        elif not model_name and epoch_resume < 0:
            msg.fail(
                f"The argument --epoch-resume has to be greater or equal to 0. {epoch_resume} is invalid",
                exits=True,
            )
def project_clone(
    name: str,
    dest: Path,
    *,
    repo: str = about.__projects__,
    branch: str = about.__projects_branch__,
    sparse_checkout: bool = False,
) -> None:
    """Clone a project template from a repository.

    name (str): Name of subdirectory to clone.
    dest (Path): Destination path of cloned project.
    repo (str): URL of Git repo containing project templates.
    branch (str): The branch to clone from
    sparse_checkout (bool): Passed to git_checkout as `sparse`.
    """
    dest = ensure_path(dest)
    check_clone(name, dest, repo)
    project_dir = dest.resolve()
    # Shorter repo name for messages: drop the GitHub URL prefix.
    repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
    source_label = f"'{name}' from repo '{repo_name}' (branch '{branch}')"
    try:
        git_checkout(repo, name, dest, branch=branch, sparse=sparse_checkout)
    except subprocess.CalledProcessError:
        msg.fail(f"Could not clone {source_label}", exits=1)
    msg.good(f"Cloned '{name}' from '{repo_name}' (branch '{branch}')", project_dir)
    has_project_file = (project_dir / PROJECT_FILE).exists()
    if has_project_file:
        msg.good("Your project is now ready!")
    else:
        msg.warn(f"No {PROJECT_FILE} found in directory")
    print(f"To fetch the assets, run:\n{COMMAND} project assets {dest}")
def profile_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read current calling context
    model: str = Arg(..., help="Trained pipeline to load"),
    inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
    n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
    # fmt: on
):
    """
    Profile which functions take the most time in a spaCy pipeline.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.

    DOCS: https://spacy.io/api/cli#debug-profile
    """
    # When the parent command is the top-level CLI, the user called the
    # command directly rather than via 'debug profile': warn, but still run.
    if ctx.parent.command.name == NAME:  # type: ignore[union-attr]    # called as top-level command
        msg.warn(
            "The profile command is now available via the 'debug profile' "
            "subcommand. You can run python -m spacy debug --help for an "
            "overview of the other available debugging commands."
        )
    profile(model, inputs=inputs, n_texts=n_texts)
def link(*args, **kwargs):
    """Placeholder for the removed symlink command.

    Accepts and ignores any arguments, and only prints a warning that model
    symlinks are no longer supported as of spaCy v3.0.
    """
    notice = (
        "As of spaCy v3.0, model symlinks are not supported anymore. You can load trained "
        "pipeline packages using their full names or from a directory path."
    )
    msg.warn(notice)
def project_assets(project_dir: Path, *, sparse_checkout: bool = False) -> None:
    """Fetch the assets listed in the project config.

    Git assets are checked out with git_checkout, URL assets are downloaded
    with fetch_asset, and assets without a URL are only validated.

    project_dir (Path): Path to project directory.
    sparse_checkout (bool): Passed to git_checkout as `sparse` for Git assets.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path)
    assets = config.get("assets", {})
    if not assets:
        msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
    msg.info(f"Fetching {len(assets)} asset(s)")
    for asset in assets:
        # Use the normalized path so a project_dir given as a string works.
        dest = (project_path / asset["dest"]).resolve()
        checksum = asset.get("checksum")
        if "git" in asset:
            git_info = asset["git"]
            git_err = (
                "Cloning spaCy project templates requires Git and the 'git' command. "
                "Make sure it's installed and that the executable is available."
            )
            get_git_version(error=git_err)
            if dest.exists():
                # If there's already a file, check for checksum
                if checksum and checksum == get_checksum(dest):
                    msg.good(
                        f"Skipping download with matching checksum: {asset['dest']}"
                    )
                    continue
                # Stale or unverified copy: remove it so the checkout can
                # recreate the destination.
                if dest.is_dir():
                    shutil.rmtree(dest)
                else:
                    dest.unlink()
            if git_info.get("repo") is None:
                msg.fail(
                    "A git asset must include 'repo', the repository address.",
                    exits=1,
                )
            if git_info.get("path") is None:
                msg.fail(
                    "A git asset must include 'path' - use \"\" to get the entire repository.",
                    exits=1,
                )
            # Only pass `branch` when it's configured, so git_checkout's own
            # default applies instead of an explicit None.
            checkout_kwargs = {"sparse": sparse_checkout}
            branch = git_info.get("branch")
            if branch is not None:
                checkout_kwargs["branch"] = branch
            git_checkout(
                git_info["repo"], git_info["path"], dest, **checkout_kwargs
            )
            msg.good(f"Downloaded asset {dest}")
        else:
            url = asset.get("url")
            if not url:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
                continue
            fetch_asset(project_path, url, dest, checksum)
def init_model(
    lang,
    output_dir,
    freqs_loc=None,
    clusters_loc=None,
    jsonl_loc=None,
    vectors_loc=None,
    truncate_vectors=0,
    prune_vectors=-1,
    vectors_name=None,
    model_name=None,
):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors. If vectors are provided in Word2Vec format, they can
    be either a .txt or zipped as a .zip or .tar.gz.

    The lexical attributes are read from jsonl_loc if given, otherwise from
    the deprecated freqs_loc / clusters_loc files. The model is saved to
    output_dir (created if missing) and returned.
    """
    if jsonl_loc is not None:
        if freqs_loc is not None or clusters_loc is not None:
            msg.warn(
                "Incompatible arguments",
                "The -f and -c arguments are deprecated, and not compatible "
                "with the -j argument, which should specify the same "
                "information. Either merge the frequencies and clusters data "
                "into the JSONL-formatted file (recommended), or use only the "
                "-f and -c files, without the other lexical attributes.",
            )
        jsonl_loc = ensure_path(jsonl_loc)
        lex_attrs = srsly.read_jsonl(jsonl_loc)
    else:
        clusters_loc = ensure_path(clusters_loc)
        freqs_loc = ensure_path(freqs_loc)
        if freqs_loc is not None and not freqs_loc.exists():
            msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
        lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

    with msg.loading("Creating model..."):
        nlp = create_model(lang, lex_attrs, name=model_name)
    msg.good("Successfully created model")

    if vectors_loc is not None:
        add_vectors(nlp, vectors_loc, truncate_vectors, prune_vectors, vectors_name)
    vec_added = len(nlp.vocab.vectors)
    lex_added = len(nlp.vocab)
    msg.good(
        "Successfully compiled vocab",
        f"{lex_added} entries, {vec_added} vectors",
    )

    # Normalize like the other path arguments, and create missing parents so
    # a nested output path doesn't fail.
    output_dir = ensure_path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
    nlp.to_disk(output_dir)
    return nlp
async def insert_commons_categories(self, row, delay):
    """Wait `delay` seconds, then store the Commons categories for `row`.

    The value comes from Extract(...).commons_categories() for the row's
    "qid" and is handed to process_data under header index 31. A missing
    page is reported as a warning. Elapsed time is reported in both cases.
    """
    try:
        await asyncio.sleep(delay)
        header_id = config.CORPUS_ARTICLES_HEADER[31]
        extractor = Extract(self.project, row["qid"])
        categories = extractor.commons_categories()
        self.process_data(row, header_id, data=categories)
        self.utils.report_time(self.start_time)
    except pwb.exceptions.NoPage:
        msg.warn("Sin elemento de Wikidata")
        self.utils.report_time(self.start_time)
async def insert_wikidata_references_p143(self, row, delay):
    """Wait `delay` seconds, then store a Wikidata references value for `row`.

    Uses element 1 of Extract(...).wikidata_references() for the row's "qid"
    and hands it to process_data under header index 25. A missing page is
    reported as a warning. Elapsed time is reported in both cases.
    """
    try:
        await asyncio.sleep(delay)
        header_id = config.CORPUS_ARTICLES_HEADER[25]
        extractor = Extract(self.project, row["qid"])
        references = extractor.wikidata_references()
        self.process_data(row, header_id, data=references[1])
        self.utils.report_time(self.start_time)
    except pwb.exceptions.NoPage:
        msg.warn("Sin elemento de Wikidata")
        self.utils.report_time(self.start_time)
def process_data(self, row, header_id, data=None):
    """Insert `data` for `row` under `header_id` and export the result.

    row: Mapping with at least the keys "qid" and "id".
    header_id: Header identifier passed to self.insert.
    data: Value to insert; defaults to `row` itself.

    Page-related errors and unexpected errors are reported through `msg` and
    `logging` instead of being raised.
    """
    if data is None:
        data = row
    msg.info(f"id: {row['qid']}")
    msg.good(f"{data}")
    try:
        self.insert(data, row["id"], header_id)
        self.export()
    except pwb.exceptions.InvalidTitle:
        # The f prefix was missing on these messages, so the literal
        # "{row['qid']}" was printed instead of the ID.
        error = f"Título invalido: {row['qid']}"
        msg.warn(error)
        logging.error(error)
        self.utils.should_continue()
    except pwb.exceptions.NoPage:
        error = f"No tiene página en eswiki: {row['qid']}"
        msg.warn(error)
        logging.error(error)
    except pwb.exceptions.IsRedirectPage:
        # TODO: detect that the page is a redirect, resolve the target page
        # and work with that one instead.
        error = f"Es una redirección: {row['qid']}"
        msg.warn(error)
        logging.error(error)
    except Exception as exc:
        # Deliberate best effort: keep going on unexpected errors, but no
        # longer swallow KeyboardInterrupt/SystemExit as the bare except did.
        # TODO: narrow this down to the exceptions that can actually occur.
        error = f"Error inesperado: {type(exc)}"
        msg.warn(error)
        logging.error(error)
def validate() -> None:
    """Print the installed pipeline packages and their compatibility.

    Shows a table of installed packages, update commands for incompatible
    packages that have a compatible version, and a note about the rest.
    Exits with status 1 if any installed package is incompatible.
    """
    model_pkgs, compat = get_model_pkgs()
    spacy_version = get_minor_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
    if not current_compat:
        msg.warn(f"No compatible packages found for v{spacy_version} of spaCy")

    incompat_models = {
        pkg["name"] for pkg in model_pkgs.values() if not pkg["compat"]
    }
    # Split the incompatible packages by whether an update is available.
    na_models = []
    update_models = []
    for model in incompat_models:
        if model in current_compat:
            update_models.append(model)
        else:
            na_models.append(model)

    spacy_dir = Path(__file__).parent.parent
    msg.divider(f"Installed pipeline packages (spaCy v{about.__version__})")
    msg.info(f"spaCy installation: {spacy_dir}")

    if not model_pkgs:
        msg.text("No pipeline packages found in your current environment.", exits=0)
    else:
        rows = []
        for data in model_pkgs.values():
            if data["compat"]:
                comp = msg.text("", color="green", icon="good", no_print=True)
                version = msg.text(data["version"], color="green", no_print=True)
            else:
                version = msg.text(data["version"], color="yellow", no_print=True)
                suggested = current_compat.get(data["name"], ["n/a"])[0]
                comp = f"--> {suggested}"
            rows.append((data["name"], data["spacy"], version, comp))
        msg.table(rows, header=("NAME", "SPACY", "VERSION", ""))

    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the packages:")
        commands = ["python -m spacy download {}".format(pkg) for pkg in update_models]
        print("\n".join(commands) + "\n")
    if na_models:
        msg.info(
            f"The following packages are custom spaCy pipelines or not "
            f"available for spaCy v{about.__version__}:",
            ", ".join(na_models),
        )
    if incompat_models:
        sys.exit(1)
def apply_bilou_schema(self, message: Dict) -> List[Text]:
    """Apply BILOU schema to a gold standard JSON example.

    Tokens tagged "-" are collected into runs, and a warning is printed for
    each run, including one that reaches the end of the sentence.

    Args:
        message (dict): message dict; its "text" entry is used in warnings.

    Returns:
        a list of BILOU tags.
    """
    tokens = self.tokens_without_cls(message)
    entity_offsets = get_entity_offsets(message)
    entities = bilou_tags_from_offsets(tokens, entity_offsets)

    def warn_misaligned(run):
        # Report one run of consecutive "-" tokens; silent without a printer.
        if not msg:
            return
        collected_text = " ".join([t.text for t in run])
        msg.warn(
            f"Misaligned entity annotation for '{collected_text}' "
            f"in sentence: \"{message['text']}\". "
            f"Make sure the start and end values of the "
            f"annotated training examples end at token "
            f"boundaries (e.g. don't include trailing "
            f"whitespaces or punctuation).")

    collected = []
    for t, e in zip(tokens, entities):
        if e == "-":
            collected.append(t)
        elif collected:
            warn_misaligned(collected)
            collected = []
    # A run that ends the sentence has no following tag to trigger the
    # warning, so flush it here.
    if collected:
        warn_misaligned(collected)
    return entities
def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
    """Report the state of an asset that has no URL and must be provided by
    the user: missing, present, or present with a matching or mismatching
    checksum.

    dest (Path): Destination path of the asset.
    checksum (Optional[str]): Optional checksum of the expected file.
    """
    if not Path(dest).exists():
        msg.warn(
            f"No URL provided for asset. You need to add this file yourself: {dest}"
        )
        return
    if not checksum:
        msg.good(f"Asset already exists: {dest}")
        return
    if checksum == get_checksum(dest):
        msg.good(f"Asset exists with matching checksum: {dest}")
    else:
        msg.fail(f"Asset available but with incorrect checksum: {dest}")
def fetch_asset(project_path: Path, url: str, dest: Path, checksum: Optional[str] = None) -> None:
    """Fetch an asset from a given URL or path. If a checksum is provided and
    a local file exists, it's only re-downloaded if the checksum doesn't match.

    project_path (Path): Path to project directory.
    url (str): URL or path to asset.
    dest (Path): Destination of the asset, resolved against project_path.
    checksum (Optional[str]): Optional expected checksum of local file.
    """
    dest_path = (project_path / dest).resolve()
    if dest_path.exists():
        if not checksum:
            # If there's not a checksum, make sure the file is a possibly valid size
            if os.path.getsize(dest_path) == 0:
                msg.warn(
                    f"Asset exists but with size of 0 bytes, deleting: {dest}")
                os.remove(dest_path)
        elif checksum == get_checksum(dest_path):
            # If there's already a file with the right checksum, we're done
            msg.good(f"Skipping download with matching checksum: {dest}")
            return
    # We might as well support the user here and create parent directories in
    # case the asset dir isn't listed as a dir to create in the project.yml
    parent_dir = dest_path.parent
    if not parent_dir.exists():
        parent_dir.mkdir(parents=True)
    with working_dir(project_path):
        url = convert_asset_url(url)
        try:
            download_file(url, dest_path)
            msg.good(f"Downloaded asset {dest}")
        except requests.exceptions.RequestException as e:
            local_source = Path(url)
            if local_source.exists() and local_source.is_file():
                # If it's a local file, copy to destination
                shutil.copy(url, str(dest_path))
                msg.good(f"Copied local asset {dest}")
            else:
                msg.fail(f"Download failed: {dest}", e)
    checksum_mismatch = checksum and checksum != get_checksum(dest_path)
    if checksum_mismatch:
        msg.fail(
            f"Checksum doesn't match value defined in {PROJECT_FILE}: {dest}")
def convert_asset_url(url: str) -> str:
    """Check and convert the asset URL if needed.

    A regular GitHub page URL (anything on github.com that is not a
    "releases/download" link) is converted to the matching
    raw.githubusercontent.com URL, with a warning.

    url (str): The asset URL.
    RETURNS (str): The converted URL.
    """
    # The dot is escaped and the lookahead requires the host to end after
    # "github.com", so hosts that merely start with it are left alone.
    is_github_host = re.match(r"https?://github\.com(?![\w.-])", url) is not None
    # If the asset URL is a regular GitHub URL it's likely a mistake
    if is_github_host and "releases/download" not in url:
        # Rewrite only the host and the first /tree/ or /blob/ segment, so
        # identical text later in the file path stays untouched.
        converted = url.replace("github.com", "raw.githubusercontent.com", 1)
        converted = re.sub(r"/(tree|blob)/", "/", converted, count=1)
        msg.warn(
            "Downloading from a regular GitHub URL. This will only download "
            "the source of the page, not the actual file. Converting the URL "
            "to a raw URL.",
            converted,
        )
        return converted
    return url
def __init__(self, vectorizer, architecture, state_dict_path, labels):
    """
    Set up label conversion, the vectorizer, the neural network and its
    optimizer. If a file exists at `state_dict_path`, it is loaded into the
    network (a failure is logged, not raised) and a timestamped backup copy
    is written next to it.

    - param vectorizer(callable): a function that converts any string to a NumPy 1-D array.
    - param architecture(class): a `torch.nn.Module` child class to be instantiated into a neural net.
    - param state_dict_path(str): path to a PyTorch state dict that matches the architecture.
    - param labels(list of str): the classification labels, e.g. ["POSITIVE", "NEGATIVE"].
    """
    # set up label conversion
    self.label_encoder = {_label: i for i, _label in enumerate(labels)}
    self.label_decoder = {i: _label for i, _label in enumerate(labels)}
    self.num_classes = len(self.label_encoder)

    # set up vectorizer and the neural network with appropriate dimensions;
    # the input dimension is probed by vectorizing the empty string
    self.vectorizer = vectorizer
    vec_dim = self.vectorizer("").shape[0]
    self.nn = architecture(vec_dim, self.num_classes)

    # if a state dict exists, load it and create a backup copy
    import os

    if os.path.isfile(state_dict_path):
        from shutil import copyfile

        try:
            self.nn.load_state_dict(torch.load(state_dict_path))
        except Exception as e:
            # The message used to end in a literal "e" rather than the
            # exception text.
            logger.warn(f"Load VectorNet state path failed with {type(e)}: {e}")

        # The backup is made whether or not loading succeeded.
        state_dict_backup_path = (
            f"{state_dict_path}.{datetime.now().strftime('%Y%m%d%H%M%S')}"
        )
        copyfile(state_dict_path, state_dict_backup_path)

    # set a path to store updated parameters
    self.nn_update_path = state_dict_path

    # initialize an optimizer object and a dict to hold dynamic parameters
    self.nn_optimizer = torch.optim.Adam(self.nn.parameters())
    self._dynamic_params = {"optimizer": {"lr": 0.01, "betas": (0.9, 0.999)}}
def main(name: ("模型名称", "positional", None, None, trf_list), make_cache_dir: (" 创建缓存文件夹", "flag", "mk"), use_local_class: ("不使用网络读取", "flag", "local")):
    """Print the download URLs for the pretrained model `name`.

    Optionally creates the cache directory `cache_path + name` first, then
    prints the config, model and vocabulary file URLs between two banner
    lines, followed by a closing hint.
    """
    if make_cache_dir:
        cache_dir_name = cache_path + name
        c_path = ensure_path(f"{cache_dir_name}")
        if not c_path.exists():
            c_path.mkdir()
            msg.good(f" 缓存文件夹已创建:\t{cache_path}{name}")
        else:
            msg.warn(f"{cache_dir_name} already exists")

    banner = "\n================url================\n"
    msg.warn(banner)
    config_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[name]
    model_file = ALL_PRETRAINED_MODEL_ARCHIVE_MAP[name]
    msg.text(f"{config_file}\n{model_file}\n")
    tokenizer = get_tokenizer(name, use_local_class)
    # Each entry maps model names to the URL of one vocabulary file.
    for vocab_file in tokenizer.pretrained_vocab_files_map.values():
        msg.text(f"{vocab_file[name]}\n")
    msg.warn(banner)
    msg.good("\n使用下载工具下载后,将模型文件放入缓存文件夹中。")
def get_source_files(lang):
    """Yield (lang_name, test_file, solution_file) for the exercise tests.

    Walks the language subdirectories of EXERCISES_DIR and pairs every
    "test_*" file with its "solution_*" file in the same directory. The
    file named GENERAL_TEST is yielded with None as its solution; any other
    test without a solution only produces a warning.

    lang: If truthy, only the subdirectory with this name is processed.
    """
    test_prefix = "test_"
    exercises_path = Path(EXERCISES_DIR)
    if not exercises_path.exists():
        msg.fail(f"Can't find exercises directory: {EXERCISES_DIR}", exits=1)
    for lang_path in exercises_path.iterdir():
        if not lang_path.is_dir():
            continue
        lang_name = lang_path.stem
        if lang and lang_name != lang:
            continue
        for py_file in lang_path.iterdir():
            if not py_file.name.startswith(test_prefix):
                continue
            # Strip only the leading prefix. split('test_')[1] cut the name
            # short when "test_" appeared again later in the file name.
            solution_name = f"solution_{py_file.name[len(test_prefix):]}"
            solution_file = lang_path / solution_name
            if solution_file.exists():
                yield (lang_name, py_file, solution_file)
            elif py_file.name == GENERAL_TEST:
                yield (lang_name, py_file, None)
            else:
                msg.warn(
                    f"Didn't find solution for test: {py_file.stem} ({lang_path})"
                )
def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -> None:
    """Download and install a pipeline package.

    model (str): Package name, or "<name>-<version>" when `direct` is set.
    direct (bool): Treat `model` as a full name with version and skip the
        compatibility lookup.
    sdist (bool): Use the sdist suffix instead of the wheel suffix.
    *pip_args: Additional arguments passed on to download_model.
    """
    if (
        not (is_package("spacy") or is_package("spacy-nightly"))
        and "--no-deps" not in pip_args
    ):
        msg.warn(
            "Skipping pipeline package dependencies and setting `--no-deps`. "
            "You don't seem to have the spaCy package itself installed "
            "(maybe because you've built from source?), so installing the "
            "package dependencies would cause spaCy to be downloaded, which "
            "probably isn't what you want. If the pipeline package has other "
            "dependencies, you'll have to install them manually."
        )
        pip_args = pip_args + ("--no-deps",)
    suffix = SDIST_SUFFIX if sdist else WHEEL_SUFFIX
    dl_tpl = "{m}-{v}/{m}-{v}{s}#egg={m}=={v}"
    if direct:
        # The version is whatever follows the last hyphen.
        components = model.split("-")
        model_name = "".join(components[:-1])
        version = components[-1]
        download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
    else:
        model_name = model
        if model in OLD_MODEL_SHORTCUTS:
            # A space was missing between the two fragments ("Pleaseuse").
            msg.warn(
                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
                f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
            )
            model_name = OLD_MODEL_SHORTCUTS[model]
        compatibility = get_compatibility()
        version = get_version(model_name, compatibility)
        download_model(dl_tpl.format(m=model_name, v=version, s=suffix), pip_args)
    msg.good(
        "Download and installation successful",
        f"You can now load the package via spacy.load('{model_name}')",
    )
def main(in_file, model_file=None, config_file=None, spacy_model=None):
    """Evaluate a CRF entity tagger loaded from disk on a dev dataset.

    in_file: File with the dev examples.
    model_file: Path of the stored CRF model; defaults to "model.pkl".
    config_file: Optional JSON file with the component config.
    spacy_model: Optional spaCy model used for tokenization; a blank "en"
        pipeline is used otherwise.
    """
    component_config = None
    if config_file:
        msg.info(f"Loading config: {config_file}")
        component_config = srsly.read_json(config_file)

    if not model_file:
        model_file = "model.pkl"
    msg.info("Loading model from file", model_file)
    extractor = CRFExtractor(component_config=component_config)
    crf_extractor = extractor.from_disk(model_file)
    msg.good("Successfully loaded CRF tagger", crf_extractor)

    msg.info("Loading dev dataset from file", in_file)
    dev_examples = read_file(in_file)
    msg.good(f"Successfully loaded {len(dev_examples)} dev examples.")

    if spacy_model is None:
        nlp = spacy.blank("en")
        msg.info("Using spaCy blank: 'en'")
    else:
        nlp = spacy.load(spacy_model)
        msg.info(f"Using spaCy model: {spacy_model}")

    tokenizer = SpacyTokenizer(nlp=nlp)
    use_dense_features = crf_extractor.use_dense_features()
    dev_crf_examples = []
    for example in dev_examples:
        crf_tokens = gold_example_to_crf_tokens(
            example, tokenizer=tokenizer, use_dense_features=use_dense_features
        )
        dev_crf_examples.append(crf_tokens)

    f1_score, classification_report = crf_extractor.eval(dev_crf_examples)
    msg.warn(f"f1 score: {f1_score}")
    print(classification_report)
def check_workflows(workflows: List[str], workflow: Optional[str] = None) -> None:
    """Validate workflows provided in project.yml and check that a given
    workflow can be used to generate a DVC config.

    workflows (List[str]): Names of the available workflows.
    workflow (Optional[str]): The name of the workflow to convert.
    """
    if not workflows:
        msg.fail(
            f"No workflows defined in {PROJECT_FILE}. To generate a DVC config, "
            f"define at least one list of commands.",
            exits=1,
        )
    is_unknown = workflow is not None and workflow not in workflows
    if is_unknown:
        msg.fail(
            f"Workflow '{workflow}' not defined in {PROJECT_FILE}. "
            f"Available workflows: {', '.join(workflows)}",
            exits=1,
        )
    if workflow:
        return
    # Nothing was requested: tell the user which workflow is the fallback.
    msg.warn(
        f"No workflow specified for DVC pipeline. Using the first workflow "
        f"defined in {PROJECT_FILE}: '{workflows[0]}'"
    )
def eval_dataset(set_id):
    """Output summary about user agreement with the model.

    An accepted example counts as agreement when the chosen option has a
    higher score than the other option. Rejections and disagreements whose
    "confidence" exceeds 0.8 are counted as high-confidence disagreements.
    Exits with status 1 if nothing was accepted or rejected.
    """
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    high_conf = 0.8
    agree_count = 0
    disagree_high_conf = sum(
        1 for eg in rejected if eg["confidence"] > high_conf)
    for eg in accepted:
        choice = eg["accept"][0]
        # Score of the selected option vs. the first other option.
        score_choice = [
            o["score"] for o in eg["options"] if o["id"] == choice
        ][0]
        score_other = [
            o["score"] for o in eg["options"] if o["id"] != choice
        ][0]
        if score_choice > score_other:
            agree_count += 1
        elif eg["confidence"] > high_conf:
            disagree_high_conf += 1
    # Use one denominator for both the ratio and the percentage. The text
    # used to show len(data), which also counts ignored examples, next to a
    # percentage computed from accepted + rejected.
    judged_count = len(accepted) + len(rejected)
    pc = agree_count / judged_count
    text = f"You agreed {agree_count} / {judged_count} times ({pc:.0%})"
    msg.info(f"Evaluating data from '{set_id}'")
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
    msg.text(
        f"You disagreed on {disagree_high_conf} high confidence scores")
    msg.text(f"You rejected {len(rejected)} suggestions as not similar")
def eval_dataset(set_id):
    """Report how many of the offered options were accepted in `set_id`.

    Across the accepted examples, the number of selected options is compared
    with the number of options shown. Exits with status 1 if nothing was
    accepted or rejected.
    """
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    total_count = sum(len(eg.get("options", [])) for eg in accepted)
    agree_count = sum(len(eg.get("accept", [])) for eg in accepted)
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(
        f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    # total_count is 0 when only rejections were collected, or when accepted
    # examples carry no options; report 0% instead of dividing by zero.
    pc = agree_count / total_count if total_count else 0.0
    text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    replace_components=False,
    vectors=None,
    width=96,
    conv_depth=4,
    cnn_window=1,
    cnn_pieces=3,
    bilstm_depth=0,
    embed_rows=2000,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    tag_map_path=None,
    omit_extra_lookups=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.

    The function checks the input paths, builds the pipeline (from
    `base_model` if given, otherwise from a blank `lang` class) and runs up to
    `n_iter` training iterations. After each iteration the model is saved to
    `output_path / "model<i>"` and evaluated on the development data once per
    beam width, writing `accuracy.json` and `meta.json` next to it. The
    `finally` block always saves `model-final` and calls
    `_collate_best_model`.

    lang: Language of the model. Must equal the base model's `lang`, if a
        base model is used; otherwise used to look up the language class.
    output_path: Output directory. Created if it does not exist.
    train_path: Location of the training data. Must exist.
    dev_path: Location of the development data. Must exist.
    raw_text: Optional JSONL file. The "text" value of its records is used
        for rehearsal updates after each training batch.
    base_model: Optional model to load and update instead of a blank one.
    pipeline: Comma-separated names of the components to train.
    replace_components: Replace components that already exist in the base
        model instead of extending them.
    vectors: Optional value passed to `_load_vectors`.
    width, conv_depth, cnn_window, cnn_pieces, bilstm_depth, embed_rows:
        Passed to `nlp.begin_training` as `token_vector_width`, `conv_depth`,
        `conv_window`, `cnn_maxout_pieces`, `bilstm_depth` and `embed_size`.
        Not used when training resumes from a base model without added or
        replaced components.
    n_iter: Number of training iterations.
    n_early_stopping: If not None, stop once this many consecutive iterations
        scored below the best `_score_for_model(meta)` value seen so far.
    n_examples: Passed to `GoldCorpus` as `limit`.
    use_gpu: Device id. Values >= 0 are passed to `set_gpu`; if that does not
        return a device, the value is reset to -1.
    version: Default for the "version" entry of the model meta.
    meta_path: Optional meta.json to use as the initial model meta.
    init_tok2vec: Optional value passed to `_load_pretrained_tok2vec`.
    parser_multitasks: Comma-separated multitask objectives for the parser.
    entity_multitasks: Comma-separated multitask objectives for the NER.
    noise_level, orth_variant_level, gold_preproc: Passed to
        `corpus.train_docs` (`gold_preproc` also to `corpus.dev_docs`).
    eval_beam_widths: Comma-separated integers. Beam width 1 is always
        evaluated in addition.
    learn_tokens: "learn_tokens" setting of the parser config.
    textcat_multilabel: If False, the text classifier is configured with
        exclusive classes. May be switched on automatically, see below.
    textcat_arch: "architecture" setting of the textcat config.
    textcat_positive_label: "positive_label" setting of the textcat config.
    tag_map_path: Optional JSON file whose content replaces the tag map.
    omit_extra_lookups: Create empty extra lexeme tables on the vocab.
    verbose: Passed to `util.set_env_log` and to `evaluate`.
    debug: Not referenced in this function body.
    """
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    # Only sub-directories count as "not empty" here; plain files are ignored.
    if output_path.exists() and [
        p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()
        msg.good("Created output directory: {}".format(output_path))

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    # NOTE(review): the defaults below (dropout 0.2 -> 0.2 with decay 0.0,
    # batch size 100.0 -> 1000.0) do not match that description -- confirm.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    # Beam width 1 is always part of the evaluation; has_beam_widths records
    # whether any other width was requested.
    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    disabled_pipes = None
    pipes_added = False
    msg.text("Training pipeline: {}".format(pipeline))
    if use_gpu >= 0:
        activated_gpu = None
        try:
            activated_gpu = set_gpu(use_gpu)
        except Exception as e:
            msg.warn("Exception: {}".format(e))
        if activated_gpu is not None:
            msg.text("Using GPU: {}".format(use_gpu))
        else:
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
            msg.text("Using CPU only")
            use_gpu = -1
    # Components that are kept from the base model and extended.
    base_components = []
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        for pipe in pipeline:
            pipe_cfg = {}
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            if pipe not in nlp.pipe_names:
                msg.text("Adding component to base model: '{}'".format(pipe))
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            elif replace_components:
                msg.text(
                    "Replacing component from base model '{}'".format(pipe))
                nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg))
                pipes_added = True
            else:
                # An existing textcat can only be extended if its stored
                # config equals the one requested for this run.
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    if base_cfg != pipe_cfg:
                        # NOTE(review): the first two literals are joined
                        # without a space, so the message reads "doesnot".
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
                msg.text(
                    "Extending component from base model '{}'".format(pipe))
                base_components.append(pipe)
        # Components of the base model that are not trained are disabled
        # here and restored in the `finally` block below.
        disabled_pipes = nlp.disable_pipes(
            [p for p in nlp.pipe_names if p not in pipeline])
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if tag_map_path is not None:
        tag_map = srsly.read_json(tag_map_path)
        # Replace tag map with provided mapping
        nlp.vocab.morphology.load_tag_map(tag_map)

    # Create empty extra lexeme tables so the data from spacy-lookups-data
    # isn't loaded if these features are accessed
    if omit_extra_lookups:
        nlp.vocab.lookups_extra = Lookups()
        nlp.vocab.lookups_extra.add_table("lexeme_cluster")
        nlp.vocab.lookups_extra.add_table("lexeme_prob")
        nlp.vocab.lookups_extra.add_table("lexeme_settings")

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                # NOTE(review): unlike the other fail calls in this function,
                # no `exits` is passed, so execution continues with
                # `nlp.get_pipe(pipe_name)` below -- confirm this is intended.
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
        optimizer = nlp.resume_training(device=use_gpu)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
        cfg["conv_depth"] = conv_depth
        cfg["token_vector_width"] = width
        cfg["bilstm_depth"] = bilstm_depth
        cfg["cnn_maxout_pieces"] = cnn_pieces
        cfg["embed_size"] = embed_rows
        cfg["conv_window"] = cnn_window
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", [])
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            # Warn if no example has a number of positive labels other than
            # one, i.e. the data looks mutually exclusive.
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            # The first example without exactly one positive label switches
            # training to multilabel mode (only without a base model) and
            # ends the scan, so train_labels may then be incomplete.
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        # Tell the user which textcat evaluation score will be reported.
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat-positive-label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            # Accumulated below but not read anywhere else in this function.
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    try:
                        nlp.update(
                            docs,
                            golds,
                            sgd=optimizer,
                            drop=next(dropout_rates),
                            losses=losses,
                        )
                    except ValueError as e:
                        err = "Error during training"
                        if init_tok2vec:
                            err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
                        msg.fail(err, "Original error message: {}".format(e), exits=1)
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    # The progress bar is not advanced when the LOG_FRIENDLY
                    # environment variable is set to a non-zero integer.
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                # Evaluation runs on the model as reloaded from disk.
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        ))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        # Only evaluate on CPU in the first iteration (for
                        # timing) if GPU is enabled
                        # In later iterations cpu_wps keeps the value that
                        # was measured in iteration 0.
                        if i == 0:
                            with Model.use_device("cpu"):
                                nlp_loaded = util.load_model_from_path(
                                    epoch_model_path)
                                for name, component in nlp_loaded.pipeline:
                                    if hasattr(component, "cfg"):
                                        component.cfg[
                                            "beam_width"] = beam_width
                                dev_docs = list(
                                    corpus.dev_docs(
                                        nlp_loaded,
                                        gold_preproc=gold_preproc,
                                        ignore_misaligned=True,
                                    ))
                                start_time = timer()
                                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                                end_time = timer()
                                cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    # Results for beam width 1 go to "speed"/"accuracy",
                    # those for other widths to "beam_speed"/"beam_accuracy".
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta.setdefault("accuracy", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["accuracy"][metric] = scorer.scores[
                                    metric]
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        for component in nlp.pipe_names:
                            for metric in _get_metrics(component):
                                meta["beam_accuracy"][metric] = scorer.scores[
                                    metric]
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get(
                            "textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
                # Checked once per iteration, after all beam widths were
                # evaluated; `break` leaves the iteration loop.
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        iter_current = i + 1
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(iter_current - iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score, current_score))
                        break
    except Exception as e:
        msg.warn(
            "Aborting and saving the final best model. "
            "Encountered exception: {}".format(e),
            exits=1,
        )
    finally:
        # Remember the trained components before the disabled ones are
        # restored; this list is what `_collate_best_model` receives.
        best_pipes = nlp.pipe_names
        if disabled_pipes:
            disabled_pipes.restore()
            meta["pipeline"] = nlp.pipe_names
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
            srsly.write_json(final_model_path / "meta.json", meta)

            # Read back the meta.json that was written just above.
            meta_loc = output_path / "model-final" / "meta.json"
            final_meta = srsly.read_json(meta_loc)
            final_meta.setdefault("accuracy", {})
            final_meta["accuracy"].update(meta.get("accuracy", {}))
            final_meta.setdefault("speed", {})
            final_meta["speed"].setdefault("cpu", None)
            final_meta["speed"].setdefault("gpu", None)
            meta.setdefault("speed", {})
            meta["speed"].setdefault("cpu", None)
            meta["speed"].setdefault("gpu", None)
            # combine cpu and gpu speeds with the base model speeds
            if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
                final_meta["speed"]["cpu"] = speed
            if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
                speed = _get_total_speed(
                    [final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
                final_meta["speed"]["gpu"] = speed
            # if there were no speeds to update, overwrite with meta
            if (final_meta["speed"]["cpu"] is None
                    and final_meta["speed"]["gpu"] is None):
                final_meta["speed"].update(meta["speed"])
            # note: beam speeds are not combined with the base model
            if has_beam_widths:
                final_meta.setdefault("beam_accuracy", {})
                final_meta["beam_accuracy"].update(
                    meta.get("beam_accuracy", {}))
                final_meta.setdefault("beam_speed", {})
                final_meta["beam_speed"].update(meta.get("beam_speed", {}))
            srsly.write_json(meta_loc, final_meta)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(final_meta, output_path, best_pipes)
        msg.good("Created best model", best_model_path)
def project_assets(
    project_dir: Path,
    *,
    overrides: Dict[str, Any] = SimpleFrozenDict(),
    sparse_checkout: bool = False,
    extra: bool = False,
) -> None:
    """Fetch the assets listed in the project config.

    Assets with a "git" entry are checked out from the given repository,
    assets with a "url" are fetched, and assets with neither are only checked
    for presence.

    project_dir (Path): Path to project directory.
    overrides (Dict[str, Any]): Passed on when loading the project config.
    sparse_checkout (bool): Use sparse checkout for assets provided via Git,
        to only check out and clone the files needed.
    extra (bool): Whether to download all assets, including those marked as
        'extra'.
    """
    project_path = ensure_path(project_dir)
    config = load_project_config(project_path, overrides=overrides)

    # Assets marked as extra are only included on request.
    assets = []
    for entry in config.get("assets", []):
        if extra or not entry.get("extra", EXTRA_DEFAULT):
            assets.append(entry)
    if not assets:
        msg.warn(
            f"No assets specified in {PROJECT_FILE} (if assets are marked as extra, download them with --extra)",
            exits=0,
        )
    msg.info(f"Fetching {len(assets)} asset(s)")

    for asset in assets:
        dest = (project_dir / asset["dest"]).resolve()
        checksum = asset.get("checksum")

        if "git" not in asset:
            url = asset.get("url")
            if url:
                fetch_asset(project_path, url, dest, checksum)
            else:
                # project.yml defines asset without URL that the user has to place
                check_private_asset(dest, checksum)
            continue

        git_err = (
            f"Cloning spaCy project templates requires Git and the 'git' command. "
            f"Make sure it's installed and that the executable is available."
        )
        get_git_version(error=git_err)
        if dest.exists():
            # Keep an existing copy whose checksum matches; otherwise remove
            # it so that it can be checked out again.
            if checksum and checksum == get_checksum(dest):
                msg.good(
                    f"Skipping download with matching checksum: {asset['dest']}"
                )
                continue
            if dest.is_dir():
                shutil.rmtree(dest)
            else:
                dest.unlink()

        git_info = asset["git"]
        if "repo" not in git_info or git_info["repo"] is None:
            msg.fail(
                "A git asset must include 'repo', the repository address.", exits=1
            )
        if "path" not in git_info or git_info["path"] is None:
            msg.fail(
                "A git asset must include 'path' - use \"\" to get the entire repository.",
                exits=1,
            )
        git_checkout(
            git_info["repo"],
            git_info["path"],
            dest,
            branch=git_info.get("branch"),
            sparse=sparse_checkout,
        )
        msg.good(f"Downloaded asset {dest}")
def main(model_path, out_dir, min_freq_ratio=0.0, min_distance=0.0, check_keys=''):
    """Filter the keys of a sense2vec model, or explain how keys would be filtered.

    Without `check_keys`, all keys reported by the four filters (sense,
    markdown / url, minority, redundant) are dropped and the remaining
    vectors and frequencies are saved as a new model in `out_dir`.
    With `check_keys`, nothing is saved: each given key is reported either
    with the first filter that would drop it or as kept.

    model_path: Location of the sense2vec model to load.
    out_dir: Directory the filtered model is written to.
    min_freq_ratio: Passed to `get_minority_keys`.
    min_distance: Passed to `get_redundant_keys`.
    check_keys: Comma-separated keys to check instead of filtering the model.
    """
    check_keys_list = []
    if check_keys:
        check_keys_list = [key.strip() for key in check_keys.split(',')]
    # Announce the load before it starts rather than after it has finished.
    msg.info("loading vectors")
    s2v = Sense2Vec().from_disk(model_path)
    output_path = Path(out_dir)
    vocab = dict(s2v.frequencies)
    vectors = {key: vector for key, vector in s2v}
    all_senses = s2v.senses
    msg.info("loaded vectors")

    if check_keys_list:
        # Filters in order of precedence: the first one containing a key
        # determines the reason that is reported for it.
        filters = (
            ('sense', get_blacklisted_sense_keys(vocab)),
            ('markdown / url', get_markdown_and_url_keys(vocab)),
            ('minority', get_minority_keys(vocab, min_freq_ratio)),
            ('redundant', get_redundant_keys(vocab, vectors, min_distance)),
        )
        blacklist = {}
        whitelist = []
        for k in check_keys_list:
            for reason, filtered_keys in filters:
                if k in filtered_keys:
                    blacklist[k] = reason
                    break
            else:
                whitelist.append(k)
        msg.warn('blacklist')
        for k, reason in blacklist.items():
            msg.warn("{k}: {v}".format(k=k, v=reason))
        msg.good('whitelist')
        for k in whitelist:
            msg.good(k)
        return

    if not vectors:
        # Without any vector the vector width cannot be determined.
        msg.fail("No vectors found in model", model_path, exits=1)
    vector_size = len(next(iter(vectors.values())))

    discarded = set()
    discarded.update(get_blacklisted_sense_keys(vocab))
    discarded.update(get_markdown_and_url_keys(vocab))
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    # Count the keys that are actually kept. Subtracting len(discarded) would
    # allocate too few rows whenever a discarded key has no vector.
    kept_keys = [key for key in vectors if key not in discarded]
    n_vectors = len(kept_keys)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key in kept_keys:
        s2v.add(key, vectors[key])
        if key in vocab:
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)