def log_step(info: Dict[str, Any]): try: losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) for pipe_name in nlp.pipe_names ] except KeyError as e: raise KeyError( Errors.E983.format( dict="scores (losses)", key=str(e), keys=list(info["losses"].keys()), )) from None try: scores = [ "{0:.2f}".format( float(info["other_scores"].get(col, 0.0)) * 100) for col in score_cols ] except KeyError as e: raise KeyError( Errors.E983.format( dict="scores (other)", key=str(e), keys=list(info["other_scores"].keys()), )) from None time = timedelta(seconds=info["seconds"]) data = ([str(time), info["epoch"], info["step"], info["words"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]) msg.row(data, widths=table_widths, aligns=table_aligns)
def main(n_hidden: int = 256, dropout: float = 0.2, n_iter: int = 10, batch_size: int = 128): # Define the model model: Model = chain( Relu(nO=n_hidden, dropout=dropout), Relu(nO=n_hidden, dropout=dropout), Softmax(), ) # Load the data (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist() # Set any missing shapes for the model. model.initialize(X=train_X[:5], Y=train_Y[:5]) train_data = model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True) dev_data = model.ops.multibatch(batch_size, dev_X, dev_Y) # Create the optimizer. optimizer = Adam(0.001) for i in range(n_iter): for X, Y in tqdm(train_data, leave=False): Yh, backprop = model.begin_update(X) backprop(Yh - Y) model.finish_update(optimizer) # Evaluate and print progress correct = 0 total = 0 for X, Y in dev_data: Yh = model.predict(X) correct += (Yh.argmax(axis=1) == Y.argmax(axis=1)).sum() total += Yh.shape[0] score = correct / total msg.row((i, f"{score:.3f}"), widths=(3, 5))
def train_model( model, train_path, eval_path, n_iter=10, output=None, tok2vec=None, ): """ Train a model from Prodigy annotations and optionally save out the best model to disk. """ spacy.util.fix_random_seed(0) with msg.loading(f"Loading '{model}'..."): if model.startswith("blank:"): nlp = spacy.blank(model.replace("blank:", "")) else: nlp = spacy.load(model) msg.good(f"Loaded model '{model}'") train_data, labels = format_data(srsly.read_jsonl(train_path)) eval_data, _ = format_data(srsly.read_jsonl(eval_path)) ner = nlp.create_pipe("ner") for label in labels: ner.add_label(label) nlp.add_pipe(ner) t2v_cfg = { "embed_rows": 10000, "token_vector_width": 128, "conv_depth": 8, "nr_feature_tokens": 3, } optimizer = nlp.begin_training( component_cfg={"ner": t2v_cfg} if tok2vec else {}) if tok2vec: _load_pretrained_tok2vec(nlp, Path(tok2vec)) batch_size = spacy.util.compounding(1.0, 16.0, 1.001) best_acc = 0 best_model = None row_widths = (2, 8, 8, 8, 8) msg.row(("#", "L", "P", "R", "F"), widths=row_widths) for i in range(n_iter): random.shuffle(train_data) losses = {} data = tqdm.tqdm(train_data, leave=False) for batch in spacy.util.minibatch(data, size=batch_size): texts, annots = zip(*batch) nlp.update(texts, annots, drop=0.2, losses=losses) with nlp.use_params(optimizer.averages): sc = nlp.evaluate(eval_data) if sc.ents_f > best_acc: best_acc = sc.ents_f if output: best_model = nlp.to_bytes() acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}") msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths) msg.text(f"Best F-Score: {best_acc:.3f}") if output and best_model: with msg.loading("Saving model..."): nlp.from_bytes(best_model).to_disk(output) msg.good("Saved model", output)
def train_model(model, train_path, eval_path, n_iter=10, output="./model2/", tok2vec=None): spacy.util.fix_random_seed(0) with msg.loading(f"Loading '{model}'..."): if model.startswith("blank:"): nlp = spacy.blank(model.replace("blank:", "")) else: nlp = spacy.load(model) msg.good(f"Loaded model '{model}'") train_data, labels = format_data(srsly.read_jsonl(train_path)) eval_data, _ = format_data(srsly.read_jsonl(eval_path)) if "textcat" not in nlp.pipe_names: textcat = nlp.create_pipe("textcat") nlp.add_pipe(textcat, last=True) else: textcat = nlp.get_pipe("textcat") for label in labels: textcat.add_label(label) optimizer = nlp.begin_training(component_cfg={"exclusive_classes": True}) batch_size = spacy.util.compounding(1.0, 16.0, 1.001) best_acc = 0 best_model = None row_widths = (2, 8, 8) msg.row(("#", "L", "F"), widths=row_widths) for i in range(n_iter): random.shuffle(train_data) losses = {} data = tqdm.tqdm(train_data, leave=False) for batch in spacy.util.minibatch(data, size=batch_size): #texts = [text for text, entities in batch] #annotations = [entities for text, entities in batch] texts, annotations = zip(*batch) nlp.update(texts, annotations, drop=0.2, losses=losses) with nlp.use_params(optimizer.averages): scorer = nlp.evaluate(eval_data) if scorer.textcat_score > best_acc: best_acc = scorer.textcat_score if output: best_model = nlp.to_bytes() acc = f"{scorer.textcat_score:.3f}" msg.row((i + 1, f"{losses['textcat']:.2f}", acc), widths=row_widths) msg.text(f"Best F-Score: {best_acc:.3f}") if output and best_model: with msg.loading("Saving model..."): nlp.from_bytes(best_model).to_disk(output) msg.good("Saved model", output)
def setup_printer( nlp: "Language", ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]: score_cols = list(nlp.config["training"]["score_weights"]) score_widths = [max(len(col), 6) for col in score_cols] loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names] loss_widths = [max(len(col), 8) for col in loss_cols] table_header = ["T", "E", "#", "W" ] + loss_cols + score_cols + ["Score"] table_header = [col.upper() for col in table_header] table_widths = [8, 3, 6, 6] + loss_widths + score_widths + [6] table_aligns = ["r" for _ in table_widths] msg.row(table_header, widths=table_widths) msg.row(["-" * width for width in table_widths]) def log_step(info: Dict[str, Any]): try: losses = [ "{0:.2f}".format(float(info["losses"][pipe_name])) for pipe_name in nlp.pipe_names ] except KeyError as e: raise KeyError( Errors.E983.format( dict="scores (losses)", key=str(e), keys=list(info["losses"].keys()), )) from None try: scores = [ "{0:.2f}".format( float(info["other_scores"].get(col, 0.0)) * 100) for col in score_cols ] except KeyError as e: raise KeyError( Errors.E983.format( dict="scores (other)", key=str(e), keys=list(info["other_scores"].keys()), )) from None time = timedelta(seconds=info["seconds"]) data = ([str(time), info["epoch"], info["step"], info["words"]] + losses + scores + ["{0:.2f}".format(float(info["score"]))]) msg.row(data, widths=table_widths, aligns=table_aligns) def finalize(): pass return log_step, finalize
def train( lang, output_path, train_path, dev_path, raw_text=None, base_model=None, pipeline="tagger,parser,ner", replace_components=False, vectors=None, width=96, conv_depth=4, cnn_window=1, cnn_pieces=3, bilstm_depth=0, embed_rows=2000, n_iter=30, n_early_stopping=None, n_examples=0, use_gpu=-1, version="0.0.0", meta_path=None, init_tok2vec=None, parser_multitasks="", entity_multitasks="", noise_level=0.0, orth_variant_level=0.0, eval_beam_widths="", gold_preproc=False, learn_tokens=False, textcat_multilabel=False, textcat_arch="bow", textcat_positive_label=None, tag_map_path=None, omit_extra_lookups=False, verbose=False, debug=False, ): """ Train or update a spaCy model. Requires data to be formatted in spaCy's JSON format. To convert data from other formats, use the `spacy convert` command. """ util.fix_random_seed() util.set_env_log(verbose) # Make sure all files and paths exists if they are needed train_path = util.ensure_path(train_path) dev_path = util.ensure_path(dev_path) meta_path = util.ensure_path(meta_path) output_path = util.ensure_path(output_path) if raw_text is not None: raw_text = list(srsly.read_jsonl(raw_text)) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) if meta_path is not None and not meta_path.exists(): msg.fail("Can't find model meta.json", meta_path, exits=1) meta = srsly.read_json(meta_path) if meta_path else {} if output_path.exists() and [ p for p in output_path.iterdir() if p.is_dir() ]: msg.warn( "Output directory is not empty", "This can lead to unintended side effects when saving the model. " "Please use an empty directory or a different path instead. If " "the specified output path doesn't exist, the directory will be " "created for you.", ) if not output_path.exists(): output_path.mkdir() msg.good("Created output directory: {}".format(output_path)) # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. # Batch size starts at 1 and grows, so that we make updates quickly # at the beginning of training. dropout_rates = util.decaying( util.env_opt("dropout_from", 0.2), util.env_opt("dropout_to", 0.2), util.env_opt("dropout_decay", 0.0), ) batch_sizes = util.compounding( util.env_opt("batch_from", 100.0), util.env_opt("batch_to", 1000.0), util.env_opt("batch_compound", 1.001), ) if not eval_beam_widths: eval_beam_widths = [1] else: eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] if 1 not in eval_beam_widths: eval_beam_widths.append(1) eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] # Set up the base model and pipeline. If a base model is specified, load # the model and make sure the pipeline matches the pipeline setting. If # training starts from a blank model, intitalize the language class. pipeline = [p.strip() for p in pipeline.split(",")] disabled_pipes = None pipes_added = False msg.text("Training pipeline: {}".format(pipeline)) if use_gpu >= 0: activated_gpu = None try: activated_gpu = set_gpu(use_gpu) except Exception as e: msg.warn("Exception: {}".format(e)) if activated_gpu is not None: msg.text("Using GPU: {}".format(use_gpu)) else: msg.warn("Unable to activate GPU: {}".format(use_gpu)) msg.text("Using CPU only") use_gpu = -1 base_components = [] if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) if nlp.lang != lang: msg.fail( "Model language ('{}') doesn't match language specified as " "`lang` argument ('{}') ".format(nlp.lang, lang), exits=1, ) for pipe in pipeline: pipe_cfg = {} if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} elif pipe == "textcat": pipe_cfg = { "exclusive_classes": not textcat_multilabel, "architecture": textcat_arch, "positive_label": textcat_positive_label, } if pipe not in nlp.pipe_names: msg.text("Adding component to base model: '{}'".format(pipe)) nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) pipes_added = True elif replace_components: msg.text( "Replacing component from base model '{}'".format(pipe)) nlp.replace_pipe(pipe, nlp.create_pipe(pipe, config=pipe_cfg)) pipes_added = True else: if pipe == "textcat": textcat_cfg = nlp.get_pipe("textcat").cfg base_cfg = { "exclusive_classes": textcat_cfg["exclusive_classes"], "architecture": textcat_cfg["architecture"], "positive_label": textcat_cfg["positive_label"], } if base_cfg != pipe_cfg: msg.fail( "The base textcat model configuration does" "not match the provided training options. " "Existing cfg: {}, provided cfg: {}".format( base_cfg, pipe_cfg), exits=1, ) msg.text( "Extending component from base model '{}'".format(pipe)) base_components.append(pipe) disabled_pipes = nlp.disable_pipes( [p for p in nlp.pipe_names if p not in pipeline]) else: msg.text("Starting with blank model '{}'".format(lang)) lang_cls = util.get_lang_class(lang) nlp = lang_cls() for pipe in pipeline: if pipe == "parser": pipe_cfg = {"learn_tokens": learn_tokens} elif pipe == "textcat": pipe_cfg = { "exclusive_classes": not textcat_multilabel, "architecture": textcat_arch, "positive_label": textcat_positive_label, } else: pipe_cfg = {} nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) if tag_map_path is not None: tag_map = srsly.read_json(tag_map_path) # Replace tag map with provided mapping nlp.vocab.morphology.load_tag_map(tag_map) # Create empty extra lexeme tables so the data from spacy-lookups-data # isn't loaded if these features are accessed if omit_extra_lookups: nlp.vocab.lookups_extra = Lookups() nlp.vocab.lookups_extra.add_table("lexeme_cluster") nlp.vocab.lookups_extra.add_table("lexeme_prob") nlp.vocab.lookups_extra.add_table("lexeme_settings") if vectors: msg.text("Loading vector from model '{}'".format(vectors)) _load_vectors(nlp, vectors) # Multitask objectives multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)] for pipe_name, multitasks in multitask_options: if multitasks: if pipe_name not in pipeline: msg.fail("Can't use multitask objective without '{}' in the " "pipeline".format(pipe_name)) pipe = nlp.get_pipe(pipe_name) for objective in multitasks.split(","): pipe.add_multitask_objective(objective) # Prepare training corpus msg.text("Counting training words (limit={})".format(n_examples)) corpus = GoldCorpus(train_path, dev_path, limit=n_examples) n_train_words = corpus.count_train() if base_model and not pipes_added: # Start with an existing model, use default optimizer optimizer = nlp.resume_training(device=use_gpu) else: # Start with a blank model, call begin_training cfg = {"device": use_gpu} cfg["conv_depth"] = conv_depth cfg["token_vector_width"] = width cfg["bilstm_depth"] = bilstm_depth cfg["cnn_maxout_pieces"] = cnn_pieces cfg["embed_size"] = embed_rows cfg["conv_window"] = cnn_window optimizer = nlp.begin_training(lambda: corpus.train_tuples, **cfg) nlp._optimizer = None # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Verify textcat config if "textcat" in pipeline: textcat_labels = nlp.get_pipe("textcat").cfg.get("labels", []) if textcat_positive_label and textcat_positive_label not in textcat_labels: msg.fail( "The textcat_positive_label (tpl) '{}' does not match any " "label in the training data.".format(textcat_positive_label), exits=1, ) if textcat_positive_label and len(textcat_labels) != 2: msg.fail( "A textcat_positive_label (tpl) '{}' was provided for training " "data that does not appear to be a binary classification " "problem with two labels.".format(textcat_positive_label), exits=1, ) train_docs = corpus.train_docs( nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0, ignore_misaligned=True, ) train_labels = set() if textcat_multilabel: multilabel_found = False for text, gold in train_docs: train_labels.update(gold.cats.keys()) if list(gold.cats.values()).count(1.0) != 1: multilabel_found = True if not multilabel_found and not base_model: msg.warn("The textcat training instances look like they have " "mutually-exclusive classes. Remove the flag " "'--textcat-multilabel' to train a classifier with " "mutually-exclusive classes.") if not textcat_multilabel: for text, gold in train_docs: train_labels.update(gold.cats.keys()) if list(gold.cats.values()).count(1.0) != 1 and not base_model: msg.warn( "Some textcat training instances do not have exactly " "one positive label. Modifying training options to " "include the flag '--textcat-multilabel' for classes " "that are not mutually exclusive.") nlp.get_pipe("textcat").cfg["exclusive_classes"] = False textcat_multilabel = True break if base_model and set(textcat_labels) != train_labels: msg.fail( "Cannot extend textcat model using data with different " "labels. Base model labels: {}, training data labels: " "{}.".format(textcat_labels, list(train_labels)), exits=1, ) if textcat_multilabel: msg.text( "Textcat evaluation score: ROC AUC score macro-averaged across " "the labels '{}'".format(", ".join(textcat_labels))) elif textcat_positive_label and len(textcat_labels) == 2: msg.text("Textcat evaluation score: F1-score for the " "label '{}'".format(textcat_positive_label)) elif len(textcat_labels) > 1: if len(textcat_labels) == 2: msg.warn( "If the textcat component is a binary classifier with " "exclusive classes, provide '--textcat-positive-label' for " "an evaluation on the positive class.") msg.text( "Textcat evaluation score: F1-score macro-averaged across " "the labels '{}'".format(", ".join(textcat_labels))) else: msg.fail( "Unsupported textcat configuration. Use `spacy debug-data` " "for more information.") # fmt: off row_head, output_stats = _configure_training_output( pipeline, use_gpu, has_beam_widths) row_widths = [len(w) for w in row_head] row_settings = { "widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2 } # fmt: on print("") msg.row(row_head, **row_settings) msg.row(["-" * width for width in row_settings["widths"]], **row_settings) try: iter_since_best = 0 best_score = 0.0 for i in range(n_iter): train_docs = corpus.train_docs( nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0, ignore_misaligned=True, ) if raw_text: random.shuffle(raw_text) raw_batches = util.minibatch( (nlp.make_doc(rt["text"]) for rt in raw_text), size=8) words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in util.minibatch_by_words(train_docs, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) try: nlp.update( docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses, ) except ValueError as e: err = "Error during training" if init_tok2vec: err += " Did you provide the same parameters during 'train' as during 'pretrain'?" msg.fail(err, "Original error message: {}".format(e), exits=1) if raw_text: # If raw text is available, perform 'rehearsal' updates, # which use unlabelled data to reduce overfitting. raw_batch = list(next(raw_batches)) nlp.rehearse(raw_batch, sgd=optimizer, losses=losses) if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ("model%d" % i) nlp.to_disk(epoch_model_path) nlp_loaded = util.load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, )) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() if use_gpu < 0: gpu_wps = None cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) # Only evaluate on CPU in the first iteration (for # timing) if GPU is enabled if i == 0: with Model.use_device("cpu"): nlp_loaded = util.load_model_from_path( epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg[ "beam_width"] = beam_width dev_docs = list( corpus.dev_docs( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, )) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_path / ("model%d" % i) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names meta["spacy_version"] = ">=%s" % about.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta.setdefault("accuracy", {}) for component in nlp.pipe_names: for metric in _get_metrics(component): meta["accuracy"][metric] = scorer.scores[ metric] else: meta.setdefault("beam_accuracy", {}) meta.setdefault("beam_speed", {}) for component in nlp.pipe_names: for metric in _get_metrics(component): meta["beam_accuracy"][metric] = scorer.scores[ metric] meta["beam_speed"][beam_width] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } meta.setdefault("name", "model%d" % i) meta.setdefault("version", version) meta["labels"] = nlp.meta["labels"] meta_loc = output_path / ("model%d" % i) / "meta.json" srsly.write_json(meta_loc, meta) util.set_env_log(verbose) progress = _get_progress( i, losses, scorer.scores, output_stats, beam_width=beam_width if has_beam_widths else None, cpu_wps=cpu_wps, gpu_wps=gpu_wps, ) if i == 0 and "textcat" in pipeline: textcats_per_cat = scorer.scores.get( "textcats_per_cat", {}) for cat, cat_score in textcats_per_cat.items(): if cat_score.get("roc_auc_score", 0) < 0: msg.warn( "Textcat ROC AUC score is undefined due to " "only one value in label '{}'.".format( cat)) msg.row(progress, **row_settings) # Early stopping if n_early_stopping is not None: current_score = _score_for_model(meta) if current_score < best_score: iter_since_best += 1 else: iter_since_best = 0 best_score = current_score if iter_since_best >= n_early_stopping: iter_current = i + 1 msg.text("Early stopping, best iteration " "is: {}".format(iter_current - iter_since_best)) msg.text("Best score = {}; Final iteration " "score = {}".format(best_score, current_score)) break except Exception as e: msg.warn( "Aborting and saving the final best model. " "Encountered exception: {}".format(e), exits=1, ) finally: best_pipes = nlp.pipe_names if disabled_pipes: disabled_pipes.restore() meta["pipeline"] = nlp.pipe_names with nlp.use_params(optimizer.averages): final_model_path = output_path / "model-final" nlp.to_disk(final_model_path) srsly.write_json(final_model_path / "meta.json", meta) meta_loc = output_path / "model-final" / "meta.json" final_meta = srsly.read_json(meta_loc) final_meta.setdefault("accuracy", {}) final_meta["accuracy"].update(meta.get("accuracy", {})) final_meta.setdefault("speed", {}) final_meta["speed"].setdefault("cpu", None) final_meta["speed"].setdefault("gpu", None) meta.setdefault("speed", {}) meta["speed"].setdefault("cpu", None) meta["speed"].setdefault("gpu", None) # combine cpu and gpu speeds with the base model speeds if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]: speed = _get_total_speed( [final_meta["speed"]["cpu"], meta["speed"]["cpu"]]) final_meta["speed"]["cpu"] = speed if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]: speed = _get_total_speed( [final_meta["speed"]["gpu"], meta["speed"]["gpu"]]) final_meta["speed"]["gpu"] = speed # if there were no speeds to update, overwrite with meta if (final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None): final_meta["speed"].update(meta["speed"]) # note: beam speeds are not combined with the base model if has_beam_widths: final_meta.setdefault("beam_accuracy", {}) final_meta["beam_accuracy"].update( meta.get("beam_accuracy", {})) final_meta.setdefault("beam_speed", {}) final_meta["beam_speed"].update(meta.get("beam_speed", {})) srsly.write_json(meta_loc, final_meta) msg.good("Saved model to output directory", final_model_path) with msg.loading("Creating best model..."): best_model_path = _collate_best_model(final_meta, output_path, best_pipes) msg.good("Created best model", best_model_path)
def pretrain( texts_loc, vectors_model, output_dir, width=96, conv_depth=4, bilstm_depth=0, cnn_pieces=3, sa_depth=0, use_chars=False, cnn_window=1, embed_rows=2000, loss_func="cosine", use_vectors=False, dropout=0.2, n_iter=1000, batch_size=3000, max_length=500, min_length=5, seed=0, n_save_every=None, init_tok2vec=None, epoch_start=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, using an approximate language-modelling objective. Specifically, we load pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which match the pretrained ones. The weights are saved to a directory after each epoch. You can then pass a path to one of these pretrained weights files to the 'spacy train' command. This technique may be especially helpful if you have little labelled data. However, it's still quite experimental, so your mileage may vary. To load the weights back in during 'spacy train', you need to ensure all settings are the same between pretraining and training. The API and errors around this need some improvement. """ config = dict(locals()) for key in config: if isinstance(config[key], Path): config[key] = str(config[key]) util.fix_random_seed(seed) has_gpu = prefer_gpu() if has_gpu: import torch torch.set_default_tensor_type("torch.cuda.FloatTensor") msg.info("Using GPU" if has_gpu else "Not using GPU") output_dir = Path(output_dir) if output_dir.exists() and [p for p in output_dir.iterdir()]: msg.warn( "Output directory is not empty", "It is better to use an empty directory or refer to a new output path, " "then the new directory will be created for you.", ) if not output_dir.exists(): output_dir.mkdir() msg.good("Created output directory: {}".format(output_dir)) srsly.write_json(output_dir / "config.json", config) msg.good("Saved settings to config.json") # Load texts from file or stdin if texts_loc != "-": # reading from a file texts_loc = Path(texts_loc) if not texts_loc.exists(): msg.fail("Input text file doesn't exist", texts_loc, exits=1) with msg.loading("Loading input texts..."): texts = list(srsly.read_jsonl(texts_loc)) if not texts: msg.fail("Input file is empty", texts_loc, exits=1) msg.good("Loaded input texts") random.shuffle(texts) else: # reading from stdin msg.text("Reading input text from stdin...") texts = srsly.read_jsonl("-") with msg.loading("Loading model '{}'...".format(vectors_model)): nlp = util.load_model(vectors_model) msg.good("Loaded model '{}'".format(vectors_model)) pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name model = create_pretraining_model( nlp, Tok2Vec( width, embed_rows, conv_depth=conv_depth, pretrained_vectors=pretrained_vectors, bilstm_depth=bilstm_depth, # Requires PyTorch. Experimental. subword_features=not use_chars, # Set to False for Chinese etc cnn_maxout_pieces=cnn_pieces, # If set to 1, use Mish activation. ), ) # Load in pretrained weights if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Parse the epoch number from the given weight file model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) if model_name: # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' epoch_start = int(model_name.group(0)[5:][:-4]) + 1 else: if not epoch_start: msg.fail( "You have to use the '--epoch-start' argument when using a renamed weight file for " "'--init-tok2vec'", exits=True, ) elif epoch_start < 0: msg.fail( "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid" % epoch_start, exits=True, ) else: # Without '--init-tok2vec' the '--epoch-start' argument is ignored epoch_start = 0 optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker(frequency=10000) msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start) row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) def _save_model(epoch, is_temp=False): is_temp_str = ".temp" if is_temp else "" with model.use_params(optimizer.averages): with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open( "wb" ) as file_: file_.write(model.tok2vec.to_bytes()) log = { "nr_word": tracker.nr_word, "loss": tracker.loss, "epoch_loss": tracker.epoch_loss, "epoch": epoch, } with (output_dir / "log.jsonl").open("a") as file_: file_.write(srsly.json_dumps(log) + "\n") skip_counter = 0 for epoch in range(epoch_start, n_iter + epoch_start): for batch_id, batch in enumerate( util.minibatch_by_words(((text, None) for text in texts), size=batch_size) ): docs, count = make_docs( nlp, [text for (text, _) in batch], max_length=max_length, min_length=min_length, ) skip_counter += count loss = make_update( model, docs, optimizer, objective=loss_func, drop=dropout ) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7: break if n_save_every and (batch_id % n_save_every == 0): _save_model(epoch, is_temp=True) _save_model(epoch) tracker.epoch_loss = 0.0 if texts_loc != "-": # Reshuffle the texts if texts were loaded from a file random.shuffle(texts) if skip_counter > 0: msg.warn("Skipped {count} empty values".format(count=str(skip_counter))) msg.good("Successfully finished pretrain")
def main( model="./zh_vectors_web_ud_lg/model-final", new_model_name="zh_vectors_web_ud_clue_lg", output_dir="./zh_vectors_web_ud_clue_lg", train_path="./clue_spacy_train.jsonl", dev_path="./clue_spacy_dev.jsonl", meta_path="./meta.json", use_gpu=0, n_iter=50 ): import tqdm """Set up the pipeline and entity recognizer, and train the new entity.""" random.seed(0) if model is not None: nlp = spacy.load(model) # load existing spaCy model print("Loaded model '%s'" % model) # Add entity recognizer to model if it's not in the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy if "ner" not in nlp.pipe_names: ner = nlp.create_pipe("ner") nlp.add_pipe(ner) # otherwise, get it, so we can add labels to it else: ner = nlp.get_pipe("ner") for label in LABEL: if label not in ner.labels: ner.add_label(label) # add new entity label to entity recognizer train_path = ensure_path(train_path) dev_path = ensure_path(dev_path) if not train_path or not train_path.exists(): msg.fail("Training data not found", train_path, exits=1) if not dev_path or not dev_path.exists(): msg.fail("Development data not found", dev_path, exits=1) if output_dir.exists() and [p for p in output_dir.iterdir() if p.is_dir()]: msg.warn( "Output directory is not empty", "This can lead to unintended side effects when saving the model. " "Please use an empty directory or a different path instead. If " "the specified output path doesn't exist, the directory will be " "created for you.", ) if not output_dir.exists(): output_dir.mkdir() meta = srsly.read_json(meta_path) if meta_path else {} # Prepare training corpus msg.text("Counting training words (limit={})".format(0)) corpus = GoldCorpus(train_path, dev_path, limit=0) n_train_words = corpus.count_train() if model is None: optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu) else: optimizer = create_default_optimizer(Model.ops) # Todo: gpu train? dropout_rates = decaying( 0.2, 0.2, 0.0) batch_sizes = compounding( 100.0, 1000.0 , 1.001) # get names of other pipes to disable them during training pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"] other_pipes = [ pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions] # UnboundLocalError: local variable 'has_beam_widths' referenced before assignment # fmt: off eval_beam_widths="" if not eval_beam_widths: eval_beam_widths = [1] else: eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")] if 1 not in eval_beam_widths: eval_beam_widths.append(1) eval_beam_widths.sort() has_beam_widths = eval_beam_widths != [1] row_head, output_stats = _configure_training_output(["ner"], use_gpu, has_beam_widths) row_widths = [len(w) for w in row_head] row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2} # fmt: on print("") msg.row(row_head, **row_settings) msg.row(["-" * width for width in row_settings["widths"]], **row_settings) try: noise_level = 0.0 orth_variant_level = 0.0 gold_preproc = False verbose = False best_score = 0.0 with nlp.disable_pipes(*other_pipes): # only train NER for itn in range(n_iter): train_docs = corpus.train_docs( nlp, noise_level=noise_level, orth_variant_level=orth_variant_level, gold_preproc=gold_preproc, max_length=0, ignore_misaligned=True, ) words_seen = 0 with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in minibatch_by_words(train_docs, size=batch_sizes): if not batch: continue docs, golds = zip(*batch) nlp.update( docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses, ) if not int(os.environ.get("LOG_FRIENDLY", 0)): pbar.update(sum(len(doc) for doc in docs)) words_seen += sum(len(doc) for doc in docs) with nlp.use_params(optimizer.averages): set_env_log(False) epoch_model_path = output_dir / ("model%d" % itn) nlp.to_disk(epoch_model_path) nlp_loaded = load_model_from_path(epoch_model_path) for beam_width in eval_beam_widths: for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, ) ) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() if use_gpu < 0: gpu_wps = None cpu_wps = nwords / (end_time - start_time) else: gpu_wps = nwords / (end_time - start_time) with Model.use_device("cpu"): nlp_loaded = load_model_from_path(epoch_model_path) for name, component in nlp_loaded.pipeline: if hasattr(component, "cfg"): component.cfg["beam_width"] = beam_width dev_docs = list( corpus.dev_docs( nlp_loaded, gold_preproc=gold_preproc, ignore_misaligned=True, ) ) start_time = timer() scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose) end_time = timer() cpu_wps = nwords / (end_time - start_time) acc_loc = output_dir / ("model%d" % itn) / "accuracy.json" srsly.write_json(acc_loc, scorer.scores) # Update model meta.json meta["lang"] = nlp.lang meta["pipeline"] = nlp.pipe_names meta["spacy_version"] = ">=%s" % spacy.__version__ if beam_width == 1: meta["speed"] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["accuracy"] = scorer.scores else: meta.setdefault("beam_accuracy", {}) meta.setdefault("beam_speed", {}) meta["beam_accuracy"][beam_width] = scorer.scores meta["beam_speed"][beam_width] = { "nwords": nwords, "cpu": cpu_wps, "gpu": gpu_wps, } meta["vectors"] = { "width": nlp.vocab.vectors_length, "vectors": len(nlp.vocab.vectors), "keys": nlp.vocab.vectors.n_keys, "name": nlp.vocab.vectors.name, } meta.setdefault("name", "model%d" % itn) meta.setdefault("version", "0.0.1") meta["labels"] = nlp.meta["labels"] meta_loc = output_dir / ("model%d" % itn) / "meta.json" srsly.write_json(meta_loc, meta) set_env_log(verbose) progress = _get_progress( itn, losses, scorer.scores, output_stats, beam_width=beam_width if has_beam_widths else None, cpu_wps=cpu_wps, gpu_wps=gpu_wps, ) msg.row(progress, **row_settings) finally: with nlp.use_params(optimizer.averages): final_model_path = output_dir / "model-final" nlp.to_disk(final_model_path) msg.good("Saved model to output directory", final_model_path) meta["pipeline"] = nlp.pipe_names meta["labels"] = nlp.meta["labels"] meta["factories"] = nlp.meta["factories"] with msg.loading("Creating best model..."): best_model_path = _collate_best_model(meta, output_dir, nlp.pipe_names) msg.good("Created best model", best_model_path)