def test_train_empty():
    """Test that training an empty text does not throw errors."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("", {"entities": []}),
    ]

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PERSON")
    nlp.add_pipe(ner, last=True)
    nlp.begin_training()
    for itn in range(2):
        losses = {}
        batches = minibatch(train_data)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                losses=losses,
            )
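
# A minimal sketch of how the test above could be run standalone, assuming spaCy v2.x.
# English and minibatch are real spaCy v2 APIs (spacy.lang.en / spacy.util); in the
# original test suite these imports would sit at the top of the module rather than here.
from spacy.lang.en import English
from spacy.util import minibatch

if __name__ == "__main__":
    test_train_empty()
    print("test_train_empty passed")
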
def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev_loc,
         output_loc):
    nlp = load_model(spacy_model)
    vec_nlp = spacy.util.load_model(
        'spacy/data/en_core_web_lg/en_core_web_lg-2.0.0')
    nlp.vocab.vectors = vec_nlp.vocab.vectors
    for lex in vec_nlp.vocab:
        _ = nlp.vocab[lex.orth_]
    with open(conllu_train_loc) as conllu_file:
        with open(text_train_loc) as text_file:
            docs, golds = read_data(nlp, conllu_file, text_file,
                                    oracle_segments=False, raw_text=True,
                                    limit=None)
    print("Create parser")
    nlp.add_pipe(nlp.create_pipe('parser'))
    nlp.parser.add_multitask_objective('tag')
    nlp.parser.add_multitask_objective('sent_start')
    nlp.add_pipe(nlp.create_pipe('tagger'))
    for gold in golds:
        for tag in gold.tags:
            if tag is not None:
                nlp.tagger.add_label(tag)
    optimizer = nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
    # Replace labels that didn't make the frequency cutoff
    actions = set(nlp.parser.labels)
    label_set = set([act.split('-')[1] for act in actions if '-' in act])
    for gold in golds:
        for i, label in enumerate(gold.labels):
            if label is not None and label not in label_set:
                gold.labels[i] = label.split('||')[0]
    n_train_words = sum(len(doc) for doc in docs)
    print(n_train_words)
    print("Begin training")
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    batch_sizes = spacy.util.compounding(
        spacy.util.env_opt('batch_from', 1),
        spacy.util.env_opt('batch_to', 8),
        spacy.util.env_opt('batch_compound', 1.001))
    for i in range(30):
        docs = refresh_docs(docs)
        batches = minibatch(list(zip(docs, golds)), size=batch_sizes)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            losses = {}
            for batch in batches:
                if not batch:
                    continue
                batch_docs, batch_gold = zip(*batch)
                nlp.update(batch_docs, batch_gold, sgd=optimizer,
                           drop=0.2, losses=losses)
                pbar.update(sum(len(doc) for doc in batch_docs))
        with nlp.use_params(optimizer.averages):
            dev_docs, scorer = parse_dev_data(nlp, text_dev_loc, conllu_dev_loc,
                                              oracle_segments=False, joint_sbd=True)
            print_progress(i, losses, scorer)
            with open(output_loc, 'w') as file_:
                print_conllu(dev_docs, file_)
            dev_docs, scorer = parse_dev_data(nlp, text_dev_loc, conllu_dev_loc,
                                              oracle_segments=False, joint_sbd=False)
            print_progress(i, losses, scorer)
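
# A hypothetical invocation of main() above. It assumes the helper functions it calls
# (load_model, read_data, refresh_docs, parse_dev_data, print_conllu, print_progress,
# golds_to_gold_tuples) are defined elsewhere in the same script; every path below is
# a placeholder, not a real data location.
if __name__ == "__main__":
    main(
        spacy_model="en_core_web_sm",              # hypothetical base model
        conllu_train_loc="ud/en-ud-train.conllu",  # hypothetical CoNLL-U training file
        text_train_loc="ud/en-ud-train.txt",       # hypothetical raw training text
        conllu_dev_loc="ud/en-ud-dev.conllu",      # hypothetical CoNLL-U dev file
        text_dev_loc="ud/en-ud-dev.txt",           # hypothetical raw dev text
        output_loc="ud/en-ud-dev.pred.conllu",     # hypothetical output path
    )
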
def train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)

    original_tokenizer = nlp.tokenizer
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()

    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):
        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               losses=losses, drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
        print("Saved model to", output_dir_path)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i

    # save model to output directory
    best_model_path = Path(output_dir + "/" + "best")
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)), best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2, dev_data,
                 dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2, test_data,
                 dump_path=os.path.join(output_dir, "test_metrics.json"))
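
# The WhitespaceTokenizer referenced in train() above is not defined in this snippet.
# A minimal sketch, following the custom-tokenizer pattern from the spaCy v2 docs; the
# actual implementation in the original project may differ. It would normally be
# defined before the training function that uses it.
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # each token "owns" a trailing space in this simple scheme
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
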
def train(pretrained, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
          parser_multitasks='', entity_multitasks='', use_gpu=-1, no_tagger=False,
          no_parser=False, no_entities=False, gold_preproc=False, version="0.0.0",
          meta_path=None, verbose=False):
    """
    Re-train a pre-trained model. Expects data in spaCy's JSON format. This code
    is based on https://github.com/explosion/spaCy/blob/master/spacy/cli/train.py.
    """
    # There is a bug that prevents me from using the GPU when resuming
    # training from a saved model. See
    # https://github.com/explosion/spaCy/issues/1806.
    if use_gpu >= 0:
        msg = "\nWARNING: using GPU may require re-installing thinc. "
        msg += "See https://github.com/explosion/spaCy/issues/1806.\n"
        print(msg)

    util.fix_random_seed()
    util.set_env_log(True)
    n_sents = n_sents or None
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title=Messages.M050, exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title=Messages.M051, exits=1)
    if meta_path is not None and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        prints(Messages.M053.format(meta_type=type(meta)),
               title=Messages.M052, exits=1)

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    max_doc_len = util.env_opt('max_doc_len', 5000)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()

    # Load pre-trained model. Remove components that we are not
    # re-training.
    nlp = load(pretrained)
    if no_tagger and 'tagger' in nlp.pipe_names:
        nlp.remove_pipe('tagger')
    if no_parser and 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    if no_entities and 'ner' in nlp.pipe_names:
        nlp.remove_pipe('ner')

    meta.setdefault('name', 'unnamed')
    meta['pipeline'] = nlp.pipe_names
    meta.setdefault('lang', nlp.lang)
    nlp.meta.update(meta)

    # Add multi-task objectives
    if parser_multitasks:
        for objective in parser_multitasks.split(','):
            nlp.parser.add_multitask_objective(objective)
    if entity_multitasks:
        for objective in entity_multitasks.split(','):
            nlp.entity.add_multitask_objective(objective)

    # Get optimizer
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None

    print(nlp.pipe_names)
    print(nlp.pipeline)
    print("Itn.  Dep Loss  NER Loss  UAS  NER P.  NER R.  NER F.  "
          "Tag %  Token %  CPU WPS  GPU WPS")
    try:
        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
                                       gold_preproc=gold_preproc, max_length=0)
        train_docs = list(train_docs)
        for i in range(n_iter):
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(
                    corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs, verbose)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords / (end_time - start_time)
                else:
                    gpu_wps = nwords / (end_time - start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(epoch_model_path)
                        dev_docs = list(
                            corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords / (end_time - start_time)
                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
                                 'gpu': gpu_wps}
                meta['vectors'] = {'width': nlp.vocab.vectors_length,
                                   'vectors': len(nlp.vocab.vectors),
                                   'keys': nlp.vocab.vectors.n_keys}
                meta['lang'] = nlp.lang
                meta['pipeline'] = nlp.pipe_names
                meta['spacy_version'] = '>=%s' % about.__version__
                meta.setdefault('name', 'model%d' % i)
                meta.setdefault('version', version)
                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i, losses, scorer.scores, cpu_wps=cpu_wps,
                           gpu_wps=gpu_wps)
    finally:
        print("Saving model...")
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / 'model-final'
            nlp.to_disk(final_model_path)