Example #1
from spacy.lang.en import English
from spacy.util import minibatch


def test_train_empty():
    """Test that training on an empty text does not throw errors."""
    train_data = [
        ("Who is Shaka Khan?", {
            "entities": [(7, 17, "PERSON")]
        }),
        ("", {
            "entities": []
        }),
    ]

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PERSON")
    nlp.add_pipe(ner, last=True)

    nlp.begin_training()
    for itn in range(2):
        losses = {}
        batches = minibatch(train_data)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                losses=losses,
            )
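A quick sanity check for the loop above is to run the freshly trained pipeline on one of the training texts. The lines below are a sketch (spaCy v2.x) that could be appended inside test_train_empty(); with only two iterations over two tiny examples the NER may well predict nothing, so the entity list is printed for inspection rather than asserted on.

    # Sketch: could be appended at the end of test_train_empty() above.
    # nlp.update() reports a loss per trainable pipe, keyed by pipe name.
    assert "ner" in losses
    doc = nlp("Who is Shaka Khan?")
    # Likely empty after such a tiny training run; printed only for inspection.
    print([(ent.text, ent.label_) for ent in doc.ents])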
Example #2
def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc,
         text_dev_loc, output_loc):
    nlp = load_model(spacy_model)
    vec_nlp = spacy.util.load_model(
        'spacy/data/en_core_web_lg/en_core_web_lg-2.0.0')
    nlp.vocab.vectors = vec_nlp.vocab.vectors
    # Touch each lexeme so the entries backing the vectors also exist in the
    # training model's vocab.
    for lex in vec_nlp.vocab:
        _ = nlp.vocab[lex.orth_]
    with open(conllu_train_loc) as conllu_file:
        with open(text_train_loc) as text_file:
            docs, golds = read_data(nlp,
                                    conllu_file,
                                    text_file,
                                    oracle_segments=False,
                                    raw_text=True,
                                    limit=None)
    print("Create parser")
    nlp.add_pipe(nlp.create_pipe('parser'))
    nlp.parser.add_multitask_objective('tag')
    nlp.parser.add_multitask_objective('sent_start')
    nlp.add_pipe(nlp.create_pipe('tagger'))
    for gold in golds:
        for tag in gold.tags:
            if tag is not None:
                nlp.tagger.add_label(tag)
    optimizer = nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
    # Replace labels that didn't make the frequency cutoff
    actions = set(nlp.parser.labels)
    label_set = {act.split('-')[1] for act in actions if '-' in act}
    for gold in golds:
        for i, label in enumerate(gold.labels):
            if label is not None and label not in label_set:
                gold.labels[i] = label.split('||')[0]
    n_train_words = sum(len(doc) for doc in docs)
    print(n_train_words)
    print("Begin training")
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    batch_sizes = spacy.util.compounding(
        spacy.util.env_opt('batch_from', 1), spacy.util.env_opt('batch_to', 8),
        spacy.util.env_opt('batch_compound', 1.001))
    for i in range(30):
        docs = refresh_docs(docs)
        batches = minibatch(list(zip(docs, golds)), size=batch_sizes)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            losses = {}
            for batch in batches:
                if not batch:
                    continue
                batch_docs, batch_gold = zip(*batch)

                nlp.update(batch_docs,
                           batch_gold,
                           sgd=optimizer,
                           drop=0.2,
                           losses=losses)
                pbar.update(sum(len(doc) for doc in batch_docs))

        with nlp.use_params(optimizer.averages):
            dev_docs, scorer = parse_dev_data(nlp,
                                              text_dev_loc,
                                              conllu_dev_loc,
                                              oracle_segments=False,
                                              joint_sbd=True)
            print_progress(i, losses, scorer)
            with open(output_loc, 'w') as file_:
                print_conllu(dev_docs, file_)
            dev_docs, scorer = parse_dev_data(nlp,
                                              text_dev_loc,
                                              conllu_dev_loc,
                                              oracle_segments=False,
                                              joint_sbd=False)
            print_progress(i, losses, scorer)
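The batch_sizes generator above comes from spacy.util.compounding, which yields a value that grows by a fixed factor each step and is clipped at the stop value; spacy.util.minibatch accepts such a generator, so each batch can be slightly larger than the last. A standalone sketch of that behaviour (spaCy v2.x; the numbers here are illustrative, the script itself reads them from env_opt):

from spacy.util import compounding, minibatch

# compounding(start, stop, factor) yields start, start * factor, start * factor**2, ...
# clipped so the values never pass `stop`.
sizes = compounding(1.0, 8.0, 1.5)
print([next(sizes) for _ in range(6)])

# minibatch() draws the next size before cutting each batch, so batches grow
# over the course of an epoch.
for batch in minibatch(list(range(20)), size=compounding(1.0, 8.0, 1.5)):
    print(len(batch), batch)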
Example #3
def train(model, train_data, dev_data, test_data, output_dir, n_iter,
          meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)

    original_tokenizer = nlp.tokenizer

    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):

        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs,
                               golds,
                               sgd=optimizer,
                               losses=losses,
                               drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i
    # save model to output directory
    best_model_path = Path(output_dir + "/" + "best")
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)), best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2,
                 dev_data,
                 dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2,
                 test_data,
                 dump_path=os.path.join(output_dir, "test_metrics.json"))
Example #4
def train(pretrained,
          output_dir,
          train_data,
          dev_data,
          n_iter=30,
          n_sents=0,
          parser_multitasks='',
          entity_multitasks='',
          use_gpu=-1,
          no_tagger=False,
          no_parser=False,
          no_entities=False,
          gold_preproc=False,
          version="0.0.0",
          meta_path=None,
          verbose=False):
    """
    Re-train a pre-trained model. Expects data in spaCy's JSON
    format. This code is based on
    https://github.com/explosion/spaCy/blob/master/spacy/cli/train.py.
    """
    # There is a bug that prevents me from using the GPU when resuming
    # training from a saved model. See
    # https://github.com/explosion/spaCy/issues/1806.
    if use_gpu >= 0:
        msg = "\nWARNING: using GPU may require re-installing thinc. "
        msg += "See https://github.com/explosion/spaCy/issues/1806.\n"
        print(msg)

    util.fix_random_seed()
    util.set_env_log(True)
    n_sents = n_sents or None
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title=Messages.M050, exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title=Messages.M051, exits=1)
    if meta_path is not None and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        prints(Messages.M053.format(meta_type=type(meta)),
               title=Messages.M052,
               exits=1)

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    max_doc_len = util.env_opt('max_doc_len', 5000)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()

    # Load pre-trained model. Remove components that we are not
    # re-training.
    nlp = load(pretrained)
    if no_tagger and 'tagger' in nlp.pipe_names:
        nlp.remove_pipe('tagger')
    if no_parser and 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    if no_entities and 'ner' in nlp.pipe_names:
        nlp.remove_pipe('ner')
    meta.setdefault('name', 'unnamed')
    meta['pipeline'] = nlp.pipe_names
    meta.setdefault('lang', nlp.lang)
    nlp.meta.update(meta)

    # Add multi-task objectives
    if parser_multitasks:
        for objective in parser_multitasks.split(','):
            nlp.parser.add_multitask_objective(objective)
    if entity_multitasks:
        for objective in entity_multitasks.split(','):
            nlp.entity.add_multitask_objective(objective)

    # Get optimizer
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None

    print(nlp.pipe_names)
    print(nlp.pipeline)

    print(
        "Itn.  Dep Loss  NER Loss  UAS     NER P.  NER R.  NER F.  Tag %   Token %  CPU WPS  GPU WPS"
    )
    try:
        train_docs = corpus.train_docs(nlp,
                                       projectivize=True,
                                       noise_level=0.0,
                                       gold_preproc=gold_preproc,
                                       max_length=0)
        train_docs = list(train_docs)
        for i in range(n_iter):
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    batch = [(d, g) for (d, g) in batch
                             if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs,
                               golds,
                               sgd=optimizer,
                               drop=next(dropout_rates),
                               losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(
                    corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs, verbose)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords / (end_time - start_time)
                else:
                    gpu_wps = nwords / (end_time - start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(
                            epoch_model_path)
                        dev_docs = list(
                            corpus.dev_docs(nlp_loaded,
                                            gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords / (end_time - start_time)
                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {
                    'nwords': nwords,
                    'cpu': cpu_wps,
                    'gpu': gpu_wps
                }
                meta['vectors'] = {
                    'width': nlp.vocab.vectors_length,
                    'vectors': len(nlp.vocab.vectors),
                    'keys': nlp.vocab.vectors.n_keys
                }
                meta['lang'] = nlp.lang
                meta['pipeline'] = nlp.pipe_names
                meta['spacy_version'] = '>=%s' % about.__version__
                meta.setdefault('name', 'model%d' % i)
                meta.setdefault('version', version)

                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i,
                           losses,
                           scorer.scores,
                           cpu_wps=cpu_wps,
                           gpu_wps=gpu_wps)
    finally:
        print("Saving model...")
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / 'model-final'
            nlp.to_disk(final_model_path)
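Once training finishes, the directory written to model-final can be loaded back like any other spaCy model path. A usage sketch (spaCy v2.x; the path below is hypothetical and mirrors final_model_path above):

import spacy

# Hypothetical location: <output_dir>/model-final, as written by train() above.
nlp = spacy.load("output_dir/model-final")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")
print([(t.text, t.tag_, t.dep_) for t in doc])
print([(ent.text, ent.label_) for ent in doc.ents])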