Example No. 1
def main(use_gpu=False, nb_epoch=100):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb(limit=2000)
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))
    
    nlp = spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)

    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(train_X))]
    test_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(test_X))]

    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    n_sent = sum([len(list(sents)) for sents in train_X])
    print("%d sentences" % n_sent)

    model = build_model(2, width=128, conv_depth=2, depth=2,
                        train_X=train_X, train_y=train_y)
    with model.begin_training(train_X[:100], train_y[:100]) as (trainer, optimizer):
        epoch_loss = [0.]
        epoch_var = [0.]
        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], epoch_var[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.)
            epoch_var.append(0.)
        trainer.each_epoch.append(report_progress)
        batch_sizes = compounding(64, 64, 1.01)
        trainer.dropout = 0.3
        trainer.batch_size = int(next(batch_sizes))
        trainer.dropout_decay = 0.0
        trainer.nb_epoch = nb_epoch
        #optimizer.alpha = 0.1
        #optimizer.max_grad_norm = 10.0
        #optimizer.b1 = 0.0
        #optimizer.b2 = 0.0
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            losses = ((yh-y)**2.).sum(axis=1) / y.shape[0]
            epoch_var[-1] += losses.var()
            loss = losses.mean()
            backprop((yh-y)/yh.shape[0], optimizer)
            epoch_loss[-1] += loss
            trainer.batch_size = int(next(batch_sizes))
        with model.use_params(optimizer.averages):
            print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
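Nearly every snippet below drives the batch size through compounding and minibatch. A small illustration of how that schedule behaves, assuming spaCy v2 (both helpers live in spacy.util):

from spacy.util import compounding, minibatch  # spaCy v2 utilities

# compounding(start, stop, compound) yields start, start*compound, start*compound**2, ...
# capped at stop, so the batch size creeps upwards as training progresses.
sizes = compounding(4.0, 32.0, 1.001)
print([int(next(sizes)) for _ in range(5)])  # [4, 4, 4, 4, 4] -- slowly grows towards 32

# minibatch() consumes such a generator to decide how many examples go in each batch.
data = [("example %d" % i, {}) for i in range(20)]
print([len(batch) for batch in minibatch(data, size=compounding(4.0, 32.0, 1.001))])  # [4, 4, 4, 4, 4]

Note that compounding(64, 64, 1.01), as used in the example above, is effectively a constant batch size of 64, because the schedule is capped at its stop value.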
Example No. 2
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
Example No. 3
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)
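Example No. 3 calls a test_model helper that isn't defined in the snippet. A minimal sketch of what it might do (the sample sentences are placeholders):

def test_model(nlp):
    # Hypothetical helper: parse a couple of sample sentences and print the dependencies.
    texts = ["I like securities.", "They trade mortgage-backed securities."]
    for doc in nlp.pipe(texts):
        print(doc.text)
        print([(t.text, t.dep_, t.head.text) for t in doc])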
Example No. 4
def main(model_name, unlabelled_loc):
    n_iter = 10
    dropout = 0.2
    batch_size = 4
    nlp = spacy.load(model_name)
    nlp.get_pipe("ner").add_label(LABEL)
    raw_docs = list(read_raw_data(nlp, unlabelled_loc))
    optimizer = nlp.resume_training()
    # Avoid use of Adam when resuming training. I don't understand this well
    # yet, but I'm getting weird results from Adam. Try commenting out the
    # nlp.update(), and using Adam -- you'll find the models drift apart.
    # I guess Adam is losing precision, introducing gradient noise?
    optimizer.alpha = 0.1
    optimizer.b1 = 0.0
    optimizer.b2 = 0.0

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    sizes = compounding(1.0, 4.0, 1.001)
    with nlp.disable_pipes(*other_pipes):
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            random.shuffle(raw_docs)
            losses = {}
            r_losses = {}
            # batch up the examples using spaCy's minibatch
            raw_batches = minibatch(raw_docs, size=4)
            for batch in minibatch(TRAIN_DATA, size=sizes):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
                raw_batch = list(next(raw_batches))
                nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
            print("Losses", losses)
            print("R. Losses", r_losses)
    print(nlp.get_pipe('ner').model.unseen_classes)
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
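Example No. 4 mixes supervised nlp.update calls with nlp.rehearse on unlabelled text; LABEL, TRAIN_DATA and read_raw_data are assumed to be defined elsewhere. A plausible sketch of the raw-data reader:

def read_raw_data(nlp, path, limit=1000):
    # Hypothetical helper: stream plain-text lines as Doc objects for rehearsal updates.
    with open(path, encoding="utf8") as file_:
        for i, line in enumerate(file_):
            if limit and i >= limit:
                break
            text = line.strip()
            if text:
                yield nlp.make_doc(text)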
Example No. 5
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
    The `vocab` should be the one used during creation of the KB."""
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
    # Note that in a realistic application, an actual NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
    patterns = [{
        "label": "PERSON",
        "pattern": [{
            "LOWER": "russ"
        }, {
            "LOWER": "cochran"
        }]
    }]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"incl_prior": False}
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)

    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        annotation_clean = annotation
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id,
                          "from training because it is not in the KB.")
            annotation_clean["links"][offset] = new_dict
        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)
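The entity-linker example iterates over annotation["links"], so TRAIN_DATA presumably maps character offsets to candidate KB identifiers, each with a probability. An illustrative sketch (the QIDs are placeholders):

# Illustrative only: {"links": {(start_char, end_char): {kb_id: probability, ...}}}
TRAIN_DATA = [
    (
        "Russ Cochran published the comics.",
        {"links": {(0, 12): {"Q2146908": 1.0, "Q7381115": 0.0}}},
    ),
]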
Example No. 6
    def __train(self, level: int, n_iter=40):
        label = self.label(level)
        prepared_data = self.training_data.prepare_data(level, label)
        model_path = self.model_path(level)

        # Setup model path, verify access
        model_path.mkdir(parents=True, exist_ok=True)

        random.seed(0)
        nlp = spacy.blank('en')
        self.__console.info('Created blank model')

        # Add entity recognizer to model if it's not in the pipeline
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if 'ner' not in nlp.pipe_names:
            ner = nlp.create_pipe('ner')
            nlp.add_pipe(ner)
        # otherwise, get it, so we can add labels to it
        else:
            ner = nlp.get_pipe('ner')

        ner.add_label(label)

        optimizer = nlp.begin_training()

        move_names = list(ner.move_names)
        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
        ]
        # only train NER
        with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once",
                                    category=UserWarning,
                                    module='spacy')

            sizes = compounding(1.0, 4.0, 1.001)
            # batch up the examples using spaCy's minibatch
            with self.__console.timed(f'Training model {label}',
                                      'Trained model in {0:.3f}s'):
                for itn in range(n_iter):
                    random.shuffle(prepared_data)
                    batches = minibatch(prepared_data, size=sizes)
                    losses = {}
                    for batch in batches:
                        texts, annotations = zip(*batch)
                        nlp.update(texts,
                                   annotations,
                                   sgd=optimizer,
                                   drop=0.35,
                                   losses=losses)
                    self.__console.info("Losses", losses)

        # test the trained model
        test_text = random.choice(self.training_data.questions)
        doc = nlp(test_text)
        self.__console.info("Entities in '%s'" % test_text)
        for ent in doc.ents:
            self.__console.info(ent.label_, ent.text)

        # save model to output directory
        nlp.meta["name"] = label
        nlp.to_disk(model_path)
        self.__console.okay("Saved model to", model_path)

        # test the saved model
        self.__console.info("Loading from", model_path)
        nlp2 = spacy.load(model_path)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            self.__console.info(ent.label_, ent.text)

        self.model_qsize_path(level).write_text(
            str(len(self.training_data.questions)))
        return nlp
Example No. 7
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
n_iter = 40
with nlp.disable_pipes(*other_pipes):  # only train NER
    nlp.begin_training()
    for itn in range(n_iter):
        print("Iteration:", itn)
        random.shuffle(TR_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TR_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                losses=losses,
            )
        print("Losses", losses)
Example No. 8
def chunk_courses(
    model=None, new_model_name="chunk_course_info_model", output_dir=None, n_iter=30
):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    (
        TRAIN_DATA,
        LABELS,
    ) = get_data()  # should eventually pull from the DB instead of a file

    # Train a series of models
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)

    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    # Add labels
    for l in LABELS:
        ner.add_label(l)  # add new entity label to entity recognizer

    # Begin/Resume training
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)

        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    test_model(nlp)  ## TEST <------

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
Example No. 9
def main(model=None, new_model_name="animal", output_dir=None, n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
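Example No. 9 trains a new entity type, so LABEL and TRAIN_DATA are assumed to be module-level constants along these lines (offsets are character positions; the sentences are illustrative):

LABEL = "ANIMAL"  # hypothetical new entity type
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     {"entities": [(0, 6, LABEL)]}),
    ("Do they bite?", {"entities": []}),
    ("horses pretend to care about your feelings",
     {"entities": [(0, 6, LABEL)]}),
]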
Example No. 10
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat",
            config={
                "exclusive_classes": True,
                "architecture": "simple_cnn",
            }
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
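Examples No. 10 and No. 19 rely on load_data and evaluate helpers that aren't shown. Plausible implementations, reusing the thinc.extra.datasets IMDB loader seen in Example No. 1 (an assumption; the original helpers may differ):

import random

import thinc.extra.datasets


def load_data(limit=0, split=0.8):
    # Load the IMDB sentiment data and split off a development set.
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    if limit:
        train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])


def evaluate(tokenizer, textcat, texts, cats):
    # Compute precision/recall/F-score for the POSITIVE label on the dev set.
    docs = (tokenizer(text) for text in texts)
    tp = fp = fn = tn = 1e-8  # smoothed to avoid division by zero
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold or label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1.0
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}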
print("Using {} examples ({} training, {} evaluation)".format(
    n_texts, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats]))

#training
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        t1 = time.time()
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts,
                       annotations,
                       sgd=optimizer,
                       drop=0.2,
                       losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        t2 = time.time()
        time_taken = t2 - t1
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'.format(
            losses['textcat'], scores['textcat_p'], scores['textcat_r'],
            scores['textcat_f'], time_taken))
Example No. 12
def train_spacy_model(train_data,
                      test_data,
                      model,
                      output_dir=None,
                      n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    nlp = model

    ent_types = []
    for _, e in train_data:
        ee = [ent[2] for ent in e['entities']]
        ent_types += ee

    # Sanity-check that the entity offsets align with the tokenization:
    # misaligned spans show up as "-" in the BILUO tags.
    for text, ent in train_data:
        doc = nlp(text)
        entities = ent['entities']
        tags = biluo_tags_from_offsets(doc, entities)
        if "-" in tags:
            print("Misaligned entities:", text)
            print(entities, tags)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # begin_training() is called unconditionally here, so it resets and randomly
        # initializes the NER weights even when an existing, trained model is passed in
        nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print(f"{itn}:")
            print("\tLosses", losses)
            score = evaluate(nlp, test_data)
            if not os.path.isdir("models"):
                os.mkdir("models")
            nlp.to_disk(os.path.join("models", f"model_{itn}"))
            print("\t", score)
Example No. 13
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()


    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)


    #ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()
Example No. 14
def main(
    ud_dir,
    parses_dir,
    corpus,
    config=None,
    limit=0,
    gpu_device=-1,
    vectors_dir=None,
    use_oracle_segments=False,
):
    spacy.util.fix_random_seed()
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False

    if config is not None:
        config = Config.load(config, vectors_dir=vectors_dir)
    else:
        config = Config(vectors_dir=vectors_dir)
    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config, vectors=vectors_dir)

    docs, golds = read_data(
        nlp,
        paths.train.conllu.open(),
        paths.train.text.open(),
        max_doc_length=config.max_doc_length,
        limit=limit,
    )

    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)

    batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
    beam_prob = compounding(0.2, 0.8, 1.001)
    for i in range(config.nr_epoch):
        docs, golds = read_data(
            nlp,
            paths.train.conllu.open(),
            paths.train.text.open(),
            max_doc_length=config.max_doc_length,
            limit=limit,
            oracle_segments=use_oracle_segments,
            raw_text=not use_oracle_segments,
        )
        Xs = list(zip(docs, golds))
        random.shuffle(Xs)
        if config.batch_by_words:
            batches = minibatch_by_words(Xs, size=batch_sizes)
        else:
            batches = minibatch(Xs, size=batch_sizes)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                nlp.update(
                    batch_docs,
                    batch_gold,
                    sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
                )

        out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
        with nlp.use_params(optimizer.averages):
            if use_oracle_segments:
                parsed_docs, scores = evaluate(
                    nlp, paths.dev.conllu, paths.dev.conllu, out_path
                )
            else:
                parsed_docs, scores = evaluate(
                    nlp, paths.dev.text, paths.dev.conllu, out_path
                )
            print_progress(i, losses, scores)
Example No. 15
def main(model=None, output_dir=None, n_iter=1000):
    """Load the model, set up the pipeline and train the entity recognizer."""

    # Create blank Language class
    nlp = spacy.blank("en")

    # Create the built-in pipeline components and add them to the pipeline
    # Get it so we can add labels
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)

    # Add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # TEST THE TRAINED MODEL
    print('\nSo what did we detect?')

    # Manual test: try text that won't appear in the training data!
    doc = nlp(
        "I like snakes and other animals. Byt python is my favourive animal. Besides, I like to work with python programming language."
    )
    print(
        "Entities", [(ent.text, ent.label_) for ent in doc.ents]
    )  # Should mark the first 'python' as an ANIMAL and the second as a P-LANG

    # auto - from text's from our train data
    #for text, _ in TRAIN_DATA:
    #doc = nlp(text)
    #print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    ##print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example No. 16
    def _train_one_round(self, i: int) -> None:
        n_iter = 100  #number of iterations. could make this customizable but I feel that it would be too messy
        #train model and save to self.nlp
        self.anns_this_round = i * self.anns_per_point
        if self.verbose:
            print("Training on %s annotations" % (self.anns_this_round))
        count = 0
        train_data = []
        for line in self.train_file:
            train_data.append(line)
            count += 1
            if count >= self.anns_this_round:
                break
        """Set up the pipeline and entity recognizer, and train the new entity."""
        random.seed(0)
        self.nlp = spacy.blank("en")  # create blank Language class
        # Add entity recognizer to model if it's not in the pipeline
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if "ner" not in self.nlp.pipe_names:
            ner = self.nlp.create_pipe("ner")
            self.nlp.add_pipe(ner)
        # otherwise, get it, so we can add labels to it
        else:
            ner = self.nlp.get_pipe("ner")

        for label in self.label:
            ner.add_label(label)  # add new entity label to entity recognizer
        optimizer = self.nlp.begin_training()

        move_names = list(ner.move_names)
        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in self.nlp.pipe_names if pipe not in pipe_exceptions
        ]
        # only train NER
        with self.nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once",
                                    category=UserWarning,
                                    module='spacy')

            sizes = compounding(1.0, 4.0, 1.001)
            # batch up the examples using spaCy's minibatch
            for itn in range(n_iter):
                random.shuffle(train_data)
                # Need some oversampling somewhere in here
                batches = minibatch(train_data, size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    self.nlp.update(texts,
                                    annotations,
                                    sgd=optimizer,
                                    drop=0.35,
                                    losses=losses)
                #print("Losses", losses)
        output_dir = Path("Baby")
        if not output_dir.exists():
            output_dir.mkdir()
        self.nlp.meta["name"] = "BabyModel"  # rename model
        self.nlp.to_disk(output_dir)
Example No. 17
def main(model=None,
         new_model_name='new_model',
         output_dir='F://ThesisProject//data//ann//',
         n_iter=20):
    """Setting up the pipeline and entity recognizer, and training the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for i in LABEL:
        ner.add_label(i)  # Add new entity labels to entity recognizer

    print("Added labels:", list(LABEL))

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # Test the trained model
    test_text = "A new harvesting system and the fibre properties of reed canary grass (Phalaris arundinacea L.) makes this grass an interesting new raw material source for the pulp and paper industry in the Nordic countries. Pilot scale tests in Finland shows that high quality fine paper can successfully be produced from delayed harvested reed canary grass. Birch pulp can be replaced with reed canary grass pulp in fine paper furnish without any significant differences in the functional properties of paper."
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
Example No. 18
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("sv")  # create blank Language class
        print("Created blank 'sv' model")

    if model is None:
        with open('cc.sv.300.vec', 'rb') as file_:
            header = file_.readline()
            nr_row, nr_dim = header.split()
            nlp.vocab.reset_vectors(width=int(nr_dim))
            for line in file_:
                line = line.rstrip().decode('utf8')
                pieces = line.rsplit(' ', int(nr_dim))
                word = pieces[0]
                vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
                nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            print("\nStatring iteration " + str(itn+1))
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print(f"Loss: {losses['ner']}")
            print('------- Evaluation on TRAIN -------')
            print(evaluate(nlp, TRAIN_DATA))
            print('\n------- Evaluation on TEST -------')
            print(evaluate(nlp, TEST_DATA))

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        print(evaluate(nlp2, TEST_DATA))
Example No. 19
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
Example No. 20
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('Neutral')
    textcat.add_label('Bullish')
    textcat.add_label('Bearish')

    # load the tweets dataset
    print("Loading tweets data...")
    # (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    (train_texts, train_cats), (dev_texts,
                                dev_cats) = load_data_2(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts * 2, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{
        'cats': cats
    } for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    dropout = decaying(0.6, 0.2, 1e-4)
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(2., 8., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=next(dropout),
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                try:
                    scores = evaluate(nlp.tokenizer, textcat, dev_texts,
                                      dev_cats)
                    print(
                        '{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                        .format(losses['textcat'], scores['textcat_p'],
                                scores['textcat_r'], scores['textcat_f']))
                except Exception as e:
                    print(e)
                    pass

    # test the trained model
    test_text = "#aapl buy for 250m the market!!!"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is None:
        output_dir = Path('tweetsClassifier/spacyTrainingModel')
        if not output_dir.exists():
            output_dir.mkdir()
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        test_text = "long #aapl for 250m the market!!!"
        test_text2 = "#aapl lead the market!"
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
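The tweet classifier calls load_data_2, which isn't shown. A hypothetical loader returning ((train_texts, train_cats), (dev_texts, dev_cats)) with the three labels added above (the CSV path and format are assumptions):

import csv
import random


def load_data_2(limit=0, split=0.8):
    # Hypothetical loader: read (text, label) rows and build one-hot category dicts.
    rows = []
    with open("tweets_labelled.csv", encoding="utf8") as f:
        for text, label in csv.reader(f):
            cats = {name: name == label for name in ("Neutral", "Bullish", "Bearish")}
            rows.append((text, cats))
    random.shuffle(rows)
    if limit:
        rows = rows[:limit]
    split_at = int(len(rows) * split)
    texts, cats = zip(*rows)
    return (texts[:split_at], cats[:split_at]), (texts[split_at:], cats[split_at:])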
Example No. 21
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    n_iter = 100

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.55,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                print('Losses', losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        #print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example No. 22
    def _set_params(self, kwargs):
        """
        Set input parameters based on the request.
        :
        :For details refer to the GitHub project: https://github.com/nabeel-oz/qlik-py-tools
        """
        
        # Set default values which will be used if execution arguments are not passed
        
        # Default parameters:
        self.debug = False
        self.model = 'en_core_web_sm'
        self.custom = False
        self.base_model = 'en_core_web_sm'
        self.blank = False
        self.epochs = 100
        self.batch_size = compounding(4.0, 32.0, 1.001)
        self.drop = 0.25
        self.test = 0
              
        # Extract the model path if required
        try:
            # Get the model name from the first row in the request_df 
            self.model = self.request_df.loc[0, 'model_name']

            # Remove the model_name column from the request_df
            self.request_df = self.request_df.drop(['model_name'], axis=1)
        except KeyError:
            pass
        
        # If key word arguments were included in the request, get the parameters and values
        if len(kwargs) > 0:
            
            # Transform the string of arguments into a dictionary
            self.kwargs = utils.get_kwargs(kwargs)
            
            # Set the debug option for generating execution logs
            # Valid values are: true, false
            if 'debug' in self.kwargs:
                self.debug = 'true' == self.kwargs['debug'].lower()
                
                # Additional information is printed to the terminal and logs if the parameter debug = true
                if self.debug:
                    # Increment log counter for the class. Each instance of the class generates a new log.
                    self.__class__.log_no += 1

                    # Create a log file for the instance
                    # Logs will be stored in ..\logs\SpaCy Log <n>.txt
                    self.logfile = os.path.join(os.getcwd(), 'logs', 'SpaCy Log {}.txt'.format(self.log_no))

                    self._print_log(1)
            
            # Set whether the model (if getting named entities) or base model (if retraining) is a custom model
            # i.e. not one of the pre-trained models provided by spaCy
            if 'custom' in self.kwargs:
                self.custom = 'true' == self.kwargs['custom'].lower()
           
            # Set the base model, i.e. an existing spaCy model to be retrained.
            if 'base_model' in self.kwargs:
                self.base_model = self.kwargs['base_model'].lower()
            
            # Set the retraining to be done on a blank Language class
            if 'blank' in self.kwargs:
                self.blank = 'true' == self.kwargs['blank'].lower()
            
            # Set the epochs for training the model. 
            # This is the number of times that the learning algorithm will work through the entire training dataset.
            # Valid values are an integer e.g. 200
            if 'epochs' in self.kwargs:
                self.epochs = utils.atoi(self.kwargs['epochs'])
            
            # Set the batch size to be used during model training. 
            # The model's internal parameters will be updated at the end of each batch.
            # Valid values are a single integer or compounding or decaying parameters.
            if 'batch_size' in self.kwargs:
                # The batch size may be a single integer
                try:
                    self.batch_size = utils.atoi(self.kwargs['batch_size'])
                # Or a list of floats
                except ValueError:
                    sizes = utils.get_kwargs_by_type(self.kwargs['batch_size']) 

                    # If the start < end, batch sizes will be compounded
                    if sizes[0] < sizes[1]:
                        self.batch_size = compounding(sizes[0], sizes[1], sizes[2])
                    # else batch sizes will decay during training
                    else:
                        self.batch_size = decaying(sizes[0], sizes[1], sizes[2])
            
            # Set the dropout rate for retraining the model
            # This determines the likelihood that a feature or internal representation in the model will be dropped,
            # making it harder for the model to memorize the training data.
            # Valid values are a float less than 1.0 e.g. 0.35
            if 'drop' in self.kwargs:
                self.drop = utils.atof(self.kwargs['drop'])
            
            # Set the ratio of data to be used for testing. 
            # This data will be held out from training and just used to provide evaluation metrics.
            # Valid values are a float >= zero and < 1.0 e.g. 0.3
            if 'test' in self.kwargs:
                self.test = utils.atof(self.kwargs['test'])
               
        # Debug information is printed to the terminal and logs if the parameter debug = true
        if self.debug:
            self._print_log(2)
        
        # Remove the kwargs column from the request_df
        self.request_df = self.request_df.drop(['kwargs'], axis=1)
Example No. 23
def train_spacy_ner(training_data, model=None, n_iter=20, debug=False):
    """Example of training spaCy's named entity recognizer, starting off with an
    existing model or a blank model.

    For more details, see the documentation:
    * Training: https://spacy.io/usage/training
    * NER: https://spacy.io/usage/linguistic-features#named-entities

    Compatible with: spaCy v2.0.0+
    Last tested with: v2.1.0
    """
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in training_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()

        ## gather up docs that throw exception (only in debug mode)
        baddocs = set()

        for itn in range(n_iter):
            random.shuffle(training_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            # singlebatch for debug
            singlebatch = 1
            compoundbatch = compounding(4.0, 32.0, 1.001)
            batchsize = singlebatch if debug else compoundbatch
            batches = minibatch(training_data, size=batchsize)
            for batch in batches:
                texts, annotations = zip(*batch)
                try:
                    nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        drop=0.5,  # dropout - make it harder to memorise data
                        losses=losses,
                    )
                except Exception:
                    if debug:
                        print("Exception thrown when processing doc:")
                        print(texts, annotations)
                        baddocs.add(batch[0][0])
                        continue
            print("Losses", losses)
    return nlp, baddocs
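
# Note: a hedged usage sketch for train_spacy_ner above. The training data uses
# spaCy v2's character-offset entity format; the texts, labels and output path
# here are illustrative assumptions, not taken from the original script.
SAMPLE_TRAIN_DATA = [
    ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
    ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
]

trained_nlp, bad_docs = train_spacy_ner(SAMPLE_TRAIN_DATA, model=None, n_iter=20, debug=True)
if bad_docs:
    print("Documents that raised exceptions during training:", bad_docs)
trained_nlp.to_disk("ner_model")  # persist the trained pipeline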
Exemplo n.º 24
0
def main(model=str(get_newest_model()),
         new_model_name=LABEL.lower(),
         output_dir='custom-models',
         n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # test the trained model with test_text taken from the second command-line argument, defaulting to a very basic test
    test_text = "This is " + new_model_name.replace('_', ' ').title() + "."
    if len(sys.argv) >= 3 and sys.argv[2] != "":
        test_text = sys.argv[2]

    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        millis = int(round(time.time() * 1000))
        output_dir = Path(output_dir + '/' + str(millis) + "-" +
                          new_model_name)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
Exemplo n.º 25
0
def main(model=None, new_model_name='pineapple', output_dir='./pineapple_1_0', n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    # if model is not None:
    #     nlp = spacy.load('en')
    #     print("Loaded model '%s'" % model)
    # else:
    #     nlp = spacy.blank('en')  # create blank Language class
    #     print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy

    nlp = spacy.load('en_core_web_sm')

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    ner.add_label(LABEL_2)
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)


    test_text = 'Office E&T 420 Phone (323) 343-669 Address 5151 State University Drive, Los Angeles, CA 90032 Web cs.calstatela.edu or www.calstatela.edu/cs'

    doc = nlp(test_text)
    #print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
Exemplo n.º 26
0
def main(path,
         model=None,
         new_model_name='new_model',
         output_dir=None,
         n_iter=10):
    """Setting up the pipeline and entity recognizer, and training the new entity."""
    with open(path, 'rb') as fp:
        train_data = pickle.load(fp)
    trim_entity_spans(train_data)
    train_data = [fix(el) for el in train_data]
    if model is not None:
        nlp = spacy.load(model)  # load existing spacy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('xx')  # create blank Language class
        print("Created blank 'xx' model")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    # for i in LABEL:
    #    ner.add_label(i)  # Add new entity labels to entity recognizer

    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Get names of other pipes to disable them during training to train only NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                texts = [str.strip(text) for text in texts]
                try:
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=0.35,
                               losses=losses)
                except Exception as e:
                    print(e)
            print('Losses', losses)
            # stop early once the NER loss drops below 20 (guard against a missing
            # key if every batch in this iteration raised an exception)
            if losses.get('ner', float('inf')) < 20:
                break

    # Test the trained model
    test_text = 'Republika Srbija se odriće Kosova na dan 22. maja 2019. godine, i prestonica drzave je  u Novi Sad.'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save model
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
def main(dir_kb,
         output_dir=None,
         loc_training=None,
         epochs=10,
         dropout=0.5,
         lr=0.005,
         l2=1e-6,
         train_inst=None,
         dev_inst=None,
         labels_discard=None):
    logger.info("Creating Entity Linker with Wikipedia and WikiData")

    output_dir = Path(output_dir) if output_dir else dir_kb
    training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
    nlp_dir = dir_kb / KB_MODEL_DIR
    kb_path = dir_kb / KB_FILE
    nlp_output_dir = output_dir / OUTPUT_MODEL_DIR

    # STEP 0: set up IO
    if not output_dir.exists():
        output_dir.mkdir()

    # STEP 1 : load the NLP object
    logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
    nlp = spacy.load(nlp_dir)
    logger.info("STEP 1b: Loading KB from {}".format(kb_path))
    kb = read_kb(nlp, kb_path)

    # check that there is a NER component in the pipeline
    if "ner" not in nlp.pipe_names:
        raise ValueError(
            "The `nlp` object should have a pretrained `ner` component.")

    # STEP 2: read the training dataset previously created from WP
    logger.info(
        "STEP 2: Reading training dataset from {}".format(training_path))

    if labels_discard:
        labels_discard = [x.strip() for x in labels_discard.split(",")]
        logger.info("Discarding {} NER types: {}".format(
            len(labels_discard), labels_discard))

    train_data = wikipedia_processor.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=False,
        limit=train_inst,
        kb=kb,
        labels_discard=labels_discard)

    # for testing, get all pos instances (independently of KB)
    dev_data = wikipedia_processor.read_training(
        nlp=nlp,
        entity_file_path=training_path,
        dev=True,
        limit=dev_inst,
        kb=None,
        labels_discard=labels_discard)

    # STEP 3: create and train an entity linking pipe
    logger.info("STEP 3: Creating and training an Entity Linking pipe")

    el_pipe = nlp.create_pipe(name="entity_linker",
                              config={
                                  "pretrained_vectors": nlp.vocab.vectors.name,
                                  "labels_discard": labels_discard
                              })
    el_pipe.set_kb(kb)
    nlp.add_pipe(el_pipe, last=True)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
        optimizer = nlp.begin_training()
        optimizer.learn_rate = lr
        optimizer.L2 = l2

    logger.info("Training on {} articles".format(len(train_data)))
    logger.info("Dev testing on {} articles".format(len(dev_data)))

    # baseline performance on dev data
    logger.info("Dev Baseline Accuracies:")
    measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)

    for itn in range(epochs):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
        batchnr = 0

        with nlp.disable_pipes(*other_pipes):
            for batch in batches:
                try:
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs=docs,
                        golds=golds,
                        sgd=optimizer,
                        drop=dropout,
                        losses=losses,
                    )
                    batchnr += 1
                except Exception as e:
                    logger.error("Error updating batch:" + str(e))
        if batchnr > 0:
            logging.info("Epoch {}, train loss {}".format(
                itn, round(losses["entity_linker"] / batchnr, 2)))
            measure_performance(dev_data,
                                kb,
                                el_pipe,
                                baseline=False,
                                context=True)

    # STEP 4: measure the performance of our trained pipe on an independent dev set
    logger.info("STEP 4: Final performance measurement of Entity Linking pipe")
    measure_performance(dev_data, kb, el_pipe)

    # STEP 5: apply the EL pipe on a toy example
    logger.info("STEP 5: Applying Entity Linking to toy example")
    run_el_toy_example(nlp=nlp)

    if output_dir:
        # STEP 6: write the NLP pipeline (now including an EL model) to file
        logger.info("STEP 6: Writing trained NLP to {}".format(nlp_output_dir))
        nlp.to_disk(nlp_output_dir)

        logger.info("Done!")
Exemplo n.º 28
0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
init_tok2vec=None
# Train
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    if init_tok2vec is not None:
        with init_tok2vec.open("rb") as file_:
            textcat.model.tok2vec.from_bytes(file_.read())
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            
            if texts[0] is None or annotations[0] is None:
                print('texts[0] or annotations[0] is None, skipping batch')
                continue
            
            nlp.update(texts, 
                       annotations, 
                       sgd=optimizer, 
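
# Note: the fragment above is cut off on both ends; its opening lines compute the
# precision/recall that feed the F-score shown. A sketch of the full evaluation
# helper it appears to come from (spaCy's standard textcat example), assuming
# binary POSITIVE/NEGATIVE categories and a 0.5 decision threshold:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0   # true positives
    fp = 1e-8  # false positives (small epsilon avoids division by zero)
    fn = 1e-8  # false negatives
    tn = 0.0   # true negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold or label == "NEGATIVE":
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1.0
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}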
Exemplo n.º 29
0
def main(model=None, n_iter=100):
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(preprocess(text))
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    output_dir = Path("vi")
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    for text, _ in TRAIN_DATA:
        doc = nlp2(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
def main(model=None, output_dir="model_entRuler", n_iter=70):
    # setup pipeline
    # load the model you want to use
    # use nlp.disable_pipes to disable all pipes but NER
    """
    Load the model, set up the pipeline and train the entity recognizer.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        #nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser'])
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # ADD EXISTING/TRAINED ENTITY RULER  ----------------
    # load patterns from external file
    #nu_ruler = EntityRuler(nlp).from_disk('iesa_ners_patterns_mmat.jsonl')

    # putting the ruler before ner will override ner decisions in favor of ruler patterns
    #nlp.add_pipe(nu_ruler, before='ner')

    # show pipeline components:
    print(nlp.pipe_names)
    # --------------------------------------------------

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model

        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # show pipeline components:
    print(nlp.pipe_names)

    # end program
    print('Done.')
Exemplo n.º 31
0
def train(args, db):
    """
    Set up the pipeline and entity recognizer, and train the new entity
    on the given training data.
    Note: If you're using an existing model, make sure to mix in examples of
    other entity types that spaCy correctly recognized before. Otherwise, your
    model might learn the new type, but "forget" what it previously knew.
    https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
    """

    n_iter = 1
    model = None
    usecase = args.usecase
    source = args.source
    threshold_value = args.threshold
    print("Building model for usecase: %s , Source: %s, Threshold: %s" %
          (usecase, source, threshold_value))
    base_dir = Path("{}/usecases/{}".format(MODEL_PATH, usecase))
    output_dir = "{}/usecases/{}/Model1".format(MODEL_PATH, usecase)

    if not base_dir.exists():
        os.makedirs("{}/usecases/{}".format(MODEL_PATH, usecase))

    output_dir = Path(output_dir)
    if output_dir.exists():
        model = output_dir

    training_set = []
    labels = []
    if source == 'csv':
        df = pd.read_csv(DATA_PATH + '/usecases_trainingset.csv')
        trainingset = df[df['use_case_id'] == usecase][[
            'sentence', 'entities'
        ]].values.tolist()
    else:
        trainingset = db.query(
            'select sentence, entities from usecases_trainingset where use_case_id=%s'
            % usecase)

    for data in trainingset:
        training_set.append((data[0], {"entities": ast.literal_eval(data[1])}))
        for label in ast.literal_eval(data[1]):
            labels.append(label[2])

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.load('en_core_web_sm')  # load the small pre-trained English model
        # nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
        print("Loaded 'en_core_web_sm' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    for label in labels:
        ner.add_label(label)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(training_set)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(training_set, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # test the trained model
    # test_text = "Your PNR is U7223N."
    # doc = nlp(test_text)
    # print("Entities in '%s'" % test_text)
    # for ent in doc.ents:
    #     print(ent.label_, ent.text)

    # save model to output directory
    print('Saving model to output directory:', output_dir)

    if output_dir is not None:
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = 'usecase_{}.model'.format(usecase)
        nlp.to_disk(output_dir)
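
# Note: the training rows consumed above store entity spans as a Python-literal
# string in the 'entities' column. A small illustration of that expected format,
# reusing the commented-out test sentence from this example (offsets are
# character positions, end-exclusive, as spaCy v2 expects):
import ast

row = ("Your PNR is U7223N.", "[(12, 18, 'PNR')]")  # (sentence, entities) as read from csv/db
sentence, entities_str = row
example = (sentence, {"entities": ast.literal_eval(entities_str)})
print(example)  # ('Your PNR is U7223N.', {'entities': [(12, 18, 'PNR')]})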
Exemplo n.º 32
0
def train_spacy(data, n_iter, load, output_dir):
    """
    Train the spacy model to tag new entity and save the trained model
    @param: 
        TRAIN_DATA: training data ()
        n_iter: number of iterations, type integer
        load: bool value, False to load pretrained spacy model. True to load an empty model
        output_dir: str, path of directory to save the trained model
    
    """

    TRAIN_DATA = data
    # load spaCy model
    if load:
        print('Load Spacy model')
        nlp = spacy.load("en_core_web_sm")
    else:
        nlp = spacy.blank('en')

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    ner.add_label('AstroTerm')
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Resume training
    #optimizer = nlp.resume_training()
    #move_names = list(ner.move_names)

    print("Model training...")
    nlp.begin_training()

    # Begin training by disabling other pipeline components
    with nlp.disable_pipes(*other_pipes):
        # show warnings for misaligned entity spans once
        optimizer = nlp.begin_training()

        sizes = compounding(1.0, 5.0, 1.001)
        # Training for n_iter iterations
        for itn in progressbar(range(n_iter)):
            sleep(0.02)
            # shuffle examples before training
            random.shuffle(TRAIN_DATA)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=sizes)
            # dictionary to store losses
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                # Calling update() over the iteration
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.25,
                           losses=losses)
                #print("Losses", losses)
    print("Saving the trained model...")
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
Exemplo n.º 33
0
    def main(TRAIN_DATA, model=None, output_dir=None, n_iter=30):
        """Load the model, set up the pipeline and train the entity recognizer."""
        if model is not None:
            nlp = spacy.load(model)  # load existing spaCy model
            print("Loaded model '%s'" % model)
        else:
            nlp = spacy.blank("es")  # create blank Language class
            print("Created blank 'es' model")
    
        # create the built-in pipeline components and add them to the pipeline
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if "ner" not in nlp.pipe_names:
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner, last=True)
        # otherwise, get it so we can add labels
        else:
            ner = nlp.get_pipe("ner")
    
        # add labels
        for _, annotations in TRAIN_DATA:
            for ent in annotations.get("entities"):
                ner.add_label(ent[2])
    
        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        with nlp.disable_pipes(*other_pipes):  # only train NER
            # reset and initialize the weights randomly – but only if we're
            # training a new model
            if model is None:
                nlp.begin_training()
            for itn in range(n_iter):
                random.shuffle(TRAIN_DATA)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        drop=0.5,  # dropout - make it harder to memorise data
                        losses=losses,
                    )
                print("Losses", losses)
    
        # test the trained model
#        for text, _ in TRAIN_DATA:
#            doc = nlp(text)
#            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
#            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    
        # save model to output directory
        output_dir = "./neroutput"
        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.to_disk(output_dir)
            print("Saved model to", output_dir)
        
        return nlp
            # test the saved model
#            print("Loading from", output_dir)
#            nlp2 = spacy.load(output_dir)
#            for text, _ in TRAIN_DATA:
#                doc = nlp2(text)
#                print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
#                print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Exemplo n.º 34
0
def main(
        model=None,
        # '.\\trained_models\\omkar_model',
        new_model_name="newtrain",
        output_dir='.\\trained_models\\new_model',
        n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        # nlp = spacy.load("en_core_web_sm")  # create Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy

    if "ner" not in nlp.pipe_names:
        print("Creating new pipe")
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)

    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # if model is None or reset_weights:
    #     optimizer = nlp.begin_training()
    # else:
    #     optimizer = nlp.resume_training()
    # move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        # batch up the examples using spaCy's minibatch
        sizes = compounding(1.0, 4.0, 1.001)
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                try:
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=0.4,
                               losses=losses)
                except Exception:
                    # skip batches that fail to update (e.g. misaligned entity spans)
                    pass
            print("Losses", losses)

    # test the trained model
    test_text = "Marathwada Mitra Mandals College of Engineering"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        # assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
Exemplo n.º 35
0
def main(model=None,
         output_dir=None,
         n_iter=20,
         n_texts=2000,
         init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat",
                                  config={
                                      "exclusive_classes": True,
                                      "architecture": "simple_cnn"
                                  })
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{
        "cats": cats
    } for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".
                  format(  # print a simple table
                      losses["textcat"],
                      scores["textcat_p"],
                      scores["textcat_r"],
                      scores["textcat_f"],
                  ))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
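
# Note: load_data() is referenced above but not shown. A sketch consistent with
# spaCy's standard textcat example, which draws the IMDB data from thinc's
# bundled dataset helper (the 80/20 split and limit handling are assumptions):
import random

import thinc.extra.datasets


def load_data(limit=0, split=0.8):
    """Load and shuffle the IMDB data, returning (train, dev) texts and label dicts."""
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])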
Exemplo n.º 36
0
def main(model=None,
         new_model_name="rosatom-docs",
         output_dir='./model',
         n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("ru")  # create blank Language class
        print("Created blank 'ru' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    for train_data in TRAIN_DATA:
        print("Train data '%s'" % train_data['label'])
        ner.add_label(
            train_data['label'])  # add new entity label to entity recognizer
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.resume_training()
        move_names = list(ner.move_names)
        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
        ]
        # only train NER
        with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once",
                                    category=UserWarning,
                                    module='spacy')

            sizes = compounding(1.0, 4.0, 1.001)
            # batch up the examples using spaCy's minibatch
            for itn in range(n_iter):
                random.shuffle(train_data['data'])
                batches = minibatch(train_data['data'], size=sizes)
                losses = {}
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=0.35,
                               losses=losses)
                print("Losses", losses)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
Exemplo n.º 37
0
def main(model=None, output_dir=None, n_iter=100):

    if model is not None:
        nlp = spacy.load(model) 
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes): 
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            print(st)
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}

            batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts, 
                    annotations, 
                    drop=0.5, 
                    losses=losses,
                )
            print("Losses", losses)
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    print(st)
    print("DONE")

    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
def train(model=None, output_dir=None, n_iter=20, n_texts=2000, categories=[], train_texts=[], train_cats=[], dev_texts=[], dev_cats=[]):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    #categories = ['greet', 'time', 'direction', 'self-location', 'location', 'search-general', 
    #'search-restaurants', 'affirmation', 'negation', 'launch', 'news', 'shut-down',
    #'compliment', 'search-wikipedia']

    for category in categories:
        textcat.add_label(category)

    # load the categorisation data (passed in via the function arguments)
    print("Loading categorisation data...")
    #(train_texts, train_cats), (dev_texts, dev_cats) = load_data(categories, limit=n_texts)


    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('Iter #', 'LOSS', 'P', 'R', 'F'))
        # decaying dropout: start at 0.6 and decay towards 0.2 as training progresses
        dropout = decaying(0.6, 0.2, 1e-4)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            #batches = get_batches(train_data, 'textcat')
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)

                nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout),
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'  # print a simple table
                  .format(i, losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
Exemplo n.º 39
0
            ner.add_label(ent[2])

    n_iter=10

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            print(st)
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            #batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.3,
                    sgd=optimizer,
                    losses=losses)
                print('Losses', losses)
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    print(st)
    print("DONE")

    # test the trained model
Exemplo n.º 40
0
def main(
    ud_dir,
    parses_dir,
    corpus,
    config=None,
    limit=0,
    gpu_device=-1,
    vectors_dir=None,
    use_oracle_segments=False,
):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    spacy.util.fix_random_seed()
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False

    if config is not None:
        config = Config.load(config, vectors_dir=vectors_dir)
    else:
        config = Config(vectors_dir=vectors_dir)
    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config, vectors=vectors_dir)

    docs, golds = read_data(
        nlp,
        paths.train.conllu.open(),
        paths.train.text.open(),
        max_doc_length=config.max_doc_length,
        limit=limit,
    )

    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)

    batch_sizes = compounding(config.min_batch_size, config.max_batch_size,
                              1.001)
    beam_prob = compounding(0.2, 0.8, 1.001)
    for i in range(config.nr_epoch):
        docs, golds = read_data(
            nlp,
            paths.train.conllu.open(),
            paths.train.text.open(),
            max_doc_length=config.max_doc_length,
            limit=limit,
            oracle_segments=use_oracle_segments,
            raw_text=not use_oracle_segments,
        )
        Xs = list(zip(docs, golds))
        random.shuffle(Xs)
        if config.batch_by_words:
            batches = minibatch_by_words(Xs, size=batch_sizes)
        else:
            batches = minibatch(Xs, size=batch_sizes)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                nlp.update(
                    batch_docs,
                    batch_gold,
                    sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
                )

        out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
        with nlp.use_params(optimizer.averages):
            if use_oracle_segments:
                parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
                                               paths.dev.conllu, out_path)
            else:
                parsed_docs, scores = evaluate(nlp, paths.dev.text,
                                               paths.dev.conllu, out_path)
        print_progress(i, losses, scores)
Exemplo n.º 41
0
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Exemplo n.º 42
0
def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    ner.add_label("VEGETABLE")
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)