def create_model():
    set_gpu()

    TRAIN_DATA, TEST_DATA = load_data()

    nlp = spacy.load("en_pytt_bertbaseuncased_lg")

    textcat = nlp.create_pipe("pytt_textcat", config={"exclusive_classes": True})

    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)

    nlp.add_pipe(textcat)

    optimizer = nlp.resume_training()

    dropout = decaying(0.6, 0.2, 1e-4)

    print("Training the model...")

    for i in range(10):
        print("Iteration =>", i)
        random.shuffle(TRAIN_DATA)
        losses = {}
        for batch in get_batches(TRAIN_DATA, "textcat"):
            texts, cats = zip(*batch)
            print(texts, cats)
            # draw a fresh rate from the decaying dropout schedule on every update
            nlp.update(texts, cats, sgd=optimizer, losses=losses, drop=next(dropout))
        print(i, losses)

    with nlp.use_params(optimizer.averages):
        nlp.to_disk("models")
Example #2
def test_issue3447():
    sizes = decaying(10.0, 1.0, 0.5)
    size = next(sizes)
    assert size == 10.0
    size = next(sizes)
    assert size == 10.0 - 0.5
    size = next(sizes)
    assert size == 10.0 - 0.5 - 0.5
Example #4
def test_decaying():
    sizes = decaying(10., 1., .5)
    size = next(sizes)
    assert size == 10.
    size = next(sizes)
    assert size == 10. - 0.5
    size = next(sizes)
    assert size == 10. - 0.5 - 0.5
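The tests above pin down the behaviour of decaying(start, stop, decay): it yields start and then subtracts decay on each draw. A minimal sketch of inspecting the schedule directly; the values beyond the three asserted above, and the eventual clipping at stop, are assumptions about the installed spaCy version rather than something the tests check:

from itertools import islice

from spacy.util import decaying

# materialize the first few values of the schedule asserted in the tests above
dropout = decaying(10.0, 1.0, 0.5)
print(list(islice(dropout, 5)))
# -> [10.0, 9.5, 9.0, 8.5, 8.0]
# (the first three values are exactly what the tests assert; the remaining ones
#  assume the decay stays linear until the stop value is reached)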
Example #5
    def train_entity(self, nlp, output_dir, train_data, n_iter, dropout):
        """Load the model, set up the pipeline and train the entity recognizer.
        

        Keyword arguments:
        model -- path to the model if existent
        output_dir -- path where model is saved at
        n_iter -- amount of times data is trained with
        train_data -- training data in BILOU Format

        Returns:
        output_dir -- path to model
        """
        dropout = decaying(0.6, 0.2, 1e-4)
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
        ]
        disabled = nlp.disable_pipes(*other_pipes)
        logging.info("Started training entities...")
        optimizer = nlp.begin_training()
        for iteration in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations, _ = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=next(
                        dropout),  # dropout - make it harder to memorise data
                    sgd=optimizer,
                    losses=losses,
                )
            p, r, f = self.evaluate_entity(nlp)
            self.entity_score.append([p, r, f])
            logging.info("Finished %s iteration for NER with %s losses",
                         iteration, losses)
            self.losses_ner.append(losses)
        logging.info("Finished training entities...")
        disabled.restore()

        # save model to output directory
        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.to_disk(output_dir)
            logging.info("Saved entity model to %s", output_dir)

        return output_dir
Example #6
    def __init__(self):
        self.model_file = 'custommodel'
        self.train_data_file = 'data/train-data.txt'
        self.dropout = decaying(0.1, 0.0, 1e-4)
        self.iters = 20
        self.batch_size = 2

        #address
        self.address_label = "GPE"
        address_pattern = r'\d+[\w\s]+(?:avenue|ave|road|rd|boulevard|blvd|street|st|drive|dr|court|ct|highway|hwy|square|sq|park|parkway|pkwy|circle|cir|trail|trl)[,*\w\s]+([a-z][0-9][a-z]\s*[0-9][a-z][0-9](,*\s*canada)?)'
        self.address_pattern_object = re.compile(address_pattern,
                                                 re.IGNORECASE)
        #date
        self.date_label = "DATE"
        date_pattern = r'\d+(jan|january|feb|february|mar|march|apr|april|may|jun|june|jul|july|aug|august|sep|september|oct|october|nov|november|dec|december)\d+'
        self.date_pattern_object = re.compile(date_pattern, re.IGNORECASE)
Example #7
def train(new_model_name='persons', output_dir=None):

    optimizer = nlp.begin_training()
    
    other_pipes = [pipe
                    for pipe
                    in nlp.pipe_names
                    if pipe != 'ner']
    dropout = decaying(0.35, 0.25, 1e-4)  # create the schedule once so it actually decays
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(5):
            random.shuffle(to_train_ents)
            batches = minibatch(to_train_ents, size=compounding(4., 32., 1.001))
            losses = {}
            # for text, annotations in to_train_ents:
            #     nlp.update([text], [annotations], sgd=optimizer, drop=0.40,
            #                losses=losses)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout),
                           losses=losses)
            print(losses)

    if output_dir is None:
        output_dir = "./model1"


    noutput_dir = Path(output_dir)
    if not noutput_dir.exists():
        noutput_dir.mkdir()
    if output_dir is not None:
        nlp.meta['accuracy'] = {'ner': best_acc}
    nlp.meta['name'] = new_model_name
    
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)

    random.shuffle(to_train_ents)

    # quick test the saved model
    test_text = 'Gina Haspel, President Donald Trump’s controversial pick to be the next CIA director, has officially been confirmed by the Senate in a 54-45 vote.'
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(preprocess(nlp2(test_text)))
    print("Entities in '%s'" % doc2)
    for ent in doc2.ents:
        print(ent.label_, ent.text)
Example #8
def train_ner(train_data, validation_data):
    nlp = spacy.blank('en')

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    for raw_text, annotations in train_data:
        '''doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]'''
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        dropout = decaying(0.6, 0.2, 0.03)
        '''for itn in range(10):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.01))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=next(dropout), sgd=optimizer, losses=losses)'''
        for itn in range(10):
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                nlp.update([text], [annotations],
                           drop=next(dropout),
                           sgd=optimizer,
                           losses=losses)
            print("Losses: {}".format(losses))
            validate(nlp, validation_data, itn + 1)
            print('Epoch {} complete.\n'.format(itn + 1))
    return nlp
Example #9
def main(model='en',
         new_model_name='en-animals',
         output_dir=animal_model,
         use_gpu=-1,
         n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model_exists(output_dir):
        print('model exists.')
        test_model(output_dir, use_gpu)
        return
    if model is not None:
        print("Loading model '%s' ... " % model)
        if (use_gpu >= 0):
            spacy.util.use_gpu(0)
        nlp = spacy.load(model)  # load existing spaCy model
    else:
        print("Creating blank 'en' model ... ")
        nlp = spacy.blank('en')  # create blank Language class

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # otherwise, get it, so we can add labels to it
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)  # add new entity label to entity recognizer

    print('begin training... ')
    if model is None:
        optimizer = nlp.begin_training(device=use_gpu)
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = decaying(env_opt('dropout_from', 0.6),
                             env_opt('dropout_to', 0.2),
                             env_opt('dropout_decay', 1e-4))
    batch_sizes = compounding(env_opt('batch_from', 15),
                              env_opt('batch_to', 30),
                              env_opt('batch_compound', 1.005))

    # disable other pipes during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        n_train_words = count_train()
        for i in range(n_iter):
            losses = {}
            random.shuffle(TRAIN_DATA)

            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                    texts, annotations = zip(*batch)
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=next(dropout_rates),
                               losses=losses)
                    pbar.update(count_tokens(texts))
            print('{}/{} loss: {}'.format(i + 1, n_iter, losses))

    for text in test_texts:
        doc = nlp(text)
        print("Entities in '%s'" % text)
        for ent in doc.ents:
            print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        test_model(output_dir, use_gpu)
Example #10
nlp.add_pipe(ner, last=True)
ner.add_label("KATZ")  # add all new labels
n_iter = 100  # number of iterations
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"
               ]  # get names of other pipes to disable them during training

annotations = []

# ********************************
# start training
# ********************************
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()

    dropouts = decaying(0.3, 0.1, 1e-4)
    batch_size = compounding(
        4.0, 32.0, 1.001)  # https://spacy.io/usage/training#tips-batch-size

    for itn in range(n_iter):
        shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(
            TRAIN_DATA,
            size=batch_size)  # batch up the examples using spaCy's minibatch
        dropout = next(dropouts)

        print(itn)
        print("Dropout", dropout)

        for batch in batches:
Example #11
    def _set_params(self, kwargs):
        """
        Set input parameters based on the request.
        :
        :For details refer to the GitHub project: https://github.com/nabeel-oz/qlik-py-tools
        """

        # Set default values which will be used if execution arguments are not passed

        # Default parameters:
        self.debug = False
        self.model = 'en_core_web_sm'
        self.custom = False
        self.base_model = 'en_core_web_sm'
        self.blank = False
        self.epochs = 100
        self.batch_size = compounding(4.0, 32.0, 1.001)
        self.drop = 0.25
        self.test = 0

        # Extract the model path if required
        try:
            # Get the model name from the first row in the request_df
            self.model = self.request_df.loc[0, 'model_name']

            # Remove the model_name column from the request_df
            self.request_df = self.request_df.drop(['model_name'], axis=1)
        except KeyError:
            pass

        # If keyword arguments were included in the request, get the parameters and values
        if len(kwargs) > 0:

            # Transform the string of arguments into a dictionary
            self.kwargs = utils.get_kwargs(kwargs)

            # Set the debug option for generating execution logs
            # Valid values are: true, false
            if 'debug' in self.kwargs:
                self.debug = 'true' == self.kwargs['debug'].lower()

                # Additional information is printed to the terminal and logs if the parameter debug = true
                if self.debug:
                    # Increment log counter for the class. Each instance of the class generates a new log.
                    self.__class__.log_no += 1

                    # Create a log file for the instance
                    # Logs will be stored in ..\logs\SpaCy Log <n>.txt
                    self.logfile = os.path.join(
                        os.getcwd(), 'logs',
                        'SpaCy Log {}.txt'.format(self.log_no))

                    self._print_log(1)

            # Set whether the model (if getting named entities) or base model (if retraining) is a custom model
            # i.e. not one of the pre-trained models provided by spaCy
            if 'custom' in self.kwargs:
                self.custom = 'true' == self.kwargs['custom'].lower()

            # Set the base model, i.e. an existing spaCy model to be retrained.
            if 'base_model' in self.kwargs:
                self.base_model = self.kwargs['base_model'].lower()

            # Set the retraining to be done on a blank Language class
            if 'blank' in self.kwargs:
                self.blank = 'true' == self.kwargs['blank'].lower()

            # Set the epochs for training the model.
            # This is the number of times that the learning algorithm will work through the entire training dataset.
            # Valid values are an integer e.g. 200
            if 'epochs' in self.kwargs:
                self.epochs = utils.atoi(self.kwargs['epochs'])

            # Set the batch size to be used during model training.
            # The model's internal parameters will be updated at the end of each batch.
            # Valid values are a single integer or compounding or decaying parameters.
            if 'batch_size' in self.kwargs:
                # The batch size may be a single integer
                try:
                    self.batch_size = utils.atoi(self.kwargs['batch_size'])
                # Or a list of floats
                except ValueError:
                    sizes = utils.get_kwargs_by_type(self.kwargs['batch_size'])

                    # If the start < end, batch sizes will be compounded
                    if sizes[0] < sizes[1]:
                        self.batch_size = compounding(sizes[0], sizes[1],
                                                      sizes[2])
                    # else batch sizes will decay during training
                    else:
                        self.batch_size = decaying(sizes[0], sizes[1],
                                                   sizes[2])

            # Set the dropout rate for retraining the model
            # This determines the likelihood that a feature or internal representation in the model will be dropped,
            # making it harder for the model to memorize the training data.
            # Valid values are a float less than 1.0 e.g. 0.35
            if 'drop' in self.kwargs:
                self.drop = utils.atof(self.kwargs['drop'])

            # Set the ratio of data to be used for testing.
            # This data will be held out from training and just used to provide evaluation metrics.
            # Valid values are a float >= zero and < 1.0 e.g. 0.3
            if 'test' in self.kwargs:
                self.test = utils.atof(self.kwargs['test'])

        # Debug information is printed to the terminal and logs if the parameter debug = true
        if self.debug:
            self._print_log(2)

        # Remove the kwargs column from the request_df
        self.request_df = self.request_df.drop(['kwargs'], axis=1)
Example #12
    def determine_dropout(self):
        """
        For small datasets, it’s useful to set a high dropout rate at first, and decay
        it down towards a more reasonable value. This helps avoid the network
        immediately overfitting, while still encouraging it to learn some of the more
        interesting things in your data.
        """
        dropout = decaying(self.dropout_start, self.dropout_end, self.interval)
        return dropout
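The generator returned by determine_dropout is meant to be drawn from once per weight update, mirroring the training loops elsewhere on this page. A minimal consumption sketch, assuming a hypothetical nlp pipeline, an optimizer obtained from nlp.begin_training(), and train_data as a list of (text, annotations) pairs:

import random

from spacy.util import compounding, decaying, minibatch

# nlp, optimizer and train_data are assumed to exist (see the note above)
dropout = decaying(0.6, 0.2, 1e-4)           # starts high, decays towards 0.2
batch_sizes = compounding(4.0, 32.0, 1.001)  # batch size grows each step

for epoch in range(10):
    random.shuffle(train_data)
    losses = {}
    for batch in minibatch(train_data, size=batch_sizes):
        texts, annotations = zip(*batch)
        # pull a fresh dropout value for every call to nlp.update
        nlp.update(texts, annotations, sgd=optimizer,
                   drop=next(dropout), losses=losses)
    print(epoch, losses)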
Example #13
def train_textcat(nlp,
                  train_data,
                  init_tok2vec=None,
                  continue_training=False,
                  epochs=10,
                  dropout_rates=(0.6, 0.2, 1e-4),
                  minibatch_sizes=(1.0, 64.0, 1.001),
                  valid_docs=None,
                  valid_labels=None,
                  output_dir=None,
                  use_tqdm=False):
    """Train, evaluate, and store TextCategorizer model."""
    if "textcat" in nlp.pipe_names:
        train_eval_time = time.time()

        if valid_docs is not None or init_tok2vec is not None:
            textcat = nlp.get_pipe("textcat")

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
        with nlp.disable_pipes(*other_pipes):  # only train textcat
            # if continuing from an existing (base) model, reuse its optimizer
            if continue_training:
                # Start with an existing model, use default optimizer
                optimizer = nlp.resume_training()
            else:
                optimizer = nlp.begin_training()

            # load pretrained LMAO (language-model) tok2vec weights
            if init_tok2vec is not None:
                with init_tok2vec.open("rb") as file_:
                    print("Loading LMAO weights...")
                    textcat.model.tok2vec.from_bytes(file_.read())

            print("Training the model...")
            print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))

            # create batch sizes
            min_batch_size, max_batch_size, update_by = minibatch_sizes
            batch_sizes = compounding(min_batch_size, max_batch_size,
                                      update_by)

            # create decaying dropout
            starting_dropout, ending_dropout, decay_rate = dropout_rates
            dropouts = decaying(starting_dropout, ending_dropout, decay_rate)

            best_avg_f1 = 0
            for i in range(epochs):
                print("Epoch:", i)
                losses = {}

                # batch up the examples using spaCy's minibatch
                random.shuffle(train_data)
                if use_tqdm:
                    train_data = tqdm(train_data, leave=False)
                batches = minibatch(train_data, size=batch_sizes)
                for batch, dropout in zip(batches, dropouts):
                    texts, annotations = zip(*batch)
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               drop=dropout,
                               losses=losses)

                # evaluate model on validation set
                if valid_docs is not None and valid_labels is not None:
                    with textcat.model.use_params(optimizer.averages):
                        scores, valid_label_set = evaluate(
                            textcat, valid_docs, valid_labels)
                    print("{0:.3f}\t{1:}\t{2:}\t{3:}".format(
                        losses["textcat"], "_____", "_____", "_____"))
                    avg_f1 = 0
                    for vc in valid_label_set:
                        print("{0:}\t{1:.3f}\t{2:.3f}\t{3:.3f}".
                              format(  # print as a table
                                  vc,
                                  scores[vc]["precision"],
                                  scores[vc]["recall"],
                                  scores[vc]["f1-score"],
                              ))
                        avg_f1 += scores[vc]["f1-score"]
                    print("Accuracy:", scores["accuracy"])
                    print("_____________________________")

                    # assign best model, score, and epoch
                    avg_f1 = avg_f1 / len(valid_label_set)
                    if avg_f1 > best_avg_f1:
                        best_avg_f1 = avg_f1
                        # overwrite the weak with the strong
                        store_model(output_dir, nlp, optimizer)
                else:
                    print("{0:.3f}\t{1:}\t{2:}\t{3:}".format(
                        losses["textcat"], "_____", "_____", "_____"))

                if use_tqdm:
                    # train_data was put into a tqdm object and won't shuffle properly due to indexing
                    # put train_data back to its original type
                    train_data = train_data.iterable

            # store final model if no evaluation performed
            if valid_docs is None:
                store_model(output_dir, nlp, optimizer)

        print("Finished after: {0:.2f} minutes".format(
            (time.time() - train_eval_time) / 60))
    else:
        raise NameError(
            "Pipe 'textcat' is not in the nlp pipeline. Be sure to run mk_model() before training."
        )

    return nlp
Example #14
    def train_classifier_model(self, task, proj):
        # TODO: fetch these variables from the task, project, or self.config parameters
        n_iter = 20
        n_texts = 2000
        init_tok2vec = None
        languages = self.cfg.get('languages', {})
        self.logging.debug(f"languages: {languages}")
        for lang, model in languages.items():
            # data sample query selection
            proj['index_query'] = util.createTrainDataQuery(proj, lang)
            # load correct dataset from index
            (train_texts, train_cats), (
                dev_texts,
                dev_cats), categories = self.generate_classifier_data(proj)
            if len(categories) == 0 or len(train_texts) == 0:
                # only load the model if there is data to train
                self.logging.debug(
                    f"No new training data found in index for language: {lang}"
                )
                continue

            # check if the target model file already exists; if so, load it and update the existing model
            model_file, _, _ = util.getDataFilename(self.cfg,
                                                    f"{proj['id']}/{lang}",
                                                    None, None)
            _nlp = None
            if os.path.exists(model_file):
                self.logging.info(f"Loading project model '{model_file}'")
                _nlp = spacy.load(model_file)
            else:  # fallback to language core model
                self.logging.info(f"Loading default lang model '{model}'")
                _nlp = spacy.load(model)

            # add the text classifier to the pipeline if it doesn't exist
            if "textcat" not in _nlp.pipe_names:
                textcat = _nlp.create_pipe("textcat",
                                           config={
                                               "exclusive_classes": True,
                                               "architecture": "simple_cnn"
                                           })
                _nlp.add_pipe(textcat, last=True)
            # otherwise, get it, so we can add labels to it
            else:
                textcat = _nlp.get_pipe("textcat")

            # add label to text classifier
            self.logging.info(f"Categories: {categories}")
            for cat in categories:
                textcat.add_label(cat)

            train_texts = train_texts[:n_texts]
            train_cats = train_cats[:n_texts]
            self.logging.info(
                f"Using {n_texts} examples ({len(train_texts)} training, {len(dev_texts)} evaluation)"
            )
            train_data = list(
                zip(train_texts, [{
                    "cats": cats
                } for cats in train_cats]))

            # get names of other pipes to disable them during training
            pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
            other_pipes = [
                pipe for pipe in _nlp.pipe_names if pipe not in pipe_exceptions
            ]
            with _nlp.disable_pipes(*other_pipes):  # only train textcat
                optimizer = _nlp.begin_training()
                if init_tok2vec is not None:
                    with init_tok2vec.open("rb") as file_:
                        textcat.model.tok2vec.from_bytes(file_.read())
                self.logging.info("Training the model...")
                self.logging.debug("{:^5}\t{:^5}\t{:^5}\t{:^5}".format(
                    "LOSS", "P", "R", "F"))
                batch_sizes = compounding(4.0, 32.0, 1.001)
                dropout = decaying(0.6, 0.2, 1e-4)
                for _i in range(n_iter):
                    losses = {}
                    # batch up the examples using spaCy's minibatch
                    random.shuffle(train_data)
                    batches = minibatch(train_data, size=batch_sizes)
                    for batch in batches:
                        texts, annotations = zip(*batch)
                        _nlp.update(texts,
                                    annotations,
                                    sgd=optimizer,
                                    drop=next(dropout),
                                    losses=losses)
                        #_nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
                    with textcat.model.use_params(optimizer.averages):
                        # evaluate on the dev data split off in load_data()
                        scores = self.evaluate(_nlp.tokenizer, textcat,
                                               dev_texts, dev_cats)
                    self.logging.debug("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".
                                       format(  # print a simple table
                                           losses["textcat"],
                                           scores["textcat_p"],
                                           scores["textcat_r"],
                                           scores["textcat_f"],
                                       ))

            # save the model
            with _nlp.use_params(optimizer.averages):
                _nlp.to_disk(model_file)
            self.logging.info(f"Saved model to {model_file}")

            # Load the saved model
            self.loaded_models[f"{proj['id']}/{lang}"] = spacy.load(model_file)
            self.logging.debug(self.loaded_models)
Example #15
def modelSpacy(model=None,
               new_model_name="Product",
               output_dir=None,
               n_iter=40,
               training_data=None,
               validation_data=None,
               validation_plot=False,
               dropout=(0.35, 0.35, 1),
               batch=(1., 32., 1.001),
               verbose=1):
    # IMPORT LIBRARIES
    from bloo.mlMaster import spacyEvaluate
    from spacy.util import decaying

    # DECLARE VARIABLES
    lossesList = []
    lossesList.append(len(training_data))
    metricsLocal = {"precision": [0], "recall": [0], "f1score": [0]}

    # We want to reproduce the same random situation in each test
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)
    else:
        nlp = spacy.blank("en")
        if (verbose >= 1):
            print("Created a blank 'en' model")

    # Now we add the NER recognizer to the model
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    # Add the new entity label to entity recognizer
    ner.add_label("PRODUCTS")

    # Decide whether to start or to resume training
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)

    # Get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]

    # Only train the given NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        sizes = compounding(batch[0], batch[1], batch[2])
        dropout = decaying(dropout[0], dropout[1], dropout[2])
        # batch up the examples using spacy's mini batch
        for itn in range(n_iter + 1):
            random.shuffle(training_data)
            batches = minibatch(training_data, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=next(dropout),
                           losses=losses)

            # Print information about the current iteration after it is finished so we can better visualize the progress
            if (verbose >= 1):
                print("Losses after iteration %i: %s" % (itn, str(losses)))
                print("Current dropout rate: %.2f" % (next(dropout)))
            # Add the current loss to the list of losses, so we can plot it later
            lossesList.append(int(losses['ner']))

            # AFTER EACH ITERATION WE WANT TO VALIDATE THE DATA AND GET SOME RESULTS
            if validation_data is not None:
                random.shuffle(validation_data)
                metrics = spacyEvaluate(ner_model=nlp,
                                        examples=validation_data)
                print(
                    "Metrics after iteration %i: PRECISION: %.2f%% | RECALL: %.2f%% | F1SCORE: %.2f%% |"
                    % (itn, metrics['precision'], metrics['recall'],
                       metrics['f1score']))
                for key, value in metricsLocal.items():
                    metricsLocal[key].append(metrics[key])

    # SAVE THE MODEL
    if output_dir is not None:
        print("Spacy model: Saving the model in the output directory: \"%s\"" %
              (str(output_dir)))

        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print("Spacy model: saved successfully.")
    return nlp, lossesList, metricsLocal
Example #16
def train(model, train_data, dev_data, test_data, output_dir, n_iter,
          meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)

    original_tokenizer = nlp.tokenizer

    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):

        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs,
                               golds,
                               sgd=optimizer,
                               losses=losses,
                               drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i
    # save model to output directory
    best_model_path = Path(output_dir + "/" + "best")
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)), best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2,
                 dev_data,
                 dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2,
                 test_data,
                 dump_path=os.path.join(output_dir, "test_metrics.json"))
Example #17
Created on Sat Sep  1 11:26:50 2018

@author: Gurunath
"""

from __future__ import unicode_literals, print_function
import plac
import random
import pandas as pd
from pathlib import Path
import thinc.extra.datasets

import spacy
from spacy.util import minibatch, compounding
from spacy.util import decaying
dropout = decaying(0.6, 0.2, 1e-4)

tweet_df = pd.read_csv(
    r'F:\E\Learning_DL_fastai\competition\NLP_data\train_2kmZucJ.csv')


def get_batches(train_data, model_type):
    max_batch_sizes = {'tagger': 32, 'parser': 16, 'ner': 16, 'textcat': 64}
    max_batch_size = max_batch_sizes[model_type]
    if len(train_data) < 1000:
        max_batch_size /= 2
    if len(train_data) < 500:
        max_batch_size /= 2
    batch_size = compounding(1, max_batch_size, 1.001)
    batches = minibatch(train_data, size=batch_size)
    return batches
def train(model=None, output_dir=None, n_iter=20, n_texts=2000, categories=[], train_texts=[], train_cats=[], dev_texts=[], dev_cats=[]):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    #categories = ['greet', 'time', 'direction', 'self-location', 'location', 'search-general', 
    #'search-restaurants', 'affirmation', 'negation', 'launch', 'news', 'shut-down',
    #'compliment', 'search-wikipedia']

    for category in categories:
        textcat.add_label(category)

    # load the categorisation dataset
    print("Loading categorisation data...")
    #(train_texts, train_cats), (dev_texts, dev_cats) = load_data(categories, limit=n_texts)


    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('Iter #', 'LOSS', 'P', 'R', 'F'))
        dropout = decaying(0.6, 0.2, 1e-4)  # create the schedule once, outside the epoch loop
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            #batches = get_batches(train_data, 'textcat')
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout),
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'  # print a simple table
                  .format(i, losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
Example #19
            textcat = nlp_model.create_pipe('textcat')
            nlp_model.add_pipe(textcat, last=True)
            # add label to text classifier
            for custom_label in [
                    "toxic", "severe_toxic", "obscene", "threat", "insult",
                    "identity_hate"
            ]:  # Enter custom labels here <---------------------
                textcat.add_label(custom_label)

        # otherwise, get it, so we can add labels to it
        else:
            print('model already contains textcat!')
            textcat = nlp_model.get_pipe('textcat')

    optimizer = textcat.begin_training()
    dropout = decaying(drop_max, drop_min, drop_step)

    # store for evaluation
    precisions = list()
    recalls = list()
    f_scores = list()
    loss_vals = list()
    data_pds = pd.read_csv(nrows=n_texts,
                           filepath_or_buffer=data_filepath,
                           chunksize=chunk_size,
                           header=0,
                           names=[
                               "id", "comment_text", "toxic", "severe_toxic",
                               "obscene", "threat", "insult", "identity_hate"
                           ])
Example #20
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('Neutral')
    textcat.add_label('Bullish')
    textcat.add_label('Bearish')

    # load the tweets dataset
    print("Loading tweets data...")
    # (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    (train_texts, train_cats), (dev_texts,
                                dev_cats) = load_data_2(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts * 2, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{
        'cats': cats
    } for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    dropout = decaying(0.6, 0.2, 1e-4)
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(2., 8., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=next(dropout),
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                try:
                    scores = evaluate(nlp.tokenizer, textcat, dev_texts,
                                      dev_cats)
                    print(
                        '{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                        .format(losses['textcat'], scores['textcat_p'],
                                scores['textcat_r'], scores['textcat_f']))
                except Exception as e:
                    print(e)
                    pass

    # test the trained model
    test_text = "#aapl buy for 250m the market!!!"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    # default the output directory if none was given, then save the model
    if output_dir is None:
        output_dir = 'tweetsClassifier/spacyTrainingModel'
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    test_text = "long #aapl for 250m the market!!!"
    test_text2 = "#aapl lead the market!"
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    print(test_text, doc2.cats)
Example #21
def train_spacy(data, iterations):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("id")  # create blank Language class
        print("Created blank 'indo' model")

    TRAIN_DATA = data

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    if model is None:
        optimizer = nlp.begin_training()
        # For training with customized cfg
        # nlp.entity.cfg['conv_depth'] = 16
        # nlp.entity.cfg['token_vector_width'] = 256
        # nlp.entity.cfg['bilstm_depth'] = 1
        # nlp.entity.cfg['beam_width'] = 2
    else:
        print("resuming")
        optimizer = nlp.resume_training()
        print(optimizer.learn_rate)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    dropout = decaying(0.8, 0.2, 1e-6)  # start, stop, decay rate
    # sizes = compounding(1.0, 4.0, 1.001)
    sizes = compounding(4., 32., 1.001)

    with nlp.disable_pipes(*other_pipes):  # only train NER

        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        optimizer.learn_rate = 0.001
        for itn in range(iterations):

            file = open(outlog_txt, 'a')  # For logging losses of iterations

            start = time.time()  # Iteration Time

            if itn % 100 == 0 and itn != 0:
                # print("Itn  : " + str(itn), time.time() - start_training_time)
                # print('Testing')

                # results = evaluate(nlp, TEST_DATA)
                # file1 = open(outlog_file, 'a')
                # file1.write(str(itn) + ',' + str(results['ents_p']) + ',' + str(results['ents_r']) + ',' + str(
                #     results['ents_f']) + ',' + str(results["ents_per_type"]) + "\n")
                # file1.close()

                # results = evaluate(nlp, TRAIN_DATA)
                # file2 = open(train_file, 'a')
                # file2.write(str(itn) + ',' + str(results['ents_p']) + ',' + str(results['ents_r']) + ',' + str(
                #     results['ents_f']) + ',' + str(results["ents_per_type"]) + "\n")
                # file2.close()

                # TODO: checkpoint the model
                modelfile = output_dir + "training_model" + str(itn)
                nlp.to_disk(modelfile)

            # Reducing Learning rate after certain operations
            if itn == 100:
                optimizer.learn_rate = 0.0005
            if itn == 150:
                optimizer.learn_rate = 0.0001

            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}

            # use either batches or entire set at once

            ##### For training in Batches
            batches = minibatch(TRAIN_DATA[:int(len(TRAIN_DATA) * 1)],
                                size=sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                # nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout), losses=losses)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.3,
                           losses=losses)

            ###########################################

            ##### For training as a single pass over the data
            # for text, annotations in TRAIN_DATA:
            #     nlp.update(
            #         [text],  # batch of texts
            #         [annotations],  # batch of annotations
            #         drop=0.2,  # dropout - make it harder to memorise data
            #         # drop=next(dropout),  # in case you are using the decaying dropout
            #         sgd=optimizer,  # callable to update weights
            #         losses=losses)

            print("Losses", losses)
            file.write(str(itn) + "," + str(losses['ner']) + "\n")
            print("time for iteration:", time.time() - start)
            file.close()

    return nlp
Example #22
def train_textcat_model(
        load_data_func: Callable[
            [], Tuple[List[Tuple[Any, Dict[str, Dict[str, bool]]]], List[Tuple[Any, Dict[str, Dict[str, bool]]]]]
        ],
        n_iter: int = 20, max_texts: int = 2000, model: Optional[str] = None,
        output_dir: str = '/tmp/model', labels: Optional[Iterable[str]] = None,
        test_text: Optional[str] = None
) -> None:
    # Load data and verify there is some
    train_data, eval_data = load_data_func()
    if not train_data:
        raise Exception('There is no data provided to train')
    if not eval_data:
        raise Exception('There is no data provided to evaluate the trained model')

    if not labels:
        raise Exception('No labels were provided to train')
    if not output_dir:
        raise Exception('Output dir must be specified')

    if model:
        nlp = spacy.load(model)
        print(f'Loaded model "{model}"')
    else:
        nlp = spacy.blank("en")
        print('Created blank "en" model')

    # Add the text classifier to the pipeline if it doesn't exist
    if 'textcat' not in nlp.pipe_names:
        # nlp.create_pipe works for built-ins that are registered with spaCy
        textcat = nlp.create_pipe(
            'textcat', config={'exclusive_classes': True, 'architecture': 'simple_cnn'}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        # Otherwise, get it, so we can add labels to it
        textcat = nlp.get_pipe('textcat')

    # Add labels to text classifier
    for label in labels:
        textcat.add_label(label)

    print(
        "Using max {} examples ({} training, {} evaluation)".format(
            max_texts, len(train_data), len(eval_data)
        )
    )

    # We mainly have small data sets, so it's recommended to use a high dropout rate at first
    # From https://spacy.io/usage/training#tips-dropout
    dropout = decaying(0.6, 0.2, 1e-4)

    # Get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        print('Training the model...')
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=next(dropout), losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = _evaluate(nlp.tokenizer, textcat, eval_data)
            # Print a simple table
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # Create the output dir (if it doesn't exist)
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()

    # Use the averages when writing out the model
    # From https://spacy.io/usage/training#tips-param-avg
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
    print(f'Saved model to {output_dir}')

    # test the saved model
    if test_text:
        print(f'Loading saved model from {output_dir}')
        test_textcat_model(output_dir, test_text)
Example #23
def custom_train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    orth_variant_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    textcat_multilabel=False,
    textcat_arch="bow",
    textcat_positive_label=None,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """

    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [
            p for p in output_path.iterdir() if p.is_dir()
    ]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                if pipe == "parser":
                    pipe_cfg = {"learn_tokens": learn_tokens}
                elif pipe == "textcat":
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                else:
                    pipe_cfg = {}
                nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
            else:
                if pipe == "textcat":
                    textcat_cfg = nlp.get_pipe("textcat").cfg
                    base_cfg = {
                        "exclusive_classes": textcat_cfg["exclusive_classes"],
                        "architecture": textcat_cfg["architecture"],
                        "positive_label": textcat_cfg["positive_label"],
                    }
                    pipe_cfg = {
                        "exclusive_classes": not textcat_multilabel,
                        "architecture": textcat_arch,
                        "positive_label": textcat_positive_label,
                    }
                    if base_cfg != pipe_cfg:
                        msg.fail(
                            "The base textcat model configuration does"
                            "not match the provided training options. "
                            "Existing cfg: {}, provided cfg: {}".format(
                                base_cfg, pipe_cfg),
                            exits=1,
                        )
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        ### Here are our modifications:
        lang_cls.Defaults.tag_map = custom_tag_map
        nlp = lang_cls()
        assert nlp.vocab.morphology.n_tags == 36
        ###
        for pipe in pipeline:
            if pipe == "parser":
                pipe_cfg = {"learn_tokens": learn_tokens}
            elif pipe == "textcat":
                pipe_cfg = {
                    "exclusive_classes": not textcat_multilabel,
                    "architecture": textcat_arch,
                    "positive_label": textcat_positive_label,
                }
            else:
                pipe_cfg = {}
            nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks),
                         ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail("Can't use multitask objective without '{}' in the "
                         "pipeline".format(pipe_name))
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples,
                                       device=use_gpu)

    nlp._optimizer = None

    # Load in pretrained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # Verify textcat config
    if "textcat" in pipeline:
        textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
        if textcat_positive_label and textcat_positive_label not in textcat_labels:
            msg.fail(
                "The textcat_positive_label (tpl) '{}' does not match any "
                "label in the training data.".format(textcat_positive_label),
                exits=1,
            )
        if textcat_positive_label and len(textcat_labels) != 2:
            msg.fail(
                "A textcat_positive_label (tpl) '{}' was provided for training "
                "data that does not appear to be a binary classification "
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
        train_docs = corpus.train_docs(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
            max_length=0,
            ignore_misaligned=True,
        )
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn("The textcat training instances look like they have "
                         "mutually-exclusive classes. Remove the flag "
                         "'--textcat-multilabel' to train a classifier with "
                         "mutually-exclusive classes.")
        if not textcat_multilabel:
            for text, gold in train_docs:
                train_labels.update(gold.cats.keys())
                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
                        "include the flag '--textcat-multilabel' for classes "
                        "that are not mutually exclusive.")
                    nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
                    textcat_multilabel = True
                    break
        if base_model and set(textcat_labels) != train_labels:
            msg.fail(
                "Cannot extend textcat model using data with different "
                "labels. Base model labels: {}, training data labels: "
                "{}.".format(textcat_labels, list(train_labels)),
                exits=1,
            )
        if textcat_multilabel:
            msg.text(
                "Textcat evaluation score: ROC AUC score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        elif textcat_positive_label and len(textcat_labels) == 2:
            msg.text("Textcat evaluation score: F1-score for the "
                     "label '{}'".format(textcat_positive_label))
        elif len(textcat_labels) > 1:
            if len(textcat_labels) == 2:
                msg.warn(
                    "If the textcat component is a binary classifier with "
                    "exclusive classes, provide '--textcat_positive_label' for "
                    "an evaluation on the positive class.")
            msg.text(
                "Textcat evaluation score: F1-score macro-averaged across "
                "the labels '{}'".format(", ".join(textcat_labels)))
        else:
            msg.fail(
                "Unsupported textcat configuration. Use `spacy debug-data` "
                "for more information.")

    # fmt: off
    row_head, output_stats = _configure_training_output(
        pipeline, use_gpu, has_beam_widths)
    row_widths = [len(w) for w in row_head]
    row_settings = {
        "widths": row_widths,
        "aligns": tuple(["r" for i in row_head]),
        "spacing": 2
    }
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8)
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs,
                                                     size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(
                            nlp_loaded,
                            gold_preproc=gold_preproc,
                            ignore_misaligned=True,
                        ))
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(
                                epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(
                                    nlp_loaded,
                                    gold_preproc=gold_preproc,
                                    ignore_misaligned=True,
                                ))
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs,
                                                         verbose=verbose)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta["labels"] = nlp.meta["labels"]
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        output_stats,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    if i == 0 and "textcat" in pipeline:
                        textcats_per_cat = scorer.scores.get(
                            "textcats_per_cat", {})
                        for cat, cat_score in textcats_per_cat.items():
                            if cat_score.get("roc_auc_score", 0) < 0:
                                msg.warn(
                                    "Textcat ROC AUC score is undefined due to "
                                    "only one value in label '{}'.".format(
                                        cat))
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text("Early stopping, best iteration "
                                 "is: {}".format(i - iter_since_best))
                        msg.text("Best score = {}; Final iteration "
                                 "score = {}".format(best_score,
                                                     current_score))
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path,
                                                  nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #24
0
    def update(self):
        '''
        Use spaCy v2.1 to update the dependency parser.
        '''
        dropout = decaying(0.5, 0.2, 1e-4)

        self.require_gpu(self.gpu)

        # getting data in spacy required format
        data = self.get_data(self.train_path)

        random.seed(777)
        random.shuffle(data)

        if self.model is not None:
            nlp = spacy.load(self.model)  # load existing spaCy model
            print("Loaded model '%s'" % self.model)
        else:
            nlp = spacy.blank(self.lang)  # create blank Language class
            print("Created blank '%s' model" % self.lang)

        # add the parser to the pipeline if it doesn't exist
        # nlp.create_pipe works for built-ins that are registered with spaCy
        if "parser" not in nlp.pipe_names:
            parser = nlp.create_pipe("parser")
            nlp.add_pipe(parser, first=True)
        # otherwise, get it, so we can add labels to it
        else:
            parser = nlp.get_pipe("parser")

        # disable sentence segmentation
        #nlp.add_pipe(self.prevent_sentence_boundary_detection, name='prevent-sbd', before='parser')

        # change the tokens to spacy Doc
        new_data = list()
        for dat in data:
            assert (len(Doc(nlp.vocab, words=dat[0])) == len(dat[1]['deps']))
            assert (len(Doc(nlp.vocab, words=dat[0])) == len(dat[1]['heads']))
            doc = Doc(nlp.vocab, words=dat[0])
            new_data.append(
                (doc, GoldParse(doc,
                                heads=dat[1]['heads'],
                                deps=dat[1]['deps'])))

        # add labels to the parser
        for _, annotations in data:
            for dep in annotations.get("deps", []):
                parser.add_label(dep)

        pretrain_weights_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),\
          'pretrained_weights','pretrained_weights.bin')
        if os.path.exists(pretrain_weights_path):
            # loading pretrained weights
            with open(pretrain_weights_path, "rb") as file_:
                nlp.from_bytes(file_.read())
                print('LOADED from PRETRAIN %s' % pretrain_weights_path)

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
        with nlp.disable_pipes(*other_pipes):  # only train parser
            optimizer = nlp.begin_training()
            for itn in range(self.n_iter):
                random.shuffle(new_data)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(new_data,
                                    size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    print(type(texts[0]))
                    print(annotations[0])
                    nlp.update(texts,
                               annotations,
                               sgd=optimizer,
                               losses=losses,
                               drop=next(dropout))
                print("Losses", losses)

        # save model to output directory
        if self.output_dir is not None:
            self.output_dir = Path(self.output_dir)
            if not self.output_dir.exists():
                self.output_dir.mkdir()
            nlp.meta['name'] = "Custom-launguage-model"  # rename model
            with nlp.use_params(optimizer.averages):
                nlp.to_disk(self.output_dir)
                print("Saved model to", self.output_dir)
Example #25
0
    def train_intent(self, nlp, output_dir, train_data, n_iter, dropout):
        """Load the model, set up the pipeline and train the entity recognizer.
        

        Keyword arguments:
        model -- path to the model if existent
        output_dir -- path where model is saved at
        n_iter -- amount of times data is trained with
        train_data -- training data in BILOU Format

        Returns:
        output_dir -- path to model
        """
        dropout = decaying(0.6, 0.2, 1e-4)
        pipe_exceptions = ["trf_textcat", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
        ]
        disabled = nlp.disable_pipes(*other_pipes)
        logging.info("Started training intents...")
        optimizer = nlp.resume_training()
        optimizer.alpha = 0.001
        optimizer.trf_weight_decay = 0.005
        optimizer.L2 = 0.0

        learn_rate = 2e-5
        batch_size = 8
        learn_rates = cyclic_triangular_rate(learn_rate / 3, learn_rate * 3,
                                             2 * len(train_data) // batch_size)
        for iteration in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                optimizer.trf_lr = next(learn_rates)
                texts, _, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    sgd=optimizer,
                    drop=next(
                        dropout),  # dropout - make it harder to memorise data
                    losses=losses)
            self.losses_cat.append(losses)
            p, r, f = self.evaluate_intent(nlp)
            self.cat_scores.append([p, r, f])
            logging.info(
                "Finished %s iteration for text classification with %s losses",
                iteration, losses)
            #if cat_score <= self.cat_scores[-2]:
            #break
        logging.info("Finished training intents...")
        disabled.restore()

        # save model to output directory
        if output_dir is not None:
            output_dir = Path(output_dir)
            if not output_dir.exists():
                output_dir.mkdir()
            nlp.to_disk(output_dir)
            logging.info("Saved model to %s", output_dir)

        return output_dir
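The cyclic_triangular_rate schedule consumed in train_intent above is imported elsewhere in this codebase. An illustrative stand-in with the same triangular shape (an assumption for demonstration, not the project's implementation):

import numpy as np

def cyclic_triangular_rate(min_lr, max_lr, period):
    """Yield learning rates that ramp linearly from min_lr to max_lr and back
    over 2 * period steps (illustrative stand-in, not the original helper)."""
    it = 1
    while True:
        cycle = np.floor(1 + it / (2 * period))
        x = np.abs(it / period - 2 * cycle + 1)
        yield min_lr + (max_lr - min_lr) * max(0.0, 1 - x)
        it += 1

learn_rate = 2e-5
rates = cyclic_triangular_rate(learn_rate / 3, learn_rate * 3, period=8)
print([round(next(rates), 7) for _ in range(8)])   # rises towards 6e-05 at step 8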
def train_parser_and_tagger(train_json_path: str,
                            dev_json_path: str,
                            test_json_path: str,
                            model_output_dir: str,
                            model_path: str = None,
                            ontonotes_path: str = None,
                            ontonotes_train_percent: float = 0.0):
    """Function to train the spacy parser and tagger from a blank model, with the default, en_core_web_sm vocab.
       Training setup is mostly copied from the spacy cli train command.

       @param train_json_path: path to the conll formatted training data
       @param dev_json_path: path to the conll formatted dev data
       @param test_json_path: path to the conll formatted test data
       @param model_output_dir: path to the output directory for the trained models
       @param model_path: path to the model to load
       @param ontonotes_path: path to the directory containnig ontonotes in spacy format (optional)
       @param ontonotes_train_percent: percentage of the ontonotes training data to use (optional)
    """
    msg = Printer()

    train_json_path = cached_path(train_json_path)
    dev_json_path = cached_path(dev_json_path)
    test_json_path = cached_path(test_json_path)

    if model_path is not None:
        nlp = spacy.load(model_path)
    else:
        lang_class = util.get_lang_class('en')
        nlp = lang_class()

    if 'tagger' not in nlp.pipe_names:
        tagger = nlp.create_pipe('tagger')
        nlp.add_pipe(tagger, first=True)
    else:
        tagger = nlp.get_pipe('tagger')

    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser)
    else:
        parser = nlp.get_pipe('parser')

    train_corpus = GoldCorpus(train_json_path, dev_json_path)
    test_corpus = GoldCorpus(train_json_path, test_json_path)

    if ontonotes_path:
        onto_train_path = os.path.join(ontonotes_path, "train")
        onto_dev_path = os.path.join(ontonotes_path, "dev")
        onto_test_path = os.path.join(ontonotes_path, "test")
        onto_train_corpus = GoldCorpus(onto_train_path, onto_dev_path)
        onto_test_corpus = GoldCorpus(onto_train_path, onto_test_path)

    dropout_rates = util.decaying(0.2, 0.2, 0.0)
    batch_sizes = util.compounding(1., 16., 1.001)

    if model_path is not None:
        meta = nlp.meta
    else:
        meta = {}
        meta["lang"] = "en"
        meta["pipeline"] = ["tagger", "parser"]
        meta["name"] = "scispacy_core_web_sm"
        meta["license"] = "CC BY-SA 3.0"
        meta["author"] = "Allen Institute for Artificial Intelligence"
        meta["url"] = "allenai.org"
        meta["sources"] = ["OntoNotes 5", "Common Crawl", "GENIA 1.0"]
        meta["version"] = "1.0.0"
        meta["spacy_version"] = ">=2.2.1"
        meta["parent_package"] = "spacy"
        meta["email"] = "*****@*****.**"

    n_train_words = train_corpus.count_train()

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in ['tagger', 'parser']]
    with nlp.disable_pipes(*other_pipes):
        if ontonotes_path:
            optimizer = nlp.begin_training(lambda: itertools.chain(train_corpus.train_tuples, onto_train_corpus.train_tuples))
        else:
            optimizer = nlp.begin_training(lambda: train_corpus.train_tuples)
        nlp._optimizer = None

    train_docs = train_corpus.train_docs(nlp)
    train_docs = list(train_docs)

    train_mixture = train_docs
    if ontonotes_path:
        onto_train_docs = onto_train_corpus.train_docs(nlp)
        onto_train_docs = list(onto_train_docs)
        num_onto_docs = int(float(ontonotes_train_percent)*len(onto_train_docs))
        randomly_sampled_onto = random.sample(onto_train_docs, num_onto_docs)
        train_mixture += randomly_sampled_onto

    row_head, output_stats = _configure_training_output(nlp.pipe_names, -1, False)
    row_widths = [len(w) for w in row_head]
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}

    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)

    best_epoch = 0
    best_epoch_uas = 0.0
    for i in range(20):
        random.shuffle(train_mixture)
        with nlp.disable_pipes(*other_pipes):
            with tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                minibatches = list(util.minibatch(train_docs, size=batch_sizes))
                for batch in minibatches:
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               drop=next(dropout_rates), losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

        # save intermediate model and output results on the dev set
        with nlp.use_params(optimizer.averages):
            epoch_model_path = os.path.join(model_output_dir, "model"+str(i))
            os.makedirs(epoch_model_path, exist_ok=True)
            nlp.to_disk(epoch_model_path)

            with open(os.path.join(model_output_dir, "model"+str(i), "meta.json"), "w") as meta_fp:
                meta_fp.write(json.dumps(meta))

            nlp_loaded = util.load_model_from_path(epoch_model_path)
            dev_docs = train_corpus.dev_docs(nlp_loaded)
            dev_docs = list(dev_docs)
            nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
            start_time = timer()
            scorer = nlp_loaded.evaluate(dev_docs)
            end_time = timer()
            gpu_wps = None
            cpu_wps = nwords/(end_time-start_time)

            if ontonotes_path:
                onto_dev_docs = list(onto_train_corpus.dev_docs(nlp_loaded))
                onto_scorer = nlp_loaded.evaluate(onto_dev_docs)


        if scorer.scores["uas"] > best_epoch_uas:
            best_epoch_uas = scorer.scores["uas"]
            best_epoch = i
        progress = _get_progress(
            i, losses, scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
        )
        msg.row(progress, **row_settings)

        if ontonotes_path:
            progress = _get_progress(
                i, losses, onto_scorer.scores, output_stats, cpu_wps=cpu_wps, gpu_wps=gpu_wps
            )
            msg.row(progress, **row_settings)

    # save final model and output results on the test set
    final_model_path = os.path.join(model_output_dir, "best")
    if os.path.exists(final_model_path):
        shutil.rmtree(final_model_path)
    shutil.copytree(os.path.join(model_output_dir, "model" + str(best_epoch)),
                    final_model_path)

    nlp_loaded = util.load_model_from_path(final_model_path)
    start_time = timer()
    test_docs = test_corpus.dev_docs(nlp_loaded)
    test_docs = list(test_docs)
    nwords = sum(len(doc_gold[0]) for doc_gold in test_docs)
    scorer = nlp_loaded.evaluate(test_docs)
    end_time = timer()
    gpu_wps = None
    cpu_wps = nwords/(end_time-start_time)
    meta["speed"] = {"gpu": None, "nwords": nwords, "cpu": cpu_wps}

    print("Retrained genia evaluation")
    print("Test results:")
    print("UAS:", scorer.uas)
    print("LAS:", scorer.las)
    print("Tag %:", scorer.tags_acc)
    print("Token acc:", scorer.token_acc)
    with open(os.path.join(model_output_dir, "genia_test.json"), "w+") as metric_file:
        json.dump(scorer.scores, metric_file)
    with open(os.path.join(model_output_dir, "best", "meta.json"), "w") as meta_fp:
        meta_fp.write(json.dumps(meta))

    if ontonotes_path:
        onto_test_docs = list(onto_test_corpus.dev_docs(nlp_loaded))
        print("Retrained ontonotes evaluation")
        scorer_onto_retrained = nlp_loaded.evaluate(onto_test_docs)
        print("Test results:")
        print("UAS:", scorer_onto_retrained.uas)
        print("LAS:", scorer_onto_retrained.las)
        print("Tag %:", scorer_onto_retrained.tags_acc)
        print("Token acc:", scorer_onto_retrained.token_acc)

        with open(os.path.join(model_output_dir, "ontonotes_test.json"), "w+") as metric_file:
            json.dump(scorer_onto_retrained.scores, metric_file)
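The per-epoch and final evaluations above all follow the same pattern: reload the saved model, build (doc, gold) pairs from a GoldCorpus, and run nlp.evaluate(). A condensed sketch, assuming spaCy v2.x; the paths are placeholders:

from spacy import util
from spacy.gold import GoldCorpus

nlp_loaded = util.load_model_from_path("model_output/best")   # placeholder path
corpus = GoldCorpus("train.json", "test.json")                # placeholder paths
test_docs = list(corpus.dev_docs(nlp_loaded, gold_preproc=False))
scorer = nlp_loaded.evaluate(test_docs)
print("UAS:", scorer.uas, "LAS:", scorer.las)
print("Tag %:", scorer.tags_acc, "Token acc:", scorer.token_acc)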
Example #27
0
def train_NER(model=None,
              new_model_name="skill",
              output_dir='./models/train_textrank_labels',
              n_iter=5):
    # model = None for starting with an empty model
    # model = 'en_core_web_sm' for starting with a pretrained model
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label('SKILL')  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        dropout = decaying(0.2, 0, 0.02)
        loss_dict = {}
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.05,
                           losses=losses)  # or drop=next(dropout)
            loss_dict[itn] = losses['ner']
            print("Losses", losses)
        lists = sorted(loss_dict.items())
        x, y = zip(*lists)
        plt.plot(x, y)
        plt.show()

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
Example #28
0
def training(train_texts,
             train_cats,
             dev_texts,
             dev_cats,
             test_texts,
             test_cats,
             L2,
             learn_rate,
             n_iter,
             output_dir=None):
    """
       Spacy example function modified
       Trains citation needed classifier and saves model
       Parameters:
           train_texts :str -list - text train features
           train_cats :str - list - label citation sentence - TRUE else FALSE
           dev_texts :str - list - text train features
           dev_cats :str - list - label citation sentence - TRUE else FALSE
           test_texts :str - list - text train features
           test_cats :str - list - label citation sentence - TRUE else FALSE
           L2 : int - regularization parameter - default value 1e-6
           learn_rate : learning rate - default rate - 0.001,
           output_dir :str = None - path to save the model
       returns:
           returns list of evaluated metrics (accuracy, f1, precision and recall)
           train_results : list - evaluated metrics for training dataset
           val_results : list - evaluated metrics for validation dataset
       """

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    # Load the base model; other pipeline components are disabled further below
    nlp = spacy.load('en_core_web_sm')
    # Adding the built-in textcat component to the pipeline.
    textcat = nlp.create_pipe("textcat",
                              config={
                                  "exclusive_classes": True,
                                  "architecture": "simple_cnn"
                              })
    nlp.add_pipe(textcat, last=True)
    # Adding the labels to textcat
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        optimizer.L2 = L2
        optimizer.learn_rate = learn_rate
        #dec = decaying(0.6, 0.2, 1e-4)
        dec = decaying(10.0, 1.0, 0.001)
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format(
            'LOSS', 'A_train', 'A_dev', 'A_test', 'P', 'R', 'F'))
        train_results = []
        dev_results = []
        test_results = []
        # Performing training
        for i in range(n_iter):
            losses = {}
            train_data = list(
                zip(train_texts, [{
                    'cats': cats
                } for cats in train_cats]))
            random.shuffle(train_data)
            # (train_texts, train_cats) = zip(*train_data)
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=next(dec),
                           losses=losses)
            # Calling the evaluate() function and printing the train scores
            scores1 = evaluate(nlp.tokenizer, textcat, train_texts, train_cats)
            train_results.append(scores1)
            # Calling the evaluate() function and printing the test scores
            with textcat.model.use_params(optimizer.averages):

                scores2 = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
                scores3 = evaluate(nlp.tokenizer, textcat, test_texts,
                                   test_cats)
            dev_results.append(scores2)
            test_results.append(scores3)
            print(
                '{0:.4f}\t{1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}\t{5:.4f}\t{6:.4f}'
                .format(losses['textcat'], scores1['textcat_a'],
                        scores2['textcat_a'], scores3['textcat_a'],
                        scores1['textcat_p'], scores1['textcat_r'],
                        scores1['textcat_f']))
    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return train_results, dev_results, test_results
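For clarity, the train_cats/dev_cats/test_cats passed into training() above are assumed to be per-label score dictionaries, which the loop wraps into spaCy's {'cats': ...} annotation format. A small made-up sample:

train_texts = ["The study was published in 2019.", "Water is wet."]
train_cats = [
    {"POSITIVE": 1.0, "NEGATIVE": 0.0},   # sentence needs a citation
    {"POSITIVE": 0.0, "NEGATIVE": 1.0},   # sentence does not need a citation
]
train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))
print(train_data[0])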
Example #29
0
def train(pretrained,
          output_dir,
          train_data,
          dev_data,
          n_iter=30,
          n_sents=0,
          parser_multitasks='',
          entity_multitasks='',
          use_gpu=-1,
          no_tagger=False,
          no_parser=False,
          no_entities=False,
          gold_preproc=False,
          version="0.0.0",
          meta_path=None,
          verbose=False):
    """
    Re-train a pre-trained model. Expects data in spaCy's JSON
    format. This code is based on
    https://github.com/explosion/spaCy/blob/master/spacy/cli/train.py.
    """
    # There is a bug that prevents me from using the GPU when resuming
    # training from a saved model. See
    # https://github.com/explosion/spaCy/issues/1806.
    if use_gpu >= 0:
        msg = "\nWARNING: using GPU may require re-installing thinc. "
        msg += "See https://github.com/explosion/spaCy/issues/1806.\n"
        print(msg)

    util.fix_random_seed()
    util.set_env_log(True)
    n_sents = n_sents or None
    output_path = util.ensure_path(output_dir)
    train_path = util.ensure_path(train_data)
    dev_path = util.ensure_path(dev_data)
    meta_path = util.ensure_path(meta_path)
    if not output_path.exists():
        output_path.mkdir()
    if not train_path.exists():
        prints(train_path, title=Messages.M050, exits=1)
    if dev_path and not dev_path.exists():
        prints(dev_path, title=Messages.M051, exits=1)
    if meta_path is not None and not meta_path.exists():
        prints(meta_path, title=Messages.M020, exits=1)
    meta = util.read_json(meta_path) if meta_path else {}
    if not isinstance(meta, dict):
        prints(Messages.M053.format(meta_type=type(meta)),
               title=Messages.M052,
               exits=1)

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    max_doc_len = util.env_opt('max_doc_len', 5000)
    corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
    n_train_words = corpus.count_train()

    # Load pre-trained model. Remove components that we are not
    # re-training.
    nlp = load(pretrained)
    if no_tagger and 'tagger' in nlp.pipe_names:
        nlp.remove_pipe('tagger')
    if no_parser and 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    if no_entities and 'ner' in nlp.pipe_names:
        nlp.remove_pipe('ner')
    meta.setdefault('name', 'unnamed')
    meta['pipeline'] = nlp.pipe_names
    meta.setdefault('lang', nlp.lang)
    nlp.meta.update(meta)

    # Add multi-task objectives
    if parser_multitasks:
        for objective in parser_multitasks.split(','):
            nlp.parser.add_multitask_objective(objective)
    if entity_multitasks:
        for objective in entity_multitasks.split(','):
            nlp.entity.add_multitask_objective(objective)

    # Get optimizer
    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
    nlp._optimizer = None

    print(nlp.pipe_names)
    print(nlp.pipeline)

    print(
        "Itn.  Dep Loss  NER Loss  UAS     NER P.  NER R.  NER F.  Tag %   Token %  CPU WPS  GPU WPS"
    )
    try:
        train_docs = corpus.train_docs(nlp,
                                       projectivize=True,
                                       noise_level=0.0,
                                       gold_preproc=gold_preproc,
                                       max_length=0)
        train_docs = list(train_docs)
        for i in range(n_iter):
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in minibatch(train_docs, size=batch_sizes):
                    batch = [(d, g) for (d, g) in batch
                             if len(d) < max_doc_len]
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(docs,
                               golds,
                               sgd=optimizer,
                               drop=next(dropout_rates),
                               losses=losses)
                    pbar.update(sum(len(doc) for doc in docs))

            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ('model%d' % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                dev_docs = list(
                    corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc))
                nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                start_time = timer()
                scorer = nlp_loaded.evaluate(dev_docs, verbose)
                end_time = timer()
                if use_gpu < 0:
                    gpu_wps = None
                    cpu_wps = nwords / (end_time - start_time)
                else:
                    gpu_wps = nwords / (end_time - start_time)
                    with Model.use_device('cpu'):
                        nlp_loaded = util.load_model_from_path(
                            epoch_model_path)
                        dev_docs = list(
                            corpus.dev_docs(nlp_loaded,
                                            gold_preproc=gold_preproc))
                        start_time = timer()
                        scorer = nlp_loaded.evaluate(dev_docs)
                        end_time = timer()
                        cpu_wps = nwords / (end_time - start_time)
                acc_loc = (output_path / ('model%d' % i) / 'accuracy.json')
                with acc_loc.open('w') as file_:
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
                meta['speed'] = {
                    'nwords': nwords,
                    'cpu': cpu_wps,
                    'gpu': gpu_wps
                }
                meta['vectors'] = {
                    'width': nlp.vocab.vectors_length,
                    'vectors': len(nlp.vocab.vectors),
                    'keys': nlp.vocab.vectors.n_keys
                }
                meta['lang'] = nlp.lang
                meta['pipeline'] = nlp.pipe_names
                meta['spacy_version'] = '>=%s' % about.__version__
                meta.setdefault('name', 'model%d' % i)
                meta.setdefault('version', version)

                with meta_loc.open('w') as file_:
                    file_.write(json_dumps(meta))
                util.set_env_log(True)
            print_progress(i,
                           losses,
                           scorer.scores,
                           cpu_wps=cpu_wps,
                           gpu_wps=gpu_wps)
    finally:
        print("Saving model...")
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / 'model-final'
            nlp.to_disk(final_model_path)
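Each epoch above writes a model%d directory containing an accuracy.json. A quick way to compare epochs afterwards, using only the standard library; the output path is a placeholder:

import json
from pathlib import Path

output_path = Path("output")   # placeholder for the training output directory
for acc_file in sorted(output_path.glob("model*/accuracy.json")):
    scores = json.loads(acc_file.read_text())
    print(acc_file.parent.name, scores.get("uas"), scores.get("ents_f"))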
Example #30
0
def retrain_model(project, model=None, n_iter=30):
    """Load the model, set up the pipeline and train the entity recognizer."""
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.0))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 16),
                                   util.env_opt('batch_compound', 1.001))
    if model == 'model_1':
        output_model = 'model_2'
    else:
        output_model = 'model_1'
    al_project = ContentType.objects.get(app_label="spacyal",
                                         model="al_project").model_class()
    project = al_project.objects.get(pk=project)
    base_d = '/'.join(project.texts.path.split('/')[:-1])
    output_dir = os.path.join(base_d, output_model)
    if project.len_training_data() < project.num_retrain:
        message = {'folder': os.path.join(base_d, model),
                   'retrained': False, 'project': project.pk}
        return message
    TRAIN_DATA, eval_data, hist_object = project.get_training_data(
        include_all=True, include_negative=True)
    nlp = spacy.load(os.path.join(base_d, model))  # load existing spaCy model
    if project.project_history_set.all().count() == 1:
        project_history = ContentType.objects.get(
            app_label="spacyal", model="project_history").model_class()
        ev = test_model(eval_data, nlp)
        f1 = ev.compute_f1()
        hist2 = project_history.objects.create(
            project=project, eval_f1=f1['fbeta'],
            eval_precission=f1['precission'], eval_recall=f1['recall'])
        hist2.cases_training.add(*list(hist_object.cases_training.all()))
        hist2.cases_evaluation.add(*list(hist_object.cases_evaluation.all()))
    TRAIN_DATA = mix_train_data(nlp, TRAIN_DATA)
    with open(os.path.join(base_d, 'training_data.json'), 'w') as outp:
        json.dump(TRAIN_DATA, outp)
    r = nlp.get_pipe('ner')
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    count_prog_list = list(range(0, n_iter, max(1, n_iter // 10)))  # avoid a zero step when n_iter < 10
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            if itn in count_prog_list:
                current_task.update_state(
                    state='PROGRESS',
                    meta={'progress': count_prog_list.index(itn)*10,
                          'model': output_model, 'project': project.pk})
            random.shuffle(TRAIN_DATA)
            losses = {}
            for batch in util.minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=next(dropout_rates),  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
    if not Path(output_dir).exists():
        Path(output_dir).mkdir()
    nlp.to_disk(output_dir)
    print(eval_data)
    ev = test_model(eval_data, nlp)
    f1 = ev.compute_f1()
    hist_object.eval_f1 = f1['fbeta']
    hist_object.eval_precission = f1['precission']
    hist_object.eval_recall = f1['recall']
    hist_object.model_path = output_dir
    hist_object.save()
    message = {'folder': output_dir, 'retrained': True, 'project': project.pk, 'f1': f1}
    return message
Example #31
0
def main(model=None,
         new_model_name='DCC_ent',
         input_dir=input_dir,
         saved_model_dir=model_dir,
         output_dir=output_dir,
         test_dir=test_dir,
         n_iter=n_iter):
    random.seed(1234)

    # create the training from annotated data produced by using Brat
    data_reading_start_time = time.time()
    training_data = create_training_data(input_dir)
    data_reading_end_time = time.time()
    data_reading_time = data_reading_end_time - data_reading_start_time
    print("--->data reading time: ", data_reading_time)

    # check if the user provides an existing language model
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded existing model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("No model provided, created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        # otherwise, get it, so we can add labels to it
        ner = nlp.get_pipe('ner')

    # add all new entities to the recognizer
    for i in range(len(new_entities_list)):
        ner.add_label(new_entities_list[i])

    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # start the training of the recognizer (and the time)
    training_start_time = time.time()
    for itn in range(n_iter):
        iter_start_time = time.time()
        dropout = decaying(0.4, 0.2, 1.0e-2)
        random.shuffle(training_data)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(training_data, size=compounding(4., 32., 1.001))
        for ib, batch in enumerate(batches):
            # print("     batch ", ib)
            ignore_batch = False
            for bl in range(len(batch)):
                # print(batch[bl])
                # print(len(batch[bl]))
                if len(batch[bl]) < 2:
                    ignore_batch = True
            if ignore_batch:
                continue
            texts, annotations = zip(*batch)
            # print(texts)
            # print(annotations)
            nlp.update(texts,
                       annotations,
                       sgd=optimizer,
                       drop=0.35,
                       losses=losses)
        iter_end_time = time.time()
        iter_elapsed_time = iter_end_time - iter_start_time
        print('     iter:', itn)
        print('     Losses', losses)
        print('     iter elapsed time:', iter_elapsed_time)

    training_end_time = time.time()
    print("training time: ", training_end_time - training_start_time)

    ############################
    # test the ner model on a set of text data taken from papers
    # (if the user does not provide text data, no testing will be performed)
    if test_dir is not None:
        # test_ner_model(nlp, test_dir)
        test_ner_model(nlp,
                       test_dir,
                       output_dir,
                       out_tag='_ents_from_existing_model')

    ##########################
    # model evaluation
    #
    # define a set of examples that will be used as ground truth
    examples = [
        ('Deep learning is applied in many every day application with great success in object recognition.',
         [(0, 13, 'Method'), (77, 95, 'Task')]),
        ('Recurrent neural networks are used for forecasting and natural language processing.',
         [(0, 25, 'Method'), (39, 50, 'Task'), (55, 82, 'Task')]),
        ('Convolutional neural networks are frequently used in object recognition and medical image processing.',
         [(0, 29, 'Method'), (53, 72, 'Task'), (84, 101, 'Task')])
    ]
    res = ner_eval(nlp, examples)
    print("\nModel evaluation results:")
    print(res)

    ############################################
    # save trained model
    # (if the user does not provide a directory, the trained model will not be saved)
    if saved_model_dir is not None:
        saved_model_dir = Path(saved_model_dir)
        if not saved_model_dir.exists():
            saved_model_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(saved_model_dir)
        print("The model was saved to the directory: ", saved_model_dir)