Exemplo n.º 1
0
def eval(dataset_type='conll2003',
         lang='en',
         architecture='BidLSTM_CRF',
         use_ELMo=False,
         use_BERT=False,
         data_path=None):
    """Evaluate a previously saved NER model on the fixed test set of a corpus.

    Supported selections are ``('conll2003', 'en')`` and
    ``('conll2012', 'en')``. Any other dataset/language pair prints a
    message and returns ``None`` without loading anything.

    The model that gets loaded is named
    ``<prefix>[-with_ELMo|-with_BERT]-<architecture>``; when both flags are
    set, ``use_ELMo`` wins.  ``data_path`` is accepted but not used here.
    """
    # One row per supported corpus:
    # (dataset_type, lang, banner, test file, model-name prefix)
    supported = (
        ('conll2003', 'en',
         'Loading CoNLL-2003 NER data...',
         'data/sequenceLabelling/CoNLL-2003/eng.testb',
         'ner-en-conll2003'),
        ('conll2012', 'en',
         'Loading Ontonotes 5.0 CoNLL-2012 NER data...',
         'data/sequenceLabelling/CoNLL-2012-NER/eng.test',
         'ner-en-conll2012'),
    )

    for known_dataset, known_lang, banner, test_file, name_prefix in supported:
        if dataset_type == known_dataset and lang == known_lang:
            break
    else:
        # no row matched: nothing to evaluate
        print("dataset/language combination is not supported for fixed eval:",
              dataset_type, lang)
        return

    print(banner)
    x_test, y_test = load_data_and_labels_conll(test_file)
    stats(x_eval=x_test, y_eval=y_test)

    # the embedding marker sits between the corpus prefix and the architecture
    if use_ELMo:
        embedding_tag = '-with_ELMo'
    elif use_BERT:
        embedding_tag = '-with_BERT'
    else:
        embedding_tag = ''

    model = Sequence(name_prefix + embedding_tag + '-' + architecture)
    model.load()

    # only the evaluation itself is timed, not data or model loading
    started_at = time.time()

    print("\nEvaluation on test set:")
    model.eval(x_test, y_test)
    elapsed = round(time.time() - started_at, 3)

    print("runtime: %s seconds " % (elapsed))
Exemplo n.º 2
0
def train(dataset_type='conll2003',
          lang='en',
          embeddings_name=None,
          architecture='BidLSTM_CRF',
          transformer=None,
          data_path=None,
          use_ELMo=False):
    """Train an NER model on all the data of a corpus, then save it.

    All available splits of the selected corpus are pooled and 10% of the
    pool is set aside at random as the validation set handed to
    ``model.train``; no separate evaluation is run.

    Supported selections:
      * ``dataset_type='conll2003'``, ``lang='en'``: eng.train + eng.testa +
        eng.testb
      * ``dataset_type='conll2012'``, ``lang='en'``: eng.train + eng.dev +
        eng.test
      * ``lang='fr'`` (whatever ``dataset_type`` is): the Le Monde XML corpus

    Any other combination prints a message and returns ``None`` (after
    ``configure`` has been called).  Hyper-parameters are obtained from
    ``configure``; ``data_path`` is accepted but not used here.
    """
    (batch_size, max_sequence_length, patience, recurrent_dropout, early_stop,
     max_epoch, embeddings_name, word_lstm_units, multiprocessing) = configure(
         architecture, dataset_type, lang, embeddings_name, use_ELMo)

    # Pick the corpus: CoNLL-style corpora list their three files, the French
    # corpus is a single XML file and is flagged by conll_files = None.
    if dataset_type == 'conll2003' and lang == 'en':
        banner = 'Loading data...'
        conll_files = ('data/sequenceLabelling/CoNLL-2003/eng.train',
                       'data/sequenceLabelling/CoNLL-2003/eng.testa',
                       'data/sequenceLabelling/CoNLL-2003/eng.testb')
        name_prefix = 'ner-en-conll2003-'
    elif dataset_type == 'conll2012' and lang == 'en':
        banner = 'Loading Ontonotes 5.0 CoNLL-2012 NER data...'
        conll_files = ('data/sequenceLabelling/CoNLL-2012-NER/eng.train',
                       'data/sequenceLabelling/CoNLL-2012-NER/eng.dev',
                       'data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        name_prefix = 'ner-en-conll2012-'
    elif lang == 'fr':
        banner = 'Loading data...'
        conll_files = None
        name_prefix = 'ner-fr-lemonde-'
    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    print(banner)
    if conll_files is None:
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        shuffle_arrays([x_all, y_all])
    else:
        # pool every split (train, dev and test) into a single set
        parts = [load_data_and_labels_conll(path) for path in conll_files]
        x_all = np.concatenate([x for x, _ in parts], axis=0)
        y_all = np.concatenate([y for _, y in parts], axis=0)

    # random 90/10 train/validation split of the pooled data
    x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                          y_all,
                                                          test_size=0.1)
    stats(x_train, y_train, x_valid, y_valid)

    model_name = name_prefix + architecture
    if use_ELMo:
        model_name += '-with_ELMo'

    model = Sequence(model_name,
                     max_epoch=max_epoch,
                     recurrent_dropout=recurrent_dropout,
                     embeddings_name=embeddings_name,
                     architecture=architecture,
                     transformer_name=transformer,
                     word_lstm_units=word_lstm_units,
                     batch_size=batch_size,
                     early_stop=early_stop,
                     patience=patience,
                     max_sequence_length=max_sequence_length,
                     use_ELMo=use_ELMo,
                     multiprocessing=multiprocessing)

    started_at = time.time()
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    elapsed = round(time.time() - started_at, 3)
    print("training runtime: %s seconds " % (elapsed))

    # saving the model
    model.save()
Exemplo n.º 3
0
def train_eval(embeddings_name=None,
               dataset_type='conll2003',
               lang='en',
               architecture='BidLSTM_CRF',
               transformer=None,
               fold_count=1,
               train_with_validation_set=False,
               data_path=None,
               use_ELMo=False):
    """Train an NER model, evaluate it on a held-out set, then save it.

    Hyper-parameters are obtained from ``configure``.  Supported selections:

      * ``'conll2003'`` / ``'en'``: fixed eng.train / eng.testa / eng.testb
      * ``'conll2012'`` / ``'en'``: fixed eng.train / eng.dev / eng.test
      * ``'ontonotes-all'`` / ``'en'``: data loaded from ``data_path``, random
        10% held out for evaluation, then 10% of the rest for validation
      * ``'ftb'`` or ``None`` / ``'fr'``: Le Monde XML, same random split
      * ``'ftb_force_split'`` / ``'fr'``: fixed CoNLL-format train/dev/test
      * ``'ftb_force_split_xml'`` / ``'fr'``: fixed XML train/dev/test

    Any other combination prints a message and returns ``None`` (after
    ``configure`` has been called).

    For corpora with a fixed split, ``train_with_validation_set`` decides
    early stopping: when false, ``early_stop=True`` is passed to ``Sequence``
    (train on the training set, validation set used for early stop, as in
    most papers); when true, ``early_stop=False`` is passed.  For randomly
    split corpora the ``early_stop`` value returned by ``configure`` is used
    and ``train_with_validation_set`` is ignored.

    ``fold_count == 1`` calls ``model.train``; any other value calls
    ``model.train_nfold``.
    """
    (batch_size, max_sequence_length, patience, recurrent_dropout, early_stop,
     max_epoch, embeddings_name, word_lstm_units, multiprocessing) = configure(
         architecture, dataset_type, lang, embeddings_name, use_ELMo)

    def load_fixed_split(loader, train_file, valid_file, eval_file,
                         shuffle_train=False):
        # Load predefined train/valid/eval files, in that order; the training
        # part is optionally shuffled right after it has been loaded.
        x_tr, y_tr = loader(train_file)
        if shuffle_train:
            shuffle_arrays([x_tr, y_tr])
        x_va, y_va = loader(valid_file)
        x_ev, y_ev = loader(eval_file)
        return x_tr, y_tr, x_va, y_va, x_ev, y_ev

    def random_split(x_all, y_all):
        # Hold out 10% for evaluation, then 10% of the remainder for
        # validation.
        x_rest, x_ev, y_rest, y_ev = train_test_split(x_all,
                                                      y_all,
                                                      test_size=0.1)
        x_tr, x_va, y_tr, y_va = train_test_split(x_rest,
                                                  y_rest,
                                                  test_size=0.1)
        return x_tr, y_tr, x_va, y_va, x_ev, y_ev

    # True for corpora that ship with predefined train/valid/eval files
    has_fixed_split = True

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL 2003 data...')
        splits = load_fixed_split(
            load_data_and_labels_conll,
            'data/sequenceLabelling/CoNLL-2003/eng.train',
            'data/sequenceLabelling/CoNLL-2003/eng.testa',
            'data/sequenceLabelling/CoNLL-2003/eng.testb')
        name_prefix = 'ner-en-conll2003-'

    elif (dataset_type == 'ontonotes-all') and (lang == 'en'):
        # the original literal was "10\%": an invalid escape sequence that
        # printed a stray backslash and triggers a warning on recent Pythons
        print(
            "Loading all Ontonotes 5.0 XML data, evaluation will be on 10% random partition"
        )
        x_all, y_all = load_data_and_labels_ontonotes(data_path)
        splits = random_split(x_all, y_all)
        name_prefix = 'ner-en-ontonotes-'
        has_fixed_split = False

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        splits = load_fixed_split(
            load_data_and_labels_conll,
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train',
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev',
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        name_prefix = 'ner-en-conll2012-'

    elif (lang == 'fr') and (dataset_type == 'ftb' or dataset_type is None):
        print('Loading data for ftb...')
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        shuffle_arrays([x_all, y_all])
        splits = random_split(x_all, y_all)
        name_prefix = 'ner-fr-lemonde-'
        has_fixed_split = False

    elif (lang == 'fr') and (dataset_type == 'ftb_force_split'):
        print('Loading data for ftb_force_split...')
        splits = load_fixed_split(
            load_data_and_labels_conll,
            'data/sequenceLabelling/leMonde/ftb6_train.conll',
            'data/sequenceLabelling/leMonde/ftb6_dev.conll',
            'data/sequenceLabelling/leMonde/ftb6_test.conll',
            shuffle_train=True)
        name_prefix = 'ner-fr-lemonde-force-split-'

    elif (lang == 'fr') and (dataset_type == 'ftb_force_split_xml'):
        print('Loading data for ftb_force_split_xml...')
        splits = load_fixed_split(
            load_data_and_labels_lemonde,
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.train.xml',
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.dev.xml',
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.test.xml',
            shuffle_train=True)
        name_prefix = 'ner-fr-lemonde-force-split-xml-'

    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    x_train, y_train, x_valid, y_valid, x_eval, y_eval = splits
    stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

    model_name = name_prefix + architecture
    if use_ELMo:
        model_name += '-with_ELMo'

    if has_fixed_split:
        # Default: restrict training to the train set and use the validation
        # set for early stop, as in most papers.
        # With train_with_validation_set: also use the validation set to
        # train (no early stop, hyperparameters must be set beforehand), as
        # (Chiu & Nichols, 2016) and (Peters et al., 2017); this obviously
        # leads to much higher results (~ +0.5 f1 score with CoNLL-2003).
        early_stop = not train_with_validation_set

    model = Sequence(model_name,
                     max_epoch=max_epoch,
                     recurrent_dropout=recurrent_dropout,
                     embeddings_name=embeddings_name,
                     early_stop=early_stop,
                     fold_number=fold_count,
                     architecture=architecture,
                     transformer_name=transformer,
                     word_lstm_units=word_lstm_units,
                     batch_size=batch_size,
                     patience=patience,
                     max_sequence_length=max_sequence_length,
                     use_ELMo=use_ELMo,
                     multiprocessing=multiprocessing)

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    else:
        model.train_nfold(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    print("\nEvaluation on test set:")
    model.eval(x_eval, y_eval)

    # saving the model (must be called after eval for multiple fold training)
    model.save()
Exemplo n.º 4
0
def train_eval(embedding_name,
               dataset_type='conll2003',
               lang='en',
               architecture='BidLSTM_CRF',
               fold_count=1,
               train_with_validation_set=False,
               use_ELMo=False,
               use_BERT=False,
               data_path=None):
    """Train an NER model, evaluate it on a held-out set, then save it.

    Supported selections:
      * ``'conll2003'`` / ``'en'``: fixed eng.train / eng.testa / eng.testb
      * ``'ontonotes-all'`` / ``'en'``: data loaded from ``data_path``, random
        10% held out for evaluation, then 10% of the rest for validation
      * ``'conll2012'`` / ``'en'``: fixed eng.train / eng.dev / eng.test
      * ``lang='fr'`` (whatever ``dataset_type`` is): Le Monde XML, same
        random split as above

    Any other combination prints a message and returns ``None``.

    Epoch budget and early stopping passed to ``Sequence``:
      * CoNLL-2003: 60 epochs with early stop, or the architecture default
        (30 for BidLSTM_CNN_CRF, otherwise 25) without early stop when
        ``train_with_validation_set`` is true
      * CoNLL-2012: 80 epochs with early stop, or 40 without
      * every other corpus: 60 epochs with early stop

    The model is named ``<prefix>[-with_ELMo|-with_BERT]-<architecture>``;
    ``use_ELMo`` wins when both flags are set.
    """
    is_cnn_variant = architecture == "BidLSTM_CNN_CRF"
    word_lstm_units = 200 if is_cnn_variant else 100
    # epoch budget used when early stopping is switched off (CoNLL-2003 only)
    epochs_without_early_stop = 30 if is_cnn_variant else 25
    recurrent_dropout = 0.5
    batch_size = 120 if (use_ELMo or use_BERT) else 20

    # defaults shared by most corpora; the branches below override them
    epochs = 60
    stop_early = True

    if dataset_type == 'conll2003' and lang == 'en':
        print('Loading CoNLL 2003 data...')
        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.train')
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2003/eng.testb')
        name_prefix = 'ner-en-conll2003'
        if train_with_validation_set:
            # also use validation set to train (no early stop, so the epoch
            # count has to be fixed in advance)
            stop_early = False
            epochs = epochs_without_early_stop

    elif dataset_type == 'ontonotes-all' and lang == 'en':
        print('Loading Ontonotes 5.0 XML data...')
        x_all, y_all = load_data_and_labels_ontonotes(data_path)
        x_rest, x_eval, y_rest, y_eval = train_test_split(x_all,
                                                          y_all,
                                                          test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_rest,
                                                              y_rest,
                                                              test_size=0.1)
        name_prefix = 'ner-en-ontonotes'

    elif dataset_type == 'conll2012' and lang == 'en':
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train, y_train = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_valid, y_valid = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_eval, y_eval = load_data_and_labels_conll(
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        name_prefix = 'ner-en-conll2012'
        epochs = 80
        if train_with_validation_set:
            stop_early = False
            epochs = 40

    elif lang == 'fr':
        print('Loading data...')
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        x_rest, x_eval, y_rest, y_eval = train_test_split(x_all,
                                                          y_all,
                                                          test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_rest,
                                                              y_rest,
                                                              test_size=0.1)
        name_prefix = 'ner-fr-lemonde'
        if use_ELMo:
            # custom batch size for French ELMo
            batch_size = 20

    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

    # the embedding marker sits between the corpus prefix and the architecture
    if use_ELMo:
        embedding_tag = '-with_ELMo'
    elif use_BERT:
        embedding_tag = '-with_BERT'
    else:
        embedding_tag = ''
    model_name = name_prefix + embedding_tag + '-' + architecture

    model = Sequence(model_name,
                     max_epoch=epochs,
                     recurrent_dropout=recurrent_dropout,
                     embeddings_name=embedding_name,
                     early_stop=stop_early,
                     fold_number=fold_count,
                     model_type=architecture,
                     word_lstm_units=word_lstm_units,
                     batch_size=batch_size,
                     use_ELMo=use_ELMo,
                     use_BERT=use_BERT)

    started_at = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid, y_valid)
    else:
        model.train_nfold(x_train,
                          y_train,
                          x_valid,
                          y_valid,
                          fold_number=fold_count)
    elapsed = round(time.time() - started_at, 3)
    print("training runtime: %s seconds " % (elapsed))

    print("\nEvaluation on test set:")
    model.eval(x_eval, y_eval)

    # saving the model
    model.save()
Exemplo n.º 5
0
def _ner_model_name(prefix, architecture, use_ELMo, use_BERT):
    """Build the model name: prefix, optional embedding marker, architecture."""
    name = prefix
    # ELMo takes precedence over BERT when both flags are set.
    if use_ELMo:
        name += '-with_ELMo'
    elif use_BERT:
        name += '-with_BERT'
    return name + '-' + architecture


def _load_and_merge_conll(paths):
    """Load several CoNLL-format files and concatenate them along axis 0."""
    loaded = [load_data_and_labels_conll(path) for path in paths]
    x_all = np.concatenate([x for x, _ in loaded], axis=0)
    y_all = np.concatenate([y for _, y in loaded], axis=0)
    return x_all, y_all


def train(embedding_name,
          dataset_type='conll2003',
          lang='en',
          architecture='BidLSTM_CRF',
          use_ELMo=False,
          use_BERT=False,
          data_path=None):
    """Train an NER model on all the data of a supported corpus and save it.

    For the English CoNLL-2003 and CoNLL-2012 corpora the train, development
    and test files are merged; for French the Le Monde corpus is used. The
    merged data is then randomly split 90/10 into training and validation
    sets, the model is trained, the training runtime is printed and the
    model is saved.

    Args:
        embedding_name: name of the embeddings, forwarded to ``Sequence``.
        dataset_type: 'conll2003' or 'conll2012' (only consulted when
            ``lang`` is 'en').
        lang: 'en' or 'fr'. Any dataset_type with lang 'fr' selects the
            Le Monde corpus.
        architecture: model architecture name, forwarded as ``model_type``
            and appended to the model name.
        use_ELMo: whether to use ELMo; also selects the batch size.
        use_BERT: whether to use BERT.
        data_path: currently unused; kept for interface compatibility.

    Returns:
        None. An unsupported dataset/language combination prints a message
        and returns without training.
    """
    word_lstm_units = 200 if architecture == "BidLSTM_CNN_CRF" else 100
    recurrent_dropout = 0.5
    # NOTE(review): the larger batch size is used with ELMo - confirm intended.
    batch_size = 100 if use_ELMo else 20

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading data...')
        # we concatenate all sets
        x_all, y_all = _load_and_merge_conll([
            'data/sequenceLabelling/CoNLL-2003/eng.train',
            'data/sequenceLabelling/CoNLL-2003/eng.testa',
            'data/sequenceLabelling/CoNLL-2003/eng.testb',
        ])
        name_prefix = 'ner-en-conll2003'
        corpus_settings = {
            'max_epoch': 60,
            'recurrent_dropout': recurrent_dropout,
        }
    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        # we concatenate all sets
        x_all, y_all = _load_and_merge_conll([
            'data/sequenceLabelling/CoNLL-2012-NER/eng.train',
            'data/sequenceLabelling/CoNLL-2012-NER/eng.dev',
            'data/sequenceLabelling/CoNLL-2012-NER/eng.test',
        ])
        name_prefix = 'ner-en-conll2012'
        corpus_settings = {
            'max_epoch': 80,
            'recurrent_dropout': 0.20,
            'early_stop': True,
        }
    elif lang == 'fr':
        print('Loading data...')
        x_all, y_all = load_data_and_labels_lemonde(
            'data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        name_prefix = 'ner-fr-lemonde'
        corpus_settings = {
            'max_epoch': 60,
            'recurrent_dropout': recurrent_dropout,
        }
    else:
        print("dataset/language combination is not supported:", dataset_type,
              lang)
        return

    # split train and valid sets in a random way
    x_train, x_valid, y_train, y_valid = train_test_split(x_all,
                                                          y_all,
                                                          test_size=0.1)
    stats(x_train, y_train, x_valid, y_valid)

    model_name = _ner_model_name(name_prefix, architecture, use_ELMo,
                                 use_BERT)
    model = Sequence(model_name,
                     embeddings_name=embedding_name,
                     model_type=architecture,
                     word_lstm_units=word_lstm_units,
                     batch_size=batch_size,
                     use_ELMo=use_ELMo,
                     use_BERT=use_BERT,
                     **corpus_settings)

    start_time = time.time()
    model.train(x_train, y_train, x_valid, y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds " % (runtime))

    # saving the model
    model.save()