Example #1
    def dataset_build(self, opt):
        # text-type fields with zero extra source/target features
        fields = onmt.IO.get_fields("text", 0, 0)

        train = preprocess.build_dataset('train', fields, opt)

        # build the vocabularies from the training set
        onmt.IO.build_vocab(train, opt.data_type, opt.share_vocab,
                            opt.src_vocab_size, opt.src_words_min_frequency,
                            opt.tgt_vocab_size, opt.tgt_words_min_frequency)

        # build the validation set (its return value is unused here)
        preprocess.build_dataset('valid', fields, opt)
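
The method reads a handful of attributes off opt. A minimal sketch of an options object that satisfies exactly the attributes used above; the values are illustrative, the real opt comes from OpenNMT-py's preprocess argument parser, and preprocess.build_dataset may read further attributes not shown here:

from argparse import Namespace

# Illustrative values only; the real options object is produced by the
# project's argparse setup.
opt = Namespace(
    data_type='text',          # matches onmt.IO.get_fields("text", 0, 0)
    share_vocab=False,         # separate source/target vocabularies
    src_vocab_size=50000,
    src_words_min_frequency=0,
    tgt_vocab_size=50000,
    tgt_words_min_frequency=0,
)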
Example #2
import os

import numpy as np

# build_dataset, bert_embed, BiLSTM_CRF_Model and Evaluator are assumed
# to be defined elsewhere in the surrounding project.


def train_it(train_path, checkpoint_filepath, model_path, start, span):
    dataset = build_dataset(train_path)
    train_x, train_y = [], []
    valid_x, valid_y = [], []
    rng = np.random.RandomState(0)
    k = 0
    for x, y in dataset.as_numpy_iterator():
        x = [str(i, 'utf-8') for i in x]
        y = [str(i, 'utf-8') for i in y]
        rnum = rng.rand()
        k += 1
        # draws in [start, start + span) become the validation slice;
        # everything else goes to training
        if rnum < start or rnum >= start + span:
            train_x += [x]
            train_y += [y]
        else:
            valid_x += [x]
            valid_y += [y]
    # dataset = dataset.batch(32)
    print('====' * 8)
    print('total = ', k)
    print('start , span = ', (start, span))
    print('len train = ', len(train_x))
    # checkpoint_filepath = './checkpoint'
    # create the checkpoint directory if needed (makedirs also handles
    # nested paths, unlike mkdir)
    checkpoint_dir = os.path.dirname(checkpoint_filepath)
    if checkpoint_dir and not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    #     filepath=checkpoint_filepath,
    #     save_weights_only=True,
    #     monitor='val_accuracy',
    #     mode='max',
    #     save_best_only=True)

    model = BiLSTM_CRF_Model(bert_embed, sequence_length=100)
    evaluator = Evaluator(model, checkpoint_filepath, valid_x, valid_y)
    model.fit(train_x,
              train_y,
              valid_x,
              valid_y,
              batch_size=64,
              epochs=20,
              callbacks=[evaluator])
    model.save(model_path)
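
The (start, span) arguments carve the random draws falling in [start, start + span) out as the validation slice, so shifting start between calls yields k-fold cross-validation. A hypothetical driver loop; the paths below are placeholders, not names from the source:

# Hypothetical 5-fold cross-validation driver; all paths are
# placeholders chosen for illustration.
for fold in range(5):
    train_it('train.txt',
             './checkpoints/fold-{}/ckpt'.format(fold),
             'model-{}.h5'.format(fold),
             start=fold * 0.2,
             span=0.2)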
Example #3
import kashgari


def predict_it(test_path, model_path, output_path):
    # BertEmbedding, bert_path and build_dataset are assumed to come
    # from the surrounding project; bert_embed itself is unused below
    bert_embed = BertEmbedding(bert_path)
    dataset = build_dataset(test_path)
    test_x, test_y = [], []
    for x, y in dataset.as_numpy_iterator():
        x = [str(i, 'utf-8') for i in x]
        y = [str(i, 'utf-8') for i in y]
        test_x += [x]
        test_y += [y]

    # load the saved model, then overwrite its weights with the
    # checkpoint produced during training
    loaded_model = kashgari.utils.load_model('saved_ner_model')
    # loaded_model = tf.keras.models.load_model(model_path)
    loaded_model.tf_model.load_weights(model_path)
    # run prediction with the model; keep the gold labels in test_y and
    # write the predictions out separately instead of overwriting them
    pred_y = loaded_model.predict(test_x)
    with open(output_path, 'w') as f:
        for y in pred_y:
            f.write('\t'.join(y) + '\n')
    print('predict_it done {} {} {}'.format(test_path, model_path,
                                            output_path))
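
The output file gets one tab-joined label sequence per line. A hypothetical invocation, pointing at a weights file written during training; the paths are placeholders, not names from the source:

# Placeholder paths for illustration only; model_path must point at the
# weights file saved during training.
predict_it('test.txt', 'model-0.h5', 'predictions.tsv')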
Example #4
def train():

    # alternative: step the learning rate down on a fixed epoch schedule
    # (kept for reference; ReduceLROnPlateau below is used instead)
    # def Scheduler(epoch):
    # 	lr = K.eval(model.optimizer.lr)
    # 	if epoch == 10:
    # 		new_lr = lr * 0.1
    # 	# elif epoch == 12:
    # 	# 	new_lr = 0.0001
    # 	# elif epoch == 25:
    # 	# 	new_lr = 0.002
    # 	elif epoch != 0 and epoch % 30 == 0:
    # 		new_lr = lr * 0.1
    # 	else:
    # 		new_lr = lr
    # 	model.optimizer.lr.assign(new_lr)
    # 	return new_lr

    X, Y = preprocess.build_dataset()
    print('X shape:', X.shape)
    print('Y shape:', Y.shape)

    temp_model_file = os.path.join(config.PATH, "temp_model.h5")
    if os.path.exists(temp_model_file):
        model = load_model(temp_model_file)
        print("LSTM Network loaded")
    else:
        model = create_model()
        print("LSTM Network created")

    # print the model summary (summary() prints itself and returns None,
    # so it is not wrapped in print())
    print("Model Summary:")
    model.summary()

    # define the checkpoint and learning rate change
    filepath = os.path.join(
        config.CHKPT_PATH,
        "weights-improvement-{epoch:03d}-{val_acc:.4f}.hdf5")
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    reduce_lr = ReduceLROnPlateau(monitor='val_acc',
                                  factor=0.1,
                                  patience=10,
                                  min_lr=0.000001)
    logfilepath = os.path.join(config.PATH, "logs.csv")
    logger = CSVLogger(logfilepath)
    stopper = EarlyStopping(monitor='val_acc',
                            min_delta=0.00001,
                            patience=15,
                            verbose=1,
                            mode='auto')
    # lr_change = LearningRateScheduler(Scheduler)
    callbacks_list = [
        checkpoint, reduce_lr, logger,
        LearningRatePrinter(), stopper
    ]

    # fit the model
    history = model.fit(X,
                        Y,
                        batch_size=config.BATCH_SIZE,
                        validation_split=0.2,
                        verbose=2,
                        epochs=config.NUM_EPOCHS,
                        callbacks=callbacks_list)
    print("LSTM Network trained")

    # save model
    model.save(config.MODEL_FILE)
    print("LSTM Network saved")

    # serialize model to JSON
    model_json = model.to_json()
    json_filename = os.path.join(config.PATH, "model_json.json")
    with open(json_filename, "w") as json_file:
        json_file.write(model_json)

    # delete the existing model
    del model

    # save history
    pkl_filename = os.path.join(config.PATH, "history.pkl")
    with open(pkl_filename, "wb") as pkl_file:
        pickle.dump(history.history, pkl_file)

    # statistics of training the model
    # summarize history for accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('LSTM accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt_filename = os.path.join(config.PATH, 'LSTM accuracy.png')
    plt.savefig(plt_filename)
    plt.show()

    # summarize history for loss (start a fresh figure so the accuracy
    # curves are not redrawn into this plot)
    plt.figure()
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('LSTM loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt_filename = os.path.join(config.PATH, 'LSTM loss.png')
    plt.savefig(plt_filename)
    plt.show()
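
If the fixed epoch schedule sketched in the comments above is preferred over ReduceLROnPlateau, it can be wired in through Keras's LearningRateScheduler callback. A minimal sketch of the same decay rule (drop the learning rate tenfold at epoch 10 and at every multiple of 30), assuming the tf.keras two-argument schedule signature:

from tensorflow.keras.callbacks import LearningRateScheduler

def schedule(epoch, lr):
    # decay by 10x at epoch 10 and at every multiple of 30, mirroring
    # the commented-out Scheduler above
    if epoch == 10 or (epoch != 0 and epoch % 30 == 0):
        return lr * 0.1
    return lr

lr_change = LearningRateScheduler(schedule, verbose=1)
# callbacks_list = [checkpoint, lr_change, logger, stopper]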
Example #5
#               'soc.religion.christian',
#               'talk.politics.guns',
#               'talk.politics.mideast',
#               'talk.religion.misc']
categories = [
    'comp.os.ms-windows.misc',
    'rec.motorcycles',
    'sci.space',
    'sci.crypt',
    'sci.electronics',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
]
vocab, train_set, test_set = build_dataset(train_size=4000,
                                           test_rate=0.1,
                                           categories=categories)
vocab_size = len(vocab)
input_dim = 256
hidden_dim = 128
output_dim = len(categories)
# hyperparameters for the commented-out CNN variant below
in_channels = 1
out_channels = 256
kernel_sizes = [3, 4, 5]
keep_proba = 0.5
print('vocab size: ', vocab_size, '  output_dim: ', output_dim)
print('train size: ', len(train_set), '  test size: ', len(test_set))

model = RNN(vocab_size, input_dim, hidden_dim, output_dim)
# model = CNN(vocab_size, input_dim, output_dim, in_channels, out_channels, kernel_sizes, keep_proba)
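
Neither the framework behind RNN/CNN nor the element type of train_set is shown in this snippet. Purely as an assumption, if RNN were a PyTorch nn.Module returning class logits and train_set yielded (token_ids, label) pairs, a minimal training loop could look like this:

import torch
import torch.nn as nn

# Assumed setup, not confirmed by the snippet: model is an nn.Module,
# ids is a 1-D LongTensor of token ids, label is an int class index.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    total_loss = 0.0
    for ids, label in train_set:
        optimizer.zero_grad()
        logits = model(ids.unsqueeze(0))           # add a batch dimension
        loss = criterion(logits, torch.tensor([label]))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('epoch', epoch, 'mean loss', total_loss / len(train_set))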
Example #6
@author: cbasu
"""
from preprocess import build_dataset, get_corpus, make_mappings
import numpy as np
import argparse
import os

parser = argparse.ArgumentParser(description='Trains an LSTM on the text provided.')
parser.add_argument('file_path',
                    metavar='F',
                    type=str,
                    help='A path to a text file.')
args = parser.parse_args()
corpus = get_corpus(args.file_path)
inputs, outputs = build_dataset(corpus)
word_to_id, id_to_word = make_mappings(corpus)

from lstm import LSTM

clf = LSTM(inputs.shape[1], outputs.shape[1], 250, 128)

for i in range(10000):
    clf.fit(inputs, outputs, learning_rate=.001, epochs=1)

    generated = []
    index = np.random.randint(len(inputs))
    init = inputs[index]
    hprev, cprev = clf.hidden_states[-1], clf.internal_memory[-1]

    generated.append(id_to_word[np.argmax(init)])