Example #1
def train():
    input = Input(shape=(max_len, ))
    model = Embedding(input_dim=n_words + 1,
                      output_dim=100,
                      input_length=max_len,
                      mask_zero=True)(input)  # 100-dim embedding
    model = Bidirectional(
        LSTM(units=50, return_sequences=True,
             recurrent_dropout=0.1))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(
        model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)

    model.compile(optimizer="rmsprop",
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])
    model.summary()

    history = model.fit(x_train,
                        np.array(y_train),
                        batch_size=64,
                        epochs=5,
                        validation_split=0.1,
                        verbose=1)
    save_load_utils.save_all_weights(model, filepath="models/bilstm-crf.h5")

    hist = pd.DataFrame(history.history)
    print(hist)
    plt.figure(figsize=(12, 12))
    plt.plot(hist["crf_viterbi_accuracy"])
    plt.plot(hist["val_crf_viterbi_accuracy"])
    plt.show()
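Restoring this model later means rebuilding the same architecture and calling load_all_weights, since the contrib CRF layer does not survive a plain model.save()/load_model() round trip without extra custom objects. A minimal sketch under that assumption; max_len, n_words, n_tags, x_test and np are assumed to be the same names used by train() above:

def load_trained(weights_path="models/bilstm-crf.h5"):
    from keras.models import Model
    from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
    from keras_contrib.layers import CRF
    from keras_contrib.utils import save_load_utils

    # Rebuild exactly the architecture from train() so the weight shapes match.
    input = Input(shape=(max_len, ))
    model = Embedding(input_dim=n_words + 1, output_dim=100,
                      input_length=max_len, mask_zero=True)(input)
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(model)
    model = TimeDistributed(Dense(50, activation="relu"))(model)
    crf = CRF(n_tags)
    out = crf(model)
    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function,
                  metrics=[crf.accuracy])

    # Restore the layer (and optimizer) weights written by save_all_weights.
    save_load_utils.load_all_weights(model, weights_path)
    return model

model = load_trained()
pred_tags = np.argmax(model.predict(x_test), axis=-1)  # tag index per token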
Example #2
def test_save_and_load_all_weights():
    '''
    Test save_all_weights and load_all_weights. Save and load optimizer and model weights but not configuration.
    '''
    def make_model():
        _x = Input((10, ))
        _y = Dense(10)(_x)
        _m = Model(_x, _y)
        _m.compile('adam', 'mean_squared_error')
        _m._make_train_function()
        return _m

    # make a model
    m1 = make_model()
    # set weights
    w1 = m1.layers[1].kernel  # dense layer
    w1value = K.get_value(w1)
    w1value[0, 0:4] = [1, 3, 3, 7]
    K.set_value(w1, w1value)
    # set optimizer weights
    ow1 = m1.optimizer.weights[3]  # momentum weights
    ow1value = K.get_value(ow1)
    ow1value[0, 0:3] = [4, 2, 0]
    K.set_value(ow1, ow1value)
    # save all weights
    save_all_weights(m1, 'model.h5')
    # new model
    m2 = make_model()
    # load all weights
    load_all_weights(m2, 'model.h5')
    # check weights
    assert_allclose(K.get_value(m2.layers[1].kernel)[0, 0:4], [1, 3, 3, 7])
    # check optimizer weights
    assert_allclose(K.get_value(m2.optimizer.weights[3])[0, 0:3], [4, 2, 0])
    os.remove('model.h5')
Example #3
def trainKerasModel(max_len, num_LSTM_Units, learning_rate, vector_dim,
                    num_docs, embedding_matrix, embeddingLayerFlag,
                    embeddingFlag, dropout, batch_size, epochs, X_Train,
                    y_Train, experiment):
    model, crf = createModelArchitecture(max_len, num_LSTM_Units, vector_dim,
                                         num_docs, embedding_matrix, dropout)
    print("Model Architecture Created")
    callbacks = [EarlyStopping(patience=2, monitor='val_loss')]

    model.compile(loss=crf.loss_function,
                  optimizer=RMSprop(lr=learning_rate),
                  metrics=[crf.accuracy])
    model.summary()

    history = model.fit(X_Train,
                        np.array(y_Train),
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_split=0.15,
                        verbose=1,
                        callbacks=callbacks)

    print("Model Training Done, Saving...")
    modelName = 'KerasModel' + '_lstm' + str(num_LSTM_Units) + '_lr' + str(
        learning_rate) + '_dropOut' + str(dropout) + '_bSize' + str(
            batch_size) + '_epochs' + str(
                epochs
            ) + '_' + embeddingLayerFlag + '_' + embeddingFlag + '_' + str(
                experiment) + 'exp.h5'
    save_load_utils.save_all_weights(model,
                                     path + modelName,
                                     include_optimizer=False)
    print("Model Saved")
Example #4
def train(train_x, train_y, train_docs, valid_x, valid_docs, word_vector,
          wv_for_score_model, score_clf):

    resultWriter = open(FLAGS.val_acc_predict, 'w', encoding='utf8')
    trainloss = open(FLAGS.train_loss, 'w', encoding='utf8')

    model_object = dnnModel(FLAGS)
    model = model_object.build_bilstm_model(word_vector)
    maxF1 = 0.0
    file_list = ['../dictionary/trainSenDict', '../dictionary/theme1117.txt']
    dictionary_list = get_dict(file_list)
    valid_dict_index = matrix_index(valid_docs, dictionary_list, FLAGS.max_len)

    for i in range(FLAGS.epoch):
        resultWriter.write('recycle number is ' + str(i) + '\n')
        trainloss.write('recycle number is ' + str(i) + '\n')
        batchIter = batch_iter(len(train_x), zip(train_x, train_y, train_docs),
                               FLAGS.batch_size, False)
        j = 0
        for zip_xydoc in batchIter:
            print(str(i) + ' th epoch, ' + str(j) + ' th batch.')
            j += 1
            print('train.....')
            batch_x, batch_y, batch_doc = zip(*zip_xydoc)
            batch_x = np.array(list(batch_x))
            batch_y = np.array(list(batch_y))
            dict_index = matrix_index(batch_doc, dictionary_list,
                                      FLAGS.max_len)
            trainHistory = model.train_on_batch([batch_x, dict_index], batch_y)
            print("train loss is: " + str(trainHistory[0]) + '\n')
            trainloss.write(str(trainHistory[0]))
            if (i > FLAGS.evaluate_epoch and j % FLAGS.checkpoint_every == 0):
                f1 = evaluateVal(model,
                                 valid_x,
                                 FLAGS.val_label_file,
                                 valid_docs,
                                 valid_dict_index,
                                 wv_for_score_model,
                                 score_clf,
                                 window=FLAGS.window_size)
                print("the validation f1 score is " + str(f1))
                if (f1 > maxF1):
                    print(
                        '####################update model#####################################################'
                    )
                    maxF1 = f1
                    resultWriter.write("the validation f1 is " + str(f1) +
                                       '\n')
                    predict(model, FLAGS.val_data_file, FLAGS.result_file,
                            wv_for_score_model, score_clf, FLAGS.window_size,
                            False)
                    resultWriter.flush()
                    trainloss.flush()
                    save_load_utils.save_all_weights(model, FLAGS.model_path)
            del batch_x
            del batch_y
        del batchIter
    trainloss.close()
    resultWriter.close()
Example #5
    def save(self, path):
        """
        Save model to path

        Args:
            path (str): path to save model weights
        """
        save_load_utils.save_all_weights(self.model, path)
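Only the save side is shown; a matching load method (illustrative, not part of the original class) would simply mirror it, assuming self.model has already been built with the same architecture:

    def load(self, path):
        """
        Load model weights from path

        Args:
            path (str): path to load model weights from
        """
        save_load_utils.load_all_weights(self.model, path)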
Example #6
    def save(self, path):
        """
        Save model to path

        Args:
            path (str): path to save model weights
        """
        save_load_utils.save_all_weights(self.model, path)
Example #7
    def on_epoch_end(self, epoch, logs={}):
        """
            At the end of each epoch, compute the F1 score for the validation data.
            For a multi-output model, compute one value per output and average them to get the overall F1 score.
            Save the model's weights for the best epoch.
        """
        self.compute_epoch_training_F1()
        in_length = len(self.model.input_layers)  # X data - to predict from
        out_length = len(self.model.output_layers)  # Number of tasks

        # Compute the model predictions
        predictions = self.model.predict(self.validation_data[:in_length])
        # In case of single output
        if len(predictions) != out_length:
            predictions = [predictions]

        vals_acc = []
        vals_recall = []
        vals_f1 = []
        reports = ""
        # Iterate over all output predictions
        for i, pred in enumerate(predictions):
            _val_acc, _val_recall, _val_f1 = self.compute_scores(
                np.asarray(pred), self.validation_data[in_length + i])

            # Classification report
            reports += "For task " + str(i + 1) + "\n"
            reports += "===================================================================================="
            reports += self.classification_report(
                i, np.asarray(pred),
                self.validation_data[in_length + i]) + "\n\n\n"

            # Add scores internally
            vals_acc.append(_val_acc)
            vals_recall.append(_val_recall)
            vals_f1.append(_val_f1)

            # Add the F1 score to the logs
            f1_name = "val_" + self.model.output_layers[i].name + "_f1"
            logs[f1_name] = _val_f1

        # Add classification reports for all the predictions/tasks
        self.test_report.append(reports)

        # Add internally
        self.test_acc.append(sum(vals_acc) / len(vals_acc))
        self.test_recall.append(sum(vals_recall) / len(vals_recall))
        self.test_f1s.append(sum(vals_f1) / len(vals_f1))

        # Add to log
        f1_mean = sum(vals_f1) / len(vals_f1)
        logs["val_f1"] = f1_mean

        # Save best model's weights
        if f1_mean > self.best_score:
            self.best_score = f1_mean
            save_load_utils.save_all_weights(self.model, self.model_save_path)
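Only on_epoch_end is shown above, so the wiring into training is implicit. A hypothetical usage sketch: the class name F1SaveCallback, its constructor argument, and the X_train/y_train/X_val/y_val arrays are assumptions; validation_data must be passed to fit() so Keras populates self.validation_data for the callback:

f1_callback = F1SaveCallback(model_save_path="models/best_f1.h5")
model.fit(X_train, np.array(y_train),
          validation_data=(X_val, np.array(y_val)),
          batch_size=32,
          epochs=20,
          callbacks=[f1_callback])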
Example #8
File: model.py Project: gsj4ever/DCWS
def build_model(x, y, vocab_size, max_len):
    """Build up and train a bi-directional LSTM + CRF model, saving model architecture and weights, as well as history
    :param x:
    :param y:
    :param vocab_size:
    :param max_len:
    :return:
    """

    # TODO: read from an existing Word2Vec model, to enhance embedding performance

    model = Sequential()
    model.add(
        Embedding(input_dim=vocab_size + 1,
                  output_dim=EMBEDDING_SIZE,
                  input_length=max_len,
                  mask_zero=True))
    model.add(Bidirectional(LSTM(HIDDEN_UNITS, return_sequences=True)))
    model.add(Dropout(DROPOUT_RATE))
    model.add(Bidirectional(LSTM(HIDDEN_UNITS, return_sequences=True)))
    model.add(Dropout(DROPOUT_RATE))

    # TODO: consider adding a CNN layer to get higher accuracy

    model.add(TimeDistributed(Dense(HIDDEN_UNITS, activation='relu')))
    crf = CRF(5)  # CAUTION!!! sparse_target: True for index, False for one-hot
    model.add(crf)
    model.summary()

    model.compile(optimizer='adam',
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])

    checkpointer = ModelCheckpoint(filepath='./data/weights.hdf5',
                                   monitor='val_loss',
                                   verbose=1,
                                   save_best_only=True)
    stopper = EarlyStopping(monitor="val_loss", patience=2)
    terminator = TerminateOnNaN()
    history = model.fit(x,
                        y,
                        batch_size=BATCH_SIZE,
                        epochs=EPOCH_NUM,
                        validation_split=0.1,
                        callbacks=[checkpointer, stopper, terminator])

    # Save model architecture and weights
    with open('./data/model_architecture.json', 'w') as f:
        f.write(model.to_json())
    save_load_utils.save_all_weights(model, './data/model_weights.hdf5')

    with open('./data/history', 'wb') as f:
        pickle.dump(history.history, f)
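Reloading this model later can go through the JSON architecture written above, as long as the contrib CRF layer is registered in custom_objects, followed by load_all_weights for the weights. A sketch under those assumptions:

from keras.models import model_from_json
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils

with open('./data/model_architecture.json') as f:
    model = model_from_json(f.read(), custom_objects={'CRF': CRF})
crf = model.layers[-1]  # the CRF layer added last in build_model()
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
save_load_utils.load_all_weights(model, './data/model_weights.hdf5')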
Example #9
 def save(self, file_path):
     """ Saves a model to the local disk, provided a file path. """
     save_path = Path(file_path)
     mkdir(save_path)
     model_save_path = save_path.joinpath("KerasNER.model")
     config_save_path = save_path.joinpath("KerasNER.config")
     arch_save_path = save_path.joinpath("KerasNER.json")
     encoder_save_path = save_path.joinpath("encoder")
     if self.config.get_parameter("use_crf"):
         save_load_utils.save_all_weights(self.model, str(model_save_path))
     else:
         self.model.save(str(model_save_path))
     self.config.save(config_save_path)
     # human-readable model architecture in json
     with open(arch_save_path, "w") as wf:
         wf.write(self.model.to_json())
     self.encoder.save(encoder_save_path)
Example #10
def save_model(model, filename):
    save_load_utils.save_all_weights(model, filename)
Example #11
 def save(self, path):
     save_load_utils.save_all_weights(self.model, path)
Example #12
def save_model(model, name, result_folder):
    path = os.path.join(result_folder, name + '.model')
    save_load_utils.save_all_weights(model, path)
    logger.info('saving model under ' + path)
Example #13
    def model_with_padding(self, DICT, n_char):

        # get sequences and labels separated.
        # convert BIO tags to numbers
        sequences, labels = self.get_seq(DICT)

        # sequences = sequences[:100]
        # labels = labels[:100]

        # X = pad_sequences(sequences, maxlen=self.w_arit_mean, padding='post', truncating='post')
        # y_pad = pad_sequences(labels, maxlen=self.w_arit_mean, padding='post', truncating='post')

        X = pad_sequences(sequences, maxlen=self.maxSeqLength, padding='post')
        y_pad = pad_sequences(labels, maxlen=self.maxSeqLength, padding='post')

        y = [to_categorical(i, num_classes=self.lab_len) for i in y_pad]

        # early stopping and best epoch
        #early_stop = keras.callbacks.EarlyStopping(monitor='loss', patience=2, verbose=0, mode='auto')
        #filepath = "max-seq.h5"
        #checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='max')
        #callbacks_list = [checkpoint, early_stop]

        # Set up the keras model
        input = Input(shape=(self.maxSeqLength, ))
        el = Embedding(n_char + 1, 200, name="embed")(input)
        bl1 = Bidirectional(LSTM(128,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm1")(el)
        bl2 = Bidirectional(LSTM(64,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm2")(bl1)
        bl3 = Bidirectional(LSTM(64,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm3")(bl2)
        model = TimeDistributed(Dense(self.lab_len, activation="relu"))(bl3)
        crf = CRF(self.lab_len)  # CRF layer
        out = crf(model)  # output

        model = Model(input, out)
        model.compile(optimizer="rmsprop",
                      loss=crf.loss_function,
                      metrics=[crf.accuracy])
        model.summary()

        # train with 32, 147, 245, 735
        history = model.fit(X,
                            np.array(y),
                            batch_size=32,
                            epochs=self.epochsN,
                            validation_split=0.0,
                            verbose=1)
        # save the weights after all epochs
        save_load_utils.save_all_weights(model,
                                         'max_seq_%s_32b.h5' % self.epochsN)
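If the early-stopping and checkpoint callbacks commented out near the top of model_with_padding were enabled, they would be passed to fit() through the callbacks argument. A sketch of that variant, assuming keras and ModelCheckpoint are imported as the commented lines imply; save_weights_only=True is used here because the model contains the contrib CRF layer, and mode='auto' lets Keras minimize the monitored loss:

        early_stop = keras.callbacks.EarlyStopping(monitor='loss', patience=2,
                                                   verbose=0, mode='auto')
        checkpoint = ModelCheckpoint("max-seq.h5", monitor='loss', verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True, mode='auto')
        history = model.fit(X,
                            np.array(y),
                            batch_size=32,
                            epochs=self.epochsN,
                            validation_split=0.0,
                            verbose=1,
                            callbacks=[checkpoint, early_stop])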
Example #14
model.summary()



history = model.fit([X_train_sents, X_train_sents, X_train_pos, X_train_npos, X_train_features], y_train_ner,
                    batch_size=BATCH_SIZE,
                    epochs=MAX_EPOCHS,
                    verbose=2)

hist_dict = history.history


# save the model
# because we are using keras-contrib, we must save weights like this, and load into network
# (see decoding.ipynb)
save_load_utils.save_all_weights(model, '../model/nltkposcrf_model.h5')
np.save('../model/nltkhist_dict.npy', hist_dict)
print("models saved!\n")


preds = model.predict([X_test_sents, X_test_sents, X_test_pos, X_test_npos, X_test_features])



preds = np.argmax(preds, axis=-1)
preds.shape
print(preds[:5])

trues = np.squeeze(y_test_ner, axis=-1)
trues.shape
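At this point preds and trues hold integer tag indices; turning them back into label strings needs the inverse of whatever tag-to-index dictionary was used to build y_train_ner. A hypothetical sketch (tag2idx is assumed to be that dictionary):

idx2tag = {i: t for t, i in tag2idx.items()}
pred_tags = [[idx2tag[i] for i in sent] for sent in preds]
true_tags = [[idx2tag[i] for i in sent] for sent in trues]
print(pred_tags[0][:10])
print(true_tags[0][:10])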
Example #15
    model.add(TimeDistributed(LSTM(BiRNN_UNITS, return_sequences=False)))

    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(class_labels), sparse_target=True)
    model.add(crf)
    model.summary()

    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    model.fit(train_x,
              train_y,
              epochs=EPOCHS,
              validation_data=[test_x, test_y])

    test_y_pred = model.predict(test_x).argmax(-1)[test_x > 0]
    test_y_true = test_y[test_x > 0]

    print('\n---- Result of Character Embedding + BiLSTM-CRF ----\n')

classification_report(test_y_true, test_y_pred, class_labels)
plotConfusionMatrix(test_y_true, test_y_pred, class_labels,
                    reverseDictionary(dictionary_labels))

model.save('modelo.h5')  # creates an HDF5 file 'modelo.h5'
save_load_utils.save_all_weights(model, 'pesos.h5')

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
Example #16
    def model_no_padding(self, DICT, n_char):

        # convert BIO tags to numbers
        self.convert_tags()
        '''
        check if bion contains 'B' and 'I'
        for i in self.train_data:
            print(i['bion'])
        '''

        for i in range(len(self.train_data)):
            corp = self.train_data[i]['corpus']

            corp_num = []
            for c in corp:
                corp_num.append(DICT.get(c))
            self.train_data[i]['corpus'] = corp_num

        # get all sizes from the sequences with training data
        train_l_d = {}
        train_l_labels = {}
        for seq in self.train_data:
            # corpus
            l = len(seq['corpus'])
            if l not in train_l_d: train_l_d[l] = []
            train_l_d[l].append(seq['corpus'])

            # labels
            l1 = len(seq['bion'])
            if l1 not in train_l_labels: train_l_labels[l1] = []
            train_l_labels[l1].append(seq['bion'])
        '''
        for i in range(len(train_l_d[110])):
            print(len(train_l_d[110][i]) == len(train_l_labels[110][i]))
            print()
        print("\n\n")

        for i in range(len(train_l_d[31])):
            print(len(train_l_d[31][i]) == len(train_l_labels[31][i]))
        print("\n\n")

        for i in range(len(train_l_d[103])):
            print(len(train_l_d[103][i]) == len(train_l_labels[103][i]))
        print("\n\n")
        exit()
        '''
        sizes = list(train_l_d.keys())

        # Set up the keras model
        il = Input(shape=(None, ), dtype='int32')
        el = Embedding(n_char + 1, 200, name="embed")(il)
        bl1 = Bidirectional(LSTM(128,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm1")(el)
        bl2 = Bidirectional(LSTM(64,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm2")(bl1)
        bl3 = Bidirectional(LSTM(64,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm3")(bl2)
        model = TimeDistributed(Dense(self.num_labs, activation="relu"))(bl3)
        crf = CRF(self.num_labs)  # CRF layer
        out = crf(model)  # output

        model = Model(il, out)
        model.compile(optimizer="rmsprop",
                      loss=crf.loss_function,
                      metrics=[crf.accuracy])
        model.summary()

        f_best = -1
        f_index = -1
        # OK, start actually training
        for epoch in range(self.epochsN):
            print("Epoch", epoch, "start at", datetime.now())
            # Train in batches of different sizes - randomize the order of sizes
            # Except for the first few epochs
            if epoch > 2:
                random.shuffle(sizes)
            for size in sizes:
                batch = train_l_d[size]
                labs = train_l_labels[size]

                tx = np.array([seq for seq in batch])
                y = [seq for seq in labs]

                ty = [to_categorical(i, num_classes=self.num_labs) for i in y]

                # This trains in mini-batches
                model.fit(tx, np.array(ty), verbose=0, epochs=1)
            print("Trained at", datetime.now())

            # save all epochs
            save_load_utils.save_all_weights(
                model, 'mini-batch-results/epoch_%s.h5' % epoch)
            # test the results
            self.test_minibatch(DICT, model)
            f = self.eval()

            if f > f_best:
                f_best = f
                f_index = epoch

        # Pick the best model, and save it with a useful name
        print("Choosing the best epoch")
        shutil.copyfile("mini-batch-results/epoch_%s.h5" % f_index,
                        "minibatch_%s.h5" % f_index)
Example #17
                        y_train,
                        batch_size=32,
                        epochs=2,
                        validation_data=(x_test, y_test))

    # Model3: BERT -> BiLSTM -> CRF
    nerbertbilstm = NerBiLSTM_Bert(config)
    model = nerbertbilstm.model
    model.compile(optimizer=Adam(1e-4), loss=crf_loss, metrics=[crf_accuracy])

    # Miscellaneous
    from keras.callbacks import ModelCheckpoint, Callback

    class LossHistory(Callback):
        def on_train_begin(self, logs={}):
            self.losses = []

        def on_batch_end(self, batch, logs={}):
            self.losses.append(logs.get('loss'))

    checkpointer = ModelCheckpoint(filepath="bilstm_1102_k205_tf130.w",
                                   verbose=0,
                                   save_best_only=True,
                                   save_weights_only=True)
    losshistory = LossHistory()

    # Save and load the model  # TODO: is this keras_contrib API the only way to do it?
    from keras_contrib.utils import save_load_utils
    model_path = 'xxx'
    save_load_utils.save_all_weights(model, model_path)  # save the model
    save_load_utils.load_all_weights(model, model_path)  # load the model
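On the TODO above: keras_contrib's save_load_utils is not the only option. Plain weight checkpoints also work for CRF models as long as the architecture is rebuilt in code before loading, and full-model save/load works once the contrib objects are registered. A sketch of both alternatives; file names are illustrative, and a model with extra custom layers (e.g. the BERT variant above) would need those registered in custom_objects as well:

    # 1) Plain weight files: no custom objects needed, but the model must be
    #    rebuilt with the same code before calling load_weights().
    model.save_weights('bilstm_crf.weights.h5')
    model.load_weights('bilstm_crf.weights.h5')

    # 2) Full-model save/load: register the contrib CRF layer and its
    #    loss/metric so load_model() can deserialize them.
    from keras.models import load_model
    from keras_contrib.layers import CRF
    from keras_contrib.losses import crf_loss
    from keras_contrib.metrics import crf_accuracy

    model.save('bilstm_crf.full.h5')
    model = load_model('bilstm_crf.full.h5',
                       custom_objects={'CRF': CRF,
                                       'crf_loss': crf_loss,
                                       'crf_accuracy': crf_accuracy})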
Example #18
model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Bidirectional(LSTM(hidden_size, return_sequences=True)))
model.add(TimeDistributed(Dense(hidden_size, activation='softmax')))

crf = CRF(n_tags)
model.add(crf)

model.compile(optimizer="rmsprop",
              loss=crf.loss_function,
              metrics=[crf.accuracy])

for i in range(nb_epoch):
    model.fit(X_train, Y_train, batch_size=batch_size, epochs=1)
    save_load_utils.save_all_weights(model,
                                     '../models/bilstm_crf_' + str(i) + '.h5')

# In[ ]:

score = model.evaluate(X_test, Y_test)

print('Test score:', score[0])
print('Test accuracy:', score[1])

# In[ ]:

pred = model.predict(X_test)

# In[ ]:

pred_argmax = [[np.argmax(word) for word in sent] for sent in pred]
Example #19
history = model.fit([trn_text_idx, trn_char_idx], [trn_slot_idx, trn_int_idx],
                    validation_data=([dev_text_idx, dev_char_idx],
                                     [dev_slot_idx, dev_int_idx]),
                    batch_size=BATCH_SIZE,
                    epochs=MAX_EPOCHS,
                    callbacks=callbacks_list,
                    verbose=0)

hist_dict = history.history

# save architecture with json
with open('model/' + modelname + '.json', 'w') as f:
    f.write(model.to_json())
# save weights
save_load_utils.save_all_weights(model, 'model/' + modelname + '.h5')
# save training history
np.save('model/' + modelname + '_dict.npy', hist_dict)

# load test
model.load_weights('model/' + modelname + '.h5')

from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score


# remove nulls and pads and get F1 on only labels
def procslots(trues, preds, nonull=True):
    tru_slots = []
    prd_slots = []
    for i in range(len(trues)):
Example #20
 def save_crf_model(self, path, name):
     # check if dir exist
     DataSaverLoader.directory_exists(path)
     # Save model
     save_load_utils.save_all_weights(self.model, path + name)
Example #21
 def save_model(self, filepath):
     save_load_utils.save_all_weights(self.model, filepath)
Example #22
 def save(self, path):
     save_load_utils.save_all_weights(self.model, path)
Example #23
def save_embedding_bilstm2_crf_model(model, filename):
    save_load_utils.save_all_weights(model, filename)
Example #24
def get_measures(yTrue, yPred):
    y1 = yTrue.reshape(1, -1).squeeze()
    y2 = yPred.reshape(1, -1).squeeze()

    P = precision_score(y1, y2, average=None)
    R = recall_score(y1, y2, average=None)
    F1 = f1_score(y1, y2, average=None)

    print("Precision=", flush=True)
    print(P, flush=True)
    print("Recall=", flush=True)
    print(R, flush=True)
    print("F1 score=", flush=True)
    print(F1, flush=True)


# print("Train...", flush=True)
# get_measures(y_tr_true, pred_train)

print("Test...", flush=True)
get_measures(y_true, y_pred)

# np.save("y_tr_true.npy", y_tr_true)
# np.save("pred_train.npy", pred_train)

# np.save("y_true.npy", y_true)
# np.save("y_pred.npy", y_pred)

save_load_utils.save_all_weights(model, os.path.join(log_dir, state_dict))
Example #25
def save_model(model, save_model_path):
    save_load_utils.save_all_weights(model, save_model_path)
Example #26
def save_model(model, save_model_path):
    save_load_utils.save_all_weights(model, save_model_path)
Example #27
def save_embedding_bilstm2_crf_model(model, filename):
    save_load_utils.save_all_weights(model, filename)
Example #28
 def save_trained_model(self, name):
     if self.verbose:
         print("\n[INFO] Saving trained model to '" + name + "'\n")
     save_load_utils.save_all_weights(self.model, name)
Example #29
def main():

    data, words, tags, pos = load_dataset()  # unique
    n_words = len(words)  # total words in vocab
    n_tags = len(tags)  # total tags in vocab
    n_pos = len(pos)

    print("Data Loaded successfully..")
    getter = SentenceGetter(data)

    # list of (word,POS,Tag)
    sentences = getter.sentences

    print("First sentence")
    print(sentences[0])
    max_len = 75  # length of each sequence/sentence

    # if model is trained, load previous results
    if trained:
        assert trained == True, "Trained must be True"

        # load trained indices
        word2idx = save_load_word_idx("word2idx.pkl", load=True)
        idx2word = save_load_word_idx("idx2word.pkl", load=True)
        tag2idx = save_load_word_idx("tag2idx.pkl", load=True)
        idx2tag = save_load_word_idx("idx2tag.pkl", load=True)

    else:
        assert trained == False

        # save trained indices
        word2idx = {w: i + 2 for i, w in enumerate(words)}
        word2idx["ENDPAD"] = 0
        word2idx["UNK"] = 1

        idx2word = {i: w for w, i in word2idx.items()}

        tag2idx = {t: i + 1 for i, t in enumerate(tags)}
        tag2idx["ENDPAD"] = 0

        idx2tag = {i: t for t, i in tag2idx.items()}

        save_load_word_idx("word2idx.pkl", word2idx=word2idx, save=True)
        save_load_word_idx("idx2word.pkl", word2idx=idx2word, save=True)
        save_load_word_idx("tag2idx.pkl", word2idx=tag2idx, save=True)
        save_load_word_idx("idx2tag.pkl", word2idx=idx2tag, save=True)

    print("word2idx[\"demonstrators\"].................",
          word2idx["demonstrators"])

    # convert sequence of sentences into corresponding int vectors
    X = [[word2idx[w[0]] for w in s] for s in sentences]

    # max length of sequence/sentence
    print("Max length of sequence(len(sentence)):", max([len(x) for x in X]))

    # add padding for same length i.e, max_len= 75 with "0" value
    X = pad_sequences(maxlen=max_len,
                      sequences=X,
                      truncating='post',
                      padding='post',
                      value=0)

    y = [[tag2idx[w[2]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len,
                      sequences=y,
                      padding="post",
                      truncating="post",
                      value=tag2idx["ENDPAD"])

    # y: class vector to be converted into a matrix (integers from 0 to num_classes).
    # num_classes: total number of classes.
    # y = [to_categorical(i, num_classes = n_tags + 1) for i in y]

    # create a set of all possible chars
    chars = set([chars for word in words for chars in word])
    n_chars = len(chars)
    max_len_chars = 10

    # i, len_word = max([(i,len(word)) for i,word in enumerate(words)])

    _max, imax = -1, 0
    for i, w in enumerate(words):
        if len(w) > _max:
            _max, imax = len(w), i

    print("Actual max len chars(n_chars){} and word is {}:".format(
        _max, words[imax]))

    # create char2idx for converting chars as vector of integers to feed to LSTM
    char2idx = {char: i + 2 for i, char in enumerate(chars)}
    char2idx["ENDPAD"] = 0  # to ignore this by mask_zero = True
    char2idx["UNK"] = 1

    # vice versa
    idx2char = {i: char for char, i in char2idx.items()}

    # generate char_sequence for input to model
    X_char = []
    for sentence in sentences:
        sent_seq = []
        # max_len = 75
        for i in range(max_len):
            word_seq = []
            # char sequence for words
            for j in range(max_len_chars):
                try:
                    # chars of specific sentence of i
                    word_seq.append(char2idx.get(sentence[i][0][j]))
                except:  # if the char sequence is out of range, pad with "ENDPAD"
                    word_seq.append(char2idx.get("ENDPAD"))

            sent_seq.append(word_seq)
        # append sentence sequences as character-by-character to X_char for Model input
        X_char.append(np.array(sent_seq))

    print(X_char[:1])
    print("shape of one X_char[0]: ", X_char[0].shape)
    print("shape of  X_char:{} ".format(np.array(X_char).shape))
    print("shape of  X:{} ".format(X.shape))

    from sklearn.model_selection import train_test_split
    # split data into train=90% / test=10%
    X_tr, X_te, y_tr, y_te = train_test_split(X,
                                              y,
                                              test_size=0.1,
                                              shuffle=True,
                                              random_state=2018)
    X_char_tr, X_char_te, _, _ = train_test_split(X_char,
                                                  y,
                                                  test_size=0.1,
                                                  shuffle=True,
                                                  random_state=2018)

    print("shape of  X_char_tr:{} ".format(np.array(X_char_tr).shape))
    print("shape of  X_char_te:{} ".format(np.array(X_char_te).shape))
    print("shape of  y_tr:{} ".format(np.array(y_tr).shape))

    print(
        "Reshaped X_char_tr:",
        np.array(X_char_tr).reshape(
            (len(X_char_tr), max_len, max_len_chars)).shape)

    print("Reshaped y_tr:",
          np.array(y_tr).reshape(len(y_tr), max_len, 1).shape)

    print("X_tr : ", X_tr.shape)

    # import sys
    # sys.exit(0)

    if trained:
        # model.evaluate(X_te, np.array(y_te), verbose=1)
        main2(X,
              X_te,
              y_te,
              words=words,
              tags=tags,
              idx2word=idx2word,
              idx2tag=idx2tag)
        return

    model = create_model(max_len, n_words, n_tags, n_pos, max_len_chars,
                         n_chars)

    # second input to be fed like : model.fit([X_tr, second_input])
    # second_input_emb = np.array(X_pos[:len(X_tr)])
    # second_input_hot = np.array(X_pos[:len(X_tr)])

    # score = model.evaluate([X_te, np.array(X_pos[len(X_tr):])], np.array(y_te), verbose=1)
    # #print accuracy
    # print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

    # score = model.evaluate([X_te, np.array(X_pos[len(X_tr):])], np.array(y_te), verbose=1)
    # print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

    # history = model.fit([X_tr, second_input_hot], np.array(y_tr),\
    #  batch_size=32, epochs=2, validation_split=0.1, verbose=1)

    history = model.fit([X_tr, np.array(X_char_tr)], np.array(y_tr).reshape(len(y_tr),max_len,1),\
     batch_size=32, epochs=5, validation_split=0.1, verbose=1)

    # TODO: pass second arg to model.evaluate()

    # score = model.evaluate(X_te, y_te, batch_size=16)
    # evaluate the model for training examples and print accuracy =>98.63%

    # score = model.evaluate([X_te, np.array(X_pos[len(X_tr):])], np.array(y_te), verbose=1)
    # print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

    score = model.evaluate([X_tr, np.array(X_char_tr)],
                           np.array(y_tr).reshape(len(y_tr), max_len, 1),
                           verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))

    # save model on disk
    # save_path = "C:\\Users\\Usman Ahmad\\Desktop\\P3_LSTM_saved.pkl"

    model.save("My_Custom_Model3.h5")

    from keras_contrib.utils import save_load_utils
    # save using keras_contrib.utils.save_load_utils

    save_load_utils.save_all_weights(model, "Model_saved_using_contrib.h5")

    model.save_weights("model_weights.h5")
    with open("model_architecture.json", "w") as json_file:
        json_file.write(model.to_json())

    # print("Saved model to disk"
    # serialize weights to HDF5
    print("Saved model to disk")

    plot_history(history)

    print(model.summary())

    print('*' * 50)
    if words is not None and tags is not None:
        i = 2318
        p = model.predict(np.array([X_te[i]]))
        p = np.argmax(p, axis=-1)
        print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
        for w, pred in zip(X_te[i], p[0]):
            if w != 0:
                print("{:15}: {}".format(words[w - 1], tags[pred]))

        print('*' * 50)
        print(p)
        print("len(p) = ", len(p))

    # for x in X_te[i]:
    # 	print(words[x], end = " ")
    # print(" ")

    del model

    from keras.models import load_model
    loaded_model = load_model("My_Custom_Model3.h5")
    save_load_utils.load_all_weights(loaded_model, "Model_saved_using_contrib.h5")

    print("Model Loaded.. Evaluating again")

    score = loaded_model.evaluate(X_te, np.array(y_te), verbose=1)
    print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1] * 100))

    score = loaded_model.evaluate(X_te, np.array(y_te), verbose=1)
    print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1] * 100))

    # score = loaded_model.evaluate(X,Y, verbose=1)
    #print accuracy
    # print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
    if words is not None and tags is not None:
        i = 2319
        p = loaded_model.predict(np.array([X[i]]))
        # p = loaded_model.predict([np.array(X[i]),second_input[i]])

        p = np.argmax(p, axis=-1)
        print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
        for w, pred in zip(X[i], p[0]):  # p[0] = p[[1,3,4,5]]
            print("{:15}: {}".format(words[w], tags[pred]))
Example #30
    def main(self, glove):
        # get word embeddings
        utils = wordUtils.Utils()

        if glove:
            # use glove
            self.words_list, self.embedding_matrix = utils.load_glove()
            unword_n = len(self.words_list)

        else:
            self.words_list, self.embedding_matrix = utils.load_word2vec()
            unword_n = len(self.words_list)

        # get the training corpus
        cr = corpusreader.CorpusReader(self.textfile, self.annotfile)
        corpus = cr.trainseqs
        print(len(corpus))
        train = []
        print("Processing training data", datetime.now())
        for doc in corpus:
            tmp_dic = {}

            tmp_dic['tokens'] = doc['tokens']

            # convert SOBIE tags to numbers
            tags = doc['bio']
            tags = [self.lablist[i] for i in tags]
            tmp_dic['bion'] = tags
            train.append(tmp_dic)


        n_emb = 0
        n_unk = 0

        # get the number of the embedding
        for idx in range(len(train)):
            words = train[idx]['tokens']
            words_id = []
            for i in words:
                # get the number of the embedding
                try:
                    # the index of the word in the embedding matrix
                    index = self.words_list.index(i)
                    n_emb = n_emb + 1
                except ValueError:
                    # use the embedding full of zeros to identify an unknown word
                    n_unk = n_unk + 1
                    index = unword_n

                # the index of the word in the embedding matrix
                words_id.append(index)

            train[idx]['tokens'] = words_id


        # get all sizes from the sequences with training data
        train_l_d = {}
        train_l_labels = {}
        for seq in train:
            # corpus
            l = len(seq['tokens'])
            if l not in train_l_d: train_l_d[l] = []
            train_l_d[l].append(seq['tokens'])

            # labels
            l1 = len(seq['bion'])
            if l1 not in train_l_labels: train_l_labels[l1] = []
            train_l_labels[l1].append(seq['bion'])

        sizes = list(train_l_d.keys())
        for i in sizes:
            if len(train_l_d[i]) != len(train_l_labels[i]):
                print("merda")

            for m in range(len(train_l_d[i])):
                if len(train_l_d[i][m]) != len(train_l_labels[i][m]):
                    print("XXX")

        input = Input(shape=(None,))
        el = Embedding(len(self.words_list) + 1, 200, weights=[self.embedding_matrix], trainable=False)(input)
        model = Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout = 0.1))(el)  # variational biLSTM
        model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
        crf = CRF(self.lab_len)  # CRF layer
        out = crf(model)  # output

        model = Model(input, out)
        model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
        model.summary()

        f_best = -1
        f_index = -1
        # OK, start actually training
        for epoch in range(self.epochsN):
            print("Epoch", epoch, "start at", datetime.now())
            # Train in batches of different sizes - randomize the order of sizes
            # Except for the first few epochs
            if epoch > 2:
                random.shuffle(sizes)
            for size in sizes:
                batch = train_l_d[size]
                labs = train_l_labels[size]

                tx = np.array([seq for seq in batch])
                y = [seq for seq in labs]

                ty = [to_categorical(i, num_classes=self.lab_len) for i in y]

                # This trains in mini-batches
                model.fit(tx, np.array(ty), verbose=0, epochs=1)
            print("Trained at", datetime.now())

            # save all epochs
            save_load_utils.save_all_weights(model, 'words-results/epoch_%s.h5' % epoch)
            # test the results
            test_data = 'corpus_char/tmVarCorpus/treated/test_data.txt'
            test_labels = 'corpus_char/tmVarCorpus/treated/test_labels.tsv'
            self.test_model(test_data, test_labels, model, glove)
            f = self.eval()

            if f > f_best:
                f_best = f
                f_index = epoch


        # Pick the best model, and save it with a useful name
        print("Choosing the best epoch")
        shutil.copyfile("words-results/epoch_%s.h5" % f_index, "words_glove_%s.h5" % f_index)
Example #31
 def save_model(self):
     save_load_utils.save_all_weights(
         self.model,
         self.filepath + '/' + str(self.steps_counter) + '.hdf5')
Example #32
 y_tr = data_dict["y_tr"]
 y_te = data_dict["y_te"]
 max_len = 75
 n_words = data_dict["n_words"]
 n_tags = data_dict["n_tags"]
 tag2idx = data_dict["tag2idx"]
 pos2idx = data_dict["pos2idx"]
 word2idx = data_dict["word2idx"]
 ## Model definition
 input = Input(shape=(max_len,))
 model = Embedding(input_dim=n_words + 1, output_dim=20,
                   input_length=max_len)(input)  # 20-dim embedding
 model = Bidirectional(LSTM(units=50, return_sequences=True,
                            recurrent_dropout=0.1))(model)  # variational biLSTM
 model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
 crf = CRF(n_tags)  # CRF layer
 out = crf(model)  # output
 model = Model(input, out)
 model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
 print(model.summary())
 history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=100,
                 validation_split=0.1, verbose=1)
 #Testing
 test_pred = model.predict(X_te, verbose=1)
 idx2tag = {i: w for w, i in tag2idx.items()}
 pred_labels = pred2label(test_pred)
 test_labels = pred2label(y_te)
 print("Recall, Precision and F-score are",
       get_recall_precision(test_labels, pred_labels, "Destination"))
 save_load_utils.save_all_weights(model,"BILSTM+CRF_without_pos_without_embeddings")