예제 #1
0
def fit_lstm_model(X_train, y_train, n_words, n_tags, seq_len, class_weights,
                   epochs):
    """Build, train and evaluate a bidirectional LSTM tagger.

    The network has a single input of ``seq_len`` encoded tokens, embeds it
    with 300-dimensional vectors initialised from the module-level
    ``embedding_matrix`` (GloVe vectors, according to the original comment),
    and ends in a softmax over ``n_tags`` classes.

    NOTE(review): the closing report is printed as "validation" loss and
    accuracy, but it is computed on the module-level ``X_test`` / ``y_test``,
    not on the 10% validation split used while fitting.

    Args:
        X_train: Training inputs passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        n_words: Vocabulary size (``input_dim`` of the embedding).
        n_tags: Number of output classes.
        seq_len: Length of every input sequence.
        class_weights: Class weighting; forwarded wrapped in a one-item list.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(model, history)``: the fitted model and the object returned
        by ``model.fit``.
    """
    # Single input: equal-length sequences of encoded text.
    tokens_in = Input(shape=(seq_len, ))

    # Embedding seeded with the pretrained weights and left trainable.
    embedding_layer = Embedding(input_dim=n_words,
                                output_dim=300,
                                weights=[embedding_matrix],
                                trainable=True)
    embedded = Dropout(0.1)(embedding_layer(tokens_in))

    # Bidirectional LSTM -> dense hidden layer -> softmax output layer.
    recurrent_layer = Bidirectional(
        LSTM(units=64, return_sequences=True, recurrent_dropout=0.1))
    hidden = TimeDistributed(Dense(64, activation='relu'))(
        recurrent_layer(embedded))
    output = Dense(n_tags, activation="softmax")(hidden)

    # Compile and fit the network.
    model = Model(inputs=tokens_in, outputs=output)
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    fit_options = dict(epochs=epochs,
                       batch_size=32,
                       validation_split=0.1,
                       verbose=1,
                       class_weight=[class_weights])
    history = model.fit(X_train, y_train, **fit_options)

    # Simple performance report (evaluated on the global X_test / y_test).
    val_loss, val_acc = model.evaluate(X_test, y_test)
    print(f'Model validation loss was {val_loss}')
    print(f'Model validation accuracy was {val_acc}')
    return model, history
예제 #2
0
    def train(self, epochs, embedding=None):
        """Build, train, save and evaluate the three-input BiLSTM tagger.

        The model concatenates three per-token representations (word
        embedding, part-of-speech embedding and an LSTM encoding of the
        token's characters), feeds them through two stacked bidirectional
        LSTMs and predicts one of ``self.num_entities`` classes per token.

        Side effects: writes ``model_structure.png`` and the saved model
        ``model_ner`` under ``self.save_path`` and prints the model summary
        and the test loss/accuracy.

        Args:
            epochs: Number of training epochs.
            embedding: Optional pretrained word-embedding matrix. When given
                it is loaded into the word embedding layer, which is then
                frozen. When omitted the layer is randomly initialised and
                trained (previously ``weights=[None]`` was passed, which
                fails when the layer is built).

        Returns:
            Tuple ``(model, history)``: the trained model and the object
            returned by ``model.fit``.
        """
        # Word branch. Only pass `weights` when a matrix was supplied, and
        # only freeze the layer in that case: a frozen random embedding
        # could never learn anything.
        txt_embed_kwargs = {'trainable': embedding is None}
        if embedding is not None:
            txt_embed_kwargs['weights'] = [embedding]
        txt_input = Input(shape=(None, ), name='txt_input')
        txt_embed = Embedding(input_dim=self.num_words,
                              output_dim=MAX_LEN,
                              input_length=None,
                              name='txt_embedding',
                              **txt_embed_kwargs)(txt_input)
        txt_dropout = Dropout(0.1, name='txt_dropout')(txt_embed)

        # Part-of-speech branch (embedding learned from scratch).
        pos_input = Input(shape=(None, ), name='pos_input')
        pos_embed = Embedding(input_dim=self.num_pos,
                              output_dim=MAX_LEN,
                              input_length=None,
                              name='pos_embedding')(pos_input)
        pos_dropout = Dropout(0.1, name='pos_dropout')(pos_embed)

        # Character branch: every token carries MAX_LEN_CHAR character ids,
        # which are embedded and then summarised by an LSTM into a single
        # 20-unit vector per token.
        char_in = Input(shape=(
            None,
            MAX_LEN_CHAR,
        ), name="char_input")
        emb_char = TimeDistributed(
            Embedding(input_dim=self.num_chars,
                      output_dim=MAX_LEN_CHAR,
                      input_length=None))(char_in)
        char_enc = TimeDistributed(
            LSTM(units=20, return_sequences=False,
                 recurrent_dropout=0.5))(emb_char)

        # Join the three per-token representations along the feature axis.
        x = concatenate([txt_dropout, pos_dropout, char_enc], axis=2)
        x = SpatialDropout1D(0.3)(x)

        # Two stacked bidirectional LSTMs.
        x = Bidirectional(
            LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(x)
        x = Bidirectional(
            LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(x)

        # Per-token softmax over the entity classes.
        out = TimeDistributed(Dense(self.num_entities,
                                    activation="softmax"))(x)
        model = Model(inputs=[txt_input, pos_input, char_in], outputs=[out])

        model.compile(optimizer="rmsprop",
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        plot_model(model, to_file=self.save_path + 'model_structure.png')
        # summary() prints by itself and returns None; wrapping it in print()
        # only appended a stray "None" line.
        model.summary()

        history = model.fit(
            [self.X_train, self.train_pos, self.train_characters],
            np.array(self.Y_train),
            batch_size=32,
            epochs=epochs,
            validation_data=([
                self.X_validation, self.valid_pos, self.valid_characters
            ], np.array(self.Y_validation)),
            verbose=1)

        model.save(self.save_path + 'model_ner')

        test_eval = model.evaluate(
            [self.X_test, self.test_pos, self.test_characters],
            np.array(self.Y_test))

        print('Test loss:', test_eval[0])
        print('Test accuracy:', test_eval[1])

        return model, history
def _embedding_vector_for(word, wordembeddings, dim):
    """Return the pretrained vector for *word*, or a random one of length *dim*.

    A word that is missing from ``wordembeddings`` gets a random vector
    straight away. A word that is present but maps to an empty vector is
    retried in title case and then upper case; variants that are not in the
    mapping are skipped instead of raising ``KeyError``.
    """
    known_words = wordembeddings.keys()

    def random_vector():
        # One scalar draw per component keeps the seeded random stream
        # identical to the original implementation.
        return np.array([round(np.random.rand(), 8) for _ in range(dim)])

    if word not in known_words:
        return random_vector()
    for candidate in (word, word.title(), word.upper()):
        if candidate in known_words:
            vector = wordembeddings[candidate]
            if len(vector) != 0:
                return vector
    return random_vector()


def bilstm(X_train, X_test, Y_train, Y_test, wordembeddings):
    """Train a bidirectional-GRU text classifier and report on the test set.

    Steps: seed the NumPy, TensorFlow and ``random`` generators, fit a
    tokenizer on ``X_train``, pad both splits to the longest training
    sentence, build a 300-dimensional embedding matrix from
    ``wordembeddings`` (random vectors for words without a usable entry),
    train the network with early stopping on validation loss, then save the
    model, a loss plot and an architecture diagram.

    Args:
        X_train: Training texts; must support the pandas ``.str`` accessor.
        X_test: Texts used as validation data during fitting and for the
            final predictions.
        Y_train: Training targets (converted with ``np.asarray``); the model
            ends in a 204-way softmax.
        Y_test: Targets matching ``X_test``.
        wordembeddings: Mapping from word to embedding vector; an entry may
            be an empty vector.

    Returns:
        Tuple ``(y_pred, y_val_class, y_pred_class, y_val_class_argmax,
        y_pred_class_argmax)``: raw predictions plus true and predicted class
        indices, each computed both with pandas ``idxmax`` and with
        ``np.argmax``.
    """
    np.random.seed(1234)
    tf.random.set_seed(1234)
    random.seed(1234)

    max_length_sentence = X_train.str.split().str.len().max()
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                          lower=True)
    tokenizer.fit_on_texts(X_train)
    word_index = tokenizer.word_index
    EMBEDDING_DIM = 300
    # Index 0 is never assigned by the tokenizer, hence the extra row.
    vocabulary_size = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))

    sequences_train = tokenizer.texts_to_sequences(X_train)
    sequences_valid = tokenizer.texts_to_sequences(X_test)
    X_train = pad_sequences(sequences_train, maxlen=max_length_sentence)
    X_val = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
    y_train = np.asarray(Y_train)
    y_val = np.asarray(Y_test)

    # Row i holds the vector of the word whose tokenizer index is i.
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_matrix[i] = _embedding_vector_for(word, wordembeddings,
                                                    EMBEDDING_DIM)

    inputs = Input(shape=(X_train.shape[1], ))
    model = (Embedding(vocabulary_size,
                       EMBEDDING_DIM,
                       input_length=max_length_sentence,
                       weights=[embedding_matrix]))(inputs)

    model = Bidirectional(GRU(64))(
        model)  # !!!!!!! CHANGE THIS FOR OTHER MODELS
    model = (Dense(900, activation='relu'))(model)
    model = (Dense(400, activation='relu'))(model)
    model = (Dense(250, activation='relu'))(model)
    model = (Dense(204, activation='softmax'))(model)
    model = Model(inputs=inputs, outputs=model)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    callbacks = [EarlyStopping(monitor='val_loss')]
    hist_adam = model.fit(X_train,
                          y_train,
                          batch_size=1000,
                          epochs=200,
                          verbose=1,
                          validation_data=(X_val, y_val),
                          callbacks=callbacks)

    model.save(config.bigru_prepocessed_dataset1_chai
               )  # !!!!!!! CHANGE THIS FOR OTHER MODELS

    y_pred = model.predict(X_val)
    print(y_pred)

    # Class indices of the true labels, once via pandas and once via NumPy.
    y_val_class = pd.DataFrame(y_val).idxmax(axis=1)
    print(y_val_class)

    y_val_class_argmax = np.argmax(y_val, axis=1)
    y_pred_class_argmax = np.argmax(y_pred, axis=1)

    y_pred_class = pd.DataFrame(y_pred).idxmax(axis=1)
    print(y_pred_class)

    print(classification_report(y_val_class, y_pred_class))

    # Training vs. validation loss curve.
    plt.suptitle('Optimizer : Adam', fontsize=10)
    plt.ylabel('Loss', fontsize=16)
    plt.xlabel('Epoch', fontsize=14)
    plt.plot(hist_adam.history['loss'], color='b', label='Training Loss')
    plt.plot(hist_adam.history['val_loss'], color='r', label='Validation Loss')
    plt.legend(loc='upper right')

    plt.savefig(
        '/home/ubuntu/asset_classification/results/bigru_model_dataset1_preprocessed_chai.png'
    )  # !!!!!!! CHANGE THIS FOR OTHER MODELS

    tf.keras.utils.plot_model(
        model, to_file=config.bigru_architecture,
        show_shapes=True)  # !!!!!!! CHANGE THIS FOR OTHER MODELS

    return (y_pred, y_val_class, y_pred_class, y_val_class_argmax,
            y_pred_class_argmax)
예제 #4
0
    def main(self, glove):
        """Train the word-level BiLSTM-CRF tagger and keep the best epoch.

        Loads pretrained word vectors, converts the training corpus to
        word ids and numeric labels, groups sequences by length so each
        group can be fitted without padding, trains for ``self.epochsN``
        epochs, evaluates after every epoch and finally copies the weights
        of the best-scoring epoch to ``words_glove_<epoch>.h5``.

        Args:
            glove: If truthy, load GloVe vectors via ``utils.load_glove()``;
                otherwise load word2vec vectors via
                ``utils.load_word2vec()``.

        Returns:
            None. Weights for every epoch are written under
            ``words-results/``.
        """
        # Get word embeddings.
        utils = wordUtils.Utils()
        if glove:
            self.words_list, self.embedding_matrix = utils.load_glove()
        else:
            self.words_list, self.embedding_matrix = utils.load_word2vec()

        # Unknown words share the id one past the last known word; the
        # original comment says that embedding row is all zeros.
        unknown_id = len(self.words_list)

        # Word -> first position in words_list. Built once so each token is
        # an O(1) lookup instead of a linear list.index() scan;
        # setdefault keeps the first occurrence, matching list.index().
        word_to_id = {}
        for position, word in enumerate(self.words_list):
            word_to_id.setdefault(word, position)

        # Get the training corpus.
        cr = corpusreader.CorpusReader(self.textfile, self.annotfile)
        corpus = cr.trainseqs
        print(len(corpus))
        print("Processing training data", datetime.now())
        train = [
            {
                # Ids of the tokens in the embedding matrix.
                'tokens': [word_to_id.get(token, unknown_id)
                           for token in doc['tokens']],
                # SOBIE tags converted to numbers.
                'bion': [self.lablist[tag] for tag in doc['bio']],
            }
            for doc in corpus
        ]

        # Group token and label sequences by their length.
        train_l_d = {}
        train_l_labels = {}
        for seq in train:
            train_l_d.setdefault(len(seq['tokens']), []).append(seq['tokens'])
            train_l_labels.setdefault(len(seq['bion']), []).append(seq['bion'])

        # Sanity check: both groupings must line up sequence by sequence.
        sizes = list(train_l_d.keys())
        for i in sizes:
            if len(train_l_d[i]) != len(train_l_labels[i]):
                print("merda")

            for m, token_ids in enumerate(train_l_d[i]):
                if len(token_ids) != len(train_l_labels[i][m]):
                    print("XXX")

        # `word_input` rather than `input`, to avoid shadowing the builtin.
        word_input = Input(shape=(None,))
        el = Embedding(len(self.words_list) + 1, 200,
                       weights=[self.embedding_matrix],
                       trainable=False)(word_input)
        model = Bidirectional(
            LSTM(units=50, return_sequences=True,
                 recurrent_dropout=0.1))(el)  # variational biLSTM
        # A dense layer as suggested by neuralNer.
        model = TimeDistributed(Dense(50, activation="relu"))(model)
        crf = CRF(self.lab_len)  # CRF layer
        out = crf(model)  # output

        model = Model(word_input, out)
        model.compile(optimizer="rmsprop", loss=crf.loss_function,
                      metrics=[crf.accuracy])
        model.summary()

        test_data = 'corpus_char/tmVarCorpus/treated/test_data.txt'
        test_labels = 'corpus_char/tmVarCorpus/treated/test_labels.tsv'

        f_best = -1
        f_index = -1
        # OK, start actually training.
        for epoch in range(self.epochsN):
            print("Epoch", epoch, "start at", datetime.now())
            # Train one length group at a time; the order of the groups is
            # randomised except during the first few epochs.
            if epoch > 2:
                random.shuffle(sizes)
            for size in sizes:
                tx = np.array(train_l_d[size])
                ty = [to_categorical(labels, num_classes=self.lab_len)
                      for labels in train_l_labels[size]]

                # This trains in mini-batches.
                model.fit(tx, np.array(ty), verbose=0, epochs=1)
            print("Trained at", datetime.now())

            # Save all epochs.
            save_load_utils.save_all_weights(
                model, 'words-results/epoch_%s.h5' % epoch)
            # Test the results.
            self.test_model(test_data, test_labels, model, glove)
            f = self.eval()

            if f > f_best:
                f_best = f
                f_index = epoch

        # Pick the best model, and save it with a useful name.
        print("Choosing the best epoch")
        shutil.copyfile("words-results/epoch_%s.h5" % f_index,
                        "words_glove_%s.h5" % f_index)
예제 #5
0
    LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(input)
# Per-timestep dense projection (50 ReLU units) on top of the recurrent
# output held in `model`.
model = TimeDistributed(Dense(50, activation="relu"))(model)
# CRF_2nd is defined elsewhere (presumably a CRF output layer - confirm);
# it is sized by the number of entries in data.tag_to_index.
crf = CRF_2nd(len(data.tag_to_index))
out_layer = crf(model)

# From here on `model` is the trainable Model rather than an intermediate
# tensor; loss and accuracy are taken from the crf layer object.
model = Model(input, out_layer)
model.compile(optimizer="rmsprop",
              loss=crf.loss_function,
              metrics=[crf.accuracy])

model.summary()
BATCH_SIZE = 64
EPOCHS = 10
# Hold out 10% of the training data for validation; verbose=2 requests one
# log line per epoch.
history = model.fit(X_train,
                    np.array(y_train),
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=2)


def pred2label(pred):
    """Convert per-token score vectors into tag strings.

    For every score vector the index of the highest score is looked up in
    the module-level ``idx2tag`` mapping, and any occurrence of
    ``"PADword"`` in the resulting tag is replaced with ``"O"``.

    Args:
        pred: Iterable of sequences, each an iterable of score vectors.

    Returns:
        A list with one list of tag strings per input sequence.
    """
    return [
        [idx2tag[np.argmax(scores)].replace("PADword", "O")
         for scores in sequence]
        for sequence in pred
    ]

# Learning-rate decay, derived from the number of batches per epoch.
dataset_size = train_data.shape[0]
batches_per_epoch = dataset_size/batch_size
# NOTE(review): `1/32` relies on true division (Python 3, or
# `from __future__ import division`); with integer division it is 0 and this
# line raises ZeroDivisionError. With true division the numerator is 31,
# presumably chosen so the learning rate falls to 1/32 of its start value
# after one epoch of updates - confirm against the optimizer's decay formula.
lr_decay = (1./(1/32) -1)/batches_per_epoch
model.compile(
    optimizer=Adam(lr=0.012, decay=lr_decay),
    loss=crf.loss_function,
    metrics=[crf.accuracy]
)
model.summary()
# NOTE(review): mid-file import; plot_model is not used in the visible code.
from keras.utils.vis_utils import plot_model


# Two inputs per sample: X_w_* and X_c_* (presumably word-level and
# character-level features - confirm), the latter reshaped to
# (samples, max_len, max_len_char).
# `ephochs` (sic) is the epoch count defined elsewhere.
history = model.fit([X_w_tr,np.array(X_c_tr).reshape((len(X_c_tr), max_len, max_len_char))], np.array(y_tr),
                    batch_size=batch_size,
                    epochs=ephochs,
                    validation_data = ([X_w_v,np.array(X_c_v).reshape((len(X_c_v), max_len, max_len_char))], np.array(y_v)),
                    verbose=1,
                    )


# history.history is a dictionary; per the original note its keys are
# val_loss, val_acc, loss and acc. Load it into a DataFrame for plotting.
hist = pd.DataFrame(history.history)
fig = plt.figure(figsize=(12,12))
# Add two side-by-side subplots.
sub_fig1 = fig.add_subplot(1,2,1) # 1 row, 2 columns, first subplot
sub_fig2 = fig.add_subplot(1,2,2)
# Set titles.
sub_fig1.set_title('Accuracy')
sub_fig2.set_title('Loss')
print(hist)
# Set values and labels.
예제 #7
0
# Add a trailing singleton axis to the targets: (n, m) -> (n, m, 1).
Y = Y.reshape((Y.shape[0], Y.shape[1], 1))

# First 80% of the rows form the training split. `all` is the input array
# defined elsewhere (note that it shadows the builtin of the same name).
all_train = all[:int(0.8 * all.shape[0]), ...]
Y_train = Y[:int(0.8 * all.shape[0]), ...]
# Drop the singleton axis again and reduce each row to the index of its
# largest entry.
Y_train_dense = np.reshape(Y_train, (Y_train.shape[0], Y_train.shape[1]))
Y_train_dense = np.argmax(Y_train_dense, axis=-1)

# Remaining 20% of the rows form the test split.
all_test = all[int(0.8 * all.shape[0]):, ...]
Y_test = Y[int(0.8 * all.shape[0]):, ...]
# pu.db  (disabled debugger breakpoint)
Y_test_dense = np.reshape(Y_test, (Y_test.shape[0], Y_test.shape[1]))
Y_test_dense = np.argmax(Y_test_dense, axis=-1)

for i in xrange(100):
    print i
    model.fit(all_train, Y_train, batch_size=1000, epochs=5, verbose=1)

    Y_pred_train = model.predict(all_train, batch_size=1000)
    Y_pred_test = model.predict(all_test, batch_size=1000)

    Y_pred_train_dense = np.reshape(
        Y_pred_train, (Y_pred_train.shape[0], Y_pred_train.shape[1]))
    Y_pred_train_dense = np.argmax(Y_pred_train_dense, axis=-1)

    Y_pred_test_dense = np.reshape(
        Y_pred_test, (Y_pred_test.shape[0], Y_pred_test.shape[1]))
    Y_pred_test_dense = np.argmax(Y_pred_test_dense, axis=-1)

    train_acc = np.sum(
        Y_pred_train_dense == Y_train_dense) * 100.0 / len(Y_pred_train_dense)
    val_acc = np.sum(