Example #1
def TrainLstmCrf(data_name, model_name):
    n_classes = 4
    max_len = 75
    batch_size = 128
    epoch = 100
    tags = ['S', 'B', 'I', 'E']
    sentences, words = get_sents(datasets=data_name)
    print(len(sentences), len(words))
    word2idx = {w: i + 1 for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}
    vocab_size = len(words)

    X = [[word2idx[w[0]] for w in s] for s in sentences]
    X = pad_sequences(maxlen=max_len,
                      sequences=X,
                      padding="post",
                      value=vocab_size - 1)

    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len,
                      sequences=y,
                      padding="post",
                      value=tag2idx["E"])
    y = [to_categorical(i, num_classes=n_classes) for i in y]
    # split the data into train and test sets
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
    print(len(X_tr), len(y_tr), len(X_te), len(y_te))
    # sequence lengths for one batch, shaped (batch_size, 1) to match the Input below
    s = np.full((batch_size, 1), max_len, dtype='int32')

    # build the model
    word_ids = Input(batch_shape=(batch_size, max_len), dtype='int32')
    sequence_lengths = Input(batch_shape=[batch_size, 1], dtype='int32')
    print(sequence_lengths)
    # word ids run from 1 to vocab_size, so the embedding needs vocab_size + 1 rows
    word_embeddings = Embedding(vocab_size + 1, n_classes)(word_ids)
    blstm = Bidirectional(LSTM(units=50,
                               return_sequences=True))(word_embeddings)
    model = TimeDistributed(Dense(n_classes, activation='tanh'))(blstm)
    crf = CrfModel()
    pred = crf(inputs=[model, sequence_lengths])
    model = Model(inputs=[word_ids, sequence_lengths], outputs=[pred])
    print("word_ids:{}".format(word_ids))
    print("sequence_lengths:{}".format(sequence_lengths))
    model.compile(optimizer="rmsprop", loss=crf.loss, metrics=['accuracy'])

    print(model.summary())

    k = 0
    for batch_x, batch_y in minibatches(X_tr, y_tr, batch_size=batch_size):
        model.fit([batch_x, s],
                  np.array(batch_y),
                  epochs=epoch,
                  batch_size=batch_size)
        k += 1
        if k % 50 == 0:
            model.save("./models/{}_{}".format(k, model_name))
            print("saved")

    # save the final model
    model.save(model_name)
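The training loop above relies on helpers (get_sents, CrfModel, minibatches) defined elsewhere in the project. As a rough idea of the expected interface, a minimal sketch of what minibatches might look like (hypothetical, not the original implementation):

import numpy as np

def minibatches(X, y, batch_size=128):
    # Hypothetical helper: yields fixed-size (batch_x, batch_y) slices so the
    # model's fixed batch_shape of (batch_size, max_len) is always satisfied.
    for start in range(0, len(X), batch_size):
        batch_x = np.asarray(X[start:start + batch_size])
        batch_y = np.asarray(y[start:start + batch_size])
        if len(batch_x) == batch_size:
            yield batch_x, batch_y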
Example #2
def new_model(image_size=299, video_length=40, cnn_trainable=False):
    inputs = Input(shape=(video_length, image_size, image_size, 3))
    # pooling='avg' gives one feature vector per frame, which is what the LSTM below expects
    cnn = inception_v3.InceptionV3(include_top=False, weights='imagenet', pooling='avg')
    cnn.trainable = cnn_trainable  # freeze/unfreeze the CNN itself, not its output tensor
    model = TimeDistributed(cnn)(inputs)

    model = LSTM(512)(model)
    model = Dropout(0.5)(model)
    model = Dense(1, activation='sigmoid')(model)  # sigmoid matches binary_crossentropy; softmax over a single unit is constant
    model = Model(inputs=inputs, outputs=model)

    adam = keras.optimizers.Adam(learning_rate=1e-5)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    model.summary()
    return model
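A hedged smoke test for new_model, assuming binary labels in {0, 1} and random frames standing in for real video data:

import numpy as np

model = new_model(image_size=299, video_length=40, cnn_trainable=False)
# Two random "videos" of 40 RGB frames each; labels are assumed binary.
x = np.random.rand(2, 40, 299, 299, 3).astype("float32")
y = np.array([[0.0], [1.0]])
model.train_on_batch(x, y)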
Example #3
def seq2seq():
    n_classes = len(LABELS)

    converse_input = Input(shape=(None, SENTENCE_ENCODING_DIM))
    # length_input = Input(shape=(None, 1))
    # word_input = Input(shape=(None, WORD_EMBEDDING_DIM))
    time_input = Input(shape=(None, 1))

    converse = Masking(mask_value=-1.)(converse_input)
    converse = Dropout(0.2)(converse)
    converse = Bidirectional(LSTM(1024, return_sequences=True))(converse)
    converse = Bidirectional(LSTM(1024, return_sequences=True))(converse)
    converse = Dropout(0.3)(converse)

    # lengths = Masking(mask_value=-1)(length_input)

    # words = Masking(mask_value=-1.)(word_input)
    # words = Dropout(0.2)(words)

    model = concatenate([converse, time_input], axis=-1)

    # print("merged outpout shape", model.output_shape)

    model = TimeDistributed(Dense(1024, activation='relu'))(model)
    model = Dropout(0.3)(model)
    model = TimeDistributed(Dense(512, activation='relu'))(model)
    model = Dropout(0.3)(model)
    # predictions = TimeDistributed(Dense(n_classes, activation='softmax'))(model)

    crf = CRF(n_classes, sparse_target=True)
    predictions = crf(model)

    model = Model(inputs=[converse_input, time_input], outputs=predictions)
    model.summary()
    # model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
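Because the CRF is built with sparse_target=True, the targets are integer label ids of shape (batch, timesteps, 1) rather than one-hot vectors. A hedged usage sketch, assuming SENTENCE_ENCODING_DIM and LABELS are defined in the original module:

import numpy as np

model = seq2seq()
# 8 conversations of 20 utterances each, with random features as placeholders.
conv = np.random.rand(8, 20, SENTENCE_ENCODING_DIM).astype("float32")
times = np.random.rand(8, 20, 1).astype("float32")
y = np.random.randint(0, len(LABELS), size=(8, 20, 1))  # sparse integer targets
model.fit([conv, times], y, batch_size=4, epochs=1)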
Example #4
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x)  # maybe try dropout 0.1?
model = TimeDistributed(Dense(50, activation="relu"))(main_lstm)

crf = CRF(n_tags+1)  # CRF layer; n_tags + 1 to include the padding label (PD)
out = crf(model)  # output

# out = Lambda(lambda x: K.reshape(x,(-1,5)))(out)
model = Model([word_in, char_in], out)

# set optimizer 
# rmsprop = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=1e-5)

adam = optimizers.Adam(lr=0.01, epsilon=None, decay=1e-1)
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy])  # use the CRF loss and accuracy
model.summary()
#sample_weight_mode="temporal"


tr_pubs = pub_ids[:int(len(pub_ids)*0.9)]
val_pubs = pub_ids[int(len(pub_ids)*0.9):] 

train = subdata_getter(tr_pubs, data)
validation = subdata_getter(val_pubs, data)


tr_generator = DataGenerator(tr_pubs, train)
val_generator = DataGenerator(val_pubs, validation)

history = NBatchLogger()
model.fit_generator(generator=tr_generator, shuffle=False, epochs=10,
                    verbose=0, callbacks=[history])  # ,callbacks=callbacks_list
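DataGenerator and NBatchLogger are project-specific and not shown; a hypothetical sketch of a Sequence-based DataGenerator consistent with the fit_generator call above (the field names are assumptions):

import numpy as np
from keras.utils import Sequence

class DataGenerator(Sequence):
    # Hypothetical: one publication per batch, so variable-length sequences
    # never have to be padded against each other.
    def __init__(self, pub_ids, data):
        self.pub_ids = pub_ids
        self.data = data

    def __len__(self):
        return len(self.pub_ids)

    def __getitem__(self, i):
        sample = self.data[self.pub_ids[i]]
        # Assumed fields: word ids, char-id matrices and tag labels per token;
        # the label encoding must match the CRF configuration (one-hot by default).
        x_word = np.array([sample["words"]])
        x_char = np.array([sample["chars"]])
        y = np.array([sample["tags"]])
        return [x_word, x_char], y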
Example #5
    pos_emb = Embedding(input_dim=len(pos),
                        output_dim=10,
                        input_length=max_len)(pos_input)
    modified_input = keras.layers.concatenate([word_emb, pos_emb])
    model_1 = Bidirectional(
        LSTM(units=50, return_sequences=True,
             recurrent_dropout=0.1))(modified_input)
    model = TimeDistributed(Dense(50, activation="relu"))(
        model_1)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model([input, pos_input], out)
    model.compile(optimizer="rmsprop",
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])
    print(model.summary())
    history = model.fit([X_tr, X_pos_tr],
                        np.array(y_tr),
                        batch_size=32,
                        epochs=60,
                        validation_split=0.1,
                        verbose=1)
    #Testing
    test_pred = model.predict([X_te, X_pos_te], verbose=1)
    idx2tag = {i: w for w, i in tag2idx.items()}
    pred_labels = pred2label(test_pred)
    test_labels = pred2label(y_te)
    print("Recall, Precision and F-score are",
          get_recall_precision(test_labels, pred_labels, "Destination"))
    model.save("BILSTM+CRF_with_pos_without_embeddings.model")
Example #6
    def test_exist(self, glove, test_data, test_labels):
        # get word embeddings
        utils = wordUtils.Utils()

        if glove:
            # use glove
            self.words_list, self.embedding_matrix = utils.load_glove()
            unword_n = len(self.words_list)

        else:
            self.words_list, self.embedding_matrix = utils.load_word2vec()
            unword_n = len(self.words_list)

        # get the training corpus
        cr = corpusreader.CorpusReader(test_data, test_labels)
        corpus = cr.trainseqs

        # get the number of the embedding
        for idx in range(len(corpus)):
            words = corpus[idx]['tokens']
            words_id = []
            for i in words:

                # get the number of the embedding
                try:
                    # the index of the word in the embedding matrix
                    index = self.words_list.index(i)
                except ValueError:
                    # use the embedding full of zeros to identify an unknown word
                    index = unword_n

                # the index of the word in the embedding matrix
                words_id.append(index)

            corpus[idx]['embs'] = words_id

        input = Input(shape=(None,))
        el = Embedding(len(self.words_list) + 1, 200, weights=[self.embedding_matrix], trainable=False)(input)
        bl1 = Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                            merge_mode="concat",
                            name="lstm1")(el)
        bl2 = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                            merge_mode="concat",
                            name="lstm2")(bl1)
        bl3 = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                            merge_mode="concat",
                            name="lstm3")(bl2)
        model = TimeDistributed(Dense(50, activation="relu"))(bl3)  # a dense layer as suggested by neuralNer
        crf = CRF(self.lab_len)  # CRF layer
        out = crf(model)  # output

        model = Model(input, out)
        model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
        model.summary()
        save_load_utils.load_all_weights(model, 'word_models/words_glove_multiLSTM31.h5')

        for doc in corpus:
            doc_arr = doc['embs']
            p = model.predict(np.array([doc_arr]))
            p = np.argmax(p, axis=-1)

            position = 0
            offsets = defaultdict(list)
            counter = 0
            # check if there are any mutations identified
            # {'O': 0, 'B-E': 1, 'I-E': 2, 'E-E': 3, 'S-E': 4}
            B = False
            last = 0
            for idx in p[0]:
                if idx == 1 and last == 1:
                    counter = counter + 1
                    offsets[counter].append(position)
                    B = True
                elif idx == 1:
                    B = True
                    offsets[counter].append(position)
                    last = 1
                elif idx == 2 and B:
                    offsets[counter].append(position)
                    last = 2
                elif idx == 3 and B:
                    offsets[counter].append(position)
                    last = 3
                    B = False
                    counter = counter + 1
                elif idx == 4:
                    offsets[counter].append(position)
                    counter = counter + 1
                    last = 4
                else:
                    B = False

                position = position + 1

            # open file to write
            textid = str(doc['textid'])
            abstract = open("words-silver/" + textid + ".a1", 'w')
            for i in offsets:
                word = offsets.get(i)
                size = len(word)
                if size == 1:
                    s = word[0]  # just one; singleton

                    abstract.write(str(doc['tokstart'][s]) + "\t")
                    abstract.write(str(doc['tokend'][s]) + "\t")
                    abstract.write(str(doc['tokens'][s]) + "\n")


                elif size > 1:
                    s = word[0]  # start of token
                    e = word[-1]  # end of token

                    abstract.write(str(doc['tokstart'][s]) + "\t")
                    abstract.write(str(doc['tokend'][e]) + "\t")
                    token = ""
                    for c in word:
                        token = token + doc['tokens'][c]

                    abstract.write(str(token) + "\n")
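The snippet opens one .a1 file per document without closing it; a hedged variant of the writing step that produces the same tab-separated lines (start offset, end offset, surface form) but closes the handle via a context manager:

def write_annotations(doc, offsets, out_dir="words-silver"):
    # Same output format as above, assuming the same doc fields.
    path = "{}/{}.a1".format(out_dir, doc['textid'])
    with open(path, 'w') as abstract:
        for positions in offsets.values():
            start, end = positions[0], positions[-1]
            token = "".join(doc['tokens'][c] for c in positions)
            abstract.write("{}\t{}\t{}\n".format(doc['tokstart'][start],
                                                 doc['tokend'][end],
                                                 token))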
Example #7
                                    outputs=[conv_model_time_distributed])
conv_model_time_distributed._uses_learning_phase = True  # True during training, False at test time

#Visualize Model:
if flag_plot_model == 1:
    keras.utils.plot_model(conv_model_single_image_as_model)
    keras.utils.vis_utils.plot_model(conv_model_single_image_as_model)
    from IPython.display import SVG
    from keras.utils.vis_utils import model_to_dot
    SVG(
        model_to_dot(conv_model_single_image_as_model).create(prog='dot',
                                                              format='svg'))

#Summarize Model:
conv_model_single_image_as_model.summary()
conv_model_time_distributed.summary()


def clip_shift_layer(predicted_shifts, max_shift=1):
    #    predicted_shifts[(predicted_shifted > max_shift)] = max_shift;
    return K.clip(predicted_shifts, -max_shift, max_shift)
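A hedged usage sketch of clip_shift_layer inside a model, wrapping it in a Lambda layer so a 2-unit (dx, dy) head stays within [-max_shift, max_shift]; the feature size and variable names here are made up:

from keras.layers import Input, Dense, Lambda
from keras.models import Model

features = Input(shape=(64,))
raw_shift = Dense(2)(features)
clipped_shift = Lambda(lambda t: clip_shift_layer(t, max_shift=1))(raw_shift)
shift_model = Model(features, clipped_shift)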


def custom_loss_function(predicted_shifts, true_shifts):
    #if i predict images
    max_shift = max_shift_number_global
    predicted_x = predicted_shifts[0]
    predicted_y = predicted_shifts[1]
    true_x = true_shifts[0]
    true_y = true_shifts[1]
    difference_clipped = K.clip(K.abs(predicted_shifts - true_shifts),
Example #8
    def model_with_padding(self, DICT, n_char):

        # get sequences and labels separated.
        # convert BIO tags to numbers
        sequences, labels = self.get_seq(DICT)

        # sequences = sequences[:100]
        # labels = labels[:100]

        # X = pad_sequences(sequences, maxlen=self.w_arit_mean, padding='post', truncating='post')
        # y_pad = pad_sequences(labels, maxlen=self.w_arit_mean, padding='post', truncating='post')

        X = pad_sequences(sequences, maxlen=self.maxSeqLength, padding='post')
        y_pad = pad_sequences(labels, maxlen=self.maxSeqLength, padding='post')

        y = [to_categorical(i, num_classes=self.lab_len) for i in y_pad]

        # early stopping and best epoch
        #early_stop = keras.callbacks.EarlyStopping(monitor='loss', patience=2, verbose=0, mode='auto')
        #filepath = "max-seq.h5"
        #checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='max')
        #callbacks_list = [checkpoint, early_stop]

        # Set up the keras model
        input = Input(shape=(self.maxSeqLength, ))
        el = Embedding(n_char + 1, 200, name="embed")(input)
        bl1 = Bidirectional(LSTM(128,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm1")(el)
        bl2 = Bidirectional(LSTM(64,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm2")(bl1)
        bl3 = Bidirectional(LSTM(64,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm3")(bl2)
        model = TimeDistributed(Dense(self.lab_len, activation="relu"))(bl3)
        crf = CRF(self.lab_len)  # CRF layer
        out = crf(model)  # output

        model = Model(input, out)
        model.compile(optimizer="rmsprop",
                      loss=crf.loss_function,
                      metrics=[crf.accuracy])
        model.summary()

        # train with 32, 147, 245, 735
        history = model.fit(X,
                            np.array(y),
                            batch_size=32,
                            epochs=self.epochsN,
                            validation_split=0.0,
                            verbose=1)
        # save all epochs
        save_load_utils.save_all_weights(model,
                                         'max_seq_%s_32b.h5' % self.epochsN)
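Because every sequence is padded to maxSeqLength, predictions also cover padding positions; a hedged post-processing sketch that trims predictions back to each sequence's true length before evaluation (the lengths are assumed to be tracked by the caller):

import numpy as np

def trim_predictions(pred, lengths):
    # Hypothetical helper: drop the padded tail of each predicted sequence
    # and keep the argmax tag id per remaining token.
    return [np.argmax(p[:length], axis=-1) for p, length in zip(pred, lengths)]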
Example #9
    def model_no_padding(self, DICT, n_char):

        # convert BIO tags to numbers
        self.convert_tags()
        '''
        check if bion contains 'B' and 'I'
        for i in self.train_data:
            print(i['bion'])
        '''

        for i in range(len(self.train_data)):
            corp = self.train_data[i]['corpus']

            corp_num = []
            for c in corp:
                corp_num.append(DICT.get(c))
            self.train_data[i]['corpus'] = corp_num

        # get all sizes from the sequences with training data
        train_l_d = {}
        train_l_labels = {}
        for seq in self.train_data:
            # corpus
            l = len(seq['corpus'])
            if l not in train_l_d: train_l_d[l] = []
            train_l_d[l].append(seq['corpus'])

            # labels
            l1 = len(seq['bion'])
            if l1 not in train_l_labels: train_l_labels[l1] = []
            train_l_labels[l1].append(seq['bion'])
        '''
        for i in range(len(train_l_d[110])):
            print(len(train_l_d[110][i]) == len(train_l_labels[110][i]))
            print()
        print("\n\n")

        for i in range(len(train_l_d[31])):
            print(len(train_l_d[31][i]) == len(train_l_labels[31][i]))
        print("\n\n")

        for i in range(len(train_l_d[103])):
            print(len(train_l_d[103][i]) == len(train_l_labels[103][i]))
        print("\n\n")
        exit()
        '''
        sizes = list(train_l_d.keys())

        # Set up the keras model
        il = Input(shape=(None, ), dtype='int32')
        el = Embedding(n_char + 1, 200, name="embed")(il)
        bl1 = Bidirectional(LSTM(128,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm1")(el)
        bl2 = Bidirectional(LSTM(64,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm2")(bl1)
        bl3 = Bidirectional(LSTM(64,
                                 return_sequences=True,
                                 recurrent_dropout=0.5,
                                 dropout=0.5),
                            merge_mode="concat",
                            name="lstm3")(bl2)
        model = TimeDistributed(Dense(self.num_labs, activation="relu"))(bl3)
        crf = CRF(self.num_labs)  # CRF layer
        out = crf(model)  # output

        model = Model(il, out)
        model.compile(optimizer="rmsprop",
                      loss=crf.loss_function,
                      metrics=[crf.accuracy])
        model.summary()

        f_best = -1
        f_index = -1
        # OK, start actually training
        for epoch in range(self.epochsN):
            print("Epoch", epoch, "start at", datetime.now())
            # Train in batches of different sizes - randomize the order of sizes
            # Except for the first few epochs
            if epoch > 2:
                random.shuffle(sizes)
            for size in sizes:
                batch = train_l_d[size]
                labs = train_l_labels[size]

                tx = np.array(batch)
                y = list(labs)

                ty = [to_categorical(i, num_classes=self.num_labs) for i in y]

                # This trains in mini-batches
                model.fit(tx, np.array(ty), verbose=0, epochs=1)
            print("Trained at", datetime.now())

            # save all epochs
            save_load_utils.save_all_weights(
                model, 'mini-batch-results/epoch_%s.h5' % epoch)
            # test the results
            self.test_minibatch(DICT, model)
            f = self.eval()

            if f > f_best:
                f_best = f
                f_index = epoch

        # Pick the best model, and save it with a useful name
        print("Choosing the best epoch")
        shutil.copyfile("mini-batch-results/epoch_%s.h5" % f_index,
                        "minibatch_%s.h5" % f_index)