batch_size = 32 # バッチサイズ vocab_size = 1000 # 扱う語彙の数 embedding_dim = 100 # 単語ベクトルの次元 seq_length1 = 20 # 質問の長さ seq_length2 = 10 # 回答の長さ lstm_units = 200 # LSTMの隠れ状態ベクトルの次元数 hidden_dim = lstm_units * 2 # 最終出力のベクトルの次元数 def abs_sub(x): return K.abs(x[0] - x[1]) embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim) input1 = Input(shape=(seq_length1,)) embed1 = embedding(input1) bilstm1 = Bidirectional(LSTM(lstm_units, return_sequences=True), merge_mode='concat')(embed1) h1 = Dropout(0.2)(bilstm1) input2 = Input(shape=(seq_length2,)) embed2 = embedding(input2) bilstm2 = Bidirectional(LSTM(lstm_units, return_sequences=True), merge_mode='concat')(embed2) h2 = Dropout(0.2)(bilstm2) # 要素ごとの積を計算する product = dot([h2, h1], axes=2) # サイズ:[バッチサイズ、回答の長さ、質問の長さ] a = Activation('softmax')(product) c = dot([a, h1], axes=[2, 1]) c_h2 = concatenate([c, h2], axis=2) h = Dense(hidden_dim, activation='tanh')(c_h2) mean_pooled_1 = AveragePooling1D(pool_size=seq_length1, strides=1, padding='valid')(h1)
def train(params, checkpoint_directory, queue):
    """Build, train and evaluate a stacked bidirectional-LSTM sequence tagger.

    The network is an embedding layer, ``params['lstm']['layer']`` pairs of
    bidirectional LSTM + dropout, and a time-distributed softmax over
    ``constant.NUM_TAGS``. The architecture (JSON) and the layer config are
    written into ``checkpoint_directory`` before training starts.

    The function also reads the module-level names ``num_step``, ``x_train``,
    ``y_train``, ``x_test``, ``y_test``, ``epochs`` and ``shuffle``.

    Args:
        params: mapping with keys ``'embedding_neuron'``, ``'lstm'`` (itself a
            mapping with ``'layer'``, ``'neuron'`` and ``'dropout'``),
            ``'optimizer'`` and ``'batch_size'``.
        checkpoint_directory: directory to create for this run; a
            ``tensorboard`` sub-directory is created inside it. Both are
            created with ``os.makedirs`` without ``exist_ok``, so an existing
            directory raises.
        queue: object with a ``put`` method; receives the validation accuracy.
    """
    embed_units = params['embedding_neuron']
    lstm_cfg = params['lstm']
    n_layers = lstm_cfg['layer']
    optimizer = params['optimizer']
    batch_size = params['batch_size']

    print("[Params]", params)

    # Output locations for this run.
    tb_dir = os.path.join(checkpoint_directory, "tensorboard")
    os.makedirs(checkpoint_directory)
    os.makedirs(tb_dir)

    model = Sequential()
    model.add(Embedding(constant.NUM_CHARS, embed_units, input_length=num_step))

    # One BiLSTM + dropout pair per configured layer; the same rate is used
    # for input, recurrent and post-layer dropout.
    for depth in range(n_layers):
        units = lstm_cfg['neuron'][depth]
        rate = lstm_cfg['dropout'][depth]
        recurrent = LSTM(units,
                         return_sequences=True,
                         unroll=True,
                         dropout=rate,
                         recurrent_dropout=rate)
        model.add(Bidirectional(recurrent))
        model.add(Dropout(rate))

    # Per-time-step tag distribution.
    tag_head = TimeDistributed(
        Dense(constant.NUM_TAGS, activation="softmax"),
        input_shape=(num_step, lstm_cfg['neuron'][-1]))
    model.add(tag_head)

    model.compile(loss="categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["categorical_accuracy"])

    # Persist the architecture and the full layer config next to the checkpoints.
    arch_path = os.path.join(checkpoint_directory, "model.json")
    with open(arch_path, "w") as arch_file:
        arch_file.write(model.to_json())

    config_path = os.path.join(checkpoint_directory, "model_config.txt")
    with open(config_path, "w") as config_file:
        pprint(model.get_config(), stream=config_file)

    model.summary()

    # Callback configuration: early stopping is switched off.
    callback_settings = DottableDict({
        "es_enable": False,
        "es_min_delta": 0,
        "es_patience": 0
    })
    callback_paths = DottableDict({
        "checkpoint": checkpoint_directory,
        "tensorboard": tb_dir,
        "loss_log": os.path.join(checkpoint_directory, "loss.csv"),
        "score_log": os.path.join(checkpoint_directory, "score.csv")
    })
    callbacks = CustomCallback(callback_settings, callback_paths).callbacks

    model.fit(x_train,
              y_train,
              validation_data=(x_test, y_test),
              epochs=epochs,
              batch_size=batch_size,
              verbose=2,
              callbacks=callbacks,
              shuffle=shuffle)

    # model.evaluate returns [loss, categorical_accuracy]; only the metric is reported.
    _, accuracy = model.evaluate(x_test, y_test, verbose=0)

    print("[Validation] categorical_accuracy:", accuracy)
    print("")

    queue.put(accuracy)
images = []
count = 0
embedding_size = 300

# Image branch: project the 2048-d image feature vector to `embedding_size`
# and repeat it `max_len` times so it forms a sequence.
image_model = Sequential([
    Dense(embedding_size, input_shape=(2048,), activation='relu'),
    RepeatVector(max_len)
])

# Caption branch: embed the partial caption, run an LSTM over it and project
# every step to 300 dimensions.
caption_model = Sequential([
    Embedding(vocab_size, embedding_size, input_length=max_len),
    LSTM(256, return_sequences=True),
    TimeDistributed(Dense(300))
])

# Joint model: concatenate both branches along axis 1, summarise with a
# bidirectional LSTM and predict a distribution over the vocabulary.
# NOTE(review): `Merge(..., mode='concat', concat_axis=1)` looks like the
# Keras 1 layer API — confirm the pinned Keras version.
final_model = Sequential([
    Merge([image_model, caption_model], mode='concat', concat_axis=1),
    Bidirectional(LSTM(256, return_sequences=False)),
    Dense(vocab_size),
    Activation('softmax')
])
final_model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
print(final_model.summary())
# NOTE(review): absolute, machine-specific path to the pretrained weights.
final_model.load_weights('E:\\PycharmProjects\\image captioningg\\Image-Captioning-master11\\weights\\time_inceptionV3_1.5987_loss.h5')


def predict_captions(image):
    # The caption is seeded with the start token.
    start_word = ["<start>"]
    # NOTE(review): no exit condition for this loop is visible here — confirm
    # against the full function.
    while True:
        # Map the words generated so far to indices and pad to `max_len`.
        par_caps = [word2idx[i] for i in start_word]
        par_caps = sequence.pad_sequences([par_caps], maxlen=max_len, padding='post')
def create_model(params, computed_params):
    """Build and compile the question/premises/word classifier.

    Every input sentence (the question plus ``max_nb_premises`` premises) is
    encoded by the architecture selected with ``params['net_arch']``. The
    sentence encodings and the raw ``word`` vector are concatenated, passed
    through up to three optional ReLU dense layers and classified by a
    2-way softmax.

    Args:
        params: mapping with ``'net_arch'`` (``'lstm'``, ``'lstm(cnn)'`` or
            ``'cnn'``), ``'units1'``/``'units2'``/``'units3'`` (a dense layer
            is added only when the value is > 0) and ``'optimizer'``.
            ``'rnn_size'`` is required by the LSTM-based architectures;
            ``'nb_filters'``, ``'max_kernel_size'`` and ``'pooling'``
            (``'max'`` or ``'average'``) by the convolutional ones.
        computed_params: mapping with ``'max_inputseq_len'``, ``'word_dims'``
            and ``'max_nb_premises'``.

    Returns:
        The compiled Keras ``Model``. Its inputs are, in order, the question,
        the premises and the word.

    Raises:
        NotImplementedError: for an unknown ``net_arch`` or ``pooling`` value.
    """
    net_arch = params['net_arch']
    logging.info('Constructing neural net: {}...'.format(net_arch))

    max_inputseq_len = computed_params['max_inputseq_len']
    word_dims = computed_params['word_dims']
    max_nb_premises = computed_params['max_nb_premises']

    inputs = []
    input_question = Input(shape=(max_inputseq_len, word_dims,),
                           dtype='float32',
                           name='question')
    inputs.append(input_question)

    for ipremise in range(max_nb_premises):
        input_premise = Input(shape=(max_inputseq_len, word_dims,),
                              dtype='float32',
                              name='premise{}'.format(ipremise))
        inputs.append(input_premise)

    input_word = Input(shape=(word_dims,), dtype='float32', name='word')

    def new_pooling(max_cls, average_cls):
        """Instantiate the pooling layer selected by ``params['pooling']``."""
        mode = params['pooling']
        if mode == 'max':
            return max_cls()
        if mode == 'average':
            return average_cls()
        raise NotImplementedError()

    layers = []

    if net_arch == 'lstm':
        rnn_size = params['rnn_size']

        # LSTM-based encoder that packs the words of a sentence into a single
        # vector. The layer is shared by all input sentences.
        shared_words_rnn = Bidirectional(
            recurrent.LSTM(rnn_size,
                           input_shape=(max_inputseq_len, word_dims),
                           return_sequences=False))

        for sentence in inputs:
            layers.append(shared_words_rnn(sentence))

    elif net_arch == 'lstm(cnn)':
        rnn_size = params['rnn_size']
        nb_filters = params['nb_filters']
        max_kernel_size = params['max_kernel_size']

        for kernel_size in range(1, max_kernel_size + 1):
            # Convolutional layers come first; they act as detectors of word
            # combinations and syntactic constructions. One conv/LSTM pair per
            # kernel size, shared by all input sentences.
            conv = Conv1D(filters=nb_filters,
                          kernel_size=kernel_size,
                          padding='valid',
                          activation='relu',
                          strides=1,
                          name='shared_conv_{}'.format(kernel_size))
            lstm = recurrent.LSTM(rnn_size, return_sequences=False)

            for sentence in inputs:
                encoded = conv(sentence)
                pooling = new_pooling(keras.layers.MaxPooling1D,
                                      keras.layers.AveragePooling1D)
                encoded = pooling(encoded)
                encoded = lstm(encoded)
                layers.append(encoded)

    elif net_arch == 'cnn':
        nb_filters = params['nb_filters']
        max_kernel_size = params['max_kernel_size']

        for kernel_size in range(1, max_kernel_size + 1):
            conv = Conv1D(filters=nb_filters,
                          kernel_size=kernel_size,
                          padding='valid',
                          activation='relu',
                          strides=1,
                          name='shared_conv_{}'.format(kernel_size))

            for sentence in inputs:
                encoded = conv(sentence)
                pooling = new_pooling(keras.layers.GlobalMaxPooling1D,
                                      keras.layers.GlobalAveragePooling1D)
                encoded = pooling(encoded)
                layers.append(encoded)
    else:
        raise NotImplementedError()

    # The word vector bypasses the sentence encoders and is fed to the
    # classifier directly.
    layers.append(input_word)

    encoder_merged = keras.layers.concatenate(inputs=list(layers))
    decoder = encoder_merged

    # Optional hidden layers; a non-positive size disables the layer.
    for units_key in ('units1', 'units2', 'units3'):
        if params[units_key] > 0:
            decoder = Dense(params[units_key], activation='relu')(decoder)

    output_dims = 2
    decoder = Dense(output_dims, activation='softmax', name='output')(decoder)

    inputs2 = list(itertools.chain(inputs, [input_word]))
    model = Model(inputs=inputs2, outputs=decoder)
    model.compile(loss='categorical_crossentropy',
                  optimizer=params['optimizer'],
                  metrics=['accuracy'])
    return model
# Standardise feature dimension `dim_counter` to zero mean / unit variance
# over all samples and frames.
# NOTE(review): presumably runs once per feature dimension inside a loop that
# starts above this excerpt — confirm.
feats[:, :, dim_counter] = (feats[:, :, dim_counter] - np.mean(
    feats[:, :, dim_counter])) / np.std(feats[:, :, dim_counter])

# Random 95% / 5% split over sample indices.
train_ind, test_ind = train_test_split(range(len(labels_ind)), test_size=0.05)
feats_train = feats[train_ind, :, :]
labels_ind_train = labels_1hot[train_ind, :]
feats_test = feats[test_ind, :, :]
labels_ind_test = labels_1hot[test_ind, :]

# Two stacked bidirectional LSTMs: the first returns the full sequence, the
# second only its final state, which feeds a softmax classifier.
model = Sequential()
model.add(
    Bidirectional(LSTM(LAYER_SIZE1,
                       dropout=0.2,
                       recurrent_dropout=0.2,
                       return_sequences=True),
                  input_shape=(frame_dim, vec_dim)))
model.add(
    Bidirectional(
        LSTM(LAYER_SIZE2,
             dropout=0.2,
             recurrent_dropout=0.2,
             return_sequences=False)))
model.add(Dense(out_dict_size, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])
trainx1, trainx2, trainy = GetXY(train_questions, train_entitys) #(num_sample,max_len) testx1, testx2, testy = GetXY(test_questions, test_entitys) print(trainx1.shape) #搭建模型 bert_model = load_trained_model_from_checkpoint( config_path, checkpoint_path, seq_len=None) #这里预训练的bert模型被看待为一个keras层 for l in bert_model.layers: l.trainable = True x1_in = Input(shape=(None, )) x2_in = Input(shape=(None, )) x = bert_model([x1_in, x2_in]) #(batch,step,feature) x = Bidirectional(LSTM(512, return_sequences=True, recurrent_dropout=0.2))(x) p = Dense(1, activation='sigmoid')(x) model = Model([x1_in, x2_in], p) model.compile(loss='binary_crossentropy', optimizer=Adam(1e-5), metrics=['accuracy']) model.summary() #训练模型 maxf = 0.0 def computeF(gold_entity, pre_entity): ''' 根据标注的实体位置和预测的实体位置,计算prf,完全匹配 输入: Python-list 3D,值为每个实体的起始位置列表[begin,end]
def build(input_shape=(32, None, 1), rnn_unit=256, num_classes=5991, max_string_len=10):
    """Build a convolutional-recurrent recogniser and its CTC training wrapper.

    The feature extractor is a stack of seven ``Conv2D`` layers with four
    max-pooling stages; ``pool3`` and ``pool4`` use a ``(2, 1)`` window, so
    they shrink only the first spatial axis. The feature map is then permuted
    and flattened per step, fed through two bidirectional GRUs and projected
    to a per-step softmax over ``num_classes``.

    Args:
        input_shape: shape of one input sample, without the batch axis.
            NOTE(review): presumably (height, width, channels) with a variable
            width — confirm against the data pipeline.
        rnn_unit: number of units in each GRU direction.
        num_classes: size of the softmax output.
        max_string_len: length of the padded label sequence.

    Returns:
        A ``(base_model, model)`` tuple. ``base_model`` maps the image input
        to the per-step class probabilities. ``model`` takes
        ``[image, label, seq_length, label_length]`` and outputs the value
        computed by ``ctc_lambda_func``.
    """
    # Named `image_in` instead of `input` to avoid shadowing the builtin.
    image_in = Input(shape=input_shape, name='the_input')

    m = Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same', name='conv1')(image_in)
    m = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name='pool1')(m)
    m = Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same', name='conv2')(m)
    m = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name='pool2')(m)
    m = Conv2D(256, kernel_size=(3, 3), activation='relu', padding='same', name='conv3')(m)
    m = Conv2D(256, kernel_size=(3, 3), activation='relu', padding='same', name='conv4')(m)
    # From here on only the first spatial axis is pooled.
    m = MaxPooling2D(pool_size=(2, 1), strides=(2, 1), padding='valid', name='pool3')(m)
    m = Conv2D(512, kernel_size=(3, 3), activation='relu', padding='same', name='conv5')(m)
    m = BatchNormalization(axis=3)(m)
    m = Conv2D(512, kernel_size=(3, 3), activation='relu', padding='same', name='conv6')(m)
    m = BatchNormalization(axis=3)(m)
    m = MaxPooling2D(pool_size=(2, 1), strides=(2, 1), padding='valid', name='pool4')(m)
    m = Conv2D(512, kernel_size=(2, 2), activation='relu', padding='valid', name='conv7')(m)

    # Swap the two spatial axes so the second one becomes the time axis, then
    # flatten the remaining axes into one feature vector per step.
    m = Permute((2, 1, 3), name='permute')(m)
    m = TimeDistributed(Flatten(), name='timedistrib')(m)

    # The layer names say "blstm" although the cells are GRUs; the names are
    # kept unchanged because weights may be looked up by layer name.
    m = Bidirectional(GRU(rnn_unit, return_sequences=True, implementation=2), name='blstm1')(m)
    m = Bidirectional(GRU(rnn_unit, return_sequences=True, implementation=2), name='blstm2')(m)
    y_pred = Dense(num_classes, name='blstm2_out', activation='softmax')(m)

    base_model = Model(inputs=image_in, outputs=y_pred)

    # Extra inputs needed only by the CTC loss.
    label = Input(name='label', shape=[max_string_len], dtype='int64')
    seq_length = Input(name='seq_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')

    loss_out = Lambda(ctc_lambda_func, output_shape=(1, ), name='ctc')(
        [label, y_pred, seq_length, label_length])

    # Use the Keras 2 keywords `inputs`/`outputs`, consistent with
    # `base_model` above; the Keras 1 spellings `input`/`output` are rejected
    # by current Keras releases.
    model = Model(inputs=[image_in, label, seq_length, label_length],
                  outputs=[loss_out])
    model.summary()
    return base_model, model
# Read the first dataset of the already-open training file into an array.
states_train = list(f_train.keys())[0]
projed_rep_Ru_train = list(f_train[states_train])
projed_rep_Ru_train = np.array(projed_rep_Ru_train)

# Load Russian test projected data
# NOTE(review): the "test_1k" file is used below as the validation set.
f_test = h5py.File("./output_adv_NoMT/projected_rep_%s_test_1k.hdf5"%lang, 'r')
states_test = list(f_test.keys())[0]
projed_rep_Ru_test = list(f_test[states_test])
projed_rep_Ru_val = np.array(projed_rep_Ru_test)

# Building model: one bidirectional LSTM over (150, 200) inputs, keeping only
# the final state, followed by a sigmoid unit for binary classification.
myInput = Input(shape=(150,200))
LSTM_Russian = Bidirectional(LSTM(100,return_sequences=False))(myInput)
#LSTM_Russian=Bidirectional(LSTM(32, return_sequences=False))(LSTM_Russian)
predictions = Dense(1, activation='sigmoid')(LSTM_Russian)
model_Ru = Model(inputs=myInput, outputs=predictions)
model_Ru.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model_Ru.summary())
print(len(projed_rep_Ru_train))
print(len(y_train_Ru))

# Both classes are weighted equally.
class_weight = {0: 1.,1: 1}
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
# `validation_data` is passed as a tuple, the form documented by Keras; some
# tf.keras releases treat a list as "inputs only" and fail.
model_Ru.fit(projed_rep_Ru_train, y_train_Ru, epochs=10, batch_size=32,
             validation_data=(projed_rep_Ru_val, y_val_Ru),
             callbacks=[early_stopping], class_weight=class_weight)
# Load Russian test projected data
def create_model(params, computed_params):
    """Build and compile a two-class classifier over one word sequence.

    ``params['net_arch']`` selects the encoder: ``'rnn'`` (a bidirectional
    LSTM), ``'rnn(cnn)'`` (for each kernel size a Conv1D, local pooling and an
    LSTM) or ``'cnn'`` (for each kernel size a Conv1D and global pooling).
    The encoder outputs are concatenated when there is more than one, passed
    through an optional dense layer (``params['units1']`` > 0, activation
    ``params['activation1']``) and classified by a 2-way softmax.

    Args:
        params: architecture and optimiser settings, as described above, plus
            ``'rnn_size'``, ``'nb_filters'``, ``'max_kernel_size'`` and
            ``'pooling'`` (``'max'`` or ``'average'``) where the chosen
            architecture needs them.
        computed_params: mapping with ``'max_wordseq_len'`` and ``'word_dims'``.

    Returns:
        The compiled Keras ``Model``; its summary is printed before returning.

    Raises:
        NotImplementedError: for an unknown ``net_arch`` or ``pooling`` value.
    """
    arch = params['net_arch']
    logging.info('Constructing the NN model arch={}...'.format(arch))

    seq_len = computed_params['max_wordseq_len']
    word_dims = computed_params['word_dims']

    input_words = Input(shape=(seq_len, word_dims,),
                        dtype='float32',
                        name='input_words')

    def pick_pooling(max_cls, average_cls):
        # Instantiate the pooling flavour requested in params['pooling'].
        if params['pooling'] == 'max':
            return max_cls()
        elif params['pooling'] == 'average':
            return average_cls()
        raise NotImplementedError()

    def shared_conv(width):
        # One convolution per kernel width.
        return Conv1D(filters=params['nb_filters'],
                      kernel_size=width,
                      padding='valid',
                      activation='relu',
                      strides=1,
                      name='shared_conv_{}'.format(width))

    # Encoder outputs that feed the classifier head.
    branches = []

    if arch == 'rnn':
        # LSTM-based encoder that packs the words of the sentence into one vector.
        rnn_size = params['rnn_size']
        encoder = Bidirectional(
            recurrent.LSTM(rnn_size,
                           input_shape=(seq_len, word_dims),
                           return_sequences=False))
        branches.append(encoder(input_words))

    elif arch == 'rnn(cnn)':
        rnn_size = params['rnn_size']
        nb_filters = params['nb_filters']
        last_width = params['max_kernel_size']

        for width in range(1, last_width + 1):
            # Convolutional layers come first; they act as detectors of word
            # combinations and syntactic constructions.
            conv = shared_conv(width)
            lstm = recurrent.LSTM(rnn_size, return_sequences=False)

            features = conv(input_words)
            features = pick_pooling(keras.layers.MaxPooling1D,
                                    keras.layers.AveragePooling1D)(features)
            branches.append(lstm(features))

    elif arch == 'cnn':
        nb_filters = params['nb_filters']
        last_width = params['max_kernel_size']

        for width in range(1, last_width + 1):
            conv = shared_conv(width)

            features = conv(input_words)
            features = pick_pooling(keras.layers.GlobalMaxPooling1D,
                                    keras.layers.GlobalAveragePooling1D)(features)
            branches.append(features)

    else:
        raise NotImplementedError()

    # A single branch is used as-is; several are merged along the feature axis.
    if len(branches) == 1:
        head = branches[0]
    else:
        head = keras.layers.concatenate(inputs=branches)

    if params['units1'] > 0:
        head = Dense(units=params['units1'],
                     activation=params['activation1'])(head)

    head = Dense(units=2, activation='softmax', name='output')(head)

    model = Model(inputs=input_words, outputs=head)
    model.compile(loss='categorical_crossentropy',
                  optimizer=params['optimizer'],
                  metrics=['accuracy'])
    model.summary()
    return model
def trainging(storage, exp, sampleweights, char_x, pos_x, unicate_x,
              trainy_interval, trainy_operator_ex, trainy_operator_im,
              char_x_cv, pos_x_cv, unicate_x_cv, cv_y_interval,
              cv_y_operator_ex, cv_y_operator_im, batchsize, epoch_size,
              n_char, n_pos, n_unicate, n_vocab, reload=False, modelpath=None,
              embedding_size_char=64, embedding_size_pos=48,
              embedding_size_unicate=32, embedding_size_vocab=32,
              gru_size1=128, gru_size2=160):
    """Train a three-headed sequence tagger and save it under ``storage``.

    Character, POS and "unicate" index sequences are embedded, dropped out and
    concatenated. Three independent BiGRU -> GRU -> softmax branches then
    predict, per time step, the interval label (``dense_1``), the explicit
    operator (``dense_2``) and the implicit operator (``dense_3``). The losses
    are weighted 1.0 / 0.75 / 0.5 and optimised with SGD.

    Side effects: creates ``storage`` if missing, writes a checkpoint per
    epoch, ``model_result.hdf5`` and ``epoch_history.npy`` into it, and writes
    ``training_<exp>.csv`` into the current working directory.

    Args:
        storage: output directory.
        exp: experiment tag used in the CSV log file name.
        sampleweights: passed to ``fit`` as ``sample_weight``; the model is
            compiled with ``sample_weight_mode="temporal"``.
        char_x, pos_x, unicate_x: training index sequences; the sequence
            length is taken from ``char_x.shape[1]``.
        trainy_interval, trainy_operator_ex, trainy_operator_im: training
            targets; the class count of each head is its last dimension.
        char_x_cv, pos_x_cv, unicate_x_cv, cv_y_interval, cv_y_operator_ex,
        cv_y_operator_im: the matching validation inputs and targets.
        batchsize, epoch_size: batch size and number of epochs.
        n_char, n_pos, n_unicate: input vocabulary sizes of the embeddings.
        n_vocab, reload, modelpath, embedding_size_vocab: accepted for
            interface compatibility; not used by this function.
        embedding_size_char, embedding_size_pos, embedding_size_unicate:
            embedding widths.
        gru_size1, gru_size2: units of the bidirectional and the
            unidirectional GRU in every branch.
    """
    seq_length = char_x.shape[1]
    type_size_interval = trainy_interval.shape[-1]
    type_size_operator_ex = trainy_operator_ex.shape[-1]
    type_size_operator_im = trainy_operator_im.shape[-1]

    if not os.path.exists(storage):
        os.makedirs(storage)

    # --- Layer definitions (created in a fixed order so that Keras'
    # auto-generated layer names stay stable) ---
    char_embedding = Embedding(output_dim=embedding_size_char,
                               input_dim=n_char,
                               input_length=seq_length,
                               embeddings_regularizer=l2(.01),
                               mask_zero=True)
    pos_embedding = Embedding(output_dim=embedding_size_pos,
                              input_dim=n_pos,
                              input_length=seq_length,
                              embeddings_regularizer=l2(.01),
                              mask_zero=True)
    unicate_embedding = Embedding(output_dim=embedding_size_unicate,
                                  input_dim=n_unicate,
                                  input_length=seq_length,
                                  embeddings_regularizer=l2(.01),
                                  mask_zero=True)

    # Interval branch.
    interval_bigru = Bidirectional(
        GRU(gru_size1,
            return_sequences=True,
            input_shape=(seq_length, embedding_size_char +
                         embedding_size_pos + embedding_size_unicate)))
    interval_gru = GRU(gru_size2, return_sequences=True)
    interval_dense = Dense(type_size_interval,
                           activation='softmax',
                           kernel_regularizer=l2(.01),
                           name='dense_1')

    # Explicit-operator branch.
    explicit_bigru = Bidirectional(GRU(gru_size1, return_sequences=True))
    explicit_gru = GRU(gru_size2, return_sequences=True)
    explicit_dense = Dense(type_size_operator_ex,
                           activation='softmax',
                           kernel_regularizer=l2(.01),
                           name='dense_2')

    # Implicit-operator branch.
    implicit_bigru = Bidirectional(GRU(gru_size1, return_sequences=True))
    implicit_gru = GRU(gru_size2, return_sequences=True)
    implicit_dense = Dense(type_size_operator_im,
                           activation='softmax',
                           kernel_regularizer=l2(.01),
                           name='dense_3')

    # --- Graph wiring ---
    # int32 instead of int8: an int8 input cannot represent indices above 127,
    # which would corrupt lookups for any vocabulary larger than that.
    char_input = Input(shape=(seq_length, ), dtype='int32', name='character')
    pos_input = Input(shape=(seq_length, ), dtype='int32', name='pos')
    unicate_input = Input(shape=(seq_length, ), dtype='int32', name='unicate')

    char_em = Dropout(0.25)(char_embedding(char_input))
    pos_em = Dropout(0.15)(pos_embedding(pos_input))
    unicate_em = Dropout(0.15)(unicate_embedding(unicate_input))

    merged = keras.layers.concatenate([char_em, pos_em, unicate_em], axis=-1)

    interval_output = interval_dense(interval_gru(interval_bigru(merged)))
    explicit_operator = explicit_dense(explicit_gru(explicit_bigru(merged)))
    implicit_operator = implicit_dense(implicit_gru(implicit_bigru(merged)))

    model = Model(
        inputs=[char_input, pos_input, unicate_input],
        outputs=[interval_output, explicit_operator, implicit_operator])

    model.compile(optimizer='sgd',
                  loss={
                      'dense_1': 'categorical_crossentropy',
                      'dense_2': 'categorical_crossentropy',
                      'dense_3': 'categorical_crossentropy'
                  },
                  loss_weights={
                      'dense_1': 1.0,
                      'dense_2': 0.75,
                      'dense_3': 0.5
                  },
                  metrics=['categorical_accuracy'],
                  sample_weight_mode="temporal")
    print(model.summary())

    # A checkpoint is written after every epoch (save_best_only=False).
    filepath = os.path.join(storage, "weights-improvement-{epoch:02d}.hdf5")
    checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=False)
    csv_logger = CSVLogger('training_%s.csv' % exp)
    callbacks_list = [checkpoint, csv_logger]

    hist = model.fit(x={
        'character': char_x,
        'pos': pos_x,
        'unicate': unicate_x
    },
                     y={
                         'dense_1': trainy_interval,
                         'dense_2': trainy_operator_ex,
                         'dense_3': trainy_operator_im
                     },
                     epochs=epoch_size,
                     batch_size=batchsize,
                     callbacks=callbacks_list,
                     validation_data=({
                         'character': char_x_cv,
                         'pos': pos_x_cv,
                         'unicate': unicate_x_cv
                     }, {
                         'dense_1': cv_y_interval,
                         'dense_2': cv_y_operator_ex,
                         'dense_3': cv_y_operator_im
                     }),
                     sample_weight=sampleweights)

    model.save(os.path.join(storage, 'model_result.hdf5'))
    # `hist.history` is a dict, so NumPy stores it as a pickled object array.
    np.save(os.path.join(storage, 'epoch_history.npy'), hist.history)
# Encode the run's hyper-parameters in the model directory name so different
# configurations do not overwrite each other.
fd = {'shifted': shifted, 'lr': learning_rate, 'emdim': chord_embedding_dim, 'opt': optimizer, 'bi': bidirectional, 'lstms': lstm_size, 'trainsize': train_set_size, 'testsize': test_set_size}
model_name = 'Shifted_%(shifted)s_Lr_%(lr)s_EmDim_%(emdim)s_opt_%(opt)s_bi_%(bi)s_lstmsize_%(lstms)s_trainsize_%(trainsize)s_testsize_%(testsize)s' % fd
model_path = model_path + model_name + '/'
if not os.path.exists(model_path):
    os.makedirs(model_path)

print('loading data...')
train_set, test_set = data_class.get_chord_train_and_test_set(train_set_size, test_set_size)

print('creating model...')
model = Sequential()
# Batch size 1 and input length 1: the stateful LSTM below receives one chord
# index per call.
model.add(Embedding(num_chords, chord_embedding_dim, batch_size=1, input_length=1))
if bidirectional:
    model.add(Bidirectional(LSTM(lstm_size, stateful=True)))
else:
    model.add(LSTM(lstm_size, stateful=True))
model.add(Dense(num_chords))
model.add(Activation('softmax'))

# Replace the optimizer name with a configured optimizer instance. Any other
# value is passed to `compile` unchanged.
if optimizer == 'Adam':
    optimizer = Adam(lr=learning_rate)
elif optimizer == 'RMS':
    optimizer = RMSprop(lr=learning_rate)
loss = 'categorical_crossentropy'
model.compile(optimizer, loss)

# Accumulators for the loss curves.
total_test_loss_array = []
total_train_loss_array = []
total_test_loss = 0
def create_model(args, initial_mean_value, overal_maxlen, vocab):
    """Build the (uncompiled) scoring model described by ``args``.

    Pipeline: embedding -> optional masked Conv1D -> up to three (optionally
    bidirectional) recurrent layers with dropout -> aggregation (mean over
    time or attention) -> optional concatenation with TF/IDF and numerical
    feature inputs -> optional dense layer -> final prediction layer.

    Args:
        args: namespace providing ``recurrent_unit`` (``'lstm'``, ``'gru'`` or
            ``'simple'``), ``dropout_w``, ``dropout_u``, ``dropout_prob``,
            ``model_type`` (``'cls'`` or ``'reg'``), ``loss`` (``'cnp'`` or
            ``'hng'`` for ``'cls'``), ``normalize``, ``emb_path``,
            ``emb_dim``, ``embd_train``, ``cnn_dim``, ``cnn_window_size``,
            ``rnn_dim``, ``rnn_opt``, ``bi``, ``rnn_2l``, ``rnn_3l``,
            ``aggregation`` (``'mot'`` or ``'att'``), ``tfidf``, ``features``
            and ``dense``.
        initial_mean_value: for ``'reg'`` a NumPy array whose length gives the
            number of outputs (a 0-d array counts as one output); for any
            other model type it is used directly as the number of outputs.
        overal_maxlen: length of the padded input sequences.
        vocab: vocabulary; ``len(vocab)`` is the embedding input size.

    Returns:
        The Keras ``Model``. Its inputs are the token sequence, followed by
        the TF/IDF input when ``args.tfidf > 0`` and the 13-d feature input
        when ``args.features`` is set.

    Raises:
        NotImplementedError: for an unsupported ``model_type``, ``loss``
            (classification only) or ``aggregation``.
    """
    # --- Recurrence unit type ---
    if args.recurrent_unit == 'lstm':
        from keras.layers.recurrent import LSTM as RNN
    elif args.recurrent_unit == 'gru':
        from keras.layers.recurrent import GRU as RNN
    elif args.recurrent_unit == 'simple':
        from keras.layers.recurrent import SimpleRNN as RNN

    # --- Dropout rates: the specific rate wins, otherwise the global one ---
    if args.dropout_w > 0:
        dropout_W = args.dropout_w
    else:
        dropout_W = args.dropout_prob
    if args.dropout_u > 0:
        dropout_U = args.dropout_u
    else:
        dropout_U = args.dropout_prob

    cnn_border_mode = 'same'

    if args.model_type == 'reg':
        if initial_mean_value.ndim == 0:
            # Promote a scalar to shape (1,). axis=0 is the only valid axis
            # for a 0-d array; the previous axis=1 is out of range and raises
            # on current NumPy releases.
            initial_mean_value = np.expand_dims(initial_mean_value, axis=0)
        num_outputs = len(initial_mean_value)
    else:
        num_outputs = initial_mean_value

    # --- Initialize embeddings if requested ---
    if args.emb_path:

        def my_init(shape, name=None):
            """Return an embedding variable seeded from the pre-trained vectors."""
            from nea.w2vEmbReader import W2VEmbReader as EmbReader
            logger.info('Initializing lookup table')
            emb_reader = EmbReader(args.emb_path, emb_dim=args.emb_dim)
            # Start from random values; the reader fills in the rows it has
            # vectors for.
            emb_matrix = np.random.random(shape)
            emb_matrix = emb_reader.get_emb_matrix_given_vocab(
                vocab, emb_matrix)
            return K.variable(emb_matrix, name=name)

        logger.info(' Use pre-trained embedding')
    else:
        my_init = 'uniform'
        logger.info(' Use default initializing embedding')

    # --- Activations / initialisers per model type ---
    if args.model_type == 'cls':
        logger.info('Building a CLASSIFICATION model with POOLING')
        dense_activation = 'tanh'
        dense_init = 'glorot_normal'
        if args.loss == 'cnp':
            final_activation = 'softmax'
            final_init = 'glorot_uniform'
        elif args.loss == 'hng':
            final_activation = 'linear'
            final_init = 'glorot_uniform'
        else:
            # Previously this fell through and failed later with an unbound
            # local; fail here with the same exception type used for other
            # unsupported options.
            raise NotImplementedError(
                'unsupported loss for classification: %r' % (args.loss, ))
    elif args.model_type == 'reg':
        logger.info('Building a REGRESSION model with POOLING')
        dense_activation = 'tanh'
        dense_init = 'he_normal'
        if args.normalize:
            final_activation = 'sigmoid'
            final_init = 'he_normal'
        else:
            final_activation = 'relu'
            final_init = 'he_uniform'
    else:
        raise NotImplementedError

    def new_rnn(return_sequences):
        """Create one recurrent layer with the shared settings."""
        return RNN(args.rnn_dim,
                   return_sequences=return_sequences,
                   consume_less=args.rnn_opt,
                   dropout_W=dropout_W,
                   dropout_U=dropout_U)

    def new_sequence_rnn():
        """Create a sequence-returning RNN, bidirectional when ``args.bi``."""
        layer = new_rnn(return_sequences=True)
        if args.bi:
            layer = Bidirectional(layer)
        return layer

    def new_dense(init):
        """Create a masked dense layer; the hinge loss adds L2 regularisation."""
        if args.loss == 'hng':
            return DenseWithMasking(num_outputs,
                                    init=init,
                                    W_regularizer=l2(0.001),
                                    activity_regularizer=l2(0.001))
        return DenseWithMasking(num_outputs, init=init)

    # --- Model stacking ---
    sequence = Input(shape=(overal_maxlen, ), dtype='int32')
    x = Embedding(len(vocab),
                  args.emb_dim,
                  mask_zero=True,
                  init=my_init,
                  trainable=args.embd_train)(sequence)

    # Conv layer
    if args.cnn_dim > 0:
        x = Conv1DWithMasking(nb_filter=args.cnn_dim,
                              filter_length=args.cnn_window_size,
                              border_mode=cnn_border_mode,
                              subsample_length=1)(x)

    # NOTE(review): the nesting of the dropout and stacked-RNN blocks below
    # was reconstructed from a flattened listing — confirm against the
    # upstream file.
    # RNN layer
    if args.rnn_dim > 0:
        x = new_sequence_rnn()(x)
    if args.dropout_prob > 0:
        x = Dropout(args.dropout_prob)(x)

    # Second layer (also required when three layers are requested)
    if args.rnn_2l or args.rnn_3l:
        x = new_sequence_rnn()(x)
        if args.dropout_prob > 0:
            x = Dropout(args.dropout_prob)(x)

    # Third layer
    if args.rnn_3l:
        x = new_sequence_rnn()(x)
        if args.dropout_prob > 0:
            x = Dropout(args.dropout_prob)(x)

    # Aggregation over time
    if args.aggregation == 'mot':
        x = MeanOverTime(mask_zero=True)(x)
    elif args.aggregation == 'att':
        attention_rnn = Attention(new_rnn(return_sequences=False))
        x = attention_rnn(x)
    else:
        raise NotImplementedError

    # Augmented TF/IDF layer
    if args.tfidf > 0:
        pca_input = Input(shape=(args.tfidf, ), dtype='float32')
        merged = merge([x, pca_input], mode='concat')
    else:
        merged = x

    # Augmented numerical features
    if args.features:
        ftr_input = Input(shape=(13, ), dtype='float32')
        merged = merge([merged, ftr_input], mode='concat')

    # Optional dense layer
    if args.dense > 0:
        merged = new_dense(dense_init)(merged)
        if final_activation == 'relu' or final_activation == 'linear':
            merged = BatchNormalization()(merged)
        merged = Activation(dense_activation)(merged)
        if args.dropout_prob > 0:
            merged = Dropout(args.dropout_prob)(merged)

    # Final prediction layer
    merged = new_dense(final_init)(merged)
    if final_activation == 'relu' or final_activation == 'linear':
        merged = BatchNormalization()(merged)
    predictions = Activation(final_activation)(merged)

    # Model input/output
    model_input = [
        sequence,
    ]
    if args.tfidf > 0:
        model_input.append(pca_input)
    if args.features:
        model_input.append(ftr_input)

    model = Model(input=model_input, output=predictions)
    logger.info(' Model Done')
    return model
def build(self):
    """Assemble the video-to-text network and store every stage on ``self``.

    Three blocks of ZeroPadding3D -> Conv3D -> BatchNormalization -> ReLU ->
    SpatialDropout3D(0.5) -> MaxPooling3D are followed by a per-frame flatten,
    two bidirectional LSTMs (256 units, concatenated), a dense projection to
    ``self.output_size`` and a softmax. The model takes the frames, labels,
    input length and label length and outputs the result of ``CTC``.

    Every intermediate tensor is kept as an attribute (``zero1`` ...
    ``maxp3``, ``resh1``, ``lstm_1``, ``lstm_2``, ``dense1``, ``y_pred``,
    ``labels``, ``input_length``, ``label_length``, ``loss_out``) and the
    final model as ``self.model``.
    """
    channels_first = K.image_data_format() == 'channels_first'
    if channels_first:
        input_shape = (self.img_c, self.frames_n, self.img_w, self.img_h)
    else:
        input_shape = (self.frames_n, self.img_w, self.img_h, self.img_c)

    self.input_data = Input(name='the_input',
                            shape=input_shape,
                            dtype='float32')

    # (index, zero padding, filters, kernel size, conv strides) per block.
    conv_blocks = (
        (1, (1, 2, 2), 32, (3, 5, 5), (1, 2, 2)),
        (2, (1, 2, 2), 64, (3, 5, 5), (1, 1, 1)),
        (3, (1, 1, 1), 96, (3, 3, 3), (1, 1, 1)),
    )

    tensor = self.input_data
    for idx, padding, filters, kernel, strides in conv_blocks:
        tensor = ZeroPadding3D(padding=padding, name='zero%d' % idx)(tensor)
        setattr(self, 'zero%d' % idx, tensor)

        tensor = Conv3D(filters,
                        kernel,
                        strides=strides,
                        kernel_initializer='he_normal',
                        name='conv%d' % idx)(tensor)
        setattr(self, 'conv%d' % idx, tensor)

        tensor = BatchNormalization(name='batc%d' % idx)(tensor)
        setattr(self, 'batc%d' % idx, tensor)

        tensor = Activation('relu', name='actv%d' % idx)(tensor)
        setattr(self, 'actv%d' % idx, tensor)

        tensor = SpatialDropout3D(0.5)(tensor)
        setattr(self, 'drop%d' % idx, tensor)

        # Pooling window and stride of 1 on the first pooled axis leave it
        # unchanged.
        tensor = MaxPooling3D(pool_size=(1, 2, 2),
                              strides=(1, 2, 2),
                              name='max%d' % idx)(tensor)
        setattr(self, 'maxp%d' % idx, tensor)

    # Flatten each step into a feature vector for the recurrent layers.
    self.resh1 = TimeDistributed(Flatten())(self.maxp3)

    first_rnn = Bidirectional(LSTM(256,
                                   return_sequences=True,
                                   kernel_initializer='Orthogonal',
                                   name='lstm1'),
                              merge_mode='concat')
    self.lstm_1 = first_rnn(self.resh1)

    second_rnn = Bidirectional(LSTM(256,
                                    return_sequences=True,
                                    kernel_initializer='Orthogonal',
                                    name='lstm2'),
                               merge_mode='concat')
    self.lstm_2 = second_rnn(self.lstm_1)

    # Map the RNN output to character activations.
    self.dense1 = Dense(self.output_size,
                        kernel_initializer='he_normal',
                        name='dense1')(self.lstm_2)
    self.y_pred = Activation('softmax', name='softmax')(self.dense1)

    # Auxiliary inputs consumed by the CTC loss.
    self.labels = Input(name='the_labels',
                        shape=[self.absolute_max_string_len],
                        dtype='float32')
    self.input_length = Input(name='input_length', shape=[1], dtype='int64')
    self.label_length = Input(name='label_length', shape=[1], dtype='int64')

    ctc_args = [
        self.y_pred, self.labels, self.input_length, self.label_length
    ]
    self.loss_out = CTC('ctc', ctc_args)

    model_inputs = [
        self.input_data, self.labels, self.input_length, self.label_length
    ]
    self.model = Model(inputs=model_inputs, outputs=self.loss_out)
def build_model(embedding_layer):
    """Build a question/answer matching model with bidirectional attention.

    Both inputs are integer sequences of length ``MAX_SEQUENCE_LENGTH``. They
    share the embedding layer, two highway layers and a bidirectional GRU
    encoder. Answer-to-question and question-to-answer attention are combined
    with the encoded answer, modelled by another bidirectional GRU, max-pooled
    over the last axis and mapped to sigmoid outputs.

    Args:
        embedding_layer: layer applied to both the question and the answer
            input.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[question, answer]``.
    """
    # PART 1: inputs, shared embedding, highway layers and phrase encoder.
    question_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
    answer_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

    q_repr = embedding_layer(question_input)
    a_repr = embedding_layer(answer_input)

    # Highway layers keep the tensor shape. Each one is wrapped in two
    # separate `TimeDistributed` layers (one per input) that share the
    # underlying highway layer.
    highway_layers = 2
    for level in range(highway_layers):
        shared_highway = highway.Highway(activation='relu',
                                         name='highway_{}'.format(level))

        q_wrapper = layers.TimeDistributed(shared_highway,
                                           name=shared_highway.name + "_qtd",
                                           trainable=False)
        q_repr = q_wrapper(q_repr)

        a_wrapper = layers.TimeDistributed(shared_highway,
                                           name=shared_highway.name + "_ptd",
                                           trainable=False)
        a_repr = a_wrapper(a_repr)

    # Shared, frozen sequence encoder: pushes phrase-level context into every
    # word representation.
    phrase_gru = layers.GRU(return_sequences=True,
                            units=500,
                            activation='relu',
                            recurrent_dropout=0.2,
                            dropout=0.3,
                            trainable=False)
    phrase_layer = Bidirectional(phrase_gru)
    encoded_question = phrase_layer(q_repr)
    encoded_answer = phrase_layer(a_repr)

    # PART 2: similarity between every answer word and every question word,
    # normalised in two directions.
    similarity_layer = MatrixAttention(
        similarity_function={
            'type': 'linear',
            'combination': 'x,y,x*y'
        },
        name='passage_question_similarity',
        trainable=False)
    similarity = similarity_layer([encoded_answer, encoded_question])

    # For each answer word: attention over the question words, then the
    # attention-weighted sum of question encodings.
    a2q_attention = MaskedSoftmax()(similarity)
    a2q_sum = WeightedSum(name="answer_question_vectors",
                          use_masking=False,
                          trainable=False)
    answer_question_vectors = a2q_sum([encoded_question, a2q_attention])

    # For each answer word take its best-matching question word, turn these
    # maxima into one attention over the answer and pool the answer with it.
    best_match = Max(axis=-1)(similarity)
    q2a_attention = MaskedSoftmax()(best_match)
    q2a_sum = WeightedSum(name="question_passage_vector",
                          use_masking=False,
                          trainable=False)
    question_answer_vector = q2a_sum([encoded_answer, q2a_attention])

    # Repeat the pooled vector for every answer word so it can be combined
    # with the per-word representations.
    tiler = RepeatLike(axis=1, copy_from_axis=1)
    tiled_vector = tiler([question_answer_vector, encoded_answer])

    combiner = complex_concat.ComplexConcat(combination='1,2,1*2,1*3',
                                            name='final_merged_passage')
    merged_answer = combiner(
        [encoded_answer, answer_question_vectors, tiled_vector])

    # PART 3: one more bidirectional encoder over the merged representation
    # (the "modeling layer").
    modeling_layer = Bidirectional(
        layers.GRU(
            return_sequences=True,
            units=300,
            activation='relu',
            recurrent_dropout=0.2,
            dropout=0.3,
        ))
    modeled_answer = modeling_layer(merged_answer)

    # PART 4: maximum over the last axis for each word, then the prediction.
    max_answer = Max(axis=-1)(modeled_answer)
    print("max answer shape", max_answer.shape)
    print("modeled_answer shape", modeled_answer.shape)

    preds = layers.Dense(1, activation='sigmoid',
                         name='prediction')(max_answer)
    print("pred shape", preds.shape)

    return models.Model(inputs=[question_input, answer_input], outputs=preds)
model.add( Embedding( numwords + 1, embedding, input_length=seq_len, mask_zero=True, embeddings_regularizer=regularizers.l2(l2_regularizer_embeddings), # embeddings_initializer=he_normal(seed=42) )) model.add(Dropout(0.2)) # model.add(SpatialDropout1D(0.2)) if nlayers == 1: model.add( Bidirectional( RNN(neurons, implementation=impl, recurrent_dropout=rdrop, dropout=drop, kernel_regularizer=regularizers.l2(l2_regularizer)))) else: model.add( Bidirectional( RNN(neurons, implementation=impl, recurrent_dropout=rdrop, dropout=drop, return_sequences=True, kernel_regularizer=regularizers.l2(l2_regularizer)))) for i in range(1, nlayers - 1): model.add( Bidirectional( RNN(neurons,
batch_size = 250
feat_dim = 512
w = 7

# build model
print("Build model...")

# Text branch: token indices -> embeddings -> bidirectional LSTM (per-step
# outputs) -> dropout.
tweet = Input(shape=(maxlen, ), dtype='int32')  # input_1
embedding = Embedding(input_dim=vocab_size,
                      output_dim=embedding_dim,
                      input_length=maxlen,
                      mask_zero=False)(tweet)
lstm = Bidirectional(
    LSTM(embedding_dim,
         return_sequences=True,
         input_shape=(maxlen, embedding_dim)))(embedding)
# Unidirectional alternative, kept for reference:
# lstm = LSTM(embedding_dim, return_sequences=True, input_shape=(maxlen, embedding_dim))(embedding)
dropout = Dropout(0.5)(lstm)

# Image branch: a pre-extracted 7x7x512 feature map.
# img = Input(shape=(1, 14, 14,512))
img = Input(shape=(7, 7, 512))  # input_2

# -------Image Bcnn start
cnn_out_a = img
cnn_out_shape = img.shape
# Flatten the two spatial axes into one axis of locations, keeping channels.
cnn_out_a = Reshape(
    [cnn_out_shape[1] * cnn_out_shape[2], cnn_out_shape[-1]])(cnn_out_a)
# With the 7x7x512 input this is (None, 49, 512); 196 locations would
# correspond to the commented-out 14x14 input.
print("cnn_out_a.shape is:---------", cnn_out_a.shape)
cnn_out_b = cnn_out_a
def embedding_cnn_glove(training_list, validation_list, test_list):
    """Train a 1-D CNN regressor on GloVe-embedded tweets and print correlations.

    Every tweet is tokenised, padded/truncated (post) to 50 tokens and each
    token is replaced by its 100-d GloVe Twitter vector, giving one
    ``(50, 100)`` float matrix per tweet.  A small Conv1D network is fitted on
    the training split and Pearson/Spearman correlations between predictions
    and gold intensities are printed for the train, validation and test
    splits.

    Args:
        training_list: iterable of objects exposing ``text`` and ``intensity``.
        validation_list: same kind of objects, used as Keras validation data.
        test_list: same kind of objects, only used for the final evaluation.

    Returns:
        None.  All results are written to stdout.
    """

    def _texts_and_scores(tweets):
        # Split tweet objects into parallel lists of raw text and float score.
        texts = [tweet.text for tweet in tweets]
        scores = [float(tweet.intensity) for tweet in tweets]
        return texts, scores

    tweets_train, score_train = _texts_and_scores(training_list)
    tweets_val, score_val = _texts_and_scores(validation_list)
    tweets_test, score_test = _texts_and_scores(test_list)
    # The tokenizer vocabulary is built over all three splits.
    total_dataset = tweets_train + tweets_val + tweets_test

    t = Tokenizer()
    t.fit_on_texts(total_dataset)
    word_index = t.word_index
    print(t.document_count)
    vocab_size = len(t.word_counts)
    print(vocab_size)
    print(len(word_index))

    max_len = 50
    padded_train = pad_sequences(t.texts_to_sequences(tweets_train),
                                 maxlen=max_len, padding='post')
    padded_val = pad_sequences(t.texts_to_sequences(tweets_val),
                               maxlen=max_len, padding='post')
    padded_test = pad_sequences(t.texts_to_sequences(tweets_test),
                                maxlen=max_len, padding='post')

    EMBEDDING_DIM = 100
    y = np.array(score_train)
    y_val = np.array(score_val)
    y_test = np.array(score_test)
    print(len(y_val))
    print(len(padded_val))
    print(len(y_test))
    print(len(padded_test))

    GLOVE_DIR = "./Data/glove.twitter.27B/glove.twitter.27B.100d.txt"
    embeddings_index = {}
    # Context manager guarantees the (large) file is closed even on a parse
    # error.
    with open(GLOVE_DIR) as f:
        for line in f:
            values = line.split()
            embeddings_index[values[0]] = np.asarray(values[1:],
                                                     dtype='float32')
    print('Read Glove and Made Dict')

    # Row 0 stays all-zero: it is the padding index produced by pad_sequences.
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    number_found = 0
    number_not_found = 0
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            number_found += 1
        else:
            # Words missing from GloVe keep their all-zero row.
            print(word)
            number_not_found += 1
    print(number_found)
    print(number_not_found)

    # Look up every token id at once.  The result must be floating point:
    # storing GloVe vectors in an integer array truncates almost every
    # coefficient to 0.  Shape is (n_samples, max_len, EMBEDDING_DIM), which is
    # the (steps, channels) layout Conv1D expects.
    X = embedding_matrix[padded_train].astype('float32')
    X_val = embedding_matrix[padded_val].astype('float32')
    X_test = embedding_matrix[padded_test].astype('float32')

    def get_model():
        # Layers are created here so that every build gets fresh instances and
        # no layer object is added to the Sequential model twice.
        model = Sequential()
        model.add(Conv1D(64, 5, activation='relu', name='conv1',
                         input_shape=(max_len, EMBEDDING_DIM)))
        model.add(Dropout(0.3))
        model.add(MaxPooling1D(pool_size=3, strides=2, name='pool3'))
        model.add(Conv1D(32, 3, activation='relu', name='conv2'))
        model.add(Dropout(0.3))
        model.add(MaxPooling1D(pool_size=3, strides=2, name='pool4'))
        model.add(Flatten())
        model.add(Dense(200, activation='relu', name='dense1'))
        model.add(Dense(1, activation='sigmoid', name='dense2'))
        # compile the model
        model.compile(optimizer='adam', loss='mean_squared_error')
        # summary() prints by itself; wrapping it in print() only adds "None".
        model.summary()
        return model

    estimator = KerasRegressor(build_fn=get_model,
                               epochs=50,
                               batch_size=32,
                               verbose=1)
    estimator.fit(X, y, validation_data=(X_val, y_val))

    train_prediction = estimator.predict(X)
    print(pearsonr(train_prediction, y))
    print(spearmanr(train_prediction, y))

    val_prediction = estimator.predict(X_val)
    print(pearsonr(val_prediction, y_val))
    print(spearmanr(val_prediction, y_val))

    test_prediction = estimator.predict(X_test)
    print(pearsonr(test_prediction, y_test))
    print(spearmanr(test_prediction, y_test))
def CRNN(input_shape,
         num_classes,
         prediction_only=False,
         gru=True,
         alpha=0.75,
         gamma=0.5):
    """Build the CRNN text recogniser (conv stack + two bidirectional RNNs).

    Reference: https://arxiv.org/abs/1507.05717

    Args:
        input_shape: shape of the input image, e.g. (256, 32, 1).
        num_classes: number of characters in the alphabet, including the CTC
            blank.
        prediction_only: when true, only the inference model is returned.
        gru: use GRU cells when true, LSTM cells otherwise.
        alpha: scale factor of the focal CTC loss.
        gamma: exponent applied to ``1 - exp(-ctc_loss)`` in the focal CTC
            loss.

    Returns:
        ``model_pred`` if ``prediction_only`` is true, otherwise the tuple
        ``(model_train, model_pred)`` where ``model_train`` outputs the focal
        CTC loss.
    """
    # One LeakyReLU instance is passed as the activation of every convolution.
    leaky = LeakyReLU(alpha=0.05)

    def conv(tensor, filters, kernel, padding, name):
        # Stride-1 convolution followed by the shared LeakyReLU activation.
        layer = Conv2D(filters, kernel, strides=(1, 1), activation=leaky,
                       padding=padding, name=name)
        return layer(tensor)

    def pool(tensor, strides, padding, name):
        # 2x2 max pooling with a configurable stride.
        layer = MaxPool2D(pool_size=(2, 2), strides=strides, name=name,
                          padding=padding)
        return layer(tensor)

    image_input = Input(shape=input_shape, name='image_input')

    # Convolutional feature extractor.
    feat = conv(image_input, 64, (3, 3), 'same', 'conv1_1')
    feat = pool(feat, (2, 2), 'same', 'pool1')
    feat = conv(feat, 128, (3, 3), 'same', 'conv2_1')
    feat = pool(feat, (2, 2), 'same', 'pool2')
    feat = conv(feat, 256, (3, 3), 'same', 'conv3_1')
    feat = conv(feat, 256, (3, 3), 'same', 'conv3_2')
    feat = pool(feat, (1, 2), 'same', 'pool3')
    feat = conv(feat, 512, (3, 3), 'same', 'conv4_1')
    feat = BatchNormalization(name='batchnorm1')(feat)
    feat = conv(feat, 512, (3, 3), 'same', 'conv5_1')
    feat = BatchNormalization(name='batchnorm2')(feat)
    feat = pool(feat, (1, 2), 'valid', 'pool5')
    feat = conv(feat, 512, (2, 2), 'valid', 'conv6_1')

    # Collapse the feature map into a sequence of 512-d vectors.
    seq = Reshape((-1, 512))(feat)

    # Two stacked bidirectional recurrent layers.
    if gru:
        for _ in range(2):
            cell = GRU(256, dropout=0.1, recurrent_dropout=0.1,
                       return_sequences=True)
            seq = Bidirectional(cell)(seq)
    else:
        for cell_name in ('lstm1', 'lstm2'):
            cell = LSTM(256, return_sequences=True, dropout=0.1,
                        recurrent_dropout=0.1, name=cell_name)
            seq = Bidirectional(cell)(seq)

    logits = Dense(num_classes, name='dense1')(seq)
    y_pred = Activation('softmax', name='softmax')(logits)

    model_pred = Model(image_input, y_pred)
    if prediction_only:
        return model_pred

    # The label input is as long as the predicted sequence.
    max_string_len = int(y_pred.shape[1])

    def focal_ctc_lambda_func(args):
        labels, y_pred, input_length, label_length = args
        ctc_loss = K.ctc_batch_cost(labels, y_pred, input_length,
                                    label_length)
        p = tf.exp(-ctc_loss)
        focal_ctc_loss = alpha * tf.pow((1 - p), gamma) * ctc_loss
        return focal_ctc_loss

    labels = Input(name='label_input', shape=[max_string_len],
                   dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')

    loss_layer = Lambda(focal_ctc_lambda_func, output_shape=(1, ),
                        name='focal_ctc_loss')
    focal_ctc_loss = loss_layer(
        [labels, y_pred, input_length, label_length])

    model_train = Model(
        inputs=[image_input, labels, input_length, label_length],
        outputs=focal_ctc_loss)

    return model_train, model_pred
def create_model(X_vocab_len, X_max_len, y_vocab_len, y_max_len,
                 n_phonetic_features, y1, n1, y2, n2, y3, n3, y4, n4, y5, n5,
                 y6, n6, hidden_size, num_layers):
    """Build the multi-task model: a CNN tagger plus an attention seq2seq head.

    The model has eleven inputs (current word, decoder input, four right and
    four left context words, phonetic features) and seven outputs (the
    seq2seq reconstruction followed by six softmax classifiers of sizes
    ``n1`` .. ``n6``).

    Relies on names that are not defined in this block and must exist at
    module level: ``EMBEDDING_DIM``, ``no_filters``, ``rnn_output_size``,
    ``HIDDEN_DIM``, ``RNN``, ``maxnorm`` and the legacy ``merge`` function.

    Args:
        X_vocab_len: input vocabulary size (also the seq2seq output size).
        X_max_len: length of every word-level input sequence.
        n_phonetic_features: width of the phonetic feature vector.
        n1..n6: number of classes of the six classification outputs.
        y_vocab_len, y_max_len, y1..y6, hidden_size, num_layers: accepted for
            interface compatibility; not used by this function.

    Returns:
        The uncompiled Keras ``Model``.
    """
    current_word = Input(shape=(X_max_len, ), dtype='float32',
                         name='input1')  # for encoder (shared)
    decoder_input = Input(shape=(X_max_len, ), dtype='float32',
                          name='input3')  # for decoder -- attention
    # Context words: inputs 4-7 are to the right, 8-11 to the left.
    right_words = [
        Input(shape=(X_max_len, ), dtype='float32', name='input' + str(k))
        for k in range(4, 8)
    ]
    left_words = [
        Input(shape=(X_max_len, ), dtype='float32', name='input' + str(k))
        for k in range(8, 12)
    ]
    phonetic_input = Input(shape=(n_phonetic_features, ), dtype='float32',
                           name='input12')

    emb_layer1 = Embedding(X_vocab_len, EMBEDDING_DIM,
                           input_length=X_max_len, mask_zero=False,
                           name='Embedding')

    list_of_inputs = [current_word] + right_words + left_words
    embeddings = [emb_layer1(word) for word in list_of_inputs]
    print("Type:: ", type(embeddings[0]))

    embeddings = [
        Dropout(0.50, name='drop1_' + str(j))(emb)
        for j, emb in enumerate(embeddings)
    ]
    embeddings = [
        GaussianNoise(0.05, name='noise1_' + str(j))(emb)
        for j, emb in enumerate(embeddings)
    ]

    def _conv_pool_branch(kernel_size):
        # One Conv1D per word, then max- and average-pooling merged together.
        tag = str(kernel_size)
        convs = [
            Conv1D(filters=no_filters, kernel_size=kernel_size,
                   padding='valid', activation='relu', strides=1,
                   name='conv' + tag + '_' + str(j))(emb)
            for j, emb in enumerate(embeddings)
        ]
        max_pools = [
            MaxPooling1D(name='max' + tag + '_' + str(j))(conv)
            for j, conv in enumerate(convs)
        ]
        avg_pools = [
            AveragePooling1D(name='avg' + tag + '_' + str(j))(conv)
            for j, conv in enumerate(convs)
        ]
        # NOTE(review): `merge` is the legacy Keras-1 functional merge and is
        # kept because the file's imports are not visible from here; on
        # current Keras it has to be replaced by `add` / `concatenate`.
        return [
            merge([max_pool, avg_pool], name='merge_conv' + tag + '_' + str(k))
            for k, (max_pool, avg_pool) in enumerate(zip(max_pools, avg_pools))
        ]

    pooled = _conv_pool_branch(4) + _conv_pool_branch(5)

    concat = merge(pooled, mode='concat', name='main_merge')
    x = Dropout(0.15, name='drop_single1')(concat)
    x = Bidirectional(RNN(rnn_output_size), name='bidirec1')(x)

    concat2 = merge([x, phonetic_input], mode='concat',
                    name='phonetic_merging')

    x = Dense(HIDDEN_DIM, activation='relu', kernel_initializer='he_normal',
              kernel_constraint=maxnorm(3), bias_constraint=maxnorm(3),
              name='dense1')(concat2)
    x = Dropout(0.15, name='drop_single2')(x)
    x = Dense(HIDDEN_DIM, kernel_initializer='he_normal', activation='tanh',
              kernel_constraint=maxnorm(3), bias_constraint=maxnorm(3),
              name='dense2')(x)
    x = Dropout(0.15, name='drop_single3')(x)

    # Six independent softmax classifiers on the shared representation.
    class_outputs = [
        Dense(n_classes, kernel_initializer='he_normal', activation='softmax',
              name='output' + str(k))(x)
        for k, n_classes in enumerate((n1, n2, n3, n4, n5, n6), start=1)
    ]

    # Luong et al. 2015 attention model
    emb_layer = Embedding(X_vocab_len, EMBEDDING_DIM, input_length=X_max_len,
                          mask_zero=True, name='Embedding_for_seq2seq')
    # Only the current word feeds the encoder.
    current_word_embedding = emb_layer(current_word)

    encoder, state = GRU(rnn_output_size, return_sequences=True, unroll=True,
                         return_state=True,
                         name='encoder')(current_word_embedding)

    decoder = emb_layer(decoder_input)
    # Use the final state returned by the encoder instead of slicing
    # encoder[:, -1, :]: for a GRU both hold the same values, but the sliced
    # tensor is not a Keras layer output.
    decoder = GRU(rnn_output_size, return_sequences=True, unroll=True,
                  name='decoder')(decoder, initial_state=[state])

    attention = dot([decoder, encoder], axes=[2, 2], name='dot')
    attention = Activation('softmax', name='attention')(attention)

    context = dot([attention, encoder], axes=[2, 1], name='dot2')
    decoder_combined_context = concatenate([context, decoder],
                                           name='concatenate')

    outputs = TimeDistributed(Dense(64, activation='tanh'),
                              name='td1')(decoder_combined_context)
    outputs = TimeDistributed(Dense(X_vocab_len, activation='softmax'),
                              name='td2')(outputs)

    all_inputs = ([current_word, decoder_input] + right_words + left_words +
                  [phonetic_input])
    all_outputs = [outputs] + class_outputs

    # Keras 2 spelling; `input=` / `output=` are legacy keyword names.
    model = Model(inputs=all_inputs, outputs=all_outputs)

    return model
def __init__(self,
             dim,
             batch_norm,
             dropout,
             rec_dropout,
             task,
             target_repl=False,
             deep_supervision=False,
             num_classes=1,
             depth=1,
             input_dim=69,
             **kwargs):
    """Build the (bi)LSTM + attention network.

    Args:
        dim: number of units of the output LSTM; each direction of the
            stacked bidirectional layers gets ``dim // 2`` units.
        batch_norm: stored on the instance; not used while building.
        dropout: input dropout of every LSTM and, when > 0, the rate of the
            dropout applied after the attention layer.
        rec_dropout: recurrent dropout of every LSTM.
        task: ``'ihm'`` (sigmoid output) or ``'los'`` (relu for one class,
            softmax otherwise).
        target_repl, deep_supervision: accepted for interface compatibility;
            not used by this constructor.
        num_classes: width of the final dense layer.
        depth: total number of recurrent layers; ``depth - 1`` bidirectional
            layers precede the output LSTM.
        input_dim: number of features per timestep (48 timesteps are
            expected).
        **kwargs: ignored; their keys are printed.

    Raises:
        ValueError: if ``task`` is neither ``'ihm'`` nor ``'los'``.
    """
    print("==> not used params in network class:", kwargs.keys())

    self.dim = dim
    self.batch_norm = batch_norm
    self.dropout = dropout
    self.rec_dropout = rec_dropout
    self.depth = depth

    if task == 'ihm':
        # Was misspelled 'sigmod', which Keras rejects as an unknown
        # activation.
        final_activation = 'sigmoid'
    elif task == 'los':
        final_activation = 'relu' if num_classes == 1 else 'softmax'
    else:
        raise ValueError("Wrong value for task")

    # Input layer
    X = Input(shape=(48, input_dim), name='X')
    inputs = [X]

    # Configurations
    is_bidirectional = True

    # Main part of the network
    for _ in range(depth - 1):
        num_units = dim
        if is_bidirectional:
            # Halve the units so the concatenated output has `dim` features.
            num_units = num_units // 2

        lstm = LSTM(units=num_units,
                    activation='tanh',
                    return_sequences=True,
                    recurrent_dropout=rec_dropout,
                    kernel_regularizer=regularizers.l2(0.01),
                    dropout=dropout)

        if is_bidirectional:
            X = Bidirectional(lstm)(X)
        else:
            X = lstm(X)

    # Output module of the network.  The attention layer needs the full
    # sequence.  recurrent_dropout must be the configured rate: the former
    # literal `True` is coerced to 1.0, i.e. it drops the whole recurrent
    # state.
    L = LSTM(units=dim,
             activation='tanh',
             return_sequences=True,
             dropout=dropout,
             kernel_regularizer=regularizers.l2(0.01),
             recurrent_dropout=rec_dropout)(X)

    A_L = AttentionLayer()(L)

    if dropout > 0:
        A_L = Dropout(dropout)(A_L)

    y = Dense(num_classes, activation=final_activation)(A_L)
    outputs = [y]

    super(Network, self).__init__(inputs=inputs, outputs=outputs)
def createHierarchicalAttentionModel(
        maxSeq,
        embWeights=None,
        embeddingSize=None,
        vocabSize=None,  # embedding
        recursiveClass=GRU,
        wordRnnSize=100,
        sentenceRnnSize=100,  # rnn
        dropWordEmb=0.2,
        dropWordRnnOut=0.2,
        dropSentenceRnnOut=0.5):
    '''
    Create a model based on the Hierarchical Attention model described in
    https://arxiv.org/abs/1606.02393

    inputs:
        maxSeq : maximum number of words per sentence
        embedding
            embWeights : numpy matrix with (frozen) embedding values
            embeddingSize (if embWeights is None) : embedding size
            vocabSize (if embWeights is None) : vocabulary size
        Recursive Layers
            recursiveClass : recurrent layer class. Default is GRU
            wordRnnSize : RNN size for the word sequence
            sentenceRnnSize : RNN size for the sentence sequence
        Dropout
            dropWordEmb : dropout rate after the embedding (0 disables it)
            dropWordRnnOut : dropout rate after the word RNN
            dropSentenceRnnOut : dropout rate after the sentence RNN

    returns : Two compiled models sharing the same layers. The first outputs
        the document prediction only; the second additionally outputs the
        word-level and sentence-level attention so they can be analysed.

    raises : ValueError if embWeights is None and vocabSize or embeddingSize
        is missing.
    '''
    if embWeights is None and (vocabSize is None or embeddingSize is None):
        raise ValueError(
            "vocabSize and embeddingSize are required when embWeights is None")

    ##
    ## Sentence level logic
    wordsInputs = Input(shape=(maxSeq, ), dtype='int32', name='words_input')
    if embWeights is None:
        emb = Embedding(vocabSize, embeddingSize)(wordsInputs)
    else:
        emb = Embedding(embWeights.shape[0],
                        embWeights.shape[1],
                        weights=[embWeights],
                        trainable=False)(wordsInputs)
    if dropWordEmb != 0.0:
        emb = Dropout(dropWordEmb)(emb)
    wordRnn = Bidirectional(recursiveClass(wordRnnSize,
                                           return_sequences=True),
                            merge_mode='concat')(emb)
    if dropWordRnnOut > 0.0:
        wordRnn = Dropout(dropWordRnnOut)(wordRnn)
    attention = AttentionLayer()(wordRnn)
    # Weight every word vector by its attention score, then sum over words.
    sentenceEmb = Lambda(lambda x: x[1] * x[0],
                         output_shape=lambda x: x[0])([wordRnn, attention])
    sentenceEmb = Lambda(lambda x: K.sum(x, axis=1),
                         output_shape=lambda x: (x[0], x[2]))(sentenceEmb)
    modelSentence = Model(wordsInputs, sentenceEmb)
    modelSentAttention = Model(wordsInputs, attention)

    ##
    ## Document level logic
    documentInputs = Input(shape=(None, maxSeq),
                           dtype='int32',
                           name='document_input')
    sentenceEmbbeding = TimeDistributed(modelSentence)(documentInputs)
    sentenceAttention = TimeDistributed(modelSentAttention)(documentInputs)
    # The sentence-level RNN must honour sentenceRnnSize; it previously
    # reused wordRnnSize, which silently ignored the parameter.
    sentenceRnn = Bidirectional(recursiveClass(sentenceRnnSize,
                                               return_sequences=True),
                                merge_mode='concat')(sentenceEmbbeding)
    if dropSentenceRnnOut > 0.0:
        sentenceRnn = Dropout(dropSentenceRnnOut)(sentenceRnn)
    attentionSent = AttentionLayer()(sentenceRnn)
    # Weight every sentence vector by its attention score, then sum.
    documentEmb = multiply(inputs=[sentenceRnn, attentionSent])
    documentEmb = Lambda(lambda x: K.sum(x, axis=1),
                         output_shape=lambda x: (x[0], x[2]),
                         name="att2")(documentEmb)
    documentOut = Dense(1, activation="sigmoid",
                        name="documentOut")(documentEmb)

    # Keras 2 spelling; `input=` / `output=` are legacy keyword names.
    model = Model(inputs=[documentInputs], outputs=[documentOut])
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    modelAttentionEv = Model(
        inputs=[documentInputs],
        outputs=[documentOut, sentenceAttention, attentionSent])
    modelAttentionEv.compile(loss='binary_crossentropy',
                             optimizer='rmsprop',
                             metrics=['accuracy'])
    return model, modelAttentionEv
def __init__(self,
             inputs=None,
             outputs=None,
             N=None,
             M=None,
             C=None,
             word2vec_dim=None,
             label_size=None,
             embedding_matrix=None,
             hdim=None,
             dropout_rate=None,
             output_type=None,
             unroll=False,
             **kwargs):
    """Build the R-Net style passage/question model.

    Args:
        inputs, outputs: when both are given the model is rebuilt directly
            from them (used when loading from a config) and every other
            argument except ``**kwargs`` is ignored.
        N: passage length (number of token ids in ``P_vecs``).
        M: question length (number of token ids in ``Q_vecs``).
        C: accepted for interface compatibility; not used here.
        word2vec_dim: embedding width.
        label_size: number of classes of the ``"multi"`` output.
        embedding_matrix: pre-trained, frozen embedding weights.
        hdim: hidden size ``H`` of all recurrent layers.
        dropout_rate: rate for GRU input dropout and variational dropout.
        output_type: ``"bi"`` for one sigmoid score per passage position,
            ``"multi"`` for a single softmax over ``label_size`` classes.
        unroll: forwarded to the attention GRUs and the output GRU.
        **kwargs: forwarded to the parent constructor.

    Raises:
        ValueError: if ``output_type`` is neither ``"bi"`` nor ``"multi"``.
    """
    # Load model from config
    if inputs is not None and outputs is not None:
        super(RNet, self).__init__(inputs=inputs, outputs=outputs, **kwargs)
        return

    # Fail before building the graph; previously an unknown value only
    # surfaced at the very end as an unbound `preds`.
    if output_type not in ("bi", "multi"):
        raise ValueError(
            "output_type must be 'bi' or 'multi', got {!r}".format(
                output_type))

    # Dimensions
    H = hdim
    W = word2vec_dim

    v = SharedWeight(size=(H, 1), name='v')
    WQ_u = SharedWeight(size=(2 * H, H), name='WQ_u')
    WP_u = SharedWeight(size=(2 * H, H), name='WP_u')
    WP_v = SharedWeight(size=(H, H), name='WP_v')
    W_g1 = SharedWeight(size=(4 * H, 4 * H), name='W_g1')
    W_g2 = SharedWeight(size=(2 * H, 2 * H), name='W_g2')
    WP_h = SharedWeight(size=(2 * H, H), name='WP_h')
    Wa_h = SharedWeight(size=(2 * H, H), name='Wa_h')
    WQ_v = SharedWeight(size=(2 * H, H), name='WQ_v')
    WPP_v = SharedWeight(size=(H, H), name='WPP_v')
    VQ_r = SharedWeight(size=(H, H), name='VQ_r')

    # All shared weights are model inputs, including those that the current
    # graph does not consume, so the model's input list keeps its layout.
    shared_weights = [
        v, WQ_u, WP_u, WP_v, W_g1, W_g2, WP_h, Wa_h, WQ_v, WPP_v, VQ_r
    ]

    P_vecs = Input(shape=(N, ), name='P_vecs')
    P = Embedding(len(embedding_matrix),
                  W,
                  weights=[embedding_matrix],
                  trainable=False,
                  input_length=N)(P_vecs)
    Q_vecs = Input(shape=(M, ), name='Q_vecs')
    Q = Embedding(len(embedding_matrix),
                  W,
                  weights=[embedding_matrix],
                  trainable=False,
                  input_length=M)(Q_vecs)

    input_placeholders = [P_vecs, Q_vecs]

    # Passage encoder.
    uP = Masking()(P)
    uP = Bidirectional(
        GRU(units=H,
            return_sequences=True,
            dropout=dropout_rate,
            unroll=False))(uP)
    uP = VariationalDropout(rate=dropout_rate,
                            noise_shape=(None, 1, 2 * H),
                            name='uP')(uP)

    # Question encoder.
    uQ = Masking()(Q)
    uQ = Bidirectional(
        GRU(units=H,
            return_sequences=True,
            dropout=dropout_rate,
            unroll=False))(uQ)
    uQ = VariationalDropout(rate=dropout_rate,
                            noise_shape=(None, 1, 2 * H),
                            name='uQ')(uQ)

    # Question-aware passage representation.
    vP = QuestionAttnGRU(units=H, return_sequences=True, unroll=unroll)(
        [uP, uQ, WQ_u, WP_v, WP_u, v, W_g1])
    vP = VariationalDropout(rate=dropout_rate,
                            noise_shape=(None, 1, H),
                            name='vP')(vP)

    # Passage self-matching.
    hP = Bidirectional(
        SelfAttnGRU(units=H, return_sequences=True,
                    unroll=unroll))([vP, vP, WP_v, WPP_v, v, W_g2])
    hP = VariationalDropout(rate=dropout_rate,
                            noise_shape=(None, 1, 2 * H),
                            name='hP')(hP)

    if output_type == "bi":
        gP = Bidirectional(
            GRU(units=H, return_sequences=True, unroll=unroll))(hP)
        preds = TimeDistributed(Dense(1, activation='sigmoid'))(gP)
    else:  # "multi"
        gP = Bidirectional(
            GRU(units=H, return_sequences=False, unroll=unroll))(hP)
        preds = Dense(label_size, activation='softmax')(gP)

    inputs = input_placeholders + shared_weights
    outputs = preds

    super(RNet, self).__init__(inputs=inputs, outputs=outputs, **kwargs)
def __init__(self, params, mask_zero=True):
    """Build the two-head (Y0, Y1) graph on TensorFlow placeholders.

    Word and POS sequences are embedded and encoded by bidirectional GRUs.
    The POS encoding produces both a softmax gate applied to the word
    encoding and the intermediate prediction ``Y0``.  ``Y0`` and the gated
    word encoding are concatenated to predict ``Y1``.  The loss is the sum
    of the two mean binary cross-entropies and is minimised with RMSProp.

    Args:
        params: nested configuration dict.  Keys read here: ``words``,
            ``pos``, ``Y0``, ``Y1``, ``H``, ``embed_size``, ``voc_size``,
            ``pos_size`` and ``learning_rate``.  It is not modified.
        mask_zero: forwarded to both embedding layers.

    Attributes set: ``wds``, ``pos``, ``Y0``, ``Y1`` (placeholders),
    ``Y0_probs``, ``Y1_probs``, ``loss`` and ``train_op``.
    """
    # input words
    self.wds = tf.placeholder(tf.float32, [None, params['words']['dim']],
                              name='words')
    # input pos
    self.pos = tf.placeholder(tf.float32, [None, params['pos']['dim']],
                              name='pos')
    # output Y0
    self.Y0 = tf.placeholder(tf.float32, [None, params['Y0']['dim']],
                             name='Y0')
    # output Y1
    self.Y1 = tf.placeholder(tf.float32, [None, params['Y1']['dim']],
                             name='Y1')

    def _prediction_head(target, features):
        # Dense -> optional batch norm -> activation -> optional activity
        # regularisation, configured by params[target].
        spec = params[target]
        # Copy before editing: replacing 'W_regularizer' in the caller's
        # dict would make a second construction call l2() on an l2 object.
        kwargs = dict(spec['kwargs']) if 'kwargs' in spec else {}
        if 'W_regularizer' in kwargs:
            kwargs['W_regularizer'] = l2(kwargs['W_regularizer'])
        probs = Dense(spec['dim'],
                      name=target + '_probs',
                      bias_regularizer=l2(0.01),
                      **kwargs)(features)
        if 'batch_norm' in spec:
            probs = BatchNormalization(**spec['batch_norm'])(probs)
        probs = Activation(spec['activate_func'])(probs)
        if 'activity_reg' in spec:
            probs = ActivityRegularization(name=target + '_activity_reg',
                                           **spec['activity_reg'])(probs)
        return probs

    # 1. base layers: embedding
    wd_embedding = Embedding(output_dim=params['embed_size'],
                             input_dim=params['voc_size'],
                             input_length=params['words']['dim'],
                             mask_zero=mask_zero,
                             name='wd_embedding')(self.wds)
    pos_embedding = Embedding(output_dim=params['embed_size'],
                              input_dim=params['pos_size'],
                              input_length=params['pos']['dim'],
                              mask_zero=mask_zero,
                              name='pos_embedding')(self.pos)

    # 2. semantic layers: Bidirectional GRU
    wd_rnn = params['words']['RNN']
    wd_Bi_GRU = Bidirectional(GRU(wd_rnn['cell'],
                                  dropout=wd_rnn['drop_out'],
                                  recurrent_dropout=wd_rnn['rnn_drop_out']),
                              merge_mode='concat',
                              name='word_Bi_GRU')(wd_embedding)
    if 'batch_norm' in wd_rnn:
        wd_Bi_GRU = BatchNormalization(momentum=wd_rnn['batch_norm'],
                                       name='word_Bi_GRU_BN')(wd_Bi_GRU)

    pos_rnn = params['pos']['RNN']
    # NOTE(review): this layer reuses the name 'word_Bi_GRU' (its batch norm
    # is 'pos_Bi_GRU_BN'), which looks like a copy-paste slip.  The name is
    # kept because renaming changes variable names and would break restoring
    # existing checkpoints -- confirm before changing.
    pos_Bi_GRU = Bidirectional(GRU(pos_rnn['cell'],
                                   dropout=pos_rnn['drop_out'],
                                   recurrent_dropout=pos_rnn['rnn_drop_out']),
                               merge_mode='concat',
                               name='word_Bi_GRU')(pos_embedding)
    if 'batch_norm' in pos_rnn:
        pos_Bi_GRU = BatchNormalization(momentum=pos_rnn['batch_norm'],
                                        name='pos_Bi_GRU_BN')(pos_Bi_GRU)

    # use pos as attention: a softmax gate multiplied element-wise into the
    # word encoding.
    attention_probs = Dense(2 * pos_rnn['cell'],
                            activation='softmax',
                            name='attention_vec')(pos_Bi_GRU)
    attention_mul = multiply([wd_Bi_GRU, attention_probs],
                             name='attention_mul')
    # ATTENTION PART FINISHES HERE

    # 3. middle layer for predict Y0
    self.Y0_probs = _prediction_head('Y0', pos_Bi_GRU)

    # 4. upper hidden layers
    # Treat the Y0 prediction as a middle feature and combine it with the
    # attended word encoding.
    combine_layer = concatenate([self.Y0_probs, attention_mul],
                                axis=-1,
                                name='combine_layer')

    hidden_layer = Dense(params['H']['dim'],
                         name='hidden_layer')(combine_layer)
    if 'batch_norm' in params['H']:
        hidden_layer = BatchNormalization(
            momentum=0.9, name='hidden_layer_BN')(hidden_layer)
    hidden_layer = Activation('relu')(hidden_layer)
    if 'drop_out' in params['H']:
        hidden_layer = Dropout(params['H']['drop_out'],
                               name='hidden_layer_dropout')(hidden_layer)

    # 5. layer for predict Y1
    self.Y1_probs = _prediction_head('Y1', hidden_layer)

    # 6. Calculate loss
    with tf.name_scope('loss'):
        Y0_loss = tf.reduce_mean(binary_crossentropy(self.Y0, self.Y0_probs),
                                 name='Y0_loss')
        Y1_loss = tf.reduce_mean(binary_crossentropy(self.Y1, self.Y1_probs),
                                 name='Y1_loss')
        self.loss = tf.add_n([Y0_loss, Y1_loss], name='loss')

    self.train_op = tf.train.RMSPropOptimizer(
        params['learning_rate']).minimize(self.loss)
def create_model(params, computed_params):
    """Build and compile the classifier that predicts the answer length.

    The model takes one question and ``max_nb_premises`` premises, each a
    sequence of word vectors.  Every sentence is encoded with shared layers,
    the encodings are concatenated and passed through up to three optional
    dense layers before a softmax over ``max_outputseq_len`` classes.

    Args:
        params: hyper-parameters.  Reads ``net_arch`` (``'lstm'`` or
            ``'lstm(cnn)'``), ``rnn_size``, ``nb_filters`` (CNN variant
            only), the optional ``units1`` .. ``units3`` and ``optimizer``.
        computed_params: data-derived sizes.  Reads ``max_inputseq_len``,
            ``word_dims``, ``max_outputseq_len`` and ``max_nb_premises``.

    Returns:
        The compiled Keras ``Model``.

    Raises:
        ValueError: if ``params['net_arch']`` is not a supported value.
    """
    logging.info('Constructing the NN model...')

    max_inputseq_len = computed_params['max_inputseq_len']
    word_dims = computed_params['word_dims']
    max_outputseq_len = computed_params['max_outputseq_len']
    max_nb_premises = computed_params['max_nb_premises']

    input_question = Input(shape=(max_inputseq_len, word_dims),
                           dtype='float32',
                           name='question')
    inputs = [input_question]

    for ipremise in range(max_nb_premises):
        input_premise = Input(shape=(max_inputseq_len, word_dims),
                              dtype='float32',
                              name='premise{}'.format(ipremise))
        inputs.append(input_premise)

    encoded = []
    net_arch = params['net_arch']

    if net_arch == 'lstm':
        # LSTM-based encoder that packs the words of a sentence into a single
        # vector.  The layer is shared by all input sentences.
        rnn_size = params['rnn_size']
        shared_words_rnn = Bidirectional(
            recurrent.LSTM(rnn_size,
                           input_shape=(max_inputseq_len, word_dims),
                           return_sequences=False))

        for sentence in inputs:
            encoded.append(shared_words_rnn(sentence))
    elif net_arch == 'lstm(cnn)':
        nb_filters = params['nb_filters']
        rnn_size = params['rnn_size']

        for kernel_size in range(1, 4):
            # Convolutional layers come first; they act as detectors of word
            # combinations and syntactic constructions.  One conv and one
            # LSTM per kernel size, shared by all input sentences.
            conv = Conv1D(filters=nb_filters,
                          kernel_size=kernel_size,
                          padding='valid',
                          activation='relu',
                          strides=1,
                          name='shared_conv_{}'.format(kernel_size))
            lstm = recurrent.LSTM(rnn_size, return_sequences=False)

            for sentence in inputs:
                conv_layer1 = conv(sentence)
                conv_layer1 = keras.layers.MaxPooling1D(
                    pool_size=kernel_size, strides=None,
                    padding='valid')(conv_layer1)
                conv_layer1 = lstm(conv_layer1)
                encoded.append(conv_layer1)
    else:
        # Without this the empty encoder list only fails later, inside
        # concatenate, with an unrelated-looking error.
        raise ValueError('Unknown net_arch: {!r}'.format(net_arch))

    encoder_merged = keras.layers.concatenate(inputs=list(encoded))

    # The final classifier determines the length of the answer.
    output_dims = max_outputseq_len

    decoder = encoder_merged
    # Optional hidden layers, applied in order when configured with a
    # positive size.
    for units_key in ('units1', 'units2', 'units3'):
        if units_key in params and params[units_key] > 0:
            decoder = Dense(units=params[units_key],
                            activation='relu')(decoder)

    decoder = Dense(output_dims, activation='softmax',
                    name='output')(decoder)

    model = Model(inputs=inputs, outputs=decoder)
    model.compile(loss='categorical_crossentropy',
                  optimizer=params['optimizer'],
                  metrics=['accuracy'])
    return model
For Keras internal compatibility checking
        """
        # The last axis depends on whether attention probabilities or decoded
        # outputs are returned.
        if self.return_probabilities:
            return (None, self.timesteps, self.timesteps)
        else:
            return (None, self.timesteps, self.output_dim)

    def get_config(self):
        """
            For rebuilding models on load time.

            Returns the parent layer's config extended with ``output_dim``,
            ``units`` and ``return_probabilities``.
        """
        config = {
            'output_dim': self.output_dim,
            'units': self.units,
            'return_probabilities': self.return_probabilities
        }
        base_config = super(AttentionDecoder, self).get_config()
        # Entries defined here override same-named entries of the base config.
        return dict(list(base_config.items()) + list(config.items()))


# check to see if it compiles
if __name__ == '__main__':
    from keras.layers import Input, LSTM
    from keras.models import Model
    from keras.layers.wrappers import Bidirectional
    # Smoke test: bidirectional LSTM encoder followed by the attention
    # decoder; only the model summary is printed, nothing is trained.
    i = Input(shape=(100, 104), dtype='float32')
    enc = Bidirectional(LSTM(64, return_sequences=True),
                        merge_mode='concat')(i)
    dec = AttentionDecoder(32, 4)(enc)
    model = Model(inputs=i, outputs=dec)
    model.summary()
conv = Conv1D(filters=nb_filters, kernel_size=kernel_size, padding='valid', activation='relu', strides=1) conv_layer = conv(words_input) conv_layer = GlobalMaxPooling1D()(conv_layer) convs.append(conv_layer) repr_size += nb_filters elif NET_ARCH == 'lstm+cnn': # энкодер на базе LSTM, на выходе которого получаем вектор с упаковкой слов # предложения. encoder_rnn = Bidirectional( recurrent.LSTM(rnn_size, input_shape=(max_inputseq_len, word_dims), return_sequences=False))(words_input) convs.append(encoder_rnn) repr_size += rnn_size * 2 # добавляем входы со сверточными слоями for kernel_size in range(2, 4): conv = Conv1D(filters=nb_filters, kernel_size=kernel_size, padding='valid', activation='relu', strides=1) conv_layer = conv(words_input) conv_layer = GlobalMaxPooling1D()(conv_layer)
# Embed both answer inputs with the pre-trained, frozen word vectors.  Two
# separate Embedding layers are created, both initialised from
# `word_embedding_matrix`.
A = Embedding(nb_words + 1,
              EMBEDDING_DIM,
              weights=[word_embedding_matrix],
              input_length=100,
              trainable=False)(Answer)
A2 = Embedding(nb_words + 1,
               EMBEDDING_DIM,
               weights=[word_embedding_matrix],
               input_length=100,
               trainable=False)(Answer2)

#e1_aligned, e2_aligned = align(q1, q2)
#q1 = concatenate([q1,e2_aligned])
#q2 = concatenate([q2,e1_aligned])

# One bidirectional LSTM is shared by both questions (`q1`, `q2` are defined
# outside this span) ...
Encoder = Bidirectional(LSTM(units=300, return_sequences=True))
q1_encoded = Dropout(DROPOUT)(Encoder(q1))
q2_encoded = Dropout(DROPOUT)(Encoder(q2))

# ... and a second one is shared by both answers.
Encoder_A = Bidirectional(LSTM(units=300, return_sequences=True))
A_encoded = Dropout(DROPOUT)(Encoder_A(A))
A2_encoded = Dropout(DROPOUT)(Encoder_A(A2))

# Pairwise alignments; the align* helpers are defined outside this span.
q1_aligned, q2_aligned = align(q1_encoded, q2_encoded)
A1_aligned, A2_aligned = align2(A_encoded, A2_encoded)
q1_A_aligned, A_q1_aligned = align_A(A_encoded, q1_encoded)
# NOTE(review): the result names refer to q2, but the second argument is
# q1_encoded -- possibly q2_encoded was intended; confirm before relying on
# this alignment.
q2_A_aligned, A_q2_aligned = align_B(A2_encoded, q1_encoded)

#q1_combined = concatenate([q1_encoded, q2_aligned, subtract(q1_encoded, q2_aligned), multiply([q1_encoded, q2_aligned]),q1_A_aligned])
#q2_combined = concatenate([q2_encoded, q1_aligned, subtract(q2_encoded, q1_aligned), multiply([q2_encoded, q1_aligned]),q2_A_aligned])
#A1_combined = concatenate([A_encoded, A2_aligned, subtract(A_encoded, A2_aligned), multiply([A_encoded, A2_aligned]),A_q1_aligned])
def build(self):
    """Build and compile the joint (multi-task, adversarial) training model.

    Intended design, as described by the original author: embedding ->
    BiLSTM semantic representation -> fully connected "compare" layer
    (self-attention and attention) -> BiLSTM alignment -> Dense -> softmax.
    The embedding, the first BiLSTM and the compare layer are shared and
    frozen. After the compare step the data flows through BiLSTMs that are
    private to the source task, shared by both tasks, and private to the
    target task.

    The compiled model has six outputs: two task classifiers
    (``taskAloss`` / ``taskBloss``), two domain-discriminator outputs
    (``GAN``, applied to each task's shared features) and two
    ``diff_loss`` outputs computed by ``self.diff_loss``.

    Returns:
        The compiled Keras ``Model``.
    """
    senA = Input(shape=(self.senMaxLen, ), name='senA')
    senB = Input(shape=(self.senMaxLen, ), name='senB')
    CharA = Input(shape=(self.senMaxLen, ), name='CharA')
    CharB = Input(shape=(self.senMaxLen, ), name='CharB')
    senA1 = Input(shape=(self.senMaxLen, ), name='senA1')
    senB1 = Input(shape=(self.senMaxLen, ), name='senB1')
    CharA1 = Input(shape=(self.senMaxLen, ), name='CharA1')
    CharB1 = Input(shape=(self.senMaxLen, ), name='CharB1')
    # NOTE(review): none of the tensors below are derived from these fresh
    # Input layers inside this method -- confirm how they are wired to the
    # base models' graphs.

    # Freeze the weights of the layers that precede the matching layer
    # (the first 26 layers of each base model).
    for index, (layerA, layerB) in enumerate(
            zip(self.basemodelA.layers, self.basemodelB.layers)):
        if index < 26:
            layerA.trainable = False
            layerB.trainable = False
        print(layerA.name)

    mergedVectorA = self.basemodelA.get_layer('mergedVectorA').output
    mergedVectorB = self.basemodelA.get_layer('mergedVectorB').output
    _mergedVectorA = self.basemodelB.get_layer('mergedVectorA').output
    # Fixed copy-paste slip: this previously fetched 'mergedVectorA' again,
    # so sentence B of the second base model was never used.
    _mergedVectorB = self.basemodelB.get_layer('mergedVectorB').output

    cross = self.basemodelA.get_layer('cross')
    cross.trainable = False

    mergedVectorA = TimeDistributed(cross)(mergedVectorA)
    mergedVectorA = TimeDistributed(BatchNormalization())(mergedVectorA)
    mergedVectorB = TimeDistributed(cross)(mergedVectorB)
    mergedVectorB = TimeDistributed(BatchNormalization())(mergedVectorB)
    # NOTE(review): for the second base model the order is BatchNorm first,
    # then cross (the reverse of the above). Kept as-is -- confirm intent.
    _mergedVectorA = TimeDistributed(BatchNormalization())(_mergedVectorA)
    _mergedVectorA = TimeDistributed(cross)(_mergedVectorA)
    _mergedVectorB = TimeDistributed(BatchNormalization())(_mergedVectorB)
    _mergedVectorB = TimeDistributed(cross)(_mergedVectorB)

    # Build the shared BiLSTM context layer.
    sharedBiLSTM = self.basemodelA.get_layer('bidirectional_2')
    # NOTE(review): the task-1 "private" BiLSTM is the very same layer
    # object as the shared one -- confirm this is intended.
    special1BiLSTM = self.basemodelA.get_layer('bidirectional_2')
    special2BiLSTM = Bidirectional(
        LSTM(units=jointTaskParamSetting.SharedTaskLSTMUnits,
             return_sequences=False,
             dropout=self.dropout,
             recurrent_dropout=self.dropout))

    # Shared encoder applied to both tasks' sentence pairs.
    sharedLSTMSenA = sharedBiLSTM(mergedVectorA)
    sharedLSTMSenB = sharedBiLSTM(mergedVectorB)
    _sharedLSTMSenA = sharedBiLSTM(_mergedVectorA)
    _sharedLSTMSenB = sharedBiLSTM(_mergedVectorB)

    # Task-1 private encoder. Sentence B now goes through special1BiLSTM
    # like sentence A (it is currently the same object as sharedBiLSTM, so
    # results are unchanged).
    specialLSTMSenA = special1BiLSTM(mergedVectorA)
    specialLSTMSenB = special1BiLSTM(mergedVectorB)

    # Task-2 private encoder.
    _specialLSTMSenA = special2BiLSTM(_mergedVectorA)
    _specialLSTMSenB = special2BiLSTM(_mergedVectorB)

    # Merge private and shared encodings into the per-task inputs.
    task1Input = concatenate(
        [specialLSTMSenA, specialLSTMSenB, sharedLSTMSenA, sharedLSTMSenB],
        axis=-1,
        name="taskInput1")
    task2Input = concatenate([
        _specialLSTMSenA, _specialLSTMSenB, _sharedLSTMSenA, _sharedLSTMSenB
    ],
                             axis=-1,
                             name="taskInput2")

    # Task-specific dense heads.
    SpecialTask1 = Dense(jointTaskParamSetting.SpecialTaskAUnits,
                         activation="relu")(task1Input)
    SpecialTask1 = Dropout(self.dropout)(SpecialTask1)
    SpecialTask1 = BatchNormalization()(SpecialTask1)

    SpecialTask2 = Dense(jointTaskParamSetting.SpecialTaskBUnits,
                         activation="relu")(task2Input)
    SpecialTask2 = Dropout(self.dropout)(SpecialTask2)
    SpecialTask2 = BatchNormalization()(SpecialTask2)

    # Shared features feeding a classifier that decides whether the data
    # came from the source or the target task. See the adversarial-loss
    # section of "Adversarial Multi-task Learning for Text Classification".
    sharedDenseLayer = Dense(units=jointTaskParamSetting.SharedTaskUnits,
                             activation='relu')
    SharedTask1 = sharedDenseLayer(
        concatenate([sharedLSTMSenA, sharedLSTMSenB],
                    axis=-1,
                    name='share1Input'))
    SharedTask1 = Dropout(self.dropout)(SharedTask1)
    SharedTask1 = BatchNormalization()(SharedTask1)

    SharedTask2 = sharedDenseLayer(
        concatenate([_sharedLSTMSenA, _sharedLSTMSenB],
                    axis=-1,
                    name='share2Input'))
    SharedTask2 = Dropout(self.dropout)(SharedTask2)
    SharedTask2 = BatchNormalization()(SharedTask2)

    # feature1 drives the task-1 loss; task 1 is the original (source) task.
    feature1 = concatenate([SpecialTask1, SharedTask1], axis=-1)
    logits1 = Dense(2, activation="softmax", name="taskAloss")(feature1)

    # feature2 drives the task-2 loss; task 2 is the target task.
    feature2 = concatenate([SpecialTask2, SharedTask2], axis=-1)
    logits2 = Dense(2, activation="softmax", name="taskBloss")(feature2)

    # Adversarial (GAN) loss: one discriminator shared by both tasks.
    ganLayer = Dense(2, activation='softmax', name='GAN')
    tasklabel1 = ganLayer(SharedTask1)
    tasklabel2 = ganLayer(SharedTask2)

    # Covariance-matrix based loss, computed by self.diff_loss.
    diff_loss = Lambda(self.diff_loss, name="diff_loss")
    dif1 = diff_loss(task1Input)
    dif2 = diff_loss(task2Input)

    # Outputs:
    #   logits1 / logits2       -- class predictions of task 1 / task 2
    #   tasklabel1 / tasklabel2 -- discriminator output on the shared
    #                              features of task 1 / task 2
    #   dif1 / dif2             -- diff_loss output of task 1 / task 2
    myModel = Model(
        inputs=[senA, senB, CharA, CharB, senA1, senB1, CharA1, CharB1],
        outputs=[logits1, logits2, tasklabel1, tasklabel2, dif1, dif2])
    myModel.compile(optimizer="adam",
                    loss={
                        "taskAloss": "mse",
                        "taskBloss": "mse",
                        "GAN": "mse",
                        "diff_loss": self.sumloss
                    },
                    loss_weights={
                        "taskAloss": 1.,
                        "taskBloss": .2,
                        "GAN": .5,
                        "diff_loss": 1
                    },
                    metrics=["accuracy"])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line.
    myModel.summary()
    return myModel
def _build_model(self,
                 input_size,
                 stacked_sizes=None,
                 fully_connected_sizes=None,
                 optimizer_name=None,
                 learning_rate=None,
                 decay=None,
                 gpus=0,
                 custom_batch_size=None):
    """
    Assemble and compile the Keras Sequential Bi-LSTM network.

    The network is one Bi-LSTM layer (with dropout), optional further
    Bi-LSTM layers, optional time-distributed sigmoid Dense layers and a
    final time-distributed single-unit sigmoid Dense layer.

    :param input_size: Number of features per timestep
    :param stacked_sizes: Unit counts of extra Bi-LSTM layers appended after
                          the first one (None means no extra layers)
    :param fully_connected_sizes: Unit counts of extra time-distributed Dense
                                  layers appended after the Bi-LSTM layers
                                  (None means no extra layers)
    :param optimizer_name: Keras optimizer name, 'adam' when None
    :param learning_rate: Learning rate forwarded to the optimizer as ``lr``
    :param decay: Decay forwarded to the optimizer as ``decay``
    :param gpus: Number of gpus; values above 1 raise NotImplementedError
    :param custom_batch_size: Batch size overriding self.batch_size
    :return: Compiled Keras Sequential model
    """
    from keras.layers.core import Dense
    from keras.layers.recurrent import LSTM
    from keras.layers.wrappers import TimeDistributed, Bidirectional
    from keras.models import Sequential
    from keras import optimizers

    extra_recurrent = [] if stacked_sizes is None else stacked_sizes
    extra_dense = [] if fully_connected_sizes is None else fully_connected_sizes

    network = Sequential()

    # Input Bi-LSTM: the only layer that carries dropout and fixes the
    # batch input shape (variable sequence length).
    first_lstm = LSTM(units=self.hidden_size,
                      return_sequences=True,
                      dropout=0.2,
                      recurrent_dropout=0.2,
                      stateful=self.stateful)
    first_shape = (custom_batch_size or self.batch_size, None, input_size)
    network.add(Bidirectional(layer=first_lstm, batch_input_shape=first_shape))

    for units in extra_recurrent:
        stacked_lstm = LSTM(units=units,
                            return_sequences=True,
                            stateful=self.stateful)
        network.add(Bidirectional(layer=stacked_lstm))

    for units in extra_dense:
        network.add(TimeDistributed(Dense(units, activation='sigmoid')))

    # Per-timestep output in [0, 1].
    network.add(TimeDistributed(Dense(1, activation='sigmoid')))

    # Multi-GPU wrapping (multi_gpu_model) is deliberately disabled.
    if gpus > 1:
        raise NotImplementedError(
            "Multi GPU model not implemented due to input size mismatch.")

    opt_name = "adam" if optimizer_name is None else optimizer_name

    # Only forward the settings the caller actually supplied.
    candidate_args = (('lr', learning_rate), ('decay', decay))
    opt_kwargs = {
        key: value for key, value in candidate_args if value is not None
    }

    if opt_name == 'adam':
        optimizer = optimizers.Adam(**opt_kwargs)
    elif not opt_kwargs:
        # Without custom settings Keras can resolve the optimizer by name.
        optimizer = opt_name
    else:
        raise ValueError(
            'Optimizer {} not implemented for custom params yet'.format(
                opt_name))

    print('Using optimizer', opt_name, opt_kwargs)

    network.compile(loss=self.loss,
                    optimizer=optimizer,
                    sample_weight_mode='temporal',
                    metrics=["accuracy", precision, recall, auc_roc])
    return network
def __init__(self,
             dim,
             batch_norm,
             dropout,
             rec_dropout,
             header,
             task,
             mask_demographics,
             target_repl=False,
             deep_supervision=False,
             num_classes=1,
             depth=1,
             input_dim=94,
             size_coef=4,
             **kwargs):
    """Build the per-channel LSTM network and initialise the Keras model.

    Every channel found in ``header`` is sliced out of the masked input and
    run through its own stack of ``depth`` LSTM layers; the results are
    concatenated, passed through ``depth - 1`` wider LSTM layers and a final
    LSTM, and mapped to ``num_classes`` outputs.

    Args:
        dim: Units of each per-channel LSTM (halved per direction when
            bidirectional).
        batch_norm: Stored on the instance; not otherwise used here.
        dropout: Input dropout of every LSTM; also applied as a Dropout
            layer before the output when greater than 0.
        rec_dropout: Recurrent dropout of every LSTM.
        header: Column names of the input features.
        task: One of 'decomp', 'ihm', 'ph', 'los'; selects the final
            activation.
        mask_demographics: Demographic groups ('Gender', 'Ethnicity',
            'Insurance') removed from the input; shrinks ``input_dim``.
        target_repl: Emit a per-timestep output in addition to the last one.
        deep_supervision: Add a mask input ``M`` and use unidirectional
            LSTMs.
        num_classes: Size of the output layer.
        depth: Number of LSTM layers per channel.
        input_dim: Feature count before demographics are removed.
        size_coef: Width multiplier of the LSTMs after concatenation.
        **kwargs: Ignored; their keys are printed.

    Raises:
        ValueError: If ``task`` is not a supported value.
    """
    self.dim = dim
    self.batch_norm = batch_norm
    self.dropout = dropout
    self.rec_dropout = rec_dropout
    self.depth = depth
    self.size_coef = size_coef

    # (0) Demographics: number of input columns dropped and short tag for
    #     each maskable group.
    demographic_columns = {
        'Gender': (5, 'GEN'),
        'Ethnicity': (6, 'ETH'),
        'Insurance': (7, 'INS'),
    }
    retained = ['GEN', 'ETH', 'INS']
    for masked in mask_demographics:
        if masked in demographic_columns:
            width, tag = demographic_columns[masked]
            input_dim -= width
            retained.remove(tag)
    if not retained:
        retained.append("NONE")
    self._included = retained

    # (1) Final activation depends on the task.
    if task in ('decomp', 'ihm', 'ph'):
        final_activation = 'sigmoid'
    elif task in ('los',):
        final_activation = 'relu' if num_classes == 1 else 'softmax'
    else:
        raise ValueError("Wrong value for task")

    print("==> not used params in network class:", kwargs.keys())

    # (2) Channel names: skip "mask->" columns and cut one-hot encoded
    #     columns at the first "->".
    channel_names = sorted(
        {col.partition("->")[0] for col in header if "mask->" not in col})
    self.channel_names = channel_names
    print("==> excluded demographics:", mask_demographics)
    print("==> found {} channels: {}".format(len(channel_names),
                                             channel_names))

    # Each channel is the list of header positions whose column name
    # contains the channel name.
    channels = [[idx for idx, col in enumerate(header) if name in col]
                for name in channel_names]

    # (3) Input layer and masking.
    X = Input(shape=(None, input_dim), name='X')
    inputs = [X]
    mX = Masking()(X)

    # (4) Deep supervision adds a mask input and disables bidirectionality.
    if deep_supervision:
        M = Input(shape=(None, ), name='M')
        inputs.append(M)
    is_bidirectional = not deep_supervision

    def make_lstm(units, return_sequences=True):
        # All recurrent layers share activation and dropout settings.
        return LSTM(units=units,
                    activation='tanh',
                    return_sequences=return_sequences,
                    dropout=dropout,
                    recurrent_dropout=rec_dropout)

    def apply_recurrent(units, tensor):
        # A bidirectional layer splits the unit budget between directions.
        if is_bidirectional:
            return Bidirectional(make_lstm(units // 2))(tensor)
        return make_lstm(units)(tensor)

    # (5) Slice every channel first, then run each through its own stack.
    sliced = [Slice(columns)(mX) for columns in channels]
    processed = []
    for channel_tensor in sliced:
        for _ in range(depth):
            channel_tensor = apply_recurrent(dim, channel_tensor)
        processed.append(channel_tensor)

    # (6) Concatenate the processed channels along the feature axis.
    Z = Concatenate(axis=2)(processed)

    # (7) Main part of the network.
    for _ in range(depth - 1):
        Z = apply_recurrent(int(size_coef * dim), Z)

    # (8) Output LSTM; keeps the sequence only when per-timestep outputs
    #     are needed.
    keep_sequence = (target_repl or deep_supervision)
    L = make_lstm(int(size_coef * dim), return_sequences=keep_sequence)(Z)

    # (9) Optional dropout before the output layer.
    if dropout > 0:
        L = Dropout(dropout)(L)

    # (10) Output heads.
    if target_repl:
        y = TimeDistributed(Dense(num_classes, activation=final_activation),
                            name='seq')(L)
        y_last = LastTimestep(name='single')(y)
        outputs = [y_last, y]
    elif deep_supervision:
        y = TimeDistributed(Dense(num_classes,
                                  activation=final_activation))(L)
        y = ExtendMask()([y, M])  # combines y with the mask input M
        outputs = [y]
    else:
        y = Dense(num_classes, activation=final_activation)(L)
        outputs = [y]

    # (11) Hand the assembled graph to the Keras Model constructor.
    super(Network, self).__init__(inputs=inputs, outputs=outputs)