# The embedding layer (constructed elsewhere) maps vocab indices into
# embedding_dims-dimensional dense vectors.
model.add(embedding_layer)

# Hyperparameter-sweep results kept for reference:
# {'Conv1D': 128, 'batch_size': 96, 'Conv1D_3': 4, 'Dropout_1': 0.3, 'Dropout': 0.4, 'Conv1D_1': 4, 'Conv1D_2': 64, 'Dense': 32}
# {'Conv1D_1': 8, 'batch_size': 96, 'Dense': 32, 'Conv1D': 16, 'Dropout': 0.28192006496913374}
# {'Conv1D': 128, 'Dropout': 0.37678000665362027, 'Conv1D_1': 12, 'batch_size': 32, 'Dense': 32}

# Conv1D learns word-group filters of width 6, pooled globally over time,
# then a small fully-connected head squashed to a single sigmoid unit.
for layer in (
        Conv1D(32,
               6,
               padding='valid',
               activation='relu',
               kernel_regularizer=regularizers.l2(l=0.01),
               strides=1),
        GlobalMaxPooling1D(),
        Dense(64),
        Dropout(0.5),  # 0.1
        Activation('relu'),
        Dense(1),
        Activation('sigmoid'),
):
    model.add(layer)
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM, Bidirectional
from keras.layers import Conv1D, MaxPooling1D
from keras import optimizers

# Hyperparameters for the Conv1D -> BiLSTM binary classifier.
max_features = 26
embedding_size = 256
kernel_size = 5
filters = 250
pool_size = 2
lstm_output_size = 64

# Embedding -> dropout -> temporal convolution -> pooling -> BiLSTM -> sigmoid.
model = Sequential([
    Embedding(max_features, embedding_size),
    Dropout(0.2),
    Conv1D(filters, kernel_size, padding='valid', activation='relu',
           strides=1),
    MaxPooling1D(pool_size=pool_size),
    Bidirectional(LSTM(lstm_output_size)),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer=optimizers.Adam(),
              metrics=['acc'])
y_kmeans_test[0:100]

# # CNN

# In[217]:

from keras.layers import Input, Dense, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dropout

# Reshape the flat 720-sample signals to (timesteps, channels) for Conv1D.
# assumes X_train has 3147 rows and X_test 1350 rows of length 720 — TODO confirm
X_train = X_train.reshape(3147, 720, 1)
#print(X_train.shape[1])
X_test = X_test.reshape(1350, 720, 1)

model_scratch = Sequential()
# Four conv/pool/dropout stages; only the very first conv declares input_shape.
for stage, n_filters in enumerate((64, 64, 64, 128)):
    if stage == 0:
        model_scratch.add(
            Conv1D(n_filters,
                   3,
                   activation='relu',
                   input_shape=X_train.shape[1:]))
    else:
        model_scratch.add(Conv1D(n_filters, 3, activation='relu'))
    model_scratch.add(MaxPooling1D(pool_size=2))
    model_scratch.add(Dropout(0.25))
def run_experiment(max_len, dropout_rate, n_layers):
    """Train and evaluate a late-fusion multimodal sentiment model on MOSI.

    The modality combination (`mode`), prediction task (`task`) and the data
    splits all come from module-level globals.  Per-modality sub-networks
    (audio CNN, text BiLSTM, visual CNN) are fused by concatenating their
    output heads; results are written to a file under MOSI_sweep/.

    Parameters: max_len - pad/truncate length for every sequence;
    dropout_rate - dropout applied in each sub-network; n_layers - only used
    in the output file name (not in the architecture).
    """
    global dataset, train_ids, valid_ids, test_ids, mode, task, val_method, val_mode, use_PCA
    # for PCA if set to True
    visual_components = 25
    audio_components = 20
    text_components = 110
    nodes = 100
    epochs = 200
    outfile = "MOSI_sweep/late_" + mode + "_" + str(task) + "_" + str(
        n_layers) + "_" + str(max_len) + "_" + str(dropout_rate)
    experiment_prefix = "late"
    batch_size = 64
    logs_path = "regression_logs/"
    experiment_name = "{}_n_{}_dr_{}_nl_{}_ml_{}".format(
        experiment_prefix, nodes, dropout_rate, n_layers, max_len)

    # sort through all the video ID, segment ID pairs: keep only segments for
    # which every modality required by `mode` has a non-empty feature entry.
    # NOTE(review): modes "A", "V", "T" collect ids here but no feature
    # matrices or models are ever built for them below — presumably dead
    # options; verify against callers.
    train_set_ids = []
    for vid in train_ids:
        for sid in dataset['embeddings'][vid].keys():
            if mode == "all" or mode == "AV":
                if dataset['embeddings'][vid][sid] and dataset['facet'][vid][
                        sid] and dataset['covarep'][vid][sid]:
                    train_set_ids.append((vid, sid))
            if mode == "AT" or mode == "A":
                if dataset['embeddings'][vid][sid] and dataset['covarep'][vid][
                        sid]:
                    train_set_ids.append((vid, sid))
            if mode == "VT" or mode == "V":
                if dataset['embeddings'][vid][sid] and dataset['facet'][vid][
                        sid]:
                    train_set_ids.append((vid, sid))
            if mode == "T":
                if dataset['embeddings'][vid][sid]:
                    train_set_ids.append((vid, sid))

    valid_set_ids = []
    for vid in valid_ids:
        for sid in dataset['embeddings'][vid].keys():
            if mode == "all" or mode == "AV":
                if dataset['embeddings'][vid][sid] and dataset['facet'][vid][
                        sid] and dataset['covarep'][vid][sid]:
                    valid_set_ids.append((vid, sid))
            if mode == "AT" or mode == "A":
                if dataset['embeddings'][vid][sid] and dataset['covarep'][vid][
                        sid]:
                    valid_set_ids.append((vid, sid))
            if mode == "VT" or mode == "V":
                if dataset['embeddings'][vid][sid] and dataset['facet'][vid][
                        sid]:
                    valid_set_ids.append((vid, sid))
            if mode == "T":
                if dataset['embeddings'][vid][sid]:
                    valid_set_ids.append((vid, sid))

    test_set_ids = []
    for vid in test_ids:
        # NOTE(review): only the test split guards against vids missing from
        # the embeddings dict — the train/valid loops above would KeyError.
        if vid in dataset['embeddings']:
            for sid in dataset['embeddings'][vid].keys():
                if mode == "all" or mode == "AV":
                    if dataset['embeddings'][vid][sid] and dataset['facet'][
                            vid][sid] and dataset['covarep'][vid][sid]:
                        test_set_ids.append((vid, sid))
                if mode == "AT" or mode == "A":
                    if dataset['embeddings'][vid][sid] and dataset['covarep'][
                            vid][sid]:
                        test_set_ids.append((vid, sid))
                if mode == "VT" or mode == "V":
                    if dataset['embeddings'][vid][sid] and dataset['facet'][
                            vid][sid]:
                        test_set_ids.append((vid, sid))
                if mode == "T":
                    if dataset['embeddings'][vid][sid]:
                        test_set_ids.append((vid, sid))

    # partition the training, valid and test set. all sequences will be
    # padded/truncated to max_len steps
    # data will have shape (dataset_size, max_len, feature_dim)
    if mode == "all" or mode == "AV" or mode == "AT":
        train_set_audio = np.stack([
            pad(dataset['covarep'][vid][sid], max_len)
            for (vid, sid) in train_set_ids if dataset['covarep'][vid][sid]
        ], axis=0)
        valid_set_audio = np.stack([
            pad(dataset['covarep'][vid][sid], max_len)
            for (vid, sid) in valid_set_ids if dataset['covarep'][vid][sid]
        ], axis=0)
        test_set_audio = np.stack([
            pad(dataset['covarep'][vid][sid], max_len)
            for (vid, sid) in test_set_ids if dataset['covarep'][vid][sid]
        ], axis=0)
    if mode == "all" or mode == "VT" or mode == "AV":
        train_set_visual = np.stack([
            pad(dataset['facet'][vid][sid], max_len)
            for (vid, sid) in train_set_ids if dataset['facet'][vid][sid]
        ], axis=0)
        valid_set_visual = np.stack([
            pad(dataset['facet'][vid][sid], max_len)
            for (vid, sid) in valid_set_ids if dataset['facet'][vid][sid]
        ], axis=0)
        test_set_visual = np.stack([
            pad(dataset['facet'][vid][sid], max_len)
            for (vid, sid) in test_set_ids if dataset['facet'][vid][sid]
        ], axis=0)
    if mode == "all" or mode == "VT" or mode == "AT":
        train_set_text = np.stack([
            pad(dataset['embeddings'][vid][sid], max_len)
            for (vid, sid) in train_set_ids if dataset['embeddings'][vid][sid]
        ], axis=0)
        valid_set_text = np.stack([
            pad(dataset['embeddings'][vid][sid], max_len)
            for (vid, sid) in valid_set_ids if dataset['embeddings'][vid][sid]
        ], axis=0)
        test_set_text = np.stack([
            pad(dataset['embeddings'][vid][sid], max_len)
            for (vid, sid) in test_set_ids if dataset['embeddings'][vid][sid]
        ], axis=0)

    # Build the labels per task: SB = binary sentiment, SR = regression on
    # the raw score, S5 = 5-class one-hot via convert_S5_hot.
    if task == "SB":
        # binarize the sentiment scores for binary classification task
        y_train = np.array(
            [sentiments[vid][sid] for (vid, sid) in train_set_ids]) > 0
        y_valid = np.array(
            [sentiments[vid][sid] for (vid, sid) in valid_set_ids]) > 0
        y_test = np.array(
            [sentiments[vid][sid] for (vid, sid) in test_set_ids]) > 0
    if task == "SR":
        y_train = np.array(
            [sentiments[vid][sid] for (vid, sid) in train_set_ids])
        y_valid = np.array(
            [sentiments[vid][sid] for (vid, sid) in valid_set_ids])
        y_test = np.array(
            [sentiments[vid][sid] for (vid, sid) in test_set_ids])
    if task == "S5":
        y_train1 = np.array(
            [sentiments[vid][sid] for (vid, sid) in train_set_ids])
        y_valid1 = np.array(
            [sentiments[vid][sid] for (vid, sid) in valid_set_ids])
        y_test1 = np.array(
            [sentiments[vid][sid] for (vid, sid) in test_set_ids])
        y_train = convert_S5_hot(y_train1)
        y_valid = convert_S5_hot(y_valid1)
        y_test = convert_S5_hot(y_test1)

    # normalize covarep and facet features, remove possible NaN values
    # (x != x is only true for NaN, so these assignments zero out NaNs)
    if mode == "all" or mode == "AV" or mode == "VT":
        visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0)
        visual_max[visual_max == 0] = 1  # if the maximum is 0 we don't normalize
        train_set_visual = train_set_visual / visual_max
        valid_set_visual = valid_set_visual / visual_max
        test_set_visual = test_set_visual / visual_max
        train_set_visual[train_set_visual != train_set_visual] = 0
        valid_set_visual[valid_set_visual != valid_set_visual] = 0
        test_set_visual[test_set_visual != test_set_visual] = 0
    if mode == "all" or mode == "AT" or mode == "AV":
        # NOTE(review): unlike visual_max above, audio_max has no zero-guard,
        # so an all-zero audio feature column divides by zero here — the
        # resulting NaNs are zeroed just below, but inf values would survive.
        audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0)
        train_set_audio = train_set_audio / audio_max
        valid_set_audio = valid_set_audio / audio_max
        test_set_audio = test_set_audio / audio_max
        train_set_audio[train_set_audio != train_set_audio] = 0
        valid_set_audio[valid_set_audio != valid_set_audio] = 0
        test_set_audio[test_set_audio != test_set_audio] = 0

    # Optional PCA: flatten (samples, steps, feats) -> (samples*steps, feats),
    # fit on train only, project valid/test, then restore the 3-D shape with
    # the reduced feature dimension.
    if use_PCA == True:
        if mode == "all" or mode == "AV" or mode == "VT":
            nsamples1, nx1, ny1 = train_set_visual.shape
            train_set_visual = train_set_visual.reshape(nsamples1 * nx1, ny1)
            nsamples2, nx2, ny2 = valid_set_visual.shape
            valid_set_visual = valid_set_visual.reshape(nsamples2 * nx2, ny2)
            nsamples3, nx3, ny3 = test_set_visual.shape
            test_set_visual = test_set_visual.reshape(nsamples3 * nx3, ny3)
            pca = decomposition.PCA(n_components=visual_components)
            train_set_visual_pca = pca.fit_transform(train_set_visual)
            valid_set_visual_pca = pca.transform(valid_set_visual)
            test_set_visual_pca = pca.transform(test_set_visual)
            train_set_visual = train_set_visual_pca.reshape(
                nsamples1, nx1, visual_components)
            valid_set_visual = valid_set_visual_pca.reshape(
                nsamples2, nx2, visual_components)
            test_set_visual = test_set_visual_pca.reshape(
                nsamples3, nx3, visual_components)
        if mode == "all" or mode == "AT" or mode == "AV":
            nsamples1, nx1, ny1 = train_set_audio.shape
            train_set_audio = train_set_audio.reshape(nsamples1 * nx1, ny1)
            nsamples2, nx2, ny2 = valid_set_audio.shape
            valid_set_audio = valid_set_audio.reshape(nsamples2 * nx2, ny2)
            nsamples3, nx3, ny3 = test_set_audio.shape
            test_set_audio = test_set_audio.reshape(nsamples3 * nx3, ny3)
            pca = decomposition.PCA(n_components=audio_components)
            train_set_audio_pca = pca.fit_transform(train_set_audio)
            valid_set_audio_pca = pca.transform(valid_set_audio)
            test_set_audio_pca = pca.transform(test_set_audio)
            train_set_audio = train_set_audio_pca.reshape(
                nsamples1, nx1, audio_components)
            valid_set_audio = valid_set_audio_pca.reshape(
                nsamples2, nx2, audio_components)
            test_set_audio = test_set_audio_pca.reshape(
                nsamples3, nx3, audio_components)
        if mode == "all" or mode == "AT" or mode == "VT":
            nsamples1, nx1, ny1 = train_set_text.shape
            train_set_text = train_set_text.reshape(nsamples1 * nx1, ny1)
            nsamples2, nx2, ny2 = valid_set_text.shape
            valid_set_text = valid_set_text.reshape(nsamples2 * nx2, ny2)
            nsamples3, nx3, ny3 = test_set_text.shape
            test_set_text = test_set_text.reshape(nsamples3 * nx3, ny3)
            pca = decomposition.PCA(n_components=text_components)
            train_set_text_pca = pca.fit_transform(train_set_text)
            valid_set_text_pca = pca.transform(valid_set_text)
            test_set_text_pca = pca.transform(test_set_text)
            train_set_text = train_set_text_pca.reshape(
                nsamples1, nx1, text_components)
            valid_set_text = valid_set_text_pca.reshape(
                nsamples2, nx2, text_components)
            test_set_text = test_set_text_pca.reshape(nsamples3, nx3,
                                                      text_components)

    # k = CNN kernel size, m = max-pooling size for the audio/visual branches.
    k = 3
    m = 2

    # Per-task output head and early-stopping configuration.
    if task == "SB":
        val_method = "val_acc"
        val_mode = "max"
        emote_final = 'sigmoid'
        last_node = 1
    if task == "SR":
        val_method = "val_loss"
        val_mode = "min"
        emote_final = 'linear'
        last_node = 1
    if task == "S5":
        val_method = "val_acc"
        val_mode = "max"
        emote_final = 'softmax'
        last_node = 5

    # NOTE(review): this Sequential model is never used — merged_model below
    # is built with the functional API instead. Dead code.
    model = Sequential()

    # AUDIO
    if mode == "all" or mode == "AT" or mode == "AV":
        model1_in = Input(shape=(max_len, train_set_audio.shape[2]))
        model1_cnn = Conv1D(filters=64, kernel_size=k,
                            activation='relu')(model1_in)
        model1_mp = MaxPooling1D(m)(model1_cnn)
        model1_fl = Flatten()(model1_mp)
        model1_dropout = Dropout(dropout_rate)(model1_fl)
        model1_dense = Dense(nodes, activation="relu")(model1_dropout)
        model1_out = Dense(last_node, activation=emote_final)(model1_dense)

    # TEXT = BLSTM from unimodal
    if mode == "all" or mode == "AT" or mode == "VT":
        model2_in = Input(shape=(max_len, train_set_text.shape[2]))
        model2_lstm = Bidirectional(LSTM(64))(model2_in)
        model2_dropout = Dropout(dropout_rate)(model2_lstm)
        model2_dense = Dense(nodes, activation="relu")(model2_dropout)
        model2_out = Dense(last_node, activation=emote_final)(model2_dense)

    # VIDEO - CNN from unimodal
    if mode == "all" or mode == "AV" or mode == "VT":
        model3_in = Input(shape=(max_len, train_set_visual.shape[2]))
        model3_cnn = Conv1D(filters=64, kernel_size=k,
                            activation='relu')(model3_in)
        model3_mp = MaxPooling1D(m)(model3_cnn)
        model3_fl = Flatten()(model3_mp)
        model3_dropout = Dropout(dropout_rate)(model3_fl)
        model3_dense = Dense(nodes, activation="relu")(model3_dropout)
        model3_out = Dense(last_node, activation=emote_final)(model3_dense)

    # Late fusion: concatenate the per-modality output heads and add one
    # final Dense decision layer.
    if mode == "all":
        concatenated = concatenate([model1_out, model2_out, model3_out])
    if mode == "AV":
        concatenated = concatenate([model1_out, model3_out])
    if mode == "AT":
        concatenated = concatenate([model1_out, model2_out])
    if mode == "VT":
        concatenated = concatenate([model2_out, model3_out])
    out = Dense(last_node, activation=emote_final)(concatenated)

    if mode == "all":
        merged_model = Model([model1_in, model2_in, model3_in], out)
    if mode == "AV":
        merged_model = Model([model1_in, model3_in], out)
    if mode == "AT":
        merged_model = Model([model1_in, model2_in], out)
    if mode == "VT":
        merged_model = Model([model2_in, model3_in], out)

    if task == "SB":
        merged_model.compile('adam',
                             'binary_crossentropy',
                             metrics=['accuracy'])
    if task == "S5":
        # NOTE(review): S5 is a 5-way softmax but is compiled with
        # binary_crossentropy — categorical_crossentropy looks intended;
        # confirm before changing.
        merged_model.compile('adam',
                             'binary_crossentropy',
                             metrics=['accuracy'])
    if task == "SR":
        merged_model.compile('adam', loss='mean_absolute_error')

    # Input lists must match the Model input order chosen above.
    if mode == "all":
        x_train = [train_set_audio, train_set_text, train_set_visual]
        x_valid = [valid_set_audio, valid_set_text, valid_set_visual]
        x_test = [test_set_audio, test_set_text, test_set_visual]
    if mode == "AV":
        x_train = [train_set_audio, train_set_visual]
        x_valid = [valid_set_audio, valid_set_visual]
        x_test = [test_set_audio, test_set_visual]
    if mode == "AT":
        x_train = [train_set_audio, train_set_text]
        x_valid = [valid_set_audio, valid_set_text]
        x_test = [test_set_audio, test_set_text]
    if mode == "VT":
        x_train = [train_set_text, train_set_visual]
        x_valid = [valid_set_text, valid_set_visual]
        x_test = [test_set_text, test_set_visual]

    early_stopping = EarlyStopping(monitor=val_method,
                                   min_delta=0,
                                   patience=10,
                                   verbose=1,
                                   mode=val_mode)
    callbacks_list = [early_stopping]
    merged_model.fit(x_train,
                     y_train,
                     batch_size=batch_size,
                     epochs=epochs,
                     validation_data=[x_valid, y_valid],
                     callbacks=callbacks_list)
    preds = merged_model.predict(x_test)

    # NOTE(review): `out` below shadows the Dense output tensor above; also
    # "wb" + str writes only works on Python 2 (this file uses Python 2
    # print statements throughout).
    out = open(outfile, "wb")
    print "testing output before eval metrics calcs.."
    print y_test[0]
    print preds[0]
    if task == "SR":
        preds = np.concatenate(preds)
        mae = sklearn.metrics.mean_absolute_error(y_test, preds)
        r = scipy.stats.pearsonr(y_test, preds)
        out.write("Test MAE: " + str(mae) + "\n")
        out.write("Test CORR: " + str(r) + "\n")
    if task == "S5":
        preds = convert_pred_hot(preds)
        acc = sklearn.metrics.accuracy_score(y_test, preds)
        out.write("Test ACC: " + str(acc) + "\n")
    if task == "SB":
        acc = np.mean((preds > 0.5) == y_test.reshape(-1, 1))
        preds = np.concatenate(preds)
        preds = preds > 0.5
        f1 = sklearn.metrics.f1_score(y_test, preds)
        out.write("Test ACC: " + str(acc) + "\n")
        out.write("Test F1: " + str(f1) + "\n")
    # Record the experiment configuration alongside the metrics.
    out.write("use_PCA=" + str(use_PCA) + "\n")
    out.write("dropout_rate=" + str(dropout_rate) + "\n")
    out.write("n_layers=" + str(n_layers) + "\n")
    out.write("max_len=" + str(max_len) + "\n")
    out.write("nodes=" + str(nodes) + "\n")
    out.write("task=" + str(task) + "\n")
    out.write("mode=" + str(mode) + "\n")
    out.write("num_train=" + str(len(train_set_ids)) + "\n")
    out.write("num_valid=" + str(len(valid_set_ids)) + "\n")
    out.write("num_test=" + str(len(test_set_ids)) + "\n")
    out.close()
# Split labels for this fold and one-hot encode the training labels
# (labels are 1-based, hence the -1 shift into the identity matrix).
y_train, y_test = y_values[train_index], y_values[test_index]
y_train = np.eye(n_classes)[y_train - 1]
print X_train.shape

# # input layer
# assumes signals are univariate time series of length signal_rows — TODO confirm
input_signal = Input(shape=(signal_rows, 1))
print K.int_shape(input_signal)

# define initial parameters
b_init = Constant(value=0.0)
k_init = TruncatedNormal(mean=0.0, stddev=0.01, seed=2018)

# first feature extractor: two wide (kernel 32) valid-padding Conv1D +
# BatchNorm + ReLU stages, flattened for a later merge.
# NOTE(review): flat1 is presumably concatenated with the other extractors
# further down the file (outside this view) — do not rename these tensors.
conv11 = Conv1D(16,
                kernel_size=32,
                strides=1,
                padding='valid',
                bias_initializer=b_init,
                kernel_initializer=k_init)(input_signal)
bn11 = BatchNormalization()(conv11)
actv11 = Activation('relu')(bn11)
conv12 = Conv1D(32,
                kernel_size=32,
                strides=1,
                padding='valid',
                bias_initializer=b_init,
                kernel_initializer=k_init)(actv11)
bn12 = BatchNormalization()(conv12)
actv12 = Activation('relu')(bn12)
flat1 = Flatten()(actv12)

# second feature extractor
def create(cls, embedSize, vocabSize, paddedSentSize, recurrentSize=None):
    """Build the sentence-pair similarity network.

    Returns a `cls` instance carrying three Keras models: `trainer`
    (scores a sentence pair with MAE loss) plus `embedder_a`/`embedder_b`
    (the per-sentence embedding sub-models).
    """
    if not recurrentSize:
        recurrentSize = embedSize

    sentenceAInput = Input(shape=(paddedSentSize, vocabSize))
    # maskA = Masking(mask_value=0.0)(sentenceAInput)
    sentenceBInput = Input(shape=(paddedSentSize, vocabSize))
    # maskB = Masking(mask_value=0.0)(sentenceBInput)
    normal = keras.initializers.glorot_normal()

    # Branch A: three Conv1D layers (with MaxPooling1D before the last one)
    # followed by a three-layer relu Dense embedding head.
    # NOTE(review): branch B below has no pooling step, so the two branches
    # are NOT symmetric — preserved exactly as in the original; confirm
    # whether the missing MaxPooling1D on branch B is intentional.
    a = Conv1D(recurrentSize, 5)(sentenceAInput)
    a = Conv1D(recurrentSize, 5)(a)
    a = Conv1D(recurrentSize, 5)(MaxPooling1D()(a))
    # a = Flatten()(a)
    a = Dense(embedSize, kernel_initializer=normal, activation="relu")(a)
    a = Dense(embedSize, kernel_initializer=normal, activation="relu")(a)
    sentenceAEmbedded_built = Dense(embedSize,
                                    kernel_initializer=normal,
                                    activation="relu")(a)

    # Branch B: same layer stack minus the pooling step (see note above).
    b = Conv1D(recurrentSize, 5)(sentenceBInput)
    b = Conv1D(recurrentSize, 5)(b)
    b = Conv1D(recurrentSize, 5)(b)
    # b = Flatten()(b)
    b = Dense(embedSize, kernel_initializer=normal, activation="relu")(b)
    b = Dense(embedSize, kernel_initializer=normal, activation="relu")(b)
    sentenceBEmbedded_built = Dense(embedSize,
                                    kernel_initializer=normal,
                                    activation="relu")(b)

    # Combining/Output: join the two embeddings along the time axis, run a
    # two-layer LSTM over the joint sequence, and regress a single score.
    joined = Concatenate(axis=1)(
        [sentenceAEmbedded_built, sentenceBEmbedded_built])
    seq = LSTM(recurrentSize * 2, return_sequences=True)(joined)
    seq = LSTM(recurrentSize * 2)(seq)
    combined = Dense(embedSize, kernel_initializer=normal,
                     activation="relu")(seq)
    score_built = Dense(1, kernel_initializer=normal,
                        activation="relu")(combined)

    trainer = Model(inputs=[sentenceAInput, sentenceBInput],
                    outputs=score_built)
    trainer.compile(Adam(lr=4e-4), 'mae')

    engine = cls()
    engine.trainer = trainer
    engine.embedder_a = Model(inputs=sentenceAInput,
                              outputs=sentenceAEmbedded_built)
    engine.embedder_b = Model(inputs=sentenceBInput,
                              outputs=sentenceBEmbedded_built)
    return engine
# pdb.set_trace()

# Embedding -> strided Conv1D -> CuDNNLSTM -> 12-way softmax classifier.
model = Sequential([
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    Embedding(len(word_index) + 1,
              EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH),
    # stride-10 convolution downsamples the sequence 10x before the LSTM
    Conv1D(int(EMBEDDING_DIM / 4),
           10,
           padding='same',
           activation='relu',
           strides=10),
    # model.add(Dropout(0.2))
    CuDNNLSTM(int(EMBEDDING_DIM / 4)),
    Dense(12, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(x_train, y_train, epochs=5, validation_data=(x_val, y_val))
model.save('cnn.h5')

print("Step 5: testing model...")