def run_experiment(max_len, dropout_rate, n_layers): global dataset, train_ids, valid_ids, test_ids, mode, task, val_method, val_mode, use_PCA # For PCA if set to True visual_components = 25 audio_components = 20 text_components = 110 nodes = 100 epochs = 200 outfile = "MOSI_sweep/int_" + mode + "_" + str(task) + "_" + str( n_layers) + "_" + str(max_len) + "_" + str(dropout_rate) experiment_prefix = "intermediate" batch_size = 64 logs_path = "regression_logs/" experiment_name = "{}_n_{}_dr_{}_nl_{}_ml_{}".format( experiment_prefix, nodes, dropout_rate, n_layers, max_len) # sort through all the video ID, segment ID pairs train_set_ids = [] for vid in train_ids: for sid in dataset['embeddings'][vid].keys(): if mode == "all" or mode == "AV": if dataset['embeddings'][vid][sid] and dataset['facet'][vid][ sid] and dataset['covarep'][vid][sid]: train_set_ids.append((vid, sid)) if mode == "AT" or mode == "A": if dataset['embeddings'][vid][sid] and dataset['covarep'][vid][ sid]: train_set_ids.append((vid, sid)) if mode == "VT" or mode == "V": if dataset['embeddings'][vid][sid] and dataset['facet'][vid][ sid]: train_set_ids.append((vid, sid)) if mode == "T": if dataset['embeddings'][vid][sid]: train_set_ids.append((vid, sid)) valid_set_ids = [] for vid in valid_ids: for sid in dataset['embeddings'][vid].keys(): if mode == "all" or mode == "AV": if dataset['embeddings'][vid][sid] and dataset['facet'][vid][ sid] and dataset['covarep'][vid][sid]: valid_set_ids.append((vid, sid)) if mode == "AT" or mode == "A": if dataset['embeddings'][vid][sid] and dataset['covarep'][vid][ sid]: valid_set_ids.append((vid, sid)) if mode == "VT" or mode == "V": if dataset['embeddings'][vid][sid] and dataset['facet'][vid][ sid]: valid_set_ids.append((vid, sid)) if mode == "T": if dataset['embeddings'][vid][sid]: valid_set_ids.append((vid, sid)) test_set_ids = [] for vid in test_ids: if vid in dataset['embeddings']: for sid in dataset['embeddings'][vid].keys(): if mode == "all" or mode == "AV": if dataset['embeddings'][vid][sid] and dataset['facet'][ vid][sid] and dataset['covarep'][vid][sid]: test_set_ids.append((vid, sid)) if mode == "AT" or mode == "A": if dataset['embeddings'][vid][sid] and dataset['covarep'][ vid][sid]: test_set_ids.append((vid, sid)) if mode == "VT" or mode == "V": if dataset['embeddings'][vid][sid] and dataset['facet'][ vid][sid]: test_set_ids.append((vid, sid)) if mode == "T": if dataset['embeddings'][vid][sid]: test_set_ids.append((vid, sid)) # partition the training, valid and test set. all sequences will be padded/truncated to 15 steps # data will have shape (dataset_size, max_len, feature_dim) if mode == "all" or mode == "AV" or mode == "AT": train_set_audio = np.stack([ pad(dataset['covarep'][vid][sid], max_len) for (vid, sid) in train_set_ids if dataset['covarep'][vid][sid] ], axis=0) valid_set_audio = np.stack([ pad(dataset['covarep'][vid][sid], max_len) for (vid, sid) in valid_set_ids if dataset['covarep'][vid][sid] ], axis=0) test_set_audio = np.stack([ pad(dataset['covarep'][vid][sid], max_len) for (vid, sid) in test_set_ids if dataset['covarep'][vid][sid] ], axis=0) if mode == "all" or mode == "VT" or mode == "AV": train_set_visual = np.stack([ pad(dataset['facet'][vid][sid], max_len) for (vid, sid) in train_set_ids if dataset['facet'][vid][sid] ], axis=0) valid_set_visual = np.stack([ pad(dataset['facet'][vid][sid], max_len) for (vid, sid) in valid_set_ids if dataset['facet'][vid][sid] ], axis=0) test_set_visual = np.stack([ pad(dataset['facet'][vid][sid], max_len) for (vid, sid) in test_set_ids if dataset['facet'][vid][sid] ], axis=0) if mode == "all" or mode == "VT" or mode == "AT": train_set_text = np.stack([ pad(dataset['embeddings'][vid][sid], max_len) for (vid, sid) in train_set_ids if dataset['embeddings'][vid][sid] ], axis=0) valid_set_text = np.stack([ pad(dataset['embeddings'][vid][sid], max_len) for (vid, sid) in valid_set_ids if dataset['embeddings'][vid][sid] ], axis=0) test_set_text = np.stack([ pad(dataset['embeddings'][vid][sid], max_len) for (vid, sid) in test_set_ids if dataset['embeddings'][vid][sid] ], axis=0) if task == "SB": # binarize the sentiment scores for binary classification task y_train = np.array( [sentiments[vid][sid] for (vid, sid) in train_set_ids]) > 0 y_valid = np.array( [sentiments[vid][sid] for (vid, sid) in valid_set_ids]) > 0 y_test = np.array( [sentiments[vid][sid] for (vid, sid) in test_set_ids]) > 0 if task == "SR": y_train = np.array( [sentiments[vid][sid] for (vid, sid) in train_set_ids]) y_valid = np.array( [sentiments[vid][sid] for (vid, sid) in valid_set_ids]) y_test = np.array( [sentiments[vid][sid] for (vid, sid) in test_set_ids]) if task == "S5": y_train1 = np.array( [sentiments[vid][sid] for (vid, sid) in train_set_ids]) y_valid1 = np.array( [sentiments[vid][sid] for (vid, sid) in valid_set_ids]) y_test1 = np.array( [sentiments[vid][sid] for (vid, sid) in test_set_ids]) y_train = convert_S5_hot(y_train1) y_valid = convert_S5_hot(y_valid1) y_test = convert_S5_hot(y_test1) # normalize covarep and facet features, remove possible NaN values if mode == "all" or mode == "AV" or mode == "VT": visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0) visual_max[visual_max == 0] = 1 # if the maximum is 0 we don't normalize train_set_visual = train_set_visual / visual_max valid_set_visual = valid_set_visual / visual_max test_set_visual = test_set_visual / visual_max train_set_visual[train_set_visual != train_set_visual] = 0 valid_set_visual[valid_set_visual != valid_set_visual] = 0 test_set_visual[test_set_visual != test_set_visual] = 0 if mode == "all" or mode == "AT" or mode == "AV": audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0) train_set_audio = train_set_audio / audio_max valid_set_audio = valid_set_audio / audio_max test_set_audio = test_set_audio / audio_max train_set_audio[train_set_audio != train_set_audio] = 0 valid_set_audio[valid_set_audio != valid_set_audio] = 0 test_set_audio[test_set_audio != test_set_audio] = 0 if use_PCA == True: if mode == "all" or mode == "AV" or mode == "VT": nsamples1, nx1, ny1 = train_set_visual.shape train_set_visual = train_set_visual.reshape(nsamples1 * nx1, ny1) nsamples2, nx2, ny2 = valid_set_visual.shape valid_set_visual = valid_set_visual.reshape(nsamples2 * nx2, ny2) nsamples3, nx3, ny3 = test_set_visual.shape test_set_visual = test_set_visual.reshape(nsamples3 * nx3, ny3) pca = decomposition.PCA(n_components=visual_components) train_set_visual_pca = pca.fit_transform(train_set_visual) valid_set_visual_pca = pca.transform(valid_set_visual) test_set_visual_pca = pca.transform(test_set_visual) train_set_visual = train_set_visual_pca.reshape( nsamples1, nx1, visual_components) valid_set_visual = valid_set_visual_pca.reshape( nsamples2, nx2, visual_components) test_set_visual = test_set_visual_pca.reshape( nsamples3, nx3, visual_components) if mode == "all" or mode == "AT" or mode == "AV": nsamples1, nx1, ny1 = train_set_audio.shape train_set_audio = train_set_audio.reshape(nsamples1 * nx1, ny1) nsamples2, nx2, ny2 = valid_set_audio.shape valid_set_audio = valid_set_audio.reshape(nsamples2 * nx2, ny2) nsamples3, nx3, ny3 = test_set_audio.shape test_set_audio = test_set_audio.reshape(nsamples3 * nx3, ny3) pca = decomposition.PCA(n_components=audio_components) train_set_audio_pca = pca.fit_transform(train_set_audio) valid_set_audio_pca = pca.transform(valid_set_audio) test_set_audio_pca = pca.transform(test_set_audio) train_set_audio = train_set_audio_pca.reshape( nsamples1, nx1, audio_components) valid_set_audio = valid_set_audio_pca.reshape( nsamples2, nx2, audio_components) test_set_audio = test_set_audio_pca.reshape( nsamples3, nx3, audio_components) if mode == "all" or mode == "AT" or mode == "VT": nsamples1, nx1, ny1 = train_set_text.shape train_set_text = train_set_text.reshape(nsamples1 * nx1, ny1) nsamples2, nx2, ny2 = valid_set_text.shape valid_set_text = valid_set_text.reshape(nsamples2 * nx2, ny2) nsamples3, nx3, ny3 = test_set_text.shape test_set_text = test_set_text.reshape(nsamples3 * nx3, ny3) pca = decomposition.PCA(n_components=text_components) train_set_text_pca = pca.fit_transform(train_set_text) valid_set_text_pca = pca.transform(valid_set_text) test_set_text_pca = pca.transform(test_set_text) train_set_text = train_set_text_pca.reshape( nsamples1, nx1, text_components) valid_set_text = valid_set_text_pca.reshape( nsamples2, nx2, text_components) test_set_text = test_set_text_pca.reshape(nsamples3, nx3, text_components) k = 3 m = 2 if task == "SB": val_method = "val_acc" val_mode = "max" emote_final = 'sigmoid' if task == "SR": val_method = "val_loss" val_mode = "min" emote_final = 'linear' if task == "S5": val_method = "val_acc" val_mode = "max" emote_final = 'softmax' model = Sequential() # AUDIO if mode == "all" or mode == "AT" or mode == "AV": model1_in = Input(shape=(max_len, train_set_audio.shape[2])) model1_cnn = Conv1D(filters=64, kernel_size=k, activation='relu')(model1_in) model1_mp = MaxPooling1D(m)(model1_cnn) model1_fl = Flatten()(model1_mp) model1_dropout = Dropout(dropout_rate)(model1_fl) model1_dense = Dense(nodes, activation="relu")(model1_dropout) for i in range(2, n_layers + 1): model1_dropout = Dropout(dropout_rate)(model1_dense) model1_dense = Dense(nodes, activation="relu")(model1_dropout) # TEXT = BLSTM from unimodal if mode == "all" or mode == "AT" or mode == "VT": model2_in = Input(shape=(max_len, train_set_text.shape[2])) model2_lstm = Bidirectional(LSTM(64))(model2_in) model2_dropout = Dropout(dropout_rate)(model2_lstm) model2_dense = Dense(nodes, activation="relu")(model2_dropout) for i in range(2, n_layers + 1): model2_dropout = Dropout(dropout_rate)(model2_dense) model2_dense = Dense(nodes, activation="relu")(model2_dropout) # VIDEO - CNN from unimodal if mode == "all" or mode == "AV" or mode == "VT": model3_in = Input(shape=(max_len, train_set_visual.shape[2])) model3_cnn = Conv1D(filters=64, kernel_size=k, activation='relu')(model3_in) model3_mp = MaxPooling1D(m)(model3_cnn) model3_fl = Flatten()(model3_mp) model3_dropout = Dropout(dropout_rate)(model3_fl) model3_dense = Dense(nodes, activation="relu")(model3_dropout) for i in range(2, n_layers + 1): model3_dropout = Dropout(dropout_rate)(model3_dense) model3_dense = Dense(nodes, activation="relu")(model3_dropout) if mode == "all": concatenated = concatenate([model1_dense, model2_dense, model3_dense]) if mode == "AV": concatenated = concatenate([model1_dense, model3_dense]) if mode == "AT": concatenated = concatenate([model1_dense, model2_dense]) if mode == "VT": concatenated = concatenate([model2_dense, model3_dense]) dense = Dense(200, activation='relu')(concatenated) dense2 = Dense(200, activation='relu')(dense) if task == "SR": out = Dense(1, activation=emote_final)(dense2) if task == "SB": out = Dense(1, activation=emote_final)(dense2) if task == "S5": out = Dense(5, activation=emote_final)(dense2) if mode == "all": merged_model = Model([model1_in, model2_in, model3_in], out) if mode == "AV": merged_model = Model([model1_in, model3_in], out) if mode == "AT": merged_model = Model([model1_in, model2_in], out) if mode == "VT": merged_model = Model([model2_in, model3_in], out) if task == "SB": merged_model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) if task == "S5": merged_model.compile('adam', 'binary_crossentropy', metrics=['accuracy']) if task == "SR": merged_model.compile('adam', loss='mean_absolute_error') if mode == "all": x_train = [train_set_audio, train_set_text, train_set_visual] x_valid = [valid_set_audio, valid_set_text, valid_set_visual] x_test = [test_set_audio, test_set_text, test_set_visual] if mode == "AV": x_train = [train_set_audio, train_set_visual] x_valid = [valid_set_audio, valid_set_visual] x_test = [test_set_audio, test_set_visual] if mode == "AT": x_train = [train_set_audio, train_set_text] x_valid = [valid_set_audio, valid_set_text] x_test = [test_set_audio, test_set_text] if mode == "VT": x_train = [train_set_text, train_set_visual] x_valid = [valid_set_text, valid_set_visual] x_test = [test_set_text, test_set_visual] early_stopping = EarlyStopping(monitor=val_method, min_delta=0, patience=10, verbose=1, mode=val_mode) callbacks_list = [early_stopping] merged_model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=[x_valid, y_valid], callbacks=callbacks_list) preds = merged_model.predict(x_test) out = open(outfile, "wb") print "testing output before eval metrics calcs.." print y_test[0] print preds[0] if task == "SR": preds = np.concatenate(preds) mae = sklearn.metrics.mean_absolute_error(y_test, preds) r = scipy.stats.pearsonr(y_test, preds) out.write("Test MAE: " + str(mae) + "\n") out.write("Test CORR: " + str(r) + "\n") if task == "S5": preds = convert_pred_hot(preds) acc = sklearn.metrics.accuracy_score(y_test, preds) out.write("Test ACC: " + str(acc) + "\n") if task == "SB": acc = np.mean((preds > 0.5) == y_test.reshape(-1, 1)) preds = np.concatenate(preds) preds = preds > 0.5 f1 = sklearn.metrics.f1_score(y_test, preds) out.write("Test ACC: " + str(acc) + "\n") out.write("Test F1: " + str(f1) + "\n") out.write("use_PCA=" + str(use_PCA) + "\n") out.write("dropout_rate=" + str(dropout_rate) + "\n") out.write("n_layers=" + str(n_layers) + "\n") out.write("max_len=" + str(max_len) + "\n") out.write("nodes=" + str(nodes) + "\n") out.write("task=" + str(task) + "\n") out.write("mode=" + str(mode) + "\n") out.write("num_train=" + str(len(train_set_ids)) + "\n") out.write("num_valid=" + str(len(valid_set_ids)) + "\n") out.write("num_test=" + str(len(test_set_ids)) + "\n") out.close()
model_cpu=Model(inputs=[q1_input,q2_input],outputs=output) model=multi_gpu_model(model_cpu,gpus=4) model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) model.fit([padded_q1,padded_q2],y_train,epochs=2,batch_size=128,shuffle=True) score=model.evaluate([test_padded_q1,test_padded_q2],y_test,verbose=1) print(score[1]) print(x_test_q1[0]+" "+x_test_q2[0]) prediction=model.predict([test_padded_q1,test_padded_q2]) count=0 output=open("wrong_prediction1.txt","a") for p in prediction: if p[0]<0.5 and y_test[count]==1: output.write(x_test_q1[count]+" "+x_test_q2[count]+"\t"+"Pred:"+str(p[0])+"True:"+str(y_test[count])+"\n") elif p[0]>=0.5 and y_test[count]==0: output.write(x_test_q1[count]+" "+x_test_q2[count]+"\t"+"Pred:"+str(p[0])+"True:"+str(y_test[count])+"\n") count+=1 output.close()