def infer_embedding(self, partition):
    """Infer document embeddings for one partition using the trained model.

    Reads the pre-processed transcripts of `partition` ('train' or 'dev'),
    infers a vector per document with the trained doc2vec model, and saves
    the vectors plus their mania-level labels under `self.save_dir`.
    """
    _, _, level_dev, level_train = load_label()
    labels = {'train': level_train, 'dev': level_dev}

    docs = []
    with smart_open(self.data_config['transcript_preproc'][partition],
                    'rb', encoding='utf-8') as transcripts:
        for idx, raw_line in enumerate(transcripts):
            docs.append(self.interview_transcript(
                gensim.utils.to_unicode(raw_line).split(),  # tokenized words
                [idx],                                      # tag = line number
                [labels[partition][idx]]))                  # mania level

    vectors = [self.model.infer_vector(doc.words, alpha=.1) for doc in docs]
    targets = [doc.sentiment for doc in docs]

    # persist inferred vectors and labels next to the trained model
    print("\nsaving inferred vectors and labels to file")
    if os.path.isdir(self.save_dir):
        np.save(os.path.join(self.save_dir, 'vectors_%s' % partition), vectors)
        np.save(os.path.join(self.save_dir, 'labels_%s' % partition), targets)
def prepare_data(self, corpus):
    """Assemble the training document list for doc2vec.

    Loads the labelled train/dev transcripts into `self.all_docs`;
    when `corpus` is truthy, also appends the unlabelled Turkish corpus.
    """
    _, _, level_dev, level_train = load_label()
    labels = {'train': level_train, 'dev': level_dev}

    # evaluate without test partition as there is no label
    for partition in ('train', 'dev'):
        with smart_open(self.data_config['transcript_preproc'][partition],
                        'rb', encoding='utf-8') as data:
            for idx, raw in enumerate(data):
                words = gensim.utils.to_unicode(raw).split()
                # partition is never 'test' here, so a label always exists
                sentiment = [labels[partition][idx]] if partition != 'test' else [None]
                self.all_docs.append(
                    self.interview_transcript(words, [idx], sentiment))

    # use additional Turkish corpus for a performance boost (unlabelled)
    if corpus:
        with smart_open(self.data_config['turkish_corpus_proc'],
                        'rb', encoding='utf-8') as data:
            for idx, raw in enumerate(data):
                words = gensim.utils.to_unicode(raw).split()
                self.all_docs.append(
                    self.interview_transcript(words, [idx], [None]))
def RF(self):
    """Run Random Forest on RF-selected fused features with doc2vec embeddings.

    Reads the fused feature list, then for each feature set trains a
    RandomForest on the saved train/dev matrices and reports UAR on both
    splits. The whole experiment is repeated 3 times to average out the
    forest's randomness.
    """
    print(
        "\nrunning RF on features selected with RF with doc2vec embeddings"
    )
    # FIX: use a context manager so the feature-list handle is closed
    # (the original smart_open handle was never closed)
    with smart_open('./pre-trained/fusion/feature_list.txt', 'rb',
                    encoding='utf-8') as feature_path:
        feature_list = [str(line).replace('\n', '') for line in feature_path]
    for _ in range(3):
        for feature in feature_list:
            _, _, y_dev, y_train = load_label()
            y_train = y_train.astype('int')
            y_dev = y_dev.astype('int')
            X_train = np.load(
                os.path.join('pre-trained', 'fusion', feature, 'X_train.npy'))
            X_dev = np.load(
                os.path.join('pre-trained', 'fusion', feature, 'X_dev.npy'))
            random_forest = RandomForest(feature, X_train, y_train, X_dev,
                                         y_dev, test=False)
            random_forest.run()
            y_pred_train, y_pred_dev = random_forest.evaluate()
            # report UAR on the training set, then on the development set
            get_UAR(y_pred_train, y_train, np.array([]), 'RF', feature,
                    'multiple', train_set=True, test=False)
            get_UAR(y_pred_dev, y_dev, np.array([]), 'RF', feature,
                    'multiple', test=False)
def test_multi_task_dnn(self):
    """Smoke-test MultiTaskDNN: build, train and evaluate on BoAW features."""
    # frame-level baseline features plus session instance mappings
    X_train, y_train, inst_train, X_dev, y_dev, inst_dev = load_proc_baseline_feature('BoAW', verbose=True)
    # NOTE(review): unpack order here is (ymrs_dev, ymrs_train, ...) while
    # preproc_baseline_feature uses (ymrs_train, ymrs_dev, ...) — confirm
    # which order load_label actually returns
    ymrs_dev, ymrs_train, _, _ = load_label()
    self.assertEqual(X_train.shape[1], X_dev.shape[1])
    # number of classes inferred from the largest label in either split
    num_classes = max(max(y_train), max(y_dev))
    test_dnn = MultiTaskDNN('BoAW', X_train.shape[1], num_classes)
    # expand session-level YMRS scores into frame-level regression targets
    y_dev_r = test_dnn.prepare_regression_label(ymrs_dev.values[:, 1], inst_dev)
    y_train_r = test_dnn.prepare_regression_label(ymrs_train.values[:, 1], inst_train)
    self.assertEqual(len(y_dev_r), len(y_dev))
    self.assertEqual(len(y_train_r), len(y_train))
    test_dnn.build_model()
    test_dnn.train_model(X_train, y_train, y_train_r, X_dev, y_dev, y_dev_r)
    test_dnn.evaluate_model(X_train, y_train, y_train_r, X_dev, y_dev, y_dev_r)
def get_late_fusion_UAR(model_name, feature_name_1, feature_name_2, baseline=False):
    """apply late fusion strategy on posterior probabilities of two modalities
    ---
    # para model_name: str
        given model name
    # para feature_name_1: str
        given 1st feature name
    # para feature_name_2: str
        given 2nd feature name
    # para baseline: bool
        whether to get baseline performance or not
    """
    prob_dev_1 = load_post_probability(model_name, feature_name_1)
    prob_dev_2 = load_post_probability(model_name, feature_name_2)
    # both posterior matrices are (num_classes, num_inst), e.g. (3, 60)
    assert prob_dev_1.shape == prob_dev_2.shape

    _, _, level_dev, _ = load_label()
    y_dev = level_dev.values[:, 1]

    # fuse by summing the two modalities' posteriors per instance and
    # picking the best-scoring class; labels are 1-based
    fused = prob_dev_1 + prob_dev_2
    y_pred = np.argmax(fused, axis=0) + 1

    get_UAR(y_pred, y_dev, np.array([]), model_name,
            feature_name_1 + feature_name_2, 'multiple',
            baseline=baseline, fusion=True)
def DNN(self):
    """Train/evaluate a DNN on the first RF-selected fused feature set."""
    print(
        "\nrunning Multi-Task DNN on features selected with RF with doc2vec embeddings"
    )
    # NOTE(review): this handle is never closed — consider a with-statement
    feature_path = smart_open('./pre-trained/fusion/feature_list.txt',
                              'rb',
                              encoding='utf-8')
    feature_list = []
    for _, line in enumerate(feature_path):
        feature_list.append(str(line).replace('\n', ''))
    # only the first fused feature set is used here
    feature = feature_list[0]
    X_train = np.load(
        os.path.join('pre-trained', 'fusion', feature, 'X_train_tree.npy'))
    X_dev = np.load(
        os.path.join('pre-trained', 'fusion', feature, 'X_dev_tree.npy'))
    # first two returns are used as regression targets (presumably YMRS
    # scores), last two as class labels — confirm load_label's return order
    y_dev_r, y_train_r, y_dev, y_train = load_label()
    y_train = y_train.astype('int')
    y_dev = y_dev.astype('int')
    num_classes = 3
    # NOTE(review): 'if False' permanently disables the multi-task branch,
    # so only SingleTaskDNN ever runs — confirm this is intentional
    if False:
        multi_dnn = MultiTaskDNN(feature, X_train.shape[1], num_classes)
        multi_dnn.build_model()
        multi_dnn.train_model(X_train, y_train, y_train_r, X_dev, y_dev,
                              y_dev_r)
        multi_dnn.evaluate_model(X_train, y_train, y_train_r, X_dev, y_dev,
                                 y_dev_r)
    else:
        single_dnn = SingleTaskDNN(feature, X_train.shape[1], num_classes)
        single_dnn.build_model()
        single_dnn.train_model(X_train, y_train, X_dev, y_dev)
        single_dnn.evaluate_model(X_dev, y_dev)
def get_UAR(y_pred,
            y_dev,
            inst,
            model_name,
            feature_name,
            modality,
            frame=True,
            session=True,
            baseline=False,
            train_set=False,
            fusion=False,
            test=False):
    """ get UAR metric for both frame-level and session-level
    ---
    # para y_pred: np.array() predicted mania level for each frame
    # para y_dev: np.array() actual mania level for each frame
    # para inst: np.array() session mappings of frames (empty array means
        the predictions are already session-level, e.g. AU features)
    # para model_name: str given model name
    # para feature_name: str given feature name
    # para modality: str either single or multiple
    # para frame: bool whether to get frame-level UAR or not
    # para session: bool whether to get session-level UAR or not
    # para baseline: bool whether to get baseline performance or not
    # para train_set: bool whether to get UAR on training set or not
    # para fusion: bool whether to fuse UAR or not
    # para test: bool whether to save UAR results
    """
    frame_res, session_res, precision, fscore = 0.0, 0.0, 0.0, 0.0
    modality = 'baseline' if baseline else modality
    # UAR for session-level only (AU features): empty `inst` means y_pred
    # already holds one prediction per session
    if not inst.any():
        # get recalls for the three mania classes (labels are 1..3)
        recall = [0] * 3
        for i in range(3):
            index, = np.where(y_dev == (i + 1))
            index_pred, = np.where(y_pred[index] == (i + 1))
            recall[i] = len(index_pred) / len(index)  # TP / (TP + FN)
        session_res = np.mean(recall)
        # persist raw predictions for later late-fusion
        np.save(
            os.path.join('pre-trained', 'baseline',
                         '%s_%s_results.npy' % (model_name, feature_name)),
            y_pred)
        if not fusion:
            if train_set:
                print(
                    "\nUAR (mean of recalls) using %s feature based on session-level (training set) is %.3f and %.3f (sklearn)"
                    % (feature_name, session_res,
                       recall_score(y_dev, y_pred, average='macro')))
            else:
                print(
                    "\nUAR (mean of recalls) using %s feature based on session-level (development set) is %.3f and %.3f (sklearn)"
                    % (feature_name, session_res,
                       recall_score(y_dev, y_pred, average='macro')))
            if not test:
                # prefer sklearn's macro recall as the recorded result
                session_res = recall_score(y_dev, y_pred, average='macro')
                precision, _, fscore, _ = precision_recall_fscore_support(
                    y_dev, y_pred, average='macro')
                save_UAR_results(frame_res, session_res, precision, fscore,
                                 model_name, feature_name, modality)
            print(
                classification_report(
                    y_dev,
                    y_pred,
                    target_names=['depression', 'hypo-mania', 'mania']))
        else:
            print(
                "\nUAR (mean of recalls) using fusion based on session-level is %.3f and %.3f"
                % (session_res, recall_score(y_dev, y_pred,
                                             average='macro')))
            if not test:
                session_res = recall_score(y_dev, y_pred, average='macro')
                precision, _, fscore, _ = precision_recall_fscore_support(
                    y_dev, y_pred, average='macro')
                save_UAR_results(frame_res, session_res, precision, fscore,
                                 model_name, 'fusion', modality)
    else:
        # UAR for frame-level predictions
        if frame:
            # get recalls for the three mania classes
            recall = [0] * 3
            for i in range(3):
                index, = np.where(y_dev == (i + 1))
                index_pred, = np.where(y_pred[index] == (i + 1))
                recall[i] = len(index_pred) / len(index)  # TP / (TP + FN)
            frame_res = np.mean(recall)
            if train_set:
                print(
                    "\nUAR (mean of recalls) using %s feature based on frame-level (training set) is %.3f"
                    % (feature_name, frame_res))
            else:
                print(
                    "\nUAR (mean of recalls) using %s feature based on frame-level (development set) is %.3f"
                    % (feature_name, frame_res))
            print(
                classification_report(
                    y_dev,
                    y_pred,
                    target_names=['depression', 'hypo-mania', 'mania']))
        # UAR for session-level: majority vote over the frames of a session
        if session:
            # one decision slot per session id (inst values are 1-based)
            decision = np.array(([0] * inst.max()))
            for j in range(len(decision)):
                index, = np.where(inst == (j + 1))
                count = [0] * 3
                for k in range(3):
                    index_pred, = np.where(y_pred[index] == (k + 1))
                    count[k] = len(index_pred)
                decision[j] = np.argmax(count) + 1
            np.save(
                os.path.join('pre-trained', 'baseline',
                             '%s_%s_results.npy' % (model_name, feature_name)),
                decision)
            # get recalls for three classes against the dev labels
            # NOTE(review): labels are always taken from level_dev, even when
            # train_set is True — confirm this is intended
            recall = [0] * 3
            _, _, level_dev, _ = load_label()
            labels = level_dev
            labels = np.array(labels, dtype=np.int8)
            for i in range(3):
                index, = np.where(labels == (i + 1))
                index_pred, = np.where(decision[index] == (i + 1))
                recall[i] = len(index_pred) / len(index)  # TP / (TP + FN)
            session_res = np.mean(recall)
            if train_set:
                print(
                    "\nUAR (mean of recalls) using %s feature based on session-level (training set) is %.3f"
                    % (feature_name, session_res))
            else:
                print(
                    "\nUAR (mean of recalls) using %s feature based on session-level (development set) is %.3f"
                    % (feature_name, session_res))
            if not train_set and not test:
                precision, _, fscore, _ = precision_recall_fscore_support(
                    y_dev, y_pred, average='macro')
                save_UAR_results(frame_res, session_res, precision, fscore,
                                 model_name, feature_name, modality)
    return frame_res, session_res
def test_load_label(self):
    """Check that load_label returns level frames whose 2nd column is usable."""
    _, _, y_dev, y_train = load_label(verbose=True)
    # second column holds the mania level per session
    y_train = y_train.values[:, 1]
    y_dev = y_dev.values[:, 1]
def RF_CV(self):
    """10-fold cross-validated Random Forest on RF-selected fused features.

    Pools the saved train+dev matrices per feature set, re-splits them with
    k_fold_cv, and appends per-feature UAR (macro recall) and UAP (macro
    precision) lists to results/cross-validation.json.
    """
    from sklearn.metrics import precision_recall_fscore_support
    print(
        "\nrunning RF on features selected with RF with doc2vec embeddings"
    )
    # FIX: use a context manager so the feature-list handle is closed
    # (the original smart_open handle was never closed)
    with smart_open('./pre-trained/fusion/feature_list.txt', 'rb',
                    encoding='utf-8') as feature_path:
        feature_list = [str(line).replace('\n', '') for line in feature_path]
    cv_results_UAR = dict()
    cv_results_UAP = dict()
    for feature in feature_list:
        cv_results_UAR[feature] = []
        cv_results_UAP[feature] = []
        _, _, y_dev, y_train = load_label()
        y_train = y_train.astype('int')
        y_dev = y_dev.astype('int')
        X_train = np.load(
            os.path.join('pre-trained', 'fusion', feature, 'X_train.npy'))
        X_dev = np.load(
            os.path.join('pre-trained', 'fusion', feature, 'X_dev.npy'))
        # pool train + dev, then re-split via 10-fold cross-validation
        X = np.vstack((X_train, X_dev))
        y = np.hstack((y_train, y_dev))
        cv_ids = k_fold_cv(len(X))
        for cv_id in cv_ids:
            X_train = X[cv_id[0]]
            y_train = y[cv_id[0]]
            X_dev = X[cv_id[1]]
            y_dev = y[cv_id[1]]
            print('train on %d test on %d' % (len(y_train), len(y_dev)))
            random_forest = RandomForest(feature, X_train, y_train, X_dev,
                                         y_dev, test=False)
            random_forest.run()
            _, y_pred = random_forest.evaluate()
            precision, recall, _, _ = precision_recall_fscore_support(
                y_dev, y_pred, average='macro')
            cv_results_UAR[feature].append(recall)
            cv_results_UAP[feature].append(precision)
        assert len(cv_results_UAR[feature]) == len(
            cv_results_UAP[feature]) == 10
    # NOTE(review): appending two JSON objects back-to-back yields a file
    # json.load cannot parse as a whole — confirm how readers consume it
    with open(os.path.join('results', 'cross-validation.json'), 'a+',
              encoding='utf-8') as outfile:
        json.dump(cv_results_UAR, outfile)
        json.dump(cv_results_UAP, outfile)
def FUSION(self):
    """Early-fuse audio-visual (DDAE) and textual (doc2vec) embeddings.

    For every (AV, T) model pair: concatenates the two embedding matrices,
    ranks the fused features with a RandomForestClassifier, keeps the top
    100, and saves X_train_tree / X_dev_tree under
    pre-trained/fusion/<feature_name>/. Pairs already processed (both tree
    files on disk) are only loaded, not recomputed.
    """
    print(
        "\nrunning early fusion strategy on audio-visual-textual modalities"
    )
    model_path_AV = smart_open('./pre-trained/DDAE/model_list.txt',
                               'rb',
                               encoding='utf-8')
    model_path_T = smart_open('./pre-trained/doc2vec/model_list.txt',
                              'rb',
                              encoding='utf-8')
    model_list_AV = []
    model_list_T = []
    for _, line_AV in enumerate(model_path_AV):
        line_AV = str(line_AV).replace('\n', '')
        model_list_AV.append(line_AV)
    for _, line_T in enumerate(model_path_T):
        line_T = str(line_T).replace('\n', '')
        model_list_T.append(line_T)
    _, _, y_dev, y_train = load_label()
    y_train = y_train.astype('int')
    y_dev = y_dev.astype('int')
    for AV in model_list_AV:
        for T in model_list_T:
            # feature name combines the AV model id and the doc2vec model id
            feature_name = AV[19:-2] + T[22:]
            train_tree_file = os.path.join('pre-trained', 'fusion',
                                           feature_name, 'X_train_tree.npy')
            dev_tree_file = os.path.join('pre-trained', 'fusion',
                                         feature_name, 'X_dev_tree.npy')
            # BUG FIX: the original checked X_train_tree.npy twice (never the
            # dev file) and loaded the *train* file into X_dev_tree
            if os.path.isfile(train_tree_file) and os.path.isfile(
                    dev_tree_file):
                X_train_tree = np.load(train_tree_file)
                X_dev_tree = np.load(dev_tree_file)
            else:
                # AV path encodes the model index in its last two characters
                X_train_AV = np.load(
                    os.path.join(AV[:-2],
                                 'X_train_tree_%d.npy' % int(AV[-2:])))
                X_dev_AV = np.load(
                    os.path.join(AV[:-2],
                                 'X_dev_tree_%d.npy' % int(AV[-2:])))
                X_train_txt = np.load(os.path.join(T, 'vectors_train.npy'))
                X_dev_txt = np.load(os.path.join(T, 'vectors_dev.npy'))
                # both modalities must cover the same sessions
                assert X_train_AV.shape[0] == X_train_txt.shape[0] == len(
                    y_train)
                assert X_dev_AV.shape[0] == X_dev_txt.shape[0] == len(
                    y_dev)
                # early fusion = feature concatenation
                X_train = np.hstack((X_train_AV, X_train_txt))
                X_dev = np.hstack((X_dev_AV, X_dev_txt))
                os.mkdir(
                    os.path.join('pre-trained', 'fusion', feature_name))
                np.save(
                    os.path.join('pre-trained', 'fusion', feature_name,
                                 'X_train'), X_train)
                np.save(
                    os.path.join('pre-trained', 'fusion', feature_name,
                                 'X_dev'), X_dev)
                from sklearn.ensemble import RandomForestClassifier
                # rank fused features by RF importance on train+dev pooled
                model = RandomForestClassifier(n_estimators=800,
                                               criterion='entropy')
                df = pd.DataFrame(np.vstack((X_train, X_dev)))
                feature_names = [
                    'feature_%d' % i for i in range(len(X_train[0]))
                ]
                df.columns = feature_names
                y = np.hstack((y_train, y_dev))
                model.fit(df, y)
                importances = model.feature_importances_
                print("\nfeature importance ranking")
                indices = np.argsort(importances)[::-1]
                for f in range(100):
                    print("%d. feature %d %s (%f)" %
                          (f + 1, indices[f], feature_names[indices[f]],
                           importances[indices[f]]))
                # keep only the 100 most important fused features
                indices = indices[:100]
                np.save(
                    os.path.join('pre-trained', 'fusion', feature_name,
                                 'feature_list'), indices)
                X_train_df = pd.DataFrame(X_train)
                X_train_df.columns = [
                    'feature_%d' % i for i in range(len(X_train[0]))
                ]
                X_train_tree = X_train_df.iloc[:, indices]
                X_dev_df = pd.DataFrame(X_dev)
                X_dev_df.columns = [
                    'feature_%d' % i for i in range(len(X_dev[0]))
                ]
                X_dev_tree = X_dev_df.iloc[:, indices]
                np.save(
                    os.path.join('pre-trained', 'fusion', feature_name,
                                 'X_train_tree'), X_train_tree)
                np.save(
                    os.path.join('pre-trained', 'fusion', feature_name,
                                 'X_dev_tree'), X_dev_tree)
def preprocess_BOXW(verbose=False):
    """preprocess Bags of X Words representations

    Aligns BoAW (audio) and BoVW (video) frame counts per session by
    truncating to the shorter one, then writes concatenated feature,
    label and instance files for the train and dev partitions.
    # para verbose: bool
        whether or not to print per-session shapes
    """
    # load directory from configuration file
    # NOTE(review): A_input_dir / V_input_dir are never used below — confirm
    A_input_dir = data_config['data_path_local']['baseline']['BoAW']
    V_input_dir = data_config['data_path_local']['baseline']['BoVW']
    A_output_dir = data_config['baseline_preproc']['BoAW']
    V_output_dir = data_config['baseline_preproc']['BoVW']
    # load length (number of sessions per partition) from configuration file
    length = dict()
    length['train'] = data_config['length']['train']
    length['dev'] = data_config['length']['dev']
    length['test'] = data_config['length']['test']
    # load labels from configuration file
    _, _, level_dev, level_train = load_label()
    label_train, label_dev = level_train.values, level_dev.values
    labels = dict()
    # second column holds the mania level per session
    labels['train'] = label_train[:, 1]
    labels['dev'] = label_dev[:, 1]
    for partition in ['train', 'dev']:
        # write handles for labels and instance ids of both modalities
        # NOTE(review): files are opened in append mode and never removed
        # first — re-running would duplicate content; confirm callers clean up
        A_label_f = smart_open(A_output_dir['%s_label' % partition],
                               'a+',
                               encoding='utf-8')
        V_label_f = smart_open(V_output_dir['%s_label' % partition],
                               'a+',
                               encoding='utf-8')
        A_inst_f = smart_open(A_output_dir['%s_inst' % partition],
                              'a+',
                              encoding='utf-8')
        V_inst_f = smart_open(V_output_dir['%s_inst' % partition],
                              'a+',
                              encoding='utf-8')
        A_data, V_data = None, None
        label = labels[partition]
        for i in range(length[partition]):
            # session ids are 1-based
            A_feature = load_baseline_feature('BoAW', partition, (i + 1))
            V_feature = load_baseline_feature('BoVW', partition, (i + 1))
            A_t, _ = A_feature.shape
            V_t, _ = V_feature.shape
            # ensure timesteps match between Audio and Video (truncate to min)
            timestep = A_t if A_t < V_t else V_t
            # drop the first two columns (metadata), keep feature values
            A_feature = A_feature.iloc[:timestep, 2:]
            V_feature = V_feature.iloc[:timestep, 2:]
            # concatenate features across sessions
            A_data = A_feature.copy() if not i else pd.concat(
                [A_data, A_feature])
            V_data = V_feature.copy() if not i else pd.concat(
                [V_data, V_feature])
            # write labels and instances, repeated once per kept frame
            A_label_f.write(('%d,' % label[i]) * timestep)
            V_label_f.write(('%d,' % label[i]) * timestep)
            A_inst_f.write(('%d,' % (i + 1)) * timestep)
            V_inst_f.write(('%d,' % (i + 1)) * timestep)
            if verbose:
                print(A_feature.shape, V_feature.shape)
                print(A_data.shape, V_data.shape)
        # save concatenated features to external files
        A_data.to_csv(A_output_dir['%s_data' % partition],
                      header=None,
                      index=None)
        V_data.to_csv(V_output_dir['%s_data' % partition],
                      header=None,
                      index=None)
        A_label_f.close()
        V_label_f.close()
        A_inst_f.close()
        V_inst_f.close()
def preproc_baseline_feature(feature_name, verbose=False):
    """pre-process the baseline features (LLDs)

    Subsamples frame-level features (one in every `keep` frames), drops
    frames containing NaN, and writes data/label/instance files for the
    train and dev partitions.
    # para feature_name: str
        which baseline feature set to pre-process
    # para verbose: bool
        whether or not to output more results
    """
    no_train = data_config['train_len']
    no_dev = data_config['dev_len']
    # keep one instance in every `keep` instances (frame subsampling)
    keep = data_config['keepinstance']

    def remove_if_exist(filename):
        # outputs are opened in append mode, so start from a clean slate
        if os.path.isfile(filename):
            os.remove(filename)

    # load output filenames
    train_data = data_config['baseline_preproc'][feature_name]['train_data']
    train_label = data_config['baseline_preproc'][feature_name]['train_label']
    train_inst = data_config['baseline_preproc'][feature_name]['train_inst']
    dev_data = data_config['baseline_preproc'][feature_name]['dev_data']
    dev_label = data_config['baseline_preproc'][feature_name]['dev_label']
    dev_inst = data_config['baseline_preproc'][feature_name]['dev_inst']
    # remove file if exists
    remove_if_exist(train_data)
    remove_if_exist(train_label)
    remove_if_exist(train_inst)
    remove_if_exist(dev_data)
    remove_if_exist(dev_label)
    remove_if_exist(dev_inst)
    # load the labels
    # NOTE(review): this unpack order (ymrs_train, ymrs_dev, ...) differs
    # from other call sites that use (ymrs_dev, ymrs_train, ...) — confirm
    ymrs_train, ymrs_dev, level_dev, level_train = load_label()
    for partition in ['train', 'dev']:
        index_range = no_train if partition == 'train' else no_dev
        if verbose:
            print("\n----preprocessing on %s, dataset %s----" %
                  (feature_name, partition))
        if partition == 'train':
            data_loc, label_loc, inst_loc = train_data, train_label, train_inst
        else:
            data_loc, label_loc, inst_loc = dev_data, dev_label, dev_inst
        dataf = smart_open(data_loc, 'a+', encoding='utf-8')
        labelf = smart_open(label_loc, 'a+', encoding='utf-8')
        instf = smart_open(inst_loc, 'a+', encoding='utf-8')
        # BUG FIX: count_nan was re-initialized inside the per-sample loop,
        # so the summary below only reported the last sample's NaN count;
        # accumulate it over the whole partition instead
        count_nan = 0
        for id in range(1, index_range + 1):
            sample = get_sample(partition, id)
            # look up YMRS score and mania level for this session
            if partition == 'train':
                ymrs_sample = ymrs_train[ymrs_train.Instance_name ==
                                         sample].iat[0, 1]
                level_sample = level_train[level_train.Instance_name ==
                                           sample].iat[0, 1]
            else:
                ymrs_sample = ymrs_dev[ymrs_dev.Instance_name ==
                                       sample].iat[0, 1]
                level_sample = level_dev[level_dev.Instance_name ==
                                         sample].iat[0, 1]
            if verbose:
                print("YMRS score for %s is %d" % (sample, ymrs_sample))
                print("Mania level for %s is %d" % (sample, level_sample))
            feat = load_baseline_feature(feature_name, partition, id)
            no_frame, _ = feat.shape
            for i in range(0, no_frame, keep):
                if verbose:
                    print("\n----processing no. %d frame----" % i)
                data = feat.iloc[i, :]
                data = data[1:]  # remove name
                # skip frames with any NaN value
                if data.isnull().values.any():
                    print("----NAN, DROP FEATURE----")
                    count_nan += 1
                    continue
                data_str = data.to_string(header=False, index=False)
                data_str = data_str.replace("\n", ",").replace(" ", "")
                # write baseline features to external file
                dataf.write(data_str)
                dataf.write("\n")
                # write baseline labels and instance to external file
                if id == 1 and i == 0:
                    labelf.write("%d" % level_sample)
                    instf.write("%d" % id)
                else:
                    labelf.write(",%d" % level_sample)
                    instf.write(",%d" % id)
            if verbose:
                print("\n----next feature----")
        if verbose:
            print("\n----%s partition done----" % partition)
            print("\n----ALL NAN DROPPED %d----" % count_nan)
        # close file handles
        dataf.close()
        labelf.close()
        instf.close()