def _crop_or_pad_frames(feats, batch_size, max_frames=750):
    """Copy ``feats`` into a zero tensor with exactly ``max_frames`` frames.

    Args:
        feats: 3-D tensor; axis 1 is cropped/padded, axis 2 is kept as is.
            (presumably batch x frames x feature_dim -- confirm against
            ``sort_pad``)
        batch_size: size of axis 0 of the returned tensor.
        max_frames: size of axis 1 of the returned tensor.

    Returns:
        A new CPU float tensor of shape ``(batch_size, max_frames,
        feats.size(2))``; frames beyond ``max_frames`` are dropped and
        missing frames are left at zero.
    """
    n_frames = min(feats.size(1), max_frames)
    fixed = torch.zeros((batch_size, max_frames, feats.size(2)))
    # One slice assignment handles the whole batch; no per-sample loop needed.
    fixed.data[:, :n_frames, :] = feats.data[:, :n_frames, :]
    return fixed


def train_loop(model, optimizer, train_set, scheduler=None):
    """Run one pass over ``train_set`` in minibatches of ``hp.batch_size``.

    Each entry of ``train_set`` is one text line. With ``hp.ASR`` it is
    ``"<feature_file> <label ids>"``; otherwise it is tab-separated
    ``<feature_file>\\t<label ids>\\t<emotion>`` with a fourth
    ``<distribution>`` field when ``hp.dist`` is set. Trailing entries that do
    not fill a whole minibatch are ignored.

    The forward call and the loss depend on which of ``hp.ASR``,
    ``hp.baseline``, ``hp.text_based``, ``hp.combined``, ``hp.combined_ASR``
    or ``hp.ASR_based`` is set (checked in that order). If none is set,
    ``loss`` is never assigned and the update step fails with ``NameError``.
    The loss is only accumulated when ``hp.decoder_type == 'Attention'``;
    with any other decoder type ``loss`` stays a Python float and
    ``loss.item()`` fails.

    Args:
        model: callable network; its call signature differs per mode.
        optimizer: optimizer stepped once per minibatch.
        train_set: indexable sequence of script lines (see above).
        scheduler: optional LR scheduler, stepped once before the pass.

    Raises:
        ValueError: if a feature file is neither ``.htk``, ``.npy`` nor
            ``.wav``.
    """
    num_mb = len(train_set) // hp.batch_size
    if scheduler:
        # NOTE: ``epoch`` is not a parameter; it has to exist at module level.
        scheduler.step(epoch)

    needs_fixed_length = hp.baseline_type in ('CNN_BLSTM', 'lim_BLSTM')

    for mb in range(num_mb):
        xs = []             # input features, one array per utterance
        ts = []             # target symbol ids
        ts_onehot = []      # one-hot targets
        ts_onehot_LS = []   # label-smoothed one-hot targets
        emo = []            # emotion labels
        emo_onehot = []
        emo_onehot_LS = []
        lengths = []        # input lengths after stacking
        ts_lengths = []     # target lengths
        temp = []           # features before frame stacking
        temp_length = []

        for j in range(hp.batch_size):
            s = train_set[mb * hp.batch_size + j].strip()
            if hp.ASR:
                x_file, laborg = s.split(' ', 1)
            elif hp.dist:
                x_file, laborg, labemo, labdist = s.split('\t')
                laborg = laborg.strip()
                labemo = labemo.strip()
                labdist = labdist.strip()
            else:
                x_file, laborg, labemo = s.split('\t')
                laborg = laborg.strip()
                labemo = labemo.strip()

            if '.htk' in x_file:
                cpudat = load_dat(x_file)
                cpudat = cpudat[:, :hp.lmfb_dim]
            elif '.npy' in x_file:
                cpudat = np.load(x_file)
            elif '.wav' in x_file:
                with wave.open(x_file) as wf:
                    dat = wf.readframes(wf.getnframes())
                # frombuffer replaces the deprecated binary mode of fromstring;
                # astype() below makes the writable float copy.
                y = np.frombuffer(dat, dtype=np.int16)[:, np.newaxis]
                y_float = y.astype(np.float32)
                cpudat = (y_float - np.mean(y_float)) / np.std(y_float)
            else:
                # Without this, the previous utterance's features would be
                # silently reused for this entry.
                raise ValueError(
                    'unsupported feature file: {}'.format(x_file))

            # Keep an unstacked copy; it is what the lim_BLSTM path consumes.
            tmp = copy.deepcopy(cpudat)
            print("{} {}".format(x_file, cpudat.shape[0]))

            if hp.frame_stacking > 1 and hp.encoder_type != 'Wave':
                cpudat, _ = frame_stacking(cpudat, hp.frame_stacking)
            if hp.encoder_type == 'CNN':
                newlen = cpudat.shape[0]
                cpudat_split = np.split(cpudat, 3, axis=1)
                cpudat = np.hstack((cpudat_split[0].reshape(newlen, 1, 80),
                                    cpudat_split[1].reshape(newlen, 1, 80),
                                    cpudat_split[2].reshape(newlen, 1, 80)))

            lengths.append(cpudat.shape[0])
            xs.append(cpudat)
            temp.append(tmp)
            temp_length.append(tmp.shape[0])

            cpulab = np.array([int(tok) for tok in laborg.split(' ')],
                              dtype=np.int32)
            cpulab_onehot = onehot(cpulab, hp.num_classes)
            ts.append(cpulab)
            ts_lengths.append(len(cpulab))
            ts_onehot.append(cpulab_onehot)
            ts_onehot_LS.append(0.9 * cpulab_onehot + 0.1 * 1.0 / hp.num_classes)

            if not hp.ASR:
                # labemo is iterated character by character.
                cpuemo = np.array([int(ch) for ch in labemo], dtype=np.int32)
                if hp.dist:
                    emotion_onehot = onehot_dist(labdist, hp.num_emotion)
                else:
                    emotion_onehot = onehot(cpuemo, hp.num_emotion)
                emo_onehot.append(emotion_onehot)
                emo_onehot_LS.append(0.9 * emotion_onehot + 0.1 * 1.0 / hp.num_emotion)
                emo.append(cpuemo)

        if hp.baseline_type != 'lim_BLSTM':
            # Every model except lim_BLSTM works on the stacked features.
            temp, temp_length = xs, lengths

        if hp.ASR:
            xs, lengths, ts, ts_onehot, ts_onehot_LS, ts_lengths = sort_pad(
                hp.batch_size, xs, lengths, ts, ts_onehot, ts_onehot_LS,
                ts_lengths)
            youtput_in_Variable = model(xs, lengths, ts_onehot, [], [])

            loss = 0.0
            if hp.decoder_type == 'Attention':
                for k in range(hp.batch_size):
                    num_labels = ts_lengths[k]
                    loss += label_smoothing_loss(
                        youtput_in_Variable[k][:num_labels],
                        ts_onehot_LS[k][:num_labels], 1) / num_labels
            print('loss = {}'.format(loss.item()))

        elif hp.baseline:
            xs, lengths, ts, ts_onehot, ts_onehot_LS, ts_lengths, emo, emo_onehot, emo_onehot_LS, temp = sort_pad(
                hp.batch_size, xs, lengths, ts, ts_onehot, ts_onehot_LS,
                ts_lengths, emo, emo_onehot, emo_onehot_LS, temp, temp_length)
            if needs_fixed_length:
                xs_new = _crop_or_pad_frames(temp, hp.batch_size)
                emotion_in_Variable = model(xs_new.to(DEVICE), [])
            else:
                emotion_in_Variable = model(xs, lengths)

            loss = 0.0
            if hp.decoder_type == 'Attention':
                loss += F.cross_entropy(
                    emotion_in_Variable[:, :hp.num_emotion], emo.to(DEVICE))
            print('loss = {}'.format(loss.item()))

        elif hp.text_based:
            xs, lengths, ts, ts_onehot, ts_onehot_LS, ts_lengths, emo, emo_onehot, emo_onehot_LS, temp = sort_pad(
                hp.batch_size, xs, lengths, ts, ts_onehot, ts_onehot_LS,
                ts_lengths, emo, emo_onehot, emo_onehot_LS, temp, temp_length)
            emotion_in_Variable = model(ts.to(DEVICE), ts_lengths.to(DEVICE))

            loss = 0.0
            if hp.decoder_type == 'Attention':
                loss += F.cross_entropy(
                    emotion_in_Variable[:, :hp.num_emotion], emo.to(DEVICE))
            print('loss = {}'.format(loss.item()))

        elif hp.combined:
            xs, lengths, ts, ts_onehot, ts_onehot_LS, ts_lengths, emo, emo_onehot, emo_onehot_LS, temp = sort_pad(
                hp.batch_size, xs, lengths, ts, ts_onehot, ts_onehot_LS,
                ts_lengths, emo, emo_onehot, emo_onehot_LS, temp, temp_length)
            if needs_fixed_length:
                xs_new = _crop_or_pad_frames(temp, hp.batch_size)
                emotion_in_Variable = model(
                    xs_new.to(DEVICE), [], ts.to(DEVICE), ts_lengths.to(DEVICE))
            else:
                emotion_in_Variable = model(
                    xs.to(DEVICE), lengths, ts.to(DEVICE), ts_lengths.to(DEVICE))

            loss = 0.0
            if hp.decoder_type == 'Attention':
                loss += F.cross_entropy(
                    emotion_in_Variable[:, :hp.num_emotion], emo.to(DEVICE))
            print('loss = {}'.format(loss.item()))

        elif hp.combined_ASR or hp.ASR_based:
            xs, lengths, ts, ts_onehot, ts_onehot_LS, ts_lengths, emo, emo_onehot, emo_onehot_LS, temp = sort_pad(
                hp.batch_size, xs, lengths, ts, ts_onehot, ts_onehot_LS,
                ts_lengths, emo, emo_onehot, emo_onehot_LS, temp, temp_length)
            if needs_fixed_length:
                xs_new = _crop_or_pad_frames(temp, hp.batch_size)
                youtput_in_Variable, emotion_in_Variable = model(
                    xs, lengths, ts_onehot, emo_onehot, xs_new.to(DEVICE))
            else:
                youtput_in_Variable, emotion_in_Variable = model(
                    xs, lengths, ts_onehot, emo_onehot, [])

            loss = 0.0
            if hp.decoder_type == 'Attention':
                # Joint objective: 0.8 * emotion CE + 0.2 * transcript loss.
                loss += F.cross_entropy(
                    emotion_in_Variable[:, :hp.num_emotion], emo.to(DEVICE)) * 0.8
                print(loss)
                for k in range(hp.batch_size):
                    num_labels = ts_lengths[k]
                    loss += label_smoothing_loss(
                        youtput_in_Variable[k][:num_labels],
                        ts_onehot_LS[k][:num_labels], 1) / num_labels * 0.2
            print('loss = {}'.format(loss.item()))

        sys.stdout.flush()

        # backward
        optimizer.zero_grad()
        loss.backward()
        clip = 1.0
        torch.nn.utils.clip_grad_value_(model.parameters(), clip)
        # optimizer update
        optimizer.step()
        torch.cuda.empty_cache()
while line: if "Ses0" not in line: line = f.readline() continue x_file, _, sentence = line.split(' ',2) sentence = sentence.translate(table) temp = x_file if temp not in name_list: #print(temp) line = f.readline() continue else: ind_emo = name_list.index(temp) position = htkpos+x_file+".htk" cpudat = load_dat(position) cpudat = cpudat[:,:40] if cpudat.shape[0]>2000: line = f.readline() string_emotion = emotion_list[ind_emo] delete[int(string_emotion)] += 1 continue transcripts = [] transcripts = (htkpos+x_file+".htk\t") word_list = sentence.strip().split(' ') #print(word_list) i = 0 transcripts += "2 " for word in word_list: word = word.strip()
def test_loop(model, test_set):
    """Decode every entry of ``test_set`` one utterance at a time and print it.

    Entries use the same line format as in training: with ``hp.ASR`` a line is
    ``"<feature_file> <label ids>"``; otherwise it is tab-separated
    ``<feature_file>\\t<label ids>\\t<emotion>`` (plus a fourth field when
    ``hp.dist`` is set).

    Printed per utterance (tab-separated, newline-terminated):
      * always the feature file name first;
      * ``hp.ASR``: the decoded symbols (``"2 1"`` if decoding returned ``[]``);
      * ``hp.baseline`` / ``hp.combined`` / ``hp.text_based``: the reference
        emotion and the value returned by ``model.decode``;
      * otherwise: decoded symbols, reference emotion and decoded emotion.

    NOTE(review): ``xs_new`` is only built when ``hp.baseline_type`` is
    ``'CNN_BLSTM'`` or ``'lim_BLSTM'``; the non-ASR decode calls that use it
    fail for any other baseline type -- confirm this is intended.

    Args:
        model: network exposing ``decode``; its signature differs per mode.
        test_set: indexable sequence of script lines (see above).

    Raises:
        ValueError: if a feature file is neither ``.htk``, ``.npy`` nor
            ``.wav``.
    """
    batch_size = 1
    needs_fixed_length = hp.baseline_type in ('CNN_BLSTM', 'lim_BLSTM')

    for utt in range(len(test_set) // batch_size):
        xs = []             # input features, one array per utterance
        ts = []             # target symbol ids
        ts_onehot = []      # one-hot targets
        ts_onehot_LS = []   # label-smoothed one-hot targets
        emo = []            # emotion labels
        emo_onehot = []
        emo_onehot_LS = []
        lengths = []        # input lengths after stacking
        ts_lengths = []     # target lengths
        temp = []           # features before frame stacking
        temp_length = []

        for j in range(batch_size):
            s = test_set[utt * batch_size + j].strip()
            if hp.ASR:
                x_file, laborg = s.split(' ', 1)
                # ASR lines carry no emotion field; keep the name defined so
                # the result line below can still be terminated.
                labemo = ''
            elif hp.dist:
                x_file, laborg, labemo, labdist = s.split('\t')
                laborg = laborg.strip()
                labemo = labemo.strip()
                labdist = labdist.strip()
            else:
                x_file, laborg, labemo = s.split('\t')
                laborg = laborg.strip()
                labemo = labemo.strip()

            if '.htk' in x_file:
                cpudat = load_dat(x_file)
                cpudat = cpudat[:, :hp.lmfb_dim]
            elif '.npy' in x_file:
                cpudat = np.load(x_file)
            elif '.wav' in x_file:
                with wave.open(x_file) as wf:
                    dat = wf.readframes(wf.getnframes())
                # frombuffer replaces the deprecated binary mode of fromstring;
                # astype() below makes the writable float copy.
                y = np.frombuffer(dat, dtype=np.int16)[:, np.newaxis]
                y_float = y.astype(np.float32)
                cpudat = (y_float - np.mean(y_float)) / np.std(y_float)
            else:
                # Without this, the previous utterance's features would be
                # silently decoded again for this entry.
                raise ValueError(
                    'unsupported feature file: {}'.format(x_file))

            # Keep an unstacked copy; it is what the lim_BLSTM path consumes.
            tmp = copy.deepcopy(cpudat)
            print(x_file, end='\t')

            if hp.frame_stacking > 1 and hp.encoder_type != 'Wave':
                cpudat, _ = frame_stacking(cpudat, hp.frame_stacking)
            if hp.encoder_type == 'CNN':
                newlen = cpudat.shape[0]
                cpudat_split = np.split(cpudat, 3, axis=1)
                cpudat = np.hstack((cpudat_split[0].reshape(newlen, 1, 80),
                                    cpudat_split[1].reshape(newlen, 1, 80),
                                    cpudat_split[2].reshape(newlen, 1, 80)))

            lengths.append(cpudat.shape[0])
            xs.append(cpudat)
            temp.append(tmp)
            temp_length.append(tmp.shape[0])

            if not hp.ASR:
                # labemo is iterated character by character.
                cpuemo = np.array([int(ch) for ch in labemo], dtype=np.int32)
                emotion_onehot = onehot(cpuemo, hp.num_emotion)
                emo_onehot.append(emotion_onehot)
                emo_onehot_LS.append(0.9 * emotion_onehot + 0.1 * 1.0 / hp.num_emotion)
                emo.append(cpuemo)

            cpulab = np.array([int(tok) for tok in laborg.split(' ')],
                              dtype=np.int32)
            cpulab_onehot = onehot(cpulab, hp.num_classes)
            ts.append(cpulab)
            ts_lengths.append(len(cpulab))
            ts_onehot.append(cpulab_onehot)
            ts_onehot_LS.append(0.9 * cpulab_onehot + 0.1 * 1.0 / hp.num_classes)

        if hp.baseline_type != 'lim_BLSTM':
            # Every model except lim_BLSTM works on the stacked features.
            temp, temp_length = xs, lengths

        if hp.ASR:
            xs, lengths, temp = sort_pad(
                batch_size, xs, lengths, temp=temp, temp_length=temp_length)
        else:
            xs, lengths, ts, ts_onehot, ts_onehot_LS, ts_lengths, emo, emo_onehot, emo_onehot_LS, temp = sort_pad(
                batch_size, xs, lengths, ts, ts_onehot, ts_onehot_LS,
                ts_lengths, emo, emo_onehot, emo_onehot_LS, temp, temp_length)

        if needs_fixed_length:
            # Crop or zero-pad to exactly 750 frames in a single slice copy.
            n_frames = min(temp.size(1), 750)
            xs_new = torch.zeros((batch_size, 750, temp.size(2)))
            xs_new.data[:, :n_frames, :] = temp.data[:, :n_frames, :]

        if hp.ASR:
            results = model.decode(xs, lengths, [])
            for character in results:
                print(character, end=' ')
            if results == []:
                print("2 1", end=' ')
            print("\t", end='')
            print(labemo)
        elif hp.baseline or hp.combined or hp.text_based:
            if hp.baseline:
                emotion = model.decode(xs_new.to(DEVICE), [])
            elif hp.text_based:
                emotion = model.decode(ts.to(DEVICE), ts_lengths.to(DEVICE))
            elif hp.combined:
                emotion = model.decode(
                    xs_new.to(DEVICE), ts.to(DEVICE), ts_lengths.to(DEVICE))
            print(int(labemo.strip()), end='\t')
            print((emotion), end='\t')
            print()
        else:
            results, emotion = model.decode(xs, lengths, xs_new.to(DEVICE))
            for character in results:
                print(character, end=' ')
            if results == []:
                print("2 1", end=' ')
            print('\t', end='')
            print(int(labemo.strip()), end='\t')
            print(emotion, end='\t')
            print()

        # Flush once per utterance so results appear as they are decoded.
        sys.stdout.flush()
x = self.denseb(x) x = self.densec(x) return x def preprocess(x_batch, y_batch): x_batch = tf.cast(x_batch, dtype=tf.float32) / 255. - 0.5 y_batch = tf.cast(y_batch, dtype=tf.int32) return x_batch, y_batch batch_size = 32 epochs = 200 x_train, x_test, y_train, y_test = utils.load_dat() number_samples = y_test.shape[0] # print(x_train.dtype,x_test.dtype,y_train.dtype,y_test.dtype) # print(y_train.shape) train_set = tf.data.Dataset.from_tensor_slices((x_train, y_train)) train_set = train_set.map(preprocess).batch(batch_size) val_set = tf.data.Dataset.from_tensor_slices((x_test, y_test)) val_set = val_set.map(preprocess).batch(batch_size) optis = [ tf.keras.optimizers.Adam(lr=1e-3), tf.keras.optimizers.SGD(learning_rate=1e-3, momentum=0.9)
# Compute the per-dimension mean and variance of all feature frames listed in
# the IEMOCAP train and test scripts and save them as .npy files.
import numpy as np

from utils import load_dat

# Collect one array per utterance and concatenate once at the end; growing the
# array inside the loop would re-copy everything read so far on each step.
chunks = []

# Both list files are opened up front and closed automatically on exit.
with open("/n/work1/feng/data/scripts_4emotion/IEMOCAP_train.csv", "r") as train, \
        open("/n/work1/feng/data/scripts_4emotion/IEMOCAP_test.csv", "r") as test:
    # Each line is "<feature_file>\t<labels>\t<emotion>"; only the file is used.
    for line in train:
        x_file, laborg, emotion = line.strip().split("\t")
        chunks.append(load_dat(x_file))
    print(test)
    for line in test:
        x_file, laborg, emotion = line.strip().split("\t")
        chunks.append(load_dat(x_file))

# Stack all utterances along the frame axis (axis 0).
temp = np.concatenate(chunks, axis=0)
print(temp.shape)

mean = np.mean(temp, axis=0)
var = np.var(temp, axis=0)
np.save("/n/work1/feng/src/htk/mean.npy", mean)
np.save("/n/work1/feng/src/htk/var.npy", var)
print(mean.shape)
print(var.shape)