def _set_probabilities(self, file):
    """
    Set P(s_k|m_j) and the prior P(s_k) from training

    :param file: path to P(s_k|m_j) file (kaldi-format, must contain the key 'p_s_m')
    """
    # TODO hard coded for getting class counts --> make sure that the file class.counts exists
    # TODO and contains the key class_counts

    # Set P(s_k|m_j)
    for key, mat in kaldi_io.read_mat_ark(file):
        if key == 'p_s_m':
            print('Setting P(s_k|m_j)')
            self.cond_prob = np.transpose(mat)  # we transpose for the later dot product
            # print(np.sum(self.cond_prob, axis=1))
            # print(np.shape(np.sum(self.cond_prob, axis=1)))
        else:
            print('No probability found')

    # Set prior P(s_k)
    for key, mat in kaldi_io.read_mat_ark('../class.counts'):
        if key == 'class_counts':
            print('Setting Prior')
            self.prior = mat / np.sum(mat)
        else:
            print('No Prior found')
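# Hedged usage sketch (illustrative, not part of the original class): one common way to
# combine a stored posterior table P(s_k|m_j) with a label prior P(s_k) is the hybrid-ASR
# style conversion of a posterior into a scaled likelihood by dividing by the prior. The
# orientation of `cond_prob` ([num_labels, num_codewords] after the transpose above) and
# the function/argument names below are assumptions.
def _example_scaled_likelihood(cond_prob, prior, p_m):
    import numpy as np
    p_s = np.dot(cond_prob, p_m)   # P(s_k) = sum_j P(s_k|m_j) * P(m_j)
    p_s /= np.sum(p_s)             # renormalize
    return p_s / (prior + 1e-12)   # posterior / prior ~ scaled likelihood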
def create_dataset(self, nj, frac, path_data, output_folder):
    """
    Sample a fraction of every feature matrix in the kaldi split folder and write the
    concatenated result as a single kaldi archive under the key 'data'.

    :param nj: number of jobs (how the dataset is split in kaldi)
    :param frac: fraction of frames to sample from every matrix
    :param path_data: path to the split data folder
    :param output_folder: folder to write dataset.mat to
    """
    dataset = DataIterator(nj, path_data)

    data = []
    misc = Misc()  # only needed for the (commented-out) phoneme transformation below
    while True:
        try:
            data_path = dataset.next_file()
            print(data_path)
            for key, mat in kaldi_io.read_mat_ark(data_path):
                df_mat = pd.DataFrame(mat)
                np_mat = df_mat.sample(frac=frac).values
                # np_mat[:, 39] = misc.trans_vec_to_phones(np_mat[:, 39])
                data.append(np_mat)
        except StopIteration:
            data_sample = np.concatenate(data)
            print(data_sample.shape)
            data_dict = {'data': data_sample}
            with open(output_folder + '/dataset.mat', 'wb') as f:
                for key, mat in list(data_dict.items()):
                    kaldi_io.write_mat(f, mat.astype(np.float32, copy=False), key=key)
            break
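# Hedged sketch (illustrative): reading the archive written by create_dataset back into
# memory. The path is a placeholder; the key 'data' matches the key used above.
def _example_load_sampled_dataset(path):
    import kaldi_io
    for key, mat in kaldi_io.read_mat_ark(path):
        if key == 'data':
            return mat
    return None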
def create_codebook(self, nj, data_folder):
    # create keys for enumeration
    if self._multiple:
        keys = ['energy', 'raw', 'delta', 'dd']
    else:
        keys = ['simple']

    # init one MiniBatchKMeans per key (energy, raw, delta and delta-delta features)
    dict_kmeans = {}
    for key in keys:
        dict_kmeans[key] = MiniBatchKMeans(n_clusters=self._num_cluster, init='random',
                                           batch_size=200, verbose=1,
                                           reassignment_ratio=0.001,
                                           max_no_improvement=100,
                                           n_init=self._num_cluster)

    # create data iterator
    dataset = DataIterator(nj, data_folder)

    # iterate over the split files and fit k-means in mini-batches
    df = pd.DataFrame()
    while True:
        try:
            data_path = dataset.next_file()
            print(data_path)
            for key, mat in kaldi_io.read_mat_ark(data_path):
                tmp_df = pd.DataFrame(mat)
                df = df.append(tmp_df.sample(int(tmp_df.shape[0] * 1.0)))
                if df.shape[0] > 1000:
                    # partial fit of k-means for every feature group
                    if self._multiple:
                        dict_kmeans['energy'].partial_fit(whiten(df.values[:, [0, 13, 26]]))
                        dict_kmeans['raw'].partial_fit(whiten(df.values[:, range(1, 13, 1)]))
                        dict_kmeans['delta'].partial_fit(whiten(df.values[:, range(14, 26, 1)]))
                        dict_kmeans['dd'].partial_fit(whiten(df.values[:, range(27, 39, 1)]))
                    else:
                        if self._whitening:
                            dict_kmeans['simple'].partial_fit(whiten(df.values))
                        else:
                            dict_kmeans['simple'].partial_fit(df.values)
                    self._dict_codebook = dict_kmeans
                    df = pd.DataFrame()  # clean up
        except StopIteration:
            break
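# Hedged sketch (illustrative): the fitted codebook(s) live in self._dict_codebook as
# MiniBatchKMeans objects, so the actual centroids are available via `cluster_centers_`.
# For the single-codebook case this yields a [num_cluster, 39] matrix; the function and
# argument names here are assumptions about how the class is used.
def _example_codebook_matrix(dict_codebook):
    return dict_codebook['simple'].cluster_centers_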
def _set_global_stats(self, file):
    """
    Set the global mean and variance in the class

    :param file: path to stats file (kaldi-format, must contain the keys 'mean' and 'std')
    :return:
    """
    for key, mat in kaldi_io.read_mat_ark(file):
        if key == 'mean':
            print('Setting mean')
            self.global_mean = np.transpose(mat)
        elif key == 'std':
            print('Setting var')
            self.global_var = np.transpose(mat)
        else:
            print('No mean or var set!!!')
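# Hedged sketch (illustrative): assuming global_mean and global_var are broadcastable
# against a [frames, dims] feature matrix, a global mean/variance normalization could
# look like the following (if the 'std' key actually stores a standard deviation rather
# than a variance, drop the square root).
def _example_global_normalize(mat, global_mean, global_var):
    import numpy as np
    return (mat - global_mean) / np.sqrt(global_var + 1e-8)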
def load_codebook(self, path):
    """
    Load a codebook from a kaldi archive

    :param path: path to the codebook file
    """
    if not self._kaldi_formatting:
        raise TypeError('Codebook can only be loaded in kaldi format')
    for key, mat in kaldi_io.read_mat_ark(path):
        self.codebook = mat
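# Hedged sketch (illustrative): with the codebook loaded, frames can be quantized with
# scipy's vq, mirroring what vq_data() does for the 'simple' case (whitening omitted).
def _example_quantize_frames(frames, codebook):
    from scipy.cluster.vq import vq
    labels, _ = vq(frames, codebook)
    return labels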
def _load_dataset(self, path):
    for key, mat in kaldi_io.read_mat_ark(path):
        self._dataset = mat
def _load_weights(self, path):
    for key, mat in kaldi_io.read_mat_ark(path):
        self._weights = mat
def vq_data(self, nj, data_folder, output_folder):
    # VQ the training data
    assert self.codebook.shape[0] > 0
    print('VQing training data...')

    dataset = DataIterator(nj, data_folder)

    dict_vq, dict_indicies = {}, {}
    if self._multiple:
        keys = ['energy', 'raw', 'delta', 'dd']
        dict_indicies = {
            'energy': [0, 13, 26],
            'raw': range(1, 13, 1),
            'delta': range(14, 26, 1),
            'dd': range(27, 39, 1)
        }
    else:
        keys = ['simple']
        dict_indicies = {'simple': range(0, 39)}

    for key in keys:
        dict_vq[key] = self.codebook[:, dict_indicies[key]]

    tmp_dict = {}
    labels_all = []
    phoneme_all = []
    count = 1
    while True:
        try:
            data_path = dataset.next_file()
            print("Data path is in ", data_path)
            for key, mat in kaldi_io.read_mat_ark(data_path):
                if self._multiple:
                    # get one label stream from every codebook
                    df = pd.DataFrame(vq(whiten(mat[:, dict_indicies['energy']]),
                                         dict_vq['energy'])[0][:, np.newaxis])
                    df = pd.concat([df, pd.DataFrame(vq(whiten(mat[:, dict_indicies['raw']]),
                                                        dict_vq['raw'])[0][:, np.newaxis])], axis=1)
                    df = pd.concat([df, pd.DataFrame(vq(whiten(mat[:, dict_indicies['delta']]),
                                                        dict_vq['delta'])[0][:, np.newaxis])], axis=1)
                    df = pd.concat([df, pd.DataFrame(vq(whiten(mat[:, dict_indicies['dd']]),
                                                        dict_vq['dd'])[0][:, np.newaxis])], axis=1)
                else:
                    if self._whitening:
                        df = pd.DataFrame(vq(whiten(mat[:, :39]),
                                             dict_vq['simple'])[0][:, np.newaxis])
                    else:
                        df = pd.DataFrame(vq(mat[:, :39],
                                             dict_vq['simple'])[0][:, np.newaxis])
                    labels_all.append(df.values)

                if np.shape(mat)[1] > 39:
                    phoneme_all.append(mat[:, 39])

                # add to tmp_dict for later saving
                tmp_dict[key] = df

            # ordered dict
            od = collections.OrderedDict(sorted(tmp_dict.items()))

            # save label-stream from vq
            with open(output_folder + '/feats_vq_' + str(count), 'wb') as f:
                for key, mat in list(od.items()):
                    kaldi_io.write_mat(f, mat.values.astype(np.float32, copy=False), key=key)
            tmp_dict = {}
            count += 1
        except StopIteration:
            # calc MI
            if False:
                misc = Misc()
                labels_all = np.concatenate(labels_all)
                # labels_all = np.reshape(labels_all, [np.shape(labels_all)[0] * np.shape(labels_all)[1]],
                #                         np.shape(labels_all)[2])
                phoneme_all = np.concatenate(phoneme_all)
                # phoneme_all = np.reshape(phoneme_all, [np.shape(phoneme_all)[0] * np.shape(phoneme_all)[1]],
                #                          np.shape(phoneme_all)[2])
                print(misc.calculate_mi(labels_all, phoneme_all))
            break
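# Hedged sketch (illustrative): reading one of the VQ label archives written by vq_data.
# Each matrix holds one label column per feature group (four columns in the 'multiple'
# case, one otherwise); the path is a placeholder.
def _example_read_vq_labels(path):
    import kaldi_io
    return {key: mat for key, mat in kaldi_io.read_mat_ark(path)}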
def do_inference(self, nj, input_folder, output_folder):
    """
    Does the inference of the model

    :param nj: number of jobs (how the dataset is split in kaldi)
    :param input_folder: path to the data folder to do the inference
    :param output_folder: path to save the output of the inference
    """
    # create DataIterator to iterate through the split folder created by kaldi
    dataset = DataIterator(nj, input_folder, splice=self._splice, cmvn=self._cmvn)
    dim = self._dim * (2 * self._splice + 1)

    # number iterator for counting, necessary for writing the matrices later
    iterator = iter([i for i in range(1, dataset.get_size() + 1)])

    features_all = {}
    phoneme_all = {}
    inferenced_data = {}  # storing the inferenced data
    check_data = {}
    output_all = {}
    while True:
        try:
            data_path = dataset.next_file()  # get path to data
            # print(data_path)
            # iterate through data
            for key, mat in kaldi_io.read_mat_ark(data_path):
                tmp = self._do_single_inference(mat[:, :dim])  # run inference once per batch and reuse the result
                inferenced_data[key] = tmp
                # check_data[key] = [np.argmax(tmp[0], axis=1), np.argmax(tmp[1], axis=1),
                #                    np.argmax(tmp[2], axis=1), self._dev_alignments[key]]
                if np.shape(mat)[1] > dim:
                    # get statistics for mi (only if we input data + labels), for debugging
                    phoneme_all[key] = mat[:, dim]
                # add for debugging, see below
                output_all[key] = tmp

            od = collections.OrderedDict(sorted(inferenced_data.items()))

            # write posteriors (inferenced data) to files
            with open(output_folder + '/feats_vq_' + str(next(iterator)), 'wb') as f:
                for key, mat in list(od.items()):
                    if self.transform:
                        kaldi_io.write_mat(f, mat, key=key)
                    else:
                        kaldi_io.write_mat(f, mat[:, np.newaxis], key=key)
            inferenced_data = {}  # reset dict

        except StopIteration:
            # debugging (commented out): compared the frame-level predictions of the
            # vanilla, VQ and combined models against the dev alignments in check_data,
            # accumulated per-phoneme right/wrong counts and plotted them as
            # histograms / bar charts

            if False:
                # disabled MI evaluation over the whole inference set
                misc = Misc()
                features_all = np.concatenate(features_all)
                phoneme_all = np.expand_dims(np.concatenate(phoneme_all), 1)
                phoneme_all = misc.trans_vec_to_phones(phoneme_all)
                # print(misc.calculate_mi(features_all, phoneme_all))
                mi, test_py, test_pw, test_pyw = self._session.run(
                    ["mutual_info:0", "p_y:0", "p_w:0", "p_yw:0"],
                    feed_dict={
                        "is_train:0": False,
                        "ph_features:0": features_all,
                        "ph_labels:0": phoneme_all
                    })
                print(mi)

                tmp_pywtest = pd.DataFrame(test_py)
                tmp_pywtest.to_csv('py_inf.txt', header=False, index=False)
                tmp_pywtest = pd.DataFrame(test_pw)
                tmp_pywtest.to_csv('pw_inf.txt', header=False, index=False)
                tmp_pywtest = pd.DataFrame(test_pyw)
                tmp_pywtest.to_csv('pwy_inf.txt', header=False, index=False)
            break
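# Hedged usage sketch (illustrative): a typical call order for these methods on one
# instance of the (unnamed) class; `model`, the paths and nj are placeholders.
#
#   model.create_codebook(nj=20, data_folder='data/train_split')
#   model.vq_data(nj=20, data_folder='data/train_split', output_folder='output/train_vq')
#   model.do_inference(nj=20, input_folder='data/dev_split', output_folder='output/dev_vq')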