def utt_to_samples(self, args):
    '''
    Utt to samples of (feat_chunk, spk_id, sample_key).

    Will be run in a process pool so restrictions apply.
    '''
    result_queue, utt_info = args
    logging.debug(utt_info)

    # TODO: wrap into a function or something
    utt_key, utt_meta = utt_info

    # Load features and select voiced frames.
    feat_scp = utt_meta['feat']
    feat_mat = kaldiio.load_mat(feat_scp)
    num_frames_feat, feat_dim = feat_mat.shape
    vad_scp = utt_meta['vad']
    if vad_scp:
        vad_mat = kaldiio.load_mat(vad_scp)
        num_frames_vad = vad_mat.shape[0]
        logging.debug('feat_mat: %s, vad_mat: %s' %
                      (str(feat_mat.shape), str(vad_mat.shape)))
        if num_frames_feat != num_frames_vad:
            logging.debug('num_frames_feat != num_frames_vad: %d vs %d' %
                          (num_frames_feat, num_frames_vad))
            return None
        voiced_frames_index = np.where(vad_mat == 1)[0]
        logging.debug('voiced_frames_index: %s' %
                      (str(voiced_frames_index.shape)))
        feat_mat_voiced = feat_mat[voiced_frames_index, :]
    else:
        # If no VAD info was found, the entire utt will be used.
        feat_mat_voiced = feat_mat
    num_frames_voiced = feat_mat_voiced.shape[0]
    logging.debug('feat_mat_voiced: %s' % (str(feat_mat_voiced.shape)))

    spk_id = utt_meta['spkid']

    logging.debug('Chunk size: %d' % (self.chunk_size))
    results = []
    chunk_idx = 0
    if self.add_random_offset:
        random_offset = np.random.randint(0, self.chunk_size)
    else:
        random_offset = 0
    for offset in range(random_offset, num_frames_voiced, self.chunk_size):
        if self.single_chunk:
            available = num_frames_voiced - self.chunk_size
            if available < 0:  # No padding.
                logging.warning('Single chunk mode: available < 0.')
                break
            offset = random.randint(0, available)
        logging.debug('offset = %d' % (offset))
        feat_chunk = feat_mat_voiced[offset:offset + self.chunk_size, :]
        unpadded_frames = feat_chunk.shape[0]
        if self.pad_chunks and unpadded_frames < self.chunk_size:
            rel_chunk_len = float(unpadded_frames) / self.chunk_size
            if rel_chunk_len < self.drop_short_chunks:
                continue
            logging.debug('Padding chunk of frames %d ...' % (unpadded_frames))
            padded = np.zeros((self.chunk_size, feat_dim),
                              dtype=feat_chunk.dtype)
            padded[:unpadded_frames, :] = feat_chunk
            feat_chunk = padded
        feat_chunk = np.expand_dims(feat_chunk, axis=2)  # TODO: not here
        sample_key = '%s_chunk%02d' % (utt_key, chunk_idx)
        sample = (feat_chunk, spk_id, sample_key)
        chunk_idx += 1
        results.append(sample)
        if self.single_chunk:
            break
    if result_queue:
        # queue mode
        result_queue.put(results)
        return None
    # imap mode
    return results
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--stats-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--norm-means', type=strtobool, default=True,
                        help='Do mean normalization or not.')
    parser.add_argument('--norm-vars', type=strtobool, default=False,
                        help='Do variance normalization or not.')
    parser.add_argument('--reverse', type=strtobool, default=False,
                        help='Do reverse mode or not')
    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:spk2utt")')
    parser.add_argument('--utt2spk', type=str,
                        help='A text file of utterance to speaker map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:utt2spk")')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument('--compression-method', type=int, default=2,
                        help='Specify the method (if mat) or gzip-level (if hdf5)')
    parser.add_argument('stats_rspecifier_or_rxfilename',
                        help='Input stats. e.g. ark:stats.ark or stats.mat')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier id. e.g. ark:some.ark')
    parser.add_argument('wspecifier', type=str,
                        help='Write specifier id. e.g. ark:some.ark')
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if ':' in args.stats_rspecifier_or_rxfilename:
        is_rspecifier = True
        if args.stats_filetype == 'npy':
            stats_filetype = 'hdf5'
        else:
            stats_filetype = args.stats_filetype
        stats_dict = dict(FileReaderWrapper(args.stats_rspecifier_or_rxfilename,
                                            stats_filetype))
    else:
        is_rspecifier = False
        if args.stats_filetype == 'mat':
            stats = kaldiio.load_mat(args.stats_rspecifier_or_rxfilename)
        else:
            stats = numpy.load(args.stats_rspecifier_or_rxfilename)
        stats_dict = {None: stats}

    cmvn = CMVN(stats=stats_dict,
                norm_means=args.norm_means,
                norm_vars=args.norm_vars,
                utt2spk=args.utt2spk,
                spk2utt=args.spk2utt,
                reverse=args.reverse)

    with FileWriterWrapper(args.wspecifier,
                           filetype=args.out_filetype,
                           write_num_frames=args.write_num_frames,
                           compress=args.compress,
                           compression_method=args.compression_method) as writer:
        for utt, mat in FileReaderWrapper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # If data is a sound file, it is loaded as Tuple[int, ndarray]
                rate, mat = mat
            mat = cmvn(mat, utt if is_rspecifier else None)
            writer[utt] = mat
def _get_from_loader(self, filepath, filetype):
    """Return ndarray

    In order to make the file descriptors opened only at the first
    reference, the loaders are stored in self._loaders

    >>> ndarray = loader.get_from_loader(
    ...     'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5')

    :param: str filepath:
    :param: str filetype:
    :return:
    :rtype: np.ndarray
    """
    if filetype == "hdf5":
        # e.g.
        # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
        #             "filetype": "hdf5",
        # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
        filepath, key = filepath.split(":", 1)

        loader = self._loaders.get(filepath)
        if loader is None:
            # To avoid disk access, create loader only for the first time
            loader = h5py.File(filepath, "r")
            self._loaders[filepath] = loader
        return loader[key][()]
    elif filetype == "sound.hdf5":
        # e.g.
        # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
        #             "filetype": "sound.hdf5",
        # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL"
        filepath, key = filepath.split(":", 1)

        loader = self._loaders.get(filepath)
        if loader is None:
            # To avoid disk access, create loader only for the first time
            loader = SoundHDF5File(filepath, "r", dtype="int16")
            self._loaders[filepath] = loader
        array, rate = loader[key]
        return array
    elif filetype == "sound":
        # e.g.
        # {"input": [{"feat": "some/path.wav",
        #             "filetype": "sound"},
        # Assume PCM16
        if not self.keep_all_data_on_mem:
            array, _ = soundfile.read(filepath, dtype="int16")
            return array
        if filepath not in self._loaders:
            array, _ = soundfile.read(filepath, dtype="int16")
            self._loaders[filepath] = array
        return self._loaders[filepath]
    elif filetype == "npz":
        # e.g.
        # {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL",
        #             "filetype": "npz",
        filepath, key = filepath.split(":", 1)

        loader = self._loaders.get(filepath)
        if loader is None:
            # To avoid disk access, create loader only for the first time
            loader = np.load(filepath)
            self._loaders[filepath] = loader
        return loader[key]
    elif filetype == "npy":
        # e.g.
        # {"input": [{"feat": "some/path.npy",
        #             "filetype": "npy"},
        if not self.keep_all_data_on_mem:
            return np.load(filepath)
        if filepath not in self._loaders:
            self._loaders[filepath] = np.load(filepath)
        return self._loaders[filepath]
    elif filetype in ["mat", "vec"]:
        # e.g.
        # {"input": [{"feat": "some/path.ark:123",
        #             "filetype": "mat"}]},
        # In this case, "123" indicates the starting point of the matrix
        # load_mat can load both matrix and vector
        if not self.keep_all_data_on_mem:
            return kaldiio.load_mat(filepath)
        if filepath not in self._loaders:
            self._loaders[filepath] = kaldiio.load_mat(filepath)
        return self._loaders[filepath]
    elif filetype == "scp":
        # e.g.
        # {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL",
        #             "filetype": "scp",
        filepath, key = filepath.split(":", 1)

        loader = self._loaders.get(filepath)
        if loader is None:
            # To avoid disk access, create loader only for the first time
            loader = kaldiio.load_scp(filepath)
            self._loaders[filepath] = loader
        return loader[key]
    else:
        raise NotImplementedError(
            "Not supported: loader_type={}".format(filetype))
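A small self-contained sketch of the `file.h5:key` path convention that `_get_from_loader` parses for the `hdf5` case; the file name and utterance key here are made up for illustration:

import h5py
import numpy as np

# Write a throwaway HDF5 file with one dataset keyed by utterance id.
with h5py.File('feats.h5', 'w') as f:
    f['F01_050C0101_PED_REAL'] = np.random.rand(10, 4)

# Same split as in _get_from_loader: everything after the first ':' is the key.
filepath, key = 'feats.h5:F01_050C0101_PED_REAL'.split(':', 1)
with h5py.File(filepath, 'r') as loader:
    array = loader[key][()]
print(array.shape)  # (10, 4)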
def __getitem__(self, indices):
    """Create mini-batch per step.

    Args:
        indices (np.ndarray): indices of dataframe in the current mini-batch
    Returns:
        mini_batch_dict (dict):
            xs (list): input data of size `[T, input_dim]`
            xlens (list): lengths of xs
            ys (list): reference labels in the main task of size `[L]`
            ys_sub1 (list): reference labels in the 1st auxiliary task of size `[L_sub1]`
            ys_sub2 (list): reference labels in the 2nd auxiliary task of size `[L_sub2]`
            utt_ids (list): name of each utterance
            speakers (list): name of each speaker
            sessions (list): name of each session

    """
    # external alignment
    trigger_points = None
    if self.word_alignment_dir is not None:
        trigger_points = np.zeros(
            (len(indices), max([self.df['ylen'][i] for i in indices]) + 1),
            dtype=np.int32)
        for b, i in enumerate(indices):
            p = self.df['trigger_points'][i]
            trigger_points[b, :len(p)] = p - 1  # 0-indexed
            # NOTE: <eos> is not treated here
        trigger_points //= self.subsample_factor
    elif self.ctc_alignment_dir is not None:
        trigger_points = np.zeros(
            (len(indices), max([self.df['ylen'][i] for i in indices]) + 1),
            dtype=np.int32)
        for b, i in enumerate(indices):
            p = self.df['trigger_points'][i]  # including <eos>
            trigger_points[b, :len(p)] = p  # already 0-indexed

    # inputs
    xs = [kaldiio.load_mat(self.df['feat_path'][i]) for i in indices]
    xlens = [self.df['xlen'][i] for i in indices]
    utt_ids = [self.df['utt_id'][i] for i in indices]
    speakers = [self.df['speaker'][i] for i in indices]
    sessions = [self.df['session'][i] for i in indices]
    texts = [self.df['text'][i] for i in indices]
    feat_paths = [self.df['feat_path'][i] for i in indices]

    # main outputs
    if self.is_test:
        ys = [self._token2idx[0](self.df['text'][i]) for i in indices]
    else:
        ys = [list(map(int, str(self.df['token_id'][i]).split()))
              for i in indices]

    # sub1 outputs
    ys_sub1 = []
    if self.df_sub1 is not None:
        ys_sub1 = [list(map(int, str(self.df_sub1['token_id'][i]).split()))
                   for i in indices]
    elif self._vocab_sub1 > 0 and not self.is_test:
        ys_sub1 = [self._token2idx[1](self.df['text'][i]) for i in indices]

    # sub2 outputs
    ys_sub2 = []
    if self.df_sub2 is not None:
        ys_sub2 = [list(map(int, str(self.df_sub2['token_id'][i]).split()))
                   for i in indices]
    elif self._vocab_sub2 > 0 and not self.is_test:
        ys_sub2 = [self._token2idx[2](self.df['text'][i]) for i in indices]

    mini_batch_dict = {
        'xs': xs,
        'xlens': xlens,
        'ys': ys,
        'ys_sub1': ys_sub1,
        'ys_sub2': ys_sub2,
        'utt_ids': utt_ids,
        'speakers': speakers,
        'sessions': sessions,
        'text': texts,
        'feat_path': feat_paths,  # for plot
        'trigger_points': trigger_points,
    }
    return mini_batch_dict
import numpy as np
import pandas as pd
import scipy.linalg

import kaldiio

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("lda_mat", help="original lda matrix (kaldi .mat)")
    parser.add_argument("out_lda_mat", help="new lda matrix to write on")
    args, leftovers = parser.parse_known_args()

    # Check if n is at least one less dim than the original matrix

    # load matrix
    orig_mat = kaldiio.load_mat(args.lda_mat)

    # Earlier attempts:
    # new_mat = scipy.linalg.orth(orig_mat)
    # new_mat, r = np.linalg.qr(np.transpose(orig_mat), mode="complete")
    # q, r = np.linalg.qr(np.transpose(orig_mat))
    # new_mat = q.T

    # R equivalent: B <- t(qr.Q(qr(A), complete=TRUE)[, 5:10])
    # is the same as:
    #   q, r = np.linalg.qr(a)
    #   mat = q.T
    # MAYBE need to transpose the other way round (before calculating?)
    # OR NOT TRANSPOSE the second time?
    a = orig_mat.T
    q, r = np.linalg.qr(a, mode='complete')
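The script above stops after the QR decomposition and never writes `args.out_lda_mat`. A minimal sketch of the missing write step, assuming the intent is to store the orthonormal basis back in Kaldi matrix format with `kaldiio.save_mat` (as used in the test example further below); the transpose question raised in the comments is left unresolved:

# Hedged completion: write the QR basis to the requested output path.
new_mat = q.T
kaldiio.save_mat(args.out_lda_mat, new_mat)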
def __init__(self, corpus, tsv_path, dict_path,
             unit, nlsyms, wp_model, is_test,
             min_n_frames, max_n_frames, sort_by,
             short2long, tsv_path_sub1, tsv_path_sub2,
             ctc, ctc_sub1, ctc_sub2,
             subsample_factor, subsample_factor_sub1, subsample_factor_sub2,
             dict_path_sub1, dict_path_sub2,
             unit_sub1, unit_sub2,
             wp_model_sub1, wp_model_sub2,
             discourse_aware=False, first_n_utterances=-1,
             word_alignment_dir=None, ctc_alignment_dir=None):
    """Custom Dataset class.

    Args:
        corpus (str): name of corpus
        tsv_path (str): path to the dataset tsv file
        dict_path (str): path to the dictionary
        unit (str): word/wp/char/phone/word_char
        nlsyms (str): path to the non-linguistic symbols file
        wp_model (): path to the word-piece model for sentencepiece
        is_test (bool):
        min_n_frames (int): exclude utterances shorter than this value
        max_n_frames (int): exclude utterances longer than this value
        sort_by (str): sort all utterances in the ascending order
            input: sort by input length
            output: sort by output length
            shuffle: shuffle all utterances
        short2long (bool): sort utterances in the descending order
        ctc (bool):
        subsample_factor (int):
        discourse_aware (bool): sort in the discourse order
        first_n_utterances (int): evaluate the first N utterances
        word_alignment_dir (str): path to word alignment directory
        ctc_alignment_dir (str): path to CTC alignment directory

    """
    super(Dataset, self).__init__()

    self.epoch = 0

    # meta data accessed by dataloader
    self._corpus = corpus
    self._set = os.path.basename(tsv_path).split('.')[0]
    self._vocab = count_vocab_size(dict_path)
    self._unit = unit
    self._unit_sub1 = unit_sub1
    self._unit_sub2 = unit_sub2

    self.is_test = is_test
    self.sort_by = sort_by
    assert sort_by in ['input', 'output', 'shuffle', 'utt_id']
    # if shuffle_bucket:
    #     assert sort_by in ['input', 'output']
    if discourse_aware:
        assert not is_test

    self.subsample_factor = subsample_factor
    self.word_alignment_dir = word_alignment_dir
    self.ctc_alignment_dir = ctc_alignment_dir

    self._idx2token = []
    self._token2idx = []

    # Set index converter
    if unit in ['word', 'word_char']:
        self._idx2token += [Idx2word(dict_path)]
        self._token2idx += [Word2idx(dict_path,
                                     word_char_mix=(unit == 'word_char'))]
    elif unit == 'wp':
        self._idx2token += [Idx2wp(dict_path, wp_model)]
        self._token2idx += [Wp2idx(dict_path, wp_model)]
    elif unit in ['char']:
        self._idx2token += [Idx2char(dict_path)]
        self._token2idx += [Char2idx(dict_path, nlsyms=nlsyms)]
    elif 'phone' in unit:
        self._idx2token += [Idx2phone(dict_path)]
        self._token2idx += [Phone2idx(dict_path)]
    else:
        raise ValueError(unit)

    for i in range(1, 3):
        dict_path_sub = locals()['dict_path_sub' + str(i)]
        wp_model_sub = locals()['wp_model_sub' + str(i)]
        unit_sub = locals()['unit_sub' + str(i)]
        if dict_path_sub:
            setattr(self, '_vocab_sub' + str(i),
                    count_vocab_size(dict_path_sub))

            # Set index converter
            if unit_sub:
                if unit_sub == 'wp':
                    self._idx2token += [Idx2wp(dict_path_sub, wp_model_sub)]
                    self._token2idx += [Wp2idx(dict_path_sub, wp_model_sub)]
                elif unit_sub == 'char':
                    self._idx2token += [Idx2char(dict_path_sub)]
                    self._token2idx += [Char2idx(dict_path_sub,
                                                 nlsyms=nlsyms)]
                elif 'phone' in unit_sub:
                    self._idx2token += [Idx2phone(dict_path_sub)]
                    self._token2idx += [Phone2idx(dict_path_sub)]
                else:
                    raise ValueError(unit_sub)
        else:
            setattr(self, '_vocab_sub' + str(i), -1)

    # Load dataset tsv file
    df = pd.read_csv(tsv_path, encoding='utf-8', delimiter='\t')
    df = df.loc[:, ['utt_id', 'speaker', 'feat_path',
                    'xlen', 'xdim', 'text', 'token_id', 'ylen', 'ydim']]
    for i in range(1, 3):
        if locals()['tsv_path_sub' + str(i)]:
            df_sub = pd.read_csv(locals()['tsv_path_sub' + str(i)],
                                 encoding='utf-8', delimiter='\t')
            df_sub = df_sub.loc[:, ['utt_id', 'speaker', 'feat_path',
                                    'xlen', 'xdim', 'text', 'token_id',
                                    'ylen', 'ydim']]
            setattr(self, 'df_sub' + str(i), df_sub)
        else:
            setattr(self, 'df_sub' + str(i), None)
    self._input_dim = kaldiio.load_mat(df['feat_path'][0]).shape[-1]

    # Remove inappropriate utterances
    print('Original utterance num: %d' % len(df))
    n_utts = len(df)
    if is_test or discourse_aware:
        df = df[df.apply(lambda x: x['ylen'] > 0, axis=1)]
        print('Removed %d empty utterances' % (n_utts - len(df)))
        if first_n_utterances > 0:
            n_utts = len(df)
            df = df[df.apply(lambda x: x['ylen'] > 0, axis=1)]
            df = df.truncate(before=0, after=first_n_utterances - 1)
            print('Select first %d utterances' % len(df))
    else:
        df = df[df.apply(
            lambda x: min_n_frames <= x['xlen'] <= max_n_frames, axis=1)]
        df = df[df.apply(lambda x: x['ylen'] > 0, axis=1)]
        print('Removed %d utterances (threshold)' % (n_utts - len(df)))

        if ctc and subsample_factor > 1:
            n_utts = len(df)
            df = df[df.apply(
                lambda x: x['ylen'] <= (x['xlen'] // subsample_factor),
                axis=1)]
            print('Removed %d utterances (for CTC)' % (n_utts - len(df)))

        for i in range(1, 3):
            df_sub = getattr(self, 'df_sub' + str(i))
            ctc_sub = locals()['ctc_sub' + str(i)]
            subsample_factor_sub = locals()['subsample_factor_sub' + str(i)]
            if df_sub is not None:
                if ctc_sub and subsample_factor_sub > 1:
                    df_sub = df_sub[df_sub.apply(
                        lambda x: x['ylen'] <= (
                            x['xlen'] // subsample_factor_sub), axis=1)]

                if len(df) != len(df_sub):
                    n_utts = len(df)
                    df = df.drop(df.index.difference(df_sub.index))
                    print('Removed %d utterances (for CTC, sub%d)' %
                          (n_utts - len(df), i))
                    for j in range(1, i + 1):
                        setattr(self, 'df_sub' + str(j),
                                getattr(self, 'df_sub' + str(j)).drop(
                                    getattr(self, 'df_sub' + str(j)).index.difference(df.index)))

    if corpus == 'swbd':
        # 1. serialize
        # df['session'] = df['speaker'].apply(lambda x: str(x).split('-')[0])
        # 2. not serialize
        df['session'] = df['speaker'].apply(lambda x: str(x))
    else:
        df['session'] = df['speaker'].apply(lambda x: str(x))

    # Sort tsv records
    if discourse_aware:
        # Sort by onset (start time)
        df = df.assign(prev_utt='')
        df = df.assign(line_no=list(range(len(df))))
        if corpus == 'swbd':
            df['onset'] = df['utt_id'].apply(
                lambda x: int(x.split('_')[-1].split('-')[0]))
        elif corpus == 'csj':
            df['onset'] = df['utt_id'].apply(lambda x: int(x.split('_')[1]))
        elif corpus == 'tedlium2':
            df['onset'] = df['utt_id'].apply(lambda x: int(x.split('-')[-2]))
        else:
            raise NotImplementedError(corpus)
        df = df.sort_values(by=['session', 'onset'], ascending=True)

        # Extract previous utterances
        groups = df.groupby('session').groups
        df['prev_utt'] = df.apply(
            lambda x: [df.loc[i, 'line_no']
                       for i in groups[x['session']]
                       if df.loc[i, 'onset'] < x['onset']], axis=1)
        df['n_prev_utt'] = df.apply(lambda x: len(x['prev_utt']), axis=1)
        df['n_utt_in_session'] = df.apply(
            lambda x: len([i for i in groups[x['session']]]), axis=1)
        df = df.sort_values(by=['n_utt_in_session'], ascending=short2long)

        # NOTE: this is used only when LM is trained with serialize: true
        # if is_test and corpus == 'swbd':
        #     # Sort by onset
        #     df['onset'] = df['utt_id'].apply(
        #         lambda x: int(x.split('_')[-1].split('-')[0]))
        #     df = df.sort_values(by=['session', 'onset'], ascending=True)

    elif not is_test:
        if sort_by == 'input':
            df = df.sort_values(by=['xlen'], ascending=short2long)
        elif sort_by == 'output':
            df = df.sort_values(by=['ylen'], ascending=short2long)
        elif sort_by == 'shuffle':
            df = df.reindex(np.random.permutation(df.index))

    # Fit word alignment to vocabulary
    if word_alignment_dir is not None:
        alignment2boundary = WordAlignmentConverter(dict_path, wp_model)
        n_utts = len(df)
        df['trigger_points'] = df.apply(
            lambda x: alignment2boundary(word_alignment_dir, x['speaker'],
                                         x['utt_id'], x['text']), axis=1)
        # remove utterances which do not have the alignment
        df = df[df.apply(lambda x: x['trigger_points'] is not None, axis=1)]
        print('Removed %d utterances (for word alignment)' %
              (n_utts - len(df)))
    elif ctc_alignment_dir is not None:
        n_utts = len(df)
        df['trigger_points'] = df.apply(
            lambda x: load_ctc_alignment(ctc_alignment_dir, x['speaker'],
                                         x['utt_id']), axis=1)
        # remove utterances which do not have the alignment
        df = df[df.apply(lambda x: x['trigger_points'] is not None, axis=1)]
        print('Removed %d utterances (for CTC alignment)' %
              (n_utts - len(df)))

    # Re-indexing
    if discourse_aware:
        self.df = df
        for i in range(1, 3):
            if getattr(self, 'df_sub' + str(i)) is not None:
                setattr(self, 'df_sub' + str(i),
                        getattr(self, 'df_sub' + str(i)).reindex(df.index))
    else:
        self.df = df.reset_index()
        for i in range(1, 3):
            if getattr(self, 'df_sub' + str(i)) is not None:
                setattr(self, 'df_sub' + str(i),
                        getattr(self, 'df_sub' + str(i)).reindex(
                            df.index).reset_index())
# source vectorizer ...
target_speech_df = pd.read_csv(config_args['target_speech_metadata'],
                               encoding='utf-8')

target_label_set = config_args['target_language_set'].split()

# make sure no utterances with 0 duration, such as:
# target_speech_df = target_speech_df[(target_speech_df.duration != 0)]
target_speech_df = target_speech_df[
    target_speech_df['language'].isin(target_label_set)]

# quick sanity check (was a bare notebook expression)
print(len(target_speech_df), target_label_set)

cmvn_stats = kaldiio.load_mat(config_args['source_cmvn'])
mean_stats = cmvn_stats[0, :-1]
count = cmvn_stats[0, -1]
offset = np.expand_dims(mean_stats, 0) / count

source_speech_vectorizer = LID_Vectorizer(
    data_dir=config_args['source_data_dir'],
    speech_df=source_speech_df,
    feature_type=config_args['input_signal_params']['feature_type'],
    label_set=config_args['source_language_set'].split(),
    max_num_frames=config_args['input_signal_params']['max_num_frames'],
    num_frames=config_args['input_signal_params']['num_frames'],
    feature_dim=config_args['model_arch']['feature_dim'],
    start_idx=config_args['input_signal_params']['start_index'],
    end_idx=config_args['input_signal_params']['end_index'],
    cmvn=offset)
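For context on the slicing above: Kaldi's compute-cmvn-stats writes a `(2, dim+1)` matrix whose first row holds per-dimension feature sums with the total frame count in the last column (hence `cmvn_stats[0, :-1]` and `cmvn_stats[0, -1]`), and whose second row holds per-dimension sums of squares. A sketch with synthetic stats, not tied to the config above:

import numpy as np

feats = np.random.rand(100, 40)           # stand-in feature matrix [T, dim]
stats = np.zeros((2, 41))
stats[0, :-1] = feats.sum(axis=0)         # per-dim sums
stats[0, -1] = feats.shape[0]             # total frame count
stats[1, :-1] = (feats ** 2).sum(axis=0)  # per-dim sums of squares

count = stats[0, -1]
mean = stats[0, :-1] / count              # matches the 'offset' computed above
std = np.sqrt(stats[1, :-1] / count - mean ** 2)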
def test_write_read_mat(tmpdir, endian, dtype):
    path = tmpdir.mkdir('test')
    valid = np.random.rand(1000, 120).astype(dtype)
    kaldiio.save_mat(path.join('a.mat').strpath, valid, endian=endian)
    test = kaldiio.load_mat(path.join('a.mat').strpath, endian=endian)
    np.testing.assert_array_equal(test, valid)
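In the test above, `endian` and `dtype` arrive as pytest fixtures defined elsewhere in the suite. A sketch of the parametrization assumed here (the exact values in the original test file may differ):

import numpy as np
import pytest

@pytest.fixture(params=['<', '>'])  # little- and big-endian
def endian(request):
    return request.param

@pytest.fixture(params=[np.float32, np.float64])
def dtype(request):
    return request.param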
path_of_scores = os.path.join(nnet_path, 'scores')
createdir(path_of_scores, override=False, append=True)
shutil.copyfile(__file__, os.path.join(nnet_path, 'train.py'))
shutil.copyfile(args.config_path, os.path.join(nnet_path, 'config.json'))
shutil.copytree(utils_path, os.path.join(exp_model_path, 'utils'))
shutil.copytree(score_path, os.path.join(exp_model_path, 'score'))
shutil.copytree(dataset_path, os.path.join(exp_model_path, 'dataset'))
recorder = SummaryWriter(exp_model_path)
log_path = os.path.join(exp_model_path, 'log')

if params.net_type == 'basic':
    model = BasicNet(in_features=params.feature_dim,
                     out_features=1,
                     input_process=params.input_process)
elif params.net_type == 'cnn_net':
    model = CnnNet(in_channels=2)
elif params.net_type == 'kaldi_lda':
    model = LdaNet(load_mat(params.kaldi_transform_path),
                   frozen=params.frozen,
                   contains_bias=params.contains_bias,
                   mid_feature=2048 if 'mid_feature' not in params.dict else params.mid_feature,
                   hidden_layers=1 if 'hidden_layers' not in params.dict else params.hidden_layers,
                   input_process='out_add_vec_cat' if 'input_process' not in params.dict else params.input_process)
elif params.net_type == 'kaldi_cnn':
    model = KaldiReductionCNN(load_mat(params.kaldi_transform_path),
                              frozen=params.frozen,
                              contains_bias=params.contains_bias)
print2file(model)
if params.use_gpu:
    model = model.cuda()
batch_size = params.batch_size

print2file('Dealing with data.')
train_sl = SiameseSet(
    scp_path=params.train_path,
    utt_per_spk=None if 'utt_per_spk' not in params.dict else params.utt_per_spk,
    pre_load=False if 'pre_load' not in params.dict else params.pre_load,
    strategy=params.strategy,
import json

import matplotlib.pyplot as plt

import kaldiio

root = "/home/nlp/ASR/espnet/egs/FSW"
with open(root + "/dump/test/deltafalse/data.json", "r") as f:
    test_json = json.load(f)["utts"]

key, info = list(test_json.items())[10]
fbank = kaldiio.load_mat(info["input"][0]["feat"])

# plot the speech feature
plt.matshow(fbank.T[::-1])
plt.title(key + ": " + info["output"][0]["text"])
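Across these examples, `kaldiio.load_mat` resolves two path styles: a plain file path (`stats.mat`) and an ark path with a byte offset (`feats.ark:123`), which is exactly what Kaldi scp files record per utterance. A minimal round-trip sketch with hypothetical file names:

import numpy as np
import kaldiio

mat = np.random.rand(5, 3).astype(np.float32)
kaldiio.save_ark('feats.ark', {'utt1': mat}, scp='feats.scp')

# feats.scp now maps 'utt1' to something like 'feats.ark:9';
# load_mat accepts that ark-plus-offset string directly.
with open('feats.scp') as f:
    _, ark_path = f.readline().strip().split(None, 1)
loaded = kaldiio.load_mat(ark_path)
np.testing.assert_array_equal(loaded, mat)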