def main(args):
    trials = [x.split() for x in open(args.trials)]
    utt1s = [x[0] for x in trials]
    utt2s = [x[1] for x in trials]
    if len(trials[0]) == 3:
        tar2int = {'nontarget': 0, 'target': 1}
        target = [tar2int[x[2]] for x in trials]
    else:
        target = None

    with kaldiio.ReadHelper(
            f'scp:{args.enroll_scp_dir}/{args.enroll_scp}') as reader:
        utt2embd_enroll = {utt: embd for utt, embd in reader}
    l_out, l_out_label = (
        np.array([utt2embd_enroll[i] for i in utt2embd_enroll]),
        np.array([i for i in utt2embd_enroll]),
    )
    l_out, l_out_label, _, _ = frontend(args, l_out, l_out_label,
                                        np.zeros((512, 512)), np.zeros((512, )))
    utt2embd_enroll = {utt: embd for utt, embd in zip(l_out_label, l_out)}

    with kaldiio.ReadHelper(
            f'scp:{args.trial_scp_dir}/{args.trial_scp}') as reader:
        utt2embd_trial = {utt: embd for utt, embd in reader}
    u_out, u_out_label = (
        np.array([utt2embd_trial[i] for i in utt2embd_trial]),
        np.array([i for i in utt2embd_trial]),
    )
    utt2embd_trial = {utt: embd for utt, embd in zip(u_out_label, u_out)}

    utt2embd_enroll = [utt2embd_enroll[utt] for utt in utt1s]
    utt2embd_trial = [utt2embd_trial[utt] for utt in utt2s]
    scores = cosine_scoring(utt2embd_enroll, utt2embd_trial)

    score_file_kaldi = []
    for enroll, trial, score in zip(utt1s, utt2s, scores):
        score_file_kaldi.append([enroll, trial, str(score)])
    with open(args.output, "w") as txt_file:
        for line in score_file_kaldi:
            # works with any number of elements in a line
            txt_file.write(" ".join(line) + "\n")

    if target is not None:
        eer, threshold = compute_eer(scores, target)
        print("ROC_EER: {:.2f}".format(eer * 100))
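# cosine_scoring is not defined in this snippet; a minimal sketch, assuming
# it scores each (enroll, trial) embedding pair by cosine similarity:
import numpy as np

def cosine_scoring(enroll_embds, trial_embds):
    scores = []
    for e, t in zip(enroll_embds, trial_embds):
        # cosine similarity of the paired embeddings
        scores.append(np.dot(e, t) / (np.linalg.norm(e) * np.linalg.norm(t)))
    return np.array(scores)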
def prepare_data(ivec_scp, data_dir):
    with kaldiio.ReadHelper('scp:' + ivec_scp) as reader:
        ivectors = {}
        for k, iv in reader:
            ivectors[k] = iv

    with open('{}/utt2lang'.format(data_dir), 'r') as input_utt2lang:
        utt2lang_dict = {}
        for line in input_utt2lang:
            utt2lang_dict[line.split(' ')[0]] = line.split(' ')[1].strip('\n')

    with open('{}/utt2spk'.format(data_dir), 'r') as input_utt2spk:
        utt2spk_dict = {}
        for line in input_utt2spk:
            utt2spk_dict[line.split(' ')[0]] = line.split(' ')[1].strip('\n')

    with open('{}/utt2sent'.format(data_dir), 'r') as input_utt2sent:
        utt2sent_dict = {}
        for line in input_utt2sent:
            utt2sent_dict[line.split(' ')[0]] = line.split(' ')[1].strip('\n')

    ivectors_df = pd.DataFrame.from_dict(ivectors, orient='index').sort_index()
    labels_df = pd.DataFrame.from_dict(utt2lang_dict, orient='index',
                                       columns=["lang"]).sort_index()
    labels_df["spk"] = pd.DataFrame.from_dict(utt2spk_dict, orient='index')
    labels_df["sent"] = pd.DataFrame.from_dict(utt2sent_dict, orient='index')
    return ivectors_df, labels_df
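# A hedged usage sketch for prepare_data; the paths below are hypothetical:
# ivectors_df, labels_df = prepare_data('exp/ivectors/ivector.scp', 'data/train')
# print(ivectors_df.shape, labels_df[['lang', 'spk', 'sent']].head())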
def compute_spectrum():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = float(args.sample_rate)
    config['output_type'] = int(args.output_type)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    spectrum = Spectrum.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        # reuse one session instead of creating one per utterance
        sess = tf.compat.v1.Session()
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            spectrum_test = spectrum(audio_data, args.sample_rate)
            spectrum_feats = spectrum_test.eval(session=sess)
            writer[utt_id] = spectrum_feats
def main():
    args = parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logger.info(get_commandline_args())

    utt_text_speaker = consolidate_utt_info(scp=None,
                                            text=args.text_file,
                                            utt2spk=args.utt2spk_file)
    with kaldiio.ReadHelper(
            args.rspecifier, segments=args.segments) as reader, file_writer_helper(
                args.wspecifier,
                filetype=args.archive_format,
                compress=args.compress,
                compression_method=args.compression_method,
                sample_frequency=args.sample_frequency,
                transform=Transformation(args.feature_config)) as writer:
        for utt_id, (rate, wave) in tqdm.tqdm(reader, miniters=100,
                                              maxinterval=30):
            utt_dict = {"x": wave, "rate": rate}
            utt_dict.update(utt_text_speaker.get(utt_id, {}))
            try:
                writer[utt_id] = utt_dict
            except Exception as e:
                logger.warning(
                    f"Failed to process utterance {utt_id} with exception:\n{str(e)}")
                continue
def __iter__(self):
    with kaldiio.ReadHelper(self.rspecifier, segments=self.segments) as reader:
        for key, array in reader:
            if self.return_shape:
                array = array.shape
            yield key, array
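# Hedged usage sketch: this __iter__ presumably belongs to a reader class
# (called KaldiReader here purely for illustration) holding rspecifier,
# segments, and return_shape attributes:
# reader = KaldiReader('scp:data/train/feats.scp', segments=None,
#                      return_shape=True)
# for key, shape in reader:
#     print(key, shape)  # e.g. ('utt1', (425, 83))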
def compute_fbank_pitch():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = float(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['thres_autoc'] = args.thres_autoc
    config['output_type'] = args.output_type
    fbank_pitch = FbankPitch.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        # reuse one session instead of creating one per utterance
        sess = tf.Session()
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            fbank_pitch_test = fbank_pitch(audio_data, args.sample_rate)
            fbank_pitch_feats = fbank_pitch_test.eval(session=sess)
            writer[utt_id] = fbank_pitch_feats
def compute_stft():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    stft = Analyfiltbank.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        # reuse one session instead of creating one per utterance
        sess = tf.Session()
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            power_spectrum, phase_spectrum = stft(audio_data, args.sample_rate)
            if args.output_type == 1:
                out_feats = power_spectrum.eval(session=sess)
            else:
                out_feats = phase_spectrum.eval(session=sess)
            writer[utt_id] = out_feats
def main():
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    with file_writer_helper(args.wspecifier,
                            filetype=args.filetype,
                            write_num_frames=args.write_num_frames,
                            compress=args.compress,
                            compression_method=args.compression_method,
                            pcm_format=args.format) as writer:
        for utt_id, (rate, array) in kaldiio.ReadHelper(args.rspecifier,
                                                        args.segments):
            if args.filetype == 'mat':
                # Kaldi-matrix doesn't support integer
                array = array.astype(numpy.float32)
            if array.ndim == 1:
                # (Time) -> (Time, Channel)
                array = array[:, None]
            if args.normalize is not None and args.normalize != 1:
                array = array.astype(numpy.float32)
                array = array / (1 << (args.normalize - 1))

            if preprocessing is not None:
                orgtype = array.dtype
                out = preprocessing(array, uttid_list=utt_id)
                out = out.astype(orgtype)
                if args.keep_length:
                    # The length can be changed by stft, for example:
                    # truncate or zero-pad back to the input length.
                    if len(out) > len(array):
                        out = out[:len(array)]
                    elif len(out) < len(array):
                        out = numpy.pad(
                            out,
                            [(0, len(array) - len(out))]
                            + [(0, 0) for _ in range(out.ndim - 1)],
                            mode='constant')
                array = out

            # shape = (Time, Channel)
            if args.filetype in ['sound.hdf5', 'sound']:
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt_id] = (rate, array)
            else:
                writer[utt_id] = array
def main():
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            file_writer_helper(args.wspecifier,
                               filetype=args.filetype,
                               write_num_frames=args.write_num_frames,
                               compress=args.compress,
                               compression_method=args.compression_method
                               ) as writer:
        for utt_id, (_, array) in reader:
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))
            spc = spectrogram(x=array,
                              n_fft=args.n_fft,
                              n_shift=args.n_shift,
                              win_length=args.win_length,
                              window=args.window)
            writer[utt_id] = spc
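# The spectrogram helper is not shown here; a minimal librosa-based sketch,
# assuming it returns a (Time, Freq) magnitude spectrogram with these
# keyword names:
import librosa
import numpy

def spectrogram(x, n_fft, n_shift, win_length=None, window='hann'):
    # librosa.stft returns (Freq, Time); transpose to (Time, Freq)
    return numpy.abs(librosa.stft(x, n_fft=n_fft, hop_length=n_shift,
                                  win_length=win_length, window=window)).T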
def apply_cmvn():
    args = get_parser().parse_args()

    if ':' in args.stats_rspecifier_or_rxfilename:
        is_rspecifier = True
        stats_filetype = 'ark'
        stats_dict = dict(KaldiReader(args.stats_rspecifier_or_rxfilename))
    else:
        is_rspecifier = False
        stats_filetype = 'mat'
        stats = kaldiio.load_mat(args.stats_rspecifier_or_rxfilename)
        stats_dict = {None: stats}

    config = {}
    config['norm_means'] = args.norm_means
    config['norm_vars'] = args.norm_vars
    config['utt2spk'] = args.utt2spk
    config['spk2utt'] = args.spk2utt
    config['reverse'] = args.reverse
    config['std_floor'] = args.std_floor
    config['filetype'] = stats_filetype
    cmvn = CMVN.params(config).instantiate()
    cmvn.call(stats_dict)

    with KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                     compress=args.compress,
                     compression_method=args.compression_method) as writer, \
            kaldiio.ReadHelper(args.rspecifier) as reader:
        for utt, mat in reader:
            mat_new = cmvn.apply_cmvn(mat, utt)
            writer[utt] = mat_new
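# Background sketch of the normalization apply_cmvn performs. Kaldi CMVN
# stats are a 2 x (dim + 1) matrix: row 0 holds per-dimension sums with the
# frame count in the last column, row 1 holds per-dimension sums of squares.
# This standalone helper is an illustration, not the CMVN class used above:
import numpy as np

def cmvn_from_stats(mat, stats, norm_vars=False, std_floor=1e-20):
    count = stats[0, -1]
    mean = stats[0, :-1] / count
    out = mat - mean
    if norm_vars:
        var = stats[1, :-1] / count - mean ** 2
        out = out / np.maximum(np.sqrt(var), std_floor)
    return out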
def compute_mfcc():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['upper_frequency_limit'] = float(args.upper_frequency_limit)
    config['lower_frequency_limit'] = float(args.lower_frequency_limit)
    config['filterbank_channel_count'] = float(args.filterbank_channel_count)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['output_type'] = args.output_type
    config['window_type'] = args.window_type
    config['snip_edges'] = args.snip_edges
    config['preeph_coeff'] = args.preeph_coeff
    config['remove_dc_offset'] = args.remove_dc_offset
    config['is_fbank'] = args.is_fbank
    config['cepstral_lifter'] = args.cepstral_lifter
    config['coefficient_count'] = args.coefficient_count
    mfcc = Mfcc.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        # reuse one session instead of creating one per utterance
        sess = tf.Session()
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            mfcc_test = tf.squeeze(mfcc(audio_data, args.sample_rate))
            mfcc_feats = mfcc_test.eval(session=sess)
            writer[utt_id] = mfcc_feats
def main():
    parser = get_parser()
    args = parser.parse_args()

    # set logger
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if not os.path.exists(args.figdir):
        os.makedirs(args.figdir)

    with kaldiio.ReadHelper(args.rspecifier) as reader, \
            codecs.open(args.wspecifier, "w", encoding="utf-8") as f:
        for utt_id, (rate, array) in reader:
            assert rate == args.fs
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))
            array_trim, idx = librosa.effects.trim(
                y=array,
                top_db=args.threshold,
                frame_length=args.win_length,
                hop_length=args.shift_length)
            start, end = idx / args.fs

            # save figure
            plt.subplot(2, 1, 1)
            plt.plot(array)
            plt.title("Original")
            plt.subplot(2, 1, 2)
            plt.plot(array_trim)
            plt.title("Trim")
            plt.tight_layout()
            plt.savefig(args.figdir + "/" + utt_id + ".png")
            plt.close()

            # add minimum silence part
            start = max(0.0, start - args.min_silence)
            end = min(len(array) / args.fs, end + args.min_silence)

            # write to segments file
            segment = "%s %s %f %f\n" % (utt_id, utt_id, start, end)
            f.write(segment)
def main():
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # Find the number of utterances
    n_utt = sum(1 for line in open(args.segments))
    logging.info("%d utterances found to be processed." % n_utt)

    # Compute fbank features
    with kaldiio.ReadHelper(
            args.rspecifier, segments=args.segments) as reader, file_writer_helper(
                args.wspecifier,
                filetype=args.filetype,
                write_num_frames=args.write_num_frames,
                compress=args.compress,
                compression_method=args.compression_method,
            ) as writer:
        for i, (utt_id, (rate, array)) in enumerate(reader, start=1):
            logging.info("processing %d/%d (%.2f%%)" % (i, n_utt, 100 * i / n_utt))
            try:
                assert rate == args.fs
                array = array.astype(numpy.float32)
                if args.normalize is not None and args.normalize != 1:
                    array = array / (1 << (args.normalize - 1))
                lmspc = logmelspectrogram(
                    x=array,
                    fs=args.fs,
                    n_mels=args.n_mels,
                    n_fft=args.n_fft,
                    n_shift=args.n_shift,
                    win_length=args.win_length,
                    window=args.window,
                    fmin=args.fmin,
                    fmax=args.fmax,
                )
                writer[utt_id] = lmspc
            except Exception:
                logging.warning("failed to compute fbank for utt_id=`%s`" % utt_id)
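# logmelspectrogram is not defined in this snippet; a minimal sketch using
# librosa, assuming a log10 mel spectrogram with these keyword names:
import librosa
import numpy

def logmelspectrogram(x, fs, n_mels, n_fft, n_shift, win_length=None,
                      window='hann', fmin=None, fmax=None, eps=1e-10):
    # magnitude spectrogram, (Time, Freq)
    spc = numpy.abs(librosa.stft(x, n_fft=n_fft, hop_length=n_shift,
                                 win_length=win_length, window=window)).T
    mel_basis = librosa.filters.mel(sr=fs, n_fft=n_fft, n_mels=n_mels,
                                    fmin=fmin if fmin is not None else 0,
                                    fmax=fmax if fmax is not None else fs / 2)
    return numpy.log10(numpy.maximum(eps, numpy.dot(spc, mel_basis.T)))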
def main(args):
    with kaldiio.ReadHelper(f'scp:{args.emb_in}') as reader:
        x_vector_u = {utt: embd for utt, embd in reader}
    R = np.load(args.rotation)

    # Convert from dictionaries to numpy arrays
    u_out, u_out_label = (
        np.array([x_vector_u[i] for i in x_vector_u]),
        np.array([i for i in x_vector_u]),
    )
    _, _, emb, emb_label = frontend(args, np.zeros((512, 512)),
                                    np.zeros((512, )), u_out, u_out_label)
    R_emb = np.dot(emb, R)
    scp_data = {utt: embd for utt, embd in zip(emb_label, R_emb)}
    kaldiio.save_ark(f'{args.emb_out}/transformed_xvector.ark', scp_data,
                     scp=f'{args.emb_out}/transformed_xvector.scp')
def compute_pitch():
    parser = get_parser()
    args = parser.parse_args()

    config = {}
    config['sample_rate'] = int(args.sample_rate)
    config['window_length'] = args.window_length
    config['frame_length'] = args.frame_length
    config['snip_edges'] = args.snip_edges
    config['preemph_coeff'] = args.preemph_coeff
    config['min_f0'] = args.min_f0
    config['max_f0'] = args.max_f0
    config['soft_min_f0'] = args.soft_min_f0
    config['penalty_factor'] = args.penalty_factor
    config['lowpass_cutoff'] = args.lowpass_cutoff
    config['resample_freq'] = args.resample_freq
    config['delta_pitch'] = args.delta_pitch
    config['nccf_ballast'] = args.nccf_ballast
    config['lowpass_filter_width'] = args.lowpass_filter_width
    config['upsample_filter_width'] = args.upsample_filter_width
    config['max_frames_latency'] = args.max_frames_latency
    config['frames_per_chunk'] = args.frames_per_chunk
    config['simulate_first_pass_online'] = args.simulate_first_pass_online
    config['recompute_frame'] = args.recompute_frame
    config['nccf_ballast_online'] = args.nccf_ballast_online
    pitch = Pitch.params(config).instantiate()

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            KaldiWriter(args.wspecifier, write_num_frames=args.write_num_frames,
                        compress=args.compress,
                        compression_method=args.compression_method) as writer:
        # reuse one session instead of creating one per utterance
        sess = tf.Session()
        for utt_id, (sample_rate, array) in reader:
            if sample_rate != args.sample_rate:
                args.sample_rate = sample_rate
            array = array.astype(np.float32)
            audio_data = tf.constant(array, dtype=tf.float32)
            pitch_test = tf.squeeze(pitch(audio_data, args.sample_rate))
            pitch_feats = pitch_test.eval(session=sess)
            writer[utt_id] = pitch_feats
def wav_generator(rspecifier, segments=None):
    """Generates wav-arrays from multiple wav-rspecifiers

    :param List[str] rspecifier:
    :param str segments:
    """
    readers = [kaldiio.ReadHelper(r, segments=segments) for r in rspecifier]
    for vs in zip(*readers):
        for (_, v), r in zip(vs, rspecifier):
            # kaldiio.load_mat can handle both wavfile and kaldi-matrix,
            # and if it is wavfile, returns (rate, ndarray), else ndarray
            if not isinstance(v, tuple):
                raise RuntimeError('"{}" is an invalid wav file.'.format(r))

        utts = [utt_id for utt_id, _ in vs]
        if not all(u == utts[0] for u in utts):
            raise RuntimeError(
                'All keys must be common among wav-rspecifiers: {}'.format(
                    rspecifier))

        rates = [rate for utt_id, (rate, array) in vs]
        if not all(rates[i] == rates[0] for i in range(len(vs))):
            raise RuntimeError('The sampling rate must be common '
                               'among wav-rspecifiers: {}'.format(rspecifier))

        arrays = []
        for utt_id, (rate, array) in vs:
            if array.ndim == 1:
                # shape = (Time, 1)
                array = array[:, None]
            arrays.append(array)

        utt_id = utts[0]
        rate = rates[0]
        # [Time, Channel]
        array = numpy.concatenate(arrays, axis=1)
        yield utt_id, (rate, array)
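# Hedged usage sketch for wav_generator: merge two single-channel wav
# rspecifiers into one (Time, 2) stream; the paths below are hypothetical:
# for utt_id, (rate, array) in wav_generator(
#         ['scp:data/ch1/wav.scp', 'scp:data/ch2/wav.scp']):
#     print(utt_id, rate, array.shape)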
def main(args):
    trials = [x.split() for x in open(os.path.join(args.data, 'trials'))]
    utt1s = [x[0] for x in trials]
    utt2s = [x[1] for x in trials]
    if len(trials[0]) == 3:
        tar2int = {'nontarget': 0, 'target': 1}
        target = [tar2int[x[2]] for x in trials]
    else:
        target = None

    embd_scp = os.path.join(args.data, 'embedding.scp')
    with kaldiio.ReadHelper(f'scp:{embd_scp}') as reader:
        utt2embd = {utt: embd for utt, embd in reader}

    embd1s = [utt2embd[utt] for utt in utt1s]
    embd2s = [utt2embd[utt] for utt in utt2s]
    scores = cosine_scoring(embd1s, embd2s)

    score_path = os.path.join(args.data, 'scores.txt')
    np.savetxt(score_path, scores, fmt='%.4f')

    if target is not None:
        eer, threshold = compute_eer(scores, target)
        print("EER: {:.2f}%".format(eer * 100))
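# compute_eer is not shown; a common ROC-based sketch (an assumption about
# the original implementation, using sklearn):
import numpy as np
from sklearn.metrics import roc_curve

def compute_eer(scores, labels):
    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
    fnr = 1 - tpr
    # EER is where the false-accept and false-reject rates cross
    idx = np.nanargmin(np.abs(fnr - fpr))
    eer = (fpr[idx] + fnr[idx]) / 2
    return eer, thresholds[idx]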
help="name to output file (ivectors.h5f or lda_ivectors.h5f") parser.parse_args() args, leftovers = parser.parse_known_args() print(args.output_name) try: shutil.rmtree('{}/tmp'.format(args.target_dir)) except: pass if os.path.exists('{}/{}'.format(args.target_dir, args.output_name)): os.remove('{}/{}'.format(args.target_dir, args.output_name)) os.makedirs('{}/tmp'.format(args.target_dir)) with kaldiio.ReadHelper('scp:{}'.format(args.feats_file)) as reader: filenames = [] times = np.array([0]) for key, numpy_array in reader: filenames.append(key) ivector_2d = np.expand_dims(numpy_array.astype(np.float64), axis=0) np.savez('{}/tmp/{}'.format(args.target_dir, key), features=ivector_2d, time=times) print('aaa') any2h5features.convert('{}/tmp/'.format(args.target_dir), '{}/{}'.format(args.target_dir, args.output_name)) print(args.target_dir, args.output_name) print('bbb') # shutil.rmtree('{}/tmp'.format(args.target_dir))
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int, help='Sampling frequency')
    parser.add_argument('--fmax', type=int, default=None, nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin', type=int, default=None, nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels', type=int, default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft', type=int, default=1024,
                        help='FFT length in points')
    parser.add_argument('--n_shift', type=int, default=512,
                        help='Shift length in points')
    parser.add_argument('--win_length', type=int, default=None, nargs='?',
                        help='Analysis window length in points')
    parser.add_argument('--window', type=str, default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument('--compression-method', type=int, default=2,
                        help='Specify the method (if mat) or gzip-level (if hdf5)')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument('--segments', type=str,
                        help='segments-file format: each line is either '
                             '<segment-id> <recording-id> <start-time> <end-time> '
                             'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier, segments=args.segments) as reader, \
            FileWriterWrapper(args.wspecifier,
                              filetype=args.filetype,
                              write_num_frames=args.write_num_frames,
                              compress=args.compress,
                              compression_method=args.compression_method
                              ) as writer:
        for utt_id, (rate, array) in reader:
            assert rate == args.fs
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))
            lmspc = logmelspectrogram(x=array,
                                      fs=args.fs,
                                      n_mels=args.n_mels,
                                      n_fft=args.n_fft,
                                      n_shift=args.n_shift,
                                      win_length=args.win_length,
                                      window=args.window,
                                      fmin=args.fmin,
                                      fmax=args.fmax)
            writer[utt_id] = lmspc
import torch
from data_io import load_dataset, read_lab_fea_refac01
import kaldiio
import os
import numpy as np


def check_dir(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)


if __name__ == '__main__':
    feat_opts = "apply-cmvn --utt2spk=ark:/users/liuli/project/kaldi/egs/timit/s5/data/GSC_enroll_customized/utt2spk ark:/users/liuli/project/kaldi/egs/timit/s5/GSC_fbank/cmvn_GSC_enroll_customized.ark ark:- ark:- |"
    scp_file = "/users/liuli/project/kaldi/egs/timit/s5/data/GSC_enroll_customized/feats.scp"
    output_dir = "/users/liuli/database/features/GSC_V2/win25ms_hop10ms_41fbank_cmvn/enroll_customized"
    # pipe the features through Kaldi's copy-feats and apply-cmvn binaries
    ark_file = "ark:copy-feats scp:" + scp_file + " ark:- | " + feat_opts

    idx = 0
    with kaldiio.ReadHelper(ark_file) as reader:
        for key, numpy_array in reader:
            idx += 1
            label = key.split("-")[0]
            file_name = key.split("-")[1]
            save_dir = os.path.join(output_dir, label)
            check_dir(save_dir)
            save_path = os.path.join(save_dir, file_name + ".npy")
            np.save(save_path, numpy_array)
            if idx % 1000 == 0:
                print("{}/{} files finished".format(idx, 3600))
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int, help='Sampling frequency')
    parser.add_argument('--threshold', type=float, default=60,
                        help='Threshold in decibels')
    parser.add_argument('--win_length', type=int, default=1024,
                        help='Analysis window length in points')
    parser.add_argument('--shift_length', type=int, default=256,
                        help='Shift length in points')
    parser.add_argument('--min_silence', type=float, default=0.01,
                        help='Minimum silence length in seconds')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument('wspecifier', type=str, help='Segments file')
    args = parser.parse_args()

    # set logger
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier) as reader, \
            codecs.open(args.wspecifier, "w", encoding="utf-8") as f:
        for utt_id, (rate, array) in reader:
            assert rate == args.fs
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                array = array / (1 << (args.normalize - 1))
            array_trim, idx = librosa.effects.trim(
                y=array,
                top_db=args.threshold,
                frame_length=args.win_length,
                hop_length=args.shift_length)
            start, end = idx / args.fs

            # add minimum silence part
            start = max(0.0, start - args.min_silence)
            end = min(len(array) / args.fs, end + args.min_silence)

            # write to segments file
            segment = "%s_%s_%s %s %f %f\n" % (utt_id, _time_to_str(start),
                                               _time_to_str(end),
                                               utt_id, start, end)
            f.write(segment)
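# _time_to_str is referenced above but not defined in this snippet; a
# plausible sketch that encodes seconds as zero-padded milliseconds so the
# generated segment ids sort lexicographically (an assumption, not the
# original helper):
def _time_to_str(t):
    return "%06d" % int(t * 1000)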
def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( description= "Preprocess audio and then extract features (See detail in parallel_wavegan/bin/preprocess.py)." ) parser.add_argument( "--scp", default=None, type=str, help= "kaldi-style wav.scp file. you need to specify either scp or rootdir.") parser.add_argument( "--segments", default=None, type=str, help= "kaldi-style segments file. if use, you must to specify both scp and segments." ) parser.add_argument( "--rootdir", default=None, type=str, help= "directory including wav files. you need to specify either scp or rootdir." ) parser.add_argument("--dumpdir", type=str, required=True, help="directory to dump feature files.") parser.add_argument("--config", type=str, required=True, help="yaml format configuration file.") parser.add_argument("--n_jobs", type=int, default=16, help="number of parallel jobs. (default=16)") parser.add_argument( "--verbose", type=int, default=1, help="logging level. higher is more logging. (default=1)") args = parser.parse_args() # set logger if args.verbose > 1: logging.basicConfig( level=logging.DEBUG, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") elif args.verbose > 0: logging.basicConfig( level=logging.INFO, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") else: logging.basicConfig( level=logging.WARN, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") logging.warning('skip DEBUG/INFO messages') # load config with open(args.config) as f: config = yaml.load(f, Loader=yaml.Loader) config.update(vars(args)) # check arguments if (args.scp is not None and args.rootdir is not None) or \ (args.scp is None and args.rootdir is None): raise ValueError("Please specify either rootdir or scp.") # get dataset if args.scp is not None: dataset = kaldiio.ReadHelper(f"scp:{args.scp}", segments=args.segments) else: dataset = AudioDataset(args.rootdir, "*.wav", audio_load_fn=sf.read, return_filename=True) # check directly existence if not os.path.exists(args.dumpdir): os.makedirs(args.dumpdir, exist_ok=True) # define function for parallel processing def _process_single_file(data): # parse inputs if args.scp is not None: utt_id, (fs, audio) = data audio = audio.astype(np.float32) audio /= (1 << (16 - 1)) # assume that wav is PCM 16 bit else: name, (audio, fs) = data utt_id = os.path.basename(name).replace(".wav", "") # check assert len(audio.shape) == 1, \ f"{utt_id} seems to be multi-channel signal." assert fs == config["sampling_rate"], \ f"{utt_id} seems to have a different sampling rate." assert np.abs(audio).max() <= 1.0, \ f"{utt_id} seems to be different from 16 bit PCM." # trim silence if config["trim_silence"]: audio, _ = librosa.effects.trim( audio, top_db=config["trim_threshold_in_db"], frame_length=config["trim_frame_size"], hop_length=config["trim_hop_size"]) # extract feature mel = logmelfilterbank(audio, fs, fft_size=config["fft_size"], hop_size=config["hop_size"], win_length=config["win_length"], window=config["window"], num_mels=config["num_mels"], fmin=config["fmin"], fmax=config["fmax"]) # make sure the audio length and feature length are matched audio = np.pad(audio, (0, config["fft_size"]), mode="edge") audio = audio[:len(mel) * config["hop_size"]] assert len(mel) * config["hop_size"] == len(audio) # apply global gain if config["global_gain_scale"] > 0.0: audio *= config["global_gain_scale"] if np.abs(audio).max() > 1.0: logging.warn(f"{utt_id} causes clipping. 
" f"it is better to re-consider global gain scale.") return # save if config["format"] == "hdf5": write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "wave", audio.astype(np.float32)) write_hdf5(os.path.join(args.dumpdir, f"{utt_id}.h5"), "feats", mel.astype(np.float32)) elif config["format"] == "npy": np.save(os.path.join(args.dumpdir, f"{utt_id}-wave.npy"), audio.astype(np.float32), allow_pickle=False) np.save(os.path.join(args.dumpdir, f"{utt_id}-feats.npy"), mel.astype(np.float32), allow_pickle=False) else: raise ValueError("support only hdf5 or npy format.") # process in parallel Parallel(n_jobs=args.n_jobs, verbose=args.verbose)( [delayed(_process_single_file)(data) for data in tqdm(dataset)])
from apc_model import APCModel
from utils import PrenetConfig, RNNConfig

# added by Sameer
import kaldiio
import sys

feats_scp = sys.argv[1]
segments = sys.argv[2]
scp_file = sys.argv[3]
ark_file = scp_file.replace('.scp', '.ark')
writer = kaldiio.WriteHelper('ark,scp:%s,%s' % (ark_file, scp_file))
if segments:
    reader = kaldiio.ReadHelper('scp:%s' % feats_scp, segments=segments)
else:
    reader = kaldiio.ReadHelper('scp:%s' % feats_scp)


def main():
    prenet_config = None
    rnn_config = RNNConfig(input_size=80,
                           hidden_size=512,
                           num_layers=3,
                           dropout=0.,
                           residual=True)  # Sameer added residual=True
    pretrained_apc = APCModel(mel_dim=80,
                              prenet_config=prenet_config,
                              rnn_config=rnn_config).cuda()
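# Hedged usage sketch for the reader/writer pair set up above: the real
# script presumably writes APC model outputs, but the I/O pattern itself is:
# for utt_id, feats in reader:
#     writer[utt_id] = feats  # replace with the APC forward-pass output
# reader.close()
# writer.close()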
def __iter__(self):
    if self.filetype == 'mat':
        with kaldiio.ReadHelper(self.rspecifier) as reader:
            for key, array in reader:
                if self.return_shape:
                    array = array.shape
                yield key, array

    elif self.filetype == 'sound':
        if ':' not in self.rspecifier:
            raise ValueError('Give "rspecifier" such as "scp:some.scp": {}'
                             .format(self.rspecifier))
        ark_or_scp, filepath = self.rspecifier.split(':', 1)
        if ark_or_scp != 'scp':
            raise ValueError('Only supporting "scp" for sound file: {}'
                             .format(ark_or_scp))
        with io.open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                key, sound_file_path = line.rstrip().split(None, 1)
                # Assume PCM16
                array, rate = soundfile.read(sound_file_path, dtype='int16')
                # Change Tuple[ndarray, int] -> Tuple[int, ndarray]
                # (soundfile style -> scipy style)
                if self.return_shape:
                    array = array.shape
                yield key, (rate, array)

    elif self.filetype in ['hdf5', 'sound.hdf5']:
        if ':' not in self.rspecifier:
            raise ValueError('Give "rspecifier" such as "ark:some.ark": {}'
                             .format(self.rspecifier))
        ark_or_scp, filepath = self.rspecifier.split(':', 1)
        if ark_or_scp not in ['ark', 'scp']:
            raise ValueError('Must be scp or ark: {}'.format(ark_or_scp))

        if ark_or_scp == 'scp':
            hdf5_dict = {}
            with io.open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    key, value = line.rstrip().split(None, 1)
                    if ':' not in value:
                        raise RuntimeError(
                            'scp file for hdf5 should be like: '
                            '"uttid filepath.h5:key": {}({})'
                            .format(line, filepath))
                    path, h5_key = value.split(':', 1)

                    hdf5_file = hdf5_dict.get(path)
                    if hdf5_file is None:
                        if self.filetype == 'sound.hdf5':
                            hdf5_file = SoundHDF5File(path, 'r')
                        else:
                            hdf5_file = h5py.File(path, 'r')
                        hdf5_dict[path] = hdf5_file

                    if self.filetype == 'sound.hdf5':
                        # Change Tuple[ndarray, int] -> Tuple[int, ndarray]
                        # (soundfile style -> scipy style)
                        array, rate = hdf5_file[h5_key]
                        if self.return_shape:
                            array = array.shape
                        yield key, (rate, array)
                    else:
                        if self.return_shape:
                            yield key, hdf5_file[h5_key].shape
                        else:
                            yield key, hdf5_file[h5_key][()]

            # Closing all files
            for k in hdf5_dict:
                hdf5_dict[k].close()

        else:
            if filepath == '-':
                # Requires h5py>=2.9
                if PY2:
                    filepath = io.BytesIO(sys.stdin.read())
                else:
                    filepath = io.BytesIO(sys.stdin.buffer.read())
            if self.filetype == 'sound.hdf5':
                for key, (r, a) in SoundHDF5File(filepath, 'r').items():
                    if self.return_shape:
                        a = a.shape
                    yield key, (r, a)
            else:
                with h5py.File(filepath, 'r') as f:
                    for key in f:
                        if self.return_shape:
                            yield key, f[key].shape
                        else:
                            yield key, f[key][()]
    else:
        raise ValueError('Not supporting: filetype={}'.format(self.filetype))
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for output. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--format', type=str, default=None,
                        help='The file format for output pcm. '
                             'This option is only valid '
                             'when "--filetype" is "sound.hdf5" or "sound"')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument('--compression-method', type=int, default=2,
                        help='Specify the method (if mat) or gzip-level (if hdf5)')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('--preprocess-conf', type=str, default=None,
                        help='The configuration file for the pre-processing')
    parser.add_argument('--keep-length', type=strtobool, default=True,
                        help='Truncating or zero padding if the output length '
                             'is changed from the input by preprocessing')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument('--segments', type=str,
                        help='segments-file format: each line is either '
                             '<segment-id> <recording-id> <start-time> <end-time> '
                             'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    with FileWriterWrapper(args.wspecifier,
                           filetype=args.filetype,
                           write_num_frames=args.write_num_frames,
                           compress=args.compress,
                           compression_method=args.compression_method,
                           pcm_format=args.format) as writer:
        for utt_id, (rate, array) in kaldiio.ReadHelper(args.rspecifier,
                                                        args.segments):
            if args.filetype == 'mat':
                # Kaldi-matrix doesn't support integer
                array = array.astype(numpy.float32)
            if array.ndim == 1:
                # (Time) -> (Time, Channel)
                array = array[:, None]
            if args.normalize is not None and args.normalize != 1:
                array = array.astype(numpy.float32)
                array = array / (1 << (args.normalize - 1))

            if preprocessing is not None:
                orgtype = array.dtype
                out = preprocessing(array, uttid_list=utt_id)
                out = out.astype(orgtype)
                if args.keep_length:
                    # The length can be changed by stft, for example:
                    # truncate or zero-pad back to the input length.
                    if len(out) > len(array):
                        out = out[:len(array)]
                    elif len(out) < len(array):
                        out = numpy.pad(
                            out,
                            [(0, len(array) - len(out))]
                            + [(0, 0) for _ in range(out.ndim - 1)],
                            mode='constant')
                array = out

            # shape = (Time, Channel)
            if args.filetype in ['sound.hdf5', 'sound']:
                # Write Tuple[int, numpy.ndarray] (scipy style)
                writer[utt_id] = (rate, array)
            else:
                writer[utt_id] = array
parser.add_argument(
    "train",
    help="train directory - just basename. e.g. <train_bil_eng-ger>")
args, leftovers = parser.parse_known_args()
train = args.train

with open('data/emime/{}/utt2lang'.format(train), 'r') as input_utt2lang:
    utt2lang_dict = {}
    for line in input_utt2lang:
        utt2lang_dict[line.split(' ')[0]] = line.split(' ')[1].strip('\n')

ivec = 'exp_emime/ivectors-deltassdc/ivectors_128_tr-{}_ts-{}/ivector.scp'.format(
    train, train)
with kaldiio.ReadHelper('scp:' + ivec) as reader:
    ivectors = {}
    for k, iv in reader:
        ivectors[k] = iv

# ivectors_df = pd.DataFrame.from_dict(ivectors)
ivectors_df = pd.DataFrame.from_dict(ivectors, orient='index')

# this is our y
# utt2lang_df = pd.DataFrame(utt2lang_dict, index=["lang"])
predictor_df = pd.DataFrame.from_dict(
    utt2lang_dict, orient='index', columns=["lang"],
    dtype="category")  # "category" dtype so we can do dummy encoding
predictor_df["lang_dich"] = predictor_df["lang"].cat.codes  # encode as dichotomous
def main(): """Run decoding process.""" parser = argparse.ArgumentParser( description="Decode dumped features with trained Parallel WaveGAN Generator.") parser.add_argument("--scp", default=None, type=str, help="Kaldi-style feats.scp file.") parser.add_argument("--dumpdir", default=None, type=str, help="Directory including feature files.") parser.add_argument("--outdir", default=None, type=str, required=True, help="Direcotry to save generated speech.") parser.add_argument("--checkpoint", default=None, type=str, required=True, help="Checkpoint file.") parser.add_argument("--config", default=None, type=str, help="Yaml format configuration file.") parser.add_argument("--verbose", type=int, default=1, help="logging level (higher is more logging)") args = parser.parse_args() # set logger if args.verbose > 1: logging.basicConfig( level=logging.DEBUG, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") elif args.verbose > 0: logging.basicConfig( level=logging.INFO, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") else: logging.basicConfig( level=logging.WARN, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") logging.warning("skip DEBUG/INFO messages") # check direcotry existence if not os.path.exists(args.outdir): os.makedirs(args.outdir) # load config if args.config is None: dirname = os.path.dirname(args.checkpoint) args.config = os.path.join(dirname, "config.yml") with open(args.config) as f: config = yaml.load(f, Loader=yaml.Loader) config.update(vars(args)) # check arguments if (args.scp is not None and args.dumpdir is not None) or \ (args.scp is None and args.dumpdir is None): raise ValueError("Please specify either dumpdir or scp.") # get dataset if args.scp is None: if config["format"] == "hdf5": mel_query = "*.h5" mel_load_fn = lambda x: read_hdf5(x, "feats") # NOQA elif config["format"] == "npy": mel_query = "*-feats.npy" mel_load_fn = np.load else: raise ValueError("support only hdf5 or npy format.") dataset = MelDataset( args.dumpdir, mel_query=mel_query, mel_load_fn=mel_load_fn, return_filename=True) logging.info(f"the number of features to be decoded = {len(dataset)}.") else: dataset = kaldiio.ReadHelper(f"scp:{args.scp}") logging.info(f"the feature loaded from {args.scp}.") # setup if torch.cuda.is_available(): device = torch.device("cuda") else: device = torch.device("cpu") model = ParallelWaveGANGenerator(**config["generator_params"]) model.load_state_dict(torch.load(args.checkpoint, map_location="cpu")["model"]["generator"]) model.remove_weight_norm() model = model.eval().to(device) logging.info(f"loaded model parameters from {args.checkpoint}.") # start generation pad_size = (config["generator_params"]["aux_context_window"], config["generator_params"]["aux_context_window"]) total_rtf = 0.0 with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar: for idx, (feat_path, c) in enumerate(pbar, 1): # generate each utterance z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device) c = np.pad(c, (pad_size, (0, 0)), "edge") c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device) start = time.time() y = model(z, c).view(-1).cpu().numpy() rtf = (time.time() - start) / (len(y) / config["sampling_rate"]) pbar.set_postfix({"RTF": rtf}) total_rtf += rtf # save as PCM 16 bit wav file utt_id = os.path.splitext(os.path.basename(feat_path))[0] sf.write(os.path.join(config["outdir"], f"{utt_id}_gen.wav"), y, config["sampling_rate"], "PCM_16") # report average RTF logging.info(f"finished generation of {idx} 
utterances (RTF = {total_rtf / idx:.03f}).")
def main(cmd_args):
    parser = get_parser()
    args, _ = parser.parse_known_args(cmd_args)

    # logging info
    logfmt = '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s'
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
        logging.warning('Skip DEBUG/INFO messages')

    # display PYTHONPATH
    logging.info('python path = ' + os.environ.get('PYTHONPATH', '(None)'))

    # set random seed
    logging.info('random seed = %d' % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    set_deterministic_pytorch(args)

    logging.info("total number of speakers is %d" % args.nClasses)
    spk_model = SpeakerNet(nClasses=args.nClasses,
                           nPerSpeaker=args.nPerSpeaker,
                           trainfunc=args.trainfunc,
                           nOut=512)
    if args.spk_model is not None:
        spk_model.loadParameters(args.spk_model)
    spk_model.eval()

    mean = np.array([[
        -1.7101e+08, -1.727767e+08, -1.654258e+08, -1.568423e+08,
        -1.47768e+08, -1.355978e+08, -1.337955e+08, -1.290715e+08,
        -1.292888e+08, -1.333105e+08, -1.380836e+08, -1.388845e+08,
        -1.445241e+08, -1.438754e+08, -1.428372e+08, -1.428697e+08,
        -1.417773e+08, -1.400568e+08, -1.448087e+08, -1.459874e+08,
        -1.47229e+08, -1.490556e+08, -1.499799e+08, -1.522063e+08,
        -1.590756e+08, -1.618226e+08, -1.651485e+08, -1.684847e+08,
        -1.692581e+08, -1.714363e+08, -1.763494e+08, -1.776152e+08,
        -1.789162e+08, -1.805202e+08, -1.798933e+08, -1.818852e+08,
        -1.852947e+08, -1.860893e+08, -1.873477e+08, -1.889484e+08,
        -1.873008e+08, -1.891793e+08, -1.917609e+08, -1.932594e+08,
        -1.934982e+08, -1.90069e+08, -1.967007e+08, -1.955583e+08,
        -1.932292e+08, -2.001965e+08, -1.926799e+08, -2.013976e+08,
        -1.932717e+08, -1.997551e+08, -1.955731e+08, -1.958617e+08,
        -1.967825e+08, -1.952326e+08, -1.931164e+08, -1.947601e+08,
        -1.94064e+08, -1.937533e+08, -1.93948e+08, -1.940927e+08,
        -1.945755e+08, -1.955468e+08, -1.96344e+08, -1.963595e+08,
        -1.971519e+08, -1.991344e+08, -1.989762e+08, -2.000582e+08,
        -2.019397e+08, -2.019519e+08, -2.024301e+08, -2.031892e+08,
        -2.029932e+08, -2.029679e+08, -2.033156e+08, -2.033823e+08,
        -2.03208e+08, -2.036384e+08, -2.03879e+08, -2.04647e+08,
        -2.06028e+08, -2.060116e+08, -2.070609e+08, -2.071168e+08,
        -2.083309e+08, -2.092469e+08, -2.103796e+08, -2.122868e+08,
        -2.135678e+08, -2.144521e+08, -2.158103e+08, -2.171439e+08,
        -2.176665e+08, -2.191257e+08, -2.193856e+08, -2.21079e+08,
        -2.226874e+08, -2.247855e+08, -2.267768e+08, -2.286809e+08,
        -2.311216e+08, -2.33142e+08, -2.352095e+08, -2.373178e+08,
        -2.393992e+08, -2.415607e+08, -2.436022e+08, -2.450806e+08,
        -2.462217e+08, -2.47608e+08, -2.483978e+08, -2.495429e+08,
        -2.495807e+08, -2.501201e+08, -2.504308e+08, -2.506836e+08,
        -2.518955e+08, -2.528667e+08, -2.538843e+08, -2.553601e+08,
        -2.571577e+08, -2.592016e+08, -2.737314e+08, -3.25694e+08
    ]])
    var = np.array([[
        3.875797e+08, 3.972777e+08, 3.76892e+08, 3.590407e+08,
        3.36797e+08, 2.982351e+08, 2.993923e+08, 2.900205e+08,
        2.903182e+08, 3.00258e+08, 3.139445e+08, 3.133095e+08,
        3.316776e+08, 3.290742e+08, 3.259625e+08, 3.292938e+08,
        3.253266e+08, 3.20113e+08, 3.353506e+08, 3.40549e+08,
        3.424283e+08, 3.454718e+08, 3.482779e+08, 3.577333e+08,
        3.827005e+08, 3.899876e+08, 4.01662e+08, 4.141465e+08,
        4.154033e+08, 4.238292e+08, 4.437099e+08, 4.463138e+08,
        4.495017e+08, 4.545714e+08, 4.517053e+08, 4.601415e+08,
        4.730579e+08, 4.755685e+08, 4.813327e+08, 4.884872e+08,
        4.809006e+08, 4.883675e+08, 5.00223e+08, 5.064776e+08,
        5.080264e+08, 4.91717e+08, 5.215152e+08, 5.169479e+08,
        5.060737e+08, 5.381505e+08, 5.023963e+08, 5.430141e+08,
        5.040811e+08, 5.339064e+08, 5.142676e+08, 5.158492e+08,
        5.202875e+08, 5.131353e+08, 5.043084e+08, 5.129934e+08,
        5.087678e+08, 5.064136e+08, 5.083315e+08, 5.083852e+08,
        5.09834e+08, 5.150194e+08, 5.177091e+08, 5.167306e+08,
        5.197394e+08, 5.282414e+08, 5.270312e+08, 5.324564e+08,
        5.408028e+08, 5.407178e+08, 5.426285e+08, 5.456758e+08,
        5.454526e+08, 5.462478e+08, 5.481372e+08, 5.508704e+08,
        5.496423e+08, 5.518889e+08, 5.532486e+08, 5.56079e+08,
        5.627578e+08, 5.617894e+08, 5.666932e+08, 5.67652e+08,
        5.73079e+08, 5.768822e+08, 5.817027e+08, 5.912957e+08,
        5.977753e+08, 6.0268e+08, 6.094717e+08, 6.166043e+08,
        6.196362e+08, 6.269311e+08, 6.276106e+08, 6.369116e+08,
        6.44361e+08, 6.551513e+08, 6.656342e+08, 6.762929e+08,
        6.899264e+08, 7.008929e+08, 7.117181e+08, 7.238042e+08,
        7.350025e+08, 7.47482e+08, 7.59422e+08, 7.681328e+08,
        7.75756e+08, 7.834833e+08, 7.868992e+08, 7.938968e+08,
        7.929719e+08, 7.966068e+08, 7.983973e+08, 7.993377e+08,
        8.061261e+08, 8.111478e+08, 8.169364e+08, 8.25449e+08,
        8.366562e+08, 8.486715e+08, 9.377093e+08, 1.289456e+09
    ]])
    num_sum = 8.478675e+07

    with kaldiio.ReadHelper("scp:%s" % args.read_file) as reader, \
            kaldiio.WriteHelper('ark,scp:%s.ark,%s.scp' %
                                (args.write_file, args.write_file)) as writer:
        for key, numpy_array in reader:
            with torch.no_grad():
                length = len(numpy_array)
                numpy_array = numpy_array[20:-20]
                # numpy_array = numpy_array[20:]
                # numpy_array = numpy_array[:-20]
                # numpy_array = numpy_array - mean/num_sum
                # numpy_array = numpy_array / (var/num_sum - (mean/num_sum)**2)
                torch_array = torch.from_numpy(numpy_array).unsqueeze(0).float()
                logging.info(torch_array.size())
                writer[key] = spk_model(torch_array).squeeze(0).numpy()