def __init__(self, pase_cfg, pase_ckpt, pase_ft, num_inputs, pase_feats,
             save_path, global_mode=False, stft_cfg=None, stft_ckpt=None,
             name='PASEInjector'):
    """Wrap a PASE front-end, optionally with an STFT regression head.

    pase_cfg / pase_ckpt configure and (optionally) restore the PASE
    encoder; pase_ft toggles fine-tuning (train vs eval mode). When
    stft_cfg is given, a DCRegression net is built that reuses the same
    PASE config/checkpoint as its own front-end, optionally restored
    from stft_ckpt.
    """
    super().__init__(name=name)
    # Build the waveform encoder and restore pretrained weights if any.
    self.pase = wf_builder(pase_cfg)
    if pase_ckpt is not None:
        self.pase.load_pretrained(pase_ckpt, load_last=True, verbose=True)
    # if num_inputs != pase_feats:
    #     # make a projector
    #     self.pase_W = nn.Conv1d(num_inputs, pase_feats, 1)
    self.global_mode = global_mode
    #self.saver = Saver(self, save_path, prefix='PASE')
    # Fine-tune the encoder only when requested; freeze it otherwise.
    self.pase.train() if pase_ft else self.pase.eval()
    if stft_cfg is not None:
        # The STFT head shares the PASE front-end definition and weights.
        stft_cfg['frontend_cfg'] = pase_cfg
        stft_cfg['frontend_ckpt'] = pase_ckpt
        self.stft_net = DCRegression(**stft_cfg)
        if stft_ckpt is not None:
            self.stft_net.load_pretrained(stft_ckpt, load_last=True,
                                          verbose=True)
def __init__(self, frontend_cfg, num_outputs, frontend_ckpt=None,
             ft_fe=False, rnn_size=512, rnn_layers=3, rnn_type='lstm',
             cuda=False, name='DCRegression'):
    """Regression head on a PASE front-end: RNN plus a 1x1-conv backend,
    with a skip-connection adapter mapping front-end features to the
    bidirectional RNN output width (2 * rnn_size)."""
    super().__init__(name=name)
    self.frontend = wf_builder(frontend_cfg)
    if frontend_ckpt is not None:
        self.frontend.load_pretrained(frontend_ckpt, load_last=True,
                                      verbose=True)
    self.ft_fe = ft_fe  # whether the front-end is fine-tuned
    emb_dim = self.frontend.emb_dim
    rnn_width = 2 * rnn_size  # width after bidirectional concat
    #self.rnn = nn.LSTM(emb_dim, rnn_size, rnn_layers,
    #                   batch_first=True, bidirectional=True)
    self.rnn = build_rnn_block(emb_dim, rnn_size, rnn_layers, rnn_type,
                               use_cuda=cuda)
    # Build skip connection adapter
    self.W = nn.Conv1d(emb_dim, rnn_width, 1)
    self.backend = nn.Sequential(
        nn.Conv1d(rnn_width, rnn_width, 1),
        nn.ReLU(inplace=True),
        nn.Conv1d(rnn_width, num_outputs, 1),
    )
def __init__(self, pase_cfg, pase_cp=None, n_z=256, proj_size=0,
             ncoef=100, sm_type='none'):
    """Three-layer MLP head over a PASE encoder, producing n_z-dim
    embeddings, with an optional (AM-)softmax projection for training."""
    super(global_MLP, self).__init__()
    self.encoder = wf_builder(pase_cfg)
    if pase_cp:
        self.encoder.load_pretrained(pase_cp, load_last=True, verbose=False)
    hidden = 512
    mlp_layers = [nn.Linear(ncoef, hidden),
                  nn.BatchNorm1d(hidden),
                  nn.ReLU(inplace=True),
                  nn.Linear(hidden, hidden),
                  nn.BatchNorm1d(hidden),
                  nn.ReLU(inplace=True),
                  nn.Linear(hidden, n_z)]
    self.model = nn.Sequential(*mlp_layers)
    # Optional classification projection head.
    if proj_size > 0 and sm_type != 'none':
        if sm_type == 'softmax':
            self.out_proj = Softmax(input_features=n_z,
                                    output_features=proj_size)
        elif sm_type == 'am_softmax':
            self.out_proj = AMSoftmax(input_features=n_z,
                                      output_features=proj_size)
        else:
            raise NotImplementedError
def __init__(self, PASE_cfg, MLP_cfg, PASE_ckpt, context_left=0, context_right=0):
    """PASE encoder followed by an MLP decoder over context-stacked frames."""
    super().__init__()
    self.context_left = context_left
    self.context_right = context_right
    # Each PASE frame carries 4 * rnn_dim features; the decoder consumes
    # the current frame plus left/right context frames concatenated.
    n_frames = 1 + context_left + context_right
    input_dim = 4 * PASE_cfg['rnn_dim'] * n_frames
    self.pase = wf_builder(PASE_cfg)
    self.pase.load_pretrained(PASE_ckpt, load_last=True, verbose=False)
    self.decoder = MLP(MLP_cfg, input_dim)
def load_pase_plus(PASE_FOLDER, parameters='trained_model/PASE+_parameters.ckpt'):
    """Import the PASE package from PASE_FOLDER, build the PASE+ front-end,
    restore the given pretrained checkpoint and move the model to CUDA0."""
    sys.path.append(PASE_FOLDER)
    from pase.models.frontend import wf_builder
    frontend = wf_builder(join(PASE_FOLDER, 'cfg/frontend/PASE+.cfg'))
    frontend.eval()
    frontend.load_pretrained(parameters, load_last=True, verbose=True)
    return frontend.to(CUDA0)
def __init__(self, PASE_cfg, LSTM_cfg, PASE_ckpt):
    """PASE encoder followed by an (LSTM, Linear) decoder stack."""
    super().__init__()
    self.pase = wf_builder(PASE_cfg)
    self.pase.load_pretrained(PASE_ckpt, load_last=True, verbose=False)
    # PASE emits 4 * rnn_dim features per frame.
    feat_dim = 4 * PASE_cfg['rnn_dim']
    decoder_stack = [LSTM_cudnn(LSTM_cfg, feat_dim),
                     nn.Linear(136, 136)]
    self.decoder = nn.ModuleList(decoder_stack)
def __init__(self, ckpt, config, **kwargs):
    """Disabled PASE upstream wrapper.

    Builds and restores the PASE front-end, then deliberately raises:
    the upstream PASE repository has unresolved import problems (see the
    linked GitHub issue), so this expert cannot be used yet.
    """
    super(UpstreamExpert, self).__init__()
    self.pase = wf_builder(config)
    self.pase.load_pretrained(ckpt, load_last=True, verbose=False)
    # A dummy forward would show the output shape:
    # pseudo_input = torch.randn(1, 1, SAMPLE_RATE * EXAMPLE_SEC)
    # r = self.pase(pseudo_input)
    # size will be (1, 256, 625), which are 625 frames of 256 dims each
    self.output_dim = 256  # r.size(1)
    raise RuntimeError('There are some import errors with the PASE repo, see this issue: https://github.com/santi-pdp/pase/issues/114.')
def __init__(self, ckpt, model_config, **kwargs):
    """PASE upstream wrapper; probes the feature dimensionality with a
    dummy forward pass on the GPU."""
    super(UpstreamExpert, self).__init__()
    self.pase = wf_builder(model_config)
    self.pase.load_pretrained(ckpt, load_last=True, verbose=False)
    # Pase can not easily switch between cpu/gpu for now
    self.pase.cuda()
    probe = torch.randn(1, 1, SAMPLE_RATE * EXAMPLE_SEC).cuda()
    self.output_dim = self.pase(probe).size(1)
def retrieve_model_and_datasets(encoder_cfg, model_cfg, data_cfg,
                                train_list, valid_list, test_list):
    """Build a model (optionally with a PASE or TDNN encoder front-end)
    and its train/valid/test datasets from JSON config file paths.

    Each *_cfg argument is a path to a JSON file; a split list of None
    yields None in the returned datasets list. Returns (model, [train,
    valid, test] datasets).
    """
    with open(model_cfg, 'r') as cfg_f:
        model_cfg = json.load(cfg_f)
    if encoder_cfg is not None:
        with open(encoder_cfg, 'r') as cfg_f:
            encoder_cfg = json.load(cfg_f)
    with open(data_cfg, 'r') as cfg_f:
        data_cfg = json.load(cfg_f)
    # prepare the three datasets; train, valid and test
    splits = [train_list, valid_list, test_list]
    # 'name' keys select the model/dataset classes and are popped so the
    # remaining dicts can be splatted as constructor kwargs.
    cls_name = model_cfg.pop('name')
    dset_name = data_cfg.pop('name')
    if 'chunk_cfg' in data_cfg:
        chunker = SingleChunkWav(**data_cfg.pop('chunk_cfg'))
        data_cfg['chunker'] = chunker
    if encoder_cfg is not None:
        name = encoder_cfg.pop('name')
        if name == 'pase' or name == 'PASE':
            if 'ckpt' in encoder_cfg:
                ckpt = encoder_cfg.pop('ckpt')
            else:
                ckpt = None
            encoder = wf_builder(encoder_cfg)
            if ckpt is not None:
                encoder.load_pretrained(ckpt, load_last=True, verbose=True)
            model_cfg['frontend'] = encoder
        elif name == 'tdnn' or name == 'TDNN':
            model_cfg['xvector'] = True
            encoder = TDNN(**encoder_cfg)
            model_cfg['frontend'] = encoder
        else:
            raise ValueError('Unrecognized encoder: ', name)
    model = getattr(pmods, cls_name)(**model_cfg)
    datasets = []
    for si, split in enumerate(splits, start=1):
        if split is None:
            # skip this split (validation for instance)
            datasets.append(None)
        else:
            data_cfg['split_list'] = split
            # NOTE(review): with start=1, `si >= len(splits) - 1` is true
            # for the VALID split (si == 2) as well as test, so the
            # chunker is dropped from valid onwards — the comment below
            # says "test split" only. Confirm which behavior is intended.
            if si >= len(splits) - 1 and 'chunker' in data_cfg:
                # remove the chunker for test split
                del data_cfg['chunker']
            datasets.append(getattr(pdsets, dset_name)(**data_cfg))
    return model, datasets
def __init__(self, pase_cfg, pase_cp=None, n_z=256, layers=(2, 2, 2, 2),
             block=PreActBlock, proj_size=0, ncoef=23, sm_type='none'):
    """ResNet-18-style speaker embedder over PASE features.

    pase_cfg / pase_cp: config and optional pretrained checkpoint for the
    PASE encoder. layers gives the block count per residual stage.
    proj_size/sm_type optionally add a (AM-)softmax projection head.
    Raises NotImplementedError for an unknown sm_type.

    Fix: the `layers` default was a mutable list ([2, 2, 2, 2]); it is
    only indexed here, so a tuple is a safe, backward-compatible default.
    """
    self.in_planes = 16
    super(ResNet_18, self).__init__()
    self.model = nn.ModuleList()
    # Stem: collapse the 2*ncoef feature axis with a tall convolution.
    self.model.append(
        nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=(2 * ncoef, 3), stride=(1, 1),
                      padding=(0, 1), bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU()))
    # Four residual stages with doubling widths (first stage keeps stride 1).
    for planes, n_blocks, stride in zip((64, 128, 256, 512), layers,
                                        (1, 2, 2, 2)):
        self.model.append(self._make_layer(block, planes, n_blocks,
                                           stride=stride))
    self.initialize_params()
    self.pooling = SelfAttention(block.expansion * 512)
    # *2 on the input width: pooling concatenates two statistics.
    self.post_pooling = nn.Sequential(
        nn.Conv1d(block.expansion * 512 * 2, 512, 1),
        nn.BatchNorm1d(512),
        nn.ReLU(inplace=True),
        nn.Conv1d(512, 512, 1),
        nn.BatchNorm1d(512),
        nn.ReLU(inplace=True),
        nn.Conv1d(512, n_z, 1))
    if proj_size > 0 and sm_type != 'none':
        if sm_type == 'softmax':
            self.out_proj = Softmax(input_features=n_z,
                                    output_features=proj_size)
        elif sm_type == 'am_softmax':
            self.out_proj = AMSoftmax(input_features=n_z,
                                      output_features=proj_size)
        else:
            raise NotImplementedError
    ## Load after initialize main model params
    self.encoder = wf_builder(pase_cfg)
    if pase_cp:
        self.encoder.load_pretrained(pase_cp, load_last=True, verbose=False)
def __init__(self, pase_cfg, pase_cp=None, n_layers=4, n_z=256,
             proj_size=0, ncoef=23, sm_type='none'):
    """Stacked BiLSTM speaker embedder over PASE features with
    statistical pooling and an optional (AM-)softmax projection."""
    super(pyr_rnn, self).__init__()
    # First layer consumes 2*ncoef features; deeper layers consume the
    # concatenation of two bidirectional outputs (256 * 2 * 2).
    lstm_stack = [nn.LSTM(2 * ncoef, 256, 1, bidirectional=True,
                          batch_first=True)]
    lstm_stack += [nn.LSTM(256 * 2 * 2, 256, 1, bidirectional=True,
                           batch_first=True)
                   for _ in range(1, n_layers)]
    self.model = nn.ModuleList(lstm_stack)
    self.pooling = StatisticalPooling()
    self.post_pooling = nn.Sequential(
        nn.Conv1d(256 * 2 * 2 * 2, 512, 1),
        nn.BatchNorm1d(512),
        nn.ReLU(inplace=True),
        nn.Conv1d(512, 512, 1),
        nn.BatchNorm1d(512),
        nn.ReLU(inplace=True),
        nn.Conv1d(512, n_z, 1))
    self.initialize_params()
    self.attention = SelfAttention(512)
    if proj_size > 0 and sm_type != 'none':
        if sm_type == 'softmax':
            self.out_proj = Softmax(input_features=n_z,
                                    output_features=proj_size)
        elif sm_type == 'am_softmax':
            self.out_proj = AMSoftmax(input_features=n_z,
                                      output_features=proj_size)
        else:
            raise NotImplementedError
    self.encoder = wf_builder(pase_cfg)
    if pase_cp:
        self.encoder.load_pretrained(pase_cp, load_last=True, verbose=False)
def __init__(self, pase_cfg, pase_cp=None, n_z=256, proj_size=0,
             ncoef=100, sm_type='none'):
    """x-vector style TDNN over PASE features: dilated conv stack,
    statistical pooling, and an n_z-dim embedding head with optional
    (AM-)softmax projection."""
    super(TDNN, self).__init__()
    self.encoder = wf_builder(pase_cfg)
    if pase_cp:
        self.encoder.load_pretrained(pase_cp, load_last=True, verbose=False)

    def _cbr(cin, cout, k, dilation=1, padding=0):
        # conv -> batchnorm -> relu building block
        return [nn.Conv1d(cin, cout, k, dilation=dilation, padding=padding),
                nn.BatchNorm1d(cout),
                nn.ReLU(inplace=True)]

    self.model = nn.Sequential(
        nn.BatchNorm1d(2 * ncoef),
        *_cbr(2 * ncoef, 512, 5, padding=2),
        *_cbr(512, 512, 3, dilation=2, padding=2),
        *_cbr(512, 512, 3, dilation=3, padding=3),
        *_cbr(512, 512, 1),
        *_cbr(512, 1500, 1))
    self.pooling = StatisticalPooling()
    # 3000 = 1500 means + 1500 stds from statistical pooling.
    self.post_pooling = nn.Sequential(
        nn.Conv1d(3000, 512, 1),
        nn.BatchNorm1d(512),
        nn.ReLU(inplace=True),
        nn.Conv1d(512, 512, 1),
        nn.BatchNorm1d(512),
        nn.ReLU(inplace=True),
        nn.Conv1d(512, n_z, 1))
    if proj_size > 0 and sm_type != 'none':
        if sm_type == 'softmax':
            self.out_proj = Softmax(input_features=n_z,
                                    output_features=proj_size)
        elif sm_type == 'am_softmax':
            self.out_proj = AMSoftmax(input_features=n_z,
                                      output_features=proj_size)
        else:
            raise NotImplementedError
def __init__(self, res_ckpt_path, pase_cfg_path, pase_ckpt_path):
    """Audio-visual matching net: pretrained SVHFNet visual stream plus a
    PASE-based audio stream, fused through three FC layers (output dim 2)."""
    super().__init__()
    # Restore the visual stream from a pretrained SVHFNet checkpoint.
    svhf = model.model3.SVHFNet()
    map_location = None if torch.cuda.is_available() else 'cpu'
    ckpt = torch.load(res_ckpt_path, map_location=map_location)
    svhf.load_state_dict(ckpt['net'])
    self.vis_stream = svhf.vis_stream
    # Audio stream wraps a pretrained PASE front-end.
    pase = wf_builder(pase_cfg_path)
    pase.load_pretrained(pase_ckpt_path, load_last=True, verbose=True)
    self.aud_stream = AudioStream(pase)
    # Fusion head: 3072 -> 1024 -> 512 -> 2 with BN + ReLU in between.
    self.fc8 = nn.Linear(3072, 1024)
    self.bn8 = nn.BatchNorm1d(1024)
    self.relu8 = nn.ReLU()
    self.fc9 = nn.Linear(1024, 512)
    self.bn9 = nn.BatchNorm1d(512)
    self.relu9 = nn.ReLU()
    self.fc10 = nn.Linear(512, 2)
def main(opts): device = 'cuda' if torch.cuda.is_available() else 'cpu' # build network minions_cfg = pase_parser(opts.minions_cfg, do_losses=False) remove_Dcfg(minions_cfg) pase = wf_builder(opts.cfg) model = Waveminionet(minions_cfg=minions_cfg, num_devices=0, pretrained_ckpts=opts.ckpt, z_minion=False, frontend=pase) model.eval() model.to(device) transf = Reverb(['data/omologo_revs/IRs_2/IR_223108.imp'], ir_fmt='imp') minion = model.minions[0] minion.loss = None pase = model.frontend #print(opts.in_files) in_files = [os.path.join(opts.files_root, inf) for inf in opts.in_files] wavs = [] wfiles = [] max_len = 0 print('Total batches: ', len(in_files) // opts.batch_size) with torch.no_grad(): for wi, wfile in tqdm.tqdm(enumerate(in_files, start=1), total=len(in_files)): wfiles.append(wfile) wav, rate = sf.read(wfile) wavs.append(wav) if len(wav) > max_len: max_len = len(wav) if wi % opts.batch_size == 0 or wi >= len(in_files): lens = [] batch = [] for bi in range(len(wavs)): P_ = max_len - len(wavs[bi]) lens.append(len(wavs[bi])) if P_ > 0: pad = np.zeros((P_)) wav_ = np.concatenate((wavs[bi], pad), axis=0) else: wav_ = wavs[bi] wav = torch.FloatTensor(wav_) wav_r = transf({'chunk': wav}) batch.append(wav_r['chunk'].view(1, 1, -1)) batch = torch.cat(batch, dim=0) x = batch.to(device) h = pase(x) #print('frontend size: ', h.size()) y = minion(h).cpu() for bi in range(len(wavs)): bname = os.path.basename(wfiles[bi]) y_ = y[bi].squeeze().data.numpy() y_ = y_[:lens[bi]] sf.write(os.path.join(opts.out_path, '{}'.format(bname)), y_, 16000) x_ = x[bi].squeeze().data.numpy() x_ = x_[:lens[bi]] sf.write( os.path.join(opts.out_path, 'input_{}'.format(bname)), x_, 16000) max_len = 0 wavs = [] wfiles = [] batch = None """
# DNN hyper-parameter strings (one comma-separated entry per layer);
# presumably parsed downstream by the MLP constructor — confirm against caller.
options['dnn_use_batchnorm']='True,True,True,True,True,False'
options['dnn_use_laynorm']='False,False,False,False,False,False'
options['dnn_use_laynorm_inp']='False'
options['dnn_use_batchnorm_inp']='False'
options['dnn_act']='linear,relu,relu,relu,relu,softmax'
device=get_freer_gpu()
# folder creation
text_file=open(output_file, "w")
# Loading pase
pase =wf_builder(pase_cfg)
pase.load_pretrained(pase_model, load_last=True, verbose=False)
pase.to(device)
pase.eval()
# reading the training signals
print("Waveform reading...")
fea={}
for wav_file in tr_lst:
    [signal, fs] = sf.read(data_folder+'/'+wav_file)
    # Peak-normalize and cast to float32 before tensorizing.
    signal=signal/np.max(np.abs(signal))
    signal = signal.astype(np.float32)
    # Feature key: "<parent_dir>_<basename-without-extension>".
    fea_id=wav_file.split('/')[-2]+'_'+wav_file.split('/')[-1].split('.')[0]
    # Shape (1, 1, samples): batch and channel dims expected by PASE.
    fea[fea_id]=torch.from_numpy(signal).float().to(device).view(1,1,-1)
def main(opts):
    """Train and/or test a speaker-ID model on top of a PASE front-end.

    TRAIN mode: builds train/valid (and optional test) LibriSpkID loaders,
    optionally restores pretrained front-end weights, then runs epochs with
    scheduler stepping and best-validation-accuracy checkpointing.
    TEST mode: restores opts.test_ckpt and evaluates file-by-file
    (batch_size=1), optionally logging per-file accuracy to a TSV file.
    """
    CUDA = torch.cuda.is_available() and not opts.no_cuda
    device = 'cuda' if CUDA else 'cpu'
    # Seed every RNG for reproducibility.
    torch.manual_seed(opts.seed)
    random.seed(opts.seed)
    np.random.seed(opts.seed)
    if device == 'cuda':
        torch.cuda.manual_seed_all(opts.seed)
    spk2idx = load_spk2idx(opts.spk2idx)
    NSPK = len(set(spk2idx.values()))  # number of distinct speakers
    # Build Model
    fe = wf_builder(opts.fe_cfg)
    if opts.train:
        print('=' * 20)
        print('Entering TRAIN mode')
        print('=' * 20)
        # Persist the run options for later inspection.
        with open(os.path.join(opts.save_path, 'train.opts'), 'w') as cfg_f:
            cfg_f.write(json.dumps(vars(opts), indent=2))
        # Open up guia and split valid
        with open(opts.train_guia) as tr_guia_f:
            tr_files = [l.rstrip() for l in tr_guia_f]
        if opts.test_guia is not None:
            with open(opts.test_guia) as te_guia_f:
                te_files = [l.rstrip() for l in te_guia_f]
        tr_files_, va_files = build_valid_list(tr_files, spk2idx,
                                               va_split=opts.va_split)
        # compute total samples dur
        beg_t = timeit.default_timer()
        tr_durs, sr = compute_utterances_durs(tr_files_, opts.data_root)
        va_durs, _ = compute_utterances_durs(va_files, opts.data_root)
        train_dur = np.sum(tr_durs)
        valid_dur = np.sum(va_durs)
        end_t = timeit.default_timer()
        print('Read tr/va {:.1f} s/{:.1f} s in {} s'.format(
            train_dur / sr, valid_dur / sr, end_t - beg_t))
        # Build Datasets
        dset = LibriSpkIDDataset(opts.data_root, tr_files_, spk2idx)
        va_dset = LibriSpkIDDataset(opts.data_root, va_files, spk2idx)
        cc = WavCollater(max_len=opts.max_len)
        #cc_vate = WavCollater(max_len=None)
        cc_vate = cc
        dloader = DataLoader(dset, batch_size=opts.batch_size,
                             collate_fn=cc, shuffle=True)
        va_dloader = DataLoader(va_dset, batch_size=opts.batch_size,
                                collate_fn=cc_vate, shuffle=False)
        # Batches per epoch, estimated from total duration / chunk length.
        tr_bpe = (train_dur // opts.max_len) // opts.batch_size
        va_bpe = (valid_dur // opts.max_len) // opts.batch_size
        if opts.test_guia is not None:
            te_dset = LibriSpkIDDataset(opts.data_root, te_files, spk2idx)
            te_dloader = DataLoader(te_dset, batch_size=opts.batch_size,
                                    collate_fn=cc_vate, shuffle=False)
        if opts.fe_ckpt is not None:
            fe.load_pretrained(opts.fe_ckpt, load_last=True, verbose=True)
        else:
            print('*' * 50)
            print('** WARNING: TRAINING WITHOUT PRETRAIED WEIGHTS FOR THE '
                  'FRONT-END **')
            print('*' * 50)
            # Enforce training the frontend
            opts.ft_fe = True
        model = select_model(opts, fe, NSPK)
        model.to(device)
        print(model)
        # Build optimizer and scheduler
        opt = select_optimizer(opts, model)
        sched = select_scheduler(opts, opt)
        # Make writer
        writer = SummaryWriter(opts.save_path)
        best_val_acc = 0
        # flag for saver
        best_val = False
        for epoch in range(1, opts.epoch + 1):
            train_epoch(dloader, model, opt, epoch, opts.log_freq,
                        writer=writer, device=device, bpe=tr_bpe)
            eloss, eacc = eval_epoch(va_dloader, model, epoch, opts.log_freq,
                                     writer=writer, device=device,
                                     bpe=va_bpe, key='valid')
            # 'step' mode steps blindly; otherwise step on the val metric.
            if opts.sched_mode == 'step':
                sched.step()
            else:
                sched.step(eacc)
            if eacc > best_val_acc:
                print('*' * 40)
                print('New best val acc: {:.3f} => {:.3f}.'
                      ''.format(best_val_acc, eacc))
                print('*' * 40)
                best_val_acc = eacc
                best_val = True
            model.save(opts.save_path, epoch - 1, best_val=best_val)
            best_val = False
            if opts.test_guia is not None:
                # Eval test on the fly whilst training/validating
                teloss, teacc = eval_epoch(te_dloader, model, epoch,
                                           opts.log_freq, writer=writer,
                                           device=device, key='test')
    if opts.test:
        print('=' * 20)
        print('Entering TEST mode')
        print('=' * 20)
        #fe = WaveFe(rnn_pool=opts.rnn_pool, emb_dim=opts.emb_dim)
        model = select_model(opts, fe, NSPK)
        model.load_pretrained(opts.test_ckpt, load_last=True, verbose=True)
        model.to(device)
        model.eval()
        with open(opts.test_guia) as te_guia_f:
            te_files = [l.rstrip() for l in te_guia_f]
        te_dset = LibriSpkIDDataset(opts.data_root, te_files, spk2idx)
        cc = WavCollater(max_len=None)
        # Full utterances, one file per batch.
        te_dloader = DataLoader(te_dset, batch_size=1,
                                #collate_fn=cc,
                                shuffle=False)

        def filter_by_slens(T, slens, sfactor=160):
            """Slice each batch item of T to its true length; slens holds
            sample counts and sfactor the samples-per-frame decimation."""
            dims = len(T.size())
            # extract each sequence by its length
            seqs = []
            for bi in range(T.size(0)):
                slen = int(np.ceil(slens[bi] / sfactor))
                if dims == 3:
                    seqs.append(T[bi, :, :slen])
                else:
                    seqs.append(T[bi, :slen])
            return seqs

        with torch.no_grad():
            teloss = []
            teacc = []
            timings = []
            beg_t = timeit.default_timer()
            if opts.test_log_file is not None:
                test_log_f = open(opts.test_log_file, 'w')
                test_log_f.write('Filename\tAccuracy [%]\tError [%]\n')
            else:
                test_log_f = None
            for bidx, batch in enumerate(te_dloader, start=1):
                #X, Y, slen = batch
                X, Y = batch
                X = X.unsqueeze(1)
                X = X.to(device)
                Y = Y.to(device)
                Y_ = model(X)
                # Replicate the utterance label over every output frame.
                Y = Y.view(-1, 1).repeat(1, Y_.size(2))
                #Y__seqs = filter_by_slens(Y_, slen)
                #Y_seqs = filter_by_slens(Y, slen)
                #assert len(Y__seqs) == len(Y_seqs)
                #for sidx in range(len(Y__seqs)):
                #    y_ = Y__seqs[sidx].unsqueeze(0)
                #    y = Y_seqs[sidx].unsqueeze(0)
                #    loss = F.nll_loss(y_, y)
                #    teacc.append(accuracy(y_, y))
                #    teloss.append(loss)
                loss = F.nll_loss(Y_, Y)
                acc = accuracy(Y_, Y)
                if test_log_f:
                    test_log_f.write('{}\t{:.2f}\t{:.2f}\n'
                                     ''.format(te_files[bidx - 1],
                                               acc * 100,
                                               100 - (acc * 100)))
                teacc.append(accuracy(Y_, Y))
                teloss.append(loss.item())
                end_t = timeit.default_timer()
                timings.append(end_t - beg_t)
                beg_t = timeit.default_timer()
                if bidx % 100 == 0 or bidx == 1:
                    mteloss = np.mean(teloss)
                    mteacc = np.mean(teacc)
                    mtimings = np.mean(timings)
                    print('Processed test file {}/{} mfiletime: {:.2f} s, '
                          'macc: {:.4f}, mloss: {:.2f}'
                          ''.format(bidx, len(te_dloader), mtimings,
                                    mteacc, mteloss), end='\r')
            print()
            if test_log_f:
                test_log_f.write('-' * 30 + '\n')
                test_log_f.write('Test accuracy: '
                                 '{:.2f}\n'.format(np.mean(teacc) * 100))
                test_log_f.write('Test error: '
                                 '{:.2f}\n'.format(100 - (np.mean(teacc) *100)))
                test_log_f.write('Test loss: '
                                 '{:.2f}\n'.format(np.mean(teloss)))
                test_log_f.close()
            print('Test accuracy: {:.4f}'.format(np.mean(teacc)))
            print('Test loss: {:.2f}'.format(np.mean(teloss)))
def get_pase_representations(pase_model, audio_path):
    """Load a wav at its native sample rate and return the PASE features
    as a numpy array (uses the module-level `device`)."""
    y, fs = librosa.core.load(audio_path, sr=None)
    y = torch.tensor(y)[(None, ) * 2].to(
        device)  # unsqueeze twice at first dim
    pase_reps = pase_model(y)
    pase_reps = pase_reps.detach().cpu().numpy()
    return pase_reps


if __name__ == "__main__":
    # Usage: script.py <voxceleb_audio_root> <output_root>
    audio_path = sys.argv[1]
    save_path = sys.argv[2]
    # load model
    pase = wf_builder('cfg/frontend/PASE+.cfg').eval()
    pase = pase.to(device)
    pase.load_pretrained('checkpoints/pase_pretrained.ckpt',
                         load_last=True, verbose=True)
    # get list of speaker ids from VoxCeleb
    speaker_ids = os.listdir(audio_path)
    # get PASE representations for utterances from each speaker
    # (loop body continues beyond this excerpt)
    for speaker_id in tqdm(speaker_ids):
        os.makedirs(os.path.join(save_path, speaker_id), exist_ok=True)
        path_to_speaker = os.path.join(audio_path, speaker_id)
        video_ids = os.listdir(path_to_speaker)
        utt_idx = 1
def train(opts):
    """Self-supervised PASE training entry point.

    Builds the front-end and the Waveminionet worker heads from config,
    assembles train (and optional valid) DataLoaders with the configured
    transforms and distortions, estimates batches-per-epoch from total
    audio duration, and delegates the loop to model.train_().
    """
    CUDA = True if torch.cuda.is_available() and not opts.no_cuda else False
    device = 'cuda' if CUDA else 'cpu'
    num_devices = 1
    # Seed every RNG for reproducibility.
    np.random.seed(opts.seed)
    random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if CUDA:
        torch.cuda.manual_seed_all(opts.seed)
        num_devices = torch.cuda.device_count()
        print('[*] Using CUDA {} devices'.format(num_devices))
    else:
        print('[!] Using CPU')
    print('Seeds initialized to {}'.format(opts.seed))
    # ---------------------
    # Build Model
    frontend = wf_builder(opts.fe_cfg)
    minions_cfg = pase_parser(opts.net_cfg,
                              batch_acum=opts.batch_acum,
                              device=device,
                              frontend=frontend)
    model = Waveminionet(minions_cfg=minions_cfg,
                         adv_loss=opts.adv_loss,
                         num_devices=num_devices,
                         frontend=frontend)
    print(model)
    print('Frontend params: ', model.frontend.describe_params())
    model.to(device)
    trans = make_transforms(opts, minions_cfg)
    print(trans)
    # Optional waveform distortion pipeline from a JSON config.
    if opts.dtrans_cfg is not None:
        with open(opts.dtrans_cfg, 'r') as dtr_cfg:
            dtr = json.load(dtr_cfg)
            #dtr['trans_p'] = opts.distortion_p
            dist_trans = config_distortions(**dtr)
            print(dist_trans)
    else:
        dist_trans = None
    # Build Dataset(s) and DataLoader(s)
    dataset = getattr(pase.dataset, opts.dataset)
    dset = dataset(opts.data_root, opts.data_cfg, 'train',
                   transform=trans,
                   noise_folder=opts.noise_folder,
                   whisper_folder=opts.whisper_folder,
                   distortion_probability=opts.distortion_p,
                   distortion_transforms=dist_trans,
                   preload_wav=opts.preload_wav)
    dloader = DataLoader(dset, batch_size=opts.batch_size,
                         shuffle=True,
                         collate_fn=DictCollater(),
                         num_workers=opts.num_workers,
                         pin_memory=CUDA)
    # Compute estimation of bpe. As we sample chunks randomly, we
    # should say that an epoch happened after seeing at least as many
    # chunks as total_train_wav_dur // chunk_size
    bpe = (dset.total_wav_dur // opts.chunk_size) // opts.batch_size
    opts.bpe = bpe
    if opts.do_eval:
        va_dset = dataset(opts.data_root, opts.data_cfg, 'valid',
                          transform=trans,
                          noise_folder=opts.noise_folder,
                          whisper_folder=opts.whisper_folder,
                          distortion_probability=opts.distortion_p,
                          distortion_transforms=dist_trans,
                          preload_wav=opts.preload_wav)
        va_dloader = DataLoader(va_dset, batch_size=opts.batch_size,
                                shuffle=False,
                                collate_fn=DictCollater(),
                                num_workers=opts.num_workers,
                                pin_memory=CUDA)
        va_bpe = (va_dset.total_wav_dur // opts.chunk_size) // opts.batch_size
        opts.va_bpe = va_bpe
    else:
        va_dloader = None
    # fastet lr to MI
    #opts.min_lrs = {'mi':0.001}
    model.train_(dloader, vars(opts), device=device, va_dloader=va_dloader)
    '-e', default='.wav',
    help='file extension to search for in dataset folder')
parser.add_argument('--batch_size', type=int, default=32)
args = parser.parse_args()
extension = args.extension
path = args.path
wav_files = get_files(path, extension)
# Both a PASE config and checkpoint are mandatory for this script.
if hp.pase_cfg is None:
    raise ValueError
assert hp.pase_ckpt is not None
CUDA = torch.cuda.is_available() and hp.cuda
hp.device = 'cuda' if CUDA else 'cpu'
# Load pase model
pase = wf_builder(hp.pase_cfg)
pase.load_pretrained(hp.pase_ckpt, load_last=True, verbose=True)
pase.to(hp.device)
pase.eval()
paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
print(f'\n{len(wav_files)} {extension[1:]} files found in "{path}"\n')
if len(wav_files) == 0:
    print('Please point wav_path in hparams.py to your dataset,')
    print('or use the --path option.\n')
else:
    # (processing continues beyond this excerpt)
    if not hp.ignore_tts:
        text_dict = ljspeech(path)
dnn_act = cfg['dnn_act']
# Collect the DNN hyper-parameters read from cfg into one options dict;
# presumably consumed by the MLP constructor downstream — confirm.
options = {}
options['dnn_lay'] = dnn_lay
options['dnn_drop'] = dnn_drop
options['dnn_use_batchnorm'] = dnn_use_batchnorm
options['dnn_use_laynorm'] = dnn_use_laynorm
options['dnn_use_laynorm_inp'] = dnn_use_laynorm_inp
options['dnn_use_batchnorm_inp'] = dnn_use_batchnorm_inp
options['dnn_act'] = dnn_act
# folder creation
text_file = open(output_file, "w")
# Loading pase
pase = wf_builder(cfg_pase)
pase.load_pretrained(pase_ckpt, load_last=True, verbose=False)
pase.to(device)
pase.eval()
# reading the training signals
print("Waveform reading...")
# reading the dev signals
fea_dev = {}
for wav_file in dev_lst:
    [signal, fs] = sf.read(data_folder + '/' + wav_file)
    # Peak-normalize each waveform before tensorizing.
    signal = signal / np.max(np.abs(signal))
    # Feature key: "<parent_dir>_<basename-without-extension>".
    fea_id = wav_file.split('/')[-2] + '_' + wav_file.split('/')[-1].split(
        '.')[0]
    # (statement continues beyond this excerpt)
    fea_dev[fea_id] = torch.from_numpy(signal).float().to(device).view(
def cluster(opts):
    """Extract mean-pooled PASE embeddings for a set of random training
    chunks and fit a KMeans model over them, saved to save_path/kmeans.pkl.

    Fixes: the original called `next(dloader.__iter__())` on every batch,
    which builds a brand-new DataLoader iterator (respawning workers and
    re-shuffling) each time; one iterator is now created before the loop.
    Feature extraction is also wrapped in torch.no_grad() so no autograd
    graph is accumulated.
    """
    CUDA = True if torch.cuda.is_available() else False
    device = 'cuda' if CUDA else 'cpu'
    num_devices = 1
    # Seed every RNG for reproducibility.
    np.random.seed(opts.seed)
    random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if CUDA:
        torch.cuda.manual_seed_all(opts.seed)
        num_devices = torch.cuda.device_count()
        print('[*] Using CUDA {} devices'.format(num_devices))
    else:
        print('[!] Using CPU')
    fe = wf_builder(opts.fe_cfg)
    if opts.fe_ckpt is not None:
        fe.load_pretrained(opts.fe_ckpt, load_last=True, verbose=True)
    else:
        print('WARNING: No pretrained ckpt loaded for FE! Random clustering?')
    fe.to(device)
    fe.eval()
    trans = Compose(
        [ToTensor(), SingleChunkWav(opts.chunk_size, random_scale=False)])
    # Build Dataset(s) and DataLoader(s)
    dset = PairWavDataset(opts.data_root, opts.data_cfg, 'train',
                          transform=trans)
    dloader = DataLoader(dset, batch_size=opts.batch_size,
                         shuffle=True, collate_fn=DictCollater(),
                         num_workers=opts.num_workers)
    # acumulate train chunks and do clustering on them,
    # with each chunk containing several frames
    X = []
    timings = []
    N = opts.num_samples // opts.batch_size
    batch_iter = iter(dloader)  # single iterator over one shuffled pass
    beg_t = timeit.default_timer()
    with torch.no_grad():
        for bidx in range(1, N + 1, 1):
            batch = next(batch_iter)
            chunk = batch['chunk']
            # Time-average the front-end features: one vector per chunk.
            y = fe(chunk.to(device)).mean(dim=2)
            X.append(y.view(-1, y.size(-1)).cpu().data.numpy())
            end_t = timeit.default_timer()
            timings.append(end_t - beg_t)
            beg_t = timeit.default_timer()
            if bidx % opts.log_freq == 0 or bidx >= N:
                print('Forwarded batch {:4d}/{:4d}, btime: {:.2f} s, '
                      'mbtime: {:.2f} s'.format(bidx, N, timings[-1],
                                                np.mean(timings)),
                      end='\r')
    print()
    X = np.concatenate(X, axis=0)
    print('Total X shape: ', X.shape)
    print('Running KMeans...')
    beg_t = timeit.default_timer()
    # NOTE(review): KMeans(n_jobs=...) was removed in scikit-learn 1.0;
    # this call requires an older sklearn — confirm the pinned version.
    kmeans = KMeans(n_clusters=opts.k_clusters, n_jobs=opts.n_jobs,
                    verbose=0).fit(X)
    end_t = timeit.default_timer()
    print('Clusterized in {:.2f} s'.format(end_t - beg_t))
    print('Saving KMeans...')
    with open(os.path.join(opts.save_path, 'kmeans.pkl'), 'wb') as f:
        pickle.dump(kmeans, f)
    print('Finished program')
def build_pase(ckpt, model_config):
    """Construct a PASE front-end from model_config and restore the
    pretrained weights found at ckpt."""
    frontend = wf_builder(model_config)
    frontend.load_pretrained(ckpt, load_last=True, verbose=False)
    return frontend
    res_blocks=hp.voc_res_blocks,
    hop_length=hp.hop_length,
    sample_rate=hp.sample_rate,
    adaptnet=adaptnet,
    mode=hp.voc_mode).to(device)
print(voc_model)
trainable_params = list(voc_model.parameters())
paths = Paths(hp.data_path, hp.voc_model_id, '')
# Load pase model
print('Building PASE...')
if hp.pase_cfg is not None:
    # 2 PASEs: (1) Identifier extractor, (2) Content extractor
    pase_cntnt = wf_builder(hp.pase_cfg)
    if hp.pase_ckpt is not None:
        pase_cntnt.load_pretrained(hp.pase_ckpt, load_last=True,
                                   verbose=True)
    pase_cntnt.to(device)
    # The identity PASE is only needed when doing voice conversion.
    if conversion:
        pase_id = wf_builder(hp.pase_cfg)
        if hp.pase_ckpt is not None:
            pase_id.load_pretrained(hp.pase_ckpt, load_last=True,
                                    verbose=True)
        pase_id.to(device)
    # Optionally fine-tune the content PASE alongside the vocoder.
    # (block continues beyond this excerpt)
    if hp.pase_cntnt_ft:
        print('Setting Content PASE in TRAIN mode')
        pase_cntnt.train()
def load_pase_plus(pase_folder=PASE_FOLDER,
                   parameters='trained_model/PASE+_parameters.ckpt'):
    """Build the PASE+ front-end, restore pretrained weights and move it
    to CUDA0.

    pase_folder: root of the PASE checkout holding cfg/frontend/PASE+.cfg
        (defaults to the module-level PASE_FOLDER).
    parameters: path to the pretrained checkpoint.

    Fix: the body previously hard-coded the global PASE_FOLDER in the
    config path, silently ignoring the pase_folder argument.
    """
    pase = wf_builder(join(pase_folder, 'cfg/frontend/PASE+.cfg'))
    pase.eval()
    pase.load_pretrained(parameters, load_last=True, verbose=True)
    return pase.to(CUDA0)