def main():
    data_loader = DataLoader(args)
    # in preprocessing mode, only process the raw data and exit
    if args.process_data:
        data_loader.process_data()
        return
    torch.cuda.set_device(args.gpu_device)
    data_loader.load()
    # build the requested model variant
    if args.model == "rnn":
        myModel = model.RNNModel(args, data_loader.vocab_size, 8, data_loader.id_2_vec).cuda()
    elif args.model == "cnn":
        myModel = model.CNNModel(args, data_loader.vocab_size, 8, data_loader.id_2_vec).cuda()
    elif args.model == "baseline":
        myModel = model.Baseline(args, data_loader.vocab_size, 8, data_loader.id_2_vec).cuda()
    else:
        print("invalid model type")
        exit(1)
    # run evaluation only, or train
    if args.test_only:
        test(myModel, data_loader, args)
    else:
        train(myModel, data_loader, args)

def main():
    print(torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))
    outfile = options.outfile
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    print("Preparing data reading...")
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    with io.open(os.path.join(datadir, 'filelists', 'test')) as f:
        filelist += [l.rstrip() for l in f if l.rstrip()]

    # - create generator for spectra
    spects = (cached(options.cache_spectra and
                     os.path.join(options.cache_spectra, fn + '.npy'),
                     audio.extract_spect,
                     os.path.join(datadir, 'audio', fn),
                     sample_rate, frame_len, fps)
              for fn in filelist)

    # - pitch-shift if needed
    if options.pitchshift:
        import scipy.ndimage
        spline_order = 2
        spects = (scipy.ndimage.affine_transform(
                      spect, (1, 1 / (1 + options.pitchshift / 100.)),
                      output_shape=(len(spect), mel_max),
                      order=spline_order)
                  for spect in spects)

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                             mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - define generator for mel spectra
    spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
              for spect in spects)

    # - load mean/std
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - define generator for Z-scoring
    spects = ((spect - mean) * istd for spect in spects)

    # - define generator for silence-padding
    pad = np.tile((np.log(1e-7) - mean) * istd, (blocklen // 2, 1))
    spects = (np.concatenate((pad, spect, pad), axis=0) for spect in spects)

    # - we start the generator in a background thread (not required)
    spects = augment.generate_in_background([spects], num_cached=1)

    mdl = model.CNNModel()
    mdl.load_state_dict(torch.load(modelfile))
    mdl.to(device)
    mdl.eval()

    # run prediction loop
    print("Predicting:")
    predictions = []
    for spect in progress(spects, total=len(filelist), desc='File '):
        # naive way: pass excerpts of the size used during training
        # - view spectrogram memory as a 3-tensor of overlapping excerpts
        num_excerpts = len(spect) - blocklen + 1
        excerpts = np.lib.stride_tricks.as_strided(
            spect, shape=(num_excerpts, blocklen, spect.shape[1]),
            strides=(spect.strides[0], spect.strides[0], spect.strides[1]))
        # - pass mini-batches through the network and concatenate results
        preds = np.vstack(
            [mdl(torch.from_numpy(
                 np.transpose(excerpts[pos:pos + batchsize, :, :, np.newaxis],
                              (0, 3, 1, 2))).to(device)).cpu().detach().numpy()
             for pos in range(0, num_excerpts, batchsize)])
        predictions.append(preds)

    # save predictions
    print("Saving predictions")
    np.savez(outfile, **{fn: pred for fn, pred in zip(filelist, predictions)})

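# Illustrative sketch (not part of the script above): how the as_strided call
# turns a (frames, bands) spectrogram into a zero-copy view of overlapping
# (blocklen, bands) excerpts. The toy sizes below are made up for the demo.
def _excerpt_view_demo():
    import numpy as np
    spect = np.arange(20.).reshape(10, 2)      # 10 frames, 2 "mel bands"
    blocklen = 3
    num_excerpts = len(spect) - blocklen + 1   # 8 overlapping excerpts
    excerpts = np.lib.stride_tricks.as_strided(
        spect, shape=(num_excerpts, blocklen, spect.shape[1]),
        strides=(spect.strides[0], spect.strides[0], spect.strides[1]))
    assert excerpts.shape == (8, 3, 2)
    # excerpt k starts at frame k, so consecutive excerpts overlap by blocklen - 1 frames
    assert np.array_equal(excerpts[1], spect[1:4])
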
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    lossgradient = options.lossgradient
    cfg = {}
    print(options.vars)
    print('Model save file:', modelfile)
    print('Lossgrad file:', lossgradient)
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']
    print('Occluded amount:', cfg['occlude'])
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    dataloader = DatasetLoader(options.dataset, options.cache_spectra, datadir,
                               input_type=options.input_type)
    batches = dataloader.prepare_batches(sample_rate, frame_len, fps, mel_bands,
                                         mel_min, mel_max, blocklen, batchsize)
    validation_data = DatasetLoader(options.dataset,
                                    '../ismir2015/experiments/mel_data/',
                                    datadir, dataset_split='valid',
                                    input_type='mel_spects')
    mel_spects_val, labels_val = validation_data.prepare_batches(
        sample_rate, frame_len, fps, mel_bands, mel_min, mel_max, blocklen,
        batchsize, batch_data=False)

    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    if options.input_type == 'mel_spects':
        mdl = model.CNNModel(input_type='mel_spects_norm', is_zeromean=False,
                             sample_rate=sample_rate, frame_len=frame_len,
                             fps=fps, mel_bands=mel_bands, mel_min=mel_min,
                             mel_max=mel_max, bin_mel_max=bin_mel_max,
                             meanstd_file=meanstd_file, device=device)
    # pretrained model used only to compute loss gradients for the occlusion
    if lossgradient != 'None':
        mdl_lossgrad = model.CNNModel(input_type=options.input_type,
                                      is_zeromean=False, sample_rate=sample_rate,
                                      frame_len=frame_len, fps=fps,
                                      mel_bands=mel_bands, mel_min=mel_min,
                                      mel_max=mel_max, bin_mel_max=bin_mel_max,
                                      meanstd_file=meanstd_file, device=device)
        mdl_lossgrad.load_state_dict(torch.load(lossgradient))
        mdl_lossgrad.to(device)
        mdl_lossgrad.eval()
    mdl = mdl.to(device)

    # set up learning rate and learning rate schedule parameters
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    eta = initial_eta

    # set up loss
    criterion = torch.nn.BCELoss()

    # set up optimizer and learning rate scheduler
    optimizer = torch.optim.SGD(mdl.parameters(), lr=eta, momentum=momentum,
                                nesterov=True)
    #optimizer = torch.optim.Adam(mdl.parameters(), lr=eta, betas=(0.9, 0.999),
    #                             eps=1e-08, weight_decay=0, amsgrad=False)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=eta_decay_every,
                                                gamma=eta_decay)

    # set up tensorboard logging
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)

    # conditions to save model
    best_val_loss = 100000.
    best_val_error = 1.

    # loss gradient values for validation data
    loss_grad_val = validation_data.prepare_loss_grad_batches(
        options.loss_grad_save, mel_spects_val, labels_val, mdl_lossgrad,
        criterion, blocklen, batchsize, device)

    for epoch in range(epochs):
        # - initialize quantities used to monitor training
        err = 0
        total_norm = 0
        loss_accum = 0
        mdl.train(True)

        # - compute the L2 norm of the gradients
        for p in mdl.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** (1. / 2)

        # - start the training for this epoch
        for batch in progress(range(epochsize), min_delay=0.5,
                              desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            data = next(batches)
            if options.input_type == 'audio' or options.input_type == 'stft':
                input_data = data[0]
            else:
                input_data = np.transpose(data[0][:, :, :, np.newaxis],
                                          (0, 3, 1, 2))
            labels = data[1][:, np.newaxis].astype(np.float32)
            input_data_loss = input_data
            if lossgradient != 'None':
                # rank time-frequency bins by the loss gradient of the
                # pretrained model and occlude (zero out) the top-ranked ones
                g = loss_grad(mdl_lossgrad,
                              torch.from_numpy(input_data_loss).to(device).requires_grad_(True),
                              torch.from_numpy(labels).to(device), criterion)
                g = np.squeeze(g)
                input_data = (input_data - mean) * istd
                for i in range(batchsize):
                    if options.lossgrad_algorithm == 'grad':
                        rank_matrix = np.abs(g[i])
                    elif options.lossgrad_algorithm == 'gradxinp':
                        rank_matrix = np.squeeze(g[i] * input_data[i, :, :, :])
                    elif options.lossgrad_algorithm == 'gradorig':
                        rank_matrix = g[i]
                    v = np.argsort(rank_matrix, axis=None)[-cfg['occlude']:]
                    input_data[i, :, v // 80, v % 80] = 0
            else:
                # occlude randomly chosen time-frequency bins instead
                for i in range(batchsize):
                    v = np.random.choice(115 * 80, cfg['occlude'], replace=False)
                    input_data[i, :, v // 80, v % 80] = 0
            input_data = input_data.astype(floatX)
            # map labels to make them softer
            labels = (0.02 + 0.96 * labels)
            optimizer.zero_grad()
            outputs = mdl(torch.from_numpy(input_data).to(device))
            loss = criterion(outputs, torch.from_numpy(labels).to(device))
            loss.backward()
            optimizer.step()
            loss_accum += loss.item()

        # - compute validation loss and error if desired
        if options.validate:
            from eval import evaluate
            mdl.train(False)
            val_loss = 0
            preds = []
            labs = []
            max_len = fps
            num_iter = 0
            for spect, label, g in zip(mel_spects_val, labels_val, loss_grad_val):
                num_excerpts = len(spect) - blocklen + 1
                excerpts = np.lib.stride_tricks.as_strided(
                    spect, shape=(num_excerpts, blocklen, spect.shape[1]),
                    strides=(spect.strides[0], spect.strides[0], spect.strides[1]))
                # - pass mini-batches through the network and concatenate results
                for pos in range(0, num_excerpts, batchsize):
                    input_data = np.transpose(
                        excerpts[pos:pos + batchsize, :, :, np.newaxis], (0, 3, 1, 2))
                    if pos + batchsize > num_excerpts:
                        label_batch = label[pos:num_excerpts,
                                            np.newaxis].astype(np.float32)
                    else:
                        label_batch = label[pos:pos + batchsize,
                                            np.newaxis].astype(np.float32)
                    if lossgradient != 'None':
                        input_data = (input_data - mean) * istd
                        for i in range(input_data.shape[0]):
                            if options.lossgrad_algorithm == 'grad':
                                rank_matrix = np.abs(g[i])
                            elif options.lossgrad_algorithm == 'gradxinp':
                                rank_matrix = np.squeeze(g[i] * input_data[i, :, :, :])
                            elif options.lossgrad_algorithm == 'gradorig':
                                rank_matrix = g[i]
                            v = np.argsort(np.abs(rank_matrix), axis=None)[-cfg['occlude']:]
                            input_data[i, :, v // 80, v % 80] = 0
                    else:
                        for i in range(input_data.shape[0]):
                            v = np.random.choice(115 * 80, cfg['occlude'], replace=False)
                            input_data[i, :, v // 80, v % 80] = 0
                    input_data = input_data.astype(floatX)
                    pred = mdl(torch.from_numpy(input_data).to(device))
                    e = criterion(pred, torch.from_numpy(label_batch).to(device))
                    preds = np.append(preds, pred[:, 0].cpu().detach().numpy())
                    labs = np.append(labs, label_batch)
                    val_loss += e.item()
                    num_iter += 1
            print("Validation loss: %.3f" % (val_loss / num_iter))
            _, results = evaluate(preds, labs)
            print("Validation error: %.3f" % (1 - results['accuracy']))
            # keep the model with the lowest validation error
            if 1 - results['accuracy'] < best_val_error:
                torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
                best_val_loss = val_loss / num_iter
                best_val_error = 1 - results['accuracy']
                print('New saved model', best_val_loss, best_val_error)

        # update the learning rate
        scheduler.step()
        print('Training Loss per epoch', loss_accum / epochsize)

        # - log metrics for later examination
        writer.add_scalar('Training Loss', loss_accum / epochsize, epoch)
        if options.validate:
            writer.add_scalar('Validation loss', val_loss / num_iter, epoch)
            writer.add_scalar('Gradient norm', total_norm, epoch)
            writer.add_scalar('Validation error', 1 - results['accuracy'], epoch)

    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
    with io.open(os.path.join(modelfile, 'model.vars'), 'w') as f:
        f.writelines('%s=%s\n' % kv for kv in cfg.items())

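# Illustrative sketch of the occlusion indexing used in the training loop above:
# flat argsort indices over a (frames, bands) ranking matrix are mapped back to
# (frame, band) positions via v // bands and v % bands. The code above assumes
# 115 frames x 80 mel bands per excerpt; the ranking matrix here is random,
# standing in for the |loss gradient| values.
def _occlusion_demo():
    import numpy as np
    n_frames, n_bands, occlude = 115, 80, 5
    rank_matrix = np.random.rand(n_frames, n_bands)    # stand-in for the saliency ranking
    x = np.random.rand(1, n_frames, n_bands)           # one (channel, frame, band) excerpt
    v = np.argsort(rank_matrix, axis=None)[-occlude:]  # flat indices of the top-ranked bins
    x[:, v // n_bands, v % n_bands] = 0                # zero out those time-frequency bins
    assert (x == 0).sum() == occlude
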
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    cfg = {}
    print(options.vars)
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    if options.input_type == 'audio':
        dataloader = DatasetLoader(options.dataset, options.cache_spectra,
                                   datadir, input_type=options.input_type)
        batches = dataloader.prepare_audio_batches(sample_rate, frame_len, fps,
                                                   blocklen, batchsize)
    else:
        dataloader = DatasetLoader(options.dataset, options.cache_spectra,
                                   datadir, input_type=options.input_type)
        batches = dataloader.prepare_batches(sample_rate, frame_len, fps,
                                             mel_bands, mel_min, mel_max,
                                             blocklen, batchsize)
    validation_data = DatasetLoader(options.dataset,
                                    '../ismir2015/experiments/mel_data/',
                                    datadir, dataset_split='valid',
                                    input_type='mel_spects')
    mel_spects_val, labels_val = validation_data.prepare_batches(
        sample_rate, frame_len, fps, mel_bands, mel_min, mel_max, blocklen,
        batchsize, batch_data=False)

    mdl = model.CNNModel(model_type=options.model_type,
                         input_type=options.input_type, is_zeromean=False,
                         sample_rate=sample_rate, frame_len=frame_len, fps=fps,
                         mel_bands=mel_bands, mel_min=mel_min, mel_max=mel_max,
                         bin_mel_max=bin_mel_max, meanstd_file=meanstd_file,
                         device=device)
    mdl = mdl.to(device)

    # set up learning rate and learning rate schedule parameters
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    eta = initial_eta

    # set up loss
    criterion = torch.nn.BCELoss()

    # set up optimizer and learning rate scheduler
    optimizer = torch.optim.SGD(mdl.parameters(), lr=eta, momentum=momentum,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=eta_decay_every,
                                                gamma=eta_decay)

    # set up tensorboard logging
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)

    # conditions to save model
    best_val_loss = 100000.
    best_val_error = 1.

    for epoch in range(epochs):
        # - initialize quantities used to monitor training
        err = 0
        total_norm = 0
        loss_accum = 0
        mdl.train(True)

        # - compute the L2 norm of the gradients
        for p in mdl.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** (1. / 2)

        # - start the training for this epoch
        for batch in progress(range(epochsize), min_delay=0.5,
                              desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            data = next(batches)
            if options.input_type == 'audio' or options.input_type == 'stft':
                input_data = data[0]
            else:
                input_data = np.transpose(data[0][:, :, :, np.newaxis],
                                          (0, 3, 1, 2))
            labels = data[1][:, np.newaxis].astype(np.float32)
            # map labels to make them softer
            if not options.adversarial_training:
                labels = (0.02 + 0.96 * labels)
            optimizer.zero_grad()
            if options.adversarial_training:
                # craft adversarial examples against the current model with PGD
                # and train on those instead of the clean inputs
                mdl.train(False)
                if options.input_type == 'stft':
                    input_data_adv = attacks.PGD(
                        mdl, torch.from_numpy(input_data).to(device),
                        target=torch.from_numpy(labels).to(device),
                        eps=cfg['eps'], step_size=cfg['eps_iter'],
                        iterations=cfg['nb_iter'], use_best=True,
                        random_start=True, clip_min=0,
                        clip_max=1e8).cpu().detach().numpy()
                else:
                    input_data_adv = attacks.PGD(
                        mdl, torch.from_numpy(input_data).to(device),
                        target=torch.from_numpy(labels).to(device),
                        eps=cfg['eps'], step_size=cfg['eps_iter'],
                        iterations=cfg['nb_iter'], use_best=True,
                        random_start=True).cpu().detach().numpy()
                mdl.train(True)
                optimizer.zero_grad()
                outputs = mdl(torch.from_numpy(input_data_adv).to(device))
            else:
                optimizer.zero_grad()
                outputs = mdl(torch.from_numpy(input_data).to(device))
            loss = criterion(outputs, torch.from_numpy(labels).to(device))
            loss.backward()
            optimizer.step()
            print(loss.item())
            loss_accum += loss.item()

        # - compute validation loss and error if desired
        if options.validate:
            mdl.input_type = 'mel_spects'
            from eval import evaluate
            mdl.train(False)
            val_loss = 0
            preds = []
            labs = []
            max_len = fps
            num_iter = 0
            for spect, label in zip(mel_spects_val, labels_val):
                num_excerpts = len(spect) - blocklen + 1
                excerpts = np.lib.stride_tricks.as_strided(
                    spect, shape=(num_excerpts, blocklen, spect.shape[1]),
                    strides=(spect.strides[0], spect.strides[0], spect.strides[1]))
                # - pass mini-batches through the network and concatenate results
                for pos in range(0, num_excerpts, batchsize):
                    input_data = np.transpose(
                        excerpts[pos:pos + batchsize, :, :, np.newaxis], (0, 3, 1, 2))
                    if pos + batchsize > num_excerpts:
                        label_batch = label[pos:num_excerpts,
                                            np.newaxis].astype(np.float32)
                    else:
                        label_batch = label[pos:pos + batchsize,
                                            np.newaxis].astype(np.float32)
                    pred = mdl(torch.from_numpy(input_data).to(device))
                    e = criterion(pred, torch.from_numpy(label_batch).to(device))
                    preds = np.append(preds, pred[:, 0].cpu().detach().numpy())
                    labs = np.append(labs, label_batch)
                    val_loss += e.item()
                    num_iter += 1
            mdl.input_type = options.input_type
            print("Validation loss: %.3f" % (val_loss / num_iter))
            _, results = evaluate(preds, labs)
            print("Validation error: %.3f" % (1 - results['accuracy']))
            # keep the model with the lowest validation error
            if 1 - results['accuracy'] < best_val_error:
                torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
                best_val_loss = val_loss / num_iter
                best_val_error = 1 - results['accuracy']
                print('New saved model', best_val_loss, best_val_error)

        # update the learning rate
        scheduler.step()
        print('Training Loss per epoch', loss_accum / epochsize)

        # - log metrics for later examination
        writer.add_scalar('Training Loss', loss_accum / epochsize, epoch)
        if options.validate:
            writer.add_scalar('Validation loss', val_loss / num_iter, epoch)
            writer.add_scalar('Gradient norm', total_norm, epoch)
            writer.add_scalar('Validation error', 1 - results['accuracy'], epoch)

    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
    with io.open(os.path.join(modelfile, 'model.vars'), 'w') as f:
        f.writelines('%s=%s\n' % kv for kv in cfg.items())

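# attacks.PGD above is a project-specific helper. For reference, a generic
# L-infinity projected gradient descent attack against a sigmoid-output model
# trained with BCELoss looks roughly like the sketch below; the signature and
# behaviour are assumptions, not the project's API (no use_best/clip handling).
def _pgd_sketch(mdl, x, target, eps, step_size, iterations):
    import torch
    criterion = torch.nn.BCELoss()
    x_adv = x + torch.empty_like(x).uniform_(-eps, eps)      # random start in the eps-ball
    for _ in range(iterations):
        x_adv = x_adv.clone().detach().requires_grad_(True)
        loss = criterion(mdl(x_adv), target)
        loss.backward()                                      # gradient w.r.t. the input
        with torch.no_grad():
            x_adv = x_adv + step_size * x_adv.grad.sign()    # ascend the loss
            x_adv = x + torch.clamp(x_adv - x, -eps, eps)    # project back onto the eps-ball
    return x_adv.detach()
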
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'train')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    if options.validate:
        with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
            filelist_val = [l.strip() for l in f if l.strip()]
        filelist.extend(filelist_val)
    else:
        filelist_val = []

    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra and
                    os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(cached(cache_fn, audio.extract_spect,
                             os.path.join(datadir, 'audio', fn),
                             sample_rate, frame_len, fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, bool))

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                             mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    if options.validate:
        spects_val = spects[-len(filelist_val):]
        spects = spects[:-len(filelist_val)]
        labels_val = labels[-len(filelist_val):]
        labels = labels[:-len(filelist_val)]

    # - precompute mel spectra, if needed, otherwise just define a generator
    mel_spects = (np.log(np.maximum(np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
                  for spect in spects)
    if not options.augment:
        mel_spects = list(mel_spects)
        del spects

    # - load mean/std or compute it, if not computed yet
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    try:
        with np.load(meanstd_file) as f:
            mean = f['mean']
            std = f['std']
    except (IOError, KeyError):
        print("Computing mean and standard deviation...")
        mean, std = znorm.compute_mean_std(mel_spects)
        np.savez(meanstd_file, mean=mean, std=std)
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just precompute the normalized mel spectra
        # and create a generator that returns mini-batches of random excerpts
        mel_spects = [(spect - mean) * istd for spect in mel_spects]
        batches = augment.grab_random_excerpts(mel_spects, labels, batchsize,
                                               blocklen)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = cfg['spline_order']
        if spline_order > 1:
            from scipy.ndimage import spline_filter
            spects = [spline_filter(spect, spline_order).astype(floatX)
                      for spect in spects]

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = cfg['max_stretch']
            batches = augment.grab_random_excerpts(
                spects, labels, batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = cfg['max_shift']
            batches = augment.apply_random_stretch_shift(
                batches, max_stretch, max_shift, keep_frames=blocklen,
                keep_bins=bin_mel_max, order=spline_order, prefiltered=True)

            # We transform the excerpts to mel frequency and log magnitude.
            batches = augment.apply_filterbank(batches, filterbank)
            batches = augment.apply_logarithm(batches)

            # We apply random frequency filters
            max_db = cfg['max_db']
            batches = augment.apply_random_filters(batches, filterbank,
                                                   mel_max, max_db=max_db)

            # We apply normalization
            batches = augment.apply_znorm(batches, mean, istd)
            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = cfg['bg_threads']
        bg_processes = cfg['bg_processes']
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels) for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25, in_processes=True)

    ###########################################################################
    # ---------Main changes to code to make it work with pytorch------------- #
    ###########################################################################
    print("Preparing training function...")
    mdl = model.CNNModel()
    mdl = mdl.to(device)

    # set up learning rate and learning rate schedule parameters
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    eta = initial_eta

    # set up loss
    criterion = torch.nn.BCELoss()

    # set up optimizer and learning rate scheduler
    optimizer = torch.optim.SGD(mdl.parameters(), lr=eta, momentum=momentum,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=eta_decay_every,
                                                gamma=eta_decay)

    # set up tensorboard logging
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)

    # conditions to save model
    best_val_loss = 100000.
    best_val_error = 1.

    for epoch in range(epochs):
        # - initialize quantities used to monitor training
        err = 0
        total_norm = 0
        loss_accum = 0
        mdl.train(True)

        # - compute the L2 norm of the gradients
        for p in mdl.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** (1. / 2)

        # - start the training for this epoch
        for batch in progress(range(epochsize), min_delay=0.5,
                              desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            data = next(batches)
            input_data = np.transpose(data[0][:, :, :, np.newaxis], (0, 3, 1, 2))
            labels = data[1][:, np.newaxis].astype(np.float32)
            # map labels to make them softer
            labels = (0.02 + 0.96 * labels)
            optimizer.zero_grad()
            outputs = mdl(torch.from_numpy(input_data).to(device))
            loss = criterion(outputs, torch.from_numpy(labels).to(device))
            loss.backward()
            optimizer.step()
            loss_accum += loss.item()

        # - compute validation loss and error if desired
        if options.validate:
            from eval import evaluate
            mdl.train(False)
            val_loss = 0
            preds = []
            labs = []
            max_len = fps
            mel_spects_val = (np.log(np.maximum(
                np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
                for spect in spects_val)
            mel_spects_val = [(spect - mean) * istd for spect in mel_spects_val]
            num_iter = 0
            for spect, label in zip(mel_spects_val, labels_val):
                num_excerpts = len(spect) - blocklen + 1
                excerpts = np.lib.stride_tricks.as_strided(
                    spect, shape=(num_excerpts, blocklen, spect.shape[1]),
                    strides=(spect.strides[0], spect.strides[0], spect.strides[1]))
                # - pass mini-batches through the network and concatenate results
                for pos in range(0, num_excerpts, batchsize):
                    input_data = np.transpose(
                        excerpts[pos:pos + batchsize, :, :, np.newaxis], (0, 3, 1, 2))
                    if pos + batchsize > num_excerpts:
                        label_batch = label[blocklen // 2 + pos:blocklen // 2 + num_excerpts,
                                            np.newaxis].astype(np.float32)
                    else:
                        label_batch = label[blocklen // 2 + pos:blocklen // 2 + pos + batchsize,
                                            np.newaxis].astype(np.float32)
                    pred = mdl(torch.from_numpy(input_data).to(device))
                    e = criterion(pred, torch.from_numpy(label_batch).to(device))
                    preds = np.append(preds, pred[:, 0].cpu().detach().numpy())
                    labs = np.append(labs, label_batch)
                    val_loss += e.item()
                    num_iter += 1
            print("Validation loss: %.3f" % (val_loss / num_iter))
            _, results = evaluate(preds, labs)
            print("Validation error: %.3f" % (1 - results['accuracy']))
            # keep the model only if both validation loss and error improve
            if (val_loss / num_iter < best_val_loss and
                    (1 - results['accuracy']) < best_val_error):
                torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
                best_val_loss = val_loss / num_iter
                best_val_error = 1 - results['accuracy']
                print('New saved model', best_val_loss, best_val_error)

        # update the learning rate
        scheduler.step()
        print('Training Loss per epoch', loss_accum / epochsize)

        # - log metrics for later examination
        writer.add_scalar('Training Loss', loss_accum / epochsize, epoch)
        if options.validate:
            writer.add_scalar('Validation loss', val_loss / num_iter, epoch)
            writer.add_scalar('Gradient norm', total_norm, epoch)
            writer.add_scalar('Validation error', 1 - results['accuracy'], epoch)
        for param_group in optimizer.param_groups:
            print(param_group['lr'])

    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))

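# Once the run above has written model.pth, reloading the weights for inference
# is straightforward; a minimal sketch, assuming the same argument-free
# model.CNNModel() constructor used above:
def _load_trained_model(modelfile, device):
    mdl = model.CNNModel()
    mdl.load_state_dict(torch.load(os.path.join(modelfile, 'model.pth'),
                                   map_location=device))
    mdl.to(device)
    mdl.eval()   # disable dropout / batch-norm updates for prediction
    return mdl
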
def main():
    print(torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    lossgradient = options.lossgradient
    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))
    cfg.update(config.parse_variable_assignments(options.var))
    outfile = options.outfile
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    print("Preparing data reading...")
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    with io.open(os.path.join(datadir, 'filelists', 'test')) as f:
        filelist += [l.rstrip() for l in f if l.rstrip()]

    # - load mean/std
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    dataloader = DatasetLoader(options.dataset, options.cache_spectra, datadir,
                               input_type=options.input_type, filelist=filelist)
    mel_spects, labels = dataloader.prepare_batches(
        sample_rate, frame_len, fps, mel_bands, mel_min, mel_max, blocklen,
        batchsize, batch_data=False)
    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    mdl = model.CNNModel(input_type='mel_spects_norm', is_zeromean=False,
                         meanstd_file=meanstd_file, device=device)
    mdl.load_state_dict(torch.load(modelfile))
    mdl.to(device)
    mdl.eval()

    # pretrained model used only to compute loss gradients for the occlusion
    if lossgradient != 'None':
        mdl_lossgrad = model.CNNModel(input_type=options.input_type,
                                      is_zeromean=False, sample_rate=sample_rate,
                                      frame_len=frame_len, fps=fps,
                                      mel_bands=mel_bands, mel_min=mel_min,
                                      mel_max=mel_max, bin_mel_max=bin_mel_max,
                                      meanstd_file=meanstd_file, device=device)
        mdl_lossgrad.load_state_dict(torch.load(lossgradient))
        mdl_lossgrad.to(device)
        mdl_lossgrad.eval()
        criterion = torch.nn.BCELoss()
        loss_grad_val = dataloader.prepare_loss_grad_batches(
            options.loss_grad_save, mel_spects, labels, mdl_lossgrad,
            criterion, blocklen, batchsize, device)

    # run prediction loop
    print("Predicting:")
    predictions = []
    c = 0
    for spect in progress(mel_spects, total=len(filelist), desc='File '):
        if lossgradient != 'None':
            g = loss_grad_val[c]
            c += 1
        # naive way: pass excerpts of the size used during training
        # - view spectrogram memory as a 3-tensor of overlapping excerpts
        num_excerpts = len(spect) - blocklen + 1
        excerpts = np.lib.stride_tricks.as_strided(
            spect.astype(floatX),
            shape=(num_excerpts, blocklen, spect.shape[1]),
            strides=(spect.strides[0], spect.strides[0], spect.strides[1]))
        # - pass mini-batches through the network, occluding the selected
        #   time-frequency bins, and collect the results
        preds = np.zeros((num_excerpts, 1))
        count = 0
        for pos in range(0, num_excerpts, batchsize):
            input_data = np.transpose(
                excerpts[pos:pos + batchsize, :, :, np.newaxis], (0, 3, 1, 2))
            input_data = (input_data - mean) * istd
            if lossgradient != 'None':
                for i in range(input_data.shape[0]):
                    if options.lossgrad_algorithm == 'grad':
                        rank_matrix = np.abs(g[i + pos])
                    elif options.lossgrad_algorithm == 'gradxinp':
                        rank_matrix = np.squeeze(g[i + pos] * input_data[i, :, :, :])
                    elif options.lossgrad_algorithm == 'gradorig':
                        rank_matrix = g[i + pos]
                    if options.ROAR == 1:
                        # ROAR: occlude the highest-ranked (most important) bins
                        v = np.argsort(rank_matrix, axis=None)[-cfg['occlude']:]
                    else:
                        # otherwise occlude the lowest-ranked bins
                        v = np.argsort(rank_matrix, axis=None)[:cfg['occlude']]
                    input_data[i, :, v // 80, v % 80] = 0
            else:
                # occlude randomly chosen time-frequency bins instead
                for i in range(input_data.shape[0]):
                    v = np.random.choice(115 * 80, cfg['occlude'], replace=False)
                    input_data[i, :, v // 80, v % 80] = 0
            count += 1
            preds[pos:pos + batchsize, :] = mdl(
                torch.from_numpy(input_data).to(device)).cpu().detach().numpy()
        predictions.append(preds)

    # save predictions
    print("Saving predictions")
    np.savez(outfile, **{fn: pred for fn, pred in zip(filelist, predictions)})

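# The .npz written above maps each filename to its per-excerpt prediction array;
# a minimal sketch of reading it back (np.savez appends '.npz' if missing):
def _load_predictions(outfile):
    import numpy as np
    path = outfile if outfile.endswith('.npz') else outfile + '.npz'
    with np.load(path) as f:
        return {fn: f[fn] for fn in f.files}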