Example #1
def main():
    data_loader = DataLoader(args)
    if (args.process_data):
        data_loader.process_data()
        return
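    # select which GPU the subsequent .cuda() calls will allocate on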
    torch.cuda.set_device(args.gpu_device)

    data_loader.load()
    if (args.model == "rnn"):
        myModel = model.RNNModel(args, data_loader.vocab_size, 8,
                                 data_loader.id_2_vec).cuda()
    elif (args.model == "cnn"):
        myModel = model.CNNModel(args, data_loader.vocab_size, 8,
                                 data_loader.id_2_vec).cuda()
    elif (args.model == "baseline"):
        myModel = model.Baseline(args, data_loader.vocab_size, 8,
                                 data_loader.id_2_vec).cuda()
    else:
        print("invalid model type")
        exit(1)

    if (args.test_only):
        test(myModel, data_loader, args)
    else:
        train(myModel, data_loader, args)
Example #2
def main():
    print(torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))

    cfg.update(config.parse_variable_assignments(options.var))

    outfile = options.outfile
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

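    # number of STFT bins up to the Nyquist frequency, and the (approximate)
    # bin index corresponding to mel_max Hz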
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    print("Preparing data reading...")
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    with io.open(os.path.join(datadir, 'filelists', 'test')) as f:
        filelist += [l.rstrip() for l in f if l.rstrip()]

    # - create generator for spectra
    spects = (cached(
        options.cache_spectra
        and os.path.join(options.cache_spectra, fn + '.npy'),
        audio.extract_spect, os.path.join(datadir, 'audio',
                                          fn), sample_rate, frame_len, fps)
              for fn in filelist)

    # - pitch-shift if needed
    if options.pitchshift:
        import scipy.ndimage
        spline_order = 2
        spects = (scipy.ndimage.affine_transform(
            spect, (1, 1 / (1 + options.pitchshift / 100.)),
            output_shape=(len(spect), mel_max),
            order=spline_order) for spect in spects)

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                             mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    # - define generator for mel spectra
    spects = (np.log(
        np.maximum(np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
              for spect in spects)

    # - load mean/std
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - define generator for Z-scoring
    spects = ((spect - mean) * istd for spect in spects)

    # - define generator for silence-padding
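    #   (blocklen//2 frames of the normalized log-magnitude floor on each side,
    #    so every original frame can sit at the center of an excerpt)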
    pad = np.tile((np.log(1e-7) - mean) * istd, (blocklen // 2, 1))
    spects = (np.concatenate((pad, spect, pad), axis=0) for spect in spects)

    # - we start the generator in a background thread (not required)
    spects = augment.generate_in_background([spects], num_cached=1)

    mdl = model.CNNModel()
    mdl.load_state_dict(torch.load(modelfile, map_location=device))
    mdl.to(device)
    mdl.eval()

    # run prediction loop
    print("Predicting:")
    predictions = []
    for spect in progress(spects, total=len(filelist), desc='File '):
        # naive way: pass excerpts of the size used during training
        # - view spectrogram memory as a 3-tensor of overlapping excerpts
        num_excerpts = len(spect) - blocklen + 1
        excerpts = np.lib.stride_tricks.as_strided(
            spect,
            shape=(num_excerpts, blocklen, spect.shape[1]),
            strides=(spect.strides[0], spect.strides[0], spect.strides[1]))

        # - pass mini-batches through the network and concatenate results
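        #   (each excerpt is transposed to NCHW: batch x 1 channel x blocklen x mel bands)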
        preds = np.vstack([
            mdl(
                torch.from_numpy(
                    np.transpose(
                        excerpts[pos:pos + batchsize, :, :, np.newaxis],
                        (0, 3, 1, 2))).to(device)).cpu().detach().numpy()
            for pos in range(0, num_excerpts, batchsize)])
        predictions.append(preds)

    # save predictions
    print("Saving predictions")
    np.savez(outfile, **{fn: pred for fn, pred in zip(filelist, predictions)})
Example #3
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    lossgradient = options.lossgradient
    cfg = {}
    print(options.vars)
    print('Model save file:', modelfile)
    print('Lossgrad file:', lossgradient)
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))

    cfg.update(config.parse_variable_assignments(options.var))
    
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']
    print('Occluded amount:', cfg['occlude'])
    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__),
                           os.path.pardir, 'datasets', options.dataset)
    
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
 
    dataloader = DatasetLoader(options.dataset, options.cache_spectra, datadir,
                               input_type=options.input_type)
    batches = dataloader.prepare_batches(sample_rate, frame_len, fps,
                                         mel_bands, mel_min, mel_max,
                                         blocklen, batchsize)

    validation_data = DatasetLoader(options.dataset,
                                    '../ismir2015/experiments/mel_data/',
                                    datadir, dataset_split='valid',
                                    input_type='mel_spects')
    mel_spects_val, labels_val = validation_data.prepare_batches(
        sample_rate, frame_len, fps, mel_bands, mel_min, mel_max,
        blocklen, batchsize, batch_data=False)

    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)
    if options.input_type == 'mel_spects':
        mdl = model.CNNModel(input_type='mel_spects_norm', is_zeromean=False,
            sample_rate=sample_rate, frame_len=frame_len, fps=fps,
            mel_bands=mel_bands, mel_min=mel_min, mel_max=mel_max,
            bin_mel_max=bin_mel_max, meanstd_file=meanstd_file, device=device)
        if lossgradient != 'None':
            mdl_lossgrad = model.CNNModel(input_type=options.input_type,
                is_zeromean=False, sample_rate=sample_rate, frame_len=frame_len,
                fps=fps, mel_bands=mel_bands, mel_min=mel_min, mel_max=mel_max,
                bin_mel_max=bin_mel_max, meanstd_file=meanstd_file, device=device)
            mdl_lossgrad.load_state_dict(torch.load(lossgradient, map_location=device))
            mdl_lossgrad.to(device)
            mdl_lossgrad.eval()

    mdl = mdl.to(device)
    
    #Setting up learning rate and learning rate parameters
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    eta = initial_eta

    #set up loss
    criterion = torch.nn.BCELoss()

    #set up optimizer
    optimizer = torch.optim.SGD(mdl.parameters(), lr=eta, momentum=momentum,
                                nesterov=True)
    #optimizer = torch.optim.Adam(mdl.parameters(), lr=eta, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=eta_decay_every,
                                                gamma=eta_decay)

    #set up tensorboard writer
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    
    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)
    
    #conditions to save model
    best_val_loss = 100000.
    best_val_error = 1.
    
    #loss gradient values for validation data (only computed when a loss-gradient model is given)
    if lossgradient != 'None':
        loss_grad_val = validation_data.prepare_loss_grad_batches(
            options.loss_grad_save, mel_spects_val, labels_val, mdl_lossgrad,
            criterion, blocklen, batchsize, device)
    else:
        # no loss-gradient model: fill with placeholders so the zip below still works
        loss_grad_val = [None] * len(labels_val)
    for epoch in range(epochs):
        # - Initialize certain parameters that are used to monitor training
        err = 0
        total_norm = 0
        loss_accum = 0
        mdl.train(True)
        # - Compute the L-2 norm of the gradients
        for p in mdl.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** (1. / 2)
        
        # - Start the training for this epoch
        for batch in progress(range(epochsize), min_delay=0.5,
                              desc='Epoch %d/%d: Batch ' % (epoch + 1, epochs)):
            data = next(batches)
            if options.input_type == 'audio' or options.input_type == 'stft':
                input_data = data[0]
            else:
                # reshape mel spectrogram excerpts to NCHW (batch, 1, blocklen, mel bands)
                input_data = np.transpose(data[0][:, :, :, np.newaxis],
                                          (0, 3, 1, 2))
            labels = data[1][:, np.newaxis].astype(np.float32)
            input_data_loss = input_data

            if lossgradient != 'None':
                # rank time-frequency bins by the loss gradient and occlude the
                # top cfg['occlude'] of them (with 80 mel bands, v // 80 is the
                # frame index and v % 80 the band index)
                g = loss_grad(mdl_lossgrad,
                              torch.from_numpy(input_data_loss).to(device).requires_grad_(True),
                              torch.from_numpy(labels).to(device), criterion)
                g = np.squeeze(g)
                input_data = (input_data - mean) * istd
                for i in range(batchsize):
                    if options.lossgrad_algorithm == 'grad':
                        rank_matrix = np.abs(g[i])
                    elif options.lossgrad_algorithm == 'gradxinp':
                        rank_matrix = np.squeeze(g[i] * input_data[i, :, :, :])
                    elif options.lossgrad_algorithm == 'gradorig':
                        rank_matrix = g[i]
                    v = np.argsort(rank_matrix, axis=None)[-cfg['occlude']:]
                    input_data[i, :, v // 80, v % 80] = 0
            else:
                # no loss-gradient model: occlude randomly chosen bins instead
                for i in range(batchsize):
                    v = np.random.choice(115 * 80, cfg['occlude'], replace=False)
                    input_data[i, :, v // 80, v % 80] = 0

            input_data = input_data.astype(floatX)

            # map labels to make them softer (label smoothing for BCE)
            labels = (0.02 + 0.96 * labels)

            optimizer.zero_grad()
            outputs = mdl(torch.from_numpy(input_data).to(device))

            loss = criterion(outputs, torch.from_numpy(labels).to(device))
            loss.backward()
            optimizer.step()
            loss_accum += loss.item()
   
        # - Compute validation loss and error if desired
        if options.validate:
            #mdl.model_type = 'mel_spects'
            from eval import evaluate
            mdl.train(False) 
            val_loss = 0
            preds = []
            labs = []
            max_len = fps
            
            num_iter = 0 

            for spect, label, g in zip(mel_spects_val, labels_val, loss_grad_val):
                num_excerpts = len(spect) - blocklen + 1
                excerpts = np.lib.stride_tricks.as_strided(
                    spect, shape=(num_excerpts, blocklen, spect.shape[1]),
                    strides=(spect.strides[0], spect.strides[0], spect.strides[1]))

                # - Pass mini-batches through the network and concatenate results
                for pos in range(0, num_excerpts, batchsize):
                    input_data = np.transpose(
                        excerpts[pos:pos + batchsize, :, :, np.newaxis],
                        (0, 3, 1, 2))
                    if (pos + batchsize > num_excerpts):
                        label_batch = label[pos:num_excerpts,
                                            np.newaxis].astype(np.float32)
                    else:
                        label_batch = label[pos:pos + batchsize,
                                            np.newaxis].astype(np.float32)

                    if lossgradient != 'None':
                        input_data = (input_data - mean) * istd
                        for i in range(input_data.shape[0]):
                            if options.lossgrad_algorithm == 'grad':
                                rank_matrix = np.abs(g[i])
                            elif options.lossgrad_algorithm == 'gradxinp':
                                rank_matrix = np.squeeze(g[i] * input_data[i, :, :, :])
                            elif options.lossgrad_algorithm == 'gradorig':
                                rank_matrix = g[i]

                            v = np.argsort(np.abs(rank_matrix), axis=None)[-cfg['occlude']:]
                            input_data[i, :, v // 80, v % 80] = 0
                    else:
                        for i in range(input_data.shape[0]):
                            v = np.random.choice(115 * 80, cfg['occlude'], replace=False)
                            input_data[i, :, v // 80, v % 80] = 0

                    input_data = input_data.astype(floatX)

                    pred = mdl(torch.from_numpy(input_data).to(device))
                    e = criterion(pred, torch.from_numpy(label_batch).to(device))
                    preds = np.append(preds, pred[:, 0].cpu().detach().numpy())
                    labs = np.append(labs, label_batch)
                    val_loss += e.item()
                    num_iter += 1
            #mdl.model_type = 'mel_spects_norm'
            print("Validation loss: %.3f" % (val_loss / num_iter))
            _, results = evaluate(preds,labs)
            print("Validation error: %.3f" % (1 - results['accuracy']))
            
            if (1 - results['accuracy'] < best_val_error):
                torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
                best_val_loss = val_loss / num_iter
                best_val_error = 1 - results['accuracy']
                print('New saved model', best_val_loss, best_val_error)

        #Update the learning rate
        scheduler.step()

        print('Training Loss per epoch', loss_accum / epochsize)

        # - Save parameters for examining
        writer.add_scalar('Training Loss', loss_accum / epochsize, epoch)
        if options.validate:
            writer.add_scalar('Validation loss', val_loss / num_iter, epoch)
            writer.add_scalar('Gradient norm', total_norm, epoch)
            writer.add_scalar('Validation error', 1 - results['accuracy'], epoch)
        #for param_group in optimizer.param_groups:
            #print(param_group['lr'])
    
    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
    with io.open(os.path.join(modelfile, 'model.vars'), 'w') as f:
        f.writelines('%s=%s\n' % kv for kv in cfg.items())
Example #4
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    cfg = {}
    print(options.vars)
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))

    cfg.update(config.parse_variable_assignments(options.var))

    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)

    if (options.input_type == 'audio'):
        dataloader = DatasetLoader(options.dataset,
                                   options.cache_spectra,
                                   datadir,
                                   input_type=options.input_type)
        batches = dataloader.prepare_audio_batches(sample_rate, frame_len, fps,
                                                   blocklen, batchsize)
    else:
        dataloader = DatasetLoader(options.dataset,
                                   options.cache_spectra,
                                   datadir,
                                   input_type=options.input_type)
        batches = dataloader.prepare_batches(sample_rate, frame_len, fps,
                                             mel_bands, mel_min, mel_max,
                                             blocklen, batchsize)

    validation_data = DatasetLoader(options.dataset,
                                    '../ismir2015/experiments/mel_data/',
                                    datadir,
                                    dataset_split='valid',
                                    input_type='mel_spects')
    mel_spects_val, labels_val = validation_data.prepare_batches(
        sample_rate,
        frame_len,
        fps,
        mel_bands,
        mel_min,
        mel_max,
        blocklen,
        batchsize,
        batch_data=False)

    mdl = model.CNNModel(model_type=options.model_type,
                         input_type=options.input_type,
                         is_zeromean=False,
                         sample_rate=sample_rate,
                         frame_len=frame_len,
                         fps=fps,
                         mel_bands=mel_bands,
                         mel_min=mel_min,
                         mel_max=mel_max,
                         bin_mel_max=bin_mel_max,
                         meanstd_file=meanstd_file,
                         device=device)
    mdl = mdl.to(device)

    #Setting up learning rate and learning rate parameters
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    eta = initial_eta

    #set up loss
    criterion = torch.nn.BCELoss()

    #set up optimizer
    optimizer = torch.optim.SGD(mdl.parameters(),
                                lr=eta,
                                momentum=momentum,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=eta_decay_every,
                                                gamma=eta_decay)

    #set up tensorboard writer
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)

    #conditions to save model
    best_val_loss = 100000.
    best_val_error = 1.

    for epoch in range(epochs):
        # - Initialize certain parameters that are used to monitor training
        err = 0
        total_norm = 0
        loss_accum = 0
        mdl.train(True)
        # - Compute the L-2 norm of the gradients
        for p in mdl.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item()**2
        total_norm = total_norm**(1. / 2)

        # - Start the training for this epoch
        for batch in progress(range(epochsize),
                              min_delay=0.5,
                              desc='Epoch %d/%d: Batch ' %
                              (epoch + 1, epochs)):
            data = next(batches)
            if (options.input_type == 'audio' or options.input_type == 'stft'):
                input_data = data[0]
            else:
                input_data = np.transpose(data[0][:, :, :, np.newaxis],
                                          (0, 3, 1, 2))
            labels = data[1][:, np.newaxis].astype(np.float32)

            #map labels to make them softer
            if not options.adversarial_training:
                labels = (0.02 + 0.96 * labels)
            optimizer.zero_grad()

            if (options.adversarial_training):
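                # craft PGD-perturbed inputs with the model in eval mode, then
                # train on those instead of the clean batch (labels stay hard
                # here, cf. the label-smoothing branch above)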
                mdl.train(False)
                if (options.input_type == 'stft'):
                    input_data_adv = attacks.PGD(
                        mdl,
                        torch.from_numpy(input_data).to(device),
                        target=torch.from_numpy(labels).to(device),
                        eps=cfg['eps'],
                        step_size=cfg['eps_iter'],
                        iterations=cfg['nb_iter'],
                        use_best=True,
                        random_start=True,
                        clip_min=0,
                        clip_max=1e8).cpu().detach().numpy()
                else:
                    input_data_adv = attacks.PGD(
                        mdl,
                        torch.from_numpy(input_data).to(device),
                        target=torch.from_numpy(labels).to(device),
                        eps=cfg['eps'],
                        step_size=cfg['eps_iter'],
                        iterations=cfg['nb_iter'],
                        use_best=True,
                        random_start=True).cpu().detach().numpy()

                mdl.train(True)
                optimizer.zero_grad()
                outputs = mdl(torch.from_numpy(input_data_adv).to(device))
            else:
                optimizer.zero_grad()
                outputs = mdl(torch.from_numpy(input_data).to(device))
            #input(outputs.size())
            #input(mdl.conv(torch.from_numpy(input_data).to(device)).cpu().detach().numpy().shape)
            loss = criterion(outputs, torch.from_numpy(labels).to(device))
            loss.backward()
            optimizer.step()
            print(loss.item())
            loss_accum += loss.item()

        # - Compute validation loss and error if desired
        if options.validate:
            mdl.input_type = 'mel_spects'
            from eval import evaluate
            mdl.train(False)
            val_loss = 0
            preds = []
            labs = []
            max_len = fps

            num_iter = 0

            for spect, label in zip(mel_spects_val, labels_val):
                num_excerpts = len(spect) - blocklen + 1
                excerpts = np.lib.stride_tricks.as_strided(
                    spect,
                    shape=(num_excerpts, blocklen, spect.shape[1]),
                    strides=(spect.strides[0], spect.strides[0],
                             spect.strides[1]))
                # - Pass mini-batches through the network and concatenate results
                for pos in range(0, num_excerpts, batchsize):
                    input_data = np.transpose(
                        excerpts[pos:pos + batchsize, :, :, np.newaxis],
                        (0, 3, 1, 2))
                    #if (pos+batchsize>num_excerpts):
                    #    label_batch = label[blocklen//2+pos:blocklen//2+num_excerpts,
                    #            np.newaxis].astype(np.float32)
                    #else:
                    #    label_batch = label[blocklen//2+pos:blocklen//2+pos+batchsize,
                    #            np.newaxis].astype(np.float32)
                    if (pos + batchsize > num_excerpts):
                        label_batch = label[pos:num_excerpts,
                                            np.newaxis].astype(np.float32)
                    else:
                        label_batch = label[pos:pos + batchsize,
                                            np.newaxis].astype(np.float32)

                    pred = mdl(torch.from_numpy(input_data).to(device))
                    e = criterion(pred,
                                  torch.from_numpy(label_batch).to(device))
                    preds = np.append(preds, pred[:, 0].cpu().detach().numpy())
                    labs = np.append(labs, label_batch)
                    val_loss += e.item()
                    num_iter += 1
            mdl.input_type = options.input_type
            print("Validation loss: %.3f" % (val_loss / num_iter))
            _, results = evaluate(preds, labs)
            print("Validation error: %.3f" % (1 - results['accuracy']))

            if (1 - results['accuracy'] < best_val_error):
                torch.save(mdl.state_dict(),
                           os.path.join(modelfile, 'model.pth'))
                best_val_loss = val_loss / num_iter
                best_val_error = 1 - results['accuracy']
                print('New saved model', best_val_loss, best_val_error)

        #Update the learning rate
        scheduler.step()

        print('Training Loss per epoch', loss_accum / epochsize)

        # - Save parameters for examining
        writer.add_scalar('Training Loss', loss_accum / epochsize, epoch)
        writer.add_scalar('Gradient norm', total_norm, epoch)
        if options.validate:
            writer.add_scalar('Validation loss', val_loss / num_iter, epoch)
            writer.add_scalar('Validation error', 1 - results['accuracy'], epoch)
        #for param_group in optimizer.param_groups:
        #print(param_group['lr'])

    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
    with io.open(os.path.join(modelfile, 'model.vars'), 'w') as f:
        f.writelines('%s=%s\n' % kv for kv in cfg.items())
Example #5
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile

    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))

    cfg.update(config.parse_variable_assignments(options.var))

    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'train')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    if options.validate:
        with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
            filelist_val = [l.strip() for l in f if l.strip()]
        filelist.extend(filelist_val)
    else:
        filelist_val = []
    # - compute spectra
    print("Computing%s spectra..." %
          (" or loading" if options.cache_spectra else ""))
    spects = []
    for fn in progress(filelist, 'File '):
        cache_fn = (options.cache_spectra
                    and os.path.join(options.cache_spectra, fn + '.npy'))
        spects.append(
            cached(cache_fn, audio.extract_spect,
                   os.path.join(datadir, 'audio', fn), sample_rate, frame_len,
                   fps))

    # - load and convert corresponding labels
    print("Loading labels...")
    labels = []
    for fn, spect in zip(filelist, spects):
        fn = os.path.join(datadir, 'labels', fn.rsplit('.', 1)[0] + '.lab')
        with io.open(fn) as f:
            segments = [l.rstrip().split() for l in f if l.rstrip()]
        segments = [(float(start), float(end), label == 'sing')
                    for start, end, label in segments]
        timestamps = np.arange(len(spect)) / float(fps)
        labels.append(create_aligned_targets(segments, timestamps, bool))

    # - prepare mel filterbank
    filterbank = audio.create_mel_filterbank(sample_rate, frame_len, mel_bands,
                                             mel_min, mel_max)
    filterbank = filterbank[:bin_mel_max].astype(floatX)

    if options.validate:
        spects_val = spects[-len(filelist_val):]
        spects = spects[:-len(filelist_val)]
        labels_val = labels[-len(filelist_val):]
        labels = labels[:-len(filelist_val)]

    # - precompute mel spectra, if needed, otherwise just define a generator
    mel_spects = (np.log(
        np.maximum(np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
                  for spect in spects)

    if not options.augment:
        mel_spects = list(mel_spects)
        del spects

    # - load mean/std or compute it, if not computed yet
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)
    try:
        with np.load(meanstd_file) as f:
            mean = f['mean']
            std = f['std']
    except (IOError, KeyError):
        print("Computing mean and standard deviation...")
        mean, std = znorm.compute_mean_std(mel_spects)
        np.savez(meanstd_file, mean=mean, std=std)
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    # - prepare training data generator
    print("Preparing training data feed...")
    if not options.augment:
        # Without augmentation, we just precompute the normalized mel spectra
        # and create a generator that returns mini-batches of random excerpts
        mel_spects = [(spect - mean) * istd for spect in mel_spects]
        batches = augment.grab_random_excerpts(mel_spects, labels, batchsize,
                                               blocklen)
    else:
        # For time stretching and pitch shifting, it pays off to preapply the
        # spline filter to each input spectrogram, so it does not need to be
        # applied to each mini-batch later.
        spline_order = cfg['spline_order']
        if spline_order > 1:
            from scipy.ndimage import spline_filter
            spects = [
                spline_filter(spect, spline_order).astype(floatX)
                for spect in spects
            ]

        # We define a function to create the mini-batch generator. This allows
        # us to easily create multiple generators for multithreading if needed.
        def create_datafeed(spects, labels):
            # With augmentation, as we want to apply random time-stretching,
            # we request longer excerpts than we finally need to return.
            max_stretch = cfg['max_stretch']
            batches = augment.grab_random_excerpts(
                spects,
                labels,
                batchsize=batchsize,
                frames=int(blocklen / (1 - max_stretch)))

            # We wrap the generator in another one that applies random time
            # stretching and pitch shifting, keeping a given number of frames
            # and bins only.
            max_shift = cfg['max_shift']
            batches = augment.apply_random_stretch_shift(batches,
                                                         max_stretch,
                                                         max_shift,
                                                         keep_frames=blocklen,
                                                         keep_bins=bin_mel_max,
                                                         order=spline_order,
                                                         prefiltered=True)

            # We transform the excerpts to mel frequency and log magnitude.
            batches = augment.apply_filterbank(batches, filterbank)
            batches = augment.apply_logarithm(batches)

            # We apply random frequency filters
            max_db = cfg['max_db']
            batches = augment.apply_random_filters(batches,
                                                   filterbank,
                                                   mel_max,
                                                   max_db=max_db)

            # We apply normalization
            batches = augment.apply_znorm(batches, mean, istd)

            return batches

        # We start the mini-batch generator and augmenter in one or more
        # background threads or processes (unless disabled).
        bg_threads = cfg['bg_threads']
        bg_processes = cfg['bg_processes']
        if not bg_threads and not bg_processes:
            # no background processing: just create a single generator
            batches = create_datafeed(spects, labels)
        elif bg_threads:
            # multithreading: create a separate generator per thread
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels) for _ in range(bg_threads)],
                num_cached=bg_threads * 5)
        elif bg_processes:
            # multiprocessing: single generator is forked along with processes
            batches = augment.generate_in_background(
                [create_datafeed(spects, labels)] * bg_processes,
                num_cached=bg_processes * 25,
                in_processes=True)

    ###########################################################################
    #-----------Main changes to code to make it work with pytorch-------------#
    ###########################################################################

    print("preparing training function...")
    mdl = model.CNNModel()
    mdl = mdl.to(device)

    #Setting up learning rate and learning rate parameters
    initial_eta = cfg['initial_eta']
    eta_decay = cfg['eta_decay']
    momentum = cfg['momentum']
    eta_decay_every = cfg.get('eta_decay_every', 1)
    eta = initial_eta

    #set up loss
    criterion = torch.nn.BCELoss()

    #set up optimizer
    optimizer = torch.optim.SGD(mdl.parameters(),
                                lr=eta,
                                momentum=momentum,
                                nesterov=True)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=eta_decay_every,
                                                gamma=eta_decay)

    #set up tensorboard writer
    writer = SummaryWriter(os.path.join(modelfile, 'runs'))

    epochs = cfg['epochs']
    epochsize = cfg['epochsize']
    batches = iter(batches)

    #conditions to save model
    best_val_loss = 100000.
    best_val_error = 1.

    for epoch in range(epochs):
        # - Initialize certain parameters that are used to monitor training
        err = 0
        total_norm = 0
        loss_accum = 0
        mdl.train(True)
        # - Compute the L-2 norm of the gradients
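        #   (at this point these are the gradients left over from the previous
        #    epoch's last update step; the norm is zero in the very first epoch)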
        for p in mdl.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item()**2
        total_norm = total_norm**(1. / 2)

        # - Start the training for this epoch
        for batch in progress(range(epochsize),
                              min_delay=0.5,
                              desc='Epoch %d/%d: Batch ' %
                              (epoch + 1, epochs)):
            data = next(batches)
            input_data = np.transpose(data[0][:, :, :, np.newaxis],
                                      (0, 3, 1, 2))
            labels = data[1][:, np.newaxis].astype(np.float32)

            #map labels to make them softer
            labels = (0.02 + 0.96 * labels)
            optimizer.zero_grad()

            outputs = mdl(torch.from_numpy(input_data).to(device))
            loss = criterion(outputs, torch.from_numpy(labels).to(device))
            loss.backward()
            optimizer.step()
            loss_accum += loss.item()

        # - Compute validation loss and error if desired
        if options.validate:

            from eval import evaluate
            mdl.train(False)
            val_loss = 0
            preds = []
            labs = []
            max_len = fps

            mel_spects_val = (np.log(
                np.maximum(np.dot(spect[:, :bin_mel_max], filterbank), 1e-7))
                              for spect in spects_val)

            mel_spects_val = [(spect - mean) * istd
                              for spect in mel_spects_val]

            num_iter = 0

            for spect, label in zip(mel_spects_val, labels_val):
                num_excerpts = len(spect) - blocklen + 1
                excerpts = np.lib.stride_tricks.as_strided(
                    spect,
                    shape=(num_excerpts, blocklen, spect.shape[1]),
                    strides=(spect.strides[0], spect.strides[0],
                             spect.strides[1]))

                # - Pass mini-batches through the network and concatenate results
                for pos in range(0, num_excerpts, batchsize):
                    input_data = np.transpose(
                        excerpts[pos:pos + batchsize, :, :, np.newaxis],
                        (0, 3, 1, 2))
                    if (pos + batchsize > num_excerpts):
                        label_batch = label[blocklen // 2 + pos:blocklen // 2 +
                                            num_excerpts,
                                            np.newaxis].astype(np.float32)
                    else:
                        label_batch = label[blocklen // 2 + pos:blocklen // 2 +
                                            pos + batchsize,
                                            np.newaxis].astype(np.float32)

                    pred = mdl(torch.from_numpy(input_data).to(device))
                    e = criterion(pred,
                                  torch.from_numpy(label_batch).to(device))
                    preds = np.append(preds, pred[:, 0].cpu().detach().numpy())
                    labs = np.append(labs, label_batch)
                    val_loss += e.item()
                    num_iter += 1

            print("Validation loss: %.3f" % (val_loss / num_iter))
            _, results = evaluate(preds, labs)
            print("Validation error: %.3f" % (1 - results['accuracy']))

            if (val_loss / num_iter < best_val_loss
                    and (1 - results['accuracy']) < best_val_error):
                torch.save(mdl.state_dict(),
                           os.path.join(modelfile, 'model.pth'))
                best_val_loss = val_loss / num_iter
                best_val_error = 1 - results['accuracy']
                print('New saved model', best_val_loss, best_val_error)

        #Update the learning rate
        scheduler.step()

        print('Training Loss per epoch', loss_accum / epochsize)

        # - Save parameters for examining
        writer.add_scalar('Training Loss', loss_accum / epochsize, epoch)
        writer.add_scalar('Gradient norm', total_norm, epoch)
        if options.validate:
            writer.add_scalar('Validation loss', val_loss / num_iter, epoch)
            writer.add_scalar('Validation error', 1 - results['accuracy'], epoch)
        for param_group in optimizer.param_groups:
            print(param_group['lr'])

    if not options.validate:
        torch.save(mdl.state_dict(), os.path.join(modelfile, 'model.pth'))
Example #6
def main():
    print(torch.cuda.is_available())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # parse command line
    parser = opts_parser()
    options = parser.parse_args()
    modelfile = options.modelfile
    lossgradient = options.lossgradient

    cfg = {}
    for fn in options.vars:
        cfg.update(config.parse_config_file(fn))

    cfg.update(config.parse_variable_assignments(options.var))

    outfile = options.outfile
    sample_rate = cfg['sample_rate']
    frame_len = cfg['frame_len']
    fps = cfg['fps']
    mel_bands = cfg['mel_bands']
    mel_min = cfg['mel_min']
    mel_max = cfg['mel_max']
    blocklen = cfg['blocklen']
    batchsize = cfg['batchsize']

    bin_nyquist = frame_len // 2 + 1
    bin_mel_max = bin_nyquist * 2 * mel_max // sample_rate

    # prepare dataset
    print("Preparing data reading...")
    datadir = os.path.join(os.path.dirname(__file__), os.path.pardir,
                           'datasets', options.dataset)

    # - load filelist
    with io.open(os.path.join(datadir, 'filelists', 'valid')) as f:
        filelist = [l.rstrip() for l in f if l.rstrip()]
    with io.open(os.path.join(datadir, 'filelists', 'test')) as f:
        filelist += [l.rstrip() for l in f if l.rstrip()]

    # - load mean/std
    meanstd_file = os.path.join(os.path.dirname(__file__),
                                '%s_meanstd.npz' % options.dataset)

    dataloader = DatasetLoader(options.dataset,
                               options.cache_spectra,
                               datadir,
                               input_type=options.input_type,
                               filelist=filelist)
    mel_spects, labels = dataloader.prepare_batches(sample_rate,
                                                    frame_len,
                                                    fps,
                                                    mel_bands,
                                                    mel_min,
                                                    mel_max,
                                                    blocklen,
                                                    batchsize,
                                                    batch_data=False)

    with np.load(meanstd_file) as f:
        mean = f['mean']
        std = f['std']
    mean = mean.astype(floatX)
    istd = np.reciprocal(std).astype(floatX)

    mdl = model.CNNModel(input_type='mel_spects_norm',
                         is_zeromean=False,
                         meanstd_file=meanstd_file,
                         device=device)
    mdl.load_state_dict(torch.load(modelfile, map_location=device))
    mdl.to(device)
    mdl.eval()

    if (lossgradient != 'None'):
        mdl_lossgrad = model.CNNModel(input_type=options.input_type,
                                      is_zeromean=False,
                                      sample_rate=sample_rate,
                                      frame_len=frame_len,
                                      fps=fps,
                                      mel_bands=mel_bands,
                                      mel_min=mel_min,
                                      mel_max=mel_max,
                                      bin_mel_max=bin_mel_max,
                                      meanstd_file=meanstd_file,
                                      device=device)
        mdl_lossgrad.load_state_dict(torch.load(lossgradient, map_location=device))
        mdl_lossgrad.to(device)
        mdl_lossgrad.eval()
        criterion = torch.nn.BCELoss()
        loss_grad_val = dataloader.prepare_loss_grad_batches(
            options.loss_grad_save, mel_spects, labels, mdl_lossgrad,
            criterion, blocklen, batchsize, device)

    # run prediction loop
    print("Predicting:")
    predictions = []
    #for spect, g in zip(mel_spects, loss_grad_val):
    c = 0
    for spect in progress(mel_spects, total=len(filelist), desc='File '):
        if (lossgradient != 'None'):
            g = loss_grad_val[c]
        c += 1
        # naive way: pass excerpts of the size used during training
        # - view spectrogram memory as a 3-tensor of overlapping excerpts
        # cast first so the strides used below match the float32 buffer
        spect = spect.astype(floatX)
        num_excerpts = len(spect) - blocklen + 1
        excerpts = np.lib.stride_tricks.as_strided(
            spect,
            shape=(num_excerpts, blocklen, spect.shape[1]),
            strides=(spect.strides[0], spect.strides[0], spect.strides[1]))
        preds = np.zeros((num_excerpts, 1))
        count = 0
        for pos in range(0, num_excerpts, batchsize):
            input_data = np.transpose(
                excerpts[pos:pos + batchsize, :, :, np.newaxis], (0, 3, 1, 2))
            input_data = (input_data - mean) * istd
            if lossgradient != 'None':
                for i in range(input_data.shape[0]):
                    if (options.lossgrad_algorithm == 'grad'):
                        rank_matrix = np.abs(g[i + pos])
                    elif (options.lossgrad_algorithm == 'gradxinp'):
                        rank_matrix = np.squeeze(g[i + pos] *
                                                 input_data[i, :, :, :])
                    elif (options.lossgrad_algorithm == 'gradorig'):
                        rank_matrix = g[i + pos]
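                    # ROAR removes the highest-ranked bins; otherwise the
                    # lowest-ranked bins are removed (v//80: frame, v%80: mel band)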
                    if (options.ROAR == 1):
                        v = np.argsort(rank_matrix,
                                       axis=None)[-cfg['occlude']:]
                    else:
                        v = np.argsort(rank_matrix, axis=None)[:cfg['occlude']]
                    input_data[i, :, v // 80, v % 80] = 0
            else:
                for i in range(input_data.shape[0]):
                    #print('random')
                    v = np.random.choice(115 * 80,
                                         cfg['occlude'],
                                         replace=False)
                    input_data[i, :, v // 80, v % 80] = 0

            count += 1

            #print('Here')
            #preds = np.vstack(mdl.forward(torch.from_numpy(
            #            np.transpose(excerpts[pos:pos + batchsize,:,:,
            #            np.newaxis],(0,3,1,2))).to(device)).cpu().detach().numpy()
            #        for pos in range(0, num_excerpts, batchsize))

            preds[pos:pos + batchsize, :] = mdl(
                torch.from_numpy(input_data).to(
                    device)).cpu().detach().numpy()
        print('Here')
        predictions.append(preds)
    # save predictions
    print("Saving predictions")
    np.savez(outfile, **{fn: pred for fn, pred in zip(filelist, predictions)})