def main():
    p = argparse.ArgumentParser()
    p.add_argument('--gpu', '-g', type=int, default=-1)
    p.add_argument(
        '--model',
        '-m',
        type=str,
        default=
        '/content/drive/My Drive/vocal-remover/models/MultiGenreModelNP.pth')
    p.add_argument('--input', '-i', required=True)
    p.add_argument('--sr', '-r', type=int, default=44100)
    p.add_argument('--hop_length', '-l', type=int, default=1024)
    p.add_argument('--window_size', '-w', type=int, default=512)
    p.add_argument('--out_mask', '-M', action='store_true')
    p.add_argument('--postprocess', '-p', action='store_true')
    args = p.parse_args()

    print('loading model...', end=' ')
    device = torch.device('cpu')
    model = nets.CascadedASPPNet()
    model.load_state_dict(torch.load(args.model, map_location=device))
    if torch.cuda.is_available() and args.gpu >= 0:
        device = torch.device('cuda:{}'.format(args.gpu))
        model.to(device)
    print('done')

    print('loading wave source...', end=' ')
    X, sr = librosa.load(args.input,
                         args.sr,
                         False,
                         dtype=np.float32,
                         res_type='kaiser_fast')
    print('done')

    print('stft of wave source...', end=' ')
    X = spec_utils.calc_spec(X, args.hop_length)
    X, phase = np.abs(X), np.exp(1.j * np.angle(X))
    coeff = X.max()
    X /= coeff
    print('done')

    offset = model.offset
    l, r, roi_size = dataset.make_padding(X.shape[2], args.window_size, offset)
    X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
    X_roll = np.roll(X_pad, roi_size // 2, axis=2)

    model.eval()
    with torch.no_grad():
        masks = []
        masks_roll = []
        for i in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
            start = i * roi_size
            X_window = torch.from_numpy(
                np.asarray([
                    X_pad[:, :, start:start + args.window_size],
                    X_roll[:, :, start:start + args.window_size]
                ])).to(device)
            pred = model.predict(X_window)
            pred = pred.detach().cpu().numpy()
            masks.append(pred[0])
            masks_roll.append(pred[1])

        mask = np.concatenate(masks, axis=2)[:, :, :X.shape[2]]
        mask_roll = np.concatenate(masks_roll, axis=2)[:, :, :X.shape[2]]
        mask = (mask + np.roll(mask_roll, -roi_size // 2, axis=2)) / 2

    if args.postprocess:
        vocal = X * (1 - mask) * coeff
        mask = spec_utils.mask_uninformative(mask, vocal)

    inst = X * mask * coeff
    vocal = X * (1 - mask) * coeff

    basename = os.path.splitext(os.path.basename(args.input))[0]

    print('inverse stft of instruments...', end=' ')
    wav = spec_utils.spec_to_wav(inst, phase, args.hop_length)
    print('done')
    sf.write('{}_Instruments.wav'.format(basename), wav.T, sr)

    print('inverse stft of vocals...', end=' ')
    wav = spec_utils.spec_to_wav(vocal, phase, args.hop_length)
    print('done')
    sf.write('{}_Vocals.wav'.format(basename), wav.T, sr)

    if args.out_mask:
        norm_mask = np.uint8((1 - mask) * 255).transpose(1, 2, 0)
        norm_mask = np.concatenate(
            [np.max(norm_mask, axis=2, keepdims=True), norm_mask],
            axis=2)[::-1]
        _, bin_mask = cv2.imencode('.png', norm_mask)
        with open('{}_Mask.png'.format(basename), mode='wb') as f:
            bin_mask.tofile(f)
示例#2
0
def main():
    p = argparse.ArgumentParser()
    p.add_argument('--gpu', '-g', type=int, default=-1)
    p.add_argument('--model', '-m', type=str, default='models/baseline.npz')
    p.add_argument('--input', '-i', required=True)
    p.add_argument('--sr', '-r', type=int, default=44100)
    p.add_argument('--hop_length', '-l', type=int, default=1024)
    p.add_argument('--window_size', '-w', type=int, default=1024)
    p.add_argument('--out_mask', '-M', action='store_true')
    p.add_argument('--postprocess', '-p', action='store_true')
    args = p.parse_args()

    print('loading model...', end=' ')
    model = unet.MultiBandUNet()
    chainer.serializers.load_npz(args.model, model)
    if args.gpu >= 0:
        chainer.backends.cuda.check_cuda_available()
        chainer.backends.cuda.get_device(args.gpu).use()
        model.to_gpu()
    xp = model.xp
    print('done')

    print('loading wave source...', end=' ')
    X, sr = librosa.load(args.input,
                         args.sr,
                         False,
                         dtype=np.float32,
                         res_type='kaiser_fast')
    print('done')

    print('wave source stft...', end=' ')
    X, phase = spec_utils.calc_spec(X, args.hop_length, phase=True)
    coeff = X.max()
    X /= coeff
    print('done')

    left = model.offset
    roi_size = args.window_size - left * 2
    right = roi_size - (X.shape[2] % roi_size) + left
    X_pad = np.pad(X, ((0, 0), (0, 0), (left, right)), mode='reflect')

    masks = []
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        for j in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
            start = j * roi_size
            X_window = X_pad[None, :, :, start:start + args.window_size]
            X_tta = np.concatenate([X_window, X_window[:, ::-1, :, :]])

            pred = model(xp.asarray(X_tta))
            pred = backends.cuda.to_cpu(pred.data)
            pred[1] = pred[1, ::-1, :, :]
            masks.append(pred.mean(axis=0))

    mask = np.concatenate(masks, axis=2)[:, :, :X.shape[2]]
    if args.postprocess:
        vocal_pred = X * (1 - mask) * coeff
        mask = spec_utils.mask_uninformative(mask, vocal_pred)
    inst_pred = X * mask * coeff
    vocal_pred = X * (1 - mask) * coeff

    if args.out_mask:
        norm_mask = np.uint8(mask.mean(axis=0) * 255)[::-1]
        hm = cv2.applyColorMap(norm_mask, cv2.COLORMAP_MAGMA)
        cv2.imwrite('mask.png', hm)

    print('instrumental inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(inst_pred, phase, args.hop_length)
    print('done')
    librosa.output.write_wav('instrumental.wav', wav, sr)

    print('vocal inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(vocal_pred, phase, args.hop_length)
    print('done')
    librosa.output.write_wav('vocal.wav', wav, sr)
示例#3
0
def main():
    p = argparse.ArgumentParser()
    p.add_argument('--gpu', '-g', type=int, default=-1)
    p.add_argument('--model', '-m', type=str, default='models/baseline.pth')
    p.add_argument('--input', '-i', required=True)
    p.add_argument('--sr', '-r', type=int, default=44100)
    p.add_argument('--hop_length', '-l', type=int, default=1024)
    p.add_argument('--window_size', '-w', type=int, default=512)
    p.add_argument('--out_mask', '-M', action='store_true')
    p.add_argument('--postprocess', '-p', action='store_true')
    args = p.parse_args()

    print('loading model...', end=' ')
    device = torch.device('cpu')
    model = nets.CascadedASPPNet()
    model.load_state_dict(torch.load(args.model, map_location=device))
    if torch.cuda.is_available() and args.gpu >= 0:
        device = torch.device('cuda:{}'.format(args.gpu))
        model.to(device)
    print('done')

    print('loading wave source...', end=' ')
    X, sr = librosa.load(args.input,
                         args.sr,
                         False,
                         dtype=np.float32,
                         res_type='kaiser_fast')
    print('done')

    print('wave source stft...', end=' ')
    X = spec_utils.calc_spec(X, args.hop_length)
    X, phase = np.abs(X), np.exp(1.j * np.angle(X))
    coeff = X.max()
    X /= coeff
    print('done')

    offset = model.offset
    l, r, roi_size = dataset.make_padding(X.shape[2], args.window_size, offset)
    X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')

    masks = []
    model.eval()
    with torch.no_grad():
        for j in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
            start = j * roi_size
            X_window = X_pad[None, :, :, start:start + args.window_size]
            pred = model.predict(torch.from_numpy(X_window).to(device))
            pred = pred.detach().cpu().numpy()
            masks.append(pred[0])

    mask = np.concatenate(masks, axis=2)[:, :, :X.shape[2]]
    if args.postprocess:
        vocal_pred = X * (1 - mask) * coeff
        mask = spec_utils.mask_uninformative(mask, vocal_pred)
    inst_pred = X * mask * coeff
    vocal_pred = X * (1 - mask) * coeff

    if args.out_mask:
        norm_mask = np.uint8((1 - mask) * 255)
        canvas = np.zeros((norm_mask.shape[1], norm_mask.shape[2], 3))
        canvas[:, :, 1] = norm_mask[0]
        canvas[:, :, 2] = norm_mask[1]
        canvas[:, :, 0] = np.max(norm_mask, axis=0)
        cv2.imwrite('mask.png', canvas[::-1])

    basename = os.path.splitext(os.path.basename(args.input))[0]

    print('instrumental inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(inst_pred, phase, args.hop_length)
    print('done')
    sf.write('{}_Instrumental.wav'.format(basename), wav.T, sr)

    print('vocal inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(vocal_pred, phase, args.hop_length)
    print('done')
    sf.write('{}_Vocal.wav'.format(basename), wav.T, sr)
示例#4
0
    masks = []
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        for j in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
            start = j * roi_size
            X_window = X_pad[None, :, :, start:start + args.window_size]
            X_tta = np.concatenate([X_window, X_window[:, ::-1, :, :]])

            pred = model(xp.asarray(X_tta))
            pred = backends.cuda.to_cpu(pred.data)
            pred[1] = pred[1, ::-1, :, :]
            masks.append(pred.mean(axis=0))

    mask = np.concatenate(masks, axis=2)[:, :, :X.shape[2]]
    if args.postprocess:
        vocal_pred = X * (1 - mask) * coeff
        mask = spec_utils.mask_uninformative(mask, vocal_pred)
    inst_pred = X * mask * coeff
    vocal_pred = X * (1 - mask) * coeff

    if args.out_mask:
        norm_mask = np.uint8(mask.mean(axis=0) * 255)[::-1]
        hm = cv2.applyColorMap(norm_mask, cv2.COLORMAP_MAGMA)
        cv2.imwrite('mask.png', hm)

    print('instrumental inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(inst_pred, phase, args.hop_length)
    print('done')
    librosa.output.write_wav('instrumental.wav', wav, sr)

    print('vocal inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(vocal_pred, phase, args.hop_length)