Example #1
        X, y = spec_utils.align_wave_head_and_tail(X, y, args.sr)

        v = X - y
        sf.write(input_i, y.T, args.sr)
        sf.write(input_v, v.T, args.sr)
        subprocess.call(cmd_i, stderr=subprocess.DEVNULL)
        subprocess.call(cmd_v, stderr=subprocess.DEVNULL)

        y, _ = librosa.load(output_i,
                            args.sr,
                            False,
                            dtype=np.float32,
                            res_type='kaiser_fast')
        v, _ = librosa.load(output_v,
                            args.sr,
                            False,
                            dtype=np.float32,
                            res_type='kaiser_fast')
        X = y + v

        spec = spec_utils.calc_spec(X, args.hop_length)
        np.save(outpath_mix, np.abs(spec))

        spec = spec_utils.calc_spec(y, args.hop_length)
        np.save(outpath_inst, np.abs(spec))

        os.remove(input_i)
        os.remove(input_v)
        os.remove(output_i)
        os.remove(output_v)
def main():
    """CLI entry point: split a stereo mixture into instrumental and vocal stems.

    Loads a trained CascadedASPPNet (PyTorch), runs windowed spectrogram
    inference twice (plain and half-window-rolled, a simple test-time
    augmentation), averages the two predicted masks, and writes
    ``<name>_Instruments.wav`` / ``<name>_Vocals.wav``.  With ``--out_mask``
    it also dumps the predicted vocal mask as a PNG.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--gpu', '-g', type=int, default=-1)
    p.add_argument(
        '--model',
        '-m',
        type=str,
        default=
        '/content/drive/My Drive/vocal-remover/models/MultiGenreModelNP.pth')
    p.add_argument('--input', '-i', required=True)
    p.add_argument('--sr', '-r', type=int, default=44100)
    p.add_argument('--hop_length', '-l', type=int, default=1024)
    p.add_argument('--window_size', '-w', type=int, default=512)
    p.add_argument('--out_mask', '-M', action='store_true')
    p.add_argument('--postprocess', '-p', action='store_true')
    args = p.parse_args()

    print('loading model...', end=' ')
    # Weights are always loaded onto the CPU first, then moved to the GPU
    # below if one was requested and is available.
    device = torch.device('cpu')
    model = nets.CascadedASPPNet()
    model.load_state_dict(torch.load(args.model, map_location=device))
    if torch.cuda.is_available() and args.gpu >= 0:
        device = torch.device('cuda:{}'.format(args.gpu))
        model.to(device)
    print('done')

    print('loading wave source...', end=' ')
    # Positional args to librosa.load: target sample rate, mono=False
    # (keep both stereo channels).
    X, sr = librosa.load(args.input,
                         args.sr,
                         False,
                         dtype=np.float32,
                         res_type='kaiser_fast')
    print('done')

    print('stft of wave source...', end=' ')
    X = spec_utils.calc_spec(X, args.hop_length)
    # Split into magnitude and unit-modulus phase; the phase is reused
    # unchanged for the inverse STFT of both stems.
    X, phase = np.abs(X), np.exp(1.j * np.angle(X))
    coeff = X.max()
    X /= coeff  # normalize magnitudes to [0, 1] for the network
    print('done')

    offset = model.offset
    l, r, roi_size = dataset.make_padding(X.shape[2], args.window_size, offset)
    X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
    # Second, half-window-shifted copy of the spectrogram for TTA.
    X_roll = np.roll(X_pad, roi_size // 2, axis=2)

    model.eval()
    with torch.no_grad():
        masks = []
        masks_roll = []
        for i in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
            start = i * roi_size
            # Batch of two windows: the plain one and the rolled one.
            X_window = torch.from_numpy(
                np.asarray([
                    X_pad[:, :, start:start + args.window_size],
                    X_roll[:, :, start:start + args.window_size]
                ])).to(device)
            pred = model.predict(X_window)
            pred = pred.detach().cpu().numpy()
            masks.append(pred[0])
            masks_roll.append(pred[1])

        # Stitch the per-window masks, trim padding back to the input
        # length, undo the roll on the second pass, and average the two.
        mask = np.concatenate(masks, axis=2)[:, :, :X.shape[2]]
        mask_roll = np.concatenate(masks_roll, axis=2)[:, :, :X.shape[2]]
        mask = (mask + np.roll(mask_roll, -roi_size // 2, axis=2)) / 2

    if args.postprocess:
        vocal = X * (1 - mask) * coeff
        mask = spec_utils.mask_uninformative(mask, vocal)

    # The mask keeps the instrumental part; its complement keeps the vocals.
    inst = X * mask * coeff
    vocal = X * (1 - mask) * coeff

    basename = os.path.splitext(os.path.basename(args.input))[0]

    print('inverse stft of instruments...', end=' ')
    wav = spec_utils.spec_to_wav(inst, phase, args.hop_length)
    print('done')
    sf.write('{}_Instruments.wav'.format(basename), wav.T, sr)

    print('inverse stft of vocals...', end=' ')
    wav = spec_utils.spec_to_wav(vocal, phase, args.hop_length)
    print('done')
    sf.write('{}_Vocals.wav'.format(basename), wav.T, sr)

    if args.out_mask:
        # Build an HxWx3 image: channels 1 and 2 are the two stem channels
        # of the vocal mask, channel 0 their per-pixel max; [::-1]
        # presumably flips vertically so low frequencies sit at the bottom
        # of the image -- TODO confirm.
        norm_mask = np.uint8((1 - mask) * 255).transpose(1, 2, 0)
        norm_mask = np.concatenate(
            [np.max(norm_mask, axis=2, keepdims=True), norm_mask],
            axis=2)[::-1]
        _, bin_mask = cv2.imencode('.png', norm_mask)
        with open('{}_Mask.png'.format(basename), mode='wb') as f:
            bin_mask.tofile(f)
Example #3
def main():
    """CLI entry point: split a stereo mixture into instrumental and vocal
    stems with CascadedASPPNet (single-pass windowed inference, no TTA).

    Writes ``<name>_Instrumental.wav`` and ``<name>_Vocal.wav``; with
    ``--out_mask`` it also writes the vocal mask to ``mask.png``.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--gpu', '-g', type=int, default=-1)
    p.add_argument('--model', '-m', type=str, default='models/baseline.pth')
    p.add_argument('--input', '-i', required=True)
    p.add_argument('--sr', '-r', type=int, default=44100)
    p.add_argument('--hop_length', '-l', type=int, default=1024)
    p.add_argument('--window_size', '-w', type=int, default=512)
    p.add_argument('--out_mask', '-M', action='store_true')
    p.add_argument('--postprocess', '-p', action='store_true')
    args = p.parse_args()

    print('loading model...', end=' ')
    # Load on CPU first; moved to the GPU below if requested and available.
    device = torch.device('cpu')
    model = nets.CascadedASPPNet()
    model.load_state_dict(torch.load(args.model, map_location=device))
    if torch.cuda.is_available() and args.gpu >= 0:
        device = torch.device('cuda:{}'.format(args.gpu))
        model.to(device)
    print('done')

    print('loading wave source...', end=' ')
    # Positional args to librosa.load: target sample rate, mono=False
    # (keep both stereo channels).
    X, sr = librosa.load(args.input,
                         args.sr,
                         False,
                         dtype=np.float32,
                         res_type='kaiser_fast')
    print('done')

    print('wave source stft...', end=' ')
    X = spec_utils.calc_spec(X, args.hop_length)
    # Split into magnitude and unit-modulus phase; the phase is reused
    # unchanged for the inverse STFT of both stems.
    X, phase = np.abs(X), np.exp(1.j * np.angle(X))
    coeff = X.max()
    X /= coeff  # normalize magnitudes to [0, 1] for the network
    print('done')

    offset = model.offset
    l, r, roi_size = dataset.make_padding(X.shape[2], args.window_size, offset)
    X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')

    masks = []
    model.eval()
    with torch.no_grad():
        # Slide over the padded spectrogram in roi_size steps.
        for j in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
            start = j * roi_size
            X_window = X_pad[None, :, :, start:start + args.window_size]
            pred = model.predict(torch.from_numpy(X_window).to(device))
            pred = pred.detach().cpu().numpy()
            masks.append(pred[0])

    # Stitch the per-window masks and trim padding back to the input length.
    mask = np.concatenate(masks, axis=2)[:, :, :X.shape[2]]
    if args.postprocess:
        vocal_pred = X * (1 - mask) * coeff
        mask = spec_utils.mask_uninformative(mask, vocal_pred)
    # The mask keeps the instrumental part; its complement keeps the vocals.
    inst_pred = X * mask * coeff
    vocal_pred = X * (1 - mask) * coeff

    if args.out_mask:
        # Render the vocal mask as a 3-channel image (cv2 writes channels
        # in BGR order): G/R carry the two stem channels, B their per-pixel
        # max; [::-1] flips the rows vertically.
        norm_mask = np.uint8((1 - mask) * 255)
        canvas = np.zeros((norm_mask.shape[1], norm_mask.shape[2], 3))
        canvas[:, :, 1] = norm_mask[0]
        canvas[:, :, 2] = norm_mask[1]
        canvas[:, :, 0] = np.max(norm_mask, axis=0)
        cv2.imwrite('mask.png', canvas[::-1])

    basename = os.path.splitext(os.path.basename(args.input))[0]

    print('instrumental inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(inst_pred, phase, args.hop_length)
    print('done')
    sf.write('{}_Instrumental.wav'.format(basename), wav.T, sr)

    print('vocal inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(vocal_pred, phase, args.hop_length)
    print('done')
    sf.write('{}_Vocal.wav'.format(basename), wav.T, sr)
Example #4
def main():
    """CLI entry point (Chainer variant): split a stereo mixture into
    instrumental and vocal stems with MultiBandUNet, using channel-flip
    test-time augmentation.

    Writes ``instrumental.wav`` and ``vocal.wav`` to the working directory;
    with ``--out_mask`` it also writes a MAGMA-colormapped ``mask.png``.

    NOTE(review): ``librosa.output.write_wav`` was removed in librosa 0.8,
    so this script presumably requires librosa < 0.8 -- confirm the pinned
    version.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--gpu', '-g', type=int, default=-1)
    p.add_argument('--model', '-m', type=str, default='models/baseline.npz')
    p.add_argument('--input', '-i', required=True)
    p.add_argument('--sr', '-r', type=int, default=44100)
    p.add_argument('--hop_length', '-l', type=int, default=1024)
    p.add_argument('--window_size', '-w', type=int, default=1024)
    p.add_argument('--out_mask', '-M', action='store_true')
    p.add_argument('--postprocess', '-p', action='store_true')
    args = p.parse_args()

    print('loading model...', end=' ')
    model = unet.MultiBandUNet()
    chainer.serializers.load_npz(args.model, model)
    if args.gpu >= 0:
        chainer.backends.cuda.check_cuda_available()
        chainer.backends.cuda.get_device(args.gpu).use()
        model.to_gpu()
    xp = model.xp  # numpy or cupy, depending on where the model lives
    print('done')

    print('loading wave source...', end=' ')
    # Positional args to librosa.load: target sample rate, mono=False
    # (keep both stereo channels).
    X, sr = librosa.load(args.input,
                         args.sr,
                         False,
                         dtype=np.float32,
                         res_type='kaiser_fast')
    print('done')

    print('wave source stft...', end=' ')
    X, phase = spec_utils.calc_spec(X, args.hop_length, phase=True)
    coeff = X.max()
    X /= coeff  # normalize magnitudes to [0, 1] for the network
    print('done')

    # The usable output of each window is `offset` frames narrower on each
    # side than the input window, so reflect-pad before sliding; `right`
    # rounds the frame count up to a whole number of roi_size strides.
    left = model.offset
    roi_size = args.window_size - left * 2
    right = roi_size - (X.shape[2] % roi_size) + left
    X_pad = np.pad(X, ((0, 0), (0, 0), (left, right)), mode='reflect')

    masks = []
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        for j in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
            start = j * roi_size
            X_window = X_pad[None, :, :, start:start + args.window_size]
            # TTA: also run the channel-flipped window, then average.
            X_tta = np.concatenate([X_window, X_window[:, ::-1, :, :]])

            pred = model(xp.asarray(X_tta))
            pred = backends.cuda.to_cpu(pred.data)
            pred[1] = pred[1, ::-1, :, :]  # undo the channel flip
            masks.append(pred.mean(axis=0))

    # Stitch the per-window masks and trim padding back to the input length.
    mask = np.concatenate(masks, axis=2)[:, :, :X.shape[2]]
    if args.postprocess:
        vocal_pred = X * (1 - mask) * coeff
        mask = spec_utils.mask_uninformative(mask, vocal_pred)
    # The mask keeps the instrumental part; its complement keeps the vocals.
    inst_pred = X * mask * coeff
    vocal_pred = X * (1 - mask) * coeff

    if args.out_mask:
        # Channel-averaged mask rendered as a heatmap; [::-1] flips rows.
        norm_mask = np.uint8(mask.mean(axis=0) * 255)[::-1]
        hm = cv2.applyColorMap(norm_mask, cv2.COLORMAP_MAGMA)
        cv2.imwrite('mask.png', hm)

    print('instrumental inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(inst_pred, phase, args.hop_length)
    print('done')
    librosa.output.write_wav('instrumental.wav', wav, sr)

    print('vocal inverse stft...', end=' ')
    wav = spec_utils.spec_to_wav(vocal_pred, phase, args.hop_length)
    print('done')
    librosa.output.write_wav('vocal.wav', wav, sr)
Example #5
        chainer.backends.cuda.check_cuda_available()
        chainer.backends.cuda.get_device(args.gpu).use()
        model.to_gpu()
    xp = model.xp
    print('done')

    print('loading wave source...', end=' ')
    X, sr = librosa.load(args.input,
                         args.sr,
                         False,
                         dtype=np.float32,
                         res_type='kaiser_fast')
    print('done')

    print('wave source stft...', end=' ')
    X, phase = spec_utils.calc_spec(X, args.hop_length, phase=True)
    coeff = X.max()
    X /= coeff
    print('done')

    left = model.offset
    roi_size = args.window_size - left * 2
    right = roi_size - (X.shape[2] % roi_size) + left
    X_pad = np.pad(X, ((0, 0), (0, 0), (left, right)), mode='reflect')

    masks = []
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        for j in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
            start = j * roi_size
            X_window = X_pad[None, :, :, start:start + args.window_size]
            X_tta = np.concatenate([X_window, X_window[:, ::-1, :, :]])
Example #6
def upload_file():
    """Flask view: accept an uploaded audio file, split it into instrumental
    and vocal stems with the MultiBandUNet model, and render the result page.

    Side effects: purges stale uploads older than 30 minutes, saves the new
    upload, and writes ``app/static/<name>_instrumental.wav`` and
    ``app/static/<name>_vocal.wav``.

    NOTE(review): implicitly returns None on GET or when the file is
    missing/disallowed -- presumably the route only accepts POST; confirm
    against the route decorator.
    """
    if request.method == 'POST':
        # CLEANUP SCRIPT: delete allowed files older than 30 minutes so the
        # upload folder does not grow without bound.
        upload_folder = app.config['UPLOAD_FOLDER']
        now = time.time()

        for filename in os.listdir(upload_folder):
            if allowed_file(filename):
                file_path = os.path.join(upload_folder, filename)
                if os.path.getmtime(file_path) < now - 30 * 60:
                    if os.path.isfile(file_path):
                        print(filename)
                        os.remove(file_path)

        f = request.files['file']
        if f and allowed_file(f.filename):
            # SECURITY: f.filename is attacker-controlled; consider passing
            # it through werkzeug.utils.secure_filename before joining.
            path = os.path.join(upload_folder, f.filename)
            f.save(path)
            # INFERENCE.PY -- inlined copy of the stand-alone inference
            # script, with the CLI arguments hard-coded below.
            import chainer
            from chainer import backends
            import cv2
            import librosa
            import numpy as np
            from tqdm import tqdm

            from lib import spec_utils
            from lib import unet

            # Hard-coded equivalents of the original argparse options.
            agpu = -1           # --gpu: CPU only
            amodel = models_path  # --model
            ainput = path       # --input
            asr = 44100         # --sr
            ahop_length = 1024  # --hop_length
            awindow_size = 1024  # --window_size
            aout_mask = False   # --out_mask

            print('loading model...', end=' ')
            model = unet.MultiBandUNet()
            chainer.serializers.load_npz(amodel, model)
            if agpu >= 0:
                chainer.backends.cuda.check_cuda_available()
                chainer.backends.cuda.get_device(agpu).use()
                model.to_gpu()
            xp = model.xp  # numpy or cupy, depending on the device
            print('done')

            # CHANGE DURATION FOR PAID VERSION: free tier is capped at 30 s.
            print('loading wave source...', end=' ')
            X, sr = librosa.load(ainput,
                                 asr,
                                 False,
                                 duration=30.0,
                                 dtype=np.float32,
                                 res_type='kaiser_fast')
            print('done')

            print('wave source stft...', end=' ')
            X, phase = spec_utils.calc_spec(X, ahop_length, phase=True)
            coeff = X.max()
            X /= coeff  # normalize magnitudes to [0, 1] for the network
            print('done')

            left = model.offset
            roi_size = awindow_size - left * 2
            # BUGFIX: pad up to a whole multiple of roi_size (was
            # `% left`, which can under-pad the last window); this now
            # matches the stand-alone inference script.
            right = roi_size - (X.shape[2] % roi_size) + left
            X_pad = np.pad(X, ((0, 0), (0, 0), (left, right)), mode='reflect')

            masks = []
            with chainer.no_backprop_mode(), chainer.using_config(
                    'train', False):
                for j in tqdm(range(int(np.ceil(X.shape[2] / roi_size)))):
                    start = j * roi_size
                    X_window = X_pad[None, :, :, start:start + awindow_size]
                    # TTA: also run the channel-flipped window and average.
                    X_tta = np.concatenate([X_window, X_window[:, ::-1, :, :]])

                    pred = model(xp.asarray(X_tta))
                    pred = backends.cuda.to_cpu(pred.data)
                    pred[1] = pred[1, ::-1, :, :]  # undo the channel flip
                    masks.append(pred.mean(axis=0))

            # Stitch windows, trim padding back to the input length.
            mask = np.concatenate(masks, axis=2)[:, :, :X.shape[2]]
            # vocal_pred = X * (1 - mask) * coeff
            # mask = spec_utils.mask_uninformative(mask, vocal_pred)
            inst_pred = X * mask * coeff
            vocal_pred = X * (1 - mask) * coeff

            if aout_mask:
                norm_mask = np.uint8(mask.mean(axis=0) * 255)[::-1]
                hm = cv2.applyColorMap(norm_mask, cv2.COLORMAP_MAGMA)
                cv2.imwrite('mask.png', hm)

            print('instrumental inverse stft...', end=' ')
            wav = spec_utils.spec_to_wav(inst_pred, phase, ahop_length)
            print('done')
            instrumental = f.filename.split('.')[0] + '_instrumental.wav'
            librosa.output.write_wav('app/static/' + instrumental, wav, sr)

            print('vocal inverse stft...', end=' ')
            wav = spec_utils.spec_to_wav(vocal_pred, phase, ahop_length)
            print('done')

            vocal = f.filename.split('.')[0] + '_vocal.wav'
            librosa.output.write_wav('app/static/' + vocal, wav, sr)
            return render_template('uploaded.html',
                                   title='Success',
                                   original=f.filename,
                                   instrumental=instrumental,
                                   vocal=vocal)