Exemplo n.º 1
0
def play_synthesized_features(
        feats, norm_path='/home/ubuntu/loop/data/vctk/norm_info/norm.dat'):
    """Vocode pre-computed audio features to disk, play them inline, and
    return the audio read back from the generated file.

    Args:
        feats: mapping containing an 'audio_features' array to vocode.
        norm_path: path to the feature-normalization file used by the vocoder.

    Returns:
        (rate, wav_data) tuple as read by scipy's wavfile reader.
    """
    out_dir = './'
    out_name = 'test.wav'

    # Synthesize the wav on disk from the raw acoustic features.
    generate_merlin_wav(feats['audio_features'], out_dir, out_name, norm_path)

    # The file is read back at out_name + '.wav' — presumably
    # generate_merlin_wav appends the extension itself (TODO confirm).
    wav_path = out_name + '.wav'
    IPython.display.display(IPython.display.Audio(wav_path, autoplay=True))

    rate, wav_data = wavfile.read(wav_path)
    return rate, wav_data
Exemplo n.º 2
0
def main():
    """Synthesize a wav for one pre-computed .npz utterance with a trained
    Loop model and append the generated sample's metadata to a CSV dataset.

    Reads the module-level ``args`` namespace (checkpoint, npz, spkr, gpu,
    dataset_file). Prints an error and returns early on bad input.
    """
    weights = torch.load(args.checkpoint,
                         map_location=lambda storage, loc: storage)
    opt = torch.load(os.path.dirname(args.checkpoint) + '/args.pth')
    train_args = opt[0]

    # Phoneme-symbol -> integer code table used at training time.
    char2code = {'aa': 0, 'ae': 1, 'ah': 2, 'ao': 3, 'aw': 4, 'ax': 5,  'ay': 6,
                 'b': 7, 'ch': 8, 'd': 9, 'dh': 10, 'eh': 11, 'er': 12, 'ey': 13,
                 'f': 14, 'g': 15, 'hh': 16, 'i': 17, 'ih': 18, 'iy': 19, 'jh': 20,
                 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'ng': 25, 'ow': 26, 'oy': 27,
                 'p': 28, 'pau': 29, 'r': 30, 's': 31, 'sh': 32, 'ssil': 33,
                 't': 34, 'th': 35, 'uh': 36, 'uw': 37, 'v': 38, 'w': 39, 'y': 40,
                 'z': 41}
    nspkr = train_args.nspk

    # Locate the feature-normalization file: prefer the dataset copy, then
    # fall back to one stored next to the checkpoint.
    norm_path = None
    if os.path.exists(train_args.data + '/norm_info/norm.dat'):
        norm_path = train_args.data + '/norm_info/norm.dat'
    elif os.path.exists(os.path.dirname(args.checkpoint) + '/norm.dat'):
        norm_path = os.path.dirname(args.checkpoint) + '/norm.dat'
    else:
        print('ERROR: Failed to find norm file.')
        return
    train_args.noise = 0  # disable training-time noise injection at inference

    model = Loop(train_args)
    model.load_state_dict(weights)
    if args.gpu >= 0:
        model.cuda()
    model.eval()

    if args.spkr not in range(nspkr):
        print('ERROR: Unknown speaker id: %d.' % args.spkr)
        return

    txt, feat, spkr, output_fname = None, None, None, None
    # BUGFIX: was `args.npz is not ''` — identity comparison against a
    # string literal (SyntaxWarning since Python 3.8); use equality.
    if args.npz != '':
        txt, text, feat = npy_loader_phonemes(args.npz)

        txt = Variable(txt.unsqueeze(1), volatile=True)
        feat = Variable(feat.unsqueeze(1), volatile=True)
        spkr = Variable(torch.LongTensor([args.spkr]), volatile=True)

        fname = os.path.basename(args.npz)[:-4]
        output_fname = fname + '.gen_' + str(args.spkr)

        # Parse intent slot values (action/number/object/location) out of
        # the transcription words.
        words = np.char.split(text).tolist()
        words = [word.encode('utf-8') for word in words]
        action = 'none'
        number = 'none'
        objectt = 'none'
        location = 'none'

        # Remove extra word for special cases
        if len(words) == 7:
            words = words[1:]

        action = words[0]
        if len(words) == 2:
            objectt = words[1]
        elif len(words) > 3:
            number = words[1]
            objectt = words[2]
            location = words[-1]

        # Load the existing dataset (if any), keyed by wav path so a
        # re-generated sample overwrites its previous row.
        frames = {}
        cols = ['path', 'speakerId', 'transcription', 'action',
                'number', 'object', 'location']
        if os.path.exists(args.dataset_file):
            df = pd.read_csv(args.dataset_file)
            for row in zip(*[df[col].values.tolist() for col in cols]):
                frames[row[0]] = dict(zip(cols, row))

        # Add the freshly generated sample.
        path = os.path.join('wavs/synthetic', output_fname.strip("/") + '.wav')
        frames[path] = {'path': path,
                        'speakerId': args.spkr,
                        'transcription': text,
                        'action': action,
                        'number': number,
                        'object': objectt,
                        'location': location}

        # Collect the rows into parallel columns.
        paths, speakerIds, transcriptions = [], [], []
        actions, numbers, objects, locations = [], [], [], []
        for frame in frames.values():
            paths.append(frame['path'])
            speakerIds.append(frame['speakerId'])
            transcriptions.append(frame['transcription'])
            actions.append(frame['action'])
            numbers.append(frame['number'])
            objects.append(frame['object'])
            locations.append(frame['location'])

        # BUGFIX: the DataFrame used to be rebuilt inside the loop above on
        # every iteration; build it once after all rows are collected.
        df = pd.DataFrame(OrderedDict([('path', paths),
                                       ('speakerId', speakerIds),
                                       ('transcription', transcriptions),
                                       ('action', actions),
                                       ('number', numbers),
                                       ('object', objects),
                                       ('location', locations)]))
        df.to_csv(args.dataset_file)

    else:
        print('ERROR: Must supply npz file path or text as source.')
        return

    # Decode the phoneme codes back to symbols (debug aid, currently unused).
    key_list = list(char2code.keys())
    val_list = list(char2code.values())
    phrase = [key_list[val_list.index(letter)] for letter in txt.data.numpy()]
    #print(phrase)

    if args.gpu >= 0:
        txt = txt.cuda()
        feat = feat.cuda()
        spkr = spkr.cuda()

    out, attn = model([txt, spkr], feat)
    out, attn = trim_pred(out, attn)

    output_dir = os.path.join(os.path.dirname(args.checkpoint), 'results')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    generate_merlin_wav(out.data.cpu().numpy(),
                        output_dir,
                        output_fname,
                        norm_path)
Exemplo n.º 3
0
def generate_sample_with_loop(
        npz='',
        text='',
        spkr_id=1,
        gender=1,
        checkpoint='models/vctk-16khz-cmu-no-boundaries-all/bestmodel.pth',
        output_dir='./',
        npz_path='/home/ubuntu/loop/data/vctk-16khz-cmu-no-boundaries-all/numpy_features',
        output_file_override=None,
        ident_override=None):
    """Run a trained Loop model on either a pre-computed .npz utterance or a
    raw text string and write the synthesized wav into ``output_dir``.

    Exactly one of ``npz`` / ``text`` should be non-empty; raises if both
    are empty. Returns a dict with the model inputs, generated features,
    attention matrix, and output filename(s).
    """
    gender = np.array(gender).reshape(-1)
    out_dict = dict()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    gpu = 0

    # Load loop weights & training-time params from the checkpoint dir.
    weights = torch.load(checkpoint, map_location=lambda storage, loc: storage)
    opt = torch.load(os.path.dirname(checkpoint) + '/args.pth')
    train_args = opt[0]

    # Phoneme/speaker vocabularies come from the training feature folder.
    train_dataset = NpzFolder(
        '/home/ubuntu/loop/data/vctk-16khz-cmu-no-boundaries-all/numpy_features'
    )
    char2code = train_dataset.dict
    spkr2code = train_dataset.speakers

    # NOTE(review): hard-coded path; deliberately overrides the
    # train_args.data location the original code computed and discarded.
    norm_path = '/home/ubuntu/loop/data/vctk-16khz-cmu-no-boundaries-all/norm_info/norm.dat'
    train_args.noise = 0  # disable training-time noise injection

    valid_dataset_path = npz_path + '_valid'

    # Prepare the loop model.
    if ident_override:
        # BUGFIX: the ident variant (Loop_Ident) is commented out, so
        # `model` was left unbound here and the code crashed later with a
        # confusing NameError. Fail fast with an explicit error instead.
        raise NotImplementedError(
            'ident_override requested but Loop_Ident model is unavailable')
    else:
        model = Loop_Base(train_args)

    model.load_state_dict(weights)
    if gpu >= 0:
        model.cuda()
    model.eval()

    # Check the speaker id is valid (warn only, keep original behavior).
    if spkr_id not in range(len(spkr2code)):
        print('ERROR: Unknown speaker id: %d.' % spkr_id)

    # Build the phone sequence and a (dummy or real) feature tensor.
    txt, feat, spkr, output_fname = None, None, None, None
    # BUGFIX: was `npz is not ''` — identity comparison against a string
    # literal (SyntaxWarning since Python 3.8); use equality.
    if npz != '':
        # Use pre-calculated phonemes and features from the .npz file.
        txt, feat, pre_calc_feat = npy_loader_phonemes(
            os.path.join(npz_path, npz))

        txt = Variable(txt.unsqueeze(1), volatile=True)
        feat = Variable(feat.unsqueeze(1), volatile=True)
        spkr = Variable(torch.LongTensor([spkr_id]), volatile=True)

        output_file = os.path.basename(npz)[:-4] + '_' + str(spkr_id)

        out_dict['pre_calc_feat'] = pre_calc_feat

    elif text != '':  # BUGFIX: was `is not ''`
        # Extract phonemes from the raw text string; allocate an
        # uninitialized feature buffer sized ~20 frames per phone.
        txt = text2phone(text, char2code)
        feat = torch.FloatTensor(txt.size(0) * 20, 63)
        spkr = torch.LongTensor([spkr_id])

        txt = Variable(txt.unsqueeze(1), volatile=True)
        feat = Variable(feat.unsqueeze(1), volatile=True)
        spkr = Variable(spkr, volatile=True)

        output_file = text.replace(' ', '_')
    else:
        print('ERROR: Must supply npz file path or text as source.')
        raise Exception('Need source')

    if output_file_override:
        output_file = output_file_override

    # Move inputs to the GPU.
    if gpu >= 0:
        txt = txt.cuda()
        feat = feat.cuda()
        spkr = spkr.cuda()

    # Run the loop model to generate output acoustic features.
    if ident_override:
        loop_feat, attn = model([txt, spkr, gender],
                                feat,
                                ident_override=ident_override)
    else:
        loop_feat, attn = model([txt, spkr, gender], feat)

    loop_feat, attn = trim_pred(loop_feat, attn)

    # Collect everything the caller might want to inspect.
    out_dict['txt'] = txt[:, 0].squeeze().data.tolist()
    out_dict['spkr'] = spkr
    out_dict['feat'] = feat.data.cpu().numpy()
    out_dict['loop_feat'] = loop_feat.data.cpu().numpy()
    out_dict['attn'] = attn.squeeze().data.cpu().numpy()
    out_dict['output_file'] = output_file
    out_dict['valid_dataset_path'] = valid_dataset_path

    # Generate the .wav file from loop output features.
    generate_merlin_wav(loop_feat.data.cpu().numpy(), output_dir, output_file,
                        norm_path)

    # Also render the original features for A/B reference.
    if npz != '':  # BUGFIX: was `is not ''`
        output_orig_fname = os.path.basename(npz)[:-4] + '.orig'
        generate_merlin_wav(feat[:, 0, :].data.cpu().numpy(), output_dir,
                            output_orig_fname, norm_path)
        out_dict['output_orig_fname'] = output_orig_fname

    return out_dict
Exemplo n.º 4
0
def main():
    """Synthesize speech with a trained Loop model from either a
    pre-computed .npz utterance or a raw text string.

    Reads the module-level ``args`` namespace (checkpoint, npz, text, spkr,
    gpu). Writes the generated wav (and, for npz input, the original
    features as a reference wav) into ``<checkpoint dir>/results``.
    """
    weights = torch.load(args.checkpoint,
                         map_location=lambda storage, loc: storage)
    opt = torch.load(os.path.dirname(args.checkpoint) + '/args.pth')
    train_args = opt[0]

    # Phoneme-symbol -> integer code table used at training time.
    char2code = {'aa': 0, 'ae': 1, 'ah': 2, 'ao': 3, 'aw': 4, 'ax': 5,  'ay': 6,
                 'b': 7, 'ch': 8, 'd': 9, 'dh': 10, 'eh': 11, 'er': 12, 'ey': 13,
                 'f': 14, 'g': 15, 'hh': 16, 'i': 17, 'ih': 18, 'iy': 19, 'jh': 20,
                 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'ng': 25, 'ow': 26, 'oy': 27,
                 'p': 28, 'pau': 29, 'r': 30, 's': 31, 'sh': 32, 'ssil': 33,
                 't': 34, 'th': 35, 'uh': 36, 'uw': 37, 'v': 38, 'w': 39, 'y': 40,
                 'z': 41}
    nspkr = train_args.nspk

    # Locate the feature-normalization file: prefer the dataset copy, then
    # fall back to one stored next to the checkpoint.
    norm_path = None
    if os.path.exists(train_args.data + '/norm_info/norm.dat'):
        norm_path = train_args.data + '/norm_info/norm.dat'
    elif os.path.exists(os.path.dirname(args.checkpoint) + '/norm.dat'):
        norm_path = os.path.dirname(args.checkpoint) + '/norm.dat'
    else:
        print('ERROR: Failed to find norm file.')
        return
    train_args.noise = 0  # disable training-time noise injection at inference

    model = Loop(train_args)
    model.load_state_dict(weights)
    if args.gpu >= 0:
        model.cuda()
    model.eval()

    if args.spkr not in range(nspkr):
        print('ERROR: Unknown speaker id: %d.' % args.spkr)
        return

    txt, feat, spkr, output_fname = None, None, None, None
    # BUGFIX: was `args.npz is not ''` — identity comparison against a
    # string literal (SyntaxWarning since Python 3.8); use equality.
    if args.npz != '':
        txt, feat = npy_loader_phonemes(args.npz)

        txt = Variable(txt.unsqueeze(1), volatile=True)
        feat = Variable(feat.unsqueeze(1), volatile=True)
        spkr = Variable(torch.LongTensor([args.spkr]), volatile=True)

        fname = os.path.basename(args.npz)[:-4]
        output_fname = fname + '.gen_' + str(args.spkr)
    elif args.text != '':  # BUGFIX: was `is not ''`
        txt = text2phone(args.text, char2code)
        # Uninitialized feature buffer sized ~20 frames per phone.
        feat = torch.FloatTensor(txt.size(0)*20, 63)
        spkr = torch.LongTensor([args.spkr])

        txt = Variable(txt.unsqueeze(1), volatile=True)
        feat = Variable(feat.unsqueeze(1), volatile=True)
        spkr = Variable(spkr, volatile=True)

        # Slugify the input string into a safe file name.
        fname = args.text.replace(' ', '_')
        valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
        fname = ''.join(c for c in fname if c in valid_chars)

        output_fname = fname + '.gen_' + str(args.spkr)
    else:
        print('ERROR: Must supply npz file path or text as source.')
        return

    if args.gpu >= 0:
        txt = txt.cuda()
        feat = feat.cuda()
        spkr = spkr.cuda()

    out, attn = model([txt, spkr], feat)
    out, attn = trim_pred(out, attn)

    output_dir = os.path.join(os.path.dirname(args.checkpoint), 'results')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    generate_merlin_wav(out.data.cpu().numpy(),
                        output_dir,
                        output_fname,
                        norm_path)

    # For npz input, also render the original features for reference.
    if args.npz != '':  # BUGFIX: was `is not ''`
        output_orig_fname = os.path.basename(args.npz)[:-4] + '.orig'
        generate_merlin_wav(feat[:, 0, :].data.cpu().numpy(),
                            output_dir,
                            output_orig_fname,
                            norm_path)
Exemplo n.º 5
0
def main():
    """Synthesize speech with a trained Loop model from either a
    pre-computed .npz utterance or a raw text string.

    Reads the module-level ``args`` namespace (checkpoint, npz, text, spkr,
    gpu). The phoneme/speaker vocabularies come from the training dataset
    folder referenced by the checkpoint's saved args.
    """
    weights = torch.load(args.checkpoint,
                         map_location=lambda storage, loc: storage)
    opt = torch.load(os.path.dirname(args.checkpoint) + '/args.pth')
    train_args = opt[0]

    train_dataset = NpzFolder(train_args.data + '/numpy_features')
    char2code = train_dataset.dict
    spkr2code = train_dataset.speakers

    # NOTE(review): this variant uses the .npy-packed norm file.
    norm_path = train_args.data + '/norm_info/norm.dat.npy'
    train_args.noise = 0  # disable training-time noise injection at inference

    model = Loop(train_args)
    model.load_state_dict(weights)
    if args.gpu >= 0:
        model.cuda()
    model.eval()

    if args.spkr not in range(len(spkr2code)):
        print('ERROR: Unknown speaker id: %d.' % args.spkr)
        return

    txt, feat, spkr, output_fname = None, None, None, None
    # BUGFIX: was `args.npz is not ''` — identity comparison against a
    # string literal (SyntaxWarning since Python 3.8); use equality.
    if args.npz != '':
        txt, feat = npy_loader_phonemes(args.npz)

        txt = Variable(txt.unsqueeze(1), volatile=True)
        feat = Variable(feat.unsqueeze(1), volatile=True)
        spkr = Variable(torch.LongTensor([args.spkr]), volatile=True)

        fname = os.path.basename(args.npz)[:-4]
        output_fname = fname + '.gen_' + str(args.spkr)
    elif args.text != '':  # BUGFIX: was `is not ''`
        txt = text2phone(args.text, char2code)
        # Fixed-size uninitialized feature buffer (was 500 frames).
        feat = torch.FloatTensor(1500, 67)
        spkr = torch.LongTensor([args.spkr])

        txt = Variable(txt.unsqueeze(1), volatile=True)
        feat = Variable(feat.unsqueeze(1), volatile=True)
        spkr = Variable(spkr, volatile=True)

        fname = args.text.replace(' ', '_')
        output_fname = fname + '.gen_' + str(args.spkr)
    else:
        print('ERROR: Must supply npz file path or text as source.')
        return

    if args.gpu >= 0:
        txt = txt.cuda()
        feat = feat.cuda()
        spkr = spkr.cuda()

    out, attn = model([txt, spkr], feat)
    out, attn = trim_pred(out, attn)

    output_dir = os.path.join(os.path.dirname(args.checkpoint), 'results')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    generate_merlin_wav(out.data.cpu().numpy(), output_dir, output_fname,
                        norm_path)

    # For npz input, also render the original features for reference.
    if args.npz != '':  # BUGFIX: was `is not ''`
        output_orig_fname = os.path.basename(args.npz)[:-4] + '.orig'
        generate_merlin_wav(feat[:, 0, :].data.cpu().numpy(), output_dir,
                            output_orig_fname, norm_path)