示例#1
0
def invert_to_sound(specname):
    """Invert a saved spectrogram to a waveform, plot it and save it as WAV.

    The array stored at ``specname`` is loaded with ``np.load`` and passed to
    ``utils.ispecgram``. The resulting waveform is shown with matplotlib
    (``plt.show()`` is called, so this waits on the plot window in
    interactive backends) and then handed to ``write`` unchanged; no
    rescaling is applied before writing.

    Args:
        specname: Path of the ``.npy`` file holding the spectrogram. The
            output path is this string with ``'Inverse.wav'`` appended.
    """
    # Inversion settings forwarded verbatim to utils.ispecgram.
    inversion_options = dict(
        n_fft=512,
        hop_length=None,
        mask=True,
        log_mag=True,
        re_im=False,
        dphase=True,
        mag_only=False,
        num_iters=1000,
    )
    waveform = utils.ispecgram(np.load(specname), **inversion_options)

    plt.plot(waveform)
    plt.show()

    # 16000 is the second argument of `write` - presumably the sample rate
    # in Hz (TODO confirm against the imported `write`).
    output_path = specname + 'Inverse.wav'
    write(output_path, 16000, waveform)
示例#2
0
    #     samples = sess.run(G_sample, feed_dict={Z: sample_Z(16, Z_dim)})
    #     # fig = plot(samples)
    #     # plt.savefig('out/{}.png'.format(str(i).zfill(3)), bbox_inches='tight')
    #     i += 1
    #     # plt.close(fig)
    # every 1000 iter test
   
    # Every 1000th iteration: draw a sample from the generator, invert it to
    # audio and save it under out/ as a zero-padded, numbered WAV file.
    if it % 1000 == 0:
        # NOTE(review): sample_Z is called with the single string '3230',
        # while the commented-out call further up passes (16, Z_dim) -
        # confirm this argument is what sample_Z expects.
        samples = sess.run(G_sample, feed_dict={Z: sample_Z('3230')})
        # np.transpose without axes reverses the axis order.
        samples = np.transpose(samples)
        XXX = concat_to_spec(samples)
        ori = utils.ispecgram(XXX,
              n_fft=512,
              hop_length=None,
              mask=True,
              log_mag=True,
              re_im=False,
              dphase=True,
              mag_only=False,
              num_iters=1000)
        # 16000 is presumably the sample rate in Hz - TODO confirm against
        # the imported `write`.
        write('out/{}.wav'.format(str(i).zfill(3)), 16000, ori)
        # i numbers the output files, so advance it after each write.
        i += 1

        # print samples.shape
        # fig = plot(samples)
        # plt.savefig('out/{}.png'.format(str(i).zfill(3)), bbox_inches='tight')
        # i += 1
        # plt.close(fig)

    # Fetch the next (V_mb, A_mb) pair. `.next()` is the Python 2 iterator
    # method; presumably data_gen is a generator - under Python 3 this would
    # have to be next(data_gen). TODO confirm.
    V_mb, A_mb = data_gen.next()
    # print 'V_mb', V_mb
示例#3
0
def _read_prob_file(path):
    """Read one ``.prob`` file and return its payload as a 5-D array.

    The layout read here is five native C ints giving the dimensions,
    followed by ``dim0 * dim1 * dim2 * dim3 * dim4`` native C floats.

    Args:
        path: Path of the ``.prob`` file.

    Returns:
        A NumPy array reshaped to the five dimensions from the header.
    """
    # `with` guarantees the handle is closed even if parsing raises.
    with open(path, "rb") as f:
        dims = array.array("i")  # "i" is the typecode for a signed C int
        dims.fromfile(f, 5)
        # The payload is read from byte offset 20, i.e. directly after the
        # five-int header when a C int is 4 bytes wide.
        f.seek(20, os.SEEK_SET)
        count = dims[0] * dims[1] * dims[2] * dims[3] * dims[4]
        values = array.array("f")
        values.fromfile(f, count)
    return np.array(values).reshape(
        dims[0], dims[1], dims[2], dims[3], dims[4])


def main():
    """Turn per-frame ``.prob`` outputs into audio for a fixed list of videos.

    For every video index the function:

    1. changes into the video's data folder and reads every ``*.prob`` file
       in sorted order, keeping the argmax over the 16 values of the first
       entry of each file as that file's label;
    2. changes into ``save_folder`` and loads ``./spec.npy`` and
       ``./center.npy`` from there;
    3. pads the label sequence by seven entries on each side, repeating the
       first and last label;
    4. maps labels to a spectrogram with ``center_to_spec`` and
       ``concat_to_spec``, inverts it with ``utils.ispecgram`` and writes
       ``./VGG_out/inverse_<index>.wav``.

    All paths are hard-coded. The working directory is changed as a side
    effect and is left at ``save_folder`` afterwards.
    """
    video_all = [
        '3229', '3230', '3231', '3232', '3233', '3234', '3235', '3236', '3237'
    ]
    save_folder = '/home/yuanxin/code/bitplanes-tracking/'
    # Both are resolved relative to save_folder (see the chdir below).
    specname = './spec.npy'
    centername = './center.npy'

    for video_index in video_all:
        data_folder = '/home/yuanxin/Downloads/output/test_' + video_index
        os.chdir(data_folder)

        # One label per .prob file: index of the largest of the 16 values in
        # the first entry of the file's array.
        max_index = []
        for prob_path in sorted(glob.glob("*.prob")):
            print(prob_path)
            data = _read_prob_file(prob_path)
            max_index.append(np.argmax(data[0].reshape(16)))

        os.chdir(save_folder)
        audioname = './VGG_out/inverse_' + video_index + '.wav'
        X = np.load(specname)
        center = np.load(centername)

        # Extend the sequence by 7 labels on each side, repeating the first
        # label on the left and the last label on the right.
        label = np.pad(max_index, (7, 7),
                       'constant',
                       constant_values=(max_index[0], max_index[-1]))

        Y = center_to_spec(X[0:len(label)], center, label)
        LabelT = np.transpose(Y)
        # Single-argument print() emits the same text on Python 2 and 3.
        print('LabelT %s' % (LabelT.shape,))
        # reshape to the correct rainbowgram format
        XXX = concat_to_spec(LabelT)
        print('XXX %s' % (XXX.shape,))
        ori = utils.ispecgram(XXX,
                              n_fft=512,
                              hop_length=None,
                              mask=True,
                              log_mag=True,
                              re_im=False,
                              dphase=True,
                              mag_only=False,
                              num_iters=1000)
        # NOTE(review): 15360 is presumably the sample rate passed to
        # `write`; confirm this value is intentional.
        write(audioname, 15360, ori)
示例#4
0
def _load_resampled(path, label):
    """Read a WAV file, resample it and dump it to ``output/<label>.wav``.

    The audio is truncated to the module-level ``total_seconds`` (when that
    is truthy), resampled to ``params.sample_rate`` and returned as float32.
    """
    rate, audio = wavfile.read(path)
    if total_seconds:
        audio = audio[:rate * total_seconds]
    # Scale the sample count by the rate ratio so the clip keeps its
    # duration; truncating the duration to whole seconds first would
    # time-stretch clips whose length is not a whole number of seconds.
    target_samples = int(1.0 * params.sample_rate / rate * len(audio))
    print(f'Resampling {label}')
    audio = resample(audio, target_samples)
    audio = audio.astype(np.float32)
    librosa.output.write_wav(f'output/{label}.wav',
                             sr=params.sample_rate,
                             y=audio.astype(np.int16),
                             norm=False)
    return audio


def _log_mag_stft(audio, title):
    """Compute the magnitude spectrogram of *audio* and print its statistics.

    The ``specgram`` output is reshaped to its first two axes, transposed and
    given a leading singleton axis.
    """
    stft = specgram(audio,
                    n_fft=params.fft_size,
                    log_mag=audio_utils.log,
                    hop_length=256,
                    mag_only=True)
    stft = stft.reshape([stft.shape[0], stft.shape[1]])
    stft = stft.T[np.newaxis, :, :]
    print(f'{title}: ')
    print(
        f'Min: {np.min(stft):.4f}, Max: {np.max(stft):.4f}, Mean: {np.mean(stft):.4f}'
    )
    return stft


def _invert_and_write(result_stft, filename, num_iters):
    """Invert ``result_stft[0]`` to audio and write ``output/<filename>``.

    The inverted signal is clipped to [-1, 1] and multiplied by the
    module-level ``amplitude`` before being written as int16.
    """
    spec = result_stft[0].T[:, :, np.newaxis]
    audio = ispecgram(spec,
                      params.fft_size,
                      hop_length=256,
                      log_mag=audio_utils.log,
                      mag_only=True,
                      num_iters=num_iters)
    audio = np.clip(audio, -1, 1)
    audio *= amplitude
    librosa.output.write_wav(f'output/{filename}',
                             sr=params.sample_rate,
                             y=audio.astype(np.int16),
                             norm=False)
    print(f'Written {filename}')


def time_windowed_stft_transfer(content_file,
                                style_file,
                                window_seconds=5,
                                hop_seconds=2.5):
    """Run STFT style transfer over sliding windows of the content audio.

    Both files are loaded and resampled to ``params.sample_rate``. Their
    magnitude spectrograms are computed, and for three epochs a window of
    ``window_seconds`` is moved over the content in steps of ``hop_seconds``.
    For each full window the best-matching style excerpt is selected with
    ``get_best_style`` and ``StftTransfer.stft_transfer`` is run; the output
    replaces that window of the running result. After each epoch the result
    is inverted and written to ``output/result<epoch>.wav``; the final result
    is written to ``output/result.wav``. Intermediate files
    ``output/content.wav``, ``output/x.wav`` and ``output/style.wav`` are
    written as well.

    The function relies on names it does not define itself: ``total_seconds``,
    ``amplitude``, ``params`` and ``audio_utils``.

    Args:
        content_file: Path of the content WAV file.
        style_file: Path of the style WAV file.
        window_seconds: Length of each processing window, in seconds.
        hop_seconds: Step between consecutive windows, in seconds.

    Raises:
        ValueError: If the hop is shorter than one spectrogram frame.
    """
    content = _load_resampled(content_file, 'content')

    # The optimisation starts from the content itself; x.wav is a dump of
    # that starting signal.
    librosa.output.write_wav('output/x.wav',
                             sr=params.sample_rate,
                             y=content.flatten().astype(np.int16),
                             norm=False)

    style = _load_resampled(style_file, 'style')

    session = tf.Session()
    try:
        style_stft = _log_mag_stft(style, 'Style')
        content_stft = _log_mag_stft(content, 'Content')

        # Slicing a NumPy array yields a view, so an explicit copy is needed
        # to keep window writes into the result from altering the content.
        result_stft = content_stft.copy()

        # Convert seconds to spectrogram frames via frames-per-second.
        frames_per_second = content_stft.shape[1] / (len(content) /
                                                     params.sample_rate)
        hop_samples = int(frames_per_second * hop_seconds)
        window_samples = int(frames_per_second * window_seconds)
        if hop_samples < 1:
            raise ValueError(
                'hop_seconds is shorter than one spectrogram frame')

        transferrer = StftTransfer(
            session,
            (content_stft.shape[0], window_samples, content_stft.shape[2]))

        print('Starting window shifting')
        for j in range(3):
            # Each epoch uses a frozen snapshot of the previous result as
            # its content target.
            content_stft = result_stft.copy()

            # Later epochs start at a random offset so that window borders
            # do not fall on the same frames every time.
            start = 0 if j == 0 else random.randint(0, hop_samples)
            for i in range(start, result_stft.shape[1], hop_samples):
                start_time = time.time()
                current_content = content_stft[:, i:i + window_samples, :]
                current_initial = result_stft[:, i:i + window_samples, :]
                # Skip the trailing, incomplete windows.
                if current_content.shape[1] != window_samples:
                    continue

                current_style = get_best_style(transferrer,
                                               current_initial.copy(),
                                               style_stft, hop_samples // 4)

                current_result_stft = transferrer.stft_transfer(
                    content_stft=current_content,
                    style_stft=current_style,
                    initial_stft=current_initial,
                    maxiter=20)
                result_stft[:, i:i + window_samples, :] = current_result_stft
                print(
                    f'Epoch {j} last sample: {(i/result_stft.shape[1]):.4f} elapsed time: {(time.time() - start_time):.4f}'
                )

            # Quick inversion so progress can be auditioned after each epoch.
            _invert_and_write(result_stft, f'result{j}.wav', num_iters=20)

        print('Inverting specgram')
        _invert_and_write(result_stft, 'result.wav', num_iters=100)
    finally:
        # Release the TensorFlow session even when a step above fails.
        session.close()