Example #1
import numpy as np

# Helpers assumed to come from the surrounding project: wav_read, stft,
# ideal_ratio_masking, _make_overlap_sequences.


def _get_data_training(current_set, set_size, mixtures_list, sources_list,
                       window_values, fft_size, hop, seq_length,
                       context_length, batch_size):
    """Gets the actual input and output data for training.

    :param current_set: The current set of files that we are looking at.
    :type current_set: int
    :param set_size: The size of the sets that we consider.
    :type set_size: int
    :param mixtures_list: A list with the paths of the mixtures.
    :type mixtures_list: list[pathlib.Path]
    :param sources_list: A list with the paths of the sources.
    :type sources_list: list[pathlib.Path]
    :param window_values: The values of the windowing function that we will use.
    :type window_values: numpy.ndarray
    :param fft_size: The size of the FFT in samples.
    :type fft_size: int
    :param hop: The hop size in samples.
    :type hop: int
    :param seq_length: The sequence length in frames.
    :type seq_length: int
    :param context_length: The context length in frames.
    :type context_length: int
    :param batch_size: The batch size.
    :type batch_size: int
    :return: The actual input and target values.
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    """
    # Select the files of the current (1-indexed) set.
    m_list = mixtures_list[(current_set - 1) * set_size:current_set * set_size]
    s_list = sources_list[(current_set - 1) * set_size:current_set * set_size]

    ms_train, vs_train = None, None

    for m_path, s_path in zip(m_list, s_list):
        mix = wav_read(m_path.joinpath('mixture.wav'), mono=False)[0]
        vox = wav_read(s_path.joinpath('vocals.wav'), mono=False)[0]

        # Down-mix to mono (0.5 * channel sum) and take the STFT magnitudes,
        # trimming three frames at each end.
        ms_seg = stft(0.5 * np.sum(mix, axis=-1), window_values, fft_size,
                      hop)[0][3:-3, :]
        vs_seg = stft(0.5 * np.sum(vox, axis=-1), window_values, fft_size,
                      hop)[0][3:-3, :]

        if ms_train is None:
            ms_train = ms_seg
            vs_train = vs_seg
        else:
            ms_train = np.vstack((ms_train, ms_seg))
            vs_train = np.vstack((vs_train, vs_seg))

    # Mask the mixture magnitudes to obtain the vocals target, then clip
    # both input and target to [0, 1].
    vs_train = ideal_ratio_masking(ms_train, vs_train, ms_train) * 2.
    vs_train = np.clip(vs_train, a_min=0., a_max=1.)

    ms_train = np.clip(ms_train, a_min=0., a_max=1.)

    # Reshape into overlapping sequences of seq_length frames for batching.
    ms_train, vs_train, _ = _make_overlap_sequences(ms_train, vs_train,
                                                    ms_train, seq_length,
                                                    context_length * 2,
                                                    batch_size)

    return ms_train, vs_train
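
The ideal_ratio_masking helper is not included in the example above. A minimal sketch with a matching call signature is shown below; the mask formulation, the argument names, and the eps term are assumptions made for illustration, not the project's actual implementation.

import numpy as np


def ideal_ratio_masking(mixture_magnitude, target_magnitude,
                        residual_magnitude, eps=1e-6):
    # Hypothetical sketch: weight the mixture magnitudes by the ratio
    # target / (target + residual), a common ratio-mask formulation.
    mask = target_magnitude / (target_magnitude + residual_magnitude + eps)
    return mixture_magnitude * mask

Under this assumption, the call ideal_ratio_masking(ms_train, vs_train, ms_train) produces values that stay at or below about half the mixture magnitude whenever the vocals magnitude does not exceed the mixture magnitude, which would be consistent with the subsequent scaling by 2 and clipping to [0, 1].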
Example #2
import os

import numpy as np

# Helpers assumed to come from the surrounding project: wav_read, stft,
# _make_overlap_sequences.


def _get_data_testing(sources_parent_path, window_values, fft_size, hop,
                      seq_length, context_length, batch_size, usage_case):
    """Gets the actual input and output data for testing.

    :param sources_parent_path: The parent path of the sources (or, in usage
                                mode, the path of the mixture file).
    :type sources_parent_path: str
    :param window_values: The values of the windowing function that we will use.
    :type window_values: numpy.ndarray
    :param fft_size: The size of the FFT in samples.
    :type fft_size: int
    :param hop: The hop size in samples.
    :type hop: int
    :param seq_length: The sequence length in frames.
    :type seq_length: int
    :param context_length: The context length in frames.
    :type context_length: int
    :param batch_size: The batch size.
    :type batch_size: int
    :param usage_case: Flag indicating usage (inference) mode, in which no
                       target sources are available.
    :type usage_case: bool
    :return: The mixture waveform, its magnitude and phase spectrograms, and
             the true voice and background signals (None in usage mode).
    :rtype: tuple
    """
    if not usage_case:
        bass = wav_read(os.path.join(sources_parent_path, 'bass.wav'),
                        mono=False)[0]
        drums = wav_read(os.path.join(sources_parent_path, 'drums.wav'),
                         mono=False)[0]
        others = wav_read(os.path.join(sources_parent_path, 'other.wav'),
                          mono=False)[0]
        voice = wav_read(os.path.join(sources_parent_path, 'vocals.wav'),
                         mono=False)[0]

        # Down-mix each stereo signal to mono (0.5 * channel sum).
        bg_true = np.sum(bass + drums + others, axis=-1) * 0.5
        voice_true = np.sum(voice, axis=-1) * 0.5
        mix = np.sum(bass + drums + others + voice, axis=-1) * 0.5
    else:
        # Usage mode: sources_parent_path points directly at a mixture file,
        # and no ground-truth targets exist.
        mix = wav_read(sources_parent_path, mono=True)[0]
        voice_true = None
        bg_true = None

    mix_magnitude, mix_phase = stft(mix, window_values, fft_size, hop)

    # Data reshaping (magnitude and phase)
    mix_magnitude, mix_phase, _ = _make_overlap_sequences(
        mix_magnitude, mix_phase, mix_phase, seq_length, context_length * 2,
        batch_size)

    return mix, mix_magnitude, mix_phase, voice_true, bg_true
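
Both examples depend on _make_overlap_sequences, which is also not shown. The sketch below illustrates one plausible behavior, assuming the helper cuts each (frames, bins) array into overlapping sequences of seq_length frames with the given overlap and trims the sequence count to a multiple of the batch size; it is an illustration under those assumptions, not the project's actual code.

import numpy as np


def _make_overlap_sequences(first, second, third, seq_length, overlap,
                            batch_size):
    # Hypothetical sketch: split each (frames, bins) array into sequences
    # of seq_length frames, advancing by seq_length - overlap each time.
    def to_sequences(array):
        step = seq_length - overlap
        starts = range(0, array.shape[0] - seq_length + 1, step)
        sequences = np.stack([array[s:s + seq_length] for s in starts])
        # Trim so the number of sequences is a multiple of the batch size.
        remainder = sequences.shape[0] % batch_size
        if remainder:
            sequences = sequences[:-remainder]
        return sequences

    return to_sequences(first), to_sequences(second), to_sequences(third)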