Example #1
    def load_songs(self):
        # Load every train/val file; vocal stems are always kept in memory,
        # while mixtures are kept only with a probability derived from
        # in_memory_percentage (the rest get a [-1] placeholder and are
        # lazy-loaded later, see retrieve_sequence in Example #7)
        for subset in ['train', 'val']:
            for condition in ['mixture', 'vocals']:
                for filepath in self.file_paths[subset][condition]:

                    if condition == 'vocals':
                        sequence = util.load_wav(filepath, self.sample_rate)
                        self.sequences[subset][condition].append(sequence)
                        self.num_sequences_in_memory += 1

                        if self.extract_voice_percent > 0:
                            # Remember which sample ranges actually contain singing
                            self.voice_indices[subset].append(
                                util.get_sequence_with_singing_indices(
                                    sequence))
                    else:
                        if (self.in_memory_percentage == 1
                                or np.random.uniform(0, 1)
                                <= (self.in_memory_percentage - 0.5) * 2):
                            sequence = util.load_wav(filepath,
                                                     self.sample_rate)
                            self.sequences[subset][condition].append(sequence)
                            self.num_sequences_in_memory += 1
                        else:
                            self.sequences[subset][condition].append([-1])
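Every example on this page calls util.load_wav(path, sample_rate), which none of the excerpts define. A minimal sketch, assuming it simply wraps librosa's loader and returns a mono float array (Example #10 calls it with a single argument, so the sample rate presumably defaults to the file's native rate):

# Hypothetical reconstruction of util.load_wav -- not shown in any excerpt
import librosa
import numpy as np

def load_wav(filename, sample_rate=None):
    # librosa.load resamples to the requested rate (sr=None keeps the
    # native rate) and returns mono float32 samples in [-1, 1]
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    return np.asarray(audio, dtype=np.float32)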
Example #2
File: main.py  Project: rmacas/mlg
def inference(config, cla):

    if cla.batch_size is not None:
        batch_size = int(cla.batch_size)
    else:
        batch_size = config['training']['batch_size']

    if cla.target_field_length is not None:
        cla.target_field_length = int(cla.target_field_length)

    if not bool(cla.one_shot):
        model = models.DenoisingWavenet(config, target_field_length=cla.target_field_length,
                                        load_checkpoint=cla.load_checkpoint, print_model_summary=cla.print_model_summary)
        print('Performing inference..')
    else:
        print('Performing one-shot inference..')

    samples_folder_path = os.path.join(config['training']['path'], 'samples')
    output_folder_path = get_valid_output_folder_path(samples_folder_path)

    # If noisy_input_path is a single wav file, split it into directory and
    # filename; otherwise denoise every wav file in the directory
    if cla.noisy_input_path.endswith('.wav'):
        filenames = [cla.noisy_input_path.rsplit('/', 1)[-1]]
        cla.noisy_input_path = cla.noisy_input_path.rsplit('/', 1)[0] + '/'
        if cla.clean_input_path is not None:
            cla.clean_input_path = cla.clean_input_path.rsplit('/', 1)[0] + '/'
    else:
        if not cla.noisy_input_path.endswith('/'):
            cla.noisy_input_path += '/'
        filenames = [filename for filename in os.listdir(cla.noisy_input_path) if filename.endswith('.wav')]

    clean_input = None
    for filename in filenames:
        noisy_input = util.load_wav(cla.noisy_input_path + filename, config['dataset']['sample_rate'])
        if cla.clean_input_path is not None:
            if not cla.clean_input_path.endswith('/'):
                cla.clean_input_path += '/'
            clean_input = util.load_wav(cla.clean_input_path + filename, config['dataset']['sample_rate'])

        inputs = {'noisy': noisy_input, 'clean': clean_input}

        output_filename_prefix = filename[0:-4] + '_'

        # Encode the conditioning value (one of 29 classes) as one-hot or
        # binary, depending on the model config
        if config['model']['condition_encoding'] == 'one_hot':
            condition_input = util.one_hot_encode(int(cla.condition_value), 29)[0]
        else:
            condition_input = util.binary_encode(int(cla.condition_value), 29)[0]

        if bool(cla.one_shot):
            # One-shot mode pushes the whole file through the network at once,
            # so the model is rebuilt per file with a matching input_length
            if len(inputs['noisy']) % 2 == 0:  # If input length is even, remove one sample
                inputs['noisy'] = inputs['noisy'][:-1]
                if inputs['clean'] is not None:
                    inputs['clean'] = inputs['clean'][:-1]
            model = models.DenoisingWavenet(config, load_checkpoint=cla.load_checkpoint,
                                            input_length=len(inputs['noisy']),
                                            print_model_summary=cla.print_model_summary)

        print("Denoising: " + filename)
        denoise.denoise_sample(model, input, condition_input, batch_size, output_filename_prefix,
                                            config['dataset']['sample_rate'], output_folder_path)
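The cla object is just parsed command-line arguments; the parser itself is not shown in any excerpt. A hypothetical sketch covering exactly the attributes inference() reads (flag names and defaults are assumptions):

# Hypothetical argument parser -- reconstructed from the attributes used above
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=None)
parser.add_argument('--target_field_length', default=None)
parser.add_argument('--one_shot', default=False)
parser.add_argument('--load_checkpoint', default=None)
parser.add_argument('--print_model_summary', default=False)
parser.add_argument('--noisy_input_path', required=True)
parser.add_argument('--clean_input_path', default=None)
parser.add_argument('--condition_value', default=0)
cla = parser.parse_args()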
Example #3
    def load_directory(self, directory_path, condition):

        filenames = [
            filename for filename in os.listdir(directory_path)
            if filename.endswith('.wav')
        ]

        speakers = []
        file_paths = []
        speech_onset_offset_indices = []
        regain_factors = []
        sequences = []
        for filename in filenames:

            # The first four characters of the filename encode the speaker id
            speaker_name = filename[0:4]
            speakers.append(speaker_name)

            filepath = os.path.join(directory_path, filename)

            if condition == 'clean':

                sequence = util.load_wav(filepath, self.sample_rate)
                sequences.append(sequence)
                self.num_sequences_in_memory += 1
                regain_factors.append(self.regain / util.rms(sequence))
                # If extract_voice is enabled, strip the leading and trailing silence
                if self.extract_voice:
                    # speech_onset_offset_indices holds the start/end indices
                    # of the non-silent portion of each clip
                    speech_onset_offset_indices.append(
                        util.get_subsequence_with_speech_indices(sequence))
            else:
                # Noisy condition: keep only a fraction of the clips in memory
                # and store a [-1] placeholder for the rest (lazy-loaded later)
                if (self.in_memory_percentage == 1
                        or np.random.uniform(0, 1)
                        <= (self.in_memory_percentage - 0.5) * 2):
                    sequence = util.load_wav(filepath, self.sample_rate)
                    sequences.append(sequence)
                    self.num_sequences_in_memory += 1
                else:
                    sequences.append([-1])

            if speaker_name not in self.speaker_mapping:
                self.speaker_mapping[speaker_name] = len(
                    self.speaker_mapping) + 1

            file_paths.append(filepath)

        return sequences, file_paths, speakers, speech_onset_offset_indices, regain_factors
Example #4
    def load_directory(self, filenames, spk):
        # Eagerly load every file into memory
        sequences = []
        for filename in tqdm(filenames):
            sequence = util.load_wav(filename, self.sample_rate)
            sequences.append(sequence)
        return np.array(sequences)
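Note that np.array over clips of unequal length does not produce a 2-D float array: it yields a ragged object array, and NumPy 1.24+ raises a ValueError unless dtype=object is passed explicitly. A quick illustration:

import numpy as np

ragged = np.array([np.zeros(3), np.zeros(5)], dtype=object)  # dtype=object required on NumPy >= 1.24
stacked = np.array([np.zeros(3), np.zeros(3)])               # equal lengths: a regular (2, 3) float array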
Example #5
    def encode_dataset(self):

        print('Encoding from {} into {}'.format(self.path, self.tfr))
        print('Input length : {}'.format(self.input_length))

        total = 0
        less_than_target = 0

        with tf.python_io.TFRecordWriter(self.tfr) as writer:      
            filenames = os.listdir(os.path.join(self.path, self.mode, 's1'))
            
            for filename in tqdm(filenames):
                s1 = util.load_wav(os.path.join(self.path, self.mode, 's1', filename), self.sample_rate)
                s2 = util.load_wav(os.path.join(self.path, self.mode, 's2', filename), self.sample_rate)

                def write(_s1, _s2):
                    # Serialize one pair of aligned sources as a tf.train.Example
                    example = tf.train.Example(
                        features=tf.train.Features(
                            feature={
                                "s1": tf.train.Feature(float_list=tf.train.FloatList(value=_s1)),
                                "s2": tf.train.Feature(float_list=tf.train.FloatList(value=_s2))
                            }))
                    writer.write(example.SerializeToString())
                    
                if len(s1) < self.input_length:
                    # Short clip: zero-pad both sources to input_length at a
                    # random offset b (randint's upper bound is exclusive,
                    # hence the +1)
                    b = np.random.randint(0, self.input_length - len(s1) + 1)
                    s1_pad = np.zeros(self.input_length)
                    s1_pad[b:b + len(s1)] = s1
                    s2_pad = np.zeros(self.input_length)
                    s2_pad[b:b + len(s2)] = s2  # assumes len(s2) == len(s1)
                    write(s1_pad, s2_pad)
                    less_than_target += 1

                else:
                    # Long clip: slide a half-overlapping window across it
                    stride = self.input_length // 2
                    for i in range(0, len(s1) - self.input_length, stride):
                        s1_pad = s1[i:i + self.input_length]
                        s2_pad = s2[i:i + self.input_length]
                        write(s1_pad, s2_pad)
                        total += 1
                        
            print('total example : {}, less than target : {}'.format(total + less_than_target, less_than_target))
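A matching reader is not shown in the excerpt; a minimal sketch using the same TF1-era API, assuming a fixed, known input_length (the path and length below are placeholders):

# Hypothetical reader for the records written above (TF1-style API)
import tensorflow as tf

def parse_example(serialized, input_length):
    features = tf.parse_single_example(
        serialized,
        features={
            's1': tf.FixedLenFeature([input_length], tf.float32),
            's2': tf.FixedLenFeature([input_length], tf.float32),
        })
    return features['s1'], features['s2']

dataset = tf.data.TFRecordDataset('train.tfr')  # placeholder path
dataset = dataset.map(lambda ex: parse_example(ex, input_length=16384))  # placeholder length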
Example #6
def test(config, cla):
    log_file = os.path.join(config['training']['path'], cla.ckpt_name, 'log_'+cla.test_set)

    if not os.path.exists(os.path.join(config['training']['path'], cla.ckpt_name)):
        os.mkdir(os.path.join(config['training']['path'], cla.ckpt_name))

    output_path = os.path.join(config['training']['path'], 'sample')

    sess = tf.Session()
    G = hparams.get_model(config['model']['type'])(config, sess)
    G_save_path = os.path.join(config['training']['path'], 'generat.ckpt')
    G.load(G_save_path, cla.ckpt_name)

    if not cla.mix_input_path.endswith('/'):
        cla.mix_input_path += '/'
    filenames = [filename for filename in os.listdir(cla.mix_input_path) if filename.endswith('.wav')]

    # Per-file scores; each entry holds one value per separated speaker
    sdr_sum = []
    sisnr_sum = []
    pesq_sum = []

    for filename in filenames:
        util.myprint(log_file, filename)
        mix_audio = util.load_wav(cla.mix_input_path + filename, config['dataset']['sample_rate'])
        clean_1 = util.load_wav(cla.clean_input_path + 's1/' + filename, config['dataset']['sample_rate'])
        clean_2 = util.load_wav(cla.clean_input_path + 's2/' + filename, config['dataset']['sample_rate'])

        # pit_ch (presumably the PIT channel assignment) is unused below
        sdr, sisnr, pesq, pit_ch = separate.separate_sample(sess, G, config, mix_audio, clean_1, clean_2)

        util.myprint(log_file, '    sdr: {}, {}'.format(sdr[0], sdr[1]))
        util.myprint(log_file, '    sisnr: {}, {}'.format(sisnr[0], sisnr[1]))
        util.myprint(log_file, '    pesq: {}, {}'.format(pesq[0], pesq[1]))

        sdr_sum.append(sdr)
        sisnr_sum.append(sisnr)
        pesq_sum.append(pesq)

    sdr_sum = np.array(sdr_sum)
    sisnr_sum = np.array(sisnr_sum)
    pesq_sum = np.array(pesq_sum)

    util.myprint(log_file, 'test sdr : {}'.format(np.mean(sdr_sum)))
    util.myprint(log_file, 'test sisnr : {}'.format(np.mean(sisnr_sum)))
    util.myprint(log_file, 'test pesq : {}'.format(np.mean(pesq_sum)))
Example #7
    def retrieve_sequence(self, subset, condition, sequence_num):
        # A length-1 list is the [-1] placeholder left by load_songs(): load
        # the clip from disk now, and cache it as long as we are still under
        # the in_memory_percentage budget
        if len(self.sequences[subset][condition][sequence_num]) == 1:
            sequence = util.load_wav(self.file_paths[subset][condition][sequence_num], self.sample_rate)

            if (float(self.num_sequences_in_memory) / self.get_num_sequences_in_dataset()) < self.in_memory_percentage:
                self.sequences[subset][condition][sequence_num] = sequence
                self.num_sequences_in_memory += 1
        else:
            sequence = self.sequences[subset][condition][sequence_num]

        return np.array(sequence)
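Examples #1, #3, and #7 together form a simple lazy-loading cache: the loaders store a [-1] placeholder for clips that miss the in-memory quota, and retrieve_sequence swaps the placeholder for real audio on first access, caching it while the memory budget allows.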
Example #8
    def load_directory(self, filenames, spk):
        speakers = []
        speech_onset_offset_indices = []
        regain_factors = []
        sequences = []
        for filename in filenames:
            # The speaker id is the first three characters of either the first
            # or the third '_'-separated token of the basename, depending on
            # which speaker of the two-speaker mixture is requested
            basename = filename.split('/')[-1]
            if spk == 'a':
                speaker_name = basename.split('_')[0][:3]
            else:
                speaker_name = basename.split('_')[2][:3]
            speakers.append(speaker_name)

            sequence = util.load_wav(filename, self.sample_rate)
            sequences.append(sequence)
            self.num_sequences_in_memory += 1
            # regain_factors.append(self.regain / util.rms(sequence))

            if self.extract_voice:
                # Get the sub-sequence without leading and trailing silence
                speech_onset_offset_indices.append(util.get_subsequence_with_speech_indices(sequence))

            if speaker_name not in self.speaker_mapping:
                self.speaker_mapping[speaker_name] = len(self.speaker_mapping) + 1

        return sequences, speakers, speech_onset_offset_indices, regain_factors
Example #9
def test(config, cla):

    if cla.batch_size is not None:
        batch_size = int(cla.batch_size)
    else:
        batch_size = config['training']['batch_size']

    if cla.target_field_length is not None:
        cla.target_field_length = int(cla.target_field_length)

    model = models.DenoisingWavenet(
        config,
        target_field_length=cla.target_field_length,
        load_checkpoint=cla.load_checkpoint,
        print_model_summary=cla.print_model_summary)

    samples_folder_path = os.path.join(config['training']['path'], 'samples')
    output_folder_path = get_valid_output_folder_path(samples_folder_path)

    if not cla.noisy_input_path.endswith('/'):
        cla.noisy_input_path += '/'
    filenames = [
        filename for filename in os.listdir(cla.noisy_input_path)
        if filename.endswith('.wav')
    ]

    with open('spk_info.json') as f:
        spk_info = json.load(f)

    sdr = []
    n_output = config['training'].get('n_output', 2)
    n_speaker = config['training'].get('n_speaker', 2)
    # Per-channel gender tally, e.g. {'ch1': {'M': 0, 'F': 0}, 'ch2': {'M': 0, 'F': 0}}
    gender_stat = {
        'ch' + str(i + 1): {'M': 0, 'F': 0}
        for i in range(n_output)
    }

    for filename in filenames:
        noisy_input = util.load_wav(cla.noisy_input_path + filename,
                                    config['dataset']['sample_rate'])
        # Initialize so the dict below is well-defined even without clean paths
        clean_input_1 = clean_input_2 = None
        if cla.clean_input_path is not None:
            if not cla.clean_input_path.endswith('/'):
                cla.clean_input_path += '/'
            clean_input_1 = util.load_wav(
                cla.clean_input_path + 's1/' + filename,
                config['dataset']['sample_rate'])
            clean_input_2 = util.load_wav(
                cla.clean_input_path + 's2/' + filename,
                config['dataset']['sample_rate'])
        inputs = {
            'noisy': noisy_input,
            'clean_1': clean_input_1,
            'clean_2': clean_input_2
        }

        output_filename_prefix = filename[0:-4] + '_'
        # Speaker ids come from the first and third '_'-separated filename
        # tokens (see Example #8); spk_info maps each id to a gender
        spk1 = output_filename_prefix.split('_')[0][:3]
        spk2 = output_filename_prefix.split('_')[2][:3]
        spk_name = [spk1, spk2]
        spk_gender = [spk_info[spk1], spk_info[spk2]]

        # print("Denoising: " + filename).
        condition_input = None
        print(filename)
        _sdr, ch_gender, pit_idx = denoise.denoise_sample(
            model,
            input,
            condition_input,
            batch_size,
            output_filename_prefix,
            config['dataset']['sample_rate'],
            n_speaker,
            n_output,
            output_folder_path,
            spk_gender=spk_gender,
            use_pit=cla.use_pit,
            pad=cla.zero_pad)
        if spk_gender[0] == 'F' and spk_gender[1] == 'M':
            # Print the male speaker first so the output order is always M, F
            for i in range(1, -1, -1):
                print('{} {}: sdr={}, idx={}'.format(spk_gender[i],
                                                     spk_name[i], _sdr[i],
                                                     pit_idx[i]))
        else:
            for i in range(2):
                print('{} {}: sdr={}, idx={}'.format(spk_gender[i],
                                                     spk_name[i], _sdr[i],
                                                     pit_idx[i]))
        # Per-channel gender accumulation (disabled in the original):
        # for ch, stat in ch_gender.items():
        #     for gen, num in stat.items():
        #         gender_stat[ch][gen] += num
        sdr.append(_sdr)
    sdr = np.array(sdr)
    print('Testing SDR:', np.mean(sdr))
    print(gender_stat)
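The use_pit and pit_idx names suggest permutation invariant training (PIT): the loss is evaluated for every assignment of network outputs to reference speakers, and the best permutation wins. A minimal NumPy sketch of that selection, for illustration only (this is not the project's denoise_sample):

# Illustrative PIT selection: try every output-to-target permutation
# and keep the one with the lowest mean squared error
import itertools
import numpy as np

def best_permutation_mse(outputs, targets):
    # outputs, targets: lists of 1-D arrays, one per speaker
    best_perm, best_loss = None, np.inf
    for perm in itertools.permutations(range(len(targets))):
        loss = np.mean([np.mean((outputs[i] - targets[p]) ** 2)
                        for i, p in enumerate(perm)])
        if loss < best_loss:
            best_perm, best_loss = perm, loss
    return best_perm, best_loss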
Example #10
# Assumed context for this excerpt: the original file's imports and the value
# of frame_width are not shown, so both are reconstructed here
import sys

import numpy as np
from scipy.fftpack import fft  # assumed FFT import
from tqdm import tqdm

import util

frame_width = 4096  # assumed; the excerpt does not show the real value

hann = np.hanning(frame_width)
spacing = 1024
bitrate = 44100  # the input's sample rate, in Hz

f_0 = 50      # lower bound of the fundamental-frequency search, Hz
f_1 = 2000    # upper bound, Hz
f_r = 10
power_thresh = 10

if len(sys.argv) == 2:
    input_file = sys.argv[1]
else:
    print("usage: ./harmonicity.py input.wav")
    sys.exit()

data = util.load_wav(input_file)
all_weights = []
fft_len = frame_width * 4
zeropad = np.zeros(frame_width * 3)
best_frequencies = []

# Convert the Hz search bounds into FFT bin indices at the 4x zero-padded resolution
k0 = int(np.floor(util.hz_to_fourier(f_0, frame_width * 4, bitrate)))
k1 = int(np.ceil(util.hz_to_fourier(f_1, frame_width * 4, bitrate)))
# iterate through frames
for i in tqdm(range(0, int((len(data) - frame_width) / spacing))):

    # Spectrum generation and preprocessing: apply a Hann window, then
    # zero-pad to 4x the frame length for finer frequency resolution
    frame = data[i * spacing:i * spacing + frame_width]
    window = frame * hann
    raw_fft = fft(np.concatenate((window, zeropad)))
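util.hz_to_fourier is not shown either; the standard frequency-to-bin conversion it presumably performs is k = f * N / fs. A one-line sketch of that assumption:

# Hypothetical reconstruction: map a frequency in Hz to its (fractional)
# FFT bin index for an N-point transform at sample rate fs
def hz_to_fourier(freq_hz, n_fft, sample_rate):
    return freq_hz * n_fft / float(sample_rate)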
Example #11
def inference(config, cla):

    if cla.batch_size is not None:
        batch_size = int(cla.batch_size)
    else:
        batch_size = config['training']['batch_size']

    if cla.target_field_length is not None:
        cla.target_field_length = int(cla.target_field_length)

    if not bool(cla.one_shot):

        if config['model']['type'] == 'singing-voice':
            model = models.SingingVoiceSeparationWavenet(
                config,
                target_field_length=cla.target_field_length,
                load_checkpoint=cla.load_checkpoint,
                print_model_summary=cla.print_model_summary)

        elif config['model']['type'] == 'multi-instrument':
            model = models.MultiInstrumentSeparationWavenet(
                config,
                target_field_length=cla.target_field_length,
                load_checkpoint=cla.load_checkpoint,
                print_model_summary=cla.print_model_summary)

        print('Performing inference..')

    else:
        print('Performing one-shot inference..')

    samples_folder_path = os.path.join(config['training']['path'], 'samples')
    output_folder_path = get_valid_output_folder_path(samples_folder_path)

    # If mixture_input_path is a single wav file, reduce filenames to that one file
    if cla.mixture_input_path.endswith('.wav'):
        filenames = [cla.mixture_input_path.rsplit('/', 1)[-1]]
        cla.mixture_input_path = cla.mixture_input_path.rsplit('/', 1)[0] + '/'

    else:
        if not cla.mixture_input_path.endswith('/'):
            cla.mixture_input_path += '/'
        filenames = [
            filename for filename in os.listdir(cla.mixture_input_path)
            if filename.endswith('.wav')
        ]

    for filename in filenames:
        mixture_input = util.load_wav(cla.mixture_input_path + filename,
                                      config['dataset']['sample_rate'])

        inputs = {'mixture': mixture_input}

        output_filename_prefix = filename[0:-4]

        if bool(cla.one_shot):
            if len(inputs['mixture']) % 2 == 0:  # If input length is even, remove one sample
                inputs['mixture'] = inputs['mixture'][:-1]

            if config['model']['type'] == 'singing-voice':
                model = models.SingingVoiceSeparationWavenet(
                    config,
                    target_field_length=cla.target_field_length,
                    load_checkpoint=cla.load_checkpoint,
                    print_model_summary=cla.print_model_summary)

            elif config['model']['type'] == 'multi-instrument':
                model = models.MultiInstrumentSeparationWavenet(
                    config,
                    target_field_length=cla.target_field_length,
                    load_checkpoint=cla.load_checkpoint,
                    print_model_summary=cla.print_model_summary)

        print "Separating: " + filename
        separate.separate_sample(model, input, batch_size,
                                 output_filename_prefix,
                                 config['dataset']['sample_rate'],
                                 output_folder_path, config['model']['type'])
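As in Example #2, one-shot mode rebuilds the model inside the loop so that input_length matches each file exactly, while batched mode builds a single fixed-field model up front and reuses it for every file.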