def use_me_process(sources_list, output_file_names):
    """The usage process.

    :param sources_list: The file names to be used.
    :type sources_list: list[str]
    :param output_file_names: The output file names to be used.
    :type output_file_names: list[list[str]]
    """
    print('\n-- Welcome to MaD TwinNet.')
    if debug:
        print('\n-- Cannot proceed in debug mode. '
              'Please set debug=False at the settings file.')
        print('-- Exiting.')
        exit(-1)
    print('-- Now I will extract the voice and the background '
          'music from the provided files')

    # Masker modules
    rnn_enc = RNNEnc(hyper_parameters['reduced_dim'],
                     hyper_parameters['context_length'], debug)
    rnn_dec = RNNDec(hyper_parameters['rnn_enc_output_dim'], debug)
    fnn = FNNMasker(hyper_parameters['rnn_enc_output_dim'],
                    hyper_parameters['original_input_dim'],
                    hyper_parameters['context_length'])

    # Denoiser modules
    denoiser = FNNDenoiser(hyper_parameters['original_input_dim'])

    # Restore the trained weights of each module.
    rnn_enc.load_state_dict(torch.load(output_states_path['rnn_enc']))
    rnn_dec.load_state_dict(torch.load(output_states_path['rnn_dec']))
    fnn.load_state_dict(torch.load(output_states_path['fnn']))
    denoiser.load_state_dict(torch.load(output_states_path['denoiser']))

    # Use the GPU when available and not in debug mode. (The original
    # checked the deprecated `torch.has_cudnn`, which only reports
    # compile-time cuDNN support; `torch.cuda.is_available()` is the
    # correct runtime check.)
    if not debug and torch.cuda.is_available():
        rnn_enc = rnn_enc.cuda()
        rnn_dec = rnn_dec.cuda()
        fnn = fnn.cuda()
        denoiser = denoiser.cuda()

    testing_it = data_feeder_testing(
        window_size=hyper_parameters['window_size'],
        fft_size=hyper_parameters['fft_size'],
        hop_size=hyper_parameters['hop_size'],
        seq_length=hyper_parameters['seq_length'],
        context_length=hyper_parameters['context_length'],
        batch_size=1, debug=debug, sources_list=sources_list)

    print('-- Let\'s go!\n')
    total_time = 0

    for index, data in enumerate(testing_it()):
        s_time = time.time()

        mix, mix_magnitude, mix_phase, voice_true, bg_true = data

        voice_predicted = np.zeros(
            (mix_magnitude.shape[0],
             hyper_parameters['seq_length'] -
             hyper_parameters['context_length'] * 2,
             hyper_parameters['window_size']),
            dtype=np.float32)

        for batch in range(
                int(mix_magnitude.shape[0] /
                    training_constants['batch_size'])):
            b_start = batch * training_constants['batch_size']
            b_end = (batch + 1) * training_constants['batch_size']

            # The deprecated `Variable` wrapper is unnecessary since
            # PyTorch 0.4; a plain tensor suffices.
            v_in = torch.from_numpy(mix_magnitude[b_start:b_end, :, :])
            if not debug and torch.cuda.is_available():
                v_in = v_in.cuda()

            # The Masker (encoder, decoder, FNN) followed by the Denoiser.
            tmp_voice_predicted = rnn_enc(v_in)
            tmp_voice_predicted = rnn_dec(tmp_voice_predicted)
            tmp_voice_predicted = fnn(tmp_voice_predicted, v_in)
            tmp_voice_predicted = denoiser(tmp_voice_predicted)

            voice_predicted[b_start:b_end, :, :] = \
                tmp_voice_predicted.data.cpu().numpy()

        data_process_results_testing(
            index=index, voice_true=voice_true, bg_true=bg_true,
            voice_predicted=voice_predicted,
            window_size=hyper_parameters['window_size'],
            mix=mix, mix_magnitude=mix_magnitude, mix_phase=mix_phase,
            hop=hyper_parameters['hop_size'],
            context_length=hyper_parameters['context_length'],
            output_file_name=output_file_names[index])

        e_time = time.time()

        print(usage_output_string_per_example.format(
            f=sources_list[index], t=e_time - s_time))

        total_time += e_time - s_time

    print('\n-- Testing finished\n')
    print(usage_output_string_total.format(t=total_time))
    print('-- That\'s all folks!')
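# A minimal usage sketch for `use_me_process` (a sketch, not part of the
# original file; all file names below are hypothetical). Each inner list
# of `output_file_names` presumably carries the voice and background
# output names for the matching source file, since it is forwarded as
# `output_file_name` to `data_process_results_testing`.
def _example_use_me():
    use_me_process(
        sources_list=['my_mix_01.wav', 'my_mix_02.wav'],
        output_file_names=[
            ['my_mix_01_voice.wav', 'my_mix_01_bg.wav'],
            ['my_mix_02_voice.wav', 'my_mix_02_bg.wav']])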
def _testing_process(data, index, mad, device, seq_length,
                     context_length, window_size, batch_size,
                     hop_size, outputPath):
    """The testing process over testing data.

    :param data: The testing data.
    :type data: numpy.ndarray
    :param index: The index of the testing data (used for\
                  calculating scores).
    :type index: int
    :param mad: The MaD system.
    :type mad: torch.nn.Module
    :param device: The device to be used.
    :type device: str
    :param seq_length: The sequence length used.
    :type seq_length: int
    :param context_length: The context length used.
    :type context_length: int
    :param window_size: The window size used.
    :type window_size: int
    :param batch_size: The batch size used.
    :type batch_size: int
    :param hop_size: The hop size used.
    :type hop_size: int
    :param outputPath: The path for the output files.
    :type outputPath: str
    :return: The SDR, SIR, and SAR scores, and the time elapsed\
             for the process.
    :rtype: (numpy.ndarray, numpy.ndarray, numpy.ndarray, float)
    """
    s_time = time.time()

    mix, mix_magnitude, mix_phase, voice_true, bg_true = data

    voice_predicted = np.zeros(
        (mix_magnitude.shape[0],
         seq_length - context_length * 2,
         window_size),
        dtype=np.float32)

    for batch in range(int(mix_magnitude.shape[0] / batch_size)):
        b_start = batch * batch_size
        b_end = (batch + 1) * batch_size

        v_in = from_numpy(mix_magnitude[b_start:b_end, :, :]).to(device)

        voice_predicted[b_start:b_end, :, :] = mad(
            v_in.unsqueeze(1)).v_j_filt.cpu().numpy()

    tmp_sdr, tmp_sir, tmp_sar = data_feeder.data_process_results_testing(
        index=index, voice_true=voice_true, bg_true=bg_true,
        voice_predicted=voice_predicted,
        window_size=window_size, mix=mix,
        mix_magnitude=mix_magnitude, mix_phase=mix_phase,
        hop=hop_size, context_length=context_length,
        outputPath=outputPath)

    time_elapsed = time.time() - s_time

    printing.print_msg(testing_output_string_per_example.format(
        e=index,
        sdr=np.median([i for i in tmp_sdr[0] if not np.isnan(i)]),
        sir=np.median([i for i in tmp_sir[0] if not np.isnan(i)]),
        sar=np.median([i for i in tmp_sar[0] if not np.isnan(i)]),
        t=time_elapsed))

    return tmp_sdr, tmp_sir, tmp_sar, time_elapsed
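# A hypothetical driver for `_testing_process` (a sketch, not from the
# original file). It assumes a `testing_it` iterable like the one built
# by `data_feeder_testing` elsewhere in this section, and collects the
# per-example scores the function returns; the output directory name is
# made up for illustration.
def _example_testing_loop(testing_it, mad, device):
    sdr, sir, sar, total_time = [], [], [], 0.

    for index, data in enumerate(testing_it()):
        tmp_sdr, tmp_sir, tmp_sar, t = _testing_process(
            data, index, mad, device,
            seq_length=hyper_parameters['seq_length'],
            context_length=hyper_parameters['context_length'],
            window_size=hyper_parameters['window_size'],
            batch_size=training_constants['batch_size'],
            hop_size=hyper_parameters['hop_size'],
            outputPath='example_output_dir')  # hypothetical path
        sdr.append(tmp_sdr)
        sir.append(tmp_sir)
        sar.append(tmp_sar)
        total_time += t

    return sdr, sir, sar, total_time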
def testing_process():
    """The testing process.
    """
    device = 'cuda' if not debug and torch.cuda.is_available() else 'cpu'

    print('\n-- Starting testing process. Debug mode: {}'.format(debug))
    print('-- Process on: {}'.format(device), end='\n\n')
    print('-- Setting up modules... ', end='')

    # Masker modules
    rnn_enc = RNNEnc(hyper_parameters['reduced_dim'],
                     hyper_parameters['context_length'], debug)
    rnn_dec = RNNDec(hyper_parameters['rnn_enc_output_dim'], debug)
    fnn = FNNMasker(hyper_parameters['rnn_enc_output_dim'],
                    hyper_parameters['original_input_dim'],
                    hyper_parameters['context_length'])

    # Denoiser modules
    denoiser = FNNDenoiser(hyper_parameters['original_input_dim'])

    # `load_state_dict` does not return the module, so restoring the
    # weights and moving each module to the device must be separate
    # steps (chaining `.to(device)` on its return value raises).
    rnn_enc.load_state_dict(torch.load(output_states_path['rnn_enc']))
    rnn_dec.load_state_dict(torch.load(output_states_path['rnn_dec']))
    fnn.load_state_dict(torch.load(output_states_path['fnn']))
    denoiser.load_state_dict(torch.load(output_states_path['denoiser']))

    rnn_enc = rnn_enc.to(device)
    rnn_dec = rnn_dec.to(device)
    fnn = fnn.to(device)
    denoiser = denoiser.to(device)

    print('done.')

    testing_it = data_feeder_testing(
        window_size=hyper_parameters['window_size'],
        fft_size=hyper_parameters['fft_size'],
        hop_size=hyper_parameters['hop_size'],
        seq_length=hyper_parameters['seq_length'],
        context_length=hyper_parameters['context_length'],
        batch_size=1, debug=debug)

    print('-- Testing starts\n')

    sdr = []
    sir = []
    total_time = 0

    for index, data in enumerate(testing_it()):
        s_time = time.time()

        mix, mix_magnitude, mix_phase, voice_true, bg_true = data

        voice_predicted = np.zeros(
            (mix_magnitude.shape[0],
             hyper_parameters['seq_length'] -
             hyper_parameters['context_length'] * 2,
             hyper_parameters['window_size']),
            dtype=np.float32)

        for batch in range(
                int(mix_magnitude.shape[0] /
                    training_constants['batch_size'])):
            b_start = batch * training_constants['batch_size']
            b_end = (batch + 1) * training_constants['batch_size']

            v_in = torch.from_numpy(
                mix_magnitude[b_start:b_end, :, :]).to(device)

            # The Masker (encoder, decoder, FNN) followed by the Denoiser.
            tmp_voice_predicted = rnn_enc(v_in)
            tmp_voice_predicted = rnn_dec(tmp_voice_predicted)
            tmp_voice_predicted = fnn(tmp_voice_predicted, v_in)
            tmp_voice_predicted = denoiser(tmp_voice_predicted)

            voice_predicted[b_start:b_end, :, :] = \
                tmp_voice_predicted.data.cpu().numpy()

        tmp_sdr, tmp_sir = data_process_results_testing(
            index=index, voice_true=voice_true, bg_true=bg_true,
            voice_predicted=voice_predicted,
            window_size=hyper_parameters['window_size'],
            mix=mix, mix_magnitude=mix_magnitude, mix_phase=mix_phase,
            hop=hyper_parameters['hop_size'],
            context_length=hyper_parameters['context_length'])

        e_time = time.time()

        print(testing_output_string_per_example.format(
            e=index,
            sdr=np.median([i for i in tmp_sdr[0] if not np.isnan(i)]),
            sir=np.median([i for i in tmp_sir[0] if not np.isnan(i)]),
            t=e_time - s_time))

        total_time += e_time - s_time
        sdr.append(tmp_sdr)
        sir.append(tmp_sir)

    print('\n-- Testing finished\n')

    print(testing_output_string_all.format(
        sdr=np.median([ii for i in sdr for ii in i[0] if not np.isnan(ii)]),
        sir=np.median([ii for i in sir for ii in i[0] if not np.isnan(ii)]),
        t=total_time))

    print('\n-- Saving results... ', end='')

    with open(metrics_paths['sdr'], 'wb') as f:
        pickle.dump(sdr, f, protocol=2)

    with open(metrics_paths['sir'], 'wb') as f:
        pickle.dump(sir, f, protocol=2)

    print('done!')
    print('-- That\'s all folks!')
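# A short sketch (assumed, not in the original file) of reading back the
# metrics that `testing_process` pickles, and re-computing the overall
# medians the same way `testing_output_string_all` is filled above.
def _example_load_metrics():
    with open(metrics_paths['sdr'], 'rb') as f:
        sdr = pickle.load(f)
    with open(metrics_paths['sir'], 'rb') as f:
        sir = pickle.load(f)

    median_sdr = np.median([ii for i in sdr for ii in i[0]
                            if not np.isnan(ii)])
    median_sir = np.median([ii for i in sir for ii in i[0]
                            if not np.isnan(ii)])

    return median_sdr, median_sir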
def use_me_process(sources_list, output_file_names):
    """The usage process.

    :param sources_list: The file names to be used.
    :type sources_list: list[pathlib.Path]
    :param output_file_names: The output file names to be used.
    :type output_file_names: list[list[str]]
    """
    printing.print_msg('Welcome to MaD TwinNet.', end='\n\n')

    if debug:
        printing.print_msg('Cannot proceed in debug mode. '
                           'Please set `debug=False` at the settings '
                           'file.')
        printing.print_msg('Exiting.')
        exit(-1)

    printing.print_msg('Now I will extract the voice and the '
                       'background music from the provided files')

    device = 'cuda' if not debug and torch.cuda.is_available() else 'cpu'

    # MaD setting up
    mad = MaD(rnn_enc_input_dim=hyper_parameters['reduced_dim'],
              rnn_dec_input_dim=hyper_parameters['rnn_enc_output_dim'],
              original_input_dim=hyper_parameters['original_input_dim'],
              context_length=hyper_parameters['context_length'])

    mad.load_state_dict(torch.load(output_states_path['mad']))
    mad = mad.to(device).eval()

    testing_it = data_feeder.data_feeder_testing(
        window_size=hyper_parameters['window_size'],
        fft_size=hyper_parameters['fft_size'],
        hop_size=hyper_parameters['hop_size'],
        seq_length=hyper_parameters['seq_length'],
        context_length=hyper_parameters['context_length'],
        batch_size=1, debug=debug, sources_list=sources_list)

    printing.print_msg('Let\'s go!', end='\n\n')
    total_time = 0

    for index, data in enumerate(testing_it()):
        s_time = time.time()

        mix, mix_magnitude, mix_phase, voice_true, bg_true = data

        voice_predicted = np.zeros(
            (mix_magnitude.shape[0],
             hyper_parameters['seq_length'] -
             hyper_parameters['context_length'] * 2,
             hyper_parameters['window_size']),
            dtype=np.float32)

        for batch in range(
                int(mix_magnitude.shape[0] /
                    training_constants['batch_size'])):
            b_start = batch * training_constants['batch_size']
            b_end = (batch + 1) * training_constants['batch_size']

            v_in = torch.from_numpy(
                mix_magnitude[b_start:b_end, :, :]).to(device)

            voice_predicted[b_start:b_end, :, :] = mad(
                v_in).v_j_filt.cpu().numpy()

        data_feeder.data_process_results_testing(
            index=index, voice_true=voice_true, bg_true=bg_true,
            voice_predicted=voice_predicted,
            window_size=hyper_parameters['window_size'],
            mix=mix, mix_magnitude=mix_magnitude, mix_phase=mix_phase,
            hop=hyper_parameters['hop_size'],
            context_length=hyper_parameters['context_length'],
            output_file_name=output_file_names[index])

        e_time = time.time()

        printing.print_msg(usage_output_string_per_example.format(
            f=sources_list[index], t=e_time - s_time))

        total_time += e_time - s_time

    printing.print_msg('MaD TwinNet finished')
    printing.print_msg(usage_output_string_total.format(t=total_time))
    printing.print_msg('That\'s all folks!')
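# A minimal usage sketch for this `use_me_process` variant (an assumed
# example, not part of the original file; all file names below are
# hypothetical). Unlike the earlier variant, `sources_list` is
# documented here to hold `pathlib.Path` objects.
if __name__ == '__main__':
    from pathlib import Path

    use_me_process(
        sources_list=[Path('my_mix.wav')],
        output_file_names=[['my_mix_voice.wav', 'my_mix_bg.wav']])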