def __init__(self, model_conf_path, data_conf_path, model_name, model_path, output_dir, epoch):
    self.model_name = model_name
    self.test_data = load_test_data(data_conf_path)
    # When running as "SYS" (synthesis), load the MSI-DIS network weights.
    model_tag = "MSI-DIS" if model_name == "SYS" else model_name
    self.network = load_gpu_model(model_conf_path, model_tag, model_path)
    mkdir(output_dir)
    self.output_dir = output_dir
    self.score_path = os.path.join(output_dir, f"scores-{epoch}.json")
def synthesis(self, path, track_id):
    output_dir = self.output_dir
    model_name = self.model_name

    # Load one track of the MIDI file and keep frames 2500-3500
    # (25 s to 35 s, assuming 100 frames per second).
    song = WeiMidi(path)
    song = song[track_id]
    song = song[25 * 100 : 35 * 100]
    #target = onehot_tensor(device(target).long())

    sample_index = 0
    for sample in self.test_data.test_samples():
        sample_index += 1
        mix = device(sample['mix'].sum(0))
        instrs = sample['instrs']
        sample_dir = f"{output_dir}/sample_{sample_index}"
        mkdir(sample_dir)
        print(mix.shape[-1] - 16000 * 45)
        # Use the 35 s - 45 s excerpt of the mixture (10 s at 16 kHz).
        mix = mix[:, 16000 * 35 : 16000 * 45]
        for i in range(INSTR_NUM):
            instr_name, audio, _, target, query, _ = instrs[i]
            query = device(query[0])
            # Replace the target pitches with those of the loaded MIDI track, then one-hot encode.
            target = adapt_pitch(song, target)
            target = onehot_tensor(device(target).long())

            mix_spec, mix_cos, mix_sin = wav2spec(mix)
            query_spec, _, _ = wav2spec(query)
            wav_len = mix.shape[-1]
            spec_len = mix_spec.shape[-2]
            if spec_len > target.shape[-1]:
                # Truncate the mixture spectrogram to the length of the target frame roll.
                spec_len = target.shape[-1]
                mix_spec = mix_spec.transpose(0, -2)[:spec_len].transpose(0, -2)

            mix_spec_batches = devide_into_batches(mix_spec, duration_axis=-2)
            query_spec_batches = devide_into_batches(query_spec, duration_axis=-2)
            hQuery = self.query(query_spec_batches)
            target_batches = devide_into_batches(target, duration_axis=-1)
            batches = zip(mix_spec_batches, target_batches)
            preds = self.predict(batches, hQuery, "synthesis")

            synthesis_batches = merge_from_list(preds, index=0)
            synthesis_spec = merge_batches(synthesis_batches, duration_axis=-2)
            synthesis_spec = align(synthesis_spec, mix_cos, -2)
            synthesis_wav = spec2wav(synthesis_spec, mix_cos, mix_sin, 160000, syn_phase=1)
            synthesis_wav_path = f"{sample_dir}/{instr_name}_{model_name}.wav"
            save_audio(synthesis_wav[:, :160000], synthesis_wav_path)
def save_test_lst(data, output_folder):
    testset_folder = os.path.join(output_folder, "testset")
    mkdir(testset_folder)
    test_lst = []
    query_lst = []
    for songName in data:
        test_lst += data[songName]["test"]
        query_lst += data[songName]["query"]
    test_lst = [f"{t[0]},{t[1]}\t{t[2]},{t[3]}" for t in test_lst]
    query_lst = [f"{t[0]},{t[1]}\t{t[2]},{t[3]}" for t in query_lst]
    print("test set", len(test_lst))
    test_lst_path = os.path.join(testset_folder, "test.lst")
    query_lst_path = os.path.join(testset_folder, "query.lst")
    write_lst(test_lst_path, test_lst)
    write_lst(query_lst_path, query_lst)
def save_train_lst(data, output_folder):
    for instr in data:
        instr_folder = os.path.join(output_folder, instr)
        mkdir(instr_folder)
        path = os.path.join(instr_folder, "train.lst")
        write_lst(path, data[instr])
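# Usage sketch (illustrative, not part of the original source). The nested-dict
# shapes below are assumptions inferred from the loops above: each test/query
# entry is a 4-tuple serialized as "a,b\tc,d", and the train data maps an
# instrument name to a list of pre-formatted lines. All paths are hypothetical.
#
# split = {
#     "01_Jupiter": {
#         "test":  [("feature/01/mix.h5", 0, "feature/01/vn.h5", 2)],
#         "query": [("feature/01/query.h5", 0, "feature/01/vn.h5", 5)],
#     },
# }
# train_data = {"vn": ["feature/01/vn._TRAIN.h5"]}
# save_test_lst(split, "exp/urmp")
# save_train_lst(train_data, "exp/urmp")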
def inference(self):
    results = {}
    sample_index = -1
    output_dir = self.output_dir
    model_name = self.model_name
    for mode in modes[model_name]:
        results[mode] = []

    for sample in self.test_data.test_samples():
        sample_index += 1
        mix = device(sample['mix'].sum(0))
        instrs = sample['instrs']
        result = {}
        for mode in modes[model_name]:
            result[mode] = {}

        for i in range(INSTR_NUM):
            instr_name, audio, annotation, target, query, query_annotation = instrs[i]
            for mode in modes[model_name]:
                result[mode][instr_name] = {"separation": [], "transcription": []}
            query = device(query[0])
            query_annotation = query_annotation[0]

            mix_spec, mix_cos, mix_sin = wav2spec(mix)
            query_spec, _, _ = wav2spec(query)
            wav_len = mix.shape[-1]
            spec_len = mix_spec.shape[-2]
            mix_spec_batches = devide_into_batches(mix_spec, duration_axis=-2)
            query_spec_batches = devide_into_batches(query_spec, duration_axis=-2)
            hQuery = self.query(query_spec_batches)

            if model_name in ["AMT", "MSS", "MSS-AMT", "UNET"]:
                batches = mix_spec_batches
            else:
                # Score-informed models additionally take the one-hot frame roll as input.
                target = onehot_tensor(device(target).long())
                target_batches = devide_into_batches(target, duration_axis=-1)
                batches = zip(mix_spec_batches, target_batches)
            preds = self.predict(batches, hQuery)

            sample_dir = f"{output_dir}/sample_{sample_index}"
            mkdir(sample_dir)

            if model_name in ["AMT", "MSS-AMT", "MSI", "MSI-DIS"]:
                # Transcription output: frame-wise pitch probabilities -> estimated note annotation.
                transcription_batches = merge_from_list(preds, index=-1)
                prob = merge_batches(transcription_batches, duration_axis=-1)
                est_annotation = parse_frameroll2annotation(np.argmax(prob.cpu().numpy(), 0))
                est_annotation_path = f"{sample_dir}/{instr_name}_est.txt"
                write_lst(est_annotation_path, est_annotation)
                # Remap the hard-coded reference annotation path from the old cluster to the new one.
                annotation = annotation.replace(
                    '/gpfsnyu/home/ll4270/music/transcription/wei_transcription',
                    '/scratch/gx219/wei_env/data-source/dataset')
                result[model_name][instr_name]['transcription'].append([est_annotation_path, annotation])

            if model_name != "AMT":
                # Separation output: reconstruct the waveform from the predicted spectrogram.
                separated_batches = merge_from_list(preds, index=0)
                separated_spec = merge_batches(separated_batches, duration_axis=-2)
                separated_spec = align(separated_spec, mix_cos, -2)
                separated_wav = spec2wav(separated_spec, mix_cos, mix_sin, wav_len)
                separated_wav_path = f"{sample_dir}/{instr_name}_{model_name}.wav"
                mix_path = f"{sample_dir}/Mixture.wav"
                ref_path = f"{sample_dir}/{instr_name}_ref.wav"
                save_audio(separated_wav, separated_wav_path)
                save_audio(mix, mix_path)
                save_audio(torch.from_numpy(audio), ref_path)
                result[model_name][instr_name]['separation'].append([separated_wav_path, ref_path])

                if model_name in ["MSI", "MSI-DIS"]:
                    # Additional separation output, reported as the "-S" variant.
                    separated_batches = merge_from_list(preds, index=1)
                    separated_spec = merge_batches(separated_batches, duration_axis=-2)
                    separated_spec = align(separated_spec, mix_cos, -2)
                    separated_wav = spec2wav(separated_spec, mix_cos, mix_sin, wav_len)
                    separated_wav_path = f"{sample_dir}/{instr_name}_{model_name}-S.wav"
                    save_audio(separated_wav, separated_wav_path)
                    result[f"{model_name}-S"][instr_name]['separation'].append([separated_wav_path, ref_path])

        for mode in modes[model_name]:
            results[mode].append(result[mode])
    return results
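# Note (inferred, not part of the original source): `modes[model_name]` lists the
# result keys that inference() fills in. Judging from the code above it would look
# roughly like the following, with an extra "-S" entry only for MSI / MSI-DIS:
#
# modes = {
#     "AMT":     ["AMT"],
#     "MSS":     ["MSS"],
#     "UNET":    ["UNET"],
#     "MSS-AMT": ["MSS-AMT"],
#     "MSI":     ["MSI", "MSI-S"],
#     "MSI-DIS": ["MSI-DIS", "MSI-DIS-S"],
# }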
def pack_urmp_dataset_to_hdf5(args):
    dataset_dir = args.dataset_dir
    feature_dir = args.feature_dir
    process_num = args.process_num
    sample_rate = args.sample_rate
    frames_per_second = args.frames_per_second
    n_fft = args.n_fft
    begin_note = args.begin_note
    notes_num = args.notes_num

    mkdir(feature_dir)

    # Collect the mixture, per-source audio, and note-annotation paths for every URMP piece.
    meta_dict = {}
    meta_dict['audio_filename'] = []
    audios_num = 0
    for folder in os.listdir(dataset_dir):
        if folder.startswith("._"):
            continue
        meta_data = folder.split('_')
        if len(meta_data) < 4:
            continue
        audios_num += 1
        id = meta_data[0]
        name = meta_data[1]
        sources = meta_data[2:]
        audio = {}
        audio['mix'] = os.path.join(folder, f'AuMix_{folder}.wav')
        audio['separated_sources'] = []
        audio['note_annotations'] = []
        for j, s in enumerate(sources):
            audio['separated_sources'] += [os.path.join(folder, f'AuSep_{j + 1}_{s}_{id}_{name}.wav')]
            audio['note_annotations'] += [os.path.join(folder, f'Notes_{j + 1}_{s}_{id}_{name}.txt')]
        meta_dict['audio_filename'] += [audio]

    feature_time = time.time()
    print(f"Total number of mixture audios: {audios_num}")

    def process_unit(n):
        name = meta_dict['audio_filename'][n]['mix']
        print(name)
        audio_path = os.path.join(dataset_dir, name)
        (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
        packed_hdf5_path = os.path.join(feature_dir, '{}.h5'.format(os.path.splitext(name)[0]))
        mkdir(os.path.dirname(packed_hdf5_path))
        with h5py.File(packed_hdf5_path, 'w') as hf:
            #hf.attrs.create('midi_filename', data=meta_dict['midi_filename'][n].encode(), dtype='S100')
            hf.create_dataset(name='waveform', data=float32_to_int16(audio), dtype=np.int16)

        for i, name in enumerate(meta_dict['audio_filename'][n]['separated_sources']):
            audio_path = os.path.join(dataset_dir, name)
            (audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
            (hq_audio, _) = librosa.core.load(audio_path, sr=sample_rate * 2, mono=True)

            # URMP note annotation lines are "onset\t\tfrequency\t\tduration";
            # convert them to (onset, offset, midi_note) rows.
            note_annotations_path = os.path.join(dataset_dir, meta_dict['audio_filename'][n]['note_annotations'][i])
            note_annotations = read_lst(note_annotations_path)
            note_annotations = [notes.split('\t\t') for notes in note_annotations]
            note_annotations = [[notes[0], float(notes[2]) + float(notes[0]), float(freq2note(notes[1]))]
                                for notes in note_annotations]
            note_annotations = np.array(note_annotations, dtype=np.float32)
            note_annotations_lst = ['%s\t%s\t%s' % (notes[0], str(notes[1]), str(notes[2])) for notes in note_annotations]
            ref_path = os.path.join(feature_dir, '{}_ref.txt'.format(os.path.splitext(name)[0]))
            mkdir(os.path.dirname(ref_path))
            write_lst(ref_path, note_annotations_lst)

            duration = (audio.shape[-1] + sample_rate - 1) // sample_rate
            target_processor = TargetProcessor(duration, frames_per_second, begin_note, notes_num)
            target_dict = target_processor.process(0, note_annotations)
            frame_roll = np.array(target_dict['frame_roll'], dtype=np.int64)

            train_packed_hdf5_path = os.path.join(feature_dir, '{}._TRAIN.h5'.format(os.path.splitext(name)[0]))
            test_packed_hdf5_path = os.path.join(feature_dir, '{}._TEST.h5'.format(os.path.splitext(name)[0]))

            # Pitch-shift augmentation: shifts of -4..+4 semitones, rendered from the
            # double-sample-rate copy and resampled back down.
            scale = 9
            dense_audio = remove_empty_segment(audio, frame_roll, frames_per_second, sample_rate, notes_num)
            dense_hq_audio = remove_empty_segment(hq_audio, frame_roll, frames_per_second, sample_rate * 2, notes_num)
            for i in range(scale):
                shift_pitch = i - (scale // 2)
                packed_hdf5_path = os.path.join(
                    feature_dir, '{}._TRAIN_shift_pitch_{}.h5'.format(os.path.splitext(name)[0], shift_pitch))
                if os.path.exists(packed_hdf5_path):
                    continue
                if shift_pitch == 0:
                    shift_audio = audio
                    shift_dense_audio = dense_audio
                else:
                    shift_audio = librosa.effects.pitch_shift(hq_audio, sample_rate * 2, n_steps=shift_pitch)
                    shift_audio = librosa.core.resample(shift_audio, sample_rate * 2, sample_rate)
                    shift_dense_audio = librosa.effects.pitch_shift(dense_hq_audio, sample_rate * 2, n_steps=shift_pitch)
                    shift_dense_audio = librosa.core.resample(shift_dense_audio, sample_rate * 2, sample_rate)

                # Shift the frame roll by the same number of semitones while keeping the
                # silence index (notes_num) fixed.
                shift_frame_roll = frame_roll.copy() + shift_pitch
                shift_frame_roll[shift_frame_roll == notes_num + shift_pitch] = notes_num
                shift_frame_roll = np.clip(shift_frame_roll, 0, notes_num)
                with h5py.File(packed_hdf5_path, 'w') as hf:
                    hf.create_dataset(name='shift_waveform', data=float32_to_int16(shift_audio), dtype=np.int16)
                    hf.create_dataset(name='shift_dense_waveform', data=float32_to_int16(shift_dense_audio), dtype=np.int16)
                    hf.create_dataset(name='frame_roll', data=shift_frame_roll, dtype=np.int16)

            with h5py.File(train_packed_hdf5_path, 'w') as hf:
                hf.create_dataset(name='waveform', data=float32_to_int16(audio), dtype=np.int16)
                hf.create_dataset(name='frame_roll', data=frame_roll, dtype=np.int16)
            with h5py.File(test_packed_hdf5_path, 'w') as hf:
                hf.create_dataset(name='waveform', data=float32_to_int16(audio), dtype=np.int16)
                hf.create_dataset(name='waveform_path', data=[audio_path.encode()], dtype='S200')
                hf.create_dataset(name='note_annotations_txt', data=[ref_path.encode()], dtype='S200')
                hf.create_dataset(name='frame_roll', data=frame_roll, dtype=np.int16)

    def process_group(st, ed, total_num, pid):
        print(f"process {pid + 1} starts")
        for n in range(st, ed):
            process_unit(n)
            print(f"process {pid + 1} : {n + 1}/{total_num} done.")
        print(f"process {pid + 1} ends")

    # Pack the pieces in parallel across `process_num` worker processes.
    audio_groups = get_process_groups(audios_num, process_num)
    for pid, (st, ed) in enumerate(audio_groups):
        p = multiprocessing.Process(target=process_group, args=(st, ed, audios_num, pid))
        p.start()
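# Entry-point sketch (an assumption, not the original script): pack_urmp_dataset_to_hdf5()
# only needs an object exposing the attributes read at the top of the function, so a
# minimal argparse wrapper could look like the following. The default values are illustrative.
#
# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--dataset_dir', type=str, required=True)
#     parser.add_argument('--feature_dir', type=str, required=True)
#     parser.add_argument('--process_num', type=int, default=4)
#     parser.add_argument('--sample_rate', type=int, default=16000)
#     parser.add_argument('--frames_per_second', type=int, default=100)
#     parser.add_argument('--n_fft', type=int, default=1024)
#     parser.add_argument('--begin_note', type=int, default=21)
#     parser.add_argument('--notes_num', type=int, default=88)
#     pack_urmp_dataset_to_hdf5(parser.parse_args())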
def train(model_name, load_epoch, epoch, model_folder, data_conf_path, model_conf_path):
    nnet = modelFactory(model_name, model_conf_path)
    learning_rate = 5e-4
    DECAY = 100
    mkdir(model_folder)

    # load_epoch == -1 trains from scratch; load_epoch >= 0 resumes from that checkpoint.
    if load_epoch >= 0:
        model_path = f'{model_folder}/params_epoch-{load_epoch}.pkl'
        nnet.load_state_dict(torch.load(model_path), strict=True)
    resume_epoch = load_epoch + 1

    urmp_data = UrmpSample(data_conf_path, 'train')
    train_batch_size = urmp_data.get_batch_size()
    urmp_loader = DataLoader(urmp_data, batch_size=train_batch_size, shuffle=False,
                             num_workers=1, pin_memory=True, persistent_workers=False,
                             collate_fn=urmp_data.get_collate_fn())

    def get_parameters(nnet, model_name):
        parameters = {}
        parameters['query'] = list(nnet.network.parameters())
        if model_name in ['MSI']:
            parameters['MSS-AMT'] = list(nnet.network.parameters())
        if model_name in ['UNET']:
            parameters['MSS'] = list(nnet.network.parameters())
        if model_name in ['MSI-DIS', 'AMT', 'MSS', 'MSS-AMT']:
            parameters[model_name] = list(nnet.network.parameters())
        return parameters

    def get_optimizer(r_epoch, parameters):
        optimizers = []
        for param in parameters:
            optimizer = torch.optim.Adam(parameters[param],
                                         lr=learning_rate / (2 ** (r_epoch // DECAY)),
                                         betas=(0.9, 0.999), eps=1e-08,
                                         weight_decay=0., amsgrad=True)
            optimizers.append({'mode': param, 'opt': optimizer, 'name': param})
        return optimizers

    parameters = get_parameters(nnet, model_name)
    optimizer = get_optimizer(resume_epoch, parameters)
    step_per_epoch = urmp_data.get_len() // train_batch_size

    pre_time = time.time()
    pre_time = compute_time('begin train...', pre_time)
    nnet.train()
    pre_time = compute_time('train done', pre_time)

    for i in range(resume_epoch, epoch):
        # Halve the learning rate every DECAY epochs by rebuilding the optimizers.
        if i % DECAY == 0:
            pre_time = compute_time('begin update op...', pre_time)
            optimizer = get_optimizer(i, parameters)
            print('learning rate', learning_rate / (2 ** (i // DECAY)))
        for i_batch, urmp_batch in enumerate(urmp_loader):
            urmp_batch = move_data2cuda(urmp_batch)
            for j in range(len(optimizer)):
                op = optimizer[j]['opt']
                name = optimizer[j]['name']
                op.zero_grad()
                loss, loss_text = train_step(nnet, urmp_batch, optimizer[j]['mode'])
                loss.backward()
                op.step()
                print(f"update {optimizer[j]['mode']} network epoch {i} loss: {i_batch}/{step_per_epoch}", loss_text)
                del loss
        torch.save(nnet.state_dict(), f"{model_folder}/params_epoch-{i}.pkl")
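# Usage sketch (illustrative; the config/checkpoint paths and epoch counts below are hypothetical):
#
# train(model_name='MSI-DIS',
#       load_epoch=-1,                       # -1 trains from scratch; >= 0 resumes from that checkpoint
#       epoch=300,
#       model_folder='checkpoints/MSI-DIS',
#       data_conf_path='conf/urmp_data.json',
#       model_conf_path='conf/model.json')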