Example #1
	def __init__(self, model_conf_path, data_conf_path, model_name, model_path, output_dir, epoch):
		
		self.model_name = model_name

		self.test_data = load_test_data(data_conf_path)
		model_tag = "MSI-DIS" if model_name == "SYS" else model_name
		self.network = load_gpu_model(model_conf_path, model_tag, model_path)
		
		mkdir(output_dir)
		self.output_dir = output_dir
		self.score_path = os.path.join(output_dir, f"scores-{epoch}.json")
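# The constructor above relies on a few small helpers that are not shown in this excerpt.
# A minimal sketch of what `mkdir` is assumed to do (create the directory tree, tolerating
# the case where it already exists); the project's actual helper may differ:
import os

def mkdir_sketch(path):
	# equivalent to `mkdir -p`
	os.makedirs(path, exist_ok=True)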
Example #2
	def synthesis(self, path, track_id):
		output_dir = self.output_dir
		model_name = self.model_name	
		song = WeiMidi(path)
		song = song[track_id]
		# take seconds 25-35 of the frame roll (assuming 100 frames per second)
		song = song[25 * 100 : 35 * 100]
		#target = onehot_tensor(device(target).long())
		sample_index = 0

		for sample in self.test_data.test_samples():
			sample_index += 1
			mix = device(sample['mix'].sum(0))
			instrs = sample['instrs']
			sample_dir = f"{output_dir}/sample_{sample_index}"
			mkdir(sample_dir)
			print(mix.shape[-1] - 16000 * 45)
			# crop the mixture to seconds 35-45 (assuming a 16 kHz sample rate)
			mix = mix[:, 16000 * 35 : 16000 * 45]


			for i in range(INSTR_NUM):
				instr_name, audio, _, target, query, _ = instrs[i]
				query = device(query[0])
				
				target = adapt_pitch(song, target)
				target = onehot_tensor(device(target).long())


				mix_spec, mix_cos, mix_sin = wav2spec(mix)
				query_spec, _, _ = wav2spec(query)

				wav_len = mix.shape[-1]
				spec_len = mix_spec.shape[-2]

				if spec_len > target.shape[-1]:
					spec_len = target.shape[-1]
					mix_spec = mix_spec.transpose(0, -2)[:spec_len].transpose(0, -2)

				mix_spec_batches = devide_into_batches(mix_spec, duration_axis=-2)
				query_spec_batches = devide_into_batches(query_spec, duration_axis=-2)

				hQuery = self.query(query_spec_batches)

				target_batches = devide_into_batches(target, duration_axis=-1)
				batches = zip(mix_spec_batches, target_batches)

				preds = self.predict(batches, hQuery, "synthesis")

				synthesis_batches = merge_from_list(preds, index=0)
				synthesis_spec = merge_batches(synthesis_batches, duration_axis=-2)
				synthesis_spec = align(synthesis_spec, mix_cos, -2)
				synthesis_wav = spec2wav(synthesis_spec, mix_cos, mix_sin, 160000, syn_phase=1)

				synthesis_wav_path = f"{sample_dir}/{instr_name}_{model_name}.wav"
				save_audio(synthesis_wav[:, :160000], synthesis_wav_path)
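# `devide_into_batches` / `merge_batches` are not shown in this excerpt. A minimal sketch of
# the chunk-then-merge pattern they are assumed to implement: split a long tensor into
# fixed-length segments along a time axis, process each segment, then concatenate the
# results back along the same axis. The segment length used here is a placeholder, not the
# value used by the real helpers.
import torch

def devide_into_batches_sketch(x, duration_axis=-2, segment_len=1000):
	# returns a list of tensors, each at most `segment_len` long on `duration_axis`
	return list(torch.split(x, segment_len, dim=duration_axis))

def merge_batches_sketch(batches, duration_axis=-2):
	# inverse of the split above: concatenate the segments back along the time axis
	return torch.cat(batches, dim=duration_axis)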
Example #3
def save_test_lst(data, output_folder):
    testset_folder = os.path.join(output_folder, "testset")
    mkdir(testset_folder)
    test_lst = []
    query_lst = []
    for songName in data:
        test_lst += data[songName]["test"]
        query_lst += data[songName]["query"]
    test_lst = [f"{t[0]},{t[1]}\t{t[2]},{t[3]}" for t in test_lst]
    query_lst = [f"{t[0]},{t[1]}\t{t[2]},{t[3]}" for t in query_lst]
    print("test set", len(test_lst))
    test_lst_path = os.path.join(testset_folder, "test.lst")
    query_lst_path = os.path.join(testset_folder, "query.lst")
    write_lst(test_lst_path, test_lst)
    write_lst(query_lst_path, query_lst)
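# `write_lst` (and its counterpart `read_lst`, used later) are assumed to be simple
# line-based text I/O helpers; a minimal sketch consistent with how they are used here:
def write_lst_sketch(path, lst):
    # write one entry per line
    with open(path, 'w') as f:
        f.write('\n'.join(str(x) for x in lst))

def read_lst_sketch(path):
    # read the file back as a list of lines (without trailing newlines)
    with open(path) as f:
        return f.read().splitlines()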
Example #4
def save_train_lst(data, output_folder):
    for instr in data:
        instr_folder = os.path.join(output_folder, instr)
        mkdir(instr_folder)
        path = os.path.join(instr_folder, "train.lst")
        write_lst(path, data[instr])
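# Hypothetical usage sketch: `data` is assumed to map an instrument name to the list of
# training entries written into `<output_folder>/<instr>/train.lst`. The paths below are
# placeholders for illustration only.
example_train_data = {
    'vn': ['features/vn_sample_0.h5', 'features/vn_sample_1.h5'],
    'vc': ['features/vc_sample_0.h5'],
}
# save_train_lst(example_train_data, 'output/train_lists')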
Example #5
	def inference(self):
		
		results = {}
		sample_index = -1
		output_dir = self.output_dir

		model_name = self.model_name
		for mode in modes[model_name]:
			results[mode] = []

		for sample in self.test_data.test_samples():
			sample_index += 1
			mix = device(sample['mix'].sum(0))
			instrs = sample['instrs']
			
			result = {}
			for mode in modes[model_name]:
				result[mode] = {}

			for i in range(INSTR_NUM):
				instr_name, audio, annotation, target, query, query_annotation = instrs[i]
			
				for mode in modes[model_name]:
					result[mode][instr_name] = {"separation" : [], "transcription" : []}
	
				query = device(query[0])
				query_annotation = query_annotation[0]

				mix_spec, mix_cos, mix_sin = wav2spec(mix)
				query_spec, _, _ = wav2spec(query)

				
				wav_len = mix.shape[-1]
				spec_len = mix_spec.shape[-2]
				
				mix_spec_batches = devide_into_batches(mix_spec, duration_axis=-2)
				query_spec_batches = devide_into_batches(query_spec, duration_axis=-2)

				hQuery = self.query(query_spec_batches)
				
				if model_name in ["AMT", "MSS", "MSS-AMT", "UNET"]:
					batches = mix_spec_batches
				else:
					target = onehot_tensor(device(target).long())
					target_batches = devide_into_batches(target, duration_axis=-1)
					batches = zip(mix_spec_batches, target_batches)
				
				preds = self.predict(batches, hQuery)
				
				sample_dir = f"{output_dir}/sample_{sample_index}"
				mkdir(sample_dir)

	
				if model_name in ["AMT", "MSS-AMT", "MSI", "MSI-DIS"]:
					transcription_batches = merge_from_list(preds, index=-1)					
					prob = merge_batches(transcription_batches, duration_axis=-1)
					est_annotation = parse_frameroll2annotation(np.argmax(prob.cpu().numpy(), 0))
					est_annotation_path = f"{sample_dir}/{instr_name}_est.txt" 
					write_lst(est_annotation_path, est_annotation)
					
					# remap the annotation path from its original cluster location to the local dataset copy
					annotation = str.replace(annotation, '/gpfsnyu/home/ll4270/music/transcription/wei_transcription', '/scratch/gx219/wei_env/data-source/dataset')
					result[model_name][instr_name]['transcription'].append([est_annotation_path, annotation])


				if not model_name == "AMT":
					separated_batches = merge_from_list(preds, index=0)
					separated_spec = merge_batches(separated_batches, duration_axis=-2)
					separated_spec = align(separated_spec, mix_cos, -2)
					separated_wav = spec2wav(separated_spec, mix_cos, mix_sin, wav_len)

					separated_wav_path = f"{sample_dir}/{instr_name}_{model_name}.wav"
					mix_path = f"{sample_dir}/Mixture.wav"
					ref_path = f"{sample_dir}/{instr_name}_ref.wav"
					save_audio(separated_wav, separated_wav_path)
					save_audio(mix, mix_path)
					save_audio(torch.from_numpy(audio), ref_path)
					result[model_name][instr_name]['separation'].append([separated_wav_path, ref_path])

				if model_name in ["MSI", "MSI-DIS"]:
					separated_batches = merge_from_list(preds, index=1)
					separated_spec = merge_batches(separated_batches, duration_axis=-2)
					separated_spec = align(separated_spec, mix_cos, -2)
					separated_wav = spec2wav(separated_spec, mix_cos, mix_sin, wav_len)
					separated_wav_path = f"{sample_dir}/{instr_name}_{model_name}-S.wav"
					save_audio(separated_wav, separated_wav_path)
					result[f"{model_name}-S"][instr_name]['separation'].append([separated_wav_path, ref_path])


			for mode in modes[model_name]:
				results[mode].append(result[mode])

		return results
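# `onehot_tensor` (used in both `synthesis` and `inference`) is assumed to turn a frame
# roll of per-frame pitch indices into a one-hot tensor with the class dimension first and
# time last, matching its use as a target batched along duration_axis=-1. A minimal sketch
# under that assumption; `num_classes` is a hypothetical parameter, not the project's
# actual constant.
import torch
import torch.nn.functional as F

def onehot_tensor_sketch(frame_roll, num_classes):
	# frame_roll: LongTensor of per-frame pitch indices, shape (T,)
	onehot = F.one_hot(frame_roll, num_classes=num_classes)  # (T, num_classes)
	return onehot.transpose(0, 1).float()                    # (num_classes, T)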
def pack_urmp_dataset_to_hdf5(args):

	dataset_dir = args.dataset_dir
	feature_dir = args.feature_dir
	process_num = args.process_num

	sample_rate = args.sample_rate
	frames_per_second = args.frames_per_second
	n_fft = args.n_fft
	begin_note = args.begin_note
	notes_num = args.notes_num
	

	mkdir(feature_dir)

	meta_dict = {}
	meta_dict['audio_filename'] = []
	audios_num = 0

	for folder in os.listdir(dataset_dir):
		if folder.startswith("._"):
			continue
		meta_data = folder.split('_')
		if len(meta_data) < 4:
			continue	
		audios_num += 1
		id = meta_data[0]
		name = meta_data[1]
		sources = meta_data[2:]
		audio = {}
		audio['mix'] = os.path.join(folder, f'AuMix_{folder}.wav')
		audio['separated_sources'] = []
		audio['note_annotations'] = []
		for j, s in enumerate(sources):
			audio['separated_sources'] += [os.path.join(folder, f'AuSep_{j + 1}_{s}_{id}_{name}.wav')]
			audio['note_annotations'] += [os.path.join(folder, f'Notes_{j + 1}_{s}_{id}_{name}.txt')]
	
		meta_dict['audio_filename'] += [audio]

	feature_time = time.time()
	print(f"The total number of the mixture audio is {audios_num}")
	def process_unit(n):
	
		name = meta_dict['audio_filename'][n]['mix']
		print(name)
		audio_path = os.path.join(dataset_dir, name)
		(audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
		packed_hdf5_path = os.path.join(feature_dir, '{}.h5'.format(os.path.splitext(name)[0]))
		mkdir(os.path.dirname(packed_hdf5_path))
		with h5py.File(packed_hdf5_path, 'w') as hf:
			#hf.attrs.create('midi_filename', data=meta_dict['midi_filename'][n].encode(), dtype='S100')
			hf.create_dataset(name='waveform', data=float32_to_int16(audio), dtype=np.int16)

		for i, name in enumerate(meta_dict['audio_filename'][n]['separated_sources']):
			audio_path = os.path.join(dataset_dir, name)

			(audio, _) = librosa.core.load(audio_path, sr=sample_rate, mono=True)
			(hq_audio, _) = librosa.core.load(audio_path, sr=sample_rate * 2, mono=True)

			note_annotations_path = os.path.join(dataset_dir, meta_dict['audio_filename'][n]['note_annotations'][i])
			note_annotations = read_lst(note_annotations_path)
			note_annotations = [notes.split('\t\t') for notes in note_annotations]
			note_annotations = [[notes[0], float(notes[2]) + float(notes[0]), float(freq2note(notes[1]))] for notes in note_annotations]
			note_annotations = np.array(note_annotations, dtype = np.float32)
			note_annotations_lst = ['%s\t%s\t%s' % (notes[0], str(notes[1]), str(notes[2])) for notes in note_annotations]
			ref_path = os.path.join(feature_dir, '{}_ref.txt'.format(os.path.splitext(name)[0]))
			mkdir(os.path.dirname(packed_hdf5_path))
			write_lst(ref_path, note_annotations_lst)

			duration = (audio.shape[-1] + sample_rate - 1) // sample_rate
			target_processor = TargetProcessor(duration, frames_per_second, begin_note, notes_num)
			target_dict = target_processor.process(0, note_annotations)
			frame_roll = np.array(target_dict['frame_roll'], dtype=int)  # np.int is removed in recent NumPy
			

			train_packed_hdf5_path = os.path.join(feature_dir, '{}._TRAIN.h5'.format(os.path.splitext(name)[0]))
			test_packed_hdf5_path = os.path.join(feature_dir, '{}._TEST.h5'.format(os.path.splitext(name)[0]))

			scale = 9
			dense_audio = remove_empty_segment(audio, frame_roll, frames_per_second, sample_rate, notes_num)
			dense_hq_audio = remove_empty_segment(hq_audio, frame_roll, frames_per_second, sample_rate * 2, notes_num)

			for shift_idx in range(scale):  # do not reuse `i`, which already indexes the separated sources above
				shift_pitch = shift_idx - (scale // 2)
				packed_hdf5_path = os.path.join(feature_dir, '{}._TRAIN_shift_pitch_{}.h5'.format(os.path.splitext(name)[0], shift_pitch))
				if os.path.exists(packed_hdf5_path):
					continue

				if shift_pitch == 0:
					shift_audio = audio
					shift_dense_audio = dense_audio
				else:
					# keyword arguments keep these calls compatible with both older and newer librosa signatures
					shift_audio = librosa.effects.pitch_shift(hq_audio, sr=sample_rate * 2, n_steps=shift_pitch)
					shift_audio = librosa.core.resample(shift_audio, orig_sr=sample_rate * 2, target_sr=sample_rate)
					shift_dense_audio = librosa.effects.pitch_shift(dense_hq_audio, sr=sample_rate * 2, n_steps=shift_pitch)
					shift_dense_audio = librosa.core.resample(shift_dense_audio, orig_sr=sample_rate * 2, target_sr=sample_rate)

				shift_frame_roll = frame_roll.copy() + shift_pitch
				shift_frame_roll[shift_frame_roll == notes_num + shift_pitch] = notes_num
				shift_frame_roll = np.clip(shift_frame_roll, 0, notes_num)

				with h5py.File(packed_hdf5_path, 'w') as hf:
					hf.create_dataset(name='shift_waveform', data=float32_to_int16(shift_audio), dtype=np.int16)
					hf.create_dataset(name='shift_dense_waveform', data=float32_to_int16(shift_dense_audio), dtype=np.int16)
					hf.create_dataset(name='frame_roll', data=shift_frame_roll, dtype=np.int16)

			with h5py.File(train_packed_hdf5_path, 'w') as hf:
				hf.create_dataset(name='waveform', data=float32_to_int16(audio), dtype=np.int16)
				hf.create_dataset(name='frame_roll', data=frame_roll, dtype=np.int16)

			with h5py.File(test_packed_hdf5_path, 'w') as hf:				
				hf.create_dataset(name='waveform', data=float32_to_int16(audio), dtype=np.int16)
				hf.create_dataset(name='waveform_path', data=[audio_path.encode()], dtype='S200')
				hf.create_dataset(name='note_annotations_txt', data=[ref_path.encode()], dtype='S200')
				hf.create_dataset(name='frame_roll', data=frame_roll, dtype=np.int16)	

	def process_group(st, ed, total_num, pid):
		print(f"process {pid + 1} starts")
		for n in range(st, ed):
			process_unit(n)
			print(f"process {pid + 1} : {n + 1}/{total_num} done.")
		print(f"process {pid + 1} ends")


	audio_groups = get_process_groups(audios_num, process_num)
	for pid, (st, ed) in enumerate(audio_groups):
		p = multiprocessing.Process(target = process_group, args = (st, ed, audios_num, pid))
		p.start()
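# Two helpers used above, sketched under common conventions (the project's exact
# implementations are not shown in this excerpt): `freq2note` is assumed to map a
# frequency in Hz to a (possibly fractional) MIDI note number, and `float32_to_int16`
# to rescale a [-1, 1] float waveform to 16-bit integers.
import numpy as np

def freq2note_sketch(freq_hz):
	# MIDI note number: A4 = 440 Hz = note 69, 12 semitones per octave
	return 69.0 + 12.0 * np.log2(float(freq_hz) / 440.0)

def float32_to_int16_sketch(audio):
	audio = np.clip(audio, -1.0, 1.0)
	return (audio * 32767.0).astype(np.int16)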
def train(model_name, load_epoch, epoch, model_folder, data_conf_path,
          model_conf_path):

    nnet = modelFactory(model_name, model_conf_path)

    learning_rate = 5e-4
    DECAY = 100

    mkdir(model_folder)

    if load_epoch >= 0:
        model_path = f'{model_folder}/params_epoch-{load_epoch}.pkl'
        nnet.load_state_dict(torch.load(model_path), strict=True)

    resume_epoch = load_epoch + 1

    urmp_data = UrmpSample(data_conf_path, 'train')
    train_batch_size = urmp_data.get_batch_size()

    urmp_loader = DataLoader(urmp_data,
                             batch_size=train_batch_size,
                             shuffle=False,
                             num_workers=1,
                             pin_memory=True,
                             persistent_workers=False,
                             collate_fn=urmp_data.get_collate_fn())

    def get_parameters(nnet, model_name):
        parameters = {}
        parameters['query'] = list(nnet.network.parameters())

        if model_name in ['MSI']:
            parameters['MSS-AMT'] = list(nnet.network.parameters())
        if model_name in ['UNET']:
            parameters['MSS'] = list(nnet.network.parameters())
        if model_name in ['MSI-DIS', 'AMT', 'MSS', 'MSS-AMT']:
            parameters[model_name] = list(nnet.network.parameters())

        return parameters

    def get_optimizer(r_epoch, parameters):
        optimizers = []
        for param in parameters:
            optimizer = torch.optim.Adam(parameters[param], lr=learning_rate / (2**(r_epoch // DECAY)), \
              betas=(0.9, 0.999), eps=1e-08, weight_decay=0., amsgrad=True)
            optimizers.append({'mode': param, 'opt': optimizer, 'name': param})
        return optimizers

    parameters = get_parameters(nnet, model_name)
    optimizer = get_optimizer(resume_epoch, parameters)
    step_per_epoch = urmp_data.get_len() // train_batch_size

    pre_time = time.time()
    pre_time = compute_time(f'begin train...', pre_time)
    nnet.train()
    pre_time = compute_time(f'train done', pre_time)
    for i in range(resume_epoch, epoch):
        if i % DECAY == 0:
            pre_time = compute_time(f'begin update op...', pre_time)
            # rebuild the optimizers so the decayed learning rate for the current epoch is actually applied
            optimizer = get_optimizer(i, parameters)
            print('learning rate', learning_rate / (2**(i // DECAY)))

        for i_batch, urmp_batch in enumerate(urmp_loader):
            urmp_batch = move_data2cuda(urmp_batch)
            for j in range(len(optimizer)):
                op = optimizer[j]['opt']
                name = optimizer[j]['name']
                op.zero_grad()
                loss, loss_text = train_step(nnet, urmp_batch,
                                             optimizer[j]['mode'])
                loss.backward()
                op.step()
                print(
                    f"update {optimizer[j]['mode']} network epoch {i} loss: {i_batch}/{step_per_epoch}",
                    loss_text)
                del loss
        torch.save(nnet.state_dict(), f"{model_folder}/params_epoch-{i}.pkl")
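# `move_data2cuda` is assumed to push every tensor in a (possibly nested) batch onto the
# GPU; a minimal sketch under that assumption, kept deliberately generic:
import torch

def move_data2cuda_sketch(batch):
    if torch.is_tensor(batch):
        return batch.cuda(non_blocking=True)
    if isinstance(batch, dict):
        return {k: move_data2cuda_sketch(v) for k, v in batch.items()}
    if isinstance(batch, (list, tuple)):
        return type(batch)(move_data2cuda_sketch(v) for v in batch)
    return batch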