def split_wav(input_example, min_length, max_length, sample_rate,
              debug_output_directory, split_example,
              load_audio_with_librosa):
  """Splits wav and midi files for the dataset."""
  tf.logging.info('Splitting %s',
                  input_example.features.feature['id'].bytes_list.value[0])

  wav_data = input_example.features.feature['audio'].bytes_list.value[0]

  ns = music_pb2.NoteSequence.FromString(
      input_example.features.feature['sequence'].bytes_list.value[0])

  Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

  if not split_example:
    split_examples = audio_label_data_utils.process_record(
        wav_data,
        ns,
        ns.id,
        min_length=0,
        max_length=-1,
        sample_rate=sample_rate,
        allow_empty_notesequence=True,
        load_audio_with_librosa=load_audio_with_librosa)

    for example in split_examples:
      Metrics.counter('split_wav', 'full_example').inc()
      yield example
  else:
    try:
      split_examples = audio_label_data_utils.process_record(
          wav_data,
          ns,
          ns.id,
          min_length=min_length,
          max_length=max_length,
          sample_rate=sample_rate,
          load_audio_with_librosa=load_audio_with_librosa)

      for example in split_examples:
        Metrics.counter('split_wav', 'split_example').inc()
        yield example
    except AssertionError:
      output_file = 'badexample-' + hashlib.md5(ns.id).hexdigest() + '.proto'
      output_path = os.path.join(debug_output_directory, output_file)
      tf.logging.error('Exception processing %s. Writing file to %s', ns.id,
                       output_path)
      with tf.gfile.Open(output_path, 'w') as f:
        f.write(input_example.SerializeToString())
      raise
def generate_train_set(exclude_ids):
  """Generate the train TFRecord."""
  train_file_pairs = []
  for directory in train_dirs:
    path = os.path.join(FLAGS.input_dir, directory)
    path = os.path.join(path, '*.wav')
    wav_files = glob.glob(path)
    # find matching mid files
    for wav_file in wav_files:
      base_name_root, _ = os.path.splitext(wav_file)
      mid_file = base_name_root + '.mid'
      if filename_to_id(wav_file) not in exclude_ids:
        train_file_pairs.append((wav_file, mid_file))

  train_output_name = os.path.join(FLAGS.output_dir,
                                   'maps_config2_train.tfrecord')

  with tf.python_io.TFRecordWriter(train_output_name) as writer:
    for idx, pair in enumerate(train_file_pairs):
      print('{} of {}: {}'.format(idx, len(train_file_pairs), pair[0]))
      # load the wav data
      wav_data = tf.gfile.Open(pair[0], 'rb').read()
      # load the midi data and convert to a notesequence
      ns = midi_io.midi_file_to_note_sequence(pair[1])

      for example in audio_label_data_utils.process_record(
          wav_data, ns, pair[0], FLAGS.min_length, FLAGS.max_length,
          FLAGS.sample_rate):
        writer.write(example.SerializeToString())
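# The snippet above assumes a `filename_to_id` helper that maps a MAPS
# .wav/.mid path to its piece id so held-out pieces can be excluded from
# training. A minimal sketch, assuming MAPS-style names like
# 'MAPS_MUS-<piece>_<instrument>.wav'; the regex is an assumption, not the
# source's implementation.
import os
import re


def filename_to_id(filename):
  """Extracts the piece id from a MAPS filename (illustrative sketch)."""
  match = re.match(r'.*MUS-(.+)_[^_]+\.\w{3}', os.path.basename(filename))
  return match.group(1) if match else filename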
def infer(filename):
  # Read the WAV file as binary
  wav = open(filename, 'rb')
  wav_data = wav.read()
  wav.close()
  tf.logging.info('User .wav file %s, length %s bytes', filename,
                  len(wav_data))

  ## Preprocessing
  # Split into chunks, then convert to Protocol Buffers
  to_process = []
  examples = list(
      audio_label_data_utils.process_record(
          wav_data=wav_data,
          ns=music_pb2.NoteSequence(),
          example_id=filename,
          min_length=0,
          max_length=-1,
          allow_empty_notesequence=True))

  # Serialize the split buffers
  to_process.append(examples[0].SerializeToString())

  # Feed the serialized buffers into the iterator
  sess.run(iterator.initializer, {example: to_process})

  # Inference
  predictions = list(
      estimator.predict(input_fn, yield_single_examples=False))

  # Assert that there is exactly one prediction
  assert len(predictions) == 1

  # Fetch the prediction results
  frame_predictions = predictions[0]['frame_predictions'][0]
  onset_predictions = predictions[0]['onset_predictions'][0]  # note onsets
  velocity_values = predictions[0]['velocity_values'][0]  # dynamics

  # Encode to MIDI
  sequence_prediction = sequences_lib.pianoroll_to_note_sequence(
      frame_predictions,
      frames_per_second=data.hparams_frames_per_second(hparams),
      min_duration_ms=0,
      min_midi_pitch=constants.MIN_MIDI_PITCH,
      onset_predictions=onset_predictions,
      velocity_values=velocity_values)

  basename = os.path.split(os.path.splitext(filename)[0])[1] + '.mid'
  output_filename = os.path.join('', basename)

  midi_io.sequence_proto_to_midi_file(sequence_prediction, output_filename)

  print('Program ended; your MIDI file is in', output_filename)
  sess.close()
def process(files):
  # `to_process`, `iterator`, `examples`, `next_record`, `estimator`, and
  # `output` are assumed to be defined in earlier cells of this notebook.
  for fn in files:
    print('**\n\n', fn, '\n\n**')
    with open(fn, 'rb', buffering=0) as f:
      wav_data = f.read()
    example_list = list(
        audio_label_data_utils.process_record(
            wav_data=wav_data,
            ns=music_pb2.NoteSequence(),
            example_id=fn,
            min_length=0,
            max_length=-1,
            allow_empty_notesequence=True))
    assert len(example_list) == 1
    to_process.append(example_list[0].SerializeToString())
    print('Processing complete for', fn)

    sess = tf.Session()
    sess.run([
        tf.initializers.global_variables(),
        tf.initializers.local_variables()
    ])
    sess.run(iterator.initializer, {examples: to_process})

    def transcription_data(params):
      del params
      return tf.data.Dataset.from_tensors(sess.run(next_record))

    input_fn = infer_util.labels_to_features_wrapper(transcription_data)

    #@title Run inference
    prediction_list = list(
        estimator.predict(input_fn, yield_single_examples=False))
    assert len(prediction_list) == 1

    # Ignore warnings caused by pyfluidsynth
    import warnings
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    sequence_prediction = music_pb2.NoteSequence.FromString(
        prediction_list[0]['sequence_predictions'][0])

    pathname = fn.split('/').pop()
    print('**\n\n', pathname, '\n\n**')
    midi_filename = '{outputs}/{file}.mid'.format(outputs=output,
                                                  file=pathname)
    midi_io.sequence_proto_to_midi_file(sequence_prediction, midi_filename)
def create_example(filename):
  """Processes an audio file into an Example proto."""
  wav_data = tf.gfile.Open(filename, 'rb').read()
  example_list = list(
      audio_label_data_utils.process_record(
          wav_data=wav_data,
          ns=music_pb2.NoteSequence(),
          # decode to handle filenames with extended characters.
          example_id=six.ensure_text(filename, 'utf-8'),
          min_length=0,
          max_length=-1,
          allow_empty_notesequence=True))
  assert len(example_list) == 1
  return example_list[0].SerializeToString()
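# Usage sketch for `create_example`: write each serialized Example into a
# TFRecord file, mirroring the TF1 writer used in generate_train_set above.
# The filenames and output path here are illustrative assumptions, not from
# the source.
wav_filenames = ['piano1.wav', 'piano2.wav']  # hypothetical inputs
with tf.python_io.TFRecordWriter('/tmp/examples.tfrecord') as writer:
  for name in wav_filenames:
    writer.write(create_example(name))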
def process(self, paths):
  wav_path, midi_path = paths

  if midi_path:
    if FLAGS.use_midi_stems:
      base_ns = note_sequence_from_directory(os.path.dirname(midi_path))
    else:
      base_ns = midi_io.midi_file_to_note_sequence(midi_path)
    base_ns.filename = midi_path
  else:
    base_ns = music_pb2.NoteSequence()

  logging.info('Creating Example %s:%s', midi_path, wav_path)
  if FLAGS.convert_flac:
    samples, sr = librosa.load(wav_path, FLAGS.sample_rate)
    wav_data = audio_io.samples_to_wav_data(samples, sr)
  else:
    wav_data = tf.io.gfile.GFile(wav_path, 'rb').read()

  ns = copy.deepcopy(base_ns)

  # Use base names.
  ns.id = '%s:%s' % (wav_path, midi_path)

  Metrics.counter('create_example', 'read_midi_wav').inc()

  if FLAGS.max_length > 0:
    split_examples = audio_label_data_utils.process_record(
        wav_data,
        ns,
        ns.id,
        min_length=FLAGS.min_length,
        max_length=FLAGS.max_length,
        sample_rate=FLAGS.sample_rate,
        load_audio_with_librosa=False)

    for example in split_examples:
      Metrics.counter('split_wav', 'split_example').inc()
      yield example
  else:
    example = audio_label_data_utils.create_example(ns.id, ns, wav_data)
    Metrics.counter('create_example', 'created_example').inc()
    yield example
def inference(filename):
  # Read the audio file (.wav)
  wav_file = open(filename, mode='rb')
  wav_data = wav_file.read()
  wav_file.close()
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=filename, length=len(wav_data)))

  # Split into chunks, then build protobuf examples
  to_process = []
  example_list = list(
      audio_label_data_utils.process_record(
          wav_data=wav_data,
          ns=music_pb2.NoteSequence(),
          example_id=filename,
          min_length=0,
          max_length=-1,
          allow_empty_notesequence=True))

  # Serialize
  to_process.append(example_list[0].SerializeToString())

  # Run the session
  sess.run(iterator.initializer, {examples: to_process})

  # Predict
  prediction_list = list(
      estimator.predict(input_fn, yield_single_examples=False))
  assert len(prediction_list) == 1

  # Fetch the prediction results
  frame_predictions = prediction_list[0]['frame_predictions'][0]
  onset_predictions = prediction_list[0]['onset_predictions'][0]
  velocity_values = prediction_list[0]['velocity_values'][0]

  # Build a MIDI sequence from the prediction results
  sequence_prediction = sequences_lib.pianoroll_to_note_sequence(
      frame_predictions,
      frames_per_second=data.hparams_frames_per_second(hparams),
      min_duration_ms=0,
      min_midi_pitch=constants.MIN_MIDI_PITCH,
      onset_predictions=onset_predictions,
      velocity_values=velocity_values)

  basename = os.path.split(os.path.splitext(filename)[0])[1] + '.mid'
  output_filename = os.path.join(env.MIDI_DIRECTORY, basename)

  # Write the MIDI sequence out as a file
  midi_io.sequence_proto_to_midi_file(sequence_prediction, output_filename)

  return basename
def mix_examples(mixid_exs, sample_rate, load_audio_with_librosa):
  """Mix several Examples together to create a new example."""
  mixid, exs = mixid_exs
  del mixid

  example_samples = []
  example_sequences = []
  for ex_str in exs:
    ex = tf.train.Example.FromString(ex_str)
    wav_data = ex.features.feature['audio'].bytes_list.value[0]
    if load_audio_with_librosa:
      samples = audio_io.wav_data_to_samples_librosa(wav_data, sample_rate)
    else:
      samples = audio_io.wav_data_to_samples(wav_data, sample_rate)
    example_samples.append(samples)
    ns = music_pb2.NoteSequence.FromString(
        ex.features.feature['sequence'].bytes_list.value[0])
    example_sequences.append(ns)

  mixed_samples, mixed_sequence = audio_label_data_utils.mix_sequences(
      individual_samples=example_samples,
      sample_rate=sample_rate,
      individual_sequences=example_sequences)

  mixed_wav_data = audio_io.samples_to_wav_data(mixed_samples, sample_rate)

  mixed_id = '::'.join(['mixed'] + [ns.id for ns in example_sequences])
  mixed_sequence.id = mixed_id
  mixed_filename = '::'.join(['mixed'] +
                             [ns.filename for ns in example_sequences])
  mixed_sequence.filename = mixed_filename

  examples = list(
      audio_label_data_utils.process_record(
          mixed_wav_data,
          mixed_sequence,
          mixed_id,
          min_length=0,
          max_length=-1,
          sample_rate=sample_rate))
  assert len(examples) == 1
  return examples[0]
def testSplitAudioLabelData(self):
  wav_data, sequence = self._CreateSyntheticExample()
  records = audio_label_data_utils.process_record(
      wav_data, sequence, 'test', sample_rate=SAMPLE_RATE)

  for record in records:
    audio = record.features.feature['audio'].bytes_list.value[0]
    velocity_range = music_pb2.VelocityRange.FromString(
        record.features.feature['velocity_range'].bytes_list.value[0])
    note_sequence = music_pb2.NoteSequence.FromString(
        record.features.feature['sequence'].bytes_list.value[0])

    expected_samples = np.zeros(10 * SAMPLE_RATE)
    np.testing.assert_array_equal(
        expected_samples,
        audio_io.wav_data_to_samples(audio, sample_rate=SAMPLE_RATE))
    self.assertEqual(velocity_range.min, 20)
    self.assertEqual(velocity_range.max, 80)
    self.assertEqual(note_sequence.notes[0].velocity, 20)
    self.assertEqual(note_sequence.notes[0].end_time, 5.)
    self.assertEqual(note_sequence.notes[1].velocity, 80)
    self.assertEqual(note_sequence.notes[1].end_time, 10.)
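# `_CreateSyntheticExample` is defined elsewhere in the test class. A sketch
# of what it presumably builds, inferred from the assertions above: 10 s of
# silent audio plus two notes with velocities 20 and 80 ending at 5 s and
# 10 s. The pitches are illustrative assumptions.
def _CreateSyntheticExample(self):
  sequence = music_pb2.NoteSequence()
  sequence.notes.add(pitch=60, velocity=20, start_time=0., end_time=5.)
  sequence.notes.add(pitch=64, velocity=80, start_time=5., end_time=10.)
  sequence.total_time = 10.
  samples = np.zeros(10 * SAMPLE_RATE)
  wav_data = audio_io.samples_to_wav_data(samples, SAMPLE_RATE)
  return wav_data, sequence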
def transcribe(audio, sr, cuda=False):
  """Transcribes decoded audio samples with Onsets and Frames.

  The upstream API expects an audio path (raw wav bytes) rather than decoded
  samples, losing the decoupling between file format and DSP. Input: audio
  samples and sample rate. Output: an asmd-style matrix with rows of
  (pitch, onset, offset, velocity).
  """
  # Work around the raw-wav requirement: monkey-patch the decoder so we can
  # accept audio data that is already loaded and keep a reasonable interface
  # (decoupling i/o from processing). `google_sucks` is a pass-through stub
  # defined elsewhere in this module.
  original_google_sucks = audio_io.wav_data_to_samples
  audio_io.wav_data_to_samples = google_sucks
  audio = np.array(audio)

  config = configs.CONFIG_MAP['onsets_frames']
  hparams = config.hparams
  hparams.use_cudnn = cuda
  hparams.batch_size = 1
  examples = tf.placeholder(tf.string, [None])

  dataset = data.provide_batch(
      examples=examples,
      preprocess_examples=True,
      params=hparams,
      is_training=False,
      shuffle_examples=False,
      skip_n_initial_records=0)

  estimator = train_util.create_estimator(config.model_fn, CHECKPOINT_DIR,
                                          hparams)

  iterator = dataset.make_initializable_iterator()
  next_record = iterator.get_next()

  example_list = list(
      audio_label_data_utils.process_record(
          wav_data=audio,
          sample_rate=sr,
          ns=music_pb2.NoteSequence(),
          example_id="fakeid",
          min_length=0,
          max_length=-1,
          allow_empty_notesequence=True,
          load_audio_with_librosa=False))
  assert len(example_list) == 1
  to_process = [example_list[0].SerializeToString()]

  sess = tf.Session()
  sess.run([
      tf.initializers.global_variables(),
      tf.initializers.local_variables()
  ])
  sess.run(iterator.initializer, {examples: to_process})

  def transcription_data(params):
    del params
    return tf.data.Dataset.from_tensors(sess.run(next_record))

  # Restore the original decoder (the pipeline still writes and reloads the
  # wav data internally, though).
  audio_io.wav_data_to_samples = original_google_sucks

  input_fn = infer_util.labels_to_features_wrapper(transcription_data)
  prediction_list = list(
      estimator.predict(input_fn, yield_single_examples=False))
  assert len(prediction_list) == 1

  notes = music_pb2.NoteSequence.FromString(
      prediction_list[0]['sequence_predictions'][0]).notes

  out = np.empty((len(notes), 4))
  for i, note in enumerate(notes):
    out[i] = [note.pitch, note.start_time, note.end_time, note.velocity]
  return out
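# The `google_sucks` stub patched in above is defined outside this snippet.
# A minimal sketch of its presumed behavior: since `audio` is already a
# decoded sample array, the patched decoder simply passes it through. This
# is an assumption about the out-of-snippet definition.
def google_sucks(samples, sr):
  del sr  # the samples are already decoded at the caller's sample rate
  return samples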
"""# Upload Audio Run the following cell to upload audio files. """ # pianoAudio = 'misty.wav' pianoAudio = 'lalala.wav' to_process = [] with open(pianoAudio, mode='rb') as file: wav_data = file.read() example_list = list( audio_label_data_utils.process_record(wav_data=wav_data, ns=music_pb2.NoteSequence(), example_id='accompaniment.wav', min_length=0, max_length=-1, allow_empty_notesequence=True)) to_process.append(example_list[0].SerializeToString()) sess = tf.Session() sess.run( [tf.initializers.global_variables(), tf.initializers.local_variables()]) sess.run(iterator.initializer, {examples: to_process}) input_fn = infer_util.labels_to_features_wrapper(transcription_data) """# Inference