def preprocess(data, name, sr=16000):
    # get count of examples from text file
    num_examples = len(data['prompts'])

    # pad out all these jagged arrays and store them in an npy file
    texts = []
    text_lens = []
    speech_lens = []

    max_freq_length = audio.maximum_audio_length // (audio.r * audio.hop_length)
    stfts = np.zeros((num_examples, max_freq_length, 1025 * audio.r), dtype=np.float16)
    mels = np.zeros((num_examples, max_freq_length, 80 * audio.r), dtype=np.float16)

    count = 0
    for text, audio_file in tqdm(zip(data['prompts'], data['audio_files']),
                                 total=num_examples):
        text = [process_char(c) for c in list(text)]
        mel, stft = audio.process_audio(audio_file, sr=sr)
        if mel is not None:
            texts.append(np.array(text))
            text_lens.append(len(text))
            speech_lens.append(mel.shape[0])
            mels[count] = mel
            stfts[count] = stft
            count += 1

    mels = mels[:len(texts)]
    stfts = stfts[:len(texts)]

    save_to_npy(texts, text_lens, mels, stfts, speech_lens, name)

    if 'speakers' in data:
        np.save('data/%s/speakers.npy' % name, data['speakers'], allow_pickle=False)

    # save vocabulary
    save_vocab(name)
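# save_to_npy and pad_to_dense are used in this file but defined elsewhere in the
# project; the two functions below are only a minimal sketch of what they are assumed
# to do (zero-pad the jagged text sequences and write one .npy file per array under
# data/<name>/), not the project's actual implementation.
import numpy as np


def pad_to_dense(jagged, max_len):
    # Right-pad variable-length integer sequences with zeros into one dense 2-D array.
    dense = np.zeros((len(jagged), max_len), dtype=np.int32)
    for row_index, row in enumerate(jagged):
        dense[row_index, :len(row)] = row
    return dense


def save_to_npy(texts, text_lens, mels, stfts, speech_lens, name):
    # Assumed layout: one data/<name>/<array_name>.npy file per preprocessed array.
    max_len = max(len(t) for t in texts)
    arrays = {
        'texts': pad_to_dense(texts, max_len),
        'text_lens': np.array(text_lens, dtype=np.int32),
        'speech_lens': np.array(speech_lens, dtype=np.int32),
        'mels': mels,
        'stfts': stfts,
    }
    for short_name, array in arrays.items():
        np.save('data/%s/%s.npy' % (name, short_name), array, allow_pickle=False)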
def get_alignment(body, response):
    """Calculate the alignment of an MEI file to a YouTube video or audio file.

    :param body: Form data from the HTTP POST request.
    :param response: Response header object.
    :return: Dictionary containing IDs of rests and notes as keys and their
        corresponding timing positions in the audio (in seconds) as values.
    """
    ###########################################################################
    # parse incoming request
    ###########################################################################
    if 'mei' not in body:
        response.status = HTTP_BAD_REQUEST
        return 'Please provide an MEI file.'

    if 'youtube-url' in body:
        request_type = Type.YOUTUBE
    elif 'audio' in body:
        request_type = Type.AUDIO
    else:
        response.status = HTTP_BAD_REQUEST
        return 'Please provide either a valid YouTube link or an audio file.'

    ###########################################################################
    # save audio track to temporary file
    ###########################################################################
    with tempfile.TemporaryDirectory() as temp_dir:
        audio_path = os.path.join(temp_dir, 'audio')

        if request_type == Type.YOUTUBE:
            youtube_url = body['youtube-url']
            try:
                youtube.download_audio(youtube_url, audio_path)
            except:  # broad exception clause on purpose!
                response.status = HTTP_BAD_REQUEST
                return ('YouTube video could not be downloaded. Check your network '
                        'connection and make sure that the YouTube URL is valid.')
        else:  # request_type == Type.AUDIO
            # write audio to temporary file
            with open(audio_path, mode='wb') as audio_file:
                audio_file.write(body['audio'])

        try:
            # process and load audio data as numpy array
            audio_data, frame_rate, trim_start, _ = process_audio(audio_path)
        except:  # broad exception clause on purpose!
            response.status = HTTP_BAD_REQUEST
            return ('Audio file could not be processed. Make sure that the file '
                    'format is supported by FFmpeg.')

    ###########################################################################
    # calculate MEI chroma features
    ###########################################################################
    mei_xml = body['mei'].decode('utf-8')
    try:
        chroma_mei, id_to_chroma_index = mei_to_chroma(mei_xml)  # might throw Java exception
    except:  # broad exception clause on purpose!
        response.status = HTTP_BAD_REQUEST
        return 'MEI file could not be processed.'

    ###########################################################################
    # calculate audio chroma features
    ###########################################################################
    chroma_size = round(len(audio_data) / chroma_mei.shape[1])
    chroma_audio = librosa.feature.chroma_stft(y=audio_data, sr=frame_rate,
                                               hop_length=chroma_size)

    ###########################################################################
    # calculate warping path
    ###########################################################################
    path = librosa.sequence.dtw(chroma_mei, chroma_audio)[1]
    path_dict = {key: value for (key, value) in path}

    ###########################################################################
    # build and return dictionary {MEI id: time [seconds]}
    ###########################################################################
    id_to_time = {}
    chroma_length = len(audio_data) / frame_rate / chroma_audio.shape[1]
    for id in id_to_chroma_index:
        id_to_time[id] = path_dict[id_to_chroma_index[id]] * chroma_length
        id_to_time[id] += trim_start / 1000  # offset for trimmed audio in seconds

    return id_to_time  # return result as JSON
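# Type and HTTP_BAD_REQUEST are referenced by get_alignment but not defined in this
# file; below is a minimal sketch of plausible definitions (the real status constant
# may come from the web framework in use, and the enum members may differ).
import enum

HTTP_BAD_REQUEST = 400


class Type(enum.Enum):
    # The two request kinds that get_alignment distinguishes.
    YOUTUBE = 'youtube'
    AUDIO = 'audio'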
def preprocess_to_table(data, data_name, sr=16000):
    filename = data_name  # Added
    example_batch_size = 5000

    # get count of examples from text file
    num_examples = len(data['prompts'])

    # pad out all these jagged arrays and store them in an hdf5 table
    texts = []
    max_freq_length = audio.maximum_audio_length // (audio.r * audio.hop_length)

    print("num_examples: %s" % str(num_examples))
    print("max_freq_length: %s" % str(max_freq_length))
    print("1025*audio.r: %s" % str(1025 * audio.r))

    text_lens = np.zeros(example_batch_size, dtype=np.int32)
    speech_lens = np.zeros(example_batch_size, dtype=np.int32)
    mels = np.zeros((example_batch_size, max_freq_length, 80 * audio.r), dtype=np.float16)
    stfts = np.zeros((example_batch_size, max_freq_length, 1025 * audio.r), dtype=np.float16)

    # first pass: find the longest processed text so the table column can be sized
    print("Processing audio...")
    texts_for_length = list()
    audio_count = 0
    for text, audio_file in zip(data['prompts'], data['audio_files']):
        mel, stft = audio.process_audio(audio_file, sr=sr)
        if mel is not None:
            text = np.array([process_char(c) for c in list(text)])
            texts_for_length.append(text)
            audio_count += 1
            if audio_count % 500 == 0:
                print("Processed %d audio samples..." % audio_count)
    print("Processed %d audio samples total!" % audio_count)

    max_text_length = max(r.shape[0] for r in texts_for_length)
    print("max_text_length: %d" % max_text_length)
    padded_texts_for_length = pad_to_dense(texts_for_length, max_text_length)
    print("padded texts shape: %s" % str(padded_texts_for_length.shape))

    table_description = {
        INDEX_COL: tables.Int64Col(),
        MELS_COL: tables.Float16Col(shape=(max_freq_length, 80 * audio.r)),
        MELS_SHAPE_COL: tables.Int64Col(shape=(2,)),
        STFTS_COL: tables.Float16Col(shape=(max_freq_length, 1025 * audio.r)),
        STFTS_SHAPE_COL: tables.Int64Col(shape=(2,)),
        TEXTS_COL: tables.Int32Col(shape=(max_text_length,)),
        TEXT_LENS_COL: tables.Int32Col(),
        SPEECH_LENS_COL: tables.Int32Col()
    }
    create_hdf5_table_file(filename, table_description)

    print("len prompts: %d" % len(data["prompts"]))
    print("len audio_files: %d" % len(data["audio_files"]))

    # second pass: process audio again and append rows to the table in batches
    count = 0
    for text, audio_file in tqdm(zip(data['prompts'], data['audio_files']),
                                 total=num_examples):
        original_text = text
        text = [process_char(c) for c in list(text)]
        mel, stft = audio.process_audio(audio_file, sr=sr)
        if mel is not None:
            texts.append(np.array(text))
            text_lens[count % example_batch_size] = len(text)
            speech_lens[count % example_batch_size] = mel.shape[0]
            mels[count % example_batch_size] = mel
            stfts[count % example_batch_size] = stft
            count += 1
            if count % example_batch_size == 0:
                print("Writing data on count: %d/%d" % (count, num_examples))
                rows = list()
                for i in range(example_batch_size):
                    row_dict = dict()
                    row_dict[INDEX_COL] = count - example_batch_size + i  # global example index
                    row_dict[MELS_COL] = mels[i]
                    row_dict[MELS_SHAPE_COL] = mels[i].shape
                    row_dict[STFTS_COL] = stfts[i]
                    row_dict[STFTS_SHAPE_COL] = stfts[i].shape
                    row_dict[TEXT_LENS_COL] = text_lens[i]
                    row_dict[SPEECH_LENS_COL] = speech_lens[i]
                    rows.append(row_dict)
                append_rows_to_hdf5_table(filename, rows)
                print("Wrote batch sized '%d' to '%s'" % (example_batch_size, filename))
        else:
            print("mel is None for text: %s" % str(original_text))

    # flush the final, partially filled batch
    rows = list()
    starting_index = count - (count % example_batch_size)
    for i in range(count % example_batch_size):
        row_dict = dict()
        row_dict[INDEX_COL] = starting_index + i
        row_dict[MELS_COL] = mels[i]
        row_dict[MELS_SHAPE_COL] = mels[i].shape
        row_dict[STFTS_COL] = stfts[i]
        row_dict[STFTS_SHAPE_COL] = stfts[i].shape
        row_dict[TEXT_LENS_COL] = text_lens[i]
        row_dict[SPEECH_LENS_COL] = speech_lens[i]
        rows.append(row_dict)
    append_rows_to_hdf5_table(filename, rows)
    print("Final batch, wrote batch sized '%d' to '%s'"
          % ((count % example_batch_size), filename))

    # write the padded text sequences column in batches
    print("texts len: %d" % len(texts))
    max_len = max(r.shape[0] for r in texts)
    i = 0
    while i < len(texts):
        update_all_rows_in_hdf5_table(filename, TEXTS_COL,
                                      pad_to_dense(texts[i:i + example_batch_size], max_len),
                                      start_index=i)
        i += example_batch_size
        print("Wrote batch sized '%d' to '%s'" % (example_batch_size, filename))
    if len(texts) % example_batch_size != 0:
        prev_i = i - example_batch_size
        update_all_rows_in_hdf5_table(filename, TEXTS_COL,
                                      pad_to_dense(texts[prev_i:prev_i + example_batch_size], max_len),
                                      start_index=prev_i)
        print("Final batch, wrote batch sized '%d' to '%s'"
              % ((len(texts) % example_batch_size), filename))

    if 'speakers' in data:
        np.save('data/%s/speakers.npy' % data_name, data['speakers'], allow_pickle=False)

    # save vocabulary
    save_vocab(data_name)
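# The HDF5 table helpers used above (create_hdf5_table_file, append_rows_to_hdf5_table,
# update_all_rows_in_hdf5_table) are defined elsewhere in the project; the sketch below
# shows one plausible PyTables-based implementation, assuming the table lives at /data
# inside a file derived from `filename`. It is illustrative, not the project's actual code.
import tables


def create_hdf5_table_file(filename, table_description, table_name='data'):
    # Create (or overwrite) an HDF5 file containing a single empty table.
    with tables.open_file('%s.h5' % filename, mode='w') as h5_file:
        h5_file.create_table('/', table_name, description=table_description)


def append_rows_to_hdf5_table(filename, rows, table_name='data'):
    # Append a batch of row dictionaries (column name -> value) to the table.
    with tables.open_file('%s.h5' % filename, mode='a') as h5_file:
        table = h5_file.get_node('/', table_name)
        row = table.row
        for row_dict in rows:
            for column_name, value in row_dict.items():
                row[column_name] = value
            row.append()
        table.flush()


def update_all_rows_in_hdf5_table(filename, column_name, values, start_index=0,
                                  table_name='data'):
    # Overwrite one column for len(values) consecutive rows starting at start_index.
    with tables.open_file('%s.h5' % filename, mode='a') as h5_file:
        table = h5_file.get_node('/', table_name)
        table.modify_column(start=start_index, stop=start_index + len(values),
                            column=values, colname=column_name)
        table.flush()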
def preprocess(data, data_name, sr=16000):
    example_batch_size = 5000  # Added

    # get count of examples from text file
    num_examples = len(data['prompts'])

    # pad out all these jagged arrays and store them in an hdf5 file
    texts = []
    text_lens = []
    speech_lens = []

    max_freq_length = audio.maximum_audio_length // (audio.r * audio.hop_length)
    print("num_examples: %s" % str(num_examples))
    print("max_freq_length: %s" % str(max_freq_length))
    print("1025*audio.r: %s" % str(1025 * audio.r))

    mels = np.zeros((example_batch_size, max_freq_length, 80 * audio.r), dtype=np.float16)
    stfts = np.zeros((example_batch_size, max_freq_length, 1025 * audio.r), dtype=np.float16)

    create_hdf5_file(max_freq_length, data_name)

    count = 0
    for text, audio_file in tqdm(zip(data['prompts'], data['audio_files']),
                                 total=num_examples):
        text = [process_char(c) for c in list(text)]
        mel, stft = audio.process_audio(audio_file, sr=sr)
        if mel is not None:
            texts.append(np.array(text))
            text_lens.append(len(text))
            speech_lens.append(mel.shape[0])
            mels[count % example_batch_size] = mel
            stfts[count % example_batch_size] = stft
            count += 1
            if count % example_batch_size == 0:
                append_to_hdf5_dataset(mels, stfts, data_name)

    # flush the final, partially filled batch
    append_to_hdf5_dataset(mels[:count % example_batch_size],
                           stfts[:count % example_batch_size], data_name)

    max_len = max(r.shape[0] for r in texts)
    inputs = pad_to_dense(texts, max_len), np.array(text_lens), np.array(speech_lens)
    short_names = 'texts', 'text_lens', 'speech_lens'
    filepath = "data/%s/%s" % (data_name, "data")
    for short_name, inp in zip(short_names, inputs):
        print("Saving to hdf5: %s, %s" % (filepath, inp.shape))
        print("input: %s" % inp)
        add_data_to_hdf5(inp, short_name, filepath)

    if 'speakers' in data:
        np.save('data/%s/speakers.npy' % data_name, data['speakers'], allow_pickle=False)

    # save vocabulary
    save_vocab(data_name)
    print("Saved '%d' records." % count)
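# Hypothetical usage sketch: the structure of `data` (parallel 'prompts' and
# 'audio_files' lists, plus an optional 'speakers' array) is inferred from how the
# preprocessing functions above index it; the paths below are made up for illustration.
if __name__ == '__main__':
    example_data = {
        'prompts': ['the first sample prompt', 'a second prompt'],
        'audio_files': ['data/example/wav/0001.wav', 'data/example/wav/0002.wav'],
    }
    # Writes mels/stfts/texts and metadata under data/example/ via HDF5.
    preprocess(example_data, 'example', sr=16000)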