Example #1
def preprocess(data, name, sr=16000):

    # get count of examples from text file
    num_examples = len(data['prompts'])

    # pad out all these jagged arrays and store them in an npy file
    texts = []
    text_lens = []
    speech_lens = []

    max_freq_length = audio.maximum_audio_length // (audio.r *
                                                     audio.hop_length)
    stfts = np.zeros((num_examples, max_freq_length, 1025 * audio.r),
                     dtype=np.float16)
    mels = np.zeros((num_examples, max_freq_length, 80 * audio.r),
                    dtype=np.float16)

    count = 0
    for text, audio_file in tqdm(zip(data['prompts'], data['audio_files']),
                                 total=num_examples):

        text = [process_char(c) for c in list(text)]
        mel, stft = audio.process_audio(audio_file, sr=sr)

        if mel is not None:
            texts.append(np.array(text))
            text_lens.append(len(text))
            speech_lens.append(mel.shape[0])

            mels[count] = mel
            stfts[count] = stft

            count += 1

    # drop the preallocated rows left unused by files whose mel came back None
    mels = mels[:len(texts)]
    stfts = stfts[:len(texts)]

    save_to_npy(texts, text_lens, mels, stfts, speech_lens, name)

    if 'speakers' in data:
        np.save('data/%s/speakers.npy' % name,
                data['speakers'],
                allow_pickle=False)

    # save vocabulary
    save_vocab(name)
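
A minimal usage sketch, assuming `data` is a dict of parallel 'prompts' and 'audio_files' lists (the `audio` module, `process_char`, `save_to_npy`, and `save_vocab` are project helpers not shown on this page; the paths and dataset name below are placeholders):

# Hypothetical call; the transcripts and wav paths are illustrative only.
data = {
    'prompts': ['first transcript', 'second transcript'],
    'audio_files': ['wavs/0001.wav', 'wavs/0002.wav'],
}
preprocess(data, name='demo', sr=16000)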
Example #2
def get_alignment(body, response):
    """Calculate alignment of an MEI file to an youtube video or audio file.

    :param body: Form data from HTTP post request.
    :param response: Response header object.
    :return: Dictionary containing IDs of rests and notes as keys and
    their corresponding timing positions in the audio as values.
    """

    ###########################################################################
    # parse incoming request
    ###########################################################################
    if 'mei' not in body:
        response.status = HTTP_BAD_REQUEST
        return 'Please provide MEI file.'

    if 'youtube-url' in body:
        request_type = Type.YOUTUBE
    elif 'audio' in body:
        request_type = Type.AUDIO
    else:
        response.status = HTTP_BAD_REQUEST
        return 'Please provide either a valid YouTube link or an audio file.'

    ###########################################################################
    # save audio track to temporary file
    ###########################################################################
    with tempfile.TemporaryDirectory() as temp_dir:
        audio_path = os.path.join(temp_dir, 'audio')
        if request_type == Type.YOUTUBE:
            youtube_url = body['youtube-url']
            try:
                youtube.download_audio(youtube_url, audio_path)
            except Exception:  # broad exception clause on purpose!
                response.status = HTTP_BAD_REQUEST
                return 'YouTube video could not be downloaded. Check your network connection and make sure that the YouTube URL is valid.'
        else:  # request_type == Type.AUDIO
            # write audio to temporary file
            with open(audio_path, mode='wb') as audio_file:
                audio_file.write(body['audio'])

        try:
            audio_data, frame_rate, trim_start, _ = process_audio(
                audio_path)  # process and load audio data as numpy array
        except Exception:  # broad on purpose: surface any decode failure as a 400
            response.status = HTTP_BAD_REQUEST
            return 'Audio file could not be processed. Make sure that the file format is supported by FFmpeg.'

    ###########################################################################
    # calculate MEI chroma features
    ###########################################################################
    mei_xml = body['mei'].decode('utf-8')
    try:
        chroma_mei, id_to_chroma_index = mei_to_chroma(
            mei_xml)  # might throw Java exception
    except Exception:  # broad exception clause on purpose!
        response.status = HTTP_BAD_REQUEST
        return 'MEI file could not be processed.'

    ###########################################################################
    # calculate audio chroma features
    ###########################################################################
    # hop length chosen so the audio yields roughly as many chroma frames as the MEI
    chroma_size = round(len(audio_data) / chroma_mei.shape[1])
    chroma_audio = librosa.feature.chroma_stft(y=audio_data,
                                               sr=frame_rate,
                                               hop_length=chroma_size)

    ###########################################################################
    # calculate warping path
    ###########################################################################
    # librosa.sequence.dtw returns (cost matrix, warping path); keep the path
    path = librosa.sequence.dtw(chroma_mei, chroma_audio)[1]
    path_dict = {mei_idx: audio_idx for (mei_idx, audio_idx) in path}

    ###########################################################################
    # build and return dictionary {MEI id: time[seconds]}
    ###########################################################################
    id_to_time = {}
    chroma_length = len(audio_data) / frame_rate / chroma_audio.shape[1]
    for mei_id, chroma_index in id_to_chroma_index.items():
        id_to_time[mei_id] = path_dict[chroma_index] * chroma_length
        id_to_time[mei_id] += trim_start / 1000  # offset for trimmed audio, in seconds

    return id_to_time  # the web framework serializes this dict to JSON
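
A hedged calling sketch for this handler: it only needs a mapping with the raw request fields and a response object exposing a writable `status` attribute. `FakeResponse` and the file names below are illustrative stand-ins, not the project's real harness:

class FakeResponse:
    # Minimal stand-in: the handler only writes response.status.
    status = None

with open('score.mei', 'rb') as mei_file, open('track.mp3', 'rb') as track:
    body = {'mei': mei_file.read(), 'audio': track.read()}

response = FakeResponse()
result = get_alignment(body, response)
# On success: {MEI element id: time in seconds}; on failure: an error string,
# with response.status set to HTTP_BAD_REQUEST.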
Example #3
def preprocess_to_table(data, data_name, sr=16000):
    filename = data_name

    # Added
    example_batch_size = 5000

    # get count of examples from text file
    num_examples = len(data['prompts'])

    # pad out all these jagged arrays and store them in an npy file
    texts = []

    max_freq_length = audio.maximum_audio_length // (audio.r *
                                                     audio.hop_length)
    print("num_examples: %s" % str(num_examples))
    print("max_freq_length: %s" % str(max_freq_length))
    print("1025*audio.r: %s" % str(1025 * audio.r))

    text_lens = np.zeros(example_batch_size, dtype=np.int32)
    speech_lens = np.zeros(example_batch_size, dtype=np.int32)
    mels = np.zeros((example_batch_size, max_freq_length, 80 * audio.r),
                    dtype=np.float16)
    stfts = np.zeros((example_batch_size, max_freq_length, 1025 * audio.r),
                     dtype=np.float16)

    print("Processing audio...")
    texts_for_length = list()
    audio_count = 0
    for text, audio_file in zip(data['prompts'], data['audio_files']):
        mel, stft = audio.process_audio(audio_file, sr=sr)
        if mel is not None:
            text = np.array([process_char(c) for c in list(text)])
            texts_for_length.append(text)
        audio_count += 1
        if audio_count % 500 == 0:
            print("Processed %d audio samples..." % audio_count)
    print("Processed %d audio samples total!" % audio_count)
    max_text_length = max(r.shape[0] for r in texts_for_length)
    print("max_text_length: %d" % max_text_length)
    padded_texts_for_length = pad_to_dense(texts_for_length, max_text_length)
    print("padded texts shape: %s" % str(padded_texts_for_length.shape))

    table_description = {
        INDEX_COL: tables.Int64Col(),
        MELS_COL: tables.Float16Col(shape=(max_freq_length, 80 * audio.r)),
        MELS_SHAPE_COL: tables.Int64Col(shape=(2,)),
        STFTS_COL: tables.Float16Col(shape=(max_freq_length, 1025 * audio.r)),
        STFTS_SHAPE_COL: tables.Int64Col(shape=(2,)),
        TEXTS_COL: tables.Int32Col(shape=(max_text_length,)),
        TEXT_LENS_COL: tables.Int32Col(),
        SPEECH_LENS_COL: tables.Int32Col()
    }
    create_hdf5_table_file(filename, table_description)

    print("len prompts: %d" % len(data["prompts"]))
    print("len audio_files: %d" % len(data["audio_files"]))

    # Second pass: re-process the audio and write rows to the table in batches.
    count = 0
    for text, audio_file in tqdm(zip(data['prompts'], data['audio_files']),
                                 total=num_examples):

        original_text = text

        text = [process_char(c) for c in list(text)]
        mel, stft = audio.process_audio(audio_file, sr=sr)

        if mel is not None:
            texts.append(np.array(text))

            text_lens[count % example_batch_size] = len(text)
            speech_lens[count % example_batch_size] = mel.shape[0]

            mels[count % example_batch_size] = mel
            stfts[count % example_batch_size] = stft

            count += 1

            if count % example_batch_size == 0:
                print("Writing data on count: %d/%d" % (count, num_examples))
                rows = list()
                for i in range(example_batch_size):
                    row_dict = dict()
                    row_dict[INDEX_COL] = count - example_batch_size + i  # global row index
                    row_dict[MELS_COL] = mels[i]
                    row_dict[MELS_SHAPE_COL] = mels[i].shape
                    row_dict[STFTS_COL] = stfts[i]
                    row_dict[STFTS_SHAPE_COL] = stfts[i].shape
                    row_dict[TEXT_LENS_COL] = text_lens[i]
                    row_dict[SPEECH_LENS_COL] = speech_lens[i]
                    rows.append(row_dict)

                append_rows_to_hdf5_table(filename, rows)
                print("Wrote batch sized '%d' to '%s'" %
                      (example_batch_size, filename))
        else:
            print("mel is None for text: %s" % str(original_text))

    rows = list()
    starting_index = count - (count % example_batch_size)  # first row of the final partial batch
    for i in range(count % example_batch_size):
        row_dict = dict()
        row_dict[INDEX_COL] = starting_index + i
        row_dict[MELS_COL] = mels[i]
        row_dict[MELS_SHAPE_COL] = mels[i].shape
        row_dict[STFTS_COL] = stfts[i]
        row_dict[STFTS_SHAPE_COL] = stfts[i].shape
        row_dict[TEXT_LENS_COL] = text_lens[i]
        row_dict[SPEECH_LENS_COL] = speech_lens[i]
        rows.append(row_dict)
    append_rows_to_hdf5_table(filename, rows)
    print("Final batch, wrote batch sized '%d' to '%s'" %
          ((count % example_batch_size), filename))

    print("texts len: %d" % len(texts))
    max_len = max(r.shape[0] for r in texts)
    i = 0
    while i < len(texts):
        chunk = texts[i:i + example_batch_size]
        update_all_rows_in_hdf5_table(filename,
                                      TEXTS_COL,
                                      pad_to_dense(chunk, max_len),
                                      start_index=i)
        i += example_batch_size
        print("Wrote batch of %d rows to '%s'" % (len(chunk), filename))
    # The final chunk is already shorter than example_batch_size when
    # len(texts) is not a multiple of it, so no separate tail pass is needed.

    if 'speakers' in data:
        np.save('data/%s/speakers.npy' % data_name,
                data['speakers'],
                allow_pickle=False)

    # save vocabulary
    save_vocab(data_name)
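
`pad_to_dense` is a project helper that never appears on this page; below is a plausible minimal sketch (zero-pad each jagged 1-D array to a common length and stack into one dense matrix), offered as an assumption about its behavior rather than the project's actual code:

import numpy as np

def pad_to_dense(arrays, max_len):
    # Zero-pad each 1-D array on the right to max_len, then stack row-wise.
    dense = np.zeros((len(arrays), max_len), dtype=arrays[0].dtype)
    for row, arr in enumerate(arrays):
        dense[row, :arr.shape[0]] = arr
    return dense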
Example #4
def preprocess(data, data_name, sr=16000):
    # Added
    example_batch_size = 5000

    # get count of examples from text file
    num_examples = len(data['prompts'])

    # pad out all these jagged arrays and store them in an npy file
    texts = []
    text_lens = []
    speech_lens = []

    max_freq_length = audio.maximum_audio_length // (audio.r *
                                                     audio.hop_length)
    print("num_examples: %s" % str(num_examples))
    print("max_freq_length: %s" % str(max_freq_length))
    print("1025*audio.r: %s" % str(1025 * audio.r))

    mels = np.zeros((example_batch_size, max_freq_length, 80 * audio.r),
                    dtype=np.float16)
    stfts = np.zeros((example_batch_size, max_freq_length, 1025 * audio.r),
                     dtype=np.float16)

    create_hdf5_file(max_freq_length, data_name)

    count = 0
    for text, audio_file in tqdm(zip(data['prompts'], data['audio_files']),
                                 total=num_examples):

        text = [process_char(c) for c in list(text)]
        mel, stft = audio.process_audio(audio_file, sr=sr)

        if mel is not None:
            texts.append(np.array(text))
            text_lens.append(len(text))
            speech_lens.append(mel.shape[0])

            mels[count % example_batch_size] = mel
            stfts[count % example_batch_size] = stft

            count += 1

            if count % example_batch_size == 0:
                append_to_hdf5_dataset(mels, stfts, data_name)

    # flush the final partial batch (an empty slice if count is an exact multiple)
    append_to_hdf5_dataset(mels[:count % example_batch_size],
                           stfts[:count % example_batch_size], data_name)

    max_len = max(r.shape[0] for r in texts)
    inputs = (pad_to_dense(texts, max_len),
              np.array(text_lens),
              np.array(speech_lens))
    short_names = ('texts', 'text_lens', 'speech_lens')
    filepath = "data/%s/%s" % (data_name, "data")
    for short_name, inp in zip(short_names, inputs):
        print("Saving to hdf5: %s, %s" % (filepath, inp.shape))
        print("input: %s" % inp)
        add_data_to_hdf5(inp, short_name, filepath)

    if 'speakers' in data:
        np.save('data/%s/speakers.npy' % data_name,
                data['speakers'],
                allow_pickle=False)

    # save vocabulary
    save_vocab(data_name)
    print("Saved '%d' records." % count)