Example #1
import os

import inflect

# load_model, load_waveglow_model, synthesize, transcribe and
# text_similarity are helpers from the surrounding package.
def test_synthesize():
    model_path = os.path.join("files", "tacotron2_statedict.pt")
    waveglow_path = os.path.join("files",
                                 "waveglow_256channels_universal_v5.pt")
    graph_path = "graph.png"
    audio_path = "synthesized_audio.wav"

    model = load_model(model_path)
    assert model

    waveglow = load_waveglow_model(waveglow_path)
    assert waveglow

    text = "hello everybody my name is david attenborough"
    inflect_engine = inflect.engine()
    synthesize(model,
               waveglow,
               text,
               inflect_engine,
               graph=graph_path,
               audio=audio_path)

    assert text_similarity(text, transcribe(audio_path)) > 0.5
    assert os.path.isfile(graph_path)
    assert os.path.isfile(audio_path)

    os.remove(graph_path)
    os.remove(audio_path)
Example #2
import logging
import os

# cut_audio and transcribe are helpers from the surrounding package.
def process_segments(audio_path,
                     output_path,
                     segments,
                     min_length,
                     max_length,
                     logging=logging):
    logging.info("Generating segments...")
    samples = []
    total = len(segments)
    index = 0
    for i, segment in enumerate(segments):
        _, time_start, time_end = segment
        time_length = time_end - time_start

        # Keep only clips within the allowed duration range.
        if min_length <= time_length <= max_length:
            name = cut_audio(audio_path, int(time_start), int(time_end),
                             output_path)
            clip_path = os.path.join(output_path, name)
            transcript = transcribe(clip_path)
            if transcript:
                samples.append({
                    "index": index,
                    "start": time_start,
                    "end": time_end,
                    "name": name,
                    "transcript": transcript.strip(),
                })
                index += 1

        logging.info(f"Progress - {i+1}/{total}")
    return samples
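
Each segment is unpacked as (_, time_start, time_end), so the function expects three-element tuples with times in milliseconds. A hedged usage sketch follows; the file names and segment values are invented for illustration:

# Hypothetical (label, start_ms, end_ms) tuples from an aligner.
segments = [("speech", 0, 4200), ("speech", 4200, 4300), ("speech", 4300, 12000)]
samples = process_segments("interview.wav", "clips", segments,
                           min_length=1000, max_length=10000)
# The 100 ms segment is filtered out; the others are kept if they transcribe.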
Example #3
import os

from tqdm import tqdm

# read_labels, transcribe, compare, Transcription and save_results are
# helpers from the surrounding package.
def transcribe_clips(folder, labels, output_path):
    # Sample only the first five clips for a quick spot check.
    files = os.listdir(folder)[:5]
    labels = read_labels(labels)
    data = []
    for filename in tqdm(files):
        prediction = transcribe(os.path.join(folder, filename))
        # Strip the 4-character extension (e.g. ".wav") to find the label key.
        actual = labels[filename[:-4]]
        score = compare(prediction, actual)
        data.append(Transcription(filename, prediction, actual, score))

    save_results(data, output_path)
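
compare, Transcription, read_labels and save_results live in the surrounding package and are not shown here. A minimal stand-in for the first two, assuming compare returns a 0-1 word-overlap score (an assumption for illustration, not the repo's definition):

from collections import namedtuple

# Hypothetical stand-ins for the package's helpers.
Transcription = namedtuple("Transcription",
                           ["filename", "prediction", "actual", "score"])

def compare(prediction, actual):
    # Fraction of reference words that also appear in the prediction.
    expected = actual.lower().split()
    produced = set(prediction.lower().split())
    return sum(word in produced for word in expected) / max(len(expected), 1)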
Example #4
# transcribe is a helper from the surrounding package.
def evalulate_audio(audio, text):
    """
    Gets the set of words not recognised in the audio.
    Compares the transcription and given text.

    Parameters
    ----------
    audio : str
        Path to audio file
    text : str
        Synthesised text

    Returns
    -------
    set
        Set of words not recognised in the audio
    """
    results = transcribe(audio)
    original_words = text.split(" ")
    produced_words = results.split(" ")
    return set(original_words) - set(produced_words)
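
Because the result is an exact set difference, any word the recogniser renders differently still counts as "not recognised". A hedged usage sketch; the clip path and the transcription result are invented:

# Suppose transcribe("clip.wav") returned "hello everybody my name is david".
missing = evalulate_audio("clip.wav",
                          "hello everybody my name is david attenborough")
print(missing)  # {'attenborough'}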
Example #5
import logging
import os

# cut_audio and transcribe are helpers from the surrounding package.
def process_segments(audio_path,
                     output_path,
                     segments,
                     min_length,
                     max_length,
                     logging=logging):
    """
    Generates audio clips and reduces segments to only valid ones.
    This includes removing segments which are too long, too short or cannot be transcribed.

    Parameters
    ----------
    audio_path : str
        Path to audio file
    output_path : str
        Path to save clips to
    segments : list
        List of segments produced in get_segments
    min_length : int
        Minimum length of a clip (in milliseconds)
    max_length : int
        Maximum length of a clip (in milliseconds)
    logging : logging (optional)
        Logging object to write progress to

    Returns
    -------
    list
        List of samples (dictionaries containing clip index, start, end, name & transcript)
    """
    logging.info("Generating segments...")
    samples = []
    total = len(segments)
    index = 0
    for i, segment in enumerate(segments):
        _, time_start, time_end = segment
        time_length = time_end - time_start

        # Keep only clips within the allowed duration range.
        if min_length <= time_length <= max_length:
            name = cut_audio(audio_path, int(time_start), int(time_end),
                             output_path)
            clip_path = os.path.join(output_path, name)

            # Transcription can fail on noisy clips; skip those segments.
            try:
                transcript = transcribe(clip_path)
            except Exception:
                logging.info(f"Could not transcribe {clip_path}")
                transcript = None

            if transcript:
                samples.append({
                    "index": index,
                    "start": time_start,
                    "end": time_end,
                    "name": name,
                    "transcript": transcript.strip(),
                })
                index += 1

        logging.info(f"Progress - {i+1}/{total}")
    return samples
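
The returned samples map naturally onto a transcript metadata file. A short sketch of writing them out in a name|transcript layout; this format is an assumption for illustration, not something the function above mandates:

import os

def write_metadata(samples, output_path):
    # One "name|transcript" line per clip (hypothetical downstream format).
    with open(os.path.join(output_path, "metadata.csv"), "w", encoding="utf-8") as f:
        for sample in samples:
            f.write(f"{sample['name']}|{sample['transcript']}\n")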
Example #6
# transcribe is a helper from the surrounding package.
def evalulate_audio(audio, text):
    # Return the words from the input text missing from the transcription.
    results = transcribe(audio)
    original_words = text.split(" ")
    produced_words = results.split(" ")
    return set(original_words) - set(produced_words)
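
This earlier revision behaves identically to Example #4. Splitting on a single space leaves punctuation attached to words, so "hello," and "hello" compare as different. A hedged normalisation sketch that could run before the set difference (an addition for illustration, not part of the source):

import re

def normalize_words(text):
    # Lowercase and keep only letter runs so "Hello," matches "hello".
    return set(re.findall(r"[a-z']+", text.lower()))

print(normalize_words("Hello, everybody!") - normalize_words("hello everybody"))  # set()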