예제 #1
0
def test_new_pron_dict_using_use_existing(tmpdir):
    """
    Passing use_existing=True is accepted even though no pron dict with the
    requested name has been created yet.
    """
    state_dir = f'{tmpdir}/state'
    interface = KaldiInterface(state_dir)
    # Succeeds as long as no exception is raised.
    interface.new_pron_dict('p', use_existing=True)
예제 #2
0
def test_override_and_use_existing(tmpdir):
    """
    Setting both override and use_existing to True must raise a ValueError.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    conflicting_flags = {'override': True, 'use_existing': True}
    with pytest.raises(ValueError):
        interface.new_pron_dict('x', **conflicting_flags)
예제 #3
0
def test_two_new_pron_dict_with_same_name(tmpdir):
    """
    Creating a second pron dict under an already-used name, with neither
    override nor use_existing set, must raise a ValueError.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    name = 'p'
    # The first creation is expected to succeed.
    interface.new_pron_dict(name)
    # The second creation with the same name must be rejected.
    with pytest.raises(ValueError):
        interface.new_pron_dict(name)
예제 #4
0
def test_existing_pron_dict_using_use_existing(tmpdir):
    """
    With use_existing=True, asking for an already-created pron dict yields
    the same hash and path, and no second entry is listed.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    original = interface.new_pron_dict('x')
    original_hash = original.hash

    reused = interface.new_pron_dict('x', use_existing=True)

    assert len(interface.list_pron_dicts()) == 1
    assert original_hash == reused.hash
    assert original.path == reused.path
예제 #5
0
def test_existing_pron_dict_using_override(tmpdir):
    """
    With override=True, re-creating a pron dict under an existing name leaves
    a single listed pron dict whose hash differs from the first one.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    first = interface.new_pron_dict('pronunciation dictionary')
    # Capture the hash up front; the first object is not used after override.
    first_hash = first.hash

    replacement = interface.new_pron_dict('pronunciation dictionary', override=True)

    assert len(interface.list_pron_dicts()) == 1
    assert first_hash != replacement.hash
예제 #6
0
def test_set_l2s_missing_path(tmpdir):
    """
    Pointing the letter-to-sound path at a non-existent file must raise
    FileNotFoundError.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    pron_dict = interface.new_pron_dict('pronunciation dictionary')
    with pytest.raises(FileNotFoundError):
        pron_dict.set_l2s_path('/missing/letter_to_sound.txt')
예제 #7
0
def test_lexicon_before_linking(tmpdir):
    """
    Generating the lexicon on a pron dict that has not been linked to a
    dataset must raise a RuntimeError.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    pron_dict = interface.new_pron_dict('pronunciation dictionary')
    pron_dict.set_l2s_path('/recordings/letter_to_sound.txt')
    # No link() call has been made, so generation is expected to fail.
    with pytest.raises(RuntimeError):
        pron_dict.generate_lexicon()
예제 #8
0
def test_lexicon_before_dataset_processing(tmpdir):
    """
    Generating the lexicon while the linked dataset has not been processed
    must raise a RuntimeError.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    dataset = interface.new_dataset('dataset_x')
    pron_dict = interface.new_pron_dict('pronunciation dictionary')
    # The dataset is linked but process() is never called on it.
    pron_dict.link(dataset)
    pron_dict.set_l2s_path('/recordings/letter_to_sound.txt')
    with pytest.raises(RuntimeError):
        pron_dict.generate_lexicon()
예제 #9
0
def test_loads_minimal(tmpdir):
    """
    PronDict.load, given the path of a freshly created pron dict, returns an
    object whose state matches that pron dict's name, hash and date with no
    dataset, l2s or lexicon set.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    pd1 = interface.new_pron_dict('pronunciation dictionary')
    loaded = PronDict.load(pd1.path)

    actual_state = loaded.state
    expected_state = json.loads(f"""
    {{
        "name": "pronunciation dictionary",
        "hash": "{pd1.hash}",
        "date": "{pd1.date}",
        "dataset": null,
        "l2s": false,
        "lexicon": false
    }}
    """)
    assert actual_state == expected_state
예제 #10
0
def test_save_lexicon(tmpdir):
    """
    Saving lexicon text directly (without letter-to-sound rules) makes that
    text retrievable and flips the "lexicon" flag in the state to true.
    """
    interface = KaldiInterface(f'{tmpdir}/state')
    pd = interface.new_pron_dict('pronunciation dictionary')
    pd.save_lexicon('This is the new lexicon')

    assert pd.get_lexicon_content() == 'This is the new lexicon'

    actual_state = pd.state
    expected_state = json.loads(f"""
    {{
        "name": "pronunciation dictionary",
        "hash": "{pd.hash}",
        "date": "{pd.date}",
        "dataset": null,
        "l2s": false,
        "lexicon": true
    }}
    """)
    assert actual_state == expected_state
예제 #11
0
def test_new_pron_dict_using_override(tmpdir):
    """
    Using override has no effect when a pron dict with the same name does not
    exist: the result has the same initial state as a plain new pron dict.
    """
    kaldi = KaldiInterface(f'{tmpdir}/state')
    pd = kaldi.new_pron_dict('pronunciation dictionary', override=True)
    # Kept as an equality check: the exact return type of get_l2s_content()
    # is not visible from this test.
    assert pd.get_l2s_content() == False
    # None is a singleton, so compare by identity rather than equality.
    assert pd.get_lexicon_content() is None
    assert pd.state == json.loads(f"""
    {{
        "name": "pronunciation dictionary",
        "hash": "{pd.hash}",
        "date": "{pd.date}",
        "dataset": null,
        "l2s": false,
        "lexicon": false
    }}
    """)
예제 #12
0
def test_new_pron_dict(tmpdir):
    """
    Check the state of a new pron dict: no dataset, no letter-to-sound rules
    and no lexicon.
    """
    kaldi = KaldiInterface(f'{tmpdir}/state')
    pd = kaldi.new_pron_dict('pronunciation dictionary')
    # Kept as an equality check: the exact return type of get_l2s_content()
    # is not visible from this test.
    assert pd.get_l2s_content() == False
    # None is a singleton, so compare by identity rather than equality.
    assert pd.get_lexicon_content() is None
    assert pd.state == json.loads(f"""
    {{
        "name": "pronunciation dictionary",
        "hash": "{pd.hash}",
        "date": "{pd.date}",
        "dataset": null,
        "l2s": false,
        "lexicon": false
    }}
    """)
예제 #13
0
def test_set_l2s_path(tmpdir):
    """
    Set letters to sound by file path: the "l2s" flag becomes true while the
    lexicon remains unset.
    """
    kaldi = KaldiInterface(f'{tmpdir}/state')
    pd = kaldi.new_pron_dict('pronunciation dictionary')
    pd.set_l2s_path('/recordings/letter_to_sound.txt')

    # None is a singleton, so compare by identity rather than equality.
    assert pd.get_lexicon_content() is None
    assert pd.state == json.loads(f"""
    {{
        "name": "pronunciation dictionary",
        "hash": "{pd.hash}",
        "date": "{pd.date}",
        "dataset": null,
        "l2s": true,
        "lexicon": false
    }}
    """)
예제 #14
0
def test_pron_dict_with_dataset(tmpdir):
    """
    Link a dataset: the state records the dataset's name while the l2s and
    lexicon flags stay false.
    """
    kaldi = KaldiInterface(f'{tmpdir}/state')
    ds = kaldi.new_dataset('dataset_x')
    pd = kaldi.new_pron_dict('pronunciation dictionary')
    pd.link(ds)

    # None is a singleton, so compare by identity rather than equality.
    assert pd.get_lexicon_content() is None
    assert pd.state == json.loads(f"""
    {{
        "name": "pronunciation dictionary",
        "hash": "{pd.hash}",
        "date": "{pd.date}",
        "dataset": "dataset_x",
        "l2s": false,
        "lexicon": false
    }}
    """)
예제 #15
0
def test_set_l2s_content(tmpdir):
    """
    Set letters to sound by content: the raw bytes of the rules file are
    handed over, an l2s.txt file appears in the pron dict's directory and the
    "l2s" flag becomes true.
    """
    kaldi = KaldiInterface(f'{tmpdir}/state')
    pd = kaldi.new_pron_dict('pronunciation dictionary')
    # Read the bytes and close the file before handing the content over.
    with open('/recordings/letter_to_sound.txt', 'rb') as fin:
        content = fin.read()
    pd.set_l2s_content(content)

    # None is a singleton, so compare by identity rather than equality.
    assert pd.get_lexicon_content() is None
    assert Path(f'{pd.path}/l2s.txt').is_file()
    assert pd.state == json.loads(f"""
    {{
        "name": "pronunciation dictionary",
        "hash": "{pd.hash}",
        "date": "{pd.date}",
        "dataset": null,
        "l2s": true,
        "lexicon": false
    }}
    """)
예제 #16
0
def mock_model(tmpdir_factory):
    """
    Build a trained model inside a temporary directory, or reload it when a
    state directory is already present there.

    :param tmpdir_factory: object providing ``mktemp(name)``; presumably
        pytest's ``tmpdir_factory`` fixture (confirm against the caller).
    :return: tuple ``(kaldi, m)`` of the Kaldi interface and the model.
    """
    base_path = tmpdir_factory.mktemp("pipeline")
    base_path = Path(base_path)
    # Use a relative segment: joinpath('/state') would discard base_path and
    # test the filesystem root '/state' instead of '<base_path>/state'.
    if not base_path.joinpath('state').exists():
        kaldi = KaldiInterface(f'{base_path}/state')

        # Dataset: import the transcribed recordings and process them.
        ds = kaldi.new_dataset('dataset_x')
        ds.add_directory('/recordings/transcribed')
        ds.select_importer('Elan')
        ds.process()

        # Pronunciation dictionary: link to the dataset, then build the
        # lexicon from the letter-to-sound rules.
        pd = kaldi.new_pron_dict('pron_dict_y')
        pd.link(ds)
        pd.set_l2s_path('/recordings/letter_to_sound.txt')
        pd.generate_lexicon()

        # Model: link dataset and pron dict, then train.
        m = kaldi.new_model('model_z')
        m.link(ds, pd)
        m.build_kaldi_structure()  # TODO: remove this line
        m.train()  # may take a while
    else:
        # State already exists: load it and reuse the existing model.
        kaldi = KaldiInterface.load(f'{base_path}/state')
        m = kaldi.new_model('model_z', use_existing=True)
    return (kaldi, m)
예제 #17
0
# ======
# Create a Kaldi interface directory (where all the associated files/objects
# will be stored).
kaldi = KaldiInterface('/elpis/state')

# Step 1
# ======
# Set up a dataset to train on.
ds = kaldi.new_dataset('dsy')
ds.add_directory('/recordings/transcribed', filter=['eaf', 'wav'])
ds.process()

# Step 2
# ======
# Build the pronunciation dictionary: link it to the dataset, point it at the
# letter-to-sound rules file, then generate the lexicon.
pd = kaldi.new_pron_dict('pd')
pd.link(ds)
pd.set_l2s_path('/recordings/letter_to_sound.txt')
pd.generate_lexicon()

# Step 3
# ======
# Link the dataset and pronunciation dictionary to a new model, then train it.
m = kaldi.new_model('mx')
m.link(ds, pd)
m.build_kaldi_structure()
m.train() # may take a while

# Step 4
# ======
# Make a transcription interface and transcribe unseen audio to Elan.