def test_basic(basic_dict_path, basic_corpus_dir, generated_dir, default_feature_config):
    """Features generated for the basic corpus must have dimension 39."""
    # The dictionary and the corpus share the same output directory.
    work_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()
    corpus = AlignableCorpus(basic_corpus_dir, work_dir)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    feature_dim = corpus.get_feat_dim(default_feature_config)
    assert feature_dim == 39
def test_basic(basic_dict_path, generated_dir):
    """Check the phone sets exposed by the basic dictionary after writing it."""
    lexicon = Dictionary(basic_dict_path, os.path.join(generated_dir, 'basic'))
    lexicon.write()
    expected_phones = {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
    # Every non-silence phone is expected with each of the four position suffixes.
    expected_positional = {
        phone + suffix
        for phone in ('phonea', 'phoneb', 'phonec')
        for suffix in ('_B', '_I', '_E', '_S')
    }
    assert set(lexicon.phones) == expected_phones
    assert set(lexicon.positional_nonsil_phones) == expected_positional
def test_stereo(basic_dict_path, stereo_corpus_dir, temp_dir, default_feature_config):
    """Features generated for the stereo corpus must have dimension 39."""
    corpus_temp = os.path.join(temp_dir, 'stereo')
    # The dictionary output lives in a 'basic' sub-directory of the corpus temp dir.
    lexicon = Dictionary(basic_dict_path, os.path.join(corpus_temp, 'basic'))
    lexicon.write()
    corpus = AlignableCorpus(stereo_corpus_dir, corpus_temp)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    feature_dim = corpus.get_feat_dim(default_feature_config)
    assert feature_dim == 39
def test_basic_txt(basic_corpus_txt_dir, basic_dict_path, generated_dir, default_feature_config):
    """Load the corpus at ``basic_corpus_txt_dir`` and check its features.

    The corpus must report no files without a transcription, and the
    generated features must have dimension 39.
    """
    work_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()
    corpus = AlignableCorpus(basic_corpus_txt_dir, work_dir)
    # Checked before initialization, exactly as the corpus reports it after loading.
    assert len(corpus.no_transcription_files) == 0
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    feature_dim = corpus.get_feat_dim(default_feature_config)
    assert feature_dim == 39
def test_subset(large_prosodylab_format_directory, temp_dir, large_dataset_dictionary, default_feature_config):
    """The split directory and a 10-utterance subset directory must both exist."""
    output_directory = os.path.join(temp_dir, 'large_subset')
    # Start from a clean output directory; a missing directory is not an error.
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon = Dictionary(large_dataset_dictionary, output_directory)
    lexicon.write()
    corpus = AlignableCorpus(large_prosodylab_format_directory, output_directory)
    corpus.initialize_corpus(lexicon)
    # The split directory is requested before feature generation.
    split_dir = corpus.split_directory()

    default_feature_config.generate_features(corpus)
    subset_dir = corpus.subset_directory(10, default_feature_config)
    for produced_path in (split_dir, subset_dir):
        assert os.path.exists(produced_path)
def test_transcribe_from_temp(basic_corpus_txt_dir, basic_dict_path, generated_dir, default_feature_config):
    """Build a ``TranscribeCorpus`` twice over the same output directory.

    Both passes must yield 39-dimensional features. The second pass runs
    against whatever the first pass left in the output directory.
    """
    work_dir = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()
    for _ in range(2):
        corpus = TranscribeCorpus(basic_corpus_txt_dir, work_dir, use_mp=False)
        corpus.initialize_corpus(lexicon)
        default_feature_config.generate_features(corpus)
        assert corpus.get_feat_dim(default_feature_config) == 39
def test_short_segments(basic_dict_path, shortsegments_corpus_dir, temp_dir, default_feature_config):
    """Check mapping sizes and the ignored-utterance count for the short-segments corpus."""
    work_dir = os.path.join(temp_dir, 'short_segments')
    lexicon = Dictionary(basic_dict_path, work_dir)
    lexicon.write()
    short_corpus = AlignableCorpus(shortsegments_corpus_dir, work_dir)
    short_corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(short_corpus)

    # Three utterances are known to the corpus, but only two have features.
    assert len(short_corpus.feat_mapping.keys()) == 2
    assert len(short_corpus.utt_speak_mapping.keys()) == 3
    assert len(short_corpus.speak_utt_mapping.keys()) == 1
    assert len(short_corpus.text_mapping.keys()) == 3
    assert len(short_corpus.utt_wav_mapping.keys()) == 1
    assert len(short_corpus.segments.keys()) == 3
    # Exactly one utterance is reported as ignored.
    assert len(short_corpus.ignored_utterances) == 1
def test_weird_words(weird_words_dir, temp_dir, sick_dict_path):
    """Check apostrophe handling in the dictionary and the OOV list of the corpus."""
    output_directory = os.path.join(temp_dir, 'weird_words')
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon = Dictionary(sick_dict_path, output_directory)

    # Entries spelled with the curly apostrophe must be absent ...
    for curly_form in ('i’m', '’m'):
        assert curly_form not in lexicon.words
    # ... while the straight-apostrophe spellings carry the pronunciations.
    full_form = lexicon.words["i'm"]
    assert full_form[0]['pronunciation'] == ('ay', 'm', 'ih')
    assert full_form[1]['pronunciation'] == ('ay', 'm')
    assert lexicon.words["'m"][0]['pronunciation'] == ('m',)

    lexicon.write()
    corpus = AlignableCorpus(weird_words_dir, output_directory, use_mp=False)
    corpus.initialize_corpus(lexicon)
    print(corpus.utterance_oovs['weird_words'])
    expected_oovs = ['ajfish', 'asds-asda', 'sdasd']
    assert corpus.utterance_oovs['weird_words'] == expected_oovs
Пример #9
0
def validate_corpus(args):
    """Validate a corpus against a dictionary in a freshly wiped temp directory.

    ``args`` is the parsed command-line namespace. It must provide
    ``temp_directory``, ``corpus_directory``, ``speaker_characters``,
    ``dictionary_path`` and ``disable_mp``; ``num_jobs``,
    ``ignore_acoustics`` and ``test_transcriptions`` are optional.
    ``args.corpus_directory`` is rewritten in place when it ends with a
    path separator.
    """
    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)

    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)

    data_directory = os.path.join(temp_dir, corpus_name)
    # Validation always starts from an empty working directory.
    shutil.rmtree(data_directory, ignore_errors=True)
    os.makedirs(data_directory, exist_ok=True)

    corpus = AlignableCorpus(
        args.corpus_directory,
        data_directory,
        speaker_characters=args.speaker_characters,
        num_jobs=getattr(args, 'num_jobs', 3),
    )
    dictionary = Dictionary(
        args.dictionary_path, data_directory, word_set=corpus.word_set)

    validator_options = {
        'temp_directory': data_directory,
        'ignore_acoustics': getattr(args, 'ignore_acoustics', False),
        'test_transcriptions': getattr(args, 'test_transcriptions', False),
        'use_mp': not args.disable_mp,
    }
    validator = CorpusValidator(corpus, dictionary, **validator_options)
    validator.validate()
Пример #10
0
def train_lm(args):
    """Train a language model from the corpus at ``args.corpus_directory``.

    ``args`` is the parsed command-line namespace. The training
    configuration comes from ``args.config_path`` when given, otherwise
    from the basic defaults. A dictionary is loaded only when
    ``args.dictionary_path`` is not ``None``.
    """
    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)

    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)

    data_directory = os.path.join(temp_dir, corpus_name)
    corpus = AlignableCorpus(args.corpus_directory, data_directory)

    train_config = (train_lm_yaml_to_config(args.config_path)
                    if args.config_path else load_basic_train_lm())

    dictionary = None
    if args.dictionary_path is not None:
        dictionary = Dictionary(args.dictionary_path, data_directory)

    lm_trainer = LmTrainer(
        corpus,
        train_config,
        args.output_model_path,
        dictionary=dictionary,
        temp_directory=data_directory,
        num_jobs=args.num_jobs,
    )
    lm_trainer.train()
def test_generate_orthography_dict(basic_corpus_dir, orth_sick_output,
                                   temp_dir):
    """Run ``g2p`` through the command-line parser and load the resulting dictionary."""
    if G2P_DISABLED:
        pytest.skip('No Pynini found')
    cli_tokens = ['g2p', basic_corpus_dir, orth_sick_output]
    cli_tokens += ['-t', temp_dir, '-q', '--clean', '-d']
    # Unrecognised arguments are tolerated and discarded.
    args, _unknown = parser.parse_known_args(cli_tokens)
    run_g2p(args)
    assert os.path.exists(orth_sick_output)
    generated = Dictionary(orth_sick_output, temp_dir)
    assert len(generated.words) > 0
Пример #12
0
def test_generate_orthography_dict(basic_corpus_dir, orth_sick_output,
                                   temp_dir):
    """Run ``run_g2p`` without a G2P model and load the resulting dictionary."""
    if G2P_DISABLED:
        pytest.skip('No Pynini found')
    args = G2PDummyArgs()
    settings = (
        ('g2p_model_path', None),  # no model is supplied for this run
        ('input_path', basic_corpus_dir),
        ('output_path', orth_sick_output),
        ('temp_directory', temp_dir),
    )
    for attribute, value in settings:
        setattr(args, attribute, value)
    run_g2p(args)
    assert os.path.exists(orth_sick_output)
    generated = Dictionary(orth_sick_output, temp_dir)
    assert len(generated.words) > 0
Пример #13
0
def validate_corpus(args):
    """Validate a corpus against a dictionary, logging to the temp directory.

    ``args`` is the parsed command-line namespace. It must provide
    ``temp_directory``, ``corpus_directory``, ``speaker_characters``,
    ``dictionary_path``, ``acoustic_model_path`` and ``disable_mp``;
    ``num_jobs``, ``ignore_acoustics`` and ``test_transcriptions`` are
    optional. ``args.corpus_directory`` is rewritten in place when it ends
    with a path separator.

    The working directory is wiped before every run. Log handlers are
    closed and detached even when loading or validation raises, so a
    failed run does not leave log files open.
    """
    command = 'validate'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    shutil.rmtree(data_directory, ignore_errors=True)

    os.makedirs(data_directory, exist_ok=True)
    logger = setup_logger(command, data_directory)

    try:
        use_mp = not args.disable_mp
        corpus = AlignableCorpus(args.corpus_directory,
                                 data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=getattr(args, 'num_jobs', 3),
                                 logger=logger,
                                 use_mp=use_mp)
        dictionary = Dictionary(args.dictionary_path,
                                data_directory,
                                logger=logger)
        if args.acoustic_model_path:
            # Only check model/dictionary compatibility when a model is given.
            acoustic_model = AcousticModel(args.acoustic_model_path)
            acoustic_model.validate(dictionary)

        validator = CorpusValidator(
            corpus,
            dictionary,
            temp_directory=data_directory,
            ignore_acoustics=getattr(args, 'ignore_acoustics', False),
            test_transcriptions=getattr(args, 'test_transcriptions', False),
            use_mp=use_mp,
            logger=logger)
        begin = time.time()
        validator.validate()
        logger.debug('Validation took {} seconds'.format(time.time() - begin))
        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() -
                                                               all_begin))
    finally:
        # Iterate over a copy: removeHandler mutates logger.handlers.
        for handler in logger.handlers[:]:
            handler.close()
            logger.removeHandler(handler)
def train_g2p(args):
    """Train a G2P model from the dictionary at ``args.dictionary_path``.

    ``args`` is the parsed command-line namespace. When ``args.validate``
    is truthy the trainer's validation step runs before training.
    """
    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)
    # The dictionary is given an empty output directory here.
    lexicon = Dictionary(args.dictionary_path, '')
    g2p_trainer = Trainer(
        lexicon,
        args.output_model_path,
        temp_directory=temp_dir,
        order=args.order,
        num_jobs=args.num_jobs,
        use_mp=not args.disable_mp,
    )
    if args.validate:
        g2p_trainer.validate()
    g2p_trainer.train()
Пример #15
0
def train_lm(args):
    """Train a language model from a corpus directory or an ARPA file.

    ``args`` is the parsed command-line namespace. When
    ``args.source_path`` ends in ``.arpa`` (case-insensitive) the path
    itself is handed to the trainer and no dictionary is loaded. Otherwise
    an ``AlignableCorpus`` is built from it, and a dictionary is loaded
    when ``args.dictionary_path`` is not ``None``.
    ``args.source_path`` is rewritten in place when it ends with a path
    separator.

    Log handlers are closed and detached even when corpus loading or
    training raises.
    """
    command = 'train_lm'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    if args.config_path:
        train_config = train_lm_yaml_to_config(args.config_path)
    else:
        train_config = load_basic_train_lm()
    corpus_name = os.path.basename(args.source_path)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.source_path = os.path.dirname(args.source_path)
        corpus_name = os.path.basename(args.source_path)

    is_arpa = args.source_path.lower().endswith('.arpa')
    if is_arpa:
        # Name the working directory after the file without its extension.
        corpus_name = os.path.splitext(corpus_name)[0]
    data_directory = os.path.join(temp_dir, corpus_name)

    logger = setup_logger(command, data_directory)
    try:
        source = args.source_path
        dictionary = None
        if not is_arpa:
            source = AlignableCorpus(args.source_path, data_directory,
                                     num_jobs=args.num_jobs,
                                     use_mp=args.num_jobs > 1)
            if args.dictionary_path is not None:
                dictionary = Dictionary(args.dictionary_path, data_directory)
        trainer = LmTrainer(source, train_config, args.output_model_path,
                            dictionary=dictionary,
                            temp_directory=data_directory,
                            supplemental_model_path=args.model_path,
                            supplemental_model_weight=args.model_weight)
        begin = time.time()
        trainer.train()
        logger.debug('Training took {} seconds'.format(time.time() - begin))

        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() - all_begin))
    finally:
        # Iterate over a copy: removeHandler mutates logger.handlers.
        for handler in logger.handlers[:]:
            handler.close()
            logger.removeHandler(handler)
Пример #16
0
def train_lm(args):
    """Train a language model from a corpus directory or an ARPA file.

    ``args`` is the parsed command-line namespace. A source path ending
    in ``.arpa`` (case-insensitive) is passed to the trainer as-is with no
    dictionary. Any other path is loaded as an ``AlignableCorpus``, with a
    dictionary when ``args.dictionary_path`` is not ``None``. The total
    run time is printed at the end.
    """
    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)
    all_begin = time.time()

    corpus_name = os.path.basename(args.source_path)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.source_path = os.path.dirname(args.source_path)
        corpus_name = os.path.basename(args.source_path)

    source_is_arpa = args.source_path.lower().endswith('.arpa')
    if source_is_arpa:
        # Name the working directory after the file without its extension.
        corpus_name = os.path.splitext(corpus_name)[0]
    data_directory = os.path.join(temp_dir, corpus_name)

    dictionary = None
    if source_is_arpa:
        source = args.source_path
    else:
        source = AlignableCorpus(args.source_path,
                                 data_directory,
                                 num_jobs=args.num_jobs)
        if args.dictionary_path is not None:
            dictionary = Dictionary(args.dictionary_path, data_directory)

    train_config = (train_lm_yaml_to_config(args.config_path)
                    if args.config_path else load_basic_train_lm())

    lm_trainer = LmTrainer(
        source,
        train_config,
        args.output_model_path,
        dictionary=dictionary,
        temp_directory=data_directory,
        supplemental_model_path=args.model_path,
        supplemental_model_weight=args.model_weight,
    )
    lm_trainer.train()

    elapsed = time.time() - all_begin
    print('Done! Everything took {} seconds'.format(elapsed))
Пример #17
0
def test_speaker_groupings(large_prosodylab_format_directory, temp_dir,
                           large_dataset_dictionary, default_feature_config):
    """Check speaker and file coverage of the corpus groupings.

    The same checks run twice: once with the default job count and once,
    after wiping the output directory, with ``num_jobs=2``.
    """
    corpus_root = large_prosodylab_format_directory
    output_directory = os.path.join(temp_dir, 'large')
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon = Dictionary(large_dataset_dictionary, output_directory)
    lexicon.write()

    def assert_files_covered(corpus, collection_name):
        # Every file stem under the corpus root must occur in some entry
        # of the named corpus collection.
        for _root, _dirs, files in os.walk(corpus_root):
            for filename in files:
                stem = os.path.splitext(filename)[0]
                assert any(stem in entry
                           for entry in getattr(corpus, collection_name))

    def assert_groupings(corpus, speaker_names):
        for speaker in speaker_names:
            assert any(speaker in group for group in corpus.speaker_groups)
        assert_files_covered(corpus, 'groups')
        assert_files_covered(corpus, 'feat_mapping')

    corpus = AlignableCorpus(corpus_root, output_directory)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    # Top-level directory entries are treated as the speaker names.
    speakers = os.listdir(corpus_root)
    assert_groupings(corpus, speakers)

    # Second pass: rebuild everything from scratch with two jobs.
    shutil.rmtree(output_directory, ignore_errors=True)
    lexicon.write()
    corpus = AlignableCorpus(corpus_root, output_directory, num_jobs=2)
    corpus.initialize_corpus(lexicon)
    default_feature_config.generate_features(corpus)
    assert_groupings(corpus, speakers)
def test_extra_annotations(extra_annotations_path, generated_dir):
    """The dictionary's graphemes must include ``{``, and it must write out."""
    output_directory = os.path.join(generated_dir, 'extra')
    lexicon = Dictionary(extra_annotations_path, output_directory)
    graphemes = lexicon.graphemes
    assert '{' in graphemes
    lexicon.write()
def test_basic_noposition(basic_dict_path, generated_dir):
    """Check the phone set when position-dependent phones are disabled."""
    output_directory = os.path.join(generated_dir, 'basic')
    lexicon = Dictionary(basic_dict_path,
                         output_directory,
                         position_dependent_phones=False)
    # The return value of write() is not needed by this test.
    lexicon.write()
    expected_phones = {'sil', 'sp', 'spn', 'phonea', 'phoneb', 'phonec'}
    assert set(lexicon.phones) == expected_phones
Пример #20
0
def align_corpus(args):
    """Align a corpus with a pretrained acoustic model and export TextGrids.

    ``args`` is the parsed command-line namespace. It must provide
    ``temp_directory``, ``corpus_directory``, ``dictionary_path``,
    ``acoustic_model_path``, ``output_directory``, ``speaker_characters``,
    ``num_jobs``, ``config_path`` and ``verbose``; ``clean`` and ``debug``
    are optional and default to ``False``.

    A ``config.yml`` in the working directory records the settings of the
    previous run. The working directory is wiped when ``clean`` is set,
    when the previous run was marked dirty, or when the recorded type,
    corpus directory, version or dictionary path differ from this run.
    After a wipe the record is replaced with this run's settings, so the
    stale record is not written back and does not force another wipe on
    the next run. The record is written in all cases, and marked dirty
    when an ``Exception`` escapes.
    """
    all_begin = time.time()
    debug = getattr(args, 'debug', False)
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')

    # Record describing the current invocation.
    fresh_conf = {'dirty': False,
                  'begin': time.time(),
                  'version': __version__,
                  'type': 'align',
                  'corpus_directory': args.corpus_directory,
                  'dictionary_path': args.dictionary_path}
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = fresh_conf
    if getattr(args, 'clean', False) \
            or conf['dirty'] or conf['type'] != 'align' \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        shutil.rmtree(data_directory, ignore_errors=True)
        # The old record described the directory that was just removed.
        conf = fresh_conf

    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=args.num_jobs)
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set)
        acoustic_model.validate(dictionary)

        begin = time.time()
        if args.config_path:
            align_config = align_yaml_to_config(args.config_path)
        else:
            align_config = load_basic_align()
        a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config,
                              temp_directory=data_directory,
                              debug=debug)
        if debug:
            print('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose

        begin = time.time()
        a.align()
        if debug:
            print('Performed alignment in {} seconds'.format(time.time() - begin))

        begin = time.time()
        a.export_textgrids(args.output_directory)
        if debug:
            print('Exported TextGrids in {} seconds'.format(time.time() - begin))
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except Exception:
        # Mark the working directory as unreliable for the next run.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def transcribe_corpus(args):
    """Transcribe a corpus and export (or evaluate) the transcriptions.

    ``args`` is the parsed command-line namespace. It must provide
    ``temp_directory``, ``corpus_directory``, ``dictionary_path``,
    ``acoustic_model_path``, ``language_model_path``, ``output_directory``,
    ``speaker_characters``, ``num_jobs``, ``config_path`` and ``evaluate``;
    ``clean`` and ``debug`` are optional and default to ``False``.

    A ``config.yml`` in the working directory records the settings of the
    previous run. When those differ from this run, a warning is logged and
    each difference is reported at debug level. Keys missing from the
    record (for example one written by another subcommand) count as
    differences instead of raising ``KeyError``. The record is written
    back at the end if the working directory still exists, and is marked
    dirty when an ``Exception`` escapes.
    """
    command = 'transcribe'
    all_begin = time.time()
    debug = getattr(args, 'debug', False)
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    if args.config_path:
        transcribe_config = transcribe_yaml_to_config(args.config_path)
    else:
        transcribe_config = load_basic_transcribe()
    data_directory = os.path.join(temp_dir, corpus_name)
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        print('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    logger = setup_logger(command, data_directory)
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    conf_path = os.path.join(data_directory, 'config.yml')
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': 'transcribe',
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path,
            'acoustic_model_path': args.acoustic_model_path,
            'language_model_path': args.language_model_path,
        }

    # (record key, value for this run, debug message) in reporting order.
    current_settings = (
        ('type', command,
         'Previous run was a different subcommand than {new} (was {old})'),
        ('corpus_directory', args.corpus_directory,
         'Previous run used source directory path {old} (new run: {new})'),
        ('version', __version__,
         'Previous run was on {old} version (new run: {new})'),
        ('dictionary_path', args.dictionary_path,
         'Previous run used dictionary path {old} (new run: {new})'),
        ('acoustic_model_path', args.acoustic_model_path,
         'Previous run used acoustic model path {old} (new run: {new})'),
        ('language_model_path', args.language_model_path,
         'Previous run used language model path {old} (new run: {new})'),
    )
    # conf.get: a record from another subcommand may lack some of the keys.
    differences = [template.format(old=conf.get(key), new=value)
                   for key, value, template in current_settings
                   if conf.get(key) != value]
    if conf['dirty'] or differences:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        for message in differences:
            logger.debug(message)
    try:
        # Evaluation needs the alignable corpus class; plain transcription
        # uses TranscribeCorpus.
        corpus_class = AlignableCorpus if args.evaluate else TranscribeCorpus
        corpus = corpus_class(
            args.corpus_directory,
            data_directory,
            speaker_characters=args.speaker_characters,
            num_jobs=args.num_jobs,
            use_mp=transcribe_config.use_mp)
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path,
                                       root_directory=data_directory)
        language_model = LanguageModel(args.language_model_path,
                                       root_directory=data_directory)
        dictionary = Dictionary(args.dictionary_path, data_directory)
        acoustic_model.validate(dictionary)
        begin = time.time()
        t = Transcriber(corpus,
                        dictionary,
                        acoustic_model,
                        language_model,
                        transcribe_config,
                        temp_directory=data_directory,
                        debug=debug,
                        evaluation_mode=args.evaluate)
        if debug:
            print('Setup transcriber in {} seconds'.format(time.time() -
                                                           begin))

        begin = time.time()
        t.transcribe()
        if debug:
            print('Performed transcribing in {} seconds'.format(time.time() -
                                                                begin))
        if args.evaluate:
            t.evaluate(args.output_directory)
            best_config_path = os.path.join(args.output_directory,
                                            'best_transcribe_config.yaml')
            save_config(t.transcribe_config, best_config_path)
            t.export_transcriptions(args.output_directory)
        else:
            begin = time.time()
            t.export_transcriptions(args.output_directory)
            if debug:
                print('Exported transcriptions in {} seconds'.format(
                    time.time() - begin))
        print('Done! Everything took {} seconds'.format(time.time() -
                                                        all_begin))
    except Exception:
        # Mark the working directory as unreliable for the next run.
        conf['dirty'] = True
        raise
    finally:
        # Iterate over a copy: removeHandler mutates logger.handlers.
        for handler in logger.handlers[:]:
            handler.close()
            logger.removeHandler(handler)
        if os.path.exists(data_directory):
            with open(conf_path, 'w') as f:
                yaml.dump(conf, f)
Пример #22
0
def train_ivector(args):
    """Align a corpus with a pretrained model, then train the ivector extractor.

    ``args`` is the parsed command-line namespace. It must provide
    ``temp_directory``, ``corpus_directory``, ``dictionary_path``,
    ``acoustic_model_path``, ``output_model_path``, ``speaker_characters``,
    ``num_jobs``, ``config_path`` and ``verbose``; ``clean`` and ``debug``
    are optional and default to ``False``.

    A ``config.yml`` in the working directory records the settings of the
    previous run. When those differ from this run, a warning is logged and
    each difference is reported at debug level. Keys missing from the
    record (for example one written by another subcommand) count as
    differences instead of raising ``KeyError``. The record is always
    written back at the end, and is marked dirty when an ``Exception``
    escapes.
    """
    command = 'train_ivector'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    if args.config_path:
        train_config, align_config = train_yaml_to_config(args.config_path)
    else:
        train_config, align_config = load_basic_train_ivector()
    conf_path = os.path.join(data_directory, 'config.yml')
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        print('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    logger = setup_logger(command, data_directory)

    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {
            'dirty': False,
            'begin': all_begin,
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path,
            'acoustic_model_path': args.acoustic_model_path,
        }

    # (record key, value for this run, debug message) in reporting order.
    current_settings = (
        ('type', command,
         'Previous run was a different subcommand than {new} (was {old})'),
        ('corpus_directory', args.corpus_directory,
         'Previous run used source directory path {old} (new run: {new})'),
        ('version', __version__,
         'Previous run was on {old} version (new run: {new})'),
        ('dictionary_path', args.dictionary_path,
         'Previous run used dictionary path {old} (new run: {new})'),
        ('acoustic_model_path', args.acoustic_model_path,
         'Previous run used acoustic model path {old} (new run: {new})'),
    )
    # conf.get: a record from another subcommand may lack some of the keys.
    differences = [template.format(old=conf.get(key), new=value)
                   for key, value, template in current_settings
                   if conf.get(key) != value]
    if conf['dirty'] or differences:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        for message in differences:
            logger.debug(message)

    os.makedirs(data_directory, exist_ok=True)
    try:
        begin = time.time()
        corpus = AlignableCorpus(args.corpus_directory,
                                 data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=args.num_jobs,
                                 debug=getattr(args, 'debug', False),
                                 logger=logger,
                                 use_mp=align_config.use_mp)
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path,
                                data_directory,
                                word_set=corpus.word_set,
                                logger=logger)
        acoustic_model.validate(dictionary)
        a = PretrainedAligner(corpus,
                              dictionary,
                              acoustic_model,
                              align_config,
                              temp_directory=data_directory,
                              logger=logger)
        logger.debug(
            'Setup pretrained aligner in {} seconds'.format(time.time() -
                                                            begin))
        a.verbose = args.verbose
        begin = time.time()
        a.align()
        logger.debug('Performed alignment in {} seconds'.format(time.time() -
                                                                begin))
        for identifier, trainer in train_config.items():
            # Every trainer gets the logger, but only 'ivector' is trained.
            trainer.logger = logger
            if identifier != 'ivector':
                continue
            begin = time.time()
            trainer.init_training(identifier, data_directory, corpus,
                                  dictionary, a)
            trainer.train(call_back=print)
            logger.debug('Training took {} seconds'.format(time.time() -
                                                           begin))
            trainer.save(args.output_model_path)

        logger.info('All done!')
        logger.debug('Done! Everything took {} seconds'.format(time.time() -
                                                               all_begin))
    except Exception:
        # Mark the working directory as unreliable for the next run.
        conf['dirty'] = True
        raise
    finally:
        # Iterate over a copy: removeHandler mutates logger.handlers.
        for handler in logger.handlers[:]:
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
Пример #23
0
def train_ivector(args):
    """Train with the ivector training configuration and save the model.

    A working directory named after the corpus is used under
    ``args.temp_directory`` (``TEMP_DIR`` when that is unset). Its
    ``config.yml`` records the previous run; the directory is wiped when
    ``args.clean`` is set, or when that record is dirty or differs from the
    current run in type, version, corpus directory or dictionary path.

    Parameters
    ----------
    args
        Parsed command-line namespace. Attributes read here:
        ``temp_directory``, ``corpus_directory`` (rewritten in place when it
        ends with a path separator), ``dictionary_path``,
        ``speaker_characters``, ``config_path``, ``verbose``,
        ``output_model_path`` and, only when OOV reports exist,
        ``output_directory``. ``clean``, ``num_jobs`` and ``debug`` are
        optional.

    Raises
    ------
    Exception
        Any error raised while loading the corpus or training is re-raised
        unchanged after the run has been flagged dirty in ``config.yml``.
    """
    command = 'train_and_align'
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')

    previous = None
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            previous = yaml.load(f, Loader=yaml.SafeLoader)
    if getattr(args, 'clean', False) or (previous is not None and (
            previous['dirty'] or previous['type'] != command
            or previous['corpus_directory'] != args.corpus_directory
            or previous['version'] != __version__
            or previous['dictionary_path'] != args.dictionary_path)):
        shutil.rmtree(data_directory, ignore_errors=True)
        # The old record described the directory that was just deleted.
        # Writing it back (e.g. with dirty=True) would force a wipe on every
        # subsequent run, so start from a fresh record instead.
        previous = None
    if previous is None:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    else:
        conf = previous

    os.makedirs(data_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory,
                                 data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=getattr(args, 'num_jobs', 3),
                                 debug=getattr(args, 'debug', False))
        if corpus.issues_check:
            print('WARNING: Some issues parsing the corpus were detected. '
                  'Please run the validator to get more information.')
        dictionary = Dictionary(args.dictionary_path,
                                data_directory,
                                word_set=corpus.word_set)
        # Copy any out-of-vocabulary reports next to the user's output.
        utt_oov_path = os.path.join(corpus.split_directory(),
                                    'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        oov_path = os.path.join(corpus.split_directory(), 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)
        if args.config_path:
            train_config, align_config = train_yaml_to_config(args.config_path)
        else:
            train_config, align_config = load_basic_train_ivector()
        a = TrainableAligner(corpus,
                             dictionary,
                             train_config,
                             align_config,
                             temp_directory=data_directory)
        a.verbose = args.verbose
        a.train()
        a.save(args.output_model_path)
    except Exception:
        # Flag the working directory so the next run starts from scratch.
        conf['dirty'] = True
        raise
    finally:
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def train_dictionary(args):
    """Align a corpus with a pretrained model and export pronunciations.

    The corpus is aligned with a ``PretrainedAligner`` inside a working
    directory named after the corpus (under ``args.temp_directory`` or
    ``TEMP_DIR``), then ``generate_pronunciations`` writes its output to
    ``args.output_directory``. The working directory is only removed when
    ``args.clean`` is set; otherwise a warning is logged if the saved
    ``config.yml`` does not match the current run.

    Parameters
    ----------
    args
        Parsed command-line namespace. Attributes read here:
        ``temp_directory``, ``corpus_directory`` (rewritten in place when it
        ends with a path separator), ``config_path``, ``dictionary_path``,
        ``acoustic_model_path``, ``speaker_characters``, ``num_jobs``,
        ``verbose`` and ``output_directory``. ``clean`` and ``debug`` are
        optional.

    Raises
    ------
    Exception
        Any error raised during corpus loading or alignment is re-raised
        unchanged after the run has been flagged dirty in ``config.yml``.
    """
    command = 'train_dictionary'
    all_begin = time.time()
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    conf_path = os.path.join(data_directory, 'config.yml')
    if args.config_path:
        align_config = align_yaml_to_config(args.config_path)
    else:
        align_config = load_basic_align()
    # Clean before the logger is set up for this directory.
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        print('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)
    logger = setup_logger(command, data_directory)
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)
    else:
        conf = {'dirty': False,
                'begin': time.time(),
                'version': __version__,
                'type': command,
                'corpus_directory': args.corpus_directory,
                'dictionary_path': args.dictionary_path,
                'acoustic_model_path': args.acoustic_model_path
                }
    if conf['dirty'] or conf['type'] != command \
            or conf['corpus_directory'] != args.corpus_directory \
            or conf['version'] != __version__ \
            or conf['dictionary_path'] != args.dictionary_path:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            logger.debug('Previous run was a different subcommand than {} (was {})'.format(command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            logger.debug('Previous run used source directory '
                         'path {} (new run: {})'.format(conf['corpus_directory'], args.corpus_directory))
        if conf['version'] != __version__:
            logger.debug('Previous run was on {} version (new run: {})'.format(conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            logger.debug('Previous run used dictionary path {} '
                         '(new run: {})'.format(conf['dictionary_path'], args.dictionary_path))
        # A config saved by another subcommand may not carry this key at all
        # (that is one of the ways to reach this branch), so look it up
        # without raising KeyError.
        previous_model_path = conf.get('acoustic_model_path')
        if previous_model_path != args.acoustic_model_path:
            logger.debug('Previous run used acoustic model path {} '
                         '(new run: {})'.format(previous_model_path, args.acoustic_model_path))

    os.makedirs(data_directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(args.corpus_directory, data_directory,
                                 speaker_characters=args.speaker_characters,
                                 num_jobs=args.num_jobs, use_mp=align_config.use_mp, logger=logger)
        if corpus.issues_check:
            logger.warning('WARNING: Some issues parsing the corpus were detected. '
                           'Please run the validator to get more information.')
        logger.info(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory, word_set=corpus.word_set, logger=logger)
        acoustic_model.validate(dictionary)

        begin = time.time()
        a = PretrainedAligner(corpus, dictionary, acoustic_model, align_config,
                              temp_directory=data_directory,
                              debug=getattr(args, 'debug', False), logger=logger)
        logger.debug('Setup pretrained aligner in {} seconds'.format(time.time() - begin))
        a.verbose = args.verbose

        begin = time.time()
        a.align()
        logger.debug('Performed alignment in {} seconds'.format(time.time() - begin))

        a.generate_pronunciations(args.output_directory)
        print('Done! Everything took {} seconds'.format(time.time() - all_begin))
    except Exception:
        # Flag the working directory so the next run warns about it.
        conf['dirty'] = True
        raise
    finally:
        # Release the logger's handlers, as align_corpus does, before
        # persisting the run record.
        for handler in logger.handlers[:]:
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)
def test_frclitics(frclitics_dict_path, generated_dir):
    """Check ``Dictionary.separate_clitics`` against the frclitics dictionary.

    Each entry pairs an input word with the list of pieces that
    ``separate_clitics`` is expected to return for it.
    """
    target_dir = os.path.join(generated_dir, 'frclitics')
    d = Dictionary(frclitics_dict_path, target_dir)
    d.write()
    expectations = (
        ("aujourd", ["aujourd"]),
        ("aujourd'hui", ["aujourd'hui"]),
        ("vingt-six", ["vingt", "six"]),
        ("m'appelle", ["m'", "appelle"]),
        ("c'est", ["c'est"]),
        ("purple-people-eater", ["purple-people-eater"]),
        ("m'appele", ["m'", "appele"]),
        ("m'ving-sic", ["m'", "ving", "sic"]),
        ("flying'purple-people-eater", ["flying'purple-people-eater"]),
    )
    for word, pieces in expectations:
        assert d.separate_clitics(word) == pieces
Пример #26
0
def transcribe_corpus(args):
    """Set up a ``Transcriber`` for a corpus and export its results.

    A working directory named after the corpus is used under
    ``args.temp_directory`` (``TEMP_DIR`` when that is unset). Its
    ``config.yml`` records the previous run; the directory is wiped when
    ``args.clean`` is set, or when that record is dirty or differs from the
    current run in type, version, corpus directory or dictionary path.

    Parameters
    ----------
    args
        Parsed command-line namespace. Attributes read here:
        ``temp_directory``, ``corpus_directory`` (rewritten in place when it
        ends with a path separator), ``output_directory``,
        ``dictionary_path``, ``acoustic_model_path``,
        ``language_model_path``, ``speaker_characters`` and ``num_jobs``.
        ``clean`` and ``debug`` are optional.

    Raises
    ------
    Exception
        Any error raised during setup or processing is re-raised unchanged
        after the run has been flagged dirty in ``config.yml``.
    """
    command = 'align'
    all_begin = time.time()
    debug = getattr(args, 'debug', False)
    if not args.temp_directory:
        temp_dir = TEMP_DIR
    else:
        temp_dir = os.path.expanduser(args.temp_directory)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # A trailing separator yields an empty basename; strip it and retry.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)
    print(data_directory, os.path.exists(data_directory))
    conf_path = os.path.join(data_directory, 'config.yml')

    previous = None
    if os.path.exists(conf_path):
        with open(conf_path, 'r') as f:
            previous = yaml.load(f, Loader=yaml.SafeLoader)
    if getattr(args, 'clean', False) or (previous is not None and (
            previous['dirty'] or previous['type'] != command
            or previous['corpus_directory'] != args.corpus_directory
            or previous['version'] != __version__
            or previous['dictionary_path'] != args.dictionary_path)):
        shutil.rmtree(data_directory, ignore_errors=True)
        # The old record described the directory that was just deleted.
        # Writing it back would force a wipe on every subsequent run.
        previous = None
    if previous is None:
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    else:
        conf = previous

    # Create the directories only after the optional wipe, so the working
    # directory is guaranteed to exist for the run and for saving the config.
    os.makedirs(data_directory, exist_ok=True)
    os.makedirs(args.output_directory, exist_ok=True)
    try:
        corpus = TranscribeCorpus(args.corpus_directory,
                                  data_directory,
                                  speaker_characters=args.speaker_characters,
                                  num_jobs=args.num_jobs)
        print(corpus.speaker_utterance_info())
        acoustic_model = AcousticModel(args.acoustic_model_path)
        language_model = LanguageModel(args.language_model_path)
        dictionary = Dictionary(args.dictionary_path, data_directory)
        acoustic_model.validate(dictionary)

        begin = time.time()
        t = Transcriber(corpus,
                        dictionary,
                        acoustic_model,
                        language_model,
                        temp_directory=data_directory,
                        debug=debug)
        if debug:
            print('Setup pretrained aligner in {} seconds'.format(time.time() -
                                                                  begin))

        # The original code called these methods on an undefined name `a`,
        # which always raised NameError; the only object built here is `t`.
        # NOTE(review): confirm that Transcriber exposes align() and
        # export_textgrids(); the method names are kept from the original.
        begin = time.time()
        t.align()
        if debug:
            print('Performed alignment in {} seconds'.format(time.time() -
                                                             begin))

        begin = time.time()
        t.export_textgrids(args.output_directory)
        if debug:
            print('Exported TextGrids in {} seconds'.format(time.time() -
                                                            begin))
        print('Done! Everything took {} seconds'.format(time.time() -
                                                        all_begin))
    except Exception:
        # Flag the working directory so the next run starts from scratch.
        conf['dirty'] = True
        raise
    finally:
        if os.path.exists(data_directory):
            with open(conf_path, 'w') as f:
                yaml.dump(conf, f)
Пример #27
0
def sick_dict(sick_dict_path, generated_dir):
    """Build, write and return the ``Dictionary`` for the 'sickcorpus' data.

    The dictionary is read from ``sick_dict_path`` and written into the
    ``sickcorpus`` sub-directory of ``generated_dir``.
    """
    sick = Dictionary(sick_dict_path,
                      os.path.join(generated_dir, 'sickcorpus'))
    sick.write()
    return sick
def align_corpus(args, unknown_args=None):
    """Train an aligner on a corpus, export TextGrids and optionally a model.

    Works inside a directory named after the corpus under
    ``args.temp_directory`` (``TEMP_DIR`` when unset). The directory is only
    removed when ``args.clean`` is set; otherwise a warning plus debug
    details are logged if the saved ``config.yml`` does not match this run.

    Parameters
    ----------
    args
        Parsed command-line namespace (temp/corpus/output directories,
        dictionary path, config path, speaker characters, verbosity, debug
        flag and output model path are read from it).
    unknown_args
        Extra command-line arguments; when non-empty they are handed to
        ``align_config.update_from_args``.

    Raises
    ------
    Exception
        Errors from corpus loading or training propagate unchanged after the
        run has been flagged dirty in ``config.yml``.
    """
    command = 'train_and_align'
    all_begin = time.time()

    temp_dir = (os.path.expanduser(args.temp_directory)
                if args.temp_directory else TEMP_DIR)
    corpus_name = os.path.basename(args.corpus_directory)
    if corpus_name == '':
        # Trailing path separator: drop it and take the basename again.
        args.corpus_directory = os.path.dirname(args.corpus_directory)
        corpus_name = os.path.basename(args.corpus_directory)
    data_directory = os.path.join(temp_dir, corpus_name)

    logger = setup_logger(command, data_directory)
    train_config, align_config = (
        train_yaml_to_config(args.config_path)
        if args.config_path else load_basic_train())
    if unknown_args:
        align_config.update_from_args(unknown_args)
    conf_path = os.path.join(data_directory, 'config.yml')

    if args.debug:
        logger.warning(
            'Running in DEBUG mode, may have impact on performance and disk usage.'
        )
    if getattr(args, 'clean', False) and os.path.exists(data_directory):
        logger.info('Cleaning old directory!')
        shutil.rmtree(data_directory, ignore_errors=True)

    if not os.path.exists(conf_path):
        conf = {
            'dirty': False,
            'begin': time.time(),
            'version': __version__,
            'type': command,
            'corpus_directory': args.corpus_directory,
            'dictionary_path': args.dictionary_path
        }
    else:
        with open(conf_path, 'r') as f:
            conf = yaml.load(f, Loader=yaml.SafeLoader)

    # Evaluated lazily, left to right, exactly like the original condition.
    reusing_old_directory = (
        conf['dirty'] or conf['type'] != command
        or conf['corpus_directory'] != args.corpus_directory
        or conf['version'] != __version__
        or conf['dictionary_path'] != args.dictionary_path)
    if reusing_old_directory:
        logger.warning(
            'WARNING: Using old temp directory, this might not be ideal for you, use the --clean flag to ensure no '
            'weird behavior for previous versions of the temporary directory.')
        if conf['dirty']:
            logger.debug('Previous run ended in an error (maybe ctrl-c?)')
        if conf['type'] != command:
            template = 'Previous run was a different subcommand than {} (was {})'
            logger.debug(template.format(command, conf['type']))
        if conf['corpus_directory'] != args.corpus_directory:
            template = 'Previous run used source directory path {} (new run: {})'
            logger.debug(template.format(conf['corpus_directory'],
                                         args.corpus_directory))
        if conf['version'] != __version__:
            template = 'Previous run was on {} version (new run: {})'
            logger.debug(template.format(conf['version'], __version__))
        if conf['dictionary_path'] != args.dictionary_path:
            template = 'Previous run used dictionary path {} (new run: {})'
            logger.debug(template.format(conf['dictionary_path'],
                                         args.dictionary_path))

    for directory in (data_directory, args.output_directory):
        os.makedirs(directory, exist_ok=True)
    try:
        corpus = AlignableCorpus(
            args.corpus_directory,
            data_directory,
            speaker_characters=args.speaker_characters,
            num_jobs=getattr(args, 'num_jobs', 3),
            debug=getattr(args, 'debug', False),
            logger=logger,
            use_mp=align_config.use_mp,
        )
        if corpus.issues_check:
            logger.warning('Some issues parsing the corpus were detected. '
                           'Please run the validator to get more information.')
        logger.info(corpus.speaker_utterance_info())
        dictionary = Dictionary(
            args.dictionary_path,
            data_directory,
            word_set=corpus.word_set,
            logger=logger,
        )

        # Copy any out-of-vocabulary reports into the output directory.
        split_dir_utt = corpus.split_directory()
        utt_oov_path = os.path.join(split_dir_utt, 'utterance_oovs.txt')
        if os.path.exists(utt_oov_path):
            shutil.copy(utt_oov_path, args.output_directory)
        split_dir_oov = corpus.split_directory()
        oov_path = os.path.join(split_dir_oov, 'oovs_found.txt')
        if os.path.exists(oov_path):
            shutil.copy(oov_path, args.output_directory)

        aligner = TrainableAligner(
            corpus,
            dictionary,
            train_config,
            align_config,
            temp_directory=data_directory,
            logger=logger,
            debug=getattr(args, 'debug', False),
        )
        aligner.verbose = args.verbose

        begin = time.time()
        aligner.train()
        logger.debug('Training took {} seconds'.format(time.time() - begin))
        aligner.export_textgrids(args.output_directory)
        if args.output_model_path is not None:
            aligner.save(args.output_model_path)

        logger.info('All done!')
        elapsed = time.time() - all_begin
        logger.debug('Done! Everything took {} seconds'.format(elapsed))
    except Exception:
        # Flag the working directory so the next run warns about it.
        conf['dirty'] = True
        raise
    finally:
        # Iterate over a copy because removeHandler mutates logger.handlers.
        for handler in list(logger.handlers):
            handler.close()
            logger.removeHandler(handler)
        with open(conf_path, 'w') as f:
            yaml.dump(conf, f)