def test_generator(sick_g2p_model_path, sick_corpus, g2p_sick_output):
    """Generate a dictionary for the sick corpus and verify the output file appears."""
    if G2P_DISABLED:
        pytest.skip('No Pynini found')
    model = G2PModel(sick_g2p_model_path)
    # The raw word set still contains bracketed (non-speech) entries,
    # so validation is expected to fail on it...
    assert not model.validate(sick_corpus.word_set)
    # ...but succeed once bracketed words are filtered out.
    unbracketed = [word for word in sick_corpus.word_set if not check_bracketed(word)]
    assert model.validate(unbracketed)
    generator = PyniniDictionaryGenerator(model, sick_corpus.word_set)
    generator.output(g2p_sick_output)
    assert os.path.exists(g2p_sick_output)
def get_word_set(corpus, include_bracketed=False):
    """Collect the word types of a corpus, including words found in
    transcription files (.lab or TextGrid) that have no matching audio.

    Parameters
    ----------
    corpus : object
        Corpus-like object exposing ``word_set`` (iterable of word strings)
        and ``transcriptions_without_wavs`` (iterable of file paths).
    include_bracketed : bool, optional
        If True, keep bracketed (non-speech) word types and return a set;
        if False (default), filter them out and return a list.

    Returns
    -------
    set or list
        The collected word types; a list when bracketed words are filtered,
        otherwise a set.
    """
    # Copy the corpus's set so the extra words gathered below never
    # mutate the corpus object's own state in place (the original code
    # aliased corpus.word_set and updated it as a side effect).
    word_set = set(corpus.word_set)
    decode_error_files = []
    textgrid_read_errors = {}
    for file_path in corpus.transcriptions_without_wavs:
        if file_path.endswith('.lab'):
            try:
                text = load_text(file_path)
            except UnicodeDecodeError:
                decode_error_files.append(file_path)
                continue
            words = parse_transcription(text)
            word_set.update(words)
        else:
            tg = TextGrid()
            try:
                tg.read(file_path)
            except Exception:
                # Record the full traceback so the user can diagnose the
                # unreadable TextGrid, then keep processing the rest.
                exc_type, exc_value, exc_traceback = sys.exc_info()
                textgrid_read_errors[file_path] = '\n'.join(
                    traceback.format_exception(exc_type, exc_value, exc_traceback))
                continue
            for ti in tg.tiers:
                # Skip annotation tiers and anything that is not an interval tier
                if ti.name.lower() == 'notes':
                    continue
                if not isinstance(ti, IntervalTier):
                    continue
                for interval in ti:
                    text = interval.mark.lower().strip()
                    words = parse_transcription(text)
                    if not words:
                        continue
                    word_set.update(words)
    if decode_error_files:
        print(
            'WARNING: The following files were not able to be decoded using utf8:\n\n'
            '{}'.format('\n'.join(decode_error_files)))
    if textgrid_read_errors:
        print(
            'WARNING: The following TextGrid files were not able to be read:\n\n'
            '{}'.format('\n'.join(textgrid_read_errors.keys())))
    print(
        'Generating transcriptions for the {} word types found in the corpus...'
        .format(len(word_set)))
    if not include_bracketed:
        word_set = [x for x in word_set if not check_bracketed(x)]
    return word_set
def generate_dictionary(args):
    """Generate a pronunciation dictionary from a corpus or word list.

    If ``args.g2p_model_path`` is given, pronunciations come from the G2P
    model; otherwise each word is mapped to its own characters as phones.
    """
    print("Generating pronunciations from G2P model")
    if args.temp_directory:
        temp_dir = os.path.expanduser(args.temp_directory)
    else:
        # Default working area lives under the global temp dir
        temp_dir = os.path.join(TEMP_DIR, 'G2P')
    if os.path.isdir(args.input_path):
        # Input is a corpus directory: load it and pull out its word types
        input_dir = os.path.expanduser(args.input_path)
        corpus_name = os.path.basename(args.input_path)
        if not corpus_name:
            # Path ended with a separator; strip it and re-derive the name
            args.input_path = os.path.dirname(args.input_path)
            corpus_name = os.path.basename(args.input_path)
        data_directory = os.path.join(temp_dir, corpus_name)
        corpus = AlignableCorpus(input_dir, data_directory,
                                 num_jobs=args.num_jobs,
                                 use_mp=(not args.disable_mp))
        word_set = get_word_set(corpus, args.include_bracketed)
    else:
        # Input is a plain text file: one or more words per line
        word_set = []
        with open(args.input_path, 'r', encoding='utf8') as f:
            for line in f:
                word_set.extend(line.strip().split())
        if not args.include_bracketed:
            word_set = [word for word in word_set if not check_bracketed(word)]
    if args.g2p_model_path is not None:
        model = G2PModel(args.g2p_model_path,
                         root_directory=os.path.join(temp_dir, 'models'))
        model.validate(word_set)
        generator = Generator(model, word_set,
                              temp_directory=temp_dir,
                              num_jobs=args.num_jobs)
        generator.output(args.output_path)
        model.clean_up()
    else:
        # No model supplied: emit a trivial character-per-phone dictionary
        with open(args.output_path, "w", encoding='utf8') as f:
            for word in word_set:
                pronunciation = list(word)
                f.write('{} {}\n'.format(word, ' '.join(pronunciation)))
def test_check_bracketed():
    """Checks if the brackets are removed correctly and handling an empty string works"""
    candidates = ['uh', '(the)', 'sick', '<corpus>', '[a]', '{cold}', '']
    # Every bracket style — (), <>, [], {} — should be flagged;
    # plain words and the empty string pass through.
    kept = [word for word in candidates if not check_bracketed(word)]
    assert kept == ['uh', 'sick', '']