def get_alphabet(language):
    """Return the Alphabet for *language*, memoized in ALPHABETS.

    The alphabet path is read from the CLI argument named
    ``<language>_alphabet``; when no path was given, None is cached and
    returned so samples for that language are not filtered.
    """
    if language not in ALPHABETS:
        path = getattr(CLI_ARGS, language + "_alphabet")
        ALPHABETS[language] = Alphabet(path) if path else None
    return ALPHABETS[language]
def main():
    """CLI entry point: parse arguments and run the CommonVoice v2.0 import."""
    parser = get_importers_parser(
        description="Import CommonVoice v2.0 corpora")
    parser.add_argument("tsv_dir", help="Directory containing tsv files")
    parser.add_argument(
        "--audio_dir",
        help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
    )
    parser.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    parser.add_argument(
        "--space_after_every_character",
        action="store_true",
        help="To help transcript join by white space",
    )
    params = parser.parse_args()

    validate_label = get_validate_label(params)

    # Default the clips directory relative to the tsv directory.
    if params.audio_dir:
        audio_dir = params.audio_dir
    else:
        audio_dir = os.path.join(params.tsv_dir, "clips")

    # Only build an Alphabet when a filter file was supplied.
    alphabet = None
    if params.filter_alphabet:
        alphabet = Alphabet(params.filter_alphabet)

    filter_obj = LabelFilter(params.normalize, alphabet, validate_label)
    _preprocess_data(params.tsv_dir, audio_dir, filter_obj,
                     params.space_after_every_character)
def _ending_tester(self, file, expected):
    """Check encode/decode round-trips for one alphabet test fixture.

    Loads ``test_data/<file>`` relative to this test module and, for each
    (expected_label, expected_label_id) pair, asserts that encoding the
    label yields ``[expected_label_id]`` and that decoding
    ``[expected_label_id]`` yields the label back.
    """
    alphabet = Alphabet(
        os.path.join(os.path.dirname(__file__), 'test_data', file))
    label = ''
    label_id = -1
    for expected_label, expected_label_id in expected:
        try:
            label_id = alphabet.encode(expected_label)
        except KeyError:
            # Swallowed on purpose: label_id keeps its previous value, so
            # the assertEqual below still reports the mismatch.
            pass
        self.assertEqual(label_id, [expected_label_id])
        try:
            label = alphabet.decode([expected_label_id])
        except KeyError:
            # Same pattern as above: let the assertion surface the failure.
            pass
        self.assertEqual(label, expected_label)
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    """Build a scorer package from a KenLM model and a vocabulary file.

    Args:
        alphabet_path: Path to the alphabet file; required unless UTF-8
            mode is in effect.
        lm_path: Path to the binary KenLM language model.
        vocab_path: Path to the text file listing vocabulary words.
        package_path: Output path; the LM is copied here and the
            dictionary appended to it.
        force_utf8: When not None, its ``.value`` overrides the
            auto-detected UTF-8 mode.
        default_alpha: Default LM weight stored in the package.
        default_beta: Default word-insertion weight stored in the package.

    Raises:
        RuntimeError: If the alphabet path is missing or the alphabet
            fails to deserialize.
    Exits the process with status 1 when the LM cannot be loaded or the
    package cannot be written.
    """
    words = set()
    vocab_looks_char_based = True
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))
                # Any multi-character token means this is a word vocabulary.
                if len(word) > 1:
                    vocab_looks_char_based = False
    print("{} unique words read from vocabulary file.".format(len(words)))
    cbm = "Looks" if vocab_looks_char_based else "Doesn't look"
    print("{} like a character based model.".format(cbm))

    # Fix: compare against None with "is not None" (PEP 8) rather than "!=",
    # which also makes the old pylint singleton-comparison suppression
    # unnecessary.
    if force_utf8 is not None:
        use_utf8 = force_utf8.value
    else:
        use_utf8 = vocab_looks_char_based
    # NOTE(review): this message also prints when the mode was forced, not
    # detected — kept verbatim to preserve the tool's output.
    print("Using detected UTF-8 mode: {}".format(use_utf8))

    if use_utf8:
        serialized_alphabet = UTF8Alphabet().serialize()
    else:
        if not alphabet_path:
            raise RuntimeError("No --alphabet path specified, can't continue.")
        serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        raise RuntimeError("Error loading alphabet: {}".format(err))

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(use_utf8)
    scorer.reset_params(default_alpha, default_beta)
    err = scorer.load_lm(lm_path)
    # DS_ERR_SCORER_NO_TRIE is the one acceptable return value here —
    # presumably because the dictionary/trie has not been appended yet
    # (it is filled just below); anything else is a real load failure.
    if err != ds_ctcdecoder.DS_ERR_SCORER_NO_TRIE:
        print('Error loading language model file: 0x{:X}.'.format(err))
        print(
            'See the error codes section in https://deepspeech.readthedocs.io for a description.'
        )
        sys.exit(1)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)
    # append, not overwrite
    if scorer.save_dictionary(package_path, True):
        print("Package created in {}".format(package_path))
    else:
        print("Error when creating {}".format(package_path))
        sys.exit(1)
def create_bundle(
    alphabet_path,
    lm_path,
    vocab_path,
    package_path,
    force_utf8,
    default_alpha,
    default_beta,
):
    """Build a scorer package from a KenLM model and a vocabulary file.

    Args:
        alphabet_path: Path to the alphabet file; required unless UTF-8
            mode is in effect.
        lm_path: Path to the binary KenLM language model.
        vocab_path: Path to the text file listing vocabulary words.
        package_path: Output path; the LM is copied here and the
            dictionary appended to it.
        force_utf8: When not None, its ``.value`` overrides the
            auto-detected UTF-8 mode.
        default_alpha: Default LM weight stored in the package.
        default_beta: Default word-insertion weight stored in the package.

    Exits the process with status 1 when the alphabet is missing or fails
    to deserialize.
    """
    words = set()
    vocab_looks_char_based = True
    with open(vocab_path) as fin:
        for line in fin:
            for word in line.split():
                words.add(word.encode("utf-8"))
                # Any multi-character token means this is a word vocabulary.
                if len(word) > 1:
                    vocab_looks_char_based = False
    print("{} unique words read from vocabulary file.".format(len(words)))
    print("{} like a character based model.".format(
        "Looks" if vocab_looks_char_based else "Doesn't look"))

    # Fix: compare against None with "is not None" (PEP 8) rather than "!=",
    # which also makes the old pylint singleton-comparison suppression
    # unnecessary.
    if force_utf8 is not None:
        use_utf8 = force_utf8.value
        print("Forcing UTF-8 mode = {}".format(use_utf8))
    else:
        use_utf8 = vocab_looks_char_based

    if use_utf8:
        serialized_alphabet = UTF8Alphabet().serialize()
    else:
        if not alphabet_path:
            print("No --alphabet path specified, can't continue.")
            sys.exit(1)
        serialized_alphabet = Alphabet(alphabet_path).serialize()

    alphabet = NativeAlphabet()
    err = alphabet.deserialize(serialized_alphabet, len(serialized_alphabet))
    if err != 0:
        print("Error loading alphabet: {}".format(err))
        sys.exit(1)

    scorer = Scorer()
    scorer.set_alphabet(alphabet)
    scorer.set_utf8_mode(use_utf8)
    scorer.reset_params(default_alpha, default_beta)
    # NOTE(review): the return values of load_lm and save_dictionary are
    # ignored here, while a sibling version of this function checks them and
    # exits on error — confirm whether this decoder version reports failures
    # through return codes before relying on this path.
    scorer.load_lm(lm_path)
    scorer.fill_dictionary(list(words))
    shutil.copy(lm_path, package_path)
    scorer.save_dictionary(package_path, True)  # append, not overwrite
    print("Package created in {}".format(package_path))
# NOTE(review): the first statements below are the tail of handle_args();
# its "def handle_args():" line lies above this chunk.
    parser.add_argument(dest="target_dir")
    parser.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    return parser.parse_args()


if __name__ == "__main__":
    CLI_ARGS = handle_args()
    # Only build an Alphabet when a filter file was supplied.
    ALPHABET = Alphabet(
        CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None
    validate_label = get_validate_label(CLI_ARGS)

    def label_filter(label):
        """Normalize/validate a transcript; return None to drop the sample."""
        if CLI_ARGS.normalize:
            # Decompose diacritics (NFKD) and strip anything non-ASCII.
            label = (unicodedata.normalize("NFKD", label.strip()).encode(
                "ascii", "ignore").decode("ascii", "ignore"))
        label = validate_label(label)
        if ALPHABET and label:
            try:
                ALPHABET.encode(label)
            except KeyError:
                # Label contains characters outside the alphabet: drop it.
                label = None
        return label

    # NOTE(review): label_filter is not passed here — presumably it is read
    # as a module global by _download_and_preprocess_data; verify.
    _download_and_preprocess_data(target_dir=CLI_ARGS.target_dir)
description="Import German Distant Speech (TUDA)") parser.add_argument("base_dir", help="Directory containing all data") parser.add_argument( "--max_duration", type=int, default=10000, help="Maximum sample duration in milliseconds", ) parser.add_argument( "--normalize", action="store_true", help="Converts diacritic characters to their base ones", ) parser.add_argument( "--alphabet", help="Exclude samples with characters not in provided alphabet file", ) parser.add_argument( "--keep_archive", type=bool, default=True, help="If downloaded archives should be kept", ) return parser.parse_args() if __name__ == "__main__": CLI_ARGS = handle_args() ALPHABET = Alphabet(CLI_ARGS.alphabet) if CLI_ARGS.alphabet else None download_and_prepare()
# NOTE(review): this chunk starts mid-call — the opening
# "PARSER.add_argument(" for the --normalize flag lies above this chunk.
    "--normalize",
    action="store_true",
    help="Converts diacritic characters to their base ones",
)
PARSER.add_argument(
    "--space_after_every_character",
    action="store_true",
    help="To help transcript join by white space",
)
PARAMS = PARSER.parse_args()
validate_label = get_validate_label(PARAMS)

# Default the clips directory relative to the tsv directory.
AUDIO_DIR = (PARAMS.audio_dir if PARAMS.audio_dir else os.path.join(
    PARAMS.tsv_dir, "clips"))
# Only build an Alphabet when a filter file was supplied.
ALPHABET = Alphabet(
    PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None


def label_filter_fun(label):
    """Normalize/validate a transcript; return None to drop the sample."""
    if PARAMS.normalize:
        # Decompose diacritics (NFKD) and strip anything non-ASCII.
        label = (unicodedata.normalize("NFKD", label.strip()).encode(
            "ascii", "ignore").decode("ascii", "ignore"))
    label = validate_label(label)
    if ALPHABET and label:
        try:
            ALPHABET.encode(label)
        except KeyError:
            # Label contains characters outside the alphabet: drop it.
            label = None
    return label


# NOTE(review): label_filter_fun is not passed here — presumably it is read
# as a module global by _preprocess_data; verify against its definition.
_preprocess_data(PARAMS.tsv_dir, AUDIO_DIR,
                 PARAMS.space_after_every_character)