def main():
    """CLI entry point: import the CommonVoice v2.0 corpus into training CSVs."""
    parser = get_importers_parser(
        description="Import CommonVoice v2.0 corpora")
    parser.add_argument("tsv_dir", help="Directory containing tsv files")
    parser.add_argument(
        "--audio_dir",
        help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
    )
    parser.add_argument(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    parser.add_argument(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )
    parser.add_argument(
        "--space_after_every_character",
        action="store_true",
        help="To help transcript join by white space",
    )

    args = parser.parse_args()

    # Fall back to "<tsv_dir>/clips" when no explicit audio directory is given.
    clips_dir = args.audio_dir or os.path.join(args.tsv_dir, "clips")

    # Optional alphabet filter; None disables alphabet-based sample exclusion.
    alphabet = None
    if args.filter_alphabet:
        alphabet = Alphabet(args.filter_alphabet)

    label_filter = LabelFilter(args.normalize, alphabet,
                               get_validate_label(args))
    _preprocess_data(args.tsv_dir, clips_dir, label_filter,
                     args.space_after_every_character)
def init_worker(params):
    """Process-pool worker initializer.

    Populates the module-level AUDIO_DIR and FILTER_OBJ globals so each
    worker process can resolve clip paths and filter labels without
    re-receiving the CLI parameters per task.
    """
    global FILTER_OBJ  # pylint: disable=global-statement
    global AUDIO_DIR  # pylint: disable=global-statement

    # Default the clip directory to "<tsv_dir>/clips" when not given.
    if params.audio_dir:
        AUDIO_DIR = params.audio_dir
    else:
        AUDIO_DIR = os.path.join(params.tsv_dir, "clips")

    # Optional alphabet-based filtering; disabled when no alphabet supplied.
    alphabet = None
    if params.filter_alphabet:
        alphabet = Alphabet(params.filter_alphabet)

    FILTER_OBJ = LabelFilter(params.normalize, alphabet,
                             get_validate_label(params))
def main(args):
    """Main entry point allowing external calls.

    Args:
        args ([str]): command line parameter list
    """
    args = parse_args(args)
    setup_logging(args.loglevel)

    # NOTE(review): validate_label is assigned but never used below — confirm
    # whether GramVaaniDataSets should receive it. Call kept in case
    # get_validate_label has load-time side effects.
    validate_label = get_validate_label(args)

    _logger.info("Starting GramVaani importer...")

    _logger.info("Starting loading GramVaani csv...")
    csv = GramVaaniCSV(args.csv_filename)

    _logger.info("Starting downloading GramVaani mp3's...")
    downloader = GramVaaniDownloader(csv, args.target_dir)
    mp3_directory = downloader.download()

    _logger.info("Starting converting GramVaani mp3's to wav's...")
    converter = GramVaaniConverter(args.target_dir, mp3_directory)
    wav_directory = converter.convert()

    # Build, materialize and persist the train/dev/test splits.
    datasets = GramVaaniDataSets(args.target_dir, wav_directory, csv)
    datasets.create()
    datasets.save()

    _logger.info("Finished GramVaani importer...")
"--filter_alphabet", help="Exclude samples with characters not in provided alphabet", ) parser.add_argument( "--normalize", action="store_true", help="Converts diacritic characters to their base ones", ) return parser.parse_args() if __name__ == "__main__": CLI_ARGS = handle_args() ALPHABET = Alphabet( CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None validate_label = get_validate_label(CLI_ARGS) def label_filter(label): if CLI_ARGS.normalize: label = (unicodedata.normalize("NFKD", label.strip()).encode( "ascii", "ignore").decode("ascii", "ignore")) label = validate_label(label) if ALPHABET and label: try: ALPHABET.encode(label) except KeyError: label = None return label _download_and_preprocess_data(target_dir=CLI_ARGS.target_dir)
if __name__ == "__main__": PARSER = get_importers_parser( description="Import XML from Conference Centre for Economics, France") PARSER.add_argument("target_dir", help="Destination directory") PARSER.add_argument( "--filter_alphabet", help="Exclude samples with characters not in provided alphabet") PARSER.add_argument( "--normalize", action="store_true", help="Converts diacritic characters to their base ones") PARAMS = PARSER.parse_args() validate_label = get_validate_label(PARAMS) ALPHABET = Alphabet( PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None def label_filter_fun(label): if PARAMS.normalize: label = unicodedata.normalize("NFKD", label.strip()) \ .encode("ascii", "ignore") \ .decode("ascii", "ignore") label = maybe_normalize(label) label = validate_label(label) if ALPHABET and label: try: ALPHABET.encode(label) except KeyError: label = None
# Punctuation characters to blank out of transcripts.
PUNCTUATIONS_REG = re.compile(r"[°\-,;!?.()\[\]*…—]")
# Runs of two or more whitespace characters, collapsed to one space.
MULTIPLE_SPACES_REG = re.compile(r"\s{2,}")


def cleanup_transcript(text, english_compatible=False):
    """Return a normalized, lowercased transcript.

    Replaces typographic apostrophes and non-breaking spaces, blanks out
    punctuation, collapses whitespace runs, and — when english_compatible
    is set — transliterates the result to plain ASCII via unidecode.
    """
    cleaned = text.replace("’", "'").replace("\u00A0", " ")
    cleaned = MULTIPLE_SPACES_REG.sub(" ", PUNCTUATIONS_REG.sub(" ", cleaned))
    if english_compatible:
        cleaned = unidecode.unidecode(cleaned)
    return cleaned.strip().lower()


def handle_args():
    """Build and parse the CLI arguments for the TrainingSpeech importer."""
    parser = get_importers_parser(
        description="Importer for TrainingSpeech dataset.")
    parser.add_argument(dest="target_dir")
    parser.add_argument(
        "--english-compatible",
        action="store_true",
        dest="english_compatible",
        help="Remove diactrics and other non-ascii chars.",
    )
    return parser.parse_args()


if __name__ == "__main__":
    cli_args = handle_args()
    validate_label = get_validate_label(cli_args)
    _download_and_preprocess_data(cli_args.target_dir,
                                  cli_args.english_compatible)
def init_worker(params):
    """Process-pool worker initializer.

    Builds the module-level FILTER_OBJ label filter from the CLI parameters
    so each worker can filter labels without re-receiving them per task.
    """
    global FILTER_OBJ  # pylint: disable=global-statement

    # Optional alphabet-based filtering; disabled when no alphabet supplied.
    alphabet = None
    if params.filter_alphabet:
        alphabet = Alphabet(params.filter_alphabet)

    FILTER_OBJ = LabelFilter(params.normalize, alphabet,
                             get_validate_label(params))
def test_get_validate_label(self):
    """A locale-specific validator module loads and passes a clean label through."""
    args = Namespace(
        validate_label_locale=from_here('test_data/validate_locale_fra.py'))
    validator = get_validate_label(args)
    self.assertEqual(validator('toto'), 'toto')
def test_get_validate_label_missing(self):
    """A non-existent validator file makes get_validate_label return None."""
    missing_path = from_here('test_data/validate_locale_ger.py')
    validator = get_validate_label(
        Namespace(validate_label_locale=missing_path))
    self.assertEqual(validator, None)
def test_validate_label_locale_default(self):
    """The default validator keeps plain letters and rejects digits/punctuation."""
    validator = get_validate_label(Namespace(validate_label_locale=None))
    for raw, expected in (('toto', 'toto'),
                          ('toto1234', None),
                          ('toto1234[{[{[]', None)):
        self.assertEqual(validator(raw), expected)