def main(args):
    """Run the full GramVaani import pipeline.

    Parses CLI arguments, then chains the four stages: load the metadata
    CSV, download the mp3 clips, convert them to wav, and build/save the
    train/dev/test datasets.

    Args:
        args ([str]): command line parameter list
    """
    args = parse_args(args)
    # Called for its argument validation side effects as well as the label fn.
    validate_label = get_validate_label(args)
    setup_logging(args.loglevel)
    _logger.info("Starting GramVaani importer...")

    # Stage 1: read the metadata CSV describing the corpus.
    _logger.info("Starting loading GramVaani csv...")
    metadata_csv = GramVaaniCSV(args.csv_filename)

    # Stage 2: fetch the audio referenced by the CSV.
    _logger.info("Starting downloading GramVaani mp3's...")
    mp3_downloader = GramVaaniDownloader(metadata_csv, args.target_dir)
    mp3_directory = mp3_downloader.download()

    # Stage 3: transcode mp3 -> wav for training.
    _logger.info("Starting converting GramVaani mp3's to wav's...")
    wav_converter = GramVaaniConverter(args.target_dir, mp3_directory)
    wav_directory = wav_converter.convert()

    # Stage 4: assemble the dataset splits and persist them to disk.
    dataset_builder = GramVaaniDataSets(args.target_dir, wav_directory, metadata_csv)
    dataset_builder.create()
    dataset_builder.save()
    _logger.info("Finished GramVaani importer...")
# Characters stripped from transcripts (replaced by a space).
PUNCTUATIONS_REG = re.compile(r"[°\-,;!?.()\[\]*…—]")
# Any run of two or more whitespace characters.
MULTIPLE_SPACES_REG = re.compile(r'\s{2,}')


def cleanup_transcript(text, english_compatible=False):
    """Normalize a transcript for training.

    Replaces curly apostrophes and non-breaking spaces, strips punctuation,
    collapses whitespace runs, and lower-cases the result.

    Args:
        text (str): raw transcript text.
        english_compatible (bool): when True, transliterate to plain ASCII
            with unidecode (drops diacritics and other non-ascii chars).

    Returns:
        str: the cleaned, stripped, lower-cased transcript.
    """
    normalized = text.replace('’', "'")
    normalized = normalized.replace('\u00A0', ' ')
    normalized = PUNCTUATIONS_REG.sub(' ', normalized)
    normalized = MULTIPLE_SPACES_REG.sub(' ', normalized)
    if english_compatible:
        normalized = unidecode.unidecode(normalized)
    return normalized.strip().lower()


def handle_args():
    """Build and parse the command line for the TrainingSpeech importer."""
    parser = get_importers_parser(
        description='Importer for TrainingSpeech dataset.')
    parser.add_argument(dest='target_dir')
    parser.add_argument(
        '--english-compatible',
        action='store_true',
        dest='english_compatible',
        help='Remove diactrics and other non-ascii chars.')
    return parser.parse_args()


if __name__ == "__main__":
    cli_args = handle_args()
    validate_label = get_validate_label(cli_args)
    _download_and_preprocess_data(cli_args.target_dir,
                                  cli_args.english_compatible)
'--audio_dir', help= 'Directory containing the audio clips - defaults to "<tsv_dir>/clips"') PARSER.add_argument( '--filter_alphabet', help='Exclude samples with characters not in provided alphabet') PARSER.add_argument( '--normalize', action='store_true', help='Converts diacritic characters to their base ones') PARSER.add_argument('--space_after_every_character', action='store_true', help='To help transcript join by white space') PARAMS = PARSER.parse_args() validate_label = get_validate_label(PARAMS) AUDIO_DIR = PARAMS.audio_dir if PARAMS.audio_dir else os.path.join( PARAMS.tsv_dir, 'clips') ALPHABET = Alphabet( PARAMS.filter_alphabet) if PARAMS.filter_alphabet else None def label_filter_fun(label): if PARAMS.normalize: label = unicodedata.normalize("NFKD", label.strip()) \ .encode("ascii", "ignore") \ .decode("ascii", "ignore") label = validate_label(label) if ALPHABET and label: try: ALPHABET.encode(label)
help='Converts diacritic characters to their base ones') parser.add_argument( '--bogus-records', type=argparse.FileType('r'), required=False, help= 'Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items' ) return parser.parse_args() if __name__ == "__main__": CLI_ARGS = handle_args() ALPHABET = Alphabet( CLI_ARGS.filter_alphabet) if CLI_ARGS.filter_alphabet else None validate_label = get_validate_label(CLI_ARGS) bogus_regexes = [] if CLI_ARGS.bogus_records: for line in CLI_ARGS.bogus_records: bogus_regexes.append(re.compile(line.strip())) def record_filter(path): if any(regex.match(path) for regex in bogus_regexes): print('Reject', path) return False return True def label_filter(label): if CLI_ARGS.normalize: label = unicodedata.normalize("NFKD", label.strip()) \