def handle_args():
    """Build the LinguaLibre importer command line and parse it.

    The parser comes from ``get_importers_parser`` and is extended with:

    * ``target_dir`` (positional)
    * ``--qId`` (int, required): LinguaLibre language qId
    * ``--iso639-3`` (str, required): ISO639-3 language code
    * ``--english-name`` (str, required): English name of the language
    * ``--filter_alphabet``: alphabet used to exclude samples
    * ``--normalize`` (flag): convert diacritic characters to base ones
    * ``--bogus-records``: text file, opened for reading by argparse

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    parser = get_importers_parser(
        description=
        'Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details.'
    )
    parser.add_argument(dest='target_dir')
    parser.add_argument('--qId',
                        type=int,
                        required=True,
                        help='LinguaLibre language qId')
    parser.add_argument('--iso639-3',
                        type=str,
                        required=True,
                        help='ISO639-3 language code')
    # Help text previously read "Enligh name of the language" (typo).
    parser.add_argument('--english-name',
                        type=str,
                        required=True,
                        help='English name of the language')
    parser.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    parser.add_argument(
        '--normalize',
        action='store_true',
        help='Converts diacritic characters to their base ones')
    # argparse opens the file itself; the parsed value is a readable file
    # object (or None when the option is omitted).
    parser.add_argument(
        '--bogus-records',
        type=argparse.FileType('r'),
        required=False,
        help=
        'Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items'
    )
    return parser.parse_args()
예제 #2
0
def handle_args():
    """Build the M-AILABS importer command line and parse it.

    Adds a positional ``target_dir`` plus the ``--filter_alphabet``,
    ``--normalize``, ``--skiplist`` and (required) ``--language`` options to
    the parser obtained from ``get_importers_parser``.

    Returns:
        The namespace produced by ``parse_args()``.
    """
    arg_parser = get_importers_parser(
        description='Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/.'
    )
    arg_parser.add_argument(dest='target_dir')
    arg_parser.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet',
    )
    arg_parser.add_argument(
        '--normalize',
        help='Converts diacritic characters to their base ones',
        action='store_true',
    )
    arg_parser.add_argument(
        '--skiplist',
        help='Directories / books to skip, comma separated',
        default='',
        type=str,
    )
    arg_parser.add_argument(
        '--language',
        help='Dataset language to use',
        type=str,
        required=True,
    )
    parsed = arg_parser.parse_args()
    return parsed
예제 #3
0
def handle_args():
    """Build the TrainingSpeech importer command line and parse it.

    Adds a positional ``target_dir`` and the ``--english-compatible`` flag
    (stored as ``english_compatible``) to the parser obtained from
    ``get_importers_parser``.

    Returns:
        The namespace produced by ``parser.parse_args()``.
    """
    parser = get_importers_parser(
        description='Importer for TrainingSpeech dataset.')
    parser.add_argument(dest='target_dir')
    # Help text previously misspelled "diacritics" as "diactrics".
    parser.add_argument('--english-compatible',
                        action='store_true',
                        dest='english_compatible',
                        help='Remove diacritics and other non-ascii chars.')
    return parser.parse_args()
예제 #4
0
def handle_args():
    """Build the African Accented French importer command line and parse it.

    Adds a positional ``target_dir`` and the ``--filter_alphabet`` and
    ``--normalize`` options to the parser obtained from
    ``get_importers_parser``.

    Returns:
        The namespace produced by ``parse_args()``.
    """
    about = 'Importer for African Accented French dataset. More information on http://www.openslr.org/57/.'
    arg_parser = get_importers_parser(description=about)

    arg_parser.add_argument(dest='target_dir')
    arg_parser.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet',
    )
    arg_parser.add_argument(
        '--normalize',
        help='Converts diacritic characters to their base ones',
        action='store_true',
    )

    parsed = arg_parser.parse_args()
    return parsed
예제 #5
0
def main():
    """Parse the command line and hand the archive to ``preprocess_data``.

    Takes a positional ``tgz_file`` and an optional ``--target_dir``. When no
    target directory is given (or it is empty), the directory part of
    ``tgz_file`` is used instead.
    """
    # https://www.openslr.org/62/
    arg_parser = get_importers_parser(
        description='Import aidatatang_200zh corpus')
    arg_parser.add_argument('tgz_file', help='Path to aidatatang_200zh.tgz')
    arg_parser.add_argument(
        '--target_dir',
        help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.',
        default='',
    )
    cli = arg_parser.parse_args()

    # Fall back to the archive's own folder when --target_dir is empty.
    destination = cli.target_dir or os.path.dirname(cli.tgz_file)
    preprocess_data(cli.tgz_file, destination)
예제 #6
0
def parse_args(args):
    """Turn a list of command line strings into a parsed namespace.

    Args:
      args ([str]): Command line parameters as list of strings
    Returns:
      :obj:`argparse.Namespace`: command line parameters namespace
    """
    cli = get_importers_parser(
        description="Imports GramVaani data for Deep Speech")
    cli.add_argument(
        "--version",
        version="GramVaaniImporter {ver}".format(ver=__version__),
        action="version",
    )

    # Both verbosity switches store a constant into the same ``loglevel``
    # destination.
    verbosity_switches = (
        (("-v", "--verbose"), logging.INFO, "set loglevel to INFO"),
        (("-vv", "--very-verbose"), logging.DEBUG, "set loglevel to DEBUG"),
    )
    for flags, level, description in verbosity_switches:
        cli.add_argument(
            *flags,
            dest="loglevel",
            action="store_const",
            const=level,
            required=False,
            help=description,
        )

    # Mandatory options; each is stored under the name given as ``dest``.
    mandatory_options = (
        (("-c", "--csv_filename"), "csv_filename",
         "Path to the GramVaani csv"),
        (("-t", "--target_dir"), "target_dir",
         "Directory in which to save the importer GramVaani data"),
    )
    for flags, destination, description in mandatory_options:
        cli.add_argument(
            *flags,
            dest=destination,
            required=True,
            help=description,
        )

    return cli.parse_args(args)
예제 #7
0
def main():
    """Parse the command line and hand the archive to ``preprocess_data``.

    Takes a positional ``tgz_file`` and an optional ``--target_dir``. When no
    target directory is given (or it is empty), the directory part of
    ``tgz_file`` is used instead.
    """
    # https://www.openslr.org/47/
    arg_parser = get_importers_parser(
        description='Import Primewords Chinese corpus set 1')
    arg_parser.add_argument(
        'tgz_file',
        help='Path to primewords_md_2018_set1.tar.gz',
    )
    arg_parser.add_argument(
        '--target_dir',
        help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.',
        default='',
    )
    cli = arg_parser.parse_args()

    # Fall back to the archive's own folder when --target_dir is empty.
    destination = cli.target_dir or os.path.dirname(cli.tgz_file)
    preprocess_data(cli.tgz_file, destination)
def main():
    """Parse the command line and hand the archive to ``preprocess_data``.

    Takes a positional ``tgz_file`` and an optional ``--target_dir``. When no
    target directory is given (or it is empty), the directory part of
    ``tgz_file`` is used instead.
    """
    # https://www.openslr.org/38/
    arg_parser = get_importers_parser(
        description='Import Free ST Chinese Mandarin corpus')
    arg_parser.add_argument(
        'tgz_file',
        help='Path to ST-CMDS-20170001_1-OS.tar.gz',
    )
    arg_parser.add_argument(
        '--target_dir',
        help='Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.',
        default='',
    )
    cli = arg_parser.parse_args()

    # Fall back to the archive's own folder when --target_dir is empty.
    destination = cli.target_dir or os.path.dirname(cli.tgz_file)
    preprocess_data(cli.tgz_file, destination)
예제 #9
0
def main():
    """Parse the command line and hand the archive folder to ``preprocess_data``.

    Takes a positional ``folder_with_archives`` and an optional
    ``--target_dir``. When no target directory is given (or it is empty), a
    ``magicdata`` sub-folder of ``folder_with_archives`` is used instead.
    """
    # https://openslr.org/68/
    arg_parser = get_importers_parser(description='Import MAGICDATA corpus')
    arg_parser.add_argument(
        'folder_with_archives',
        help='Path to folder containing magicdata_{train,dev,test}.tar.gz',
    )
    arg_parser.add_argument(
        '--target_dir',
        help='Target folder to extract files into and put the resulting CSVs. Defaults to a folder called magicdata next to the archives',
        default='',
    )
    cli = arg_parser.parse_args()

    archives_dir = cli.folder_with_archives
    # Fall back to "<archives>/magicdata" when --target_dir is empty.
    destination = cli.target_dir or os.path.join(archives_dir, 'magicdata')
    preprocess_data(archives_dir, destination)
예제 #10
0
    print_import_report(counter, SAMPLE_RATE, MAX_SECS)


def _maybe_convert_wav(mp3_filename, wav_filename):
    """Convert ``mp3_filename`` into ``wav_filename`` unless the latter exists.

    The conversion is done with a ``sox.Transformer`` configured for
    ``SAMPLE_RATE``. A ``sox.core.SoxError`` raised while building the output
    is swallowed, so this function never propagates that error.
    """
    if path.exists(wav_filename):
        # Output already present: skip the conversion entirely.
        return

    converter = sox.Transformer()
    converter.convert(samplerate=SAMPLE_RATE)
    try:
        converter.build(mp3_filename, wav_filename)
    except sox.core.SoxError:
        # Conversion failures are deliberately ignored here.
        # NOTE(review): presumably callers detect the missing output file
        # and account for the failed sample - confirm.
        pass


if __name__ == "__main__":
    PARSER = get_importers_parser(
        description='Import CommonVoice v2.0 corpora')
    PARSER.add_argument('tsv_dir', help='Directory containing tsv files')
    PARSER.add_argument(
        '--audio_dir',
        help=
        'Directory containing the audio clips - defaults to "<tsv_dir>/clips"')
    PARSER.add_argument(
        '--filter_alphabet',
        help='Exclude samples with characters not in provided alphabet')
    PARSER.add_argument(
        '--normalize',
        action='store_true',
        help='Converts diacritic characters to their base ones')
    PARSER.add_argument('--space_after_every_character',
                        action='store_true',
                        help='To help transcript join by white space')