Пример #1
0
def handle_args():
    """Declare the LinguaLibre importer's command-line options and parse them.

    Returns:
        The result of ``parser.parse_args()``, which is called without an
        explicit argument list.
    """
    # get_importers_parser is a project helper; it is used here through the
    # add_argument / parse_args interface of an argparse-style parser.
    parser = get_importers_parser(
        description="Importer for LinguaLibre dataset. Check https://lingualibre.fr/wiki/Help:Download_from_LinguaLibre for details."
    )
    add = parser.add_argument

    # Single positional argument, stored as ``target_dir``.
    add(dest="target_dir")

    # Mandatory options identifying the language being imported.
    required_options = (
        ("--qId", int, "LinguaLibre language qId"),
        ("--iso639-3", str, "ISO639-3 language code"),
        ("--english-name", str, "English name of the language"),
    )
    for flag, value_type, help_text in required_options:
        add(flag, type=value_type, required=True, help=help_text)

    # Optional transcript filtering / normalisation switches.
    add(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )
    add(
        "--normalize",
        action="store_true",
        help="Converts diacritic characters to their base ones",
    )

    # argparse.FileType("r") hands the value over as a file opened for reading.
    add(
        "--bogus-records",
        type=argparse.FileType("r"),
        required=False,
        help="Text file listing well-known bogus record to skip from importing, from https://lingualibre.fr/wiki/LinguaLibre:Misleading_items",
    )

    return parser.parse_args()
Пример #2
0
def handle_args():
    """Declare the TrainingSpeech importer's command-line options and parse them.

    Options defined here:
        target_dir: single positional argument.
        --english-compatible: boolean switch stored as ``english_compatible``.

    Returns:
        The result of ``parser.parse_args()``, which is called without an
        explicit argument list.
    """
    # get_importers_parser is a project helper; it is used here through the
    # add_argument / parse_args interface of an argparse-style parser.
    parser = get_importers_parser(description="Importer for TrainingSpeech dataset.")
    parser.add_argument(dest="target_dir")
    parser.add_argument(
        "--english-compatible",
        action="store_true",
        dest="english_compatible",
        # Help text previously misspelled "diacritics" as "diactrics".
        help="Remove diacritics and other non-ascii chars.",
    )
    return parser.parse_args()
Пример #3
0
def main():
    """Parse the command line and run the aidatatang_200zh preprocessing step.

    Takes the archive path as a positional argument and an optional
    ``--target_dir``; when the latter is empty or omitted, the directory
    portion of the archive path is used instead.
    """
    # Corpus page: https://www.openslr.org/62/
    cli = get_importers_parser(description="Import aidatatang_200zh corpus")
    cli.add_argument("tgz_file", help="Path to aidatatang_200zh.tgz")
    cli.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
    )
    options = cli.parse_args()

    archive_path = options.tgz_file
    # A falsy --target_dir falls back to the archive's own directory.
    destination = options.target_dir or os.path.dirname(archive_path)

    preprocess_data(archive_path, destination)
Пример #4
0
def parse_args(args):
    """Parse the GramVaani importer's command line.

    Args:
      args ([str]): Command line parameters as list of strings
    Returns:
      The namespace produced by ``parser.parse_args(args)``.
    """
    # get_importers_parser is a project helper; it is used here through the
    # add_argument / parse_args interface of an argparse-style parser.
    parser = get_importers_parser(
        description="Imports GramVaani data for Deep Speech"
    )

    parser.add_argument(
        "--version",
        action="version",
        version="GramVaaniImporter {}".format(__version__),
    )

    # Both verbosity switches write a logging level constant to ``loglevel``.
    verbosity_switches = (
        ("-v", "--verbose", "set loglevel to INFO", logging.INFO),
        ("-vv", "--very-verbose", "set loglevel to DEBUG", logging.DEBUG),
    )
    for short_flag, long_flag, help_text, level in verbosity_switches:
        parser.add_argument(
            short_flag,
            long_flag,
            action="store_const",
            required=False,
            help=help_text,
            dest="loglevel",
            const=level,
        )

    # Mandatory path options; each is stored under the name given as ``dest``.
    required_paths = (
        ("-c", "--csv_filename", "Path to the GramVaani csv", "csv_filename"),
        (
            "-t",
            "--target_dir",
            "Directory in which to save the importer GramVaani data",
            "target_dir",
        ),
    )
    for short_flag, long_flag, help_text, destination in required_paths:
        parser.add_argument(
            short_flag,
            long_flag,
            required=True,
            help=help_text,
            dest=destination,
        )

    return parser.parse_args(args)
Пример #5
0
def main():
    """Parse the command line and run the Free ST Chinese Mandarin preprocessing.

    The archive path is a positional argument. ``--target_dir`` is optional
    and, when left empty, is replaced by the directory portion of the archive
    path before ``preprocess_data`` is called.
    """
    # Corpus page: https://www.openslr.org/38/
    arg_parser = get_importers_parser(
        description="Import Free ST Chinese Mandarin corpus"
    )
    arg_parser.add_argument(
        "tgz_file", help="Path to ST-CMDS-20170001_1-OS.tar.gz"
    )
    arg_parser.add_argument(
        "--target_dir",
        default="",
        help="Target folder to extract files into and put the resulting CSVs. Defaults to same folder as the main archive.",
    )
    parsed = arg_parser.parse_args()

    tgz_path = parsed.tgz_file
    if parsed.target_dir:
        output_dir = parsed.target_dir
    else:
        # No explicit target: use the directory part of the archive path.
        output_dir = os.path.dirname(tgz_path)

    preprocess_data(tgz_path, output_dir)
Пример #6
0
def handle_args():
    """Declare the M-AILABS importer's command-line options and parse them.

    Returns:
        The result of ``parser.parse_args()``, which is called without an
        explicit argument list.
    """
    # get_importers_parser is a project helper; it is used here through the
    # add_argument / parse_args interface of an argparse-style parser.
    parser = get_importers_parser(
        description="Importer for M-AILABS dataset. https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/."
    )

    # (flags, keyword arguments) pairs, registered in this exact order.
    argument_specs = [
        ((), {"dest": "target_dir"}),
        (
            ("--filter_alphabet",),
            {"help": "Exclude samples with characters not in provided alphabet"},
        ),
        (
            ("--normalize",),
            {
                "action": "store_true",
                "help": "Converts diacritic characters to their base ones",
            },
        ),
        (
            ("--skiplist",),
            {
                "type": str,
                "default": "",
                "help": "Directories / books to skip, comma separated",
            },
        ),
        (
            ("--language",),
            {"required": True, "type": str, "help": "Dataset language to use"},
        ),
    ]
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()
Пример #7
0
def parse_args():
    """Declare the CommonVoice v2.0 importer's command-line options and parse them.

    Returns:
        The result of ``parser.parse_args()``, which is called without an
        explicit argument list.
    """
    # get_importers_parser is a project helper; it is used here through the
    # add_argument / parse_args interface of an argparse-style parser.
    parser = get_importers_parser(description="Import CommonVoice v2.0 corpora")
    add_option = parser.add_argument

    # Input locations.
    add_option("tsv_dir", help="Directory containing tsv files")
    add_option(
        "--audio_dir",
        help='Directory containing the audio clips - defaults to "<tsv_dir>/clips"',
    )

    # Transcript filtering.
    add_option(
        "--filter_alphabet",
        help="Exclude samples with characters not in provided alphabet",
    )

    # Boolean switches, all off unless given on the command line.
    boolean_switches = (
        ("--normalize", "Converts diacritic characters to their base ones"),
        ("--space_after_every_character", "To help transcript join by white space"),
    )
    for flag, help_text in boolean_switches:
        add_option(flag, action="store_true", help=help_text)

    return parser.parse_args()