Пример #1
0
 def is_valid(self, njobs=utils.default_njobs()):
     """Tell whether the corpus passes validation

     Calls self.validate() with the given `njobs`. Returns False if
     that call raises IOError, True if it completes. Any other
     exception propagates to the caller.

     """
     try:
         self.validate(njobs=njobs)
     except IOError:
         valid = False
     else:
         valid = True
     return valid
Пример #2
0
    def add_parser(cls, subparsers, name=None):
        """Return the (parser, dir_group) pair extended with Kaldi options

        The basic parser is obtained from the add_parser() of the
        class following AbstractKaldiCommand in the method resolution
        order, then the --recipe and -j/--njobs options are added to
        it. When `name` is None, cls.name is used instead.

        """
        command_name = cls.name if name is None else name

        # the basic parser initialization is delegated to the parent class
        parent = super(AbstractKaldiCommand, cls)
        parser, dir_group = parent.add_parser(subparsers, command_name)

        # --recipe option
        recipe_help = """
            put the Kaldi recipe in <output_dir>/recipe, by default the recipe
            is deleted"""
        parser.add_argument(
            '--recipe', action='store_true', help=recipe_help)

        # --njobs option
        njobs_help = """
            number of jobs for parallel computation, because Kaldi
            used to run jobs per speakers, the number of jobs is
            min(<njobs>, corpus.nspeakers). Default is to launch
            %(default)s jobs."""
        parser.add_argument(
            '-j', '--njobs',
            type=int,
            metavar='<njobs>',
            default=utils.default_njobs(),
            help=njobs_help)

        return parser, dir_group
Пример #3
0
    def validate(self, njobs=utils.default_njobs()):
        """Validate the speech corpus data

        The work is delegated to the CorpusValidation class, built
        from this corpus, `njobs` and self.log. IOError is raised on
        the first encountered error.

        """
        validator = CorpusValidation(self, njobs=njobs, log=self.log)
        validator.validate()
Пример #4
0
    def add_parser(cls, subparsers):
        """Build and return the default parser for corpus preparation

        The subparser is named after cls.preparator.name and described
        by the dedented cls.long_description(). The input directory is
        a positional argument when the preparator has no default input
        directory, else it is the -i/--input-dir option. The
        --copy-wavs option is added only when the preparator audio
        format is 'wav'.

        """
        preparator = cls.preparator

        parser = subparsers.add_parser(preparator.name)
        parser.formatter_class = argparse.RawDescriptionHelpFormatter
        parser.description = textwrap.dedent(cls.long_description())

        # input and output directories share a dedicated help group
        directories = parser.add_argument_group('directories')

        input_default = preparator.default_input_dir()
        if input_default is not None:
            # a default exists: the input directory becomes optional
            directories.add_argument(
                '-i', '--input-dir',
                metavar='<input-dir>',
                default=input_default,
                help=('root directory of the raw corpus distribution, '
                      'default is %(default)s'))
        else:
            # no default: the input directory is mandatory
            directories.add_argument(
                'input_dir',
                metavar='<input-dir>',
                help='root directory of the raw corpus distribution')

        output_help = (
            'the prepared corpus is created in '
            '<output-dir>/data, if not specified use {}.'
        ).format(cls.default_output_dir())
        directories.add_argument(
            '-o', '--output-dir',
            metavar='<output-dir>',
            default=None,
            help=output_help)

        parser.add_argument(
            '-v', '--verbose',
            action='store_true',
            help='display more messages to stdout')

        force_help = (
            'if specified, overwrite the output directory '
            '<output-dir>/data. If not specified but the directory exists, '
            'the program detects desired wav files already present and '
            'do not convert them again.')
        parser.add_argument(
            '-f', '--force', action='store_true', help=force_help)

        njobs_help = (
            'number of jobs to launch when doing parallel '
            'computations (mainly for wav conversion). '
            'Default is to launch %(default)s jobs.')
        parser.add_argument(
            '-j', '--njobs',
            type=int,
            default=utils.default_njobs(),
            metavar='<njobs>',
            help=njobs_help)

        short_utts_help = (
            'utterances shorter than 0.1 second are removed by defaults, '
            "as they won't be accepted by Kaldi for feature extraction. "
            "Use this option to keep those short utterances in the corpus.")
        parser.add_argument(
            '--keep-short-utts', action='store_true', help=short_utts_help)

        if preparator.audio_format == 'wav':
            copy_help = (
                'the audio files of this corpus are already in wav. '
                'By default abkhazia will import them as symbolic links, '
                'use this option to force copy')
            parser.add_argument(
                '--copy-wavs', action='store_true', help=copy_help)

        return parser
Пример #5
0
    def __init__(self, input_dir, log=utils.logger.null_logger()):
        """Initialize the instance from the `input_dir` directory

        Stores the default number of local jobs and the logger, checks
        that `input_dir` is an existing directory, then creates an
        empty abkhazia corpus whose meta source is the absolute input
        directory and whose meta name is self.name.

        Raise IOError if `input_dir` is not an existing directory.

        """
        self.njobs = utils.default_njobs(local=True)
        self.log = log

        # the input directory must exist, it is stored as an absolute path
        input_exists = os.path.isdir(input_dir)
        if not input_exists:
            message = 'input directory does not exist:\n{}'.format(input_dir)
            raise IOError(message)
        self.input_dir = os.path.abspath(input_dir)

        # the output corpus starts empty, only its metadata is filled
        self.corpus = abkhazia.corpus.Corpus()
        meta = self.corpus.meta
        meta.source = self.input_dir
        meta.name = self.name
Пример #6
0
    def __init__(self, corpus, output_dir, log=utils.logger.null_logger()):
        """Initialize the recipe for `corpus` in `output_dir`

        After the parent initialization, stores the default number of
        jobs and the corpus, fills self.meta from the corpus metadata,
        creates `output_dir` and its 'recipe' subdirectory when they
        do not exist, and builds the Abkhazia2Kaldi converter.

        NOTE(review): self.meta is presumably set up by the parent
        __init__ -- confirm.

        """
        super(AbstractRecipe, self).__init__(log=log)
        self.njobs = utils.default_njobs()
        self.corpus = corpus

        # describe this recipe from the metadata of the corpus
        corpus_meta = self.corpus.meta
        self.meta.source = 'corpus = {}'.format(corpus_meta.source)
        self.meta.name = self.name + ' on corpus ' + self.corpus.meta.name

        # the output directory is created on demand
        output_exists = os.path.isdir(output_dir)
        if not output_exists:
            os.makedirs(output_dir)
        self.output_dir = os.path.abspath(output_dir)

        # the recipe directory is a subdirectory of the output directory
        recipe_dir = os.path.join(self.output_dir, 'recipe')
        self.recipe_dir = recipe_dir
        if not os.path.isdir(recipe_dir):
            os.makedirs(recipe_dir)

        # if True, delete the recipe_dir on instance destruction
        self.delete_recipe = True

        # converter from the abkhazia corpus to the Kaldi recipe directory
        self.a2k = Abkhazia2Kaldi(
            self.corpus, self.recipe_dir, name=self.name, log=self.log)
Пример #7
0
 def __init__(self, corpus, njobs=default_njobs(),
              log=logger.null_logger()):
     """Store the corpus, the number of jobs and the logger"""
     self.corpus, self.njobs, self.log = corpus, njobs, log