def available_weights(cls):
    """Return the available pretrained weights as a dict (name -> file)

    Returns
    -------
    weight_files : dict
        Maps each weights name (*BabelMulti*, *FisherMono* or
        *FisherTri*) to the path of its compressed numpy file (.npz
        format), built from the installed `shennong/share/bottleneck`
        directory. Only the files actually present on disk are
        listed.

    Raises
    ------
    RuntimeError
        If the directory `shennong/share/bottleneck` is not found, or
        if none of the weights files exists in it.

    """
    # resolve the installed location of shennong/share/bottleneck
    requirement = pkg_resources.Requirement.parse('shennong')
    directory = pkg_resources.resource_filename(
        requirement, 'shennong/share/bottleneck')
    if not os.path.isdir(directory):  # pragma: nocover
        raise RuntimeError(f'directory not found: {directory}')

    # weights name -> basename of the corresponding .npz file
    basenames = {
        'BabelMulti': 'Babel-ML17_FBANK_HL1500_SBN80_PhnStates3096',
        'FisherMono': 'FisherEnglish_FBANK_HL500_SBN80_PhnStates120',
        'FisherTri': 'FisherEnglish_FBANK_HL500_SBN80_triphones2423'}

    # split the expected files into those present and those missing
    found = {}
    missing = []
    for name, basename in basenames.items():
        path = os.path.join(directory, basename + '.npz')
        if os.path.isfile(path):
            found[name] = path
        else:
            missing.append(name)

    # nothing at all is an error, a partial set only deserves a warning
    if not found:  # pragma: nocover
        raise RuntimeError(f'no weights file found in {directory}')

    for name in missing:  # pragma: nocover
        get_logger('bottleneck', 'warning').warning(
            'weights file for "%s" is unavailable', name)

    return found
def load(cls, filename, serializer=None,
         log=get_logger('serializer', 'warning')):
    """Read a FeaturesCollection from the file `filename`

    Parameters
    ----------
    filename : str
        Path of the file to read.
    serializer : str, optional
        Name of the serializer used to read the file. When omitted,
        it is guessed from the extension of `filename`.
    log : logging.Logger, optional
        Destination of the log messages. Default to a logger named
        'serializer' with a 'warning' level.

    Returns
    -------
    features : :class:`~shennong.features.FeaturesCollection`
        The features read from `filename`.

    Raises
    ------
    IOError
        If `filename` cannot be read.
    ValueError
        If the `serializer` or the file extension is not supported,
        or if loading the features fails.

    """
    # the whole work is delegated to the serializer bound to the file
    handler = get_serializer(cls, filename, log, serializer)
    return handler.load()
def save(self, filename, serializer=None, with_properties=True,
         log=get_logger('serializer', 'warning'), **kwargs):
    """Write this FeaturesCollection to the file `filename`

    Parameters
    ----------
    filename : str
        Path of the file to write.
    serializer : str, optional
        Name of the serializer used to write the file. When omitted,
        it is guessed from the extension of `filename`.
    with_properties : bool, optional
        When False the features properties are not saved. Default
        to True.
    log : logging.Logger, optional
        Destination of the log messages. Default to a logger named
        'serializer' with a 'warning' level.
    compress : bool_or_str_or_int, optional
        Only valid for numpy (.npz), matlab (.mat) and h5features
        (.h5f) serializers. When True compress the file. Default to
        True.
    scp : bool, optional
        Only valid for kaldi (.ark) serializer. When True writes a
        .scp file along with the .ark file. Default to False.

    Raises
    ------
    IOError
        If the file `filename` already exists.
    ValueError
        If the `serializer` or the file extension is not supported,
        or if saving the features fails.

    """
    # extra keyword arguments (compress, scp...) are forwarded untouched
    handler = get_serializer(self.__class__, filename, log, serializer)
    handler.save(self, with_properties=with_properties, **kwargs)
def main():
    """Extract features for one corpus and save them as a h5features file

    Command-line entry point. Reads the utterances index
    ``<data_directory>/<corpus>.utts``, runs the extraction pipeline
    described by the YAML ``config_file`` and writes the result to
    ``<data_directory>/features/<corpus>_<config stem>.h5f``. With
    ``--do-vtln`` the warps are read from
    ``<data_directory>/<corpus>.warps`` and the output file name gets
    a ``_vtln`` suffix before the extension.

    Raises
    ------
    ValueError
        If the data directory, the configuration file or (with
        ``--do-vtln``) the warps file does not exist.

    """
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'data_directory', help='input/output data directory',
        type=pathlib.Path)
    parser.add_argument(
        'config_file', help='YAML configuration file', type=pathlib.Path)
    parser.add_argument(
        'corpus', choices=['english', 'xitsonga'], help='corpus to process')
    parser.add_argument(
        '--do-vtln', action='store_true',
        help='extract warped features from pre-trained VTLN')
    parser.add_argument(
        '-j', '--njobs', type=int, default=4, metavar='<int>',
        help='number of parallel jobs (default to %(default)s)')
    parser.add_argument(
        '-v', '--verbose', action='store_true', help='increase log level')
    args = parser.parse_args()

    # check and setup arguments
    data_directory = args.data_directory
    if not data_directory.is_dir():
        raise ValueError(f'directory not found: {data_directory}')

    config = args.config_file
    if not config.is_file():
        raise ValueError(f'file not found: {config}')

    warps = None
    if args.do_vtln:
        warps_file = data_directory / f'{args.corpus}.warps'
        if not warps_file.is_file():
            raise ValueError(f'file not found: {warps_file}')
        # one "<speaker> <warp>" pair per line
        with open(warps_file, 'r') as stream:
            warps = {
                spk: float(warp) for spk, warp in (
                    line.strip().split() for line in stream)}

    (data_directory / 'features').mkdir(exist_ok=True)

    log = get_logger('extraction', 'debug' if args.verbose else 'info')

    # load input utterances
    log.info('loading utterances...')
    with open(data_directory / f'{args.corpus}.utts', 'r') as stream:
        utterances = Utterances(
            [line.strip().split(' ') for line in stream])

    # extract the features
    features = pipeline.extract_features(
        config, utterances, warps=warps, njobs=args.njobs, log=log)

    # save them. The name is built in one go: calling replace() on a
    # pathlib.Path renames a file on disk, it is not a string substitution
    suffix = '_vtln' if args.do_vtln else ''
    h5f_file = (
        data_directory / 'features' /
        f'{args.corpus}_{config.stem}{suffix}.h5f')
    features.save(h5f_file)
def _check_environment(njobs, log=get_logger('pipeline', 'warning')):
    """Warn when several jobs are combined with implicit parallelism

    Does nothing when `njobs` is 1. Otherwise reads the environment
    variable OMP_NUM_THREADS and logs a warning unless it is set to
    exactly 1.

    Parameters
    ----------
    njobs : int
        The number of parallel jobs requested.
    log : logging.Logger, optional
        Where to send the warning. Default to a logger named
        'pipeline' with a 'warning' level.

    """
    if njobs == 1:
        return

    try:
        nthreads = int(os.environ['OMP_NUM_THREADS'])
    except (KeyError, ValueError):
        # variable unset, or set to something which is not an integer
        # (e.g. an empty string): this check is only advisory so it
        # must not crash, consider the number of threads as unknown
        nthreads = None

    if not nthreads or nthreads > 1:
        log.warning(
            'working on %s threads but implicit parallelism is active, '
            'this may slow down the processing. Set the environment variable '
            'OMP_NUM_THREADS=1 to disable this warning', njobs)
def __init__(self, config, utterances,
             log=get_logger('manager', 'warning')):
    """Initialize the manager from a configuration and utterances

    Stores the arguments, validates the utterances, scans the
    metadata of every audio file they reference, selects the
    features to extract and, when the configuration has a 'cmvn'
    entry, instanciates the CMVN processors (one per speaker or
    one per utterance).

    Parameters
    ----------
    config : dict
        The pipeline configuration.
    utterances : iterable
        The utterances to process, each one exposing the attributes
        `audio_file`, `speaker` and `name`.
    log : logging.Logger, optional
        Where to send log messages. Default to a logger named
        'manager' with a 'warning' level.

    """
    self._config = config
    self._utterances = utterances
    self._warps = {}
    self.log = log

    self._check_utterances()

    # the sample rate is needed to instanciate the processors, so
    # scan each distinct audio file once and keep its metadata
    self._audio_metadata = {}
    for audio_file in {utt.audio_file for utt in utterances}:
        log.debug('scanning %s', audio_file)
        self._audio_metadata[audio_file] = Audio.scan(audio_file)

    # reject audio files the pipeline cannot deal with
    log.info('scanning %s utterances...', len(self._utterances))
    self._check_audio_files()

    # the features type to be extracted is the first configuration
    # entry which is a valid features name
    candidates = [
        key for key in self.config.keys() if key in self.valid_features]
    self.features = candidates[0]

    # framing parameters are the same for all the processors, read
    # them from a features processor built on the first utterance
    first_utterance = next(iter(self.utterances))
    proc = self.get_features_processor(first_utterance)
    self.frame_length = proc.frame_length
    self.frame_shift = proc.frame_shift

    # CMVN processors are indexed by speaker when CMVN is done by
    # speaker, by utterance name otherwise
    if 'cmvn' in self.config:
        if self.config['cmvn']['by_speaker']:
            keys = set(utt.speaker for utt in self.utterances)
        else:
            keys = [utt.name for utt in self.utterances]

        self._cmvn_processors = {
            key: self.get_processor_class('cmvn')(proc.ndims)
            for key in keys}
def command_extract(args):
    """Execute the 'speech-features extract' command

    Validates the output and input files, loads the utterances,
    runs the features extraction pipeline and saves the result.
    Any validation failure is reported on the logger and the
    command returns without extracting anything.

    Parameters
    ----------
    args : argparse.Namespace
        The parsed command-line arguments. The attributes read here
        are `quiet`, `verbose`, `output_file`, `config`, `utterances`
        and `njobs`.

    """
    # setup the logger (level given by -q/-v arguments)
    if args.quiet:
        log = utils.null_logger()
    else:
        if args.verbose == 0:
            level = 'warning'
        elif args.verbose == 1:
            level = 'info'
        else:  # verbose >= 2
            level = 'debug'
        log = logger.get_logger(name='speech-features', level=level)

    # make sure the output file is not already existing and have a
    # valid extension
    output_file = args.output_file
    if os.path.exists(output_file):
        log.error('output file already exist: %s', output_file)
        return

    extensions = supported_extensions()
    output_ext = os.path.splitext(output_file)[1]
    if output_ext not in extensions:
        log.error(
            'output file has an unsupported extension "%s", must be in %s',
            output_ext, ", ".join(extensions))
        return

    # make sure the input config and utterances exists: report every
    # missing file, then give up instead of failing later on loading
    missing = [
        filename for filename in (args.config, args.utterances)
        if not os.path.exists(filename)]
    for filename in missing:
        log.error('input file not found: %s', filename)
    if missing:
        return

    # read the utterances file
    utterances = Utterances.load(args.utterances)

    # run the pipeline
    features = pipeline.extract_features(
        args.config, utterances, njobs=args.njobs, log=log)

    # save the features
    log.info('saving the features to %s', output_file)
    features.save(output_file)
def set_logger(self, level,
               formatter='%(levelname)s - %(name)s - %(message)s'):
    """Replace the processor's logger by one with the given settings

    The new logger is named after `self.name`.

    Parameters
    ----------
    level : str
        The minimum log level handled by the logger. Must be
        'debug', 'info', 'warning' or 'error'.
    formatter : str, optional
        The format of the log messages, see
        https://docs.python.org/3/library/logging.html#formatter-objects.
        The default displays level, name and message. Use
        '%(asctime)s - %(levelname)s - %(name)s - %(message)s' to
        display the time as well.

    """
    new_logger = get_logger(self.name, level=level, formatter=formatter)
    self._logger = new_logger
import shennong.pipeline as pipeline from shennong.logger import get_logger ENGLISH_ITEM = ('https://raw.githubusercontent.com/bootphon/ABXpy/' 'zerospeech2015/resources/english.item') XITSONGA_ITEM = ('https://raw.githubusercontent.com/bootphon/ABXpy/' 'zerospeech2015/resources/xitsonga.item') ENGLISH_FILES_LIST = ('https://raw.githubusercontent.com/bootphon/' 'Zerospeech2015/master/english_files.txt') XITSONGA_FILES_LIST = ('https://raw.githubusercontent.com/bootphon/' 'Zerospeech2015/master/xitsonga_files.txt') log = get_logger('data setup', 'info') def setup_data(data_directory, buckeye_directory, xitsonga_directory): """Setup a data directory with all input data required * creates the ``data_directory`` * make a symlink to ``buckeye_directory`` and ``xitsonga_directory`` in it * download the ABX item files for buckeye and xitsonga * create the list of utterances for both corpora * create the configuration files for features extraction """ # basic checks if not buckeye_directory.is_dir(): raise ValueError(f'directory does not exists: {buckeye_directory}')
def main():
    """Extract MFCC features on English with given VTLN warps

    Command-line entry point. Uses the pipeline configuration
    ``<data_directory>/config/mfcc_<conf>.yaml`` and the utterances
    index ``<data_directory>/english.utts``. The ``warps`` argument is
    either an existing file made of "<speaker> <warp>" lines, or the
    literal ``off`` to extract features without VTLN. The features
    are saved to the file given by ``-o/--output-file``, whose parent
    directory is created when needed.

    Raises
    ------
    ValueError
        If the data directory or the configuration file does not
        exist, if no output file is specified, or if ``warps`` is
        neither an existing file nor ``off``.

    """
    # parse input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'data_directory', type=pathlib.Path,
        help='input/output data directory')
    parser.add_argument(
        'conf', choices=['only', 'nocmvn', 'full'],
        help='pipeline configuration')
    parser.add_argument(
        'warps', type=pathlib.Path, help='VTLN warps to use')
    parser.add_argument(
        '-o', '--output-file', type=pathlib.Path, help='features file')
    parser.add_argument(
        '-j', '--njobs', type=int, default=4, metavar='<int>',
        help='number of parallel jobs (default to %(default)s)')
    parser.add_argument(
        '-v', '--verbose', action='store_true', help='increase log level')
    args = parser.parse_args()

    # check and setup arguments
    data_directory = args.data_directory
    if not data_directory.is_dir():
        raise ValueError(f'directory not found: {data_directory}')

    config = data_directory / 'config' / f'mfcc_{args.conf}.yaml'
    if not config.is_file():
        raise ValueError(f'file not found: {config}')

    # fail now rather than after the whole extraction has been done
    if args.output_file is None:
        raise ValueError('no output file specified (use -o/--output-file)')

    warps = None
    if args.warps.is_file():
        with open(args.warps, 'r') as stream:
            warps = {
                spk: float(warp) for spk, warp in (
                    line.strip().split() for line in stream)}
    elif str(args.warps) != 'off':
        # the only valid value which is not a file is 'off' (the case
        # without VTLN). Explicit raise because asserts are stripped
        # when running python -O
        raise ValueError(
            f"warps must be an existing file or 'off': {args.warps}")

    log = get_logger('extraction', 'debug' if args.verbose else 'info')

    # load input utterances
    log.info('loading utterances...')
    utterances = Utterances.load(data_directory / 'english.utts')

    # extract the features
    features = pipeline.extract_features(
        config, utterances, warps=warps, njobs=args.njobs, log=log)

    # save them
    args.output_file.parent.mkdir(exist_ok=True, parents=True)
    features.save(args.output_file)
def __init__(self):
    """Attach to the instance a logger named `self.name`, level 'info'"""
    logger_name = self.name
    self._logger = get_logger(logger_name, level='info')
def concatenate(self, other, tolerance=0,
                log=get_logger('features', 'info')):
    """Return a new Features made of this one followed by `other`

    The data of `other` is appended column-wise to the data of this
    instance. Both features must share the same `times`, once the
    longest one has been trimmed when a `tolerance` is given.

    Parameters
    ----------
    other : Features, shape = [nframes +/- tolerance, ndim2]
        The features to append at the end of this one.
    tolerance : int, optional
        Maximal difference allowed between the number of frames of
        the two features. Within that limit the trailing frames of
        the longest features are dropped, beyond it a ValueError is
        raised. This is useful when concatenating pitch with other
        'standard' features because pitch processing includes a
        downsampling which can alter the resulting number of frames
        (the same tolerance is applied in Kaldi, e.g. in
        paste-feats). Default to 0.
    log : logging.Logger, optional
        Where to send log messages.

    Returns
    -------
    features : Features, shape = [nframes +/- tolerance, ndim1 + ndim2]

    Raises
    ------
    ValueError
        If `other` cannot be concatenated: the difference in number
        of frames exceeds the tolerance, or the times are not equal.

    """
    nframes_self = self.nframes
    nframes_other = other.nframes

    # a frames mismatch is only accepted within the tolerance
    diff = abs(nframes_self - nframes_other)
    if diff:
        if not tolerance:
            raise ValueError('features have a different number of frames')

        if diff > tolerance:
            raise ValueError(
                f'features differs number of frames, and '
                f'greater than tolerance: '
                f'|{nframes_self} - {nframes_other}| > {tolerance}')

        log.warning(
            'features differs in number of frames, but '
            'within tolerance (|%s - %s| <= %s), trim the longest one',
            nframes_self, nframes_other, tolerance)

    data1, data2 = self.data, other.data
    times1, times2 = self.times, other.times

    # reaching this point with a non-zero diff means it is tolerated:
    # drop the extra trailing frames of the longest features
    if diff:
        if nframes_self > nframes_other:
            data1, times1 = data1[:-diff], times1[:-diff]
        else:
            data2, times2 = data2[:-diff], times2[:-diff]

    # the two features must share the same time axis
    if not np.allclose(times1, times2):
        raise ValueError('times are not equal')

    # merge the properties on copies so as to leave the inputs
    # untouched, the 'pipeline' entry being handled separately
    merged = copy.deepcopy(self.properties)
    appended = copy.deepcopy(other.properties)
    for key, value in appended.items():
        if key != 'pipeline':
            merged[key] = value

    if 'pipeline' not in merged:
        merged['pipeline'] = []

    if 'pipeline' in appended:
        for step in appended['pipeline']:
            # the columns of `other` come after the columns of self
            # in the concatenated data, shift them accordingly
            columns = step['columns']
            step['columns'] = [
                columns[0] + self.ndims, columns[1] + self.ndims]
            merged['pipeline'].append(step)

    stacked = np.hstack((data1, data2))
    return Features(stacked, times1, properties=merged)
def _init_config(config, log=get_logger('pipeline', 'warning')):
    """Load, validate and complete a pipeline configuration

    Parameters
    ----------
    config : dict or str or path-like
        The configuration, either as a dictionary, as the path to an
        existing YAML file or as a string formatted in YAML. When a
        dictionary is given it is completed in place.
    log : logging.Logger, optional
        Where to send log messages. Default to a logger named
        'pipeline' with a 'warning' level.

    Returns
    -------
    config : dict
        The validated configuration, where the missing entries
        cmvn/by_speaker (False), cmvn/with_vad (True) and
        pitch/postprocessing (empty) have been filled.

    Raises
    ------
    ValueError
        If the YAML cannot be parsed, if the configuration contains
        unknown keys, if it does not define one and only one features
        extraction, or if VTLN is requested on spectrogram or
        bottleneck features.

    """
    try:
        if os.path.isfile(config):
            log.debug('loading configuration from %s', config)
            with open(config, 'r') as stream:
                config = stream.read()
    except TypeError:
        # config is not path-like (e.g. already a dict)
        pass

    if isinstance(config, str):
        # the config is a string, try to load it as a YAML
        # NOTE(review): FullLoader is not meant for untrusted input,
        # consider yaml.safe_load if configurations can come from an
        # external source
        try:
            config = yaml.load(config, Loader=yaml.FullLoader)
        except yaml.YAMLError as err:
            raise ValueError(f'error in configuration: {err}') from err

    # ensure all the keys in config are known
    known_keys = list(PipelineManager.valid_processors) + ['pitch']
    unknown_keys = [k for k in config.keys() if k not in known_keys]
    if unknown_keys:
        raise ValueError('invalid keys in configuration: {}'.format(
            ', '.join(unknown_keys)))

    # ensure one and only one features processor is defined in the
    # configuration
    features = [k for k in config.keys() if k in valid_features()]
    if not features:
        raise ValueError(
            'the configuration does not define any features extraction '
            '(must have one and only one entry of {})'.format(
                ', '.join(valid_features())))
    if len(features) > 1:
        raise ValueError(
            'more than one features extraction processors are defined, '
            '(must have one and only one entry of {}): {}'.format(
                ', '.join(valid_features()), ', '.join(features)))

    if 'vtln' in config and features[0] in ('spectrogram', 'bottleneck'):
        raise ValueError(f'{features[0]} features do not support VTLN')

    if 'cmvn' in config:
        # force by_speaker to False if not existing
        if 'by_speaker' not in config['cmvn']:
            log.warning(
                'by_speaker option not specified for cmvn, '
                'assuming it is false and doing cmvn by utterance')
            config['cmvn']['by_speaker'] = False
        # force with_vad to True if not existing
        if 'with_vad' not in config['cmvn']:
            config['cmvn']['with_vad'] = True

    # on pitch, make sure we have a 'postprocessing' entry
    if 'pitch' in config and 'postprocessing' not in config['pitch']:
        config['pitch']['postprocessing'] = {}

    # log message describing the pipeline configuration
    msg = []
    if 'pitch' in config:
        msg.append(f'{config["pitch"]["processor"]} pitch')
    if 'delta' in config:
        msg.append('delta')
    if 'cmvn' in config:
        msg.append('cmvn by {}{}'.format(
            'speaker' if config['cmvn']['by_speaker'] else 'utterance',
            ' with vad' if config['cmvn']['with_vad'] else ''))
    if 'vtln' in config:
        msg.append('vtln by {}'.format(
            'speaker' if config['vtln']['by_speaker'] else 'utterance'))

    log.info(
        'pipeline configured for %s features extraction%s',
        features[0], ' with {}'.format(', '.join(msg)) if msg else '')

    return config
def extract_features(configuration, utterances, warps=None, njobs=1,
                     log=get_logger('pipeline', 'warning')):
    """Speech features extraction pipeline

    Applies the pipeline described by ``configuration`` to each of
    the ``utterances`` and returns the extracted features as an
    instance of
    :class:`~shennong.features.features.FeaturesCollection`. The
    computations are distributed over ``njobs`` parallel
    subprocesses.

    Parameters
    ----------
    configuration : dict or str
        The pipeline configuration, can be a dictionary, a path to a
        YAML file or a string formatted in YAML. To get a
        configuration example, see :func:`get_default_config`
    utterances : :class:`~shennong.utterances.Utterances`
        The utterances to extract the features on.
    warps : dict, optional
        Precomputed VTLN warps coefficients to apply on features, as
        a dict (str: float) indexed either by utterances speaker or
        name. The ``warps`` argument and a 'vtln' entry in the
        configuration must not be defined together.
    njobs : int, optional
        The number of subprocesses to execute in parallel, use a
        single process by default.
    log : logging.Logger
        A logger to display messages during pipeline execution

    Returns
    -------
    features : :class:`~shennong.features.features.FeaturesCollection`
        The extracted speech features

    Raises
    ------
    ValueError
        If the ``configuration`` or the ``utterances`` are invalid,
        if both the ``warps`` argument and the 'vtln' entry in
        configuration are defined or if something goes wrong during
        features extraction.

    """
    # normalize the number of jobs and the configuration before
    # anything else, so that errors are detected early
    njobs = get_njobs(njobs, log=log)
    config = _init_config(configuration, log=log)

    utterances_format = utterances.format(type=str)
    log.info(
        'detected format for utterances index is: %s', utterances_format)

    # when warps are given, make sure they are valid (not overloading
    # 'vtln' in config, defined either by speaker or by utterance) and
    # express them by utterance
    if warps:
        warps = _init_warps(warps, config, utterances, log)

    # check the OMP_NUM_THREADS variable for parallel computations
    _check_environment(njobs, log=log)

    # all is ready, do the actual computations
    features = _extract_features(
        config, utterances, warps, njobs=njobs, log=log)
    return features