def command_extract(args):
    """Extract features from an utterances index and save them to a file.

    Reads the following attributes from `args` (a parsed command-line
    namespace): `quiet`, `verbose`, `output_file`, `config`,
    `utts_index` and `njobs`.

    The function logs an error and returns early (without raising) when
    the output file already exists, when its extension is not one of
    `supported_extensions()`, or when one of the input files (`config`,
    `utts_index`) does not exist.

    Returns None in all cases.

    """
    # setup the logger (level given by -q/-v arguments)
    if args.quiet:
        log = utils.null_logger()
    else:
        if args.verbose == 0:
            level = 'warning'
        elif args.verbose == 1:
            level = 'info'
        else:  # verbose >= 2
            level = 'debug'
        log = utils.get_logger(name='speech-features', level=level)

    # forward the initialized log to shennong
    utils._logger = log

    # make sure the output file is not already existing and have a
    # valid extension
    output_file = args.output_file
    if os.path.exists(output_file):
        log.error('output file already exist: %s', output_file)
        return

    output_ext = os.path.splitext(output_file)[1]
    if output_ext not in supported_extensions().keys():
        log.error(
            'output file has an unsupported extension "%s", must be in %s',
            output_ext, ", ".join(supported_extensions().keys()))
        return

    # make sure the input config and utterances index exist. Report
    # every missing file, then abort: going further would only crash
    # when opening the index or running the pipeline.
    missing = [
        filename for filename in (args.config, args.utts_index)
        if not os.path.exists(filename)]
    for filename in missing:
        log.error('input file not found: %s', filename)
    if missing:
        return

    # read the utterances file as a list of lists, ignore empty lines
    # in the file. The context manager guarantees the file is closed
    # once the index is loaded.
    with open(args.utts_index, 'r') as index_file:
        stripped = (line.strip() for line in index_file)
        utterances = [line.split(' ') for line in stripped if line]

    # run the pipeline
    features = pipeline.extract_features(
        args.config, utterances, njobs=args.njobs, log=log)

    # save the features
    log.info('saving the features to %s', output_file)
    features.save(output_file)
def main():
    """Time MFCC extraction and analyze each supported serializer.

    Parses the `data_dir` and optional `output_dir` (default '/tmp')
    command-line arguments, loads every '.wav' file listed in
    `data_dir`, computes MFCC features on them, then calls
    `analyze_serializer` once per supported file extension and hands
    the collected results to `print_results`.

    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('data_dir', help='input directory with wavs')
    parser.add_argument(
        'output_dir',
        default='/tmp',
        nargs='?',
        help='output directory (created files are deleted at exit)')
    args = parser.parse_args()

    # load each wav file, indexed by its basename
    audio_data = {}
    for wav_path in list_files_with_extension(args.data_dir, '.wav'):
        audio_data[os.path.basename(wav_path)] = Audio.load(wav_path)

    # cumulated duration of the loaded audio, truncated to whole seconds
    nseconds = int(sum(audio.duration for audio in audio_data.values()))
    total_duration = datetime.timedelta(seconds=nseconds)
    summary = 'found {} wav files, total duration of {}'.format(
        len(audio_data), str(total_duration))
    print(summary)

    # compute the features (default MFCC); the timed section includes
    # the instanciation of the processor
    print('computing MFCC features...')
    start = datetime.datetime.now()
    processor = MfccProcessor()
    computed = {
        name: processor.process(audio)
        for name, audio in audio_data.items()}
    features = FeaturesCollection(**computed)
    stop = datetime.datetime.now()
    print('took {}'.format(stop - start))

    # save the features in all the supported formats
    per_extension = {}
    for ext in supported_extensions().keys():
        per_extension[ext] = analyze_serializer(
            features, ext, args.output_dir)

    data = {'duration': total_duration, 'data': per_extension}
    print_results(data)
for v in (True, False)]) def test_cmvn(utterances_index, by_speaker, with_vad): config = pipeline.get_default_config('mfcc', with_cmvn=True, with_pitch=False, with_delta=False) config['cmvn']['by_speaker'] = by_speaker config['cmvn']['with_vad'] = with_vad feats = pipeline.extract_features(config, utterances_index) feat2 = feats[utterances_index[0][0]] assert feat2.is_valid() assert feat2.shape[0] == 140 assert feat2.shape[1] == 13 @pytest.mark.parametrize('ext', supported_extensions().keys()) def test_extract_features_full(ext, wav_file, wav_file_8k, wav_file_float32, capsys, tmpdir): # difficult case with parallel jobs, different sampling rates, # speakers and segments index = [('u1', wav_file, 's1', 0, 1), ('u2', wav_file_float32, 's2', 1, 1.2), ('u3', wav_file_8k, 's1', 1, 3)] config = pipeline.get_default_config('mfcc') # disable VAD because it can alter the cmvn result (far from (0, # 1) when the signal includes non-voiced frames) config['cmvn']['with_vad'] = False feats = pipeline.extract_features(config, index,
def test_get_serializer_byext(ext):
    """Check the serializer returned for a filename ending with `ext`.

    Requests a serializer for a FeaturesCollection and the file name
    'foo' + `ext`, then verifies that the request did not leave such a
    file on disk and that the returned object is an instance of the
    class registered for `ext` in `serializers.supported_extensions()`.

    """
    filename = 'foo' + ext

    # the third argument is None as in the original call; presumably it
    # lets the serializer be guessed from the extension -- TODO confirm
    serializer = serializers.get_serializer(
        FeaturesCollection, filename, None)

    # asking for a serializer must not create the file
    assert not os.path.isfile(filename)

    expected_class = serializers.supported_extensions()[ext]
    assert isinstance(serializer, expected_class)