def to_json(self, fname=None, override=False):
    ''' Parse the entire dataset to a list of dictionaries containing at
    least the following keys:
        `input`: path to audio file
        `duration`: length of the audio
        `label`: transcription of the audio
    '''
    fname = fname or os.path.join(self.default_output_dir, 'data.json')

    if os.path.exists(fname) and override:
        os.remove(fname)

    if not os.path.isdir(os.path.split(fname)[0]):
        safe_mkdirs(os.path.split(fname)[0])

    data = self._to_ld()

    with codecs.open(fname, 'w', encoding='utf8') as f:
        json.dump(data, f)

    self._logger.info(self._report(ld2dl(data)))
    return fname
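
# Hedged usage sketch for `to_json`. `LapsBM` is one of the parsers exported
# by the `datasets` package below; the `dt_dir` keyword argument is an
# assumption made for illustration, not necessarily the real constructor
# signature.
if __name__ == '__main__':
    import codecs
    import json

    from datasets import LapsBM

    dataset = LapsBM(dt_dir='/path/to/lapsbm')   # hypothetical constructor args
    json_path = dataset.to_json()                # writes data.json, returns its path

    # Each entry holds at least `input`, `duration` and `label`
    with codecs.open(json_path, 'r', encoding='utf8') as f:
        samples = json.load(f)
    print(samples[0]['input'], samples[0]['duration'], samples[0]['label'])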
def to_h5(self, fname=None, input_parser=None, label_parser=None,
          split_sets=True, override=False):
    ''' Generates an HDF5 file for the dataset.

    Note that this function will calculate the features (via `input_parser`)
    rather than store the path to the audio file.

    Args:
        fname: output path; defaults to `data.h5` in `self.default_output_dir`.
        input_parser: callable mapping an audio file path to a feature array;
            its `num_feats` attribute, if set, is stored on the `inputs`
            dataset.
        label_parser: forwarded to `self._to_ld` to transform the labels.
        split_sets: if True and the dataset is split in several sets
            (e.g. train, valid, test) the h5 file will create the
            corresponding groups; otherwise no per-set group is created.
        override: if True, an existing HDF5 file at `fname` is removed first.
    '''
    fname = fname or os.path.join(self.default_output_dir, 'data.h5')

    if h5py.is_hdf5(fname) and override:
        os.remove(fname)

    if not os.path.isdir(os.path.split(fname)[0]):
        safe_mkdirs(os.path.split(fname)[0])

    feat_name = str(input_parser)

    data = self._to_ld(label_parser=label_parser)

    if len(data) == 0:
        raise IndexError("Data is empty")

    datasets = ['/']
    if 'dataset' in data[0]:
        datasets = list(set([d['dataset'] for d in data]))

    self._logger.info('Opening %s', fname)
    with h5py.File(fname, 'a') as f:
        # Create the `inputs`, `labels` and `durations` datasets for every
        # group (one group per split, or only the root group)
        for dataset in datasets:
            group = f['/']
            if dataset != '/':
                group = f.create_group(dataset)

            inputs = group.create_dataset(
                'inputs', (0,), maxshape=(None,),
                dtype=h5py.special_dtype(vlen=np.dtype('float32')))

            if input_parser is not None and input_parser.num_feats:
                inputs.attrs['num_feats'] = input_parser.num_feats

            group.create_dataset(
                'labels', (0,), maxshape=(None,),
                dtype=h5py.special_dtype(vlen=unicode))

            group.create_dataset('durations', (0,), maxshape=(None,))

        for i, d in enumerate(data):
            # Default to the root group; use the sample's own split when the
            # file was created with one group per set
            dataset = '/'
            if dataset not in datasets:
                dataset = d['dataset']

            # HDF5 pointers
            inputs = f[dataset]['inputs']
            labels = f[dataset]['labels']
            durations = f[dataset]['durations']

            # Data
            input_ = d['input']
            if input_parser is not None:
                input_ = input_parser(input_)

            label = d['label']
            duration = d['duration']

            inputs.resize(inputs.shape[0] + 1, axis=0)
            inputs[inputs.shape[0] - 1] = input_.flatten().astype('float32')

            labels.resize(labels.shape[0] + 1, axis=0)
            labels[labels.shape[0] - 1] = label.encode('utf8')

            durations.resize(durations.shape[0] + 1, axis=0)
            durations[durations.shape[0] - 1] = duration

            # Flush to disk every 128 samples
            if i % 128 == 0:
                self._logger.info('%d/%d done.' % (i, len(data)))
                f.flush()

        f.flush()
        self._logger.info('%d/%d done.' % (len(data), len(data)))

    return fname
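
# Hedged usage sketch for `to_h5`: write the feature file and read one sample
# back. `FakeFeatureParser` is a stand-in defined here only for illustration;
# the real project presumably ships its own input parsers, and the `dt_dir`
# constructor argument is likewise an assumption.
if __name__ == '__main__':
    import h5py
    import numpy as np

    from datasets import VoxForge

    class FakeFeatureParser(object):
        ''' Hypothetical feature extractor: returns a fixed-size random matrix
        instead of real acoustic features, just to exercise the HDF5 layout. '''
        num_feats = 13

        def __call__(self, path):
            # A real parser would load `path` and compute e.g. MFCCs here
            return np.random.rand(100, self.num_feats)

    dataset = VoxForge(dt_dir='/path/to/voxforge')  # hypothetical constructor args
    h5_path = dataset.to_h5(input_parser=FakeFeatureParser(), override=True)

    with h5py.File(h5_path, 'r') as f:
        # Split datasets produce groups such as /train, /valid, /test;
        # otherwise everything lives in the root group
        group = f['train'] if 'train' in f else f['/']
        num_feats = group['inputs'].attrs['num_feats']
        first = group['inputs'][0].reshape((-1, num_feats))
        print(first.shape, group['durations'][0], group['labels'][0])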
from __future__ import absolute_import, division, print_function

import os

from utils.generic_utils import safe_mkdirs

# Parsed datasets are cached in a `.datasets` directory located one level
# above this package
DT_ABSPATH = os.path.join(
    os.path.sep.join(
        os.path.dirname(os.path.abspath(__file__)).split(os.path.sep)[:-1]),
    '.datasets')

safe_mkdirs(DT_ABSPATH)

from datasets.dataset_parser import DatasetParser

from datasets.sid import Sid
from datasets.lapsbm import LapsBM
from datasets.voxforge import VoxForge
from datasets.cslu import CSLU
from datasets.dummy import Dummy
from datasets.brsd import BRSD

# English datasets
from datasets.cvc import CVC
from datasets.librispeech import LibriSpeech
from datasets.voxforge_en import VoxForge_En
from datasets.tedlium2 import TedLium2
from datasets.vctk import VCTK
from datasets.tatoeba import Tatoeba
from datasets.ensd import ENSD
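
# Hedged sketch of how the package above might be used from user code: pick
# one of the exported parsers and dump it to JSON. Whether the parsers can be
# constructed without arguments is an assumption made here for illustration.
if __name__ == '__main__':
    from datasets import LapsBM, DT_ABSPATH

    print('Parsed datasets are cached under %s' % DT_ABSPATH)

    lapsbm = LapsBM()            # hypothetical no-argument construction
    print(lapsbm.to_json())      # path to data.json with input/duration/label entries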