Example #1
    def to_json(self, fname=None, override=False):
        ''' Parses the entire dataset to a list of dictionaries containing at
        least the following keys:
            `input`: path to the audio file
            `duration`: length of the audio
            `label`: transcription of the audio
        '''
        fname = fname or os.path.join(self.default_output_dir, 'data.json')

        # Remove any existing file only when the caller explicitly overrides.
        if os.path.exists(fname) and override:
            os.remove(fname)

        # Ensure the output directory exists before writing.
        if not os.path.isdir(os.path.split(fname)[0]):
            safe_mkdirs(os.path.split(fname)[0])

        data = self._to_ld()

        with codecs.open(fname, 'w', encoding='utf8') as f:
            json.dump(data, f)

        self._logger.info(self._report(ld2dl(data)))

        return fname
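
The snippet below is a minimal sketch of consuming the manifest that `to_json` writes; it assumes the default output name `data.json` and the key layout described in the docstring above.

import codecs
import json

# Load the manifest written by to_json(); 'data.json' is only the default name.
with codecs.open('data.json', 'r', encoding='utf8') as f:
    data = json.load(f)

for entry in data[:3]:
    # Each entry holds the audio path, its duration, and the transcription.
    print(entry['input'], entry['duration'], entry['label'])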
Example #2
    def to_h5(self, fname=None, input_parser=None, label_parser=None,
              split_sets=True, override=False):
        ''' Generates an HDF5 file for the dataset.
        Note that this function stores the computed features rather than the
        path to the audio file.

        Args
            split_sets: if True and the dataset is split into several sets
            (e.g. train, valid, test), the h5 file will contain the
            corresponding groups; otherwise no separate group is created
        '''

        fname = fname or os.path.join(self.default_output_dir, 'data.h5')

        # Remove any existing file only when the caller explicitly overrides.
        if h5py.is_hdf5(fname) and override:
            os.remove(fname)

        # Ensure the output directory exists before writing.
        if not os.path.isdir(os.path.split(fname)[0]):
            safe_mkdirs(os.path.split(fname)[0])

        data = self._to_ld(label_parser=label_parser)

        if len(data) == 0:
            raise IndexError("Data is empty")

        datasets = ['/']
        if split_sets and 'dataset' in data[0]:
            datasets = list(set([d['dataset'] for d in data]))

        self._logger.info('Opening %s', fname)
        with h5py.File(fname, 'a') as f:

            # create all datasets
            for dataset in datasets:

                group = f['/']
                if dataset != '/':
                    group = f.create_group(dataset)

                inputs = group.create_dataset(
                    'inputs', (0,), maxshape=(None,),
                    dtype=h5py.special_dtype(vlen=np.dtype('float32')))

                if input_parser is not None and input_parser.num_feats:
                    inputs.attrs['num_feats'] = input_parser.num_feats

                group.create_dataset(
                    'labels', (0,), maxshape=(None,),
                    dtype=h5py.special_dtype(vlen=str))

                group.create_dataset(
                    'durations', (0,), maxshape=(None,))

            for i, d in enumerate(data):

                # Route the sample to its set's group, or to the root group
                # when the data was not split into sets.
                dataset = '/'
                if '/' not in datasets:
                    dataset = d['dataset']

                # HDF5 pointers
                inputs = f[dataset]['inputs']
                labels = f[dataset]['labels']
                durations = f[dataset]['durations']

                # Data: the parser must return a numpy array, since the
                # features are flattened before being stored below.
                input_ = d['input']
                if input_parser is not None:
                    input_ = input_parser(input_)

                label = d['label']
                duration = d['duration']

                inputs.resize(inputs.shape[0] + 1, axis=0)
                inputs[inputs.shape[0] - 1] = input_.flatten().astype('float32')

                labels.resize(labels.shape[0] + 1, axis=0)
                labels[labels.shape[0] - 1] = label.encode('utf8')

                durations.resize(durations.shape[0] + 1, axis=0)
                durations[durations.shape[0] - 1] = duration

                # Flush to disk and log progress every 128 samples
                if i % 128 == 0:
                    self._logger.info('%d/%d done.' % (i, len(data)))
                    f.flush()

            f.flush()
            self._logger.info('%d/%d done.' % (len(data), len(data)))

            return fname
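
For reference, here is a hedged sketch of reading the generated file back with h5py; it assumes the dataset was split into sets, so each set lives in its own group with `inputs`, `labels`, and `durations` members, and `num_feats` is only present when the input parser reported it.

import h5py
import numpy as np

# Walk the file produced by to_h5(); 'data.h5' is only the default name.
with h5py.File('data.h5', 'r') as f:
    for name, group in f.items():  # one group per set, e.g. train/valid/test
        inputs = group['inputs']
        num_feats = inputs.attrs.get('num_feats')  # set only if the parser reported it
        print(name, len(inputs), 'samples')
        if num_feats:
            # Rows were stored flattened; restore the (time, features) shape.
            first = np.asarray(inputs[0]).reshape(-1, num_feats)
            print('first sample shape:', first.shape)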
Example #3
from __future__ import absolute_import, division, print_function

from utils.generic_utils import safe_mkdirs
import os

# Cache directory at the repository root (one level above this package).
DT_ABSPATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))), '.datasets')
safe_mkdirs(DT_ABSPATH)

# Imported only after DT_ABSPATH is defined, since the parsers may rely on it
# at import time.
from datasets.dataset_parser import DatasetParser
from datasets.sid import Sid
from datasets.lapsbm import LapsBM
from datasets.voxforge import VoxForge
from datasets.cslu import CSLU
from datasets.dummy import Dummy
from datasets.brsd import BRSD

# English datasets
from datasets.cvc import CVC
from datasets.librispeech import LibriSpeech
from datasets.voxforge_en import VoxForge_En
from datasets.tedlium2 import TedLium2
from datasets.vctk import VCTK
from datasets.tatoeba import Tatoeba

from datasets.ensd import ENSD
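
A hypothetical end-to-end sketch tying the package together; `LapsBM`'s constructor arguments are an assumption (none are shown here), and `dummy_parser` merely stands in for a real feature extractor such as a spectrogram transform.

import numpy as np
from datasets import LapsBM

def dummy_parser(path):
    # Hypothetical feature extractor; a real parser would compute features
    # (e.g. a spectrogram) from the audio file at `path`.
    return np.zeros((10, 3), dtype='float32')
dummy_parser.num_feats = 3  # to_h5() reads this attribute when present

dataset = LapsBM()  # constructor arguments, if any, are an assumption
json_path = dataset.to_json(override=True)
h5_path = dataset.to_h5(input_parser=dummy_parser, split_sets=True,
                        override=True)
print(json_path, h5_path)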