def __init__(self, path='datasets/Nottingham',
             source='http://www-etud.iro.umontreal.ca/~boulanni/Nottingham.zip',
             train_filter='.*train.*', valid_filter='.*valid.*',
             test_filter='.*test.*'):
    super(Nottingham, self).__init__(path=path, source=source,
                                     train_filter=train_filter,
                                     valid_filter=valid_filter,
                                     test_filter=test_filter)
    # read each MIDI file into a piano-roll array (MIDI notes 21-108,
    # 0.3s timesteps) for the train/valid/test splits
    train_datasets = [
        midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(config.floatX)
        for f in find_files(self.path, train_filter)
    ]
    valid_datasets = [
        midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(config.floatX)
        for f in find_files(self.path, valid_filter)
    ]
    test_datasets = [
        midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(config.floatX)
        for f in find_files(self.path, test_filter)
    ]

    # concatenate the per-file piano rolls into one array per split;
    # this is an unsupervised dataset, so targets are None
    self.train_inputs = numpy.concatenate(train_datasets)
    self.train_targets = None
    self.valid_inputs = numpy.concatenate(valid_datasets)
    self.valid_targets = None
    self.test_inputs = numpy.concatenate(test_datasets)
    self.test_targets = None
def __init__(self, path='datasets/MuseData',
             source='http://www-etud.iro.umontreal.ca/~boulanni/MuseData.zip',
             train_filter='.*train.*', valid_filter='.*valid.*',
             test_filter='.*test.*'):
    super(MuseData, self).__init__(path=path, source=source,
                                   train_filter=train_filter,
                                   valid_filter=valid_filter,
                                   test_filter=test_filter)
    # read each MIDI file into a piano-roll array (MIDI notes 21-108,
    # 0.3s timesteps) for the train/valid/test splits
    train_datasets = [
        midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
        for f in find_files(self.path, train_filter)
    ]
    valid_datasets = [
        midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
        for f in find_files(self.path, valid_filter)
    ]
    test_datasets = [
        midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
        for f in find_files(self.path, test_filter)
    ]

    # concatenate the per-file piano rolls into one array per split;
    # this is an unsupervised dataset, so targets are None
    self.train_inputs = numpy.concatenate(train_datasets)
    self.train_targets = None
    self.valid_inputs = numpy.concatenate(valid_datasets)
    self.valid_targets = None
    self.test_inputs = numpy.concatenate(test_datasets)
    self.test_targets = None
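# The constructors above lean on a find_files(path, regex) helper that is not
# shown here. A minimal stand-in, under the assumption that it walks the
# directory tree and yields paths matching the filter regex (the name and
# matching-on-full-path behavior are inferred from how it is called):
import os
import re

def find_files(path, filter_regex):
    # walk the tree rooted at path and yield file paths matching the regex
    pattern = re.compile(filter_regex)
    for root, _dirs, names in os.walk(path):
        for name in names:
            fpath = os.path.join(root, name)
            if pattern.match(fpath):
                yield fpath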
def __iter__(self):
    for fname in files.find_files(self.path, self.filter):
        if self.preprocess is not None and callable(self.preprocess):
            fname = self.preprocess(fname)
        fnames = raise_to_list(fname)
        for name in fnames:
            yield name
def __iter__(self):
    for fname in find_files(self.path, self.filter):
        if self.preprocess is not None and callable(self.preprocess):
            fname = self.preprocess(fname)
        fnames = raise_to_list(fname)
        for name in fnames:
            yield name
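# These iterators also rely on raise_to_list to normalize the output of the
# preprocess callable, which may return one item or several. A plausible
# sketch of that helper (the real one lives in the library's utils, so this
# is an assumption about its behavior):
def raise_to_list(value):
    # wrap a bare value in a list; pass lists/tuples through as lists
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]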
def __iter__(self):
    for fname in files.find_files(self.path, self.filter):
        try:
            with Image.open(fname) as im:
                data = numpy.array(im)
                if self.preprocess is not None and callable(self.preprocess):
                    data = self.preprocess(data)
                data = raise_to_list(data)
                for d in data:
                    yield d
        except Exception as err:
            log.exception(str(err))
def __iter__(self):
    idx = 0
    for fname in files.find_files(self.path, self.filter):
        with open(fname, 'r') as f:
            for line in f:
                if self.preprocess is not None:
                    line = self.preprocess(line)
                line = raise_to_list(line)
                for token in line:
                    if idx >= self.n_future:
                        yield token
                    else:
                        idx += 1
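# The token iterator above drops the first n_future tokens, which produces a
# stream shifted forward in time; that shifted stream can then serve as
# prediction targets for an unshifted copy of the same stream. A
# self-contained illustration of the pairing (names are hypothetical):
from itertools import islice, tee

def input_target_pairs(tokens, n_future=1):
    # pair each token with the token n_future steps ahead
    inputs, targets = tee(tokens)
    return zip(inputs, islice(targets, n_future, None))

# list(input_target_pairs("abcd", 1)) -> [('a', 'b'), ('b', 'c'), ('c', 'd')]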
def __iter__(self):
    for fname in find_files(self.path, self.filter):
        try:
            with Image.open(fname) as im:
                data = numpy.array(im)
                if self.preprocess is not None and callable(self.preprocess):
                    data = self.preprocess(data)
                data = raise_to_list(data)
                for d in data:
                    yield d
        except Exception as err:
            _log.exception(str(err))
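# A function-based sketch of the same image-streaming pattern, assuming only
# PIL and numpy; handy for testing a preprocess callable in isolation before
# wiring it into the class above:
import numpy
from PIL import Image

def image_stream(fnames, preprocess=None):
    # yield one numpy array per image file, optionally preprocessed
    for fname in fnames:
        with Image.open(fname) as im:
            data = numpy.array(im)
        if preprocess is not None and callable(preprocess):
            data = preprocess(data)
        yield data

# e.g. normalize uint8 pixels to [0, 1] floats:
# stream = image_stream(paths, preprocess=lambda a: a.astype('float32') / 255.0)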
def __iter__(self):
    idx = 0
    for fname in files.find_files(self.path, self.filter):
        try:
            with open(fname, 'r') as f:
                for line in f:
                    if self.preprocess is not None and callable(self.preprocess):
                        line = self.preprocess(line)
                    line = raise_to_list(line)
                    for token in line:
                        if idx >= self.n_future:
                            yield token
                        else:
                            idx += 1
        except Exception as err:
            log.exception(str(err))
def __iter__(self):
    idx = 0
    for fname in find_files(self.path, self.filter):
        try:
            with open(fname, 'r') as f:
                for line in f:
                    if self.preprocess is not None and callable(self.preprocess):
                        line = self.preprocess(line)
                    line = raise_to_list(line)
                    for token in line:
                        if idx >= self.n_future:
                            yield token
                        else:
                            idx += 1
        except Exception as err:
            _log.exception(str(err))
def __init__(self, path=DEFAULT_TEDLIUM_DATASET_PATH, window_duration=0.01,
             skip_count=1, max_speeches=None):
    """Initialize the Dataset with a given storage for TEDLIUM.

    path -- target path for the TEDLIUM data storage
    window_duration -- duration of the audio window in seconds
    skip_count -- step size across the segments in the repo; use a value
        above 1 to run on a very small subset of the dataset when doing
        testing iterations. This allows you to test an "epoch" across a
        small subset of the 40GB data file.
    max_speeches -- optional cap on the number of speeches loaded per split
    """
    # round the per-window sample count up to the next power of two
    # (assuming 16kHz audio)
    self.window_size = 2 ** int(math.ceil(math.log(int(window_duration * 16000), 2)))
    source_filename = path + '.tar.gz'
    if not os.path.exists(path):
        if os.path.exists(source_filename):
            # Note: this could, in theory, overwrite anything on disk, as the
            # Python tarfile module doesn't prevent writing outside the root
            # directory (according to its docs).
            file_ops.untar(source_filename, destination_dir=os.path.dirname(path))
    if not os.path.exists(path):
        raise RuntimeError(
            "You need to download the TEDLIUM corpus (v2) from %(url)s "
            "and save it to %(path)s" % {
                'url': LIUM_BASE + TEDLIUM_DOWNLOAD_URL,
                'path': source_filename,
            }
        )
    path = os.path.realpath(path)

    log.info("Searching for speeches")
    self.train_speeches = [
        tedlium.Speech(sph, window_size=self.window_size)
        for sph in file_ops.find_files(path, '.*[/]train[/]sph[/].*[.]sph')
    ]
    if max_speeches:
        self.train_speeches = self.train_speeches[:max_speeches]
    self.test_speeches = [
        tedlium.Speech(sph, window_size=self.window_size)
        for sph in file_ops.find_files(path, '.*[/]test[/]sph[/].*[.]sph')
    ]
    if max_speeches:
        self.test_speeches = self.test_speeches[:max_speeches]
    self.valid_speeches = [
        tedlium.Speech(sph, window_size=self.window_size)
        for sph in file_ops.find_files(path, '.*[/]dev[/]sph[/].*[.]sph')
    ]
    if max_speeches:
        self.valid_speeches = self.valid_speeches[:max_speeches]

    log.info(
        "Creating speech segments (utterance records using 1/%s of the utterances)",
        skip_count,
    )
    train_inputs, train_targets = inputs_and_targets(self.train_speeches)
    valid_inputs, valid_targets = inputs_and_targets(self.valid_speeches)
    test_inputs, test_targets = inputs_and_targets(self.test_speeches)

    log.info("Initializing the OpenDeep dataset")
    super(TEDLIUMDataset, self).__init__(
        train_inputs=train_inputs, train_targets=train_targets,
        valid_inputs=valid_inputs, valid_targets=valid_targets,
        test_inputs=test_inputs, test_targets=test_targets,
    )
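# The window_size formula rounds the per-window sample count up to the next
# power of two (an FFT-friendly size), assuming 16 kHz audio. Worked example
# for the default window_duration of 0.01s:
import math

samples = int(0.01 * 16000)                        # 160 samples per window
window_size = 2 ** int(math.ceil(math.log(samples, 2)))
assert window_size == 256                          # next power of two above 160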