def _check_running_feature_extraction(feat_dir, n_files):
    """Tell whether feature extraction has to be (re-)run for `feat_dir`.

    Parameters
    ----------
    feat_dir : str
        directory that is expected to contain the extracted features
    n_files : int
        number of entries the stored indices are expected to hold

    Returns
    -------
    bool
        True means the extraction needs to run, which is the case when

        * `feat_dir` does not exist,
        * the ``indices_<FEATURE_NAME>`` file inside it does not exist,
        * the indices cannot be opened or counted (the error is printed),
        * the number of stored indices differs from `n_files`.

        False is returned only when the stored indices hold exactly
        `n_files` entries.
    """
    if not os.path.exists(feat_dir):
        return True
    indices_path = os.path.join(feat_dir, 'indices_%s' % FEATURE_NAME)
    if not os.path.exists(indices_path):
        return True
    try:
        indices = F.MmapDict(path=indices_path, read_only=True)
        try:
            n_indices = len(indices)
        finally:
            # always release the handle, also when counting the entries fails
            indices.close()
    except Exception as e:
        # deliberate best-effort: any failure while reading the indices is
        # reported and answered with "run the extraction again"
        import traceback
        traceback.print_exc()
        print("Loading indices error: '%s'" % str(e), "at:", indices_path)
        return True
    return bool(n_indices != n_files)
def test_feeders(self):
    """Write a synthetic dataset to a temporary directory and read it back
    through `F.Feeder`, checking the produced batches in three setups:

    1. no recipes: every value of ``X`` is yielded and the row count matches;
    2. `Name2Trans` + `CreateBatch`: same checks, plus the summed second
       batch element must equal 99000;
    3. `TransLoader` + `CreateBatch`: every row is paired with the label
       that was stored for it in ``transcription.dict``.
    """
    with utils.TemporaryDirectory() as temppath:
        np.random.seed(1208251813)
        ds_path = os.path.join(temppath, 'ds')
        # str(row values) -> label written for that row
        row_to_label = {}
        # ====== create fake dataset ====== #
        ds = F.Dataset(ds_path)
        ds['X'] = np.arange(0, 10000).reshape(-1, 5)
        # fake indices: one [name, start, end] entry per 20 rows
        indices = [
            ['name_%d' % idx, begin, begin + 20]
            for idx, begin in enumerate(range(0, ds['X'].shape[0], 20))
        ]
        np.savetxt(os.path.join(ds.path, 'indices.csv'), indices,
                   fmt='%s', delimiter=' ')
        # fake transcription: 20 random labels in [0, 4) per entry
        transcription = F.MmapDict(
            os.path.join(ds.path, 'transcription.dict'))
        for name, start, end in indices:
            labels = np.random.randint(0, 4, size=(20, )).tolist()
            transcription[name] = labels
            for row, label in zip(ds['X'][start:end], labels):
                row_to_label[str(row.tolist())] = label
        transcription.flush()
        transcription.close()
        ds.flush()
        ds.close()
        # ====== test feeder ====== #
        ds = F.Dataset(ds_path, read_only=True)
        expected_values = ds['X'][:].ravel().tolist()
        feeder = F.Feeder(ds['X'], ds['indices.csv'],
                          ncpu=2, buffer_size=2)
        # (seed, shuffle_level) pairs, in the order they are exercised
        shuffle_setups = ((None, 0), (1203, 0), (1203, 2))

        # ==================== No recipes ==================== #
        def check_plain_batches(batches):
            values = []
            n_rows = 0
            for batch in batches:
                values.extend(batch.ravel().tolist())
                n_rows += batch.shape[0]
            self.assertEqual(np.sort(values).tolist(), expected_values)
            self.assertEqual(n_rows, ds['X'].shape[0])

        for seed, level in shuffle_setups:
            check_plain_batches(
                feeder.set_batch(12, seed=seed, shuffle_level=level))

        # ==================== Convert name to indices ==================== #
        feeder.set_recipes([
            F.recipes.Name2Trans(
                converter_func=lambda name: int(name.split('_')[-1])),
            F.recipes.CreateBatch()
        ])

        def check_named_batches(batches):
            values = []
            label_total = 0
            n_rows = 0
            for batch, batch_labels in batches:
                values.extend(batch.ravel().tolist())
                n_rows += batch.shape[0]
                label_total += np.sum(batch_labels)
            self.assertEqual(np.sort(values).tolist(), expected_values)
            self.assertEqual(label_total, 99000)
            self.assertEqual(n_rows, ds['X'].shape[0])

        for seed, level in shuffle_setups:
            check_named_batches(
                feeder.set_batch(12, seed=seed, shuffle_level=level))

        # ==================== Transcription ==================== #
        del feeder
        ds = F.Dataset(ds_path)
        feeder = F.Feeder(ds['X'], indices=ds['indices.csv'],
                          ncpu=2, buffer_size=2)
        feeder.set_recipes([
            F.recipes.TransLoader(ds['transcription.dict'], dtype='int32'),
            F.recipes.CreateBatch()
        ])
        seen_values = []
        n_rows = 0
        batches = feeder.set_batch(12, seed=1208251813, shuffle_level=2)
        for batch, batch_labels in batches:
            seen_values.extend(batch.ravel().tolist())
            n_rows += batch.shape[0]
            for row, label in zip(batch, batch_labels):
                self.assertTrue(row_to_label[str(row.tolist())] == label)
        self.assertEqual(np.sort(seen_values).tolist(), expected_values)
        self.assertEqual(n_rows, ds['X'].shape[0])
import numpy as np

import os
# The ODIN variable is set before the `odin` imports below; keep this order
# (presumably the configuration is read when `odin` is imported -- confirm).
os.environ['ODIN'] = 'float32,cpu,theano,seed=12'
from odin import backend as K
from odin import nnet as N
from odin import fuel
from odin.utils import one_hot, UnitTimer

# Test dataset plus the alignment and index files stored next to it
ds = fuel.load_mspec_test()
transcription_path = os.path.join(ds.path, 'alignment.dict')
indices_path = os.path.join(ds.path, 'indices.csv')
# rows are read as strings and unpacked as (name, start, end) further down
indices = np.genfromtxt(indices_path, dtype=str, delimiter=' ')
transcription = fuel.MmapDict(transcription_path)
mean = ds['mspec_mean'][:]
# NOTE(review): this line used to read 'mspec_mean' as well, which made `std`
# an exact copy of `mean`; assumes the dataset stores the standard deviation
# under 'mspec_std' -- confirm against the dataset contents.
std = ds['mspec_std'][:]
cache = 5


# ===========================================================================
# Single process
# ===========================================================================
def get_data():
    """ batch_size = 128 """
    batch = []
    batch_trans = []
    for name, start, end in indices:
        start = int(start)
        end = int(end)