def __init__(self, func, data, epoch=1, p=1.0, batch_size=128, seed=None,
             shuffle_level=2, callbacks=None, labels=None, name=None,
             verbose=2):
  super(Task, self).__init__()
  self.set_func(func, data)
  # this Progbar will record the history as well
  self._labels = [str(l) for l in labels] if labels is not None else None
  self._progbar = Progbar(target=self.nb_samples, name=name,
                          interval=0., print_report=True, print_summary=True)
  self._progbar.set_labels(self._labels)
  # ====== set callback and verbose ====== #
  self._callback = CallbackList(callbacks)
  self.set_verbose(verbose)
  # ====== assign other arguments ====== #
  self._nb_epoch = epoch
  self._p = np.clip(p, 0., 1.)
  self._seed = seed
  self.set_batch(batch_size, seed, shuffle_level)
  self._name = name
  # ====== current info ====== #
  self._curr_epoch = 0
  self._curr_iter = 0
  self._curr_samples = 0
  self._curr_epoch_iter = 0
  self._curr_epoch_samples = 0
  self._callback_msg = []
  # ====== iter tracking ====== #
  self._created_iter = None
  self._stop = False
def _load_archive(self, path, extract_path):
  from zipfile import ZipFile, ZIP_DEFLATED
  try:
    zfile = ZipFile(path, mode='r', compression=ZIP_DEFLATED)
    allfile = zfile.namelist()
    # validate extract_path
    if not os.path.isdir(extract_path):
      raise ValueError('Extract path must be a folder, but path'
                       '={} is a file'.format(extract_path))
    extract_path = os.path.join(extract_path,
                                os.path.basename(path).replace('.zip', ''))
    # found the extracted dir, use it
    if os.path.isdir(extract_path) and \
       set(os.listdir(extract_path)) == set(allfile):
      self._set_path(extract_path)
      return
    # decompress everything
    if not os.path.exists(extract_path):
      os.mkdir(extract_path)
    maxlen = max([len(i) for i in allfile])
    pb = Progbar(target=len(allfile), name="[Dataset] Loading Archive",
                 print_summary=True, print_report=True)
    for i, f in enumerate(allfile):
      zfile.extract(f, path=extract_path)
      pb['File'] = ('Unarchiving: %-' + str(maxlen) + 's') % f
      pb.add(1)
    # ====== finally set path ====== #
    self._set_path(extract_path)
  except IOError as e:
    raise IOError('Error loading archived dataset, path:{}, error:{}'
                  '.'.format(path, e))
  return None
def fit(self, X, y=None, print_progress=False):
  """Fit the model with X, using minibatches of size batch_size.

  Parameters
  ----------
  X: array-like, shape (n_samples, n_features)
      Training data, where n_samples is the number of samples and
      n_features is the number of features.
  y: Passthrough for ``Pipeline`` compatibility.

  Returns
  -------
  self: object
      Returns the instance itself.
  """
  if isinstance(X, Data):
    X = X[:]
  X = check_array(X, copy=self.copy, dtype=[np.float64, np.float32])
  n_samples, n_features = X.shape
  if self.batch_size is None:
    batch_size = 12 * n_features
  else:
    batch_size = self.batch_size
  if print_progress:
    prog = Progbar(target=n_samples)
  for batch in gen_batches(n_samples, batch_size):
    x = X[batch]
    self.partial_fit(x, check_input=False)
    if print_progress:
      prog.add(x.shape[0])
  return self
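# A minimal standalone sketch of the same incremental-fit pattern above,
# assuming only scikit-learn and numpy; `gen_batches` yields slice objects
# covering [0, n_samples) in steps of the batch size.
import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.utils import gen_batches

X = np.random.rand(1000, 20)
pca = IncrementalPCA(n_components=5)
for batch in gen_batches(X.shape[0], 128):
  pca.partial_fit(X[batch])  # each call updates the running mean/components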
def _load_archive(self, path, extract_path):
  from zipfile import ZipFile, ZIP_DEFLATED
  try:
    zfile = ZipFile(path, mode='r', compression=ZIP_DEFLATED)
    allfile = zfile.namelist()
    # validate extract_path
    if not os.path.isdir(extract_path):
      raise ValueError('Extract path must be a folder, but path'
                       '={} is a file'.format(extract_path))
    extract_path = os.path.join(extract_path,
                                os.path.basename(path).replace('.zip', ''))
    # found the extracted dir, use it
    if os.path.isdir(extract_path) and \
       set(os.listdir(extract_path)) == set(allfile):
      self._set_path(extract_path)
      return
    # decompress everything
    if not os.path.exists(extract_path):
      os.mkdir(extract_path)
    maxlen = max([len(i) for i in allfile])
    progbar = Progbar(len(allfile))
    for i, f in enumerate(allfile):
      zfile.extract(f, path=extract_path)
      progbar.title = ('Unarchiving: %-' + str(maxlen) + 's') % f
      progbar.update(i + 1)
    # ====== finally set path ====== #
    self._set_path(extract_path)
  except IOError as e:
    raise IOError('Error loading archived dataset, path:{}, error:{}'
                  '.'.format(path, e))
  return None
def _extract_zero_and_first_stats(X, sad, indices, gmm,
                                  z_path, f_path, name_path):
  n_samples = X.shape[0]
  # indices is None, every row is single sample (utterance or image ...)
  if indices is None:
    if os.path.exists(z_path):
      os.remove(z_path)
    if os.path.exists(f_path):
      os.remove(f_path)
    Z = MmapArrayWriter(path=z_path, dtype='float32',
                        shape=(n_samples, gmm.nmix),
                        remove_exist=True)
    F = MmapArrayWriter(path=f_path, dtype='float32',
                        shape=(n_samples, gmm.feat_dim * gmm.nmix),
                        remove_exist=True)
    jobs, _ = _split_jobs(n_samples, ncpu=mpi.cpu_count(),
                          device='cpu', gpu_factor=1)

    def map_transform(start_end):
      start, end = start_end
      for i in range(start, end):
        # removed by SAD
        if sad is not None and not bool(sad[i]):
          yield None, None, None
        else:
          z, f = gmm.transform(X[i][np.newaxis, :],
                               zero=True, first=True, device='cpu')
          yield i, z, f

    prog = Progbar(target=n_samples, print_report=True, print_summary=False,
                   name="Extracting zero and first order statistics")
    for i, z, f in mpi.MPI(jobs, map_transform, ncpu=None, batch=1):
      if i is not None:  # i None means removed by SAD
        Z[i] = z
        F[i] = f
      prog.add(1)
    Z.flush()
    F.flush()
    Z.close()
    F.close()
  # use directly the transform_to_disk function
  else:
    gmm.transform_to_disk(X, indices=indices, sad=sad,
                          pathZ=z_path, pathF=f_path, name_path=name_path,
                          dtype='float32', device=None, ncpu=None,
                          override=True)
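# For context: the zero- and first-order Baum-Welch statistics of a GMM are
# Z_k = sum_t gamma_t(k) and F_k = sum_t gamma_t(k) * x_t, where gamma_t(k)
# is the posterior of mixture k for frame x_t. A minimal numpy sketch under
# that standard definition (the exact memory layout of F that odin's
# gmm.transform uses may differ):
import numpy as np

def zero_first_stats(posteriors, frames):
  # posteriors: (T, nmix) responsibilities; frames: (T, feat_dim)
  Z = posteriors.sum(axis=0)           # zero-order: (nmix,)
  F = (posteriors.T @ frames).ravel()  # first-order: (nmix * feat_dim,)
  return Z, F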
def test_conv_deconv_transpose(self):

  def feval(X, y):
    f = K.function(X, y)
    shape = (np.random.randint(8, 18),) + tuple(X.shape.as_list()[1:])
    x = np.random.rand(*shape)
    return f(x)

  prog = Progbar(target=2 * 3 * 3 * 2 * 2, print_report=True)
  for X in (K.placeholder(shape=(None, 13, 12, 25)),
            K.placeholder(shape=(None, 13, 12, 8, 25))):
    for strides in (1, 2, 3):
      for filter_size in (3, 4, 5):
        for num_filters in (8, 25):
          for pad in ("same", "valid"):
            for dilation in (1,):
              # ====== progress ====== #
              prog['test'] = "#Dim:%d;Stride:%d;Filter:%d;Channel:%d;Pad:%s" % \
                  (X.shape.ndims, strides, filter_size, num_filters, pad)
              prog.add(1)
              # ====== test Conv ====== #
              f = N.Conv(num_filters=num_filters, filter_size=filter_size,
                         pad=pad, strides=strides,
                         activation=tf.nn.relu, dilation=dilation)
              fT = f.T
              y = f(X)
              self.assertEqual(feval(X, y).shape[1:],
                               tuple(y.shape.as_list()[1:]))
              yT = fT(y)
              self.assertEqual(feval(X, yT).shape[1:],
                               tuple(yT.shape.as_list()[1:]))
              self.assertEqual(X.shape.as_list(), yT.shape.as_list())
              # ====== test Transpose ====== #
              f = N.TransposeConv(num_filters=num_filters,
                                  filter_size=filter_size,
                                  pad=pad, strides=strides,
                                  activation=K.relu, dilation=dilation)
              fT = f.T
              y = f(X)
              self.assertEqual(feval(X, y).shape[1:],
                               tuple(y.shape.as_list()[1:]))
              yT = fT(y)
              self.assertEqual(feval(X, yT).shape[1:],
                               tuple(yT.shape.as_list()[1:]))
              self.assertEqual(X.shape.as_list(), yT.shape.as_list())
def fit(self, texts, vocabulary=None):
  """
  Parameters
  ----------
  texts: iterator of unicode
      iterator, generator or list (e.g. [u'a', u'b', ...])
      of unicode documents.
  """
  texts = self._validate_texts(texts)
  word_counts = self._word_counts
  word_docs = self._word_docs
  # ====== pick engine ====== #
  if self.__engine == 'spacy':
    processor = self._preprocess_docs_spacy
  elif self.__engine == 'odin':
    processor = self._preprocess_docs_odin
  # ====== start processing ====== #
  prog = Progbar(target=1234, name="Fitting tokenizer",
                 print_report=True, print_summary=True)
  start_time = timeit.default_timer()
  for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
    total_docs_tokens = 0
    seen_words = {}
    # update words->count
    for token in doc:
      total_docs_tokens += 1
      word_counts[token] += 1
      # update words->doc
      if token not in seen_words:
        seen_words[token] = 1
        word_docs[token] += 1
    # save longest docs
    if total_docs_tokens > self.__longest_document[-1]:
      self.__longest_document = [doc, total_docs_tokens]
    # print progress
    prog['#Doc'] = nb_docs
    prog['#Tok'] = len(word_counts)
    prog.add(1)
    if prog.seen_so_far >= 0.8 * prog.target:
      prog.target = 1.2 * prog.target
  # ====== print summary of the process ====== #
  # if self.print_progress:
  #   prog.target = nb_docs; prog.update(nb_docs)
  processing_time = timeit.default_timer() - start_time
  print('Processed %d-docs, %d-tokens in %f second.' %
        (nb_docs, len(word_counts), processing_time))
  self.nb_docs += nb_docs
  # ====== sorting ====== #
  self._refresh_dictionary()
  return self
def __init__(self, name, format='', tracking=[]):
  super(ProgressMonitor, self).__init__()
  self.name = name
  self._history = []
  self._prog = Progbar(100, title='')
  # ====== format ====== #
  self._format_results = 0
  for i in _PLACEHOLDER:
    self._format_results += len(i.findall(format))
  self._format = format
  # ====== one-time tracking at epoch_end ====== #
  if isinstance(tracking, dict):
    tracking = tracking.iteritems()
  self.tracking = [(int(i), j) for i, j in tracking if callable(j)]
  self._tracking_history = defaultdict(list)
def archive(self):
  from zipfile import ZipFile, ZIP_DEFLATED
  path = self.archive_path
  zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)
  files = set([_[-1] for _ in self._data_map.itervalues()])
  progbar = Progbar(len(files), title='Archiving:')
  maxlen = max([len(os.path.basename(i)) for i in files])
  for i, f in enumerate(files):
    zfile.write(f, os.path.basename(f))
    progbar.title = ('Archiving: %-' + str(maxlen) + 's') % os.path.basename(f)
    progbar.update(i + 1)
  zfile.close()
  return path
def archive(self):
  from zipfile import ZipFile, ZIP_DEFLATED
  path = self.archive_path
  zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)
  files = set([_[-1] for _ in self._data_map.values()])
  prog = Progbar(target=len(files), name="[Dataset] Archiving",
                 print_report=True, print_summary=True)
  maxlen = max([len(os.path.basename(i)) for i in files])
  for i, f in enumerate(files):
    zfile.write(f, os.path.basename(f))
    prog['Data'] = ('Archiving: %-' + str(maxlen) + 's') % os.path.basename(f)
    prog.add(1)
  zfile.close()
  return path
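# The two `archive` variants above show the old and the new Progbar API
# side by side. A minimal sketch of each style, assuming the import path
# used by these snippets (e.g. `from odin.utils import Progbar`):
#
#   # old style: absolute position + `title` attribute
#   progbar = Progbar(100, title='Working:')
#   progbar.title = 'Working: item-1'
#   progbar.update(1)                # set progress to 1/100
#
#   # new style: named report fields + incremental `add`
#   prog = Progbar(target=100, name='Working', print_report=True)
#   prog['Data'] = 'item-1'          # key/value shown in the report
#   prog.add(1)                      # advance progress by 1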
def _saveable_variables(self):
  return {'_format': self._format,
          '_format_results': self._format_results,
          '_history': [],
          'tracking': self.tracking,
          '_tracking_history': defaultdict(list),
          '_prog': Progbar(100, title=''),
          'name': self.name}
def transform_mpi(self, X, y=None, keep_order=True, ncpu=4,
                  n_components=None, print_progress=False):
  """ Same as transform but using multiprocessing """
  n = X.shape[0]
  if self.batch_size is None:
    batch_size = 12 * len(self.mean_)
  else:
    batch_size = self.batch_size
  batch_list = [(i, min(i + batch_size, n))
                for i in range(0, n + batch_size, batch_size) if i < n]
  if print_progress:
    prog = Progbar(target=n)

  # ====== run MPI jobs ====== #
  def map_func(batch):
    for start, end in batch:
      x = super(MiniBatchPCA, self).transform(X=X[start:end], y=y)
      # doing dim reduction here saves a lot of memory for
      # inter-processor transfer
      if n_components is not None:
        x = x[:, :n_components]
      # just need to return the start for ordering
      yield start, x

  mpi = MPI(batch_list, map_func=map_func, ncpu=ncpu,
            buffer_size=1, maximum_queue_size=ncpu * 12)
  # ====== process the return ====== #
  X_transformed = []
  for start, x in mpi:
    X_transformed.append((start, x))
    if print_progress:
      prog.add(x.shape[0])
  if keep_order:
    X_transformed = sorted(X_transformed, key=lambda x: x[0])
  X_transformed = np.concatenate([x[-1] for x in X_transformed], axis=0)
  return X_transformed
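# The same keep_order trick with only the standard library instead of odin's
# MPI: run batches in parallel, tag each result with its start offset, then
# sort by offset before concatenating. A minimal sketch with a toy worker:
import numpy as np
from multiprocessing import Pool

def _square_batch(job):
  start, x = job
  return start, x ** 2  # tag the result with its original offset

if __name__ == '__main__':
  X = np.arange(10.0)
  jobs = [(s, X[s:s + 3]) for s in range(0, len(X), 3)]
  with Pool(2) as pool:
    parts = list(pool.imap_unordered(_square_batch, jobs))
  parts.sort(key=lambda p: p[0])  # restore the original order
  result = np.concatenate([p[1] for p in parts])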
def save_cache(self, path, datatype='memmap', print_progress=True):
  """ Save all preprocessed data to a Dataset """
  if not isinstance(path, str) or os.path.isfile(path):
    raise ValueError('path must be string path to a folder.')
  if os.path.exists(path):
    print('Remove old dataset at path:', path)
    shutil.rmtree(path)
  ds = Dataset(path)
  # ====== start caching ====== #
  if print_progress:
    prog = Progbar(target=self.shape[0], title='Caching:')
  for X in self:
    if not isinstance(X, (tuple, list)):
      X = (X,)
    # saving preprocessed data
    for i, x in enumerate(X):
      name = 'data%d' % i
      if name in ds:
        ds[name].append(x)
      else:
        ds[(name, datatype)] = x
    # print progress
    if print_progress:
      prog.add(X[0].shape[0])
  if print_progress:
    prog.target = prog.seen_so_far
    prog.add(0)
  ds.flush()
  ds.close()
  # end
  return self
def _extract_zero_and_first_stats(X, sad, indices, gmm,
                                  z_path, f_path, name_path):
  n_samples = X.shape[0]
  # indices is None, every row is single sample (utterance or image ...)
  if indices is None:
    if os.path.exists(z_path):
      os.remove(z_path)
    if os.path.exists(f_path):
      os.remove(f_path)
    Z = MmapData(path=z_path, dtype='float32',
                 shape=(n_samples, gmm.nmix), read_only=False)
    F = MmapData(path=f_path, dtype='float32',
                 shape=(n_samples, gmm.feat_dim * gmm.nmix),
                 read_only=False)
    jobs, _ = _split_jobs(n_samples, ncpu=mpi.cpu_count(),
                          device='cpu', gpu_factor=1)

    def map_transform(start_end):
      start, end = start_end
      for i in range(start, end):
        # removed by SAD
        if sad is not None and not bool(sad[i]):
          yield None, None, None
        else:
          z, f = gmm.transform(X[i][np.newaxis, :],
                               zero=True, first=True, device='cpu')
          yield i, z, f

    prog = Progbar(target=n_samples, print_report=True, print_summary=False,
                   name="Extracting zero and first order statistics")
    for i, z, f in mpi.MPI(jobs, map_transform, ncpu=None, batch=1):
      if i is not None:  # i None means removed by SAD
        Z[i] = z
        F[i] = f
      prog.add(1)
    Z.flush()
    Z.close()
    F.flush()
    F.close()
  # use directly the transform_to_disk function
  else:
    gmm.transform_to_disk(X, indices=indices, sad=sad,
                          pathZ=z_path, pathF=f_path, name_path=name_path,
                          dtype='float32', device=None, ncpu=None,
                          override=True)
def transform(self, X, y=None, n_components=None, print_progress=False):
  n = X.shape[0]
  if self.batch_size is None:
    batch_size = 12 * len(self.mean_)
  else:
    batch_size = self.batch_size
  batch_list = [(i, min(i + batch_size, n))
                for i in range(0, n + batch_size, batch_size) if i < n]
  if print_progress:
    prog = Progbar(target=n)
  # ====== start transforming ====== #
  X_transformed = []
  for start, end in batch_list:
    x = super(MiniBatchPCA, self).transform(X=X[start:end], y=y)
    if n_components is not None:
      x = x[:, :n_components]
    X_transformed.append(x)
    if print_progress:
      prog.add(x.shape[0])
  return np.concatenate(X_transformed, axis=0)
def _fitting_helper(it, fn, nb_samples, nb_classes, title):
  prog = Progbar(target=nb_samples, print_report=True, print_summary=False,
                 name=title)
  results = None
  start_time = time.time()
  for nb_iter, (x, y) in enumerate(it):
    # ====== preprocessing ====== #
    x, y = _preprocess_xy(x, y, nb_classes)
    # ====== post-processing results ====== #
    if results is None:
      results = list(fn(x, y))
    else:
      for idx, r in enumerate(fn(x, y)):
        results[idx] += r
    # ====== update progress ====== #
    prog.add(x.shape[0])
  duration = time.time() - start_time
  return (nb_iter + 1, duration,
          [r if isinstance(r, np.ndarray) else r / (nb_iter + 1)
           for r in results])
def _predict(self, X, f_pred):
  if not self.is_fitted:
    raise RuntimeError("LogisticRegression hasn't been initialized or "
                       "fitted.")
  if hasattr(X, 'set_batch'):
    it = iter(X.set_batch(batch_size=self.batch_size, seed=None))
  elif hasattr(X, '__getitem__'):
    it = (X[start:end]
          for start, end in batching(batch_size=self.batch_size,
                                     n=X.shape[0]))
  else:
    raise ValueError("`X` must have attribute 'set_batch' or '__getitem__'")
  # ====== make prediction ====== #
  y = []
  prog = Progbar(target=X.shape[0], print_report=True, print_summary=False,
                 name="Predicting")
  for x in it:
    x = _preprocess_xy(x, y=None, nb_classes=self.nb_classes)
    y.append(f_pred(x))
    prog.add(x.shape[0])
  return np.concatenate(y, axis=0)
def _extract_test_data(feat, label, utt_length):
  prog = Progbar(target=len(feeder_test), print_summary=True,
                 name="Preprocessing test set")
  X_test = defaultdict(list)
  for name, idx, X, y in feeder_test:
    # validate everything as expected
    assert fn_label(name) == np.argmax(y), name  # label is right
    # save to list
    X_test[name].append((idx, X))
    prog.add(X.shape[0])
  # ====== create 1 array for data and dictionary for indices ====== #
  X_test_name = []
  X_test_data = []
  for name, X in X_test.items():
    X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                       axis=0).astype('float16')
    X_test_name += [name + '.%d' % i for i in range(len(X))]
    X_test_data.append(X)
  X_test_name = np.array(X_test_name)
  X_test_data = np.concatenate(X_test_data, axis=0)
  return X_test_name, X_test_data
def predict_proba(self, *args):
  self._auto_create_inputs(args)
  self._create_function()
  n = 0
  nb_samples = args[0].shape[0]
  batch_size = self._batch_size
  prediction = []
  prog = Progbar(target=nb_samples, title='Predicting')
  while n < nb_samples:
    end = min(n + batch_size, nb_samples)
    x = [i[n:end] for i in args]
    x = self._functions['pred'](*x)
    _min = np.min(x, axis=-1)[:, None]
    _max = np.max(x, axis=-1)[:, None]
    x = (x - _min) / (_max - _min)
    x = x / x.sum(-1)[:, None]
    prediction.append(x)
    n = end
    prog.update(n)
  return np.concatenate(prediction, axis=0)
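# The normalization in `predict_proba` above maps raw scores to
# pseudo-probabilities in two steps: rescale each row to [0, 1] with min-max,
# then divide by the row sum so each row sums to 1. A standalone numpy
# sketch of the same transform:
import numpy as np

def scores_to_proba(x):
  # x: (n_samples, n_classes) raw scores
  lo = x.min(axis=-1, keepdims=True)
  hi = x.max(axis=-1, keepdims=True)
  x = (x - lo) / (hi - lo)                   # row-wise min-max to [0, 1]
  return x / x.sum(axis=-1, keepdims=True)   # row-wise L1 normalization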
def make_prediction(feeder, title):
  prog = Progbar(target=len(feeder), print_summary=True, name=title)
  name_list = []
  y_pred = []
  y_true = []
  for name, idx, X, y in feeder.set_batch(batch_size=100000,
                                          batch_mode='file',
                                          seed=None, shuffle_level=0):
    name_list.append(name)
    y = np.argmax(y, axis=-1)
    assert len(np.unique(y)) == 1, name
    spk = label2spk[y[0]]
    assert spkid[name] == spk, name
    y_true.append(y)
    y_ = f_prob(X)
    y_pred.append(y_)
    assert len(y) == len(y_)
    prog.add(X.shape[0])
  evaluate_prediction(name_list, y_pred, y_true, title=title)
def evaluate_latent(fn, feeder, title):
  y_true = []
  Z = []
  for outputs in Progbar(feeder.set_batch(batch_mode='file'),
                         name=title,
                         print_report=True, print_summary=False,
                         count_func=lambda x: x[-1].shape[0]):
    name = str(outputs[0])
    idx = int(outputs[1])
    data = outputs[2:]
    assert idx == 0
    y_true.append(name)
    Z.append(fn(*data))
  Z = np.concatenate(Z, axis=0)
  # ====== visualize spectrogram ====== #
  if Z.ndim >= 3:
    sample = np.random.choice(range(len(Z)), size=3, replace=False)
    spec = Z[sample.astype('int32')]
    y = [y_true[int(i)] for i in sample]
    plot_figure(nrow=6, ncol=6)
    for i, (s, tit) in enumerate(zip(spec, y)):
      s = s.reshape(len(s), -1)
      plot_spectrogram(s.T, ax=(1, 3, i + 1), title=tit)
  # ====== visualize each point ====== #
  # flatten to 2D
  Z = np.reshape(Z, newshape=(len(Z), -1))
  # tsne if necessary
  if Z.shape[-1] > 3:
    Z = fast_tsne(Z, n_components=3, n_jobs=8,
                  random_state=K.get_rng().randint(0, 10e8))
  # color and marker
  Z_color = [digit_color_map[i.split('_')[-1]] for i in y_true]
  Z_marker = [gender_marker_map[i.split('_')[1]] for i in y_true]
  plot_figure(nrow=6, ncol=20)
  for i, azim in enumerate((15, 60, 120)):
    plot_scatter(x=Z[:, 0], y=Z[:, 1], z=Z[:, 2],
                 ax=(1, 3, i + 1), size=4,
                 color=Z_color, marker=Z_marker, azim=azim,
                 legend=legends if i == 1 else None,
                 legend_ncol=11, fontsize=10, title=title)
  plot_save(os.path.join(FIG_PATH, '%s.pdf' % title))
def make_dnn_prediction(functions, X, batch_size=256, title=''):
  return_list = True
  if not isinstance(functions, (tuple, list)):
    functions = [functions]
    return_list = False
  n_functions = len(functions)
  results = [[] for i in range(n_functions)]
  # ====== prepare progress bar ====== #
  n_samples = len(X)
  prog = Progbar(target=n_samples, print_summary=True,
                 name="Making prediction: %s" % str(title))
  # ====== for feeder ====== #
  if isinstance(X, F.Feeder):
    y_true = []
    for x, y in X.set_batch(batch_size=batch_size):
      for res, fn in zip(results, functions):
        res.append(fn(x))
      prog.add(x.shape[0])
      y_true.append(np.argmax(y, axis=-1) if y.ndim == 2 else y)
    results = [np.concatenate(res, axis=0) for res in results]
    y_true = np.concatenate(y_true, axis=0)
    if return_list:
      return results, y_true
    return results[0], y_true
  # ====== for numpy array ====== #
  else:
    for start, end in batching(batch_size=batch_size, n=n_samples):
      y = X[start:end]
      for res, fn in zip(results, functions):
        res.append(fn(y))
      prog.add(end - start)
    results = [np.concatenate(res, axis=0) for res in results]
    if return_list:
      return results
    return results[0]
def test_pool_depool(self):
  X1 = K.placeholder(shape=(None, 12, 8, 25), name='X1')
  X2 = K.placeholder(shape=(None, 12, 8, 25, 18), name='X2')
  x1 = np.random.rand(13, 12, 8, 25)
  x2 = np.random.rand(13, 12, 8, 25, 18)
  prog = Progbar(target=2 * 2 * 2 * 3, print_report=True)

  def check_shape(s1, s2):
    self.assertEqual(tuple(s1), tuple(s2),
                     msg="%s != %s" % (str(s1), str(s2)))

  for pool_size in (2, 3):
    for strides in (2, 3):
      # strides > window_shape not supported due to inconsistency
      # between CPU and GPU implementations
      if pool_size < strides:
        prog.add(1)
        continue
      for pad in ('valid', 'same'):
        for transpose_mode in ('nn', 'pad_margin', 'repeat'):
          # ====== print prog ====== #
          prog['test'] = "Size:%d,Stride:%d,Pad:%s,T:%s" % \
              (pool_size, strides, pad, transpose_mode)
          prog.add(1)
          # ====== check ops 4D ====== #
          down = N.Pool(pool_size=pool_size, strides=strides, pad=pad,
                        mode='max', transpose_mode=transpose_mode)
          up = down.T
          y1 = down(X1)
          check_shape(K.eval(y1, {X1: x1}).shape[1:],
                      y1.shape.as_list()[1:])
          y2 = up(y1)
          check_shape(K.eval(y2, {X1: x1}).shape, x1.shape)
          # ====== check ops 5D ====== #
          down = N.Pool(pool_size=pool_size, strides=strides, pad=pad,
                        mode='max', transpose_mode=transpose_mode)
          up = down.T
          y1 = down(X2)
          check_shape(K.eval(y1, {X2: x2}).shape[1:], y1.shape[1:])
          y2 = up(y1)
          check_shape(K.eval(y2, {X2: x2}).shape, x2.shape)
def validating_noise_data(in_path_raw):
  # preparing
  noise_dataset = ['musan', 'rirs']
  all_files = defaultdict(list)
  n_files = sum(len(sre_file_list[i])
                for i in noise_dataset if i in sre_file_list)
  n_non_exist = 0
  n_exist = 0
  prog = Progbar(target=n_files, print_summary=True,
                 name="Validating noise dataset")
  prog.set_summarizer(key='#Non-exist', fn=lambda x: x[-1])
  prog.set_summarizer(key='#Exist', fn=lambda x: x[-1])
  # check all dataset
  for ds_name in noise_dataset:
    if ds_name not in sre_file_list:
      continue
    if ds_name not in in_path_raw:
      continue
    base_path = in_path_raw[ds_name]
    base_ds = all_files[ds_name]
    # start validating
    for row in sre_file_list[ds_name]:
      # check file
      path, channel, name, noise_type, duration = row[:5]
      path = os.path.join(base_path, path)
      if os.path.exists(path):
        base_ds.append([path, channel, name, noise_type, duration])
        n_exist += 1
      else:
        n_non_exist += 1
      # update progress
      prog['ds'] = ds_name
      prog['#Exist'] = n_exist
      prog['#Non-exist'] = n_non_exist
      prog.add(1)
  # ====== return ====== #
  # Header:
  #  0      1        2     3           4
  # path, channel, name, noise_type, duration
  return {key: np.array(sorted(val, key=lambda x: x[0]))
          for key, val in all_files.items()}
def evaluate_feeder(feeder, title):
  y_true_digit = []
  y_true_gender = []
  y_pred = []
  for outputs in Progbar(feeder.set_batch(batch_mode='file'),
                         name=title,
                         print_report=True, print_summary=False,
                         count_func=lambda x: x[-1].shape[0]):
    name = str(outputs[0])
    idx = int(outputs[1])
    data = outputs[2:]
    assert idx == 0
    y_true_digit.append(f_digits(name))
    y_true_gender.append(f_genders(name))
    y_pred.append(f_pred(*data))
  # ====== post processing ====== #
  y_true_digit = np.array(y_true_digit, dtype='int32')
  y_true_gender = np.array(y_true_gender, dtype='int32')
  y_pred_proba = np.concatenate(y_pred, axis=0)
  y_pred_all = np.argmax(y_pred_proba, axis=-1).astype('int32')
  # ====== plotting for each gender ====== #
  plot_figure(nrow=6, ncol=25)
  for gen in range(len(genders)):
    y_true, y_pred = [], []
    for i, g in enumerate(y_true_gender):
      if g == gen:
        y_true.append(y_true_digit[i])
        y_pred.append(y_pred_all[i])
    if len(y_true) == 0:
      continue
    cm = confusion_matrix(y_true, y_pred, labels=range(len(digits)))
    plot_confusion_matrix(cm, labels=digits, fontsize=8,
                          ax=(1, 4, gen + 1),
                          title='[%s]%s' % (genders[gen], title))
  plot_save(os.path.join(FIG_PATH, '%s.pdf' % title))
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext('  *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds` can be None, string, or Dataset. No "
                     "support for given input type: %s" % str(type(ds)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by "
                       "`FeatureProcessor` which must contain `config` "
                       "MmapDict of extracted features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (including the features in external_indices)
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Discontinuity in indices"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # final length match length of Data
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert now[-1] == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
           'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
           'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
           isinstance(ds[feat_name], MmapData):
          assert now[-1] == len(ds[feat_name]), \
              "Length of indices and actual data mismatch, " + \
              ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0  # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
            "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary: ", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
       'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data file
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # No NaN value
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all values close to zero
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name,
               False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Checked data integrity for: ", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name,
                 False)
          fail_test = True
      if not fail_test:
        logger("Checked statistics for: ", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                          n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Checked PCA for: ", feat_name, True)
  # ====== Do sampling ====== #
  np.random.seed(seed)  # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples, replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path,
                               '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at: ", figure_path, True)
  # plotting the statistics
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_name in all_stats.items():
    X = {name: ds[name][:] for name in stat_name if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure saved at: ", figure_path, True)
  logger("All reports at folder: ", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()
def run(self):
  njobs = len(self.jobs)
  dataset = Dataset(self.path)
  if self.n_cache <= 1:
    cache_limit = max(2, int(0.12 * njobs))
  else:
    cache_limit = int(self.n_cache)
  # ====== indices ====== #
  databases = defaultdictkey(
      lambda key: MmapDict(path=os.path.join(dataset.path, key),
                           cache_size=10000, read_only=False))
  last_start = defaultdict(int)
  # ====== statistic ====== #
  # load old statistics
  stats = defaultdict(lambda: [0, 0])  # name -> (sum1, sum2)
  for key in dataset.keys():
    if 'sum1' == key[-4:]:
      stats[key[:-4]][0] = dataset[key][:]
    elif 'sum2' == key[-4:]:
      stats[key[:-4]][1] = dataset[key][:]
  # all data are cached to be periodically flushed
  cache = defaultdict(list)
  n_processed = [0]  # store the value as reference

  # ====== helper ====== #
  def flush_feature(feat_name, X_cached):
    if len(X_cached) > 0:
      X_cached = np.concatenate(X_cached, 0)
      # flush data
      if feat_name in dataset:
        dataset[feat_name].append(X_cached)
      else:
        dataset[(feat_name, 'memmap')] = X_cached

  # ====== repeated for each result returned ====== #
  def post_processing(result):
    # search for file name
    if self.identifier not in result:
      raise RuntimeError("Cannot find identifier '%s' in returned "
                         "dictionary" % self.identifier)
    file_name = result[self.identifier]
    # invalid file_name
    if not is_string(file_name):
      raise RuntimeError("Cannot find file name in returned features "
                         "list, the file name can be specified in key: "
                         "'name', 'path' and the type of the value must "
                         "be string. All available keys are: %s" %
                         str(result.keys()))
    # store all new indices
    # mapping [X.shape[0]] -> [feat_name, feat_name, ...]
    all_indices = {}
    # processing
    for feat_name, X in result.items():
      # some invalid feat_name
      if feat_name in ('config', 'pipeline', 'sum1', 'sum2'):
        raise RuntimeError("Returned features' name cannot be one of "
                           "the following: 'config', 'pipeline', "
                           "'sum1', 'sum2'.")
      # ignore some feat_name
      if feat_name in ('name',):
        continue
      # if numpy ndarray, save to MmapData
      if isinstance(X, np.ndarray) or \
         'sum1' == feat_name[-4:] or \
         'sum2' == feat_name[-4:]:
        # save statistics instead
        if 'sum1' == feat_name[-4:]:
          stats[feat_name[:-4]][0] += X
        elif 'sum2' == feat_name[-4:]:
          stats[feat_name[:-4]][1] += X
        # save features array
        else:
          all_indices[feat_name] = X.shape[0]
          # cache data, only if we have more than 0 sample
          if X.shape[0] > 0:
            cache[feat_name].append(X)
      # else all other kind of data save to MmapDict
      else:
        databases[feat_name][file_name] = X
      # remove data
      del X
    # ====== update indices ====== #
    if len(all_indices) > 0:
      for feat_name, n in all_indices.items():
        ids_name = 'indices_%s' % feat_name
        databases[ids_name][file_name] = (last_start[ids_name],
                                          last_start[ids_name] + n)
        last_start[ids_name] += n
    # ====== flush cache ====== #
    n_processed[0] += 1
    if n_processed[0] % cache_limit == 0:
      for feat_name, X_cached in cache.items():
        flush_feature(feat_name, X_cached)
      cache.clear()
    # ====== update progress ====== #
    return file_name

  # ====== mapping function ====== #
  def _map_func(dat):
    try:
      ret = self.extractor.transform(dat)
    except Exception as e:  # Non-handled exception
      ret = '\n========\n'
      ret += 'Time  : `%s`\n' % str(get_formatted_datetime(only_number=False))
      ret += 'Error : `%s`\n' % str(e)
      ret += 'Input : `%s`\n' % str(dat)
      import traceback
      etype, value, tb = sys.exc_info()
      for line in traceback.TracebackException(
              type(value), value, tb, limit=None).format(chain=True):
        ret += line
    return ret

  # ====== processing ====== #
  mpi = MPI(jobs=self.jobs, func=_map_func,
            ncpu=self.n_cpu, batch=1, hwm=self.n_cpu * 3,
            backend='python')
  # initialize
  prog = Progbar(target=njobs, name=self.path,
                 interval=0.12, print_report=True, print_summary=True)
  start_time = time.time()
  last_time = time.time()
  last_count = 0
  with open(self._log_path, 'w') as flog:
    # writing the log head
    flog.write('============================\n')
    flog.write('Start Time : %s\n' %
               get_formatted_datetime(only_number=False))
    flog.write('Outpath    : %s\n' % self.path)
    flog.write('Extractor  : %s\n' %
               '->'.join([s[-1].__class__.__name__
                          for s in self.extractor.steps]))
    flog.write('#Jobs      : %d\n' % njobs)
    flog.write('#CPU       : %d\n' % self.n_cpu)
    flog.write('#Cache     : %d\n' % cache_limit)
    flog.write('============================\n')
    flog.flush()
    # start processing the file list
    for count, result in enumerate(mpi):
      # Non-handled exception
      if isinstance(result, string_types):
        flog.write(result)
        flog.flush()
        self._error_log.append(result)
        if self.stop_on_failure:
          raise RuntimeError(result)
      # some error might have happened
      elif isinstance(result, ExtractorSignal):
        flog.write(str(result))
        flog.flush()
        if result.action == 'error':
          prog.add_notification(str(result))
          raise RuntimeError("ExtractorSignal requests terminating "
                             "processor!")
        elif result.action == 'warn':
          prog.add_notification(str(result))
        elif result.action == 'ignore':
          self._error_log.append(result)
        else:
          raise RuntimeError("Unknown action from ExtractorSignal: %s" %
                             result.action)
        prog['File'] = '%-48s' % result.message[:48]
      # otherwise, no error happened, do post-processing
      else:
        name = post_processing(result)
        prog['File'] = '%-48s' % str(name)[:48]
      # update progress
      prog.add(1)
      # manually write to external log file
      if (count + 1) % max(1, int(0.01 * njobs)) == 0:
        curr_time = time.time()
        elap = curr_time - start_time
        avg_speed = (count + 1) / elap
        cur_speed = (count + 1 - last_count) / (curr_time - last_time)
        avg_est = (njobs - count - 1) / avg_speed
        cur_est = (njobs - count - 1) / cur_speed
        flog.write('[%s] Processed: %d(files)  Remain: %d(files)  '
                   'Elap.: %.2f(secs)\n'
                   '  Avg.Spd: %.2f(obj/sec)  Avg.Est.: %.2f(secs)\n'
                   '  Cur.Spd: %.2f(obj/sec)  Cur.Est.: %.2f(secs)\n' %
                   (get_formatted_datetime(only_number=False),
                    count + 1, njobs - count - 1, elap,
                    avg_speed, avg_est, cur_speed, cur_est))
        flog.flush()
        last_time = curr_time
        last_count = count + 1
  # ====== end, flush the last time ====== #
  for feat_name, X_cached in cache.items():
    flush_feature(feat_name, X_cached)
  cache.clear()
  cache = None
  dataset.flush()
  prog.add_notification("Flushed all data to disk")
  # ====== saving indices ====== #
  for name, db in databases.items():
    db.flush(save_all=True)
    db_size = len(db)
    db.close()
    prog.add_notification('Flushed MmapDict "%s" to disk, size: %s' %
                          (ctext(name, 'yellow'),
                           ctext(str(db_size), 'yellow')))

  # ====== save mean and std ====== #
  def save_mean_std(sum1, sum2, name):
    N = dataset[name.split('_')[0]].shape[0]
    mean = sum1 / N
    std = np.sqrt(sum2 / N - np.power(mean, 2))
    if np.any(np.isnan(mean)):
      wprint('Mean contains NaN, name: %s' % name)
    if np.any(np.isnan(std)):
      wprint('Std contains NaN, name: %s' % name)
    dataset[name + 'sum1'] = sum1
    dataset[name + 'sum2'] = sum2
    dataset[name + 'mean'] = mean
    dataset[name + 'std'] = std

  # save all stats
  if len(stats) > 0:
    for feat_name, (sum1, sum2) in stats.items():
      save_mean_std(sum1, sum2, feat_name)
      prog.add_notification('Saved statistics of: %s, shape: %s' %
                            (ctext(feat_name.split('_')[0], 'yellow'),
                             ctext(str(sum1.shape), 'yellow')))
  # ====== dataset flush() ====== #
  dataset.flush()
  dataset.close()
  # ====== saving the extractor ====== #
  # it is not a good idea to save the extractor all the time
  # pipeline_path = os.path.join(dataset.path, 'pipeline')
  # with open(pipeline_path, 'wb') as f:
  #   cPickle.dump(self.extractor, f, protocol=2)
  # prog.add_notification("Saved Extractor pipeline at: %s" %
  #                       ctext(pipeline_path, 'yellow'))
  # ====== saving the configuration ====== #
  config_path = os.path.join(dataset.path, 'config')
  config = MmapDict(config_path)
  config['__configuration_time__'] = time.time()
  config['__processor__'] = self.path
  for i in dir(self):
    if _default_module.match(i) is not None:
      continue
    j = getattr(self, i)
    if isinstance(j, (Number, string_types, bool)):
      config[i] = j
  config.flush(save_all=True)
  self.config = {i: j for i, j in config}
  config.close()
  prog.add_notification("Saved configuration at: %s" %
                        ctext(config_path, 'yellow'))
  # ====== final notification ====== #
  prog.add_notification("Closed all dataset.")
  prog.add_notification("Dataset at path: %s" % ctext(dataset.path, 'yellow'))
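# The running-statistics trick used by `save_mean_std` above: accumulate
# sum(x) and sum(x^2) once over the data stream, then mean = s1/N and
# var = s2/N - mean^2, so no second pass is needed. A quick numpy check of
# that identity:
import numpy as np

x = np.random.rand(1000, 8).astype('float64')
s1, s2, N = x.sum(0), (x ** 2).sum(0), x.shape[0]
mean = s1 / N
std = np.sqrt(s2 / N - mean ** 2)
assert np.allclose(mean, x.mean(0)) and np.allclose(std, x.std(0))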
def save_cache(self, path, name=None, dtype=None, batch_size=1024):
  """ Save all preprocessed data to a Dataset

  Parameters
  ----------
  path: string
      path to a folder
  name: None, or list of string
      specific name for each returned `numpy.ndarray` during iteration
  dtype: None, or list of dtype, or single dtype
      specific dtype for all or each of the returned `numpy.ndarray`
      during iteration
  batch_size: int
      amount of samples for each batch (the higher, the faster the
      iteration)

  Note
  ----
  Only returned `numpy.ndarray` are saved
  """
  from odin.fuel.dataset import Dataset
  if not is_string(path):
    raise ValueError("`path` must be string path to a folder.")
  if os.path.exists(path) and os.path.isfile(path):
    raise ValueError("`path` is a file, required a folder for "
                     "saving all cache data.")
  # ====== start caching ====== #
  prog = Progbar(target=len(self),
                 name='Saving cache of preprocessed data',
                 print_report=True, print_summary=True)
  ds = Dataset(path, override=True)
  with self.set_batch_context(batch_size=int(batch_size), seed=None,
                              start=0, end=-1, shuffle_level=0):
    for X in self:
      if not isinstance(X, (tuple, list)):
        X = (X,)
      n = 0
      i = 0
      # saving preprocessed data
      for x in X:
        if isinstance(x, np.ndarray):
          # checking name
          if name is None:
            x_name = 'X%d' % i
          else:
            x_name = name[i]
          # checking dtype
          if isinstance(dtype, (tuple, list)):
            x = x.astype(dtype[i])
          elif dtype is not None:
            x = x.astype(dtype)
          # saving to the dataset
          if x_name in ds:
            ds[x_name].append(x)
          else:
            ds[(x_name, 'memmap')] = x
          # update samples count, and data count
          n = x.shape[0]
          i += 1
      # print progress
      prog.add(n)
  # ====== flush and close everything ====== #
  ds.flush()
  ds.close()
  with open(os.path.join(path, 'README'), 'w') as f:
    f.write(str(self))
  # ====== check one more time ====== #
  ds = Dataset(path, read_only=True)
  print(ds)
  print(ctext("Dataset size:", 'cyan'), ds.size, '(MB)')
  ds.close()
  return self
def copy(self, destination, indices_filter=None, data_filter=None,
         override=False):
  """ Copy the dataset to a new folder and close the old dataset """
  from distutils.dir_util import copy_tree
  read_only = self.read_only
  # indices
  if indices_filter is not None and \
     not is_callable(indices_filter) and \
     not isinstance(indices_filter, (tuple, list)):
    raise ValueError('`indices_filter` must be callable, tuple, list '
                     'or None')
  if isinstance(indices_filter, (tuple, list)):
    tmp = tuple(indices_filter)
    indices_filter = lambda x: x in tmp
  # data name
  if data_filter is not None and \
     not is_callable(data_filter) and \
     not isinstance(data_filter, (tuple, list)):
    raise ValueError('`data_filter` must be callable, tuple, list or None')
  if isinstance(data_filter, (tuple, list)):
    tmp = tuple(data_filter)
    data_filter = lambda x: x in tmp
  # ====== other files which are not Data ====== #
  other_files = [i for i in os.listdir(self.path) if i not in self]
  # ====== preprocessing ====== #
  destination = os.path.abspath(str(destination))
  if not os.path.exists(destination):
    os.mkdir(destination)
  elif not os.path.isdir(destination):
    raise ValueError('path at "%s" must be a folder' % destination)
  elif override:
    shutil.rmtree(destination)
    os.mkdir(destination)
  else:
    raise ValueError("A folder exists at path: '%s', cannot be "
                     "overridden." % destination)
  # ====== copy everything ====== #
  if indices_filter is None and data_filter is None:
    print("Copying %s files from '%s' to '%s' ..." %
          (ctext(len(self), 'cyan'),
           ctext(self.path, 'yellow'),
           ctext(destination, 'yellow')))
    copy_tree(self.path, destination)
  # ====== only data_filter ====== #
  elif indices_filter is None:
    data_list = [i for i in self.keys() if data_filter(i)]
    # copy all the data
    for name in data_list:
      org_path = os.path.join(self.path, name)
      dst_path = os.path.join(destination, name)
      print("Copying from '%s' to '%s' ..." %
            (ctext(org_path, 'yellow'), ctext(dst_path, 'yellow')))
      shutil.copy2(org_path, dst_path)
    # copy all the related indices
    for name in self.keys():
      org_path = os.path.join(self.path, name)
      dst_path = os.path.join(destination, name)
      if not os.path.exists(dst_path) and \
         ('indices' == name or
          any(i in data_list for i in name.split('_')[1:])):
        print("Copying Indices from '%s' to '%s'" %
              (ctext(org_path, 'cyan'), ctext(dst_path, 'cyan')))
        shutil.copy2(org_path, dst_path)
  # ====== use indices_filter and data_filter ====== #
  else:
    if data_filter is None:
      all_data = list(self.keys())
    else:
      all_data = [i for i in self.keys() if data_filter(i)]
    # list of data with separated indices
    separated_data = flatten_list(
        [k.split('_')[1:] for k in self.keys() if 'indices_' == k[:8]])
    # iterate over indices and copy the data one by one
    for ids_name in [k for k in self.keys() if 'indices' == k[:7]]:
      indices = [(n, (s, e)) for n, (s, e) in self[ids_name]
                 if indices_filter(n)]
      # no matching indices, skip
      if len(indices) == 0:
        continue
      nb_samples = sum(e - s for n, (s, e) in indices)
      # get all data assigned to given indices
      data = ids_name.split('_')[1:]
      if len(data) == 0:
        data = [i for i in all_data if i not in separated_data]
      else:
        data = [i for i in data if i in all_data]
      # if still no data found, skip
      if len(data) == 0:
        continue
      # copy each data
      for data_name in data:
        X = self[data_name]
        # copy big MmapDict
        if isinstance(X, MmapDict) and len(X) == len(self[ids_name]):
          new_path = os.path.join(destination, os.path.basename(X.path))
          print("Copying MmapDict from '%s' to '%s'" %
                (ctext(X.path, 'cyan'), ctext(new_path, 'cyan')))
          new_dict = MmapDict(new_path, cache_size=80000, read_only=False)
          for n, (s, e) in indices:
            new_dict[n] = X[n]
          new_dict.flush(save_all=True)
          new_dict.close()
        # copy MmapData
        elif isinstance(X, MmapData):
          Y = MmapData(path=os.path.join(destination, data_name),
                       dtype=X.dtype, shape=(0,) + X.shape[1:],
                       read_only=False)
          prog = Progbar(target=nb_samples,
                         print_report=True, print_summary=True,
                         name="Copying data: '%s' to path:'%s'" %
                              (ctext(data_name, 'yellow'),
                               ctext(Y.data_info, 'cyan')))
          for n, (s, e) in indices:
            Y.append(X[s:e])
            prog.add(e - s)
        # unknown data-type
        else:
          org_path = os.path.join(self.path, data_name)
          new_path = os.path.join(destination, data_name)
          # just copy the files directly
          if os.path.isfile(org_path) or not os.path.exists(new_path):
            shutil.copy2(org_path, new_path)
            print("Copying '%s' to '%s' ..." %
                  (ctext(org_path, 'cyan'), ctext(new_path, 'yellow')))
          else:
            wprint("Cannot copy: '%s' - %s" %
                   (ctext(data_name, 'cyan'),
                    ctext(type(self[data_name]), 'yellow')))
      # copy the indices
      new_indices = MmapDict(os.path.join(destination, ids_name),
                             cache_size=80000, read_only=False)
      start = 0
      for n, (s, e) in indices:
        size = e - s
        new_indices[n] = (start, start + size)
        start += size
      new_indices.flush(save_all=True)
      new_indices.close()
  # ====== copy other files ====== #
  for f in other_files:
    org_path = os.path.join(self.path, f)
    dst_path = os.path.join(destination, f)
    if not os.path.exists(dst_path):
      if os.path.isdir(org_path):  # directory
        copy_tree(org_path, dst_path)
      else:  # single file
        shutil.copy2(org_path, dst_path)
  # ====== readme ====== #
  readme_name = os.path.basename(self._readme_path)
  dst_path = os.path.join(destination, readme_name)
  if not os.path.exists(dst_path):
    shutil.copy2(self._readme_path, dst_path)
  return Dataset(destination, read_only=read_only)
def filter_utterances(X, indices, spkid,
                      min_dur=None, min_utt=None,
                      remove_min_length=True, remove_min_uttspk=True,
                      n_speakers=None, ncpu=None, save_path=None, title=''):
  """
  X : 2-D matrix
      input features
  indices : Mapping
      utterance_name -> (start, end) in `X`
  spkid : Mapping
      utterance_name -> speaker_id
  remove_min_length : bool (default: True)
      if True, remove all files shorter than MINIMUM_UTT_DURATION
  remove_min_uttspk : bool (default: True)
      if True, remove all speakers with fewer utterances than
      MINIMUM_UTT_PER_SPEAKERS
  n_speakers : {None, int} (default: None)
      if given, downsample the dataset to the given number of speakers
  save_path : {None, str} (default: None)
      if given, pickle all filtered files to disk
  """
  if min_dur is None:
    min_dur = MINIMUM_UTT_DURATION
  if min_utt is None:
    min_utt = MINIMUM_UTT_PER_SPEAKERS
  minimum_amount_of_frames = min_dur / Config.STEP_LENGTH
  save_data = {}
  prog = Progbar(target=len(indices),
                 print_report=True, print_summary=True,
                 name='Filtering broken utterances: %s' % title)
  prog.set_summarizer('zero-length', fn=lambda x: x[-1])
  prog.set_summarizer('min-frames', fn=lambda x: x[-1])
  prog.set_summarizer('zero-var', fn=lambda x: x[-1])
  prog.set_summarizer('small-var', fn=lambda x: x[-1])
  prog.set_summarizer('overflow', fn=lambda x: x[-1])

  # ====== mpi function for checking ====== #
  @nb.jit(nopython=True, nogil=True)
  def _fast_mean_var_ax0(z):
    # using this function for calculating mean and variance
    # can double the speed but cannot check overflow,
    # only accepts float32 or float64 input
    s1 = np.zeros(shape=(z.shape[1],), dtype=z.dtype)
    s2 = np.zeros(shape=(z.shape[1],), dtype=z.dtype)
    for i in range(z.shape[0]):
      s1 += z[i]
      s2 += np.power(z[i], 2)
    mean = s1 / z.shape[0]
    var = s2 / z.shape[0] - np.power(mean, 2)
    return mean, var

  def _mpi_func(jobs):
    for name, (start, end) in jobs:
      y = X[start:end]
      # flags
      is_zero_len = False
      is_zero_var = False
      is_small_var = False
      is_min_frames = False
      is_overflow = False
      # checking length
      if y.shape[0] == 0:
        is_zero_len = True
      elif y.shape[0] < minimum_amount_of_frames:
        is_min_frames = True
      # checking statistics
      else:
        with catch_warnings_error(RuntimeWarning):
          try:
            # mean = np.mean(y, axis=-1)
            var = np.var(y, axis=-1)
            # min_val = np.min(y, axis=-1)
            # max_val = np.max(y, axis=-1)
          # numerically unstable
          except RuntimeWarning as w:
            if 'overflow encountered' in str(w):
              is_overflow = True
            else:
              print(name, ':', w)
          # process with more numerical filtering
          else:
            if np.any(np.isclose(var, 0)):
              is_zero_var = True
            # very heuristic and aggressive here
            # filter-out anything with ~16.67% of low-var
            # this could remove 1/3 of the original data
            if np.sum(var < 0.01) > (len(y) / 6):
              is_small_var = True
      # return the flags
      yield (name, is_zero_len, is_min_frames,
             is_zero_var, is_small_var, is_overflow)

  # ====== running the multiprocessing filter ====== #
  zero_len_files = {}
  min_frame_files = {}
  zero_var_files = {}
  small_var_files = {}
  overflow_files = {}
  for res in mpi.MPI(jobs=sorted(indices.items(), key=lambda x: x[1][0]),
                     func=_mpi_func,
                     ncpu=NCPU if ncpu is None else int(ncpu),
                     batch=250):
    name = res[0]
    if res[1]:
      zero_len_files[name] = 1
    if res[2]:
      min_frame_files[name] = 1
    if res[3]:
      zero_var_files[name] = 1
    if res[4]:
      small_var_files[name] = 1
    if res[5]:
      overflow_files[name] = 1
    # update progress
    prog['name'] = name[:48]
    prog['zero-length'] = len(zero_len_files)
    prog['min-frames'] = len(min_frame_files)
    prog['zero-var'] = len(zero_var_files)
    prog['small-var'] = len(small_var_files)
    prog['overflow'] = len(overflow_files)
    prog.add(1)
  # ====== remove broken files ====== #
  if not bool(remove_min_length):
    min_frame_files = {}
  new_indices = {name: (start, end)
                 for name, (start, end) in indices.items()
                 if name not in zero_len_files and
                 name not in min_frame_files and
                 name not in zero_var_files and
                 name not in small_var_files and
                 name not in overflow_files}
  print("Filtered #utterances: %s/%s (files)" %
        (ctext(len(indices) - len(new_indices), 'lightcyan'),
         ctext(len(indices), 'cyan')))
  indices = new_indices
  # ====== store save data ====== #
  save_data['zero_len'] = zero_len_files
  save_data['min_dur'] = min_frame_files
  save_data['zero_var'] = zero_var_files
  save_data['small_var'] = small_var_files
  save_data['overflow'] = overflow_files
  # ====== filter-out by number of utt-per-speaker ====== #
  if bool(remove_min_uttspk):
    spk2utt = defaultdict(list)
    for name in indices.keys():
      spk2utt[spkid[name]].append(name)
    n_utt_removed = 0
    n_spk_removed = 0
    removed_utt = []
    keep_utt = []
    for spk, utt in spk2utt.items():
      if len(utt) < min_utt:
        n_utt_removed += len(utt)
        n_spk_removed += 1
        removed_utt += utt
      else:
        keep_utt += utt
    removed_utt = set(removed_utt)
    keep_utt = set(keep_utt)
    save_data['min_utt'] = removed_utt
    print("Removed min-utt/spk: %s/%s(utt)  %s/%s(spk)" % (
        ctext(n_utt_removed, 'lightcyan'), ctext(len(indices), 'cyan'),
        ctext(n_spk_removed, 'lightcyan'), ctext(len(spk2utt), 'cyan')))
    assert len(indices) == n_utt_removed + len(keep_utt), "Not possible!"
    indices = {name: (start, end)
               for name, (start, end) in indices.items()
               if name in keep_utt}
  # ====== sample by number of speakers ====== #
  if isinstance(n_speakers, Number) and n_speakers > 0:
    spk2utt = defaultdict(list)
    for name, (start, end) in indices.items():
      spk2utt[spkid[name]].append((name, (start, end)))
    n_org_spk = len(spk2utt)
    n_org_ids = len(indices)
    # only need down-sampling with a smaller number of speakers
    if n_speakers < n_org_spk:
      rand = np.random.RandomState(seed=Config.SUPER_SEED)
      tmp = list(spk2utt.keys())
      rand.shuffle(tmp)
      sampled_spk = tmp[:n_speakers]
      indices = []
      for spk in sampled_spk:
        indices += spk2utt[spk]
      indices = dict(indices)
    else:
      sampled_spk = spk2utt
    # print some log
    print("Selected: %s/%s(spk) which have %s/%s(utt)" % (
        ctext(len(sampled_spk), 'lightcyan'), ctext(n_org_spk, 'cyan'),
        ctext(len(indices), 'lightcyan'), ctext(n_org_ids, 'cyan')))
  # ====== return the new indices ====== #
  if save_path is not None:
    try:
      with open(save_path, 'wb') as save_file:
        pickle.dump(save_data, save_file)
    except Exception as e:
      print("Cannot save filtering data to path: '%s', error: '%s'" %
            (save_path, str(e)))
  return indices
def __call__(self, *inputs, **kwargs):
  show_progress = kwargs.pop('show_progress', False)
  # dictionary as inputs
  if len(kwargs) == len(self.inputs_name):
    inputs = [kwargs[i] for i in self.inputs_name]
  # ====== delete un-matched inputs ====== #
  inputs_new = []
  tmp = list(inputs)
  shapes = list(self._input_shape)
  # this process iteratively removes inputs whose shape mismatches
  # the currently given input
  for s in shapes:
    for i in tuple(tmp):
      if len(i.shape) != len(s) or \
         any(a is not None and a > 0 and a != b
             for a, b in zip(s, i.shape)):  # different ndim, or shape
        tmp.remove(i)
      else:
        inputs_new.append(i)
        tmp.remove(i)
        break
  if len(inputs_new) != len(self.inputs):
    raise ValueError("Given inputs have shape: %s, cannot match the shape "
                     "of defined inputs: %s" %
                     ('; '.join([str(i.shape) for i in inputs]),
                      '; '.join([str(i) for i in self.input_shape])))
  if not self._strict:
    inputs = inputs_new
  # ====== create feed_dict ====== #
  feed_dict = {}
  inputs = flatten_list(inputs, level=None)
  for tensor, value in zip(self.inputs, inputs):
    feed_dict[tensor] = value
  feed_dict.update(self.defaults)
  # check if modifying training mode
  if self.training is None:
    pass
  elif self.training:
    feed_dict.update({is_training(): True})
  else:
    feed_dict.update({is_training(): False})
  session = get_session()
  outputs = None
  # ====== mini-batches ====== #
  if self.batch_size is not None:
    batch_vars = ([i for i in feed_dict.keys() if is_tensor(i)]
                  if len(self.batch_vars) == 0 else self.batch_vars)
    batch_vars = [i for i in batch_vars
                  if i in feed_dict and hasattr(feed_dict[i], 'shape')]
    n_samples = list(set(feed_dict[i].shape[0] for i in batch_vars))
    assert len(n_samples) == 1, \
        "Data have multiple batching dimension: %s" % str(n_samples)
    n_samples = n_samples[0]
    # only continue if we have more samples than `batch_size`
    if n_samples > self.batch_size:
      n_output = len(self.outputs)
      outputs = []
      all_batches = []
      # (optional) showing progress
      if show_progress:
        prog = Progbar(target=n_samples,
                       print_report=False, print_summary=False, name='')
      for s, e in batching(batch_size=int(self.batch_size), n=n_samples):
        if show_progress:
          prog.add(e - s)
        all_batches.append(e - s)
        feed_dict_minibatch = OrderedDict([(k, v[s:e]) if k in batch_vars
                                           else (k, v)
                                           for k, v in feed_dict.items()])
        updated = session.run(self.outputs + [self.updates_ops],
                              feed_dict=feed_dict_minibatch)
        updated = updated[:n_output]
        if not self._return_list:
          updated = updated[0]
        outputs.append(updated)
      ## concatenate all outputs
      if not self._return_list:
        o_ndim = outputs[0].ndim
        if o_ndim == 0:  # returned scalars
          outputs = np.array(outputs)
        else:  # returned array
          for o_axis in range(o_ndim):
            all_n = [o.shape[o_axis] for o in outputs]
            if all_n == all_batches:
              break
          outputs = np.concatenate(outputs, axis=o_axis)
      ## returning a list of outputs
      else:
        new_outputs = []
        for output_idx in range(len(outputs[0])):
          o = [x[output_idx] for x in outputs]
          o_ndim = o[0].ndim
          if o_ndim == 0:  # returned scalars
            o = np.array(o)
          else:  # returned array
            for o_axis in range(o[0].ndim):
              all_n = [val.shape[o_axis] for val in o]
              if all_n == all_batches:
                break
            o = np.concatenate(o, axis=o_axis)
          new_outputs.append(o)
        outputs = new_outputs
  # ====== single batch ====== #
  if outputs is None:
    updated = session.run(self.outputs + [self.updates_ops],
                          feed_dict=feed_dict)
    outputs = updated[:len(self.outputs)]
    if not self._return_list:
      outputs = outputs[0]
  # ====== return final output ====== #
  return outputs
def prepare_dnn_data(save_dir, feat_name=None, utt_length=None, seq_mode=None, min_dur=None, min_utt=None, exclude=None, train_proportion=None, return_dataset=False): assert os.path.isdir(save_dir), \ "Path to '%s' is not a directory" % save_dir if feat_name is None: feat_name = FEATURE_NAME if utt_length is None: utt_length = int(_args.utt) if seq_mode is None: seq_mode = str(_args.seq).strip().lower() if min_dur is None: min_dur = MINIMUM_UTT_DURATION if min_utt is None: min_utt = MINIMUM_UTT_PER_SPEAKERS if exclude is None: exclude = str(_args.exclude).strip() print("Minimum duration: %s(s)" % ctext(min_dur, 'cyan')) print("Minimum utt/spk : %s(utt)" % ctext(min_utt, 'cyan')) # ******************** prepare dataset ******************** # path = os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE) assert os.path.exists(path), "Cannot find acoustic dataset at path: %s" % path ds = F.Dataset(path=path, read_only=True) rand = np.random.RandomState(seed=Config.SUPER_SEED) # ====== find the right feature ====== # assert feat_name in ds, "Cannot find feature with name: %s" % feat_name X = ds[feat_name] ids_name = 'indices_%s' % feat_name assert ids_name in ds, "Cannot find indices with name: %s" % ids_name # ====== basic path ====== # path_filtered_data = os.path.join(save_dir, 'filtered_files.pkl') path_train_files = os.path.join(save_dir, 'train_files.pkl') path_speaker_info = os.path.join(save_dir, 'speaker_info.pkl') # ******************** cannot find cached data ******************** # if any(not os.path.exists(p) for p in [path_filtered_data, path_train_files, path_speaker_info]): # ====== exclude some dataset ====== # if len(exclude) > 0: exclude_dataset = {i: 1 for i in exclude.split(',')} print("* Excluded dataset:", ctext(exclude_dataset, 'cyan')) indices = {name: (start, end) for name, (start, end) in ds[ids_name].items() if ds['dsname'][name] not in exclude_dataset} # special case exclude all the noise data if 'noise' in exclude_dataset: indices = {name: (start, end) for name, (start, end) in indices.items() if '/' not in name} else: indices = {i: j for i, j in ds[ids_name].items()} # ====== down-sampling if necessary ====== # if _args.downsample > 1000: dataset2name = defaultdict(list) # ordering the indices so we sample the same set every time for name in sorted(indices.keys()): dataset2name[ds['dsname'][name]].append(name) n_total_files = len(indices) n_sample_files = int(_args.downsample) # get the percentage of each dataset dataset2per = {i: len(j) / n_total_files for i, j in dataset2name.items()} # sampling based on percentage _ = {} for dsname, flist in dataset2name.items(): rand.shuffle(flist) n_dataset_files = int(dataset2per[dsname] * n_sample_files) _.update({i: indices[i] for i in flist[:n_dataset_files]}) indices = _ # ====== * filter out "bad" sample ====== # indices = filter_utterances(X=X, indices=indices, spkid=ds['spkid'], min_utt=min_utt, min_dur=min_dur, remove_min_length=True, remove_min_uttspk=True, n_speakers=None, ncpu=None, save_path=path_filtered_data) # ====== all training file name ====== # # modify here to train full dataset all_name = sorted(indices.keys()) rand.shuffle(all_name); rand.shuffle(all_name) n_files = len(all_name) print("#Files:", ctext(n_files, 'cyan')) # ====== speaker mapping ====== # name2spk = {name: ds['spkid'][name] for name in all_name} all_speakers = sorted(set(name2spk.values())) spk2label = {spk: i for i, spk in enumerate(all_speakers)} name2label = {name: spk2label[spk] for name, spk in name2spk.items()} assert len(name2label) == 
len(all_name) print("#Speakers:", ctext(len(all_speakers), 'cyan')) # ====== stratify sampling based on speaker ====== # valid_name = [] # create speakers' cluster label2name = defaultdict(list) for name, label in sorted(name2label.items(), key=lambda x: x[0]): label2name[label].append(name) # for each speaker with >= 3 utterance for label, name_list in sorted(label2name.items(), key=lambda x: x[0]): if len(name_list) < 3: continue n = max(1, int(0.05 * len(name_list))) # 5% for validation valid_name += rand.choice(a=name_list, size=n, replace=False).tolist() # train list is the rest _ = set(valid_name) train_name = [i for i in all_name if i not in _] # ====== split training and validation ====== # train_indices = {name: indices[name] for name in train_name} valid_indices = {name: indices[name] for name in valid_name} # ====== save cached data ====== # with open(path_train_files, 'wb') as fout: pickle.dump({'train': train_indices, 'valid': valid_indices}, fout) with open(path_speaker_info, 'wb') as fout: pickle.dump({'all_speakers': all_speakers, 'name2label': name2label, 'spk2label': spk2label}, fout) # ******************** load cached data ******************** # else: with open(path_train_files, 'rb') as fin: obj = pickle.load(fin) train_indices = obj['train'] valid_indices = obj['valid'] with open(path_speaker_info, 'rb') as fin: obj = pickle.load(fin) all_speakers = obj['all_speakers'] name2label = obj['name2label'] spk2label = obj['spk2label'] # ******************** print log ******************** # def summary_indices(ids): datasets = defaultdict(int) speakers = defaultdict(list) text = '' for name in sorted(ids.keys()): text += name + str(ids[name]) dsname = ds['dsname'][name] datasets[dsname] += 1 speakers[dsname].append(ds['spkid'][name]) for dsname in sorted(datasets.keys()): print(' %-18s: %s(utt) %s(spk)' % ( dsname, ctext('%6d' % datasets[dsname], 'cyan'), ctext(len(set(speakers[dsname])), 'cyan'))) print(' MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan')) # ====== training files ====== # print("#Train files:", ctext('%-8d' % len(train_indices), 'cyan'), "#spk:", ctext(len(set(name2label[name] for name in train_indices.keys())), 'cyan'), "#noise:", ctext(len([name for name in train_indices.keys() if '/' in name]), 'cyan')) summary_indices(ids=train_indices) # ====== valid files ====== # print("#Valid files:", ctext('%-8d' % len(valid_indices), 'cyan'), "#spk:", ctext(len(set(name2label[name] for name in valid_indices.keys())), 'cyan'), "#noise:", ctext(len([name for name in valid_indices.keys() if '/' in name]), 'cyan')) summary_indices(ids=valid_indices) # ******************** create the recipe ******************** # assert all(name in name2label for name in train_indices.keys()) assert all(name in name2label for name in valid_indices.keys()) recipes = prepare_dnn_feeder_recipe(name2label=name2label, n_speakers=len(all_speakers), utt_length=utt_length, seq_mode=seq_mode) # ====== downsample training set for analyzing if required ====== # if train_proportion is not None: assert 0 < train_proportion < 1 n_training = len(train_indices) train_indices = list(train_indices.items()) rand.shuffle(train_indices); rand.shuffle(train_indices) train_indices = dict(train_indices[:int(n_training * train_proportion)]) # ====== create feeder ====== # train_feeder = F.Feeder( data_desc=F.IndexedData(data=X, indices=train_indices), batch_mode='batch', ncpu=NCPU, buffer_size=256) valid_feeder = F.Feeder( data_desc=F.IndexedData(data=X, indices=valid_indices), batch_mode='batch', 
ncpu=max(2, NCPU // 4), buffer_size=64) train_feeder.set_recipes(recipes) valid_feeder.set_recipes(recipes) print(train_feeder) print(valid_feeder) # ====== debugging ====== # if IS_DEBUGGING: import matplotlib matplotlib.use('Agg') prog = Progbar(target=len(valid_feeder), print_summary=True, name="Iterating validation set") samples = [] n_visual = 250 for name, idx, X, y in valid_feeder.set_batch(batch_size=100000, batch_mode='file', seed=None, shuffle_level=0): assert idx == 0, "Utterances longer than %.2f(sec)" % (100000 * Config.STEP_LENGTH) prog['X'] = X.shape prog['y'] = y.shape prog.add(X.shape[0]) # random sampling if rand.rand(1) < 0.5 and len(samples) < n_visual: for i in rand.randint(0, X.shape[0], size=4, dtype='int32'): samples.append((name, X[i], np.argmax(y[i], axis=-1))) # plot the spectrogram n_visual = len(samples) V.plot_figure(nrow=n_visual, ncol=8) for i, (name, X, y) in enumerate(samples): is_noise = '/' in name assert name2label[name] == y, "Speaker label mismatch for file: %s" % name name = name.split('/')[0] dsname = ds['dsname'][name] spkid = ds['spkid'][name] y = np.argmax(y, axis=-1) ax = V.plot_spectrogram(X.T, ax=(n_visual, 1, i + 1), title='#%d' % (i + 1)) ax.set_title('[%s][%s]%s %s' % ('noise' if is_noise else 'clean', dsname, name, spkid), fontsize=6) # don't need to be high resolutions V.plot_save('/tmp/tmp.pdf', dpi=12) exit() # ====== return ====== # if bool(return_dataset): return train_feeder, valid_feeder, all_speakers, ds return train_feeder, valid_feeder, all_speakers
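The validation split in `prepare_dnn_data` above is stratified per speaker: every speaker with at least 3 utterances contributes roughly 5% of them to the validation set. A sketch of the same rule with plain dicts (the helper and its defaults are illustrative):

import numpy as np
from collections import defaultdict

def speaker_stratified_split(name2label, valid_frac=0.05, min_utt=3, seed=1234):
  rand = np.random.RandomState(seed)
  label2name = defaultdict(list)
  for name, label in sorted(name2label.items()):
    label2name[label].append(name)
  valid = []
  for label, names in sorted(label2name.items()):
    if len(names) < min_utt:
      continue  # too few utterances: keep them all for training
    n = max(1, int(valid_frac * len(names)))
    valid += rand.choice(names, size=n, replace=False).tolist()
  valid = set(valid)
  train = [n for n in name2label if n not in valid]
  return train, sorted(valid)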
def calculate_pca(dataset, feat_name='auto', batch_size=5218, override=False): """ Using parallel MiniBatchPCA to do PCA for multiple features at once. """ # TODO: add different pca prefix (e.g. pca_full_mspec, pca_sami_mspec) # add reading data from indices also # ====== check input dataset ====== # own_dataset = True if is_string(dataset) and os.path.isdir(dataset): dataset = Dataset(dataset, read_only=True) elif isinstance(dataset, Dataset): own_dataset = False elif isinstance(dataset, FeatureProcessor): dataset = Dataset(dataset.path, read_only=True) else: raise ValueError("Cannot acquire Dataset from input: %s" % str(dataset)) # ====== extract all feat_name ====== # if is_string(feat_name) and feat_name == 'auto': feat_name = [] for k in dataset.keys(): X = dataset[k] if hasattr(X, 'ndim') and X.ndim == 2 and X.shape[-1] > 1: feat_name.append(k) else: feat_name = [name for name in as_tuple(feat_name, t=str) if name in dataset] # ====== load PCA ====== # from odin.ml import MiniBatchPCA # init PCA nb_samples = 0 for feat in feat_name: nb_samples += dataset[feat].shape[0] # ====== prepare MPI PCA ====== # add_notification("Selected features for PCA: " + ctext(', '.join(feat_name), 'yellow')) def map_pca(name): X = dataset[name] # found existing pca model if 'pca_' + name in dataset and not override: pca = dataset['pca_' + name] # create new PCA else: pca = MiniBatchPCA(n_components=None, whiten=False, copy=True, batch_size=None) # No shuffling makes the iteration much faster for x in X.set_batch(batch_size=batch_size, seed=None, shuffle_level=0): pca.partial_fit(x) yield x.shape[0] # save PCA model with open(os.path.join(dataset.path, 'pca_' + name), 'wb') as f: cPickle.dump(pca, f, protocol=cPickle.HIGHEST_PROTOCOL) # finished, return the feature name yield name mpi = MPI(jobs=feat_name, func=map_pca, ncpu=None, batch=1, hwm=12082518, backend='python') # ====== running the MPI ====== # remain_features = list(feat_name) finished_features = [] prog = Progbar(target=nb_samples, print_summary=True, print_report=True, name='PCA') for n in mpi: if is_string(n): remain_features.remove(n) finished_features.append(n) else: prog['Remain'] = ', '.join(remain_features) prog['Finished'] = ', '.join(finished_features) prog.add(n) # ====== return ====== # if own_dataset: dataset.close()
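`MiniBatchPCA` is odin's own class, but it follows the same `partial_fit` contract as scikit-learn's `IncrementalPCA`, so the batch-wise fitting pattern above can be sketched with the latter (array sizes here are arbitrary):

import numpy as np
from sklearn.decomposition import IncrementalPCA

X = np.random.randn(10000, 40).astype('float32')
pca = IncrementalPCA(n_components=20)
batch_size = 512  # must stay >= n_components for partial_fit
for start in range(0, X.shape[0], batch_size):
  pca.partial_fit(X[start:start + batch_size])
Z = pca.transform(X[:256])
print(Z.shape)  # (256, 20)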
with np.warnings.catch_warnings(): rand = np.random.RandomState(seed=Config.SUPER_SEED) np.warnings.filterwarnings('ignore') # ====== stratify sampling from each dataset ====== # clusters = defaultdict(list) clusters_count = defaultdict(int) samples = [] for row in sorted(ALL_FILES, key=lambda x: x[0]): clusters[row[4]].append(row) clusters_count[row[4]] += 1 for k, v in clusters.items(): rand.shuffle(v) samples += v[:18] # 18 files from each dataset # ====== run the MPI for feature extraction ====== # prog = Progbar(target=len(samples), print_report=True, print_summary=False, name=FEATURE_RECIPE) error_signal = [] for feat in mpi.MPI(jobs=samples, func=recipe.transform, ncpu=NCPU, batch=1): assert FEATURE_NAME in feat # update progress if isinstance(feat, pp.base.ExtractorSignal): error_signal.append(feat) prog.add(1) continue prog['spkid'] = feat['spkid'] prog['name'] = feat['name'] prog['dsname'] = feat['dsname']
class Task(object): """ Parameters ---------- func: call-able function that will be executed for each iteration data: single or list of odin.fuel.Data, numpy.ndarray iterate over all these data and execute the function on them. epoch: int how many epochs will be repeated p: float (0.0 - 1.0) probability that `func` will be executed for each iteration batch_size: int (> 0) number of samples for each iteration seed: int random seed for shuffling the data shuffle_level: int (0, 1, 2) if 0, shuffle the file lists if 1, shuffle the buffer (i.e. list of processing files) and all the previous if 2, shuffle the returned batch and all the previous callbacks: None, or list of `odin.training.Callback` callbacks that will be invoked during the execution of the task labels: None, or list of string labels for printing the confusion matrix in `odin.utils.Progbar` name: None or string unique name for Task identity. verbose : {0, 1, 2, 3, 4, 5} specific verbose level controlling the log output 0 - Turn off all log 1 - progress off, only notification 2 - progress off, notification and summary 3 - progress on, nothing else 4 - progress on, notification and summary 5 - progress on, notification, summary and batch report """ def __init__(self, func, data, epoch=1, p=1.0, batch_size=128, seed=None, shuffle_level=2, callbacks=None, labels=None, name=None, verbose=2): super(Task, self).__init__() self.set_func(func, data) # this Progbar will record the history as well self._labels = [str(l) for l in labels] \ if labels is not None else None self._progbar = Progbar(target=self.nb_samples, name=name, interval=0., print_report=True, print_summary=True) self._progbar.set_labels(self._labels) # ====== set callback and verbose ====== # self._callback = CallbackList(callbacks) self.set_verbose(verbose) # ====== assign other arguments ====== # self._nb_epoch = epoch self._p = np.clip(p, 0., 1.)
self._seed = seed self.set_batch(batch_size, seed, shuffle_level) self._name = name # ====== current info ====== # self._curr_epoch = 0 self._curr_iter = 0 self._curr_samples = 0 self._curr_epoch_iter = 0 self._curr_epoch_samples = 0 self._callback_msg = [] # ====== iter tracking ====== # self._created_iter = None self._stop = False def __str__(self): return "<Task:'%s' p:%s bs:%s #ep:%s/%s #it:%s/%s #n:%s/%s %s>" % \ (ctext(self.name, 'lightyellow'), ctext(self.probability, 'cyan'), ctext(self.batch_size, 'cyan'), ctext(self.curr_epoch, 'lightcyan'), ctext(self.nb_epoch, 'cyan'), ctext(self.curr_epoch_iter, 'lightcyan'), ctext(self.curr_iter, 'cyan'), ctext(self.curr_epoch_samples, 'lightcyan'), ctext(self.curr_samples, 'cyan'), ','.join([ctext(i.__class__.__name__, 'cyan') for i in self._callback._callbacks])) def __getstate__(self): return (self._progbar, self._nb_epoch, self._p, self._name, self._batch_size, self._rng, self._seed, self._shuffle_level, self._verbose) def __setstate__(self, states): (self._progbar, self._nb_epoch, self._p, self._name, self._batch_size, self._rng, self._seed, self._shuffle_level, self._verbose) = states # ====== current info ====== # self._curr_epoch = 0 self._curr_iter = 0 self._curr_samples = 0 self._curr_epoch_iter = 0 self._curr_epoch_samples = 0 self._callback_msg = [] # ====== iter tracking ====== # self._created_iter = None self._stop = False # ====== reset value of func and data ====== # self._func = None self._data = None def set_callbacks(self, callbacks): self._callback.set_callbacks(callbacks) if self._verbose == 0: self._callback.set_notification(False) else: self._callback.set_notification(True) return self def set_verbose(self, verbose): verbose = int(verbose) self._verbose = verbose if verbose == 0: # turn off everything self._callback.set_notification(False) self._progbar.print_progress = False self._progbar.print_summary = False self._progbar.print_report = False elif verbose == 1: # progress off, only notification self._callback.set_notification(True) self._progbar.print_progress = False self._progbar.print_summary = False self._progbar.print_report = False elif verbose == 2: # progress off, notification + summary self._callback.set_notification(True) self._progbar.print_progress = False self._progbar.print_summary = True self._progbar.print_report = False elif verbose == 3: # progress on, nothing else self._callback.set_notification(False) self._progbar.print_progress = True self._progbar.print_summary = False self._progbar.print_report = False elif verbose == 4: # progress on, notification + summary self._callback.set_notification(True) self._progbar.print_progress = True self._progbar.print_summary = True self._progbar.print_report = False elif verbose == 5: # progress on, notification, report, summary self._callback.set_notification(True) self._progbar.print_progress = True self._progbar.print_summary = True self._progbar.print_report = True else: raise ValueError( "Only support verbose value: 0, 1, 2, 3, 4, 5; but given: %s" % str(verbose)) def set_func(self, func, data): # ====== check function ====== # self._func = func if isinstance(func, K.Function): self._output_info = [(o.name, o.shape.as_list()) for o in self._func.outputs] elif hasattr(func, '__call__'): self._output_info = [] # No info (normal function) else: raise ValueError("No support for function type: %s" % func.__class__.__name__) # ====== check data ====== # if not isinstance(data, (tuple, list)): data = [data] self._data = [fuel.as_data(i, copy=not isinstance(i, 
fuel.Feeder)) for i in data] self._nb_samples = min([d.iter_len for d in self._data]) return self def set_batch(self, batch_size=None, seed=-1, shuffle_level=None): if batch_size is not None: self._batch_size = batch_size if seed is None or seed >= 0: if seed is not None: self._rng = np.random.RandomState(seed) else: self._rng = struct() self._rng.randint = lambda x: None self._rng.rand = get_rng().rand if shuffle_level is not None: self._shuffle_level = min(max(int(shuffle_level), 0), 2) return self # ==================== Properties ==================== # @property def history(self): """ Return : dictionary type {epoch_id : {tensor_name0: [batch_return1, batch_return2, ...], tensor_name1: [batch_return1, batch_return2, ...], ...}, 1 : {tensor_name0: [batch_return1, batch_return2, ...], tensor_name1: [batch_return1, batch_return2, ...], ...}, ... } Example ------- >>> for task_name, task_hist in task.history.items(): >>> print(task_name) >>> for epoch_id, values in task_hist.items(): >>> print(' Epoch:', epoch_id) >>> for tensor_name, v in values.items(): >>> print(' ', tensor_name, len(v)) """ return self._progbar.history @property def progbar(self): return self._progbar @property def name(self): return str(self._name) @property def labels(self): return self._labels @property def nb_epoch(self): return self._nb_epoch @property def nb_samples(self): ''' Estimated number of iteration for each epoch ''' return self._nb_samples @property def probability(self): """Chance that the func will be execute during iteration""" return self._p @property def iter_per_epoch(self): ''' Estimated number of iteration for each epoch ''' return int(np.ceil(self._nb_samples / self._batch_size)) @property def batch_size(self): return self._batch_size @property def curr_epoch(self): """Total number of epoch finished since the beginning of the Task""" return self._curr_epoch @property def curr_iter(self): """Total number of iteration finished since the beginning of the Task""" return self._curr_iter @property def curr_samples(self): """Total number of samples finished since the beginning of the Task""" return self._curr_samples @property def curr_epoch_iter(self): """Number of iteration within current epoch""" return self._curr_epoch_iter @property def curr_epoch_samples(self): """Number of samples within current epoch""" return self._curr_epoch_samples @property def callback_msg(self): return self._callback_msg # ==================== control function ==================== # def stop(self): """ Stop all iterations running for this Task""" if self._created_iter is not None: self._stop = True # just run to end of the iterators for i in self._created_iter: pass self._stop = False self._created_iter = None def copy(self): return Task(self._func, self._data, epoch=self.nb_epoch, p=self.probability, batch_size=self.batch_size, seed=self._seed, shuffle_level=self._shuffle_level, name=self._name, verbose=self._verbose) def __iter(self): ''' Return ------ One of the following: * 'task_start': * 'epoch_start' : beginning of epoch * 'epoch_end' : epoch ended * 'task_end' : task ended * (results, nb_iter, nb_samples, nb_total_samples, nb_epoch) : results of execute function on data Note ---- 'end_task' also end of final epoch ''' yield None # just for initalize the iterator self._callback_msg = self._callback.task_start(self) yield 'task_start' if self._stop: yield 'task_end' else: # ====== start of training ====== # while self._curr_epoch < self._nb_epoch: self._callback_msg = self._callback.epoch_start(self, self._data) 
yield 'epoch_start' seed = self._rng.randint(10e8) # if only 1 Data, don't need zip or we will mess up if len(self._data) == 1: data_it = iter(self._data[0].set_batch(batch_size=self._batch_size, seed=seed, shuffle_level=self._shuffle_level)) data = data_it else: data_it = [iter(d.set_batch(batch_size=self._batch_size, seed=seed, shuffle_level=self._shuffle_level)) for d in self._data] data = zip(*data_it) # ====== start the iteration ====== # self._curr_epoch_samples = 0 self._curr_epoch_iter = 0 with self._progbar.safe_progress(): for i, x in enumerate(data): # alread terminated, try to exhausted the iterator # if forced_to_terminate: continue # preprocessed the data if not isinstance(x, (tuple, list)): x = [x] # update some info shape0 = x[0].shape[0] self._curr_samples += shape0 self._curr_iter += 1 self._curr_epoch_samples += shape0 self._curr_epoch_iter += 1 self._callback_msg = self._callback.batch_start(self, x) # apply the function if self.probability >= 1. or self._rng.rand() < self.probability: results = self._func(*x) # add msg from batch_end event self._callback_msg += self._callback.batch_end(self, results) # return results yield results # update the progress bar for (name, shape), res in zip(self._output_info, as_tuple(results)): if len(shape) == 0: # return single value self._progbar[name] = res else: # return tensor self._progbar[name] = res self._progbar.add(shape0) # check TERMINATE signal if self._stop: # send signal to the data iterators also for i in data_it: if hasattr(i, 'stop'): i.stop() else: # just iterate all over for _ in i: pass # break the epoch loop break ### Epoch end signaling self._curr_epoch += 1 self._callback_msg = self._callback.epoch_end( self, self._progbar.history[self._curr_epoch - 1]) yield 'epoch_end' # ====== check if we got the right number for epoch iter ====== # if self._curr_epoch_samples != self._nb_samples: # just for sure should not smaller than the real number self._nb_samples = self._curr_epoch_samples # ====== end_epoch or task ====== # if self._stop or self._curr_epoch >= self._nb_epoch: self._callback_msg = self._callback.task_end( self, self._progbar.history) yield 'task_end' # showing notification if self._verbose >= 1 and self._verbose != 3: self._progbar.add_notification('Task "%s" ended!' % str(self.name)) break # ====== end of iteration ====== # self._created_iter = None def __iter__(self): if self._created_iter is None: # reset all information self._curr_epoch = 0 self._curr_iter = 0 self._curr_samples = 0 self._curr_epoch_iter = 0 self._curr_epoch_samples = 0 self._callback_msg = [] # create new iter self._created_iter = self.__iter() # initialize the iteration next(self._created_iter) return self._created_iter def __del__(self): self.stop()
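Because `__iter__` interleaves signal strings with per-batch results, consumers are expected to dispatch on both. A sketch of such an event loop, assuming `task` is a constructed `Task` instance (the print statements are illustrative):

def consume(task):
  # dispatch on the signal strings documented in __iter's docstring;
  # anything else is the per-batch return value of `func`
  for item in task:
    if item == 'task_start':
      print('task started')
    elif item == 'epoch_start':
      print('epoch %d started' % task.curr_epoch)
    elif item == 'epoch_end':
      print('epoch done, %d samples seen' % task.curr_epoch_samples)
    elif item == 'task_end':
      break  # 'task_end' also marks the end of the final epoch
    else:
      pass  # mini-batch result from func(*x)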
def fast_pca(*x, n_components=None, algo='rpca', y=None, batch_size=1024, return_model=False, random_state=5218): """ A shortcut for many different PCA algorithms Parameters ---------- x : {list, tuple} list of matrices for transformation, the first matrix will be used for training n_components : {None, int} number of PCA components algo : {'pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'} different PCA algorithm: 'ipca' - IncrementalPCA, 'ppca' - Probabilistic PCA, 'sppca' - Supervised Probabilistic PCA, 'plda' - Probabilistic LDA, 'rpca' - randomized PCA using randomized SVD y : {numpy.ndarray, None} required for labels in case of `sppca` batch_size : int (default: 1024) batch size, only used for IncrementalPCA return_model : bool (default: False) if True, return the trained PCA model as the FIRST return """ batch_size = int(batch_size) algo = str(algo).lower() if algo not in ('pca', 'ipca', 'ppca', 'sppca', 'plda', 'rpca'): raise ValueError("`algo` must be one of the following: 'pca', " "'ppca', 'plda', 'sppca', or 'rpca'; but given: '%s'" % algo) if algo in ('sppca', 'plda') and y is None: raise RuntimeError("`y` must be not None if `algo='sppca'`") x = flatten_list(x, level=None) x = [i[:] if i.__class__.__name__ == 'MmapData' else i for i in x] # ====== check input ====== # x_train = x[0] x_test = x[1:] input_shape = None if x_train.ndim > 2: # only 2D for PCA input_shape = (-1,) + x_train.shape[1:] new_shape = (-1, np.prod(input_shape[1:])) x_train = np.reshape(x_train, new_shape) x_test = [np.reshape(x, new_shape) for x in x_test] if n_components is not None: # no need to reshape back input_shape = None # ====== train PCA ====== # if algo == 'sppca': pca = SupervisedPPCA(n_components=n_components, random_state=random_state) pca.fit(x_train, y) elif algo == 'plda': from odin.ml import PLDA pca = PLDA(n_phi=n_components, random_state=random_state) pca.fit(x_train, y) elif algo == 'pca': pca = PCA(n_components=n_components, random_state=random_state) pca.fit(x_train) elif algo == 'rpca': # we copy the implementation of RandomizedPCA because # it is significantly faster than PCA(svd_solver='randomize') pca = RandomizedPCA(n_components=n_components, iterated_power=2, random_state=random_state) pca.fit(x_train) elif algo == 'ipca': pca = IncrementalPCA(n_components=n_components, batch_size=batch_size) prog = Progbar(target=x_train.shape[0], print_report=False, print_summary=False, name="Fitting PCA") for start, end in batching(batch_size=batch_size, n=x_train.shape[0], seed=5218): pca.partial_fit(x_train[start:end], check_input=False) prog.add(end - start) elif algo == 'ppca': pca = PPCA(n_components=n_components, random_state=random_state) pca.fit(x_train) # ====== transform ====== # x_train = pca.transform(x_train) x_test = [pca.transform(x) for x in x_test] # reshape back to original shape if necessary if input_shape is not None: x_train = np.reshape(x_train, input_shape) x_test = [np.reshape(x, input_shape) for x in x_test] # return the results if len(x_test) == 0: return x_train if not return_model else (pca, x_train) return tuple([x_train] + x_test) if not return_model else tuple([pca, x_train] + x_test)
loss = tf.losses.log_loss(labels=X, predictions=X_probas) # =========================================================================== # Optimizing the network # =========================================================================== update_ops = K.optimizers.Adam(lr=0.001).minimize(loss) K.initialize_all_variables() # ====== intitalize ====== # record_train_loss = [] record_valid_loss = [] patience = 3 epoch = 0 # We want the rate to go up but the distortion to go down while True: # ====== training ====== # train_losses = [] prog = Progbar(target=X_train.shape[0], name='Epoch%d' % epoch) start_time = timeit.default_timer() for start, end in batching(batch_size=args.bs, n=X_train.shape[0], seed=K.get_rng().randint(10e8)): _ = K.eval(loss, feed_dict={X: X_train[start:end]}, update_after=update_ops) prog.add(end - start) train_losses.append(_) # ====== training log ====== # print(ctext("[Epoch %d]" % epoch, 'yellow'), '%.2f(s)' % (timeit.default_timer() - start_time)) print("[Training set] Loss: %.4f" % np.mean(train_losses)) # ====== validation set ====== # code_samples, lo = K.eval([Z, loss], feed_dict={X: X_valid}) print("[Valid set] Loss: %.4f" % lo) # ====== record the history ====== # record_train_loss.append(np.mean(train_losses))
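The `record_valid_loss` and `patience` variables initialized above set up early stopping; one common patience rule can be sketched as follows (this helper is illustrative, not necessarily the exact rule this script applies):

def should_stop(valid_losses, patience=3):
  # stop when the last `patience` epochs all failed to improve on the
  # best loss seen before them
  if len(valid_losses) <= patience:
    return False
  best = min(valid_losses[:-patience])
  return all(v >= best for v in valid_losses[-patience:])

assert not should_stop([1.0, 0.8, 0.7, 0.6])      # still improving
assert should_stop([1.0, 0.5, 0.6, 0.6, 0.7])     # 3 epochs without improvement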
shutil.rmtree(wav_path) elif len(os.listdir(wav_path)) != TOTAL_FILES: print("Found only %d files at '%s', delete old wave files" % (len(os.listdir(wav_path)), wav_path)) shutil.rmtree(wav_path) # ====== convert all compress audio to .wav using sph2pipe ====== # if not os.path.exists(wav_path): os.mkdir(wav_path) cmds = ["sph2pipe %s %s -f rif" % (path, os.path.join(wav_path, get_name(path))) for path in audio_files] def mpi_fn(cmd): exec_commands(cmd, print_progress=False) yield len(cmd) prog = Progbar(target=len(cmds), print_report=True, print_summary=True, name='Converting .sph to .wav') # run the MPI tasks mpi = MPI(jobs=cmds, func=mpi_fn, ncpu=cpu_count() - 1, batch=12) for i in mpi: prog.add(i) # =========================================================================== # Extract Acoustic features # =========================================================================== jobs = get_all_files(wav_path, filter_func=lambda x: '.wav' == x[-4:]) assert len(jobs) == TOTAL_FILES # ====== configuration ====== # if not os.path.exists(outpath) or args.ds: extractors = pp.make_pipeline(steps=[
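`MPI` and `exec_commands` are odin utilities; the same fan-out over shell commands can be sketched with only the standard library (the `echo` commands below are placeholders for real `sph2pipe` invocations):

import subprocess
from multiprocessing import Pool

def run_cmd(cmd):
  # run one shell command, return its exit code
  return subprocess.run(cmd, shell=True).returncode

if __name__ == '__main__':
  cmds = ['echo converting file%d' % i for i in range(8)]
  with Pool(processes=4) as pool:
    for i, rc in enumerate(pool.imap_unordered(run_cmd, cmds)):
      print('done %d/%d (exit=%d)' % (i + 1, len(cmds), rc))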
def run(self): njobs = len(self.jobs) dataset = Dataset(self.path) if self.n_cache <= 1: cache_limit = max(2, int(0.12 * njobs)) else: cache_limit = int(self.n_cache) # ====== indices ====== # databases = defaultdictkey( lambda key: MmapDict(path=os.path.join(dataset.path, key), cache_size=10000, read_only=False)) last_start = defaultdict(int) # ====== statistic ====== # # load old statistics stats = defaultdict(lambda: [0, 0]) # name -> (sum1, sum2) for key in dataset.keys(): if 'sum1' == key[-4:]: stats[key[:-4]][0] = dataset[key][:] elif 'sum2' == key[-4:]: stats[key[:-4]][1] = dataset[key][:] # all data are cached and periodically flushed cache = defaultdict(list) n_processed = [0] # stored in a list so the nested functions can update it by reference # ====== helper ====== # def flush_feature(feat_name, X_cached): if len(X_cached) > 0: X_cached = np.concatenate(X_cached, 0) # flush data if feat_name in dataset: dataset[feat_name].append(X_cached) else: dataset[(feat_name, 'memmap')] = X_cached # ====== repeated for each result returned ====== # def post_processing(result): # search for file name if self.identifier not in result: raise RuntimeError( "Cannot find identifier '%s' in returned dictionary" % self.identifier) file_name = result[self.identifier] # invalid file_name if not is_string(file_name): raise RuntimeError( "Cannot find file name in returned features " "list, the file name can be specified in key: 'name', 'path' " "and the type of the value must be string. All available " "keys are: %s" % str(result.keys())) # store all new indices # mapping [X.shape[0]] -> [feat_name, feat_name, ...] all_indices = {} # processing for feat_name, X in result.items(): # some invalid feat_name if feat_name in ('config', 'pipeline', 'sum1', 'sum2'): raise RuntimeError( "Returned features' name cannot be one " "of the following: 'config', 'pipeline', 'sum1', 'sum2'."
) # ignore some feat_name if feat_name in ('name',): continue # if numpy ndarray, save to MmapData if isinstance(X, np.ndarray) or \ 'sum1' == feat_name[-4:] or \ 'sum2' == feat_name[-4:]: # save statistics instead if 'sum1' == feat_name[-4:]: stats[feat_name[:-4]][0] += X elif 'sum2' == feat_name[-4:]: stats[feat_name[:-4]][1] += X # save features array else: all_indices[feat_name] = X.shape[0] # cache data, only if we have more than 0 sample if X.shape[0] > 0: cache[feat_name].append(X) # all other kinds of data are saved to MmapDict else: databases[feat_name][file_name] = X # remove data del X # ====== update indices ====== # if len(all_indices) > 0: for feat_name, n in all_indices.items(): ids_name = 'indices_%s' % feat_name databases[ids_name][file_name] = (last_start[ids_name], last_start[ids_name] + n) last_start[ids_name] += n # ====== flush cache ====== # n_processed[0] += 1 if n_processed[0] % cache_limit == 0: for feat_name, X_cached in cache.items(): flush_feature(feat_name, X_cached) cache.clear() # ====== update progress ====== # return file_name # ====== mapping function ====== # def _map_func(dat): try: ret = self.extractor.transform(dat) except Exception as e: # Non-handled exception ret = '\n========\n' ret += 'Time : `%s`\n' % str( get_formatted_datetime(only_number=False)) ret += 'Error : `%s`\n' % str(e) ret += 'Input : `%s`\n' % str(dat) import traceback etype, value, tb = sys.exc_info() for line in traceback.TracebackException( type(value), value, tb, limit=None).format(chain=True): ret += line return ret # ====== processing ====== # mpi = MPI(jobs=self.jobs, func=_map_func, ncpu=self.n_cpu, batch=1, hwm=self.n_cpu * 3, backend='python') # initialize prog = Progbar(target=njobs, name=self.path, interval=0.12, print_report=True, print_summary=True) start_time = time.time() last_time = time.time() last_count = 0 with open(self._log_path, 'w') as flog: # writing the log head flog.write('============================\n') flog.write('Start Time : %s\n' % get_formatted_datetime(only_number=False)) flog.write('Outpath : %s\n' % self.path) flog.write('Extractor : %s\n' % '->'.join( [s[-1].__class__.__name__ for s in self.extractor.steps])) flog.write('#Jobs : %d\n' % njobs) flog.write('#CPU : %d\n' % self.n_cpu) flog.write('#Cache : %d\n' % cache_limit) flog.write('============================\n') flog.flush() # start processing the file list for count, result in enumerate(mpi): # Non-handled exception if isinstance(result, string_types): flog.write(result) flog.flush() self._error_log.append(result) if self.stop_on_failure: raise RuntimeError(result) # some error might have happened elif isinstance(result, ExtractorSignal): flog.write(str(result)) flog.flush() if result.action == 'error': prog.add_notification(str(result)) raise RuntimeError( "ExtractorSignal requested terminating the processor!") elif result.action == 'warn': prog.add_notification(str(result)) elif result.action == 'ignore': self._error_log.append(result) else: raise RuntimeError( "Unknown action from ExtractorSignal: %s" % result.action) prog['File'] = '%-48s' % result.message[:48] # otherwise, no error happened, do post-processing else: name = post_processing(result) prog['File'] = '%-48s' % str(name)[:48] # update progress prog.add(1) # manually write to external log file if (count + 1) % max(1, int(0.01 * njobs)) == 0: curr_time = time.time() elap = curr_time - start_time avg_speed = (count + 1) / elap cur_speed = (count + 1 - last_count) / (curr_time - last_time) avg_est = (njobs - count - 1) / avg_speed
cur_est = (njobs - count - 1) / cur_speed flog.write( '[%s] Processed: %d(files) Remain: %d(files) Elap.: %.2f(secs)\n' ' Avg.Spd: %.2f(obj/sec) Avg.Est.: %.2f(secs)\n' ' Cur.Spd: %.2f(obj/sec) Cur.Est.: %.2f(secs)\n' % (get_formatted_datetime(only_number=False), count + 1, njobs - count - 1, elap, avg_speed, avg_est, cur_speed, cur_est)) flog.flush() last_time = curr_time last_count = count + 1 # ====== end, flush the last time ====== # for feat_name, X_cached in cache.items(): flush_feature(feat_name, X_cached) cache.clear() cache = None dataset.flush() prog.add_notification("Flushed all data to disk") # ====== saving indices ====== # for name, db in databases.items(): db.flush(save_all=True) db_size = len(db) db.close() prog.add_notification( 'Flush MmapDict "%s" to disk, size: %s' % (ctext(name, 'yellow'), ctext(str(db_size), 'yellow'))) # ====== save mean and std ====== # def save_mean_std(sum1, sum2, name): N = dataset[name.split('_')[0]].shape[0] mean = sum1 / N std = np.sqrt(sum2 / N - np.power(mean, 2)) if np.any(np.isnan(mean)): wprint('Mean contains NaN, name: %s' % name) if np.any(np.isnan(std)): wprint('Std contains NaN, name: %s' % name) dataset[name + 'sum1'] = sum1 dataset[name + 'sum2'] = sum2 dataset[name + 'mean'] = mean dataset[name + 'std'] = std # save all stats if len(stats) > 0: for feat_name, (sum1, sum2) in stats.items(): save_mean_std(sum1, sum2, feat_name) prog.add_notification( 'Saved statistics of: %s, shape: %s' % (ctext(feat_name.split('_')[0], 'yellow'), ctext(str(sum1.shape), 'yellow'))) # ====== dataset flush() ====== # dataset.flush() dataset.close() # ====== saving the extractor ====== # # not good idea to save the extractor all the time # pipeline_path = os.path.join(dataset.path, 'pipeline') # with open(pipeline_path, 'wb') as f: # cPickle.dump(self.extractor, f, protocol=2) # prog.add_notification("Saved Extractor pipeline at: %s" % # ctext(pipeline_path, 'yellow')) # ====== saving the configuration ====== # config_path = os.path.join(dataset.path, 'config') config = MmapDict(config_path) config['__configuration_time__'] = time.time() config['__processor__'] = self.path for i in dir(self): if _default_module.match(i) is not None: continue j = getattr(self, i) if isinstance(j, (Number, string_types, bool)): config[i] = j config.flush(save_all=True) self.config = {i: j for i, j in config} config.close() prog.add_notification("Saved configuration at: %s" % ctext(config_path, 'yellow')) # ====== final notification ====== # prog.add_notification("Closed all dataset.") prog.add_notification("Dataset at path: %s" % ctext(dataset.path, 'yellow'))
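`save_mean_std` above recovers the mean and standard deviation from the running sums `sum1 = Σx` and `sum2 = Σx²` via mean = sum1/N and std = sqrt(sum2/N - mean²); a quick numpy check of that identity:

import numpy as np

X = np.random.randn(1000, 40)
s1 = X.sum(axis=0)          # sum1: running sum of x
s2 = (X ** 2).sum(axis=0)   # sum2: running sum of x squared
N = X.shape[0]
mean = s1 / N
std = np.sqrt(s2 / N - mean ** 2)
assert np.allclose(mean, X.mean(axis=0))
assert np.allclose(std, X.std(axis=0))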
for feat_name in features: # ====== check all indices not overlap ====== # all_indices = [F.MmapDict(path=os.path.join(i, 'indices_%s' % feat_name), read_only=True) for i in inpath] _ = [] for ids in all_indices: _ += list(ids.keys()) assert len(_) == len(set(_)), "Duplicated utterance names found across indices" # ====== initialize ====== # out_data = None out_indices = {} start = 0 curr_nfile = 0 prog = Progbar(target=sum(len(i) for i in all_indices), print_summary=True, name=outpath) for i, path in enumerate(inpath): in_data = F.MmapData(path=os.path.join(path, feat_name), read_only=True) in_indices = all_indices[i] # initialize if out_data is None: out_data = F.MmapData(path=os.path.join(outpath, feat_name), dtype=in_data.dtype, shape=(0,) + in_data.shape[1:], read_only=False) # copy data for name, (s, e) in list(in_indices.items()): X = in_data[s:e]
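The merge loop shifts each source's (start, end) pairs by a running offset so they index into the concatenated output array. A standalone sketch of that bookkeeping (the function name is illustrative):

def merge_indices(all_indices):
  # all_indices: one {name: (start, end)} dict per input dataset
  merged, offset = {}, 0
  for indices in all_indices:
    for name, (s, e) in sorted(indices.items(), key=lambda x: x[1][0]):
      merged[name] = (offset, offset + (e - s))
      offset += e - s
  return merged

print(merge_indices([{'a': (0, 5)}, {'b': (0, 3), 'c': (3, 7)}]))
# {'a': (0, 5), 'b': (5, 8), 'c': (8, 12)}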
def prepare_dnn_data(recipe, feat, utt_length, seed=52181208): """ Return ------ train_feeder : Feeder for training valid_feeder : Feeder for validating test_ids : Test indices test_dat : Data array all_speakers : list of all speaker in training set """ # Load dataset frame_length = int(utt_length / FRAME_SHIFT) ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True) X = ds[feat] train_indices = {name: ds['indices'][name] for name in TRAIN_DATA.keys()} test_indices = {name: start_end for name, start_end in ds['indices'].items() if name not in TRAIN_DATA} train_indices, valid_indices = train_valid_test_split( x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed) all_speakers = sorted(set(TRAIN_DATA.values())) n_speakers = max(all_speakers) + 1 print("#Train files:", ctext(len(train_indices), 'cyan')) print("#Valid files:", ctext(len(valid_indices), 'cyan')) print("#Test files:", ctext(len(test_indices), 'cyan')) print("#Speakers:", ctext(n_speakers, 'cyan')) recipes = [ F.recipes.Sequencing(frame_length=frame_length, step_length=frame_length, end='pad', pad_value=0, pad_mode='post', data_idx=0), F.recipes.Name2Label(lambda name:TRAIN_DATA[name], ref_idx=0), F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1) ] train_feeder = F.Feeder( data_desc=F.IndexedData(data=X, indices=train_indices), batch_mode='batch', ncpu=7, buffer_size=12) valid_feeder = F.Feeder( data_desc=F.IndexedData(data=X, indices=valid_indices), batch_mode='batch', ncpu=2, buffer_size=4) train_feeder.set_recipes(recipes) valid_feeder.set_recipes(recipes) print(train_feeder) # ====== cache the test data ====== # cache_dat = os.path.join(PATH_EXP, 'test_%s_%d.dat' % (feat, int(utt_length))) cache_ids = os.path.join(PATH_EXP, 'test_%s_%d.ids' % (feat, int(utt_length))) # validate cache files if os.path.exists(cache_ids): with open(cache_ids, 'rb') as f: ids = pickle.load(f) if len(ids) != len(test_indices): os.remove(cache_ids) if os.path.exists(cache_dat): os.remove(cache_dat) elif os.path.exists(cache_dat): os.remove(cache_dat) # caching if not os.path.exists(cache_dat): dat = F.MmapData(cache_dat, dtype='float16', shape=(0, frame_length, X.shape[1])) ids = {} prog = Progbar(target=len(test_indices)) s = 0 for name, (start, end) in test_indices.items(): y = X[start:end] y = segment_axis(y, axis=0, frame_length=frame_length, step_length=frame_length, end='pad', pad_value=0, pad_mode='post') dat.append(y) # update indices ids[name] = (s, s + len(y)) s += len(y) # update progress prog.add(1) dat.flush() dat.close() with open(cache_ids, 'wb') as f: pickle.dump(ids, f) # ====== re-load ====== # dat = F.MmapData(cache_dat, read_only=True) with open(cache_ids, 'rb') as f: ids = pickle.load(f) # ====== save some sample ====== # sample_path = os.path.join(PATH_EXP, 'test_%s_%d.pdf' % (feat, int(utt_length))) V.plot_figure(nrow=9, ncol=6) for i, (name, (start, end)) in enumerate( sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]), k=12, seed=52181208)): x = dat[start:end][:].astype('float32') ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T, ax=(12, 1, i + 1), title='') ax.set_title(name) V.plot_save(sample_path) return (train_feeder, valid_feeder, ids, dat, all_speakers)
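`segment_axis` with `end='pad'` and `pad_mode='post'` cuts each utterance into fixed-length, non-overlapping frames and zero-pads the tail; a numpy sketch of the same behaviour (the helper is illustrative):

import numpy as np

def frame_utterance(x, frame_length):
  # round the length up to a multiple of frame_length, zero-pad the tail
  n = int(np.ceil(len(x) / frame_length)) * frame_length
  pad = np.zeros((n - len(x),) + x.shape[1:], dtype=x.dtype)
  return np.concatenate([x, pad], axis=0).reshape(-1, frame_length, *x.shape[1:])

x = np.random.randn(250, 40).astype('float16')
print(frame_utterance(x, 100).shape)  # (3, 100, 40), last frame zero-padded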
def transform(self, texts, mode='seq', dtype='int32', padding='pre', truncating='pre', value=0., end_document=None, maxlen=None, token_not_found='ignore'): """ Parameters ---------- texts: iterator of unicode iterator, generator or list (e.g. [u'a', u'b', ...]) of unicode documents. mode: 'binary', 'tfidf', 'count', 'freq', 'seq' 'binary' - binary matrix of token presence, 'tfidf' - tf-idf weighted matrix, 'count' - matrix of token counts, 'freq' - matrix of within-document token frequencies, 'seq' - sequences of token indices token_not_found: 'ignore', 'raise', a token string, or an integer strategy when a token is not found in the dictionary: skip it, raise an error, or substitute the given token (or token index). """ # ====== check arguments ====== # texts = self._validate_texts(texts) # ====== check mode ====== # mode = str(mode) if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'): raise ValueError('The "mode" argument must be: "seq", "binary", ' '"count", "freq", or "tfidf".') # ====== check token_not_found ====== # if not is_number(token_not_found) and \ not is_string(token_not_found) and \ token_not_found not in ('ignore', 'raise'): raise ValueError('token_not_found can be: "ignore", "raise"' ', an integer of token index, or a string ' 'representing a token.') if is_number(token_not_found): token_not_found = int(token_not_found) elif token_not_found not in ('ignore', 'raise'): token_not_found = int(self.dictionary[token_not_found]) # ====== pick engine ====== # if self.__engine == 'spacy': processor = self._preprocess_docs_spacy elif self.__engine == 'odin': processor = self._preprocess_docs_odin # ====== Initialize variables ====== # dictionary = self.dictionary results = [] # ====== preprocess arguments ====== # if isinstance(end_document, str): end_document = dictionary.index(end_document) elif is_number(end_document): end_document = int(end_document) # ====== processing ====== # if hasattr(texts, '__len__'): target_len = len(texts) auto_adjust_len = False else: target_len = 1208 auto_adjust_len = True prog = Progbar(target=target_len, name="Tokenize Transform", print_report=True, print_summary=True) for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True): # look up each word in the dictionary vec = [] for x in doc: idx = dictionary.get(x, -1) if idx >= 0: vec.append(idx) # token not found in the dictionary elif token_not_found == 'ignore': continue elif token_not_found == 'raise': raise RuntimeError('Cannot find token: "%s" in dictionary' % x) elif isinstance(token_not_found, int): vec.append(token_not_found) # append ending document token if end_document is not None: vec.append(end_document) # add the final results results.append(vec) # print progress if self.print_progress: prog['#Docs'] = nb_docs prog.add(1) if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target: prog.target = 1.2 * prog.target # end the process # if self.print_progress and auto_adjust_len: # prog.target = nb_docs; prog.update(nb_docs) # ====== pad the sequence ====== # # just transform into sequence of tokens if mode == 'seq': maxlen = self.longest_document_length if maxlen is None \ else int(maxlen) results = pad_sequences(results, maxlen=maxlen, dtype=dtype, padding=padding, truncating=truncating, value=value) # transform into one-hot matrix else: X = np.zeros(shape=(len(results), self.nb_words)) for i, seq in enumerate(results): if mode == 'binary': X[i, seq] = 1 elif mode == 'freq': length = len(seq) count = freqcount(seq) for tok, n in count.items(): X[i, tok] = n / float(length) elif mode == 'count': count = freqcount(seq) for tok, n in count.items(): X[i, tok] = n elif mode == 'tfidf': count = freqcount(seq) for tok, n in count.items(): tf = 1 + np.log(n) docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1] idf = np.log(1 +
self.nb_docs / (1 + docs_freq)) X[i, tok] = tf * idf results = X return results
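The 'tfidf' branch weights each token as tf * idf with tf = 1 + log(count) and idf = log(1 + nb_docs / (1 + doc_freq)); a standalone sketch of one row of that matrix (here `doc_freq` is a plain dict standing in for `_word_dictionary_info`):

import numpy as np
from collections import Counter

def tfidf_row(token_ids, n_words, n_docs, doc_freq):
  row = np.zeros(n_words)
  for tok, n in Counter(token_ids).items():
    tf = 1 + np.log(n)                                # within-document weight
    idf = np.log(1 + n_docs / (1 + doc_freq.get(tok, 0)))  # rarity weight
    row[tok] = tf * idf
  return row

print(tfidf_row([0, 0, 3], n_words=5, n_docs=100, doc_freq={0: 80, 3: 2}))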
training.EarlyStopGeneralizationLoss('valid', ce, threshold=5, patience=5) ]) task.set_train_task(func=f_train, data=train, epoch=args.epoch, name='train') task.set_valid_task(func=f_score, data=valid, freq=training.Timer(percentage=0.8), name='valid') task.run() # =========================================================================== # Saving the test data # CSV separated by tab # =========================================================================== sep = '\t' prog = Progbar(target=len(test_ids) + len(train) + len(valid), print_summary=True, print_report=True, name="Extracting x-vector") with open(TRAIN_PATH, 'w') as f_train, open(TEST_PATH, 'w') as f_test: # ====== save training set ====== # for name, idx, X, y in train.set_batch(batch_size=8000, batch_mode='file', seed=None): assert idx == 0 y = np.argmax(y, axis=-1) assert len(set(y)) == 1 y = y[0] z = np.mean(f_z(X), axis=0, keepdims=False).astype('float32') f_train.write(sep.join([str(y)] + [str(i) for i in z]) + '\n') prog.add(X.shape[0]) # ====== save validation set ====== # for name, idx, X, y in valid.set_batch(batch_size=8000, batch_mode='file', seed=None):
def validating_training_data(in_path_raw, training_dataset): file_list = {ds: sre_file_list[ds] for ds in training_dataset if ds in sre_file_list} # ====== meta info ====== # all_files = [] non_exist_files = [] extension_count = defaultdict(int) total_data = sum(v.shape[0] for k, v in file_list.items() if k not in('musan', 'rirs')) # ====== progress ====== # prog = Progbar(target=total_data, print_summary=True, print_report=True, name="Preprocessing File List") prog.set_summarizer('#Files', fn=lambda x: x[-1]) prog.set_summarizer('#Non-exist', fn=lambda x: x[-1]) # ====== iterating ====== # for ds_name, data in sorted(file_list.items(), key=lambda x: x[0]): if ds_name in ('musan', 'rirs'): continue for row in data: path, channel, name, spkid = row[:4] assert channel in ('0', '1') # check path provided if ds_name in in_path_raw: path = os.path.join(in_path_raw[ds_name], path) # create new row start_time = '-' end_time = '-' if ds_name == 'mx6': start_time, end_time = row[-2:] new_row = [path, channel, name, ds_name + '_' + spkid, ds_name, start_time, end_time] # check file exist if os.path.exists(path): all_files.append(new_row) else: non_exist_files.append(new_row) # extension ext = os.path.splitext(path)[-1] extension_count[ext + '-' + ds_name] += 1 # update progress prog['Dataset'] = ds_name prog['#Files'] = len(all_files) prog['#Non-exist'] = len(non_exist_files) prog.add(1) # final results all_files = np.array(all_files) if len(all_files) == 0: return all_files, np.array(non_exist_files), extension_count # ====== check no duplicated name ====== # n_files = len(all_files) n_unique_files = len(np.unique(all_files[:, 2])) assert n_files == n_unique_files, \ 'Found duplicated name: %d != %d' % (n_files, n_unique_files) # ====== check no duplicated speaker ====== # n_spk = sum(len(np.unique(dat[:, 3])) for name, dat in file_list.items() if name not in ('musan', 'rirs')) n_unique_spk = len(np.unique(all_files[:, 3])) assert n_spk == n_unique_spk, \ 'Found duplicated speakers: %d != %d' % (n_spk, n_unique_spk) # ====== return ====== # # Header: # 0 1 2 3 4 5 6 # path, channel, name, spkid, dataset, start_time, end_time return all_files, np.array(non_exist_files), extension_count
def validate_features(ds_or_processor, path, nb_samples=25,
                      override=False, seed=12082518, fig_width=4):
  # TODO: add PCA visualization
  # TODO: update to match new indices style
  def logger(title, tag, check):
    check = bool(check)
    text_color = 'yellow' if check else 'red'
    print(ctext('  *', 'cyan'),
          ctext(str(title), text_color),
          ctext(str(tag), 'magenta'),
          ctext("✓", text_color) if check else ctext("✗", text_color))
  import matplotlib
  matplotlib.use('Agg')
  from odin.visual import plot_save, plot_multiple_features
  # ====== check path to dataset ====== #
  should_close_ds = True
  if isinstance(ds_or_processor, FeatureProcessor):
    ds = Dataset(ds_or_processor.path, read_only=True)
  elif is_string(ds_or_processor):
    ds = Dataset(ds_or_processor, read_only=True)
  elif isinstance(ds_or_processor, Dataset):
    ds = ds_or_processor
    should_close_ds = False
  else:
    raise ValueError("`ds_or_processor` must be a FeatureProcessor, string, "
                     "or Dataset. No support for given input type: %s" %
                     str(type(ds_or_processor)))
  print(ctext('Validating dataset:', 'yellow'), '"%s"' % ds.path)
  # ====== extract the config of the dataset ====== #
  if 'config' not in ds:
    raise RuntimeError("The `Dataset` must be generated by `FeatureProcessor` "
                       "which must contain `config` MmapDict of extracted "
                       "features configuration.")
  # config = ds['config']
  # pipeline = ds['pipeline']
  # ====== output path ====== #
  path = str(path)
  if not os.path.exists(path):
    os.mkdir(path)
  elif override:
    if os.path.isfile(path):
      os.remove(path)
    else:
      shutil.rmtree(path)
    os.mkdir(path)
  else:
    raise ValueError("`path`=%s exists, cannot override." % path)
  prev_stdio = get_stdio_path()
  stdio(path=os.path.join(path, 'log.txt'))
  nb_samples = int(nb_samples)
  # ====== get all features ====== #
  # [(name, dtype, statistic-able), ...]
  all_keys = [k for k in ds.keys() if k not in ('config', 'pipeline')]
  # store all features (including the features in external_indices)
  all_features = []
  # the external indices can be: indices_mfcc_bnf
  external_indices = flatten_list([k.split('_')[1:] for k in all_keys
                                   if 'indices' in k and k != 'indices'])
  # ====== checking indices ====== #
  main_indices = {name: (start, end)
                  for name, (start, end) in ds['indices'].items()}
  for ids_name in (k for k in all_keys if 'indices' in k):
    ids = sorted([(name, start, end)
                  for name, (start, end) in ds[ids_name].items()],
                 key=lambda x: x[1])
    for prev, now in zip(ids, ids[1:]):
      assert prev[2] == now[1], "Discontinuous indices"
      assert prev[2] - prev[1] > 0, "Zero length in indices"
      assert now[2] - now[1] > 0, "Zero length in indices"
    # the end of the last segment must match the length of the Data
    last_end = ids[-1][-1]
    if ids_name != 'indices':
      for feat_name in ids_name.split('_')[1:]:
        assert last_end == len(ds[feat_name]), \
            "Indices and data length mismatch, indices:'%s' feat:'%s'" % \
            (ids_name, feat_name)
        all_features.append(feat_name)
    else:
      for feat_name in all_keys:
        if feat_name not in external_indices and \
        'sum1' != feat_name[-4:] and 'sum2' != feat_name[-4:] and \
        'mean' != feat_name[-4:] and 'std' != feat_name[-3:] and \
        isinstance(ds[feat_name], MmapData):
          assert last_end == len(ds[feat_name]), \
              "Length of indices and actual data mismatch, " + \
              ids_name + ':' + feat_name
          all_features.append(feat_name)
    # logging
    logger("Checked all:", ids_name, True)
  # ====== check all dictionary types ====== #
  for name in all_keys:
    if isinstance(ds[name], MmapDict) and 'indices' not in name:
      data = ds[name]
      # special cases
      if name == 'sr':
        checking_func = lambda x: x > 0  # for sr
      else:
        checking_func = lambda x: True
      # check
      for key, val in data.items():
        assert key in main_indices, \
            "Dictionary with name:'%s' has key not found in indices." % name
        assert checking_func(val)
      logger("Checked dictionary:", name, True)
  # ====== checking each type of data ====== #
  # get all stats name
  all_stats = defaultdict(list)
  for k in all_keys:
    if 'sum1' == k[-4:] or 'sum2' == k[-4:] or \
    'mean' == k[-4:] or 'std' == k[-3:]:
      all_stats[k[:-4].split('_')[0]].append(k)
  # get all pca name
  all_pca = {i: i + '_pca' for i in all_features
             if i + '_pca' in ds}
  # checking one-by-one numpy.ndarray features array
  for feat_name in all_features:
    dtype = str(ds[feat_name].dtype)
    # checking all data
    indices = ds.find_prefix(feat_name, 'indices')
    prog = Progbar(target=len(indices), interval=0.1,
                   print_report=True,
                   name='Checking: %s(%s)' % (feat_name, dtype))
    # start iterating over all data files
    fail_test = False
    for file_name, (start, end) in indices:
      dat = ds[feat_name][start:end]
      # no NaN value
      if np.any(np.isnan(dat)):
        logger("NaN values", file_name + ':' + feat_name, False)
        fail_test = True
      # not all values close to zero
      if np.all(np.isclose(dat, 0.)):
        logger("All-closed-zeros values", file_name + ':' + feat_name, False)
        fail_test = True
      prog['Name'] = file_name
      prog.add(1)
    if not fail_test:
      logger("Checked data integrity for:", feat_name, True)
    # checking statistics
    if feat_name in all_stats:
      fail_test = False
      for stat_name in all_stats[feat_name]:
        X = ds[stat_name]
        if X.ndim >= 1:
          X = X[:]
        if np.any(np.isnan(X)):
          logger("NaN values", feat_name + ':' + stat_name, False)
          fail_test = True
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values", feat_name + ':' + stat_name, False)
          fail_test = True
      if not fail_test:
        logger("Checked statistics for:", feat_name, True)
    # check PCA
    if feat_name in all_pca:
      pca = ds[all_pca[feat_name]]
      n = ds[feat_name].shape[0]
      nb_feats = ds[feat_name].shape[-1]
      fail_test = False
      # performing PCA on random samples
      for i in range(nb_samples):
        start = np.random.randint(0, n - nb_samples - 1)
        X = pca.transform(ds[feat_name][start:(start + nb_samples)],
                          n_components=max(nb_feats // 2, 1))
        if np.any(np.isnan(X)):
          logger("NaN values in PCA", feat_name, False)
          fail_test = True
          break
        if np.all(np.isclose(X, 0.)):
          logger("All-closed-zeros values in PCA", feat_name, False)
          fail_test = True
          break
      if not fail_test:
        logger("Checked PCA for:", feat_name, True)
  # ====== do sampling ====== #
  np.random.seed(seed)  # seed for reproducibility
  all_samples = np.random.choice(list(ds['indices'].keys()),
                                 size=nb_samples, replace=False)
  # plotting all samples
  for sample_id, file_name in enumerate(all_samples):
    X = {}
    for feat_name in all_features:
      start, end = ds.find_prefix(feat_name, 'indices')[file_name]
      feat = ds[feat_name][start:end]
      X[feat_name] = feat
      # some special handling
      try:
        _special_cases(X=feat, feat_name=feat_name, file_name=file_name,
                       ds=ds, path=path)
      except Exception as e:
        logger("Special case error: %s" % str(e),
               file_name + ':' + feat_name, False)
    plot_multiple_features(X, title=file_name, fig_width=fig_width)
    figure_path = os.path.join(path,
                               '%s.pdf' % _escape_file_name(file_name))
    plot_save(figure_path, log=False, clear_all=True)
    logger("Sample figure saved at:", figure_path, True)
  # plotting the statistics
  figure_path = os.path.join(path, 'stats.pdf')
  for feat_name, stat_names in all_stats.items():
    X = {name: ds[name][:]
         for name in stat_names
         if ds[name].ndim >= 1}
    if len(X) > 0:
      plot_multiple_features(X, title=feat_name, fig_width=fig_width)
  plot_save(figure_path, log=False, clear_all=True)
  logger("Stats figure saved at:", figure_path, True)
  logger("All reports at folder:", os.path.abspath(path), True)
  # ====== cleaning ====== #
  stdio(path=prev_stdio)
  if should_close_ds:
    ds.close()
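
# ---- hedged usage sketch (not part of the original source) ----
# Both paths below are hypothetical; `override=True` removes any previous
# report at `path` before the checks are re-run.
validate_features('/data/features/mspec_8k', path='/tmp/mspec_8k_report',
                  nb_samples=25, override=True)
# afterwards the report folder holds one pdf per sampled utterance,
# 'stats.pdf' with the feature statistics, and 'log.txt' with the check log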