def __str__(self):
  # ====== get all attrs ====== #
  all_attrs = dir(self)
  print_attrs = {}
  for name in all_attrs:
    if '_' != name[0] and (len(name) >= 2 and '__' != name[:2]) and \
        name not in ('nb_desc',):
      attr = getattr(self, name)
      if name == 'data_idx':
        print_attrs[name] = str(attr)
      elif isinstance(attr, slice):
        print_attrs[name] = str(attr)
      elif inspect.isfunction(attr):
        print_attrs[name] = "(f)" + attr.__name__
      elif isinstance(attr, np.ndarray):
        print_attrs[name] = ("(%s)" % str(attr.dtype)) + str(attr.shape)
      elif isinstance(attr, (tuple, list)):
        print_attrs[name] = "(list)" + str(len(attr))
      elif isinstance(attr, Mapping):
        print_attrs[name] = "(map)" + str(len(attr))
      elif is_primitives(attr):
        print_attrs[name] = str(attr)
  print_attrs = sorted(print_attrs.items(), key=lambda x: x[0])
  print_attrs = [('#desc', self.nb_desc)] + print_attrs
  print_attrs = ' '.join(["%s:%s" % (ctext(key, 'yellow'), val)
                          for key, val in print_attrs])
  # ====== format the output ====== #
  s = '<%s %s>' % (ctext(self.__class__.__name__, 'cyan'), print_attrs)
  return s

def logger(title, tag, check):
  check = bool(check)
  text_color = 'yellow' if check else 'red'
  print(ctext(' *', 'cyan'),
        ctext(str(title), text_color),
        ctext(str(tag), 'magenta'),
        ctext("✓", text_color) if check else ctext("✗", text_color))

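# --- Hypothetical usage sketch for `logger` (not part of the original source) ---
# Prints one colored status line per check: a check mark when `check` is truthy,
# a cross otherwise. The titles/tags below are made-up examples.
logger(title="openSMILE installed", tag="SMILExtract", check=True)
logger(title="Acoustic features found", tag="/path/to/features", check=False)
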
def evaluate_prediction(name_list, y_pred, y_true, title):
  def _report(y_p, y_t, pad=''):
    with catch_warnings_ignore(Warning):
      z_ = np.concatenate(y_p, axis=0)
      z = np.concatenate(y_t, axis=0)
      print(pad, '*** %s ***' % ctext('Frame-level', 'lightcyan'))
      print(pad, "#Samples:", ctext(len(z), 'cyan'))
      print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
      print(pad, "Accuracy:",
            accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

      z_ = np.concatenate([np.mean(i, axis=0, keepdims=True) for i in y_p],
                          axis=0)
      z = np.array([i[0] for i in y_t])
      print(pad, '*** %s ***' % ctext('Utterance-level', 'lightcyan'))
      print(pad, "#Samples:", ctext(len(z), 'cyan'))
      print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
      print(pad, "Accuracy:",
            accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

  datasets_2_samples = defaultdict(list)
  for name, y_p, y_t in zip(name_list, y_pred, y_true):
    dsname = ds['dsname'][name]
    datasets_2_samples[dsname].append((name, y_p, y_t))

  print('=' * 12, ctext(title, 'lightyellow'), '=' * 12)
  _report(y_p=y_pred, y_t=y_true)

  for dsname, data in sorted(datasets_2_samples.items(), key=lambda x: x[0]):
    print(ctext(dsname, 'yellow'), ':')
    y_pred = [i[1] for i in data]
    y_true = [i[2] for i in data]
    _report(y_p=y_pred, y_t=y_true, pad=' ')

def __call__(self, x=None, out_dim=None, n_eventdim=0, **kwargs):
  n_eventdim = int(n_eventdim)
  if x is not None:
    x = tf.convert_to_tensor(x)
  if out_dim is not None:
    out_dim = int(out_dim)
  print_log = self._print_log
  padding = self._padding
  if print_log:
    print(padding + ctext("Parsing distribution:", 'lightyellow'),
          '%s/%s' % (ctext(self.normalized_name, 'lightcyan'),
                     ctext(self.distribution.__name__, 'cyan')))
  args = {}
  for p_name, p_val in self.get_ordered_arguments():
    if print_log:
      print(padding + " Parsing parameter:", ctext(p_name, 'cyan'))
    p_val = _parse_parameter(x, out_dim, p_name, p_val,
                             print_log=print_log, padding=padding, **kwargs)
    args[p_name] = p_val
  dist = self.distribution(**args)
  if n_eventdim > 0:
    dist = tfd.Independent(distribution=dist,
                           reinterpreted_batch_ndims=n_eventdim)
  if print_log:
    print(' Distribution:', ctext(dist, 'cyan'))
  self._print_log = False
  self._padding = ''
  return dist

def __str__(self):
  return "<Task:'%s' p:%s bs:%s #ep:%s/%s #it:%s/%s #n:%s/%s %s>" % \
      (ctext(self.name, 'lightyellow'),
       ctext(self.probability, 'cyan'),
       ctext(self.batch_size, 'cyan'),
       ctext(self.curr_epoch, 'lightcyan'), ctext(self.nb_epoch, 'cyan'),
       ctext(self.curr_epoch_iter, 'lightcyan'), ctext(self.curr_iter, 'cyan'),
       ctext(self.curr_epoch_samples, 'lightcyan'),
       ctext(self.curr_samples, 'cyan'),
       ','.join([ctext(i.__class__.__name__, 'cyan')
                 for i in self._callback._callbacks]))

def __str__(self): s = ["Distribution: %s/%s" % (ctext(self.normalized_name, 'lightcyan'), ctext(self.distribution.__name__, 'cyan'))] for name, val in self.get_ordered_arguments(): if isinstance(val, DistributionDescription): s.append(' %s: ' % name) s.append('\n'.join([' ' + line for line in str(val).split('\n')])) else: s.append(' %s: %s' % (name, ctext(val, 'cyan'))) return '\n'.join(s)
def get_model_path(system_name, logging=True):
  """
  Parameters
  ----------
  args_name : list of string
      list of name for parsed argument, taken into account for creating
      model name

  Return
  ------
  exp_dir, model_path, log_path
  """
  args_name = []
  if system_name == 'xvec':
    args_name += ['utt', 'seq']
  elif system_name == 'ivec':
    args_name += ['nmix', 'tdim']
  else:
    raise ValueError("No support for system with name: %s" % system_name)
  args_name += ['mindur', 'minutt']
  # ====== base system and feature identity ====== #
  name = str(system_name).lower()
  name += '_' + FEATURE_RECIPE.replace('_', '')
  name += '.' + FEATURE_NAME
  # ====== concat the attributes ====== #
  attributes = []
  for i in [str(i) for i in args_name]:
    attributes.append(str(getattr(_args, i)))
  attributes = '_'.join(attributes)
  name += '.' + attributes
  # ====== check the exclude dataset ====== #
  excluded_dataset = str(_args.exclude).strip()
  if len(excluded_dataset) > 0:
    dataset_str = []
    for excluded in sorted(set(excluded_dataset.split(','))):
      assert excluded in sre_file_list or excluded == 'noise', \
          "Unknown excluded dataset with name: '%s'" % excluded
      dataset_str.append(excluded)
    dataset_str = '_'.join(dataset_str)
    name += '.' + dataset_str
  # ====== check save_path ====== #
  save_path = os.path.join(EXP_DIR, name)
  if os.path.exists(save_path) and IS_OVERRIDE:
    print("Override path:", ctext(save_path, 'yellow'))
    shutil.rmtree(save_path)
  if not os.path.exists(save_path):
    os.mkdir(save_path)
  # ====== return path ====== #
  log_path = get_logpath(name='log.txt', increasing=True,
                         odin_base=False, root=save_path)
  model_path = os.path.join(save_path, 'model.ai')
  if bool(logging):
    print("Model path:", ctext(model_path, 'cyan'))
    print("Log path:", ctext(log_path, 'cyan'))
  return save_path, model_path, log_path

def dataset_statistics(dsname):
  ids = {name: (start, end)
         for name, (start, end) in indices.items()
         if ds['dsname'][name] == dsname}
  name2spk = {name: ds['spkid'][name] for name in ids.keys()}
  s = []
  s.append('=' * 12 + ctext('%-12s' % dsname, 'lightyellow') + '=' * 12)
  s.append('#Files :' + ctext(len(ids), 'cyan'))
  s.append("#Speakers:" + ctext(len(set(name2spk.values())), 'cyan'))
  # ====== mean and std ====== #
  sum1 = 0.
  sum2 = 0.
  n = 0
  spk_sum1 = defaultdict(float)
  spk_sum2 = defaultdict(float)
  spk_n = defaultdict(int)
  for name, (start, end) in ids.items():
    spkid = name2spk[name]
    n += end - start; spk_n[spkid] += end - start
    x = X[start:end][:].astype('float64')
    s1 = np.sum(x, axis=0); s2 = np.sum(x**2, axis=0)
    sum1 += s1; sum2 += s2
    spk_sum1[spkid] += s1; spk_sum2[spkid] += s2
  data_mean = sum1 / n
  data_std = np.sqrt(sum2 / n - data_mean ** 2)

  spk_stats = {}
  for spkid in name2spk.values():
    n = spk_n[spkid]
    s1, s2 = spk_sum1[spkid], spk_sum2[spkid]
    mean = s1 / n
    std = np.sqrt(s2 / n - mean ** 2)
    spk_stats[spkid] = (mean, std)
  spk_mean = np.concatenate([x[0][None, :] for x in spk_stats.values()],
                            axis=0).mean(0)
  spk_std = np.concatenate([x[1][None, :] for x in spk_stats.values()],
                           axis=0).mean(0)
  # ====== utterances length ====== #
  all_length = np.array([(end - start) * Config.STEP_LENGTH
                         for start, end in indices.values()])
  # ====== speaker - utterance relation ====== #
  nutt_per_spk = defaultdict(int)
  dur_per_spk = defaultdict(list)
  for name, (start, end) in ids.items():
    spkid = name2spk[name]
    nutt_per_spk[spkid] += 1
    dur_per_spk[spkid].append((end - start) * Config.STEP_LENGTH)
  all_spk = sorted(nutt_per_spk.keys())
  spk_df = pd.DataFrame(data={
      'nutt_per_spk': [nutt_per_spk[spk] for spk in all_spk],
      'sum_per_spk': [np.sum(dur_per_spk[spk]) for spk in all_spk],
      'mean_per_spk': [np.mean(dur_per_spk[spk]) for spk in all_spk],
  })
  return (dsname, '\n'.join(s), (all_length, spk_df),
          (data_mean, data_std), (spk_mean, spk_std))

def __str__(self):
  name = ctext('IndexedData', 'cyan')
  s = '<%s: Indices(type:"%s" length:%d)>\n' % \
      (name, self.indices_info[0], len(self.indices))
  for dat in self.data:
    s += ' (%s)%s: %s %s\n' % \
        (dat.__class__.__name__,
         ctext(str(dat.data_info), 'yellow'),
         dat.shape, str(dat.dtype))
  return s[:-1]

def prepare_ivec_data(recipe, feat):
  ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
  X = ds[feat]
  train_indices = {name: ds['indices'][name] for name in TRAIN_DATA.keys()}
  test_indices = {name: start_end
                  for name, start_end in ds['indices'].items()
                  if name not in TRAIN_DATA}
  print("#Train files:", ctext(len(train_indices), 'cyan'))
  print("#Test files:", ctext(len(test_indices), 'cyan'))
  return X, train_indices, test_indices

def __str__(self):
  if self.is_fitted:
    explained_vars = ';'.join([ctext('%.2f' % i, 'cyan')
                               for i in self.explained_variance_ratio_[:8]])
  else:
    explained_vars = 0
  s = '%s(batch_size=%s, #components=%s, #samples=%s, vars=%s)' % \
      (ctext('MiniBatchPCA', 'yellow'),
       ctext(self.batch_size, 'cyan'),
       ctext(self.n_components, 'cyan'),
       ctext(self.n_samples_seen_, 'cyan'),
       explained_vars)
  return s

def _save(self, is_best):
  is_best = bool(is_best)
  # trigger event for callbacks
  self._callback.event(TrainSignal.SAVE_BEST if is_best else TrainSignal.SAVE)
  # ====== save the model to hard drive ====== #
  if self._save_path is not None:
    # serialize the best model to disk
    if is_best:
      final_save_path = self._save_path
      N.serialize(nnops=self._save_obj, path=self._save_path,
                  save_variables=True, variables=self._save_variables,
                  binary_output=False, override=True)
    # not the best model saved, just periodically saving
    else:
      final_save_path = self._save_path + '.%d' % self._current_checkpoint_count
      N.serialize(nnops=self._save_obj, path=final_save_path,
                  save_variables=True, variables=self._save_variables,
                  binary_output=False, override=True)
      self._current_checkpoint_count += 1
      if self._checkpoint_max > 1 and \
          self._current_checkpoint_count > self._checkpoint_max:
        shutil.rmtree(
            self._save_path + '.%d' %
            (self._current_checkpoint_count - self._checkpoint_max - 1))
    # print the log
    self._show_noti("[%s] Creating %scheckpoint at: %s" %
                    (ctext('MainLoop', 'red'),
                     ctext('[best]', 'yellow') if is_best else '',
                     final_save_path))
    # save history
    if self._save_history:
      with open(final_save_path + '.hist', 'wb') as f:
        pickle.dump(self.history, f)
      self._show_noti("[%s] Save history at: %s" %
                      (ctext('MainLoop', 'red'), final_save_path + '.hist'))
  # ====== store the object directly in RAM (only for the best) ====== #
  elif bool(is_best) and \
      (self._save_obj is not None or len(self._save_variables) > 0):
    del self._best_object
    self._best_object = N.serialize(self._save_obj, path=None,
                                    save_variables=True,
                                    variables=self._save_variables,
                                    binary_output=True)
    mem_size = sum(len(v) for k, v in self._best_object.items()) / 1024 / 1024
    self._show_noti(
        "[%s] Creating dynamic checkpoint in RAM using %.2f (megabytes)" %
        (ctext('MainLoop', 'red'), mem_size))

def summary_indices(ids):
  datasets = defaultdict(int)
  speakers = defaultdict(list)
  text = ''
  for name in sorted(ids.keys()):
    text += name + str(ids[name])
    dsname = ds['dsname'][name]
    datasets[dsname] += 1
    speakers[dsname].append(ds['spkid'][name])
  for dsname in sorted(datasets.keys()):
    print(' %-18s: %s(utt) %s(spk)' % (
        dsname,
        ctext('%6d' % datasets[dsname], 'cyan'),
        ctext(len(set(speakers[dsname])), 'cyan')))
  print(' MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))

def _parse_parameter(x, out_dim, name, info, print_log, padding, **kwargs):
  # ====== parsing distribution ====== #
  if isinstance(info, DistributionDescription):
    if print_log:
      print(padding + " Info:", 'DistributionDescription')
    y = info.set_print_log(print_log
        ).set_padding_log(' '
        )(x, out_dim, **kwargs)
  # ====== parsing network ====== #
  elif isinstance(info, dict):
    if print_log:
      print(padding + " Info:", str(info))
    y = _network(x, out_dim, name, info, print_log, padding)
    y = _support(y, info, print_log, padding, **kwargs)
    y = _activate(y, info, print_log, padding, **kwargs)
  # ====== just tensor ====== #
  else:
    if print_log:
      print(padding + " Info:", 'Tensor')
    y = tf.convert_to_tensor(info)
  if print_log:
    print(padding + " Output:", ctext(y, 'cyan'))
  return y

def _report(y_p, y_t, pad=''):
  with catch_warnings_ignore(Warning):
    z_ = np.concatenate(y_p, axis=0)
    z = np.concatenate(y_t, axis=0)
    print(pad, '*** %s ***' % ctext('Frame-level', 'lightcyan'))
    print(pad, "#Samples:", ctext(len(z), 'cyan'))
    print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
    print(pad, "Accuracy:",
          accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

    z_ = np.concatenate([np.mean(i, axis=0, keepdims=True) for i in y_p],
                        axis=0)
    z = np.array([i[0] for i in y_t])
    print(pad, '*** %s ***' % ctext('Utterance-level', 'lightcyan'))
    print(pad, "#Samples:", ctext(len(z), 'cyan'))
    print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
    print(pad, "Accuracy:",
          accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

def validate_features_dataset(output_dataset_path, ds_validation_path):
  ds = F.Dataset(output_dataset_path, read_only=True)
  print(ds)

  features = {}
  for key, val in ds.items():
    if 'indices_' in key:
      name = key.split('_')[-1]
      features[name] = (val, ds[name])
  all_indices = [val[0] for val in features.values()]
  # ====== sampling 250 files ====== #
  all_files = sampling_iter(it=all_indices[0].keys(), k=250,
                            seed=Config.SUPER_SEED)
  all_files = [f for f in all_files if all(f in ids for ids in all_indices)]
  print("#Samples:", ctext(len(all_files), 'cyan'))
  # ====== ignore the 20-figures warning ====== #
  with catch_warnings_ignore(RuntimeWarning):
    for file_name in all_files:
      X = {}
      for feat_name, (ids, data) in features.items():
        start, end = ids[file_name]
        X[feat_name] = data[start:end][:].astype('float32')
      V.plot_multiple_features(
          features=X, fig_width=20,
          title='[%s]%s' % (ds['dsname'][file_name], file_name))
  V.plot_save(ds_validation_path, dpi=12)

def _rollback(self, is_final=False):
  # TODO: update rollback mechanism
  if not self._allow_rollback and not is_final:
    return
  # trigger event for callbacks
  self._callback.event(TrainSignal.ROLLBACK)
  # default rollback procedure
  if self._save_path is not None and os.path.exists(self._save_path):
    self._show_noti("[%s] Rollback from: %s" %
                    (ctext('MainLoop', 'red'), self._save_path))
    # restore previous checkpoint immediately
    N.deserialize(self._save_path, force_restore_vars=True)
  # otherwise, load stored variables from RAM
  elif self._best_object is not None:
    self._show_noti("[%s] Rollback to the best stored object from RAM" %
                    (ctext('MainLoop', 'red')))
    N.deserialize(path_or_data=self._best_object, force_restore_vars=True)

def __str__(self):
  padding = ' '
  # NOTE: each element in the list is one line
  s = ['========== ' +
       ctext('Dataset:%s Total:%d Size:%.2f(MB)', 'magenta') %
       (self.path, len(self._data_map), self.size) +
       ' ==========']
  s += self._readme_info
  s += [ctext('DATA:', 'yellow'), '----']
  # ====== Find longest string ====== #
  longest_name = 0
  longest_shape = 0
  longest_dtype = 0
  longest_file = 0
  print_info = []
  for name, (dtype, shape, data, path) in sorted(self._data_map.items()):
    shape = data.shape if hasattr(data, 'shape') else shape
    longest_name = max(len(name), longest_name)
    longest_dtype = max(len(str(dtype)), longest_dtype)
    longest_shape = max(len(str(shape)), longest_shape)
    longest_file = max(len(str(path)), longest_file)
    print_info.append([name, dtype, shape, path])
  # ====== return print string ====== #
  format_str = (padding + '%-' + str(longest_name + 2) + 's '
                '%-' + str(longest_dtype) + 's' + ctext(':', 'yellow') +
                '%-' + str(longest_shape) + 's '
                'path:%-' + str(longest_file) + 's')
  for name, dtype, shape, path in print_info:
    s.append(format_str % ('"%s"' % name, dtype, shape, path))
  # ====== add recipes info ====== #
  for name, recipe in self._saved_recipes.items():
    s.append(ctext('(Recipe) ', 'yellow') + '"%s"' % name)
    for rcp in recipe:
      rcp = str(rcp)
      s.append('\n'.join([padding + line for line in rcp.split('\n')]))
  # ====== add indices info ====== #
  for name, index in self._saved_indices.items():
    s.append(ctext('(Index) ', 'yellow') + '"%s"' % name)
    s.append(padding + str(index))
    name, (start, end) = next(iter(index.items()))
    s.append(padding + 'Sample: "%s %d-%d"' % (name, start, end))
  return '\n'.join(s)

def get_model_path(system_name, args):
  """Return: exp_dir, model_path, log_path, train_path, test_path"""
  name = '_'.join([str(system_name).lower(), args.recipe, args.feat])
  if 'l' in args:
    name += '_' + str(int(args.l))
  if 'nmix' in args:
    name += '_' + str(int(args.nmix))
  if 'tdim' in args:
    name += '_' + str(int(args.tdim))
  save_path = os.path.join(PATH_EXP, name)
  if not os.path.exists(save_path):
    os.mkdir(save_path)
  # ====== return path ====== #
  log_path = os.path.join(save_path, 'log.txt')
  model_path = os.path.join(save_path, 'model.ai')
  train_path = os.path.join(save_path, 'train.dat')
  test_path = os.path.join(save_path, 'test.dat')
  print("Model path:", ctext(model_path, 'cyan'))
  print("Log path:", ctext(log_path, 'cyan'))
  return save_path, model_path, log_path, train_path, test_path

def get_exp_path(system_name, args, override=False):
  """ Return: exp_dir, model_path, log_path """
  exp_dir = get_exppath(tag='TIDIGITS_%s_%s_%s' %
                        (system_name, args.task, args.feat))
  if 'nmix' in args:
    exp_dir += '_%d' % args.nmix
  if 'tdim' in args:
    exp_dir += '_%d' % args.tdim
  # ====== check override ====== #
  if bool(override) and os.path.exists(exp_dir):
    shutil.rmtree(exp_dir)
  if not os.path.exists(exp_dir):
    os.mkdir(exp_dir)
  # ====== basic paths ====== #
  model_path = os.path.join(exp_dir, 'model.ai')
  log_path = os.path.join(
      exp_dir, 'log_%s.txt' % get_formatted_datetime(only_number=True))
  print("Exp dir:", ctext(exp_dir, 'cyan'))
  print("Model path:", ctext(model_path, 'cyan'))
  print("Log path:", ctext(log_path, 'cyan'))
  return exp_dir, model_path, log_path

def _report(y_p, y_t, pad=''):
  with catch_warnings_ignore(Warning):
    z_ = np.concatenate(y_p, axis=0)
    z = np.concatenate(y_t, axis=0)
    print(pad, '*** %s ***' % ctext('Frame-level', 'lightcyan'))
    print(pad, "#Samples:", ctext(len(z), 'cyan'))
    print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
    print(pad, "Accuracy:",
          accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

    z_ = np.concatenate(
        [np.mean(i, axis=0, keepdims=True) for i in y_p], axis=0)
    z = np.array([i[0] for i in y_t])
    print(pad, '*** %s ***' % ctext('Utterance-level', 'lightcyan'))
    print(pad, "#Samples:", ctext(len(z), 'cyan'))
    print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
    print(pad, "Accuracy:",
          accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

def verify_dependencies():
  try:
    command = "SMILExtract -h"
    output = subprocess.check_output(command, shell=True,
                                     stderr=subprocess.STDOUT)
  except subprocess.CalledProcessError:
    raise Exception("Can't find SMILExtract executable")
  else:
    m = re.search('openSMILE version (.*)', str(output, 'utf-8'), re.MULTILINE)
    if m:
      opensmile_version = m.group(1)
      print('Found openSMILE:', ctext(opensmile_version, 'magenta'))

def __str__(self):
  padding = ' '
  s = '<%s: #keys:%d #iter:%d #CPU:%s #Buffer:%d #HWM:%d mode:"%s">\n' % \
      (ctext('Feeder', 'cyan'),
       len(self.indices_keys), len(self._running_iter),
       self.ncpu, self.buffer_size, self.hwm, self._batch_mode)
  # ====== Shape and dtype ====== #
  shape = self.shape  # this is always list of shape
  s += padding + ctext("Shape: ", 'magenta') + \
      ', '.join((str(s) for s in shape)) + '\n'
  s += padding + ctext("Dtype: ", 'magenta') + \
      ', '.join((str(dt) for dt in self.dtype)) + '\n'
  # ====== print recipes ====== #
  s += padding + ctext('Recipes:', 'magenta') + '\n'
  for recipe in self._recipes:
    s += '\n'.join(['\t' + i for i in str(recipe).split('\n')])
    s += '\n'
  # ====== print data descriptor ====== #
  s += padding + ctext('Descriptor:', 'magenta') + '\n'
  for desc in self._data:
    s += '\n'.join(['\t' + i for i in str(desc).split('\n')])
    s += '\n'
  return s[:-1]

def evaluate_prediction(name_list, y_pred, y_true, title):
  def _report(y_p, y_t, pad=''):
    with catch_warnings_ignore(Warning):
      z_ = np.concatenate(y_p, axis=0)
      z = np.concatenate(y_t, axis=0)
      print(pad, '*** %s ***' % ctext('Frame-level', 'lightcyan'))
      print(pad, "#Samples:", ctext(len(z), 'cyan'))
      print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
      print(pad, "Accuracy:",
            accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

      z_ = np.concatenate(
          [np.mean(i, axis=0, keepdims=True) for i in y_p], axis=0)
      z = np.array([i[0] for i in y_t])
      print(pad, '*** %s ***' % ctext('Utterance-level', 'lightcyan'))
      print(pad, "#Samples:", ctext(len(z), 'cyan'))
      print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
      print(pad, "Accuracy:",
            accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

  datasets_2_samples = defaultdict(list)
  for name, y_p, y_t in zip(name_list, y_pred, y_true):
    dsname = ds['dsname'][name]
    datasets_2_samples[dsname].append((name, y_p, y_t))

  print('=' * 12, ctext(title, 'lightyellow'), '=' * 12)
  _report(y_p=y_pred, y_t=y_true)

  for dsname, data in sorted(datasets_2_samples.items(), key=lambda x: x[0]):
    print(ctext(dsname, 'yellow'), ':')
    y_pred = [i[1] for i in data]
    y_true = [i[2] for i in data]
    _report(y_p=y_pred, y_t=y_true, pad=' ')

def __call__(self, x=None, out_dim=None, n_eventdim=0, **kwargs):
  n_eventdim = int(n_eventdim)
  if x is not None:
    x = tf.convert_to_tensor(x)
  if out_dim is not None:
    out_dim = int(out_dim)
  print_log = self._print_log
  padding = self._padding
  if print_log:
    print(
        padding + ctext("Parsing distribution:", 'lightyellow'),
        '%s/%s' % (ctext(self.normalized_name, 'lightcyan'),
                   ctext(self.distribution.__name__, 'cyan')))
  args = {}
  for p_name, p_val in self.get_ordered_arguments():
    if print_log:
      print(padding + " Parsing parameter:", ctext(p_name, 'cyan'))
    p_val = _parse_parameter(x, out_dim, p_name, p_val,
                             print_log=print_log, padding=padding, **kwargs)
    args[p_name] = p_val
  dist = self.distribution(**args)
  if n_eventdim > 0:
    dist = tfd.Independent(distribution=dist,
                           reinterpreted_batch_ndims=n_eventdim)
  if print_log:
    print(' Distribution:', ctext(dist, 'cyan'))
  self._print_log = False
  self._padding = ''
  return dist

def freqcount(x, key=None, count=1, normalize=False, sort=False,
              pretty_return=False):
  """ x: list, iterable

  Parameters
  ----------
  key: call-able
      extract the key from each item in the list
  count: call-able, int
      extract the count from each item in the list
  normalize: bool
      if normalize, all the values are normalized from 0. to 1.
      (which sum up to 1. in total).
  sort: boolean
      if True, the list will be sorted in ascending order.
  pretty_return: boolean
      if True, return pretty formatted text.

  Return
  ------
  dict: x(obj) -> freq(int)
  if `pretty_return` is `True`, return pretty formatted string.
  """
  freq = defaultdict(int)
  if key is None:
    key = lambda x: x
  if count is None:
    count = 1
  if isinstance(count, Number):
    _ = int(count)
    count = lambda x: _
  for i in x:
    c = count(i)
    i = key(i)
    freq[i] += c
  # always return the same order
  s = float(sum(v for v in freq.values()))
  freq = OrderedDict([(k, freq[k] / s if normalize else freq[k])
                      for k in sorted(freq.keys())])
  # check sort
  if sort:
    freq = OrderedDict(sorted(freq.items(), key=lambda x: x[1]))
  # check pretty return
  if pretty_return:
    s = ''
    for name, value in freq.items():
      s += ' %s: %d\n' % (ctext(name, 'yellow'), value)
    return s
  return freq

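# --- Hypothetical usage sketch for `freqcount` (not part of the original source) ---
# Counting occurrences in a plain list, first as raw counts, then as ratios.
words = ['a', 'b', 'a', 'c', 'a', 'b']
print(freqcount(words))                  # OrderedDict([('a', 3), ('b', 2), ('c', 1)])
print(freqcount(words, normalize=True))  # values sum to 1.0: a=0.5, b=~0.33, c=~0.17
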
def _save_data_to_path(preprocessed_path, X, y,
                       gene_names, label_names, cell_names, verbose):
  # save data
  if verbose:
    print("Saving data to %s ..." % ctext(preprocessed_path, 'cyan'))
  with open(os.path.join(preprocessed_path, 'X'), 'wb') as f:
    pickle.dump(X, f)
  with open(os.path.join(preprocessed_path, 'y'), 'wb') as f:
    pickle.dump(y, f)
  # save the meta info
  with open(os.path.join(preprocessed_path, 'X_row'), 'wb') as f:
    pickle.dump(cell_names, f)
  with open(os.path.join(preprocessed_path, 'X_col'), 'wb') as f:
    pickle.dump(gene_names, f)
  with open(os.path.join(preprocessed_path, 'y_col'), 'wb') as f:
    pickle.dump(label_names, f)

def plot_learning_curves(self):
  start_time = time.time()
  fig = plt.figure(figsize=(20, len(self) * 4))
  n_metrics = 5
  for row_idx, pos in enumerate(self):
    row_idx = row_idx * n_metrics
    train = _extract_metrics(pos.train_history)
    valid = _extract_metrics(pos.valid_history)
    for col_idx, (name, i, j) in enumerate(
        zip(['loss', 'LLK_x', 'LLK_y', 'KLqp_x', 'KLqp_y'], train, valid)):
      col_idx += 1
      plt.subplot(len(self), n_metrics, row_idx + col_idx)
      if col_idx == 1:
        plt.title(pos.short_id_lines, fontsize=8, fontstyle='italic')
      else:
        plt.title(name)
      if i is None or j is None:
        plt.plot(0, 0)
        plt.xticks(())
        plt.yticks(())
      else:
        plt.plot(i, linewidth=2.5,
                 label='train:%.2f' % (np.max(i) if 'LLK' in name else np.min(i)),
                 linestyle='-')
        plt.plot(j, linewidth=2.5,
                 label='valid:%.2f' % (np.max(j) if 'LLK' in name else np.min(j)),
                 linestyle='--', alpha=0.8)
        plt.legend()
  plt.tight_layout()
  self.add_figure('learning_curves', fig)
  return self._log('plot_learning_curves %s(s)' %
                   ctext(time.time() - start_time, 'lightyellow'))

def summary(x, axis=None, shorten=False, float_precision=2):
  """ Return string of statistical summary given series `x`
  {#:%s|mi:%s|q1:%s|md:%s|mn:%s|q3:%s|ma:%s|sd:%s}
  """
  if isinstance(x, Iterator):
    x = list(x)
  if isinstance(x, (tuple, list, set)):
    x = np.array(x)
  mean, std = np.mean(x, axis=axis), np.std(x, axis=axis)
  median = np.median(x, axis=axis)
  qu1, qu3 = np.percentile(x, [25, 75], axis=axis)
  min_, max_ = np.min(x, axis=axis), np.max(x, axis=axis)
  s = ""
  fmt = '%.' + str(int(float_precision)) + 'f'
  if not shorten:
    x = x.ravel()
    samples = ', '.join([
        str(i) for i in np.random.choice(
            x, size=min(8, len(x)), replace=False).tolist()
    ])
    s += "***** Summary *****\n"
    s += " Min : %s\n" % (fmt % min_)
    s += "1st Qu. : %s\n" % (fmt % qu1)
    s += " Median : %s\n" % (fmt % median)
    s += " Mean : %s\n" % (fmt % mean)
    s += "3rd Qu. : %s\n" % (fmt % qu3)
    s += " Max : %s\n" % (fmt % max_)
    s += "-------------------\n"
    s += " Std : %s\n" % (fmt % std)
    s += "#Samples: %d\n" % len(x)
    s += "Samples : %s\n" % samples
    s += "Sparsity: %s\n" % (fmt % sparsity_percentage(x))
  else:
    s += "{#:%s|mi:%s|q1:%s|md:%s|mn:%s|q3:%s|ma:%s|sd:%s}" % \
        (ctext(len(x), 'cyan'),
         ctext(fmt % min_, 'cyan'),
         ctext(fmt % qu1, 'cyan'),
         ctext(fmt % median, 'cyan'),
         ctext(fmt % mean, 'cyan'),
         ctext(fmt % qu3, 'cyan'),
         ctext(fmt % max_, 'cyan'),
         ctext(fmt % std, 'cyan'))
  return s

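# --- Hypothetical usage sketch for `summary` (assumes numpy is imported as np) ---
# Full multi-line report with 2-decimal formatting, plus the compact one-line form.
series = np.random.randn(1000)
print(summary(series))                     # min / quartiles / mean / max / std + samples
print(summary(series, shorten=True))       # "{#:1000|mi:...|q1:...|...|sd:...}"
print(summary(series, float_precision=4))  # same report, 4-decimal precision
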
def __str__(self): s = '' s += ctext("<Ivector ", 'yellow') s += "GMM:%s " % self.is_gmm_fitted s += "Tmat:%s\n" % self.is_tmat_fitted if os.path.exists(self.path) and len(os.listdir(self.path)) > 0: # list all model files s += " %s: " % ctext('model', 'cyan') s += ', '.join([ '"%s"' % f for f in sorted(os.listdir(self.path)) if 'zstat' not in f and 'fstat' not in f and 'ivec' not in f and 'name_' not in f ]) s += '\n' # list all Zero-stats files s += " %s: " % ctext('Z-stats', 'cyan') s += ', '.join([ '"%s"' % f for f in sorted(os.listdir(self.path)) if 'zstat' in f ]) s += '\n' # list all First-stats files s += " %s: " % ctext('F-stats', 'cyan') s += ', '.join([ '"%s"' % f for f in sorted(os.listdir(self.path)) if 'fstat' in f ]) s += '\n' # list all Ivec-stats files s += " %s: " % ctext('ivec', 'cyan') s += ', '.join([ '"%s"' % f for f in sorted(os.listdir(self.path)) if 'ivec' in f ]) s += '\n' # list all Name path files s += " %s: " % ctext('name-list', 'cyan') s += ', '.join([ '"%s"' % f for f in sorted(os.listdir(self.path)) if 'name_' in f ]) s += '\n' # list all attributes for k, v in sorted(self.__dict__.items(), key=lambda x: x[0]): if is_primitive(v, inc_ndarray=False): s += " %s: %s\n" % (ctext(k, 'cyan'), str(v)) s = s[:-1] + '>' return s
def _set_path(self, path, read_only):
  MAXIMUM_README_LINE = 25
  # all files are opened with default_mode=r+
  self._data_map = OrderedDict()
  self._path = os.path.abspath(path)
  self._default_hdf5 = os.path.basename(self._path) + '_default.h5'
  # saved feeder info
  self._saved_indices = {}
  self._saved_recipes = {}
  # just make new dir
  if not os.path.exists(path):
    os.mkdir(path)
    os.mkdir(self.recipe_path)
    os.mkdir(self.index_path)
    return  # nothing more to do
  elif not os.path.isdir(path):
    raise ValueError('Dataset path must be a folder.')
  # ====== Load all Data ====== #
  files = os.listdir(path)
  for fname in files:
    # found README
    if 'readme' == fname[:6].lower():
      readme_path = os.path.join(path, fname)
      with open(readme_path, 'r') as readme_file:
        readme = readme_file.readlines()[:MAXIMUM_README_LINE]
        readme = [' ' + i[:-1] for i in readme
                  if len(i) > 0 and i != '\n']
        readme.append(' => For more information: ' + readme_path)
        self._readme_info = [ctext('README:', 'yellow'), '------'] + readme
        self._readme_path = readme_path
    # parse data
    data = _parse_data_descriptor(os.path.join(path, fname), read_only)
    if data is None:
      continue
    for key, d in data:
      if key in self._data_map:
        raise ValueError('Found duplicated data with follow info: '
                         '{}'.format(key))
      else:
        self._data_map[key] = d

def _bar_box_line(self, title, ylabel, get_score, model_id, ax,
                  ignore=[], using_bar=True):
  start_time = time.time()
  assert callable(model_id), "model_id must be callable"
  assert callable(get_score)
  data = []
  for pos in self.posteriors:
    name = model_id(pos.infer)
    train, test = get_score(pos)
    for i in ignore:
      del train[i]
      del test[i]
    for i, j in train.items():
      data.append({'Model': name, ylabel: j, 'Data': 'train'})
    for i, j in test.items():
      data.append({'Model': name, ylabel: j, 'Data': 'test'})
  df = pd.DataFrame(data)
  ax = to_axis2D(ax)
  # Bar plot
  if using_bar:
    sns.barplot(x='Model', y=ylabel, hue='Data', data=df, ax=ax)
  # Box plot
  else:
    sns.boxplot(x='Model', y=ylabel, hue='Data', data=df, ax=ax)
  ax.grid(axis='y', linewidth=1.2, alpha=0.5)
  ax.set_axisbelow(True)
  self.add_figure(title, ax.get_figure())
  return self._log(
      '%s %s(s)' % (title, ctext(time.time() - start_time, 'lightyellow')))

def summary(x, axis=None, shorten=False):
  """ Return string of statistical summary given series `x`
  {#:%s|mi:%s|q1:%s|md:%s|mn:%s|q3:%s|ma:%s|sd:%s}
  """
  if isinstance(x, Iterator):
    x = list(x)
  if isinstance(x, (tuple, list, set)):
    x = np.array(x)
  mean, std = np.mean(x, axis=axis), np.std(x, axis=axis)
  median = np.median(x, axis=axis)
  qu1, qu3 = np.percentile(x, [25, 75], axis=axis)
  min_, max_ = np.min(x, axis=axis), np.max(x, axis=axis)
  s = ""
  if not shorten:
    x = x.ravel()
    samples = ', '.join([str(i)
                         for i in np.random.choice(x, size=min(8, len(x)),
                                                   replace=False).tolist()])
    s += "***** Summary *****\n"
    s += " Min : %s\n" % str(min_)
    s += "1st Qu. : %s\n" % str(qu1)
    s += " Median : %s\n" % str(median)
    s += " Mean : %g\n" % mean
    s += "3rd Qu. : %s\n" % str(qu3)
    s += " Max : %s\n" % str(max_)
    s += "-------------------\n"
    s += " Std : %g\n" % std
    s += "#Samples: %d\n" % len(x)
    s += "Samples : %s\n" % samples
  else:
    s += "{#:%s|mi:%s|q1:%s|md:%s|mn:%s|q3:%s|ma:%s|sd:%s}" % \
        (ctext(len(x), 'cyan'),
         ctext('%g' % min_, 'cyan'),
         ctext('%g' % qu1, 'cyan'),
         ctext('%g' % median, 'cyan'),
         ctext('%g' % mean, 'cyan'),
         ctext('%g' % qu3, 'cyan'),
         ctext('%g' % max_, 'cyan'),
         ctext('%g' % std, 'cyan'))
  return s

def __init__(self, path, read_only=False, override=False):
  path = os.path.abspath(path)
  self.read_only = read_only
  self._readme_info = [ctext('README:', 'yellow'),
                       '------',
                       ' No information!']
  self._readme_path = None
  # flag to check cPickle called with protocol 2
  self._new_args_called = False
  # parse all data from path
  if path is not None:
    if override and os.path.exists(path) and os.path.isdir(path):
      shutil.rmtree(path)
      print('Overridden old dataset at path:', path)
    if os.path.isfile(path) and '.zip' in os.path.basename(path):
      self._load_archive(path,
                         extract_path=path.replace(os.path.basename(path), ''))
    else:
      self._set_path(path, self.read_only)
  else:
    raise ValueError('Invalid path for Dataset: %s' % path)

def downsample_data(*X):
  y = [None] * len(X)
  _ = list(set(x.shape[0] for x in X if x is not None))
  assert len(_) == 1, "Inconsistent shape[0] for X and y"
  num_samples = _[0]
  _RAND = np.random.RandomState(seed=87654321)
  # ====== Downsample if the data is huge ====== #
  if num_samples > 8000:
    print("[Warning] Given: %s; downsample to 8000 samples" %
          ctext(', '.join([str(x.shape) for x in X if x is not None]), 'cyan'))
    ids = _RAND.choice(a=np.arange(0, num_samples),
                       size=8000, replace=False)
    for i, x in enumerate(X):
      if x is not None:
        x = x[ids]
        y[i] = x
  else:
    y = X
  return tuple(y)

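# --- Hypothetical usage sketch for `downsample_data` (not part of the original source) ---
# Arrays sharing the same first dimension are subsampled with the same random
# indices down to at most 8000 rows; `None` entries are passed through unchanged.
X = np.random.rand(20000, 64)
y = np.random.randint(0, 10, size=20000)
X_small, y_small, z = downsample_data(X, y, None)
assert X_small.shape[0] == 8000 and y_small.shape[0] == 8000 and z is None
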
def __str__(self): s = '' s += ctext("<Ivector ", 'yellow') s += "GMM:%s " % self.is_gmm_fitted s += "Tmat:%s\n" % self.is_tmat_fitted if os.path.exists(self.path) and len(os.listdir(self.path)) > 0: # list all model files s += " %s: " % ctext('model', 'cyan') s += ', '.join(['"%s"' % f for f in sorted(os.listdir(self.path)) if 'zstat' not in f and 'fstat' not in f and 'ivec' not in f and 'name_' not in f]) s += '\n' # list all Zero-stats files s += " %s: " % ctext('Z-stats', 'cyan') s += ', '.join(['"%s"' % f for f in sorted(os.listdir(self.path)) if 'zstat' in f]) s += '\n' # list all First-stats files s += " %s: " % ctext('F-stats', 'cyan') s += ', '.join(['"%s"' % f for f in sorted(os.listdir(self.path)) if 'fstat' in f]) s += '\n' # list all Ivec-stats files s += " %s: " % ctext('ivec', 'cyan') s += ', '.join(['"%s"' % f for f in sorted(os.listdir(self.path)) if 'ivec' in f]) s += '\n' # list all Name path files s += " %s: " % ctext('name-list', 'cyan') s += ', '.join(['"%s"' % f for f in sorted(os.listdir(self.path)) if 'name_' in f]) s += '\n' # list all attributes for k, v in sorted(self.__dict__.items(), key=lambda x: x[0]): if is_primitives(v, inc_ndarray=False): s += " %s: %s\n" % (ctext(k, 'cyan'), str(v)) s = s[:-1] + '>' return s
def __init__(self, path, read_only=False, override=False):
  path = os.path.abspath(path)
  self.read_only = read_only
  self._readme_info = [
      ctext('README:', 'yellow'), '------', ' No information!'
  ]
  self._readme_path = None
  # flag to check cPickle called with protocol 2
  self._new_args_called = False
  # parse all data from path
  if path is not None:
    if override and os.path.exists(path) and os.path.isdir(path):
      shutil.rmtree(path)
      print('Overridden old dataset at path:', path)
    if os.path.isfile(path) and '.zip' in os.path.basename(path):
      self._load_archive(path,
                         extract_path=path.replace(os.path.basename(path), ''))
    else:
      self._set_path(path, self.read_only)
  else:
    raise ValueError('Invalid path for Dataset: %s' % path)

def _parse_parameter(x, out_dim, name, info, print_log, padding, **kwargs):
  # ====== parsing distribution ====== #
  if isinstance(info, DistributionDescription):
    if print_log:
      print(padding + " Info:", 'DistributionDescription')
    y = info.set_print_log(print_log).set_padding_log(' ')(x, out_dim, **kwargs)
  # ====== parsing network ====== #
  elif isinstance(info, dict):
    if print_log:
      print(padding + " Info:", str(info))
    y = _network(x, out_dim, name, info, print_log, padding)
    y = _support(y, info, print_log, padding, **kwargs)
    y = _activate(y, info, print_log, padding, **kwargs)
  # ====== just tensor ====== #
  else:
    if print_log:
      print(padding + " Info:", 'Tensor')
    y = tf.convert_to_tensor(info)
  if print_log:
    print(padding + " Output:", ctext(y, 'cyan'))
  return y

def __str__(self):
  s = ctext('============= FeatureProcessor: %s =============' % self.path,
            'yellow') + '\n'
  padding = ' '
  # ====== basic info ====== #
  s += '- Jobs: ' + ctext(len(self.jobs), 'cyan') + '\n'
  s += '- #CPU: ' + ctext(self.n_cpu, 'cyan') + '\n'
  s += '- #Cache: ' + ctext(self.n_cache, 'cyan') + '\n'
  # ====== print pipeline ====== #
  s += ctext("* Pipeline:", 'yellow') + '\n'
  for _, extractor in self.extractor.steps:
    for line in str(extractor).split('\n'):
      s += padding + ' ' + line + '\n'
  # ====== print config ====== #
  s += ctext("* Configurations:", 'yellow') + '\n'
  for i, j in self.config.items():
    s += padding + str(i) + ' : ' + str(j) + '\n'
  return s

def _activate(y, info, print_log, padding, **kwargs):
  fn = info.get('fn', lambda x: x)
  args = []
  args_name = []
  for p_name, p_val in inspect.signature(fn).parameters.items():
    if p_name in kwargs:
      p_val = kwargs[p_name]
    else:
      p_val = p_val.default
    args.append(p_val)
    args_name.append(p_name)
  args[0] = y
  for a_name, a_val in zip(args_name, args):
    if a_val == inspect.Parameter.empty:
      raise RuntimeError("Cannot extract value for argument name: '%s'" % a_name)
  # print out log
  if print_log:
    print(padding + " activation: <%s>(%s)" % (fn.__name__, '; '.join(
        [ctext(i, 'cyan') + ':' + str(j)
         for i, j in zip(args_name, args)])))
  y = fn(*args)
  return y

def _activate(y, info, print_log, padding, **kwargs):
  fn = info.get('fn', lambda x: x)
  args = []
  args_name = []
  for p_name, p_val in inspect.signature(fn).parameters.items():
    if p_name in kwargs:
      p_val = kwargs[p_name]
    else:
      p_val = p_val.default
    args.append(p_val)
    args_name.append(p_name)
  args[0] = y
  for a_name, a_val in zip(args_name, args):
    if a_val == inspect.Parameter.empty:
      raise RuntimeError("Cannot extract value for argument name: '%s'" % a_name)
  # print out log
  if print_log:
    print(padding + " activation: <%s>(%s)" % (
        fn.__name__,
        '; '.join([ctext(i, 'cyan') + ':' + str(j)
                   for i, j in zip(args_name, args)])))
  y = fn(*args)
  return y

def __str__(self):
  s = ctext(
      '============= FeatureProcessor: %s =============' % self.path,
      'yellow') + '\n'
  padding = ' '
  # ====== basic info ====== #
  s += '- Jobs: ' + ctext(len(self.jobs), 'cyan') + '\n'
  s += '- #CPU: ' + ctext(self.n_cpu, 'cyan') + '\n'
  s += '- #Cache: ' + ctext(self.n_cache, 'cyan') + '\n'
  # ====== print pipeline ====== #
  s += ctext("* Pipeline:", 'yellow') + '\n'
  for _, extractor in self.extractor.steps:
    for line in str(extractor).split('\n'):
      s += padding + ' ' + line + '\n'
  # ====== print config ====== #
  s += ctext("* Configurations:", 'yellow') + '\n'
  for i, j in self.config.items():
    s += padding + str(i) + ' : ' + str(j) + '\n'
  return s

ids = np.random.permutation(len(X_train))
X_train, y_train = X_train[ids], y_train[ids]
X_valid, y_valid = X_train[40000:], y_train[40000:]
X_train, y_train = X_train[:40000], y_train[:40000]
# normalize value to [0, 1]
X_train = X_train / 255.
X_valid = X_valid / 255.
X_test = X_test / 255.
print(ds)
# ====== others ====== #
X_samples, y_samples = X_train[:25], y_train[:25]
input_shape = ds['X_train'].shape
input_ndim = len(input_shape)
print("Train shape:", ctext(X_train.shape, 'cyan'))
print("Valid shape:", ctext(X_valid.shape, 'cyan'))
print("Test shape:", ctext(X_test.shape, 'cyan'))
# ====== create basic tensor ====== #
X = K.placeholder(shape=(None,) + input_shape[1:], name='X_input')
y = K.placeholder(shape=(None,), name='y_input')
# ===========================================================================
# Create the network
# ===========================================================================
LATENT_DROPOUT = 0.3
if args.cnn:
  with N.args_scope(
      ([N.Conv, N.Dense], dict(b_init=None, activation=K.linear)),
      (N.BatchNorm, dict(activation=tf.nn.elu)),
      (N.Pool, dict(mode='max', pool_size=2))):
    f_encoder = N.Sequence([
        N.Dropout(level=0.5),

def event(self, event_type):
  print(ctext("[Debug] Event:", 'cyan'), event_type)

def task_end(self, task, task_results):
  print(ctext("Task End:", 'cyan'),
        task.name, task.curr_epoch, task.curr_samples,
        [(i, [(n, len(v), type(v[0])) for n, v in j.items()])
         for i, j in task_results.items()])

def task_start(self, task):
  print(ctext("Task Start:", 'cyan'),
        task.name, task.curr_epoch, task.curr_samples)

def epoch_end(self, task, epoch_results):
  print(ctext("Epoch End:", 'cyan'),
        task.name, task.curr_epoch, task.curr_samples,
        [(i, len(j), type(j[0])) for i, j in epoch_results.items()])

# assumption: `os` and `odin.fuel` (as F) are used below but were not shown in
# this fragment; these two imports are added here for completeness.
import os

from odin import fuel as F
from odin import nnet as N, backend as K
from odin import visual as V
from odin.utils import (ctext, mpi, Progbar, catch_warnings_ignore,
                        stdio, get_logpath)

from helpers import (FEATURE_RECIPE, FEATURE_NAME, PATH_ACOUSTIC_FEATURES,
                     MINIMUM_UTT_DURATION, ANALYSIS_DIR, Config,
                     filter_utterances, prepare_dnn_data)
# ====== prepare log ====== #
stdio(get_logpath(name="analyze_data.log", increasing=True,
                  odin_base=False, root=ANALYSIS_DIR))
print(ctext(FEATURE_RECIPE, 'lightyellow'))
print(ctext(FEATURE_NAME, 'lightyellow'))
assert os.path.isdir(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE))
# ====== essential path ====== #
figure_path = os.path.join(
    ANALYSIS_DIR,
    '%s_%s.pdf' % (FEATURE_RECIPE.replace('_', ''), FEATURE_NAME))
print(ctext(figure_path, 'lightyellow'))
# ===========================================================================
# Load the data
# ===========================================================================
ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE),
               read_only=True)
X = ds[FEATURE_NAME]
# remove all noise data
indices = {

def send_notification(self, msg):
  if self._log:
    add_notification(
        '[%s] %s' % (ctext(self.__class__.__name__, 'magenta'), msg))
  return self

def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
  """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speaker in training set
  """
  # Load dataset
  frame_length = int(utt_length / FRAME_SHIFT)
  ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
  X = ds[feat]
  train_indices = {name: ds['indices'][name] for name in TRAIN_DATA.keys()}
  test_indices = {name: start_end
                  for name, start_end in ds['indices'].items()
                  if name not in TRAIN_DATA}
  train_indices, valid_indices = train_valid_test_split(
      x=list(train_indices.items()), train=0.9, inc_test=False, seed=seed)
  all_speakers = sorted(set(TRAIN_DATA.values()))
  n_speakers = max(all_speakers) + 1
  print("#Train files:", ctext(len(train_indices), 'cyan'))
  print("#Valid files:", ctext(len(valid_indices), 'cyan'))
  print("#Test files:", ctext(len(test_indices), 'cyan'))
  print("#Speakers:", ctext(n_speakers, 'cyan'))
  recipes = [
      F.recipes.Sequencing(frame_length=frame_length,
                           step_length=frame_length,
                           end='pad', pad_value=0, pad_mode='post',
                           data_idx=0),
      F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
      F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
  ]
  train_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=train_indices),
      batch_mode='batch', ncpu=7, buffer_size=12)
  valid_feeder = F.Feeder(
      data_desc=F.IndexedData(data=X, indices=valid_indices),
      batch_mode='batch', ncpu=2, buffer_size=4)
  train_feeder.set_recipes(recipes)
  valid_feeder.set_recipes(recipes)
  print(train_feeder)
  # ====== cache the test data ====== #
  cache_dat = os.path.join(PATH_EXP,
                           'test_%s_%d.dat' % (feat, int(utt_length)))
  cache_ids = os.path.join(PATH_EXP,
                           'test_%s_%d.ids' % (feat, int(utt_length)))
  # validate cache files
  if os.path.exists(cache_ids):
    with open(cache_ids, 'rb') as f:
      ids = pickle.load(f)
    if len(ids) != len(test_indices):
      os.remove(cache_ids)
      if os.path.exists(cache_dat):
        os.remove(cache_dat)
  elif os.path.exists(cache_dat):
    os.remove(cache_dat)
  # caching
  if not os.path.exists(cache_dat):
    dat = F.MmapData(cache_dat, dtype='float16',
                     shape=(0, frame_length, X.shape[1]))
    ids = {}
    prog = Progbar(target=len(test_indices))
    s = 0
    for name, (start, end) in test_indices.items():
      y = X[start:end]
      y = segment_axis(y, axis=0,
                       frame_length=frame_length, step_length=frame_length,
                       end='pad', pad_value=0, pad_mode='post')
      dat.append(y)
      # update indices
      ids[name] = (s, s + len(y))
      s += len(y)
      # update progress
      prog.add(1)
    dat.flush()
    dat.close()
    with open(cache_ids, 'wb') as f:
      pickle.dump(ids, f)
  # ====== re-load ====== #
  dat = F.MmapData(cache_dat, read_only=True)
  with open(cache_ids, 'rb') as f:
    ids = pickle.load(f)
  # ====== save some sample ====== #
  sample_path = os.path.join(PATH_EXP,
                             'test_%s_%d.pdf' % (feat, int(utt_length)))
  V.plot_figure(nrow=9, ncol=6)
  for i, (name, (start, end)) in enumerate(
      sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                    k=12, seed=87654321)):
    x = dat[start:end][:].astype('float32')
    ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                            ax=(12, 1, i + 1), title='')
    ax.set_title(name)
  V.plot_save(sample_path)
  return (train_feeder, valid_feeder, ids, dat, all_speakers)

NUM_DIM = 3
colors = ['r', 'b', 'g']
markers = ["o", "^", "s"]
SEED = K.get_rng().randint(0, 10e8)
# ===========================================================================
# Load dataset
# ===========================================================================
ds = F.IRIS.load()
print(ds)

nb_samples = ds['X'].shape[0]
ids = K.get_rng().permutation(nb_samples)
X = ds['X'][ids]
y = ds['y'][ids]
labels = ds['name']
print("Labels:", ctext(labels))
assert len(colors) == len(labels) and len(markers) == len(labels)

X_train = X[:int(TRAINING_PERCENT * nb_samples)]
y_train = y[:int(TRAINING_PERCENT * nb_samples)]
y_train_color = [colors[i] for i in y_train]
y_train_marker = [markers[i] for i in y_train]

X_score = X[int(TRAINING_PERCENT * nb_samples):]
y_score = y[int(TRAINING_PERCENT * nb_samples):]
y_score_color = [colors[i] for i in y_score]
y_score_marker = [markers[i] for i in y_score]

print("Train:", X_train.shape, y_train.shape)
print("Score:", X_score.shape, y_score.shape)

def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
  """
  Returns (i-vector)
  ------------------
  ds[feat]
  train_files
  y_train
  test_files
  y_test
  labels

  Returns (x-vector)
  ------------------
  train : Feeder
      feeder for training data for iterating over pair of (X, y)
  valid : Feeder
      feeder for validating data for iterating over pair of (X, y)
  X_test_name : list of file names
      file names are appended with '.%d' for cut segment ID
  X_test_true : list of integer
      label of each sample
  X_test_data : array
      list of test data same length as X_test_name
  labels : list of string
      list of labels for classification task

  Example
  -------
  (train, valid,
   X_test_name, X_test_true, X_test_data,
   labels) = prepare_data_dnn(feat=FEAT, label='gender')
  """
  label = str(label).lower()
  assert label in _support_label, "No support for label: %s" % label
  assert 0 < utt_length <= 1.
  # ====== load dataset ====== #
  if not os.path.exists(PATH_ACOUSTIC):
    raise RuntimeError(
        "Cannot find extracted acoustic features at path: '%s',"
        "run the code speech_features_extraction.py!" % PATH_ACOUSTIC)
  ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
  assert feat in ds, "Cannot find feature with name: %s" % feat
  indices = list(ds['indices'].items())
  K.get_rng().shuffle(indices)

  # ====== helper ====== #
  def is_train(x):
    return x.split('_')[0] == 'train'

  def extract_label(x):
    return x.split('_')[_support_label[label]]

  print("Task:", ctext(label, 'cyan'))
  fn_label, labels = unique_labels([i[0] for i in indices],
                                   key_func=extract_label,
                                   return_labels=True)
  print("Labels:", ctext(labels, 'cyan'))
  # ====== training and test data ====== #
  train_files = []  # (name, (start, end)) ...
  test_files = []
  for name, (start, end) in indices:
    if is_train(name):
      train_files.append((name, (start, end)))
    else:
      test_files.append((name, (start, end)))
  # name for each dataset, useful for later
  print("#Train:", ctext(len(train_files), 'cyan'))
  print("#Test:", ctext(len(test_files), 'cyan'))
  # ====== for i-vectors ====== #
  y_train = np.array([fn_label(i[0]) for i in train_files])
  y_test = np.array([fn_label(i[0]) for i in test_files])
  if bool(for_ivec):
    return ds[feat], train_files, y_train, test_files, y_test, labels
  # ====== length ====== #
  length = [(end - start) for _, (start, end) in indices]
  max_length = max(length)
  frame_length = int(max_length * utt_length)
  step_length = frame_length
  print("Max length :", ctext(max_length, 'yellow'))
  print("Frame length:", ctext(frame_length, 'yellow'))
  print("Step length :", ctext(step_length, 'yellow'))
  # ====== split dataset ====== #
  # split by speaker ID
  train_files, valid_files = train_valid_test_split(
      x=train_files, train=0.8,
      cluster_func=None,
      idfunc=lambda x: x[0].split('_')[4],  # splited by speaker
      inc_test=False)
  print("#File train:", ctext(len(train_files), 'cyan'))
  print("#File valid:", ctext(len(valid_files), 'cyan'))
  print("#File test :", ctext(len(test_files), 'cyan'))

  recipes = [
      F.recipes.Sequencing(frame_length=frame_length,
                           step_length=step_length,
                           end='pad', pad_mode='post', pad_value=0),
      F.recipes.Name2Label(converter_func=fn_label),
      F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
  ]
  feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                          ncpu=6, batch_mode='batch')
  feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                          ncpu=4, batch_mode='batch')
  feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                         ncpu=4, batch_mode='file')
  feeder_train.set_recipes(recipes)
  feeder_valid.set_recipes(recipes)
  feeder_test.set_recipes(recipes)
  print(feeder_train)

  # ====== process X_test, y_test in advance for faster evaluation ====== #
  @cache_disk
  def _extract_test_data(feat, label, utt_length):
    prog = Progbar(target=len(feeder_test),
                   print_summary=True,
                   name="Preprocessing test set")
    X_test = defaultdict(list)
    for name, idx, X, y in feeder_test:
      # validate everything as expected
      assert fn_label(name) == np.argmax(y), name  # label is right
      # save to list
      X_test[name].append((idx, X))
      prog.add(X.shape[0])
    # ====== create 1 array for data and dictionary for indices ====== #
    X_test_name = []
    X_test_data = []
    for name, X in X_test.items():
      X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                         axis=0).astype('float16')
      X_test_name += [name + '.%d' % i for i in range(len(X))]
      X_test_data.append(X)
    X_test_name = np.array(X_test_name)
    X_test_data = np.concatenate(X_test_data, axis=0)
    return X_test_name, X_test_data

  # convert everything back to float32
  X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
  X_test_true = np.array([fn_label(i.split('.')[0]) for i in X_test_name])
  return feeder_train, feeder_valid, \
      X_test_name, X_test_true, X_test_data, labels

# ====== load data feeder ====== #
(train, valid,
 X_test_name, X_test_true, X_test_data,
 labels) = prepare_data(feat=args.feat, label=args.task)
n_classes = len(labels)
# ===========================================================================
# Create model
# ===========================================================================
inputs = [
    K.placeholder(shape=(None,) + shape[1:],
                  dtype='float32',
                  name='input%d' % i)
    for i, shape in enumerate(as_tuple_of_shape(train.shape))
]
X = inputs[0]
y = inputs[1]
print("Inputs:", ctext(inputs, 'cyan'))
# ====== create the networks ====== #
with N.args_scope(
    [('Conv', 'Dense'), dict(b_init=None, activation=K.linear, pad='same')],
    ['BatchNorm', dict(activation=K.relu)]):
  f = N.Sequence([
      N.Dimshuffle(pattern=(0, 1, 2, 'x')),

      N.Conv(num_filters=32, filter_size=(9, 7)), N.BatchNorm(),
      N.Pool(pool_size=(3, 2), strides=2),
      N.Conv(num_filters=64, filter_size=(5, 3)), N.BatchNorm(),
      N.Pool(pool_size=(3, 1), strides=(2, 1), name='PoolOutput1'),
      N.Conv(num_filters=64, filter_size=(5, 3)), N.BatchNorm(),
      N.Pool(pool_size=(3, 2), strides=(2, 2), name='PoolOutput2'),

def batch_end(self, task, batch_results):
  print(ctext("Batch End:", 'cyan'),
        task.name, task.curr_epoch, task.curr_samples,
        [(i.shape, i.dtype, type(i)) for i in as_tuple(batch_results)])

def dataset_statistics(dsname):
  ids = {
      name: (start, end)
      for name, (start, end) in indices.items()
      if ds['dsname'][name] == dsname
  }
  name2spk = {name: ds['spkid'][name] for name in ids.keys()}
  s = []
  s.append('=' * 12 + ctext('%-12s' % dsname, 'lightyellow') + '=' * 12)
  s.append('#Files :' + ctext(len(ids), 'cyan'))
  s.append("#Speakers:" + ctext(len(set(name2spk.values())), 'cyan'))
  # ====== mean and std ====== #
  sum1 = 0.
  sum2 = 0.
  n = 0
  spk_sum1 = defaultdict(float)
  spk_sum2 = defaultdict(float)
  spk_n = defaultdict(int)
  for name, (start, end) in ids.items():
    spkid = name2spk[name]
    n += end - start
    spk_n[spkid] += end - start
    x = X[start:end][:].astype('float64')
    s1 = np.sum(x, axis=0)
    s2 = np.sum(x**2, axis=0)
    sum1 += s1
    sum2 += s2
    spk_sum1[spkid] += s1
    spk_sum2[spkid] += s2
  data_mean = sum1 / n
  data_std = np.sqrt(sum2 / n - data_mean**2)

  spk_stats = {}
  for spkid in name2spk.values():
    n = spk_n[spkid]
    s1, s2 = spk_sum1[spkid], spk_sum2[spkid]
    mean = s1 / n
    std = np.sqrt(s2 / n - mean**2)
    spk_stats[spkid] = (mean, std)
  spk_mean = np.concatenate([x[0][None, :] for x in spk_stats.values()],
                            axis=0).mean(0)
  spk_std = np.concatenate([x[1][None, :] for x in spk_stats.values()],
                           axis=0).mean(0)
  # ====== utterances length ====== #
  all_length = np.array([(end - start) * Config.STEP_LENGTH
                         for start, end in indices.values()])
  # ====== speaker - utterance relation ====== #
  nutt_per_spk = defaultdict(int)
  dur_per_spk = defaultdict(list)
  for name, (start, end) in ids.items():
    spkid = name2spk[name]
    nutt_per_spk[spkid] += 1
    dur_per_spk[spkid].append((end - start) * Config.STEP_LENGTH)
  all_spk = sorted(nutt_per_spk.keys())
  spk_df = pd.DataFrame(
      data={
          'nutt_per_spk': [nutt_per_spk[spk] for spk in all_spk],
          'sum_per_spk': [np.sum(dur_per_spk[spk]) for spk in all_spk],
          'mean_per_spk': [np.mean(dur_per_spk[spk]) for spk in all_spk],
      })
  return (dsname, '\n'.join(s), (all_length, spk_df),
          (data_mean, data_std), (spk_mean, spk_std))

def epoch_start(self, task, data):
  print(ctext("Epoch Start:", 'cyan'),
        task.name, task.curr_epoch, task.curr_samples,
        [(i.shape, i.dtype, type(i)) for i in data])
