Example #1
File: recipe_base.py Project: imito/odin
 def __str__(self):
   # ====== get all attrs ====== #
   all_attrs = dir(self)
   print_attrs = {}
   for name in all_attrs:
     if '_' != name[0] and (len(name) >= 2 and '__' != name[:2]) and\
     name not in ('nb_desc',):  # must be a tuple; ('nb_desc') is just a str
       attr = getattr(self, name)
       if name == 'data_idx':
         print_attrs[name] = str(attr)
       elif isinstance(attr, slice):
         print_attrs[name] = str(attr)
       elif inspect.isfunction(attr):
         print_attrs[name] = "(f)" + attr.__name__
       elif isinstance(attr, np.ndarray):
         print_attrs[name] = ("(%s)" % str(attr.dtype)) + \
             str(attr.shape)
       elif isinstance(attr, (tuple, list)):
         print_attrs[name] = "(list)" + str(len(attr))
       elif isinstance(attr, Mapping):
         print_attrs[name] = "(map)" + str(len(attr))
       elif is_primitives(attr):
         print_attrs[name] = str(attr)
   print_attrs = sorted(print_attrs.items(), key=lambda x: x[0])
   print_attrs = [('#desc', self.nb_desc)] + print_attrs
   print_attrs = ' '.join(["%s:%s" % (ctext(key, 'yellow'), val)
                           for key, val in print_attrs])
   # ====== format the output ====== #
   s = '<%s %s>' % (ctext(self.__class__.__name__, 'cyan'), print_attrs)
   return s
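Note: every excerpt on this page relies on ctext from odin.utils to colorize terminal output, but the helper itself never appears in the excerpts. A minimal stand-in for experimenting with the snippets, assuming ANSI escape codes and a made-up subset of the color names used below:

# Minimal stand-in for odin.utils.ctext (an assumption, not odin's code):
# wrap text in an ANSI color escape, fall back to plain str for unknown names.
_ANSI_COLORS = {'red': 31, 'green': 32, 'yellow': 33, 'blue': 34,
                'magenta': 35, 'cyan': 36, 'lightyellow': 93, 'lightcyan': 96}

def ctext(text, color=None):
  code = _ANSI_COLORS.get(color)
  if code is None:
    return str(text)
  return '\x1b[%dm%s\x1b[0m' % (code, text)

print(ctext('MiniBatchPCA', 'yellow'), ctext(0.95, 'cyan'))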
Example #2
File: processor.py Project: imito/odin
 def logger(title, tag, check):
   check = bool(check)
   text_color = 'yellow' if check else 'red'
   print(ctext('   *', 'cyan'),
         ctext(str(title), text_color),
         ctext(str(tag), 'magenta'),
         ctext("✓", text_color) if check else ctext("✗", text_color))
Example #3
File: analyze.py Project: imito/odin
def evaluate_prediction(name_list, y_pred, y_true, title):
  def _report(y_p, y_t, pad=''):
    with catch_warnings_ignore(Warning):
      z_ = np.concatenate(y_p, axis=0)
      z = np.concatenate(y_t, axis=0)
      print(pad, '*** %s ***' % ctext('Frame-level', 'lightcyan'))
      print(pad, "#Samples:", ctext(len(z), 'cyan'))
      print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
      print(pad, "Accuracy:", accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

      z_ = np.concatenate([np.mean(i, axis=0, keepdims=True) for i in y_p],
                          axis=0)
      z = np.array([i[0] for i in y_t])
      print(pad, '*** %s ***' % ctext('Utterance-level', 'lightcyan'))
      print(pad, "#Samples:", ctext(len(z), 'cyan'))
      print(pad, "Log loss:", log_loss(y_true=z, y_pred=z_, labels=labels))
      print(pad, "Accuracy:", accuracy_score(y_true=z, y_pred=np.argmax(z_, axis=-1)))

  datasets_2_samples = defaultdict(list)
  for name, y_p, y_t in zip(name_list, y_pred, y_true):
    dsname = ds['dsname'][name]
    datasets_2_samples[dsname].append((name, y_p, y_t))

  print('=' * 12, ctext(title, 'lightyellow'), '=' * 12)
  _report(y_p=y_pred, y_t=y_true)

  for dsname, data in sorted(datasets_2_samples.items(),
                             key=lambda x: x[0]):
    print(ctext(dsname, 'yellow'), ':')
    y_pred = [i[1] for i in data]
    y_true = [i[2] for i in data]
    _report(y_p=y_pred, y_t=y_true, pad='  ')
Example #4
  def __call__(self, x=None, out_dim=None, n_eventdim=0, **kwargs):
    n_eventdim = int(n_eventdim)
    if x is not None:
      x = tf.convert_to_tensor(x)
    if out_dim is not None:
      out_dim = int(out_dim)
    print_log = self._print_log
    padding = self._padding

    if print_log:
      print(padding + ctext("Parsing distribution:", 'lightyellow'),
        '%s/%s' % (ctext(self.normalized_name, 'lightcyan'),
                   ctext(self.distribution.__name__, 'cyan')))

    args = {}
    for p_name, p_val in self.get_ordered_arguments():
      if print_log:
        print(padding + " Parsing parameter:", ctext(p_name, 'cyan'))
      p_val = _parse_parameter(x, out_dim,
                               p_name, p_val,
                               print_log=print_log,
                               padding=padding,
                               **kwargs)
      args[p_name] = p_val

    dist = self.distribution(**args)
    if n_eventdim > 0:
      dist = tfd.Independent(distribution=dist,
                             reinterpreted_batch_ndims=n_eventdim)
    if print_log:
      print(' Distribution:', ctext(dist, 'cyan'))
    self._print_log = False
    self._padding = ''
    return dist
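The n_eventdim argument above reinterprets trailing batch dimensions as event dimensions through tfd.Independent. A minimal check of that behavior, assuming tensorflow_probability is installed:

import tensorflow_probability as tfp
tfd = tfp.distributions

d = tfd.Normal(loc=[[0., 0.], [0., 0.]], scale=1.)
print(d.batch_shape, d.event_shape)    # (2, 2) ()
d2 = tfd.Independent(d, reinterpreted_batch_ndims=1)
print(d2.batch_shape, d2.event_shape)  # (2,) (2,)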
Example #5
File: trainer.py Project: imito/odin
 def __str__(self):
   return "<Task:'%s' p:%s bs:%s #ep:%s/%s #it:%s/%s #n:%s/%s %s>" % \
   (ctext(self.name, 'lightyellow'),
    ctext(self.probability, 'cyan'),
    ctext(self.batch_size, 'cyan'),
    ctext(self.curr_epoch, 'lightcyan'), ctext(self.nb_epoch, 'cyan'),
    ctext(self.curr_epoch_iter, 'lightcyan'), ctext(self.curr_iter, 'cyan'),
    ctext(self.curr_epoch_samples, 'lightcyan'), ctext(self.curr_samples, 'cyan'),
    ','.join([ctext(i.__class__.__name__, 'cyan')
              for i in self._callback._callbacks]))
Example #6
 def __str__(self):
   s = ["Distribution: %s/%s" % (ctext(self.normalized_name, 'lightcyan'),
                                 ctext(self.distribution.__name__, 'cyan'))]
   for name, val in self.get_ordered_arguments():
     if isinstance(val, DistributionDescription):
       s.append('    %s: ' % name)
       s.append('\n'.join(['      ' + line for line in str(val).split('\n')]))
     else:
       s.append('    %s: %s' % (name, ctext(val, 'cyan')))
   return '\n'.join(s)
Example #7
File: helpers.py Project: imito/odin
def get_model_path(system_name, logging=True):
  """
  Parameters
  ----------
  args_name : list of string
    names of the parsed arguments that are taken into account when
    building the model name

  Return
  ------
  exp_dir, model_path, log_path
  """
  args_name = []
  if system_name == 'xvec':
    args_name += ['utt', 'seq']
  elif system_name == 'ivec':
    args_name += ['nmix', 'tdim']
  else:
    raise ValueError("No support for system with name: %s" % system_name)
  args_name += ['mindur', 'minutt']
  # ====== base system and feature identity ====== #
  name = str(system_name).lower()
  name += '_' + FEATURE_RECIPE.replace('_', '')
  name += '.' + FEATURE_NAME
  # ====== concat the attributes ====== #
  attributes = []
  for i in [str(i) for i in args_name]:
    attributes.append(str(getattr(_args, i)))
  attributes = '_'.join(attributes)
  name += '.' + attributes
  # ====== check the exclude dataset ====== #
  excluded_dataset = str(_args.exclude).strip()
  if len(excluded_dataset) > 0:
    dataset_str = []
    for excluded in sorted(set(excluded_dataset.split(','))):
      assert excluded in sre_file_list or excluded == 'noise', \
      "Unknown excluded dataset with name: '%s'" % excluded
      dataset_str.append(excluded)
    dataset_str = '_'.join(dataset_str)
    name += '.' + dataset_str
  # ====== check save_path ====== #
  save_path = os.path.join(EXP_DIR, name)
  if os.path.exists(save_path) and IS_OVERRIDE:
    print("Override path:", ctext(save_path, 'yellow'))
    shutil.rmtree(save_path)
  if not os.path.exists(save_path):
    os.mkdir(save_path)
  # ====== return path ====== #
  log_path = get_logpath(name='log.txt', increasing=True,
                         odin_base=False, root=save_path)
  model_path = os.path.join(save_path, 'model.ai')
  if bool(logging):
    print("Model path:", ctext(model_path, 'cyan'))
    print("Log path:", ctext(log_path, 'cyan'))
  return save_path, model_path, log_path
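The resulting model name concatenates system, recipe, feature, argument values, and excluded datasets. A sketch of the scheme with made-up values (recipe 'mspec24', feature 'mspec', utt=3, seq=8, 'noise' excluded):

name = 'xvec' + '_' + 'mspec24'.replace('_', '') + '.' + 'mspec'
name += '.' + '_'.join(['3', '8'])         # argument values, in order
name += '.' + '_'.join(sorted({'noise'}))  # excluded datasets, if any
print(name)  # xvec_mspec24.mspec.3_8.noise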
Example #8
def dataset_statistics(dsname):
  ids = {name: (start, end)
         for name, (start, end) in indices.items()
         if ds['dsname'][name] == dsname}
  name2spk = {name: ds['spkid'][name]
              for name in ids.keys()}
  s = []
  s.append('=' * 12 + ctext('%-12s' % dsname, 'lightyellow') + '=' * 12)
  s.append('#Files   :' + ctext(len(ids), 'cyan'))
  s.append("#Speakers:" + ctext(len(set(name2spk.values())), 'cyan'))
  # ====== mean and std ====== #
  sum1 = 0.
  sum2 = 0.
  n = 0
  spk_sum1 = defaultdict(float)
  spk_sum2 = defaultdict(float)
  spk_n = defaultdict(int)
  for name, (start, end) in ids.items():
    spkid = name2spk[name]
    n += end - start; spk_n[spkid] += end - start
    x = X[start:end][:].astype('float64')
    s1 = np.sum(x, axis=0); s2 = np.sum(x**2, axis=0)
    sum1 += s1; sum2 += s2
    spk_sum1[spkid] += s1; spk_sum2[spkid] += s2
  data_mean = sum1 / n
  data_std = np.sqrt(sum2 / n - data_mean ** 2)

  spk_stats = {}
  for spkid in name2spk.values():
    n = spk_n[spkid]
    s1, s2 = spk_sum1[spkid], spk_sum2[spkid]
    mean = s1 / n
    std = np.sqrt(s2 / n - mean ** 2)
    spk_stats[spkid] = (mean, std)
  spk_mean = np.concatenate([x[0][None, :] for x in spk_stats.values()],
                            axis=0).mean(0)
  spk_std = np.concatenate([x[1][None, :] for x in spk_stats.values()],
                           axis=0).mean(0)
  # ====== utterances length ====== #
  # note: iterate `ids` (this dataset only), not the global `indices`
  all_length = np.array([(end - start) * Config.STEP_LENGTH
                         for start, end in ids.values()])
  # ====== speaker - utterance relation ====== #
  nutt_per_spk = defaultdict(int)
  dur_per_spk = defaultdict(list)
  for name, (start, end) in ids.items():
    spkid = name2spk[name]
    nutt_per_spk[spkid] += 1
    dur_per_spk[spkid].append((end - start) * Config.STEP_LENGTH)
  all_spk = sorted(nutt_per_spk.keys())
  spk_df = pd.DataFrame(data={'nutt_per_spk': [nutt_per_spk[spk] for spk in all_spk],
                              'sum_per_spk': [np.sum(dur_per_spk[spk]) for spk in all_spk],
                              'mean_per_spk': [np.mean(dur_per_spk[spk]) for spk in all_spk],
                        })
  return dsname, '\n'.join(s), (all_length, spk_df), (data_mean, data_std), (spk_mean, spk_std)
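The per-dataset and per-speaker statistics above are accumulated in a single pass using the identity Var[x] = E[x^2] - E[x]^2; note this one-pass formula can lose precision when the mean is large relative to the variance. A quick check of the identity against numpy:

import numpy as np

x = np.random.RandomState(0).randn(1000, 4)
s1, s2, n = x.sum(axis=0), (x ** 2).sum(axis=0), len(x)
mean = s1 / n
std = np.sqrt(s2 / n - mean ** 2)  # E[x^2] - E[x]^2
assert np.allclose(mean, x.mean(axis=0))
assert np.allclose(std, x.std(axis=0))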
Example #9
File: feeder.py Project: imito/odin
 def __str__(self):
   name = ctext('IndexedData', 'cyan')
   s = '<%s: Indices(type:"%s" length:%d)>\n' % \
       (name, self.indices_info[0], len(self.indices))
   for dat in self.data:
     s += '   (%s)%s: %s %s\n' % \
         (dat.__class__.__name__,
             ctext(str(dat.data_info), 'yellow'),
             dat.shape,
             str(dat.dtype))
   return s[:-1]
Example #10
File: utils.py Project: imito/odin
def prepare_ivec_data(recipe, feat):
  ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe),
                 read_only=True)
  X = ds[feat]
  train_indices = {name: ds['indices'][name]
                   for name in TRAIN_DATA.keys()}
  test_indices = {name: start_end
                  for name, start_end in ds['indices'].items()
                  if name not in TRAIN_DATA}
  print("#Train files:", ctext(len(train_indices), 'cyan'))
  print("#Test files:", ctext(len(test_indices), 'cyan'))
  return X, train_indices, test_indices
Example #11
 def __str__(self):
   if self.is_fitted:
     explained_vars = ';'.join([ctext('%.2f' % i, 'cyan')
                                for i in self.explained_variance_ratio_[:8]])
   else:
     explained_vars = 0
   s = '%s(batch_size=%s, #components=%s, #samples=%s, vars=%s)' % \
       (ctext('MiniBatchPCA', 'yellow'),
        ctext(self.batch_size, 'cyan'),
        ctext(self.n_components, 'cyan'),
        ctext(self.n_samples_seen_, 'cyan'),
        explained_vars)
   return s
Example #12
File: trainer.py Project: imito/odin
 def _save(self, is_best):
   is_best = bool(is_best)
   # trigger event for callbacks
   self._callback.event(TrainSignal.SAVE_BEST
                        if is_best else
                        TrainSignal.SAVE)
   # ====== save the model to hard drive ====== #
   if self._save_path is not None:
     # serialize the best model to disk
     if is_best:
       final_save_path = self._save_path
       N.serialize(nnops=self._save_obj, path=self._save_path,
                   save_variables=True, variables=self._save_variables,
                   binary_output=False, override=True)
     # not the best model saved, just periodically saving
     else:
       final_save_path = self._save_path + '.%d' % self._current_checkpoint_count
       N.serialize(nnops=self._save_obj,
                   path=final_save_path,
                   save_variables=True, variables=self._save_variables,
                   binary_output=False, override=True)
       self._current_checkpoint_count += 1
       if self._checkpoint_max > 1 and self._current_checkpoint_count > self._checkpoint_max:
         shutil.rmtree(
             self._save_path + '.%d' %
             (self._current_checkpoint_count - self._checkpoint_max - 1))
     # print the log
     self._show_noti("[%s] Creating %scheckpoint at: %s" %
                     (ctext('MainLoop', 'red'),
                      ctext('[best]', 'yellow') if is_best else '',
                      final_save_path))
     # save history
     if self._save_history:
       with open(final_save_path + '.hist', 'wb') as f:
         pickle.dump(self.history, f)
       self._show_noti("[%s] Save history at: %s" %
                       (ctext('MainLoop', 'red'),
                        final_save_path + '.hist'))
   # ====== store the object directly in RAM (only for the best) ====== #
   elif bool(is_best) and \
   (self._save_obj is not None or len(self._save_variables) > 0):
     del self._best_object
     self._best_object = N.serialize(
         self._save_obj, path=None, save_variables=True,
         variables=self._save_variables, binary_output=True)
     mem_size = sum(len(v) for k, v in self._best_object.items()) / 1024 / 1024
     self._show_noti(
         "[%s] Creating dynamic checkpoint in RAM using %.2f (megabytes)" %
         (ctext('MainLoop', 'red'), mem_size))
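The periodic branch above keeps a rolling window of at most _checkpoint_max checkpoints: after saving checkpoint `count` and incrementing the counter, checkpoint `count - checkpoint_max - 1` is deleted. The arithmetic, isolated (names are illustrative):

checkpoint_max = 3
kept = []
for count in range(6):          # plays the role of _current_checkpoint_count
  kept.append('model.ai.%d' % count)
  count += 1
  if checkpoint_max > 1 and count > checkpoint_max:
    kept.remove('model.ai.%d' % (count - checkpoint_max - 1))
print(kept)  # ['model.ai.3', 'model.ai.4', 'model.ai.5']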
Example #13
File: helpers.py Project: imito/odin
 def summary_indices(ids):
   datasets = defaultdict(int)
   speakers = defaultdict(list)
   text = ''
   for name in sorted(ids.keys()):
     text += name + str(ids[name])
     dsname = ds['dsname'][name]
     datasets[dsname] += 1
     speakers[dsname].append(ds['spkid'][name])
   for dsname in sorted(datasets.keys()):
     print('  %-18s: %s(utt) %s(spk)' % (
         dsname,
         ctext('%6d' % datasets[dsname], 'cyan'),
         ctext(len(set(speakers[dsname])), 'cyan')))
   print('  MD5 checksum:', ctext(crypto.md5_checksum(text), 'lightcyan'))
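The checksum at the end hashes a deterministic serialization of the whole index mapping, so two runs can be compared at a glance. The same idea with the standard library instead of odin's crypto helper:

import hashlib

ids = {'utt_b': (10, 20), 'utt_a': (0, 10)}  # made-up indices
text = ''.join(name + str(ids[name]) for name in sorted(ids))
print(hashlib.md5(text.encode('utf-8')).hexdigest())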
Example #14
def _parse_parameter(x, out_dim,
                     name, info,
                     print_log, padding,
                     **kwargs):
  # ====== parsing distribution ====== #
  if isinstance(info, DistributionDescription):
    if print_log:
      print(padding + "   Info:", 'DistributionDescription')
    y = info.set_print_log(print_log).set_padding_log('    ')(
        x, out_dim, **kwargs)
  # ====== parsing network ====== #
  elif isinstance(info, dict):
    if print_log:
      print(padding + "   Info:", str(info))
    y = _network(x, out_dim, name, info, print_log, padding)
    y = _support(y, info, print_log, padding, **kwargs)
    y = _activate(y, info, print_log, padding, **kwargs)
  # ====== just tensor ====== #
  else:
    if print_log:
      print(padding + "   Info:", 'Tensor')
    y = tf.convert_to_tensor(info)
  if print_log:
    print(padding + "   Output:", ctext(y, 'cyan'))
  return y
Example #16
File: helpers.py Project: imito/odin
def validate_features_dataset(output_dataset_path, ds_validation_path):
  ds = F.Dataset(output_dataset_path, read_only=True)
  print(ds)

  features = {}
  for key, val in ds.items():
    if 'indices_' in key:
      name = key.split('_')[-1]
      features[name] = (val, ds[name])

  all_indices = [val[0] for val in features.values()]
  # ====== sampling 250 files ====== #
  all_files = sampling_iter(it=all_indices[0].keys(), k=250,
                            seed=Config.SUPER_SEED)
  all_files = [f for f in all_files
               if all(f in ids for ids in all_indices)]
  print("#Samples:", ctext(len(all_files), 'cyan'))

  # ====== ignore the 20-figures warning ====== #
  with catch_warnings_ignore(RuntimeWarning):
    for file_name in all_files:
      X = {}
      for feat_name, (ids, data) in features.items():
        start, end = ids[file_name]
        X[feat_name] = data[start:end][:].astype('float32')
      V.plot_multiple_features(features=X, fig_width=20,
            title='[%s]%s' % (ds['dsname'][file_name], file_name))

  V.plot_save(ds_validation_path, dpi=12)
Example #17
File: trainer.py Project: imito/odin
 def _rollback(self, is_final=False):
   # TODO: update rollback mechanism
   if not self._allow_rollback and not is_final:
     return
   # trigger event for callbacks
   self._callback.event(TrainSignal.ROLLBACK)
   # default rollback procedure
   if self._save_path is not None and os.path.exists(self._save_path):
     self._show_noti("[%s] Rollback from: %s" %
                     (ctext('MainLoop', 'red'), self._save_path))
     # restore previous checkpoint immediately
     N.deserialize(self._save_path, force_restore_vars=True)
   # otherwise, load stored variables from RAM
   elif self._best_object is not None:
     self._show_noti("[%s] Rollback to the best stored object from RAM" %
                     (ctext('MainLoop', 'red')))
     N.deserialize(path_or_data=self._best_object,
                   force_restore_vars=True)
Example #18
File: dataset.py Project: imito/odin
 def __str__(self):
   padding = '  '
   # NOTE: each element in the list is one line
   s = ['==========  ' +
        ctext('Dataset:%s Total:%d Size:%.2f(MB)', 'magenta') %
        (self.path, len(self._data_map), self.size) +
        '  ==========']
   s += self._readme_info
   s += [ctext('DATA:', 'yellow'),
         '----']
   # ====== Find longest string ====== #
   longest_name = 0
   longest_shape = 0
   longest_dtype = 0
   longest_file = 0
   print_info = []
   for name, (dtype, shape, data, path) in sorted(self._data_map.items()):
     shape = data.shape if hasattr(data, 'shape') else shape
     longest_name = max(len(name), longest_name)
     longest_dtype = max(len(str(dtype)), longest_dtype)
     longest_shape = max(len(str(shape)), longest_shape)
     longest_file = max(len(str(path)), longest_file)
     print_info.append([name, dtype, shape, path])
   # ====== return print string ====== #
   format_str = (padding + '%-' + str(longest_name + 2) + 's  '
                 '%-' + str(longest_dtype) + 's' + ctext(':', 'yellow') +
                 '%-' + str(longest_shape) + 's  '
                 'path:%-' + str(longest_file) + 's')
   for name, dtype, shape, path in print_info:
     s.append(format_str % ('"%s"' % name, dtype, shape, path))
   # ====== add recipes info ====== #
   for name, recipe in self._saved_recipes.items():
     s.append(ctext('(Recipe) ', 'yellow') + '"%s"' % name)
     for rcp in recipe:
       rcp = str(rcp)
       s.append('\n'.join([padding + line
                           for line in rcp.split('\n')]))
   # ====== add indices info ====== #
   for name, index in self._saved_indices.items():
     s.append(ctext('(Index) ', 'yellow') + '"%s"' % name)
     s.append(padding + str(index))
    name, (start, end) = next(iter(index.items()))  # dict views are not iterators
     s.append(padding + 'Sample: "%s %d-%d"' % (name, start, end))
   return '\n'.join(s)
Example #19
File: utils.py Project: imito/odin
def get_model_path(system_name, args):
  """Return: exp_dir, model_path, log_path, train_path, test_path"""
  name = '_'.join([str(system_name).lower(), args.recipe, args.feat])
  if 'l' in args:
    name += '_' + str(int(args.l))
  if 'nmix' in args:
    name += '_' + str(int(args.nmix))
  if 'tdim' in args:
    name += '_' + str(int(args.tdim))
  save_path = os.path.join(PATH_EXP, name)
  if not os.path.exists(save_path):
    os.mkdir(save_path)
  # ====== return path ====== #
  log_path = os.path.join(save_path, 'log.txt')
  model_path = os.path.join(save_path, 'model.ai')
  train_path = os.path.join(save_path, 'train.dat')
  test_path = os.path.join(save_path, 'test.dat')
  print("Model path:", ctext(model_path, 'cyan'))
  print("Log path:", ctext(log_path, 'cyan'))
  return save_path, model_path, log_path, train_path, test_path
Example #20
File: utils.py Project: imito/odin
def get_exp_path(system_name, args, override=False):
  """ Return: exp_dir, model_path, log_path """
  exp_dir = get_exppath(tag='TIDIGITS_%s_%s_%s' %
    (system_name, args.task, args.feat))
  if 'nmix' in args:
    exp_dir += '_%d' % args.nmix
  if 'tdim' in args:
    exp_dir += '_%d' % args.tdim
  # ====== check override ====== #
  if bool(override) and os.path.exists(exp_dir):
    shutil.rmtree(exp_dir)
  if not os.path.exists(exp_dir):
    os.mkdir(exp_dir)
  # ====== basic paths ====== #
  model_path = os.path.join(exp_dir, 'model.ai')
  log_path = os.path.join(exp_dir,
                         'log_%s.txt' % get_formatted_datetime(only_number=True))
  print("Exp dir:", ctext(exp_dir, 'cyan'))
  print("Model path:", ctext(model_path, 'cyan'))
  print("Log path:", ctext(log_path, 'cyan'))
  return exp_dir, model_path, log_path
Example #22
def verify_dependencies():
  try:
    command = "SMILExtract -h"
    output = subprocess.check_output(command, shell=True,
                                     stderr=subprocess.STDOUT)
  except subprocess.CalledProcessError:
    raise Exception("Can't find SMILExtract executable")
  else:
    m = re.search('openSMILE version (.*)', str(output, 'utf-8'),
                  re.MULTILINE)
    if m:
      opensmile_version = m.group(1)
      print('Found openSMILE:', ctext(opensmile_version, 'magenta'))
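A lighter probe for the same dependency, using shutil.which instead of spawning a shell (a sketch, not odin's code; it checks existence only, not the version):

import shutil

if shutil.which('SMILExtract') is None:
  raise Exception("Can't find SMILExtract executable")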
Example #24
File: feeder.py Project: imito/odin
 def __str__(self):
   padding = '   '
   s = '<%s: #keys:%d #iter:%d #CPU:%s #Buffer:%d #HWM:%d mode:"%s">\n' % \
       (ctext('Feeder', 'cyan'), len(self.indices_keys),
           len(self._running_iter), self.ncpu, self.buffer_size,
           self.hwm, self._batch_mode)
   # ====== Shape and dtype ====== #
    shape = self.shape  # this is always a list of shapes
   s += padding + ctext("Shape: ", 'magenta') + \
       ', '.join((str(s) for s in shape)) + '\n'
   s += padding + ctext("Dtype: ", 'magenta') + \
       ', '.join((str(dt) for dt in self.dtype)) + '\n'
   # ====== print recipes ====== #
   s += padding + ctext('Recipes:', 'magenta') + '\n'
   for recipe in self._recipes:
     s += '\n'.join(['\t' + i for i in str(recipe).split('\n')])
     s += '\n'
   # ====== print data descriptor ====== #
   s += padding + ctext('Descriptor:', 'magenta') + '\n'
   for desc in self._data:
     s += '\n'.join(['\t' + i for i in str(desc).split('\n')])
     s += '\n'
   return s[:-1]
Example #27
File: stats.py Project: imito/odin
def freqcount(x, key=None, count=1, normalize=False, sort=False,
              pretty_return=False):
  """ x: list, iterable

  Parameters
  ----------
  key: callable
      extract the key from each item in the list
  count: callable or int
      extract the count from each item in the list
  normalize: bool
      if True, normalize all values between 0. and 1.
      (so they sum up to 1. in total)
  sort: bool
      if True, sort the result in ascending order of frequency
  pretty_return: bool
      if True, return a pretty formatted string instead

  Return
  ------
  dict: x(obj) -> freq(int)
  if `pretty_return` is True, a pretty formatted string is returned instead.
  """
  freq = defaultdict(int)
  if key is None:
    key = lambda x: x
  if count is None:
    count = 1
  if isinstance(count, Number):
    _ = int(count)
    count = lambda x: _
  for i in x:
    c = count(i)
    i = key(i)
    freq[i] += c
  # always return the same order
  s = float(sum(v for v in freq.values()))
  freq = OrderedDict([(k, freq[k] / s if normalize else freq[k])
                      for k in sorted(freq.keys())])
  # check sort
  if sort:
    freq = OrderedDict(sorted(freq.items(), key=lambda x: x[1]))
  # check pretty return
  if pretty_return:
    s = ''
    for name, value in freq.items():
      s += ' %s: %d\n' % (ctext(name, 'yellow'), value)
    return s
  return freq
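Assumed usage of freqcount above (it needs its own imports: defaultdict, OrderedDict, Number, plus ctext for pretty_return):

words = ['a', 'b', 'a', 'c', 'a', 'b']
print(freqcount(words))                  # a:3, b:2, c:1 with keys sorted
print(freqcount(words, normalize=True))  # a:0.5, b:0.33.., c:0.16.., sums to 1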
Example #28
def _save_data_to_path(preprocessed_path, X, y, gene_names, label_names,
                       cell_names, verbose):
    # save data
    if verbose:
        print("Saving data to %s ..." % ctext(preprocessed_path, 'cyan'))
    with open(os.path.join(preprocessed_path, 'X'), 'wb') as f:
        pickle.dump(X, f)
    with open(os.path.join(preprocessed_path, 'y'), 'wb') as f:
        pickle.dump(y, f)
    # save the meta info
    with open(os.path.join(preprocessed_path, 'X_row'), 'wb') as f:
        pickle.dump(cell_names, f)
    with open(os.path.join(preprocessed_path, 'X_col'), 'wb') as f:
        pickle.dump(gene_names, f)
    with open(os.path.join(preprocessed_path, 'y_col'), 'wb') as f:
        pickle.dump(label_names, f)
Example #29
    def plot_learning_curves(self):
        start_time = time.time()

        fig = plt.figure(figsize=(20, len(self) * 4))
        n_metrics = 5

        for row_idx, pos in enumerate(self):
            row_idx = row_idx * n_metrics
            train = _extract_metrics(pos.train_history)
            valid = _extract_metrics(pos.valid_history)

            for col_idx, (name, i, j) in enumerate(
                    zip(['loss', 'LLK_x', 'LLK_y', 'KLqp_x', 'KLqp_y'], train,
                        valid)):
                col_idx += 1
                plt.subplot(len(self), n_metrics, row_idx + col_idx)

                if col_idx == 1:
                    plt.title(pos.short_id_lines,
                              fontsize=8,
                              fontstyle='italic')
                else:
                    plt.title(name)

                if i is None or j is None:
                    plt.plot(0, 0)
                    plt.xticks(())
                    plt.yticks(())
                else:
                    plt.plot(i,
                             linewidth=2.5,
                             label='train:%.2f' %
                             (np.max(i) if 'LLK' in name else np.min(i)),
                             linestyle='-')
                    plt.plot(j,
                             linewidth=2.5,
                             label='valid:%.2f' %
                             (np.max(j) if 'LLK' in name else np.min(j)),
                             linestyle='--',
                             alpha=0.8)
                    plt.legend()

        plt.tight_layout()
        self.add_figure('learning_curves', fig)
        return self._log('plot_learning_curves %s(s)' %
                         ctext(time.time() - start_time, 'lightyellow'))
Example #30
def summary(x, axis=None, shorten=False, float_precision=2):
    """ Return string of statistical summary given series `x`
    {#:%s|mi:%s|q1:%s|md:%s|mn:%s|q3:%s|ma:%s|sd:%s}
  """
    if isinstance(x, Iterator):
        x = list(x)
    if isinstance(x, (tuple, list, set)):
        x = np.array(x)
    mean, std = np.mean(x, axis=axis), np.std(x, axis=axis)
    median = np.median(x, axis=axis)
    qu1, qu3 = np.percentile(x, [25, 75], axis=axis)
    min_, max_ = np.min(x, axis=axis), np.max(x, axis=axis)
    s = ""
    fmt = '%.' + str(int(float_precision)) + 'f'
    if not shorten:
        x = x.ravel()
        samples = ', '.join([
            str(i) for i in np.random.choice(
                x, size=min(8, len(x)), replace=False).tolist()
        ])
        s += "***** Summary *****\n"
        s += "    Min : %s\n" % (fmt % min_)
        s += "1st Qu. : %s\n" % (fmt % qu1)
        s += " Median : %s\n" % (fmt % median)
        s += "   Mean : %s\n" % (fmt % mean)
        s += "3rd Qu. : %s\n" % (fmt % qu3)
        s += "    Max : %s\n" % (fmt % max_)
        s += "-------------------\n"
        s += "    Std : %s\n" % (fmt % std)
        s += "#Samples: %d\n" % len(x)
        s += "Samples : %s\n" % samples
        s += "Sparsity: %s\n" % (fmt % sparsity_percentage(x))
    else:
        s += "{#:%s|mi:%s|q1:%s|md:%s|mn:%s|q3:%s|ma:%s|sd:%s}" %\
        (ctext(len(x), 'cyan'),
         ctext(fmt % min_, 'cyan'),
         ctext(fmt % qu1, 'cyan'),
         ctext(fmt % median, 'cyan'),
         ctext(fmt % mean, 'cyan'),
         ctext(fmt % qu3, 'cyan'),
         ctext(fmt % max_, 'cyan'),
         ctext(fmt % std, 'cyan'))
    return s
Example #31
 def __str__(self):
     s = ''
     s += ctext("<Ivector ", 'yellow')
     s += "GMM:%s " % self.is_gmm_fitted
     s += "Tmat:%s\n" % self.is_tmat_fitted
     if os.path.exists(self.path) and len(os.listdir(self.path)) > 0:
         # list all model files
         s += "  %s: " % ctext('model', 'cyan')
         s += ', '.join([
             '"%s"' % f for f in sorted(os.listdir(self.path))
             if 'zstat' not in f and 'fstat' not in f and 'ivec' not in f
             and 'name_' not in f
         ])
         s += '\n'
         # list all Zero-stats files
         s += "  %s: " % ctext('Z-stats', 'cyan')
         s += ', '.join([
             '"%s"' % f for f in sorted(os.listdir(self.path))
             if 'zstat' in f
         ])
         s += '\n'
         # list all First-stats files
         s += "  %s: " % ctext('F-stats', 'cyan')
         s += ', '.join([
             '"%s"' % f for f in sorted(os.listdir(self.path))
             if 'fstat' in f
         ])
         s += '\n'
         # list all Ivec-stats files
         s += "  %s: " % ctext('ivec', 'cyan')
         s += ', '.join([
             '"%s"' % f for f in sorted(os.listdir(self.path))
             if 'ivec' in f
         ])
         s += '\n'
         # list all Name path files
         s += "  %s: " % ctext('name-list', 'cyan')
         s += ', '.join([
             '"%s"' % f for f in sorted(os.listdir(self.path))
             if 'name_' in f
         ])
         s += '\n'
     # list all attributes
     for k, v in sorted(self.__dict__.items(), key=lambda x: x[0]):
         if is_primitive(v, inc_ndarray=False):
             s += "  %s: %s\n" % (ctext(k, 'cyan'), str(v))
     s = s[:-1] + '>'
     return s
Example #32
 def _set_path(self, path, read_only):
     MAXIMUM_README_LINE = 25
     # all files are opened with default_mode=r+
     self._data_map = OrderedDict()
     self._path = os.path.abspath(path)
     self._default_hdf5 = os.path.basename(self._path) + '_default.h5'
      # saved feeder info
     self._saved_indices = {}
     self._saved_recipes = {}
     # just make new dir
     if not os.path.exists(path):
         os.mkdir(path)
         os.mkdir(self.recipe_path)
         os.mkdir(self.index_path)
          return  # nothing more to do
     elif not os.path.isdir(path):
         raise ValueError('Dataset path must be a folder.')
     # ====== Load all Data ====== #
     files = os.listdir(path)
     for fname in files:
         # found README
         if 'readme' == fname[:6].lower():
             readme_path = os.path.join(path, fname)
             with open(readme_path, 'r') as readme_file:
                 readme = readme_file.readlines()[:MAXIMUM_README_LINE]
                 readme = [
                     '  ' + i[:-1] for i in readme
                     if len(i) > 0 and i != '\n'
                 ]
                 readme.append(' => For more information: ' + readme_path)
                 self._readme_info = [ctext('README:', 'yellow'), '------'
                                      ] + readme
                 self._readme_path = readme_path
         # parse data
         data = _parse_data_descriptor(os.path.join(path, fname), read_only)
         if data is None:
             continue
         for key, d in data:
             if key in self._data_map:
                 raise ValueError('Found duplicated data with follow info: '
                                  '{}'.format(key))
             else:
                 self._data_map[key] = d
Example #33
    def _bar_box_line(self,
                      title,
                      ylabel,
                      get_score,
                      model_id,
                      ax,
                      ignore=[],
                      using_bar=True):
        start_time = time.time()
        assert callable(model_id), "model_id must be callable"
        assert callable(get_score)

        data = []
        for pos in self.posteriors:
            name = model_id(pos.infer)
            train, test = get_score(pos)

            for i in ignore:
                del train[i]
                del test[i]

            for i, j in train.items():
                data.append({'Model': name, ylabel: j, 'Data': 'train'})
            for i, j in test.items():
                data.append({'Model': name, ylabel: j, 'Data': 'test'})
        df = pd.DataFrame(data)

        ax = to_axis2D(ax)
        # Bar plot
        if using_bar:
            sns.barplot(x='Model', y=ylabel, hue='Data', data=df, ax=ax)
        # Box plot
        else:
            sns.boxplot(x='Model', y=ylabel, hue='Data', data=df, ax=ax)

        ax.grid(axis='y', linewidth=1.2, alpha=0.5)
        ax.set_axisbelow(True)

        self.add_figure(title, ax.get_figure())
        return self._log(
            '%s %s(s)' %
            (title, ctext(time.time() - start_time, 'lightyellow')))
Example #34
File: stats.py Project: imito/odin
def summary(x, axis=None, shorten=False):
  """ Return string of statistical summary given series `x`
    {#:%s|mi:%s|q1:%s|md:%s|mn:%s|q3:%s|ma:%s|sd:%s}
  """
  if isinstance(x, Iterator):
    x = list(x)
  if isinstance(x, (tuple, list, set)):
    x = np.array(x)
  mean, std = np.mean(x, axis=axis), np.std(x, axis=axis)
  median = np.median(x, axis=axis)
  qu1, qu3 = np.percentile(x, [25, 75], axis=axis)
  min_, max_ = np.min(x, axis=axis), np.max(x, axis=axis)
  s = ""
  if not shorten:
    x = x.ravel()
    samples = ', '.join([str(i)
           for i in np.random.choice(x, size=min(8, len(x)), replace=False).tolist()])
    s += "***** Summary *****\n"
    s += "    Min : %s\n" % str(min_)
    s += "1st Qu. : %s\n" % str(qu1)
    s += " Median : %s\n" % str(median)
    s += "   Mean : %g\n" % mean
    s += "3rd Qu. : %s\n" % str(qu3)
    s += "    Max : %s\n" % str(max_)
    s += "-------------------\n"
    s += "    Std : %g\n" % std
    s += "#Samples: %d\n" % len(x)
    s += "Samples : %s\n" % samples
  else:
    s += "{#:%s|mi:%s|q1:%s|md:%s|mn:%s|q3:%s|ma:%s|sd:%s}" %\
    (ctext(len(x), 'cyan'),
     ctext('%g' % min_, 'cyan'),
     ctext('%g' % qu1, 'cyan'),
     ctext('%g' % median, 'cyan'),
     ctext('%g' % mean, 'cyan'),
     ctext('%g' % qu3, 'cyan'),
     ctext('%g' % max_, 'cyan'),
     ctext('%g' % std, 'cyan'))
  return s
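A usage sketch for summary() above, assuming numpy, ctext, and `from collections.abc import Iterator` are available as in the module:

import numpy as np

x = np.random.RandomState(1).randn(500)
print(summary(x))                # multi-line five-number summary
print(summary(x, shorten=True))  # one-line digest {#:500|mi:...|...|sd:...}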
Example #35
File: dataset.py Project: imito/odin
 def __init__(self, path, read_only=False, override=False):
   path = os.path.abspath(path)
   self.read_only = read_only
   self._readme_info = [ctext('README:', 'yellow'),
                        '------',
                        '  No information!']
   self._readme_path = None
   # flag to check cPickle called with protocol 2
   self._new_args_called = False
   # parse all data from path
   if path is not None:
     if override and os.path.exists(path) and os.path.isdir(path):
       shutil.rmtree(path)
        print('Overrode old dataset at path:', path)
     if os.path.isfile(path) and '.zip' in os.path.basename(path):
       self._load_archive(path,
           extract_path=path.replace(os.path.basename(path), ''))
     else:
       self._set_path(path, self.read_only)
   else:
     raise ValueError('Invalid path for Dataset: %s' % path)
Example #36
def downsample_data(*X):
    y = [None] * len(X)
    _ = list(set(x.shape[0] for x in X if x is not None))
    assert len(_) == 1, "Inconsistent shape[0] for X and y"
    num_samples = _[0]
    _RAND = np.random.RandomState(seed=87654321)
    # ====== Downsample if the data is huge ====== #
    if num_samples > 8000:
        print("[Warning] Given: %s; downsample to 8000 samples" %
              ctext(', '.join([str(x.shape)
                               for x in X if x is not None]), 'cyan'))
        ids = _RAND.choice(a=np.arange(0, num_samples),
                           size=8000,
                           replace=False)
        for i, x in enumerate(X):
            if x is not None:
                x = x[ids]
            y[i] = x
    else:
        y = X
    return tuple(y)
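A usage sketch for downsample_data; the fixed seed inside makes the kept row indices reproducible, and all non-None arrays are subsampled with the same indices:

import numpy as np

X = np.random.randn(10000, 5)
y = np.arange(10000)
X_small, y_small = downsample_data(X, y)
print(X_small.shape, y_small.shape)  # (8000, 5) (8000,)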
Example #37
File: ivector.py Project: imito/odin
 def __str__(self):
   s = ''
   s += ctext("<Ivector ", 'yellow')
   s += "GMM:%s " % self.is_gmm_fitted
   s += "Tmat:%s\n" % self.is_tmat_fitted
   if os.path.exists(self.path) and len(os.listdir(self.path)) > 0:
     # list all model files
     s += "  %s: " % ctext('model', 'cyan')
     s += ', '.join(['"%s"' % f
                     for f in sorted(os.listdir(self.path))
                     if 'zstat' not in f and 'fstat' not in f and
                     'ivec' not in f and 'name_' not in f])
     s += '\n'
     # list all Zero-stats files
     s += "  %s: " % ctext('Z-stats', 'cyan')
     s += ', '.join(['"%s"' % f
                     for f in sorted(os.listdir(self.path))
                     if 'zstat' in f])
     s += '\n'
     # list all First-stats files
     s += "  %s: " % ctext('F-stats', 'cyan')
     s += ', '.join(['"%s"' % f
                     for f in sorted(os.listdir(self.path))
                     if 'fstat' in f])
     s += '\n'
     # list all Ivec-stats files
     s += "  %s: " % ctext('ivec', 'cyan')
     s += ', '.join(['"%s"' % f
                     for f in sorted(os.listdir(self.path))
                     if 'ivec' in f])
     s += '\n'
     # list all Name path files
     s += "  %s: " % ctext('name-list', 'cyan')
     s += ', '.join(['"%s"' % f
                     for f in sorted(os.listdir(self.path))
                     if 'name_' in f])
     s += '\n'
   # list all attributes
   for k, v in sorted(self.__dict__.items(), key=lambda x: x[0]):
     if is_primitives(v, inc_ndarray=False):
       s += "  %s: %s\n" % (ctext(k, 'cyan'), str(v))
   s = s[:-1] + '>'
   return s
Example #40
File: processor.py Project: imito/odin
 def __str__(self):
   s = ctext('============= FeatureProcessor: %s =============' % self.path, 'yellow') + '\n'
   padding = '  '
   # ====== basic info ====== #
   s += '- Jobs: ' + ctext(len(self.jobs), 'cyan') + '\n'
   s += '- #CPU: ' + ctext(self.n_cpu, 'cyan') + '\n'
   s += '- #Cache: ' + ctext(self.n_cache, 'cyan') + '\n'
   # ====== print pipeline ====== #
   s += ctext("* Pipeline:", 'yellow') + '\n'
   for _, extractor in self.extractor.steps:
     for line in str(extractor).split('\n'):
       s += padding + ' ' + line + '\n'
   # ====== print config ====== #
   s += ctext("* Configurations:", 'yellow') + '\n'
   for i, j in self.config.items():
     s += padding + str(i) + ' : ' + str(j) + '\n'
   return s
Example #42
def _activate(y, info, print_log, padding, **kwargs):
  fn = info.get('fn', lambda x: x)
  args = []
  args_name = []
  for p_name, p_val in inspect.signature(fn).parameters.items():
    if p_name in kwargs:
      p_val = kwargs[p_name]
    else:
      p_val = p_val.default
    args.append(p_val)
    args_name.append(p_name)
  args[0] = y

  for a_name, a_val in zip(args_name, args):
    if a_val == inspect.Parameter.empty:
      raise RuntimeError("Cannot extract value for argument name: '%s'" % a_name)
  # print out log
  if print_log:
    print(padding + "   activation: <%s>(%s)" % (
        fn.__name__,
        '; '.join([ctext(i, 'cyan') + ':' + str(j)
                 for i, j in zip(args_name, args)])))
  y = fn(*args)
  return y
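The loop above binds arguments from fn's signature: defaults first, kwargs overrides, then the first positional slot is replaced by the incoming value. The trick in isolation (bind_args is a made-up name):

import inspect

def bind_args(fn, first, **kwargs):
  names, values = [], []
  for p_name, p in inspect.signature(fn).parameters.items():
    values.append(kwargs.get(p_name, p.default))
    names.append(p_name)
  values[0] = first  # the first argument always receives the input
  for n, v in zip(names, values):
    if v is inspect.Parameter.empty:
      raise RuntimeError("Cannot extract value for argument name: '%s'" % n)
  return fn(*values)

print(bind_args(lambda x, alpha=0.5: x * alpha, 4.0, alpha=0.25))  # 1.0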
Example #44
  ids = np.random.permutation(len(X_train))
  X_train, y_train = X_train[ids], y_train[ids]

  X_valid, y_valid = X_train[40000:], y_train[40000:]
  X_train, y_train = X_train[:40000], y_train[:40000]
  # normalize value to [0, 1]
  X_train = X_train / 255.
  X_valid = X_valid / 255.
  X_test = X_test / 255.
print(ds)
# ====== others ====== #
X_samples, y_samples = X_train[:25], y_train[:25]
input_shape = ds['X_train'].shape
input_ndim = len(input_shape)
print("Train shape:", ctext(X_train.shape, 'cyan'))
print("Valid shape:", ctext(X_valid.shape, 'cyan'))
print("Test  shape:", ctext(X_test.shape, 'cyan'))
# ====== create basic tensor ====== #
X = K.placeholder(shape=(None,) + input_shape[1:], name='X_input')
y = K.placeholder(shape=(None,), name='y_input')
# ===========================================================================
# Create the network
# ===========================================================================
LATENT_DROPOUT = 0.3
if args.cnn:
  with N.args_scope(([N.Conv, N.Dense], dict(b_init=None, activation=K.linear)),
                    (N.BatchNorm, dict(activation=tf.nn.elu)),
                    (N.Pool, dict(mode='max', pool_size=2))):
    f_encoder = N.Sequence([
        N.Dropout(level=0.5),
Example #45
 def event(self, event_type):
   print(ctext("[Debug] Event:", 'cyan'), event_type)
Example #46
 def task_end(self, task, task_results):
   print(ctext("Task End:", 'cyan'),
       task.name, task.curr_epoch, task.curr_samples,
       [(i, [(n, len(v), type(v[0])) for n, v in j.items()])
        for i, j in task_results.items()])
Example #47
 def task_start(self, task):
   print(ctext("Task Start:", 'cyan'),
         task.name, task.curr_epoch, task.curr_samples)
Example #48
 def epoch_end(self, task, epoch_results):
   print(ctext("Epoch End:", 'cyan'),
         task.name, task.curr_epoch, task.curr_samples,
         [(i, len(j), type(j[0])) for i, j in epoch_results.items()])
Example #49
from odin import nnet as N, backend as K
from odin import visual as V
from odin.utils import (ctext, mpi, Progbar, catch_warnings_ignore, stdio,
                        get_logpath)

from helpers import (FEATURE_RECIPE, FEATURE_NAME, PATH_ACOUSTIC_FEATURES,
                     MINIMUM_UTT_DURATION, ANALYSIS_DIR, Config,
                     filter_utterances, prepare_dnn_data)

# ====== prepare log ====== #
stdio(
    get_logpath(name="analyze_data.log",
                increasing=True,
                odin_base=False,
                root=ANALYSIS_DIR))
print(ctext(FEATURE_RECIPE, 'lightyellow'))
print(ctext(FEATURE_NAME, 'lightyellow'))
assert os.path.isdir(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE))
# ====== essential path ====== #
figure_path = os.path.join(
    ANALYSIS_DIR,
    '%s_%s.pdf' % (FEATURE_RECIPE.replace('_', ''), FEATURE_NAME))
print(ctext(figure_path, 'lightyellow'))
# ===========================================================================
# Load the data
# ===========================================================================
ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEATURES, FEATURE_RECIPE),
               read_only=True)
X = ds[FEATURE_NAME]
# remove all noise data
indices = {
Example #50
 def send_notification(self, msg):
   if self._log:
     add_notification(
         '[%s] %s' % (ctext(self.__class__.__name__, 'magenta'), msg))
   return self
Example #51
def prepare_dnn_data(recipe, feat, utt_length, seed=87654321):
    """
  Return
  ------
  train_feeder : Feeder for training
  valid_feeder : Feeder for validating
  test_ids : Test indices
  test_dat : Data array
  all_speakers : list of all speakers in the training set
  """
    # Load dataset
    frame_length = int(utt_length / FRAME_SHIFT)
    ds = F.Dataset(os.path.join(PATH_ACOUSTIC_FEAT, recipe), read_only=True)
    X = ds[feat]
    train_indices = {name: ds['indices'][name] for name in TRAIN_DATA.keys()}
    test_indices = {
        name: start_end
        for name, start_end in ds['indices'].items() if name not in TRAIN_DATA
    }
    train_indices, valid_indices = train_valid_test_split(x=list(
        train_indices.items()),
                                                          train=0.9,
                                                          inc_test=False,
                                                          seed=seed)
    all_speakers = sorted(set(TRAIN_DATA.values()))
    n_speakers = max(all_speakers) + 1
    print("#Train files:", ctext(len(train_indices), 'cyan'))
    print("#Valid files:", ctext(len(valid_indices), 'cyan'))
    print("#Test files:", ctext(len(test_indices), 'cyan'))
    print("#Speakers:", ctext(n_speakers, 'cyan'))
    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post',
                             data_idx=0),
        F.recipes.Name2Label(lambda name: TRAIN_DATA[name], ref_idx=0),
        F.recipes.LabelOneHot(nb_classes=n_speakers, data_idx=1)
    ]
    train_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=train_indices),
                            batch_mode='batch',
                            ncpu=7,
                            buffer_size=12)
    valid_feeder = F.Feeder(data_desc=F.IndexedData(data=X,
                                                    indices=valid_indices),
                            batch_mode='batch',
                            ncpu=2,
                            buffer_size=4)
    train_feeder.set_recipes(recipes)
    valid_feeder.set_recipes(recipes)
    print(train_feeder)
    # ====== cache the test data ====== #
    cache_dat = os.path.join(PATH_EXP,
                             'test_%s_%d.dat' % (feat, int(utt_length)))
    cache_ids = os.path.join(PATH_EXP,
                             'test_%s_%d.ids' % (feat, int(utt_length)))
    # validate cache files
    if os.path.exists(cache_ids):
        with open(cache_ids, 'rb') as f:
            ids = pickle.load(f)
        if len(ids) != len(test_indices):
            os.remove(cache_ids)
            if os.path.exists(cache_dat):
                os.remove(cache_dat)
    elif os.path.exists(cache_dat):
        os.remove(cache_dat)
    # caching
    if not os.path.exists(cache_dat):
        dat = F.MmapData(cache_dat,
                         dtype='float16',
                         shape=(0, frame_length, X.shape[1]))
        ids = {}
        prog = Progbar(target=len(test_indices))
        s = 0
        for name, (start, end) in test_indices.items():
            y = X[start:end]
            y = segment_axis(y,
                             axis=0,
                             frame_length=frame_length,
                             step_length=frame_length,
                             end='pad',
                             pad_value=0,
                             pad_mode='post')
            dat.append(y)
            # update indices
            ids[name] = (s, s + len(y))
            s += len(y)
            # update progress
            prog.add(1)
        dat.flush()
        dat.close()
        with open(cache_ids, 'wb') as f:
            pickle.dump(ids, f)
    # ====== re-load ====== #
    dat = F.MmapData(cache_dat, read_only=True)
    with open(cache_ids, 'rb') as f:
        ids = pickle.load(f)
    # ====== save some sample ====== #
    sample_path = os.path.join(PATH_EXP,
                               'test_%s_%d.pdf' % (feat, int(utt_length)))
    V.plot_figure(nrow=9, ncol=6)
    for i, (name, (start, end)) in enumerate(
            sampling_iter(it=sorted(ids.items(), key=lambda x: x[0]),
                          k=12,
                          seed=87654321)):
        x = dat[start:end][:].astype('float32')
        ax = V.plot_spectrogram(x[np.random.randint(0, len(x))].T,
                                ax=(12, 1, i + 1),
                                title='')
        ax.set_title(name)
    V.plot_save(sample_path)
    return (train_feeder, valid_feeder, ids, dat, all_speakers)
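
A hedged usage sketch for the function above; its name and exact signature are truncated out of this snippet, so `make_data`, `feat='mspec'`, and `utt_length=3.` below are placeholders. A Feeder configured with these recipes is assumed to be iterable, yielding (X, y_onehot) numpy batches:

# `make_data` is a placeholder name for the truncated function above;
# the feat / utt_length values are assumptions, not from the source
train, valid, test_ids, test_dat, speakers = make_data(feat='mspec',
                                                       utt_length=3.)
for X_batch, y_batch in train:
    # X_batch: (batch, frame_length, n_features); y_batch: (batch, n_speakers)
    print(X_batch.shape, y_batch.shape)
    break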
Example #52
NUM_DIM = 3
colors = ['r', 'b', 'g']
markers = ["o", "^", "s"]
SEED = K.get_rng().randint(0, int(10e8))
# ===========================================================================
# Load dataset
# ===========================================================================
ds = F.IRIS.load()
print(ds)

nb_samples = ds['X'].shape[0]
ids = K.get_rng().permutation(nb_samples)
X = ds['X'][ids]
y = ds['y'][ids]
labels = ds['name']
print("Labels:", ctext(labels))
assert len(colors) == len(labels) and len(markers) == len(labels)

X_train = X[:int(TRAINING_PERCENT * nb_samples)]
y_train = y[:int(TRAINING_PERCENT * nb_samples)]
y_train_color = [colors[i] for i in y_train]
y_train_marker = [markers[i] for i in y_train]

X_score = X[int(TRAINING_PERCENT * nb_samples):]
y_score = y[int(TRAINING_PERCENT * nb_samples):]
y_score_color = [colors[i] for i in y_score]
y_score_marker = [markers[i] for i in y_score]

print("Train:", X_train.shape, y_train.shape)
print("Score:", X_score.shape, y_score.shape)
Example #53
def prepare_data(feat, label, utt_length=0.4, for_ivec=False):
    """

  Returns (i-vector)
  ------------------
  ds[feat]
  train_files
  y_train
  test_files
  y_test
  labels

  Returns (x-vector)
  ------------------
  train : Feeder
    feeder for training data for iterating over pair of (X, y)
  valid : Feeder
    feeder for validating data for iterating over pair of (X, y)
  X_test_name : list of file names
    file names are append with '.%d' for cut segment ID
  X_test_true : list of integer
    label of each sample
  X_test_data : array
    list of test data same length as X_test_name
  labels : list of string
    list of labels for classification task

  Example
  -------
  (train, valid,
   X_test_name, X_test_true, X_test_data,
   labels) = prepare_data_dnn(feat=FEAT, label='gender')

  """
    label = str(label).lower()
    assert label in _support_label, "No support for label: %s" % label
    assert 0 < utt_length <= 1.
    # ====== load dataset ====== #
    if not os.path.exists(PATH_ACOUSTIC):
        raise RuntimeError(
            "Cannot find extracted acoustic features at path: '%s', "
            "run speech_features_extraction.py first!" % PATH_ACOUSTIC)
    ds = F.Dataset(PATH_ACOUSTIC, read_only=True)
    assert feat in ds, "Cannot find feature with name: %s" % feat
    indices = list(ds['indices'].items())
    K.get_rng().shuffle(indices)

    # ====== helper ====== #
    def is_train(x):
        return x.split('_')[0] == 'train'

    def extract_label(x):
        return x.split('_')[_support_label[label]]

    print("Task:", ctext(label, 'cyan'))
    fn_label, labels = unique_labels([i[0] for i in indices],
                                     key_func=extract_label,
                                     return_labels=True)
    print("Labels:", ctext(labels, 'cyan'))
    # ====== training and test data ====== #
    train_files = []  # (name, (start, end)) ...
    test_files = []
    for name, (start, end) in indices:
        if is_train(name):
            train_files.append((name, (start, end)))
        else:
            test_files.append((name, (start, end)))
    # name for each dataset, useful for later
    print("#Train:", ctext(len(train_files), 'cyan'))
    print("#Test:", ctext(len(test_files), 'cyan'))
    # ====== for i-vectors ====== #
    y_train = np.array([fn_label(i[0]) for i in train_files])
    y_test = np.array([fn_label(i[0]) for i in test_files])
    if bool(for_ivec):
        return ds[feat], train_files, y_train, test_files, y_test, labels
    # ====== length ====== #
    length = [(end - start) for _, (start, end) in indices]
    max_length = max(length)
    frame_length = int(max_length * utt_length)
    step_length = frame_length
    print("Max length  :", ctext(max_length, 'yellow'))
    print("Frame length:", ctext(frame_length, 'yellow'))
    print("Step length :", ctext(step_length, 'yellow'))
    # ====== split dataset ====== #
    # split by speaker ID
    train_files, valid_files = train_valid_test_split(
        x=train_files,
        train=0.8,
        cluster_func=None,
        idfunc=lambda x: x[0].split('_')[4],  # split by speaker ID
        inc_test=False)
    print("#File train:", ctext(len(train_files), 'cyan'))
    print("#File valid:", ctext(len(valid_files), 'cyan'))
    print("#File test :", ctext(len(test_files), 'cyan'))

    recipes = [
        F.recipes.Sequencing(frame_length=frame_length,
                             step_length=step_length,
                             end='pad',
                             pad_mode='post',
                             pad_value=0),
        F.recipes.Name2Label(converter_func=fn_label),
        F.recipes.LabelOneHot(nb_classes=len(labels), data_idx=-1)
    ]
    feeder_train = F.Feeder(F.IndexedData(ds[feat], indices=train_files),
                            ncpu=6,
                            batch_mode='batch')
    feeder_valid = F.Feeder(F.IndexedData(ds[feat], indices=valid_files),
                            ncpu=4,
                            batch_mode='batch')
    feeder_test = F.Feeder(F.IndexedData(ds[feat], indices=test_files),
                           ncpu=4,
                           batch_mode='file')
    feeder_train.set_recipes(recipes)
    feeder_valid.set_recipes(recipes)
    feeder_test.set_recipes(recipes)
    print(feeder_train)

    # ====== process X_test, y_test in advance for faster evaluation ====== #
    @cache_disk
    def _extract_test_data(feat, label, utt_length):
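        # NOTE: @cache_disk (above) presumably memoizes the returned arrays on
        # disk keyed by (feat, label, utt_length), so this preprocessing pass
        # only runs once per configuration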
        prog = Progbar(target=len(feeder_test),
                       print_summary=True,
                       name="Preprocessing test set")
        X_test = defaultdict(list)
        for name, idx, X, y in feeder_test:
            # validate everything as expected
            assert fn_label(name) == np.argmax(y), name  # label is right
            # save to list
            X_test[name].append((idx, X))
            prog.add(X.shape[0])
        # ====== create 1 array for data and dictionary for indices ====== #
        X_test_name = []
        X_test_data = []
        for name, X in X_test.items():
            X = np.concatenate([x[1] for x in sorted(X, key=lambda i: i[0])],
                               axis=0).astype('float16')
            X_test_name += [name + '.%d' % i for i in range(len(X))]
            X_test_data.append(X)
        X_test_name = np.array(X_test_name)
        X_test_data = np.concatenate(X_test_data, axis=0)
        return X_test_name, X_test_data

    # load the cached test data (stored as float16; cast to float32 at use time)
    X_test_name, X_test_data = _extract_test_data(feat, label, utt_length)
    X_test_true = np.array([fn_label(i.split('.')[0]) for i in X_test_name])
    return (feeder_train, feeder_valid,
            X_test_name, X_test_true, X_test_data, labels)
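
A minimal consumption sketch for prepare_data, assuming only what the function above returns; `predict_fn` and the `feat='mspec'` value are placeholders, not part of the source:

import numpy as np

(train, valid,
 X_test_name, X_test_true, X_test_data,
 labels) = prepare_data(feat='mspec', label='gender')

def predict_fn(x):  # placeholder model: uniform zero scores
    return np.zeros((len(x), len(labels)))

# rows of X_test_data line up one-to-one with X_test_name / X_test_true, so
# segment-level predictions can be grouped back per utterance via the '.%d'
# suffix convention
y_prob = predict_fn(X_test_data.astype('float32'))
utt_names = [name.split('.')[0] for name in X_test_name]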
Example #54
File: tvec.py  Project: trungnt13/odin-ai
# ====== load data feeder ====== #
(train, valid, X_test_name, X_test_true, X_test_data,
 labels) = prepare_data(feat=args.feat, label=args.task)
n_classes = len(labels)
# ===========================================================================
# Create model
# ===========================================================================
inputs = [
    K.placeholder(shape=(None, ) + shape[1:],
                  dtype='float32',
                  name='input%d' % i)
    for i, shape in enumerate(as_tuple_of_shape(train.shape))
]
X = inputs[0]
y = inputs[1]
print("Inputs:", ctext(inputs, 'cyan'))
# ====== create the networks ====== #
with N.args_scope([('Conv', 'Dense'),
                   dict(b_init=None, activation=K.linear, pad='same')],
                  ['BatchNorm', dict(activation=K.relu)]):
    f = N.Sequence([
        N.Dimshuffle(pattern=(0, 1, 2, 'x')),
        N.Conv(num_filters=32, filter_size=(9, 7)),
        N.BatchNorm(),
        N.Pool(pool_size=(3, 2), strides=2),
        N.Conv(num_filters=64, filter_size=(5, 3)),
        N.BatchNorm(),
        N.Pool(pool_size=(3, 1), strides=(2, 1), name='PoolOutput1'),
        N.Conv(num_filters=64, filter_size=(5, 3)),
        N.BatchNorm(),
        N.Pool(pool_size=(3, 2), strides=(2, 2), name='PoolOutput2'),
Example #55
  ids = np.random.permutation(len(X_train))
  X_train, y_train = X_train[ids], y_train[ids]

  X_valid, y_valid = X_train[40000:], y_train[40000:]
  X_train, y_train = X_train[:40000], y_train[:40000]
  # normalize value to [0, 1]
  X_train = X_train / 255.
  X_valid = X_valid / 255.
  X_test = X_test / 255.
print(ds)
# ====== others ====== #
X_samples, y_samples = X_train[:25], y_train[:25]
input_shape = ds['X_train'].shape
input_ndim = len(input_shape)
print("Train shape:", ctext(X_train.shape, 'cyan'))
print("Valid shape:", ctext(X_valid.shape, 'cyan'))
print("Test  shape:", ctext(X_test.shape, 'cyan'))
# ====== create basic tensor ====== #
X = K.placeholder(shape=(None,) + input_shape[1:], name='X_input')
y = K.placeholder(shape=(None,), name='y_input')
# ===========================================================================
# Create the network
# ===========================================================================
LATENT_DROPOUT = 0.3
if args.cnn:
  with N.args_scope(([N.Conv, N.Dense], dict(b_init=None, activation=K.linear)),
                    (N.BatchNorm, dict(activation=tf.nn.elu)),
                    (N.Pool, dict(mode='max', pool_size=2))):
    f_encoder = N.Sequence([
        N.Dropout(level=0.5),
Example #56
 def batch_end(self, task, batch_results):
   print(ctext("Batch End:", 'cyan'),
         task.name, task.curr_epoch, task.curr_samples,
         [(i.shape, i.dtype, type(i)) for i in as_tuple(batch_results)])
Example #57
def dataset_statistics(dsname):
    ids = {
        name: (start, end)
        for name, (start, end) in indices.items()
        if ds['dsname'][name] == dsname
    }
    name2spk = {name: ds['spkid'][name] for name in ids.keys()}
    s = []
    s.append('=' * 12 + ctext('%-12s' % dsname, 'lightyellow') + '=' * 12)
    s.append('#Files   :' + ctext(len(ids), 'cyan'))
    s.append("#Speakers:" + ctext(len(set(name2spk.values())), 'cyan'))
    # ====== mean and std ====== #
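    # one-pass accumulation of sum(x) and sum(x^2); std is recovered below via
    # the identity Var[x] = E[x^2] - E[x]^2, which is adequate here since the
    # sums are accumulated in float64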
    sum1 = 0.
    sum2 = 0.
    n = 0
    spk_sum1 = defaultdict(float)
    spk_sum2 = defaultdict(float)
    spk_n = defaultdict(int)
    for name, (start, end) in ids.items():
        spkid = name2spk[name]
        n += end - start
        spk_n[spkid] += end - start
        x = X[start:end][:].astype('float64')
        s1 = np.sum(x, axis=0)
        s2 = np.sum(x**2, axis=0)
        sum1 += s1
        sum2 += s2
        spk_sum1[spkid] += s1
        spk_sum2[spkid] += s2
    data_mean = sum1 / n
    data_std = np.sqrt(sum2 / n - data_mean**2)

    spk_stats = {}
    for spkid in name2spk.values():
        n = spk_n[spkid]
        s1, s2 = spk_sum1[spkid], spk_sum2[spkid]
        mean = s1 / n
        std = np.sqrt(s2 / n - mean**2)
        spk_stats[spkid] = (mean, std)
    spk_mean = np.concatenate([x[0][None, :] for x in spk_stats.values()],
                              axis=0).mean(0)
    spk_std = np.concatenate([x[1][None, :] for x in spk_stats.values()],
                             axis=0).mean(0)
    # ====== utterances length ====== #
    all_length = np.array([(end - start) * Config.STEP_LENGTH
                           for start, end in ids.values()])
    # ====== speaker - utterance relation ====== #
    nutt_per_spk = defaultdict(int)
    dur_per_spk = defaultdict(list)
    for name, (start, end) in ids.items():
        spkid = name2spk[name]
        nutt_per_spk[spkid] += 1
        dur_per_spk[spkid].append((end - start) * Config.STEP_LENGTH)
    all_spk = sorted(nutt_per_spk.keys())
    spk_df = pd.DataFrame(
        data={
            'nutt_per_spk': [nutt_per_spk[spk] for spk in all_spk],
            'sum_per_spk': [np.sum(dur_per_spk[spk]) for spk in all_spk],
            'mean_per_spk': [np.mean(dur_per_spk[spk]) for spk in all_spk],
        })
    return (dsname, '\n'.join(s),
            (all_length, spk_df),
            (data_mean, data_std),
            (spk_mean, spk_std))
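
A hedged driver sketch for dataset_statistics, assuming only the module-level globals the function itself reads (ds, indices, X, Config):

for dsname in sorted(set(ds['dsname'][name] for name in indices.keys())):
    (name, report,
     (all_length, spk_df),
     (data_mean, data_std),
     (spk_mean, spk_std)) = dataset_statistics(dsname)
    print(report)
    print("Mean utterance duration: %.2f (s)" % all_length.mean())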
Example #58
 def epoch_start(self, task, data):
   print(ctext("Epoch Start:", 'cyan'),
         task.name, task.curr_epoch, task.curr_samples,
         [(i.shape, i.dtype, type(i)) for i in data])