Example #1
 def embed(self, vocabulary, dtype='float32', token_not_found='ignore'):
     """Any word not found in the vocabulary will be set to all-zeros"""
     # ====== check vocab ======= #
     if not isinstance(vocabulary, Mapping):
         raise ValueError('"vocabulary" must be any instance of dict.')
     # ====== check token_not_found ====== #
     if not is_number(token_not_found) and \
     not is_string(token_not_found) and \
     token_not_found not in ('ignore', 'raise'):
         raise ValueError('token_not_found can be: "ignore", "raise", '
                          'an integer of token index, or a string '
                          'representing a token.')
     # convert token_not_found to an index if it is a number or a token string
     if is_number(token_not_found):
         token_not_found = int(token_not_found)
     elif token_not_found not in ('ignore', 'raise'):
         token_not_found = int(self.dictionary[token_not_found])
     # ====== create embedding matrix ====== #
     ndim = len(next(iter(vocabulary.values())))
     matrix = np.zeros(shape=(len(self.dictionary), ndim), dtype=dtype)
     for word, idx in self.dictionary.items():
         if len(word) == 0: continue
         if word in vocabulary:
             matrix[idx, :] = vocabulary[word]
         elif token_not_found == 'raise':
             raise Exception('Cannot find token "%s" in the vocabulary.' %
                             word)
         elif isinstance(token_not_found, int):
             matrix[idx, :] = matrix[token_not_found, :]
     return matrix
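The embedding construction above can be illustrated with a tiny standalone sketch (the `dictionary` and `vocab` dicts below are toy data, not part of odin):

import numpy as np

dictionary = {'cat': 0, 'dog': 1, 'fish': 2}          # word -> row index
vocab = {'cat': np.ones(4), 'dog': np.full(4, 2.0)}   # word -> pretrained vector

ndim = len(next(iter(vocab.values())))
matrix = np.zeros(shape=(len(dictionary), ndim), dtype='float32')
for word, idx in dictionary.items():
    if word in vocab:
        matrix[idx, :] = vocab[word]
# 'fish' is absent from vocab, so its row stays all-zeros (the default 'ignore' behaviour)
print(matrix.shape)   # (3, 4)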
Example #2
File: text.py Project: imito/odin
 def embed(self, vocabulary, dtype='float32',
           token_not_found='ignore'):
   """Any word not found in the vocabulary will be set to all-zeros"""
   # ====== check vocab ======= #
   if not isinstance(vocabulary, Mapping):
     raise ValueError('"vocabulary" must be any instance of dict.')
   # ====== check token_not_found ====== #
   if not is_number(token_not_found) and \
   not is_string(token_not_found) and \
   token_not_found not in ('ignore', 'raise'):
     raise ValueError('token_not_found can be: "ignore", "raise", '
                      'an integer of token index, or a string '
                      'representing a token.')
   # convert token_not_found to an index if it is a number or a token string
   if is_number(token_not_found):
     token_not_found = int(token_not_found)
   elif token_not_found not in ('ignore', 'raise'):
     token_not_found = int(self.dictionary[token_not_found])
   # ====== create embedding matrix ====== #
   ndim = len(next(iter(vocabulary.values())))
   matrix = np.zeros(shape=(len(self.dictionary), ndim), dtype=dtype)
   for word, idx in self.dictionary.items():
     if len(word) == 0: continue
     if word in vocabulary:
       matrix[idx, :] = vocabulary[word]
     elif token_not_found == 'raise':
       raise Exception('Cannot find token "%s" in the vocabulary.' % word)
     elif isinstance(token_not_found, int):
       matrix[idx, :] = matrix[token_not_found, :]
   return matrix
Example #3
def _check_shape(s):
  if hasattr(s, '__call__'):
    return s
  if is_number(s) or s is None:
    s = (s,)
  elif isinstance(s, np.ndarray):
    s = s.tolist()
  return tuple([int(i) if is_number(i) else None for i in s])
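A standalone way to exercise the same normalization (the `is_number` helper below is a simplified stand-in for odin's utility, an assumption made only for this sketch):

import numbers
import numpy as np

def is_number(x):   # stand-in for odin.utils.is_number (assumption)
    return isinstance(x, numbers.Number) and not isinstance(x, bool)

def _check_shape(s):
    if hasattr(s, '__call__'):
        return s
    if is_number(s) or s is None:
        s = (s,)
    elif isinstance(s, np.ndarray):
        s = s.tolist()
    return tuple([int(i) if is_number(i) else None for i in s])

print(_check_shape(8))                    # (8,)
print(_check_shape(None))                 # (None,)
print(_check_shape(np.array([3., 4.])))   # (3, 4)
print(_check_shape((None, 32, 32)))       # (None, 32, 32)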
Example #4
def _preprocessing_losses(losses,
                          y_true,
                          y_pred,
                          inherit_losses=None,
                          sample_weights=None):
    """ Can be used for both objectives and metrics """
    from odin import backend as K
    # ====== special case: one input/output but multiple losses ====== #
    nb_losses = len(losses)
    if len(y_true) == 0:
        y_true = [None] * nb_losses
    elif len(y_true) == 1:
        y_true = y_true * nb_losses
    if len(y_pred) == 0:
        y_pred = [None] * nb_losses
    elif len(y_pred) == 1:
        y_pred = y_pred * nb_losses
    # ====== applying ====== #
    cost = []
    for idx, fn in enumerate(as_tuple(losses)):
        weight = 1
        kwargs = {}
        # preprocess
        if isinstance(fn, (tuple, list)):
            if len(fn) == 1:
                fn = fn[0]
            else:
                weight = [i for i in fn if is_number(i)]
                weight = 1 if len(weight) == 0 else weight[0]
                kwargs = [i for i in fn if isinstance(i, Mapping)]
                kwargs = {} if len(kwargs) == 0 else kwargs[0]
                fn = [i for i in fn if i != weight and i != kwargs][0]
        # apply the loss
        if is_number(fn):
            if inherit_losses is None or fn >= len(inherit_losses):
                raise ValueError("Cannot find losses at index: '%d'" % fn)
            obj = inherit_losses[fn]
        elif K.is_tensor(fn):
            obj = fn
        elif hasattr(fn, '__call__'):
            try:
                sign = inspect.signature(fn)
                if 'weights' in sign.parameters and sample_weights is not None:
                    kwargs['weights'] = sample_weights
            except ValueError:
                pass
            finally:
                obj = fn(y_true[idx], y_pred[idx], **kwargs)
            if isinstance(obj, (tuple, list)):
                wprint(
                    "function: '%s' return %d outputs (%s), only pick the first one"
                    % (fn.__name__, len(obj), '; '.join([str(i)
                                                         for i in obj])))
                obj = obj[0]
        cost.append((weight, obj))
    # ====== reduce ====== #
    return [c if w == 1 else w * c for w, c in cost]
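Each entry of `losses` may be a plain callable, a tensor, an index into `inherit_losses`, or a tuple that mixes a callable with a numeric weight and a keyword dict in any order. A small standalone sketch of that tuple decomposition (the `mse` callable and all values are made up for illustration):

from collections.abc import Mapping
import numbers

def is_number(x):   # stand-in for odin's helper (assumption)
    return isinstance(x, numbers.Number) and not isinstance(x, bool)

def mse(y_true, y_pred, scale=1.0):   # toy loss function
    return scale * sum((a - b) ** 2 for a, b in zip(y_true, y_pred))

fn = (0.5, mse, {'scale': 2.0})       # weight, callable and kwargs in any order
weight = [i for i in fn if is_number(i)]
weight = 1 if len(weight) == 0 else weight[0]
kwargs = [i for i in fn if isinstance(i, Mapping)]
kwargs = {} if len(kwargs) == 0 else kwargs[0]
fn = [i for i in fn if i != weight and i != kwargs][0]

obj = fn([1., 2.], [1.5, 2.5], **kwargs)
print(weight * obj)   # 0.5 * 2.0 * (0.25 + 0.25) = 0.5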
Example #5
def _preprocessing_losses(losses, y_true, y_pred, inherit_losses=None,
                          sample_weights=None):
  """ Can be used for both objectives and metrics """
  from odin import backend as K
  # ====== special case: one input/output but multiple losses ====== #
  nb_losses = len(losses)
  if len(y_true) == 0:
    y_true = [None] * nb_losses
  elif len(y_true) == 1:
    y_true = y_true * nb_losses
  if len(y_pred) == 0:
    y_pred = [None] * nb_losses
  elif len(y_pred) == 1:
    y_pred = y_pred * nb_losses
  # ====== applying ====== #
  cost = []
  for idx, fn in enumerate(as_tuple(losses)):
    weight = 1
    kwargs = {}
    # preprocess
    if isinstance(fn, (tuple, list)):
      if len(fn) == 1:
        fn = fn[0]
      else:
        weight = [i for i in fn if is_number(i)]
        weight = 1 if len(weight) == 0 else weight[0]
        kwargs = [i for i in fn if isinstance(i, Mapping)]
        kwargs = {} if len(kwargs) == 0 else kwargs[0]
        fn = [i for i in fn if i != weight and i != kwargs][0]
    # apply the loss
    if is_number(fn):
      if inherit_losses is None or fn >= len(inherit_losses):
        raise ValueError("Cannot find losses at index: '%d'" % fn)
      obj = inherit_losses[fn]
    elif K.is_tensor(fn):
      obj = fn
    elif hasattr(fn, '__call__'):
      try:
        sign = inspect.signature(fn)
        if 'weights' in sign.parameters and sample_weights is not None:
          kwargs['weights'] = sample_weights
      except ValueError:
        pass
      finally:
        obj = fn(y_true[idx], y_pred[idx], **kwargs)
      if isinstance(obj, (tuple, list)):
        wprint("function: '%s' return %d outputs (%s), only pick the first one"
               % (fn.__name__,
                  len(obj),
                  '; '.join([str(i) for i in obj])))
        obj = obj[0]
    cost.append((weight, obj))
  # ====== reduce ====== #
  return [c if w == 1 else w * c for w, c in cost]
Example #6
 def _apply(self, X):
   axes = self.axes
   ndims = X.shape.ndims
   if is_string(axes) and axes.lower() == 'auto':
     if ndims == 3:
       axes = (1,)
     elif ndims == 4:
       axes = (1, 2)
     elif ndims == 5:
       axes = (1, 2, 3)
   X = K.upsample(X, scale=self.size, axes=axes, method=self.mode)
   # ====== check desire_shape ====== #
   desire_shape = self.desire_shape
   if desire_shape is not None:
     desire_shape = [None if i is None or i < 0 else int(i)
                     for i in desire_shape]
     # do padding if necessary
     paddings = [[0, 0] if i is None or o is None or i >= o else
                 [tf.cast(tf.ceil((o - i) / 2), 'int32'),
                  tf.cast(tf.floor((o - i) / 2), 'int32')]
                 for i, o in zip(X.shape.as_list(), desire_shape)]
     if not all(i == [0, 0] for i in paddings):
       X = tf.pad(X, paddings=paddings, mode='CONSTANT')
     # do slice if necessary
     slices = [slice(tf.cast(tf.floor((i - o) / 2), 'int32'),
                     tf.cast(-tf.ceil((i - o) / 2), 'int32'), None)
               if i is not None and o is not None and i > o else slice(None)
               for i, o in zip(X.shape.as_list(), desire_shape)]
      if any(s != slice(None) for s in slices):
       X = X[slices]
     K.set_shape(X, tuple([i if is_number(i) else None
                           for i in desire_shape]))
   return X
Example #7
File: feeder.py Project: imito/odin
 def shape(self):
   """ This is just an "UPPER" estimation, some data points might be lost
   during preprocessing each indices by recipes.
   """
   # ====== first time calculate the shape ====== #
   if self._cache_shape is None or self._recipes_changed:
     # for each Descriptor, create list of pairs: (name, length)
     shapes_indices = []
     for dat in self._data:
       indices = []
       length = 0
       for name in self.indices_keys:
         start, end = dat.indices[name]
         lng = end - start
         length += lng
         indices.append((name, lng))
        # modify shapes by estimated length from indices
       shapes = (dat.shape,) if is_number(dat.shape[0]) \
           else dat.shape
       # NOTE: the indices is copy for each shape (i.e. data),
       # hence, it will create some overhead in shape_transform
       for shp in [(length,) + shp[1:] for shp in shapes]:
         shapes_indices.append((shp, list(indices)))
     # Recipes shape_transform
     shapes = tuple([
         shp for shp, ids in self._recipes.shape_transform(shapes_indices)
     ])
     del shapes_indices
     self._cache_shape = tuple(shapes)
     self._recipes_changed = False
   # ====== get the cached shape ====== #
   if any(s[0] == 0 for s in self._cache_shape):
     raise RuntimeError("Feeder has `length=0` change the recipes to retain "
                        "minimum of `length>=1`, shape: %s" % str(self._cache_shape))
   return self._cache_shape
Example #8
 def __init__(self, slices, axis, data_idx=None):
   super(Slice, self).__init__()
   # ====== validate axis ====== #
   if not is_number(axis):
     raise ValueError('axis for Slice must be an integer.')
   self.axis = int(axis)
   # ====== validate indices ====== #
   if is_number(slices):
     slices = slice(int(slices), int(slices + 1))
   elif isinstance(slices, (tuple, list)):
     slices = [i if isinstance(i, slice) else slice(int(i), int(i + 1))
               for i in slices
               if isinstance(i, slice) or is_number(i)]
   elif not isinstance(slices, slice):
     raise ValueError('indices must be int, slice, or list of int and slice.')
   self.slices = slices
   # ====== validate target_data ====== #
   self.data_idx = data_idx
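A quick sketch of how the `slices` argument is normalized (plain Python; `is_number` is again a simplified stand-in for odin's helper, and `normalize_slices` is a name invented only for this sketch):

import numbers

def is_number(x):   # stand-in for odin's helper (assumption)
    return isinstance(x, numbers.Number) and not isinstance(x, bool)

def normalize_slices(slices):
    if is_number(slices):
        return slice(int(slices), int(slices + 1))
    if isinstance(slices, (tuple, list)):
        return [i if isinstance(i, slice) else slice(int(i), int(i + 1))
                for i in slices
                if isinstance(i, slice) or is_number(i)]
    return slices

print(normalize_slices(3))                 # slice(3, 4, None)
print(normalize_slices([0, slice(2, 5)]))  # [slice(0, 1, None), slice(2, 5, None)]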
Example #9
 def _get_index(self, name):
   index = self.idx[name]
   if self.threshold is None:
      pass
   elif hasattr(self.threshold, '__call__'):
     index = self.threshold(index)
   elif is_number(self.threshold):
     index = index >= float(self.threshold)
    if index.dtype != np.bool_:
     index = index.astype('bool')
   return index
Example #10
def confusion_matrix(y_true, y_pred, labels=None, normalize=False, name=None):
  """
  Computes the confusion matrix of given vectors containing
  actual observations and predicted observations.

  Parameters
  ----------
  y_true : 1-d or 2-d tensor variable
      true values
  y_pred : 1-d or 2-d tensor variable
      prediction values
  normalize : bool
      if True, normalize each row to [0., 1.]
  labels : array, shape = [nb_classes], int (nb_classes)
      List of labels to index the matrix. This may be used to reorder
      or select a subset of labels.
      If none is given, those that appear at least once
      in ``y_true`` or ``y_pred`` are used in sorted order.

  Note
  ----
  if you want to calculate: Precision, Recall, F1 scores from the
  confusion matrix, set `normalize=False`

  """
  with tf.name_scope(name, 'confusion_matrix', [y_true, y_pred]):
    nb_classes = None
    if y_true.shape.ndims == 2:
      nb_classes = y_true.shape.as_list()[-1]
      y_true = tf.argmax(y_true, -1)
    elif y_true.shape.ndims != 1:
      raise ValueError('actual must be 1-d or 2-d tensor variable')
    if y_pred.shape.ndims == 2:
      nb_classes = y_pred.shape.as_list()[-1]
      y_pred = tf.argmax(y_pred, -1)
    elif y_pred.shape.ndims != 1:
      raise ValueError('pred must be 1-d or 2-d tensor variable')
    # check valid labels
    if labels is None:
      if nb_classes is None:
        raise RuntimeError(
            "Cannot infer the number of classes for confusion matrix")
      labels = int(nb_classes)
    elif is_number(labels):
      labels = int(labels)
    elif hasattr(labels, '__len__'):
      labels = len(labels)
    # transpose to match the format of sklearn
    cm = tf_cm(labels=y_true, predictions=y_pred, num_classes=labels)
    if normalize:
      cm = tf.cast(cm, dtype='float32')
      cm = cm / tf.reduce_sum(cm, axis=1, keep_dims=True)
    return cm
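The `normalize=True` branch only rescales each row of the count matrix to sum to one; the numpy equivalent of that step, on a made-up 3x3 count matrix, is:

import numpy as np

cm = np.array([[5, 1, 0],
               [2, 6, 2],
               [0, 0, 4]], dtype='float32')
cm_norm = cm / cm.sum(axis=1, keepdims=True)
print(cm_norm.round(2))
# [[0.83 0.17 0.  ]
#  [0.2  0.6  0.2 ]
#  [0.   0.   1.  ]]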
Example #11
def _check_label_mode(mode):
  if is_number(mode):
    return np.clip(float(mode), 0., 1.)
  if is_string(mode):
    mode = mode.lower()
    if mode == 'mid':
      mode = 'middle'
    if mode not in ('common', 'last', 'first', 'middle'):
      raise ValueError(
          "`label_mode` can be: 'common', 'last', 'first', 'middle'")
    return mode
  raise ValueError("No support for `label_mode`=%s" % str(mode))
Example #12
 def set_log_level(self, level):
   """ level: {int, bool}
     if `int`, log-level in integer (from 0 - 9) higher
     means more detail, -1 for turning off the log.
     if True, set the log-level to default: 2
   """
   if is_number(level):
     self._log_level = int(level)
   elif bool(level):
     self._log_level = 2
   else:
     self._log_level = -1
   return self
Example #13
def _preprocessing_data(train, valid):
  from odin import fuel as F
  train = F.as_data(train)
  if is_number(valid):
    start_train = 0.
    end_train = 1. - valid
    start_valid = 1. - valid
    end_valid = 1.
    valid = F.DataGroup(train.data).set_batch(start=start_valid, end=end_valid)
    train = F.DataGroup(train.data).set_batch(start=start_train, end=end_train)
  elif valid is not None:
    valid = F.as_data(valid)
  return train, valid
Example #14
 def __init__(self, indices, axis, target_data=None):
     super(Slice, self).__init__()
     # ====== validate axis ====== #
     if not isinstance(axis, int):
         raise ValueError('axis for Slice must be an integer.')
     if axis == 0 and target_data is not None:
         raise ValueError("You can only apply Slice on axis=0 for all Data, "
                          "(i.e. 'target_data' must be None when axis=0)")
     self.axis = axis
     # ====== validate indices ====== #
     if is_number(indices):
         indices = slice(int(indices), int(indices + 1))
     elif isinstance(indices, (tuple, list)):
         indices = [i if isinstance(i, slice) else slice(int(i), int(i + 1))
                    for i in indices
                    if isinstance(i, slice) or is_number(i)]
     elif not isinstance(indices, slice):
         raise ValueError('indices must be int, slice, or list of int and slice.')
     self.indices = indices
     # ====== validate target_data ====== #
     if target_data is not None and not isinstance(target_data, (tuple, list)):
         target_data = (target_data,)
     self._target_data = target_data
Example #15
def _validate_shape_dtype(x):
  if not isinstance(x, tuple):
    return False
  if not len(x) == 2:
    return False
  shape, dtype = x
  # check shape
  # `shape` must be a tuple whose entries are numbers or None
  if not isinstance(shape, tuple) or \
  not all(is_number(i) or i is None for i in shape):
    return False
  # check dtype
  if not is_string(dtype):
    return False
  return True
Example #16
  def __init__(self, lr, decay_steps=None, decay_rate=0.96, staircase=True,
               clipnorm=None, clipvalue=None, clip_alg='total_norm',
               name=None):
    if name is None:
      name = self.__class__.__name__ + '_' + str(uuid(length=4))
    elif not isinstance(name, string_types):
      name = str(name)
    self._name = str(name)
    self.staircase = bool(staircase)
    with tf.variable_scope(self._name):
      self._lr = _as_variable(lr, name='learning_rate', roles=LearningRate)
      self._lr_decay = None
      self._step = tf.Variable(0., dtype=floatX,
          name="%s_step" % self.__class__.__name__)
      self.decay_steps = decay_steps
      self.decay_rate = decay_rate

      if clipnorm is not None:
        if (clipnorm if is_number(clipnorm) else get_value(clipnorm)) <= 0:
          raise ValueError('`clipnorm` value must be greater than 0.')
      self.clipnorm = _as_variable(clipnorm, name="clip_norm",
          roles=GraidentsClippingNorm)

      if clipvalue is not None:
        if (clipvalue if is_number(clipvalue) else get_value(clipvalue)) <= 0:
          raise ValueError('`clipvalue` value must be greater than 0.')
      self.clipvalue = _as_variable(clipvalue, name="clip_value",
          roles=GraidentsClippingValue)
    # ====== internal states values ====== #
    clip_alg = str(clip_alg).strip().lower()
    if clip_alg not in ('total_norm', 'norm', 'avg_norm'):
      raise ValueError("clip_arg must be one of the following: "
          "'norm', 'total_norm', 'avg_norm'")
    self._norm = 0.
    self.clip_alg = clip_alg
    self._algorithm = None
    self._is_initialized = False
Example #17
def kl_gaussian(mu, logsigma, prior_mu=0., prior_logsigma=0.):
    """ KL-divergence between two gaussians.
  Useful for Variational AutoEncoders. Use this as an activation regularizer

  For taking kl_gaussian as variational regularization, you can take mean of
  the return matrix

  Parameters:
  -----------
  mean, logsigma: parameters of the input distributions
  prior_mean, prior_logsigma: parameters of the desired distribution (note the
      log on logsigma)


  Return
  ------
  matrix: (n_samples, n_features)

  Note
  ----
  original implementation from:
  https://github.com/Philip-Bachman/ICML-2015/blob/master/LogPDFs.py
  Copyright (c) Philip Bachman
  """
    if is_number(prior_mu):
        prior_mu = tf.convert_to_tensor(prior_mu,
                                        name='prior_mu',
                                        dtype=mu.dtype.base_dtype)
    if is_number(prior_logsigma):
        prior_logsigma = tf.convert_to_tensor(prior_logsigma,
                                              name='prior_logsigma',
                                              dtype=logsigma.dtype.base_dtype)
    gauss_klds = 0.5 * (
        2 * (prior_logsigma - logsigma) +
        (tf.exp(2 * logsigma) / tf.exp(2 * prior_logsigma)) + (tf.pow(
            (mu - prior_mu), 2.0) / tf.exp(2 * prior_logsigma)) - 1.0)
    return gauss_klds
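The closed-form expression above is the standard KL divergence between diagonal Gaussians, KL = log(s_p/s_q) + (s_q^2 + (mu_q - mu_p)^2) / (2*s_p^2) - 1/2, applied elementwise; a small numpy check with toy values (no TensorFlow needed):

import numpy as np

mu, logsigma = np.array([0.3]), np.array([-0.2])
prior_mu, prior_logsigma = 0.0, 0.0

# same algebra as kl_gaussian above, written with numpy
kl = 0.5 * (2 * (prior_logsigma - logsigma)
            + np.exp(2 * logsigma) / np.exp(2 * prior_logsigma)
            + (mu - prior_mu) ** 2 / np.exp(2 * prior_logsigma)
            - 1.0)

# textbook form of the same divergence
s_q, s_p = np.exp(logsigma), np.exp(prior_logsigma)
kl_ref = np.log(s_p / s_q) + (s_q ** 2 + (mu - prior_mu) ** 2) / (2 * s_p ** 2) - 0.5
print(np.allclose(kl, kl_ref))   # True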
Example #18
  def shape_transform(self, shapes):
    """
    Parameters
    ----------
    shapes: list of [(shape0, indices0), (shape1, indices1), ...]
        list of data shape tuple and indices, the indices is list
        of tuple (name, length)

    Return
    ------
    new shape transformed by this Recipe
    new indices
    """
    for i in self._recipes:
      shapes = i.shape_transform(shapes)
      # ====== check returned ====== #
      if not all((isinstance(shp, (tuple, list)) and
                  all(is_number(s) for s in shp) and
                  is_string(ids[0][0]) and is_number(ids[0][1]))
                 for shp, ids in shapes):
        raise RuntimeError("Returned `shapes` must be the list of pair "
                           "`(shape, indices)`, where `indices` is the "
                           "list of (name, length(int)).")
    return shapes
Example #19
    def shape_transform(self, shapes):
        """
    Parameters
    ----------
    shapes: list of [(shape0, indices0), (shape1, indices1), ...]
        list of data shape tuple and indices, the indices is list
        of tuple (name, length)

    Return
    ------
    new shape transformed by this Recipe
    new indices
    """
        for i in self._recipes:
            shapes = i.shape_transform(shapes)
            # ====== check returned ====== #
            if not all((isinstance(shp, (tuple, list)) and all(
                    is_number(s) for s in shp) and is_string(ids[0][0])
                        and is_number(ids[0][1])) for shp, ids in shapes):
                raise RuntimeError(
                    "Returned `shapes` must be the list of pair "
                    "`(shape, indices)`, where `indices` is the "
                    "list of (name, length(int)).")
        return shapes
Example #20
def _preprocess_prior_weights(y_true, prior_weights):
    if prior_weights is None:
        return None
    from odin import backend as K
    # ====== everything must be list ====== #
    if not isinstance(prior_weights, (tuple, list)):
        prior_weights = (prior_weights, )
    elif is_number(prior_weights[0]):
        prior_weights = (prior_weights, )
    # ====== matching indices and prior_weights ====== #
    pw = 0
    for yt, w in zip(y_true, prior_weights):
        if w is not None:
            pw += K.to_sample_weights(indices=yt, weights=w)
    return pw
Example #21
def _preprocessing_data(train, valid):
    from odin import fuel as F
    train = F.as_data(train)
    if is_number(valid):
        start_train = 0.
        end_train = 1. - valid
        start_valid = 1. - valid
        end_valid = 1.
        valid = F.DataGroup(train.data).set_batch(start=start_valid,
                                                  end=end_valid)
        train = F.DataGroup(train.data).set_batch(start=start_train,
                                                  end=end_train)
    elif valid is not None:
        valid = F.as_data(valid)
    return train, valid
Example #22
File: losses.py Project: imito/odin
def kl_gaussian(mu, logsigma,
                prior_mu=0., prior_logsigma=0.):
  """ KL-divergence between two gaussians.
  Useful for Variational AutoEncoders. Use this as an activation regularizer

  For taking kl_gaussian as variational regularization, you can take mean of
  the return matrix

  Parameters:
  -----------
  mean, logsigma: parameters of the input distributions
  prior_mean, prior_logsigma: parameters of the desired distribution (note the
      log on logsigma)


  Return
  ------
  matrix: (n_samples, n_features)

  Note
  ----
  original implementation from:
  https://github.com/Philip-Bachman/ICML-2015/blob/master/LogPDFs.py
  Copyright (c) Philip Bachman
  """
  if is_number(prior_mu):
    prior_mu = tf.convert_to_tensor(prior_mu, name='prior_mu',
        dtype=mu.dtype.base_dtype)
  if is_number(prior_logsigma):
    prior_logsigma = tf.convert_to_tensor(
        prior_logsigma, name='prior_logsigma',
        dtype=logsigma.dtype.base_dtype)
  gauss_klds = 0.5 * (2 * (prior_logsigma - logsigma) +
          (tf.exp(2 * logsigma) / tf.exp(2 * prior_logsigma)) +
          (tf.pow((mu - prior_mu), 2.0) / tf.exp(2 * prior_logsigma)) - 1.0)
  return gauss_klds
Example #23
def _preprocess_prior_weights(y_true, prior_weights):
  if prior_weights is None:
    return None
  from odin import backend as K
  # ====== everything must be list ====== #
  if not isinstance(prior_weights, (tuple, list)):
    prior_weights = (prior_weights,)
  elif is_number(prior_weights[0]):
    prior_weights = (prior_weights,)
  # ====== matching indices and prior_weights ====== #
  pw = 0
  for yt, w in zip(y_true, prior_weights):
    if w is not None:
      pw += K.to_sample_weights(indices=yt, weights=w)
  return pw
Example #24
def get_arguments():
    args = ArgController(
    ).add("input", "Name of the dataset or path to csv file"
    ).add("-n", "number of GMM components", 2
    ).add("-idx", "index of the positive component", 1
    ).add("-norm", "method for normalizing: raw, log", 'log', ('log', 'raw')
    ).add("-outpath", "y_bin and y_prob will be saved to this path", ''
    ).add("-figpath", "path for saving analysis figure", '/tmp/tmp.pdf'
    ).add("--verbose", "Enable verbose and saving diagnosis", False
    ).parse()
    inp = str(args.input)
    if os.path.exists(inp):
        assert os.path.isfile(inp), "%s must be path to a file" % inp
        data = []
        with open(inp, 'r') as f:
            for line in f:
                data.append(line.strip().split(','))
        data = np.array(data)
        if all(is_number(i, string_number=True) for i in data[0]):
            y_prot = data.astype('float32')
            y_prot_names = np.array(
                ['#%d' % i for i in range(y_prot.shape[1])])
        else:
            y_prot = data[1:].astype('float32')
            y_prot_names = data[0]
        outpath = args.outpath
    else:
        from sisua.data import get_dataset
        ds, gene_ds, prot_ds = get_dataset(inp, override=False)
        y_prot = ds['y']
        y_prot_names = np.array(ds['y_col'])
        outpath = ds.path if args.outpath == '' else args.outpath
    return {
        'y_prot': y_prot,
        'y_prot_names': y_prot_names,
        'n_components': int(args.n),
        'index': int(args.idx),
        'log_norm': True if args.norm == 'log' else False,
        'outpath': outpath if len(outpath) > 0 else None,
        'figpath': args.figpath if len(args.figpath) > 0 else None,
        'verbose': bool(args.verbose)
    }
Example #25
 def __init__(self, idx, threshold=None,
              mvn=False, varnorm=True,
              data_idx=None, label_idx=()):
   super(Indexing, self).__init__()
   if not hasattr(idx, '__getitem__'):
     raise ValueError("`sad` must has attribute __getitem__ which takes "
         "file name as input and return array of index, same length as data.")
   if threshold is not None and \
   not hasattr(threshold, '__call__') and \
   not is_number(threshold):
     raise ValueError("`threshold` can be None, call-able, or number.")
   self.idx = idx
   self.threshold = threshold
   self.data_idx = data_idx
   self.label_idx = label_idx
   # ====== for normalization ====== #
   self.mvn = bool(mvn)
   self.varnorm = bool(varnorm)
Example #26
def _apply_label_mode(y, mode):
  # This applies the label transform along axis 1
  if is_number(mode):
    n = y.shape[1]
    n = int(float(mode) * n)
    return y[:, n]
  if mode == 'common':
    raise NotImplementedError
  if mode == 'last':
    return y[:, -1]
  elif mode == 'first':
    return y[:, 0]
  elif mode == 'middle':
    n = y.shape[1]
    if n % 2 == 0:
      n //= 2
    else:
      n = n // 2 + 1
    return y[:, n]
  raise NotImplementedError("No support for label mode: '%s'" % mode)
Example #27
 def shape(self):
     """ This is just an "UPPER" estimation, some data points might be lost
 during preprocessing each indices by recipes.
 """
     # ====== first time calculate the shape ====== #
     if self._cache_shape is None or self._recipes_changed:
         # for each Descriptor, create list of pairs: (name, length)
         shapes_indices = []
         for dat in self._data:
             indices = []
             length = 0
             for name in self.indices_keys:
                 start, end = dat.indices[name]
                 lng = end - start
                 length += lng
                 indices.append((name, lng))
              # modify shapes by estimated length from indices
             shapes = (dat.shape,) if is_number(dat.shape[0]) \
                 else dat.shape
             # NOTE: the indices is copy for each shape (i.e. data),
             # hence, it will create some overhead in shape_transform
             for shp in [(length, ) + shp[1:] for shp in shapes]:
                 shapes_indices.append((shp, list(indices)))
         # Recipes shape_transform
         shapes = tuple([
             shp
             for shp, ids in self._recipes.shape_transform(shapes_indices)
         ])
         del shapes_indices
         self._cache_shape = tuple(shapes)
         self._recipes_changed = False
     # ====== get the cached shape ====== #
     if any(s[0] == 0 for s in self._cache_shape):
         raise RuntimeError(
             "Feeder has `length=0` change the recipes to retain "
             "minimum of `length>=1`, shape: %s" % str(self._cache_shape))
     return self._cache_shape
Example #28
 def format_score(s):
   return ctext('%.4f' % s if is_number(s) else s, 'yellow')
Example #29
 def transform(self,
               texts,
               mode='seq',
               dtype='int32',
               padding='pre',
               truncating='pre',
               value=0.,
               end_document=None,
               maxlen=None,
               token_not_found='ignore'):
     """
 Parameters
 ----------
 texts: iterator of unicode
     iterator, generator or list (e.g. [u'a', u'b', ...])
     of unicode documents.
 mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
     'binary', abc
     'tfidf', abc
     'count', abc
     'freq', abc
     'seq', abc
 token_not_found: 'ignore', 'raise', a token string, an integer
     pass
 """
     # ====== check arguments ====== #
     texts = self._validate_texts(texts)
     # ====== check mode ====== #
     mode = str(mode)
     if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
         raise ValueError('The "mode" argument must be: "seq", "binary", '
                          '"count", "freq", or "tfidf".')
     # ====== check token_not_found ====== #
     if not is_number(token_not_found) and \
     not is_string(token_not_found) and \
     token_not_found not in ('ignore', 'raise'):
          raise ValueError('token_not_found can be: "ignore", "raise", '
                           'an integer of token index, or a string '
                           'representing a token.')
      # convert token_not_found to an index if it is a number or a token string
      if is_number(token_not_found):
          token_not_found = int(token_not_found)
      elif token_not_found not in ('ignore', 'raise'):
          token_not_found = int(self.dictionary[token_not_found])
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== Initialize variables ====== #
     dictionary = self.dictionary
     results = []
     # ====== preprocess arguments ====== #
     if isinstance(end_document, str):
          end_document = int(dictionary[end_document])
     elif is_number(end_document):
         end_document = int(end_document)
     # ====== processing ====== #
     if hasattr(texts, '__len__'):
         target_len = len(texts)
         auto_adjust_len = False
     else:
         target_len = 1234
         auto_adjust_len = True
     prog = Progbar(target=target_len,
                    name="Tokenize Transform",
                    print_report=True,
                    print_summary=True)
     for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
         # found the word in dictionary
         vec = []
         for x in doc:
             idx = dictionary.get(x, -1)
             if idx >= 0:
                 vec.append(idx)
                 # not found the token in dictionary
             elif token_not_found == 'ignore':
                 continue
             elif token_not_found == 'raise':
                 raise RuntimeError(
                     'Cannot find token: "%s" in dictionary' % x)
             elif isinstance(token_not_found, int):
                 vec.append(token_not_found)
         # append ending document token
         if end_document is not None:
             vec.append(end_document)
         # add the final results
         results.append(vec)
         # print progress
         if self.print_progress:
             prog['#Docs'] = nb_docs
             prog.add(1)
             if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                 prog.target = 1.2 * prog.target
     # end the process
     # if self.print_progress and auto_adjust_len:
     #     prog.target = nb_docs; prog.update(nb_docs)
     # ====== pad the sequence ====== #
     # just transform into sequence of tokens
     if mode == 'seq':
         maxlen = self.longest_document_length if maxlen is None \
             else int(maxlen)
         results = pad_sequences(results,
                                 maxlen=maxlen,
                                 dtype=dtype,
                                 padding=padding,
                                 truncating=truncating,
                                 value=value)
     # transform into one-hot matrix
     else:
         X = np.zeros(shape=(len(results), self.nb_words))
         for i, seq in enumerate(results):
             if mode == 'binary':
                 X[i, seq] = 1
             elif mode == 'freq':
                 length = len(seq)
                 count = freqcount(seq)
                 for tok, n in count.items():
                     X[i, tok] = n / float(length)
             elif mode == 'count':
                 count = freqcount(seq)
                 for tok, n in count.items():
                     X[i, tok] = n
             elif mode == 'tfidf':
                 count = freqcount(seq)
                 for tok, n in count.items():
                     tf = 1 + np.log(n)
                     docs_freq = self._word_dictionary_info.get(
                         tok, (0, 0))[-1]
                     idf = np.log(1 + self.nb_docs / (1 + docs_freq))
                     X[i, tok] = tf * idf
         results = X
     return results
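In the 'tfidf' branch each token count n is weighted by tf = 1 + log(n) and idf = log(1 + nb_docs / (1 + docs_freq)). A numpy sketch of that weighting for a single document row (all numbers below are made up):

import numpy as np

nb_docs = 100                   # documents seen while fitting (toy value)
token_counts = {7: 3, 42: 1}    # token index -> count in this document
docs_freq = {7: 10, 42: 90}     # token index -> number of documents containing the token

row = np.zeros(50)
for tok, n in token_counts.items():
    tf = 1 + np.log(n)
    idf = np.log(1 + nb_docs / (1 + docs_freq.get(tok, 0)))
    row[tok] = tf * idf
print(row[7], row[42])   # rare token 7 gets a much larger weight than common token 42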
Example #30
 def fit(self, X, y=None, cv=None):
     self._initialize(X)
     if not hasattr(X, 'shape') or not hasattr(X, '__iter__') or \
     not hasattr(X, '__len__'):
         raise ValueError(
             "`X` must has 'shape', '__len__' and '__iter__' attributes")
     nb_train_samples = len(X)
     # convert to odin.fuel.Data if possible
     if isinstance(X, (np.ndarray, list, tuple)):
         X = F.as_data(X)
     if isinstance(y, (np.ndarray, list, tuple)):
         y = F.as_data(y)
     start_tr = 0
     end_tr = nb_train_samples
     # ====== check if cross validating ====== #
     create_it_cv = None
     if is_number(cv):
         cv = int(float(cv) * nb_train_samples) if cv < 1. else int(cv)
         end_tr = nb_train_samples - cv
         start_cv = end_tr
         end_cv = nb_train_samples
         nb_cv_samples = end_cv - start_cv
         create_it_cv = _create_it_func(X=X,
                                        y=y,
                                        batch_size=self.batch_size,
                                        start=start_cv,
                                        end=end_cv)
     elif isinstance(cv, (tuple, list)):
         X_cv, y_cv = cv
         nb_cv_samples = X_cv.shape[0]
         create_it_cv = _create_it_func(X=X_cv,
                                        y=y_cv,
                                        batch_size=self.batch_size,
                                        start=0,
                                        end=X_cv.shape[0])
     elif hasattr(cv, 'set_batch'):
         nb_cv_samples = cv.shape[0]
         create_it_cv = _create_it_func(X=cv,
                                        y=None,
                                        batch_size=self.batch_size,
                                        start=0,
                                        end=cv.shape[0])
     elif cv is not None:
         raise ValueError(
             '`cv` can be float (0-1), tuple or list of X and y, '
             'any object that have "shape" and "__iter__" attributes, '
             'or None')
     # ====== preprocessing ====== #
     create_it = _create_it_func(X=X,
                                 y=y,
                                 batch_size=self.batch_size,
                                 start=start_tr,
                                 end=end_tr)
     # ====== prepare ====== #
     curr_niter = sum(epoch[0] for epoch in self._train_history)
     curr_nepoch = len(self._train_history)
     curr_patience = int(self.patience)
     last_losses = None
     last_checkpoint = None
     best_epoch = None
     is_converged = False
     # ====== fitting ====== #
     while not is_converged:
         curr_nepoch += 1
         seed = self._rand_state.randint(0, 10e8)
         # ====== training ====== #
         nb_iter, duration, results = _fitting_helper(
             create_it(seed),
             fn=self._f_train,
             nb_samples=nb_train_samples,
             nb_classes=self.nb_classes,
             title='Epoch %d' % curr_nepoch)
         curr_niter += nb_iter
         self._train_history.append(
             (nb_train_samples, nb_iter, duration, results))
         # ====== cross validation ====== #
         if create_it_cv is not None:
             nb_iter, duration_valid, results = _fitting_helper(
                 create_it_cv(seed),
                 fn=self._f_score,
                 nb_samples=nb_cv_samples,
                 nb_classes=self.nb_classes,
                 title="Validating")
             self._valid_history.append(
                 (nb_train_samples, nb_iter, duration_valid, results))
             duration += duration_valid
         # ====== print log ====== #
         if self.verbose >= 2:
             print(
                 ctext('#epoch:', 'cyan') + str(curr_nepoch),
                 ctext('#iter:', 'cyan') + str(curr_niter),
                 ctext("Loss:", 'yellow') + '%.5f' % results[0],
                 ctext("Acc:", 'yellow') + '%.3f' % results[1],
                 ctext("%.2f(s)" % duration, 'magenta'))
             if self.confusion_matrix and (curr_nepoch - 1) % 8 == 0:
                 print(V.print_confusion(results[-1], labels=self.labels))
         # ====== early stopping ====== #
         losses = results[0]
         if last_checkpoint is None:  # first check point
             last_checkpoint = self.parameters
         if last_losses is not None:
             # degraded, smaller is better
             if last_losses - losses <= self.tol:
                 curr_patience -= 1
                 if self.rollback:
                     if self.verbose >= 2:
                         wprint(
                             '[LogisticRegression] Rollback to the best checkpoint '
                             'at epoch:%s patience:%s' %
                             (ctext(best_epoch,
                                    'cyan'), ctext(curr_patience, 'cyan')))
                     self.set_parameters(*last_checkpoint)
             # save best checkpoint
             else:
                 last_checkpoint = self.parameters
                 best_epoch = curr_nepoch
                 if self._path is not None:
                     with open(self._path, 'wb') as f:
                         pickle.dump(self, f)
         last_losses = losses
         if curr_patience <= 0:
             is_converged = True
         # end the training
         if self.max_iter is not None and \
         curr_niter >= self.max_iter:
             break
         if self.max_epoch is not None and \
         curr_nepoch >= self.max_epoch:
             break
     # ====== print summary plot ====== #
     if self.verbose >= 1:
         train_losses = [epoch[-1][0] for epoch in self._train_history]
         print(
             V.print_bar(train_losses,
                         height=12,
                         bincount=min(20, len(train_losses)),
                         title='Training Losses'))
         if create_it_cv is not None:
             valid_losses = [epoch[-1][0] for epoch in self._valid_history]
             print(
                 V.print_bar(valid_losses,
                             height=12,
                             bincount=min(20, len(train_losses)),
                             title='Validation Losses'))
         if self.confusion_matrix:
             print(
                 ctext("======== Training Confusion Matrix ========",
                       'cyan'))
             print(
                 V.print_confusion(arr=self._train_history[-1][-1][-1],
                                   labels=self.labels))
             if create_it_cv is not None:
                 print(
                     ctext("======== Validation Confusion Matrix ========",
                           'cyan'))
                 print(
                     V.print_confusion(arr=self._valid_history[-1][-1][-1],
                                       labels=self.labels))
     # ====== reset to best points ====== #
     self.set_parameters(*last_checkpoint)
     self._is_fitted = True
     if self._path is not None:
         with open(self._path, 'wb') as f:
             pickle.dump(self, f)
Example #31
 def __init__(self, nb_classes, l1=0., l2=0.,
              fit_intercept=True, confusion_matrix=True,
              tol=1e-4, patience=3, rollback=True,
              batch_size=1024, max_epoch=100, max_iter=None,
              optimizer='adadelta', learning_rate=1.0, class_weight=None,
              dtype='float32', seed=5218,
              verbose=False, path=None, name=None):
   super(LogisticRegression, self).__init__()
   # ====== basic dimensions ====== #
   if isinstance(nb_classes, (tuple, list, np.ndarray)):
     self._labels = tuple([str(i) for i in nb_classes])
     self._nb_classes = len(nb_classes)
   elif is_number(nb_classes):
     self._labels = tuple([str(i) for i in range(nb_classes)])
     self._nb_classes = int(nb_classes)
   self._feat_dim = None
   self._dtype = np.dtype(dtype)
   # ====== preprocessing class weight ====== #
   if class_weight is None:
     class_weight = np.ones(shape=(self.nb_classes,),
                            dtype=self.dtype)
   elif is_number(class_weight):
     class_weight = np.zeros(shape=(self.nb_classes,),
                             dtype=self.dtype) + class_weight
   self._class_weight = class_weight
   # ====== flags ====== #
   self.l1 = float(l1)
   self.l2 = float(l2)
   self.fit_intercept = bool(fit_intercept)
   self.confusion_matrix = bool(confusion_matrix)
   # ====== internal states ====== #
   self._is_fitted = False
   # ====== others ====== #
   if name is None:
     name = uuid(length=8)
     self._name = 'LogisticRegression_%s' % name
   else:
     self._name = str(name)
   self._path = path
   # ====== training ====== #
   self.batch_size = int(batch_size)
   self.max_epoch = max_epoch
   self.max_iter = max_iter
   if not is_string(optimizer):
     raise ValueError("`optimizer` must be one of the following")
   optimizer = optimizer.lower()
   if optimizer not in _optimizer_list:
     raise ValueError("`optimizer` must be one of the following: %s" %
       str(list(_optimizer_list.keys())))
   self._optimizer = _optimizer_list[optimizer.lower()](lr=float(learning_rate))
   self._optimizer_name = optimizer
   self._optimizer_lr = learning_rate
   # ====== stop training ====== #
   self.tol = float(tol)
   self.patience = int(patience)
   self.rollback = bool(rollback)
   # ====== others ====== #
   self._train_history = []
   self._valid_history = []
   self._rand_state = np.random.RandomState(seed=int(seed))
   self.verbose = int(verbose)
Example #32
File: data.py Project: imito/odin
 def __len__(self):
   """ len always return 1 number """
   shape = self.shape
   if is_number(shape[0]):
     return shape[0]
   return self.shape[0][0]
Example #33
File: text.py Project: imito/odin
 def transform(self, texts, mode='seq', dtype='int32',
               padding='pre', truncating='pre', value=0.,
               end_document=None, maxlen=None,
               token_not_found='ignore'):
   """
   Parameters
   ----------
   texts: iterator of unicode
       iterator, generator or list (e.g. [u'a', u'b', ...])
       of unicode documents.
   mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
       'binary', abc
       'tfidf', abc
       'count', abc
       'freq', abc
       'seq', abc
   token_not_found: 'ignore', 'raise', a token string, an integer
       pass
   """
   # ====== check arguments ====== #
   texts = self._validate_texts(texts)
   # ====== check mode ====== #
   mode = str(mode)
   if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
     raise ValueError('The "mode" argument must be: "seq", "binary", '
                      '"count", "freq", or "tfidf".')
   # ====== check token_not_found ====== #
   if not is_number(token_not_found) and \
   not is_string(token_not_found) and \
   token_not_found not in ('ignore', 'raise'):
      raise ValueError('token_not_found can be: "ignore", "raise", '
                       'an integer of token index, or a string '
                       'representing a token.')
    # convert token_not_found to an index if it is a number or a token string
    if is_number(token_not_found):
      token_not_found = int(token_not_found)
    elif token_not_found not in ('ignore', 'raise'):
      token_not_found = int(self.dictionary[token_not_found])
   # ====== pick engine ====== #
   if self.__engine == 'spacy':
     processor = self._preprocess_docs_spacy
   elif self.__engine == 'odin':
     processor = self._preprocess_docs_odin
   # ====== Initialize variables ====== #
   dictionary = self.dictionary
   results = []
   # ====== preprocess arguments ====== #
   if isinstance(end_document, str):
      end_document = int(dictionary[end_document])
   elif is_number(end_document):
     end_document = int(end_document)
   # ====== processing ====== #
   if hasattr(texts, '__len__'):
     target_len = len(texts)
     auto_adjust_len = False
   else:
     target_len = 1208
     auto_adjust_len = True
   prog = Progbar(target=target_len, name="Tokenize Transform",
                  print_report=True, print_summary=True)
   for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
     # found the word in dictionary
     vec = []
     for x in doc:
       idx = dictionary.get(x, -1)
       if idx >= 0: vec.append(idx)
       # not found the token in dictionary
       elif token_not_found == 'ignore':
         continue
       elif token_not_found == 'raise':
         raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
       elif isinstance(token_not_found, int):
         vec.append(token_not_found)
     # append ending document token
     if end_document is not None:
       vec.append(end_document)
     # add the final results
     results.append(vec)
     # print progress
     if self.print_progress:
       prog['#Docs'] = nb_docs
       prog.add(1)
       if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
         prog.target = 1.2 * prog.target
   # end the process
   # if self.print_progress and auto_adjust_len:
   #     prog.target = nb_docs; prog.update(nb_docs)
   # ====== pad the sequence ====== #
   # just transform into sequence of tokens
   if mode == 'seq':
     maxlen = self.longest_document_length if maxlen is None \
         else int(maxlen)
     results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                             padding=padding, truncating=truncating,
                             value=value)
   # transform into one-hot matrix
   else:
     X = np.zeros(shape=(len(results), self.nb_words))
     for i, seq in enumerate(results):
       if mode == 'binary':
         X[i, seq] = 1
       elif mode == 'freq':
         length = len(seq)
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n / float(length)
       elif mode == 'count':
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n
       elif mode == 'tfidf':
         count = freqcount(seq)
         for tok, n in count.items():
           tf = 1 + np.log(n)
           docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
           idf = np.log(1 + self.nb_docs / (1 + docs_freq))
           X[i, tok] = tf * idf
     results = X
   return results
Example #34
 def _post_processing(self, X):
   X = X[:, -1] # remove timestamp
   if is_number(self.threshold):
     X = (X >= self.threshold).astype("bool")
   return {self.output_name: X}
Example #35
 def fit(self, X, y=None, cv=None):
   self._initialize(X)
   if not hasattr(X, 'shape') or not hasattr(X, '__iter__') or \
   not hasattr(X, '__len__'):
     raise ValueError("`X` must has 'shape', '__len__' and '__iter__' attributes")
   nb_train_samples = len(X)
   # convert to odin.fuel.Data if possible
   if isinstance(X, (np.ndarray, list, tuple)):
     X = F.as_data(X)
   if isinstance(y, (np.ndarray, list, tuple)):
     y = F.as_data(y)
   start_tr = 0
   end_tr = nb_train_samples
   # ====== check if cross validating ====== #
   create_it_cv = None
   if is_number(cv):
     cv = int(float(cv) * nb_train_samples) if cv < 1. else int(cv)
     end_tr = nb_train_samples - cv
     start_cv = end_tr
     end_cv = nb_train_samples
     nb_cv_samples = end_cv - start_cv
     create_it_cv = _create_it_func(X=X, y=y, batch_size=self.batch_size,
                                    start=start_cv, end=end_cv)
   elif isinstance(cv, (tuple, list)):
     X_cv, y_cv = cv
     nb_cv_samples = X_cv.shape[0]
     create_it_cv = _create_it_func(X=X_cv, y=y_cv, batch_size=self.batch_size,
                                    start=0, end=X_cv.shape[0])
   elif hasattr(cv, 'set_batch'):
     nb_cv_samples = cv.shape[0]
     create_it_cv = _create_it_func(X=cv, y=None, batch_size=self.batch_size,
                                    start=0, end=cv.shape[0])
   elif cv is not None:
     raise ValueError('`cv` can be float (0-1), tuple or list of X and y, '
                      'any object that have "shape" and "__iter__" attributes, '
                      'or None')
   # ====== preprocessing ====== #
   create_it = _create_it_func(X=X, y=y, batch_size=self.batch_size,
                               start=start_tr, end=end_tr)
   # ====== prepare ====== #
   curr_niter = sum(epoch[0] for epoch in self._train_history)
   curr_nepoch = len(self._train_history)
   curr_patience = int(self.patience)
   last_losses = None
   last_checkpoint = None
   best_epoch = None
   is_converged = False
   # ====== fitting ====== #
   while not is_converged:
     curr_nepoch += 1
     seed = self._rand_state.randint(0, 10e8)
     # ====== training ====== #
     nb_iter, duration, results = _fitting_helper(create_it(seed),
                                                  fn=self._f_train,
                                                  nb_samples=nb_train_samples,
                                                  nb_classes=self.nb_classes,
                                                  title='Epoch %d' % curr_nepoch)
     curr_niter += nb_iter
     self._train_history.append(
         (nb_train_samples, nb_iter, duration, results))
     # ====== cross validation ====== #
     if create_it_cv is not None:
       nb_iter, duration_valid, results = _fitting_helper(create_it_cv(seed),
                                                    fn=self._f_score,
                                                    nb_samples=nb_cv_samples,
                                                    nb_classes=self.nb_classes,
                                                    title="Validating")
       self._valid_history.append(
           (nb_train_samples, nb_iter, duration_valid, results))
       duration += duration_valid
     # ====== print log ====== #
     if self.verbose >= 2:
       print(ctext('#epoch:', 'cyan') + str(curr_nepoch),
             ctext('#iter:', 'cyan') + str(curr_niter),
             ctext("Loss:", 'yellow') + '%.5f' % results[0],
             ctext("Acc:", 'yellow') + '%.3f' % results[1],
             ctext("%.2f(s)" % duration, 'magenta'))
       if self.confusion_matrix and (curr_nepoch - 1) % 8 == 0:
         print(V.print_confusion(results[-1], labels=self.labels))
     # ====== early stopping ====== #
     losses = results[0]
     if last_checkpoint is None: # first check point
       last_checkpoint = self.parameters
     if last_losses is not None:
       # degraded, smaller is better
       if last_losses - losses <= self.tol:
         curr_patience -= 1
         if self.rollback:
           if self.verbose >= 2:
             wprint('[LogisticRegression] Rollback to the best checkpoint '
                    'at epoch:%s patience:%s' %
                    (ctext(best_epoch, 'cyan'),
                     ctext(curr_patience, 'cyan')))
           self.set_parameters(*last_checkpoint)
       # save best checkpoint
       else:
         last_checkpoint = self.parameters
         best_epoch = curr_nepoch
         if self._path is not None:
           with open(self._path, 'wb') as f:
             pickle.dump(self, f)
     last_losses = losses
     if curr_patience <= 0:
       is_converged = True
     # end the training
     if self.max_iter is not None and \
     curr_niter >= self.max_iter:
       break
     if self.max_epoch is not None and \
     curr_nepoch >= self.max_epoch:
       break
   # ====== print summary plot ====== #
   if self.verbose >= 1:
     train_losses = [epoch[-1][0] for epoch in self._train_history]
     print(V.print_bar(train_losses, height=12,
                       bincount=min(20, len(train_losses)),
                       title='Training Losses'))
     if create_it_cv is not None:
       valid_losses = [epoch[-1][0] for epoch in self._valid_history]
       print(V.print_bar(valid_losses, height=12,
                         bincount=min(20, len(train_losses)),
                         title='Validation Losses'))
     if self.confusion_matrix:
       print(ctext("======== Training Confusion Matrix ========", 'cyan'))
       print(V.print_confusion(arr=self._train_history[-1][-1][-1],
                               labels=self.labels))
       if create_it_cv is not None:
         print(ctext("======== Validation Confusion Matrix ========", 'cyan'))
         print(V.print_confusion(arr=self._valid_history[-1][-1][-1],
                                 labels=self.labels))
   # ====== reset to best points ====== #
   self.set_parameters(*last_checkpoint)
   self._is_fitted = True
   if self._path is not None:
     with open(self._path, 'wb') as f:
       pickle.dump(self, f)
Example #36
def upsample(x, scale, axes, method='nn', name=None):
    """
  Parameters
  ----------
  scale: int, list of int
      scaling up factor
  axes: int, list of int
      the axes of tensor which the upsampling method will be applied
  method: str, int
      'nn' for nearest neighbor (e.g. [1, 2] => [1, 1, 2, 2]),
      'pad' for padding within the tensor. 'pad_margin' do padding
      in the margin of the tensor. 'repeat' simple algorithm for
      repeating the element (e.g. [1, 2] => [1, 2, 1, 2])
  """
    with tf.name_scope(name, "Upsample"):
        method = method.lower()
        input_shape = tf.shape(x)
        input_shape_int = x.shape.as_list()
        ndims = x.shape.ndims
        # normalize all negative axes
        if axes is None:
            raise ValueError("axes cannot be None.")
        axes = sorted(i % ndims for i in as_tuple(axes))
        # make scale a tuple
        scale = as_tuple(scale, N=len(axes), t=int)
        # mapping from axis -> scale
        scale_map = defaultdict(lambda: 1)
        scale_map.update([(i, j) for i, j in zip(axes, scale)])
        # create final output_shape
        output_shape = [input_shape[i] * scale_map[i] for i in range(ndims)]
        # ====== Nearest neighbor method ====== #
        if method == 'nn':
            # tensorflow only support for tile <= 6-D tensor
            if ndims >= 6:
                raise ValueError(
                    'upsample with NN mode does not support rank >= 6 tensor.')
            elif ndims + len(axes) > 6:
                for a in axes:
                    x = upsample(x, scale_map[a], axes=a, method='nn')
            else:
                # repeat the tensor
                x = dimshuffle(x,
                               pattern=list(range(ndims)) + ['x'] * len(axes))
                x = repeat(x,
                           scale,
                           axes=[i for i in range(ndims, ndims + len(axes))])
                # transpose it back to the right shape
                axes_map = {
                    i: j
                    for i, j in zip(axes, range(ndims, ndims + len(axes)))
                }
                new_axes = []
                for i in range(ndims):
                    if i not in axes_map:
                        new_axes.append(i)
                    else:
                        new_axes += [i, axes_map[i]]
                x = tf.transpose(x, perm=new_axes)
                x = reshape(x, output_shape)
        # ====== padding margin ====== #
        elif method == 'pad_margin':
            paddings = [[0, 0] if i not in axes else [
                tf.cast(tf.ceil(input_shape[i] * (scale_map[i] - 1) /
                                2), 'int32'),
                tf.cast(tf.floor(input_shape[i] * (scale_map[i] - 1) /
                                 2), 'int32')
            ] for i in range(ndims)]
            x = tf.pad(x, paddings=paddings, mode='CONSTANT')
        # ====== padding ====== #
        elif method == 'pad':
            raise NotImplementedError
            # x = tf.scatter_nd(indices, x, shape=output_shape)
        # ====== repeat ====== #
        elif method == 'repeat':
            x = repeat(x, n=scale, axes=axes)
        # ====== no support ====== #
        else:
            raise ValueError("No support for method='%s'" % method)
        # ====== add_shape ====== #
        return set_shape(x,
                         shape=[
                             s * scale_map[i] if is_number(s) else None
                             for i, s in enumerate(input_shape_int)
                         ])
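A usage sketch for the upsample helper above, assuming it is importable from odin.backend and that the code runs in TF1-style graph mode (as the rest of this file does):

import numpy as np
import tensorflow as tf
from odin import backend as K  # import path is an assumption

x = tf.placeholder(shape=(None, 8, 8, 3), dtype='float32', name='images')
# double the spatial resolution with nearest-neighbor upsampling
y = K.upsample(x, scale=2, axes=(1, 2), method='nn')
with tf.Session() as sess:
  out = sess.run(y, feed_dict={x: np.random.rand(4, 8, 8, 3)})
  print(out.shape)  # expected: (4, 16, 16, 3)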
Example No. 37
0
 def _initialize(self, X):
   # ====== check inputs dimensions ====== #
   if not hasattr(X, 'shape'):
     raise ValueError("`X` must have `shape` attribute.")
   feat_dim = np.prod(X.shape[1:])
   if self._feat_dim is None:
     self._feat_dim = feat_dim
   # validate input dimension
   if feat_dim != self._feat_dim:
     raise RuntimeError("Feature dimension mismatch %d and %d" %
                        (feat_dim, self.feat_dim))
   # check if the tensorflow ops are already initialized
   if hasattr(self, '_f_train'):
     return
   # ====== binary or multi-classes ====== #
   if self.nb_classes == 2:
     out_shape = (None,)
     fn_activation = tf.nn.sigmoid
     fn_loss = tf.losses.sigmoid_cross_entropy
     fn_acc = K.metrics.binary_accuracy
   else:
     out_shape = (None, self.nb_classes)
     fn_activation = tf.nn.softmax
     fn_loss = tf.losses.softmax_cross_entropy
     fn_acc = K.metrics.categorical_accuracy
   # ====== create model ====== #
   with tf.name_scope(self.name, 'logistic_regression'):
     # inputs
     self._X = K.placeholder(shape=(None, self.feat_dim),
                             dtype=self.dtype,
                             name='%s_input' % self.name)
     self._y = K.placeholder(shape=out_shape,
                             dtype=self.dtype,
                             name='%s_output' % self.name)
     # check the bias
     if is_number(self.fit_intercept):
       b_init = float(self.fit_intercept)
     elif self.fit_intercept is False or \
     self.fit_intercept is None:
       b_init = None
     else:
       b_init = self.fit_intercept
     # create the model and initialize
     with K.variable_dtype(dtype=self.dtype):
       self._model = N.Dense(num_units=self.nb_classes,
                         W_init=init_ops.glorot_uniform_initializer(seed=self._rand_state.randint()),
                         b_init=b_init,
                         activation=K.linear)
       y_logits = self._model(self._X)
     y_prob = fn_activation(y_logits)
     # applying class weights
     class_weights = tf.constant(value=self._class_weight,
                                 dtype=self.dtype,
                                 name="class_weights")
     weights = tf.gather(class_weights,
                         tf.cast(self._y, 'int32') if self.nb_classes == 2 else
                         tf.argmax(self._y, axis=-1))
     # optimizer
     params = [v for v in self._model.variables
               if has_roles(v, Weight) or has_roles(v, Bias)]
     losses = fn_loss(self._y, y_logits, weights=weights)
     l1_norm = tf.norm(self._model.get('W'), ord=1) if self.l1 > 0. else 0
     l2_norm = tf.norm(self._model.get('W'), ord=2) if self.l2 > 0. else 0
     losses = losses + self.l1 * l1_norm + self.l2 * l2_norm
     acc = fn_acc(self._y, y_prob)
     updates = self._optimizer.get_updates(losses, params)
     # create function
     if self.confusion_matrix:
       cm = K.metrics.confusion_matrix(y_true=self._y, y_pred=y_prob,
                                       labels=self.nb_classes)
     metrics = [losses, acc, cm] if self.confusion_matrix else [losses, acc]
     self._f_train = K.function(inputs=(self._X, self._y),
                                outputs=metrics,
                                updates=updates,
                                training=True)
     self._f_score = K.function(inputs=(self._X, self._y),
                                outputs=metrics,
                                training=False)
     self._f_pred_prob = K.function(inputs=self._X,
                                    outputs=y_prob,
                                    training=False)
     self._f_pred_logit = K.function(inputs=self._X,
                                     outputs=y_logits,
                                     training=False)
   return self
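The binary / multi-class switch above maps onto plain TF1 losses roughly as follows; shapes and names here are illustrative only.

import tensorflow as tf

nb_classes = 2
if nb_classes == 2:
  y = tf.placeholder(shape=(None,), dtype='float32')       # 0/1 targets
  logits = tf.placeholder(shape=(None,), dtype='float32')
  loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=y, logits=logits)
  prob = tf.nn.sigmoid(logits)
else:
  y = tf.placeholder(shape=(None, nb_classes), dtype='float32')  # one-hot targets
  logits = tf.placeholder(shape=(None, nb_classes), dtype='float32')
  loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)
  prob = tf.nn.softmax(logits)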
Example No. 38
0
def confusion_matrix(y_true, y_pred, labels=None, normalize=False,
                     name=None):
  """
  Computes the confusion matrix of given vectors containing
  actual observations and predicted observations.

  Parameters
  ----------
  y_true : 1-d or 2-d tensor variable
      true values
  y_pred : 1-d or 2-d tensor variable
      prediction values
  normalize : bool
      if True, normalize each row to [0., 1.]
  labels : array of shape [nb_classes], or int (number of classes)
      List of labels to index the matrix. This may be used to reorder
      or select a subset of labels.
      If none is given, those that appear at least once
      in ``y_true`` or ``y_pred`` are used in sorted order.

  Note
  ----
  if you want to calculate: Precision, Recall, F1 scores from the
  confusion matrix, set `normalize=False`

  """
  # ====== numpy ndarray ====== #
  if isinstance(y_true, np.ndarray) or isinstance(y_pred, np.ndarray):
    from sklearn.metrics import confusion_matrix as sk_cm
    nb_classes = None
    if y_true.ndim > 1:
      nb_classes = y_true.shape[1]
      y_true = np.argmax(y_true, axis=-1)
    if y_pred.ndim > 1:
      nb_classes = y_pred.shape[1]
      y_pred = np.argmax(y_pred, axis=-1)
    # get number of classes
    if labels is None:
      if nb_classes is None:
        raise RuntimeError("Cannot infer the number of classes for confusion matrix")
      labels = int(nb_classes)
    elif is_number(labels):
      labels = list(range(labels))
    cm = sk_cm(y_true=y_true, y_pred=y_pred, labels=labels)
    if normalize:
      cm = cm.astype('float32') / np.sum(cm, axis=1, keepdims=True)
    return cm
  # ====== tensorflow tensor ====== #
  with tf.name_scope(name, 'confusion_matrix', [y_true, y_pred]):
    from tensorflow.contrib.metrics import confusion_matrix as tf_cm
    nb_classes = None
    if y_true.shape.ndims == 2:
      nb_classes = y_true.shape.as_list()[-1]
      y_true = tf.argmax(y_true, -1)
    elif y_true.shape.ndims != 1:
      raise ValueError('actual must be 1-d or 2-d tensor variable')
    if y_pred.shape.ndims == 2:
      nb_classes = y_pred.shape.as_list()[-1]
      y_pred = tf.argmax(y_pred, -1)
    elif y_pred.shape.ndims != 1:
      raise ValueError('pred must be 1-d or 2-d tensor variable')
    # check valid labels
    if labels is None:
      if nb_classes is None:
        raise RuntimeError("Cannot infer the number of classes for confusion matrix")
      labels = int(nb_classes)
    elif is_number(labels):
      labels = int(labels)
    elif hasattr(labels, '__len__'):
      labels = len(labels)
    # the count matrix already matches sklearn's layout (rows: true, columns: predicted)
    cm = tf_cm(labels=y_true, predictions=y_pred,
               num_classes=labels)
    if normalize:
      cm = tf.cast(cm, dtype='float32')
      cm = cm / tf.reduce_sum(cm, axis=1, keep_dims=True)
    return add_roles(cm, ConfusionMatrix)
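A quick usage sketch for the NumPy branch of the confusion_matrix function above:

import numpy as np

y_true = np.array([0, 1, 2, 2, 1, 0])
y_pred = np.array([0, 2, 2, 2, 1, 1])
cm = confusion_matrix(y_true, y_pred, labels=3)  # 3 classes -> 3x3 count matrix
cm_norm = confusion_matrix(y_true, y_pred, labels=3, normalize=True)
print(cm)       # rows are true classes, columns are predicted classes
print(cm_norm)  # each row sums to 1.0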
Example No. 39
0
 def __init__(self,
              nb_classes,
              l1=0.,
              l2=0.,
              fit_intercept=True,
              confusion_matrix=True,
              tol=1e-4,
              patience=3,
              rollback=True,
              batch_size=1024,
              max_epoch=100,
              max_iter=None,
              optimizer='adadelta',
              learning_rate=1.0,
              class_weight=None,
              dtype='float32',
              seed=1234,
              verbose=False,
              path=None,
              name=None):
     super(LogisticRegression, self).__init__()
     # ====== basic dimensions ====== #
      if isinstance(nb_classes, (tuple, list, np.ndarray)):
          self._labels = tuple([str(i) for i in nb_classes])
          self._nb_classes = len(nb_classes)
      elif is_number(nb_classes):
          self._labels = tuple([str(i) for i in range(nb_classes)])
          self._nb_classes = int(nb_classes)
      else:
          raise ValueError("`nb_classes` must be an integer or a list of class labels.")
     self._feat_dim = None
     self._dtype = np.dtype(dtype)
     # ====== preprocessing class weight ====== #
     if class_weight is None:
         class_weight = np.ones(shape=(self.nb_classes, ), dtype=self.dtype)
     elif is_number(class_weight):
         class_weight = np.zeros(shape=(self.nb_classes, ),
                                 dtype=self.dtype) + class_weight
     self._class_weight = class_weight
     # ====== flags ====== #
     self.l1 = float(l1)
     self.l2 = float(l2)
     self.fit_intercept = bool(fit_intercept)
     self.confusion_matrix = bool(confusion_matrix)
     # ====== internal states ====== #
     self._is_fitted = False
     # ====== others ====== #
     if name is None:
         name = uuid(length=8)
         self._name = 'LogisticRegression_%s' % name
     else:
         self._name = str(name)
     self._path = path
     # ====== training ====== #
     self.batch_size = int(batch_size)
     self.max_epoch = max_epoch
     self.max_iter = max_iter
      if not is_string(optimizer):
          raise ValueError("`optimizer` must be a string.")
     optimizer = optimizer.lower()
     if optimizer not in _optimizer_list:
         raise ValueError("`optimizer` must be one of the following: %s" %
                          str(list(_optimizer_list.keys())))
      self._optimizer = _optimizer_list[optimizer](
          lr=float(learning_rate))
     self._optimizer_name = optimizer
     self._optimizer_lr = learning_rate
     # ====== stop training ====== #
     self.tol = float(tol)
     self.patience = int(patience)
     self.rollback = bool(rollback)
     # ====== others ====== #
     self._train_history = []
     self._valid_history = []
     self._rand_state = np.random.RandomState(seed=int(seed))
     self.verbose = int(verbose)
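A hypothetical instantiation of the estimator defined by this constructor; the argument values below are illustrative, not recommendations.

import numpy as np

model = LogisticRegression(nb_classes=10,
                           l1=0., l2=1e-4,
                           batch_size=256,
                           max_epoch=20,
                           optimizer='adadelta',
                           learning_rate=1.0,
                           seed=1234,
                           verbose=True)
X = np.random.rand(1000, 64).astype('float32')
y = np.random.randint(0, 10, size=(1000,))
# model.fit(X, y)  # fit() is defined elsewhere in this class
#                  # (see the training loop in the earlier example)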
Example No. 40
0
File: base.py Project: imito/odin
 def format_score(s):
   return ctext('%.4f' % s if is_number(s) else s, 'yellow')
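For example, format_score prints numbers with four decimals and passes strings through, both colored yellow:

print(format_score(0.87654))  # prints "0.8765" in yellow
print(format_score('N/A'))    # prints "N/A" in yellow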
Example No. 41
0
 def _initialize(self, X):
     # ====== check inputs dimensions ====== #
     if not hasattr(X, 'shape'):
         raise ValueError("`X` must have `shape` attribute.")
     feat_dim = np.prod(X.shape[1:])
     if self._feat_dim is None:
         self._feat_dim = feat_dim
     # validate input dimension
     if feat_dim != self._feat_dim:
         raise RuntimeError("Feature dimension mismatch %d and %d" %
                            (feat_dim, self.feat_dim))
     # check if the tensorflow ops are already initialized
     if hasattr(self, '_f_train'):
         return
     # ====== binary or multi-classes ====== #
     if self.nb_classes == 2:
         out_shape = (None, )
         fn_activation = tf.nn.sigmoid
         fn_loss = tf.losses.sigmoid_cross_entropy
         fn_acc = K.metrics.binary_accuracy
     else:
         out_shape = (None, self.nb_classes)
         fn_activation = tf.nn.softmax
         fn_loss = tf.losses.softmax_cross_entropy
         fn_acc = K.metrics.categorical_accuracy
     # ====== create model ====== #
     with tf.name_scope(self.name, 'logistic_regression'):
         # inputs
         self._X = K.placeholder(shape=(None, self.feat_dim),
                                 dtype=self.dtype,
                                 name='%s_input' % self.name)
         self._y = K.placeholder(shape=out_shape,
                                 dtype=self.dtype,
                                 name='%s_output' % self.name)
         # check the bias
         if is_number(self.fit_intercept):
             b_init = float(self.fit_intercept)
         elif self.fit_intercept is False or \
         self.fit_intercept is None:
             b_init = None
         else:
             b_init = self.fit_intercept
         # create the model and initialize
         with K.variable_dtype(dtype=self.dtype):
             self._model = N.Dense(
                 num_units=self.nb_classes,
                 W_init=init_ops.glorot_uniform_initializer(
                     seed=self._rand_state.randint()),
                 b_init=b_init,
                 activation=K.linear)
             y_logits = self._model(self._X)
         y_prob = fn_activation(y_logits)
         # applying class weights
         class_weights = tf.constant(value=self._class_weight,
                                     dtype=self.dtype,
                                     name="class_weights")
         weights = tf.gather(
             class_weights,
             tf.cast(self._y, 'int32')
             if self.nb_classes == 2 else tf.argmax(self._y, axis=-1))
         # optimizer
         params = [
             v for v in self._model.variables
             if has_roles(v, Weight) or has_roles(v, Bias)
         ]
         losses = fn_loss(self._y, y_logits, weights=weights)
         l1_norm = tf.norm(self._model.get('W'),
                           ord=1) if self.l1 > 0. else 0
         l2_norm = tf.norm(self._model.get('W'),
                           ord=2) if self.l2 > 0. else 0
         losses = losses + self.l1 * l1_norm + self.l2 * l2_norm
         acc = fn_acc(self._y, y_prob)
         updates = self._optimizer.get_updates(losses, params)
         # create function
         if self.confusion_matrix:
             cm = K.metrics.confusion_matrix(y_true=self._y,
                                             y_pred=y_prob,
                                             labels=self.nb_classes)
         metrics = [losses, acc, cm
                    ] if self.confusion_matrix else [losses, acc]
         self._f_train = K.function(inputs=(self._X, self._y),
                                    outputs=metrics,
                                    updates=updates,
                                    training=True)
         self._f_score = K.function(inputs=(self._X, self._y),
                                    outputs=metrics,
                                    training=False)
         self._f_pred_prob = K.function(inputs=self._X,
                                        outputs=y_prob,
                                        training=False)
         self._f_pred_logit = K.function(inputs=self._X,
                                         outputs=y_logits,
                                         training=False)
     return self
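The per-sample class weighting above relies on tf.gather over a weight vector indexed by the class of each target. A minimal TF1 sketch of just that step:

import numpy as np
import tensorflow as tf

class_weights = tf.constant(np.array([1.0, 2.5, 0.5], dtype='float32'))
y_onehot = tf.placeholder(shape=(None, 3), dtype='float32')
# one weight per row of y_onehot, usable as `weights=` in tf.losses.*_cross_entropy
sample_weights = tf.gather(class_weights, tf.argmax(y_onehot, axis=-1))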
Example No. 42
0
def standard_trainer(train_data,
                     valid_data,
                     X,
                     y_train,
                     y_score,
                     y_target,
                     parameters,
                     test_data=None,
                     cost_train=None,
                     cost_score=None,
                     optimizer=None,
                     confusion_matrix=False,
                     gradient_norm=True,
                     save_path=None,
                     save_obj=None,
                     batch_size=64,
                     nb_epoch=3,
                     valid_freq=0.6,
                     seed=1208,
                     shuffle_level=2,
                     patience=3,
                     earlystop=5,
                     report_path=None):
    """
    Parameters
    ----------
    cost_train: list of callable
        each function will be apply to a pair y_train and y_target

    Return
    ------
    MainLoop, and History

    Note
    ----

    """
    from odin import backend as K
    # ====== prepare variables and cost ====== #
    # check optimizer
    if optimizer is None:
        optimizer = K.optimizers.SGD(lr=0.0001, momentum=0.9, nesterov=True)
    elif not isinstance(optimizer, K.optimizers.Optimizer) and \
    not hasattr(optimizer, "get_updates"):
        raise ValueError(
            "Invalid optimizer, the optimizer must be instance of "
            "backend.optimizers.Optimizer or having function "
            "get_updates(self, loss_or_grads, params).")
    #  check the cost functions
    if cost_train is None:
        cost_train = K.categorical_crossentropy
    if cost_score is None:
        cost_score = K.categorical_crossentropy
    cost_train = as_tuple(cost_train)
    cost_score = as_tuple(cost_score)
    # check input X, y, parameters
    X = as_tuple(X)
    y_train = as_tuple(y_train)
    y_score = as_tuple(y_score)
    y_target = as_tuple(y_target)
    parameters = as_tuple(parameters)
    if len(X) == 0 or len(y_train) == 0 or len(y_score) == 0 or \
    len(y_target) == 0 or len(parameters) == 0:
        raise ValueError(
            "X(len=%d), y_train(len=%d), y_score(len=%d), y_target(len=%d),"
            "and parameters(len=%d) must be list or tuple with length > 0." %
            (len(X), len(y_train), len(y_score), len(y_target),
             len(parameters)))
    # get all cost
    if len(y_train) == 1:
        y_train = y_train * len(cost_train)
    if len(y_score) == 1:
        y_score = y_score * len(cost_score)
    cost_train = [
        K.mean(f_cost(y_, y), axis=0) for f_cost, y_, y in zip(
            cost_train, y_train,
            y_target * len(cost_train) if len(y_target) == 1 else y_target)
    ]
    cost_score = [
        K.mean(f_cost(y_, y), axis=0) for f_cost, y_, y in zip(
            cost_score, y_score,
            y_target * len(cost_score) if len(y_target) == 1 else y_target)
    ]
    # add confusion matrix
    if confusion_matrix:
        if not is_number(confusion_matrix) and \
        not isinstance(confusion_matrix, (tuple, list, np.ndarray)):
            raise ValueError(
                "confusion_matrix must be an integer, or list, tuple"
                " specifies number of classes, or list of all classes.")
        if is_number(confusion_matrix):
            confusion_matrix = list(range(int(confusion_matrix)))
        for y_, y in zip(y_score, y_target):
            cost_score.append(
                K.confusion_matrix(y_pred=y_,
                                   y_true=y,
                                   labels=confusion_matrix))
    # get the update
    updates = optimizer.get_updates(cost_train[0], parameters)
    # ====== create function ====== #
    grad_norm = [] if not gradient_norm or not hasattr(optimizer, 'norm') else \
        [optimizer.norm]
    cost_train = cost_train + grad_norm
    print('Building training functions ...')
    f_train = K.function(inputs=X + y_target,
                         outputs=cost_train,
                         updates=updates)
    print('Building scoring functions ...')
    f_score = K.function(inputs=X + y_target, outputs=cost_score)
    # ====== Create trainer ====== #
    task = MainLoop(batch_size=batch_size,
                    seed=seed,
                    shuffle_level=shuffle_level)
    if save_path is not None and save_obj is not None:
        task.set_save(save_path, save_obj, save_hist=True)
    # set task
    task.set_task(f_train, train_data, epoch=nb_epoch, name='train')
    task.set_subtask(f_score, valid_data, freq=valid_freq, name='valid')
    if test_data is not None:
        task.set_subtask(f_score, test_data, when=-1, epoch=1, name='test')
    # format for score
    score_format = 'Results:' + __format_string(
        len(cost_score) - (1 if confusion_matrix else 0))
    score_tracking = {
        (len(cost_score) - 1): lambda x: sum(x)
    } if confusion_matrix else {}
    # set the callback
    history = History()
    task.set_callback([
        ProgressMonitor(name='train',
                        format='Results:' + __format_string(len(cost_train))),
        ProgressMonitor(name='valid',
                        format=score_format,
                        tracking=score_tracking),
        (ProgressMonitor(
            name='test', format=score_format, tracking=score_tracking)
         if test_data is not None else None), history,
        EarlyStopGeneralizationLoss(
            'valid',
            threshold=earlystop,
            patience=patience,
            get_value=lambda x: np.mean([i[0] for i in x]
                                        if isinstance(x[0],
                                                      (tuple, list)) else x)),
        NaNDetector(('train', 'valid'), patience=patience, rollback=True)
    ])
    return task, history
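A hypothetical call of standard_trainer; the placeholders, the network and the data feeders below are illustrative stand-ins, and MainLoop's entry point is assumed rather than shown above.

import tensorflow as tf
from odin import backend as K, nnet as N  # import paths are assumptions

X = K.placeholder(shape=(None, 784), dtype='float32', name='X')
y = K.placeholder(shape=(None, 10), dtype='float32', name='y')
model = N.Dense(num_units=10, activation=K.linear)
y_prob = tf.nn.softmax(model(X))
task, history = standard_trainer(train_data=train_feeder,  # odin data feeders, not shown here
                                 valid_data=valid_feeder,
                                 X=X,
                                 y_train=y_prob, y_score=y_prob, y_target=y,
                                 parameters=model.variables,
                                 confusion_matrix=10,
                                 batch_size=128, nb_epoch=5)
# task.run()  # assuming MainLoop exposes a run()-style entry point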