Example #1
  def __init__(self, **kw):
    RawDictCnv.__init__(self,
                        toProtectedAttrs = {'_nSorts', '_nBoxes', '_nTrain', '_nValid',
                                            '_nTest', '_method', '_sort_boxes_list'}
                                           | kw.pop('toProtectedAttrs', set()),
                        **kw)
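The key idiom above is merging the class's own protected attributes with any
set the caller passes in, consuming the 'toProtectedAttrs' keyword before
**kw is forwarded to the base class. A standalone sketch of just that merge
(plain Python, no framework types assumed):

defaults = {'_nSorts', '_nBoxes'}
kw = {'toProtectedAttrs': {'_extra'}, 'other': 1}
merged = defaults | kw.pop('toProtectedAttrs', set())
print(sorted(merged))  # ['_extra', '_nBoxes', '_nSorts']
print(kw)              # {'other': 1} -- key consumed, safe to forward as **kw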
Example #2
from abc import abstractmethod
# LoggerStreamable, LoggerRawDictStreamer, RawDictCnv, PreProcChain and
# PrepObj are provided by the surrounding framework.

class Subset(LoggerStreamable):

  # The version only needs to change if a property is added
  _streamerObj = LoggerRawDictStreamer(toPublicAttrs = {'_ppChain'})
  _cnvObj      = RawDictCnv(toProtectedAttrs         = {'_ppChain'})

  def __init__(self, d=None, **kw):
    # Avoid the shared mutable-default pitfall; the caller's dict is still
    # mutated on purpose, since subclasses pop their own keys from it.
    if d is None: d = {}
    d.update( kw )
    self._ppChain    = d.pop('ppChain', PreProcChain(PrepObj()) )
    self._range      = d.pop('binRange'  , None)
    self._patternIdx = d.pop('pattern'   , 0)
    LoggerStreamable.__init__(self, d)

  def __call__(self, data):
    return self._apply(data)

  @abstractmethod
  def _apply(self, data):
    """
      Override this method to apply the pre-processing
    """
    return self._ppChain.takeParams(data)

  def isRevertible(self):
    # This pre-processing cannot be reverted
    return False

  def getBin(self):
    return self._range

  def setPatternIndex(self, idx):
    self._patternIdx = idx

  def checkPatternIndex(self, idx):
    return idx == self._patternIdx

  def getPatternIndex(self):
    return self._patternIdx
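Since _apply is abstract, Subset is only used through subclasses. A minimal
sketch of one (the class name is hypothetical, for illustration only); it
fits the stored pre-processing chain and returns the data unchanged:

class PassThroughSubset(Subset):

  def _apply(self, data):
    # Fit the chain on the data; no transformation is applied here.
    self._ppChain.takeParams(data)
    return data

Given the constructor above, an instance built with e.g. pattern=2 would
answer checkPatternIndex(2) with True and getBin() with the binRange value.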
Example #3
import gc
import numpy as np
# npCurrent, checkForUnusedVars and tensor_frobenius_argmin are provided by
# the surrounding framework.

class SomCluster( Subset ):

  # The version only needs to change if a property is added
  _streamerObj = LoggerRawDictStreamer(toPublicAttrs = {'_code_book','_w'})
  _cnvObj      = RawDictCnv(toProtectedAttrs         = {'_code_book','_w'})

  def __init__(self, d=None, **kw):
    """
      Cluster finder class base on three parameters:
        code_book: centroids of the cluster given by any algorithm (e.g: kmeans)
        w        : weights, this will multipli the size of the cluster depends of the factor
                   e.g: the cluster was found 100 events and the w factor is 2. In the end we
                   will duplicate the events into the cluster to 200.
        matrix   : projection apply on the centroids.
        p_cluster: cluster target for each neuron map
    """
    if d is None: d = {}
    d.update( kw ); del kw
    Subset.__init__(self, d)

    self._code_book = d.pop('code_book', [])
    self._p_cluster = d.pop('p_cluster', [])
    self._w         = d.pop('w'        , 1 )
    checkForUnusedVars(d, self._warning)
    del d
    # Some protections before starting
    if isinstance(self._code_book, list):
      self._code_book = npCurrent.array(self._code_book)
    # If the weight factor is an integer, transform it into an array of factors
    # with the same size as the centroids
    if isinstance(self._w, int):
      self._w = npCurrent.int_array([self._w for _ in range(self._code_book.shape[0])])
    # Transform to np.array if needed
    if isinstance(self._w, list):
      self._w = npCurrent.int_array(self._w)
    # If a list of weights was passed, weights and centroids must have the same length
    if self._w.shape[0] != self._code_book.shape[0]:
      raise ValueError("Weight factor must be an int, list or np.array with the same size as the code_book param")
  #__init__ end


  def __call__(self, data):
    return self._apply(data)
  
  def _apply(self, data):
    """
    Apply the pre-processing chain, assign each observation to its
    best-matching cluster and return the clusters resized by their
    weight factors.
    """
    # Fit the pre-processing chain and keep the unprocessed data
    self._ppChain.takeParams(data)
    tdata = self._ppChain(data)

    # n = number of observations
    # d = number of features
    if np.ndim(tdata) != np.ndim(self._code_book):
      raise ValueError("Observation and code_book should have the same rank")
    if np.ndim(tdata) != 1:
      (n, d) = tdata.shape
      # code book and observations should have the same number of features
      if d != self._code_book.shape[1]:
        raise ValueError("Code book(%d) and obs(%d) should have the same "
                         "number of features (eg columns)" %
                         (self._code_book.shape[1], d))

    bmus = tensor_frobenius_argmin(tdata, self._code_book, 10000, self._logger)
    # Convert the MATLAB 1-based targets to 0-based indexing; use a local copy
    # so that repeated calls do not shift the stored targets again.
    p_cluster = np.asarray(self._p_cluster) - 1
    code = p_cluster[bmus]

    # Release memory
    del tdata
    gc.collect()
    # Join all clusters into a list of clusters
    cpattern = []
    for target in range(p_cluster.max()+1):
      cpattern.append(data[np.where(code == target)[0], :])
    # Resize the clusters
    for i, c in enumerate(cpattern):
      cpattern[i] = np.repeat(c, self._w[i], axis=0)
      self._info('Cluster %d and factor %d with %d events and %d features',
                 i, self._w[i], cpattern[i].shape[0], cpattern[i].shape[1])
    return cpattern
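Both the integer-weight broadcast in __init__ and the resizing loop in
_apply reduce to plain NumPy operations. A self-contained sketch of the two
steps (NumPy only, no framework types):

import numpy as np

code_book = np.zeros((3, 2))             # 3 centroids, 2 features
w = 2                                    # an integer factor ...
w = np.array([w] * code_book.shape[0])   # ... becomes one entry per centroid

cluster = np.arange(8).reshape(4, 2)     # a cluster of 4 events
resized = np.repeat(cluster, w[0], axis=0)
print(resized.shape)                     # (8, 2): each event duplicated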
Example #4
import gc
import numpy as np
# npCurrent is provided by the surrounding framework.

class GMMCluster( Cluster ):
  # The version only needs to change if a property is added
  _streamerObj = LoggerRawDictStreamer(toPublicAttrs = {'_sigma'})
  _cnvObj      = RawDictCnv(toProtectedAttrs         = {'_sigma'})

  def __init__(self, d=None, **kw):
    """
      Cluster finder class base on three parameters:
        code_book: centroids of the cluster given by any algorithm (e.g: kmeans)
        w        : weights, this will multipli the size of the cluster depends of the factor
                   e.g: the cluster was found 100 events and the w factor is 2. In the end we
                   will duplicate the events into the cluster to 200.
        matrix   : projection apply on the centroids.
        sigma    : variance param of the gaussian, this algorithm will calculate the likelihood 
                   value using: lh[i] = np.exp(np.power((data-centroid[i])/sigma[i],2))
    """
    if d is None: d = {}
    d.update( kw ); del kw
    self._sigma = d.pop('sigma', npCurrent.array([]))
    Cluster.__init__(self, d)
    del d

    # Checking the sigma type
    if isinstance(self._sigma, list):
      self._sigma = npCurrent.array(self._sigma)
    if self._sigma.shape != self._code_book.shape:
      raise ValueError("Code book and sigma matrix should have the same shape")
    #__init__ end


  def _apply(self, data):
    """
    Apply the pre-processing chain, assign each observation to the cluster
    with the highest likelihood and return the clusters resized by their
    weight factors.
    """
    # Fit the pre-processing chain and keep the unprocessed data
    self._ppChain.takeParams(data)
    tdata = self._ppChain(data)
    # n = number of observations
    # d = number of features
    if np.ndim(tdata) != np.ndim(self._code_book):
      raise ValueError("Observation and code_book should have the same rank")
    if np.ndim(tdata) != 1:
      (n, d) = tdata.shape
      # code book and observations should have the same number of features
      if d != self._code_book.shape[1]:
        raise ValueError("Code book(%d) and obs(%d) should have the same "
                         "number of features (eg columns)" %
                         (self._code_book.shape[1], d))
    # Likelihood computation via broadcasting:
    # tdata     is n x d
    # code_book is m x d, where m is the number of clusters
    # sigma     is m x d
    # see: http://scipy.github.io/old-wiki/pages/EricsBroadcastingDoc
    code = np.argmax(np.sum(np.exp(np.power(tdata[:, np.newaxis] - self._code_book, 2)
                                   / self._sigma[np.newaxis, :]), axis=-1), axis=1)

    del tdata
    gc.collect()
    
    # Join all clusters into a list of clusters
    cpattern = []
    for target in range(self._code_book.shape[0]):
      cpattern.append(data[np.where(code == target)[0], :])

    # Resize the clusters
    for i, c in enumerate(cpattern):
      cpattern[i] = np.repeat(c, self._w[i], axis=0)
      self._info('Cluster %d and factor %d with %d events and %d features',
                 i, self._w[i], cpattern[i].shape[0], cpattern[i].shape[1])
    return cpattern
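The argmax expression relies on NumPy broadcasting: (n, 1, d) minus (m, d)
yields (n, m, d), the sum over the last axis leaves an (n, m) likelihood
table, and the argmax over axis 1 picks one cluster per event. A shape-only
check of that chain (NumPy only):

import numpy as np

n, m, d = 5, 3, 2
tdata     = np.random.rand(n, d)
code_book = np.random.rand(m, d)
sigma     = np.ones((m, d))
lh = np.sum(np.exp(np.power(tdata[:, np.newaxis] - code_book, 2)
                   / sigma[np.newaxis, :]), axis=-1)
print(lh.shape)                  # (n, m)
print(np.argmax(lh, axis=1))     # one cluster index per event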