Пример #1
0
def get_data(seed=123456, nDocTotal=32, T=1000, **kwargs):
    ''' Generate several data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for random number generator,
          used for actually *generating* the data
    nDocTotal : forwarded to get_X; presumably the number of sequences
          (documents) to generate -- TODO confirm against get_X
    T : forwarded to get_X; presumably the number of observations in
          each sequence -- TODO confirm against get_X
    **kwargs : accepted so callers may pass extra options; ignored here

    Returns
    -------
    Data : bnpy GroupXData object built from the stacked observations,
          the doc_range returned by get_X, and the true labels (TrueZ)
    '''
    fullX, fullZ, doc_range = get_X(seed, nDocTotal, T)
    X = np.vstack(fullX)
    Z = np.asarray(fullZ)

    # Warn when the sampled labels cover fewer than the K states that
    # the module-level constant K says should exist.
    nUsedStates = len(np.unique(Z))
    if nUsedStates < K:
        # Call form with a single argument prints identically under
        # Python 2 and Python 3 (the statement form fails on Python 3).
        print('WARNING: NOT ALL TRUE STATES USED IN GENERATED DATA')

    Data = GroupXData(X=X, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
Пример #2
0
def get_data(**kwargs):
    ''' Returns data from audio tracks

    If DATAFILE_MAT exists, the dataset is loaded from it. Otherwise the
    'gfccs' dataset of every group in '../tracks.h5' is read, the
    per-track arrays are stacked row-wise into one matrix, and the
    result is saved to DATAFILE_MAT for later calls.

    Args
    -------
    **kwargs : accepted so callers may pass extra options; ignored here

    Returns
    -------
    Data : bnpy GroupXData object with one document per track that has
          a 'gfccs' dataset
    '''

    if os.path.exists(DATAFILE_MAT):
        Data = GroupXData.LoadFromFile(DATAFILE_MAT)
    else:
        obs = []
        # doc_range[i]:doc_range[i + 1] are the rows of track i in X.
        doc_range = [0]
        count = 0
        with h5py.File('../tracks.h5', 'r') as tracks:
            for _, grp in ProgressBar(tracks.items()):
                # Tracks without the 'gfccs' dataset contribute nothing.
                if 'gfccs' not in grp:
                    continue
                data = grp['gfccs']
                count += data.shape[0]
                doc_range.append(count)
                # data[()] reads the whole dataset into memory. It is
                # what the Dataset.value property did, but .value was
                # removed in h5py 3.0 while [()] works on all versions.
                obs.append(data[()].astype(np.float64))
        X = np.vstack(obs)
        Data = GroupXData(X=X, doc_range=doc_range)
        Data.save_to_mat(DATAFILE_MAT)
    Data.name = 'AudioCorpus'
    # NOTE(review): the counts below are hard-coded and are not derived
    # from the data that was actually loaded.
    Data.summary = 'Audio Corpus. obs=10.5M docs=559'

    return Data
Пример #3
0
def get_data(seed=8675309, nDocTotal=52, T=800, **kwargs):
    ''' Generate toy sequences and wrap them in a bnpy data-object

      Args
      -------
      seed : integer seed for random number generator,
              forwarded to genToyData
      nDocTotal : forwarded to genToyData; presumably the number of
              sequences to generate -- TODO confirm against genToyData
      T : forwarded to genToyData; presumably the length of each
              sequence -- TODO confirm against genToyData
      **kwargs : accepted and ignored

      Returns
      -------
        Data : bnpy GroupXData object carrying X, Xprev, TrueZ and
              doc_range exactly as produced by genToyData
    '''
    generated = genToyData(seed=seed, nDocTotal=nDocTotal, T=T)
    X, Xprev, TrueZ, doc_range = generated

    dataset = GroupXData(
        X=X,
        TrueZ=TrueZ,
        Xprev=Xprev,
        doc_range=doc_range)
    # Descriptive metadata comes from module-level helpers.
    dataset.name = get_short_name()
    dataset.summary = get_data_info()
    return dataset
Пример #4
0
def get_data(seed=86758, seqLens=((3000, 3000, 3000, 3000, 500)), **kwargs):
    ''' Generate several data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for random number generator,
          forwarded to get_X
    seqLens : tuple forwarded to get_X; presumably the number of
          observations in each sequence -- TODO confirm against get_X
    **kwargs : accepted and ignored

    Returns
    -------
    Data : bnpy GroupXData object built from the stacked sequences,
          their boundary indices (doc_range) and true labels (TrueZ)
    '''
    sequences, labels, boundaries = get_X(seed, seqLens)

    # Stack the per-sequence arrays row-wise and convert the label and
    # boundary lists to arrays before handing them to GroupXData.
    dataset = GroupXData(
        X=np.vstack(sequences),
        TrueZ=np.asarray(labels),
        doc_range=np.asarray(boundaries))
    dataset.name = get_short_name()
    dataset.summary = get_data_info()
    return dataset
Пример #5
0
def get_data(seed=DEFAULT_SEED, T=DEFAULT_LEN, **kwargs):
    ''' Generate toy data sequences, returned as a bnpy data-object

      Args
      -------
      seed : integer seed for random number generator,
              used for actually *generating* the data
      T : int number of observations in each sequence
      **kwargs : accepted so callers may pass extra options; ignored here

      Returns
      -------
      Data : bnpy GroupXData object built from the X, Xprev, doc_range
              and true labels (TrueZ) returned by get_X
    '''
    X, Xprev, Z, doc_range = get_X(seed, T)

    # Warn when the sampled labels cover fewer than the K states that
    # the module-level constant K says should exist.
    nUsedStates = len(np.unique(Z))
    if nUsedStates < K:
        # Call form with a single argument prints identically under
        # Python 2 and Python 3 (the statement form fails on Python 3).
        print('WARNING: NOT ALL TRUE STATES USED IN GENERATED DATA')

    Data = GroupXData(X=X, Xprev=Xprev, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
Пример #6
0
 def train_image_specific_topics(self,
                                 y,
                                 sigma,
                                 Niter=50,
                                 Kfresh=100,
                                 pixelMask=None):
     ''' Add image-specific clusters to the patch model for one image.

     Patches of the image are clustered with Bregman k-means, the new
     clusters are numbered after the self.K existing ones, and the patch
     model is refit on the training sufficient statistics (loaded from
     'models/HDP/SS.dump') combined with those of this image.

     Args
     -------
     y : image array. Passed to im2col as-is when pixelMask is None;
         otherwise indexed as y[:, :, c] for c in 0, 1, 2.
     sigma : scalar. sigma**2 is added to the prior covariance and
         subtracted from the per-cluster scatter, so it is presumably
         the noise standard deviation -- TODO confirm against callers.
     Niter : iteration count passed to runKMeans_BregmanDiv.
     Kfresh : number of clusters requested from k-means. The number
         actually added is np.max(Z) + 1 over the returned labels.
     pixelMask : optional mask. When given, any patch whose
         im2col(pixelMask) column has a truthy entry is discarded
         (presumably truthy marks missing pixels -- TODO confirm).

     Returns
     -------
     None. self.patchModel is updated through update_global_params and
     self.calcGlobalParams() is called afterwards.
     '''
     print('Training %d image-specific clusters...' % Kfresh)
     D, patchSize, GP = self.D, int(np.sqrt(self.D)), self.GP
     # gather fully observable patches
     if pixelMask is None:  # gray-scale image denoising
         v = im2col(y, patchSize)
     else:  # color image inpainting
         C = 3
         patchMask = np.logical_not(
             np.any(im2col(pixelMask, patchSize), axis=0))
         v = np.hstack([
             im2col(y[:, :, c], patchSize)[:, patchMask]
             for c in range(C)
         ])
     # remove the mean of every patch (patches are columns here), then
     # switch to one patch per row
     v -= np.mean(v, axis=0)
     v = v.T
     testData = GroupXData(X=v, doc_range=[0, len(v)], nDocTotal=1)
     testData.name = 'test_image_patches'
     # set up hyper-parameters and run Bregman k-means
     cached_B_name = 'models/HDP/B.mat'
     cachedB = loadmat(cached_B_name)  # read the file once for both fields
     xBar = cachedB['Cov']
     xBar2 = cachedB['Cov2']
     tmp0 = (np.diag(xBar) + sigma**2)**2
     tmp1 = np.diag(xBar2) + 6 * np.diag(xBar) * sigma**2 + 3 * sigma**4
     nu = D + 3 + 2 * np.sum(tmp0) / np.sum(tmp1 - tmp0)
     B = (nu - D - 1) * (xBar + sigma**2 * np.eye(D))
     obsModel = ZeroMeanGaussObsModel(D=D,
                                      min_covar=1e-8,
                                      inferType='memoVB',
                                      B=B,
                                      nu=nu)
     Z, Mu, Lscores = runKMeans_BregmanDiv(testData.X,
                                           Kfresh,
                                           obsModel,
                                           Niter=Niter,
                                           assert_monotonic=False)
     # shift the new labels so they follow the Korig existing clusters
     Korig = self.K
     Kall = np.max(Z) + Korig + 1
     Kfresh = Kall - Korig
     Z += Korig
     # load SuffStats of training images
     trainSS = loadSuffStatBag('models/HDP/SS.dump')
     trainSS.insertEmptyComps(Kfresh)
     # construct SuffStats of the test image
     DocTopicCount = np.bincount(Z, minlength=int(Kall)).reshape((1, Kall))
     DocTopicCount = np.array(DocTopicCount, dtype=np.float64)
     # hard (one-hot) assignment of every patch to its k-means cluster
     resp = np.zeros((len(Z), Kall))
     resp[np.arange(len(Z)), Z] = 1.0
     testLP = dict(resp=resp, DocTopicCount=DocTopicCount)
     alphaPi0 = np.hstack(
         (GP.alphaPi0, GP.alphaPi0Rem / (Kfresh + 1) * np.ones(Kfresh)))
     alphaPi0Rem = GP.alphaPi0Rem / (Kfresh + 1)
     testLP = updateLPGivenDocTopicCount(testLP, DocTopicCount, alphaPi0,
                                         alphaPi0Rem)
     testSS = self.patchModel.get_global_suff_stats(
         testData, testLP, doPrecompEntropy=1, doTrackTruncationGrowth=1)
     xxT = np.zeros((Kall, D, D))
     for k in range(Korig, Kall):
         idx = Z == k
         # scatter of the cluster's patches minus the sigma**2 term
         tmp = np.einsum('nd,ne->de', v[idx], v[idx])
         tmp -= testSS.N[k] * sigma**2 * np.eye(D)
         # tmp is symmetric, so use eigh: it returns real eigenvalues
         # and orthonormal eigenvectors, which the V diag(w) V^T
         # reconstruction below relies on. General eig guarantees
         # neither when eigenvalues repeat.
         val, vec = np.linalg.eigh(tmp)
         # clip the spectrum from below at the module-level EPS
         val[val < EPS] = EPS
         # vec * val scales column j by val[j], i.e. vec.dot(diag(val))
         xxT[k] = np.dot(vec * val, vec.T)
     testSS.setField('xxT', xxT, dims=('K', 'D', 'D'))
     testSS.setUIDs(trainSS.uids)
     # combine training and test SS; update model parameters
     combinedSS = trainSS + testSS
     self.patchModel.update_global_params(combinedSS)
     self.calcGlobalParams()