def get_data(**kwargs):
    ''' Returns data from audio tracks '''
    if os.path.exists(DATAFILE_MAT):
        Data = GroupXData.LoadFromFile(DATAFILE_MAT)
    else:
        obs = []
        doc_range = [0]
        count = 0
        with h5py.File('../tracks.h5', 'r') as tracks:
            for track, grp in ProgressBar(tracks.items()):
                if 'gfccs' not in grp:
                    continue
                data = grp['gfccs']
                count += data.shape[0]
                doc_range.append(count)
                # read the full dataset; h5py's deprecated .value is replaced
                # by slicing with [...]
                obs.append(data[...].astype(np.float64))
        X = np.vstack(obs)
        Data = GroupXData(X=X, doc_range=doc_range)
        Data.save_to_mat(DATAFILE_MAT)
    Data.name = 'AudioCorpus'
    Data.summary = 'Audio Corpus. obs=10.5M docs=559'
    return Data
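
# Hedged usage sketch for the audio-corpus loader above (assumes bnpy is
# installed and that DATAFILE_MAT and '../tracks.h5' exist as the function
# expects; the first call builds the .mat cache, later calls just load it):
Data = get_data()
# rows doc_range[d]:doc_range[d+1] of Data.X hold the GFCC frames of track d
first_track = Data.X[Data.doc_range[0]:Data.doc_range[1]]
print(Data.name, first_track.shape)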
def get_data(seed=123456, nDocTotal=32, T=1000, **kwargs):
    ''' Generate several data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for the random number generator,
        used for actually *generating* the data
    nDocTotal : total number of sequences to generate
    T : number of observations in each sequence

    Returns
    -------
    Data : bnpy GroupXData object, with nObsTotal observations
    '''
    fullX, fullZ, doc_range = get_X(seed, nDocTotal, T)
    X = np.vstack(fullX)
    Z = np.asarray(fullZ)
    nUsedStates = len(np.unique(Z))
    if nUsedStates < K:
        print('WARNING: NOT ALL TRUE STATES USED IN GENERATED DATA')
    Data = GroupXData(X=X, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
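
# Hedged sketch of fitting a model to this toy data via bnpy's top-level API.
# The allocation/observation model names, algorithm, and keyword values are
# illustrative choices, not prescribed by this module:
import bnpy

Data = get_data(seed=123456, nDocTotal=8, T=500)
hmodel, RInfo = bnpy.run(
    Data, 'FiniteHMM', 'Gauss', 'memoVB',
    K=10, nLap=25, initname='randexamples')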
def get_data(nDocTotal=200, nObsPerDoc=300, nLetterPerDoc=3,
             seed=0, dstart=0, **kwargs):
    ''' Generate data as GroupXData object

    Guarantees that each letter is used at least once every 26 docs.
    '''
    nLetters = 26
    PRNG = np.random.RandomState(seed)
    # Letters decay in probability from A to Z
    LetterProbs = np.ones(nLetters)
    for i in range(1, nLetters):
        LetterProbs[i] = 0.95 * LetterProbs[i - 1]
    LetterProbs /= LetterProbs.sum()

    X = np.zeros((nDocTotal * nObsPerDoc, 64))
    TrueZ = np.zeros(nDocTotal * nObsPerDoc)
    doc_range = np.zeros(nDocTotal + 1, dtype=np.int32)
    for d in range(nDocTotal):
        start_d = d * nObsPerDoc
        doc_range[d] = start_d
        doc_range[d + 1] = start_d + nObsPerDoc

        # Select subset of letters to appear in current document,
        # forcing the designated letter in if it wasn't drawn
        mustIncludeLetter = (dstart + d) % nLetters
        chosenLetters = PRNG.choice(
            nLetters, size=nLetterPerDoc, p=LetterProbs, replace=False)
        if mustIncludeLetter not in chosenLetters:
            chosenLetters[-1] = mustIncludeLetter

        lProbs_d = LetterProbs[chosenLetters] / LetterProbs[chosenLetters].sum()
        nObsPerChoice = PRNG.multinomial(nObsPerDoc, lProbs_d)
        assert nObsPerChoice.sum() == nObsPerDoc

        start = start_d
        for i in range(nLetterPerDoc):
            TrueZ[start:(start + nObsPerChoice[i])] = chosenLetters[i]
            Lcovmat = letter2covmat(chr(CHRSTART + chosenLetters[i]))
            X[start:(start + nObsPerChoice[i])] = PRNG.multivariate_normal(
                np.zeros(64), Lcovmat, size=nObsPerChoice[i])
            start += nObsPerChoice[i]

    for i in range(nLetters):
        print(chr(CHRSTART + i), np.sum(TrueZ == i))
    return GroupXData(X=X, TrueZ=TrueZ, doc_range=doc_range)
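
# Hedged usage sketch (letter2covmat and CHRSTART are module-level names this
# generator relies on; here we only exercise the segment bookkeeping):
Data = get_data(nDocTotal=26, nObsPerDoc=60, seed=0)
assert Data.doc_range[-1] == Data.X.shape[0]
# one full 26-doc cycle guarantees every letter appears at least once
assert np.unique(Data.TrueZ).size == 26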
def get_data(seed=8675309, nDocTotal=52, T=800, **kwargs):
    ''' Generate toy data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for the random number generator,
        used for actually *generating* the data
    nDocTotal : total number of sequences to generate
    T : number of observations per sequence

    Returns
    -------
    Data : bnpy GroupXData object
    '''
    X, Xprev, TrueZ, doc_range = genToyData(
        seed=seed, nDocTotal=nDocTotal, T=T)
    Data = GroupXData(X=X, TrueZ=TrueZ, Xprev=Xprev, doc_range=doc_range)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
def MakeGroupData(seed, nDoc, nObsPerDoc):
    ''' Make a GroupXData object '''
    PRNG = np.random.RandomState(seed)
    Pi = PRNG.dirichlet(gamma * np.ones(K), size=nDoc)
    XList = list()
    ZList = list()
    for d in range(nDoc):
        Npercomp = PRNG.multinomial(nObsPerDoc, Pi[d])
        for k in range(K):
            if Npercomp[k] < 1:
                continue
            Xcur_k = _sample_data_from_comp(k, Npercomp[k], PRNG)
            XList.append(Xcur_k)
            ZList.append(k * np.ones(Npercomp[k]))
    doc_range = np.arange(0, nDoc * nObsPerDoc + 1, nObsPerDoc)
    X = np.vstack(XList)
    TrueZ = np.hstack(ZList)
    return GroupXData(X, doc_range, TrueZ=TrueZ)
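
# Hedged usage sketch illustrating the doc_range convention used throughout:
# doc_range has nDoc + 1 entries, and rows doc_range[d]:doc_range[d+1] of X
# belong to document d. (gamma, K, _sample_data_from_comp are module names.)
Data = MakeGroupData(seed=0, nDoc=4, nObsPerDoc=50)
X_doc2 = Data.X[Data.doc_range[2]:Data.doc_range[3]]  # the 50 rows of doc 2
assert X_doc2.shape[0] == 50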
def get_data(seed=86758, seqLens=(3000, 3000, 3000, 3000, 500), **kwargs):
    ''' Generate several data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for the random number generator,
        used for actually *generating* the data
    seqLens : number of observations in each sequence

    Returns
    -------
    Data : bnpy GroupXData object
    '''
    fullX, fullZ, seqIndices = get_X(seed, seqLens)
    X = np.vstack(fullX)
    Z = np.asarray(fullZ)
    doc_range = np.asarray(seqIndices)
    Data = GroupXData(X=X, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
def get_data(seed=DEFAULT_SEED, T=DEFAULT_LEN, **kwargs):
    ''' Generate toy data sequences, returned as a bnpy data-object

    Args
    -------
    seed : integer seed for the random number generator,
        used for actually *generating* the data
    T : int
        number of observations in each sequence

    Returns
    -------
    Data : bnpy GroupXData object, with nObsTotal observations
    '''
    X, Xprev, Z, doc_range = get_X(seed, T)
    nUsedStates = len(np.unique(Z))
    if nUsedStates < K:
        print('WARNING: NOT ALL TRUE STATES USED IN GENERATED DATA')
    Data = GroupXData(X=X, Xprev=Xprev, doc_range=doc_range, TrueZ=Z)
    Data.name = get_short_name()
    Data.summary = get_data_info()
    return Data
def train_image_specific_topics(self, y, sigma, Niter=50, Kfresh=100,
                                pixelMask=None):
    print('Training %d image-specific clusters...' % Kfresh)
    D, patchSize, GP = self.D, int(np.sqrt(self.D)), self.GP

    # gather fully observable patches
    if pixelMask is None:  # gray-scale image denoising
        v = im2col(y, patchSize)
    else:  # color image inpainting
        C = 3
        patchMask = np.logical_not(
            np.any(im2col(pixelMask, patchSize), axis=0))
        v = np.hstack(tuple([
            im2col(y[:, :, c], patchSize)[:, patchMask] for c in range(C)]))
    v -= np.mean(v, axis=0)
    v = v.T
    testData = GroupXData(X=v, doc_range=[0, len(v)], nDocTotal=1)
    testData.name = 'test_image_patches'

    # set up hyper-parameters and run Bregman k-means
    cached_B_name = 'models/HDP/B.mat'
    xBar = loadmat(cached_B_name)['Cov']
    xBar2 = loadmat(cached_B_name)['Cov2']
    tmp0 = (np.diag(xBar) + sigma**2)**2
    tmp1 = np.diag(xBar2) + 6 * np.diag(xBar) * sigma**2 + 3 * sigma**4
    nu = D + 3 + 2 * np.sum(tmp0) / np.sum(tmp1 - tmp0)
    B = (nu - D - 1) * (xBar + sigma**2 * np.eye(D))
    obsModel = ZeroMeanGaussObsModel(
        D=D, min_covar=1e-8, inferType='memoVB', B=B, nu=nu)
    Z, Mu, Lscores = runKMeans_BregmanDiv(
        testData.X, Kfresh, obsModel, Niter=Niter, assert_monotonic=False)
    Korig = self.K
    Kall = np.max(Z) + Korig + 1
    Kfresh = Kall - Korig
    Z += Korig

    # load SuffStats of training images
    trainSS = loadSuffStatBag('models/HDP/SS.dump')
    trainSS.insertEmptyComps(Kfresh)

    # construct SuffStats of the test image
    DocTopicCount = np.bincount(Z, minlength=int(Kall)).reshape((1, Kall))
    DocTopicCount = np.array(DocTopicCount, dtype=np.float64)
    resp = np.zeros((len(Z), Kall))
    resp[np.arange(len(Z)), Z] = 1.0
    testLP = dict(resp=resp, DocTopicCount=DocTopicCount)
    alphaPi0 = np.hstack(
        (GP.alphaPi0, GP.alphaPi0Rem / (Kfresh + 1) * np.ones(Kfresh)))
    alphaPi0Rem = GP.alphaPi0Rem / (Kfresh + 1)
    testLP = updateLPGivenDocTopicCount(
        testLP, DocTopicCount, alphaPi0, alphaPi0Rem)
    testSS = self.patchModel.get_global_suff_stats(
        testData, testLP, doPrecompEntropy=1, doTrackTruncationGrowth=1)
    xxT = np.zeros((Kall, D, D))
    for k in range(Korig, Kall):
        idx = Z == k
        tmp = np.einsum('nd,ne->de', v[idx], v[idx])
        tmp -= testSS.N[k] * sigma**2 * np.eye(D)
        # project onto the PSD cone by clipping tiny/negative eigenvalues
        val, vec = np.linalg.eig(tmp)
        val[val < EPS] = EPS
        xxT[k] = np.dot(vec, np.dot(np.diag(val), vec.T))
    testSS.setField('xxT', xxT, dims=('K', 'D', 'D'))
    testSS.setUIDs(trainSS.uids)

    # combine training and test SS; update model parameters
    combinedSS = trainSS + testSS
    self.patchModel.update_global_params(combinedSS)
    self.calcGlobalParams()
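
# Hedged usage sketch (`denoiser` is a hypothetical instance of the class this
# method belongs to; it assumes the cached models/HDP/B.mat and
# models/HDP/SS.dump files exist as the method expects):
#
#     noisy = imread('noisy.png').astype(np.float64)   # H x W grayscale
#     denoiser.train_image_specific_topics(noisy, sigma=25.0 / 255.0)
#
# After this call the patch model's global parameters include up to Kfresh
# extra zero-mean Gaussian components fit to this image's own patches.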
    # (the `if` branch that builds dict_obj_to_save precedes this excerpt)
    with open(pickle_path, "wb") as f:
        pickle.dump(dict_obj_to_save, f)
else:
    with open(pickle_path, "rb") as f:  # pickle.load needs binary mode, not "r"
        dict_obj_to_save = pickle.load(f)

list_of_empty_arrays = dict_obj_to_save['0']
list_of_action_indices = dict_obj_to_save['1']
file_names_list = dict_obj_to_save['2']
list_of_full_data = dict_obj_to_save['3']
x = dict_obj_to_save['4']
x_prev = dict_obj_to_save['5']
z = dict_obj_to_save['6']
doc_range = dict_obj_to_save['7']

# for movo these params!
dataset = GroupXData(X=x, doc_range=doc_range, Xprev=x_prev)  # , TrueZ=z

list_of_old_skill_indices = []
list_of_new_skill_indices = []
list_of_skills = []
list_of_next_skills = []
filter_length = 50
state_window = 5
for trajectory_of_interest in range(doc_range.shape[0] - 1):
    # trajectory_of_interest = 13
    print("-----------------" + str(trajectory_of_interest) + "----------------")
# print("right") # x_eoc, x_prev_eoc, z_eoc, doc_range_eoc = read_data(path_right, z_value=0) # print("straight") # x_straight, x_prev_straight , z_straight , doc_range_straight = read_data(path_straight,doc_range=doc_range_eoc[-1],z_value=10) # print("left") # x_left, x_prev_left, z_left , doc_range_left = read_data(path_left,doc_range=doc_range_straight[-1],z_value=55) # x = np.vstack((x_eoc,x_straight,x_left)) # x_prev = np.vstack((x_prev_eoc,x_prev_straight, x_prev_left)) # z = np.hstack((z_eoc, z_straight, z_left)) # doc_range = np.hstack((doc_range_eoc[:-1], doc_range_straight[:-1], doc_range_left)) print("total trajectories: ", doc_range.shape) dataset = GroupXData(X=x[:, -5:-2], doc_range=doc_range, Xprev=x_prev[:, -5:-2]) #, TrueZ=z output_path_starter = '/media/ng/7ccf8f98-7ab8-498b-b405-54df784c3191/ng/workspace/bayesian_changepoint_detection/outputs/' ############################################################################### # # Setup: Initialization hyperparameters # ------------------------------------- init_kwargs = dict( K=20, initname='randexamples', ) alg_kwargs = dict(
def generateDataset(**kwargs):
    for key in Defaults:
        if key not in kwargs:
            kwargs[key] = Defaults[key]
    phi = makePhi(**kwargs)
    transPi = makePi(**kwargs)
    PRNG = np.random.RandomState(kwargs['seed'])
    nSeq = kwargs['nDocTotal']
    T_in = kwargs['T']
    # T may be a single length or a comma-separated list of per-sequence lengths
    if isinstance(T_in, str):
        Tvals = [int(T) for T in T_in.split(',')]
    else:
        Tvals = [T_in]
    if len(Tvals) == 1:
        seqLens = Tvals[0] * np.ones(nSeq, dtype=np.int32)
    elif len(Tvals) < nSeq:
        seqLens = np.tile(Tvals, nSeq)[:nSeq]
    else:
        seqLens = np.asarray(Tvals, dtype=np.int32)[:nSeq]
    doc_range = np.hstack([0, np.cumsum(seqLens)])
    N = doc_range[-1]

    allX = np.zeros((N, D))
    allZ = np.zeros(N, dtype=np.int32)
    startStates = [bgStateID, fgStateID]
    states0toKm1 = np.arange(K)
    # Each iteration generates one time-series/sequence, with the starting
    # state deterministically alternating between background and foreground
    for i in range(nSeq):
        start = doc_range[i]
        stop = doc_range[i + 1]
        T = stop - start
        Z = np.zeros(T, dtype=np.int32)
        X = np.zeros((T, D))
        nConsec = 0
        Z[0] = startStates[i % len(startStates)]
        X[0] = PRNG.rand(D) < phi[Z[0]]
        for t in range(1, T):
            if nConsec > kwargs['maxTConsec']:
                # Force a transition if we've stayed in one state too long
                transPi_t = transPi[Z[t - 1]].copy()
                transPi_t[Z[t - 1]] = 0
                transPi_t /= transPi_t.sum()
            else:
                transPi_t = transPi[Z[t - 1]]
            Z[t] = PRNG.choice(states0toKm1, p=transPi_t)
            X[t] = PRNG.rand(D) < phi[Z[t]]
            if Z[t] == Z[t - 1]:
                nConsec += 1
            else:
                nConsec = 0
        allZ[start:stop] = Z
        allX[start:stop] = X

    TrueParams = dict()
    TrueParams['beta'] = np.mean(transPi, axis=0)
    TrueParams['phi'] = phi
    TrueParams['Z'] = allZ
    TrueParams['K'] = K
    return GroupXData(allX, doc_range=doc_range, TrueParams=TrueParams)
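
# Hedged usage sketch (Defaults, D, K, bgStateID, fgStateID, makePhi, and
# makePi are module-level names generateDataset relies on; keyword values
# here are illustrative and assume Defaults covers any keys not passed):
Data = generateDataset(seed=0, nDocTotal=4, T='100,200,100,200',
                       maxTConsec=20)
assert Data.doc_range[-1] == 100 + 200 + 100 + 200
print(Data.TrueParams['K'], 'true states')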