def single_run_repeatable_and_monotonic(
        self, aArg, oArg, algName, iArg, nBatch=1):
    """ Test a single call to bnpy.run, verifying repeatability and monotonicity.
    """
    self.pprintSingleRun(aArg, oArg, algName, iArg, nBatch)
    kwargs = self.makeAllKwArgs(aArg, oArg, algName, iArg, nBatch)
    model1, Info1 = bnpy.run(
        self.Data, arg2name(aArg), arg2name(oArg), algName, **kwargs)
    self.pprintResult(model1, Info1)

    loss_history = Info1['loss_history']
    if algName.count('moVB'):
        # For memoized VB, only laps >= 1.0 are expected to be monotonic.
        loss_history = loss_history[Info1['lap_history'] >= 1.0]
    isMonotonic = self.isMonotonicallyIncreasing(-1 * loss_history)
    try:
        assert isMonotonic
    except AssertionError:
        # Drop into an interactive shell to inspect the failed run.
        from IPython import embed
        embed()

    model2, Info2 = bnpy.run(
        self.Data, arg2name(aArg), arg2name(oArg), algName, **kwargs)
    self.pprintResult(model2, Info2)
    isRepeatable = np.allclose(Info1['loss_history'], Info2['loss_history'])
    assert isRepeatable
def fit_model(name, dataset):
    """ Fit a DP Gaussian mixture model with proposal moves.

    :param name: prefix for the temporary working directory
    :param dataset: bnpy.data.XData object
    :return: trained bnpy model
    """
    gamma = 1.0  # Prior on Dirichlet dispersion parameter
    sF = 1.0     # Prior covariance matrix is Identity * sF
    K = 5        # Number of initial clusters
    workdir = tempfile.mkdtemp(prefix=name)
    outputdir = 'trymoves-K={K}-gamma={G}-ECovMat={Cov}-moves=birth,merge,shuffle/'.format(
        K=K, G=gamma, Cov=sF)
    output_path = os.path.join(workdir, outputdir)
    blockPrint()
    trained_model, info_dict = bnpy.run(dataset,
                                        'DPMixtureModel',
                                        'Gauss',
                                        'memoVB',
                                        output_path=output_path,
                                        nLap=100,
                                        nTask=1,
                                        nBatch=1,
                                        gamma0=gamma,
                                        sF=sF,
                                        ECovMat='eye',
                                        K=K,
                                        moves='birth,merge,shuffle')
    enablePrint()
    shutil.rmtree(workdir)
    return trained_model
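# A minimal usage sketch for fit_model, on hypothetical data (assumes numpy
# and bnpy are imported, and that blockPrint/enablePrint are defined in this
# module). Hard cluster assignments come from the argmax of the per-cluster
# responsibilities, as in the other snippets in this collection.
X = np.random.randn(100, 5)
dataset = bnpy.data.XData(X)
model = fit_model('demo_', dataset)
LP = model.calc_local_params(dataset)
labels = LP['resp'].argmax(axis=1)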
def fit(self, X, lengths):
    # For reference, a bundled dataset can be loaded from file like so:
    #   dataset_path = os.path.join(bnpy.DATASET_PATH, 'mocap6')
    #   mocap6_dataset = bnpy.data.GroupXData.read_npz(
    #       os.path.join(dataset_path, 'dataset.npz'))
    Xprev = X[:-1, :]
    X = X[1:, :]
    doc_range = [0]
    doc_range += np.cumsum(lengths).tolist()
    dataset = bnpy.data.GroupXData(X, doc_range, None, Xprev)

    # Set the hyperparameters
    model, model_info = bnpy.run(dataset,
                                 self.alloModel,
                                 self.obsModel,
                                 self.varMethod,
                                 nLap=self.n_iteration,
                                 nTask=self.nTask,
                                 nBatch=self.nBatch,
                                 convergethr=self.convergethr,
                                 alpha=self.alpha,
                                 gamma=self.gamma,
                                 sF=self.sF,
                                 ECovMat=self.ECovMat,
                                 K=self.K,
                                 initname=self.initname)
    # NOTE: these hold probabilities, not logs, despite the attribute names.
    # self.log_startprob = log_mask_zero(model.allocModel.get_init_prob_vector())
    self.log_startprob = model.allocModel.get_active_comp_probs()
    self.log_startprob = self.log_startprob / sum(self.log_startprob)
    self.log_transmat = model.allocModel.get_trans_prob_matrix()
    self.model = model
    return self
def fit(self, X, lengths):
    Xprev = X
    doc_range = [0]
    doc_range += np.cumsum(lengths).tolist()
    dataset = bnpy.data.GroupXData(X, doc_range, None, Xprev)

    # Set the hyperparameters
    model, model_info = bnpy.run(
        dataset,
        self.alloModel,
        self.obsModel,
        self.varMethod,
        # output_path=os.path.join(model_save_path, 'results'),
        nLap=self.n_iteration,
        nTask=self.nTask,
        nBatch=self.nBatch,
        convergethr=self.convergethr,
        alpha=self.alpha,
        gamma=self.gamma,
        sF=self.sF,
        ECovMat=self.ECovMat,
        K=self.K,
        initname=self.initname)
    self.model = model
    return self
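# Hypothetical usage of the hmmlearn-style fit(X, lengths) API above.
# 'SequenceModel' stands in for whichever class defines this method:
# X stacks all sequences row-wise, lengths records each sequence's length.
seq1 = np.random.randn(50, 3)
seq2 = np.random.randn(80, 3)
X = np.vstack([seq1, seq2])
model = SequenceModel().fit(X, lengths=[len(seq1), len(seq2)])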
def setUp(self):
    ''' Create a valid Data - model - LP - SS configuration
    '''
    # Make toy data
    Data = ToyHMMK4.get_data(12345, T=15, nDocTotal=3)
    self.Data = Data
    hmodel, Info = bnpy.run(Data,
                            'HDPHMM', 'Gauss', 'VB',
                            nLap=1,
                            K=6,
                            initname='randexamplesbydist',
                            alpha=0.5,
                            gamma=5.0,
                            ECovMat='eye',
                            sF=1.0,
                            kappa=1e-5,
                            doWriteStdOut=False,
                            doSaveToDisk=False)
    LP = hmodel.calc_local_params(Data, limitMemoryLP=0)
    assert 'mHtable' not in LP
    self.mPairIDs = [(0, 1), (2, 3), (4, 5), (1, 5), (3, 4)]
    SS = hmodel.get_global_suff_stats(Data, LP,
                                      doPrecompEntropy=1,
                                      doPrecompMergeEntropy=1,
                                      mPairIDs=self.mPairIDs)
    hmodel.update_global_params(SS)
    self.hmodel = hmodel
    self.origLP = LP
    self.origSS = SS.copy()
def run(self, data, mixModel='DPMixtureModel', obsModel='Gauss', alg='memoVB'):
    dp_model, dp_info_dict = bnpy.run(data,
                                      mixModel,
                                      obsModel,
                                      alg,
                                      K=self.K,
                                      output_path=self.output_path,
                                      nLap=self.nLap,
                                      nTask=self.nTask,
                                      nBatch=self.nBatch,
                                      sF=self.sF,
                                      ECovMat=self.ECovMat,
                                      m_startLap=self.m_startLap,
                                      initname=self.initname,
                                      moves=self.moves,
                                      b_startLap=self.b_startLap,
                                      b_Kfresh=self.b_Kfresh,
                                      doSaveToDisk=self.doSaveToDisk,
                                      gamma1=self.gamma1,
                                      gamma0=self.gamma0,
                                      Kmax=self.Kmax,
                                      taskID=self.taskID)
    return dp_model, dp_info_dict
def single_run_monotonic(self, aArg, oArg, algName, iArg):
    """ Test a single call to bnpy.run, verify monotonicity only.
    """
    self.pprintSingleRun(aArg, oArg, algName, iArg)
    kwargs = self.makeAllKwArgs(aArg, oArg, algName, iArg)
    model1, Info1 = bnpy.run(
        self.Data, arg2name(aArg), arg2name(oArg), algName, **kwargs)
    self.pprintResult(model1, Info1)
    isMonotonic = self.isMonotonic(Info1['evTrace'])
    assert isMonotonic
def run_MOVBWithMoves_SegmentManySeq(
        self, aArg, oArg,
        moves='merge,delete,shuffle,seqcreate',
        algName='moVB',
        nWorkers=0,
        **kwargs):
    """ Execute single run with all moves enabled.

    Post Condition
    --------------
    Will raise AssertionError if any bad results detected.
    """
    self.Data.alwaysTrackTruth = 1
    Ktrue = np.unique(self.Data.TrueParams['Z']).size
    pprint(aArg)
    pprint(oArg)
    initArg = dict(**kwargs)
    pprint(initArg)
    viterbiPath = os.path.expandvars(
        '$BNPYROOT/bnpy/learnalg/extras/XViterbi.py')
    kwargs = self.makeAllKwArgs(aArg, oArg, initArg,
                                moves=moves,
                                nWorkers=nWorkers,
                                customFuncPath=viterbiPath,
                                doSaveToDisk=1,
                                doWriteStdOut=1,
                                printEvery=1,
                                saveEvery=1000,
                                **kwargs)
    kwargs['jobname'] += '-creationProposalName=%s' % (
        kwargs['creationProposalName'])
    model, Info = bnpy.run(
        self.Data, arg2name(aArg), arg2name(oArg), algName, **kwargs)
    pprintResult(model, Info, Ktrue=Ktrue)
    try:
        assert model.allocModel.K == model.obsModel.K
        assert model.allocModel.K == Ktrue
    except AssertionError:
        pprintCommandToReproduceError(
            self.datasetArg, aArg, oArg, algName, **kwargs)
    assert model.allocModel.K == model.obsModel.K
    if not model.allocModel.K == Ktrue:
        print('>>>>>> WHOA! Kfinal != Ktrue <<<<<<')
    print('')
    return Info
def single_run_repeatable_and_monotonic(self, aArg, oArg, algName, iArg):
    """ Test a single call to bnpy.run, verifying repeatability and monotonicity.
    """
    self.pprintSingleRun(aArg, oArg, algName, iArg)
    kwargs = self.makeAllKwArgs(aArg, oArg, algName, iArg)
    model1, Info1 = bnpy.run(
        self.Data, arg2name(aArg), arg2name(oArg), algName, **kwargs)
    self.pprintResult(model1, Info1)
    evTrace = Info1['evTrace']
    if algName.count('moVB'):
        # For memoized VB, only laps >= 1.0 are expected to be monotonic.
        evTrace = evTrace[Info1['lapTrace'] >= 1.0]
    isMonotonic = self.isMonotonic(evTrace)
    assert isMonotonic
    model2, Info2 = bnpy.run(
        self.Data, arg2name(aArg), arg2name(oArg), algName, **kwargs)
    self.pprintResult(model2, Info2)
    isRepeatable = np.allclose(Info1['evTrace'], Info2['evTrace'])
    assert isRepeatable
def fit(self, name='MultivariateAnalysis', verbose=False):
    """ Fits the multivariate analysis.

    :param name: Name for the output directory
    :return: trained bnpy model
    """
    if self.og_data.shape[0] > self.og_data.shape[1]:
        print('WARNING: Number of genes outnumbers samples. '
              'Consider more stringent filtering.')

    # This is a pandas dataframe: genes x samples
    data = self.og_data

    if self.center:
        if self.verbose:
            print('centering data')
        data = data.apply(lambda x: x - x.mean(), axis=1)

    data = data.T.values
    xdata = bnpy.data.XData(data)

    workdir = tempfile.mkdtemp(prefix="%s_" % name)
    output_dir = 'K={K}-gamma={G}-ECovMat={Cov}-moves=birth,merge,delete,shuffle/'.format(
        K=self.K, G=self.gamma, Cov=self.variance)
    output_path = os.path.join(workdir, output_dir)

    hmodel, info_dict = bnpy.run(xdata,
                                 'DPMixtureModel',
                                 'Gauss',
                                 'memoVB',
                                 nLap=1000,
                                 nTask=1,
                                 nBatch=1,
                                 gamma0=self.gamma,
                                 sF=self.variance,
                                 ECovMat='eye',
                                 K=self.K,
                                 initname='randexamplesbydist',
                                 moves='birth,merge,delete,shuffle',
                                 b_startLap=0,
                                 m_startLap=2,
                                 d_startLap=2,
                                 output_path=output_path,
                                 doWriteStdOut=verbose)
    self.hmodel = hmodel

    self.clusters = collections.defaultdict(list)
    for sample, cluster in zip(self.og_data.columns,
                               self.get_assignments(self.og_data)):
        self.clusters[cluster].append(sample)
    return self.hmodel
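# Hypothetical usage sketch: 'analysis' stands in for an instance of the
# class defining fit() above. After fitting, self.clusters maps each
# cluster id to the list of sample names assigned to it.
hmodel = analysis.fit(name='MultivariateAnalysis', verbose=True)
for cluster_id, samples in analysis.clusters.items():
    print(cluster_id, len(samples))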
def run_MOVBWithMoves(self, aArg, oArg,
                      moves='merge',
                      algName='moVB',
                      nWorkers=0,
                      **kwargs):
    """ Execute single run with merge moves enabled.

    Post Condition
    --------------
    Will raise AssertionError if any bad results detected.
    """
    Ktrue = self.Data.TrueParams['K']
    pprint(aArg)
    pprint(oArg)
    initArg = dict(**kwargs)
    pprint(initArg)
    kwargs = self.makeAllKwArgs(aArg, oArg, initArg,
                                moves=moves,
                                nWorkers=nWorkers,
                                **kwargs)
    model, Info = bnpy.run(
        self.Data, arg2name(aArg), arg2name(oArg), algName, **kwargs)
    pprintResult(model, Info, Ktrue=Ktrue)
    afterFirstLapMask = Info['lapTrace'] >= 1.0
    evTraceAfterFirstLap = Info['evTrace'][afterFirstLapMask]
    isMonotonic = is_monotonic(evTraceAfterFirstLap, aArg=aArg)
    try:
        assert isMonotonic
        assert model.allocModel.K == model.obsModel.K
        assert model.allocModel.K == Ktrue
    except AssertionError:
        pprintCommandToReproduceError(
            self.datasetArg, aArg, oArg, algName, **kwargs)
    assert isMonotonic
    assert model.allocModel.K == model.obsModel.K
    if not model.allocModel.K == Ktrue:
        print('>>>>>> WHOA! Kfinal != Ktrue <<<<<<')
    return Info
def bnpy_select_clusters(data, max_cells=50000):
    """
    Args:
        data: matrix of shape genes x cells

    Returns:
        selected k based on converged Gaussian DPMM, and the assigned labels.
    """
    # Randomly sub-select at most max_cells cells.
    selected_cell_ids = list(range(data.shape[1]))
    if max_cells < data.shape[1]:
        import random
        selected_cell_ids = random.sample(selected_cell_ids, max_cells)
        data = data[:, selected_cell_ids]
    # Reduce to 8 dimensions with truncated SVD before clustering.
    tsvd = TruncatedSVD(8)
    data_tsvd = tsvd.fit_transform(log1p(cell_normalize(data)).T)
    data_dense_bnpy = bnpy.data.XData(data_tsvd)
    trained_model, info_dict = bnpy.run(
        data_dense_bnpy,
        'DPMixtureModel', 'Gauss', 'memoVB',
        # doSaveToDisk=False,
        doWriteStdOut=False,
        output_path='./temp',
        nLap=100, nTask=1, nBatch=1,
        sF=0.1, ECovMat='eye',
        K=10, initname='randexamples',
        moves='birth,merge,shuffle',
        m_startLap=5, b_startLap=2, b_Kfresh=4)
    selected_k = info_dict['K_history'][-1]
    results = trained_model.calc_local_params(data_dense_bnpy)
    cluster_labels = results['resp'].argmax(1)
    return selected_k, cluster_labels
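# Usage sketch on a hypothetical genes x cells count matrix; assumes the
# module's own log1p/cell_normalize helpers handle raw counts as shown above.
counts = np.random.poisson(1.0, size=(500, 2000))
selected_k, cluster_labels = bnpy_select_clusters(counts, max_cells=1000)
print('selected k:', selected_k)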
pylab.tight_layout()

###############################################################################
#
# Training the model
# ------------------
# Let's do one single run of the VB algorithm,
# using 10 clusters and the 'randexamples' initialization procedure.

trained_model, info_dict = bnpy.run(
    dataset, 'FiniteMixtureModel', 'Gauss', 'VB',
    output_path='/tmp/AsteriskK8/helloworld-K=10/',
    nLap=100, sF=0.1, ECovMat='eye',
    K=10, initname='randexamples')

###############################################################################
#
# Loss function trace plot
# ------------------------
# We can plot the value of the loss function over iterations,
# starting after the first full pass over the dataset (first lap).
#
# As expected, we see monotonic decrease in the loss function's score
# after every subsequent iteration.
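###############################################################################
#
# A minimal sketch of that trace plot, using the same history keys
# (lap_history, loss_history) as the other examples in this collection.

pylab.figure()
pylab.plot(info_dict['lap_history'][1:], info_dict['loss_history'][1:], 'k.-')
pylab.xlabel('num. laps')
pylab.ylabel('loss')
pylab.tight_layout()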
    # Perform at most this many iterations at each document
    nCoordAscentItersLP=100,
    # Stop local iters early when max change in doc-topic counts < this thr
    convThrLP=convThrLP,
    )

for nBatch in [1, 16]:
    output_path = '/tmp/wiki/scalability-model=hdp_topic+mult-alg=memoized-nBatch=%d-nCoordAscentItersLP=%s-convThrLP=%.3g/' % (
        nBatch, local_step_kwargs['nCoordAscentItersLP'], convThrLP)
    trained_model, info_dict = bnpy.run(
        dataset, 'HDPTopicModel', 'Mult', 'memoVB',
        output_path=output_path,
        nLap=nLap, nBatch=nBatch, convThr=convThr,
        K=K, gamma=gamma, alpha=alpha, lam=lam,
        initname='randomlikewang',
        moves='shuffle',
        traceEvery=traceEvery, printEvery=printEvery,
        **local_step_kwargs)

###############################################################################
# Plot: Training Loss and Laps Completed vs. Wallclock time
# ---------------------------------------------------------
#
# * Left column: Training Loss progress vs. wallclock time
# * Right column: Laps completed vs. wallclock time
#
# Remember: one lap is a complete pass through the entire training set (6400 docs)
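###############################################################################
#
# A hedged sketch of those two panels, plotting the most recent run's
# histories. This assumes the info dict exposes an 'elapsed_time_sec_history'
# key alongside 'lap_history' and 'loss_history'; adjust the key names if
# your bnpy version differs.

fig, ax = pylab.subplots(nrows=1, ncols=2)
ax[0].plot(info_dict['elapsed_time_sec_history'], info_dict['loss_history'], '.-')
ax[0].set_xlabel('wallclock time (sec)')
ax[0].set_ylabel('training loss')
ax[1].plot(info_dict['elapsed_time_sec_history'], info_dict['lap_history'], '.-')
ax[1].set_xlabel('wallclock time (sec)')
ax[1].set_ylabel('laps completed')
pylab.tight_layout()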
    cur_ax_handle.set_xticks([-2, -1, 0, 1, 2])
    cur_ax_handle.set_yticks([-2, -1, 0, 1, 2])
    cur_ax_handle.set_xlabel("lap: %d" % lap_val)
pylab.tight_layout()

###############################################################################
# Training from K=1 cluster
# -------------------------
#
# Using 1 initial cluster, with birth and merge proposal moves.

K1_trained_model, K1_info_dict = bnpy.run(
    dataset, 'DPMixtureModel', 'Gauss', 'memoVB',
    output_path='/tmp/AsteriskK8/trymoves-K=1/',
    nLap=100, nTask=1, nBatch=1,
    sF=0.1, ECovMat='eye',
    K=1, initname='randexamples',
    moves='birth,merge,shuffle',
    m_startLap=5, b_startLap=2, b_Kfresh=4)

show_clusters_over_time(K1_info_dict['task_output_path'])

###############################################################################
# Training from K=4 clusters
# --------------------------
#
# Now using 4 initial clusters, with birth and merge proposal moves.

K4_trained_model, K4_info_dict = bnpy.run(
    dataset, 'DPMixtureModel', 'Gauss', 'memoVB',
    output_path='/tmp/AsteriskK8/trymoves-K=4/',
def run_synthetic_data_comparisons(
    D: int,
    K: int,
    N: int,
    var_scale: int,
    alpha: int,
    iters: int,
    burnout: int,
    repeats: int,
):
    results = {
        "method": [],
        "k_mae": [],
        "NMI": [],
        "ARI": [],
        "Time": [],
    }
    i = 0
    while i < repeats:
        # Generate dataset
        data, labels = DPMMPython.generate_gaussian_data(N, D, K, var_scale)
        prior = niw(1, np.zeros(D), 100, np.eye(D) * 0.5)

        # Run DPGMM with a SplitNet init sized to the data dimension.
        # (The original repeated this block per dimension and mixed the
        # iters/burnout arguments with ITERS/BURNOUT globals; normalized
        # here to the function's own arguments.)
        if D == 2:
            init_type = "splitnet_2d"
        elif D <= 10:
            init_type = "splitnet_10d"
        else:
            init_type = "splitnet_128d"
        start = timer()
        dpmm_splitnet_results = DPMMPython.fit(
            data, alpha, iterations=iters, burnout=burnout,
            verbose=False, init_type=init_type,
        )[0]
        dpmm_net_time = timer() - start

        if len(np.unique(dpmm_splitnet_results)) < K // 2:
            print("failed.")
        else:
            start = timer()
            dpmm_rand_results = DPMMPython.fit(
                data, alpha, iterations=iters, burnout=burnout,
                verbose=False, init_type="none",
            )[0]
            dpmm_rand_time = timer() - start

            start = timer()
            dpmm_kmeans_results = DPMMPython.fit(
                data, alpha, iterations=iters, burnout=burnout,
                verbose=False, init_type="kmeans",
            )[0]
            dpmm_kmeans_time = timer() - start

            # Run k-means
            start = timer()
            kmeans = KMeans(n_clusters=K).fit(data.T)
            kmeans_time = timer() - start
            kmeans_labels = kmeans.labels_

            # Run GMM
            start = timer()
            gmm = GaussianMixture(n_components=K, covariance_type="full").fit(data.T)
            gmm_labels = gmm.predict(data.T)
            gmm_time = timer() - start

            # sklearn DPGMM
            start = timer()
            dpgmm = BayesianGaussianMixture(
                n_components=2 * K,
                covariance_type="full",
                weight_concentration_prior=alpha,
                weight_concentration_prior_type="dirichlet_process",
                mean_precision_prior=1e2,
                covariance_prior=1e0 * np.eye(D),
                init_params="kmeans",
                max_iter=iters,
                verbose=0,
            ).fit(data.T)
            dpgmm_labels = dpgmm.predict(data.T)
            dpgmmsk_time = timer() - start

            # moVB: bnpy expects data as N x D
            data_bnpy = bnpy.data.XData(data.T)
            start = timer()
            model, run_info = bnpy.run(
                data_bnpy,
                "DPMixtureModel",
                "Gauss",
                "memoVB",
                nTask=1,
                nBatch=1,
                K=1,
                nLap=iters,
                moves="birth,merge,shuffle",
                gt=labels,
                gamma0=alpha,
            )
            moVB_time = timer() - start
            LP = model.calc_local_params(data_bnpy)
            moVB_labels = LP["resp"].argmax(axis=1)

            # Calc metrics and aggregate
            results = add_results(results, "k-means", labels, kmeans_labels, kmeans_time)
            results = add_results(results, "EM-GMM", labels, gmm_labels, gmm_time)
            results = add_results(results, "DPGMM (SKlearn's)", labels, dpgmm_labels, dpgmmsk_time)
            results = add_results(results, "DPGMM-Random", labels, dpmm_rand_results, dpmm_rand_time)
            results = add_results(results, "DPGMM-k-means", labels, dpmm_kmeans_results, dpmm_kmeans_time)
            results = add_results(results, "DPGMM-SplitNet", labels, dpmm_splitnet_results, dpmm_net_time)
            results = add_results(results, "moVB", labels, moVB_labels, moVB_time)
            i += 1
            print(f"Finished iteration {i}")
    return results
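# Hypothetical invocation, just to illustrate the signature; the constants
# here are illustrative, not tuned values from the original study.
results = run_synthetic_data_comparisons(
    D=2, K=5, N=10000, var_scale=100, alpha=10.0,
    iters=100, burnout=5, repeats=3)
# Summarize per-method averages (pandas assumed available).
import pandas as pd
print(pd.DataFrame(results).groupby("method").mean())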
# Set "reactivation" limits # So that each cluster is eligible again after 10 passes thru dataset # Or when it's size changes by 400% m_nLapToReactivate=10, m_minPercChangeInNumAtomsToReactivate=400 * 0.01, # Specify how to rank pairs (determines order in which merges are tried) # 'obsmodel_elbo' means rank pairs by improvement to observation model ELBO m_pair_ranking_procedure='obsmodel_elbo', m_pair_ranking_direction='descending', ) goodelbopairs_trained_model, goodelbopairs_info_dict = bnpy.run( dataset, 'HDPHMM', 'AutoRegGauss', 'memoVB', # output_path=output_path_starter + 'trymerge-K=20-model=HDPHMM+ARMA-ECovMat=1*eye-merge_strategy=good_elbo_pairs/', moves='merge,shuffle', **dict(alg_kwargs.items() + init_kwargs.items() + hdphmm_kwargs.items() + gauss_kwargs.items() + goodelbopairs_merge_kwargs.items())) K = goodelbopairs_trained_model.obsModel.K start_prob_K = goodelbopairs_trained_model.allocModel.get_init_prob_vector() trans_prob_KK = goodelbopairs_trained_model.allocModel.get_trans_prob_matrix() prior = goodelbopairs_trained_model.obsModel.calcLogSoftEvMatrix_FromPost post = goodelbopairs_trained_model.obsModel.Post print("printing all data!") print(goodelbopairs_trained_model.obsModel.Post.M) print(goodelbopairs_trained_model.obsModel.Post.B) print(goodelbopairs_trained_model.obsModel.Post) print("printing finished!")
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import bnpy

if __name__ == '__main__':
    ndim = 20
    train_path = "../raw_data/train.wav"
    train_wav, _ = librosa.load(train_path, sr=44100)
    train = librosa.feature.mfcc(y=train_wav, sr=44100, n_mfcc=ndim)
    colnames = ['mfcc' + str(i) for i in range(train.shape[0])]
    df = pd.DataFrame(data=train.T, columns=colnames)
    df.to_csv('../raw_data/train_mfcc.csv', index=False)
    hmodel, Rinfo = bnpy.run('../raw_data/train_mfcc.csv',
                             'FiniteMixtureModel', 'Gauss', 'EM',
                             K=3,
                             output_path='out/2/',
                             nLap=300,
                             minLaps=10)
    plt.plot(np.arange(0, ndim), hmodel.obsModel.get_mean_for_comp(0),
             label="Cluster 0")
    plt.plot(np.arange(0, ndim), hmodel.obsModel.get_mean_for_comp(1),
             label="Cluster 1")
    plt.plot(np.arange(0, ndim), hmodel.obsModel.get_mean_for_comp(2),
             label="Cluster 2")
    plt.xlabel('Feat.')
    plt.ylabel('Norm. Pow')
    plt.ylim(-100, 180)
###############################################################################
#
# Run the VB+proposals algorithm
# with only merges and re-shuffling.
#
# Initialization: 10 topics, using randomlikewang

trained_model, info_dict = bnpy.run(
    dataset, 'HDPTopicModel', 'Mult', 'memoVB',
    output_path='/tmp/bars_one_per_doc/' +
        'trymoves-model=hdp+mult-K=10-moves=merge,shuffle/',
    nLap=50, convergeThr=0.001,
    nBatch=1,
    K=10, initname='randomlikewang',
    alpha=0.5, lam=0.1,
    moves='merge,shuffle',
    **dict(list(merge_kwargs.items()) + list(local_step_kwargs.items())))

###############################################################################
#
#

def show_bars_over_time(task_output_path=None,
                        query_laps=[0, 1, 2, 5, None],
# Start with too many clusters (K=25)
gamma = 5.0
sF = 5.0
K = 25

diag1_trained_model, diag1_info_dict = bnpy.run(
    dataset, 'DPMixtureModel', 'DiagGauss', 'memoVB',
    output_path=(
        '/tmp/faithful/' +
        'trymoves-K=%d-gamma=%s-lik=DiagGauss-ECovMat=%s*eye-moves=none/' % (
            K, gamma, sF)),
    nLap=1000, nTask=1, nBatch=1,
    convergeThr=0.0001,
    gamma0=gamma,
    sF=sF, ECovMat='eye',
    K=K, initname='randexamplesbydist',
    )
show_clusters_over_time(diag1_info_dict['task_output_path'])

###############################################################################
#
# *DiagGauss* observation model
# --------------------------------------
pylab.tight_layout()

###############################################################################
#
# Cold-start model training
# -------------------------
# Let's do one single run of the VB algorithm,
# using 10 clusters and the 'randexamples' initialization procedure.

cold_start_model, cold_info_dict = bnpy.run(
    dataset, 'FiniteMixtureModel', 'Gauss', 'VB',
    output_path='/tmp/AsteriskK8/coldstart-K=10/',
    nLap=25, sF=0.1, ECovMat='eye',
    K=10, initname='randexamples')

###############################################################################
#
# Setup helper method to visualize clusters
# -----------------------------------------
# Here's a short function to show how clusters evolve during training.

def show_clusters_over_time(task_output_path=None,
                            query_laps=[0, 1, 2, 5, 10, None],
                            nrows=2):
parser.add_argument('--K', type=int, default=200)
parser.add_argument('--nnzPerRowLP', type=int, default=5)
parser.add_argument('--convThrLP', type=float, default=-1.0)
parser.add_argument('--nCoordAscentItersLP', type=int, default=50)
parser.add_argument('--initLaps', type=int, default=2)
args = parser.parse_args()

if args.dataName == 'AdmixAsteriskK8':
    import AdmixAsteriskK8
    Data = AdmixAsteriskK8.get_data(nDocTotal=args.nDocTotal,
                                    nObsPerDoc=200)
    hmodel, Info = bnpy.run(Data,
                            'HDPTopicModel', 'Gauss', 'memoVB',
                            ECovMat='diagcovdata',
                            sF=0.1,
                            nLap=args.initLaps,
                            initname='randexamples',
                            K=args.K,
                            nBatch=1)
else:
    import MixBarsK10V900
    Data = MixBarsK10V900.get_data(nDocTotal=args.nDocTotal,
                                   nWordsPerDoc=500)
    hmodel, Info = bnpy.run(Data,
                            'HDPTopicModel', 'Mult', 'memoVB',
                            lam=0.1,
                            nLap=args.initLaps,
                            initname='randexamples',
# Set "reactivation" limits # So that each cluster is eligible again after 10 passes thru dataset # Or when it's size changes by 400% m_nLapToReactivate=10, m_minPercChangeInNumAtomsToReactivate=400 * 0.01, # Specify how to rank pairs (determines order in which merges are tried) # 'total_size' and 'descending' means try largest combined clusters first m_pair_ranking_procedure='total_size', m_pair_ranking_direction='descending', ) allpairs_trained_model, allpairs_info_dict = bnpy.run( dataset, 'HDPHMM', 'DiagGauss', 'memoVB', output_path= '/tmp/mocap6/trymerge-K=20-model=HDPHMM+DiagGauss-ECovMat=1*eye-merge_strategy=all_pairs/', moves='merge,shuffle', **dict(alg_kwargs.items() + init_kwargs.items() + hdphmm_kwargs.items() + gauss_kwargs.items() + allpairs_merge_kwargs.items())) ############################################################################### # # Large-Pairs : Try 5-largest-size pairs of merges every 10 laps # -------------------------------------------------------------- # # This is much cheaper than all pairs. Let's see how well it does. largepairs_merge_kwargs = dict( m_startLap=10, # Set limits to number of merges attempted each lap.
    b_Kfresh=5)

# output_path = os.path.join(bnpy.ROOT_PATH,
#     "code/output/trymoves-model=hdp_topic+mult-K=5/")

# Start at 20 or 30 topics TODO
trained_model, info_dict = bnpy.run(
    dataset, 'HDPTopicModel', 'Mult', 'memoVB',
    output_path='/tmp/hdp_topic+mult-K=5/',
    nLap=2000, convergeThr=0.01,
    nBatch=5,
    K=5, initname='randomlikewang',
    gamma=50.0, alpha=0.5, lam=0.1,
    moves='birth,merge,shuffle',
    **dict(list(local_step_kwargs.items()) +
           list(merge_kwargs.items()) +
           list(birth_kwargs.items())))

###############################################################################
#
# Setup: Helper function to plot topics at each stage of training

def show_top_words_over_time(task_output_path=None,
ECovMat = 'eye'
nLap = 200

###############################################################################
#
# Baseline: Mixture model with *DiagGauss* observation model
# ----------------------------------------------------------
#
# We'll take the best of 3 independent inits ('tasks')

mix_model, mix_info_dict = bnpy.run(
    dataset, 'FiniteMixtureModel', 'DiagGauss', 'memoVB',
    output_path='/tmp/mocap6/test-model=FiniteMixtureModel+DiagGauss-ECovMat=1*eye/',
    nLap=nLap, nTask=3, nBatch=1, convergeThr=0.0001,
    gamma=1.0, sF=sF, ECovMat=ECovMat,
    K=K, initname='randexamples',
    )

###############################################################################
#
# FiniteTopicModel with *DiagGauss* observation model
# ---------------------------------------------------
#
# We'll take the best of 3 independent inits ('tasks')

finite_model, finite_info_dict = bnpy.run(
    dataset, 'FiniteTopicModel', 'DiagGauss', 'memoVB',
# Assumes diagonal covariances.
#
# No sparsity assumptions during training

K = 3          # n clusters
gamma = 50.0   # DP concentration param
sF = 0.1       # scale of expected covariance

full_trained_model, full_info_dict = bnpy.run(
    dataset, 'DPMixtureModel', 'DiagGauss', 'VB',
    output_path='/tmp/faithful/demo_sparse_resp-K=3-lik=Gauss-ECovMat=5*eye/',
    nLap=1000, nTask=5, nBatch=1, convergeThr=0.0001,
    gamma0=gamma, sF=sF, ECovMat='eye',
    K=K, initname='randexamples',
    )

# Add this model into the current plot
bnpy.viz.PlotComps.plotCompsFromHModel(full_trained_model)

###############################################################################
#
# Do inference with L=1 sparsity
# ------------------------------
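###############################################################################
#
# A hedged sketch of the L=1 run promised above: same model and priors,
# but capping each data point at a single nonzero responsibility via the
# nnzPerRowLP local-step kwarg used elsewhere in these examples. Treat the
# exact kwargs as assumptions, not the original author's settings.

sparse_trained_model, sparse_info_dict = bnpy.run(
    dataset, 'DPMixtureModel', 'DiagGauss', 'VB',
    output_path='/tmp/faithful/demo_sparse_resp-K=3-lik=Gauss-nnzPerRowLP=1/',
    nLap=1000, nTask=5, nBatch=1, convergeThr=0.0001,
    gamma0=gamma, sF=sF, ECovMat='eye',
    K=K, initname='randexamples',
    nnzPerRowLP=1)
bnpy.viz.PlotComps.plotCompsFromHModel(sparse_trained_model)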
sF = 1.0
# Set observation model prior so E[covariance] = identity
ECovMat = 'eye'

###############################################################################
#
# DP mixture with *DiagGauss* observation model
# ---------------------------------------------

mixdiag_trained_model, mixdiag_info_dict = bnpy.run(
    dataset, 'DPMixtureModel', 'DiagGauss', 'memoVB',
    output_path='/tmp/mocap6/showcase-K=20-model=DP+DiagGauss-ECovMat=1*eye/',
    nLap=50, nTask=1, nBatch=1, convergeThr=0.0001,
    gamma=gamma, sF=sF, ECovMat=ECovMat,
    K=K, initname='randexamples',
    )

###############################################################################
#
# HDP-HMM with *DiagGauss* observation model
# -------------------------------------------
#
# Assume diagonal covariances.
#
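###############################################################################
#
# A hedged sketch of the HDP-HMM run introduced above, reusing the same
# priors and mirroring the HDPHMM+DiagGauss+memoVB calls found elsewhere
# in these examples; the exact kwargs are assumptions, not the original
# author's settings.

hmmdiag_trained_model, hmmdiag_info_dict = bnpy.run(
    dataset, 'HDPHMM', 'DiagGauss', 'memoVB',
    output_path='/tmp/mocap6/showcase-K=20-model=HDPHMM+DiagGauss-ECovMat=1*eye/',
    nLap=50, nTask=1, nBatch=1, convergeThr=0.0001,
    gamma=gamma, sF=sF, ECovMat=ECovMat,
    K=K, initname='randexamples')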
# Train LDA topic model
# ---------------------
#
# Using 10 clusters and the 'randomlikewang' initialization procedure.

local_step_kwargs = dict(
    # Perform at most this many iterations at each document
    nCoordAscentItersLP=100,
    # Stop local iters early when max change in doc-topic counts < this thr
    convThrLP=0.001,
    )

trained_model, info_dict = bnpy.run(
    dataset, 'FiniteTopicModel', 'Mult', 'VB',
    output_path='/tmp/bars_one_per_doc/helloworld-model=topic+mult-K=10/',
    nLap=100, convergeThr=0.01,
    K=10, initname='randomlikewang',
    alpha=0.5, lam=0.1,
    **local_step_kwargs)

###############################################################################
#
# First, we can plot the loss function over time.
# We'll skip the first few iterations, since performance is quite bad.

pylab.figure(figsize=FIG_SIZE)
pylab.plot(info_dict['lap_history'][1:], info_dict['loss_history'][1:], 'k.-')
pylab.xlabel('num. laps')
pylab.ylabel('loss')
pylab.tight_layout()
    # Plot the current model
    cur_ax_handle = ax_handle_list.flatten()[plot_id]
    bnpy.viz.PlotComps.plotCompsFromHModel(
        cur_model,
        Data=dataset,
        )  # ax_handle=cur_ax_handle)
    cur_ax_handle.set_xticks([-2, -1, 0, 1, 2])
    cur_ax_handle.set_yticks([-2, -1, 0, 1, 2])
    cur_ax_handle.set_xlabel("lap: %d" % lap_val)
pylab.tight_layout()
pylab.savefig("results/covMat1.png")
pylab.waitforbuttonpress()
pylab.show()

K25_trained_model, K25_info_dict = bnpy.run(
    "msnbc_wh.csv",
    'FiniteMixtureModel', 'Gauss', 'EM',
    output_path='results/',
    nLap=500, nTask=1, nBatch=1,
    sF=0.1,
    moves='birth,merge,shuffle',
    K=10,
    )
show_clusters_over_time(K25_info_dict['task_output_path'])
X_csr_DV = dataset.getSparseDocTypeCountMatrix()
bnpy.viz.BarsViz.show_square_images(
    X_csr_DV[:10].toarray(), vmin=0, vmax=5)
# pylab.colorbar()
# pylab.clabel('word count')
pylab.tight_layout()

###############################################################################
#
# Let's do one single run of the VB algorithm,
# using 10 clusters and the 'randexamples' initialization procedure.

trained_model, info_dict = bnpy.run(
    dataset, 'FiniteTopicModel', 'Bern', 'VB',
    output_path='/tmp/bars_one_per_doc/helloworld-lik=bernoulli-K=10/',
    nLap=1000, convergeThr=0.0001,
    K=10,
    alpha=0.5,
    lambda1=0.1,
    lambda0=0.1)

###############################################################################
#
# First, we can plot the loss function over time.
# We'll skip the first few iterations, since performance is quite bad.

pylab.figure(figsize=FIG_SIZE)
pylab.plot(info_dict['lap_history'][2:], info_dict['loss_history'][2:], 'k.-')
pylab.xlabel('num. laps')
pylab.ylabel('loss')
pylab.tight_layout()