def initialize(self):
    """
    Initialize the Gibbs sampler state.

    I start with log N tables and randomly initialize customers to those tables.

    """
    # First check the prior degrees of freedom: it has to be >= num_dimension
    if self.prior.nu < self.embedding_size:
        self.log.warn(
            "The initial degrees of freedom of the prior is less than the dimension! "
            "Setting it to the number of dimensions: {}".format(self.embedding_size))
        self.prior.nu = self.embedding_size

    deg_of_freedom = self.prior.nu - self.embedding_size + 1
    # Now calculate the covariance matrix of the multivariate T-distribution
    coeff = (self.prior.kappa + 1.) / (self.prior.kappa * deg_of_freedom)
    sigma_T = self.prior.sigma * coeff
    # This features in the original code, but doesn't get used
    # Or is it just to check that the inverse doesn't fail?
    #sigma_Tinv = inv(sigma_T)
    sigma_TDet_sign, sigma_TDet = slogdet(sigma_T)
    if sigma_TDet_sign != 1:
        raise ValueError("sign of log determinant of initial sigma is {}".format(sigma_TDet_sign))

    # Store zeros in sum_squared_table_customers and later keep adding each customer
    self.sum_squared_table_customers[:] = 0
    # Means are set to the prior and then updated as we add each assignment
    self.table_means.np[:] = self.prior.mu

    # Initialize the Cholesky decomp of each table, with no counts yet
    for table in range(self.num_tables):
        self.table_cholesky_ltriangular_mat.np[table] = self.prior.chol_sigma.copy()

    # Randomly assign customers to tables
    self.table_assignments = []
    pbar = get_progress_bar(len(self.corpus), title="Initializing", show_progress=self.show_progress)
    for doc_num, doc in enumerate(pbar(self.corpus)):
        tables = list(np.random.randint(self.num_tables, size=len(doc)))
        self.table_assignments.append(tables)
        for (word, table) in zip(doc, tables):
            self.table_counts.np[table] += 1
            self.table_counts_per_doc[table, doc_num] += 1
            # Update the running sum of squared customer embeddings
            self.sum_squared_table_customers[table] += np.outer(
                self.vocab_embeddings[word], self.vocab_embeddings[word])

            self.update_table_params(table, word)
def sample(self, num_iterations):
    """
    for num_iters:
        for each customer:
            remove him from his old_table and update the table params
            if old_table is empty: remove table
            calculate the prior and likelihood for this customer sitting at each table
            sample a table index
            if new_table is equal to old_table, we don't have to update the parameters;
            else update the params of the old table

    """
    for iteration in range(num_iterations):
        self.log.info("Iteration {}".format(iteration))

        pbar = get_progress_bar(len(self.corpus), title="Sampling")
        for d, doc in enumerate(pbar(self.corpus)):
            if self.show_topics is not None and self.show_topics > 0 and d % self.show_topics == 0:
                print("Topics after {:,} docs".format(d))
                print(self.format_topics())

            for w, cust_id in enumerate(doc):
                x = self.vocab_embeddings[cust_id]

                # Remove cust_id from his old table
                old_table_id = self.table_assignments[d][w]
                self.table_assignments[d][w] = -1  # Doesn't really make any difference, as only counts are used
                self.table_counts[old_table_id] -= 1
                self.table_counts_per_doc[old_table_id, d] -= 1
                # Update vector means etc
                self.sum_table_customers[old_table_id] -= x
                self.sum_squared_table_customers[old_table_id] -= np.outer(x, x)

                # Topic 'old_table_id' now has one member fewer
                if self.cholesky_decomp:
                    # Just update params for this customer
                    self.update_table_params_chol(old_table_id, cust_id, is_removed=True)
                else:
                    # Now recalculate table parameters for this table
                    self.set_table_parameters(old_table_id)

                #self.check_everything(iteration, d, w, mid_sample=True)

                # Now calculate the prior and likelihood for the customer to sit at each table and sample
                # Go over each table
                counts = self.table_counts_per_doc[:, d] + self.alpha
                # Now calculate the likelihood for each table
                log_lls = self.log_multivariate_tdensity_tables(x)
                # Add the log prior to get the log posterior vector
                log_posteriors = np.log(counts) + log_lls
                # To prevent overflow, subtract log(p_max).
                # When normalizing after exponentiating, each entry will be
                # exp(log p_i - log p_max) / \Sigma_i exp(log p_i - log p_max),
                # so log p_max cancels out and prevents overflow in the exponentiation phase.
                posterior = np.exp(log_posteriors - log_posteriors.max())
                posterior /= posterior.sum()
                # Now sample an index from this posterior vector
                new_table_id = np.random.choice(self.num_tables, p=posterior)

                # Now have a new assignment: add its counts
                self.table_assignments[d][w] = new_table_id
                self.table_counts[new_table_id] += 1
                self.table_counts_per_doc[new_table_id, d] += 1
                self.sum_table_customers[new_table_id] += x
                self.sum_squared_table_customers[new_table_id] += np.outer(x, x)

                if self.cholesky_decomp:
                    self.update_table_params_chol(new_table_id, cust_id)
                else:
                    self.set_table_parameters(new_table_id)

                #self.check_everything(iteration, d, w)

            #if self.cholesky_decomp:
            #    # After each iteration, recompute the Cholesky decomposition fully, to avoid numerical
            #    # inaccuracies blowing up with the repeated updates
            #    # This also recomputes means
            #    for table in range(self.num_tables):
            #        inv_sigma = self.set_table_parameters(table)
            #        self.table_cholesky_ltriangular_mat[table] = cholesky(inv_sigma)

        if self.show_topics is not None:
            print("Topics after iteration {}".format(iteration))
            print(self.format_topics())

        if self.save_path is not None:
            self.log.info("Saving model")
            self.save()
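
# Illustrative sketch, not the trainer's own set_table_parameters: the prior fields
# (mu, kappa, nu, sigma) and the running sums sum_table_customers and
# sum_squared_table_customers suggest Normal-Inverse-Wishart conjugacy, under which
# each table's multivariate-t predictive parameters can be recomputed from those sums
# alone. The function name and signature below are assumptions for illustration.
# With n = 0 it reduces to the sigma_T and deg_of_freedom computed in initialize().

import numpy as np

def niw_posterior_predictive(mu0, kappa0, nu0, psi0, sum_x, sum_xxT, n):
    """Return (mean, scale, dof) of the multivariate-t predictive for one table."""
    d = mu0.shape[0]
    kappa_n = kappa0 + n
    nu_n = nu0 + n
    mu_n = (kappa0 * mu0 + sum_x) / kappa_n
    if n > 0:
        x_bar = sum_x / n
        # Scatter about the sample mean, recovered from the running sum of outer products
        scatter = sum_xxT - n * np.outer(x_bar, x_bar)
        psi_n = psi0 + scatter + (kappa0 * n / kappa_n) * np.outer(x_bar - mu0, x_bar - mu0)
    else:
        psi_n = psi0.copy()
    dof = nu_n - d + 1
    scale = psi_n * (kappa_n + 1.) / (kappa_n * dof)
    return mu_n, scale, dof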
def initialize(self):
    """
    Initialize the Gibbs sampler state.

    I start with log N tables and randomly initialize customers to those tables.

    """
    # First check the prior degrees of freedom: it has to be >= num_dimension
    if self.prior.nu < self.embedding_size:
        self.log.warn(
            "The initial degrees of freedom of the prior is less than the dimension! "
            "Setting it to the number of dimensions: {}".format(self.embedding_size))
        self.prior.nu = self.embedding_size

    deg_of_freedom = self.prior.nu - self.embedding_size + 1
    # Now calculate the covariance matrix of the multivariate T-distribution
    coeff = (self.prior.kappa + 1.) / (self.prior.kappa * deg_of_freedom)
    sigma_T = self.prior.sigma * coeff
    # This features in the original code, but doesn't get used
    # Or is it just to check that the inverse doesn't fail?
    #sigma_Tinv = inv(sigma_T)
    sigma_TDet_sign, sigma_TDet = slogdet(sigma_T)
    if sigma_TDet_sign != 1:
        raise ValueError("sign of log determinant of initial sigma is {}".format(sigma_TDet_sign))

    # Store zeros in the running sums and later keep adding each customer
    self.sum_table_customers[:] = 0
    self.sum_squared_table_customers[:] = 0
    # With Cholesky: means are set to the prior and then updated as we add each assignment
    # Without: means are computed fully for each table after initialization
    self.table_means[:] = self.prior.mu
    # With Cholesky: this is ignored - we never use table_inverse_covariances
    # Without: this gets computed after initialization
    self.table_inverse_covariances[:] = 0

    # Initialize the Cholesky decomp of each table, with no counts yet
    for table in range(self.num_tables):
        self.table_cholesky_ltriangular_mat[table] = self.prior.chol_sigma.copy()

    # Randomly assign customers to tables
    self.table_assignments = []
    pbar = get_progress_bar(len(self.corpus), title="Initializing")
    for doc_num, doc in enumerate(pbar(self.corpus)):
        tables = list(np.random.randint(self.num_tables, size=len(doc)))
        self.table_assignments.append(tables)
        for (word, table) in zip(doc, tables):
            self.table_counts[table] += 1
            self.table_counts_per_doc[table, doc_num] += 1
            # Update the running sums with this customer's embedding
            self.sum_table_customers[table] += self.vocab_embeddings[word]
            self.sum_squared_table_customers[table] += np.outer(
                self.vocab_embeddings[word], self.vocab_embeddings[word])

            if self.cholesky_decomp:
                self.update_table_params_chol(table, word)

        #self.check_everything(-1, doc_num, -1)

    # Now compute the table parameters of each table
    # Go over each table
    if not self.cholesky_decomp:
        for table in range(self.num_tables):
            self.set_table_parameters(table)
    else:
        # Make sure we don't accidentally use the inverse covariances anywhere, as they don't
        # get updated
        self.table_inverse_covariances = None
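
# Illustrative sketch, not the module's own log_multivariate_tdensity (whose exact
# signature lives elsewhere): it shows how the multivariate Student-t log density
# used for the table likelihoods can be evaluated from a lower-triangular Cholesky
# factor of the scale matrix, the same kind of factor stored in
# table_cholesky_ltriangular_mat above. Assumes scipy is available; the function
# name and arguments here are illustrative, not part of the trainer API.

import numpy as np
from scipy.linalg import solve_triangular
from scipy.special import gammaln

def log_multivariate_t_chol(x, mu, chol_sigma, nu):
    """Log density of a multivariate t with location mu, scale L L^T and nu d.o.f."""
    d = x.shape[0]
    # Solve L z = (x - mu); the quadratic form (x - mu)^T Sigma^-1 (x - mu) is then z . z
    z = solve_triangular(chol_sigma, x - mu, lower=True)
    quad = z @ z
    # log|Sigma| = 2 * sum(log(diag(L)))
    log_det = 2. * np.sum(np.log(np.diag(chol_sigma)))
    return (gammaln((nu + d) / 2.) - gammaln(nu / 2.)
            - 0.5 * d * np.log(nu * np.pi) - 0.5 * log_det
            - 0.5 * (nu + d) * np.log(1. + quad / nu))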
def sample(self, num_iterations):
    """
    for num_iters:
        for each customer:
            remove him from his old_table and update the table params
            if old_table is empty: remove table
            calculate the prior and likelihood for this customer sitting at each table
            sample a table index
            if new_table is equal to old_table, we don't have to update the parameters;
            else update the params of the old table

    """
    if self.show_topics is not None:
        print("Topics after initialization")
        print(self.format_topics())

        # Compute the overall usage of topics across the training corpus
        topic_props = self.table_counts_per_doc.sum(axis=1).astype(np.float64)
        topic_props /= topic_props.sum()
        print("Words using topics: {}".format(", ".join(
            "{}={:.1f}%".format(i, prop) for i, prop in enumerate(topic_props * 100.))))
        topic_doc_props = (self.table_counts_per_doc > 0).astype(np.float64).sum(axis=1)
        topic_doc_props /= self.num_documents
        print("Docs using topics: {}".format(", ".join(
            "{}={:.1f}%".format(i, prop) for i, prop in enumerate(topic_doc_props * 100.))))

    with VoseAliasUpdater(
            self.aliases, self.vocab_embeddings,
            self.prior.kappa, self.prior.nu,
            self.table_counts, self.table_means, self.table_cholesky_ltriangular_mat,
            self.log_determinants, das_normalization=self.das_normalization,
    ) as alias_updater:
        for iteration in range(num_iterations):
            stats = SamplingDiagnostics()
            self.log.info("Iteration {}".format(iteration))

            alias_updater.unpause()
            pbar = get_progress_bar(len(self.corpus), title="Sampling", show_progress=self.show_progress)
            for d, doc in enumerate(pbar(self.corpus)):
                if self.show_topics is not None and self.show_topics > 0 and d % self.show_topics == 0:
                    print("Topics after {:,} docs".format(d))
                    print(self.format_topics())

                for w, cust_id in enumerate(doc):
                    x = self.vocab_embeddings[cust_id]

                    # Remove cust_id from his old table
                    old_table_id = self.table_assignments[d][w]
                    self.table_assignments[d][w] = -1  # Doesn't really make any difference, as only counts are used
                    with self.table_counts.lock:
                        self.table_counts.np[old_table_id] -= 1
                    self.table_counts_per_doc[old_table_id, d] -= 1
                    # Update vector means etc
                    self.sum_squared_table_customers[old_table_id] -= np.outer(x, x)

                    # Topic 'old_table_id' now has one member fewer
                    # Just update params for this customer
                    self.update_table_params(old_table_id, cust_id, is_removed=True)

                    # Under the alias method, we only do the full likelihood computation for topics
                    # that already have a non-zero count in the current document
                    non_zero_tables = np.where(self.table_counts_per_doc[:, d] > 0)[0]
                    if len(non_zero_tables) == 0:
                        # If there's only one word in a doc, there are no topics to compute the full posterior for
                        no_non_zero = True
                    else:
                        no_non_zero = False
                        # We only compute the posterior for these topics
                        log_priors = np.log(self.table_counts_per_doc[non_zero_tables, d])
                        log_likelihoods = np.zeros(len(non_zero_tables), dtype=np.float32)
                        for nz_table, table in enumerate(non_zero_tables):
                            log_likelihoods[nz_table] = self.log_multivariate_tdensity(x, table)
                        log_posterior = log_priors + log_likelihoods

                        # To prevent overflow, subtract log(p_max)
                        max_log_posterior = log_posterior.max()
                        scaled_posterior = log_posterior - max_log_posterior
                        if self.das_normalization:
                            # Not doing this now, but following what the Java impl does, however odd that seems
                            psum = np.sum(np.exp(scaled_posterior))
                        else:
                            # Java impl subtracts max before computing psum, but this seems to be wrong
                            # We still subtract first, but then multiply by the max prob afterwards
                            psum = np.exp(np.log(np.sum(np.exp(scaled_posterior))) + max_log_posterior)
                        # Now just use the scaled log posterior in the same way as in the Java impl
                        # They have a bin-search method for sampling from the cumulative dist,
                        # but we simply normalize and use Numpy to sample
                        unnormed_posterior = np.exp(scaled_posterior)
                        normed_posterior = unnormed_posterior / unnormed_posterior.sum()

                    # Don't let the alias parameters get updated in the middle of the sampling
                    self.aliases.lock.acquire_read(cust_id)
                    select_pr = psum / (psum + self.alpha * self.aliases.likelihood_sum.np[cust_id])

                    # MHV to draw new topic
                    # Take a number of Metropolis-Hastings samples
                    current_sample = old_table_id
                    # Calculate the true likelihood of this word under the current sample,
                    # for calculating acceptance prob
                    current_sample_log_prob = self.log_multivariate_tdensity(x, current_sample)
                    for r in range(self.mh_steps):
                        # 1. Flip a coin
                        if not no_non_zero and np.random.sample() < select_pr:
                            # Choose from the exactly computed posterior dist, only allowing
                            # topics already sampled in the doc
                            temp = np.random.choice(len(non_zero_tables), p=normed_posterior)
                            new_sample = non_zero_tables[temp]
                            stats.log_select_pr(True, select_pr)
                        else:
                            # Choose from the alias, allowing any topic but using slightly
                            # out-of-date likelihoods
                            new_sample = self.aliases.sample_vose(cust_id)
                            stats.log_select_pr(False, select_pr)

                        if new_sample != current_sample:
                            # 2. Find acceptance probability
                            new_sample_log_prob = self.log_multivariate_tdensity(x, new_sample)
                            # This can sometimes generate an overflow warning from Numpy
                            # We don't care, though: in that case acceptance > 1., so we always accept
                            with np.errstate(over="ignore"):
                                # From my reading of:
                                # Li et al. (2014): Reducing the sampling complexity of topic models
                                # the acceptance probability should be as follows:
                                acceptance = \
                                    (self.table_counts_per_doc[new_sample, d] + self.alpha) / \
                                    (self.table_counts_per_doc[current_sample, d] + self.alpha) * \
                                    np.exp(new_sample_log_prob - current_sample_log_prob) * \
                                    (self.table_counts_per_doc[current_sample, d]*np.exp(current_sample_log_prob) +
                                     self.alpha*np.exp(self.aliases.log_likelihoods.np[cust_id, current_sample])) / \
                                    (self.table_counts_per_doc[new_sample, d]*np.exp(new_sample_log_prob) +
                                     self.alpha*np.exp(self.aliases.log_likelihoods.np[cust_id, new_sample]))
                                # The Java implementation, however, does this:
                                #acceptance = \
                                #    (self.table_counts_per_doc[new_table_id, d] + self.alpha) / \
                                #    (self.table_counts_per_doc[current_sample, d] + self.alpha) * \
                                #    np.exp(new_prob - old_prob) * \
                                #    (self.table_counts_per_doc[current_sample, d]*old_log_prob +
                                #     self.alpha*alias.w.np[current_sample]) / \
                                #    (self.table_counts_per_doc[new_table_id, d]*new_log_prob +
                                #     self.alpha*alias.w.np[new_table_id])
                                # The difference is the Java impl doesn't exp the log likelihood in the last
                                # fraction, i.e. it uses a log prob instead of a prob

                            # 3. Compare against uniform[0,1]
                            # If the acceptance prob > 1, we always accept: this means the new sample
                            # has a higher probability than the old
                            if isinf(acceptance) or acceptance >= 1. or np.random.sample() < acceptance:
                                # No need to sample if acceptance >= 1
                                # If the acceptance prob < 1, sample whether to accept or not, such that
                                # the more likely the new sample is compared to the old, the more likely we
                                # are to keep it
                                current_sample = new_sample
                                current_sample_log_prob = new_sample_log_prob
                                stats.log_acceptance(True, acceptance)
                            else:
                                stats.log_acceptance(False, acceptance)
                            # NOTE: There seems to be a small error in the Java implementation here
                            # On the last MH step, it doesn't make any difference whether we accept the
                            # sample or not - we always end up using it
                    self.aliases.lock.release_read()

                    if current_sample == old_table_id:
                        stats.log_sampled_same()
                    else:
                        stats.log_sampled_different()

                    # Now have a new assignment: add its counts
                    self.table_assignments[d][w] = current_sample
                    with self.table_counts.lock:
                        self.table_counts.np[current_sample] += 1
                    self.table_counts_per_doc[current_sample, d] += 1
                    self.sum_squared_table_customers[current_sample] += np.outer(x, x)

                    self.update_table_params(current_sample, cust_id)

            # Pause the alias updater until we start the next iteration
            alias_updater.pause()

            # Output some useful stats about sampling
            if stats.acceptance_used():
                self.log.info(
                    "Acceptance rate = {:.2f}%, mean acceptance: {:.2f} ({:,} samples drawn)".format(
                        stats.acceptance_rate() * 100., stats.mean_acceptance(), stats.acceptance_samples()))
            else:
                self.log.info("No new samples drawn")
            self.log.info("Prior select rate = {:.2f}%, mean select_pr = {:.2f}".format(
                stats.select_pr_rate() * 100., stats.mean_select_pr()))
            self.log.info("Chose new sample: {:.2f}%".format(stats.sample_change_rate() * 100.))

            if self.show_topics is not None:
                print("Topics after iteration {}".format(iteration))
                print(self.format_topics())

                # Compute the overall usage of topics across the training corpus
                topic_props = self.table_counts_per_doc.sum(axis=1).astype(np.float64)
                topic_props /= topic_props.sum()
                print("Words using topics: {}".format(", ".join(
                    "{}={:.1f}%".format(i, prop) for i, prop in enumerate(topic_props * 100.))))
                topic_doc_props = (self.table_counts_per_doc > 0).astype(np.float64).sum(axis=1)
                topic_doc_props /= self.num_documents
                print("Docs using topics: {}".format(", ".join(
                    "{}={:.1f}%".format(i, prop) for i, prop in enumerate(topic_doc_props * 100.))))

            if self.save_path is not None:
                self.log.info("Saving model")
                self.save()
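
# Illustrative sketch, not the VoseAlias/VoseAliasUpdater classes used above: a
# generic, standalone version of the Vose alias method behind
# self.aliases.sample_vose(cust_id). The table is built once in O(K) and then
# supports O(1) draws from a fixed discrete distribution, which is what makes the
# stale-likelihood proposal in the Metropolis-Hastings step cheap. Function names
# here are assumptions for illustration only.

import numpy as np

def build_alias_table(probs):
    """Build Vose alias/probability tables for a discrete distribution."""
    k = len(probs)
    scaled = np.asarray(probs, dtype=np.float64) * k / np.sum(probs)
    prob_table = np.zeros(k)
    alias_table = np.zeros(k, dtype=np.int64)
    small = [i for i, p in enumerate(scaled) if p < 1.]
    large = [i for i, p in enumerate(scaled) if p >= 1.]
    while small and large:
        s, l = small.pop(), large.pop()
        # Column s keeps probability scaled[s]; the rest of the column is aliased to l
        prob_table[s] = scaled[s]
        alias_table[s] = l
        scaled[l] = scaled[l] + scaled[s] - 1.
        (small if scaled[l] < 1. else large).append(l)
    for i in small + large:
        prob_table[i] = 1.
    return prob_table, alias_table

def sample_alias(prob_table, alias_table):
    """Draw one index: pick a column uniformly, then flip a biased coin."""
    i = np.random.randint(len(prob_table))
    return i if np.random.sample() < prob_table[i] else alias_table[i]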
def sample(self, num_iterations):
    """
    for num_iters:
        for each customer:
            remove him from his old_table and update the table params
            if old_table is empty: remove table
            calculate the prior and likelihood for this customer sitting at each table
            sample a table index
            if new_table is equal to old_table, we don't have to update the parameters;
            else update the params of the old table

    """
    for iteration in range(num_iterations):
        self.log.info("Iteration {}".format(iteration))

        pbar = get_progress_bar(len(self.corpus), title="Sampling")
        for d, doc in enumerate(pbar(self.corpus)):
            if self.show_topics is not None and self.show_topics > 0 and d % self.show_topics == 0:
                print("Topics after {:,} docs".format(d))
                print(self.format_topics())

            if len(doc) == 0:
                continue

            audio_doc = self.audio_corpus[d]
            frame_start = 0
            pad = len(audio_doc) // len(doc)

            for w, cust_id in enumerate(doc):
                x = self.vocab_embeddings[cust_id]
                # Remove cust_id from his old table and update that table's parameters
                self.rm_id_table_text(d, w, x, cust_id)

                # Now calculate the prior and likelihood for the customer to sit at each table and sample
                # Go over each table
                counts_text = self.table_counts_per_doc[:, d] + self.alpha
                # Now calculate the likelihood for each table
                log_lls_text = self.log_multivariate_tdensity_tables(x)
                # Add the log prior to get the log posterior vector
                log_posteriors_text = np.log(counts_text) + log_lls_text

                # Sample a table for each audio frame in the window aligned with this word
                for frame_index in range(frame_start, pad):
                    y = audio_doc[frame_index]
                    self.rm_id_table_audio(d, frame_index, y)

                    counts_audio = self.table_counts_per_doc_audio[:, d] + self.alpha
                    log_lls_audio = self.log_multivariate_tdensity_tables(y, "audio")
                    if frame_index == 0:
                        log_prev_token_prob = 0
                    else:
                        log_prev_token_prob = self.log_multivariate_tdensity_tables(
                            audio_doc[frame_index - 1], "audio")

                    # Add the log prior to get the log posterior vector
                    log_posteriors_audio = np.log(counts_audio) + log_lls_audio + log_prev_token_prob
                    log_posteriors = log_posteriors_text + log_posteriors_audio

                    posterior = np.exp(log_posteriors - log_posteriors.max())
                    posterior /= posterior.sum()
                    # Now sample an index from this posterior vector
                    new_table_id = np.random.choice(self.num_tables, p=posterior)
                    self.update_table_audio(d, frame_index, y, new_table_id)

                # Advance the frame window for the next word
                frame_start = pad - 1
                pad += pad - 1
                if pad > len(audio_doc):
                    pad = len(audio_doc)

                log_posteriors = log_posteriors_text + log_posteriors_audio
                # To prevent overflow, subtract log(p_max).
                # When normalizing after exponentiating, each entry will be
                # exp(log p_i - log p_max) / \Sigma_i exp(log p_i - log p_max),
                # so log p_max cancels out and prevents overflow in the exponentiation phase.
                posterior = np.exp(log_posteriors - log_posteriors.max())
                posterior /= posterior.sum()
                # Now sample an index from this posterior vector
                new_table_id = np.random.choice(self.num_tables, p=posterior)

                # Now have a new assignment: add its counts
                self.update_table_text(d, w, x, new_table_id, cust_id)

                #self.check_everything(iteration, d, w)

            #if self.cholesky_decomp:
            #    # After each iteration, recompute the Cholesky decomposition fully, to avoid numerical
            #    # inaccuracies blowing up with the repeated updates
            #    # This also recomputes means
            #    for table in range(self.num_tables):
            #        inv_sigma = self.set_table_parameters(table)
            #        self.table_cholesky_ltriangular_mat[table] = cholesky(inv_sigma)

        if self.show_topics is not None:
            print("Topics after iteration {}".format(iteration))
            print(self.format_topics())

        if self.save_path is not None:
            self.log.info("Saving model")
            self.save()