Example #1
    def initialize(self):
        """
        Initialize the gibbs sampler state.

        I start with log N tables and randomly initialize customers to those tables.

        """
        # First check the prior degrees of freedom.
        # It has to be >= num_dimension
        if self.prior.nu < self.embedding_size:
            self.log.warn(
                "The initial degrees of freedom of the prior is less than the dimension!. "
                "Setting it to the number of dimensions: {}".format(
                    self.embedding_size))
            self.prior.nu = self.embedding_size

        deg_of_freedom = self.prior.nu - self.embedding_size + 1
        # Now calculate the covariance matrix of the multivariate T-distribution
        coeff = (self.prior.kappa + 1.) / (self.prior.kappa * deg_of_freedom)
        sigma_T = self.prior.sigma * coeff
        # This features in the original code, but doesn't get used
        # Or is it just there to check that the inversion doesn't fail?
        #sigma_Tinv = inv(sigma_T)
        sigma_TDet_sign, sigma_TDet = slogdet(sigma_T)
        if sigma_TDet_sign != 1:
            raise ValueError(
                "sign of log determinant of initial sigma is {}".format(
                    sigma_TDet_sign))

        # Reset the sums of squared customer vectors; each customer's
        # contribution is added back as they are assigned below.
        self.sum_squared_table_customers[:] = 0
        # Means are set to the prior and then updated as we add each assignment
        self.table_means.np[:] = self.prior.mu

        # Initialize the cholesky decomp of each table, with no counts yet
        for table in range(self.num_tables):
            self.table_cholesky_ltriangular_mat.np[
                table] = self.prior.chol_sigma.copy()

        # Randomly assign customers to tables
        self.table_assignments = []
        pbar = get_progress_bar(len(self.corpus),
                                title="Initializing",
                                show_progress=self.show_progress)
        for doc_num, doc in enumerate(pbar(self.corpus)):
            tables = list(np.random.randint(self.num_tables, size=len(doc)))
            self.table_assignments.append(tables)
            for (word, table) in zip(doc, tables):
                self.table_counts.np[table] += 1
                self.table_counts_per_doc[table, doc_num] += 1
                # update the sumTableCustomers
                self.sum_squared_table_customers[table] += np.outer(
                    self.vocab_embeddings[word], self.vocab_embeddings[word])

                self.update_table_params(table, word)
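
The degrees-of-freedom check and the covariance scaling at the top of initialize() can be exercised on their own. Below is a minimal standalone sketch of that computation, assuming a Normal-Inverse-Wishart-style prior with attributes mirroring those used above (sigma, kappa, nu); the helper name is hypothetical and not part of the class.

import numpy as np
from numpy.linalg import slogdet

def predictive_t_covariance(sigma, kappa, nu, embedding_size):
    """Covariance of the multivariate t-distribution used as the prior predictive.

    Hypothetical standalone helper; mirrors the checks in initialize() above.
    """
    # Degrees of freedom must be at least the embedding dimension
    nu = max(nu, embedding_size)
    deg_of_freedom = nu - embedding_size + 1
    coeff = (kappa + 1.) / (kappa * deg_of_freedom)
    sigma_T = sigma * coeff
    # slogdet doubles as a positive-definiteness check
    sign, log_det = slogdet(sigma_T)
    if sign != 1:
        raise ValueError("sign of log determinant of sigma_T is {}".format(sign))
    return sigma_T, log_det

# Example with a well-conditioned 5-dimensional prior
dim = 5
sigma_T, log_det = predictive_t_covariance(np.eye(dim), kappa=0.1, nu=dim, embedding_size=dim)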
Example #2
    def sample(self, num_iterations):
        """
        for num_iters:
            for each customer
                remove him from his old_table and update the table params.
                if old_table is empty:
                    remove table
                Calculate prior and likelihood for this customer sitting at each table
                sample for a table index
                if new_table is equal to old_table
                    don't have to update the parameters
                else update params of the old table.
        """
        for iteration in range(num_iterations):
            self.log.info("Iteration {}".format(iteration))

            pbar = get_progress_bar(len(self.corpus), title="Sampling")
            for d, doc in enumerate(pbar(self.corpus)):
                if self.show_topics is not None and self.show_topics > 0 and d % self.show_topics == 0:
                    print("Topics after {:,} docs".format(d))
                    print(self.format_topics())

                for w, cust_id in enumerate(doc):
                    x = self.vocab_embeddings[cust_id]

                    # Remove custId from his old_table
                    old_table_id = self.table_assignments[d][w]
                    self.table_assignments[d][
                        w] = -1  # Doesn't really make any difference, as only counts are used
                    self.table_counts[old_table_id] -= 1
                    self.table_counts_per_doc[old_table_id, d] -= 1
                    # Update vector means etc
                    self.sum_table_customers[old_table_id] -= x
                    self.sum_squared_table_customers[old_table_id] -= np.outer(
                        x, x)

                    # Topic 'old_table_id' now has one member fewer
                    if self.cholesky_decomp:
                        # Just update params for this customer
                        self.update_table_params_chol(old_table_id,
                                                      cust_id,
                                                      is_removed=True)
                    else:
                        # Now recalculate table parameters for this table
                        self.set_table_parameters(old_table_id)

                    #self.check_everything(iteration, d, w, mid_sample=True)

                    # Now calculate the prior and likelihood for the customer to sit in each table and sample
                    # Go over each table
                    counts = self.table_counts_per_doc[:, d] + self.alpha
                    # Now calculate the likelihood for each table
                    log_lls = self.log_multivariate_tdensity_tables(x)
                    # Add log prior in the posterior vector
                    log_posteriors = np.log(counts) + log_lls
                    # To prevent overflow, subtract log(p_max).
                    # When we normalize after exponentiating, each entry becomes
                    # exp(log p_i - log p_max) / \Sigma_i exp(log p_i - log p_max),
                    # so the log p_max cancels out and prevents overflow during exponentiation.
                    posterior = np.exp(log_posteriors - log_posteriors.max())
                    posterior /= posterior.sum()
                    # Now sample an index from this posterior vector.
                    new_table_id = np.random.choice(self.num_tables,
                                                    p=posterior)

                    # Now have a new assignment: add its counts
                    self.table_assignments[d][w] = new_table_id
                    self.table_counts[new_table_id] += 1
                    self.table_counts_per_doc[new_table_id, d] += 1
                    self.sum_table_customers[new_table_id] += x
                    self.sum_squared_table_customers[new_table_id] += np.outer(
                        x, x)

                    if self.cholesky_decomp:
                        self.update_table_params_chol(new_table_id, cust_id)
                    else:
                        self.set_table_parameters(new_table_id)

                    #self.check_everything(iteration, d, w)

                #if self.cholesky_decomp:
                #    # After each iteration, recompute the Cholesky decomposition fully, to avoid numerical inaccuracies
                #    # blowing up with the repeated updates
                #    # This also recomputes means
                #    for table in range(self.num_tables):
                #        inv_sigma = self.set_table_parameters(table)
                #        self.table_cholesky_ltriangular_mat[table] = cholesky(inv_sigma)

            if self.show_topics is not None:
                print("Topics after iteration {}".format(iteration))
                print(self.format_topics())

            if self.save_path is not None:
                self.log.info("Saving model")
                self.save()
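
The table-sampling step above uses the standard max-subtraction trick before exponentiating, so the posterior can be normalized without overflow. Here is a minimal sketch of just that step, with hypothetical inputs standing in for table_counts_per_doc[:, d] and log_multivariate_tdensity_tables(x):

import numpy as np

def sample_table(doc_table_counts, log_likelihoods, alpha):
    """Sample a table index from log posterior = log(count + alpha) + log likelihood.

    Sketch only; the argument names are stand-ins for the arrays used in sample() above.
    """
    log_posteriors = np.log(doc_table_counts + alpha) + log_likelihoods
    # Subtracting the max keeps exp() in range; the constant cancels on normalization
    posterior = np.exp(log_posteriors - log_posteriors.max())
    posterior /= posterior.sum()
    return np.random.choice(len(posterior), p=posterior)

# Usage with made-up numbers (log likelihoods are typically large negative values)
new_table = sample_table(np.array([3., 0., 1.]),
                         np.array([-1200.5, -1190.2, -1210.0]),
                         alpha=0.5)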
Example #3
    def initialize(self):
        """
        Initialize the gibbs sampler state.

        I start with log N tables and randomly initialize customers to those tables.

        """
        # First check the prior degrees of freedom.
        # It has to be >= num_dimension
        if self.prior.nu < self.embedding_size:
            self.log.warn(
                "The initial degrees of freedom of the prior is less than the dimension!. "
                "Setting it to the number of dimensions: {}".format(
                    self.embedding_size))
            self.prior.nu = self.embedding_size

        deg_of_freedom = self.prior.nu - self.embedding_size + 1
        # Now calculate the covariance matrix of the multivariate T-distribution
        coeff = (self.prior.kappa + 1.) / (self.prior.kappa * deg_of_freedom)
        sigma_T = self.prior.sigma * coeff
        # This features in the original code, but doesn't get used
        # Or is it just there to check that the inversion doesn't fail?
        #sigma_Tinv = inv(sigma_T)
        sigma_TDet_sign, sigma_TDet = slogdet(sigma_T)
        if sigma_TDet_sign != 1:
            raise ValueError(
                "sign of log determinant of initial sigma is {}".format(
                    sigma_TDet_sign))

        # Reset the per-table sums and sums of squares of customer vectors;
        # each customer's contribution is added back as they are assigned below.
        self.sum_table_customers[:] = 0
        self.sum_squared_table_customers[:] = 0
        # With Cholesky: Means are set to the prior and then updated as we add each assignment
        # Without: Means are computed fully for each table after initialization
        self.table_means[:] = self.prior.mu
        # With Cholesky: This is ignored - we never use table_inverse_covariances
        # Without: This gets computed after initialization
        self.table_inverse_covariances[:] = 0

        # Initialize the cholesky decomp of each table, with no counts yet
        for table in range(self.num_tables):
            self.table_cholesky_ltriangular_mat[
                table] = self.prior.chol_sigma.copy()

        # Randomly assign customers to tables
        self.table_assignments = []
        pbar = get_progress_bar(len(self.corpus), title="Initializing")
        for doc_num, doc in enumerate(pbar(self.corpus)):
            tables = list(np.random.randint(self.num_tables, size=len(doc)))
            self.table_assignments.append(tables)
            for (word, table) in zip(doc, tables):
                self.table_counts[table] += 1
                self.table_counts_per_doc[table, doc_num] += 1
                # update the sumTableCustomers
                self.sum_table_customers[table] += word
                self.sum_squared_table_customers[table] += np.outer(word, word)

                if self.cholesky_decomp:
                    self.update_table_params_chol(table, word)

            #self.check_everything(-1, doc_num, -1)

        # Now compute the table parameters of each table
        # Go over each table.
        if not self.cholesky_decomp:
            for table in range(self.num_tables):
                self.set_table_parameters(table)
        else:
            # Make sure we don't accidentally use the inverse covariances anywhere, as they don't
            # get updated
            self.table_inverse_covariances = None
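
update_table_params_chol is not shown in this listing, but it presumably keeps each table's Cholesky factor consistent with its scatter matrix via rank-1 updates as customers are added and removed. The sketch below is the textbook rank-1 update (not this repository's implementation), shown because it is the operation such an update routine would be built on.

import numpy as np

def chol_rank1_update(L, x):
    """Return the lower-triangular factor of L @ L.T + np.outer(x, x).

    Standard rank-1 Cholesky update; a sketch of the kind of incremental step
    update_table_params_chol is assumed to perform when a customer joins a table.
    """
    L = L.copy()
    x = np.asarray(x, dtype=float).copy()
    n = x.shape[0]
    for k in range(n):
        r = np.hypot(L[k, k], x[k])
        c, s = r / L[k, k], x[k] / L[k, k]
        L[k, k] = r
        if k + 1 < n:
            L[k + 1:, k] = (L[k + 1:, k] + s * x[k + 1:]) / c
            x[k + 1:] = c * x[k + 1:] - s * L[k + 1:, k]
    return L

# Check against refactorizing from scratch
A = np.array([[4., 1.], [1., 3.]])
v = np.array([0.5, -0.2])
L = np.linalg.cholesky(A)
assert np.allclose(chol_rank1_update(L, v), np.linalg.cholesky(A + np.outer(v, v)))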
Example #4
    def sample(self, num_iterations):
        """
        for num_iters:
            for each customer
                remove him from his old_table and update the table params.
                if old_table is empty:
                    remove table
                Calculate prior and likelihood for this customer sitting at each table
                sample for a table index
                if new_table is equal to old_table
                    don't have to update the parameters
                else update params of the old table.
        """
        if self.show_topics is not None:
            print("Topics after initialization")
            print(self.format_topics())
            # Compute the overall usage of topics across the training corpus
            topic_props = self.table_counts_per_doc.sum(axis=1).astype(
                np.float64)
            topic_props /= topic_props.sum()
            print("Words using topics: {}".format(", ".join(
                "{}={:.1f}%".format(i, prop)
                for i, prop in enumerate(topic_props * 100.))))
            topic_doc_props = (self.table_counts_per_doc > 0).astype(
                np.float64).sum(axis=1)
            topic_doc_props /= self.num_documents
            print("Docs using topics: {}".format(", ".join(
                "{}={:.1f}%".format(i, prop)
                for i, prop in enumerate(topic_doc_props * 100.))))

        with VoseAliasUpdater(
                self.aliases,
                self.vocab_embeddings,
                self.prior.kappa,
                self.prior.nu,
                self.table_counts,
                self.table_means,
                self.table_cholesky_ltriangular_mat,
                self.log_determinants,
                das_normalization=self.das_normalization,
        ) as alias_updater:
            for iteration in range(num_iterations):
                stats = SamplingDiagnostics()
                self.log.info("Iteration {}".format(iteration))

                alias_updater.unpause()
                pbar = get_progress_bar(len(self.corpus),
                                        title="Sampling",
                                        show_progress=self.show_progress)
                for d, doc in enumerate(pbar(self.corpus)):
                    if self.show_topics is not None and self.show_topics > 0 and d % self.show_topics == 0:
                        print("Topics after {:,} docs".format(d))
                        print(self.format_topics())

                    for w, cust_id in enumerate(doc):
                        x = self.vocab_embeddings[cust_id]

                        # Remove custId from his old_table
                        old_table_id = self.table_assignments[d][w]
                        self.table_assignments[d][
                            w] = -1  # Doesn't really make any difference, as only counts are used
                        with self.table_counts.lock:
                            self.table_counts.np[old_table_id] -= 1
                        self.table_counts_per_doc[old_table_id, d] -= 1
                        # Update vector means etc
                        self.sum_squared_table_customers[
                            old_table_id] -= np.outer(x, x)

                        # Topic 'old_table_id' now has one member fewer
                        # Just update params for this customer
                        self.update_table_params(old_table_id,
                                                 cust_id,
                                                 is_removed=True)

                        # Under the alias method, we only do the full likelihood computation for topics
                        # that already have a non-zero count in the current document
                        non_zero_tables = np.where(
                            self.table_counts_per_doc[:, d] > 0)[0]
                        if len(non_zero_tables) == 0:
                            # If there's only one word in a doc, there are no topics to compute the full posterior for
                            no_non_zero = True
                        else:
                            no_non_zero = False
                            # We only compute the posterior for these topics
                            log_priors = np.log(
                                self.table_counts_per_doc[non_zero_tables, d])
                            log_likelihoods = np.zeros(len(non_zero_tables),
                                                       dtype=np.float32)
                            for nz_table, table in enumerate(non_zero_tables):
                                log_likelihoods[
                                    nz_table] = self.log_multivariate_tdensity(
                                        x, table)
                            log_posterior = log_priors + log_likelihoods

                            # To prevent overflow, subtract by log(p_max)
                            max_log_posterior = log_posterior.max()
                            scaled_posterior = log_posterior - max_log_posterior
                            if self.das_normalization:
                                # Not doing this now, but following what the Java impl does, however odd that seems
                                psum = np.sum(np.exp(scaled_posterior))
                            else:
                                # Java impl subtracts max before computing psum, but this seems to be wrong
                                # We still subtract first, but then multiply by the max prob afterwards
                                psum = np.exp(
                                    np.log(np.sum(np.exp(scaled_posterior))) +
                                    max_log_posterior)
                            # Now just use the scaled log posterior in the same way as in the Java impl
                            # They have a bin-search method for sampling from the cumulative dist,
                            # but we simply normalize and use Numpy to sample
                            unnormed_posterior = np.exp(scaled_posterior)
                            normed_posterior = unnormed_posterior / unnormed_posterior.sum()

                        # Don't let the alias parameters get updated in the middle of the sampling
                        self.aliases.lock.acquire_read(cust_id)
                        if no_non_zero:
                            # psum is only computed when the doc has non-zero tables;
                            # with no exact posterior available, always fall back to the alias
                            select_pr = 0.
                        else:
                            select_pr = psum / (
                                psum + self.alpha *
                                self.aliases.likelihood_sum.np[cust_id])

                        # MHV to draw new topic
                        # Take a number of Metropolis-Hastings samples
                        current_sample = old_table_id
                        # Calculate the true likelihood of this word under the current sample,
                        # for calculating acceptance prob
                        current_sample_log_prob = self.log_multivariate_tdensity(
                            x, current_sample)
                        for r in range(self.mh_steps):
                            # 1. Flip a coin
                            if not no_non_zero and np.random.sample() < select_pr:
                                # Choose from the exactly computed posterior dist, only allowing
                                # topics already sampled in the doc
                                temp = np.random.choice(len(non_zero_tables),
                                                        p=normed_posterior)
                                new_sample = non_zero_tables[temp]
                                stats.log_select_pr(True, select_pr)
                            else:
                                # Choose from the alias, allowing any topic but using slightly
                                # out-of-date likelihoods
                                new_sample = self.aliases.sample_vose(cust_id)
                                stats.log_select_pr(False, select_pr)

                            if new_sample != current_sample:
                                # 2. Find acceptance probability
                                new_sample_log_prob = self.log_multivariate_tdensity(
                                    x, new_sample)
                                # This can sometimes generate an overflow warning from Numpy
                                # We don't care, though: in that case acceptance > 1., so we always accept
                                with np.errstate(over="ignore"):
                                    # From my reading of:
                                    # Li et al. (2014): Reducing the sampling complexity of topic models
                                    # the acceptance probability should be as follows:
                                    acceptance = \
                                        (self.table_counts_per_doc[new_sample, d] + self.alpha) / \
                                        (self.table_counts_per_doc[current_sample, d] + self.alpha) * \
                                        np.exp(new_sample_log_prob - current_sample_log_prob) * \
                                        (self.table_counts_per_doc[current_sample, d]*np.exp(current_sample_log_prob) +
                                         self.alpha*np.exp(self.aliases.log_likelihoods.np[cust_id, current_sample])) / \
                                        (self.table_counts_per_doc[new_sample, d]*np.exp(new_sample_log_prob) +
                                         self.alpha*np.exp(self.aliases.log_likelihoods.np[cust_id, new_sample]))
                                    # The Java implementation, however, does this:
                                    #acceptance = \
                                    #    (self.table_counts_per_doc[new_table_id, d] + self.alpha) / \
                                    #    (self.table_counts_per_doc[current_sample, d] + self.alpha) * \
                                    #    np.exp(new_prob - old_prob) * \
                                    #    (self.table_counts_per_doc[current_sample, d]*old_log_prob +
                                    #     self.alpha*alias.w.np[current_sample]) / \
                                    #    (self.table_counts_per_doc[new_table_id, d]*new_log_prob +
                                    #     self.alpha*alias.w.np[new_table_id])
                                    # The difference is the Java impl doesn't exp the log likelihood in the last
                                    # fraction, i.e. it uses a log prob instead of a prob
                                # 3. Compare against uniform[0,1]
                                # If the acceptance prob > 1, we always accept: this means the new sample
                                # has a higher probability than the old
                                if (isinf(acceptance) or acceptance >= 1.
                                        or np.random.sample() < acceptance):
                                    # No need to sample if acceptance >= 1
                                    # If the acceptance prob < 1, sample whether to accept or not, such that
                                    # the more likely the new sample is compared to the old, the more likely we
                                    # are to keep it
                                    current_sample = new_sample
                                    current_sample_log_prob = new_sample_log_prob
                                    stats.log_acceptance(True, acceptance)
                                else:
                                    stats.log_acceptance(False, acceptance)
                                # NOTE: There seems to be a small error in the Java implementation here
                                # On the last MH step, it doesn't make any difference whether we accept the
                                # sample or not - we always end up using it
                        self.aliases.lock.release_read()

                        if current_sample == old_table_id:
                            stats.log_sampled_same()
                        else:
                            stats.log_sampled_different()

                        # Now have a new assignment: add its counts
                        self.table_assignments[d][w] = current_sample
                        with self.table_counts.lock:
                            self.table_counts.np[current_sample] += 1
                        self.table_counts_per_doc[current_sample, d] += 1
                        self.sum_squared_table_customers[
                            current_sample] += np.outer(x, x)

                        self.update_table_params(current_sample, cust_id)

                # Pause the alias updater until we start the next iteration
                alias_updater.pause()

                # Output some useful stats about sampling
                if stats.acceptance_used():
                    self.log.info(
                        "Acceptance rate = {:.2f}%, mean acceptance: {:.2f} ({:,} samples draw)"
                        .format(stats.acceptance_rate() * 100.,
                                stats.mean_acceptance(),
                                stats.acceptance_samples()))
                else:
                    self.log.info("No new samples drawn")
                self.log.info(
                    "Prior select rate = {:.2f}%, mean select_pr = {:.2f}".
                    format(stats.select_pr_rate() * 100.,
                           stats.mean_select_pr()))
                self.log.info("Chose new sample: {:.2f}%".format(
                    stats.sample_change_rate() * 100.))

                if self.show_topics is not None:
                    print("Topics after iteration {}".format(iteration))
                    print(self.format_topics())
                    # Compute the overall usage of topics across the training corpus
                    topic_props = self.table_counts_per_doc.sum(axis=1).astype(
                        np.float64)
                    topic_props /= topic_props.sum()
                    print("Words using topics: {}".format(", ".join(
                        "{}={:.1f}%".format(i, prop)
                        for i, prop in enumerate(topic_props * 100.))))
                    topic_doc_props = (self.table_counts_per_doc > 0).astype(
                        np.float64).sum(axis=1)
                    topic_doc_props /= self.num_documents
                    print("Docs using topics: {}".format(", ".join(
                        "{}={:.1f}%".format(i, prop)
                        for i, prop in enumerate(topic_doc_props * 100.))))

                if self.save_path is not None:
                    self.log.info("Saving model")
                    self.save()
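
sample_vose above draws a topic in O(1) from a slightly stale per-word likelihood table. The alias structure itself is not shown in this listing; the sketch below is a generic implementation of Vose's alias method, which is the technique VoseAliasUpdater is assumed to maintain, not the repository's own code.

import numpy as np

def build_alias(probs):
    """Vose's alias method: O(n) table construction for O(1) categorical sampling."""
    probs = np.asarray(probs, dtype=float)
    n = len(probs)
    prob = np.zeros(n)
    alias = np.zeros(n, dtype=int)
    scaled = probs * n / probs.sum()
    small = [i for i, p in enumerate(scaled) if p < 1.]
    large = [i for i, p in enumerate(scaled) if p >= 1.]
    while small and large:
        s, l = small.pop(), large.pop()
        prob[s], alias[s] = scaled[s], l
        scaled[l] -= 1. - scaled[s]
        (small if scaled[l] < 1. else large).append(l)
    for i in small + large:
        prob[i] = 1.  # leftovers are 1 up to rounding error
    return prob, alias

def sample_alias(prob, alias):
    i = np.random.randint(len(prob))  # pick a bucket uniformly
    return i if np.random.random_sample() < prob[i] else alias[i]

# Usage: repeated draws approximate the original distribution
prob, alias = build_alias([0.5, 0.3, 0.2])
draws = [sample_alias(prob, alias) for _ in range(10000)]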
Example #5
    def sample(self, num_iterations):
        """
        for num_iters:
            for each customer
                remove him from his old_table and update the table params.
                if old_table is empty:
                    remove table
                Calculate prior and likelihood for this customer sitting at each table
                sample for a table index
                if new_table is equal to old_table
                    don't have to update the parameters
                else update params of the old table.
        """
        for iteration in range(num_iterations):
            self.log.info("Iteration {}".format(iteration))
            
            pbar = get_progress_bar(len(self.corpus), title="Sampling")
            for d, doc in enumerate(pbar(self.corpus)):
                if self.show_topics is not None and self.show_topics > 0 and d % self.show_topics == 0:
                    print("Topics after {:,} docs".format(d))
                    print(self.format_topics())
                if len(doc) == 0:
                    continue
                audio_doc = self.audio_corpus[d]
                frame_start = 0
                pad = len(audio_doc) // len(doc)
                for w, cust_id in enumerate(doc):
                    x = self.vocab_embeddings[cust_id]
                    self.rm_id_table_text(d, w, x, cust_id)
                    # Now calculate the prior and likelihood for the customer to sit in each table and sample
                    # Go over each table
                    counts_text = self.table_counts_per_doc[:, d] + self.alpha
                    # Now calculate the likelihood for each table
                    log_lls_text = self.log_multivariate_tdensity_tables(x)
                    
                    # Add log prior in the posterior vector
                    log_posteriors_text = np.log(counts_text) + log_lls_text
 
                    for frame_index in range(frame_start, pad):
                        y = audio_doc[frame_index]
                        self.rm_id_table_audio(d, frame_index, y)
                        counts_audio = self.table_counts_per_doc_audio[:, d] + self.alpha
                        log_lls_audio = self.log_multivariate_tdensity_tables(y, "audio")
                        
                        if frame_index == 0:
                            log_prev_token_prob = 0
                        else:
                            log_prev_token_prob = self.log_multivariate_tdensity_tables(
                                audio_doc[frame_index - 1], "audio")

                        # Add log prior in the posterior vector
                        log_posteriors_audio = np.log(counts_audio) + log_lls_audio + log_prev_token_prob
                        log_posteriors = log_posteriors_text + log_posteriors_audio
 
                        posterior = np.exp(log_posteriors - log_posteriors.max())
                        posterior /= posterior.sum()
                        # Now sample an index from this posterior vector.
                        new_table_id = np.random.choice(self.num_tables, p=posterior)
                        self.update_table_audio(d, frame_index, y, new_table_id)
                    
                    frame_start = pad - 1
                    pad += pad - 1
                    if pad > len(audio_doc):
                        pad = len(audio_doc)
                                    
                    log_posteriors = log_posteriors_text + log_posteriors_audio
                    # To prevent overflow, subtract log(p_max).
                    # When we normalize after exponentiating, each entry becomes
                    # exp(log p_i - log p_max) / \Sigma_i exp(log p_i - log p_max),
                    # so the log p_max cancels out and prevents overflow during exponentiation.
                    posterior = np.exp(log_posteriors - log_posteriors.max())
                    posterior /= posterior.sum()
                    # Now sample an index from this posterior vector.
                    new_table_id = np.random.choice(self.num_tables, p=posterior)

                    # Now have a new assignment: add its counts
                    self.update_table_text(d, w, x, new_table_id, cust_id)

                    #self.check_everything(iteration, d, w)

                #if self.cholesky_decomp:
                #    # After each iteration, recompute the Cholesky decomposition fully, to avoid numerical inaccuracies
                #    # blowing up with the repeated updates
                #    # This also recomputes means
                #    for table in range(self.num_tables):
                #        inv_sigma = self.set_table_parameters(table)
                #        self.table_cholesky_ltriangular_mat[table] = cholesky(inv_sigma)

            if self.show_topics is not None:
                print("Topics after iteration {}".format(iteration))
                print(self.format_topics())

            if self.save_path is not None:
                self.log.info("Saving model")
                self.save()
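
In this variant the text and audio log posteriors for a word are simply added before sampling, which amounts to treating the word embedding and its aligned audio frames as conditionally independent given the table. A minimal sketch of that combination step, with hypothetical inputs in place of the class's arrays:

import numpy as np

def sample_joint_table(log_posteriors_text, log_posteriors_audio):
    """Combine per-table log posteriors from two modalities and sample one table.

    Sketch only; adding the log vectors mirrors how sample() above combines
    log_posteriors_text and log_posteriors_audio under a conditional-independence assumption.
    """
    log_posteriors = np.asarray(log_posteriors_text) + np.asarray(log_posteriors_audio)
    posterior = np.exp(log_posteriors - log_posteriors.max())  # max-subtraction for stability
    posterior /= posterior.sum()
    return np.random.choice(len(posterior), p=posterior)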