Example #1
    def predict_log_probs(self, test_data):
        num_points = test_data["coordinates"].shape[0]
        topic_log_prob_vector = np.zeros((self.num_topics, num_points))
        # topic_prob_vector = np.ones((self.num_topics, num_points))

        for z in range(self.num_topics):
            try:
                rv = stats.multivariate_normal(mean=self.topic_centers[z, :],
                                               cov=self.topic_covar[z, :, :], allow_singular=True)
                loc_log_prob = rv.logpdf(test_data["coordinates"]).reshape((num_points, 1))  # Nx1
                # loc_prob = rv.pdf(test_data["coordinates"]).reshape((num_points, 1))  # Nx1

            except Exception:
                print("Error while computing geo log probabilities for test, dumping data.", "\n",
                      self.topic_centers[z, :], "\n", self.topic_covar[z, :, :], file=sys.stderr)
                traceback.print_stack(file=sys.stderr)
                sys.exit(1)

            feature_log_prob = np.zeros((num_points, 1))
            # feature_prob = np.ones((num_points, 1))
            for feature in self.beta_arrays.keys():
                beta = self.beta_arrays[feature][z]
                num_words = beta.shape[0]
                feature_log_prob += test_data[feature] * np.log(beta.reshape((num_words, 1)))  # (NxV * Vx1) = Nx1
                # feature_prob *= np.prod(np.power(beta.reshape((num_words, 1)).T, test_data[feature].todense()), axis=1)

            topic_log_prob_vector[z] = (loc_log_prob +
                                        feature_log_prob +
                                        np.tile(np.log(self.theta[0, z]), (num_points, 1))).flatten()
            # topic_prob_vector[z] = np.multiply(np.multiply(loc_prob, feature_prob),
            #                                    np.tile(self.theta[0, z], (num_points, 1))).flatten()

        # print("direct")
        # print(np.sum(np.log(np.sum(topic_prob_vector, axis=0))))
        return np.sum(utils.log_sum(topic_log_prob_vector, axis=0))
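Every example on this page leans on utils.log_sum, which is not shown here. From the way it is called (reducing a k x N matrix of log-probabilities along one axis), it is presumably a numerically stable log-sum-exp. A minimal sketch under that assumption, with the name and signature taken from the calls above and the body being a guess:

import numpy as np

def log_sum(log_matrix, axis=0):
    # Assumed stand-in for utils.log_sum: log(sum(exp(log_matrix), axis)) computed
    # stably by factoring out the per-slice maximum before exponentiating.
    max_vals = np.max(log_matrix, axis=axis, keepdims=True)
    summed = np.sum(np.exp(log_matrix - max_vals), axis=axis)
    return np.squeeze(max_vals, axis=axis) + np.log(summed)

scipy.special.logsumexp with the same axis argument computes the same quantity and could be used instead.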
Example #2
def compute_probabilities_from_mixture(model, data):
    """
    Compute data probabilities, but by substituting Gaussian mixture of all topic distributions as geographical
    distribution, so all points are drawn from the same distribution.
    :param model:
    :param coordinates:
    :return:
    """
    theta = model.theta
    beta_arrays = model.beta_arrays
    num_points = data["coordinates"].shape[0]

    geo_log_prob = np.zeros((model.num_topics, num_points))  # kxN
    feature_log_prob = np.zeros((model.num_topics, num_points))  # kxN

    for z in range(model.num_topics):
        # Compute feature probabilities
        for feature in beta_arrays.keys():
            beta = beta_arrays[feature]  # k x V

            feature_log_prob[z] += np.log(beta[z]) * data[feature].T  # (1 x V) @ (V x N) = 1 x N, row z of the k x N matrix

        # Compute geographical probabilities
        rv = multivariate_normal(mean=model.topic_centers[z, :],
                                 cov=model.topic_covar[z, :, :],
                                 allow_singular=True)

        geo_log_prob[z] += rv.logpdf(data["coordinates"])

        log_theta_for_z = np.log(theta[0, z])
        geo_log_prob[z] += log_theta_for_z
        feature_log_prob[z] += log_theta_for_z

    # Log-sum-exp over topics, then sum over data points
    final_geo_log_prob = log_sum(geo_log_prob, axis=0)
    final_feature_log_prob = log_sum(feature_log_prob, axis=0)

    return np.sum(final_geo_log_prob + final_feature_log_prob)
Example #3
def compute_probabilities_from_mixture(model, data):
    """
    Compute data probabilities, but by substituting Gaussian mixture of all topic distributions as geographical
    distribution, so all points are drawn from the same distribution.
    :param model:
    :param coordinates:
    :return:
    """
    theta = model.theta
    beta_arrays = model.beta_arrays
    num_points = data["coordinates"].shape[0]

    geo_log_prob = np.zeros((model.num_topics, num_points))  # kxN
    feature_log_prob = np.zeros((model.num_topics, num_points))  # kxN

    for z in range(model.num_topics):
        # Compute feature probabilities
        for feature in beta_arrays.keys():
            beta = beta_arrays[feature]  # k x V

            feature_log_prob[z] += np.log(beta[z]) * data[feature].T  # (1 x V) @ (V x N) = 1 x N, row z of the k x N matrix

        # Compute geographical probabilities
        rv = multivariate_normal(mean=model.topic_centers[z, :],
                                 cov=model.topic_covar[z, :, :],
                                 allow_singular=True)

        geo_log_prob[z] += rv.logpdf(data["coordinates"])

        log_theta_for_z = np.log(theta[0, z])
        geo_log_prob[z] += log_theta_for_z
        feature_log_prob[z] += log_theta_for_z

    # Log-sum-exp over topics, then sum over data points
    final_geo_log_prob = log_sum(geo_log_prob, axis=0)
    final_feature_log_prob = log_sum(feature_log_prob, axis=0)

    return np.sum(final_geo_log_prob + final_feature_log_prob)
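A possible usage sketch for compute_probabilities_from_mixture. Everything below is illustrative: the SimpleNamespace stands in for the project's real model object and only carries the attributes the function actually reads, the feature key "words" and all sizes are invented, and the module is assumed to provide multivariate_normal (from scipy.stats) and log_sum as used above. The sparse matrix class is used deliberately, since * on it means matrix multiplication, which these snippets rely on.

import numpy as np
from types import SimpleNamespace
from scipy import sparse

k, V, N = 2, 5, 10
rng = np.random.default_rng(0)

# Hypothetical stand-in exposing only the attributes read by the function.
model = SimpleNamespace(
    num_topics=k,
    theta=np.full((1, k), 1.0 / k),                               # 1 x k topic weights
    topic_centers=rng.normal(size=(k, 2)),                        # k x 2 Gaussian means
    topic_covar=np.tile(np.eye(2), (k, 1, 1)),                    # k x 2 x 2 covariances
    beta_arrays={"words": rng.dirichlet(np.ones(V), size=k)},     # one k x V distribution per feature
)

data = {
    "coordinates": rng.normal(size=(N, 2)),                       # N x 2 locations
    "words": sparse.csr_matrix(rng.integers(0, 3, size=(N, V))),  # N x V word counts
}

print(compute_probabilities_from_mixture(model, data))            # summed log-probability (scalar)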
Example #4
    def get_topic_unigram(m_array, h_array):
        """
        m_array: 1 x V
        h_array: k x V
        beta_array: k x V
        """
        beta_array = m_array + h_array
        norm_sum = utils.log_sum(beta_array, axis=1)

        try:
            beta_array = np.exp(beta_array.T - norm_sum).T
        except Exception:
            sys.stderr.write(str(beta_array))
            sys.stderr.write("\n")
            sys.stderr.write(str(norm_sum))
            sys.stderr.write("\n")
        return beta_array
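get_topic_unigram exponentiates and row-normalizes log-space scores. A quick check of that property with toy numbers, using scipy.special.logsumexp in place of utils.log_sum (assumed equivalent):

import numpy as np
from scipy.special import logsumexp

m_array = np.log([[0.2, 0.3, 0.5]])             # 1 x V, toy values
h_array = np.array([[0.1, -0.4, 0.0],
                    [0.0,  0.2, -0.1]])         # k x V, toy values

beta_array = m_array + h_array                  # unnormalized log scores, k x V
norm_sum = logsumexp(beta_array, axis=1)        # plays the role of utils.log_sum
beta_array = np.exp(beta_array.T - norm_sum).T  # same operation as the method body

print(beta_array.sum(axis=1))                   # ~[1. 1.]: each topic's unigram distribution sums to one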
Example #5
    def get_topic_unigram(m_array, h_array):
        """
        m_array: 1 x V
        h_array: k x V
        beta_array: k x V
        """
        beta_array = m_array + h_array
        norm_sum = utils.log_sum(beta_array, axis=1)

        try:
            beta_array = np.exp(beta_array.T - norm_sum).T
        except Exception:
            sys.stderr.write(str(beta_array))
            sys.stderr.write("\n")
            sys.stderr.write(str(norm_sum))
            sys.stderr.write("\n")
        return beta_array
Example #6
    def __update_phi(data, beta_arrays, theta, topic_centers, topic_covar):
        """
        :param data: a dictionary containing coordinates and sparse N x V_F matrices for features
        :param beta_arrays: F x k x V
        :param theta: 1 x k
        :param topic_centers: k x 2
        :param topic_covar: k x 2 x 2
        """

        k, _ = topic_centers.shape
        N, _ = data["coordinates"].shape

        G = np.zeros((k, N))
        Z = np.zeros((k, N))

        # Compute geographical probabilities
        for z in range(k):
            # setup distribution variable
            rv = stats.multivariate_normal(mean=topic_centers[z, :],
                                           cov=topic_covar[z, :, :],
                                           allow_singular=True)

            log_probabilities = rv.logpdf(data["coordinates"])
            G[z, :] = log_probabilities

            # TODO: @MM, I think we can move this out of the loop, no? There can be a minor performance increase. - Emre
            Z[z, :] = np.log(theta[0, z])

        # Compute new phi
        # TODO: @MM, please check this. - Emre
        F = np.zeros((k, N))

        for feature in beta_arrays.keys():
            # (k x V) @ (N x V)' = k x N, accumulated into F per feature
            F += np.log(beta_arrays[feature]) * data[feature].transpose()

        F += G + Z

        S = utils.log_sum(F, axis=0)  # 1 x N

        F = F - S

        phi = np.exp(F)  # k x N

        return phi
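To make the shape bookkeeping in __update_phi concrete, here is a flat restatement of the same steps on toy inputs that follow the docstring's conventions. The sizes, the feature key "words", and the use of scipy.special.logsumexp for utils.log_sum are all assumptions; csr_matrix is used because * on the sparse matrix classes means matrix multiplication, which the method relies on.

import numpy as np
from scipy import sparse, stats
from scipy.special import logsumexp

k, N, V = 3, 7, 4
rng = np.random.default_rng(1)

data = {
    "coordinates": rng.normal(size=(N, 2)),                       # N x 2
    "words": sparse.csr_matrix(rng.integers(0, 2, size=(N, V))),  # N x V_F counts
}
beta_arrays = {"words": rng.dirichlet(np.ones(V), size=k)}        # one k x V array per feature
theta = np.full((1, k), 1.0 / k)                                  # 1 x k
topic_centers = rng.normal(size=(k, 2))                           # k x 2
topic_covar = np.tile(np.eye(2), (k, 1, 1))                       # k x 2 x 2

G = np.stack([stats.multivariate_normal(mean=topic_centers[z, :],
                                        cov=topic_covar[z, :, :],
                                        allow_singular=True).logpdf(data["coordinates"])
              for z in range(k)])                                 # k x N geo log-probabilities
F = np.asarray(np.log(beta_arrays["words"]) * data["words"].T)    # k x N feature log-probabilities
F += G + np.log(theta.T)                                          # add log prior, broadcast as k x 1
phi = np.exp(F - logsumexp(F, axis=0))                            # k x N responsibilities
print(np.allclose(phi.sum(axis=0), 1.0))                          # True: each column is a distribution over topics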
Example #7
    def __update_phi(data, beta_arrays, theta, topic_centers, topic_covar):
        """
        :param data: a dictionary containing coordinates and sparse N x V_F matrices for features
        :param beta_arrays: F x k x V
        :param theta: 1 x k
        :param topic_centers: k x 2
        :param topic_covar: k x 2 x 2
        """

        k, _ = topic_centers.shape
        N, _ = data["coordinates"].shape

        G = np.zeros((k, N))
        Z = np.zeros((k, N))

        # Compute geographical probabilities
        for z in range(k):
            # setup distribution variable
            rv = stats.multivariate_normal(mean=topic_centers[z, :],
                                           cov=topic_covar[z, :, :], allow_singular=True)

            log_probabilities = rv.logpdf(data["coordinates"])
            G[z, :] = log_probabilities

            # TODO: @MM, I think we can move this out of the loop, no? There can be a minor performance increase. - Emre
            Z[z, :] = np.log(theta[0, z])

        # Compute new phi
        # TODO: @MM, please check this. - Emre
        F = np.zeros((k, N))

        for feature in beta_arrays.keys():
            # (k x V) @ (N x V)' = k x N, accumulated into F per feature
            F += np.log(beta_arrays[feature]) * data[feature].transpose()

        F += G + Z

        S = utils.log_sum(F, axis=0)  # 1 x N

        F = F - S

        phi = np.exp(F)  # k x N

        return phi
Example #8
    def predict_log_probs(self, test_data):
        num_points = test_data["coordinates"].shape[0]
        topic_log_prob_vector = np.zeros((self.num_topics, num_points))
        # topic_prob_vector = np.ones((self.num_topics, num_points))

        for z in range(self.num_topics):
            try:
                rv = stats.multivariate_normal(mean=self.topic_centers[z, :],
                                               cov=self.topic_covar[z, :, :],
                                               allow_singular=True)
                loc_log_prob = rv.logpdf(test_data["coordinates"]).reshape(
                    (num_points, 1))  # Nx1
                # loc_prob = rv.pdf(test_data["coordinates"]).reshape((num_points, 1))  # Nx1

            except Exception:
                print(
                    "Error while computing geo log probabilities for test, dumping data.",
                    "\n",
                    self.topic_centers[z, :],
                    "\n",
                    self.topic_covar[z, :, :],
                    file=sys.stderr)
                traceback.print_stack(file=sys.stderr)
                sys.exit(1)

            feature_log_prob = np.zeros((num_points, 1))
            # feature_prob = np.ones((num_points, 1))
            for feature in self.beta_arrays.keys():
                beta = self.beta_arrays[feature][z]
                num_words = beta.shape[0]
                feature_log_prob += test_data[feature] * np.log(
                    beta.reshape((num_words, 1)))  # (NxV * Vx1) = Nx1
                # feature_prob *= np.prod(np.power(beta.reshape((num_words, 1)).T, test_data[feature].todense()), axis=1)

            topic_log_prob_vector[z] = (loc_log_prob + feature_log_prob +
                                        np.tile(np.log(self.theta[0, z]),
                                                (num_points, 1))).flatten()
            # topic_prob_vector[z] = np.multiply(np.multiply(loc_prob, feature_prob),
            #                                    np.tile(self.theta[0, z], (num_points, 1))).flatten()

        # print("direct")
        # print(np.sum(np.log(np.sum(topic_prob_vector, axis=0))))
        return np.sum(utils.log_sum(topic_log_prob_vector, axis=0))
Example #9
        def compute_log_beta(h_matrix):
            beta_array = m_array + h_matrix
            norm_sum = utils.log_sum(beta_array, axis=1)

            return beta_array - norm_sum[:, np.newaxis]
Example #10
        def compute_log_beta(h_matrix):
            beta_array = m_array + h_matrix
            norm_sum = utils.log_sum(beta_array, axis=1)

            return beta_array - norm_sum[:, np.newaxis]
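Examples #9 and #10 are the log-space counterpart of get_topic_unigram from Examples #4 and #5: compute_log_beta (with m_array taken from the enclosing scope) stops before exponentiating. A toy check of that equivalence, again substituting scipy.special.logsumexp for utils.log_sum:

import numpy as np
from scipy.special import logsumexp

beta_array = np.log([[0.2, 0.3, 0.5]]) + np.array([[0.1, -0.4, 0.0],
                                                   [0.0,  0.2, -0.1]])  # m_array + h_matrix, k x V
norm_sum = logsumexp(beta_array, axis=1)

log_form = beta_array - norm_sum[:, np.newaxis]  # Examples #9/#10: log-probabilities
prob_form = np.exp(beta_array.T - norm_sum).T    # Examples #4/#5: probabilities

print(np.allclose(np.exp(log_form), prob_form))  # True: same distributions, different space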