Example #1
    def _initialize(self, dataE, dataW, dataToken):
        start = datetime.now()

        self.theta = probNormalize(np.random.random([self.E, self.K]))
        self.phi = probNormalize(np.random.random([self.K, self.V]))
        self.esp = []
        self.z = []
        z_dist = np.sum(self.theta, axis=0) / self.E
        for d in range(self.D):
            Nd = self.Nd[d]
            gamma = dataE[d]
            self.esp.append(multinomial(gamma, Nd))
            self.z.append(multinomial(z_dist, Nd))

        self.TE = np.zeros([self.K, self.E], dtype=np.int32)
        self.TV = np.zeros([self.K, self.V], dtype=np.int32)
        for d in range(self.D):
            docToken = dataToken[d]
            doc_z = self.z[d]
            doc_esp = self.esp[d]
            for n in range(self.Nd[d]):
                w = docToken[n]
                w_z = doc_z[n]
                w_esp = doc_esp[n]
                self.TE[w_z, w_esp] += 1
                self.TV[w_z, w] += 1
        self.TI = np.sum(self.TV, axis=1)
        self.IE = np.sum(self.TE, axis=0)

        duration = datetime.now() - start
        print "_initialize() takes %fs" % duration.total_seconds()
Example #2
    def _initialize(self, dataDUE, dataW, dataToken):
        start = datetime.now()

        self.theta = probNormalize(np.random.random([self.K]))
        self.pi = probNormalize(np.random.random([2]))
        self.eta = probNormalize(
            np.random.random([self.K, self.G, self.E]) + 0.1)
        self.phiB = probNormalize(np.random.random([self.V]))
        self.phiT = probNormalize(np.random.random([self.K, self.V]))
        self.psi = probNormalize(np.random.random([self.U, self.G]))

        self.z = np.zeros([self.D], dtype=np.int8)
        self.y = []
        self.x = []
        for d in range(self.D):
            self.z[d] = multinomial(self.theta)
            self.y.append(multinomial(self.pi, self.Nd[d]))
            ## time consuming, replaced with below ##
            # doc_x = []
            # for m in range(self.Md[d]):
            #     u = np.random.randint(0,self.U)
            #     doc_x.append(multinomial(self.psi[u]))
            # self.x.append(np.array(doc_x, dtype=np.int8))
            self.x.append(multinomial(self.psi[0], self.Md[d]))

        duration = datetime.now() - start
        self._log("_initialize() takes %fs" % duration.total_seconds())
Example #3
    def _initialize(self, dataDUE):
        start = datetime.now()

        self.theta = probNormalize(np.random.random([self.D, self.K]))
        self.phi = probNormalize(np.random.random([self.K, self.V]))
        self.eta = np.random.random([self.K, self.E])
        self.z = []
        for d in range(self.D):
            z_dist = self.theta[d]
            Nd = self.Nd[d]
            self.z.append(multinomial(z_dist, Nd))
        self.eta_beta_inv = multivariateBeta_inv(self.eta)

        self.TI = np.zeros([self.D, self.K], dtype=np.int32)
        self.TV = np.zeros([self.K, self.V], dtype=np.int32)
        self.dataE_smoothed = {}
        for docdata in dataDUE.generate():
            d, docToken, [doc_u, doc_e] = docdata
            doc_z = self.z[d]
            for n in range(self.Nd[d]):
                w = docToken[n]
                w_z = doc_z[n]
                self.TI[d, w_z] += 1
                self.TV[w_z, w] += 1
            doc_E = np.sum(np.identity(self.E, dtype=np.float64)[:, doc_e], axis=1)
            docE = probNormalize(doc_E + SMOOTH_FACTOR)
            self.dataE_smoothed[d] = docE

        duration = datetime.now() - start
        self._log("_initialize() takes %fs" % duration.total_seconds())
Example #4
    def _doc_z_update(self, d, doc_u, doc_e, docW, docToken):
        """ update document-level topic """
        doc_z = self.z[d]
        doc_XE = self.DXE[d]
        doc_Y1V = self.DY1V.getrow(d).tocsr()
        doc_Y1V_array = doc_Y1V.toarray().squeeze()

        # calculate leave-one-out statistics #
        TI_no_d, TXE_no_d, Y1TV_no_d = self.TI, self.TXE, self.Y1TV
        TI_no_d[doc_z] += -1
        TXE_no_d[doc_z, :, :] += -doc_XE
        Y1TV_no_d[doc_z, :] += -doc_Y1V_array

        # conditional probability #
        prob_doc_z = self._prob_doc_z(TI_no_d, TXE_no_d, Y1TV_no_d, doc_XE,
                                      doc_Y1V)

        # new sampled result #
        doc_z_new = int(multinomial(prob_doc_z))

        # update #
        self.z[d] = doc_z_new
        TI_no_d[doc_z_new] += 1
        TXE_no_d[doc_z_new, :, :] += doc_XE
        Y1TV_no_d[doc_z_new, :] += doc_Y1V_array
        self.TI, self.TXE, self.Y1TV = TI_no_d, TXE_no_d, Y1TV_no_d
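
A word on the "leave-one-out" bookkeeping used here and in the samplers below: an assignment like TI_no_d, TXE_no_d, Y1TV_no_d = self.TI, self.TXE, self.Y1TV binds new names to the same NumPy arrays rather than copying them, so the subtract-sample-add sequence mutates the global count matrices in place, and the final reassignment back to the attributes is effectively a no-op. This is deliberate (copying the counts on every iteration would be far slower), but it also means an exception raised between the subtraction and the re-addition leaves the counts inconsistent.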
Example #5
    def _GibbsSamplingLocal(self, dataE, dataW, dataToken, epoch):
        """
        Gibbs sampling word-level topic
        """
        pbar = tqdm(range(self.D), total=self.D, desc='({0:^3})'.format(epoch))
        for d in pbar:  # sequentially sampling
            doc_Nd = self.Nd[d]
            docE = probNormalize(dataE[d] + SMOOTH_FACTOR)
            docToken = dataToken[d]
            for n in range(doc_Nd):
                w = docToken[n]
                w_z = self.z[d][n]

                ## sampling ##
                # calculate leave-one-out statistics #
                TI_no_dn, TV_no_dn = self.TI, self.TV
                TI_no_dn[d, w_z] += -1
                TV_no_dn[w_z, w] += -1
                # conditional probability #
                prob_pa = TI_no_dn[d] + self.alpha
                prob_pb = np.divide(TV_no_dn[:, w] + self.beta,
                                    np.sum(TV_no_dn + self.beta, axis=1))
                prob_pc = np.multiply(
                    self.eta_beta_inv,
                    np.prod(np.power(docE, self.eta - 1), axis=1))
                prob_w_z = probNormalize(prob_pa * prob_pb * prob_pc)
                # new sampled result #
                w_z_new = multinomial(prob_w_z)
                # update #
                self.z[d][n] = w_z_new
                TI_no_dn[d, w_z_new] += 1
                TV_no_dn[w_z_new, w] += 1
                self.TI, self.TV = TI_no_dn, TV_no_dn
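
Reading the three factors off the code above (prob_pa, prob_pb, prob_pc), the collapsed conditional for a word's topic appears to be, with \neg dn marking leave-one-out counts and \hat{e}_d the smoothed document emotion distribution:

p(z_{dn} = k \mid \cdot) \;\propto\; \left(n^{\neg dn}_{dk} + \alpha\right) \cdot \frac{n^{\neg dn}_{kw} + \beta}{\sum_v n^{\neg dn}_{kv} + V\beta} \cdot \frac{1}{B(\eta_k)} \prod_e \hat{e}_{de}^{\,\eta_{ke} - 1}

i.e. the standard LDA factors times the Dirichlet density of \hat{e}_d under topic k's parameter \eta_k, which is where eta_beta_inv from _initialize() enters.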
Example #6
    def _initialize(self, dataE, dataW, dataToken):
        start = datetime.now()

        self.theta = probNormalize(np.random.random([self.D, self.K]))
        self.phi = probNormalize(np.random.random([self.K, self.V]))
        self.eta = np.random.random([self.K, self.E])
        self.z = []
        for d in range(self.D):
            z_dist = self.theta[d]
            Nd = self.Nd[d]
            self.z.append(multinomial(z_dist, Nd))
        self.eta_beta_inv = multivariateBeta_inv(self.eta)

        self.TI = np.zeros([self.D, self.K], dtype=np.int32)
        self.TV = np.zeros([self.K, self.V], dtype=np.int32)
        for d in range(self.D):
            docToken = dataToken[d]
            doc_z = self.z[d]
            for n in range(self.Nd[d]):
                w = docToken[n]
                w_z = doc_z[n]
                self.TI[d, w_z] += 1
                self.TV[w_z, w] += 1

        duration = datetime.now() - start
        print "_initialize() takes %fs" % duration.total_seconds()
Example #7
    def _initialize(self, dataE, dataW, dataToken):
        start = datetime.now()

        self.theta = probNormalize(np.random.random([self.K]))
        self.pi = probNormalize(np.random.random([2]))
        self.eta = probNormalize(np.random.random([self.K, self.E]))
        self.phiB = probNormalize(np.random.random([self.V]))
        self.phiT = probNormalize(np.random.random([self.K, self.V]))
        self.z = np.zeros([self.D], dtype=np.int8)
        self.y = []
        for d in range(self.D):
            self.z[d] = multinomial(self.theta)
            Nd = self.Nd[d]
            self.y.append(multinomial(self.pi, Nd))

        duration = datetime.now() - start
        print "_initialize() takes %fs" % duration.total_seconds()
Example #8
    def _GibbsSamplingLocal(self, dataE, dataW, dataToken, epoch):
        """
        Gibbs sampling word-level emotion and topic
        """
        pbar = tqdm(range(self.D),
                    total=self.D,
                    desc='({0:^3})'.format(epoch))
        for d in pbar:                                 # sequentially sampling
            doc_Nd = self.Nd[d]
            docE = dataE[d]
            docToken = dataToken[d]
            for n in range(doc_Nd):
                w = docToken[n]
                w_z = self.z[d][n]
                w_esp = self.esp[d][n]

                ## sampling ##
                # calculate leave-one-out statistics #
                TE_no_dn, TV_no_dn, TI_no_dn, IE_no_dn = self.TE, self.TV, self.TI, self.IE
                TE_no_dn[w_z, w_esp] += -1
                TV_no_dn[w_z, w] += -1
                TI_no_dn[w_z] += -1
                IE_no_dn[w_esp] += -1
                # conditional probability #
                prob_w_esp = np.divide(np.multiply((self.alpha + TE_no_dn[w_z]), docE),
                                       (self.K * self.alpha + IE_no_dn))
                prob_w_esp = probNormalize(prob_w_esp)
                prob_w_z = np.divide(np.multiply((self.alpha + TE_no_dn[:, w_esp]), (self.beta + TV_no_dn[:, w])),
                                     (self.V * self.beta + TI_no_dn))

                prob_w_z = probNormalize(prob_w_z)
                # new sampled result #
                w_esp_new = multinomial(prob_w_esp)
                w_z_new = multinomial(prob_w_z)
                # update #
                self.z[d][n] = w_z_new
                self.esp[d][n] = w_esp_new
                TE_no_dn[w_z_new, w_esp_new] += 1
                TV_no_dn[w_z_new, w] += 1
                TI_no_dn[w_z_new] += 1
                IE_no_dn[w_esp_new] += 1
                self.TE, self.TV, self.TI, self.IE = TE_no_dn, TV_no_dn, TI_no_dn, IE_no_dn
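
For reference, the two conditionals computed above, with \neg dn for leave-one-out counts and \hat{e}_d = docE:

p(\varepsilon_{dn} = e \mid \cdot) \;\propto\; \frac{(\alpha + n^{\neg dn}_{z,e})\, \hat{e}_{de}}{K\alpha + n^{\neg dn}_{\cdot,e}} \qquad p(z_{dn} = k \mid \cdot) \;\propto\; \frac{(\alpha + n^{\neg dn}_{k,\varepsilon})(\beta + n^{\neg dn}_{k,w})}{V\beta + n^{\neg dn}_{k,\cdot}}

where n_{k,e} = TE, n_{k,w} = TV, n_{k,\cdot} = TI and n_{\cdot,e} = IE in the code's notation.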
Example #9
    def _GibbsSamplingLocal(self, dataDUE, epoch):
        """
        Gibbs sampling word-level topic
        """
        pbar = tqdm(dataDUE.generate(),
                    total=self.D_train,
                    desc='({0:^3})'.format(epoch))
        for docdata in pbar:  # sequentially sampling
            d, docToken, [doc_u, doc_e] = docdata
            doc_Nd = self.Nd[d]
            if d in self.dataE_smoothed:
                docE = self.dataE_smoothed[d]
            else:
                doc_E = np.sum(np.identity(self.E, dtype=np.float64)[:, doc_e],
                               axis=1)
                docE = probNormalize(doc_E + SMOOTH_FACTOR)
                self.dataE_smoothed[d] = docE
            for n in range(doc_Nd):
                w = docToken[n]
                w_z = self.z[d][n]

                ## sampling ##
                # calculate leave-one-out statistics #
                TI_no_dn, TV_no_dn = self.TI, self.TV
                TI_no_dn[d, w_z] += -1
                TV_no_dn[w_z, w] += -1
                # conditional probability #
                prob_pa = TI_no_dn[d] + self.alpha
                prob_pb = np.divide(TV_no_dn[:, w] + self.beta,
                                    np.sum(TV_no_dn + self.beta, axis=1))
                prob_pc = np.multiply(
                    self.eta_beta_inv,
                    np.prod(np.power(docE, self.eta - 1), axis=1))
                prob_w_z = probNormalize(prob_pa * prob_pb * prob_pc)
                # new sampled result #
                w_z_new = multinomial(prob_w_z)
                # update #
                self.z[d][n] = w_z_new
                TI_no_dn[d, w_z_new] += 1
                TV_no_dn[w_z_new, w] += 1
                self.TI, self.TV = TI_no_dn, TV_no_dn
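
A side note on the emotion histogram built in _initialize() (Example #3) and recomputed above for unseen documents: np.sum(np.identity(self.E)[:, doc_e], axis=1) gathers one-hot columns and sums them, which is just a label histogram. Assuming doc_e holds integer emotion labels, an equivalent and cheaper form would be:

doc_E = np.bincount(doc_e, minlength=self.E).astype(np.float64)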
Example #10
    def _GibbsSamplingLocal(self, dataE, dataW, dataToken, epoch):
        """
        Gibbs sampling word-level background-vs-topic and document-level topic
        """
        pbar = tqdm(range(self.D),
                    total=self.D,
                    desc='({0:^3})'.format(epoch))
        for d in pbar:                                 # sequentially sampling
            doc_Nd = self.Nd[d]
            docE = dataE[d]
            docW = dataW[d]
            docToken = dataToken[d]
            doc_z = self.z[d]

            # intermediate parameters calculation #
            Y1T = np.sum(self.Y1TV, axis=1)

            for n in range(doc_Nd):
                w = docToken[n]
                w_y = self.y[d][n]

                ## sampling for y ##
                # calculate leave-one-out statistics #
                YI_no_dn_y, Y0V_no_dn_y, Y1TV_no_dn_y = self.YI, self.Y0V, self.Y1TV
                Y1T_no_dn_y = Y1T

                YI_no_dn_y[w_y] += -1
                if w_y == 0:
                    Y0V_no_dn_y[w] += -1
                elif w_y == 1:
                    Y1TV_no_dn_y[doc_z, w] += -1
                    Y1T_no_dn_y[doc_z] += -1
                    self.DY1V[d, w] += -1                           # delete w_y == 1 word
                else:
                    raise ValueError("w_y not 0 or 1")
                # conditional probability #
                prob_w_y_unnorm = np.zeros([2], dtype=np.float32)
                prob_w_y_unnorm[0] = (self.delta + YI_no_dn_y[0]) * (self.beta + Y0V_no_dn_y[w]) / \
                              (self.V * self.beta + YI_no_dn_y[0])
                prob_w_y_unnorm[1] = (self.delta + YI_no_dn_y[1]) * (self.beta + Y1TV_no_dn_y[doc_z, w]) / \
                              (self.V * self.beta + Y1T_no_dn_y[doc_z])
                prob_w_y = probNormalize(prob_w_y_unnorm)
                # new sampled result #
                try:
                    w_y_new = multinomial(prob_w_y)
                except ValueError as e:
                    print(prob_w_y_unnorm)
                    print(prob_w_y)
                    print(np.sum(prob_w_y), np.sum(prob_w_y) > 1.0)
                    print(YI_no_dn_y, self.YI, Y0V_no_dn_y[w], Y1TV_no_dn_y[doc_z, w], Y1T_no_dn_y[doc_z])
                    print(d)
                    raise e
                # update #
                self.y[d][n] = w_y_new
                YI_no_dn_y[w_y_new] += 1
                if w_y_new == 0:
                    Y0V_no_dn_y[w] += 1
                elif w_y_new == 1:
                    Y1TV_no_dn_y[doc_z, w] += 1
                    Y1T_no_dn_y[doc_z] += 1
                    self.DY1V[d, w] += 1                            # add back word with w_y_new == 1
                else:
                    raise ValueError("w_y not 0 or 1")
                self.YI, self.Y0V, self.Y1TV = YI_no_dn_y, Y0V_no_dn_y, Y1TV_no_dn_y
                Y1T = Y1T_no_dn_y

            doc_Y1V = self.DY1V.getrow(d).tocsr()
            doc_Y1V_array = doc_Y1V.toarray().squeeze()
            ## sampling for z ##
            # calculate leave-one-out statistics #
            TE_no_d_z, Y1TV_no_d_z, TI_no_d_z = self.TE, self.Y1TV, self.TI

            TE_no_d_z[doc_z, :] += -docE
            Y1TV_no_d_z[doc_z, :] += -doc_Y1V_array
            TI_no_d_z[doc_z] += -1
            # conditional probability #
            prob_doc_z = self._prob_doc_z(TE_no_d_z, Y1TV_no_d_z, TI_no_d_z, docE, docW, doc_Y1V)
            # new sampled result #
            doc_z_new = multinomial(prob_doc_z)
            # update #
            self.z[d] = doc_z_new
            TE_no_d_z[doc_z_new,:] += docE
            Y1TV_no_d_z[doc_z_new, :] += doc_Y1V_array
            TI_no_d_z[doc_z_new] += 1
            self.TE, self.Y1TV, self.TI = TE_no_d_z, Y1TV_no_d_z, TI_no_d_z
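
For reference, the background-vs-topic switch above compares the following two unnormalized weights, read directly off the code (\neg dn for leave-one-out counts, z_d the document topic):

p(y_{dn} = 0 \mid \cdot) \;\propto\; (\delta + n^{\neg dn}_{y=0}) \cdot \frac{\beta + n^{\neg dn}_{0,w}}{V\beta + n^{\neg dn}_{0,\cdot}} \qquad p(y_{dn} = 1 \mid \cdot) \;\propto\; (\delta + n^{\neg dn}_{y=1}) \cdot \frac{\beta + n^{\neg dn}_{z_d,w}}{V\beta + n^{\neg dn}_{z_d,\cdot}}

where n_{y} = YI, n_{0,w} = Y0V, n_{z,w} = Y1TV and n_{z,\cdot} = Y1T in the code's notation.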