예제 #1
0
파일: tlc.py 프로젝트: jperla/happynews
 def optimize_documents(self):
     """Replace every locally held sparse document with its array form.

     Each of the four document collections kept on this instance
     (documents, comments, labeled, background) is rebuilt by passing
     every member through ``topiclib.doc_to_array``.
     """
     # OPTIMIZATION: work on arrays instead of the sparse representation
     for collection in ("documents", "comments", "labeled", "background"):
         as_arrays = [topiclib.doc_to_array(doc) for doc in getattr(self, collection)]
         setattr(self, collection, as_arrays)
예제 #2
0
def test_doc_to_array():
    """Check that doc_to_array expands (word, count) pairs into a flat word array."""
    cases = [
        # every word occurs exactly once
        ([(0, 1), (1, 1)], [0, 1]),
        # repeated words are emitted count-many times, in the order of the pairs
        ([(1, 3), (2, 2), (0, 1)], [1, 1, 1, 2, 2, 0]),
    ]
    for sparse_doc, expected_words in cases:
        produced = lm.doc_to_array(sparse_doc)
        assert same(produced, np.array(expected_words))
예제 #3
0
def test_lda_recalculate_beta():
    """Check the LDA beta update on sparse input, array input and in log space.

    Two small documents over a 3-word vocabulary are paired with hard
    (0/1) per-token topic assignments ``phi``.  The expected K x W matrix
    is the per-topic word count, normalised by row.  The same expectation
    is checked three times: with (word, count) documents, with documents
    converted by ``lm.doc_to_array``, and through the log-space variant
    ``lm.lda_recalculate_log_beta``.
    """
    K = 2
    W = 3

    doc0 = [(0,3), (1,1)]
    doc1 = [(1,3), (2,2), (0,1)]
    text = [doc0, doc1]

    # Deterministic baseline that differs from the expected answer.
    # np.empty would return uninitialised memory, which makes the
    # "not same(beta, ...)" checks below depend on arbitrary values.
    beta = np.ones((K, W), dtype=float) / W
    out = beta.copy()

    # One row per token of doc0 (tokens: word 0, word 0, word 0, word 1).
    phi0 = np.zeros((sum([d[1] for d in doc0]), K))
    # first two tokens (both word 0) go to topic one
    # last two tokens (word 0 and word 1) go to topic two
    phi0[0][0] = 1
    phi0[1][0] = 1
    phi0[2][1] = 1
    phi0[3][1] = 1

    # One row per token of doc1 (tokens: word 1 x3, word 2 x2, word 0).
    phi1 = np.zeros((sum([d[1] for d in doc1]), K))
    phi1[0][1] = 1
    phi1[1][1] = 1
    phi1[2][1] = 1
    phi1[3][0] = 1
    phi1[4][1] = 1
    phi1[5][0] = 1

    phi = [phi0, phi1]

    # topic one: word 0 three times, word 2 once
    # topic two: word 0 once, word 1 four times, word 2 once
    answer = np.array([[0.75, 0.0, 0.25],
                       [1.0/6, 2.0/3, 1.0/6]])

    assert out.shape == (2,3)
    lm.lda_recalculate_beta(text, out, phi)
    assert out.shape == (2,3)

    assert not same(beta, out)
    assert same(out, answer)

    # now test on docarray
    out = beta.copy()
    assert out.shape == (2,3)
    lm.lda_recalculate_beta([lm.doc_to_array(t) for t in text], out, phi)
    assert out.shape == (2,3)

    assert not same(beta, out)
    assert same(out, answer)


    # test log space
    # Start from the baseline, not from the already-correct `out`;
    # otherwise an implementation that leaves its argument untouched
    # would still satisfy the final assertion.
    log_out = np.log(beta)
    # phi contains exact zeros; log(0) == -inf is intended, so silence
    # the divide-by-zero warning numpy would otherwise emit.
    with np.errstate(divide='ignore'):
        log_phi = [np.log(p) for p in phi]
    assert log_out.shape == (2,3)
    lm.lda_recalculate_log_beta(text, log_out, log_phi)
    assert log_out.shape == (2,3)

    assert not same(beta, np.exp(log_out))
    assert same(np.exp(log_out), answer)
예제 #4
0
def test_slda_update_phi():
    """Check slda_update_phi on a two-word document with three topics.

    The expected phi is built by hand, one row at a time, from the update
    rule written out below, and compared with what ``lm.slda_update_phi``
    writes into its ``phi`` argument.  The call is made once with the
    (word, count) document and once with its ``lm.doc_to_array`` form,
    for which a slightly different second row is expected.
    """
    gamma = np.array([3,4,5])
    text = [(0,1), (1,1)]
    beta = np.array([[0.75, 0.25],
                     [0.40, 0.60],
                     [0.10, 0.90]])
    y_d = -0.5
    eta = np.array([-2.5, 1.6, 0.1])
    sigma_squared = 0.8
    phi = np.array([[0.65, 0.25, 0.10],
                    [0.09, 0.78, 0.13]])

    # update phi_d:
    #   phi_{d,n}  ∝  exp{ E[log θ | γ]
    #                      + E[log p(w_n | β_{1:K})]
    #                      + (y / (N σ^2)) η
    #                      - [2 (η^T φ_{d,-n}) η + (η∘η)] / (2 N^2 σ^2) }
    # Note that E[log p(w_n | β_{1:K})] = log β^T w_n
    eta_squared = np.array([6.25, 2.56, 0.01])
    expected_log_theta = np.array([-1.51987734, -1.18654401154, -0.93654401154401])
    response_term = np.array([0.78125, -0.5, -0.03125])

    def unnormalized_row(word_probs, other_row):
        # exp of the four summed terms for one word, given the phi row
        # of the other word in the document
        log_word = np.log(np.array(word_probs))
        penalty = -0.15625 * ((2 * (np.dot(eta, other_row)) * eta) + eta_squared)
        return np.exp(expected_log_theta + log_word + response_term + penalty)

    row0 = unnormalized_row([0.75, 0.40, 0.10], phi[1])
    row0 /= np.sum(row0) # normalize before it feeds the next row

    # rows are updated sequentially, so the second row must see the
    # freshly computed first row rather than the old phi[0]
    row1 = unnormalized_row([0.25, 0.60, 0.90], row0)

    answer = np.array([row0, row1])
    graphlib.row_normalize(answer)

    result = phi.copy()
    lm.slda_update_phi(text, result, gamma, beta, y_d, eta, sigma_squared)
    assert same(result, answer)

    # test the fast updates; which will be slightly different
    fast_answer = answer.copy()
    fast_answer[1,:] = np.array([0.03422278, 0.26873478, 0.69704244])

    result = phi.copy()
    as_array = lm.doc_to_array([(0,1), (1,1)])
    lm.slda_update_phi(as_array, result, gamma, beta, y_d, eta, sigma_squared)

    assert same(result, fast_answer)