示例#1
0
    def testCrossValPerplexityOnRealDataWithLdaGibbsInc(self):
        ActiveFolds = 3
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

        data.convert_to_dtype(np.int32) # Gibbs expects integers as input, regardless of model dtype
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Training setup
        TrainSamplesPerTopic = 10
        QuerySamplesPerTopic = 2
        Thin = 2
        Debug = False

        # Start running experiments
        topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        for K in topicCounts:
            trainPlan = lda_gibbs.newTrainPlan(K * TrainSamplesPerTopic, thin=Thin, debug=Debug)
            queryPlan = lda_gibbs.newTrainPlan(K * QuerySamplesPerTopic, thin=Thin, debug=Debug)

            trainPerps = []
            queryPerps = []
            for fold in range(ActiveFolds): # range(NumFolds):
                trainData, queryData = data.cross_valid_split(fold, NumFolds)
                estData, evalData = queryData.doc_completion_split()

                model = lda_gibbs.newModelAtRandom(trainData, K, dtype=dtype)
                query = lda_gibbs.newQueryState(trainData, model)

                # Train the model, and the immediately save the result to a file for subsequent inspection
                model, trainResult, (_, _, _) = lda_gibbs.train (trainData, model, query, trainPlan)

                like = lda_gibbs.log_likelihood(trainData, model, trainResult)
                perp = perplexity_from_like(like, trainData.word_count)
                trainPerps.append(perp)

                query = lda_gibbs.newQueryState(estData, model)
                _, queryResult = lda_gibbs.query(estData, model, query, queryPlan)

                like = lda_gibbs.log_likelihood(evalData, model, queryResult)
                perp = perplexity_from_like(like, evalData.word_count)
                queryPerps.append(perp)

            trainPerps.append(sum(trainPerps) / ActiveFolds)
            queryPerps.append(sum(queryPerps) / ActiveFolds)
            print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
            print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
示例#2
0
    def testPerplexityOnRealData(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # IDF frequency for when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Initialise the model
        K = 50
        model      = mtm.newModelAtRandom(data, K, K - 1, dtype=dtype)
        queryState = mtm.newQueryState(data, model)
        trainPlan  = mtm.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=True)

        # Train the model, and the immediately save the result to a file for subsequent inspection
        model, query, (bndItrs, bndVals, bndLikes) = mtm.train (data, model, queryState, trainPlan)
#        with open(newModelFileFromModel(model), "wb") as f:
#            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        fig, ax1 = plt.subplots()
        ax1.imshow(model.topicCov, interpolation="nearest", cmap=cm.Greys_r)
        fig.show()
        plt.show()

        # Print out the most likely topic words
        # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
        vocab = mtm.wordDists(model)
        topWordCount = 10
        kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)]

        like = mtm.log_likelihood(data, model, query)
        perp = perplexity_from_like(like, data.word_count)

        print ("Prior %s" % (str(model.topicPrior)))
        print ("Perplexity: %f\n\n" % perp)

        for k in range(model.K):
            print("\nTopic %d\n=============================" % k)
            print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))
示例#3
0
def train(data, model, query, plan):
    iterations, burnIn, thin, weightUpdateInterval, _, debug = \
        plan.iterations, plan.burnIn, plan.thin, plan.weightUpdateInterval, plan.logFrequency, plan.debug
    w_list, z_list, docLens = \
        query.w_list, query.z_list, query.docLens
    K, T, weights, topicPrior, vocabPrior, _, _, _, dtype, name = \
        model.K, model.T, model.weights, model.topicPrior, model.vocabPrior, model.topicSum, model.vocabSum, model.numSamples, model.dtype, model.name

    assert model.dtype == np.float64, "This is only implemented for 64-bit floats"
    D = docLens.shape[0]
    X = data.feats
    assert docLens.max(
    ) < 65536, "This only works for documents with fewer than 65,536 words"

    ndk = np.zeros((D, K), dtype=np.uint16)
    nkv = np.zeros((K, T), dtype=np.int32)
    nk = np.zeros((K, ), dtype=np.int32)

    num_samples = (iterations - burnIn) // thin
    n_dk_samples = np.zeros((D, K, num_samples), dtype=np.uint16)
    topicSum = np.zeros((D, K), dtype=dtype)
    vocabSum = np.zeros((K, T), dtype=dtype)

    compiled.initGlobalRng(0xC0FFEE)
    compiled.sumSuffStats(w_list, z_list, docLens, ndk, nkv, nk)

    # Burn in
    alphas = X.dot(weights.T)
    if debug: print("Burning")
    compiled.sample (burnIn, burnIn + 1, w_list, z_list, docLens, \
            alphas, ndk, nkv, nk, n_dk_samples, topicSum, vocabSum, \
            vocabPrior, False, debug)

    # True samples
    if debug: print("Training")
    sample_count = 0
    for _ in range(0, iterations - burnIn, weightUpdateInterval):
        alphas[:, :] = X.dot(weights.T)
        sample_count += compiled.sample (weightUpdateInterval, thin, w_list, z_list, docLens, \
                alphas, ndk, nkv, nk, n_dk_samples, topicSum, vocabSum, \
                vocabPrior, False, debug)

        if debug:  # Print out the perplexity so far
            likely = log_likelihood(data, \
                ModelState (K, T, weights, topicPrior, vocabPrior, n_dk_samples, topicSum, vocabSum, sample_count, dtype, name), \
                QueryState (w_list, z_list, docLens, topicSum, sample_count))
            perp = perplexity_from_like(likely, data)
            print("Sample-Count = %3d  Perplexity = %7.2f" %
                  (sample_count, perp))

        updateWeights(n_dk_samples, sample_count, X, weights, debug)


#     compiled.freeGlobalRng()

    return \
        ModelState (K, T, weights, topicPrior, vocabPrior, n_dk_samples, topicSum, vocabSum, sample_count, dtype, name), \
        QueryState (w_list, z_list, docLens, topicSum, sample_count), \
        (np.zeros(1), np.zeros(1), np.zeros(1))
示例#4
0
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype
    
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    W = data.words
    D = W.shape[0]
    
    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)
    
    # Update the Variances
    varcs = 1./((n * (K-1.)/K)[:,np.newaxis] + isigT.flat[::K+1])
    debugFn (0, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)
    
    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    R = W.copy()
    for itr in range(iterations):
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)
        
        # Update the Means
        rhs = V.copy()
        rhs += n[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= n[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d,:] = la.inv(isigT + n[d] * A).dot(rhs[d,:])
        
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, n)
        
        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState
示例#5
0
    def testCrossValPerplexityOnRealDataWithLdaInc(self):
        ActiveFolds = 3
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # Initialise the model
        trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
        queryPlan = lda.newTrainPlan(iterations=50, logFrequency=5, fastButInaccurate=False, debug=False)

        topicCounts = [30, 35, 40, 45, 50] # [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        for K in topicCounts:
            trainPerps = []
            queryPerps = []
            for fold in range(ActiveFolds): # range(NumFolds):
                trainData, queryData = data.cross_valid_split(fold, NumFolds)

                model = lda.newModelAtRandom(trainData, K, dtype=dtype)
                query = lda.newQueryState(trainData, model)

                # Train the model, and the immediately save the result to a file for subsequent inspection
                model, trainResult, (_, _, _) = lda.train (trainData, model, query, trainPlan)

                like = lda.log_likelihood(trainData, model, trainResult)
                perp = perplexity_from_like(like, trainData.word_count)
                trainPerps.append(perp)

                estData, evalData = queryData.doc_completion_split()
                query = lda.newQueryState(estData, model)
                model, queryResult = lda.query(estData, model, query, queryPlan)

                like = lda.log_likelihood(evalData, model, queryResult)
                perp = perplexity_from_like(like, evalData.word_count)
                queryPerps.append(perp)

            trainPerps.append(sum(trainPerps) / ActiveFolds)
            queryPerps.append(sum(queryPerps) / ActiveFolds)
            print("K=%d,Segment=Train,%s" % (K, ",".join([str(p) for p in trainPerps])))
            print("K=%d,Segment=Query,%s" % (K, ",".join([str(p) for p in queryPerps])))
示例#6
0
    def testPerplexityOnRealDataWithLdaInc(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # IDF frequency for when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Initialise the model
        topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        perps = []
        for K in topicCounts:
            model      = lda.newModelAtRandom(data, K, dtype=dtype)
            queryState = lda.newQueryState(data, model)
            trainPlan  = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)

            # Train the model, and the immediately save the result to a file for subsequent inspection
            model, query, (bndItrs, bndVals, bndLikes) = lda.train (data, model, queryState, trainPlan)
    #        with open(newModelFileFromModel(model), "wb") as f:
    #            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

            # Print out the most likely topic words
            # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
            # vocab = lda.wordDists(model)
            # topWordCount = 10
            # kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)]

            like = lda.log_likelihood(data, model, query)
            perp = perplexity_from_like(like, data.word_count)

            perps.append(perp)

            print ("K = %2d : Perplexity = %f\n\n" % (K, perp))
            #
            # for k in range(model.K):
            #     print("\nTopic %d\n=============================" % k)
            #     print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(topicCounts, perps, 'b-')
        ax1.set_xlabel('Topic Count')
        ax1.set_ylabel('Perplexity', color='b')

        fig.show()
        plt.show()
示例#7
0
def _debug_with_bound(itr, var_value, var_name, data, K, topicMean, topicCov,
                      vocab, dtype, means, varcs, A, n):
    if np.isnan(var_value).any():
        printStderr("WARNING: " + var_name + " contains NaNs")
    if np.isinf(var_value).any():
        printStderr("WARNING: " + var_name + " contains INFs")
    if var_value.dtype != dtype:
        printStderr("WARNING: dtype(" + var_name + ") = " +
                    str(var_value.dtype))

    model = ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME)
    query = QueryState(means, varcs, n)

    old_bound = _debug_with_bound.old_bound
    bound = var_bound(data, model, query)
    diff = "" if old_bound == 0 else "%15.4f" % (bound - old_bound)
    _debug_with_bound.old_bound = bound

    addendum = ""
    if var_name == "topicCov":
        try:
            addendum = "det(topicCov) = %g" % (la.det(topicCov))
        except:
            addendum = "det(topicCov) = <undefined>"

    if isnan(bound):
        printStderr("Bound is NaN")
    else:
        perp = perplexity_from_like(log_likelihood(data, model, query),
                                    data.word_count)
        if int(bound - old_bound) < 0:
            printStderr(
                "Iter %3d Update %-15s Bound %22f (%15s) (%5.0f)     %s" %
                (itr, var_name, bound, diff, perp, addendum))
        else:
            print("Iter %3d Update %-15s Bound %22f (%15s) (%5.0f)  %s" %
                  (itr, var_name, bound, diff, perp, addendum))
示例#8
0
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want it)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats

    assert W.dtype == modelState.dtype
    assert X.dtype == modelState.dtype
    
    D,_ = W.shape
    
    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype
    
    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    likeValues  = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0
    
    _debug_with_bound.old_bound = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    sigT_regularizer = 0.001
    
    aI_P = 1./lfv  * ssp.eye(P, dtype=dtype)
    tI_F = 1./fv * ssp.eye(F, dtype=dtype)
    
    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    if ssp.issparse(R_A):
        R_A = R_A.todense()  # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")
    
    s.fill(0)
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        diff_m_xa = (means-X.dot(A.T))
        
        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += diff_m_xa.T.dot(diff_m_xa)
        sigT.flat[::K+1] += varcs.sum(axis=0)
        sigT /= (P+F+D)
        sigT.flat[::K+1] += sigT_regularizer
        
        # Diagonalize it
        sigT = np.diag(sigT.flat[::K+1])
        # and invert it.
        isigT = np.diag(np.reciprocal(sigT.flat[::K+1]))
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        S = expMeans * R.dot(vocab.T)
        
        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        
        # Reset the means to their original form, and log effect of vocab update
        debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Finally update the parameter V
        V = la.inv(R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # And now this is the E-Step, though it's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.
        
        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the mapping from the features to topics
        A = (1./fv * (Y).dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Means
        vMat   = (s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT) # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis,:] + n[:,np.newaxis] *  lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat # as LHS is a diagonal matrix for all d, it's equivalent
                                # do doing a hadamard product for all d
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Variances
        varcs = 1./(n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the approximation parameters
        lxi = 2 * ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
#         s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
#         debugFn (itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, lxi, s, n)
            
            boundValues[bvIdx] = var_bound(data, modelState, queryState, XTX)
            likeValues[bvIdx]  = log_likelihood(data, modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(likeValues[bvIdx], n.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.2f  bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and  boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
#             print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvment in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 50:
                lastPerp = perplexity_from_like(likeValues[bvIdx - 1], n.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, likeValues, bvIdx)
                    return modelState, queryState, (boundIters, boundValues, likeValues)
            bvIdx += 1


    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, lxi, s, n), \
        (boundIters, boundValues, likeValues)
示例#9
0
    def testPerplexityOnRealDataWithMtm2(self):
        dtype = np.float64 # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            d = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

        # IDF frequency for when we print out the vocab later
        freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
        scale = np.reciprocal(1 + freq)

        # Initialise the model
        K = 30 # TopicCount
        model      = mtm2.newModelAtRandom(data, K, dtype=dtype)
        queryState = mtm2.newQueryState(data, model)
        trainPlan  = mtm2.newTrainPlan(iterations=200, logFrequency=10, fastButInaccurate=False, debug=False)

        # Train the model, and the immediately save the result to a file for subsequent inspection
        model, query, (bndItrs, bndVals, bndLikes) = mtm2.train(data, model, queryState, trainPlan)
#        with open(newModelFileFromModel(model), "wb") as f:
#            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

        # Plot the evolution of the bound during training.
        fig, ax1 = plt.subplots()
        ax1.plot(bndItrs, bndVals, 'b-')
        ax1.set_xlabel('Iterations')
        ax1.set_ylabel('Bound', color='b')

        ax2 = ax1.twinx()
        ax2.plot(bndItrs, bndLikes, 'r-')
        ax2.set_ylabel('Likelihood', color='r')

        fig.show()
        plt.show()

        fig, ax1 = plt.subplots()
        ax1.imshow(model.topicCov, interpolation="nearest", cmap=cm.Greys_r)
        fig.show()
        plt.show()

        # Print out the most likely topic words
        # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
        vocab = mtm2.wordDists(model)
        topWordCount = 10
        kTopWordInds = [self.topWordInds(vocab[k,:], topWordCount) for k in range(K)]

        like = mtm2.log_likelihood(data, model, query)
        perp = perplexity_from_like(like, data.word_count)

        print("Perplexity: %f\n\n" % perp)

        for k in range(model.K):
            print("\nTopic %d\n=============================" % k)
            print("\n".join("%-20s\t%0.4f" % (d[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

        print ("Most likely documents for each topic")
        print ("====================================")
        with open ("/Users/bryanfeeney/iCloud/Datasets/ACL/ACL.100/doc_ids.pkl", 'rb') as f:
            fileIds = pkl.load (f)
        docs_dict = [fileIds[fi] for fi in data.order]

        for k in range(model.K):
            arg_max_prob = np.argmax(query.means[:, k])
            print("K=%2d  Document ID = %s (found at %d)" % (k, docs_dict[arg_max_prob], arg_max_prob))

        print ("Done")

        with open ("/Users/bryanfeeney/Desktop/mtm2-" + str(K) + ".pkl", "wb") as f:
            pkl.dump((model, query), f)
示例#10
0
def train(data, model: ModelState, query: QueryState, plan: TrainPlan, updateVocab=True):
    """
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    """
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens = query.docLens
    topicDist = topicDists(query)
    K, topicPrior, vocabPrior, wordDistParam, corpusTopicDistParam, dtype = \
        model.K, model.topicPrior, model.vocabPrior, wordDistsDirichletParam(model), corpusTopicDistDirichletParam(model), model.dtype

    W = data.words

    iters, bnds, likes = [], [], []

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError("Input document-term matrix contains at least one document with no words")
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    lnCorpusTopicDist = fns.digamma(corpusTopicDistParam) - fns.digamma(corpusTopicDistParam.sum())
    lnWordDist = fns.digamma(wordDistParam) - fns.digamma(wordDistParam.sum(axis=1))[:, np.newaxis]
    oldTopicDist = np.ndarray(topicDist.shape, dtype=topicDist.dtype)

    for itr in range(iterations):
        oldTopicDist[:, :] = topicDist[:, :]

        topicDist[:, :] = (data.words @ lnWordDist.T)
        topicDist[:, :] += lnCorpusTopicDist[np.newaxis, :]
        topicDist -= topicDist.max(axis=1)[:, np.newaxis]
        np.exp(topicDist, out=topicDist)
        topicDist /= topicDist.sum(axis=1)[:, np.newaxis]

        if np.abs(oldTopicDist - topicDist).sum() < CHANGE_TOLERANCE * topicDist.shape[0] * topicDist.shape[1]:
            logging.info(f"Stopping train after {itr + 1} iterations as change in topic distibution is minimal")
            break

        corpusTopicDistParam = topicDist.sum(axis=0) + model.topicPrior
        fns.digamma(corpusTopicDistParam, out=lnCorpusTopicDist)
        lnCorpusTopicDist -= fns.digamma(corpusTopicDistParam.sum())

        # Derive new parameter estimates
        wordDistParam = (data.words.T @ topicDist).T \
                      + model.vocabPrior[np.newaxis, :]
        fns.digamma(wordDistParam, out=lnWordDist)
        lnWordDist -= fns.digamma(wordDistParam.sum(axis=1))[:, np.newaxis]

        if debug or (logFrequency > 0 and itr % logFrequency == 0):
            m = ModelState(K, topicPrior, vocabPrior, wordDistParam, corpusTopicDistParam, True, dtype, model.name)
            q = QueryState(query.docLens, topicDist, True)

            iters.append(itr)
            bnds.append(var_bound(data, m, q))
            likes.append(log_likelihood_point(data, m, q))

            perp = perplexity_from_like(likes[-1], W.sum())
            print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" % (itr, perp, bnds[-1]))

            if len(iters) > 2 and iters[-1] > 50:
                lastPerp = perplexity_from_like(likes[-2], W.sum())
                if lastPerp - perp < 1:
                    break

    return ModelState(K, topicPrior, vocabPrior, wordDistParam, corpusTopicDistParam, True, dtype, model.name), \
           QueryState(query.docLens, topicDist, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats
    D, T = W.shape
    F = X.shape[1]

    # tmpNumDense = np.array([
    #     4	, 8	, 2	, 0	, 0,
    #     0	, 6	, 0	, 17, 0,
    #     12	, 13	, 1	, 7	, 8,
    #     0	, 5	, 0	, 0	, 0,
    #     0	, 6	, 0	, 0	, 44,
    #     0	, 7	, 2	, 0	, 0], dtype=np.float64).reshape((6,5))
    # tmpNum = ssp.csr_matrix(tmpNumDense)
    #
    # tmpDenomleft = (rd.random((tmpNum.shape[0], 12)) * 5).astype(np.int32).astype(np.float64) / 10
    # tmpDenomRight = (rd.random((12, tmpNum.shape[1])) * 5).astype(np.int32).astype(np.float64)
    #
    # tmpResult = tmpNum.copy()
    # tmpResult = sparseScalarQuotientOfDot(tmpNum, tmpDenomleft, tmpDenomRight)
    #
    # print (str(tmpNum.todense()))
    # print (str(tmpDenomleft.dot(tmpDenomRight)))
    # print (str(tmpResult.todense()))

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, docLens = queryState.means, queryState.docLens
    K, A, U, Y,  V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype = \
        modelState.K, modelState.A, modelState.U, modelState.Y,  modelState.V, modelState.covA, modelState.tv, modelState.ltv, modelState.fv, modelState.lfv, modelState.vocab, modelState.vocabPrior, modelState.dtype

    tp, fp, ltp, lfp = 1. / tv, 1. / fv, 1. / ltv, 1. / lfv  # turn variances into precisions

    # FIXME Use passed in hypers
    print("tp = %f tv=%f" % (tp, tv))
    vocabPrior = np.ones(shape=(T, ), dtype=modelState.dtype)

    # FIXME undo truncation
    F = 363
    A = A[:F, :]
    X = X[:, :F]
    U = U[:F, :]
    data = DataSet(words=W, feats=X)

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    debugFn = _debug_with_bound if debug else _debug_with_nothing

    # Initialize some working variables
    if covA is None:
        precA = (fp * ssp.eye(F) +
                 X.T.dot(X)).todense()  # As the inverse is almost always dense
        covA = la.inv(precA,
                      overwrite_a=True)  # it's faster to densify in advance
    uniqLens = np.unique(docLens)

    debugFn(-1, covA, "covA", W, X, means, docLens, K, A, U, Y, V, covA, tv,
            ltv, fv, lfv, vocab, vocabPrior)

    H = 0.5 * (np.eye(K) - np.ones((K, K), dtype=dtype) / K)

    expMeans = means.copy()
    expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis], out=expMeans)
    R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=W.copy())

    lhs = H.copy()
    rhs = expMeans.copy()
    Y_rhs = Y.copy()

    # Iterate over parameters
    for itr in range(iterations):

        # Update U, V given A
        V = try_solve_sym_pos(Y.T.dot(U.T).dot(U).dot(Y),
                              A.T.dot(U).dot(Y).T).T
        V /= V[0, 0]
        U = try_solve_sym_pos(Y.dot(V.T).dot(V).dot(Y.T),
                              A.dot(V).dot(Y.T).T).T

        # Update Y given U, V, A
        Y_rhs[:, :] = U.T.dot(A).dot(V)

        Sv, Uv = la.eigh(V.T.dot(V), overwrite_a=True)
        Su, Uu = la.eigh(U.T.dot(U), overwrite_a=True)

        s = np.outer(Sv, Su).flatten()
        s += ltv * lfv
        np.reciprocal(s, out=s)

        M = Uu.T.dot(Y_rhs).dot(Uv)
        M *= unvec(s, row_count=M.shape[0])

        Y = Uu.dot(M).dot(Uv.T)
        debugFn(itr, Y, "Y", W, X, means, docLens, K, A, U, Y, V, covA, tv,
                ltv, fv, lfv, vocab, vocabPrior)

        A = covA.dot(fp * U.dot(Y).dot(V.T) + X.T.dot(means))
        debugFn(itr, A, "A", W, X, means, docLens, K, A, U, Y, V, covA, tv,
                ltv, fv, lfv, vocab, vocabPrior)

        # And now this is the E-Step, though itr's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # TODO One big sort by size, plus batch it.

        # Update the Means

        rhs[:, :] = expMeans
        rhs *= R.dot(vocab.T)
        rhs += X.dot(A) * tp
        rhs += docLens[:, np.newaxis] * means.dot(H)
        rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)
        for l in uniqLens:
            inds = np.where(docLens == l)[0]
            lhs[:, :] = l * H
            lhs[np.diag_indices_from(lhs)] += tp
            lhs[:, :] = la.inv(lhs)
            means[inds, :] = rhs[inds, :].dot(
                lhs
            )  # left and right got switched going from vectors to matrices :-/

        debugFn(itr, means, "means", W, X, means, docLens, K, A, U, Y, V, covA,
                tv, ltv, fv, lfv, vocab, vocabPrior)

        # Standard deviation
        # DK        = means.shape[0] * means.shape[1]
        # newTp     = np.sum(means)
        # newTp     = (-newTp * newTp)
        # rhs[:,:]  = means
        # rhs      *= means
        # newTp     = DK * np.sum(rhs) - newTp
        # newTp    /= DK * (DK - 1)
        # newTp     = min(max(newTp, 1E-36), 1E+36)
        # tp        = 1 / newTp
        # if itr % logFrequency == 0:
        #     print ("Iter %3d stdev = %f, prec = %f, np.std^2=%f, np.mean=%f" % (itr, sqrt(newTp), tp, np.std(means.reshape((D*K,))) ** 2, np.mean(means.reshape((D*K,)))))

        # Update the vocabulary
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                          out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)

        vocab *= (
            R.T.dot(expMeans)
        ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)

        debugFn(itr, vocab, "vocab", W, X, means, docLens, K, A, U, Y, V, covA,
                tv, ltv, fv, lfv, vocab, vocabPrior)
        # print ("Iter %3d Vocab.min = %f" % (itr, vocab.min()))

        # Update the vocab prior
        # vocabPrior = estimate_dirichlet_param (vocab, vocabPrior)
        # print ("Iter %3d VocabPrior.(min, max) = (%f, %f) VocabPrior.mean=%f" % (itr, vocabPrior.min(), vocabPrior.max(), vocabPrior.mean()))

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, A, U, Y, V, covA, tv, ltv, fv, lfv,
                                    vocab, vocabPrior, dtype, modelState.name)
            queryState = QueryState(means, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime('%X') +
                " : Iteration %d: bound %f \t Perplexity: %.2f" %
                (itr, boundValues[-1],
                 perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug:
                        printStderr("ERROR: bound degradation: %f > %f" %
                                    (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, A, U, Y,  V, covA, tv, ltv, fv, lfv, vocab, vocabPrior, dtype, modelState.name), \
        QueryState(means, expMeans, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
示例#12
0
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    '''
    W, L, LT, X = data.words, data.links, ssp.csr_matrix(
        data.links.T), data.feats
    D, _ = W.shape
    out_links = np.squeeze(np.asarray(data.links.sum(axis=1)))

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, varcs, docLens = queryState.means, queryState.varcs, queryState.docLens
    K, topicMean, topicCov, vocab, A, dtype = modelState.K, modelState.topicMean, modelState.topicCov, modelState.vocab, modelState.A, modelState.dtype

    emit_counts = docLens + out_links

    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []

    if debug:
        debugFn = _debug_with_bound

        initLikely = log_likelihood(data, modelState, queryState)
        initPerp = perplexity_from_like(initLikely, data.word_count)
        print("Initial perplexity is: %.2f" % initPerp)
    else:
        debugFn = _debug_with_nothing

    # Initialize some working variables
    W_weight = W.copy()
    L_weight = L.copy()
    LT_weight = LT.copy()

    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K, ), dtype=dtype)
    priorSigT_diag.fill(NIW_PSI)

    # Iterate over parameters
    for itr in range(iterations):

        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn(itr, topicMean, "topicMean", data, K, topicMean, topicCov,
                vocab, dtype, means, varcs, A, docLens)

        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis, :]
            topicCov = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            topicCov += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            topicCov /= (D + pseudoObsVar - K)
        else:
            topicCov = np.cov(
                means.T) if topicCov.dtype == np.float64 else np.cov(
                    means.T).astype(dtype)
            topicCov += np.diag(varcs.mean(axis=0))

        if diagonalPriorCov:
            diag = np.diag(topicCov)
            topicCov = np.diag(diag)
            itopicCov = np.diag(1. / diag)
        else:
            itopicCov = la.inv(topicCov)

        debugFn(itr, topicCov, "topicCov", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)
        #        print("                topicCov.det = " + str(la.det(topicCov)))

        # Building Blocks - temporarily replaces means with exp(means)
        expMeansCol = np.exp(means - means.max(axis=0)[np.newaxis, :])
        lse_at_k = np.sum(expMeansCol, axis=0)
        F = 0.5 * means \
          - (1. / (2*D + 2)) * means.sum(axis=0) \
          - expMeansCol / lse_at_k[np.newaxis, :]

        expMeansRow = np.exp(means - means.max(axis=1)[:, np.newaxis])
        W_weight = sparseScalarQuotientOfDot(W,
                                             expMeansRow,
                                             vocab,
                                             out=W_weight)

        # Update the vocabularies

        vocab *= (
            W_weight.T.dot(expMeansRow)
        ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += VocabPrior
        vocab = normalizerows_ip(vocab)

        docVocab = (
            expMeansCol /
            lse_at_k[np.newaxis, :]).T  # FIXME Dupes line in definitino of F

        # Recalculate w_top_sums with the new vocab and log vocab improvement
        W_weight = sparseScalarQuotientOfDot(W,
                                             expMeansRow,
                                             vocab,
                                             out=W_weight)
        w_top_sums = W_weight.dot(vocab.T) * expMeansRow

        debugFn(itr, vocab, "vocab", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        # Now do likewise for the links, do it twice to model in-counts (first) and
        # out-counts (Second). The difference is the transpose
        LT_weight = sparseScalarQuotientOfDot(LT,
                                              expMeansRow,
                                              docVocab,
                                              out=LT_weight)
        l_intop_sums = LT_weight.dot(docVocab.T) * expMeansRow
        in_counts = l_intop_sums.sum(axis=0)

        L_weight = sparseScalarQuotientOfDot(L,
                                             expMeansRow,
                                             docVocab,
                                             out=L_weight)
        l_outtop_sums = L_weight.dot(docVocab.T) * expMeansRow

        # Reset the means and use them to calculate the weighted sum of means
        meanSum = means.sum(axis=0) * in_counts

        # And now this is the E-Step, though itr's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.

        # Update the Variances: var_d = (2 N_d * A + itopicCov)^{-1}
        varcs = np.reciprocal(docLens[:, np.newaxis] * (0.5 - 1. / K) +
                              np.diagonal(topicCov))
        debugFn(itr, varcs, "varcs", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        # Update the Means
        rhs = w_top_sums.copy()
        rhs += l_intop_sums
        rhs += l_outtop_sums
        rhs += itopicCov.dot(topicMean)
        rhs += emit_counts[:, np.newaxis] * (means.dot(A) -
                                             rowwise_softmax(means))
        rhs += in_counts[np.newaxis, :] * F
        if diagonalPriorCov:
            raise ValueError("Not implemented")
        else:
            for d in range(D):
                rhs_ = rhs[d, :] + (1. /
                                    (4 * D + 4)) * (meanSum -
                                                    in_counts * means[d, :])
                means[d, :] = la.inv(itopicCov + emit_counts[d] * A +
                                     np.diag(D * in_counts /
                                             (2 * D + 2))).dot(rhs_)
                if np.any(np.isnan(means[d, :])) or np.any(
                        np.isinf(means[d, :])):
                    pass

                if np.any(np.isnan(
                        np.exp(means[d, :] - means[d, :].max()))) or np.any(
                            np.isinf(np.exp(means[d, :] - means[d, :].max()))):
                    pass

        debugFn(itr, means, "means", data, K, topicMean, topicCov, vocab,
                dtype, means, varcs, A, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, topicCov, vocab, A, dtype,
                                    MODEL_NAME)
            queryState = QueryState(means, varcs, docLens)

            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)

            print(
                time.strftime('%X') +
                " : Iteration %d: bound %f \t Perplexity: %.2f" %
                (itr, boundValues[-1],
                 perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    printStderr("ERROR: bound degradation: %f > %f" %
                                (boundValues[-2], boundValues[-1]))

                # Check to see if the improvement in the bound has fallen below the threshold
                if False and itr > 100 and abs(
                        perplexity_from_like(likelyValues[-1], docLens.sum()) -
                        perplexity_from_like(likelyValues[-2], docLens.sum())
                ) < 1.0:
                    break


    return \
        ModelState(K, topicMean, topicCov, vocab, A, dtype, MODEL_NAME), \
        QueryState(means, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
示例#13
0
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    W - the DxT document-term matrix
    X - The DxF document-feature matrix, which is IGNORED in this case
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    '''
    W   = data.words
    D,_ = W.shape
    
    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, diagonalPriorCov, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    K, topicMean, sigT, vocab, vocabPrior, A, dtype = modelState.K, modelState.topicMean, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.A, modelState.dtype
    
    # Book-keeping for logs
    boundIters, boundValues, likelyValues = [], [], []
    
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    
    # Initialize some working variables
    isigT = la.inv(sigT)
    R = W.copy()
    
    pseudoObsMeans = K + NIW_PSEUDO_OBS_MEAN
    pseudoObsVar   = K + NIW_PSEUDO_OBS_VAR
    priorSigT_diag = np.ndarray(shape=(K,), dtype=dtype)
    priorSigT_diag.fill (NIW_PSI)
    
    # Iterate over parameters
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step
        
        # Update the mean and covariance of the prior
        topicMean = means.sum(axis = 0) / (D + pseudoObsMeans) \
                  if USE_NIW_PRIOR \
                  else means.mean(axis=0)
        debugFn (itr, topicMean, "topicMean", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if USE_NIW_PRIOR:
            diff = means - topicMean[np.newaxis,:]
            sigT = diff.T.dot(diff) \
                 + pseudoObsVar * np.outer(topicMean, topicMean)
            sigT += np.diag(varcs.mean(axis=0) + priorSigT_diag)
            sigT /= (D + pseudoObsVar - K)
        else:
            sigT = np.cov(means.T) if sigT.dtype == np.float64 else np.cov(means.T).astype(dtype)
            sigT += np.diag(varcs.mean(axis=0))
           
        if diagonalPriorCov:
            diag = np.diag(sigT)
            sigT = np.diag(diag)
            isigT = np.diag(1./ diag)
        else:
            isigT = la.inv(sigT)

        # FIXME Undo debug
        sigT  = np.eye(K)
        isigT = la.inv(sigT)
        
        debugFn (itr, sigT, "sigT", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
#        print("                sigT.det = " + str(la.det(sigT)))
        
        
        # Building Blocks - temporarily replaces means with exp(means)
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        
        # Update the vocabulary
        vocab *= (R.T.dot(expMeans)).T # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        
        # Reset the means to their original form, and log effect of vocab update
        R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
        V = expMeans * R.dot(vocab.T)

        debugFn (itr, vocab, "vocab", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # And now this is the E-Step, though itr's followed by updates for the
        # parameters also that handle the log-sum-exp approximation.
        
        # Update the Variances: var_d = (2 N_d * A + isigT)^{-1}
        varcs = np.reciprocal(docLens[:,np.newaxis] * (K-1.)/K + np.diagonal(sigT))
        debugFn (itr, varcs, "varcs", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        # Update the Means
        rhs = V.copy()
        rhs += docLens[:,np.newaxis] * means.dot(A) + isigT.dot(topicMean)
        rhs -= docLens[:,np.newaxis] * rowwise_softmax(means, out=means)
        if diagonalPriorCov:
            means = varcs * rhs
        else:
            for d in range(D):
                means[d, :] = la.inv(isigT + docLens[d] * A).dot(rhs[d, :])
        
#         means -= (means[:,0])[:,np.newaxis]
        
        debugFn (itr, means, "means", W, K, topicMean, sigT, vocab, vocabPrior, dtype, means, varcs, A, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)
            
            boundValues.append(var_bound(data, modelState, queryState))
            likelyValues.append(log_likelihood(data, modelState, queryState))
            boundIters.append(itr)
            
            print (time.strftime('%X') + " : Iteration %d: bound %f \t Perplexity: %.2f" % (itr, boundValues[-1], perplexity_from_like(likelyValues[-1], docLens.sum())))
            if len(boundValues) > 1:
                if boundValues[-2] > boundValues[-1]:
                    if debug: printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[-2], boundValues[-1]))
        
                # Check to see if the improvement in the bound has fallen below the threshold
                if itr > 100 and len(likelyValues) > 3 \
                    and abs(perplexity_from_like(likelyValues[-1], docLens.sum()) - perplexity_from_like(likelyValues[-2], docLens.sum())) < 1.0:
                    break

    return \
        ModelState(K, topicMean, sigT, vocab, vocabPrior, A, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (np.array(boundIters), np.array(boundValues), np.array(likelyValues))
示例#14
0
    def testMapOnRealData(self):
        dtype = np.float64  # DTYPE

        rd.seed(0xBADB055)
        data = DataSet.from_files(words_file=AclWordPath,
                                  links_file=AclCitePath)
        with open(AclDictPath, "rb") as f:
            dic = pkl.load(f)

        data.convert_to_dtype(dtype)
        data.convert_to_undirected_graph()
        data.convert_to_binary_link_matrix()
        data.prune_and_shuffle(min_doc_len=MinDocLen,
                               min_link_count=MinLinkCount)

        trainData, testData = data.doc_completion_split()

        for pseudoNegCount in (5, 10, 25, 50, 100):
            rd.seed(0xC0FFEE)

            # Initialise the model
            K = TopicCount
            model = rtm.newModelAtRandom(trainData,
                                         K,
                                         dtype=dtype,
                                         pseudoNegCount=data.doc_count *
                                         pseudoNegCount)
            queryState = rtm.newQueryState(trainData, model)
            trainPlan = rtm.newTrainPlan(iterations=50,
                                         logFrequency=LogFreq,
                                         fastButInaccurate=False,
                                         debug=True)

            # Train the model, and the immediately save the result to a file for subsequent inspection
            model, topics, (bndItrs, bndVals,
                            bndLikes) = rtm.train(trainData, model, queryState,
                                                  trainPlan)
            #        with open(newModelFileFromModel(model), "wb") as f:
            #            pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), f)

            # Plot the evolution of the bound during training.
            fig, ax1 = plt.subplots()
            ax1.plot(bndItrs, bndVals, 'b-')
            ax1.set_xlabel('Iterations')
            ax1.set_ylabel('Bound', color='b')

            ax2 = ax1.twinx()
            ax2.plot(bndItrs, bndLikes, 'r-')
            ax2.set_ylabel('Likelihood', color='r')

            fig.show()
            plt.show()

            # Print out the most likely topic words
            # scale = np.reciprocal(1 + np.squeeze(np.array(data.words.sum(axis=0))))
            vocab = rtm.wordDists(model)
            topWordCount = 10
            kTopWordInds = [
                self.topWordInds(vocab[k, :], topWordCount) for k in range(K)
            ]

            like = rtm.log_likelihood(trainData, model, topics)
            perp = perplexity_from_like(like, trainData.word_count)

            # print ("Prior %s" % (str(model.topicPrior)))
            print("Pseudo Neg-Count: %d " % pseudoNegCount)
            print("\tTrain Perplexity: %f\n\n" % perp)

            # for k in range(model.K):
            #     print ("\nTopic %d\n=============================" % k)
            #     print ("\n".join("%-20s\t%0.4f" % (dic[kTopWordInds[k][c]], vocab[k][kTopWordInds[k][c]]) for c in range(topWordCount)))

            min_probs = rtm.min_link_probs(model, topics, testData.links)
            link_probs = rtm.link_probs(model, topics, min_probs)
            try:
                map = mean_average_prec(testData.links, link_probs)
            except:
                print("Unexpected error")

            print("\tThe Mean-Average-Precision is %.3f" % map)
def train (data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape
    
    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype
    
    # Book-keeping for logs
    boundIters  = np.zeros(shape=(iterations // logFrequency,))
    boundValues = np.zeros(shape=(iterations // logFrequency,))
    boundLikes = np.zeros(shape=(iterations // logFrequency,))
    bvIdx = 0
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0
    
    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG) # sort needs to be stable in order to be reversible
    W = W[sortIdx,:] # deep sorted copy
    X = X[sortIdx,:]
    means, varcs = means[sortIdx,:], varcs[sortIdx,:]

    docLens = originalDocLens[sortIdx]
    
    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])
    
    # Initialize some working variables
    R = W.copy()
    
    aI_P = 1./lfv  * ssp.eye(P, dtype=dtype)
    
    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    R_A = R_A.todense()      # dense inverse typically as fast or faster than sparse inverse
    R_A.flat[::F+1] += 1./fv # and the result is usually dense in any case
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")


    diff_m_xa = (means-X.dot(A.T))
    means_cov_with_x_a = diff_m_xa.T.dot(diff_m_xa)

    expMeans = np.zeros((BatchSize, K), dtype=dtype)
    R = np.zeros((BatchSize, K), dtype=dtype)
    S = np.zeros((BatchSize, K), dtype=dtype)
    vocabScale = np.ones(vocab.shape, dtype=dtype)
    
    # Iterate over parameters
    batchIter = 0
    for itr in range(iterations):
        
        # We start with the M-Step, so the parameters are consistent with our
        # initialisation of the RVs when we do the E-Step

        # Update the covariance of the prior
        diff_a_yv = (A-Y.dot(V))
        sigT  = 1./lfv * (Y.dot(Y.T))
        sigT += 1./fv * diff_a_yv.dot(diff_a_yv.T)
        sigT += means_cov_with_x_a
        sigT.flat[::K+1] += varcs.sum(axis=0)

        # As small numbers lead to instable inverse estimates, we use the
        # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these
        # scales whenever we use the inverse of the unscaled covariance
        sigScale  = 1. / (P+D+F)
        isigScale = 1. / sigScale

        isigT = la.inv(sigT)
        debugFn (itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the vocabulary
        vocab *= vocabScale
        vocab += vocabPrior
        vocab = normalizerows_ip(vocab)
        debugFn (itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Finally update the parameter V
        V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        debugFn (itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        
        #
        # And now this is the E-Step
        # 
        
        # Update the distribution on the latent space
        R_Y_base = aI_P + 1/fv * V.dot(V.T)
        R_Y = la.inv(R_Y_base)
        debugFn (itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        Y = 1./fv * A.dot(V.T).dot(R_Y)
        debugFn (itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the mapping from the features to topics
        A = (1./fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        debugFn (itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        # Update the Variances
        varcs = 1./((docLens * (K-1.)/K)[:,np.newaxis] + isigScale * isigT.flat[::K+1])
        debugFn (itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)

        # Faster version?
        vocabScale[:,:] = 0
        means_cov_with_x_a[:,:] = 0
        for lenIdx in range(len(lens)):
            nd         = lens[lenIdx]
            start, end = inds[lenIdx], inds[lenIdx + 1]
            lhs        = la.inv(isigT + sigScale * nd * Ab) * sigScale

            for d in range(start, end, BatchSize):
                end_d = min(d + BatchSize, end)
                span  = end_d - d

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[d:end_d,:], vocab)
                S[:span,:] = expMeans[:span, :] * R.dot(vocab.T)

                # Convert expMeans to a softmax(means)
                expMeans[:span,:] /= expMeans[:span,:].sum(axis=1)[:span,np.newaxis]

                mu   = X[d:end_d,:].dot(A.T)
                rhs  = mu.dot(isigT) * isigScale
                rhs += S[:span,:]
                rhs += docLens[d:end_d,np.newaxis] * means[d:end_d,:].dot(Ab)
                rhs -= docLens[d:end_d,np.newaxis] * expMeans[:span,:] # here expMeans is actually softmax(means)

                means[d:end_d,:] = rhs.dot(lhs) # huh?! Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped

                expMeans[:span,:] = np.exp(means[d:end_d,:] - means[d:end_d,:].max(axis=1)[:span,np.newaxis], out=expMeans[:span,:])
                R = sparseScalarQuotientOfDot(W[d:end_d,:], expMeans[:span,:], vocab, out=R)

                stepSize = (Tau + batchIter) ** -Kappa
                batchIter += 1

                # Do a gradient update of the vocab
                vocabScale += (R.T.dot(expMeans[:span,:])).T
                # vocabScale *= vocab
                # normalizerows_ip(vocabScale)
                # # vocabScale += vocabPrior
                # vocabScale *= stepSize
                # vocab *= (1 - stepSize)
                # vocab += vocabScale

                diff = (means[d:end_d,:] - mu)
                means_cov_with_x_a += diff.T.dot(diff)

#       print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
        debugFn (itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab, docLens)
        
        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues[bvIdx] = var_bound(DataSet(W, feats=X), modelState, queryState, XTX)
            boundLikes[bvIdx]  = log_likelihood(DataSet(W, feats=X), modelState, queryState)
            boundIters[bvIdx]  = itr
            perp = perplexity_from_like(boundLikes[bvIdx], docLens.sum())
            print (time.strftime('%X') + " : Iteration %d: Perplexity %4.0f bound %f" % (itr, perp, boundValues[bvIdx]))
            if bvIdx > 0 and  boundValues[bvIdx - 1] > boundValues[bvIdx]:
                printStderr ("ERROR: bound degradation: %f > %f" % (boundValues[bvIdx - 1], boundValues[bvIdx]))
#           print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

            # Check to see if the improvement in the likelihood has fallen below the threshold
            if bvIdx > 1 and boundIters[bvIdx] > 20:
                lastPerp = perplexity_from_like(boundLikes[bvIdx - 1], docLens.sum())
                if lastPerp - perp < 1:
                    boundIters, boundValues, likelyValues = clamp (boundIters, boundValues, boundLikes, bvIdx)
                    break
            bvIdx += 1
        
    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means       = means[revert_sort,:]
    varcs       = varcs[revert_sort,:]
    docLens     = docLens[revert_sort]
    
    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
示例#16
0
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # TODO Get ride of this via a command-line param
    iterations = max(iterations, 100)

    # Debugging
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # Necessary values
    isigT = la.inv(sigT)

    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Counts of topic assignments
        expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                          out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)

        # the variance
        varcs[:] = 1. / ((n *
                          (K - 1.) / K)[:, np.newaxis] + isigT.flat[::K + 1])
        debugFn(itr, varcs, "query-varcs", W, X, None, F, P, K, A, R_A, fv, Y,
                R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab,
                n)

        # Update the Means
        rhs = X.dot(A.T).dot(isigT)
        rhs += S
        rhs += n[:, np.newaxis] * means.dot(Ab)
        rhs -= n[:, np.newaxis] * rowwise_softmax(means, out=means)

        # Long version
        inverses = dict()
        for d in range(D):
            if not n[d] in inverses:
                inverses[n[d]] = la.inv(isigT + n[d] * Ab)
            lhs = inverses[n[d]]
            means[d, :] = lhs.dot(rhs[d, :])
        debugFn(itr, means, "query-means", W, X, None, F, P, K, A, R_A, fv, Y,
                R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab,
                n)

        like = log_likelihood(data, modelState,
                              QueryState(means, expMeans, varcs, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, queryState  # query vars altered in-place
示例#17
0
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the actual CTM model
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
                 
    Return:
    A new model object with the updated model (note parameters are
    updated in place, so make a defensive copy if you want itr)
    A new query object with the update query parameters
    '''
    W, X = data.words, data.feats
    D, _ = W.shape

    # Unpack the the structs, for ease of access and efficiency
    iterations, epsilon, logFrequency, fastButInaccurate, debug = trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    means, expMeans, varcs, docLens = queryState.means, queryState.expMeans, queryState.varcs, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, Ab, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.Ab, modelState.dtype

    # Book-keeping for logs
    boundIters, boundValues, boundLikes = [], [], []
    debugFn = _debug_with_bound if debug else _debug_with_nothing
    _debug_with_bound.old_bound = 0

    # For efficient inference, we need a separate covariance for every unique
    # document length. For products to execute quickly, the doc-term matrix
    # therefore needs to be ordered in ascending terms of document length
    originalDocLens = docLens
    sortIdx = np.argsort(docLens, kind=STABLE_SORT_ALG
                         )  # sort needs to be stable in order to be reversible
    W = W[sortIdx, :]  # deep sorted copy
    X = X[sortIdx, :]
    means, varcs = means[sortIdx, :], varcs[sortIdx, :]

    docLens = originalDocLens[sortIdx]

    lens, inds = np.unique(docLens, return_index=True)
    inds = np.append(inds, [W.shape[0]])

    # Initialize some working variables
    R = W.copy()

    aI_P = 1. / lfv * ssp.eye(P, dtype=dtype)

    print("Creating posterior covariance of A, this will take some time...")
    XTX = X.T.dot(X)
    R_A = XTX
    leastSquares = lambda feats, targets: la.lstsq(
        feats, targets, lapack_driver="gelsy")[0].T
    if ssp.issparse(
            R_A):  # dense inverse typically as fast or faster than sparse
        R_A = to_dense_array(
            R_A)  # inverse and the result is usually dense in any case
        leastSquares = lambda feats, targets: np.array(
            [ssp.linalg.lsqr(feats, targets[:, k])[0] for k in range(K)])
    R_A.flat[::F + 1] += 1. / fv
    R_A = la.inv(R_A)
    print("Covariance matrix calculated, launching inference")

    priorSigt_diag = np.ndarray(shape=(K, ), dtype=dtype)
    priorSigt_diag.fill(0.001)

    # Iterate over parameters
    for itr in range(iterations):
        A = leastSquares(X, means)
        diff_a_yv = (A - Y.dot(V))

        for _ in range(10):  #(50 if itr == 0 else 1):
            # Update the covariance of the prior
            diff_m_xa = (means - X.dot(A.T))

            sigT = 1. / lfv * (Y.dot(Y.T))
            sigT += 1. / fv * diff_a_yv.dot(diff_a_yv.T)
            sigT += diff_m_xa.T.dot(diff_m_xa)
            sigT.flat[::K + 1] += varcs.sum(axis=0)

            # As small numbers lead to instable inverse estimates, we use the
            # fact that for a scalar a, (a .* X)^-1 = 1/a * X^-1 and use these
            # scales whenever we use the inverse of the unscaled covariance
            sigScale = 1. / (P + D + F)
            isigScale = 1. / sigScale

            isigT = la.inv(sigT)
            debugFn(itr, sigT, "sigT", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y,
                    lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, Ab,
                    docLens)

            # Update the vocabulary
            vocab *= (
                R.T.dot(expMeans)
            ).T  # Awkward order to maintain sparsity (R is sparse, expMeans is dense)
            vocab += vocabPrior
            vocab = normalizerows_ip(vocab)

            # Reset the means to their original form, and log effect of vocab update
            R = sparseScalarQuotientOfDot(W, expMeans, vocab, out=R)
            S = expMeans * R.dot(vocab.T)
            debugFn(itr, vocab, "vocab", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            # Update the Variances
            varcs = 1. / ((docLens * (K - 1.) / K)[:, np.newaxis] +
                          isigScale * isigT.flat[::K + 1])
            debugFn(itr, varcs, "varcs", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            # Update the Means
            rhs = X.dot(A.T).dot(isigT) * isigScale
            rhs += S
            rhs += docLens[:, np.newaxis] * means.dot(Ab)
            rhs -= docLens[:, np.newaxis] * rowwise_softmax(means, out=means)

            # Faster version?
            for lenIdx in range(len(lens)):
                nd = lens[lenIdx]
                start, end = inds[lenIdx], inds[lenIdx + 1]
                lhs = la.inv(isigT + sigScale * nd * Ab) * sigScale

                means[start:end, :] = rhs[start:end, :].dot(
                    lhs
                )  # huh?! Left and right refer to eqn for a single mean: once we're talking a DxK matrix it gets swapped

    #       print("Vec-Means: %f, %f, %f, %f" % (means.min(), means.mean(), means.std(), means.max()))
            debugFn(itr, means, "means", W, X, XTX, F, P, K, A, R_A, fv, Y,
                    R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs,
                    Ab, docLens)

            expMeans = np.exp(means - means.max(axis=1)[:, np.newaxis],
                              out=expMeans)

        # for _ in range(150):
        #     # Finally update the parameter V
        #     V = la.inv(sigScale * R_Y + Y.T.dot(isigT).dot(Y)).dot(Y.T.dot(isigT).dot(A))
        #     debugFn(itr, V, "V", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)
        #
        #     # Update the distribution on the latent space
        #     R_Y_base = aI_P + 1 / fv * V.dot(V.T)
        #     R_Y = la.inv(R_Y_base)
        #     debugFn(itr, R_Y, "R_Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype,
        #             means, varcs, Ab, docLens)
        #
        #     Y = 1. / fv * A.dot(V.T).dot(R_Y)
        #     debugFn(itr, Y, "Y", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)
        #
        #     # Update the mapping from the features to topics
        #     A = (1. / fv * Y.dot(V) + (X.T.dot(means)).T).dot(R_A)
        #     debugFn(itr, A, "A", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means,
        #             varcs, Ab, docLens)

        if logFrequency > 0 and itr % logFrequency == 0:
            modelState = ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V,
                                    sigT * sigScale, vocab, vocabPrior, Ab,
                                    dtype, MODEL_NAME)
            queryState = QueryState(means, expMeans, varcs, docLens)

            boundValues.append(
                var_bound(DataSet(W, feats=X), modelState, queryState, XTX))
            boundLikes.append(
                log_likelihood(DataSet(W, feats=X), modelState, queryState))
            boundIters.append(itr)
            perp = perplexity_from_like(boundLikes[-1], docLens.sum())
            print(
                time.strftime('%X') +
                " : Iteration %d: Perplexity %4.0f bound %f" %
                (itr, perp, boundValues[-1]))
            if len(boundIters) >= 2 and boundValues[-2] > boundValues[-1]:
                printStderr("ERROR: bound degradation: %f > %f" %
                            (boundValues[-2], boundValues[-1]))


#           print ("Means: min=%f, avg=%f, max=%f\n\n" % (means.min(), means.mean(), means.max()))

# Check to see if the improvement in the likelihood has fallen below the threshold
            if len(boundIters) > 2 and boundIters[-1] > 20:
                lastPerp = perplexity_from_like(boundLikes[-2], docLens.sum())
                if lastPerp - perp < 1:
                    break

    revert_sort = np.argsort(sortIdx, kind=STABLE_SORT_ALG)
    means = means[revert_sort, :]
    varcs = varcs[revert_sort, :]
    docLens = docLens[revert_sort]

    return \
        ModelState(F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT * sigScale, vocab, vocabPrior, Ab, dtype, MODEL_NAME), \
        QueryState(means, expMeans, varcs, docLens), \
        (boundIters, boundValues, boundLikes)
示例#18
0
def train(data, modelState, queryState, trainPlan, query=False):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the actual LDA model. In a training run (query = False) this
                 will be mutated in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated in-place
                 and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
    query      -

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug
    W_list, docLens, q_n_dk, q_n_kt, q_n_k, z_dnk = \
        queryState.W_list, queryState.docLens, queryState.n_dk, queryState.n_kt, queryState.n_k, queryState.z_dnk
    K, topicPrior_, vocabPrior, m_n_dk, m_n_kt, m_n_k = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.n_dk, modelState.n_kt, modelState.n_k
    topicPrior = topicPrior_.mean()

    D_train = 0 if m_n_dk is None else m_n_dk.shape[0]
    D_query = q_n_dk.shape[0]
    W = data.words
    T = W.shape[1]

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )

    # Book-keeping for logs
    logPoints = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters = []
    boundValues = []
    likelyValues = []

    # Early stopping check
    finishedTraining = False

    # Add the model counts (essentially the learnt model parameters) to those for
    # the query, assuming the model has been trained previously
    if m_n_dk is not None:
        np.add(q_n_kt, m_n_kt, out=q_n_kt)  # q_n_kt += m_n_kt
        np.add(q_n_k, m_n_k, out=q_n_k)  # q_n_k  += m_n_k


#     print ("Topic prior : " + str(topicPrior))

# Select the training iterations function appropriate for the dtype
    if debug: print("Starting Training")
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64

    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    for segment in range(logPoints - 1):
        do_iterations (segIters, D_query, D_train, K, T, \
                       W_list, docLens, \
                       q_n_dk, q_n_kt, q_n_k, z_dnk,\
                       topicPrior, vocabPrior)

        # Measure and record the improvement to the bound and log-likely
        boundIters.append(segment * segIters)
        boundValues.append(
            var_bound_intermediate(data, modelState, queryState, q_n_kt,
                                   q_n_k))
        likelyValues.append(
            log_likely_intermediate(data, modelState, queryState, q_n_kt,
                                    q_n_k))

        # Check to see if the improvement in the bound has fallen below the threshold
        perp = perplexity_from_like(likelyValues[-1], W.sum())
        print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" %
              (segment * segIters, perp, boundValues[-1]))

        if len(boundIters) > 2 and (boundIters[-1] > 50):
            lastPerp = perplexity_from_like(likelyValues[-2], W.sum())
            if lastPerp - perp < 1:
                finishedTraining = True
                print("Converged, existing early")
                break

    # Final scheduled batch of iterations if we haven't already converged.
    if not finishedTraining:
        do_iterations (remainder, D_query, D_train, K, T, \
                   W_list, docLens, \
                   q_n_dk, q_n_kt, q_n_k, z_dnk,\
                   topicPrior, vocabPrior)

        boundIters.append(iterations - 1)
        boundValues.append(
            var_bound_intermediate(data, modelState, queryState, q_n_kt,
                                   q_n_k))
        likelyValues.append(
            log_likely_intermediate(data, modelState, queryState, q_n_kt,
                                    q_n_k))

    # Now return the results
    if query:  # Model is unchanged, query is changed
        if m_n_dk is not None:
            np.subtract(q_n_kt, m_n_kt, out=q_n_kt)  # q_n_kt -= m_n_kt
            np.subtract(q_n_k, m_n_k, out=q_n_k)  # q_n_k  -= m_n_k
    else:  # train # Model is changed (or flat-out created). Query is changed
        if m_n_dk is not None:  # Amend existing
            m_n_dk = np.vstack((m_n_dk, q_n_dk))
            m_n_kt[:, :] = q_n_kt
            m_n_k[:] = q_n_k
        else:  # Create from scratch
            m_n_dk = q_n_dk.copy()
            m_n_kt = q_n_kt.copy()
            m_n_k = q_n_k.copy()

    return ModelState(K, topicPrior, vocabPrior, m_n_dk, m_n_kt, m_n_k, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, q_n_dk, q_n_kt, q_n_k, z_dnk), \
           (boundIters, boundValues, likelyValues)
示例#19
0
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data - the dataset of words, features and links of which only words are used in this model
    modelState - the actual LDA model. In a training run (query = False) this
                 will be mutated in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated in-place
                 and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)
    query      -

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize, rate_retardation, forgetting_rate = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug, \
        trainPlan.batchSize, trainPlan.rate_retardation, trainPlan.forgetting_rate
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W = data.words
    D, T = W.shape

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )

    # Book-keeping for logs
    logPoints = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters = np.zeros(shape=(logPoints, ))
    boundValues = np.zeros(shape=(logPoints, ))
    likelyValues = np.zeros(shape=(logPoints, ))
    bvIdx = 0

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e for the memberships z which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    z_dnk = np.empty((docLens.max(), K), dtype=dtype, order='F')

    # Select the training iterations function appropriate for the dtype
    current_micro_time = lambda: int(time.time())
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64
    #    do_iterations = iterate # pure Python

    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    totalItrs = 0
    for segment in range(logPoints - 1):
        start = current_micro_time()
        totalItrs += do_iterations (segIters, \
                 batchSize, segment * segIters, rate_retardation, forgetting_rate, \
                 D, K, T, \
                 W_list, docLens, \
                 topicPrior, vocabPrior, \
                 z_dnk, topicDists, wordDists)

        duration = current_micro_time() - start

        boundIters[bvIdx] = segment * segIters
        boundValues[bvIdx] = var_bound(data, modelState, queryState)
        likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
        perp = perplexity_from_like(likelyValues[bvIdx], W.sum())
        bvIdx += 1

        if converged(boundIters, boundValues, bvIdx, epsilon, minIters=20):
            boundIters, boundValues, likelyValues = clamp(
                boundIters, boundValues, likelyValues, bvIdx)
            return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
                QueryState(W_list, docLens, topicDists), \
                (boundIters, boundValues, likelyValues)

        print(
            "Segment %d/%d Total Iterations %d Duration %d Perplexity %4.0f Bound %10.2f Likelihood %10.2f"
            % (segment, logPoints, totalItrs, duration, perp,
               boundValues[bvIdx - 1], likelyValues[bvIdx - 1]))

    # Final batch of iterations.
    do_iterations (remainder, D, K, T, \
                 W_list, docLens, \
                 topicPrior, vocabPrior, \
                 z_dnk, topicDists, wordDists)

    boundIters[bvIdx] = iterations - 1
    boundValues[bvIdx] = var_bound(data, modelState, queryState)
    likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)


    return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, topicDists), \
           (boundIters, boundValues, likelyValues)
示例#20
0
def query(data, modelState, queryState, queryPlan):
    '''
    Given a _trained_ model, attempts to predict the topics for each of
    the inputs.
    
    Params:
    data - the dataset of words, features and links of which only words and
           features are used in this model
    modelState - the _trained_ model
    queryState - the query state generated for the query dataset
    queryPlan  - used in this case as we need to tighten up the approx
    
    Returns:
    The model state and query state, in that order. The model state is
    unchanged, the query is.
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = queryPlan.iterations, queryPlan.epsilon, queryPlan.logFrequency, queryPlan.fastButInaccurate, queryPlan.debug
    means, expMeans, varcs, lxi, s, n = queryState.means, queryState.expMeans, queryState.varcs, queryState.lxi, queryState.s, queryState.docLens
    F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype = modelState.F, modelState.P, modelState.K, modelState.A, modelState.R_A, modelState.fv, modelState.Y, modelState.R_Y, modelState.lfv, modelState.V, modelState.sigT, modelState.vocab, modelState.vocabPrior, modelState.dtype

    # Necessary temp variables (notably the count of topic to word assignments
    # per topic per doc)
    isigT = la.inv(sigT)
        
    W,X = data.words, data.feats
        
    # Enable logging or not. If enabled, we need the inner product of the feat matrix
    if debug:
        XTX = X.T.dot(X)
        debugFn = _debug_with_bound
        _debug_with_bound.old_bound=0
    else:
        XTX = None
        debugFn = _debug_with_nothing
    
    # Iterate over parameters
    lastPerp = 1E+300 if dtype is np.float64 else 1E+30
    for itr in range(iterations):
        # Estimate Z_dvk
        expMeans = np.exp(means - means.max(axis=1)[:,np.newaxis], out=expMeans)
        R = sparseScalarQuotientOfDot(W, expMeans, vocab)
        S = expMeans * R.dot(vocab.T)
        
        # Update the Means
        vMat   = (2  * s[:,np.newaxis] * lxi - 0.5) * n[:,np.newaxis] + S
        rhsMat = vMat + X.dot(A.T).dot(isigT) # TODO Verify this
        lhsMat = np.reciprocal(np.diag(isigT)[np.newaxis,:] + n[:,np.newaxis] * 2 * lxi)  # inverse of D diagonal matrices...
        means = lhsMat * rhsMat # as LHS is a diagonal matrix for all d, it's equivalent
                                # to doing a hadamard product for all d
        debugFn (itr, means, "query-means", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the Variances
        varcs = 1./(2 * n[:,np.newaxis] * lxi + isigT.flat[::K+1])
        debugFn (itr, varcs, "query-varcs", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # Update the approximation parameters
        lxi = ctm.negJakkolaOfDerivedXi(means, varcs, s)
        debugFn (itr, lxi, "query-lxi", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)
        
        # s can sometimes grow unboundedly
        # Follow Bouchard's suggested approach of fixing it at zero
        #
#         s = (np.sum(lxi * means, axis=1) + 0.25 * K - 0.5) / np.sum(lxi, axis=1)
#         debugFn (itr, s, "s", W, X, XTX, F, P, K, A, R_A, fv, Y, R_Y, lfv, V, sigT, vocab, vocabPrior, dtype, means, varcs, lxi, s, n)

        like = log_likelihood(data, modelState, QueryState(means, expMeans, varcs, lxi, s, n))
        perp = perplexity_from_like(like, data.word_count)
        if itr > 20 and lastPerp - perp < 1:
            break
        lastPerp = perp

    return modelState, QueryState (means, expMeans, varcs, lxi, s, n)
示例#21
0
def _old_train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    qyery - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.batchSize
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists ,dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )
    assert model.dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W = data.words
    D, T = W.shape

    iters, bnds, likes = [], [], []

    # A few parameters for handling adaptive step-sizes in SGD
    grad = 0
    grad_inner = 0
    grad_rate = 1
    log_likely = 0  # complete dataset likelihood for gradient adjustments
    stepSize = np.array([1.] * K, dtype=model.dtype)

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e for the memberships z which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    diWordDistSums = np.empty((K, ), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)
    wordUpdates = wordDists.copy() if batchSize > 0 else None
    batchProcessCount = 0

    # Amend the name if batchSize == 0 implying we're using SGD
    modelName = "lda/svbp/%s" % _sgd_desc(plan) \
                if batchSize > 0 else model.name
    print(modelName)

    for itr in range(iterations):
        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists, out=diWordDists)

        if updateVocab:
            # Perform inference, updating the vocab
            if batchSize == 0:
                wordDists[:, :] = vocabPrior
            else:
                wordUpdates[:, :] = 0

            for d in range(D):
                batchProcessCount += 1
                #if debug and d % 100 == 0: printAndFlushNoNewLine(".")
                wordIdx, z = _update_topics_at_d(d, data, docLens, topicMeans,
                                                 topicPrior, diWordDists,
                                                 diWordDistSums)
                wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

                if plan.rate_algor == RateAlgorAmaria:
                    log_likely += 0
                elif plan.rate_algor == RateAlgorVariance:
                    g = wordDists.mean(axis=0) + vocabPrior
                    grad *= (1 - grad_rate)
                    grad += grad_rate * wordDists
                    grad += grad_rate * vocabPrior
                    gg += 0
                elif plan.rate_algor != RateAlgorTimeKappa:
                    raise ValueError("Unknown rate algorithm " +
                                     str(plan.rate_algor))

                if batchSize > 0 and batchProcessCount == batchSize:
                    batch_index = (
                        itr * D + d
                    ) / batchSize  #TODO  Will not be right if batchSize is not a multiple of D
                    stepSize = _step_sizes(stepSize, batch_index, g, gg,
                                           log_likely, plan)
                    wordDists *= (1 - stepSize)
                    wordDists += stepSize * vocabPrior

                    stepSize *= float(D) / batchSize
                    wordUpdates *= stepSize
                    wordDists += wordUpdates

                    diWordDistSums[:] = wordDists.sum(axis=1)
                    fns.digamma(diWordDistSums, out=diWordDistSums)
                    fns.digamma(wordDists, out=diWordDists)

                    wordUpdates[:, :] = 0
                    batchProcessCount = 0
                    log_likely = 0

                    if debug:
                        bnds.append(_var_bound_internal(data, model, query))
                        likes.append(
                            _log_likelihood_internal(data, model, query))

                        perp = perplexity_from_like(likes[-1], W.sum())
                        print(
                            "Iteration %d, after %d docs: Train Perp = %4.0f  Bound = %.3f"
                            % (itr, batchSize, perp, bnds[-1]))
                        sys.stdout.flush()

            # Log bound and the determine if we can stop early
            if itr % logFrequency == 0 or debug:
                iters.append(itr)
                bnds.append(_var_bound_internal(data, model, query))
                likes.append(_log_likelihood_internal(data, model, query))

                perp = perplexity_from_like(likes[-1], W.sum())
                print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" %
                      (itr, perp, bnds[-1]))

                if len(iters) > 2 and (iters[-1] > 20 or
                                       (iters[-1] > 2 and batchSize > 0)):
                    lastPerp = perplexity_from_like(likes[-2], W.sum())
                    if lastPerp - perp < 1:
                        print("Converged, existing early")
                        break

            # Update hyperparameters (do this after bound, to make sure bound
            # calculation is internally consistent)
            if HyperUpdateEnabled and itr > 0 and itr % HyperParamUpdateInterval == 0:
                if debug: print("Topic Prior was " + str(topicPrior))
                _updateTopicHyperParamsFromMeans(model, query)
                if debug: print("Topic Prior is now " + str(topicPrior))
        else:
            for d in range(D):
                _ = _update_topics_at_d(d, data, docLens, topicMeans,
                                        topicPrior, diWordDists,
                                        diWordDistSums)

    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), \
           QueryState(docLens, topicMeans, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
示例#22
0
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    qyery - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the update query parameters
    '''

    iterations, epsilon, logFrequency, fastButInaccurate, debug, burnIn, thinning = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.burnIn, plan.thinning
    docLens, topicDists = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    W = data.words
    D, T = W.shape

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    iters, bnds, likes = [], [], []

    sampleCount = 0
    wordDistSamples = np.zeros((K, T), dtype=np.float64)
    topicDistSamples = np.zeros((D, K), dtype=np.float64)

    for itr in range(plan.iterations + plan.burnIn):
        topicDists = sample_memberships(W, topicPrior, wordDists, topicDists)
        wordDists = sample_dirichlet(W, vocabPrior, topicDists, wordDists)

        if is_sampling_iteration(itr, plan):
            wordDistSamples += wordDists
            topicDistSamples += topicDists
            sampleCount += 1

        if debug or (logFrequency > 0 and itr % logFrequency == 0):
            m = ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype,
                           model.name)
            q = QueryState(query.docLens, topicDists, True)

            iters.append(itr)
            bnds.append(var_bound(data, m, q))
            likes.append(log_likelihood(data, m, q))

            perp = perplexity_from_like(likes[-1], W.sum())
            print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" %
                  (itr, perp, bnds[-1]))

            # if len(iters) > 2 and iters[-1] > 50:
            #     lastPerp = perplexity_from_like(likes[-2], W.sum())
            #     if lastPerp - perp < 1:
            #         break;

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, model.name), \
           QueryState(query.docLens, topicDists, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))