Example #1
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint, and additionally learns the weights
    needed to predict new links.

    Params:
    data  - the dataset, providing the DxT document-term matrix (data.words)
            and the DxD document-link matrix (data.links)
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)
    updateVocab - if True, update the word distributions as well as the
            per-document topic distributions

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    A tuple of (iterations, bound values, log-likelihood values) recorded
    while training
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, weights, negCount, reg, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.weights, model.pseudoNegCount, model.regularizer, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W = data.words
    D, T = W.shape
    X = data.links

    iters, bnds, likes = [], [], []

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate them from scratch each iteration, i.e. for the memberships z,
    # which would be D x N x K in full, we only keep the K x N_d slice returned
    # for the current document.
    z = np.empty((K, ), dtype=dtype, order='F')
    diWordDistSums = np.empty((K, ), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)

    for itr in range(iterations):
        if debug: printAndFlushNoNewLine("\n %4d: " % itr)

        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists, out=diWordDists)

        if updateVocab:
            # Perform inference, updating the vocab
            wordDists[:, :] = vocabPrior
            for d in range(D):
                if debug and d % 100 == 0: printAndFlushNoNewLine(".")
                wordIdx, z = _update_topics_at_d(d, data, weights, docLens,
                                                 topicMeans, topicPrior,
                                                 diWordDists, diWordDistSums)
                wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

            _infer_weights(data, weights, topicMeans, topicPrior, negCount,
                           reg)

            # Log the bound, then determine if we can stop early
            if itr % logFrequency == 0:
                iters.append(itr)
                bnds.append(_var_bound_internal(data, model, query))
                likes.append(_log_likelihood_internal(data, model, query))

                if debug: print("%.3f < %.3f" % (bnds[-1], likes[-1]))
                if converged(iters, bnds, len(bnds) - 1, epsilon, minIters=5):
                    break

            # Update hyperparameters (do this after the bound calculation, to
            # keep the bound internally consistent)
            if itr > 0 and itr % HyperParamUpdateInterval == 0:
                if debug: print("Topic Prior was " + str(topicPrior))
                _updateTopicHyperParamsFromMeans(model, query)
                if debug: print("Topic Prior is now " + str(topicPrior))
        else:
            for d in range(D):
                _ = _update_topics_at_d(d, data, weights, docLens, topicMeans,
                                        topicPrior, diWordDists, diWordDistSums)

    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, weights, negCount, reg, dtype, model.name), \
           QueryState(docLens, topicMeans), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
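
The digamma pair computed at the top of each iteration is the standard Dirichlet expectation E[log phi_kt] = psi(lambda_kt) - psi(sum_t lambda_kt); the subtraction itself presumably happens inside _update_topics_at_d, which is not shown here. Below is a minimal, self-contained sketch of that quantity, assuming only numpy and scipy; the array names mirror the example but nothing else from it is required.

import numpy as np
from scipy.special import digamma

K, T = 4, 10                                    # toy sizes: topics x vocabulary
wordDists = np.random.gamma(1.0, 1.0, (K, T))   # stand-in Dirichlet parameters, one row per topic

# E[log phi_kt] under Dirichlet(lambda_k): psi(lambda_kt) - psi(sum_t lambda_kt)
diWordDistSums = digamma(wordDists.sum(axis=1))              # shape (K,)
diWordDists    = digamma(wordDists)                          # shape (K, T)
expLogWordDists = diWordDists - diWordDistSums[:, np.newaxis]

# exp of these expectations gives the (unnormalised) word scores a document uses
# when re-estimating its per-word topic assignments z
print(np.exp(expLogWordDists).shape)            # (4, 10)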
Example #2
def train(data, modelState, queryState, trainPlan):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint.

    Params:
    data       - the dataset of words, features and links, of which only the
                 words are used in this model
    modelState - the actual LDA model. In a training run this will be mutated
                 in place, and then returned.
    queryState - the query results - essentially all the "local" variables
                 matched to the given observations. This will be mutated
                 in-place and then returned.
    trainPlan  - how to execute the training process (e.g. iterations,
                 log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    A tuple of (bound iterations, bound values, log-likelihood values)
    recorded during training
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize, rate_retardation, forgetting_rate = \
        trainPlan.iterations, trainPlan.epsilon, trainPlan.logFrequency, trainPlan.fastButInaccurate, trainPlan.debug, \
        trainPlan.batchSize, trainPlan.rate_retardation, trainPlan.forgetting_rate
    W_list, docLens, topicDists = \
        queryState.W_list, queryState.docLens, queryState.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        modelState.K, modelState.topicPrior, modelState.vocabPrior, modelState.wordDists, modelState.dtype

    W = data.words
    D, T = W.shape

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )

    # Book-keeping for logs
    logPoints = 1 if logFrequency == 0 else iterations // logFrequency
    boundIters = np.zeros(shape=(logPoints, ))
    boundValues = np.zeros(shape=(logPoints, ))
    likelyValues = np.zeros(shape=(logPoints, ))
    bvIdx = 0

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate them from scratch each iteration, i.e. for the memberships z,
    # which would be D x N x K in full, we only store a single N x K slab that is
    # reused across documents.
    z_dnk = np.empty((docLens.max(), K), dtype=dtype, order='F')

    # Select the training iterations function appropriate for the dtype
    current_time_secs = lambda: int(time.time())  # wall-clock time in whole seconds
    do_iterations = compiled.iterate_f32 \
                    if modelState.dtype == np.float32 \
                    else compiled.iterate_f64
    #    do_iterations = iterate # pure Python

    # Iterate in segments, pausing to take measures of the bound / likelihood
    segIters = logFrequency
    remainder = iterations - segIters * (logPoints - 1)
    totalItrs = 0
    for segment in range(logPoints - 1):
        start = current_time_secs()
        totalItrs += do_iterations (segIters, \
                 batchSize, segment * segIters, rate_retardation, forgetting_rate, \
                 D, K, T, \
                 W_list, docLens, \
                 topicPrior, vocabPrior, \
                 z_dnk, topicDists, wordDists)

        duration = current_time_secs() - start

        boundIters[bvIdx] = segment * segIters
        boundValues[bvIdx] = var_bound(data, modelState, queryState)
        likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)
        perp = perplexity_from_like(likelyValues[bvIdx], W.sum())
        bvIdx += 1

        if converged(boundIters, boundValues, bvIdx, epsilon, minIters=20):
            boundIters, boundValues, likelyValues = clamp(
                boundIters, boundValues, likelyValues, bvIdx)
            return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
                QueryState(W_list, docLens, topicDists), \
                (boundIters, boundValues, likelyValues)

        print(
            "Segment %d/%d Total Iterations %d Duration %d Perplexity %4.0f Bound %10.2f Likelihood %10.2f"
            % (segment, logPoints, totalItrs, duration, perp,
               boundValues[bvIdx - 1], likelyValues[bvIdx - 1]))

    # Final batch of iterations.
    do_iterations (remainder, \
                 batchSize, (logPoints - 1) * segIters, rate_retardation, forgetting_rate, \
                 D, K, T, \
                 W_list, docLens, \
                 topicPrior, vocabPrior, \
                 z_dnk, topicDists, wordDists)

    boundIters[bvIdx] = iterations - 1
    boundValues[bvIdx] = var_bound(data, modelState, queryState)
    likelyValues[bvIdx] = log_likelihood(data, modelState, queryState)


    return ModelState(K, topicPrior, vocabPrior, wordDists, modelState.dtype, modelState.name), \
           QueryState(W_list, docLens, topicDists), \
           (boundIters, boundValues, likelyValues)
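
The trainPlan fields rate_retardation and forgetting_rate are passed straight through to the compiled iteration routine, whose source is not shown; the names suggest the Robbins-Monro step size rho_t = (tau + t)^(-kappa) used in online/stochastic variational LDA (Hoffman et al.), with rate_retardation playing the role of tau and forgetting_rate the role of kappa. The sketch below illustrates that schedule under this assumption only.

import numpy as np

def svi_step_size(t, rate_retardation, forgetting_rate):
    """Robbins-Monro step size rho_t = (tau + t) ** (-kappa)."""
    return (rate_retardation + t) ** (-forgetting_rate)

# With forgetting_rate in (0.5, 1] the usual convergence conditions hold:
# sum_t rho_t diverges while sum_t rho_t^2 stays finite.
rhos = [svi_step_size(t, rate_retardation=1.0, forgetting_rate=0.7) for t in range(5)]
print(np.round(rhos, 3))    # a decreasing sequence starting at 1.0

# Each mini-batch would then blend the old and new estimates of the global
# parameters, e.g. wordDists = (1 - rho_t) * wordDists + rho_t * batchEstimate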
Example #3
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint, and additionally learns the weights
    needed to predict new links.

    Params:
    data  - the dataset, providing the DxT document-term matrix (data.words),
            the DxD document-link matrix (data.links) and the document
            features (data.feats)
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)
    updateVocab - not used by this variant; the word distributions are
            always updated

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    A tuple of (iterations, bound values, log-likelihood values) recorded
    while training
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug
    docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc, exp_tsums_bytop, lse_at_k, out_counts, in_counts = \
        query.docLens, query.topics, query.postTopicCov, query.U, query.V, query.tsums_bydoc, query.tsums_bytop, query.exp_tsums_bydoc, query.exp_tsums_bytop, query.lse_at_k, query.out_counts, query.in_counts
    K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, name = \
     model.K, model.Q, model.topicPrior, model.vocabPrior, model.wordDists, model.topicCov, model.dtype, model.name

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )
    assert dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    W, L, X_fixme = data.words, data.links, data.feats
    D, T = W.shape

    iters, bnds, likes = [], [], []

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate them from scratch each iteration; only scratch buffers for the
    # digammas of the word-distribution parameters are kept between documents.
    diWordDistSums = np.empty((K, ), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)

    new_in_counts = in_counts.copy()

    newCov = np.zeros(shape=(K, K), dtype=model.dtype)
    invCov = la.inv(topicCov)
    S = topicCov.copy()
    rhs = np.empty(shape=(K, ), dtype=model.dtype)
    b, f = rhs.copy(), rhs.copy()  # linear coefficients of the Böhning quadratic bounds
    new_maxes_bytop = np.empty(shape=(K, ), dtype=model.dtype)
    maxes_bytop = topics.max(axis=0)

    for itr in range(iterations):
        if itr % logFrequency == 0:
            iters.append(itr)
            bnds.append(_var_bound_internal(data, model, query))
            likes.append(_log_likelihood_internal(data, model, query))

            if debug:
                print(
                    "Bound : %f \t Likelihood %f  \t Perplexity %.2f" %
                    (bnds[-1], likes[-1], np.exp(-likes[-1] / docLens.sum())))
            if converged(iters, bnds, len(bnds) - 1, epsilon, minIters=5):
                break
        if debug: printAndFlushNoNewLine("\n %4d: " % itr)

        newCov[:, :] = 0
        new_in_counts[:] = 0

        # U and V FIXME DEBUG
        # U[:, :] = la.lstsq(V.T, topics.T)[0].T
        # V[:, :] = la.lstsq(U, topics)[0]

        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists, out=diWordDists)

        wordDists[:, :] = vocabPrior
        new_maxes_bytop.fill(1E-300)
        for d in range(D):
            if d % 100 == 0:
                printAndFlushNoNewLine(".")
            wordIdx, z, linkIdx, y = _update_topics_at_d(
                d, data, docLens, topics, topicPrior, lse_at_k, diWordDists,
                diWordDistSums)

            # Update the word distributions
            wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

            # Determine the topic distribution
            # Step 1, the covariance
            S[:, :] = invCov
            S[np.diag_indices_from(S)] += docLens[d] + out_counts[d]
            S[:, :] -= 1. / (K + 1)
            S[np.diag_indices_from(S)] += (K - 1.) / K * in_counts
            S = la.inv(S)

            # Topics Step 2, the actual right-hand side
            rhs[:] = invCov.dot(U[d, :].dot(V))
            rhs += (z * W[d, :].data[np.newaxis, :]).sum(axis=1)
            ysum = (y * L[d, :].data[:, np.newaxis]).sum(axis=0)
            #rhs    += ysum

            b[:] = topics[d, :] - 1. / (K + 1) * topics[d, :].sum() - softmax(
                topics[d, :])
            b *= docLens[d]
            rhs += b

            f[:] = topics[d, :] - 1. / (D + 1) * tsums_bytop - np.exp(
                topics[d, :] - maxes_bytop) / exp_tsums_bytop
            f *= in_counts
            rhs += f

            rhs[:] += (D - 1) / (2 * D + 2) * (in_counts *
                                               (tsums_bytop - topics[d, :]))

            # Topics Step 3: solve
            new_topics = S.dot(rhs)

            # Topics Step 4: update the running counts and covariance, then assign the new topics to "topics"
            tsums_bytop -= topics[d, :]
            tsums_bytop += new_topics

            new_maxes_bytop = np.maximum(new_maxes_bytop, new_topics)
            new_in_counts += ysum

            vec = new_topics - U[d, :].dot(V)
            newCov += np.outer(vec, vec)
            newCov[np.diag_indices_from(newCov)] += np.diag(S)  # posterior variances go on the diagonal

            topics[d, :] = new_topics

            # Next step is the posterior covariance
            postTopicCov[d, :] = np.diag(S)

        # The covariance hyper-parameter
        topicCov[:, :] = newCov
        invCov[:, :] = la.inv(topicCov)

        # The remaining running counts, and the column-wise softmax adjustment
        maxes_bytop[:] = new_maxes_bytop
        in_counts[:] = new_in_counts
        exp_tsums_bytop[:] = np.sum(np.exp(topics -
                                           maxes_bytop[np.newaxis, :]),
                                    axis=0)


    return ModelState(K, Q, topicPrior, vocabPrior, wordDists, topicCov, dtype, model.name), \
           QueryState(docLens, topics, postTopicCov, U, V, tsums_bydoc, tsums_bytop, exp_tsums_bydoc, exp_tsums_bytop, lse_at_k, out_counts, in_counts), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
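
The b and f vectors above are labelled as the linear coefficients of the Böhning quadratic bound on the softmax normaliser. For reference, the sketch below constructs the bound's constant curvature matrix A = 0.5 * (I - ones*ones^T / (K + 1)) and shows how N observations in a document contribute N * A to that document's Gaussian precision, which is then inverted to give the posterior covariance S. The scaling inside the loop above absorbs some constants differently, so treat this as an illustration of the bound rather than a drop-in replacement.

import numpy as np

def bohning_curvature(K):
    """Bohning's curvature matrix A = 0.5 * (I - ones*ones^T / (K + 1)),
    a constant positive-definite upper bound on the Hessian of log-sum-exp."""
    return 0.5 * (np.eye(K) - np.ones((K, K)) / (K + 1))

K = 4
A = bohning_curvature(K)

# N observed words (or links) add N * A to the prior precision of the
# document's topic vector; inverting gives the posterior covariance.
N = 120
invCovPrior = np.eye(K)                  # stand-in for la.inv(topicCov)
precision = invCovPrior + N * A
S = np.linalg.inv(precision)
print(np.diag(S))                        # per-topic posterior variances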