def train (data, model, query, trainPlan, isQuery=False):
    '''
    Builds a per-document representation, either TF-IDF weighted word
    counts or the element-wise square root of LDA topic distributions.

    Params:
    :param data: the dataset; data.words (document x word matrix) and
                 data.doc_count are read here
    :param model: the model state; its ldaModel, method, dtype and name
                  fields are read
    :param query: the query state; its ldaTopics field is reused when the
                  method is LDA and the topics are already processed
    :param trainPlan: supplies iterations, logFrequency and debug, which are
                      forwarded to the LDA train plan
    :param isQuery: if True (LDA method only) topics are inferred with
                    lda.query() instead of lda.train()

    Return:
    A new ModelState, a new QueryState holding the representations and the
    LDA topics, and a placeholder tuple ([0], [0], [0]) standing in for the
    iteration / bound / likelihood measurements, which are not recorded here.

    Raises:
    ValueError if model.method is neither TF_IDF nor LDA.
    '''
    iterations, logFrequency, debug = \
        trainPlan.iterations, trainPlan.logFrequency, trainPlan.debug
    ldaModel, method, dtype, modelName = \
        model.ldaModel, model.method, model.dtype, model.name
    ldaTopics = query.ldaTopics

    # K is always taken from the wrapped LDA model
    D, K = data.doc_count, ldaModel.K

    if method == TF_IDF:
        # Term frequency: scale every row by the reciprocal of its document
        # length. A diagonal-matrix product is used instead of broadcasting
        # so the result stays sparse.
        docLens = np.squeeze(np.array(data.words.sum(axis=1)))
        reps = ssp.diags(np.reciprocal(docLens), 0).dot(data.words)

        # Inverse document frequency, with add-one smoothing on the counts.
        # The builtin bool is used as np.bool was removed in NumPy 1.24
        occ = data.words.astype(bool).astype(dtype)
        docCount = np.squeeze(np.array(occ.sum(axis=0)))
        docCount += 1
        idf = np.log(D / docCount)

        # Scale every column by its IDF weight, again retaining sparsity
        reps = reps.dot(ssp.diags(idf, 0))
    elif method == LDA:
        plan = lda.newTrainPlan(iterations, logFrequency=logFrequency, debug=debug)
        if isQuery:
            _, ldaTopics = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), plan)
        elif ldaTopics is None or not ldaTopics.processed:
            ldaModel, ldaTopics, (_, _, _) = lda.train(data, ldaModel, lda.newQueryState(data, ldaModel), plan)

        reps = np.sqrt(lda.topicDists(ldaTopics))
    else:
        raise ValueError("Unknown method %s" % method)

    return ModelState(ldaModel, K, method, dtype, modelName), \
           QueryState(reps, ldaTopics), \
           ([0], [0], [0])
def testCrossValPerplexityOnRealDataWithLdaInc(self):
    '''
    Cross-validates LDA on the ACL dataset for a range of topic counts,
    printing one CSV-style line of per-fold train perplexities and one of
    per-fold query perplexities (each followed by its mean) per topic count.
    '''
    ActiveFolds = 3
    dtype = np.float64  # DTYPE

    # Fixed seed so the shuffle and random initialisation are repeatable
    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # One plan for fitting, a shorter one for inferring held-out documents
    trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
    queryPlan = lda.newTrainPlan(iterations=50, logFrequency=5, fastButInaccurate=False, debug=False)

    for K in [30, 35, 40, 45, 50]:
        trainPerps, queryPerps = [], []

        # Only the first ActiveFolds of the NumFolds partitions are evaluated
        for fold in range(ActiveFolds):
            trainData, queryData = data.cross_valid_split(fold, NumFolds)

            model = lda.newModelAtRandom(trainData, K, dtype=dtype)
            trainState = lda.newQueryState(trainData, model)

            # Fit on the training portion and score it
            model, trainResult, (_, _, _) = lda.train(trainData, model, trainState, trainPlan)
            trainLike = lda.log_likelihood(trainData, model, trainResult)
            trainPerps.append(perplexity_from_like(trainLike, trainData.word_count))

            # Document completion: infer topics from one part of each query
            # document, then score the other part
            estData, evalData = queryData.doc_completion_split()
            queryState = lda.newQueryState(estData, model)
            model, queryResult = lda.query(estData, model, queryState, queryPlan)
            queryLike = lda.log_likelihood(evalData, model, queryResult)
            queryPerps.append(perplexity_from_like(queryLike, evalData.word_count))

        # The mean over the folds becomes the last column of each line
        meanTrainPerp = sum(trainPerps) / ActiveFolds
        meanQueryPerp = sum(queryPerps) / ActiveFolds
        trainPerps.append(meanTrainPerp)
        queryPerps.append(meanQueryPerp)

        print("K=%d,Segment=Train,%s" % (K, ",".join(map(str, trainPerps))))
        print("K=%d,Segment=Query,%s" % (K, ",".join(map(str, queryPerps))))
def testOnRealData(self):
    '''
    Trains a 10-topic LDA model on the NIPS dataset, plots the bound and
    likelihood recorded during training, shows the topic-word matrix as an
    image, and prints the topic prior, perplexity and word distributions.
    '''
    dtype = np.float64  # DTYPE

    # Fixed seed so the shuffle and random initialisation are repeatable
    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=NipsWordsPath, links_file=NipsCitePath)
    with open(NipsDictPath, "rb") as dictFile:
        wordDict = pkl.load(dictFile)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=50, min_link_count=0)

    # Reciprocal of (1 + corpus-wide word count), used to down-weight
    # common words when ranking the words of a topic
    wordFreq = np.squeeze(np.asarray(data.words.sum(axis=0)))
    scale = np.reciprocal(1 + wordFreq)

    # Set up the model, its per-document state and the training schedule
    K = 10
    model = lda.newModelAtRandom(data, K, dtype=dtype)
    queryState = lda.newQueryState(data, model)
    trainPlan = lda.newTrainPlan(
        iterations=30,
        logFrequency=2,
        debug=False,
        batchSize=50,
        rate_retardation=1,
        forgetting_rate=0.75)

    model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, queryState, trainPlan)

    # Bound (blue, left axis) and likelihood (red, right axis) by iteration
    fig, boundAxis = plt.subplots()
    boundAxis.plot(bndItrs, bndVals, 'b-')
    boundAxis.set_xlabel('Iterations')
    boundAxis.set_ylabel('Bound', color='b')

    likeAxis = boundAxis.twinx()
    likeAxis.plot(bndItrs, bndLikes, 'r-')
    likeAxis.set_ylabel('Likelihood', color='r')

    fig.show()
    plt.show()

    # Topic-word distributions rendered as a greyscale image
    vocab = lda.wordDists(model)
    plt.imshow(vocab, interpolation="nearest", cmap=cm.Greys_r)
    plt.show()

    # Indices of the top-ranked words for each topic after frequency scaling
    topWordCount = 100
    kTopWordInds = []
    for k in range(K):
        kTopWordInds.append(topWordIndices(vocab[k, :] * scale, topWordCount))

    # Summary statistics followed by the per-topic word listing
    print("Prior %s" % (str(model.topicPrior)))
    print("Perplexity: %f\n\n" % word_perplexity(lda.log_likelihood, model, query, data))
    print("")
    printWordDists(K, lda.wordDists(model), wordDict)
def testPerplexityOnRealDataWithLdaInc(self):
    '''
    Trains LDA on the full ACL dataset for a range of topic counts, prints
    the training-set perplexity for each, and plots perplexity against
    topic count.
    '''
    dtype = np.float64  # DTYPE

    # Fixed seed so the shuffle and random initialisation are repeatable
    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    with open(AclDictPath, "rb") as dictFile:
        d = pkl.load(dictFile)
    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Reciprocal of (1 + corpus-wide word count). Neither this nor the
    # dictionary loaded above is used below: the per-topic word listing
    # that needed them is currently disabled.
    freq = np.squeeze(np.asarray(data.words.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    topicCounts = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    perps = []
    for K in topicCounts:
        # A fresh random model and training plan for every topic count
        model = lda.newModelAtRandom(data, K, dtype=dtype)
        queryState = lda.newQueryState(data, model)
        trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)

        model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, queryState, trainPlan)

        # Perplexity of the training data under the fitted model
        perp = perplexity_from_like(lda.log_likelihood(data, model, query), data.word_count)
        perps.append(perp)

        print ("K = %2d : Perplexity = %f\n\n" % (K, perp))

    # Perplexity as a function of the number of topics
    fig, axis = plt.subplots()
    axis.plot(topicCounts, perps, 'b-')
    axis.set_xlabel('Topic Count')
    axis.set_ylabel('Perplexity', color='b')

    fig.show()
    plt.show()
def newQueryState(data, model, withLdaTopics=None):
    '''
    Builds a fresh QueryState for the given dataset and model.

    Param:
    :param data: the dataset; only data.doc_count is read directly here,
                 the dataset itself is handed to lda.newQueryState()
    :param model: the model state object; its ldaModel and K fields are read
    :param withLdaTopics: an existing LDA query state to embed. When None,
                          a new one is created with lda.newQueryState()

    Return:
    A QueryState wrapping the LDA query state and an all-zero offsets
    matrix with one row per document and one column per topic.
    '''
    ldaTopics = withLdaTopics
    if ldaTopics is None:
        ldaTopics = lda.newQueryState(data, model.ldaModel)

    zeroOffsets = np.zeros((data.doc_count, model.K))
    return QueryState(ldaTopics, zeroOffsets)
def train (data, model, query, trainPlan, isQuery=False):
    '''
    Infers LDA topic distributions for the documents, then alternately
    fits a global scale and a per-document offset vector against the
    link structure until the offsets stop changing.

    Params:
    :param data: the dataset; data.doc_count and data.links are read here,
                 and the dataset is passed on to the LDA routines
    :param model: the model state; its ldaModel, noiseVar, predVar, scale
                  and dtype fields are read
    :param query: the query state; its ldaQuery field is used. The offsets
                  are always re-initialised from the topic distributions,
                  so any offsets stored in the query are ignored
    :param trainPlan: supplies ldaPlan (forwarded to the LDA routines),
                      iterations (maximum number of passes) and epsilon
                      (convergence threshold, defaulting to 0.01 * D * K
                      when None)
    :param isQuery: if True topics are inferred with lda.query() on a new
                    LDA query state; otherwise lda.train() is run, but only
                    if ldaModel.processed is falsy

    Return:
    A new ModelState and a new QueryState object with the learnt scale and
    offsets, and a placeholder tuple ([0], [0], [0]) standing in for the
    iteration, bound and likelihood measurements, which are not recorded.
    '''
    ldaPlan, iterations, epsilon = \
        trainPlan.ldaPlan, trainPlan.iterations, trainPlan.epsilon
    ldaModel, noiseVar, predVar, scale, dtype = \
        model.ldaModel, model.noiseVar, model.predVar, model.scale, model.dtype
    ldaQuery = query.ldaQuery

    D, K = data.doc_count, ldaModel.K
    epsilon = 0.01 * D * K if epsilon is None else epsilon
    tau = [predVar[0], predVar[1]]

    # Step 1: Learn the topics using vanilla LDA
    print (time.strftime('%X') + " Beginning Topic Inference")
    if isQuery:
        _, ldaQuery = lda.query(data, ldaModel, lda.newQueryState(data, ldaModel), ldaPlan)
    elif not ldaModel.processed:
        ldaModel, ldaQuery, (_, _, _) = lda.train(data, ldaModel, ldaQuery, ldaPlan)
    print (time.strftime('%X') + " Topic Inference Completed")

    tops = lda.topicDists(ldaQuery)
    offs = tops.copy()
    topsSum = tops.T.dot(tops)

    # Step 2: reverse the links matrix so we can talk about the origin (not target) of links
    inlinks = data.links.T.tocsr()

    # The indices stored in each row of the transposed matrix never change
    # during fitting, so slice the sparse matrix once per document here
    # rather than several times per document in every iteration.
    linkOrigins = [inlinks[p,:].indices for p in range(D)]

    # Step 3: Learn the scaling factor and offsets for each link's target-doc till converged
    print ("Learning Offsets")
    for itr in range(iterations):
        print ("Iteration " + str(itr), end=": ")

        # Record the current scale of the offsets
        before = la.norm(offs / scale)

        # Update the scale
        lhs, rhs = 0, 0
        for p in range(D):
            # Product of each linking document's topics with this document's offsets
            linkedProj = tops[linkOrigins[p],:].dot(offs[p,:])

            lhs += (tau[1] - tau[0]) * (linkedProj ** 2).sum()
            lhs += tau[0] * (offs[p,:].dot(topsSum).dot(offs[p,:]) - offs[p,:].dot(np.outer(tops[p,:],tops[p,:])).dot(offs[p,:]))
            rhs += tau[1] * linkedProj.sum()

        scale = rhs / lhs

        # Update the offset for every target doc
        for p in range(D):
            linkedTops = tops[linkOrigins[p],:]

            lhs  = (tau[1] - tau[0]) * np.einsum("dj,k->jk", linkedTops, tops[p,:])
            lhs += tau[0] * (np.einsum("dj,k->jk", tops, tops[p,:]) - np.outer(tops[p,:], tops[p,:]))
            lhs *= (scale * scale)
            lhs[np.diag_indices_from(lhs)] += noiseVar

            rhs = tops[p,:] + scale * tau[1] * linkedTops.sum(axis=0)

            offs[p,:] = la.inv(lhs).dot(rhs)

        # Check has the offsets changed significantly
        after = la.norm(offs / scale)
        print ("%f --> %f. scale=%f" % (before, after, scale))
        if abs(before - after) < epsilon:
            break

    return ModelState(ldaModel, K, noiseVar, predVar, scale, dtype, MODEL_NAME), \
           QueryState(ldaQuery, offs), \
           ([0], [0], [0])