def newModelAtRandom(data, K, noiseVar=9, predVar=None, topicPrior=None, vocabPrior=lda.VocabPrior, ldaModel=None, dtype=DTYPE):
    '''
    Creates a new LRO ModelState for the given training set and the given
    number of topics. Unless an existing LDA model is passed in via ldaModel,
    a new one is created with lda.newModelAtRandom() and wrapped in the
    returned state.

    :param data: the DataSet, must contain words and links. The link matrix
           must be square with as many rows as the word matrix.
    :param K: the number of topics, which must satisfy 1 < K < 255
    :param noiseVar: the noise variance determining offset size
    :param predVar: the variance around predictions, a two element vector,
           the first being the prediction noise when links are not observed,
           the second when links are observed. Defaults to [0.01, 1].
    :param topicPrior: the prior over topics, either a scalar or a
           K-dimensional vector. Only consulted when ldaModel is None.
    :param vocabPrior: the prior over vocabs, either a scalar or a
           T-dimensional vector. Only consulted when ldaModel is None.
    :param ldaModel: an already constructed LDA model to wrap; when None a
           new one is created from the data.
    :param dtype: the datatype to be used throughout.

    Return:
    A ModelState object
    '''
    assert K > 1, "There must be at least two topics"
    assert K < 255, "There can be no more than 255 topics"

    wordRows, _ = data.words.shape
    linkRows, linkCols = data.links.shape
    assert wordRows == linkRows == linkCols, \
        "Link matrix must be square and have same row-count as word-matrix"

    if ldaModel is None:
        ldaModel = lda.newModelAtRandom(data, K, topicPrior, vocabPrior, dtype)

    predVar = np.array([0.01, 1]) if predVar is None else predVar
    assert len(predVar) == 2

    # A freshly created model always starts with a scale of one.
    return ModelState(ldaModel, K, noiseVar, predVar, 1, dtype, MODEL_NAME)
def newModelAtRandom(data, K, method=TF_IDF, topicPrior=None, vocabPrior=lda.VocabPrior, ldaModel=None, dtype=DTYPE):
    '''
    Creates a new ModelState for the given training set and the given number
    of topics. Unless an existing LDA model is passed in via ldaModel, a new
    one is created with lda.newModelAtRandom() and wrapped in the returned
    state.

    :param data: the DataSet, must contain words and links. The link matrix
           must be square with as many rows as the word matrix.
    :param K: the number of topics, which must satisfy 1 < K < 255
    :param method: the method by which the documents will be compared, either
           their LDA topic distribution (LDA) or their TF_IDF scores (TF_IDF)
    :param topicPrior: the prior over topics, either a scalar or a
           K-dimensional vector. Only consulted when ldaModel is None.
    :param vocabPrior: the prior over vocabs, either a scalar or a
           T-dimensional vector. Only consulted when ldaModel is None.
    :param ldaModel: an already constructed LDA model to wrap; when None a
           new one is created from the data.
    :param dtype: the datatype to be used throughout.

    Return:
    A ModelState object

    Raises:
    ValueError if method is neither TF_IDF nor LDA
    '''
    assert K > 1, "There must be at least two topics"
    assert K < 255, "There can be no more than 255 topics"

    D, _ = data.words.shape
    Q, P = data.links.shape
    assert D == Q and Q == P, "Link matrix must be square and have same row-count as word-matrix"

    # Resolve the model name before any LDA model is built, so that an
    # unsupported method is rejected without first constructing a model
    # that would only be thrown away.
    if method == TF_IDF:
        modelName = MODEL_NAME_PREFIX + TF_IDF
    elif method == LDA:
        modelName = MODEL_NAME_PREFIX + LDA
    else:
        raise ValueError("Incorrect method name")

    if ldaModel is None:
        ldaModel = lda.newModelAtRandom(data, K, topicPrior, vocabPrior, dtype)

    return ModelState(ldaModel, K, method, dtype, modelName)
def testOnRealData(self):
    '''
    Trains a 10-topic LDA model on the data set loaded from NipsWordsPath and
    NipsCitePath, plots the bound and likelihood recorded during training,
    displays the topic-word distributions as an image, and prints the topic
    prior, the word perplexity and the per-topic word distributions.

    This is an exploratory test: it makes no assertions and blocks on the
    plot windows until they are closed.
    '''
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=NipsWordsPath, links_file=NipsCitePath)
    with open(NipsDictPath, "rb") as f:
        d = pkl.load(f)

    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=50, min_link_count=0)

    # Initialise the model
    K = 10
    model = lda.newModelAtRandom(data, K, dtype=dtype)
    queryState = lda.newQueryState(data, model)
    trainPlan = lda.newTrainPlan(iterations=30, logFrequency=2, debug=False, batchSize=50, rate_retardation=1, forgetting_rate=0.75)

    # Train the model.
    # NOTE: saving the trained model to a file for later inspection is
    # currently disabled.
    model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, queryState, trainPlan)

    # Plot the evolution of the bound (left axis) and the likelihood
    # (right axis) during training.
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    fig.show()
    plt.show()

    # Show the topic-word distributions as a greyscale image
    vocab = lda.wordDists(model)
    plt.imshow(vocab, interpolation="nearest", cmap=cm.Greys_r)
    plt.show()

    # Print out the prior, the perplexity and the topic word distributions
    print("Prior %s" % (str(model.topicPrior)))
    print("Perplexity: %f\n\n" % word_perplexity(lda.log_likelihood, model, query, data))
    print("")
    printWordDists(K, lda.wordDists(model), d)
def testPerplexityOnRealDataWithLdaInc(self):
    '''
    For each topic count from 5 to 50 in steps of 5, trains an LDA model on
    the data set loaded from AclWordPath and AclCitePath, prints the
    perplexity computed on that same data, and finally plots perplexity
    against topic count.

    This is an exploratory test: it makes no assertions and blocks on the
    plot window until it is closed.
    '''
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)
    with open(AclDictPath, "rb") as dictFile:
        # The dictionary is loaded but not used further in this test.
        d = pkl.load(dictFile)

    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # IDF-style scaling derived from the per-word totals; currently only of
    # use to the (disabled) code that printed the top topic words.
    wordTotals = data.words.sum(axis=0)
    freq = np.squeeze(np.asarray(wordTotals))
    scale = np.reciprocal(1 + freq)

    topicCounts = list(range(5, 51, 5))
    perps = []
    for K in topicCounts:
        # Initialise a fresh model, query state and plan for this topic count
        model = lda.newModelAtRandom(data, K, dtype=dtype)
        queryState = lda.newQueryState(data, model)
        trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)

        # Train the model; only the model and the query state are needed here
        model, query, (bndItrs, bndVals, bndLikes) = lda.train(data, model, queryState, trainPlan)

        # Evaluate perplexity on the data the model was trained on
        logLike = lda.log_likelihood(data, model, query)
        perplexity = perplexity_from_like(logLike, data.word_count)
        perps.append(perplexity)

        print ("K = %2d : Perplexity = %f\n\n" % (K, perplexity))

    # Plot perplexity as a function of the number of topics.
    fig, ax1 = plt.subplots()
    ax1.plot(topicCounts, perps, 'b-')
    ax1.set_xlabel('Topic Count')
    ax1.set_ylabel('Perplexity', color='b')

    fig.show()
    plt.show()
def testCrossValPerplexityOnRealDataWithLdaInc(self):
    '''
    Cross-validated perplexity of LDA on the data set loaded from AclWordPath
    and AclCitePath.

    For each topic count, the first ActiveFolds folds of a NumFolds-way split
    are evaluated. On every fold a model is trained on the training part and
    its training perplexity recorded; the held-out part is then split by
    doc_completion_split(), the model is queried on the estimation half and
    the perplexity is measured on the evaluation half.

    One comma-separated line per segment (Train, Query) is printed for every
    topic count: the per-fold perplexities followed by their mean.
    '''
    ActiveFolds = 3
    dtype = np.float64  # DTYPE

    rd.seed(0xBADB055)
    data = DataSet.from_files(words_file=AclWordPath, links_file=AclCitePath)

    data.convert_to_dtype(dtype)
    data.prune_and_shuffle(min_doc_len=MinDocLen, min_link_count=MinLinkCount)

    # Plans shared by every topic count and fold
    trainPlan = lda.newTrainPlan(iterations=800, logFrequency=10, fastButInaccurate=False, debug=False)
    queryPlan = lda.newTrainPlan(iterations=50, logFrequency=5, fastButInaccurate=False, debug=False)

    topicCounts = [30, 35, 40, 45, 50]  # [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    for K in topicCounts:
        trainPerps = []
        queryPerps = []

        for fold in range(ActiveFolds):  # range(NumFolds):
            trainData, queryData = data.cross_valid_split(fold, NumFolds)

            # Fit on the training part of the fold
            model = lda.newModelAtRandom(trainData, K, dtype=dtype)
            trainQuery = lda.newQueryState(trainData, model)
            model, trainResult, (_, _, _) = lda.train(trainData, model, trainQuery, trainPlan)

            trainLike = lda.log_likelihood(trainData, model, trainResult)
            trainPerps.append(perplexity_from_like(trainLike, trainData.word_count))

            # Query on one half of the held-out part, score on the other half
            estData, evalData = queryData.doc_completion_split()
            heldOutQuery = lda.newQueryState(estData, model)
            model, queryResult = lda.query(estData, model, heldOutQuery, queryPlan)

            evalLike = lda.log_likelihood(evalData, model, queryResult)
            queryPerps.append(perplexity_from_like(evalLike, evalData.word_count))

        # Append the mean over the evaluated folds as the last column
        trainPerps.append(sum(trainPerps) / ActiveFolds)
        queryPerps.append(sum(queryPerps) / ActiveFolds)

        print("K=%d,Segment=Train,%s" % (K, ",".join(map(str, trainPerps))))
        print("K=%d,Segment=Query,%s" % (K, ",".join(map(str, queryPerps))))