def newModelAtRandom(data, P, K, featVar, latFeatVar, vocabPrior=VocabPrior, dtype=DTYPE):
    '''
    Creates a new CtmModelState for the given training set and the given number
    of topics. Everything is instantiated purely at random. This contains all
    parameters independent of the dataset (e.g. learnt priors)

    Param:
    data       - the dataset of words, features and links, of which only words
                 and features are used in this model
    P          - the size of the latent feature-space, P << F
    K          - the number of topics
    featVar    - the prior variance of the feature-space: a scalar used to
                 scale an identity matrix
    latFeatVar - the prior variance of the latent feature-space: a scalar used
                 to scale an identity matrix

    Return:
    A ModelState object
    '''
    assert K > 1, "There must be at least two topics"

    base = ctm.newModelAtRandom(data, K, vocabPrior, dtype)
    _, F = data.feats.shape

    # Low-rank factorisation of the topic-feature matrix: A = Y.dot(V)
    Y = rd.random((K, P)).astype(dtype)
    R_Y = latFeatVar * np.eye(P, P, dtype=dtype)

    V = rd.random((P, F)).astype(dtype)

    A = Y.dot(V)
    R_A = featVar * np.eye(F, F, dtype=dtype)

    return ModelState(F, P, K, A, R_A, featVar, Y, R_Y, latFeatVar, V, base.sigT, base.vocab, base.vocabPrior, dtype, MODEL_NAME)
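# A minimal sketch (plain NumPy, hypothetical sizes) of the low-rank
# factorisation instantiated above: the K x F topic-feature matrix A is the
# product of a K x P matrix Y and a P x F matrix V, so only (K + F) * P
# parameters are stored rather than K * F. Illustration of the shape logic
# only; not part of the project's API.
def _sketch_low_rank_factorisation(K=7, P=5, F=100):
    import numpy.random as rd
    Y = rd.random((K, P))   # topic loadings on the latent feature space
    V = rd.random((P, F))   # mapping from latent features to observed features
    A = Y.dot(V)            # implied K x F topic-feature matrix, rank <= P
    assert A.shape == (K, F)
    return (K + F) * P, K * F   # stored parameters vs. dense parameters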
def testOnRealData(self):
    print("CTM/Bouchard")
    rd.seed(0xBADB055)
    path = "/Users/bryanfeeney/Desktop/NIPS"
    with open(path + "/ar.pkl", 'rb') as f:
        _, W, _, d = pkl.load(f)
    if len(d) == 1:
        d = d[0]

    if W.dtype != DTYPE:
        W = W.astype(DTYPE)
    docLens = np.squeeze(np.asarray(W.sum(axis=1)))

    good_rows = (np.where(docLens > 0.5))[0]
    if len(good_rows) < W.shape[0]:
        print("Some rows in the doc-term matrix are empty. These have been removed.")
    W = W[good_rows, :]

    # Inverse-frequency weights, used to down-weight globally common words
    # when we print out the vocab later
    freq = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    # Initialise the model
    K = 20
    model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
    queryState = ctm.newQueryState(W, model)
    trainPlan  = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=True)

    # Train the model, and then immediately save the result to a file for
    # subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = ctm.train(W, None, model, queryState, trainPlan)
    with open(newModelFileFromModel(model), "wb") as f:
        pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound during training. (The title must be set
    # before the figure is shown for it to render.)
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    fig.suptitle("CTM/Bouchard (Identity Cov) on NIPS")
    fig.show()
    plt.show()

    plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Print out the most likely topic words
    topWordCount = 100
    kTopWordInds = [self.topWordInds(d, model.vocab[k, :] * scale, topWordCount) \
                    for k in range(K)]

    print("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))
    print("\t\t".join("Topic " + str(k) for k in range(K)))
    print("\n".join("\t".join(d[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]] for k in range(K)) for c in range(topWordCount)))
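# The top-word ranking above multiplies each topic's vocabulary distribution
# by an inverse-frequency weight so that globally common words do not dominate
# every topic's word list. A minimal sketch of that idea (plain NumPy; the
# names are illustrative, not the test's helpers):
def _sketch_inverse_frequency_ranking(n_topics=3, n_words=50, top_n=5):
    import numpy as np
    import numpy.random as rd
    vocab = rd.dirichlet(np.ones(n_words), size=n_topics)  # topic-word dists
    freq  = 1000 * vocab.sum(axis=0)                       # stand-in corpus counts
    scale = np.reciprocal(1.0 + freq)                      # down-weight common words
    # Indices of the highest-scoring words per topic after re-weighting
    return [np.argsort(vocab[k] * scale)[::-1][:top_n] for k in range(n_topics)]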
def _testOnModelDerivedExample(self):
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055) # Global init for repeatable test
    D, T, K = 1000, 100, 7 # Document count, vocabulary size ("term count") and topic count
    tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

    W = W.astype(DTYPE)

    plt.imshow(vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Create the cross-validation folds
    folds     = 5
    foldSize  = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize

    for useDiagonalPriorCov in [False, True]:
        trainLikely = []
        trainWordCount = []
        queryLikely = []
        queryWordCount = []

        for fold in range(folds):
            # Split the datasets
            start = fold * foldSize
            end   = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            W_train = W[trainSet, :]
            W_query = W[querySet, :]

            # Train the model
            model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
            queryState = ctm.newQueryState(W_train, model)

            plan = ctm.newTrainPlan(iterations=40, logFrequency=1, fastButInaccurate=useDiagonalPriorCov, debug=True)
            model, queryState, (bndItrs, bndVals, likelies) = ctm.train(W_train, None, model, queryState, plan)

            # Plot the evolution of the bound during training.
            fig, ax1 = plt.subplots()
            ax1.plot(bndItrs, bndVals, 'b-')
            ax1.set_xlabel('Iterations')
            ax1.set_ylabel('Bound', color='b')

            ax2 = ax1.twinx()
            ax2.plot(bndItrs, likelies, 'r-')
            ax2.set_ylabel('Likelihood', color='r')
            fig.show()

            # Plot the topic covariance
            self._plotCov(model)

            # Plot the vocab
            plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
            plt.show()

            # Calculate the training-set likelihood
            trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
            trainWordCount.append(W_train.data.sum())

            # Now query the model.
            plan = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
            queryState = ctm.newQueryState(W_query, model)
            model, queryState = ctm.query(W_query, None, model, queryState, plan)

            queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
            queryWordCount.append(W_query.data.sum())

        # Print out the likelihood and perplexity for each fold.
        print("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
        for fold in range(folds):
            trainPerp = np.exp(-trainLikely[fold] / trainWordCount[fold])
            queryPerp = np.exp(-queryLikely[fold] / queryWordCount[fold])

            print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
            print("          Perplexity:           %12.2f \t Perplexity:           %12.2f" % (trainPerp, queryPerp))

            self.assertTrue(queryPerp < 60.0) # Maximum allowed perplexity.
            self.assertTrue(trainPerp < 60.0)
    print("\n\n")

    print("End of Test")
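# The fold split above uses modular arithmetic so indices wrap past the end of
# the corpus: each fold trains on a contiguous (circular) block of trainSize
# documents and queries the next querySize documents, and perplexity is the
# exponential of the negative per-word log-likelihood. A small sketch of both,
# under those assumptions:
def _sketch_circular_folds(D=10, folds=5):
    import numpy as np
    from math import ceil, exp
    foldSize  = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize
    splits = []
    for fold in range(folds):
        start = fold * foldSize
        trainSet = np.arange(start, start + trainSize) % D   # wraps back to 0
        querySet = np.arange(start + trainSize, start + trainSize + querySize) % D
        splits.append((trainSet, querySet))
    # Per-word perplexity from a corpus log-likelihood, e.g. -5000 over 1500 tokens:
    perplexity = exp(-(-5000.0) / 1500)
    return splits, perplexity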
def _testOnModelHandcraftedData(self):
    #
    # Create the vocab
    #
    T = 3 * 3
    K = 5

    # Horizontal bars
    vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3, 3)).todense()
    #vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3, 3)).todense()
    vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])), shape=(3, 3)).todense()

    # Vertical bars
    vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])), shape=(3, 3)).todense()
    #vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3, 3)).todense()
    vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])), shape=(3, 3)).todense()

    # Diagonals
    vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])), shape=(3, 3)).todense()
    #vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3, 3)).todense()

    # Put together
    T = vocab1.shape[0] * vocab1.shape[1]
    vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7]

    # Create a single matrix with the flattened vocabularies
    vocabVectors = []
    for vocab in vocabs:
        vocabVectors.append(np.squeeze(np.asarray(vocab.reshape((1, T)))))

    vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE))

    # Plot the vocab (subplot indices are 1-based)
    ones = np.ones(vocabs[0].shape)
    for k in range(K):
        plt.subplot(2, 3, k + 1)
        plt.imshow(ones - vocabs[k], interpolation="none", cmap=cm.Greys_r)
    plt.show()

    #
    # Create the corpus
    #
    rd.seed(0xC0FFEE)
    D = 1000

    # Make sense (of a sort) of this by assuming that these correspond to
    # Kittens  Omelettes  Puppies  Oranges  Tomatoes  Dutch People  Basketball  Football
    #topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25])
    # topicCovar = np.array(\
    #     [[ 100,    5,   55,   20,    5,   15,    4,    0], \
    #      [   5,  100,    5,   10,   70,    5,    0,    0], \
    #      [  55,    5,  100,    5,    5,   10,    0,    5], \
    #      [  20,   10,    5,  100,   30,   30,   20,   10], \
    #      [   5,   70,    5,   30,  100,    0,    0,    0], \
    #      [  15,    5,   10,   30,    0,  100,   10,   40], \
    #      [   4,    0,    0,   20,    0,   10,  100,   20], \
    #      [   0,    0,    5,   10,    0,   40,   20,  100]], dtype=DTYPE) / 100.0

    topicMean = np.array([25, 15, 40, 5, 15])
    self.assertEqual(100, topicMean.sum())
    topicCovar = np.array(\
        [[ 100,    5,   55,   20,    5 ], \
         [   5,  100,    5,   10,   70 ], \
         [  55,    5,  100,    5,    5 ], \
         [  20,   10,    5,  100,   30 ], \
         [   5,   70,    5,   30,  100 ], \
        ], dtype=DTYPE) / 100.0

    meanWordCount = 80
    wordCounts = rd.poisson(meanWordCount, size=D)
    topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D)
    W = topicDists.dot(vocab) * wordCounts[:, np.newaxis]
    W = ssp.csr_matrix(W.astype(DTYPE))

    #
    # Train the model
    #
    model = ctm.newModelAtRandom(W, K, dtype=DTYPE)
    queryState = ctm.newQueryState(W, model)
    trainPlan = ctm.newTrainPlan(iterations=65, logFrequency=1)

    self.assertTrue(0.99 < np.sum(model.topicMean) < 1.01)

    return self._doTest(W, model, queryState, trainPlan)
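# The "bars" vocabulary above is the classic toy construction for topic-model
# tests: each topic is a horizontal, vertical, or diagonal bar on a 3x3 grid,
# flattened into a length-9 word distribution, so recovered topics can be
# checked by eye. A self-contained sketch of the same idea in dense NumPy
# (illustrative, not the test's exact construction):
def _sketch_bar_vocab(grid=3):
    import numpy as np
    topics = []
    for i in range(grid):
        horiz = np.zeros((grid, grid))
        horiz[i, :] = 1.0                 # horizontal bar on row i
        topics.append(horiz.ravel())
        vert = np.zeros((grid, grid))
        vert[:, i] = 1.0                  # vertical bar on column i
        topics.append(vert.ravel())
    vocab = np.array(topics)
    return vocab / vocab.sum(axis=1, keepdims=True)   # rows are distributions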
def run(args):
    '''
    Parses the command-line arguments (excluding the application name portion).
    Executes a cross-validation run accordingly, saving the output at the end
    of each run.

    Returns the list of files created.
    '''
    #
    # Enumerate all possible arguments
    #
    parser = ap.ArgumentParser(description='Execute a topic-modeling run.')
    parser.add_argument('--model', '-m', dest='model', metavar=' ', \
                    help='The type of model to use, options are ' + ModelNames)
    parser.add_argument('--num-topics', '-k', dest='K', type=int, metavar=' ', \
                    help='The number of topics to fit')
    parser.add_argument('--num-lat-topics', '-q', dest='Q', type=int, metavar=' ', \
                    help='The number of latent topics (i.e. rank of the topic covariance matrix)')
    parser.add_argument('--num-lat-feats', '-p', dest='P', type=int, metavar=' ', \
                    help='The number of latent features (i.e. rank of the features covariance matrix)')
    parser.add_argument('--words', '-w', dest='words', metavar=' ', \
                    help='The path to the pickle file containing a DxT array or matrix of the word-counts across all D documents')
    parser.add_argument('--feats', '-x', dest='feats', metavar=' ', \
                    help='The path to the pickle file containing a DxF array or matrix of the features across all D documents')
    parser.add_argument('--links', '-c', dest='links', metavar=' ', \
                    help='The path to the pickle file containing a DxP array or matrix of the links (citations) emanating from all D documents')
    parser.add_argument('--eval', '-v', dest='eval', default=Perplexity, metavar=' ', \
                    help='Evaluation metric, available options are: ' + ','.join(EvalNames))
    parser.add_argument('--out-model', '-o', dest='out_model', default=None, metavar=' ', \
                    help='Optional output path in which to store the model')
    parser.add_argument('--log-freq', '-l', dest='log_freq', type=int, default=10, metavar=' ', \
                    help='Log frequency - how many times to inspect the bound while running')
    parser.add_argument('--iters', '-i', dest='iters', type=int, default=500, metavar=' ', \
                    help='The maximum number of iterations to run when training')
    parser.add_argument('--query-iters', '-j', dest='query_iters', type=int, default=100, metavar=' ', \
                    help='The maximum number of iterations to run when querying, by default the same as when training')
    parser.add_argument('--min-vb-change', '-e', dest='min_vb_change', type=float, default=1, metavar=' ', \
                    help='The amount by which the variational bound must change at each log-interval to avoid inference being stopped early.')
    parser.add_argument('--topic-var', dest='topic_var', type=float, default=DefaultPriorCov, metavar=' ', \
                    help="Scale of the prior isotropic variance over topics")
    parser.add_argument('--feat-var', dest='feat_var', type=float, default=DefaultPriorCov, metavar=' ', \
                    help="Scale of the prior isotropic variance over features")
    parser.add_argument('--lat-topic-var', dest='lat_topic_var', type=float, default=DefaultPriorCov, metavar=' ', \
                    help="Scale of the prior isotropic variance over latent topics")
    parser.add_argument('--lat-feat-var', dest='lat_feat_var', type=float, default=DefaultPriorCov, metavar=' ', \
                    help="Scale of the prior isotropic variance over latent features")
    parser.add_argument('--vocab-prior', dest='vocabPrior', type=float, default=1.1, metavar=' ', \
                    help="Symmetric prior over the vocabulary")
    parser.add_argument('--folds', '-f', dest='folds', type=int, default=1, metavar=' ', \
                    help="Number of cross-validation folds.")
    parser.add_argument('--truncate-folds', dest='eval_fold_count', type=int, default=-1, metavar=' ', \
                    help="If set, stop running after the given number of folds have been processed")
    # Note: type=bool is an argparse footgun (bool('False') is True), so a
    # store_true flag is used instead.
    parser.add_argument('--debug', '-b', dest='debug', action='store_true', default=False, \
                    help="Display a debug message, with the bound, after every variable update")
    parser.add_argument('--dtype', '-t', dest='dtype', default="f4:f4", metavar=' ', \
                    help="Datatype to use, values are i4, f4 and f8. Specify two, a data dtype and a model dtype, delimited by a colon")
    parser.add_argument('--limit-to', dest='limit', type=int, default=0, metavar=' ', \
                    help="If set, discard all but the initial given number of rows of the input dataset")
    parser.add_argument('--word-dict', dest='word_dict', default=None, metavar=' ', \
                    help='A dictionary of all words. Used to identify hashtag indices')
    parser.add_argument('--lda-model', dest='ldaModel', default=None, metavar=' ', \
                    help='A trained LDA model, used with the LRO model')
    parser.add_argument('--feats-mask', dest='features_mask_str', default=None, metavar=' ', \
                    help='Feature mask to use with FeatSplit runs, a comma-delimited list of colon-delimited pairs')

    #
    # Initialization of the app: first parse the arguments
    #
    print("Random seed is 0xC0FFEE")
    rd.seed(0xC0FFEE)

    print("Args are : " + str(args))
    args = parser.parse_args(args)
    K, P, Q = args.K, args.P, args.Q

    features_mask = parse_features_mask(args)
    (input_dtype, output_dtype) = parse_dtypes(args.dtype)
    fv, tv, lfv, ltv = args.feat_var, args.topic_var, args.lat_feat_var, args.lat_topic_var

    #
    # Load and prune the data
    #
    data = DataSet.from_files(args.words, args.feats, args.links, limit=args.limit)
    data.convert_to_dtype(input_dtype)
    data.prune_and_shuffle(min_doc_len=3, min_link_count=MinLinkCountPrune)

    print("The combined word-count of the %d documents is %.0f, drawn from a vocabulary of %d distinct terms" % (data.doc_count, data.word_count, data.words.shape[1]))
    if data.add_intercept_to_feats_if_required():
        print("Appended an intercept to the given features")

    #
    # Instantiate and configure the model
    #
    if args.ldaModel is not None:
        ldaModel, ldaTopics = load_and_adapt_lda_model(args.ldaModel, data.order)
    else:
        ldaModel, ldaTopics = None, None

    print("Building template model... ", end="")
    if args.model == CtmBouchard:
        import model.ctm as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == CtmBohning:
        import model.ctm_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBouchard:
        import model.stm_yv as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBohning:
        import model.stm_yv_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBohningFakeOnline:
        import model.stm_yv_bohning_fake_online as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmUyvBohning:
        import model.stm_uv_vec_y_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, K=K, Q=Q, P=P, tv=tv, ltv=ltv, fv=fv, lfv=lfv, vocabPrior=args.vocabPrior, dtype=output_dtype)
    elif args.model == LdaCvbZero:
        import model.lda_cvb as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == LdaVb:
        import model.lda_vb_python as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == LdaGibbs:
        import model.lda_gibbs as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Rtm:
        import model.rtm as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Mtm:
        import model.mtm2 as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Mtm2:
        import model.mtm3 as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Dmr:
        import model.dmr as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Lro:
        import model.lro_vb as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == SimLda:
        import model.sim_based_rec as mdl
        templateModel = mdl.newModelAtRandom(data, K, method=mdl.LDA, dtype=output_dtype)
    elif args.model == SimTfIdf:
        import model.sim_based_rec as mdl
        templateModel = mdl.newModelAtRandom(data, K, method=mdl.TF_IDF, dtype=output_dtype)
    else:
        raise ValueError("Unknown model identifier " + args.model)
    print("Done")

    trainPlan = mdl.newTrainPlan(args.iters, debug=args.debug)
    queryPlan = mdl.newTrainPlan(args.query_iters, debug=args.debug)

    # Every evaluation branch returns the list of files it creates; the final
    # else guarantees the function never falls through without returning.
    if args.eval == Perplexity:
        return cross_val_and_eval_perplexity(data, mdl, templateModel, trainPlan, queryPlan, args.folds, args.eval_fold_count, args.out_model)
    elif args.eval == HashtagPrecAtM:
        return cross_val_and_eval_hashtag_prec_at_m(data, mdl, templateModel, trainPlan, load_dict(args.word_dict), args.folds, args.eval_fold_count, args.out_model)
    elif args.eval == MeanAveragePrecAllDocs:
        return link_split_map(data, mdl, templateModel, trainPlan, args.folds, args.out_model)
    elif args.eval == MeanPrecRecAtMAllDocs:
        return link_split_prec_rec(data, mdl, templateModel, trainPlan, args.folds, args.eval_fold_count, args.out_model, ldaModel, ldaTopics)
    elif args.eval == LroMeanPrecRecAtMAllDocs:
        return insample_lro_style_prec_rec(data, mdl, templateModel, trainPlan, args.folds, args.eval_fold_count, args.out_model, ldaModel, ldaTopics)
    elif args.eval == LroMeanPrecRecAtMFeatSplit:
        return outsample_lro_style_prec_rec(data, mdl, templateModel, trainPlan, features_mask, args.out_model, ldaModel, ldaTopics)
    else:
        raise ValueError("Unknown evaluation metric " + args.eval)
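# A hypothetical invocation of run(): train a 20-topic model for up to 500
# iterations and score it by cross-validated perplexity over 5 folds. The
# model/eval identifiers and the pickle path below are placeholders, not the
# project's actual constants or shipped files; substitute your own.
if __name__ == '__main__':
    files_created = run([
        "--model",      "ctm",          # placeholder for the CtmBouchard name
        "--num-topics", "20",
        "--words",      "words.pkl",    # placeholder path to a DxT count matrix
        "--eval",       "perplexity",   # placeholder for the Perplexity name
        "--iters",      "500",
        "--folds",      "5",
    ])
    print(files_created)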