def testOnRealData(self):
    print("CTM/Bouchard")
    rd.seed(0xBADB055)
    path = "/Users/bryanfeeney/Desktop/NIPS"
    with open(path + "/ar.pkl", 'rb') as f:
        _, W, _, d = pkl.load(f)
    if len(d) == 1:
        d = d[0]

    if W.dtype != DTYPE:
        W = W.astype(DTYPE)
    docLens = np.squeeze(np.asarray(W.sum(axis=1)))

    good_rows = (np.where(docLens > 0.5))[0]
    if len(good_rows) < W.shape[0]:
        print("Some rows in the doc-term matrix are empty. These have been removed.")
        W = W[good_rows, :]

    # IDF frequency, for when we print out the vocab later
    freq = np.squeeze(np.asarray(W.sum(axis=0)))
    scale = np.reciprocal(1 + freq)

    # Initialise the model
    K = 20
    model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
    queryState = ctm.newQueryState(W, model)
    trainPlan  = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=True)

    # Train the model, then immediately save the result to a file for subsequent inspection
    model, query, (bndItrs, bndVals, bndLikes) = ctm.train(W, None, model, queryState, trainPlan)
    with open(newModelFileFromModel(model), "wb") as f:
        pkl.dump((model, query, (bndItrs, bndVals, bndLikes)), f)

    # Plot the evolution of the bound during training
    fig, ax1 = plt.subplots()
    ax1.plot(bndItrs, bndVals, 'b-')
    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Bound', color='b')

    ax2 = ax1.twinx()
    ax2.plot(bndItrs, bndLikes, 'r-')
    ax2.set_ylabel('Likelihood', color='r')

    # Set the title before showing, so it actually appears on the figure
    fig.suptitle("CTM/Bouchard (Identity Cov) on NIPS")
    fig.show()
    plt.show()

    plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Print out the most likely topic words
    topWordCount = 100
    kTopWordInds = [self.topWordInds(d, model.vocab[k, :] * scale, topWordCount)
                    for k in range(K)]

    print("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))
    print("\t\t".join("Topic " + str(k) for k in range(K)))
    print("\n".join("\t".join(d[kTopWordInds[k][c]] + "\t%0.4f" % model.vocab[k][kTopWordInds[k][c]]
                              for k in range(K))
                    for c in range(topWordCount)))
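
# The call to self.topWordInds(...) above relies on a helper defined elsewhere
# in the test class. A minimal sketch of what such a method might look like
# (hypothetical: it is assumed here to simply return the indices of the
# `count` largest weights; the word-dictionary argument matches the call-site
# but goes unused in this sketch):
def topWordInds(self, wordDict, topicWeights, count=10):
    # np.argsort sorts ascending, so take the last `count` indices, reversed
    return np.argsort(topicWeights)[-count:][::-1]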
def newTrainPlan(iterations=100, epsilon=2, logFrequency=10, fastButInaccurate=False, debug=DEBUG):
    '''
    Create a training plan determining how many iterations we process,
    how often we plot the results, how often we log the variational bound, etc.
    '''
    base = ctm.newTrainPlan(iterations, epsilon, logFrequency, fastButInaccurate, debug)
    return TrainPlan(base.iterations, base.epsilon, base.logFrequency, base.fastButInaccurate, base.debug)
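
# A minimal usage sketch (hypothetical values): build a plan that runs for at
# most 200 iterations, inspecting the variational bound every 5 iterations,
# with epsilon assumed to be the bound-change threshold below which training
# stops early.
def _exampleTrainPlan():
    return newTrainPlan(iterations=200, epsilon=0.5, logFrequency=5)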
def _testOnModelDerivedExample(self):
    print("Cross-validated likelihoods on model-derived example")

    rd.seed(0xBADB055)       # Global init for repeatable test
    D, T, K = 1000, 100, 7   # Document count, vocabulary size ("term count") and topic count
    tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

    W = W.astype(DTYPE)

    plt.imshow(vocab, interpolation="none", cmap=cm.Greys_r)
    plt.show()

    # Create the cross-validation folds
    folds     = 5
    foldSize  = ceil(D / folds)
    querySize = foldSize
    trainSize = D - querySize

    for useDiagonalPriorCov in [False, True]:
        trainLikely    = []
        trainWordCount = []
        queryLikely    = []
        queryWordCount = []

        for fold in range(folds):
            # Split the datasets
            start = fold * foldSize
            end   = start + trainSize

            trainSet = np.arange(start, end) % D
            querySet = np.arange(end, end + querySize) % D

            W_train = W[trainSet, :]
            W_query = W[querySet, :]

            # Train the model
            model      = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
            queryState = ctm.newQueryState(W_train, model)
            plan       = ctm.newTrainPlan(iterations=40, logFrequency=1, fastButInaccurate=useDiagonalPriorCov, debug=True)

            model, queryState, (bndItrs, bndVals, likelies) = ctm.train(W_train, None, model, queryState, plan)

            # Plot the evolution of the bound during training
            fig, ax1 = plt.subplots()
            ax1.plot(bndItrs, bndVals, 'b-')
            ax1.set_xlabel('Iterations')
            ax1.set_ylabel('Bound', color='b')

            ax2 = ax1.twinx()
            ax2.plot(bndItrs, likelies, 'r-')
            ax2.set_ylabel('Likelihood', color='r')
            fig.show()

            # Plot the topic covariance
            self._plotCov(model)

            # Plot the vocab
            plt.imshow(model.vocab, interpolation="none", cmap=cm.Greys_r)
            plt.show()

            # Calculate the training-set likelihood
            trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
            trainWordCount.append(W_train.data.sum())

            # Now query the model
            plan       = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
            queryState = ctm.newQueryState(W_query, model)
            model, queryState = ctm.query(W_query, None, model, queryState, plan)

            queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
            queryWordCount.append(W_query.data.sum())

        # Print out the likelihood and perplexity for each fold
        print("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
        for fold in range(folds):
            trainPerp = np.exp(-trainLikely[fold] / trainWordCount[fold])
            queryPerp = np.exp(-queryLikely[fold] / queryWordCount[fold])

            print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
            print("          Perplexity:           %12.2f \t Perplexity:           %12.2f" % (trainPerp, queryPerp))

            self.assertTrue(queryPerp < 60.0)   # Maximum acceptable perplexity
            self.assertTrue(trainPerp < 60.0)

    print("\n\n")
    print("End of Test")
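
# The perplexity figures printed above follow the standard definition: the
# exponentiated negative per-word log-likelihood, exp(-L / N). A sketch of
# that computation as a standalone helper (hypothetical; the test inlines it):
def _perplexity(logLikelihood, wordCount):
    # Lower is better; a model that is uniform over T terms scores exactly T
    return np.exp(-logLikelihood / wordCount)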
def _testOnModelHandcraftedData(self):
    #
    # Create the vocab
    #
    T = 3 * 3
    K = 5

    # Horizontal bars
    vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3, 3)).todense()
    #vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3, 3)).todense()
    vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])), shape=(3, 3)).todense()

    # Vertical bars
    vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])), shape=(3, 3)).todense()
    #vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3, 3)).todense()
    vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])), shape=(3, 3)).todense()

    # Diagonals
    vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])), shape=(3, 3)).todense()
    #vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3, 3)).todense()

    # Put together
    T = vocab1.shape[0] * vocab1.shape[1]
    vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7]

    # Create a single matrix with the flattened vocabularies
    vocabVectors = []
    for vocab in vocabs:
        vocabVectors.append(np.squeeze(np.asarray(vocab.reshape((1, T)))))
    vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE))

    # Plot the vocab
    ones = np.ones(vocabs[0].shape)
    for k in range(K):
        plt.subplot(2, 3, k + 1)   # subplot indices are one-based
        plt.imshow(ones - vocabs[k], interpolation="none", cmap=cm.Greys_r)
    plt.show()

    #
    # Create the corpus
    #
    rd.seed(0xC0FFEE)
    D = 1000

    # Make sense (of a sort) of this by assuming that these correspond to
    # Kittens  Omelettes  Puppies  Oranges  Tomatoes  Dutch People  Basketball  Football
    #topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25])
    #topicCovar = np.array(
    #    [[ 100,    5,   55,   20,    5,   15,    4,    0],
    #     [   5,  100,    5,   10,   70,    5,    0,    0],
    #     [  55,    5,  100,    5,    5,   10,    0,    5],
    #     [  20,   10,    5,  100,   30,   30,   20,   10],
    #     [   5,   70,    5,   30,  100,    0,    0,    0],
    #     [  15,    5,   10,   30,    0,  100,   10,   40],
    #     [   4,    0,    0,   20,    0,   10,  100,   20],
    #     [   0,    0,    5,   10,    0,   40,   20,  100]], dtype=DTYPE) / 100.0

    topicMean = np.array([25, 15, 40, 5, 15])
    self.assertEqual(100, topicMean.sum())
    topicCovar = np.array(
        [[ 100,    5,   55,   20,    5 ],
         [   5,  100,    5,   10,   70 ],
         [  55,    5,  100,    5,    5 ],
         [  20,   10,    5,  100,   30 ],
         [   5,   70,    5,   30,  100 ]], dtype=DTYPE) / 100.0

    meanWordCount = 80
    wordCounts = rd.poisson(meanWordCount, size=D)
    topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D)
    W = topicDists.dot(vocab) * wordCounts[:, np.newaxis]
    W = ssp.csr_matrix(W.astype(DTYPE))

    #
    # Train the model
    #
    model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
    queryState = ctm.newQueryState(W, model)
    trainPlan  = ctm.newTrainPlan(iterations=65, logFrequency=1)

    self.assertTrue(0.99 < np.sum(model.topicMean) < 1.01)

    return self._doTest(W, model, queryState, trainPlan)
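
# Note: the handcrafted corpus above multiplies the raw Gaussian draws by the
# vocabulary, so the "topic proportions" are unnormalised and may even be
# negative. In the CTM generative story proper, the Gaussian draws would pass
# through a softmax to yield logistic-normal topic proportions. A sketch of
# that step (an assumption about the intended generative process, not code
# used by the test):
def _logisticNormalProportions(topicMean, topicCovar, size):
    eta = rd.multivariate_normal(topicMean, topicCovar, size=size)
    eta -= eta.max(axis=1, keepdims=True)            # subtract row-max for numerical stability
    theta = np.exp(eta)
    return theta / theta.sum(axis=1, keepdims=True)  # each row now sums to one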
def run(args):
    '''
    Parses the command-line arguments (excluding the application name portion).
    Executes a cross-validation run accordingly, saving the output at the end
    of each run.

    Returns the list of files created.
    '''
    #
    # Enumerate all possible arguments
    #
    parser = ap.ArgumentParser(description='Execute a topic-modeling run.')
    parser.add_argument('--model', '-m', dest='model', metavar=' ',
                        help='The type of model to use; options are ' + ModelNames)
    parser.add_argument('--num-topics', '-k', dest='K', type=int, metavar=' ',
                        help='The number of topics to fit')
    parser.add_argument('--num-lat-topics', '-q', dest='Q', type=int, metavar=' ',
                        help='The number of latent topics (i.e. the rank of the topic covariance matrix)')
    parser.add_argument('--num-lat-feats', '-p', dest='P', type=int, metavar=' ',
                        help='The number of latent features (i.e. the rank of the feature covariance matrix)')
    parser.add_argument('--words', '-w', dest='words', metavar=' ',
                        help='The path to the pickle file containing a DxT array or matrix of the word-counts across all D documents')
    parser.add_argument('--feats', '-x', dest='feats', metavar=' ',
                        help='The path to the pickle file containing a DxF array or matrix of the features across all D documents')
    parser.add_argument('--links', '-c', dest='links', metavar=' ',
                        help='The path to the pickle file containing a DxP array or matrix of the links (citations) emanating from all D documents')
    parser.add_argument('--eval', '-v', dest='eval', default=Perplexity, metavar=' ',
                        help='Evaluation metric; available options are: ' + ','.join(EvalNames))
    parser.add_argument('--out-model', '-o', dest='out_model', default=None, metavar=' ',
                        help='Optional output path in which to store the model')
    parser.add_argument('--log-freq', '-l', dest='log_freq', type=int, default=10, metavar=' ',
                        help='Log frequency - how many times to inspect the bound while running')
    parser.add_argument('--iters', '-i', dest='iters', type=int, default=500, metavar=' ',
                        help='The maximum number of iterations to run when training')
    parser.add_argument('--query-iters', '-j', dest='query_iters', type=int, default=100, metavar=' ',
                        help='The maximum number of iterations to run when querying; by default the same as when training')
    parser.add_argument('--min-vb-change', '-e', dest='min_vb_change', type=float, default=1, metavar=' ',
                        help='The amount by which the variational bound must change at each log-interval to avoid inference being stopped early')
    parser.add_argument('--topic-var', dest='topic_var', type=float, default=DefaultPriorCov, metavar=' ',
                        help='Scale of the prior isotropic variance over topics')
    parser.add_argument('--feat-var', dest='feat_var', type=float, default=DefaultPriorCov, metavar=' ',
                        help='Scale of the prior isotropic variance over features')
    parser.add_argument('--lat-topic-var', dest='lat_topic_var', type=float, default=DefaultPriorCov, metavar=' ',
                        help='Scale of the prior isotropic variance over latent topics')
    parser.add_argument('--lat-feat-var', dest='lat_feat_var', type=float, default=DefaultPriorCov, metavar=' ',
                        help='Scale of the prior isotropic variance over latent features')
    parser.add_argument('--vocab-prior', dest='vocabPrior', type=float, default=1.1, metavar=' ',
                        help='Symmetric prior over the vocabulary')
    parser.add_argument('--folds', '-f', dest='folds', type=int, default=1, metavar=' ',
                        help='Number of cross-validation folds')
    parser.add_argument('--truncate-folds', dest='eval_fold_count', type=int, default=-1, metavar=' ',
                        help='If set, stop running after the given number of folds have been processed')
    parser.add_argument('--debug', '-b', dest='debug', action='store_true', default=False,
                        help='Display a debug message, with the bound, after every variable update')
    parser.add_argument('--dtype', '-t', dest='dtype', default="f4:f4", metavar=' ',
                        help='Datatype to use; values are i4, f4 and f8. Specify two, a data dtype and a model dtype, delimited by a colon')
    parser.add_argument('--limit-to', dest='limit', type=int, default=0, metavar=' ',
                        help='If set, discard all but the initial given number of rows of the input dataset')
    parser.add_argument('--word-dict', dest='word_dict', default=None, metavar=' ',
                        help='A dictionary of all words. Used to identify hashtag indices')
    parser.add_argument('--lda-model', dest='ldaModel', default=None, metavar=' ',
                        help='A trained LDA model, used with the LRO model')
    parser.add_argument('--feats-mask', dest='features_mask_str', default=None, metavar=' ',
                        help='Feature mask to use with FeatSplit runs: a comma-delimited list of colon-delimited pairs')

    #
    # Initialization of the app: first parse the arguments
    #
    print("Random seed is 0xC0FFEE")
    rd.seed(0xC0FFEE)

    print("Args are: " + str(args))
    args = parser.parse_args(args)

    K, P, Q = args.K, args.P, args.Q
    features_mask = parse_features_mask(args)
    (input_dtype, output_dtype) = parse_dtypes(args.dtype)
    fv, tv, lfv, ltv = args.feat_var, args.topic_var, args.lat_feat_var, args.lat_topic_var

    #
    # Load and prune the data
    #
    data = DataSet.from_files(args.words, args.feats, args.links, limit=args.limit)
    data.convert_to_dtype(input_dtype)
    data.prune_and_shuffle(min_doc_len=3, min_link_count=MinLinkCountPrune)

    print("The combined word-count of the %d documents is %.0f, drawn from a vocabulary of %d distinct terms"
          % (data.doc_count, data.word_count, data.words.shape[1]))
    if data.add_intercept_to_feats_if_required():
        print("Appended an intercept to the given features")

    #
    # Instantiate and configure the model
    #
    if args.ldaModel is not None:
        ldaModel, ldaTopics = load_and_adapt_lda_model(args.ldaModel, data.order)
    else:
        ldaModel, ldaTopics = None, None

    print("Building template model... ", end="")
", end="") if args.model == CtmBouchard: import model.ctm as mdl templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype) elif args.model == CtmBohning: import model.ctm_bohning as mdl templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype) elif args.model == StmYvBouchard: import model.stm_yv as mdl templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype) elif args.model == StmYvBohning: import model.stm_yv_bohning as mdl templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype) elif args.model == StmYvBohningFakeOnline: import model.stm_yv_bohning_fake_online as mdl templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype) elif args.model == StmUyvBohning: import model.stm_uv_vec_y_bohning as mdl templateModel = mdl.newModelAtRandom(data, K=K, Q=Q, P=P, tv=tv, ltv=ltv, fv=fv, lfv=lfv, vocabPrior=args.vocabPrior, dtype=output_dtype) elif args.model == LdaCvbZero: import model.lda_cvb as mdl templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype) elif args.model == LdaVb: import model.lda_vb_python as mdl templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype) elif args.model == LdaGibbs: import model.lda_gibbs as mdl templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype) elif args.model == Rtm: import model.rtm as mdl templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype) elif args.model == Mtm: import model.mtm2 as mdl templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype) elif args.model == Mtm2: import model.mtm3 as mdl templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype) elif args.model == Dmr: import model.dmr as mdl templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype) elif args.model == Lro: import model.lro_vb as mdl templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype) elif args.model == SimLda: import model.sim_based_rec as mdl templateModel = mdl.newModelAtRandom(data, K, method=mdl.LDA, dtype=output_dtype) elif args.model == SimTfIdf: import model.sim_based_rec as mdl templateModel = mdl.newModelAtRandom(data, K, method=mdl.TF_IDF, dtype=output_dtype) else: raise ValueError ("Unknown model identifier " + args.model) print("Done") trainPlan = mdl.newTrainPlan(args.iters, debug=args.debug) queryPlan = mdl.newTrainPlan(args.query_iters, debug=args.debug) if args.eval == Perplexity: return cross_val_and_eval_perplexity(data, mdl, templateModel, trainPlan, queryPlan, args.folds, args.eval_fold_count, args.out_model) elif args.eval == HashtagPrecAtM: return cross_val_and_eval_hashtag_prec_at_m(data, mdl, templateModel, trainPlan, load_dict(args.word_dict), args.folds, args.eval_fold_count, args.out_model) elif args.eval == MeanAveragePrecAllDocs: return link_split_map (data, mdl, templateModel, trainPlan, args.folds, args.out_model) elif args.eval == MeanPrecRecAtMAllDocs: return link_split_prec_rec (data, mdl, templateModel, trainPlan, args.folds, args.eval_fold_count, args.out_model, ldaModel, ldaTopics) elif args.eval == LroMeanPrecRecAtMAllDocs: return insample_lro_style_prec_rec (data, mdl, templateModel, trainPlan, args.folds, args.eval_fold_count, args.out_model, ldaModel, ldaTopics) elif args.eval == LroMeanPrecRecAtMFeatSplit: return outsample_lro_style_prec_rec (data, mdl, templateModel, trainPlan, features_mask, args.out_model, ldaModel, ldaTopics) else: raise ValueError("Unknown evaluation metric " + 
args.eval) return modelFiles
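
# Example invocation (hypothetical paths and option values; the --model and
# --eval strings must be drawn from ModelNames and EvalNames respectively):
#
#   run(["--model", CtmBohning, "--num-topics", "20",
#        "--words", "/path/to/words.pkl",
#        "--eval", Perplexity, "--folds", "5", "--iters", "500"])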