예제 #1
0
 def testOnRealData(self):
     # Trains a 20-topic CTM/Bouchard model on the pickled NIPS corpus,
     # pickles the trained result, plots the bound/likelihood curves and
     # the learnt vocabulary, then prints perplexity and a table of the
     # top words for every topic.
     print ("CTM/Bouchard")
     rd.seed(0xBADB055)
     corpusDir = "/Users/bryanfeeney/Desktop/NIPS"
     with open(corpusDir + "/ar.pkl", 'rb') as pickleFile:
         _, W, _, d = pkl.load(pickleFile)

     # The dictionary is sometimes wrapped in a one-element container
     if len(d) == 1:
         d = d[0]

     if W.dtype != DTYPE:
         W = W.astype(DTYPE)

     # Discard documents (rows) that contain no words at all
     rowTotals    = np.squeeze(np.asarray(W.sum(axis=1)))
     nonEmptyRows = (np.where(rowTotals > 0.5))[0]
     if len(nonEmptyRows) < W.shape[0]:
         print ("Some rows in the doc-term matrix are empty. These have been removed.")
     W = W[nonEmptyRows, :]

     # Inverse term-frequency weights, used to rank words when the
     # vocabulary is printed at the end
     termTotals = np.squeeze(np.asarray(W.sum(axis=0)))
     scale      = np.reciprocal(1 + termTotals)

     # Set up the model, its per-document state and the training plan
     K = 20
     model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
     queryState = ctm.newQueryState(W, model)
     trainPlan  = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=True)

     # Train, then immediately persist the outcome for later inspection
     model, query, (bndItrs, bndVals, bndLikes) = ctm.train (W, None, model, queryState, trainPlan)
     with open(newModelFileFromModel(model), "wb") as modelFile:
         pkl.dump ((model, query, (bndItrs, bndVals, bndLikes)), modelFile)

     # Bound (left axis, blue) and likelihood (right axis, red) by iteration
     fig, boundAxis = plt.subplots()
     boundAxis.plot(bndItrs, bndVals, 'b-')
     boundAxis.set_xlabel('Iterations')
     boundAxis.set_ylabel('Bound', color='b')

     likelyAxis = boundAxis.twinx()
     likelyAxis.plot(bndItrs, bndLikes, 'r-')
     likelyAxis.set_ylabel('Likelihood', color='r')

     fig.show()
     fig.suptitle("CTM/Bouchard (Identity Cov) on NIPS")
     plt.show()

     # The learnt vocabulary rendered as a greyscale image
     plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
     plt.show()

     # Rank the words of each topic by their scaled vocabulary weight
     topWordCount = 100
     kTopWordInds = []
     for k in range(K):
         kTopWordInds.append(self.topWordInds(d, model.vocab[k,:] * scale, topWordCount))

     print ("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))

     # One column per topic: header first, then one row per word rank
     headings = ["Topic " + str(k) for k in range(K)]
     print ("\t\t".join (headings))

     tableRows = []
     for c in range(topWordCount):
         cells = []
         for k in range(K):
             wordInd = kTopWordInds[k][c]
             cells.append (d[wordInd] + "\t%0.4f" % model.vocab[k][wordInd])
         tableRows.append ("\t".join (cells))
     print ("\n".join (tableRows))
예제 #2
0
def newTrainPlan(iterations = 100, epsilon=2, logFrequency=10, fastButInaccurate=False, debug=DEBUG):
    '''
    Build a training plan for this model.

    The arguments are handed unchanged to ctm.newTrainPlan; the settings
    of the plan it returns (iterations, epsilon, logFrequency,
    fastButInaccurate and debug) are then copied, in that order, into a
    new TrainPlan which is returned.
    '''
    ctmPlan = ctm.newTrainPlan(iterations, epsilon, logFrequency, fastButInaccurate, debug)
    return TrainPlan(
        ctmPlan.iterations,
        ctmPlan.epsilon,
        ctmPlan.logFrequency,
        ctmPlan.fastButInaccurate,
        ctmPlan.debug)
예제 #3
0
 def _testOnModelDerivedExample(self):
     # Cross-validates the CTM on a corpus sampled from the model itself.
     # For each setting of the fastButInaccurate flag (False, then True)
     # every fold is trained and then queried; the per-fold likelihoods and
     # perplexities are printed, and both perplexities are asserted to be
     # below 60.
     print("Cross-validated likelihoods on model-derived example")

     rd.seed(0xBADB055) # Global init for repeatable test
     D, T, K = 1000, 100, 7 # Document count, vocabulary size ("term count") and topic count
     tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

     W = W.astype(DTYPE)

     plt.imshow(vocab, interpolation="none", cmap = cm.Greys_r)
     plt.show()


     # Create the cross-validation folds. The fold size is derived from
     # `folds` so that the two can never fall out of step.
     folds     = 5
     foldSize  = ceil(D / folds)
     querySize = foldSize
     trainSize = D - querySize

     for useDiagonalPriorCov in [False, True]:
         trainLikely = []
         trainWordCount = []
         queryLikely = []
         queryWordCount = []

         for fold in range(folds):
             # Split the datasets. Indices wrap around modulo D, so the
             # query block is whatever follows the training block.
             start = fold * foldSize
             end   = start + trainSize

             trainSet = np.arange(start,end) % D
             querySet = np.arange(end, end + querySize) % D

             W_train = W[trainSet,:]
             W_query = W[querySet,:]

             # Train the model
             model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
             queryState = ctm.newQueryState(W_train, model)

             plan  = ctm.newTrainPlan(iterations=40, logFrequency=1, fastButInaccurate=useDiagonalPriorCov, debug=True)
             model, queryState, (bndItrs, bndVals, likelies) = ctm.train (W_train, None, model, queryState, plan)

             # Plot the evolution of the bound during training.
             fig, ax1 = plt.subplots()
             ax1.plot(bndItrs, bndVals, 'b-')
             ax1.set_xlabel('Iterations')
             ax1.set_ylabel('Bound', color='b')

             ax2 = ax1.twinx()
             ax2.plot(bndItrs, likelies, 'r-')
             ax2.set_ylabel('Likelihood', color='r')

             fig.show()

             # Plot the topic covariance
             self._plotCov(model)

             # Plot the vocab
             plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
             plt.show()

             # Calculating the training set likelihood
             trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
             trainWordCount.append(W_train.data.sum())

             # Now query the model.
             plan       = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
             queryState = ctm.newQueryState(W_query, model)
             model, queryState = ctm.query(W_query, None, model, queryState, plan)

             queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
             queryWordCount.append(W_query.data.sum())

         # Print out the likelihood and perplexity for each fold.
         print ("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
         for fold in range(folds):
             # Perplexity is exp(-log-likelihood / word-count)
             trainPerp = np.exp(-trainLikely[fold]/trainWordCount[fold])
             queryPerp = np.exp(-queryLikely[fold]/queryWordCount[fold])

             print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
             print("                    Perplexity: %12.2f \t           Perplexity: %12.2f" % (trainPerp, queryPerp))

             self.assertTrue(queryPerp < 60.0) # Maximum perplexity.
             self.assertTrue(trainPerp < 60.0)
         print ("\n\n")

     print("End of Test")
예제 #4
0
    def _testOnModelHandcraftedData(self):
        # Builds five hand-drawn 3x3 "bar" topics, samples a 1000-document
        # corpus from correlated topic weights, and hands a freshly
        # initialised model, query state and train plan to self._doTest,
        # whose result is returned.
        #
        # Create the vocab
        #
        K = 5

        # Horizontal bars
        vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3,3)).todense()
        #vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3,3)).todense()
        vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])), shape=(3,3)).todense()

        # Vertical bars
        vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])), shape=(3,3)).todense()
        #vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3,3)).todense()
        vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])), shape=(3,3)).todense()

        # Diagonals
        vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])), shape=(3,3)).todense()
        #vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3,3)).todense()

        # Put together; the term count is the number of cells in one image
        T = vocab1.shape[0] * vocab1.shape[1]
        vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7]

        # Create a single matrix with the flattened vocabularies
        vocabVectors = []
        for vocab in vocabs:
            vocabVectors.append (np.squeeze(np.asarray (vocab.reshape((1,T)))))

        vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE))

        # Plot the vocab. Matplotlib numbers subplots from 1, not 0, so the
        # zero-based topic index has to be shifted by one.
        ones = np.ones(vocabs[0].shape)
        for k in range(K):
            plt.subplot(2, 3, k + 1)
            plt.imshow(ones - vocabs[k], interpolation="none", cmap = cm.Greys_r)
        plt.show()

        #
        # Create the corpus
        #
        rd.seed(0xC0FFEE)
        D = 1000

        # Make sense (of a sort) of this by assuming that these correspond to
        # Kittens    Omelettes    Puppies    Oranges    Tomatoes    Dutch People    Basketball    Football
        #topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25])
        #topicCovar = np.array(\
        #    [[ 100,    5,     55,      20,     5,     15,      4,      0], \
        #     [ 5,    100,      5,      10,    70,      5,      0,      0], \
        #     [ 55,     5,    100,       5,     5,     10,      0,      5], \
        #     [ 20,    10,      5,     100,    30,     30,     20,     10], \
        #     [ 5,     70,      5,     30,    100,      0,      0,      0], \
        #     [ 15,     5,     10,     30,      0,    100,     10,     40], \
        #     [ 4,      0,      0,     20,      0,     10,    100,     20], \
        #     [ 0,      0,      5,     10,      0,     40,     20,    100]], dtype=DTYPE) / 100.0

        topicMean = np.array([25, 15, 40, 5, 15])
        self.assertEqual(100, topicMean.sum())
        topicCovar = np.array(\
            [[ 100,    5,     55,      20,     5     ], \
             [ 5,    100,      5,      10,    70     ], \
             [ 55,     5,    100,       5,     5     ], \
             [ 20,    10,      5,     100,    30     ], \
             [ 5,     70,      5,     30,    100     ], \
             ], dtype=DTYPE) / 100.0


        meanWordCount = 80
        wordCounts = rd.poisson(meanWordCount, size=D)
        # NOTE(review): the Gaussian draws are used directly as topic weights,
        # without being normalised per document - confirm this is intended.
        topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D)
        W = topicDists.dot(vocab) * wordCounts[:, np.newaxis]
        W = ssp.csr_matrix (W.astype(DTYPE))

        #
        # Train the model
        #
        model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
        queryState = ctm.newQueryState(W, model)
        trainPlan  = ctm.newTrainPlan(iterations=65, logFrequency=1)

        # Checks that the freshly initialised topic mean sums to (about) one
        self.assertTrue (0.99 < np.sum(model.topicMean) < 1.01)

        return self._doTest (W, model, queryState, trainPlan)
예제 #5
0
def run(args):
    '''
    Parses the command-line arguments (excluding the application name portion).
    Executes a cross-validation run accordingly, saving the output at the end
    of each run.

    args is the list of argument strings handed to the argument parser.

    Returns whatever the selected evaluation routine returns (originally
    documented as the list of files created).

    Raises ValueError if the model identifier or the evaluation metric is
    not one of the recognised names.
    '''

    def _parse_bool_flag(text):
        # argparse's type=bool calls bool() on the raw string, so every
        # non-empty value - including "False" and "0" - came out as True.
        # Explicit "false-like" spellings are now honoured; anything else
        # still counts as True, as before.
        return str(text).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    #
    # Enumerate all possible arguments
    #
    parser = ap.ArgumentParser(description='Execute a topic-modeling run.')
    parser.add_argument('--model', '-m', dest='model', metavar=' ',
                    help='The type of mode to use, options are ' + ModelNames)
    parser.add_argument('--num-topics', '-k', dest='K', type=int, metavar=' ',
                    help='The number of topics to fit')
    parser.add_argument('--num-lat-topics', '-q', dest='Q', type=int, metavar=' ',
                    help='The number of latent topics (i.e. rank of the topic covariance matrix)')
    parser.add_argument('--num-lat-feats', '-p', dest='P', type=int, metavar=' ',
                    help='The number of latent features (i.e. rank of the features covariance matrix)')
    parser.add_argument('--words', '-w', dest='words', metavar=' ',
                    help='The path to the pickle file containing a DxT array or matrix of the word-counts across all D documents')
    parser.add_argument('--feats', '-x', dest='feats', metavar=' ',
                    help='The path to the pickle file containing a DxF array or matrix of the features across all D documents')
    parser.add_argument('--links', '-c', dest='links', metavar=' ',
                    help='The path to the pickle file containing a DxP array or matrix of the links (citations) emanated by all D documents')
    parser.add_argument('--eval', '-v', dest='eval', default=Perplexity, metavar=' ',
                    help='Evaluation metric, available options are: ' + ','.join(EvalNames))
    parser.add_argument('--out-model', '-o', dest='out_model', default=None, metavar=' ',
                    help='Optional output path in which to store the model')
    parser.add_argument('--log-freq', '-l', dest='log_freq', type=int, default=10, metavar=' ',
                    help='Log frequency - how many times to inspect the bound while running')
    parser.add_argument('--iters', '-i', dest='iters', type=int, default=500, metavar=' ',
                    help='The maximum number of iterations to run when training')
    parser.add_argument('--query-iters', '-j', dest='query_iters', type=int, default=100, metavar=' ',
                    help='The maximum number of iterations to run when querying, by default same as when training')
    parser.add_argument('--min-vb-change', '-e', dest='min_vb_change', type=float, default=1, metavar=' ',
                    help='The amount by which the variational bound must change at each log-interval to avoid inference being stopped early.')
    parser.add_argument('--topic-var', dest='topic_var', type=float, default=DefaultPriorCov, metavar=' ',
                    help="Scale of the prior isotropic variance over topics")
    parser.add_argument('--feat-var', dest='feat_var', type=float, default=DefaultPriorCov, metavar=' ',
                    help="Scale of the prior isotropic variance over features")
    parser.add_argument('--lat-topic-var', dest='lat_topic_var', type=float, default=DefaultPriorCov, metavar=' ',
                    help="Scale of the prior isotropic variance over latent topics")
    parser.add_argument('--lat-feat-var', dest='lat_feat_var', type=float, default=DefaultPriorCov, metavar=' ',
                    help="Scale of the prior isotropic variance over latent features")
    parser.add_argument('--vocab-prior', dest='vocabPrior', type=float, default=1.1, metavar=' ',
                    help="Symmetric prior over the vocabulary")
    parser.add_argument('--folds', '-f', dest='folds', type=int, default=1, metavar=' ',
                    help="Number of cross validation folds.")
    parser.add_argument('--truncate-folds', dest='eval_fold_count', type=int, default=-1, metavar=' ',
                    help="If set, stop running after the given number of folds had been processed")
    parser.add_argument('--debug', '-b', dest='debug', type=_parse_bool_flag, default=False, metavar=' ',
                    help="Display a debug message, with the bound, after every variable update")
    parser.add_argument('--dtype', '-t', dest='dtype', default="f4:f4", metavar=' ',
                    help="Datatype to use, values are i4, f4 and f8. Specify two, a data dtype and model dtype, delimited by a colon")
    parser.add_argument('--limit-to', dest='limit', type=int, default=0, metavar=' ',
                    help="If set, discard all but the initial given number of rows of the input dataset")
    parser.add_argument('--word-dict', dest='word_dict', default=None, metavar=' ',
                    help='A dictionary of all words. Used to identify hashtag indices')
    parser.add_argument('--lda-model', dest='ldaModel', default=None, metavar=' ',
                    help='A trained LDA model, used with the LRO model')
    parser.add_argument('--feats-mask', dest='features_mask_str', default=None, metavar=' ',
                    help='Feature mask to use with FeatSplit runs, comma-delimited list of colon-delimited pairs')

    #
    # Initialization of the app: first parse the arguments
    #
    print("Random seed is 0xC0FFEE")
    rd.seed(0xC0FFEE)

    print("Args are : " + str(args))
    args = parser.parse_args(args)
    K, P, Q = args.K, args.P, args.Q

    features_mask = parse_features_mask(args)
    (input_dtype, output_dtype)  = parse_dtypes(args.dtype)

    fv, tv, lfv, ltv = args.feat_var, args.topic_var, args.lat_feat_var, args.lat_topic_var

    #
    #  Load and prune the data
    #
    data = DataSet.from_files(args.words, args.feats, args.links, limit=args.limit)
    data.convert_to_dtype(input_dtype)
    data.prune_and_shuffle(min_doc_len=3, min_link_count=MinLinkCountPrune)

    print ("The combined word-count of the %d documents is %.0f, drawn from a vocabulary of %d distinct terms" % (data.doc_count, data.word_count, data.words.shape[1]))
    if data.add_intercept_to_feats_if_required():
        print ("Appended an intercept to the given features")


    #
    # Instantiate and configure the model
    #
    if args.ldaModel is not None:
        ldaModel, ldaTopics = load_and_adapt_lda_model(args.ldaModel, data.order)
    else:
        ldaModel, ldaTopics = None, None

    # Each branch imports the module implementing the requested model as
    # `mdl` and builds the template model from it; `mdl` is used again below
    # to create the train/query plans and is passed to the evaluation routine.
    print ("Building template model... ", end="")
    if args.model == CtmBouchard:
        import model.ctm as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == CtmBohning:
        import model.ctm_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBouchard:
        import model.stm_yv as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBohning:
        import model.stm_yv_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBohningFakeOnline:
        import model.stm_yv_bohning_fake_online as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmUyvBohning:
        import model.stm_uv_vec_y_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, K=K, Q=Q, P=P,  tv=tv, ltv=ltv, fv=fv, lfv=lfv, vocabPrior=args.vocabPrior, dtype=output_dtype)
    elif args.model == LdaCvbZero:
        import model.lda_cvb as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == LdaVb:
        import model.lda_vb_python as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == LdaGibbs:
        import model.lda_gibbs as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Rtm:
        import model.rtm as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Mtm:
        import model.mtm2 as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Mtm2:
        import model.mtm3 as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Dmr:
        import model.dmr as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Lro:
        import model.lro_vb as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == SimLda:
        import model.sim_based_rec as mdl
        templateModel = mdl.newModelAtRandom(data, K, method=mdl.LDA, dtype=output_dtype)
    elif args.model == SimTfIdf:
        import model.sim_based_rec as mdl
        templateModel = mdl.newModelAtRandom(data, K, method=mdl.TF_IDF, dtype=output_dtype)
    else:
        # str() keeps this a ValueError even when --model was omitted (None)
        raise ValueError ("Unknown model identifier " + str(args.model))
    print("Done")

    # NOTE(review): --log-freq and --min-vb-change are parsed above but are
    # never forwarded to newTrainPlan - confirm whether that is intentional.
    trainPlan = mdl.newTrainPlan(args.iters, debug=args.debug)
    queryPlan = mdl.newTrainPlan(args.query_iters, debug=args.debug)

    # Dispatch to the requested evaluation routine; every branch either
    # returns that routine's result or raises.
    if args.eval == Perplexity:
        return cross_val_and_eval_perplexity(data, mdl, templateModel, trainPlan, queryPlan, args.folds, args.eval_fold_count, args.out_model)
    elif args.eval == HashtagPrecAtM:
        return cross_val_and_eval_hashtag_prec_at_m(data, mdl, templateModel, trainPlan, load_dict(args.word_dict), args.folds, args.eval_fold_count, args.out_model)
    elif args.eval == MeanAveragePrecAllDocs:
        return link_split_map (data, mdl, templateModel, trainPlan, args.folds, args.out_model)
    elif args.eval == MeanPrecRecAtMAllDocs:
        return link_split_prec_rec (data, mdl, templateModel, trainPlan, args.folds, args.eval_fold_count, args.out_model, ldaModel, ldaTopics)
    elif args.eval == LroMeanPrecRecAtMAllDocs:
        return insample_lro_style_prec_rec (data, mdl, templateModel, trainPlan, args.folds, args.eval_fold_count, args.out_model, ldaModel, ldaTopics)
    elif args.eval == LroMeanPrecRecAtMFeatSplit:
        return outsample_lro_style_prec_rec (data, mdl, templateModel, trainPlan, features_mask, args.out_model, ldaModel, ldaTopics)
    else:
        raise ValueError("Unknown evaluation metric " + str(args.eval))