예제 #1
0
def newModelAtRandom(data, P, K, featVar, latFeatVar, vocabPrior=VocabPrior, dtype=DTYPE):
    '''
    Creates a new, randomly initialised ModelState for the given training
    set and number of topics.

    A CTM model is first created at random via ctm.newModelAtRandom(); its
    sigT, vocab and vocabPrior fields are copied into the result. The
    feature-related parameters are then built here: Y (K x P) and V (P x F)
    are filled using rd.random(), and A (K x F) is their product.

    Param:
    data - the dataset of words, features and links. Only data.feats is
           read directly here (its second dimension is taken as the feature
           count F); the dataset itself is handed on to ctm.newModelAtRandom()
    P - the size of the latent feature-space, P << F
    K - the number of topics, which must be greater than one
    featVar - the prior variance of the feature-space: this is a
              scalar used to scale an F x F identity matrix
    latFeatVar - the prior variance of the latent feature-space: this
                 is a scalar used to scale a P x P identity matrix
    vocabPrior - passed through unchanged to ctm.newModelAtRandom()
    dtype - the datatype of the matrices created here

    Return:
    A ModelState object
    '''
    assert K > 1, "There must be at least two topics"

    # NB: the order of the random draws (CTM model, then Y, then V) determines
    # the values obtained for a given seed, so it must not be altered.
    ctmModel = ctm.newModelAtRandom(data, K, vocabPrior, dtype)
    _, F = data.feats.shape

    Y = rd.random((K, P)).astype(dtype)
    V = rd.random((P, F)).astype(dtype)
    A = np.dot(Y, V)

    # Isotropic covariances: a scalar variance times an identity matrix
    R_Y = latFeatVar * np.eye(P, dtype=dtype)
    R_A = featVar * np.eye(F, dtype=dtype)

    return ModelState(
        F, P, K,
        A, R_A, featVar,
        Y, R_Y, latFeatVar,
        V,
        ctmModel.sigT, ctmModel.vocab, ctmModel.vocabPrior,
        dtype, MODEL_NAME)
예제 #2
0
 def testOnRealData(self):
     '''
     Trains a 20-topic CTM model on a pickled doc-term matrix read from a
     hard-coded local path, saves the trained model to a file, plots the
     bound and likelihood recorded during training, and prints the
     perplexity together with the top 100 words of every topic.

     The pickle is expected to hold a four-element tuple whose second
     element is the doc-term matrix and whose fourth is the word dictionary
     (indexable by word index), possibly wrapped in a one-element container.
     '''
     print ("CTM/Bouchard")
     rd.seed(0xBADB055)

     dataDir = "/Users/bryanfeeney/Desktop/NIPS"
     with open(dataDir + "/ar.pkl", 'rb') as dataFile:
         _, W, _, d = pkl.load(dataFile)

     # Unwrap the dictionary if it came in a one-element container
     if len(d) == 1:
         d = d[0]

     if W.dtype != DTYPE:
         W = W.astype(DTYPE)

     # Discard documents that contain no words at all
     wordsPerDoc  = np.squeeze(np.asarray(W.sum(axis=1)))
     nonEmptyRows = np.where(wordsPerDoc > 0.5)[0]
     if len(nonEmptyRows) < W.shape[0]:
         print ("Some rows in the doc-term matrix are empty. These have been removed.")
     W = W[nonEmptyRows, :]

     # IDF frequency for when we print out the vocab later
     termFreq = np.squeeze(np.asarray(W.sum(axis=0)))
     scale    = np.reciprocal(1 + termFreq)

     # Initialise the model
     K = 20
     model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
     queryState = ctm.newQueryState(W, model)
     trainPlan  = ctm.newTrainPlan(iterations=100, logFrequency=10, fastButInaccurate=False, debug=True)

     # Train the model, and then immediately save the result to a file for
     # subsequent inspection
     model, query, (bndItrs, bndVals, bndLikes) = ctm.train (W, None, model, queryState, trainPlan)
     trainResult = (model, query, (bndItrs, bndVals, bndLikes))
     with open(newModelFileFromModel(model), "wb") as modelFile:
         pkl.dump (trainResult, modelFile)

     # Plot the evolution of the bound (left axis) and of the likelihood
     # (right axis) during training.
     fig, boundAxis = plt.subplots()
     boundAxis.plot(bndItrs, bndVals, 'b-')
     boundAxis.set_xlabel('Iterations')
     boundAxis.set_ylabel('Bound', color='b')

     likelyAxis = boundAxis.twinx()
     likelyAxis.plot(bndItrs, bndLikes, 'r-')
     likelyAxis.set_ylabel('Likelihood', color='r')

     fig.show()
     fig.suptitle("CTM/Bouchard (Identity Cov) on NIPS")
     plt.show()

     # Show the learnt vocabulary as a greyscale image
     plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
     plt.show()

     # Print out the most likely topic words
     topWordCount = 100
     kTopWordInds = [self.topWordInds(d, model.vocab[k,:] * scale, topWordCount)
                     for k in range(K)]

     print ("Perplexity: %f\n\n" % ctm.perplexity(W, model, query))

     header = ["Topic " + str(k) for k in range(K)]
     print ("\t\t".join (header))

     # One output line per rank; each line holds, for every topic in turn,
     # the word at that rank followed by its (unscaled) vocabulary weight
     tableLines = []
     for c in range(topWordCount):
         cells = []
         for k in range(K):
             wordInd = kTopWordInds[k][c]
             cells.append(d[wordInd] + "\t%0.4f" % model.vocab[k][wordInd])
         tableLines.append("\t".join (cells))
     print ("\n".join (tableLines))
예제 #3
0
 def _testOnModelDerivedExample(self):
     '''
     Cross-validates the CTM model on a corpus sampled via
     self._sampleFromModel().

     The whole cross-validation is run twice: once with the train plan's
     fastButInaccurate flag off (reported as "full" covariances) and once
     with it on (reported as "diagonal"). For every fold a model is trained
     on the training rows, the bound, topic covariance and vocabulary are
     plotted, and the log-likelihood of both the training and the query
     rows is recorded. The per-fold perplexities are then printed and each
     is asserted to be below 60.
     '''
     print("Cross-validated likelihoods on model-derived example")

     rd.seed(0xBADB055) # Global init for repeatable test
     D, T, K = 1000, 100, 7 # Document count, vocabulary size ("term count") and topic count
     tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)

     W = W.astype(DTYPE)

     plt.imshow(vocab, interpolation="none", cmap = cm.Greys_r)
     plt.show()

     # Create the cross-validation folds. The fold size is derived from the
     # fold count so that the two cannot drift apart.
     folds     = 5
     foldSize  = ceil(D / folds)
     querySize = foldSize
     trainSize = D - querySize

     for useDiagonalPriorCov in [False, True]:
         trainLikely = []
         trainWordCount = []
         queryLikely = []
         queryWordCount = []

         for fold in range(folds):
             # Split the datasets: the training rows start at this fold's
             # offset and wrap around the end of the corpus; the query rows
             # are the block that immediately follows them.
             start = fold * foldSize
             end   = start + trainSize

             trainSet = np.arange(start,end) % D
             querySet = np.arange(end, end + querySize) % D

             W_train = W[trainSet,:]
             W_query = W[querySet,:]

             # Train the model
             model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
             queryState = ctm.newQueryState(W_train, model)

             plan  = ctm.newTrainPlan(iterations=40, logFrequency=1, fastButInaccurate=useDiagonalPriorCov, debug=True)
             model, queryState, (bndItrs, bndVals, likelies) = ctm.train (W_train, None, model, queryState, plan)

             # Plot the evolution of the bound during training.
             fig, ax1 = plt.subplots()
             ax1.plot(bndItrs, bndVals, 'b-')
             ax1.set_xlabel('Iterations')
             ax1.set_ylabel('Bound', color='b')

             ax2 = ax1.twinx()
             ax2.plot(bndItrs, likelies, 'r-')
             ax2.set_ylabel('Likelihood', color='r')

             fig.show()

             # Plot the topic covariance
             self._plotCov(model)

             # Plot the vocab
             plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
             plt.show()

             # Calculating the training set likelihood
             trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
             trainWordCount.append(W_train.data.sum())

             # Now query the model.
             plan       = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
             queryState = ctm.newQueryState(W_query, model)
             model, queryState = ctm.query(W_query, None, model, queryState, plan)

             queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
             queryWordCount.append(W_query.data.sum())

         # Print out the likelihood and perplexity for each fold.
         print ("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
         for fold in range(folds):
             # Perplexity is the exponential of the negative per-word log-likelihood
             trainPerp = np.exp(-trainLikely[fold]/trainWordCount[fold])
             queryPerp = np.exp(-queryLikely[fold]/queryWordCount[fold])

             print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
             print("                    Perplexity: %12.2f \t           Perplexity: %12.2f" % (trainPerp, queryPerp))

             # Maximum perplexity. assertLess reports both operands on failure.
             self.assertLess(queryPerp, 60.0)
             self.assertLess(trainPerp, 60.0)
         print ("\n\n")

     print("End of Test")
예제 #4
0
    def _testOnModelHandcraftedData(self):
        '''
        Builds a synthetic corpus from five hand-crafted 3x3 "bar" topics
        (two horizontal bars, two vertical bars and one diagonal), then
        hands a randomly initialised CTM model for that corpus to
        self._doTest().

        Return:
        Whatever self._doTest() returns.
        '''
        #
        # Create the vocab
        #
        K = 5

        # Horizontal bars
        vocab1 = ssp.coo_matrix(([1, 1, 1], ([0, 0, 0], [0, 1, 2])), shape=(3,3)).todense()
        #vocab2 = ssp.coo_matrix(([1, 1, 1], ([1, 1, 1], [0, 1, 2])), shape=(3,3)).todense()
        vocab3 = ssp.coo_matrix(([1, 1, 1], ([2, 2, 2], [0, 1, 2])), shape=(3,3)).todense()

        # Vertical bars
        vocab4 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 0, 0])), shape=(3,3)).todense()
        #vocab5 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [1, 1, 1])), shape=(3,3)).todense()
        vocab6 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [2, 2, 2])), shape=(3,3)).todense()

        # Diagonals
        vocab7 = ssp.coo_matrix(([1, 1, 1], ([0, 1, 2], [0, 1, 2])), shape=(3,3)).todense()
        #vocab8 = ssp.coo_matrix(([1, 1, 1], ([2, 1, 0], [0, 1, 2])), shape=(3,3)).todense()

        # Put together. The term count is the number of cells in one pattern.
        T = vocab1.shape[0] * vocab1.shape[1]
        vocabs = [vocab1, vocab3, vocab4, vocab6, vocab7]

        # Create a single matrix with the flattened vocabularies
        vocabVectors = []
        for vocab in vocabs:
            vocabVectors.append (np.squeeze(np.asarray (vocab.reshape((1,T)))))

        vocab = normalizerows_ip(np.array(vocabVectors, dtype=DTYPE))

        # Plot the vocab, one pattern per cell of a 2x3 grid. Subplot indices
        # are 1-based, hence k + 1 (an index of 0 is rejected by matplotlib).
        ones = np.ones(vocabs[0].shape)
        for k in range(K):
            plt.subplot(2, 3, k + 1)
            plt.imshow(ones - vocabs[k], interpolation="none", cmap = cm.Greys_r)
        plt.show()

        #
        # Create the corpus
        #
        rd.seed(0xC0FFEE)
        D = 1000

        # Make sense (of a sort) of this by assuming that these correspond to
        # Kittens    Omelettes    Puppies    Oranges    Tomatoes    Dutch People    Basketball    Football
        #topicMean = np.array([10, 25, 5, 15, 5, 5, 10, 25])
        #topicCovar = np.array(\
        #    [[ 100,    5,     55,      20,     5,     15,      4,      0], \
        #     [ 5,    100,      5,      10,    70,      5,      0,      0], \
        #     [ 55,     5,    100,       5,     5,     10,      0,      5], \
        #     [ 20,    10,      5,     100,    30,     30,     20,     10], \
        #     [ 5,     70,      5,     30,    100,      0,      0,      0], \
        #     [ 15,     5,     10,     30,      0,    100,     10,     40], \
        #     [ 4,      0,      0,     20,      0,     10,    100,     20], \
        #     [ 0,      0,      5,     10,      0,     40,     20,    100]], dtype=DTYPE) / 100.0

        topicMean = np.array([25, 15, 40, 5, 15])
        self.assertEqual(100, topicMean.sum())
        topicCovar = np.array(
            [[ 100,    5,     55,      20,     5     ],
             [ 5,    100,      5,      10,    70     ],
             [ 55,     5,    100,       5,     5     ],
             [ 20,    10,      5,     100,    30     ],
             [ 5,     70,      5,     30,    100     ],
             ], dtype=DTYPE) / 100.0

        # Each document gets a Poisson-distributed word count and a Gaussian
        # draw of topic weights; its term counts are the topic weights mixed
        # through the vocabulary and scaled by the word count.
        meanWordCount = 80
        wordCounts = rd.poisson(meanWordCount, size=D)
        topicDists = rd.multivariate_normal(topicMean, topicCovar, size=D)
        W = topicDists.dot(vocab) * wordCounts[:, np.newaxis]
        W = ssp.csr_matrix (W.astype(DTYPE))

        #
        # Train the model
        #
        model      = ctm.newModelAtRandom(W, K, dtype=DTYPE)
        queryState = ctm.newQueryState(W, model)
        trainPlan  = ctm.newTrainPlan(iterations=65, logFrequency=1)

        # Sanity check on the freshly initialised model's topic mean
        self.assertTrue (0.99 < np.sum(model.topicMean) < 1.01)

        return self._doTest (W, model, queryState, trainPlan)
예제 #5
0
def _parse_bool_flag(text):
    '''
    Converts the textual value of a boolean command-line option to a bool.

    Using type=bool with argparse is a trap: bool() of any non-empty string,
    including "False", is True. This converter recognises the usual spellings
    of true and false (case-insensitively) and rejects anything else.

    Raises:
    argparse.ArgumentTypeError if the text is not a recognised boolean
    '''
    normalized = text.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string is kept as False, as it was under type=bool
    if normalized in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise ap.ArgumentTypeError("Expected a boolean value (e.g. true or false) but got %r" % (text,))


def run(args):
    '''
    Parses the command-line arguments (excluding the application name portion).
    Executes a cross-validation run accordingly, saving the output at the end
    of each run.

    The steps are: seed the random number generator with a fixed value, parse
    the arguments, load and prune the dataset, build a template model for the
    model type named by --model, then hand over to the evaluation routine
    named by --eval.

    Param:
    args - the list of command-line argument strings

    Returns the value produced by the selected evaluation routine, i.e. the
    list of files created.

    Raises:
    ValueError if the model identifier or the evaluation metric is unknown
    '''

    #
    # Enumerate all possible arguments
    #
    parser = ap.ArgumentParser(description='Execute a topic-modeling run.')
    # NOTE(review): the concatenation below assumes ModelNames is a string
    # (EvalNames, by contrast, is joined) - confirm against its definition.
    parser.add_argument('--model', '-m', dest='model', metavar=' ',
                    help='The type of mode to use, options are ' + ModelNames)
    parser.add_argument('--num-topics', '-k', dest='K', type=int, metavar=' ',
                    help='The number of topics to fit')
    parser.add_argument('--num-lat-topics', '-q', dest='Q', type=int, metavar=' ',
                    help='The number of latent topics (i.e. rank of the topic covariance matrix)')
    parser.add_argument('--num-lat-feats', '-p', dest='P', type=int, metavar=' ',
                    help='The number of latent features (i.e. rank of the features covariance matrix)')
    parser.add_argument('--words', '-w', dest='words', metavar=' ',
                    help='The path to the pickle file containing a DxT array or matrix of the word-counts across all D documents')
    parser.add_argument('--feats', '-x', dest='feats', metavar=' ',
                    help='The path to the pickle file containing a DxF array or matrix of the features across all D documents')
    parser.add_argument('--links', '-c', dest='links', metavar=' ',
                    help='The path to the pickle file containing a DxP array or matrix of the links (citations) emanated by all D documents')
    parser.add_argument('--eval', '-v', dest='eval', default=Perplexity, metavar=' ',
                    help='Evaluation metric, available options are: ' + ','.join(EvalNames))
    parser.add_argument('--out-model', '-o', dest='out_model', default=None, metavar=' ',
                    help='Optional output path in which to store the model')
    # NOTE(review): --log-freq and --min-vb-change are parsed but never read
    # again in this function; in particular they are not passed to
    # newTrainPlan() below.
    parser.add_argument('--log-freq', '-l', dest='log_freq', type=int, default=10, metavar=' ',
                    help='Log frequency - how many times to inspect the bound while running')
    parser.add_argument('--iters', '-i', dest='iters', type=int, default=500, metavar=' ',
                    help='The maximum number of iterations to run when training')
    # NOTE(review): the help text claims the default matches --iters, but the
    # default here is a fixed 100.
    parser.add_argument('--query-iters', '-j', dest='query_iters', type=int, default=100, metavar=' ',
                    help='The maximum number of iterations to run when querying, by default same as when training')
    parser.add_argument('--min-vb-change', '-e', dest='min_vb_change', type=float, default=1, metavar=' ',
                    help='The amount by which the variational bound must change at each log-interval to avoid inference being stopped early.')
    parser.add_argument('--topic-var', dest='topic_var', type=float, default=DefaultPriorCov, metavar=' ',
                    help="Scale of the prior isotropic variance over topics")
    parser.add_argument('--feat-var', dest='feat_var', type=float, default=DefaultPriorCov, metavar=' ',
                    help="Scale of the prior isotropic variance over features")
    parser.add_argument('--lat-topic-var', dest='lat_topic_var', type=float, default=DefaultPriorCov, metavar=' ',
                    help="Scale of the prior isotropic variance over latent topics")
    parser.add_argument('--lat-feat-var', dest='lat_feat_var', type=float, default=DefaultPriorCov, metavar=' ',
                    help="Scale of the prior isotropic variance over latent features")
    parser.add_argument('--vocab-prior', dest='vocabPrior', type=float, default=1.1, metavar=' ',
                    help="Symmetric prior over the vocabulary")
    parser.add_argument('--folds', '-f', dest='folds', type=int, default=1, metavar=' ',
                    help="Number of cross validation folds.")
    parser.add_argument('--truncate-folds', dest='eval_fold_count', type=int, default=-1, metavar=' ',
                    help="If set, stop running after the given number of folds had been processed")
    # A dedicated converter is used because type=bool would turn every
    # non-empty value, "False" included, into True.
    parser.add_argument('--debug', '-b', dest='debug', type=_parse_bool_flag, default=False, metavar=' ',
                    help="Display a debug message, with the bound, after every variable update")
    parser.add_argument('--dtype', '-t', dest='dtype', default="f4:f4", metavar=' ',
                    help="Datatype to use, values are i4, f4 and f8. Specify two, a data dtype and model dtype, delimited by a colon")
    parser.add_argument('--limit-to', dest='limit', type=int, default=0, metavar=' ',
                    help="If set, discard all but the initial given number of rows of the input dataset")
    parser.add_argument('--word-dict', dest='word_dict', default=None, metavar=' ',
                    help='A dictionary of all words. Used to identify hashtag indices')
    parser.add_argument('--lda-model', dest='ldaModel', default=None, metavar=' ',
                    help='A trained LDA model, used with the LRO model')
    parser.add_argument('--feats-mask', dest='features_mask_str', default=None, metavar=' ',
                    help='Feature mask to use with FeatSplit runs, comma-delimited list of colon-delimited pairs')

    #
    # Initialization of the app: first parse the arguments
    #
    print("Random seed is 0xC0FFEE")
    rd.seed(0xC0FFEE)

    print("Args are : " + str(args))
    args = parser.parse_args(args)
    K, P, Q = args.K, args.P, args.Q

    features_mask = parse_features_mask(args)
    (input_dtype, output_dtype)  = parse_dtypes(args.dtype)

    fv, tv, lfv, ltv = args.feat_var, args.topic_var, args.lat_feat_var, args.lat_topic_var

    #
    #  Load and prune the data
    #
    data = DataSet.from_files(args.words, args.feats, args.links, limit=args.limit)
    data.convert_to_dtype(input_dtype)
    data.prune_and_shuffle(min_doc_len=3, min_link_count=MinLinkCountPrune)

    print ("The combined word-count of the %d documents is %.0f, drawn from a vocabulary of %d distinct terms" % (data.doc_count, data.word_count, data.words.shape[1]))
    if data.add_intercept_to_feats_if_required():
        print ("Appended an intercept to the given features")


    #
    # Instantiate and configure the model
    #
    if args.ldaModel is not None:
        ldaModel, ldaTopics = load_and_adapt_lda_model(args.ldaModel, data.order)
    else:
        ldaModel, ldaTopics = None, None

    # Each branch imports only the module of the selected model, under the
    # common alias mdl that the rest of this function works with.
    print ("Building template model... ", end="")
    if args.model == CtmBouchard:
        import model.ctm as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == CtmBohning:
        import model.ctm_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBouchard:
        import model.stm_yv as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBohning:
        import model.stm_yv_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmYvBohningFakeOnline:
        import model.stm_yv_bohning_fake_online as mdl
        templateModel = mdl.newModelAtRandom(data, P, K, fv, lfv, args.vocabPrior, dtype=output_dtype)
    elif args.model == StmUyvBohning:
        import model.stm_uv_vec_y_bohning as mdl
        templateModel = mdl.newModelAtRandom(data, K=K, Q=Q, P=P,  tv=tv, ltv=ltv, fv=fv, lfv=lfv, vocabPrior=args.vocabPrior, dtype=output_dtype)
    elif args.model == LdaCvbZero:
        import model.lda_cvb as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == LdaVb:
        import model.lda_vb_python as mdl
        templateModel = mdl.newModelAtRandom(data, K, args.vocabPrior, dtype=output_dtype)
    elif args.model == LdaGibbs:
        import model.lda_gibbs as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Rtm:
        import model.rtm as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Mtm:
        import model.mtm2 as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Mtm2:
        import model.mtm3 as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Dmr:
        import model.dmr as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == Lro:
        import model.lro_vb as mdl
        templateModel = mdl.newModelAtRandom(data, K, dtype=output_dtype)
    elif args.model == SimLda:
        import model.sim_based_rec as mdl
        templateModel = mdl.newModelAtRandom(data, K, method=mdl.LDA, dtype=output_dtype)
    elif args.model == SimTfIdf:
        import model.sim_based_rec as mdl
        templateModel = mdl.newModelAtRandom(data, K, method=mdl.TF_IDF, dtype=output_dtype)
    else:
        # str() because args.model is None when --model was not supplied
        raise ValueError ("Unknown model identifier " + str(args.model))
    print("Done")

    trainPlan = mdl.newTrainPlan(args.iters, debug=args.debug)
    queryPlan = mdl.newTrainPlan(args.query_iters, debug=args.debug)

    #
    # Run the requested evaluation. Every branch either returns or raises.
    #
    if args.eval == Perplexity:
        return cross_val_and_eval_perplexity(data, mdl, templateModel, trainPlan, queryPlan, args.folds, args.eval_fold_count, args.out_model)
    elif args.eval == HashtagPrecAtM:
        return cross_val_and_eval_hashtag_prec_at_m(data, mdl, templateModel, trainPlan, load_dict(args.word_dict), args.folds, args.eval_fold_count, args.out_model)
    elif args.eval == MeanAveragePrecAllDocs:
        return link_split_map (data, mdl, templateModel, trainPlan, args.folds, args.out_model)
    elif args.eval == MeanPrecRecAtMAllDocs:
        return link_split_prec_rec (data, mdl, templateModel, trainPlan, args.folds, args.eval_fold_count, args.out_model, ldaModel, ldaTopics)
    elif args.eval == LroMeanPrecRecAtMAllDocs:
        return insample_lro_style_prec_rec (data, mdl, templateModel, trainPlan, args.folds, args.eval_fold_count, args.out_model, ldaModel, ldaTopics)
    elif args.eval == LroMeanPrecRecAtMFeatSplit:
        return outsample_lro_style_prec_rec (data, mdl, templateModel, trainPlan, features_mask, args.out_model, ldaModel, ldaTopics)
    else:
        raise ValueError("Unknown evaluation metric " + str(args.eval))