예제 #1
0
def outsample_lro_style_prec_rec (data, mdl, sample_model, train_plan, feature_mask, model_dir=None, ldaModel=None, ldaTopics=None):
    '''
    Take a feature list. Train on all documents where none of those features
    are set. Remove the first element from the feature list, query all documents
    with that feature set, and then evaluate link prediction. Repeat until
    feature-list is empty.

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param feature_mask:  the list of features used to separate training from query
    This is a list of tuples, the left side is the feature label, the right side
    is the
    :param model_dir: if not none, the models are stored in this directory.
    :param ldaModel: for those models that utilise and LDA component, a pre-trained
    LDA model can be supplied.
    :param ldaTopics: the topics of all documents in the corpus as given by the ldaModel
    :return: the list of model files stored
    '''
    def prepareForTraining(data):
        if mdl.is_undirected_link_predictor():
            result = data.copy()
            result.convert_to_undirected_graph()
            result.convert_to_binary_link_matrix()
            return result
        else:
            return data

    ms = [10, 20, 30, 40, 50, 75, 100, 150, 250, 500]
    model_files = []

    combi_precs, combi_recs, combi_dcounts = None, None, None
    mrr_sum, mrr_doc_count = 0, 0
    map_sum, map_doc_count = 0, 0
    while len(feature_mask) > 0:
        # try:
        # Prepare the training and query data
        feature_mask_indices = [i for _,i in feature_mask]
        train_data, query_data, train_indices = data.split_on_feature(feature_mask_indices)
        (feat_label, feat_id) = feature_mask.pop(0)
        print ("\n\nFeature: %s\n" % (feat_label,) + ("-" * 80))

        train_data = prepareForTraining(train_data) # make symmetric, if necessary, after split, so we
                                                    # can compare symmetric with non-symmetric models

        # Train the model
        if model_uses_lda(sample_model):
            ldaModelSubset, ldaTopicsSubset = subsetLda(ldaModel, ldaTopics, train_indices)
            model      = mdl.newModelFromExisting(sample_model, withLdaModel=ldaModelSubset)
            train_tops = mdl.newQueryState(train_data, model, withLdaTopics=ldaTopicsSubset)
        else:
            model = mdl.newModelFromExisting(sample_model)
            train_tops = mdl.newQueryState(train_data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = \
            mdl.train(train_data, model, train_tops, train_plan)

        print ("Training perplexity is %.2f " % perplexity_from_like(mdl.log_likelihood(train_data, model, train_tops), train_data.word_count))

        # Infer the expected link probabilities
        query_tops    = mdl.newQueryState(query_data, model)
        _, query_tops = mdl.query(query_data, model, query_tops, train_plan)

        min_link_probs       = mdl.min_link_probs(model, train_tops, query_tops, query_data.links)
        predicted_link_probs = mdl.link_probs(model, train_tops, query_tops, min_link_probs)
        expected_links       = query_data.links

        # Evaluation 1/3: Precision and Recall at M
        precs, recs, doc_counts = mean_prec_rec_at (expected_links, predicted_link_probs, at=ms, groups=[(0,3), (3,5), (5,10), (10,1000)])
        print (" Mean-Precisions for feature %s (#%d)" % (feat_label, feat_id), end="")

        printTable("Precision", precs, doc_counts, ms)
        printTable("Recall",    recs,  doc_counts, ms)

        combi_precs, _             = combine_map(combi_precs, combi_dcounts, precs, doc_counts)
        combi_recs,  combi_dcounts = combine_map(combi_recs,  combi_dcounts, recs,  doc_counts)

        # Evaluation 2/3: Mean Reciprocal-Rank
        mrr = mean_reciprocal_rank(expected_links, predicted_link_probs)
        print ("Mean reciprocal-rank : %f" % mrr)
        mrr_sum       += mrr * expected_links.shape[0]
        mrr_doc_count += expected_links.shape[0]

        # Evaluation 3/3: Mean Average-Precision
        map = mean_average_prec (expected_links, predicted_link_probs)
        print ("Mean Average Precision : %f" % map)
        map_sum       += map * expected_links.shape[0]
        map_doc_count += expected_links.shape[0]

        # Save the files if necessary and move onto the next fold if required
        model_files = save_if_necessary(model_files, model_dir, model, data, feat_id, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        # except Exception as e:
        #     print("Fold " + str(fold) + " failed: " + str(e))

    print ("-" * 80 + "\n\n Final Results\n\n")
    printTable("Precision", combi_precs, combi_dcounts, ms)
    printTable("Recall",    combi_recs,  combi_dcounts, ms)
    print("Mean reciprocal-rank: %f" % (mrr_sum / mrr_doc_count))
    print("Mean average-precision: %f" % (map_sum / map_doc_count))

    return model_files
예제 #2
0
 def _testOnModelDerivedExample(self):
     print("Cross-validated likelihoods on model-derived example")
     useDiagonalPriorCov = True
     
     rd.seed(0xBADB055) # Global init for repeatable test
     D, T, K = 1000, 100, 7 # Document count, vocabularly size ("term count") and topic count
     tpcs, vocab, docLens, W = self._sampleFromModel(D, T, K)
     
     W = W.astype(DTYPE)
     
     plt.imshow(vocab, interpolation="none", cmap = cm.Greys_r)
     plt.show()
     
     
     # Create the cross-validation folds
     folds     = 5
     foldSize  = ceil(D / 5)
     querySize = foldSize
     trainSize = D - querySize
     
     for useDiagonalPriorCov in [False, True]:
         trainLikely = []
         trainWordCount = []
         queryLikely = []
         queryWordCount = []
         
         for fold in range(folds):
             # Split the datasets
             start = fold * foldSize
             end   = start + trainSize
             
             trainSet = np.arange(start,end) % D
             querySet = np.arange(end, end + querySize) % D
             
             W_train = W[trainSet,:]
             W_query = W[querySet,:]
             
             # Train the model
             model = ctm.newModelAtRandom(W_train, K, dtype=DTYPE)
             queryState = ctm.newQueryState(W_train, model)
             
             plan  = ctm.newTrainPlan(iterations=40, logFrequency=1, fastButInaccurate=useDiagonalPriorCov, debug=True)
             model, queryState, (bndItrs, bndVals, likelies) = ctm.train (W_train, None, model, queryState, plan)
                 
             # Plot the evolution of the bound during training.
             fig, ax1 = plt.subplots()
             ax1.plot(bndItrs, bndVals, 'b-')
             ax1.set_xlabel('Iterations')
             ax1.set_ylabel('Bound', color='b')
             
             ax2 = ax1.twinx()
             ax2.plot(bndItrs, likelies, 'r-')
             ax2.set_ylabel('Likelihood', color='r')
             
             fig.show()
         
             # Plot the topic covariance
             self._plotCov(model)
             
             # Plot the vocab
             plt.imshow(model.vocab, interpolation="none", cmap = cm.Greys_r)
             plt.show()
             
             # Calculating the training set likelihood
             trainLikely.append(ctm.log_likelihood(W_train, model, queryState))
             trainWordCount.append(W_train.data.sum())
             
             # Now query the model.
             plan       = ctm.newTrainPlan(iterations=10, fastButInaccurate=useDiagonalPriorCov)
             queryState = ctm.newQueryState(W_query, model)
             model, queryState = ctm.query(W_query, None, model, queryState, plan)
             
             queryLikely.append(ctm.log_likelihood(W_query, model, queryState))
             queryWordCount.append(W_query.data.sum())
          
         # Print out the likelihood and perplexity for each fold.   
         print ("\n\n\nWith " + ("diagonal" if useDiagonalPriorCov else "full") + " covariances")
         for fold in range(folds):
             trainPerp = np.exp(-trainLikely[fold]/trainWordCount[fold])
             queryPerp = np.exp(-queryLikely[fold]/queryWordCount[fold])
             
             print("Fold %3d: Train-set Likelihood: %12f \t Query-set Likelihood: %12f" % (fold, trainLikely[fold], queryLikely[fold]))
             print("                    Perplexity: %12.2f \t           Perplexity: %12.2f" % (trainPerp, queryPerp))
     
             self.assertTrue(queryPerp < 60.0) # Maximum perplexity.
             self.assertTrue(trainPerp < 60.0)
         print ("\n\n")
         
     print("End of Test")
예제 #3
0
def cross_val_and_eval_perplexity(data, mdl, sample_model, train_plan, query_plan, num_folds, fold_run_count=-1, model_dir= None):
    '''
    Uses cross-validation go get the average perplexity. If folds == 1 a special path is
    triggered where perplexity is evaluated on the training data, and the results are
    not saved to disk, even if model_dir is not none

    :param data: the DataSet object with the data
    :param mdl:  the module with the train etc. functin
    :param sample_model: a preconfigured model which is cloned at the start of each
            cross-validation run
    :param train_plan:  the training plan (number of iterations etc.)
    :param query_plan:  the query play (number of iterations etc.)
    :param num_folds:  the number of folds to cross validation
    :param fold_run_count: for debugging stop early after processing the number
    of the folds
    :param model_dir: if not none, the models are stored in this directory.
    :return: the list of model files stored
    '''
    model_files = []
    if fold_run_count < 1:
        fold_run_count = num_folds

    if num_folds == 1:
        model = mdl.newModelFromExisting(sample_model)
        query = mdl.newQueryState(data, model)

        model, train_tops, (train_itrs, train_vbs, train_likes) = mdl.train(data, model, query, train_plan)
        likely = mdl.log_likelihood(data, model, train_tops)
        perp   = perplexity_from_like(likely, data.word_count)

        print("Train-set Likelihood: %12f" % (likely))
        print("Train-set Perplexity: %12f" % (perp))

        model_files = save_if_necessary(model_files, model_dir, model, data, 0, train_itrs, train_vbs, train_likes, train_tops, train_tops, mdl)
        return model_files

    query_like_sum    = 0 # to calculate the overall likelihood and
    query_wcount_sum  = 0 # perplexity for the whole dataset
    train_like_sum    = 0
    train_wcount_sum  = 0
    folds_finished    = 0 # count of folds that finished successfully

    fold = 0
    while fold < num_folds and folds_finished < fold_run_count:
        try:
            train_data, query_data = data.cross_valid_split(fold, num_folds)

            # Train the model
            print ("Duplicating model template... ", end="")
            model      = mdl.newModelFromExisting(sample_model)
            print ("Done.\nCreating query state...")
            train_tops = mdl.newQueryState(train_data, model)

            print ("Starting training")

            model, train_tops, (train_itrs, train_vbs, train_likes) \
                = mdl.train(train_data, model, train_tops, train_plan)

            train_like       = mdl.log_likelihood (train_data, model, train_tops)
            train_word_count = train_data.word_count
            train_perp       = perplexity_from_like(train_like, train_word_count)

            print ("DEBUG Train perplexity is " + str(train_perp))

            # Query the model - if there are no features we need to split the text
            print ("Starting query.")
            query_estim, query_eval = query_data.doc_completion_split()
            query_tops              = mdl.newQueryState(query_estim, model)
            model, query_tops = mdl.query(query_estim, model, query_tops, query_plan)

            query_like       = mdl.log_likelihood(query_eval, model, query_tops)
            query_word_count = query_eval.word_count
            query_perp       = perplexity_from_like(query_like, query_word_count)

            # Keep a record of the cumulative likelihood and query-set word-count
            train_like_sum += train_like
            train_wcount_sum  += train_word_count
            query_like_sum += query_like
            query_wcount_sum  += query_word_count
            folds_finished  += 1

            # Write out the output
            print("Fold %d: Train-set Perplexity: %12.3f \t Query-set Perplexity: %12.3f" % (fold, train_perp, query_perp))
            print("")

            # Save the model
            model_files = save_if_necessary(model_files, model_dir, model, data, fold, train_itrs, train_vbs, train_likes, train_tops, query_tops, mdl)
        # except Exception as e:
        #     traceback.print_exc()
        #     print("Abandoning fold %d due to the error : %s" % (fold, str(e)))
        finally:
            fold += 1

    print ("Total (%d): Train-set Likelihood: %12.3f \t Train-set Perplexity: %12.3f" % (folds_finished, train_like_sum, perplexity_from_like(train_like_sum, train_wcount_sum)))
    print ("Total (%d): Query-set Likelihood: %12.3f \t Query-set Perplexity: %12.3f" % (folds_finished, query_like_sum, perplexity_from_like(query_like_sum, query_wcount_sum)))

    return model_files