import numpy as np

from liblda.low2corpus import Low2Corpus

DATASET_NAME = "ArXiv16k"

print "LOADING DATA for: " + DATASET_NAME

DATA_PARENT_DIR = "/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"
VOCAB_FILE = DATA_PARENT_DIR + "vocab.txt"
DOCS_FILE = DATA_PARENT_DIR + "arXiv_train_docs.txt"
IDS_FILE = DATA_PARENT_DIR + "arXiv_train_ids.txt"

######################################################################

# loaders....

# vocab, model and doc2id
train_corpus = Low2Corpus(DOCS_FILE)
train_corpus.setVocabFromList(
    [w.strip() for w in open(VOCAB_FILE, 'r').readlines()])
train_corpus.doCounts()
id_list = [w.strip() for w in open(IDS_FILE, 'r').readlines()]
doc2id = dict(enumerate(id_list))

phiT60_1 = np.load("../runs/repeatedT60-1/phi.npy")
thetaT60_1 = np.load("../runs/repeatedT60-1/theta.npy")
zT60_1 = np.load("../runs/repeatedT60-1/z.npy")

phiT60_2 = np.load("../runs/repeatedT60-2/phi.npy")
thetaT60_2 = np.load("../runs/repeatedT60-2/theta.npy")
zT60_2 = np.load("../runs/repeatedT60-2/z.npy")

phiT60_3 = np.load("../runs/repeatedT60-3/phi.npy")
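
# The repeated T=60 runs above are presumably loaded to check topic stability
# across random restarts.  The sketch below is not part of the original: it
# greedily matches each topic of run 1 to its most similar topic in run 2 by
# cosine similarity of the corresponding phi rows.
norms_1 = np.sqrt((phiT60_1 ** 2).sum(axis=1))
norms_2 = np.sqrt((phiT60_2 ** 2).sum(axis=1))
cosine = np.dot(phiT60_1, phiT60_2.T) / np.outer(norms_1, norms_2)
best_match = cosine.argmax(axis=1)   # run-2 topic closest to each run-1 topic
best_score = cosine.max(axis=1)
for t in range(len(best_match)):
    print "topic %2d  <-->  topic %2d   (cos=%.3f)" % (t, best_match[t], best_score[t])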
######################################################################
# Example #2
######################################################################
import os
import sys

# original gensim
sys.path.insert(1, '/Projects/LatentDirichletAllocation/gensim/trunk/src')
from gensim import corpora, models, similarities

# ldalib
sys.path.insert(1, '/Projects/LatentDirichletAllocation/')
import liblda

# settings file with RUNDIRS path, topicmodel location and PROJECT_HOME
# from liblda import settings ?
from liblda.local_settings import *

# to see logging...
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

DATA_PATH = os.path.join(PROJECT_PATH, "data/semrelwords/")
INFILE = "ukwac-uniqmultiwordterms.SAMPLE.txt"
VOCABFILE = "ukwac-vocabulary.SAMPLE.txt"

logging.info("Creating corpus")
infilename = os.path.join(DATA_PATH, INFILE)
vfilename = os.path.join(DATA_PATH, VOCABFILE)
from liblda.low2corpus import Low2Corpus
c = Low2Corpus(infilename)
c.buildVocabs(vfilename)

logging.info("Importing NewmanLdaModel for you")
from liblda.newmanLDAmodel import NewmanLdaModel
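
# The example stops at the import.  A hypothetical continuation is sketched
# below, assuming NewmanLdaModel takes the same constructor arguments and
# exposes the same train() method as the liblda LdaModel used in Example #5
# (numT / alpha / beta / corpus keywords, train(iter=..., seed=...)); the
# parameter values here are illustrative only.
model = NewmanLdaModel(numT=30, alpha=0.1, beta=0.01, corpus=c)
model.train(iter=100, seed=777)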
######################################################################
# Example #3
######################################################################
import numpy as np

from liblda.low2corpus import Low2Corpus

DATA_PARENT_DIR = "/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"
VOCAB_FILE = DATA_PARENT_DIR + "vocab.txt"

DOCS_FILE = DATA_PARENT_DIR + "arXiv_train_docs.txt"
IDS_FILE = DATA_PARENT_DIR + "arXiv_train_ids.txt"

TEST_DOCS_FILE = DATA_PARENT_DIR + "arXiv_test_docs.txt"
TEST_IDS_FILE = DATA_PARENT_DIR + "arXiv_test_ids.txt"

######################################################################

# loaders....

# vocab, model and doc2id
arXiv_corpus = Low2Corpus(DOCS_FILE)
arXiv_corpus.setVocabFromList(
    [w.strip() for w in open(VOCAB_FILE, 'r').readlines()])
arXiv_corpus.doCounts()
id_list = [w.strip() for w in open(IDS_FILE, 'r').readlines()]
doc2id = dict(enumerate(id_list))

# vocab, model and doc2id
arXiv_test_corpus = Low2Corpus(TEST_DOCS_FILE)
arXiv_test_corpus.setVocabFromList(
    [w.strip() for w in open(VOCAB_FILE, 'r').readlines()])
arXiv_test_corpus.doCounts()
test_id_list = [w.strip() for w in open(TEST_IDS_FILE, 'r').readlines()]
test_doc2id = dict(enumerate(test_id_list))
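
# doc2id and test_doc2id map a document's position in the corpus to its arXiv
# identifier.  A small illustration, not part of the original: look up one id
# and build the inverse mapping from arXiv id back to document index.
print "training document 0 is", doc2id[0]
id2doc = dict((arxiv_id, idx) for idx, arxiv_id in doc2id.items())
test_id2doc = dict((arxiv_id, idx) for idx, arxiv_id in test_doc2id.items())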

# the original to compare with
DATASET_NAME = "ArXiv16k"

print " LOADING DATA  for: " + DATASET_NAME

DATA_PARENT_DIR = "/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"
VOCAB_FILE = DATA_PARENT_DIR + "vocab.txt"
DOCS_FILE = DATA_PARENT_DIR + "arXiv_train_docs.txt"
IDS_FILE = DATA_PARENT_DIR + "arXiv_train_ids.txt"

######################################################################

# loaders....

# vocab, model and doc2id
tcorpus3 = Low2Corpus(DOCS_FILE)
tcorpus3.setVocabFromList(
    [w.strip() for w in open(VOCAB_FILE, 'r').readlines()])
tcorpus3.doCounts()
id_list = [w.strip() for w in open(IDS_FILE, 'r').readlines()]
doc2id = dict(enumerate(id_list))

# data
phi = np.load("../runs/subtopicsT40/phi.npy")
#seeded_phi = np.load("../runs/subtopicsT200seeded/phi.npy")
unseeded_phi = np.load("../runs/subtopicsT200unseeded/phi.npy")

theta = np.load("../runs/subtopicsT40/theta.npy")
#seeded_theta = np.load("../runs/subtopicsT200seeded/theta.npy")
unseeded_theta = np.load("../runs/subtopicsT200unseeded/theta.npy")
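
# A quick sanity check on the loaded runs (not part of the original): print
# the ten most probable words of each T=40 topic.  This assumes tcorpus3
# exposes the id2word mapping that the run() script in Example #5 below
# accesses as lda.corpus.id2word.
for t in range(phi.shape[0]):
    top_ids = np.argsort(phi[t])[::-1][:10]
    print "topic %2d: %s" % (t, ", ".join(tcorpus3.id2word[i] for i in top_ids))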
######################################################################
# Example #5
######################################################################
def run(args):
    """
    The command-line run script for LDA experiments.


    """

    # scientific
    import numpy as np
    import scipy as sp

    # display what run got in args
    for tup in args.__dict__.iteritems():
        print tup

    # LOAD VOCAB
    wlist = smart_list_reader(args.vocab_file)
    if not wlist:
        print "Vocab format not recognized"
        sys.exit(-1)
    # convert from list [term1, term2, ...] to dicts
    # [term1:0, term2:1, ... ] and the inverse mapping
    id2word = dict(enumerate(wlist))
    word2id = dict([(word, id) for id, word in id2word.items()])
    vocab = word2id

    # SETUP CORPUS (LAZY)
    # doCounts -- not so lazy...
    if args.docs_file[-3:] == ".mm":
        from liblda.newmmcorpus import NewMmCorpus
        corpus = NewMmCorpus(args.docs_file)
        corpus.setVocabFromDict(vocab)
        corpus.doCounts()
    elif args.docs_file[-4:] == ".txt":
        from liblda.low2corpus import Low2Corpus
        corpus = Low2Corpus(args.docs_file)
        corpus.setVocabFromDict(vocab)
        corpus.doCounts()
    else:
        print "Corpus format not recognized"
        sys.exit(-1)

    # Create rundir
    from socket import gethostname
    from liblda.util import rungen

    full_hostname = gethostname()
    # note: rstrip() strips a character set, not a suffix, so remove the domain explicitly
    host_id = full_hostname[:-len(".cs.mcgill.ca")] if full_hostname.endswith(".cs.mcgill.ca") else full_hostname

    if not args.rundirs_root:
        rundirs_root = RUNDIRS_ROOT
    else:
        rundirs_root = args.rundirs_root
    if not os.path.exists(rundirs_root):
        print "Error, rundirs_root %s doesn't exist" % rundirs_root
        sys.exit(-1)

    # create the host-specific rundir if necessary
    host_rundirs_root = os.path.join(rundirs_root, host_id)
    if not os.path.exists(host_rundirs_root):
        os.mkdir(host_rundirs_root)

    # create a new (sequential) rundir for this host
    rundir = rungen.mk_next_rundir(host_rundirs_root)
    logger.info("rundir: " + rundir)

    # prepare a dict which will become input.json
    input = {}
    input["rundir"] = rundir
    input["numT"] = args.numT
    input["iter"] = args.iter
    input["corpus"] = args.docs_file
    input["vocab"] = args.vocab_file
    input["alpha"] = args.alpha
    input["beta"] = args.beta
    input["seed"] = args.seed
    input["host_id"] = host_id
    # and write it to disk
    f = open(os.path.join(rundir, "input.json"), "w")
    simplejson.dump(input, f, indent=0)
    f.close()

    start_time = datetime.datetime.now()

    # setup the lda model
    lda = LdaModel(numT=args.numT,
                   alpha=args.alpha,
                   beta=args.beta,
                   corpus=corpus,
                   vocab=vocab)

    # if not in seeded mode run as usual
    if not args.seed_z_from:
        if not args.save_perplexity_every:
            lda.train(iter=args.iter, seed=args.seed)
        else:
            lda.allocate_arrays()
            lda.read_dw_alphabetical()
            lda.random_initialize()
            cum = 0
            perp_hist = []
            while cum < args.iter:
                lda.gibbs_sample(iter=args.save_perplexity_every,
                                 seed=args.seed + cum)
                lda.wpdt_to_probs()
                # perplexity = np.exp( -1 * loglike() / totalNwords )
                perp_hist.append(lda.perplexity())
                cum += args.save_perplexity_every

    # NEW: seeded z training mode
    else:
        logger.info("Using seeded z training ... ")

        # training params
        if not args.iter:
            lda.iter = 50
        else:
            lda.iter = args.iter

        if not args.seed:
            seed = 777
            lda.seed = 2 * seed + 1
        else:
            lda.seed = 2 * args.seed + 1

        # load the seed_z_from file into a seed_z numpy array
        seed_z = np.load(args.seed_z_from)
        if args.expand_factors:
            expand_factors_str = smart_list_reader(args.expand_factors)
            expand_factors = np.array([int(i) for i in expand_factors_str])
        else:
            expand_factors = None  # let lda.seeded_initialize() handle it

        # custom train sequence
        lda.allocate_arrays()
        lda.read_dw_alphabetical()
        #lda.random_initialize()   # NO -- we want a seeded initialization!
        lda.seeded_initialize(seed_z, expand_factors)
        lda.gibbs_sample(iter=lda.iter, seed=lda.seed)
        lda.wpdt_to_probs()
        #lda.deallocate_arrays()

    # record how long it took
    end_time = datetime.datetime.now()
    duration = (end_time - start_time).seconds

    # save word counts and topic assignment counts (these are sparse)
    if args.save_counts:  # TRUE by default
        state = ["dp", "wp", "alpha", "beta"]
        for var_name in state:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
            np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out Nwt+beta, Ndt+alpha")

    # Gibbs sampler state, which consists of
    # the full  topic assignments "z.npy"
    if args.save_z:
        var_name = "z"
        f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
        np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out z.npy")

    # save probs
    if args.save_probs:
        probs = ["phi", "theta"]
        for var_name in probs:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
            np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out probabilities phi.npy and theta.npy")

    # prepare a dict which will become output.json
    output = {}
    # run details
    output["rundir"] = rundir
    output["host_id"] = host_id
    output["iter"] = args.iter
    output["seed"] = args.seed
    output["start_time"] = start_time.isoformat()  # ISO format string
    # to read ISO time stamps use dateutil
    #>>> from dateutil import parser
    #>>> parser.parse("2011-01-25T23:36:43.373248")
    # datetime.datetime(2011, 1, 25, 23, 36, 43, 373247)
    output["duration"] = int(duration)
    # corpus info
    output["corpus"] = args.docs_file
    output["vocab"] = args.vocab_file
    output["numDocs"] = lda.numDocs
    output["numTerms"] = lda.numTerms
    output["totalNterms"] = lda.corpus.totalNwords
    # model parameters
    output["numT"] = lda.numT
    # the hyperparameters are too long to store in full here,
    # use separate .npy files if alpha/beta non uniform
    output["alpha"] = lda.alpha[
        0]  #[np.average(lda.alpha), float(np.cov(lda.alpha)) ]  # [avg, var]
    output["beta"] = lda.beta[
        0]  #[np.average(lda.beta), float(np.cov(lda.beta)) ]  # [avg, var]
    #
    # calculate likelihood
    output["loglike"] = lda.loglike()
    # perplexity = np.exp( -1 * loglike() / totalNwords )
    output["perplexity"] = lda.perplexity()
    if args.save_perplexity_every:
        output["perplexity_history"] = perp_hist
    logger.info("Log likelihood: %f" % output["loglike"])
    logger.info("Perplexity: %f" % output["perplexity"])
    #
    # special seeding info
    if args.seed_z_from:
        output["seed_z_from"] = args.seed_z_from
    if args.expand_factors:
        output["expand_factors"] = args.expand_factors

    # compute sparseness of phi and write it out
    sp = get_sparse_stats(lda.phi)  # note: reuses the name of the scipy alias imported above
    np.save(os.path.join(rundir, "phi_sparseness.npy"), sp)
    # report on sparseness statistics (assume single mode)
    nz = sp.nonzero()[0]  # indices of the nonzero histogram entries
    sp_avg = sum([sp[i] * i for i in nz])  # where the nonzero mass is concentrated
    sp_var = sum([sp[i] * np.abs(i - sp_avg)**2 for i in nz])
    sp_stdev = np.sqrt(sp_var)  # how concentrated it is around sp_avg
    output["phi_sparseness_avg"] = sp_avg
    output["phi_sparseness_stdev"] = sp_stdev
    logger.info("Phi sparseness. center=%d, width=%d" %
                (int(sp_avg), int(sp_stdev)))

    # same for theta
    sp = get_sparse_stats(lda.theta)
    np.save(os.path.join(rundir, "theta_sparseness.npy"), sp)
    # report on sparseness statistics (assume single mode)
    nz = sp.nonzero()[0]  # indices of the nonzero histogram entries
    sp_avg = sum([sp[i] * i for i in nz])  # where the nonzero mass is concentrated
    sp_var = sum([sp[i] * np.abs(i - sp_avg)**2 for i in nz])
    sp_stdev = np.sqrt(sp_var)  # how concentrated it is around sp_avg
    output["theta_sparseness_avg"] = sp_avg
    output["theta_sparseness_stdev"] = sp_stdev
    logger.info("Theta sparseness. center=%d, width=%d" %
                (int(sp_avg), int(sp_stdev)))

    # write all output data to disk
    f = open(os.path.join(rundir, "output.json"), "w")
    simplejson.dump(output, f, indent=0)
    f.close()
    logger.info("Done saving output.json")

    if args.print_topics:
        from liblda.topicviz.show_top import show_top
        top_words_in_topics = show_top(lda.phi,
                                       num=args.print_topics,
                                       id2word=lda.corpus.id2word)

        for topic in top_words_in_topics:
            words = ", ".join(topic)
            print words

    logger.info("Done! --> thank you come again")
######################################################################

# ok now we are in the LatentDirichletAllocation root /
import os

LDAdir = os.getcwd()


from liblda.low2corpus import Low2Corpus

execfile('mycmds.py')

testdir = os.path.realpath(os.path.join(LDAdir, "liblda/test/"))
# 1/10th of the quant-ph arXiv papers: 2016 docs, vocab size of 10000
INFILE = "arXiv_docs.txt"      # 2016 docs
VOCABFILE = "arXiv_vocab.txt"  # ~ 10 000 terms
# arXiv_ids.txt
infilename = os.path.join(testdir, INFILE)
vfilename = os.path.join(testdir, VOCABFILE)
tcorpus3 = Low2Corpus(infilename)
tcorpus3.setVocabFromList([w.strip() for w in open(vfilename, 'r').readlines()])
tcorpus3.doCounts()


execfile('liblda/math/dirichlet_sparse_stats.py')
from liblda.LDAmodel import LdaModel


os.chdir(posterior_dir)

# T = 10
lda = LdaModel(numT=10, alpha=0.1, beta=0.01, corpus=tcorpus3)
watch_sparseness(lda, steps=[0, 1, 1, 8, 10, 50, 130], seed=7,
                 filename='theta_and_phi_sparseness_for_alpha0.1beta0.01T10.png',
                 initialize=True, pause=False)

lda = LdaModel(numT=10, alpha=0.01, beta=0.01, corpus=tcorpus3)