DATASET_NAME = "ArXiv16k"
print " LOADING DATA for: " + DATASET_NAME

import numpy as np
from liblda.low2corpus import Low2Corpus

DATA_PARENT_DIR = "/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"
VOCAB_FILE = DATA_PARENT_DIR + "vocab.txt"
DOCS_FILE = DATA_PARENT_DIR + "arXiv_train_docs.txt"
IDS_FILE = DATA_PARENT_DIR + "arXiv_train_ids.txt"

######################################################################
# loaders....

# vocab, model and doc2id
train_corpus = Low2Corpus(DOCS_FILE)
train_corpus.setVocabFromList([w.strip() for w in open(VOCAB_FILE, 'r').readlines()])
train_corpus.doCounts()
id_list = [w.strip() for w in open(IDS_FILE, 'r').readlines()]
doc2id = dict(enumerate(id_list))

# phi, theta and z arrays from three repeated T=60 runs
phiT60_1 = np.load("../runs/repeatedT60-1/phi.npy")
thetaT60_1 = np.load("../runs/repeatedT60-1/theta.npy")
zT60_1 = np.load("../runs/repeatedT60-1/z.npy")

phiT60_2 = np.load("../runs/repeatedT60-2/phi.npy")
thetaT60_2 = np.load("../runs/repeatedT60-2/theta.npy")
zT60_2 = np.load("../runs/repeatedT60-2/z.npy")

phiT60_3 = np.load("../runs/repeatedT60-3/phi.npy")
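
# A rough sketch (not from the original script) of one way these repeated
# T=60 runs could be compared: greedily match each topic of one run to its
# most similar topic in another run by cosine similarity between rows of phi.
# Assumes phi is stored as a (numT x numTerms) array, which these scripts
# do not state explicitly.
def best_matches(phi_a, phi_b):
    """For each topic (row) of phi_a, return the index of the most similar
    topic of phi_b and the corresponding cosine similarity."""
    a = phi_a / np.sqrt((phi_a ** 2).sum(axis=1))[:, np.newaxis]
    b = phi_b / np.sqrt((phi_b ** 2).sum(axis=1))[:, np.newaxis]
    sims = np.dot(a, b.T)    # (numT x numT) cosine similarity matrix
    return sims.argmax(axis=1), sims.max(axis=1)

# e.g.  match_1to2, scores_1to2 = best_matches(phiT60_1, phiT60_2)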
import os
import sys

# original gensim
sys.path.insert(1, '/Projects/LatentDirichletAllocation/gensim/trunk/src')
from gensim import corpora, models, similarities

# ldalib
sys.path.insert(1, '/Projects/LatentDirichletAllocation/')
import liblda

# settings file with RUNDIRS path, topicmodel location and PROJECT_HOME
# from liblda import settings ?
from liblda.local_settings import *

# to see logging...
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

DATA_PATH = os.path.join(PROJECT_PATH, "data/semrelwords/")
INFILE = "ukwac-uniqmultiwordterms.SAMPLE.txt"
VOCABFILE = "ukwac-vocabulary.SAMPLE.txt"

logging.info("Creating corpus")
infilename = os.path.join(DATA_PATH, INFILE)
vfilename = os.path.join(DATA_PATH, VOCABFILE)

from liblda.low2corpus import Low2Corpus
c = Low2Corpus(infilename)
c.buildVocabs(vfilename)

logging.info("Importing NewmanLdaModel for you")
from liblda.newmanLDAmodel import NewmanLdaModel
from liblda.low2corpus import Low2Corpus

DATA_PARENT_DIR = "/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"
VOCAB_FILE = DATA_PARENT_DIR + "vocab.txt"
DOCS_FILE = DATA_PARENT_DIR + "arXiv_train_docs.txt"
IDS_FILE = DATA_PARENT_DIR + "arXiv_train_ids.txt"
TEST_DOCS_FILE = DATA_PARENT_DIR + "arXiv_test_docs.txt"
TEST_IDS_FILE = DATA_PARENT_DIR + "arXiv_test_ids.txt"

######################################################################
# loaders....

# vocab, corpus and doc2id for the training set
arXiv_corpus = Low2Corpus(DOCS_FILE)
arXiv_corpus.setVocabFromList([w.strip() for w in open(VOCAB_FILE, 'r').readlines()])
arXiv_corpus.doCounts()
id_list = [w.strip() for w in open(IDS_FILE, 'r').readlines()]
doc2id = dict(enumerate(id_list))

# vocab, corpus and doc2id for the test set
arXiv_test_corpus = Low2Corpus(TEST_DOCS_FILE)
arXiv_test_corpus.setVocabFromList([w.strip() for w in open(VOCAB_FILE, 'r').readlines()])
arXiv_test_corpus.doCounts()
test_id_list = [w.strip() for w in open(TEST_IDS_FILE, 'r').readlines()]
test_doc2id = dict(enumerate(test_id_list))

# the original to compare with
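
# Small helper (assumed, not part of the original loader): invert the two
# doc2id mappings so that an arXiv paper id can be looked up back to its
# document index.
id2doc = dict((paper_id, idx) for idx, paper_id in doc2id.items())
test_id2doc = dict((paper_id, idx) for idx, paper_id in test_doc2id.items())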
DATASET_NAME = "ArXiv16k"
print " LOADING DATA for: " + DATASET_NAME

import numpy as np
from liblda.low2corpus import Low2Corpus

DATA_PARENT_DIR = "/CurrentPorjects/LatentDirichletAllocation/data/arXiv_as_LOW2/"
VOCAB_FILE = DATA_PARENT_DIR + "vocab.txt"
DOCS_FILE = DATA_PARENT_DIR + "arXiv_train_docs.txt"
IDS_FILE = DATA_PARENT_DIR + "arXiv_train_ids.txt"

######################################################################
# loaders....

# vocab, model and doc2id
tcorpus3 = Low2Corpus(DOCS_FILE)
tcorpus3.setVocabFromList([w.strip() for w in open(VOCAB_FILE, 'r').readlines()])
tcorpus3.doCounts()
id_list = [w.strip() for w in open(IDS_FILE, 'r').readlines()]
doc2id = dict(enumerate(id_list))

# data: phi and theta from the T=40 run and the unseeded T=200 run
phi = np.load("../runs/subtopicsT40/phi.npy")
#seeded_phi = np.load("../runs/subtopicsT200seeded/phi.npy")
unseeded_phi = np.load("../runs/subtopicsT200unseeded/phi.npy")

theta = np.load("../runs/subtopicsT40/theta.npy")
#seeded_theta = np.load("../runs/subtopicsT200seeded/theta.npy")
unseeded_theta = np.load("../runs/subtopicsT200unseeded/theta.npy")
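
# Hedged helper (not in the original script): print the top-num words of each
# topic directly from a phi matrix.  Assumes phi rows are topics and columns
# are term ids, and that Low2Corpus exposes the same id2word mapping that
# show_top() relies on elsewhere in this codebase.
def print_top_words(phi_matrix, id2word, num=10):
    for t, row in enumerate(phi_matrix):
        top_ids = np.argsort(row)[::-1][:num]
        print "topic %3d: %s" % (t, ", ".join(id2word[i] for i in top_ids))

# e.g.  print_top_words(phi, tcorpus3.id2word)
#       print_top_words(unseeded_phi, tcorpus3.id2word)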
def run(args):
    """
    The command-line run script for LDA experiments.
    """
    # NOTE: relies on module-level helpers not shown in this excerpt
    # (logger, smart_list_reader, RUNDIRS_ROOT, RUN_FILENAMESS, LdaModel,
    #  get_sparse_stats, simplejson, datetime, os, sys).

    # scientific
    import numpy as np
    import scipy as sp

    # display what run got in args
    for tup in args.__dict__.iteritems():
        print tup

    # LOAD VOCAB
    wlist = smart_list_reader(args.vocab_file)
    if not wlist:
        print "Vocab format not recognized"
        sys.exit(-1)
    # convert from list [term1, term2, ...] to dicts
    # {term1: 0, term2: 1, ...} and the inverse mapping
    id2word = dict(enumerate(wlist))
    word2id = dict([(word, id) for id, word in id2word.items()])
    vocab = word2id

    # SETUP CORPUS (LAZY)
    # doCounts -- not so lazy...
    if args.docs_file[-3:] == ".mm":
        from liblda.newmmcorpus import NewMmCorpus
        corpus = NewMmCorpus(args.docs_file)
        corpus.setVocabFromDict(vocab)
        corpus.doCounts()
    elif args.docs_file[-4:] == ".txt":
        from liblda.low2corpus import Low2Corpus
        corpus = Low2Corpus(args.docs_file)
        corpus.setVocabFromDict(vocab)
        corpus.doCounts()
    else:
        print "Corpus format not recognized"
        sys.exit(-1)

    # Create rundir
    from socket import gethostname
    from liblda.util import rungen
    full_hostname = gethostname()
    # strip the domain suffix to get a short host id
    host_id = full_hostname.replace(".cs.mcgill.ca", "")
    if not args.rundirs_root:
        rundirs_root = RUNDIRS_ROOT
    else:
        rundirs_root = args.rundirs_root
    if not os.path.exists(rundirs_root):
        print "Error, rundirs_root %s doesn't exist" % rundirs_root
        sys.exit(-1)
    # create the host-specific rundir if necessary
    host_rundirs_root = os.path.join(rundirs_root, host_id)
    if not os.path.exists(host_rundirs_root):
        os.mkdir(host_rundirs_root)
    # create a new (sequential) rundir for this host
    rundir = rungen.mk_next_rundir(host_rundirs_root)
    logger.info("rundir: " + rundir)

    # prepare a dict which will become input.json
    input = {}
    input["rundir"] = rundir
    input["numT"] = args.numT
    input["iter"] = args.iter
    input["corpus"] = args.docs_file
    input["vocab"] = args.vocab_file
    input["alpha"] = args.alpha
    input["beta"] = args.beta
    input["seed"] = args.seed
    input["host_id"] = host_id
    # and write it to disk
    f = open(os.path.join(rundir, "input.json"), "w")
    simplejson.dump(input, f, indent=0)
    f.close()

    start_time = datetime.datetime.now()

    # setup the lda model
    lda = LdaModel(numT=args.numT, alpha=args.alpha, beta=args.beta,
                   corpus=corpus, vocab=vocab)

    # if not in seeded mode run as usual
    if not args.seed_z_from:
        if not args.save_perplexity_every:
            lda.train(iter=args.iter, seed=args.seed)
        else:
            # train in chunks, recording perplexity after each one
            lda.allocate_arrays()
            lda.read_dw_alphabetical()
            lda.random_initialize()
            cum = 0
            perp_hist = []
            while cum < args.iter:
                lda.gibbs_sample(iter=args.save_perplexity_every,
                                 seed=args.seed + cum)
                lda.wpdt_to_probs()
                perp_hist.append(lda.perplexity())  # = np.exp( -1 * loglike() / totalNwords )
                cum += args.save_perplexity_every

    # NEW: seeded z training
    else:
        logger.info("Using seeded z training ... ")
        # training params
        if not args.iter:
            lda.iter = 50
        else:
            lda.iter = args.iter
        if not args.seed:
            seed = 777
            lda.seed = 2 * seed + 1
        else:
            lda.seed = 2 * args.seed + 1
        # load up the seed_z_from file into seed_z np array
        seed_z = np.load(args.seed_z_from)
        if args.expand_factors:
            expand_factors_str = smart_list_reader(args.expand_factors)
            expand_factors = np.array([int(i) for i in expand_factors_str])
        else:
            expand_factors = None   # let lda.seeded_initialize() handle it
        # custom train sequence
        lda.allocate_arrays()
        lda.read_dw_alphabetical()
        #self.random_initialize()   # NO -- we want a seeded initialization!
        lda.seeded_initialize(seed_z, expand_factors)
        lda.gibbs_sample(iter=lda.iter, seed=lda.seed)
        lda.wpdt_to_probs()
        #self.deallocate_arrays()

    # record how long it took
    end_time = datetime.datetime.now()
    duration = (end_time - start_time).seconds

    # save word counts and topic assignment counts (these are sparse)
    if args.save_counts:   # TRUE by default
        state = ["dp", "wp", "alpha", "beta"]
        for var_name in state:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
            np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out Nwt+beta, Ndt+alpha")

    # Gibbs sampler state, which consists of
    # the full topic assignments "z.npy"
    if args.save_z:
        var_name = "z"
        f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
        np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out z.npy")

    # save probs
    if args.save_probs:
        probs = ["phi", "theta"]
        for var_name in probs:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
            np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out probabilities phi.npy and theta.npy")

    # prepare a dict which will become output.json
    output = {}
    # run details
    output["rundir"] = rundir
    output["host_id"] = host_id
    output["iter"] = args.iter
    output["seed"] = args.seed
    output["start_time"] = start_time.isoformat()   # ISO format string
    # to read ISO time stamps use dateutil
    #   >>> from dateutil import parser
    #   >>> parser.parse("2011-01-25T23:36:43.373248")
    #   datetime.datetime(2011, 1, 25, 23, 36, 43, 373247)
    output["duration"] = int(duration)
    # corpus info
    output["corpus"] = args.docs_file
    output["vocab"] = args.vocab_file
    output["numDocs"] = lda.numDocs
    output["numTerms"] = lda.numTerms
    output["totalNterms"] = lda.corpus.totalNwords
    # model parameters
    output["numT"] = lda.numT
    # the hyperparameters are too long to store in full here,
    # use separate .npy files if alpha/beta non uniform
    output["alpha"] = lda.alpha[0]   # [np.average(lda.alpha), float(np.cov(lda.alpha))]  # [avg, var]
    output["beta"] = lda.beta[0]     # [np.average(lda.beta), float(np.cov(lda.beta))]    # [avg, var]

    # calculate likelihood
    output["loglike"] = lda.loglike()
    output["perplexity"] = lda.perplexity()   # = np.exp( -1 * loglike() / totalNwords )
    if args.save_perplexity_every:
        output["perplexity_history"] = perp_hist
    logger.info("Log likelihood: %f" % output["loglike"])
    logger.info("Perplexity: %f" % output["perplexity"])

    # special seeding info
    if args.seed_z_from:
        output["seed_z_from"] = args.seed_z_from
    if args.expand_factors:
        output["expand_factors"] = args.expand_factors

    # compute sparseness and write it out
    sp = get_sparse_stats(lda.phi)
    np.save(os.path.join(rundir, "phi_sparseness.npy"), sp)
    # report on sparseness statistics (assume single mode)
    nz = sp.nonzero()[0]                    # get the nonzero entries
    sp_avg = sum([sp[i] * i for i in nz])   # where the non-zero entries are concentrated
    sp_var = sum([sp[i] * np.abs(i - sp_avg) ** 2 for i in nz])
    sp_stdev = np.sqrt(sp_var)              # how concentrated they are around sp_avg
    output["phi_sparseness_avg"] = sp_avg
    output["phi_sparseness_stdev"] = sp_stdev
    logger.info("Phi sparseness. center=%d, width=%d" % (int(sp_avg), int(sp_stdev)))

    # same for theta
    sp = get_sparse_stats(lda.theta)
    np.save(os.path.join(rundir, "theta_sparseness.npy"), sp)
    # report on sparseness statistics (assume single mode)
    nz = sp.nonzero()[0]                    # get the nonzero entries
    sp_avg = sum([sp[i] * i for i in nz])   # where the non-zero entries are concentrated
    sp_var = sum([sp[i] * np.abs(i - sp_avg) ** 2 for i in nz])
    sp_stdev = np.sqrt(sp_var)              # how concentrated they are around sp_avg
    output["theta_sparseness_avg"] = sp_avg
    output["theta_sparseness_stdev"] = sp_stdev
    logger.info("Theta sparseness. center=%d, width=%d" % (int(sp_avg), int(sp_stdev)))

    # write all output data to disk
    f = open(os.path.join(rundir, "output.json"), "w")
    simplejson.dump(output, f, indent=0)
    f.close()
    logger.info("Done saving output.json")

    if args.print_topics:
        from liblda.topicviz.show_top import show_top
        top_words_in_topics = show_top(lda.phi,
                                       num=args.print_topics,
                                       id2word=lda.corpus.id2word)
        for topic in top_words_in_topics:
            words = ", ".join(topic)
            print words

    logger.info("Done! --> thank you come again")
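
# A possible command-line wrapper for run() (assumed -- the real driver script
# is not shown here).  The option names simply mirror the attributes that
# run() reads off `args`; the defaults are illustrative only.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Run an LDA experiment")
    parser.add_argument("--docs_file", required=True)
    parser.add_argument("--vocab_file", required=True)
    parser.add_argument("--numT", type=int, default=60)
    parser.add_argument("--iter", type=int, default=200)
    parser.add_argument("--alpha", type=float, default=0.1)
    parser.add_argument("--beta", type=float, default=0.01)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--rundirs_root", default=None)
    parser.add_argument("--seed_z_from", default=None)
    parser.add_argument("--expand_factors", default=None)
    parser.add_argument("--save_perplexity_every", type=int, default=None)
    parser.add_argument("--save_counts", action="store_true", default=True)   # TRUE by default, as run() expects
    parser.add_argument("--save_z", action="store_true")
    parser.add_argument("--save_probs", action="store_true")
    parser.add_argument("--print_topics", type=int, default=None)
    run(parser.parse_args())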
import os

# ok now we are in the LatentDirichletAllocation root /
LDAdir = os.getcwd()
from liblda.low2corpus import Low2Corpus
execfile('mycmds.py')

testdir = os.path.realpath(os.path.join(LDAdir, "liblda/test/"))

# 1/10th of the quant-ph arXiv papers: 2016 docs, vocab size of ~10 000
INFILE = "arXiv_docs.txt"       # 2016 docs
VOCABFILE = "arXiv_vocab.txt"   # ~ 10 000 terms
#arXiv_ids.txt
infilename = os.path.join(testdir, INFILE)
vfilename = os.path.join(testdir, VOCABFILE)

tcorpus3 = Low2Corpus(infilename)
tcorpus3.setVocabFromList([w.strip() for w in open(vfilename, 'r').readlines()])
tcorpus3.doCounts()

execfile('liblda/math/dirichlet_sparse_stats.py')   # presumably defines watch_sparseness()
from liblda.LDAmodel import LdaModel
os.chdir(posterior_dir)   # posterior_dir is presumably set up in mycmds.py

# T = 10
lda = LdaModel(numT=10, alpha=0.1, beta=0.01, corpus=tcorpus3)
watch_sparseness(lda,
                 steps=[0, 1, 1, 8, 10, 50, 130],
                 seed=7,
                 filename='theta_and_phi_sparseness_for_alpha0.1beta0.01T10.png',
                 initialize=True,
                 pause=False)

lda = LdaModel(numT=10, alpha=0.01, beta=0.01, corpus=tcorpus3)
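
# Presumably followed by a matching sparseness run for this lower-alpha model.
# A sketch of that call (assumed -- it simply mirrors the call above, with the
# output filename updated for alpha=0.01):
watch_sparseness(lda,
                 steps=[0, 1, 1, 8, 10, 50, 130],
                 seed=7,
                 filename='theta_and_phi_sparseness_for_alpha0.01beta0.01T10.png',
                 initialize=True,
                 pause=False)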