Example #1
def run(args):
    """
    The command-line run script for LDA experiments.


    """

    # standard-library modules used below (presumably imported at module
    # level in the full script; repeated here so the snippet is closer to
    # self-contained)
    import os
    import sys
    import datetime
    import simplejson

    # scientific
    import numpy as np
    import scipy as sp


    # display what run got in args
    for tup in args.__dict__.iteritems():
        print tup


    # LOAD VOCAB
    wlist = smart_list_reader( args.vocab_file )
    if not wlist:
        print "Vocab format not recognized"
        sys.exit(-1)
    # convert the list [term1, term2, ...] into the dicts
    # {0: term1, 1: term2, ...} and {term1: 0, term2: 1, ...}
    id2word = dict( enumerate(wlist) )
    word2id = dict( [(word,id)  for id,word in id2word.items()] )
    vocab = word2id
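    # e.g. wlist = ["cat", "dog"]  -->  id2word = {0: "cat", 1: "dog"}
    #                                   word2id = {"cat": 0, "dog": 1}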


    # SETUP CORPUS (LAZY)
    # doCounts -- not so lazy...
    if args.docs_file[-3:]==".mm":
        from liblda.newmmcorpus import NewMmCorpus
        corpus = NewMmCorpus(args.docs_file)
        corpus.setVocabFromDict( vocab )
        corpus.doCounts()
    elif args.docs_file[-4:]==".txt":
        from liblda.low2corpus import Low2Corpus
        corpus = Low2Corpus(args.docs_file)
        corpus.setVocabFromDict( vocab )
        corpus.doCounts()
    else:
        print "Corpus format not recognized"
        sys.exit(-1)



    # Create rundir
    from socket import gethostname
    from liblda.util import rungen

    full_hostname = gethostname()
    # rstrip() strips a trailing *character set*, not a literal suffix,
    # so cut the ".cs.mcgill.ca" domain explicitly
    domain = ".cs.mcgill.ca"
    host_id = full_hostname[:-len(domain)] if full_hostname.endswith(domain) else full_hostname

    if not args.rundirs_root:
        rundirs_root = RUNDIRS_ROOT
    else:
        rundirs_root = args.rundirs_root
    if not os.path.exists(rundirs_root):
        print "Error, rundirs_root %s doesn't exist" % rundirs_root
        sys.exit(-1)

    # create the host-specific rundir if necessary
    host_rundirs_root = os.path.join(rundirs_root, host_id)
    if not os.path.exists(host_rundirs_root):
        os.mkdir( host_rundirs_root )

    # create a new (sequential) rundir for this host
    rundir = rungen.mk_next_rundir(host_rundirs_root)
    logger.info("rundir: " + rundir  )

    # prepare a dict which will become input.json
    input = {}
    input["rundir"]=rundir
    input["numT"]=args.numT
    input["iter"]=args.iter
    input["corpus"]=args.docs_file
    input["vocab"]=args.vocab_file
    input["alpha"]=args.alpha
    input["beta"]= args.beta
    input["seed"]=args.seed
    input["host_id"]=host_id
    # and write it to disk
    f=open( os.path.join(rundir, "input.json"), "w" )
    simplejson.dump( input, f, indent=0 )
    f.close()
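    # (illustrative) input.json now holds the run parameters, e.g.
    #   {"rundir": "...", "numT": 50, "iter": 400, "corpus": "...", "vocab": "...",
    #    "alpha": 0.1, "beta": 0.01, "seed": 7, "host_id": "..."}
    # -- the keys are exactly those written above; the values here are made up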




    start_time = datetime.datetime.now()



    # setup the lda model
    lda = LdaModel(numT=args.numT, alpha=args.alpha, beta=args.beta,
                   corpus=corpus, vocab=vocab)


    # if not in seeded mode run as usual
    if not args.seed_z_from:
        lda.train(iter=args.iter, seed=args.seed )
    # NEW: seeded z training
    else:
        logger.info("Using seeded z training ... ")

        # training params
        if not args.iter:
            lda.iter = 50
        else:
            lda.iter = args.iter

        if not args.seed:
            seed = 777
            lda.seed = 2*seed+1
        else:
            lda.seed = 2*args.seed + 1


        # load the seed_z_from file into a seed_z numpy array
        seed_z = np.load( args.seed_z_from)
        if args.expand_factors:
            expand_factors_str = smart_list_reader( args.expand_factors )
            expand_factors = np.array( [int(i) for i in expand_factors_str ] )
        else:
            expand_factors = None    # let lda.seeded_initialize() handle it

        # custom train sequence
        lda.allocate_arrays()
        lda.read_dw_alphabetical()
        #self.random_initialize()   # NO -- we want a seeded initialization!
        lda.seeded_initialize(seed_z, expand_factors )
        lda.gibbs_sample(iter=lda.iter, seed=lda.seed )
        lda.wpdt_to_probs()
        #self.deallocate_arrays()


    # record how long it took
    end_time = datetime.datetime.now()
    duration = (end_time-start_time).seconds







    # save word counts and topic assignment counts (these are sparse)
    if args.save_counts:    # TRUE by default
        state = ["dp", "wp", "alpha", "beta" ]
        for var_name in state:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name] )
            np.save( f_name, lda.__getattribute__(var_name) )
        logger.info("Done writing out Nwt+beta, Ndt+alpha")

    # Gibbs sampler state, which consists of
    # the full  topic assignments "z.npy"
    if args.save_z:
        var_name="z"
        f_name = os.path.join(rundir, RUN_FILENAMESS[var_name] )
        np.save( f_name, lda.__getattribute__(var_name) )
        logger.info("Done writing out z.npy")

    # save probs
    if args.save_probs:
        probs = ["phi", "theta"]
        for var_name in probs:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name] )
            np.save( f_name, lda.__getattribute__(var_name) )
        logger.info("Done writing out probabilities phi.npy and theta.npy")



    # prepare a dict which will become output.json
    output = {}
    # run details
    output["rundir"]=rundir
    output["host_id"]=host_id
    output["iter"]=lda.iter
    output["seed"]=lda.seed
    output["start_time"]=start_time.isoformat()  # ISO format string
                                    # to read ISO time stamps use dateutil
                                    #>>> from dateutil import parser
                                    #>>> parser.parse("2011-01-25T23:36:43.373248")
                                    # datetime.datetime(2011, 1, 25, 23, 36, 43, 373247)
    output["duration"]=int(duration)
    # corpus info
    output["corpus"]=args.docs_file
    output["vocab"]=args.vocab_file
    output["numDocs"] = lda.numDocs
    output["numTerms"] = lda.numTerms
    output["totalNterms"] = lda.corpus.totalNwords
    # model parameters
    output["numT"]=lda.numT
    # the hyperparameter vectors are too long to store in full here;
    # use the separate .npy files if alpha/beta are non-uniform
    output["alpha"]= lda.alpha[0] #[np.average(lda.alpha), float(np.cov(lda.alpha)) ]  # [avg, var]
    output["beta"]=  lda.beta[0]  #[np.average(lda.beta), float(np.cov(lda.beta)) ]  # [avg, var]
    #
    # calculate likelihood
    output["loglike"]=lda.loglike()
    output["perplexity"]=lda.perplexity()   # = np.exp( -1 * loglike() / totalNwords )
    logger.info("Log likelyhood: %f" % output["loglike"] )
    logger.info("Perplexity: %f" % output["perplexity"] )
    #
    # special seeding info
    if args.seed_z_from:
        output["seed_z_from"]= args.seed_z_from
    if args.expand_factors:
        output["expand_factors"]= args.expand_factors



    # compute sparseness and write it out
    sp = get_sparse_stats( lda.phi )
    np.save(  os.path.join(rundir, "phi_sparseness.npy"), sp)
    # report on sparseness statistics (assume a single mode)
    nz = sp.nonzero()[0]                        # indices of the nonzero entries
    sp_avg = sum([sp[i]*i for i in nz])         # where are the nonzero entries concentrated?
    sp_var = sum( [sp[i]*np.abs(i-sp_avg)**2 for i in nz] )
    sp_stdev = np.sqrt( sp_var )                # how concentrated the mass is around sp_avg
    output["phi_sparseness_avg"]=sp_avg
    output["phi_sparseness_stdev"]=sp_stdev
    logger.info("Phi sparseness. center=%d, width=%d" % (int(sp_avg),int(sp_stdev))  )

    # same for theta
    sp = get_sparse_stats( lda.theta )
    np.save( os.path.join(rundir, "theta_sparseness.npy"), sp)
    # report on sparseness statistics (assume a single mode)
    nz = sp.nonzero()[0]                        # indices of the nonzero entries
    sp_avg = sum([sp[i]*i for i in nz])         # where are the nonzero entries concentrated?
    sp_var = sum( [sp[i]*np.abs(i-sp_avg)**2 for i in nz] )
    sp_stdev = np.sqrt( sp_var )                # how concentrated the mass is around sp_avg
    output["theta_sparseness_avg"]=sp_avg
    output["theta_sparseness_stdev"]=sp_stdev
    logger.info("Theta sparseness. center=%d, width=%d" % (int(sp_avg),int(sp_stdev))  )

    # write all output data to disk
    f=open( os.path.join(rundir, "output.json"), "w" )
    simplejson.dump( output, f, indent=0 )
    f.close()
    logger.info("Done saving output.json")



    if args.print_topics:
        from liblda.topicviz.show_top import show_top
        top_words_in_topics = show_top(lda.phi, num=args.print_topics, id2word=lda.corpus.id2word)

        for topic in top_words_in_topics:
            words = ", ".join(topic)
            print words


    logger.info("Done! --> thank you come again")
Example #2



    # get the topic list
    top_words_in_topics = show_top(phi, num=args.num, id2word=id2word)

    for topic in top_words_in_topics:
        words = ", ".join(topic)
        print words


    print "phi sparsensess"

    # compute sparseness and write it out
    sp = get_sparse_stats( phi )
    np.save("phi_sparseness.npy", sp)

    # report on sparseness statistics (assume a single mode)
    nz = sp.nonzero()[0]                        # indices of the nonzero entries
    sp_avg = sum([sp[i]*i for i in nz])         # where are the nonzero entries concentrated?
    sp_var = sum( [sp[i]*np.abs(i-sp_avg)**2 for i in nz] )
    sp_stdev = np.sqrt( sp_var )                # how concentrated the mass is around sp_avg

    logger.info("Phi sparseness. center=%d, width=%d" % (int(sp_avg),int(sp_stdev))  )
    #print list(sp)

    # exit with OK status
    sys.exit(0)
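Each example ends with the same sparseness summary: the array returned by get_sparse_stats() is treated as a discrete distribution over integer bins, and its weighted mean and standard deviation are reported. get_sparse_stats() itself is not shown in this listing, so the snippet below is only a minimal sketch of that arithmetic on a made-up histogram; the variable names mirror the code above.

import numpy as np

# made-up stand-in for get_sparse_stats(phi): sp[i] is the probability mass
# assigned to sparseness bin i (the real function is not part of this listing)
sp = np.array([0.0, 0.1, 0.6, 0.2, 0.1])

nz = sp.nonzero()[0]                          # bins that carry mass: [1 2 3 4]
sp_avg = sum(sp[i] * i for i in nz)           # weighted mean bin      = 2.3
sp_var = sum(sp[i] * np.abs(i - sp_avg) ** 2 for i in nz)
sp_stdev = np.sqrt(sp_var)                    # spread around the mean ~ 0.78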





Example #4
def run(args):
    """
    The command-line run script for LDA experiments.


    """

    # standard-library modules used below (presumably imported at module
    # level in the full script; repeated here so the snippet is closer to
    # self-contained)
    import os
    import sys
    import datetime
    import simplejson

    # scientific
    import numpy as np
    import scipy as sp

    # display what run got in args
    for tup in args.__dict__.iteritems():
        print tup

    # LOAD VOCAB
    wlist = smart_list_reader(args.vocab_file)
    if not wlist:
        print "Vocab format not recognized"
        sys.exit(-1)
    # convert the list [term1, term2, ...] into the dicts
    # {0: term1, 1: term2, ...} and {term1: 0, term2: 1, ...}
    id2word = dict(enumerate(wlist))
    word2id = dict([(word, id) for id, word in id2word.items()])
    vocab = word2id

    # SETUP CORPUS (LAZY)
    # doCounts -- not so lazy...
    if args.docs_file[-3:] == ".mm":
        from liblda.newmmcorpus import NewMmCorpus
        corpus = NewMmCorpus(args.docs_file)
        corpus.setVocabFromDict(vocab)
        corpus.doCounts()
    elif args.docs_file[-4:] == ".txt":
        from liblda.low2corpus import Low2Corpus
        corpus = Low2Corpus(args.docs_file)
        corpus.setVocabFromDict(vocab)
        corpus.doCounts()
    else:
        print "Corpus format not recognized"
        sys.exit(-1)

    # Create rundir
    from socket import gethostname
    from liblda.util import rungen

    full_hostname = gethostname()
    # rstrip() strips a trailing *character set*, not a literal suffix,
    # so cut the ".cs.mcgill.ca" domain explicitly
    domain = ".cs.mcgill.ca"
    host_id = full_hostname[:-len(domain)] if full_hostname.endswith(domain) else full_hostname

    if not args.rundirs_root:
        rundirs_root = RUNDIRS_ROOT
    else:
        rundirs_root = args.rundirs_root
    if not os.path.exists(rundirs_root):
        print "Error, rundirs_root %s doesn't exist" % rundirs_root
        sys.exit(-1)

    # create the host-specific rundir if necessary
    host_rundirs_root = os.path.join(rundirs_root, host_id)
    if not os.path.exists(host_rundirs_root):
        os.mkdir(host_rundirs_root)

    # create a new (sequential) rundir for this host
    rundir = rungen.mk_next_rundir(host_rundirs_root)
    logger.info("rundir: " + rundir)

    # prepare a dict which will become input.json
    input = {}
    input["rundir"] = rundir
    input["numT"] = args.numT
    input["iter"] = args.iter
    input["corpus"] = args.docs_file
    input["vocab"] = args.vocab_file
    input["alpha"] = args.alpha
    input["beta"] = args.beta
    input["seed"] = args.seed
    input["host_id"] = host_id
    # and write it to disk
    f = open(os.path.join(rundir, "input.json"), "w")
    simplejson.dump(input, f, indent=0)
    f.close()

    start_time = datetime.datetime.now()

    # setup the lda model
    lda = LdaModel(numT=args.numT,
                   alpha=args.alpha,
                   beta=args.beta,
                   corpus=corpus,
                   vocab=vocab)

    # if not in seeded mode run as usual
    if not args.seed_z_from:
        if not args.save_perplexity_every:
            lda.train(iter=args.iter, seed=args.seed)
        else:
            lda.allocate_arrays()
            lda.read_dw_alphabetical()
            lda.random_initialize()
            cum = 0
            perp_hist = []
            while cum < args.iter:
                lda.gibbs_sample(iter=args.save_perplexity_every,
                                 seed=args.seed + cum)
                lda.wpdt_to_probs()
                # perplexity = np.exp( -1 * loglike() / totalNwords )
                perp_hist.append(lda.perplexity())
                cum += args.save_perplexity_every

    # NEW: seeded z training
    else:
        logger.info("Using seeded z training ... ")

        # training params
        if not args.iter:
            lda.iter = 50
        else:
            lda.iter = args.iter

        if not args.seed:
            seed = 777
            lda.seed = 2 * seed + 1
        else:
            lda.seed = 2 * args.seed + 1

        # load the seed_z_from file into a seed_z numpy array
        seed_z = np.load(args.seed_z_from)
        if args.expand_factors:
            expand_factors_str = smart_list_reader(args.expand_factors)
            expand_factors = np.array([int(i) for i in expand_factors_str])
        else:
            expand_factors = None  # let lda.seeded_initialize() handle it

        # custom train sequence
        lda.allocate_arrays()
        lda.read_dw_alphabetical()
        #self.random_initialize()   # NO -- we want a seeded initialization!
        lda.seeded_initialize(seed_z, expand_factors)
        lda.gibbs_sample(iter=lda.iter, seed=lda.seed)
        lda.wpdt_to_probs()
        #self.deallocate_arrays()

    # record how long it took
    end_time = datetime.datetime.now()
    duration = (end_time - start_time).seconds

    # save word counts and topic assignment counts (these are sparse)
    if args.save_counts:  # TRUE by default
        state = ["dp", "wp", "alpha", "beta"]
        for var_name in state:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
            np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out Nwt+beta, Ndt+alpha")

    # Gibbs sampler state, which consists of
    # the full  topic assignments "z.npy"
    if args.save_z:
        var_name = "z"
        f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
        np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out z.npy")

    # save probs
    if args.save_probs:
        probs = ["phi", "theta"]
        for var_name in probs:
            f_name = os.path.join(rundir, RUN_FILENAMESS[var_name])
            np.save(f_name, lda.__getattribute__(var_name))
        logger.info("Done writing out probabilities phi.npy and theta.npy")

    # prepare a dict which will become output.json
    output = {}
    # run details
    output["rundir"] = rundir
    output["host_id"] = host_id
    output["iter"] = args.iter
    output["seed"] = args.seed
    output["start_time"] = start_time.isoformat()  # ISO format string
    # to read ISO time stamps use dateutil
    #>>> from dateutil import parser
    #>>> parser.parse("2011-01-25T23:36:43.373248")
    # datetime.datetime(2011, 1, 25, 23, 36, 43, 373247)
    output["duration"] = int(duration)
    # corpus info
    output["corpus"] = args.docs_file
    output["vocab"] = args.vocab_file
    output["numDocs"] = lda.numDocs
    output["numTerms"] = lda.numTerms
    output["totalNterms"] = lda.corpus.totalNwords
    # model parameters
    output["numT"] = lda.numT
    # the hyperparameter vectors are too long to store in full here;
    # use the separate .npy files if alpha/beta are non-uniform
    output["alpha"] = lda.alpha[0]  # [np.average(lda.alpha), float(np.cov(lda.alpha))]  # [avg, var]
    output["beta"] = lda.beta[0]    # [np.average(lda.beta), float(np.cov(lda.beta))]  # [avg, var]
    #
    # calculate likelihood
    output["loglike"] = lda.loglike()
    output["perplexity"] = lda.perplexity()  # = np.exp( -1 * loglike() / totalNwords )
    if args.save_perplexity_every:
        output["perplexity_history"] = perp_hist
    logger.info("Log likelihood: %f" % output["loglike"])
    logger.info("Perplexity: %f" % output["perplexity"])
    #
    # special seeding info
    if args.seed_z_from:
        output["seed_z_from"] = args.seed_z_from
    if args.expand_factors:
        output["expand_factors"] = args.expand_factors

    # compute sparseness and write it out
    sp = get_sparse_stats(lda.phi)
    np.save(os.path.join(rundir, "phi_sparseness.npy"), sp)
    # report on sparseness statistics (assume a single mode)
    nz = sp.nonzero()[0]  # indices of the nonzero entries
    sp_avg = sum([sp[i] * i for i in nz])  # where are the nonzero entries concentrated?
    sp_var = sum([sp[i] * np.abs(i - sp_avg)**2 for i in nz])
    sp_stdev = np.sqrt(sp_var)  # how concentrated the mass is around sp_avg
    output["phi_sparseness_avg"] = sp_avg
    output["phi_sparseness_stdev"] = sp_stdev
    logger.info("Phi sparseness. center=%d, width=%d" %
                (int(sp_avg), int(sp_stdev)))

    # same for theta
    sp = get_sparse_stats(lda.theta)
    np.save(os.path.join(rundir, "theta_sparseness.npy"), sp)
    # report on sparseness statistics (assume a single mode)
    nz = sp.nonzero()[0]  # indices of the nonzero entries
    sp_avg = sum([sp[i] * i for i in nz])  # where are the nonzero entries concentrated?
    sp_var = sum([sp[i] * np.abs(i - sp_avg)**2 for i in nz])
    sp_stdev = np.sqrt(sp_var)  # how concentrated the mass is around sp_avg
    output["theta_sparseness_avg"] = sp_avg
    output["theta_sparseness_stdev"] = sp_stdev
    logger.info("Theta sparseness. center=%d, width=%d" %
                (int(sp_avg), int(sp_stdev)))

    # write all output data to disk
    f = open(os.path.join(rundir, "output.json"), "w")
    simplejson.dump(output, f, indent=0)
    f.close()
    logger.info("Done saving output.json")

    if args.print_topics:
        from liblda.topicviz.show_top import show_top
        top_words_in_topics = show_top(lda.phi,
                                       num=args.print_topics,
                                       id2word=lda.corpus.id2word)

        for topic in top_words_in_topics:
            words = ", ".join(topic)
            print words

    logger.info("Done! --> thank you come again")