Пример #1
0
def batch_generator(corpus, k, V, thread_batch, lockedEta):
    """Yield mini-batches of documents plus the model state needed to
    process them.

    Groups ``thread_batch`` documents from ``corpus`` into a batch and
    yields ``(docs, eta, etaSum, alpha, batch_id)``, where ``eta`` is a
    snapshot of the shared state restricted to the vocabulary ids that
    actually occur in the batch.  A final, possibly smaller, batch is
    emitted for leftover documents.

    Parameters:
        corpus       -- iterable of documents; each document is an
                        iterable of (vocab_id, count) pairs
        k            -- number of topics
        V            -- vocabulary size
        thread_batch -- number of documents per yielded batch
        lockedEta    -- shared eta state (provides get_eta / get_eta_sum)
    """
    # ids
    doc_id = 0
    batch_id = 0
    # documents and vocabulary ids collected for the current batch
    doc_buffer = []
    voc_temp = set()

    for doc in corpus:

        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            # snapshot shared state limited to this batch's vocabulary
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)

            yield (doc_buffer, eta_temp, etaSum, alpha, batch_id)

            # start a fresh batch
            doc_buffer = []
            voc_temp = set()
            batch_id += 1

        doc_id += 1

    # flush the final, possibly smaller, batch
    if doc_buffer:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)

        yield (doc_buffer, eta_temp, etaSum, alpha, batch_id)
Пример #2
0
def batch_generator(corpus, k, V, thread_batch, lockedEta):
    """Generator producing (docs, eta, etaSum, alpha, batch_id) tuples.

    Every ``thread_batch`` documents from ``corpus`` are bundled with a
    snapshot of the shared eta state and yielded as one batch; whatever
    documents remain at the end are yielded as a final short batch.
    """
    batch_id = 0
    round_id = 0
    docs = []           # documents of the batch being assembled
    vocab_seen = set()  # vocabulary ids present in those documents

    # process control
    nBatch = LockedSum(0, Lock())

    for doc_id, doc in enumerate(corpus):

        vocab_seen.update(vid for vid, count in doc)
        docs.append(doc)

        if (doc_id + 1) % thread_batch == 0:
            yield (docs,
                   lockedEta.get_eta(k, vocab_seen),
                   lockedEta.get_eta_sum(k, V),
                   _mea.get_alpha(k),
                   batch_id)

            # reset the per-batch buffers
            docs = []
            vocab_seen = set()
            batch_id += 1

    # emit whatever is left as a final short batch
    if len(docs) > 0:
        yield (docs,
               lockedEta.get_eta(k, vocab_seen),
               lockedEta.get_eta_sum(k, V),
               _mea.get_alpha(k),
               batch_id)

        batch_id += 1
Пример #3
0
def test():
    """Compute and print held-out perplexity for a trained model.

    Configuration is loaded from the file whose path is given in
    ``sys.argv[2]`` (the old docstring said ``$1``, but index 2 is what
    the code reads -- confirm against the launch script).
    Prints the eta path and the resulting perplexity to stdout.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # loading configs
    config = get_config(sys.argv[2])
    k = config['k']
    test_path = config['test_path']
    test_train_path = config['test_train']
    test_test_path = config['test_test']
    eta_path = config['eta_path']
    gensim = config['gensim']
    print(eta_path)  # parenthesized print: valid on both Python 2 and 3
    corpus = _mCorpus.get_corpus(test_path)
    V = corpus.num_terms

    # collect every vocabulary id occurring in the evaluation corpus
    voc_set = set()
    for doc in corpus:
        for wid, count in doc:
            voc_set.add(wid)
    etaTest, etaSum = None, None

    if gensim:
        # eta was produced by gensim; use the dedicated loader
        etaTest, etaSum = _mea.get_gensim_eta_etaSum(eta_path, voc_set)
    else:
        eta = _mea.load_eta(eta_path)
        etaTest = _mea.get_eta(k, eta, voc_set)
        etaSum = _mea.get_eta_sum(eta, k, V)

    test_test = _mCorpus.get_corpus(test_test_path)
    test_train = _mCorpus.get_corpus(test_train_path)
    alpha = _mea.get_alpha(k)
    perplexity = _mper.perplexity(test_train, test_test, alpha, etaTest,
                                  etaSum)

    print(perplexity)
Пример #4
0
def test():
    """Load a trained model and print its held-out perplexity.

    Reads the config file named by ``sys.argv[2]`` (note: the original
    docstring said ``$1``; index 2 is what is actually read -- verify
    against the invoking script).  Output goes to stdout.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # loading configs
    config = get_config(sys.argv[2])
    k = config['k']
    test_path = config['test_path']
    test_train_path = config['test_train']
    test_test_path = config['test_test']
    eta_path = config['eta_path']
    gensim = config['gensim']
    print(eta_path)  # parenthesized print: works on Python 2 and 3
    corpus = _mCorpus.get_corpus(test_path)
    V = corpus.num_terms

    # gather the vocabulary ids seen in the evaluation corpus
    voc_set = set()
    for doc in corpus:
        for wid, count in doc:
            voc_set.add(wid)
    etaTest, etaSum = None, None

    if gensim:
        # gensim-produced eta has its own loader
        etaTest, etaSum = _mea.get_gensim_eta_etaSum(eta_path, voc_set)
    else:
        eta = _mea.load_eta(eta_path)
        etaTest = _mea.get_eta(k, eta, voc_set)
        etaSum = _mea.get_eta_sum(eta, k, V)

    test_test = _mCorpus.get_corpus(test_test_path)
    test_train = _mCorpus.get_corpus(test_train_path)
    alpha = _mea.get_alpha(k)
    perplexity = _mper.perplexity(test_train, test_test, alpha, etaTest,
                                  etaSum)

    print(perplexity)
Пример #5
0
def asyn_framework(corpus, k, V, nthread, minibatch, var_path,
                   record_eta=False):
    """Asynchronous training driver: farms mini-batches out to a worker
    process pool whose results are merged into a shared lock-protected
    eta by ``callback``.

    Parameters:
        corpus     -- iterable of documents ((vocab_id, count) pairs)
        k          -- number of topics
        V          -- vocabulary size
        nthread    -- worker pool size
        minibatch  -- documents per round across all workers
        var_path   -- directory where eta snapshots are pickled
        record_eta -- not referenced in this function; snapshots are
                      written by the callback and the final check below

    Returns the final eta dictionary.
    """
    import time  # local import: only needed for the throttled wait

    # configs (Python 2: integer division)
    thread_batch = minibatch / nthread
    # ids
    doc_id = 0
    batch_id = 0
    round_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {}
    old_doc_seen[0] = 0
    # global data: eta shared across callbacks, guarded by a lock
    lockedEta = LockedEta({}, Lock())

    # process control
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())  # number of in-flight worker tasks
    nBatch = LockedSum(0, Lock())   # completed batches (advanced by callback)
    results = []
    for doc in corpus:

        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            # snapshot shared state limited to this batch's vocabulary
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)
            # wait for a free worker slot; the short sleep replaces the
            # original 100%-CPU busy-wait
            while nActPro.get_value() >= nthread:
                time.sleep(0.001)

            cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                    nthread, thread_batch, old_doc_seen)
            result = pool.apply_async(asyn_workder,
                                      (doc_buffer, eta_temp, etaSum, alpha),
                                      callback=cb)
            results.append(result)
            nActPro.add_value(1)

            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1

        doc_id += 1

    # remaining docs that did not fill a whole batch: dispatch them too
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        while nActPro.get_value() >= nthread:
            time.sleep(0.001)

        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        result = pool.apply_async(asyn_workder,
                                  (doc_buffer, eta_temp, etaSum, alpha),
                                  callback=cb)
        results.append(result)
        nActPro.add_value(1)
        batch_id += 1

    for r in results:
        r.wait()

    # the callback snapshots eta every nthread batches; cover the final
    # partial round, if any, that it has not written
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value / nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)

    return lockedEta.eta
Пример #6
0
def syn_framework(corpus, k, V, nthread, minibatch, var_path,
                  record_eta=False):
    """Synchronous mini-batch training loop.

    Documents are grouped into batches of ``minibatch / nthread``; every
    ``nthread`` full batches form one round, which ``syn_master``
    reduces into an updated eta.  Trailing documents form a final,
    smaller round.  Returns the resulting eta dictionary.
    """
    per_batch = minibatch / nthread  # batch size (Python 2 int division)

    batch_id = 0
    round_id = 0
    docs = []           # batch currently being filled
    pending = []        # [(docs, eta_slice, eta_sum)] awaiting a round
    vocab_seen = set()  # vocab ids present in `docs`
    eta = {}            # global model state

    for doc_id, doc in enumerate(corpus):

        for wid, cnt in doc:
            vocab_seen.add(wid)
        docs.append(doc)

        if (doc_id + 1) % per_batch == 0:
            # batch complete: pair it with a snapshot of eta
            pending.append((docs,
                            _mea.get_eta(k, eta, vocab_seen),
                            _mea.get_eta_sum(eta, k, V)))
            docs = []
            vocab_seen = set()

            if (batch_id + 1) % nthread == 0:
                # round complete: merge the pending batches into eta
                eta = syn_master(pending, k, nthread, eta,
                                 _mea.get_alpha(k))
                if record_eta:
                    target = os.path.join(
                        var_path, 'eta.{}.pickle'.format(round_id))
                    _mea.write_eta(eta, target)
                pending = []
                round_id += 1
                logging.info('round:{}, batch:{}'.format(round_id, batch_id))

            batch_id += 1

    # leftover documents: one final short batch and round
    if len(docs) > 0:
        pending.append((docs,
                        _mea.get_eta(k, eta, vocab_seen),
                        _mea.get_eta_sum(eta, k, V)))

        eta = syn_master(pending, k, len(pending), eta,
                         _mea.get_alpha(k))
        if record_eta:
            target = os.path.join(var_path, 'eta.{}.pickle'.format(round_id))
            _mea.write_eta(eta, target)

        round_id += 1
        batch_id += 1

    return eta
Пример #7
0
def asyn_framework(corpus,
                   k,
                   V,
                   nthread,
                   minibatch,
                   var_path,
                   record_eta=False):
    """Asynchronous training driver.

    Streams ``corpus`` in batches of ``minibatch / nthread`` documents,
    dispatching each batch to a worker process pool; ``callback`` merges
    worker results back into the shared, lock-guarded ``lockedEta``.

    corpus     -- iterable of documents; each is (vocab_id, count) pairs
    k          -- number of topics
    V          -- vocabulary size
    nthread    -- worker pool size
    minibatch  -- documents per round across all workers
    var_path   -- directory for pickled eta snapshots
    record_eta -- not referenced in this function; snapshotting is done
                  by the callback and by the final check below

    Returns the final eta dictionary.
    """
    # configs (Python 2: integer division)
    thread_batch = minibatch / nthread
    # ids
    doc_id = 0
    batch_id = 0
    round_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {}
    old_doc_seen[0] = 0
    # global data: eta shared across callback invocations, lock-guarded
    lockedEta = LockedEta({}, Lock())

    # process control
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())  # number of in-flight worker tasks
    nBatch = LockedSum(0, Lock())   # completed batches (advanced by callback)
    results = []
    for doc in corpus:

        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)

        if doc_id % thread_batch == thread_batch - 1:
            # snapshot shared state restricted to this batch's vocabulary
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)
            # NOTE(review): 100%-CPU busy-wait until a worker slot frees
            # up -- consider sleeping inside the loop
            while True:  # check for active processes amount
                if nActPro.get_value() < nthread:
                    break

            cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                    nthread, thread_batch, old_doc_seen)
            result = pool.apply_async(asyn_workder,
                                      (doc_buffer, eta_temp, etaSum, alpha),
                                      callback=cb)
            results.append(result)
            nActPro.add_value(1)

            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1

        doc_id += 1

    # remaining docs that did not fill a whole batch: dispatch them too
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        while True:  # check for active processes amount
            if nActPro.get_value() < nthread:
                break

        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        result = pool.apply_async(asyn_workder,
                                  (doc_buffer, eta_temp, etaSum, alpha),
                                  callback=cb)
        results.append(result)
        nActPro.add_value(1)
        batch_id += 1

    # block until every dispatched batch has finished
    for r in results:
        r.wait()

    # the callback snapshots eta every nthread batches; cover the final
    # partial round, if any, that it has not written
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value / nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)

    return lockedEta.eta
Пример #8
0
def syn_framework(corpus, k, V, nthread, minibatch, var_path,
                  record_eta=False):
    """Synchronous round-based trainer.

    Builds batches of ``minibatch / nthread`` documents; once
    ``nthread`` batches have accumulated they are handed to
    ``syn_master`` as one round, producing a new eta.  A trailing
    partial round covers whatever documents remain.  Returns the final
    eta dictionary.
    """
    batch_size = minibatch / nthread  # Python 2 integer division

    doc_idx = 0
    n_batches = 0
    n_rounds = 0
    current_docs = []   # documents of the batch under construction
    round_batches = []  # [(docs, eta_slice, eta_sum)] for the round
    current_vocab = set()
    eta = {}            # global model state

    for document in corpus:

        current_vocab.update(w for w, c in document)
        current_docs.append(document)

        if doc_idx % batch_size == batch_size - 1:
            # batch complete: attach an eta snapshot and queue it
            eta_slice = _mea.get_eta(k, eta, current_vocab)
            eta_sum = _mea.get_eta_sum(eta, k, V)
            round_batches.append((current_docs, eta_slice, eta_sum))
            current_docs = []
            current_vocab = set()

            if n_batches % nthread == nthread - 1:
                # round complete: fold the queued batches into eta
                eta = syn_master(round_batches, k, nthread, eta,
                                 _mea.get_alpha(k))
                if record_eta:
                    _mea.write_eta(eta, os.path.join(
                        var_path, 'eta.{}.pickle'.format(n_rounds)))
                round_batches = []
                n_rounds += 1
                logging.info('round:{}, batch:{}'.format(n_rounds,
                                                         n_batches))

            n_batches += 1

        doc_idx += 1

    # leftover documents form one final short batch and round
    if len(current_docs) > 0:
        eta_slice = _mea.get_eta(k, eta, current_vocab)
        eta_sum = _mea.get_eta_sum(eta, k, V)
        round_batches.append((current_docs, eta_slice, eta_sum))

        eta = syn_master(round_batches, k, len(round_batches), eta,
                         _mea.get_alpha(k))
        if record_eta:
            _mea.write_eta(eta, os.path.join(
                var_path, 'eta.{}.pickle'.format(n_rounds)))

        n_rounds += 1
        n_batches += 1

    return eta