def batch_generator(corpus, k, V, thread_batch, lockedEta):
    """Yield mini-batches of documents with the eta statistics they need.

    :param corpus: iterable of documents; each document is an iterable of
        (vocabulary_id, count) pairs
    :param k: number of topics
    :param V: vocabulary size
    :param thread_batch: number of documents per yielded batch
    :param lockedEta: shared eta store exposing get_eta / get_eta_sum
    :yields: (doc_buffer, eta_temp, etaSum, alpha, batch_id) tuples

    Fix vs. original: removed dead code -- `nBatch = LockedSum(0, Lock())`
    allocated a lock that was never used, and `round_id` was never read.
    """
    doc_id = 0
    batch_id = 0
    doc_buffer = []
    voc_temp = set()
    for doc in corpus:
        # record which vocabulary ids this batch touches, so only the
        # relevant eta rows are fetched below
        for vid, _ in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        if doc_id % thread_batch == thread_batch - 1:
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)
            yield (doc_buffer, eta_temp, etaSum, alpha, batch_id)
            # clear buffer for the next batch
            doc_buffer = []
            voc_temp = set()
            batch_id += 1
        doc_id += 1
    # flush any remaining docs as a final, possibly smaller batch
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        yield (doc_buffer, eta_temp, etaSum, alpha, batch_id)
        batch_id += 1
def batch_generator(corpus, k, V, thread_batch, lockedEta):
    """Walk corpus and emit one tuple per mini-batch of thread_batch docs.

    Each yielded tuple is (docs, eta_temp, etaSum, alpha, batch_id).
    """
    # process control counter (not consumed inside this generator)
    nBatch = LockedSum(0, Lock())
    doc_id = 0
    batch_id = 0
    round_id = 0
    pending = []
    seen_vocab = set()
    for document in corpus:
        # note every vocabulary id the current batch touches
        seen_vocab.update(vid for vid, _cnt in document)
        pending.append(document)
        if doc_id % thread_batch == thread_batch - 1:
            yield (pending,
                   lockedEta.get_eta(k, seen_vocab),
                   lockedEta.get_eta_sum(k, V),
                   _mea.get_alpha(k),
                   batch_id)
            # start a fresh batch
            pending = []
            seen_vocab = set()
            batch_id += 1
        doc_id += 1
    # leftover documents form one final, shorter batch
    if pending:
        yield (pending,
               lockedEta.get_eta(k, seen_vocab),
               lockedEta.get_eta_sum(k, V),
               _mea.get_alpha(k),
               batch_id)
        batch_id += 1
def test():
    """Load a trained model and print its held-out perplexity.

    Command line: the config file path is read from sys.argv[2]
    (NOTE(review): the original docstring said "$1" -- confirm which
    argument slot the launcher actually passes).
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # loading configs;
    config = get_config(sys.argv[2])
    k = config['k']
    test_path = config['test_path']
    test_train_path = config['test_train']
    test_test_path = config['test_test']
    eta_path = config['eta_path']
    gensim = config['gensim']  # presumably a flag: eta stored in gensim format vs. pickled -- confirm
    print eta_path
    corpus = _mCorpus.get_corpus(test_path)
    V = corpus.num_terms
    # collect the vocabulary ids that actually occur in the test corpus
    voc_set = set()
    for doc in corpus:
        for wid, count in doc:
            voc_set.add(wid)
    etaTest, etaSum = None, None
    if gensim:
        etaTest, etaSum = _mea.get_gensim_eta_etaSum(eta_path, voc_set)
    else:
        # plain path: load full eta, then restrict to the seen vocabulary
        eta = _mea.load_eta(eta_path)
        etaTest = _mea.get_eta(k, eta, voc_set)
        etaSum = _mea.get_eta_sum(eta, k, V)
    test_test = _mCorpus.get_corpus(test_test_path)
    test_train = _mCorpus.get_corpus(test_train_path)
    alpha = _mea.get_alpha(k)
    perplexity = _mper.perplexity(test_train, test_test, alpha, etaTest, etaSum)
    print perplexity
def test(): ''' $1 path to config file ''' logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # loading configs; config = get_config(sys.argv[2]) k = config['k'] test_path = config['test_path'] test_train_path = config['test_train'] test_test_path = config['test_test'] eta_path = config['eta_path'] gensim= config['gensim'] print eta_path corpus = _mCorpus.get_corpus(test_path) V = corpus.num_terms voc_set = set() for doc in corpus: for wid,count in doc: voc_set.add(wid) etaTest, etaSum = None,None if gensim: etaTest, etaSum = _mea.get_gensim_eta_etaSum(eta_path,voc_set) else: eta = _mea.load_eta(eta_path) etaTest = _mea.get_eta(k,eta,voc_set) etaSum = _mea.get_eta_sum(eta,k,V) test_test = _mCorpus.get_corpus(test_test_path) test_train = _mCorpus.get_corpus(test_train_path) alpha = _mea.get_alpha(k) perplexity = _mper.perplexity(test_train,test_test,alpha,etaTest,etaSum) print perplexity
def asyn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Asynchronously train eta over corpus with a pool of nthread workers.

    Groups documents into batches of minibatch/nthread, dispatches each
    batch to asyn_workder via Pool.apply_async; the callback receives the
    shared LockedEta, so merging presumably happens there -- confirm.

    :param corpus: iterable of documents, each an iterable of (vid, count)
    :param k: number of topics
    :param V: vocabulary size
    :param nthread: pool size / maximum number of in-flight batches
    :param minibatch: total documents per round across all workers
    :param var_path: directory where eta pickles are written
    :param record_eta: NOTE(review): accepted but never read here
    :return: the eta dict held by the shared LockedEta
    """
    # configs
    thread_batch = minibatch / nthread  # Python-2 int division: docs per worker batch
    # ids
    doc_id = 0
    batch_id = 0  # incremented per batch but never read back
    round_id = 0  # NOTE(review): unused in this function
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {}
    old_doc_seen[0] = 0  # seed entry for round 0; passed to the callback
    # global data: eta shared across worker callbacks, guarded by a lock
    lockedEta = LockedEta({}, Lock())
    # process contral
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())  # count of currently active workers
    nBatch = LockedSum(0, Lock())   # batch counter; presumably advanced by the callback -- confirm
    results = []
    for doc in corpus:
        # track which vocabulary ids this batch touches
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        if doc_id % thread_batch == thread_batch - 1:
            # snapshot the statistics the worker needs for this batch
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)
            while True:
                # check for active processes amount
                # NOTE(review): hot spin with no sleep until a slot frees up
                if nActPro.get_value() < nthread:
                    break
            # every captured value is loop-invariant, so the classic
            # late-binding-lambda pitfall does not apply here
            cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path, nthread, thread_batch, old_doc_seen)
            result = pool.apply_async(asyn_workder, (doc_buffer, eta_temp, etaSum, alpha), callback=cb)
            results.append(result)
            nActPro.add_value(1)
            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1
        doc_id += 1
    # some remain doc may not be processed
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        while True:
            # check for active processes amount
            if nActPro.get_value() < nthread:
                break
        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path, nthread, thread_batch, old_doc_seen)
        result = pool.apply_async(asyn_workder, (doc_buffer, eta_temp, etaSum, alpha), callback=cb)
        results.append(result)
        nActPro.add_value(1)
        batch_id += 1
    for r in results:
        r.wait()
    # presumably a partial final round was not yet snapshotted; write it now
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value / nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)
    return lockedEta.eta
def syn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Synchronous round-based training.

    Collects thread_batch documents per batch and nthread batches per
    round, then runs syn_master once per round to produce the updated eta.

    :param corpus: iterable of documents, each an iterable of (vid, count)
    :param k: number of topics
    :param V: vocabulary size
    :param nthread: batches per round (passed to syn_master)
    :param minibatch: total documents per round
    :param var_path: directory for eta pickles when record_eta is set
    :param record_eta: when true, pickle eta after every round
    :return: the final eta dict
    """
    # configs
    thread_batch = minibatch / nthread  # Python-2 int division: docs per batch
    # ids
    doc_id = 0
    batch_id = 0
    round_id = 0
    # temp data
    doc_buffer = []
    batch_buffer = []  # [(docs,etas)]
    voc_temp = set()
    # global data
    eta = {}
    for doc in corpus:
        # record the vocabulary ids the current batch touches
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        if doc_id % thread_batch == thread_batch - 1:
            # close this batch with a snapshot of the current eta stats
            eta_temp = _mea.get_eta(k, eta, voc_temp)
            etaSum = _mea.get_eta_sum(eta, k, V)
            batch_buffer.append((doc_buffer, eta_temp, etaSum))
            # clear doc buffer
            doc_buffer = []
            voc_temp = set()
            if batch_id % nthread == nthread - 1:
                # update eta
                eta = syn_master(batch_buffer, k, nthread, eta, _mea.get_alpha(k))
                if record_eta:
                    fn = 'eta.{}.pickle'.format(round_id)
                    path = os.path.join(var_path, fn)
                    _mea.write_eta(eta, path)
                # clear batch_buffer
                batch_buffer = []
                round_id += 1
                # NOTE(review): original formatting was lost; this log may
                # belong one level out (per batch) -- confirm intent
                logging.info('round:{}, batch:{}'.format(round_id, batch_id))
            batch_id += 1
        doc_id += 1
    # process the docs in current doc_buffer
    if len(doc_buffer) > 0:
        # form a new batch
        eta_temp = _mea.get_eta(k, eta, voc_temp)
        etaSum = _mea.get_eta_sum(eta, k, V)
        batch_buffer.append((doc_buffer, eta_temp, etaSum))
        # form a new round
        eta = syn_master(batch_buffer, k, len(batch_buffer), eta, _mea.get_alpha(k))
        if record_eta:
            fn = 'eta.{}.pickle'.format(round_id)
            path = os.path.join(var_path, fn)
            _mea.write_eta(eta, path)
        round_id += 1
        batch_id += 1
    return eta
def asyn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Asynchronously train eta over corpus using a pool of nthread workers.

    Mini-batches of minibatch/nthread documents are dispatched to
    asyn_workder via Pool.apply_async; the supplied callback receives the
    shared LockedEta and the progress counters, so result merging
    presumably happens there -- confirm against the callback definition.

    :param corpus: iterable of documents, each an iterable of (vid, count)
    :param k: number of topics
    :param V: vocabulary size
    :param nthread: worker pool size (also the max in-flight batches)
    :param minibatch: documents per round across all workers
    :param var_path: directory for pickled eta snapshots
    :param record_eta: accepted for interface parity with syn_framework;
        not consulted in this function
    :return: the eta dict held by the shared LockedEta after all batches

    Fixes vs. original: the dispatch wait was a hot spin loop (100% CPU);
    it now sleeps briefly between polls.  Write-only locals batch_id and
    round_id were removed, and the duplicated dispatch code was factored
    into a nested helper.
    """
    import time  # local import: used for back-off in the dispatch wait loop

    thread_batch = minibatch / nthread  # Python-2 int division: docs per batch
    doc_id = 0
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {0: 0}  # seed entry for round 0; passed to the callback
    # shared state: eta table plus progress counters, each lock-protected
    lockedEta = LockedEta({}, Lock())
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())  # workers currently running
    nBatch = LockedSum(0, Lock())   # batch counter shared with the callback
    results = []

    def _dispatch(batch_docs, vocab):
        # Snapshot the eta statistics this batch needs, wait for a free
        # worker slot, then hand the batch to the pool.
        eta_temp = lockedEta.get_eta(k, vocab)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        while nActPro.get_value() >= nthread:
            # was a hot spin; yield the CPU while all slots are busy
            time.sleep(0.001)
        # every captured value is loop-invariant, so the late-binding
        # lambda pitfall does not apply
        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        results.append(pool.apply_async(asyn_workder,
                                        (batch_docs, eta_temp, etaSum, alpha),
                                        callback=cb))
        nActPro.add_value(1)

    for doc in corpus:
        # track which vocabulary ids this batch touches
        for vid, _count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        if doc_id % thread_batch == thread_batch - 1:
            _dispatch(doc_buffer, voc_temp)
            doc_buffer = []
            voc_temp = set()
        doc_id += 1
    # dispatch any leftover documents as a final, smaller batch
    if len(doc_buffer) > 0:
        _dispatch(doc_buffer, voc_temp)
    for r in results:
        r.wait()
    # presumably a partial final round was not yet snapshotted; write it now
    if nBatch.get_value() % nthread != 0:
        fn = 'eta.{}.pickle'.format(nBatch.get_value() / nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)
    return lockedEta.eta
def syn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Round-synchronous training loop.

    Buffers thread_batch documents into a batch and nthread batches into a
    round, then calls syn_master once per round to refresh eta.  When
    record_eta is true, eta is pickled after every round.
    """
    per_batch = minibatch / nthread  # Python-2 integer division
    n_docs_seen = 0
    n_batches = 0
    n_rounds = 0
    docs = []
    round_batches = []  # one (docs, eta slice, eta sum) entry per batch
    vocab = set()
    eta = {}
    for document in corpus:
        for word_id, _freq in document:
            vocab.add(word_id)
        docs.append(document)
        if n_docs_seen % per_batch == per_batch - 1:
            # close the current batch with a snapshot of the eta stats
            round_batches.append((docs,
                                  _mea.get_eta(k, eta, vocab),
                                  _mea.get_eta_sum(eta, k, V)))
            docs = []
            vocab = set()
            if n_batches % nthread == nthread - 1:
                # a full round is ready: fold it into eta
                eta = syn_master(round_batches, k, nthread, eta, _mea.get_alpha(k))
                if record_eta:
                    snapshot = os.path.join(var_path, 'eta.{}.pickle'.format(n_rounds))
                    _mea.write_eta(eta, snapshot)
                round_batches = []
                n_rounds += 1
                logging.info('round:{}, batch:{}'.format(n_rounds, n_batches))
            n_batches += 1
        n_docs_seen += 1
    # trailing documents form one last (possibly short) batch and round
    if len(docs) > 0:
        round_batches.append((docs,
                              _mea.get_eta(k, eta, vocab),
                              _mea.get_eta_sum(eta, k, V)))
        eta = syn_master(round_batches, k, len(round_batches), eta, _mea.get_alpha(k))
        if record_eta:
            snapshot = os.path.join(var_path, 'eta.{}.pickle'.format(n_rounds))
            _mea.write_eta(eta, snapshot)
        n_rounds += 1
        n_batches += 1
    return eta