def asyn_framework():
    # NOTE(review): this zero-argument definition looks like a truncated draft of
    # the parameterized asyn_framework defined later in this file. It references
    # many names it never defines or receives (corpus, nthread, thread_batch,
    # doc_id, doc_buffer, eta_temp, etaSum, alpha, lockedEta, var_path,
    # old_doc_seen, batch_id) and increments doc_id/batch_id before any
    # assignment, so calling it would raise NameError/UnboundLocalError.
    # Presumably dead code — confirm and consider deleting.
    pool = Pool(processes = nthread)
    nActPro = LockedSum(0,Lock())  # count of in-flight worker batches
    nBatch = LockedSum(0,Lock())   # count of completed batches
    results = []
    for doc in corpus:
        # accumulate one batch
        if doc_id % thread_batch == thread_batch - 1:
            while True:
                # check for active processes amount
                if nActPro.get_value() < nthread:
                    break
            cb = lambda x: callback(x,lockedEta,nActPro,nBatch,var_path,nthread,thread_batch,old_doc_seen)
            result = pool.apply_async(asyn_workder,(doc_buffer,eta_temp,etaSum,alpha),callback = cb)
            results.append(result)
            nActPro.add_value(1)
            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1
        doc_id += 1
    # some remain doc may not be processed
    if len(doc_buffer) > 0:
        while True:
            # check for active processes amount
            if nActPro.get_value() < nthread:
                break
        cb = lambda x: callback(x,lockedEta,nActPro,nBatch,var_path,nthread,thread_batch,old_doc_seen)
        result = pool.apply_async(asyn_workder,(doc_buffer,eta_temp,etaSum,alpha),callback = cb)
        results.append(result)
        nActPro.add_value(1)
        batch_id += 1
    for r in results:
        r.wait()
    # flush the final eta snapshot if the last group of batches was partial
    # NOTE(review): nBatch_value/nthread is float division on Python 3 — the
    # later asyn_framework variants have the same expression; verify intent.
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value/nthread)
        path = os.path.join(var_path,fn)
        lockedEta.write_eta(path)
    return lockedEta.eta
def batch_generator(corpus, k, V, thread_batch, lockedEta):
    """Yield minibatches of documents plus the state needed to process them.

    Accumulates `thread_batch` consecutive documents, then snapshots the
    current eta slices for the batch vocabulary from `lockedEta` and the
    current alpha from the module-level `_mea`, yielding
    ``(doc_buffer, eta_temp, etaSum, alpha, batch_id)`` per batch.
    A final partial batch is flushed after the corpus is exhausted.

    Parameters:
        corpus: iterable of documents; each document iterates as
            (vid, count) pairs.
        k: number of topics.
        V: vocabulary size.
        thread_batch: number of documents per yielded batch (int).
        lockedEta: shared LockedEta store providing get_eta/get_eta_sum.

    Yields:
        (doc_buffer, eta_temp, etaSum, alpha, batch_id) tuples.
    """
    # Removed dead locals from the original (`round_id`, an unused
    # `nBatch = LockedSum(...)`) — neither was ever read.
    doc_id = 0
    batch_id = 0
    doc_buffer = []   # documents accumulated for the current batch
    voc_temp = set()  # vocabulary ids seen in the current batch
    for doc in corpus:
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        # every thread_batch-th document closes a batch
        if doc_id % thread_batch == thread_batch - 1:
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)
            yield (doc_buffer, eta_temp, etaSum, alpha, batch_id)
            # reset accumulators for the next batch
            doc_buffer = []
            voc_temp = set()
            batch_id += 1
        doc_id += 1
    # flush remaining documents that did not fill a full batch
    if doc_buffer:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        yield (doc_buffer, eta_temp, etaSum, alpha, batch_id)
        batch_id += 1
def asyn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Run asynchronous minibatch inference over `corpus` with a process pool.

    Splits the corpus into batches of ``minibatch // nthread`` documents,
    dispatches each batch to `asyn_workder` via ``Pool.apply_async`` (at most
    `nthread` in flight, tracked by `nActPro`), and merges results through
    `callback` into the shared `lockedEta`. After all workers finish, flushes
    a final eta pickle if the last group of batches was partial.

    Parameters:
        corpus: iterable of documents; each document iterates as
            (vid, count) pairs.
        k: number of topics.
        V: vocabulary size.
        nthread: number of worker processes.
        minibatch: total documents per round; per-worker batch size is
            minibatch // nthread.
        var_path: directory where eta snapshots are pickled.
        record_eta: kept for interface compatibility; not used here.

    Returns:
        The final eta dict held by `lockedEta`.
    """
    import time  # local import: only needed for the dispatch back-off below

    # configs
    # BUGFIX: was `minibatch/nthread` — true division on Python 3 makes
    # thread_batch a float, breaking the `doc_id % thread_batch` batching
    # condition whenever minibatch is not an exact multiple of nthread.
    thread_batch = minibatch // nthread
    # ids
    doc_id = 0
    batch_id = 0
    round_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {}
    old_doc_seen[0] = 0
    # global data
    lockedEta = LockedEta({}, Lock())
    # process control
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())  # number of in-flight worker batches
    nBatch = LockedSum(0, Lock())   # number of completed batches
    results = []

    def _dispatch(batch_docs, eta_temp, etaSum, alpha):
        # Wait for a free worker slot, then submit one batch to the pool.
        while nActPro.get_value() >= nthread:
            time.sleep(0.01)  # back off instead of busy-spinning at 100% CPU
        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        results.append(pool.apply_async(asyn_workder,
                                        (batch_docs, eta_temp, etaSum, alpha),
                                        callback=cb))
        nActPro.add_value(1)

    for doc in corpus:
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        # every thread_batch-th document closes a batch
        if doc_id % thread_batch == thread_batch - 1:
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)
            _dispatch(doc_buffer, eta_temp, etaSum, alpha)
            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1
        doc_id += 1
    # some remaining docs may not have been processed
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        _dispatch(doc_buffer, eta_temp, etaSum, alpha)
        batch_id += 1
    for r in results:
        r.wait()
    # presumably `callback` snapshots eta every nthread batches; this flushes
    # the remainder when the final group was partial — confirm against callback
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        # BUGFIX: was `/` — float division produced names like eta.2.0.pickle
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)
    return lockedEta.eta
def asyn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Asynchronously process `corpus` in minibatches with a worker pool.

    Batches of ``minibatch // nthread`` documents are sent to `asyn_workder`
    through ``Pool.apply_async``, keeping at most `nthread` batches in flight
    (`nActPro`); `callback` folds each result into the shared `lockedEta`.
    A final eta pickle is written if the last group of batches was partial.

    Parameters:
        corpus: iterable of documents; each document iterates as
            (vid, count) pairs.
        k: number of topics.
        V: vocabulary size.
        nthread: number of worker processes.
        minibatch: documents per round; per-worker batch size is
            minibatch // nthread.
        var_path: directory for eta pickle snapshots.
        record_eta: kept for interface compatibility; not used here.

    Returns:
        The final eta dict held by `lockedEta`.
    """
    import time  # local import: used for the wait-slot back-off below

    # configs
    # BUGFIX: was `minibatch / nthread` — Python 3 true division makes
    # thread_batch a float and breaks the modulo batching condition when
    # minibatch is not an exact multiple of nthread.
    thread_batch = minibatch // nthread
    # ids
    doc_id = 0
    batch_id = 0
    round_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {}
    old_doc_seen[0] = 0
    # global data
    lockedEta = LockedEta({}, Lock())
    # process control
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())  # in-flight worker batches
    nBatch = LockedSum(0, Lock())   # completed batches
    results = []
    for doc in corpus:
        for vid, count in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        if doc_id % thread_batch == thread_batch - 1:
            eta_temp = lockedEta.get_eta(k, voc_temp)
            etaSum = lockedEta.get_eta_sum(k, V)
            alpha = _mea.get_alpha(k)
            # wait until a worker slot is free
            while nActPro.get_value() >= nthread:
                time.sleep(0.01)  # avoid a 100%-CPU busy-spin
            cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                    nthread, thread_batch, old_doc_seen)
            result = pool.apply_async(asyn_workder,
                                      (doc_buffer, eta_temp, etaSum, alpha),
                                      callback=cb)
            results.append(result)
            nActPro.add_value(1)
            # clear buffer
            doc_buffer = []
            voc_temp = set()
            batch_id += 1
        doc_id += 1
    # some remaining docs may not have been processed
    if len(doc_buffer) > 0:
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        while nActPro.get_value() >= nthread:
            time.sleep(0.01)  # avoid a 100%-CPU busy-spin
        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        result = pool.apply_async(asyn_workder,
                                  (doc_buffer, eta_temp, etaSum, alpha),
                                  callback=cb)
        results.append(result)
        nActPro.add_value(1)
        batch_id += 1
    for r in results:
        r.wait()
    # flush the trailing eta snapshot when the final batch group was partial
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        # BUGFIX: was `/` — float division produced names like eta.2.0.pickle
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)
    return lockedEta.eta