def master_process(comm, status, tags, corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Master side of the MPI training loop.

    Hands out minibatches to worker ranks on READY, merges their eta
    deltas on DONE, periodically checkpoints eta to ``var_path``, and
    returns the final eta dict once every worker has exited.

    Parameters
    ----------
    comm : MPI communicator (e.g. ``MPI.COMM_WORLD``).
    status : ``MPI.Status`` object reused across ``recv`` calls.
    tags : enum-like object with READY / START / DONE / EXIT tags.
    corpus : iterable of documents fed to ``batch_generator``.
    k, V : topic count and vocabulary size.
    nthread : number of worker ranks that actually receive batches.
    minibatch : total documents per round, split evenly across workers.
    var_path : directory for periodic ``eta.*.pickle`` checkpoints.
    record_eta : unused here; kept for interface compatibility.

    Returns
    -------
    dict : the accumulated eta statistics.
    """
    num_workers = comm.size - 1
    closed_workers = 0
    logging.info('Master start with {} works'.format(num_workers))
    # Integer division: thread_batch is a document count. A bare '/'
    # yields a float under Python 3, which corrupts the doc_seen
    # arithmetic and the checkpoint filenames below.
    thread_batch = minibatch // nthread
    lockedEta = LockedEta({}, Lock())
    bg = batch_generator(corpus, k, V, thread_batch, lockedEta)
    last_doc_seen = 0
    while closed_workers < num_workers:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()
        if tag == tags.READY:
            try:
                if source <= nthread:
                    doc_buffer, eta_temp, etaSum, alpha, batch_id = next(bg)
                    comm.send((doc_buffer, eta_temp, etaSum, alpha, batch_id),
                              dest=source, tag=tags.START)
                else:
                    # Surplus ranks beyond nthread never get work.
                    comm.send(None, dest=source, tag=tags.EXIT)
            except StopIteration:
                # Corpus exhausted: release the worker.
                comm.send(None, dest=source, tag=tags.EXIT)
        elif tag == tags.DONE:
            delta_eta, nBatch_value = data
            nBatch_value += 1
            logging.info('Got batch {} results from worker {}'.format(
                nBatch_value, source))
            lockedEta.add_eta(delta_eta)
            doc_seen = nBatch_value * thread_batch
            # Checkpoint eta roughly every 100k documents processed.
            if doc_seen - last_doc_seen >= 100000:
                fn = 'eta.{}.pickle'.format(nBatch_value // nthread - 1)
                path = os.path.join(var_path, fn)
                lockedEta.write_eta(path)
                logging.info('round:{}, batch:{}'.format(
                    nBatch_value // nthread - 1, nBatch_value))
                last_doc_seen = doc_seen
        elif tag == tags.EXIT:
            logging.info('Worker {} exited.'.format(source))
            closed_workers += 1
    return lockedEta.eta
def master_process(comm, status, tags, corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Master side of the MPI training loop.

    NOTE(review): this is a byte-for-byte logical duplicate of the
    ``master_process`` defined earlier in this file; the later ``def``
    wins at import time. Consider deleting one copy.

    Dispatches minibatches to workers on READY, folds returned eta
    deltas on DONE, checkpoints eta periodically, and returns the final
    eta dict once all workers have exited. ``record_eta`` is accepted
    but unused here.
    """
    num_workers = comm.size - 1
    closed_workers = 0
    logging.info('Master start with {} works'.format(num_workers))
    # '//' not '/': thread_batch must stay an integer document count
    # under Python 3 (the original '/' made it a float).
    thread_batch = minibatch // nthread
    lockedEta = LockedEta({}, Lock())
    bg = batch_generator(corpus, k, V, thread_batch, lockedEta)
    last_doc_seen = 0
    while closed_workers < num_workers:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()
        if tag == tags.READY:
            try:
                if source <= nthread:
                    doc_buffer, eta_temp, etaSum, alpha, batch_id = next(bg)
                    comm.send((doc_buffer, eta_temp, etaSum, alpha, batch_id),
                              dest=source, tag=tags.START)
                else:
                    # Ranks beyond nthread are released immediately.
                    comm.send(None, dest=source, tag=tags.EXIT)
            except StopIteration:
                # No batches left: tell the worker to shut down.
                comm.send(None, dest=source, tag=tags.EXIT)
        elif tag == tags.DONE:
            delta_eta, nBatch_value = data
            nBatch_value += 1
            logging.info('Got batch {} results from worker {}'.format(
                nBatch_value, source))
            lockedEta.add_eta(delta_eta)
            doc_seen = nBatch_value * thread_batch
            # Write an eta snapshot about every 100k documents.
            if doc_seen - last_doc_seen >= 100000:
                fn = 'eta.{}.pickle'.format(nBatch_value // nthread - 1)
                path = os.path.join(var_path, fn)
                lockedEta.write_eta(path)
                logging.info('round:{}, batch:{}'.format(
                    nBatch_value // nthread - 1, nBatch_value))
                last_doc_seen = doc_seen
        elif tag == tags.EXIT:
            logging.info('Worker {} exited.'.format(source))
            closed_workers += 1
    return lockedEta.eta
def asyn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Asynchronous (process-pool) variant of the training loop.

    Streams ``corpus`` into per-thread batches, dispatches each batch to
    a worker process, merges results through ``callback``, writes a
    final eta checkpoint if needed, and returns the eta dict.

    ``record_eta`` is accepted but unused here.
    """
    # '//' not '/': thread_batch is an integer document count; a bare
    # '/' makes it a float under Python 3, breaking the modulo test in
    # the loop below and the checkpoint filename at the end.
    thread_batch = minibatch // nthread
    doc_id = 0
    # temp data
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {0: 0}
    # global data
    lockedEta = LockedEta({}, Lock())
    # process control
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())
    nBatch = LockedSum(0, Lock())
    results = []

    def _dispatch_batch():
        # Snapshot current eta state, wait for a free worker slot, then
        # hand the current doc_buffer off to the pool.
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        # NOTE(review): busy-wait spins at full CPU until a slot frees;
        # a short sleep or semaphore would be kinder — left as-is to
        # preserve the original scheduling behavior.
        while nActPro.get_value() >= nthread:
            pass
        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        results.append(pool.apply_async(
            asyn_workder, (doc_buffer, eta_temp, etaSum, alpha), callback=cb))
        nActPro.add_value(1)

    for doc in corpus:
        for vid, _ in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        if doc_id % thread_batch == thread_batch - 1:
            _dispatch_batch()
            # clear buffer for the next batch
            doc_buffer = []
            voc_temp = set()
        doc_id += 1
    # some remaining docs may not have been dispatched yet
    if doc_buffer:
        _dispatch_batch()
    for r in results:
        r.wait()
    # Release the worker processes (the original leaked the pool).
    pool.close()
    pool.join()
    # Final checkpoint for a partially-filled round.
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)
    return lockedEta.eta
def asyn_framework(corpus, k, V, nthread, minibatch, var_path, record_eta=False):
    """Asynchronous (process-pool) variant of the training loop.

    NOTE(review): this is a logical duplicate of the ``asyn_framework``
    defined earlier in this file; the later ``def`` wins at import
    time. Consider deleting one copy.

    Streams ``corpus`` into per-thread batches, dispatches each to a
    worker process via ``apply_async``, merges results in ``callback``,
    writes a final eta checkpoint if the last round is partial, and
    returns the eta dict. ``record_eta`` is accepted but unused.
    """
    # Integer division: the original '/' produced a float batch size
    # under Python 3, breaking the modulo check and the filename below.
    thread_batch = minibatch // nthread
    doc_id = 0
    doc_buffer = []
    voc_temp = set()
    old_doc_seen = {0: 0}
    lockedEta = LockedEta({}, Lock())
    pool = Pool(processes=nthread)
    nActPro = LockedSum(0, Lock())
    nBatch = LockedSum(0, Lock())
    results = []

    def _dispatch_batch():
        # Snapshot eta state, wait for a free worker slot, submit batch.
        eta_temp = lockedEta.get_eta(k, voc_temp)
        etaSum = lockedEta.get_eta_sum(k, V)
        alpha = _mea.get_alpha(k)
        # NOTE(review): this busy-waits at full CPU; a sleep/semaphore
        # would be kinder — behavior preserved deliberately.
        while nActPro.get_value() >= nthread:
            pass
        cb = lambda x: callback(x, lockedEta, nActPro, nBatch, var_path,
                                nthread, thread_batch, old_doc_seen)
        results.append(pool.apply_async(
            asyn_workder, (doc_buffer, eta_temp, etaSum, alpha), callback=cb))
        nActPro.add_value(1)

    for doc in corpus:
        for vid, _ in doc:
            voc_temp.add(vid)
        doc_buffer.append(doc)
        if doc_id % thread_batch == thread_batch - 1:
            _dispatch_batch()
            # fresh buffers for the next batch
            doc_buffer = []
            voc_temp = set()
        doc_id += 1
    # flush any trailing partial batch
    if doc_buffer:
        _dispatch_batch()
    for r in results:
        r.wait()
    # Shut the pool down cleanly (the original never closed it).
    pool.close()
    pool.join()
    if nBatch.get_value() % nthread != 0:
        nBatch_value = nBatch.get_value()
        fn = 'eta.{}.pickle'.format(nBatch_value // nthread)
        path = os.path.join(var_path, fn)
        lockedEta.write_eta(path)
    return lockedEta.eta