import gevent.pool


def gmap(func, iterable, lazy=False, size=None):
    """As per map(), but each func is run in a separate greenlet.

    If lazy, as per imap() instead. If size is given, limits concurrency
    to at most that many greenlets at once.
    """
    # An unbounded Group runs everything at once; a Pool caps concurrency.
    pool = gevent.pool.Group() if size is None else gevent.pool.Pool(size)
    results = pool.imap(func, iterable)
    if not lazy:
        results = list(results)
    return results
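# A minimal usage sketch for gmap(), assuming a hypothetical fetch() helper.
# gevent only parallelizes I/O that has been made cooperative, hence the
# monkey-patch before any sockets are created. With size=5, at most five
# fetches run concurrently; lazy=True would return an iterator instead.
import gevent.monkey
gevent.monkey.patch_all()

import urllib.request

def fetch(url):
    return urllib.request.urlopen(url).read()

urls = ["https://example.com/a", "https://example.com/b"]
pages = gmap(fetch, urls, size=5)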
def _initial_sync(self):
    """ Initial sync. """
    def classify(ns_tuple, large_colls, small_colls):
        """ Find out large and small collections. """
        if self._is_large_collection(ns_tuple):
            points = self._split_coll(ns_tuple, self._n_workers)
            if points:
                large_colls.append((ns_tuple, points))
            else:
                small_colls.append(ns_tuple)
        else:
            small_colls.append(ns_tuple)

    large_colls = []
    small_colls = []
    pool = gevent.pool.Pool(8)
    colls = self._collect_colls()
    for ns in colls:
        dbname, collname = ns
        log.info('%d\t%s.%s' % (self._src.client()[dbname][collname].count(), dbname, collname))
        pool.spawn(classify, ns, large_colls, small_colls)
    pool.join()
    if len(large_colls) + len(small_colls) != len(colls):
        raise RuntimeError('classify collections error')
    log.info('large collections: %s' % ['.'.join(ns) for ns, points in large_colls])
    log.info('small collections: %s' % ['.'.join(ns) for ns in small_colls])

    # create progress logger
    self._progress_logger = LoggerThread(len(colls))
    self._progress_logger.start()

    # small collections first
    pool = gevent.pool.Pool(20)
    for res in pool.imap(self._sync_collection, small_colls):
        if res is not None:
            sys.exit(1)

    # then large collections
    for ns, points in large_colls:
        self._sync_large_collection(ns, points)
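# The classify step above works because gevent greenlets are cooperatively
# scheduled: they only yield at I/O points, so appending to the shared
# large_colls/small_colls lists needs no locking. A standalone sketch of the
# same spawn/join pattern (all names here are illustrative, not from the
# original module):
import gevent.pool

def classify_demo(item, evens, odds):
    # append to one of two shared lists; safe under cooperative scheduling
    (evens if item % 2 == 0 else odds).append(item)

evens, odds = [], []
pool = gevent.pool.Pool(8)
for n in range(100):
    pool.spawn(classify_demo, n, evens, odds)
pool.join()  # block until every spawned greenlet has finished
assert len(evens) + len(odds) == 100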
def fetch_likes(self, number_of_likes):
    likes = []
    pool = gevent.pool.Pool(10)
    for request in pool.imap(self.api_request, range(0, number_of_likes, 100)):
        for track in request:
            likes.append({
                'id': track.id,
                'title': track.title,
                'duration': track.duration,
                'genre': track.genre,
                'description': track.description,
                'downloadable': track.downloadable,
                'permalink_url': track.permalink_url
            })
    return likes
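# A usage sketch, assuming a hypothetical client whose api_request(offset)
# returns one page of up to 100 tracks starting at that offset:
#
#   client = TrackClient(token)       # illustrative name, not from the source
#   likes = client.fetch_likes(500)   # 5 paged requests through a 10-way pool
#
# Note that pool.imap preserves input order, so likes come back in offset
# order even when the underlying requests complete out of order.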
def multiprocess_upload(QueueClass, queue_name, tasks, parallel=True, total=None):
    if parallel is True:
        parallel = mp.cpu_count()
    elif parallel <= 0:
        raise ValueError("Parallel must be a positive number or zero (all cpus). Got: " + str(parallel))

    if parallel == 1:
        return soloprocess_upload(QueueClass, queue_name, tasks)

    def capturing_soloprocess_upload(*args, **kwargs):
        try:
            return soloprocess_upload(*args, **kwargs)
        except Exception as err:
            print(err)
            error_queue.put(err)
            return 0

    uploadfn = partial(
        capturing_soloprocess_upload, QueueClass, queue_name
    )

    if isinstance(tasks, types.GeneratorType):
        try:
            task = next(item for item in tasks if item is not None)
        except StopIteration:
            return 0
        tasks = itertools.chain([task], tasks)

        # This is a hack to get dill to pickle dynamically
        # generated classes. This is an important use case
        # for when we create iterators with generator __iter__
        # functions on demand.
        # https://github.com/uqfoundation/dill/issues/56
        # cls_module = task.__class__.__module__
        # task.__class__.__module__ = '__main__'

    total = totalfn(tasks, total)

    block_size = 2000
    if total is not None and (total / parallel) < block_size:
        if total > 500:
            block_size = int(math.ceil(total / parallel))

    # Fix for MacOS which can segfault due to
    # urllib calling libdispatch which is not fork-safe
    # https://bugs.python.org/issue30385
    no_proxy = os.environ.get("no_proxy", "")
    if platform.system().lower() == "darwin":
        os.environ["no_proxy"] = "*"

    ct = 0
    with tqdm(desc="Upload", total=total) as pbar:
        with pathos.pools.ProcessPool(parallel) as pool:
            # sip() slices the task stream into block_size-sized batches
            for num_inserted in pool.imap(uploadfn, sip(tasks, block_size)):
                pbar.update(num_inserted)
                ct += num_inserted

    QueueClass(queue_name).add_insert_count(ct)

    if platform.system().lower() == "darwin":
        os.environ["no_proxy"] = no_proxy

    # task.__class__.__module__ = cls_module

    if not error_queue.empty():
        errors = []
        while not error_queue.empty():
            err = error_queue.get()
            if err is not StopIteration:
                errors.append(err)
        if len(errors):
            raise Exception(errors)

    return ct
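# A hedged usage sketch. QueueClass, queue_name, soloprocess_upload, totalfn,
# sip, and error_queue are module-level names from the surrounding project;
# the values below are illustrative only:
#
#   tasks = (make_task(i) for i in range(10000))   # hypothetical task factory
#   multiprocess_upload(MyQueue, "my-queue", tasks, parallel=8)
#
# parallel=True uses every CPU; parallel=1 falls back to the single-process
# path and skips the multiprocessing machinery entirely.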
def subpool_imap(pool_size, func, iterable, flatten=False, unordered=False, buffer_size=None):
    """ Generator version of subpool_map. Should be used with unordered=True for optimal performance """

    if not pool_size:
        for args in iterable:
            yield func(*args)
        return  # don't fall through and create a zero-sized pool

    counter = itertools_count()

    current_job = get_current_job()

    def inner_func(*args):
        """ As each call to 'func' will be done in a random greenlet of the
            subpool, we need to register their IDs with set_current_job() to
            make get_current_job() calls work properly inside 'func'. """
        next(counter)
        if current_job:
            set_current_job(current_job)
        try:
            ret = func(*args)
        except Exception as exc:
            trace = traceback.format_exc()
            log.error("Error in subpool: %s \n%s" % (exc, trace))
            raise
        if current_job:
            set_current_job(None)
        return ret

    def inner_iterable():
        """ This will be called inside the pool's main greenlet, whose ID also needs to be registered """
        if current_job:
            set_current_job(current_job)

        for x in iterable:
            yield x

        if current_job:
            set_current_job(None)

    start_time = time.time()
    pool = gevent.pool.Pool(size=pool_size)

    if unordered:
        iterator = pool.imap_unordered(inner_func, inner_iterable(), maxsize=buffer_size or pool_size)
    else:
        iterator = pool.imap(inner_func, inner_iterable())

    for x in iterator:
        if flatten:
            for y in x:
                yield y
        else:
            yield x

    pool.join(raise_error=True)

    total_time = time.time() - start_time

    log.debug("SubPool ran %s greenlets in %0.6fs" % (counter, total_time))
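# A minimal usage sketch. The iterable must yield *argument tuples*, since
# inner_func applies func(*args); unordered=True lets fast greenlets stream
# results back without waiting on slower ones. fetch_page is hypothetical:
#
#   def fetch_page(url, timeout):
#       ...
#
#   args = ((url, 10) for url in urls)
#   for page in subpool_imap(20, fetch_page, args, unordered=True):
#       handle(page)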
        # (tail of process(): blank out the unparseable phone segment,
        # otherwise return the record unchanged)
        return record.replace(full_phone_segment.group(), '"phones": [],')
    return record


# process(file='/Users/dev-01/Downloads/Bullonerie.ldj',
#         file_out='/Users/dev-01/Desktop/Da_caricare_in_Piatttaforma_con_emails/Bullonerie.ldj')

if __name__ == '__main__':
    start_time = time.time()
    pool = gevent.pool.Pool(20)
    lines = set()
    records = set()

    with open(sys.argv[1]) as file_in:
        for one_line in file_in.readlines():
            lines.add(one_line)

    threads = pool.imap(process, lines)
    for th in threads:
        print(th)
        records.add(th)

    print('Waiting till gevent can join the greenlets...')

    print("Starting phone number validation...")
    start_time_validation = time.time()
    greenlets = pool.imap(phone_number_validation_calling, records)
    with open(sys.argv[2], 'a') as writer:
        for greenlet in greenlets:
            if greenlet:
                writer.write(str(greenlet))

    # for single_line in records:
    #     single_line = phone_number_validation_calling(single_line)