Example No. 1
import gevent.pool

def gmap(func, iterable, lazy=False, size=None):
    """As per map(), but each func is run in a separate greenlet. If lazy, as per imap() instead.
    If size given, limits concurrency to at most that many greenlets at once."""
    # A Group has no concurrency limit; a Pool caps the number of active greenlets.
    pool = gevent.pool.Group() if size is None else gevent.pool.Pool(size)
    results = pool.imap(func, iterable)
    if not lazy:
        results = list(results)
    return results
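A minimal usage sketch for gmap (the fetch_status helper and URL list are hypothetical, and gevent.monkey patching is assumed so that the socket calls actually yield to other greenlets):

import gevent.monkey
gevent.monkey.patch_all()  # make blocking socket calls cooperative

from urllib.request import urlopen

def fetch_status(url):
    # hypothetical worker: return the HTTP status code for one URL
    return url, urlopen(url).status

urls = ['https://example.com', 'https://example.org']
# run at most 5 greenlets at once and collect the results eagerly
for url, status in gmap(fetch_status, urls, size=5):
    print(url, status)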
Example No. 2
    def _initial_sync(self):
        """ Initial sync.
        """
        def classify(ns_tuple, large_colls, small_colls):
            """ Find out large and small collections.
            """
            if self._is_large_collection(ns_tuple):
                points = self._split_coll(ns_tuple, self._n_workers)
                if points:
                    large_colls.append((ns_tuple, points))
                else:
                    small_colls.append(ns_tuple)
            else:
                small_colls.append(ns_tuple)

        large_colls = []
        small_colls = []

        pool = gevent.pool.Pool(8)
        colls = self._collect_colls()
        for ns in colls:
            dbname, collname = ns
            log.info('%d\t%s.%s' %
                     (self._src.client()[dbname][collname].count(), dbname,
                      collname))
            pool.spawn(classify, ns, large_colls, small_colls)
        pool.join()

        if len(large_colls) + len(small_colls) != len(colls):
            raise RuntimeError('classify collections error')

        log.info('large collections: %s' %
                 ['.'.join(ns) for ns, points in large_colls])
        log.info('small collections: %s' %
                 ['.'.join(ns) for ns in small_colls])

        # create progress logger
        self._progress_logger = LoggerThread(len(colls))
        self._progress_logger.start()

        # small collections first
        pool = gevent.pool.Pool(20)
        for res in pool.imap(self._sync_collection, small_colls):
            if res is not None:
                sys.exit(1)

        # then large collections
        for ns, points in large_colls:
            self._sync_large_collection(ns, points)
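The classification step above uses a common gevent fan-out pattern: spawn one greenlet per item, let them append into shared plain lists, then join the pool before reading the results. A stand-alone sketch of just that pattern (the size threshold and the items are made up for illustration):

import gevent.pool

def classify_size(item, large, small):
    # appending to a shared list is safe here: greenlets only switch on
    # blocking calls, so there is no preemption in the middle of append()
    (large if item > 100 else small).append(item)

large, small = [], []
pool = gevent.pool.Pool(8)
for item in [3, 250, 42, 999]:
    pool.spawn(classify_size, item, large, small)
pool.join()  # wait for every spawned greenlet to finish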
Example No. 3
    def fetch_likes(self, number_of_likes):
        likes = []
        pool = gevent.pool.Pool(10)

        # fetch likes 100 at a time, with up to 10 API requests in flight at once
        for request in pool.imap(self.api_request, range(0, number_of_likes, 100)):
            for track in request:
                likes.append({
                    'id': track.id,
                    'title': track.title,
                    'duration': track.duration,
                    'genre': track.genre,
                    'description': track.description,
                    'downloadable': track.downloadable,
                    'permalink_url': track.permalink_url
                })
        return likes
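fetch_likes pages through the API by mapping api_request over the offsets 0, 100, 200, ... and flattening each page of tracks into one list. The same shape works for any paginated endpoint; a generic sketch with a hypothetical fetch_page function:

import gevent.pool

def fetch_page(offset):
    # hypothetical: return the list of items starting at this offset
    return ['item-%d' % i for i in range(offset, offset + 100)]

pool = gevent.pool.Pool(10)
items = []
# at most 10 pages are fetched concurrently; imap yields them in offset order
for page in pool.imap(fetch_page, range(0, 1000, 100)):
    items.extend(page)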
Example No. 4
def multiprocess_upload(QueueClass, queue_name, tasks, parallel=True, total=None):
  if parallel is True or parallel == 0:
    parallel = mp.cpu_count()
  elif parallel < 0:
    raise ValueError("Parallel must be a positive number or zero (all cpus). Got: " + str(parallel))

  if parallel == 1:
    return soloprocess_upload(QueueClass, queue_name, tasks)
    
  # error_queue is a module-level queue (defined elsewhere) used to collect
  # exceptions raised inside the worker processes.
  def capturing_soloprocess_upload(*args, **kwargs):
    try:
      return soloprocess_upload(*args, **kwargs)
    except Exception as err:
      print(err)
      error_queue.put(err)
    return 0

  uploadfn = partial(
    capturing_soloprocess_upload, QueueClass, queue_name
  )

  if isinstance(tasks, types.GeneratorType):
    try:
      task = next(item for item in tasks if item is not None)
    except StopIteration:
      return 0
    tasks = itertools.chain([task], tasks)

  # This is a hack to get dill to pickle dynamically
  # generated classes. This is an important use case
  # for when we create iterators with generator __iter__
  # functions on demand.

  # https://github.com/uqfoundation/dill/issues/56

  # cls_module = task.__class__.__module__
  # task.__class__.__module__ = '__main__'

  total = totalfn(tasks, total)

  block_size = 2000
  if total is not None and (total / parallel) < block_size:
    if total > 500:
      block_size = int(math.ceil(total / parallel))

  # Fix for MacOS which can segfault due to 
  # urllib calling libdispatch which is not fork-safe
  # https://bugs.python.org/issue30385
  no_proxy = os.environ.get("no_proxy", "")
  if platform.system().lower() == "darwin":
    os.environ["no_proxy"] = "*"

  ct = 0
  with tqdm(desc="Upload", total=total) as pbar:
    with pathos.pools.ProcessPool(parallel) as pool:
      # feed each worker a block of tasks at a time
      for num_inserted in pool.imap(uploadfn, sip(tasks, block_size)):
        pbar.update(num_inserted)
        ct += num_inserted

  QueueClass(queue_name).add_insert_count(ct)

  if platform.system().lower() == "darwin":
    os.environ["no_proxy"] = no_proxy
  # task.__class__.__module__ = cls_module

  if not error_queue.empty():
    errors = []
    while not error_queue.empty():
      err = error_queue.get()
      if err is not StopIteration:
        errors.append(err)
    if len(errors):
      raise Exception(errors)

  return ct
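sip is not defined in this snippet; from the way it is used it appears to split the task iterable into blocks of at most block_size items so that each worker process receives a batch rather than a single task. A minimal sketch of that assumed behaviour:

import itertools

def sip(iterable, block_size):
    # yield successive lists of up to block_size items (assumed behaviour)
    it = iter(iterable)
    while True:
        block = list(itertools.islice(it, block_size))
        if not block:
            return
        yield block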
Example No. 5
def subpool_imap(pool_size, func, iterable, flatten=False, unordered=False, buffer_size=None):
    """ Generator version of subpool_map. Should be used with unordered=True for optimal performance """

    if not pool_size:
        for args in iterable:
            yield func(*args)
        return  # no pool: everything already ran in the current greenlet

    counter = itertools_count()

    current_job = get_current_job()

    def inner_func(*args):
        """ As each call to 'func' will be done in a random greenlet of the subpool, we need to
        register their IDs with set_current_job() to make get_current_job() calls work properly
        inside 'func'.
    """
        next(counter)
        if current_job:
            set_current_job(current_job)

        try:
            ret = func(*args)
        except Exception as exc:
            trace = traceback.format_exc()
            log.error("Error in subpool: %s \n%s" % (exc, trace))
            raise

        if current_job:
            set_current_job(None)
        return ret

    def inner_iterable():
        """ This will be called inside the pool's main greenlet, which ID also needs to be registered """
        if current_job:
            set_current_job(current_job)

        for x in iterable:
            yield x

        if current_job:
            set_current_job(None)

    start_time = time.time()
    pool = gevent.pool.Pool(size=pool_size)

    if unordered:
        iterator = pool.imap_unordered(inner_func, inner_iterable(), maxsize=buffer_size or pool_size)
    else:
        iterator = pool.imap(inner_func, inner_iterable())

    for x in iterator:
        if flatten:
            for y in x:
                yield y
        else:
            yield x

    pool.join(raise_error=True)
    total_time = time.time() - start_time

    # next(counter) yields the number of inner_func calls that were made
    log.debug("SubPool ran %s greenlets in %0.6fs" % (next(counter), total_time))
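A short usage sketch for subpool_imap, assuming the surrounding mrq helpers (get_current_job, set_current_job, log) are importable as in the snippet; the square worker is hypothetical, and unordered=True follows the docstring's advice:

def square(x):
    # hypothetical work function; each call runs in a greenlet of the subpool
    return x * x

for result in subpool_imap(10, square, range(8), unordered=True):
    print(result)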
Example No. 6
                return record.replace(full_phone_segment.group(), '"phones": [],')
            return record


# process(file='/Users/dev-01/Downloads/Bullonerie.ldj',
# file_out='/Users/dev-01/Desktop/Da_caricare_in_Piatttaforma_con_emails/Bullonerie.ldj')

if __name__ == '__main__':
    start_time = time.time()
    pool = gevent.pool.Pool(20)
    lines = set()
    records = set()
    with open(sys.argv[1]) as file_in:
        for one_line in file_in.readlines():
            lines.add(one_line)
        threads = pool.imap(process, lines)
        for th in threads:
            print(th)
            records.add(th)
    print('Waiting till gevent can join the greenlets...')
    print("Starting phone number validation...")

    start_time_validation = time.time()
    greenlets = pool.imap(phone_number_validation_calling, records)

    with open(sys.argv[2], 'a') as writer:
        for greenlet in greenlets:
            if greenlet:
                writer.write(str(greenlet))
    #     for single_line in records:
    #         single_line = phone_number_validation_calling(single_line)