Example #1
    def map(self, func, iterable):


        if self.n_workers == 1:
            # Single worker: a plain map is enough (handy when debugging the evaluation code).
            results = list(map(func, iterable))  # force evaluation so the call can be timed
        else:
            # Multiple workers: fan the work out over an ActorPool.

            # Never create more actors than there are items to evaluate.
            n_workers = min(len(iterable), self.n_workers)

            # Split the work into roughly equal batches, one per worker (ceiling division).
            n_per_batch = (len(iterable) + n_workers - 1) // n_workers
            batches = [iterable[i:i + n_per_batch] for i in range(0, len(iterable), n_per_batch)]
            id_for_reorder = range(len(batches))
            eval_pool = ActorPool([Ray_Deap_Map.remote(self.creator_setup, self.pset_creator) for _ in range(n_workers)])
            unordered_results = list(eval_pool.map_unordered(lambda actor, input_tuple: actor.ray_remote_eval_batch.remote(func, input_tuple),
                                                             zip(batches, id_for_reorder)))
            # Restore submission order using the batch ids returned alongside the results.
            ordered_batch_results = [batch for batch_id in id_for_reorder
                                     for batch in unordered_results if batch_id == batch[0][1]]

            # Flatten the ordered batches into a single list of fitness values.
            results = [item[0] for sublist in ordered_batch_results for item in sublist]
            
        return results
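
A minimal sketch of how a wrapper like this is usually wired into DEAP, whose Toolbox lets the map used for fitness evaluation be swapped out; the RayMapper stand-in below is illustrative, since the snippet does not show the owning class:

from deap import base

# Stand-in for the class that owns the map() method above (its real name is not shown);
# only the call signature matters for registration.
class RayMapper:
    def __init__(self, n_workers=1):
        self.n_workers = n_workers

    def map(self, func, iterable):
        return list(map(func, iterable))  # placeholder body; the real one fans out over Ray

toolbox = base.Toolbox()
# DEAP's algorithms call toolbox.map(toolbox.evaluate, individuals), so registering the
# wrapper's map routes fitness evaluation through it.
toolbox.register("map", RayMapper(n_workers=8).map)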
Example #2
def test_map_gh23107(init):
    sleep_time = 40

    # Reference - https://github.com/ray-project/ray/issues/23107
    @ray.remote
    class DummyActor:
        async def identity(self, s):
            if s == 6:
                await asyncio.sleep(sleep_time)
            return s, time.time()

    def func(a, v):
        return a.identity.remote(v)

    map_values = [1, 2, 3, 4, 5]

    pool_map = ActorPool([DummyActor.remote() for i in range(2)])
    pool_map.submit(func, 6)
    start_time = time.time()
    gen = pool_map.map(func, map_values)
    results = list(gen)
    assert all(elem[0] in [1, 2, 3, 4, 5] for elem in results)
    # Every mapped task should finish well before the slow submitted task (value 6) does.
    assert all(abs(elem[1] - start_time) < sleep_time for elem in results)

    pool_map_unordered = ActorPool([DummyActor.remote() for i in range(2)])
    pool_map_unordered.submit(func, 6)
    start_time = time.time()
    gen = pool_map_unordered.map_unordered(func, map_values)
    results = list(gen)
    assert all(elem[0] in [1, 2, 3, 4, 5] for elem in results)
    assert all(abs(elem[1] - start_time) < sleep_time for elem in results)
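
For reference, a minimal self-contained sketch of the ActorPool calls this test exercises; the Doubler actor is illustrative and not part of the test. map() yields results in submission order, map_unordered() yields them as they complete, and submit()/get_next() give per-task control:

import ray
from ray.util import ActorPool

@ray.remote
class Doubler:
    def double(self, x):
        return 2 * x

ray.init(ignore_reinit_error=True)
pool = ActorPool([Doubler.remote() for _ in range(2)])

# Ordered: results come back in the order the values were submitted.
assert list(pool.map(lambda a, v: a.double.remote(v), [1, 2, 3, 4])) == [2, 4, 6, 8]

# Unordered: results arrive as they complete, so only the set of values is guaranteed.
assert sorted(pool.map_unordered(lambda a, v: a.double.remote(v), [1, 2, 3, 4])) == [2, 4, 6, 8]

# Explicit control over a single task.
pool.submit(lambda a, v: a.double.remote(v), 5)
assert pool.get_next() == 10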
Example #3
def _main():
  opts = _parse_main()
  os.makedirs(opts.ckpt_dir, exist_ok=True)
  files = recursively_get_files(opts.cnfs, exts=["cnf","gz", "dimacs"], forbidden=["bz2"])
  print(f"TRAINING WITH {len(files)} CNFS")

  ray.init()

  WM_USE_GPU = False
  weight_manager = ray.remote(num_gpus=(1 if WM_USE_GPU else 0))(WeightManager).remote(ckpt_dir=opts.ckpt_dir)
  ray.get(weight_manager.load_latest_ckpt.remote())

  if opts.model_cfg is not None:
    with open(opts.model_cfg, "r") as f:
      model_cfg = json.load(f)
  else:
    print("[rl_lbd._main] warning: using default configuration")
    model_cfg = defaultGNN1Cfg

  # TODO: to avoid OOM, either dynamically batch or preprocess the formulas beforehand to
  # ensure they are under a certain size -- this will require some changes throughout to
  # avoid a fixed batch size.
  learner = ray.remote(num_gpus=(1 if torch.cuda.is_available() else 0))(Learner).options(
    max_concurrency=opts.n_workers + 2
  ).remote(
    weight_manager=weight_manager, batch_size=opts.batch_size, ckpt_freq=opts.ckpt_freq,
    ckpt_dir=opts.ckpt_dir, lr=opts.lr, restore=True, model_cfg=model_cfg
  )

  print("LEARNER ONLINE")
  ray.get(learner.restore_weights.remote())

  workers = [
    ray.remote(EpisodeWorker).remote(
      learner=learner, weight_manager=weight_manager, model_cfg=model_cfg
    ) for _ in range(opts.n_workers)
  ]

  pool = ActorPool(workers)

  for w in workers:
    ray.get(w.try_update_weights.remote())

  with open(os.path.join(opts.ckpt_dir, "log.txt"), "a") as f:
    print(f"[{datetime.datetime.now()}] STARTING TRAINING RUN", file=f)
    print("ARGS:", file=f)
    for k,v in vars(opts).items():
      print(f"    {k}  :  {v}", file=f)
    print("\n\n", file=f)

  def shuffle_environments(ws, resample_frac=1.0):
    for w in ws:
      resample = np.random.choice([True,False], p=[resample_frac, 1-resample_frac])
      if resample:
        ray.get(w.set_env.remote(from_file=random.choice(files)))
    print("shuffled environments")

  shuffle_environments(workers)
  for k_epoch in range(opts.n_epochs):
    if opts.asynchronous:
      train_handle = learner.train.remote(synchronous=False)
    shuffle_environments(workers, opts.resample_frac)
    # Drain the generator so that eps_per_worker episodes per worker are sampled this epoch;
    # the mapped values are ignored and only determine how many tasks are dispatched.
    for _ in pool.map_unordered(lambda a, v: a.sample_trajectory.remote(),
                                range(opts.eps_per_worker * opts.n_workers)):
      pass
    if opts.asynchronous:
      ray.get(train_handle)
    else:
      ray.get(learner.train.remote(synchronous=True))
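
This example wraps plain classes as Ray actors at call time (ray.remote(...)(Learner), ray.remote(EpisodeWorker)) instead of decorating them. A minimal sketch of that pattern with a placeholder Counter class; the class and its options here are illustrative:

import ray

class Counter:
    def __init__(self):
        self.n = 0

    def add(self, k):
        self.n += k
        return self.n

ray.init(ignore_reinit_error=True)

# Wrapping at call time keeps Counter usable as an ordinary class and lets resource
# requirements and actor options be chosen at the point where the actor is created.
RemoteCounter = ray.remote(num_gpus=0)(Counter)
counter = RemoteCounter.options(max_concurrency=2).remote()
assert ray.get(counter.add.remote(3)) == 3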
Example #4
def get_flame_parameters_for_objs(
    voca_objs,
    dest_path,
    model_fname="/models/flame_model/ch_models/generic_model.pkl",
):
    global ray_is_init
    if not ray_is_init:
        ray.init(num_gpus=2)
        ray_is_init = True

    MeshFitterActor = ray.remote(MeshFitter).options(num_gpus=0.01, num_cpus=1)

    dest_path.mkdir(parents=True, exist_ok=True)

    files = [x for x in voca_objs if not (dest_path / x.name).exists()]

    if not files:
        return [dest_path / x.name for x in voca_objs]

    # Leave a couple of CPUs for the driver, but never go below one worker.
    cpu_count = max(1, int(ray.available_resources()["CPU"]) - 2)

    actors = []
    for i in range(cpu_count):
        actors.append(MeshFitterActor.remote(model_fname))

    pool = ActorPool(actors)

    def run(a, file_):
        vertices = np.load(file_, allow_pickle=True)
        return a.fit.remote(vertices, dest_path / file_.name)

    dest_paths = []
    for dest_file_path, flame_params in tqdm(
            pool.map_unordered(lambda a, file_: run(a, file_), voca_objs),
            total=len(voca_objs),
    ):
        np.save(dest_file_path, flame_params)
        dest_paths.append(dest_file_path)

    return sorted(dest_paths)
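
The fractional num_gpus=0.01 above lets up to a hundred lightweight actors share one physical GPU; Ray only does the bookkeeping on the fraction, so each actor still has to keep its real GPU memory footprint small. A minimal sketch of sizing such a pool from the cluster's free resources, with an illustrative FitWorker class:

import ray
from ray.util import ActorPool

class FitWorker:
    def fit(self, x):
        return x  # placeholder for the real GPU-backed fitting work

ray.init(ignore_reinit_error=True)

# Request a GPU fraction only if the cluster actually has GPUs, so the sketch also
# runs on a CPU-only machine.
gpu_frac = 0.01 if ray.cluster_resources().get("GPU", 0) else 0
FitActor = ray.remote(FitWorker).options(num_cpus=1, num_gpus=gpu_frac)

# Leave a couple of CPUs for the driver, but never drop below one worker.
n_workers = max(1, int(ray.available_resources().get("CPU", 1)) - 2)
pool = ActorPool([FitActor.remote() for _ in range(n_workers)])
print(list(pool.map_unordered(lambda a, v: a.fit.remote(v), range(8))))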
Example #5
def run_files(
    pbar,
    flame_fitting_dir,
    ringnet_dir,
    dir_,
    neutral_mesh_faces,
    dd,
    lmk_face_idx,
    lmk_b_coords,
    attempt=0,
):
    from tqdm import tqdm

    existing_files = set(
        os.path.basename(os.path.dirname(x))
        for x in glob(str(flame_fitting_dir / "*/flame_params.npy"))
    )

    files = sorted(
        [
            x
            for x in glob(str(dir_ / "*"))
            if os.path.basename(x)[:-4] not in existing_files
        ]
    )
    counter = 0
    actors = []

    cpu_count = max(1, int(ray.available_resources()["CPU"]) - 2)

    pbar.set_description(f"{dir_.parent.name}/{dir_.name} ({cpu_count} cpus)")
    for x in range(min(len(files), cpu_count)):
        actors.append(
            FrameOptimizer.remote(
                dir_, neutral_mesh_faces, dd, lmk_face_idx, lmk_b_coords,
            )
        )
    file_len = len(files)
    pool = ActorPool(actors)
    try:
        pbar2 = tqdm(
            pool.map_unordered(
                lambda a, v: run(a, v, flame_fitting_dir, ringnet_dir), files
            ),
            total=file_len,
        )

        for x in pbar2:
            pbar2.set_description(f"{dir_.parent.name}/{dir_.name} ({cpu_count} cpus)")
            counter += 1
            if x is not None:
                flame_out_path, flame_out_params = x
                os.makedirs(os.path.dirname(flame_out_path), exist_ok=True)
                np.save(flame_out_path, flame_out_params)

    except ray.exceptions.RayActorError:
        if attempt > 10:
            raise Exception("too many attempts")
        # Kill the (possibly wedged) actors and retry; reset the attempt counter
        # whenever this pass made progress on at least one file.
        for actor in actors:
            ray.kill(actor)
        if counter > 0:
            attempt = 0
        else:
            attempt += 1
        run_files(
            pbar,
            flame_fitting_dir,
            ringnet_dir,
            dir_,
            neutral_mesh_faces,
            dd,
            lmk_face_idx,
            lmk_b_coords,
            attempt=attempt,
        )
Example #6
            else:
                num_gpu = 2.0
        else:
            num_gpu = 0.0

        if num_gpu not in experiment_map:
            experiment_map[num_gpu] = []

        experiment_map[num_gpu] += experiments

    for num_gpu, experiments in experiment_map.items():
        # Guard against num_gpu == 0 (ZeroDivisionError) when the cluster has GPUs.
        max_available_actors = min(cpus // 2, gpus // num_gpu if (num_gpu and gpus) else cpus)
        pool = ActorPool([RayBatchActor.options(num_cpus=2, num_gpus=num_gpu).remote()
                          for _ in range(int(max_available_actors))])

        exp_result = pool.map_unordered(lambda actor, kwargs: actor.train.remote(kwargs), experiments)
        for name, results in groupby(sorted(exp_result, key=lambda t: t[0]), key=lambda t: t[0]):
            results = list(results)

            # Log the results
            logger.info('Experiment: %s ------------------', name)
            for name, runname, timedelta, devresult, devmaxresult, testresult in results:
                logger.info('\t%s training time: %10.3f', runname, timedelta)
                logger.info('\t%s dev. accuracy: %7.3f', runname, devresult)
                logger.info('\t%s dev. max. acc: %7.3f', runname, devmaxresult)
                logger.info('\t%s test accuracy: %7.3f', runname, testresult)

            # Write the average result
            if len(results) > 1:
                _, _, time_delta, dev_results, devmax_results, test_results = zip(*results)
                logger.info('\ttime average %7.3f±%7.3f', mean(time_delta), sem(time_delta))