Example #1
    def _generate_parallel(self, iteration, network, device, num_workers):
        q, r = divmod(self.remaining_games, num_workers)
        num_active_workers = Value('i', num_workers)
        resign_threshold = Value('d', self.resign_mgr.threshold())
        evaluator_mgr = BulkEvaluatorManager([network], device, num_workers)
        output_queue = SimpleQueue()

        # start the workers
        workers = []
        for worker_id in range(num_workers):
            num_games = q + 1 if worker_id < r else q
            evaluator = evaluator_mgr.get_evaluator(worker_id, 0)
            worker = Process(
                target=self._worker_job,
                args=(worker_id, num_games, num_active_workers,
                      resign_threshold, evaluator, output_queue),
            )
            workers.append(worker)
            worker.start()

        # start evaluator server
        server = evaluator_mgr.get_server(num_active_workers)
        server.start()

        # collect the examples generated by workers
        while num_active_workers.value > 0 or not output_queue.empty():
            examples, resign_value_history, result = output_queue.get()
            self.example_pool += examples
            self.game_length.append(len(examples))

            # add the history into resignation manager to update the threshold
            if resign_value_history is not None:
                self.resign_mgr.add(resign_value_history, result)
                resign_threshold.value = self.resign_mgr.threshold()

            self.remaining_games -= 1

            # periodically save the progress
            if (self.conf.GAMES_PER_ITERATION - self.remaining_games) \
                    % self.conf.EXAMPLE_POOL_SAVE_FREQUENCY == 0:
                self.save(iteration)
                log.info(
                    f'[iter={iteration}] ExamplePool: checkpoint saved, '
                    f'{self.remaining_games} games remaining'
                )

        for worker in workers:
            worker.join()
        server.join()
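
A minimal sketch of how _generate_parallel might be invoked from outside its class; the driver function, the generator/network objects, and the device selection below are assumptions for illustration, not part of the project above.

import torch
from multiprocessing import cpu_count

def run_iteration(example_generator, iteration, network):
    # hypothetical driver: pick a device and use one worker per CPU core
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    example_generator._generate_parallel(iteration, network, device,
                                         num_workers=cpu_count())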
Example #2
def train(config):
    task_queue = SimpleQueue()
    result_queue = SimpleQueue()
    stop = mp.Value('i', False)
    stats = SharedStats(config.state_dim)
    normalizers = [StaticNormalizer(config.state_dim) for _ in range(config.num_workers)]
    for normalizer in normalizers:
        normalizer.offline_stats.load(stats)

    workers = [Worker(id, normalizers[id], task_queue, result_queue, stop, config) for id in range(config.num_workers)]
    for w in workers: w.start()

    opt = cma.CMAOptions()
    opt['tolfun'] = -config.target
    opt['popsize'] = config.pop_size
    opt['verb_disp'] = 0
    opt['verb_log'] = 0
    opt['maxiter'] = sys.maxsize
    es = cma.CMAEvolutionStrategy(config.initial_weight, config.sigma, opt)

    total_steps = 0
    initial_time = time.time()
    training_rewards = []
    training_steps = []
    training_timestamps = []
    test_mean, test_ste = test(config, config.initial_weight, stats)
    logger.info('total steps %d, %f(%f)' % (total_steps, test_mean, test_ste))
    training_rewards.append(test_mean)
    training_steps.append(0)
    training_timestamps.append(0)
    while True:
        solutions = es.ask()
        for id, solution in enumerate(solutions):
            task_queue.put((id, solution))
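        # busy-wait until every queued solution has been picked up by a worker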
        while not task_queue.empty():
            continue
        result = []
        while len(result) < len(solutions):
            if result_queue.empty():
                continue
            result.append(result_queue.get())
        result = sorted(result, key=lambda x: x[0])
        total_steps += np.sum([r[2] for r in result])
        cost = [r[1] for r in result]
        best_solution = solutions[np.argmin(cost)]
        elapsed_time = time.time() - initial_time
        test_mean, test_ste = test(config, best_solution, stats)
        logger.info('total steps %d, test %f(%f), best %f, elapsed time %f' %
            (total_steps, test_mean, test_ste, -np.min(cost), elapsed_time))
        training_rewards.append(test_mean)
        training_steps.append(total_steps)
        training_timestamps.append(elapsed_time)
        # with open('data/%s-best_solution_%s.bin' % (TAG, config.task), 'wb') as f:
        #     pickle.dump(best_solution, f)
        if config.max_steps and total_steps > config.max_steps:
            stop.value = True
            break

        # shape the raw costs before reporting them back to CMA-ES
        cost = fitness_shift(cost)
        es.tell(solutions, cost)
        # es.disp()
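        # fold each worker's online state statistics into the shared stats,
        # then push the merged stats back to every worker's offline copy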
        for normalizer in normalizers:
            stats.merge(normalizer.online_stats)
            normalizer.online_stats.zero()
        for normalizer in normalizers:
            normalizer.offline_stats.load(stats)

    stop.value = True
    for w in workers: w.join()
    return [training_rewards, training_steps, training_timestamps]
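
A hedged sketch of the configuration object Example #2 expects; the Config class and every concrete value below are assumptions inferred from the attributes train() reads, not the project's real defaults.

import numpy as np

class Config:                       # hypothetical; field names mirror what train() accesses
    state_dim = 17                  # used by SharedStats / StaticNormalizer
    num_workers = 8                 # number of Worker processes
    pop_size = 64                   # CMA-ES population size
    sigma = 0.1                     # initial CMA-ES step size
    target = 300.0                  # negated and used as the 'tolfun' option
    max_steps = int(1e7)            # environment-step budget
    initial_weight = np.zeros(100)  # flat initial policy weights (shape is illustrative)

# rewards, steps, timestamps = train(Config())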
Example #3
def train(config):
    task_queue = SimpleQueue()
    result_queue = SimpleQueue()
    stop = mp.Value('i', False)
    stats = SharedStats(config.state_dim)
    param = torch.FloatTensor(torch.from_numpy(config.initial_weight))
    param.share_memory_()
    n_params = len(param.numpy().flatten())
    if config.args.noise_type == 'lss':
        noise_sizes = [
            config.state_dim * config.hidden_size,
            config.hidden_size * config.hidden_size,
            config.hidden_size * config.action_dim
        ]
    else:
        noise_sizes = None
    noise_generator = NoiseGenerator(n_params,
                                     config.pop_size,
                                     config.args.noise,
                                     noise_sizes=noise_sizes)
    normalizers = [
        StaticNormalizer(config.state_dim) for _ in range(config.num_workers)
    ]
    for normalizer in normalizers:
        normalizer.offline_stats.load(stats)
    workers = [
        Worker(id, param, normalizers[id], task_queue, result_queue, stop,
               noise_generator, config) for id in range(config.num_workers)
    ]
    for w in workers:
        w.start()

    training_rewards = []
    training_steps = []
    training_timestamps = []
    initial_time = time.time()
    total_steps = 0
    iteration = 0
    while not stop.value:
        test_mean, test_ste = test(config, param.numpy(), stats)
        elapsed_time = time.time() - initial_time
        training_rewards.append(test_mean)
        training_steps.append(total_steps)
        training_timestamps.append(elapsed_time)
        logger.info('Test: total steps %d, %f(%f), elapsed time %d' %
                    (total_steps, test_mean, test_ste, elapsed_time))

        for i in range(config.pop_size):
            task_queue.put(i)
        rewards = []
        epsilons = []
        steps = []
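        # busy-wait for one (epsilon, fitness, steps) result per population member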
        while len(rewards) < config.pop_size:
            if result_queue.empty():
                continue
            epsilon, fitness, step = result_queue.get()
            epsilons.append(epsilon)
            rewards.append(fitness)
            steps.append(step)

        total_steps += np.sum(steps)
        r_mean = np.mean(rewards)
        r_std = np.std(rewards)
        # rewards = (rewards - r_mean) / r_std
        logger.info('Train: iteration %d, %f(%f)' %
                    (iteration, r_mean, r_std / np.sqrt(config.pop_size)))
        iteration += 1
        # if r_mean > config.target:
        if config.max_steps and total_steps > config.max_steps:
            stop.value = True
            break
        for normalizer in normalizers:
            stats.merge(normalizer.online_stats)
            normalizer.online_stats.zero()
        for normalizer in normalizers:
            normalizer.offline_stats.load(stats)
        if config.args.reward_type == 'rank':
            rewards = fitness_shift(rewards)
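        # ES gradient estimate: average the noise directions weighted by their rewards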
        gradient = np.asarray(epsilons) * np.asarray(rewards).reshape((-1, 1))
        gradient = np.mean(gradient, 0) / config.sigma
        gradient -= config.weight_decay * gradient
        if config.args.opt == 'adam':
            gradient = config.opt.update(gradient)
        gradient = torch.FloatTensor(gradient)
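        # in-place update of the shared parameter tensor used by all workers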
        param.add_(config.learning_rate * gradient)

    for w in workers:
        w.join()
    return [training_rewards, training_steps, training_timestamps]
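
The parameter update at the end of Example #3 is the usual evolution-strategies gradient estimate. Below is a self-contained numpy sketch of that estimator with made-up shapes, shown only to make the update step explicit.

import numpy as np

pop_size, n_params, sigma, lr = 8, 5, 0.1, 0.01
param = np.zeros(n_params)
epsilons = np.random.randn(pop_size, n_params)   # one noise vector per population member
rewards = np.random.randn(pop_size)              # fitness of each perturbed parameter vector

# average the noise directions weighted by reward, scaled by 1/sigma
gradient = np.mean(epsilons * rewards.reshape(-1, 1), axis=0) / sigma
param += lr * gradient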