Example No. 1
def train(config, reporter):
    trainer = DDPGTrainer(config=config, env=imuCalibrEnv_seq)
    while True:
        result = trainer.train()
        reporter(**result)
        if result["timesteps_since_restore"] > 250:
            phase = 1
        else:
            phase = 0
        trainer.workers.foreach_worker(
            lambda ev: ev.foreach_env(lambda env: env.set_phase(phase)))
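A train(config, reporter) function in this style is meant to be handed to Ray Tune as a function trainable. Below is a minimal launch sketch, assuming an older Ray release that still accepts the (config, reporter) signature; the experiment name, config values, and stopping criterion are illustrative assumptions, not taken from the original project.

import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    tune.run(
        train,                               # the trainable defined above
        name="ddpg_imu_calibration",         # hypothetical experiment name
        config={
            "num_workers": 1,                # assumed DDPG config values
            "train_batch_size": 256,
        },
        stop={"timesteps_total": 100000},
        resources_per_trial={"cpu": 2, "gpu": 0},
    )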
Example No. 2
def train(config, reporter):
    trainer = DDPGTrainer(config=config, env=CamImuCalibrEnv_seq)
    #checkpoint_path = trainer.save()
    policy = trainer.get_policy()
    print(policy.dist_class)

    i = 0
    while True:
        result = trainer.train()
        reporter(**result)
        # if result["timesteps_since_restore"] > 200:
        #     phase = 1
        # else:
        #     phase = 0
        # trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))
        if i == 0:
            trainer.restore(
                "/home/yunke/ray_results/DDPG_CamImuCalibrEnv_seq_2020-06-03_23-18-37xwsq706i/checkpoint_437/checkpoint-437"
            )
        if i > 3:
            checkpoint_path = trainer.save()
            print(checkpoint_path)
        auto_garbage_collect()
        i += 1
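auto_garbage_collect() is a helper that is not shown in these snippets. A common pattern, and purely a guess at what it does here, is to force a garbage-collection pass once system memory climbs past a threshold, which keeps long Ray runs from creeping up in memory.

import gc
import psutil

def auto_garbage_collect(pct=80.0):
    # Hypothetical reimplementation: collect only when overall memory
    # usage exceeds `pct` percent of system RAM.
    if psutil.virtual_memory().percent >= pct:
        gc.collect()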
Example No. 3
def evaluate_model(args):
    if args.model_path == '':
        print('Cannot evaluate model, no --model_path set')
        exit(1)

    def get_env():
        # Simulator env uses a single map, so better for evaluation/testing.
        # return SteeringToWheelVelWrapper(DuckietownLF(
        # ))
        return MultiMapSteeringToWheelVelWrapper(
            simulator.Simulator(
                map_name=args.map,
                max_steps=2000,
            ))

    # Rather than reuse the env, another one is created later because I can't
    # figure out how to provide register_env with an object.
    register_env('DuckieTown-Simulator', lambda _: get_env())
    trainer = DDPGTrainer(
        env="DuckieTown-Simulator",
        config={
            "framework": "torch",
            "model": {
                "custom_model": "image-ddpg",
            },
            "num_gpus": args.gpu_use,
        },
    )
    trainer.restore(args.model_path)

    sim_env = get_env()

    # Standard OpenAI Gym reset/action/step/render loop.
    # This matches how the `enjoy_reinforcement.py` script works, see: https://git.io/J3js2
    done = False
    observation = sim_env.reset()
    episode_reward = 0
    while not done:
        action = trainer.compute_action(observation)
        observation, reward, done, _ = sim_env.step(action)
        episode_reward += reward
        sim_env.render()

    print(f'Episode complete, total reward: {episode_reward}')
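A single rollout on a stochastic simulator can over- or under-state performance. The sketch below reuses trainer and get_env() from this example and averages the return over a few episodes; the explore=False flag and the episode count are my additions, not part of the original script.

def evaluate_n_episodes(trainer, n_episodes=5):
    # Average the episode return over several fresh environments.
    returns = []
    for _ in range(n_episodes):
        env = get_env()
        obs = env.reset()
        done, total = False, 0.0
        while not done:
            # explore=False requests the policy's deterministic action.
            action = trainer.compute_action(obs, explore=False)
            obs, reward, done, _ = env.step(action)
            total += reward
        returns.append(total)
    return sum(returns) / len(returns)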
Example No. 4
class DDPGrl(object):
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = DDPGTrainer(config=self.config, env=env)

    def fit(self, checkpoint=None):
        if checkpoint is None:
            checkpoint = os.path.join(os.getcwd(), 'data/checkpoint_rl.pkl')
        for idx in trange(5):
            result = self.agent.train()
            LOGGER.warning('result: %s', result)
            if (idx + 1) % 5 == 0:
                LOGGER.warning('Save checkpoint at: {}'.format(idx + 1))
                state = self.agent.save_to_object()
                with open(checkpoint, 'wb') as fp:
                    pickle.dump(state, fp, protocol=pickle.HIGHEST_PROTOCOL)
        return result

    def predict(self, checkpoint=None):
        if checkpoint is not None:
            with open(checkpoint, 'rb') as fp:
                state = pickle.load(fp)
            self.agent.restore_from_object(state)
        done = False
        episode_reward = 0
        obs = self.env.reset()
        actions = []
        while not done:
            action = self.agent.compute_action(obs)
            actions.append(action)
            obs, reward, done, info = self.env.step(action)
            episode_reward += reward
        results = {'action': actions, 'reward': episode_reward}
        return results
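A hypothetical usage sketch of the DDPGrl wrapper above. The Pendulum environment, its wrapper class, and the config tweaks are placeholders chosen to keep the sketch self-contained; they are not from the original project.

import gym
from ray.rllib.agents.ddpg import DEFAULT_CONFIG

class PendulumEnv(gym.Env):
    # Placeholder continuous-control env taking the env_config dict that
    # DDPGrl forwards to its env class (Pendulum-v0 on older gym releases).
    def __init__(self, env_config):
        self._env = gym.make("Pendulum-v0")
        self.action_space = self._env.action_space
        self.observation_space = self._env.observation_space

    def reset(self):
        return self._env.reset()

    def step(self, action):
        return self._env.step(action)

config = DEFAULT_CONFIG.copy()
config["num_workers"] = 1                     # assumed value

rl = DDPGrl(env=PendulumEnv, env_config={}, config=config)
rl.fit(checkpoint="checkpoint_rl.pkl")        # pickles a checkpoint in the cwd
print(rl.predict(checkpoint=None))            # roll out the freshly trained agent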
Example No. 5
def train(config, reporter):
    trainer = DDPGTrainer(config=config, env=imuCalibrEnv_seq)
    #checkpoint_path = trainer.save()
    policy = trainer.get_policy()
    print(policy.dist_class)

    i = 0
    while True:
        result = trainer.train()
        reporter(**result)
        # if result["timesteps_since_restore"] > 200:
        #     phase = 1
        # else:
        #     phase = 0
        # trainer.workers.foreach_worker(
        #     lambda ev: ev.foreach_env(
        #         lambda env: env.set_phase(phase)))
        # if i==0:
        #     trainer.restore("/home/yunke/ray_results/DDPG_imuCalibrEnv_seq_2020-06-27_01-48-53hwk9uq89/checkpoint_995/checkpoint-995")
        if i > 3 and i % 100 == 0:
            checkpoint_path = trainer.save()
            print(checkpoint_path)
        auto_garbage_collect()
        i += 1
Example No. 6
def train_model(args, config):

    # Define the trainer. Apart from env, config/framework, and config/model,
    # which are common to all trainers, the default config keys/values are
    # listed here:
    #    https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # DDPG additionally accepts these keys:
    #    https://docs.ray.io/en/master/rllib-algorithms.html#ddpg
    trainer = DDPGTrainer(
        env="DuckieTown-MultiMap",
        config=config,
    )

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    # TODO(balbok0): Start values from checkpoint, if available.
    best_mean_reward = -np.inf
    epoch_of_best_mean_reward = 0
    path_of_best_mean_reward = None

    for i in trange(args.epochs, desc="Epochs",
                    leave=False):  # Number of episodes (basically epochs)
        # print(f'----------------------- Starting epoch {i} ----------------------- ')
        # train() runs one training iteration (which may span several episodes)
        result = trainer.train()
        # print(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        # print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        if result["episode_reward_mean"] > best_mean_reward:
            best_mean_reward = result["episode_reward_mean"]
            epoch_of_best_mean_reward = i
            path_of_best_mean_reward = checkpoint_path

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        # print(torch.cuda.memory_summary(device=None, abbreviated=False))
    return best_mean_reward, epoch_of_best_mean_reward, path_of_best_mean_reward
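This train_model (like the variant in the next example) assumes that the DuckieTown-MultiMap env id was registered with Ray beforehand. A sketch of that registration follows; the actual multi-map training env is not shown in these snippets, so the single-map simulator from the evaluation example is used purely as a stand-in, and the map name and step limit are assumptions.

from ray.tune.registry import register_env

register_env(
    "DuckieTown-MultiMap",
    lambda _: MultiMapSteeringToWheelVelWrapper(
        simulator.Simulator(
            map_name="loop_empty",   # assumed map
            max_steps=2000,          # assumed step limit
        )),
)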
Example No. 7
def train_model(args):
    # Define the trainer. Apart from env, config/framework, and config/model,
    # which are common to all trainers, the default config keys/values are
    # listed here:
    #    https://docs.ray.io/en/master/rllib-training.html#common-parameters
    # DDPG additionally accepts these keys:
    #    https://docs.ray.io/en/master/rllib-algorithms.html#ddpg
    trainer = DDPGTrainer(env="DuckieTown-MultiMap",
                          config={
                              "framework": "torch",
                              "model": {
                                  "custom_model": "image-ddpg",
                              },
                              "learning_starts": 0,
                              "train_batch_size": 16,
                          })

    # Start training from a checkpoint, if available.
    if args.model_path:
        trainer.restore(args.model_path)

    plot = plotter.Plotter('ddpg_agent')
    for i in range(args.epochs):  # Number of episodes (basically epochs)
        print(
            f'----------------------- Starting epoch {i} ----------------------- '
        )
        # train() runs one training iteration (which may span several episodes)
        result = trainer.train()
        print(result)
        plot.add_results(result)

        # Save model so far.
        checkpoint_path = trainer.save()
        print(f'Epoch {i}, checkpoint saved at: {checkpoint_path}')

        # Cleanup CUDA memory to reduce memory usage.
        torch.cuda.empty_cache()
        # Debug log to monitor memory.
        print(torch.cuda.memory_summary(device=None, abbreviated=False))

    plot.plot('DDPG DuckieTown-MultiMap')
Example No. 8
    def __init__(self, env, env_config, config):
        self.config = config
        self.config['env_config'] = env_config
        self.env = env(env_config)
        self.agent = DDPGTrainer(config=self.config, env=env)
Example No. 9
def main():
    ray.init()
    logging.getLogger().setLevel(logging.INFO)
    date = datetime.now().strftime('%Y%m%d_%H%M%S')
    parser = argparse.ArgumentParser()
    # parser.add_argument('--scenario', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--config',
                        type=str,
                        default='config/global_config.json',
                        help='config file')
    parser.add_argument('--algo',
                        type=str,
                        default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    parser.add_argument('--inference',
                        action="store_true",
                        help='inference or training')
    parser.add_argument('--ckpt', type=str, help='checkpoint path to restore for inference')
    parser.add_argument('--epoch',
                        type=int,
                        default=10,
                        help='number of training epochs')
    parser.add_argument(
        '--num_step',
        type=int,
        default=10**3,
        help='number of timesteps for one episode, and for inference')
    parser.add_argument('--save_freq',
                        type=int,
                        default=100,
                        help='model saving frequency')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='training batch size')
    parser.add_argument('--state_time_span',
                        type=int,
                        default=5,
                        help='state interval to receive long term state')
    parser.add_argument('--time_span',
                        type=int,
                        default=30,
                        help='time interval to collect data')

    args = parser.parse_args()

    model_dir = "model/{}_{}".format(args.algo, date)
    result_dir = "result/{}_{}".format(args.algo, date)

    config_env = env_config(args)
    # ray.tune.register_env('gym_cityflow', lambda env_config:CityflowGymEnv(config_env))

    config_agent = agent_config(config_env)

    # # build cityflow environment

    trainer = DDPGTrainer(env=CityflowGymEnv, config=config_agent)
    for i in range(1000):
        # Perform one iteration of training the policy with DDPG
        result = trainer.train()
        print(pretty_print(result))

        if i % 20 == 0:
            checkpoint = trainer.save()
            print("checkpoint saved at", checkpoint)
Example No. 10
        })
    results = tune.run(
        args.run,
        config=config,
        scheduler=scheduler,
        num_samples=4,
        stop=stop,
        # checkpoint_freq=10,
        checkpoint_at_end=True,
        # restore="/home/david/ray_results/SAC/SAC_FarmEnv_5aa8e_00000_0_2021-01-21_18-23-19/checkpoint_199/checkpoint-199",
    )
    if args.run == "PPO":
        agent = PPOTrainer(config=config)
    elif args.run == "SAC":
        agent = SACTrainer(config=config)
    elif args.run == "DDPG":
        agent = DDPGTrainer(config=config)

    # One entry per checkpoint of the best trial; each entry holds the
    # checkpoint path first and the metric value second.
    checkpoints = results.get_trial_checkpoints_paths(
        trial=results.get_best_trial("episode_reward_mean", mode='max'),
        metric="episode_reward_mean")
    checkpoint_path, _ = checkpoints[0]
    print(f'checkpoint_path {checkpoint_path}')
    #  agent = PPOTrainer(config=config_PPO)

    if args.as_test:
        check_learning_achieved(results, args.stop_reward)
    ray.shutdown()
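The snippet locates the best checkpoint but never loads it. Below is a minimal sketch of restoring it into agent and running one greedy episode; it would sit before the ray.shutdown() call, and make_env() is a placeholder for however the environment in config is constructed, which is not shown here.

    agent.restore(checkpoint_path)
    env = make_env()                          # hypothetical env constructor
    obs = env.reset()
    done, episode_reward = False, 0.0
    while not done:
        action = agent.compute_action(obs, explore=False)
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
    print(f"evaluation reward: {episode_reward}")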
Example No. 11
            config_copy = config.copy()
            config_copy['explore'] = False
            trainer = A2CTrainer(config=config_copy, env='Bertrand')
        elif trainer_choice == 'MADDPG':
            from ray.rllib.contrib.maddpg import MADDPGTrainer
            config['agent_id'] = 0
            # For eval afterward
            config_copy = config.copy()
            config_copy['explore'] = False
            trainer = MADDPGTrainer(config=config_copy, env='Bertrand')
        elif trainer_choice == 'DDPG':
            from ray.rllib.agents.ddpg import DDPGTrainer
            # For eval afterward
            config_copy = config.copy()
            config_copy['explore'] = False
            trainer = DDPGTrainer(config=config_copy, env='Bertrand')

        analysis = tune.run(
            trainer_choice,
            # num_samples = 4,
            config=config,
            local_dir='./log',
            stop={'training_iteration': sessions},
            mode='max',
            metric='episode_reward_mean',
            checkpoint_at_end=True)

        trainer.restore(checkpoint_path=analysis.best_checkpoint)

        # analysis = tune.run(
        #     trainer_choice,