Example #1
    def __init__(self, env, device, model_dir, args):
        self.env = env
        self.env_name = args.env_name
        self.seed = args.seed
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])
        self.batch_size = args.batch_size
        self.max_timesteps = args.max_timesteps
        self.gaussian_std = args.gaussian_std
        self.start_timesteps = args.start_timesteps
        self.eval_freq = args.eval_freq
        self.rand_action_p = args.rand_action_p

        self.model_dir = os.path.join(model_dir,
                                      f"{args.env_name}_{args.seed}")

        self.algo = DDPG(self.state_dim, self.action_dim, self.max_action,
                         device)

        self.storage = ReplayBuffer(self.state_dim, self.action_dim, device)

        self.eval_rewards = []

        self.total_steps = 0
        self.episodes = 0
        self.episode_steps = 0
        self.episode_rewards = 0

        self.state = None
Example #2
    def __init__(self):
        self.observation_space = SA_OBS_SPACE
        self.action_space = SA_ACTION_SPACE
        # self.agent = agent
        self.agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                          CA_ACTION_BOUND)
        self.dqn_solver = DQNSolver(SA_OBS_SPACE, SA_ACTION_SPACE)

        logging.basicConfig(filename="logs/log.log",
                            format='%(asctime)s %(message)s',
                            filemode='w+')
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)
Example #3
def train(env,
          nb_epochs,
          nb_epoch_cycles,
          render_eval,
          reward_scale,
          render,
          param_noise,
          actor,
          critic,
          normalize_returns,
          normalize_observations,
          critic_l2_reg,
          actor_lr,
          critic_lr,
          action_noise,
          popart,
          gamma,
          clip_norm,
          nb_train_steps,
          nb_rollout_steps,
          nb_eval_steps,
          batch_size,
          memory,
          tau=0.01,
          eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor,
                 critic,
                 memory,
                 env.observation_space.shape,
                 env.action_space.shape,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs,
                                                       apply_noise=False,
                                                       compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action
                        )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(
                                eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(
                epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(
                    eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {
                k: v / mpi_size
                for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
            }

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #4
class FCMADRL:
    def __init__(self):
        self.observation_space = SA_OBS_SPACE
        self.action_space = SA_ACTION_SPACE
        # self.agent = agent
        self.agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                          CA_ACTION_BOUND)
        self.dqn_solver = DQNSolver(SA_OBS_SPACE, SA_ACTION_SPACE)

        logging.basicConfig(filename="logs/log.log",
                            format='%(asctime)s %(message)s',
                            filemode='w+')
        self.logger = logging.getLogger()
        self.logger.setLevel(logging.DEBUG)

    def use_existing_dqn(self, dqn_model):
        self.dqn_solver.model = dqn_model

    def get_ddpg(self):
        return self.agent

    def get_dqn(self):
        return self.dqn_solver

    def get_dqn_model(self, dqn_solver):
        return dqn_solver.model

    """ 
    ca_step() is just for testing purposes 
    """

    def ca_step(self, action):
        return np.random.choice(
            SA_ACTION_SPACE,
            CA_OBS_SPACE), np.random.choice(10), np.random.choice(
                [True, False]), {}

    """ 
    sa_state(): To merge the two states received by the individual agents 
    (one from central agent and one from the environment) into one vector 
    """

    def sa_state(self, x, obs, i):
        one = x
        two = obs[i]
        three = np.array([i])
        f = np.append(one, two)
        f = np.append(f, three)
        return f

    def fcmadrl(self):
        # Randomly initialize critic,actor,target critic, target actor network and replay buffer
        exploration_noise = OUNoise(CA_ACTION_SPACE)
        counter = 0
        reward_per_episode = 0
        total_reward = 0
        num_states = CA_OBS_SPACE
        num_actions = CA_ACTION_SPACE

        self.logger.debug("Number of States:" + str(num_states))
        self.logger.debug("Number of Actions:" + str(num_actions))
        self.logger.debug("Number of Steps per episode:" + str(steps))
        # saving reward:
        reward_st = np.array([0])

        score_logger = ScoreLogger(ENV_NAME)

        # run = 0
        for i in range(episodes):
            print("==== Starting episode no:", i, "====", "\n")
            # observation = env.reset()
            observation = ca_reset()
            reward_per_episode = 0

            # run += 1
            obs = env.reset()
            # step = 0

            for t in range(steps):
                # rendering environment (optional)
                #env.render()
                print "Step: ", t

                x_arr = []
                observation_arr = []
                action_arr = []
                action_n = []
                state_arr = []
                next_state_arr = []
                action_n_arr = []

                for z in range(env.n):
                    self.take_action(action_arr, action_n, action_n_arr,
                                     exploration_noise, num_states, obs,
                                     observation, observation_arr, state_arr,
                                     x_arr, z)

                next_obs, reward_n, done_n, info_n = env.step(action_n)

                reward = reward_n[0]
                done = all(done_n)
                print "Reward_n: ", reward_n

                self.update_next_state(action_arr, next_obs, next_state_arr)
                self.memory_store(action_arr, action_n_arr, done_n,
                                  next_state_arr, observation_arr, reward_n,
                                  state_arr, x_arr)
                obs = next_obs
                # train critic and actor network
                if counter > 64:
                    self.agent.train()
                reward_per_episode += reward
                counter += 1
                # check if episode ends:
                if done or (t == steps - 1):
                    print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                    print("Printing reward to file")
                    exploration_noise.reset(
                    )  # reinitializing random noise for action exploration
                    reward_st = np.append(reward_st, reward_per_episode)
                    np.savetxt('rewards/episode_reward.txt',
                               reward_st,
                               newline="\n")

                    print "Run: " + str(i) + ", exploration: " + str(
                        self.dqn_solver.exploration_rate) + ", score: " + str(
                            reward_per_episode / t)
                    score_logger.add_score(reward_per_episode / t, i)
                    print '\n\n'

                    break
                self.dqn_solver.experience_replay()
            if (i % CHECKPOINT == 0):
                self.dqn_solver.save_dqn_model(i)

        total_reward += reward_per_episode
        print "Average reward per episode {}".format(total_reward / episodes)
        return total_reward

    def update_next_state(self, action_arr, next_obs, next_state_arr):
        for z in range(env.n):
            ns = self.sa_state(action_arr[z], next_obs, z)
            ns = np.reshape(ns, [1, self.observation_space])
            next_state_arr.append(ns)

    def take_action(self, action_arr, action_n, action_n_arr,
                    exploration_noise, num_states, obs, observation,
                    observation_arr, state_arr, x_arr, z):
        action = self.get_message(action_arr, exploration_noise, num_states,
                                  observation, x_arr)
        state = self.sa_state(action, obs, z)
        state = np.reshape(state, [1, self.observation_space])
        state_arr.append(state)
        act = self.get_final_action(action_n, action_n_arr, state)
        self.logger.debug("SA_Action: " + str(act))
        # print "CA State: ", x
        # print "CA Action: ", action
        # print "SA State: ", state
        # print "SA Action: ", act
        observation[z] = act
        observation_arr.append(np.array(list(observation)))

    def memory_store(self, action_arr, action_n_arr, done_n, next_state_arr,
                     observation_arr, reward_n, state_arr, x_arr):
        for z in range(env.n):
            # add s_t,s_t+1,action,reward to experience memory
            # print x_arr[z], observation_arr[z], action_arr[z], reward_n[z], done_n[z]
            self.agent.add_experience(x_arr[z], observation_arr[z],
                                      action_arr[z], reward_n[z], done_n[z])
            self.dqn_solver.remember(state_arr[z], action_n_arr[z],
                                     reward_n[z], next_state_arr[z], done_n[z])

    def get_final_action(self, action_n, action_n_arr, state):
        act = self.dqn_solver.act(state)
        a = np.zeros(SA_ACTION_SPACE)
        a[act] = 1.0
        action_n.append(a)
        action_n_arr.append(act)
        return act

    def get_message(self, action_arr, exploration_noise, num_states,
                    observation, x_arr):
        x = observation
        # x_arr.append(x)
        x_arr.append(np.array(list(x)))
        action = self.agent.evaluate_actor(np.reshape(x, [1, num_states]))
        noise = exploration_noise.noise()
        action = action[
            0] + noise  # Select action according to current policy and exploration noise
        action_arr.append(action)
        self.logger.debug("Action at Step: " + str(action))
        # print "Action at step", t ," :",action,"\n"
        return action
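Below is a short, hypothetical driver for the FCMADRL class above; it only uses methods shown in the snippet and assumes the globals the class itself relies on (env, episodes, steps, the SA_/CA_ constants, DDPG, DQNSolver) are already defined.

# Hypothetical usage sketch (not from the source repository).
controller = FCMADRL()
total_reward = controller.fcmadrl()       # joint DDPG (central agent) + DQN (sub-agent) training loop
trained_dqn = controller.get_dqn_model(controller.get_dqn())
controller.use_existing_dqn(trained_dqn)  # reuse the trained DQN in a later run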
Example #5
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise, actor, critic,
          normalize_returns, normalize_observations, critic_l2_reg, actor_lr, critic_lr, action_noise,
          popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps, nb_eval_steps, batch_size, memory,
          tau=0.01, eval_env=None, param_noise_adaption_interval=50):

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    saver = tf.train.Saver()

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(
                        max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                            max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
Example #6
def run_pusher3dof(args, sim=True, vanilla=False):
    try:
        from hyperdash import Experiment

        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [1.0] * 3  # if real
    colored = False

    if sim:
        torques = [args.t0, args.t1, args.t2]
        colored = True

    if not vanilla:
        env.env._init(
            torques=torques,
            colored=colored
        )

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(
        args.validate_episodes,
        args.validate_steps,
        args.output,
        max_episode_length=args.max_episode_length
    )

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "real"
            if sim: prefix = "sim"

            exp = Experiment("s2r-pusher3dof-ddpg-{}".format(prefix))
            import socket

            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("vanilla", vanilla)  # vanilla or not
            exp.param("torques", torques)
            exp.param("folder", args.output)

            for arg in ["env", "max_episode_length", "train_iter", "seed", "resume"]:
                arg_val = getattr(args, arg)
                exp.param(arg, arg_val)

        train(args, args.train_iter, agent, env, evaluate,
              args.validate_steps, args.output,
              max_episode_length=args.max_episode_length, debug=args.debug, exp=exp)

        # when done
        if exp is not None:
            exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes, agent, env, evaluate, args.resume,
             visualize=args.vis, debug=args.debug, load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
Example #7
        "fthigh": 120,
        "fshin": 60,
        "ffoot": 30
    },
    colored=False
)

if args.seed > 0:
    np.random.seed(args.seed)
    env.seed(args.seed)

nb_states = env.observation_space.shape[0]
nb_actions = env.action_space.shape[0]


agent = DDPG(nb_states, nb_actions, args)
evaluate = Evaluator(args.validate_episodes,
    args.validate_steps, args.output, max_episode_length=args.max_episode_length)

exp = None

if args.mode == 'train':
    exp = Experiment("sim2real-ddpg-real-cheetah")
    for arg in ["env", "rate", "prate", "hidden1", "hidden2", "warmup", "discount",
                "bsize", "rmsize", "window_length", "tau", "ou_theta", "ou_sigma", "ou_mu",
                "validate_episodes", "max_episode_length", "validate_steps", "init_w",
                "train_iter", "epsilon", "seed", "resume"]:
        arg_val = getattr(args, arg)
        exp.param(arg, arg_val)

    import socket
    exp.param("host", socket.gethostname())
Example #8
es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(
    env_spec=env.spec,
    hidden_sizes=(100, 100),
    hidden_nonlinearity=tf.nn.relu,
)

algo = DDPG(env=env,
            policy=policy,
            es=es,
            qf=qf,
            batch_size=64,
            max_path_length=env.horizon,
            epoch_length=1000,
            min_pool_size=10000,
            n_epochs=args.num_epochs,
            discount=0.99,
            scale_reward=args.reward_scale,
            qf_learning_rate=1e-3,
            policy_learning_rate=1e-4,
            plot=False)

run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed will be used
)

es = OUStrategy(env_spec=env.spec)

qf = ContinuousMLPQFunction(env_spec=env.spec,
                            hidden_sizes=(100,100),
                            hidden_nonlinearity=tf.nn.relu,)

algo = DDPG(
    env=env,
    policy=policy,
    es=es,
    qf=qf,
    batch_size=64,
    max_path_length=env.horizon,
    epoch_length=1000,
    min_pool_size=10000,
    n_epochs=args.num_epochs,
    discount=0.99,
    scale_reward=args.reward_scale,
    qf_learning_rate=1e-3,
    policy_learning_rate=1e-4,
    plot=False
)


run_experiment_lite(
    algo.train(),
    log_dir=None if args.use_ec2 else args.data_dir,
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
Example #10
class Base:
    def __init__(self, env, device, model_dir, args):
        self.env = env
        self.env_name = args.env_name
        self.seed = args.seed
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        self.max_action = float(self.env.action_space.high[0])
        self.batch_size = args.batch_size
        self.max_timesteps = args.max_timesteps
        self.gaussian_std = args.gaussian_std
        self.start_timesteps = args.start_timesteps
        self.eval_freq = args.eval_freq
        self.rand_action_p = args.rand_action_p

        self.model_dir = os.path.join(model_dir,
                                      f"{args.env_name}_{args.seed}")

        self.algo = DDPG(self.state_dim, self.action_dim, self.max_action,
                         device)

        self.storage = ReplayBuffer(self.state_dim, self.action_dim, device)

        self.eval_rewards = []

        self.total_steps = 0
        self.episodes = 0
        self.episode_steps = 0
        self.episode_rewards = 0

        self.state = None

    def iterate(self):
        assert self.state is not None

        self.episode_steps += 1

        if self.is_random_action():
            action = self.env.action_space.sample()
        else:
            action = (self.algo.select_action(np.array(self.state)) +
                      np.random.normal(0,
                                       self.max_action * self.gaussian_std,
                                       size=self.action_dim)).clip(
                                           -self.max_action, self.max_action)

        next_state, reward, done, _ = self.env.step(action)
        done_bool = float(
            done) if self.episode_steps < self.env._max_episode_steps else 0

        self.storage.add(self.state, action, next_state, reward, done_bool)

        self.state = next_state
        self.episode_rewards += reward

        if done:
            print(f"Total T: {self.total_steps + 1} "
                  f"Episode Num: {self.episodes + 1} "
                  f"Episode T: {self.episode_steps} "
                  f"Reward: {self.episode_rewards:.3f}")
            # Reset environment
            self.state = self.env.reset()
            self.episode_rewards = 0
            self.episode_steps = 0
            self.episodes += 1

        self.total_steps += 1

    def evaluate(self, eval_episodes=10):
        eval_env = gym.make(self.env_name)
        eval_env.seed(self.seed + 100)

        avg_reward = 0.
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = self.algo.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                avg_reward += reward

        avg_reward /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        return avg_reward
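A minimal, hypothetical outer loop for the Base trainer above, sketched from the attributes its constructor defines; MyRunner stands in for a subclass that supplies the is_random_action() check used in iterate(), and the DDPG.train call signature is an assumption, not the repository's actual API.

# Hypothetical training loop (not from the source repository).
trainer = MyRunner(env, device, model_dir, args)  # a Base subclass providing is_random_action()
trainer.state = trainer.env.reset()

while trainer.total_steps < trainer.max_timesteps:
    trainer.iterate()  # one environment step, stored in the replay buffer
    if trainer.total_steps >= trainer.start_timesteps:
        # plausible update step; the real DDPG.train signature is assumed here
        trainer.algo.train(trainer.storage, trainer.batch_size)
    if trainer.total_steps % trainer.eval_freq == 0:
        trainer.eval_rewards.append(trainer.evaluate())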
Example #11
def main():
    if sys.platform.startswith('win'):
        # Add the _win_handler function to the windows console's handler function list
        win32api.SetConsoleCtrlHandler(_win_handler, True)
    if os.path.exists(
            os.path.join(config_file.config['config_file'], 'config.yaml')):
        config = sth.load_config(config_file.config['config_file'])
    else:
        config = config_file.config
        print(f'load config from config.')

    hyper_config = config['hyper parameters']
    train_config = config['train config']
    record_config = config['record config']

    basic_dir = record_config['basic_dir']
    last_name = record_config['project_name'] + '/' \
        + record_config['remark'] \
        + record_config['run_id']
    cp_dir = record_config['checkpoint_basic_dir'] + last_name
    cp_file = cp_dir + '/rb'
    log_dir = record_config['log_basic_dir'] + last_name
    excel_dir = record_config['excel_basic_dir'] + last_name
    config_dir = record_config['config_basic_dir'] + last_name
    sth.check_or_create(basic_dir, 'basic')
    sth.check_or_create(cp_dir, 'checkpoints')
    sth.check_or_create(log_dir, 'logs(summaries)')
    sth.check_or_create(excel_dir, 'excel')
    sth.check_or_create(config_dir, 'config')

    logger = create_logger(
        name='logger',
        console_level=logging.INFO,
        console_format='%(levelname)s : %(message)s',
        logger2file=record_config['logger2file'],
        file_name=os.path.join(log_dir, 'log.txt'),
        file_level=logging.WARNING,
        file_format=
        '%(lineno)d - %(asctime)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s'
    )
    if train_config['train']:
        sth.save_config(config_dir, config)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            logger.info('Algorithm: {0}'.format(
                train_config['algorithm'].name))
            if train_config['algorithm'] == config_file.algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_SEP initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_COM initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('SAC initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                logger.info('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                logger.info('DDPG initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('TD3 initialize success.')
            recorder = Recorder(log_dir,
                                excel_dir,
                                record_config,
                                logger,
                                max_to_keep=5,
                                pad_step_number=True,
                                graph=g)
            episode = init_or_restore(cp_dir, sess, recorder, cp_file)
            try:
                if train_config['train']:
                    train_OnPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=episode,
                        model=model,
                        recorder=recorder,
                        cp_file=cp_file,
                        hyper_config=hyper_config,
                        train_config=train_config) if not train_config[
                            'use_replay_buffer'] else train_OffPolicy(
                                sess=sess,
                                env=env,
                                brain_name=brain_name,
                                begin_episode=episode,
                                model=model,
                                recorder=recorder,
                                cp_file=cp_file,
                                hyper_config=hyper_config,
                                train_config=train_config)
                    tf.train.write_graph(g,
                                         cp_dir,
                                         'raw_graph_def.pb',
                                         as_text=False)
                    export_model(cp_dir, g)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                logger.error(e)
            finally:
                env.close()
    recorder.close()
    sys.exit()
Example #12
def main():
    if sys.platform.startswith('win'):
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]
    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            print('Algorithm: {0}'.format(train_config['algorithm'].name))
            if train_config['algorithm'] == algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_SEP initialize success.')
            elif train_config['algorithm'] == algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_COM initialize success.')
            elif train_config['algorithm'] == algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('SAC initialize success.')
            elif train_config['algorithm'] == algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                print('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                print('DDPG initialize success.')
            elif train_config['algorithm'] == algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('TD3 initialize success.')
            sess.run(tf.global_variables_initializer())
            try:
                if train_config['train']:
                    train_OnPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=0,
                        model=model,
                        hyper_config=hyper_config,
                        train_config=train_config) if not train_config[
                            'use_replay_buffer'] else train_OffPolicy(
                                sess=sess,
                                env=env,
                                brain_name=brain_name,
                                begin_episode=0,
                                model=model,
                                hyper_config=hyper_config,
                                train_config=train_config)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                print(e)
            finally:
                env.close()
    sys.exit()
Example #13
def run_reacher(args, sim=True):
    try:
        from hyperdash import Experiment

        hyperdash_support = True
    except ImportError:
        hyperdash_support = False

    env = NormalizedEnv(gym.make(args.env))

    torques = [200, 200]  # if real
    colors = None
    if sim:
        torques = [args.t0, args.t1]
        colors = {
            "arenaBackground": ".27 .27 .81",
            "arenaBorders": "1.0 0.8 0.4",
            "arm0": "0.9 0.6 0.9",
            "arm1": "0.9 0.9 0.6"
        }

    env.env.env._init(  # real robot
        torque0=torques[0],  # torque of joint 1
        torque1=torques[1],  # torque of joint 2
        topDown=True,
        colors=colors)

    if args.seed > 0:
        np.random.seed(args.seed)
        env.seed(args.seed)

    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    agent = DDPG(nb_states, nb_actions, args)
    evaluate = Evaluator(args.validate_episodes,
                         args.validate_steps,
                         args.output,
                         max_episode_length=args.max_episode_length)

    exp = None

    if args.mode == 'train':
        if hyperdash_support:
            prefix = "real"
            if sim: prefix = "sim"

            exp = Experiment("s2r-reacher-ddpg-{}".format(prefix))
            import socket

            exp.param("host", socket.gethostname())
            exp.param("type", prefix)  # sim or real
            exp.param("torques", [torques[0], torques[1]])
            exp.param("folder", args.output)

            for arg in [
                    "env", "max_episode_length", "train_iter", "seed", "resume"
            ]:
                arg_val = getattr(args, arg)
                exp.param(arg, arg_val)

        train(args,
              args.train_iter,
              agent,
              env,
              evaluate,
              args.validate_steps,
              args.output,
              max_episode_length=args.max_episode_length,
              debug=args.debug,
              exp=exp)

        # when done
        if exp is not None:
            exp.end()

    elif args.mode == 'test':
        test(args.validate_episodes,
             agent,
             env,
             evaluate,
             args.resume,
             visualize=args.vis,
             debug=args.debug,
             load_best=args.best)

    else:
        raise RuntimeError('undefined mode {}'.format(args.mode))
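For context, a hypothetical way to assemble the args namespace that run_reacher reads; the attribute names mirror those accessed in the snippet, all concrete values are placeholders, and the DDPG and Evaluator constructors in the real repository likely read additional fields (see the hyperparameter list in Example #7).

# Hypothetical argument namespace for run_reacher (values are placeholders, not from the source).
from argparse import Namespace

args = Namespace(
    env="Reacher-v1",        # gym id handed to gym.make
    t0=200, t1=200,          # per-joint torques used when sim=True
    seed=1,
    mode="train",            # or "test"
    output="output/reacher",
    validate_episodes=20,
    validate_steps=2000,
    max_episode_length=500,
    train_iter=200000,
    debug=True,
    resume=None,
    vis=False,
    best=False,
)
run_reacher(args, sim=True)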