def __init__(self, env, train_flag = True, model_path = None, actor_learning_rate = 0.001, critic_learning_rate = 0.002,
                num_episodes = 1000, tau = 0.005, gamma = 0.99, memory_size = 100000, batch_size = 64):
        self.env = env
        self.train_flag = train_flag
        self.num_episodes = num_episodes
        self.tau = tau
        self.gamma = gamma
        self.batch_size = batch_size
        self.model_path = model_path

        self.actor_opt = Adam(lr = actor_learning_rate)
        self.critic_opt = Adam(lr = critic_learning_rate)

        self.states_shape = self.env.observation_space.shape[0]
        self.action_shape = self.env.action_space.shape[0]
        self.n_actions = self.env.action_space.shape[0]

        self.actions_lower_bound = env.action_space.low
        self.actions_upper_bound = env.action_space.high

        if self.train_flag:
            self.noise = OUActionNoise(mean = np.zeros(self.n_actions), std_deviation = 0.2 * np.ones(self.n_actions))
            self.actor = get_actor(self.states_shape, self.n_actions, self.actions_upper_bound)
            self.actor_target = get_actor(self.states_shape, self.n_actions, self.actions_upper_bound)
            self.critic = get_critic(self.states_shape, self.action_shape)
            self.critic_target = get_critic(self.states_shape, self.action_shape)

            self.actor_target.set_weights(self.actor.get_weights())
            self.critic_target.set_weights(self.critic.get_weights())
            self.memory = Memory(memory_size)
        else:
            self.actor = load_model(self.model_path)
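The constructor above relies on an OUActionNoise helper that is not shown. Below is a minimal sketch of such a class, assuming only the constructor signature used above (mean, std_deviation); the actual implementation in the source project may differ.

import numpy as np

class OUActionNoise:
    # Ornstein-Uhlenbeck process: temporally correlated exploration noise (sketch).
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.mean = mean
        self.std_dev = std_deviation
        self.theta = theta
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mean - self.x_prev) * self.dt
             + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x_initial if self.x_initial is not None else np.zeros_like(self.mean)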
Example #2
def train(args, env, policy_net, value_net, running_state):
    for i_episode in count(1):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                action = select_action(state, policy_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                if args.render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()

        #########################
        # TRPO update parameters
        #########################
        update_trpo = trpo.update_params(batch, value_net, policy_net, args,
                                         trpo_functions)
        update_trpo.execute()

        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))
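Most snippets in this collection build a Memory() rollout buffer and call memory.push(...) and memory.sample(). The sketch below is compatible with the push/sample calls used in this example; the field order is an assumption taken from this snippet, and other examples push fields in a different order, so the namedtuple would need to be adapted.

import random
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'mask', 'next_state', 'reward'))

class Memory:
    def __init__(self, capacity=None):
        self.capacity = capacity
        self.memory = []

    def push(self, *args):
        # Store one transition; drop the oldest one if a capacity was given.
        self.memory.append(Transition(*args))
        if self.capacity is not None and len(self.memory) > self.capacity:
            self.memory.pop(0)

    def sample(self, batch_size=None):
        # Without a batch_size, return the whole rollout batched column-wise,
        # which is what the on-policy update code above expects.
        if batch_size is None:
            return Transition(*zip(*self.memory))
        return Transition(*zip(*random.sample(self.memory, batch_size)))

    def __len__(self):
        return len(self.memory)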
Example #3
    def __init__(self, n_bits, lr, memory_size, batch_size, gamma):
        self.n_bits = n_bits
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.memory = Memory(self.memory_size)

        self.device = "cpu"
        self.model = DQN(n_inputs=2 * self.n_bits, n_outputs=n_bits).to(self.device)
        self.target_model = DQN(n_inputs=2 * self.n_bits, n_outputs=n_bits).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
        self.opt = Adam(self.model.parameters(), lr=self.lr)
        self.loss_fn = MSELoss()
        self.epsilon = 1.0
        self.epsilon_decay = 0.001
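A hedged sketch of how the epsilon fields above are typically used for epsilon-greedy action selection. choose_action and update_epsilon are illustrative names, not taken from the source project, and the state is assumed to already be the concatenated [state, goal] vector of length 2 * n_bits.

import numpy as np
import torch

def choose_action(self, state):
    # Epsilon-greedy over the n_bits discrete actions (written as a free function
    # taking self so it can be pasted into the agent class as a method).
    if np.random.rand() < self.epsilon:
        return np.random.randint(self.n_bits)  # explore
    state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
    with torch.no_grad():
        q_values = self.model(state.unsqueeze(0))
    return int(q_values.argmax(dim=-1).item())  # exploit

def update_epsilon(self):
    # Linear decay toward zero using the epsilon_decay step set in __init__.
    self.epsilon = max(self.epsilon - self.epsilon_decay, 0.0)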
Example #4
    def __init__(self, env,
                 train_flag=True,
                 model_path=None,
                 memory_size=512,
                 num_checkpoints=5,
                 num_episodes=1000,
                 batch_size=64,
                 learning_rate=0.01,
                 gamma=1.0,
                 exploration_phase=0.4,
                 train_start_episode=100,
                 eps_start=1.0,
                 eps_min=0.05,
                 eps_decay=0.999,
                 target_model_update_interval=20):
        self.env = env
        self.train_flag = train_flag
        self.model_path = model_path
        self.input_shape = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.n
        self.num_episodes = num_episodes
        self.num_checkpoints = num_checkpoints
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.gamma = gamma
        self.train_start_episode = train_start_episode
        self.eps = eps_start
        self.eps_min = eps_min
        self.eps_decay = (eps_start - eps_min) / (
            (num_episodes - train_start_episode) * exploration_phase)
        self.target_model_update_interval = target_model_update_interval

        if (self.train_flag):
            self.model = get_model(input_shape=self.input_shape,
                                   num_actions=self.num_actions,
                                   learning_rate=self.learning_rate)
            self.target_model = get_model(input_shape=self.input_shape,
                                          num_actions=self.num_actions,
                                          learning_rate=self.learning_rate)
            self.memory = Memory(self.memory_size)
        else:
            assert model_path is not None, "Please pass the path of a trained model!"
            self.model = load_model(model_path)
            print('Model Loaded!!')
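Note that the eps_decay argument is effectively unused: self.eps_decay is recomputed as a linear per-episode decrement so that epsilon falls from eps_start to eps_min over an exploration_phase fraction of the episodes after train_start_episode. A hedged sketch of how such a schedule is usually applied each episode (decay_epsilon is an illustrative name, not from the source project):

def decay_epsilon(self, episode):
    # Start decaying only once training has begun; clamp at eps_min.
    if episode >= self.train_start_episode:
        self.eps = max(self.eps - self.eps_decay, self.eps_min)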
Example #5
    def __init__(self,
                 env_id,
                 dim_latent,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_v=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 action_noise=0.1,
                 target_action_noise_std=0.2,
                 target_action_noise_clip=0.5,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 policy_update_delay=2,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.action_noise = action_noise
        self.target_action_noise_std = target_action_noise_std
        self.target_action_noise_clip = target_action_noise_clip
        self.memory = Memory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.policy_update_delay = policy_update_delay
        self.model_path = model_path
        self.seed = seed
        self.dim_latent = dim_latent

        self._init_model()
Example #6
    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(
            n_states=self.n_states,
            n_actions=self.n_actions,
            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network.load_state_dict(
            self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(),
                                 lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(),
                                 lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)
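The value target network above is initialized as a hard copy; during SAC training it is usually tracked with a Polyak (soft) update. A minimal sketch, assuming a tau coefficient and method name that are not in the source:

import torch

@torch.no_grad()
def soft_update_value_target(self, tau=0.005):
    # target <- tau * online + (1 - tau) * target, parameter by parameter.
    for target_param, param in zip(self.value_target_network.parameters(),
                                   self.value_network.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)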
Example #7
    def __init__(self, parameters):

        self.parameters = parameters
        self.env = gym.make(self.parameters['env'])
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]

        # Build our replay memory
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])

        # Create actor and critic
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])
Example #8
def experiment_2():
    env = gym.make('MountainCar-v0')
    #env = gym.make('CartPole-v0');
    state = env.reset()

    action_dim = 1
    num_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]
    memory = Memory(MEMORY_SIZE, BATCH_SIZE, action_dim, obs_dim)

    agent = LSPI(num_actions, obs_dim)

    return agent, env, memory
Example #9
def collect_samples(policy_net, min_batch_size):
    memory = Memory()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0

    while (num_steps < min_batch_size):
        state = env.reset()
        reward_sum = 0
        for t in range(10000):
            action = select_action(policy_net, state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward

            mask = 0 if done else 1

            memory.push(state, np.array([action]), mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        num_steps += (t - 1)
        num_episodes += 1
        reward_batch += reward_sum

    print(num_episodes)
    reward_batch = reward_batch / num_episodes

    batch = memory.sample()

    return batch, reward_batch
Example #10
def experiment_1():
    import gym
    #env = gym.make('InvertedPendulum-v1')
    env = gym.make('Acrobot-v0')
    state = env.reset()

    num_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]
    action_dim = 1

    memory = Memory(MEMORY_SIZE, BATCH_SIZE, action_dim, obs_dim)
    print(num_actions, obs_dim)

    agent = LSPI(num_actions, obs_dim)
    return agent, env, memory
Example #11
File: main.py  Project: haanvid/LSPI
def cartpole_experiment(basis_opt="gaussian", basis_function_dim=5):
    print('Hello CartPole world!')
    env = gym.make('CartPole-v0')

    num_actions = env.action_space.n
    state_dim = env.observation_space.shape[0]
    action_dim = 1
    print("state_dim : ", state_dim)
    print("action_dim : ", action_dim)
    print("num_actions : ", num_actions)
    print("basis_function_option : ", basis_opt)
    print("basis_function_dim : ", basis_function_dim)

    memory = Memory(MEMORY_SIZE, action_dim, state_dim)

    agent = LSPI(num_actions,
                 state_dim,
                 basis_function_dim,
                 gamma=0.99,
                 opt=basis_opt)
    return agent, env, memory, 'CartPole-v0'
Example #12
def collect_samples(pid, queue, env, policy, encoder, render, running_state,
                    custom_reward, min_batch_size):
    torch.set_num_threads(1)
    if pid > 0:
        torch.manual_seed(torch.randint(0, 5000, (1, )) * pid)
        if hasattr(env, 'np_random'):
            env.np_random.seed(env.np_random.randint(5000) * pid)
        if hasattr(env, 'env') and hasattr(env.env, 'np_random'):
            env.env.np_random.seed(env.env.np_random.randint(5000) * pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    num_episodes = 0

    min_episode_reward = float('inf')
    max_episode_reward = float('-inf')
    total_reward = 0

    while num_steps < min_batch_size:
        state = env.reset()
        episode_reward = 0
        if running_state:
            state = running_state(state)

        for t in range(10000):
            if render:
                env.render()

            enco_state = FLOAT(state).unsqueeze(0)  #.to(device)
            with torch.no_grad():
                enco_state = encoder.sample_prediction(enco_state)
            enco_state = enco_state.cpu().numpy()[0]
            state_tensor = FLOAT(enco_state).unsqueeze(0)
            with torch.no_grad():
                action, log_prob = policy.get_action_log_prob(state_tensor)
            action = action.cpu().numpy()[0]
            log_prob = log_prob.cpu().numpy()[0]
            next_state, reward, done, _ = env.step(action)
            if custom_reward:
                reward = custom_reward(state, action)
            episode_reward += reward

            if running_state:
                next_state = running_state(next_state)

            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
            memory.push(state, action, reward, next_state, mask, log_prob)
            num_steps += 1
            if done or num_steps >= min_batch_size:
                break

            state = next_state

        # num_steps += (t + 1)
        num_episodes += 1
        total_reward += episode_reward
        min_episode_reward = min(episode_reward, min_episode_reward)
        max_episode_reward = max(episode_reward, max_episode_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_episode_reward'] = max_episode_reward
    log['min_episode_reward'] = min_episode_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
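A hedged sketch of the parent-side pattern that the pid/queue branches above imply: workers with pid > 0 put [pid, memory, log] on the queue, while pid 0 runs in the parent and returns directly. The function name, argument split, and result handling below are assumptions, not from the source project.

import multiprocessing

def collect_samples_parallel(env, policy, encoder, running_state, custom_reward,
                             min_batch_size, num_workers):
    queue = multiprocessing.Queue()
    workers = []
    for pid in range(1, num_workers):
        worker = multiprocessing.Process(
            target=collect_samples,
            args=(pid, queue, env, policy, encoder, False, running_state,
                  custom_reward, min_batch_size // num_workers))
        worker.start()
        workers.append(worker)
    # Worker 0 runs in this process and returns its memory/log directly.
    memory, log = collect_samples(0, None, env, policy, encoder, False,
                                  running_state, custom_reward,
                                  min_batch_size // num_workers)
    # Drain the queue before joining to avoid blocking on full pipes.
    worker_results = [queue.get() for _ in workers]
    for worker in workers:
        worker.join()
    return memory, log, worker_results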
Example #13
File: main.py  Project: HuiminHe/rl-swing
        mean0 = Variable(mean1.data)
        log_std0 = Variable(log_std1.data)
        std0 = Variable(std1.data)
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)


running_state = ZFilter((num_inputs, ), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)

for i_episode in count(1):
    memory = Memory()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)

        reward_sum = 0
        for t in range(10000):  # Don't infinite loop while learning
            action = select_action(state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward
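Many of these snippets normalize observations with ZFilter((num_inputs,), clip=5). A minimal sketch of such a running mean/std filter is given below, inferred only from the call signatures above; the real ZFilter implementation (for example in pytorch-trpo) may differ in details such as shared statistics and the demean/destd flags.

import numpy as np

class RunningStat:
    # Welford's online mean/variance estimator.
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    @property
    def std(self):
        if self.n > 1:
            return np.sqrt(self.m2 / (self.n - 1))
        return np.ones_like(self.mean)

class ZFilter:
    # y = clip((x - mean) / (std + eps), -clip, clip) with running estimates.
    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean, self.destd, self.clip = demean, destd, clip
        self.rs = RunningStat(shape)

    def __call__(self, x):
        self.rs.push(x)
        x = np.asarray(x, dtype=np.float64)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)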
Example #14
class DDPG():
    def __init__(self, parameters):

        self.parameters = parameters
        self.env = gym.make(self.parameters['env'])
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]

        # Build our replay memory
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])

        # Create actor and critic
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])

    def train(self):

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver()

        sess = tf.Session(config=config)

        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        self.actor_critic.set_moving_to_target(sess)
        run_id = np.random.randint(10000)

        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:

            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)

            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        steps = 0

        for i in range(self.parameters['num_epochs']):

            avg_epoch_rewards = 0
            num_epochs = 1
            for e in range(self.parameters['num_episodes']):

                state = self.env.reset()

                ep_reward = 0

                # Perform rollout
                while True:
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])

                    assert action.shape == self.env.action_space.shape
                    """
					# UNCOMMENT TO PRINT ACTIONS
					a0 = tf.Summary(value=[tf.Summary.Value(tag="action_0", simple_value=action[0,0])])
					trainwriter.add_summary(a0,steps)
					a1 = tf.Summary(value=[tf.Summary.Value(tag="action_1", simple_value=action[0,1])])
					trainwriter.add_summary(a1,steps)
					a2 = tf.Summary(value=[tf.Summary.Value(tag="action_2", simple_value=action[0,2])])
					trainwriter.add_summary(a2,steps)
					steps += 1
					"""

                    next_state, reward, done, _ = self.env.step(action)

                    self.memory.add(state, action, reward, done, next_state)

                    if self.parameters['render_train']:
                        self.env.render()

                    ep_reward += reward

                    if done:

                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                    state = next_state

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / num_epochs
                num_epochs += 1

                # Perform train
                for t in range(self.parameters['num_train_steps']):
                    s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample(
                    )
                    # Train actor critic model
                    self.actor_critic.update(sess=sess,
                                             filewriter=trainwriter,
                                             state_batch=s_state,
                                             next_state_batch=s_next_state,
                                             action_batch=s_action,
                                             reward_batch=s_reward,
                                             done_batch=s_terminal)
                    sess.run(increment_global_step)

            # Print out epoch stats here

            table_data = [['Epoch', 'Average Reward'],
                          [
                              str(i) + "/" +
                              str(self.parameters['num_epochs']),
                              str(avg_epoch_rewards)
                          ]]

            table = AsciiTable(table_data, "Training Run: " + str(run_id))

            save_path = saver.save(sess, "./saves/model.ckpt")

            os.system('clear')
            print("Model saved in path: %s" % save_path + "\n" + table.table)

    def test(self):
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        saver = tf.train.Saver()
        sess = tf.Session(config=config)

        saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        while True:

            state = self.env.reset()

            # Perform rollout
            while True:
                action = self.actor_critic.pi(sess, state[None, ...])
                action = np.clip(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])

                assert action.shape == self.env.action_space.shape

                next_state, reward, done, _ = self.env.step(action)

                self.env.render()

                if done:

                    break

                state = next_state
Example #15
            cur_id += cur_batch_size
            cur_id_exp += cur_batch_size_exp

#running_state = ZFilter((num_inputs,), clip=5)
#running_reward = ZFilter((1,), demean=False, clip=10)
episode_lengths = []
optim_epochs = args.optim_epochs
optim_batch_size = args.optim_batch_size

expert = Expert(args.expert_path, num_inputs)
print('Loading expert trajectories ...')
expert.push()
print('Expert trajectories loaded.')

for i_episode in count(1):
    memory = Memory()

    num_steps = 0
    reward_batch = 0
    true_reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        c = expert.sample_c() # read c sequence from expert trajectories
        #if np.argmax(c[0,:]) == 1: # left half
        #    set_diff = list(set(product(tuple(range(0, (width/2)-3)), tuple(range(1, height)))) - set(obstacles))
        #elif np.argmax(c[0,:]) == 3: # right half
        #    set_diff = list(set(product(tuple(range(width/2, width-2)), tuple(range(2, height)))) - set(obstacles))
                
        start_loc = sample_start(set_diff)
        s = State(start_loc, obstacles)
        #state = running_state(state)
Example #16
reward_list = []
batchsize_list = []
for i_para in range(5):
    episode_list = []
    reward_tmp = []
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    batchsize_list.append(args.batch_size)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)

    for i_episode in tqdm(range(200)):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                action = select_action(state)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward
Example #17
optim_epochs = 5
optim_percentage = 0.05

for i_episode in count(1):
    ep_memory = Memory_Ep()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)
        policy_net.reset()

        reward_sum = 0
        memory = Memory()
        for t in range(10000):  # Don't infinite loop while learning
            if args.use_joint_pol_val:
                action = select_action_actor_critic(state)
            else:
                action = select_action(state)
            action = action.data[0].numpy()
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward

            next_state = running_state(next_state)

            mask = 1
            if done:
                mask = 0
Example #18
class DDPG():
    def __init__(self, parameters):

        self.parameters = parameters
        self.env = gym.make(
            self.parameters['env'][:self.parameters['env'].find('_')])
        self.nA = self.env.action_space.sample().shape[0]
        self.state_size = self.env.reset().shape[0]

        # Build our replay memory
        self.memory = Memory(replay_size=self.parameters['replay_size'],
                             action_size=self.nA,
                             state_size=self.state_size,
                             batch_size=self.parameters['batch_size'])

        # Create actor and critic
        self.actor_critic = ActorCritic(
            actor_lr=parameters['actor_learning_rate'],
            critic_lr=parameters['critic_learning_rate'],
            gamma=parameters['gamma'],
            state_size=self.state_size,
            action_size=self.nA,
            tau=parameters['tau'])

    def train(self):

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver(max_to_keep=None)

        sess = tf.Session(config=config)

        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        self.actor_critic.set_moving_to_target(sess)
        run_id = np.random.randint(10000)

        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:

            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)

            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        plots = {'critic_loss': [], 'actor_loss': [], 'episode_reward': []}

        plots_dir = './plots/'
        weights_dir = './weights/'
        graph_dir = './graph/'
        if not os.path.exists(plots_dir):
            os.makedirs(plots_dir)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)

        saver.export_meta_graph(graph_dir + self.parameters['env'] +
                                '/graph.meta')

        #cumulative step counter
        cumu_step = 0

        for i in range(self.parameters['num_epochs']):

            avg_epoch_rewards = 0
            n_epochs = 1

            for e in range(self.parameters['num_episodes']):

                state = self.env.reset()

                ep_reward = 0
                ep_n_action = 0

                # Perform rollout
                for _ in range(500):
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])

                    assert action.shape == self.env.action_space.shape

                    next_state, reward, done, _ = self.env.step(action)
                    # print(action)
                    # print(next_state)
                    # print(reward)

                    self.memory.add(state, action, reward, done, next_state)

                    if self.parameters['render_train']: self.env.render()

                    ep_reward += reward
                    ep_n_action += 1
                    cumu_step += 1
                    state = next_state

                    # Perform train
                    avg_critic_loss = 0.0
                    avg_actor_loss = 0.0
                    for t in range(self.parameters['num_train_steps']):
                        s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample(
                        )
                        # Train actor critic model
                        _, _, critic_loss, actor_loss = self.actor_critic.update(
                            sess=sess,
                            filewriter=trainwriter,
                            state_batch=s_state,
                            next_state_batch=s_next_state,
                            action_batch=s_action,
                            reward_batch=s_reward,
                            done_batch=s_terminal)
                        avg_critic_loss += critic_loss
                        avg_actor_loss += actor_loss

                        sess.run(increment_global_step)

                    avg_critic_loss /= self.parameters['num_train_steps']
                    avg_actor_loss /= self.parameters['num_train_steps']

                    if done:
                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / n_epochs
                n_epochs += 1


                print('Epoch: {:d} | Reward: {:d} | Avg_Q_loss: {:.4f} | Avg_a_loss: {:.4f} | Episode: {:d} | Step: {:d} | Cumu Step: {:d}'\
                 .format(i+1, int(ep_reward), avg_critic_loss, avg_actor_loss, e+1, ep_n_action, cumu_step))

                if e % 19 == 0:
                    save_path = saver.save(
                        sess,
                        weights_dir + self.parameters['env'] + '/model.ckpt',
                        global_step=i * e + 1)

                plots['episode_reward'].append(ep_reward)
                plots['critic_loss'].append(critic_loss)
                plots['actor_loss'].append(actor_loss)

                pickle.dump(
                    plots,
                    open(plots_dir + self.parameters['env'] + '_plot.pickle',
                         'wb'))

    def test(self):
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        saver = tf.train.Saver()
        sess = tf.Session(config=config)

        saver.restore(
            sess,
            tf.train.latest_checkpoint(
                './weights/HalfCheetah-v2_kirkiles_train50episode_noise_norm_bufsize1Mi1k'
            ))

        while True:
            state = self.env.reset()
            # Perform rollout
            while True:
                action = self.actor_critic.pi(sess, state[None, ...])
                action = np.clip(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
                #print(action)
                assert action.shape == self.env.action_space.shape
                next_state, reward, done, _ = self.env.step(action)
                self.env.render()

                if done:
                    break

                state = next_state
Example #19
File: td3.py  Project: soumyarani/mbbl
class TD3:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=1,
                 memory_size=1000000,
                 lr_p=1e-3,
                 lr_v=1e-3,
                 gamma=0.99,
                 polyak=0.995,
                 action_noise=0.1,
                 target_action_noise_std=0.2,
                 target_action_noise_clip=0.5,
                 explore_size=10000,
                 step_per_iter=3000,
                 batch_size=100,
                 min_update_step=1000,
                 update_step=50,
                 policy_update_delay=2,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.polyak = polyak
        self.action_noise = action_noise
        self.target_action_noise_std = target_action_noise_std
        self.target_action_noise_clip = target_action_noise_clip
        self.memory = Memory(memory_size)
        self.explore_size = explore_size
        self.step_per_iter = step_per_iter
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.policy_update_delay = policy_update_delay
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, num_states, self.num_actions = get_env_info(
            self.env_id)
        assert env_continuous, "TD3 is only applicable to continuous action spaces!"

        self.action_low, self.action_high = self.env.action_space.low[
            0], self.env.action_space.high[0]
        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(num_states, self.num_actions,
                                 self.action_high).to(device)
        self.policy_net_target = Policy(num_states, self.num_actions,
                                        self.action_high).to(device)

        self.value_net_1 = Value(num_states, self.num_actions).to(device)
        self.value_net_target_1 = Value(num_states,
                                        self.num_actions).to(device)
        self.value_net_2 = Value(num_states, self.num_actions).to(device)
        self.value_net_target_2 = Value(num_states,
                                        self.num_actions).to(device)

        self.running_state = ZFilter((num_states, ), clip=5)

        self.num_states = num_states

        if self.model_path:
            print("Loading Saved Model {}_td3.p".format(self.env_id))
            self.policy_net, self.value_net_1, self.value_net_2, self.running_state = pickle.load(
                open('{}/{}_td3.p'.format(self.model_path, self.env_id), "rb"))

        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target_1.load_state_dict(self.value_net_1.state_dict())
        self.value_net_target_2.load_state_dict(self.value_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v_1 = optim.Adam(self.value_net_1.parameters(),
                                        lr=self.lr_v)
        self.optimizer_v_2 = optim.Adam(self.value_net_2.parameters(),
                                        lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        # add noise
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, -self.action_high, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """evaluate model"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action = self.choose_action(state, 0)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """interact"""
        global_steps = (i_iter - 1) * self.step_per_iter
        log = dict()
        num_steps = 0
        num_episodes = 0
        total_reward = 0
        min_episode_reward = float('inf')
        max_episode_reward = float('-inf')

        while num_steps < self.step_per_iter:
            state = self.env.reset()
            episode_reward = 0

            for t in range(10000):

                if self.render:
                    self.env.render()

                if global_steps < self.explore_size:  # explore
                    action = self.env.action_space.sample()
                else:  # action with noise
                    action = self.choose_action(state, self.action_noise)

                next_state, reward, done, _ = self.env.step(action)
                # next_state = self.running_state(next_state)
                mask = 0 if done else 1
                # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
                self.memory.push(state, action, reward, next_state, mask, None)

                episode_reward += reward
                global_steps += 1
                num_steps += 1

                if global_steps >= self.min_update_step and global_steps % self.update_step == 0:
                    for k in range(self.update_step):
                        batch, permuted_batch = self.memory.sample(
                            self.batch_size)  # random sample batch
                        self.update(batch, permuted_batch, k)

                if done or num_steps >= self.step_per_iter:
                    break

                state = next_state

            num_episodes += 1
            total_reward += episode_reward
            min_episode_reward = min(episode_reward, min_episode_reward)
            max_episode_reward = max(episode_reward, max_episode_reward)

        self.env.close()

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_episode_reward'] = max_episode_reward
        log['min_episode_reward'] = min_episode_reward

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}")

        # record reward information
        writer.add_scalar("rewards/total_reward", log['total_reward'], i_iter)
        writer.add_scalar("rewards/average_reward", log['avg_reward'], i_iter)
        writer.add_scalar("rewards/min_reward", log['min_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/max_reward", log['max_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/num_steps", log['num_steps'], i_iter)

    def update(self, batch, batch2, k_iter):
        """learn model"""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by TD3
        alg_step_stats = td3_step(
            self.policy_net, self.policy_net_target, self.value_net_1,
            self.value_net_target_1, self.value_net_2, self.value_net_target_2,
            self.optimizer_p, self.optimizer_v_1, self.optimizer_v_2,
            batch_state, batch_action, batch_reward, batch_next_state,
            batch_mask, self.gamma, self.polyak, self.target_action_noise_std,
            self.target_action_noise_clip, self.action_high,
            k_iter % self.policy_update_delay == 0)

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net_1, self.value_net_2,
                     self.running_state),
                    open('{}/{}_td3.p'.format(save_path, self.env_id), 'wb'))
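A hedged usage sketch of how this TD3 class is typically driven; the environment id, log directory, and iteration counts are assumptions, not taken from the source project.

from torch.utils.tensorboard import SummaryWriter

td3 = TD3("Pendulum-v0")
writer = SummaryWriter("runs/td3_pendulum")
for i_iter in range(1, 201):
    td3.learn(writer, i_iter)       # collect step_per_iter env steps and update the networks
    if i_iter % 10 == 0:
        td3.eval(i_iter)            # greedy rollout without exploration noise
        td3.save("trained_models")  # pickle policy/value nets together with running_state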
Example #20
def train(rank, args, shared_model, opt_ac, can_save, shared_obs_stats):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [1] * 48

    if args.render and can_save:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())
        ac_net.zero_grad()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        #Tot_loss = 0
        #Tot_num =
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            last_state = process_observation(state)
            state = process_observation(state)
            last_state, state = transform_observation(last_state, state)

            state = numpy.array(state)
            #global last_state
            #last_state,_ = update_observation(last_state,state)
            #last_state,state = update_observation(last_state,state)
            #print(state.shape[0])
            #print(state[41])
            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()
            #state = running_state(state)

            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print('ERROR')
                    raise RuntimeError('action NaN problem')
                #print(action)
                #print("------------------------")
                #timer = time.time()

                BB = numpy.append(action, action)
                #print(BB)

                reward = 0
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(BB)
                    reward += A
                    _, A, _, _ = env.step(BB)
                    reward += A

                next_state, A, done, _ = env.step(BB)
                reward += A
                next_state = process_observation(next_state)
                last_state, next_state = transform_observation(
                    last_state, next_state)

                next_state = numpy.array(next_state)
                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)

                #last_state ,next_state = update_observation(last_state,next_state)
                #next_state = running_state(next_state)
                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()
                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1

            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()

        #print('env:')
        #print(time.time()-timer)

        #timer = time.time()
        update_params_actor_critic(batch, args, shared_model, ac_net, opt_ac)
        #print('backpropagate:')
        #print(time.time()-timer)

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):

            print('TrainEpisode {}\tLast reward: {}\tAverage reward {:.2f}'.
                  format(i_episode, reward_sum, reward_batch))
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': ac_net.state_dict(),
                        'optimizer': opt_ac,
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
Example #21
def main(gamma=0.995, env_name="Walker2d-v2", tau=0.97, number_of_batches=500,\
        batch_size=5000, maximum_steps=10000, render=False,\
        seed=543, log_interval=1, entropy_coeff=0.0, clip_epsilon=0.2):
    env = gym.make(env_name)
    #Get number of inputs for A3CActor
    num_inputs = env.observation_space.shape[0]
    #Get number of outputs required for describing action
    num_actions = env.action_space.shape[0]
    env.seed(seed)
    torch.manual_seed(seed)

    actor_net = A3CActor(num_inputs, num_actions)
    actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.001)

    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []
    plot_rew = []
    for i_episode in range(number_of_batches):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(maximum_steps):
                action = select_action(state, actor_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state, reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t-1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        plot_rew.append(reward_batch)
        update_params(batch, actor_net, actor_optimizer, gamma, tau, clip_epsilon)
        if i_episode % log_interval == 0:
            print('Episode {}\t Last reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))

    plot_epi = list(range(number_of_batches))
    trace = go.Scatter(x=plot_epi, y=plot_rew)
    layout = go.Layout(
        title='A2C',
        xaxis=dict(title='Episodes',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')),
        yaxis=dict(title='Average Reward',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')))

    plotly.offline.plot({"data": [trace], "layout": layout},
                        filename='PPO.html', image='jpeg')

    return
Example #22
def train(rank, args, traffic_light, counter, shared_model,
          shared_grad_buffers, shared_obs_stats, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = [0] * 41
    last_v = [0] * 10
    #last_state = numpy.zeros(48)

    env = RunEnv(visualize=False)

    #running_state = ZFilter((num_inputs,), clip=5)
    #running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    #running_state = ZFilter((num_inputs,), clip=5)

    start_time = time.time()

    for i_episode in range(args.start_epoch + 1, 999999):
        #print(shared_obs_stats.n[0])
        #print('hei')
        #if rank == 0:
        #    print(running_state.rs._n)

        signal_init = traffic_light.get()
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        #Tot_loss = 0
        #Tot_num =
        while num_steps < args.batch_size:
            #state = env.reset()
            #print(num_steps)
            state = env.reset(difficulty=0)
            #state = numpy.array(state)

            last_state, last_v, state = process_observation(
                last_state, last_v, state)

            state = numpy.array(state)

            #state = running_state(state)

            state = Variable(torch.Tensor(state).unsqueeze(0))
            shared_obs_stats.observes(state)
            state = shared_obs_stats.normalize(state)
            state = state.data[0].numpy()

            #print(state)
            #return

            #print(AA)

            #print(type(AA))
            #print(type(state))
            #print(AA.shape)
            #print(state.shape)

            reward_sum = 0
            #timer = time.time()
            for t in range(10000):  # Don't infinite loop while learning
                #print(t)
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)
                #print(action)
                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(state)
                    print(action)
                    print(ac_net.affine1.weight)
                    print(ac_net.affine1.weight.data)
                    print('ERROR')
                    #action = select_action_actor_critic(state,ac_net)
                    #action = action.data[0].numpy()
                    #state = state + numpy.random.rand(args.feature)*0.001

                    raise RuntimeError('action NaN problem')
                #print(action)
                #print("------------------------")
                #timer = time.time()
                reward = 0
                if args.skip:
                    #env.step(action)
                    _, A, _, _ = env.step(action)
                    reward += A
                    _, A, _, _ = env.step(action)
                    reward += A
                BB = numpy.append(action, action)
                next_state, A, done, _ = env.step(BB)
                reward += A
                #print(next_state)
                #last_state = process_observation(state)
                last_state, last_v, next_state = process_observation(
                    last_state, last_v, next_state)

                next_state = numpy.array(next_state)
                #print(next_state)
                #print(next_state.shape)
                #return
                reward_sum += reward
                #print('env:')
                #print(time.time()-timer)

                #last_state ,next_state = update_observation(last_state,next_state)

                #next_state = running_state(next_state)

                next_state = Variable(torch.Tensor(next_state).unsqueeze(0))
                shared_obs_stats.observes(next_state)
                next_state = shared_obs_stats.normalize(next_state)
                next_state = next_state.data[0].numpy()

                #print(next_state[41:82])

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                #if args.render:
                #    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1

            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()

        #print('env:')
        #print(time.time()-timer)

        #timer = time.time()
        update_params_actor_critic(batch, args, ac_net, opt_ac)
        shared_grad_buffers.add_gradient(ac_net)

        counter.increment()

        epoch = i_episode
        if (i_episode % args.log_interval == 0) and (rank == 0):

            print(
                'TrainEpisode {}\tTime{}\tLast reward: {}\tAverage reward {:.2f}'
                .format(
                    i_episode,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, reward_batch))

            epoch = i_episode
            if reward_batch > best_result:
                best_result = reward_batch
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, 'best')

            if epoch % 30 == 1:
                save_model(
                    {
                        'epoch': epoch,
                        'bh': args.bh,
                        'state_dict': shared_model.state_dict(),
                        'optimizer': opt_ac.state_dict(),
                        'obs': shared_obs_stats,
                    }, PATH_TO_MODEL, epoch)
        # wait for a new signal to continue
        while traffic_light.get() == signal_init:
            pass
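The worker above relies on traffic_light, counter, and shared_grad_buffers objects that are defined elsewhere in the project. As an illustration only, a minimal sketch of how such synchronization helpers are commonly built with multiprocessing shared values might look like the following; the class names and details are assumptions, not the original implementation, and shared_grad_buffers (not sketched) would similarly hold shared gradient tensors that workers add to and the chief applies.

import multiprocessing as mp


class TrafficLight:
    """Boolean flag flipped by the chief process to release waiting workers."""

    def __init__(self):
        self.val = mp.Value('b', False)
        self.lock = mp.Lock()

    def get(self):
        with self.lock:
            return self.val.value

    def switch(self):
        with self.lock:
            self.val.value = not self.val.value


class Counter:
    """Counts how many workers have pushed their gradients this round."""

    def __init__(self):
        self.val = mp.Value('i', 0)
        self.lock = mp.Lock()

    def increment(self):
        with self.lock:
            self.val.value += 1

    def get(self):
        with self.lock:
            return self.val.value

    def reset(self):
        with self.lock:
            self.val.value = 0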
Example #23
def main(gamma=0.995, env_name="Walker2d-v2", tau=0.97, number_of_batches=500,\
        batch_size=5000, maximum_steps=10000, render=False,\
        seed=543, log_interval=1, entropy_coeff=0.0, clip_epsilon=0.2):
    env = gym.make(env_name)
    #Get number of inputs for A3CActor
    num_inputs = env.observation_space.shape[0]
    #Get number of outputs required for describing action
    num_actions = env.action_space.shape[0]
    env.seed(seed)
    torch.manual_seed(seed)

    actor_net = A3CActor(num_inputs, num_actions)
    actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.001)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []

    for i_episode in range(number_of_batches):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(maximum_steps):
                action = select_action(state, actor_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t - 1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        update_params(batch, actor_net, actor_optimizer, gamma, tau,
                      clip_epsilon)
        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))
    return
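The update_params function called above is not part of this snippet. The following is a minimal clipped-surrogate (PPO-style) sketch that matches the arguments passed to it; the GAE computation, the ppo_epochs parameter, and the assumption that actor_net returns (mean, log_std, std, value) are mine, not the original author's.

import numpy as np
import torch
from torch.autograd import Variable


def normal_log_density(x, mean, log_std, std):
    # log-density of a diagonal Gaussian, summed over action dimensions
    var = std.pow(2)
    log_density = -(x - mean).pow(2) / (2 * var) - 0.5 * np.log(2 * np.pi) - log_std
    return log_density.sum(1, keepdim=True)


def update_params(batch, actor_net, actor_optimizer, gamma, tau, clip_epsilon,
                  ppo_epochs=10):
    rewards = torch.Tensor(batch.reward)
    masks = torch.Tensor(batch.mask)
    actions = torch.Tensor(np.concatenate(batch.action, 0))
    states = torch.Tensor(batch.state)

    # assumed network interface: returns action mean, log-std, std and a value estimate
    action_means, action_log_stds, action_stds, values = actor_net(Variable(states))

    # Generalized Advantage Estimation over the collected batch
    returns = torch.Tensor(actions.size(0), 1)
    advantages = torch.Tensor(actions.size(0), 1)
    prev_return, prev_value, prev_advantage = 0, 0, 0
    for i in reversed(range(rewards.size(0))):
        returns[i] = rewards[i] + gamma * prev_return * masks[i]
        delta = rewards[i] + gamma * prev_value * masks[i] - values.data[i]
        advantages[i] = delta + gamma * tau * prev_advantage * masks[i]
        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # log-probabilities under the data-collecting policy, kept fixed for the ratio
    fixed_log_prob = normal_log_density(Variable(actions), action_means,
                                        action_log_stds, action_stds).data.clone()

    for _ in range(ppo_epochs):
        action_means, action_log_stds, action_stds, values = actor_net(Variable(states))
        log_prob = normal_log_density(Variable(actions), action_means,
                                      action_log_stds, action_stds)
        ratio = torch.exp(log_prob - Variable(fixed_log_prob))
        surr1 = ratio * Variable(advantages)
        surr2 = torch.clamp(ratio, 1.0 - clip_epsilon,
                            1.0 + clip_epsilon) * Variable(advantages)
        policy_loss = -torch.min(surr1, surr2).mean()
        value_loss = (values - Variable(returns)).pow(2).mean()

        actor_optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        actor_optimizer.step()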
Example #24
        std0 = Variable(std1.data)
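        # Per-dimension KL between the detached old policy N(mean0, std0) and the
        # current policy N(mean1, std1):
        #   KL = log(std1 / std0) + (std0**2 + (mean0 - mean1)**2) / (2 * std1**2) - 0.5
        # summed over the action dimensions below.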
        kl = log_std1 - log_std0 + (std0.pow(2) + (mean0 - mean1).pow(2)) / (
            2.0 * std1.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    trpo_step(policy_net, get_loss, get_kl, args.max_kl, args.damping)


running_state = ZFilter((num_inputs, ), clip=5)
running_reward = ZFilter((1, ), demean=False, clip=10)

plt.ion()
reward_plot = []
reward_batch_0 = 1000
for i_episode in count(1):
    memory_pro = Memory()
    memory_adv = Memory()

    num_steps = 0
    reward_batch = 0
    num_episodes = 0
    while num_steps < args.batch_size:
        state = env.reset()
        state = running_state(state)

        reward_sum = 0
        for t in range(10000):  # Don't infinite loop while learning
            action_pro = select_action(policy_net_pro, value_net_pro, state)
            action_pro = action_pro.data[0].numpy()
            action_adv = select_action(policy_net_adv, value_net_adv, state)
            action_adv = action_adv.data[0].numpy()
Example #25
                                                - set(obstacles))
            elif np.argmax(c[0, :]) == 3:  # right half
                set_diff = list(set(product(tuple(range(width//2, width-2)),
                                            tuple(range(2, height)))) \
                                                - set(obstacles))

        start_loc = sample_start(set_diff)
        s = State(start_loc, obstacles)
        #state = running_state(state)
        policy_net.reset()
        reward_net.reset()
        R.reset()

        reward_sum = 0
        true_reward_sum = 0
        memory = Memory()

        for t in range(
                args.max_ep_length):  # Don't infinite loop while learning
            ct = c[t, :]
            action = select_action(np.concatenate((s.state, ct)))
            action = epsilon_greedy_linear_decay(action.data.cpu().numpy(),
                                                 args.num_episodes * 0.5,
                                                 i_episode,
                                                 low=0.05,
                                                 high=0.5)
            reward = -float(
                reward_net(
                    torch.cat(
                        (Variable(torch.from_numpy(
                            s.state).unsqueeze(0)).type(dtype),
Example #26
class SAC:
    def __init__(self, env_name, n_states, n_actions, memory_size, batch_size,
                 gamma, alpha, lr, action_bounds, reward_scale):
        self.env_name = env_name
        self.n_states = n_states
        self.n_actions = n_actions
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.action_bounds = action_bounds
        self.reward_scale = reward_scale
        self.memory = Memory(memory_size=self.memory_size)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.policy_network = PolicyNetwork(
            n_states=self.n_states,
            n_actions=self.n_actions,
            action_bounds=self.action_bounds).to(self.device)
        self.q_value_network1 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.q_value_network2 = QvalueNetwork(n_states=self.n_states,
                                              n_actions=self.n_actions).to(
                                                  self.device)
        self.value_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network = ValueNetwork(n_states=self.n_states).to(
            self.device)
        self.value_target_network.load_state_dict(
            self.value_network.state_dict())
        self.value_target_network.eval()

        self.value_loss = torch.nn.MSELoss()
        self.q_value_loss = torch.nn.MSELoss()

        self.value_opt = Adam(self.value_network.parameters(), lr=self.lr)
        self.q_value1_opt = Adam(self.q_value_network1.parameters(),
                                 lr=self.lr)
        self.q_value2_opt = Adam(self.q_value_network2.parameters(),
                                 lr=self.lr)
        self.policy_opt = Adam(self.policy_network.parameters(), lr=self.lr)

    def store(self, state, reward, done, action, next_state):
        state = from_numpy(state).float().to("cpu")
        reward = torch.Tensor([reward]).to("cpu")
        done = torch.Tensor([done]).to("cpu")
        action = torch.Tensor([action]).to("cpu")
        next_state = from_numpy(next_state).float().to("cpu")
        self.memory.add(state, reward, done, action, next_state)

    def unpack(self, batch):
        batch = Transition(*zip(*batch))

        states = torch.cat(batch.state).view(self.batch_size,
                                             self.n_states).to(self.device)
        rewards = torch.cat(batch.reward).view(self.batch_size,
                                               1).to(self.device)
        dones = torch.cat(batch.done).view(self.batch_size, 1).to(self.device)
        actions = torch.cat(batch.action).view(-1,
                                               self.n_actions).to(self.device)
        next_states = torch.cat(batch.next_state).view(
            self.batch_size, self.n_states).to(self.device)

        return states, rewards, dones, actions, next_states

    def train(self):
        if len(self.memory) < self.batch_size:
            return 0, 0, 0
        else:
            batch = self.memory.sample(self.batch_size)
            states, rewards, dones, actions, next_states = self.unpack(batch)

            # Calculating the value target
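            # V_target(s) = min_i Q_i(s, a~) - alpha * log pi(a~ | s),
            # where a~ is sampled from the current policy via the reparameterization trick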
            reparam_actions, log_probs = self.policy_network.sample_or_likelihood(
                states)
            q1 = self.q_value_network1(states, reparam_actions)
            q2 = self.q_value_network2(states, reparam_actions)
            q = torch.min(q1, q2)
            target_value = q.detach() - self.alpha * log_probs.detach()

            value = self.value_network(states)
            value_loss = self.value_loss(value, target_value)

            # Calculating the Q-Value target
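            # Q_target = reward_scale * r + gamma * (1 - done) * V_target(s')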
            with torch.no_grad():
                target_q = self.reward_scale * rewards + \
                           self.gamma * self.value_target_network(next_states) * (1 - dones)
            q1 = self.q_value_network1(states, actions)
            q2 = self.q_value_network2(states, actions)
            q1_loss = self.q_value_loss(q1, target_q)
            q2_loss = self.q_value_loss(q2, target_q)

            policy_loss = (self.alpha * log_probs - q).mean()

            self.policy_opt.zero_grad()
            policy_loss.backward()
            self.policy_opt.step()

            self.value_opt.zero_grad()
            value_loss.backward()
            self.value_opt.step()

            self.q_value1_opt.zero_grad()
            q1_loss.backward()
            self.q_value1_opt.step()

            self.q_value2_opt.zero_grad()
            q2_loss.backward()
            self.q_value2_opt.step()

            self.soft_update_target_network(self.value_network,
                                            self.value_target_network)

            return value_loss.item(), 0.5 * (
                q1_loss + q2_loss).item(), policy_loss.item()

    def choose_action(self, states):
        states = np.expand_dims(states, axis=0)
        states = from_numpy(states).float().to(self.device)
        action, _ = self.policy_network.sample_or_likelihood(states)
        return action.detach().cpu().numpy()[0]

    @staticmethod
    def soft_update_target_network(local_network, target_network, tau=0.005):
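        # Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target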
        for target_param, local_param in zip(target_network.parameters(),
                                             local_network.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)

    def save_weights(self):
        torch.save(self.policy_network.state_dict(),
                   self.env_name + "_weights.pth")

    def load_weights(self):
        self.policy_network.load_state_dict(
            torch.load(self.env_name + "_weights.pth"))

    def set_to_eval_mode(self):
        self.policy_network.eval()
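A minimal driver loop for the SAC agent above might look like this sketch. The environment name, hyper-parameter values, the action_bounds format, and Gym's classic 4-tuple step API are placeholders and assumptions, not part of the original source.

import gym

env = gym.make("Pendulum-v0")  # placeholder environment
agent = SAC(env_name="Pendulum-v0",
            n_states=env.observation_space.shape[0],
            n_actions=env.action_space.shape[0],
            memory_size=100000,
            batch_size=256,
            gamma=0.99,
            alpha=0.2,
            lr=3e-4,
            # assumed format: [low, high] of the (symmetric) action range
            action_bounds=[env.action_space.low[0], env.action_space.high[0]],
            reward_scale=5)

for episode in range(100):
    state = env.reset()
    episode_reward, done = 0, False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store(state, reward, done, action, next_state)
        value_loss, q_loss, policy_loss = agent.train()
        episode_reward += reward
        state = next_state
    print("Episode {}: reward {:.1f}".format(episode, episode_reward))
agent.save_weights()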
Example #27
File: CoTGAIL.py  Project: Ark0617/CoTGAIL
                args.weight,
                'mixture',
                args.prior,
                args.traj_size,
                folder=args.ofolder,
                fname=fname,
                noise=args.noise)
if not args.only and not args.is_eval and args.use_cot and args.weight:
    labeled_traj = torch.Tensor(expert_traj[:num_label, :]).to(device)
    unlabeled_traj = torch.Tensor(expert_traj[num_label:, :]).to(device)
    label = torch.Tensor(expert_conf[:num_label, :]).to(device)
    batch_size = min(128, labeled_traj.shape[0])
    glfw.init()
    print('start training')
    for episode in range(args.num_epochs):
        memory = Memory()
        num_steps = 0
        num_episodes = 0
        reward_batch = []
        states = []
        actions = []
        mem_actions = []
        mem_mask = []
        mem_next = []
        sup_losses = []
        label = torch.Tensor(expert_conf[:num_label, :]).to(device)
        for i in range(args.sup_iters_per_episode):
            idx = np.random.choice(labeled_traj.shape[0], batch_size)
            labeled_state_action_batch = labeled_traj[idx, :]
            true_conf_batch = label[idx, :]
            sup_loss = sup_train_discriminator_one_step(
Example #28
def main(gamma=0.995, env_name='Walker2d-v2', tau=0.97, seed=543, number_of_batches=500,\
        batch_size=5000, maximum_steps=10000, render=False, log_interval=1, entropy_coeff=0.0,\
        clip_epsilon=0.2, use_joint_pol_val=False):

    torch.set_default_tensor_type('torch.DoubleTensor')
    PI = torch.DoubleTensor([3.1415926])

    env = gym.make(env_name)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    env.seed(seed)
    torch.manual_seed(seed)

    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
    opt_policy = optim.Adam(policy_net.parameters(), lr=0.001)
    opt_value = optim.Adam(value_net.parameters(), lr=0.001)

    running_state = ZFilter((num_inputs,), clip=5)
    running_reward = ZFilter((1,), demean=False, clip=10)
    episode_lengths = []
    plot_rew = []
    for i_episode in range(number_of_batches):
        memory = Memory()

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < batch_size:
            state = env.reset()
            state = running_state(state)

            reward_sum = 0
            for t in range(maximum_steps): # Don't infinite loop while learning
                action = select_action(state, policy_net)
                action = action.data[0].numpy()
                next_state, reward, done, _ = env.step(action)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state, reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state
            num_steps += (t-1)
            num_episodes += 1
            reward_batch += reward_sum

        reward_batch /= num_episodes
        batch = memory.sample()
        plot_rew.append(reward_batch)
        update_params(batch, policy_net, value_net, gamma, opt_policy, opt_value)

        if i_episode % log_interval == 0:
            print('Episode {}\tLast reward: {}\tAverage reward {:.2f}'.format(
                i_episode, reward_sum, reward_batch))
    
    plot_epi = list(range(number_of_batches))
    trace = go.Scatter(x=plot_epi, y=plot_rew)
    layout = go.Layout(
        title='PPO',
        xaxis=dict(title='Episodes',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')),
        yaxis=dict(title='Average Reward',
                   titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f')))

    plotly.offline.plot({"data": [trace], "layout": layout},
                        filename='PPO.html', image='jpeg')
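ZFilter is used throughout these examples but never defined in them. It is typically a running mean/standard-deviation normalizer with optional clipping; the sketch below follows that common pattern and is an illustration, not the original code.

import numpy as np


class RunningStat:
    """Tracks the running mean and variance of a stream of vectors (Welford's method)."""

    def __init__(self, shape):
        self._n = 0
        self._M = np.zeros(shape)
        self._S = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self._n += 1
        if self._n == 1:
            self._M[...] = x
        else:
            old_M = self._M.copy()
            self._M[...] = old_M + (x - old_M) / self._n
            self._S[...] = self._S + (x - old_M) * (x - self._M)

    @property
    def mean(self):
        return self._M

    @property
    def std(self):
        return np.sqrt(self._S / (self._n - 1)) if self._n > 1 else np.ones_like(self._M)


class ZFilter:
    """y = clip((x - mean) / std, -clip, clip), with statistics updated online."""

    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean, self.destd, self.clip = demean, destd, clip
        self.rs = RunningStat(shape)

    def __call__(self, x, update=True):
        if update:
            self.rs.push(x)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std + 1e-8)
        if self.clip:
            x = np.clip(x, -self.clip, self.clip)
        return x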
Example #29
def test(rank, args, shared_model, opt_ac):
    best_result = -1000
    torch.manual_seed(args.seed + rank)
    torch.set_default_tensor_type('torch.DoubleTensor')
    num_inputs = args.feature
    num_actions = 9
    last_state = numpy.zeros(41)

    if args.render:
        env = RunEnv(visualize=True)
    else:
        env = RunEnv(visualize=False)

    running_state = ZFilter((num_inputs, ), clip=5)
    running_reward = ZFilter((1, ), demean=False, clip=10)
    episode_lengths = []

    PATH_TO_MODEL = '../models/' + str(args.bh)

    ac_net = ActorCritic(num_inputs, num_actions)

    start_time = time.time()

    for i_episode in count(1):
        memory = Memory()
        ac_net.load_state_dict(shared_model.state_dict())

        num_steps = 0
        reward_batch = 0
        num_episodes = 0
        while num_steps < args.batch_size:
            state = env.reset(difficulty=0)
            state = numpy.array(state)
            state = running_state(state)

            reward_sum = 0
            for t in range(10000):  # Don't infinite loop while learning
                if args.use_sep_pol_val:
                    action = select_action(state)
                else:
                    action = select_action_actor_critic(state, ac_net)

                action = action.data[0].numpy()
                if numpy.any(numpy.isnan(action)):
                    print(action)
                    print('ERROR: action contains NaN, stopping this test worker')
                    return
                if args.skip:
                    #env.step(action)
                    _, reward, _, _ = env.step(action)
                    reward_sum += reward
                next_state, reward, done, _ = env.step(action)
                next_state = numpy.array(next_state)
                reward_sum += reward

                next_state = running_state(next_state)

                mask = 1
                if done:
                    mask = 0

                memory.push(state, np.array([action]), mask, next_state,
                            reward)

                if done:
                    break

                state = next_state

            num_steps += (t - 1)
            num_episodes += 1
            #print(num_episodes)
            reward_batch += reward_sum

        #print(num_episodes)
        reward_batch /= num_episodes
        batch = memory.sample()

        # the test worker only evaluates the shared model; parameter updates happen in the training workers
        time.sleep(60)

        if i_episode % args.log_interval == 0:
            record = "Time {}, episode reward {}, Average reward {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, reward_batch)
            with open(PATH_TO_MODEL + '/record.txt', 'a+') as record_file:
                record_file.write(record + '\n')
            print(record)

        epoch = i_episode
        if reward_batch > best_result:
            best_result = reward_batch
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, 'best')

        if epoch % 30 == 1:
            save_model(
                {
                    'epoch': epoch,
                    'bh': args.bh,
                    'state_dict': shared_model.state_dict(),
                    'optimizer': opt_ac.state_dict(),
                }, PATH_TO_MODEL, epoch)
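save_model is another helper that is not shown in this listing. A typical checkpointing helper compatible with the calls above could look like this sketch; the filename pattern is an assumption.

import os
import torch


def save_model(state, path, tag):
    """Persist a training checkpoint dict under <path>/model_<tag>.pth."""
    os.makedirs(path, exist_ok=True)
    torch.save(state, os.path.join(path, 'model_{}.pth'.format(tag)))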
Example #30
            elif np.argmax(c[0,:]) == 3: # right half
                set_diff = list(set(product(tuple(range(width//2, width-2)),
                                            tuple(range(2, height)))) \
                                                    - set(obstacles))

        start_loc = sample_start(set_diff)
        s = State(start_loc, obstacles)
        #state = running_state(state)
        policy_net.reset()
        reward_net.reset()
        posterior_net.reset()
        R.reset()

        reward_sum = 0
        true_reward_sum = 0
        memory = Memory()

        for t in range(args.max_ep_length): # Don't infinite loop while learning
            ct = c[t,:]
            action = select_action(np.concatenate((s.state, ct)))
            action = epsilon_greedy_linear_decay(action.data.cpu().numpy(),
                                                 args.num_episodes * 0.5,
                                                 i_episode,
                                                 low=0.05,
                                                 high=0.3)
            reward = -float(reward_net(torch.cat((
                Variable(torch.from_numpy(
                    s.state).unsqueeze(0)).type(dtype),
                Variable(torch.from_numpy(
                    oned_to_onehot(action)).unsqueeze(0)).type(dtype),
                Variable(torch.from_numpy(