예제 #1
0
    def __init__(self, state_size, action_size, random_seed):
        """Build the actor/critic networks, exploration noise and replay memory.

        Args:
        ======
            state_size (int): state dim
            action_size (int): action dim
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed the module-level RNG first and
        # keep the actual seed value on the instance instead of None.
        random.seed(random_seed)
        self.seed = random_seed

        # actor net initialization (local network is the one being optimized)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # critic net initialization (local network is the one being optimized)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck Exploration Noise Process
        self.noise = OUNoise(action_space=action_size, seed=random_seed)

        # Replay memory init
        self.memory = Memory(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    def __init__(self, hidden_size, env):
        """Create online/target actor and critic networks plus their optimizers.

        Args:
            hidden_size: hidden layer width passed to every network.
            env: environment whose ``observation_space.shape[0]`` and
                ``action_space.shape[0]`` give the state and action sizes.
        """
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        self.Actor = Actor(input_size=self.num_states,
                           hidden_size=hidden_size,
                           output_size=self.num_actions).cuda()

        self.Actor_target = Actor(input_size=self.num_states,
                                  hidden_size=hidden_size,
                                  output_size=self.num_actions).cuda()

        self.Critic = Critic(input_size=self.num_states,
                             hidden_size=hidden_size,
                             output_size=self.num_actions).cuda()

        self.Critic_target = Critic(input_size=self.num_states,
                                    hidden_size=hidden_size,
                                    output_size=self.num_actions).cuda()

        # Start the targets from the same weights as the online networks.
        # copy_ writes the values into the target's own storage; assigning
        # ``target_param.data = param.data`` would make both parameters share
        # one tensor, so every optimizer step on the online network would
        # silently move the target network too.
        for target_param, param in zip(self.Actor_target.parameters(),
                                       self.Actor.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.Critic_target.parameters(),
                                       self.Critic.parameters()):
            target_param.data.copy_(param.data)

        self.Memory = Memory(30000)
        self.criterion = nn.MSELoss().cuda()
        self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(),
                                                lr=1e-2)
        self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(),
                                                 lr=1e-1)
예제 #3
0
 def collect_samples(self, mini_batch_size, size=1):
     """Roll out trajectories until ``mini_batch_size`` transitions are stored.

     Each trajectory starts from ``self.reset(size)`` and takes up to
     ``self.max_traj_length - 1`` steps stored with a mask of ones, followed
     by one final step stored with a mask of zeros. Collection stops as soon
     as the number of pushed transitions reaches ``mini_batch_size``.

     Returns:
         The ``Memory`` object holding the pushed transitions.
     """
     buffer = Memory()
     pushed = 0

     def _store(d_state, c_state, d_act, c_act, d_next, c_next, log_prob, flag):
         # Discrete tensors are cast to FloatTensor before being stored.
         buffer.push(d_state.type(FloatTensor), c_state,
                     d_act.type(FloatTensor),
                     c_act,
                     d_next.type(FloatTensor),
                     c_next, log_prob, flag)

     while pushed < mini_batch_size:
         d_state, c_state = self.reset(size)
         for _ in range(self.max_traj_length - 1):
             with torch.no_grad():
                 d_act, c_act, d_next, c_next, log_prob = self.step(
                     d_state, c_state, size)
             # Exploration is assumed not to be done before max_traj_length.
             not_done = torch.ones((size, 1), device=device)
             _store(d_state, c_state, d_act, c_act, d_next, c_next,
                    log_prob, not_done)
             d_state, c_state = d_next, c_next
             pushed += 1
             if pushed >= mini_batch_size:
                 return buffer
         # One extra step whose mask marks the end of the trajectory.
         with torch.no_grad():
             d_act, c_act, d_next, c_next, log_prob = self.step(
                 d_state, c_state, size)
             finished = torch.zeros((size, 1), device=device)
             _store(d_state, c_state, d_act, c_act, d_next, c_next,
                    log_prob, finished)
             pushed += 1
     return buffer
예제 #4
0
 def inference_speed_memory(self, batch_size, seq_length):
     """Benchmark one forward pass and, on GPU, record device memory in use.

     Args:
         batch_size: number of rows in the random ``input_ids`` batch.
         seq_length: number of token ids per row.

     Returns:
         A tuple ``(seconds_per_call, memory)``: the minimum repeat time
         divided by the number of calls per repeat, and a ``Memory`` built
         from the NVML "used" byte count (``None`` when not on GPU).
     """
     calls_per_repeat = 3
     statement = "ref_step().block_until_ready()"

     key = jax.random.PRNGKey(0)
     input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)

     @jax.jit
     def ref_step():
         out = self.model(input_ids=input_ids)
         return out[0]

     # timeit only needs to see ref_step when it executes the statement.
     namespace = {"ref_step": ref_step}

     if jax.local_devices()[0].platform == 'gpu':
         nvml.nvmlInit()
         try:
             # Wait for the result so the memory reading is taken after the
             # computation has actually run.
             ref_step().block_until_ready()
             handle = nvml.nvmlDeviceGetHandleByIndex(0)
             meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
             memory = Memory(meminfo.used)
         finally:
             # Always release NVML, even if the measurement above fails.
             nvml.nvmlShutdown()
     else:
         memory = None

     # Untimed warm-up run before the measured repeats.
     timeit.repeat(statement, repeat=1, number=2, globals=namespace)
     if self.jit:
         runtimes = timeit.repeat(statement, repeat=self.repeat,
                                  number=calls_per_repeat, globals=namespace)
     else:
         with jax.disable_jit():
             runtimes = timeit.repeat(statement, repeat=self.repeat,
                                      number=calls_per_repeat, globals=namespace)
     return float(np.min(runtimes) / float(calls_per_repeat)), memory
예제 #5
0
 def collect_samples(self, batch_size=1):
     """Roll out ``num_trajs`` parallel trajectories and store every step.

     ``num_trajs`` is ``batch_size`` divided by ``args.sample_traj_length``,
     rounded up. ``self.max_traj_length - 1`` steps are stored with a mask of
     ones, then one final step is stored with a mask of zeros.

     Returns:
         A tuple ``(memory, num_trajs)``.
     """
     buffer = Memory()
     traj_len = args.sample_traj_length
     n_traj = (batch_size + traj_len - 1) // traj_len  # ceiling division

     def _store(oh_s, mh_s, c_s, oh_a, mh_a, c_a, oh_n, mh_n, c_n, log_prob, flag):
         # One-hot / multi-hot tensors are cast to FloatTensor before storage.
         buffer.push(oh_s.type(FloatTensor),
                     mh_s.type(FloatTensor), c_s,
                     oh_a.type(FloatTensor),
                     mh_a.type(FloatTensor), c_a,
                     oh_n.type(FloatTensor),
                     mh_n.type(FloatTensor),
                     c_n, log_prob, flag)

     oh_s, mh_s, c_s = self.reset(n_traj)
     for _ in range(self.max_traj_length - 1):
         with torch.no_grad():
             oh_a, mh_a, c_a, oh_n, mh_n, c_n, log_prob = self.step(
                 oh_s, mh_s, c_s, n_traj)
         # Exploration is assumed not to be done before max_traj_length.
         not_done = torch.ones((n_traj, 1), device=device)
         _store(oh_s, mh_s, c_s, oh_a, mh_a, c_a, oh_n, mh_n, c_n,
                log_prob, not_done)
         oh_s, mh_s, c_s = oh_n, mh_n, c_n

     # One extra step whose mask marks the end of the trajectory.
     with torch.no_grad():
         oh_a, mh_a, c_a, oh_n, mh_n, c_n, log_prob = self.step(
             oh_s, mh_s, c_s, n_traj)
         finished = torch.zeros((n_traj, 1), device=device)
         _store(oh_s, mh_s, c_s, oh_a, mh_a, c_a, oh_n, mh_n, c_n,
                log_prob, finished)
     return buffer, n_traj
예제 #6
0
 def __init__(self, num_actions, checkpoint=None):
     """Set up the network, its Adam optimizer and an empty memory.

     Args:
         num_actions: forwarded to ``self.init_network``.
         checkpoint: optional checkpoint handed to ``load_checkpoint``
             together with the network and optimizer; skipped when ``None``.
     """
     network, parameters = self.init_network(num_actions)
     self.network = network
     self.trainable_parameters = parameters
     self.optimizer = torch.optim.Adam(parameters, lr=1e-4)
     self.memory = Memory()
     if checkpoint is None:
         return
     load_checkpoint(network, self.optimizer, checkpoint)
예제 #7
0
    def __init__(self,
                 model_func,
                 n_way,
                 n_support,
                 jigsaw=False,
                 lbda=0.0,
                 rotation=False,
                 tracking=False,
                 use_bn=True,
                 pretrain=False,
                 image_loader=None,
                 len_dataset=None):
        """Initialise the ProtoNet with a feature memory and optional SSL heads.

        ``model_func``, ``n_way``, ``n_support``, ``use_bn`` and ``pretrain``
        are forwarded to the parent constructor. ``jigsaw`` and ``rotation``
        switch on the corresponding auxiliary layers, ``lbda`` is stored
        as-is, and ``image_loader`` / ``len_dataset`` are used to build and
        initialise the memory. ``tracking`` is accepted but not used here.
        """
        super(ProtoNet, self).__init__(model_func, n_way, n_support, use_bn,
                                       pretrain)
        self.loss_fn = nn.CrossEntropyLoss()

        self.len_dataset = len_dataset
        # NOTE(review): layers created further down are added after this
        # cuda() call, so it does not move them - presumably the caller moves
        # the whole model again; confirm.
        self.cuda()
        feature_memory = Memory(size=len_dataset, weight=0.5, device='cuda')
        self.memory = feature_memory
        feature_memory.initialize(self.feature, image_loader)

        self.jigsaw = jigsaw
        self.rotation = rotation
        self.lbda = lbda
        self.global_count = 0

        self.indx = 0

        if jigsaw:
            # Self-supervision branch: projects 9 concatenated 512-d features.
            self.projection_transformed_features = nn.Linear(512 * 9, 512)

        if rotation:
            # Rotation head: 512 -> 512 -> 128 -> 4 (sizes noted "for resnet").
            first_stage = nn.Sequential()
            first_stage.add_module('fc6_s1', nn.Linear(512, 512))
            first_stage.add_module('relu6_s1', nn.ReLU(inplace=True))
            first_stage.add_module('drop6_s1', nn.Dropout(p=0.5))
            self.fc6 = first_stage

            second_stage = nn.Sequential()
            second_stage.add_module('fc7', nn.Linear(512, 128))
            second_stage.add_module('relu7', nn.ReLU(inplace=True))
            second_stage.add_module('drop7', nn.Dropout(p=0.5))
            self.fc7 = second_stage

            rotation_head = nn.Sequential()
            rotation_head.add_module('fc8', nn.Linear(128, 4))
            self.classifier_rotation = rotation_head
예제 #8
0
def main():
    """Run the direct-policy-search agent on a gym environment.

    Parses the command-line options, then alternates between acting in the
    environment, storing the transition in replay memory and training the
    agent on a batch of sampled states. Episode totals are printed each time
    the environment reports ``done``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    # NOTE(review): --learning-rate is parsed but not used below - confirm.
    parser.add_argument("--learning-rate", type=float, default=.9)
    args = parser.parse_args()

    env = gym.make(args.environment)
    unroll = 20

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    agent = direct_policy_search(state_dim, action_dim, action_bound_high,
                                 action_bound_low, unroll, .9, 5,
                                 'direct_policy_search')

    # Replay memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store tuple in replay memory
            memory.add([
                np.atleast_2d(state),
                np.atleast_2d(action), reward,
                np.atleast_2d(next_state), done
            ])

            # Training step. Only the stored states are needed, so pick them
            # out directly instead of wrapping the mixed-shape transitions in
            # a ragged object array (deprecated, then an error, in newer
            # NumPy releases).
            batch = list(memory.sample(args.batch_size))
            assert len(batch) > 0
            states = np.concatenate([transition[0] for transition in batch],
                                    axis=0)

            # Train the agent
            agent.train(sess, states)

            # s <- s'
            state = np.copy(next_state)

            if done:
                # Single pre-formatted string: same output under the Python 2
                # print statement and the Python 3 print function.
                print('time steps %s epoch %s total rewards %s unroll %s' %
                      (time_steps, epoch, total_rewards, unroll))
                epoch += 1
                total_rewards = 0.
                state = env.reset()
예제 #9
0
파일: pai.py 프로젝트: wzkwzk123/new_drl
def main2():
    # Initialize environment.
    import gym
    env = gym.make('Pendulum-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    # Initialize agent.
    pai = PAI(environment='Pendulum-v0',
              state_size=state_size,
              action_size=action_size,
              hidden_size=20,
              it_tloop=100,
              it_dyn=5000,
              bs_dyn=100,
              it_policy=1000,
              bs_policy=50,
              K=50,
              T=25,
              action_bound_low=action_bound_low,
              action_bound_high=action_bound_high,
              discount_factor=.9)

    # Initialize replay memory
    memory = Memory(
        400 * 10
    )  #Data from most recent 10 trials (each trial is 400 time steps long).

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(300):
            total_rewards = 0.
            state = env.reset()
            while True:
                action = pai.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_rewards += float(reward)

                # Store tuple in replay memory
                memory.add([
                    np.atleast_2d(state),
                    np.atleast_2d(action), reward,
                    np.atleast_2d(next_state), done
                ])

                # s <- s'
                state = np.copy(next_state)

                if done == True:
                    print 'epoch', epoch, 'total rewards', total_rewards

                    # Train the agent
                    pai.train(sess, memory)
                    break
예제 #10
0
    def _rollout_with_memory(self,
                             env,
                             network,
                             args,
                             running_state,
                             max_episode_steps,
                             keep_memory=False):
        """Collect at least ``args.batch_size`` environment steps.

        Episodes are run one after another, each for at most
        ``max_episode_steps`` steps, and every transition is pushed into a
        fresh ``Memory``.

        Args:
            env: environment exposing ``reset()`` and ``step(action)``.
            network: callable returning ``(action_mean, action_std, value)``
                and providing ``select_action``.
            args: options read here: ``batch_size``, ``state_norm``,
                ``append_time``.
            running_state: normaliser applied to states when
                ``args.state_norm`` is set.
            max_episode_steps: step limit per episode.
            keep_memory: when true, store the memory and the network's
                ``action_std`` on ``self`` instead of returning the memory.

        Returns:
            ``(mean_reward, mean_length)`` when ``keep_memory`` is true,
            otherwise ``(memory, mean_reward, mean_length, num_steps)``.
        """
        memory = Memory()
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            if args.append_time:
                state = np.append(state, 1.0)
            reward_sum = 0
            for t in range(max_episode_steps):
                action_mean, action_std, value = network(
                    Tensor(state).unsqueeze(0))
                action_mean = action_mean[0]
                action_std = action_std[0]
                action, y = network.select_action(action_mean, action_std)
                action_mean = action_mean.data.numpy()
                action = action.data.numpy()
                y = y.data.numpy()
                next_state, reward, done, info = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if args.append_time:
                    # Explicit float division so the remaining-time feature
                    # is a fraction even under Python 2 integer division.
                    next_state = np.append(
                        next_state, 1 - (t + 1) / float(max_episode_steps))
                # mask is 0 on the last transition of an episode.
                mask = 0 if (done or ((t + 1) == max_episode_steps)) else 1
                memory.push(state, value, action_mean, action, y, mask,
                            next_state, reward)

                if done:
                    break

                state = next_state

            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)

        # Only the final averages are used, so compute them once after the
        # loop; fall back to NaN when no episode was run at all.
        if reward_list:
            meanepreward = np.mean(reward_list)
            meaneplen = np.mean(len_list)
        else:
            meanepreward = meaneplen = float('nan')

        if keep_memory:
            self.memory = memory
            self.old_std = network.action_std.data
            return meanepreward, meaneplen
        else:
            return memory, meanepreward, meaneplen, num_steps
예제 #11
0
    def __init__(
        self,
        action_dim,
        filters_C,
        kernel_size,
        hidden_R,
        dropout,
        dropout_r,
        Hstep,
        activation,
        is_training_mode,
    ):
        """Set PPO hyper-parameters and build current/old actor and critic.

        The network arguments are forwarded unchanged to ``Actor`` (which
        additionally receives ``action_dim`` first) and ``Critic``.
        """
        # Fixed PPO settings.
        self.policy_clip = 0.2
        self.value_clip = 0.2
        self.entropy_coef = 0.0
        self.vf_loss_coef = 0.5
        self.minibatch = 32
        self.PPO_epochs = 10

        # TODO use predicted results
        action_std = 1.0

        self.cov_mat = tf.fill([action_dim], action_std ** 2)
        self.is_training_mode = is_training_mode

        # Both critics share one argument list; the actors take the same list
        # preceded by the action dimension.
        critic_args = (
            filters_C,
            kernel_size,
            hidden_R,
            dropout,
            dropout_r,
            activation,
            Hstep,
        )
        actor_args = (action_dim,) + critic_args

        self.actor = Actor(*actor_args)
        self.actor_old = Actor(*actor_args)

        self.critic = Critic(*critic_args)
        self.critic_old = Critic(*critic_args)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4)
        self.memory = Memory()
        self.utils = Utils()
예제 #12
0
    def train_speed_memory(self, batch_size, seq_length):
        """Benchmark one gradient computation and, on GPU, record memory use.

        Args:
            batch_size: number of rows in the random batch.
            seq_length: number of token ids per row.

        Returns:
            A tuple ``(seconds_per_call, memory)``: the minimum repeat time
            divided by the number of calls per repeat, and a ``Memory`` built
            from the NVML "used" byte count (``None`` when not on GPU).
        """
        calls_per_repeat = 3
        statement = "for i in train_step():i.block_until_ready()"

        key = jax.random.PRNGKey(0)
        # NOTE(review): the same key is used for all three draws, so
        # input_ids and targets hold identical values - confirm this is
        # acceptable for the benchmark.
        input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
        targets = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
        labels = jax.random.randint(key, (batch_size, seq_length), 0, 2)

        @jax.jit
        def train_step():

            def loss_fn(params):
                token_mask = jnp.where(labels > 0, 1.0, 0.0).astype(self.dtype)
                logits = self.model(input_ids=input_ids, train=True, params=params, dropout_rng=jax.random.PRNGKey(0))[0]
                loss, normalizing_factor = cross_entropy(logits, targets, token_mask)
                # NOTE(review): `workload` is not defined in this method;
                # presumably a module-level name - verify.
                jax.profiler.save_device_memory_profile(f"memory/{workload[0]}_{workload[1]}_memory.prof", "gpu")
                return loss / normalizing_factor

            if self.fp16 and jax.local_devices()[0].platform == 'gpu':
                grad_fn = self.dynamic_scale.value_and_grad(loss_fn)
                dyn_scale, is_fin, loss, grad = grad_fn(self.model.params)
            else:
                grad_fn = jax.value_and_grad(loss_fn)
                loss, grad = grad_fn(self.model.params)
            return tree_flatten(grad)[0]

        # timeit only needs to see train_step when it executes the statement.
        namespace = {"train_step": train_step}

        if jax.local_devices()[0].platform == 'gpu':
            nvml.nvmlInit()
            try:
                # JAX dispatches asynchronously: wait for every gradient leaf
                # so the memory reading is taken after the step has run.
                for leaf in train_step():
                    leaf.block_until_ready()
                handle = nvml.nvmlDeviceGetHandleByIndex(0)
                meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                memory = Memory(meminfo.used)
            finally:
                # Always release NVML, even if the measurement above fails.
                nvml.nvmlShutdown()
        else:
            memory = None

        # Untimed warm-up run before the measured repeats.
        timeit.repeat(statement, repeat=1, number=2, globals=namespace)
        if self.jit:
            runtimes = timeit.repeat(statement, repeat=self.repeat,
                                     number=calls_per_repeat, globals=namespace)
        else:
            with jax.disable_jit():
                runtimes = timeit.repeat(statement, repeat=self.repeat,
                                         number=calls_per_repeat, globals=namespace)

        return float(np.min(runtimes) / float(calls_per_repeat)), memory
예제 #13
0
    def train_feature_extractor(self, sess, replay_buffer, batch_size=100, iterations=1, iterations_left=-1):
        """Run reconstruction updates on patches cut from replayed states.

        Patches of size ``self.w`` with stride ``self.s`` are cut from states
        sampled from ``replay_buffer`` and queued in ``self.buff.mem``; each
        iteration takes ``batch_size`` patches from the queue and runs one
        update.

        Args:
            sess: session used to run the update and loss ops.
            replay_buffer: object whose ``sample(1)[0][0]`` yields a state.
            batch_size: number of patches per update.
            iterations: number of updates to run.
            iterations_left: when ``<= 10`` (true for the default ``-1``),
                reconstructions and two variables are pickled to
                ``recons.p`` after every update.

        Returns:
            ``iterations``, unchanged.
        """
        import pickle
        from random import shuffle

        # Create the patch queue lazily on first use.
        try:
            self.buff
        except AttributeError:
            self.buff = Memory(10000)

        for it in range(iterations):
            # Refill the queue until a whole batch is available.
            while len(self.buff.mem) < batch_size:
                state = replay_buffer.sample(1)[0][0]

                patches = []
                for i in range(0, state.shape[1]-self.w+1, self.s):
                    for j in range(0, state.shape[2]-self.w+1, self.s):
                        patches.append(state[0, i:i+self.w, j:j+self.w, :])
                        assert patches[-1].shape[0] == patches[-1].shape[1]
                        assert patches[-1].shape[0] == self.w
                shuffle(patches)
                self.buff.mem += patches

            batch = self.buff.mem[:batch_size]
            self.buff.mem = self.buff.mem[batch_size:]

            batch = np.concatenate([b[np.newaxis, ...] for b in batch], axis=0)
            batch = self.process_states(batch)
            batch = [b.astype(np.float64) / 255. for b in batch]
            feed_dict = {}
            for i in range(self.no_inputs):
                feed_dict[self.params[i]['input']] = batch[i]

            _, recon_loss, = sess.run([self.update_model_recon,
                                       self.recon_loss],
                                       feed_dict=feed_dict)

            # Single pre-formatted string: same output under the Python 2
            # print statement and the Python 3 print function.
            print("train_feature_extractor - recon_loss: %s" % (recon_loss,))

            # Debug dump of the latest reconstructions.
            if iterations_left <= 10:
                variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                recon_x, recon_y, conv1, conv2 = sess.run([self.params[0]['recon'], self.params[1]['recon'], variables[2], variables[4]], feed_dict=feed_dict)
                # Close the file deterministically instead of leaking the handle.
                with open("recons.p", "wb") as dump_file:
                    pickle.dump([conv1, conv2, batch, recon_x, recon_y], dump_file)

        return iterations
예제 #14
0
    def __init__(self, stack_size=256, *args, **kwargs):
        """Create the stack, memory and storage and reset execution state.

        ``stack_size`` sizes the ``Stack``; the optional ``memory`` keyword
        (default ``b''``) seeds the ``Memory``. All positional and keyword
        arguments are also kept on the instance.
        """
        initial_memory = kwargs.get('memory', b'')

        self.stack = Stack(size=stack_size)
        self.memory = Memory(memory=initial_memory)
        self.storage = dict()

        self.args = args
        self.kwargs = kwargs

        # Nothing is loaded or decoded yet.
        self.opcode = None
        self.current_opcode_name = None
        self.code = None
        self.msg = None

        # Execution control flags and program counters.
        self.debug = False
        self.pc = 0
        self.prev_pc = -1
        self.stop = False
예제 #15
0
 def __init__(self, env, batch_size=32, gamma=0.99, 
              hidden_units=32, maxlen=10000, 
              tau=0.1, actor_lr=0.001, critic_lr=0.001):
     """Store the settings and build the session, actor, critic and memory.

     ``hidden_units`` and ``tau`` are passed to both networks, each with its
     own learning rate; ``maxlen`` is passed to ``Memory``. TensorFlow
     variables are initialised once both networks exist.
     """
     self.env = env
     self.batch_size = batch_size
     self.gamma = gamma
     self.maxlen = maxlen

     session = tf.Session()
     self.sess = session

     self.actor = Actor(env, session, hidden_units, tau, actor_lr)
     self.critic = Critic(env, session, hidden_units, tau, critic_lr)
     self.memory = Memory(maxlen)

     # Run only after the networks above have been created.
     session.run(tf.global_variables_initializer())

     self.step = 0
예제 #16
0
    def train_feature_extractor(self, sess, replay_buffer, batch_size=100, iterations=1):
        """Run reconstruction updates on patches cut from replayed states.

        States sampled from ``replay_buffer`` are converted to float64 and
        divided by 255, cut into patches of size ``self.w`` with stride
        ``self.s`` and queued in ``self.buff.mem``; each iteration takes
        ``batch_size`` patches and feeds them as both ``self.x`` and
        ``self.y``. After the last iteration the final batch and its
        reconstructions are pickled to ``recons.p``.

        Args:
            sess: session used to run the update and loss ops.
            replay_buffer: object whose ``sample(1)[0][0]`` yields a state.
            batch_size: number of patches per update.
            iterations: number of updates to run.

        Returns:
            ``iterations``, unchanged.
        """
        import pickle
        from random import shuffle

        # Create the patch queue lazily on first use.
        try:
            self.buff
        except AttributeError:
            self.buff = Memory(10000)

        batch = None
        for it in range(iterations):
            # Refill the queue until a whole batch is available.
            while len(self.buff.mem) < batch_size:
                state = replay_buffer.sample(1)[0][0]
                state = state.astype(np.float64)
                state = state / 255.

                patches = []
                for i in range(0, state.shape[1]-self.w+1, self.s):
                    for j in range(0, state.shape[2]-self.w+1, self.s):
                        patches.append(state[0, i:i+self.w, j:j+self.w, :])
                        assert patches[-1].shape[0] == patches[-1].shape[1]
                        assert patches[-1].shape[0] == self.w
                shuffle(patches)
                self.buff.mem += patches

            batch = self.buff.mem[:batch_size]
            self.buff.mem = self.buff.mem[batch_size:]

            _, recon_loss, = sess.run([self.update_model_recon,
                                       self.recon_loss],
                                       feed_dict={self.x:batch,
                                                  self.y:batch})
            # Single pre-formatted string: same output under the Python 2
            # print statement and the Python 3 print function.
            print("train_feature_extractor - recon_loss: %s" % (recon_loss,))

        # Debug dump of the last batch; skipped when no iteration ran, since
        # there is no batch to reconstruct in that case.
        if batch is not None:
            recon_x, recon_y = sess.run([self.recon_x_, self.recon_y_], feed_dict={self.x:batch, self.y:batch})
            # Close the file deterministically instead of leaking the handle.
            with open("recons.p", "wb") as dump_file:
                pickle.dump([batch, recon_x, recon_y], dump_file)

        return iterations
예제 #17
0
파일: lstd.py 프로젝트: wzkwzk123/new_drl
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=30000)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)


    args = parser.parse_args()

    env = gym.make(args.environment)
    args.action_size = env.action_space.n
    args.input_shape = [None] + list(env.observation_space.shape)

    print args

    # Initialize the controller
    controller = lstd(state_size=args.input_shape[-1],
                      action_size=args.action_size,
                      learning_rate=args.learning_rate)

    # Other parameters
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    for epoch in range(args.epochs):
        total_reward = 0
        observation = env.reset()
        for t in range(1000000):
            env.render()
            controller.updatew()
            action = controller.act(observation)
            if np.random.rand() < epsilon:
                action = np.random.randint(args.action_size)
            observation1, reward, done, info = env.step(action)
            total_reward += reward

            # Add to memory
            #memory.add([observation, action, reward, observation1, done])

            # Reduce epsilon
            time_step += 1.
            epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(-args.epsilon_decay * time_step)

            # Training step
            #batch = np.array(memory.sample(args.batch_size))
            controller.train([observation, action, reward, observation1, done])

            # Set observation
            observation = observation1

            if done:
                print"Episode finished after {} timesteps".format(t+1), 'epoch', epoch, 'total_rewards', total_reward
                break
예제 #18
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym!atari')
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=30000)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)

    parser.add_argument("--K",
                        type=int,
                        default=1,
                        help='The number of steps to train the environment')
    parser.add_argument(
        "--L",
        type=int,
        default=1,
        help='The number of Q-learning steps for hypothetical rollouts')
    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    args = parser.parse_args()

    env = env_interface(args.env_interface,
                        args.environment,
                        pixel_feature=False,
                        render=True)

    #args.action_size = env.action_space.n
    args.action_size = env.action_size
    args.input_shape = [None] + list(env.obs_space_shape)

    print args

    # Other parameters
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    # Initialize the GANs
    cgan_state = CGAN(input_shape=args.input_shape,
                      action_size=args.action_size,
                      latent_size=args.latent_size,
                      gen_input_shape=args.input_shape)
    cgan_reward = CGAN(input_shape=args.input_shape,
                       action_size=args.action_size,
                       latent_size=args.latent_size,
                       gen_input_shape=[None, 1])

    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    target_qnet = qnetwork(input_shape=args.input_shape,
                           action_size=args.action_size,
                           scope='target_qnet')
    update_ops = update_target_graph('qnet', 'target_qnet')

    rand_no = np.random.rand()
    #env = gym.wrappers.Monitor(env, '/tmp/cartpole-experiment-' + str(rand_no), force=True, video_callable=False)
    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(args.epochs):
            total_reward = 0
            observation = env.reset()
            for t in range(1000000):
                #env.render()
                action = qnet.get_action(sess, observation)
                if np.random.rand() < epsilon:
                    #action = env.action_space.sample()
                    action = np.random.randint(args.action_size)
                observation1, reward, done, info = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([observation, action, reward, observation1, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (
                    args.epsilon_max - args.epsilon_min) * np.exp(
                        -args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, target_qnet)

                # Training step: environment model
                for k in range(args.K):
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.vstack(batch[:, 0])
                    actions = np.array(batch[:, 1])
                    rewards = batch[:, 2]
                    states1 = np.vstack(batch[:, 3])

                    _, D_loss_state = sess.run(
                        [cgan_state.D_solver, cgan_state.D_loss],
                        feed_dict={
                            cgan_state.states: states,
                            cgan_state.actions: actions,
                            cgan_state.Z: sample_z(len(batch),
                                                   args.latent_size),
                            cgan_state.X: states1
                        })
                    _, G_loss_state = sess.run(
                        [cgan_state.G_solver, cgan_state.G_loss],
                        feed_dict={
                            cgan_state.states: states,
                            cgan_state.actions: actions,
                            cgan_state.Z: sample_z(len(batch),
                                                   args.latent_size)
                        })

                    _, D_loss_reward = sess.run(
                        [cgan_reward.D_solver, cgan_reward.D_loss],
                        feed_dict={
                            cgan_reward.states: states,
                            cgan_reward.actions: actions,
                            cgan_reward.Z: sample_z(len(batch),
                                                    args.latent_size),
                            cgan_reward.X: rewards[..., np.newaxis]
                        })
                    _, G_loss_reward = sess.run(
                        [cgan_reward.G_solver, cgan_reward.G_loss],
                        feed_dict={
                            cgan_reward.states: states,
                            cgan_reward.actions: actions,
                            cgan_reward.Z: sample_z(len(batch),
                                                    args.latent_size)
                        })
                    #print D_loss_state, G_loss_state, D_loss_reward, G_loss_state

                # Training step: imagination rollouts
                if time_step == 0.:
                    print "time_step 0 here"
                if time_step >= 0.:
                    for l in range(args.L):
                        batch = np.array(memory.sample(args.batch_size))
                        assert len(batch) > 0

                        states1 = np.vstack(batch[:, 3])
                        actions = np.random.randint(args.action_size,
                                                    size=len(batch))
                        dones = np.array([False] * len(batch))

                        G_sample_state = sess.run(cgan_state.G_sample,
                                                  feed_dict={
                                                      cgan_state.states:
                                                      states1,
                                                      cgan_state.actions:
                                                      actions,
                                                      cgan_state.Z:
                                                      sample_z(
                                                          len(batch),
                                                          args.latent_size)
                                                  })
                        G_sample_reward = sess.run(cgan_reward.G_sample,
                                                   feed_dict={
                                                       cgan_reward.states:
                                                       states1,
                                                       cgan_reward.actions:
                                                       actions,
                                                       cgan_reward.Z:
                                                       sample_z(
                                                           len(batch),
                                                           args.latent_size)
                                                   })
                        qnet.train(sess, None, args.learning_rate, target_qnet,
                                   states1, actions, G_sample_reward,
                                   G_sample_state, dones)

                # Set observation
                observation = observation1

                # Update?
                if int(time_step) % args.target_update_freq == 0:
                    #print "Updating target..."
                    sess.run(update_ops)

                if done:
                    print "Episode finished after {} timesteps".format(
                        t + 1), 'epoch', epoch, 'total_rewards', total_reward
                    break
예제 #19
0
def main():
    import gym
    import sys
    import copy
    sys.path.append('../..')
    from utils import Memory

    #env = gym.make('LunarLander-v2')
    env = gym.make('Pendulum-v0')
    #env = gym.make('CartPole-v0')
    mem = Memory(1000000)
    batch_size = 32
    try:
        a_size = env.action_space.n
        a_type = 'discrete'
    except:
        try:
            a_size = env.action_space.shape[0]
            a_type = 'continuous'
        except:
            raise ValueError('Cannot find action size.')
    emg = gated_env_modeler(s_shape=[None, env.observation_space.shape[0]],
                            a_size=a_size,
                            out_shape=[None, env.observation_space.shape[0]],
                            a_type=a_type,
                            numfactors=256)
    #emg = gated_env_modeler(s_shape=[None, env.observation_space.shape[0]], a_size=a_size, out_shape=[None, 1], a_type=a_type, numfactors=256)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            s = env.reset()

            done = False
            while done == False:
                #env.render()
                #a = np.random.randint(a_size)
                a = random_action(a_size, a_type)
                s_, r, done, _ = env.step(a)

                mem.add([s, a, r, s_, done])
                batch = mem.sample(batch_size)
                if len(batch) == batch_size:
                    states = []
                    actions = []
                    rewards = []
                    states_ = []
                    for i in range(batch_size):
                        states.append(batch[i][0])
                        actions.append(batch[i][1])
                        rewards.append(batch[i][2])
                        states_.append(batch[i][3])

                    states = np.stack(states, axis=0)
                    actions = np.stack(actions, axis=0)
                    rewards = np.stack(rewards, axis=0)
                    states_ = np.stack(states_, axis=0)

                    #_, loss_s, loss_a, loss_s_, loss = sess.run([emg.update_model, emg.loss_s, emg.loss_a, emg.loss_s_, emg.loss], feed_dict={emg.states:states, emg.states_:rewards[..., np.newaxis], emg.actions_placeholder:actions})
                    _, loss_s, loss_a, loss_s_, loss = sess.run(
                        [
                            emg.update_model, emg.loss_s, emg.loss_a,
                            emg.loss_s_, emg.loss
                        ],
                        feed_dict={
                            emg.states: states,
                            emg.states_: states_,
                            emg.actions_placeholder: actions
                        })
                    print 'loss_s', loss_s, 'loss_a', loss_a, 'loss_s_', loss_s_, 'loss', loss

                s = copy.deepcopy(s_)
                if done == True:
                    break
예제 #20
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym')
    parser.add_argument("--environment",
                        type=str,
                        default='BreakoutDeterministic-v4')
    parser.add_argument("--action-size", type=int, default=4)
    parser.add_argument("--input-shape", type=str, default='None,84,84,4')
    parser.add_argument("--state-len-max", type=int, default=4)
    parser.add_argument("--target-update-freq", type=int, default=10000)

    parser.add_argument("--ep-greedy-speed", type=str, default='slow')
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay-slow", type=int, default=1000000)

    parser.add_argument("--epsilon-decay-fast", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.95)
    parser.add_argument("--replay-start-size", type=int, default=50000)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--epochs", type=int, default=30000)

    parser.add_argument("--pixel-feature", type=int, default=1)
    parser.add_argument("--padding", type=int, default=0)

    parser.add_argument("--model", type=str, default='nature')

    args = parser.parse_args()

    args.input_shape = str2list(args.input_shape)
    assert args.model in ['nature', 'gated']
    assert args.ep_greedy_speed in ['fast', 'slow']
    assert args.env_interface in [
        'gym', 'ale', 'custom_cart', 'custom_cartpole', 'ple'
    ]
    if args.env_interface in ['gym', 'ale']:
        env = env_interface(args.env_interface, args.environment)
    elif args.env_interface in ['custom_cart', 'custom_cartpole', 'ple']:
        env = env_interface(args.env_interface, args.environment,
                            bool(args.pixel_feature), bool(args.padding))
        args.input_shape = [None] + list(env.obs_space_shape) + [1]
    args.input_shape[-1] = args.state_len_max
    args.action_size = env.action_size
    assert args.state_len_max == args.input_shape[-1]
    print args

    #Other other paramters
    state_old = []
    state = []
    steps = 0

    #Other parameters
    if args.ep_greedy_speed == 'slow':
        epsilon = args.epsilon_max
        epsilon_rate = 0.
        if args.epsilon_decay_slow != 0:
            epsilon_rate = ((args.epsilon_max - args.epsilon_min) /
                            float(args.epsilon_decay_slow))
    elif args.ep_greedy_speed == 'fast':
        epsilon = args.epsilon_max

    #Initialize replay memory
    memory = Memory(args.replay_mem_size, args.input_shape[1:])

    #Initialize neural net
    qnet, tnet, update_ops = init_network(args.input_shape, args.action_size,
                                          args.model)

    #import time
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(update_ops)
        for epoch in range(args.epochs):
            frame = env.reset()
            total_rewards = 0.
            total_losses = 0.
            state_old = []
            state = [frame] * args.state_len_max
            done = False

            #start = time.time()
            while done == False:
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    image_in = np.stack(state, axis=-1)[np.newaxis, ...]
                    action = qnet.get_action(sess, image_in)

                frame, reward, done, _ = env.step(action)
                total_rewards += reward
                state_old = state[:]
                state.append(frame)
                if len(state) > args.state_len_max:
                    state = state[1:]

                #Add to memory
                memory.add([
                    np.stack(state_old, axis=-1)[np.newaxis, ...], action,
                    min(1., max(-1., reward)),
                    np.stack(state, axis=-1)[np.newaxis, ...], done
                ])

                #Reduce epsilon
                if args.ep_greedy_speed == 'slow':
                    epsilon = max(args.epsilon_min, epsilon - epsilon_rate)
                elif args.ep_greedy_speed == 'fast':
                    epsilon = args.epsilon_min + (
                        args.epsilon_max - args.epsilon_min) * np.exp(
                            -args.epsilon_decay_fast * float(steps))

                if steps > args.replay_start_size:
                    #Training step
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.concatenate(batch[:, 0], axis=0)
                    actions = batch[:, 1]
                    rewards = batch[:, 2]
                    states1 = np.concatenate(batch[:, 3], axis=0)
                    dones = batch[:, 4]

                    l = qnet.train(sess, states, actions, rewards, states1,
                                   dones, args.learning_rate, tnet)
                    total_losses += l

                #Increase the frame steps counter
                steps += 1
                #Check if target network is to be updated
                if steps % args.target_update_freq == 0:
                    print "Updating target..."
                    sess.run(update_ops)

                if done == True:
                    print "epoch:", epoch, "total rewards", total_rewards, "total losses", total_losses, qnet.string
                    #print 'time:', time.time() - start
                    break
    env.close()
예제 #21
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=300)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = gym.make(args.environment)
    args.action_size = env.action_space.n
    args.input_shape = [None] + list(env.observation_space.shape)

    print args

    # Epsilon parameter
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    # Initialize the agent
    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                #env.render()
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([state, action, reward, next_state, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (
                    args.epsilon_max - args.epsilon_min) * np.exp(
                        -args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, tnet)

                # s <- s'
                state = np.copy(next_state)

                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break
    def __init__(self, level_name):
        """Build the environment, the DQNetwork and a pre-filled replay memory.

        Args:
            level_name: identifier passed to ``gym_super_mario_bros.make``.

        Relies on the module-level settings ``state_size``, ``action_size``,
        ``learning_rate``, ``memory_size``, ``stack_size`` and
        ``pretrain_length``.  Resets the default TensorFlow graph, then takes
        ``pretrain_length`` uniformly random actions and stores the resulting
        transitions in ``self.memory``.
        """
        self.level_name = level_name
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        # one hot encoded version of our actions
        self.possible_actions = np.array(np.identity(self.env.action_space.n, dtype=int).tolist())

        # reset graph
        tf.reset_default_graph()

        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)

        # instantiate memory
        self.memory = Memory(max_size=memory_size)

        # initialize deque with zero images
        # `np.int` was only an alias of the builtin `int` and has been
        # removed from recent NumPy releases; `int` is the exact equivalent.
        # NOTE(review): maxlen is hard-coded to 4 while the number of initial
        # frames comes from stack_size -- confirm they are meant to match.
        self.stacked_frames = deque([np.zeros((100, 128), dtype=int) for _ in range(stack_size)], maxlen=4)

        for i in range(pretrain_length):
            # If it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

            # Take a uniformly random action: the environment receives the
            # index, the memory receives the one-hot row.
            choice = random.randrange(len(self.possible_actions))
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

            # A finished episode is stored with an all-zero next state.
            if done:
                next_state = np.zeros(state.shape)

            # add experience to memory
            self.memory.add((state, action, reward, next_state, done))

            if done:
                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
            else:
                # our new state is now the next_state
                state = next_state

        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)

        self.write_op = tf.summary.merge_all()
예제 #23
0
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default='Pendulum-v0')
    parser.add_argument("--unroll-steps", type=int, default=25)
    parser.add_argument("--no-samples", type=int, default=20)
    parser.add_argument("--no-basis", type=int, default=256)
    parser.add_argument("--discount-factor", type=float, default=.9)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--train-policy-batch-size", type=int, default=32)
    parser.add_argument("--train-policy-iterations", type=int, default=30)
    parser.add_argument("--replay-start-size-epochs", type=int, default=2)
    parser.add_argument("--train-hyperparameters-iterations",
                        type=int,
                        default=50000)
    parser.add_argument("--goal-position", type=float, default=.45)
    args = parser.parse_args()

    print args

    #env = gym.make(args.env, goal_position=args.goal_position)
    env = gym.make(args.env)
    env.seed(seed=args.goal_position)

    # Gather data to train hyperparameters
    data = []
    rewards = []
    dones = []
    for _ in range(2):
        state = env.reset()
        while True:
            action = np.random.uniform(env.action_space.low,
                                       env.action_space.high, 1)
            next_state, reward, done, _ = env.step(action)
            data.append([state, action, next_state])
            rewards.append(reward)
            dones.append(done)
            state = np.copy(next_state)
            if done:
                break

    states, actions, next_states = [np.stack(d, axis=0) for d in zip(*data)]

    permutation = np.random.permutation(len(data))
    states_actions = np.concatenate([states, actions], axis=-1)[permutation]
    next_states = next_states[permutation]

    # Train the hyperparameters
    hs = [
        hyperparameter_search(dim=env.observation_space.shape[0] +
                              env.action_space.shape[0])
        for _ in range(env.observation_space.shape[0])
    ]
    hyperparameters = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch_size = 32
        iterations = args.train_hyperparameters_iterations
        #idxs = [np.random.randint(len(states_actions), size=batch_size) for _ in range(iterations)]
        for i in range(len(hs)):
            hs[i].train_hyperparameters(sess, states_actions, next_states[:,
                                                                          i],
                                        iterations, batch_size)
            hyperparameters.append(
                sess.run([hs[i].length_scale, hs[i].signal_sd,
                          hs[i].noise_sd]))

    blr = blr_model(x_dim=env.observation_space.shape[0] +
                    env.action_space.shape[0],
                    y_dim=env.observation_space.shape[0],
                    state_dim=env.observation_space.shape[0],
                    action_dim=env.action_space.shape[0],
                    observation_space_low=env.observation_space.low,
                    observation_space_high=env.observation_space.high,
                    action_bound_low=env.action_space.low,
                    action_bound_high=env.action_space.high,
                    unroll_steps=args.unroll_steps,
                    no_samples=args.no_samples,
                    no_basis=args.no_basis,
                    discount_factor=args.discount_factor,
                    train_policy_batch_size=args.train_policy_batch_size,
                    train_policy_iterations=args.train_policy_iterations,
                    hyperparameters=hyperparameters,
                    debugging_plot=False)

    # Initialize the memory
    memory = Memory(args.replay_mem_size)
    assert len(data) == len(rewards)
    assert len(data) == len(dones)
    for dat, reward, done in zip(data, rewards, dones):
        memory.add([dat[0], dat[1], reward, dat[2], done])
    memory2 = []

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        weights = pickle.load(
            open(
                '../custom_environments/weights/mountain_car_continuous_reward'
                + str(args.goal_position) + '.p', 'rb'))
        sess.run(blr.assign_ops,
                 feed_dict=dict(zip(blr.placeholders_reward, weights)))
        # Update the model with data used from training hyperparameters
        blr.update(sess, states_actions, next_states)
        blr.train(sess, memory)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(30000):
            action = blr.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Append to the batch
            memory.add([state, action, reward, next_state, done])
            memory2.append([state, action, reward, next_state, done])

            # s <- s'
            state = np.copy(next_state)

            if done == True:
                print 'time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards

                # Update the memory
                blr.update(sess, memory=memory2)

                # Train the policy
                blr.train(sess, memory)

                epoch += 1
                total_rewards = 0.
                state = env.reset()
                memory2 = []
예제 #24
0
def train(input_placeholder, output_data, sess):
    """Run the Q-learning training loop for the game; never returns.

    Args:
        input_placeholder: placeholder the network reads its state batch
            from.
        output_data: network output tensor holding one score per action.
        sess: TensorFlow session used for initialisation, checkpointing and
            (implicitly, through ``.eval()`` / ``.run()``) for evaluation.

    The cost is the mean squared difference between the target ``y`` and the
    score of the action taken.  State (timestamp, epsilon, memory, game) is
    restored from CHECKPOINT_FOLDER when both a model checkpoint and
    ``global_state.pkl`` exist, and saved every CHECKPOINT_GAP steps.
    """
    # build cost function
    # presumably `action` receives one-hot rows so the product below selects
    # the score of the action taken -- TODO confirm against Memory.
    action = tf.placeholder("float", [None, ACTIONS_CHOICE_NUMBER])
    y = tf.placeholder("float", [None])
    y_action = tf.reduce_sum(tf.multiply(output_data, action),
                             reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - y_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # start game
    game_state = game.GameState()

    global_timestamp = 0
    epsilon = EPSILON

    memory = Memory(MEMORY_SIZE, FRAME_NUM_PER_STACK)

    # start network
    sess.run(tf.global_variables_initializer())
    # network checkpoint saver and restore loader
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_FOLDER)
    history_path = os.path.join(CHECKPOINT_FOLDER, "global_state.pkl")
    if checkpoint and checkpoint.model_checkpoint_path and os.path.exists(
            history_path):
        data = load_history(history_path)
        global_timestamp = data['global_timestamp']
        epsilon = data['epsilon']
        memory = data['memory']
        game_state = data['game_state']
        saver.restore(sess, checkpoint.model_checkpoint_path)
        logging.info("restored from checkpoint",
                     extra={
                         'stage': get_stage_name(global_timestamp),
                         'timestamp': global_timestamp,
                         'epsilon': epsilon,
                         'reward': "",
                         'action': ""
                     })
    else:
        # Fresh start: seed the frame stack with the frame from a no-op.
        image_data, _, _ = game_state.frame_step(actions.NOTHING)
        memory.initial_stack(image_data)

    prev_state = memory.get_current_stack()

    while True:
        actions_scores = output_data.eval(
            feed_dict={input_placeholder: [prev_state]})[0]

        action_name, action_choice = actions.get_next_action(
            epsilon, actions_scores)

        image_data, reward, game_terminate = game_state.frame_step(
            action_choice)
        memory.stack_frame(image_data)

        new_state = memory.get_current_stack()
        memory.remember(prev_state, action_choice, reward, new_state,
                        game_terminate)

        # anneal: linear decrement once the observation phase is over
        if global_timestamp > OBSERVE_DURATION and epsilon > MIN_EPSILON:
            logging.info("start anneal",
                         extra={
                             'stage': get_stage_name(global_timestamp),
                             'timestamp': global_timestamp,
                             'epsilon': epsilon,
                             'reward': reward,
                             'action': action_name
                         })
            epsilon -= float(EPSILON - MIN_EPSILON) / ANNEAL_DURATION

        # explore + train
        if global_timestamp > OBSERVE_DURATION:
            (prev_state_batch, action_batch, reward_batch, new_state_batch,
             game_terminate_batch) = memory.get_sample_batches(BATCH_SIZE)

            # Bootstrap targets from the network's scores on the NEXT states.
            evaluate = output_data.eval(
                feed_dict={input_placeholder: new_state_batch})
            # Terminal transitions target the reward alone.  A distinct loop
            # name avoids clobbering the outer `game_terminate`.
            y_batch = [
                reward_batch[i] if terminal else
                reward_batch[i] + GAMMA * np.max(evaluate[i])
                for i, terminal in enumerate(game_terminate_batch)
            ]

            # gradient step: the score being regressed belongs to the state
            # the action was taken in, so feed the PREVIOUS states (feeding
            # new_state_batch here was a bug).
            train_step.run(
                feed_dict={
                    y: y_batch,
                    action: action_batch,
                    input_placeholder: prev_state_batch
                })

        if global_timestamp % CHECKPOINT_GAP == 0:
            saver.save(sess,
                       os.path.join(CHECKPOINT_FOLDER, 'flappy-bird'),
                       global_step=global_timestamp)
            save_history(
                os.path.join(CHECKPOINT_FOLDER, 'global_state.pkl'), {
                    'global_timestamp': global_timestamp,
                    'epsilon': epsilon,
                    'memory': memory,
                    'game_state': game_state
                })
            logging.info("checkpoint saved",
                         extra={
                             'stage': get_stage_name(global_timestamp),
                             'timestamp': global_timestamp,
                             'epsilon': epsilon,
                             'reward': reward,
                             'action': ""
                         })

        # update state
        prev_state = new_state
        logging.info("finish epoch",
                     extra={
                         'stage': get_stage_name(global_timestamp),
                         'timestamp': global_timestamp,
                         'epsilon': epsilon,
                         'reward': reward,
                         'action': action_name
                     })
        global_timestamp += 1
예제 #25
0
def main():
    """Run the joint-DDPG interaction/training loop configured from the CLI.

    Parses the command line, creates the gym environment and the model
    returned by ``init_model``, then for ``--time-steps`` steps: renders,
    picks an action through the OU exploration strategy, stores the
    transition and trains on two freshly sampled replay batches.

    ``--mode`` controls environment seeding: ``test`` reseeds with 1 before
    every reset; ``transfer`` reseeds with 1 during the first third of the
    run and with 0 afterwards, and swaps in a fresh replay memory at the
    one-third mark. The target-network copy/update ops returned by
    ``init_model`` are not run explicitly in this loop.
    """
    parser = argparse.ArgumentParser()
    # Flags are registered in table order so the generated help is unchanged.
    option_table = (
        ("--environment", dict(type=str, default='Pendulum-v0')),
        ("--action-dim", dict(type=int, default=1)),
        ("--state-dim", dict(type=int, default=1)),
        ("--time-steps", dict(type=int, default=30000)),
        ("--tau", dict(type=float,
                       help='soft target update parameter',
                       default=0.01)),
        ("--action-bound", dict(type=float, default=1.)),
        ("--replay-mem-size", dict(type=int, default=1000000)),
        ("--batch-size", dict(type=int, default=64)),
        ("--learning-rate", dict(type=float, default=.9)),
        ("--latent-size", dict(type=int,
                               default=4,
                               help='Size of vector for Z')),
        ("--model", dict(type=str, default='gan')),
        ("--mode", dict(type=str, default='none')),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    valid_modes = ('none', 'test', 'transfer')
    valid_models = ('mlp', 'gan', 'gated', 'dmlac_mlp', 'dmlac_gan',
                    'dmlac_gated', 'ddpg_unrolled_pg_mlp', 'dmlac_gp',
                    'dmlac_truth', 'mpc')
    assert args.mode in valid_modes
    assert args.model in valid_models
    if args.model == 'dmlac_truth':
        assert args.environment == 'Pendulum-v0'

    # The environment is the source of truth for dimensions and bounds; the
    # --state-dim / --action-dim values from the command line are overwritten.
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.action_dim = env.action_space.shape[0]
    args.action_bound_high = env.action_space.high
    args.action_bound_low = env.action_space.low

    # Only action ranges symmetric around zero are accepted.
    assert len(args.action_bound_high) == len(args.action_bound_low)
    for upper, lower in zip(args.action_bound_high, args.action_bound_low):
        assert upper == -lower
    print(args)

    model_bundle = init_model([None, args.state_dim], args.action_dim,
                              args.latent_size, args.learning_rate,
                              args.action_bound_low, args.action_bound_high,
                              args.tau, args.model)
    (jointddpg, update_target_actor, update_target_critic, copy_target_actor,
     copy_target_critic) = model_bundle

    memory = Memory(args.replay_mem_size)
    exploration_strategy = OUStrategy(jointddpg, env)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        if args.mode in ('test', 'transfer'):
            env.seed(1)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            env.render()

            # Exploration scale decays from 1 towards 0 as the 4th power of
            # the fraction of steps still to run.
            remaining = float(args.time_steps - time_steps)
            exploration = (remaining / float(args.time_steps))**4
            action = exploration_strategy.action(sess, state[np.newaxis, ...],
                                                 exploration)

            state1, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            transition = [
                state[np.newaxis, ...], action[np.newaxis, ...], reward,
                state1[np.newaxis, ...], done
            ]
            memory.add(transition)

            # First batch: full transitions.
            batch_B = np.array(memory.sample(args.batch_size))
            assert len(batch_B) > 0
            states_B = np.concatenate(batch_B[:, 0], axis=0)
            actions_B = np.concatenate(batch_B[:, 1], axis=0)
            rewards_B = batch_B[:, 2]
            states1_B = np.concatenate(batch_B[:, 3], axis=0)
            dones_B = batch_B[:, 4]

            # Second, independently sampled batch: states and actions only.
            batch_M = np.array(memory.sample(args.batch_size))
            assert len(batch_M) > 0
            states_M = np.vstack(batch_M[:, 0])
            actions_M = np.concatenate(batch_M[:, 1], axis=0)

            if args.model == 'dmlac_gp':
                jointddpg.update_hist(memory)

            jointddpg.train(sess, states_B, actions_B, rewards_B, states1_B,
                            dones_B, states_M, actions_M, len(batch_M),
                            args.latent_size)

            state = np.copy(state1)
            if done:
                report = ('time steps', time_steps, 'epoch', epoch,
                          'total rewards', total_rewards)
                print(' '.join(str(item) for item in report))
                epoch += 1
                total_rewards = 0.
                if args.mode == 'transfer':
                    env.seed(0 if time_steps >= args.time_steps / 3 else 1)
                elif args.mode == 'test':
                    env.seed(1)
                state = env.reset()

            # Transfer mode discards all experience at the one-third mark.
            if args.mode == 'transfer' and time_steps == args.time_steps / 3:
                memory = Memory(args.replay_mem_size)
예제 #26
0
def main():
    """Run the direct-policy-search agent on a gym environment.

    Parses the command line, builds the environment and the
    ``direct_policy_search`` agent, loads pickled weights into the agent's
    state placeholders through ``agent.assign_ops1``, then for
    ``--time-steps`` steps: renders, acts, stores the transition in replay
    memory and trains the agent on a sampled batch of states.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment",
                        type=str,
                        default='MountainCarContinuous-v0')
    parser.add_argument("--unroll-steps", type=int, default=20)
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--discount-factor", type=float, default=1.)
    parser.add_argument("--goal-position", type=float, default=.45)
    args = parser.parse_args()

    env = gym.make(args.environment)
    # NOTE(review): the (float) goal position is passed as the seed;
    # presumably a customised environment interprets it that way -- confirm.
    env.seed(seed=args.goal_position)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    agent = direct_policy_search(state_dim, action_dim, action_bound_high,
                                 action_bound_low, args.unroll_steps,
                                 args.discount_factor, 1,
                                 'direct_policy_search')

    # Replay memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Load the stored weights, closing the file as soon as it has been
        # read. NOTE: unpickling executes arbitrary code; only load trusted
        # weight files.
        weights_path = (
            '../custom_environments/weights/mountain_car_continuous_next_state.p'
        )
        with open(weights_path, 'rb') as weights_file:
            weights = pickle.load(weights_file)
        sess.run(agent.assign_ops1,
                 feed_dict=dict(zip(agent.placeholders_state, weights)))

        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            env.render()
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store tuple in replay memory
            memory.add([
                np.atleast_2d(state),
                np.atleast_2d(action), reward,
                np.atleast_2d(next_state), done
            ])

            # Training step: only the sampled states are passed to the agent.
            batch = np.array(memory.sample(args.batch_size))
            assert len(batch) > 0
            states = np.concatenate(batch[:, 0], axis=0)

            # Train the agent
            agent.train(sess, states)

            # s <- s'
            state = np.copy(next_state)

            if done:
                # Single-argument print keeps the output identical on
                # Python 2 and Python 3.
                report = ('time steps', time_steps, 'epoch', epoch,
                          'total rewards', total_rewards, 'unroll',
                          args.unroll_steps)
                print(' '.join(str(item) for item in report))
                epoch += 1
                total_rewards = 0.
                state = env.reset()
예제 #27
0
def main():
    """Train the gated Q-learning agent described by the command-line options.

    Builds the environment interface, a replay memory, and an online and a
    target network (two ``gated_qlearning`` instances created with identical
    arguments), then runs ``--epochs`` episodes of epsilon-greedy interaction.
    While ``--train-fe-iterations`` is positive the feature extractor is
    trained every step; Q-learning updates start once that budget is used up
    and more than ``--replay-start-size`` steps have been taken.

    The environment is closed on exit, including when setup or training is
    interrupted by an exception.
    """
    #Arguments for the q-learner
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym')
    parser.add_argument("--environment",
                        type=str,
                        default='BreakoutDeterministic-v4')
    parser.add_argument("--action-size", type=int, default=4)
    parser.add_argument("--input-shape", type=str, default='None,84,84,4')
    parser.add_argument("--state-len-max", type=int, default=4)
    parser.add_argument("--target-update-freq", type=int, default=10000)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=int, default=1000000)
    parser.add_argument("--learning-rate", type=float, default=.95)
    parser.add_argument("--replay-start-size", type=int, default=50000)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--epochs", type=int, default=30000)

    #Arguments for the feature extractor
    parser.add_argument("--train-fe-shape", type=str, default='None,12,12,4')
    parser.add_argument("--stop-gradient", type=int, default=0)
    parser.add_argument("--train-fe-iterations", type=int, default=1000)
    parser.add_argument("--train-fe-batch-size", type=int, default=100)
    parser.add_argument("--train-fe-lamb", type=float, default=0.)
    parser.add_argument("--train-fe-numfactors", type=int, default=200)
    parser.add_argument("--train-fe-nummap", type=int, default=100)
    parser.add_argument("--train-fe-learning-rate", type=float, default=.001)
    parser.add_argument("--train-fe-w", type=int, default=12)
    parser.add_argument("--train-fe-s", type=int, default=1)

    parser.add_argument("--use-conv-after-fe", type=int, default=0)

    parser.add_argument("--ep-greedy-speed", type=str, default='slow')
    #Arguments for the environment interface
    parser.add_argument("--pixel-features", type=int, default=1)
    parser.add_argument("--padding", type=int, default=0)
    args = parser.parse_args()

    #Parse arguments wrt other arguments
    args.input_shape = str2list(args.input_shape)
    args.train_fe_shape = str2list(args.train_fe_shape)
    assert args.env_interface in [
        'gym', 'ale', 'custom_cart', 'custom_cartpole'
    ]
    assert args.ep_greedy_speed in ['fast', 'slow']
    env = env_interface(args.env_interface,
                        args.environment,
                        pixel_feature=bool(args.pixel_features),
                        padding=bool(args.padding),
                        render=True)
    try:
        # The environment decides the action count; --action-size is
        # overwritten.
        args.action_size = env.action_size
        if args.env_interface in ['custom_cart', 'custom_cartpole']:
            args.input_shape = [None] + list(
                env.obs_space_shape) + [args.state_len_max]
        args.train_fe_shape[-1] = args.state_len_max
        print(args)

        steps = 0

        #Epsilon schedule parameters
        epsilon_lambda = .001
        epsilon = args.epsilon_max
        epsilon_rate = 0.
        if args.epsilon_decay != 0:
            epsilon_rate = ((args.epsilon_max - args.epsilon_min) /
                            float(args.epsilon_decay))

        #Initialize replay memory
        print(args.input_shape)
        memory = Memory(args.replay_mem_size, args.input_shape[1:])

        #Initialize neural nets: online and target share constructor arguments
        from gated_qlearning import gated_qlearning
        net_kwargs = dict(shape=args.train_fe_shape,
                          nummap=args.train_fe_nummap,
                          numfactors=args.train_fe_numfactors,
                          learning_rate=args.train_fe_learning_rate,
                          frame_shape=args.input_shape,
                          a_size=args.action_size,
                          stop_gradient=bool(args.stop_gradient),
                          lamb=args.train_fe_lamb,
                          w=args.train_fe_w,
                          s=args.train_fe_s,
                          use_conv_after_fe=bool(args.use_conv_after_fe))
        qnet = gated_qlearning(**net_kwargs)
        qnet_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        tnet = gated_qlearning(**net_kwargs)
        # The target net's variables are the ones appended to the collection
        # after the online net's.
        tnet_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES)[len(qnet_vars):]

        update_ops = update_target_graph_vars(qnet_vars, tnet_vars)
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)
            sess.run(update_ops)
            for epoch in range(args.epochs):
                frame = env.reset()
                total_rewards = 0.
                total_losses = 0.
                # The state is the last state_len_max frames; start by
                # repeating the first frame.
                state = [frame] * args.state_len_max
                done = False

                while not done:
                    if np.random.rand() < epsilon:
                        action = np.random.randint(args.action_size)
                    else:
                        image_in = np.stack(state, axis=-1)[np.newaxis, ...]
                        action = qnet.get_action(sess, image_in)

                    frame, reward, done, _ = env.step(action)
                    total_rewards += reward
                    state_old = state[:]
                    state.append(frame)
                    if len(state) > args.state_len_max:
                        state = state[1:]

                    #Add to memory (reward clipped to [-1, 1])
                    memory.add([
                        np.stack(state_old, axis=-1)[np.newaxis, ...],
                        action,
                        min(1., max(-1., reward)),
                        np.stack(state, axis=-1)[np.newaxis, ...],
                        done
                    ])

                    #Reduce epsilon
                    if args.ep_greedy_speed == 'slow':
                        epsilon = max(args.epsilon_min,
                                      epsilon - epsilon_rate)
                    elif args.ep_greedy_speed == 'fast':
                        epsilon = args.epsilon_min + (
                            args.epsilon_max - args.epsilon_min) * np.exp(
                                -epsilon_lambda * float(steps))

                    #Train the reconstruction loss
                    if args.train_fe_iterations > 0:
                        args.train_fe_iterations -= (
                            qnet.train_feature_extractor(
                                sess, memory, args.train_fe_batch_size, 10))
                        print(args.train_fe_iterations)

                    if (steps > args.replay_start_size
                            and args.train_fe_iterations <= 0):
                        #Training step
                        batch = np.array(memory.sample(args.batch_size))

                        states = np.concatenate(batch[:, 0], axis=0)
                        actions = batch[:, 1]
                        rewards = batch[:, 2]
                        states1 = np.concatenate(batch[:, 3], axis=0)
                        dones = batch[:, 4]

                        Q1 = qnet.get_Q1(sess, states1, tnet)

                        # --learning-rate is used here as the discount on the
                        # bootstrapped value; terminal transitions keep only
                        # the reward.
                        targetQ = rewards + (
                            1. - dones) * args.learning_rate * np.amax(
                                Q1, keepdims=False, axis=1)

                        l, _, _ = qnet.train(sess, states, actions,
                                             targetQ[..., np.newaxis])
                        total_losses += l

                    #Increase the frame steps counter
                    steps += 1
                    #Check if target network is to be updated
                    if steps % args.target_update_freq == 0:
                        print("Updating target...")
                        sess.run(update_ops)

                # Episode finished. Single-argument print keeps the output
                # identical on Python 2 and Python 3.
                report = ("epoch", epoch, "total rewards", total_rewards,
                          "total losses", total_losses)
                print(' '.join(str(item) for item in report))
    finally:
        # Release the environment even if setup or training raised.
        env.close()
예제 #28
0
def main2():
    """Train ``gated_convolution2`` on random-policy Breakout transitions.

    Loops forever: plays episodes with uniformly random actions, stores
    (frame stack, last ``length`` actions, next frame stack) triples in a
    replay memory, and once the memory holds at least ``batch_size`` entries
    runs one ``run2`` step per environment step, printing the reconstruction
    losses.
    """
    import gym
    import copy
    from utils import Memory
    from utils import process_frame2

    env = gym.make('BreakoutDeterministic-v4')
    num_actions = env.action_space.n
    model = gated_convolution2(shape=[None, 84, 84, 4],
                               nummap=128,
                               numfactors=128,
                               learning_rate=.001,
                               w=8,
                               s=1,
                               a_size=num_actions)
    replay = Memory(50000)
    batch_size = 4
    length = 4
    steps = 1

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            first_frame = process_frame2(env.reset())[..., np.newaxis]

            # Both stacks start as `length` copies of the first frame; -1
            # marks "no action yet".
            # NOTE(review): `state` is never advanced inside the episode, so
            # every stored triple pairs the episode's initial stack with the
            # current one -- confirm whether that is intended.
            state = [first_frame] * length
            state_ = [first_frame] * length
            action = [-1] * length

            done = False
            while not done:
                chosen = np.random.randint(num_actions)
                raw_next, r, done, _ = env.step(chosen)
                next_frame = process_frame2(raw_next)

                # Slide both windows forward by one step.
                state_ = state_[1:] + [next_frame[..., np.newaxis]]
                action = action[1:] + [chosen]

                replay.add([
                    np.concatenate(state, axis=-1)[np.newaxis, ...],
                    np.array(action)[np.newaxis, ...],
                    np.concatenate(state_, axis=-1)[np.newaxis, ...]
                ])

                if len(replay.mem) >= batch_size:
                    batch = replay.sample(batch_size)
                    # Split the sampled triples into per-field arrays; frame
                    # stacks are divided by 255.
                    states = np.concatenate([entry[0] for entry in batch],
                                            axis=0).astype(np.float64) / 255.
                    actions = np.concatenate([entry[1] for entry in batch],
                                             axis=0)
                    states_ = np.concatenate([entry[2] for entry in batch],
                                             axis=0).astype(np.float64) / 255.

                    outputs = model.run2(sess, states, actions, states_)
                    (_, recon_loss, recon_x, recon_y,
                     recon_action_loss) = outputs
                    report = ('steps:', steps, 'recon_loss:', recon_loss,
                              'recon_action_loss', recon_action_loss, 'main2')
                    print(' '.join(str(item) for item in report))

                steps += 1
예제 #29
0
def main():
    """Train a DDPG actor-critic together with conditional-GAN models.

    Per environment step the loop
      1. acts with the source actor plus Ornstein-Uhlenbeck noise and stores
         the transition,
      2. updates critic and actor from a replay batch and soft-updates the
         target networks,
      3. trains the state and reward CGANs ``--K`` times on replay batches,
      4. performs ``--L`` extra actor-critic updates on transitions generated
         by the CGANs from random actions (imagination rollouts).

    ``--state-dim``, ``--action-dim``, ``--input-shape`` and
    ``--action-bound`` are overwritten from the environment after parsing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    parser.add_argument("--input-shape", type=list, default=[None, 1])
    parser.add_argument("--epochs", type=int, default=30000)
    # type=float is required: without it a value given on the command line
    # stays a string and cannot be used in the soft-update arithmetic.
    parser.add_argument('--tau',
                        type=float,
                        help='soft target update parameter',
                        default=0.001)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--gamma", type=float, default=.99)

    parser.add_argument("--K",
                        type=int,
                        default=1,
                        help='The number of steps to train the environment')
    parser.add_argument(
        "--L",
        type=int,
        default=1,
        help='The number of Q-learning steps for hypothetical rollouts')
    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    args = parser.parse_args()

    # Initialize environment
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.input_shape = [None, args.state_dim]
    args.action_dim = env.action_space.shape[0]
    args.action_bound = env.action_space.high
    print(args)

    # Networks (source = trained online, target = slowly tracking copy)
    actor_source = actor(state_shape=[None, args.state_dim],
                         action_shape=[None, args.action_dim],
                         output_bound=args.action_bound[0],
                         scope='actor_source')
    critic_source = critic(state_shape=[None, args.state_dim],
                           action_shape=[None, args.action_dim],
                           scope='critic_source')
    actor_target = actor(state_shape=[None, args.state_dim],
                         action_shape=[None, args.action_dim],
                         output_bound=args.action_bound[0],
                         scope='actor_target')
    critic_target = critic(state_shape=[None, args.state_dim],
                           action_shape=[None, args.action_dim],
                           scope='critic_target')

    # Initialize the GANs: one generates next states, one generates rewards
    cgan_state = CGAN(input_shape=args.input_shape,
                      action_size=args.action_dim,
                      latent_size=args.latent_size,
                      gen_input_shape=args.input_shape,
                      continuous_action=True)
    cgan_reward = CGAN(input_shape=args.input_shape,
                       action_size=args.action_dim,
                       latent_size=args.latent_size,
                       gen_input_shape=[None, 1],
                       continuous_action=True)

    # Update (rate tau) and copy (rate 1) operators
    update_target_actor = update_target_graph2('actor_source', 'actor_target',
                                               args.tau)
    update_target_critic = update_target_graph2('critic_source',
                                                'critic_target', args.tau)

    copy_target_actor = update_target_graph2('actor_source', 'actor_target',
                                             1.)
    copy_target_critic = update_target_graph2('critic_source', 'critic_target',
                                              1.)

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Actor noise
    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(args.action_dim))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Start the target networks as exact copies of the source networks.
        sess.run(copy_target_critic)
        sess.run(copy_target_actor)

        for epoch in range(args.epochs):
            state = env.reset()
            total_rewards = 0.0
            while True:
                # Choose an action
                action = sess.run(
                    actor_source.action,
                    feed_dict={actor_source.states: state[np.newaxis, ...]
                               })[0] + actor_noise()
                # Execute action
                state1, reward, done, _ = env.step(action)
                total_rewards += float(reward)
                # Store tuple in replay memory
                memory.add([
                    state[np.newaxis, ...],
                    action[np.newaxis, ...],
                    reward,
                    state1[np.newaxis, ...],
                    done
                ])

                # Training step: update actor critic using real experience
                batch = np.array(memory.sample(args.batch_size))
                assert len(batch) > 0
                states = np.concatenate(batch[:, 0], axis=0)
                actions = np.concatenate(batch[:, 1], axis=0)
                rewards = batch[:, 2]
                states1 = np.concatenate(batch[:, 3], axis=0)
                dones = batch[:, 4]

                # Update the critic towards r + gamma * Q_target(s', pi_target(s')),
                # with the bootstrap term zeroed on terminal transitions.
                actions1 = sess.run(actor_target.action,
                                    feed_dict={actor_target.states: states1})
                targetQ = np.squeeze(sess.run(
                    critic_target.Q,
                    feed_dict={
                        critic_target.states: states1,
                        critic_target.actions: actions1
                    }),
                                     axis=-1)
                targetQ = rewards + (
                    1. - dones.astype(np.float32)) * args.gamma * targetQ
                targetQ = targetQ[..., np.newaxis]
                _, critic_loss = sess.run(
                    [critic_source.critic_solver, critic_source.loss],
                    feed_dict={
                        critic_source.states: states,
                        critic_source.actions: actions,
                        critic_source.targetQ: targetQ
                    })

                # Update the actor with the critic's gradients
                critic_grads = sess.run(critic_source.grads,
                                        feed_dict={
                                            critic_source.states: states,
                                            critic_source.actions: actions
                                        })[0]
                _ = sess.run(actor_source.opt,
                             feed_dict={
                                 actor_source.states: states,
                                 actor_source.dQ_by_da: critic_grads
                             })

                # Update target networks
                sess.run(update_target_critic)
                sess.run(update_target_actor)

                # Training step: update the environment model using real
                # experience (i.e., update the conditional GANs)
                for _ in range(args.K):
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.concatenate(batch[:, 0], axis=0)
                    actions = np.concatenate(batch[:, 1], axis=0)
                    rewards = batch[:, 2]
                    states1 = np.concatenate(batch[:, 3], axis=0)

                    _, D_loss_state = sess.run(
                        [cgan_state.D_solver, cgan_state.D_loss],
                        feed_dict={
                            cgan_state.states: states,
                            cgan_state.actions: actions,
                            cgan_state.Z: sample_z(len(batch),
                                                   args.latent_size),
                            cgan_state.X: states1
                        })

                    _, G_loss_state = sess.run(
                        [cgan_state.G_solver, cgan_state.G_loss],
                        feed_dict={
                            cgan_state.states: states,
                            cgan_state.actions: actions,
                            cgan_state.Z: sample_z(len(batch),
                                                   args.latent_size)
                        })

                    _, D_loss_reward = sess.run(
                        [cgan_reward.D_solver, cgan_reward.D_loss],
                        feed_dict={
                            cgan_reward.states: states,
                            cgan_reward.actions: actions,
                            cgan_reward.Z: sample_z(len(batch),
                                                    args.latent_size),
                            cgan_reward.X: rewards[..., np.newaxis]
                        })

                    _, G_loss_reward = sess.run(
                        [cgan_reward.G_solver, cgan_reward.G_loss],
                        feed_dict={
                            cgan_reward.states: states,
                            cgan_reward.actions: actions,
                            cgan_reward.Z: sample_z(len(batch),
                                                    args.latent_size)
                        })

                # Training step: update actor critic using imagination rollouts
                for _ in range(args.L):
                    batch = np.array(memory.sample(args.batch_size))
                    # Start from stored next-states and try uniformly random
                    # actions; generated transitions are treated as
                    # non-terminal.
                    states_ = np.concatenate(batch[:, 3], axis=0)
                    actions = np.random.uniform(
                        env.action_space.low[0],
                        env.action_space.high[0],
                        size=(len(batch), env.action_space.shape[0]))
                    dones = np.array([False] * len(batch))

                    G_sample_state = sess.run(
                        cgan_state.G_sample,
                        feed_dict={
                            cgan_state.states: states_,
                            cgan_state.actions: actions,
                            cgan_state.Z: sample_z(len(batch),
                                                   args.latent_size)
                        })
                    G_sample_reward = sess.run(
                        cgan_reward.G_sample,
                        feed_dict={
                            cgan_reward.states: states_,
                            cgan_reward.actions: actions,
                            cgan_reward.Z: sample_z(len(batch),
                                                    args.latent_size)
                        })
                    G_sample_reward = np.squeeze(G_sample_reward, axis=-1)

                    # Update the critic
                    actions1 = sess.run(
                        actor_target.action,
                        feed_dict={actor_target.states: G_sample_state})
                    targetQ = np.squeeze(sess.run(
                        critic_target.Q,
                        feed_dict={
                            critic_target.states: G_sample_state,
                            critic_target.actions: actions1
                        }),
                                         axis=-1)
                    targetQ = G_sample_reward + (
                        1. - dones.astype(np.float32)) * args.gamma * targetQ
                    targetQ = targetQ[..., np.newaxis]
                    _, critic_loss = sess.run(
                        [critic_source.critic_solver, critic_source.loss],
                        feed_dict={
                            critic_source.states: states_,
                            critic_source.actions: actions,
                            critic_source.targetQ: targetQ
                        })

                    # Update the actor with the critic's gradients
                    critic_grads = sess.run(critic_source.grads,
                                            feed_dict={
                                                critic_source.states: states_,
                                                critic_source.actions: actions
                                            })[0]
                    _ = sess.run(actor_source.opt,
                                 feed_dict={
                                     actor_source.states: states_,
                                     actor_source.dQ_by_da: critic_grads
                                 })

                    # Update target networks
                    sess.run(update_target_critic)
                    sess.run(update_target_actor)

                state = np.copy(state1)
                if done:
                    # Single-argument print keeps the output identical on
                    # Python 2 and Python 3.
                    report = ('epoch', epoch, 'total rewards', total_rewards)
                    print(' '.join(str(item) for item in report))
                    break
예제 #30
0
# Module-level training setup: report the compute device, build the actor and
# critic networks, register them with wandb, and create their optimizers and
# the rollout memory.
print(device)

# Both networks are moved to `device` right after construction.
actor = Actor(obs_space=config.obs_space,
              action_space=config.action_space,
              hidden_size=config.hidden_size).to(device)
critic = Critic(obs_space=config.obs_space,
                hidden_size=config.hidden_size).to(device)
# Uncomment to resume from previously saved weights:
# actor.load_state_dict(torch.load('actor_model.h5'))
# critic.load_state_dict(torch.load('critic_model.h5'))

# Register both models with wandb.
wandb.watch(actor)
wandb.watch(critic)

# Separate Adam optimizers so actor and critic can use different learning rates.
optimizer_actor = Adam(actor.parameters(), lr=config.actor_lr)
optimizer_critic = Adam(critic.parameters(), lr=config.critic_lr)
# NOTE(review): Memory is constructed from env.agent_ids -- presumably it keeps
# per-agent storage; confirm against the Memory class.
memory = Memory(env.agent_ids)


def compute_GAE(rewards, state_values, done, gamma, lamb):
    """
        Computes Generalized Advantage Estimations.
    """
    returns = [rewards[-1] + state_values[-1]]
    running_sum = rewards[-1] - state_values[-1]
    for i in reversed(range(len(rewards) - 1)):
        mask = 0 if done[i + 1] else 1
        delta = rewards[i] + gamma * state_values[i +
                                                  1] * mask - state_values[i]
        running_sum = delta + gamma * lamb * running_sum * mask
        returns.insert(0, running_sum + state_values[i])