Example #1
 def __init__(self, num_actions, checkpoint=None):
     self.network, self.trainable_parameters = self.init_network(
         num_actions)
     self.optimizer = torch.optim.Adam(self.trainable_parameters, lr=1e-4)
     self.memory = Memory()
     if checkpoint is not None:
         load_checkpoint(self.network, self.optimizer, checkpoint)
Example #2
    def __init__(self, state_size, action_size, random_seed):
        """
        Args:
        ======
            state_size (int): state dim
            action_size (int): action dim
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # actor net initialization
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # critic net initialization
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck Exploration Noise Process
        self.noise = OUNoise(action_space=action_size, seed=random_seed)

        # Replay memory init
        self.memory = Memory(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
Example #3
    def __init__(self, hidden_size, env):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        self.Actor = Actor(input_size=self.num_states,
                           hidden_size=hidden_size,
                           output_size=self.num_actions).cuda()

        self.Actor_target = Actor(input_size=self.num_states,
                                  hidden_size=hidden_size,
                                  output_size=self.num_actions).cuda()

        self.Critic = Critic(input_size=self.num_states,
                             hidden_size=hidden_size,
                             output_size=self.num_actions).cuda()

        self.Critic_target = Critic(input_size=self.num_states,
                                    hidden_size=hidden_size,
                                    output_size=self.num_actions).cuda()

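        # Hard-copy the online Actor/Critic weights into the target networks at initialization.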
        for target_param, param in zip(self.Actor_target.parameters(),
                                       self.Actor.parameters()):
            target_param.data = param.data

        for target_param, param in zip(self.Critic_target.parameters(),
                                       self.Critic.parameters()):
            target_param.data = param.data

        self.Memory = Memory(30000)
        self.criterion = nn.MSELoss().cuda()
        self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(),
                                                lr=1e-2)
        self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(),
                                                 lr=1e-1)
Example #4
    def __init__(self,
                 model_func,
                 n_way,
                 n_support,
                 jigsaw=False,
                 lbda=0.0,
                 rotation=False,
                 tracking=False,
                 use_bn=True,
                 pretrain=False,
                 image_loader=None,
                 len_dataset=None):
        super(ProtoNet, self).__init__(model_func, n_way, n_support, use_bn,
                                       pretrain)
        self.loss_fn = nn.CrossEntropyLoss()

        self.len_dataset = len_dataset
        self.cuda()
        self.memory = Memory(size=len_dataset, weight=0.5, device='cuda')
        self.memory.initialize(self.feature, image_loader)

        self.jigsaw = jigsaw
        self.rotation = rotation
        self.lbda = lbda
        self.global_count = 0

        self.indx = 0

        if self.jigsaw:

            self.projection_transformed_features = nn.Linear(
                512 * 9, 512)  ### Self-supervision branch

            #self.fc6 = nn.Sequential()
            #self.fc6.add_module('fc6_s1',nn.Linear(512, 512))#for resnet
            #self.fc6.add_module('relu6_s1',nn.ReLU(inplace=True))
            #self.fc6.add_module('drop6_s1',nn.Dropout(p=0.5))

            #self.fc7 = nn.Sequential()
            #self.fc7.add_module('fc7',nn.Linear(9*512,4096))#for resnet
            #self.fc7.add_module('relu7',nn.ReLU(inplace=True))
            #self.fc7.add_module('drop7',nn.Dropout(p=0.5))

            #self.classifier = nn.Sequential()
            #self.classifier.add_module('fc8',nn.Linear(4096, 35))

        if self.rotation:
            self.fc6 = nn.Sequential()
            self.fc6.add_module('fc6_s1', nn.Linear(512, 512))  #for resnet
            self.fc6.add_module('relu6_s1', nn.ReLU(inplace=True))
            self.fc6.add_module('drop6_s1', nn.Dropout(p=0.5))

            self.fc7 = nn.Sequential()
            self.fc7.add_module('fc7', nn.Linear(512, 128))  #for resnet
            self.fc7.add_module('relu7', nn.ReLU(inplace=True))
            self.fc7.add_module('drop7', nn.Dropout(p=0.5))

            self.classifier_rotation = nn.Sequential()
            self.classifier_rotation.add_module('fc8', nn.Linear(128, 4))
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=.9)
    args = parser.parse_args()

    env = gym.make(args.environment)
    unroll = 20

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    agent = direct_policy_search(state_dim, action_dim, action_bound_high,
                                 action_bound_low, unroll, .9, 5,
                                 'direct_policy_search')

    # Replay memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            #env.render()
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store tuple in replay memory
            memory.add([
                np.atleast_2d(state),
                np.atleast_2d(action), reward,
                np.atleast_2d(next_state), done
            ])

            # Training step
            batch = np.array(memory.sample(args.batch_size))
            assert len(batch) > 0
            states = np.concatenate(batch[:, 0], axis=0)

            # Train the agent
            agent.train(sess, states)

            # s <- s'
            state = np.copy(next_state)

            if done:
                print('time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards, 'unroll', unroll)
                epoch += 1
                total_rewards = 0.
                state = env.reset()
Example #6
File: pai.py  Project: wzkwzk123/new_drl
def main2():
    # Initialize environment.
    import gym
    env = gym.make('Pendulum-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    # Initialize agent.
    pai = PAI(environment='Pendulum-v0',
              state_size=state_size,
              action_size=action_size,
              hidden_size=20,
              it_tloop=100,
              it_dyn=5000,
              bs_dyn=100,
              it_policy=1000,
              bs_policy=50,
              K=50,
              T=25,
              action_bound_low=action_bound_low,
              action_bound_high=action_bound_high,
              discount_factor=.9)

    # Initialize replay memory
    memory = Memory(
        400 * 10
    )  #Data from most recent 10 trials (each trial is 400 time steps long).

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(300):
            total_rewards = 0.
            state = env.reset()
            while True:
                action = pai.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_rewards += float(reward)

                # Store tuple in replay memory
                memory.add([
                    np.atleast_2d(state),
                    np.atleast_2d(action), reward,
                    np.atleast_2d(next_state), done
                ])

                # s <- s'
                state = np.copy(next_state)

                if done:
                    print('epoch', epoch, 'total rewards', total_rewards)

                    # Train the agent
                    pai.train(sess, memory)
                    break
Example #7
    def _rollout_with_memory(self,
                             env,
                             network,
                             args,
                             running_state,
                             max_episode_steps,
                             keep_memory=False):
        memory = Memory()
        num_steps = 0
        reward_list = []
        len_list = []
        while num_steps < args.batch_size:
            state = env.reset()
            if args.state_norm:
                state = running_state(state)
            if args.append_time:
                state = np.append(state, 1.0)
            reward_sum = 0
            for t in range(max_episode_steps):
                action_mean, action_std, value = network(
                    Tensor(state).unsqueeze(0))
                action_mean = action_mean[0]
                action_std = action_std[0]
                action, y = network.select_action(action_mean, action_std)
                action_mean = action_mean.data.numpy()
                action = action.data.numpy()
                y = y.data.numpy()
                next_state, reward, done, info = env.step(action)
                reward_sum += reward
                if args.state_norm:
                    next_state = running_state(next_state)
                if args.append_time:
                    next_state = np.append(next_state,
                                           1 - (t + 1) / max_episode_steps)
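                # mask = 0 marks a terminal (or truncated) step, 1 otherwise; the value is stored with the transition.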
                mask = 0 if (done or ((t + 1) == max_episode_steps)) else 1
                memory.push(state, value, action_mean, action, y, mask,
                            next_state, reward)

                if done:
                    break

                state = next_state

            num_steps += (t + 1)
            reward_list.append(reward_sum)
            len_list.append(t + 1)

            meanepreward = np.mean(reward_list)
            meaneplen = np.mean(len_list)

        if keep_memory:
            self.memory = memory
            self.old_std = network.action_std.data
            return meanepreward, meaneplen
        else:
            return memory, meanepreward, meaneplen, num_steps
Example #8
    def __init__(
        self,
        action_dim,
        filters_C,
        kernel_size,
        hidden_R,
        dropout,
        dropout_r,
        Hstep,
        activation,
        is_training_mode,
    ):
        self.policy_clip = 0.2
        self.value_clip = 0.2
        self.entropy_coef = 0.0
        self.vf_loss_coef = 0.5
        self.minibatch = 32
        self.PPO_epochs = 10

        # TODO use predicted results
        action_std = 1.0

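        # Per-dimension variance (action_std ** 2), used as the diagonal covariance of the action distribution.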
        self.cov_mat = tf.fill([action_dim], action_std ** 2)
        self.is_training_mode = is_training_mode

        self.actor = Actor(
            action_dim,
            filters_C,
            kernel_size,
            hidden_R,
            dropout,
            dropout_r,
            activation,
            Hstep,
        )
        self.actor_old = Actor(
            action_dim,
            filters_C,
            kernel_size,
            hidden_R,
            dropout,
            dropout_r,
            activation,
            Hstep,
        )

        self.critic = Critic(
            filters_C, kernel_size, hidden_R, dropout, dropout_r, activation, Hstep
        )
        self.critic_old = Critic(
            filters_C, kernel_size, hidden_R, dropout, dropout_r, activation, Hstep
        )

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4)
        self.memory = Memory()
        self.utils = Utils()
Example #9
class DDPG:
    def __init__(self, env, batch_size=32, gamma=0.99, 
                 hidden_units=32, maxlen=10000, 
                 tau=0.1, actor_lr=0.001, critic_lr=0.001):
        
        self.env=env
        self.batch_size=batch_size
        self.gamma=gamma
        self.maxlen=maxlen
        
        self.sess=tf.Session()
           
        
        self.actor=Actor(env, self.sess, hidden_units, tau, actor_lr)
        self.critic=Critic(env, self.sess, hidden_units, tau, critic_lr)
        self.memory=Memory(maxlen)
        
        self.sess.run(tf.global_variables_initializer())
        
        self.step=0
        
    def store(self, exp):
        self.memory.add(exp)
        
    def update(self, ):
        if len(self.memory.buffer)<1000:#self.batch_size:
            return
        
        self.step+=1
        
        data = self.memory.sample(self.batch_size)
        s=np.array([d[0] for d in data])
        a=np.array([d[1] for d in data])
        r=np.array([d[2] for d in data])
        s_=np.array([d[3] for d in data])
        
        a_=self.actor.target_model.predict(s_)
        target_q=self.critic.target_model.predict([s_, a_])
        #y=np.array([d[2] for d in data])
        #for i in range(self.batch_size):
        #    y[i]+=self.gamma*target_q[i]
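        # TD target: y = r + gamma * Q_target(s', mu_target(s')); note this snippet applies no terminal-state masking.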
        y=r[:,np.newaxis]+self.gamma*target_q   
        self.critic.model.train_on_batch([s, a], y)
        
        action=self.actor.model.predict(s)     
        grads=self.critic.get_grads(s, action)
        self.actor.train(s,grads)
        
        if self.step%10==0:
            self.actor.update_weights()
            self.critic.update_weights()
        
        
    def get_action(self, s):
        return self.actor.get_action(s)
Example #10
 def inference_speed_memory(self, batch_size, seq_length):
     # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
     key = jax.random.PRNGKey(0)
     input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
     @jax.jit
     def ref_step():
         out = self.model(input_ids=input_ids)
         return out[0]
     if jax.local_devices()[0].platform == 'gpu':
         nvml.nvmlInit()
         ref_step().block_until_ready()
         handle = nvml.nvmlDeviceGetHandleByIndex(0)
         meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
         max_bytes_in_use = meminfo.used
         memory = Memory(max_bytes_in_use)
         # shutdown nvml
         nvml.nvmlShutdown()
     else:
         memory = None
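     # Warm-up run so compilation time is excluded from the timed repeats below.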
     timeit.repeat("ref_step().block_until_ready()", repeat=1, number=2,globals=locals())
     if self.jit:
         runtimes = timeit.repeat("ref_step().block_until_ready()", repeat=self.repeat,number=3,globals=locals())
     else:
         with jax.disable_jit():
             runtimes = timeit.repeat("ref_step().block_until_ready()",repeat=self.repeat,number=3,globals=locals())
     return float(np.min(runtimes)/3.0), memory
Example #11
 def __init__(self, env, batch_size=32, gamma=0.99, 
              hidden_units=32, maxlen=10000, 
              tau=0.1, actor_lr=0.001, critic_lr=0.001):
     
     self.env=env
     self.batch_size=batch_size
     self.gamma=gamma
     self.maxlen=maxlen
     
     self.sess=tf.Session()
        
     
     self.actor=Actor(env, self.sess, hidden_units, tau, actor_lr)
     self.critic=Critic(env, self.sess, hidden_units, tau, critic_lr)
     self.memory=Memory(maxlen)
     
     self.sess.run(tf.global_variables_initializer())
     
     self.step=0
Example #12
 def collect_samples(self, batch_size=1):
     memory = Memory()
     num_trajs = (batch_size + args.sample_traj_length -
                  1) // args.sample_traj_length
     onehot_state, multihot_state, continuous_state = self.reset(num_trajs)
     for walk_step in range(self.max_traj_length - 1):
         with torch.no_grad():
             onehot_action, multihot_action, continuous_action, next_onehot_state, next_multihot_state, next_continuous_state, old_log_prob = self.step(
                 onehot_state, multihot_state, continuous_state, num_trajs)
         # Currently we assume the exploration step is not done until it reaches max_traj_length.
         mask = torch.ones((num_trajs, 1), device=device)
         memory.push(onehot_state.type(FloatTensor),
                     multihot_state.type(FloatTensor), continuous_state,
                     onehot_action.type(FloatTensor),
                     multihot_action.type(FloatTensor), continuous_action,
                     next_onehot_state.type(FloatTensor),
                     next_multihot_state.type(FloatTensor),
                     next_continuous_state, old_log_prob, mask)
         onehot_state, multihot_state, continuous_state = next_onehot_state, next_multihot_state, next_continuous_state
     # one more step for push done
     with torch.no_grad():
         onehot_action, multihot_action, continuous_action, next_onehot_state, next_multihot_state, next_continuous_state, old_log_prob = self.step(
             onehot_state, multihot_state, continuous_state, num_trajs)
         mask = torch.zeros((num_trajs, 1), device=device)
         memory.push(onehot_state.type(FloatTensor),
                     multihot_state.type(FloatTensor), continuous_state,
                     onehot_action.type(FloatTensor),
                     multihot_action.type(FloatTensor), continuous_action,
                     next_onehot_state.type(FloatTensor),
                     next_multihot_state.type(FloatTensor),
                     next_continuous_state, old_log_prob, mask)
     return memory, num_trajs
Example #13
 def collect_samples(self, mini_batch_size, size=1):
     num_step = 0
     memory = Memory()
     while num_step < mini_batch_size:
         discrete_state, continuous_state = self.reset(size)
         for walk_step in range(self.max_traj_length - 1):
             with torch.no_grad():
                 discrete_action, continuous_action, next_discrete_state, next_continuous_state, old_log_prob = self.step(
                     discrete_state, continuous_state, size)
             # Currently we assume the exploration step is not done until it reaches max_traj_length.
             mask = torch.ones((size, 1), device=device)
             memory.push(discrete_state.type(FloatTensor), continuous_state,
                         discrete_action.type(FloatTensor),
                         continuous_action,
                         next_discrete_state.type(FloatTensor),
                         next_continuous_state, old_log_prob, mask)
             discrete_state, continuous_state = next_discrete_state, next_continuous_state
             num_step += 1
             if num_step >= mini_batch_size:
                 return memory
         # one more step for push done
         with torch.no_grad():
             discrete_action, continuous_action, next_discrete_state, next_continuous_state, old_log_prob = self.step(
                 discrete_state, continuous_state, size)
             mask = torch.zeros((size, 1), device=device)
             memory.push(discrete_state.type(FloatTensor), continuous_state,
                         discrete_action.type(FloatTensor),
                         continuous_action,
                         next_discrete_state.type(FloatTensor),
                         next_continuous_state, old_log_prob, mask)
             num_step += 1
     return memory
Example #14
    def train_speed_memory(self, batch_size, seq_length):
        key = jax.random.PRNGKey(0)
        input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
        targets = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
        labels = jax.random.randint(key, (batch_size, seq_length), 0, 2)
        # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
        # targets = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
        # labels = np.random.randint(0,2, (batch_size, seq_length))
        @jax.jit
        def train_step():

            def loss_fn(params):
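                # Positions with label 0 are treated as padding and excluded from the loss normalization.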
                token_mask = jnp.where(labels > 0, 1.0, 0.0).astype(self.dtype)
                logits = self.model(input_ids=input_ids, train=True, params=params, dropout_rng=jax.random.PRNGKey(0))[0]
                loss, normalizing_factor = cross_entropy(logits,targets, token_mask)
                jax.profiler.save_device_memory_profile(f"memory/{workload[0]}_{workload[1]}_memory.prof", "gpu")
                return loss / normalizing_factor
            if self.fp16 and jax.local_devices()[0].platform == 'gpu':
                grad_fn = self.dynamic_scale.value_and_grad(loss_fn)
                dyn_scale, is_fin, loss, grad = grad_fn(self.model.params)
            else:
                grad_fn = jax.value_and_grad(loss_fn)
                loss, grad = grad_fn(self.model.params)
            return tree_flatten(grad)[0]


        if jax.local_devices()[0].platform == 'gpu':
            nvml.nvmlInit()
            train_step()
            handle = nvml.nvmlDeviceGetHandleByIndex(0)
            meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
            max_bytes_in_use = meminfo.used
            memory = Memory(max_bytes_in_use)
            # shutdown nvml
            nvml.nvmlShutdown()
        else:
            memory = None
        # timeit.repeat(train_step,repeat=1,number=2)
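        # Warm-up run so JIT compilation is not included in the measured repeats.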
        timeit.repeat("for i in train_step():i.block_until_ready()", repeat=1, number=2,globals=locals())
        if self.jit:
            # runtimes = timeit.repeat(train_step,repeat=self.repeat,number=3)
            runtimes = timeit.repeat("for i in train_step():i.block_until_ready()", repeat=self.repeat, number=3,globals=locals())
        else:
            with jax.disable_jit():
                # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
                runtimes = timeit.repeat("for i in train_step():i.block_until_ready()", repeat=self.repeat, number=3,globals=locals())


        return float(np.min(runtimes)/3.0), memory
Example #15
    def train_feature_extractor(self, sess, replay_buffer, batch_size=100, iterations=1, iterations_left=-1):
        # Lazily create the patch buffer on first use.
        try:
            self.buff
        except AttributeError:
            self.buff = Memory(10000)

        for it in range(iterations):
            while len(self.buff.mem) < batch_size:
                state = replay_buffer.sample(1)[0][0]

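                # Slice the sampled state into w x w patches with stride s (sliding window over height and width).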
                patches = []
                for i in range(0, state.shape[1]-self.w+1, self.s):
                    for j in range(0, state.shape[2]-self.w+1, self.s):
                        patches.append(state[0, i:i+self.w, j:j+self.w, :])
                        assert patches[-1].shape[0] == patches[-1].shape[1]
                        assert patches[-1].shape[0] == self.w
                from random import shuffle
                shuffle(patches)
                self.buff.mem += patches

            batch = self.buff.mem[:batch_size]
            self.buff.mem = self.buff.mem[batch_size:]

            batch = np.concatenate([b[np.newaxis, ...] for b in batch], axis=0)
            batch = self.process_states(batch)
            batch = [b.astype(np.float64) / 255. for b in batch]
            feed_dict = {}
            for i in range(self.no_inputs):
                feed_dict[self.params[i]['input']] = batch[i]

            _, recon_loss, = sess.run([self.update_model_recon,
                                       self.recon_loss],
                                       feed_dict=feed_dict)
                                                  
            print("train_feature_extractor - recon_loss:", recon_loss)

########################################################################################################################
            if iterations_left <= 10:
                variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                import pickle
                recon_x, recon_y, conv1, conv2 = sess.run([self.params[0]['recon'], self.params[1]['recon'], variables[2], variables[4]], feed_dict=feed_dict)
                pickle.dump( [conv1, conv2, batch, recon_x, recon_y], open( "recons.p", "wb" ) )
########################################################################################################################


        return iterations
Example #16
    def __init__(self, stack_size=256, *args, **kwargs):
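        # Machine state for the interpreter: an operand stack, a bytes-backed Memory, and key-value storage.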
        self.stack = Stack(size=stack_size)
        self.memory = Memory(memory=kwargs.get('memory', b''))
        self.storage = dict()

        self.args = args
        self.kwargs = kwargs

        self.opcode = None
        self.current_opcode_name = None
        self.code = None
        self.msg = None

        self.debug = False
        self.pc = 0
        self.prev_pc = -1
        self.stop = False
Example #17
    def train_feature_extractor(self, sess, replay_buffer, batch_size=100, iterations=1):
        # Lazily create the patch buffer on first use.
        try:
            self.buff
        except AttributeError:
            self.buff = Memory(10000)

        for it in range(iterations):
            while len(self.buff.mem) < batch_size:
                state = replay_buffer.sample(1)[0][0]
                state = state.astype(np.float64)
                state = state / 255.

                patches = []
                for i in range(0, state.shape[1]-self.w+1, self.s):
                    for j in range(0, state.shape[2]-self.w+1, self.s):
                        patches.append(state[0, i:i+self.w, j:j+self.w, :])
                        assert patches[-1].shape[0] == patches[-1].shape[1]
                        assert patches[-1].shape[0] == self.w
                from random import shuffle
                shuffle(patches)
                self.buff.mem += patches

            batch = self.buff.mem[:batch_size]
            self.buff.mem = self.buff.mem[batch_size:]

            _, recon_loss, = sess.run([self.update_model_recon,
                                       self.recon_loss],
                                       feed_dict={self.x:batch,
                                                  self.y:batch})
            print("train_feature_extractor - recon_loss:", recon_loss)

########################################################################################################################
        import pickle
        recon_x, recon_y = sess.run([self.recon_x_, self.recon_y_], feed_dict={self.x:batch, self.y:batch})
        pickle.dump( [batch, recon_x, recon_y], open( "recons.p", "wb" ) )
########################################################################################################################

        return iterations
Example #18
def main():
    import gym
    import sys
    import copy
    sys.path.append('../..')
    from utils import Memory

    #env = gym.make('LunarLander-v2')
    env = gym.make('Pendulum-v0')
    #env = gym.make('CartPole-v0')
    mem = Memory(1000000)
    batch_size = 32
    try:
        a_size = env.action_space.n
        a_type = 'discrete'
    except AttributeError:
        try:
            a_size = env.action_space.shape[0]
            a_type = 'continuous'
        except Exception:
            raise ValueError('Cannot find action size.')
    emg = gated_env_modeler(s_shape=[None, env.observation_space.shape[0]],
                            a_size=a_size,
                            out_shape=[None, env.observation_space.shape[0]],
                            a_type=a_type,
                            numfactors=256)
    #emg = gated_env_modeler(s_shape=[None, env.observation_space.shape[0]], a_size=a_size, out_shape=[None, 1], a_type=a_type, numfactors=256)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            s = env.reset()

            done = False
            while not done:
                #env.render()
                #a = np.random.randint(a_size)
                a = random_action(a_size, a_type)
                s_, r, done, _ = env.step(a)

                mem.add([s, a, r, s_, done])
                batch = mem.sample(batch_size)
                if len(batch) == batch_size:
                    states = []
                    actions = []
                    rewards = []
                    states_ = []
                    for i in range(batch_size):
                        states.append(batch[i][0])
                        actions.append(batch[i][1])
                        rewards.append(batch[i][2])
                        states_.append(batch[i][3])

                    states = np.stack(states, axis=0)
                    actions = np.stack(actions, axis=0)
                    rewards = np.stack(rewards, axis=0)
                    states_ = np.stack(states_, axis=0)

                    #_, loss_s, loss_a, loss_s_, loss = sess.run([emg.update_model, emg.loss_s, emg.loss_a, emg.loss_s_, emg.loss], feed_dict={emg.states:states, emg.states_:rewards[..., np.newaxis], emg.actions_placeholder:actions})
                    _, loss_s, loss_a, loss_s_, loss = sess.run(
                        [
                            emg.update_model, emg.loss_s, emg.loss_a,
                            emg.loss_s_, emg.loss
                        ],
                        feed_dict={
                            emg.states: states,
                            emg.states_: states_,
                            emg.actions_placeholder: actions
                        })
                    print('loss_s', loss_s, 'loss_a', loss_a, 'loss_s_', loss_s_, 'loss', loss)

                s = copy.deepcopy(s_)
                if done:
                    break
Example #19
class ProtoNet(MetaTemplate):
    def __init__(self,
                 model_func,
                 n_way,
                 n_support,
                 jigsaw=False,
                 lbda=0.0,
                 rotation=False,
                 tracking=False,
                 use_bn=True,
                 pretrain=False,
                 image_loader=None,
                 len_dataset=None):
        super(ProtoNet, self).__init__(model_func, n_way, n_support, use_bn,
                                       pretrain)
        self.loss_fn = nn.CrossEntropyLoss()

        self.len_dataset = len_dataset
        self.cuda()
        self.memory = Memory(size=len_dataset, weight=0.5, device='cuda')
        self.memory.initialize(self.feature, image_loader)

        self.jigsaw = jigsaw
        self.rotation = rotation
        self.lbda = lbda
        self.global_count = 0

        self.indx = 0

        if self.jigsaw:

            self.projection_transformed_features = nn.Linear(
                512 * 9, 512)  ### Self-supervision branch

            #self.fc6 = nn.Sequential()
            #self.fc6.add_module('fc6_s1',nn.Linear(512, 512))#for resnet
            #self.fc6.add_module('relu6_s1',nn.ReLU(inplace=True))
            #self.fc6.add_module('drop6_s1',nn.Dropout(p=0.5))

            #self.fc7 = nn.Sequential()
            #self.fc7.add_module('fc7',nn.Linear(9*512,4096))#for resnet
            #self.fc7.add_module('relu7',nn.ReLU(inplace=True))
            #self.fc7.add_module('drop7',nn.Dropout(p=0.5))

            #self.classifier = nn.Sequential()
            #self.classifier.add_module('fc8',nn.Linear(4096, 35))

        if self.rotation:
            self.fc6 = nn.Sequential()
            self.fc6.add_module('fc6_s1', nn.Linear(512, 512))  #for resnet
            self.fc6.add_module('relu6_s1', nn.ReLU(inplace=True))
            self.fc6.add_module('drop6_s1', nn.Dropout(p=0.5))

            self.fc7 = nn.Sequential()
            self.fc7.add_module('fc7', nn.Linear(512, 128))  #for resnet
            self.fc7.add_module('relu7', nn.ReLU(inplace=True))
            self.fc7.add_module('drop7', nn.Dropout(p=0.5))

            self.classifier_rotation = nn.Sequential()
            self.classifier_rotation.add_module('fc8', nn.Linear(128, 4))

    def train_loop(self,
                   epoch,
                   train_loader,
                   optimizer,
                   writer,
                   base_loader_u=None):

        print_freq = 10
        avg_loss = 0
        avg_loss_proto = 0
        avg_loss_jigsaw = 0
        avg_loss_rotation = 0

        if base_loader_u is not None:

            for i, inputs in enumerate(zip(train_loader,
                                           cycle(base_loader_u))):
                self.global_count += 1
                x = inputs[0][0]
                self.n_query = x.size(1) - self.n_support
                if self.change_way:
                    self.n_way = x.size(0)
                optimizer.zero_grad()
                loss_proto, acc = self.set_forward_loss(x)
                if self.jigsaw:
                    #loss_jigsaw, acc_jigsaw = self.set_forward_loss_unlabel(inputs[1][2], inputs[1][3],x)# torch.Size([5, 21, 9, 3, 75, 75]), torch.Size([5, 21])
                    loss_jigsaw = self.set_forward_loss_unlabel(
                        inputs[1][2], inputs[1][3], x
                    )  # torch.Size([5, 21, 9, 3, 64, 64]), torch.Size([5, 21])
                    loss = (1.0 -
                            self.lbda) * loss_proto + self.lbda * loss_jigsaw
                    writer.add_scalar('train/loss_proto',
                                      float(loss_proto.data.item()),
                                      self.global_count)
                    writer.add_scalar('train/loss_jigsaw',
                                      float(loss_jigsaw.data.item()),
                                      self.global_count)
                elif self.rotation:
                    loss_rotation, acc_rotation = self.set_forward_loss_unlabel(
                        inputs[1][2], inputs[1][3], x
                    )  # torch.Size([5, 21, 9, 3, 75, 75]), torch.Size([5, 21])
                    loss = (1.0 -
                            self.lbda) * loss_proto + self.lbda * loss_rotation
                    writer.add_scalar('train/loss_proto',
                                      float(loss_proto.data.item()),
                                      self.global_count)
                    writer.add_scalar('train/loss_rotation',
                                      float(loss_rotation.data.item()),
                                      self.global_count)
                else:
                    loss = loss_proto
                loss.backward()
                optimizer.step()
                avg_loss = avg_loss + loss.data
                writer.add_scalar('train/loss', float(loss.data.item()),
                                  self.global_count)

                if self.jigsaw:
                    avg_loss_proto += loss_proto.data
                    avg_loss_jigsaw += loss_jigsaw.data
                    writer.add_scalar('train/acc_proto', acc,
                                      self.global_count)
                    # writer.add_scalar('train/acc_jigsaw', acc_jigsaw, self.global_count)  # acc_jigsaw is not returned by set_forward_loss_unlabel here
                elif self.rotation:
                    avg_loss_proto += loss_proto.data
                    avg_loss_rotation += loss_rotation.data
                    writer.add_scalar('train/acc_proto', acc,
                                      self.global_count)
                    writer.add_scalar('train/acc_rotation', acc_rotation,
                                      self.global_count)

                if (i + 1) % print_freq == 0:
                    if self.jigsaw:
                        print('Epoch {:d} | Batch {:d}/{:d} | Loss {:f} | Loss Proto {:f} | Loss Jigsaw {:f}'.\
                            format(epoch, i+1, len(train_loader), avg_loss/float(i+1), avg_loss_proto/float(i+1), avg_loss_jigsaw/float(i+1)))
                    elif self.rotation:
                        print('Epoch {:d} | Batch {:d}/{:d} | Loss {:f} | Loss Proto {:f} | Loss Rotation {:f}'.\
                            format(epoch, i+1, len(train_loader), avg_loss/float(i+1), avg_loss_proto/float(i+1), avg_loss_rotation/float(i+1)))
                    else:
                        print(
                            'Epoch {:d} | Batch {:d}/{:d} | Loss {:f}'.format(
                                epoch, i + 1, len(train_loader),
                                avg_loss / float(i + 1)))
        else:
            #### This branch is used
            self.memory.update_weighted_count()
            self.indx = 0
            for i, inputs in enumerate(train_loader):

                self.global_count += 1
                x = inputs[0]  ### [5,21,3,224,224]
                self.n_query = x.size(1) - self.n_support
                if self.change_way:
                    self.n_way = x.size(0)
                optimizer.zero_grad()
                loss_proto, acc = self.set_forward_loss(x)
                if self.jigsaw:
                    #  print(x.size(), inputs[2].size(), inputs[3].size())
                    loss_jigsaw = self.set_forward_loss_unlabel(
                        x, inputs[2], inputs[3]
                    )  # torch.Size([5, 21, 9, 3, 64, 64]), torch.Size([5, 21])
                    loss = (1.0 -
                            self.lbda) * loss_proto + self.lbda * loss_jigsaw
                    writer.add_scalar('train/loss_proto',
                                      float(loss_proto.data.item()),
                                      self.global_count)
                    writer.add_scalar('train/loss_jigsaw',
                                      float(loss_jigsaw.data.item()),
                                      self.global_count)
                elif self.rotation:
                    loss_rotation, acc_rotation = self.set_forward_loss_unlabel(
                        inputs[2], inputs[3], x
                    )  # torch.Size([5, 21, 9, 3, 75, 75]), torch.Size([5, 21])
                    loss = (1.0 -
                            self.lbda) * loss_proto + self.lbda * loss_rotation
                    writer.add_scalar('train/loss_proto',
                                      float(loss_proto.data.item()),
                                      self.global_count)
                    writer.add_scalar('train/loss_rotation',
                                      float(loss_rotation.data.item()),
                                      self.global_count)
                else:
                    loss = loss_proto
                loss.backward()
                optimizer.step()
                avg_loss = avg_loss + loss.item()
                writer.add_scalar('train/loss', float(loss.data.item()),
                                  self.global_count)

                if self.jigsaw:
                    avg_loss_proto += loss_proto.data
                    avg_loss_jigsaw += loss_jigsaw.data
                    writer.add_scalar('train/acc_proto', acc,
                                      self.global_count)
                    # writer.add_scalar('train/acc_jigsaw', acc_jigsaw, self.global_count)
                elif self.rotation:
                    avg_loss_proto += loss_proto.data
                    avg_loss_rotation += loss_rotation.data
                    writer.add_scalar('train/acc_proto', acc,
                                      self.global_count)
                    writer.add_scalar('train/acc_rotation', acc_rotation,
                                      self.global_count)

                if (i + 1) % print_freq == 0:
                    #print(optimizer.state_dict()['param_groups'][0]['lr'])
                    if self.jigsaw:
                        print('Epoch {:d} | Batch {:d}/{:d} | Loss {:f} | Loss Proto {:f} | Loss Jigsaw {:f}'.\
                            format(epoch, i+1, len(train_loader), avg_loss/float(i+1), avg_loss_proto/float(i+1), avg_loss_jigsaw/float(i+1)))
                    elif self.rotation:
                        print('Epoch {:d} | Batch {:d}/{:d} | Loss {:f} | Loss Proto {:f} | Loss Rotation {:f}'.\
                            format(epoch, i+1, len(train_loader), avg_loss/float(i+1), avg_loss_proto/float(i+1), avg_loss_rotation/float(i+1)))
                    else:
                        print(
                            'Epoch {:d} | Batch {:d}/{:d} | Loss {:f}'.format(
                                epoch, i + 1, len(train_loader),
                                avg_loss / float(i + 1)))
                self.indx += 105

    def test_loop(self, test_loader, record=None):
        # breakpoint()
        correct = 0
        count = 0
        acc_all = []
        acc_all_jigsaw = []
        acc_all_rotation = []

        iter_num = len(test_loader)
        for i, inputs in enumerate(test_loader):
            x = inputs[0]
            self.n_query = x.size(1) - self.n_support
            if self.change_way:
                self.n_way = x.size(0)

            if self.jigsaw:
                # correct_this, correct_this_jigsaw, count_this, count_this_jigsaw = self.correct(x, inputs[2], inputs[3])
                correct_this, count_this = self.correct(x)
            elif self.rotation:
                correct_this, correct_this_rotation, count_this, count_this_rotation = self.correct(
                    x, inputs[2], inputs[3])
            else:
                correct_this, count_this = self.correct(x)
            acc_all.append(correct_this / count_this * 100)
            # if self.jigsaw:
            #    acc_all_jigsaw.append(correct_this_jigsaw/ count_this_jigsaw*100)
            # elif self.rotation:
            #    acc_all_rotation.append(correct_this_rotation/ count_this_rotation*100)

        acc_all = np.asarray(acc_all)
        acc_mean = np.mean(acc_all)
        acc_std = np.std(acc_all)
        print('%d Test Protonet Acc = %4.2f%% +- %4.2f%%' %
              (iter_num, acc_mean, 1.96 * acc_std / np.sqrt(iter_num)))
        if self.jigsaw:
            # acc_all_jigsaw  = np.asarray(acc_all_jigsaw)
            # acc_mean_jigsaw = np.mean(acc_all_jigsaw)
            # acc_std_jigsaw  = np.std(acc_all_jigsaw)
            # print('%d Test Jigsaw Acc = %4.2f%% +- %4.2f%%' %(iter_num,  acc_mean_jigsaw, 1.96* acc_std_jigsaw/np.sqrt(iter_num)))
            #return acc_mean, acc_mean_jigsaw
            return acc_mean
        elif self.rotation:
            acc_all_rotation = np.asarray(acc_all_rotation)
            acc_mean_rotation = np.mean(acc_all_rotation)
            acc_std_rotation = np.std(acc_all_rotation)
            print('%d Test Rotation Acc = %4.2f%% +- %4.2f%%' %
                  (iter_num, acc_mean_rotation,
                   1.96 * acc_std_rotation / np.sqrt(iter_num)))
            return acc_mean, acc_mean_rotation
        else:
            return acc_mean

    def correct(self, x, patches=None, patches_label=None):

        scores = self.set_forward(x)
        #if self.jigsaw:
        #    x_, y_ = self.set_forward_unlabel(patches=patches,patches_label=patches_label)
        #elif self.rotation:
        #    x_, y_ = self.set_forward_unlabel(patches=patches,patches_label=patches_label)
        y_query = np.repeat(range(self.n_way), self.n_query)

        topk_scores, topk_labels = scores.data.topk(1, 1, True, True)
        topk_ind = topk_labels.cpu().numpy()
        top1_correct = np.sum(topk_ind[:, 0] == y_query)

        return float(top1_correct), len(y_query)

        #if self.jigsaw:
        #    pred = torch.max(x_,1)
        #    top1_correct_jigsaw = torch.sum(pred[1] == y_)
        #    return float(top1_correct), float(top1_correct_jigsaw), len(y_query), len(y_)
        #elif self.rotation:
        #    pred = torch.max(x_,1)
        #    top1_correct_rotation = torch.sum(pred[1] == y_)
        #    return float(top1_correct), float(top1_correct_rotation), len(y_query), len(y_)
        #else:
        #    return float(top1_correct), len(y_query)

    def set_forward(self, x, is_feature=False):

        z_support, z_query = self.parse_feature(x, is_feature)

        z_support = z_support.contiguous()
        z_proto = z_support.view(self.n_way, self.n_support, -1).mean(
            1)  #the shape of z is [n_data, n_dim]
        z_query = z_query.contiguous().view(self.n_way * self.n_query, -1)

        dists = euclidean_dist(z_query, z_proto)
        scores = -dists
        return scores

    def set_forward_unlabel(self, patches=None, patches_label=None):

        # print(patches.size())
        if len(patches.size()) == 6:
            patches_support = patches[:, :self.n_support]  ###support pathces
            Way, S, T, C, H, W = patches_support.size(
            )  #torch.Size([5, 5, 9, 3, 64, 64]) ###new
            B = Way * S
        elif len(patches.size()) == 5:
            B, T, C, H, W = patches.size()  #torch.Size([5, 15, 9, 3, 75, 75])
        if self.jigsaw:
            patches_support = patches_support.reshape(
                B * T, C, H, W).cuda()  #torch.Size([225, 3, 64, 64]) ###new
            if self.dual_cbam:
                patch_feat = self.feature(patches_support,
                                          jigsaw=True)  #torch.Size([225, 512])
            else:
                patch_feat = self.feature(
                    patches_support)  #torch.Size([225, 512])

            x_ = patch_feat.view(B, T, -1)  ### [25,9,512]
            x_ = x_[:, torch.randperm(x_.size()[1])]
            x_ = x_.view(B, -1)  #[25,4608] ###new
            v_t = self.projection_transformed_features(x_)  ### [25,512]
            v_t = v_t.view(self.n_way, self.n_way, -1)  ### [5,5,512]

            #x_ = x_.transpose(0,1)#torch.Size([9, 75, 512])

            #x_list = []
            #for i in range(9):
            #    z = self.fc6(x_[i])#torch.Size([75, 512])
            #    z = z.view([B,1,-1])#torch.Size([75, 1, 512])
            #    x_list.append(z)

            #x_ = torch.cat(x_list,1)#torch.Size([75, 9, 512])
            #x_ = (x_.view(B,-1))#torch.Size([105, 9*512])
            #x_=  self.projection_transformed_features(x_) # [105,512]
            #x_ = self.classifier(x_)

            #y_ = patches_label.view(-1).cuda()

            return v_t
        elif self.rotation:
            patches = patches.view(B * T, C, H, W).cuda()
            x_ = self.feature(patches)  #torch.Size([64, 512, 1, 1])
            x_ = x_.squeeze()
            x_ = self.fc6(x_)
            x_ = self.fc7(x_)  #64,128
            x_ = self.classifier_rotation(x_)  #64,4
            pred = torch.max(x_, 1)
            y_ = patches_label.view(-1).cuda()
            return x_, y_

    def set_forward_loss(self, x):

        y_query = torch.from_numpy(np.repeat(range(self.n_way), self.n_query))

        scores = self.set_forward(x)

        topk_scores, topk_labels = scores.data.topk(1, 1, True, True)
        topk_ind = topk_labels.cpu().numpy()
        acc = np.sum(topk_ind[:, 0] == y_query.numpy()) / len(y_query.numpy())
        y_query = Variable(y_query.cuda())

        return self.loss_fn(scores, y_query), acc

    def contrastive_loss(self, original_features, patch_features, negative_nb,
                         index):  ###new
        loss = 0
        # rng = np.random.default_rng()
        # print(z_support.size())

        #negatives = torch.empty(5,20,512)
        #negatives[0] = torch.cat((z_support[1], z_support[2], z_support[3], z_support[4]))
        #negatives[1] = torch.cat((z_support[0], z_support[2], z_support[3], z_support[4]))
        #negatives[2] = torch.cat((z_support[0], z_support[1], z_support[3], z_support[4]))
        #negatives[3] = torch.cat((z_support[0], z_support[1], z_support[2], z_support[4]))
        #negatives[4] = torch.cat((z_support[0], z_support[1], z_support[2], z_support[3]))

        for i in range(original_features.shape[0]):

            temp = 0.07
            cos = torch.nn.CosineSimilarity()
            criterion = torch.nn.CrossEntropyLoss()

            ### Obtaining negative images N=20

            # Index=np.array(range(0,original_features.shape[0])) ### [,25]
            # Index=np.delete(Index,i) ### [,24]
            # numbers = rng.choice(24, size=negative_nb, replace=False) # [1,20]

            #for j in range(negative_nb):
            # if(j==1):
            #  negative=z_support[Index[numbers[j]]]
            # else:
            #  negative=torch.cat((negative,z_support[Index[numbers[j]]]))

            ### Negative should have a size of [20,512]

            # negative = negatives[i//5]

            negative = self.memory.return_random(size=negative_nb,
                                                 index=[index[i]])
            negative = torch.Tensor(negative).to('cuda').detach()

            image_to_modification_similarity = cos(
                original_features[None, i, :],
                patch_features[None, i, :]) / temp  ### [,1]
            matrix_of_similarity = cos(patch_features[None, i, :],
                                       negative) / temp  ### [,20]

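            # InfoNCE-style loss: the positive-pair similarity sits at index 0, so cross-entropy
            # with target 0 pushes it above the similarities to the sampled negatives.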
            similarities = torch.cat(
                (image_to_modification_similarity, matrix_of_similarity))
            loss += criterion(similarities[None, :],
                              torch.tensor([0]).to('cuda'))

        return loss / original_features.shape[0]

    def set_forward_loss_unlabel(self,
                                 x,
                                 patches=None,
                                 patches_label=None):  ###new

        if self.jigsaw:

            #x_, y_ = self.set_forward_unlabel(patches=patches,patches_label=patches_label)
            #pred = torch.max(x_,1)
            #acc_jigsaw = torch.sum(pred[1] == y_).cpu().numpy()*1.0/len(y_)
            #x = x.contiguous().view( self.n_way * (self.n_support + self.n_query), *x.size()[2:])

            v_t = self.set_forward_unlabel(
                patches=patches, patches_label=patches_label)  ###new [5,5,512]
            v_t = v_t.view(25, -1)  ###new [25,512]

            z_support, z_query = self.parse_feature(x,
                                                    is_feature=False)  ###new
            v = z_support  ###new [5,5,512]
            # print(v[0][0])
            v = v.reshape(-1, 512)
            # print(v[0])
            # print(v.size())
            # v=v.view(25,-1) ###new [25,512]

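            # Memory indices of the support images: the first 5 entries of each class's block of
            # 21 images, shifted by self.indx (which advances by 105 = 5 * 21 per batch).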
            indxs = [
                i + self.indx for i in [
                    0, 1, 2, 3, 4, 21, 22, 23, 24, 25, 42, 43, 44, 45, 46, 63,
                    64, 65, 66, 67, 84, 85, 86, 87, 88
                ]
            ]
            representations = self.memory.return_representations(indxs).to(
                'cuda').detach()

            negative_nb = 2000
            loss_weight = 0.5
            loss_1 = self.contrastive_loss(representations, v_t, negative_nb,
                                           indxs)
            loss_2 = self.contrastive_loss(representations, v, negative_nb,
                                           indxs)
            loss = loss_weight * loss_1 + (1 - loss_weight) * loss_2

            self.memory.update(indxs, v.detach().cpu().numpy())

        elif self.rotation:
            x_, y_ = self.set_forward_unlabel(patches=patches,
                                              patches_label=patches_label)
            pred = torch.max(x_, 1)
            acc_rotation = torch.sum(
                pred[1] == y_).cpu().numpy() * 1.0 / len(y_)

        if self.jigsaw:
            return loss

        elif self.rotation:
            return self.loss_fn(x_, y_), acc_rotation

    def parse_feature(self, x, is_feature):

        x = Variable(x.cuda())
        if is_feature:
            z_all = x
        else:
            x = x.contiguous().view(
                self.n_way * (self.n_support + self.n_query),
                *x.size()[2:])
            z_all = self.feature(x)
            z_all = z_all.view(self.n_way, self.n_support + self.n_query, -1)
        z_support = z_all[:, :self.n_support]
        z_query = z_all[:, self.n_support:]

        return z_support, z_query
Example #20
class ActorCriticAgent:
    """ Advantage Actor Critic agent """
    def __init__(self, num_actions, checkpoint=None):
        self.network, self.trainable_parameters = self.init_network(
            num_actions)
        self.optimizer = torch.optim.Adam(self.trainable_parameters, lr=1e-4)
        self.memory = Memory()
        if checkpoint is not None:
            load_checkpoint(self.network, self.optimizer, checkpoint)

    def init_network(self, num_actions):

        network = {'actor_critic': ActorCritic(num_actions)}
        trainable_parameters = list(network['actor_critic'].parameters())
        return network, trainable_parameters

    def play(self,
             environment,
             max_games=1,
             max_steps=500,
             train=False,
             verbose=False,
             recorder=None):

        n_steps = 0
        n_games = 0
        current_game_infos = {
            'game': n_games + 1,
            'reward': 0,
            'game_duration': 0
        }
        observation = environment.reset()
        if recorder is not None:
            recorder.reset()
            recorder.record(environment)

        while (n_steps < max_steps) and (n_games < max_games):

            self.init_rollout(observation)
            for rollout_step in range(20):

                value, log_policy, action = self.network['actor_critic'](
                    observation)
                self.memory.append({
                    'value': value,
                    'log_policy': log_policy,
                    'action': action
                })

                observation, extrinsic_reward, is_game_over, infos = environment.step(
                    action.numpy()[0])
                if recorder is not None:
                    recorder.record(environment)

                reward = self.get_reward(observation, extrinsic_reward)
                self.memory.append({'reward': reward})

                current_game_infos['reward'] += reward
                current_game_infos['game_duration'] += 1
                n_steps += 1

                if is_game_over:
                    n_games += 1
                    print(current_game_infos)
                    current_game_infos = {
                        'game': n_games + 1,
                        'reward': 0,
                        'game_duration': 0
                    }
                    observation = environment.reset()
                    break

            self.end_rollout(observation, is_game_over)
            if verbose:
                print(current_game_infos)

            if train:
                loss = self.compute_loss()
                self.backpropagate(loss)

        if recorder is not None:
            recorder.stop()

    def init_rollout(self, observation):

        self.memory.reset()
        self.network['actor_critic'].detach_internal_state()

    def end_rollout(self, observation, is_game_over):

        if is_game_over:
            next_value = torch.Tensor([[0]])
            self.network['actor_critic'].reset_internal_state()
        else:
            next_value = self.network['actor_critic'](observation)[0].detach()
        self.memory.append({'value': next_value})

    def get_reward(self, observation, extrinsic_reward):

        return np.clip(extrinsic_reward, -1, 1)

    def compute_loss(self):

        loss = self.network['actor_critic'].loss(self.memory)
        return loss

    def backpropagate(self, loss, max_gradient_norm=40):

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.trainable_parameters,
                                       max_gradient_norm)
        self.optimizer.step()
Example #21
def train(input_placeholder, output_data, sess):
    # build cost function
    action = tf.placeholder("float", [None, ACTIONS_CHOICE_NUMBER])
    y = tf.placeholder("float", [None])
    y_action = tf.reduce_sum(tf.multiply(output_data, action),
                             reduction_indices=1)
    cost = tf.reduce_mean(tf.square(y - y_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # start game
    game_state = game.GameState()

    global_timestamp = 0
    epsilon = EPSILON

    memory = Memory(MEMORY_SIZE, FRAME_NUM_PER_STACK)

    # start network
    sess.run(tf.global_variables_initializer())
    # network checkpoint saver and restore loader
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_FOLDER)
    if checkpoint and checkpoint.model_checkpoint_path and os.path.exists(
            os.path.join(CHECKPOINT_FOLDER, "global_state.pkl")):
        data = load_history(os.path.join(CHECKPOINT_FOLDER,
                                         "global_state.pkl"))
        global_timestamp = data['global_timestamp']
        epsilon = data['epsilon']
        memory = data['memory']
        game_state = data['game_state']
        saver.restore(sess, checkpoint.model_checkpoint_path)
        logging.info("restored from checkpoint",
                     extra={
                         'stage': get_stage_name(global_timestamp),
                         'timestamp': global_timestamp,
                         'epsilon': epsilon,
                         'reward': "",
                         'action': ""
                     })
    else:
        image_data, _, _ = game_state.frame_step(actions.NOTHING)
        memory.initial_stack(image_data)

    prev_state = memory.get_current_stack()

    while True:
        actions_scores = output_data.eval(
            feed_dict={input_placeholder: [prev_state]})[0]

        action_name, action_choice = actions.get_next_action(
            epsilon, actions_scores)

        image_data, reward, game_terminate = game_state.frame_step(
            action_choice)
        memory.stack_frame(image_data)

        new_state = memory.get_current_stack()
        memory.remember(prev_state, action_choice, reward, new_state,
                        game_terminate)

        # anneal
        if global_timestamp > OBSERVE_DURATION and epsilon > MIN_EPSILON:
            logging.info("start anneal",
                         extra={
                             'stage': get_stage_name(global_timestamp),
                             'timestamp': global_timestamp,
                             'epsilon': epsilon,
                             'reward': reward,
                             'action': action_name
                         })
            epsilon -= float(EPSILON - MIN_EPSILON) / ANNEAL_DURATION

        # explore + train
        if global_timestamp > OBSERVE_DURATION:
            prev_state_batch, action_batch, reward_batch, new_state_batch, game_terminate_batch = memory.get_sample_batches(
                BATCH_SIZE)

            y_batch = []
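            # Bellman targets: r for terminal transitions, r + GAMMA * max_a' Q(s', a') otherwise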
            evaluate = output_data.eval(
                feed_dict={input_placeholder: new_state_batch})
            for i, game_terminate in enumerate(game_terminate_batch):
                # train target to reward
                if game_terminate:
                    y_batch.append(reward_batch[i])
                else:
                    y_batch.append(reward_batch[i] +
                                   GAMMA * np.max(evaluate[i]))

            # gradient step: fit Q(prev_state, action) to the Bellman target
            train_step.run(
                feed_dict={
                    y: y_batch,
                    action: action_batch,
                    input_placeholder: prev_state_batch
                })

        if global_timestamp % CHECKPOINT_GAP == 0:
            saver.save(sess,
                       os.path.join(CHECKPOINT_FOLDER, 'flappy-bird'),
                       global_step=global_timestamp)
            save_history(
                os.path.join(CHECKPOINT_FOLDER, 'global_state.pkl'), {
                    'global_timestamp': global_timestamp,
                    'epsilon': epsilon,
                    'memory': memory,
                    'game_state': game_state
                })
            logging.info("checkpoint saved",
                         extra={
                             'stage': get_stage_name(global_timestamp),
                             'timestamp': global_timestamp,
                             'epsilon': epsilon,
                             'reward': reward,
                             'action': ""
                         })

        # update state
        prev_state = new_state
        logging.info("finish epoch",
                     extra={
                         'stage': get_stage_name(global_timestamp),
                         'timestamp': global_timestamp,
                         'epsilon': epsilon,
                         'reward': reward,
                         'action': action_name
                     })
        global_timestamp += 1
Example #22
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym')
    parser.add_argument("--environment",
                        type=str,
                        default='BreakoutDeterministic-v4')
    parser.add_argument("--action-size", type=int, default=4)
    parser.add_argument("--input-shape", type=str, default='None,84,84,4')
    parser.add_argument("--state-len-max", type=int, default=4)
    parser.add_argument("--target-update-freq", type=int, default=10000)

    parser.add_argument("--ep-greedy-speed", type=str, default='slow')
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay-slow", type=int, default=1000000)

    parser.add_argument("--epsilon-decay-fast", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.95)
    parser.add_argument("--replay-start-size", type=int, default=50000)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--epochs", type=int, default=30000)

    parser.add_argument("--pixel-feature", type=int, default=1)
    parser.add_argument("--padding", type=int, default=0)

    parser.add_argument("--model", type=str, default='nature')

    args = parser.parse_args()

    args.input_shape = str2list(args.input_shape)
    assert args.model in ['nature', 'gated']
    assert args.ep_greedy_speed in ['fast', 'slow']
    assert args.env_interface in [
        'gym', 'ale', 'custom_cart', 'custom_cartpole', 'ple'
    ]
    if args.env_interface in ['gym', 'ale']:
        env = env_interface(args.env_interface, args.environment)
    elif args.env_interface in ['custom_cart', 'custom_cartpole', 'ple']:
        env = env_interface(args.env_interface, args.environment,
                            bool(args.pixel_feature), bool(args.padding))
        args.input_shape = [None] + list(env.obs_space_shape) + [1]
    args.input_shape[-1] = args.state_len_max
    args.action_size = env.action_size
    assert args.state_len_max == args.input_shape[-1]
    print args

    #State buffers and step counter
    state_old = []
    state = []
    steps = 0

    #Other parameters
    if args.ep_greedy_speed == 'slow':
        epsilon = args.epsilon_max
        epsilon_rate = 0.
        if args.epsilon_decay_slow != 0:
            epsilon_rate = ((args.epsilon_max - args.epsilon_min) /
                            float(args.epsilon_decay_slow))
    elif args.ep_greedy_speed == 'fast':
        epsilon = args.epsilon_max

    #Initialize replay memory
    memory = Memory(args.replay_mem_size, args.input_shape[1:])

    #Initialize neural net
    qnet, tnet, update_ops = init_network(args.input_shape, args.action_size,
                                          args.model)

    #import time
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(update_ops)
        for epoch in range(args.epochs):
            frame = env.reset()
            total_rewards = 0.
            total_losses = 0.
            state_old = []
            state = [frame] * args.state_len_max
            done = False

            #start = time.time()
            while done == False:
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    image_in = np.stack(state, axis=-1)[np.newaxis, ...]
                    action = qnet.get_action(sess, image_in)

                frame, reward, done, _ = env.step(action)
                total_rewards += reward
                state_old = state[:]
                state.append(frame)
                if len(state) > args.state_len_max:
                    state = state[1:]

                #Add to memory
                memory.add([
                    np.stack(state_old, axis=-1)[np.newaxis, ...], action,
                    min(1., max(-1., reward)),
                    np.stack(state, axis=-1)[np.newaxis, ...], done
                ])

                #Reduce epsilon
                if args.ep_greedy_speed == 'slow':
                    epsilon = max(args.epsilon_min, epsilon - epsilon_rate)
                elif args.ep_greedy_speed == 'fast':
                    epsilon = args.epsilon_min + (
                        args.epsilon_max - args.epsilon_min) * np.exp(
                            -args.epsilon_decay_fast * float(steps))

                if steps > args.replay_start_size:
                    #Training step
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.concatenate(batch[:, 0], axis=0)
                    actions = batch[:, 1]
                    rewards = batch[:, 2]
                    states1 = np.concatenate(batch[:, 3], axis=0)
                    dones = batch[:, 4]

                    l = qnet.train(sess, states, actions, rewards, states1,
                                   dones, args.learning_rate, tnet)
                    total_losses += l

                #Increase the frame steps counter
                steps += 1
                #Check if target network is to be updated
                if steps % args.target_update_freq == 0:
                    print "Updating target..."
                    sess.run(update_ops)

                if done == True:
                    print "epoch:", epoch, "total rewards", total_rewards, "total losses", total_losses, qnet.string
                    #print 'time:', time.time() - start
                    break
    env.close()
Example #23
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=300)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = gym.make(args.environment)
    args.action_size = env.action_space.n
    args.input_shape = [None] + list(env.observation_space.shape)

    print args

    # Epsilon parameter
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    # Initialize the agent
    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                #env.render()
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([state, action, reward, next_state, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (
                    args.epsilon_max - args.epsilon_min) * np.exp(
                        -args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, tnet)

                # s <- s'
                state = np.copy(next_state)

                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break
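Most of the snippets in this collection only rely on Memory exposing add(experience) and sample(batch_size) with uniform sampling; examples that call remember, get_sample_batches, initial_stack, or stack_frame assume a richer class. A minimal sketch under the simpler assumption (not the implementation used by any particular example) is:

import random
from collections import deque


class Memory:
    """Minimal uniform-sampling replay buffer (illustrative sketch only)."""

    def __init__(self, max_size):
        # bounded FIFO storage; the oldest experiences are evicted first
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        # experience is whatever the caller passes, e.g. [state, action, reward, next_state, done]
        self.buffer.append(experience)

    def sample(self, batch_size):
        # uniform sampling without replacement, capped at the current buffer size
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))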
Example #25
0
print(device)

actor = Actor(obs_space=config.obs_space,
              action_space=config.action_space,
              hidden_size=config.hidden_size).to(device)
critic = Critic(obs_space=config.obs_space,
                hidden_size=config.hidden_size).to(device)
# actor.load_state_dict(torch.load('actor_model.h5'))
# critic.load_state_dict(torch.load('critic_model.h5'))

wandb.watch(actor)
wandb.watch(critic)

optimizer_actor = Adam(actor.parameters(), lr=config.actor_lr)
optimizer_critic = Adam(critic.parameters(), lr=config.critic_lr)
memory = Memory(env.agent_ids)


def compute_GAE(rewards, state_values, done, gamma, lamb):
    """
        Computes Generalized Advantage Estimations.
    """
    returns = [rewards[-1] + state_values[-1]]
    running_sum = rewards[-1] - state_values[-1]
    for i in reversed(range(len(rewards) - 1)):
        mask = 0 if done[i + 1] else 1
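        # TD residual: delta = r_t + gamma * V(s_{t+1}) * mask - V(s_t)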
        delta = rewards[i] + gamma * state_values[i +
                                                  1] * mask - state_values[i]
        running_sum = delta + gamma * lamb * running_sum * mask
        returns.insert(0, running_sum + state_values[i])
    # NOTE: the original snippet is truncated here; returning the accumulated returns is the presumed intent
    return returns
class Agent:
    def __init__(self, level_name):  
        self.level_name = level_name  
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        # one hot encoded version of our actions
        self.possible_actions = np.array(np.identity(self.env.action_space.n, dtype=int).tolist())

        # reset the graph
        tf.reset_default_graph()
        
        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)
        
        # instantiate memory
        self.memory = Memory(max_size=memory_size)
        
        # initialize deque with zero images
        self.stacked_frames = deque([np.zeros((100, 128), dtype=np.int) for i in range(stack_size)], maxlen=4)

        for i in range(pretrain_length):    
            # If it's the first step
            if i == 0:
                state = self.env.reset()        
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

            # Get the next state, reward, and done flag by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)

                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # our new state is now the next_state
                state = next_state
       
        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)
        
        self.write_op = tf.summary.merge_all()
    
    def predict_action(self, sess, explore_start, explore_stop, decay_rate, decay_step, state, actions):
        # first we randomize a number
        exp_exp_tradeoff = np.random.rand()

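        # epsilon decays exponentially from explore_start towards explore_stop as decay_step grows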
        explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if explore_probability > exp_exp_tradeoff:
            # make a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Qs values state
            Qs = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: state.reshape((1, *state.shape))})

            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]

        return action, choice, explore_probability
    
    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """
            Displays a list of frames as a gif, with controls
            """
            #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network 
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode = 'rgb_array'))

                    total_rewards += reward

                    if done:
                        print ("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break


                    next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                    state = next_state

            self.env.close()

        display_frames_as_gif(frames)
        
    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            #self.env = wrap_env(self.env)

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network 
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()

                    total_rewards += reward

                    if done:
                        print ("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
    
    def train(self):        
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())

            # initialize decay rate (that will be used to reduce epsilon)
            decay_step = 0

            for episode in range(total_episodes):
                # set step to 0
                step = 0

                # initialize rewards of episode
                episode_rewards = []

                # make a new episode and observe the first state
                state = self.env.reset()

                # remember that stack frame function
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    #print("step:", step)

                    # increase decay_step
                    decay_step += 1

                    # predict an action
                    action, choice, explore_probability = self.predict_action(sess,
                                                         explore_start, 
                                                         explore_stop, 
                                                         decay_rate, 
                                                         decay_step, 
                                                         state, 
                                                         self.possible_actions)

                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)

                    if episode_render:
                        self.env.render()

                    # add the reward to total reward
                    episode_rewards.append(reward)

                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends so no next state
                        next_state = np.zeros((110, 84), dtype=np.int)

                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

                        # set step = max_steps to end episode
                        step = max_steps

                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)

                        print("Episode:", episode, 
                              "Total reward:", total_reward, 
                              "Explore P:", explore_probability, 
                              "Training Loss:", loss)

                        #rewards_list.append((episode, total_reward))

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))

                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    target_Qs_batch = []

                    # get Q values for next_state
                    Qs_next_state = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # set Q_target = r if the episode ends at s_{i+1}
                    for i in range(len(batch)):
                        terminal = dones_mb[i]

                        # if this is a terminal transition, the target is just the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer],
                                      feed_dict={self.DQNetwork.inputs_: states_mb, 
                                                 self.DQNetwork.target_Q: targets_mb, 
                                                 self.DQNetwork.actions_: actions_mb})

                    # write tf summaries
                    summary = sess.run(self.write_op, feed_dict={self.DQNetwork.inputs_: states_mb, 
                                                 self.DQNetwork.target_Q: targets_mb, 
                                                 self.DQNetwork.actions_: actions_mb})
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess, "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
Example #27
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    #parser.add_argument("--epochs", type=int, default=30000)
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument('--tau',
                        type=float,
                        help='soft target update parameter',
                        default=0.01)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=.9)

    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    parser.add_argument("--model", type=str, default='gan')

    parser.add_argument("--mode", type=str, default='none')
    args = parser.parse_args()

    assert args.mode in ['none', 'test', 'transfer']
    assert args.model in [
        'mlp', 'gan', 'gated', 'dmlac_mlp', 'dmlac_gan', 'dmlac_gated',
        'ddpg_unrolled_pg_mlp', 'dmlac_gp', 'dmlac_truth', 'mpc'
    ]
    if args.model == 'dmlac_truth':
        assert args.environment == 'Pendulum-v0'
    # Initialize environment
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.action_dim = env.action_space.shape[0]
    #assert args.action_dim == 1
    args.action_bound_high = env.action_space.high
    args.action_bound_low = env.action_space.low

    assert len(args.action_bound_high) == len(args.action_bound_low)
    for i in range(len(args.action_bound_high)):
        assert args.action_bound_high[i] == -args.action_bound_low[i]
    print(args)

    jointddpg, update_target_actor, update_target_critic, copy_target_actor, copy_target_critic = init_model(
        [None, args.state_dim], args.action_dim, args.latent_size,
        args.learning_rate, args.action_bound_low, args.action_bound_high,
        args.tau, args.model)

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Actor noise
    exploration_strategy = OUStrategy(jointddpg, env)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        #sess.run(copy_target_critic)
        #sess.run(copy_target_actor)

        if args.mode in ['test', 'transfer']:
            env.seed(1)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            env.render()
            # Choose an action
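            # the exploration parameter decays polynomially with time: ((T - t) / T)**4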
            exploration = (float(args.time_steps - time_steps) /
                           float(args.time_steps))**4
            action = exploration_strategy.action(sess, state[np.newaxis, ...],
                                                 exploration)
            # Execute action
            state1, reward, done, _ = env.step(action)

            total_rewards += float(reward)
            # Store tuple in replay memory
            memory.add([
                state[np.newaxis, ...], action[np.newaxis, ...], reward,
                state1[np.newaxis, ...], done
            ])

            # Training step
            batch_B = np.array(memory.sample(args.batch_size))
            assert len(batch_B) > 0
            states_B = np.concatenate(batch_B[:, 0], axis=0)
            actions_B = np.concatenate(batch_B[:, 1], axis=0)
            rewards_B = batch_B[:, 2]
            states1_B = np.concatenate(batch_B[:, 3], axis=0)
            dones_B = batch_B[:, 4]

            #Get another batch
            batch_M = np.array(memory.sample(args.batch_size))
            assert len(batch_M) > 0
            states_M = np.vstack(batch_M[:, 0])
            actions_M = np.concatenate(batch_M[:, 1], axis=0)

            if args.model == 'dmlac_gp':
                jointddpg.update_hist(memory)

            jointddpg.train(sess, states_B, actions_B, rewards_B,
                            states1_B, dones_B, states_M, actions_M,
                            len(batch_M), args.latent_size)

            # Update target networks
            #jointddpg.update(self, sess, update_target_critic, update_target_actor)
            #sess.run(update_target_critic)
            #sess.run(update_target_actor)

            state = np.copy(state1)
            if done == True:
                print 'time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards
                epoch += 1
                total_rewards = 0.
                if args.mode == 'transfer':
                    if time_steps >= args.time_steps / 3:
                        env.seed(0)
                    else:
                        env.seed(1)
                elif args.mode == 'test':
                    env.seed(1)
                state = env.reset()
            if args.mode == 'transfer':
                if time_steps == args.time_steps / 3:
                    memory = Memory(args.replay_mem_size)
class DDPGagent:
    def __init__(self, hidden_size, env):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        self.Actor = Actor(input_size=self.num_states,
                           hidden_size=hidden_size,
                           output_size=self.num_actions).cuda()

        self.Actor_target = Actor(input_size=self.num_states,
                                  hidden_size=hidden_size,
                                  output_size=self.num_actions).cuda()

        self.Critic = Critic(input_size=self.num_states,
                             hidden_size=hidden_size,
                             output_size=self.num_actions).cuda()

        self.Critic_target = Critic(input_size=self.num_states,
                                    hidden_size=hidden_size,
                                    output_size=self.num_actions).cuda()

        for target_param, param in zip(self.Actor_target.parameters(),
                                       self.Actor.parameters()):
            target_param.data = param.data

        for target_param, param in zip(self.Critic_target.parameters(),
                                       self.Critic.parameters()):
            target_param.data = param.data

        self.Memory = Memory(30000)
        self.criterion = nn.MSELoss().cuda()
        self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(),
                                                lr=1e-2)
        self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(),
                                                 lr=1e-1)

    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).cuda()
        action = self.Actor.forward(state)
        action = action.detach().cpu().numpy()
        return action

    def update(self, batch_size):
        states, actions, rewards, next_states, _ = self.Memory.sample(
            batch_size)
        states = torch.tensor(states).cuda()
        actions = torch.tensor(actions).cuda()
        rewards = torch.tensor(rewards).cuda()
        next_states = torch.tensor(next_states).cuda()

        Q_Value = self.Critic.forward(states, action=actions)
        next_actions = self.Actor_target(next_states)
        next_Q = self.Critic_target.forward(next_states, next_actions.detach())
        Q_prime = rewards + 0.99 * next_Q
        critic_loss = self.criterion(Q_Value, Q_prime)
        policy_loss = -self.Critic.forward(states,
                                           self.Actor.forward(states)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

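        # Polyak soft updates of the target networks with tau = 1e-2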
        for target_param, param in zip(self.Actor_target.parameters(),
                                       self.Actor.parameters()):
            target_param.data = (param.data * 1e-2 + target_param.data *
                                 (1.0 - 1e-2))

        for target_param, param in zip(self.Critic_target.parameters(),
                                       self.Critic.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data *
                                    (1.0 - 1e-2))
Example #29
0
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default='Pendulum-v0')
    parser.add_argument("--unroll-steps", type=int, default=25)
    parser.add_argument("--no-samples", type=int, default=20)
    parser.add_argument("--no-basis", type=int, default=256)
    parser.add_argument("--discount-factor", type=float, default=.9)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--train-policy-batch-size", type=int, default=32)
    parser.add_argument("--train-policy-iterations", type=int, default=30)
    parser.add_argument("--replay-start-size-epochs", type=int, default=2)
    parser.add_argument("--train-hyperparameters-iterations",
                        type=int,
                        default=50000)
    parser.add_argument("--goal-position", type=float, default=.45)
    args = parser.parse_args()

    print args

    #env = gym.make(args.env, goal_position=args.goal_position)
    env = gym.make(args.env)
    env.seed(seed=args.goal_position)

    # Gather data to train hyperparameters
    data = []
    rewards = []
    dones = []
    for _ in range(2):
        state = env.reset()
        while True:
            action = np.random.uniform(env.action_space.low,
                                       env.action_space.high, 1)
            next_state, reward, done, _ = env.step(action)
            data.append([state, action, next_state])
            rewards.append(reward)
            dones.append(done)
            state = np.copy(next_state)
            if done:
                break

    states, actions, next_states = [np.stack(d, axis=0) for d in zip(*data)]

    permutation = np.random.permutation(len(data))
    states_actions = np.concatenate([states, actions], axis=-1)[permutation]
    next_states = next_states[permutation]

    # Train the hyperparameters
    hs = [
        hyperparameter_search(dim=env.observation_space.shape[0] +
                              env.action_space.shape[0])
        for _ in range(env.observation_space.shape[0])
    ]
    hyperparameters = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch_size = 32
        iterations = args.train_hyperparameters_iterations
        #idxs = [np.random.randint(len(states_actions), size=batch_size) for _ in range(iterations)]
        for i in range(len(hs)):
            hs[i].train_hyperparameters(sess, states_actions, next_states[:,
                                                                          i],
                                        iterations, batch_size)
            hyperparameters.append(
                sess.run([hs[i].length_scale, hs[i].signal_sd,
                          hs[i].noise_sd]))

    blr = blr_model(x_dim=env.observation_space.shape[0] +
                    env.action_space.shape[0],
                    y_dim=env.observation_space.shape[0],
                    state_dim=env.observation_space.shape[0],
                    action_dim=env.action_space.shape[0],
                    observation_space_low=env.observation_space.low,
                    observation_space_high=env.observation_space.high,
                    action_bound_low=env.action_space.low,
                    action_bound_high=env.action_space.high,
                    unroll_steps=args.unroll_steps,
                    no_samples=args.no_samples,
                    no_basis=args.no_basis,
                    discount_factor=args.discount_factor,
                    train_policy_batch_size=args.train_policy_batch_size,
                    train_policy_iterations=args.train_policy_iterations,
                    hyperparameters=hyperparameters,
                    debugging_plot=False)

    # Initialize the memory
    memory = Memory(args.replay_mem_size)
    assert len(data) == len(rewards)
    assert len(data) == len(dones)
    for dat, reward, done in zip(data, rewards, dones):
        memory.add([dat[0], dat[1], reward, dat[2], done])
    memory2 = []

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        weights = pickle.load(
            open(
                '../custom_environments/weights/mountain_car_continuous_reward'
                + str(args.goal_position) + '.p', 'rb'))
        sess.run(blr.assign_ops,
                 feed_dict=dict(zip(blr.placeholders_reward, weights)))
        # Update the model with data used from training hyperparameters
        blr.update(sess, states_actions, next_states)
        blr.train(sess, memory)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(30000):
            action = blr.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Append to the batch
            memory.add([state, action, reward, next_state, done])
            memory2.append([state, action, reward, next_state, done])

            # s <- s'
            state = np.copy(next_state)

            if done == True:
                print 'time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards

                # Update the memory
                blr.update(sess, memory=memory2)

                # Train the policy
                blr.train(sess, memory)

                epoch += 1
                total_rewards = 0.
                state = env.reset()
                memory2 = []
Example #30
0
def main():
    #Arguments for the q-learner
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym')
    parser.add_argument("--environment",
                        type=str,
                        default='BreakoutDeterministic-v4')
    parser.add_argument("--action-size", type=int, default=4)
    parser.add_argument("--input-shape", type=str, default='None,84,84,4')
    parser.add_argument("--state-len-max", type=int, default=4)
    parser.add_argument("--target-update-freq", type=int, default=10000)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=int, default=1000000)
    parser.add_argument("--learning-rate", type=float, default=.95)
    parser.add_argument("--replay-start-size", type=int, default=50000)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--epochs", type=int, default=30000)

    #Arguments for the feature extractor
    parser.add_argument("--train-fe-shape", type=str, default='None,12,12,4')
    parser.add_argument("--stop-gradient", type=int, default=0)
    parser.add_argument("--train-fe-iterations", type=int, default=1000)
    parser.add_argument("--train-fe-batch-size", type=int, default=100)
    parser.add_argument("--train-fe-lamb", type=float, default=0.)
    parser.add_argument("--train-fe-numfactors", type=int, default=200)
    parser.add_argument("--train-fe-nummap", type=int, default=100)
    parser.add_argument("--train-fe-learning-rate", type=float, default=.001)
    parser.add_argument("--train-fe-w", type=int, default=12)
    parser.add_argument("--train-fe-s", type=int, default=1)

    parser.add_argument("--use-conv-after-fe", type=int, default=0)

    parser.add_argument("--ep-greedy-speed", type=str, default='slow')
    #Arguments for the environment interface
    parser.add_argument("--pixel-features", type=int, default=1)
    parser.add_argument("--padding", type=int, default=0)
    args = parser.parse_args()

    #Parse arguments wrt other arguments
    args.input_shape = str2list(args.input_shape)
    args.train_fe_shape = str2list(args.train_fe_shape)
    assert args.env_interface in [
        'gym', 'ale', 'custom_cart', 'custom_cartpole'
    ]
    assert args.ep_greedy_speed in ['fast', 'slow']
    env = env_interface(args.env_interface,
                        args.environment,
                        pixel_feature=bool(args.pixel_features),
                        padding=bool(args.padding),
                        render=True)
    args.action_size = env.action_size
    if args.env_interface in ['custom_cart', 'custom_cartpole']:
        args.input_shape = [None] + list(
            env.obs_space_shape) + [args.state_len_max]
    args.train_fe_shape[-1] = args.state_len_max
    print args

    #State buffers and step counter
    state_old = []
    state = []
    steps = 0

    #Other parameters
    epsilon_lambda = .001
    epsilon = args.epsilon_max
    epsilon_rate = 0.
    if args.epsilon_decay != 0:
        epsilon_rate = ((args.epsilon_max - args.epsilon_min) /
                        float(args.epsilon_decay))

    #Initialize replay memory
    print args.input_shape
    memory = Memory(args.replay_mem_size, args.input_shape[1:])

    #Initialize neural net
    from gated_qlearning import gated_qlearning
    qnet = gated_qlearning(shape=args.train_fe_shape,\
                           nummap=args.train_fe_nummap,\
                           numfactors=args.train_fe_numfactors,\
                           learning_rate=args.train_fe_learning_rate,\
                           frame_shape=args.input_shape,\
                           a_size=args.action_size,\
                           stop_gradient=bool(args.stop_gradient),\
                           lamb=args.train_fe_lamb,\
                           w=args.train_fe_w,\
                           s=args.train_fe_s,\
                           use_conv_after_fe=bool(args.use_conv_after_fe))
    qnet_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    tnet = gated_qlearning(shape=args.train_fe_shape,\
                           nummap=args.train_fe_nummap,\
                           numfactors=args.train_fe_numfactors,\
                           learning_rate=args.train_fe_learning_rate,\
                           frame_shape=args.input_shape,\
                           a_size=args.action_size,\
                           stop_gradient=bool(args.stop_gradient),\
                           lamb=args.train_fe_lamb,\
                           w=args.train_fe_w,\
                           s=args.train_fe_s,\
                           use_conv_after_fe=bool(args.use_conv_after_fe))
    tnet_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES)[len(qnet_vars):]

    update_ops = update_target_graph_vars(qnet_vars, tnet_vars)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        sess.run(update_ops)
        for epoch in range(args.epochs):
            frame = env.reset()
            total_rewards = 0.
            total_losses = 0.
            state_old = []
            state = [frame] * args.state_len_max
            done = False

            while done == False:
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    image_in = np.stack(state, axis=-1)[np.newaxis, ...]
                    action = qnet.get_action(sess, image_in)

                frame, reward, done, _ = env.step(action)
                total_rewards += reward
                state_old = state[:]
                state.append(frame)
                if len(state) > args.state_len_max:
                    state = state[1:]

                #Add to memory
                memory.add([np.stack(state_old, axis=-1)[np.newaxis, ...],\
                           action,\
                           min(1., max(-1., reward)),\
                           np.stack(state, axis=-1)[np.newaxis, ...],\
                           done])

                #Reduce epsilon
                if args.ep_greedy_speed == 'slow':
                    epsilon = max(args.epsilon_min, epsilon - epsilon_rate)
                elif args.ep_greedy_speed == 'fast':
                    epsilon = args.epsilon_min + (
                        args.epsilon_max - args.epsilon_min) * np.exp(
                            -epsilon_lambda * float(steps))

                #Train the reconstruction loss
                if args.train_fe_iterations > 0:
                    args.train_fe_iterations -= qnet.train_feature_extractor(
                        sess, memory, args.train_fe_batch_size, 10)
                    print args.train_fe_iterations

                if steps > args.replay_start_size and args.train_fe_iterations <= 0:
                    #Training step
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.concatenate(batch[:, 0], axis=0)
                    actions = batch[:, 1]
                    rewards = batch[:, 2]
                    states1 = np.concatenate(batch[:, 3], axis=0)
                    dones = batch[:, 4]

                    Q1 = qnet.get_Q1(sess, states1, tnet)

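                    # note: args.learning_rate (default .95) acts as the discount factor in this Bellman target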
                    targetQ = rewards + (1. -
                                         dones) * args.learning_rate * np.amax(
                                             Q1, keepdims=False, axis=1)

                    l, _, _ = qnet.train(sess, states, actions,
                                         targetQ[..., np.newaxis])
                    total_losses += l

                #Increase the frame steps counter
                steps += 1
                #Check if target network is to be updated
                if steps % args.target_update_freq == 0:
                    print "Updating target..."
                    sess.run(update_ops)

                if done == True:
                    print "epoch", epoch, "total rewards", total_rewards, "total losses", total_losses
                    break

    env.close()