Example #1
    def __init__(self,
                 device,
                 state_size,
                 action_size,
                 buffer_size=10,
                 batch_size=10,
                 learning_rate=0.1,
                 discount_rate=0.99,
                 eps_decay=0.9,
                 tau=0.1,
                 steps_per_update=4):
        self.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.q_network_control = QNetwork(state_size, action_size).to(device)
        self.q_network_target = QNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.q_network_control.parameters(),
                                          lr=learning_rate)

        self.batch_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size,
                                          buffer_size)

        self.discount_rate = discount_rate

        self.eps = 1.0
        self.eps_decay = eps_decay

        self.tau = tau

        self.step_count = 0
        self.steps_per_update = steps_per_update
Example #2
    def __init__(self, env, state_space, action_space, device, learning_rate, buffer_size, \
                 batch_size, gamma, in_channels, train_freq = 4, target_update_freq=1e4, is_ddqn = False):
        self.env = env
        self.state_space = state_space
        self.action_space = action_space
        
        self.QNetwork_local = QNetwork(in_channels, self.state_space, self.action_space.n, device = device).to(device)
        self.QNetwork_local.init_weights()
        self.QNetwork_target = QNetwork(in_channels, self.state_space,self.action_space.n , device = device).to(device)
        self.QNetwork_target.load_state_dict(self.QNetwork_local.state_dict())
        
        self.optimizer = torch.optim.RMSprop(self.QNetwork_local.parameters(), lr=learning_rate, alpha=0.95, eps=0.01, centered=True)
        self.criterion = torch.nn.MSELoss()
        self.memory = ReplayBuffer(capacity=int(buffer_size), batch_size=batch_size)
        self.step_count = 0.
        self.batch_size = batch_size
        self.gamma = gamma
        
        self.device = device
        self.buffer_size = buffer_size
        self.num_train_updates = 0
        self.train_freq = train_freq
        self.target_update_freq = target_update_freq

        self.is_ddqn = is_ddqn
        
        print('Agent initialized with memory size:{}'.format(buffer_size))
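This constructor (shown with its full Agent class in Example #18) wires up a DQN trained with RMSprop. A hypothetical construction, assuming a Gym Atari environment and DQN-paper-style hyperparameters; none of these names or numbers come from the original repository:

import gym
import torch

env = gym.make('BreakoutNoFrameskip-v4')            # assumed environment
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = Agent(env,
              state_space=(84, 84),                 # assumed shape of the preprocessed frames
              action_space=env.action_space,
              device=device,
              learning_rate=2.5e-4,
              buffer_size=1e5,
              batch_size=32,
              gamma=0.99,
              in_channels=4,                        # stacked frames
              is_ddqn=True)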
Example #3
def main():
    env = gym.make('CartPole-v1')
    agent = QNetwork(env.observation_space, env.action_space)

    frame = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            env.render()

            action = agent.act(state)
            
            next_state, reward, done, info = env.step(action)

            if done:
                # the environment doesn't return a negative reward on
                # termination, so penalize it manually
                reward *= -1

            frame = frame + 1

            if done or frame == MAX_NUM_FRAMES:
                break

            state = next_state

    env.close()
Example #4
    def _init_modules(self):

        # Replay memory
        self.replay_memory = ReplayMemory(history_len=self.num_frames,
                                          capacity=self.capacity,
                                          batch_size=self.batch_size,
                                          input_scale=self.input_scale)

        input_shape = self.feedback_size + (self.num_frames, )
        # Q-network
        self.q_network = QNetwork(input_shape=input_shape,
                                  n_outputs=len(self.actions),
                                  network_type=self.config['network_type'],
                                  scope='q_network')
        # Target network
        self.target_network = QNetwork(
            input_shape=input_shape,
            n_outputs=len(self.actions),
            network_type=self.config['network_type'],
            scope='target_network')
        # Optimizer
        self.optimizer = Optimizer(config=self.config,
                                   feedback_size=self.feedback_size,
                                   q_network=self.q_network,
                                   target_network=self.target_network,
                                   replay_memory=self.replay_memory)
        # Ops for updating target network
        self.clone_op = self.target_network.get_clone_op(self.q_network)
        # For tensorboard
        self.t_score = tf.placeholder(dtype=tf.float32,
                                      shape=[],
                                      name='new_score')
        tf.summary.scalar("score", self.t_score, collections=['dqn'])
        self.summary_op = tf.summary.merge_all('dqn')
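The ops created above are plain TF1 graph nodes. A brief usage sketch, assuming an already constructed instance of the surrounding agent class plus an evaluation score and step counter (agent, episode_score and step are assumptions, not part of the snippet):

import tensorflow as tf  # TF1-style API

sess = tf.Session()
sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter('./log', sess.graph)

sess.run(agent.clone_op)                             # copy q_network weights into target_network
summary = sess.run(agent.summary_op,
                   feed_dict={agent.t_score: episode_score})
writer.add_summary(summary, global_step=step)        # write the score to tensorboard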
Example #5
class Agent():
    def __init__(self, state_size, action_size, fc_units, seed, lr,
                 buffer_size, batch_size, update_every):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.update_every = update_every

        self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                       fc_units).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        fc_units).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=100,
                                                   gamma=0.5)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, gamma, tau):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.update_every

        if (self.t_step == 0) and (len(self.memory) > self.memory.batch_size):
            experiences = self.memory.sample()
            self.learn(experiences, gamma, tau)

    def act(self, state, eps):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        if random.random() > eps:
            action = np.argmax(action_values.cpu().data.numpy())
        else:
            action = random.choice(np.arange(self.action_size))
        return action

    def learn(self, experiences, gamma, tau):
        states, actions, rewards, next_states, dones = experiences
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        Q_targets = rewards + gamma * Q_targets_next * (1 - dones)
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
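Because this Agent takes gamma, tau and eps per call rather than storing them, a minimal driver loop looks like the sketch below; the environment, episode count and hyperparameter values are placeholders, not taken from the original code:

import gym

env = gym.make('LunarLander-v2')                     # assumed environment
agent = Agent(state_size=8, action_size=4, fc_units=64, seed=0,
              lr=5e-4, buffer_size=int(1e5), batch_size=64, update_every=4)

eps = 1.0
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done, gamma=0.99, tau=1e-3)
        state = next_state
    eps = max(0.01, eps * 0.995)                     # assumed epsilon-decay schedule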
Example #6
    def build_networks(self):
        self.learning_network = QNetwork(
            name=self.hyperparameters["agent_name"] + "_learning_q_network",
            env=self.env,
            hyperparameters=self.hyperparameters,
            placeholders=self.placeholders,
            sess=self.sess)
        self.target_network = QNetwork(
            name=self.hyperparameters["agent_name"] + "_target_q_network",
            env=self.env,
            hyperparameters=self.hyperparameters,
            placeholders=self.placeholders,
            sess=self.sess)
Example #7
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
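This constructor reads the module-level constants LR, BUFFER_SIZE, BATCH_SIZE and device (and, per the comment, UPDATE_EVERY elsewhere in the class). They are not shown in the snippet; the values below are commonly used defaults and are assumptions, not taken from this code:

import torch

BUFFER_SIZE = int(1e5)   # replay buffer size (assumed)
BATCH_SIZE = 64          # minibatch size (assumed)
LR = 5e-4                # learning rate (assumed)
UPDATE_EVERY = 4         # steps between learning updates (assumed)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')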
Example #8
    def __init__(self, state_size, action_size, fc_units, seed, lr,
                 buffer_size, batch_size, update_every):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.update_every = update_every

        self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                       fc_units).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        fc_units).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=100,
                                                   gamma=0.5)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0
Example #9
    def __init__(self, env, hidden_units = None, network_LR=0.01, batch_size=1024, update_every=5, gamma=0.95):
        """
        Creates a DQN agent.

        :param env: game environment.
        :type env: Class Snake_Env().
        :param hidden_units: number of neurons in each hidden layer.
        :type hidden_units: tuple of length 3.
        :param network_LR: learning rate of the action-value neural network.
        :type network_LR: float.
        :param batch_size: size of the minibatch taken from the replay buffer.
        :type batch_size: int.
        :param update_every: number of iterations for updating the target qnetwork. 
        :type update_every: int
        :param gamma: discount factor.
        :type gamma: float.
        """
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma          
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)   
        self.ACTION_SIZE = env.ACTION_SPACE           
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every
       
        self.qnetwork_local = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)
        
        self.qnetwork_target = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)

        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE) 

        #Temp variable
        self.t = 0
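Based on the docstring and the default arguments above, a hypothetical instantiation could look like this; the Snake_Env construction, the surrounding class name Agent and the hidden-layer sizes are illustrative assumptions:

env = Snake_Env()                           # constructor arguments not shown in the snippet
agent = Agent(env,
              hidden_units=(128, 128, 64),  # assumed sizes for the three hidden layers
              network_LR=0.01,
              batch_size=1024,
              update_every=5,
              gamma=0.95)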
Example #10
    def __init__(self, state_size, action_size, seed, training, pixels, lr=LR):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0.
        self.pixels = pixels
        if pixels is False:
            from q_network import QNetwork
        else:
            from q_network_cnn import QNetwork
            print('loaded cnn network')
            self.loader = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
            ])
        self.QN_local = QNetwork(state_size, action_size, seed,
                                 training).to(device)
        self.QN_target = QNetwork(state_size, action_size, seed,
                                  training).to(device)
        self.optimizer = optim.Adam(self.QN_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   device)  # TODO
Example #11
def main(env_name, optim, hiddens, eps_decay, memory_length, epochs,
         batch_size):
    env = gym.make(env_name)
    if hiddens:
        hidden_sizes = list(map(int, hiddens.split(',')))
    else:
        hidden_sizes = []
    q_network = QNetwork(inps=env.observation_space.shape[0],
                         outs=env.action_space.n,
                         hidden_sizes=hidden_sizes,
                         str_optim=optim)
    policy = EpsilonGreedyPolicy(start_eps=1.0,
                                 eps_decay=eps_decay,
                                 action_values=q_network.get_action_values,
                                 n_actions=env.action_space.n)
    memory = Memory(max_length=memory_length)
    Transition = namedtuple(
        'Transition', ["state", "action", "reward", "state_next", "done"])

    for epoch in range(epochs):
        state = env.reset()
        epoch_reward = 0
        for step in range(env.spec.max_episode_steps):
            action = policy.act(state)
            next_state, reward, done, _ = env.step(action)
            memory.store(Transition(state, action, reward, next_state, done))
            try:
                q_network.fit_transitions(memory.random_sample(batch_size))
            except MemTooSmallError:  # memory not filled yet
                pass
            policy.update_epsilon()
            epoch_reward += reward
            state = next_state
            if done: break

        print('Episode {}/{} | Total reward: {} | Epsilon: {:.4f}'.format(
            epoch, epochs, epoch_reward, policy.epsilon))
    return
Example #12
    def __init__(self,
                 env,
                 hidden_units=None,
                 network_LR=0.001,
                 batch_size=64,
                 update_every=4,
                 gamma=1.0):
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)  #this is pythonic

        self.nA = env.ACTION_SPACE  #number of actions agent can perform
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every

        #let's give it some brains
        self.qnetwork_local = QNetwork(input_shape=self.env.STATE_SPACE,
                                       hidden_units=self.HIDDEN_UNITS,
                                       output_size=self.nA,
                                       learning_rate=self.NETWORK_LR)
        print(self.qnetwork_local.model.summary())

        # I think of the target network as the PC,
        # where our agent stores all the concrete and important stuff
        self.qnetwork_target = QNetwork(input_shape=self.env.STATE_SPACE,
                                        hidden_units=self.HIDDEN_UNITS,
                                        output_size=self.nA,
                                        learning_rate=self.NETWORK_LR)

        #and the memory of course
        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE)

        #handy temp variable
        self.t = 0
Example #13
def main():
    env = gym.make('CartPole-v1')
    replay_memory = ReplayMemory()
    agent = QNetwork(env.observation_space, env.action_space)
    get_training_batch = make_training_transformer(env.observation_space.shape,
                                                   agent)

    frame = 0
    acc_loss = 0
    acc_state_value = 0
    while frame < MAX_NUM_FRAMES:
        state = env.reset()
        for t in range(MAX_EPISODE_DURATION):
            if take_random_action(frame):
                action = env.action_space.sample()  # pick random action
            else:
                action = agent.act(state)

            next_state, reward, done, info = env.step(action)

            if done:
                # the environment doesn't return a negative reward on
                # termination, so penalize it manually
                reward *= -1

            experience = (state, action, reward, next_state, done)
            replay_memory.append(experience)
            frame += 1

            experience_samples = replay_memory.sample(BATCH_SIZE)
            state_batch, qs_batch = get_training_batch(experience_samples)
            acc_state_value += np.mean(qs_batch)

            loss = agent.training_step(state_batch, qs_batch)
            acc_loss += loss

            if frame % FRAMES_PER_SAVE == 0:
                model_filename = f"ckpt-loss={loss:.4f}"
                agent.save_model(model_filename)

            if frame % FRAMES_PER_PRINT == 0:
                print(f"Frame: {frame}")
                avg_loss = acc_loss / FRAMES_PER_PRINT
                avg_state_value = acc_state_value / FRAMES_PER_PRINT
                print(
                    f"avg loss: {avg_loss:.4f}; avg value: {avg_state_value:.2f}"
                )
                acc_loss = 0
                acc_state_value = 0

            if done or frame == MAX_NUM_FRAMES:
                break

            state = next_state

    env.close()
Example #14
    def __init__(self, params):
        action_size = params['action_size']
        state_size = params['state_size']
        buf_params = params['buf_params']
        nn_params = params['nn_params']
        nn_params['l1'][0] = state_size
        nn_params['l5'][1] = action_size

        self.__learning_mode = params['learning_mode']

        if self.__learning_mode['DuelingDDQN']:
            self.__qnetwork_local = DuelingQNetwork(nn_params).to(device)
            self.__qnetwork_target = DuelingQNetwork(nn_params).to(device)
        else:
            self.__qnetwork_local = QNetwork(nn_params).to(device)
            self.__qnetwork_target = QNetwork(nn_params).to(device)

        self.__action_size = action_size
        self.__state_size = state_size
        self.__memory = ReplayBuffer(buf_params)
        self.__t = 0

        self.eps = params['eps_initial']
        self.gamma = params['gamma']
        self.learning_rate = params['learning_rate']
        self.update_period = params['update_period']
        self.a = params['a']
        self.b = params['b']
        self.e = params['e']
        self.tau = params['tau']

        self.__optimiser = optim.Adam(self.__qnetwork_local.parameters(),
                                      self.learning_rate)

        # other parameters
        self.agent_loss = 0.0
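The constructor above pulls everything from a single params dictionary, and the keys it reads are fixed. A hypothetical configuration shaped around those keys follows; all values are illustrative, the surrounding class is assumed to be named Agent, and the reading of a, b and e as prioritized-replay constants is an assumption:

# buf_params and nn_params are consumed by ReplayBuffer and QNetwork, whose expected
# structure is not shown here; only 'l1' and 'l5' are known to be layer-shape entries
# because the constructor overwrites their sizes.
nn_params = {'l1': [None, 64], 'l5': [64, None]}   # hypothetical minimal shape table
buf_params = {}                                    # hypothetical; depends on ReplayBuffer

params = {
    'action_size': 4,
    'state_size': 37,
    'buf_params': buf_params,
    'nn_params': nn_params,
    'learning_mode': {'DuelingDDQN': True},
    'eps_initial': 1.0,
    'gamma': 0.99,
    'learning_rate': 5e-4,
    'update_period': 4,
    'a': 0.6,      # assumed: prioritised-replay exponent
    'b': 0.4,      # assumed: importance-sampling exponent
    'e': 1e-3,     # assumed: small priority offset
    'tau': 1e-3,
}
agent = Agent(params)   # assumes the surrounding class is named Agent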
Example #15
def main():
    """
    Main function for training the DQN network in learning how to accomplish autonomous landing.
    """
    # ATTENTION: If you want to restore files from a previous simulation you
    # must pass valid values for these variables:
    policy_weights_path = '/home/pulver/Desktop/episode_113250/policy/policy_checkpoint.ckp'
    summary_folder = ""  # if empty the summary is written in ./log/ + current time

    screen_width = 84  # original is 160
    screen_height = 84  # original is 210
    images_stack_size = 4
    # Use only the first 6 actions for this simulation
    # action_list = ['left', 'right', 'forward', 'backward', 'stop', 'descend']
    # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (descend)
    tot_actions = 6
    batch_size = 32  # size of the experience batch
    tot_steps = 1000  # finite-horizon simulation

    steps_per_episodes = 40  # expressed in number step
    noop_time = 2.0  # pause in seconds between actions

    timer_total_start = time.time()
    rospy.init_node("DRLTestNode")
    rospy.loginfo("----- DRL Test Node -----")

    # Create a subscriber for the greyscale image
    rospy.Subscriber("/drl/grey_camera", ROSImage, image_callback)  # TODO
    # restore default
    # rospy.Subscriber("/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback, queue_size=30)  # Store the last 30 messages
    # before discarding them

    images_stack_size = 4

    r = rospy.Rate(10)  # 10hz

    # Init session and networks
    sess = tf.Session()
    if (summary_folder == ""):
        tf_summary_writer = tf.summary.FileWriter(
            './log/' + str(datetime.datetime.now().time()), sess.graph)
    else:
        tf_summary_writer = tf.summary.FileWriter(summary_folder, sess.graph)
    policy_network = QNetwork(sess,
                              tot_actions=tot_actions,
                              image_shape=(screen_width, screen_height,
                                           images_stack_size),
                              batch_size=batch_size,
                              network_name="policy_net")

    init = tf.global_variables_initializer()
    sess.run(init)

    if (policy_weights_path != ""):
        print("Loading weights from memory: " + str(policy_weights_path))
        policy_network.load_weights(policy_weights_path)
    else:
        print("The networks path are empty. Choose appropriate weights!")
        sys.exit()

    ##########################################################################
    ##                                START HERE                              ##
    ##########################################################################
    timer_start = time.time()
    actual_time = rospy.get_rostime()
    rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0
    frame_episode = 0
    cumulated_reward = 0
    send_action("takeoff")
    # 1-Accumulate the first state
    send_action("stop")
    rospy.sleep(1.0)
    # acquire image from bottomcamera
    # get_image()
    state = _last_image
    # create a stack of X images
    image_t = np.stack([state] * images_stack_size, axis=2)

    for step in range(tot_steps):
        # 2- Get the action following epsilon-greedy or through policy network.
        # Here image_t is always ready and it can be given as input to
        # the network
        action_distribution = policy_network.return_action_distribution(
            input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)),
            softmax=False)
        # print ""
        # print action_distribution
        action = np.argmax(action_distribution)
        action = convert_action_int_to_str(action)
        # print action
        send_action(action)
        rospy.sleep(noop_time)
        # 3- Get the state_t1 repeating the action obtained at step-2 and add everything
        # in the replay buffer. Then set state_t = state_t1
        # get_image()
        image_t1 = _last_image
        send_action("stop")
        # If the action taken is to land and the UAV is inside the landing
        # BB, done will be calculated accordingly

        # NOTE: in the real implementation there's no way to get done and reward
        # done_reward = get_done_reward()
        # reward = done_reward.reward
        # done = done_reward.done

        image_t1 = np.expand_dims(image_t1, 2)
        # stack the images
        image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2)
        frame_episode += 1
        image_t = image_t1

        # cumulated_reward += reward

        # At every step check if more than 30 seconds from the start passed.
        # In that case, set done = True and end the episode
        timer_stop = time.time()
        # Stop the episode if the number of frames exceeds the threshold.
        # done is not provided by the environment in this script, so set it here.
        done = frame_episode >= steps_per_episodes
        # When the episode is done
        if done:
            timer_stop = time.time()
            actual_time = rospy.get_rostime()
            rospy_stop_time = actual_time.secs + actual_time.nsecs / 1000000000.0
            rospy_time_elapsed = rospy_stop_time - rospy_start_time
            print("Time episode: " + str(timer_stop - timer_start) +
                  " seconds")
            print("Ros time episode: " + str(rospy_time_elapsed) + " seconds")
            # if cumulated_reward >= 0:
            #     rospy.logwarn("Positive reward obtained!")
            # print("Cumulated reward: " + str(cumulated_reward))
            print("Episode finished after {} timesteps".format(step + 1))
            sys.stdout.flush()
            break
Example #16
class Agent():
    def __init__(self, state_size, action_size, seed, training, pixels, lr=LR):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0.
        self.pixels = pixels
        if pixels is False:
            from q_network import QNetwork
        else:
            from q_network_cnn import QNetwork
            print('loaded cnn network')
            self.loader = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
            ])
        self.QN_local = QNetwork(state_size, action_size, seed,
                                 training).to(device)
        self.QN_target = QNetwork(state_size, action_size, seed,
                                  training).to(device)
        self.optimizer = optim.Adam(self.QN_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                   device)  # TODO

    def act(self, state, eps):
        #         if self.pixels is True:

        #state = Variable(torch.from_numpy(state).float().to(device).view(state.shape[0],3,32,32))
        if not self.pixels:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.QN_local.eval()

        with torch.no_grad():
            action_values = self.QN_local(state)
        self.QN_local.train()
        if random.random() > eps:
            return int(np.argmax(action_values.cpu().data.numpy()))
        else:
            return int(random.choice(np.arange(self.action_size)))

    def step(self, state, action, reward, next_state, done, stack_size):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_RATE
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            samples = self.memory.sample()
            self.learn(samples, GAMMA, stack_size)

    def learn(self, experiences, gamma, stack_size):
        states, actions, rewards, next_states, dones = experiences

        if self.pixels:
            next_states = Variable(
                next_states
            )  #next_states.view(next_states.shape[0],stack_size,3, stack_size,32,32))
            states = Variable(states)  #states.view(states.shape[0],3,64,64))
#         else:
#todo bring back the old version stuff here

        _target = self.QN_target(next_states).detach().max(1)[0].unsqueeze(1)
        action_values_target = rewards + gamma * _target * (1 - dones)
        action_values_expected = self.QN_local(states).gather(1, actions)

        loss = F.mse_loss(action_values_expected, action_values_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target Qnetwork
        for target_param, local_param in zip(self.QN_target.parameters(),
                                             self.QN_local.parameters()):
            target_param.data.copy_(TAU * local_param.data +
                                    (1.0 - TAU) * target_param.data)
Example #17
def main():

    parser = argparse.ArgumentParser(
        'a program to train or run a deep q-learning agent')
    parser.add_argument("game", type=str, help="name of game to play")
    parser.add_argument("agent_type",
                        type=str,
                        help="name of learning/acting technique used")
    parser.add_argument("agent_name",
                        type=str,
                        help="unique name of this agent instance")
    parser.add_argument("--rom_path",
                        type=str,
                        help="path to directory containing atari game roms",
                        default='../roms')
    parser.add_argument(
        "--watch",
        help=
        "if true, a pretrained model with the specified name is loaded and tested with the game screen displayed",
        action='store_true')

    parser.add_argument("--epochs",
                        type=int,
                        help="number of epochs",
                        default=200)
    parser.add_argument("--epoch_length",
                        type=int,
                        help="number of steps in an epoch",
                        default=250000)
    parser.add_argument("--test_steps",
                        type=int,
                        help="max number of steps per test",
                        default=125000)
    parser.add_argument("--test_steps_hardcap",
                        type=int,
                        help="absolute max number of steps per test",
                        default=135000)
    parser.add_argument("--test_episodes",
                        type=int,
                        help="max number of episodes per test",
                        default=30)
    parser.add_argument("--history_length",
                        type=int,
                        help="number of frames in a state",
                        default=4)
    parser.add_argument("--training_frequency",
                        type=int,
                        help="number of steps run before training",
                        default=4)
    parser.add_argument(
        "--random_exploration_length",
        type=int,
        help=
        "number of randomly-generated experiences to initially fill experience memory",
        default=50000)
    parser.add_argument("--initial_exploration_rate",
                        type=float,
                        help="initial exploration rate",
                        default=1.0)
    parser.add_argument("--final_exploration_rate",
                        type=float,
                        help="final exploration rate from linear annealing",
                        default=0.1)
    parser.add_argument(
        "--final_exploration_frame",
        type=int,
        help="frame at which the final exploration rate is reached",
        default=1000000)
    parser.add_argument("--test_exploration_rate",
                        type=float,
                        help="exploration rate while testing",
                        default=0.05)
    parser.add_argument("--frame_skip",
                        type=int,
                        help="number of frames to repeat chosen action",
                        default=4)
    parser.add_argument("--screen_dims",
                        type=tuple,
                        help="dimensions to resize frames",
                        default=(84, 84))
    # used for stochasticity and to help prevent overfitting.
    # Must be greater than frame_skip * (observation_length -1) + buffer_length - 1
    parser.add_argument("--max_start_wait",
                        type=int,
                        help="max number of frames to wait for initial state",
                        default=60)
    # buffer_length = 1 prevents blending
    parser.add_argument("--buffer_length",
                        type=int,
                        help="length of buffer to blend frames",
                        default=2)
    parser.add_argument("--blend_method",
                        type=str,
                        help="method used to blend frames",
                        choices=('max',),
                        default='max')
    parser.add_argument("--reward_processing",
                        type=str,
                        help="method to process rewards",
                        choices=('clip', 'none'),
                        default='clip')
    # must set network_architecture to custom in order use custom architecture
    parser.add_argument(
        "--conv_kernel_shapes",
        type=tuple,
        help=
        "shapes of convnet kernels: ((height, width, in_channels, out_channels), (next layer))"
    )
    # must have same length as conv_kernel_shapes
    parser.add_argument(
        "--conv_strides",
        type=tuple,
        help="connvet strides: ((1, height, width, 1), (next layer))")
    # currently,  you must have at least one dense layer
    parser.add_argument(
        "--dense_layer_shapes",
        type=tuple,
        help="shapes of dense layers: ((in_size, out_size), (next layer))")
    parser.add_argument("--discount_factor",
                        type=float,
                        help="constant to discount future rewards",
                        default=0.99)
    parser.add_argument("--learning_rate",
                        type=float,
                        help="constant to scale parameter updates",
                        default=0.00025)
    parser.add_argument("--optimizer",
                        type=str,
                        help="optimization method for network",
                        choices=('rmsprop', 'graves_rmsprop'),
                        default='rmsprop')
    parser.add_argument("--rmsprop_decay",
                        type=float,
                        help="decay constant for moving average in rmsprop",
                        default=0.95)
    parser.add_argument("--rmsprop_epsilon",
                        type=float,
                        help="constant to stabilize rmsprop",
                        default=0.01)
    # set error_clipping to less than 0 to disable
    parser.add_argument(
        "--error_clipping",
        type=float,
        help="constant at which td-error becomes linear instead of quadratic",
        default=1.0)
    # set gradient clipping to 0 or less to disable.  Currently only works with graves_rmsprop.
    parser.add_argument("--gradient_clip",
                        type=float,
                        help="clip gradients to have the provided L2-norm",
                        default=0)
    parser.add_argument("--target_update_frequency",
                        type=int,
                        help="number of steps between target network updates",
                        default=10000)
    parser.add_argument(
        "--memory_capacity",
        type=int,
        help="max number of experiences to store in experience memory",
        default=1000000)
    parser.add_argument(
        "--batch_size",
        type=int,
        help="number of transitions sampled from memory during learning",
        default=32)
    # must set to custom in order to specify custom architecture
    parser.add_argument("--network_architecture",
                        type=str,
                        help="name of prespecified network architecture",
                        choices=("deepmind_nips", "deepmind_nature, custom"),
                        default="deepmind_nature")
    parser.add_argument("--recording_frequency",
                        type=int,
                        help="number of steps before tensorboard recording",
                        default=50000)

    parser.add_argument("--saving_threshold",
                        type=int,
                        help="min score threshold for saving model.",
                        default=0)

    parser.add_argument("--parallel",
                        help="parallelize acting and learning",
                        action='store_true')
    parser.add_argument(
        "--double_dqn",
        help="use double q-learning algorithm in error target calculation",
        action='store_true')
    args = parser.parse_args()

    if args.network_architecture == 'deepmind_nature':
        args.conv_kernel_shapes = [[8, 8, 4, 32], [4, 4, 32, 64],
                                   [3, 3, 64, 64]]
        args.conv_strides = [[1, 4, 4, 1], [1, 2, 2, 1], [1, 1, 1, 1]]
        args.dense_layer_shapes = [[3136, 512]]
    elif args.network_architecture == 'deepmind_nips':
        args.conv_kernel_shapes = [[8, 8, 4, 16], [4, 4, 16, 32]]
        args.conv_strides = [[1, 4, 4, 1], [1, 2, 2, 1]]
        args.dense_layer_shapes = [[2592, 256]]

    if not args.watch:
        train_stats = RecordStats(args, False)
        test_stats = RecordStats(args, True)
        training_emulator = AtariEmulator(args)
        testing_emulator = AtariEmulator(args)
        num_actions = len(training_emulator.get_possible_actions())
        experience_memory = ExperienceMemory(args, num_actions)

        q_network = None
        agent = None
        if args.parallel:
            q_network = ParallelQNetwork(args, num_actions)
            agent = ParallelDQNAgent(args, q_network, training_emulator,
                                     experience_memory, num_actions,
                                     train_stats)
        else:
            q_network = QNetwork(args, num_actions)
            agent = DQNAgent(args, q_network, training_emulator,
                             experience_memory, num_actions, train_stats)

        experiment.run_experiment(args, agent, testing_emulator, test_stats)

    else:
        testing_emulator = AtariEmulator(args)
        num_actions = len(testing_emulator.get_possible_actions())
        q_network = QNetwork(args, num_actions)
        agent = DQNAgent(args, q_network, None, None, num_actions, None)
        experiment.evaluate_agent(args, agent, testing_emulator, None)
Example #18
class Agent:
    def __init__(self, env, state_space, action_space, device, learning_rate, buffer_size, \
                 batch_size, gamma, in_channels, train_freq = 4, target_update_freq=1e4, is_ddqn = False):
        self.env = env
        self.state_space = state_space
        self.action_space = action_space
        
        self.QNetwork_local = QNetwork(in_channels, self.state_space, self.action_space.n, device = device).to(device)
        self.QNetwork_local.init_weights()
        self.QNetwork_target = QNetwork(in_channels, self.state_space,self.action_space.n , device = device).to(device)
        self.QNetwork_target.load_state_dict(self.QNetwork_local.state_dict())
        
        self.optimizer = torch.optim.RMSprop(self.QNetwork_local.parameters(), lr=learning_rate, alpha=0.95, eps=0.01, centered=True)
        self.criterion = torch.nn.MSELoss()
        self.memory = ReplayBuffer(capacity=int(buffer_size), batch_size=batch_size)
        self.step_count = 0.
        self.batch_size = batch_size
        self.gamma = gamma
        
        self.device = device
        self.buffer_size = buffer_size
        self.num_train_updates = 0
        self.train_freq = train_freq
        self.target_update_freq = target_update_freq

        self.is_ddqn = is_ddqn
        
        print('Agent initialized with memory size:{}'.format(buffer_size))
        
    def act(self, state, epsilon):
        if random.random() > epsilon:
            #switch to evaluation mode, evaluate state, switch back to train mode
            self.QNetwork_local.eval()
            with torch.no_grad():
                actions = self.QNetwork_local(state)
            self.QNetwork_local.train()
            
            actions_np = actions.data.cpu().numpy()[0]
            best_action_idx = int(np.argmax(actions_np))
            return best_action_idx
        else:
            rand_action = self.action_space.sample()
            return rand_action
            
        
    def step(self, state, action, reward, next_state , done, add_memory=True):
        #TODO calculate priority here?
        if add_memory: 
            priority = 1.
            reward_clip = np.sign(reward)
            self.memory.add(state=state, action=action, next_state=next_state, reward=reward_clip, done=done, priority=priority)
            self.step_count = (self.step_count+1) % self.train_freq  #self.update_rate
            
            #if self.step_count == 0 and len(self.memory) >= self.batch_size:
            self.network_is_updated = False
            if self.step_count == 0 and len(self.memory) == self.buffer_size:
                samples = self.memory.random_sample(self.device)
                self.learn(samples)
                self.num_train_updates +=1
                self.network_is_updated = True
            
    def learn(self, samples):
        states, actions, rewards, next_states, dones = samples
        

        if self.is_ddqn is True:
            # DDQN: find max action using local network & gather the values of actions from target network
            next_actions = torch.argmax(self.QNetwork_local(next_states).detach(), dim=1).unsqueeze(1)
            q_target_next = self.QNetwork_target(next_states).gather(1,next_actions)
        else:
            # DQN: find the max action from target network
            q_target_next = self.QNetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
    
        # expected actions
        q_local_current = self.QNetwork_local(states).gather(1,actions)

        self.optimizer.zero_grad() #cleans up previous values

        # TD Error 
        TD_target = rewards + (self.gamma*q_target_next * (1-dones))
        TD_error = self.criterion(q_local_current, TD_target)
        TD_error.backward()
        torch.nn.utils.clip_grad_norm_(self.QNetwork_local.parameters(), 5.)
        self.optimizer.step()
        
        if (self.num_train_updates/self.train_freq) % self.target_update_freq == 0:
            self.QNetwork_target.load_state_dict(self.QNetwork_local.state_dict())
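A minimal interaction loop for this agent, assuming env and agent are already constructed and that states arrive as batched tensors on the agent's device; the preprocess helper and the epsilon schedule are hypothetical, not part of the original code:

epsilon = 1.0
state = preprocess(env.reset())          # hypothetical helper returning a (1, C, H, W) tensor
for t in range(100000):
    action = agent.act(state, epsilon)
    next_frame, reward, done, _ = env.step(action)
    next_state = preprocess(next_frame)
    agent.step(state, action, reward, next_state, done)
    state = preprocess(env.reset()) if done else next_state
    epsilon = max(0.1, epsilon - 9e-7)   # assumed linear decay toward 0.1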
Example #19
        index = x.index(1)
        y = [0 for _ in range(index)] + [1 for _ in range(len(x) - index)]
        ys.append(y)
    if num_batches > batch_size:
        remainder = num_batches % batch_size
        while remainder != 0:
            xs.append([0 for _ in range(seq_length)])
            ys.append([0 for _ in range(seq_length)])
            remainder -= 1
    xs = torch.Tensor(xs).unsqueeze(2).to(device)
    ys = torch.Tensor(ys).unsqueeze(2).to(device)
    print('x: {}'.format(xs[:, :, 0]))
    print('y: {}'.format(ys[:, :, 0]))
    return xs, ys

q = QNetwork(input_size, hidden_size, seq_length).to(device)
p = PNetwork(input_size, hidden_size, seq_length).to(device)
params = list(q.parameters()) + list(p.parameters())
optimizer = torch.optim.Adagrad(params, lr=lr)


def train(xs, ys):
    q.train()
    p.train()
    train_loss = 0
    for epoch in range(num_epochs):
        for b in range(0, num_batches, batch_size):
            
            z = q.sample(xs[b:b+batch_size], ys[b:b+batch_size], 0)
            z = z.detach()
            # print(z.size())
Example #20
def deep_qlearning(env, nframes, discount_factor, N, C, mini_batch_size,
                   replay_start_size, sgd_update_frequency,
                   initial_exploration, final_exploration,
                   final_exploration_frame, lr, alpha, m):
    """
    Input:
    - env: environment
    - nframes: # of frames to train on
    - discount_factor (gamma): how much to discount future rewards
    - N: replay memory size
    - C: number of steps before updating Q target network
    - mini_batch_size: mini batch size
    - replay_start_size: minimum size of replay memory before learning starts
    - sgd_update_frequency: number of action selections in between consecutive
      mini batch SGD updates
    - initial_exploration: initial epsilon value
    - final_exploration: final epsilon value
    - final_exploration_frame: number of frames over which the epsilon is
      annealed to its final value
    - lr: learning rate used by RMSprop
    - alpha: alpha value used by RMSprop
    - m: number of consecutive frames to stack for input to Q network

    Output:
    - Q: trained Q-network
    """
    n_actions = env.action_space.n
    Q = QNetwork(n_actions)
    Q_target = deepcopy(Q)
    Q_target.eval()

    transform = T.Compose([T.ToTensor()])
    optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha)
    criterion = nn.MSELoss()

    D = deque(maxlen=N)  # replay memory

    last_Q_target_update = 0
    frames_count = 0
    last_sgd_update = 0
    episodes_count = 0
    episode_rewards = []

    while True:
        frame_sequence = initialize_frame_sequence(env, m)
        state = transform(np.stack(frame_sequence, axis=2))

        episode_reward = 0
        done = False

        while not done:
            epsilon = annealed_epsilon(initial_exploration, final_exploration,
                                       final_exploration_frame, frames_count)

            action = get_epsilon_greedy_action(Q, state.unsqueeze(0), epsilon,
                                               n_actions)

            frame, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward])

            episode_reward += reward.item()
            if done:
                next_state = None
                episode_rewards.append(episode_reward)
            else:
                frame_sequence.append(preprocess_frame(frame))
                next_state = transform(np.stack(frame_sequence, axis=2))

            D.append((state, action, reward, next_state))

            state = next_state

            if len(D) < replay_start_size:
                continue

            last_sgd_update += 1
            if last_sgd_update < sgd_update_frequency:
                continue
            last_sgd_update = 0

            sgd_update(Q, Q_target, D, mini_batch_size, discount_factor,
                       optimizer, criterion)

            last_Q_target_update += 1
            frames_count += mini_batch_size

            if last_Q_target_update % C == 0:
                Q_target = deepcopy(Q)
                Q_target.eval()

            if frames_count >= nframes:
                return Q, episode_rewards

        episodes_count += 1
        if episodes_count % 100 == 0:
            save_stuff(Q, episode_rewards)
            print(f'episodes completed = {episodes_count},',
                  f'frames processed = {frames_count}')
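Given the parameters documented in the docstring, a hypothetical call might look like this; the environment and the numbers are illustrative (loosely following common DQN settings) and do not come from this snippet:

import gym

env = gym.make('PongNoFrameskip-v4')     # assumed Atari environment
Q, episode_rewards = deep_qlearning(
    env,
    nframes=10_000_000,
    discount_factor=0.99,
    N=1_000_000,                  # replay memory size
    C=10_000,                     # Q target network update period
    mini_batch_size=32,
    replay_start_size=50_000,
    sgd_update_frequency=4,
    initial_exploration=1.0,
    final_exploration=0.1,
    final_exploration_frame=1_000_000,
    lr=2.5e-4,
    alpha=0.95,
    m=4)                          # frames stacked per state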
Example #21
    import keras as ks
    import numpy as np
    from agents.deep_q import DeepQLearning
    from environments.snake import SnakeVisual
    from q_network import QNetwork

    env = SnakeVisual(grid_size=[8, 8], render=True, render_freq=1)
    actions = env.valid_actions()
    size = np.shape(env.reset().state)

    nn = ks.models.Sequential()
    nn.add(ks.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='sigmoid', input_shape=size))
    nn.add(ks.layers.Conv2D(filters=24, kernel_size=(3, 3), activation='sigmoid'))
    nn.add(ks.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='sigmoid'))
    nn.add(ks.layers.Flatten())
    nn.add(ks.layers.Dense(units=16, activation='sigmoid'))
    nn.add(ks.layers.Dense(units=3,  activation='linear'))

    nn.compile(optimizer=ks.optimizers.Adam(lr=0.0001), loss='mse')

    print(nn.summary())

    def normalize_state(s):
        return np.reshape(s.state, newshape=(1,) + size)


    dqn = QNetwork(nn, actions, normalize_state)

    dql = DeepQLearning(env, dqn)

    q = dql.learn()
Example #22
class Agent:
    def __init__(self,
                 device,
                 state_size,
                 action_size,
                 buffer_size=10,
                 batch_size=10,
                 learning_rate=0.1,
                 discount_rate=0.99,
                 eps_decay=0.9,
                 tau=0.1,
                 steps_per_update=4):
        self.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.q_network_control = QNetwork(state_size, action_size).to(device)
        self.q_network_target = QNetwork(state_size, action_size).to(device)
        self.optimizer = torch.optim.Adam(self.q_network_control.parameters(),
                                          lr=learning_rate)

        self.batch_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size,
                                          buffer_size)

        self.discount_rate = discount_rate

        self.eps = 1.0
        self.eps_decay = eps_decay

        self.tau = tau

        self.step_count = 0
        self.steps_per_update = steps_per_update

    def policy(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        return self.epsilon_greedy_policy(self.eps, state)

    def epsilon_greedy_policy(self, eps, state):
        self.q_network_control.eval()
        with torch.no_grad():
            action_values = self.q_network_control(state)
        self.q_network_control.train()

        if random.random() > eps:
            greedy_choice = np.argmax(action_values.cpu().data.numpy())
            return greedy_choice
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        p = self.calculate_p(state, action, reward, next_state, done)
        self.replay_buffer.add(state, action, reward, next_state, done, p)
        if self.step_count % self.steps_per_update == 0:
            self.learn()
        self.step_count += 1

    def learn(self):
        if len(self.replay_buffer) < self.batch_size:
            return
        states, actions, rewards, next_states, dones, p = \
            self.replay_buffer.sample(self.batch_size)

        error = self.bellman_eqn_error(states, actions, rewards, next_states,
                                       dones)
        importance_scaling = (self.replay_buffer.buffer_size * p)**-1
        loss = (importance_scaling * (error**2)).sum() / self.batch_size
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_target()

    def bellman_eqn_error(self, states, actions, rewards, next_states, dones):
        """Double DQN error - use the control network to get the best action
        and apply the target network to it to get the target reward which is
        used for the bellman eqn error.
        """
        self.q_network_control.eval()
        with torch.no_grad():
            a_max = self.q_network_control(next_states).argmax(1).unsqueeze(1)

        target_action_values = self.q_network_target(next_states).gather(
            1, a_max)
        target_rewards = rewards + self.discount_rate * (1 - dones) \
                         * target_action_values

        self.q_network_control.train()
        current_rewards = self.q_network_control(states).gather(1, actions)
        error = current_rewards - target_rewards
        return error

    def calculate_p(self, state, action, reward, next_state, done):
        next_state = torch.from_numpy(next_state[np.newaxis, :]).float().to(
            self.device)
        state = torch.from_numpy(state[np.newaxis, :]).float().to(self.device)
        action = torch.from_numpy(np.array([[action]])).long().to(self.device)
        reward = torch.from_numpy(np.array([reward])).float().to(self.device)
        done = torch.from_numpy(np.array([[done]], dtype=np.uint8)).float().to(
            self.device)

        return abs(
            self.bellman_eqn_error(state, action, reward, next_state,
                                   done)) + 1e-3

    def update_target(self):
        for target_param, control_param in zip(
                self.q_network_target.parameters(),
                self.q_network_control.parameters()):
            target_param.data.copy_(self.tau * control_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def end_of_episode(self):
        self.eps *= self.eps_decay
        self.step_count = 0

    def save(self, path):
        torch.save(self.q_network_control.state_dict(), path)

    def restore(self, path):
        self.q_network_control.load_state_dict(torch.load(path))
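The class above exposes policy, step, end_of_episode and save; a compact training loop under the assumption of a Gym-style environment (environment name, episode count and checkpoint path are placeholders):

import gym
import torch

env = gym.make('LunarLander-v2')                     # assumed environment
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = Agent(device, state_size=env.observation_space.shape[0],
              action_size=env.action_space.n,
              buffer_size=int(1e5), batch_size=64)

for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.policy(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    agent.end_of_episode()                           # decays epsilon, resets the step counter
agent.save('checkpoint.pth')                         # placeholder path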
Example #23
if __name__ == '__main__':
    import keras as ks
    import numpy as np
    from agents.deep_q import DeepQLearning
    from environments.cartpole import CartPole
    from q_network import QNetwork

    nn = ks.models.Sequential()
    nn.add(ks.layers.Dense(32, activation='sigmoid', input_shape=(4, )))
    nn.add(ks.layers.Dense(32, activation='sigmoid'))
    nn.add(ks.layers.Dense(2, activation='linear'))

    nn.compile(optimizer=ks.optimizers.Adam(lr=0.001), loss='mse')

    env = CartPole(render=True)
    actions = env.valid_actions()

    dqn = QNetwork(nn, actions, lambda x: np.reshape(x.state, newshape=(1, 4)))

    dql = DeepQLearning(env, dqn)

    q = dql.learn()
Example #24
def main():
    """
    Main function for training the DQN network in learning how to accomplish autonomous landing.
    """
    # ATTENTION: If you want to restore files from a previous simulation you
    # must pass valid values for these variables:
    policy_weights_path = '/home/pulver/Desktop/DQN-single/episode_36000/policy/policy_checkpoint.ckp'

    ground_list = [
        "asphalt11_small", "asphalt12_small", "asphalt13_small",
        "brick11_small", "brick12_small", "brick13_small", "grass11_small",
        "grass12_small", "grass13_small", "pavement11_small",
        "pavement12_small", "pavement13_small", "sand11_small", "sand12_small",
        "sand13_small", "snow11_small", "snow12_small", "snow13_small",
        "soil11_small", "soil12_small", "soil13_small"
    ]

    summary_folder = ""  # if empty the summary is written in ./log/ + current time
    start_episode = 0  # change to last episode number
    frame_counter = 0  # change to last number of frames

    screen_width = 84  # original is 160
    screen_height = 84  # original is 210
    images_stack_size = 4
    # Use only the first 6 actions for this simulation
    # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (land),
    # 6 (ascend), 7 (descend), 8 (rotate_left), 9 (rotate_right)
    tot_actions = 6
    batch_size = 32  # size of the experience batch
    tot_steps = 1000  # finite-horizon simulation
    # high number since training can be stopped at any time
    tot_episodes = 50
    steps_per_episodes = 20  # expressed in number step
    noop_time = 2.0  # pause in seconds between actions

    num_ground_plane = 21
    actual_ground_index = 0
    # episode_per_ground specify the number of episodes with the same ground
    # plane
    ground_counter = 1
    episode_per_ground = 10

    timer_total_start = time.time()
    rospy.init_node("DeepReinforcedLanding")
    rospy.loginfo("----- Deep Reinforced Landing Node -----")

    rospy.Subscriber(
        "/quadrotor/ardrone/bottom/ardrone/bottom/image_raw",
        ROSImage,
        image_callback,
        queue_size=30)  # Store the last 30 messages before discarding them

    r = rospy.Rate(10)  # 10hz

    # Init session and networks
    sess = tf.Session()
    if (summary_folder == ""):
        tf_summary_writer = tf.summary.FileWriter(
            './log/' + str(datetime.datetime.now().time()), sess.graph)
    else:
        tf_summary_writer = tf.summary.FileWriter(summary_folder, sess.graph)
    policy_network = QNetwork(sess,
                              tot_actions=tot_actions,
                              image_shape=(screen_width, screen_height,
                                           images_stack_size),
                              batch_size=batch_size,
                              network_name="policy_net")
    init = tf.global_variables_initializer()
    sess.run(init)

    # Load Neural Networks weights from memory if a valid checkpoint path is passed
    # if(os.path.isfile(policy_weights_path) == True and
    # os.path.isfile(target_weights_path) == True):
    try:
        print("Loading weights from memory...")
        policy_network.load_weights(policy_weights_path)
    except Exception:
        print("Error while loading the weights!")
        return

    # start here
    for episode in range(start_episode, tot_episodes):
        # Reset the ground in the environment every 50 episodes (or
        # episode_per_ground)
        ground_index = episode // episode_per_ground
        if ground_index != actual_ground_index or (episode == start_episode):
            clean_world(ground_list)
            generate_new_world(ground_list)
        actual_ground_index = ground_index

        timer_start = time.time()
        actual_time = rospy.get_rostime()
        rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0
        cumulated_reward = 0
        print("")
        print("Episode: " + str(episode))
        # 1-Accumulate the first state
        reset_pose()
        print "Reset pose!"
        send_action("stop")
        rospy.sleep(1.0)
        state = _last_image
        # create a stack of X images
        image_t = np.stack([state] * images_stack_size, axis=2)
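        # The first state has no history, so the same greyscale frame is
        # repeated images_stack_size times to fill the 84x84x4 network input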
        frame_episode = 0
        for step in range(tot_steps):
            # 2- Get the action following epsilon-greedy or through policy network.
            # With probability epsilon take random action otherwise it takes
            # an action from the policy network.
            # Take a random action

            # Here image_t is always ready and it can be given as input to
            # the network
            action_distribution = policy_network.return_action_distribution(
                input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)),
                softmax=False)
            action = np.argmax(action_distribution)
            action = convert_action_int_to_str(action)
            # action = get_random_action()
            send_action(action)
            rospy.sleep(noop_time)
            # 3- Get the state_t1 repeating the action obtained at step-2 and add everything
            # in the replay buffer. Then set state_t = state_t1
            # get_image()
            image_t1 = _last_image
            # If the action taken is to land and the UAV is inside the landing
            # BB, done will be calculated accordingly
            done_reward = get_done_reward()
            reward = done_reward.reward
            done = done_reward.done
            image_t1 = np.expand_dims(image_t1, 2)
            # stack the images
            image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2)
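            # Sliding window over time: drop the oldest of the four stacked
            # frames and append the newest observation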
            image_t = image_t1
            cumulated_reward += reward
            send_action("stop")
            frame_episode = frame_episode + 1

            # get_relative_pose(ground_index, episode, step, reward)

            # The 30-second wall-clock limit below is currently disabled; the
            # episode ends when the frame threshold is reached
            timer_stop = time.time()
            # Stop the episode if the number of frames exceeds the threshold
            if frame_episode >= steps_per_episodes:
                done = True
            # if timer_stop - timer_start >= 30.0: # maximum time allowed
            #     #cumulated_reward += -1
            #     done = True
            # When the episode is done
            if done:
                timer_stop = time.time()
                actual_time = rospy.get_rostime()
                rospy_stop_time = actual_time.secs + actual_time.nsecs / 1000000000.0
                rospy_time_elapsed = rospy_stop_time - rospy_start_time
                print("Tot Frame counter: " + str(frame_counter))
                print("Time episode: " + str(timer_stop - timer_start) +
                      " seconds")
                print("Ros time episode: " + str(rospy_time_elapsed) +
                      " seconds")
                if cumulated_reward >= 0:
                    rospy.logwarn("Positive reward obtained!")
                print("Cumulated reward: " + str(cumulated_reward))
                print("Episode finished after {} timesteps".format(step + 1))
                sys.stdout.flush()
                # time.sleep(5)
                with open(
                        "./results_" +
                        str(datetime.date.today().strftime("%m%d%Y")) + ".csv",
                        "a") as myfile:
                    boolean_reward = 0
                    if (cumulated_reward > 0):
                        boolean_reward = 1
                    string_to_add = "mixed" + "," + str(episode) + "," + str(
                        step) + "," + str(cumulated_reward) + "," + str(
                            boolean_reward) + '\n'
                    myfile.write(string_to_add)
                break
Exemplo n.º 25
0
def main():
    """
    Main function for training the DQN network to learn autonomous landing.
    """

    # ground_list = ["water1",
    #             "water2",
    #             "water3",
    #             "water4",
    #             "water5",
    #             "water6",
    #             "water7",
    #             "water8",
    #             "water9",
    #             "water10"]

    # ground_list = ["water11", "water12", "water13",
    #    "water14", "water15", "water16",
    #    "water17", "water18", "water19"]

    ground_list = [
        "pavement11", "pavement12", "pavement5", "pavement7", "pavement14",
        "pavement15", "pavement16"
    ]
    # NOTE: the ckp path must be modified in the code below
    ckp_list = ["120750"]
    # ATTENTION: If you want to restore files from a previous simulation you
    # must pass valid values for these variables:
    # policy_weights_path =
    # '/home/pulver/Desktop/marine_descending/checkpoint/episode_94500/policy/policy_checkpoint.ckp'
    summary_folder = ""  # if empty the summary is written in ./log/ + current time
    start_episode = 0  # change to last episode number
    frame_counter = 0  # change to last number of frames

    screen_width = 84  # original is 160
    screen_height = 84  # original is 210
    images_stack_size = 4
    # Use only the first 6 actions for this simulation
    # action_list = ['left', 'right', 'forward', 'backward', 'stop', 'descend']
    # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (descend)
    tot_actions = 6
    batch_size = 32  # size of the experience batch
    tot_steps = 1000  # finite-horizon simulation

    steps_per_episodes = 40  # expressed in number of steps
    noop_time = 2.0  # pause in seconds between actions

    num_ground_plane = len(ground_list)
    actual_ground_index = 0
    # episode_per_ground specifies the number of episodes run on the same
    # ground plane
    episode_per_ground = 10
    tot_episodes = episode_per_ground * num_ground_plane
    accuracy_ground = 0.0

    timer_total_start = time.time()
    rospy.init_node("DRLTestNode")
    rospy.loginfo("----- DRL Test Node -----")

    # Create a subscriber for the greyscale image
    # rospy.Subscriber("/drl/grey_camera", ROSImage, image_callback)#TODO
    # restore default
    rospy.Subscriber(
        "/quadrotor/ardrone/bottom/ardrone/bottom/image_raw",
        ROSImage,
        image_callback,
        queue_size=30)  # Store the last 30 messages before discarding them

    # Create a publisher for changing light position
    # light_pub = rospy.Publisher('/gazebo/default/light/modify', Light)
    images_stack_size = 4

    # r = rospy.Rate(10)  # 10hz

    # Init session and networks
    sess = tf.Session()
    policy_network = QNetwork(sess,
                              tot_actions=tot_actions,
                              image_shape=(screen_width, screen_height,
                                           images_stack_size),
                              batch_size=batch_size,
                              network_name="policy_net")
    init = tf.global_variables_initializer()
    sess.run(init)

    for i in range(len(ckp_list)):
        ckp = ckp_list[i]
        policy_weights_path = '/home/pulver/Desktop/sim11/episode_' + \
            str(ckp) + '/policy/policy_checkpoint.ckp'

        if (summary_folder == ""):
            tf_summary_writer = tf.summary.FileWriter(
                './' + str(ckp) + '/log/' +
                str(datetime.datetime.now().time()), sess.graph)
        else:
            tf_summary_writer = tf.summary.FileWriter(summary_folder,
                                                      sess.graph)

        if (policy_weights_path != ""):
            print("Loading weights from memory: " + str(policy_weights_path))
            policy_network.load_weights(policy_weights_path)
        else:
            print("The network path is empty. Choose appropriate weights!")
            sys.exit()

        # start here
        for episode in range(start_episode, tot_episodes):
            # Reset the ground plane every episode_per_ground episodes
            ground_index = episode / episode_per_ground
            print "Current ground index: " + str(ground_index)
            if episode % (
                    episode_per_ground) == 0 and episode != start_episode:
                with open(
                        "./" + str(ckp) + "/accuracy_ckp" + str(ckp) + ".csv",
                        "a") as myfile:
                    accuracy_ground = accuracy_ground / \
                        (episode_per_ground * 1.0)
                    print "Final accuracy value: " + str(accuracy_ground)
                    string_to_add = ground_list[ground_index - 1] + \
                        "," + str(accuracy_ground) + "\n"
                    myfile.write(string_to_add)

            if ground_index != actual_ground_index or (episode
                                                       == start_episode):
                accuracy_ground = 0.0
                clean_world(ground_list)
                generate_new_world(ground_index, ground_list)

            # Reset the light every 10 episodes
            # if (episode % 10 == 0):
            #     # Change the position of the three point lights
            #     for i in range(0, 3):
            #         model_to_change = "user_point_light_" + str(i)
            #         x = STDrandom.uniform(-5.0, 5.0)
            #         y = STDrandom.uniform(-5.0, 5.0)
            #         z = STDrandom.uniform(1.0, 10.0)

            #         # os.system("rosrun gazebo_ros spawn_model -file ~/.gazebo/models/" + model_to_change +
            #         #           "/model.sdf -sdf -model " + model_to_change + "_plane -x " + x + " -y " + y + " -z " + z + "")

            #         light_msg = Light()
            #         light_msg.pose.x = x
            #         light_msg.pose.y = y
            #         light_msg.pose.z = z
            #         light_msg.name = model_to_change
            #         light_pub.publish(light_msg)
            #         rospy.loginfo("Lights have been reset!")

            actual_ground_index = ground_index
            timer_start = time.time()
            actual_time = rospy.get_rostime()
            rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0
            frame_episode = 0
            cumulated_reward = 0
            epsilon_used = 0
            send_action("takeoff")
            print("")
            print("Episode: " + str(episode))
            # 0-Reset USV's position
            os.system(
                "rosservice call /gazebo/set_model_state '{model_state: { model_name: usv_model, pose: { position: { x: 0.0, y: 0.0 ,z: 5.0 }, orientation: {x: 0, y: 0.0, z: 0, w: 0.0 } }, twist: { linear: {x: 0.0 , y: 0 ,z: 0 } , angular: { x: 0.0 , y: 0 , z: 0.0 } } , reference_frame: world } }'"
            )
            # 1-Accumulate the first state
            reset_pose()
            print "Reset pose!"
            send_action("stop")
            rospy.sleep(1.0)
            #        get_image()
            state = _last_image
            # create a stack of X images
            image_t = np.stack([state] * images_stack_size, axis=2)

            for step in range(tot_steps):
                # 2- Get the action following epsilon-greedy or through policy network.
                # Here image_t is always ready and it can be given as input to
                # the network
                action_distribution = policy_network.return_action_distribution(
                    input_data=np.reshape(image_t,
                                          (1, 84, 84, images_stack_size)),
                    softmax=False)
                # print ""
                # print action_distribution
                action = np.argmax(action_distribution)
                action = convert_action_int_to_str(action)
                # print action
                send_action(action)
                rospy.sleep(noop_time)
                # 3- Get the state_t1 repeating the action obtained at step-2 and add everything
                # in the replay buffer. Then set state_t = state_t1
                # get_image()
                image_t1 = _last_image
                send_action("stop")
                # If the action taken is to land and the UAV is inside the landing
                # BB, done will be calculated accordingly
                done_reward = get_done_reward()
                reward = done_reward.reward
                done = done_reward.done
                image_t1 = np.expand_dims(image_t1, 2)
                # stack the images
                image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2)
                frame_counter += 1  # To call every time a frame is obtained
                frame_episode += 1
                image_t = image_t1
                get_relative_pose(ground_index, episode, step, reward,
                                  ground_list, ckp)
                cumulated_reward += reward
                timer_stop = time.time()
                # Stop the episode if the number of frames exceeds the
                # per-episode threshold
                if frame_episode >= steps_per_episodes:
                    done = True
                # When the episode is done
                if done:
                    timer_stop = time.time()
                    actual_time = rospy.get_rostime()
                    rospy_stop_time = actual_time.secs + actual_time.nsecs / 1000000000.0
                    rospy_time_elapsed = rospy_stop_time - rospy_start_time
                    print("Tot Frame counter: " + str(frame_counter))
                    print("Time episode: " + str(timer_stop - timer_start) +
                          " seconds")
                    print("Ros time episode: " + str(rospy_time_elapsed) +
                          " seconds")
                    if cumulated_reward >= 0:
                        rospy.logwarn("Positive reward obtained!")
                    print("Cumulated reward: " + str(cumulated_reward))
                    print("Episode finished after {} timesteps".format(step +
                                                                       1))
                    sys.stdout.flush()

                    with open(
                            "./" + str(ckp) + "/results_ckp" + str(ckp) + "_" +
                            str(datetime.date.today().strftime("%m%d%Y")) +
                            ".csv", "a") as myfile:
                        boolean_reward = 0
                        if (cumulated_reward > 0):
                            boolean_reward = 1
                            accuracy_ground += 1.0
                        print "Current accuracy value: " + str(accuracy_ground)
                        string_to_add = ground_list[ground_index] + "," + str(
                            episode) + "," + str(step) + "," + str(
                                cumulated_reward) + "," + str(
                                    boolean_reward) + '\n'
                        myfile.write(string_to_add)

                    if episode == tot_episodes - 1:
                        with open(
                                "./" + str(ckp) + "/accuracy_ckp" + str(ckp) +
                                ".csv", "a") as myfile:
                            accuracy_ground = accuracy_ground / \
                                (episode_per_ground * 1.0)
                            print "Final accuracy value: " + str(
                                accuracy_ground)
                            string_to_add = ground_list[actual_ground_index] + \
                                "," + str(accuracy_ground) + "\n"
                            myfile.write(string_to_add)

                    break
Exemplo n.º 26
0
class DQN:
    def __init__(self,
                 config,
                 game,
                 directory,
                 callback=None,
                 summary_writer=None):

        self.game = game
        self.actions = game.get_available_actions()
        self.feedback_size = game.get_feedback_size()
        self.callback = callback
        self.summary_writer = summary_writer

        self.config = config
        self.batch_size = config['batch_size']
        self.n_episode = config['num_episode']
        self.capacity = config['capacity']
        self.epsilon_decay = config['epsilon_decay']
        self.epsilon_min = config['epsilon_min']
        self.num_frames = config['num_frames']
        self.num_nullops = config['num_nullops']
        self.time_between_two_copies = config['time_between_two_copies']
        self.input_scale = config['input_scale']
        self.update_interval = config['update_interval']
        self.directory = directory

        self._init_modules()

    def _init_modules(self):

        # Replay memory
        self.replay_memory = ReplayMemory(history_len=self.num_frames,
                                          capacity=self.capacity,
                                          batch_size=self.batch_size,
                                          input_scale=self.input_scale)

        input_shape = self.feedback_size + (self.num_frames, )
        # Q-network
        self.q_network = QNetwork(input_shape=input_shape,
                                  n_outputs=len(self.actions),
                                  network_type=self.config['network_type'],
                                  scope='q_network')
        # Target network
        self.target_network = QNetwork(
            input_shape=input_shape,
            n_outputs=len(self.actions),
            network_type=self.config['network_type'],
            scope='target_network')
        # Optimizer
        self.optimizer = Optimizer(config=self.config,
                                   feedback_size=self.feedback_size,
                                   q_network=self.q_network,
                                   target_network=self.target_network,
                                   replay_memory=self.replay_memory)
        # Ops for updating target network
        self.clone_op = self.target_network.get_clone_op(self.q_network)
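        # Running clone_op copies the online q_network weights into the target
        # network; train() triggers it every time_between_two_copies trials as
        # a hard target update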
        # For tensorboard
        self.t_score = tf.placeholder(dtype=tf.float32,
                                      shape=[],
                                      name='new_score')
        tf.summary.scalar("score", self.t_score, collections=['dqn'])
        self.summary_op = tf.summary.merge_all('dqn')

    def set_summary_writer(self, summary_writer=None):
        self.summary_writer = summary_writer
        self.optimizer.set_summary_writer(summary_writer)

    def choose_action(self, sess, state, epsilon_greedy):
        if numpy.random.binomial(1, epsilon_greedy) == 1:
            action = random.choice(self.actions)
        else:
            x = numpy.asarray(numpy.expand_dims(state, axis=0) /
                              self.input_scale,
                              dtype=numpy.float32)
            action = self.q_network.get_q_action(sess, x)[0]
        return action

    def play(self, action):
        r, new_state, termination = self.game.play_action(action)
        return r, new_state, termination

    def update_target_network(self, sess):
        sess.run(self.clone_op)

    def train(self, sess, saver=None):

        num_of_trials = -1
        for episode in range(self.n_episode):
            self.game.reset()
            frame = self.game.get_current_feedback()
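            # Warm up with num_nullops no-op steps (action 0) so the replay
            # memory holds enough frames to build the first stacked state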
            for _ in range(self.num_nullops):
                r, new_frame, termination = self.play(action=0)
                self.replay_memory.add(frame, 0, r, termination)
                frame = new_frame

            for _ in range(self.config['T']):
                num_of_trials += 1
                epsilon_greedy = self.epsilon_min + \
                    max(self.epsilon_decay - num_of_trials, 0) / \
                    self.epsilon_decay * (1 - self.epsilon_min)
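                # Linear annealing: epsilon starts at 1.0 and decays to
                # epsilon_min over the first epsilon_decay trials, then stays
                # at epsilon_min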
                print("epi {}, frame {}k: reward {}, eps {}".format(
                    episode, int(num_of_trials / 1000),
                    self.game.get_total_reward(), epsilon_greedy))
                if num_of_trials % self.update_interval == 0:
                    self.optimizer.train_one_step(sess, num_of_trials,
                                                  self.batch_size)

                state = self.replay_memory.phi(frame)
                action = self.choose_action(sess, state, epsilon_greedy)
                r, new_frame, termination = self.play(action)
                self.replay_memory.add(frame, action, r, termination)
                frame = new_frame

                if num_of_trials % self.time_between_two_copies == 0:
                    self.update_target_network(sess)
                    self.save(sess, saver)

                if self.callback:
                    self.callback()
                if termination:
                    score = self.game.get_total_reward()
                    summary_str = sess.run(self.summary_op,
                                           feed_dict={self.t_score: score})
                    self.summary_writer.add_summary(summary_str, num_of_trials)
                    self.summary_writer.flush()
                    break

    def evaluate(self, sess):

        for episode in range(self.n_episode):
            self.game.reset()
            frame = self.game.get_current_feedback()
            for _ in range(self.num_nullops):
                r, new_frame, termination = self.play(action=0)
                self.replay_memory.add(frame, 0, r, termination)
                frame = new_frame

            for _ in range(self.config['T']):
                print("episode {}, total reward {}".format(
                    episode, self.game.get_total_reward()))

                state = self.replay_memory.phi(frame)
                action = self.choose_action(sess, state, self.epsilon_min)
                r, new_frame, termination = self.play(action)
                self.replay_memory.add(frame, action, r, termination)
                frame = new_frame

                if self.callback:
                    self.callback()
                if termination:
                    break

    def save(self, sess, saver, model_name='model.ckpt'):
        if saver:
            try:
                checkpoint_path = os.path.join(self.directory, model_name)
                saver.save(sess, checkpoint_path)
            except Exception:
                pass

    def load(self, sess, saver, model_name='model.ckpt'):
        if saver:
            try:
                checkpoint_path = os.path.join(self.directory, model_name)
                saver.restore(sess, checkpoint_path)
            except Exception:
                pass
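
# Usage sketch: a minimal, hedged illustration of how the DQN class above
# could be driven under TF1 (tensorflow assumed to be imported as `tf`, as in
# the rest of the example). The config keys mirror the ones read in
# DQN.__init__ and train(); the concrete values, the `game` object and the
# checkpoint directory are assumptions, not taken from the source.
def run_dqn_training(game, directory='./checkpoints'):
    config = {'batch_size': 32, 'num_episode': 1000, 'capacity': 100000,
              'epsilon_decay': 100000, 'epsilon_min': 0.1, 'num_frames': 4,
              'num_nullops': 5, 'time_between_two_copies': 10000,
              'input_scale': 255.0, 'update_interval': 4,
              'network_type': 'cnn', 'T': 10000}
    # Building the agent also builds the Q/target networks in the default graph
    agent = DQN(config, game, directory)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        agent.set_summary_writer(tf.summary.FileWriter(directory, sess.graph))
        agent.train(sess, saver)
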
Exemplo n.º 27
0
def main():
    """
    Main function for training the DQN network to learn autonomous landing.
    """
    # ATTENTION: If you want to restore files from a previous simulation you
    # must pass valid values for these variables:
    policy_weights_path = '/home/pulver/Desktop/simulation_8/checkpoint/episode_20500/policy/policy_checkpoint.ckp'
    summary_folder = ""  # if empty the summary is written in ./log/ + current time
    start_episode = 0  # change to last episode number
    frame_counter = 0  # change to last number of frames
    epsilon = 0.1
    screen_width = 84  # original is 160
    screen_height = 84  # original is 210
    images_stack_size = 4
    # Use only the first 6 actions for this simulation
    # action_list = ['left', 'right', 'forward', 'backward', 'stop', 'descend']
    # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (descend)
    tot_actions = 6
    batch_size = 64  # size of the experience batch
    tot_steps = 1000  # finite-horizon simulation
    # tot_episodes is only an upper bound: the run can be stopped at any time
    tot_episodes = 210
    steps_per_episodes = 40  # expressed in number of steps
    noop_time = 2.0  # pause in seconds between actions

    num_ground_plane = 70
    actual_ground_index = 0
    # episode_per_ground specifies the number of episodes with the same ground plane
    ground_counter = 1
    episode_per_ground = 10

    timer_total_start = time.time()
    rospy.init_node("DRLTestNode")
    rospy.loginfo("----- DRL Test Node -----")

    # Create a subscriber for the greyscale image
    # rospy.Subscriber("/drl/grey_camera", ROSImage, image_callback)#TODO
    # restore default
    rospy.Subscriber(
        "/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback,queue_size=30)  # Store the last 30 messages before discarding them

    images_stack_size = 4

    r = rospy.Rate(10)  # 10hz
    ground_list = ["asphalt11", "asphalt12", "asphalt13",
                   "brick11", "brick12", "brick13",
                   "grass11", "grass12", "grass13",
                   "pavement11", "pavement12", "pavement13",
                   "sand11", "sand12", "sand13",
                   "snow11", "snow12", "snow13",
                   "soil11", "soil12", "soil13"]

    # Init session and networks
    sess = tf.Session()
    if(summary_folder == ""):
        tf_summary_writer = tf.summary.FileWriter(
            './log/' + str(datetime.datetime.now().time()), sess.graph)
    else:
        tf_summary_writer = tf.summary.FileWriter(
            summary_folder, sess.graph)
    policy_network = QNetwork(sess, tot_actions=tot_actions, image_shape=(
        screen_width, screen_height, images_stack_size), batch_size=batch_size, network_name="policy_net")
    
    init = tf.global_variables_initializer()
    sess.run(init)

    
    if(policy_weights_path != ""):
        print("Loading weights from memory: " + str(policy_weights_path))
        policy_network.load_weights(policy_weights_path)
    else:
        print("The network path is empty. Choose appropriate weights!")
        sys.exit()

    # start here
    for episode in range(start_episode, tot_episodes):
        # Reset the ground in the environment every episode_per_ground episodes
        ground_index = episode / episode_per_ground
        if ground_index != actual_ground_index or (episode == 0):
            clean_world(ground_list)
            generate_new_world(ground_index, ground_list)
        actual_ground_index = ground_index

        timer_start = time.time()
        actual_time = rospy.get_rostime()
        rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0
        frame_episode = 0
        cumulated_reward = 0
        epsilon_used = 0
        send_action("takeoff")
        print("")
        print("Episode: " + str(episode))
        # 1-Accumulate the first state
        reset_pose()
        print "Reset pose!"
        send_action("stop")
        rospy.sleep(1.0)
#        get_image()
        state = _last_image
        # create a stack of X images
        image_t = np.stack([state] * images_stack_size, axis=2)

        for step in range(tot_steps):
            # 2- Get the action following epsilon-greedy or through policy network.
            # Here image_t is always ready and it can be given as input to
            # the network
            epsilon_used_bool = False
            if np.random.random_sample() < epsilon:
                epsilon_used += 1
                action = get_random_action()
                epsilon_used_bool = True
            # Take the action from the policy network
            else:
                # Here image_t is always ready and it can be given as input to
                # the network
                action_distribution = policy_network.return_action_distribution(
                    input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), softmax=False)
                action = np.argmax(action_distribution)
                action = convert_action_int_to_str(action)
            #print action
            send_action(action)
            rospy.sleep(noop_time)
            # 3- Get the state_t1 repeating the action obtained at step-2 and add everything
            # in the replay buffer. Then set state_t = state_t1
            #get_image()
            image_t1 = _last_image
            send_action("stop")
            # If the action taken is to land and the UAV is inside the landing BB, done will be calculated accordingly
            done_reward = get_done_reward()
            reward = done_reward.reward
            done = done_reward.done
            if epsilon_used_bool:
                print " Step(" + str(step) + "), Action: " + action + ", Altitude: " + str(done_reward.z) + ", Reward: " + str(reward)
            else:
                print "#Step(" + str(step) + "), Action: " + action + ", Altitude: " + str(done_reward.z) + ", Reward: " + str(reward)
                # ['left', 'right', 'forward', 'backward', 'stop', 'descend']
                print(" left:" + str(action_distribution[0][0][0]) + "; right:" + str(action_distribution[0][0][1]) + "; forward:" + str(action_distribution[0][0][2]) + "; backward:" + str(action_distribution[0][0][3]) + "; stop:" + str(action_distribution[0][0][4])  + "; descend:" + str(action_distribution[0][0][5]))
            image_t1 = np.expand_dims(image_t1, 2)
            # stack the images
            image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2)
            frame_counter += 1  # To call every time a frame is obtained
            frame_episode += 1
            image_t = image_t1
            get_relative_pose(ground_index, episode, step, reward)
            cumulated_reward += reward
            timer_stop = time.time()
            # Stop the episode if the number of frames exceeds the per-episode
            # threshold
            if frame_episode >= steps_per_episodes:
                done = True
            # When the episode is done
            if done:
                timer_stop = time.time()
                actual_time = rospy.get_rostime()
                rospy_stop_time = actual_time.secs + actual_time.nsecs / 1000000000.0
                rospy_time_elapsed = rospy_stop_time - rospy_start_time
                print("Tot Frame counter: " + str(frame_counter))
                print("Time episode: " + str(timer_stop - timer_start) + " seconds")
                print("Ros time episode: " +
                      str(rospy_time_elapsed) + " seconds")
                if cumulated_reward >= 0:
                    rospy.logwarn("Positive reward obtained!")
                print("Cumulated reward: " + str(cumulated_reward))
                print("Episode finished after {} timesteps".format(step + 1))
                print("Epsilon used: " + str(epsilon_used) + " out of " + str(step + 1) + "(" + str(float((epsilon_used * 100.0) / (step + 1.0))) + "%)")
                sys.stdout.flush()
                time.sleep(5)
                break
Exemplo n.º 28
0
class DeepQ_agent:
    """
    Represents the DQN agent.
    """
    def __init__(self, env, hidden_units = None, network_LR=0.01, batch_size=1024, update_every=5, gamma=0.95):
        """
        Creates a DQN agent.

        :param env: game environment.
        :type env: Class Snake_Env().
        :param hidden_units: number of neurons in each layer.
        :type hidden_units: tuple with dimension (1, 3).
        :param network_LR: learning rate of the action-value neural network.
        :type network_LR: float.
        :param batch_size: size of the minibatch taken from the replay buffer.
        :type batch_size: int.
        :param update_every: number of iterations for updating the target qnetwork. 
        :type update_every: int
        :param gamma: discount factor.
        :type gamma: float.
        """
        self.env = env
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma          
        self.NETWORK_LR = network_LR
        self.MEMORY_CAPACITY = int(1e5)   
        self.ACTION_SIZE = env.ACTION_SPACE           
        self.HIDDEN_UNITS = hidden_units
        self.UPDATE_EVERY = update_every
       
        self.qnetwork_local = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)
        
        self.qnetwork_target = QNetwork(input_shape = self.env.STATE_SPACE,
                                        hidden_units = self.HIDDEN_UNITS,
                                        output_size = self.ACTION_SIZE,
                                        learning_rate = self.NETWORK_LR)

        self.memory = ReplayMemory(self.MEMORY_CAPACITY, self.BATCH_SIZE) 

        #Temp variable
        self.t = 0


    def learn(self):
        """
        Learn from memorized experience.
        """
        if len(self.memory) > self.BATCH_SIZE:
            states, actions, rewards, next_states, dones = self.memory.sample(self.env.STATE_SPACE)
            
            #Calculating action-values using local network
            target = self.qnetwork_local.predict(states, self.BATCH_SIZE)
            
            #Future action-values using target network
            target_val = self.qnetwork_target.predict(next_states, self.BATCH_SIZE)
            
            #Future action-values using local network
            target_next = self.qnetwork_local.predict(next_states, self.BATCH_SIZE)
        
            max_action_values = np.argmax(target_next, axis=1)   #action selection
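            # Double DQN decoupling: the local (online) network picks the
            # greedy action above, while the target network's estimate
            # (target_val) evaluates that action in the loop below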
            
            for i in range(self.BATCH_SIZE):
                if dones[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + self.GAMMA*target_val[i][max_action_values[i]]   #action evaluation
            
            self.qnetwork_local.train(states, target, batch_size = self.BATCH_SIZE)

            if self.t == self.UPDATE_EVERY:
                self.update_target_weights()
                self.t = 0
            else:
                self.t += 1


    def act(self, state, epsilon=0.0):
        """
        Chooses an action using an epsilon-greedy policy.
        
        :param state: current state.
        :type state: NumPy array with dimension (1, 18).
        :param epsilon: epsilon used in epsilon-greedy policy.
        :type epsilon: float
        :return action: action chosen by the agent.
        :rtype: int
        """    
        state = state.reshape((1,)+state.shape)
        action_values = self.qnetwork_local.predict(state)    #returns a vector of size = self.ACTION_SIZE
        if random() > epsilon:
            action = np.argmax(action_values)                 #choose best action - Exploitation
        else:
            action = randint(0, self.ACTION_SIZE-1)           #choose random action - Exploration
        return action


    def add_experience(self, state, action, reward, next_state, done):
        """
        Add experience to agent's memory.
        """
        self.memory.add(state, action, reward, next_state, done)

    
    def update_target_weights(self):
        """
        Updates values of the Target network.
        """
        self.qnetwork_target.model.set_weights(self.qnetwork_local.model.get_weights())
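
# Training-loop sketch: a minimal, hedged illustration of how DeepQ_agent's
# act / add_experience / learn methods fit together. The env.reset() and
# env.step() signatures and the epsilon schedule below are assumptions, not
# taken from the Snake_Env source.
def train_snake_agent(env, agent, n_episodes=500,
                      eps_start=1.0, eps_min=0.05, eps_decay=0.995):
    epsilon = eps_start
    for episode in range(n_episodes):
        state = env.reset()  # assumed to return the initial state vector
        done = False
        while not done:
            action = agent.act(state, epsilon)
            # assumed signature: next observation, reward and terminal flag
            next_state, reward, done = env.step(action)
            agent.add_experience(state, action, reward, next_state, done)
            agent.learn()  # trains only once enough experience is stored
            state = next_state
        # Multiplicative epsilon decay (assumed schedule)
        epsilon = max(eps_min, epsilon * eps_decay)
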
Exemplo n.º 29
0
def main():
    """
    Main function for training the DQN network to learn autonomous landing.
    """
    # ATTENTION: If you want to restore files from a previous simulation you
    # must pass valid values for these variables:
    policy_weights_path_1 = '/home/pulver/Desktop/episode_113250/policy/policy_checkpoint.ckp'
    root_images = "/home/pulver/Desktop/network_testing/" + \
        str(datetime.datetime.now().time()) + "/"
    # NOTE: OpenCV will not write to a folder that does not exist
    if not os.path.exists(root_images):
        os.makedirs(root_images)
    source = 3  # NOTE: 1 for real drone, 2 for gazebo, 3 for URL

    screen_width = 84  # original is 160
    screen_height = 84  # original is 210
    images_stack_size = 4
    # Use only the first 6 actions for this simulation
    # action_list = ['left', 'right', 'forward', 'backward', 'stop', 'descend']
    # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (descend)
    tot_actions = 6
    batch_size = 32  # size of the experience batch

    rospy.init_node("DRLTrainingNode")
    rospy.loginfo("----- DRL Training Node -----")

    # Create a subscriber for the greyscale image

    # 1) Real drone
    if source == 1:
        rospy.Subscriber("/drl/grey_camera", ROSImage, image_callback)
    elif source == 2:
        # 2) Gazebo
        rospy.Subscriber("/quadrotor/ardrone/bottom/ardrone/bottom/image_raw",
                         ROSImage, image_callback, queue_size=30)  # Store the last 30 messages
    elif source == 3:
        # 3) URL
        # video_stream_url = 'http://10.188.34.59:8080/videofeed'
        video_stream_url = "http://10.5.5.9:8080/live/amba.m3u8"
        bytes = ''
    else:
        print "Insert a correct source value (1 for real drone, 2 for gazebo, 3 for URL)"

    images_stack_size = 4
    tot_steps = 3000000  # finite-horizon simulation

    r = rospy.Rate(1)  # 1hz

    # Init session and networks
    sess = tf.Session()
    summary_folder = ""  # if empty the summary is written in ./log/ + current time
    if(summary_folder == ""):
        tf_summary_writer = tf.summary.FileWriter(
            './log/' + str(datetime.datetime.now().time()), sess.graph)
    else:
        tf_summary_writer = tf.summary.FileWriter(
            summary_folder, sess.graph)
    policy_network_1 = QNetwork(sess, tot_actions=tot_actions, image_shape=(
        screen_width, screen_height, images_stack_size), batch_size=batch_size,
        network_name="policy_net")

    init = tf.global_variables_initializer()
    sess.run(init)

    # Load Neural Networks weights from memory if a valid checkpoint path is
    # passed
    if(policy_weights_path_1 != ""):
        print("Loading weights 1 from memory...")
        policy_network_1.load_weights(policy_weights_path_1)
    else:
        print("The network path is empty.")
    #     sys.exit()

    if source == 3:
        state = acquire_frame(video_stream_url)
        time.sleep(1)
    state = _last_image
    # Save first image
    image_path = root_images + "image_0.png"
    cv2.imwrite(image_path, state)
    # 1 - Create a stack of X images
    image_t = np.stack([state] * images_stack_size, axis=2)
    matplotlib.interactive(True)
    # fig = plt.figure()
    f, (ax3, ax2) = plt.subplots(2, 1)
    f.tight_layout()
    # gs = gridspec.GridSpec(2, 2, width_ratios=[1, 1])
    # gs.update(left=0.05, right=0.95, wspace=0.05, hspace=0)
    # fig.set_size_inches(8, 4)
    f.patch.set_facecolor('gray')

    for step in range(1, 100000000):
        ##########################################
        ##                CAMERA                ##
        ##########################################
        pad_size = 1
        pad_value = 0
        # print "Shape image_t:" + str(np.shape(image_t))
        image = np.lib.pad(image_t[:, :, 0], (pad_size, pad_size),
                           'constant', constant_values=(pad_value, pad_value))
        for d in range(1, images_stack_size):
            image_stack_padded = np.lib.pad(
                image_t[:, :, d], (pad_size, pad_size), 'constant', constant_values=(pad_value, pad_value))
            image = np.append(image, image_stack_padded, axis=1)
        # print"shape: " + str(np.shape(image))
        # Plot in the first row the camera images
        # ax1 = plt.subplot(2, 1, 1)
        # ax1 = plt.subplot(gs[0, :])
        # but first clear the old one
        # ax1.clear()
        # ax1.axis("off")
        # specify greyscale instead BGR
        # ax1.imshow(image, cmap='gray')
        #########################################

        # 2 - Forward in input to NN
        # action_distribution_1 = policy_network_1.return_action_distribution(
        # input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)),
        # softmax=False)
        action_distribution_1, conv1, conv2, conv3 = policy_network_1.return_everything(
            input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), softmax=False)
        action = np.argmax(action_distribution_1)
        action = convert_action_int_to_str(action)
        print "######################"
        print "Action distribution: " + str(action_distribution_1)
        print "Action_1 selected: " + action
        # print "Shape action dis:" + str(np.shape(action_distribution_1))
        # print "Shape conv1:" + str(np.shape(conv1))
        # print "Shape conv2:" + str(np.shape(conv2))
        # print "Shape conv3:" + str(np.shape(conv3))

        ##########################################
        ##                FILTERS               ##
        ##########################################
        ax3.clear()
        ax3.axis("off")

        padding = np.ones((1, 21 * 16))
        # 1 Conv layer ###########################
        conv1 = np.reshape(conv1, (21, 21, 32))
        image_cv1_1 = np.reshape(conv1[:, :, 0:16], (21, 21 * 16), order='F')
        image_cv1_2 = np.reshape(conv1[:, :, 16:32], (21, 21 * 16), order='F')
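        # order='F' lays the 16 channel activation maps side by side, turning
        # each (21, 21, 16) slab into a single 21 x (21*16) strip for display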
        # image_cv1_1 = np.reshape(conv1[:, :, 0:8], (21, 21 * 8), order='F')
        # image_cv1_2 = np.reshape(conv1[:, :, 8:16], (21, 21 * 8), order='F')
        # image_cv1_3 = np.reshape(conv1[:, :, 16:24], (21, 21 * 8), order='F')
        # image_cv1_4 = np.reshape(conv1[:, :, 24:32], (21, 21 * 8), order='F')
        image_cv1 = np.concatenate(
            (padding, image_cv1_1, image_cv1_2), axis=0)
        # Save filters
        filter_path = root_images + "filters/step_" + \
            str(step) + "/"
        if not os.path.exists(filter_path):
            os.makedirs(filter_path)
        filter_path = filter_path + "conv_1.jpg"

        I = image_cv1
        I8 = (((I - I.min()) / (I.max() - I.min())) * 255.9).astype(np.uint8)
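        # Min-max normalise the activations to [0, 255] and cast to uint8 so
        # the strip can be written out as a standard greyscale image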
        image_cv1_resized = cv2.resize(I8, (84 * 4, 21 * 2),
                                       interpolation=cv2.INTER_AREA)
        img = Image.fromarray(image_cv1_resized)
        img.save(filter_path)

        # 2 Conv layer ###########################
        padding = np.zeros((1, 11 * 32))
        conv2 = np.reshape(conv2, (11, 11, 64))
        image_cv2_1 = np.reshape(conv2[:, :, 0:32], (11, 11 * 32), order='F')
        image_cv2_2 = np.reshape(conv2[:, :, 32:64], (11, 11 * 32), order='F')
        # image_cv2_3 = np.reshape(conv2[:, :, 16:24], (21, 21 * 8), order='F')
        # image_cv2_4 = np.reshape(conv2[:, :, 24:32], (21, 21 * 8), order='F')
        image_cv2 = np.concatenate(
            (padding, image_cv2_1, image_cv2_2), axis=0)
        # Save filters
        filter_path = root_images + "filters/step_" + \
            str(step) + "/"
        if not os.path.exists(filter_path):
            os.makedirs(filter_path)
        filter_path = filter_path + "conv_2.jpg"

        I = image_cv2
        I8 = (((I - I.min()) / (I.max() - I.min())) * 255.9).astype(np.uint8)
        image_cv2_resized = cv2.resize(I8, (84 * 4, 11 * 2),
                                       interpolation=cv2.INTER_AREA)
        img = Image.fromarray(image_cv2_resized)
        img.save(filter_path)

        # 3 Conv layer ###########################
        padding = np.ones((1, 11 * 32))
        conv3 = np.reshape(conv3, (11, 11, 64))
        image_cv3_1 = np.reshape(conv3[:, :, 0:32], (11, 11 * 32), order='F')
        image_cv3_2 = np.reshape(conv3[:, :, 32:64], (11, 11 * 32), order='F')
        # image_cv2_3 = np.reshape(conv2[:, :, 16:24], (21, 21 * 8), order='F')
        # image_cv2_4 = np.reshape(conv2[:, :, 24:32], (21, 21 * 8), order='F')
        image_cv3 = np.concatenate(
            (padding, image_cv3_1, image_cv3_2), axis=0)
        # Save filters
        filter_path = root_images + "filters/step_" + \
            str(step) + "/"
        if not os.path.exists(filter_path):
            os.makedirs(filter_path)
        filter_path = filter_path + "conv_3.jpg"

        I = image_cv3
        I8 = (((I - I.min()) / (I.max() - I.min())) * 255.9).astype(np.uint8)
        image_cv3_resized = cv2.resize(I8, (84 * 4, 11 * 2),
                                       interpolation=cv2.INTER_AREA)
        img = Image.fromarray(image_cv3_resized)
        img.save(filter_path)

        # Assemble final image ####################

        image_input = np.reshape(image_t, (84, 84 * 4), order='F')
        image_input_resized = cv2.resize(image, (84 * 4, 84),
                                         interpolation=cv2.INTER_AREA)
        final_image = np.concatenate(
            (image_input_resized, image_cv1_resized, image_cv2_resized, image_cv3_resized))

        # Plot the image
        # ax3 = plt.subplot(gs[0, :])

        # but first clear the old one

        # specify greyscale instead BGR
        ax3 = plt.subplot(2, 1, 1)
        ax3.imshow(final_image, cmap='gray')
        ##########################################
        ##               HISTOGRAM              ##
        ##########################################
        # print np.shape(action_distribution_1)
        # print len(action_distribution_1[0])
        ind = np.arange(len(action_distribution_1[0]))
        # print ind
        width = 0.4
        # fig, ax = plt.subplots()
        # ax2 = plt.subplot(gs[1, :])
        ax2.clear()
        ax2 = plt.subplot(2, 1, 2)
        ax2.set_aspect(3)

        rects = ax2.bar(0.4 + ind, action_distribution_1[0],
                        width=width, color='r', alpha=0.4)
        rects[np.argmax(action_distribution_1)].set_color('g')
        ax2.set_xticks(0.4 + ind + width / 2)
        ax2.set_xticklabels(("left", "right", "forward",
                             "backward", "stop", "descend"))
        for i in ax2.xaxis.get_ticklabels():
            i.set_color("white")

        ax2.set_ylim([0, 1])

        # plt.xticks(fontsize=16)
        # plt.show()

        ##########################################

        # 3 - Send command and wait for translation
        raw_input("Press Enter to acquire the next image...")

        # 4 - Acquire second frame and add to the stack
        if source == 3:
            global _last_image
            _last_image = acquire_frame(video_stream_url)
            time.sleep(1)
        image_t1 = _last_image
        # Save every image
        image_path = root_images + "image_" + str(step) + ".png"
        cv2.imwrite(image_path, image_t1)
        image_t1 = np.expand_dims(image_t1, 2)
        image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2)

        # 5 - Forward stack in input to the network
        image_t = image_t1
Exemplo n.º 30
0
def main():
    """
    Main function for training the DQN network to learn autonomous landing.
    """
    # ATTENTION: If you want to restore files from a previous simulation you
    # must pass valid values for these variables:
    policy_weights_path = '/home/pulver/Desktop/episode_32400/policy/policy_checkpoint.ckp'

    summary_folder = ""  # if empty the summary is written in ./log/ + current time
    start_episode = 0  # change to last episode number
    frame_counter = 0  # change to last number of frames

    screen_width = 84  # original is 160
    screen_height = 84  # original is 210
    images_stack_size = 4
    # Use only the first 6 actions for this simulation
    # 0 (left, 1 (right), 2 (forward), 3 (backward), 4 (stop), 5 (land),
    # 6 (ascend), 7 (descend), 8 (rotate_left), 9 (rotate_right)
    tot_actions = 6
    batch_size = 32  # size of the experience batch
    tot_steps = 1000  # finite-horizon simulation
    # tot_episodes is only an upper bound: training can be stopped at any time
    tot_episodes = 1050
    steps_per_episodes = 20  # expressed in number of steps
    noop_time = 2.0  # pause in seconds between actions

    num_ground_plane = 21
    episodes_per_ground_plane = tot_episodes / num_ground_plane
    actual_ground_index = 0

    timer_total_start = time.time()
    rospy.init_node("DeepReinforcedLanding")
    rospy.loginfo("----- Deep Reinforced Landing Node -----")

    rospy.Subscriber(
        "/quadrotor/ardrone/bottom/ardrone/bottom/image_raw", ROSImage, image_callback, queue_size=30)  # Store the last 30 messages before discarding them

    r = rospy.Rate(10)  # 10hz

    # Init session and networks
    sess = tf.Session()
    if(summary_folder == ""):
        tf_summary_writer = tf.summary.FileWriter(
            './log/' + str(datetime.datetime.now().time()), sess.graph)
    else:
        tf_summary_writer = tf.summary.FileWriter(
            summary_folder, sess.graph)
    policy_network = QNetwork(sess, tot_actions=tot_actions, image_shape=(
        screen_width, screen_height, images_stack_size), batch_size=batch_size, network_name="policy_net")
    init = tf.global_variables_initializer()
    sess.run(init)

    # Load Neural Networks weights from memory if a valid checkpoint path is passed
    # if(os.path.isfile(policy_weights_path) == True and
    # os.path.isfile(target_weights_path) == True):
    try:
        print("Loading weights from memory...")
        policy_network.load_weights(policy_weights_path)
    except Exception:
        print("Error while loading the weights!")
        return

    print "Episode, step, action, reward, action_distribution"
    # start here
    for episode in range(start_episode, tot_episodes):
        # Reset the ground in the environment every 25k frames
        # ground_index = episode / 50
        # if ground_index != actual_ground_index or episode == 0:
        #     generate_new_world(ground_index)
        # actual_ground_index = ground_index
        timer_start = time.time()
        actual_time = rospy.get_rostime()
        rospy_start_time = actual_time.secs + actual_time.nsecs / 1000000000.0
        cumulated_reward = 0   
        # 1-Accumulate the first state
        #reset_pose()
        send_action("stop")
        rospy.sleep(1.0)
        state = _last_image
        # create a stack of X images
        image_t = np.stack([state] * images_stack_size, axis=2)
        frame_episode = 0
        pad_size = 3
        pad_value = 255
        for step in range(tot_steps):
            # 2- Get the action following epsilon-greedy or through policy network.
            # With probability epsilon take random action otherwise it takes
            # an action from the policy network.
            # Take a random action
            
            image_stack_padded = [
                np.lib.pad(image_t[:, :, i], (pad_size, pad_size), 'constant',
                           constant_values=(pad_value, pad_value))
                for i in range(images_stack_size)
            ]
            image_hstack = np.hstack(image_stack_padded)
            cv2.imwrite("/home/pulver/Desktop/print_network_action_distribution/img_ep" +
                        str(episode) + "_step" + str(step) + ".png", image_hstack)
            # Here image_t is always ready and it can be given as input to
            # the network
            action_distribution = policy_network.return_action_distribution( input_data=np.reshape(image_t, (1, 84, 84, images_stack_size)), softmax=False)
            
            action = np.argmax(action_distribution)
            action = convert_action_int_to_str(action)
            #action = get_random_action()
            send_action(action)
            rospy.sleep(noop_time)
            # 3- Get the state_t1 repeating the action obtained at step-2 and add everything
            # in the replay buffer. Then set state_t = state_t1
            # get_image()
            image_t1 = _last_image
            # If the action taken is to land and the UAV is inside the landing
            # BB, done will be calculated accordingly
            done_reward = get_done_reward()
            reward = done_reward.reward
            done = done_reward.done
            image_t1 = np.expand_dims(image_t1, 2)
            # stack the images
            image_t1 = np.append(image_t[:, :, 1:], image_t1, axis=2)
            image_t = image_t1
            cumulated_reward += reward
            send_action("stop")
            frame_episode = frame_episode + 1
            #get_relative_pose(ground_index, episode, step, reward)
            # The 30-second wall-clock limit below is currently disabled; the
            # episode ends when the frame threshold is reached
            timer_stop = time.time()
            print str(episode) + "," + str(step) + "," + action + "," + str(reward) + "," + str(action_distribution)
            # Stop the episode if the number of frames exceeds the threshold
            if frame_episode >= steps_per_episodes:
                done = True
            # if timer_stop - timer_start >= 30.0: # maximum time allowed
            #     #cumulated_reward += -1
            #     done = True
            # When the episode is done
            if done:
                break