Example #1
            if next_state_hash != current_state_hash:
                main_action = np.argmax(act_counter)
                graph.update_transition(current_state_hash, main_action,
                                        next_state_hash)
                act_counter = np.zeros((output_size, ), dtype=np.int32)

            # Use the action confidence at the current state's hash code as the intrinsic reward
            in_reward = curiosity_weight * graph.action_confidence(
                current_state_hash, action.item())
            # in_reward = curiosity_weight * np.sqrt(in_reward)       # Take the square root of confidence value

            # Record transition in memory
            memory.add_transition(action,
                                  log_prob,
                                  next_state,
                                  extrinsic_reward=reward,
                                  extrinsic_value_estimate=ex_val,
                                  intrinsic_reward=in_reward,
                                  intrinsic_value_estimate=in_val)
            # memory.add_transition(action, log_prob, next_state,
            #                       extrinsic_reward=running_reward if done else 0., extrinsic_value_estimate=ex_val,
            #                       intrinsic_reward=in_reward, intrinsic_value_estimate=in_val)

            # Update current state
            current_state = next_state
            current_state_hash = next_state_hash

            # Render this episode
            if render and (render_each_episode or
                           (not finished_rendering_this_epoch)):
                env.render()
            act_counter[action.item()] += 1
Example #2
            # If next state hashed to a different code than the current state, then infer the dominating action,
            #   update causal link, and clear action counter
            if next_state_hash != current_state_hash:
                main_action = np.argmax(act_counter)
                graph.update_transition(current_state_hash, main_action, next_state_hash)
                act_counter = np.zeros((output_size,), dtype=np.int32)

            # Use the action confidence at the current state's hash code as the intrinsic reward
            in_reward = graph.action_confidence(current_state_hash, action.item())
            in_reward = curiosity_weight * np.sqrt(in_reward)       # Take the square root of confidence value

            # Record transition in memory
            memory.add_transition(action, log_prob, next_state,
                                  extrinsic_reward=running_reward if done else 0., extrinsic_value_estimate=ex_val,
                                  intrinsic_reward=in_reward, intrinsic_value_estimate=in_val)


            # Update current state
            current_state = next_state
            current_state_hash = next_state_hash

            # Render this episode
            if render and (render_each_episode or (not finished_rendering_this_epoch)):
                env.render()

            if done:
                # Load and print episode stats after each episode ends
                episode_durations.append(t + 1)
                episode_rewards.append(running_reward)
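The loops in these examples rely on a `graph` object exposing `update_transition`, `update_termination`, and `action_confidence`, whose implementation is not included in the excerpts. Below is only a minimal count-based sketch of what such an interface could look like; the class name `TransitionGraph`, the storage layout, and the confidence formula (empirical outcome variance plus a count-based novelty term) are assumptions for illustration, not the original implementation.

import numpy as np
from collections import defaultdict


class TransitionGraph:
    """Hypothetical stand-in for the `graph` object used in the loops above."""

    def __init__(self):
        # (state_hash, action) -> {outcome_hash: observation count}
        self._counts = defaultdict(lambda: defaultdict(int))

    def update_transition(self, state_hash, action, next_state_hash):
        # Record that `action` taken at `state_hash` led to `next_state_hash`
        self._counts[(state_hash, int(action))][next_state_hash] += 1

    def update_termination(self, state_hash, action):
        # Record that `action` taken at `state_hash` ended the episode
        self._counts[(state_hash, int(action))]["terminal"] += 1

    def action_confidence(self, state_hash, action):
        # Uncertainty of the causal link (state_hash, action) -> outcome:
        # variance of the empirical outcome distribution plus a count-based
        # novelty bonus, so rarely tried actions earn a larger intrinsic reward
        outcomes = self._counts[(state_hash, int(action))]
        total = sum(outcomes.values())
        if total == 0:
            return 1.0  # never tried before: maximal curiosity
        probs = np.array(list(outcomes.values()), dtype=np.float64) / total
        return float(probs.var() + 1.0 / np.sqrt(total))

With a stand-in like this, `curiosity_weight * graph.action_confidence(current_state_hash, action.item())` yields an intrinsic reward that decays as a hashed state-action pair is revisited.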
Example #3
            running_reward += reward

            # Estimate the value of the next state
            value = value_net(
                torch.tensor([next_state],
                             device=device)).squeeze()  # squeeze the dimension

            # Render this episode
            if render and (render_each_episode or
                           (not finished_rendering_this_epoch)):
                env.render()

            # Record transition in memory
            memory.add_transition(action,
                                  log_prob,
                                  next_state,
                                  extrinsic_reward=reward,
                                  extrinsic_value_estimate=value)

            # Update current state
            current_state = next_state

            if done:
                # Load and print episode stats after each episode ends
                episode_durations.append(t + 1)
                episode_rewards.append(running_reward)
                if running_reward > training_info["max reward achieved"]:
                    training_info["max reward achieved"] = running_reward

                # Decide whether to render next episode
                if not (render_each_episode):
Example #4
            frame_list.append(transform(next_frame))
            next_state = torch.cat(frame_list,
                                   dim=0).to(device)  # Stack the images

            # Obtain action, log probability and value estimate for the next state in a single forward pass
            # Move the outputs to cpu to save memory
            next_action, next_log_prob, value = actor_critic(
                next_state.unsqueeze(dim=0))
            next_action = next_action.squeeze().cpu()
            next_log_prob = next_log_prob.squeeze().cpu()
            value = value.squeeze().cpu()

            # Record transition in memory
            memory.add_transition(action,
                                  log_prob.cpu(),
                                  next_state.clone().detach().cpu(),
                                  extrinsic_reward=reward,
                                  extrinsic_value_estimate=value)

            # Update current state and action
            action = next_action
            log_prob = next_log_prob

            # Visualizing AE Hash
            ae_hash.eval()  # Set in evaluation mode
            if stacked:
                code, latent = ae_hash.hash(next_state.unsqueeze(dim=0),
                                            base_ten=False)
                recon_state, _ = ae_hash(next_state.unsqueeze(dim=0))
            else:
                code, latent = ae_hash.hash(next_state[-1:].unsqueeze(dim=0),
Example #5
                # Or if reached termination, update transition to the termination state
                if next_state_hash != current_state_hash or done:
                    main_action = np.argmax(act_counter)
                    act_counter = np.zeros((actor_layer_sizes[-1],), dtype=np.int32)
                    if next_state_hash != current_state_hash:
                        graph.update_transition(current_state_hash, main_action, next_state_hash)
                    if done:
                        graph.update_termination(current_state_hash, main_action)


                in_reward = graph.action_confidence(current_state_hash, action.item())
                in_reward = curiosity_weight * in_reward  # Scale the confidence value by the curiosity weight

                # Store transition in memory
                memory.add_transition(action, log_prob.cpu(), next_state.clone().detach().cpu(),
                                      extrinsic_reward=reward, extrinsic_value_estimate=ex_val,
                                      intrinsic_reward=in_reward, intrinsic_value_estimate=in_val)

                current_state_hash = next_state_hash

            else:

                memory.add_transition(action, log_prob.cpu(), next_state.clone().detach().cpu(),
                                      extrinsic_reward=reward, extrinsic_value_estimate=ex_val,
                                      intrinsic_reward=0.0, intrinsic_value_estimate=in_val)

            current_state = next_state
            action = next_action
            log_prob = next_log_prob
Example #6
            action, log_prob = policy_net(
                torch.tensor([current_state], device=device))
            log_prob = log_prob.squeeze()

            # Interact with the environment
            next_state, reward, done, _ = env.step(action.item())
            running_reward += reward

            # Render this episode
            if render and (render_each_episode or
                           (not finished_rendering_this_epoch)):
                env.render()

            # Record transition in memory
            memory.add_transition(action,
                                  log_prob,
                                  next_state,
                                  extrinsic_reward=reward)

            # Update current state
            current_state = next_state

            if done:
                # Load and print episode stats after each episode ends
                episode_durations.append(t + 1)
                episode_rewards.append(running_reward)
                if running_reward > training_info["max reward achieved"]:
                    training_info["max reward achieved"] = running_reward

                # Decide whether to render next episode
                if not (render_each_episode):
                    finished_rendering_this_epoch = True
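All of the variants store their steps through `memory.add_transition(...)`, sometimes with only extrinsic quantities and sometimes with separate intrinsic rewards and value estimates. The sketch below shows one way a rollout buffer compatible with those keyword arguments could be organized; the class name `Memory`, the attribute names, and the `clear` helper are assumptions for illustration rather than the original code.

class Memory:
    """Hypothetical rollout buffer matching the add_transition calls above."""

    def __init__(self):
        self.actions, self.log_probs, self.next_states = [], [], []
        self.extrinsic_rewards, self.extrinsic_values = [], []
        self.intrinsic_rewards, self.intrinsic_values = [], []

    def add_transition(self, action, log_prob, next_state,
                       extrinsic_reward=0.0, extrinsic_value_estimate=None,
                       intrinsic_reward=0.0, intrinsic_value_estimate=None):
        # Append one environment step; intrinsic fields and value estimates
        # stay at their defaults for the purely extrinsic variants above
        self.actions.append(action)
        self.log_probs.append(log_prob)
        self.next_states.append(next_state)
        self.extrinsic_rewards.append(extrinsic_reward)
        self.extrinsic_values.append(extrinsic_value_estimate)
        self.intrinsic_rewards.append(intrinsic_reward)
        self.intrinsic_values.append(intrinsic_value_estimate)

    def clear(self):
        # Drop stored transitions after a policy update
        self.__init__()

Keeping the two reward streams in separate lists leaves it to the update step to compute and combine their returns, which is consistent with the separate extrinsic and intrinsic value estimates recorded in the snippets above.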
Example #7
            # If next state hashed to a different code than the current state, then infer the dominating action,
            #   update causal link, and clear action counter
            if next_state_hash != current_state_hash:
                main_action = np.argmax(act_counter)
                graph.update_transition(current_state_hash, main_action, next_state_hash)
                act_counter = np.zeros((output_size,), dtype=np.int32)

            # Use the action confidence at the current state's hash code as the intrinsic reward
            in_reward = curiosity_weight * graph.action_confidence(current_state_hash, action.item())
            # in_reward = curiosity_weight * np.sqrt(in_reward)       # Take the square root of confidence value

            # Record transition in memory
            # If not done (end of episode), only record exploration bonus. If done, add exploration bonus at that step
            #   with end-of-episode total running extrinsic reward.
            memory.add_transition(action, log_prob, next_state,
                                  extrinsic_reward=in_reward if not done else in_reward + running_reward,
                                  extrinsic_value_estimate=ex_val)

            # Update current state
            current_state = next_state
            current_state_hash = next_state_hash

            # Render this episode
            if render and (render_each_episode or (not finished_rendering_this_epoch)):
                env.render()

            if done:
                # Load and print episode stats after each episode ends
                episode_durations.append(t + 1)
                episode_rewards.append(running_reward)
                if running_reward > training_info["max reward achieved"]:
Example #8
            if next_state_hash != current_state_hash:
                main_action = np.argmax(act_counter)
                graph.update_transition(current_state_hash, main_action,
                                        next_state_hash)

            # Obtain action causal confidence (variance) and calculate curiosity (weighted)
            act_confidence = graph.action_confidence(current_state_hash,
                                                     action.item())
            curiosity = curiosity_weight * act_confidence

            # Record transition in memory with curiosity
            # Only summed-up end-of-episode reward is fed to the agent
            if done:
                memory.add_transition(action,
                                      log_prob,
                                      next_state,
                                      extrinsic_reward=running_reward,
                                      intrinsic_reward=curiosity)
            else:
                memory.add_transition(action,
                                      log_prob,
                                      next_state,
                                      extrinsic_reward=0.,
                                      intrinsic_reward=curiosity)

            # Update current state
            current_state = next_state
            current_state_hash = next_state_hash

            # Render this episode
            if render and (render_each_episode or