示例#1
0
    def __init__(self, state_size, action_size, seed, algorithm='DQN'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # set algorithm
        if algorithm == "DQN":
            self.learn = self.learnDQN
        elif algorithm == "DDQN":
            self.learn = self.learnDDQN
        else:
            raise ('algorithm {} not implemented'.format(algorithm))
示例#2
0
    def __init__(self,
                 env,
                 gamma=0.95,
                 epsilon=1.0,
                 copy_period=1000,
                 lr=0.01,
                 update_period=2):
        """
            gammma: 割引率
            epsilon: 探索と活用の割合
        """

        self.env = env

        self.gamma = gamma

        self.epsion = epsilon

        self.copy_period = copy_period

        self.update_period = update_period

        self.lr = lr

        self.global_steps = 0

        self.q_network = QNetwork(self.env.action_space.n, lr=lr)

        self.q_network.build(input_shape=(None, 4))

        self.target_network = QNetwork(self.env.action_space.n)

        self.target_network.build(input_shape=(None, 4))

        self.experiences = collections.deque(maxlen=self.MAX_EXPERIENCES)
示例#3
0
    def __init__(self, env, render, config_info):
        self.env = env
        self.render = render
        self._reset_env()

        # Create run folder to store parameters, figures, and tensorboard logs
        self.path_runs = create_run_folder(config_info)

        # Extract training parameters from yaml config file
        param = load_training_parameters(config_info["config_param"])
        self.train_param = param["training"]

        # Define device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        # Define models
        hidden_size = param["model"]["hidden_size"]
        self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)

        self.target_q_net = QNetwork(state_dim, num_actions, hidden_size).to(
            self.device
        )
        self.target_q_net.load_state_dict(self.q_net.state_dict())

        self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to(
            self.device
        )

        # Define loss criterion
        self.q_criterion = nn.MSELoss()

        # Define optimizers
        lr = float(param["optimizer"]["learning_rate"])
        self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(param["training"]["replay_size"])

        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

        # Useful variables
        self.batch_size = param["training"]["batch_size"]
        self.gamma = param["training"]["gamma"]
        self.tau = param["training"]["tau"]
        self.start_step = param["training"]["start_step"]
        self.max_timesteps = param["training"]["max_timesteps"]
        self.alpha = param["training"]["alpha"]
示例#4
0
 def __init__(self, action_size, state_size, config):
     self.seed = config["seed"]
     torch.manual_seed(self.seed)
     np.random.seed(seed=self.seed)
     random.seed(self.seed)
     self.env = gym.make(config["env_name"])
     self.env.seed(self.seed)
     now = datetime.now()
     dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
     self.env.action_space.seed(self.seed)
     self.action_size = action_size
     self.state_size = state_size
     self.min_action = config["min_action"]
     self.max_action = config["max_action"]
     self.seed = config["seed"]
     self.tau = config["tau"]
     self.gamma = config["gamma"]
     self.batch_size = config["batch_size"]
     if not torch.cuda.is_available():
         config["device"] == "cpu"
     self.device = config["device"]
     self.eval = config["eval"]
     self.vid_path = config["vid_path"]
     print("actions size ", action_size)
     print("actions min ", self.min_action)
     print("actions max ", self.max_action)
     fc1 = config["fc1_units"]
     fc2 = config["fc1_units"]
     self.actor = Actor(state_size, action_size, self.seed, fc1,
                        fc2).to(self.device)
     self.optimizer_a = torch.optim.Adam(self.actor.parameters(),
                                         config["lr_actor"])
     self.target_actor = Actor(state_size, action_size, self.seed, fc1,
                               fc2).to(self.device)
     self.target_actor.load_state_dict(self.actor.state_dict())
     self.critic = QNetwork(state_size, action_size, self.seed, fc1,
                            fc2).to(self.device)
     self.optimizer_q = torch.optim.Adam(self.critic.parameters(),
                                         config["lr_critic"])
     self.target_critic = QNetwork(state_size, action_size, self.seed, fc1,
                                   fc2).to(self.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(action_size),
                                           dimension=action_size)
     self.max_timesteps = config["max_episodes_steps"]
     self.noise.reset()
     self.episodes = config["episodes"]
     self.memory = ReplayBuffer((state_size, ), (action_size, ),
                                config["buffer_size"], self.seed,
                                self.device)
     pathname = str(config["seed"]) + str(dt_string)
     tensorboard_name = str(
         config["res_path"]) + '/runs/' + "DDPG" + str(pathname)
     self.writer = SummaryWriter(tensorboard_name)
     self.steps = 0
示例#5
0
    def __init__(self, state_size, action_size, config):
        self.env_name = config["env_name"]
        self.state_size = state_size
        self.action_size = action_size
        self.seed = config["seed"]
        self.clip = config["clip"]
        self.device = 'cuda'
        print("Clip ", self.clip)
        print("cuda ", torch.cuda.is_available())
        self.double_dqn = config["DDQN"]
        print("Use double dqn", self.double_dqn)
        self.lr_pre = config["lr_pre"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        print("self tau", self.tau)
        self.gamma = 0.99
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.fc3 = config["fc3_units"]
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2,self.fc3,  self.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
        
        self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.soft_update(self.q_shift_local, self.q_shift_target, 1)
         
        self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3,  self.seed).to(self.device)
        self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.soft_update(self.R_local, self.R_target, 1) 

        self.expert_q = DQNetwork(state_size, action_size, seed=self.seed).to(self.device)
        self.expert_q.load_state_dict(torch.load('checkpoint.pth'))
        self.memory = Memory(action_size, config["buffer_size"], self.batch_size, self.seed, self.device)
        self.t_step = 0
        self.steps = 0
        self.predicter = Classifier(state_size, action_size, self.seed).to(self.device)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
        pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_fc3_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.fc3, self.seed)
        pathname += "_clip_{}".format(config["clip"])
        pathname += "_tau_{}".format(config["tau"])
        now = datetime.now()    
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname += dt_string
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        print("summery writer ", tensorboard_name)
        self.average_prediction = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.all_actions = []
        for a in range(self.action_size):
            action = torch.Tensor(1) * 0 +  a
            self.all_actions.append(action.to(self.device))
 def __init__(self, action_size, state_size, config):
     self.seed = config["seed"]
     torch.manual_seed(self.seed)
     np.random.seed(seed=self.seed)
     self.env = gym.make(config["env_name"])
     self.env = FrameStack(self.env, config)
     self.env.seed(self.seed)
     self.action_size = action_size
     self.state_size = state_size
     self.tau = config["tau"]
     self.gamma = config["gamma"]
     self.batch_size = config["batch_size"]
     self.lr = config["lr"]
     self.history_length = config["history_length"]
     self.size = config["size"]
     if not torch.cuda.is_available():
         config["device"] == "cpu"
     self.device = config["device"]
     self.eval = config["eval"]
     self.vid_path = config["vid_path"]
     print("actions size ", action_size)
     self.critic = QNetwork(state_size, action_size, config["fc1_units"],
                            config["fc2_units"]).to(self.device)
     self.q_optim = torch.optim.Adam(self.critic.parameters(),
                                     config["lr_critic"])
     self.target_critic = QNetwork(state_size, action_size,
                                   config["fc1_units"],
                                   config["fc2_units"]).to(self.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
     self.alpha = self.log_alpha.exp()
     self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
     self.policy = SACActor(state_size, action_size).to(self.device)
     self.policy_optim = Adam(self.policy.parameters(),
                              lr=config["lr_policy"])
     self.encoder = Encoder(config).to(self.device)
     self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(),
                                               self.lr)
     self.episodes = config["episodes"]
     self.memory = ReplayBuffer((self.history_length, self.size, self.size),
                                (1, ), config["buffer_size"],
                                config["image_pad"], self.seed, self.device)
     pathname = config["seed"]
     tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
     self.writer = SummaryWriter(tensorboard_name)
     self.steps = 0
     self.target_entropy = -torch.prod(
         torch.Tensor(action_size).to(self.device)).item()
示例#7
0
def main(config: Config):
    print(config)

    # Let's run it!
    for i in range(config.num_experiments):
        experiment_seed = config.seed + i * config.num_episodes
        memory = ReplayMemory(config.replay_memory_size)

        # We will seed the algorithm (for reproducability).
        random.seed(experiment_seed)
        torch.manual_seed(experiment_seed)
        env.seed(experiment_seed)

        q_model = QNetwork(config.device, config.num_hidden_q_model)
        curiousity_model = StatePredictor(2, 3,
                                          config.num_hidden_curiosity_model,
                                          config.device)

        for i in range(20, 29):
            episode_durations, episode_loss = run_episodes(train,
                                                           q_model,
                                                           curiousity_model,
                                                           memory,
                                                           env,
                                                           experiment_seed,
                                                           config,
                                                           experiment_number=i)
        # print(i, episode_durations, episode_loss)
        print("Finished experiment {}/{}".format(i + 1,
                                                 config.num_experiments))
示例#8
0
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 lr,
                 hidden_1,
                 hidden_2,
                 update_every,
                 epsilon,
                 epsilon_min,
                 eps_decay,
                 seed
                 ):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.seed = random.seed(seed)
        self.learn_steps = 0
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.eps_decay = eps_decay

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
示例#9
0
    def _make_model(self, state_size, action_size, use_cnn):
        """
        Sets up the network model based on whether state data or pixel data is
        provided.
        """

        if use_cnn:
            return QCNNetwork(state_size, action_size,
                              self.seed).to(self.device)
        else:
            return QNetwork(state_size, action_size, self.seed).to(self.device)
示例#10
0
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network / Critic
        # Create the network, define the criterion and optimizer
        hidden_layers = [37, 37]
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers,
                                        seed).to(device)
        self.qnetwork_optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                             lr=LR_CRIT,
                                             weight_decay=WEIGHT_DECAY)

        # mu-Network / Actor
        # Create the network, define the criterion and optimizer
        hidden_layers = [33, 33]
        self.munetwork_local = ActorPolicy(state_size, action_size,
                                           hidden_layers, seed).to(device)
        self.munetwork_target = ActorPolicy(state_size, action_size,
                                            hidden_layers, seed).to(device)
        self.munetwork_optimizer = optim.Adam(
            self.munetwork_local.parameters(), lr=LR_ACTR)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
示例#11
0
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
示例#12
0
def run_dqn(env,
            num_episodes,
            memory_size,
            num_hidden,
            batch_size,
            discount_factor,
            learn_rate,
            update_target_q,
            max_steps,
            double_dqn=False):
    memory = ReplayMemory(memory_size)

    # continuous action space
    if isinstance(env.action_space, Box):
        dims = env.action_space.shape[0]
        n_out = SPLITS**dims
    # discrete action space
    else:
        n_out = env.action_space.n

    n_in = len(env.observation_space.low)
    model = QNetwork(n_in, n_out, num_hidden)
    target_net = QNetwork(n_in, n_out, num_hidden)

    episode_durations, q_vals, cum_reward = run_episodes(
        train=train,
        model=model,
        memory=memory,
        env=env,
        num_episodes=num_episodes,
        batch_size=batch_size,
        discount_factor=discount_factor,
        learn_rate=learn_rate,
        target_net=target_net,
        update_target_q=update_target_q,
        max_steps=max_steps,
        double_dqn=double_dqn)
    return model, episode_durations, q_vals, cum_reward
示例#13
0
 def __init__(self, action_size, state_size, config):
     self.action_size = action_size
     self.state_size = state_size
     self.min_action = config["min_action"]
     self.max_action = config["max_action"]
     self.seed = config["seed"]
     self.tau = config["tau"]
     self.gamma = config["gamma"]
     self.batch_size = config["batch_size"]
     if not torch.cuda.is_available():
         config["device"] == "cpu"
     self.device = config["device"]
     self.eval = config["eval"]
     torch.manual_seed(self.seed)
     np.random.seed(self.seed)
     self.vid_path = config["vid_path"]
     print("actions size ", action_size)
     print("actions min ", self.min_action)
     print("actions max ", self.max_action)
     self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
     self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
     self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
     self.alpha = self.log_alpha.exp()
     self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
     #self.policy = SACActor(state_size, action_size).to(self.device)
     self.policy = GaussianPolicy(state_size, action_size, 256).to(self.device)
     self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])
     self.max_timesteps = config["max_episodes_steps"]
     self.episodes = config["episodes"]
     self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.device)
     pathname = config["seed"]
     tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
     self.writer = SummaryWriter(tensorboard_name)
     self.steps= 0
     self.target_entropy = -torch.prod(torch.Tensor(action_size).to(self.device)).item()
    def __init__(self, state_size, action_size, config):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(config["seed"])
        self.seed = config["seed"]
        self.gamma = 0.99
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.device = config["device"]
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1,
                                       self.fc2, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1,
                                        self.fc2, self.seed).to(self.device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        self.encoder = Encoder(config).to(self.device)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(),
                                                  self.lr)

        # Replay memory

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
示例#15
0
 def __init__(self, state_size, action_size, action_dim, config):
     self.state_size = state_size
     self.action_size = action_size
     self.action_dim = action_dim
     self.seed = 0
     self.device = 'cuda'
     self.batch_size = config["batch_size"]
     self.lr = 0.005
     self.gamma = 0.99
     self.q_shift_local = QNetwork(state_size, action_size,
                                   self.seed).to(self.device)
     self.q_shift_target = QNetwork(state_size, action_size,
                                    self.seed).to(self.device)
     self.Q_local = QNetwork(state_size, action_size,
                             self.seed).to(self.device)
     self.Q_target = QNetwork(state_size, action_size,
                              self.seed).to(self.device)
     self.R_local = RNetwork(state_size, action_size,
                             self.seed).to(self.device)
     self.R_target = RNetwork(state_size, action_size,
                              self.seed).to(self.device)
     self.policy = PolicyNetwork(state_size, action_size,
                                 self.seed).to(self.device)
     self.predicter = Classifier(state_size, action_dim,
                                 self.seed).to(self.device)
     #self.criterion = nn.CrossEntropyLoss()
     # optimizer
     self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(),
                                         lr=self.lr)
     self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr)
     self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
     self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
     self.optimizer_pre = optim.Adam(self.predicter.parameters(),
                                     lr=self.lr)
     pathname = "lr {} batch_size {} seed {}".format(
         self.lr, self.batch_size, self.seed)
     tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
     self.writer = SummaryWriter(tensorboard_name)
     self.steps = 0
     self.ratio = 1. / action_dim
     self.all_actions = []
     for a in range(self.action_dim):
         action = torch.Tensor(1) * 0 + a
         self.all_actions.append(action.to(self.device))
class OldSACAgent:
    def __init__(self, env, render, config_info):
        self.env = env
        self.render = render
        self._reset_env()

        # Create run folder to store parameters, figures, and tensorboard logs
        self.path_runs = create_run_folder(config_info)

        # Extract training parameters from yaml config file
        param = load_training_parameters(config_info["config_param"])
        self.train_param = param["training"]

        # Define device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        # Define models
        hidden_size = param["model"]["hidden_size"]
        self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)
        self.v_net = VNetwork(state_dim, hidden_size).to(self.device)
        self.target_v_net = VNetwork(state_dim, hidden_size).to(self.device)
        self.target_v_net.load_state_dict(self.v_net.state_dict())
        self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to(
            self.device
        )

        # Define loss criterion
        self.q_criterion = nn.MSELoss()
        self.v_criterion = nn.MSELoss()

        # Define optimizers
        lr = float(param["optimizer"]["learning_rate"])
        self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr)
        self.v_opt = optim.Adam(self.v_net.parameters(), lr=lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(param["training"]["replay_size"])

        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

        # Useful variables
        self.batch_size = param["training"]["batch_size"]
        self.gamma = param["training"]["gamma"]
        self.tau = param["training"]["tau"]
        self.start_step = param["training"]["start_step"]
        self.max_timesteps = param["training"]["max_timesteps"]
        self.alpha = param["training"]["alpha"]

    def _reset_env(self):
        # Reset the environment and initialize episode reward
        self.state, self.done = self.env.reset(), False
        self.episode_reward = 0.0
        self.episode_step = 0

    def train(self):
        # Main training loop
        total_timestep = 0
        all_episode_rewards = []
        all_mean_rewards = []
        update = 0

        # Create tensorboard writer
        writer = SummaryWriter(log_dir=self.path_runs, comment="-sac")

        for episode in itertools.count(1, 1):
            self._reset_env()

            while not self.done:
                # trick to improve exploration at the start of training
                if self.start_step > total_timestep:
                    action = self.env.action_space.sample()  # Sample random action
                else:
                    action = self.policy_net.get_action(
                        self.state, self.device
                    )  # Sample action from policy

                # Fill the replay buffer up with transitions
                if len(self.replay_buffer) > self.batch_size:
                    batch = self.replay_buffer.sample_buffer(self.batch_size)

                    # Update parameters of all the networks
                    q_loss, v_loss, policy_loss = self.train_on_batch(batch)
                    writer.add_scalar("loss/q", q_loss, update)
                    writer.add_scalar("loss/v", v_loss, update)
                    writer.add_scalar("loss/policy", policy_loss, update)
                    update += 1

                if self.render:
                    self.env.render()

                # Perform one step in the environment
                next_state, reward, self.done, _ = self.env.step(action)
                total_timestep += 1
                self.episode_step += 1
                self.episode_reward += reward

                # Create a tuple for the new transition
                new_transition = self.transition(
                    self.state, action, reward, self.done, next_state
                )

                # Append transition to the replay buffer
                self.replay_buffer.store_transition(new_transition)

                self.state = next_state

            if total_timestep > self.max_timesteps:
                break

            mean_reward = np.mean(all_episode_rewards[-100:])
            all_episode_rewards.append(self.episode_reward)
            all_mean_rewards.append(mean_reward)

            print(
                "Episode n°{} ; total timestep [{}/{}] ; episode steps {} ; "
                "reward {} ; mean reward {}".format(
                    episode,
                    total_timestep,
                    self.max_timesteps,
                    self.episode_step,
                    round(self.episode_reward, 2),
                    round(mean_reward, 2),
                )
            )

            writer.add_scalar("reward", self.episode_reward, episode)
            writer.add_scalar("mean reward", mean_reward, episode)

        # Save networks' weights
        path_critic = os.path.join(self.path_runs, "critic.pth")
        path_actor = os.path.join(self.path_runs, "actor.pth")
        torch.save(self.q_net.state_dict(), path_critic)
        torch.save(self.policy_net.state_dict(), path_actor)

        # Plot reward
        self.plot_reward(all_episode_rewards, all_mean_rewards)

        # Close all
        writer.close()
        self.env.close()

    def train_on_batch(self, batch_samples):
        # Unpack batch_size of transitions randomly drawn from the replay buffer
        (
            state_batch,
            action_batch,
            reward_batch,
            done_int_batch,
            next_state_batch,
        ) = batch_samples

        # Transform np arrays into tensors and send them to device
        state_batch = torch.tensor(state_batch).to(self.device)
        next_state_batch = torch.tensor(next_state_batch).to(self.device)
        action_batch = torch.tensor(action_batch).to(self.device)
        reward_batch = torch.tensor(reward_batch).unsqueeze(1).to(self.device)
        done_int_batch = torch.tensor(done_int_batch).unsqueeze(1).to(self.device)

        q_value, _ = self.q_net(state_batch, action_batch)
        value = self.v_net(state_batch)
        pi, log_pi = self.policy_net.sample(state_batch)

        ### Update Q
        target_next_value = self.target_v_net(next_state_batch)
        next_q_value = (
            reward_batch + (1 - done_int_batch) * self.gamma * target_next_value
        )

        q_loss = self.q_criterion(q_value, next_q_value.detach())

        ### Update V
        q_pi, _ = self.q_net(state_batch, pi)
        next_value = q_pi - log_pi
        v_loss = self.v_criterion(value, next_value.detach())

        ### Update policy
        log_pi_target = q_pi - value
        policy_loss = (log_pi * (log_pi - log_pi_target).detach()).mean()

        # Losses and optimizers
        self.q_opt.zero_grad()
        q_loss.backward()
        self.q_opt.step()

        self.v_opt.zero_grad()
        v_loss.backward()
        self.v_opt.step()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        self.policy_opt.step()

        soft_update(self.target_v_net, self.v_net, self.tau)

        return q_loss.item(), v_loss.item(), policy_loss.item()

    def plot_reward(self, data, mean_data):
        plt.plot(data, label="reward")
        plt.plot(mean_data, label="mean reward")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title(f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment")
        plt.tight_layout()
        plt.legend()

        path_fig = os.path.join(self.path_runs, "figure.png")
        plt.savefig(path_fig)
        print(f"Figure saved to {path_fig}")

        plt.show()
示例#17
0
class TD3():
    def __init__(self, action_size, state_size, config):
        self.seed = config["seed"]
        print("TD3 seed", self.seed)
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        random.seed(self.seed)
        self.env = gym.make(config["env_name"])
        self.env.seed(self.seed)
        now = datetime.now()
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        self.env.action_space.seed(self.seed)
        self.action_size = action_size
        self.state_size = state_size
        self.min_action = config["min_action"]
        self.max_action = config["max_action"]
        self.seed = config["seed"]
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        if not torch.cuda.is_available():
            config["device"] == "cpu"
        self.device = config["device"]
        self.eval = config["eval"]
        self.vid_path = config["vid_path"]
        print("actions size ", action_size)
        print("actions min ", self.min_action)
        print("actions max ", self.max_action)
        fc1 = config["fc1_units"]
        fc2 = config["fc2_units"]
        self.actor = Actor(state_size, action_size, self.seed, fc1, fc2).to(self.device)
        self.optimizer_a = torch.optim.Adam(self.actor.parameters(), config["lr_actor"])
        self.target_actor = Actor(state_size, action_size, self.seed, fc1, fc2).to(self.device)
        self.target_actor.load_state_dict(self.actor.state_dict()) 
        self.critic = QNetwork(state_size, action_size, self.seed, fc1, fc2).to(self.device)
        self.optimizer_q = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
        self.target_critic = QNetwork(state_size, action_size, self.seed, fc1, fc2).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.max_timesteps = config["max_episodes_steps"]
        self.episodes = config["episodes"]
        self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.seed, self.device)
        pathname = str(config["seed"]) + str(dt_string)
        tensorboard_name = str(config["res_path"]) + '/runs/'+ "TD3" + str(pathname)
        self.writer = SummaryWriter(tensorboard_name)
        self.steps= 0
        self.actor_freq = config["actor_freq"]
        self.policy_noise = config["policy_noise"]
        self.noise_clip = config["noise_clip"]
        self.expl_noise = config["exp_noise"]

    def act(self, state):
        state  = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        action = self.actor(state.unsqueeze(0))
        actions = action.detach().cpu().numpy()[0] 
        actions = actions + + np.random.normal(0, self.max_action * self.expl_noise, size=self.action_size)
        actions = np.clip(actions, self.min_action, self.max_action)
        return actions
    
    def act_greedy(self, state):
        state  = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        action = self.actor(state.unsqueeze(0))
        actions = action.detach().cpu().numpy()[0]
        actions = np.clip(actions, self.min_action, self.max_action)
        return actions
    
    def train_agent(self):
        average_reward = 0
        scores_window = deque(maxlen=100)
        s = 0
        t0 = time.time()
        for i_epiosde in range(self.episodes):
            episode_reward = 0
            state = self.env.reset()
            for t in range(self.max_timesteps):
                s += 1
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward
                if i_epiosde > 10:
                    self.learn()
                self.memory.add(state, reward, action, next_state, done)
                state = next_state
                if done:
                    scores_window.append(episode_reward)
                    break
            if i_epiosde % self.eval == 0:
                self.eval_policy()
            ave_reward = np.mean(scores_window)
            print("Epiosde {} Steps {} Reward {} Reward averge{} Time {}".format(i_epiosde, t, episode_reward, np.mean(scores_window), time_format(time.time() - t0)))
            self.writer.add_scalar('Aver_reward', ave_reward, self.steps)
            self.writer.add_scalar('steps_in_episode', t, self.steps)
            
    
    def learn(self):
        self.steps += 1
        states, rewards, actions, next_states, dones = self.memory.sample(self.batch_size)
        with torch.no_grad():
            next_action = self.target_actor(next_states)
            noise = (torch.randn_like(actions) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
            print(noise)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
            q1_target, q2_target = self.target_critic(next_states, next_action)
            q_target = torch.min(q1_target, q2_target)
            q_target = rewards + (self.gamma * q_target * (1 - dones))
        q_pre1, q_pre2 = self.critic(states, actions)
        loss = F.mse_loss(q_pre1, q_target) + F.mse_loss(q_pre2, q_target)
        self.writer.add_scalar('Q_loss', loss, self.steps)
        self.optimizer_q.zero_grad()
        loss.backward()
        self.optimizer_q.step()
        # delay actor update
        if self.steps % self.actor_freq == 0:
            #-------------------------------update-actor-------------------------------------------------
            actor_actions  = self.actor(states)
            q_values = self.critic.Q1(states, actor_actions)
            loss_actor = -q_values.mean()
            self.optimizer_a.zero_grad()
            loss_actor.backward()
            self.writer.add_scalar('Actor_loss', loss_actor, self.steps)
            self.optimizer_a.step()
            #-------------------------------update-networks-------------------------------------------------
            self.soft_udapte(self.critic, self.target_critic)
            self.soft_udapte(self.actor, self.target_actor)
    
    
    def soft_udapte(self, online, target):
        for param, target_parm in zip(online.parameters(), target.parameters()):
            target_parm.data.copy_(self.tau * param.data + (1 - self.tau) * target_parm.data)


    def eval_policy(self, eval_episodes=4):
        env  = wrappers.Monitor(self.env, str(self.vid_path) + "/{}".format(self.steps), video_callable=lambda episode_id: True,force=True)
        average_reward = 0
        scores_window = deque(maxlen=100)
        for i_epiosde in range(eval_episodes):
            print("Eval Episode {} of {} ".format(i_epiosde, self.episodes))
            episode_reward = 0
            state = env.reset()
            while True: 
                action = self.act_greedy(state)
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    scores_window.append(episode_reward)
                    break
        average_reward = np.mean(scores_window)
        self.writer.add_scalar('Eval_reward', average_reward, self.steps)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, memory=None):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network / Critic
        # Create the network, define the criterion and optimizer
        hidden_layers = [256, 128]
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
        self.qnetwork_optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR_CRIT, weight_decay=WEIGHT_DECAY)
        
        # mu-Network / Actor
        # Create the network, define the criterion and optimizer
        hidden_layers = [256, 128]
        self.munetwork_local = ActorPolicy(state_size, action_size, hidden_layers, seed).to(device)
        self.munetwork_target = ActorPolicy(state_size, action_size, hidden_layers, seed).to(device)
        self.munetwork_optimizer = optim.Adam(self.munetwork_local.parameters(), lr=LR_ACTR)
        
        # Noise process
        self.noise = OUNoise(action_size, seed, mu=0., theta=1.0, sigma=0.7)
        
        # Replay memory
        if memory is None:
            self.memory = ReplayBuffer(action_size)
        else:
            self.memory = memory
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            #if len(self.memory) > self.memory.buffer_size:
            if len(self.memory) >= self.memory.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, beta = 1.0, add_noise=True):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            add_noise (boolean): add noise for exploration
        """
        state = torch.from_numpy(state).float().to(device)
        self.munetwork_local.eval()
        with torch.no_grad():
            actions = self.munetwork_local(state).cpu().data.numpy()
            #actions = np.zeros(2)
        self.munetwork_local.train()
        if add_noise:
            actions += beta*self.noise.sample()
        return np.clip(actions, -1, 1)
    
    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) = mu(state) -> action
            critic_target(state, action) = Q(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences 

        # ---------------------------- update critic ---------------------------- #
        for _ in range(GD_EPOCH):
            # Get predicted next-state actions and Q values from target models
            actions_next = self.munetwork_target(next_states)
            Q_targets_next = self.qnetwork_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
            # Compute critic loss
            Q_expected = self.qnetwork_local(states, actions)
            critic_loss = F.mse_loss(Q_expected, Q_targets)
            # Minimize the loss
            self.qnetwork_optimizer.zero_grad()
            torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), CLIP_GRAD)
            critic_loss.backward()
            self.qnetwork_optimizer.step()
            del critic_loss

        # ---------------------------- update actor ---------------------------- #
        for _ in range(GD_EPOCH):
            actions_pred = self.munetwork_local(states)
            # Compute actor loss
            actor_loss = -self.qnetwork_local(states, actions_pred).mean()
            # Minimize the loss
            self.munetwork_optimizer.zero_grad()
            torch.nn.utils.clip_grad_norm_(self.munetwork_local.parameters(), CLIP_GRAD)
            actor_loss.backward()
            self.munetwork_optimizer.step()
            del actor_loss

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.munetwork_local, self.munetwork_target, TAU)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)      
        
        
    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
示例#19
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, algorithm='DQN'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # set algorithm
        if algorithm == "DQN":
            self.learn = self.learnDQN
        elif algorithm == "DDQN":
            self.learn = self.learnDDQN
        else:
            raise ('algorithm {} not implemented'.format(algorithm))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learnDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ## compute and minimize the loss
        self.optimizer.zero_grad()

        # target (Qsa_next)
        Qsa_next = torch.max(self.qnetwork_target(next_states),
                             dim=1,
                             keepdim=True)[0]
        targets = rewards + gamma * Qsa_next * (1 - dones)

        # output (Qsa)
        action_values = self.qnetwork_local(states)
        outputs = action_values.gather(1, actions)

        loss = F.mse_loss(outputs, targets)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def learnDDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        self.optimizer.zero_grad()

        # target (Qsa_next)
        next_actions = torch.argmax(self.qnetwork_local(next_states),
                                    dim=1,
                                    keepdim=True)
        Qsa_next = self.qnetwork_target(next_states).gather(1, next_actions)
        targets = rewards + gamma * Qsa_next * (1 - dones)

        # output (Qsa)
        action_values = self.qnetwork_local(states)
        outputs = action_values.gather(1, actions)

        loss = F.mse_loss(outputs, targets)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
示例#20
0
def _test_policy(state):
    model = QNetwork(device="cuda")
    action = model(state.to(model.device))
    return action
示例#21
0
class DQNAgent:

    MAX_EXPERIENCES = 20000

    MIN_EXPERIENCES = 512

    BATCH_SIZE = 16

    def __init__(self,
                 env,
                 gamma=0.95,
                 epsilon=1.0,
                 copy_period=1000,
                 lr=0.01,
                 update_period=2):
        """
            gammma: 割引率
            epsilon: 探索と活用の割合
        """

        self.env = env

        self.gamma = gamma

        self.epsion = epsilon

        self.copy_period = copy_period

        self.update_period = update_period

        self.lr = lr

        self.global_steps = 0

        self.q_network = QNetwork(self.env.action_space.n, lr=lr)

        self.q_network.build(input_shape=(None, 4))

        self.target_network = QNetwork(self.env.action_space.n)

        self.target_network.build(input_shape=(None, 4))

        self.experiences = collections.deque(maxlen=self.MAX_EXPERIENCES)

    def play(self, episodes):

        total_rewards = []

        for n in range(episodes):

            self.epsilon = 1.0 - min(0.95, self.global_steps * 0.95 / 500)

            total_reward = self.play_episode()

            total_rewards.append(total_reward)

            print(f"Episode {n}: {total_reward}")
            print(f"Current experiences {len(self.experiences)}")
            print(f"Current epsilon {self.epsilon}")
            print()

        return total_rewards

    def play_episode(self):

        total_reward = 0

        steps = 0

        done = False

        state = self.env.reset()

        while not done:

            action = self.sample_action(state)

            next_state, reward, done, info = self.env.step(action)

            total_reward += reward

            exp = Experience(state, action, reward, next_state, done)

            self.experiences.append(exp)

            state = next_state

            steps += 1

            self.global_steps += 1

            if self.global_steps % self.update_period == 0:
                self.update_qnetwork()

            if self.global_steps % self.copy_period == 0:
                self.target_network.set_weights(self.q_network.get_weights())

        return total_reward

    def sample_action(self, state):
        """探索と活用"""

        if np.random.random() < self.epsilon:
            random_action = np.random.choice(self.env.action_space.n)
            return random_action
        else:
            selected_action = np.argmax(self.q_network.predict(state))
            return selected_action

    def update_qnetwork(self):
        """ Q-Networkの訓練
            ただしExperiencesが規定数に達していないうちは何もしない
        """
        if len(self.experiences) < self.MIN_EXPERIENCES:
            return

        (states, actions, rewards, next_states,
         dones) = self.get_minibatch(self.BATCH_SIZE)

        next_Qs = np.max(self.target_network.predict(next_states), axis=1)

        target_values = [
            reward + self.gamma * next_q if not done else reward
            for reward, next_q, done in zip(rewards, next_Qs, dones)
        ]

        self.q_network.update(np.array(states), np.array(actions),
                              np.array(target_values))

    def get_minibatch(self, batch_size):
        """Experience Replay mechanism
        """
        indices = np.random.choice(len(self.experiences),
                                   size=batch_size,
                                   replace=False)

        selected_experiences = [self.experiences[i] for i in indices]

        states = [exp.state for exp in selected_experiences]
        actions = [exp.action for exp in selected_experiences]
        rewards = [exp.reward for exp in selected_experiences]
        next_states = [exp.next_state for exp in selected_experiences]
        dones = [exp.done for exp in selected_experiences]

        return (states, actions, rewards, next_states, dones)
class Agent():
    def __init__(self, state_size, action_size, config):
        self.seed = config["seed"]
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        random.seed(self.seed)
        env = gym.make(config["env_name"])
        self.env = FrameStack(env, config)
        self.env.seed(self.seed)
        self.state_size = state_size
        self.action_size = action_size
        self.clip = config["clip"]
        self.device = 'cuda'
        self.double_dqn = config["DDQN"]
        self.lr_pre = config["lr_pre"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        self.gamma = 0.99
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1,
                                       self.fc2, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1,
                                        self.fc2, self.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
        self.q_shift_local = QNetwork(state_size, action_size, self.fc1,
                                      self.fc2, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.fc1,
                                       self.fc2, self.seed).to(self.device)
        self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(),
                                          lr=self.lr)
        self.soft_update(self.q_shift_local, self.q_shift_target, 1)
        self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2,
                                self.seed).to(self.device)
        self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2,
                                 self.seed).to(self.device)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.soft_update(self.R_local, self.R_target, 1)
        self.steps = 0
        self.predicter = QNetwork(state_size, action_size, self.fc1, self.fc2,
                                  self.seed).to(self.device)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(),
                                        lr=self.lr_pre)
        #self.encoder_freq = Encoder(config).to(self.device)
        #self.encoder_optimizer_frq = torch.optim.Adam(self.encoder_freq.parameters(), self.lr)
        self.encoder = Encoder(config).to(self.device)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(),
                                                  self.lr)
        pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(
            self.lr, self.batch_size, self.fc1, self.fc2, self.seed)
        pathname += "_clip_{}".format(config["clip"])
        pathname += "_tau_{}".format(config["tau"])
        now = datetime.now()
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname += dt_string
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.vid_path = str(config["locexp"]) + '/vid'
        self.writer = SummaryWriter(tensorboard_name)
        self.average_prediction = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.all_actions = []
        for a in range(self.action_size):
            action = torch.Tensor(1) * 0 + a
            self.all_actions.append(action.to(self.device))

    def learn(self, memory_ex):
        logging.debug(
            "--------------------------New update-----------------------------------------------"
        )
        self.steps += 1
        states, next_states, actions, dones = memory_ex.expert_policy(
            self.batch_size)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        next_states = next_states.type(torch.float32).div_(255)
        next_states = self.encoder.create_vector(next_states)
        self.state_action_frq(states, actions)
        actions = torch.randint(0,
                                3, (self.batch_size, 1),
                                dtype=torch.int64,
                                device=self.device)
        self.compute_shift_function(states.detach(), next_states, actions,
                                    dones)
        self.compute_r_function(states.detach(), actions)
        self.compute_q_function(states.detach(), next_states, actions, dones)
        self.soft_update(self.R_local, self.R_target, self.tau)
        self.soft_update(self.q_shift_local, self.q_shift_target, self.tau)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)
        return

    def compute_q_function(self, states, next_states, actions, dones):
        """Update value parameters using given batch of experience tuples. """
        actions = actions.type(torch.int64)
        # Get max predicted Q values (for next states) from target model
        if self.double_dqn:
            q_values = self.qnetwork_local(next_states).detach()
            _, best_action = q_values.max(1)
            best_action = best_action.unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach()
            Q_targets_next = Q_targets_next.gather(1, best_action)
        else:
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
        # Compute Q targets for current states
        # Get expected Q values from local model
        # Compute loss
        rewards = self.R_target(states).detach().gather(
            1, actions.detach()).squeeze(0)
        Q_targets = rewards + (self.gamma * Q_targets_next * (dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Get max predicted Q values (for next states) from target model
        self.writer.add_scalar('Q_loss', loss, self.steps)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.encoder_optimizer.step()
        # torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1)
        self.optimizer.step()

    def compute_shift_function(self, states, next_states, actions, dones):
        """Update Q shift parameters using given batch of experience tuples  """
        actions = actions.type(torch.int64)
        with torch.no_grad():
            # Get max predicted Q values (for next states) from target model
            if self.double_dqn:
                q_shift = self.q_shift_local(next_states)
                max_q, max_actions = q_shift.max(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(
                    1, max_actions.unsqueeze(1))
            else:
                Q_targets_next = self.qnetwork_target(
                    next_states).detach().max(1)[0].unsqueeze(1)
            # Compute Q targets for current states
            Q_targets = self.gamma * Q_targets_next
        # Get expected Q values from local model
        Q_expected = self.q_shift_local(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.optimizer_shift.zero_grad()
        loss.backward()
        self.writer.add_scalar('Shift_loss', loss, self.steps)
        self.optimizer_shift.step()

    def compute_r_function(self, states, actions, debug=False, log=False):
        """ compute reward for the state action pair """
        actions = actions.type(torch.int64)
        # sum all other actions
        size = states.shape[0]
        idx = 0
        all_zeros = [1 for i in range(actions.shape[0])]
        zeros = False
        y_shift = self.q_shift_target(states).gather(1, actions).detach()
        log_a = self.get_action_prob(states, actions).detach()
        y_r_part1 = log_a - y_shift
        y_r_part2 = torch.empty((size, 1), dtype=torch.float32).to(self.device)
        for a, s in zip(actions, states):
            y_h = 0
            taken_actions = 0
            for b in self.all_actions:
                b = b.type(torch.int64).unsqueeze(1)
                n_b = self.get_action_prob(s.unsqueeze(0), b)
                if torch.eq(a, b) or n_b is None:
                    continue
                taken_actions += 1
                y_s = self.q_shift_target(s.unsqueeze(0)).detach().gather(
                    1, b).item()
                n_b = n_b.data.item() - y_s
                r_hat = self.R_target(s.unsqueeze(0)).gather(1, b).item()
                y_h += (r_hat - n_b)
                if log:
                    text = "a {} r _hat {:.2f} - n_b  {:.2f} | sh {:.2f} ".format(
                        b.item(), r_hat, n_b, y_s)
                    logging.debug(text)
            if taken_actions == 0:
                all_zeros[idx] = 0
                zeros = True
                y_r_part2[idx] = 0.0
            else:
                y_r_part2[idx] = (1. / taken_actions) * y_h
            idx += 1
            y_r = y_r_part1 + y_r_part2
        # check if there are zeros (no update for this tuble) remove them from states and
        if zeros:
            mask = torch.BoolTensor(all_zeros)
            states = states[mask]
            actions = actions[mask]
            y_r = y_r[mask]
        y = self.R_local(states).gather(1, actions)
        if log:
            text = "Action {:.2f} r target {:.2f} =  n_a {:.2f} + n_b {:.2f}  y {:.2f}".format(
                actions[0].item(), y_r[0].item(), y_r_part1[0].item(),
                y_r_part2[0].item(), y[0].item())
            logging.debug(text)
        r_loss = F.mse_loss(y, y_r.detach())
        # Minimize the loss
        self.optimizer_r.zero_grad()
        r_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.R_local.parameters(), 5)
        self.optimizer_r.step()
        self.writer.add_scalar('Reward_loss', r_loss, self.steps)

    def get_action_prob(self, states, actions):
        """ compute prob for state action pair """
        actions = actions.type(torch.long)
        # check if action prob is zero
        output = self.predicter(states)
        output = F.softmax(output, dim=1)
        action_prob = output.gather(1, actions)
        action_prob = action_prob + torch.finfo(torch.float32).eps
        # check if one action if its to small
        if action_prob.shape[0] == 1:
            if action_prob.cpu().detach().numpy()[0][0] < 1e-4:
                return None
        action_prob = torch.log(action_prob)
        action_prob = torch.clamp(action_prob, min=self.clip, max=0)
        return action_prob

    def state_action_frq(self, states, action):
        """ Train classifer to compute state action freq """
        self.predicter.train()
        output = self.predicter(states)
        output = output.squeeze(0)
        y = action.type(torch.long).squeeze(1)
        loss = nn.CrossEntropyLoss()(output, y)
        self.optimizer_pre.zero_grad()
        self.encoder_optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.predicter.parameters(), 1)
        self.optimizer_pre.step()
        self.writer.add_scalar('Predict_loss', loss, self.steps)
        self.predicter.eval()

    def test_predicter(self, memory):
        """ Test the classifier """
        self.predicter.eval()
        same_state_predition = 0
        for i in range(memory.idx):
            states = memory.obses[i]
            actions = memory.actions[i]
            states = torch.as_tensor(states, device=self.device).unsqueeze(0)
            states = states.type(torch.float32).div_(255)
            states = self.encoder.create_vector(states)
            actions = torch.as_tensor(actions, device=self.device)
            output = self.predicter(states)
            output = F.softmax(output, dim=1)
            # create one hot encode y from actions
            y = actions.type(torch.long).item()
            p = torch.argmax(output.data).item()
            if y == p:
                same_state_predition += 1
        text = "Same prediction {} of {} ".format(same_state_predition,
                                                  memory.idx)
        print(text)
        logging.debug(text)

    def soft_update(self, local_model, target_model, tau=4):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        # print("use tau", tau)
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def load(self, filename):
        self.predicter.load_state_dict(torch.load(filename + "_predicter.pth"))
        self.optimizer_pre.load_state_dict(
            torch.load(filename + "_predicter_optimizer.pth"))
        self.R_local.load_state_dict(torch.load(filename + "_r_net.pth"))
        self.qnetwork_local.load_state_dict(torch.load(filename +
                                                       "_q_net.pth"))
        print("Load models to {}".format(filename))

    def save(self, filename):
        """
        """
        mkdir("", filename)
        torch.save(self.predicter.state_dict(), filename + "_predicter.pth")
        torch.save(self.optimizer_pre.state_dict(),
                   filename + "_predicter_optimizer.pth")
        torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth")
        torch.save(self.optimizer.state_dict(),
                   filename + "_q_net_optimizer.pth")
        torch.save(self.R_local.state_dict(), filename + "_r_net.pth")
        torch.save(self.q_shift_local.state_dict(),
                   filename + "_q_shift_net.pth")
        print("save models to {}".format(filename))

    def test_q_value(self, memory):
        test_elements = memory.idx
        test_elements = 100
        all_diff = 0
        error = True
        used_elements_r = 0
        used_elements_q = 0
        r_error = 0
        q_error = 0
        for i in range(test_elements):
            states = memory.obses[i]
            actions = memory.actions[i]
            states = torch.as_tensor(states, device=self.device).unsqueeze(0)
            states = states.type(torch.float32).div_(255)
            states = self.encoder.create_vector(states)
            actions = torch.as_tensor(actions, device=self.device)
            one_hot = torch.Tensor([0 for i in range(self.action_size)],
                                   device="cpu")
            one_hot[actions.item()] = 1
            with torch.no_grad():
                r_values = self.R_local(states.detach()).detach()
                q_values = self.qnetwork_local(states.detach()).detach()
                soft_r = F.softmax(r_values, dim=1).to("cpu")
                soft_q = F.softmax(q_values, dim=1).to("cpu")
                actions = actions.type(torch.int64)
                kl_q = F.kl_div(soft_q.log(), one_hot, None, None, 'sum')
                kl_r = F.kl_div(soft_r.log(), one_hot, None, None, 'sum')
                if kl_r == float("inf"):
                    pass
                else:
                    r_error += kl_r
                    used_elements_r += 1
                if kl_q == float("inf"):
                    pass
                else:
                    q_error += kl_q
                    used_elements_q += 1

        average_q_kl = q_error / used_elements_q
        average_r_kl = r_error / used_elements_r
        text = "Kl div of Reward {} of {} elements".format(
            average_q_kl, used_elements_r)
        print(text)
        text = "Kl div of Q_values {} of {} elements".format(
            average_r_kl, used_elements_q)
        print(text)
        self.writer.add_scalar('KL_reward', average_r_kl, self.steps)
        self.writer.add_scalar('KL_q_values', average_q_kl, self.steps)

    def act(self, states):
        states = torch.as_tensor(states, device=self.device).unsqueeze(0)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        q_values = self.qnetwork_local(states.detach()).detach()
        action = torch.argmax(q_values).item()
        return action

    def eval_policy(self, record=False, eval_episodes=2):
        if record:
            env = wrappers.Monitor(self.env,
                                   str(self.vid_path) +
                                   "/{}".format(self.steps),
                                   video_callable=lambda episode_id: True,
                                   force=True)
        else:
            env = self.env
        average_reward = 0
        scores_window = deque(maxlen=100)
        s = 0
        for i_epiosde in range(eval_episodes):
            episode_reward = 0
            state = env.reset()
            while True:
                s += 1
                action = self.act(state)
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    break
            scores_window.append(episode_reward)
        if record:
            return
        average_reward = np.mean(scores_window)
        print("Eval Episode {}  average Reward {} ".format(
            eval_episodes, average_reward))
        self.writer.add_scalar('Eval_reward', average_reward, self.steps)
示例#23
0
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 lr,
                 lr_decay,
                 update_every,
                 update_mem_every,
                 update_mem_par_every,
                 experience_per_sampling,
                 seed,
                 epsilon,
                 epsilon_min,
                 eps_decay,
                 compute_weights,
                 hidden_1,
                 hidden_2,
                 ):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_decay = lr_decay
        self.update_every = update_every
        self.experience_per_sampling  = experience_per_sampling
        self.update_mem_every = update_mem_every
        self.update_mem_par_every = update_mem_par_every
        self.seed = random.seed(seed)
        self.epsilon= epsilon
        self.epsilon_min = epsilon_min
        self.eps_decay = eps_decay
        self.compute_weights = compute_weights
        self.hidden_1 = hidden_1
        self.hidden_2 = hidden_2
        self.learn_steps = 0
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.eps_decay = eps_decay
        self.compute_weights = compute_weights


        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.scheduler = StepLR(self.optimizer, step_size=1, gamma=self.lr_decay)


        # Replay memory
        self.memory = PrioritizedReplayBuffer(
                    self.action_size,
                    self.buffer_size,
                    self.batch_size,
                    self.experience_per_sampling,
                    self.seed,
                    self.compute_weights)
        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0
示例#24
0
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        print(device)
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ##compute and minimize the loss
        state_action_values = self.q_value(states, actions)

        next_state_action_values = self.max_q_value(next_states)
        expected_state_action_values = (next_state_action_values * gamma *
                                        (1 - dones)) + rewards

        loss = F.mse_loss(state_action_values, expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def q_value(self, state, action):
        q_values = self.qnetwork_local(state)
        state_action_value = q_values.gather(1, action)
        return state_action_value

    def max_q_value(self, state):
        max_state_action_value = self.qnetwork_target(state).max(1)[0].detach()
        return max_state_action_value.unsqueeze(1)

    def save(self):
        print("Model save as chechpint.pth")
        torch.save(self.qnetwork_local.state_dict(), 'checkpoint.pth')
示例#25
0
    def __init__(self, state_size, action_size, config):
        self.seed = config["seed"]
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        random.seed(self.seed)
        self.env = gym.make(config["env_name"])
        self.env.seed(self.seed)
        self.state_size = state_size
        self.action_size = action_size
        self.clip = config["clip"]
        self.device = 'cuda'
        print("Clip ", self.clip)
        print("cuda ", torch.cuda.is_available())
        self.double_dqn = config["DDQN"]
        print("Use double dqn", self.double_dqn)
        self.lr_pre = config["lr_pre"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        print("self tau", self.tau)
        self.gamma = 0.99
        self.target_entropy = -torch.prod(torch.Tensor(action_size).to(self.device)).item()
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = optim.Adam([self.log_alpha], lr=config["lr_alpha"])
        self.policy = SACActor(state_size, action_size, self.seed).to(self.device)
        self.policy_optim = optim.Adam(self.policy.parameters(), lr=config["lr_policy"])
        
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
        
        self.q_shift_local = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
        self.q_shift_target = SQNetwork(state_size, action_size,self.seed, self.fc1, self.fc2).to(self.device)
        self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.soft_update(self.q_shift_local, self.q_shift_target, 1)
         
        self.R_local = SQNetwork(state_size, action_size, self.seed,  self.fc1, self.fc2).to(self.device)
        self.R_target = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.soft_update(self.R_local, self.R_target, 1) 

        self.steps = 0
        self.predicter = Classifier(state_size, action_size, self.seed, 256, 256).to(self.device)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
        pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.seed)
        pathname += "_clip_{}".format(config["clip"])
        pathname += "_tau_{}".format(config["tau"])
        now = datetime.now()    
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname += dt_string
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.vid_path = str(config["locexp"]) + '/vid'
        self.writer = SummaryWriter(tensorboard_name)
        print("summery writer ", tensorboard_name)
        self.average_prediction = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.all_actions = []
        for a in range(self.action_size):
            action = torch.Tensor(1) * 0 +  a
            self.all_actions.append(action.to(self.device))
示例#26
0
class DQNAgent():
    """Interacts with and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 lr,
                 hidden_1,
                 hidden_2,
                 update_every,
                 epsilon,
                 epsilon_min,
                 eps_decay,
                 seed
                 ):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.seed = random.seed(seed)
        self.learn_steps = 0
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.eps_decay = eps_decay

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0


    def step(self, state, action, reward, next_state,  done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Sample if enough samples are available
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state):
        """Returns actions for given state as per current policy.
        Params
        ======
            state (array_like): current state
        """
        self.epsilon = max(self.epsilon*self.eps_decay, self.epsilon_min)

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > self.epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.learn_steps += 1

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
示例#27
0
class Agent():
    def __init__(self, state_size, action_size, config):
        self.env_name = config["env_name"]
        self.state_size = state_size
        self.action_size = action_size
        self.seed = config["seed"]
        self.clip = config["clip"]
        self.device = 'cuda'
        print("Clip ", self.clip)
        print("cuda ", torch.cuda.is_available())
        self.double_dqn = config["DDQN"]
        print("Use double dqn", self.double_dqn)
        self.lr_pre = config["lr_pre"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        print("self tau", self.tau)
        self.gamma = 0.99
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.fc3 = config["fc3_units"]
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2,self.fc3,  self.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
        
        self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.soft_update(self.q_shift_local, self.q_shift_target, 1)
         
        self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3,  self.seed).to(self.device)
        self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.soft_update(self.R_local, self.R_target, 1) 

        self.expert_q = DQNetwork(state_size, action_size, seed=self.seed).to(self.device)
        self.expert_q.load_state_dict(torch.load('checkpoint.pth'))
        self.memory = Memory(action_size, config["buffer_size"], self.batch_size, self.seed, self.device)
        self.t_step = 0
        self.steps = 0
        self.predicter = Classifier(state_size, action_size, self.seed).to(self.device)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
        pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_fc3_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.fc3, self.seed)
        pathname += "_clip_{}".format(config["clip"])
        pathname += "_tau_{}".format(config["tau"])
        now = datetime.now()    
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname += dt_string
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        print("summery writer ", tensorboard_name)
        self.average_prediction = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.all_actions = []
        for a in range(self.action_size):
            action = torch.Tensor(1) * 0 +  a
            self.all_actions.append(action.to(self.device))
    
    
    def learn(self, memory):
        logging.debug("--------------------------New episode-----------------------------------------------")
        states, next_states, actions, dones = memory.expert_policy(self.batch_size)
        self.steps += 1
        self.state_action_frq(states, actions)
        self.compute_shift_function(states, next_states, actions, dones)
        for i in range(1):
            for a in range(self.action_size):
                action =  torch.ones([self.batch_size, 1], device= self.device) * a
                self.compute_r_function(states, action)

        self.compute_q_function(states, next_states, actions, dones)
        self.soft_update(self.q_shift_local, self.q_shift_target, self.tau)
        self.soft_update(self.R_local, self.R_target, self.tau)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)
        return
    
    def learn_predicter(self, memory):
        """

        """
        states, next_states, actions, dones = memory.expert_policy(self.batch_size)
        self.state_action_frq(states, actions)
    
    def state_action_frq(self, states, action):
        """ Train classifer to compute state action freq
        """ 
        self.predicter.train()
        output = self.predicter(states, train=True)
        output = output.squeeze(0)
        # logging.debug("out predicter {})".format(output))

        y = action.type(torch.long).squeeze(1)
        #print("y shape", y.shape)
        loss = nn.CrossEntropyLoss()(output, y)
        self.optimizer_pre.zero_grad()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.predicter.parameters(), 1)
        self.optimizer_pre.step()
        self.writer.add_scalar('Predict_loss', loss, self.steps)
        self.predicter.eval()

    def test_predicter(self, memory):
        """

        """
        self.predicter.eval()
        same_state_predition = 0
        for i in range(memory.idx):
            states = memory.obses[i]
            actions = memory.actions[i]
        
            states = torch.as_tensor(states, device=self.device).unsqueeze(0)
            actions = torch.as_tensor(actions, device=self.device)
            output = self.predicter(states)   
            output = F.softmax(output, dim=1)
            # create one hot encode y from actions
            y = actions.type(torch.long).item()
            p =torch.argmax(output.data).item()
            if y==p:
                same_state_predition += 1

        
        #self.average_prediction.append(same_state_predition)
        #average_pred = np.mean(self.average_prediction)
        #self.writer.add_scalar('Average prediction acc', average_pred, self.steps)
        #logging.debug("Same prediction {} of 100".format(same_state_predition))
        text = "Same prediction {} of {} ".format(same_state_predition, memory.idx)
        print(text)
        # self.writer.add_scalar('Action prediction acc', same_state_predition, self.steps)
        self.predicter.train()


    def get_action_prob(self, states, actions):
        """
        """
        actions = actions.type(torch.long)
        # check if action prob is zero
        output = self.predicter(states)
        output = F.softmax(output, dim=1)
        # print("get action_prob ", output) 
        # output = output.squeeze(0)
        action_prob = output.gather(1, actions)
        action_prob = action_prob + torch.finfo(torch.float32).eps
        # check if one action if its to small
        if action_prob.shape[0] == 1:
            if action_prob.cpu().detach().numpy()[0][0] < 1e-4:
                return None
        # logging.debug("action_prob {})".format(action_prob))
        action_prob = torch.log(action_prob)
        action_prob = torch.clamp(action_prob, min= self.clip, max=0)
        return action_prob

    def compute_shift_function(self, states, next_states, actions, dones):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        actions = actions.type(torch.int64)
        with torch.no_grad():
            # Get max predicted Q values (for next states) from target model
            if self.double_dqn:
                qt = self.q_shift_local(next_states)
                max_q, max_actions = qt.max(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(1, max_actions.unsqueeze(1))
            else:
                Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            # Compute Q targets for current states
            Q_targets = (self.gamma * Q_targets_next * (dones))

        # Get expected Q values from local model
        Q_expected = self.q_shift_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer_shift.zero_grad()
        loss.backward()
        self.writer.add_scalar('Shift_loss', loss, self.steps)
        self.optimizer_shift.step()


    def compute_r_function(self, states, actions, debug=False, log=False):
        """

        """
        actions = actions.type(torch.int64)
        
        # sum all other actions
        # print("state shape ", states.shape)
        size = states.shape[0]
        idx = 0
        all_zeros = []
        with torch.no_grad():
            y_shift = self.q_shift_target(states).gather(1, actions)
            log_a = self.get_action_prob(states, actions)
            index_list = index_None_value(log_a)
            # print("is none", index_list)
            if index_list is None:
                return


            y_r_part1 = log_a - y_shift
            y_r_part2 =  torch.empty((size, 1), dtype=torch.float32).to(self.device)
            for a, s in zip(actions, states):
                y_h = 0
                taken_actions = 0
                for b in self.all_actions:
                    b = b.type(torch.int64).unsqueeze(1)
                    n_b = self.get_action_prob(s.unsqueeze(0), b)
                    if torch.eq(a, b) or n_b is None:
                        logging.debug("best action {} ".format(a))
                        logging.debug("n_b action {} ".format(b))
                        logging.debug("n_b {} ".format(n_b))
                        continue
                    taken_actions += 1
                    r_hat = self.R_target(s.unsqueeze(0)).gather(1, b)

                    y_s = self.q_shift_target(s.unsqueeze(0)).gather(1, b)
                    n_b = n_b - y_s

                    y_h += (r_hat - n_b)
                    if debug:
                        print("action", b.item())
                        print("r_pre {:.3f}".format(r_hat.item()))
                        print("n_b {:.3f}".format(n_b.item()))
                if taken_actions == 0:
                    all_zeros.append(idx)
                else:
                    y_r_part2[idx] = (1. / taken_actions)  * y_h
                idx += 1
            #print(y_r_part2, y_r_part1)
            y_r = y_r_part1 + y_r_part2
            #print("_________________")
            #print("r update zeros ", len(all_zeros))
        if len(index_list) > 0:
            print("none list", index_list)
        y = self.R_local(states).gather(1, actions)
        if log:
            text = "Action {:.2f}  y target {:.2f} =  n_a {:.2f} + {:.2f} and pre{:.2f}".format(actions.item(), y_r.item(), y_r_part1.item(), y_r_part2.item(), y.item())
            logging.debug(text)

        if debug:
            print("expet action ", actions.item())
            # print("y r {:.3f}".format(y.item()))
            # print("log a prob {:.3f}".format(log_a.item()))
            # print("n_a {:.3f}".format(y_r_part1.item()))
            print("Correct action p {:.3f} ".format(y.item()))
            print("Correct action target {:.3f} ".format(y_r.item()))
            print("part1 corret action {:.2f} ".format(y_r_part1.item()))
            print("part2 incorret action {:.2f} ".format(y_r_part2.item()))
        
        #print("y", y.shape)
        #print("y_r", y_r.shape)
        
        r_loss = F.mse_loss(y, y_r)
        
        #con = input()
        #sys.exit()
        # Minimize the loss
        self.optimizer_r.zero_grad()
        r_loss.backward()
        #torch.nn.utils.clip_grad_norm_(self.R_local.parameters(), 5)
        self.optimizer_r.step()
        self.writer.add_scalar('Reward_loss', r_loss, self.steps)
        if debug:
            print("after update r pre ", self.R_local(states).gather(1, actions).item())
            print("after update r target ", self.R_target(states).gather(1, actions).item())
        # ------------------- update target network ------------------- #
        #self.soft_update(self.R_local, self.R_target, 5e-3)
        if debug:
            print("after soft upda r target ", self.R_target(states).gather(1, actions).item())
    
    def compute_q_function(self, states, next_states, actions, dones, debug=False, log= False):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        actions = actions.type(torch.int64)
        if debug:
            print("---------------q_update------------------")
            print("expet action ", actions.item())
            print("state ", states)
        with torch.no_grad():
            # Get max predicted Q values (for next states) from target model
            if self.double_dqn:
                qt = self.qnetwork_local(next_states)
                max_q, max_actions = qt.max(1)
                Q_targets_next = self.qnetwork_target(next_states).gather(1, max_actions.unsqueeze(1))
            else:
                Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            # Compute Q targets for current states
            rewards = self.R_target(states).gather(1, actions)
            Q_targets = rewards + (self.gamma * Q_targets_next * (dones))
            if debug:
                print("reward  {}".format(rewards.item()))
                print("Q target next {}".format(Q_targets_next.item()))
                print("Q_target {}".format(Q_targets.item()))



        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        if log:
            text = "Action {:.2f}  q target {:.2f} =  r_a {:.2f} + target {:.2f} and pre{:.2f}".format(actions.item(), Q_targets.item(), rewards.item(), Q_targets_next.item(), Q_expected.item())
            logging.debug(text)
        if debug:
            print("q for a {}".format(Q_expected))
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        self.writer.add_scalar('Q_loss', loss, self.steps)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if debug:
            print("q after update {}".format(self.qnetwork_local(states)))
            print("q loss {}".format(loss.item()))


        # ------------------- update target network ------------------- #



    def dqn_train(self, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        env =  gym.make('LunarLander-v2')
        scores = []                        # list containing scores from each episode
        scores_window = deque(maxlen=100)  # last 100 scores
        eps = eps_start
        for i_episode in range(1, n_episodes+1):
            state = env.reset()
            score = 0
            for t in range(max_t):
                self.t_step += 1
                action = self.dqn_act(state, eps)
                next_state, reward, done, _ = env.step(action)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    self.test_q()
                    break
            scores_window.append(score)       # save most recent score
            scores.append(score)              # save most recent score
            eps = max(eps_end, eps_decay*eps) # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            if np.mean(scores_window)>=200.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_window)))
                break



    def test_policy(self):
        env =  gym.make('LunarLander-v2')
        logging.debug("new episode")
        average_score = [] 
        average_steps = []
        average_action = []
        for i in range(5):
            state = env.reset()
            score = 0
            same_action = 0
            logging.debug("new episode")
            for t in range(200):
                state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
                q_expert = self.expert_q(state)
                q_values = self.qnetwork_local(state)
                logging.debug("q expert a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}".format(q_expert.data[0][0], q_expert.data[0][1], q_expert.data[0][2], q_expert.data[0][3]))
                logging.debug("q values a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}  )".format(q_values.data[0][0], q_values.data[0][1], q_values.data[0][2], q_values.data[0][3]))
                action = torch.argmax(q_values).item()
                action_e = torch.argmax(q_expert).item()
                if action == action_e:
                    same_action += 1
                next_state, reward, done, _ = env.step(action)
                state = next_state
                score += reward
                if done:
                    average_score.append(score)
                    average_steps.append(t)
                    average_action.append(same_action)
                    break
        mean_steps = np.mean(average_steps)
        mean_score = np.mean(average_score)
        mean_action= np.mean(average_action)
        self.writer.add_scalar('Ave_epsiode_length', mean_steps , self.steps)
        self.writer.add_scalar('Ave_same_action', mean_action, self.steps)
        self.writer.add_scalar('Ave_score', mean_score, self.steps)


    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % 4
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.update_q(experiences)


    def dqn_act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def update_q(self, experiences, debug=False):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values (for next states) from target model
        with torch.no_grad():
            Q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            # Compute Q targets for current states
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        if debug:
            print("----------------------")
            print("----------------------")
            print("Q target", Q_targets)
            print("pre", Q_expected)
            print("all local",self.qnetwork_local(states))

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target)



    def test_q(self):
        experiences = self.memory.test_sample()
        self.update_q(experiences, True)

    def test_q_value(self, memory):
        same_action = 0
        test_elements = memory.idx
        all_diff = 0
        error = True
        self.predicter.eval()
        for i in range(test_elements):
            # print("lop", i)
            states = memory.obses[i]
            next_states = memory.next_obses[i]
            actions = memory.actions[i]
            dones = memory.not_dones[i]
            states = torch.as_tensor(states, device=self.device).unsqueeze(0)
            next_states = torch.as_tensor(next_states, device=self.device)
            actions = torch.as_tensor(actions, device=self.device)
            dones = torch.as_tensor(dones, device=self.device)
            with torch.no_grad():
                output = self.predicter(states)
                output = F.softmax(output, dim=1)
                q_values = self.qnetwork_local(states)
                expert_values = self.expert_q(states)
                print("q values ", q_values)
                print("ex values  ", expert_values)
                best_action = torch.argmax(q_values).item()
                actions = actions.type(torch.int64)
                q_max = q_values.max(1)
                
                #print("q values", q_values)
                q = q_values[0][actions.item()].item()
                #print("q action", q)
                max_q =  q_max[0].data.item()
                diff = max_q - q
                all_diff += diff
                #print("q best", max_q)
                #print("difference ", diff)
            if  actions.item() != best_action:
                r = self.R_local(states)
                rt = self.R_target(states)
                qt = self.qnetwork_target(states)
                logging.debug("------------------false action --------------------------------")
                logging.debug("expert action  {})".format(actions.item()))
                logging.debug("out predicter a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}  )".format(output.data[0][0], output.data[0][1], output.data[0][2], output.data[0][3]))
                logging.debug("q values a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}  )".format(q_values.data[0][0], q_values.data[0][1], q_values.data[0][2], q_values.data[0][3]))
                logging.debug("q target a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}  )".format(qt.data[0][0], qt.data[0][1], qt.data[0][2], qt.data[0][3]))
                logging.debug("rewards a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}  )".format(r.data[0][0], r.data[0][1], r.data[0][2], r.data[0][3]))
                logging.debug("re target a0: {:.2f} a1: {:.2f} a2: {:.2f} a3: {:.2f}  )".format(rt.data[0][0], rt.data[0][1], rt.data[0][2], rt.data[0][3]))
                """ 
                logging.debug("---------Reward Function------------")
                action = torch.Tensor(1) * 0 +  0
                self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True)
                action = torch.Tensor(1) * 0 +  1
                self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True)
                action = torch.Tensor(1) * 0 +  2
                self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True)
                action = torch.Tensor(1) * 0 +  3
                self.compute_r_function(states, action.unsqueeze(0).to(self.device), log= True)
                logging.debug("------------------Q Function --------------------------------")
                action = torch.Tensor(1) * 0 +  0
                self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True)
                action = torch.Tensor(1) * 0 +  1
                self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True)
                action = torch.Tensor(1) * 0 +  2
                self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True)
                action = torch.Tensor(1) * 0 +  3
                self.compute_q_function(states, next_states.unsqueeze(0), action.unsqueeze(0).to(self.device), dones, log= True)
                """
                

            if  actions.item() == best_action:
                same_action += 1
                continue
                print("-------------------------------------------------------------------------------")
                print("state ", i)
                print("expert ", actions)
                print("q values", q_values.data)
                print("action prob predicter  ", output.data)
                self.compute_r_function(states, actions.unsqueeze(0), True)
                self.compute_q_function(states, next_states.unsqueeze(0), actions.unsqueeze(0), dones, True)
            else:
                if error:
                    continue
                    print("-------------------------------------------------------------------------------")
                    print("expert action ", actions.item())
                    print("best action q ", best_action)
                    print(i)
                    error = False
                continue
                # logging.debug("experte action  {} q fun {}".format(actions.item(), q_values))
                print("-------------------------------------------------------------------------------")
                print("state ", i)
                print("expert ", actions)
                print("q values", q_values.data)
                print("action prob predicter  ", output.data)
                self.compute_r_function(states, actions.unsqueeze(0), True)
                self.compute_q_function(states, next_states.unsqueeze(0), actions.unsqueeze(0), dones, True)


        self.writer.add_scalar('diff', all_diff, self.steps)
        self.average_same_action.append(same_action)
        av_action = np.mean(self.average_same_action)
        self.writer.add_scalar('Same_action', same_action, self.steps)
        print("Same actions {}  of {}".format(same_action, test_elements))
        self.predicter.train()


    def soft_update(self, local_model, target_model, tau=4):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        # print("use tau", tau)
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
    
    def save(self, filename):
        """

        """
        mkdir("", filename)
        torch.save(self.predicter.state_dict(), filename + "_predicter.pth")
        torch.save(self.optimizer_pre.state_dict(), filename + "_predicter_optimizer.pth")
        torch.save(self.qnetwork_local.state_dict(), filename + "_q_net.pth")
        """
        torch.save(self.optimizer_q.state_dict(), filename + "_q_net_optimizer.pth")
        torch.save(self.q_shift_local.state_dict(), filename + "_q_shift_net.pth")
        torch.save(self.optimizer_q_shift.state_dict(), filename + "_q_shift_net_optimizer.pth")
        """
        print("save models to {}".format(filename))
    
    def load(self, filename):
        self.predicter.load_state_dict(torch.load(filename + "_predicter.pth"))
        self.optimizer_pre.load_state_dict(torch.load(filename + "_predicter_optimizer.pth"))
        print("Load models to {}".format(filename))
 def __init__(self, state_size, action_size, config):
     self.seed = config["seed"]
     torch.manual_seed(self.seed)
     np.random.seed(seed=self.seed)
     random.seed(self.seed)
     env = gym.make(config["env_name"])
     self.env = FrameStack(env, config)
     self.env.seed(self.seed)
     self.state_size = state_size
     self.action_size = action_size
     self.clip = config["clip"]
     self.device = 'cuda'
     self.double_dqn = config["DDQN"]
     self.lr_pre = config["lr_pre"]
     self.batch_size = config["batch_size"]
     self.lr = config["lr"]
     self.tau = config["tau"]
     self.gamma = 0.99
     self.fc1 = config["fc1_units"]
     self.fc2 = config["fc2_units"]
     self.qnetwork_local = QNetwork(state_size, action_size, self.fc1,
                                    self.fc2, self.seed).to(self.device)
     self.qnetwork_target = QNetwork(state_size, action_size, self.fc1,
                                     self.fc2, self.seed).to(self.device)
     self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                 lr=self.lr)
     self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
     self.q_shift_local = QNetwork(state_size, action_size, self.fc1,
                                   self.fc2, self.seed).to(self.device)
     self.q_shift_target = QNetwork(state_size, action_size, self.fc1,
                                    self.fc2, self.seed).to(self.device)
     self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(),
                                       lr=self.lr)
     self.soft_update(self.q_shift_local, self.q_shift_target, 1)
     self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2,
                             self.seed).to(self.device)
     self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2,
                              self.seed).to(self.device)
     self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
     self.soft_update(self.R_local, self.R_target, 1)
     self.steps = 0
     self.predicter = QNetwork(state_size, action_size, self.fc1, self.fc2,
                               self.seed).to(self.device)
     self.optimizer_pre = optim.Adam(self.predicter.parameters(),
                                     lr=self.lr_pre)
     #self.encoder_freq = Encoder(config).to(self.device)
     #self.encoder_optimizer_frq = torch.optim.Adam(self.encoder_freq.parameters(), self.lr)
     self.encoder = Encoder(config).to(self.device)
     self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(),
                                               self.lr)
     pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(
         self.lr, self.batch_size, self.fc1, self.fc2, self.seed)
     pathname += "_clip_{}".format(config["clip"])
     pathname += "_tau_{}".format(config["tau"])
     now = datetime.now()
     dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
     pathname += dt_string
     tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
     self.vid_path = str(config["locexp"]) + '/vid'
     self.writer = SummaryWriter(tensorboard_name)
     self.average_prediction = deque(maxlen=100)
     self.average_same_action = deque(maxlen=100)
     self.all_actions = []
     for a in range(self.action_size):
         action = torch.Tensor(1) * 0 + a
         self.all_actions.append(action.to(self.device))
class SACAgent():
    def __init__(self, action_size, state_size, config):
        self.seed = config["seed"]
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        self.env = gym.make(config["env_name"])
        self.env = FrameStack(self.env, config)
        self.env.seed(self.seed)
        self.action_size = action_size
        self.state_size = state_size
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.history_length = config["history_length"]
        self.size = config["size"]
        if not torch.cuda.is_available():
            config["device"] == "cpu"
        self.device = config["device"]
        self.eval = config["eval"]
        self.vid_path = config["vid_path"]
        print("actions size ", action_size)
        self.critic = QNetwork(state_size, action_size, config["fc1_units"],
                               config["fc2_units"]).to(self.device)
        self.q_optim = torch.optim.Adam(self.critic.parameters(),
                                        config["lr_critic"])
        self.target_critic = QNetwork(state_size, action_size,
                                      config["fc1_units"],
                                      config["fc2_units"]).to(self.device)
        self.target_critic.load_state_dict(self.critic.state_dict())
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
        self.policy = SACActor(state_size, action_size).to(self.device)
        self.policy_optim = Adam(self.policy.parameters(),
                                 lr=config["lr_policy"])
        self.encoder = Encoder(config).to(self.device)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(),
                                                  self.lr)
        self.episodes = config["episodes"]
        self.memory = ReplayBuffer((self.history_length, self.size, self.size),
                                   (1, ), config["buffer_size"],
                                   config["image_pad"], self.seed, self.device)
        pathname = config["seed"]
        tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0
        self.target_entropy = -torch.prod(
            torch.Tensor(action_size).to(self.device)).item()

    def act(self, state, evaluate=False):
        with torch.no_grad():
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
            state = state.type(torch.float32).div_(255)
            self.encoder.eval()
            state = self.encoder.create_vector(state)
            self.encoder.train()
            if evaluate is False:
                action = self.policy.sample(state)
            else:
                action_prob, _ = self.policy(state)
                action = torch.argmax(action_prob)
                action = action.cpu().numpy()
                return action
            # action = np.clip(action, self.min_action, self.max_action)
            action = action.cpu().numpy()[0]
        return action

    def train_agent(self):
        average_reward = 0
        scores_window = deque(maxlen=100)
        t0 = time.time()
        for i_epiosde in range(1, self.episodes):
            episode_reward = 0
            state = self.env.reset()
            t = 0
            while True:
                t += 1
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward
                if i_epiosde > 10:
                    self.learn()
                self.memory.add(state, reward, action, next_state, done)
                state = next_state
                if done:
                    scores_window.append(episode_reward)
                    break
            if i_epiosde % self.eval == 0:
                self.eval_policy()
            ave_reward = np.mean(scores_window)
            print("Epiosde {} Steps {} Reward {} Reward averge{:.2f} Time {}".
                  format(i_epiosde, t, episode_reward, np.mean(scores_window),
                         time_format(time.time() - t0)))
            self.writer.add_scalar('Aver_reward', ave_reward, self.steps)

    def learn(self):
        self.steps += 1
        states, rewards, actions, next_states, dones = self.memory.sample(
            self.batch_size)
        states = states.type(torch.float32).div_(255)
        states = self.encoder.create_vector(states)
        states_detached = states.detach()
        qf1, qf2 = self.critic(states)
        q_value1 = qf1.gather(1, actions)
        q_value2 = qf2.gather(1, actions)

        with torch.no_grad():
            next_states = next_states.type(torch.float32).div_(255)
            next_states = self.encoder.create_vector(next_states)
            q1_target, q2_target = self.target_critic(next_states)
            min_q_target = torch.min(q1_target, q2_target)
            next_action_prob, next_action_log_prob = self.policy(next_states)
            next_q_target = (
                next_action_prob *
                (min_q_target - self.alpha * next_action_log_prob)).sum(
                    dim=1, keepdim=True)
            next_q_value = rewards + (1 - dones) * self.gamma * next_q_target

        # --------------------------update-q--------------------------------------------------------
        loss = F.mse_loss(q_value1, next_q_value) + F.mse_loss(
            q_value2, next_q_value)
        self.q_optim.zero_grad()
        self.encoder_optimizer.zero_grad()
        loss.backward()
        self.q_optim.step()
        self.encoder_optimizer.zero_grad()
        self.writer.add_scalar('loss/q', loss, self.steps)

        # --------------------------update-policy--------------------------------------------------------
        action_prob, log_action_prob = self.policy(states_detached)
        with torch.no_grad():
            q_pi1, q_pi2 = self.critic(states_detached)
            min_q_values = torch.min(q_pi1, q_pi2)
        #policy_loss = (action_prob *  ((self.alpha * log_action_prob) - min_q_values).detach()).sum(dim=1).mean()
        policy_loss = (action_prob *
                       ((self.alpha * log_action_prob) - min_q_values)).sum(
                           dim=1).mean()
        self.policy_optim.zero_grad()
        policy_loss.backward()
        self.policy_optim.step()
        self.writer.add_scalar('loss/policy', policy_loss, self.steps)

        # --------------------------update-alpha--------------------------------------------------------
        alpha_loss = (action_prob.detach() *
                      (-self.log_alpha *
                       (log_action_prob + self.target_entropy).detach())).sum(
                           dim=1).mean()
        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.writer.add_scalar('loss/alpha', alpha_loss, self.steps)
        self.soft_udapte(self.critic, self.target_critic)
        self.alpha = self.log_alpha.exp()

    def soft_udapte(self, online, target):
        for param, target_parm in zip(online.parameters(),
                                      target.parameters()):
            target_parm.data.copy_(self.tau * param.data +
                                   (1 - self.tau) * target_parm.data)

    def eval_policy(self, eval_episodes=4):
        env = gym.make(self.env_name)
        env = wrappers.Monitor(env,
                               str(self.vid_path) + "/{}".format(self.steps),
                               video_callable=lambda episode_id: True,
                               force=True)
        average_reward = 0
        scores_window = deque(maxlen=100)
        for i_epiosde in range(eval_episodes):
            print("Eval Episode {} of {} ".format(i_epiosde, eval_episodes))
            episode_reward = 0
            state = env.reset()
            while True:
                action = self.act(state, evaluate=True)
                state, reward, done, _ = env.step(action)
                episode_reward += reward
                if done:
                    break
            scores_window.append(episode_reward)
        average_reward = np.mean(scores_window)
        self.writer.add_scalar('Eval_reward', average_reward, self.steps)
示例#30
0
def train(args):
    chrome_driver_path = args.chrome_driver_path
    checkpoint_path = args.checkpoint_path
    nb_actions = args.nb_actions
    initial_epsilon = args.initial_epsilon
    epsilon = initial_epsilon
    final_epsilon = args.final_epsilon
    gamma = args.gamma
    nb_memory = args.nb_memory
    nb_expolre = args.nb_expolre
    is_debug = args.is_debug
    batch_size = args.batch_size
    nb_observation = args.nb_observation
    desired_fps = args.desired_fps
    is_cuda = True if args.use_cuda and torch.cuda.is_available() else False
    log_frequency = args.log_frequency
    save_frequency = args.save_frequency
    ratio_of_win = args.ratio_of_win
    if args.exploiting:
        nb_observation = -1
        epsilon = final_epsilon

    seed = 22
    np.random.seed(seed)
    memory = deque()
    env = DinoSeleniumEnv(chrome_driver_path, speed=args.game_speed)
    agent = Agent(env)
    game_state = GameState(agent, debug=is_debug)
    qnetwork = QNetwork(nb_actions)
    if is_cuda:
        qnetwork.cuda()
    optimizer = torch.optim.Adam(qnetwork.parameters(), 1e-4)
    tmp_param = next(qnetwork.parameters())
    try:
        m = torch.load(checkpoint_path)
        qnetwork.load_state_dict(m["qnetwork"])
        optimizer.load_state_dict(m["optimizer"])
    except:
        logger.warn("No model found in {}".format(checkpoint_path))
    loss_fcn = torch.nn.MSELoss()
    action_indx = 0  # do nothing as the first action
    screen, reward, is_gameover, score = game_state.get_state(action_indx)
    current_state = np.expand_dims(screen, 0)
    # [IMAGE_CHANNELS,IMAGE_WIDTH,IMAGE_HEIGHT]
    current_state = np.tile(current_state, (IMAGE_CHANNELS, 1, 1))
    initial_state = current_state

    t = 0
    last_time = 0
    sum_scores = 0
    total_loss = 0
    max_score = 0
    qvalues = np.array([0, 0])
    lost_action = []
    win_actions = []
    action_random = 0
    action_greedy = 0
    episodes = 0
    nb_episodes = 0
    if not args.exploiting:
        try:
            t, memory, epsilon, nb_episodes = pickle.load(open(
                "cache.p", "rb"))
        except:
            logger.warn("Could not load cache file! Starting from scratch.")
    try:
        while True:
            qnetwork.eval()
            if np.random.random() < epsilon:  # epsilon greedy
                action_indx = np.random.randint(nb_actions)
                action_random += 1
            else:
                action_greedy += 1
                tensor = torch.from_numpy(current_state).float().unsqueeze(0)
                with torch.no_grad():
                    qvalues = qnetwork(tensor).squeeze()
                _, action_indx = qvalues.max(-1)
                action_indx = action_indx.item()
            if epsilon > final_epsilon and t > nb_observation:
                epsilon -= (initial_epsilon - final_epsilon) / nb_expolre
            screen, reward, is_gameover, score = game_state.get_state(
                action_indx)
            if is_gameover:
                episodes += 1
                nb_episodes += 1
                lost_action.append(action_indx)
                sum_scores += score
            else:
                win_actions.append(action_indx)
            if score > max_score:
                max_score = score
            if last_time:
                fps = 1 / (time.time() - last_time)
                if fps > desired_fps:
                    time.sleep(1 / desired_fps - 1 / fps)
            if last_time and t % log_frequency == 0:
                logger.info('fps: {0}'.format(1 / (time.time() - last_time)))
            last_time = time.time()
            screen = np.expand_dims(screen, 0)
            next_state = np.append(screen,
                                   current_state[:IMAGE_CHANNELS - 1, :, :],
                                   axis=0)
            if not args.exploiting and (is_gameover
                                        or np.random.random() < ratio_of_win):
                memory.append((current_state, action_indx, reward, next_state,
                               is_gameover))
            if len(memory) > nb_memory:
                memory.popleft()
            if nb_observation > 0 and t > nb_observation:
                indxes = np.random.choice(len(memory),
                                          batch_size,
                                          replace=False)
                minibatch = [memory[b] for b in indxes]
                inputs = tmp_param.new(batch_size, IMAGE_CHANNELS, IMAGE_WIDTH,
                                       IMAGE_HEIGHT).zero_()
                targets = tmp_param.new(batch_size, nb_actions).zero_()
                for i, (state_t, action_t, reward_t, state_t1,
                        is_gameover_t1) in enumerate(minibatch):
                    inputs[i] = torch.from_numpy(state_t).float()
                    tensor = inputs[i].unsqueeze(0)
                    with torch.no_grad():
                        qvalues = qnetwork(tensor).squeeze()
                    targets[i] = qvalues
                    if is_gameover_t1:
                        assert reward_t == -1
                        targets[i, action_t] = reward_t
                    else:
                        tensor = torch.from_numpy(state_t1).float().unsqueeze(
                            0)
                        with torch.no_grad():
                            qvalues = qnetwork(tensor).squeeze()
                        qvalues = qvalues.cpu().numpy()
                        targets[i, action_t] = reward_t + gamma * qvalues.max()
                qnetwork.train()
                qnetwork.zero_grad()
                q_values = qnetwork(inputs)
                loss = loss_fcn(q_values, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            current_state = initial_state if is_gameover else next_state
            t += 1
            if t % log_frequency == 0:
                logger.info(
                    "For t {}: mean score is {} max score is {} mean loss: {} number of episode: {}"
                    .format(t, sum_scores / (episodes + 0.1), max_score,
                            total_loss / 1000, episodes))
                logger.info(
                    "t: {} action_index: {} reward: {} max qvalue: {} total number of eposodes so far: {}"
                    .format(t, action_indx, reward, qvalues.max(),
                            nb_episodes))
                tmp = np.array(lost_action)
                dnc = (tmp == 0).sum()
                logger.info(
                    "Lost actions do_nothing: {} jump: {} length of memory {}".
                    format(dnc,
                           len(tmp) - dnc, len(memory)))
                tmp = np.array(win_actions)
                dnc = (tmp == 0).sum()
                logger.info("Win actions do_nothing: {} jump: {}".format(
                    dnc,
                    len(tmp) - dnc))
                logger.info("Greedy action {} Random action {}".format(
                    action_greedy, action_random))
                action_greedy = 0
                action_random = 0
                lost_action = []
                win_actions = []
                if episodes != 0:
                    sum_scores = 0
                total_loss = 0
                episodes = 0
            if t % save_frequency and not args.exploiting:
                env.pause_game()
                with open("cache.p", "wb") as fh:
                    pickle.dump((t, memory, epsilon, nb_episodes), fh)
                gc.collect()
                torch.save(
                    {
                        "qnetwork": qnetwork.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }, checkpoint_path)
                env.resume_game()
    except KeyboardInterrupt:
        if not args.exploiting:
            torch.save(
                {
                    "qnetwork": qnetwork.state_dict(),
                    "optimizer": optimizer.state_dict()
                }, checkpoint_path)
            with open("cache.p", "wb") as fh:
                pickle.dump((t, memory, epsilon, nb_episodes), fh)