Example #1
    def __init__(self, state_size, action_size, seed, algorithm='DQN'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            algorithm (str): 'DQN' or 'DDQN'
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

        # set algorithm
        if algorithm == "DQN":
            self.learn = self.learnDQN
        elif algorithm == "DDQN":
            self.learn = self.learnDDQN
        else:
            raise NotImplementedError('algorithm {} not implemented'.format(algorithm))
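
A note on the two update rules bound above: the only difference between DQN and Double DQN is how the next-state value is formed. The sketch below shows plausible bodies for learnDQN/learnDDQN; the batch layout, the GAMMA constant, and the MSE loss are assumptions, not the project's actual code.

import torch.nn.functional as F

GAMMA = 0.99  # assumed discount factor

def learnDQN(self, experiences):
    # Vanilla DQN: the target network both selects and evaluates the next action.
    states, actions, rewards, next_states, dones = experiences
    q_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
    q_targets = rewards + GAMMA * q_next * (1 - dones)
    q_expected = self.qnetwork_local(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

def learnDDQN(self, experiences):
    # Double DQN: the local network selects the action, the target network evaluates it.
    states, actions, rewards, next_states, dones = experiences
    best_actions = self.qnetwork_local(next_states).detach().argmax(1, keepdim=True)
    q_next = self.qnetwork_target(next_states).detach().gather(1, best_actions)
    q_targets = rewards + GAMMA * q_next * (1 - dones)
    q_expected = self.qnetwork_local(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()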
Example #2
    def __init__(self,
                 env,
                 gamma=0.95,
                 epsilon=1.0,
                 copy_period=1000,
                 lr=0.01,
                 update_period=2):
        """
            gammma: 割引率
            epsilon: 探索と活用の割合
        """

        self.env = env

        self.gamma = gamma

        self.epsilon = epsilon

        self.copy_period = copy_period

        self.update_period = update_period

        self.lr = lr

        self.global_steps = 0

        self.q_network = QNetwork(self.env.action_space.n, lr=lr)

        self.q_network.build(input_shape=(None, 4))

        self.target_network = QNetwork(self.env.action_space.n)

        self.target_network.build(input_shape=(None, 4))

        self.experiences = collections.deque(maxlen=self.MAX_EXPERIENCES)
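
Since this agent keeps a separate target_network and a copy_period, the target weights are presumably hard-copied from the online network on that schedule. A minimal sketch under that assumption (Keras get_weights/set_weights; the method name and the call site are not from the original project):

def sync_target_network(self):
    # Overwrite the target weights with the current online weights.
    self.target_network.set_weights(self.q_network.get_weights())

# assumed call site inside the training loop:
#     self.global_steps += 1
#     if self.global_steps % self.copy_period == 0:
#         self.sync_target_network()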
Example #3
    def __init__(self, env, render, config_info):
        self.env = env
        self.render = render
        self._reset_env()

        # Create run folder to store parameters, figures, and tensorboard logs
        self.path_runs = create_run_folder(config_info)

        # Extract training parameters from yaml config file
        param = load_training_parameters(config_info["config_param"])
        self.train_param = param["training"]

        # Define device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        # Define models
        hidden_size = param["model"]["hidden_size"]
        self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)

        self.target_q_net = QNetwork(state_dim, num_actions, hidden_size).to(
            self.device
        )
        self.target_q_net.load_state_dict(self.q_net.state_dict())

        self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to(
            self.device
        )

        # Define loss criterion
        self.q_criterion = nn.MSELoss()

        # Define optimizers
        lr = float(param["optimizer"]["learning_rate"])
        self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(param["training"]["replay_size"])

        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

        # Useful variables
        self.batch_size = param["training"]["batch_size"]
        self.gamma = param["training"]["gamma"]
        self.tau = param["training"]["tau"]
        self.start_step = param["training"]["start_step"]
        self.max_timesteps = param["training"]["max_timesteps"]
        self.alpha = param["training"]["alpha"]
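
With tau stored and target_q_net initialized from q_net, the target network is most likely tracked with a Polyak (soft) update after each gradient step. A minimal sketch, assuming that convention:

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# assumed usage after each update step: soft_update(self.target_q_net, self.q_net, self.tau)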
Example #4
 def __init__(self, action_size, state_size, config):
     self.seed = config["seed"]
     torch.manual_seed(self.seed)
     np.random.seed(seed=self.seed)
     random.seed(self.seed)
     self.env = gym.make(config["env_name"])
     self.env.seed(self.seed)
     now = datetime.now()
     dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
     self.env.action_space.seed(self.seed)
     self.action_size = action_size
     self.state_size = state_size
     self.min_action = config["min_action"]
     self.max_action = config["max_action"]
     self.seed = config["seed"]
     self.tau = config["tau"]
     self.gamma = config["gamma"]
     self.batch_size = config["batch_size"]
     if not torch.cuda.is_available():
         config["device"] == "cpu"
     self.device = config["device"]
     self.eval = config["eval"]
     self.vid_path = config["vid_path"]
     print("actions size ", action_size)
     print("actions min ", self.min_action)
     print("actions max ", self.max_action)
     fc1 = config["fc1_units"]
     fc2 = config["fc2_units"]
     self.actor = Actor(state_size, action_size, self.seed, fc1,
                        fc2).to(self.device)
     self.optimizer_a = torch.optim.Adam(self.actor.parameters(),
                                         config["lr_actor"])
     self.target_actor = Actor(state_size, action_size, self.seed, fc1,
                               fc2).to(self.device)
     self.target_actor.load_state_dict(self.actor.state_dict())
     self.critic = QNetwork(state_size, action_size, self.seed, fc1,
                            fc2).to(self.device)
     self.optimizer_q = torch.optim.Adam(self.critic.parameters(),
                                         config["lr_critic"])
     self.target_critic = QNetwork(state_size, action_size, self.seed, fc1,
                                   fc2).to(self.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(action_size),
                                           dimension=action_size)
     self.max_timesteps = config["max_episodes_steps"]
     self.noise.reset()
     self.episodes = config["episodes"]
     self.memory = ReplayBuffer((state_size, ), (action_size, ),
                                config["buffer_size"], self.seed,
                                self.device)
     pathname = str(config["seed"]) + str(dt_string)
     tensorboard_name = str(
         config["res_path"]) + '/runs/' + "DDPG" + str(pathname)
     self.writer = SummaryWriter(tensorboard_name)
     self.steps = 0
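
Given the actor, the Ornstein-Uhlenbeck process, and the min_action/max_action bounds stored above, action selection in a DDPG agent of this shape typically looks like the sketch below; the method name, the noise.sample() call, and the add_noise flag are assumptions:

import numpy as np
import torch

def act(self, state, add_noise=True):
    state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
    self.actor.eval()
    with torch.no_grad():
        action = self.actor(state).cpu().numpy().squeeze(0)
    self.actor.train()
    if add_noise:
        action = action + self.noise.sample()  # exploration noise (assumed API)
    return np.clip(action, self.min_action, self.max_action)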
Example #5
 def __init__(self, action_size, state_size, config):
     self.seed = config["seed"]
     torch.manual_seed(self.seed)
     np.random.seed(seed=self.seed)
     self.env = gym.make(config["env_name"])
     self.env = FrameStack(self.env, config)
     self.env.seed(self.seed)
     self.action_size = action_size
     self.state_size = state_size
     self.tau = config["tau"]
     self.gamma = config["gamma"]
     self.batch_size = config["batch_size"]
     self.lr = config["lr"]
     self.history_length = config["history_length"]
     self.size = config["size"]
     if not torch.cuda.is_available():
         config["device"] == "cpu"
     self.device = config["device"]
     self.eval = config["eval"]
     self.vid_path = config["vid_path"]
     print("actions size ", action_size)
     self.critic = QNetwork(state_size, action_size, config["fc1_units"],
                            config["fc2_units"]).to(self.device)
     self.q_optim = torch.optim.Adam(self.critic.parameters(),
                                     config["lr_critic"])
     self.target_critic = QNetwork(state_size, action_size,
                                   config["fc1_units"],
                                   config["fc2_units"]).to(self.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
     self.alpha = self.log_alpha.exp()
     self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
     self.policy = SACActor(state_size, action_size).to(self.device)
     self.policy_optim = Adam(self.policy.parameters(),
                              lr=config["lr_policy"])
     self.encoder = Encoder(config).to(self.device)
     self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(),
                                               self.lr)
     self.episodes = config["episodes"]
     self.memory = ReplayBuffer((self.history_length, self.size, self.size),
                                (1, ), config["buffer_size"],
                                config["image_pad"], self.seed, self.device)
     pathname = config["seed"]
     tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
     self.writer = SummaryWriter(tensorboard_name)
     self.steps = 0
     self.target_entropy = -torch.prod(
         torch.Tensor(action_size).to(self.device)).item()
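
The log_alpha, alpha_optim, and target_entropy attributes indicate automatic entropy-temperature tuning as in SAC. A hedged sketch of that update; log_pi (the log-probability of the action sampled from the policy) is an input assumed here:

def update_alpha(self, log_pi):
    # log_pi: log-probability of the sampled action, shape (batch, 1)
    alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
    self.alpha_optim.zero_grad()
    alpha_loss.backward()
    self.alpha_optim.step()
    self.alpha = self.log_alpha.exp()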
Example #6
File: dqn.py Project: TheBrick/curious_car
def main(config: Config):
    print(config)

    # Let's run it!
    for i in range(config.num_experiments):
        experiment_seed = config.seed + i * config.num_episodes
        memory = ReplayMemory(config.replay_memory_size)

        # We will seed the algorithm (for reproducability).
        random.seed(experiment_seed)
        torch.manual_seed(experiment_seed)
        env.seed(experiment_seed)

        q_model = QNetwork(config.device, config.num_hidden_q_model)
        curiousity_model = StatePredictor(2, 3,
                                          config.num_hidden_curiosity_model,
                                          config.device)

        for run_idx in range(20, 29):
            episode_durations, episode_loss = run_episodes(train,
                                                           q_model,
                                                           curiousity_model,
                                                           memory,
                                                           env,
                                                           experiment_seed,
                                                           config,
                                                           experiment_number=run_idx)
        # print(i, episode_durations, episode_loss)
        print("Finished experiment {}/{}".format(i + 1,
                                                 config.num_experiments))
Example #7
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 lr,
                 hidden_1,
                 hidden_2,
                 update_every,
                 epsilon,
                 epsilon_min,
                 eps_decay,
                 seed
                 ):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        random.seed(seed)
        self.seed = seed
        self.learn_steps = 0
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.eps_decay = eps_decay

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
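
Given the epsilon, epsilon_min, and eps_decay fields, action selection presumably pairs epsilon-greedy exploration with multiplicative decay. A minimal sketch; the method name and where the decay is applied are assumptions:

import random
import torch

def act(self, state):
    if random.random() < self.epsilon:
        action = random.randrange(self.action_size)  # explore
    else:
        state_t = torch.from_numpy(state).float().unsqueeze(0).to(device)  # module-level device, as above
        self.qnetwork_local.eval()
        with torch.no_grad():
            action = int(self.qnetwork_local(state_t).argmax(dim=1).item())  # exploit
        self.qnetwork_local.train()
    self.epsilon = max(self.epsilon_min, self.epsilon * self.eps_decay)
    return action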
Example #8
 def __init__(self, state_size, action_size, config):
     self.seed = config["seed"]
     torch.manual_seed(self.seed)
     np.random.seed(seed=self.seed)
     random.seed(self.seed)
     self.env = gym.make(config["env_name"])
     self.env.seed(self.seed)
     self.state_size = state_size
     self.action_size = action_size
     self.clip = config["clip"]
     self.device = 'cuda'
     self.double_dqn = config["DDQN"]
     self.lr_pre = config["lr_pre"]
     self.batch_size = config["batch_size"]
     self.lr = config["lr"]
     self.tau = config["tau"]
     self.gamma = 0.99
     self.fc1 = config["fc1_units"]
     self.fc2 = config["fc2_units"]
     self.qnetwork_local = QNetwork(state_size, action_size, self.fc1,
                                    self.fc2, self.seed).to(self.device)
     self.qnetwork_target = QNetwork(state_size, action_size, self.fc1,
                                     self.fc2, self.seed).to(self.device)
     self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                 lr=self.lr)
     self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
     self.q_shift_local = QNetwork(state_size, action_size, self.fc1,
                                   self.fc2, self.seed).to(self.device)
     self.q_shift_target = QNetwork(state_size, action_size, self.fc1,
                                    self.fc2, self.seed).to(self.device)
     self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(),
                                       lr=self.lr)
     self.soft_update(self.q_shift_local, self.q_shift_target, 1)
     self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2,
                             self.seed).to(self.device)
     self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2,
                              self.seed).to(self.device)
     self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
     self.soft_update(self.R_local, self.R_target, 1)
     self.steps = 0
     self.predicter = QNetwork(state_size, action_size, self.fc1, self.fc2,
                               self.seed).to(self.device)
     self.optimizer_pre = optim.Adam(self.predicter.parameters(),
                                     lr=self.lr_pre)
     pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(
         self.lr, self.batch_size, self.fc1, self.fc2, self.seed)
     pathname += "_clip_{}".format(config["clip"])
     pathname += "_tau_{}".format(config["tau"])
     now = datetime.now()
     dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
     pathname += dt_string
     tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
     self.vid_path = str(config["locexp"]) + '/vid'
     self.writer = SummaryWriter(tensorboard_name)
     self.average_prediction = deque(maxlen=100)
     self.average_same_action = deque(maxlen=100)
     self.all_actions = []
     for a in range(self.action_size):
         action = torch.Tensor(1) * 0 + a
         self.all_actions.append(action.to(self.device))
Example #9
 def __init__(self, state_size, action_size, action_dim, config):
     self.state_size = state_size
     self.action_size = action_size
     self.action_dim = action_dim
     self.seed = 0
     self.device = 'cuda'
     self.batch_size = config["batch_size"]
     self.lr = 0.005
     self.gamma = 0.99
     self.q_shift_local = QNetwork(state_size, action_size,
                                   self.seed).to(self.device)
     self.q_shift_target = QNetwork(state_size, action_size,
                                    self.seed).to(self.device)
     self.Q_local = QNetwork(state_size, action_size,
                             self.seed).to(self.device)
     self.Q_target = QNetwork(state_size, action_size,
                              self.seed).to(self.device)
     self.R_local = RNetwork(state_size, action_size,
                             self.seed).to(self.device)
     self.R_target = RNetwork(state_size, action_size,
                              self.seed).to(self.device)
     self.policy = PolicyNetwork(state_size, action_size,
                                 self.seed).to(self.device)
     self.predicter = Classifier(state_size, action_dim,
                                 self.seed).to(self.device)
     #self.criterion = nn.CrossEntropyLoss()
     # optimizer
     self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(),
                                         lr=self.lr)
     self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr)
     self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
     self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
     self.optimizer_pre = optim.Adam(self.predicter.parameters(),
                                     lr=self.lr)
     pathname = "lr {} batch_size {} seed {}".format(
         self.lr, self.batch_size, self.seed)
     tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
     self.writer = SummaryWriter(tensorboard_name)
     self.steps = 0
     self.ratio = 1. / action_dim
     self.all_actions = []
     for a in range(self.action_dim):
         action = torch.Tensor(1) * 0 + a
         self.all_actions.append(action.to(self.device))
Example #10
    def _make_model(self, state_size, action_size, use_cnn):
        """
        Sets up the network model based on whether state data or pixel data is
        provided.
        """

        if use_cnn:
            return QCNNetwork(state_size, action_size,
                              self.seed).to(self.device)
        else:
            return QNetwork(state_size, action_size, self.seed).to(self.device)
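
Most examples on this page construct QNetwork(state_size, action_size, seed), sometimes with explicit hidden-layer sizes. A representative fully connected definition is sketched below; the layer widths and architecture are assumptions, not the model actually used in these projects:

import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Maps a state vector to one Q-value per discrete action."""

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)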
Example #11
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network / Critic
        # Create the network, define the criterion and optimizer
        hidden_layers = [37, 37]
        self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers,
                                        seed).to(device)
        self.qnetwork_optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                             lr=LR_CRIT,
                                             weight_decay=WEIGHT_DECAY)

        # mu-Network / Actor
        # Create the network, define the criterion and optimizer
        hidden_layers = [33, 33]
        self.munetwork_local = ActorPolicy(state_size, action_size,
                                           hidden_layers, seed).to(device)
        self.munetwork_target = ActorPolicy(state_size, action_size,
                                            hidden_layers, seed).to(device)
        self.munetwork_optimizer = optim.Adam(
            self.munetwork_local.parameters(), lr=LR_ACTR)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #12
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
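
Agents of this shape usually push each transition into the buffer and learn on a fixed cadence once enough samples are available. A minimal step() sketch consistent with the fields above; the buffer's add/sample API, the learn signature, and the module-level GAMMA constant are assumptions:

def step(self, state, action, reward, next_state, done):
    # Save the transition, then learn every UPDATE_EVERY time steps.
    self.memory.add(state, action, reward, next_state, done)
    self.t_step = (self.t_step + 1) % UPDATE_EVERY
    if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
        experiences = self.memory.sample()
        self.learn(experiences, GAMMA)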
Example #13
def run_dqn(env,
            num_episodes,
            memory_size,
            num_hidden,
            batch_size,
            discount_factor,
            learn_rate,
            update_target_q,
            max_steps,
            double_dqn=False):
    memory = ReplayMemory(memory_size)

    # continuous action space
    if isinstance(env.action_space, Box):
        dims = env.action_space.shape[0]
        n_out = SPLITS**dims
    # discrete action space
    else:
        n_out = env.action_space.n

    n_in = len(env.observation_space.low)
    model = QNetwork(n_in, n_out, num_hidden)
    target_net = QNetwork(n_in, n_out, num_hidden)

    episode_durations, q_vals, cum_reward = run_episodes(
        train=train,
        model=model,
        memory=memory,
        env=env,
        num_episodes=num_episodes,
        batch_size=batch_size,
        discount_factor=discount_factor,
        learn_rate=learn_rate,
        target_net=target_net,
        update_target_q=update_target_q,
        max_steps=max_steps,
        double_dqn=double_dqn)
    return model, episode_durations, q_vals, cum_reward
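
For Box action spaces this function sizes the output layer as SPLITS**dims, i.e. each continuous dimension is discretized into SPLITS bins and the network picks one joint bin. A hedged sketch of mapping a flat discrete index back to a continuous action under that assumed scheme (the helper name is illustrative):

import itertools
import numpy as np

def index_to_action(index, action_space, splits):
    # Build a splits-point grid per dimension and pick the index-th combination.
    grids = [np.linspace(low, high, splits)
             for low, high in zip(action_space.low, action_space.high)]
    combos = list(itertools.product(*grids))
    return np.array(combos[index])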
Example #14
 def __init__(self, action_size, state_size, config):
     self.action_size = action_size
     self.state_size = state_size
     self.min_action = config["min_action"]
     self.max_action = config["max_action"]
     self.seed = config["seed"]
     self.tau = config["tau"]
     self.gamma = config["gamma"]
     self.batch_size = config["batch_size"]
     if not torch.cuda.is_available():
         config["device"] == "cpu"
     self.device = config["device"]
     self.eval = config["eval"]
     torch.manual_seed(self.seed)
     np.random.seed(self.seed)
     self.vid_path = config["vid_path"]
     print("actions size ", action_size)
     print("actions min ", self.min_action)
     print("actions max ", self.max_action)
     self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
     self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
     self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
     self.target_critic.load_state_dict(self.critic.state_dict())
     self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
     self.alpha = self.log_alpha.exp()
     self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
     #self.policy = SACActor(state_size, action_size).to(self.device)
     self.policy = GaussianPolicy(state_size, action_size, 256).to(self.device)
     self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])
     self.max_timesteps = config["max_episodes_steps"]
     self.episodes = config["episodes"]
     self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.device)
     pathname = config["seed"]
     tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
     self.writer = SummaryWriter(tensorboard_name)
     self.steps = 0
     self.target_entropy = -torch.prod(torch.Tensor(action_size).to(self.device)).item()
Example #15
    def __init__(self, state_size, action_size, config):
        self.env_name = config["env_name"]
        self.state_size = state_size
        self.action_size = action_size
        self.seed = config["seed"]
        self.clip = config["clip"]
        self.device = 'cuda'
        print("Clip ", self.clip)
        print("cuda ", torch.cuda.is_available())
        self.double_dqn = config["DDQN"]
        print("Use double dqn", self.double_dqn)
        self.lr_pre = config["lr_pre"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        print("self tau", self.tau)
        self.gamma = 0.99
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.fc3 = config["fc3_units"]
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2,self.fc3,  self.seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
        
        self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.soft_update(self.q_shift_local, self.q_shift_target, 1)
         
        self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3,  self.seed).to(self.device)
        self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.soft_update(self.R_local, self.R_target, 1) 

        self.expert_q = DQNetwork(state_size, action_size, seed=self.seed).to(self.device)
        self.expert_q.load_state_dict(torch.load('checkpoint.pth'))
        self.memory = Memory(action_size, config["buffer_size"], self.batch_size, self.seed, self.device)
        self.t_step = 0
        self.steps = 0
        self.predicter = Classifier(state_size, action_size, self.seed).to(self.device)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
        pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_fc3_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.fc3, self.seed)
        pathname += "_clip_{}".format(config["clip"])
        pathname += "_tau_{}".format(config["tau"])
        now = datetime.now()    
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname += dt_string
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        print("summery writer ", tensorboard_name)
        self.average_prediction = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.all_actions = []
        for a in range(self.action_size):
            action = torch.Tensor(1) * 0 +  a
            self.all_actions.append(action.to(self.device))
Example #16
    def __init__(self, state_size, action_size, config):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            config (dict): hyperparameters (seed, lr, tau, batch_size, fc1_units, fc2_units, device)
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(config["seed"])
        self.seed = config["seed"]
        self.gamma = 0.99
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.device = config["device"]
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, self.fc1,
                                       self.fc2, self.seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.fc1,
                                        self.fc2, self.seed).to(self.device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)
        self.encoder = Encoder(config).to(self.device)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(),
                                                  self.lr)

        # Replay memory

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #17
    def __init__(self,
                 state_size,
                 action_size,
                 buffer_size,
                 batch_size,
                 gamma,
                 tau,
                 lr,
                 lr_decay,
                 update_every,
                 update_mem_every,
                 update_mem_par_every,
                 experience_per_sampling,
                 seed,
                 epsilon,
                 epsilon_min,
                 eps_decay,
                 compute_weights,
                 hidden_1,
                 hidden_2,
                 ):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_decay = lr_decay
        self.update_every = update_every
        self.experience_per_sampling = experience_per_sampling
        self.update_mem_every = update_mem_every
        self.update_mem_par_every = update_mem_par_every
        random.seed(seed)
        self.seed = seed
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.eps_decay = eps_decay
        self.compute_weights = compute_weights
        self.hidden_1 = hidden_1
        self.hidden_2 = hidden_2
        self.learn_steps = 0


        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.scheduler = StepLR(self.optimizer, step_size=1, gamma=self.lr_decay)


        # Replay memory
        self.memory = PrioritizedReplayBuffer(
                    self.action_size,
                    self.buffer_size,
                    self.batch_size,
                    self.experience_per_sampling,
                    self.seed,
                    self.compute_weights)
        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0
Example #18
def _test_policy(state):
    model = QNetwork(device="cuda")
    action = model(state.to(model.device))
    return action
Example #19
import math

import gym
import torch
import torch.nn.functional as F

from checkpoints.model_checkpoint_backup_config import config
from models import QNetwork

def get_env_configs(config):
    env = gym.make(config["env"])
    config["num_actions"] = env.action_space.n
    config["observation_shape"] = env.observation_space.shape
    return config

if __name__ == '__main__':
    config = get_env_configs(config)

    env = gym.make('CartPole-v1').unwrapped
    net = QNetwork(config)
    print(net.net)
    net.load_state_dict(torch.load("checkpoints/model_checkpoint_backup"))

    high_score = -math.inf
    episode = 0
    num_samples = 0
    while True:
        done = False
        state = env.reset()

        score, frame = 0, 1
        while not done:
            env.render()

            state = torch.tensor(state, dtype=torch.float32)
Example #20
    def __init__(self, state_size, action_size, config):
        self.seed = config["seed"]
        torch.manual_seed(self.seed)
        np.random.seed(seed=self.seed)
        random.seed(self.seed)
        self.env = gym.make(config["env_name"])
        self.env.seed(self.seed)
        self.state_size = state_size
        self.action_size = action_size
        self.clip = config["clip"]
        self.device = 'cuda'
        print("Clip ", self.clip)
        print("cuda ", torch.cuda.is_available())
        self.double_dqn = config["DDQN"]
        print("Use double dqn", self.double_dqn)
        self.lr_pre = config["lr_pre"]
        self.batch_size = config["batch_size"]
        self.lr = config["lr"]
        self.tau = config["tau"]
        print("self tau", self.tau)
        self.gamma = 0.99
        self.target_entropy = -torch.prod(torch.Tensor(action_size).to(self.device)).item()
        self.fc1 = config["fc1_units"]
        self.fc2 = config["fc2_units"]
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optim = optim.Adam([self.log_alpha], lr=config["lr_alpha"])
        self.policy = SACActor(state_size, action_size, self.seed).to(self.device)
        self.policy_optim = optim.Adam(self.policy.parameters(), lr=config["lr_policy"])
        
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
        self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
        
        self.q_shift_local = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
        self.q_shift_target = SQNetwork(state_size, action_size,self.seed, self.fc1, self.fc2).to(self.device)
        self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.soft_update(self.q_shift_local, self.q_shift_target, 1)
         
        self.R_local = SQNetwork(state_size, action_size, self.seed,  self.fc1, self.fc2).to(self.device)
        self.R_target = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.soft_update(self.R_local, self.R_target, 1) 

        self.steps = 0
        self.predicter = Classifier(state_size, action_size, self.seed, 256, 256).to(self.device)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
        pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(self.lr, self.batch_size, self.fc1, self.fc2, self.seed)
        pathname += "_clip_{}".format(config["clip"])
        pathname += "_tau_{}".format(config["tau"])
        now = datetime.now()    
        dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
        pathname += dt_string
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.vid_path = str(config["locexp"]) + '/vid'
        self.writer = SummaryWriter(tensorboard_name)
        print("summery writer ", tensorboard_name)
        self.average_prediction = deque(maxlen=100)
        self.average_same_action = deque(maxlen=100)
        self.all_actions = []
        for a in range(self.action_size):
            action = torch.Tensor(1) * 0 +  a
            self.all_actions.append(action.to(self.device))
Example #21
def train(args):
    chrome_driver_path = args.chrome_driver_path
    checkpoint_path = args.checkpoint_path
    nb_actions = args.nb_actions
    initial_epsilon = args.initial_epsilon
    epsilon = initial_epsilon
    final_epsilon = args.final_epsilon
    gamma = args.gamma
    nb_memory = args.nb_memory
    nb_expolre = args.nb_expolre
    is_debug = args.is_debug
    batch_size = args.batch_size
    nb_observation = args.nb_observation
    desired_fps = args.desired_fps
    is_cuda = True if args.use_cuda and torch.cuda.is_available() else False
    log_frequency = args.log_frequency
    save_frequency = args.save_frequency
    ratio_of_win = args.ratio_of_win
    if args.exploiting:
        nb_observation = -1
        epsilon = final_epsilon

    seed = 22
    np.random.seed(seed)
    memory = deque()
    env = DinoSeleniumEnv(chrome_driver_path, speed=args.game_speed)
    agent = Agent(env)
    game_state = GameState(agent, debug=is_debug)
    qnetwork = QNetwork(nb_actions)
    if is_cuda:
        qnetwork.cuda()
    optimizer = torch.optim.Adam(qnetwork.parameters(), 1e-4)
    tmp_param = next(qnetwork.parameters())
    try:
        m = torch.load(checkpoint_path)
        qnetwork.load_state_dict(m["qnetwork"])
        optimizer.load_state_dict(m["optimizer"])
    except Exception:
        logger.warning("No model found in {}".format(checkpoint_path))
    loss_fcn = torch.nn.MSELoss()
    action_indx = 0  # do nothing as the first action
    screen, reward, is_gameover, score = game_state.get_state(action_indx)
    current_state = np.expand_dims(screen, 0)
    # [IMAGE_CHANNELS,IMAGE_WIDTH,IMAGE_HEIGHT]
    current_state = np.tile(current_state, (IMAGE_CHANNELS, 1, 1))
    initial_state = current_state

    t = 0
    last_time = 0
    sum_scores = 0
    total_loss = 0
    max_score = 0
    qvalues = np.array([0, 0])
    lost_action = []
    win_actions = []
    action_random = 0
    action_greedy = 0
    episodes = 0
    nb_episodes = 0
    if not args.exploiting:
        try:
            t, memory, epsilon, nb_episodes = pickle.load(open(
                "cache.p", "rb"))
        except Exception:
            logger.warning("Could not load cache file! Starting from scratch.")
    try:
        while True:
            qnetwork.eval()
            if np.random.random() < epsilon:  # epsilon greedy
                action_indx = np.random.randint(nb_actions)
                action_random += 1
            else:
                action_greedy += 1
                tensor = torch.from_numpy(current_state).float().unsqueeze(0)
                with torch.no_grad():
                    qvalues = qnetwork(tensor).squeeze()
                _, action_indx = qvalues.max(-1)
                action_indx = action_indx.item()
            if epsilon > final_epsilon and t > nb_observation:
                epsilon -= (initial_epsilon - final_epsilon) / nb_expolre
            screen, reward, is_gameover, score = game_state.get_state(
                action_indx)
            if is_gameover:
                episodes += 1
                nb_episodes += 1
                lost_action.append(action_indx)
                sum_scores += score
            else:
                win_actions.append(action_indx)
            if score > max_score:
                max_score = score
            if last_time:
                fps = 1 / (time.time() - last_time)
                if fps > desired_fps:
                    time.sleep(1 / desired_fps - 1 / fps)
            if last_time and t % log_frequency == 0:
                logger.info('fps: {0}'.format(1 / (time.time() - last_time)))
            last_time = time.time()
            screen = np.expand_dims(screen, 0)
            next_state = np.append(screen,
                                   current_state[:IMAGE_CHANNELS - 1, :, :],
                                   axis=0)
            if not args.exploiting and (is_gameover
                                        or np.random.random() < ratio_of_win):
                memory.append((current_state, action_indx, reward, next_state,
                               is_gameover))
            if len(memory) > nb_memory:
                memory.popleft()
            if nb_observation > 0 and t > nb_observation:
                indxes = np.random.choice(len(memory),
                                          batch_size,
                                          replace=False)
                minibatch = [memory[b] for b in indxes]
                inputs = tmp_param.new(batch_size, IMAGE_CHANNELS, IMAGE_WIDTH,
                                       IMAGE_HEIGHT).zero_()
                targets = tmp_param.new(batch_size, nb_actions).zero_()
                for i, (state_t, action_t, reward_t, state_t1,
                        is_gameover_t1) in enumerate(minibatch):
                    inputs[i] = torch.from_numpy(state_t).float()
                    tensor = inputs[i].unsqueeze(0)
                    with torch.no_grad():
                        qvalues = qnetwork(tensor).squeeze()
                    targets[i] = qvalues
                    if is_gameover_t1:
                        assert reward_t == -1
                        targets[i, action_t] = reward_t
                    else:
                        tensor = torch.from_numpy(state_t1).float().unsqueeze(
                            0)
                        with torch.no_grad():
                            qvalues = qnetwork(tensor).squeeze()
                        qvalues = qvalues.cpu().numpy()
                        targets[i, action_t] = reward_t + gamma * qvalues.max()
                qnetwork.train()
                qnetwork.zero_grad()
                q_values = qnetwork(inputs)
                loss = loss_fcn(q_values, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            current_state = initial_state if is_gameover else next_state
            t += 1
            if t % log_frequency == 0:
                logger.info(
                    "For t {}: mean score is {} max score is {} mean loss: {} number of episode: {}"
                    .format(t, sum_scores / (episodes + 0.1), max_score,
                            total_loss / 1000, episodes))
                logger.info(
                    "t: {} action_index: {} reward: {} max qvalue: {} total number of eposodes so far: {}"
                    .format(t, action_indx, reward, qvalues.max(),
                            nb_episodes))
                tmp = np.array(lost_action)
                dnc = (tmp == 0).sum()
                logger.info(
                    "Lost actions do_nothing: {} jump: {} length of memory {}".
                    format(dnc,
                           len(tmp) - dnc, len(memory)))
                tmp = np.array(win_actions)
                dnc = (tmp == 0).sum()
                logger.info("Win actions do_nothing: {} jump: {}".format(
                    dnc,
                    len(tmp) - dnc))
                logger.info("Greedy action {} Random action {}".format(
                    action_greedy, action_random))
                action_greedy = 0
                action_random = 0
                lost_action = []
                win_actions = []
                if episodes != 0:
                    sum_scores = 0
                total_loss = 0
                episodes = 0
            if t % save_frequency == 0 and not args.exploiting:
                env.pause_game()
                with open("cache.p", "wb") as fh:
                    pickle.dump((t, memory, epsilon, nb_episodes), fh)
                gc.collect()
                torch.save(
                    {
                        "qnetwork": qnetwork.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }, checkpoint_path)
                env.resume_game()
    except KeyboardInterrupt:
        if not args.exploiting:
            torch.save(
                {
                    "qnetwork": qnetwork.state_dict(),
                    "optimizer": optimizer.state_dict()
                }, checkpoint_path)
            with open("cache.p", "wb") as fh:
                pickle.dump((t, memory, epsilon, nb_episodes), fh)