def __init__(self, state_size, action_size, seed, algorithm='DQN'):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        algorithm (str): learning algorithm to use, either 'DQN' or 'DDQN'
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    # Select the learning algorithm
    if algorithm == "DQN":
        self.learn = self.learnDQN
    elif algorithm == "DDQN":
        self.learn = self.learnDDQN
    else:
        raise NotImplementedError('algorithm {} not implemented'.format(algorithm))
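# A minimal sketch of the update rules that learnDQN / learnDDQN typically implement.
# The real method bodies are not part of this snippet, so the experience layout and the
# use of torch.nn.functional as F below are assumptions for illustration only.
def learnDQN(self, experiences, gamma):
    states, actions, rewards, next_states, dones = experiences
    # Plain DQN: the target network both selects and evaluates the next action.
    q_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
    q_targets = rewards + gamma * q_next * (1 - dones)
    q_expected = self.qnetwork_local(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

def learnDDQN(self, experiences, gamma):
    states, actions, rewards, next_states, dones = experiences
    # Double DQN: the local network selects the next action, the target network evaluates it.
    best_actions = self.qnetwork_local(next_states).detach().argmax(1, keepdim=True)
    q_next = self.qnetwork_target(next_states).detach().gather(1, best_actions)
    q_targets = rewards + gamma * q_next * (1 - dones)
    q_expected = self.qnetwork_local(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()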
def __init__(self, env, gamma=0.95, epsilon=1.0, copy_period=1000, lr=0.01, update_period=2):
    """
    gamma: discount factor
    epsilon: exploration/exploitation trade-off
    """
    self.env = env
    self.gamma = gamma
    self.epsilon = epsilon
    self.copy_period = copy_period
    self.update_period = update_period
    self.lr = lr
    self.global_steps = 0

    self.q_network = QNetwork(self.env.action_space.n, lr=lr)
    self.q_network.build(input_shape=(None, 4))
    self.target_network = QNetwork(self.env.action_space.n)
    self.target_network.build(input_shape=(None, 4))

    self.experiences = collections.deque(maxlen=self.MAX_EXPERIENCES)
def __init__(self, env, render, config_info):
    self.env = env
    self.render = render
    self._reset_env()

    # Create run folder to store parameters, figures, and tensorboard logs
    self.path_runs = create_run_folder(config_info)

    # Extract training parameters from yaml config file
    param = load_training_parameters(config_info["config_param"])
    self.train_param = param["training"]

    # Define device
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device in use : {self.device}")

    # Define state and action dimension spaces
    state_dim = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # Define models
    hidden_size = param["model"]["hidden_size"]
    self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)
    self.target_q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)
    self.target_q_net.load_state_dict(self.q_net.state_dict())
    self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to(self.device)

    # Define loss criterion
    self.q_criterion = nn.MSELoss()

    # Define optimizers
    lr = float(param["optimizer"]["learning_rate"])
    self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr)
    self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr)

    # Initialize replay buffer
    self.replay_buffer = ReplayBuffer(param["training"]["replay_size"])
    self.transition = namedtuple(
        "transition",
        field_names=["state", "action", "reward", "done", "next_state"],
    )

    # Useful variables
    self.batch_size = param["training"]["batch_size"]
    self.gamma = param["training"]["gamma"]
    self.tau = param["training"]["tau"]
    self.start_step = param["training"]["start_step"]
    self.max_timesteps = param["training"]["max_timesteps"]
    self.alpha = param["training"]["alpha"]
def __init__(self, action_size, state_size, config):
    self.seed = config["seed"]
    torch.manual_seed(self.seed)
    np.random.seed(seed=self.seed)
    random.seed(self.seed)
    self.env = gym.make(config["env_name"])
    self.env.seed(self.seed)
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    self.env.action_space.seed(self.seed)
    self.action_size = action_size
    self.state_size = state_size
    self.min_action = config["min_action"]
    self.max_action = config["max_action"]
    self.tau = config["tau"]
    self.gamma = config["gamma"]
    self.batch_size = config["batch_size"]
    if not torch.cuda.is_available():
        config["device"] = "cpu"
    self.device = config["device"]
    self.eval = config["eval"]
    self.vid_path = config["vid_path"]
    print("actions size ", action_size)
    print("actions min ", self.min_action)
    print("actions max ", self.max_action)
    fc1 = config["fc1_units"]
    fc2 = config["fc2_units"]
    self.actor = Actor(state_size, action_size, self.seed, fc1, fc2).to(self.device)
    self.optimizer_a = torch.optim.Adam(self.actor.parameters(), config["lr_actor"])
    self.target_actor = Actor(state_size, action_size, self.seed, fc1, fc2).to(self.device)
    self.target_actor.load_state_dict(self.actor.state_dict())
    self.critic = QNetwork(state_size, action_size, self.seed, fc1, fc2).to(self.device)
    self.optimizer_q = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
    self.target_critic = QNetwork(state_size, action_size, self.seed, fc1, fc2).to(self.device)
    self.target_critic.load_state_dict(self.critic.state_dict())
    self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(action_size), dimension=action_size)
    self.max_timesteps = config["max_episodes_steps"]
    self.noise.reset()
    self.episodes = config["episodes"]
    self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.seed, self.device)
    pathname = str(config["seed"]) + str(dt_string)
    tensorboard_name = str(config["res_path"]) + '/runs/' + "DDPG" + str(pathname)
    self.writer = SummaryWriter(tensorboard_name)
    self.steps = 0
def __init__(self, action_size, state_size, config):
    self.seed = config["seed"]
    torch.manual_seed(self.seed)
    np.random.seed(seed=self.seed)
    self.env = gym.make(config["env_name"])
    self.env = FrameStack(self.env, config)
    self.env.seed(self.seed)
    self.action_size = action_size
    self.state_size = state_size
    self.tau = config["tau"]
    self.gamma = config["gamma"]
    self.batch_size = config["batch_size"]
    self.lr = config["lr"]
    self.history_length = config["history_length"]
    self.size = config["size"]
    if not torch.cuda.is_available():
        config["device"] = "cpu"
    self.device = config["device"]
    self.eval = config["eval"]
    self.vid_path = config["vid_path"]
    print("actions size ", action_size)
    self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
    self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.target_critic.load_state_dict(self.critic.state_dict())
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha = self.log_alpha.exp()
    self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
    self.policy = SACActor(state_size, action_size).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])
    self.encoder = Encoder(config).to(self.device)
    self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr)
    self.episodes = config["episodes"]
    self.memory = ReplayBuffer((self.history_length, self.size, self.size), (1, ), config["buffer_size"], config["image_pad"], self.seed, self.device)
    pathname = config["seed"]
    tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
    self.writer = SummaryWriter(tensorboard_name)
    self.steps = 0
    self.target_entropy = -torch.prod(torch.Tensor([action_size]).to(self.device)).item()
def main(config: Config):
    print(config)
    # Let's run it!
    for i in range(config.num_experiments):
        experiment_seed = config.seed + i * config.num_episodes
        memory = ReplayMemory(config.replay_memory_size)
        # We will seed the algorithm (for reproducibility).
        random.seed(experiment_seed)
        torch.manual_seed(experiment_seed)
        env.seed(experiment_seed)
        q_model = QNetwork(config.device, config.num_hidden_q_model)
        curiousity_model = StatePredictor(2, 3, config.num_hidden_curiosity_model, config.device)
        # Use a separate loop variable so the outer experiment counter is not shadowed.
        for experiment_number in range(20, 29):
            episode_durations, episode_loss = run_episodes(
                train, q_model, curiousity_model, memory, env,
                experiment_seed, config, experiment_number=experiment_number)
            # print(experiment_number, episode_durations, episode_loss)
        print("Finished experiment {}/{}".format(i + 1, config.num_experiments))
def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr,
             hidden_1, hidden_2, update_every, epsilon, epsilon_min, eps_decay, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.lr = lr
    self.update_every = update_every
    random.seed(seed)
    self.seed = seed
    self.learn_steps = 0
    self.epsilon = epsilon
    self.epsilon_min = epsilon_min
    self.eps_decay = eps_decay

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

    # Replay memory
    self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
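# A minimal sketch of how the stored epsilon / epsilon_min / eps_decay values are
# typically used for epsilon-greedy action selection. The actual act() method is not
# part of this snippet, so the method name and the multiplicative decay are assumptions.
def act(self, state):
    # Decay epsilon towards its minimum after each action.
    self.epsilon = max(self.epsilon_min, self.epsilon * self.eps_decay)
    if random.random() < self.epsilon:
        return random.randrange(self.action_size)
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    self.qnetwork_local.eval()
    with torch.no_grad():
        action_values = self.qnetwork_local(state)
    self.qnetwork_local.train()
    return int(action_values.argmax(dim=1).item())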
def __init__(self, state_size, action_size, config):
    self.seed = config["seed"]
    torch.manual_seed(self.seed)
    np.random.seed(seed=self.seed)
    random.seed(self.seed)
    self.env = gym.make(config["env_name"])
    self.env.seed(self.seed)
    self.state_size = state_size
    self.action_size = action_size
    self.clip = config["clip"]
    self.device = 'cuda'
    self.double_dqn = config["DDQN"]
    self.lr_pre = config["lr_pre"]
    self.batch_size = config["batch_size"]
    self.lr = config["lr"]
    self.tau = config["tau"]
    self.gamma = 0.99
    self.fc1 = config["fc1_units"]
    self.fc2 = config["fc2_units"]
    self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
    self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
    self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
    self.soft_update(self.q_shift_local, self.q_shift_target, 1)
    self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
    self.soft_update(self.R_local, self.R_target, 1)
    self.steps = 0
    self.predicter = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
    pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(
        self.lr, self.batch_size, self.fc1, self.fc2, self.seed)
    pathname += "_clip_{}".format(config["clip"])
    pathname += "_tau_{}".format(config["tau"])
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    pathname += dt_string
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    self.vid_path = str(config["locexp"]) + '/vid'
    self.writer = SummaryWriter(tensorboard_name)
    self.average_prediction = deque(maxlen=100)
    self.average_same_action = deque(maxlen=100)
    self.all_actions = []
    for a in range(self.action_size):
        # Build a one-element tensor holding the action index a.
        action = torch.zeros(1) + a
        self.all_actions.append(action.to(self.device))
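# The constructor above calls self.soft_update(local, target, 1) to copy the local
# weights into the target network. A minimal sketch of the usual Polyak-averaging
# helper is shown below; the actual implementation is not part of this snippet, so
# treat it as an assumption about what soft_update does.
def soft_update(self, local_model, target_model, tau):
    """Soft-update target parameters: theta_target = tau * theta_local + (1 - tau) * theta_target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)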
def __init__(self, state_size, action_size, action_dim, config):
    self.state_size = state_size
    self.action_size = action_size
    self.action_dim = action_dim
    self.seed = 0
    self.device = 'cuda'
    self.batch_size = config["batch_size"]
    self.lr = 0.005
    self.gamma = 0.99
    self.q_shift_local = QNetwork(state_size, action_size, self.seed).to(self.device)
    self.q_shift_target = QNetwork(state_size, action_size, self.seed).to(self.device)
    self.Q_local = QNetwork(state_size, action_size, self.seed).to(self.device)
    self.Q_target = QNetwork(state_size, action_size, self.seed).to(self.device)
    self.R_local = RNetwork(state_size, action_size, self.seed).to(self.device)
    self.R_target = RNetwork(state_size, action_size, self.seed).to(self.device)
    self.policy = PolicyNetwork(state_size, action_size, self.seed).to(self.device)
    self.predicter = Classifier(state_size, action_dim, self.seed).to(self.device)
    # self.criterion = nn.CrossEntropyLoss()
    # optimizer
    self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
    self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr)
    self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
    self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
    self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr)
    pathname = "lr {} batch_size {} seed {}".format(self.lr, self.batch_size, self.seed)
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    self.writer = SummaryWriter(tensorboard_name)
    self.steps = 0
    self.ratio = 1. / action_dim
    self.all_actions = []
    for a in range(self.action_dim):
        # Build a one-element tensor holding the action index a.
        action = torch.zeros(1) + a
        self.all_actions.append(action.to(self.device))
def _make_model(self, state_size, action_size, use_cnn):
    """
    Sets up the network model based on whether state data or pixel data is provided.
    """
    if use_cnn:
        return QCNNetwork(state_size, action_size, self.seed).to(self.device)
    else:
        return QNetwork(state_size, action_size, self.seed).to(self.device)
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network / Critic
    # Create the network, define the criterion and optimizer
    hidden_layers = [37, 37]
    self.qnetwork_local = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, hidden_layers, seed).to(device)
    self.qnetwork_optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR_CRIT, weight_decay=WEIGHT_DECAY)

    # mu-Network / Actor
    # Create the network, define the criterion and optimizer
    hidden_layers = [33, 33]
    self.munetwork_local = ActorPolicy(state_size, action_size, hidden_layers, seed).to(device)
    self.munetwork_target = ActorPolicy(state_size, action_size, hidden_layers, seed).to(device)
    self.munetwork_optimizer = optim.Adam(self.munetwork_local.parameters(), lr=LR_ACTR)

    # Noise process
    self.noise = OUNoise(action_size, seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def run_dqn(env, num_episodes, memory_size, num_hidden, batch_size, discount_factor,
            learn_rate, update_target_q, max_steps, double_dqn=False):
    memory = ReplayMemory(memory_size)

    # continuous action space
    if isinstance(env.action_space, Box):
        dims = env.action_space.shape[0]
        n_out = SPLITS**dims
    # discrete action space
    else:
        n_out = env.action_space.n

    n_in = len(env.observation_space.low)
    model = QNetwork(n_in, n_out, num_hidden)
    target_net = QNetwork(n_in, n_out, num_hidden)

    episode_durations, q_vals, cum_reward = run_episodes(
        train=train,
        model=model,
        memory=memory,
        env=env,
        num_episodes=num_episodes,
        batch_size=batch_size,
        discount_factor=discount_factor,
        learn_rate=learn_rate,
        target_net=target_net,
        update_target_q=update_target_q,
        max_steps=max_steps,
        double_dqn=double_dqn)

    return model, episode_durations, q_vals, cum_reward
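# run_dqn maps a continuous Box action space onto SPLITS**dims discrete network outputs.
# A minimal sketch of how a flat discrete index could be decoded back into a continuous
# action is shown below; the helper name, the grid layout, and the use of numpy are
# assumptions, since the actual decoding code is not part of this snippet.
import numpy as np

def index_to_action(index, action_space, splits):
    """Decode a flat discrete index into one continuous action value per dimension."""
    low, high = action_space.low, action_space.high
    dims = action_space.shape[0]
    action = np.zeros(dims)
    for d in range(dims):
        bin_d = index % splits          # which of the `splits` bins this dimension uses
        index //= splits
        # Place the action at the centre of an evenly spaced grid over [low, high].
        action[d] = low[d] + (bin_d + 0.5) * (high[d] - low[d]) / splits
    return action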
def __init__(self, action_size, state_size, config):
    self.action_size = action_size
    self.state_size = state_size
    self.min_action = config["min_action"]
    self.max_action = config["max_action"]
    self.seed = config["seed"]
    self.tau = config["tau"]
    self.gamma = config["gamma"]
    self.batch_size = config["batch_size"]
    if not torch.cuda.is_available():
        config["device"] = "cpu"
    self.device = config["device"]
    self.eval = config["eval"]
    torch.manual_seed(self.seed)
    np.random.seed(self.seed)
    self.vid_path = config["vid_path"]
    print("actions size ", action_size)
    print("actions min ", self.min_action)
    print("actions max ", self.max_action)
    self.critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.q_optim = torch.optim.Adam(self.critic.parameters(), config["lr_critic"])
    self.target_critic = QNetwork(state_size, action_size, config["fc1_units"], config["fc2_units"]).to(self.device)
    self.target_critic.load_state_dict(self.critic.state_dict())
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha = self.log_alpha.exp()
    self.alpha_optim = Adam([self.log_alpha], lr=config["lr_alpha"])
    # self.policy = SACActor(state_size, action_size).to(self.device)
    self.policy = GaussianPolicy(state_size, action_size, 256).to(self.device)
    self.policy_optim = Adam(self.policy.parameters(), lr=config["lr_policy"])
    self.max_timesteps = config["max_episodes_steps"]
    self.episodes = config["episodes"]
    self.memory = ReplayBuffer((state_size, ), (action_size, ), config["buffer_size"], self.device)
    pathname = config["seed"]
    tensorboard_name = str(config["res_path"]) + '/runs/' + str(pathname)
    self.writer = SummaryWriter(tensorboard_name)
    self.steps = 0
    self.target_entropy = -torch.prod(torch.Tensor([action_size]).to(self.device)).item()
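# The log_alpha / target_entropy pair above is what SAC uses for automatic entropy
# temperature tuning. A minimal sketch of the usual temperature update is shown below;
# the method name and the log_pi argument are assumptions, since the actual update code
# is not part of this snippet.
def update_temperature(self, log_pi):
    """One gradient step on alpha so the policy entropy tracks target_entropy."""
    alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
    self.alpha_optim.zero_grad()
    alpha_loss.backward()
    self.alpha_optim.step()
    self.alpha = self.log_alpha.exp()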
def __init__(self, state_size, action_size, config):
    self.env_name = config["env_name"]
    self.state_size = state_size
    self.action_size = action_size
    self.seed = config["seed"]
    self.clip = config["clip"]
    self.device = 'cuda'
    print("Clip ", self.clip)
    print("cuda ", torch.cuda.is_available())
    self.double_dqn = config["DDQN"]
    print("Use double dqn", self.double_dqn)
    self.lr_pre = config["lr_pre"]
    self.batch_size = config["batch_size"]
    self.lr = config["lr"]
    self.tau = config["tau"]
    print("self tau", self.tau)
    self.gamma = 0.99
    self.fc1 = config["fc1_units"]
    self.fc2 = config["fc2_units"]
    self.fc3 = config["fc3_units"]
    self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
    self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
    self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
    self.q_shift_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
    self.q_shift_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
    self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
    self.soft_update(self.q_shift_local, self.q_shift_target, 1)
    self.R_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
    self.R_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.fc3, self.seed).to(self.device)
    self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
    self.soft_update(self.R_local, self.R_target, 1)
    self.expert_q = DQNetwork(state_size, action_size, seed=self.seed).to(self.device)
    self.expert_q.load_state_dict(torch.load('checkpoint.pth'))
    self.memory = Memory(action_size, config["buffer_size"], self.batch_size, self.seed, self.device)
    self.t_step = 0
    self.steps = 0
    self.predicter = Classifier(state_size, action_size, self.seed).to(self.device)
    self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
    pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_fc3_{}_seed_{}".format(
        self.lr, self.batch_size, self.fc1, self.fc2, self.fc3, self.seed)
    pathname += "_clip_{}".format(config["clip"])
    pathname += "_tau_{}".format(config["tau"])
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    pathname += dt_string
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    self.writer = SummaryWriter(tensorboard_name)
    print("summary writer ", tensorboard_name)
    self.average_prediction = deque(maxlen=100)
    self.average_same_action = deque(maxlen=100)
    self.all_actions = []
    for a in range(self.action_size):
        # Build a one-element tensor holding the action index a.
        action = torch.zeros(1) + a
        self.all_actions.append(action.to(self.device))
def __init__(self, state_size, action_size, config):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        config (dict): training configuration (seed, batch_size, lr, tau, layer sizes, device)
    """
    self.state_size = state_size
    self.action_size = action_size
    random.seed(config["seed"])
    self.seed = config["seed"]
    self.gamma = 0.99
    self.batch_size = config["batch_size"]
    self.lr = config["lr"]
    self.tau = config["tau"]
    self.fc1 = config["fc1_units"]
    self.fc2 = config["fc2_units"]
    self.device = config["device"]

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.qnetwork_target = QNetwork(state_size, action_size, self.fc1, self.fc2, self.seed).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
    self.encoder = Encoder(config).to(self.device)
    self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), self.lr)

    # Replay memory
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr,
             lr_decay, update_every, update_mem_every, update_mem_par_every,
             experience_per_sampling, seed, epsilon, epsilon_min, eps_decay,
             compute_weights, hidden_1, hidden_2):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.lr_decay = lr_decay
    self.update_every = update_every
    self.experience_per_sampling = experience_per_sampling
    self.update_mem_every = update_mem_every
    self.update_mem_par_every = update_mem_par_every
    random.seed(seed)
    self.seed = seed
    self.epsilon = epsilon
    self.epsilon_min = epsilon_min
    self.eps_decay = eps_decay
    self.compute_weights = compute_weights
    self.hidden_1 = hidden_1
    self.hidden_2 = hidden_2
    self.learn_steps = 0

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_1, hidden_2).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
    self.scheduler = StepLR(self.optimizer, step_size=1, gamma=self.lr_decay)

    # Replay memory
    self.memory = PrioritizedReplayBuffer(
        self.action_size,
        self.buffer_size,
        self.batch_size,
        self.experience_per_sampling,
        self.seed,
        self.compute_weights)
    # Initialize time step (for updating every UPDATE_NN_EVERY steps)
    self.t_step_nn = 0
    # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
    self.t_step_mem_par = 0
    # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
    self.t_step_mem = 0
def _test_policy(state):
    model = QNetwork(device="cuda")
    action = model(state.to(model.device))
    return action
import math

import gym
import torch
import torch.nn.functional as F

from checkpoints.model_checkpoint_backup_config import config
from models import QNetwork


def get_env_configs(config):
    env = gym.make(config["env"])
    config["num_actions"] = env.action_space.n
    config["observation_shape"] = env.observation_space.shape
    return config


if __name__ == '__main__':
    config = get_env_configs(config)
    env = gym.make('CartPole-v1').unwrapped
    net = QNetwork(config)
    print(net.net)
    net.load_state_dict(torch.load("checkpoints/model_checkpoint_backup"))
    high_score = -math.inf
    episode = 0
    num_samples = 0
    while True:
        done = False
        state = env.reset()
        score, frame = 0, 1
        while not done:
            env.render()
            state = torch.tensor(state, dtype=torch.float32)
def __init__(self, state_size, action_size, config):
    self.seed = config["seed"]
    torch.manual_seed(self.seed)
    np.random.seed(seed=self.seed)
    random.seed(self.seed)
    self.env = gym.make(config["env_name"])
    self.env.seed(self.seed)
    self.state_size = state_size
    self.action_size = action_size
    self.clip = config["clip"]
    self.device = 'cuda'
    print("Clip ", self.clip)
    print("cuda ", torch.cuda.is_available())
    self.double_dqn = config["DDQN"]
    print("Use double dqn", self.double_dqn)
    self.lr_pre = config["lr_pre"]
    self.batch_size = config["batch_size"]
    self.lr = config["lr"]
    self.tau = config["tau"]
    print("self tau", self.tau)
    self.gamma = 0.99
    self.target_entropy = -torch.prod(torch.Tensor([action_size]).to(self.device)).item()
    self.fc1 = config["fc1_units"]
    self.fc2 = config["fc2_units"]
    self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
    self.alpha = self.log_alpha.exp()
    self.alpha_optim = optim.Adam([self.log_alpha], lr=config["lr_alpha"])
    self.policy = SACActor(state_size, action_size, self.seed).to(self.device)
    self.policy_optim = optim.Adam(self.policy.parameters(), lr=config["lr_policy"])
    self.qnetwork_local = QNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
    self.qnetwork_target = QNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
    self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)
    self.q_shift_local = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
    self.q_shift_target = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
    self.optimizer_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
    self.soft_update(self.q_shift_local, self.q_shift_target, 1)
    self.R_local = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
    self.R_target = SQNetwork(state_size, action_size, self.seed, self.fc1, self.fc2).to(self.device)
    self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
    self.soft_update(self.R_local, self.R_target, 1)
    self.steps = 0
    self.predicter = Classifier(state_size, action_size, self.seed, 256, 256).to(self.device)
    self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr_pre)
    pathname = "lr_{}_batch_size_{}_fc1_{}_fc2_{}_seed_{}".format(
        self.lr, self.batch_size, self.fc1, self.fc2, self.seed)
    pathname += "_clip_{}".format(config["clip"])
    pathname += "_tau_{}".format(config["tau"])
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    pathname += dt_string
    tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
    self.vid_path = str(config["locexp"]) + '/vid'
    self.writer = SummaryWriter(tensorboard_name)
    print("summary writer ", tensorboard_name)
    self.average_prediction = deque(maxlen=100)
    self.average_same_action = deque(maxlen=100)
    self.all_actions = []
    for a in range(self.action_size):
        # Build a one-element tensor holding the action index a.
        action = torch.zeros(1) + a
        self.all_actions.append(action.to(self.device))
def train(args):
    chrome_driver_path = args.chrome_driver_path
    checkpoint_path = args.checkpoint_path
    nb_actions = args.nb_actions
    initial_epsilon = args.initial_epsilon
    epsilon = initial_epsilon
    final_epsilon = args.final_epsilon
    gamma = args.gamma
    nb_memory = args.nb_memory
    nb_expolre = args.nb_expolre
    is_debug = args.is_debug
    batch_size = args.batch_size
    nb_observation = args.nb_observation
    desired_fps = args.desired_fps
    is_cuda = True if args.use_cuda and torch.cuda.is_available() else False
    log_frequency = args.log_frequency
    save_frequency = args.save_frequency
    ratio_of_win = args.ratio_of_win
    if args.exploiting:
        nb_observation = -1
        epsilon = final_epsilon
    seed = 22
    np.random.seed(seed)
    memory = deque()
    env = DinoSeleniumEnv(chrome_driver_path, speed=args.game_speed)
    agent = Agent(env)
    game_state = GameState(agent, debug=is_debug)
    qnetwork = QNetwork(nb_actions)
    if is_cuda:
        qnetwork.cuda()
    optimizer = torch.optim.Adam(qnetwork.parameters(), 1e-4)
    tmp_param = next(qnetwork.parameters())
    try:
        m = torch.load(checkpoint_path)
        qnetwork.load_state_dict(m["qnetwork"])
        optimizer.load_state_dict(m["optimizer"])
    except Exception:
        logger.warning("No model found in {}".format(checkpoint_path))
    loss_fcn = torch.nn.MSELoss()
    action_indx = 0  # do nothing as the first action
    screen, reward, is_gameover, score = game_state.get_state(action_indx)
    current_state = np.expand_dims(screen, 0)  # [IMAGE_CHANNELS, IMAGE_WIDTH, IMAGE_HEIGHT]
    current_state = np.tile(current_state, (IMAGE_CHANNELS, 1, 1))
    initial_state = current_state
    t = 0
    last_time = 0
    sum_scores = 0
    total_loss = 0
    max_score = 0
    qvalues = np.array([0, 0])
    lost_action = []
    win_actions = []
    action_random = 0
    action_greedy = 0
    episodes = 0
    nb_episodes = 0
    if not args.exploiting:
        try:
            t, memory, epsilon, nb_episodes = pickle.load(open("cache.p", "rb"))
        except Exception:
            logger.warning("Could not load cache file! Starting from scratch.")
    try:
        while True:
            qnetwork.eval()
            if np.random.random() < epsilon:  # epsilon greedy
                action_indx = np.random.randint(nb_actions)
                action_random += 1
            else:
                action_greedy += 1
                tensor = torch.from_numpy(current_state).float().unsqueeze(0)
                with torch.no_grad():
                    qvalues = qnetwork(tensor).squeeze()
                _, action_indx = qvalues.max(-1)
                action_indx = action_indx.item()
            if epsilon > final_epsilon and t > nb_observation:
                epsilon -= (initial_epsilon - final_epsilon) / nb_expolre
            screen, reward, is_gameover, score = game_state.get_state(action_indx)
            if is_gameover:
                episodes += 1
                nb_episodes += 1
                lost_action.append(action_indx)
                sum_scores += score
            else:
                win_actions.append(action_indx)
            if score > max_score:
                max_score = score
            if last_time:
                fps = 1 / (time.time() - last_time)
                if fps > desired_fps:
                    time.sleep(1 / desired_fps - 1 / fps)
            if last_time and t % log_frequency == 0:
                logger.info('fps: {0}'.format(1 / (time.time() - last_time)))
            last_time = time.time()
            screen = np.expand_dims(screen, 0)
            next_state = np.append(screen, current_state[:IMAGE_CHANNELS - 1, :, :], axis=0)
            if not args.exploiting and (is_gameover or np.random.random() < ratio_of_win):
                memory.append((current_state, action_indx, reward, next_state, is_gameover))
                if len(memory) > nb_memory:
                    memory.popleft()
            if nb_observation > 0 and t > nb_observation:
                indxes = np.random.choice(len(memory), batch_size, replace=False)
                minibatch = [memory[b] for b in indxes]
                inputs = tmp_param.new(batch_size, IMAGE_CHANNELS, IMAGE_WIDTH, IMAGE_HEIGHT).zero_()
                targets = tmp_param.new(batch_size, nb_actions).zero_()
                for i, (state_t, action_t, reward_t, state_t1, is_gameover_t1) in enumerate(minibatch):
                    inputs[i] = torch.from_numpy(state_t).float()
                    tensor = inputs[i].unsqueeze(0)
                    with torch.no_grad():
                        qvalues = qnetwork(tensor).squeeze()
                    targets[i] = qvalues
                    if is_gameover_t1:
                        assert reward_t == -1
                        targets[i, action_t] = reward_t
                    else:
                        tensor = torch.from_numpy(state_t1).float().unsqueeze(0)
                        with torch.no_grad():
                            qvalues = qnetwork(tensor).squeeze()
                        qvalues = qvalues.cpu().numpy()
                        targets[i, action_t] = reward_t + gamma * qvalues.max()
                qnetwork.train()
                qnetwork.zero_grad()
                q_values = qnetwork(inputs)
                loss = loss_fcn(q_values, targets)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            current_state = initial_state if is_gameover else next_state
            t += 1
            if t % log_frequency == 0:
                logger.info(
                    "For t {}: mean score is {} max score is {} mean loss: {} number of episode: {}"
                    .format(t, sum_scores / (episodes + 0.1), max_score, total_loss / 1000, episodes))
                logger.info(
                    "t: {} action_index: {} reward: {} max qvalue: {} total number of episodes so far: {}"
                    .format(t, action_indx, reward, qvalues.max(), nb_episodes))
                tmp = np.array(lost_action)
                dnc = (tmp == 0).sum()
                logger.info(
                    "Lost actions do_nothing: {} jump: {} length of memory {}".format(
                        dnc, len(tmp) - dnc, len(memory)))
                tmp = np.array(win_actions)
                dnc = (tmp == 0).sum()
                logger.info("Win actions do_nothing: {} jump: {}".format(dnc, len(tmp) - dnc))
                logger.info("Greedy action {} Random action {}".format(action_greedy, action_random))
                action_greedy = 0
                action_random = 0
                lost_action = []
                win_actions = []
                if episodes != 0:
                    sum_scores = 0
                total_loss = 0
                episodes = 0
            if t % save_frequency == 0 and not args.exploiting:
                env.pause_game()
                with open("cache.p", "wb") as fh:
                    pickle.dump((t, memory, epsilon, nb_episodes), fh)
                gc.collect()
                torch.save(
                    {
                        "qnetwork": qnetwork.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }, checkpoint_path)
                env.resume_game()
    except KeyboardInterrupt:
        if not args.exploiting:
            torch.save(
                {
                    "qnetwork": qnetwork.state_dict(),
                    "optimizer": optimizer.state_dict()
                }, checkpoint_path)
            with open("cache.p", "wb") as fh:
                pickle.dump((t, memory, epsilon, nb_episodes), fh)