def __init__(self, state_size: int, action_size: int, replay_buffer: ReplayMemory, seed: int, batch_size=BATCH_SIZE, update_every=UPDATE_EVERY, tau=TAU, gamma=GAMMA):
    """Set up the DQN agent: hyperparameters, local/target networks, optimizer, replay memory."""
    # Problem dimensions; random.seed returns None (it seeds the global RNG).
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Learning hyperparameters.
    self.batch_size = batch_size
    self.tau = tau
    self.update_target_every = update_every
    self.gamma = gamma

    # Online and target Q-networks on the configured device.
    self.qnet_local = DQN(state_size, action_size, seed).to(device)
    self.qnet_target = DQN(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=LR)
    self.max_gradient_norm = float('inf')  # effectively disables gradient clipping

    # Experience replay and step counter for periodic target updates.
    self.memory = replay_buffer
    self.t_step = 0
def load_and_test(opt):
    """Load two saved DQN players and print head-to-head test statistics."""
    # Player 1 network from the given checkpoint path.
    netp1 = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    netp1.load(opt.load_path)

    # Player 2 checkpoint path: identical to player 1's except the character
    # 11 positions from the end is replaced by '2'.
    chars = list(opt.load_path)
    print(opt.load_path)
    chars[-11] = '2'
    load_path2 = "".join(chars)
    print(load_path2)
    netp2 = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    netp2.load(load_path2)

    if opt.player == 1:
        # Our player is p1: `w` is p1's win rate.
        r1, r2, w, d = test_ep_pvp(netp1, netp2, opt.num_test, opt.eps, render=opt.render)
        print('p1 average reward:', r1)
        print('p2 average reward:', r2)
        print('p1 win rate:', w)
        print('p2 win rate:', 1 - w - d)
        print('draw rate:', d)
    elif opt.player == 2:
        # Our player is p2: networks swap sides, so `w` is p2's win rate.
        r2, r1, w, d = test_ep_pvp(netp2, netp1, opt.num_test, opt.eps, render=opt.render)
        print('p1 average reward:', r1)
        print('p2 average reward:', r2)
        print('p1 win rate:', 1 - w - d)
        print('p2 win rate:', w)
        print('draw rate:', d)
def __init__(self):
    """Build the agent: YAML config, online/target DQNs, observation pool, optimizer."""
    self.mode = "train"

    # Load configuration and derive vocab / answer-type fields from it.
    with open("config.yaml") as reader:
        self.config = yaml.safe_load(reader)
    print(self.config)
    self.load_config()

    # Online network is trained; the target network mirrors it and is frozen.
    self.online_net = DQN(config=self.config, word_vocab=self.word_vocab,
                          char_vocab=self.char_vocab, answer_type=self.answer_type)
    self.target_net = DQN(config=self.config, word_vocab=self.word_vocab,
                          char_vocab=self.char_vocab, answer_type=self.answer_type)
    self.online_net.train()
    self.target_net.train()
    self.update_target_net()
    for param in self.target_net.parameters():
        param.requires_grad = False
    if self.use_cuda:
        self.online_net.cuda()
        self.target_net.cuda()

    self.naozi = ObservationPool(capacity=self.naozi_capacity)

    # Optimizer over the online network only.
    optim_cfg = self.config['training']['optimizer']
    self.optimizer = torch.optim.Adam(self.online_net.parameters(),
                                      lr=optim_cfg['learning_rate'])
    self.clip_grad_norm = optim_cfg['clip_grad_norm']
def __init__(self, args, env):
    """Distributional DQN agent: value-distribution support, online net (optionally warm-started), frozen target net."""
    self.action_space = env.action_space()

    # Categorical value-distribution parameters.
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)

    # Learning hyperparameters.
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount

    # Online network; optionally restored from a checkpoint. Tensors are
    # always loaded onto the CPU first and moved to GPU if necessary.
    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model and os.path.isfile(args.model):
        self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
    self.online_net.train()

    # Target network: synced from the online net, gradients disabled.
    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)
def __init__(self, args, state_size, action_size):
    """Initialize an Agent object.

    Params
    ======
        args (class defined on the notebook): hyperparameters for the agent
        state_size (int): dimension of each state
        action_size (int): dimension of each action
    """
    self.state_size = state_size
    self.action_size = action_size
    self.params = args

    # Deep Q-Network: pick the architecture once, then build both copies.
    net_cls = DQN_NoisyNet if args.use_NoisyNet else DQN
    self.DQN_local = net_cls(args, state_size, action_size).to(args.device)
    self.DQN_target = net_cls(args, state_size, action_size).to(args.device)
    self.optimizer = optim.Adam(self.DQN_local.parameters(), lr=args.lr, eps=args.adam_eps)

    # Replay memory and time step (target updated every args.target_update steps).
    self.memory = ReplayBuffer(args, action_size)
    self.t_step = 0
def __init__(self, args, env):
    """Distributional DQN agent with prioritized-replay hyperparameters; CUDA-optional."""
    self.action_space = env.action_space()

    # Distribution support over [V_min, V_max] with `atoms` bins.
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)

    # Learning hyperparameters.
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.priority_exponent = args.priority_exponent
    self.max_gradient_norm = args.max_gradient_norm

    # Policy network, optionally restored from a checkpoint.
    self.policy_net = DQN(args, self.action_space)
    if args.model and os.path.isfile(args.model):
        self.policy_net.load_state_dict(torch.load(args.model))
    self.policy_net.train()

    # Target network mirrors the policy net and is kept in eval mode.
    self.target_net = DQN(args, self.action_space)
    self.update_target_net()
    self.target_net.eval()

    self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)

    # Move networks and support to the GPU when requested.
    if args.cuda:
        self.policy_net.cuda()
        self.target_net.cuda()
        self.support = self.support.cuda()
def __init__(self):
    """Trainer setup from the module-level `args`: hyperparameters, main/target DQNs, replay/priority deques, logging."""
    # Mirror the relevant CLI arguments onto the instance, one attribute each.
    for name in ('device', 'batch_size', 'lr', 'history_size', 'replay_size',
                 'width', 'height', 'hidden_size', 'action_size',
                 'update_cycle', 'log_interval', 'actor_num'):
        setattr(self, name, getattr(args, name))

    # Prioritized-replay constants and bookkeeping.
    self.alpha = 0.7
    self.beta_init = 0.4
    self.beta = self.beta_init
    self.beta_increment = 1e-6
    self.e = 1e-6
    self.dis = 0.99
    self.start_epoch = 0

    # Main and target networks plus optimizer.
    self.mainDQN = DQN(self.history_size, self.hidden_size, self.action_size).to(self.device)
    self.targetDQN = DQN(self.history_size, self.hidden_size, self.action_size).to(self.device)
    self.update_target_model()
    self.optimizer = optim.Adam(self.mainDQN.parameters(), lr=args.lr)

    # Bounded replay memory with a parallel priority queue.
    self.replay_memory = deque(maxlen=self.replay_size)
    self.priority = deque(maxlen=self.replay_size)

    # Resume from a previous run when a real (non-sentinel) model id is given.
    if args.load_model != '000000000000':
        self.log = args.log_directory + args.load_model + '/'
        args.time_stamp = args.load_model[:12]
        args.start_epoch = self.load_model()

    # Final log directory uses the module-level `config` suffix.
    self.log = args.log_directory + args.time_stamp + config + '/'
    self.writer = SummaryWriter(self.log)
def __init__(self, args, env):
    """Rainbow agent: distributional support, pretrained-model loading with legacy key remapping, frozen target net."""
    self.action_space = env.action_space()

    # Categorical value-distribution parameters.
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.norm_clip = args.norm_clip

    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model:  # load pretrained model if provided
        if not os.path.isfile(args.model):
            raise FileNotFoundError(args.model)  # incorrect model path provided
        # Always load tensors onto CPU by default; they move to GPU later if necessary.
        state_dict = torch.load(args.model, map_location='cpu')
        if 'conv1.weight' in state_dict.keys():
            # Re-map old pretrained-model keys to the new layer names,
            # removing the old keys for a strict load_state_dict.
            remap = (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'),
                     ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'),
                     ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias'))
            for old_key, new_key in remap:
                state_dict[new_key] = state_dict.pop(old_key)
        self.online_net.load_state_dict(state_dict)
        print("Loading pretrained model: " + args.model)
    self.online_net.train()

    # Target network: synced from online net, train mode, gradients disabled.
    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
def __init__(self, env, args):
    """DQN trainer: online/target networks, replay buffer, and optimizer.

    Params
    ======
        env: environment instance forwarded to the DQN constructor
        args: namespace providing device, buffer_size and lr

    Bug fix: the original called ``super(DQNTrainer).__init__()``, which
    creates an *unbound* super object and re-initializes that object — the
    parent class initializer never runs.  ``super().__init__()`` performs
    the intended cooperative initialization.
    """
    super().__init__()
    self.model = DQN(env, args, Nash=False).to(args.device)
    self.target = DQN(env, args, Nash=False).to(args.device)
    self.replay_buffer = ReplayBuffer(args.buffer_size)
    self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
    self.args = args
def run():
    """Launch the distributed setup: remote memory, parameter server, learner, and exploring actors."""
    ray.init()

    # Local policy/target pair; target starts as a copy of the policy net.
    policy_net = DQN(num_channels=4, num_actions=19)
    target_net = DQN(num_channels=4, num_actions=19)
    target_net.load_state_dict(policy_net.state_dict())

    #memory = Memory(50000)
    #shared_memory = ray.get(ray.put(memory))
    memory = RemoteMemory.remote(30000)

    # Remote learner fed by a parameter server.
    num_channels = 4
    num_actions = 19
    batch_size = 256
    param_server = ParameterServer.remote(num_channels, num_actions)
    learner = Learner.remote(param_server, batch_size, num_channels, num_actions)
    print(learner)
    print(learner.get_state_dict.remote())

    # Actors explore concurrently with a fixed epsilon.
    num_actors = 2
    epsilon = 0.9
    actor_list = [
        Actor.remote(learner, param_server, i, epsilon, num_channels, num_actions)
        for i in range(num_actors)
    ]
    explore = [actor.explore.remote(learner, memory) for actor in actor_list]
    #ray.get(explore)
    learn = learner.update_network.remote(memory)
def __init__(self, args, env):
    """Distributional DQN agent: online net for greedy action selection, frozen target net for Q-value targets."""
    self.action_space = env.action_space()

    # Value-distribution parameters.
    self.atoms = args.atoms  # size of the value distribution
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount

    # Online network greedily selects the action.  Optionally restored from
    # a checkpoint; the state_dict maps each layer to its parameters and is
    # loaded onto the CPU first.
    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model and os.path.isfile(args.model):
        self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
    self.online_net.train()

    # Target network computes target Q-values; it is synced from the online
    # network and excluded from backpropagation.
    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)
def __init__(self, state_size, action_size, seed, network):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        network (str): "duel" selects the dueling architecture; anything else uses plain DQN
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)  # seeds the global RNG; random.seed returns None
    self.network = network

    # Q-Network: pick the architecture once, then build local/target pair.
    net_cls = DuelingDQN if self.network == "duel" else DQN
    self.qnetwork_local = net_cls(state_size, action_size, seed).to(device)
    self.qnetwork_target = net_cls(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory and time step (for updating every UPDATE_EVERY steps).
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    self.t_step = 0
def __init__(self, args, env):
    """Rainbow agent: distributional support, optional pretrained checkpoint (strict path check), frozen target net."""
    self.action_space = env.action_space()

    # Categorical value-distribution parameters.
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount

    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model:  # load pretrained model if provided
        if not os.path.isfile(args.model):
            raise FileNotFoundError(args.model)  # incorrect model path provided
        # Always load tensors onto CPU by default; move to GPU later if necessary.
        self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
        print("Loading pretrained model: " + args.model)
    self.online_net.train()

    # Target network: synced from online net, train mode, gradients disabled.
    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
def __init__(self, time_step, split, lr):
    """Trading agent: attention encoder/decoder pairs wrapped in policy/target DQNs, replay memory, RMSprop."""
    self.dataset = Dataset(T=time_step, split_ratio=split, binary_file=config.BINARY_DATASET)

    def build_pair():
        # One fresh encoder/decoder pair with the configured hidden sizes.
        enc = AttnEncoder(input_size=self.dataset.get_num_features(),
                          hidden_size=config.ENCODER_HIDDEN_SIZE,
                          time_step=time_step)
        dec = AttnDecoder(code_hidden_size=config.ENCODER_HIDDEN_SIZE,
                          hidden_size=config.DECODER_HIDDEN_SIZE,
                          time_step=time_step)
        return enc, dec

    # Policy network and an architecturally identical target network.
    self.policy_net_encoder, self.policy_net_decoder = build_pair()
    self.policy_net = DQN(self.policy_net_encoder, self.policy_net_decoder)
    self.target_net_encoder, self.target_net_decoder = build_pair()
    self.target_net = DQN(self.target_net_encoder, self.target_net_decoder)

    # Move every module to the GPU when one is available.
    if torch.cuda.is_available():
        self.policy_net_encoder = self.policy_net_encoder.cuda()
        self.policy_net_decoder = self.policy_net_decoder.cuda()
        self.target_net_encoder = self.target_net_encoder.cuda()
        self.target_net_decoder = self.target_net_decoder.cuda()
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()

    self.memory = ReplayMemory(config.MEMORY_CAPACITY)
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
def __init__(self, meta_controller_experience_memory=None, lr=0.00025, alpha=0.95, eps=0.01, batch_size=32, gamma=0.99, num_options=12):
    """Meta-controller around a DQN that selects among `num_options` options.

    Params
    ======
        meta_controller_experience_memory: experience-replay memory (may be None)
        lr (float): RMSprop learning rate
        alpha (float): RMSprop smoothing constant
        eps (float): RMSprop epsilon term
        batch_size (int): minibatch size
        gamma (float): discount factor
        num_options (int): number of options (network output size)

    Fixes vs. original: ``self.eps`` and ``self.gamma`` were hard-coded to
    0.01 and 0.99, silently ignoring the ``eps`` and ``gamma`` arguments;
    they now honor the parameters (defaults unchanged, so existing callers
    see identical behavior).  Unused tensor-type locals were removed.
    """
    # experience replay memory
    self.meta_controller_experience_memory = meta_controller_experience_memory
    self.lr = lr          # learning rate
    self.alpha = alpha    # optimizer parameter
    self.eps = eps        # optimizer parameter (was hard-coded 0.01)
    self.gamma = gamma    # discount factor (was hard-coded 0.99)

    # Device selection: prefer the second GPU when more than one is present.
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        self.device = torch.device("cuda:1")
    elif torch.cuda.device_count() == 1:
        self.device = torch.device("cuda:0")
    else:
        self.device = torch.device("cpu")

    # Tensor-type aliases kept on the instance for use elsewhere.
    dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
    dlongtype = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor
    duinttype = torch.cuda.ByteTensor if torch.cuda.is_available() else torch.ByteTensor
    self.dtype = dtype
    self.dlongtype = dlongtype
    self.duinttype = duinttype

    # Q network and its frozen target copy.
    Q = DQN(in_channels=4, num_actions=num_options).type(dtype)
    Q_t = DQN(in_channels=4, num_actions=num_options).type(dtype)
    Q_t.load_state_dict(Q.state_dict())
    Q_t.eval()
    for param in Q_t.parameters():
        param.requires_grad = False
    Q = Q.to(self.device)
    Q_t = Q_t.to(self.device)
    self.batch_size = batch_size
    self.Q = Q
    self.Q_t = Q_t

    # optimizer
    self.optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
    print('init: Meta Controller --> OK')
def __init__(self):
    """Set up the CartPole environment, screen preprocessing, policy/target DQNs, optimizer and PER memory."""
    self.env = gym.make('CartPole-v0').unwrapped

    # Screen preprocessing: raw array -> PIL image -> resized -> tensor.
    self.resize = T.Compose([
        T.ToPILImage(),
        T.Resize(40, interpolation=Image.CUBIC),
        T.ToTensor()
    ])

    # Grab one rendered screen to discover the input dimensions.
    self.env.reset()
    init_screen = self.get_screen()
    self.env.reset()
    _, _, screen_height, screen_width = init_screen.shape

    # Number of discrete actions exposed by the gym action space.
    self.n_actions = self.env.action_space.n

    # Policy net is trained; target net is a frozen copy held in eval mode.
    self.policy_net = DQN(screen_height, screen_width, self.n_actions).to(device)
    self.target_net = DQN(screen_height, screen_width, self.n_actions).to(device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.0001)
    self.memory = PriortizedReplayMemory(10000)
def __init__(self, args, obs):
    """Agent setup: Q/target networks (optionally restored from one checkpoint), replay buffer, logging.

    Params
    ======
        args: namespace with n_obs, n_action, gamma, max_grad_norm, num_procs, lr, test_episode
        obs: initial observation passed through to self.init()

    Improvement vs. original: the checkpoint at ``./weights/ckpt.pth`` was
    deserialized twice with ``torch.load`` (once per network); it is now
    loaded once and applied to both nets — identical results, half the I/O.
    """
    self.net = DQN(args.n_obs, args.n_action)
    self.target_net = DQN(args.n_obs, args.n_action)
    ckpt_path = './weights/ckpt.pth'
    if os.path.isfile(ckpt_path):
        state_dict = torch.load(ckpt_path)  # load once; load_state_dict copies values
        self.net.load_state_dict(state_dict)
        self.target_net.load_state_dict(state_dict)

    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.state_preproc = StatePreproc(self.device)
    self.n_action = args.n_action
    self.gamma = args.gamma
    self.max_grad_norm = args.max_grad_norm
    self.num_procs = args.num_procs
    self.memory = ReplayBuffer(args)
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=args.lr, betas=(0.9, 0.99))
    self.criterion = torch.nn.MSELoss()

    # log: per-process running rewards plus a window of the last 100 episode returns
    self.log_episode_rewards = torch.zeros(self.num_procs, device=self.device, dtype=torch.float)
    self.episode_rewards = deque([0] * 100, maxlen=100)
    self.episode = 1
    self.init(obs)

    # eval
    self.test_episode = args.test_episode
def __init__(self, state_size, action_size, config=RLConfig()):
    """Configurable DQN agent: dueling/double variants, optional prioritized replay, hyperparameters from `config`.

    NOTE(review): the default ``config=RLConfig()`` is evaluated once at
    definition time and shared across calls — fine if RLConfig is read-only;
    confirm before mutating it.
    """
    self.seed = random.seed(config.seed)  # seeds the global RNG; random.seed returns None
    self.state_size = state_size
    self.action_size = action_size

    # Training-schedule hyperparameters.
    self.batch_size = config.batch_size
    self.batch_indices = torch.arange(config.batch_size).long().to(device)
    self.samples_before_learning = config.samples_before_learning
    self.learn_interval = config.learning_interval
    self.parameter_update_interval = config.parameter_update_interval
    self.per_epsilon = config.per_epsilon
    self.tau = config.tau
    self.gamma = config.gamma

    # Local/target Q-networks: dueling or plain, per configuration.
    net_cls = DuelingDQN if config.useDuelingDQN else DQN
    self.qnetwork_local = net_cls(state_size, action_size, config.seed).to(device)
    self.qnetwork_target = net_cls(state_size, action_size, config.seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=config.learning_rate)

    self.doubleDQN = config.useDoubleDQN

    # Replay memory: prioritized or uniform.
    self.usePER = config.usePER
    if self.usePER:
        self.memory = PrioritizedReplayBuffer(config.buffer_size, config.per_alpha)
    else:
        self.memory = ReplayBuffer(config.buffer_size)
    self.t_step = 0
def __init__(self, learner, actor_idx, epsilon):
    """Distributed actor: its own MineRL env, CPU copies of the learner's network, and a small local buffer."""
    # Environment initialization — imports are local so each remote worker
    # pulls gym/minerl into its own process.
    import gym
    import minerl
    self.actor_idx = actor_idx
    self.env = gym.make("MineRLTreechop-v0")
    self.port_number = int("12340") + actor_idx
    print("actor environment %d initialize successfully" % self.actor_idx)

    # Fetch the current network weights from the remote learner.
    self.shared_network_cpu = ray.get(learner.get_network.remote())
    # self.shared_memory = ray.get(shared_memory_id)
    # print("shared memory assign successfully")

    # Network initialization: actor and target both start from the shared weights.
    self.actor_network = DQN(19).cpu()
    self.actor_target_network = DQN(19).cpu()
    self.actor_network.load_state_dict(self.shared_network_cpu.state_dict())
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    print("actor network %d initialize successfully" % self.actor_idx)
    self.initialized = False
    self.epi_counter = 0

    # Exploration settings and bounded local experience buffer.
    self.epsilon = epsilon
    self.max_step = 100
    self.local_buffer_size = 100
    self.local_buffer = deque(maxlen=self.local_buffer_size)

    project_name = 'apex_dqfd_Actor%d' % (actor_idx)
    wandb.init(project=project_name, entity='neverparadise')
def __init__(self, action_set, train=True, load_path=None):
    """Game agent: policy/target DQNs, RMSprop optimizer, prioritized replay.

    With train=False the policy net is restored from `load_path` using a
    zero-learning-rate optimizer (so weights can never change) and both
    networks are put in eval mode.
    """
    # 1. Agent parameters.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.action_set = action_set
    self.action_number = len(action_set)
    self.steps_done = 0
    self.epsilon = Config.EPS_START
    self.episode_durations = []

    # 2. Networks.
    self.policy_net = DQN().to(self.device)
    self.target_net = DQN().to(self.device)
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)
    if not train:
        # lr=0: the restored weights are never updated during evaluation.
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)
        self.policy_net.load(load_path, optimizer=self.optimizer)
        self.policy_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

    # 3. Prioritized experience replay memory.
    self.memory = Memory(Config.MEMORY_SIZE)
def __init__(self, action_size):
    """Agent with a linear epsilon-decay schedule, replay memory, policy/target DQNs and an LR scheduler."""
    self.action_size = action_size

    # DQN hyperparameters: epsilon anneals linearly over `explore_step` steps.
    self.discount_factor = 0.99
    self.epsilon = 1.0
    self.epsilon_min = 0.01
    self.explore_step = 500000
    self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
    self.train_start = 100000
    self.update_target = 1000

    # Experience replay.
    self.memory = ReplayMemory()

    # Policy and target networks (Module.to moves parameters in place).
    self.policy_net = DQN(action_size)
    self.policy_net.to(device)
    self.target_net = DQN(action_size)
    self.target_net.to(device)

    self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
    self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=scheduler_step_size,
                                               gamma=scheduler_gamma)

    # Sync the target network with the freshly created policy net.
    self.update_target_net()
def __init__(self, action_set, train=True, load_path=None):
    """Game agent (debug variant): policy/target DQNs, RMSprop optimizer, plain replay memory.

    With train=False the policy net is restored from `load_path` using a
    zero-learning-rate optimizer and both networks are frozen in eval mode.
    """
    # 1. Agent parameters.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.action_set = action_set
    self.action_number = len(action_set)
    self.steps_done = 0
    self.epsilon = Config.EPS_START
    self.episode_durations = []
    print('LOAD PATH -- agent.init:', load_path)
    time.sleep(2)

    # 2. Networks.
    self.policy_net = DQN().to(self.device)
    self.target_net = DQN().to(self.device)
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)
    if not train:
        print('entrou no not train')
        # lr=0: the restored weights are never updated during evaluation.
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)
        self.policy_net.load(load_path, optimizer=self.optimizer)
        self.policy_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

    self.memory = ReplayMemory(1000)
def __init__(self):
    """Hold the controller Q/target networks; the meta-controller pair is created elsewhere."""
    # Controller networks: 5 input channels, 18 actions.
    self.Qt = DQN(in_channels=5, num_actions=18)    # Controller Q network
    self.Qt_t = DQN(in_channels=5, num_actions=18)  # Controller target network
    # self.meta_controller = Model(in_channels=4, num_actions=10)
    self.Q = None    # Meta-Controller Q network (not built yet)
    self.Q_t = None  # Meta-Controller target network (not built yet)
def main():
    """Train a TF1 DQN with experience replay and a periodically synced target network, then run bot_play."""
    max_episodes = 5000
    # Raw experience buffer; trimmed manually once it exceeds REPLAY_MEMORY.
    replay_buffer = deque()
    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name='main')
        targetDQN = DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        # Ops that copy main-network weights into the target network.
        copy_ops = get_copy_var_ops(dest_scope_name='target',
                                    src_scope_name='main')
        sess.run(copy_ops)

        for episode in range(max_episodes):
            # Decaying exploration rate for epsilon-greedy action selection.
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()
            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))
                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100  # penalize episode termination
                replay_buffer.append((state, action, reward, next_state, done))
                # Keep the buffer bounded at REPLAY_MEMORY entries.
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                state = next_state
                step_count += 1
                # Cap episode length.
                if step_count > 10000:
                    break
            print("Episode: {} Steps: {}".format(episode, step_count))
            # NOTE(review): this branch is a no-op — presumably it was meant
            # to stop training once an episode exceeds 10000 steps; confirm.
            if step_count > 10000:
                pass
            # Every 10 episodes: train on 50 random minibatches, then sync
            # the target network from the main network.
            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss: ", loss)
                sess.run(copy_ops)
        bot_play(mainDQN)
def __init__(self, network, batch_size):
    """Learner: GPU training/target networks seeded from `network`, plus a CPU copy for publishing weights."""
    # GPU training nets, both initialized from the provided network's weights.
    self.learner_network = DQN(19).cuda().float()
    self.learner_target_network = DQN(19).cuda().float()
    source_weights = network.state_dict()
    self.learner_network.load_state_dict(source_weights)
    self.learner_target_network.load_state_dict(source_weights)

    # CPU-resident network used to share weights with actors.
    self.shared_network = DQN(19).cpu()
    self.count = 0
    self.batch_size = batch_size
    wandb.init(project='apex_dqfd_Learner', entity='neverparadise')
def __init__(self, args, env):
    """Transfer-learning Rainbow agent: distributional nets, optional checkpoint, per-layer freeze/reinit."""
    self.action_space = env.action_space()

    # Categorical value-distribution parameters.
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount

    # Output/bookkeeping paths for this experiment.
    self.saved_model_path = args.saved_model_path
    self.experiment = args.experiment
    self.plots_path = args.plots_path
    self.data_save_path = args.data_save_path

    # Online network; optionally warm-started from a checkpoint (tensors are
    # loaded onto CPU first and shifted to GPU if necessary).
    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model and os.path.isfile(args.model):
        self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
    self.online_net.train()

    # Target network: synced from online net, train mode, gradients disabled.
    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)

    # Ordered layer handles consumed by freeze_layers / reinit_layers.
    layer_names = ('conv1', 'conv2', 'conv3', 'fc_h_v', 'fc_h_a', 'fc_z_v', 'fc_z_a')
    self.online_net_layers = [getattr(self.online_net, name) for name in layer_names]
    self.target_net_layers = [getattr(self.target_net, name) for name in layer_names]

    # Freeze all but the last layers / reinitialize the last, when requested.
    if args.freeze_layers > 0:
        self.freeze_layers(args.freeze_layers)
    if args.reinitialize_layers > 0:
        self.reinit_layers(args.reinitialize_layers)
def __init__(self, param_server, batch_size, num_channels, num_actions):
    """Learner: GPU training/target networks, Adam optimizer, TensorBoard writer, parameter-server handle."""
    # Training and target networks live on the GPU in float32.
    self.learner_network = DQN(num_channels, num_actions).cuda().float()
    self.learner_target_network = DQN(num_channels, num_actions).cuda().float()

    # Bookkeeping and logging.
    self.count = 0
    self.batch_size = batch_size
    self.writer = SummaryWriter(f'runs/apex/learner')

    # Optimizer over the training network only.
    self.lr = LR
    self.optimizer = optim.Adam(self.learner_network.parameters(), self.lr)
    self.param_server = param_server
def run():
    """n-step DQN training loop: epsilon-greedy exploration, replay-based updates, periodic target sync."""
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)

    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)

    # Schedule: anneal epsilon linearly from 0.95 toward 0.01 over max_epi episodes.
    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99
    total_steps = 0
    epsilon = 0.95
    endEpsilon = 0.01
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        if epsilon > endEpsilon:
            epsilon -= stepDrop

        while not done:
            steps += 1
            total_steps += 1
            # Epsilon-greedy action from the policy net, mapped to an env action.
            action_index = policy_net.sample_action(state, epsilon)
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break
            state_prime = converter(ENV_NAME, obs_prime).cuda()
            append_sample(memory, policy_net, target_net,
                          state, action_index, reward, state_prime, done)
            state = state_prime
            # Start learning once enough samples are buffered; sync the
            # target network every 2000 environment steps.
            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2, optimizer, total_steps)
                if total_steps % 2000 == 0:
                    update_target(policy_net, target_net)
def __init__(self, env, taskCount=100, alpha=0.4, hiddenSize=500, perfix=''):
    """DQN agent: hyperparameters, pre-allocated autograd Variables, replay memory, twin networks.

    Params
    ======
        env: environment the agent interacts with
        taskCount (int): used (with alpha and perfix) to build the checkpoint filename
        alpha (float): used in the checkpoint filename
        hiddenSize (int): hidden-layer width passed to both DQNs
        perfix (str): filename prefix ("perfix" [sic] — kept for caller compatibility)
    """
    # Core hyperparameters.
    self.GAMMA = 0.99
    self.epsilon = 0.3
    self.epsilon_end = 0.05
    self.epsilon_decay = 200
    self.update_step = 20
    self.memory_size = 2000
    self.max_epoch = 500
    self.batch_size = 32
    # self.max_epoch = 500
    # self.batch_size = 1
    # self.memory_size = 1
    # self.update_step = 1
    self.hiddenSize = hiddenSize
    # Checkpoint path: '<perfix>-<taskCount>-<alpha>.pth' under ../Model/.
    # self.save_path = '../Model/' + str(taskCount) + '-' + str(alpha) + perfix +'.pth'
    self.save_path = '../Model/' + perfix + '-' + str(
        taskCount) + '-' + str(alpha) + '.pth'

    # Pre-allocated Variables (legacy PyTorch <0.4 autograd API; `volatile`
    # marked inference-only tensors and was removed in later versions).
    self.var_phi = autograd.Variable(torch.Tensor(6), volatile=True)
    # For training: batched state/action/reward/next-state/mask buffers.
    self.var_batch_phi = autograd.Variable(torch.Tensor(
        self.batch_size, 6))
    self.var_batch_a = autograd.Variable(torch.LongTensor(
        self.batch_size, 1), requires_grad=False)
    self.var_batch_r = autograd.Variable(torch.Tensor(self.batch_size, 1))
    self.var_batch_phi_next = autograd.Variable(
        torch.Tensor(self.batch_size, 6))
    self.var_batch_r_mask = autograd.Variable(torch.Tensor(
        self.batch_size, 1), requires_grad=False)

    # Replay memory and the DQN/target pair (target starts as an exact copy).
    self.MP = MemoryReplay(self.memory_size, self.batch_size)
    self.dqn = DQN(hiddenSize=self.hiddenSize)
    self.target_dqn = DQN(hiddenSize=self.hiddenSize)
    self.target_dqn.load_state_dict(self.dqn.state_dict())
    self.optimz = optim.RMSprop(self.dqn.parameters(),
                                lr=0.00025,
                                alpha=0.9,
                                eps=1e-02,
                                momentum=0.0)
    self.env = env
def main(*args, **kwargs):
    """Breakout DQN driver: restore bootstrap weights, then alternate train/test epochs with checkpointing.

    Expects kwargs["M"] to be a mutable context object carrying device,
    epsilon (M.eps), steps, and an optimizer hook — defined elsewhere in
    the project.
    """
    M = kwargs["M"]
    M.env = gym.make("BreakoutDeterministic-v4")
    M.policy = DQN()
    M.target = DQN()
    M.target.eval()
    M.policy.to(M.device)
    M.target.to(M.device)
    # Warm-start both networks from the same bootstrap checkpoint.
    starter = "./bootstrap/model-epoch-2750.pt"
    starter_target = "./bootstrap/model-epoch-2750.pt"
    M.policy.load_state_dict(T.load(starter, map_location=M.device))
    M.target.load_state_dict(T.load(starter_target, map_location=M.device))
    # Per-run log file and model directory keyed by launch timestamp.
    M.time = int(time.time())
    M.log = open("./logs/log-{}.txt".format(M.time), "a")
    M.model_folder = "./model-{}".format(M.time)
    os.mkdir(M.model_folder)
    M.memory = rl.ReplayMemory(REPLAY_BUF_SIZE)
    if DISPLAY_ENABLED:
        M.display = Display("breakout", DISPLAY_WIDTH, DISPLAY_HEIGHT)
    # Human-readable action names for the 4 Breakout actions.
    M.action_db = {0: "NOP", 1: "Fire", 2: "Right", 3: "Left"}
    # NOTE(review): M.optim is *called* with the optimizer — presumably a
    # setter-style hook on the context object; confirm its contract.
    M.optim(optim.RMSprop(M.policy.parameters(), lr=LEARNING_RATE))
    M.steps = 0
    durations = []
    test_durations = []
    for epoch in range(EPOCHS):
        M.epoch = epoch
        duration, avg_loss = train(M)
        durations.append(duration)
        print(
            "[train/{}] duration: {}, total steps: {}, avg loss: {:0.6f}, eps: {:0.2f}"
            .format(epoch, duration, M.steps, avg_loss, M.eps))
        # Only evaluate, checkpoint and log once the warm-up threshold is passed.
        if M.steps >= STEPS_BEFORE_TRAIN:
            test_duration = test(M)
            test_durations.append(test_duration)
            print("[model-{}][test/{}] test_duration: {}".format(
                M.time, epoch, test_duration))
            # Save model
            save_path = "./{}/model-epoch-{}.pt".format(M.model_folder, epoch)
            T.save(M.policy.state_dict(), save_path)
            # Log training progress
            M.log.write(
                "epoch, {}, train_dur, {}, test_dur, {}, avg loss, {}\n".
                format(epoch, duration, test_duration, avg_loss))
            M.log.flush()