def __init__(self, state_size, action_size, seed, network):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        network (str): "duel" selects the dueling architecture; any other value selects the plain DQN
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.network = network

    # Q-Network
    if self.network == "duel":
        self.qnetwork_local = DuelingDQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingDQN(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
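# The t_step counter above implies a step() method that learns every
# UPDATE_EVERY environment steps. A minimal sketch, assuming the agent also
# defines a learn(experiences, gamma) method and a GAMMA constant (both
# outside this section):
def step(self, state, action, reward, next_state, done):
    # Save the experience, then learn every UPDATE_EVERY time steps
    self.memory.add(state, action, reward, next_state, done)
    self.t_step = (self.t_step + 1) % UPDATE_EVERY
    if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
        experiences = self.memory.sample()
        self.learn(experiences, GAMMA)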
def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.norm_clip = args.norm_clip

    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model:  # Load pretrained model if provided
        if os.path.isfile(args.model):
            state_dict = torch.load(args.model, map_location='cpu')  # Always load tensors onto CPU by default, will shift to GPU if necessary
            if 'conv1.weight' in state_dict.keys():
                for old_key, new_key in (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'),
                                         ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'),
                                         ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias')):
                    state_dict[new_key] = state_dict[old_key]  # Re-map state dict for old pretrained models
                    del state_dict[old_key]  # Delete old keys for strict load_state_dict
            self.online_net.load_state_dict(state_dict)
            print("Loading pretrained model: " + args.model)
        else:  # Raise error if incorrect model path provided
            raise FileNotFoundError(args.model)
    self.online_net.train()

    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
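# update_target_net() is called here and in several later snippets without
# being defined in this section. In the reference Rainbow implementation it is
# a hard copy of the online weights; a minimal sketch under that assumption:
def update_target_net(self):
    # Hard update: overwrite the target network with the online network's weights
    self.target_net.load_state_dict(self.online_net.state_dict())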
def __init__(self, time_step, split, lr):
    self.dataset = Dataset(T=time_step, split_ratio=split, binary_file=config.BINARY_DATASET)
    self.policy_net_encoder = AttnEncoder(input_size=self.dataset.get_num_features(),
                                          hidden_size=config.ENCODER_HIDDEN_SIZE,
                                          time_step=time_step)
    self.policy_net_decoder = AttnDecoder(code_hidden_size=config.ENCODER_HIDDEN_SIZE,
                                          hidden_size=config.DECODER_HIDDEN_SIZE,
                                          time_step=time_step)
    self.policy_net = DQN(self.policy_net_encoder, self.policy_net_decoder)
    self.target_net_encoder = AttnEncoder(input_size=self.dataset.get_num_features(),
                                          hidden_size=config.ENCODER_HIDDEN_SIZE,
                                          time_step=time_step)
    self.target_net_decoder = AttnDecoder(code_hidden_size=config.ENCODER_HIDDEN_SIZE,
                                          hidden_size=config.DECODER_HIDDEN_SIZE,
                                          time_step=time_step)
    self.target_net = DQN(self.target_net_encoder, self.target_net_decoder)
    if torch.cuda.is_available():
        self.policy_net_encoder = self.policy_net_encoder.cuda()
        self.policy_net_decoder = self.policy_net_decoder.cuda()
        self.target_net_encoder = self.target_net_encoder.cuda()
        self.target_net_decoder = self.target_net_decoder.cuda()
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
    self.memory = ReplayMemory(config.MEMORY_CAPACITY)
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms  # Size of the value distribution
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount

    self.online_net = DQN(args, self.action_space).to(device=args.device)  # Greedily selects actions
    if args.model and os.path.isfile(args.model):
        # A state_dict is a Python dictionary that maps each layer to its parameters
        self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
    self.online_net.train()

    self.target_net = DQN(args, self.action_space).to(device=args.device)  # Used to compute target Q-values
    self.update_target_net()  # Set target parameters to those of the online network
    self.target_net.train()
    for param in self.target_net.parameters():  # Not updated through backpropagation
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)
def __init__(self):
    """Initializes all the class variables."""
    self.env = gym.make('CartPole-v0').unwrapped
    self.resize = T.Compose([
        T.ToPILImage(),
        T.Resize(40, interpolation=Image.BICUBIC),
        T.ToTensor()
    ])
    self.env.reset()
    init_screen = self.get_screen()
    self.env.reset()
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    self.n_actions = self.env.action_space.n

    self.policy_net = DQN(screen_height, screen_width, self.n_actions).to(device)
    self.target_net = DQN(screen_height, screen_width, self.n_actions).to(device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.0001)
    self.memory = PriortizedReplayMemory(10000)
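# get_screen() is used above but not defined in this section. A minimal sketch
# in the style of the classic PyTorch DQN tutorial this snippet follows (the
# exact cropping/preprocessing is an assumption, and numpy is assumed imported
# as np):
def get_screen(self):
    # gym renders H x W x C uint8; convert to a C x H x W float tensor in [0, 1]
    screen = self.env.render(mode='rgb_array').transpose((2, 0, 1))
    screen = torch.from_numpy(np.ascontiguousarray(screen, dtype=np.float32) / 255)
    # Resize and add a batch dimension: B x C x H x W
    return self.resize(screen).unsqueeze(0).to(device)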
def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount

    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model:  # Load pretrained model if provided
        if os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
            print("Loading pretrained model: " + args.model)
        else:  # Raise error if incorrect model path provided
            raise FileNotFoundError(args.model)
    self.online_net.train()

    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
def __init__(self, args, state_size, action_size):
    """Initialize an Agent object.

    Params
    ======
        args (class defined in the notebook): a set of parameters defining the agent hyperparameters
        state_size (int): dimension of each state
        action_size (int): dimension of each action
    """
    self.state_size = state_size
    self.action_size = action_size
    self.params = args

    # Deep Q-Network
    if args.use_NoisyNet:
        self.DQN_local = DQN_NoisyNet(args, state_size, action_size).to(args.device)
        self.DQN_target = DQN_NoisyNet(args, state_size, action_size).to(args.device)
    else:
        self.DQN_local = DQN(args, state_size, action_size).to(args.device)
        self.DQN_target = DQN(args, state_size, action_size).to(args.device)
    self.optimizer = optim.Adam(self.DQN_local.parameters(), lr=args.lr, eps=args.adam_eps)

    # Replay memory
    self.memory = ReplayBuffer(args, action_size)
    # Initialize time step (for updating every args.target_update steps)
    self.t_step = 0
def __init__(self, args, obs):
    self.net = DQN(args.n_obs, args.n_action)
    self.target_net = DQN(args.n_obs, args.n_action)
    if os.path.isfile('./weights/ckpt.pth'):
        self.net.load_state_dict(torch.load('./weights/ckpt.pth'))
        self.target_net.load_state_dict(torch.load('./weights/ckpt.pth'))
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.state_preproc = StatePreproc(self.device)
    self.n_action = args.n_action
    self.gamma = args.gamma
    self.max_grad_norm = args.max_grad_norm
    self.num_procs = args.num_procs
    self.memory = ReplayBuffer(args)
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=args.lr, betas=(0.9, 0.99))
    self.criterion = torch.nn.MSELoss()

    # log
    self.log_episode_rewards = torch.zeros(self.num_procs, device=self.device, dtype=torch.float)
    self.episode_rewards = deque([0] * 100, maxlen=100)
    self.episode = 1
    self.init(obs)

    # eval
    self.test_episode = args.test_episode
def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount

    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model and os.path.isfile(args.model):
        # Always load tensors onto CPU by default, will shift to GPU if necessary
        self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
    self.online_net.train()

    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)
def __init__(self, state_size: int, action_size: int, replay_buffer: ReplayMemory, seed: int,
             batch_size=BATCH_SIZE, update_every=UPDATE_EVERY, tau=TAU, gamma=GAMMA):
    """Initialize the agent."""
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.batch_size = batch_size
    self.tau = tau
    self.update_target_every = update_every
    self.gamma = gamma

    self.qnet_local = DQN(state_size, action_size, seed).to(device)
    self.qnet_target = DQN(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=LR)
    self.max_gradient_norm = float('inf')

    self.memory = replay_buffer
    self.t_step = 0
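# The tau stored above implies a Polyak (soft) target update rather than a
# hard copy. A minimal sketch, assuming it is called from the agent's learn
# step, which is not shown in this section:
def soft_update(self, local_model, target_model, tau):
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)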
def test():
    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    total_succ = 0
    for episode in range(10000):
        terminal = False
        total_reward = 0
        state = game.reset()
        brain.init_state(state)
        step = 0
        while not terminal and step <= 200:
            action = brain.get_action()
            state, reward, terminal, succ = game.step(action)
            if terminal and succ:
                total_succ += 1
            step += 1
    print(total_succ)
def __init__(self, action_set, train=True, load_path=None):
    # 1. Initialize agent params
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.action_set = action_set
    self.action_number = len(action_set)
    self.steps_done = 0
    self.epsilon = Config.EPS_START
    self.episode_durations = []

    # 2. Build networks
    self.policy_net = DQN().to(self.device)
    self.target_net = DQN().to(self.device)
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)
    if not train:
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)
        self.policy_net.load(load_path, optimizer=self.optimizer)
        self.policy_net.eval()
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    # 3. Create Prioritized Experience Replay Memory
    self.memory = Memory(Config.MEMORY_SIZE)
def __init__(self, action_size):
    self.action_size = action_size

    # Hyperparameters for the DQN
    self.discount_factor = 0.99
    self.epsilon = 1.0
    self.epsilon_min = 0.01
    self.explore_step = 500000
    self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
    self.train_start = 100000
    self.update_target = 1000

    # Generate the memory
    self.memory = ReplayMemory()

    # Create the policy net and the target net
    self.policy_net = DQN(action_size)
    self.policy_net.to(device)
    self.target_net = DQN(action_size)
    self.target_net.to(device)

    self.optimizer = optim.Adam(params=self.policy_net.parameters(), lr=learning_rate)
    self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=scheduler_step_size,
                                               gamma=scheduler_gamma)

    # Initialize the target network with the policy net's weights
    self.update_target_net()
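# The epsilon fields above imply a linearly annealed epsilon-greedy policy.
# A minimal sketch, assuming states arrive as batched torch tensors already on
# `device`, and that random/torch are imported as in the surrounding snippets:
def get_action(self, state):
    if self.epsilon > self.epsilon_min:
        self.epsilon -= self.epsilon_decay  # linear anneal over explore_step steps
    if random.random() < self.epsilon:
        return random.randrange(self.action_size)  # explore
    with torch.no_grad():
        return self.policy_net(state).argmax(dim=1).item()  # exploit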
def test(env, args):
    current_model = DQN(env, args).to(args.device)
    current_model.eval()
    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state = env.reset()
    while True:
        if args.render:
            env.render()

        action = current_model.act(torch.FloatTensor(state).to(args.device), 0.)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            break

    print("Test Result - Reward {} Length {}".format(episode_reward, episode_length))
def __init__(self, env, args):
    super(DQNTrainer, self).__init__()
    self.model = DQN(env, args, Nash=False).to(args.device)
    self.target = DQN(env, args, Nash=False).to(args.device)
    self.replay_buffer = ReplayBuffer(args.buffer_size)
    self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
    self.args = args
def __init__(self, state_size, action_size, config=RLConfig()):
    self.seed = random.seed(config.seed)
    self.state_size = state_size
    self.action_size = action_size
    self.batch_size = config.batch_size
    self.batch_indices = torch.arange(config.batch_size).long().to(device)
    self.samples_before_learning = config.samples_before_learning
    self.learn_interval = config.learning_interval
    self.parameter_update_interval = config.parameter_update_interval
    self.per_epsilon = config.per_epsilon
    self.tau = config.tau
    self.gamma = config.gamma

    if config.useDuelingDQN:
        self.qnetwork_local = DuelingDQN(state_size, action_size, config.seed).to(device)
        self.qnetwork_target = DuelingDQN(state_size, action_size, config.seed).to(device)
    else:
        self.qnetwork_local = DQN(state_size, action_size, config.seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, config.seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=config.learning_rate)

    self.doubleDQN = config.useDoubleDQN
    self.usePER = config.usePER
    if self.usePER:
        self.memory = PrioritizedReplayBuffer(config.buffer_size, config.per_alpha)
    else:
        self.memory = ReplayBuffer(config.buffer_size)

    self.t_step = 0
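# The usePER branch above implies a learn step that importance-weights the TD
# error and writes updated priorities back. A minimal sketch, assuming a
# baselines-style buffer API, sample(batch_size, beta) returning a weights and
# indices pair, and update_priorities(indices, priorities); both signatures
# are assumptions, and all sampled fields are assumed to be batched tensors:
def learn_per(self, beta):
    states, actions, rewards, next_states, dones, weights, indices = \
        self.memory.sample(self.batch_size, beta)
    q_expected = self.qnetwork_local(states).gather(1, actions)
    q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
    q_targets = rewards + self.gamma * q_targets_next * (1 - dones)
    td_error = q_targets - q_expected
    loss = (weights * td_error.pow(2)).mean()  # importance-sampling correction
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    # New priorities: |TD error| plus a small epsilon so no transition starves
    self.memory.update_priorities(indices, td_error.abs().detach().cpu().numpy() + self.per_epsilon)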
def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.priority_exponent = args.priority_exponent
    self.max_gradient_norm = args.max_gradient_norm

    self.policy_net = DQN(args, self.action_space)
    if args.model and os.path.isfile(args.model):
        self.policy_net.load_state_dict(torch.load(args.model))
    self.policy_net.train()

    self.target_net = DQN(args, self.action_space)
    self.update_target_net()
    self.target_net.eval()

    self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)
    if args.cuda:
        self.policy_net.cuda()
        self.target_net.cuda()
        self.support = self.support.cuda()
def main(_):
    game = Game(screen_width, screen_height, show_game=not FLAGS.train)
    state = game.get_state()
    brain = DQN(n_action, screen_width, screen_height, state)

    while True:
        game.reset()
        gameover = FLAGS.train
        print(" Avg. Reward: %d, Total Game: %d" % (game.total_reward / game.total_game, game.total_game))

        while not gameover:
            # Decide which action to take using the DQN model.
            action = brain.get_action(FLAGS.train)

            # Advance the game with the chosen action, and receive the reward and whether the game is over.
            reward, gameover = game.proceed(np.argmax(action))

            # Fetch the current state resulting from the action above.
            # The state is a screen of size screen_width x screen_height.
            state = game.get_state()

            # Train the DQN.
            brain.step(state, action, reward, gameover)

            # When not in training mode, show the game at a speed humans can follow. ^^;
            if not FLAGS.train:
                time.sleep(0.3)
def __init__(self, learner, actor_idx, epsilon):
    # environment initialization
    import gym
    import minerl
    self.actor_idx = actor_idx
    self.env = gym.make("MineRLTreechop-v0")
    self.port_number = 12340 + actor_idx
    print("actor environment %d initialize successfully" % self.actor_idx)

    self.shared_network_cpu = ray.get(learner.get_network.remote())
    # self.shared_memory = ray.get(shared_memory_id)
    # print("shared memory assign successfully")

    # network initialization
    self.actor_network = DQN(19).cpu()
    self.actor_target_network = DQN(19).cpu()
    self.actor_network.load_state_dict(self.shared_network_cpu.state_dict())
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    print("actor network %d initialize successfully" % self.actor_idx)
    self.initialized = False
    self.epi_counter = 0

    # exploring info
    self.epsilon = epsilon
    self.max_step = 100
    self.local_buffer_size = 100
    self.local_buffer = deque(maxlen=self.local_buffer_size)

    project_name = 'apex_dqfd_Actor%d' % actor_idx
    wandb.init(project=project_name, entity='neverparadise')
def __init__(self, action_set, train=True, load_path=None):
    # 1. Initialize agent params
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.action_set = action_set
    self.action_number = len(action_set)
    self.steps_done = 0
    self.epsilon = Config.EPS_START
    self.episode_durations = []
    print('LOAD PATH -- agent.init:', load_path)
    time.sleep(2)

    # 2. Build networks
    self.policy_net = DQN().to(self.device)
    self.target_net = DQN().to(self.device)
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)
    if not train:
        print('entered the not-train branch')
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)
        self.policy_net.load(load_path, optimizer=self.optimizer)
        self.policy_net.eval()
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    self.memory = ReplayMemory(1000)
def __init__(self):
    self.device = args.device
    self.batch_size = args.batch_size
    self.lr = args.lr
    self.history_size = args.history_size
    self.replay_size = args.replay_size
    self.width = args.width
    self.height = args.height
    self.hidden_size = args.hidden_size
    self.action_size = args.action_size
    self.update_cycle = args.update_cycle
    self.log_interval = args.log_interval
    self.actor_num = args.actor_num

    # prioritized replay and discount hyperparameters
    self.alpha = 0.7
    self.beta_init = 0.4
    self.beta = self.beta_init
    self.beta_increment = 1e-6
    self.e = 1e-6
    self.dis = 0.99
    self.start_epoch = 0

    self.mainDQN = DQN(self.history_size, self.hidden_size, self.action_size).to(self.device)
    self.targetDQN = DQN(self.history_size, self.hidden_size, self.action_size).to(self.device)
    self.update_target_model()
    self.optimizer = optim.Adam(self.mainDQN.parameters(), lr=args.lr)

    self.replay_memory = deque(maxlen=self.replay_size)
    self.priority = deque(maxlen=self.replay_size)

    if args.load_model != '000000000000':
        self.log = args.log_directory + args.load_model + '/'
        args.time_stamp = args.load_model[:12]
        args.start_epoch = self.load_model()
    self.log = args.log_directory + args.time_stamp + config + '/'
    self.writer = SummaryWriter(self.log)
def __init__(self):
    self.mode = "train"
    with open("config.yaml") as reader:
        self.config = yaml.safe_load(reader)
    print(self.config)
    self.load_config()

    self.online_net = DQN(config=self.config, word_vocab=self.word_vocab,
                          char_vocab=self.char_vocab, answer_type=self.answer_type)
    self.target_net = DQN(config=self.config, word_vocab=self.word_vocab,
                          char_vocab=self.char_vocab, answer_type=self.answer_type)
    self.online_net.train()
    self.target_net.train()
    self.update_target_net()
    for param in self.target_net.parameters():
        param.requires_grad = False

    if self.use_cuda:
        self.online_net.cuda()
        self.target_net.cuda()

    self.naozi = ObservationPool(capacity=self.naozi_capacity)

    # optimizer
    self.optimizer = torch.optim.Adam(self.online_net.parameters(),
                                      lr=self.config['training']['optimizer']['learning_rate'])
    self.clip_grad_norm = self.config['training']['optimizer']['clip_grad_norm']
def __init__(self):
    # build models
    self.Qt = DQN(in_channels=5, num_actions=18)    # Controller Q network
    self.Qt_t = DQN(in_channels=5, num_actions=18)  # Controller target network
    # self.meta_controller = Model(in_channels=4, num_actions=10)
    self.Q = None    # Meta-Controller Q network
    self.Q_t = None  # Meta-Controller target network
def main():
    max_episodes = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name='main')
        targetDQN = DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name='target', src_scope_name='main')
        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("Episode: {} Steps: {}".format(episode, step_count))

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)
                print("Loss: ", loss)
                sess.run(copy_ops)

        bot_play(mainDQN)
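# replay_train() is called above but not defined in this section. A minimal
# sketch in the style of the TF1 tutorials this snippet follows, assuming the
# DQN class exposes predict(state) and update(x_stack, y_stack) (both method
# names are assumptions), with DISCOUNT_RATE standing in for the discount
# constant this script would define elsewhere:
def replay_train(mainDQN, targetDQN, train_batch):
    x_stack = np.empty(0).reshape(0, mainDQN.input_size)
    y_stack = np.empty(0).reshape(0, mainDQN.output_size)
    for state, action, reward, next_state, done in train_batch:
        Q = mainDQN.predict(state)
        # Bellman target: bootstrap from the frozen target network unless terminal
        Q[0, action] = reward if done else reward + DISCOUNT_RATE * np.max(targetDQN.predict(next_state))
        x_stack = np.vstack([x_stack, state])
        y_stack = np.vstack([y_stack, Q])
    return mainDQN.update(x_stack, y_stack)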
def __init__(self, config: Config):
    self.config = config
    self.is_training = True
    self.buffer = ReplayBuffer(self.config.max_buff)

    self.model = DQN(self.config.state_dim, self.config.action_dim).cuda()
    self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

    if self.config.use_cuda:
        self.cuda()
def __init__(self, env, model, optimizer, criterion, reward_func, config):
    super(DQNSolver, self).__init__(env, model, optimizer, criterion, reward_func, config)
    self.init_eps, self.final_eps, self.eps_step = config.init_eps, config.final_eps, config.eps_step
    self.target = DQN(in_c=config.in_c, num_actions=config.num_actions).to(self.device)
    self.target.load_state_dict(self.model.state_dict())
    self.batch_size, self.num_actions = config.batch_size, config.num_actions
    self.reward_mean, self.reward_list = None, deque(maxlen=config.display_interval)
    self.epsilon = self.init_eps
    if config.visdom:
        self._build_visdom()
def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.saved_model_path = args.saved_model_path
    self.experiment = args.experiment
    self.plots_path = args.plots_path
    self.data_save_path = args.data_save_path

    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model and os.path.isfile(args.model):
        # Always load tensors onto CPU by default, will shift to GPU if necessary
        self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
    self.online_net.train()

    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)

    # list of layers:
    self.online_net_layers = [self.online_net.conv1, self.online_net.conv2, self.online_net.conv3,
                              self.online_net.fc_h_v, self.online_net.fc_h_a,
                              self.online_net.fc_z_v, self.online_net.fc_z_a]
    self.target_net_layers = [self.target_net.conv1, self.target_net.conv2, self.target_net.conv3,
                              self.target_net.fc_h_v, self.target_net.fc_h_a,
                              self.target_net.fc_z_v, self.target_net.fc_z_a]

    # freeze all layers except the last, and reinitialize the last
    if args.freeze_layers > 0:
        self.freeze_layers(args.freeze_layers)
    if args.reinitialize_layers > 0:
        self.reinit_layers(args.reinitialize_layers)
def __init__(self, param_server, batch_size, num_channels, num_actions):
    self.learner_network = DQN(num_channels, num_actions).cuda().float()
    self.learner_target_network = DQN(num_channels, num_actions).cuda().float()
    self.count = 0
    self.batch_size = batch_size
    self.writer = SummaryWriter('runs/apex/learner')
    self.lr = LR
    self.optimizer = optim.Adam(self.learner_network.parameters(), self.lr)
    self.param_server = param_server
def __init__(self, env, taskCount=100, alpha=0.4, hiddenSize=500, perfix=''):
    self.GAMMA = 0.99
    self.epsilon = 0.3
    self.epsilon_end = 0.05
    self.epsilon_decay = 200
    self.update_step = 20
    self.memory_size = 2000
    self.max_epoch = 500
    self.batch_size = 32
    # self.max_epoch = 500
    # self.batch_size = 1
    # self.memory_size = 1
    # self.update_step = 1
    self.hiddenSize = hiddenSize
    # self.save_path = '../Model/' + str(taskCount) + '-' + str(alpha) + perfix + '.pth'
    self.save_path = '../Model/' + perfix + '-' + str(taskCount) + '-' + str(alpha) + '.pth'

    # Variables
    self.var_phi = autograd.Variable(torch.Tensor(6), volatile=True)

    # For training
    self.var_batch_phi = autograd.Variable(torch.Tensor(self.batch_size, 6))
    self.var_batch_a = autograd.Variable(torch.LongTensor(self.batch_size, 1), requires_grad=False)
    self.var_batch_r = autograd.Variable(torch.Tensor(self.batch_size, 1))
    self.var_batch_phi_next = autograd.Variable(torch.Tensor(self.batch_size, 6))
    self.var_batch_r_mask = autograd.Variable(torch.Tensor(self.batch_size, 1), requires_grad=False)

    self.MP = MemoryReplay(self.memory_size, self.batch_size)

    self.dqn = DQN(hiddenSize=self.hiddenSize)
    self.target_dqn = DQN(hiddenSize=self.hiddenSize)
    self.target_dqn.load_state_dict(self.dqn.state_dict())

    self.optimz = optim.RMSprop(self.dqn.parameters(), lr=0.00025, alpha=0.9, eps=1e-02, momentum=0.0)
    self.env = env
def run():
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)

    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)

    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99
    total_steps = 0
    epsilon = 0.95
    endEpsilon = 0.01
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        if epsilon > endEpsilon:
            epsilon -= stepDrop

        while not done:
            steps += 1
            total_steps += 1
            a_out = policy_net.sample_action(state, epsilon)
            action_index = a_out
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)
            total_reward += reward

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break

            state_prime = converter(ENV_NAME, obs_prime).cuda()
            append_sample(memory, policy_net, target_net, state, action_index, reward, state_prime, done)
            state = state_prime

            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2, optimizer, total_steps)
                if total_steps % 2000 == 0:
                    update_target(policy_net, target_net)
def replay():
    print('waking up the brain cells..')

    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            # Advance the game with the chosen action, and receive the reward and whether the game is over.
            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            # Show the game at a speed humans can follow. ^^;
            time.sleep(0.3)

        print('Games played: %d Score: %d' % (episode + 1, total_reward))
def replay():
    print('wake up the brain...')

    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()
            state, reward, terminal = game.step(action)
            total_reward += reward
            brain.remember(state, action, reward, terminal)
            time.sleep(0.3)

        print('episode: %d, score: %d' % (episode + 1, total_reward))
def main():
    env = gym.make(config.ENV_NAME)
    agent = DQN(env)
    optimizer = optim.Adam(agent.parameters(), lr=0.001)
    finished = False

    for epoch in range(config.EPOCHS):
        state = env.reset()
        for step in range(config.ITERATIONS):
            action = agent.get_action(state, 'egreedy')
            next_state, reward, done, _ = env.step(action[0, 0])
            if done:
                reward = -1
            agent.replay_memory.push(Transition(
                config.FloatTensor([state]),
                action,
                config.FloatTensor([reward]),
                config.FloatTensor([next_state]) if not done else None))
            state = next_state

            if len(agent.replay_memory) >= config.BATCH_SIZE:
                batch = agent.replay_memory.sample(config.BATCH_SIZE)
                batch = Transition(*zip(*batch))
                non_final_mask = config.ByteTensor([s is not None for s in batch.next_state])
                non_final_next_state_batch = Variable(torch.cat([s for s in batch.next_state if s is not None]))
                state_batch = Variable(torch.cat(batch.state), requires_grad=False)
                action_batch = Variable(torch.cat(batch.action).view(-1, 1), requires_grad=False)
                reward_batch = Variable(torch.cat(batch.reward), requires_grad=False)

                q_values = agent(state_batch).gather(1, action_batch)
                s_values = Variable(torch.zeros(config.BATCH_SIZE).type(config.FloatTensor), requires_grad=False)
                # Detach next-state values so no gradient flows through the bootstrap target
                s_values[non_final_mask] = agent(non_final_next_state_batch).detach().max(1)[0]
                expected_q_values = config.GAMMA * s_values + reward_batch
                # Elementwise Huber loss between predicted and expected Q-values
                loss = F.smooth_l1_loss(q_values.squeeze(1), expected_q_values)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

        agent.epsilon = config.EPSILON_START - epoch / config.EPOCHS * (config.EPSILON_START - config.EPSILON_END)

        if epoch % config.TEST_INTERVAL == 0:
            sum_reward = 0
            for _epoch in range(config.TEST_EPOCHS):
                epoch_reward = 0
                state = env.reset()
                for step in range(config.TEST_ITERATIONS):
                    # env.render()
                    action = agent.get_action(state)  # default
                    state, reward, done, _ = env.step(action[0, 0])
                    if done:
                        break
                    epoch_reward += reward
                sum_reward += epoch_reward
            avg_reward = sum_reward / config.TEST_EPOCHS
            print('Epoch: {}, Average Reward: {}'.format(epoch, avg_reward))
            print('Current Epsilon:', agent.epsilon)
            if avg_reward > 195:
                finished = True
        if finished:
            break

    while True:
        state = env.reset()
        round_reward = 0
        for step in range(config.TEST_ITERATIONS):
            env.render()
            action = agent.get_action(state)  # default
            state, reward, done, _ = env.step(action[0, 0])
            if done:
                break
            round_reward += reward
        print('Round reward:', round_reward)
def train():
    print('waking up the brain cells..')

    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Epsilon controls when the next action is chosen by the DQN instead of at random.
    epsilon = 1.0
    # Frame count
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and fetch the current state.
        # The state is a screen of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # If a random value is below epsilon, pick a random action;
            # otherwise let the DQN choose, since the network has barely been
            # trained early on. At first almost all actions are random, and the
            # share of random actions shrinks until they are hardly used at all.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start decaying epsilon only after some episodes have passed,
            # because no learning has happened at the very beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Advance the game with the chosen action, and receive the reward and whether the game is over.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the brain; the remembered states are
            # used for training and for choosing the next action.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games played: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
def train():
    print('wake up the brain...')

    sess = tf.Session()
    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                epsilon -= 1 / 1000.

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        print('episode: %d, score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)