def __init__(self, state_size, action_size, seed, is_prioritized_sample=False):
    '''Initialize an Agent.

    Params
    ======
        state_size (int): the dimension of the state
        action_size (int): the number of actions
        seed (int): random seed
    '''
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed
    random.seed(seed)
    # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
    self.t_step = 0
    self.is_prioritized_sample = is_prioritized_sample

    self.qnetwork_local = QNetwork(self.state_size, self.action_size, seed).to(device)
    self.qnetwork_target = QNetwork(self.state_size, self.action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    if not self.is_prioritized_sample:
        self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
    else:
        self.replay_memory = PrioritizedReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
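
# NOTE: the ReplayMemory class itself is not shown above; this is a minimal
# sketch of a uniform-sampling buffer matching the (batch_size, buffer_size,
# seed) call signature used in __init__. The method names `add` and `sample`
# are assumptions, not taken from the source.
import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'next_state', 'done'])

class UniformReplayMemory:
    def __init__(self, batch_size, buffer_size, seed):
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)  # oldest transitions drop out first
        self.rng = random.Random(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniform sampling without replacement from the stored transitions.
        return self.rng.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)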
def __init__(self, sess, s_size, a_size, scope, queues, trainer):
    self.queue = queues[0]
    self.param_queue = queues[1]
    self.replaymemory = ReplayMemory(100000)
    self.sess = sess
    self.learner_net = network(s_size, a_size, scope, 20)

    self.q = self.learner_net.q
    self.Q = self.learner_net.Q

    self.actions_q = tf.placeholder(shape=[None, a_size, N], dtype=tf.float32)
    self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
    self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

    # Select the quantile values of the chosen actions via the one-hot mask.
    self.q_actiona = tf.multiply(self.q, self.actions_q)
    self.q_action = tf.reduce_sum(self.q_actiona, axis=1)

    # Per-quantile absolute TD error.
    self.u = tf.abs(self.q_target - self.q_action)
    self.loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

    self.local_vars = self.learner_net.local_vars  # tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
    self.gradients = tf.gradients(self.loss, self.local_vars)
    # grads, self.grad_norms = tf.clip_by_norm(self.gradients, 40.0)
    self.apply_grads = trainer.apply_gradients(
        zip(self.gradients, self.local_vars))
    self.sess.run(tf.global_variables_initializer())
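
# NOTE: hypothetical usage of the learner above, not part of the source. One
# optimisation step feeds a sampled batch into the graph; the batch layout
# follows the placeholder shapes, but the input placeholder name on
# learner_net (`inputs`) is an assumption.
loss_value, _ = learner.sess.run(
    [learner.loss, learner.apply_grads],
    feed_dict={
        learner.learner_net.inputs: states,    # [batch, s_size] (assumed name)
        learner.actions_q: actions_one_hot,    # [batch, a_size, N], one-hot over actions
        learner.q_target: target_quantiles,    # [batch, N]
        learner.ISWeights: is_weights,         # [batch, N], importance-sampling weights
    })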
def __init__(self, policy_net, target_net, durability, optimizer, name, constants):
    """An agent class that takes action on the environment and optimizes
    the action based on the reward.

    Parameters
    ----------
    policy_net : DQN
        The network that is trained and used to select actions
    target_net : DQN
        A periodically synchronized copy of policy_net used to compute targets
    durability : int
        [description]
    optimizer : [type]
        The optimizer used to update policy_net
    name : str
        The name of agent
    constants : Constants
        The hyper-parameters from Constants class
    """
    self.CONSTANTS = constants
    self.policy_net = policy_net
    self.target_net = target_net
    self.target_net.load_state_dict(policy_net.state_dict())
    self.durability = durability
    self.optimizer = optimizer
    self.name = name
    self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
    self.steps_done = 0
    self.total_reward = 0.0
    self.reward = 0.0
    self.obtained_reward = 0.0
    self.n_best = 0
    self.policy_net_flag = False
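
# NOTE: hypothetical usage sketch, not from the source. An agent like the one
# above typically re-syncs target_net with policy_net every fixed number of
# optimisation steps; TARGET_UPDATE is an assumed constant.
def maybe_update_target(agent, TARGET_UPDATE=1000):
    if agent.steps_done % TARGET_UPDATE == 0:
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
        agent.target_net.eval()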
def __init__(self, env, mode, pre_trained_model, tensorboard_writer=None):
    super(DQNAgent, self).__init__(env, mode, tensorboard_writer)
    self.agent_name = 'DQN' + str(self.agent_no)
    self.memory = ReplayMemory()
    self.network = DeepQNetwork(self.obs_space[0], self.action_space)
    if self.mode == 'play':
        self.network.load_params(pre_trained_model)
        self.network.eval()
    elif self.mode == 'train':
        self.eval_network = DeepQNetwork(self.obs_space[0], self.action_space)
        self.eval_network.eval()
        if pre_trained_model:
            self.eval_network.load_params(pre_trained_model)
        self.optimizer = optim.RMSprop(self.network.parameters(), lr=LR)
        self.loss_func = SmoothL1Loss()
    else:
        raise ValueError(
            'Please set a valid mode for the agent (play or train)')
def __init__(self, load_checkpoint, n_states, n_actions, checkpoint_file,
             mem_size=10**6, batch_size=64, n_hid1=400, n_hid2=300,
             alpha=1e-4, beta=1e-3, gamma=0.99, tau=0.99):
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.actor = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, alpha,
                              checkpoint_file, name='actor')
    self.critic = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, beta,
                                checkpoint_file, name='critic')
    self.actor_target = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, alpha,
                                     checkpoint_file, name='actor_target')
    self.critic_target = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, beta,
                                       checkpoint_file, name='critic_target')
    self.noise = OUActionNoise(mu=np.zeros(n_actions))
    self.memory = ReplayMemory(mem_size, n_states, n_actions)
    self.update_network_parameters_phil(tau=1)
    if load_checkpoint:
        self.actor.eval()
    self.load_checkpoint = load_checkpoint
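
# NOTE: the body of update_network_parameters_phil is not shown above; a
# DDPG-style soft (Polyak) update usually looks like this sketch. Calling it
# with tau=1 in __init__ then amounts to a hard copy into the targets.
def soft_update(net, target_net, tau):
    for param, target_param in zip(net.parameters(), target_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)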
def main(game, episodes, training_mode=False, log=False, no_ops=30):
    env = gym.make(game)
    num_actions = env.action_space.n
    dqn = DeepQNetwork(num_actions, (4, 84, 84))
    replay = ReplayMemory(100000)
    obs = env.reset()
    h, w, c = obs.shape
    phi = Phi(4, 84, 84, c, h, w)
    agent = Agent(replay, dqn, training_mode=training_mode)
    stats = Stats('results/results.csv')

    for i_episode in range(episodes):
        env.reset()
        # Random number of no-op actions at episode start for state diversity.
        for i in range(random.randint(1, no_ops)):
            observation, _, _, _ = env.step(0)
        pre_state = phi.add(observation)
        game_score = 0
        done = False
        t = 0
        while not done:
            t += 1
            env.render()
            action = agent.get_action(pre_state)
            observation, reward, done, _ = env.step(action)
            post_state = phi.add(observation)
            if training_mode:
                agent.update_replay_memory(pre_state, action, reward, post_state, done)
                if agent.time_step > agent.replay_start_size:
                    stats.log_time_step(agent.get_loss())
            pre_state = post_state
            game_score += reward
        print("Episode {} finished after {} time steps with score {}".format(
            i_episode, t, game_score))
        phi.reset()
        if agent.time_step > agent.replay_start_size:
            stats.log_game(game_score, t)
    stats.close()
    if log:
        dqn.save_model('results/model_weights.hdf5')
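
# NOTE: hypothetical invocation, not part of the source; any gym Atari
# environment id that yields RGB frames and a discrete action space fits here.
if __name__ == '__main__':
    main('Breakout-v0', episodes=100, training_mode=True, log=True)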
def __init__(self, env, name, s_size, a_size, trainer, model_path, global_episodes):
    self.name = "worker_" + str(name)
    self.number = name
    self.model_path = model_path
    self.trainer = trainer
    self.global_episodes = global_episodes
    self.increment = self.global_episodes.assign_add(1)
    self.episode_rewards = []
    self.episode_lengths = []
    self.episode_mean_values = []

    # Create the local copy of the network and the tensorflow op to copy
    # global parameters to the local network
    self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
    self.update_local_ops = update_target_graph('global', self.name)

    self.env = env
    self.replaymemory = ReplayMemory(max_memory)
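
# NOTE: update_target_graph is referenced above but not defined in this
# snippet; this is the standard A3C-style helper that builds ops copying the
# global network's trainable variables into a worker's local copy (a sketch,
# not the source implementation).
def update_target_graph(from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]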
def main():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env_evaluate = PLE(game, fps=30, display_screen=False)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the "up" key plus one other (no-op) action, so 2 in total

    # rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)
    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6)  # probability of exploring decreases during training

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # while rpm.size() < MEMORY_WARMUP_SIZE:  # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 5000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env_evaluate)
        logger.info('episode:{} test_reward:{}'.format(episode, eval_reward))

    agent.save('./model_dir')
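
# NOTE: run_episode and evaluate are called above but not defined in this
# snippet. This is a sketch of run_episode under the assumptions that env is a
# PLE instance (reset_game/act/game_over/getGameState) and that the PARL agent
# exposes a sample() method; the rpm.append layout is also an assumption. The
# learning call (sampling a batch from rpm and invoking agent.learn) is
# omitted for brevity.
def run_episode(agent, env, rpm):
    env.reset_game()
    obs = list(env.getGameState().values())
    total_reward = 0
    while not env.game_over():
        action_idx = agent.sample(obs)  # epsilon-greedy action index
        reward = env.act(env.getActionSet()[action_idx])
        next_obs = list(env.getGameState().values())
        rpm.append((obs, action_idx, reward, next_obs, env.game_over()))
        obs = next_obs
        total_reward += reward
    return total_reward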
def __init__(self, num_states, num_actions, Double, Dueling, PER):
    self.num_actions = num_actions  # number of possible actions (2)
    self.Double = Double
    self.Dueling = Dueling
    self.PER = PER

    # Create the memory object that stores transitions
    self.memory = ReplayMemory(CAPACITY)

    # Build the networks
    n_in, n_mid, n_out = num_states, 32, num_actions
    self.main_q_network = Net(n_in, n_mid, n_out, Dueling)    # use the Net class
    self.target_q_network = Net(n_in, n_mid, n_out, Dueling)  # use the Net class
    print(self.main_q_network)  # print the network architecture

    # Choose the optimization method
    self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)

    # PER - create the memory object that stores TD errors
    if self.PER:
        self.td_error_memory = TDerrorMemory(CAPACITY)
def __init__(self, dim):
    self.critic_path = cst.CN_CKPT_PATH
    self.actor_path = cst.AN_CKPT_PATH
    self.replaymemory_path = cst.RM_PATH

    self.dim_body = dim[0]
    self.dim_sensor = dim[1]
    self.dim_state = dim[0] + dim[1] * 3
    self.dim_action = dim[2]

    self.sess = tf.InteractiveSession()

    self.act_lr = cst.ACT_LEARNING_RATE
    self.cri_lr = cst.CRI_LEARNING_RATE
    self.tau = cst.TAU
    self.batch_size = cst.BATCH_SIZE
    self.gamma = cst.REWARD_DECAY

    self.actorNN = ActorNetwork(self.sess, self.dim_state, self.dim_action,
                                self.act_lr, self.tau, self.batch_size)
    self.criticNN = CriticNetwork(self.sess, self.dim_state, self.dim_action,
                                  self.cri_lr, self.tau, self.gamma,
                                  self.actorNN.get_num_trainable_vars())

    self.sess.run(tf.global_variables_initializer())

    self.actorNN.update_target_network()
    self.criticNN.update_target_network()

    self.rm = ReplayMemory('DDPG')

    self.agent_count = cst.AGENT_COUNT
    self.exploration_rate = cst.EXPLORATION_RATE
    self.epsilon = cst.CRITIC_EPSILON
    self.LOSS_ITERATION = cst.LOSS_ITERATION

    self.expl_noise = OUNoise(self.dim_action)
    self.expl = False
    self.expl_decay = cst.EXPLORATION_DECAY
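
# NOTE: OUNoise is instantiated above but not defined in this snippet; this is
# the usual Ornstein-Uhlenbeck process used for DDPG exploration (a sketch;
# the theta/sigma defaults are assumptions, not values from the source).
import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # Mean-reverting random walk: drift toward mu plus Gaussian perturbation.
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state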
N_EPOCHS = 4
N_SAMPLES = 1000
SAMPLE_LENGTH = 15
memory_capacity = 2000
GAMMA = .997
LAMBDA = .95
EPSILON = .2
TARGET_DISCOUNT = .4
N_TIMESTEPS_PER_UPDATE = 300

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Initialization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
env = gym.make('CartPole-v1')
replay_memory = ReplayMemory(memory_capacity)

policy_net = Actor(sum(env.observation_space.shape), 200, env.action_space.n)
value_net = Critic(sum(env.observation_space.shape), 200, 1)
target_value_net = Critic(sum(env.observation_space.shape), 200, 1)
target_value_net.load_state_dict(value_net.state_dict())
target_value_net.eval()

params = list(policy_net.parameters()) + list(value_net.parameters())
optimizer = optim.SGD(params, lr=1e-3, momentum=.9, weight_decay=1e-6)

writer = SummaryWriter()
reward_normalizer = RewardNormalizer()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
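
# NOTE: not part of the original snippet. With GAMMA and LAMBDA defined above,
# an actor-critic update of this kind typically computes generalised advantage
# estimates over a rollout; this is a minimal sketch of that calculation for a
# single uninterrupted rollout (no episode-boundary masking).
def compute_gae(rewards, values, next_value, gamma=GAMMA, lam=LAMBDA):
    advantages, gae = [], 0.0
    values = list(values) + [next_value]
    for t in reversed(range(len(rewards))):
        # One-step TD residual, then exponentially weighted accumulation.
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages.insert(0, gae)
    return advantages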
def __init__(self, load_checkpoint, checkpoint_file, env, n_states, n_actions,
             update_actor_interval=2, warmup=1000, mem_size=10**6, batch_size=100,
             n_hid1=400, n_hid2=300, lr_alpha=1e-3, lr_beta=1e-3, gamma=0.99,
             tau=5e-3, noise_mean=0, noise_sigma=0.1):
    self.load_checkpoint = load_checkpoint
    self.checkpoint_file = checkpoint_file
    # needed for clamping in the learn function
    self.env = env
    self.max_action = float(env.action_space.high[0])
    self.low_action = float(env.action_space.low[0])
    self.n_actions = n_actions
    # to keep track of how often we call the "learn" function, for the actor network
    self.learn_step_counter = 0
    # to handle the countdown to the end of the warmup period, incremented every time we call an action
    self.time_step = 0
    self.update_actor_interval = update_actor_interval
    self.warmup = warmup
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.noise_mean = noise_mean
    self.noise_sigma = noise_sigma

    self.actor = TD3ActorNetwork(n_states, n_actions, n_hid1, n_hid2, lr_alpha,
                                 checkpoint_file, name='actor')
    self.target_actor = TD3ActorNetwork(n_states, n_actions, n_hid1, n_hid2, lr_alpha,
                                        checkpoint_file, name='target_actor')
    self.critic_1 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta,
                                     checkpoint_file, name='critic_1')
    self.critic_2 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta,
                                     checkpoint_file, name='critic_2')
    self.target_critic_1 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta,
                                            checkpoint_file, name='target_critic_1')
    self.target_critic_2 = TD3CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr_beta,
                                            checkpoint_file, name='target_critic_2')

    self.memory = ReplayMemory(mem_size, n_states, n_actions)

    # tau=1 performs an exact copy of the networks to the respective targets
    # self.update_network_parameters(tau=1)
    self.update_network_parameters(self.actor, self.target_actor, tau=1)
    self.update_network_parameters(self.critic_1, self.target_critic_1, tau=1)
    self.update_network_parameters(self.critic_2, self.target_critic_2, tau=1)
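
# NOTE: a sketch of how the noise parameters above are typically used in a TD3
# learn step, not taken verbatim from the source: target actions are perturbed
# with clipped Gaussian noise and clamped to the valid action range. The
# noise_clip default is an assumption.
import torch

def smoothed_target_action(agent, next_states, noise_clip=0.5):
    target_action = agent.target_actor(next_states)
    noise = torch.randn_like(target_action) * agent.noise_sigma + agent.noise_mean
    noise = noise.clamp(-noise_clip, noise_clip)
    return (target_action + noise).clamp(agent.low_action, agent.max_action)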
image_dimensions = 210 * 160 * 3
num_episodes = 50
target_episode_update = 5
action_threshold = 250
train_batch_size = 64
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
steps_done = 0
n_actions = env.action_space.n
screen_height = 210
screen_width = 160

memory = ReplayMemory(10000)
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())  # start both nets from the same weights
target_net.eval()
optimizer = optim.RMSprop(policy_net.parameters())


def optimize_model():
    if len(memory) < train_batch_size:
        return
    transitions = memory.sample(train_batch_size)
    print('Training on:', len(transitions))
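    # NOTE: the remainder of optimize_model is not shown in this snippet. The
    # continuation below is a sketch following the standard PyTorch DQN recipe;
    # it assumes a Transition namedtuple with (state, action, next_state, reward)
    # fields and `torch.nn.functional as F` imported elsewhere in the file.
    batch = Transition(*zip(*transitions))
    non_final_mask = torch.tensor(tuple(s is not None for s in batch.next_state),
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a) for the actions actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # max_a Q_target(s_{t+1}, a); zero for terminal states.
    next_state_values = torch.zeros(train_batch_size, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()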
def __init__(self,
             game,
             mem_size=1000000,
             state_buffer_size=4,
             batch_size=64,
             learning_rate=1e-5,
             pretrained_model=None,
             frameskip=4):
    """
    Inputs:
    - game: string to select the game
    - mem_size: int length of the replay memory
    - state_buffer_size: int number of recent frames used as input for neural network
    - batch_size: int
    - learning_rate: float
    - pretrained_model: str path to the model
    - frameskip: int number of frames to skip per action
    """
    # Namestring
    self.game = game

    # Environment
    self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

    # Cuda
    self.use_cuda = torch.cuda.is_available()

    # Neural network
    self.net = DQN(channels_in=state_buffer_size,
                   num_actions=self.env.get_number_of_actions())
    self.target_net = DQN(channels_in=state_buffer_size,
                          num_actions=self.env.get_number_of_actions())
    if self.use_cuda:
        self.net.cuda()
        self.target_net.cuda()

    if pretrained_model:
        self.net.load(pretrained_model)
        self.target_net.load(pretrained_model)
        self.pretrained_model = True
    else:
        self.pretrained_model = False

    # Optimizer
    self.learning_rate = learning_rate
    self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
    #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate, alpha=0.95, eps=0.01)

    self.batch_size = batch_size
    self.optimize_each_k = 1
    self.update_target_net_each_k_steps = 10000
    self.noops_count = 0

    # Replay Memory (Long term memory)
    self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
    self.mem_size = mem_size

    # Fill replay memory before training
    if not self.pretrained_model:
        self.start_train_after = 50000
    else:
        self.start_train_after = mem_size // 2

    # Buffer for the most recent states (Short term memory)
    self.num_stored_frames = state_buffer_size

    # Steps
    self.steps = 0

    # Save net
    self.save_net_each_k_episodes = 500
def __init__(self,
             game,
             mem_size=512 * 512,  # 1024*512
             state_buffer_size=4,
             batch_size=64,
             learning_rate=1e-5,
             pretrained_model=None,
             frameskip=4,  # 1
             record=False):
    """
    Inputs:
    - game: string to select the game
    - mem_size: int length of the replay memory
    - state_buffer_size: int number of recent frames used as input for neural network
    - batch_size: int
    - learning_rate: float
    - pretrained_model: str path to the model
    - record: boolean to enable record option
    """
    # Namestring
    self.game = game

    # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders)
    #if self.game == 'Breakout-v0':
    #    dimensions = (32, 195, 8, 152)
    #elif self.game == 'SpaceInvaders-v0':
    #    dimensions = (21, 195, 20, 141)
    #elif self.game == 'Assault-v0':
    #    dimensions = (50, 240, 5, 155)
    #elif self.game == 'Phoenix-v0':
    #    dimensions = (23, 183, 0, 160)
    #elif self.game == 'Skiing-v0':
    #    dimensions = (55, 202, 8, 152)
    #elif self.game == 'Enduro-v0':
    #    dimensions = (50, 154, 8, 160)
    #elif self.game == 'BeamRider-v0':
    #    dimensions = (32, 180, 9, 159)
    if self.game == 'BreakoutAndSpace':
        dimensions_break = (32, 195, 8, 152)
        dimensions_space = (21, 195, 20, 141)
    else:
        raise ValueError(
            'This version is for playing Breakout and SpaceInvaders at the same time.')

    # Environment
    self.env_break = Environment('BreakoutNoFrameskip-v4', dimensions_break,
                                 frameskip=frameskip)
    self.env_space = Environment('SpaceInvaders-v0', dimensions_space,
                                 frameskip=frameskip)

    # Cuda
    self.use_cuda = torch.cuda.is_available()

    # Neural network
    self.net = DQN(channels_in=state_buffer_size,
                   num_actions=self.env_space.get_number_of_actions())
    self.target_net = DQN(channels_in=state_buffer_size,
                          num_actions=self.env_space.get_number_of_actions())
    if self.use_cuda:
        self.net.cuda()
        self.target_net.cuda()

    if pretrained_model:
        self.net.load(pretrained_model)
        self.target_net.load(pretrained_model)
        self.pretrained_model = True
    else:
        self.pretrained_model = False

    # Optimizer
    self.learning_rate = learning_rate
    self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
    #self.optimizer = optim.RMSprop(self.net.parameters(), lr=0.00025, alpha=0.95, eps=0.01)

    self.batch_size = batch_size
    self.optimize_each_k = 4
    self.update_target_net_each_k_steps = 10000
    self.noops_count = 0

    # Replay Memory (Long term memory)
    self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
    self.mem_size = mem_size

    # Fill replay memory before training
    if not self.pretrained_model:
        self.start_train_after = 25000
    else:
        self.start_train_after = mem_size // 2

    # Buffer for the most recent states (Short term memory)
    self.num_stored_frames = state_buffer_size

    # Steps
    self.steps = 0

    # Save net
    self.save_net_each_k_episodes = 500
EPISODES = 500
START_RANDOM = False
MAX_EPISODE_COUNTER = 3600 * 24 * 2.0 / PERIOD
ACTION_DIM = 1
STATE_DIM = 6
ACTION_MAX = 1.0
MAX_BUFFER = 100000
MAX_TOTAL_REWARD = 300
EPISODE_PLOT = 25

# -------------------------------------------- #
# LOAD USEFUL CLASSES.
# -------------------------------------------- #

# Load the memory.
memory = ReplayMemory(MAX_BUFFER)

# Load the environment.
env = Environment(FILENAME, QUOTE_QTY, TRADE_QTY)

# Load the trainer.
trainer = Trainer(STATE_DIM, ACTION_DIM, ACTION_MAX, memory)

# Load the window.
window = Window(LOOK_BACK)
window.add_norm("#t", method="log_change", ref="close_price_#t")

# Load the tensorboard writer.
writer = SummaryWriter("tensorboard/runs")

# -------------------------------------------- #
def __init__(self, game, agent_type, display, load_model, record, test):
    self.name = game
    self.agent_type = agent_type
    self.ale = ALEInterface()
    self.ale.setInt(str.encode('random_seed'), np.random.randint(100))
    self.ale.setBool(str.encode('display_screen'), display or record)
    if record:
        self.ale.setString(str.encode('record_screen_dir'),
                           str.encode('./data/recordings/{}/{}/tmp/'.format(game, agent_type)))

    self.ale.loadROM(str.encode('./roms/{}.bin'.format(self.name)))
    self.action_list = list(self.ale.getMinimalActionSet())
    self.frame_shape = np.squeeze(self.ale.getScreenGrayscale()).shape
    if test:
        self.name += '_test'

    if 'space_invaders' in self.name:
        # Account for blinking bullets
        self.frameskip = 2
    else:
        self.frameskip = 3

    self.frame_buffer = deque(maxlen=4)
    if load_model and not record:
        self.load_replaymemory()
    else:
        self.replay_memory = ReplayMemory(500000, 32)

    model_input_shape = self.frame_shape + (4,)
    model_output_shape = len(self.action_list)

    if agent_type == 'dqn':
        self.model = DeepQN(
            model_input_shape,
            model_output_shape,
            self.action_list,
            self.replay_memory,
            self.name,
            load_model
        )
    elif agent_type == 'double':
        self.model = DoubleDQN(
            model_input_shape,
            model_output_shape,
            self.action_list,
            self.replay_memory,
            self.name,
            load_model
        )
    else:
        self.model = DuelingDQN(
            model_input_shape,
            model_output_shape,
            self.action_list,
            self.replay_memory,
            self.name,
            load_model
        )

    print('{} Loaded!'.format(' '.join(self.name.split('_')).title()))
    print('Displaying: ', display)
    print('Frame Shape: ', self.frame_shape)
    print('Frame Skip: ', self.frameskip)
    print('Action Set: ', self.action_list)
    print('Model Input Shape: ', model_input_shape)
    print('Model Output Shape: ', model_output_shape)
    print('Agent: ', agent_type)
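
# NOTE: hypothetical helper, not from the source. The frame_buffer deque above
# holds the last four grayscale frames; stacking them on the channel axis
# yields a state matching model_input_shape = frame_shape + (4,).
import numpy as np

def stacked_state(frame_buffer):
    return np.stack(frame_buffer, axis=-1)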
if not os.path.exists(model_path):
    os.makedirs(model_path)

env = get_env(task)
a_size = env.action_space.n

global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False)
trainer = tf.train.AdamOptimizer(learning_rate=0.00015)
num_workers = 4
batch_size = 10
max_memory = 300000
replaymemory = ReplayMemory(max_memory)
saver = tf.train.Saver(max_to_keep=5)
lock = threading.Lock()

with tf.Session() as sess:
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()
    ROLLING_EVENT.set()
    GLOBAL_STEP = 0
    coord = tf.train.Coordinator()
    master_network = Apex_Network(sess, s_size, a_size, 'global', trainer)
    workers = []
    for i in range(num_workers):
        env = get_env(task)
        # Assumption: each worker is built with the constructor shown earlier,
        # (env, name, s_size, a_size, trainer, model_path, global_episodes).
        workers.append(
            Worker(env, i, s_size, a_size, trainer, model_path, global_episodes))
def __init__(self, load_checkpoint, checkpoint_file, env, n_states, n_actions,
             mem_size=10**6, batch_size=256, n_hid1=256, n_hid2=256, lr=3e-4,
             gamma=0.99, tau=5e-3, reward_scale=2):
    self.load_checkpoint = load_checkpoint
    self.max_action = float(env.action_space.high[0])
    self.low_action = float(env.action_space.low[0])
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.reward_scale = reward_scale
    self.memory_counter = 0
    self.memory = ReplayMemory(mem_size, n_states, n_actions)

    self.actor = ActorNetwork(n_states, n_actions, n_hid1, n_hid2, self.max_action,
                              lr, checkpoint_file, name='_actor')
    self.critic_1 = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr,
                                  checkpoint_file, name='_critic1')
    self.critic_2 = CriticNetwork(n_states, n_actions, n_hid1, n_hid2, lr,
                                  checkpoint_file, name='_critic2')
    self.value_net = ValueNetwork(n_states, n_hid1, n_hid2, lr, checkpoint_file,
                                  name='_value')
    self.target_value_net = ValueNetwork(n_states, n_hid1, n_hid2, lr, checkpoint_file,
                                         name='_value_target')

    # tau=1 performs an exact copy of the networks to the respective targets
    # self.update_network_parameters(tau=1)
    self.update_network_parameters(self.value_net, self.target_value_net, tau=1)
def __init__(self,
             game1,
             game2,
             mem_size=1000000,
             state_buffer_size=4,
             batch_size=64,
             learning_rate=1e-5,
             pretrained_model=None,
             pretrained_subnet1=False,
             pretrained_subnet2=False,
             frameskip=4,
             frozen=False):
    """
    Inputs:
    - game1: string to select the first game
    - game2: string to select the second game
    - mem_size: int length of the replay memory
    - state_buffer_size: int number of recent frames used as input for neural network
    - batch_size: int
    - learning_rate: float
    - pretrained_model: str path to the model
    - pretrained_subnet1: str path to the model of the first subnet
    - pretrained_subnet2: str path to the model of the second subnet
    - frozen: boolean to freeze pretrained subnets
    """
    # Namestring
    self.game1 = game1
    self.game2 = game2

    # Environment
    self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip)
    self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip)

    # Neural net
    self.pretrained_subnet1 = pretrained_subnet1
    self.pretrained_subnet2 = pretrained_subnet2
    self.net = TwinDQN(channels_in=state_buffer_size,
                       num_actions=self.env2.get_number_of_actions(),
                       pretrained_subnet1=pretrained_subnet1,
                       pretrained_subnet2=pretrained_subnet2,
                       frozen=frozen)
    self.target_net = TwinDQN(channels_in=state_buffer_size,
                              num_actions=self.env2.get_number_of_actions(),
                              pretrained_subnet1=pretrained_subnet1,
                              pretrained_subnet2=pretrained_subnet2,
                              frozen=frozen)

    # Cuda
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.net.cuda()
        self.target_net.cuda()

    # Pretrained
    if pretrained_model:
        self.net.load(pretrained_model)
        self.target_net.load(pretrained_model)
        self.pretrained_model = True
    else:
        self.pretrained_model = False

    # Optimizer
    self.learning_rate = learning_rate
    self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()),
                                lr=learning_rate)
    #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()),
    #                               lr=learning_rate, alpha=0.95, eps=0.01)

    self.batch_size = batch_size
    self.optimize_each_k = 1
    self.update_target_net_each_k_steps = 10000
    self.noops_count = 0

    # Replay Memory (Long term memory)
    self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
    self.mem_size = mem_size

    # Fill replay memory before training
    if not self.pretrained_model:
        self.start_train_after = 50000
    else:
        self.start_train_after = mem_size // 2

    # Buffer for the most recent states (Short term memory)
    self.num_stored_frames = state_buffer_size

    # Steps
    self.steps = 0

    # Save net
    self.save_net_each_k_episodes = 500