def DQN(args):
    # Initialize replay memory D to capacity N
    memory = ReplayBuffer(size=args.replay_mem_size, frame_history_len=4)
    exploration = LinearExplorationSchedule(args.eps_start, args.eps_end, args.eps_decay)
    #exploration = ExponentialExplorationSchedule(args.eps_start, args.eps_end, args.eps_decay)

    # Initialize action-value function Q with random weights
    D = PRE_PROCESS_OUTPUT_DIM
    n_actions = encode_action.n_actions = args.num_actions
    q_target = QNet(n_actions=n_actions, n_input_ch=history_len * n_channels,
                    input_shape=(D, D)).to(get_device())
    q_behavior = QNet(n_actions=n_actions, n_input_ch=history_len * n_channels,
                      input_shape=(D, D)).to(get_device())
    q_target.eval()
    # Freeze target network
    for p in q_target.parameters():
        p.requires_grad = False
    q_behavior.train()
    # Copy the weights, so both Q-value approximators initialize the same
    q_behavior.load_state_dict(q_target.state_dict())

    criterion = nn.MSELoss()
    #criterion = nn.SmoothL1Loss()  # Huber loss

    # "Human-level control through deep reinforcement learning" - RMSprop config
    LEARNING_RATE = 0.00025
    ALPHA = 0.95
    EPS = 0.01
    optimizer = torch.optim.RMSprop(q_behavior.parameters(),
                                    lr=LEARNING_RATE, alpha=ALPHA, eps=EPS)
    # , lr=0.00025, momentum=0.95, eps=0.01)

    reward_ema = ExponentialMovingAvg(args.reward_eam_factor)
    max_return = -np.inf
    cnt_transitions = 0
    for episode in itertools.count():
        with GameProgressBar(episode) as progress_bar:
            episode_return, n_transitions = run_episode(
                episode, memory, cnt_transitions, q_behavior, q_target,
                optimizer, criterion, exploration, progress_bar)
            reward_ema.update(episode_return)
            cnt_transitions += n_transitions
            if episode % args.target_update_rate == 0:
                update_target_network(q_behavior, q_target)
            max_return = max(max_return, episode_return)
            writer.add_scalar('running_return', reward_ema.value, episode)
            # print(f"End of episode {episode} (G={episode_return} "
            #       f"transitions={n_transitions} max_return={max_return} "
            #       f"reward_ema={reward_ema.value})")
            print(' '.join([f'reward={episode_return:.2f}',
                            f'running mean={reward_ema.value:.2f}']), end='')
    env.close()
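# Illustrative sketch only: DQN() above relies on a LinearExplorationSchedule and an
# update_target_network helper that are not shown in this snippet. The versions below
# are hypothetical stand-ins, not the original implementations.
class LinearExplorationSchedule:
    """Linearly anneal epsilon from eps_start to eps_end over eps_decay steps."""

    def __init__(self, eps_start, eps_end, eps_decay):
        self.eps_start, self.eps_end, self.eps_decay = eps_start, eps_end, eps_decay
        self.step_cnt = 0

    def value(self):
        frac = min(1.0, self.step_cnt / self.eps_decay)
        self.step_cnt += 1
        return self.eps_start + frac * (self.eps_end - self.eps_start)


def update_target_network(q_behavior, q_target):
    """Hard copy of the behavior network weights into the frozen target network."""
    q_target.load_state_dict(q_behavior.state_dict())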
def run_td_realtime(**kargs):
    if kargs['output_dir'] is None and kargs['logdir'] is not None:
        kargs['output_dir'] = kargs['logdir']
    from collections import namedtuple
    args = namedtuple("TDRealtimeParams", kargs.keys())(*kargs.values())
    if 'dont_init_tf' not in kargs.keys() or not kargs['dont_init_tf']:
        init_nn_library(True, "1")

    env = get_env(args.game, args.atari, args.env_transforms)
    envOps = EnvOps(env.observation_space.shape, env.action_space.n, args.learning_rate, mode="train")
    print(env.observation_space.low)
    print(env.observation_space.high)

    env_model = globals()[args.env_model](envOps)
    if args.env_weightfile is not None:
        env_model.model.load_weights(args.env_weightfile)
    v_model = globals()[args.vmodel](envOps)

    import numpy as np
    td_model = TDNetwork(env_model.model, v_model, envOps)

    summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None

    replay_buffer = ReplayBuffer(args.replay_buffer_size, 1, args.update_frequency,
                                 args.replay_start_size, args.batch_size)

    from utils.network_utils import NetworkSaver
    network_saver = NetworkSaver(args.save_freq, args.logdir, v_model.model)

    v_agent = VAgent(env.action_space, env_model, v_model, envOps, summary_writer,
                     True, replay_buffer, args.target_network_update)

    egreedyOps = EGreedyOps()
    if replay_buffer is not None:
        egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
    egreedyOps.mode = args.mode
    egreedyOps.test_epsilon = args.test_epsilon
    #egreedyOps.FINAL_EXPLORATION_FRAME = 10000
    if args.mode == "train":
        egreedyOps.FINAL_EXPLORATION_FRAME = args.egreedy_final_step

    if args.mode == "train":
        if args.egreedy_decay < 1:
            egreedyOps.DECAY = args.egreedy_decay
            egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, v_agent)
        else:
            egreedyAgent = MultiEGreedyAgent(env.action_space, egreedyOps, v_agent,
                                             args.egreedy_props, args.egreedy_final,
                                             final_exp_frame=args.egreedy_final_step)
    else:
        egreedyAgent = EGreedyAgent(env.action_space, egreedyOps, v_agent)

    runner = Runner(env, egreedyAgent, None, 1, max_step=args.max_step, max_episode=args.max_episode)
    runner.listen(replay_buffer, None)
    runner.listen(v_agent, None)
    runner.listen(egreedyAgent, None)
    runner.listen(network_saver, None)
    #runner.run()
    return runner, v_agent
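# Illustrative sketch only: the TDNetwork / VAgent pair above learns a state-value
# function; the underlying temporal-difference rule is
# V(s) <- V(s) + alpha * (r + gamma * V(s') - V(s)).
# A tabular TD(0) version for reference (hypothetical, unrelated to the Keras models used above):
def td0_update(V, s, r, s_next, done, alpha=0.1, gamma=0.99):
    """One tabular TD(0) update step on the value table V."""
    target = r if done else r + gamma * V[s_next]
    V[s] += alpha * (target - V[s])
    return V[s]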
def run(config):
    if config.saved_model is None:
        raise Exception("In Evaluation Mode, the saved model couldn't be None")
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    assert "NoFrameskip" in config.env, "Require environment with no frameskip"
    env = gym.make(config.env)
    env.seed(config.seed)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 4)
    # env = gym.wrappers.Monitor(env, './video/', video_callable=lambda episode_id: episode_id % 1 == 0, force=True)

    ifi = 1 / config.fps
    replay_buffer = ReplayBuffer(config.buffer_size)
    agent = DDPGAgent(env.observation_space, env.action_space, replay_buffer)

    print(f"Loading the networks parameters - {config.saved_model} ")
    agent.load_params(torch.load(config.saved_model))

    steps_count = 0
    for episode_i in range(config.num_episodes):
        state = env.reset()
        episode_reward = 0.0
        if config.display:
            env.render()
        while True:
            calc_start = time.time()
            action = agent.step(state)
            next_state, reward, done, info = env.step(action)
            episode_reward += reward
            steps_count += 1
            if done:
                break
            state = next_state
            if config.display:
                # Pace rendering to roughly config.fps frames per second.
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render()
        print("********************************************************")
        print("steps: {}".format(steps_count))
        print("episodes: {}".format(episode_i))
        print("episode reward: {}".format(episode_reward))
        print("********************************************************")
    env.close()
def run_dqn(**kargs):
    if kargs['output_dir'] is None and kargs['logdir'] is not None:
        kargs['output_dir'] = kargs['logdir']
    q_model_initial = kargs['q_model_initial'] if 'q_model_initial' in kargs else None
    from collections import namedtuple
    args = namedtuple("DQNParams", kargs.keys())(*kargs.values())
    if 'dont_init_tf' not in kargs.keys() or not kargs['dont_init_tf']:
        #init_nn_library(True, "1")
        init_nn_library("gpu" in kargs and kargs["gpu"] is not None,
                        kargs["gpu"] if "gpu" in kargs else "1")

    #if args.atari:
    #    env = gym_env(args.game + 'NoFrameskip-v0')
    #    env = WarmUp(env, min_step=0, max_step=30)
    #    env = ActionRepeat(env, 4)
    #    #q_model = A3CModel(modelOps)
    #else:
    #    if args.game == "Grid":
    #        env = GridEnv()
    #    else:
    #        env = gym_env(args.game)
    #    #q_model = TabularQModel(modelOps)
    #for trans in args.env_transforms:
    #    env = globals()[trans](env)
    if 'use_env' in kargs and kargs['use_env'] is not None:
        env = kargs['use_env']
    else:
        env = get_env(args.game, args.atari, args.env_transforms,
                      kargs['monitor_dir'] if 'monitor_dir' in kargs else None)

    if 'env_model' in kargs and kargs['env_model'] is not None and kargs['env_weightfile'] is not None:
        print('Using simulated environment')
        envOps = EnvOps(env.observation_space.shape, env.action_space.n, args.learning_rate)
        env_model = globals()[kargs['env_model']](envOps)
        env_model.model.load_weights(kargs['env_weightfile'])
        env = SimulatedEnv(env, env_model, use_reward='env_reward' in kargs and kargs['env_reward'])

    modelOps = DqnOps(env.action_count)
    modelOps.dueling_network = args.dueling_dqn

    viewer = None
    if args.enable_render:
        viewer = EnvViewer(env, args.render_step, 'human')
    if args.atari:
        proproc = PreProPipeline([GrayPrePro(), ResizePrePro(modelOps.INPUT_SIZE)])
        rewproc = PreProPipeline([RewardClipper(-1, 1)])
    else:
        if env.observation_space.__class__.__name__ == 'Discrete':
            modelOps.INPUT_SIZE = env.observation_space.n
        else:
            modelOps.INPUT_SIZE = env.observation_space.shape
        modelOps.AGENT_HISTORY_LENGTH = 1
        proproc = None
        rewproc = None

    modelOps.LEARNING_RATE = args.learning_rate
    if q_model_initial is None:
        q_model = globals()[args.model](modelOps)
    else:
        q_model = q_model_initial
    if args.load_weightfile is not None:
        q_model.model.load_weights(args.load_weightfile)

    summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None

    agentOps = DqnAgentOps()
    agentOps.double_dqn = args.double_dqn
    agentOps.mode = args.mode
    if args.mode == "train":
        agentOps.TARGET_NETWORK_UPDATE_FREQUENCY = args.target_network_update

    replay_buffer = None
    if args.replay_buffer_size > 0:
        if 'load_trajectory' in kargs and kargs['load_trajectory'] is not None:
            replay_buffer = TrajectoryReplay(kargs['load_trajectory'], kargs['batch_size'],
                                             args.update_frequency, args.replay_start_size)
        else:
            replay_buffer = ReplayBuffer(args.replay_buffer_size, modelOps.AGENT_HISTORY_LENGTH,
                                         args.update_frequency, args.replay_start_size, args.batch_size)
    #replay_buffer = NStepBuffer(modelOps.AGENT_HISTORY_LENGTH, 8)

    agent = DqnAgent(env.action_space, q_model, replay_buffer, rewproc, agentOps, summary_writer)

    egreedyOps = EGreedyOps()
    if replay_buffer is not None:
        egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
    egreedyOps.mode = args.mode
    egreedyOps.test_epsilon = args.test_epsilon
    #egreedyOps.FINAL_EXPLORATION_FRAME = 10000
    if args.mode == "train":
        egreedyOps.FINAL_EXPLORATION_FRAME = args.egreedy_final_step

    if args.mode == "train":
        if args.egreedy_decay < 1:
            egreedyOps.DECAY = args.egreedy_decay
            egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, agent)
        else:
            egreedyAgent = MultiEGreedyAgent(env.action_space, egreedyOps, agent,
                                             args.egreedy_props, args.egreedy_final,
                                             final_exp_frame=args.egreedy_final_step)
    else:
        egreedyAgent = EGreedyAgent(env.action_space, egreedyOps, agent)

    runner = Runner(env, egreedyAgent, proproc, modelOps.AGENT_HISTORY_LENGTH,
                    max_step=args.max_step, max_episode=args.max_episode)
    if replay_buffer is not None:
        runner.listen(replay_buffer, proproc)
    runner.listen(agent, None)
    runner.listen(egreedyAgent, None)
    if viewer is not None:
        runner.listen(viewer, None)
    if args.output_dir is not None:
        networkSaver = NetworkSaver(50000 if 'save_interval' not in kargs else kargs['save_interval'],
                                    args.output_dir, q_model.model)
        runner.listen(networkSaver, None)
    return runner, agent
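# Illustrative sketch only: the EGreedyOps / EGreedyAgent classes configured in
# run_dqn() are not shown here. This standalone selector mirrors the idea they
# encode: pure exploration while the replay buffer warms up, then a linear anneal
# of epsilon. All parameter names below are hypothetical.
import numpy as np

def epsilon_greedy_action(q_values, step, n_actions,
                          replay_start_size=50000,
                          final_exploration_frame=1000000,
                          initial_eps=1.0, final_eps=0.1,
                          rng=np.random):
    """Pick a random action with probability eps, otherwise the greedy one."""
    if step < replay_start_size:
        eps = initial_eps  # pure exploration while the replay buffer warms up
    else:
        frac = min(1.0, (step - replay_start_size) / final_exploration_frame)
        eps = initial_eps + frac * (final_eps - initial_eps)
    if rng.uniform() < eps:
        return rng.randint(n_actions)
    return int(np.argmax(q_values))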
# (continued from a truncated policy-network construction in the source snippet)
    (env.observation_space['observation'].shape[0] // 2) * 2,
    env.action_space.shape[0]).to(device)
crt = QNetwork((env.observation_space['observation'].shape[0] // 2) * 2,
               env.action_space.shape[0]).to(device)
tgt_crt = QNetwork((env.observation_space['observation'].shape[0] // 2) * 2,
                   env.action_space.shape[0]).to(device)
tgt_crt.load_state_dict(crt.state_dict())

policy_optim = optim.Adam(policy.parameters(), lr=lr)
crt_optim = optim.Adam(crt.parameters(), lr=lr)

# %%
noise = OUNoise(env.action_space)
memory = ReplayBuffer(1000000)

# %%
def dist(x, y):
    x = x.cpu().numpy()
    y = y.cpu().numpy()
    res = np.linalg.norm(x - y, axis=1)
    return torch.tensor(res).unsqueeze(1).to(device)

# %%
def train_policy(act_net, crt_net, tgt_crt_net, optimizer_act,
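# Illustrative sketch only: OUNoise used in the preceding snippet is not defined
# there. DDPG commonly uses Ornstein-Uhlenbeck noise,
# x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1),
# for temporally correlated exploration. A minimal hypothetical version:
import numpy as np

class SimpleOUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, rng=np.random):
        self.mu, self.theta, self.sigma, self.rng = mu, theta, sigma, rng
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones_like(self.state) * self.mu

    def sample(self):
        # Mean-reverting step plus Gaussian perturbation
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.randn(*self.state.shape)
        self.state = self.state + dx
        return self.state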
    # (closes a truncated model-factory method in the source snippet)
    return CartPoleModel(self.ops, m)

q_model = CartPoleModel(modelOps)

summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None

agentOps = DqnAgentOps()
agentOps.double_dqn = args.double_dqn
agentOps.TARGET_NETWORK_UPDATE_FREQUENCY = 20
#agentOps.REPLAY_START_SIZE = 100
#agentOps.FINAL_EXPLORATION_FRAME = 10000

replay_buffer = ReplayBuffer(int(2000), 1, 1, 1000, 64)
#replay_buffer = NStepBuffer(modelOps.AGENT_HISTORY_LENGTH, 8)

agent = DqnAgent(env.action_space, q_model, replay_buffer, rewproc, agentOps, summary_writer)

egreedyOps = EGreedyOps()
egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
egreedyOps.FINAL_EXPLORATION_FRAME = 10000
egreedyOps.FINAL_EXPLORATION = 0.01
egreedyOps.DECAY = 0.999
egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, agent)

runner = Runner(env, egreedyAgent, proproc, 1)
runner.listen(replay_buffer, proproc)
runner.listen(agent, None)
runner.listen(egreedyAgent, None)
def __init__(self, threadId, sess, graph):
    StoppableThread.__init__(self)
    self.threadId = threadId
    self.sess = sess
    self.graph = graph
    with self.graph.as_default():
        if args.atari:
            env = gym_env(args.game + 'NoFrameskip-v0')
            env = WarmUp(env, min_step=0, max_step=30)
            env = ActionRepeat(env, 4)
            proproc = PreProPipeline([GrayPrePro(), ResizePrePro(modelOps.INPUT_SIZE)])
            rewproc = PreProPipeline([RewardClipper(-1, 1)])
            #q_model = A3CModel(modelOps)
        else:
            if args.game == "Grid":
                env = GridEnv()
            else:
                env = gym_env(args.game)
            proproc = None
            rewproc = None
            #q_model = TabularQModel(modelOps)
        for trans in args.env_transforms:
            env = globals()[trans](env)

        if 'shared_model' in kargs and kargs['shared_model']:
            q_model = model
        else:
            q_model = globals()[args.model](modelOps)
            q_model.model_update = model.model
            q_model.set_weights(model.get_weights())

        summary_writer = tf.summary.FileWriter(args.logdir + '/thread-' + str(threadId),
                                               K.get_session().graph) if args.logdir is not None else None

        agentOps = DqnAgentOps()
        agentOps.double_dqn = args.double_dqn
        agentOps.REPLAY_START_SIZE = 1
        #agentOps.INITIAL_EXPLORATION = 0
        agentOps.TARGET_NETWORK_UPDATE_FREQUENCY = 1e10

        #replay_buffer = ReplayBuffer(int(1e6), 4, 4, agentOps.REPLAY_START_SIZE, 32)
        replay_buffer = None
        #if threadId > 0:
        if args.nstep > 0:
            replay_buffer = NStepBuffer(modelOps.AGENT_HISTORY_LENGTH, args.nstep)
        else:
            replay_buffer = ReplayBuffer(args.replay_buffer_size, modelOps.AGENT_HISTORY_LENGTH,
                                         args.update_frequency, args.replay_start_size, args.batch_size)

        #print(kargs['agent'])
        if kargs['agent'] == 'ActorCriticAgent':
            agent = ActorCriticAgent(env.action_space, q_model, replay_buffer, rewproc,
                                     agentOps, summary_writer, ac_model_update=model)
        else:
            agent = DqnAgent(env.action_space, q_model, replay_buffer, rewproc,
                             agentOps, summary_writer, model_eval=model_eval)

        egreedyAgent = None
        if threadId > 0 and kargs['agent'] != 'ActorCriticAgent':  # first thread is for testing
            egreedyOps = EGreedyOps()
            egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
            #egreedyOps.FINAL_EXPLORATION_FRAME = int(args.egreedy_final_step / args.thread_count)
            #if args.egreedy_decay < 1:
            #    egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, agent)
            #else:
            if len(args.egreedy_props) > 1 and args.egreedy_props[0] == round(args.egreedy_props[0]):
                cs = np.array(args.egreedy_props)
                cs = np.cumsum(cs)
                idx = np.searchsorted(cs, threadId)
                print('Egreedyagent selected', idx, args.egreedy_final[idx],
                      args.egreedy_decay[idx], args.egreedy_final_step[idx])
                egreedyAgent = MultiEGreedyAgent(env.action_space, egreedyOps, agent,
                                                 [1], [args.egreedy_final[idx]],
                                                 [args.egreedy_decay[idx]],
                                                 [args.egreedy_final_step[idx]])
            else:
                egreedyAgent = MultiEGreedyAgent(env.action_space, egreedyOps, agent,
                                                 args.egreedy_props, args.egreedy_final,
                                                 args.egreedy_decay, args.egreedy_final_step)

        self.runner = Runner(env, egreedyAgent if egreedyAgent is not None else agent,
                             proproc, modelOps.AGENT_HISTORY_LENGTH)
        if replay_buffer is not None:
            self.runner.listen(replay_buffer, proproc)
        self.runner.listen(agent, None)
        if egreedyAgent is not None:
            self.runner.listen(egreedyAgent, None)
        self.runner.listen(self, proproc)
        self.agent = agent
        self.q_model = q_model
    pass
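# Illustrative sketch only: the NStepBuffer used above is not shown. Its core idea
# is the n-step return target
# G_t = r_t + gamma*r_{t+1} + ... + gamma^{n-1}*r_{t+n-1} + gamma^n * V(s_{t+n}).
# A minimal hypothetical helper that computes such a target:
def n_step_return(rewards, bootstrap_value, gamma=0.99):
    """Discounted sum of an n-step reward window plus a bootstrapped tail value."""
    g = bootstrap_value
    for r in reversed(rewards):
        g = r + gamma * g
    return g

# Example: three rewards followed by a bootstrapped value of 2.0
# n_step_return([1.0, 0.0, 1.0], bootstrap_value=2.0, gamma=0.99)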
def run(config):
    model_dir = Path('./ddpg_models')
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1])
                         for folder in model_dir.iterdir()
                         if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    figures_dir = run_dir / 'figures'
    os.makedirs(str(run_dir), exist_ok=True)
    os.makedirs(str(figures_dir))

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    assert "NoFrameskip" in config.env, "Require environment with no frameskip"
    env = gym.make(config.env)
    env.seed(config.seed)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 4)
    # env = gym.wrappers.Monitor(env, './video/', video_callable=lambda episode_id: episode_id % 1 == 0, force=True)

    replay_buffer = ReplayBuffer(config.buffer_size)
    agent = DDPGAgent(env.observation_space, env.action_space, replay_buffer,
                      hidden_sizes=config.hidden_sizes,
                      critic_lr=config.critic_lr,
                      actor_lr=config.actor_lr,
                      batch_size=config.batch_size,
                      gamma=config.discounted_factor,
                      tau=config.tau)
    if config.saved_model:
        print(f"Loading the networks parameters - {config.saved_model} ")
        agent.load_params(torch.load(config.saved_model))

    total_rewards = [0.0]
    mean_100ep_rewards = []
    state = env.reset()
    for step_i in range(config.num_steps):
        if config.display:
            env.render()
        action = agent.step(state)
        next_state, reward, done, info = env.step(action)
        agent.replay_buffer.add(state, action, reward, next_state, float(done))
        state = next_state
        total_rewards[-1] += reward
        if done:
            state = env.reset()
            total_rewards.append(0.0)

        if len(replay_buffer) > config.batch_size and step_i > config.learning_start:
            agent.update()
            agent.update_target()

        if config.display:
            env.render()

        num_episode = len(total_rewards)
        if done and num_episode % config.print_freq == 0:
            mean_100ep_reward = round(np.mean(total_rewards[-101:-1]), 1)
            print("********************************************************")
            print("steps: {}".format(step_i))
            print("episodes: {}".format(num_episode))
            print("mean 100 episode reward: {}".format(mean_100ep_reward))
            print("********************************************************")
            with open(str(run_dir) + '/episodes_reward.csv', 'ab') as file:
                np.savetxt(file, total_rewards[-config.print_freq - 1:-1],
                           delimiter=',', fmt='%1.2f')
            mean_100ep_rewards.append(mean_100ep_reward)

        if done and num_episode % config.save_model_freq == 0:
            # os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            # agent.save(str(run_dir / 'incremental' / ('model_ep%i.pt' % num_episode)))
            agent.save(str(run_dir / 'model.pt'))

    agent.save(str(run_dir / 'model.pt'))
    env.close()

    index = list(range(len(total_rewards)))
    plt.plot(index, total_rewards)
    plt.ylabel('Total Rewards')
    plt.savefig(str(figures_dir) + '/reward_curve.jpg')
    # plt.show()
    plt.close()

    index = list(range(len(mean_100ep_rewards)))
    plt.plot(index, mean_100ep_rewards)
    plt.ylabel('mean_100ep_reward')
    plt.savefig(str(figures_dir) + '/mean_100ep_reward_curve.jpg')
    # plt.show()
    plt.close()
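# Illustrative sketch only: the ReplayBuffer used by the training script above
# (add / len / sample) is not shown. A minimal uniform-sampling buffer with the same
# interface might look like this (hypothetical, not the project's implementation).
import random
from collections import deque

class UniformReplayBuffer:
    def __init__(self, capacity):
        self._storage = deque(maxlen=capacity)  # oldest transitions are evicted first

    def __len__(self):
        return len(self._storage)

    def add(self, state, action, reward, next_state, done):
        self._storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self._storage, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones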
class DeepQNetwork(TFBaseModel):
    def __init__(self, env, name, **kwargs):
        #batch_size=64, learning_rate=1e-4, reward_decay=0.99,
        #train_freq=1, target_update=2000, memory_size=2 ** 10, eval_obs=None,
        #use_dueling=True, use_double=True, use_conv=False,
        #custom_state_space=None, num_gpu=1, infer_batch_size=8192, network_type=0):
        """
        Init DQN

        :param env: Environment
            environment
        :param name: str
            name of this model
        :param batch_size: int
        :param learning_rate: float
        :param reward_decay: float
            reward_decay in TD
        :param train_freq: int
            mean training times of a sample
        :param target_update: int
            target will update every target_update batches
        :param memory_size: int
            capacity of the replay buffer
        :param eval_obs: numpy array
            evaluation set of observations
        :param use_dueling: bool
            whether to use a dueling q network
        :param use_double: bool
            whether to use a double q network
        :param use_conv: bool
            use convolution or fully connected layer as state encoder
        :param custom_state_space: tuple
        :param num_gpu: int
            number of gpus
        :param infer_batch_size: int
            batch size while inferring actions
        :param network_type:
        """
        TFBaseModel.__init__(self, env, name, "tfdqn")
        # ======================== set config ========================
        self.env = env
        self.state_space = env.get_state_space()
        self.num_actions = env.get_action_space()

        self.batch_size = kwargs.setdefault("batch_size", 64)
        self.learning_rate = kwargs.setdefault("learning_rate", 1e-4)
        self.training_freq = kwargs.setdefault("training_freq", 1)     # train times of every sample (s,a,r,s')
        self.target_update = kwargs.setdefault("target_update", 1000)  # target network update frequency
        self.eval_obs = kwargs.setdefault("eval_obs", None)
        # self.infer_batch_size = kwargs.setdefault("infer_batch_size", 8192)  # maximum batch size when inferring
        # actions, change this to fit your GPU memory if you meet an OOM

        self.network_param = kwargs.setdefault("network", False)
        self.use_dueling = self.network_param["use_dueling"]
        self.use_double = self.network_param["use_double"]
        self.num_gpu = self.network_param["num_gpu"]
        self.use_conv = self.network_param["use_conv"]
        self.nn_layers = self.network_param["layers"]
        self.activation = self.network_param["activation"]

        self.train_ct = 0

        # ======================= build network =======================
        tf.reset_default_graph()
        # input placeholders
        self.target = tf.placeholder(tf.float32, [None], name="target")
        self.weight = tf.placeholder(tf.float32, [None], name="weight")

        self.input_state = tf.placeholder(tf.float32, [None, self.state_space], name="input_state")
        self.action = tf.placeholder(tf.int32, [None], name="action")
        self.mask = tf.placeholder(tf.float32, [None], name="mask")
        self.eps = tf.placeholder(tf.float32, name="eps")  # e-greedy

        # build a graph
        with tf.variable_scope(self.name):
            with tf.variable_scope("eval_net_scope"):
                self.eval_scope_name = tf.get_variable_scope().name
                self.qvalues = self._create_network(self.input_state, self.use_conv)

            if self.num_gpu > 1:  # build inference graph for multiple gpus
                self._build_multi_gpu_infer(self.num_gpu)

            with tf.variable_scope("target_net_scope"):
                self.target_scope_name = tf.get_variable_scope().name
                self.target_qvalues = self._create_network(self.input_state, self.use_conv)

        # loss
        self.gamma = kwargs.setdefault("reward_decay", 0.99)
        self.actions_onehot = tf.one_hot(self.action, self.num_actions)
        td_error = tf.square(self.target - tf.reduce_sum(tf.multiply(self.actions_onehot, self.qvalues), axis=1))
        self.loss = tf.reduce_sum(td_error * self.mask) / tf.reduce_sum(self.mask)
        self.loss_summary = tf.summary.scalar(name='Loss_summary', tensor=self.loss)

        # train op (clip gradient)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, name="ADAM")
        gradients, variables = zip(*optimizer.compute_gradients(self.loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        self.train_op = optimizer.apply_gradients(zip(gradients, variables), name="train_op")
        # self.train_summary = tf.summary.scalar(name='Train_summary', tensor=self.train_op)

        # output action
        def out_action(qvalues):
            best_action = tf.argmax(qvalues, axis=1)
            best_action = tf.to_int32(best_action)
            random_action = tf.random_uniform(tf.shape(best_action), 0, self.num_actions,
                                              tf.int32, name="random_action")
            should_explore = tf.random_uniform(tf.shape(best_action), 0, 1) < self.eps
            return tf.where(should_explore, random_action, best_action)

        self.output_action = out_action(self.qvalues)
        if self.num_gpu > 1:
            self.infer_out_action = [out_action(qvalue) for qvalue in self.infer_qvalues]

        # target network update op
        self.update_target_op = []
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.target_scope_name)
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.eval_scope_name)
        for i in range(len(t_params)):
            self.update_target_op.append(tf.assign(t_params[i], e_params[i]))

        # Initialize the tensorboard directories
        if not os.path.exists('summaries'):
            os.mkdir('summaries')
        if not os.path.exists(os.path.join('summaries', 'first')):
            os.mkdir(os.path.join('summaries', 'first'))

        # init tensorflow session
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.summ_writer = tf.summary.FileWriter(os.path.join('summaries', 'first'), self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        # init replay buffers
        self.replay_buffer_len = 0
        self.memory_size = int(kwargs.setdefault("memory_size", 2**10))
        print("Memory size ", self.memory_size)
        self.replay_buf_state = ReplayBuffer(shape=(self.memory_size, self.state_space))
        self.replay_buf_action = ReplayBuffer(shape=(self.memory_size,), dtype=np.int32)
        self.replay_buf_reward = ReplayBuffer(shape=(self.memory_size,))
        self.replay_buf_terminal = ReplayBuffer(shape=(self.memory_size,), dtype=np.bool)
        self.replay_buf_mask = ReplayBuffer(shape=(self.memory_size,))
        # if mask[i] == 0, then the item is used for padding, not for training

        self.policy = BoltzmanPolicy(action_space=self.num_actions)

    def _create_network(self, input_state, use_conv=False, reuse=None):
        """
        Define computation graph of network

        :param input_state: tf.tensor
        :param use_conv: bool
        :param reuse: bool
        :return:
        """
        kernel_num = [32, 32]
        hidden_size = list(self.nn_layers.values())

        if len(hidden_size) == 1:
            if self.activation == "Linear":
                print("1 NN with Linear activation, " + str(hidden_size[0]) + " neurons")
                h_state = tf.layers.dense(input_state, units=hidden_size[0], activation=None,
                                          name="h_state", reuse=reuse)
            else:
                print("1 NN with RELU activation, " + str(hidden_size[0]) + " neurons")
                h_state = tf.layers.dense(input_state, units=hidden_size[0], activation=tf.nn.relu,
                                          name="h_state", reuse=reuse)
        elif len(hidden_size) == 2:
            print("2 Layers NN with " + str(hidden_size[0]) + " and " + str(hidden_size[1]) + " neurons")
            activation = None
            if self.activation != "Linear":
                activation = tf.nn.relu
            h_state_0 = tf.layers.dense(input_state, units=hidden_size[0], activation=activation,
                                        name="h_state_0", reuse=reuse)
            h_state = tf.layers.dense(h_state_0, units=hidden_size[1], activation=activation,
                                      name="h_state", reuse=reuse)

        if self.use_dueling:
            value = tf.layers.dense(h_state, units=1, name="value", reuse=reuse)
            advantage = tf.layers.dense(h_state, units=self.num_actions, use_bias=False,
                                        name="advantage", reuse=reuse)
            qvalues = value + advantage - tf.reduce_mean(advantage, axis=1, keep_dims=True)
        else:
            qvalues = tf.layers.dense(h_state, units=self.num_actions, name="value", reuse=reuse)

        return qvalues

    def infer_action(self, user_id, sn, obs, step, policy="e_greedy", eps=0):
        """
        Infer an action for the given agent.

        :param user_id: int
            id of the user
        :param obs: observation batch for this user
        :param policy: can be eps-greedy or greedy
        :param eps: float
            used when policy is eps-greedy
        :return:
        """
        if policy == 'e_greedy':
            eps = eps
        elif policy == 'greedy':
            eps = 0

        # if self.num_gpu > 1 and n > batch_size:  # infer by multi gpu in parallel
        #     ret = self._infer_multi_gpu(view, feature, ids, eps)
        qvalues = self.sess.run(self.qvalues, feed_dict={self.input_state: obs})
        best_actions = np.argmax(qvalues, axis=1)
        n = 1  # Since we take an action only for 1 user.
        random = np.random.randint(self.num_actions, size=(n,))
        cond = np.random.uniform(0, 1, size=(n,)) < eps
        ret = np.where(cond, random, best_actions)
        action = ret.astype(np.int32)
        # TODO: enable this later.
        #action = self.policy.take_action(qvalues, step)
        #actions.append(action)
        # action = self.sess.run(self.output_action, feed_dict={
        #     self.input_state: obs,
        #     self.eps: eps
        # })
        return action

    def _calc_target(self, next_state, reward, terminal):
        """
        Calculate target value

        :param next_state: next_state of the user.
        :param reward: reward of the previous action
        :param terminal:
        :return:
        """
        n = len(reward)
        if self.use_double:
            t_qvalues, qvalues = self.sess.run([self.target_qvalues, self.qvalues],
                                               feed_dict={self.input_state: next_state})
            next_value = t_qvalues[np.arange(n), np.argmax(qvalues, axis=1)]
        else:
            t_qvalues = self.sess.run(self.target_qvalues, feed_dict={self.input_state: next_state})
            next_value = np.max(t_qvalues, axis=1)

        target = np.where(terminal, reward, reward + self.gamma * next_value)
        return target

    def _add_to_replay_buffer(self, sample_buffer):
        """
        Add stored episode buffers to replay buffer.

        :param sample_buffer:
        :return:
        """
        n = 0
        for episode in sample_buffer.episodes():
            # Each user has its own episode.
            s, a, r = [], [], []
            for step in range(len(episode.states)):
                # Step represents the sequence number of the transmitted packet.
                if (episode.states[step] is not -1) and (episode.actions[step] is not -1) \
                        and (episode.rewards[step] is not -1):
                    # This part is required to make sure we synchronize the s, a and reward,
                    # in other words, to alleviate the effect of delayed rewards.
                    s.append(episode.states[step])
                    a.append(episode.actions[step])
                    r.append(episode.rewards[step])

            m = len(r)
            if m == 0:
                continue

            mask = np.ones((m,))
            terminal = np.zeros((m,), dtype=np.bool)
            if episode.terminal:
                terminal[-1] = True
            else:
                mask[-1] = 0

            self.replay_buf_state.put(s)
            self.replay_buf_action.put(a)
            self.replay_buf_reward.put(r)
            self.replay_buf_terminal.put(terminal)
            self.replay_buf_mask.put(mask)

            n += m

        self.replay_buffer_len = min(self.memory_size, self.replay_buffer_len + n)
        return n

    def train(self, sample_buffer, print_every=1000, **kwargs):
        """
        Add new samples in sample buffer to replay buffer and train

        :param sample_buffer: memory.EpisodesBuffer
        :param print_every: int
            print log every print_every batches
        :param kwargs:
        :return:
            loss: float
                bellman residual loss
            value: float
                estimated state value
        """
        add_num = self._add_to_replay_buffer(sample_buffer)
        batch_size = self.batch_size
        total_loss = 0

        n_batches = int(self.training_freq * add_num / batch_size)
        if n_batches == 0:
            return 0, 0

        print("batch number: %d  add: %d  batch_size: %d  training_freq: %d  replay_len: %d/%d" %
              (n_batches, add_num, batch_size, self.training_freq, self.replay_buffer_len, self.memory_size))

        start_time = time.time()
        ct = 0
        for i in range(n_batches):
            # fetch a batch
            index = np.random.choice(self.replay_buffer_len - 1, batch_size)

            batch_state = self.replay_buf_state.get(index)
            batch_action = self.replay_buf_action.get(index)
            batch_reward = self.replay_buf_reward.get(index)
            batch_terminal = self.replay_buf_terminal.get(index)
            batch_mask = self.replay_buf_mask.get(index)

            batch_next_state = self.replay_buf_state.get(index + 1)

            batch_target = self._calc_target(batch_next_state, batch_reward, batch_terminal)

            ret = self.sess.run([self.train_op, self.loss], feed_dict={
                self.input_state: batch_state,
                self.action: batch_action,
                self.target: batch_target,
                self.mask: batch_mask
            })
            loss = ret[1]
            total_loss += loss

            if ct % self.target_update == 0:
                print("Target Q update ct " + str(ct))
                self.sess.run(self.update_target_op)

            if ct % print_every == 0:
                print("batch %5d,  loss %.6f,  eval %.6f" % (ct, loss, self._eval(batch_target)))
            ct += 1
            self.train_ct += 1

        total_time = time.time() - start_time
        step_average = total_time / max(1.0, (ct / 1000.0))
        print("batches: %d,  total time: %.2f,  1k average: %.2f" % (ct, total_time, step_average))

        return total_loss / ct if ct != 0 else 0, self._eval(batch_target)

    def _eval(self, target):
        """ Evaluate estimated q value """
        if self.eval_obs is None:
            return np.mean(target)
        else:
            return np.mean(self.sess.run([self.qvalues], feed_dict={
                self.input_state: self.eval_obs[0]
            }))

    def clean_buffer(self):
        """ Clean replay buffer """
        self.replay_buffer_len = 0
        self.replay_buf_state.clear()
        self.replay_buf_action.clear()
        self.replay_buf_reward.clear()
        self.replay_buf_terminal.clear()
        self.replay_buf_mask.clear()
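# Illustrative sketch only: BoltzmanPolicy is referenced in DeepQNetwork.__init__ but
# not defined in this snippet. A softmax (Boltzmann) action sampler over Q-values
# could look like this (hypothetical helper; the temperature value is arbitrary).
import numpy as np

def boltzmann_action(q_values, temperature=1.0, rng=np.random):
    """Sample an action with probability proportional to exp(Q / temperature)."""
    q = np.asarray(q_values, dtype=np.float64)
    logits = (q - q.max()) / max(temperature, 1e-8)  # shift for numerical stability
    probs = np.exp(logits)
    probs /= probs.sum()
    return int(rng.choice(len(q), p=probs))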
from utils.network_utils import NetworkSaver
from runner.runner import TrajRunner
from utils.trajectory_utils import TrajectoryReplay
from nets.net import init_nn_library

arguments = vars(args)

env = get_env(args.game, args.atari, args.env_transforms)

if args.load_trajectory is None:
    dqn_args = arguments.copy()
    dqn_args['mode'] = 'test'
    dqn_args['replay_buffer_size'] = 0
    runner = run_dqn(**dqn_args)
    replay_buffer = ReplayBuffer(args.replay_buffer_size, 1, args.update_frequency,
                                 args.replay_start_size, args.batch_size)
else:
    init_nn_library(True, "1")
    runner = TrajRunner(args.max_step)
    replay_buffer = TrajectoryReplay(args.load_trajectory, args.batch_size)

envOps = EnvOps(env.observation_space.shape, env.action_space.n, args.learning_rate)

summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None

#model = EnvModelCartpole(envOps)
model = globals()[args.env_model](envOps)

env = EnvLearner(replay_buffer, model, summary_writer, args.reward_scale)
runner.listen(env, None)

if args.output_dir is None:
class DDPGAgent(BaseAgent):
    def __init__(self, config, env, env_params, her):
        BaseAgent.__init__(self, config)
        self.env = env
        self.config = config
        self.env_params = env_params
        #self.network = ActorCriticDeterministic(env_params['obs'], env_params['action'],
        #                                        env_params['goal'],
        #                                        config.hidden_layers,
        #                                        use_her=config.use_her)
        self.actor = actor(env_params)
        self.target_actor = actor(env_params)
        self.target_actor.load_state_dict(self.actor.state_dict())
        #=============
        self.critic = critic(env_params)
        self.target_critic = critic(env_params)
        self.target_critic.load_state_dict(self.critic.state_dict())
        # self.target_network = ActorCriticDeterministic(env_params['obs'], env_params['action'],
        #                                                env_params['goal'],
        #                                                config.hidden_layers,
        #                                                use_her=config.use_her)
        # self.target_network.load_state_dict(self.network.state_dict())
        self.her = her_sampler(config.replay_strategy, config.replay_k, env.compute_reward)
        self.replay_buffer = ReplayBuffer(
            env_params,
            buffer_size=int(config.buffer_size),
            sample_func=self.her.sample_her_transitions)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.o_norm = normalizer(size=env_params['obs'], default_clip_range=self.config.clip_range)
        self.g_norm = normalizer(size=env_params['goal'], default_clip_range=self.config.clip_range)
        self.model_path = '/home/mohamed/Desktop/Project/utils'

    def learn(self):
        for epoch in range(1, self.config.n_epochs + 1):
            for _ in range(self.config.n_cycles):
                #for _ in range(2):
                episode = self._sample(epoch)
                self.replay_buffer.store_episode(episode)
                self._update_normalizer(episode)
                for _ in range(self.config.n_batches):
                    self._update()
                self._soft_update()
            success_rate = self._eval_agent()
            print('Success rate after {} epochs is {:.3f} over {} test runs'.format(
                epoch, success_rate, self.config.test_rollouts))
            torch.save([self.o_norm.mean, self.o_norm.std,
                        self.g_norm.mean, self.g_norm.std,
                        self.actor.state_dict()],
                       self.model_path + '/model.pt')

    def _sample(self, epoch):
        obs_batch = []
        action_batch = []
        achieved_goals_batch = []
        goals_batch = []
        actions_episode = []
        obs_episode = []
        goals_episode = []
        achieved_goals_episode = []
        observation = self.env.reset()
        goal = observation['desired_goal']
        obs = observation['observation']
        achieved_goal = observation['achieved_goal']
        i = 0
        while True:
            if self.config.render:
                self.env.render()
            with torch.no_grad():
                action = self.actor(obs, goal)
                if self.config.add_noise:
                    action = self._select_actions(action[0], 1 / epoch)
            new_obs, _, _, info = self.env.step(action)
            achieved_goal = new_obs['achieved_goal']
            obs_episode.append(obs.copy())
            obs = new_obs['observation']
            achieved_goals_episode.append(achieved_goal.copy())
            i += 1
            if i > self.env_params['max_timesteps']:
                break
            actions_episode.append(action.copy())
            goals_episode.append(goal.copy())
        obs_batch.append(obs_episode)
        action_batch.append(actions_episode)
        achieved_goals_batch.append(achieved_goals_episode)
        goals_batch.append(goals_episode)
        episode = [np.array(obs_batch),
                   np.array(achieved_goals_batch),
                   np.array(goals_batch),
                   np.array(action_batch)]
        # self.replay_buffer.store_episode([np.array(obs_batch),
        #                                   np.array(achieved_goals_batch),
        #                                   np.array(goals_batch),
        #                                   np.array(action_batch)])
        # self._update_normalizer([np.array(obs_batch),
        #                          np.array(achieved_goals_batch),
        #                          np.array(goals_batch),
        #                          np.array(action_batch)])
        return episode

    def _update(self):
        experiences = self.replay_buffer.sample(self.config.batch_size)
        states, goals = self._preproc_og(experiences['obs'], experiences['g'])
        next_states, next_goals = self._preproc_og(experiences['next_obs'], experiences['g'])
        actions = experiences['actions']
        rewards = experiences['r']

        states = self.o_norm.normalize(states)
        goals = self.g_norm.normalize(goals)
        next_states = self.o_norm.normalize(next_states)
        next_goals = self.g_norm.normalize(next_goals)

        with torch.no_grad():
            next_actions = self.target_actor(next_states, next_goals)
            target_value = self.target_critic(next_states, next_actions[0], next_goals)
            expected_value = (to_tensor(rewards) + self.config.discount * target_value).detach()
            clip_return = 1 / (1 - self.config.discount)
            expected_value = torch.clamp(expected_value, -clip_return, 0)

        #====== Value loss ========
        value_criterion = nn.MSELoss()
        value = self.critic(states, actions, goals)
        value_loss = value_criterion(expected_value, value)
        #====== Policy loss =======
        actions_ = self.actor(states, goals)
        policy_loss = -(self.critic(states, actions_[0], goals)).mean()
        policy_loss += self.config.action_l2 * (actions_[0]).pow(2).mean()
        #====== Policy update =======
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()
        #====== Value update ========
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

    def _soft_update(self):
        tau = self.config.tau_ddpg
        for targetp, netp in zip(self.target_critic.parameters(), self.critic.parameters()):
            targetp.data.copy_(tau * netp + (1 - tau) * targetp)
        for targetp, netp in zip(self.target_actor.parameters(), self.actor.parameters()):
            targetp.data.copy_(tau * netp + (1 - tau) * targetp)

    def _eval_agent(self):
        success_rate = 0
        for _ in range(self.config.test_rollouts):
            observation = self.env.reset()
            obs = observation['observation']
            goal = observation['desired_goal']
            obs, goal = self._preproc_inputs(obs, goal)
            for _ in range(self.env_params['max_timesteps']):
                if self.config.render:
                    self.env.render()
                with torch.no_grad():
                    action = self.actor(obs, goal)
                new_obs, _, _, info = self.env.step(to_numpy(action[0]))
                obs, goal = self._preproc_inputs(new_obs['observation'], new_obs['desired_goal'])
            success_rate += info['is_success']
        return success_rate / self.config.test_rollouts

    def _select_actions(self, pi, eps):
        action = pi.cpu().numpy().squeeze()
        # add gaussian noise
        action += self.config.eps * self.env_params['action_max'] * np.random.randn(*action.shape)
        action = np.clip(action, -self.env_params['action_max'], self.env_params['action_max'])
        # random actions...
        random_actions = np.random.uniform(low=-self.env_params['action_max'],
                                           high=self.env_params['action_max'],
                                           size=self.env_params['action'])
        # choose whether to use the random actions
        action += np.random.binomial(1, eps, 1) * (random_actions - action)
        return action

    def _update_normalizer(self, episode_batch):
        mb_obs, mb_ag, mb_g, mb_actions = episode_batch
        mb_obs_next = mb_obs[:, 1:, :]
        mb_ag_next = mb_ag[:, 1:, :]
        # get the number of normalization transitions
        num_transitions = mb_actions.shape[1]
        # create the new buffer to store them
        buffer_temp = {
            'obs': mb_obs,
            'ag': mb_ag,
            'g': mb_g,
            'actions': mb_actions,
            'next_obs': mb_obs_next,
            'next_ag': mb_ag_next,
        }
        transitions = self.her.sample_her_transitions(buffer_temp, num_transitions)
        obs, g = transitions['obs'], transitions['g']
        # pre-process the obs and g
        transitions['obs'], transitions['g'] = self._preproc_og(obs, g)
        # update
        self.o_norm.update(transitions['obs'])
        self.g_norm.update(transitions['g'])
        # recompute the stats
        self.o_norm.recompute_stats()
        self.g_norm.recompute_stats()

    def _preproc_inputs(self, obs, g):
        obs_norm = self.o_norm.normalize(obs)
        g_norm = self.g_norm.normalize(g)
        return obs_norm, g_norm

    def _preproc_og(self, o, g):
        o = np.clip(o, -self.config.clip_obs, self.config.clip_obs)
        g = np.clip(g, -self.config.clip_obs, self.config.clip_obs)
        return o, g
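# Illustrative sketch only: her_sampler(...).sample_her_transitions is used by the
# DDPGAgent above but not shown here. Hindsight Experience Replay with the "future"
# strategy relabels a sampled transition's goal with an achieved goal from a later
# step of the same episode and recomputes the reward. The helper below is a
# hypothetical, single-episode simplification; achieved_goals is assumed to have
# T + 1 entries (one per visited state) while actions has T.
import numpy as np

def her_future_relabel(obs, achieved_goals, actions, goals, compute_reward,
                       replay_k=4, rng=np.random):
    """Return one relabeled (obs, action, goal, reward) tuple for an episode."""
    T = len(actions)
    future_p = 1.0 - 1.0 / (1.0 + replay_k)   # fraction of samples to relabel
    t = rng.randint(T)                        # pick a transition
    goal = goals[t].copy()
    if rng.uniform() < future_p:
        future_t = rng.randint(t + 1, T + 1)  # a later timestep in the same episode
        goal = achieved_goals[future_t].copy()
    # compute_reward follows the gym robotics signature (achieved_goal, desired_goal, info)
    reward = compute_reward(achieved_goals[t + 1], goal, None)
    return obs[t], actions[t], goal, reward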