Example #1
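# Full DQN training loop: a ReplayBuffer with a 4-frame history, behavior/target Q-networks
# (target frozen, synced every args.target_update_rate episodes), RMSprop with the settings
# from "Human-level control through deep reinforcement learning", and a linear epsilon schedule.
# Helpers such as run_episode, update_target_network and writer come from the surrounding module.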
def DQN(args):
    # Initialize replay memory D to capacity N
    memory = ReplayBuffer(size=args.replay_mem_size, frame_history_len=4)
    exploration = LinearExplorationSchedule(args.eps_start, args.eps_end,
                                            args.eps_decay)
    #exploration = ExponentialExplorationSchedule(args.eps_start, args.eps_end, args.eps_decay)

    # Initialize action-value function Q with random weights
    D = PRE_PROCESS_OUTPUT_DIM
    n_actions = encode_action.n_actions = args.num_actions
    q_target = QNet(n_actions=n_actions,
                    n_input_ch=history_len * n_channels,
                    input_shape=(D, D)).to(get_device())
    q_behavior = QNet(n_actions=n_actions,
                      n_input_ch=history_len * n_channels,
                      input_shape=(D, D)).to(get_device())
    q_target.eval()
    # Freeze target network
    for p in q_target.parameters():
        p.requires_grad = False
    q_behavior.train()
    # Copy the weights, so both Q-value approximators initialize the same
    q_behavior.load_state_dict(q_target.state_dict())
    criterion = nn.MSELoss()
    #criterion = nn.SmoothL1Loss() # Huber loss
    # “Human-level control through deep reinforcement learning” - rmsprop config
    LEARNING_RATE = 0.00025
    ALPHA = 0.95
    EPS = 0.01
    optimizer = torch.optim.RMSprop(
        q_behavior.parameters(), lr=LEARNING_RATE, alpha=ALPHA,
        eps=EPS)  # , lr=0.00025, momentum=0.95, eps=0.01)

    reward_ema = ExponentialMovingAvg(args.reward_eam_factor)
    max_return = -np.inf
    cnt_transitions = 0

    for episode in itertools.count():
        with GameProgressBar(episode) as progress_bar:
            episode_return, n_transitions = run_episode(
                episode, memory, cnt_transitions, q_behavior, q_target,
                optimizer, criterion, exploration, progress_bar)
            reward_ema.update(episode_return)
            cnt_transitions += n_transitions

            if episode % args.target_update_rate == 0:
                update_target_network(q_behavior, q_target)

            max_return = max(max_return, episode_return)
            writer.add_scalar('running_return', reward_ema.value, episode)
            # print(f"End of episode {episode} (G={episode_return} "
            #       f"transitions={n_transitions} max_return={max_return} "
            #       f"reward_ema={reward_ema.value})")
            print('  '.join([
                f'reward={episode_return:.2f}',
                f'running mean={reward_ema.value:.2f}'
            ]),
                  end='')

        env.close()
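
The helpers referenced by this loop (run_episode, update_target_network, GameProgressBar, QNet, writer) live elsewhere in the same module and are not shown. For the periodic target-network sync a hard weight copy is the usual choice; a minimal sketch under that assumption, matching the call site above:

def update_target_network(q_behavior, q_target):
    # Hard update: overwrite the frozen target network with the current behavior weights.
    q_target.load_state_dict(q_behavior.state_dict())
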
Example #2
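# Real-time TD training setup: a pretrained environment model plus a value network are combined
# into a TDNetwork, transitions go through a ReplayBuffer, and an epsilon-greedy VAgent is wired
# into a Runner together with a NetworkSaver (helper classes come from the surrounding project).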
def run_td_realtime(**kargs):
	if kargs['output_dir'] is None and kargs['logdir'] is not None:
		kargs['output_dir'] = kargs['logdir']

	from collections import namedtuple
	args = namedtuple("TDRealtimeParams", kargs.keys())(*kargs.values())

	if 'dont_init_tf' not in kargs.keys() or not kargs['dont_init_tf']:
		init_nn_library(True, "1")

	env = get_env(args.game, args.atari, args.env_transforms)

	envOps = EnvOps(env.observation_space.shape, env.action_space.n, args.learning_rate, mode="train")
	print(env.observation_space.low)
	print(env.observation_space.high)

	env_model = globals()[args.env_model](envOps)
	if args.env_weightfile is not None:
		env_model.model.load_weights(args.env_weightfile)

	v_model = globals()[args.vmodel](envOps)

	import numpy as np
	td_model = TDNetwork(env_model.model, v_model, envOps)

	summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None

	replay_buffer = ReplayBuffer(args.replay_buffer_size, 1, args.update_frequency, args.replay_start_size, args.batch_size)

	from utils.network_utils import NetworkSaver
	network_saver = NetworkSaver(args.save_freq, args.logdir, v_model.model)

	v_agent = VAgent(env.action_space, env_model, v_model, envOps, summary_writer, True, replay_buffer, args.target_network_update)

	egreedyOps = EGreedyOps()
	if replay_buffer is not None:
		egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
	egreedyOps.mode = args.mode
	egreedyOps.test_epsilon = args.test_epsilon
	#egreedyOps.FINAL_EXPLORATION_FRAME = 10000
	if args.mode == "train":
		egreedyOps.FINAL_EXPLORATION_FRAME = args.egreedy_final_step

	if args.mode == "train":
		if args.egreedy_decay<1:
			egreedyOps.DECAY = args.egreedy_decay
			egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, v_agent)
		else:
			egreedyAgent = MultiEGreedyAgent(env.action_space, egreedyOps, v_agent, args.egreedy_props, args.egreedy_final, final_exp_frame=args.egreedy_final_step)
	else:
		egreedyAgent = EGreedyAgent(env.action_space, egreedyOps, v_agent)


	runner = Runner(env, egreedyAgent, None, 1, max_step=args.max_step, max_episode=args.max_episode)
	runner.listen(replay_buffer, None)
	runner.listen(v_agent, None)
	runner.listen(egreedyAgent, None)
	runner.listen(network_saver, None)
	#runner.run()
	return runner, v_agent
Example #3
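# Evaluation loop for a saved DDPG agent on an Atari environment with the standard wrapper stack;
# the ReplayBuffer is passed to the agent constructor but no training updates happen here, and
# rendering is throttled to config.fps via the inter-frame interval ifi.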
def run(config):
    if config.saved_model is None:
        raise Exception("In evaluation mode, the saved model cannot be None")

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    assert "NoFrameskip" in config.env, "Require environment with no frameskip"
    env = gym.make(config.env)
    env.seed(config.seed)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 4)
    # env = gym.wrappers.Monitor(env, './video/', video_callable=lambda episode_id: episode_id % 1 == 0, force=True)
    ifi = 1 / config.fps

    replay_buffer = ReplayBuffer(config.buffer_size)

    agent = DDPGAgent(env.observation_space, env.action_space, replay_buffer)
    print(f"Loading the networks parameters - {config.saved_model} ")
    agent.load_params(torch.load(config.saved_model))

    step_count = 0
    for episode_i in range(config.num_episodes):
        state = env.reset()
        episode_reward = 0.0
        if config.display:
            env.render()
        while True:
            calc_start = time.time()
            action = agent.step(state)
            next_state, reward, done, info = env.step(action)
            episode_reward += reward
            step_count += 1
            if done:
                break
            state = next_state
            if config.display:
                calc_end = time.time()
                elapsed = calc_end - calc_start
                if elapsed < ifi:
                    time.sleep(ifi - elapsed)
                env.render()

        print("********************************************************")
        print("steps: {}".format(episodes_count))
        print("episodes: {}".format(episode_i))
        print("mean episode reward: {}".format(episode_reward))
        print("********************************************************")

    env.close()
Example #4
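    # DDPG + HER agent constructor: actor/critic networks with target copies, a HER-sampled
    # ReplayBuffer, Adam optimizers, and running normalizers for observations and goals.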
    def __init__(self, config, env, env_params, her):
        BaseAgent.__init__(self, config)

        self.env = env
        self.config = config
        self.env_params = env_params

        #self.network = ActorCriticDeterministic(env_params['obs'], env_params['action'],
        #                                        env_params['goal'],
        #                                       config.hidden_layers,
        #                                        use_her=config.use_her)
        self.actor = actor(env_params)
        self.target_actor = actor(env_params)
        self.target_actor.load_state_dict(self.actor.state_dict())
        #=============
        self.critic = critic(env_params)
        self.target_critic = critic(env_params)
        self.target_critic.load_state_dict(self.critic.state_dict())

        # self.target_network = ActorCriticDeterministic(env_params['obs'], env_params['action'],
        #                                                env_params['goal'],
        #                                                config.hidden_layers,
        #                                                use_her=config.use_her)

        # self.target_network.load_state_dict(self.network.state_dict())

        self.her = her_sampler(config.replay_strategy, config.replay_k,
                               env.compute_reward)

        self.replay_buffer = ReplayBuffer(
            env_params,
            buffer_size=int(config.buffer_size),
            sample_func=self.her.sample_her_transitions)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.config.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.config.clip_range)
        self.model_path = '/home/mohamed/Desktop/Project/utils'
Example #5
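# Full DQN training setup: builds the environment (optionally wrapped by a learned SimulatedEnv),
# preprocessing pipelines, the Q-model, a ReplayBuffer or TrajectoryReplay, an epsilon-greedy
# agent and a Runner, plus optional rendering and periodic network saving.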
def run_dqn(**kargs):
    if kargs['output_dir'] is None and kargs['logdir'] is not None:
        kargs['output_dir'] = kargs['logdir']

    q_model_initial = kargs[
        'q_model_initial'] if 'q_model_initial' in kargs else None

    from collections import namedtuple
    args = namedtuple("DQNParams", kargs.keys())(*kargs.values())

    if 'dont_init_tf' not in kargs.keys() or not kargs['dont_init_tf']:
        #init_nn_library(True, "1")
        init_nn_library("gpu" in kargs and kargs["gpu"] is not None,
                        kargs["gpu"] if "gpu" in kargs else "1")

    #if args.atari:
    #	env = gym_env(args.game + 'NoFrameskip-v0')
    #	env = WarmUp(env, min_step=0, max_step=30)
    #	env = ActionRepeat(env, 4)
    #	#q_model = A3CModel(modelOps)
    #else:
    #	if args.game == "Grid":
    #		env = GridEnv()
    #	else:
    #		env = gym_env(args.game)
    #	#q_model = TabularQModel(modelOps)
    #for trans in args.env_transforms:
    #	env = globals()[trans](env)
    if 'use_env' in kargs and kargs['use_env'] is not None:
        env = kargs['use_env']
    else:
        env = get_env(args.game, args.atari, args.env_transforms,
                      kargs['monitor_dir'] if 'monitor_dir' in kargs else None)
        if 'env_model' in kargs and kargs['env_model'] is not None and kargs[
                'env_weightfile'] is not None:
            print('Using simulated environment')
            envOps = EnvOps(env.observation_space.shape, env.action_space.n,
                            args.learning_rate)
            env_model = globals()[kargs['env_model']](envOps)
            env_model.model.load_weights(kargs['env_weightfile'])
            env = SimulatedEnv(env,
                               env_model,
                               use_reward='env_reward' in kargs
                               and kargs['env_reward'])

    modelOps = DqnOps(env.action_count)
    modelOps.dueling_network = args.dueling_dqn

    viewer = None
    if args.enable_render:
        viewer = EnvViewer(env, args.render_step, 'human')
    if args.atari:
        proproc = PreProPipeline(
            [GrayPrePro(), ResizePrePro(modelOps.INPUT_SIZE)])
        rewproc = PreProPipeline([RewardClipper(-1, 1)])
    else:
        if env.observation_space.__class__.__name__ == 'Discrete':
            modelOps.INPUT_SIZE = env.observation_space.n
        else:
            modelOps.INPUT_SIZE = env.observation_space.shape
        modelOps.AGENT_HISTORY_LENGTH = 1
        proproc = None
        rewproc = None

    modelOps.LEARNING_RATE = args.learning_rate
    if q_model_initial is None:
        q_model = globals()[args.model](modelOps)
    else:
        q_model = q_model_initial

    if args.load_weightfile is not None:
        q_model.model.load_weights(args.load_weightfile)

    summary_writer = tf.summary.FileWriter(
        args.logdir,
        K.get_session().graph) if args.logdir is not None else None

    agentOps = DqnAgentOps()
    agentOps.double_dqn = args.double_dqn
    agentOps.mode = args.mode
    if args.mode == "train":
        agentOps.TARGET_NETWORK_UPDATE_FREQUENCY = args.target_network_update

    replay_buffer = None
    if args.replay_buffer_size > 0:
        if 'load_trajectory' in kargs and kargs['load_trajectory'] is not None:
            replay_buffer = TrajectoryReplay(kargs['load_trajectory'],
                                             kargs['batch_size'],
                                             args.update_frequency,
                                             args.replay_start_size)
        else:
            replay_buffer = ReplayBuffer(args.replay_buffer_size,
                                         modelOps.AGENT_HISTORY_LENGTH,
                                         args.update_frequency,
                                         args.replay_start_size,
                                         args.batch_size)
    #replay_buffer = NStepBuffer(modelOps.AGENT_HISTORY_LENGTH, 8)
    agent = DqnAgent(env.action_space, q_model, replay_buffer, rewproc,
                     agentOps, summary_writer)

    egreedyOps = EGreedyOps()
    if replay_buffer is not None:
        egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
    egreedyOps.mode = args.mode
    egreedyOps.test_epsilon = args.test_epsilon
    #egreedyOps.FINAL_EXPLORATION_FRAME = 10000
    if args.mode == "train":
        egreedyOps.FINAL_EXPLORATION_FRAME = args.egreedy_final_step

    if args.mode == "train":
        if args.egreedy_decay < 1:
            egreedyOps.DECAY = args.egreedy_decay
            egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, agent)
        else:
            egreedyAgent = MultiEGreedyAgent(
                env.action_space,
                egreedyOps,
                agent,
                args.egreedy_props,
                args.egreedy_final,
                final_exp_frame=args.egreedy_final_step)
    else:
        egreedyAgent = EGreedyAgent(env.action_space, egreedyOps, agent)

    runner = Runner(env,
                    egreedyAgent,
                    proproc,
                    modelOps.AGENT_HISTORY_LENGTH,
                    max_step=args.max_step,
                    max_episode=args.max_episode)
    if replay_buffer is not None:
        runner.listen(replay_buffer, proproc)
    runner.listen(agent, None)
    runner.listen(egreedyAgent, None)
    if viewer is not None:
        runner.listen(viewer, None)

    if args.output_dir is not None:
        networkSaver = NetworkSaver(
            50000 if 'save_interval' not in kargs else kargs['save_interval'],
            args.output_dir, q_model.model)
        runner.listen(networkSaver, None)

    return runner, agent
Example #6
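# Notebook-style DDPG snippet (truncated at both ends): critic and target critic QNetworks,
# Adam optimizers, Ornstein-Uhlenbeck exploration noise, a ReplayBuffer of one million
# transitions, and a small helper computing the L2 distance between two batches of tensors.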
    (env.observation_space['observation'].shape[0] // 2) * 2,
    env.action_space.shape[0]).to(device)

crt = QNetwork((env.observation_space['observation'].shape[0] // 2) * 2,
               env.action_space.shape[0]).to(device)
tgt_crt = QNetwork((env.observation_space['observation'].shape[0] // 2) * 2,
                   env.action_space.shape[0]).to(device)

tgt_crt.load_state_dict(crt.state_dict())

policy_optim = optim.Adam(policy.parameters(), lr=lr)
crt_optim = optim.Adam(crt.parameters(), lr=lr)

# %%
noise = OUNoise(env.action_space)
memory = ReplayBuffer(1000000)


# %%
def dist(x, y):
    x = x.cpu().numpy()
    y = y.cpu().numpy()
    res = np.linalg.norm(x - y, axis=1)
    return torch.tensor(res).unsqueeze(1).to(device)


# %%
def train_policy(act_net,
                 crt_net,
                 tgt_crt_net,
                 optimizer_act,
Example #7
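# CartPole DQN setup (truncated at the start): a small ReplayBuffer of 2000 transitions,
# a DqnAgent with a target network update frequency of 20, and an exponentially decaying
# epsilon-greedy wrapper driving a Runner.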
        return CartPoleModel(self.ops, m)


q_model = CartPoleModel(modelOps)

summary_writer = tf.summary.FileWriter(
    args.logdir,
    K.get_session().graph) if args.logdir is not None else None

agentOps = DqnAgentOps()
agentOps.double_dqn = args.double_dqn
agentOps.TARGET_NETWORK_UPDATE_FREQUENCY = 20
#agentOps.REPLAY_START_SIZE = 100
#agentOps.FINAL_EXPLORATION_FRAME = 10000

replay_buffer = ReplayBuffer(int(2000), 1, 1, 1000, 64)
#replay_buffer = NStepBuffer(modelOps.AGENT_HISTORY_LENGTH, 8)
agent = DqnAgent(env.action_space, q_model, replay_buffer, rewproc, agentOps,
                 summary_writer)

egreedyOps = EGreedyOps()
egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
egreedyOps.FINAL_EXPLORATION_FRAME = 10000
egreedyOps.FINAL_EXPLORATION = 0.01
egreedyOps.DECAY = 0.999
egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, agent)

runner = Runner(env, egreedyAgent, proproc, 1)
runner.listen(replay_buffer, proproc)
runner.listen(agent, None)
runner.listen(egreedyAgent, None)
Example #8
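        # Worker thread for asynchronous training: each thread builds its own environment and
        # preprocessing, an NStepBuffer or ReplayBuffer, and a DqnAgent or ActorCriticAgent that
        # shares weights with the global model; thread 0 runs without epsilon-greedy and is used
        # for testing.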
        def __init__(self, threadId, sess, graph):
            StoppableThread.__init__(self)
            self.threadId = threadId
            self.sess = sess
            self.graph = graph
            with self.graph.as_default():
                if args.atari:
                    env = gym_env(args.game + 'NoFrameskip-v0')
                    env = WarmUp(env, min_step=0, max_step=30)
                    env = ActionRepeat(env, 4)
                    proproc = PreProPipeline(
                        [GrayPrePro(),
                         ResizePrePro(modelOps.INPUT_SIZE)])
                    rewproc = PreProPipeline([RewardClipper(-1, 1)])
                    #q_model = A3CModel(modelOps)
                else:
                    if args.game == "Grid":
                        env = GridEnv()
                    else:
                        env = gym_env(args.game)
                    proproc = None
                    rewproc = None
                    #q_model = TabularQModel(modelOps)
                for trans in args.env_transforms:
                    env = globals()[trans](env)

                if 'shared_model' in kargs and kargs['shared_model']:
                    q_model = model
                else:
                    q_model = globals()[args.model](modelOps)
                    q_model.model_update = model.model
                    q_model.set_weights(model.get_weights())
                summary_writer = tf.summary.FileWriter(
                    args.logdir + '/thread-' + str(threadId),
                    K.get_session().graph) if args.logdir is not None else None

                agentOps = DqnAgentOps()
                agentOps.double_dqn = args.double_dqn
                agentOps.REPLAY_START_SIZE = 1
                #agentOps.INITIAL_EXPLORATION = 0
                agentOps.TARGET_NETWORK_UPDATE_FREQUENCY = 1e10

                #replay_buffer = ReplayBuffer(int(1e6), 4, 4, agentOps.REPLAY_START_SIZE, 32)
                replay_buffer = None
                #if threadId > 0:
                if args.nstep > 0:
                    replay_buffer = NStepBuffer(modelOps.AGENT_HISTORY_LENGTH,
                                                args.nstep)
                else:
                    replay_buffer = ReplayBuffer(args.replay_buffer_size,
                                                 modelOps.AGENT_HISTORY_LENGTH,
                                                 args.update_frequency,
                                                 args.replay_start_size,
                                                 args.batch_size)

                #print(kargs['agent'])
                if kargs['agent'] == 'ActorCriticAgent':
                    agent = ActorCriticAgent(env.action_space,
                                             q_model,
                                             replay_buffer,
                                             rewproc,
                                             agentOps,
                                             summary_writer,
                                             ac_model_update=model)  #
                else:
                    agent = DqnAgent(env.action_space,
                                     q_model,
                                     replay_buffer,
                                     rewproc,
                                     agentOps,
                                     summary_writer,
                                     model_eval=model_eval)  #

                egreedyAgent = None

                if threadId > 0 and kargs[
                        'agent'] != 'ActorCriticAgent':  # first thread is for testing
                    egreedyOps = EGreedyOps()
                    egreedyOps.REPLAY_START_SIZE = replay_buffer.REPLAY_START_SIZE
                    #egreedyOps.FINAL_EXPLORATION_FRAME = int(args.egreedy_final_step / args.thread_count)
                    #if args.egreedy_decay<1:
                    #	egreedyAgent = EGreedyAgentExp(env.action_space, egreedyOps, agent)
                    #else:
                    if len(args.egreedy_props
                           ) > 1 and args.egreedy_props[0] == round(
                               args.egreedy_props[0]):
                        cs = np.array(args.egreedy_props)
                        cs = np.cumsum(cs)
                        idx = np.searchsorted(cs, threadId)
                        print('Egreedyagent selected', idx,
                              args.egreedy_final[idx], args.egreedy_decay[idx],
                              args.egreedy_final_step[idx])
                        egreedyAgent = MultiEGreedyAgent(
                            env.action_space, egreedyOps, agent, [1],
                            [args.egreedy_final[idx]],
                            [args.egreedy_decay[idx]],
                            [args.egreedy_final_step[idx]])
                    else:
                        egreedyAgent = MultiEGreedyAgent(
                            env.action_space, egreedyOps, agent,
                            args.egreedy_props, args.egreedy_final,
                            args.egreedy_decay, args.egreedy_final_step)

                self.runner = Runner(
                    env, egreedyAgent if egreedyAgent is not None else agent,
                    proproc, modelOps.AGENT_HISTORY_LENGTH)
                if replay_buffer is not None:
                    self.runner.listen(replay_buffer, proproc)
                self.runner.listen(agent, None)
                if egreedyAgent is not None:
                    self.runner.listen(egreedyAgent, None)
                self.runner.listen(self, proproc)
                self.agent = agent
                self.q_model = q_model
            pass
Example #9
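# DDPG training run on an Atari environment: numbered run directories, the standard wrapper
# stack, a ReplayBuffer feeding periodic agent updates, reward logging to CSV, model
# checkpointing, and reward curves saved as figures at the end.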
def run(config):
    model_dir = Path('./ddpg_models')
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)
    run_dir = model_dir / curr_run
    figures_dir = run_dir / 'figures'

    os.makedirs(str(run_dir), exist_ok=True)
    os.makedirs(str(figures_dir))
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)

    assert "NoFrameskip" in config.env, "Require environment with no frameskip"
    env = gym.make(config.env)
    env.seed(config.seed)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    env = FireResetEnv(env)
    env = WarpFrame(env)
    env = PyTorchFrame(env)
    env = ClipRewardEnv(env)
    env = FrameStack(env, 4)
    # env = gym.wrappers.Monitor(env, './video/', video_callable=lambda episode_id: episode_id % 1 == 0, force=True)

    replay_buffer = ReplayBuffer(config.buffer_size)

    agent = DDPGAgent(env.observation_space,
                      env.action_space,
                      replay_buffer,
                      hidden_sizes=config.hidden_sizes,
                      critic_lr=config.critic_lr,
                      actor_lr=config.actor_lr,
                      batch_size=config.batch_size,
                      gamma=config.discounted_factor,
                      tau=config.tau)

    if config.saved_model:
        print(f"Loading the networks parameters - { config.saved_model } ")
        agent.load_params(torch.load(config.saved_model))

    total_rewards = [0.0]
    mean_100ep_rewards = []

    state = env.reset()
    for step_i in range(config.num_steps):
        if config.display:
            env.render()

        action = agent.step(state)
        next_state, reward, done, info = env.step(action)
        agent.replay_buffer.add(state, action, reward, next_state, float(done))

        state = next_state
        total_rewards[-1] += reward
        if done:
            state = env.reset()
            total_rewards.append(0.0)

        if len(replay_buffer
               ) > config.batch_size and step_i > config.learning_start:
            agent.update()
            agent.update_target()

        if config.display:
            env.render()

        num_episode = len(total_rewards)

        if done and num_episode % config.print_freq == 0:
            mean_100ep_reward = round(np.mean(total_rewards[-101:-1]), 1)
            print("********************************************************")
            print("steps: {}".format(step_i))
            print("episodes: {}".format(num_episode))
            print("mean 100 episode reward: {}".format(mean_100ep_reward))
            print("********************************************************")
            with open(str(run_dir) + '/episodes_reward.csv', 'ab') as file:
                np.savetxt(file,
                           total_rewards[-config.print_freq - 1:-1],
                           delimiter=',',
                           fmt='%1.2f')
            mean_100ep_rewards.append(mean_100ep_reward)

        if done and num_episode % config.save_model_freq == 0:
            # os.makedirs(str(run_dir / 'incremental'), exist_ok=True)
            # agent.save(str(run_dir / 'incremental' / ('model_ep%i.pt' % num_episode)))
            agent.save(str(run_dir / 'model.pt'))

    agent.save(str(run_dir / 'model.pt'))
    env.close()

    index = list(range(len(total_rewards)))
    plt.plot(index, total_rewards)
    plt.ylabel('Total Rewards')
    plt.savefig(str(figures_dir) + '/reward_curve.jpg')
    # plt.show()
    plt.close()

    index = list(range(len(mean_100ep_rewards)))
    plt.plot(index, mean_100ep_rewards)
    plt.ylabel('mean_100ep_reward')
    plt.savefig(str(figures_dir) + '/mean_100ep_reward_curve.jpg')
    # plt.show()
    plt.close()
Example #10
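    # TF1 DQN constructor: placeholders, eval/target Q-network scopes, a masked TD loss with
    # global-norm gradient clipping, a hard target-update op, TensorBoard summaries, and one
    # ReplayBuffer per field (state, action, reward, terminal, mask) plus a Boltzmann policy.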
    def __init__(self, env, name, **kwargs):
                 #batch_size=64, learning_rate=1e-4, reward_decay=0.99,
                 #train_freq=1, target_update=2000, memory_size=2 ** 10, eval_obs=None,
                 #use_dueling=True, use_double=True, use_conv=False,
                 #custom_state_space=None, num_gpu=1, infer_batch_size=8192, network_type=0):
        """
        Init DQN
        :param env: Environment
            environment
        :param name: str
            name of this model
        :param batch_size: int
        :param learning_rate: float
        :param reward_decay: float
            reward_decay in TD
        :param train_freq: int
            mean training times of a sample
        :param target_update: int
            target will update every target_update batches
        :param memory_size: int
            capacity of the replay buffer
        :param eval_obs: numpy array
            evaluation set of observation
        :param use_dueling: bool
            whether use dueling q network
        :param use_double: bool
            whether use double q network
        :param use_conv: bool
            use convolution or fully connected layer as state encoder
        :param custom_state_space: tuple
        :param num_gpu: int
            number of gpu
        :param infer_batch_size: int
            batch size while inferring actions
        :param network_type:
        """
        TFBaseModel.__init__(self, env, name, "tfdqn")
        # ======================== set config  ========================
        self.env = env
        self.state_space = env.get_state_space()
        self.num_actions = env.get_action_space()

        self.batch_size = kwargs.setdefault("batch_size", 64)
        self.learning_rate = kwargs.setdefault("learning_rate", 1e-4)
        self.training_freq = kwargs.setdefault("training_freq", 1)  # train time of every sample (s,a,r,s')
        self.target_update = kwargs.setdefault("target_update", 1000)   # target network update frequency
        self.eval_obs = kwargs.setdefault("eval_obs", None)
       # self.infer_batch_size = kwargs.setdefault("infer_batch_size", 8192)  # maximum batch size when infer actions,
       # change this to fit your GPU memory if you meet a OOM

        self.network_param = kwargs.setdefault("network", False)
        self.use_dueling = self.network_param["use_dueling"]
        self.use_double = self.network_param["use_double"]
        self.num_gpu = self.network_param["num_gpu"]
        self.use_conv = self.network_param["use_conv"]

        self.nn_layers = self.network_param["layers"]
        self.activation = self.network_param["activation"]
        self.train_ct = 0

        # ======================= build network =======================
        tf.reset_default_graph()
        # input place holder
        self.target = tf.placeholder(tf.float32, [None], name="target")
        self.weight = tf.placeholder(tf.float32, [None], name="weight")

        self.input_state = tf.placeholder(tf.float32, [None, self.state_space], name="input_state")
        self.action = tf.placeholder(tf.int32, [None], name="action")
        self.mask = tf.placeholder(tf.float32, [None], name="mask")
        self.eps = tf.placeholder(tf.float32, name="eps")  # e-greedy

        # build a graph
        with tf.variable_scope(self.name):
            with tf.variable_scope("eval_net_scope"):
                self.eval_scope_name = tf.get_variable_scope().name
                self.qvalues = self._create_network(self.input_state, self.use_conv)

            if self.num_gpu > 1: # build inference graph for multiple gpus
                self._build_multi_gpu_infer(self.num_gpu)

            with tf.variable_scope("target_net_scope"):
                self.target_scope_name = tf.get_variable_scope().name
                self.target_qvalues = self._create_network(self.input_state, self.use_conv)

        # loss
        self.gamma = kwargs.setdefault("reward_decay", 0.99)
        self.actions_onehot = tf.one_hot(self.action, self.num_actions)
        td_error = tf.square(self.target - tf.reduce_sum(tf.multiply(self.actions_onehot, self.qvalues), axis=1))
        self.loss = tf.reduce_sum(td_error * self.mask) / tf.reduce_sum(self.mask)
        self.loss_summary = tf.summary.scalar(name='Loss_summary', tensor=self.loss)

        # train op(clip gradient)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, name="ADAM")
        gradients, variables = zip(*optimizer.compute_gradients(self.loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        self.train_op = optimizer.apply_gradients(zip(gradients, variables), name="train_op")
#        self.train_summary = tf.summary.scalar(name='Train_summary', tensor=self.train_op)

        # output action
        def out_action(qvalues):
            best_action = tf.argmax(qvalues, axis=1)
            best_action = tf.to_int32(best_action)
            random_action = tf.random_uniform(tf.shape(best_action), 0, self.num_actions, tf.int32, name="random_action")
            should_explore = tf.random_uniform(tf.shape(best_action), 0, 1) < self.eps
            return tf.where(should_explore, random_action, best_action)

        self.output_action = out_action(self.qvalues)
        if self.num_gpu > 1:
            self.infer_out_action = [out_action(qvalue) for qvalue in self.infer_qvalues]

        # target network update op
        self.update_target_op = []
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.target_scope_name)
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.eval_scope_name)
        for i in range(len(t_params)):
            self.update_target_op.append(tf.assign(t_params[i], e_params[i]))

        # Initialize the tensor board
        if not os.path.exists('summaries'):
            os.mkdir('summaries')
        if not os.path.exists(os.path.join('summaries', 'first')):
            os.mkdir(os.path.join('summaries', 'first'))

        # init tensorflow session
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.summ_writer = tf.summary.FileWriter(os.path.join('summaries', 'first'), self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        # init replay buffers
        self.replay_buffer_len = 0
        self.memory_size = int(kwargs.setdefault("memory_size", 2**10))
        print("Memory size ", self.memory_size)
        self.replay_buf_state = ReplayBuffer(shape=(self.memory_size, self.state_space))
        self.replay_buf_action = ReplayBuffer(shape=(self.memory_size,), dtype=np.int32)
        self.replay_buf_reward = ReplayBuffer(shape=(self.memory_size,))
        self.replay_buf_terminal = ReplayBuffer(shape=(self.memory_size,), dtype=np.bool)
        self.replay_buf_mask = ReplayBuffer(shape=(self.memory_size,))
        # if mask[i] == 0, then the item is used for padding, not for training
        self.policy = BoltzmanPolicy(action_space=self.num_actions)
Example #11
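# Complete TF1 DeepQNetwork model (same constructor as Example #10) plus network construction,
# epsilon-greedy action inference, (double-)DQN target computation, episode-buffer ingestion
# into per-field ReplayBuffers, a batched training loop with periodic target updates, and
# buffer cleanup.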
class DeepQNetwork(TFBaseModel):
    def __init__(self, env, name, **kwargs):
                 #batch_size=64, learning_rate=1e-4, reward_decay=0.99,
                 #train_freq=1, target_update=2000, memory_size=2 ** 10, eval_obs=None,
                 #use_dueling=True, use_double=True, use_conv=False,
                 #custom_state_space=None, num_gpu=1, infer_batch_size=8192, network_type=0):
        """
        Init DQN
        :param env: Environment
            environment
        :param name: str
            name of this model
        :param batch_size: int
        :param learning_rate: float
        :param reward_decay: float
            reward_decay in TD
        :param train_freq: int
            mean training times of a sample
        :param target_update: int
            target will update every target_update batches
        :param memory_size: int
            capacity of the replay buffer
        :param eval_obs: numpy array
            evaluation set of observation
        :param use_dueling: bool
            whether use dueling q network
        :param use_double: bool
            whether use double q network
        :param use_conv: bool
            use convolution or fully connected layer as state encoder
        :param custom_state_space: tuple
        :param num_gpu: int
            number of gpu
        :param infer_batch_size: int
            batch size while inferring actions
        :param network_type:
        """
        TFBaseModel.__init__(self, env, name, "tfdqn")
        # ======================== set config  ========================
        self.env = env
        self.state_space = env.get_state_space()
        self.num_actions = env.get_action_space()

        self.batch_size = kwargs.setdefault("batch_size", 64)
        self.learning_rate = kwargs.setdefault("learning_rate", 1e-4)
        self.training_freq = kwargs.setdefault("training_freq", 1)  # train time of every sample (s,a,r,s')
        self.target_update = kwargs.setdefault("target_update", 1000)   # target network update frequency
        self.eval_obs = kwargs.setdefault("eval_obs", None)
       # self.infer_batch_size = kwargs.setdefault("infer_batch_size", 8192)  # maximum batch size when infer actions,
       # change this to fit your GPU memory if you meet a OOM

        self.network_param = kwargs.setdefault("network", False)
        self.use_dueling = self.network_param["use_dueling"]
        self.use_double = self.network_param["use_double"]
        self.num_gpu = self.network_param["num_gpu"]
        self.use_conv = self.network_param["use_conv"]

        self.nn_layers = self.network_param["layers"]
        self.activation = self.network_param["activation"]
        self.train_ct = 0

        # ======================= build network =======================
        tf.reset_default_graph()
        # input place holder
        self.target = tf.placeholder(tf.float32, [None], name="target")
        self.weight = tf.placeholder(tf.float32, [None], name="weight")

        self.input_state = tf.placeholder(tf.float32, [None, self.state_space], name="input_state")
        self.action = tf.placeholder(tf.int32, [None], name="action")
        self.mask = tf.placeholder(tf.float32, [None], name="mask")
        self.eps = tf.placeholder(tf.float32, name="eps")  # e-greedy

        # build a graph
        with tf.variable_scope(self.name):
            with tf.variable_scope("eval_net_scope"):
                self.eval_scope_name = tf.get_variable_scope().name
                self.qvalues = self._create_network(self.input_state, self.use_conv)

            if self.num_gpu > 1: # build inference graph for multiple gpus
                self._build_multi_gpu_infer(self.num_gpu)

            with tf.variable_scope("target_net_scope"):
                self.target_scope_name = tf.get_variable_scope().name
                self.target_qvalues = self._create_network(self.input_state, self.use_conv)

        # loss
        self.gamma = kwargs.setdefault("reward_decay", 0.99)
        self.actions_onehot = tf.one_hot(self.action, self.num_actions)
        td_error = tf.square(self.target - tf.reduce_sum(tf.multiply(self.actions_onehot, self.qvalues), axis=1))
        self.loss = tf.reduce_sum(td_error * self.mask) / tf.reduce_sum(self.mask)
        self.loss_summary = tf.summary.scalar(name='Loss_summary', tensor=self.loss)

        # train op(clip gradient)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, name="ADAM")
        gradients, variables = zip(*optimizer.compute_gradients(self.loss))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        self.train_op = optimizer.apply_gradients(zip(gradients, variables), name="train_op")
#        self.train_summary = tf.summary.scalar(name='Train_summary', tensor=self.train_op)

        # output action
        def out_action(qvalues):
            best_action = tf.argmax(qvalues, axis=1)
            best_action = tf.to_int32(best_action)
            random_action = tf.random_uniform(tf.shape(best_action), 0, self.num_actions, tf.int32, name="random_action")
            should_explore = tf.random_uniform(tf.shape(best_action), 0, 1) < self.eps
            return tf.where(should_explore, random_action, best_action)

        self.output_action = out_action(self.qvalues)
        if self.num_gpu > 1:
            self.infer_out_action = [out_action(qvalue) for qvalue in self.infer_qvalues]

        # target network update op
        self.update_target_op = []
        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.target_scope_name)
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.eval_scope_name)
        for i in range(len(t_params)):
            self.update_target_op.append(tf.assign(t_params[i], e_params[i]))

        # Initialize the tensor board
        if not os.path.exists('summaries'):
            os.mkdir('summaries')
        if not os.path.exists(os.path.join('summaries', 'first')):
            os.mkdir(os.path.join('summaries', 'first'))

        # init tensorflow session
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.summ_writer = tf.summary.FileWriter(os.path.join('summaries', 'first'), self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        # init replay buffers
        self.replay_buffer_len = 0
        self.memory_size = int(kwargs.setdefault("memory_size", 2**10))
        print("Memory size ", self.memory_size)
        self.replay_buf_state = ReplayBuffer(shape=(self.memory_size, self.state_space))
        self.replay_buf_action = ReplayBuffer(shape=(self.memory_size,), dtype=np.int32)
        self.replay_buf_reward = ReplayBuffer(shape=(self.memory_size,))
        self.replay_buf_terminal = ReplayBuffer(shape=(self.memory_size,), dtype=np.bool)
        self.replay_buf_mask = ReplayBuffer(shape=(self.memory_size,))
        # if mask[i] == 0, then the item is used for padding, not for training
        self.policy = BoltzmanPolicy(action_space=self.num_actions)

    def _create_network(self, input_state, use_conv=False, reuse=None):
        """
        Define computation graph of network
        :param input_state: tf.tensor
        :param use_conv: bool
        :param reuse: bool
        :return:
        """
        kernel_num = [32, 32]
        hidden_size = list(self.nn_layers.values())
        if len(hidden_size) == 1:
            if self.activation == "Linear":
                print("1 NN with Linear activation, " + str(hidden_size[0]) + " neurons")
                h_state = tf.layers.dense(input_state,  units=hidden_size[0], activation=None,
                                    name="h_state", reuse=reuse)
            else:
                print("1 NN with RELU activation, " + str(hidden_size[0]) + " neurons")
                h_state = tf.layers.dense(input_state,  units=hidden_size[0], activation=tf.nn.relu,
                                    name="h_state", reuse=reuse)
        elif len(hidden_size) == 2:
            print("2 Layers NN with " + str(hidden_size[0]) + " and " + str(hidden_size[1]) + " neurons")
            activation = None
            if self.activation != "Linear":
                activation = tf.nn.relu

            h_state_0 = tf.layers.dense(input_state,  units=hidden_size[0], activation=activation,
                                    name="h_state_0", reuse=reuse)

            h_state = tf.layers.dense(h_state_0,  units=hidden_size[1], activation=activation,
                                    name="h_state", reuse=reuse)

        if self.use_dueling:
            value = tf.layers.dense(h_state, units=1, name="value", reuse=reuse)
            advantage = tf.layers.dense(h_state, units=self.num_actions, use_bias=False,
                                        name="advantage", reuse=reuse)

            qvalues = value + advantage - tf.reduce_mean(advantage, axis=1, keep_dims=True)
        else:
            qvalues = tf.layers.dense(h_state, units=self.num_actions, name="value", reuse=reuse)

        return qvalues

    def infer_action(self, user_id, sn, obs, step, policy="e_greedy", eps=0):
        """
        infer action for the given agent.
        :param raw_obs:
        :param user_id: int
            id of the user
        :param policy:
            can be eps-greedy or greedy
        :param eps: float
            used when policy is eps-greedy
        :return:
        """

        if policy == 'e_greedy':
            eps = eps
        elif policy == 'greedy':
            eps = 0

#        if self.num_gpu > 1 and n > batch_size:  # infer by multi gpu in parallel
#            ret = self._infer_multi_gpu(view, feature, ids, eps)
        qvalues = self.sess.run(self.qvalues, feed_dict={self.input_state: obs})
        best_actions = np.argmax(qvalues, axis=1)

        n = 1  # Since we take an action only for 1 user.
        random = np.random.randint(self.num_actions, size=(n,))
        cond = np.random.uniform(0, 1, size=(n,)) < eps
        ret = np.where(cond, random, best_actions)
        action = ret.astype(np.int32)
        # TODO: enable this later.
        #action = self.policy.take_action(qvalues, step)
        #actions.append(action)
        # action = self.sess.run(self.output_action, feed_dict={
        #     self.input_state: obs,
        #     self.eps: eps
        # })
        return action

    def _calc_target(self, next_state, reward, terminal):
        """
        Calculate target value
        :param next_state: next_state of the user.
        :param reward: reward of the previous action
        :param terminal:
        :return:
        """
        n = len(reward)
        if self.use_double:
            t_qvalues, qvalues = self.sess.run([self.target_qvalues, self.qvalues],
                                               feed_dict={self.input_state: next_state})
            next_value = t_qvalues[np.arange(n), np.argmax(qvalues, axis=1)]
        else:
            t_qvalues = self.sess.run(self.target_qvalues, feed_dict={self.input_state: next_state})
            next_value = np.max(t_qvalues, axis=1)

        target = np.where(terminal, reward, reward + self.gamma * next_value)

        return target

    def _add_to_replay_buffer(self, sample_buffer):
        """
        Add stored episode buffers to replay buffer.
        :param sample_buffer:
        :return:
        """
        n = 0
        for episode in sample_buffer.episodes(): # Each user has its own episode.
            s, a, r = [], [], []
            for step in range(len(episode.states)):  # Step represent the sequence number of the transmitted packet.
                if (episode.states[step] != -1) and (episode.actions[step] != -1) and (episode.rewards[step] != -1):
                    # This part is required to make sure we synchronize the s,a and reward.
                    # in order words, to alleviate the effect of delayed rewards.
                    s.append(episode.states[step])
                    a.append(episode.actions[step])
                    r.append(episode.rewards[step])

            m = len(r)
            if m == 0:
                continue
            mask = np.ones((m,))
            terminal = np.zeros((m,), dtype=np.bool)
            if episode.terminal:
                terminal[-1] = True
            else:
                mask[-1] = 0

            self.replay_buf_state.put(s)
            self.replay_buf_action.put(a)
            self.replay_buf_reward.put(r)
            self.replay_buf_terminal.put(terminal)
            self.replay_buf_mask.put(mask)

            n += m

        self.replay_buffer_len = min(self.memory_size, self.replay_buffer_len + n)
        return n

    def train(self, sample_buffer, print_every=1000, **kwargs):
        """
        Add new samples in sample buffer to replay buffer and train
        Parameters
        ----------
        :param sample_buffer: memory.EpisodesBuffer
        :param print_every: int
            print log every print_every batches
        :param kwargs:
        :return:
        loss: float
            bellman residual loss
        value: float
            estimated state value
        """
        add_num = self._add_to_replay_buffer(sample_buffer)
        batch_size = self.batch_size
        total_loss = 0

        n_batches = int(self.training_freq * add_num / batch_size)
        if n_batches == 0:
            return 0, 0

        print("batch number: %d  add: %d  batch_size: %d training_freq: %d  replay_len: %d/%d" %
              (n_batches, add_num, batch_size, self.training_freq, self.replay_buffer_len, self.memory_size))

        start_time = time.time()
        ct = 0
        for i in range(n_batches):
            # fetch a batch
            index = np.random.choice(self.replay_buffer_len - 1, batch_size)

            batch_state = self.replay_buf_state.get(index)
            batch_action = self.replay_buf_action.get(index)
            batch_reward = self.replay_buf_reward.get(index)
            batch_terminal = self.replay_buf_terminal.get(index)
            batch_mask = self.replay_buf_mask.get(index)

            batch_next_state = self.replay_buf_state.get(index+1)

            batch_target = self._calc_target(batch_next_state, batch_reward, batch_terminal)

            ret = self.sess.run([self.train_op, self.loss], feed_dict = {
                self.input_state: batch_state,
                self.action: batch_action,
                self.target: batch_target,
                self.mask: batch_mask
            })
            loss = ret[1]
            total_loss += loss

            if ct % self.target_update == 0:
                print("Target Q update ct " + str(ct))
                self.sess.run(self.update_target_op)

            if ct % print_every == 0:
                print("batch %5d,  loss %.6f, eval %.6f" % (ct, loss, self._eval(batch_target)))

            ct += 1
            self.train_ct += 1

        total_time = time.time() - start_time
        step_average = total_time / max(1.0, (ct / 1000.0))
        print("batches: %d,  total time: %.2f,  1k average: %.2f" % (ct, total_time, step_average))

        return total_loss / ct if ct != 0 else 0, self._eval(batch_target)

    def _eval(self, target):
        """ Evaluate estimated q value"""
        if self.eval_obs is None:
            return np.mean(target)
        else:
            return np.mean(self.sess.run([self.qvalues], feed_dict = {
                self.input_state: self.eval_obs[0]
            }))

    def clean_buffer(self):
        """ Clean replay buffer """
        self.replay_buffer_len = 0
        self.replay_buf_state.clear()
        self.replay_buf_action.clear()
        self.replay_buf_reward.clear()
        self.replay_buf_terminal.clear()
        self.replay_buf_mask.clear()
Example #12
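# Trains an environment model with EnvLearner: transitions come either from a test-mode DQN
# runner feeding a fresh ReplayBuffer or from a stored trajectory via TrajectoryReplay
# (the snippet is truncated before the output-directory handling).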
from utils.network_utils import NetworkSaver
from runner.runner import TrajRunner
from utils.trajectory_utils import TrajectoryReplay
from nets.net import init_nn_library

arguments = vars(args)

env = get_env(args.game, args.atari, args.env_transforms)

if args.load_trajectory is None:
	dqn_args = arguments.copy()
	dqn_args['mode'] = 'test'
	dqn_args['replay_buffer_size'] = 0

	runner, _ = run_dqn(**dqn_args)
	replay_buffer = ReplayBuffer(args.replay_buffer_size, 1, args.update_frequency, args.replay_start_size, args.batch_size)
else:
	init_nn_library(True, "1")
	runner = TrajRunner(args.max_step)
	replay_buffer = TrajectoryReplay(args.load_trajectory, args.batch_size)

envOps = EnvOps(env.observation_space.shape, env.action_space.n, args.learning_rate)
	summary_writer = tf.summary.FileWriter(args.logdir, K.get_session().graph) if args.logdir is not None else None

#model = EnvModelCartpole(envOps)
model = globals()[args.env_model](envOps)
env = EnvLearner(replay_buffer, model, summary_writer, args.reward_scale)

runner.listen(env, None)

if args.output_dir is None:
Example #13
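# Complete DDPG + HER agent (same constructor as Example #4): episode collection with optional
# action noise, HER-sampled ReplayBuffer updates of actor and critic, soft target updates,
# observation/goal normalization, and evaluation by success rate.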
class DDPGAgent(BaseAgent):
    def __init__(self, config, env, env_params, her):
        BaseAgent.__init__(self, config)

        self.env = env
        self.config = config
        self.env_params = env_params

        #self.network = ActorCriticDeterministic(env_params['obs'], env_params['action'],
        #                                        env_params['goal'],
        #                                       config.hidden_layers,
        #                                        use_her=config.use_her)
        self.actor = actor(env_params)
        self.target_actor = actor(env_params)
        self.target_actor.load_state_dict(self.actor.state_dict())
        #=============
        self.critic = critic(env_params)
        self.target_critic = critic(env_params)
        self.target_critic.load_state_dict(self.critic.state_dict())

        # self.target_network = ActorCriticDeterministic(env_params['obs'], env_params['action'],
        #                                                env_params['goal'],
        #                                                config.hidden_layers,
        #                                                use_her=config.use_her)

        # self.target_network.load_state_dict(self.network.state_dict())

        self.her = her_sampler(config.replay_strategy, config.replay_k,
                               env.compute_reward)

        self.replay_buffer = ReplayBuffer(
            env_params,
            buffer_size=int(config.buffer_size),
            sample_func=self.her.sample_her_transitions)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.o_norm = normalizer(size=env_params['obs'],
                                 default_clip_range=self.config.clip_range)
        self.g_norm = normalizer(size=env_params['goal'],
                                 default_clip_range=self.config.clip_range)
        self.model_path = '/home/mohamed/Desktop/Project/utils'

    def learn(self):
        for epoch in range(1, self.config.n_epochs + 1):
            for _ in range(self.config.n_cycles):
                #for _ in range(2):
                episode = self._sample(epoch)
                self.replay_buffer.store_episode(episode)
                self._update_normalizer(episode)
                for _ in range(self.config.n_batches):
                    self._update()

                self._soft_update()

            success_rate = self._eval_agent()

            print(
                'Success rate after {} epochs is {:.3f} over {} test runs'.
                format(epoch, success_rate, self.config.test_rollouts))
        torch.save([
            self.o_norm.mean, self.o_norm.std, self.g_norm.mean,
            self.g_norm.std,
            self.actor.state_dict()
        ], self.model_path + '/model.pt')

    def _sample(self, epoch):
        obs_batch = []
        action_batch = []
        achieved_goals_batch = []
        goals_batch = []
        actions_episode = []
        obs_episode = []
        goals_episode = []
        achieved_goals_episode = []
        observation = self.env.reset()
        goal = observation['desired_goal']
        obs = observation['observation']
        achieved_goal = observation['achieved_goal']

        i = 0
        while True:
            if self.config.render:
                self.env.render()
            with torch.no_grad():
                action = self.actor(obs, goal)

                if self.config.add_noise:
                    action = self._select_actions(action[0], 1 / epoch)

            new_obs, _, _, info = self.env.step(action)

            achieved_goal = new_obs['achieved_goal']
            obs_episode.append(obs.copy())
            obs = new_obs['observation']

            achieved_goals_episode.append(achieved_goal.copy())
            i += 1
            if i > self.env_params['max_timesteps']:
                break
            actions_episode.append(action.copy())
            goals_episode.append(goal.copy())

        obs_batch.append(obs_episode)
        action_batch.append(actions_episode)
        achieved_goals_batch.append(achieved_goals_episode)
        goals_batch.append(goals_episode)

        episode = [
            np.array(obs_batch),
            np.array(achieved_goals_batch),
            np.array(goals_batch),
            np.array(action_batch)
        ]

        # self.replay_buffer.store_episode([np.array(obs_batch),
        #                                   np.array(achieved_goals_batch),
        #                                   np.array(goals_batch),
        #                                   np.array(action_batch)])
        # self._update_normalizer([np.array(obs_batch),
        #                                   np.array(achieved_goals_batch),
        #                                   np.array(goals_batch),
        #                                   np.array(action_batch)])
        return episode

    def _update(self):
        experiences = self.replay_buffer.sample(self.config.batch_size)
        states, goals = self._preproc_og(experiences['obs'], experiences['g'])
        next_states, next_goals = self._preproc_og(experiences['next_obs'],
                                                   experiences['g'])
        actions = experiences['actions']
        rewards = experiences['r']

        states = self.o_norm.normalize(states)
        goals = self.g_norm.normalize(goals)
        next_states = self.o_norm.normalize(next_states)
        next_goals = self.g_norm.normalize(next_goals)

        with torch.no_grad():
            next_actions = self.target_actor(next_states, next_goals)
            target_value = self.target_critic(next_states, next_actions[0],
                                              next_goals)
            expected_value = (to_tensor(rewards) +
                              self.config.discount * target_value).detach()

            clip_return = 1 / (1 - self.config.discount)
            expected_value = torch.clamp(expected_value, -clip_return, 0)

        #====== Value loss ========
        value_criterion = nn.MSELoss()
        value = self.critic(states, actions, goals)
        value_loss = value_criterion(expected_value, value)
        #====== Policy loss =======
        actions_ = self.actor(states, goals)
        policy_loss = -(self.critic(states, actions_[0], goals)).mean()
        policy_loss += self.config.action_l2 * (actions_[0]).pow(2).mean()
        #====== Policy update =======
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()
        #====== Value update ========
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

    def _soft_update(self):
        tau = self.config.tau_ddpg
        for targetp, netp in zip(self.target_critic.parameters(),
                                 self.critic.parameters()):
            targetp.data.copy_(tau * netp + (1 - tau) * targetp)

        for targetp, netp in zip(self.target_actor.parameters(),
                                 self.actor.parameters()):
            targetp.data.copy_(tau * netp + (1 - tau) * targetp)

    def _eval_agent(self):
        success_rate = 0
        for _ in range(self.config.test_rollouts):

            observation = self.env.reset()

            obs = observation['observation']
            goal = observation['desired_goal']
            obs, goal = self._preproc_inputs(obs, goal)
            for _ in range(self.env_params['max_timesteps']):
                if self.config.render:
                    self.env.render()
                with torch.no_grad():
                    action = self.actor(obs, goal)
                new_obs, _, _, info = self.env.step(to_numpy(action[0]))
                obs, goal = self._preproc_inputs(new_obs['observation'],
                                                 new_obs['desired_goal'])
            success_rate += info['is_success']

        return success_rate / self.config.test_rollouts

    def _select_actions(self, pi, eps):
        action = pi.cpu().numpy().squeeze()
        # add the gaussian

        action += self.config.eps * self.env_params[
            'action_max'] * np.random.randn(*action.shape)
        action = np.clip(action, -self.env_params['action_max'],
                         self.env_params['action_max'])
        # random actions...

        random_actions = np.random.uniform(low=-self.env_params['action_max'], high=self.env_params['action_max'], \
                                            size=self.env_params['action'])
        # choose if use the random actions
        action += np.random.binomial(1, eps, 1) * (random_actions - action)
        return action

    def _update_normalizer(self, episode_batch):
        mb_obs, mb_ag, mb_g, mb_actions = episode_batch
        mb_obs_next = mb_obs[:, 1:, :]
        mb_ag_next = mb_ag[:, 1:, :]
        # get the number of normalization transitions
        num_transitions = mb_actions.shape[1]
        # create the new buffer to store them
        buffer_temp = {
            'obs': mb_obs,
            'ag': mb_ag,
            'g': mb_g,
            'actions': mb_actions,
            'next_obs': mb_obs_next,
            'next_ag': mb_ag_next,
        }
        transitions = self.her.sample_her_transitions(buffer_temp,
                                                      num_transitions)
        obs, g = transitions['obs'], transitions['g']
        # pre process the obs and g
        transitions['obs'], transitions['g'] = self._preproc_og(obs, g)
        # update
        self.o_norm.update(transitions['obs'])
        self.g_norm.update(transitions['g'])
        # recompute the stats
        self.o_norm.recompute_stats()
        self.g_norm.recompute_stats()

    def _preproc_inputs(self, obs, g):
        obs_norm = self.o_norm.normalize(obs)
        g_norm = self.g_norm.normalize(g)
        return obs_norm, g_norm

    def _preproc_og(self, o, g):
        o = np.clip(o, -self.config.clip_obs, self.config.clip_obs)
        g = np.clip(g, -self.config.clip_obs, self.config.clip_obs)
        return o, g