Example #1
    def __init__(self, env, mode, pre_trained_model, tensorboard_writer=None):
        super(DQNAgent, self).__init__(env, mode, tensorboard_writer)
        self.agent_name = 'DQN' + str(self.agent_no)
        self.memory = ReplayMemory()

        self.network = DeepQNetwork(self.obs_space[0], self.action_space)

        if self.mode == 'play':
            self.network.load_params(pre_trained_model)
            self.network.eval()

        elif self.mode == 'train':

            self.eval_network = DeepQNetwork(self.obs_space[0],
                                             self.action_space)
            self.eval_network.eval()

            if pre_trained_model:
                self.eval_network.load_params(pre_trained_model)

            self.optimizer = optim.RMSprop(self.network.parameters(), lr=LR)
            self.loss_func = SmoothL1Loss()
        else:
            raise ValueError(
                'Please set a valid mode for the agent (play or train)')
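All of these examples construct some project-specific ReplayMemory. As a rough point of reference, a minimal uniform-sampling buffer of the kind the simpler DQN agents assume could look like the sketch below; the class name and the Transition field names are assumptions, not taken from any one repository.

import random
from collections import deque, namedtuple

# Hypothetical transition layout; the examples on this page each define their own.
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'done'))


class UniformReplayMemory:
    """Fixed-capacity buffer with uniform random sampling (a minimal sketch)."""

    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped automatically

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)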
Example #2
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

        self.actions_q = tf.placeholder(shape=[None, a_size, N],
                                        dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars  #tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        #grads,self.grad_norms = tf.clip_by_norm(self.gradients,40.0)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())
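Example #2 multiplies its squared TD error by ISWeights, which suggests the ReplayMemory(100000) used here is a prioritized buffer. A small NumPy sketch of proportional prioritization (alpha, beta and the epsilon offset are typical values, not taken from this repository) shows where such importance-sampling weights usually come from.

import numpy as np

def sample_prioritized(priorities, batch_size, alpha=0.6, beta=0.4):
    """Sample indices proportionally to priority**alpha and return importance-sampling weights."""
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()
    idx = np.random.choice(len(priorities), size=batch_size, p=probs)
    # Correct for the non-uniform sampling; normalize so the largest weight is 1.
    weights = (len(priorities) * probs[idx]) ** (-beta)
    weights /= weights.max()
    return idx, weights

# After the update step, priorities are typically refreshed with the new |TD error|,
# e.g. priorities[idx] = np.abs(td_errors) + 1e-6 to keep them strictly positive.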
Example #3
    def __init__(self, policy_net, target_net, durability, optimizer, name,
                 constants):
        """An agent class that takes action on the environment and optimizes
        the action based on the reward.

        Parameters
        ----------
        policy_net : DQN
            [description]
        target_net : DQN
            [description]
        durability : int
            [description]
        optimizer : [type]
            [description]
        name : str
            The name of the agent
        constants: Constants
            The hyper-parameters from the Constants class
        """
        self.CONSTANTS = constants
        self.policy_net = policy_net
        self.target_net = target_net
        self.target_net.load_state_dict(policy_net.state_dict())
        self.durability = durability
        self.optimizer = optimizer
        self.name = name
        self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
        self.steps_done = 0
        self.total_reward = 0.0
        self.reward = 0.0
        self.obtained_reward = 0.0
        self.n_best = 0
        self.policy_net_flag = False
Example #4
    def __init__(self, state_size, action_size, seed, is_double_q=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
            is_double_q (bool): whether to use the Double DQN update
        '''

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.t_step = 0  # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)
        self.running_loss = 0
        self.training_cnt = 0

        self.is_double_q = is_double_q

        self.qnetwork_local = QNetwork(self.state_size, self.action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size,
                                        seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
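Example #4 keeps a t_step counter "for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP". The constants themselves are not shown; the usual pattern behind such counters is a modulo check, sketched here with assumed values.

LEARN_EVERY_STEP = 4       # assumed value, not taken from the example
UPDATE_EVERY_STEP = 1000   # assumed value, not taken from the example

def should_learn(t_step, memory_len, batch_size):
    """Learn only every few environment steps, and only once enough transitions are stored."""
    return t_step % LEARN_EVERY_STEP == 0 and memory_len >= batch_size

def should_update_target(t_step):
    """Refresh the target network on a slower schedule than the local network."""
    return t_step % UPDATE_EVERY_STEP == 0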
Example #5
    def __init__(self,
                 load_checkpoint,
                 n_states,
                 n_actions,
                 checkpoint_file,
                 mem_size=10**6,
                 batch_size=64,
                 n_hid1=400,
                 n_hid2=300,
                 alpha=1e-4,
                 beta=1e-3,
                 gamma=0.99,
                 tau=0.99):
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  alpha,
                                  checkpoint_file,
                                  name='actor')
        self.critic = CriticNetwork(n_states,
                                    n_actions,
                                    n_hid1,
                                    n_hid2,
                                    beta,
                                    checkpoint_file,
                                    name='critic')

        self.actor_target = ActorNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         alpha,
                                         checkpoint_file,
                                         name='actor_target')
        self.critic_target = CriticNetwork(n_states,
                                           n_actions,
                                           n_hid1,
                                           n_hid2,
                                           beta,
                                           checkpoint_file,
                                           name='critic_target')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.memory = ReplayMemory(mem_size, n_states, n_actions)
        self.update_network_parameters_phil(tau=1)
        if load_checkpoint:
            self.actor.eval()
        self.load_checkpoint = load_checkpoint
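Example #5 copies the online networks into their targets at start-up (update_network_parameters_phil(tau=1)) and then keeps them close with a tau-weighted soft update. A PyTorch sketch of that Polyak averaging, with the function name and the tau value chosen here rather than taken from the repository:

import torch

def soft_update(online_net, target_net, tau=0.005):
    """target <- tau * online + (1 - tau) * target, applied parameter-wise."""
    with torch.no_grad():
        for p_online, p_target in zip(online_net.parameters(), target_net.parameters()):
            p_target.mul_(1.0 - tau).add_(tau * p_online)

# tau=1 reproduces the hard copy used for initialization in the example above.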
Example #6
 def __init__(self, env, name, s_size, a_size, trainer, model_path,
              global_episodes):
     self.name = "worker_" + str(name)
     self.number = name
     self.model_path = model_path
     self.trainer = trainer
     self.global_episodes = global_episodes
     self.increment = self.global_episodes.assign_add(1)
     self.episode_rewards = []
     self.episode_lengths = []
     self.episode_mean_values = []
     # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network
     self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
     self.update_local_ops = update_target_graph('global', self.name)
     self.env = env
     self.replaymemory = ReplayMemory(max_memory)
Example #7
 def __init__(self, num_states, num_actions, Double, Dueling, PER):
     self.num_actions = num_actions  # number of possible actions (2)
     self.Double = Double
     self.Dueling = Dueling
     self.PER = PER

     # memory object that stores transitions
     self.memory = ReplayMemory(CAPACITY)

     # build the neural networks
     n_in, n_mid, n_out = num_states, 32, num_actions
     self.main_q_network = Net(n_in, n_mid, n_out, Dueling)  # uses the Net class
     self.target_q_network = Net(n_in, n_mid, n_out, Dueling)  # uses the Net class
     print(self.main_q_network)  # print the network architecture

     # choose the optimization method
     self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)

     # PER - memory object that stores TD errors
     if self.PER:
         self.td_error_memory = TDerrorMemory(CAPACITY)
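Example #7 switches the Double and Dueling variants on via constructor flags. The Double DQN part usually changes only how the bootstrap target is built: the online network picks the next action and the target network evaluates it. A PyTorch sketch under that assumption:

import torch

def double_dqn_targets(main_q, target_q, next_states, rewards, dones, gamma=0.99):
    """rewards, dones: float tensors of shape (batch,); returns the Double DQN target."""
    with torch.no_grad():
        next_actions = main_q(next_states).argmax(dim=1, keepdim=True)
        next_values = target_q(next_states).gather(1, next_actions).squeeze(1)
        return rewards + gamma * next_values * (1.0 - dones)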
Example #8
    def __init__(self, dim):
        self.critic_path = cst.CN_CKPT_PATH
        self.actor_path = cst.AN_CKPT_PATH
        self.replaymemory_path = cst.RM_PATH

        self.dim_body = dim[0]
        self.dim_sensor = dim[1]
        self.dim_state = dim[0] + dim[1] * 3
        self.dim_action = dim[2]

        self.sess = tf.InteractiveSession()
        self.act_lr = cst.ACT_LEARNING_RATE
        self.cri_lr = cst.CRI_LEARNING_RATE
        self.tau = cst.TAU
        self.batch_size = cst.BATCH_SIZE
        self.gamma = cst.REWARD_DECAY

        self.actorNN = ActorNetwork(self.sess, self.dim_state, self.dim_action,
                                    self.act_lr, self.tau, self.batch_size)
        self.criticNN = CriticNetwork(self.sess, self.dim_state,
                                      self.dim_action, self.cri_lr, self.tau,
                                      self.gamma,
                                      self.actorNN.get_num_trainable_vars())

        self.sess.run(tf.global_variables_initializer())

        self.actorNN.update_target_network()
        self.criticNN.update_target_network()

        self.rm = ReplayMemory('DDPG')

        self.agent_count = cst.AGENT_COUNT
        self.exploration_rate = cst.EXPLORATION_RATE
        self.epsilon = cst.CRITIC_EPSILON
        self.LOSS_ITERATION = cst.LOSS_ITERATION

        self.expl_noise = OUNoise(self.dim_action)

        self.expl = False
        self.expl_decay = cst.EXPLORATION_DECAY
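The OUNoise / OUActionNoise objects in Examples #5 and #8 add temporally correlated exploration noise to the deterministic policy's actions. A minimal NumPy sketch of the Ornstein-Uhlenbeck process (parameter values are common defaults, not taken from these repositories):

import numpy as np

class OUNoiseSketch:
    """Discretized OU process: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.full(dim, mu, dtype=np.float64)

    def reset(self):
        self.x[:] = self.mu

    def sample(self):
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.x.shape))
        self.x += dx
        return self.x.copy()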
Example #9
 def __init__(self,env,name,s_size,a_size,trainer,model_path,global_episodes):
     self.name = "worker_" + str(name)
     self.number = name
     self.model_path = model_path
     self.trainer = trainer
     self.global_episodes = global_episodes
     self.increment = self.global_episodes.assign_add(1)
     self.episode_rewards = []
     self.episode_lengths = []
     self.episode_mean_values = []
     # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network
     self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
     self.update_local_ops = update_target_graph('global', self.name)
     self.env = env
     self.replaymemory = ReplayMemory(max_memory)
Example #10
def main(game, episodes, training_mode=False, log=False, no_ops=30):
    env = gym.make(game)
    num_actions = env.action_space.n
    dqn = DeepQNetwork(num_actions, (4, 84, 84))
    replay = ReplayMemory(100000)
    obs = env.reset()
    h, w, c = obs.shape
    phi = Phi(4, 84, 84, c, h, w)
    agent = Agent(replay, dqn, training_mode=training_mode)
    stats = Stats('results/results.csv')

    for i_episode in range(episodes):
        env.reset()

        for i in range(random.randint(1, no_ops)):
            observation, _, _, _ = env.step(0)
            pre_state = phi.add(observation)

        game_score = 0
        done = False
        t = 0

        while not done:
            t += 1
            env.render()
            action = agent.get_action(pre_state)
            observation, reward, done, _ = env.step(action)
            post_state = phi.add(observation)

            if training_mode:
                agent.update_replay_memory(pre_state, action, reward,
                                           post_state, done)
                if agent.time_step > agent.replay_start_size:
                    stats.log_time_step(agent.get_loss())

            pre_state = post_state
            game_score += reward

        print("Episode {} finished after {} time steps with score {}".format(
            i_episode, t, game_score))
        phi.reset()
        if agent.time_step > agent.replay_start_size:
            stats.log_game(game_score, t)

    stats.close()

    if log:
        dqn.save_model('results/model_weights.hdf5')
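Example #10 leaves the learning step inside Agent. Given a batch sampled from the ReplayMemory, the Q-learning target such an agent would regress towards can be written in a few lines of NumPy (array shapes and names here are assumptions, not the repository's API):

import numpy as np

def q_learning_targets(q_values, next_q_values, actions, rewards, dones, gamma=0.99):
    """Replace the taken action's Q-value with r + gamma * max_a' Q(s', a'), or r if terminal."""
    targets = q_values.copy()                                   # shape (batch, n_actions)
    bootstrap = rewards + gamma * next_q_values.max(axis=1) * (1.0 - dones)
    targets[np.arange(len(actions)), actions] = bootstrap
    return targets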
Example #11
def main():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env_evaluate = PLE(game, fps=30, display_screen=False)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the "up" key plus one other action, so 2

    # rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6
    )  # probability of exploring is decreasing during training

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # while rpm.size() < MEMORY_WARMUP_SIZE:  # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 5000

    # start train
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env_evaluate)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))

    agent.save('./model_dir')
Example #12
EPISODES = 500
START_RANDOM = False
MAX_EPISODE_COUNTER = 3600 * 24 * 2.0 / PERIOD
ACTION_DIM = 1
STATE_DIM = 6
ACTION_MAX = 1.0
MAX_BUFFER = 100000
MAX_TOTAL_REWARD = 300
EPISODE_PLOT = 25

# -------------------------------------------- #
# LOAD USEFUL CLASSES.
# -------------------------------------------- #

# Load the memory.
memory = ReplayMemory(MAX_BUFFER)

# Load the environment.
env = Environment(FILENAME, QUOTE_QTY, TRADE_QTY)

# Load the trainer.
trainer = Trainer(STATE_DIM, ACTION_DIM, ACTION_MAX, memory)

# Load the window.
window = Window(LOOK_BACK)
window.add_norm("#t", method="log_change", ref="close_price_#t")

# Load the tensorboard writer.
writer = SummaryWriter("tensorboard/runs")

# -------------------------------------------- #
Example #13
    def __init__(
            self,
            game,
            mem_size=512 * 512,  #1024*512,
            state_buffer_size=4,
            batch_size=64,
            learning_rate=1e-5,
            pretrained_model=None,
            frameskip=4,  #1
            record=False):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders)
        #if self.game == 'Breakout-v0':
        #    dimensions = (32, 195, 8, 152)
        #elif self.game == 'SpaceInvaders-v0':
        #    dimensions = (21, 195, 20, 141)
        #elif self.game == 'Assault-v0':
        #    dimensions = (50, 240, 5, 155)
        #elif self.game == 'Phoenix-v0':
        #    dimensions = (23, 183, 0, 160)
        #elif self.game == 'Skiing-v0':
        #    dimensions = (55, 202, 8, 152)
        #elif self.game == 'Enduro-v0':
        #    dimensions = (50, 154, 8, 160)
        #elif self.game == 'BeamRider-v0':
        #    dimensions = (32, 180, 9, 159)

        if self.game == 'BreakoutAndSpace':
            dimensions_break = (32, 195, 8, 152)
            dimensions_space = (21, 195, 20, 141)
        else:
            print(
                'Error! This version is for playing BreakOut and SpaceInvaders at the same time.'
            )

        # Environment
        self.env_break = Environment('BreakoutNoFrameskip-v4',
                                     dimensions_break,
                                     frameskip=frameskip)
        self.env_space = Environment('SpaceInvaders-v0',
                                     dimensions_space,
                                     frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in=state_buffer_size,
                       num_actions=self.env_space.get_number_of_actions())

        self.target_net = DQN(
            channels_in=state_buffer_size,
            num_actions=self.env_space.get_number_of_actions())

        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr = 0.00025,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 4
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size,
                                   num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 25000
        else:
            self.start_train_after = mem_size // 2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500
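Examples #13, #16 and #17 keep a short buffer of the state_buffer_size most recent grayscale frames and concatenate them into a single network input. A small PyTorch sketch of that stacking step, assuming the gray conversion (gray2pytorch in the examples) already yields (1, 1, H, W) uint8 tensors:

import torch

def stack_frames(frames):
    """Concatenate k (1, 1, H, W) uint8 frames into a (1, k, H, W) float state in [0, 1]."""
    return torch.cat(frames, dim=1).float() / 255.0

# Usage sketch: keep a rolling list of the k most recent frames, then
# frames.pop(0); frames.append(new_frame); state = stack_frames(frames)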
Example #14
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 update_actor_interval=2,
                 warmup=1000,
                 mem_size=10**6,
                 batch_size=100,
                 n_hid1=400,
                 n_hid2=300,
                 lr_alpha=1e-3,
                 lr_beta=1e-3,
                 gamma=0.99,
                 tau=5e-3,
                 noise_mean=0,
                 noise_sigma=0.1):

        self.load_checkpoint = load_checkpoint
        self.checkpoint_file = checkpoint_file
        # needed for clamping in the learn function
        self.env = env
        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.n_actions = n_actions
        # to keep track of how often we call "learn" function, for the actor network
        self.learn_step_counter = 0
        # to handle countdown to the end of the warmup period, incremented every time we call an action
        self.time_step = 0
        self.update_actor_interval = update_actor_interval
        self.warmup = warmup
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.noise_mean = noise_mean
        self.noise_sigma = noise_sigma

        self.actor = TD3ActorNetwork(n_states,
                                     n_actions,
                                     n_hid1,
                                     n_hid2,
                                     lr_alpha,
                                     checkpoint_file,
                                     name='actor')
        self.target_actor = TD3ActorNetwork(n_states,
                                            n_actions,
                                            n_hid1,
                                            n_hid2,
                                            lr_alpha,
                                            checkpoint_file,
                                            name='target_actor')

        self.critic_1 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_1')
        self.critic_2 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_2')
        self.target_critic_1 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_1')
        self.target_critic_2 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_2')

        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        # tau=1 perform an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.actor, self.target_actor, tau=1)
        self.update_network_parameters(self.critic_1,
                                       self.target_critic_1,
                                       tau=1)
        self.update_network_parameters(self.critic_2,
                                       self.target_critic_2,
                                       tau=1)
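Example #14's constructor sets up a warmup period and Gaussian action noise but does not show the action-selection step; in TD3 that step typically acts randomly during warmup and afterwards clips the noisy actor output to the action bounds. A hedged NumPy sketch (actor_forward stands in for whatever callable returns the actor's action as an array):

import numpy as np

def td3_choose_action(actor_forward, state, time_step, warmup, n_actions,
                      noise_sigma, low_action, max_action):
    """Random actions during warmup, otherwise actor output plus clipped Gaussian noise."""
    if time_step < warmup:
        action = np.random.normal(scale=noise_sigma, size=(n_actions,))
    else:
        action = actor_forward(state) + np.random.normal(scale=noise_sigma, size=(n_actions,))
    return np.clip(action, low_action, max_action)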
Example #15
image_dimensions = 210 * 160 * 3
num_episodes = 50
target_episode_update = 5
action_threshold = 250
train_batch_size = 64

GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
steps_done = 0
n_actions = env.action_space.n
screen_height = 210
screen_width = 160

memory = ReplayMemory(10000)

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())


def optimize_model():
    if len(memory) < train_batch_size:
        return

    transitions = memory.sample(train_batch_size)
    print('Training on:', len(transitions))
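Example #15 stops right after sampling. The loss such an optimize_model typically goes on to minimize is the Huber loss between Q(s, a) and the bootstrapped target; a standalone hedged sketch (tensor shapes, dtypes and the terminal-state handling are assumptions):

import torch
import torch.nn.functional as F

def dqn_loss(policy_net, target_net, states, actions, rewards, next_states, dones, gamma=0.999):
    """actions: int64 tensor of shape (batch, 1); rewards, dones: float tensors of shape (batch,)."""
    q_sa = policy_net(states).gather(1, actions)                  # Q(s_t, a_t), shape (batch, 1)
    with torch.no_grad():
        next_max = target_net(next_states).max(dim=1)[0]          # max_a' Q_target(s_{t+1}, a')
        target = rewards + gamma * next_max * (1.0 - dones)
    return F.smooth_l1_loss(q_sa, target.unsqueeze(1))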
Example #16
class SingleAgent(object):
    def __init__(self,
                 game,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 frameskip = 4
                 ):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # Environment
        self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())

        self.target_net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500


    def select_action(self, observation, mode='train'):
        """
        Select a random action from the action space or an action proposed
        by the neural network, depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        EPSILON_PLAY = 0.01
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY
        elif mode=='play':
            epsilon = EPSILON_PLAY
        else:
            epsilon = EPSILON_END

        if epsilon < random():
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1,1)

            # Prevent noops
            if action[0,0]!=1:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0,0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:
            # Random action
            action = self.env.sample_action()
            action = LongTensor([[action]])

        return action


    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float
        - q_value: float
        - exp_q_value: float
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition
        batch = self.replay.sampleTransition(self.batch_size)

        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]),
                                              volatile=True) # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(1)[0]
        next_state_values[non_final_mask]= next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()

        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates%self.update_target_net_each_k_steps==0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]


    def play(self, n):
        """
        Play a game with the current net and render it

        Inputs:
        - n: games to play
        """
        for i in range(n):
            done = False # games end indicator variable
            score = 0
            # Reset game
            screen = self.env.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = gray2pytorch(screen)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            while not done:
                action = self.select_action(state, mode='play')[0,0]

                screen, reward, _, done, _ = self.env.step(action, mode='play')
                score += reward

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                # convert frames to range 0 to 1 again
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                self.state = state
            print('Game ({}/{}) - Final score {}: {}'.format(i+1, n, self.game, score))
        self.env.game.close()


    def play_stats(self, n_games, mode='random'):
        """
        Play N games randomly or evaluate a net and log results for statistics

        Input:
        - n_games: int Number of games to play
        - mode: str 'random' or 'evaluation'
        """
        # Subdirectory for logging
        sub_dir = mode + '_' + self.game + '/'
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # Store history
        reward_history = []
        reward_clamped_history = []

        # Number of actions to sample from
        n_actions = self.env.get_number_of_actions()

        for i_episode in range(1, n_games+1):
            # Reset game
            screen = self.env.reset()

            # Store screen
            if mode=='evaluation':
                # list of k last frames
                last_k_frames = []
                for j in range(self.num_stored_frames):
                    last_k_frames.append(None)
                    last_k_frames[j] = gray2pytorch(screen)
                # frame is saved as ByteTensor -> convert to gray value between 0 and 1
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done = False

            # reset score with initial lives, because every lost life adds -1
            total_reward = 0
            total_reward_clamped = self.env.get_lives()

            while not done:
                if mode=='random':
                    action = randrange(n_actions)
                elif mode=='evaluation':
                    action = self.select_action(state, mode='play')[0,0]

                screen, reward, reward_clamped, done, _ = self.env.step(action)
                total_reward += int(reward)
                total_reward_clamped += int(reward_clamped)

                if mode=='evaluation':
                    #   save latest frame, discard oldest
                    for j in range(self.num_stored_frames-1):
                        last_k_frames[j] = last_k_frames[j+1]
                    last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                    # convert frames to range 0 to 1 again
                    state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0



            # Print current result
            print('Episode: {:6}/{:6} |  '.format(i_episode, n_games),
                  'score: ({:4}/{:4})'.format(total_reward_clamped,total_reward))

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

        avg_reward = np.sum(reward_history)/len(reward_history)
        avg_reward_clamped = np.sum(reward_clamped_history)/len(reward_clamped_history)

        # Print final result
        print('\n\n=============================================\n' +
              'avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward))

        # Log results to files
        with open(sub_dir + mode + '.txt', 'w') as fp:
            fp.write('avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward))
        with open(sub_dir + mode + '_reward.pickle', 'wb') as fp:
            pickle.dump(reward_history, fp)
        with open(sub_dir + mode + '_reward_clamped.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history, fp)


    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + self.game + '_train.txt'
        loss_file = sub_dir + 'loss.pickle'
        reward_file = sub_dir + 'reward.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        log_avg_episodes = 50

        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        loss_history = []
        reward_history = []
        reward_clamped_history = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' +
                     'Trained game:                       ' + str(self.game) + '\n' +
                     'Learning rate:                      ' + str(self.learning_rate) + '\n' +
                     'Batch size:                         ' + str(self.batch_size) + '\n' +
                     'Memory size(replay):                ' + str(self.mem_size) + '\n' +
                     'Pretrained:                         ' + str(self.pretrained_model) + '\n' +
                     'Started training after k frames:    ' + str(self.start_train_after) + '\n' +
                     'Optimized after k frames:           ' + str(self.optimize_each_k) + '\n' +
                     'Target net update after k frame:    ' + str(self.update_target_net_each_k_steps) + '\n\n' +
                     '------------------------------------------------------' +
                     '--------------------------------------------------\n')

        print('Started training...\nLogging to', sub_dir)

        for i_episode in range(1,num_episodes):
            # reset game at the start of each episode
            screen = self.env.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = gray2pytorch(screen)

            if i_episode == 1:
                self.replay.pushFrame(last_k_frames[0].cpu())

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            done = False # games end indicator variable
            # reset score with initial lives, because every lost life adds -1
            total_reward = 0
            total_reward_clamped = self.env.get_lives()

            # Loop over one game
            while not done:
                self.steps +=1

                action = self.select_action(state)

                # perform selected action on game
                screen, reward, reward_clamped, done, _ = self.env.step(action[0,0])
                total_reward += int(reward)
                total_reward_clamped += int(reward_clamped)

                # Wrap into tensor
                reward = torch.Tensor([reward_clamped])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                # convert frames to range 0 to 1 again
                if not done:
                    next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                else:
                    next_state = None

                # Store transition
                self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu())
                self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity, action, reward, done)

                #	only optimize each kth step
                if self.steps%self.optimize_each_k == 0:
                    loss = self.optimize(net_updates)

                    # Logging
                    loss_history.append(loss)
                    #q_history.append(q_value)
                    #exp_q_history.append(exp_q_value)

                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done:
                    break

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

            print('Episode: {:6} |  '.format(i_episode),
                  'steps {:8} |  '.format(self.steps),
                  'loss: {:.2E} |  '.format(loss if loss else 0),
                  'score: ({:4}/{:4}) |  '.format(total_reward_clamped,total_reward),
                  'best score: ({:4}/{:4}) |  '.format(best_score_clamped,best_score),
                  'replay size: {:7}'.format(len(self.replay)))

            avg_score_clamped += total_reward_clamped
            avg_score += total_reward
            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            if i_episode % log_avg_episodes == 0 and i_episode!=0:
                avg_score_clamped /= log_avg_episodes
                avg_score /= log_avg_episodes

                print('----------------------------------------------------------------'
                      '-----------------------------------------------------------------',
                      '\nLogging to file: \nEpisode: {:6}   '.format(i_episode),
                      'steps: {:8}   '.format(self.steps),
                      'avg on last {:4} games ({:6.1f}/{:6.1f})   '.format(log_avg_episodes, avg_score_clamped,avg_score),
                      'best score: ({:4}/{:4})'.format(best_score_clamped, best_score),
                      '\n---------------------------------------------------------------'
                      '------------------------------------------------------------------')
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write('Episode: {:6} |  '.format(i_episode) +
                             'steps: {:8} |  '.format(self.steps) +
                             'avg on last {:4} games ({:6.1f}/{:6.1f}) |  '.format(log_avg_episodes, avg_score_clamped,avg_score) +
                             'best score: ({:4}/{:4})\n'.format(best_score_clamped, best_score))
                # Dump loss & reward
                with open(loss_file, 'wb') as fp:
                    pickle.dump(loss_history, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) + '...\n')
                self.target_net.save(sub_dir + self.game + '-' + str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + self.game + '.model')
Example #17
class Agent(object):
    def __init__(
            self,
            game,
            mem_size=512 * 512,  #1024*512,
            state_buffer_size=4,
            batch_size=64,
            learning_rate=1e-5,
            pretrained_model=None,
            frameskip=4,  #1
            record=False):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders)
        #if self.game == 'Breakout-v0':
        #    dimensions = (32, 195, 8, 152)
        #elif self.game == 'SpaceInvaders-v0':
        #    dimensions = (21, 195, 20, 141)
        #elif self.game == 'Assault-v0':
        #    dimensions = (50, 240, 5, 155)
        #elif self.game == 'Phoenix-v0':
        #    dimensions = (23, 183, 0, 160)
        #elif self.game == 'Skiing-v0':
        #    dimensions = (55, 202, 8, 152)
        #elif self.game == 'Enduro-v0':
        #    dimensions = (50, 154, 8, 160)
        #elif self.game == 'BeamRider-v0':
        #    dimensions = (32, 180, 9, 159)

        if self.game == 'BreakoutAndSpace':
            dimensions_break = (32, 195, 8, 152)
            dimensions_space = (21, 195, 20, 141)
        else:
            print(
                'Error! This version is for playing BreakOut and SpaceInvaders at the same time.'
            )

        # Environment
        self.env_break = Environment('BreakoutNoFrameskip-v4',
                                     dimensions_break,
                                     frameskip=frameskip)
        self.env_space = Environment('SpaceInvaders-v0',
                                     dimensions_space,
                                     frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in=state_buffer_size,
                       num_actions=self.env_space.get_number_of_actions())

        self.target_net = DQN(
            channels_in=state_buffer_size,
            num_actions=self.env_space.get_number_of_actions())

        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr = 0.00025,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 4
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size,
                                   num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 25000
        else:
            self.start_train_after = mem_size // 2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500

    def select_action(self, observation, mode='train'):
        """
        Select a random action from the action space or an action proposed
        by the neural network, depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (
                EPSILON_START - EPSILON_END) / EPSILON_DECAY
        else:
            epsilon = EPSILON_END

        if epsilon > random() or mode == 'play':
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1, 1)

            # Prevent noops
            if action[0, 0] == 0:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0, 0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:

            # Random action
            action = self.env_space.sample_action()
            action = LongTensor([[action]])

        return action

    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float
        - q_value: float
        - exp_q_value: float
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition

        batch = self.replay.sampleTransition(self.batch_size)
        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(
            list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(
            torch.cat([ns for ns in batch.next_state if ns is not None]),
            volatile=True
        )  # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(
            self.batch_size).type(FloatTensor),
                                     volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(
            1)[0]
        next_state_values[non_final_mask] = next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        self.optimizer.zero_grad()

        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates % self.update_target_net_each_k_steps == 0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]

    def play(self):
        """
        Play a game with the current net and render it
        """
        done = False  # games end indicator variable
        score = 0
        # Reset game
        screen_break = self.env_break.reset()
        screen_space = self.env_space.reset()

        # list of k last frames
        ############old version:
        #breakout part
        #last_k_frames_break = []
        #for j in range(self.num_stored_frames):
        #    last_k_frames_break.append(None)
        #    last_k_frames_break[j] = gray2pytorch(screen_break)
        #spaceinvaders part
        #last_k_frames_space = []
        #for j in range(self.num_stored_frames):
        #    last_k_frames_space.append(None)
        #    last_k_frames_space[j] = gray2pytorch(screen_space)
        #################
        last_k_frames = []
        for j in range(self.num_stored_frames):
            last_k_frames.append(None)
            last_k_frames[j] = torch.cat(
                (gray2pytorch(screen_break), gray2pytorch(screen_space)),
                dim=2)

        # frame is saved as ByteTensor -> convert to gray value between 0 and 1
        ############old version:
        #state_break = torch.cat(last_k_frames_break, 1).type(FloatTensor) / 255.0
        #state_space = torch.cat(last_k_frames_space, 1).type(FloatTensor) / 255.0
        #state = torch.cat((state_break,state_space), 2)
        state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0

        while not done:
            action = self.select_action(state, mode='play')

            # Render game
            self.env_break.game.render(mode='human')
            self.env_space.game.render(mode='human')

            # maps actions from space invaders to breakout (shot-left to left, shot-right to right)
            if action[0, 0] == 4:
                action_break = 2
            elif action[0, 0] == 5:
                action_break = 3
            else:
                action_break = action[0, 0]

            screen_break, _, reward_break, done_break, info_break = self.env_break.step(
                action_break, mode='play')
            screen_space, _, reward_space, done_space, info_space = self.env_space.step(
                action[0, 0], mode='play')
            score += reward_break
            score += reward_space
            done = done_break or done_space

            ############old
            #   save latest frame, discard oldest
            #for j in range(self.num_stored_frames - 1):
            #    last_k_frames_break[j] = last_k_frames_break[j + 1]
            #    last_k_frames_space[j] = last_k_frames_space[j + 1]
            #last_k_frames_break[self.num_stored_frames - 1] = gray2pytorch(screen_break)
            #last_k_frames_space[self.num_stored_frames - 1] = gray2pytorch(screen_space)

            # convert frames to range 0 to 1 again
            #state_break = torch.cat(last_k_frames_break, 1).type(FloatTensor) / 255.0
            #state_space = torch.cat(last_k_frames_space, 1).type(FloatTensor) / 255.0
            #state = torch.cat((state_break, state_space), 2)
            #############old_end

            #   save latest frame, discard oldest
            for j in range(self.num_stored_frames - 1):
                last_k_frames[j] = last_k_frames[j + 1]
            last_k_frames[self.num_stored_frames - 1] = torch.cat(
                (gray2pytorch(screen_break), gray2pytorch(screen_space)),
                dim=2)

            # convert frames to range 0 to 1 again
            state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0
            done = done_break or done_space

        print('Final score:', score)
        self.env_break.game.close()
        self.env_space.game.close()

    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game + '_' + datetime.now().strftime(
            '%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + self.game + '_train.log'
        loss_file = sub_dir + 'loss.pickle'
        reward_file = sub_dir + 'reward.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        log_avg_episodes = 50

        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        loss_history = []
        reward_history = []
        reward_clamped_history = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(
                datetime.now().strftime('%Y%m%d_%H%M%S') + '\n' +
                'Trained game:    ' + str(self.game) + '\n' +
                'Learning rate:    ' + str(self.learning_rate) + '\n' +
                'Batch size:    ' + str(self.batch_size) + '\n' +
                'Pretrained:    ' + str(self.pretrained_model) + '\n' +
                'Started training after k frames:    ' +
                str(self.start_train_after) + '\n' +
                'Optimized after k frames:    ' + str(self.optimize_each_k) +
                '\n' + 'Target net update after k frame:    ' +
                str(self.update_target_net_each_k_steps) + '\n\n' +
                '--------------------------------------------------------------------------------\n'
            )

        print('Started training...\nLogging to', sub_dir)

        for i_episode in range(1, num_episodes):

            # reset game at the start of each episode
            screen_break = self.env_break.reset()
            screen_space = self.env_space.reset()

            # list of k last frames
            last_k_frames_break = []
            last_k_frames_space = []
            for j in range(self.num_stored_frames):
                last_k_frames_break.append(None)
                last_k_frames_space.append(None)
                last_k_frames_break[j] = gray2pytorch(screen_break)
                last_k_frames_space[j] = gray2pytorch(screen_space)

            if i_episode == 1:
                frames_both = torch.cat((last_k_frames_break[0].cpu(),
                                         last_k_frames_space[0].cpu()), 2)
                #self.replay.pushFrame(last_k_frames_break[0].cpu())
                #self.replay.pushFrame(last_k_frames_space[0].cpu())
                self.replay.pushFrame(frames_both)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state_break = torch.cat(last_k_frames_break,
                                    1).type(FloatTensor) / 255.0
            state_space = torch.cat(last_k_frames_space,
                                    1).type(FloatTensor) / 255.0
            state = torch.cat((state_break, state_space), 2)

            done = False  # games end indicator variable
            # reset score with initial lives, because every lost life adds -1
            total_reward = self.env_break.get_lives()
            total_reward += self.env_space.get_lives()
            total_reward_clamped = self.env_break.get_lives()
            total_reward_clamped += self.env_space.get_lives()
            ###########

            # Loop over one game
            while not done:
                self.steps += 1
                action = self.select_action(state)
                # perform selected action on game
                # screen, reward, done, info = self.env.step(action[0,0])#envTest.step(action[0,0])
                # maps actions from SpaceInvaders to Breakout (fire+left -> left, fire+right -> right)

                screen_space, _, reward_space, done_space, info_space = self.env_space.step(
                    action[0, 0])

                action_break = action[0, 0]
                if action_break > 3:  #shoot+right/left --> right/left
                    action_break = action_break - 2
                screen_break, _, reward_break, done_break, info_break = self.env_break.step(
                    action_break)

                total_reward += int(reward_break)
                total_reward += int(reward_space)
                done = done_break or done_space

                #   clamp rewards
                reward_break = torch.Tensor([np.clip(reward_break, -1, 1)])
                reward_space = torch.Tensor([np.clip(reward_space, -1, 1)])
                reward = reward_break + reward_space
                total_reward_clamped += int(reward_break[0])
                total_reward_clamped += int(reward_space[0])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames - 1):
                    last_k_frames_break[j] = last_k_frames_break[j + 1]
                    last_k_frames_space[j] = last_k_frames_space[j + 1]
                last_k_frames_break[self.num_stored_frames -
                                    1] = gray2pytorch(screen_break)
                last_k_frames_space[self.num_stored_frames -
                                    1] = gray2pytorch(screen_space)

                # convert frames to range 0 to 1 again
                if not done:
                    next_state_break = torch.cat(last_k_frames_break,
                                                 1).type(FloatTensor) / 255.0
                    next_state_space = torch.cat(last_k_frames_space,
                                                 1).type(FloatTensor) / 255.0
                    next_state = torch.cat(
                        (next_state_break, next_state_space), 2)
                else:
                    next_state = None

                # Store transition: frames of both games are stored concatenated,
                # the transition entry itself is stored only once
                frame_break = last_k_frames_break[self.num_stored_frames -
                                                  1].cpu()
                frame_space = last_k_frames_space[self.num_stored_frames -
                                                  1].cpu()
                frame_both = torch.cat((frame_break, frame_space), 2)
                self.replay.pushFrame(frame_both)
                self.replay.pushTransition(
                    (self.replay.getCurrentIndex() - 1) % self.replay.capacity,
                    action, reward, done)

                # only optimize every k-th step
                if self.steps % self.optimize_each_k == 0:
                    loss = self.optimize(net_updates)

                    # Logging
                    loss_history.append(loss)
                    #q_history.append(q_value)
                    #exp_q_history.append(exp_q_value)

                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done:
                    break

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

            print(
                'Episode: {:6} |  '.format(i_episode),
                'steps {:8} |  '.format(self.steps),
                'loss: {:.2E} |  '.format(loss if loss else 0),
                'score: ({:4}/{:4}) |  '.format(total_reward_clamped,
                                                total_reward),
                'best score: ({:4}/{:4}) |  '.format(best_score_clamped,
                                                     best_score),
                'replay size: {:7}'.format(len(self.replay)))

            avg_score_clamped += total_reward_clamped
            avg_score += total_reward
            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            if i_episode % log_avg_episodes == 0 and i_episode != 0:
                avg_score_clamped /= log_avg_episodes
                avg_score /= log_avg_episodes

                print(
                    '----------------------------------------------------------------'
                    '-----------------------------------------------------------------',
                    '\nLogging to file: \nEpisode: {:6}   '.format(i_episode),
                    'steps: {:8}   '.format(self.steps),
                    'avg on last {:4} games ({:6.1f}/{:6.1f})   '.format(
                        log_avg_episodes, avg_score_clamped, avg_score),
                    'best score: ({:4}/{:4})'.format(best_score_clamped,
                                                     best_score),
                    '\n---------------------------------------------------------------'
                    '------------------------------------------------------------------'
                )
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write(
                        'Episode: {:6} |  '.format(i_episode) +
                        'steps: {:8} |  '.format(self.steps) +
                        'avg on last {:4} games ({:6.1f}/{:6.1f}) |  '.format(
                            log_avg_episodes, avg_score_clamped, avg_score) +
                        'best score: ({:4}/{:4})\n'.format(
                            best_score_clamped, best_score))
                # Dump loss & reward
                with open(loss_file, 'wb') as fp:
                    pickle.dump(loss_history, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) +
                             '...\n')
                self.target_net.save(sub_dir + self.game + '-' +
                                     str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + self.game + '.model')
Example #18
0
    def __init__(self,
                 game1,
                 game2,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 pretrained_subnet1 = False,
                 pretrained_subnet2 = False,
                 frameskip = 4,
                 frozen = False
                 ):
        """
        Inputs:
        - game1: str, name of the first game
        - game2: str, name of the second game
        - mem_size: int, capacity of the replay memory
        - state_buffer_size: int, number of recent frames used as input for the neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str, path to the pretrained model
        - pretrained_subnet1: str, path to the model of subnet 1
        - pretrained_subnet2: str, path to the model of subnet 2
        - frameskip: int, number of frames skipped per action
        - frozen: bool, freeze the pretrained subnets
        """

        # Namestring
        self.game1 = game1
        self.game2 = game2

        # Environment
        self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip)
        self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip)


        # Neural net
        self.pretrained_subnet1 = pretrained_subnet1
        self.pretrained_subnet2 = pretrained_subnet2
        self.net = TwinDQN(channels_in = state_buffer_size,
                             num_actions = self.env2.get_number_of_actions(),
                             pretrained_subnet1 = pretrained_subnet1,
                             pretrained_subnet2 = pretrained_subnet2,
                             frozen = frozen)
        self.target_net = TwinDQN(channels_in = state_buffer_size,
                                    num_actions = self.env2.get_number_of_actions(),
                                    pretrained_subnet1 = pretrained_subnet1,
                                    pretrained_subnet2 = pretrained_subnet2,
                                    frozen = frozen)

        # Cuda
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        # Pretrained
        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()),
                                    lr=learning_rate)
        #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()),
        #                               lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500
Example #19
0
class DoubleAgent(object):
    def __init__(self,
                 game1,
                 game2,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 pretrained_subnet1 = False,
                 pretrained_subnet2 = False,
                 frameskip = 4,
                 frozen = False
                 ):
        """
        Inputs:
        - game1: str, name of the first game
        - game2: str, name of the second game
        - mem_size: int, capacity of the replay memory
        - state_buffer_size: int, number of recent frames used as input for the neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str, path to the pretrained model
        - pretrained_subnet1: str, path to the model of subnet 1
        - pretrained_subnet2: str, path to the model of subnet 2
        - frameskip: int, number of frames skipped per action
        - frozen: bool, freeze the pretrained subnets
        """

        # Namestring
        self.game1 = game1
        self.game2 = game2

        # Environment
        self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip)
        self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip)


        # Neural net
        self.pretrained_subnet1 = pretrained_subnet1
        self.pretrained_subnet2 = pretrained_subnet2
        self.net = TwinDQN(channels_in = state_buffer_size,
                             num_actions = self.env2.get_number_of_actions(),
                             pretrained_subnet1 = pretrained_subnet1,
                             pretrained_subnet2 = pretrained_subnet2,
                             frozen = frozen)
        self.target_net = TwinDQN(channels_in = state_buffer_size,
                                    num_actions = self.env2.get_number_of_actions(),
                                    pretrained_subnet1 = pretrained_subnet1,
                                    pretrained_subnet2 = pretrained_subnet2,
                                    frozen = frozen)

        # Cuda
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        # Pretrained
        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()),
                                    lr=learning_rate)
        #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()),
        #                               lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500


    def select_action(self, observation, mode='train'):
        """
        Select a random action from the action space or the action proposed
        by the neural network, depending on epsilon

        Inputs:
        - observation: np.array with the observation

        Returns:
        - action: LongTensor of shape (1, 1)
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        EPSILON_PLAY = 0.01
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY
        elif mode=='play':
            epsilon = EPSILON_PLAY
        else:
            epsilon = EPSILON_END

        if epsilon < random():
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1,1)

            # Prevent noops
            if action[0,0]!=1:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0,0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:
            # Random action
            action = self.env2.sample_action()
            action = LongTensor([[action]])

        return action
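
    # --- Illustrative aside, not part of the original example: the linear
    # epsilon schedule used in select_action above, factored into a helper.
    # The name _epsilon_by_step and the clamp at eps_end are additions; the
    # original code keeps decaying past EPSILON_DECAY steps without clamping.
    @staticmethod
    def _epsilon_by_step(step, eps_start=1.0, eps_end=0.1, eps_decay=1000000):
        # e.g. step=0 -> 1.0, step=500000 -> 0.55, step>=1000000 -> 0.1
        return max(eps_start - step * (eps_start - eps_end) / eps_decay, eps_end)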


    def map_action(self, action):
        """
        Maps an action from the game with more actions
        onto the game with fewer actions

        Inputs:
        - action: int
        Returns:
        - action: int
        """
        # Map SpaceInvaders on Breakout
        if self.game1=='Breakout' and self.game2=='SpaceInvaders':
            if action>3: # shoot+right/left --> right/left
                return action-2

        # Map Assault on SpaceInvaders
        if self.game1=='SpaceInvaders' and self.game2=='Assault':
            if action!=0: # all actions except 2nd idle
                return action-1

        # Map Phoenix on SpaceInvaders
        if self.game1=='SpaceInvaders' and self.game2=='Phoenix':
            if action==4: # shield --> idle
                return 0
            if action==7: # shield+shot --> shot
                return 1
            if action>4: # shoot+right/left --> shoot+right/left
                return action-1

        # Map Phoenix on Assault
        if self.game1=='Assault' and self.game2=='Phoenix':
            if action==4: # shield --> idle
                return 0
            if action==7: # shield+shot --> shot
                return 2
            if 1<= action and action<=3: # shot/right/left --> shot/right/left
                return action+1

        # No mapping necessary
        return action
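
    # --- Illustrative aside, not part of the original example: the
    # SpaceInvaders -> Breakout branch of map_action as a pure helper, so the
    # index shift is easy to check. The helper name is an addition.
    @staticmethod
    def _map_space_invaders_to_breakout(action):
        # 4 (right+fire) -> 2 (right), 5 (left+fire) -> 3 (left), others unchanged
        return action - 2 if action > 3 else action
    # _map_space_invaders_to_breakout(4) == 2, (5) == 3, (1) == 1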


    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition
        batch = self.replay.sampleTransition(self.batch_size)

        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]),
                                              volatile=True) # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(1)[0]
        next_state_values[non_final_mask] = next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()


        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates%self.update_target_net_each_k_steps==0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]
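
    # --- Worked example, added for clarity (not from the original source), of
    # the TD target built in optimize():
    #   y = r + GAMMA * max_a' Q_target(s', a')   for non-final next states
    #   y = r                                     for final next states
    # With GAMMA = 0.99, r = 1.0 and max_a' Q_target(s', a') = 1.5 the target is
    # 1.0 + 0.99 * 1.5 = 2.485; for a terminal transition it is just 1.0.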


    def play(self, n):
        """
        Play a game with the current net and render it

        Inputs:
        - n: games to play
        """
        for i in range(n):
            done = False # games end indicator variable

            # Score counter
            total_reward_game1 = 0
            total_reward_game2 = 0
            total_reward = 0

            # Reset game
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            while not done:
                action = self.select_action(state, mode='play')[0,0]
                action1 = self.map_action(action)
                action2 = action

                # perform selected action on game
                screen1, reward1, _, done1, _ = self.env1.step(action1, mode='play')
                screen2, reward2, _, done2, _ = self.env2.step(action2, mode='play')

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)

                # save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

                # convert frames to range 0 to 1 again
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

                # Merged game over indicator
                done = done1 or done2
            print('Final scores Game ({}/{}): {}: {}    '.format(i+1, n, self.game1, total_reward_game1) +
                  '{}: {}    '.format(self.game2, total_reward_game2) +
                  'total: {}'.format(total_reward))
        self.env1.game.close()
        self.env2.game.close()


    def play_stats(self, n_games, mode='random'):
        """
        Play N games randomly or evaluate a net and log results for statistics

        Input:
        - n_games: int Number of games to play
        - mode: str 'random' or 'evaluation'
        """
        # Subdirectory for logging
        sub_dir = mode + '_' + self.game1 + '+' + self.game2 + '/'
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # Store history total
        reward_history = []
        reward_clamped_history = []
        # Store history game 1
        reward_history_game1 = []
        reward_clamped_history_game1 = []
        # Store history game 2
        reward_history_game2 = []
        reward_clamped_history_game2 = []

        # Number of actions to sample from
        n_actions = self.env2.get_number_of_actions()

        for i_episode in range(1, n_games+1):
            # Reset game
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # Store screen
            if mode=='evaluation':
                # list of k last frames
                last_k_frames = []
                for j in range(self.num_stored_frames):
                    last_k_frames.append(None)
                    last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)
                # frame is saved as ByteTensor -> convert to gray value between 0 and 1
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done = False

            # reset score with initial lives, because every lost life adds -1
            total_reward_game1 = 0
            total_reward_clamped_game1 = self.env1.get_lives()
            total_reward_game2 = 0
            total_reward_clamped_game2 = self.env2.get_lives()
            # total scores for both games
            total_reward = total_reward_game1 + total_reward_game2
            total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2

            while not done:
                if mode=='random':
                    action = randrange(n_actions)
                elif mode=='evaluation':
                    action = self.select_action(state, mode='play')[0,0]
                action1 = self.map_action(action)
                action2 = action

                screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1)
                screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2)

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)
                total_reward_clamped_game1 += reward1_clamped
                total_reward_clamped_game2 += reward2_clamped
                total_reward_clamped += reward1_clamped + reward2_clamped

                if mode=='evaluation':
                    # save latest frame, discard oldest
                    for j in range(self.num_stored_frames-1):
                        last_k_frames[j] = last_k_frames[j+1]
                    last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

                    # convert frames to range 0 to 1 again
                    state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

                # Merged game over indicator
                done = done1 or done2

            # Print current result
            print('Episode: {:6}/{:6} |   '.format(i_episode, n_games) +
                  'score total: ({:6.1f}/{:7.1f}) |   '.format(total_reward_clamped,total_reward) +
                  'score game1: ({:6.1f}/{:7.1f}) |   '.format(total_reward_clamped_game1,total_reward_game1) +
                  'score game2: ({:6.1f}/{:7.1f})'.format(total_reward_clamped_game2,total_reward_game2))

            # Save rewards
            reward_history_game1.append(total_reward_game1)
            reward_history_game2.append(total_reward_game2)
            reward_history.append(total_reward)
            reward_clamped_history_game1.append(total_reward_clamped_game1)
            reward_clamped_history_game2.append(total_reward_clamped_game2)
            reward_clamped_history.append(total_reward_clamped)

        avg_reward_total = np.sum(reward_history) / len(reward_history)
        avg_reward_total_clamped = np.sum(reward_clamped_history) / len(reward_clamped_history)
        avg_reward_game1 = np.sum(reward_history_game1) / len(reward_history_game1)
        avg_reward_game1_clamped = np.sum(reward_clamped_history_game1) / len(reward_clamped_history_game1)
        avg_reward_game2 = np.sum(reward_history_game2) / len(reward_history_game2)
        avg_reward_game2_clamped = np.sum(reward_clamped_history_game2) / len(reward_clamped_history_game2)

        # Print final result
        print('\n\n===========================================\n' +
              'avg score after {:6} episodes:\n'.format(n_games) +
              'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) +
              'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) +
              'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2))

        # Log results to files
        with open(sub_dir + mode + '.txt', 'w') as fp:
            fp.write('avg score after {:6} episodes:\n'.format(n_games) +
                     'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) +
                     'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) +
                     'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2))

        # Dump reward
        with open(sub_dir + mode + '_reward_game1.pickle', 'wb') as fp:
            pickle.dump(reward_history_game1, fp)
        with open(sub_dir + mode + '_reward_game2.pickle', 'wb') as fp:
            pickle.dump(reward_history_game2, fp)
        with open(sub_dir + mode + '_reward_total.pickle', 'wb') as fp:
            pickle.dump(reward_history, fp)

        with open(sub_dir + mode + '_reward_clamped_game1.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history_game1, fp)
        with open(sub_dir + mode + '_reward_clamped_game2.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history_game2, fp)
        with open(sub_dir + mode + '_reward_clamped_total.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history, fp)


    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game1 + '+' + self.game2 + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + 'train.txt'
        reward_file = sub_dir + 'reward.pickle'
        reward_file_game1 = sub_dir + 'reward_game1.pickle'
        reward_file_game2 = sub_dir + 'reward_game2.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        reward_clamped_file_game1 = sub_dir + 'reward_clamped_game1.pickle'
        reward_clamped_file_game2 = sub_dir + 'reward_clamped_game2.pickle'
        log_avg_episodes = 50

        # Total scores
        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        reward_history = []
        reward_clamped_history = []
        # Scores game 1
        avg_score_game1 = 0
        avg_score_clamped_game1 = 0
        reward_history_game1 = []
        reward_clamped_history_game1 = []
        # Scores game 2
        avg_score_game2 = 0
        avg_score_clamped_game2 = 0
        reward_history_game2 = []
        reward_clamped_history_game2 = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' +
                     'Trained game (first):               {}\n'.format(self.game1) +
                     'Trained game (second):              {}\n'.format(self.game2) +
                     'Learning rate:                      {:.2E}\n'.format(self.learning_rate) +
                     'Batch size:                         {:d}\n'.format(self.batch_size) +
                     'Memory size(replay):                {:d}\n'.format(self.mem_size) +
                     'Pretrained:                         {}\n'.format(self.pretrained_model) +
                     'Pretrained subnet 1:                {}\n'.format(self.pretrained_subnet1) +
                     'Pretrained subnet 2:                {}\n'.format(self.pretrained_subnet2) +
                     'Started training after k frames:    {:d}\n'.format(self.start_train_after) +
                     'Optimized after k frames:           {:d}\n'.format(self.optimize_each_k) +
                     'Target net update after k frame:    {:d}\n\n'.format(self.update_target_net_each_k_steps) +
                     '--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n' +
                     'Episode | Steps     | ' +
                     '{:3} games avg total  | '.format(log_avg_episodes) +
                     '{:3} games avg game1  | '.format(log_avg_episodes) +
                     '{:3} games avg game2  | '.format(log_avg_episodes) +
                     'best score total \n' +
                     '--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n')

        print('Started training...\nLogging to {}\n'.format(sub_dir) +
              'Episode | Steps     |   score total        |   score game 1       |   ' +
              'score game 2       | best score total')

        for i_episode in range(1,num_episodes):
            # reset game at the start of each episode
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = torch.cat((gray2pytorch(screen1),
                                              gray2pytorch(screen2)), dim=1)

            if i_episode == 1:
                self.replay.pushFrame(last_k_frames[0].cpu())

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done1 = False
            done2 = False

            # reset score with initial lives, because every lost life adds -1
            total_reward_game1 = 0
            total_reward_clamped_game1 = self.env1.get_lives()
            total_reward_game2 = 0
            total_reward_clamped_game2 = self.env2.get_lives()
            # total scores for both games
            total_reward = total_reward_game1 + total_reward_game2
            total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2

            # Loop over one game
            while not done1 and not done2:
                self.steps +=1

                action = self.select_action(state)
                action1 = self.map_action(action[0,0])
                action2 = action[0,0]

                # perform selected action on game
                screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1)
                screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2)

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)
                total_reward_clamped_game1 += reward1_clamped
                total_reward_clamped_game2 += reward2_clamped
                total_reward_clamped += reward1_clamped + reward2_clamped

                # Bake reward into tensor
                reward = torch.FloatTensor([reward1_clamped+reward2_clamped])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),
                                                                     gray2pytorch(screen2)), dim=1)

                # convert frames to range 0 to 1 again
                if not done1 and not done2:
                    next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                else:
                    next_state = None

                # Store transition
                self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu())
                self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity,
                                            action, reward, done1 or done2)

                # only optimize every k-th step
                if self.steps%self.optimize_each_k == 0:
                    self.optimize(net_updates)
                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done1 or done2:
                    break

            # Save rewards
            reward_history_game1.append(total_reward_game1)
            reward_history_game2.append(total_reward_game2)
            reward_history.append(total_reward)
            reward_clamped_history_game1.append(total_reward_clamped_game1)
            reward_clamped_history_game2.append(total_reward_clamped_game2)
            reward_clamped_history.append(total_reward_clamped)

            # Sum up for averages
            avg_score_clamped_game1 += total_reward_clamped_game1
            avg_score_clamped_game2 += total_reward_clamped_game2
            avg_score_clamped += total_reward_clamped
            avg_score_game1 += total_reward_game1
            avg_score_game2 += total_reward_game2
            avg_score += total_reward

            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            print('{:7} | '.format(i_episode) +
                  '{:9} |     '.format(self.steps) +
                  '({:6.1f}/{:7.1f}) |     '.format(total_reward_clamped,total_reward) +
                  '({:6.1f}/{:7.1f}) |     '.format(total_reward_clamped_game1,total_reward_game1) +
                  '({:6.1f}/{:7.1f}) |  '.format(total_reward_clamped_game2,total_reward_game2) +
                  '({:6.1f}/{:8.1f})'.format(best_score_clamped, best_score))

            if i_episode % log_avg_episodes == 0 and i_episode!=0:
                avg_score_clamped_game1 /= log_avg_episodes
                avg_score_clamped_game2 /= log_avg_episodes
                avg_score_clamped /= log_avg_episodes
                avg_score_game1 /= log_avg_episodes
                avg_score_game2 /= log_avg_episodes
                avg_score /= log_avg_episodes

                print('--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n' +
                      'Episode | Steps     | ' +
                      '{:3} games avg total  | '.format(log_avg_episodes) +
                      '{:3} games avg game1  | '.format(log_avg_episodes) +
                      '{:3} games avg game2  | '.format(log_avg_episodes) +
                      'best score total \n' +
                      '{:7} | '.format(i_episode) +
                      '{:9} |     '.format(self.steps) +
                      '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped,avg_score) +
                      '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped_game1,avg_score_game1) +
                      '({:6.1f}/{:7.1f}) |  '.format(avg_score_clamped_game2,avg_score_game2) +
                      '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score) +
                      '\nLogging to file...\n\n'
                      '--------+-----------+----------------------+------------' +
                      '----------+----------------------+--------------------\n' +
                      'Episode | Steps     |   score total        |   score game 1       |   ' +
                      'score game 2       | best score total')
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write('{:7} | '.format(i_episode) +
                             '{:9} |     '.format(self.steps) +
                             '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped,avg_score) +
                             '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped_game1,avg_score_game1) +
                             '({:6.1f}/{:7.1f}) |  '.format(avg_score_clamped_game2,avg_score_game2) +
                             '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score))
                # Dump reward
                with open(reward_file_game1, 'wb') as fp:
                    pickle.dump(reward_history_game1, fp)
                with open(reward_file_game2, 'wb') as fp:
                    pickle.dump(reward_history_game2, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)

                with open(reward_clamped_file_game1, 'wb') as fp:
                    pickle.dump(reward_clamped_history_game1, fp)
                with open(reward_clamped_file_game2, 'wb') as fp:
                    pickle.dump(reward_clamped_history_game2, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped_game1 = 0
                avg_score_clamped_game2 = 0
                avg_score_clamped = 0
                avg_score_game1 = 0
                avg_score_game2 = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) + '...\n')
                self.target_net.save(sub_dir + str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + 'final.model')
Example #20
0
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 mem_size=10**6,
                 batch_size=256,
                 n_hid1=256,
                 n_hid2=256,
                 lr=3e-4,
                 gamma=0.99,
                 tau=5e-3,
                 reward_scale=2):

        self.load_checkpoint = load_checkpoint

        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.reward_scale = reward_scale

        self.memory_counter = 0
        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  self.max_action,
                                  lr,
                                  checkpoint_file,
                                  name='_actor')
        self.critic_1 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_crtic1')
        self.critic_2 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_crtic2')

        self.value_net = ValueNetwork(n_states,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_value')
        self.target_value_net = ValueNetwork(n_states,
                                             n_hid1,
                                             n_hid2,
                                             lr,
                                             checkpoint_file,
                                             name='_value_target')

        # tau=1 performs an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.value_net,
                                       self.target_value_net,
                                       tau=1)
Example #21
0
class Agent:
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 mem_size=10**6,
                 batch_size=256,
                 n_hid1=256,
                 n_hid2=256,
                 lr=3e-4,
                 gamma=0.99,
                 tau=5e-3,
                 reward_scale=2):

        self.load_checkpoint = load_checkpoint

        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.reward_scale = reward_scale

        self.memory_counter = 0
        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  self.max_action,
                                  lr,
                                  checkpoint_file,
                                  name='_actor')
        self.critic_1 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_crtic1')
        self.critic_2 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_crtic2')

        self.value_net = ValueNetwork(n_states,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_value')
        self.target_value_net = ValueNetwork(n_states,
                                             n_hid1,
                                             n_hid2,
                                             lr,
                                             checkpoint_file,
                                             name='_value_target')

        # tau=1 performs an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.value_net,
                                       self.target_value_net,
                                       tau=1)
        # self.update_network_parameters_phil(tau=1)

    def store_transition(self, obs, action, reward, obs_, done):
        self.memory.store_transition(obs, action, reward, obs_, done)

    def sample_transitions(self):
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer(
            self.batch_size)
        # the device is the same for all networks, so self.actor.device works for every tensor here
        state_batch = torch.tensor(state_batch,
                                   dtype=torch.float).to(self.actor.device)
        action_batch = torch.tensor(action_batch,
                                    dtype=torch.float).to(self.actor.device)
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float).to(self.actor.device)
        new_state_batch = torch.tensor(new_state_batch,
                                       dtype=torch.float).to(self.actor.device)
        done_batch = torch.tensor(done_batch).to(self.actor.device)
        return state_batch, action_batch, reward_batch, new_state_batch, done_batch

    def update_network_parameters(self, network, target_network, tau=None):
        if tau is None:
            tau = self.tau
        for par, target_par in zip(network.parameters(),
                                   target_network.parameters()):
            target_par.data.copy_(tau * par.data + (1 - tau) * target_par.data)
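
    # --- Worked example, added for clarity: with tau = 5e-3 each target
    # parameter moves 0.5% of the way toward the online parameter per call,
    # e.g. target 1.0 and online 2.0 give 0.005 * 2.0 + 0.995 * 1.0 = 1.005.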

    def choose_action(self, obs):
        obs = torch.tensor([obs], dtype=torch.float).to(self.actor.device)
        actions, _ = self.actor.sample_normal(obs, reparametrize=False)
        return actions.cpu().detach().numpy()[0]

    def learn_phil(self):
        if self.memory.mem_counter < self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        reward = torch.tensor(reward,
                              dtype=torch.float).to(self.critic_1.device)
        done = torch.tensor(done).to(self.critic_1.device)
        state_ = torch.tensor(new_state,
                              dtype=torch.float).to(self.critic_1.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic_1.device)
        action = torch.tensor(action,
                              dtype=torch.float).to(self.critic_1.device)

        value = self.value_net(state).view(-1)
        value_ = self.target_value_net(state_).view(-1)
        value_[done] = 0.0

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparametrize=False)
        # actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value_net.optimizer.zero_grad()
        value_target = critic_value - log_probs
        value_loss = 0.5 * (F.mse_loss(value, value_target))
        value_loss.backward(retain_graph=True)
        self.value_net.optimizer.step()

        actions, log_probs = self.actor.sample_normal(state,
                                                      reparametrize=True)
        # actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False)
        log_probs = log_probs.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = torch.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_probs - critic_value
        actor_loss = torch.mean(actor_loss)
        self.actor.optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q_hat = self.reward_scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()
        self.update_network_parameters(self.value_net, self.target_value_net,
                                       self.tau)
        # self.update_network_parameters_phil()

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions(
        )
        # state_batch, action_batch, reward_batch, new_state_batch, done_batch = \
        #     self.memory.sample_buffer(self.batch_size)
        #
        # reward_batch = torch.tensor(reward_batch, dtype=torch.float).to(self.critic_1.device)
        # done_batch = torch.tensor(done_batch).to(self.critic_1.device)
        # new_state_batch = torch.tensor(new_state_batch, dtype=torch.float).to(self.critic_1.device)
        # state_batch = torch.tensor(state_batch, dtype=torch.float).to(self.critic_1.device)
        # action_batch = torch.tensor(action_batch, dtype=torch.float).to(self.critic_1.device)
        '''Compute Value Network loss'''
        self.value_net.optimizer.zero_grad()
        val = self.value_net(state_batch).view(-1)
        val_ = self.target_value_net(new_state_batch).view(-1)
        val_[done_batch] = 0.0

        actions, log_probs = self.actor.sample_normal(state_batch,
                                                      reparametrize=False)
        log_probs = log_probs.view(-1)
        q1 = self.critic_1(state_batch, actions)
        q2 = self.critic_2(state_batch, actions)
        q = torch.min(q1, q2).view(-1)
        value_target = q - log_probs
        value_loss = 0.5 * F.mse_loss(val, value_target)

        value_loss.backward(retain_graph=True)
        self.value_net.optimizer.step()
        # val = val - q + log_prob
        '''Compute Actor loss'''
        self.actor.optimizer.zero_grad()
        # here we need to reparametrize and thus use rsample to make the distribution differentiable
        # because the log prob of the chosen action will be part of our loss.
        actions, log_probs = self.actor.sample_normal(state_batch,
                                                      reparametrize=True)
        log_probs = log_probs.view(-1)
        q1 = self.critic_1(state_batch, actions)
        q2 = self.critic_2(state_batch, actions)
        q = torch.min(q1, q2).view(-1)
        actor_loss = log_probs - q
        actor_loss = torch.mean(actor_loss)

        actor_loss.backward(retain_graph=True)
        self.actor.optimizer.step()
        '''Compute Critic loss'''
        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        val_ = self.target_value_net(new_state_batch).view(
            -1)  # value for the critic update
        val_[done_batch] = 0.0
        q_hat = self.reward_scale * reward_batch + self.gamma * val_
        q1_old_policy = self.critic_1(state_batch, action_batch).view(-1)
        q2_old_policy = self.critic_2(state_batch, action_batch).view(-1)
        critic_1_loss = 0.5 * F.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * F.mse_loss(q2_old_policy, q_hat)

        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        self.update_network_parameters(self.value_net, self.target_value_net,
                                       self.tau)
        # self.update_network_parameters_phil()
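
    # --- Summary, added for clarity (not from the original source), of the
    # three losses computed in learn() above:
    #   value loss:  1/2 * (V(s) - (min_i Q_i(s, a~pi) - log pi(a|s)))^2
    #   actor loss:  log pi(a|s) - min_i Q_i(s, a~pi), with a reparameterized
    #   critic loss: 1/2 * (Q_i(s, a) - (reward_scale * r + gamma * V_target(s')))^2
    # where V_target(s') is set to 0 for terminal next states.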

    def save_models(self):
        self.actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.value_net.save_checkpoint()
        self.target_value_net.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.value_net.load_checkpoint()
        self.target_value_net.load_checkpoint()

    def update_network_parameters_phil(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value_net.named_parameters()
        value_params = self.value_net.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau*value_state_dict[name].clone() + \
                    (1-tau)*target_value_state_dict[name].clone()

        self.target_value_net.load_state_dict(value_state_dict)
class DDPGAgent():
    def __init__(self,
                 load_checkpoint,
                 n_states,
                 n_actions,
                 checkpoint_file,
                 mem_size=10**6,
                 batch_size=64,
                 n_hid1=400,
                 n_hid2=300,
                 alpha=1e-4,
                 beta=1e-3,
                 gamma=0.99,
                 tau=0.99):
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  alpha,
                                  checkpoint_file,
                                  name='actor')
        self.critic = CriticNetwork(n_states,
                                    n_actions,
                                    n_hid1,
                                    n_hid2,
                                    beta,
                                    checkpoint_file,
                                    name='critic')

        self.actor_target = ActorNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         alpha,
                                         checkpoint_file,
                                         name='actor_target')
        self.critic_target = CriticNetwork(n_states,
                                           n_actions,
                                           n_hid1,
                                           n_hid2,
                                           beta,
                                           checkpoint_file,
                                           name='critic_target')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.memory = ReplayMemory(mem_size, n_states, n_actions)
        self.update_network_parameters_phil(tau=1)
        if load_checkpoint:
            self.actor.eval()
        self.load_checkpoint = load_checkpoint

    def reset_noise(self):
        self.noise.reset()

    def __copy_param(self, net_param_1, net_param_2, tau):
        # a.copy_(b) reads content from b and copy it to a
        for par, target_par in zip(net_param_1, net_param_2):
            with torch.no_grad():
                val_to_copy = tau * par.weight + (1 - tau) * target_par.weight
                target_par.weight.copy_(val_to_copy)
                if target_par.bias is not None:
                    val_to_copy = tau * par.bias + (1 - tau) * target_par.bias
                    target_par.bias.copy_(val_to_copy)

    def update_network_parameters(self, tau=None):
        # TODO: check equivalence with Phil's method
        # during the class initialization we call this method with tau=1, to perform an exact copy of the nets to targets
        # then when we call this without specifying tau, we use the field stored
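        # polyak averaging: target_param <- tau * online_param + (1 - tau) * target_param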
        if tau is None:
            tau = self.tau

        actor_params = self.actor.children()
        actor_target_params = self.actor_target.children()
        self.__copy_param(actor_params, actor_target_params, tau)

        critic_params = self.critic.children()
        critic_target_params = self.critic_target.children()
        self.__copy_param(critic_params, critic_target_params, tau)

    def choose_action(self, obs):
        # switch to eval mode so that batch statistics are neither used nor updated in the
        # forward pass; this matters for batch norm and dropout, plain layer norm is unaffected
        self.actor.eval()
        obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device)
        # compute actions
        mu = self.actor(obs)
        # add some random noise for exploration
        mu_prime = mu
        if not self.load_checkpoint:
            mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(
                self.actor.device)
            self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def store_transitions(self, obs, action, reward, obs_, done):
        self.memory.store_transition(obs, action, reward, obs_, done)

    def sample_transitions(self):
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer(
            self.batch_size)
        # all networks live on the same device, so the actor's device can be used for every tensor
        state_batch = torch.tensor(state_batch,
                                   dtype=torch.float).to(self.actor.device)
        action_batch = torch.tensor(action_batch,
                                    dtype=torch.float).to(self.actor.device)
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float).to(self.actor.device)
        new_state_batch = torch.tensor(new_state_batch,
                                       dtype=torch.float).to(self.actor.device)
        done_batch = torch.tensor(done_batch).to(self.actor.device)
        return state_batch, action_batch, reward_batch, new_state_batch, done_batch

    def save_models(self):
        self.actor.save_checkpoint()
        self.actor_target.save_checkpoint()
        self.critic.save_checkpoint()
        self.critic_target.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.actor_target.load_checkpoint()
        self.critic.load_checkpoint()
        self.critic_target.load_checkpoint()

    def learn(self):
        # skip learning until the replay memory holds at least one full batch
        if self.memory.mem_counter < self.batch_size:
            return
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions(
        )
        ''' compute actor_target actions and critic_target values, then use obtained values to compute target y_i '''
        target_actions = self.actor_target(
            new_state_batch
        )  #  + torch.tensor(self.noise(), dtype=torch.float).to(self.actor.device)
        target_critic_value_ = self.critic_target(new_state_batch,
                                                  target_actions)
        # target_critic_value_[done_batch == 1] = 0.0  # if done_batch were integer valued
        target_critic_value_[
            done_batch] = 0.0  # done_batch is a bool tensor, so this zeroes terminal next-state values
        target_critic_value_ = target_critic_value_.view(
            -1)  # necessary for operations on matching shapes
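        # DDPG target: y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})), zeroed at terminal states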
        target = reward_batch + self.gamma * target_critic_value_
        target = target.view(self.batch_size, 1)
        ''' zero out gradients '''
        self.actor.optimizer.zero_grad()
        self.critic.optimizer.zero_grad()
        ''' compute critic loss '''
        critic_value = self.critic(state_batch, action_batch)
        critic_loss = F.mse_loss(target, critic_value)
        ''' compute actor loss'''
        # we cannot reuse critic_value directly, because it evaluates the stored (s, a) pairs.
        # The DDPG paper evaluates the critic on the actions produced by the current actor
        # for the sampled states, not on the actions stored in the replay memory.
        # actor_loss = torch.mean(critic_value)
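        # deterministic policy gradient: maximize E[Q(s, mu(s))] by minimizing its negative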
        actor_loss = -self.critic(state_batch, self.actor(state_batch))
        actor_loss = torch.mean(actor_loss)

        critic_loss.backward()
        actor_loss.backward()

        self.actor.optimizer.step()
        self.critic.optimizer.step()

        self.update_network_parameters_phil()

    def __copy_params_phil(self, net_a, net_b, tau):
        net_a_params = net_a.named_parameters()
        net_b_params = net_b.named_parameters()
        net_a_state_dict = dict(net_a_params)
        net_b_state_dict = dict(net_b_params)
        for name in net_a_state_dict:
            net_a_state_dict[name] = tau * net_a_state_dict[name].clone() + (
                1 - tau) * net_b_state_dict[name].clone()
        return net_a_state_dict

    def update_network_parameters_phil(self, tau=None):
        if tau is None:
            tau = self.tau

        updated_actor_state_dict = self.__copy_params_phil(
            self.actor, self.actor_target, tau)
        updated_critic_state_dict = self.__copy_params_phil(
            self.critic, self.critic_target, tau)

        self.actor_target.load_state_dict(updated_actor_state_dict)
        self.critic_target.load_state_dict(updated_critic_state_dict)
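

# A minimal usage sketch for the DDPGAgent class above. Everything below is an
# assumption for illustration, not part of the original example: the environment
# name, the checkpoint path and the classic Gym API (reset() returning only the
# observation, step() returning four values). Kept commented out so the listing
# stays import-safe.
#
#   import gym
#
#   env = gym.make('Pendulum-v1')
#   agent = DDPGAgent(load_checkpoint=False,
#                     n_states=env.observation_space.shape[0],
#                     n_actions=env.action_space.shape[0],
#                     checkpoint_file='tmp/ddpg')
#   for episode in range(100):
#       obs, done = env.reset(), False
#       agent.reset_noise()
#       while not done:
#           action = agent.choose_action(obs)
#           obs_, reward, done, _ = env.step(action)
#           agent.store_transitions(obs, action, reward, obs_, done)
#           agent.learn()
#           obs = obs_
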
class Agent:
    def __init__(self, policy_net, target_net, durability, optimizer, name,
                 constants):
        """An agent class that takes action on the environment and optimizes
        the action based on the reward.

        Parameters
        ----------
        policy_net : DQN
            The network used to select actions and estimate Q(s, a)
        target_net : DQN
            The periodically synchronized network used to compute targets
        durability : int
            The agent's durability value, reduced or healed during training
        optimizer : torch.optim.Optimizer
            The optimizer used to update the policy network
        name : str
            The name of agent
        constants: Constants
            The hyper-parameters from Constants class
        """
        self.CONSTANTS = constants
        self.policy_net = policy_net
        self.target_net = target_net
        self.target_net.load_state_dict(policy_net.state_dict())
        self.durability = durability
        self.optimizer = optimizer
        self.name = name
        self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
        self.steps_done = 0
        self.total_reward = 0.0
        self.reward = 0.0
        self.obtained_reward = 0.0
        self.n_best = 0
        self.policy_net_flag = False

    def select_action(self, state, is_first=False):
        sample = random.random()
        eps_threshold = self.CONSTANTS.EPS_END + (self.CONSTANTS.EPS_START - self.CONSTANTS.EPS_END) * \
                        math.exp(-1. * self.steps_done / self.CONSTANTS.EPS_DECAY)
        self.steps_done += 1
        if is_first:
            self.writer.add_graph(self.policy_net,
                                  input_to_model=state.to(
                                      self.CONSTANTS.DEVICE),
                                  profile_with_cuda=True)
        if sample > eps_threshold:
            with torch.no_grad():
                self.policy_net_flag = True
                return self.policy_net(state.to(
                    self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.CONSTANTS.N_ACTIONS)]],
                                device=self.CONSTANTS.DEVICE,
                                dtype=torch.long)

    def select_core_action(self, best_agent_state, flag, best_agent_action):
        self.steps_done += 1
        if flag:
            with torch.no_grad():
                if best_agent_state is None:
                    return self.policy_net(self.state.to(
                        self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
                else:
                    return self.policy_net(
                        best_agent_state.to(
                            self.CONSTANTS.DEVICE)).max(1)[1].view(1, 1)
        else:
            return best_agent_action

    def optimize_model(self):
        if len(self.memory) < self.CONSTANTS.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.CONSTANTS.BATCH_SIZE)

        # zip(*transitions) unzips the list of transitions into per-field tuples;
        # Transition(*...) rebuilds them as a single named tuple of batches:
        # batch.state - tuple of all the states (each state is a tensor)
        # batch.next_state - tuple of all the next states (each state is a tensor)
        # batch.reward - tuple of all the rewards (each reward is a float)
        # batch.action - tuple of all the actions (each action is an int)
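        # (Transition here is assumed to be a namedtuple along the lines of
        #  namedtuple('Transition', ('state', 'action', 'next_state', 'reward')),
        #  exposed through self.CONSTANTS.TRANSITION.)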

        # Transition = ReplayMemory.get_transition()
        transition = self.CONSTANTS.TRANSITION
        batch = transition(*zip(*transitions))

        actions = tuple(
            (map(lambda a: torch.tensor([[a]], device=self.CONSTANTS.DEVICE),
                 batch.action)))
        rewards = tuple(
            (map(lambda r: torch.tensor([r], device=self.CONSTANTS.DEVICE),
                 batch.reward)))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=utils.get_device(),
                                      dtype=torch.bool)

        non_final_next_states = torch.cat([
            s for s in batch.next_state if s is not None
        ]).to(self.CONSTANTS.DEVICE)

        state_batch = torch.cat(batch.state).to(self.CONSTANTS.DEVICE)
        action_batch = torch.cat(actions)
        reward_batch = torch.cat(rewards)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = torch.zeros(self.CONSTANTS.BATCH_SIZE,
                                        device=self.CONSTANTS.DEVICE)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.CONSTANTS.GAMMA) + reward_batch
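        # this is the standard DQN target: r + gamma * max_a' Q_target(s', a'),
        # with the max term set to 0 for terminal next states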

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def set_tf_writer(self, path):
        self.writer = self._set_tf_writer(path)

    def _set_tf_writer(self, path):
        if self.name == "core":
            writer = SummaryWriter(log_dir="{}/tf-board/core/".format(path))
        else:
            writer = SummaryWriter(
                log_dir="{}/tf-board/{}".format(path, self.name))
        return writer

    def get_state(self):
        return self.state

    def get_next_state(self):
        return self.next_state

    def get_init_state(self):
        return self.init_state

    def get_name(self):
        return self.name

    def get_policy_net_flag(self):
        return self.policy_net_flag

    def set_init_state(self, state):
        self.init_state = state

    def set_state(self, state):
        self.state = state
        self.next_state = state

    def set_env(self, env):
        self.env = env

    def get_env(self):
        return self.env

    def set_action(self, action):
        self.action = action

    def get_action(self):
        return self.action

    def get_durability(self):
        return self.durability

    def get_policy_net(self):
        return self.policy_net

    def reduce_durability(self, value):
        self.durability = self.durability - value

    def heal_durability(self, value):
        self.durability = self.durability + value

    def set_done_state(self, done):
        self.done = done

    def set_total_reward(self, reward):
        self.reward = reward
        if reward > 0.0:
            self.obtained_reward += reward
        self.total_reward += reward

    def reset_total_reward(self):
        self.total_reward = 0.0
        self.obtained_reward = 0.0

    def get_reward(self):
        return self.reward

    def get_obtained_reward(self):
        return self.obtained_reward

    def best_counter(self):
        self.n_best += 1

    def get_n_best(self):
        return self.n_best

    def get_total_reward(self):
        return self.total_reward

    def set_step_retrun_value(self, obs, done, info):
        self.obs = obs
        self.done = done
        self.info = info

    def is_done(self):
        return self.done
Example #24
0
class Worker():
    def __init__(self, env, name, s_size, a_size, trainer, model_path,
                 global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        # Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)

    def train(self, rollout, sess, gamma, ISWeights):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        next_observations = rollout[:, 3]
        dones = rollout[:, 4]

        Q_target = sess.run(
            self.local_Q.Q,
            feed_dict={self.local_Q.inputs: np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)

        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1
        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

        q_target = sess.run(self.local_Q.q_action,
                            feed_dict={
                                self.local_Q.inputs:
                                np.vstack(next_observations),
                                self.local_Q.actions_q: action_next
                            })
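        # quantile-regression target: each of the N quantile atoms gets
        # r + gamma * theta_j(s', a*), with a* = argmax_a of the mean Q value at s'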
        q_target_batch = []
        for i in range(len(q_target)):
            qi = q_target[i]  # * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)
        #print q_target_batch
        isweight = np.zeros((batch_size, N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i, j] = ISWeights[i]
        feed_dict = {
            self.local_Q.inputs: np.vstack(observations),
            self.local_Q.actions_q: action_now,
            self.local_Q.q_target: q_target_batch,
            self.local_Q.ISWeights: isweight
        }
        u, l, g_n, v_n, _ = sess.run([
            self.local_Q.u, self.local_Q.loss, self.local_Q.grad_norms,
            self.local_Q.var_norms, self.local_Q.apply_grads
        ],
                                     feed_dict=feed_dict)
        return l / len(rollout), g_n, v_n, Q_target, u

    def work(self, gamma, sess, coord, saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2

        print("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                #episode_buffer = []
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997
                while not d:
                    #self.env.render()
                    GLOBAL_STEP += 1
                    # Take an action: greedy w.r.t. the Q network with probability 1 - epsilon, random otherwise.
                    if random.random() > epsilon:
                        a_dist_list = sess.run(
                            self.local_Q.Q,
                            feed_dict={self.local_Q.inputs: [s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        a = random.randint(0, 5)

                    s1, r, d, _ = self.env.step(a)
                    if not d:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s, a, r, s1, d])
                    episode_reward += r
                    s = s1
                    total_steps += 1
                    episode_step_count += 1
                    if total_steps % 2 == 0 and not d and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                            batch_size)
                        l, g_n, v_n, Q_target, u = self.train(
                            episode_buffer, sess, gamma, ISWeights)
                        u = np.mean(u, axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx, u)
                        #sess.run(self.update_local_ops)
                    if d:
                        break
                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)

                # Periodically print statistics and save the model parameters.
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0' and episode_count % 5 == 0:
                        print('\n episode: ', episode_count, 'global_step:', \
                              GLOBAL_STEP, 'mean episode reward: ', np.mean(self.episode_rewards[-10:]), \
                              'epsilon: ', epsilon)

                    print('loss', l, 'Qtargetmean', np.mean(Q_target))
                    #print 'p_target', p_target
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(
                            sess, self.model_path + '/qr-dqn-' +
                            str(episode_count) + '.cptk')
                        print("Saved Model")

                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward

                if self.name == 'worker_0':
                    sess.run(self.increment)
                    #if episode_count%1==0:
                    #print('\r {} {}'.format(episode_count, episode_reward),end=' ')
                episode_count += 1
Example #25
0
class Brain:
    def __init__(self, num_actions, Double, Dueling, PER):
        self.num_actions = num_actions  # number of possible actions (2)
        self.Double = Double
        self.Dueling = Dueling
        self.PER = PER

        # create the memory object that stores transitions
        self.memory = ReplayMemory(CAPACITY)

        # build the networks
        n_out = num_actions
        self.main_q_network = Net_CNN(n_out, Dueling)  # uses the Net class
        self.target_q_network = Net_CNN(n_out, Dueling)  # uses the Net class
        print(self.main_q_network)  # print the network architecture

        # choose the optimization method
        self.optimizer = optim.Adam(self.main_q_network.parameters(),
                                    lr=0.0001)

        # PER - create the memory object that stores TD errors
        if self.PER == True:
            self.td_error_memory = TDerrorMemory(CAPACITY)

    def replay(self, episode=0):
        '''Learn the network weights with experience replay'''

        # 1. check the number of stored transitions
        if len(self.memory) < BATCH_SIZE:
            return

        # 2. build a mini-batch
        if self.PER == True:
            self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states = self.make_minibatch(
                episode)
        else:
            self.batch, self.state_batch, self.action_batch, self.reward_batch, self.non_final_next_states = self.make_minibatch(
            )

        # 3. compute the Q(s_t, a_t) values to use as the training targets
        self.expected_state_action_values = self.get_expected_state_action_values(
        )

        # 4. update the network weights
        self.update_main_q_network()

    def decide_action(self, state, episode):
        '''Decide an action from the current state'''
        # epsilon-greedy: gradually increase the share of greedy actions
        epsilon = 0.5 * (1 / (episode + 1))
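        # e.g. episode 0 -> epsilon = 0.5, episode 9 -> epsilon = 0.05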

        if epsilon <= np.random.uniform(0, 1):
            self.main_q_network.eval()  # switch the network to inference mode
            with torch.no_grad():
                action = self.main_q_network(state).max(1)[1].view(1, 1)
            # max(1)[1] gives the index of the largest network output
            # .view(1,1) reshapes a [torch.LongTensor of size 1] to size 1*1

        else:
            # return a random action (0 or 1)
            action = torch.LongTensor([[random.randrange(self.num_actions)]
                                       ])
            # action becomes a [torch.LongTensor of size 1*1]

        return action

    def make_minibatch(self, episode=0):
        '''2. Build a mini-batch'''

        if self.PER == True:
            # 2.1 PER - sample a mini-batch from the memory object
            # def make_minibatch(self, episode):
            if episode < 30:
                transitions = self.memory.sample(BATCH_SIZE)
            else:
                # sample the mini-batch according to the TD errors instead
                indexes = self.td_error_memory.get_prioritized_indexes(
                    BATCH_SIZE)
                transitions = [self.memory.memory[n] for n in indexes]
        else:
            # 2.1 sample a mini-batch from the memory object
            transitions = self.memory.sample(BATCH_SIZE)

        # 2.2 reshape each variable into a mini-batch-friendly form
        # transitions stores (state, action, state_next, reward) once per step, BATCH_SIZE times,
        # i.e. it has the form (state, action, state_next, reward) * BATCH_SIZE.
        # To turn this into a mini-batch we convert it to
        # (state*BATCH_SIZE, action*BATCH_SIZE, state_next*BATCH_SIZE, reward*BATCH_SIZE).

        batch = Transition(*zip(*transitions))

        # 2.3 reshape the elements of each variable to fit the mini-batch and wrap them so the network can use them
        # state, for example, consists of BATCH_SIZE elements of shape [torch.FloatTensor of size 1*4];
        # these are converted into a torch.FloatTensor of size BATCH_SIZE*4.
        # Build the mini-batches of states, actions, rewards and non_final next states.
        # cat stands for concatenate.
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        return batch, state_batch, action_batch, reward_batch, non_final_next_states

    def get_expected_state_action_values(self):
        '''Compute the Q(s_t, a_t) values to use as the training targets'''

        # 3.1 switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # 3.2 compute Q(s_t, a_t) with the main network
        # self.main_q_network(state_batch) outputs the Q values for left and right,
        # shaped [torch.FloatTensor of size BATCH_SIZEx2].
        # We only need the Q value of the action a_t that was actually taken, so we use
        # the left/right index stored in action_batch and collect those Q values with gather.
        self.state_action_values = self.main_q_network(
            self.state_batch).gather(1, self.action_batch)

        # 3.3 compute max{Q(s_t+1, a)}, being careful about whether a next state exists

        # build an index mask of transitions where cartpole is not done and next_state exists
        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, self.batch.next_state)))

        # initialize everything to 0 first
        next_state_values = torch.zeros(BATCH_SIZE)

        # Double DQN
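        # Double DQN target: r + gamma * Q_target(s', argmax_a Q_main(s', a))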
        if self.Double == True:
            a_m = torch.zeros(BATCH_SIZE).type(torch.LongTensor)

            # compute the action a_m that maximizes Q in the next state with the main Q-network
            # the trailing [1] picks the index of that action
            a_m[non_final_mask] = self.main_q_network(
                self.non_final_next_states).detach().max(1)[1]

            # keep only the entries that have a next state and reshape size 32 to 32*1
            a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

            # for indexes that have a next state, compute the Q value of action a_m with the target Q-network
            # detach() pulls the values out of the graph
            # squeeze() converts size [minibatch*1] to [minibatch]
            next_state_values[non_final_mask] = self.target_q_network(
                self.non_final_next_states).gather(
                    1, a_m_non_final_next_states).detach().squeeze()
        else:
            # take the maximum Q value over the indexes that have a next state
            # max(1) returns the column-wise maximum as [values, indices],
            # so we take the Q values (index 0)
            # and pull them out of the graph with detach
            next_state_values[non_final_mask] = self.target_q_network(
                self.non_final_next_states).max(1)[0].detach()

        # 3.4 compute the target Q(s_t, a_t) values with the Q-learning update rule
        expected_state_action_values = self.reward_batch + GAMMA * next_state_values

        return expected_state_action_values

    def update_main_q_network(self):
        '''4. Update the network weights'''

        # 4.1 switch the network to training mode
        self.main_q_network.train()

        # 4.2 compute the loss (smooth_l1_loss is the Huber loss)
        # expected_state_action_values has size [minibatch], so unsqueeze it to [minibatch*1]
        loss = F.smooth_l1_loss(self.state_action_values,
                                self.expected_state_action_values.unsqueeze(1))

        # 4.3 update the weights
        self.optimizer.zero_grad()  # reset the gradients
        loss.backward()  # backpropagate
        self.optimizer.step()  # update the weights

    def update_target_q_network(self):  # added for DDQN
        '''Sync the target Q-network with the main Q-network'''
        self.target_q_network.load_state_dict(self.main_q_network.state_dict())

    def update_td_error_memory(self):  # added for Prioritized Experience Replay
        '''Update the TD errors stored in the TD-error memory'''

        # switch the networks to inference mode
        self.main_q_network.eval()
        self.target_q_network.eval()

        # build a mini-batch from all stored transitions
        transitions = self.memory.memory
        batch = Transition(*zip(*transitions))

        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])

        # compute the network output Q(s_t, a_t)
        state_action_values = self.main_q_network(state_batch).gather(
            1, action_batch)

        # build an index mask of transitions where cartpole is not done and next_state exists
        non_final_mask = torch.ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))

        # initialize everything to 0 first, with one entry per stored transition
        next_state_values = torch.zeros(len(self.memory))
        a_m = torch.zeros(len(self.memory)).type(torch.LongTensor)

        # compute the action a_m that maximizes Q in the next state with the main Q-network
        # the trailing [1] picks the index of that action
        a_m[non_final_mask] = self.main_q_network(
            non_final_next_states).detach().max(1)[1]

        # keep only the entries that have a next state and reshape size 32 to 32*1
        a_m_non_final_next_states = a_m[non_final_mask].view(-1, 1)

        # for indexes that have a next state, compute the Q value of action a_m with the target Q-network
        # detach() pulls the values out of the graph
        # squeeze() converts size [minibatch*1] to [minibatch]
        next_state_values[non_final_mask] = self.target_q_network(
            non_final_next_states).gather(
                1, a_m_non_final_next_states).detach().squeeze()

        # compute the TD errors
        td_errors = (reward_batch + GAMMA *
                     next_state_values) - state_action_values.squeeze()
        # state_action_values has size [minibatch*1], so squeeze it to size [minibatch]

        # update the TD-error memory: detach the tensor, convert it to a NumPy array
        # and then to a plain Python list
        self.td_error_memory.memory = td_errors.detach().numpy().tolist()
Example #26
0
class DDPG:
    def __init__(self, dim):
        self.critic_path = cst.CN_CKPT_PATH
        self.actor_path = cst.AN_CKPT_PATH
        self.replaymemory_path = cst.RM_PATH

        self.dim_body = dim[0]
        self.dim_sensor = dim[1]
        self.dim_state = dim[0] + dim[1] * 3
        self.dim_action = dim[2]

        self.sess = tf.InteractiveSession()
        self.act_lr = cst.ACT_LEARNING_RATE
        self.cri_lr = cst.CRI_LEARNING_RATE
        self.tau = cst.TAU
        self.batch_size = cst.BATCH_SIZE
        self.gamma = cst.REWARD_DECAY

        self.actorNN = ActorNetwork(self.sess, self.dim_state, self.dim_action,
                                    self.act_lr, self.tau, self.batch_size)
        self.criticNN = CriticNetwork(self.sess, self.dim_state,
                                      self.dim_action, self.cri_lr, self.tau,
                                      self.gamma,
                                      self.actorNN.get_num_trainable_vars())

        self.sess.run(tf.global_variables_initializer())

        self.actorNN.update_target_network()
        self.criticNN.update_target_network()

        self.rm = ReplayMemory('DDPG')

        self.agent_count = cst.AGENT_COUNT
        self.exploration_rate = cst.EXPLORATION_RATE
        self.epsilon = cst.CRITIC_EPSILON
        self.LOSS_ITERATION = cst.LOSS_ITERATION

        self.expl_noise = OUNoise(self.dim_action)

        self.expl = False
        self.expl_decay = cst.EXPLORATION_DECAY

    #=====================action===========================

    def Action(self, obs, action_type, run_type):
        if action_type == 'GREEDY':
            return self.action_greedy(obs)

        self.isExploration(run_type == 'TRAIN')

        action_list = []
        agent_num = len(obs['agent'])
        for i in range(0, agent_num):
            agent_obs = obs['agent'][i]
            if np.linalg.norm(agent_obs['d'] -
                              agent_obs['p']) < cst.AGENT_RADIUS + 10:
                action = {}
                action['theta'] = 0
                action['velocity'] = 0
                action['stop'] = True
            else:
                action = self.get_action(agent_obs, run_type == 'TEST')
                if self.expl:
                    action = self.action_random(action)

            action_list.append(action)

        return action_list

    def action(self, obs, action_type, run_type):
        if action_type == 'GREEDY':
            return self.action_greedy(obs)

        self.isExploration(run_type == 'TRAIN')

        action_list = []
        for i in range(0, self.agent_count):
            agent_obs = obs['agent'][i]
            if np.linalg.norm(agent_obs['d'] -
                              agent_obs['p']) < agent_obs['r'] + 10:
                action = {}
                action['theta'] = 0
                action['velocity'] = 0
                action['stop'] = True
            else:
                action = self.get_action(agent_obs, run_type == 'TEST')
                if self.expl:
                    action = self.action_random(action)

            action_list.append(action)

        # for i in range(self.agent_count):
        #   agent_obs = obs['agent'][i]
        #   if np.linalg.norm(agent_obs['d']-agent_obs['p']) < agent_obs['r'] + 5:
        #       action = {}
        #       action['theta'] = 0
        #       action['velocity'] = 0
        #       action['stop'] = True
        #   else:
        #       if i == 0:
        #           action = self.get_action(agent_obs, run_type=='TEST')
        #           if self.expl:
        #               action = self.action_random()
        #       else:
        #           action = self.get_action_greedy(agent_obs)

        #   action_list.append(action)

        return action_list

    def get_action(self, agent_obs, action_target=False):
        state_ = {}
        state_ = self.preprocess(agent_obs)
        state_body = np.reshape(state_['body'], (1, self.dim_body))
        state_sensor = np.reshape(state_['sensor'], (1, self.dim_sensor))

        if action_target:
            prediction = self.actorNN.predict_target(state_body, state_sensor)
        else:
            prediction = self.actorNN.predict(state_body, state_sensor)

        action = {}
        action['theta'] = prediction[0][0]
        action['velocity'] = prediction[0][1]
        action['stop'] = False

        return action

    def action_greedy(self, obs):
        action_list = []
        agent_num = len(obs['agent'])
        for i in range(agent_num):
            agent_obs = obs['agent'][i]

            action = self.get_action_greedy(agent_obs)
            action_list.append(action)

        return action_list

    def get_action_greedy(self, agent_obs):
        if np.linalg.norm(agent_obs['d'] - agent_obs['p']) < 10 + 10:
            action = {}
            action['theta'] = 0
            action['velocity'] = 0
            action['stop'] = True
            return action

        greedy_dis = None

        angle_num = 20
        next_angle = (190 / 2.0)

        offset = 2
        direction = np.array(agent_obs['d']) - np.array(agent_obs['p'])
        direction /= np.linalg.norm(direction)

        greedy_dir = 0
        if random.random() < 0.5:
            greedy_dir = 1

        for angle in range(angle_num):
            if agent_obs['d_map'][angle] < 10 + offset:
                continue

            curr_angle = 190 / 2 - angle * 10
            curr_q = mMath.AngleToCoor(
                curr_angle + agent_obs['front']) * agent_obs['d_map'][angle]
            curr_dis = direction[0] * curr_q[0] + direction[1] * curr_q[1]
            if greedy_dir == 0:
                if (greedy_dis is None) or (greedy_dis < curr_dis):
                    next_angle = curr_angle
                    greedy_dis = curr_dis
                    next_q = curr_q
            else:
                if (greedy_dis is None) or (greedy_dis <= curr_dis):
                    next_angle = curr_angle
                    greedy_dis = curr_dis
                    next_q = curr_q

        action = {}
        action['theta'] = np.clip(next_angle, -10, 10) / 10.0

        if greedy_dis is None:
            action['velocity'] = -1
        else:
            action['velocity'] = 1

        action['stop'] = False

        return action

    def action_random(self, action=None):
        if action is None:
            action = dict()
            action['theta'] = np.random.normal()
            action['velocity'] = np.random.normal()
        else:
            noise_theta, noise_vel = self.expl_noise.noise()
            action['theta'] = action['theta'] + noise_theta
            action['velocity'] = action['velocity'] + noise_vel

        action['stop'] = False

        return action

    #=====================update==========================

    def Update(self):
        if len(self.rm.memory['critic']) > 0 and len(
                self.rm.memory['actor']) > 0:
            self.update_network()

    def update_network(self):
        rm_critic_batch = self.rm.getRandomMemories('critic')

        s_body_batch, s_sensor_batch, a_batch, r_batch, t_batch, s2_body_batch, s2_sensor_batch = [], [], [], [], [], [], []
        for m in rm_critic_batch:
            state_ = copy.copy(self.preprocess(m['state']['agent'][0]))
            state_body = copy.copy(state_['body'])
            state_sensor = copy.copy(state_['sensor'])
            action = copy.copy(
                np.array([m['action'][0]['theta'],
                          m['action'][0]['velocity']]))
            next_state_ = copy.copy(
                self.preprocess(m['next_state']['agent'][0]))
            next_state_body = copy.copy(next_state_['body'])
            next_state_sensor = copy.copy(next_state_['sensor'])

            s_body_batch.append(state_body[0])
            s_sensor_batch.append(state_sensor[0])
            a_batch.append(action)
            r_batch.append(m['reward'])
            t_batch.append(m['term'])
            s2_body_batch.append(next_state_body[0])
            s2_sensor_batch.append(next_state_sensor[0])

        target_q = self.criticNN.predict_target(
            s2_body_batch, s2_sensor_batch,
            self.actorNN.predict_target(s2_body_batch, s2_sensor_batch))
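        # critic target: y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})),
        # or just r_i for terminal transitions (handled in the loop below)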

        y_i = []
        c_batch_size = len(rm_critic_batch)
        for k in range(c_batch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.criticNN.train(
            s_body_batch, s_sensor_batch, a_batch,
            np.reshape(y_i, (int(c_batch_size), 1)))

        # Update the actor policy using the sampled gradient
        rm_actor_batch = self.rm.getRandomMemories('actor')

        actor_body_batch, actor_sensor_batch, actor_a_batch = [], [], []
        for m in rm_actor_batch:
            state_ = copy.copy(self.preprocess(m['state']['agent'][0]))
            state_body = copy.copy(state_['body'])
            state_sensor = copy.copy(state_['sensor'])
            action = copy.copy(
                np.array([m['action'][0]['theta'],
                          m['action'][0]['velocity']]))
            actor_body_batch.append(state_body[0])
            actor_sensor_batch.append(state_sensor[0])
            actor_a_batch.append(action)

        act_batch = self.actorNN.predict(actor_body_batch, actor_sensor_batch)
        grads = self.criticNN.action_gradients(actor_body_batch,
                                               actor_sensor_batch, act_batch)
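        # deterministic policy gradient: grad_theta J ~ grad_a Q(s, a)|a=mu(s) * grad_theta mu(s)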
        self.actorNN.train(actor_body_batch, actor_sensor_batch, grads[0])

        # Update target networks
        self.actorNN.update_target_network()
        self.criticNN.update_target_network()

    #===================evaluate===========================

    def evaluate(self, obs, agent_idx, action, run_type='TRAIN'):
        state_ = {}
        agent_obs = obs['agent'][agent_idx]

        state_['body'] = np.array(
            self.preprocess_body(agent_obs['p'], agent_obs['q'],
                                 agent_obs['v'], agent_obs['d']))
        state_['action'] = np.array([action['theta'], action['velocity']])
        state_['sensor'] = np.array(
            self.preprocess_sensor(agent_obs['d_map'], agent_obs['v_map'],
                                   agent_obs['q_lim'], agent_obs['v_depth']))

        state_body = np.reshape(state_['body'], (1, self.dim_body))
        state_sensor = np.reshape(state_['sensor'], (1, self.dim_sensor))
        action = np.reshape(state_['action'], (1, self.dim_action))

        if run_type == 'TEST':
            prediction = self.criticNN.predict_target(state_body, state_sensor,
                                                      action)[0]
        else:
            prediction = self.criticNN.predict(state_body, state_sensor,
                                               action)[0]

        return prediction

    def expl_rate_decay(self):
        if self.exploration_rate > 0.2:
            self.exploration_rate *= self.expl_decay
            print "exploration rate : ", self.exploration_rate

    #=====================replay_memory===========================

    def addMemory(self, is_greedy, obs, act, next_state, reward, is_term):
        if is_greedy:
            self.rm.addMemory('actor', obs, act, next_state, reward, is_term)
            self.rm.addMemory('critic', obs, act, next_state, reward, is_term)
        else:
            if self.expl:
                self.rm.addMemory('actor', obs, act, next_state, reward,
                                  is_term)
                self.expl = False
            else:
                self.rm.addMemory('critic', obs, act, next_state, reward,
                                  is_term)

    #==================save & load==========================

    def save(self, m_replay=False, training_time=0, eval_list=None):
        cur_time = strftime("%Y%m%d_%I%M.ckpt", localtime())

        print "Save Critic Network : ",
        self.criticNN.save(self.critic_path, cur_time)

        print "Save Actor Network : ",
        self.actorNN.save(self.actor_path, cur_time)

        print "Parameters Saved...!"
        self.save_parameters(cur_time, training_time)

        print "Networks Saved...!"

        if m_replay:
            print "Replay Memories Saved...!"
            self.save_replaymemory(cur_time)

        if eval_list != None:
            print "Evaluation List Saved...!"
            self.save_evaluation(cur_time, eval_list)

    def save_replaymemory(self, cur_time):
        f = open(cst.RM_PATH + "checkpoint", 'w')
        f.write(cur_time)
        f.close()

        f = open(cst.RM_PATH + "rm_" + cur_time, 'w')
        pickle.dump(self.rm, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.close()

    def save_evaluation(self, cur_time, eval_list=None):
        f = open(cst.EVAL_PATH + "checkpoint", 'w')
        f.write(cur_time)
        f.close()

        f = open(cst.EVAL_PATH + "eval_" + cur_time, 'w')
        pickle.dump(eval_list, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.close()

    def save_parameters(self, cur_time, training_time):
        f_read = open(cst.PM_READ_PATH, 'r')
        f_write = open(cst.PM_WRITE_PATH + "pm_" + cur_time + ".txt", 'w')
        f_write.write("traning time : " + str(training_time))
        while True:
            line = f_read.readline()
            if not line:
                break
            f_write.write(line)
        f_read.close()
        f_write.close()

    def load_network(self, type):
        if type == 'actor':
            print "Load Recent Actor Network : ",
            self.actorNN.load(self.actor_path)
        elif type == 'critic':
            print "Load Recent Critic Network : ",
            self.criticNN.load(self.critic_path)

    def load_memory(self):
        f = open(cst.RM_PATH + "checkpoint", 'r')
        recent_file_name = f.readline()
        f.close()

        f_rm = open(cst.RM_PATH + "rm_" + recent_file_name, 'r')
        self.rm = pickle.load(f_rm)
        f_rm.close()

        print "Load Replay Memory :  ", cst.RM_PATH, "rm_", recent_file_name

    def load_eval(self):
        f = open(cst.EVAL_PATH + "checkpoint", 'r')
        recent_file_name = f.readline()
        f.close()

        f_eval = open(cst.EVAL_PATH + "eval_" + recent_file_name, 'r')
        self.eval = pickle.load(f_eval)
        f_eval.close()

        print "Load Eval List :  ", cst.EVAL_PATH, "eval_", recent_file_name

    #=================other===============================

    def preprocess(self, agent_obs):
        state = {}
        state['body'] = np.array(
            self.preprocess_body(agent_obs['p'], agent_obs['q'],
                                 agent_obs['v'], agent_obs['d'])).reshape(
                                     (1, self.dim_body))
        state['sensor'] = np.array(
            self.preprocess_sensor(agent_obs['d_map'], agent_obs['delta'], 20,
                                   cst.VISION_DEPTH)).reshape((1, 40))

        return state

    def preprocess_body(self, p, q, v, d):
        p_ = np.array(p)
        q_ = np.array(q)
        d_ = np.array(d)

        width = cst.WINDOW_WIDTH / 2.0
        height = cst.WINDOW_HEIGHT / 2.0

        p_[0] = p_[0] / width
        p_[1] = p_[1] / height

        d_[0] = d_[0] / width
        d_[1] = d_[1] / height

        q_norm = np.linalg.norm(q_)
        q_ = (q_ / q_norm)

        pd = np.array(d_ - p_)
        pd_len = np.linalg.norm(pd)
        pd_vec = pd / pd_len

        inner = mMath.InnerProduct(q_, pd_vec)
        cross = mMath.CrossProduct(q_, pd_vec)

        cross_val = 1.0
        if cross < 0:
            cross_val = 0.0

        return [v, inner, cross_val, pd_len]

    def preprocess_sensor(self, d_map, delta_map, q_lim, vision_depth):
        depth = [d / float(vision_depth) for d in d_map]
        delta = [d / float(vision_depth) for d in delta_map]

        # print "depth : ", depth
        # print "delta : ", delta

        sensor = depth + delta
        return np.array(sensor)

    def get_agent_count(self, is_train, obs):
        if is_train:
            return 1
        else:
            return len(obs['agent'])

    def isExploration(self, flag):
        self.expl = (flag and random.random() < self.exploration_rate)
Example #27
0
    def __init__(self,
                 game,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 frameskip = 4
                 ):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - frameskip: int number of frames to repeat each action for
        """

        # Namestring
        self.game = game

        # Environment
        self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())

        self.target_net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
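        # (50k transitions when starting from scratch, or half the memory when resuming from a pretrained model)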
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500
class Agent():
    def __init__(self, game, agent_type, display, load_model, record, test):
        self.name = game
        self.agent_type = agent_type
        self.ale = ALEInterface()
        self.ale.setInt(str.encode('random_seed'), np.random.randint(100))
        self.ale.setBool(str.encode('display_screen'), display or record)
        if record:
            self.ale.setString(str.encode('record_screen_dir'), str.encode('./data/recordings/{}/{}/tmp/'.format(game, agent_type)))

        self.ale.loadROM(str.encode('./roms/{}.bin'.format(self.name)))
        self.action_list = list(self.ale.getMinimalActionSet())
        self.frame_shape = np.squeeze(self.ale.getScreenGrayscale()).shape
        if test:
            self.name += '_test'

        if 'space_invaders' in self.name:
            # Account for blinking bullets
            self.frameskip = 2
        else:
            self.frameskip = 3

        self.frame_buffer = deque(maxlen=4)
        if load_model and not record:
            self.load_replaymemory()
        else:
            self.replay_memory = ReplayMemory(500000, 32)

        model_input_shape = self.frame_shape + (4,)
        model_output_shape = len(self.action_list)

        if agent_type == 'dqn':
            self.model = DeepQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )
        elif agent_type == 'double':
            self.model = DoubleDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        else:
            self.model = DuelingDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        print('{} Loaded!'.format(' '.join(self.name.split('_')).title()))
        print('Displaying: ', display)
        print('Frame Shape: ', self.frame_shape)
        print('Frame Skip: ', self.frameskip)
        print('Action Set: ', self.action_list)
        print('Model Input Shape: ', model_input_shape)
        print('Model Output Shape: ', model_output_shape)
        print('Agent: ', agent_type)

    def training(self, steps):
        '''
        Trains the agent for :steps number of weight updates.

        Returns the average model loss
        '''

        loss = []

        # Initialize frame buffer. np.squeeze removes empty dimensions e.g. if shape=(210,160,__)
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))

        try:
            for step in range(steps):
                gameover = False
                initial_state = np.stack(self.frame_buffer, axis=-1)
                action = self.model.predict_action(initial_state)

                # Backup data
                if step % 5000 == 0:
                    self.model.save_model()
                    self.model.save_hyperparams()
                    self.save_replaymemory()

                # If using a target model check for weight updates
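                # here tau acts as a step countdown (reset to 10000), not as a soft-update coefficient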
                if hasattr(self.model, 'tau'):
                    if self.model.tau == 0:
                        self.model.update_target_model()
                        self.model.tau = 10000
                    else:
                        self.model.tau -= 1

                # Frame skipping technique https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/
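                # i.e. repeat the chosen action for self.frameskip frames, then act once more and use that reward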
                lives_before = self.ale.lives()
                for _ in range(self.frameskip):
                    self.ale.act(action)

                reward = self.ale.act(action)
                self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
                lives_after = self.ale.lives()

                if lives_after < lives_before:
                    gameover = True  # treat a lost life as terminal (a common training trick)
                    reward = -1

                if self.ale.game_over():
                    gameover = True
                    reward = -1
                    self.ale.reset_game()

                new_state = np.stack(self.frame_buffer, axis=-1)

                # Experiment with clipping rewards for stability purposes
                reward = np.clip(reward, -1, 1)
                self.replay_memory.add(
                    initial_state,
                    action,
                    reward,
                    gameover,
                    new_state
                )

                loss += self.model.replay_train()
        except:
            self.model.save_model()
            self.model.save_hyperparams()
            self.save_replaymemory()
            raise KeyboardInterrupt

        return np.mean(loss, axis=0)

    def simulate_random(self):
        print('Simulating game randomly')
        done = False
        total_reward = 0
        while not done:
            action = np.random.choice(self.ale.getMinimalActionSet())
            reward = self.ale.act(action)
            total_reward += reward
            if self.ale.game_over():
                reward = -1
                done = True

            reward = np.clip(reward, -1, 1)
            if reward != 0:
                print(reward)

        frames_survived = self.ale.getEpisodeFrameNumber()
        self.ale.reset_game()
        return total_reward, frames_survived

    def simulate_intelligent(self, evaluating=False):
        done = False
        total_score = 0

        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
        while not done:
            state = np.stack(self.frame_buffer, axis=-1)
            action = self.model.predict_action(state, evaluating)

            for _ in range(self.frameskip):
                self.ale.act(action)

            # Remember, ale.act returns the increase in game score with this action
            total_score += self.ale.act(action)

            # Pushes oldest frame out
            self.frame_buffer.append(np.squeeze(self.ale.getScreenGrayscale()))
            if self.ale.game_over():
                done = True

        frames_survived = self.ale.getEpisodeFrameNumber()
        print('   Game Over')
        print('   Frames Survived: ', frames_survived)
        print('   Score: ', total_score)
        print('===========================')
        self.ale.reset_game()
        return total_score, frames_survived

    def save_replaymemory(self):
        with bz2.BZ2File('./data/{}/{}_replaymem.obj'.format(self.agent_type, self.name), 'wb') as f:
            pickle.dump(self.replay_memory, f, protocol=pickle.HIGHEST_PROTOCOL)
            print('Saved replay memory at ', datetime.now())

    def load_replaymemory(self):
        try:
            with bz2.BZ2File('./data/{}/{}_replaymem.obj'.format(self.agent_type, self.name), 'rb') as f:
                self.replay_memory = pickle.load(f)
                print('Loaded replay memory at ', datetime.now())
        except FileNotFoundError:
            print('No replay memory file found')
            raise KeyboardInterrupt
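
For reference, the frame-skipping step used in the training and evaluation loops above can be pulled out into a standalone helper. This is a minimal sketch rather than part of the original code; it assumes an ALEInterface-style `ale` object whose act() returns the score gained on that single frame.

def act_with_frameskip(ale, action, skip=4):
    """Repeat `action` for `skip` emulator frames and accumulate the reward."""
    total_reward = 0
    for _ in range(skip):
        total_reward += ale.act(action)
        if ale.game_over():
            break
    return total_reward
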
Example #29
class Agent():
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 update_actor_interval=2,
                 warmup=1000,
                 mem_size=10**6,
                 batch_size=100,
                 n_hid1=400,
                 n_hid2=300,
                 lr_alpha=1e-3,
                 lr_beta=1e-3,
                 gamma=0.99,
                 tau=5e-3,
                 noise_mean=0,
                 noise_sigma=0.1):

        self.load_checkpoint = load_checkpoint
        self.checkpoint_file = checkpoint_file
        # needed for clamping in the learn function
        self.env = env
        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.n_actions = n_actions
        # counts how many times learn() has been called, used for the delayed actor updates
        self.learn_step_counter = 0
        # counts action selections, used to detect the end of the warmup period
        self.time_step = 0
        self.update_actor_interval = update_actor_interval
        self.warmup = warmup
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.noise_mean = noise_mean
        self.noise_sigma = noise_sigma

        self.actor = TD3ActorNetwork(n_states,
                                     n_actions,
                                     n_hid1,
                                     n_hid2,
                                     lr_alpha,
                                     checkpoint_file,
                                     name='actor')
        self.target_actor = TD3ActorNetwork(n_states,
                                            n_actions,
                                            n_hid1,
                                            n_hid2,
                                            lr_alpha,
                                            checkpoint_file,
                                            name='target_actor')

        self.critic_1 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_1')
        self.critic_2 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_2')
        self.target_critic_1 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_1')
        self.target_critic_2 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_2')

        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        # tau=1 performs an exact copy of each network into its respective target
        self.update_network_parameters(self.actor, self.target_actor, tau=1)
        self.update_network_parameters(self.critic_1,
                                       self.target_critic_1,
                                       tau=1)
        self.update_network_parameters(self.critic_2,
                                       self.target_critic_2,
                                       tau=1)

    def choose_action(self, obs):
        if self.time_step < self.warmup:
            self.time_step += 1
            action = torch.tensor(self.env.action_space.sample())
        else:
            obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device)
            action = self.actor(obs)

            # exploratory noise, scaled w.r.t. the action range (max_action);
            # cast to float32 so it matches the actor's output dtype
            noise = torch.tensor(
                np.random.normal(self.noise_mean,
                                 self.noise_sigma * self.max_action,
                                 size=self.n_actions),
                dtype=torch.float).to(self.actor.device)
            action += noise
        action = torch.clamp(action, self.low_action, self.max_action)
        return action.cpu().detach().numpy()

    def choose_action_eval(self, obs):
        obs = torch.tensor(obs, dtype=torch.float).to(self.actor.device)
        action = self.actor(obs)
        action = torch.clamp(action, self.low_action, self.max_action)
        return action.cpu().detach().numpy()

    def store_transition(self, obs, action, reward, obs_, done):
        self.memory.store_transition(obs, action, reward, obs_, done)

    def sample_transitions(self):
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.memory.sample_buffer(
            self.batch_size)
        # all networks share the same device, so using the actor's device works for every batch tensor
        state_batch = torch.tensor(state_batch,
                                   dtype=torch.float).to(self.actor.device)
        action_batch = torch.tensor(action_batch,
                                    dtype=torch.float).to(self.actor.device)
        reward_batch = torch.tensor(reward_batch,
                                    dtype=torch.float).to(self.actor.device)
        new_state_batch = torch.tensor(new_state_batch,
                                       dtype=torch.float).to(self.actor.device)
        done_batch = torch.tensor(done_batch).to(self.actor.device)
        return state_batch, action_batch, reward_batch, new_state_batch, done_batch

    def __copy_param(self, net_param_1, net_param_2, tau):
        # a.copy_(b) copies the content of b into a; the arguments are expected to be
        # iterables of modules (e.g. from net.children()) exposing .weight and .bias
        for par, target_par in zip(net_param_1, net_param_2):
            with torch.no_grad():
                val_to_copy = tau * par.weight + (1 - tau) * target_par.weight
                target_par.weight.copy_(val_to_copy)
                if target_par.bias is not None:
                    val_to_copy = tau * par.bias + (1 - tau) * target_par.bias
                    target_par.bias.copy_(val_to_copy)

    def update_network_parameters(self, network, target_network, tau=None):
        # Polyak (soft) update: target <- tau * net + (1 - tau) * target
        # (a standalone sketch of this rule follows this example)
        if tau is None:
            tau = self.tau
        for par, target_par in zip(network.parameters(),
                                   target_network.parameters()):
            target_par.data.copy_(tau * par.data + (1 - tau) * target_par.data)

        #
        # # TODO: check that this is equivalent to Phil's method
        # # during the class initialization we call this method with tau=1, to perform an exact copy of the nets to targets
        # # then when we call this without specifying tau, we use the field stored
        # if tau is None:
        #     tau = self.tau
        #
        # actor_params = self.actor.children()
        # target_actor_params = self.target_actor.children()
        # self.__copy_param(actor_params, target_actor_params, tau)
        #
        # critic_params1 = self.critic_1.children()
        # target_critic_1_params = self.target_critic_1.children()
        # self.__copy_param(critic_params1, target_critic_1_params, tau)
        #
        # critic_params2 = self.critic_2.children()
        # target_critic_2_params = self.target_critic_2.children()
        # self.__copy_param(critic_params2, target_critic_2_params, tau)

    def learn(self):
        self.learn_step_counter += 1

        # skip learning until the replay memory contains at least one full batch
        if self.memory.mem_counter < self.batch_size:
            return
        state_batch, action_batch, reward_batch, new_state_batch, done_batch = self.sample_transitions(
        )
        # Target policy smoothing: noise clipped to +-0.5, as in the TD3 paper; the
        # clipping range may need tuning when the action bounds are asymmetric (e.g. -2 and 1)
        noise = torch.tensor(
            np.clip(
                np.random.normal(self.noise_mean, 0.2, size=self.n_actions),
                -0.5, 0.5),
            dtype=torch.float).to(self.actor.device)
        target_next_action = torch.clamp(
            self.target_actor(new_state_batch) + noise, self.low_action,
            self.max_action)

        target_q1_ = self.target_critic_1(new_state_batch, target_next_action)
        target_q2_ = self.target_critic_2(new_state_batch, target_next_action)
        # take the element-wise minimum of the two target critics (clipped double-Q;
        # restated as a standalone helper after this class)
        target_q_ = torch.min(target_q1_, target_q2_)
        target_q_[done_batch] = 0.0
        # flatten to match reward_batch so the addition stays element-wise
        target = target_q_.view(-1)
        target = reward_batch + self.gamma * target
        target = target.view(self.batch_size, 1)

        q_val1 = self.critic_1(state_batch, action_batch)
        q_val2 = self.critic_2(state_batch, action_batch)

        critic_loss1 = F.mse_loss(q_val1, target)
        critic_loss2 = F.mse_loss(q_val2, target)
        critic_loss = critic_loss1 + critic_loss2

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        critic_loss.backward()
        #critic_loss1.backward()
        #critic_loss2.backward()

        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()

        # Delayed policy updates: refresh the actor and targets only every update_actor_interval steps
        if self.learn_step_counter % self.update_actor_interval == 0:
            action = self.actor(state_batch)
            # compute actor loss proportional to the estimated value from q1 given state, action pairs, where the action
            # is recomputed using the new policy
            actor_loss = -torch.mean(self.critic_1(state_batch, action))

            self.actor.optimizer.zero_grad()
            actor_loss.backward()
            self.actor.optimizer.step()

            self.update_network_parameters(self.actor, self.target_actor,
                                           self.tau)
            self.update_network_parameters(self.critic_1, self.target_critic_1,
                                           self.tau)
            self.update_network_parameters(self.critic_2, self.target_critic_2,
                                           self.tau)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
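
The target value assembled in learn() above is the clipped double-Q target from TD3. The helper below is a hedged sketch, not part of the original class: it restates that computation on its own, assuming `reward` and `done` are 1-D tensors and `q1_next`, `q2_next` are the two target critics' outputs for the smoothed next action.

import torch

def clipped_double_q_target(reward, done, q1_next, q2_next, gamma=0.99):
    """y = r + gamma * (1 - done) * min(Q1'(s', a'), Q2'(s', a'))."""
    # take the pessimistic estimate of the two target critics, flattened to 1-D
    q_next = torch.min(q1_next, q2_next).view(-1)
    return reward + gamma * (1.0 - done.float()) * q_next
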
    def __init__(self, game, agent_type, display, load_model, record, test):
        self.name = game
        self.agent_type = agent_type
        self.ale = ALEInterface()
        self.ale.setInt(str.encode('random_seed'), np.random.randint(100))
        self.ale.setBool(str.encode('display_screen'), display or record)
        if record:
            self.ale.setString(str.encode('record_screen_dir'), str.encode('./data/recordings/{}/{}/tmp/'.format(game, agent_type)))

        self.ale.loadROM(str.encode('./roms/{}.bin'.format(self.name)))
        self.action_list = list(self.ale.getMinimalActionSet())
        self.frame_shape = np.squeeze(self.ale.getScreenGrayscale()).shape
        if test:
            self.name += '_test'

        if 'space_invaders' in self.name:
            # Account for blinking bullets
            self.frameskip = 2
        else:
            self.frameskip = 3

        self.frame_buffer = deque(maxlen=4)
        if load_model and not record:
            self.load_replaymemory()
        else:
            self.replay_memory = ReplayMemory(500000, 32)

        model_input_shape = self.frame_shape + (4,)
        model_output_shape = len(self.action_list)

        if agent_type == 'dqn':
            self.model = DeepQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )
        elif agent_type == 'double':
            self.model = DoubleDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        else:
            self.model = DuelingDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        print('{} Loaded!'.format(' '.join(self.name.split('_')).title()))
        print('Displaying: ', display)
        print('Frame Shape: ', self.frame_shape)
        print('Frame Skip: ', self.frameskip)
        print('Action Set: ', self.action_list)
        print('Model Input Shape: ', model_input_shape)
        print('Model Output Shape: ', model_output_shape)
        print('Agent: ', agent_type)
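
update_network_parameters in the TD3 Agent above is a Polyak (soft) update: each target parameter moves toward its online counterpart as target <- tau * online + (1 - tau) * target, and tau=1 reduces to an exact copy. Below is a minimal PyTorch sketch of the same rule, written as a free function purely for illustration (the name soft_update is not from the original code).

import torch

@torch.no_grad()
def soft_update(net, target_net, tau):
    """Polyak averaging: target <- tau * net + (1 - tau) * target."""
    for p, tp in zip(net.parameters(), target_net.parameters()):
        tp.data.copy_(tau * p.data + (1.0 - tau) * tp.data)
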
Example #31
class Learner():
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

        self.actions_q = tf.placeholder(shape=[None, a_size, N],
                                        dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars  #tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        #grads,self.grad_norms = tf.clip_by_norm(self.gradients,40.0)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())

    def run(self, gamma, s_size, a_size, batch_size, env):
        print('start learning')
        step, train1 = 0, False
        epi_q = []
        self.env = env
        while True:
            # drain transitions sent by the actors into the replay memory
            while not self.queue.empty():
                t_error = self.queue.get()
                step += 1
                self.replaymemory.add(t_error)

            if self.param_queue.empty():
                params = self.sess.run(self.local_vars)
                self.param_queue.put(params)

            if step >= 10000:
                train1 = True
                step = 0

            if train1:
                episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(
                    batch_size)
                episode_buffer = np.array(episode_buffer)
                observations = episode_buffer[:, 0]
                actions = episode_buffer[:, 1]
                rewards = episode_buffer[:, 2]
                observations_next = episode_buffer[:, 3]
                dones = episode_buffer[:, 4]
                Q_target = self.sess.run(self.Q,
                                         feed_dict={
                                             self.learner_net.inputs:
                                             np.vstack(observations_next)
                                         })

                actions_ = np.argmax(Q_target, axis=1)

                action = np.zeros((batch_size, a_size))
                action_ = np.zeros((batch_size, a_size))
                for i in range(batch_size):
                    action[i][actions[i]] = 1
                    action_[i][actions_[i]] = 1
                # tile the one-hot action masks across the N quantiles
                # (a vectorized sketch follows this example)
                action_now = np.zeros((batch_size, a_size, N))
                action_next = np.zeros((batch_size, a_size, N))
                for i in range(batch_size):
                    for j in range(a_size):
                        for k in range(N):
                            action_now[i][j][k] = action[i][j]
                            action_next[i][j][k] = action_[i][j]
                q_target = self.sess.run(self.q_action,
                                         feed_dict={
                                             self.learner_net.inputs:
                                             np.vstack(observations_next),
                                             self.actions_q:
                                             action_next
                                         })

                q_target_batch = []
                for i in range(len(q_target)):
                    qi = q_target[i]
                    z_target_step = []
                    for j in range(len(qi)):
                        z_target_step.append(gamma * qi[j] * (1 - dones[i]) +
                                             rewards[i])
                    q_target_batch.append(z_target_step)
                q_target_batch = np.array(q_target_batch)

                isweight = np.zeros((batch_size, N))
                for i in range(batch_size):
                    for j in range(N):
                        isweight[i, j] = ISWeights[i]
                feed_dict = {
                    self.q_target: q_target_batch,
                    self.learner_net.inputs: np.vstack(observations),
                    self.actions_q: action_now,
                    self.ISWeights: isweight
                }

                l, abs_errors, _ = self.sess.run(
                    [self.loss, self.u, self.apply_grads], feed_dict=feed_dict)
                abs_errors = np.mean(abs_errors, axis=1) + 1e-6

                self.replaymemory.update_priorities(tree_idx, abs_errors)
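
The triple-nested loops in run() above build a one-hot action mask and tile it across the N quantiles. The same arrays can be produced with vectorized NumPy operations; the helper below is a sketch under the assumption that `actions` is an integer index array of shape (batch_size,) (the function name is illustrative, not from the original code).

import numpy as np

def quantile_action_mask(actions, a_size, n_quantiles):
    """One-hot encode `actions`, then tile across quantiles -> shape (batch, a_size, N)."""
    actions = np.asarray(actions, dtype=np.int64)
    one_hot = np.zeros((len(actions), a_size), dtype=np.float32)
    one_hot[np.arange(len(actions)), actions] = 1.0
    # repeat the (batch, a_size, 1) mask along the quantile axis
    return np.repeat(one_hot[:, :, np.newaxis], n_quantiles, axis=2)
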
Example #32
class Worker():
    def __init__(self,env,name,s_size,a_size,trainer,model_path,global_episodes):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        #Create the local copy of the network and the tensorflow op to copy global parameters to the local network
        self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.env = env
        self.replaymemory = ReplayMemory(max_memory)
        
    def train(self,rollout,sess,gamma,ISWeights):
        rollout = np.array(rollout)
        observations      = rollout[:,0]
        actions           = rollout[:,1]
        rewards           = rollout[:,2]
        next_observations = rollout[:,3]
        dones             = rollout[:,4]
        
        Q_target = sess.run(self.local_Q.Q, feed_dict={self.local_Q.inputs:np.vstack(next_observations)})
        actions_ = np.argmax(Q_target, axis=1)
        
        action = np.zeros((batch_size, a_size))
        action_ = np.zeros((batch_size, a_size))
        for i in range(batch_size):
            action[i][actions[i]] = 1
            action_[i][actions_[i]] = 1
        action_now = np.zeros((batch_size, a_size, N))
        action_next = np.zeros((batch_size, a_size, N))
        for i in range(batch_size):
            for j in range(a_size):
                for k in range(N):
                    action_now[i][j][k] = action[i][j]
                    action_next[i][j][k] = action_[i][j]

        q_target = sess.run(self.local_Q.q_action, feed_dict={self.local_Q.inputs:np.vstack(next_observations),
                                                               self.local_Q.actions_q:action_next})
        q_target_batch = []
        for i in range(len(q_target)):
            qi = q_target[i]# * (1 - dones[i])
            z_target_step = []
            for j in range(len(qi)):
                z_target_step.append(gamma * qi[j] + rewards[i])
            q_target_batch.append(z_target_step)
        q_target_batch = np.array(q_target_batch)
        isweight = np.zeros((batch_size,N))
        for i in range(batch_size):
            for j in range(N):
                isweight[i,j] = ISWeights[i]
        feed_dict = {self.local_Q.inputs:np.vstack(observations),
                     self.local_Q.actions_q:action_now,
                     self.local_Q.q_target:q_target_batch,
                     self.local_Q.ISWeights:isweight}
        u,l,g_n,v_n,_ = sess.run([self.local_Q.u,
                                  self.local_Q.loss,
                                  self.local_Q.grad_norms,
                                  self.local_Q.var_norms,
                                  self.local_Q.apply_grads],feed_dict=feed_dict)
        return l/len(rollout), g_n, v_n, Q_target, u

    def work(self,gamma,sess,coord,saver):
        global GLOBAL_STEP
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        epsilon = 0.2
        
        print ("Starting worker " + str(self.number))
        best_mean_episode_reward = -float('inf')
        with sess.as_default(), sess.graph.as_default():
            while not coord.should_stop():
                sess.run(self.update_local_ops)
                #episode_buffer = []
                episode_reward = 0
                episode_step_count = 0
                d = False
                s = self.env.reset()
                s = process_frame(s)
                if epsilon > 0.01:
                    epsilon = epsilon * 0.997
                while not d:
                    #self.env.render()
                    GLOBAL_STEP += 1
                    #Choose an action epsilon-greedily from the Q-network output.
                    if random.random() > epsilon:
                        a_dist_list = sess.run(self.local_Q.Q, feed_dict={self.local_Q.inputs:[s]})
                        a_dist = a_dist_list[0]
                        a = np.argmax(a_dist)
                    else:
                        # random exploratory action (the action space is assumed to have 6 discrete actions)
                        a = random.randint(0, 5)
                    
                    s1, r, d, _ = self.env.step(a)
                    if not d:
                        s1 = process_frame(s1)
                    else:
                        s1 = s
                    self.replaymemory.add([s,a,r,s1,d])
                    episode_reward += r
                    s = s1                    
                    total_steps += 1
                    episode_step_count += 1
                    if total_steps % 2 == 0 and not d and total_steps > 50000:
                        episode_buffer, tree_idx, ISWeights = self.replaymemory.sample(batch_size)
                        l,g_n,v_n,Q_target,u = self.train(episode_buffer,sess,gamma,ISWeights)
                        u = np.mean(u,axis=1) + 1e-6
                        self.replaymemory.update_priorities(tree_idx,u)
                        #sess.run(self.update_local_ops)
                    if d:
                        break
                sess.run(self.update_local_ops)
                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)

                # Periodically log training statistics and save model parameters.
                if episode_count % 5 == 0 and episode_count != 0 and total_steps > max_memory:
                    if self.name == 'worker_0' and episode_count % 5 == 0:
                        print('\n episode: ', episode_count, 'global_step:', \
                              GLOBAL_STEP, 'mean episode reward: ', np.mean(self.episode_rewards[-10:]), \
                              'epsilon: ', epsilon)
                    
                    print ('loss', l, 'Qtargetmean', np.mean(Q_target))
                    if episode_count % 100 == 0 and self.name == 'worker_0' and total_steps > 10000:
                        saver.save(sess,self.model_path+'/qr-dqn-'+str(episode_count)+'.cptk')
                        print ("Saved Model")

                    mean_reward = np.mean(self.episode_rewards[-100:])
                    if episode_count > 20 and best_mean_episode_reward < mean_reward:
                        best_mean_episode_reward = mean_reward

                if self.name == 'worker_0':
                    sess.run(self.increment)
                    #if episode_count%1==0:
                        #print('\r {} {}'.format(episode_count, episode_reward),end=' ')
                episode_count += 1