Example #1
    def __init__(self, state_size, action_size, seed, is_prioritized_sample=False):
        '''Initialize an Agent.

        Params
        ======
            state_size (int): the dimension of the state
            action_size (int): the number of actions
            seed (int): random seed
            is_prioritized_sample (bool): whether to use prioritized experience replay
        '''

        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed
        random.seed(seed)
        self.t_step = 0 # Initialize time step (for tracking LEARN_EVERY_STEP and UPDATE_EVERY_STEP)

        self.is_prioritized_sample = is_prioritized_sample

        self.qnetwork_local = QNetwork(self.state_size, self.action_size, seed).to(device)
        self.qnetwork_target = QNetwork(self.state_size, self.action_size, seed).to(device)
        
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        
        if not self.is_prioritized_sample:
            self.replay_memory = ReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
        else:
            self.replay_memory = PrioritizedReplayMemory(BATCH_SIZE, BUFFER_SIZE, seed)
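For reference, a minimal sketch of the uniform replay buffer this example appears to assume; the ReplayMemory(batch_size, buffer_size, seed) signature and the add/sample methods are illustrative assumptions, not the original implementation:

import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'next_state', 'done'])

class ReplayMemory:
    """Fixed-size buffer with uniform random sampling (illustrative sketch)."""

    def __init__(self, batch_size, buffer_size, seed):
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)  # oldest entries are dropped automatically
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniform sampling; a prioritized variant would weight draws by TD error instead.
        return random.sample(self.memory, self.batch_size)

    def __len__(self):
        return len(self.memory)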
Example #2
    def __init__(self, sess, s_size, a_size, scope, queues, trainer):
        self.queue = queues[0]
        self.param_queue = queues[1]
        self.replaymemory = ReplayMemory(100000)
        self.sess = sess
        self.learner_net = network(s_size, a_size, scope, 20)

        self.q = self.learner_net.q
        self.Q = self.learner_net.Q

        self.actions_q = tf.placeholder(shape=[None, a_size, N],
                                        dtype=tf.float32)
        self.q_target = tf.placeholder(shape=[None, N], dtype=tf.float32)
        self.ISWeights = tf.placeholder(shape=[None, N], dtype=tf.float32)

        self.q_actiona = tf.multiply(self.q, self.actions_q)
        self.q_action = tf.reduce_sum(self.q_actiona, axis=1)
        self.u = tf.abs(self.q_target - self.q_action)
        self.loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.u) * self.ISWeights, axis=1))

        self.local_vars = self.learner_net.local_vars  #tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        self.gradients = tf.gradients(self.loss, self.local_vars)
        #grads,self.grad_norms = tf.clip_by_norm(self.gradients,40.0)
        self.apply_grads = trainer.apply_gradients(
            zip(self.gradients, self.local_vars))
        self.sess.run(tf.global_variables_initializer())
Example #3
    def __init__(self, policy_net, target_net, durability, optimizer, name,
                 constants):
        """An agent class that takes action on the environment and optimizes
        the action based on the reward.

        Parameters
        ----------
        policy_net : DQN
            [description]
        target_net : DQN
            [description]
        durability : int
            [description]
        optimizer : [type]
            [description]
        name : str
            The name of the agent
        constants : Constants
            The hyper-parameters from the Constants class
        """
        self.CONSTANTS = constants
        self.policy_net = policy_net
        self.target_net = target_net
        self.target_net.load_state_dict(policy_net.state_dict())
        self.durability = durability
        self.optimizer = optimizer
        self.name = name
        self.memory = ReplayMemory(self.CONSTANTS.MEMORY_SIZE)
        self.steps_done = 0
        self.total_reward = 0.0
        self.reward = 0.0
        self.obtained_reward = 0.0
        self.n_best = 0
        self.policy_net_flag = False
Example #4
    def __init__(self, env, mode, pre_trained_model, tensorboard_writer=None):
        super(DQNAgent, self).__init__(env, mode, tensorboard_writer)
        self.agent_name = 'DQN' + str(self.agent_no)
        self.memory = ReplayMemory()

        self.network = DeepQNetwork(self.obs_space[0], self.action_space)

        if self.mode == 'play':
            self.network.load_params(pre_trained_model)
            self.network.eval()

        elif self.mode == 'train':

            self.eval_network = DeepQNetwork(self.obs_space[0],
                                             self.action_space)
            self.eval_network.eval()

            if pre_trained_model:
                self.eval_network.load_params(pre_trained_model)

            self.optimizer = optim.RMSprop(self.network.parameters(), lr=LR)
            self.loss_func = SmoothL1Loss()
        else:
            raise ValueError(
                'Please set a valid mode for the agent (play or train)')
Example #5
    def __init__(self,
                 load_checkpoint,
                 n_states,
                 n_actions,
                 checkpoint_file,
                 mem_size=10**6,
                 batch_size=64,
                 n_hid1=400,
                 n_hid2=300,
                 alpha=1e-4,
                 beta=1e-3,
                 gamma=0.99,
                 tau=0.99):
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  alpha,
                                  checkpoint_file,
                                  name='actor')
        self.critic = CriticNetwork(n_states,
                                    n_actions,
                                    n_hid1,
                                    n_hid2,
                                    beta,
                                    checkpoint_file,
                                    name='critic')

        self.actor_target = ActorNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         alpha,
                                         checkpoint_file,
                                         name='actor_target')
        self.critic_target = CriticNetwork(n_states,
                                           n_actions,
                                           n_hid1,
                                           n_hid2,
                                           beta,
                                           checkpoint_file,
                                           name='critic_target')

        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        self.memory = ReplayMemory(mem_size, n_states, n_actions)
        self.update_network_parameters_phil(tau=1)
        if load_checkpoint:
            self.actor.eval()
        self.load_checkpoint = load_checkpoint
Example #6
def main(game, episodes, training_mode=False, log=False, no_ops=30):
    env = gym.make(game)
    num_actions = env.action_space.n
    dqn = DeepQNetwork(num_actions, (4, 84, 84))
    replay = ReplayMemory(100000)
    obs = env.reset()
    h, w, c = obs.shape
    phi = Phi(4, 84, 84, c, h, w)
    agent = Agent(replay, dqn, training_mode=training_mode)
    stats = Stats('results/results.csv')

    for i_episode in range(episodes):
        env.reset()

        for i in range(random.randint(1, no_ops)):
            observation, _, _, _ = env.step(0)
            pre_state = phi.add(observation)

        game_score = 0
        done = False
        t = 0

        while not done:
            t += 1
            env.render()
            action = agent.get_action(pre_state)
            observation, reward, done, _ = env.step(action)
            post_state = phi.add(observation)

            if training_mode:
                agent.update_replay_memory(pre_state, action, reward,
                                           post_state, done)
                if agent.time_step > agent.replay_start_size:
                    stats.log_time_step(agent.get_loss())

            pre_state = post_state
            game_score += reward

        print("Episode {} finished after {} time steps with score {}".format(
            i_episode, t, game_score))
        phi.reset()
        if agent.time_step > agent.replay_start_size:
            stats.log_game(game_score, t)

    stats.close()

    if log:
        dqn.save_model('results/model_weights.hdf5')
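The loop above defers action selection to agent.get_action(pre_state); below is a minimal sketch of the epsilon-greedy rule such an agent typically applies (the function name and signature are illustrative assumptions, not the original Agent API):

import numpy as np

def epsilon_greedy_action(q_values, epsilon, num_actions):
    # Explore with probability epsilon, otherwise act greedily on the Q-values.
    if np.random.rand() < epsilon:
        return np.random.randint(num_actions)
    return int(np.argmax(q_values))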
Example #7
 def __init__(self, env, name, s_size, a_size, trainer, model_path,
              global_episodes):
     self.name = "worker_" + str(name)
     self.number = name
     self.model_path = model_path
     self.trainer = trainer
     self.global_episodes = global_episodes
     self.increment = self.global_episodes.assign_add(1)
     self.episode_rewards = []
     self.episode_lengths = []
     self.episode_mean_values = []
     # Create the local copy of the network and the TensorFlow op to copy global parameters to the local network
     self.local_Q = Q_Network(s_size, a_size, self.name, trainer)
     self.update_local_ops = update_target_graph('global', self.name)
     self.env = env
     self.replaymemory = ReplayMemory(max_memory)
Example #8
def main():
    game = FlappyBird()
    env = PLE(game, fps=30, display_screen=False)
    env_evaluate = PLE(game, fps=30, display_screen=False)
    obs_dim = len(env.getGameState())
    action_dim = 2  # only the "up" key plus one other action, so the dimension is 2

    # rpm = ReplayMemory(MEMORY_SIZE, obs_dim, action_dim)
    rpm = ReplayMemory(MEMORY_SIZE)

    model = Model(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = Agent(
        algorithm,
        obs_dim=obs_dim,
        act_dim=action_dim,
        e_greed=0.2,  # explore
        e_greed_decrement=1e-6
    )  # probability of exploring is decreasing during training

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    # while rpm.size() < MEMORY_WARMUP_SIZE:  # warm up replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 5000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env_evaluate)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))

    agent.save('./model_dir')
Example #9
 def __init__(self, num_states, num_actions, Double, Dueling, PER):
     self.num_actions = num_actions  # number of possible actions (2)
     self.Double = Double
     self.Dueling = Dueling
     self.PER = PER
     
     # create a memory object to store transitions
     self.memory = ReplayMemory(CAPACITY)
     
     # build the neural networks
     n_in, n_mid, n_out = num_states, 32, num_actions
     self.main_q_network = Net(n_in, n_mid, n_out, Dueling)  # uses the Net class
     self.target_q_network = Net(n_in, n_mid, n_out, Dueling)  # uses the Net class
     print(self.main_q_network)  # print the network architecture
     
     # choose the optimizer
     self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=0.0001)
     
     # PER: create a memory object to store TD errors
     if self.PER:
         self.td_error_memory = TDerrorMemory(CAPACITY)
Example #10
    def __init__(self, dim):
        self.critic_path = cst.CN_CKPT_PATH
        self.actor_path = cst.AN_CKPT_PATH
        self.replaymemory_path = cst.RM_PATH

        self.dim_body = dim[0]
        self.dim_sensor = dim[1]
        self.dim_state = dim[0] + dim[1] * 3
        self.dim_action = dim[2]

        self.sess = tf.InteractiveSession()
        self.act_lr = cst.ACT_LEARNING_RATE
        self.cri_lr = cst.CRI_LEARNING_RATE
        self.tau = cst.TAU
        self.batch_size = cst.BATCH_SIZE
        self.gamma = cst.REWARD_DECAY

        self.actorNN = ActorNetwork(self.sess, self.dim_state, self.dim_action,
                                    self.act_lr, self.tau, self.batch_size)
        self.criticNN = CriticNetwork(self.sess, self.dim_state,
                                      self.dim_action, self.cri_lr, self.tau,
                                      self.gamma,
                                      self.actorNN.get_num_trainable_vars())

        self.sess.run(tf.global_variables_initializer())

        self.actorNN.update_target_network()
        self.criticNN.update_target_network()

        self.rm = ReplayMemory('DDPG')

        self.agent_count = cst.AGENT_COUNT
        self.exploration_rate = cst.EXPLORATION_RATE
        self.epsilon = cst.CRITIC_EPSILON
        self.LOSS_ITERATION = cst.LOSS_ITERATION

        self.expl_noise = OUNoise(self.dim_action)

        self.expl = False
        self.expl_decay = cst.EXPLORATION_DECAY
Example #11
N_EPOCHS = 4
N_SAMPLES = 1000
SAMPLE_LENGTH = 15
memory_capacity = 2000
GAMMA = .997
LAMBDA = .95
EPSILON = .2
TARGET_DISCOUNT = .4
N_TIMESTEPS_PER_UPDATE = 300
# ~~~~~~~~~~~~~~~~~~

# Initialization
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
env = gym.make('CartPole-v1')

replay_memory = ReplayMemory(memory_capacity)

policy_net = Actor(sum(env.observation_space.shape), 200, env.action_space.n)
value_net = Critic(sum(env.observation_space.shape), 200, 1)
target_value_net = Critic(sum(env.observation_space.shape), 200, 1)
target_value_net.load_state_dict(value_net.state_dict())
target_value_net.eval()

params = list(policy_net.parameters()) + list(value_net.parameters())
optimizer = optim.SGD(params, lr=1e-3, momentum=.9, weight_decay=1e-6)

writer = SummaryWriter()

reward_normalizer = RewardNormalizer()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
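GAMMA and LAMBDA above are the usual discount and GAE coefficients, and EPSILON looks like a PPO clipping range; the following is a minimal sketch of generalized advantage estimation under those assumptions, not necessarily how this script computes it:

def compute_gae(rewards, values, dones, last_value, gamma=GAMMA, lam=LAMBDA):
    # Walk backwards through one rollout, accumulating the exponentially
    # weighted sum of TD residuals (the delta terms).
    advantages = []
    gae = 0.0
    next_value = last_value
    for reward, value, done in zip(reversed(rewards), reversed(values), reversed(dones)):
        mask = 0.0 if done else 1.0
        delta = reward + gamma * next_value * mask - value
        gae = delta + gamma * lam * mask * gae
        advantages.insert(0, gae)
        next_value = value
    return advantages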
Example #12
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 update_actor_interval=2,
                 warmup=1000,
                 mem_size=10**6,
                 batch_size=100,
                 n_hid1=400,
                 n_hid2=300,
                 lr_alpha=1e-3,
                 lr_beta=1e-3,
                 gamma=0.99,
                 tau=5e-3,
                 noise_mean=0,
                 noise_sigma=0.1):

        self.load_checkpoint = load_checkpoint
        self.checkpoint_file = checkpoint_file
        # needed for clamping in the learn function
        self.env = env
        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.n_actions = n_actions
        # track how often "learn" is called, used for the delayed actor updates
        self.learn_step_counter = 0
        # counts down the warmup period; incremented every time an action is chosen
        self.time_step = 0
        self.update_actor_interval = update_actor_interval
        self.warmup = warmup
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.noise_mean = noise_mean
        self.noise_sigma = noise_sigma

        self.actor = TD3ActorNetwork(n_states,
                                     n_actions,
                                     n_hid1,
                                     n_hid2,
                                     lr_alpha,
                                     checkpoint_file,
                                     name='actor')
        self.target_actor = TD3ActorNetwork(n_states,
                                            n_actions,
                                            n_hid1,
                                            n_hid2,
                                            lr_alpha,
                                            checkpoint_file,
                                            name='target_actor')

        self.critic_1 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_1')
        self.critic_2 = TD3CriticNetwork(n_states,
                                         n_actions,
                                         n_hid1,
                                         n_hid2,
                                         lr_beta,
                                         checkpoint_file,
                                         name='critic_2')
        self.target_critic_1 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_1')
        self.target_critic_2 = TD3CriticNetwork(n_states,
                                                n_actions,
                                                n_hid1,
                                                n_hid2,
                                                lr_beta,
                                                checkpoint_file,
                                                name='target_critic_2')

        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        # tau=1 performs an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.actor, self.target_actor, tau=1)
        self.update_network_parameters(self.critic_1,
                                       self.target_critic_1,
                                       tau=1)
        self.update_network_parameters(self.critic_2,
                                       self.target_critic_2,
                                       tau=1)
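The three update_network_parameters calls above copy each online network into its target (tau=1 turns the Polyak average into an exact copy); a minimal sketch of such a soft-update helper, assuming PyTorch modules and not necessarily the author's exact implementation:

    def update_network_parameters(self, network, target_network, tau=None):
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        # With tau=1 this reduces to the hard copy used at initialization above.
        if tau is None:
            tau = self.tau
        for param, target_param in zip(network.parameters(),
                                       target_network.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)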
Example #13
image_dimensions = 210 * 160 * 3
num_episodes = 50
target_episode_update = 5
action_threshold = 250
train_batch_size = 64

GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
steps_done = 0
n_actions = env.action_space.n
screen_height = 210
screen_width = 160

memory = ReplayMemory(10000)

policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())  # start the target in sync with the policy net
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())


def optimize_model():
    if len(memory) < train_batch_size:
        return

    transitions = memory.sample(train_batch_size)
    print('Training on:', len(transitions))
Example #14
    def __init__(self,
                 game,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 frameskip = 4
                 ):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - frameskip: int number of frames to skip between agent actions
        """

        # Namestring
        self.game = game

        # Environment
        self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())

        self.target_net = DQN(channels_in = state_buffer_size,
                              num_actions = self.env.get_number_of_actions())
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500
Example #15
    def __init__(
            self,
            game,
            mem_size=512 * 512,  #1024*512,
            state_buffer_size=4,
            batch_size=64,
            learning_rate=1e-5,
            pretrained_model=None,
            frameskip=4,  #1
            record=False):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - frameskip: int number of frames to skip between agent actions
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders)
        #if self.game == 'Breakout-v0':
        #    dimensions = (32, 195, 8, 152)
        #elif self.game == 'SpaceInvaders-v0':
        #    dimensions = (21, 195, 20, 141)
        #elif self.game == 'Assault-v0':
        #    dimensions = (50, 240, 5, 155)
        #elif self.game == 'Phoenix-v0':
        #    dimensions = (23, 183, 0, 160)
        #elif self.game == 'Skiing-v0':
        #    dimensions = (55, 202, 8, 152)
        #elif self.game == 'Enduro-v0':
        #    dimensions = (50, 154, 8, 160)
        #elif self.game == 'BeamRider-v0':
        #    dimensions = (32, 180, 9, 159)

        if self.game == 'BreakoutAndSpace':
            dimensions_break = (32, 195, 8, 152)
            dimensions_space = (21, 195, 20, 141)
        else:
            raise ValueError(
                'This version is for playing Breakout and SpaceInvaders at the same time.'
            )

        # Environment
        self.env_break = Environment('BreakoutNoFrameskip-v4',
                                     dimensions_break,
                                     frameskip=frameskip)
        self.env_space = Environment('SpaceInvaders-v0',
                                     dimensions_space,
                                     frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in=state_buffer_size,
                       num_actions=self.env_space.get_number_of_actions())

        self.target_net = DQN(
            channels_in=state_buffer_size,
            num_actions=self.env_space.get_number_of_actions())

        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr = 0.00025,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 4
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size,
                                   num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 25000
        else:
            self.start_train_after = mem_size // 2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500
Example #16
EPISODES = 500
START_RANDOM = False
MAX_EPISODE_COUNTER = 3600 * 24 * 2.0 / PERIOD
ACTION_DIM = 1
STATE_DIM = 6
ACTION_MAX = 1.0
MAX_BUFFER = 100000
MAX_TOTAL_REWARD = 300
EPISODE_PLOT = 25

# -------------------------------------------- #
# LOAD USEFUL CLASSES.
# -------------------------------------------- #

# Load the memory.
memory = ReplayMemory(MAX_BUFFER)

# Load the environment.
env = Environment(FILENAME, QUOTE_QTY, TRADE_QTY)

# Load the trainer.
trainer = Trainer(STATE_DIM, ACTION_DIM, ACTION_MAX, memory)

# Load the window.
window = Window(LOOK_BACK)
window.add_norm("#t", method="log_change", ref="close_price_#t")

# Load the tensorboard writer.
writer = SummaryWriter("tensorboard/runs")

# -------------------------------------------- #
Example #17
    def __init__(self, game, agent_type, display, load_model, record, test):
        self.name = game
        self.agent_type = agent_type
        self.ale = ALEInterface()
        self.ale.setInt(str.encode('random_seed'), np.random.randint(100))
        self.ale.setBool(str.encode('display_screen'), display or record)
        if record:
            self.ale.setString(str.encode('record_screen_dir'), str.encode('./data/recordings/{}/{}/tmp/'.format(game, agent_type)))

        self.ale.loadROM(str.encode('./roms/{}.bin'.format(self.name)))
        self.action_list = list(self.ale.getMinimalActionSet())
        self.frame_shape = np.squeeze(self.ale.getScreenGrayscale()).shape
        if test:
            self.name += '_test'

        if 'space_invaders' in self.name:
            # Account for blinking bullets
            self.frameskip = 2
        else:
            self.frameskip = 3

        self.frame_buffer = deque(maxlen=4)
        if load_model and not record:
            self.load_replaymemory()
        else:
            self.replay_memory = ReplayMemory(500000, 32)

        model_input_shape = self.frame_shape + (4,)
        model_output_shape = len(self.action_list)

        if agent_type == 'dqn':
            self.model = DeepQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )
        elif agent_type == 'double':
            self.model = DoubleDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        else:
            self.model = DuelingDQN(
                model_input_shape,
                model_output_shape,
                self.action_list,
                self.replay_memory,
                self.name,
                load_model
            )

        print('{} Loaded!'.format(' '.join(self.name.split('_')).title()))
        print('Displaying: ', display)
        print('Frame Shape: ', self.frame_shape)
        print('Frame Skip: ', self.frameskip)
        print('Action Set: ', self.action_list)
        print('Model Input Shape: ', model_input_shape)
        print('Model Output Shape: ', model_output_shape)
        print('Agent: ', agent_type)
Example #18
if not os.path.exists(model_path):
    os.makedirs(model_path)

env = get_env(task)
a_size = env.action_space.n

global_episodes = tf.Variable(0,
                              dtype=tf.int32,
                              name='global_episodes',
                              trainable=False)
trainer = tf.train.AdamOptimizer(learning_rate=0.00015)
num_workers = 4
batch_size = 10
max_memory = 300000
replaymemory = ReplayMemory(max_memory)
saver = tf.train.Saver(max_to_keep=5)
lock = threading.Lock()

with tf.Session() as sess:
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()
    ROLLING_EVENT.set()

    GLOBAL_STEP = 0
    coord = tf.train.Coordinator()
    master_network = Apex_Network(sess, s_size, a_size, 'global', trainer)
    workers = []
    for i in range(num_workers):
        env = get_env(task)
        workers.append(
Example #19
    def __init__(self,
                 load_checkpoint,
                 checkpoint_file,
                 env,
                 n_states,
                 n_actions,
                 mem_size=10**6,
                 batch_size=256,
                 n_hid1=256,
                 n_hid2=256,
                 lr=3e-4,
                 gamma=0.99,
                 tau=5e-3,
                 reward_scale=2):

        self.load_checkpoint = load_checkpoint

        self.max_action = float(env.action_space.high[0])
        self.low_action = float(env.action_space.low[0])

        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.reward_scale = reward_scale

        self.memory_counter = 0
        self.memory = ReplayMemory(mem_size, n_states, n_actions)

        self.actor = ActorNetwork(n_states,
                                  n_actions,
                                  n_hid1,
                                  n_hid2,
                                  self.max_action,
                                  lr,
                                  checkpoint_file,
                                  name='_actor')
        self.critic_1 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_critic1')
        self.critic_2 = CriticNetwork(n_states,
                                      n_actions,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_critic2')

        self.value_net = ValueNetwork(n_states,
                                      n_hid1,
                                      n_hid2,
                                      lr,
                                      checkpoint_file,
                                      name='_value')
        self.target_value_net = ValueNetwork(n_states,
                                             n_hid1,
                                             n_hid2,
                                             lr,
                                             checkpoint_file,
                                             name='_value_target')

        # tau=1 performs an exact copy of the networks to the respective targets
        # self.update_network_parameters(tau=1)
        self.update_network_parameters(self.value_net,
                                       self.target_value_net,
                                       tau=1)
Example #20
    def __init__(self,
                 game1,
                 game2,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 pretrained_subnet1 = False,
                 pretrained_subnet2 = False,
                 frameskip = 4,
                 frozen = False
                 ):
        """
        Inputs:
        - game1: string to select the first game
        - game2: string to select the second game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - pretrained_subnet1: str path to the model of the first subnet
        - pretrained_subnet2: str path to the model of the second subnet
        - frameskip: int number of frames to skip between agent actions
        - frozen: boolean to freeze the pretrained subnets
        """

        # Namestring
        self.game1 = game1
        self.game2 = game2

        # Environment
        self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip)
        self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip)


        # Neural net
        self.pretrained_subnet1 = pretrained_subnet1
        self.pretrained_subnet2 = pretrained_subnet2
        self.net = TwinDQN(channels_in = state_buffer_size,
                             num_actions = self.env2.get_number_of_actions(),
                             pretrained_subnet1 = pretrained_subnet1,
                             pretrained_subnet2 = pretrained_subnet2,
                             frozen = frozen)
        self.target_net = TwinDQN(channels_in = state_buffer_size,
                                    num_actions = self.env2.get_number_of_actions(),
                                    pretrained_subnet1 = pretrained_subnet1,
                                    pretrained_subnet2 = pretrained_subnet2,
                                    frozen = frozen)

        # Cuda
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        # Pretrained
        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()),
                                    lr=learning_rate)
        #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()),
        #                               lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500