Example #1
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 replay_buffer: ReplayMemory,
                 seed: int,
                 batch_size=BATCH_SIZE,
                 update_every=UPDATE_EVERY,
                 tau=TAU,
                 gamma=GAMMA):
        """Initialize the agent"""

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.tau = tau
        self.update_target_every = update_every
        self.gamma = gamma

        self.qnet_local = DQN(state_size, action_size, seed).to(device)
        self.qnet_target = DQN(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=LR)
        self.max_gradient_norm = float('inf')

        self.memory = replay_buffer

        self.t_step = 0
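
Example #1 stores a `tau` and an `update_every` cadence but does not show the target update itself. A minimal sketch of the soft (Polyak) update such an agent typically applies, with the function and argument names assumed rather than taken from the example:

import torch.nn as nn

def soft_update(local_model: nn.Module, target_model: nn.Module, tau: float) -> None:
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)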
Example #2
def load_and_test(opt):
    netp1 = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    netp1.load(opt.load_path)
    load_path2 = list(opt.load_path)
    print(opt.load_path)
    load_path2[-11] = '2'
    load_path2 = "".join(load_path2)
    print(load_path2)
    netp2 = DQN(NUM_STATES, NUM_ACTIONS, opt.eps, opt)
    netp2.load(load_path2)
    if opt.player == 1:
        r1, r2, w, d = test_ep_pvp(netp1,
                                   netp2,
                                   opt.num_test,
                                   opt.eps,
                                   render=opt.render)

        print('p1 average reward:', r1)
        print('p2 average reward:', r2)
        print('p1 win rate:', w)
        print('p2 win rate:', 1 - w - d)
        print('draw rate:', d)

    elif opt.player == 2:
        r2, r1, w, d = test_ep_pvp(netp2,
                                   netp1,
                                   opt.num_test,
                                   opt.eps,
                                   render=opt.render)

        print('p1 average reward:', r1)
        print('p2 average reward:', r2)
        print('p1 win rate:', 1 - w - d)
        print('p2 win rate:', w)
        print('draw rate:', d)
Example #3
    def __init__(self):
        self.mode = "train"
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)
        print(self.config)
        self.load_config()

        self.online_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.target_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.online_net.train()
        self.target_net.train()
        self.update_target_net()
        for param in self.target_net.parameters():
            param.requires_grad = False

        if self.use_cuda:
            self.online_net.cuda()
            self.target_net.cuda()

        self.naozi = ObservationPool(capacity=self.naozi_capacity)
        # optimizer
        self.optimizer = torch.optim.Adam(
            self.online_net.parameters(),
            lr=self.config['training']['optimizer']['learning_rate'])
        self.clip_grad_norm = self.config['training']['optimizer'][
            'clip_grad_norm']
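
Example #3 calls `self.update_target_net()` and then freezes the target network's parameters. A minimal sketch of what such a hard-copy update usually does, written as a standalone helper with assumed names:

import torch.nn as nn

def update_target_net(online_net: nn.Module, target_net: nn.Module) -> None:
    # Overwrite every target parameter with the corresponding online parameter.
    target_net.load_state_dict(online_net.state_dict())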
Example #4
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(
            device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model and os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
Example #5
    def __init__(self, args, state_size, action_size):
        """Initialize an Agent object.
        
        Params
        ======
            args (class defined on the notebook): A set of parameters that will define the agent hyperparameters
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        self.params = args

        # Deep Q-Network
        if args.use_NoisyNet:
            self.DQN_local = DQN_NoisyNet(args, state_size,
                                          action_size).to(args.device)
            self.DQN_target = DQN_NoisyNet(args, state_size,
                                           action_size).to(args.device)
        else:
            self.DQN_local = DQN(args, state_size, action_size).to(args.device)
            self.DQN_target = DQN(args, state_size,
                                  action_size).to(args.device)

        self.optimizer = optim.Adam(self.DQN_local.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)

        # Replay memory
        self.memory = ReplayBuffer(args, action_size)
        # Initialize time step (for updating every args.target_update steps)
        self.t_step = 0
Example #6
  def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.priority_exponent = args.priority_exponent
    self.max_gradient_norm = args.max_gradient_norm

    self.policy_net = DQN(args, self.action_space)
    if args.model and os.path.isfile(args.model):
      self.policy_net.load_state_dict(torch.load(args.model))
    self.policy_net.train()

    self.target_net = DQN(args, self.action_space)
    self.update_target_net()
    self.target_net.eval()

    self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)
    if args.cuda:
      self.policy_net.cuda()
      self.target_net.cuda()
      self.support = self.support.cuda()
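
Example #6 reads `args.max_gradient_norm` but the update step is not shown. A minimal sketch of a clipped gradient step, assuming `loss` has already been computed elsewhere; the helper name is an assumption:

import torch
import torch.nn as nn

def optimisation_step(optimiser: torch.optim.Optimizer, net: nn.Module,
                      loss: torch.Tensor, max_gradient_norm: float) -> None:
    # Backpropagate, clip the global gradient norm, then apply the update.
    optimiser.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(net.parameters(), max_gradient_norm)
    optimiser.step()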
Example #7
    def __init__(self):
        self.device = args.device
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.history_size = args.history_size
        self.replay_size = args.replay_size
        self.width = args.width
        self.height = args.height
        self.hidden_size = args.hidden_size
        self.action_size = args.action_size
        self.update_cycle = args.update_cycle
        self.log_interval = args.log_interval
        self.actor_num = args.actor_num
        self.alpha = 0.7
        self.beta_init = 0.4
        self.beta = self.beta_init
        self.beta_increment = 1e-6
        self.e = 1e-6
        self.dis = 0.99
        self.start_epoch = 0
        self.mainDQN = DQN(self.history_size, self.hidden_size,
                           self.action_size).to(self.device)
        self.targetDQN = DQN(self.history_size, self.hidden_size,
                             self.action_size).to(self.device)
        self.update_target_model()
        self.optimizer = optim.Adam(self.mainDQN.parameters(), lr=args.lr)
        self.replay_memory = deque(maxlen=self.replay_size)
        self.priority = deque(maxlen=self.replay_size)

        if args.load_model != '000000000000':
            self.log = args.log_directory + args.load_model + '/'
            args.time_stamp = args.load_model[:12]
            args.start_epoch = self.load_model()
        self.log = args.log_directory + args.time_stamp + config + '/'
        self.writer = SummaryWriter(self.log)
Example #8
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.norm_clip = args.norm_clip

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model:  # Load pretrained model if provided
            if os.path.isfile(args.model):
                state_dict = torch.load(args.model, map_location='cpu')  # Always load tensors onto CPU by default, will shift to GPU if necessary
                if 'conv1.weight' in state_dict.keys():
                    for old_key, new_key in (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'), ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'), ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias')):
                        state_dict[new_key] = state_dict[old_key]  # Re-map state dict for old pretrained models
                        del state_dict[old_key]  # Delete old keys for strict load_state_dict
                self.online_net.load_state_dict(state_dict)
                print("Loading pretrained model: " + args.model)
            else:  # Raise error if incorrect model path provided
                raise FileNotFoundError(args.model)

        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
Example #9
 def __init__(self, env, args):
     super().__init__()
     self.model = DQN(env, args, Nash=False).to(args.device)
     self.target = DQN(env, args, Nash=False).to(args.device)
     self.replay_buffer = ReplayBuffer(args.buffer_size)
     self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
     self.args = args
Example #10
def run():
    ray.init()
    policy_net = DQN(num_channels=4, num_actions=19)
    target_net = DQN(num_channels=4, num_actions=19)
    target_net.load_state_dict(policy_net.state_dict())
    #memory = Memory(50000)
    #shared_memory = ray.get(ray.put(memory))
    memory = RemoteMemory.remote(30000)
    num_channels = 4
    num_actions = 19
    batch_size = 256
    param_server = ParameterServer.remote(num_channels, num_actions)
    learner = (Learner.remote(param_server, batch_size, num_channels,
                              num_actions))
    print(learner)
    print(learner.get_state_dict.remote())

    num_actors = 2
    epsilon = 0.9

    actor_list = [
        Actor.remote(learner, param_server, i, epsilon, num_channels,
                     num_actions) for i in range(num_actors)
    ]
    explore = [actor.explore.remote(learner, memory) for actor in actor_list]
    #ray.get(explore)
    learn = learner.update_network.remote(memory)
Example #11
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms  # size of value distribution.
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max,
                                      self.atoms).to(device=args.device)
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(
            device=args.device)  # greedily selects the action.
        if args.model and os.path.isfile(args.model):
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu')
            )  # state_dict: python dictionary that maps each layer to its parameters.
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(
            device=args.device)  # use to compute target q-values.
        self.update_target_net(
        )  # sets it to the parameters of the online network.
        self.target_net.train()
        for param in self.target_net.parameters(
        ):  # not updated through backpropagation.
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
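
Example #11's comments describe the support `z` and greedy action selection. A minimal sketch of how the expected Q-values are recovered from the categorical distribution, assuming the network returns per-action probabilities of shape (batch, n_actions, atoms); the helper name is an assumption:

import torch

def greedy_action(online_net: torch.nn.Module, state: torch.Tensor,
                  support: torch.Tensor) -> int:
    # Expected value per action: sum over the support z weighted by its probabilities.
    with torch.no_grad():
        dist = online_net(state.unsqueeze(0))      # (1, n_actions, atoms)
        q_values = (dist * support).sum(2)         # (1, n_actions)
        return int(q_values.argmax(1).item())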
Example #12
    def __init__(self, state_size, action_size, seed, network):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.network = network

        # Q-Network
        if self.network == "duel":
            self.qnetwork_local = DuelingDQN(state_size, action_size,
                                             seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size,
                                              seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
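
Example #12 initializes the networks and replay buffer but omits action selection. A minimal epsilon-greedy sketch over the local network; the helper name and the `device` argument are assumptions:

import random
import numpy as np
import torch

def act(qnetwork_local: torch.nn.Module, state, action_size: int,
        eps: float = 0.0, device: str = "cpu") -> int:
    # Greedy action from the local Q-network with probability 1 - eps, random otherwise.
    state_t = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0).to(device)
    qnetwork_local.eval()
    with torch.no_grad():
        action_values = qnetwork_local(state_t)
    qnetwork_local.train()
    if random.random() > eps:
        return int(action_values.argmax(dim=1).item())
    return random.randrange(action_size)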
Example #13
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(
            device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model:  # Load pretrained model if provided
            if os.path.isfile(args.model):
                self.online_net.load_state_dict(
                    torch.load(args.model, map_location='cpu')
                )  # Always load tensors onto CPU by default, will shift to GPU if necessary
                print("Loading pretrained model: " + args.model)
            else:  # Raise error if incorrect model path provided
                raise FileNotFoundError(args.model)

        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.learning_rate,
                                    eps=args.adam_eps)
Example #14
 def __init__(self, time_step, split, lr):
     self.dataset = Dataset(T=time_step,
                            split_ratio=split,
                            binary_file=config.BINARY_DATASET)
     self.policy_net_encoder = AttnEncoder(
         input_size=self.dataset.get_num_features(),
         hidden_size=config.ENCODER_HIDDEN_SIZE,
         time_step=time_step)
     self.policy_net_decoder = AttnDecoder(
         code_hidden_size=config.ENCODER_HIDDEN_SIZE,
         hidden_size=config.DECODER_HIDDEN_SIZE,
         time_step=time_step)
     self.policy_net = DQN(self.policy_net_encoder, self.policy_net_decoder)
     self.target_net_encoder = AttnEncoder(
         input_size=self.dataset.get_num_features(),
         hidden_size=config.ENCODER_HIDDEN_SIZE,
         time_step=time_step)
     self.target_net_decoder = AttnDecoder(
         code_hidden_size=config.ENCODER_HIDDEN_SIZE,
         hidden_size=config.DECODER_HIDDEN_SIZE,
         time_step=time_step)
     self.target_net = DQN(self.target_net_encoder, self.target_net_decoder)
     if torch.cuda.is_available():
         self.policy_net_encoder = self.policy_net_encoder.cuda()
         self.policy_net_decoder = self.policy_net_decoder.cuda()
         self.target_net_encoder = self.target_net_encoder.cuda()
         self.target_net_decoder = self.target_net_decoder.cuda()
         self.policy_net = self.policy_net.cuda()
         self.target_net = self.target_net.cuda()
     self.memory = ReplayMemory(config.MEMORY_CAPACITY)
     self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
Example #15
    def __init__(self,
                 meta_controller_experience_memory=None,
                 lr=0.00025,
                 alpha=0.95,
                 eps=0.01,
                 batch_size=32,
                 gamma=0.99,
                 num_options=12):
        # experience replay memory
        self.meta_controller_experience_memory = meta_controller_experience_memory
        self.lr = lr  # learning rate
        self.alpha = alpha  # optimizer parameter
        self.eps = eps  # optimizer parameter
        self.gamma = gamma  # discount factor
        # BUILD MODEL
        USE_CUDA = torch.cuda.is_available()
        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            self.device = torch.device("cuda:1")
        elif torch.cuda.device_count() == 1:
            self.device = torch.device("cuda:0")
        else:
            self.device = torch.device("cpu")

        dfloat_cpu = torch.FloatTensor
        dfloat_gpu = torch.cuda.FloatTensor

        dlong_cpu = torch.LongTensor
        dlong_gpu = torch.cuda.LongTensor

        duint_cpu = torch.ByteTensor
        duint_gpu = torch.cuda.ByteTensor

        dtype = torch.cuda.FloatTensor if torch.cuda.is_available(
        ) else torch.FloatTensor
        dlongtype = torch.cuda.LongTensor if torch.cuda.is_available(
        ) else torch.LongTensor
        duinttype = torch.cuda.ByteTensor if torch.cuda.is_available(
        ) else torch.ByteTensor

        self.dtype = dtype
        self.dlongtype = dlongtype
        self.duinttype = duinttype

        Q = DQN(in_channels=4, num_actions=num_options).type(dtype)
        Q_t = DQN(in_channels=4, num_actions=num_options).type(dtype)
        Q_t.load_state_dict(Q.state_dict())
        Q_t.eval()
        for param in Q_t.parameters():
            param.requires_grad = False

        Q = Q.to(self.device)
        Q_t = Q_t.to(self.device)

        self.batch_size = batch_size
        self.Q = Q
        self.Q_t = Q_t
        # optimizer
        optimizer = optim.RMSprop(Q.parameters(), lr=lr, alpha=alpha, eps=eps)
        self.optimizer = optimizer
        print('init: Meta Controller --> OK')
Example #16
    def __init__(self):
        """
        initializes all the class variables
        """
        self.env = gym.make('CartPole-v0').unwrapped
        self.resize = T.Compose([
            T.ToPILImage(),
            T.Resize(40, interpolation=Image.CUBIC),
            T.ToTensor()
        ])
        self.env.reset()
        init_screen = self.get_screen()
        self.env.reset()
        _, _, screen_height, screen_width = init_screen.shape

        # Get number of actions from gym action space
        self.n_actions = self.env.action_space.n

        self.policy_net = DQN(screen_height, screen_width,
                              self.n_actions).to(device)
        self.target_net = DQN(screen_height, screen_width,
                              self.n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.0001)
        self.memory = PriortizedReplayMemory(10000)
Example #17
 def __init__(self, args, obs):
     self.net = DQN(args.n_obs, args.n_action)
     self.target_net = DQN(args.n_obs, args.n_action)
     if os.path.isfile('./weights/ckpt.pth'):
         self.net.load_state_dict(torch.load('./weights/ckpt.pth'))
         self.target_net.load_state_dict(torch.load('./weights/ckpt.pth'))
     self.device = torch.device(
         'cuda' if torch.cuda.is_available() else 'cpu')
     self.state_preproc = StatePreproc(self.device)
     self.n_action = args.n_action
     self.gamma = args.gamma
     self.max_grad_norm = args.max_grad_norm
     self.num_procs = args.num_procs
     self.memory = ReplayBuffer(args)
     self.optimizer = torch.optim.Adam(self.net.parameters(),
                                       lr=args.lr,
                                       betas=(0.9, 0.99))
     self.criterion = torch.nn.MSELoss()
     # log
     self.log_episode_rewards = torch.zeros(self.num_procs,
                                            device=self.device,
                                            dtype=torch.float)
     self.episode_rewards = deque([0] * 100, maxlen=100)
     self.episode = 1
     self.init(obs)
     # eval
     self.test_episode = args.test_episode
Example #18
    def __init__(self, state_size, action_size, config=RLConfig()):
        self.seed = random.seed(config.seed)
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = config.batch_size
        self.batch_indices = torch.arange(config.batch_size).long().to(device)
        self.samples_before_learning = config.samples_before_learning
        self.learn_interval = config.learning_interval
        self.parameter_update_interval = config.parameter_update_interval
        self.per_epsilon = config.per_epsilon
        self.tau = config.tau
        self.gamma = config.gamma

        if config.useDuelingDQN:
            self.qnetwork_local = DuelingDQN(state_size, action_size,
                                             config.seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size,
                                              config.seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size,
                                      config.seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       config.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=config.learning_rate)

        self.doubleDQN = config.useDoubleDQN
        self.usePER = config.usePER
        if self.usePER:
            self.memory = PrioritizedReplayBuffer(config.buffer_size,
                                                  config.per_alpha)
        else:
            self.memory = ReplayBuffer(config.buffer_size)

        self.t_step = 0
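
Example #18 stores `per_epsilon` and `per_alpha` for the prioritized buffer. A minimal sketch of the proportional priority they imply; the helper name is an assumption:

import numpy as np

def updated_priorities(td_errors: np.ndarray, per_epsilon: float, per_alpha: float) -> np.ndarray:
    # Proportional prioritisation: p_i = (|delta_i| + epsilon) ** alpha
    return (np.abs(td_errors) + per_epsilon) ** per_alpha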
Example #19
    def __init__(self, learner, actor_idx, epsilon):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make("MineRLTreechop-v0")
        self.port_number = int("12340") + actor_idx
        print("actor environment %d initialize successfully" % self.actor_idx)
        self.shared_network_cpu = ray.get(learner.get_network.remote())
        # self.shared_memory = ray.get(shared_memory_id)
        # print("shared memory assign successfully")

        # network initalization
        self.actor_network = DQN(19).cpu()
        self.actor_target_network = DQN(19).cpu()
        self.actor_network.load_state_dict(self.shared_network_cpu.state_dict())
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        print("actor network %d initialize successfully" % self.actor_idx)

        self.initialized = False
        self.epi_counter = 0
        # exploring info
        self.epsilon = epsilon
        self.max_step = 100
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)

        project_name = 'apex_dqfd_Actor%d' %(actor_idx)
        wandb.init(project=project_name, entity='neverparadise')
Example #20
	def __init__(self, action_set, train=True, load_path=None):
		#1. Initialize agent params
		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
		self.action_set = action_set
		self.action_number = len(action_set)
		self.steps_done = 0
		self.epsilon = Config.EPS_START
		self.episode_durations = []

		#2. Build networks
		self.policy_net = DQN().to(self.device)
		self.target_net = DQN().to(self.device)
		
		self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

		if not train:		
			self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)	
			self.policy_net.load(load_path, optimizer=self.optimizer)
			self.policy_net.eval()

		self.target_net.load_state_dict(self.policy_net.state_dict())
		self.target_net.eval()

		#3. Create Prioritized Experience Replay Memory
		self.memory = Memory(Config.MEMORY_SIZE)
Example #21
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)
        self.target_net = DQN(action_size)
        self.target_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=scheduler_step_size,
            gamma=scheduler_gamma)

        # Initialize the target network with the policy network's weights
        self.update_target_net()
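
Example #21 precomputes a per-step `epsilon_decay`. A minimal sketch of the linear schedule it implies, applied once per training step; the function name is an assumption:

def decay_epsilon(agent) -> None:
    # Linearly anneal epsilon by the precomputed per-step decrement, clamped at epsilon_min.
    if agent.epsilon > agent.epsilon_min:
        agent.epsilon = max(agent.epsilon_min, agent.epsilon - agent.epsilon_decay)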
Example #22
    def __init__(self, action_set, train=True, load_path=None):
        #1. Initialize agent params
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_set = action_set
        self.action_number = len(action_set)
        self.steps_done = 0
        self.epsilon = Config.EPS_START
        self.episode_durations = []

        print('LOAD PATH    --  agent.init:', load_path)
        time.sleep(2)

        #2. Build networks
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

        if not train:
            print('entered the not-train branch')
            self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)    
            self.policy_net.load(load_path, optimizer=self.optimizer)
            self.policy_net.eval()

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.memory = ReplayMemory(1000)
Example #23
 def __init__(self):
     # build models
     self.Qt = DQN(in_channels=5, num_actions=18)  # Controller Q network
     self.Qt_t = DQN(in_channels=5,
                     num_actions=18)  # Controller target network
     # self.meta_controller = Model(in_channels=4, num_actions=10)
     self.Q = None  # Meta-Controller Q network
     self.Q_t = None  # Meta-Controller target network
Example #24
def main():
    max_episodes = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name='main')
        targetDQN = DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name='target', src_scope_name='main')

        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()

                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)

                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))

                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("Episode: {} Steps: {}".format(episode, step_count))

            if step_count > 10000:
                pass

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                print("Loss: ", loss)

                sess.run(copy_ops)

        bot_play(mainDQN)
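
Example #24 depends on a `get_copy_var_ops` helper that is not shown. A minimal TensorFlow 1.x sketch of such a variable-copy op builder, assuming both networks live in the named variable scopes used above:

import tensorflow as tf

def get_copy_var_ops(dest_scope_name: str, src_scope_name: str):
    # Build assign ops that copy every trainable variable from src scope to dest scope.
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    return [dest_var.assign(src_var.value()) for src_var, dest_var in zip(src_vars, dest_vars)]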
Example #25
 def __init__(self, network, batch_size):
     self.learner_network = DQN(19).cuda().float()
     self.learner_target_network = DQN(19).cuda().float()
     self.learner_network.load_state_dict(network.state_dict())
     self.learner_target_network.load_state_dict(network.state_dict())
     self.shared_network = DQN(19).cpu()
     self.count = 0
     self.batch_size = batch_size
     wandb.init(project='apex_dqfd_Learner', entity='neverparadise')
Example #26
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.saved_model_path = args.saved_model_path
        self.experiment = args.experiment
        self.plots_path = args.plots_path
        self.data_save_path = args.data_save_path


        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model and os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)

        # list of layers:
        self.online_net_layers = [self.online_net.conv1,
                                  self.online_net.conv2,
                                  self.online_net.conv3,
                                  self.online_net.fc_h_v,
                                  self.online_net.fc_h_a,
                                  self.online_net.fc_z_v,
                                  self.online_net.fc_z_a
                                  ]

        self.target_net_layers = [self.target_net.conv1,
                                  self.target_net.conv2,
                                  self.target_net.conv3,
                                  self.target_net.fc_h_v,
                                  self.target_net.fc_h_a,
                                  self.target_net.fc_z_v,
                                  self.target_net.fc_z_a
                                  ]

        # freeze all layers except the last, and reinitialize last
        if args.freeze_layers > 0:
            self.freeze_layers(args.freeze_layers)

        if args.reinitialize_layers > 0:
            self.reinit_layers(args.reinitialize_layers)
Example #27
    def __init__(self, param_server, batch_size, num_channels, num_actions):
        self.learner_network = DQN(num_channels, num_actions).cuda().float()
        self.learner_target_network = DQN(num_channels,
                                          num_actions).cuda().float()
        self.count = 0
        self.batch_size = batch_size
        self.writer = SummaryWriter(f'runs/apex/learner')

        self.lr = LR
        self.optimizer = optim.Adam(self.learner_network.parameters(), self.lr)
        self.param_server = param_server
Example #28
def run():
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)
    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)
    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99

    total_steps = 0
    epsilon = 0.95
    endEpsilon = 0.01
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        if epsilon > endEpsilon:
            epsilon -= stepDrop

        while not done:
            steps += 1
            total_steps += 1
            a_out = policy_net.sample_action(state, epsilon)
            action_index = a_out
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break

            state_prime = converter(ENV_NAME, obs_prime).cuda()
            append_sample(memory, policy_net, target_net, state, action_index,
                          reward, state_prime, done)
            state = state_prime

            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2, optimizer,
                               total_steps)

            if total_steps % 2000 == 0:
                update_target(policy_net, target_net)
Example #29
    def __init__(self,
                 env,
                 taskCount=100,
                 alpha=0.4,
                 hiddenSize=500,
                 perfix=''):
        self.GAMMA = 0.99
        self.epsilon = 0.3
        self.epsilon_end = 0.05
        self.epsilon_decay = 200

        self.update_step = 20
        self.memory_size = 2000
        self.max_epoch = 500
        self.batch_size = 32

        # self.max_epoch = 500
        # self.batch_size = 1
        # self.memory_size = 1
        # self.update_step = 1

        self.hiddenSize = hiddenSize
        # self.save_path = '../Model/' + str(taskCount) + '-' + str(alpha) + perfix +'.pth'
        self.save_path = '../Model/' + perfix + '-' + str(
            taskCount) + '-' + str(alpha) + '.pth'

        # Variables
        self.var_phi = autograd.Variable(torch.Tensor(6), volatile=True)

        # For training
        self.var_batch_phi = autograd.Variable(torch.Tensor(
            self.batch_size, 6))
        self.var_batch_a = autograd.Variable(torch.LongTensor(
            self.batch_size, 1),
                                             requires_grad=False)
        self.var_batch_r = autograd.Variable(torch.Tensor(self.batch_size, 1))
        self.var_batch_phi_next = autograd.Variable(
            torch.Tensor(self.batch_size, 6))
        self.var_batch_r_mask = autograd.Variable(torch.Tensor(
            self.batch_size, 1),
                                                  requires_grad=False)

        self.MP = MemoryReplay(self.memory_size, self.batch_size)
        self.dqn = DQN(hiddenSize=self.hiddenSize)
        self.target_dqn = DQN(hiddenSize=self.hiddenSize)
        self.target_dqn.load_state_dict(self.dqn.state_dict())

        self.optimz = optim.RMSprop(self.dqn.parameters(),
                                    lr=0.00025,
                                    alpha=0.9,
                                    eps=1e-02,
                                    momentum=0.0)

        self.env = env
Example #30
def main(*args, **kwargs):
    M = kwargs["M"]
    M.env = gym.make("BreakoutDeterministic-v4")

    M.policy = DQN()
    M.target = DQN()
    M.target.eval()
    M.policy.to(M.device)
    M.target.to(M.device)

    starter = "./bootstrap/model-epoch-2750.pt"
    starter_target = "./bootstrap/model-epoch-2750.pt"
    M.policy.load_state_dict(T.load(starter, map_location=M.device))
    M.target.load_state_dict(T.load(starter_target, map_location=M.device))
    M.time = int(time.time())
    M.log = open("./logs/log-{}.txt".format(M.time), "a")
    M.model_folder = "./model-{}".format(M.time)
    os.mkdir(M.model_folder)

    M.memory = rl.ReplayMemory(REPLAY_BUF_SIZE)
    if DISPLAY_ENABLED:
        M.display = Display("breakout", DISPLAY_WIDTH, DISPLAY_HEIGHT)
    M.action_db = {0: "NOP", 1: "Fire", 2: "Right", 3: "Left"}

    M.optim(optim.RMSprop(M.policy.parameters(), lr=LEARNING_RATE))
    M.steps = 0

    durations = []
    test_durations = []

    for epoch in range(EPOCHS):
        M.epoch = epoch
        duration, avg_loss = train(M)
        durations.append(duration)
        print(
            "[train/{}] duration: {}, total steps: {}, avg loss: {:0.6f}, eps: {:0.2f}"
            .format(epoch, duration, M.steps, avg_loss, M.eps))

        if M.steps >= STEPS_BEFORE_TRAIN:
            test_duration = test(M)
            test_durations.append(test_duration)
            print("[model-{}][test/{}] test_duration: {}".format(
                M.time, epoch, test_duration))

            # Save model
            save_path = "./{}/model-epoch-{}.pt".format(M.model_folder, epoch)
            T.save(M.policy.state_dict(), save_path)

            # Log training progress
            M.log.write(
                "epoch, {}, train_dur, {}, test_dur, {}, avg loss, {}\n".
                format(epoch, duration, test_duration, avg_loss))
            M.log.flush()