Example #1
File: dqn.py  Project: chris-lamb/deep-rl
def initialize(game, model_name, warm_start):
    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    capacity = int(1e4)

    # Cold start
    if not warm_start:
        # Initialize model
        model = DQN(in_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=1.0e-4,
                                  weight_decay=0.01)
        # Initialize replay memory
        memory_buffer = ReplayMemory(capacity)

        # Initialize statistics
        running_reward = None
        running_rewards = []

    # Warm start
    else:

        data_file = 'results/{}_{}.p'.format(game, model_name)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/{}_{}_ep_{}.p'.format(
                game, model_name, prior_eps)
            with open(model_file, 'rb') as f:
                saved_model = pickle.load(f)
                model, optimizer, memory_buffer = saved_model

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = DQN(in_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=1.0e-4,
                                      weight_decay=0.01)
            # Initialize replay memory
            memory_buffer = ReplayMemory(capacity)

            running_reward = None
            running_rewards = []

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()

    criterion = torch.nn.MSELoss()

    return env, model, optimizer, criterion, memory_buffer, cuda, running_reward, running_rewards
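
A minimal sketch of how this helper might be called; the environment id and model name below are illustrative, and the imports at the top of dqn.py (gym, pickle, torch, torch.optim as optim, plus the project's DQN and ReplayMemory classes) are assumed to be in scope:

# Hypothetical call from a training script (names are illustrative, not from the project)
env, model, optimizer, criterion, memory_buffer, cuda, running_reward, running_rewards = \
    initialize('PongNoFrameskip-v4', 'dqn_baseline', warm_start=False)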
Example #2
class Agent:
    def __init__(self):
        self.model, self.target = DQN(), DQN()
        if USE_CUDA:
            self.model.cuda()
            self.target.cuda()

        self.exp_buffer = Memory()
        self.exp_number = 0  # size of exp buffer so far
        self.param_updates = 0  # track how many times params updated

        self.opt = torch.optim.RMSprop(self.model.parameters(),
                                       lr=LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
        else:
            # Send state to model
            a_vec = self.model(state)
            a = int(torch.argmax(torch.squeeze(a_vec)))

        return a

    # clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = Memory()
        self.exp_number = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.exp_buffer.add(exp)
        self.exp_number += 1

    # Replay gets batch and trains on it
    def replay(self, batch_size):
        q_loss = 0
        # If experience buffer isn't the right size yet, don't do anything
        if self.exp_number < MIN_BUFFER_SIZE:
            return q_loss
        # Get batch from experience_buffer
        batch = self.exp_buffer.get_batch(batch_size)

        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last item (it is 'None')
        # First turn batch into something we can run through model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r).unsqueeze(1)
        s_new = torch.cat(s_new)

        #print(a.shape,r.shape, s.shape, s_new.shape)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()

        # Get q vals for s (what model outputted) from a
        # .gather gets us q value for specific action a
        pred_q_vals = self.model(s).gather(1, a)

        # Having chosen a in s,
        # What is the highest possible reward we can get from s_new?
        # We add q of performing a in s then add best q from next state
        # cat 0 to end for the terminal state
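        # i.e. the Bellman target: Q(s, a) ≈ r + GAMMA * max_a' Q_target(s_new, a')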
        s_new_q_vals = self.target(s_new).max(1)[0]
        zero = torch.zeros(1)  # a single zero entry for the terminal state
        if USE_CUDA: zero = zero.cuda()

        s_new_q_vals = torch.cat((s_new_q_vals, zero)).unsqueeze(1)
        exp_q_vals = r + s_new_q_vals * GAMMA

        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()

        if WEIGHT_CLIPPING:
            # Clamp gradients before the optimizer step to avoid exploding gradients
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)

        self.opt.step()

        if self.param_updates % TARGET_UPDATE_INTERVAL == 0:
            self.target.load_state_dict(self.model.state_dict())

        self.param_updates += 1

        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        return myloss.item()
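
A rough sketch of how this Agent could be driven; the environment, the preprocess() helper, and constants such as NUM_EPISODES and BATCH_SIZE are assumptions, not part of the project code above:

# Hypothetical training loop (env, preprocess() and the constants are assumptions)
agent = Agent()
for episode in range(NUM_EPISODES):
    state = preprocess(env.reset())  # assumed to return a (1, C, H, W) float tensor
    done = False
    while not done:
        action = agent.act(state)
        next_obs, reward, done, _ = env.step(action)
        next_state = None if done else preprocess(next_obs)
        agent.add_exp((state, action, reward, next_state, done))
        loss = agent.replay(BATCH_SIZE)
        state = next_state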
Example #3
class Agent:
	def __init__(self):
		self.controller, self.target = DQN(), DQN() # For RL 
		self.vision = VAE()

		if USE_CUDA:
			self.controller.cuda()
			self.target.cuda()
			self.vision.cuda()

		# Init weights based on init function
		self.controller.apply(init_weights)
		self.vision.apply(init_weights)
		# Load model params into target
		self.target.load_state_dict(self.controller.state_dict())
		self.action_number = 0 # actions taken (to determine whether or not to update)
	
		# NOTE: DQN exp buffer should use embeddings generated by vision module
		# The vision module (aka the VAE) has memory consisting of game states
		self.exp_buffer = [] # exp buffer
		self.exp_number = 0 # size of exp buffer so far

		self.opt = torch.optim.Adam(self.controller.parameters(),lr=DQN_LEARNING_RATE)
		self.loss = nn.SmoothL1Loss()

	# Make an action given a state
	def act(self, state, explore=True):
		self.action_number += 1
		# Update target
		if self.action_number % TARGET_INTERVAL == 0:
			self.target.load_state_dict(self.controller.state_dict())

		if explore and np.random.rand() <= EPSILON:
			# Act randomly
			a = np.random.randint(NUM_ACTIONS)
			return a
		
		# Send state to model
		a_vec = self.controller(self.vision.encode(state))
		a = int(torch.argmax(torch.squeeze(a_vec)))
		return a

	def load_params(self):
		# Looks in current directory for params for model and for VAE		 
		if LOAD_CHECKPOINT_VAE:
			try:
				self.vision.load_state_dict(torch.load("VAEparams.pt"))
				print("Loaded checkpoint for VAE")
			except Exception:
				print("Could not load VAE checkpoint")
		if LOAD_CHECKPOINT_DQN:
			try:
				self.controller.load_state_dict(torch.load("DQNparams.pt"))
				self.target.load_state_dict(torch.load("DQNparams.pt"))
				print("Loaded checkpoint for DQN")
			except Exception:
				print("Could not load DQN checkpoint")

	def save_params(self):
		torch.save(self.controller.state_dict(), "DQNparams.pt")
		torch.save(self.vision.state_dict(), "VAEparams.pt")

	# clear the buffer
	def clear_exp_buffer(self):
		self.exp_buffer = []
		self.exp_number = 0
		self.vision.memory = []
		self.vision.memory_num = 0

	# Add experience to exp buffer
	def add_exp(self, exp):
		self.vision.remember(exp[0])

		if self.exp_number >= EXP_BUFFER_MAX:
			del self.exp_buffer[0]
		else:
			self.exp_number += 1

		exp[0] = self.vision.encode(exp[0])
		exp[3] = self.vision.encode(exp[3])

		self.exp_buffer.append(exp)

	# Replay gets batch and trains on it
	# Returns [vision loss, controller loss]
	def replay(self, batch_size):
		v_loss, q_loss = 0,0 # Init to 0 in case we need to return without any training

		# Train vision component first
		if self.action_number % VAE_UPDATE_INTERVAL == 0:
			v_loss = self.vision.replay()
	
		# If experience buffer isn't the right size yet, don't do anything
		if self.exp_number < EXP_BUFFER_MIN or self.action_number % TRAINING_INTERVAL != 0:
			return [v_loss, q_loss]

		# Get batch from experience_buffer
		batch = random.sample(self.exp_buffer, batch_size)
		
		s,a,r,s_new,_ = zip(*batch)
		s_new = s_new[:-1] # Remove last

		# First turn batch into something we can run through model
		s = torch.cat(s)
		a = torch.LongTensor(a).unsqueeze(1)
		r = torch.FloatTensor(r)
		s_new = torch.cat(s_new)
		
		if USE_CUDA:
			a = a.cuda()
			r = r.cuda()

		# Get q vals for s (what model outputted) from a
		# .gather gets us q value for specific action a
		pred_q_vals = self.controller(s).gather(1,a).squeeze()

		# Having chosen a in s,
		# What is the highest possible reward we can get from s_new?
		# We add q of performing a in s then add best q from next state
		# cat 0 to end for the terminal state
		s_new_q_vals = self.target(s_new).max(1)[0]

		zero = torch.zeros(1)
		if USE_CUDA: zero = zero.cuda()
		s_new_q_vals = torch.cat((s_new_q_vals, zero))
		exp_q_vals = r + s_new_q_vals*GAMMA
		
		myloss = self.loss(pred_q_vals, exp_q_vals)
		self.opt.zero_grad()
		myloss.backward()
		

		if WEIGHT_CLIPPING:
			for param in self.controller.parameters():
				param.grad.data.clamp_(-1,1) # Clamp gradients to avoid exploding gradients

		self.opt.step()
		
		global EPSILON
		if EPSILON > EPSILON_MIN:
			EPSILON *= EPSILON_DECAY 
	
		return [v_loss, myloss.item()]
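
A short usage sketch for this VAE-plus-DQN agent; the checkpoint filenames and the `[s, a, r, s_new, done]` experience layout follow the code above, while the environment, the preprocess() helper, and the constants are assumptions. The design point is that add_exp() stores VAE embeddings rather than raw frames, so the controller learns in the latent space:

# Hypothetical single interaction step (env, preprocess() and constants are assumptions)
agent = Agent()
agent.load_params()                      # resume from DQNparams.pt / VAEparams.pt if present
state = preprocess(env.reset())
action = agent.act(state)
next_obs, reward, done, _ = env.step(action)
# add_exp mutates the list in place, replacing s and s_new with their VAE encodings
agent.add_exp([state, action, reward, preprocess(next_obs), done])
v_loss, q_loss = agent.replay(BATCH_SIZE)
agent.save_params()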