示例#1
0
class Clone:
    """Inference-only policy restored from a saved checkpoint.

    Builds a ``Network``, loads its weights from ``checkpoint_policy`` and
    exposes greedy action selection on top of it.
    """

    def __init__(self, min_action_set, hist_len, checkpoint_policy):
        """Construct the network and restore its weights.

        Args:
            min_action_set: Sized collection of actions; its length is passed
                to ``Network`` as the first constructor argument.
            hist_len: Forwarded to ``Network`` as the second argument.
            checkpoint_policy: Argument handed to ``torch.load``; the loaded
                object must be a mapping with a ``'state_dict'`` entry.
        """
        self.minimal_action_set = min_action_set
        print("hist len", hist_len)
        self.network = Network(len(self.minimal_action_set), hist_len)
        # Deserialize onto the CPU so that a checkpoint written on a GPU
        # machine can still be restored where CUDA is unavailable. The module
        # is moved to the GPU just below when one exists.
        checkpoint = torch.load(checkpoint_policy, map_location="cpu")
        self.network.load_state_dict(checkpoint['state_dict'])
        if torch.cuda.is_available():
            print("Initializing Cuda Nets...")
            self.network.cuda()

    def predict(self, state):
        """Run ``state`` through the network and return the final output.

        Args:
            state: Input accepted by ``utils.float_tensor``.

        Returns:
            numpy.ndarray: The last element of the network's return value,
            copied to host memory.
        """
        # No gradients are needed for inference; skipping graph construction
        # does not change the returned values.
        with torch.no_grad():
            outputs = self.network(Variable(utils.float_tensor(state)))
        return outputs[-1].data.cpu().numpy()

    def get_action(self, state):
        """Return the index of the highest-scoring entry of ``predict(state)``.

        The index refers to the flattened prediction array (``np.argmax``
        semantics); it is not mapped back through ``minimal_action_set``.
        """
        return np.argmax(self.predict(state))
示例#2
0
class Imitator:
    """Supervised action classifier trained on recorded transitions.

    Wraps a ``Network`` together with an Adam optimizer and provides
    prediction, single-step training, validation and checkpointing.
    """

    def __init__(self, min_action_set, learning_rate, alpha, checkpoint_dir,
                 hist_len, l2_penalty):
        """Build the network and its optimizer.

        Args:
            min_action_set: Sized collection of actions; its length is passed
                to ``Network`` as the first constructor argument.
            learning_rate: Adam learning rate.
            alpha: Unused here; kept so existing callers keep working.
            checkpoint_dir: Directory handed to ``utils.save_checkpoint``.
            hist_len: Forwarded to ``Network`` as the second argument.
            l2_penalty: Adam ``weight_decay`` coefficient.
        """
        self.minimal_action_set = min_action_set
        print("hist len", hist_len)
        self.network = Network(len(self.minimal_action_set), hist_len)
        if torch.cuda.is_available():
            print("Initializing Cuda Nets...")
            self.network.cuda()
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=learning_rate,
                                    weight_decay=l2_penalty)
        self.checkpoint_directory = checkpoint_dir

    def predict(self, state):
        """Run ``state`` through the network and return the final output.

        Returns:
            numpy.ndarray: The last element of the network's return value,
            copied to host memory.
        """
        # Inference only: skip autograd bookkeeping (values are unchanged).
        with torch.no_grad():
            outputs = self.network(Variable(utils.float_tensor(state)))
        return outputs[-1].data.cpu().numpy()

    def get_action(self, state):
        """Return the flattened argmax index of ``predict(state)``."""
        return np.argmax(self.predict(state))

    def compute_labels(self, sample, minibatch_size):
        """Collect the recorded actions of ``sample`` into a label tensor.

        Args:
            sample: Sequence of items exposing an ``action`` attribute that
                is convertible with ``int``.
            minibatch_size: Size passed to ``utils.long_tensor``.
                NOTE(review): presumably this allocates a LongTensor of that
                length; entries beyond ``len(sample)`` are never written here.

        Returns:
            The tensor from ``utils.long_tensor`` with position ``i`` set to
            the integer action of ``sample[i]``.
        """
        labels = Variable(utils.long_tensor(minibatch_size))
        for i, transition in enumerate(sample):
            # Builtin int: the ``np.int`` alias was removed in NumPy 1.24.
            labels[i] = int(transition.action)
        return labels

    def get_loss(self, outputs, labels):
        """Return the cross-entropy between ``outputs`` and ``labels``."""
        return nn.CrossEntropyLoss()(outputs, labels)

    def validate(self, dataset, minibatch_size):
        """Sum the cross-entropy loss over every full minibatch of a dataset.

        The data from ``dataset.get_dataset()`` is walked in consecutive
        slices of ``minibatch_size``; a trailing partial slice is skipped.

        Returns:
            The accumulated (not averaged) loss. This is the float ``0.0``
            when the dataset holds fewer than ``minibatch_size`` items.
        """
        validation_data = dataset.get_dataset()
        v_loss = 0.0
        # ``+ 1`` makes the last valid start index inclusive, so the final
        # full batch is not dropped when the dataset length is an exact
        # multiple of the minibatch size.
        last_start = len(validation_data) - minibatch_size
        with torch.no_grad():
            for start in range(0, last_start + 1, minibatch_size):
                sample = validation_data[start:start + minibatch_size]
                state = Variable(
                    utils.float_tensor(
                        np.stack([np.squeeze(x.state) for x in sample])))
                labels = self.compute_labels(sample, minibatch_size)
                # The network returns a sequence of activations; the last
                # entry is the output layer fed to the loss.
                activations = self.network(state)
                v_loss += self.get_loss(activations[-1], labels)
        return v_loss

    def train(self, dataset, minibatch_size):
        """Perform one optimizer step on a freshly sampled minibatch.

        Returns:
            The loss tensor of this step (still attached to its graph).
        """
        # Sample a minibatch of transitions and stack their states.
        sample = dataset.sample_minibatch(minibatch_size)
        state = Variable(
            utils.float_tensor(np.stack([np.squeeze(x.state)
                                         for x in sample])))

        # Target class indices for the minibatch.
        labels = self.compute_labels(sample, minibatch_size)

        self.optimizer.zero_grad()
        # Forward pass; the last returned activation is the output layer.
        activations = self.network(state)
        loss = self.get_loss(activations[-1], labels)
        loss.backward()
        self.optimizer.step()
        return loss

    def checkpoint_network(self, env_name, extra_info):
        """Save the network's ``state_dict`` via ``utils.save_checkpoint``.

        Args:
            env_name: Forwarded unchanged to ``utils.save_checkpoint``.
            extra_info: Forwarded unchanged to ``utils.save_checkpoint``.
        """
        print("Checkpointing Weights")
        utils.save_checkpoint({'state_dict': self.network.state_dict()},
                              self.checkpoint_directory, env_name, extra_info)
        print("Checkpointed.")
示例#3
0
class Imitator:
    """Supervised action classifier trained on recorded transitions.

    Wraps a ``Network`` with an Adam optimizer, keeps a history of training
    losses, and maps predictions back to entries of the minimal action set.
    """

    def __init__(self, min_action_set, learning_rate, alpha,
                 min_squared_gradient, checkpoint_dir, hist_len, l2_penalty):
        """Build the network and its optimizer.

        Args:
            min_action_set: Sequence of actions; must support ``len``,
                indexing and ``.index(value)``.
            learning_rate: Adam learning rate.
            alpha: Unused here; kept so existing callers keep working.
            min_squared_gradient: Unused here; kept for compatibility.
            checkpoint_dir: Directory handed to ``utils.save_checkpoint``.
            hist_len: Unused here; kept for compatibility.
            l2_penalty: Adam ``weight_decay`` coefficient.
        """
        self.minimal_action_set = min_action_set
        self.network = Network(len(self.minimal_action_set))
        if torch.cuda.is_available():
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print("Initializing Cuda Nets...")
            self.network.cuda()
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=learning_rate,
                                    weight_decay=l2_penalty)
        self.checkpoint_directory = checkpoint_dir
        # One entry per call to train().
        self.losses = []

    def predict(self, state):
        """Run ``state`` through the network and return the final output.

        Returns:
            numpy.ndarray: The last element of the network's return value,
            copied to host memory.
        """
        outputs = self.network(Variable(utils.float_tensor(state)))
        return outputs[-1].data.cpu().numpy()

    def get_action(self, state):
        """Return the action whose predicted score is highest.

        The flattened argmax of ``predict(state)`` is used as an index into
        ``minimal_action_set``.
        """
        best = np.argmax(self.predict(state))
        return self.minimal_action_set[best]

    def compute_labels(self, sample, minibatch_size):
        """Convert the recorded actions of ``sample`` into class indices.

        Args:
            sample: Sequence of items exposing an ``action`` attribute whose
                value is present in ``minimal_action_set``.
            minibatch_size: Size passed to ``utils.long_tensor``.
                NOTE(review): presumably this allocates a LongTensor of that
                length; entries beyond ``len(sample)`` are never written here.

        Returns:
            The tensor from ``utils.long_tensor`` with position ``i`` set to
            the index of ``sample[i].action`` in ``minimal_action_set``.

        Raises:
            ValueError: If an action is not in ``minimal_action_set``
                (raised by ``.index`` for list-like action sets).
        """
        labels = Variable(utils.long_tensor(minibatch_size))
        for i, transition in enumerate(sample):
            labels[i] = self.minimal_action_set.index(transition.action)
        return labels

    def get_loss(self, outputs, labels):
        """Return the cross-entropy between ``outputs`` and ``labels``."""
        return nn.CrossEntropyLoss()(outputs, labels)

    def train(self, dataset, minibatch_size):
        """Perform one optimizer step on a freshly sampled minibatch.

        The step's loss is appended to ``self.losses``; nothing is returned.
        """
        # Sample a minibatch of transitions and stack their states.
        sample = dataset.sample_minibatch(minibatch_size)
        state = Variable(
            utils.float_tensor(np.stack([np.squeeze(x.state)
                                         for x in sample])))

        # Target class indices for the minibatch.
        labels = self.compute_labels(sample, minibatch_size)

        self.optimizer.zero_grad()
        # Forward pass; the last returned activation is the output layer.
        activations = self.network(state)
        loss = self.get_loss(activations[-1], labels)
        # Record a detached copy so the growing history does not keep each
        # step's autograd graph reachable.
        self.losses.append(loss.detach())
        loss.backward()
        self.optimizer.step()

    def checkpoint_network(self):
        """Save the network's ``state_dict`` via ``utils.save_checkpoint``."""
        print("Checkpointing Weights")
        utils.save_checkpoint({'state_dict': self.network.state_dict()},
                              self.checkpoint_directory)
        print("Checkpointed.")