Example #1
    def __init__(self, state_shape, action_size):
        self.learning_rate = 0.001
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = 0.999
        self.epsilon = 0.01
        self.lamb = 0.99

        board_shape = state_shape[:2]
        self.board_shape = board_shape

        self.value_model = AgentModel("value", board_shape)
        self.target_value_model = AgentModel("target_value", board_shape)

        self.value_model.build(input_shape=(None,) + board_shape)
        self.target_value_model.build(input_shape=(None,) + board_shape)

        for var, var_target in zip(
            self.value_model.trainable_variables,
            self.target_value_model.trainable_variables,
        ):
            var.assign(var_target)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        self.loss_function = tf.keras.losses.MeanSquaredError()
Example #2
    def __init__(self, num_actions, gamma, max_experiences, min_experiences,
                 batch_size, lr, hidden_units, num_states):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.optimizer = tf.optimizers.Adam(lr)
        self.gamma = gamma
        self.model = AgentModel(num_actions, hidden_units, num_states)
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences
Example #3
    def test_model(self):
        ones = np.ones(self.shape, dtype=np.float32)
        model = AgentModel("", self.shape)
        output = model(np.array([ones] * 10, dtype=np.float32))
        np.testing.assert_almost_equal(output, [[-0.0006732]] * 10)

        tf.random.set_seed(0)
        output = model(np.array([ones] * 10, dtype=np.float32), training=True).numpy()
        want = [[-0.0006732]] * 10
        np.testing.assert_almost_equal(output, want)
Example #4
    def add_model(self):
        """This function calls the appropriate model builder"""
        
        self.model_agent = AgentModel(12, 20, 6)

        self.model_critic = CriticModel(11, 21, 10, 0)

        self.set_model_weights(self.model_agent)

        self.set_model_weights(self.model_critic)

        self.optimizer_agent = torch.optim.Adam(self.model_agent.parameters(),
                                                lr = 0.001)

        self.optimizer_critic = torch.optim.Adam(self.model_critic.parameters(),
                                                 lr = 0.001)

        self.loss_agent = torch.nn.MSELoss()

        self.loss_critic = torch.nn.MSELoss()
Example #5
def pool_run_args(argses, super_dirname, output_every, t_upto, resume):
    runners = []
    for args in argses:
        output_dirname = make_output_dirname(args)
        output_dirpath = join(super_dirname, output_dirname)
        if resume and get_filenames(output_dirpath):
            runner = Runner(output_dirpath, output_every)
        else:
            model = AgentModel(**args)
            runner = Runner(output_dirpath, output_every, model=model)
            runner.clear_dir()
        runners.append(runner)
    pool_run(runners, t_upto)
Example #6
class agent:

    def __init__(self):

        self.critic_loss = None
        
        self.factors_agent = None

        self.factors_critic = None

        self.history_len = 0

        self.is_train = None

        self.loss_agent = None

        self.loss_critic = None

        self.model_agent = None

        self.model_critic = None

        self.optimizer_agent = None

        self.optimizer_critic = None

        self.pred = None

        self.reward = None

    def add_model(self):
        """This function calls the appropriate model builder"""
        
        self.model_agent = AgentModel(12, 20, 6)

        self.model_critic = CriticModel(11, 21, 10, 0)

        self.set_model_weights(self.model_agent)

        self.set_model_weights(self.model_critic)

        self.optimizer_agent = torch.optim.Adam(self.model_agent.parameters(),
                                                lr = 0.001)

        self.optimizer_critic = torch.optim.Adam(self.model_critic.parameters(),
                                                 lr = 0.001)

        self.loss_agent = torch.nn.MSELoss()

        self.loss_critic = torch.nn.MSELoss()

    def add_prediction(self, prediction):
        """This function concatenates the prediciton with the critic input"""
        
        i = 0

        j = self.history_len 

        self.factors_critic[i, j, 0] = prediction['score']

        self.factors_critic[i, j, 1] = prediction['r0']

        self.factors_critic[i, j, 2] = prediction['r1']

        self.factors_critic[i, j, 3] = prediction['r2']

        self.factors_critic[i, j, 4] = prediction['r3']

        self.factors_critic[i, j, 5] = prediction['r4']

        self.factors_critic[i, j, 6] = prediction['r5']

        self.factors_critic[i, j, 7] = prediction['sd']

        self.factors_critic[i, j, 8] = prediction['avg']

        self.factors_critic[i, j, 9] = prediction['m']

        self.factors_critic[i, j, 10] = prediction['k']

    def custom_loss_critic(self, target, selection, selection_averages,
                           target_averages):
        """This returns the normalized cross correlation between target and
        selection"""

        # These lines here compute the cross-correlation between target and
        # selection
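        # Concretely, for each feature column c (time runs down the rows):
        #
        #   r_c = sum_t (s_tc - s_avg_c) * (x_tc - x_avg_c)
        #         / sqrt( sum_t (s_tc - s_avg_c)^2 * sum_t (x_tc - x_avg_c)^2 )
        #
        # where s is the selection window, x is the target window, and the
        # averages are the per-column means supplied by the caller. The return
        # value is the sum of r_c over columns, with NaN (zero-variance)
        # columns dropped.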

        top = np.multiply((selection - selection_averages), 
                          (target - target_averages))

        top_sum = np.sum(top, axis = 0)

        bottom_selection = np.power((selection - selection_averages),2)

        bottom_targets = np.power((target - target_averages), 2)

        bottom_selection_sum = np.sum(bottom_selection, axis = 0)

        bottom_targets_sum = np.sum(bottom_targets, axis = 0)

        bottom = np.sqrt(np.multiply(bottom_selection_sum,
                                     bottom_targets_sum))

        divided = np.divide(top_sum, bottom)

        divided = divided[~np.isnan(divided)]

        return np.sum(divided)
            
    def factorize(self, user_history):
        """This function factorizes a given user history, or batch of user
        histories, into factors for an lstm model"""

        # Reset the holding arrays

        self.factors_agent = np.zeros((1, 20, 12))

        self.factors_critic = np.zeros((1, 21, 11))

        # This i here is to conform with the model's batch-dimension input expectations

        i = 0

        j = 0

        for index, row in user_history.iterrows():

            # The last entry in a history is the one we attempt to predict

            if j == (user_history.shape[0] - 1):

                break
            
            # Truncating maximum history to ~1 day of continuous listening

            if j == 20:

                break
            # In an act of data reduction and factor selection, I drop
            # all spotify embeddings and deploy my own
            
            self.factors_agent[i, j, 0] = row['score']

            self.factors_critic[i, j, 0] = row['score']

            self.factors_agent[i, j, 1] = row['r0']

            self.factors_critic[i, j, 1] = row['r0']

            self.factors_agent[i, j, 2] = row['r1']

            self.factors_critic[i, j, 2] = row['r1']

            self.factors_agent[i, j, 3] = row['r2']

            self.factors_critic[i, j, 3] = row['r2']

            self.factors_agent[i, j, 4] = row['r3']

            self.factors_critic[i, j, 4] = row['r3']

            self.factors_agent[i, j, 5] = row['r4']

            self.factors_critic[i, j, 5] = row['r4']

            self.factors_agent[i, j, 6] = row['r5']

            self.factors_critic[i, j, 6] = row['r5']

            self.factors_agent[i, j, 7] = row['m']

            self.factors_critic[i, j, 7] = row['m']

            self.factors_agent[i, j, 8] = row['k']

            self.factors_critic[i, j, 8] = row['k']

            self.factors_agent[i, j, 9] = row['day_w']

            self.factors_critic[i, j, 9] = row['sd']

            self.factors_agent[i, j, 10] = row['day_m']

            self.factors_critic[i, j, 10] = row['avg']

            self.factors_agent[i, j, 11] = row['hour_d']

            j += 1

        i += 1

        self.history_len = j

    def get_agent_reward(self, repeat):
        """This function gets the agent reward""" 

        # If the track is something the user has heard before, take the reward
        # to the power (1/2)

        if repeat > 0:

            reward = math.pow(self.reward, 0.5)

        else:

            reward = self.reward

        # Due to the root operation above, the magnitude of reward is capped
        # just below 1 (at 1 - 1E-7) for machine-precision reasons - verified
        # through testing

        if reward > 0.9999999:

            reward = 0.9999999

        reward = torch.tensor([reward], requires_grad = True)

        self.reward = reward

    def get_critic_loss(self, current_user_history, data):
        """This function get the critic loss"""

        user = data[data.user_id == current_user_history.user_id.values[0]]

        user = user[['r0','r1','r2','r3', 'r4', 'r5']]

        user_array = user.to_numpy()

        # In order to use vectorized numpy operations, we need to make an
        # overly bulky array for the averages, both for target and for
        # selection (as passed to self.custom_loss_critic)

        selection_averages = []

        selection_averages.append(np.average(current_user_history.r0.values))

        selection_averages.append(np.average(current_user_history.r1.values))

        selection_averages.append(np.average(current_user_history.r2.values))

        selection_averages.append(np.average(current_user_history.r3.values))

        selection_averages.append(np.average(current_user_history.r4.values))

        selection_averages.append(np.average(current_user_history.r5.values))

        selection_averages = np.array(selection_averages)

        # This line gives selection_averages a 2nd dimension to match time,
        # while the repeat command copies these average values along the time
        # axis

        selection_averages = np.repeat(selection_averages[None,:], 
                                       current_user_history.shape[0],
                                       axis = 0)

        selection_averages = selection_averages[-10:]

        selection_array = current_user_history[['r0','r1','r2','r3', 'r4', 'r5']]

        selection_array = selection_array[-10:]

        selection_array = selection_array.to_numpy()

        # Here we repeat this process for the whole user history as reflected
        # by user

        target_averages = []

        target_averages.append(np.average(user.r0.values))

        target_averages.append(np.average(user.r1.values))
       
        target_averages.append(np.average(user.r2.values))
       
        target_averages.append(np.average(user.r3.values))
       
        target_averages.append(np.average(user.r4.values))
       
        target_averages.append(np.average(user.r5.values))
        
        target_averages = np.array(target_averages)

        target_averages = np.repeat(target_averages[None, :],
                                    selection_array.shape[0],
                                    axis = 0)
        
        critic_loss = []

        end = selection_array.shape[0]

        start = 0

        while end < user_array.shape[0]:
            
            critic_loss.append(self.custom_loss_critic(user_array[start:end,],
                                                selection_array,
                                                selection_averages,
                                                target_averages))

            start += 1

            end += 1

        if len(critic_loss) > 0:

            critic_loss = np.average(critic_loss)

        else:

            critic_loss = 0.0

        critic_loss = torch.tensor([critic_loss], requires_grad = True)

        self.critic_loss = critic_loss

    def predict(self, user_history):
        """This function generates the agent's prediction for the provided
        user history"""

        self.factorize(user_history)

        self.pred = self.model_agent(torch.Tensor(self.factors_agent))

    def propagate(self, current_user_history, data, prediction, repeat):
        """This function propagates the loss through the actor and critic"""

        self.add_prediction(prediction)

        # Clear out the gradients from the last prediction

        self.model_agent.zero_grad()

        self.model_critic.zero_grad()

        # Get the critic reward

        self.reward = self.model_critic(torch.Tensor(self.factors_critic))

        self.get_agent_reward(repeat)

        # Get the agent loss and apply it
        
        agent_loss = self.loss_agent(self.reward, torch.tensor([1.0]))
        
        agent_loss.backward()

        self.optimizer_agent.step()

        # Get the critic loss and apply it

        self.get_critic_loss(current_user_history, data)

        evaluated_critic_loss = self.loss_critic(self.critic_loss,
                                                 torch.tensor([6.0]))

        evaluated_critic_loss.backward()

        self.optimizer_critic.step()

    def ready_agent(self, agent_model_path, critic_model_path, train):
        """This function sets up a working agent - one complete with a loss
        function and a model"""

        self.is_train = train 

        self.model_agent = torch.load(agent_model_path)

        if self.model_agent is not None:

            print("Actor Model {} successfully loaded.\n".format(agent_model_path))

        self.model_critic = torch.load(critic_model_path)

        if self.model_critic is not None:

            print("Critic Model {} successfully loaded.\n".format(critic_model_path))

    def set_model_weights(self, model):
        """This function initializes the weights in a pytorch model"""

        # Walk all submodules so that composite models get their Linear layers
        # initialized, not just bare Linear modules

        for module in model.modules():

            classname = module.__class__.__name__

            if classname.find('Linear') != -1:

                n = module.in_features

                y = 1.0 / np.sqrt(n)

                module.weight.data.uniform_(-y, y)

                module.bias.data.fill_(0)

    def wake_agent(self, train):
        """This function sets up a working agent - one complete with a loss
        function and a model"""

        self.is_train = train 

        self.add_model()
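
# A minimal usage sketch for the class above (not part of the original code).
# It assumes `user_history` and `data` are pandas DataFrames with the columns
# that factorize() and get_critic_loss() read (user_id, score, r0..r5, m, k,
# day_w, day_m, hour_d, sd, avg), and that the caller turns the raw actor
# output in `player.pred` into a `prediction` dict with the keys that
# add_prediction() expects. `histories` and `select_track_features` are
# hypothetical stand-ins for that application-specific glue.

player = agent()

player.wake_agent(train=True)          # build fresh models, optimizers and losses

for user_history in histories:

    player.predict(user_history)       # fills player.pred from the actor model

    prediction = select_track_features(player.pred)   # hypothetical helper

    # repeat > 0 would mean the recommended track was already in the history

    player.propagate(user_history, data, prediction, repeat=0)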
Example #7
def train_val():
    ''' Train on the training set, and validate on seen and unseen splits. '''

    # Set which GPU to use
    device = torch.device('cuda', hparams.device_id)

    # Load hyperparameters from checkpoint (if exists)
    start_iter = 0
    if os.path.exists(hparams.load_path):
        print('Load model from %s' % hparams.load_path)
        ckpt = load(hparams.load_path, device)
        start_iter = ckpt['iter']
    else:
        if not hparams.forward_agent and not hparams.random_agent and not hparams.shortest_agent:
            if hasattr(hparams, 'load_path') and hasattr(hparams, 'eval_only') and hparams.eval_only:
                sys.exit('load_path %s does not exist!' % hparams.load_path)
        ckpt = None
    end_iter = hparams.n_iters

    if not hasattr(hparams, 'ask_baseline'):
        hparams.ask_baseline = None
    if not hasattr(hparams, 'instruction_baseline'):
        hparams.instruction_baseline = None

    # Set random seeds
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    np.random.seed(hparams.seed)
    random.seed(hparams.seed)

    # Create or load vocab
    train_vocab_path = os.path.join(hparams.data_path, 'vocab.txt')
    if not os.path.exists(train_vocab_path):
        raise Exception('Vocab file not found at %s' % train_vocab_path)
    vocab = read_vocab([train_vocab_path])
    hparams.instr_padding_idx = vocab.index('<PAD>')

    tokenizer = Tokenizer(vocab=vocab, encoding_length=hparams.max_instr_len)
    if hparams.encoder_type == 'dic':
        tokenizer = BTokenizer(vocab=vocab, encoding_length=hparams.max_instr_len)
    featurizer = ImageFeatures(hparams.img_features, device)
    simulator = Simulator(hparams)

    # Create train environment
    train_env = Batch(hparams, simulator, featurizer, tokenizer, split='train')

    # Create validation environments
    val_splits = ['val_seen', 'val_unseen']
    eval_mode = hasattr(hparams, 'eval_only') and hparams.eval_only
    if eval_mode:
        if 'val_seen' in hparams.load_path:
            val_splits = ['test_seen']
        elif 'val_unseen' in hparams.load_path:
            val_splits = ['test_unseen']
        else:
            val_splits = ['test_seen', 'test_unseen']
        end_iter = start_iter + 1

    if hparams.eval_on_val:
        val_splits = [x.replace('test_', 'val_') for x in val_splits]

    val_envs_tmp = { split: (
        Batch(hparams, simulator, featurizer, tokenizer, split=split),
        Evaluation(hparams, [split], hparams.data_path))
            for split in val_splits }

    val_envs = {}
    for key, value in val_envs_tmp.items():
        if '_seen' in key:
            val_envs[key + '_env_seen_anna'] = value
            val_envs[key + '_env_unseen_anna'] = value
        else:
            assert '_unseen' in key
            val_envs[key] = value

    # Build model and optimizer
    model = AgentModel(len(vocab), hparams, device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=hparams.lr,
        weight_decay=hparams.weight_decay)

    best_metrics = {env_name: -1 for env_name in val_envs.keys()}
    best_metrics['combined'] = -1

    # Load model paramters from checkpoint (if exists)
    if ckpt is not None:
        model.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optim_state_dict'])
        best_metrics = ckpt['best_metrics']
        train_env.ix = ckpt['data_idx']

    if hparams.log_every == -1:
        hparams.log_every = round(len(train_env.data) / \
            (hparams.batch_size * 100)) * 100

    print('')
    pprint(vars(hparams), width=1)
    print('')
    print(model)
    print('Number of parameters:',
        sum(p.numel() for p in model.parameters() if p.requires_grad))

    if hparams.random_agent or hparams.forward_agent or hparams.shortest_agent:
        assert eval_mode
        agent = SimpleAgent(hparams)
    else:
        agent = VerbalAskAgent(model, hparams, device)

    return train(train_env, val_envs, agent, model, optimizer, start_iter,
        end_iter, best_metrics, eval_mode)
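
# A minimal sketch (not part of the original code) of the hparams object that
# train_val() expects. Every attribute below is referenced somewhere in
# train_val(); the values are placeholders, and the real project presumably
# builds hparams from a config file or command-line arguments.
from argparse import Namespace

hparams = Namespace(
    device_id=0,
    load_path='',                  # checkpoint path; '' means no checkpoint to resume
    eval_only=False,
    eval_on_val=False,
    forward_agent=False,
    random_agent=False,
    shortest_agent=False,
    n_iters=10000,
    seed=0,
    data_path='data',
    max_instr_len=80,
    encoder_type='lstm',           # anything other than 'dic' uses the plain Tokenizer
    img_features='img_features.tsv',
    lr=1e-4,
    weight_decay=0.0,
    log_every=-1,                  # -1 lets train_val() derive it from the data size
    batch_size=32,
)
# ask_baseline and instruction_baseline are optional; train_val() defaults them
# to None when they are absent.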
Example #8
class DQN:
    def __init__(self, num_actions, gamma, max_experiences, min_experiences,
                 batch_size, lr, hidden_units, num_states):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.optimizer = tf.optimizers.Adam(lr)
        self.gamma = gamma
        self.model = AgentModel(num_actions, hidden_units, num_states)
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

    def predict(self, inputs):
        # accepts single state (as a 2d input) or batch of states, runs a forward pass and returns the model results
        # (logits for actions)
        return self.model(np.atleast_2d(inputs.astype('float32')))

    # Function to train the network using replay experience training
    @tf.function
    def train(self, TargetNet):
        # exit if not enough experiences are saved
        if len(self.experience['s']) < self.min_experiences:
            return 0

        # pick batch_size random ints to select
        ids = np.random.randint(low=0,
                                high=len(self.experience['s']),
                                size=self.batch_size)

        # Separate the quintuples per category
        states = np.asarray([self.experience['s'][i] for i in ids])
        actions = np.asarray([self.experience['a'][i] for i in ids])
        rewards = np.asarray([self.experience['r'][i] for i in ids])
        states_next = np.asarray([self.experience['s2'][i] for i in ids])
        dones = np.asarray([self.experience['done'][i] for i in ids])

        # calculate the value of the next states and fill them into the Bellman equation to get the actual values.
        value_next = np.max(TargetNet.predict(states_next), axis=1)
        actual_values = np.where(dones, rewards,
                                 rewards + self.gamma * value_next)

        # compute the loss under a gradient tape, then apply the gradients
        with tf.GradientTape() as tape:
            selected_action_values = tf.math.reduce_sum(
                self.predict(states) * tf.one_hot(actions, self.num_actions),
                axis=1)
            loss = tf.math.reduce_sum(
                tf.square(actual_values - selected_action_values))

        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

    def get_action(self, states, epsilon):
        # balance exploration and exploitation with epsilon and random choice
        # implementation of epsilon-greedy behaviour
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.predict(np.atleast_2d(states))[0])

    def add_experience(self, exp):
        if len(self.experience['s']) >= self.max_experiences:
            for key in self.experience.keys():
                self.experience[key].pop(0)

        for key, value in exp.items():
            self.experience[key].append(value)

    def copy_weights(self, TrainNet):
        variables1 = self.model.trainable_variables
        variables2 = TrainNet.model.trainable_variables
        for v1, v2 in zip(variables1, variables2):
            v1.assign(v2.numpy())

    def save_model(self, save_path):
        self.model.save_weights(save_path, save_format='tf')

    def load_model(self, path):
        self.model.load_weights(path)
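
# A minimal sketch (not part of the original code) of the usual two-network
# training loop for the DQN class above. `env` stands in for any environment
# with reset()/step(); its exact signatures and all hyperparameter values here
# are placeholders, not something the original code defines.
TrainNet = DQN(num_actions=2, gamma=0.99, max_experiences=10000,
               min_experiences=100, batch_size=32, lr=1e-3,
               hidden_units=[128, 128], num_states=4)
TargetNet = DQN(num_actions=2, gamma=0.99, max_experiences=10000,
                min_experiences=100, batch_size=32, lr=1e-3,
                hidden_units=[128, 128], num_states=4)

copy_every = 25                              # how often to sync the target network
epsilon = 0.1

for episode in range(500):
    state, done, step = env.reset(), False, 0
    while not done:
        action = TrainNet.get_action(state, epsilon)
        next_state, reward, done = env.step(action)          # placeholder signature
        TrainNet.add_experience({'s': state, 'a': action, 'r': reward,
                                 's2': next_state, 'done': done})
        TrainNet.train(TargetNet)            # learn from a random replay batch
        if step % copy_every == 0:
            TargetNet.copy_weights(TrainNet) # copy training weights into the target
        state, step = next_state, step + 1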
Example #9
class NNAgent:
    def __init__(self, state_shape, action_size):
        self.learning_rate = 0.001
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = 0.999
        self.epsilon = 0.01
        self.lamb = 0.99

        board_shape = state_shape[:2]
        self.board_shape = board_shape

        self.value_model = AgentModel("value", board_shape)
        self.target_value_model = AgentModel("target_value", board_shape)

        self.value_model.build(input_shape=(None,) + board_shape)
        self.target_value_model.build(input_shape=(None,) + board_shape)

        for var, var_target in zip(
            self.value_model.trainable_variables,
            self.target_value_model.trainable_variables,
        ):
            var.assign(var_target)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        self.loss_function = tf.keras.losses.MeanSquaredError()

    # Pick a move given the game state. The move is selected by listing out all
    # possible places to place the current piece and evaluating each resulting
    # board with the neural net; it returns the sequence of movements that
    # leads to the state which maximizes the predicted future total reward.
    def act(self, info, randomize=False):
        x = info["piece_x"]
        y = info["piece_y"]
        shape = info["piece_shape"]
        rotation = info["piece_rotation"]

        board = info["board"]

        start = Board(board, shape, x, y, rotation)

        visited, finals = start.list_paths()

        if not finals:
            print(info)
            raise RuntimeError("No finals, this shouldn't happen")

        boards = []
        rewards = []
        for node in finals:
            boards.append(node.board)
            rewards.append(node.reward())

        boards = np.array(boards, dtype=np.float32)
        scores = self._eval_many(boards)

        best_end = None
        best_end_score = None
        if not randomize or random.random() > self.epsilon:
            for i, node in enumerate(finals):
                score = rewards[i] + self.gamma * scores[i]
                if best_end is None or score > best_end_score:
                    best_end = node
                    best_end_score = score
        else:
            best = random.randint(0, len(boards) - 1)
            best_end = finals[best]
            best_end_score = scores[best]

        node = best_end
        actions = []
        while node != start:
            (action, next_node) = visited[node.tup()]
            if action != Moves.DROP:
                actions.append(action)
            node = next_node
        actions.reverse()

        return actions, best_end_score

    def _eval(self, board):
        return self._eval_many(np.array([board]))[0]

    def _eval_many(self, boards):
        boards = tf.convert_to_tensor(boards, dtype=np.float32)
        return self.value_model(boards).numpy().reshape(-1)

    def _make_features(self, memory, batch_size):
        boards, next_boards, rewards, dones = memory.sample(batch_size)

        boards = tf.convert_to_tensor(boards, dtype=np.float32)
        boards_next = tf.convert_to_tensor(next_boards, dtype=np.float32)
        not_dones = tf.convert_to_tensor(1 - dones, dtype=np.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=np.float32)

        predictions = tf.reshape(self.target_value_model(boards_next), [-1])
        targets = predictions * self.gamma * not_dones + rewards

        return boards, targets

    def trainable_variables(self):
        return (
            self.value_model.trainable_variables
            + self.target_value_model.trainable_variables
        )

    def save_model(self, filename):
        tensors = {v.name: v.numpy() for v in self.trainable_variables()}
        np.savez(filename, **tensors)

    def load_model(self, filename):
        tensors = np.load(filename)
        for v in self.trainable_variables():
            name = v.name
            v.assign(tensors[name])

    def train(self, memory, batch_size):
        features, target = self._make_features(memory, batch_size)

        with tf.GradientTape() as tape:
            predictions = self.value_model(features, training=True)
            loss = self.loss_function(target, predictions)
        gradients = tape.gradient(loss, self.value_model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.value_model.trainable_variables)
        )
        for a, b in zip(
            self.value_model.trainable_variables,
            self.target_value_model.trainable_variables,
        ):
            b.assign(b * self.lamb + a * (1.0 - self.lamb))
        del tape
        return {
            "loss": loss.numpy(),
            "mean target value": target.numpy().mean(),
        }
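
# A minimal sketch (not part of the original code) of how NNAgent might be
# driven. `env` and `ReplayMemory` are assumptions: env.reset()/env.step() are
# expected to yield the `info` dict that act() reads (board, piece_x, piece_y,
# piece_shape, piece_rotation), and ReplayMemory must provide the
# sample(batch_size) -> (boards, next_boards, rewards, dones) interface that
# train() consumes; its store() call below is likewise hypothetical.
nn_agent = NNAgent(state_shape=(20, 10, 1), action_size=len(Moves))
memory = ReplayMemory(capacity=50000)            # hypothetical replay buffer

for episode in range(1000):
    info, done = env.reset(), False
    while not done:
        actions, _ = nn_agent.act(info, randomize=True)   # epsilon-greedy rollout
        prev_board = info["board"]
        for action in actions:
            info, reward, done = env.step(action)         # placeholder signature
        memory.store(prev_board, info["board"], reward, done)
        nn_agent.train(memory, batch_size=64)
    if episode % 100 == 0:
        nn_agent.save_model("value_model_%d.npz" % episode)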