def test(test_data, model, tickers, randomize, num_rand_stocks=0):
    """
    the test function: test agent on test_data
    if randomize == True, we know we have randomized stocks in training, 
    so we test on num_rand_stocks randomly selected stocks from test_data
    if otherwise, we used the entire test_data set

    :param test_data: the testing set
    :param model: the trained model
    :param tickers: stocks tickers corresponding to test_data, including "CASH"
    :param randomize: boolean indicating whether we have randomized stocks
    :param num_rand_stocks: number of stocks randomized in training
    """
    if randomize:
        # the last element of tickers is "CASH"; exclude it from randomization
        rand_stock_indices = np.random.choice(len(tickers) - 1,
                                              num_rand_stocks,
                                              replace=False)
        # get the randomly selected stock names
        episode_tickers = [tickers[index] for index in rand_stock_indices]
        episode_tickers.append("CASH")
        # reshape to [num_rand_stocks, 1] so gather_nd selects whole stock rows
        rand_stock_indices = tf.reshape(rand_stock_indices,
                                        (len(rand_stock_indices), 1))
        # gather the selected rows into a new tensor so test_data is left unmodified
        episode_input = tf.gather_nd(test_data, rand_stock_indices)
    else:
        episode_input = test_data
        episode_tickers = tickers

    env = StockEnv(episode_input, episode_tickers, is_testing=True)
    states, actions, rewards = env.generate_episode(model)
    # regenerate until the episode is long enough to evaluate meaningfully
    min_testing_episode_len = 20
    while len(rewards) < min_testing_episode_len:
        print("test episode not long enough, regenerating")
        states, actions, rewards = env.generate_episode(model)
    print(f'final portfolio total value: {rewards[-1]}')
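

# A minimal usage sketch for test(); the data loader and model constructor
# shown here ("load_dataset", "PolicyGradientModel") are hypothetical
# placeholders, not names defined in this repo:
#
#     test_data, tickers = load_dataset("test")  # [num_stocks, num_days, datum_size]
#     model = PolicyGradientModel()
#     test(test_data, model, tickers, randomize=True, num_rand_stocks=5)
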
def train(train_data,
          model,
          tickers,
          randomize,
          num_rand_stocks=0,
          episode_max_days=200):
    """
    the train function, train the model for an entire epoch

    :param train_data: the preprocessed training data, of shape [num_stocks, num_days, datum_size]
    :param model: the model to be trained
    :param tickers: stock tickers corresponding to train_data, including "CASH"
    :param randomize: boolean indicating whether we have randomized stocks
    :param num_rand_stocks: number of stocks randomized in training
    :param episode_max_days: the maximum number of days of trading actions in an episode

    :return losses and rewards
    """
    num_days = train_data.shape[1]
    loss_list = []

    offset = model.past_num - 1  # extra days of price history needed at beginning
    start = 0  # start of price history slice (inclusive)
    end = start + episode_max_days + offset  # end of price history slice (exclusive)
    num_episodes = (num_days - offset) // episode_max_days
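    # Worked example (assuming past_num = 10, episode_max_days = 200,
    # num_days = 1000): offset = 9, so episode 0 slices days [0, 209) and
    # episode 1 slices days [200, 409); the `offset` extra leading days give
    # the model past_num days of price history before its first action, and
    # num_episodes = (1000 - 9) // 200 = 4.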

    # rewards (total portfolio value per step) collected across all episodes
    rewards_list = []

    for episode in range(num_episodes):
        print(f"Training episode {episode+1} of {num_episodes}")

        if randomize:
            # the last element of tickers is "CASH"; exclude it from randomization
            rand_stock_indices = np.random.choice(len(tickers) - 1,
                                                  num_rand_stocks,
                                                  replace=False)
            # get the randomly selected stock names
            episode_tickers = [tickers[index] for index in rand_stock_indices]
            episode_tickers.append("CASH")
            # reshape to [num_rand_stocks, 1] so gather_nd selects whole stock rows
            rand_stock_indices = tf.reshape(rand_stock_indices,
                                            (len(rand_stock_indices), 1))
            # gather the selected rows into a new tensor so train_data is left unmodified
            episode_input = tf.gather_nd(train_data, rand_stock_indices)
        else:
            episode_input = train_data
            episode_tickers = tickers

        # Slice of pricing history to generate this episode on
        episode_input = episode_input[:, start:end, :]
        start += episode_max_days
        end += episode_max_days

        # (REMOVED) randomize starting date in each episode
        # rand_start = randint(0, int(episode_max_days / 5))
        # episode_input = episode_input[:, rand_start:episode_max_days,:]

        # Environment hyperparameters; adjust as needed:
        env = StockEnv(episode_input,
                       episode_tickers,
                       interest_annual=0.1,
                       borrow_interest_annual=0.2,
                       transaction_penalty=0.0001)

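        # One REINFORCE-style update per episode: generate a trajectory,
        # discount its rewards, mix in replayed experience, and take one
        # optimizer step on the policy-gradient loss.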
        with tf.GradientTape() as tape:
            states, actions, rewards = env.generate_episode(model)
            rewards_list.extend(rewards)
            discounted_rewards = discount(rewards)
            model.remember(states, actions, discounted_rewards)
            (repl_states, repl_actions,
             repl_discounted_rewards) = model.experience_replay()
            model_loss = model.loss(repl_states, repl_actions,
                                    repl_discounted_rewards)
        gradients = tape.gradient(model_loss, model.trainable_variables)
        model.optimizer.apply_gradients(
            zip(gradients, model.trainable_variables))
        loss_list.append(model_loss.numpy())  # record this episode's loss
    return loss_list, rewards_list
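

# For reference, a minimal sketch of what the `discount` helper called in
# train() typically computes: standard discounted returns
# G_t = r_t + gamma * G_{t+1}, accumulated back-to-front. The project's
# actual `discount` is defined elsewhere and may differ (e.g. in its gamma
# or any normalization), so this illustrative version uses a distinct name:
def _discount_sketch(rewards, gamma=0.99):
    """Illustrative only: discounted returns over a reward sequence."""
    discounted = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted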