Example #1
    import numpy as np
    import torch

    # Load the best saved model weights
    model_name = 'model/state_dict2.pt'
    policy_net.load_state_dict(torch.load(model_name))
    policy_net.to(device)

    policy_net.eval()  # eval mode disables dropout (train: uses dropout, eval: does not)
    state = test_env.reset()
    log_probs = []
    rewards = []
    profits = []
    hold_profits = []

    for steps in range(test_env.steps_left):
        if steps % 500 == 0:
            test_env.render()

        with torch.no_grad():
            action, log_prob = policy_net.get_action(state, device)
            new_state, reward, done, _ = test_env.step(action)

            log_probs.append(log_prob)
            rewards.append(reward)
            profits.append(test_env._get_profit())
            hold_profits.append(test_env._get_hold_profit())

            state = new_state  # advance to the next observation

            if done:
                print('\nTrading session terminated:')
                print(f"total reward: {np.round(np.sum(rewards), decimals=3)}")
                print(f"bot profit: {profits[-1]}")
                print(f"hold profit: {hold_profits[-1]}")
Example #2
    import numpy as np

    max_episode_num = 1_000
    all_rewards = [0]
    avg_rewards = [0]

    policy_net.train()  # train mode enables dropout (train: uses dropout, eval: does not)
    for episode in range(max_episode_num):
        state = train_env.reset()
        log_probs = []
        rewards = []
        profits = []
        hold_profits = []

        for steps in range(train_env.steps_left):
            if steps % 500 == 0:
                train_env.render()

            action, log_prob = policy_net.get_action(state, device)
            new_state, reward, done, _ = train_env.step(action)

            log_probs.append(log_prob)
            rewards.append(reward)
            profits.append(train_env._get_profit())
            hold_profits.append(train_env._get_hold_profit())

            state = new_state  # advance to the next observation

            if done:
                # Evaluate model
                all_rewards.append(np.mean(rewards))
                avg_rewards.append(np.mean(all_rewards[-10:]))

                if all_rewards[-1] > all_rewards[-2]:
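The training example is cut off above, before any parameter update. Code that accumulates log_probs and rewards per episode like this is typically finished with a REINFORCE-style policy-gradient step; the sketch below shows one common way to do that and assumes a discount factor gamma and an optimizer (e.g. torch.optim.Adam over policy_net.parameters()) that do not appear in the original snippet.

    import torch

    def reinforce_update(optimizer, log_probs, rewards, gamma=0.99):
        # Discounted returns, computed from the end of the episode backwards.
        returns = []
        G = 0.0
        for r in reversed(rewards):
            G = r + gamma * G
            returns.insert(0, G)
        returns = torch.tensor(returns)
        # Normalising returns is a common variance-reduction trick.
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Policy-gradient loss: -sum(log_prob * return).
        loss = -torch.stack([lp * g for lp, g in zip(log_probs, returns)]).sum()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()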
Example #3
              c_out=test_env.action_space.n,
              seq_len=test_env.observation_space.shape[1])

#acer = ACER(model=model, memory=memory, config=acer_config)

import os
import torch
from torch.distributions import Categorical

if os.path.exists('./save/model.m5'):
    model.load_state_dict(torch.load('./save/model.m5'))

avg_t = 0
avg_r = 0

obs = test_env.reset()
done = False

while not done:  # step until the test episode ends so the final print is reached
    prob = model.pi(torch.FloatTensor(obs).unsqueeze(0))
    action = Categorical(prob).sample().item()
    obs_prime, reward, done, _ = test_env.step(action)

    if reward is None:
        continue

    print("current price : {}".format(test_env._get_current_price()))
    print("action : {}".format(action))
    print("prob : {}".format(prob))

    test_env.render(mode='human')

    obs = obs_prime

print('Finished.')
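This loop relies on the model exposing a pi() method that returns action probabilities for a batched observation. A minimal actor-critic sketch with such a head is shown below; only the pi() name, the (c_out, seq_len) constructor arguments, and the softmax output are taken from the example, while the layer sizes and the value head are assumptions.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ActorCritic(nn.Module):
        def __init__(self, c_in, c_out, seq_len, hidden=128):
            super().__init__()
            # Shared body over the (c_in, seq_len) observation window.
            self.body = nn.Sequential(
                nn.Flatten(),
                nn.Linear(c_in * seq_len, hidden),
                nn.ReLU(),
            )
            self.pi_head = nn.Linear(hidden, c_out)  # actor: action logits
            self.v_head = nn.Linear(hidden, 1)       # critic: state value

        def pi(self, x):
            # Probability distribution over actions for a batch of observations.
            return F.softmax(self.pi_head(self.body(x)), dim=-1)

        def v(self, x):
            return self.v_head(self.body(x))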