# NOTE(review): this chunk is a collapsed/truncated fragment of a larger
# training script. Names used but not defined here (defaultdict import,
# Tensor, env, model, memory, select_action, num_episodes, sample_period)
# come from the surrounding file. The inner loop's final `if done:` has no
# body visible in this chunk (presumably `break` — confirm in the full file).

# Maps a label -> list of values accumulated across episodes; entries are
# created lazily on first access (defaultdict(list)).
sigma_average_dict = defaultdict(list)
# Labels for the network's parameter groups (weights/biases of two hidden
# layers and the output layer) — presumably used as keys for
# sigma_average_dict or as plot legends; verify against the rest of the file.
components = ['W: first hidden', 'b: first hidden', 'W: second hidden', 'b: second hidden', \
              'W: output','b: output']

for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    # Wrap the raw state in a Tensor and add a leading batch dimension of 1.
    state = Tensor(env.get_state()).unsqueeze(0)
    score = 0
    # Episode is capped at 500 steps. `xrange` => this file targets Python 2.
    for t in xrange(500):
        # Select and perform an action
        if t % sample_period == 0:
            # Every sample_period steps, draw a fresh weight sample from the
            # model — model.sample() semantics are not visible here
            # (presumably a noisy/Bayesian network); confirm upstream.
            w_sample = model.sample()
        action = select_action(state)
        # env.do_action returns the scalar reward and a done flag;
        # action[0, 0] extracts the scalar action from the 1x1 batch tensor.
        reward, done = env.do_action(action[0, 0])
        score += reward
        # Re-wrap the reward as a 1-element Tensor for storage in the replay
        # memory.
        reward = Tensor([reward])

        # Observe new state
        if not done:
            next_state = Tensor(env.get_state()).unsqueeze(0)
        else:
            # Terminal transitions store None as the successor state, so the
            # learner can mask out the bootstrap target.
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        if done: