Example #1
def next_state(self, env):               # Execute a random action in the given env and return the resulting environment state as a node State
    a = env.action_space.sample()
    if isinstance(a, np.ndarray):
        a = a.astype(np.float32)         # cast the sampled action so the env accepts it
    nextmove = [a]                       # wrap the action as a single-element move list
    obs, r, done, info = env.step(nextmove)
    next_s = State(info, obs, self.rew + r, done)   # accumulate the path reward into the child state
    return next_s
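The State constructor used above is not defined in these excerpts; a minimal sketch consistent with how it is called here is shown below (anything beyond the fields visible in the snippets is an assumption).

class State:
    # Sketch only: per-node bookkeeping matching the call State(info, obs, rew, done).
    def __init__(self, info, obs, rew, done):
        self.info = info      # info dict returned by env.step
        self.obs = obs        # latest observation
        self.rew = rew        # reward accumulated along the path to this node
        self.done = done      # termination flag reported by the env

    def terminal(self):
        # A state is terminal once the environment has signalled done.
        return self.done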
Example #2
def HOOSTEP(a, tau, env):
    # Repeat action a for tau consecutive env steps, accumulating the reward.
    rew = 0
    logger.debug("HOOSTEP")
    for i in range(tau):
        obs, r, done, info = env.step(a)
        # env.render()
        rew += r
        if done:
            break
    return obs, rew, done, info
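A hypothetical call, assuming the classic Gym API whose step() returns four values (as the snippets above do):

import gym

env = gym.make("Pendulum-v0")               # any env with the old 4-tuple step API
env.reset()
a = env.action_space.sample()
obs, rew, done, info = HOOSTEP(a, 4, env)   # hold the sampled action for 4 frames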
Example #3
def UPDATELEGACY(node, env):
    # Recursively refresh the cached env snapshot, reward, and done flag of every
    # node by replaying its last move from the parent's restored environment.
    parent = node.parent
    if parent:
        RESTOREENV(env, parent.state.envState)
        obs, r, done, info = env.step(node.state.moves[-1])
        node.state.envState = CLONEENV(env)
        node.state.rew = parent.state.rew + r
        node.state.done = done
    for c in node.children:
        UPDATELEGACY(c, env)
    return
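RESTOREENV and CLONEENV are not defined in these excerpts; a minimal sketch, assuming the environment state can simply be deep-copied, follows. Real environments often need a dedicated save/restore mechanism instead.

import copy

def CLONEENV(env):
    # Snapshot the environment's internal state (assumption: deepcopy is enough).
    return copy.deepcopy(env)

def RESTOREENV(env, envState):
    # Overwrite the live env's attributes with a previously cloned snapshot.
    env.__dict__.update(copy.deepcopy(envState).__dict__)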
Example #4
def DEFAULTPOLICY(state, depth, env):    # Rollout simulation for the allowed horizon
    logger.debug("DEFAULTPOLICY")
    t = depth
    reward = state.rew
    done = state.terminal()
    # RESTOREENV(env, state.envState)
    while not done and t < DEPTH_MAX:
        a = env.action_space.sample()    # random rollout policy
        if isinstance(a, np.ndarray):
            a = a.astype(np.float32)
        nextmove = [a]
        obs, r, done, info = env.step(nextmove)
        reward += r * (0.99 ** t)        # discounted return, gamma = 0.99
        t += 1
    if done:
        env.reset()
    return reward
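The value DEFAULTPOLICY returns is the node's accumulated reward plus a discounted sum of rollout rewards, with the discount exponent starting at the node's depth. A small worked illustration with made-up numbers:

gamma, depth = 0.99, 3
state_rew = 1.2                                     # hypothetical reward already on the node
rollout_rewards = [1.0, 0.5, 0.0, 2.0]              # hypothetical per-step rollout rewards
value = state_rew + sum(r * gamma ** (depth + k) for k, r in enumerate(rollout_rewards))
print(value)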
Example #5
                tau = current_node.state.tau[-1]
                logger.info(" Selected a and tau [%s, %s]" % (a[0], tau))
                prev_root = current_node.parent
                for c in prev_root.children:
                    logger.debug(" [%s %s] visit %s, value %s" % (c.state.moves[-1], c.state.tau[-1], c.visits, c.reward / c.visits))
                lead_node = current_node

            # Build fresh HOO trees and per-action visit/reward counters for the next root.
            Hroot = hoo.HooNode([0, T_MAX])
            new_Hroots = [Hroot for c in range(action_space.n)]
            n_i = [0 for c in range(action_space.n)]
            r_i = [0 for c in range(action_space.n)]
            #    current_node = Node(State(None,state,n_act=action_space.n,Hroot=new_Hroots, n_act_i=n_i, r_act_i=r_i, rState=True))
            if tau == 1:
                current_node = lead_node           # For tau=1 after initial selections

                # Apply the selected action in the real environment, then promote the
                # chosen child to be the new search root with a fresh env snapshot.
                obs, r, done, info = env.step(a)
                #    env.render()
                rew += r
                t += 1
                state = CLONEENV(env)

                current_node.state.envState = state
                current_node.state.rew = test_r * args.reward_scale_factor
                #     current_node.state.done = done
                current_node.parent = None
                current_node.state.Hroots = new_Hroots
                current_node.state.n_act_i = n_i
                current_node.state.r_act_i = r_i
                current_node.state.n_act_icp = n_i
                current_node.state.Hfront = None
                current_node.state.rootState = True
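# Sketch (assumption, not the project's hoo module): hoo.HooNode([0, T_MAX]) above
# is taken to be a Hierarchical Optimistic Optimization node covering a continuous
# range (here the action-repeat parameter tau); a node halves its interval when
# expanded, so deeper nodes cover narrower ranges.
class HooNode:
    def __init__(self, interval, parent=None):
        self.interval = interval        # [low, high] range covered by this node
        self.parent = parent
        self.children = []
        self.visits = 0
        self.value = 0.0

    def expand(self):
        low, high = self.interval
        mid = (low + high) / 2.0
        self.children = [HooNode([low, mid], self), HooNode([mid, high], self)]
        return self.children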

# Run the agent for one evaluation episode (act only, no training update).
for i in range(1, 1 + 1):
    obs = env.reset()
    #obs = resize(rgb2gray(env.reset()),(80,80))
    #obs = obs[np.newaxis, :, :]

    reward = 0
    done = False
    R = 0

    while not done:
        action = agent.act(obs)
        #action = agent.act_and_train(obs, reward)
        obs, reward, done, _ = env.step(action)
        #obs = resize(rgb2gray(obs), (80, 80))
        #obs = obs[np.newaxis, :, :]
        R += reward          # accumulate the episode return

    agent.stop_episode()


last_time = datetime.datetime.now()

filename = "toreplace"
env.set_window(False)


print("Starting the training!")