# Outer iteration: every pass starts from a fresh env.reset().
for m in range(M):
    # Receive initial observation and reshape it into a single-row batch,
    # which is the form fed to actor.net below.
    s = env.reset()
    explore_variance = 2  # initial exploration variance, re-set on every outer pass
    s = nd.array(s).reshape((1, -1))
    inner_time = time.time()

    # Inner iteration: one environment step per pass.
    for t in range(T):
        # Generate action from action net and add exploring variation.
        action = actor.net(s)
        # .asscalar() requires the first output row to hold exactly one value.
        action = action[0].asscalar()
        # NOTE(review): explore_variance is passed as the second positional
        # argument of nd.random.normal, i.e. its `scale` (standard deviation),
        # not a variance -- confirm the naming is intentional.
        # NOTE(review): the clip bounds -2/2 are hard-coded; presumably they
        # are the environment's action limits -- confirm.
        action = nd.clip(nd.random.normal(action, explore_variance), -2, 2)
        action = action.asnumpy()

        # Get info of next state and record the (s, a, r, s') transition.
        s_, r, done, info = env.step(action)
        memory.store_transition(s[0].asnumpy(), action, r, s_)

        if memory.pointer > buffer_size:
            # Decrease exploring area; multiplying by 1. means no decrease.
            explore_variance *= 1.

            # Sample