cumulated_reward = 0  # Should going forward give more reward than L/R?
print("Episode = " + str(x))
observation = env.reset()

if qlearn.epsilon > 0.05:
    qlearn.epsilon *= epsilon_discount

env.render()

state = ''.join(map(str, observation))

for i in range(1000):
    # Pick an action based on the current state
    action = qlearn.chooseAction(state)

    # Execute the action and get feedback
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward

    if highest_reward < cumulated_reward:
        highest_reward = cumulated_reward

    nextState = ''.join(map(str, observation))

    qlearn.learn(state, action, reward, nextState)

    if not(done):
        state = nextState
    else:
        break
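All of the training loops in this section drive a shared `qlearn` object through `chooseAction` and `learn`. The class itself is not reproduced here, so the following is only a minimal sketch of a tabular, epsilon-greedy Q-learning agent with that interface; the hyperparameter defaults and method bodies are assumptions.

import random

class QLearn:
    # Minimal tabular Q-learning agent (sketch); hyperparameter defaults are assumptions.
    def __init__(self, actions, epsilon=0.9, alpha=0.2, gamma=0.8):
        self.q = {}              # Q-table: (state, action) -> value
        self.actions = actions   # list of discrete action indices
        self.epsilon = epsilon   # exploration rate, decayed by epsilon_discount in the loops
        self.alpha = alpha       # learning rate
        self.gamma = gamma       # discount factor

    def getQ(self, state, action):
        return self.q.get((state, action), 0.0)

    def chooseAction(self, state):
        # Explore with probability epsilon, otherwise act greedily on the Q-table.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        q_values = [self.getQ(state, a) for a in self.actions]
        return self.actions[q_values.index(max(q_values))]

    def learn(self, state, action, reward, next_state):
        # One-step Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        best_next = max(self.getQ(next_state, a) for a in self.actions)
        old_q = self.getQ(state, action)
        self.q[(state, action)] = old_q + self.alpha * (reward + self.gamma * best_next - old_q)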
# for each episode, we test the robot for nsteps
for t in itertools.count():
    state_ = np.zeros(2)
    state_[0] = int(np.digitize(state[0], horizontal_bins[0]))
    state_[1] = int(np.digitize(state[1], horizontal_bins[1]))

    # Clip the discretized state to the grid
    for j in range(2):
        if state_[j] < 0:
            state_[j] = 0
        elif state_[j] > env.shape[j] - 1:
            state_[j] = env.shape[j] - 1

    # Pick an action based on the current state
    action = qlearn.chooseAction(tuple(state_))

    # Execute the action in the environment and get feedback
    next_state, reward, done, info = env.step(action)
    cumulated_reward += reward

    next_state_ = np.zeros(2)
    next_state_[0] = int(np.digitize(next_state[0], horizontal_bins[0]))
    next_state_[1] = int(np.digitize(next_state[1], horizontal_bins[1]))

    # Clip the discretized next state the same way
    for j in range(2):
        if next_state_[j] < 0:
            next_state_[j] = 0
        elif next_state_[j] > env.shape[j] - 1:
            next_state_[j] = env.shape[j] - 1
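horizontal_bins and env.shape are defined outside this excerpt. One plausible construction of the bin edges, using placeholder workspace limits and grid size, is sketched below.

import numpy as np

# Hypothetical workspace limits and grid size; the real values are not shown in this excerpt.
x_range, y_range = (-5.0, 5.0), (-5.0, 5.0)
n_bins = (20, 20)  # would correspond to env.shape in the loop above

horizontal_bins = [
    np.linspace(x_range[0], x_range[1], n_bins[0] - 1),  # bin edges along x
    np.linspace(y_range[0], y_range[1], n_bins[1] - 1),  # bin edges along y
]

# With n - 1 edges, np.digitize returns an index in [0, n - 1], so the clipping
# in the loop above acts as a safety check on the discretized coordinates.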
for x in range(total_episodes):
    done = False
    cumulated_reward = 0  # Should going forward give more reward than L/R?
    observation = env.reset()

    if qlearn.epsilon > 0.05:
        qlearn.epsilon *= epsilon_discount

    step_counter = 0
    for i in range(1500):
        step_counter += 1
        binarized_observation = binarize_observation(observation)
        action = qlearn.chooseAction(binarized_observation)

        newObservation, reward, done, info = env.step(action)
        binarized_new_observation = binarize_observation(newObservation)
        cumulated_reward += reward

        if highest_reward < cumulated_reward:
            highest_reward = cumulated_reward

        qlearn.learn(binarized_observation, action, reward, binarized_new_observation)

        if not (done):
            observation = newObservation
        else:
            last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
            break
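binarize_observation is not shown in this excerpt. A typical implementation for a laser-scan observation thresholds each range reading and joins the flags into a hashable string key; the threshold below is a placeholder.

def binarize_observation(observation, threshold=1.0):
    # Hypothetical discretization: '0' if the reading is closer than `threshold`
    # (obstacle), '1' otherwise (free space). The project's actual binning may differ.
    return ''.join('0' if r < threshold else '1' for r in observation)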
for t in range(total_steps):
    step_counter += 1
    [image, laser] = observation

    # Locate the target in the camera image; fall back to hint points if it is not found
    hint_pos = getTargetPoints(image)
    if hint_pos == 0:
        hint_pos = getHintPoints(image)

    height, width, depth = image.shape
    image_action = get_image_action(width, hint_pos)
    laser_action = get_expert_action(laser)

    # The Q-learner arbitrates between the image-based and laser-based actions
    state = threshold_laser(laser, threshold_value)
    rl_choice = qlearn.chooseAction(state)
    if rl_choice == 1:
        action = 20 - image_action
    else:
        action = laser_action

    # Execute the chosen action and get feedback
    next_observation, reward, done, info = env.step(action)
    cumulated_reward += reward

    next_image, next_laser = next_observation
    next_state = threshold_laser(next_laser, threshold_value)

    qlearn.learn(state, rl_choice, reward, next_state)

    if not done:
        observation = next_observation
    else:
        break
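threshold_laser plays the same role for this hybrid controller: it reduces the raw scan to a small discrete state that indexes the Q-table choosing between the image-based and laser-based actions. A minimal sketch, assuming threshold_value is a range in metres:

def threshold_laser(laser, threshold_value):
    # Hypothetical discretization: each reading becomes 1 (obstacle closer than
    # threshold_value) or 0 (clear); the tuple of flags is the Q-table key.
    # The actual implementation may differ.
    return tuple(1 if r < threshold_value else 0 for r in laser)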