for i in range(1000):
    # Pick an action based on the current state
    action = qlearn.chooseAction(state)

    # Execute the action and get feedback
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward

    if highest_reward < cumulated_reward:
        highest_reward = cumulated_reward

    nextState = ''.join(map(str, observation))

    qlearn.learn(state, action, reward, nextState)

    if not done:
        state = nextState
    else:
        print("DONE")
        last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
        break

m, s = divmod(int(time.time() - start_time), 60)
h, m = divmod(m, 60)
print("EP: " + str(x + 1) +
      " - [alpha: " + str(round(qlearn.alpha, 2)) +
      " - gamma: " + str(round(qlearn.gamma, 2)) +
      " - epsilon: " + str(round(qlearn.epsilon, 2)) +
      "] - Reward: " + str(cumulated_reward) +
      " Time: %d:%02d:%02d" % (h, m, s))
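# These training loops call qlearn.chooseAction() and qlearn.learn() and read
# qlearn.alpha, qlearn.gamma, qlearn.epsilon and qlearn.q, but the agent class itself
# is not shown in this excerpt. Below is only a minimal sketch of a tabular,
# epsilon-greedy Q-learning agent with that interface; the constructor defaults are
# assumptions, not the original implementation.
import random

class QLearn:
    def __init__(self, actions, alpha=0.2, gamma=0.8, epsilon=0.9):
        self.q = {}                # Q-table: (state, action) -> value
        self.actions = actions     # discrete action ids
        self.alpha = alpha         # learning rate
        self.gamma = gamma         # discount factor
        self.epsilon = epsilon     # exploration rate

    def getQ(self, state, action):
        return self.q.get((state, action), 0.0)

    def chooseAction(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        q_values = [self.getQ(state, a) for a in self.actions]
        return self.actions[q_values.index(max(q_values))]

    def learn(self, state, action, reward, next_state):
        # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        best_next = max(self.getQ(next_state, a) for a in self.actions)
        old_value = self.getQ(state, action)
        self.q[(state, action)] = old_value + self.alpha * (reward + self.gamma * best_next - old_value)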
if qlearn.epsilon > 0.02:
    qlearn.epsilon *= 0.999
print(qlearn.epsilon)

for i in range(100):
    print("New step " + str(i))

    action = qlearn.chooseAction(state)
    act1.append(action)

    new_state, reward, done, info = env.step(action)
    total_reward += reward
    print("State we were in: " + str(state) + ", action we took: " + str(action) +
          ", state we ended up in: " + str(new_state) + ", reward we obtained: " + str(reward) +
          ", total reward so far: " + str(total_reward))

    if highest_reward < total_reward:
        highest_reward = total_reward
        act = act1

    qlearn.learn(state, action, reward, new_state)

    if done:
        break
    state = new_state

episode.append(x)
result.append(total_reward)
print("Episode #" + str(x) + " has ended, total reward is " + str(total_reward))

fa.write(str(act))
f.write(str(qlearn.q))
f.close()
fa.close()

plt.plot(episode, result)
plt.show()
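# The decay above multiplies epsilon by 0.999 per episode until it reaches the 0.02
# floor. A quick sanity check of how many episodes that takes; the starting value of
# 0.9 is an assumption, since the initial epsilon is not shown in this excerpt.
import math

eps0 = 0.9  # assumed initial exploration rate
episodes_to_floor = math.ceil(math.log(0.02 / eps0) / math.log(0.999))
print(episodes_to_floor)  # ~3805 episodes before epsilon stops decaying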
next_state_ = np.zeros(2)
#next_state_[0] = int(np.digitize(next_state[2], vertical_bin))  # z first
next_state_[0] = int(np.digitize(next_state[0], horizontal_bins[0]))
next_state_[1] = int(np.digitize(next_state[1], horizontal_bins[1]))

# Clip the discretised indices to the grid bounds
for j in range(2):
    if next_state_[j] < 0:
        next_state_[j] = 0
    elif next_state_[j] > env.shape[j] - 1:
        next_state_[j] = env.shape[j] - 1

print("Go into state: ", next_state_, " from state: ", state_, " by action: ", stringify(action))

# Make the algorithm learn based on the results
qlearn.learn(tuple(state_), action, reward, tuple(next_state_))

if not done:
    state = next_state
else:
    rospy.loginfo("DONE")
    last_time_steps = np.append(last_time_steps, [int(t + 1)])
    reward_msg = RLExperimentInfo()
    reward_msg.episode_number = x
    reward_msg.episode_reward = cumulated_reward
    reward_pub.publish(reward_msg)
    break

m, s = divmod(int(time.time() - start_time), 60)
h, m = divmod(m, 60)
rospy.loginfo(
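# horizontal_bins and env.shape are used above but not defined in this excerpt. A
# minimal sketch of how such bin edges could be built for np.digitize; the grid size
# and coordinate ranges are assumptions, not values from the original script.
import numpy as np

grid_shape = (21, 21)                                        # assumed x/y discretisation
horizontal_bins = np.zeros((2, grid_shape[0]))
horizontal_bins[0] = np.linspace(-5.0, 5.0, grid_shape[0])   # x bin edges
horizontal_bins[1] = np.linspace(-5.0, 5.0, grid_shape[1])   # y bin edges

# np.digitize maps a continuous coordinate to a bin index, which the loop above then
# clips into [0, env.shape[j] - 1].
x_index = int(np.digitize(1.3, horizontal_bins[0]))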
print("Teaching...") for x in range(teach_episodes): done = False observation = env.reset() #render() #defined above, not env.render() step_counter = 0 for i in range(200): step_counter += 1 action = expert_action(observation) binarized_observation = binarize_observation(observation) newObservation, reward, done, info = env.step(action) binarized_new_observation = binarize_observation(newObservation) qlearn.learn(binarized_observation, action, reward, binarized_new_observation) if not (done): observation = newObservation else: last_time_steps = numpy.append(last_time_steps, [int(i + 1)]) break print("Learning...") for x in range(total_episodes): done = False cumulated_reward = 0 #Should going forward give more reward then L/R ? observation = env.reset()
state = threshold_laser(laser, threshold_value)

rl_choice = qlearn.chooseAction(state)
if rl_choice == 1:
    action = 20 - image_action
else:
    action = laser_action

next_observation, reward, done, info = env.step(action)
cumulated_reward += reward

next_image, next_laser = next_observation
next_state = threshold_laser(next_laser, threshold_value)

qlearn.learn(state, rl_choice, reward, next_state)

if not done:
    observation = next_observation
else:
    break

if episode % 100 == 0:
    qlearn.save_q()

m, s = divmod(int(time.time() - start_time), 60)
h, m = divmod(m, 60)
print("EP " + str(episode + 1) + ": " + str(step_counter) + " timesteps" +
      " - [alpha: " + str(round(qlearn.alpha, 2)) +
      " - gamma: " + str(round(qlearn.gamma, 2)) +
      " - epsilon: " + str(round(qlearn.epsilon, 2)) +
      "] - Reward: " +
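# threshold_laser() and qlearn.save_q() are used above but not defined in this
# excerpt. Two plausible sketches, offered only as illustrations of the idea: a
# binary laser state and a periodic Q-table checkpoint (the file name and pickle
# format are assumptions).
import pickle

def threshold_laser(laser_ranges, threshold_value):
    # Reduce each beam to a blocked/clear flag and return a hashable state tuple.
    return tuple(1 if r < threshold_value else 0 for r in laser_ranges)

def save_q(agent, path='qtable.pkl'):
    # Checkpoint the Q-table so a long training run can be resumed after a crash.
    with open(path, 'wb') as f:
        pickle.dump(agent.q, f)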