Example #1
        for i in range(1000):

            # Pick an action based on the current state
            action = qlearn.chooseAction(state)
            #print ("Action Chosen"+str(action))
            # Execute the action and get feedback
            observation, reward, done, info = env.step(action)
            cumulated_reward += reward
            #print ("Reward="+str(reward))
            if highest_reward < cumulated_reward:
                highest_reward = cumulated_reward

            nextState = ''.join(map(str, observation))

            qlearn.learn(state, action, reward, nextState)

            #env.monitor.flush(force=True)

            if not done:
                #print("NOT done")
                state = nextState
            else:
                print("DONE")
                last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
                break

        m, s = divmod(int(time.time() - start_time), 60)
        h, m = divmod(m, 60)
        print ("EP: "+str(x+1)+" - [alpha: "+str(round(qlearn.alpha,2))+" - gamma: "+str(round(qlearn.gamma,2))+" - epsilon: "+str(round(qlearn.epsilon,2))+"] - Reward: "+str(cumulated_reward)+"     Time: %d:%02d:%02d" % (h, m, s))
        if qlearn.epsilon > 0.02:
            qlearn.epsilon *= 0.999
        print(qlearn.epsilon)
        for i in range(100):
            print("New step "+str(i))
            action = qlearn.chooseAction(state)
            act1.append(action)
            new_state, reward, done, info = env.step(action)
            total_reward += reward
            print("States we were "+str(state)+" action we took "+str(action)+" state we occured "+\
            str(new_state)+" reward we obtained "+str(reward)+" and total reward is "+str(total_reward))
            if highest_reward < total_reward:
                highest_reward = total_reward
                act = list(act1)  # copy the best-performing action sequence so far
            qlearn.learn(state, action, reward, new_state)

            if done:
                break
            else:
                state = new_state
        episode.append(x)
        result.append(total_reward)
        print("Episode #"+str(x)+" has ended, total reward is "+str(total_reward))
    fa.write(str(act))
    f.write(str(qlearn.q))
    f.close()
    fa.close()
    plt.plot(episode, result)
    plt.show()
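Both this example and the ones below drive a small tabular Q-learning helper through qlearn.chooseAction(state) and qlearn.learn(state, action, reward, next_state), with alpha, gamma, and epsilon exposed as attributes. The class itself is not shown; the following is only a minimal sketch that is consistent with those calls, and the original implementation's exploration and update details may differ:

import random


class QLearn:
    """Minimal tabular Q-learning sketch; the original class may differ."""

    def __init__(self, actions, alpha=0.2, gamma=0.9, epsilon=0.9):
        self.q = {}                 # maps (state, action) -> Q-value
        self.actions = actions      # list of discrete actions
        self.alpha = alpha          # learning rate
        self.gamma = gamma          # discount factor
        self.epsilon = epsilon      # exploration rate

    def getQ(self, state, action):
        return self.q.get((state, action), 0.0)

    def chooseAction(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        values = [self.getQ(state, a) for a in self.actions]
        return self.actions[values.index(max(values))]

    def learn(self, state, action, reward, next_state):
        # Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
        best_next = max(self.getQ(next_state, a) for a in self.actions)
        old = self.getQ(state, action)
        self.q[(state, action)] = old + self.alpha * (reward + self.gamma * best_next - old)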
            next_state_ = np.zeros(2)

            #next_state_[0] = int(np.digitize(next_state[2],vertical_bin)) # z first
            next_state_[0] = int(np.digitize(next_state[0],
                                             horizontal_bins[0]))
            next_state_[1] = int(np.digitize(next_state[1],
                                             horizontal_bins[1]))
            for j in range(2):
                if next_state_[j] < 0:
                    next_state_[j] = 0
                elif next_state_[j] > env.shape[j] - 1:
                    next_state_[j] = env.shape[j] - 1
            print("Go into state: ", next_state_, " from state: ", state_,
                  " by action: ", stringify(action))
            # Make the algorithm learn based on the results
            qlearn.learn(tuple(state_), action, reward, tuple(next_state_))

            if not done:
                state = next_state
            else:
                rospy.loginfo("DONE")
                last_time_steps = np.append(last_time_steps, [int(t + 1)])
                reward_msg = RLExperimentInfo()
                reward_msg.episode_number = x
                reward_msg.episode_reward = cumulated_reward
                reward_pub.publish(reward_msg)
                break

        m, s = divmod(int(time.time() - start_time), 60)
        h, m = divmod(m, 60)
        rospy.loginfo(
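The snippet above discretizes a continuous observation into grid indices with np.digitize and clips them to the grid defined by env.shape. The construction of horizontal_bins is not shown; a plausible sketch (the workspace limits and grid size here are assumptions, not the original values) is:

import numpy as np

# Assumed grid resolution and workspace limits; the original example does not show them.
grid_shape = (10, 10)
x_range = (-1.0, 1.0)
y_range = (-1.0, 1.0)

# With n - 1 edges per axis, np.digitize yields indices in 0 .. n - 1.
horizontal_bins = np.array([
    np.linspace(x_range[0], x_range[1], grid_shape[0] - 1),
    np.linspace(y_range[0], y_range[1], grid_shape[1] - 1),
])

# Discretize a continuous (x, y) position into a grid cell, as in the snippet above.
position = np.array([0.3, -0.7])
cell = np.array([int(np.digitize(position[j], horizontal_bins[j])) for j in range(2)])
cell = np.clip(cell, 0, np.array(grid_shape) - 1)
print(cell)  # prints [6 2] for these assumed bins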
Example #4
    print("Teaching...")
    for x in range(teach_episodes):
        done = False
        observation = env.reset()

        #render() #defined above, not env.render()
        step_counter = 0

        for i in range(200):
            step_counter += 1
            action = expert_action(observation)
            binarized_observation = binarize_observation(observation)
            newObservation, reward, done, info = env.step(action)
            binarized_new_observation = binarize_observation(newObservation)

            qlearn.learn(binarized_observation, action, reward,
                         binarized_new_observation)

            if not done:
                observation = newObservation
            else:
                last_time_steps = numpy.append(last_time_steps, [int(i + 1)])
                break

    print("Learning...")
    for x in range(total_episodes):
        done = False

        cumulated_reward = 0  # Should going forward give more reward than L/R?

        observation = env.reset()
            state = threshold_laser(laser, threshold_value)

            rl_choice = qlearn.chooseAction(state)

            if rl_choice == 1:
                action = 20 - image_action
            else:
                action = laser_action

            next_observation, reward, done, info = env.step(action)

            cumulated_reward += reward
            next_image, next_laser = next_observation
            next_state = threshold_laser(next_laser, threshold_value)

            qlearn.learn(state, rl_choice, reward, next_state)

            if not done:
                observation = next_observation
            else:
                break

        if episode % 100 == 0:
            qlearn.save_q()

        m, s = divmod(int(time.time() - start_time), 60)
        h, m = divmod(m, 60)
        print("EP " + str(episode + 1) + ":  " + str(step_counter) +
              "  timesteps" + " - [alpha: " + str(round(qlearn.alpha, 2)) +
              " - gamma: " + str(round(qlearn.gamma, 2)) + " - epsilon: " +
              str(round(qlearn.epsilon, 2)) + "] - Reward: " +
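Example #4 depends on helpers such as expert_action, binarize_observation, and threshold_laser that are defined elsewhere in that project. The following is only a guess at their behavior, shown to make the state handling concrete; the real implementations may differ:

def threshold_laser(laser_ranges, threshold_value):
    # Assumed behavior: flag each laser beam as 1 if an obstacle is closer than
    # the threshold, 0 otherwise, and join the flags into a hashable string
    # that can serve as a discrete state key.
    return ''.join('1' if r < threshold_value else '0' for r in laser_ranges)


def binarize_observation(observation):
    # Assumed behavior: map each observation element to 0/1 and join into a
    # string state, mirroring ''.join(map(str, observation)) in Example #1.
    return ''.join('1' if v > 0 else '0' for v in observation)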