parser.add_argument("-gamma",
                        "--gamma",
                        default=0.95,
                        help="discount factor")
    parser.add_argument("-alpha",
                        "--alpha",
                        default=0.1,
                        help="learning rate (step size)")
    parser.add_argument("-plan_step",
                        "--plan_step",
                        default=5,
                        help="planning steps over the learned model")
    args = parser.parse_args()

    # Create an environment
    env = Maze()
    plan_steps = [0, 5, 50]
    steps = np.zeros((len(plan_steps), args.episodes))

    for run in range(args.runs):

        for index, plan_step in enumerate(plan_steps):
            start_time = time.time()

            args.planning_steps = plan_step

            # initialize Q table
            q_value = np.zeros(env.q_size)
            # generate Dyna-Q model
            model = InternalModel()
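            # --- hypothetical continuation: the snippet is cut off here ---
            # A minimal Dyna-Q sketch of what typically follows; env.reset()/env.step(),
            # model.feed()/model.sample() and choose_action() are assumed helpers,
            # not taken from this file.
            for episode in range(args.episodes):
                state = env.reset()
                done = False
                while not done:
                    action = choose_action(q_value, state)          # epsilon-greedy over Q
                    next_state, reward, done = env.step(action)
                    # direct RL update from the real transition
                    q_value[state][action] += args.alpha * (
                        reward + args.gamma * np.max(q_value[next_state]) - q_value[state][action])
                    model.feed(state, action, next_state, reward)   # learn the one-step model
                    # planning: replay transitions sampled from the learned model
                    for _ in range(args.planning_steps):
                        s, a, s_, r = model.sample()
                        q_value[s][a] += args.alpha * (
                            r + args.gamma * np.max(q_value[s_]) - q_value[s][a])
                    state = next_state
                    steps[index, episode] += 1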
Example #2
def main():
    number_of_turns = 0  # counts the number of steps before the game ends
    catchCount = 0  # number of game ends (catches) so far
    env = Maze(FILE_NAME)
    myCat = Brain('Cat', env.cat.pos, env.actions)
    myMouse = Brain('Mouse', env.mouse.pos, env.actions)
    cheesePos = env.cheese.pos
    board = env.mazeList

    ## DEBUGGING
    debug = False  # step-by-step toggle
    env.renderWindow = False  # start with graphics rendering disabled

    while True:
        if debug:
            print('\nCLICK to start loop.')
            env.win.getMouse()
        print('==At start of loop, cat and mouse information:==')
        myCat.printInfo()
        myMouse.printInfo()

        if debug:
            print('\nCLICK to let mouse choose action.')
            env.win.getMouse()
        # print('Calling mouse.chooseRandom with catpos mousepos cheese pos:', myCat.pos, myMouse.pos, cheesePos)
        mouseAction = myMouse.chooseAction(board, myCat.pos, myMouse.pos,
                                           cheesePos)
        mouseImmediateReward = env.moveMouse(mouseAction)

        if debug:
            print('immediate reward:', mouseImmediateReward)
            print('myMouse.q_table:', myMouse.q_table)
            print('\nCLICK to let cat choose action.')
            env.win.getMouse()
        # print('Calling cat.chooseRandom with catpos mousepos cheese pos:', myCat.pos, myMouse.pos, cheesePos)
        catAction = myCat.chooseAction(board, myCat.pos, myMouse.pos,
                                       cheesePos)
        catImmediateReward = env.moveCat(catAction)

        if debug:
            print('catAction:', catAction)
            print('immediate reward:', catImmediateReward)
            print('myCat.q_table:', myCat.q_table)
            print('\nCLICK to get feedback from environment.')
            env.win.getMouse()
        #get feedback from the environment
        catPos, catReward, mousePos, mouseReward, done = env.turnEnd()

        #add goal rewards if any
        catImmediateReward += catReward
        mouseImmediateReward += mouseReward

        if debug:
            print('catPos:', catPos, 'catImmediateReward:', catImmediateReward,
                  'mousePos:', mousePos, 'mouseImmediateReward:',
                  mouseImmediateReward, 'done:', done)
            print('catReward:', catReward, 'mouseReward:', mouseReward)
            print('\nCLICK to update agent Brain with positions.')
            env.win.getMouse()
        # Update agent's brains to reflect board positions after move
        myMouse.updateBrain(catPos, catReward, mousePos, mouseReward)
        myCat.updateBrain(catPos, catReward, mousePos, mouseReward)

        myCat.printInfo()
        myMouse.printInfo()

        if debug:
            print('\nCLICK to start learnLast step for both agents.')
            env.win.getMouse()
        #immediate learning of step taken
        myMouse.learnLast(mouseImmediateReward)
        myCat.learnLast(catImmediateReward)
        myCat.printInfo()
        myMouse.printInfo()
        if debug:
            print('\nCLICK to continue.')

        #if something got caught, execute learning of agents
        if done:
            # time.sleep(1)
            catchCount += 1
            print('Hit something')
            if debug:
                print('mouse q-table before learnAll')
                print(myMouse.q_table)
                print('mouse history before learnAll')
                print(myMouse.history)
            myMouse.learnAll(mouseReward)
            myCat.learnAll(catReward)
            myCat.pos, myMouse.pos, cheesePos = env.restart()  # using restart() so random spawn positions can be added later
        # env.win.getMouse()
        number_of_turns += 1
        # if number_of_turns == 100:
        # break

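        # Rendering is turned on when catchCount is a multiple of 1000 and off when
        # catchCount % 1001 == 2; agents are checkpointed at multiples of 100.
        # These checks run every turn, so they keep firing until the next catch.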
        if catchCount % 1000 == 0:
            env.renderWindow = True
        if catchCount % 1001 == 2:
            env.renderWindow = False
        if catchCount % 100 == 0:
            saveAgent(myCat, catchCount)
            saveAgent(myMouse, catchCount)
        if catchCount == 1:
            break
Example #3
                ms, ma = env_model.sample_s_a()  # ms here is a state stored as a str
                mr, ms_ = env_model.get_r_s_(ms, ma)
                RL.learn(ms, ma, mr, str(ms_))

                # print(env_model.database)
                # print('################')
                # print(RL.q_table)
                # print('################')
            s = s_
            s2 = s2_
            if done:
                s = env.reset()
                break

            if done2:
                s2 = env.reset2()
                break

    # end of game
    print('game over')
    print(RL.q_table)
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    env_model = EnvModel(actions=list(range(env.n_actions)))
    RL = Learning(actions=list(range(env.n_actions)))
    env.after(0, update)
    env.mainloop()
Example #4
]
'''
Map=\
['#o#*#o#*#',
 'o1o  *o^o',
 '# # # #o#',
 'o #####^o',
 '#*#####o#',
 'oo^oo^  o',
 '#oo^#oo0#'
]

human_play = False
have_render = True

env = Maze(Map)
if have_render:
    Image = env.render()

act = [0, 0]
score = [0, 0]

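# Map a single-character key command to the environment's numeric action index.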
def ch(x):
    if x == 'l': return 0
    if x == 'r': return 1
    if x == 'u': return 2
    if x == 'd': return 3
    if x == 'b': return 4
    if x == 's': return 5
def read_action(event):
    if event.keysym == 'Left':
Example #5
def testLoading(itNumber):
    print('INIT base game..')
    time.sleep(1)
    catchCount = itNumber
    env = Maze(FILE_NAME)
    myCat = Brain('Cat', env.cat.pos, env.actions)
    myMouse = Brain('Mouse', env.mouse.pos, env.actions)
    cheesePos = env.cheese.pos
    board = env.mazeList

    print('loading from file')
    loadAgent(myCat, catchCount)
    loadAgent(myMouse, catchCount)
    time.sleep(1)

    print('showing agent info/q_tables')
    myCat.printInfo()
    myMouse.printInfo()
    time.sleep(1)

    print('testing running of agents from this point..')
    time.sleep(1)

    ## DEBUGGING
    debug = True  # step-by-step toggle
    env.renderWindow = True  # start with graphics being rendered

    while True:
        if debug:
            print('\nCLICK to start loop.')
            env.win.getMouse()
        print('==At start of loop, cat and mouse information:==')
        myCat.printInfo()
        myMouse.printInfo()

        if debug:
            print('\nCLICK to let mouse choose action.')
            env.win.getMouse()
        # print('Calling mouse.chooseRandom with catpos mousepos cheese pos:', myCat.pos, myMouse.pos, cheesePos)
        mouseAction = myMouse.chooseAction(board, myCat.pos, myMouse.pos,
                                           cheesePos)
        mouseImmediateReward = env.moveMouse(mouseAction)

        if debug:
            print('immediate reward:', mouseImmediateReward)
            print('myMouse.q_table:', myMouse.q_table)
            print('\nCLICK to let cat choose action.')
            env.win.getMouse()
        # print('Calling cat.chooseRandom with catpos mousepos cheese pos:', myCat.pos, myMouse.pos, cheesePos)
        catAction = myCat.chooseAction(board, myCat.pos, myMouse.pos,
                                       cheesePos)
        catImmediateReward = env.moveCat(catAction)

        if debug:
            print('catAction:', catAction)
            print('immediate reward:', catImmediateReward)
            print('myCat.q_table:', myCat.q_table)
            print('\nCLICK to get feedback from environment.')
            env.win.getMouse()
        #get feedback from the environment
        catPos, catReward, mousePos, mouseReward, done = env.turnEnd()

        #add goal rewards if any
        catImmediateReward += catReward
        mouseImmediateReward += mouseReward

        if debug:
            print('catPos:', catPos, 'catImmediateReward:', catImmediateReward,
                  'mousePos:', mousePos, 'mouseImmediateReward:',
                  mouseImmediateReward, 'done:', done)
            print('catReward:', catReward, 'mouseReward:', mouseReward)
            print('\nCLICK to update agent Brain with positions.')
            env.win.getMouse()
        # Update agent's brains to reflect board positions after move
        myMouse.updateBrain(catPos, catReward, mousePos, mouseReward)
        myCat.updateBrain(catPos, catReward, mousePos, mouseReward)

        myCat.printInfo()
        myMouse.printInfo()

        if debug:
            print('\nCLICK to start learnLast step for both agents.')
            env.win.getMouse()
        #immediate learning of step taken
        myMouse.learnLast(mouseImmediateReward)
        myCat.learnLast(catImmediateReward)
        myCat.printInfo()
        myMouse.printInfo()
        if debug:
            print('\nCLICK to continue.')

        #if something got caught, execute learning of agents
        if done:
            catchCount += 1
            print('Hit something')
            if debug:
                print('mouse q-table before learnAll')
                print(myMouse.q_table)
                print('mouse history before learnAll')
                print(myMouse.history)
            myMouse.learnAll(mouseReward)
            myCat.learnAll(catReward)
            myCat.pos, myMouse.pos, cheesePos = env.restart()  # using restart() so random spawn positions can be added later

        if catchCount == 500000:
            break
Example #6
File: play.py  Project: jlinbb/dqn_demo
from env import Maze
from q_learning import QLearning


def update():
    for episode in range(20):
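        # play one episode of tabular Q-learning, rendering each step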
        state = env.reset()
        step_count, done = 0, False
        while not done:
            env.render()
            action = agent.choose_action(str(state))
            state_, reward, done = env.step(action)
            step_count += 1
            agent.learn(str(state), action, reward, str(state_))
            state = state_
        print('Round {0} over, total steps: {1}'.format(episode, step_count))


if __name__ == '__main__':
    env = Maze()
    agent = QLearning(actions=list(range(env.n_actions)))

    env.after(100, update)
    env.mainloop()

    print('\n Q Table')
    print(agent.q_table)
    agent.q_table.to_csv('Q_Table.csv')
Example #7
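            # train the DQN from replay memory only after 200 warm-up steps, then every 5th step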
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            ## break while loop at the end of this episode
            # if done:
            #     break
            step += 1
            time.sleep(60)


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
Example #8
    def __init__(self, wid):
        self.wid = wid
        self.env = Maze(Map)
        self.ppo = GLOBAL_PPO
Example #9
class Worker(object):
    def __init__(self, wid):
        self.wid = wid
        self.env = Maze(Map)
        self.ppo = GLOBAL_PPO

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            s = self.env.reset()
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            t = 0
            while True:
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    buffer_s, buffer_a, buffer_r = [], [], []  # clear history buffer, use new policy to collect data
                a = self.ppo.choose_action(s)
                baseline_a = base.choose_action(self.env, 1)
                s_, r, done = self.env.step({(0, a), (1, U.ch(baseline_a))})
                r = r[0]
                buffer_s.append(s.flatten())
                buffer_a.append(a)
                buffer_r.append(r)  # store the raw reward (no normalization applied here)
                s = s_
                ep_r += r

                t += 1
                #print('step : %d, reward : %d, done : %d' % (t, r, done))

                GLOBAL_UPDATE_COUNTER += 1  # count toward the minimum batch size; no need to wait for other workers
                if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done:
                    print(GLOBAL_EP)
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs = np.vstack(buffer_s)
                    ba = np.vstack(buffer_a)
                    br = np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or GLOBAL_EP >= EP_MAX:
                        print('update')
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update
                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

                if done:
                    # record reward changes, plot later
                    if len(GLOBAL_RUNNING_R) == 0:
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                                ep_r * 0.1)
                    GLOBAL_EP += 1
                    print(
                        '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                        '|W%i' % self.wid,
                        '|Ep_r: %.2f' % ep_r,
                    )
                    break
Example #10
    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()  # workers putting data in this queue
    threads = []
    for worker in workers:  # worker threads
        t = threading.Thread(target=worker.work, args=())
        t.start()  # training
        threads.append(t)
    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update, ))
    threads[-1].start()

    COORD.join(threads)
    print('training finished')

    env = Maze(Map)
    tf.reset_default_graph()
    load_PPO = PPO(Load=True)
    while True:
        s = env.reset()
        for t in range(100):
            env.render()
            a = load_PPO.choose_action(s)
            baseline_a = base.choose_action(env, 1)
            s, r, done = env.step({(0, a), (1, U.ch(baseline_a))})
            if r[0] != -1:
                print(r)
            if done:
                break