Example #1
def main(**kargs):
    initial_weights_file, initial_i_frame = latest(kargs['weights_dir'])

    print("Continuing using weights from file: ", initial_weights_file, "from", initial_i_frame)

    if kargs['theano_verbose']:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    ale = ag.init(display_screen=(kargs['visualize'] == 'ale'), record_dir=kargs['record_dir'])
    game = ag.SpaceInvadersGame(ale)


    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        game.lives = 4
        return game

    replay_memory = dqn.ReplayMemory(size=kargs['dqn.replay_memory_size']) if not kargs['dqn.no_replay'] else None
    # dqn_algo = q.ConstAlgo([3])
    dqn_algo = dqn.DQNAlgo(game.n_actions(),
                           replay_memory=replay_memory,
                           initial_weights_file=initial_weights_file,
                           build_network=kargs['dqn.network'],
                           updates=kargs['dqn.updates'])

    dqn_algo.replay_start_size = kargs['dqn.replay_start_size']
    dqn_algo.final_epsilon = kargs['dqn.final_epsilon']
    dqn_algo.initial_epsilon = kargs['dqn.initial_epsilon']
    dqn_algo.i_frames = initial_i_frame

    dqn_algo.log_frequency = kargs['dqn.log_frequency']


    import Queue
    dqn_algo.mood_q = Queue.Queue() if kargs['show_mood'] else None

    if kargs['show_mood'] is not None:
        plot = kargs['show_mood']()

        def worker():
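            # Daemon thread body: forward each item from mood_q to the plot without blocking training.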
            while True:
                item = dqn_algo.mood_q.get()
                plot.show(item)
                dqn_algo.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(dqn_algo))

    visualizer = ag.SpaceInvadersGameCombined2Visualizer() if kargs['visualize'] == 'q' else q.GameNoVisualizer()
    teacher = q.Teacher(new_game, dqn_algo, visualizer,
                        ag.Phi(skip_every=4), repeat_action=4, sleep_seconds=0)
    teacher.teach(500000)
Example #2
def random_on_space_invaders():
  import q_learning as q
  import numpy as np
  import ale_game as ag
  reload(q)
  reload(ag)
  ale = ag.init()
  game = ag.SpaceInvadersGame(ale)
  #game.show_vectorized(game.vectorized(ale.getScreen()))
  teacher = q.Teacher(game, q.RandomAlgo(game.get_actions()), ag.SpaceInvadersGameVectorizedVisualizer())
  teacher.teach(1)
Example #3
def random_on_space_invaders():
    import q_learning as q
    import numpy as np
    import ale_game as ag
    reload(q)
    reload(ag)
    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)
    #game.show_vectorized(game.vectorized(ale.getScreen()))
    teacher = q.Teacher(game, q.RandomAlgo(game.get_actions()),
                        ag.SpaceInvadersGameVectorizedVisualizer())
    teacher.teach(1)
Example #4
def dqn_on_space_invaders_play(initial_weights_file, visualize='q', show_mood=False):
    import q_learning as q
    import ale_game as ag
    import dqn
    reload(q)
    reload(ag)
    reload(dqn)

    print("Using weights from file: ", initial_weights_file)

    ale = ag.init(display_screen=(visualize == 'ale'))
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        game.lives = 4
        return game

    replay_memory = dqn.ReplayMemory(size=100, grace=10)
    dqn_algo = dqn.DQNAlgo(game.n_actions(), replay_memory=replay_memory, initial_weights_file=initial_weights_file)

    dqn_algo.epsilon = 0.1
    dqn_algo.initial_epsilon = 0.1
    dqn_algo.final_epsilon = 0.1
    dqn_algo.ignore_feedback = True
    dqn_algo.log_frequency = 0

    import Queue
    dqn_algo.mood_q = Queue.Queue() if show_mood else None

    if show_mood:
        plot = Plot()

        def worker():
            while True:
                item = dqn_algo.mood_q.get()
                plot.show(item)
                dqn_algo.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(dqn_algo))

    visualizer = ag.SpaceInvadersGameCombined2Visualizer() if visualize == 'q' else q.GameNoVisualizer()
    teacher = q.Teacher(new_game, dqn_algo, visualizer,
                        ag.Phi(skip_every=4), repeat_action=4, sleep_seconds=0)
    return teacher.teach(100)
Example #5
def dqn_on_space_invaders_cpu(visualize=False, theano_verbose=False, initial_weights_file=None, ignore_feedback=False):
    import q_learning as q
    import ale_game as ag
    import dqn
    import theano
    reload(q)
    reload(ag)
    reload(dqn)
    if theano_verbose:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        game.lives = 4
        return game

    replay_memory = dqn.ReplayMemory(size=100, grace=10)
    dqn_algo = dqn.DQNAlgo(game.n_actions(), replay_memory=replay_memory, initial_weights_file=initial_weights_file)

    dqn_algo.target_network_update_frequency = 50
    dqn_algo.replay_memory_size = 100
    dqn_algo.replay_start_size = 75
    dqn_algo.epsilon = 0.1
    dqn_algo.initial_epsilon = 0.1
    dqn_algo.final_epsilon = 0.1
    dqn_algo.log_frequency = 10

    dqn_algo.ignore_feedback = ignore_feedback
    # dqn_algo.ignore_feedback = True

    print(str(dqn_algo))

    visualizer = ag.SpaceInvadersGameCombined2Visualizer() if visualize else q.GameNoVisualizer()
    teacher = q.Teacher(new_game, dqn_algo, visualizer,
                        ag.Phi(skip_every=4), repeat_action=4, sleep_seconds=0)
    teacher.teach(500000)
Example #6
def random_on_space_invaders():
    import q_learning as q
    import ale_game as ag
    reload(q)
    reload(ag)

    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    # game.show_vectorized(game.vectorized(ale.getScreen()))
    teacher = q.Teacher(new_game, q.RandomAlgo(game.get_actions()), ag.SpaceInvadersGameCombined2Visualizer(),
                        ag.Phi(skip_every=6), repeat_action=6)
    teacher.teach(1)
Example #7
def const_on_space_invaders():
    import teacher as q
    import ale_game as ag
    import dqn
    reload(q)
    reload(ag)
    reload(dqn)

    ale = ag.init()
    game = ag.SpaceInvadersGame(ale)

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    const_algo = q.ConstAlgo([2, 2, 2, 2, 2, 0, 0, 0, 0])
    teacher = q.Teacher(new_game, const_algo, ag.SpaceInvadersGameCombined2Visualizer(),
                        ag.Phi(skip_every=6), repeat_action=6)
    teacher.teach(1)
Example #8
def sarsa_gd_on_space_invaders():
  import q_learning as q
  import numpy as np
  import ale_game as ag
  import matplotlib.pyplot as plt
  plt.ion()
  reload(q)
  reload(ag)
  ale = ag.init()
  run = '1'

  n_colors = 5

  def state_adapter(scr): 
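    # Return indices of active (cell, colour) features for the linear SARSA(lambda) approximator.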
    vect = np.reshape(ag.vectorized(scr, 14, 20), 14 * 20 * n_colors)
    return np.where(vect)[0]

  game = ag.SpaceInvadersGame(ale)
  q_algo1 = q.SARSALambdaGradientDescent(game.get_actions(), game.get_state(), 
    initial_q = 5, initial_theta = [1] * 14 * 20 * n_colors, be_positive = False, state_adapter = state_adapter)
  q_algo1.epsilon = 0.05
  q_algo1.lmbda = 0.99 # 0.9
  q_algo1.gamma = 0.999
  q_algo1.alpha = 0.5
  def new_game():
    game.ale.reset_game()
    game.finished = False
    game.cum_reward = 0
    return game

  teacher = q.Teacher(new_game, q_algo1, ag.SpaceInvadersGameVectorizedVisualizer(), repeat_action = 3)

  #  teacher.single_step(Game)
  q_algo1.epsilon = 0
  q_algo1.log_freq = 1
  teacher.teach(1)  

  initial_training = 1000
  training_decay_from = 95
  training_decay_ex = 50


  result_test = []
  result_1 = []
  result_2 = []

  teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action = 3)
  q_algo1.log_freq = 0.05
  q_algo1.epsilon = 1  
  result_1 = teacher.teach(initial_training)


  q_algo1.epsilon = 0
  q_algo1.log_freq = 0.05
  result_test.append(teacher.teach(1))

  for i in range(training_decay_from):
    q_algo1.epsilon = 1 - i / 100.0  # float division: i/100 would truncate to 0 under Python 2
    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action = 3)
    result_2.append(teacher.teach(training_decay_ex))
    q_algo1.epsilon = 0
    result_test.append(teacher.teach(1))

  import cPickle as pickle
  with open('gradient_descent.theta' + run , 'wb') as handle:
    pickle.dump(q_algo1.theta, handle)

  with open('gradient_descent.gamma' + run, 'wb') as handle:
    pickle.dump(q_algo1.gamma, handle)

  with open('gradient_descent.lmbda' + run, 'wb') as handle:
    pickle.dump(q_algo1.lmbda, handle)

  with open('gradient_descent.alpha' + run, 'wb') as handle:
    pickle.dump(q_algo1.alpha, handle)  

  r1 = [a[1] for a in result_1]  
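  # Difference of cumulative sums = 200-game moving average of per-game reward.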
  plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r1), np.cumsum(r1)[200:])])/200)

  r2 = [a[1] for r in result_2 for a in r]  
  plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r2), np.cumsum(r2)[200:])])/200)

  r_test = [a[1] for r in result_test for a in r]
  plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_test)[50:])])/50)

  # NOTE: result_4 is never defined in this function; plotting it would raise a NameError.
  # r_4 = [a[1] for a in result_4]
  # plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_4)[2:])])/2)

  q_algo1.epsilon = 0.1
  teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action = 3)
  teacher.teach(100)
Example #9
def main(**kargs):
    initial_weights_file, i_total_action = latest(kargs['weights_dir'])

    print("Continuing using weights from file: ", initial_weights_file, "from",
          i_total_action)

    if kargs['theano_verbose']:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    if kargs['game'] == 'simple_breakout':
        game = simple_breakout.SimpleBreakout()

        class P(object):
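            # Identity phi for simple_breakout: frames are returned unchanged.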
            def __init__(self):
                self.screen_size = 12

            def __call__(self, frames):
                return frames

        phi = P()
    else:
        ale = ag.init(game=kargs['game'],
                      display_screen=(kargs['visualize'] == 'ale'),
                      record_dir=kargs['record_dir'])
        game = ag.ALEGame(ale)
        phi = ag.Phi(method=kargs["phi_method"])

    replay_memory = dqn.ReplayMemory(
        size=kargs['dqn.replay_memory_size']) if not kargs['dqn.no_replay'] else None
    algo = dqn.DQNAlgo(game.n_actions(),
                       replay_memory=replay_memory,
                       initial_weights_file=initial_weights_file,
                       build_network=kargs['dqn.network'],
                       updates=kargs['dqn.updates'],
                       screen_size=phi.screen_size)

    algo.replay_start_size = kargs['dqn.replay_start_size']
    algo.final_epsilon = kargs['dqn.final_epsilon']
    algo.initial_epsilon = kargs['dqn.initial_epsilon']
    algo.i_action = i_total_action

    algo.log_frequency = kargs['dqn.log_frequency']
    algo.target_network_update_frequency = kargs[
        'target_network_update_frequency']
    algo.final_exploration_frame = kargs['final_exploration_frame']

    import Queue
    algo.mood_q = Queue.Queue() if kargs['show_mood'] else None

    if kargs['show_mood'] is not None:
        plot = kargs['show_mood']()

        def worker():
            while True:
                item = algo.mood_q.get()
                plot.show(item)
                algo.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(algo))

    if kargs['visualize'] != 'q':
        visualizer = q.GameNoVisualizer()
    else:
        if kargs['game'] == 'simple_breakout':
            visualizer = simple_breakout.SimpleBreakoutVisualizer(algo)
        else:
            visualizer = ag.ALEGameVisualizer(phi.screen_size)

    teacher = q.Teacher(
        game=game,
        algo=algo,
        game_visualizer=visualizer,
        phi=phi,
        repeat_action=kargs['repeat_action'],
        i_total_action=i_total_action,
        total_n_actions=50000000,
        max_actions_per_game=10000,
        skip_n_frames_after_lol=kargs['skip_n_frames_after_lol'],
        run_test_every_n=kargs['run_test_every_n'])
    teacher.teach()
Example #10
def main(game_name, network_type, updates_method,
         target_network_update_frequency,
         initial_epsilon, final_epsilon, test_epsilon, final_exploration_frame, replay_start_size,
         deepmind_rmsprop_epsilon, deepmind_rmsprop_learning_rate, deepmind_rmsprop_rho,
         rmsprop_epsilon, rmsprop_learning_rate, rmsprop_rho,
         phi_type, phi_method,
         epoch_size, n_training_epochs, n_test_epochs,
         visualize, record_dir, show_mood,
         replay_memory_size, no_replay,
         repeat_action, skip_n_frames_after_lol, max_actions_per_game,
         weights_dir, algo_initial_state_file,
         log_frequency, theano_verbose):
    args = locals()

    if theano_verbose:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    if game_name == 'simple_breakout':
        game = simple_breakout.SimpleBreakout()
        class P(object):
            def __init__(self):
                self.screen_size = (12, 12)

            def __call__(self, frames):
                return frames
        phi = P()
    else:
        ale = ag.init(game=game_name, display_screen=(visualize == 'ale'), record_dir=record_dir)
        game = ag.ALEGame(ale)
        if phi_type == '4':
            phi = ag.Phi4(method=phi_method)
        elif phi_type == '1':
            phi = ag.Phi(method=phi_method)
        else:
            raise RuntimeError("Unknown phi: {phi}".format(phi=phi_type))

    if network_type == 'nature':
        build_network = network.build_nature
    elif network_type == 'nature_with_pad':
        build_network = network.build_nature_with_pad
    elif network_type == 'nips':
        build_network = network.build_nips
    elif network_type == 'nature_with_pad_he':
        build_network = network.build_nature_with_pad_he
    elif hasattr(network_type, '__call__'):
        build_network = network_type
    else:
        raise RuntimeError("Unknown network: {network}".format(network=network_type))


    if updates_method == 'deepmind_rmsprop':
        updates = \
            lambda loss, params: u.deepmind_rmsprop(loss, params,
                                                          learning_rate=deepmind_rmsprop_learning_rate,
                                                          rho=deepmind_rmsprop_rho,
                                                          epsilon=deepmind_rmsprop_epsilon)
    elif updates_method == 'rmsprop':
        updates = \
            lambda loss, params: lasagne.updates.rmsprop(loss, params,
                                                         learning_rate=rmsprop_learning_rate,
                                                         rho=rmsprop_rho,
                                                         epsilon=rmsprop_epsilon)
    else:
        raise RuntimeError("Unknown updates: {updates}".format(updates=updates_method))

    replay_memory = dqn.ReplayMemory(size=replay_memory_size) if not no_replay else None

    def create_algo():
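        # Factory used for both the training and the test agent; they share the same replay memory.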
        algo = dqn.DQNAlgo(game.n_actions(),
                               replay_memory=replay_memory,
                               build_network=build_network,
                               updates=updates,
                               screen_size=phi.screen_size)

        algo.replay_start_size = replay_start_size
        algo.final_epsilon = final_epsilon
        algo.initial_epsilon = initial_epsilon

        algo.log_frequency = log_frequency
        algo.target_network_update_frequency = target_network_update_frequency
        algo.final_exploration_frame = final_exploration_frame
        return algo

    algo_train = create_algo()
    algo_test = create_algo()
    algo_test.final_epsilon = test_epsilon
    algo_test.initial_epsilon = test_epsilon
    algo_test.epsilon = test_epsilon


    import Queue
    algo_train.mood_q = Queue.Queue() if show_mood else None

    if show_mood:
        if show_mood == 'plot':
            plot = Plot()
        elif show_mood == "log":
            plot = Log()

        def worker():
            while True:
                item = algo_train.mood_q.get()
                plot.show(item)
                algo_train.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(algo_train))

    if visualize != 'q':
        visualizer = q.GameNoVisualizer()
    else:
        if game_name == 'simple_breakout':
            visualizer = simple_breakout.SimpleBreakoutVisualizer(algo_train)
        else:
            visualizer = ag.ALEGameVisualizer(phi.screen_size)

    teacher = q.Teacher(game=game,
                        algo=algo_train,
                        game_visualizer=visualizer,
                        phi=phi,
                        repeat_action=repeat_action,
                        max_actions_per_game=max_actions_per_game,
                        skip_n_frames_after_lol=skip_n_frames_after_lol,
                        tester=False)

    tester = q.Teacher(game=game,
                        algo=algo_test,
                        game_visualizer=visualizer,
                        phi=phi,
                        repeat_action=repeat_action,
                        max_actions_per_game=max_actions_per_game,
                        skip_n_frames_after_lol=skip_n_frames_after_lol,
                        tester=True)

    q.teach_and_test(teacher, tester, n_epochs=n_training_epochs,
                     frames_to_test_on=n_test_epochs * epoch_size,
                     epoch_size=epoch_size,
                     state_dir=weights_dir,
                     algo_initial_state_file=algo_initial_state_file)
Example #11
def sarsa_gd_on_space_invaders():
    import q_learning as q
    import numpy as np
    import ale_game as ag
    import matplotlib.pyplot as plt
    plt.ion()
    reload(q)
    reload(ag)
    ale = ag.init()
    run = '1'

    n_colors = 5

    def state_adapter(scr):
        vect = np.reshape(ag.vectorized(scr, 14, 20), 14 * 20 * n_colors)
        return np.where(vect)[0]

    game = ag.SpaceInvadersGame(ale)
    q_algo1 = q.SARSALambdaGradientDescent(game.get_actions(),
                                           game.get_state(),
                                           initial_q=5,
                                           initial_theta=[1] * 14 * 20 *
                                           n_colors,
                                           be_positive=False,
                                           state_adapter=state_adapter)
    q_algo1.epsilon = 0.05
    q_algo1.lmbda = 0.99  # 0.9
    q_algo1.gamma = 0.999
    q_algo1.alpha = 0.5

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    teacher = q.Teacher(new_game,
                        q_algo1,
                        ag.SpaceInvadersGameVectorizedVisualizer(),
                        repeat_action=3)

    #  teacher.single_step(Game)
    q_algo1.epsilon = 0
    q_algo1.log_freq = 1
    teacher.teach(1)

    initial_training = 1000
    training_decay_from = 95
    training_decay_ex = 50

    result_test = []
    result_1 = []
    result_2 = []

    teacher = q.Teacher(new_game,
                        q_algo1,
                        q.GameNoVisualizer(),
                        repeat_action=3)
    q_algo1.log_freq = 0.05
    q_algo1.epsilon = 1
    result_1 = teacher.teach(initial_training)

    q_algo1.epsilon = 0
    q_algo1.log_freq = 0.05
    result_test.append(teacher.teach(1))

    for i in range(training_decay_from):
        q_algo1.epsilon = 1 - i / 100.0  # float division: i / 100 would truncate to 0 under Python 2
        teacher = q.Teacher(new_game,
                            q_algo1,
                            q.GameNoVisualizer(),
                            repeat_action=3)
        result_2.append(teacher.teach(training_decay_ex))
        q_algo1.epsilon = 0
        result_test.append(teacher.teach(1))

    import cPickle as pickle
    with open('gradient_descent.theta' + run, 'wb') as handle:
        pickle.dump(q_algo1.theta, handle)

    with open('gradient_descent.gamma' + run, 'wb') as handle:
        pickle.dump(q_algo1.gamma, handle)

    with open('gradient_descent.lmbda' + run, 'wb') as handle:
        pickle.dump(q_algo1.lmbda, handle)

    with open('gradient_descent.alpha' + run, 'wb') as handle:
        pickle.dump(q_algo1.alpha, handle)

    r1 = [a[1] for a in result_1]
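    # Difference of cumulative sums = 200-game moving average of per-game reward.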
    plt.plot(
        np.array(
            [x[1] - x[0] for x in zip(np.cumsum(r1),
                                      np.cumsum(r1)[200:])]) / 200)

    r2 = [a[1] for r in result_2 for a in r]
    plt.plot(
        np.array(
            [x[1] - x[0] for x in zip(np.cumsum(r2),
                                      np.cumsum(r2)[200:])]) / 200)

    r_test = [a[1] for r in result_test for a in r]
    plt.plot(
        np.array([
            x[1] - x[0] for x in zip(np.cumsum(r_test),
                                     np.cumsum(r_test)[50:])
        ]) / 50)

    # NOTE: result_4 is never defined in this function; plotting it would raise a NameError.
    # r_4 = [a[1] for a in result_4]
    # plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_4)[2:])]) / 2)

    q_algo1.epsilon = 0.1
    teacher = q.Teacher(new_game,
                        q_algo1,
                        q.GameNoVisualizer(),
                        repeat_action=3)
    teacher.teach(100)
Example #12
def main(game_name, network_type, updates_method,
         target_network_update_frequency, initial_epsilon, final_epsilon,
         test_epsilon, final_exploration_frame, replay_start_size,
         deepmind_rmsprop_epsilon, deepmind_rmsprop_learning_rate,
         deepmind_rmsprop_rho, rmsprop_epsilon, rmsprop_learning_rate,
         rmsprop_rho, phi_type, phi_method, epoch_size, n_training_epochs,
         n_test_epochs, visualize, record_dir, show_mood, replay_memory_size,
         no_replay, repeat_action, skip_n_frames_after_lol,
         max_actions_per_game, weights_dir, algo_initial_state_file,
         log_frequency, theano_verbose):
    args = locals()

    if theano_verbose:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    if game_name == 'simple_breakout':
        game = simple_breakout.SimpleBreakout()

        class P(object):
            def __init__(self):
                self.screen_size = (12, 12)

            def __call__(self, frames):
                return frames

        phi = P()
    else:
        ale = ag.init(game=game_name,
                      display_screen=(visualize == 'ale'),
                      record_dir=record_dir)
        game = ag.ALEGame(ale)
        if phi_type == '4':
            phi = ag.Phi4(method=phi_method)
        elif phi_type == '1':
            phi = ag.Phi(method=phi_method)
        else:
            raise RuntimeError("Unknown phi: {phi}".format(phi=phi_type))

    if network_type == 'nature':
        build_network = network.build_nature
    elif network_type == 'nature_with_pad':
        build_network = network.build_nature_with_pad
    elif network_type == 'nips':
        build_network = network.build_nips
    elif network_type == 'nature_with_pad_he':
        build_network = network.build_nature_with_pad_he
    elif hasattr(network_type, '__call__'):
        build_network = network_type
    else:
        raise RuntimeError(
            "Unknown network: {network}".format(network=network_type))

    if updates_method == 'deepmind_rmsprop':
        updates = \
            lambda loss, params: u.deepmind_rmsprop(loss, params,
                                                          learning_rate=deepmind_rmsprop_learning_rate,
                                                          rho=deepmind_rmsprop_rho,
                                                          epsilon=deepmind_rmsprop_epsilon)
    elif updates_method == 'rmsprop':
        updates = \
            lambda loss, params: lasagne.updates.rmsprop(loss, params,
                                                         learning_rate=rmsprop_learning_rate,
                                                         rho=rmsprop_rho,
                                                         epsilon=rmsprop_epsilon)
    else:
        raise RuntimeError(
            "Unknown updates: {updates}".format(updates=updates_method))

    replay_memory = dqn.ReplayMemory(
        size=replay_memory_size) if not no_replay else None

    def create_algo():
        algo = dqn.DQNAlgo(game.n_actions(),
                           replay_memory=replay_memory,
                           build_network=build_network,
                           updates=updates,
                           screen_size=phi.screen_size)

        algo.replay_start_size = replay_start_size
        algo.final_epsilon = final_epsilon
        algo.initial_epsilon = initial_epsilon

        algo.log_frequency = log_frequency
        algo.target_network_update_frequency = target_network_update_frequency
        algo.final_exploration_frame = final_exploration_frame
        return algo

    algo_train = create_algo()
    algo_test = create_algo()
    algo_test.final_epsilon = test_epsilon
    algo_test.initial_epsilon = test_epsilon
    algo_test.epsilon = test_epsilon

    import Queue
    algo_train.mood_q = Queue.Queue() if show_mood else None

    if show_mood:
        if show_mood == 'plot':
            plot = Plot()
        elif show_mood == "log":
            plot = Log()

        def worker():
            while True:
                item = algo_train.mood_q.get()
                plot.show(item)
                algo_train.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(algo_train))

    if visualize != 'q':
        visualizer = q.GameNoVisualizer()
    else:
        if game_name == 'simple_breakout':
            visualizer = simple_breakout.SimpleBreakoutVisualizer(algo_train)
        else:
            visualizer = ag.ALEGameVisualizer(phi.screen_size)

    teacher = q.Teacher(game=game,
                        algo=algo_train,
                        game_visualizer=visualizer,
                        phi=phi,
                        repeat_action=repeat_action,
                        max_actions_per_game=max_actions_per_game,
                        skip_n_frames_after_lol=skip_n_frames_after_lol,
                        tester=False)

    tester = q.Teacher(game=game,
                       algo=algo_test,
                       game_visualizer=visualizer,
                       phi=phi,
                       repeat_action=repeat_action,
                       max_actions_per_game=max_actions_per_game,
                       skip_n_frames_after_lol=skip_n_frames_after_lol,
                       tester=True)

    q.teach_and_test(teacher,
                     tester,
                     n_epochs=n_training_epochs,
                     frames_to_test_on=n_test_epochs * epoch_size,
                     epoch_size=epoch_size,
                     state_dir=weights_dir,
                     algo_initial_state_file=algo_initial_state_file)
Example #13
def sarsa_gd_on_space_invaders():
    import q_learning as q
    import numpy as np
    import ale_game as ag
    import matplotlib.pyplot as plt
    import sarsa as ss

    plt.ion()
    reload(ss)
    reload(q)
    reload(ag)
    ale = ag.init()
    run = '1'

    def state_adapter(frames):
        # Indices of active pixels; np.where returns a tuple, so take its first element.
        result = np.where(np.reshape(np.concatenate(frames), 80 * 80 * 4) > 0)[0]
        if len(result) == 0:
            return [0]
        else:
            return result

    game = ag.SpaceInvadersGame(ale)
    q_algo1 = ss.SARSALambdaGradientDescent(game.n_actions(), theta_len=80 * 80 * 4, state_adapter=state_adapter)
    q_algo1.epsilon = 0.9
    q_algo1.lmbda = 0.99
    q_algo1.gamma = 0.999
    q_algo1.alpha = 0.1

    def new_game():
        game.ale.reset_game()
        game.finished = False
        game.cum_reward = 0
        return game

    result_test = []
    result_1 = []
    result_2 = []

    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), phi=ag.Phi(skip_every=6), repeat_action=6)

    q_algo1.epsilon = 1
    q_algo1.log_freq = 1
    result_test.append(teacher.teach(10))

    vis_teacher = q.Teacher(new_game, q_algo1, ag.SpaceInvadersGameCombined2Visualizer(), phi=ag.Phi(skip_every=6),
                        repeat_action=6)

    #  teacher.single_step(Game)
    q_algo1.epsilon = 0.1
    q_algo1.log_freq = 1
    # vis_teacher.teach(5)

    for i in xrange(90):
        q_algo1.log_freq = 0.03
        q_algo1.epsilon = 1 - i / 100.0  # float division: i / 100 would truncate to 0 under Python 2
        result_2.append(teacher.teach(50))
        q_algo1.epsilon = 0.1
        result_test.append(teacher.teach(10))

    import cPickle as pickle
    with open('gradient_descent.theta' + run, 'wb') as handle:
        pickle.dump(q_algo1.theta, handle)

    with open('gradient_descent.gamma' + run, 'wb') as handle:
        pickle.dump(q_algo1.gamma, handle)

    with open('gradient_descent.lmbda' + run, 'wb') as handle:
        pickle.dump(q_algo1.lmbda, handle)

    with open('gradient_descent.alpha' + run, 'wb') as handle:
        pickle.dump(q_algo1.alpha, handle)

    r1 = [a[1] for a in result_1]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r1), np.cumsum(r1)[200:])]) / 200)

    r2 = [a[1] for r in result_2 for a in r]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r2), np.cumsum(r2)[200:])]) / 200)

    r_test = [a[1] for r in result_test for a in r]
    plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_test)[50:])]) / 50)

    # NOTE: result_4 is never defined in this function; plotting it would raise a NameError.
    # r_4 = [a[1] for a in result_4]
    # plt.plot(np.array([x[1] - x[0] for x in zip(np.cumsum(r_test), np.cumsum(r_4)[2:])]) / 2)

    q_algo1.epsilon = 0.1
    teacher = q.Teacher(new_game, q_algo1, q.GameNoVisualizer(), repeat_action=3)
    teacher.teach(100)