Example #1
def run(**kwargs):
    '''
    Setup TF, gym environment, etc.
    '''

    iterations = kwargs['iterations']
    discount = kwargs['discount']
    batch_size = kwargs['batch_size']
    num_batches = kwargs['num_batches']
    max_seq_length = kwargs['max_seq_length']
    learning_rate = kwargs['learning_rate']
    animate = kwargs['animate']
    logdir = kwargs['logdir']
    seed = kwargs['seed']
    games_played_per_epoch = kwargs['games_played_per_epoch']
    load_model = False
    mcts_iterations = kwargs['mcts_iterations']
    batches_per_epoch = kwargs['batches_per_epoch']
    headless = kwargs['headless']
    update_freq = kwargs['update_freq']
    buffer_size = kwargs['buffer_size']
    use_priority = kwargs['use_priority']
    policy_batch_size = kwargs['policy_batch_size']
    reservoir_buffer_size = kwargs['reservoir_buffer_size']

    if headless:
        import matplotlib
        matplotlib.use('Agg')  # assumed intent: select a non-interactive backend for headless runs

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed)
    np.random.seed(seed)

    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v1')  # Make the gym environment
    maximum_number_of_steps = max_seq_length  # maximum episode length (alternatively env.max_episode_steps)

    ################################################################
    # TF BOILERPLATE
    ################################################################

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    summary_writers = []
    for idx in np.arange(env.n_actors):
        summary_writers.append(
            tf.summary.FileWriter(
                os.path.join(logdir, 'tensorboard', 'snake_%s' % idx)))

    summary_writers.append(
        tf.summary.FileWriter(
            os.path.join(logdir, 'tensorboard', 'training_stats')))

    with tf.Session(config=tf_config) as sess:

        networks = []

        for i in range(env.n_actors):
            networks.append(
                SelfPlay(
                    sess,
                    create_basic([64, 64, 256], transpose=True),
                    [(env.n_actors) * 2 + 1, env.world.screen_width,
                     env.world.screen_height],
                    summary_writers[-1],
                    n_actions=4,
                    batch_size=batch_size,
                    gamma=.99,
                    update_freq=update_freq,
                    ddqn=True,  # double dqn
                    buffer_size=buffer_size,
                    clip_grad=None,
                    batches_per_epoch=batches_per_epoch,
                    is_sparse=True,
                    use_priority=use_priority,
                    _id=i,
                    policy_batch_size=policy_batch_size,
                    reservoir_buffer_size=reservoir_buffer_size))

        monitor = Monitor(os.path.join(logdir, 'gifs'))
        epsilon_schedule = PiecewiseSchedule(
            [(0, .2), (50000, .05), (75000, .01)],
            outside_value=.01)  #LinearSchedule(iterations*60/100, 1., 0.001)
        eta_schedule = PiecewiseSchedule(
            [(0, .8), (60000, .4)],
            outside_value=.4)  #LinearSchedule(iterations*60/100, 0.2, 0.1)
        if use_priority:
            beta_schedule = LinearSchedule(iterations, 0.4, 1.)
        learning_rate_schedule = PiecewiseSchedule([(0, 1e-3), (30000, 5e-4),
                                                    (60000, 1e-4)],
                                                   outside_value=1e-4)
        policy_learning_rate_schedule = PiecewiseSchedule([(0, 1e-3),
                                                           (4000, 5e-4),
                                                           (20000, 1e-4)],
                                                          outside_value=1e-4)

        saver = tf.train.Saver(max_to_keep=2)
        # summary_writer = tf.summary.FileWriter(logdir)

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        if load_model:
            try:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(logdir)
                saver.restore(sess, ckpt.model_checkpoint_path)
                iteration_offset = int(
                    ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except Exception:
                print('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            iteration_offset = 0

        summary_writers[0].add_graph(sess.graph)

        ################################################################
        # Train Loop
        ################################################################

        tic = time.time()
        total_timesteps = 0

        while not all([
                network.buffer.full(N=int(buffer_size / 2.))
                for network in networks
        ]):
            networks[0].buffer.games_played += 1
            print('Game number: %s. Buffer sizes: %s' % (
                networks[0].buffer.games_played,
                [network.buffer.buffer_size for network in networks]))
            obs = env.reset()

            done_n = np.array([False] * env.n_actors)
            steps = 0
            length_alive = np.array([0] * env.n_actors)
            viewer = None
            while not done_n.all():

                length_alive[env.world.idxs_of_alive_snakes] += 1
                last_obs = obs

                acts = []
                for i, network in enumerate(networks):
                    act = network.greedy_select(
                        np.array([[x.A for x in get_data(last_obs, i)]]), 1.)
                    acts += [str(act[0])]

                # Next step
                obs, reward_n, done_n = env.step(acts)
                steps += 1

                for i in env.world.idxs_of_alive_snakes:
                    priority = networks[i].get_error(
                        np.array(get_data(last_obs, i)), np.array(acts[i]),
                        np.array(reward_n[i]), np.array(get_data(obs, i)),
                        np.array(done_n[i]))

                    networks[i].store(
                        np.array(get_data(last_obs, i)),  # state
                        np.array(acts[i]),  # action
                        np.array(reward_n[i]),  #rewards
                        np.array(get_data(obs, i)),  #new state
                        np.array(done_n[i]),  #done
                        priority=priority)

                    # networks[i].store_reservoir(np.array(get_data(last_obs, i)), # state
                    #                                     np.array(int(acts[i])))

                # terminate the collection of data if the controller shows stability
                # for a long time. This is a good thing.
                if steps > maximum_number_of_steps:
                    done_n[:] = True

        print('Filled Buffer')

        to_learn = np.array([0] * env.n_actors)
        frames_seen = np.array([0] * env.n_actors)

        for iteration in range(iteration_offset,
                               iteration_offset + iterations + 1):
            print('{0} Iteration {1} {0}'.format('*' * 10, iteration))
            networks[0].buffer.soft_reset()
            timesteps_in_iteration = 0

            if (iteration % update_freq == 0):
                saver.save(
                    sess,
                    os.path.join(logdir, 'model-' + str(iteration) + '.cptk'))
                print "Saved Model. Timestep count: %s" % iteration

            total_number_of_steps_in_iteration = 0

            total_reward = np.array([0] * env.n_actors)

            while True:
                networks[0].buffer.games_played += 1
                if (networks[0].buffer.games_played % 10) == 0:
                    print('Epoch: %s. Game number: %s' % (
                        iteration, networks[0].buffer.games_played))
                obs = env.reset()

                # raw_observations = []
                # raw_observations.append(np.array(obs))

                animate_episode = ((networks[0].buffer.games_played - 1)
                                   == 0) and (iteration % update_freq
                                              == 0) and animate

                done_n = np.array([False] * env.n_actors)
                steps = 0

                # Runs policy, collects observations and rewards
                viewer = None

                length_alive = np.array([0] * env.n_actors)
                game_time = time.time()
                action_times = []
                learn_times = []

                select_from_average = np.array([True] * env.n_actors)

                for idx in range(select_from_average.shape[0]):
                    r = np.random.uniform()
                    eta = eta_schedule.value(iteration)
                    if (eta > 0) and (r <= eta):
                        select_from_average[idx] = False  # Sample from greedy

                while not done_n.all():

                    if animate_episode:
                        if (not viewer) and (not headless):
                            from gym.envs.classic_control import rendering
                            viewer = rendering.SimpleImageViewer()

                        rgb = env.render('rgb_array', headless=headless)
                        scaler = 10
                        rgb = repeat_upsample(rgb, scaler, scaler)

                        if not headless:

                            viewer.imshow(rgb)
                            time.sleep(.01)

                        monitor.add(rgb, iteration,
                                    networks[0].buffer.games_played)

                    length_alive[env.world.idxs_of_alive_snakes] += 1
                    to_learn[env.world.idxs_of_alive_snakes] += 1
                    # ob = get_data(np.array(raw_observations)[-2:])
                    last_obs = obs

                    # Control the exploration
                    acts = []
                    action_time = time.time()
                    for i, network in enumerate(networks):
                        if env.world.snakes[i].alive:
                            act = network.select_from_policy(
                                np.array([[x.A
                                           for x in get_data(last_obs, i)]]),
                                epsilon_schedule.value(iteration),
                                select_from_average[i])
                            acts += [str(act[0])]
                        else:
                            acts += [str(0)]

                    action_times.append(time.time() - action_time)
                    # Next step
                    obs, reward_n, done_n = env.step(acts)

                    total_reward += np.array(reward_n)

                    total_number_of_steps_in_iteration += 1
                    steps += 1

                    for i in env.world.idxs_of_alive_snakes:
                        priority = networks[i].get_error(
                            np.array(get_data(last_obs, i)), np.array(acts[i]),
                            np.array(reward_n[i]), np.array(get_data(obs, i)),
                            np.array(done_n[i]))

                        networks[i].store(
                            np.array(get_data(last_obs, i)),  # state
                            np.array(acts[i]),  # action
                            np.array(reward_n[i]),  #rewards
                            np.array(get_data(obs, i)),  #new state
                            np.array(done_n[i]),  #done
                            priority=priority)
                        if not select_from_average[i]:
                            networks[i].store_reservoir(
                                np.array(get_data(last_obs, i)),  # state
                                np.array(int(acts[i])))

                    # max: to cover all new steps added to buffer, min: to not overdo too much
                    learn_time = time.time()
                    for network_id in [
                            x for x in range(len(to_learn))
                            if to_learn[x] >= max(
                                networks[x].batch_size,
                                networks[x].avg_policy_batch_size)
                    ]:
                        to_learn[network_id] = 0
                        network = networks[network_id]
                        for _ in range(5):
                            frames_seen[network_id] += networks[
                                network_id].batch_size
                            if use_priority:
                                network.train_step(learning_rate_schedule,
                                                   beta_schedule)
                            else:
                                network.train_step(learning_rate_schedule)

                        for _ in range(5):
                            if network.reservoir.buffer_size > 0:
                                network.avg_policy_train_step(
                                    policy_learning_rate_schedule)

                    learn_times.append(time.time() - learn_time)
                    # terminate the collection of data if the controller shows stability
                    # for a long time. This is a good thing.
                    if steps > maximum_number_of_steps:
                        done_n[:] = True

                if viewer:
                    viewer.close()

                if networks[0].buffer.games_played >= 1:
                    break

            game_time = time.time() - game_time
            monitor.make_gifs(iteration)

            for count, writer in enumerate(summary_writers[:-1]):
                summary = tf.Summary()
                summary.value.add(tag='Average Reward',
                                  simple_value=(total_reward[count]))
                summary.value.add(tag='Steps Taken',
                                  simple_value=(length_alive[count]))
                summary.value.add(tag='Frames Seen',
                                  simple_value=frames_seen[count])
                writer.add_summary(summary, iteration)
                writer.flush()

            summary = tf.Summary()
            summary.value.add(tag='Time Elapsed/Game', simple_value=game_time)
            summary.value.add(tag='Time Elapsed/Total Actions',
                              simple_value=np.sum(action_times))
            summary.value.add(tag='Time Elapsed/Mean Actions',
                              simple_value=np.mean(action_times))
            summary.value.add(tag='Time Elapsed/Max Actions',
                              simple_value=np.max(action_times))
            summary.value.add(tag='Time Elapsed/Min Actions',
                              simple_value=np.min(action_times))
            summary.value.add(tag='Time Elapsed/Total Learn',
                              simple_value=np.sum(learn_times))
            summary.value.add(tag='Time Elapsed/Mean Learn',
                              simple_value=np.mean(learn_times))
            summary.value.add(tag='Time Elapsed/Max Learn',
                              simple_value=np.max(learn_times))
            summary.value.add(tag='Time Elapsed/Min Learn',
                              simple_value=np.min(learn_times))
            summary_writers[-1].add_summary(summary, iteration)
            summary_writers[-1].flush()

            print(game_time, sum(action_times), sum(learn_times))
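
For reference, a minimal sketch of how the run() above might be invoked. Only the keyword names are taken from the kwargs the function reads; every value below is a placeholder, not a setting from the original project.

# Hypothetical invocation of run() from Example #1. The keyword names match
# the kwargs read at the top of the function; the values are placeholders.
if __name__ == '__main__':
    run(iterations=100000,
        discount=0.99,
        batch_size=32,
        num_batches=4,
        max_seq_length=1000,
        learning_rate=1e-4,
        animate=False,
        logdir='logs/selfplay',
        seed=0,
        games_played_per_epoch=1,
        mcts_iterations=0,
        batches_per_epoch=100,
        headless=True,
        update_freq=1000,
        buffer_size=100000,
        use_priority=True,
        policy_batch_size=32,
        reservoir_buffer_size=100000)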
Example #2
def run(**kwargs):
    '''
    Setup TF, gym environment, etc.
    '''

    iterations = kwargs['iterations']
    discount = kwargs['discount']
    batch_size = kwargs['batch_size']
    num_batches = kwargs['num_batches']
    max_seq_length = kwargs['max_seq_length']
    learning_rate = kwargs['learning_rate']
    animate = kwargs['animate']
    logdir = kwargs['logdir']
    seed = kwargs['seed']
    games_played_per_epoch = kwargs['games_played_per_epoch']
    load_model = False
    mcts_iterations = kwargs['mcts_iterations']
    batches_per_epoch = kwargs['batches_per_epoch']
    headless = kwargs['headless']
    update_freq = kwargs['update_freq']
    buffer_size = kwargs['buffer_size']

    if headless:
        import matplotlib
        matplotlib.use('Agg')  # assumed intent: select a non-interactive backend for headless runs

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed)
    np.random.seed(seed)

    
    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v0')  # Make the gym environment
    maximum_number_of_steps = max_seq_length  # maximum episode length (alternatively env.max_episode_steps)
   

    ################################################################
    # TF BOILERPLATE
    ################################################################

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)

    summary_writers = []
    for idx in np.arange(env.n_actors):
        summary_writers.append(tf.summary.FileWriter(os.path.join(logdir, 'tensorboard', 'snake_%s' % idx)))

    summary_writers.append(tf.summary.FileWriter(os.path.join(logdir, 'tensorboard', 'training_stats')))

    def rgb2gray(rgb):
        return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])

    with tf.Session(config=tf_config) as sess:
        network = DQN(
            sess,
            create_basic([16, 16, 64], transpose=True),
            [1, env.world.screen_width, env.world.screen_height],
            summary_writers[-1],
            n_actions=4,
            batch_size=batch_size,
            gamma=.99,
            update_freq=update_freq,
            ddqn=True,  # double dqn
            buffer_size=buffer_size,
            clip_grad=None,
            batches_per_epoch=batches_per_epoch,
            is_sparse=False)

        monitor = Monitor(os.path.join(logdir,'gifs'))
        epsilon_schedule = LinearSchedule(iterations * 9 // 10, 1.0, 0.01)
        learning_rate_schedule = PiecewiseSchedule([(0, 1e-3), (20000, 5e-4), (50000, 1e-4)], outside_value=1e-4)

        saver = tf.train.Saver(max_to_keep=2)
        # summary_writer = tf.summary.FileWriter(logdir) 

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        if load_model:
            try:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(logdir)
                saver.restore(sess, ckpt.model_checkpoint_path)
                iteration_offset = int(ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except Exception:
                print('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            iteration_offset = 0

        summary_writers[0].add_graph(sess.graph)

        ################################################################
        # Fill Buffer
        ################################################################

        tic = time.time()
        total_timesteps = 0

        while not network.buffer.full(N=buffer_size // 2):
            network.buffer.games_played += 1
            print('Game number: %s. Buffer size: %s' % (network.buffer.games_played, network.buffer.buffer_size))
            _ = env.reset()
            obs = env.render('rgb_array', headless=headless).astype(float)
            obs /= obs.max()
            obs = rgb2gray(obs)

            done_n = np.array([False]*env.n_actors)
            steps = 0
            while not done_n.all():
                last_obs = obs
                acts = network.greedy_select([[last_obs]], 1.) 
                acts = [str(x) for x in acts]
      
                # Next step
                _, reward_n, done_n = env.step(acts[-1])
                obs = env.render('rgb_array', headless=headless).astype(float)
                obs /= obs.max()
                obs = rgb2gray(obs)

                steps += 1

                network.store(np.array([[last_obs]]), # state
                                  np.array(acts), # action
                                  np.array(reward_n), #rewards
                                  np.array([[obs]]), #new state
                                  np.array(done_n) #done
                                  )

                if steps > maximum_number_of_steps:
                    done_n[:] = True

        print('Filled Buffer')

        ################################################################
        # Train Loop
        ################################################################
        network.buffer.soft_reset()
        total_number_of_steps_in_iteration = 0

        for iteration in range(iteration_offset, iteration_offset + iterations):
            print('{0} Iteration {1} {0}'.format('*'*10, iteration))
            timesteps_in_iteration = 0

            if (iteration % update_freq == 0):
                saver.save(sess, os.path.join(logdir, 'model-' + str(iteration) + '.cptk'))
                print('Saved Model. Timestep count: %s' % iteration)

            total_reward = np.array([0]*env.n_actors)

            while True:
                network.buffer.games_played += 1
                if (network.buffer.games_played % 10) == 0:
                    print('Epoch: %s. Game number: %s' % (iteration, network.buffer.games_played))
                _ = env.reset()
                rgb = obs = env.render('rgb_array', headless=headless).astype(float)
                obs /= obs.max()
                obs = rgb2gray(obs)

                animate_episode = (iteration % update_freq == 0) and animate

                done_n = np.array([False]*env.n_actors)
                steps = 0
                
                # Runs policy, collects observations and rewards
                viewer = None

                while not done_n.all():

                    if animate_episode:
                        if (not viewer) and (not headless):
                            from gym.envs.classic_control import rendering
                            viewer = rendering.SimpleImageViewer()

                        rgb = env.render('rgb_array', headless=headless)
                        scaler = 10
                        rgb = repeat_upsample(rgb, scaler, scaler)

                        if not headless:
                            
                            viewer.imshow(rgb)
                            time.sleep(.01)

                        monitor.add(rgb, iteration, network.buffer.games_played)

                    
                    # ob = get_data(np.array(raw_observations)[-2:])
                    last_obs = obs

                    # Control the exploration
                    acts = network.greedy_select([[last_obs]], epsilon_schedule.value(network.epoch)) # epsilon greedy

                    acts = [str(x) for x in acts]
          
                    # Next step
                    _, reward_n, done_n = env.step(acts[-1])
                    obs = env.render('rgb_array', headless=headless).astype(float)
                    obs /= obs.max()
                    obs = rgb2gray(obs)

                    total_reward += np.array(reward_n)

                    if total_number_of_steps_in_iteration % 4 == 0:
                        network.train_step(learning_rate_schedule)
                    
                    total_number_of_steps_in_iteration += 1
                    steps += 1

                    network.store(np.array([[last_obs]]), # state
                                  np.array(acts), # action
                                  np.array(reward_n), #rewards
                                  np.array([[obs]]), #new state
                                  np.array(done_n) #done
                                  )

                    # terminate the collection of data if the controller shows stability
                    # for a long time. This is a good thing.
                    if steps > maximum_number_of_steps:
                        done_n[:] = True

                if viewer:
                    viewer.close()

                if network.buffer.games_played >= 1:
                    break

            monitor.make_gifs(iteration)
            
            
            for count, writer in enumerate(summary_writers):
                if count < (len(summary_writers) - 1):
                    summary = tf.Summary()
                    summary.value.add(tag='Average Reward', simple_value=(total_reward[count]))
                    summary.value.add(tag='Steps Taken', simple_value=(steps))
                    writer.add_summary(summary, iteration)
                writer.flush()
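
Both examples lean on schedule helpers (LinearSchedule, PiecewiseSchedule) whose implementation is not shown. Below is a minimal self-contained sketch of the behaviour they are assumed to have: linear interpolation between (step, value) breakpoints, with a fixed value outside the listed range. It is an illustration, not the project's actual classes.

# Stand-in for the PiecewiseSchedule used above. Assumption: it linearly
# interpolates between consecutive (step, value) breakpoints and returns
# outside_value beyond them. Not the project's actual implementation.
class SimplePiecewiseSchedule(object):
    def __init__(self, endpoints, outside_value):
        self.endpoints = endpoints          # sorted list of (step, value)
        self.outside_value = outside_value  # value used outside the breakpoints

    def value(self, step):
        for (l_t, l_v), (r_t, r_v) in zip(self.endpoints[:-1], self.endpoints[1:]):
            if l_t <= step < r_t:
                alpha = float(step - l_t) / (r_t - l_t)
                return l_v + alpha * (r_v - l_v)
        return self.outside_value

# With the learning-rate breakpoints from Example #2:
lr_schedule = SimplePiecewiseSchedule([(0, 1e-3), (20000, 5e-4), (50000, 1e-4)], outside_value=1e-4)
print(lr_schedule.value(0))      # 1e-3
print(lr_schedule.value(35000))  # 3e-4, halfway between 5e-4 and 1e-4
print(lr_schedule.value(80000))  # 1e-4, past the last breakpoint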
Example #3
def run(**kwargs):
    '''
    Setup TF, gym environment, etc.
    '''

    logdir = kwargs['logdir']
    seed = kwargs['seed']
    headless = kwargs['headless']

    if headless:
        import matplotlib
        matplotlib.use('Agg')  # assumed intent: select a non-interactive backend for headless runs

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed * 20)
    np.random.seed(seed * 20)

    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v0')  # Make the gym environment

    ################################################################
    # TF BOILERPLATE
    ################################################################

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    def rgb2gray(rgb):
        return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

    with tf.Session(config=tf_config) as sess:
        network = DQN(
            sess,
            create_basic([64, 64, 256], transpose=True),
            [(env.world.number_of_snakes) * 2 + 1, env.world.screen_width,
             env.world.screen_height],
            None,
            n_actions=4,
            batch_size=None,
            gamma=.99,
            update_freq=None,
            ddqn=True,  # double dqn
            buffer_size=None,
            clip_grad=None,
            batches_per_epoch=None,
            is_sparse=False,
            use_priority=False)

        monitor = Monitor(os.path.join(logdir, 'test_gifs'))
        # summary_writer = tf.summary.FileWriter(logdir)

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        saver = tf.train.Saver(max_to_keep=2)

        if True:
            try:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(
                    os.path.join(os.getcwd(), logdir))
                saver.restore(sess, ckpt.model_checkpoint_path)
                iteration_offset = int(
                    ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except Exception:
                print('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            iteration_offset = 0

        ################################################################
        # Fill Buffer
        ################################################################

        tic = time.time()
        total_timesteps = 0

        for iteration in range(5):
            obs = env.reset()
            # obs = env.render('rgb_array', headless = headless).astype(float)
            # obs /= obs.max()
            # obs = rgb2gray(obs)

            done_n = np.array([False] * env.n_actors)
            steps = 0
            viewer = None
            while not done_n.all():

                if True:
                    if (not viewer) and (not headless):
                        from gym.envs.classic_control import rendering
                        viewer = rendering.SimpleImageViewer()

                    rgb = env.render('rgb_array', headless=headless)
                    scaler = 10
                    rgb = repeat_upsample(rgb, scaler, scaler)

                    if not headless:

                        viewer.imshow(rgb)
                        time.sleep(.01)

                    monitor.add(rgb, iteration, iteration)

                last_obs = np.array([[x.A for x in obs]])
                acts = network.greedy_select(
                    last_obs, 0)  #network.greedy_select([[last_obs]], 0)
                acts = [str(x) for x in acts]

                # Next step
                obs, reward_n, done_n = env.step(acts[-1])
                # obs = env.render('rgb_array', headless = headless).astype(float)
                # obs /= obs.max()
                # obs = rgb2gray(obs)

                steps += 1
                if steps > 300:
                    break

            monitor.make_gifs(iteration, fps=12)
            pdb.set_trace()
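
Examples #2 and #3 preprocess rendered frames before feeding them to the network: the RGB array is normalized and then converted to grayscale with standard luminance weights. A small self-contained sketch of that preprocessing on a dummy frame follows; the 20x20 size is arbitrary and used only for illustration.

import numpy as np

def rgb2gray(rgb):
    # ITU-R 601 luminance weights, as in the rgb2gray helpers above
    return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

# Dummy frame standing in for env.render('rgb_array', headless=headless);
# the real frame size depends on the snake world's screen dimensions.
frame = np.random.randint(0, 256, size=(20, 20, 3)).astype(float)
frame /= frame.max()    # scale into [0, 1], as Examples #2 and #3 do
gray = rgb2gray(frame)  # shape (20, 20): a single grayscale channel
print(gray.shape, gray.min(), gray.max())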