def run(**kwargs):
    ''' Set up TF, the gym environment, etc., then train self-play agents. '''
    iterations = kwargs['iterations']
    discount = kwargs['discount']
    batch_size = kwargs['batch_size']
    num_batches = kwargs['num_batches']
    max_seq_length = kwargs['max_seq_length']
    learning_rate = kwargs['learning_rate']
    animate = kwargs['animate']
    logdir = kwargs['logdir']
    seed = kwargs['seed']
    games_played_per_epoch = kwargs['games_played_per_epoch']
    load_model = False
    mcts_iterations = kwargs['mcts_iterations']
    batches_per_epoch = kwargs['batches_per_epoch']
    headless = kwargs['headless']
    update_freq = kwargs['update_freq']
    buffer_size = kwargs['buffer_size']
    use_priority = kwargs['use_priority']
    policy_batch_size = kwargs['policy_batch_size']
    reservoir_buffer_size = kwargs['reservoir_buffer_size']

    if headless:
        import matplotlib

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed)
    np.random.seed(seed)

    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v1')  # Make the gym environment

    # Maximum length for episodes
    maximum_number_of_steps = max_seq_length  # or env.max_episode_steps

    ################################################################
    # TF BOILERPLATE
    ################################################################
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    summary_writers = []
    for idx in np.arange(env.n_actors):
        summary_writers.append(
            tf.summary.FileWriter(
                os.path.join(logdir, 'tensorboard', 'snake_%s' % idx)))
    summary_writers.append(
        tf.summary.FileWriter(
            os.path.join(logdir, 'tensorboard', 'training_stats')))

    with tf.Session(config=tf_config) as sess:
        networks = []
        for i in range(env.n_actors):
            networks.append(
                SelfPlay(
                    sess,
                    create_basic([64, 64, 256], transpose=True),
                    [(env.n_actors) * 2 + 1,
                     env.world.screen_width,
                     env.world.screen_height],
                    summary_writers[-1],
                    n_actions=4,
                    batch_size=batch_size,
                    gamma=.99,
                    update_freq=update_freq,
                    ddqn=True,  # double dqn
                    buffer_size=buffer_size,
                    clip_grad=None,
                    batches_per_epoch=batches_per_epoch,
                    is_sparse=True,
                    use_priority=use_priority,
                    _id=i,
                    policy_batch_size=policy_batch_size,
                    reservoir_buffer_size=reservoir_buffer_size))

        monitor = Monitor(os.path.join(logdir, 'gifs'))

        epsilon_schedule = PiecewiseSchedule(
            [(0, .2), (50000, .05), (75000, .01)],
            outside_value=.01)  # LinearSchedule(iterations*60/100, 1., 0.001)
        eta_schedule = PiecewiseSchedule(
            [(0, .8), (60000, .4)],
            outside_value=.4)  # LinearSchedule(iterations*60/100, 0.2, 0.1)
        if use_priority:
            beta_schedule = LinearSchedule(iterations, 0.4, 1.)
        learning_rate_schedule = PiecewiseSchedule(
            [(0, 1e-3), (30000, 5e-4), (60000, 1e-4)], outside_value=1e-4)
        policy_learning_rate_schedule = PiecewiseSchedule(
            [(0, 1e-3), (4000, 5e-4), (20000, 1e-4)], outside_value=1e-4)

        saver = tf.train.Saver(max_to_keep=2)
        # summary_writer = tf.summary.FileWriter(logdir)

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        if load_model == True:
            try:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(logdir)
                saver.restore(sess, ckpt.model_checkpoint_path)
                iteration_offset = int(
                    ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except Exception:
                print('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            iteration_offset = 0

        summary_writers[0].add_graph(sess.graph)

        ################################################################
        # Fill Buffer
        ################################################################
        tic = time.time()
        total_timesteps = 0

        # Play fully greedy games until every network's replay buffer is
        # at least half full.
        while not all([
                network.buffer.full(N=int(buffer_size / 2.))
                for network in networks
        ]):
            networks[0].buffer.games_played += 1
            print('Game number: %s. Buffer sizes: %s' % (
                networks[0].buffer.games_played,
                [network.buffer.buffer_size for network in networks]))
            obs = env.reset()
            done_n = np.array([False] * env.n_actors)
            steps = 0
            length_alive = np.array([0] * env.n_actors)
            viewer = None
            while not done_n.all():
                length_alive[env.world.idxs_of_alive_snakes] += 1
                last_obs = obs

                acts = []
                for i, network in enumerate(networks):
                    act = network.greedy_select(
                        np.array([[x.A for x in get_data(last_obs, i)]]), 1.)
                    acts += [str(act[0])]

                # Next step
                obs, reward_n, done_n = env.step(acts)
                steps += 1

                for i in env.world.idxs_of_alive_snakes:
                    priority = networks[i].get_error(
                        np.array(get_data(last_obs, i)), np.array(acts[i]),
                        np.array(reward_n[i]), np.array(get_data(obs, i)),
                        np.array(done_n[i]))
                    networks[i].store(
                        np.array(get_data(last_obs, i)),  # state
                        np.array(acts[i]),  # action
                        np.array(reward_n[i]),  # rewards
                        np.array(get_data(obs, i)),  # new state
                        np.array(done_n[i]),  # done
                        priority=priority)
                    # networks[i].store_reservoir(
                    #     np.array(get_data(last_obs, i)),  # state
                    #     np.array(int(acts[i])))

                # Terminate the collection of data if the controller shows
                # stability for a long time. This is a good thing.
                if steps > maximum_number_of_steps:
                    done_n[:] = True

        print('Filled Buffer')

        ################################################################
        # Train Loop
        ################################################################
        to_learn = np.array([0] * env.n_actors)
        frames_seen = np.array([0] * env.n_actors)
        for iteration in range(iteration_offset,
                               iteration_offset + iterations + 1):
            print('{0} Iteration {1} {0}'.format('*' * 10, iteration))
            networks[0].buffer.soft_reset()
            timesteps_in_iteration = 0

            if (iteration % update_freq == 0):
                saver.save(
                    sess,
                    os.path.join(logdir, 'model-' + str(iteration) + '.cptk'))
                print('Saved Model. Timestep count: %s' % iteration)

            total_number_of_steps_in_iteration = 0
            total_reward = np.array([0] * env.n_actors)

            while True:
                networks[0].buffer.games_played += 1
                if (((networks[0].buffer.games_played) % 10) == 0):
                    print('Epoch: %s. Game number: %s' %
                          (iteration, networks[0].buffer.games_played))
                obs = env.reset()
                # raw_observations = []
                # raw_observations.append(np.array(obs))
                animate_episode = ((networks[0].buffer.games_played - 1) == 0) \
                    and (iteration % update_freq == 0) and animate

                done_n = np.array([False] * env.n_actors)
                steps = 0

                # Runs policy, collects observations and rewards
                viewer = None
                length_alive = np.array([0] * env.n_actors)
                game_time = time.time()
                action_times = []
                learn_times = []

                # Decide per snake whether to act from the average-policy
                # network or the greedy best-response network for this game.
                select_from_average = np.array([True] * env.n_actors)
                for idx in range(select_from_average.shape[0]):
                    r = np.random.uniform()
                    eta = eta_schedule.value(iteration)
                    if (eta > 0) and (r <= eta):
                        select_from_average[idx] = False  # Sample from greedy

                while not done_n.all():
                    if animate_episode:
                        if (not viewer) and (not headless):
                            from gym.envs.classic_control import rendering
                            viewer = rendering.SimpleImageViewer()
                        rgb = env.render('rgb_array', headless=headless)
                        scaler = 10
                        rgb = repeat_upsample(rgb, scaler, scaler)
                        if not headless:
                            viewer.imshow(rgb)
                            time.sleep(.01)
                        monitor.add(rgb, iteration,
                                    networks[0].buffer.games_played)

                    length_alive[env.world.idxs_of_alive_snakes] += 1
                    to_learn[env.world.idxs_of_alive_snakes] += 1

                    # ob = get_data(np.array(raw_observations)[-2:])
                    last_obs = obs

                    # Control the exploration
                    acts = []
                    action_time = time.time()
                    for i, network in enumerate(networks):
                        if env.world.snakes[i].alive:
                            act = network.select_from_policy(
                                np.array(
                                    [[x.A for x in get_data(last_obs, i)]]),
                                epsilon_schedule.value(iteration),
                                select_from_average[i])
                            acts += [str(act[0])]
                        else:
                            acts += [str(0)]
                    action_times.append(time.time() - action_time)

                    # Next step
                    obs, reward_n, done_n = env.step(acts)
                    total_reward += np.array(reward_n)
                    total_number_of_steps_in_iteration += 1
                    steps += 1

                    for i in env.world.idxs_of_alive_snakes:
                        priority = networks[i].get_error(
                            np.array(get_data(last_obs, i)),
                            np.array(acts[i]), np.array(reward_n[i]),
                            np.array(get_data(obs, i)), np.array(done_n[i]))
                        networks[i].store(
                            np.array(get_data(last_obs, i)),  # state
                            np.array(acts[i]),  # action
                            np.array(reward_n[i]),  # rewards
                            np.array(get_data(obs, i)),  # new state
                            np.array(done_n[i]),  # done
                            priority=priority)
                        if not select_from_average[i]:
                            networks[i].store_reservoir(
                                np.array(get_data(last_obs, i)),  # state
                                np.array(int(acts[i])))  # action

                    # max: to cover all new steps added to buffer,
                    # min: to not overdo too much
                    learn_time = time.time()
                    for network_id in [
                            x for x in range(len(to_learn))
                            if to_learn[x] >= max(
                                networks[x].batch_size,
                                networks[x].avg_policy_batch_size)
                    ]:
                        to_learn[network_id] = 0
                        network = networks[network_id]
                        for _ in range(5):
                            frames_seen[network_id] += networks[
                                network_id].batch_size
                            if use_priority:
                                network.train_step(learning_rate_schedule,
                                                   beta_schedule)
                            else:
                                network.train_step(learning_rate_schedule)
                        for _ in range(5):
                            if network.reservoir.buffer_size > 0:
                                network.avg_policy_train_step(
                                    policy_learning_rate_schedule)
                    learn_times.append(time.time() - learn_time)

                    # Terminate the collection of data if the controller shows
                    # stability for a long time. This is a good thing.
                    if steps > maximum_number_of_steps:
                        done_n[:] = True

                if viewer:
                    viewer.close()

                if networks[0].buffer.games_played >= 1:
                    break

            game_time = time.time() - game_time
            monitor.make_gifs(iteration)

            # Per-snake summaries
            for count, writer in enumerate(summary_writers[:-1]):
                summary = tf.Summary()
                summary.value.add(tag='Average Reward',
                                  simple_value=(total_reward[count]))
                summary.value.add(tag='Steps Taken',
                                  simple_value=(length_alive[count]))
                summary.value.add(tag='Frames Seen',
                                  simple_value=frames_seen[count])
                writer.add_summary(summary, iteration)
                writer.flush()

            # Training-stats summaries
            summary = tf.Summary()
            summary.value.add(tag='Time Elapsed/Game', simple_value=game_time)
            summary.value.add(tag='Time Elapsed/Total Actions',
                              simple_value=np.sum(action_times))
            summary.value.add(tag='Time Elapsed/Mean Actions',
                              simple_value=np.mean(action_times))
            summary.value.add(tag='Time Elapsed/Max Actions',
                              simple_value=np.max(action_times))
            summary.value.add(tag='Time Elapsed/Min Actions',
                              simple_value=np.min(action_times))
            summary.value.add(tag='Time Elapsed/Total Learn',
                              simple_value=np.sum(learn_times))
            summary.value.add(tag='Time Elapsed/Mean Learn',
                              simple_value=np.mean(learn_times))
            summary.value.add(tag='Time Elapsed/Max Learn',
                              simple_value=np.max(learn_times))
            summary.value.add(tag='Time Elapsed/Min Learn',
                              simple_value=np.min(learn_times))
            summary_writers[-1].add_summary(summary, iteration)
            summary_writers[-1].flush()

            print(game_time, sum(action_times), sum(learn_times))
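# The PiecewiseSchedule used above is imported from elsewhere in this repo.
# For reference, here is a minimal sketch of the behavior its call sites
# imply -- linear interpolation between (t, value) endpoints, with
# `outside_value` returned beyond the listed endpoints. This is an
# assumption based on how the schedules are constructed and queried above,
# not necessarily the repo's exact implementation.
class _PiecewiseScheduleSketch(object):
    def __init__(self, endpoints, outside_value=None):
        # endpoints: list of (t, value) pairs sorted by increasing t
        self.endpoints = endpoints
        self.outside_value = outside_value

    def value(self, t):
        # Interpolate linearly within the segment containing t
        for (l_t, l_v), (r_t, r_v) in zip(self.endpoints[:-1],
                                          self.endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return l_v + alpha * (r_v - l_v)
        # Outside the listed endpoints, fall back to the constant value
        return self.outside_value

# e.g. _PiecewiseScheduleSketch([(0, .2), (50000, .05), (75000, .01)],
#                               outside_value=.01).value(25000) -> 0.125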
def run(**kwargs):
    ''' Set up TF, the gym environment, etc., then train a single DQN agent
    on rendered grayscale frames. '''
    iterations = kwargs['iterations']
    discount = kwargs['discount']
    batch_size = kwargs['batch_size']
    num_batches = kwargs['num_batches']
    max_seq_length = kwargs['max_seq_length']
    learning_rate = kwargs['learning_rate']
    animate = kwargs['animate']
    logdir = kwargs['logdir']
    seed = kwargs['seed']
    games_played_per_epoch = kwargs['games_played_per_epoch']
    load_model = False
    mcts_iterations = kwargs['mcts_iterations']
    batches_per_epoch = kwargs['batches_per_epoch']
    headless = kwargs['headless']
    update_freq = kwargs['update_freq']
    buffer_size = kwargs['buffer_size']

    if headless:
        import matplotlib

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed)
    np.random.seed(seed)

    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v0')  # Make the gym environment

    # Maximum length for episodes
    maximum_number_of_steps = max_seq_length  # or env.max_episode_steps

    ################################################################
    # TF BOILERPLATE
    ################################################################
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    summary_writers = []
    for idx in np.arange(env.n_actors):
        summary_writers.append(
            tf.summary.FileWriter(
                os.path.join(logdir, 'tensorboard', 'snake_%s' % idx)))
    summary_writers.append(
        tf.summary.FileWriter(
            os.path.join(logdir, 'tensorboard', 'training_stats')))

    def rgb2gray(rgb):
        # Standard luminance weights for RGB -> grayscale conversion
        return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

    with tf.Session(config=tf_config) as sess:
        network = DQN(
            sess,
            create_basic([16, 16, 64], transpose=True),
            [1, env.world.screen_width, env.world.screen_height],
            summary_writers[-1],
            n_actions=4,
            batch_size=batch_size,
            gamma=.99,
            update_freq=update_freq,
            ddqn=True,  # double dqn
            buffer_size=buffer_size,
            clip_grad=None,
            batches_per_epoch=batches_per_epoch,
            is_sparse=False)

        monitor = Monitor(os.path.join(logdir, 'gifs'))
        epsilon_schedule = LinearSchedule(iterations * 9 // 10, 1.0, 0.01)
        learning_rate_schedule = PiecewiseSchedule(
            [(0, 1e-3), (20000, 5e-4), (50000, 1e-4)], outside_value=1e-4)

        saver = tf.train.Saver(max_to_keep=2)
        # summary_writer = tf.summary.FileWriter(logdir)

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        if load_model == True:
            try:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(logdir)
                saver.restore(sess, ckpt.model_checkpoint_path)
                iteration_offset = int(
                    ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
            except Exception:
                print('Failed to load. Starting from scratch')
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                iteration_offset = 0
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            iteration_offset = 0

        summary_writers[0].add_graph(sess.graph)

        ################################################################
        # Fill Buffer
        ################################################################
        tic = time.time()
        total_timesteps = 0

        while not network.buffer.full(N=buffer_size // 2):
            network.buffer.games_played += 1
            print('Game number: %s. Buffer size: %s' %
                  (network.buffer.games_played, network.buffer.buffer_size))
            _ = env.reset()
            obs = env.render('rgb_array', headless=headless).astype(float)
            obs /= obs.max()
            obs = rgb2gray(obs)
            done_n = np.array([False] * env.n_actors)
            steps = 0
            while not done_n.all():
                last_obs = obs
                acts = network.greedy_select([[last_obs]], 1.)
                acts = [str(x) for x in acts]

                # Next step
                _, reward_n, done_n = env.step(acts[-1])
                obs = env.render('rgb_array', headless=headless).astype(float)
                obs /= obs.max()
                obs = rgb2gray(obs)
                steps += 1

                network.store(
                    np.array([[last_obs]]),  # state
                    np.array(acts),  # action
                    np.array(reward_n),  # rewards
                    np.array([[obs]]),  # new state
                    np.array(done_n))  # done

                if steps > maximum_number_of_steps:
                    done_n[:] = True

        print('Filled Buffer')

        ################################################################
        # Train Loop
        ################################################################
        network.buffer.soft_reset()
        total_number_of_steps_in_iteration = 0

        for iteration in range(iteration_offset,
                               iteration_offset + iterations):
            print('{0} Iteration {1} {0}'.format('*' * 10, iteration))
            timesteps_in_iteration = 0

            if (iteration % update_freq == 0):
                saver.save(
                    sess,
                    os.path.join(logdir, 'model-' + str(iteration) + '.cptk'))
                print('Saved Model. Timestep count: %s' % iteration)

            total_reward = np.array([0] * env.n_actors)

            while True:
                network.buffer.games_played += 1
                if (((network.buffer.games_played) % 10) == 0):
                    print('Epoch: %s. Game number: %s' %
                          (iteration, network.buffer.games_played))
                _ = env.reset()
                rgb = obs = env.render('rgb_array',
                                       headless=headless).astype(float)
                obs /= obs.max()
                obs = rgb2gray(obs)

                animate_episode = (iteration % update_freq == 0) and animate
                done_n = np.array([False] * env.n_actors)
                steps = 0

                # Runs policy, collects observations and rewards
                viewer = None
                while not done_n.all():
                    if animate_episode:
                        if (not viewer) and (not headless):
                            from gym.envs.classic_control import rendering
                            viewer = rendering.SimpleImageViewer()
                        rgb = env.render('rgb_array', headless=headless)
                        scaler = 10
                        rgb = repeat_upsample(rgb, scaler, scaler)
                        if not headless:
                            viewer.imshow(rgb)
                            time.sleep(.01)
                        monitor.add(rgb, iteration,
                                    network.buffer.games_played)

                    # ob = get_data(np.array(raw_observations)[-2:])
                    last_obs = obs

                    # Control the exploration
                    acts = network.greedy_select(
                        [[last_obs]],
                        epsilon_schedule.value(network.epoch))  # epsilon greedy
                    acts = [str(x) for x in acts]

                    # Next step
                    _, reward_n, done_n = env.step(acts[-1])
                    obs = env.render('rgb_array',
                                     headless=headless).astype(float)
                    obs /= obs.max()
                    obs = rgb2gray(obs)
                    total_reward += np.array(reward_n)

                    # Train on a minibatch every 4 environment steps
                    if total_number_of_steps_in_iteration % 4 == 0:
                        network.train_step(learning_rate_schedule)

                    total_number_of_steps_in_iteration += 1
                    steps += 1

                    network.store(
                        np.array([[last_obs]]),  # state
                        np.array(acts),  # action
                        np.array(reward_n),  # rewards
                        np.array([[obs]]),  # new state
                        np.array(done_n))  # done

                    # Terminate the collection of data if the controller shows
                    # stability for a long time. This is a good thing.
                    if steps > maximum_number_of_steps:
                        done_n[:] = True

                if viewer:
                    viewer.close()

                if network.buffer.games_played >= 1:
                    break

            monitor.make_gifs(iteration)

            for count, writer in enumerate(summary_writers[:-1]):
                summary = tf.Summary()
                summary.value.add(tag='Average Reward',
                                  simple_value=(total_reward[count]))
                summary.value.add(tag='Steps Taken', simple_value=(steps))
                writer.add_summary(summary, iteration)
                writer.flush()
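# LinearSchedule is likewise imported from elsewhere in the repo. Judging by
# the call sites -- LinearSchedule(iterations * 9 // 10, 1.0, 0.01) annealing
# epsilon from 1.0 down to 0.01, and LinearSchedule(iterations, 0.4, 1.)
# annealing the prioritized-replay beta from 0.4 up to 1.0 -- the argument
# order appears to be (schedule_timesteps, initial_p, final_p). A minimal
# sketch under that assumption:
class _LinearScheduleSketch(object):
    def __init__(self, schedule_timesteps, initial_p, final_p):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule elapsed, clipped to [0, 1];
        # interpolate from initial_p to final_p, then hold final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# e.g. _LinearScheduleSketch(100, 1.0, 0.01).value(50) -> 0.505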
def run(**kwargs):
    ''' Set up TF, the gym environment, etc., then roll out a trained
    network for evaluation and record gifs. '''
    logdir = kwargs['logdir']
    seed = kwargs['seed']
    headless = kwargs['headless']

    if headless:
        import matplotlib

    ################################################################
    # SEEDS
    ################################################################
    tf.set_random_seed(seed * 20)
    np.random.seed(seed * 20)

    ################################################################
    # SETUP GYM + RL ALGO
    ################################################################
    env = gym.make('snake-v0')  # Make the gym environment

    ################################################################
    # TF BOILERPLATE
    ################################################################
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    def rgb2gray(rgb):
        # Standard luminance weights for RGB -> grayscale conversion
        return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

    with tf.Session(config=tf_config) as sess:
        network = DQN(
            sess,
            create_basic([64, 64, 256], transpose=True),
            [(env.world.number_of_snakes) * 2 + 1,
             env.world.screen_width,
             env.world.screen_height],
            None,
            n_actions=4,
            batch_size=None,
            gamma=.99,
            update_freq=None,
            ddqn=True,  # double dqn
            buffer_size=None,
            clip_grad=None,
            batches_per_epoch=None,
            is_sparse=False,
            use_priority=False)

        monitor = Monitor(os.path.join(logdir, 'test_gifs'))

        ## Load model from where you left off
        ## Does not play nice w/ plots in tensorboard at the moment
        ## TODO: FIX
        saver = tf.train.Saver(max_to_keep=2)
        # Always attempt to restore a checkpoint in this evaluation script;
        # fall back to fresh variables only if restoring fails.
        try:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(
                os.path.join(os.getcwd(), logdir))
            saver.restore(sess, ckpt.model_checkpoint_path)
            iteration_offset = int(
                ckpt.model_checkpoint_path.split('-')[-1].split('.')[0])
        except Exception:
            print('Failed to load. Starting from scratch')
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            iteration_offset = 0

        ################################################################
        # Evaluation Rollouts
        ################################################################
        tic = time.time()
        total_timesteps = 0

        for iteration in range(5):
            obs = env.reset()
            # obs = env.render('rgb_array', headless=headless).astype(float)
            # obs /= obs.max()
            # obs = rgb2gray(obs)
            done_n = np.array([False] * env.n_actors)
            steps = 0
            viewer = None
            while not done_n.all():
                # Always render during evaluation so every rollout is recorded
                if (not viewer) and (not headless):
                    from gym.envs.classic_control import rendering
                    viewer = rendering.SimpleImageViewer()
                rgb = env.render('rgb_array', headless=headless)
                scaler = 10
                rgb = repeat_upsample(rgb, scaler, scaler)
                if not headless:
                    viewer.imshow(rgb)
                    time.sleep(.01)
                monitor.add(rgb, iteration, iteration)

                last_obs = np.array([[x.A for x in obs]])

                # Act greedily (epsilon = 0) from the loaded network
                acts = network.greedy_select(
                    last_obs, 0)  # network.greedy_select([[last_obs]], 0)
                acts = [str(x) for x in acts]

                # Next step
                obs, reward_n, done_n = env.step(acts[-1])
                # obs = env.render('rgb_array', headless=headless).astype(float)
                # obs /= obs.max()
                # obs = rgb2gray(obs)
                steps += 1
                if steps > 300:
                    break

            monitor.make_gifs(iteration, fps=12)

        pdb.set_trace()  # Drop into the debugger for manual inspection
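# All three scripts call repeat_upsample(rgb, scaler, scaler) to enlarge the
# rendered frame before display/recording. A minimal sketch of that helper,
# assuming it simply repeats each pixel k times vertically and l times
# horizontally via np.repeat (an assumption -- the repo's actual version may
# also validate its arguments):
def _repeat_upsample_sketch(rgb_array, k=1, l=1):
    # rgb_array: (H, W, 3) image; returns an (H*k, W*l, 3) image
    return np.repeat(np.repeat(rgb_array, k, axis=0), l, axis=1)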