def main(_):
    model_dir = get_model_dir(conf,
        ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])
    preprocess_conf(conf)

    with tf.Session() as sess:
        # environment
        env = gym.make(conf.env_name)
        env._seed(conf.random_seed)

        assert isinstance(env.observation_space, gym.spaces.Box), \
            "observation space must be continuous"
        assert isinstance(env.action_space, gym.spaces.Box), \
            "action space must be continuous"

        # exploration strategy
        if conf.noise == 'ou':
            strategy = OUExploration(env, sigma=conf.noise_scale)
        elif conf.noise == 'brownian':
            strategy = BrownianExploration(env, conf.noise_scale)
        elif conf.noise == 'linear_decay':
            strategy = LinearDecayExploration(env)
        else:
            raise ValueError('Unknown exploration strategy: %s' % conf.noise)

        # networks
        shared_args = {
            'sess': sess,
            'input_shape': env.observation_space.shape,
            'action_size': env.action_space.shape[0],
            'hidden_dims': conf.hidden_dims,
            'use_batch_norm': conf.use_batch_norm,
            'use_seperate_networks': conf.use_seperate_networks,
            'hidden_w': conf.hidden_w,
            'action_w': conf.action_w,
            'hidden_fn': conf.hidden_fn,
            'action_fn': conf.action_fn,
            'w_reg': conf.w_reg,
        }

        logger.info("Creating prediction network...")
        pred_network = Network(scope='pred_network', **shared_args)

        logger.info("Creating target network...")
        target_network = Network(scope='target_network', **shared_args)
        target_network.make_soft_update_from(pred_network, conf.tau)

        # statistic
        stat = Statistic(sess, conf.env_name, model_dir,
                         pred_network.variables, conf.update_repeat)

        agent = NAF(sess, env, strategy, pred_network, target_network, stat,
                    conf.discount, conf.batch_size, conf.learning_rate,
                    conf.max_steps, conf.update_repeat, conf.max_episodes)

        # agent.run(conf.monitor, conf.display, conf.is_train)
        agent.run(conf.monitor, conf.display, True)
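# `make_soft_update_from(pred_network, conf.tau)` above presumably builds the
# soft (Polyak-averaged) target-update ops used by NAF/DDPG-style agents. The
# Network class is not shown here, so the sketch below only illustrates that
# idea with hypothetical variable lists; it is not the project's actual API.
import tensorflow as tf

def make_soft_update_ops(pred_vars, target_vars, tau):
    # Each target variable slowly tracks its prediction-network counterpart:
    #   target <- tau * pred + (1 - tau) * target
    return [t.assign(tau * p + (1.0 - tau) * t)
            for p, t in zip(pred_vars, target_vars)]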
class Simulation:
    graphic: Graphic
    world: World
    statistic: Statistic

    def __init__(self):
        self.world = World(*SimulationConfig.word_size)
        self.graphic = Graphic(self.world, *SimulationConfig.pane_size)
        if SimulationConfig.fixed_sick_cases:
            for i in range(SimulationConfig.population_size):
                if i < SimulationConfig.fixed_cases_count:
                    self.world.add_agent_on_free(Agent(self.world, True))
                else:
                    self.world.add_agent_on_free(Agent(self.world, False))
        else:
            for i in range(SimulationConfig.population_size):
                self.world.add_agent_on_free(
                    Agent(
                        self.world,
                        get_it_with_probability(
                            SimulationConfig.create_sick_agent_probability,
                            True, False)))
        self.statistic = Statistic(self.world)

    def run(self):
        while True:
            self.step()
            self.graphic.render()
            self.statistic.collect_statistics()

    def step(self):
        random.shuffle(self.world.agents)
        self.world.clear_death_agents()
        for agent in self.world.agents:
            agent.step()
        self.world.process_step_effects()
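# A minimal entry point for the Simulation class above might look like the
# following sketch; how the original project actually launches it (and where
# World, Graphic, Agent, and SimulationConfig are imported from) is not shown,
# so treat this as an assumption.
if __name__ == '__main__':
    simulation = Simulation()   # builds the world, graphics, and agent population
    simulation.run()            # loops forever: step, render, collect statistics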
def main(_):
    model_dir, data_dir = get_dirs(conf, ['exp_name'])
    # exp_start_time = datetime.datetime.now().strftime("%A_%b%d-%H%M%S")
    # data_dir = "logs/" + conf.exp_name + "_" + exp_start_time
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if type(env.action_space) is gym.spaces.Discrete:
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.allow_growth = True
    config = tf.ConfigProto(intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=8)
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)

        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        def var_print():
            for var in tf.global_variables():
                print(var)

        print("printing vars:------------------------------------------------")
        var_print()
        print("printing vars::------------------------------------------------")

        start_steps = 1000
        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()

        # pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward

            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size and global_step >= start_steps:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                # NOTE: all_epi_rewards and time_begin are assumed to be
                # module-level globals defined elsewhere in this script.
                all_epi_rewards.append(epi_rewards)
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                # pbar.update(local_step)

                # minimum return over (up to) the last 20 episodes
                lenn = len(all_epi_rewards)
                fromm = max(lenn - 20, 0)
                to = lenn
                min_5_ep_ret = min(all_epi_rewards[fromm:to])
                # pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f avg_5_epi_rew %.1f' %
                #                      (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss), sum(all_epi_rewards[fromm:to]) / (to - fromm)))
                print('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f \tmin_5_epi_rew %.1f' %
                      (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss), min_5_ep_ret))

                # early stop once the recent minimum return clears the threshold
                threshold = -500.0
                if (to - fromm) > 3 and min_5_ep_ret > threshold:
                    time_end = time.time()
                    print("SHI hyperParams have made algo converge (", threshold,
                          ") in ", (time_end - time_begin) / 1.0, " s")
                    stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                                   np.mean(Q_loss), np.mean(pi_loss))
                    stat.save_model(global_step)
                    sys.exit()

                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
def main(_):
    model_dir, data_dir = get_dirs(conf, ['env_name'])
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    # env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if type(env.action_space) is gym.spaces.Discrete:
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)

        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()

        pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions([state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward

            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                pbar.update(local_step)
                pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f' %
                                     (episode + 1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss)))
                print()

                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()

        pbar.close()
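# Both training loops above pass the raw policy output through
# `action_converter` before stepping the environment. Its body is defined
# elsewhere in the project; the version below is only a plausible sketch,
# assuming discrete actions are chosen by argmax and continuous actions are
# squashed with tanh and rescaled into the Box bounds.
import gym
import numpy as np

def action_converter(env, action):
    # Hypothetical helper: map an unbounded policy output into env.action_space.
    if isinstance(env.action_space, gym.spaces.Discrete):
        # Interpret the output vector as per-action scores and pick the best one.
        return int(np.argmax(action))
    low, high = env.action_space.low, env.action_space.high
    # Squash into [-1, 1], then rescale to [low, high].
    squashed = np.tanh(action)
    return low + (squashed + 1.0) * 0.5 * (high - low)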
def main(_):
    config.observation_dims = eval(config.observation_dims)

    # Scale some of the flags.
    for flag in [
            'memory_size', 't_target_q_update_freq', 't_ep_end', 't_train_max',
            'learning_rate_decay_step', 't_learn_start', 't_test', 't_save',
            'n_step', 'n_episode'
    ]:
        setattr(config, flag, getattr(config, flag) * config.scale)

    # Determine some more flags, clean up flags.
    if config.chtc:
        config.t_test /= 10
    config.max_to_keep = 0 if not config.chtc else 2
    config.run_dir = config.run_dir.replace('//', '/')
    config.save_dir = config.save_dir.replace('//', '/')

    # Print config.
    PrettyPrinter().pprint({
        key: config.__dict__['__wrapped'][key].value
        for key in config.__dict__['__wrapped'].__dir__()
    })

    with tf.Session() as sess:
        # Create environment, networks, statistics module, and agent.
        env = AtariEnvironment(
            env_name=config.env_name,
            n_action_repeat=config.n_action_repeat,
            max_random_start=config.max_random_start,
            observation_dims=config.observation_dims,
            display=config.display,
            use_cumulated_reward=config.use_cumulated_reward)
        pred_network = CNN(sess=sess,
                           history_length=config.history_length,
                           observation_dims=config.observation_dims,
                           output_size=env.env.action_space.n,
                           name='pred_network',
                           trainable=True)
        target_network = CNN(sess=sess,
                             history_length=config.history_length,
                             observation_dims=config.observation_dims,
                             output_size=env.env.action_space.n,
                             name='target_network',
                             trainable=False)
        stat = Statistic(sess=sess,
                         t_test=config.t_test,
                         t_save=config.t_save,
                         t_learn_start=config.t_learn_start,
                         run_dir=config.run_dir,
                         save_dir=config.save_dir,
                         variables=pred_network.var.values(),
                         load=config.load,
                         chtc=config.chtc,
                         window_length=config.window_length,
                         termination_p_hat=config.termination_p_hat,
                         max_to_keep=config.max_to_keep)
        agent = DeepQAgent(sess=sess,
                           pred_network=pred_network,
                           env=env,
                           stat=stat,
                           config=config,
                           target_network=target_network)

        # Begin training/playing.
        if config.is_train:
            agent.train(config.t_train_max)
        else:
            agent.play(test_ep=0., n_step=config.n_step, n_episode=config.n_episode)
        agent.stat.save_model(agent.t, stat.saver)
        print(" [*] Cleanly closing!")
def train():
    # create placeholders for the input image
    input_ph, ground_truths_ph, ground_truths, pre_processed_input = dh.get_place_holders()

    # Processing LabelId's
    one_hot_labels = utils.one_hot(ground_truths[0], is_color=False)  # TODO: add dictionary task-to-label-number

    # Getting model
    autoencoder = utils.get_autoencoder(user_config.autoencoder,
                                        config.working_dataset, config.strided)
    logits = autoencoder.inference(pre_processed_input)

    processed_ground_truths = [one_hot_labels, ground_truths[1], ground_truths[2]]
    loss_op, loss_list, multi_loss_class = lh.get_loss(logits, processed_ground_truths)
    optimizer = tf.train.AdamOptimizer(FLAGS.leaning_rate)
    train_step = optimizer.minimize(loss_op)

    saver = tf.train.Saver()
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=config.gpu_memory_fraction)
    session_config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)

    if FLAGS.use_summary:
        sh.define_summaries(logits, ground_truths, processed_ground_truths,
                            loss_op, loss_list, multi_loss_class)

    num_of_train_examples = FLAGS.num_of_train_imgs
    statistic = Statistic(logits, loss_op, loss_list, input_ph, ground_truths_ph,
                          multi_loss_class, processed_ground_truths)

    val_input_img, val_gt = dh.init_data(FLAGS.num_of_val_imgs)
    for ind in range(FLAGS.num_of_val_imgs):
        val_input_img[ind], val_gt[ind] = dh.get_data(ind, 'val')

    with tf.Session(config=session_config) as sess:
        global_step = start_training(sess, autoencoder, saver)
        summary = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(results_dir + '/logs', sess.graph)

        input_img, gt = dh.init_data(num_of_train_examples)
        input_batch = None

        # training starts here
        step = 0
        for epoch in range(FLAGS.num_of_epchs):
            print("\nEpoch: " + str(epoch))
            sub_batche = 0
            for ind in tqdm(np.random.permutation(num_of_train_examples)):
                if input_img[ind] is None:
                    input_img[ind], gt[ind] = dh.get_data(ind, 'train')

                # ----- make the random batch ----
                if sub_batche == 0:
                    input_batch = input_img[ind]
                    gt_batch = gt[ind]
                else:
                    input_batch, gt_batch = add_to_batch(input_batch, gt_batch,
                                                         input_img[ind], gt[ind])
                if sub_batche < FLAGS.batch - 1:
                    sub_batche += 1
                    continue
                sub_batche = 0
                # ---- batch is ready ----

                feed_dict = get_feed_dict(input_ph, ground_truths_ph,
                                          input_batch, gt_batch)
                sess.run(train_step, feed_dict=feed_dict)

                if FLAGS.use_summary and step % FLAGS.calc_summary == 0:
                    sh.handle_summarys(sess, logits, summary, summary_writer,
                                       step, feed_dict)
                step += 1

            statistic.handle_statistic(epoch, logits, sess, input_img, gt,
                                       val_input_img, val_gt)

            if epoch % FLAGS.epoch_model_ckpts == 0:
                ckpt_dir = os.path.join(results_dir, 'global_ckpt')
                if not os.path.exists(ckpt_dir):
                    os.mkdir(ckpt_dir)
                saver.save(sess, os.path.join(ckpt_dir, 'global_ckpt'),
                           global_step=global_step)

            if epoch % FLAGS.epoch_analysis_breakpoints == 0:
                analysis_ckpt_dir = os.path.join(results_dir, 'Analysis_ckpts')
                if not os.path.exists(analysis_ckpt_dir):
                    os.mkdir(analysis_ckpt_dir)
                saver.save(sess, os.path.join(analysis_ckpt_dir, 'epoch_' + str(epoch)),
                           global_step=global_step)