def test_func(args, shared_model, env_conf, datasets=None, tests=None, shared_dict=None):
    # Validation / inference worker: periodically copies the latest weights from shared_model,
    # runs evaluation episodes, logs metrics and saves checkpoints.
    ptitle('Valid agent')

    if args.valid_gpu < 0:
        gpu_id = args.gpu_ids[-1]
    else:
        gpu_id = args.valid_gpu
    env_conf["env_gpu"] = gpu_id

    if not args.deploy:
        # Set up a dedicated validation logger and snapshot the current sources into the log directory.
        log = {}
        logger = Logger(args.log_dir)

        create_dir(args.log_dir + "models/")
        create_dir(args.log_dir + "tifs/")
        create_dir(args.log_dir + "tifs_test/")

        os.system("cp *.py " + args.log_dir)
        os.system("cp *.sh " + args.log_dir)
        os.system("cp models/*.py " + args.log_dir + "models/")

        setup_logger('{}_log'.format(args.env),
                     r'{0}{1}_log'.format(args.log_dir, args.env))
        log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
        d_args = vars(args)
        env_conf_log = env_conf

    if tests is not None:
        if args.testlbl:
            test_env = EM_env(tests[0], env_conf, type="test", gt_lbl_list=tests[1])
        else:
            test_env = EM_env(tests[0], env_conf, type="test")

    if not args.deploy:
        for k in d_args.keys():
            log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
        for k in env_conf_log.keys():
            log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, env_conf_log[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    raw_list, gt_lbl_list = datasets
    env = EM_env(raw_list, env_conf, type="train", gt_lbl_list=gt_lbl_list)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = get_model(args, args.model, env_conf["observation_shape"], args.features,
                             atrous_rates=args.atr_rate, num_actions=2, split=args.data_channel,
                             gpu_id=gpu_id, multi=args.multi)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()

    flag = True
    if not args.deploy:
        create_dir(args.save_model_dir)

    # Running means over the last 100 validation episodes / metric evaluations.
    recent_episode_scores = ScalaTracker(100)
    recent_FgBgDice = ScalaTracker(100)
    recent_bestDice = ScalaTracker(100)
    recent_diffFG = ScalaTracker(100)
    recent_MUCov = ScalaTracker(100)
    recent_MWCov = ScalaTracker(100)
    recent_AvgFP = ScalaTracker(100)
    recent_AvgFN = ScalaTracker(100)
    recent_rand_i = ScalaTracker(100)

    renderlist = []
    renderlist.append(player.env.render())
    max_score = 0

    # ----------------------------------------- Deploy / Inference -----------------------------------------
    if args.deploy:
        with torch.cuda.device(gpu_id):
            player.model.load_state_dict(shared_model.state_dict())

        # inference (args, None, player.model, tests [0], test_env, gpu_id, player.env.rng, len (tests [0]))
        if len(tests) == 4:
            inference(args, None, player.model, tests[0], test_env, gpu_id,
                      player.env.rng, len(tests[0]), tests[3])
        else:
            inference(args, None, player.model, tests[0], test_env, gpu_id,
                      player.env.rng, len(tests[0]))

        return
    # ----------------------------------------- End Deploy / Inference -----------------------------------------

    merge_ratios = []
    split_ratios = []

    if args.wctrl == "s2m":
        # Split/merge reward-weight schedulers for the "s2m" weight-control mode.
        schedule = args.wctrl_schedule

        delta = (shared_dict['spl_w'] - shared_dict['mer_w']) / (2 * len(schedule))

        mer_w_delta = delta
        mer_w_var = shared_dict['mer_w']
        mer_w_scheduler = Scheduler(mer_w_var, schedule, mer_w_delta)

        split_delta = -delta / len(args.out_radius)
        split_var = shared_dict['spl_w'] / len(args.out_radius)
        spl_w_scheduler = Scheduler(split_var, schedule, split_delta)

    while True:
        if flag:
            # Reload the latest shared weights at the start of every validation episode.
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward.mean()
        renderlist.append(player.env.render())

        if player.done:
            # Episode finished: log rewards, checkpoint the model, evaluate metrics and reset.
            flag = True
            num_tests += 1

            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests

            log['{}_log'.format(args.env)].info(
                "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}".format(
                    time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))

            recent_episode_scores.push(reward_sum)

            if args.save_max and recent_episode_scores.mean() >= max_score:
                max_score = recent_episode_scores.mean()
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir, 'best_model_' + args.env))

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir, str(num_tests)))

            if num_tests % args.log_period == 0:
                if tests is not None and not args.DEBUG:
                    inference(args, logger, player.model, tests[0], test_env, gpu_id,
                              player.env.rng, num_tests)

                if np.max(env.lbl) != 0 and np.max(env.gt_lbl) != 0:
                    bestDice, FgBgDice, diffFG, MWCov, MUCov, AvgFP, AvgFN, rand_i = evaluate(args, player.env)
                    recent_FgBgDice.push(FgBgDice)
                    recent_diffFG.push(abs(diffFG))
                    recent_bestDice.push(bestDice)

                    recent_MWCov.push(MWCov)
                    recent_MUCov.push(MUCov)
                    recent_AvgFP.push(AvgFP)
                    recent_AvgFN.push(AvgFN)
                    recent_rand_i.push(rand_i)

                    log_info = {
                        "bestDice": recent_bestDice.mean(),
                        "FgBgDice": recent_FgBgDice.mean(),
                        "diffFG": recent_diffFG.mean(),
                        "MWCov": recent_MWCov.mean(),
                        "MUCov": recent_MUCov.mean(),
                        "AvgFP": recent_AvgFP.mean(),
                        "AvgFN": recent_AvgFN.mean(),
                        "rand_i": recent_rand_i.mean(),
                    }

                    for tag, value in log_info.items():
                        logger.scalar_summary(tag, value, num_tests)
                else:
                    bestDice, FgBgDice, diffFG = 0, 0, 0
                    MWCov, MUCov, AvgFP, AvgFN = 0, 0, 0, 0
                    rand_i = 0

                print("----------------------VALID SET--------------------------")
                print(args.env)
                print("bestDice:", bestDice, "FgBgDice:", FgBgDice, "diffFG:", diffFG,
                      "MWCov:", MWCov, "MUCov:", MUCov, "AvgFP:", AvgFP, "AvgFN:", AvgFN,
                      "rand_i:", rand_i)
                # print ("mean bestDice")
                print("Log test #:", num_tests)
                print("rewards: ", player.reward.mean())
                print("sum rewards: ", reward_sum)
                print("#gt_values:", len(np.unique(player.env.gt_lbl)))
                print("values:")
                values = player.env.unique()
                print(np.concatenate([values[0][None], values[1][None]], 0))
                print("------------------------------------------------")

                log_img = np.concatenate(renderlist[::-1], 0)

                if "3D" not in args.data:
                    for i in range(3):
                        player.probs.insert(0, np.zeros_like(player.probs[0]))
                    while len(player.probs) - 3 < args.max_episode_length:
                        player.probs.append(np.zeros_like(player.probs[0]))
                    probslist = [np.repeat(np.expand_dims(prob, -1), 3, -1) for prob in player.probs]
                    probslist = np.concatenate(probslist, 1)
                    probslist = (probslist * 256).astype(np.uint8, copy=False)
                    # log_img = renderlist [-1]
                    print(probslist.shape, log_img.shape)
                    log_img = np.concatenate([probslist, log_img], 0)

                log_info = {"valid_sample": log_img}

                print(log_img.shape)

                io.imsave(args.log_dir + "tifs/" + str(num_tests) + "_sample.tif",
                          log_img.astype(np.uint8))
                io.imsave(args.log_dir + "tifs/" + str(num_tests) + "_pred.tif",
                          player.env.lbl.astype(np.uint8))
                io.imsave(args.log_dir + "tifs/" + str(num_tests) + "_gt.tif",
                          player.env.gt_lbl.astype(np.int32))

                if args.seg_scale:
                    log_info["scaler"] = player.env.scaler

                for tag, img in log_info.items():
                    img = img[None]
                    logger.image_summary(tag, img, num_tests)

                if not args.deploy:
                    log_info = {
                        'mean_valid_reward': reward_mean,
                        '100_mean_reward': recent_episode_scores.mean(),
                        'split_ratio': player.env.split_ratio_sum.sum() / np.count_nonzero(player.env.gt_lbl),
                        'merge_ratio': player.env.merge_ratio_sum.sum() / np.count_nonzero(player.env.gt_lbl),
                    }

                    if args.wctrl == 's2m':
                        log_info.update({
                            'mer_w': mer_w_scheduler.value(),
                            'spl_w': spl_w_scheduler.value() * len(args.out_radius),
                        })

                    merge_ratios.append(player.env.merge_ratio_sum.sum() / np.count_nonzero(player.env.gt_lbl))
                    split_ratios.append(player.env.split_ratio_sum.sum() / np.count_nonzero(player.env.gt_lbl))

                    print("split ratio: ", np.max(player.env.split_ratio_sum), np.min(player.env.split_ratio_sum))
                    print("merge ratio: ", np.max(player.env.merge_ratio_sum), np.min(player.env.merge_ratio_sum))

                    print("merge ratio: ", merge_ratios)
                    print("split ratio: ", split_ratios)

                    for tag, value in log_info.items():
                        logger.scalar_summary(tag, value, num_tests)

            renderlist = []
            reward_sum = 0
            player.eps_len = 0

            if args.wctrl == "s2m":
                # Advance the split/merge weight schedule and publish the new weights to all workers.
                shared_dict["spl_w"] = spl_w_scheduler.next()
                shared_dict["mer_w"] = mer_w_scheduler.next()
                player.env.config["spl_w"] = shared_dict["spl_w"]
                player.env.config["mer_w"] = shared_dict["mer_w"]

            player.clear_actions()
            state = player.env.reset(player.model, gpu_id)
            renderlist.append(player.env.render())

            time.sleep(15)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
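# ---------------------------------------------------------------------------------------------------------
# Illustrative sketch (assumption, not part of the original project): the validation loop above relies on
# ScalaTracker(100) objects that expose push() and mean(). A minimal stand-in with that interface, keeping a
# sliding window of the most recent values, could look like the class below; the name SlidingMeanTracker is
# hypothetical and only meant to show the assumed behaviour.
from collections import deque


class SlidingMeanTracker:
    """Keeps the last `capacity` scalar values and reports their running mean."""

    def __init__(self, capacity):
        self.values = deque(maxlen=capacity)

    def push(self, value):
        # The oldest value is dropped automatically once capacity is exceeded.
        self.values.append(float(value))

    def mean(self):
        # Returns 0.0 before anything has been pushed, mirroring an "empty" tracker.
        return sum(self.values) / len(self.values) if self.values else 0.0
# ---------------------------------------------------------------------------------------------------------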
class MetaA2CModel:
    """
    The Meta A2C (Advantage Actor Critic) model class

    :param gamma: (float) Discount factor
    :param vf_coef: (float) Value function coefficient for the loss calculation
    :param ent_coef: (float) Entropy coefficient for the loss calculation
    :param max_grad_norm: (float) The maximum value for the gradient clipping
    :param learning_rate: (float) The learning rate
    :param alpha: (float) RMSProp decay parameter (default: 0.99)
    :param epsilon: (float) RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update)
        (default: 1e-5)
    :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
        'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
        (used only for loading)

    WARNING: this logging can take a lot of space quickly
    """

    def __init__(self, input_length: int, output_length: int, n_steps: int, window_size: int, seed=3,
                 gamma=0.8, vf_coef=0.25, ent_coef=0, max_grad_norm=0.5, learning_rate=1e-3, alpha=0.99,
                 epsilon=1e-5, lr_schedule='linear', verbose=0, _init_setup_model=True):

        self.policy = MetaLstmActorCriticPolicy
        self.verbose = verbose
        self.input_length = input_length
        self.output_length = output_length
        self.num_train_steps = 0
        self.n_steps = n_steps
        self.window_size = window_size
        self.total_timesteps = config.max_timesteps / 10_000
        self.gamma = gamma
        self.vf_coef = vf_coef
        self.ent_coef = ent_coef
        self.max_grad_norm = max_grad_norm
        self.alpha = alpha
        self.epsilon = epsilon
        self.lr_schedule = lr_schedule
        self.learning_rate = learning_rate

        self.graph = None
        self.sess = None
        self.learning_rate_ph = None
        self.actions_ph = None
        self.advs_ph = None
        self.rewards_ph = None
        self.pg_loss = None
        self.vf_loss = None
        self.entropy = None
        self.apply_backprop = None
        self.policy_model = None
        self.step = None
        self.value = None
        self.learning_rate_schedule = None
        self.trainable_variables = None
        self.layers = config.meta_layers
        self.lstm_units = config.meta_lstm_units

        if seed is not None:
            set_global_seeds(seed)

        self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate,
                                                n_values=self.total_timesteps,
                                                schedule=self.lr_schedule,
                                                init_step=self.num_train_steps)

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        """
        Create all the functions and tensorflow graphs necessary to train the model
        """
        assert issubclass(self.policy, MetaLstmActorCriticPolicy), \
            "Error: the input policy for the A2C model must be an instance of MetaLstmActorCriticPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_utils.make_session(graph=self.graph)

            # No separate step model is needed: stepping uses the same batch size (n_batch), so it would be redundant.
            policy_model = self.policy(sess=self.sess, input_length=self.input_length,
                                       output_length=self.output_length, n_steps=self.n_steps,
                                       window_size=self.window_size, layers=self.layers,
                                       lstm_units=self.lstm_units)

            with tf.variable_scope("loss", reuse=False):
                self.actions_ph = policy_model.pdtype.sample_placeholder([self.n_steps], name="action_ph")
                self.advs_ph = tf.placeholder(tf.float32, [self.n_steps], name="advs_ph")
                self.rewards_ph = tf.placeholder(tf.float32, [self.n_steps], name="rewards_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                neglogpac = policy_model.proba_distribution.neglogp(self.actions_ph)
                self.entropy = tf.reduce_mean(policy_model.proba_distribution.entropy())
                self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                self.vf_loss = mse(tf.squeeze(policy_model.value_fn), self.rewards_ph)
                # Standard A2C objective: policy-gradient loss minus entropy bonus plus weighted value loss.
                loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                self.trainable_variables = tf_utils.find_trainable_variables("model")
                grads = tf.gradients(loss, self.trainable_variables)
                if self.max_grad_norm is not None:
                    grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.trainable_variables))

            trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                epsilon=self.epsilon)
            self.apply_backprop = trainer.apply_gradients(grads)

            self.step = policy_model.step
            self.policy_model = policy_model
            self.value = self.policy_model.value
            tf.global_variables_initializer().run(session=self.sess)

    def train_step(self, inputs: np.ndarray, discounted_rewards, actions, values):
        """
        Applies a training step to the model
        """
        advs = discounted_rewards - values

        cur_lr = None
        for _ in range(self.n_steps):
            cur_lr = self.learning_rate_schedule.value()

        td_map = {self.policy_model.input_ph: inputs, self.actions_ph: actions, self.advs_ph: advs,
                  self.rewards_ph: discounted_rewards, self.learning_rate_ph: cur_lr}

        policy_loss, value_loss, policy_entropy, _ = self.sess.run(
            [self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], td_map)

        return policy_loss, value_loss, policy_entropy

    def save(self, save_path: str, id: str):
        """
        Save the current parameters to file

        :param save_path: (str or file-like object) the save location
        :param id: (str) identifier used when writing the model files
        """
        params = {
            "gamma": self.gamma,
            "vf_coef": self.vf_coef,
            "ent_coef": self.ent_coef,
            "max_grad_norm": self.max_grad_norm,
            "learning_rate": self.learning_rate,
            "alpha": self.alpha,
            "epsilon": self.epsilon,
            "lr_schedule": self.lr_schedule,
            "verbose": self.verbose,
            "policy": self.policy,
            "num_train_steps": self.num_train_steps,
            "input_length": self.input_length,
            "output_length": self.output_length,
            "n_steps": self.n_steps,
            "window_size": self.window_size,
            "total_timesteps": self.total_timesteps,
            "layers": self.layers,
            "lstm_units": self.lstm_units,
        }

        json_params = {
            "input_length": self.input_length,
            "output_length": self.output_length,
            "n_steps": self.n_steps,
            "window_size": self.window_size,
            "total_timesteps": self.total_timesteps,
            "gamma": self.gamma,
            "vf_coef": self.vf_coef,
            "ent_coef": self.ent_coef,
            "max_grad_norm": self.max_grad_norm,
            "learning_rate": self.learning_rate,
            "alpha": self.alpha,
            "epsilon": self.epsilon,
            "lr_schedule": self.lr_schedule,
            "layers": self.layers,
            "lstm_units": self.lstm_units,
        }

        weights = self.sess.run(self.trainable_variables)

        utils._save_model_to_file(save_path, id, 'meta', json_params=json_params, weights=weights, params=params)

    @classmethod
    def load(cls, model_id: str, input_len: int, output_len: int):
        """
        Load the model from file
        """
        load_path = os.path.join(config.model_path, model_id)
        weights, params = utils._load_model_from_file(load_path, 'meta')

        if params['input_length'] != input_len or params['output_length'] != output_len:
            raise ValueError("The input and output lengths must match those of the model being loaded.")

        model = cls(input_length=params["input_length"], output_length=params["output_length"],
                    n_steps=params["n_steps"], window_size=params["window_size"], _init_setup_model=False)
        model.__dict__.update(params)
        model.setup_model()

        restores = []
        for param, loaded_weight in zip(model.trainable_variables, weights):
            restores.append(param.assign(loaded_weight))
        model.sess.run(restores)

        return model
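# ---------------------------------------------------------------------------------------------------------
# Illustrative sketch (assumption, not part of the original project): train_step() above takes the rollout
# inputs, discounted rewards, actions and critic values, and forms advantages as
# `discounted_rewards - values`. A standard n-step discounted return, as commonly paired with A2C, can be
# computed as below. The helper name `compute_discounted_returns` and the dummy numbers in the usage note
# are hypothetical; `last_value` is the critic's bootstrap estimate for the state after the rollout.
import numpy as np


def compute_discounted_returns(rewards, dones, last_value, gamma):
    """Backward n-step return: R_t = r_t + gamma * R_{t+1}, cut off where an episode ended (done == 1)."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns


# Example with dummy numbers (gamma matches the class default of 0.8):
#   rewards = np.array([1.0, 0.0, 0.5]); dones = np.array([0.0, 0.0, 1.0])
#   discounted = compute_discounted_returns(rewards, dones, last_value=0.2, gamma=0.8)
#   advantages = discounted - values_predicted_by_the_critic
# ---------------------------------------------------------------------------------------------------------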