def __init__(self, *, alpha=0.003, gamma_kwargs=DEFAULT_KWARGS["gamma_kwargs"], **kwargs):
    super(Reinforce, self).__init__(**kwargs)
    self.alpha = alpha
    self.gamma_kwargs = gamma_kwargs
    self.gamma_scheduler = get_scheduler(gamma_kwargs)
    self.schedulers += (self.gamma_scheduler, )
    # Placeholders
    self.targets_ph = tf_v1.placeholder("float32", shape=(None, 1), name="target_ph")
    self.states_ph = tf_v1.placeholder("float32", shape=[None, *self.obs_shape], name="states_ph")
    if self.policy_type == "DiscretePolicy":
        self.actions_ph = tf_v1.placeholder("int32", shape=(None, ), name="actions_ph")
    else:
        self.actions_ph = tf_v1.placeholder("float32", shape=(None, *self.action_shape), name="actions_ph")
    self.field_names = ("state", "action", "reward")  # Fields used when sampling the trajectory
    self.scalar_summaries += ("gamma", )
def __init__(self, *, lr_kwargs=DEFAULT_KWARGS["lr_kwargs"], layers=None, preprocessors=None, **kwargs):
    super(DiscretePolicy, self).__init__(**kwargs)
    self.lr_kwargs = lr_kwargs
    self.lr_scheduler = get_scheduler(lr_kwargs)
    self.schedulers += (self.lr_scheduler, )
    # Placeholders
    self.lr_ph = tf_v1.placeholder("float32", shape=[], name=f"{self.scope}/lr_ph")
    if layers is None:
        layers = DEFAULT_LAYERS
    layers[-1]["units"] = self.action_size
    self.layers = layers
    self.preprocessors = preprocessors
    self.model = NeuralNetwork(self.scope, input_shapes=[self.obs_shape], layers=self.layers,
                               preprocessors=self.preprocessors)
    # Loss parameters
    self._loss = None
    self.train_op = None
    # Summary parameters
    self.scalar_summaries_tf += ("loss", )
    self.scalar_summaries += ("lr", )
def __init__(self, masker, task_lst, vocabs, optimizer, args):
    """
    :param model: model
    :param description: model description
    :param task_lst: task list
    :param optimizer: optimizer
    :param log_path: TensorboardX log directory
    :param save_path: model save path
    :param accumulation_steps: gradient accumulation steps
    :param print_every: evaluation interval
    """
    self.logger = fastNLP.logger
    self.masker = masker
    self.task_lst = task_lst
    self.save_path = args.save_path
    self.description = args.exp_name
    self.optim = optimizer
    self.vocabs = vocabs
    n_steps = int(len(task_lst) * len(task_lst[0].train_set) * 100 / args.batch_size) + 1
    args.n_steps = n_steps
    self.epoch_scheduler = get_scheduler(args, self.optim)
    self.scheduler = None
    self.logger.info('Using scheduler {}'.format(self.scheduler))
    self.accumulation_steps = args.accumulation_steps
    self.print_every = args.print_every
    self.batch_size = args.batch_size
    self.save_ep = args.save_ep
    include_tasks = args.tasks
    if include_tasks is None:
        self.empty_tasks = set()
    else:
        self.empty_tasks = set(range(len(self.task_lst))) - set(include_tasks)
    self.steps = 0
    self.best_acc = 0
    self.best_epoch = 0
    self.metrics = []
    for t in task_lst:
        if has_acc(t.task_name):
            self.metrics.append(AccuracyMetric())
        else:
            self.metrics.append(
                SpanFPreRecMetric(
                    self.vocabs[t.task_name],
                    encoding_type="bioes" if t.task_name == "ner" else "bio",
                ))
    # self.logger.info(self.metrics)
    tb_path = "eval" if args.evaluate else "train"
    self.summary_writer = SummaryWriter(os.path.join(args.tb_path, tb_path))
def __init__(self, lr_kwargs=DEFAULT_KWARGS["lr_kwargs"], gamma_kwargs=DEFAULT_KWARGS["gamma_kwargs"],
             reward_scale=1.0, **kwargs):
    super(Sarsa, self).__init__(**kwargs)
    self.lr_kwargs = lr_kwargs
    self.gamma_kwargs = gamma_kwargs
    self.lr_scheduler = get_scheduler(lr_kwargs)
    self.gamma_scheduler = get_scheduler(gamma_kwargs)
    self.schedulers += (self.lr_scheduler, self.gamma_scheduler)
    self.reward_scale = reward_scale
    self.field_names = ("state", "action", "reward", "next_state", "done")
    self.q_net = QNetwork(input_shapes=[self.obs_shape], output_size=self.action_size, layers=self._layers,
                          preprocessors=self.preprocessors, scope="q_network")
    self.target_q = self.q_net
    # Placeholders
    self.lr_ph = tf_v1.placeholder("float32", shape=[], name="lr_ph")
    self.gamma_ph = tf_v1.placeholder("float32", shape=[], name="gamma_ph")
    self.states_ph = tf_v1.placeholder("float32", shape=[None, *self.obs_shape], name="states_ph")
    self.actions_ph = tf_v1.placeholder("int32", shape=[None], name="actions_ph")
    self.rewards_ph = tf_v1.placeholder("float32", shape=[None], name="rewards_ph")
    self.next_states_ph = tf_v1.placeholder("float32", shape=[None, *self.obs_shape], name="next_states_ph")
    self.dones_ph = tf_v1.placeholder("float32", shape=[None], name="dones_ph")
    self.next_actions_ph = tf_v1.placeholder("int32", shape=[None], name="next_actions_ph")
    # Summary ops
    self.summary_init_objects += (self.q_net, )
    self.scalar_summaries += ("gamma", "lr")
def __init__(self, *, reward_scale=1.0, gamma_kwargs=DEFAULT_KWARGS["gamma_kwargs"],
             alpha_lr_kwargs=DEFAULT_KWARGS["alpha_lr_kwargs"], q_lr_kwargs=DEFAULT_KWARGS["q_lr_kwargs"],
             tau=5e-3, update_interval=1, num_q_nets=2, auto_ent=True, target_entropy="auto",
             init_log_alpha=0.0, **kwargs):
    super(SAC, self).__init__(**kwargs)
    assert num_q_nets > 1, f"Minimum number of Q networks is 2 but got '{num_q_nets}'"
    self.reward_scale = reward_scale
    self.q_lr_kwargs = q_lr_kwargs
    self.gamma_kwargs = gamma_kwargs
    self.alpha_lr_kwargs = alpha_lr_kwargs
    self.q_lr_scheduler = get_scheduler(q_lr_kwargs)
    self.gamma_scheduler = get_scheduler(gamma_kwargs)
    self.alpha_lr_scheduler = get_scheduler(alpha_lr_kwargs)
    self.schedulers += (self.q_lr_scheduler, self.gamma_scheduler, self.alpha_lr_scheduler)
    self.tau = tau
    self.update_interval = update_interval
    self.num_q_nets = num_q_nets
    self.auto_ent = auto_ent
    self.init_log_alpha = init_log_alpha
    if self.auto_ent:
        assert target_entropy == "auto" or isinstance(target_entropy, (int, float))
        self.target_entropy = -float(self.action_size) if target_entropy == "auto" else target_entropy
        self.log_alpha_tf = tf_v1.get_variable('log_alpha', dtype="float32", initializer=float(init_log_alpha))
    else:
        self.target_entropy = None
        self.log_alpha_tf = tf_v1.constant(init_log_alpha, dtype="float32")
    self.alpha_tf = tf_v1.exp(self.log_alpha_tf)
    self.log_alpha = 0.0
    self.alpha_loss_tf = None
    self.alpha_train_op = None
    self.critics = []
    self.targets = []
    # Placeholders
    self.q_lr_ph = tf_v1.placeholder("float32", shape=[], name="q_lr_ph")
    self.alpha_lr_ph = tf_v1.placeholder("float32", shape=[], name="alpha_lr_ph")
    self.gamma_ph = tf_v1.placeholder("float32", shape=[], name="gamma_ph")
    self.states_ph = tf_v1.placeholder("float32", shape=[None, *self.obs_shape], name="states_ph")
    self.actions_ph = tf_v1.placeholder("float32", shape=[None, self.action_size], name="actions_ph")
    self.rewards_ph = tf_v1.placeholder("float32", shape=[None], name="rewards_ph")
    self.next_states_ph = tf_v1.placeholder("float32", shape=[None, *self.obs_shape], name="next_states_ph")
    self.dones_ph = tf_v1.placeholder("float32", shape=[None], name="dones_ph")
    # Summary parameters
    self.alpha_loss = None
    self.scalar_summaries += ("alpha_loss", "log_alpha", "alpha", "q_lr", "alpha_lr")
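# For reference only: a minimal sketch of how the entropy-coefficient (alpha) loss is typically built from the
# pieces above in the standard SAC formulation. This class leaves alpha_loss_tf as None here and builds the real
# op elsewhere, so its exact graph may differ. `log_pis` (log-probabilities of the sampled actions under the
# current policy) is an assumed input, and `tf_v1` is assumed to be the TF1 compat alias used throughout these
# snippets (e.g. `import tensorflow.compat.v1 as tf_v1`).
def sac_alpha_loss(log_alpha_tf, log_pis, target_entropy):
    # Gradient flows only through log_alpha: alpha is pushed up when the policy entropy
    # (-mean(log_pis)) falls below target_entropy and pushed down otherwise.
    return -tf_v1.reduce_mean(log_alpha_tf * tf_v1.stop_gradient(log_pis + target_entropy))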
def __init__(self, *, tau=0.003, update_interval=10, lr_kwargs=DEFAULT_KWARGS["lr_kwargs"],
             gamma_kwargs=DEFAULT_KWARGS["gamma_kwargs"], sigma_kwargs=DEFAULT_KWARGS["sigma_kwargs"],
             reward_scale=1.0, **kwargs):
    super(DDPG, self).__init__(**kwargs)
    self.tau = tau
    self.reward_scale = reward_scale
    self.update_interval = update_interval
    self.lr_kwargs = lr_kwargs
    self.gamma_kwargs = gamma_kwargs
    self.sigma_kwargs = sigma_kwargs
    self.lr_scheduler = get_scheduler(lr_kwargs)
    self.gamma_scheduler = get_scheduler(gamma_kwargs)
    self.sigma_scheduler = get_scheduler(sigma_kwargs)
    self.schedulers += (self.lr_scheduler, self.gamma_scheduler, self.sigma_scheduler)
    self.critic = None
    self.target = None
    # Placeholders
    self.lr_ph = tf_v1.placeholder("float32", shape=[], name="lr_ph")
    self.gamma_ph = tf_v1.placeholder("float32", shape=[], name="gamma_ph")
    self.states_ph = tf_v1.placeholder("float32", shape=[None, *self.obs_shape], name="states_ph")
    self.actions_ph = tf_v1.placeholder("float32", shape=[None, self.action_size], name="actions_ph")
    self.rewards_ph = tf_v1.placeholder("float32", shape=[None], name="rewards_ph")
    self.next_states_ph = tf_v1.placeholder("float32", shape=[None, *self.obs_shape], name="next_states_ph")
    self.dones_ph = tf_v1.placeholder("float32", shape=[None], name="dones_ph")
    # Summary parameters
    self.scalar_summaries += ("gamma", "sigma", "lr")
def __init__(self, *, lr_kwargs=DEFAULT_KWARGS["lr_kwargs"], **kwargs):
    super(A2C, self).__init__(**kwargs)
    self.critic = QNetwork(input_shapes=[self.obs_shape], output_size=1, layers=self._layers,
                           preprocessors=self.preprocessors, scope="critic")
    self.lr_kwargs = lr_kwargs
    self.lr_scheduler = get_scheduler(lr_kwargs)
    self.schedulers += (self.lr_scheduler, )
    # Placeholders
    self.lr_ph = tf_v1.placeholder("float32", shape=(), name="lr_ph")
    self.summary_init_objects += (self.critic, )
    self.scalar_summaries += ("lr", )
def __init__(self, *, eps_kwargs=DEFAULT_KWARGS["eps_kwargs"], explore_ratio=0.60,
             explore_exploit_interval=20, **kwargs):
    super(GreedyEpsilonPolicy, self).__init__(**kwargs)
    self.eps_kwargs = eps_kwargs
    self.eps_scheduler = get_scheduler(eps_kwargs)
    self.schedulers += (self.eps_scheduler, )
    self.scalar_summaries += ("eps", )
    ######################### Experimental feature ##########################
    """
    Idea: Instead of always mixing greedy actions with epsilon-random actions, periodically insert
    epoch intervals in which the policy acts deterministically by temporarily setting epsilon to 0
    (a standalone sketch of this masking follows this method).
    """
    # TODO: Implement it as a scheduler in all policies
    self.explore_ratio = explore_ratio
    self.explore_exploit_interval = explore_exploit_interval
    self.explore_interval = self.explore_ratio * self.explore_exploit_interval
    self.eps_mask = 1
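# A standalone sketch (illustrative only, not part of the class above) of how the explore/exploit masking could
# be applied: within every explore_exploit_interval epochs, epsilon is used as-is for the first explore_interval
# epochs and masked to 0 (fully greedy) for the remainder. The function name and the `epoch` argument are
# assumptions; the class itself tracks the mask via self.eps_mask instead.
def epsilon_for_epoch(epoch, base_eps, explore_interval, explore_exploit_interval):
    phase = epoch % explore_exploit_interval          # position within the repeating explore/exploit cycle
    eps_mask = 1 if phase < explore_interval else 0   # 1 = explore with epsilon, 0 = act deterministically
    return base_eps * eps_mask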
class OffPolicyAlgorithm(BaseAlgorithm):
    PARAMETERS = BaseAlgorithm.PARAMETERS.union({
        "batch_size_kwargs", "num_init_exp_samples", "max_init_exp_timestep", "buffer_size"
    })

    def __init__(self, *, batch_size_kwargs=DEFAULT_KWARGS["batch_size_kwargs"], num_init_exp_samples=10000,
                 max_init_exp_timestep="auto", buffer_size=1_000_000, **kwargs):
        super(OffPolicyAlgorithm, self).__init__(**kwargs)
        self.buffer_size = buffer_size
        self.replay_buffer = ReplayBuffer(size=buffer_size)
        self.batch_size_kwargs = batch_size_kwargs
        self.batch_size_scheduler = get_scheduler(batch_size_kwargs)
        self.schedulers += (self.batch_size_scheduler, )
        self.num_init_exp_samples = None if num_init_exp_samples is None else int(num_init_exp_samples)
        self.max_init_exp_timestep = self.max_episode_steps if max_init_exp_timestep == "auto" else max_init_exp_timestep
        self.scalar_summaries += ("replay_buffer_size", "batch_size")
def pretrain(cfg):
    print(cfg.pretty())
    pretrain_config_validator(cfg)
    fix_seed(cfg.seed)
    controller = load_pretrained_weights(NAO(**cfg.controller).to(0), cfg.pretrained_model_path)
    models = {'trunk': controller}
    dataset = get_dataset(seed=cfg.seed, **cfg.dataset)
    optimizers = {
        'trunk_optimizer': get_optimizer(parameters=models['trunk'].parameters(), **cfg.optimizer)
    }
    lr_schedulers = {
        'trunk_scheduler_by_iteration': get_scheduler(optimizer=optimizers['trunk_optimizer'], **cfg.scheduler)
    }
    loss_funcs = {
        'reconstruction_loss': torch.nn.NLLLoss(),
        'metric_loss': get_loss(**cfg.loss)
    }
    mining_funcs = {"tuple_miner": get_miner(**cfg.miner)}
    visualizers = [umap.UMAP(**params) for params in cfg.visualizers]
    end_of_iteration_hook = TensorboardHook(visualizers).end_of_iteration_hook
    end_of_epoch_hook = ModelSaverHook().end_of_epoch_hook
    get_trainer(
        models=models,
        optimizers=optimizers,
        lr_schedulers=lr_schedulers,
        loss_funcs=loss_funcs,
        mining_funcs=mining_funcs,
        dataset=dataset,
        end_of_iteration_hook=end_of_iteration_hook,
        end_of_epoch_hook=end_of_epoch_hook,
        **cfg.trainer,
    ).train()
def train(cfg):
    print(cfg.pretty())
    train_config_validator(cfg)
    fix_seed(cfg.seed)
    writer = SummaryWriter(log_dir='logs')
    controller = load_pretrained_weights(NAO(**cfg.controller).to(0), cfg.pretrained_model_path)
    dataset = get_dataset(writer=writer, seed=cfg.seed, **cfg.dataset)
    optimizer = get_optimizer(parameters=_get_target_parameters(controller, cfg.freeze_encoder_decoder),
                              **cfg.optimizer)
    lr_scheduler = get_scheduler(optimizer=optimizer, **cfg.scheduler)
    end_of_epoch_hook = ModelSaverHook().end_of_epoch_hook
    get_trainer(
        controller=controller,
        dataset=dataset,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        writer=writer,
        end_of_epoch_hook=end_of_epoch_hook,
        **cfg.trainer,
    ).train()
def __init__(self, *, lr_kwargs, layers=None, preprocessors=None, learn_std=True, std_value=0.1,
             mu_range=None, log_std_range=None, **kwargs):
    super(GaussianPolicy, self).__init__(**kwargs)
    self.std_value = std_value
    self.lr_kwargs = lr_kwargs
    self.lr_scheduler = get_scheduler(lr_kwargs)
    self.schedulers += (self.lr_scheduler, )
    self.learn_std = learn_std
    self.mu_range = (-2.0, 2.0) if mu_range is None else mu_range
    self.log_std_range = (-10, 0.3) if log_std_range is None else log_std_range
    assert not self.discrete_action_space, "Action space for the Gaussian Policy must be continuous!"
    # Placeholders
    self.lr_ph = tf_v1.placeholder("float32", shape=(), name="lr_ph")
    # Create model
    if layers is None:
        layers = DEFAULT_LAYERS
    self.layers = layers
    self.preprocessors = preprocessors
    self.base_model = NeuralNetwork(self.scope, input_shapes=[self.obs_shape], layers=self.layers,
                                    preprocessors=self.preprocessors)
    self.mu = tf_v1.keras.layers.Dense(self.action_size, activation=None)(self.base_model.output)
    self.mu = tf_v1.clip_by_value(self.mu, *self.mu_range)
    if self.learn_std:
        self.log_std = tf_v1.keras.layers.Dense(self.action_size)(self.base_model.output)
        self.log_std = tf_v1.clip_by_value(self.log_std, *self.log_std_range)
        self.std = tf_v1.exp(self.log_std)
        self.raw_action_model = tf_v1.keras.Model(inputs=[self.base_model.input], outputs=[self.mu, self.std])
    else:
        self.std = tf_v1.constant([std_value] * self.action_size, dtype="float32")
        self.raw_action_model = tf_v1.keras.Model(inputs=[self.base_model.input], outputs=[self.mu])
    batch_size = tf_v1.shape(self.mu)[0]
    norm_dist = tfd.Normal(loc=tf_v1.zeros(self.action_size), scale=tf_v1.ones(self.action_size))
    z = norm_dist.sample(batch_size)
    raw_actions = self.mu + z * self.std  # Reparameterization trick (see the standalone sketch after this method)
    self.actions = tf_v1.tanh(raw_actions)
    self.deterministic_actions = tf_v1.tanh(self.mu)
    self.model = tf_v1.keras.Model(inputs=[self.base_model.input], outputs=[self.actions])
    # Loss parameters
    self._loss = None
    self.train_op = None
    # Summary parameters
    self.scalar_summaries += ("lr", )
    self.scalar_summaries_tf += ("loss", "mean_log_actions", "min_mu", "mean_mu", "max_mu",
                                 "min_std", "mean_std", "max_std")
    self.histogram_summaries_tf += ("actions", "mu", "std", "log_actions")
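# A minimal standalone NumPy sketch (illustrative only) of the reparameterized, tanh-squashed sampling built
# above: actions = tanh(mu + z * std) with z ~ N(0, I), so each sampled action is a deterministic transform of
# parameter-free noise and gradients can flow back into mu and std.
import numpy as np

def sample_squashed_action(mu, std, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    z = rng.standard_normal(size=np.shape(mu))  # noise drawn independently of the policy parameters
    raw_action = mu + z * std                   # reparameterization trick
    return np.tanh(raw_action)                  # squash into (-1, 1), matching self.actions above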
def configure_optimizers(self):
    self.optimizer = get_optimizer(self.hparams, self.models)
    scheduler = get_scheduler(self.hparams, self.optimizer)
    return [self.optimizer], [scheduler]