import time
from collections import deque

import numpy as np

# NOTE: repo-local helpers (get_traj, get_env_from_name, get_policy, CAC, LAC, Pool,
# logger, evaluate_training_rollouts, training_evaluation) are assumed to be importable
# from the surrounding package; their module paths are not shown in this file.


def train(variant):
    Min_cost = 1000000

    traj = get_traj()  # get data
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']
    max_episodes = env_params['max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params['max_ep_steps']  # maximum number of steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    s_dim = env.observation_space.shape[0]
    print("s_dim is ", s_dim)
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = CAC(a_dim, s_dim, policy_params)
    # policy.restore("log/CMAPSS/CAC-new-reward-0.01/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon'],
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):
        current_path = {
            'rewards': [],
            'distance': [],
            'kl_divergence': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point
        start_point = np.random.randint(0, 500000)

        # current state, theta, next w, desired state (16, 1, 4, 16); used for decision making
        s = traj[start_point, :16]
        s = np.concatenate([[s], [traj[start_point, 17:]]], axis=1)[0]
        env.state = s

        for j in range(start_point + 1, start_point + 1 + max_ep_steps):
            if Render:
                env.render()

            delta = np.zeros(36)

            # ###### NOISE ##############
            noise = np.random.normal(0, 0.01, 16)
            delta[20:] = noise

            # ###### IF noise env ##############
            # s = s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############
            # noise = s[0:16] * 0.01
            # delta[0:16] = noise

            a = policy.choose_action(s + delta)
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2
            # action = traj[j - 1, 16]

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            X_, r, done, theta = env.step(action)

            # The new s = current state, next omega, next state
            s_ = np.concatenate([X_, [traj[j, 17:]]], axis=1)[0]
            # s_ = np.concatenate([[s_], [theta]], axis=1)[0]
            # s_ = np.concatenate([X_, [[theta]], [traj[j, 9:]]], axis=1)[0]
            env.state = s_
            # theta_pre = theta

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1 + start_point:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_, _s)
            # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['distance'].append(distance)
                current_path['kl_divergence'].append(kl)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['beta'].append(beta)
                current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                logger.logkv("total_timesteps", global_step)
                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                # print(training_diagnotic)
                if training_diagnotic is not None:
                    eval_diagnotic = training_evaluation(variant, env, policy)
                    [logger.logkv(key, eval_diagnotic[key]) for key in eval_diagnotic.keys()]
                    training_diagnotic.pop('return')
                    [logger.logkv(key, training_diagnotic[key]) for key in training_diagnotic.keys()]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [string_to_print.extend([key, ':', str(eval_diagnotic[key]), '|'])
                     for key in eval_diagnotic.keys()]
                    [string_to_print.extend([key, ':', str(round(training_diagnotic[key], 2)), '|'])
                     for key in training_diagnotic.keys()]
                    print(''.join(string_to_print))

                    logger.dumpkvs()

                    if eval_diagnotic['test_return'] / eval_diagnotic['test_average_length'] <= Min_cost:
                        Min_cost = eval_diagnotic['test_return'] / eval_diagnotic['test_average_length']
                        print("New lowest cost:", Min_cost)
                        policy.save_result(log_path)

            if training_started and global_step % (10 * evaluation_frequency) == 0 and global_step > 0:
                policy.save_result(log_path)

            # Status update
            _s = s
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                break

    policy.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
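# The CAC trainer above reads its whole configuration from a single `variant` dict.
# The dict below is only an illustrative sketch: it lists the keys that this train()
# itself accesses, with placeholder values ('CMAPSS' mirrors the environment name in the
# commented-out restore path above; everything else is made up). The real configuration,
# plus any extra keys required by CAC or training_evaluation(), lives elsewhere in the repo.
_EXAMPLE_CAC_VARIANT = {
    'env_name': 'CMAPSS',              # hypothetical; anything get_env_from_name resolves
    'log_path': './log/example/CAC/0',
    'store_last_n_paths': 10,
    'evaluation_frequency': 2048,
    'env_params': {
        'max_episodes': int(1e5),
        'max_ep_steps': 512,
        'max_global_steps': int(1e6),
        'eval_render': False,
    },
    'alg_params': {
        'memory_capacity': int(1e6),
        'min_memory_size': 1000,
        'steps_per_cycle': 100,
        'train_per_cycle': 50,
        'batch_size': 256,
        'lr_a': 1e-4,
        'lr_c': 3e-4,
        'lr_l': 3e-4,
        'tau': 5e-3,
        'alpha3': 0.2,
        'history_horizon': 0,
        'finite_horizon': False,
    },
}
# Example usage (assuming the repo-local helpers are importable):
#     train(_EXAMPLE_CAC_VARIANT)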
def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    evaluation_env = get_env_from_name(env_name)

    env_params = variant['env_params']
    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    batch_size = policy_params['batch_size']
    lr_c = policy_params['lr_c']
    cliprange = policy_params['cliprange']
    cliprangenow = cliprange
    lr_c_now = lr_c  # learning rate for critic
    gamma = policy_params['gamma']
    gae_lamda = policy_params['gae_lamda']

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=policy_params['output_format'])
    logger.logkv('safety_threshold', policy_params['safety_threshold'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', batch_size)

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0] \
            + env.observation_space.spaces['achieved_goal'].shape[0] \
            + env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = policy_build_fn(a_dim, s_dim, policy_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=policy.N)

    for j in range(max_global_steps):
        if global_step > max_global_steps:
            break

        mb_obs, mb_obs_, mb_rewards, mb_actions, mb_values, mb_terminals, mb_t = \
            [], [], [], [], [], [], []

        for n in range(policy.N):
            current_path = {
                'rewards': [],
                'obs': [],
                'obs_': [],
                'done': [],
                'value': [],
                't': [],
                'action': [],
            }
            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            # For n in range number of steps
            for t in range(max_ep_steps):
                # Given observations, get action value and neglogpacs
                # We already have self.obs because the Runner superclass runs
                # self.obs[:] = env.reset() on init
                [a], [value] = policy.choose_action(s)
                action = np.tanh(a)
                action = a_lowerbound + (action + 1.) * (a_upperbound - a_lowerbound) / 2

                # Run in simulator
                s_, r, done, info = env.step(action)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])

                if t == max_ep_steps - 1:
                    done = True
                terminal = 1. if done else 0.

                if Render:
                    env.render()

                current_path['rewards'].append(r)
                current_path['action'].append(a)
                current_path['obs'].append(s)
                current_path['obs_'].append(s_)
                current_path['done'].append(terminal)
                current_path['value'].append(value)
                current_path['t'].append(t)

                if done:
                    global_step += t + 1
                    last_training_paths.appendleft(current_path)
                    break
                else:
                    s = s_

        # mb_obs = np.asarray(mb_obs, dtype=s.dtype)
        # mb_values = np.asarray(mb_values, dtype=s.dtype)
        # mb_l_values = np.asarray(mb_l_values, dtype=s.dtype)
        # mb_actions = np.asarray(mb_actions, dtype=action.dtype)
        # mb_obs_ = np.asarray(mb_obs_, dtype=s_.dtype)
        # mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        # mb_l_rewards = np.asarray(mb_l_rewards, dtype=np.float32)
        # mb_terminals = np.asarray(mb_terminals, dtype=np.float32)
        # last_value, last_l_value = policy.predict_values([s_])

        rescale = np.mean([len(path) for path in last_training_paths])
        initial_return = []
        mb_advs = []
        for path in last_training_paths:
            lastgaelam = 0
            path_advs = np.zeros_like(path['rewards'])
            path_values = path['value']
            path_next_values = path['value'][1:]
            path_next_values.append(policy.predict_values(path['obs_'][-1]))
            for t in reversed(range(len(path_values))):
                delta = path['rewards'][t] \
                    + gamma * path_next_values[t] * (1 - path['done'][t]) - path_values[t]
                path_advs[t] = lastgaelam = \
                    delta + gamma * gae_lamda * (1 - path['done'][t]) * lastgaelam
            path_returns = path_advs + path_values
            initial_return.append(path_returns[0])
            mb_advs.extend(path_advs)
            mb_obs.extend(path['obs'])
            mb_obs_.extend(path['obs_'])
            mb_values.extend(path['value'])
            mb_terminals.extend(path['done'])
            mb_t.extend(path['t'])
            mb_actions.extend(path['action'])

        initial_return = np.asarray(initial_return, dtype=np.float32)
        mb_obs = np.asarray(mb_obs, dtype=s.dtype)
        mb_values = np.asarray(mb_values, dtype=s.dtype)
        mb_actions = np.asarray(mb_actions, dtype=action.dtype)
        mb_obs_ = np.asarray(mb_obs_, dtype=s_.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_terminals = np.asarray(mb_terminals, dtype=np.float32)
        mb_advs = np.asarray(mb_advs, dtype=np.float32)
        mb_t = np.asarray(mb_t, dtype=np.float32)
        mb_returns = mb_advs + mb_values

        mblossvals = []
        inds = np.arange(len(mb_advs), dtype=int)
        initial_return = np.mean(initial_return)

        # Randomize the indexes
        np.random.shuffle(inds)
        # 0 to batch_size with batch_train_size step
        # if sum(current_path['l_rewards']) > 0:
        #     policy.ALPHA3 = min(policy.ALPHA3 * 1.5, policy_params['alpha3'])
        # else:
        #     policy.ALPHA3 = min(policy.ALPHA3 * 1.01, policy_params['alpha3'])
        slices = (arr[inds] for arr in
                  (mb_obs, mb_obs_, mb_returns, mb_advs, mb_actions, mb_values, mb_t))
        # print(**slices)
        mblossvals.append(
            policy.update(*slices, initial_return, cliprangenow, lr_c_now, rescale))
        mblossvals = np.mean(mblossvals, axis=0)

        frac = 1.0 - (global_step - 1.0) / max_global_steps
        cliprangenow = cliprange * frac
        lr_c_now = lr_c * frac  # learning rate for critic
        # lr_l_now = lr_l * frac  # learning rate for Lyapunov critic

        logger.logkv("total_timesteps", global_step)
        training_diagnotic = evaluate_training_rollouts(last_training_paths)
        if training_diagnotic is not None:
            # [training_diagnotics[key].append(training_diagnotic[key])
            #  for key in training_diagnotic.keys()]
            eval_diagnotic = training_evaluation(variant, evaluation_env, policy)
            [logger.logkv(key, eval_diagnotic[key]) for key in eval_diagnotic.keys()]
            training_diagnotic.pop('return')
            [logger.logkv(key, training_diagnotic[key]) for key in training_diagnotic.keys()]
            logger.logkv('lr_c', lr_c_now)
            [logger.logkv(name, value)
             for name, value in zip(policy.diagnosis_names, mblossvals)]

            string_to_print = ['time_step:', str(global_step), '|']
            [string_to_print.extend([key, ':', str(eval_diagnotic[key]), '|'])
             for key in eval_diagnotic.keys()]
            [string_to_print.extend([key, ':', str(round(training_diagnotic[key], 2)), '|'])
             for key in training_diagnotic.keys()]
            print(''.join(string_to_print))

            logger.dumpkvs()

    # State update
    # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
    print('Running time: ', time.time() - t1)
    return
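# The per-path advantage computation in the trainer above is standard GAE(lambda):
#     delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
#     A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
# The helper below is an illustrative, self-contained restatement of that same recursion
# (not part of the repo API); it assumes equal-length per-step lists as produced above.
def _gae_advantages(rewards, values, next_values, dones, gamma, lam):
    """Compute GAE(lambda) advantages for a single trajectory."""
    advs = np.zeros(len(rewards), dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(len(rewards))):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_values[t] * nonterminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    return advs  # returns-to-go are advs + values, exactly as in the loop above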
def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    env_params = variant['env_params']
    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['num_of_training_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']
    policy_params['network_structure'] = env_params['network_structure']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0] \
            + env.observation_space.spaces['achieved_goal'].shape[0] \
            + env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = LAC(a_dim, s_dim, policy_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon'],
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):
        current_path = {
            'rewards': [],
            'a_loss': [],
            'alpha': [],
            'lambda': [],
            'lyapunov_error': [],
            'entropy': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()

            a = policy.choose_action(s)
            # a = a * 0
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            disturbance_input = np.zeros([a_dim + s_dim])
            s_, r, done, info = env.step(action)
            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])

            if info['done'] > 0:
                done = True

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1:
                done = True

            terminal = 1. if done else 0.
            pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_)
            # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                logger.logkv("total_timesteps", global_step)
                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                if training_diagnotic is not None:
                    if variant['num_of_evaluation_paths'] > 0:
                        eval_diagnotic = training_evaluation(variant, env, policy)
                        [logger.logkv(key, eval_diagnotic[key]) for key in eval_diagnotic.keys()]

                    training_diagnotic.pop('return')
                    [logger.logkv(key, training_diagnotic[key]) for key in training_diagnotic.keys()]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    if variant['num_of_evaluation_paths'] > 0:
                        [string_to_print.extend([key, ':', str(eval_diagnotic[key]), '|'])
                         for key in eval_diagnotic.keys()]
                    [string_to_print.extend([key, ':', str(round(training_diagnotic[key], 2)), '|'])
                     for key in training_diagnotic.keys()]
                    print(''.join(string_to_print))

                    logger.dumpkvs()

            # State update
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                break

    policy.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
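# Every trainer in this file maps the policy's squashed output a in [-1, 1] onto the
# environment's action box with low + (a + 1) * (high - low) / 2. The helper below is a
# small self-contained restatement of that affine rescaling, for illustration only (it is
# not used by the trainers above).
def _rescale_action(a, low, high):
    """Map a in [-1, 1] elementwise onto [low, high]."""
    return low + (a + 1.0) * (high - low) / 2.0

# e.g. with low = -2.0 and high = 2.0: a = -1 -> -2.0, a = 0 -> 0.0, a = 1 -> 2.0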
def train(variant): env_name = variant["env_name"] env = get_env_from_name(env_name) env_params = variant["env_params"] max_episodes = env_params["max_episodes"] max_ep_steps = env_params["max_ep_steps"] max_global_steps = env_params["max_global_steps"] store_last_n_paths = variant["num_of_training_paths"] evaluation_frequency = variant["evaluation_frequency"] policy_params = variant["alg_params"] policy_params["network_structure"] = env_params["network_structure"] min_memory_size = policy_params["min_memory_size"] steps_per_cycle = policy_params["steps_per_cycle"] train_per_cycle = policy_params["train_per_cycle"] batch_size = policy_params["batch_size"] lr_a, lr_c, lr_l = ( policy_params["lr_a"], policy_params["lr_c"], policy_params["lr_l"], ) lr_a_now = lr_a # learning rate for actor lr_c_now = lr_c # learning rate for critic lr_l_now = lr_l # learning rate for critic if "Fetch" in env_name or "Hand" in env_name: s_dim = (env.observation_space.spaces["observation"].shape[0] + env.observation_space.spaces["achieved_goal"].shape[0] + env.observation_space.spaces["desired_goal"].shape[0]) else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] # if disturber_params['process_noise']: # d_dim = disturber_params['noise_dim'] # else: # d_dim = env_params['disturbance dim'] a_upperbound = env.action_space.high a_lowerbound = env.action_space.low policy = LAC(a_dim, s_dim, policy_params) pool_params = { "s_dim": s_dim, "a_dim": a_dim, "d_dim": 1, "store_last_n_paths": store_last_n_paths, "memory_capacity": policy_params["memory_capacity"], "min_memory_size": policy_params["min_memory_size"], "history_horizon": policy_params["history_horizon"], "finite_horizon": policy_params["finite_horizon"], } if "value_horizon" in policy_params.keys(): pool_params.update({"value_horizon": policy_params["value_horizon"]}) else: pool_params["value_horizon"] = None pool = Pool(pool_params) # For analyse Render = env_params["eval_render"] # Training setting t1 = time.time() global_step = 0 last_training_paths = deque(maxlen=store_last_n_paths) training_started = False log_path = variant["log_path"] logger.configure(dir=log_path, format_strs=["csv"]) logger.logkv("tau", policy_params["tau"]) logger.logkv("alpha3", policy_params["alpha3"]) logger.logkv("batch_size", policy_params["batch_size"]) logger.logkv("target_entropy", policy.target_entropy) for i in range(max_episodes): current_path = { "rewards": [], "a_loss": [], "alpha": [], "lambda": [], "lyapunov_error": [], "entropy": [], } if global_step > max_global_steps: break s = env.reset() if "Fetch" in env_name or "Hand" in env_name: s = np.concatenate([s[key] for key in s.keys()]) for j in range(max_ep_steps): if Render: env.render() a = policy.choose_action(s) action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2 # action = a # Run in simulator disturbance_input = np.zeros([a_dim + s_dim]) s_, r, done, info = env.step(action) if "Fetch" in env_name or "Hand" in env_name: s_ = np.concatenate([s_[key] for key in s_.keys()]) if info["done"] > 0: done = True if training_started: global_step += 1 if j == max_ep_steps - 1: done = True terminal = 1.0 if done else 0.0 pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_) # policy.store_transition(s, a, disturbance, r,0, terminal, s_) if (pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0): training_started = True for _ in range(train_per_cycle): batch = pool.sample(batch_size) labda, alpha, l_loss, entropy, a_loss = policy.learn( lr_a_now, 
lr_c_now, lr_l_now, lr_a, batch) if training_started: current_path["rewards"].append(r) current_path["lyapunov_error"].append(l_loss) current_path["alpha"].append(alpha) current_path["lambda"].append(labda) current_path["entropy"].append(entropy) current_path["a_loss"].append(a_loss) if (training_started and global_step % evaluation_frequency == 0 and global_step > 0): logger.logkv("total_timesteps", global_step) training_diagnostics = evaluate_training_rollouts( last_training_paths) if training_diagnostics is not None: if variant["num_of_evaluation_paths"] > 0: eval_diagnostics = training_evaluation( variant, env, policy) [ logger.logkv(key, eval_diagnostics[key]) for key in eval_diagnostics.keys() ] training_diagnostics.pop("return") [ logger.logkv(key, training_diagnostics[key]) for key in training_diagnostics.keys() ] logger.logkv("lr_a", lr_a_now) logger.logkv("lr_c", lr_c_now) logger.logkv("lr_l", lr_l_now) string_to_print = ["time_step:", str(global_step), "|"] if variant["num_of_evaluation_paths"] > 0: [ string_to_print.extend( [key, ":", str(eval_diagnostics[key]), "|"]) for key in eval_diagnostics.keys() ] [ string_to_print.extend([ key, ":", str(round(training_diagnostics[key], 2)), "|" ]) for key in training_diagnostics.keys() ] print("".join(string_to_print)) logger.dumpkvs() # 状态更新 s = s_ # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY if done: if training_started: last_training_paths.appendleft(current_path) frac = 1.0 - (global_step - 1.0) / max_global_steps lr_a_now = lr_a * frac # learning rate for actor lr_c_now = lr_c * frac # learning rate for critic lr_l_now = lr_l * frac # learning rate for critic break policy.save_result(log_path) print("Running time: ", time.time() - t1) return
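# Each trainer anneals its learning rates (and, in the PPO-style variant, the clip range)
# linearly toward zero over max_global_steps via frac = 1 - (global_step - 1) / max_global_steps.
# The helper below is a minimal restatement of that schedule, for illustration only.
def _linear_decay(initial_value, global_step, max_global_steps):
    """Linearly anneal initial_value toward zero as global_step approaches max_global_steps."""
    frac = 1.0 - (global_step - 1.0) / max_global_steps
    return initial_value * frac

# e.g. lr_a_now = _linear_decay(lr_a, global_step, max_global_steps) reproduces the
# lr_a_now = lr_a * frac update applied at episode boundaries above.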