def train(variant):
    s_save = []
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for the Lyapunov critic

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])

    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)

    # For analysis
    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    for i in range(max_episodes):
        ep_reward = 0
        l_r = 0
        current_path = {
            'rewards': [],
            'l_rewards': [],
            'violation': [],
        }
        for key in policy.diag_names:
            current_path[key] = []

        if global_step > max_global_steps:
            break

        s = env.reset()

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
            if training_started:
                global_step += 1
            l_r = info['l_rewards']

            if j == max_ep_steps - 1:
                done = True
            terminal = 1. if done else 0.
            violation_of_constraint = info['violation_of_constraint']

            # Store s, a, s_next and the rewards for learning
            policy.store_transition(s, a, r, l_r, terminal, s_)

            # Dump all visited states for offline analysis
            s_save.append(s_)
            sio.savemat('data_all.mat', {'s': s_save})

            # If the state is close to the constraint boundary, also store the
            # transition in the edge memory
            # if policy.use_lyapunov is True and np.abs(s[0]) > env.cons_pos:  # or np.abs(s[2]) > env.theta_threshold_radians*0.8
            if policy.use_lyapunov and judge_safety_func(s_, r, done, info):  # or np.abs(s[2]) > env.theta_threshold_radians*0.8
                policy.store_edge_transition(s, a, r, l_r, terminal, s_)

            # Learn
            if policy.pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                for _ in range(train_per_cycle):
                    train_diagnotic = policy.learn(lr_a_now, lr_c_now, lr_l_now)

            if training_started:
                current_path['rewards'].append(r)
                current_path['l_rewards'].append(l_r)
                current_path['violation'].append(violation_of_constraint)
                for key, value in zip(policy.diag_names, train_diagnotic):
                    current_path[key].append(value)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(policy,
                                                       evaluation_env,
                                                       num_of_paths,
                                                       max_ep_steps,
                                                       render=Render)
                    diagnotic = evaluate_rollouts(rollouts)
                    print(
                        'training_step:', global_step,
                        'average eval reward:', diagnotic['return-average'],
                        'average eval lreward:', diagnotic['lreturn-average'],
                        'average eval violations:', diagnotic['violation-avg'],
                        'average length:', diagnotic['episode-length-avg'],
                    )
                    logger.logkv('eval_eprewmean', diagnotic['return-average'])
                    logger.logkv('eval_eplrewmean', diagnotic['lreturn-average'])
                    logger.logkv('eval_eplenmean', diagnotic['episode-length-avg'])
                    logger.logkv('eval_violation_times', diagnotic['violation-avg'])

                logger.logkv('total_timesteps', global_step)

                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                if training_diagnotic is not None:
                    logger.logkv('eprewmean', training_diagnotic['rewards'])
                    logger.logkv('eplrewmean', training_diagnotic['l_rewards'])
                    logger.logkv('eplenmean', training_diagnotic['len'])
                    logger.logkv('end_cost', training_diagnotic['end_cost'])
                    for key in policy.diag_names:
                        logger.logkv(key, training_diagnotic[key])
                    logger.logkv('violation_times', training_diagnotic['violation'])
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    print(
                        'training_step:', global_step,
                        'average reward:', round(training_diagnotic['rewards'], 2),
                        'average lreward:', round(training_diagnotic['l_rewards'], 2),
                        'average violations:', training_diagnotic['violation'],
                        'end cost:', round(training_diagnotic['end_cost'], 2),
                        'average length:', round(training_diagnotic['len'], 1),
                        'lyapunov error:', round(training_diagnotic['lyapunov_error'], 6),
                        'critic1 error:', round(training_diagnotic['critic1_error'], 6),
                        'critic2 error:', round(training_diagnotic['critic2_error'], 6),
                        'policy_loss:', round(training_diagnotic['policy_loss'], 6),
                        'alpha:', round(training_diagnotic['alpha'], 6),
                        'lambda:', round(training_diagnotic['labda'], 6),
                        'entropy:', round(training_diagnotic['entropy'], 6),
                    )  # 'max_grad:', round(training_diagnotic['max_grad'], 6)
                logger.dumpkvs()

            # Update the state
            s = s_
            ep_reward += r

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                ewma_step[0, i + 1] = ewma_p * ewma_step[0, i] + (1 - ewma_p) * j
                ewma_reward[0, i + 1] = ewma_p * ewma_reward[0, i] + (1 - ewma_p) * ep_reward

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for the Lyapunov critic
                break

    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
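
# Illustrative only: a minimal `variant` dict wiring together the keys that the
# trainer above reads. Every value below is a hypothetical placeholder, not a
# hyperparameter taken from this repository, and 'CartPolecost-v0' / 'LAC' merely
# stand in for whatever identifiers get_env_from_name / get_policy accept here.
# The policy builder may require additional keys in 'alg_params'.
EXAMPLE_VARIANT = {
    'env_name': 'CartPolecost-v0',      # hypothetical environment id
    'evaluate': False,                  # no separate evaluation env
    'env_params': {
        'max_episodes': 1000,
        'max_ep_steps': 250,
        'max_global_steps': int(1e6),
        'eval_render': False,
    },
    'store_last_n_paths': 10,
    'evaluation_frequency': 2048,
    'num_of_paths': 5,
    'algorithm_name': 'LAC',            # hypothetical key for get_policy
    'alg_params': {
        'min_memory_size': 1000,
        'steps_per_cycle': 100,
        'train_per_cycle': 80,
        'lr_a': 1e-4, 'lr_c': 3e-4, 'lr_l': 3e-4,
        'tau': 5e-3, 'alpha3': 0.2, 'batch_size': 256,
    },
    'log_path': './log/example_run',
}
# train(EXAMPLE_VARIANT)  # would launch training with the settings above
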
def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for the Lyapunov critic

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = (env.observation_space.spaces['observation'].shape[0]
                 + env.observation_space.spaces['achieved_goal'].shape[0]
                 + env.observation_space.spaces['desired_goal'].shape[0])
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)

    # For analysis
    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    for i in range(max_episodes):
        ep_reward = 0
        l_r = 0
        current_path = {
            'rewards': [],
            'l_rewards': [],
            'l_error': [],
            'critic1_error': [],
            'critic2_error': [],
            'alpha': [],
            'lambda': [],
            'entropy': [],
            'a_loss': [],
            'violation': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s, True)
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True

            if training_started:
                global_step += 1
            l_r = info['l_rewards']

            if j == max_ep_steps - 1:
                done = True
            terminal = 1. if done else 0.
            violation_of_constraint = info['violation_of_constraint']

            # Store s, a, s_next and the rewards for learning
            policy.store_transition(s, a, r, l_r, terminal, s_)

            # Learn
            if policy.pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                for _ in range(train_per_cycle):
                    labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now)

            if training_started:
                current_path['rewards'].append(r)
                current_path['l_rewards'].append(l_r)
                current_path['l_error'].append(l_loss)
                current_path['critic1_error'].append(c1_loss)
                current_path['critic2_error'].append(c2_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['violation'].append(violation_of_constraint)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(policy,
                                                       evaluation_env,
                                                       num_of_paths,
                                                       max_ep_steps,
                                                       render=Render)
                    diagnotic = evaluate_rollouts(rollouts)
                    print(
                        'training_step:', global_step,
                        'average eval reward:', diagnotic['return-average'],
                        'average eval lreward:', diagnotic['lreturn-average'],
                        'average eval violations:', diagnotic['violation-avg'],
                        'average length:', diagnotic['episode-length-avg'],
                    )
                    logger.logkv('eval_eprewmean', diagnotic['return-average'])
                    logger.logkv('eval_eplrewmean', diagnotic['lreturn-average'])
                    logger.logkv('eval_eplenmean', diagnotic['episode-length-avg'])
                    logger.logkv('eval_violation_times', diagnotic['violation-avg'])

                logger.logkv('total_timesteps', global_step)

                training_diagnotic = evaluate_training_rollouts(last_training_paths)
                if training_diagnotic is not None:
                    logger.logkv('eprewmean', training_diagnotic['train-return-average'])
                    logger.logkv('eplrewmean', training_diagnotic['train-lreturn-average'])
                    logger.logkv('eplenmean', training_diagnotic['train-episode-length-avg'])
                    logger.logkv('lyapunov_lambda', training_diagnotic['train-lambda-avg'])
                    logger.logkv('alpha', training_diagnotic['train-alpha-avg'])
                    logger.logkv('entropy', training_diagnotic['train-entropy-avg'])
                    logger.logkv('critic1 error', training_diagnotic['train-critic1-error-avg'])
                    logger.logkv('critic2 error', training_diagnotic['train-critic2-error-avg'])
                    logger.logkv('lyapunov error', training_diagnotic['train-lyapunov-error-avg'])
                    logger.logkv('policy_loss', training_diagnotic['train-a-loss-avg'])
                    logger.logkv('average_cost',
                                 training_diagnotic['train-return-average'] /
                                 training_diagnotic['train-episode-length-avg'])
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    print(
                        'training_step:', global_step,
                        'average reward:', round(training_diagnotic['train-return-average'], 2),
                        'average lreward:', round(training_diagnotic['train-lreturn-average'], 2),
                        'average violations:', training_diagnotic['train-violation-avg'],
                        'average length:', round(training_diagnotic['train-episode-length-avg'], 1),
                        'lyapunov error:', round(training_diagnotic['train-lyapunov-error-avg'], 6),
                        'critic1 error:', round(training_diagnotic['train-critic1-error-avg'], 6),
                        'critic2 error:', round(training_diagnotic['train-critic2-error-avg'], 6),
                        'policy_loss:', round(training_diagnotic['train-a-loss-avg'], 6),
                        'alpha:', round(training_diagnotic['train-alpha-avg'], 6),
                        'lambda:', round(training_diagnotic['train-lambda-avg'], 6),
                        'entropy:', round(training_diagnotic['train-entropy-avg'], 6),
                    )
                logger.dumpkvs()

            # Update the state
            s = s_
            ep_reward += r

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                ewma_step[0, i + 1] = ewma_p * ewma_step[0, i] + (1 - ewma_p) * j
                ewma_reward[0, i + 1] = ewma_p * ewma_reward[0, i] + (1 - ewma_p) * ep_reward

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for the Lyapunov critic
                break

    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
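
# Illustrative only: how the 'Fetch'/'Hand' branches above flatten a goal-based
# observation dict into the single vector the policy expects. The field sizes
# below are hypothetical; the key order simply follows dict iteration order,
# exactly as in the np.concatenate calls inside train().
import numpy as np

_example_obs = {
    'observation': np.zeros(10),
    'achieved_goal': np.zeros(3),
    'desired_goal': np.zeros(3),
}
_flat = np.concatenate([_example_obs[key] for key in _example_obs.keys()])
assert _flat.shape == (16,)  # matches s_dim computed from the Dict space above
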