def train(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['num_of_training_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']
    policy_params['network_structure'] = env_params['network_structure']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0] \
                + env.observation_space.spaces['achieved_goal'].shape[0] \
                + env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = LAC(a_dim, s_dim, policy_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            'rewards': [],
            'a_loss': [],
            'alpha': [],
            'lambda': [],
            'lyapunov_error': [],
            'entropy': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()
        if 'Fetch' in env_name or 'Hand' in env_name:
            s = np.concatenate([s[key] for key in s.keys()])

        for j in range(max_ep_steps):
            if Render:
                env.render()
            a = policy.choose_action(s)
            # a = a * 0
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2

            # Run in simulator
            disturbance_input = np.zeros([a_dim + s_dim])
            s_, r, done, info = env.step(action)
            if 'Fetch' in env_name or 'Hand' in env_name:
                s_ = np.concatenate([s_[key] for key in s_.keys()])
                if info['done'] > 0:
                    done = True

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1:
                done = True

            terminal = 1. if done else 0.
            pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_)
            # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnostic = evaluate_training_rollouts(last_training_paths)
                if training_diagnostic is not None:
                    if variant['num_of_evaluation_paths'] > 0:
                        eval_diagnostic = training_evaluation(variant, env, policy)
                        [logger.logkv(key, eval_diagnostic[key]) for key in eval_diagnostic.keys()]
                    training_diagnostic.pop('return')
                    [logger.logkv(key, training_diagnostic[key]) for key in training_diagnostic.keys()]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    if variant['num_of_evaluation_paths'] > 0:
                        [string_to_print.extend([key, ':', str(eval_diagnostic[key]), '|'])
                         for key in eval_diagnostic.keys()]
                    [string_to_print.extend([key, ':', str(round(training_diagnostic[key], 2)), '|'])
                     for key in training_diagnostic.keys()]
                    print(''.join(string_to_print))

                logger.dumpkvs()

            # State update
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                break

    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
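
# Illustration only: a minimal sketch of the `variant` dictionary the LAC training loop above
# reads. The keys listed here are exactly the ones accessed in that function; every value is a
# hypothetical placeholder (the real settings live in the repo's variant/config files), and
# LAC() and Pool() may require additional alg_params keys not shown here.
example_variant = {
    'env_name': 'your-env-name',          # must be resolvable by get_env_from_name()
    'num_of_training_paths': 10,
    'num_of_evaluation_paths': 0,
    'evaluation_frequency': 2048,
    'log_path': './log/example',
    'env_params': {
        'max_episodes': int(1e5),
        'max_ep_steps': 250,
        'max_global_steps': int(3e5),
        'network_structure': {'actor': [64, 64], 'critic': [64, 64]},  # assumed layout
        'eval_render': False,
    },
    'alg_params': {
        'memory_capacity': int(1e6),
        'min_memory_size': 1000,
        'steps_per_cycle': 100,
        'train_per_cycle': 50,
        'batch_size': 256,
        'lr_a': 1e-4, 'lr_c': 3e-4, 'lr_l': 3e-4,
        'tau': 5e-3,
        'alpha3': 0.2,
        'history_horizon': 0,
        'finite_horizon': False,
    },
}
# train(example_variant)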
def train(variant):
    Min_cost = 1000000

    traj = get_traj()  # get data
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params['max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    s_dim = env.observation_space.shape[0]
    print("s_dim is ", s_dim)

    a_dim = env.action_space.shape[0]

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = CAC(a_dim, s_dim, policy_params)
    # policy.restore("log/CMAPSS/CAC-new-reward-0.01/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):

        current_path = {
            'rewards': [],
            'distance': [],
            'kl_divergence': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point
        start_point = np.random.randint(0, 500000)
        s = traj[start_point, :16]

        # current state, theta, next w, desired state
        # this is for decision making
        # 16, 1, 4, 16
        s = np.concatenate([[s], [traj[start_point, 17:]]], axis=1)[0]

        env.state = s

        for j in range(start_point + 1, start_point + 1 + max_ep_steps):
            if Render:
                env.render()
            delta = np.zeros(36)

            # ###### NOISE ##############
            noise = np.random.normal(0, 0.01, 16)
            delta[20:] = noise

            # ######## IF Noise env ##########
            # s = s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############
            # noise = s[0:16] * 0.01
            # delta[0:16] = noise

            a = policy.choose_action(s + delta)
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2
            # action = traj[j-1, 16]

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            X_, r, done, theta = env.step(action)
            # The new s = current state, next omega, next state
            s_ = np.concatenate([X_, [traj[j, 17:]]], axis=1)[0]
            # s_ = np.concatenate([[s_], [theta]], axis=1)[0]
            # s_ = np.concatenate([X_, [[theta]], [traj[j, 9:]]], axis=1)[0]
            env.state = s_
            # theta_pre = theta

            if training_started:
                global_step += 1

            if j == max_ep_steps - 1 + start_point:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_, _s)
            # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch)

            if training_started:
                current_path['rewards'].append(r)
                current_path['distance'].append(distance)
                current_path['kl_divergence'].append(kl)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['beta'].append(beta)
                current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnostic = evaluate_training_rollouts(last_training_paths)
                # print(training_diagnostic)
                if training_diagnostic is not None:
                    eval_diagnostic = training_evaluation(variant, env, policy)
                    [logger.logkv(key, eval_diagnostic[key]) for key in eval_diagnostic.keys()]
                    training_diagnostic.pop('return')
                    [logger.logkv(key, training_diagnostic[key]) for key in training_diagnostic.keys()]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [string_to_print.extend([key, ':', str(eval_diagnostic[key]), '|'])
                     for key in eval_diagnostic.keys()]
                    [string_to_print.extend([key, ':', str(round(training_diagnostic[key], 2)), '|'])
                     for key in training_diagnostic.keys()]
                    print(''.join(string_to_print))

                    logger.dumpkvs()

                    if eval_diagnostic['test_return'] / eval_diagnostic['test_average_length'] <= Min_cost:
                        Min_cost = eval_diagnostic['test_return'] / eval_diagnostic['test_average_length']
                        print("New lowest cost:", Min_cost)
                        policy.save_result(log_path)

            if training_started and global_step % (10 * evaluation_frequency) == 0 and global_step > 0:
                policy.save_result(log_path)

            # State update
            _s = s
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                break

    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
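
# Every training loop in this file rescales the squashed policy output a in [-1, 1] to the
# environment's action box with the same affine map. A small standalone sketch of that formula;
# the helper name `rescale_action` is ours, not part of the repo's API.
import numpy as np

def rescale_action(a, low, high):
    """Map a policy output in [-1, 1] to the action range [low, high]."""
    return low + (a + 1.0) * (high - low) / 2.0

# a = -1 -> low, a = 0 -> midpoint, a = +1 -> high
print(rescale_action(np.array([-1.0, 0.0, 1.0]), low=-2.0, high=2.0))  # [-2.  0.  2.]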
def train(variant):
    Min_cost = 1000000

    data_trajectories = get_data()  # get data (X, W, X_, theta, state)
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)
    num_data_traj = variant['num_data_trajectories']
    reward_id = variant['reward_id']

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params['max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    s_dim = env.observation_space.shape[0]  # dimension of state (3 for Battery)
    a_dim = env.action_space.shape[0]  # action space dimension (1 or 2)

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    policy = CAC(a_dim, s_dim, policy_params)
    policy.restore(variant['log_path'] + "/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']
    ref_s = env.reference_state

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for i in range(max_episodes):
        print("episode # ", i)
        print("global steps ", global_step)

        current_path = {
            'rewards': [],
            'distance': [],
            'kl_divergence': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point
        # traj_id = np.random.randint(0, len(data_trajectories))
        traj_id = np.random.randint(0, num_data_traj)
        # traj_id = 1
        traj = data_trajectories[traj_id]
        # print(len(traj))

        if variant['traj_start'] == "random":
            start_point = np.random.randint(0, len(traj) - 2)
        else:
            start_point = int(variant['traj_start'])

        # s = traj[start_point, 1]
        s = traj[start_point, -8:]

        # current state, theta, next w, desired state
        # this is for decision making
        # 16, 1, 4, 16
        # s = np.array([s, traj[start_point, 2], traj[start_point, 4]])
        # print(i, s)
        s = np.array(list(s) + [traj[start_point, 2]] + list(traj[start_point + 1, -8:]))
        # print(s)

        env.state = s
        env.model.state = traj[start_point, -8:]
        # env.state = env.model.state

        # ep_steps = len(traj)
        ep_steps = min(start_point + 1 + max_ep_steps, len(traj))
        # print("selected traj = ", traj_id, " and length = ", len(traj),
        #       " starting = ", start_point, " ep_steps = ", ep_steps)
        for j in range(start_point + 1, ep_steps):
            if Render:
                env.render()
            s = env.state
            delta = np.zeros(s.shape)

            # ###### NOISE ##############
            # noise = np.random.normal(0, 0.01, 0.01)
            # delta[2:] = noise

            # ######## IF Noise env ##########
            # s = s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############
            # noise = s[0:16] * 0.01
            # delta[0:16] = noise

            # store_s = s.copy()
            # store_s[2] = store_s[2] - store_s[0]
            # a = policy.choose_action(store_s + delta)
            # print(s, delta)
            a = policy.choose_action(s / ref_s + delta)
            # print("a: ", a)
            action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2
            # action = traj[j-1, 16]
            # print("a normalize: ", action)

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            s_, r, done, X_ = env.step(action, traj[j, 2], traj[j, 1])
            # The new s = current state, next omega, next state
            s_ = np.array(list(s_) + [traj[j + 1, 2]] + list(traj[j + 1, -8:]))
            # s_ = np.array([X_[1][0], traj[j, 2], traj[j, 4]])
            # s_ = np.array([traj[j, 1], traj[j, 2], traj[j, 4]])
            r = modify_reward(r, s, s_, reward_id)
            # print(r)

            if global_step % 100 == 1:
                print("global step: ", global_step, " true action: ", [traj[j, 5], traj[j, 6]],
                      " predicted action: ", action, " and reward : ", r)
                # print("new state is : ", s_)

            # s_ = np.concatenate([[s_], [theta]], axis=1)[0]
            # s_ = np.concatenate([X_, [[theta]], [traj[j, 9:]]], axis=1)[0]
            env.state = s_
            # store_s_ = s_.copy()
            # store_s_[2] = store_s_[2] - store_s_[0]
            # theta_pre = theta

            if training_started:
                global_step += 1

            if j == ep_steps - 2:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s / ref_s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_ / ref_s, _s / ref_s)
                # pool.store(store_s, a, np.zeros([1]), np.zeros([1]), r, terminal, store_s_, store__s)
            # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True
                # print("learning policy")

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    labda, alpha, beta, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch)
                if global_step % 2000 == 1:
                    print("labda = ", labda, " | alpha = ", alpha, " | beta = ", beta,
                          " | l_loss = ", l_loss, " | entropy = ", entropy,
                          " | a_loss = ", a_loss, " | action_distance = ", action_distance)

            if training_started:
                current_path['rewards'].append(r)
                current_path['distance'].append(distance)
                current_path['kl_divergence'].append(kl)
                current_path['lyapunov_error'].append(l_loss)
                current_path['alpha'].append(alpha)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)
                current_path['beta'].append(beta)
                current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnostic = evaluate_training_rollouts(last_training_paths)
                # print(training_diagnostic)
                if training_diagnostic is not None:
                    print("doing training evaluation")
                    eval_diagnostic = training_evaluation(variant, env, policy)
                    [logger.logkv(key, eval_diagnostic[key]) for key in eval_diagnostic.keys()]
                    training_diagnostic.pop('return')
                    [logger.logkv(key, training_diagnostic[key]) for key in training_diagnostic.keys()]
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)

                    string_to_print = ['time_step:', str(global_step), '|']
                    [string_to_print.extend([key, ':', str(eval_diagnostic[key]), '|'])
                     for key in eval_diagnostic.keys()]
                    [string_to_print.extend([key, ':', str(round(training_diagnostic[key], 2)), '|'])
                     for key in training_diagnostic.keys()]
                    print(''.join(string_to_print))

                    logger.dumpkvs()

                    if eval_diagnostic['test_return'] / eval_diagnostic['test_average_length'] <= Min_cost:
                        Min_cost = eval_diagnostic['test_return'] / eval_diagnostic['test_average_length']
                        print("New lowest cost:", Min_cost)
                        policy.save_result(log_path)
                    else:
                        print("cost did not improve.")
                        print("avg cost was ",
                              eval_diagnostic['test_return'] / eval_diagnostic['test_average_length'])
                        print("prev best cost is:", Min_cost)
                        # policy.save_result(log_path)

            if training_started and global_step % (10 * evaluation_frequency) == 0 and global_step > 0:
                policy.save_result(log_path)

            # State update
            _s = s
            s = s_
            store__s = _s.copy()
            store__s[2] = store__s[2] - store__s[0]

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                # print("done at ", j)
                if training_started:
                    last_training_paths.appendleft(current_path)

                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                break

    policy.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
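
# The learning-rate decay applied at the end of each episode above is a simple linear schedule.
# A small sketch of that formula; the helper name `decayed_lr` and the numbers in the comments
# are illustrative assumptions, not values taken from the repo's configs.
def decayed_lr(lr_init, global_step, max_global_steps):
    """Linear decay: roughly the full rate near step 0, approaching zero at max_global_steps."""
    frac = 1.0 - (global_step - 1.0) / max_global_steps
    return lr_init * frac

# e.g. with lr_init = 1e-4 and max_global_steps = 300000:
#   step 1       -> 1.0e-4
#   step 150000  -> ~0.5e-4
#   step 300000  -> ~0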
from pool.pool import Pool

if __name__ == '__main__':
    Pool().run()
def train_v2(variant):
    env_name = variant['env_name']
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    policy_build_fun = get_policy(variant['algorithm_name'])
    policy_params = variant['alg_params']
    disturber_params = variant['disturber_params']

    iter_of_actor_train = policy_params['iter_of_actor_train_per_epoch']
    iter_of_disturber_train = policy_params['iter_of_disturber_train_per_epoch']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params['lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for Lyapunov critic

    if 'Fetch' in env_name or 'Hand' in env_name:
        s_dim = env.observation_space.spaces['observation'].shape[0] \
                + env.observation_space.spaces['achieved_goal'].shape[0] \
                + env.observation_space.spaces['desired_goal'].shape[0]
    else:
        s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    # if disturber_params['process_noise']:
    #     d_dim = disturber_params['noise_dim']
    # else:
    #     d_dim = env_params['disturbance dim']
    d_dim = np.nonzero(disturber_params['disturbance_magnitude'])[0].shape[0]
    disturbance_chanel_list = np.nonzero(disturber_params['disturbance_magnitude'])[0]
    disturber_params['disturbance_chanel_list'] = disturbance_chanel_list

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fun(a_dim, s_dim, d_dim, policy_params)
    disturber = Disturber(d_dim, s_dim, disturber_params)

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': d_dim,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'finite_horizon': policy_params['finite_horizon'],
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_actor_training_paths = deque(maxlen=store_last_n_paths)
    last_disturber_training_paths = deque(maxlen=store_last_n_paths)
    actor_training_started = False
    disturber_training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('ita', policy_params['ita'])
    logger.logkv('energy_decay_rate', disturber_params['energy_decay_rate'])
    logger.logkv('magnitude', disturber_params['disturbance_magnitude'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', policy.target_entropy)

    for iter in range(max_episodes):

        for i in range(iter_of_actor_train):

            current_path = {
                'rewards': [],
                'disturbance_mag': [],
                'a_loss': [],
                'alpha': [],
                'lyapunov_error': [],
                'labda': [],
                'critic_error': [],
                'entropy': [],
            }

            if global_step > max_global_steps:
                break

            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s, True)
                action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2
                disturbance, raw_disturbance = disturber.choose_action(s, j)

                # Run in simulator
                # disturbance = np.array([0])
                disturbance_input = np.zeros([a_dim + s_dim])
                disturbance_input[disturbance_chanel_list] = disturbance
                s_, r, done, info = env.step(action, process_noise=disturbance_input)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                    if info['done'] > 0:
                        done = True

                if actor_training_started:
                    global_step += 1

                if j == max_ep_steps - 1:
                    done = True

                terminal = 1. if done else 0.
                pool.store(s, a, disturbance, raw_disturbance, r, terminal, s_)
                # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

                # Learn
                if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                    actor_training_started = True

                    for _ in range(train_per_cycle):
                        batch = pool.sample(batch_size)
                        labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                            lr_a_now, lr_c_now, lr_l_now, batch)

                if actor_training_started:
                    current_path['rewards'].append(r)
                    current_path['labda'].append(labda)
                    current_path['critic_error'].append(min(c1_loss, c2_loss))
                    current_path['lyapunov_error'].append(l_loss)
                    current_path['alpha'].append(alpha)
                    current_path['entropy'].append(entropy)
                    current_path['a_loss'].append(a_loss)
                    current_path['disturbance_mag'].append(np.linalg.norm(disturbance))

                if actor_training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                    logger.logkv("total_timesteps", global_step)

                    training_diagnostic = evaluate_training_rollouts(last_actor_training_paths)
                    if training_diagnostic is not None:
                        [logger.logkv(key, training_diagnostic[key]) for key in training_diagnostic.keys()]
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)

                        string_to_print = ['Actor training!time_step:', str(global_step), '|']
                        [string_to_print.extend([key, ':', str(round(training_diagnostic[key], 2)), '|'])
                         for key in training_diagnostic.keys()]
                        print(''.join(string_to_print))

                    logger.dumpkvs()

                # State update
                s = s_

                # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
                if done:
                    if actor_training_started:
                        last_actor_training_paths.appendleft(current_path)

                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac  # learning rate for actor
                    lr_c_now = lr_c * frac  # learning rate for critic
                    lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                    break

        if global_step > max_global_steps:
            break

        for i in range(iter_of_disturber_train):

            current_path = {
                'rewards': [],
                'disturbance_mag': [],
                'd_loss': [],
                'alpha': [],
                'disturber_critic_error': [],
                'entropy': [],
            }

            if global_step > max_global_steps:
                break

            s = env.reset()
            if 'Fetch' in env_name or 'Hand' in env_name:
                s = np.concatenate([s[key] for key in s.keys()])

            for j in range(max_ep_steps):
                if Render:
                    env.render()
                a = policy.choose_action(s, True)
                action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2
                disturbance, raw_disturbance = disturber.choose_action(s, j)

                # Run in simulator
                # disturbance = np.array([0])
                s_, r, done, info = env.step(action, disturbance)
                if 'Fetch' in env_name or 'Hand' in env_name:
                    s_ = np.concatenate([s_[key] for key in s_.keys()])
                    if info['done'] > 0:
                        done = True

                if disturber_training_started:
                    global_step += 1

                if j == max_ep_steps - 1:
                    done = True

                terminal = 1. if done else 0.
                pool.store(s, a, disturbance, raw_disturbance, r, terminal, s_)
                # policy.store_transition(s, a, disturbance, r, 0, terminal, s_)

                # Learn
                if pool.memory_pointer > min_memory_size and global_step % disturber_params['steps_per_cycle'] == 0:
                    disturber_training_started = True

                    for _ in range(disturber_params['train_per_cycle']):
                        batch = pool.sample(disturber_params['batch_size'])
                        d_alpha, d_c1_loss, d_c2_loss, d_entropy, d_loss = disturber.learn(
                            lr_a_now, lr_c_now, batch)
                        # d_c1_loss = 0
                        # d_c2_loss = 0
                        # d_loss = 0

                if disturber_training_started:
                    current_path['rewards'].append(r)
                    current_path['disturber_critic_error'].append(min(d_c1_loss, d_c2_loss))
                    current_path['d_loss'].append(d_loss)
                    current_path['alpha'].append(d_alpha)
                    current_path['entropy'].append(d_entropy)
                    current_path['disturbance_mag'].append(np.linalg.norm(disturbance))

                if disturber_training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                    logger.logkv("total_timesteps", global_step)

                    training_diagnostic = evaluate_training_rollouts(last_disturber_training_paths)
                    if training_diagnostic is not None:
                        [logger.logkv(key, training_diagnostic[key]) for key in training_diagnostic.keys()]
                        logger.logkv('lr_a', lr_a_now)
                        logger.logkv('lr_c', lr_c_now)
                        logger.logkv('lr_l', lr_l_now)

                        string_to_print = ['Disturber training!time_step:', str(global_step), '|']
                        [string_to_print.extend([key, ':', str(round(training_diagnostic[key], 2)), '|'])
                         for key in training_diagnostic.keys()]
                        print(''.join(string_to_print))

                    logger.dumpkvs()

                # State update
                s = s_

                # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
                if done:
                    if disturber_training_started:
                        last_disturber_training_paths.appendleft(current_path)

                    frac = 1.0 - (global_step - 1.0) / max_global_steps
                    lr_a_now = lr_a * frac  # learning rate for actor
                    lr_c_now = lr_c * frac  # learning rate for critic
                    lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                    break

        if global_step > max_global_steps:
            break

    policy.save_result(log_path)
    disturber.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
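
# Toy illustration of how train_v2 derives the disturbance channels from the configured
# magnitudes and scatters the learned disturbance into the full (action + state) input vector.
# The magnitude values and dimensions below are made up for illustration.
import numpy as np

disturbance_magnitude = np.array([0.0, 0.1, 0.0, 0.3])          # one entry per input channel
disturbance_chanel_list = np.nonzero(disturbance_magnitude)[0]  # -> array([1, 3])
d_dim = disturbance_chanel_list.shape[0]                        # -> 2

a_dim, s_dim = 2, 2
disturbance = np.array([0.05, -0.2])                            # sample produced by the disturber
disturbance_input = np.zeros([a_dim + s_dim])
disturbance_input[disturbance_chanel_list] = disturbance        # only channels 1 and 3 are perturbed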
def train(variant): env_name = variant["env_name"] env = get_env_from_name(env_name) env_params = variant["env_params"] max_episodes = env_params["max_episodes"] max_ep_steps = env_params["max_ep_steps"] max_global_steps = env_params["max_global_steps"] store_last_n_paths = variant["num_of_training_paths"] evaluation_frequency = variant["evaluation_frequency"] policy_params = variant["alg_params"] policy_params["network_structure"] = env_params["network_structure"] min_memory_size = policy_params["min_memory_size"] steps_per_cycle = policy_params["steps_per_cycle"] train_per_cycle = policy_params["train_per_cycle"] batch_size = policy_params["batch_size"] lr_a, lr_c, lr_l = ( policy_params["lr_a"], policy_params["lr_c"], policy_params["lr_l"], ) lr_a_now = lr_a # learning rate for actor lr_c_now = lr_c # learning rate for critic lr_l_now = lr_l # learning rate for critic if "Fetch" in env_name or "Hand" in env_name: s_dim = (env.observation_space.spaces["observation"].shape[0] + env.observation_space.spaces["achieved_goal"].shape[0] + env.observation_space.spaces["desired_goal"].shape[0]) else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] # if disturber_params['process_noise']: # d_dim = disturber_params['noise_dim'] # else: # d_dim = env_params['disturbance dim'] a_upperbound = env.action_space.high a_lowerbound = env.action_space.low policy = LAC(a_dim, s_dim, policy_params) pool_params = { "s_dim": s_dim, "a_dim": a_dim, "d_dim": 1, "store_last_n_paths": store_last_n_paths, "memory_capacity": policy_params["memory_capacity"], "min_memory_size": policy_params["min_memory_size"], "history_horizon": policy_params["history_horizon"], "finite_horizon": policy_params["finite_horizon"], } if "value_horizon" in policy_params.keys(): pool_params.update({"value_horizon": policy_params["value_horizon"]}) else: pool_params["value_horizon"] = None pool = Pool(pool_params) # For analyse Render = env_params["eval_render"] # Training setting t1 = time.time() global_step = 0 last_training_paths = deque(maxlen=store_last_n_paths) training_started = False log_path = variant["log_path"] logger.configure(dir=log_path, format_strs=["csv"]) logger.logkv("tau", policy_params["tau"]) logger.logkv("alpha3", policy_params["alpha3"]) logger.logkv("batch_size", policy_params["batch_size"]) logger.logkv("target_entropy", policy.target_entropy) for i in range(max_episodes): current_path = { "rewards": [], "a_loss": [], "alpha": [], "lambda": [], "lyapunov_error": [], "entropy": [], } if global_step > max_global_steps: break s = env.reset() if "Fetch" in env_name or "Hand" in env_name: s = np.concatenate([s[key] for key in s.keys()]) for j in range(max_ep_steps): if Render: env.render() a = policy.choose_action(s) action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2 # action = a # Run in simulator disturbance_input = np.zeros([a_dim + s_dim]) s_, r, done, info = env.step(action) if "Fetch" in env_name or "Hand" in env_name: s_ = np.concatenate([s_[key] for key in s_.keys()]) if info["done"] > 0: done = True if training_started: global_step += 1 if j == max_ep_steps - 1: done = True terminal = 1.0 if done else 0.0 pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_) # policy.store_transition(s, a, disturbance, r,0, terminal, s_) if (pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0): training_started = True for _ in range(train_per_cycle): batch = pool.sample(batch_size) labda, alpha, l_loss, entropy, a_loss = policy.learn( lr_a_now, 
lr_c_now, lr_l_now, lr_a, batch) if training_started: current_path["rewards"].append(r) current_path["lyapunov_error"].append(l_loss) current_path["alpha"].append(alpha) current_path["lambda"].append(labda) current_path["entropy"].append(entropy) current_path["a_loss"].append(a_loss) if (training_started and global_step % evaluation_frequency == 0 and global_step > 0): logger.logkv("total_timesteps", global_step) training_diagnostics = evaluate_training_rollouts( last_training_paths) if training_diagnostics is not None: if variant["num_of_evaluation_paths"] > 0: eval_diagnostics = training_evaluation( variant, env, policy) [ logger.logkv(key, eval_diagnostics[key]) for key in eval_diagnostics.keys() ] training_diagnostics.pop("return") [ logger.logkv(key, training_diagnostics[key]) for key in training_diagnostics.keys() ] logger.logkv("lr_a", lr_a_now) logger.logkv("lr_c", lr_c_now) logger.logkv("lr_l", lr_l_now) string_to_print = ["time_step:", str(global_step), "|"] if variant["num_of_evaluation_paths"] > 0: [ string_to_print.extend( [key, ":", str(eval_diagnostics[key]), "|"]) for key in eval_diagnostics.keys() ] [ string_to_print.extend([ key, ":", str(round(training_diagnostics[key], 2)), "|" ]) for key in training_diagnostics.keys() ] print("".join(string_to_print)) logger.dumpkvs() # 状态更新 s = s_ # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY if done: if training_started: last_training_paths.appendleft(current_path) frac = 1.0 - (global_step - 1.0) / max_global_steps lr_a_now = lr_a * frac # learning rate for actor lr_c_now = lr_c * frac # learning rate for critic lr_l_now = lr_l * frac # learning rate for critic break policy.save_result(log_path) print("Running time: ", time.time() - t1) return
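
# For the goal-conditioned Fetch/Hand environments, the loops above flatten the dict observation
# by concatenating its entries in key order. A toy sketch of that flattening; the component
# sizes below (25/3/3) are placeholder assumptions, not the real spaces of any specific env.
import numpy as np

obs = {
    "observation": np.zeros(25),
    "achieved_goal": np.zeros(3),
    "desired_goal": np.zeros(3),
}
s = np.concatenate([obs[key] for key in obs.keys()])
assert s.shape[0] == 25 + 3 + 3  # matches s_dim computed from observation_space.spaces above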
def train(variant):
    Min_cost = 1000000

    data_trajectories = get_data()  # get data (X, W, X_, theta, state)
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params['max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params['max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    s_dim = env.observation_space.shape[0]  # dimension of state (3 for Battery)
    a_dim = env.action_space.shape[0]  # action space dimension (1 or 2)

    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    agent = CAC(a_dim, s_dim, policy_params, max_global_steps=max_global_steps)
    # policy.restore(variant['log_path'] + "/0/policy")

    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon']
    }
    if 'value_horizon' in policy_params.keys():
        pool_params.update({'value_horizon': policy_params['value_horizon']})
    else:
        pool_params['value_horizon'] = None
    pool = Pool(pool_params)

    # For analysis
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', agent.target_entropy)

    for i in range(max_episodes):
        print("episode # ", i)
        print("global steps ", global_step)

        current_path = {
            'rewards': [],
            'distance': [],
            'a_loss': [],
            'alpha': [],
            'lyapunov_error': [],
            'entropy': [],
            'beta': [],
            'action_distance': [],
        }

        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point
        # traj_id = np.random.randint(0, len(data_trajectories))
        traj_id = np.random.randint(0, variant['num_data_trajectories'])
        # traj_id = 0
        traj = data_trajectories[traj_id]
        # print(len(traj))

        start_point = np.random.randint(0, len(traj))
        # start_point = 0
        s = traj[start_point, 1]

        # current state, theta, next w, desired state
        # this is for decision making
        # 16, 1, 4, 16
        s = np.array([s, traj[start_point, 2], traj[start_point, 4]])
        # print(i, s)

        env.state = s
        env.model.state = traj[start_point, -8:]

        ep_steps = min(start_point + 1 + max_ep_steps, len(traj))
        for j in range(start_point + 1, ep_steps):
            if Render:
                env.render()
            delta = np.zeros(3)

            # ###### NOISE ##############
            # noise = np.random.normal(0, 0.01, 0.01)
            # delta[2:] = noise

            # ######## IF Noise env ##########
            # s = s + delta
            # a = policy.choose_action(s)

            # ###### BIAS ##############
            # noise = s[0:16] * 0.01
            # delta[0:16] = noise

            a = agent.act(torch.tensor([s]).float())
            action = a_lowerbound + (a.detach().numpy() + 1.) * (a_upperbound - a_lowerbound) / 2
            # action = traj[j-1, 16]

            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            _, r, done, X_ = env.step(action)
            # The new s = current state, next omega, next state
            s_ = np.array([X_[1][0], traj[j, 2], traj[j, 4]])
            r = modify_reward(r, s, s_, variant['reward_id'])

            if j % 100 == 0:
                print("current state: ", s, "true action: ", traj[j, 5],
                      " predicted action: ", action, " and reward : ", r)

            env.state = s_
            # theta_pre = theta

            if training_started:
                global_step += 1
                agent.scheduler_step()

            if j == max_ep_steps - 1 + start_point:
                done = True

            terminal = 1. if done else 0.

            if j > start_point + 2:
                pool.store(s, a.detach().numpy().flatten(), np.zeros([1]), np.zeros([1]), r, terminal, s_, _s)

            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    alpha_loss, beta_loss, labda_loss, actor_loss, lyapunov_loss = agent.learn(batch)
                if j % 200 == 0:
                    print("labda = ", agent.labda, " | alpha = ", agent.alpha,
                          " | l_loss = ", lyapunov_loss, " | entropy = ", agent.log_pis,
                          " | a_loss = ", actor_loss, " | alpha_loss = ", alpha_loss,
                          " | labda_loss = ", labda_loss)

            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(lyapunov_loss.detach().numpy())
                current_path['alpha'].append(agent.alpha.detach().numpy())
                # current_path['entropy'].append(entropy)  # disabled: agent.learn() does not return an entropy value
                current_path['a_loss'].append(actor_loss.detach().numpy())
                current_path['beta'].append(agent.beta.detach().numpy())
                # current_path['action_distance'].append(action_distance)

            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnostic = evaluate_training_rollouts(last_training_paths)
                # print(training_diagnostic)
                if training_diagnostic is not None:
                    print("doing training evaluation")
                    eval_diagnostic = training_evaluation(variant, env, agent)
                    [logger.logkv(key, eval_diagnostic[key]) for key in eval_diagnostic.keys()]
                    training_diagnostic.pop('return')
                    [logger.logkv(key, training_diagnostic[key]) for key in training_diagnostic.keys()]

                    string_to_print = ['time_step:', str(global_step), '|']
                    [string_to_print.extend([key, ':', str(eval_diagnostic[key]), '|'])
                     for key in eval_diagnostic.keys()]
                    [string_to_print.extend([key, ':', str(round(training_diagnostic[key], 2)), '|'])
                     for key in training_diagnostic.keys()]
                    print(''.join(string_to_print))

                    logger.dumpkvs()

                    if eval_diagnostic['test_return'] / eval_diagnostic['test_average_length'] <= Min_cost:
                        Min_cost = eval_diagnostic['test_return'] / eval_diagnostic['test_average_length']
                        print("New lowest cost:", Min_cost)
                        agent.save_result(log_path)
                    else:
                        print("cost did not improve.")
                        print("The best cost is ", Min_cost)
                        print("avg cost was ",
                              eval_diagnostic['test_return'] / eval_diagnostic['test_average_length'])

            if training_started and global_step % (10 * evaluation_frequency) == 0 and global_step > 0:
                agent.save_result(log_path)

            # State update
            _s = s
            s = s_

            # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                break

    agent.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return