def learn( network, env, data_path='', model_path='./model/', model_name='ddpg_none_fuzzy_150', file_name='test', model_based=False, memory_extend=False, model_type='linear', restore=False, dyna_learning=False, seed=None, nb_epochs=5, # with default settings, perform 1M steps total nb_sample_cycle=5, nb_epoch_cycles=150, nb_rollout_steps=400, nb_model_learning=10, nb_sample_steps=50, nb_samples_extend=5, reward_scale=1.0, noise_type='normal_0.2', #'adaptive-param_0.2', ou_0.2, normal_0.2 normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, batch_size=32, # per MPI worker tau=0.01, param_noise_adaption_interval=50, **network_kwargs): nb_actions = env.action_space.shape[0] memory = Memory(limit=int(1e5), action_shape=env.action_space.shape[0], observation_shape=env.observation_space.shape) if model_based: """ store fake_data""" fake_memory = Memory(limit=int(1e5), action_shape=env.action_space.shape[0], observation_shape=env.observation_space.shape) """ select model or not """ if model_type == 'gp': kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) dynamic_model = GaussianProcessRegressor(kernel=kernel) reward_model = GaussianProcessRegressor(kernel=kernel) elif model_type == 'linear': dynamic_model = LinearRegression() reward_model = LinearRegression() elif model_type == 'mlp': dynamic_model = MLPRegressor(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08) reward_model = MLPRegressor(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08) else: logger.info( "You need to give the model_type to fit the dynamic and reward!!!" ) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) """ set noise """ action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) """action scale""" max_action = env.action_high_bound logger.info( 'scaling actions by {} before executing in env'.format(max_action)) """ agent ddpg """ agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape[0], gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) sess = U.get_session() if restore: agent.restore(sess, model_path, model_name) else: agent.initialize(sess) sess.graph.finalize() agent.reset() episodes = 0 epochs_rewards = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32) epochs_times = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32) epochs_steps = np.zeros((nb_epochs, nb_epoch_cycles), dtype=np.float32) epochs_states = [] for epoch in range(nb_epochs): logger.info( "======================== The {} epoch start !!! =========================" .format(epoch)) epoch_episode_rewards = [] epoch_episode_steps = [] epoch_episode_times = [] epoch_actions = [] epoch_episode_states = [] epoch_qs = [] epoch_episodes = 0 for cycle in range(nb_epoch_cycles): start_time = time.time() obs, state, done = env.reset() obs_reset = cp.deepcopy(obs) episode_reward = 0. episode_step = 0 episode_states = [] logger.info( "================== The {} episode start !!! ===================" .format(cycle)) for t_rollout in range(nb_rollout_steps): logger.info( "================== The {} steps finish !!! ===================" .format(t_rollout)) """ Predict next action """ action, q, _, _ = agent.step(obs, stddev, apply_noise=True, compute_Q=True) new_obs, next_state, r, done, safe_or_not, final_action = env.step( max_action * action, t_rollout) if safe_or_not is False: break episode_reward += r episode_step += 1 episode_states.append([ cp.deepcopy(state), cp.deepcopy(final_action), np.array(cp.deepcopy(r)), cp.deepcopy(next_state) ]) epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) obs = new_obs state = next_state if done: break """ extend the memory """ if model_based and cycle > (nb_model_learning + 1) and memory_extend: pred_x = np.zeros((1, 18), dtype=np.float32) for j in range(nb_samples_extend): m_action, _, _, _ = agent.step(obs, stddev, apply_noise=True, compute_Q=False) pred_x[:, :12] = obs pred_x[:, 12:] = m_action m_new_obs = dynamic_model.predict(pred_x)[0] """ get real reward """ # state = env.inverse_state(m_new_obs) # m_reward = env.get_reward(state, m_action) m_reward = reward_model.predict(pred_x)[0] agent.store_transition(obs, m_action, m_reward, m_new_obs, done) """ generate new data and fit model""" if model_based and cycle > nb_model_learning: logger.info( "============================== Model Fit !!! ===============================" ) input_x = np.concatenate( (memory.observations0.data[:memory.nb_entries], memory.actions.data[:memory.nb_entries]), axis=1) input_y_obs = memory.observations1.data[:memory.nb_entries] input_y_reward = memory.rewards.data[:memory.nb_entries] dynamic_model.fit(input_x, input_y_obs) reward_model.fit(input_x, input_y_reward) if dyna_learning: logger.info( "========================= Collect data !!! =================================" ) pred_obs = np.zeros((1, 18), dtype=np.float32) for sample_index in range(nb_sample_cycle): fake_obs = obs_reset for t_episode in range(nb_sample_steps): fake_action, _, _, _ = agent.step(fake_obs, stddev, apply_noise=True, compute_Q=False) pred_obs[:, :12] = fake_obs pred_obs[:, 12:] = fake_action next_fake_obs = dynamic_model.predict(pred_obs)[0] fake_reward = reward_model.predict(pred_obs)[0] # next_fake_obs = dynamic_model.predict(np.concatenate((fake_obs, fake_action)))[0] # fake_reward = reward_model.predict(np.concatenate((fake_obs, fake_action)))[0] fake_obs = next_fake_obs fake_terminals = False fake_memory.append(fake_obs, fake_action, fake_reward, next_fake_obs, fake_terminals) """ noise decay """ stddev = float(stddev) * 0.95 duration = time.time() - start_time epoch_episode_rewards.append(episode_reward) epoch_episode_steps.append(episode_step) epoch_episode_times.append(cp.deepcopy(duration)) epoch_episode_states.append(cp.deepcopy(episode_states)) epochs_rewards[epoch, cycle] = episode_reward epochs_steps[epoch, cycle] = episode_step epochs_times[epoch, cycle] = cp.deepcopy(duration) logger.info( "============================= The Episode_Times:: {}!!! ============================" .format(epoch_episode_rewards)) logger.info( "============================= The Episode_Times:: {}!!! ============================" .format(epoch_episode_times)) epoch_episodes += 1 episodes += 1 """ Training process """ epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): logger.info("") # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() """ planning training """ if model_based and cycle > (nb_model_learning + 1) and dyna_learning: for t_train in range(nb_train_steps): # setting for adapt param noise, if necessary. if fake_memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) batch = fake_memory.sample(batch_size=batch_size) fake_cl, fake_al = agent.train_fake_data(batch) epoch_critic_losses.append(fake_cl) epoch_actor_losses.append(fake_al) agent.update_target_net() epochs_states.append(cp.deepcopy(epoch_episode_states)) # # save data np.save( data_path + 'train_reward_' + algorithm_name + '_' + noise_type + file_name, epochs_rewards) np.save( data_path + 'train_step_' + algorithm_name + '_' + noise_type + file_name, epochs_steps) np.save( data_path + 'train_states_' + algorithm_name + '_' + noise_type + file_name, epochs_states) np.save( data_path + 'train_times_' + algorithm_name + '_' + noise_type + file_name, epochs_times) # # agent save agent.store(model_path + 'train_model_' + algorithm_name + '_' + noise_type + file_name)
def learn( network, env, path='', model_path='./model/', model_name='ddpg_none_fuzzy', restore=False, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=50, reward_scale=1.0, render=False, render_eval=False, noise_type=None, #'adaptive-param_0.2' normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=32, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, **network_kwargs): if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 1 rank = MPI.COMM_WORLD.Get_rank() nb_actions = env.action_space.n # assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. memory = Memory(limit=int(1e5), action_shape=env.action_space.n, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None nb_actions = env.action_space.n if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise( mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action = env.max_action logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.n, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() if restore: agent.restore(sess, model_path, model_name) else: # Prepare everything. agent.initialize(sess) sess.graph.finalize() # if eval_env is not None: # eval_obs = eval_env.reset() # nenvs = obs.shape[0] # episode_reward = np.zeros(nenvs, dtype=np.float32) #vector # episode_step = np.zeros(nenvs, dtype=int) # vector agent.reset() episodes = 0 t = 0 epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_episode_states = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. # if nenvs > 1: # # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # # of the environments, so resetting here instead # agent.reset() obs, done = env.reset() episode_reward = 0. episode_step = 0 episode_states = [] for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step(max_action * action, t_rollout) # tion in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 episode_states.append(cp.deepcopy(obs)) # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. obs = new_obs # for d in range(len(done)): # if done[d]: # # Episode done. # epoch_episode_rewards.append(episode_reward[d]) # episode_rewards_history.append(episode_reward[d]) # epoch_episode_steps.append(episode_step[d]) # episode_reward[d] = 0. # episode_step[d] = 0 # epoch_episodes += 1 # episodes += 1 # if nenvs == 1: # agent.reset() if done: # Episode done. epoch_episode_rewards.append(episode_reward) episode_rewards_history.append(episode_reward) epoch_episode_steps.append(episode_step) epoch_episode_states.append(cp.deepcopy(episode_states)) # episode_reward = 0. # episode_step = 0. epoch_episodes += 1 episodes += 1 # if nenvs == 1: # agent.reset() break # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step( max_action * eval_action ) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append( eval_episode_reward[d]) eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean( epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean( eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = MPI.COMM_WORLD.allreduce( np.array( [np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = { k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums) } # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() # save data np.save(path + 'train_reward_none_fuzzy', epoch_episode_rewards) np.save(path + 'train_step_none_fuzzy', epoch_episode_steps) np.save(path + 'train_states_none_fuzzy', epoch_episode_states) # agent save agent.store(model_path + model_name)