def main():
    map_dir = "/home/i2rlab/shahil_files/shahil_RL_ws_new/src/turtlebot/turtlebot_gazebo/worlds/"
    #os.rename(map_dir+"house_train.world",map_dir+"maze.world")
    rospy.init_node('turtlebot2_human', anonymous=True, log_level=rospy.WARN)
    env = StartOpenAI_ROS_Environment('MyTurtleBot2HumanModel-v1')
    #os.rename(map_dir+"maze.world",map_dir+"house_train.world")
    s_dim = env.observation_space.shape[0] + 4
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high
    #print('a_bound test', a_bound)
    sign_talker = rospy.Publisher('/sign', Int64, queue_size=1)

    ####################### load agent #######################################
    #TD3 = td3(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE, LR_A, LR_C)
    #TD3.loader()
    d = [0.5, 1, 0.5, 1] + list(6 * np.ones([180]))
    #A = np.array([[1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1]])
    #TGREEN = '\033[32m'  # Green Text
    #ENDC = '\033[m'      # reset to the defaults

    #################### Start ####################
    i = 0
    while i < 2:
        s = env.reset()
        #print("env printing: ", env)
        t1 = time.time()
        min_distance_ob = s[-2]
        theta_bt_uh_ob = s[-1]
        #a_old = A[0]
        a_old = np.array([1, 0, 0, 0])
        s = np.hstack((a_old, np.array(s[3:-2]) / d))
        for j in range(MAX_EP_STEPS):
            ################ Mode #########################################
            #if i%2==0:
            #    a = TD3.choose_action(s,1)
            #    a = [1,0,0,0]
            #    if a[1]<=0.9 and min_distance_ob<=0.7 and abs(theta_bt_uh_ob)<1.3:
            #        print(TGREEN + "Going to Crash!", ENDC)
            #        a = A[1]
            '''else: a = A[0]'''
            #x_test = env.state_msg.pose.position.x
            #print('x_test', env.collect_data)
            a = np.array([1, 0, 0, 0])
            s_, r, done, info = env.step(a)
            sign_talker.publish(int(r))
            if r < -200:
                done = 0
            min_distance_ob = s_[-2]
            theta_bt_uh_ob = s_[-1]
            a_old = a
            s = np.hstack((a_old, np.array(s_[3:-2]) / d))
            if done or j == MAX_EP_STEPS - 1:
                t = time.time() - t1
                if j > 3:
                    i = i + 1
                break
def __init__(self, environment_name, n_observations, n_actions, n_episodes=1000,
             n_win_ticks=195, min_episodes=100, max_env_steps=None, gamma=1.0,
             epsilon=1.0, epsilon_min=0.01, epsilon_log_decay=0.995, alpha=0.01,
             alpha_decay=0.01, batch_size=64, monitor=False, quiet=False):
    self.memory = deque(maxlen=100000)
    # self.env = gym.make(environment_name)
    self.env = StartOpenAI_ROS_Environment(environment_name)
    if monitor:
        self.env = gym.wrappers.Monitor(self.env, '../data/cartpole-1', force=True)

    self.input_dim = n_observations
    self.n_actions = n_actions
    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_min = epsilon_min
    self.epsilon_decay = epsilon_log_decay
    self.alpha = alpha
    self.alpha_decay = alpha_decay
    self.n_episodes = n_episodes
    self.n_win_ticks = n_win_ticks
    self.min_episodes = min_episodes
    self.batch_size = batch_size
    self.quiet = quiet
    if max_env_steps is not None:
        self.env._max_episode_steps = max_env_steps

    # Init model
    self.model = Sequential()
    self.model.add(Dense(24, input_dim=self.input_dim, activation='tanh'))
    self.model.add(Dense(48, activation='tanh'))
    self.model.add(Dense(self.n_actions, activation='linear'))
    self.model.compile(loss='mse',
                       optimizer=Adam(lr=self.alpha, decay=self.alpha_decay))
def test(self):
    # Initialize OpenAI_ROS ENV
    LoadYamlFileParamsTest(rospackage_name="learning_ros",
                           rel_path_from_package_to_file="config",
                           yaml_file_name="aliengo_stand.yaml")
    env = StartOpenAI_ROS_Environment('AliengoStand-v0')
    saver = tf.train.Saver()
    time.sleep(3)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, 'model_test/model_ddqn.ckpt')
        scores = []
        success_num = 0
        for e in range(self.EPISODES):
            state = env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            rewards = []
            while not rospy.is_shutdown():  # loop while ROS is running
                #self.env.render()
                # the highest-value action is chosen
                action = np.argmax(self.model.predict(state))
                next_state, reward, done, _ = env.step(action)
                time.sleep(0.01)
                next_state = np.reshape(next_state, [1, self.state_size])
                rewards.append(reward)
                self.remember(state, action, reward, next_state, done)
                i += 1
                if done:
                    # update the target model at the end of each episode
                    self.update_target_model()
                    state = env.reset()
                    break
                else:
                    state = next_state
                self.replay()
            scores.append(sum(rewards))
        plt.plot(scores)
        plt.show()
def main():
    rospy.init_node('parrotdrone_goto_qlearn', anonymous=True, log_level=rospy.WARN)
    task_and_robot_environment_name = rospy.get_param(
        '/drone/task_and_robot_environment_name')
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
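

# --- Usage sketch (not part of the original script): StartOpenAI_ROS_Environment
# returns a standard gym.Env, so the usual reset/step loop applies. The episode
# and step counts below are illustrative assumptions.
def run_random_episodes(env, n_episodes=3, max_steps=100):
    for _ in range(n_episodes):
        obs = env.reset()
        for _ in range(max_steps):
            # sample a random action from the env's action space
            obs, reward, done, info = env.step(env.action_space.sample())
            if done:
                break
    env.close()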
def launch(args):
    # create the ddpg_agent
    task_and_robot_environment_name = rospy.get_param(
        '/fetch/task_and_robot_environment_name')
    # this call registers our task env with openai gym;
    # we don't need anything else from it for now.
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    # env = gym.make(args.env_name)

    # set random seeds for reproducibility
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    np.random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    torch.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())
    if args.cuda:
        torch.cuda.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())

    # get the environment parameters
    env_params = get_env_params(env)

    # create the ddpg agent to interact with the environment
    ddpg_trainer = ddpg_agent(args, env, env_params)
    ddpg_trainer.learn()
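

# --- Sketch only: get_env_params is not shown in this snippet. For a goal-based
# Fetch-style env (dict observations with 'observation' and 'desired_goal'), a
# minimal version in the spirit of common HER/DDPG code could look like this;
# the dict keys and the _max_episode_steps attribute are assumptions.
def get_env_params(env):
    obs = env.reset()
    params = {
        'obs': obs['observation'].shape[0],
        'goal': obs['desired_goal'].shape[0],
        'action': env.action_space.shape[0],
        'action_max': env.action_space.high[0],
        'max_timesteps': env._max_episode_steps,
    }
    return params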
def main():
    task_and_robot_environment_name = rospy.get_param(
        '/fetch/task_and_robot_environment_name')
    # this call registers our task env with openai gym;
    # we don't need anything else from it for now.
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)

    logdir = None
    n_epochs = 50
    num_cpu = 1
    seed = 0
    policy_save_interval = 5
    replay_strategy = "future"
    clip_return = 1

    rospy.logwarn("task_and_robot_environment_name ==>" + str(env))
    rospy.logwarn("HERE")
    rospy.init_node("train_fetch_her")
    launch(task_and_robot_environment_name, logdir, n_epochs, num_cpu, seed,
           replay_strategy, policy_save_interval, clip_return)
from stable_baselines.deepq.policies import MlpPolicy as mlp_dqn
from stable_baselines.sac.policies import MlpPolicy as mlp_sac
from stable_baselines.ddpg.policies import MlpPolicy as mlp_ddpg
from stable_baselines.td3.policies import MlpPolicy as mlp_td3
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from openai_ros.task_envs.cartpole_stay_up import stay_up
from openai_ros.openai_ros_common import StartOpenAI_ROS_Environment

os.chdir('~/catkin_ws/src/openai_examples/cartpole/cartpole3d/scripts/')

rospy.init_node('cartpole3d_trpo', anonymous=True, log_level=rospy.FATAL)

environment_name = rospy.get_param('/cartpole_v0/task_and_robot_environment_name')
env = StartOpenAI_ROS_Environment(environment_name)
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

# The noise objects for TD3
n_actions = env.action_space.n
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model_list = [
    A2C(MlpPolicy, env, verbose=1,
        tensorboard_log="../results/tensorboard_logs/A2C/"),
    ACKTR(MlpPolicy, env,
params = P()
params.goal = ROBOTICS_ENV_LIST[params.env_name]
params.test_episodes = 10

now = datetime.now()
params.log_dir += "{}".format(params.env_name)
params.model_dir += "{}".format(params.env_name)

rospy.init_node("start_her")
task_and_robot_environment_name = rospy.get_param(
    '/fetch/task_and_robot_environment_name')
# this call registers our task env with openai gym;
# we don't need anything else from it for now.
env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)

# params.max_action = env.action_space.high[0]
# params.num_action = env.action_space.shape[0]
# TODO: this is a temp solution..... check openai's fetch implementation!!
params.max_action = 0
params.num_action = 4

# set seed
env.seed(params.seed)
tf.random.set_random_seed(params.seed)

agent = DDPG(Actor, Critic, params.num_action, params)
replay_buffer = ReplayBuffer(params.memory_size)
from stable_baselines.deepq.policies import MlpPolicy as mlp_dqn
from stable_baselines.sac.policies import MlpPolicy as mlp_sac
from stable_baselines.ddpg.policies import MlpPolicy as mlp_ddpg
from stable_baselines.td3.policies import MlpPolicy as mlp_td3
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from openai_ros.task_envs.cartpole_stay_up import stay_up
from openai_ros.openai_ros_common import StartOpenAI_ROS_Environment

os.chdir('~/catkin_ws/src/openai_examples/cartpole/cartpole3d/scripts/')

rospy.init_node('cartpole3d_train_all', anonymous=True, log_level=rospy.FATAL)

environment_name = rospy.get_param('/cartpole_v0/task_and_robot_environment_name')
env = StartOpenAI_ROS_Environment(environment_name)
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

# The noise objects for TD3
n_actions = env.action_space.n
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model_list = [
    A2C(MlpPolicy, env, verbose=1, tensorboard_log="../results/tensorboard_logs/A2C/"),
    ACKTR(MlpPolicy, env, verbose=1, tensorboard_log="../results/tensorboard_logs/ACKTR/"),
    PPO2(MlpPolicy, env, verbose=1, tensorboard_log="../results/tensorboard_logs/PPO2/"),
    TRPO(MlpPolicy, env, verbose=1, tensorboard_log="../results/tensorboard_logs/TRPO/"),
]
algo_list = ['A2C', 'ACKTR', 'PPO2', 'TRPO']
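
# --- Sketch (assumption): a typical continuation trains and saves each model in
# model_list in turn; the timestep budget and save paths are placeholders.
for model, algo_name in zip(model_list, algo_list):
    model.learn(total_timesteps=10000)
    model.save("../results/models/cartpole3d_" + algo_name)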
            # (fragment: end-of-episode bookkeeping inside the SAC training loop)
            agent.writer.add_scalar('Train/q2_loss', agent.q2_loss.item(), ep_num)
            agent.writer.add_scalar('Train/policy_loss', agent.policy_loss.item(), ep_num)
            agent.writer.flush()

            total_rewards.append(ep_reward)
            avg_reward = np.mean(total_rewards[-100:])
            agent.writer.add_scalar('avg_reward', avg_reward, ep_num)
            agent.writer.flush()

            print("Steps:{} Episode:{} Reward:{} Avg Reward:{}".format(
                t, ep_num, ep_reward, avg_reward))

            o, r, d, ep_reward, ep_len = env.reset(), 0, False, 0, 0
            goal_point.position.x, goal_point.position.y, goal_point.position.z = [
                random.uniform(0.3, 0.5) for _ in xrange(3)
            ]
            goal_pub.publish(goal_point)
            ep_num += 1

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t


rospy.init_node('test_RL_PANDA', anonymous=True, log_level=rospy.WARN)
replay_buffer = ReplayBuffer(int(1e6))
env = NormalizedActions(StartOpenAI_ROS_Environment('MyPandaTrainingEnv-v0'))
agent = SAC(env, replay_buffer, hidden_dim=[256, 256])
train(agent)
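
# --- Sketch only: NormalizedActions is not defined in this snippet. It is
# commonly a gym.ActionWrapper that rescales policy outputs from [-1, 1] to the
# env's action bounds, roughly like this:
import gym
import numpy as np

class NormalizedActions(gym.ActionWrapper):
    def action(self, action):
        low = self.action_space.low
        high = self.action_space.high
        # map [-1, 1] -> [low, high] and clip for safety
        action = low + (action + 1.0) * 0.5 * (high - low)
        return np.clip(action, low, high)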
def main():
    rospy.init_node('movingcube_onedisk_walk_gym', anonymous=True, log_level=rospy.WARN)

    # Set the path where the learned model will be saved
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('my_moving_cube_pkg')
    models_dir_path = os.path.join(pkg_path, "models_saved")
    if not os.path.exists(models_dir_path):
        os.makedirs(models_dir_path)
    out_model_file_path = os.path.join(models_dir_path, "movingcube_model.pkl")

    max_timesteps = rospy.get_param("/moving_cube/max_timesteps")
    buffer_size = rospy.get_param("/moving_cube/buffer_size")
    # We convert to float because, when scientific (1e-X) notation is used,
    # the parameter is sometimes read back as a string.
    lr = float(rospy.get_param("/moving_cube/lr"))
    exploration_fraction = rospy.get_param("/moving_cube/exploration_fraction")
    exploration_final_eps = rospy.get_param("/moving_cube/exploration_final_eps")
    print_freq = rospy.get_param("/moving_cube/print_freq")
    reward_task_learned = rospy.get_param("/moving_cube/reward_task_learned")

    def callback(lcl, _glb):
        # stop training once the average reward over the last 100 episodes
        # reaches reward_task_learned
        aux1 = lcl['t'] > 100
        aux2 = sum(lcl['episode_rewards'][-101:-1]) / 100
        is_solved = aux1 and aux2 >= reward_task_learned
        rospy.logdebug("aux1=" + str(aux1))
        rospy.logdebug("aux2=" + str(aux2))
        rospy.logdebug("reward_task_learned=" + str(reward_task_learned))
        rospy.logdebug("IS SOLVED?=" + str(is_solved))
        return is_solved

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param(
        '/moving_cube/task_and_robot_environment_name')
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    rospy.loginfo("Starting Learning")

    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=lr,
        max_timesteps=max_timesteps,
        buffer_size=buffer_size,
        exploration_fraction=exploration_fraction,
        exploration_final_eps=exploration_final_eps,
        print_freq=print_freq,  # how many episodes between printing total rewards and info
        param_noise=False,
        callback=callback)

    env.close()

    rospy.logwarn("Saving model to movingcube_model.pkl")
    act.save(out_model_file_path)
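

# --- Sketch (assumption): the saved model can later be reloaded with baselines'
# deepq.load and run greedily; the path and episode count are placeholders.
def enjoy(env, model_path, n_episodes=5):
    act = deepq.load(model_path)
    for _ in range(n_episodes):
        obs, done = env.reset(), False
        while not done:
            # act expects a batch dimension and returns a batch of actions
            obs, reward, done, _ = env.step(act(obs[None])[0])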
def main():
    name = input('Please input your exp number:')
    folder = "Human_exp_new_" + str(name)
    os.system("mkdir " + folder)

    ####################### init environment ########################################
    map_dir = "/home/i2rlab/shahil_files/shahil_RL_ws_new/src/turtlebot/turtlebot_gazebo/worlds/"
    #os.rename(map_dir+"house_3.world",map_dir+"maze.world")
    rospy.init_node('turtlebot2_human', anonymous=True, log_level=rospy.WARN)
    env = StartOpenAI_ROS_Environment('MyTurtleBot2HumanModel-v1')
    #os.rename(map_dir+"maze.world",map_dir+"house_3.world")
    s_dim = env.observation_space.shape[0] + 4
    a_dim = env.action_space.shape[0]
    a_bound = env.action_space.high
    sign_talker = rospy.Publisher('/sign', Int64, queue_size=1)

    ####################### load agent #######################################
    TD3 = td3(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE, LR_A, LR_C)
    TD3.loader()
    # ddpg = DDPG(a_dim, s_dim, a_bound, GAMMA, TAU, MEMORY_CAPACITY, BATCH_SIZE, LR_A, LR_C)
    # with ddpg.sess.as_default():
    #     with ddpg.g.as_default():
    #         ddpg.loader()
    d = [0.5, 1, 0.5, 1] + list(6 * np.ones([180]))
    A = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
    TGREEN = '\033[32m'  # Green Text
    ENDC = '\033[m'      # reset to the defaults

    #################### Initialize Data #################################
    n_h = 0              # number of eps of pure human control
    n_s_TD3 = 0          # number of eps of shared control
    # n_s_ddpg = 0
    n_crash_h = 0        # number of crashes in pure human control
    n_crash_s_TD3 = 0    # number of crashes in shared control
    # n_crash_s_ddpg = 0
    n_success_h = 0      # number of successes of pure human control
    n_success_s_TD3 = 0  # number of successes of shared control
    # n_success_s_ddpg = 0
    t_h = 0              # total time of pure human control
    t_s_TD3 = 0          # total time of shared control
    # t_s_ddpg = 0

    #################### SART #####################
    SA_h = 0
    SA_s_TD3 = 0
    # SA_s_ddpg = 0

    #################### workload #################
    WL_h = 0
    WL_s_TD3 = 0
    # WL_s_ddpg = 0

    #################### Start ####################
    i = 0
    total_ep = 2
    sequence_ep = [0, 1]
    #np.random.shuffle(sequence_ep)
    pureh_ep = sequence_ep[0:int(total_ep / 2)]
    # ddpg_ep = sequence_ep[int(total_ep/3):int(2*total_ep/3)]
    td3_ep = sequence_ep[int(total_ep / 2):]
    n_d = 0
    while i < total_ep:
        s = env.reset()
        print(s)
        t1 = time.time()
        min_distance_ob = s[-2]
        theta_bt_uh_ob = s[-1]
        a_old = A[0]
        s = np.hstack((a_old, np.array(s[3:-2]) / d))
        done = 0
        while not done:
            ################ Mode #########################################
            if i in td3_ep:
                a = TD3.choose_action(s, 1)
                if a[1] <= 0.9 and min_distance_ob <= 0.7 and abs(theta_bt_uh_ob) < 1.3:
                    print(TGREEN + "Going to Crash!", ENDC)
                    a = A[1]
            elif i in pureh_ep:
                a = A[0]
            s_, r, done, info = env.step(a)
            sign_talker.publish(int(r))
            if r < -400 and done:
                done = 0
                if i in td3_ep:
                    n_crash_s_TD3 += 1
                elif i in pureh_ep:
                    n_crash_h += 1
                print("n_crash_s_TD3:", n_crash_s_TD3)
                print("n_crash_h:", n_crash_h)
                print("i:", i)
                print("n_d:", n_d)
            min_distance_ob = s_[-2]
            theta_bt_uh_ob = s_[-1]
            a_old = a
            s = np.hstack((a_old, np.array(s_[3:-2]) / d))
            if done:
                t = 0
                n_d = n_d + 1
                if n_d % 5 == 0:
                    instability = int(input('How changeable is the situation (1: stable and straightforward -- 7: changing suddenly):'))
                    variability = int(input('How many variables are changing within the situation (1: very few -- 7: large number):'))
                    complexity = int(input('How complicated is the situation (1: simple -- 7: complex):'))
                    arousal = int(input('How aroused are you in the situation (1: low degree of alertness -- 7: alert and ready for activity):'))
                    spare = int(input('How much mental capacity do you have to spare in the situation (1: nothing to spare -- 7: sufficient):'))
                    concentration = int(input('How much are you concentrating on the situation (1: focus on only one thing -- 7: many aspects):'))
                    attention = int(input('How much is your attention divided in the situation (1: focus on only one thing -- 7: many aspects):'))
                    quantity = int(input('How much information have you gained about the situation (1: little -- 7: much):'))
                    quality = int(input('How accessible and usable is the information you have gained (1: poor -- 7: good):'))
                    familiarity = int(input('How familiar are you with the situation (1: new situation -- 7: a great deal of relevant experience):'))
                    SA = quantity + quality + familiarity - (
                        (instability + variability + complexity)
                        - (arousal + spare + concentration + attention))
                    if i in td3_ep:
                        SA_s_TD3 += SA
                        WL_s_TD3 += float(input('Please input your workload (from TLX):'))
                        t_s_TD3 += t
                        n_s_TD3 += 1
                        if r > 500:
                            n_success_s_TD3 += 1
                    elif i in pureh_ep:
                        SA_h += SA
                        WL_h += float(input('Please input your workload (from TLX):'))
                        t_h += t
                        n_h += 1
                        if r > 500:
                            n_success_h += 1
                    i = i + 1
                break

    np.savetxt('data.dat',
               np.array([[n_s_TD3, t_s_TD3, n_crash_s_TD3, n_success_s_TD3, SA_s_TD3, WL_s_TD3],
                         [n_h, t_h, n_crash_h, n_success_h, SA_h, WL_h]]))
    #,[n_s_ddpg,t_s_ddpg,n_crash_s_ddpg,n_success_s_ddpg, SA_s_ddpg, WL_s_ddpg]]))
    ########### shared_TD3: number of eps, total time, number of crashes, number of successes, situation awareness, workload ###########
    ########### human: number of eps, total time, number of crashes, number of successes, situation awareness, workload ###########
    ########### shared_TD3_ddpg: number of eps, total time, number of crashes, number of successes, situation awareness, workload ###########
    os.system('cp -r data.dat ' + folder + '/')
class DQNRobotSolver():
    def __init__(self, environment_name, n_observations, n_actions, n_episodes=1000,
                 n_win_ticks=195, min_episodes=100, max_env_steps=None, gamma=1.0,
                 epsilon=1.0, epsilon_min=0.01, epsilon_log_decay=0.995, alpha=0.01,
                 alpha_decay=0.01, batch_size=64, monitor=False, quiet=False):
        self.memory = deque(maxlen=100000)
        # self.env = gym.make(environment_name)
        self.env = StartOpenAI_ROS_Environment(environment_name)
        if monitor:
            self.env = gym.wrappers.Monitor(self.env, '../data/cartpole-1', force=True)

        self.input_dim = n_observations
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_log_decay
        self.alpha = alpha
        self.alpha_decay = alpha_decay
        self.n_episodes = n_episodes
        self.n_win_ticks = n_win_ticks
        self.min_episodes = min_episodes
        self.batch_size = batch_size
        self.quiet = quiet
        if max_env_steps is not None:
            self.env._max_episode_steps = max_env_steps

        # Init model
        self.model = Sequential()
        self.model.add(Dense(24, input_dim=self.input_dim, activation='tanh'))
        self.model.add(Dense(48, activation='tanh'))
        self.model.add(Dense(self.n_actions, activation='linear'))
        self.model.compile(loss='mse',
                           optimizer=Adam(lr=self.alpha, decay=self.alpha_decay))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state, epsilon):
        return self.env.action_space.sample() if (np.random.random() <= epsilon) \
            else np.argmax(self.model.predict(state))

    def get_epsilon(self, t):
        return max(self.epsilon_min,
                   min(self.epsilon, 1.0 - math.log10((t + 1) * self.epsilon_decay)))

    def preprocess_state(self, state):
        return np.reshape(state, [1, self.input_dim])

    def replay(self, batch_size):
        x_batch, y_batch = [], []
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state)
            y_target[0][action] = reward if done else \
                reward + self.gamma * np.max(self.model.predict(next_state)[0])
            x_batch.append(state[0])
            y_batch.append(y_target[0])

        self.model.fit(np.array(x_batch), np.array(y_batch),
                       batch_size=len(x_batch), verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def run(self):
        rate = rospy.Rate(30)

        scores = deque(maxlen=100)

        for e in range(self.n_episodes):
            init_state = self.env.reset()
            state = self.preprocess_state(init_state)
            done = False
            i = 0
            while not done:
                # openai_ros doesn't support render for the moment
                #self.env.render()
                action = self.choose_action(state, self.get_epsilon(e))
                next_state, reward, done, _ = self.env.step(action)
                next_state = self.preprocess_state(next_state)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1

            scores.append(i)
            mean_score = np.mean(scores)
            if mean_score >= self.n_win_ticks and e >= self.min_episodes:
                if not self.quiet:
                    print('Ran {} episodes. Solved after {} trials'.format(
                        e, e - self.min_episodes))
                return e - self.min_episodes
            if e % 1 == 0 and not self.quiet:
                print('[Episode {}] - Mean survival time over last {} episodes was {} ticks.'.format(
                    e, self.min_episodes, mean_score))

            self.replay(self.batch_size)

        if not self.quiet:
            print('Did not solve after {} episodes'.format(e))
        return e
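

# --- Usage sketch: instantiating the solver defined above. The environment name
# and dimensions here are illustrative assumptions, not values from the source.
if __name__ == '__main__':
    rospy.init_node('dqn_robot_solver', anonymous=True, log_level=rospy.WARN)
    agent = DQNRobotSolver(environment_name='CartPoleStayUp-v0',
                           n_observations=4,
                           n_actions=2,
                           n_episodes=1000)
    agent.run()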
def main():
    # Initialize OpenAI_ROS ENV
    LoadYamlFileParamsTest(rospackage_name="learning_ros",
                           rel_path_from_package_to_file="config",
                           yaml_file_name="aliengo_stand.yaml")
    env = StartOpenAI_ROS_Environment('AliengoStand-v0')
    time.sleep(3)

    # Initialize PPO agent
    Policy = Policy_net('policy', ob_space=3, act_space=8)
    Old_Policy = Policy_net('old_policy', ob_space=3, act_space=8)
    PPO = PPOTrain(Policy, Old_Policy, gamma=GAMMA)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        obs = env.reset()
        reward = 0
        success_num = 0
        scores = []

        for iteration in range(ITERATION):
            observations = []
            actions = []
            v_preds = []
            rewards = []

            while not rospy.is_shutdown():  # loop while ROS is running
                # prepare to feed placeholder Policy.obs
                obs = np.stack([obs]).astype(dtype=np.float32)
                act, v_pred = Policy.act(obs=obs, stochastic=True)
                #print('act: ', act, 'v_pred: ', v_pred)
                act = np.asscalar(act)
                v_pred = np.asscalar(v_pred)

                observations.append(obs)
                actions.append(act)
                v_preds.append(v_pred)
                rewards.append(reward)

                # execute according to the action
                next_obs, reward, done, _ = env.step(act)
                time.sleep(0.01)

                if done:
                    # the successor of a terminal state has 0 state value
                    v_preds_next = v_preds[1:] + [0]
                    obs = env.reset()
                    break
                else:
                    obs = next_obs

            # store scores for visualization
            scores.append(sum(rewards))

            # if the reward stays high for enough consecutive iterations, end the training
            if sum(rewards) >= 400:
                success_num += 1
                print("Success number: " + str(success_num))
                if success_num >= 5:
                    saver.save(sess, 'model_train/model_ppo.ckpt')
                    print('Clear!! Model saved.')
                if success_num >= 10:
                    saver.save(sess, 'model_train/model_ppo.ckpt')
                    print('Finished!')
                    break
            else:
                success_num = 0

            gaes = PPO.get_gaes(rewards=rewards, v_preds=v_preds, v_preds_next=v_preds_next)

            # convert lists to numpy arrays for feeding tf.placeholder
            observations = np.reshape(observations, [len(observations), 3])
            actions = np.array(actions).astype(dtype=np.int32)
            rewards = np.array(rewards).astype(dtype=np.float32)
            v_preds_next = np.array(v_preds_next).astype(dtype=np.float32)

            # center the generalized advantage estimates
            gaes = np.array(gaes).astype(dtype=np.float32)
            gaes = (gaes - gaes.mean())
            print('gaes', gaes)

            # assign current policy params to previous policy params
            PPO.assign_policy_parameters()

            inp = [observations, actions, rewards, v_preds_next, gaes]

            # PPO train
            for epoch in range(4):
                sample_indices = np.random.randint(low=0, high=observations.shape[0],
                                                   size=64)  # indices are in [low, high)
                sampled_inp = [np.take(a=a, indices=sample_indices, axis=0)
                               for a in inp]  # sample training data
                PPO.train(obs=sampled_inp[0],
                          actions=sampled_inp[1],
                          rewards=sampled_inp[2],
                          v_preds_next=sampled_inp[3],
                          gaes=sampled_inp[4])

        plt.plot(scores)
        plt.show()
        env.stop()
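

# --- Sketch only: PPO.get_gaes is not shown here. Generalized advantage
# estimation is commonly computed from TD residuals like this; the gamma and
# lambda defaults are assumptions.
def get_gaes(rewards, v_preds, v_preds_next, gamma=0.95, lam=0.95):
    # one-step TD residuals
    deltas = [r + gamma * v_next - v
              for r, v_next, v in zip(rewards, v_preds_next, v_preds)]
    gaes = list(deltas)
    # discounted cumulative sum of residuals, computed backwards in time
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return gaes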
experience_replay_buffer = ReplayMemory()
episode_rewards = np.zeros(num_episodes)
episode_lens = np.zeros(num_episodes)
epsilon = 1.0
epsilon_min = 0.1
epsilon_change = (epsilon - epsilon_min) / 500000

rospy.init_node('sumo_dqlearn', anonymous=True, log_level=rospy.WARN)
episode_counter_pub = rospy.Publisher('/episode_counter', Int16)

# Init OpenAI_ROS ENV
task_and_robot_environment_name = rospy.get_param(
    '/turtlebot2/task_and_robot_environment_name')
env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
rospy.loginfo("Gym environment done")
obs = env.reset()
print(obs.shape)

# Create the Gym environment
rospy.loginfo("Gym environment done")
rospy.loginfo("Starting Learning")

# Set the logging system
rospack = rospkg.RosPack()
pkg_path = rospack.get_path('gazebo_sumo')
#outdir = pkg_path + '/training_results'
#env = wrappers.Monitor(env, outdir, force=True)
rospy.loginfo("Monitor Wrapper started")
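
# --- Sketch (assumption): epsilon_change above is usually applied as a linear
# per-step decay, paired with epsilon-greedy action selection such as:
def epsilon_greedy(q_values, epsilon):
    # q_values: 1-D array of action values for the current observation
    if np.random.random() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))

# inside the training loop, after each environment step:
#     epsilon = max(epsilon - epsilon_change, epsilon_min)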
def run(self):
    # Initialize OpenAI_ROS ENV
    LoadYamlFileParamsTest(rospackage_name="learning_ros",
                           rel_path_from_package_to_file="config",
                           yaml_file_name="aliengo_stand.yaml")
    env = StartOpenAI_ROS_Environment('AliengoStand-v0')
    saver = tf.train.Saver()
    time.sleep(3)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        scores = []
        success_num = 0
        for e in range(self.EPISODES):
            state = env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            rewards = []
            while not rospy.is_shutdown():  # loop while ROS is running
                #self.env.render()
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                time.sleep(0.01)
                next_state = np.reshape(next_state, [1, self.state_size])
                rewards.append(reward)
                self.remember(state, action, reward, next_state, done)
                if done:
                    # update the target model at the end of each episode
                    self.update_target_model()
                    state = env.reset()
                    break
                else:
                    state = next_state
                self.replay()

            # if the reward stays high for enough consecutive episodes, end the training
            scores.append(sum(rewards))
            if sum(rewards) >= 400:
                success_num += 1
                print("Success number: " + str(success_num))
                if success_num >= 5:
                    # checkpoint
                    if (ddqn):
                        saver.save(sess, 'model_train/model_ddqn.ckpt')
                    else:
                        saver.save(sess, 'model_train/model_dqn.ckpt')
                    print('Clear!! Model saved.')
                if success_num >= 10:
                    if (ddqn):
                        saver.save(sess, 'model_train/model_ddqn.ckpt')
                    else:
                        saver.save(sess, 'model_train/model_dqn.ckpt')
                    print('Clear!! Model saved. AND Finished!')
                    break
            else:
                success_num = 0

        plt.plot(scores)
        plt.show()
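

# --- Sketch only: update_target_model is not shown in this snippet. With Keras
# models, a hard target-network update is typically a straight weight copy:
def update_target_model(model, target_model):
    target_model.set_weights(model.get_weights())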
        # (fragment: tail of a model-loading method)
        self.model = model_from_yaml(loaded_model_yaml)
        # load weights into new model
        self.model.load_weights(model_name_HDF5_format_path)
        print("Loaded model from disk")


if __name__ == '__main__':

    rospy.init_node('fetch_n1try_algorithm', anonymous=True, log_level=rospy.WARN)

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param(
        '/sia_fetch/task_and_robot_environment_name')
    rospy.logwarn("task_and_robot_environment_name ==>" + str(task_and_robot_environment_name))
    openai_ros_env_object = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    #print("########################## task_and_robot_environment_name ########################################")
    #print(task_and_robot_environment_name)

    rospy.loginfo("Starting Learning")

    rospackage_name = "fetch_train"
    model_name = "sia_fetch_n1try"

    n_observations = rospy.get_param('/sia_fetch/n_observations')
    n_actions = rospy.get_param('/sia_fetch/n_actions')

    n_episodes_training = rospy.get_param('/sia_fetch/episodes_training')
    n_episodes_running = rospy.get_param('/sia_fetch/episodes_running')
    n_win_ticks = rospy.get_param('/sia_fetch/n_win_ticks')
            # (fragment: tail of a test/evaluation function)
            ep_reward += reward

        rospy.logwarn("Testing: Cumulative Reward of this episode: ")
        rospy.logwarn(ep_reward)
        writer.add_scalar("Test_Cumulative_Rewards", ep_reward, global_step=test_global_step)
    return ep_reward, test_global_step


if __name__ == '__main__':

    rospy.init_node('example_turtlebot2_maze_dqn', anonymous=True, log_level=rospy.WARN)

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param('/turtlebot2/task_and_robot_environment_name')

    # Create the Gym environment
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)
    rospy.loginfo("Gym environment done")
    rospy.loginfo("Starting Learning")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('turtle2_openai_ros_example')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = np.ndarray(0)

    """
    # Loads parameters from the ROS param server
from gym import wrappers
# ROS packages required
import rospy
import rospkg
from openai_ros.openai_ros_common import StartOpenAI_ROS_Environment


if __name__ == '__main__':

    rospy.init_node('example_turtlebot2_maze_qlearn', anonymous=True, log_level=rospy.WARN)

    # Init OpenAI_ROS ENV
    task_and_robot_environment_name = rospy.get_param(
        '/turtlebot2/task_and_robot_environment_name')
    env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)

    # Create the Gym environment
    rospy.loginfo("Gym environment done")
    rospy.loginfo("Starting Learning")

    # Set the logging system
    rospack = rospkg.RosPack()
    pkg_path = rospack.get_path('turtle2_openai_ros_example')
    outdir = pkg_path + '/training_results'
    env = wrappers.Monitor(env, outdir, force=True)
    rospy.loginfo("Monitor Wrapper started")

    last_time_steps = numpy.ndarray(0)

    # Loads parameters from the ROS param server
    # Parameters are stored in a yaml file inside the config directory
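
    # --- Sketch (assumption): the parameter loading referenced in the comments
    # above usually continues like this; the exact keys under /turtlebot2 are
    # assumptions, not values taken from this snippet.
    Alpha = rospy.get_param("/turtlebot2/alpha")
    Epsilon = rospy.get_param("/turtlebot2/epsilon")
    Gamma = rospy.get_param("/turtlebot2/gamma")
    epsilon_discount = rospy.get_param("/turtlebot2/epsilon_discount")
    nepisodes = rospy.get_param("/turtlebot2/nepisodes")
    nsteps = rospy.get_param("/turtlebot2/nsteps")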
# RL packages required
import tensorflow as tf
import cv2
from ForgER.agent import *
from ForgER.replay_buff import AggregatedBuff
from ForgER.model import get_network_builder

rospy.init_node('example_deepsoccer_soccer_qlearn', anonymous=True, log_level=rospy.WARN)

task_and_robot_environment_name = rospy.get_param('/deepsoccer/task_and_robot_environment_name')
save_path = rospy.get_param('/deepsoccer/save_path')
save_file = rospy.get_param('/deepsoccer/save_file')

env = StartOpenAI_ROS_Environment(task_and_robot_environment_name)


def get_dtype_dict(env):
    action_shape = env.action_space.shape
    action_shape = action_shape if len(action_shape) > 0 else 1
    action_dtype = env.action_space.dtype
    action_dtype = 'int32' if np.issubdtype(action_dtype, int) else action_dtype
    action_dtype = 'float32' if np.issubdtype(action_dtype, float) else action_dtype
    env_dict = {'action': {'shape': action_shape, 'dtype': action_dtype},
                'reward': {'dtype': 'float32'},
                'done': {'dtype': 'bool'},
                'n_reward': {'dtype': 'float32'},
                'n_done': {'dtype': 'bool'},
                'actual_n': {'dtype': 'float32'},
                'demo': {'dtype': 'float32'},