def main():
    experiment = 'model-builder-v0'  # specify environment here
    env = gym.make(experiment)
    # steps = env.spec.timestep_limit  # steps per episode
    steps = 20
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
def main():
    experiment = 'InvertedPendulum-v1'  # specify environment here
    env = gym.make(experiment)
    steps = env.spec.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
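# Every main loop in this collection drives exploration through an `OUNoise`
# object with the same small interface: OUNoise(action_dim, ...), .noise(),
# and .reset(). The original helper is not included here; the class below is
# only a minimal sketch of a standard Ornstein-Uhlenbeck process, and the
# default parameters (mu=0.0, theta=0.15, sigma=0.2) are assumptions, not
# necessarily the values used by these scripts.
import numpy as np

class OUNoise:
    """Minimal Ornstein-Uhlenbeck exploration-noise sketch (assumed parameters)."""

    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # Re-center the process on its mean; the loops call this at episode end.
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # One Euler step of dx = theta * (mu - x) + sigma * dW.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state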
def main():
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE, CA_ACTION_BOUND)
    exploration_noise = OUNoise(CA_ACTION_SPACE)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = CA_OBS_SPACE
    num_actions = CA_ACTION_SPACE
    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        # observation = env.reset()
        observation = ca_reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            # env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            # observation, reward, done, info = env.step(action)
            observation, reward, done, info = ca_step(action)
            print x, observation, action, reward, done

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
def main():
    '''main function'''
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(ACTORNET_PRE_TRAINED, STATENET_PRE_TRAINED)
    exploration_noise = OUNoise(ACTION_DIM)
    # saving reward:
    reward_st = np.array([0])

    img_server = ImgServer()
    img_server.wait_for_connect()
    observe_t_img = img_server.receive_img()
    observe_t_data = convert_img2data(observe_t_img)
    actor_t = agent.evaluate_actor(observe_t_data)
    noise = exploration_noise.noise()
    actor_t = actor_t[0] + noise
    img_server.send_actor_cmd(actor_t)
    observe_t_1_img = observe_t_img
    actor_t_1 = actor_t
    img_server.close_connect()
    index = 1

    while True:
        observe_t_img = observe_t_1_img
        actor_t = actor_t_1
        img_server.wait_for_connect()
        observe_t_1_img = img_server.receive_img()
        observe_t_1_data = convert_img2data(observe_t_1_img)
        actor_t_1 = agent.evaluate_actor(observe_t_1_data)
        noise = exploration_noise.noise()
        actor_t_1 = actor_t_1[0] + noise
        cost = compute_cost(observe_t_img)
        agent.add_experience(observe_t_img, observe_t_1_img, actor_t, cost, index)
        if index > 32:
            agent.train()
        img_server.send_actor_cmd(actor_t_1)
        img_server.close_connect()
        index = index + 1
def main():
    env = Env(19997)
    steps = 10000
    num_states = 59
    num_actions = 3

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(num_actions)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')

    for i in range(episodes):
        # print "==== Starting episode no:", i, "====", "\n"
        observation = env.reset()
        done = False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            # clamp each action component to the valid range [-1.0, 1.0]
            for j in range(num_actions):
                if action[j] > 1.0:
                    action[j] = 1.0
                if action[j] < -1.0:
                    action[j] = -1.0

            observation, reward, done = env.step(action)
            print("reward:", reward, "\n")
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            # check if episode ends:
            if done or (t == steps - 1):
                print('Episode', i, 'Steps: ', t, 'Episode Reward:', reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor(os.getcwd() + '/weights/actor/model.ckpt')
                agent.critic_net.save_critic(os.getcwd() + '/weights/critic/model.ckpt')
                break
        total_reward += reward_per_episode
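# The element-wise clamp in main() above keeps each action component inside
# [-1.0, 1.0]. A more compact equivalent is sketched below; it assumes the
# action is a 1-D NumPy array and that every dimension shares the same
# symmetric bound, which matches the loop above but may not hold for the
# other environments in this file.
import numpy as np

def clip_action(action, low=-1.0, high=1.0):
    """Clamp every component of `action` into [low, high]."""
    return np.clip(np.asarray(action), low, high)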
def main():
    env = Env(19997)
    steps = 300

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(2)
    counter = 0
    reward_per_episode = 0.
    num_states = 32 * 16
    num_actions = 2
    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", str(i), "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            observation, reward, done = env.step(action, t)
            agent.add_experience(x, observation, action, reward, done)
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            # check if episode ends:
            if done:
                print('EPISODE: ', str(i), ' Steps: ', str(t), ' Total Reward: ', str(reward_per_episode))
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor('/home/lee/Projects/Tracking/RL/weights/actor/model.ckpt')
                agent.critic_net.save_critic('/home/lee/Projects/Tracking/RL/weights/critic/model.ckpt')
                print('\n\n')
                break
def main():
    env = Env(20000)
    steps = 50
    agent = DDPG(env, is_batch_norm)
    counter = 0
    exploration_noise = OUNoise(2)
    reward_per_episode = 0
    num_states = 96 * 4 + 4
    num_actions = 2
    reward_st = np.array([0])

    agent.actor_net.load_actor(
        '/home/myounghoe/ddpgtf/norepeat_target_2action_scale2/weights/actor/model.ckpt')
    agent.critic_net.load_critic(
        '/home/myounghoe/ddpgtf/norepeat_target_2action_scale2/weights/critic/model.ckpt')

    for i in range(episodes):
        print("==== Starting episode no:", str(i), "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0]  # + noise
            action = np.array([-1.0, 0.0])
            observation, reward, done = env.step(action, t)
            reward_per_episode += reward
            counter += 1
            # check if episode ends:
            if done:
                print('EPISODE: ', str(i), ' Steps: ', str(t), ' Total Reward: ', str(reward_per_episode))
                # print "Printing reward to file"
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('test_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
def main():
    with tf.Graph().as_default():
        agent = DDPG(number_of_states, number_of_actions)
        reward_per_time_step = 0
        RM = ReplayMemory(100000)
        for e in range(episodes):
            print('Begin Episode number', e)
            # main loop of the algorithm for one episode
            for t in range(steps):
                if t == 0:
                    current_moisture = 0.01
                else:
                    current_moisture = RM.replay_memory[-1][0][0]  # y(t)
                current_state = np.array([current_moisture, y_set])  # s
                current_state_true = current_state.reshape(1, 2)
                action = agent.evaluate_actor(current_state_true)[0][0]  # gives the action, a(t)
                print(action)
                T = np.linspace(t, t + 1)
                next_moisture = output(sys, T, action, current_moisture)  # y(t+1)
                next_state = np.array([next_moisture, y_set])  # s'
                current_reward = reward(agent.model_train()[-1])  # r
                # print("this thing =", agent.model_train()[-1])
                reward_per_time_step += current_reward  # cumulative reward
                # print(reward_per_time_step)
                RM.add_experience(current_state, next_state, action, current_reward)
                agent.model_train()
def main():
    env = Env(20000)
    steps = 10000
    num_states = 59
    num_actions = 3

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')

    for i in range(episodes):
        # print "==== Starting episode no:", i, "====", "\n"
        observation = env.reset()
        done = False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            action = action[0]
            observation, reward, done = env.step(action)
            reward_per_episode += reward
            counter += 1
            # check if episode ends:
            if done or (t == steps - 1):
                print('Episode', i, 'Steps: ', t, 'Episode Reward:', reward_per_episode)
                reward_st = np.append(reward_st, reward_per_episode)
                # np.savetxt('episode_reward.txt', reward_st, newline="\n")
                break
        total_reward += reward_per_episode
def main():
    sess = tf.Session()
    setting.load_data(setting.currency, train_test_data.file_list, train_test_data.test_file)
    agent = DDPG(sess, CURRENCY, CHART, TIMELINE, LENGTH)
    counter = 0
    reward_for_episode = 0
    total_reward = 0
    epsilon = 1.0  # parameter defining the ratio between random actions and actor (policy) decisions
    time_step = 0  # frame number
    # saving reward
    reward_st = np.array([0])

    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state('./trade_model')
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print('model has been loaded successfully!')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        print('start new progress.')
        sess.run(tf.global_variables_initializer())

    for idx in range(MAX_EPISODE):
        terminal = False
        print('Starting episode no: %d' % idx)
        state = setting.reset()
        reward_for_episode = 0
        step_on_episode = 0
        while not terminal:
            present_state = state
            if np.random.rand() < epsilon:
                selected_currency = np.random.choice(CURRENCY)
                ratio = 2 * (np.random.rand() - 0.5)
                action = setting.action_value(CURRENCY, selected_currency, ratio)
            else:
                action = agent.evaluate_actor(present_state)
            if idx > OBSERVE:
                epsilon -= 1 / 50000

            state, reward, terminal, _ = setting.step(action)

            # add s_t, s_(t+1), action, reward to experience memory
            agent.add_experience(present_state, state, action, reward, terminal)
            # train critic and actor network
            if time_step > 2000 and time_step % TRAIN_INTERVAL == 0:
                agent.train()
            reward_for_episode += reward
            time_step += 1
            step_on_episode += 1

        # check if episode ends
        print('at %s, EPISODE: %d, Steps: %d, Reward: %d' %
              (str(datetime.datetime.now()), idx, step_on_episode, reward_for_episode))
        reward_st = np.append(reward_st, reward_for_episode)
        if idx % 500 == 0 and idx != 0:
            saver.save(sess, 'trade_model/actor_critic_network.ckpt', global_step=time_step)
        total_reward += reward_for_episode

    print('Average reward per episode: {}'.format(total_reward / MAX_EPISODE))
def main():
    '''
    In this file, we first load the system state parameters from the .mat files; then, for each
    slot, we observe the state parameters and make the action. We save this state-action record
    into the memory for later training. Finally, the system transitions into the next slot.
    '''
    # load the state parameters from the .mat file
    task_size = sio.loadmat('./data/data')['input_data_size']            # load the task size
    CPU_density = sio.loadmat('./data/data')['input_CPU_density']        # load the required CPU cycles of each task bit
    task_delay_re = sio.loadmat('./data/data')['input_task_delay_re']    # load the maximum tolerated delay of each task
    task_gain = sio.loadmat('./data/data')['input_task_gain']            # load the gain of each task
    user_power = sio.loadmat('./data/data')['input_user_power']          # load the transmit power of each user
    user_chan_gain = sio.loadmat('./data/data')['input_user_chan_gain']  # load the wireless channel gain of each user
    bs_capacity = sio.loadmat('./data/data')['input_bs_capacity']        # load the computing capacity of each base station

    # set the number of users in these base stations
    bs_1_user_num = 10
    bs_2_user_num = 20
    bs_3_user_num = 10

    # set the wireless channel noise, the channel bandwidth, and the transmission rate of the wired connection
    chan_noise = 10**(-8)
    chan_band = 10**6
    wired_rate = 10

    # set the length of the time slot
    slot_len = 10000

    # set the record counter of the replay buffer, the total reward, and the reward record over all time slots
    counter = 0
    total_reward = 0
    reward_st = np.array([0])

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    num_states, num_actions = len(task_size[:, 1]) * 7, len(task_size[:, 1])
    agent = DDPG(num_states, num_actions, is_batch_norm)

    # set the exploration noise to guarantee the algorithm's performance
    exploration_noise = OUNoise(1)

    # traverse each slot and make the action decision
    for i in range(slot_len):
        print("==== Starting episode no:", i, "====", "\n")
        # obtain the current system state
        current_state = np.hstack((task_size[:, i], CPU_density[:, i], task_delay_re[:, i], task_gain[:, i],
                                   user_power[:, 0], user_chan_gain[:, i], bs_capacity[:, i]))
        current_state = np.reshape(current_state, [1, -1])
        actor_input = current_state                       # set the input of the actor network
        actor_output = agent.evaluate_actor(actor_input)  # predict the action in this slot
        noise = exploration_noise.noise()                 # obtain the noise added to the action
        action = actor_output[0] + noise                  # Select action according to current policy and exploration noise
        # print("Action at slot", i, " :", action, "\n")

        reward = 1  # function(action, current_state): obtain the reward in this slot
        # obtain the system state in the next slot
        next_state = np.hstack((task_size[:, i + 1], CPU_density[:, i + 1], task_delay_re[:, i + 1], task_gain[:, i + 1],
                                user_power[:, 0], user_chan_gain[:, i + 1], bs_capacity[:, i + 1]))
        next_state = np.reshape(next_state, [1, -1])

        # add s_t, s_t+1, action, reward to experience memory
        agent.add_experience(current_state, next_state, action, reward)
        # train critic and actor network
        if counter > 64:
            agent.train()
        counter += 1

        # print('EPISODE: ', i, 'Reward: ', reward)
        reward_st = np.append(reward_st, reward)
        np.savetxt('episode_reward.txt', reward_st, newline="\n")
        total_reward += reward

    print("Average reward per episode {}".format(total_reward / slot_len))
def s2l():
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    num_states = feature_size
    # num_states = env.observation_space.shape[0]
    num_actions = num_controls
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)

    action_space_high = [1.5]   # [0.0, 0.0, 0.0]
    action_space_low = [0.03]   # [0.5, 0.5, 0.5]
    print("Action space highest values", action_space_high)
    print("Action space lowest values:", action_space_low)

    robot = RoboControl()
    # while True:
    #     # robot.check()
    #     robot.publish_control([1])
    #     robot.reset()

    agent = DDPG(is_batch_norm, num_states, num_actions, action_space_high, action_space_low)
    exploration_noise = OUNoise(num_actions)
    counter = 0
    total_reward = 0
    print("Number of Rollouts per episode:", num_rollouts)
    print("Number of Steps per roll out:", steps)
    reward_st = np.array([0])       # saving reward
    eval_metric_st = np.array([0])
    reward_st_all = np.array([0])   # saving reward after every step

    activity_obj = Vid_Feature()
    demo_vid_array = demo_array_extractor(demo_folder)
    demo_features = activity_obj.feature_extractor(demo_vid_array)

    frame_obj = Frame_Feature()

    # camera_obj = Camera()
    camera_obj = CameraSub()

    for episode in range(num_episodes):
        print("==== Starting episode no:", episode, "====", "\n")
        robot.reset()  # Reset env at the beginning of each episode
        obs_img = camera_obj.camera_subscribe()  # Get the observation
        # obs_img = np.array(misc.imresize(obs_img, [112, 112, 3]))
        observation = np.array(frame_obj.frame_feature_extractor(obs_img))
        observation = observation.reshape(-1)
        reward_per_episode = 0

        for t in range(num_rollouts):
            reward_per_rollout = 0
            vid_robo_ = []
            for i in range(steps):
                x = observation
                action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
                noise = exploration_noise.noise()
                action = action[0] + noise  # Select action according to current policy and exploration noise
                print('Action at episode-', episode, 'rollout-', t, 'step-', i, " :", action)

                robot.publish_control(action)
                obs_robo = camera_obj.camera_subscribe()  # Get the observation
                # obs_robo = misc.imresize(obs_robo, [112, 112, 3])
                vid_robo_.append(obs_robo)
                observation = np.array(frame_obj.frame_feature_extractor(np.array(obs_robo)))
                observation = observation.reshape(-1)
                # pause()

                if i == 15:
                    vid_robo = np.array(vid_robo_)
                    robo_features = activity_obj.feature_extractor(vid_robo)
                    reward = -(distance(demo_features, robo_features))
                    reward = np.array(reward)
                    print('reward: ', reward)
                else:
                    reward = 0
                    reward = np.array(reward)
                    print('reward: ', reward)

                # Storing reward after every step
                reward_st_all = np.append(reward_st_all, reward)
                np.savetxt('reward_all.txt', reward_st_all, newline="\n")

                # add s_t, s_t+1, action, reward to experience memory
                agent.add_experience(x, observation, action, reward, False)
                reward_per_rollout += reward
                counter += 1

                # train critic and actor network
                if counter > start_training:
                    agent.train()

            print('\n\n')

            # Saving policy
            if (episode % 100) == 0 and t == num_rollouts - 1:
                print('saving policy...........................!')
                agent.save_actor(episode)

            reward_per_episode += reward_per_rollout

        # check if episode ends:
        print('EPISODE: ', episode, ' Total Reward: ', reward_per_episode)
        print("Printing reward to file")
        exploration_noise.reset()  # reinitializing random noise for action exploration
        reward_st = np.append(reward_st, reward_per_episode)
        np.savetxt('episode_reward.txt', reward_st, fmt='%f', newline="\n")
        print('\n\n')
        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / num_episodes))
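# s2l() above computes its reward as the negative `distance` between the
# demonstration features and the robot features, but the helper itself is not
# shown in this file. The sketch below assumes a plain Euclidean distance
# between flattened feature vectors; the original code may use a different
# metric.
import numpy as np

def distance(demo_features, robo_features):
    """Euclidean distance between two feature arrays (assumed metric)."""
    demo = np.asarray(demo_features, dtype=np.float64).ravel()
    robo = np.asarray(robo_features, dtype=np.float64).ravel()
    return np.linalg.norm(demo - robo)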
def main():
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG()
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    # saving reward:
    reward_st = np.array([0])

    # network setup
    s = socket.socket()           # Create a socket object
    # host = socket.gethostname() # Get local machine name
    host = ''                     # Get local machine name
    port = 21567                  # Reserve a port for your service.
    s.bind((host, port))
    s.listen(5)

    imgorigin_t = np.zeros((300, 400))
    imgorigin_t_1 = np.zeros((300, 400))
    action_t = np.zeros(6)
    action_t_1 = np.zeros(6)
    index = 0

    # the first time
    c, addr = s.accept()          # Establish connection with client.
    print 'Got connection from', addr
    print ("Receiving...")
    l = c.recv(1024)
    f = open('temp.tif', 'wb')
    while l:
        f.write(l)
        l = c.recv(1024)
    f.close()
    print ("Done Receiving")

    imgorigin_t = np.array(Image.open('temp.tif'))
    tempimg = imgorigin_t[np.newaxis, :, :, np.newaxis]
    tempimg = tempimg.transpose([0, 2, 1, 3])
    test_pred = agent.evaluate_actor(tempimg)
    action_t = test_pred[0]
    print action_t

    str_buf = ''
    str_buf = str_buf + str(action_t[0, 0]) + " "
    str_buf = str_buf + str(action_t[0, 1]) + " "
    str_buf = str_buf + str(action_t[0, 2]) + " "
    str_buf = str_buf + str(action_t[0, 3]) + " "
    str_buf = str_buf + str(action_t[0, 4]) + " "
    str_buf = str_buf + str(action_t[0, 5]) + " "

    imgorigin_t_1 = imgorigin_t
    action_t_1 = action_t
    c.send(str_buf)
    c.close()
    index = 1

    while True:
        # update imgorigin_t and action_t
        imgorigin_t = imgorigin_t_1
        action_t = action_t_1

        c, addr = s.accept()      # Establish connection with client.
        print 'Got connection from', addr
        print ("Receiving...")
        l = c.recv(1024)
        f = open('temp.tif', 'wb')
        while l:
            f.write(l)
            l = c.recv(1024)
        f.close()
        print ("Done Receiving")

        imgorigin_t_1 = np.array(Image.open('temp.tif'))
        tempimg = imgorigin_t_1[np.newaxis, :, :, np.newaxis]
        tempimg = tempimg.transpose([0, 2, 1, 3])
        test_pred = agent.evaluate_actor(tempimg)
        action_t_1 = test_pred[0]
        print action_t_1

        reward = compute_reward(imgorigin_t_1)
        agent.add_experience(imgorigin_t, imgorigin_t_1, action_t, reward, index)
        if index > 32:
            agent.train()

        str_buf = ''
        str_buf = str_buf + str(action_t_1[0, 0]) + " "
        str_buf = str_buf + str(action_t_1[0, 1]) + " "
        str_buf = str_buf + str(action_t_1[0, 2]) + " "
        str_buf = str_buf + str(action_t_1[0, 3]) + " "
        str_buf = str_buf + str(action_t_1[0, 4]) + " "
        str_buf = str_buf + str(action_t_1[0, 5]) + " "
        c.send(str_buf)
        print("send action finished!")
        c.close()
        index = index + 1
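# The two receive blocks in main() above repeat the same pattern: read from the
# accepted socket until the client stops sending, writing the bytes to
# 'temp.tif'. A small helper capturing that pattern is sketched below; the
# buffer size follows the script above, and the function name is illustrative
# rather than part of the original code.
def recv_file(conn, path, bufsize=1024):
    """Read from `conn` until the peer closes its side and write the bytes to `path`."""
    with open(path, 'wb') as f:
        chunk = conn.recv(bufsize)
        while chunk:
            f.write(chunk)
            chunk = conn.recv(bufsize)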
def main():
    experiment = 'quadruped-robot-v0'  # specify environment here
    backupNameFile = "quadruped_robot_0"
    backupPathFile = "storage/" + backupNameFile
    bFullPath = os.path.join(os.path.split(os.path.abspath(__file__))[0], backupPathFile)

    env = gym.make(experiment)
    steps = env.spec.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    global agent
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            # print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                # print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                # print("Printing reward to file")
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break

        # Save some episodes
        # print(episodes)
        # if (episodes == 10):
        #     with open(bFullPath + "_EP_" + episodes + ".pkl", 'wb') as file:
        #         pickle.dump(agent, file)
        #     pickle.dump_session(bFullPath + "_EP_" + episodes + ".pkl")
        #     print('SAVE EPISODE ', episodes)
        #     break

        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / episodes))
def main():
    experiment = 'InvertedPendulum-v1'
    env = gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            # env.render()
            x = observation
            # select action using actor network model
            action = agent.evaluate_actor(np.reshape(x, [num_actions, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            print 'Agent.Action :', action
            print '\n'
            print '\n'

            observation, reward, done, _ = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            # check if episode ends:
            if done:
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n'
                print '\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
class PlayAgent:
    def __init__(self, ip="127.0.0.1", id=28888):
        # ip is the server address, id is the agent identifier
        self.ar = ClientActionRobotJava(ip)
        self.se = GameStateExtractor()   # game state extractor
        self.tp = TrajectoryPlanner()    # trajectory planning module
        self.firstShot = True
        self.solved = []                 # cleared levels have the value 1, uncleared levels 0
        self.currentLevel = -1
        self.failedCounter = 0
        self.id = id
        self.width = 840                 # game screen width
        self.height = 480                # game screen height

        # Size of the game screenshot (= state); i.e. the number of state values
        # equals the number of pixels on the screen.
        # [height, width, RGB 3 channels]
        self.num_states = [self.height, self.width, 3]

        # Action space definition
        # [distance (0~90 px), angle (0~90 degrees), tapTime (0~5000 ms)]
        self.num_actions = 3
        self.action_space_high = [90, 75, 50]
        self.action_space_low = [0, 0, 0]
        self.noise_mean = [20, -20, 0]
        self.noise_sigma = [10, 30, 20]

        self.ddpg = DDPG(self.num_states, self.num_actions,
                         self.action_space_high, self.action_space_low, is_batch_norm)

    def getNextLevel(self):
        # fetch the next level
        level = 0
        unsolved = False
        for i in range(len(self.solved)):
            if self.solved[i] == 0:
                unsolved = True
                level = i + 1
                if level <= self.currentLevel and self.currentLevel < len(self.solved):
                    continue
                else:
                    return level
        if unsolved:
            return level
        level = (self.currentLevel + 1) % len(self.solved)
        if level == 0:
            level = len(self.solved)
        return level

    def checkMyScore(self):
        scores = self.ar.checkMyScore()  # check the current scores
        level = 1
        for s in scores:                 # check the score of each level
            print "||\tlevel %d score is : %d\t||" % (level, s)
            if s > 0:
                self.solved[level - 1] = 1
            level += 1

    def getScreenBuffer(self, buffer, width=840, height=480):
        """
        Fetch the current gameplay screenshot and store each RGB channel separately.
        """
        print "## Get ScreenBuffer"
        # returnBuffer's size = (480, 840, 3)
        returnBuffer = np.zeros((height, width, 3))
        for i in range(height):
            for j in range(width):
                RGB = buffer.getRGB(j, i)
                returnBuffer[i, j, 0] = RGB & 0x0000ff
                returnBuffer[i, j, 1] = RGB & 0x00ff00
                returnBuffer[i, j, 2] = RGB & 0xff0000
        print "## Return ScreenBuffer"
        return returnBuffer

    def shoot(self, action):
        """
        Shoot a bird and return the game state after the shot.
        """
        # slingshot detection
        screenshot = self.ar.doScreenShot()
        vision = Vision(screenshot)
        sling = vision.findSlingshotMBR()
        # current game state
        pigs = vision.findPigsMBR()
        state = self.ar.checkState()

        # if a slingshot is detected, play; otherwise skip
        if sling != None:
            # if there are pigs on the map, pick one as the target and shoot
            if len(pigs) != 0:
                refPoint = self.tp.getReferencePoint(sling)
                print "## Ref Sling Point : ", refPoint

                # get the action to take from DDPG
                releaseDistance = action[0]
                releaseAngle = action[1]
                tapTime = action[2]
                print "## Release Distance : ", releaseDistance
                print "## Release Angle : ", releaseAngle

                self.ar.fullyZoomOut()
                screenshot = self.ar.doScreenShot()
                vision = Vision(screenshot)
                _sling = vision.findSlingshotMBR()  # slingshot detected after zooming out

                if _sling != None:
                    # Compare the slingshot position before and after zooming out;
                    # if they differ too much, do not shoot and re-take/re-segment the screenshot instead.
                    scale_diff = (sling.width - _sling.width) ** 2 + \
                                 (sling.height - _sling.height) ** 2
                    if scale_diff < 25:
                        self.ar.shoot(int(refPoint.x), int(refPoint.y), int(releaseDistance),
                                      int(releaseAngle), 0, int(tapTime), True)
                        print "## Shooting is Done"
                        state = self.ar.checkState()
                        if state == state.PLAYING:
                            self.firstShot = False
                    else:
                        print "## Scale is changed. So sling can not execute the shot and will re-segment the image"
                else:
                    print "## No sling was detected. So agent can not execute the shot and will re-segment the image"
        return state

    def ddpg_run(self):
        """
        Run the DDPG algorithm on raw pixel data (screenshots).
        """
        info = self.ar.configure(ClientActionRobot.intToByteArray(self.id))
        self.solved = np.zeros(info[2])
        self.checkMyScore()
        print "## current level : %d" % self.currentLevel

        # DDPG
        # Randomly initialize the critic, actor, target critic and target actor networks,
        # and initialize the experience memory as a deque.
        exploration_noise = OUNoise(self.num_actions, self.noise_mean, self.noise_sigma)
        counter = 1
        reward_per_episode = 0   # an episode means one game round
        total_reward = 0
        print "# of States : ", self.num_states
        print "# of Actions : ", self.num_actions

        # store rewards
        reward_st = np.array([0])

        # train for the number of episodes given by the parameter
        for i in xrange(episodes):
            # fetch the next level
            self.currentLevel = self.getNextLevel()
            # if the fetched level is 1~3, load it; otherwise reset to level 1 and load that
            if self.currentLevel < 4:
                self.ar.loadLevel(self.currentLevel)
            else:
                self.currentLevel = 1
                self.ar.loadLevel(self.currentLevel)
            prevscore = 0
            reward_per_episode = 0
            steps = 0
            print "======== Starting Episode No : ", (i + 1), "========", "\n"

            # loop over a single episode
            while True:
                # grab a gameplay screenshot
                screenshot = self.ar.doScreenShot()
                x = self.getScreenBuffer(screenshot, self.width, self.height)

                # get the next action through actor evaluation
                action = self.ddpg.evaluate_actor(np.reshape(
                    x, [1, self.num_states[0], self.num_states[1], self.num_states[2]]))
                print "## Get Action from network!! : ", action
                action = action[0]
                noise = exploration_noise.noise()
                # Select the action according to the current policy, but add noise so that
                # exploratory actions are taken stochastically.
                action = action + noise
                print action

                # if the distance is negative, flip it to positive
                action[0] = action[0] if action[0] > self.action_space_low[0] else -action[0]
                # if the distance exceeds the maximum range, clamp it to the maximum
                action[0] = action[0] if action[0] < self.action_space_high[0] else self.action_space_high[0]
                # do the same for the angle
                action[1] = action[1] if action[1] > self.action_space_low[1] else -action[1]
                action[1] = action[1] if action[1] < self.action_space_high[1] else self.action_space_high[1]
                # and the same for the tap time
                action[2] = action[2] if action[2] > self.action_space_low[2] else -action[2]
                action[2] = action[2] if action[2] < self.action_space_high[2] else self.action_space_high[2]
                print "## Action at step ", steps, " :", action, "\n"

                # shoot() contains logic that waits briefly until the score stabilizes after the shot
                state = self.shoot(action)

                if state == state.WON or state == state.LOST:
                    # the episode ends (one level is finished)
                    print "## Episode End"
                    screenshot = self.ar.doScreenShot()
                    observation = self.getScreenBuffer(screenshot, self.width, self.height)

                    # a win yields a reward, a loss does not
                    if state == state.WON:
                        score = self.se.getScoreEndGame(screenshot)
                        # use the score obtained in this episode divided by 1000 as the reward
                        reward = (score - prevscore) / 1000.0
                    else:
                        reward = 0.00
                    self.currentLevel = self.currentLevel
                    self.firstShot = True   # reset firstShot when the episode ends
                    done = True             # mark the episode as done

                    # store s(t), s(t+1), action and reward in the experience memory
                    print "######## SCORE : ", score
                    print "######## REWARD : ", reward
                    # x = state(screenBuffer) at t
                    # observation = state(screenBuffer) at (t + 1)
                    self.ddpg.add_experience(x, observation, action, reward, done)

                    # Train the critic and actor networks, but only after a predefined number
                    # of steps so that enough experience has accumulated.
                    if counter > TRAIN_STEP:
                        self.ddpg.train()
                    counter += 1
                    steps += 1

                    print "==== EPISODE: ", i, ' Steps: ', steps, ' Total Reward: ', reward_per_episode
                    print "Writing reward info into file..."
                    exploration_noise.reset()
                    # reward_st is an array: append the total reward obtained in this round
                    # as the last element and write it out to a file.
                    reward_st = np.append(reward_st, reward_per_episode)
                    np.savetxt("episodes_reward.txt", reward_st, newline="\n")
                    print "\n\n"
                    break

                elif state == state.PLAYING:
                    # still in the PLAYING state
                    screenshot = self.ar.doScreenShot()
                    vision = Vision(screenshot)
                    sling = vision.findSlingshotMBR()
                    while sling == None and self.ar.checkState() == state.PLAYING:
                        print "## No slingshot was detected. Please remove pop up or zoom out"
                        self.ar.fullyZoomOut()
                        screenshot = self.ar.doScreenShot()

                    # obtain S(t + 1)
                    observation = self.getScreenBuffer(screenshot, self.width, self.height)

                    # store S(t), S(t + 1), action and reward in the experience memory
                    score = self.ar.getInGameScore(screenshot)
                    reward = (score - prevscore) / 1000.0
                    prevscore = score
                    done = False
                    reward_st = np.append(reward_st, reward)
                    self.ddpg.add_experience(x, observation, action, reward, done)
                    print "## Add experience (action) (reward) (done)", action, reward, done

                    # train the critic and actor networks
                    if counter > TRAIN_STEP:
                        self.ddpg.train()
                    reward_per_episode += reward
                    counter += 1
                    steps += 1

                # exception handling for unusual game states
                elif state == state.LEVEL_SELECTION:
                    print "unexpected level selection page, go to the last current level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)
                elif state == state.MAIN_MENU:
                    print "unexpected main menu page, reload the level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)
                elif state == state.EPISODE_MENU:
                    print "unexpected episode menu page, reload the level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)

            total_reward += reward_per_episode  # accumulate the rewards over episodes

        avg_reward = total_reward / episodes
        print "## Average reward per episode is : ", avg_reward
def main():
    experiment = 'MountainCarContinuous-v0'
    env = gym.make(experiment)
    steps = env.spec.timestep_limit
    assert isinstance(env.observation_space, Box)
    assert isinstance(env.action_space, Box)

    # The agent is created before the loop, so all weights are carried over between episodes;
    # in other words, only one model is trained over the whole run.
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    reward_per_episode = 0
    total_reward = 0
    counter = 0

    # dimensions of the state and of the action
    num_states = env.observation_space.shape[0] - 1
    num_actions = env.action_space.shape[0]
    print 'Number of States:', num_states
    print 'Number of Actions:', num_actions
    print 'Number of steps per episode:', steps

    if is_exploration == True:
        print("\nExploration phase for {} steps. ".format(exploration_steps))
        e_steps = 0
        while e_steps < exploration_steps:
            s = env.reset()
            one_step = 0
            done = False
            exploration_noise.reset()
            exp = []
            while not done:
                a = exploration_noise.noise()
                ss, r, done, _ = env.step(a)
                exp.append((s[:-1], a, ss[:-1], r, done))
                s = ss
                one_step += 1
                if one_step > 998:
                    break
            agent.add_experience(exp)
            e_steps += 1

    reward_st = np.array([0])  # used to store the reward of every episode

    for i in xrange(episodes):  # loop over all episodes (1000 in total)
        print '====starting episode no:', i, '====', '\n'
        observation = env.reset()  # the episode is reset, but the model parameters are not
        reward_per_episode = 0
        LSTM_SIZE = 40
        statec_t1 = np.zeros((BATCH_SIZE, LSTM_SIZE))
        stateh_t1 = np.zeros((BATCH_SIZE, LSTM_SIZE))
        exp = []
        for t in xrange(steps):
            # env.render()
            x = [observation[0:num_states]]
            x = np.reshape(x * BATCH_SIZE, [BATCH_SIZE, num_states])
            actor, statec_t1, stateh_t1 = agent.evaluate_actor(x, statec_t1, stateh_t1)
            noise = exploration_noise.noise()
            # ra = random.random()
            if i < 500:
                action = actor[0] + noise
            else:
                action = actor[0]

            observation, reward, done, info = env.step(action)
            # print 'Action at step', t, ':', action, 'reward:', reward, '\n'
            exp.append((x, action, observation[0:num_states], reward, done))
            if counter > 64:
                agent.train()
            counter += 1
            reward_per_episode += reward

            if done or (t == steps - 1):
                # the episode has ended
                agent.add_experience(exp)
                print 'EPISODE:', i, 'Steps', t, 'Total Reward:', reward_per_episode
                print 'Printing reward to file'
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline='\n')
                print '\n\n'
                break
        total_reward += reward_per_episode

    # compute the average reward per episode
    print "Average reward per episode {}".format(total_reward / episodes)
def s2l():
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    num_states = feature_size
    # num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)

    agent = DDPG(env, is_batch_norm, num_states, num_actions)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    total_reward = 0
    print("Number of Rollouts per episode:", num_rollouts)
    print("Number of Steps per roll out:", steps)
    reward_st = np.array([0])       # saving reward
    eval_metric_st = np.array([0])
    reward_st_all = np.array([0])   # saving reward after every step

    frame_obj = Frame_Feature()
    # activity_obj = Vid_Feature()
    demo_vid_array = demo_array_extractor(demo_folder)
    demo_features = frame_obj.video_feature_extractor(demo_vid_array)

    for episode in range(num_episodes):
        print("==== Starting episode no:", episode, "====", "\n")
        env.reset()  # Reset env at the beginning of each episode
        env.render()
        obs_img = env.render(mode='rgb_array')  # Get the observation
        obs_img = np.array(misc.imresize(obs_img, [112, 112, 3]))
        observation = np.array(frame_obj.frame_feature_extractor(obs_img))
        observation = observation.reshape(-1)
        reward_per_episode = 0

        for t in range(num_rollouts):
            reward_per_rollout = 0
            vid_robo_ = []
            for i in range(steps):
                x = observation
                action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
                noise = exploration_noise.noise()
                action = action[0] + noise  # Select action according to current policy and exploration noise
                print('Action at episode-', episode, 'rollout-', t, 'step-', i, " :", action)

                _, _, done, info = env.step(action)
                env.render()
                obs_robo_ = env.render(mode='rgb_array')  # Get the observation
                obs_robo = misc.imresize(obs_robo_, [112, 112, 3])
                vid_robo_.append(obs_robo)
                observation = np.array(frame_obj.frame_feature_extractor(np.array(obs_robo)))
                observation = observation.reshape(-1)
                # pause()

                if i == 15:
                    vid_robo = np.array(vid_robo_)
                    robo_features = frame_obj.video_feature_extractor(vid_robo)
                    reward = -(distance(demo_features, robo_features))
                    reward = np.array(reward)
                    print('reward: ', reward)
                else:
                    reward = 0
                    reward = np.array(reward)
                    print('reward: ', reward)

                # Printing eval_metric after every step
                eval_metric = np.array(env.get_eval())
                eval_metric = eval_metric.reshape(-1)
                print('Distance to goal:', eval_metric)
                eval_metric_st = np.append(eval_metric_st, eval_metric)
                np.savetxt('eval_metric_per_step.txt', eval_metric_st, newline="\n")

                # Storing reward after every step
                reward_st_all = np.append(reward_st_all, reward)
                np.savetxt('reward_all.txt', reward_st_all, newline="\n")

                # add s_t, s_t+1, action, reward to experience memory
                agent.add_experience(x, observation, action, reward, False)
                reward_per_rollout += reward
                counter += 1

                # train critic and actor network
                if counter > start_training:
                    agent.train()

            print('\n\n')

            # Saving policy
            if (episode % 50) == 0 and t == num_rollouts - 1:
                print('saving policy...........................!')
                agent.save_actor(episode)

            reward_per_episode += reward_per_rollout

        # check if episode ends:
        print('EPISODE: ', episode, ' Total Reward: ', reward_per_episode)
        print("Printing reward to file")
        exploration_noise.reset()  # reinitializing random noise for action exploration
        reward_st = np.append(reward_st, reward_per_episode)
        np.savetxt('episode_reward.txt', reward_st, fmt='%f', newline="\n")
        print('\n\n')
        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / num_episodes))
def main():
    enable_actuator_dynamics = True
    env = ControlSystem(enable_actuator_dynamics=enable_actuator_dynamics)
    steps = env.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    # agent.load_model()
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    # saving reward:
    reward_st = np.array([0])

    log_dir = os.path.join(os.getcwd(), 'logs',
                           datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), 'action')
    if enable_actuator_dynamics == True:
        filtered_log_dir = os.path.join(os.getcwd(), 'logs',
                                        datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                                        'filtered_action')
    y_hat_log_dir = os.path.join(os.getcwd(), 'logs',
                                 datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), 'y_hat')
    y_ref_log_dir = os.path.join(os.getcwd(), 'logs',
                                 datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), 'y_ref')
    gen_function_log_dir = os.path.join(os.getcwd(), 'logs',
                                        datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), 'function')

    os.makedirs(log_dir)
    if enable_actuator_dynamics == True:
        os.makedirs(filtered_log_dir)
    os.makedirs(y_hat_log_dir)
    os.makedirs(y_ref_log_dir)
    os.makedirs(gen_function_log_dir)

    for i in range(episodes):
        print("==== Starting episode no:", i, "====")
        observation = env.reset()
        reward_per_episode = 0
        actions_per_episode = []
        if enable_actuator_dynamics == True:
            filtered_action_per_episode = []

        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # Select action according to current policy and exploration noise
            actions_per_episode.append(action)
            # if i % 100 == 0:
            #     print("Action at step", t, " :", action, "\n")
            #     print("#", action[0])

            # clamp the (scalar) action into [0, 1]
            if action[0] < 0:
                action = [0]
            elif action[0] > 1:
                action = [1]
            # print("Step", t, 'action', action)

            if enable_actuator_dynamics == False:
                observation, reward, Y_plot, t_plot, y_ref, random_function = env.step(action, t)
            elif enable_actuator_dynamics == True:
                observation, reward, filtered_action, Y_plot, t_plot, y_ref, random_function = env.step(action, t)
                filtered_action_per_episode.append(filtered_action)
            # print("Reward at step", t, " :", reward, "\n")

            # add y_t, y_t-1, action, reward, timestep to experience memory
            agent.add_experience(x, observation, action, reward, t)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if t == steps - 1:
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                # print("Printing reward to file")
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")

                # print("Y_plot")
                # plt.step(t_plot, Y_plot)
                # plt.grid()
                # plt.xlabel('t')
                # plt.ylabel('y')
                # plt.show()

                # Save actions
                np.savetxt(log_dir + '/' + str(i).zfill(7) + '.txt', actions_per_episode)
                if enable_actuator_dynamics == True:
                    np.savetxt(filtered_log_dir + '/' + str(i).zfill(7) + '.txt',
                               filtered_action_per_episode)
                np.savetxt(y_hat_log_dir + '/' + str(i).zfill(7) + '.txt', Y_plot)
                np.savetxt(y_ref_log_dir + '/' + str(i).zfill(7) + '.txt', y_ref)
                # np.savetxt(gen_function_log_dir + '/' + str(i).zfill(7) + '.txt', random_function)

                # save model
                if i % 100 == 0:
                    print('save')
                    agent.save_model()
                # print('\n\n')
                break

        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / episodes))
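# All of the main loops above talk to their DDPG agent through a small, common
# surface: evaluate_actor(), add_experience(), and train(), with save/load
# helpers in some variants. The concrete networks are not part of this file;
# the stub below only documents that assumed interface, and the constructor
# arguments as well as the exact add_experience() signature differ from script
# to script.
class DDPGInterface(object):
    """Documentation stub for the agent interface assumed by the loops above."""

    def evaluate_actor(self, state_batch):
        # Return the actor network's action(s) for a batch of states,
        # typically shaped [batch_size, num_actions].
        raise NotImplementedError

    def add_experience(self, state, next_state, action, reward, done):
        # Append one (s_t, s_t+1, a_t, r_t, done) transition to the replay buffer.
        raise NotImplementedError

    def train(self):
        # Sample a minibatch from the replay buffer and update the critic, the
        # actor, and their target networks.
        raise NotImplementedError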