Example #1
File: main.py Project: mmarklar/ddpg-aigym
def main():
    experiment = 'model-builder-v0'  #specify environments here
    env = gym.make(experiment)
    #steps= env.spec.timestep_limit #steps per episode
    steps = 20
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ',
                      reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
    total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
Example #2
def main():
    experiment= 'InvertedPendulum-v1' #specify environments here
    env= gym.make(experiment)
    steps= env.spec.timestep_limit #steps per episode    
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    reward_per_episode = 0    
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]    
    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])
      
    
    for i in xrange(episodes):
        print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            print "Action at step", t ," :",action,"\n"
            
            observation,reward,done,info=env.step(action)
            
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print 'EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset() #reinitializing random noise for action exploration
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                print '\n\n'
                break
    total_reward+=reward_per_episode            
    print "Average reward per episode {}".format(total_reward / episodes)    
Example #3
File: main.py Project: wenjiebit/FCMADRL
def main():
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE,
                 CA_ACTION_BOUND)
    exploration_noise = OUNoise(CA_ACTION_SPACE)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = CA_OBS_SPACE
    num_actions = CA_ACTION_SPACE

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps
    #saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        # observation = env.reset()
        observation = ca_reset()
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            # env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"

            # observation,reward,done,info=env.step(action)
            observation, reward, done, info = ca_step(action)
            print x, observation, action, reward, done
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
    total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
Example #4
def main():
    '''main function'''

    # Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(ACTORNET_PRE_TRAINED, STATENET_PRE_TRAINED)
    exploration_noise = OUNoise(ACTION_DIM)
    # saving reward:
    reward_st = np.array([0])

    img_server = ImgServer()
    img_server.wait_for_connect()

    observe_t_img = img_server.receive_img()
    observe_t_data = convert_img2data(observe_t_img)
    actor_t = agent.evaluate_actor(observe_t_data)
    noise = exploration_noise.noise()
    actor_t = actor_t[0] + noise
    img_server.send_actor_cmd(actor_t)

    observe_t_1_img = observe_t_img
    actor_t_1 = actor_t
    img_server.close_connect()

    index = 1
    while True:
        observe_t_img = observe_t_1_img
        actor_t = actor_t_1
        img_server.wait_for_connect()
        observe_t_1_img = img_server.receive_img()
        observe_t_1_data = convert_img2data(observe_t_1_img)
        actor_t_1 = agent.evaluate_actor(observe_t_1_data)
        noise = exploration_noise.noise()
        actor_t_1 = actor_t_1[0] + noise
        cost = compute_cost(observe_t_img)
        agent.add_experience(observe_t_img, observe_t_1_img, actor_t, cost, index)
        if index > 32:
            agent.train()
        img_server.send_actor_cmd(actor_t_1)
        img_server.close_connect()

        index = index + 1
Example #5
def main():
    env = Env(19997)
    steps= 10000
    num_states = 59
    num_actions = 3

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(num_actions)
    counter=0
    reward_per_episode = 0    
    total_reward=0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')
      
    for i in range(episodes):
        # print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        done =False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            
            for j in range(num_actions):
                if action[j] > 1.0:
                    action[j] = 1.0
                if action[j] < -1.0:
                    action[j] = -1.0

            observation,reward,done = env.step(action)
            print("reward:", reward, "\n")
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print('Episode',i,'Steps: ',t,'Episode Reward:',reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor(os.getcwd() + '/weights/actor/model.ckpt')
                agent.critic_net.save_critic(os.getcwd() + '/weights/critic/model.ckpt')
                break
    total_reward+=reward_per_episode            
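The per-element clipping loop in Example #5 bounds each action component to [-1.0, 1.0]; the same effect can be had with a single NumPy call, as in this small standalone sketch:

import numpy as np

action = np.array([1.7, -0.2, -3.1])   # e.g. actor output plus exploration noise
action = np.clip(action, -1.0, 1.0)    # equivalent to the per-element loop above
print(action)                          # [ 1.  -0.2 -1. ]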
Example #6
def main():
    env= Env(19997)
    steps = 300
    
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(2)
    counter = 0
    reward_per_episode = 0.
    num_states = 32*16
    num_actions = 2

    #saving reward:
    reward_st = np.array([0])
    
    for i in range(episodes):
        print ("==== Starting episode no:",str(i),"====","\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            
            observation,reward,done=env.step(action,t)
            agent.add_experience(x,observation,action,reward,done)

            if counter > 64:
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done):
                print ('EPISODE: ',str(i),' Steps: ',str(t),' Total Reward: ',str(reward_per_episode))
                exploration_noise.reset() #reinitializing random noise for action exploration
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                agent.actor_net.save_actor('/home/lee/Projects/Tracking/RL/weights/actor/model.ckpt')
                agent.critic_net.save_critic('/home/lee/Projects/Tracking/RL/weights/critic/model.ckpt')
                print ('\n\n')
                break
Example #7
def main():
    env = Env(20000)
    steps = 50
    agent = DDPG(env, is_batch_norm)
    counter = 0
    exploration_noise = OUNoise(2)
    reward_per_episode = 0
    num_states = 96 * 4 + 4
    num_actions = 2
    reward_st = np.array([0])

    agent.actor_net.load_actor(
        '/home/myounghoe/ddpgtf/norepeat_target_2action_scale2/weights/actor/model.ckpt'
    )
    agent.critic_net.load_critic(
        '/home/myounghoe/ddpgtf/norepeat_target_2action_scale2/weights/critic/model.ckpt'
    )

    for i in range(episodes):
        print("==== Starting episode no:", str(i), "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0]  # + noise
            action = np.array([-1.0, 0.0])
            observation, reward, done = env.step(action, t)
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done):
                print('EPISODE: ', str(i), ' Steps: ', str(t),
                      ' Total Reward: ', str(reward_per_episode))
                # print "Printing reward to file"
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('test_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
Example #8
def main():
    with tf.Graph().as_default():
        agent = DDPG(number_of_states, number_of_actions)
        reward_per_time_step = 0
        RM = ReplayMemory(100000)

        for e in range(episodes):
            print('Begin Episode number', e)
            # algorithm loop for one episode
            for t in range(steps):
                if t == 0:
                    current_moisture = 0.01
                else:
                    current_moisture = RM.replay_memory[-1][0][0]  # y(t)

                current_state = np.array([current_moisture, y_set])  # s
                current_state_true = current_state.reshape(1, 2)
                action = agent.evaluate_actor(current_state_true)[0][
                    0]  # gives the action a(t)
                print(action)
                T = np.linspace(t, t + 1)

                next_moisture = output(sys, T, action,
                                       current_moisture)  # y(t+1)

                next_state = np.array([next_moisture, y_set])  # s'

                current_reward = reward(agent.model_train()[-1])  # r
                # print("this thing =", agent.model_train()[-1])

                reward_per_time_step += current_reward  # total reward
                # print(reward_per_time_step)

                RM.add_experience(current_state, next_state, action,
                                  current_reward)
                agent.model_train()
Example #9
def main():
    env = Env(20000)
    steps= 10000
    num_states = 59
    num_actions = 3

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env, is_batch_norm)
    counter=0
    reward_per_episode = 0    
    total_reward=0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')

    for i in range(episodes):
        # print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        done =False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            action = action[0]

            observation,reward,done = env.step(action)
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print('Episode',i,'Steps: ',t,'Episode Reward:',reward_per_episode)
                reward_st = np.append(reward_st,reward_per_episode)
                # np.savetxt('episode_reward.txt', reward_st, newline="\n")
                break
    total_reward+=reward_per_episode            
Example #10
def main():
    sess = tf.Session()

    setting.load_data(setting.currency, train_test_data.file_list,
                      train_test_data.test_file)
    agent = DDPG(sess, CURRENCY, CHART, TIMELINE, LENGTH)
    counter = 0
    reward_for_episode = 0
    total_reward = 0

    epsilon = 1.0  # parameter defining ratio between random action and DQN decision
    time_step = 0  # frame number

    # saving reward
    reward_st = np.array([0])

    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state('./trade_model')

    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print('model has been loaded successfully!')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        print('start new progress.')
        sess.run(tf.global_variables_initializer())

    for idx in range(MAX_EPISODE):
        terminal = False
        print('Starting episode no: %d' % idx)
        state = setting.reset()
        reward_for_episode = 0
        step_on_episode = 0

        while not terminal:
            present_state = state
            if np.random.rand() < epsilon:
                selected_currency = np.random.choice(CURRENCY)
                ratio = 2 * (np.random.rand() - 0.5)
                action = setting.action_value(CURRENCY, selected_currency,
                                              ratio)

            else:
                action = agent.evaluate_actor(present_state)

            if idx > OBSERVE:
                epsilon -= 1 / 50000

            state, reward, terminal, _ = setting.step(action)

            # add s_t, s_(t+1), action, reward to experience memory
            agent.add_experience(present_state, state, action, reward,
                                 terminal)

            # train critic and actor network
            if time_step > 2000 and time_step % TRAIN_INTERVAL == 0:
                agent.train()

            reward_for_episode += reward
            time_step += 1
            step_on_episode += 1

        # check if episode ends
        print('at %s, EPISODE: %d, Steps: %d, Reward: %d' %
              (str(datetime.datetime.now()), idx, step_on_episode,
               reward_for_episode))
        reward_st = np.append(reward_st, reward_for_episode)

        if idx % 500 == 0 and idx != 0:
            saver.save(sess,
                       'trade_model/actor_critic_network.ckpt',
                       global_step=time_step)

    total_reward += reward_for_episode
    print('Average reward per episode: {}'.format(total_reward / MAX_EPISODE))
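Example #10 mixes random trading actions with the actor's output and anneals the mixing ratio epsilon by 1/50000 per step once the episode index passes OBSERVE, so the random actions fade out over roughly 50,000 steps. A small standalone sketch of that schedule follows; the OBSERVE value here is a placeholder (the example defines it elsewhere), and the floor at zero is an added assumption, since the loop above would otherwise let epsilon drift negative.

OBSERVE = 10                 # placeholder; the example defines this constant elsewhere
DECAY_PER_STEP = 1.0 / 50000

def next_epsilon(epsilon, episode_idx):
    """Linear epsilon decay as used in the trading loop above."""
    if episode_idx > OBSERVE:
        epsilon = max(0.0, epsilon - DECAY_PER_STEP)   # clamping at 0 is an assumption
    return epsilon

epsilon = 1.0
for _ in range(3):
    epsilon = next_epsilon(epsilon, episode_idx=OBSERVE + 1)
print(epsilon)   # ~0.99994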
Example #11
def main():
    
    '''
    In this file, we first load the system state parameters from the .mat files. Then, for each
    slot, we observe the state parameters and take the action, and save this state-action
    record into the memory for later training. Finally, the system moves on to the next episode.
    '''
    #load the state parameters from the .mat file
    task_size = sio.loadmat('./data/data')['input_data_size']   #load the task size
    CPU_density = sio.loadmat('./data/data')['input_CPU_density']   #load the required CPU cycles per task bit
    task_delay_re = sio.loadmat('./data/data')['input_task_delay_re']  #load the maximum tolerable delay of each task
    task_gain = sio.loadmat('./data/data')['input_task_gain']  #load the gain of each task
    user_power = sio.loadmat('./data/data')['input_user_power']  #load the transmit power of each user
    user_chan_gain = sio.loadmat('./data/data')['input_user_chan_gain']  #load the wireless channel gain of each user
    bs_capacity = sio.loadmat('./data/data')['input_bs_capacity']  #load the computing capacity of each base station
    
    
    #set the number of users in these base stations
    bs_1_user_num = 10
    bs_2_user_num = 20
    bs_3_user_num = 10

    #set the wireless channel noise, channel bandwidth, and transmission rate of the wired connection
    chan_noise =   10**(-8)
    chan_band = 10**6
    wired_rate = 10
    
    #set the length of time slot 
    slot_len = 10000
    
    #Set the counter for records in the replay buffer, the total reward, and the reward record over all time slots
    counter = 0 
    total_reward = 0
    reward_st = np.array([0])
    
    #Randomly initialize critic,actor,target critic, target actor network and replay buffer
    num_states, num_actions = len(task_size[:,1]) * 7, len(task_size[:,1])
    agent = DDPG(num_states, num_actions, is_batch_norm)
    
    #set the exploration noise to help the algorithm reach good performance
    exploration_noise = OUNoise(1)
    
    #iterate over each slot and make the action decision
    for i in range(slot_len):
        print ("==== Starting episode no:",i,"====","\n")
        current_state = np.hstack((task_size[:,i], CPU_density[:,i], task_delay_re[:,i], task_gain[:,i],\
        user_power[:,0], user_chan_gain[:,i],bs_capacity[:,i]))   #obtain the current system state
        current_state = np.reshape(current_state, [1, -1])
        actor_input = current_state   #set the input of actor network
        actor_output = agent.evaluate_actor(actor_input)   #predict the action in this slot
        noise = exploration_noise.noise()   #obtain the noise added in the action
        action = actor_output[0] + noise #Select action according to current policy and exploration noise
#        print ("Action at slot", i ," :",action,"\n")
        reward = 1  #function(action,current_state)   #obtain the reward in this slot
        next_state = np.hstack((task_size[:,i+1], CPU_density[:,i+1], task_delay_re[:,i+1], task_gain[:,i+1], user_power[:,0],\
        user_chan_gain[:,i+1], bs_capacity[:,i+1]))   #obtain the system state in the next slot
        next_state = np.reshape(next_state, [1, -1])
        agent.add_experience(current_state, next_state, action, reward)   #add s_t,s_t+1,action,reward to experience memory
        #train critic and actor network
        if counter > 64: 
            agent.train()
        counter+=1
#        print ('EPISODE: ',i,'Reward: ',reward)
        reward_st = np.append(reward_st,reward)
        np.savetxt('episode_reward.txt',reward_st, newline="\n")
    total_reward+=reward
    print ("Average reward per episode {}".format(total_reward / slot_len))
Example #12
def s2l():
    
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    num_states = feature_size   #num_states = env.observation_space.shape[0]
    num_actions = num_controls
    print ("Number of States:", num_states)
    print ("Number of Actions:", num_actions)
    action_space_high=[1.5] #[0.0,0.0,0.0]
    action_space_low=[0.03] #[0.5,0.5,0.5]
    print ("Action space highest values", action_space_high)
    print ("Action space lowest values:", action_space_low)
    robot=RoboControl()
    #while True:
    #    #robot.check()
    #    robot.publish_control([1])
    #    robot.reset() 

    agent = DDPG(is_batch_norm,num_states,num_actions,action_space_high,action_space_low)
    exploration_noise = OUNoise(num_actions)
    counter=0
    total_reward=0
    print ("Number of Rollouts per episode:", num_rollouts)
    print ("Number of Steps per roll out:", steps)
    reward_st = np.array([0])  #saving reward
    eval_metric_st= np.array([0])
    reward_st_all = np.array([0])  #saving reward after every step

    activity_obj=Vid_Feature()
    demo_vid_array=demo_array_extractor(demo_folder)
    demo_features=activity_obj.feature_extractor(demo_vid_array)

    frame_obj=Frame_Feature()
    #camera_obj= Camera()
    camera_obj= CameraSub()

    for episode in range(num_episodes):
        print ("==== Starting episode no:",episode,"====","\n")


        robot.reset()   # Reset env in the beginning of each episode
        obs_img=camera_obj.camera_subscribe()   # Get the observation
        #obs_img=np.array(misc.imresize(obs_img,[112,112,3]))
        observation =np.array(frame_obj.frame_feature_extractor(obs_img))
        observation=observation.reshape(-1)
        reward_per_episode = 0

        for t in range(num_rollouts):

            reward_per_rollout=0
            vid_robo_=[]

            for i in range(steps):

                x = observation

                action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
                noise = exploration_noise.noise()
                action = action[0] + noise #Select action according to current policy and exploration noise
                print ('Action at episode-',episode,'rollout-',t, 'step-', i ," :",action)


                robot.publish_control(action)
                
                obs_robo=camera_obj.camera_subscribe()   # Get the observation
            
                #obs_robo=misc.imresize(obs_robo,[112,112,3])
                vid_robo_.append(obs_robo)
                observation=np.array(frame_obj.frame_feature_extractor(np.array(obs_robo)))
                observation=observation.reshape(-1)
                #pasue()

                if(i==15):
                    vid_robo=np.array(vid_robo_)
                    robo_features=activity_obj.feature_extractor(vid_robo)
                    reward=-(distance(demo_features,robo_features))
                    reward=np.array(reward)
                    print('reward: ',reward)
                else:
                    reward=0
                    reward=np.array(reward)
                    print('reward: ',reward)

                # Storing reward after every rollout
                reward_st_all = np.append(reward_st_all,reward)
                np.savetxt('reward_all.txt',reward_st_all, newline="\n")

                #add s_t,s_t+1,action,reward to experience memory
                agent.add_experience(x,observation,action,reward,False)
                reward_per_rollout+=reward
                counter+=1

            #train critic and actor network
            if counter > start_training:
                agent.train()
            print ('\n\n')

            #Saving policy
            if ((episode%100)==0 and t==num_rollouts-1):
                print('saving policy...........................!')
                agent.save_actor(episode)


            reward_per_episode+=reward_per_rollout

        #check if episode ends:

        print ('EPISODE: ',episode,' Total Reward: ',reward_per_episode)
        print ("Printing reward to file")
        exploration_noise.reset() #reinitializing random noise for action exploration
        reward_st = np.append(reward_st,reward_per_episode)
        np.savetxt('episode_reward.txt',reward_st, fmt='%f', newline="\n")
        print ('\n\n')

        total_reward+=reward_per_episode

    print ("Average reward per episode {}".format(total_reward / num_episodes))
Example #13
def main():
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG()
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    reward_per_episode = 0    
    total_reward=0
    #saving reward:
    reward_st = np.array([0])
      
    # network setup
    s = socket.socket()         # Create a socket object
    #host = socket.gethostname() # Get local machine name
    host = ''                    # Get local machine name
    port = 21567  # Reserve a port for your service.
    s.bind((host, port))
    
    s.listen(5)
    imgorigin_t = np.zeros((300, 400))
    imgorigin_t_1 = np.zeros((300, 400))
    actor_t = np.zeros(6)
    actor_t_1 =np.zeros(6)
    index = 0


    #the first time
    c, addr = s.accept()     # Establish connection with client.
    print ('Got connection from'), addr
    print ("Receiving...")
    l = c.recv(1024)
    f = open('temp.tif','wb')
    while (l):
        f.write(l)
        l = c.recv(1024)
    f.close()
    print ("Done Receiving")
    imgorigin_t = np.array(Image.open('temp.tif'))
    tempimg = imgorigin_t[np.newaxis,:,:,np.newaxis]
    tempimg = tempimg.transpose([0,2,1,3])
    test_pred = agent.evaluate_actor(tempimg)
    action_t = test_pred[0]

    print action_t

    str_buf = ''
    str_buf = str_buf+str(action_t[0,0])+" "
    str_buf = str_buf+str(action_t[0,1])+" "
    str_buf = str_buf+str(action_t[0,2])+" "
    str_buf = str_buf+str(action_t[0,3])+" "
    str_buf = str_buf+str(action_t[0,4])+" "
    str_buf = str_buf+str(action_t[0,5])+" "
    
    imgorigin_t_1 = imgorigin_t
    actor_t_1 = actor_t

    c.send(str_buf)
    c.close()
    
    index =1
    while True:
        #update imgorigin_t and actor_t
        imgorigin_t = imgorigin_t_1
        actor_t = actor_t_1
        c, addr = s.accept()     # Establish connection with client.
        print ('Got connection from'), addr
        print ("Receiving...")
        l = c.recv(1024)
        f = open('temp.tif','wb')
        while (l):
            f.write(l)
            l = c.recv(1024)
        f.close()
        print ("Done Receiving")
        imgorigin_t_1 = np.array(Image.open('temp.tif'))
        tempimg = imgorigin_t_1[np.newaxis,:,:,np.newaxis]
        tempimg = tempimg.transpose([0,2,1,3])
        test_pred = agent.evaluate_actor(tempimg)
        action_t_1 = test_pred[0]
        print action_t_1

        reward = compute_reward(imgorigin_t_1)
        agent.add_experience(imgorigin_t,imgorigin_t_1,action_t,reward,index)

        if index > 32:
            agent.train()

        str_buf = ''
        str_buf = str_buf+str(action_t_1[0,0])+" "
        str_buf = str_buf+str(action_t_1[0,1])+" "
        str_buf = str_buf+str(action_t_1[0,2])+" "
        str_buf = str_buf+str(action_t_1[0,3])+" "
        str_buf = str_buf+str(action_t_1[0,4])+" "
        str_buf = str_buf+str(action_t_1[0,5])+" "
        c.send(str_buf)
        print("send action finished!")
        c.close()

        index = index+1
Example #14
def main():
    experiment = 'quadruped-robot-v0'  #specify environments here
    backupNameFile = "quadruped_robot_0"

    backupPathFile = "storage/" + backupNameFile
    bFullPath = os.path.join(
        os.path.split(os.path.abspath(__file__))[0], backupPathFile)

    env = gym.make(experiment)
    steps = env.spec.timestep_limit  #steps per episode
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    global agent
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            # print ("Action at step", t ," :",action,"\n")

            observation, reward, done, info = env.step(action)

            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                # print ('EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode)
                # print ("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        # Save some episodes
        # print(episodes)
        # if (episodes == 10):
        # with open(bFullPath+"_EP_"+episodes+".pkl", 'wb') as file:
        #     pickle.dump(agent, file)
        # pickle.dump_session(bFullPath+"_EP_"+episodes+".pkl")
        # print ('SAVE EPISODE ',episodes)
        # break;
    total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
Example #15
File: main.py Project: zxqzhang/ddpg-aigym
def main():
    experiment= 'InvertedPendulum-v1'
    env= gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer   
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    
    #saving reward:
    reward_st = np.array([0])
    
    
    
    for i in xrange(episodes):
        observation = env.reset()
    
        reward_per_episode = 0
        for t in xrange(steps):
            #rendering environment (optional)
            #env.render()
            
            x = observation
            #select action using actor network model
            action = agent.evaluate_actor(np.reshape(x,[num_actions,num_states]))
            
            noise = exploration_noise.noise()
            
                       
            action = action[0] + noise
            
            
            print 'Agent.Action :',action
            print '\n'
            print '\n'
            
                      
            observation,reward,done,_=env.step(action)
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()            
            
            reward_per_episode+=reward
            
            counter+=1
            #check if episode ends:
            if done:
                print 'EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode
                exploration_noise.reset()
                reward_st = np.append(reward_st,reward_per_episode)
                np.savetxt('episode_reward.txt',reward_st, newline="\n")
                print '\n'
                print '\n'
                break
    total_reward+=reward_per_episode            
    print "Average reward per episode {}".format(total_reward / episodes)    
Example #16
class PlayAgent:

    def __init__(self, ip="127.0.0.1", id=28888):   # ip is the server address, id is the agent identifier
        self.ar = ClientActionRobotJava(ip)
        self.se = GameStateExtractor()  # game state extractor
        self.tp = TrajectoryPlanner()   # trajectory calculation module
        self.firstShot = True
        self.solved = []            # cleared levels hold 1, uncleared levels hold 0
        self.currentLevel = -1
        self.failedCounter = 0
        self.id = id
        self.width = 840            # game screen width
        self.height = 480           # game screen height
        # Size of the game screenshot (= state); i.e. the number of state
        # entries equals the number of screen pixels.
        # [height, width, RGB 3 channels]
        self.num_states = [self.height, self.width, 3]
        # Action space definition
        # [distance (0-90 px), angle (0-90 degrees), tapTime (0-5000 ms)]
        self.num_actions = 3
        self.action_space_high = [90, 75, 50]
        self.action_space_low = [0, 0, 0]
        self.noise_mean = [20, -20, 0]
        self.noise_sigma = [10, 30, 20]
        self.ddpg = DDPG(self.num_states, self.num_actions,
                         self.action_space_high, self.action_space_low, is_batch_norm)

    def getNextLevel(self):     # fetch the next level

        level = 0
        unsolved = False

        for i in range(len(self.solved)):
            if self.solved[i] == 0:
                unsolved = True
                level = i + 1
                if level <= self.currentLevel and self.currentLevel < len(self.solved):
                    continue
                else:
                    return level

        if unsolved:
            return level

        level = (self.currentLevel + 1) % len(self.solved)
        if level == 0:
            level = len(self.solved)

        return level

    def checkMyScore(self):

        scores = self.ar.checkMyScore()     # check the current scores
        level = 1
        for s in scores:    # check each level's score
            print "||\tlevel %d score is : %d\t||" % (level, s)
            if s > 0:
                self.solved[level - 1] = 1
            level += 1

    def getScreenBuffer(self, buffer, width=840, height=480):
        """
            현재 게임플레이 스크린샷을 받아온다.
            RGB 별로 따로 저장한다.
        """
        print "## Get ScreenBuffer"
        # returnBuffer's size = (480, 840, 3)
        returnBuffer = np.zeros((height, width, 3))
        for i in range(height):
            for j in range(width):
                RGB = buffer.getRGB(j, i)
                returnBuffer[i, j, 0] = RGB & 0x0000ff
                returnBuffer[i, j, 1] = RGB & 0x00ff00
                returnBuffer[i, j, 2] = RGB & 0xff0000

        print "## Return ScreenBuffer"
        return returnBuffer

    def shoot(self, action):
        """
            새를 쏘고,
            쏜 후의 상태를 반환한다.
        """
        # 새총 detection
        screenshot = self.ar.doScreenShot()
        vision = Vision(screenshot)
        sling = vision.findSlingshotMBR()

        # current game state
        pigs = vision.findPigsMBR()
        state = self.ar.checkState()

        # play if the slingshot is detected, otherwise skip
        if sling != None:

            # if there are pigs on the map, pick one at random as the target and shoot
            if len(pigs) != 0:

                refPoint = self.tp.getReferencePoint(sling)
                print "## Ref Sling Point : ", refPoint

                # get the action to take from DDPG
                releaseDistance = action[0]
                releaseAngle = action[1]
                tapTime = action[2]
                print "## Release Distance : ", releaseDistance
                print "## Release Angle : ", releaseAngle

                self.ar.fullyZoomOut()
                screenshot = self.ar.doScreenShot()
                vision = Vision(screenshot)
                _sling = vision.findSlingshotMBR()  # slingshot detected after zooming out

                if _sling != None:
                    # compare the slingshot scale before and after zooming out;
                    # if the difference is too large, do not shoot and instead re-take and analyze the screenshot
                    scale_diff = (sling.width - _sling.width) ** 2 + \
                        (sling.height - _sling.height) ** 2

                    if scale_diff < 25:
                        self.ar.shoot(int(refPoint.x), int(refPoint.y), int(
                            releaseDistance), int(releaseAngle), 0, int(tapTime), True)
                        print "## Shooting is Done"
                        state = self.ar.checkState()

                        if state == state.PLAYING:
                            self.firstShot = False

                    else:
                        print "## Scale is changed. So sling can not execute the shot and will re-segment the image"
                else:
                    print "## No sling was detected. So agent can not execute the shot and will re-segment the image"

        return state

    def ddpg_run(self):
        """
            DDPG algorithm 을 raw pixel data(screenshot)에 대해서 돌린다
        """

        info = self.ar.configure(ClientActionRobot.intToByteArray(self.id))
        self.solved = np.zeros(info[2])
        self.checkMyScore()
        print "## current level : %d" % self.currentLevel

        # DDPG
        # randomly initialize the critic, actor, target critic, and target actor networks,
        # and initialize the experience memory as a deque
        exploration_noise = OUNoise(
            self.num_actions, self.noise_mean, self.noise_sigma)
        counter = 1
        reward_per_episode = 0      # an episode means one round of the game
        total_reward = 0
        print "# of States : ", self.num_states
        print "# of Actions : ", self.num_actions

        # saving reward
        reward_st = np.array([0])

        # train for the number of episodes set by the parameter
        for i in xrange(episodes):

            # get the next level
            self.currentLevel = self.getNextLevel()
            # if the fetched level is 1-3, load it; otherwise reset to level 1 and load that
            if self.currentLevel < 4:
                self.ar.loadLevel(self.currentLevel)
            else:
                self.currentLevel = 1
                self.ar.loadLevel(self.currentLevel)

            prevscore = 0
            reward_per_episode = 0
            steps = 0
            print "======== Starting Episode No : ", (i + 1), "========", "\n"

            # loop over a single episode
            while True:

                # get a gameplay screenshot
                screenshot = self.ar.doScreenShot()
                x = self.getScreenBuffer(screenshot, self.width, self.height)
                # get the next action via actor evaluation
                action = self.ddpg.evaluate_actor(np.reshape(
                    x, [1, self.num_states[0], self.num_states[1], self.num_states[2]]))
                print "## Get Action from network!! : ", action
                action = action[0]
                noise = exploration_noise.noise()
                # choose the action according to the current policy, but take
                # exploratory actions stochastically depending on the
                # magnitude of the epsilon (noise) term.
                action = action + noise
                print action
                # if the distance is negative, flip it to positive.
                action[0] = action[0] if action[0] > self.action_space_low[0] else -action[0]
                # if the distance exceeds the maximum range, cap it at the maximum.
                action[0] = action[0] if action[0] < self.action_space_high[0] else self.action_space_high[0]
                # apply the same treatment to the angle
                action[1] = action[1] if action[1] > self.action_space_low[1] else -action[1]
                action[1] = action[1] if action[1] < self.action_space_high[1] else self.action_space_high[1]
                # and likewise for the tap time
                action[2] = action[2] if action[2] > self.action_space_low[2] else -action[2]
                action[2] = action[2] if action[2] < self.action_space_high[2] else self.action_space_high[2]
                print "## Action at step ", steps, " :", action, "\n"
                # shoot() waits a little after the shot until the score stabilizes
                state = self.shoot(action)

                if state == state.WON or state == state.LOST:
                    # when the episode (one level) ends
                    print "## Episode End"

                    screenshot = self.ar.doScreenShot()
                    observation = self.getScreenBuffer(
                        screenshot, self.width, self.height)

                    # a reward is given on a win, none on a loss
                    if state == state.WON:
                        score = self.se.getScoreEndGame(screenshot)
                        # use the score earned in this episode divided by 1000 as the reward
                        reward = (score - prevscore) / 1000.0
                    else:
                        reward = 0.00

                    self.currentLevel = self.currentLevel
                    self.firstShot = True   # reset firstShot when the episode ends
                    done = True             # mark the episode as done

                    # store s(t), s(t + 1), action, reward
                    # in the experience memory
                    print "######## SCORE : ", score
                    print "######## REWARD : ", reward
                    # x = state(screenBuffer) at t
                    # observation = state(screenBuffer) at (t + 1)
                    self.ddpg.add_experience(
                        x, observation, action, reward, done)

                    # train the critic and actor networks.
                    # training starts only after the preset number of steps,
                    # because enough experience must be collected first.
                    if counter > TRAIN_STEP:
                        self.ddpg.train()
                    counter += 1
                    steps += 1

                    print "==== EPISODE: ", i, ' Steps: ', steps, ' Total Reward: ', reward_per_episode
                    print "Writing reward info into file..."
                    exploration_noise.reset()
                    # reward_st is an array; append the total reward earned
                    # in this round as the last element and
                    # write it out to a file
                    reward_st = np.append(reward_st, reward_per_episode)
                    np.savetxt("episodes_reward.txt", reward_st, newline="\n")
                    print "\n\n"

                    break

                elif state == state.PLAYING:    # while in the PLAYING state
                    screenshot = self.ar.doScreenShot()
                    vision = Vision(screenshot)
                    sling = vision.findSlingshotMBR()

                    while sling == None and self.ar.checkState() == state.PLAYING:
                        print "## No slingshot was detected. Please remove pop up or zoom out"
                        self.ar.fullyZoomOut()
                        screenshot = self.ar.doScreenShot()

                    # get S(t + 1)
                    observation = self.getScreenBuffer(
                        screenshot, self.width, self.height)
                    # store S(t), S(t + 1), action, reward
                    # in the experience memory
                    score = self.ar.getInGameScore(screenshot)
                    reward = (score - prevscore) / 1000.0
                    prevscore = score
                    done = False
                    reward_st = np.append(reward_st, reward)

                    self.ddpg.add_experience(
                        x, observation, action, reward, done)
                    print "## Add experience (action) (reward) (done)", action, reward, done

                    # train the critic and actor networks
                    if counter > TRAIN_STEP:
                        self.ddpg.train()
                    reward_per_episode += reward
                    counter += 1
                    steps += 1

                # exception handling for unexpected states
                elif state == state.LEVEL_SELECTION:
                    print "unexpected level selection page, go to the last current level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)
                elif state == state.MAIN_MENU:
                    print"unexpected main menu page, reload the level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)
                elif state == state.EPISODE_MENU:
                    print "unexpected episode menu page, reload the level: %d" % self.currentLevel
                    self.ar.loadLevel(self.currentLevel)

        total_reward += reward_per_episode  # accumulate reward across episodes
        avg_reward = total_reward / episodes
        print "## Average reward per episode is : ", avg_reward
Example #17
def main():
    experiment = 'MountainCarContinuous-v0'
    env = gym.make(experiment)
    steps = env.spec.timestep_limit
    assert isinstance(env.observation_space, Box)
    assert isinstance(env.action_space, Box)

    agent = DDPG(env, is_batch_norm)  #created before the loop, so all weights carry over across episodes
    #in other words, only one model is trained over the whole run.
    exploration_noise = OUNoise(env.action_space.shape[0])
    reward_per_episode = 0
    total_reward = 0
    counter = 0
    num_states = env.observation_space.shape[0] - 1
    num_actions = env.action_space.shape[0]
    #these are the dimensions of the state and the action

    print 'Number of States:', num_states
    print 'Number of Actions:', num_actions
    print 'Number of steps per episode:', steps

    if is_exploration == True:
        print("\nExploration phase for {} steps. ".format(exploration_steps))
        e_steps = 0
        while e_steps < exploration_steps:
            s = env.reset()
            one_step = 0
            done = False
            exploration_noise.reset()
            exp = []
            while not done:
                a = exploration_noise.noise()
                ss, r, done, _ = env.step(a)
                exp.append((s[:-1], a, ss[:-1], r, done))
                s = ss
                one_step += 1
                if one_step > 998:
                    break
            agent.add_experience(exp)
            e_steps += 1

    reward_st = np.array([0])  #this stores the reward of each episode

    for i in xrange(episodes):  #loop 1000 times in total
        print '====starting episode no:', i, '====', '\n'
        observation = env.reset()  #reset at each episode, but the model parameters are not reset
        reward_per_episode = 0
        LSTM_SIZE = 40
        statec_t1 = np.zeros((BATCH_SIZE, LSTM_SIZE))
        stateh_t1 = np.zeros((BATCH_SIZE, LSTM_SIZE))
        exp = []
        for t in xrange(steps):
            #env.render()
            x = [observation[0:num_states]]
            x = np.reshape(x * BATCH_SIZE, [BATCH_SIZE, num_states])
            actor, statec_t1, stateh_t1 = agent.evaluate_actor(
                x, statec_t1, stateh_t1)
            noise = exploration_noise.noise()
            #ra = random.random()
            if (i < 500):
                action = actor[0] + noise
            else:
                action = actor[0]
            observation, reward, done, info = env.step(action)
            #print 'Action at step',t,':',action,'reward:',reward,'\n'
            exp.append((x, action, observation[0:num_states], reward, done))

            if counter > 64:
                agent.train()
            counter += 1
            reward_per_episode += reward
            if (done or (t == steps - 1)):
                #one episode has finished
                agent.add_experience(exp)
                print 'EPISODE:', i, 'Steps', t, 'Total Reward:', reward_per_episode
                print 'Printing reward to file'
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline='\n')
                print '\n\n'
                break

    total_reward += reward_per_episode
    #compute the average here
    print "Average reward per episode {}".format(total_reward / episodes)
Example #18
def s2l():

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    num_states = feature_size  #num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)

    agent = DDPG(env, is_batch_norm, num_states, num_actions)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    total_reward = 0

    print("Number of Rollouts per episode:", num_rollouts)
    print("Number of Steps per roll out:", steps)
    reward_st = np.array([0])  #saving reward
    eval_metric_st = np.array([0])
    reward_st_all = np.array([0])  #saving reward after every step

    frame_obj = Frame_Feature()

    #activity_obj=Vid_Feature()
    demo_vid_array = demo_array_extractor(demo_folder)
    demo_features = frame_obj.video_feature_extractor(demo_vid_array)

    for episode in range(num_episodes):
        print("==== Starting episode no:", episode, "====", "\n")
        env.reset()  # Reset env in the beginning of each episode
        env.render()
        obs_img = env.render(mode='rgb_array')  # Get the observation
        obs_img = np.array(misc.imresize(obs_img, [112, 112, 3]))
        observation = np.array(frame_obj.frame_feature_extractor(obs_img))
        observation = observation.reshape(-1)
        reward_per_episode = 0

        for t in range(num_rollouts):

            reward_per_rollout = 0
            vid_robo_ = []

            for i in range(steps):

                x = observation

                action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
                noise = exploration_noise.noise()
                action = action[
                    0] + noise  #Select action according to current policy and exploration noise
                print('Action at episode-', episode, 'rollout-', t, 'step-', i,
                      " :", action)

                _, _, done, info = env.step(action)
                env.render()
                obs_robo_ = env.render(mode='rgb_array')  # Get the observation
                obs_robo = misc.imresize(obs_robo_, [112, 112, 3])
                vid_robo_.append(obs_robo)
                observation = np.array(
                    frame_obj.frame_feature_extractor(np.array(obs_robo)))
                observation = observation.reshape(-1)
                #pasue()

                if (i == 15):
                    vid_robo = np.array(vid_robo_)
                    robo_features = frame_obj.video_feature_extractor(vid_robo)
                    reward = -(distance(demo_features, robo_features))
                    reward = np.array(reward)
                    print('reward: ', reward)
                else:
                    reward = 0
                    reward = np.array(reward)
                    print('reward: ', reward)

                # Printing eval_metric after every rollout
                eval_metric = np.array(env.get_eval())
                eval_metric = eval_metric.reshape(-1)
                print('Distance to goal:', eval_metric)
                eval_metric_st = np.append(eval_metric_st, eval_metric)
                np.savetxt('eval_metric_per_step.txt',
                           eval_metric_st,
                           newline="\n")

                # Storing reward after every rollout
                reward_st_all = np.append(reward_st_all, reward)
                np.savetxt('reward_all.txt', reward_st_all, newline="\n")

                #add s_t,s_t+1,action,reward to experience memory
                agent.add_experience(x, observation, action, reward, False)
                reward_per_rollout += reward
                counter += 1

            #train critic and actor network
            if counter > start_training:
                agent.train()
            print('\n\n')

            #Saving policy
            if ((episode % 50) == 0 and t == num_rollouts - 1):
                print('saving policy...........................!')
                agent.save_actor(episode)

            reward_per_episode += reward_per_rollout

        #check if episode ends:

        print('EPISODE: ', episode, ' Total Reward: ', reward_per_episode)
        print("Printing reward to file")
        exploration_noise.reset(
        )  #reinitializing random noise for action exploration
        reward_st = np.append(reward_st, reward_per_episode)
        np.savetxt('episode_reward.txt', reward_st, fmt='%f', newline="\n")
        print('\n\n')

        total_reward += reward_per_episode

    print("Average reward per episode {}".format(total_reward / num_episodes))
Example #19
def main():
    enable_actuator_dynamics = True
    env = ControlSystem(enable_actuator_dynamics=enable_actuator_dynamics)

    steps = env.timestep_limit  #steps per episode
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    #Randomly initialize critic,actor,target critic, target actor network  and replay buffer
    agent = DDPG(env, is_batch_norm)

    # agent.load_model()

    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    log_dir = os.path.join(os.getcwd(), 'logs',
                           datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                           'action')

    if enable_actuator_dynamics == True:
        filtered_log_dir = os.path.join(
            os.getcwd(), 'logs',
            datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), 'filtered_action')

    y_hat_log_dir = os.path.join(os.getcwd(), 'logs',
                                 datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                                 'y_hat')

    y_ref_log_dir = os.path.join(os.getcwd(), 'logs',
                                 datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
                                 'y_ref')

    gen_function_log_dir = os.path.join(
        os.getcwd(), 'logs',
        datetime.now().strftime('%Y-%m-%d_%H-%M-%S'), 'function')

    os.makedirs(log_dir)
    if enable_actuator_dynamics == True:
        os.makedirs(filtered_log_dir)
    os.makedirs(y_hat_log_dir)
    os.makedirs(y_ref_log_dir)
    os.makedirs(gen_function_log_dir)

    for i in range(episodes):
        print("==== Starting episode no:", i, "====")
        observation = env.reset()
        reward_per_episode = 0
        actions_per_episode = []
        if enable_actuator_dynamics == True:
            filtered_action_per_episode = []

        for t in range(steps):
            #rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))

            noise = exploration_noise.noise()

            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            actions_per_episode.append(action)
            # if i % 100 == 0:
            #     print ("Action at step", t ," :",action,"\n")
            # print("#", action[0])
            if action[0] < 0:
                action = [0]
            elif action[0] > 1:
                action = [1]

            # print("Step", t, 'action', action)

            if enable_actuator_dynamics == False:
                observation, reward, Y_plot, t_plot, y_ref, random_function = env.step(
                    action, t)
            elif enable_actuator_dynamics == True:
                observation, reward, filtered_action, Y_plot, t_plot, y_ref, random_function = env.step(
                    action, t)
                filtered_action_per_episode.append(filtered_action)

            # print ("Reward at step", t ," :",reward,"\n")
            #add y_t,y_t-1,action,reward,timestep to experience memory
            agent.add_experience(x, observation, action, reward, t)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (t == steps - 1):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ',
                      reward_per_episode)
                # print ("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")

                # print("Y_plot")
                # plt.step(t_plot,Y_plot)
                # plt.grid()
                # plt.xlabel('t')
                # plt.ylabel('y')
                # plt.show()

                # Save actions
                np.savetxt(log_dir + '/' + str(i).zfill(7) + '.txt',
                           actions_per_episode)
                if enable_actuator_dynamics == True:
                    np.savetxt(
                        filtered_log_dir + '/' + str(i).zfill(7) + '.txt',
                        filtered_action_per_episode)
                np.savetxt(y_hat_log_dir + '/' + str(i).zfill(7) + '.txt',
                           Y_plot)
                np.savetxt(y_ref_log_dir + '/' + str(i).zfill(7) + '.txt',
                           y_ref)
                # np.savetxt(gen_function_log_dir + '/' + str(i).zfill(7) + '.txt', random_function)

                # save model
                if i % 100 == 0:
                    print('save')
                    agent.save_model()
                # print ('\n\n')

                break

    total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))