Example #1
    def __init__(self, goal_theta_num=0, goal_theta_den=1):
        self.goal_theta_num = goal_theta_num
        self.goal_theta_den = goal_theta_den
        self.goal_theta = goal_theta_num / goal_theta_den

        self.env = PendulumEnv(goal_theta=self.goal_theta)

        self.save_directory = 'saved_policies'

        self.epsilon = .2
        self.gamma = .99

        self.num_avail_actions = 31
        self.num_avail_positions = 51
        self.num_avail_velocities = 51

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_avail_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_avail_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_avail_actions)

        self.q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))

        self.converge_threshold = 0.05 # fraction of a q-value that dq must stay within for that entry to count as converged
        self.perc_conv_thresh = 0.8 # fraction of q-values that must pass the convergence threshold

        self.percent_converged = 0
        self.percent_unexplored = 0

        self.prev_q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        )) # previous q-matrix

        self.dq_matrix = 100 * np.ones((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        )) # delta-q matrix, tracks amount each weight is being updated

        self.data = dict()
        self.start_time = 0
        self.total_time = 0

        self.ep_rewards = []
        self.perc_unexplored_arr = []
        self.perc_conv_arr = []
Example #2
    def __init__(self, data_dictionary=None, policy_name=None, policy_directory=None):

        self.save_directory = 'rory_data'

        self.trainer = QLearning()
        self.data = data_dictionary

        self.num_actions = self.trainer.num_avail_actions
        self.num_positions = self.trainer.num_avail_positions
        self.num_velocities = self.trainer.num_avail_velocities

        if data_dictionary is None:
            if policy_directory is None:
                self.load_directory = self.trainer.save_directory
            else:
                self.load_directory = policy_directory

            if policy_name is None:
                self.policy_name = self.grab_newest_policy()
            else:
                self.policy_name = policy_name

            # goal_theta_num, goal_theta_den = self.get_goal_theta(self.policy_name)

            # self.goal_theta = goal_theta_num / goal_theta_den
            self.goal_theta = np.pi / 4

            self.file = os.path.join(self.load_directory, self.policy_name)
            self.policy = self.load_policy()
            self.data = dict()
            

        else:
            self.goal_theta = self.data['goal_theta']
            self.policy = self.data['policy']
            self.file = self.data['fname']
            self.policy_name = self.file

        self.env = PendulumEnv(goal_theta=self.goal_theta)

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_actions)

        self.dummy_q = self.trainer.q_matrix
        self.num_useful_actions = 0

        self.torques = []
        self.theta_errors = []
Example #3
    def __init__(self):
        self.env = PendulumEnv()
        self.env.reset()
        self.action_number = 101
        self.state_space = 2
        self.epsilon = 0.2
        self.bias = 0
        self.gamma = 0.99
        min_torque = -self.env.max_torque
        max_torque = self.env.max_torque
        self.actions = []
        for i in range(self.action_number):
            self.actions.append(min_torque + (max_torque - min_torque) /
                                (self.action_number - 1) * i)
        print(self.actions)
        self.weight_matrix = np.zeros((len(self.actions), self.state_space))
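
The loop above fills self.actions with an evenly spaced torque grid; a one-line equivalent (a small sketch using np.linspace with the same endpoints and count, not part of the original snippet) would be:

        self.actions = np.linspace(min_torque, max_torque, self.action_number).tolist()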
Example #4
def render_test(torque_type=0):
    '''
    torque_types:
        - 0 : square wave torque to make the pendulum oscillate back and forth
        - 1 : some constant value torque
        - 2 (or anything else) : random torque
    '''

    env = PendulumEnv()
    env.reset()

    at_rest = True
    val = 0  # default torque so val is defined before the pendulum first reaches an angle limit

    try:
        for _ in range(500):
            env.render()

            if torque_type == 0:

                if env.state[0] == env.angle_limit and at_rest:
                    val = env.max_torque
                    at_rest = False

                elif env.state[0] == -env.angle_limit and at_rest:
                    val = -env.max_torque
                    at_rest = False

                if abs(env.state[0]) == env.angle_limit and not at_rest:
                    at_rest = True

                u = np.array([val]).astype(env.action_space.dtype)

            elif torque_type == 1:
                val = -43
                u = np.array([val]).astype(env.action_space.dtype)

            else:
                u = env.action_space.sample()

            info = env.step(u)
            # print(info)
            # print(env.state[0]) # print angular position
            print(env.state[1])  # print angular velocity

            time.sleep(.1)

    except KeyboardInterrupt:
        pass

    env.close()


def main():
    dummy_env = PendulumEnv()
    start_pos = dummy_env.angle_limit

    pol_dir = 'saved_policies'
    fname = 'interp316_pi84.npy'
    sim = Simulator(policy_directory=pol_dir, policy_name=fname)
    sim.simulate(ep_num=1, iter_num=200, start_pos=start_pos, start_vel=0)
    sim.save_precious_simulated_data()
Example #6
    def __init__(self, netsize, Nsensors=1, Nmotors=1):  # Create ising model

        self.size = netsize  # Network size
        self.Ssize = Nsensors  # Number of sensors
        self.Msize = Nmotors  # Number of motors

        self.h = np.zeros(netsize)
        self.J = np.zeros((netsize, netsize))
        self.max_weights = 2

        self.randomize_state()

        self.env = PendulumEnv()

        self.observation = self.env.reset()

        self.Beta = 1.0
        self.defaultT = max(100, netsize * 20)

        self.Ssize1 = 0
        self.maxspeed = self.env.max_speed
        self.Update(-1)
Example #7
class QLearning():

    def __init__(self, goal_theta_num=0, goal_theta_den=1):
        self.goal_theta_num = goal_theta_num
        self.goal_theta_den = goal_theta_den
        self.goal_theta = goal_theta_num / goal_theta_den

        self.env = PendulumEnv(goal_theta=self.goal_theta)

        self.save_directory = 'saved_policies'

        self.epsilon = .2
        self.gamma = .99

        self.num_avail_actions = 31
        self.num_avail_positions = 51
        self.num_avail_velocities = 51

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_avail_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_avail_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_avail_actions)

        self.q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))

        self.converge_threshold = 0.05 # fraction of a q-value that dq must stay within for that entry to count as converged
        self.perc_conv_thresh = 0.8 # fraction of q-values that must pass the convergence threshold

        self.percent_converged = 0
        self.percent_unexplored = 0

        self.prev_q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        )) # previous q-matrix

        self.dq_matrix = 100 * np.ones((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        )) # delta-q matrix, tracks amount each weight is being updated

        self.data = dict()
        self.start_time = 0
        self.total_time = 0

        self.ep_rewards = []
        self.perc_unexplored_arr = []
        self.perc_conv_arr = []
    

    def getQMatrixIdx(self, th, thdot, torque):
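        # map the continuous (th, thdot, torque) values to the nearest indices on the discretized grids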
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()

        return torIdx, thIdx, thdotIdx


    def getMaxQValue(self, th, thdot):
        # returns the action (torque) index with the highest q-value for the given th, thdot indices, and that q-value
        maxQValIdx = self.q_matrix[:, th, thdot].argmax()
        maxQVal = self.q_matrix[maxQValIdx, th, thdot]

        return maxQValIdx, maxQVal


    def get_action(self, th, thdot):

        random_float = random.random()
        if (random_float < self.epsilon): # if a random float is less than our epsilon, explore a random action
            chosen_idx = np.random.randint(0, self.num_avail_actions)

        else: # if the random float is not less than epsilon, exploit the policy
            _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
            chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)

        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)

        return u
    

    def check_converged(self):
        # total number of elements
        total_elements = self.q_matrix.size
        
        # relative magnitude of the last update to each q-value (entries where q == 0 default to 1)
        percent_changed = np.divide(np.abs(self.dq_matrix), np.abs(self.q_matrix),
                        out=np.ones(self.q_matrix.shape),
                        where=self.q_matrix != 0)

        # compute the number of 'converged' q-values
        num_converged = (percent_changed < self.converge_threshold).sum()
        # percentage of converged q-values
        self.percent_converged = num_converged / total_elements

        self.percent_unexplored = (self.dq_matrix == 100).sum() / self.dq_matrix.size

        if self.percent_converged >= self.perc_conv_thresh:
            return True

        else:
            return False


    def get_convergence_stats(self):
        # returns a dictionary where the keys are threshold values (from [0-0.95] in 0.05 step size)
        # and the values are the percentage of the q-matrix that meets this threshold of convergence
        conv_stats = dict()
        total_elements = self.q_matrix.size
        conv_threshold = 0.05

        # relative magnitude of the last update to each q-value (loop-invariant, so computed once)
        percent_changed = np.divide(np.abs(self.dq_matrix), np.abs(self.q_matrix),
                        out=np.ones(self.q_matrix.shape),
                        where=self.q_matrix != 0)

        for i in range(20):
            new_threshold = conv_threshold * i
            # number of 'converged' q-values at this threshold
            num_converged = (percent_changed < new_threshold).sum()
            # percentage of converged q-values
            percent_converged = num_converged / total_elements

            conv_stats[new_threshold] = percent_converged

        return conv_stats


    def train(self, episodes=15000, max_iterations=100000, l_rate=0.1):
        self.start_time = time.time()

        for episode_num in range(episodes):

            self.episodes = episode_num
            self.iterations = max_iterations
            self.l_rate = l_rate
            
            # reset the environment and declare th,thdot
            th, thdot = self.env.reset()

            iter_count = -1
            total_reward = 0

            while(not self.env.is_done and iter_count < max_iterations):
                iter_count += 1

                # select a new action to take
                u = self.get_action(th, thdot)

                # find the current indices in self.q_matrix so that we can update the q-value for this (th, thdot, u) triple
                currTorIdx, currThIdx, currThdotIdx = self.getQMatrixIdx(th, thdot, u)

                # find next state corresponding to chosen action
                nextTh, nextThdot, reward = self.env.step(u)
                total_reward += reward

                _, nextThIdx, nextThdotIdx = self.getQMatrixIdx(nextTh, nextThdot, u)

                # find the highest q-value in self.q_matrix for the next state (nextTh, nextThdot)
                _, nextQVal = self.getMaxQValue(nextThIdx, nextThdotIdx)

                self.q_matrix[currTorIdx, currThIdx, currThdotIdx] += l_rate * (reward + self.gamma * nextQVal \
                                                                    - self.q_matrix[currTorIdx, currThIdx, currThdotIdx])

                self.dq_matrix[currTorIdx, currThIdx, currThdotIdx] = self.q_matrix[currTorIdx, currThIdx, currThdotIdx] \
                                                                    - self.prev_q_matrix[currTorIdx, currThIdx, currThdotIdx]

                self.prev_q_matrix[currTorIdx, currThIdx, currThdotIdx] = self.q_matrix[currTorIdx, currThIdx, currThdotIdx]

                th = nextTh
                thdot = nextThdot
                    
                if iter_count % 100 == 0:
                    # self.env.render()
                    print('iter_count = ', iter_count)
                    print('episode = ', episode_num)
                    print('epsilon = ', self.epsilon)
                    print(f'percent converged = {round(self.percent_converged, 2)}')
                    print(f'percent unexplored = {round(self.percent_unexplored, 2)}')
                    print("Time Elapsed: ",time.strftime("%H:%M:%S",time.gmtime(time.time()-self.start_time)))
                    print('')
                    
            converged = self.check_converged()
            
            if converged:
                print(f'Converged on episode {episode_num}')
                break

            if episode_num % 10 == 0:
                self.ep_rewards.append(total_reward)
                self.perc_unexplored_arr.append(self.percent_unexplored)
                self.perc_conv_arr.append(self.percent_converged)

        
        self.total_time = time.time() - self.start_time
        self.print_stuff()
        self.save_policy()
        self.get_precious_data()

    
    def increase_epsilon_maybe(self, ep_num):
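        # despite its name, this decays epsilon by 0.006 every 1000 episodes,
        # gradually shifting the policy from exploration toward exploitation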
        if ep_num % 1000 == 0 and ep_num != 0:
            self.epsilon -= .006

    
    def save_policy(self):
        time_struct = time.localtime(time.time())
        fname = self.get_fname(time_struct)
        self.data['fname'] = fname
        save_path = os.path.join(self.save_directory, fname)

        np.save(save_path, self.q_matrix)

        print(f'saved policy: {fname}')


    def get_fname(self, time_params):
        year = time_params.tm_year
        month = time_params.tm_mon
        day = time_params.tm_mday
        hour = time_params.tm_hour
        minute = time_params.tm_min
        sec = time_params.tm_sec

        if isclose(self.goal_theta_num, np.pi, abs_tol=1e-7):
            num = 'pi'
        elif isclose(self.goal_theta_num, -np.pi, abs_tol=1e-7):
            num = 'ip'
        else:
            num = self.goal_theta_num

        if isclose(self.goal_theta_den, np.pi, abs_tol=1e-7):
            den = 'pi'
        elif isclose(self.goal_theta_den, -np.pi, abs_tol=1e-7):
            den = 'ip'
        else:
            den = self.goal_theta_den

        fname = f'{year}_{month}_{day}_{hour}_{minute}_{sec}_{num}_{den}'

        return fname
    

    def print_stuff(self):

        print("Training Done")
        print("Total Time Elapsed: ",time.strftime("%H:%M:%S",time.gmtime(time.time()-self.start_time)))
        print(f'percent converged = {round(self.percent_converged, 2)}')
        print(f'percent unexplored = {round(self.percent_unexplored, 2)}')


    def get_precious_data(self):
        self.data["gamma"] = self.gamma
        self.data["epsilon"] = self.epsilon
        self.data["goal_theta"] = self.goal_theta
        self.data["perc_actions_explored"] = 1 - self.percent_unexplored
        self.data["perc_converged"] = self.percent_converged
        self.data["conv_perc_req"] = self.perc_conv_thresh # % of q-values that must pass the convergence threshold
        self.data["converge_threshold"] = self.converge_threshold # % of q-value that dq must be close to to be considered converged
        self.data["training_time"] = self.total_time
        self.data["training_episodes"] = self.episodes
        self.data["training_iterations"] = self.iterations
        self.data["learning_rate"] = self.l_rate
        self.data["policy"] = self.q_matrix
        self.data["converged_stats"] = self.get_convergence_stats()
        self.data["ep_rewards_arr"] = self.ep_rewards
        self.data["perc_unexplored_arr"] = self.perc_unexplored_arr
        self.data["perc_converged_arr"] = self.perc_conv_arr
Example #8
import numpy as np
import datetime
import time
import matplotlib.pyplot as plt
from pendulum import PendulumEnv
from pendulumAgent import DankAgent


def print_timestamp(string=""):
    now = datetime.datetime.now()
    print(string + now.strftime("%Y-%m-%d %H:%M"))


####### INITIALISATION #########################################################
weights_file = "network.h5"
load_existent_model = False
env = PendulumEnv()
env.cnt += 1
agent = DankAgent([-env.max_torque, env.max_torque])
agent.model.summary()
if load_existent_model:
    agent.load(weights_file)

nepisodes = 0
######## CONSTANTS ############################################################
# default: 50000
MEMORY_SIZE = 80000
memory = []
# default: 30000
TRAINING_EPISODES = 30
AUTO_SAVER = 50
SHOW_PROGRESS = 10
Example #9
#!/usr/bin/python3
import matplotlib.pyplot as plt
import gym
import sys
from pendulum import PendulumEnv, angle_normalize
import random
from random import randrange
import numpy as np
from pendulumAgent import DankAgent
import datetime
import time

####### INITIALISATION #########################################################
weights_file = "network.h5"
env = PendulumEnv()
env.cnt += 1
state = env.reset()
state = state.reshape((1, 3))

action_dim = 25
action_high = env.action_space.high[0]
action_low = env.action_space.low[0]
action_sequence = np.linspace(action_low, action_high, action_dim)

agent = DankAgent(env.observation_space.shape[0], action_dim)
agent.model.summary()

nepisodes = 0
batch = []
TRAINING_EPISODES = 500
steps = 200
Example #10
import numpy as np
import datetime
import time
import argparse
import random
import sys
import matplotlib.pyplot as plt
from keras.models import load_model
from pendulum import PendulumEnv
from pendulumAgent import DankAgent, print_timestamp
#sys.path.append('..')

parser = argparse.ArgumentParser()



env = PendulumEnv()
env.cnt += 1
input_shape = 3
batch_size = 64
test_agent = DankAgent([-env.max_torque,env.max_torque],input_shape,batch_size)
test_agent.load('network_vanilla.h5')

list_episode_reward = []

TESTING_EPISODES = 10

for ep in range(TESTING_EPISODES):
    state = env.reset()
    episode_reward = 0
    for t in range(500):
        env.render()
		plt.xlabel("Episode")
		plt.ylabel("Episode Reward")
		plt.title("Evaluation - Episode Reward over Time")
		fig2.savefig('new_results/' + 'evaluation_reward.png')

		if noshow:
			plt.close(fig2)
		else:
			plt.show(fig2)

if __name__ == "__main__":

	stats = []

	reward_func = True
	env = PendulumEnv(reward_function = reward_func)
	state_space_size = env.observation_space.shape[0]
	action_space_size = env.action_space.shape[0]
	action_bound = 2
	

	max_time_per_episode = 200
	num_episodes = 5000
	discount_factor = 0.99
	batch_size = 64
	tau = 0.001

	train = 10
	stats = []
	actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_space_size))
	replay_memory = ReplayBuffer(num_episodes)
Example #12
class QLearning():
    def __init__(self):
        self.env = PendulumEnv()
        self.env.reset()
        self.action_number = 101
        self.state_space = 2
        self.epsilon = 0.2
        self.bias = 0
        self.gamma = 0.99
        min_torque = -self.env.max_torque
        max_torque = self.env.max_torque
        self.actions = []
        for i in range(self.action_number):
            self.actions.append(min_torque + (max_torque - min_torque) /
                                (self.action_number - 1) * i)
        print(self.actions)
        self.weight_matrix = np.zeros((len(self.actions), self.state_space))

    def train(self, episodes=10, max_iterations=20000, learning_rate=0.01):
        for episodes_number in range(episodes):
            total_reward = 0
            curr_state_info = self.env.reset()
            curr_state = np.zeros(self.state_space)
            for i in range(self.state_space):
                curr_state[i] = curr_state_info[i]

            #print (curr_state)
            for iteration_number in range(max_iterations):
                time.sleep(0.1)
                print(curr_state)
                random_float = random.random()
                if (random_float < self.epsilon):
                    action = np.random.randint(0, self.action_number)  # high bound is exclusive, so every action can be sampled
                else:
                    action = -1
                    max_q_s_a_w = -sys.float_info.max
                    for action_iter in range(self.action_number):
                        q_s_a_w_iter = np.dot(
                            curr_state,
                            self.weight_matrix[action_iter]) + self.bias
                        if (q_s_a_w_iter > max_q_s_a_w):
                            max_q_s_a_w = q_s_a_w_iter
                            action = action_iter

                u = np.array([self.actions[action]
                              ]).astype(self.env.action_space.dtype)
                #u = np.array([500]).astype(self.env.action_space.dtype)

                next_state_info, reward, isDone, _ = self.env.step(u)
                print("reward : ", reward)
                print(self.actions[action])
                print("")

                next_state = np.zeros((self.state_space))
                for i in range(self.state_space):
                    next_state[i] = float(next_state_info[i])

                max_q_s_a_w = -sys.float_info.max
                for action_iter in range(self.action_number):
                    q_s_a_w_iter = np.dot(
                        next_state,
                        self.weight_matrix[action_iter]) + self.bias
                    if (q_s_a_w_iter > max_q_s_a_w):
                        max_q_s_a_w = q_s_a_w_iter

                gradient_matrix_w = np.zeros(
                    (self.action_number, self.state_space))
                gradient_matrix_w[action] = curr_state

                copy_of_weight_matrix = copy.deepcopy(self.weight_matrix)
                copy_of_bias = copy.deepcopy(self.bias)

                self.weight_matrix = self.weight_matrix - learning_rate * (
                    (np.dot(curr_state, copy_of_weight_matrix[action]) +
                     copy_of_bias) -
                    (reward + self.gamma * max_q_s_a_w)) * gradient_matrix_w
                self.bias = self.bias - learning_rate * (
                    (np.dot(curr_state, copy_of_weight_matrix[action]) +
                     copy_of_bias) - (reward + self.gamma * max_q_s_a_w)) * 1.0

                curr_state = next_state
                total_reward += reward

                if (True or episodes_number % 100 == 0):
                    self.env.render()

        self.env.close()
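
In plain terms, the update above is semi-gradient Q-learning with a linear approximator Q(s, a) = w_a · s + bias: each step moves w_a by -learning_rate · [(w_a · s + bias) - (reward + gamma · max_a' (w_a' · s' + bias))] · s, since the gradient of Q(s, a) with respect to w_a is just the state s (which is what gradient_matrix_w encodes), and the bias takes the same TD-error step.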
Example #13
class ising:
    # Initialize the network
    def __init__(self, netsize, Nsensors=1, Nmotors=1):  # Create ising model

        self.size = netsize  # Network size
        self.Ssize = Nsensors  # Number of sensors
        self.Msize = Nmotors  # Number of motors

        self.h = np.zeros(netsize)
        self.J = np.zeros((netsize, netsize))
        self.max_weights = 2

        self.randomize_state()

        self.env = PendulumEnv()

        self.observation = self.env.reset()

        self.Beta = 1.0
        self.defaultT = max(100, netsize * 20)

        self.Ssize1 = 0
        self.maxspeed = self.env.max_speed
        self.Update(-1)

    def get_state(self, mode='all'):
        if mode == 'all':
            return self.s
        elif mode == 'motors':
            return self.s[-self.Msize:]
        elif mode == 'sensors':
            return self.s[0:self.Ssize]
        elif mode == 'input':
            return self.sensors
        elif mode == 'non-sensors':
            return self.s[self.Ssize:]
        elif mode == 'hidden':
            return self.s[self.Ssize:-self.Msize]

    def get_state_index(self, mode='all'):
        return bool2int(0.5 * (self.get_state(mode) + 1))

    # Randomize the state of the network
    def randomize_state(self):
        self.s = np.random.randint(0, 2, self.size) * 2 - 1
        self.sensors = np.random.randint(0, 2, self.Ssize) * 2 - 1

    # Randomize the position of the agent
    def randomize_position(self):
        self.observation = self.env.reset()

    # Set random bias to sets of units of the system
    def random_fields(self, max_weights=None):
        if max_weights is None:
            max_weights = self.max_weights
        self.h[self.Ssize:] = max_weights * \
         (np.random.rand(self.size - self.Ssize) * 2 - 1)

    # Set random connections to sets of units of the system
    def random_wiring(self, max_weights=None):  # Set random values for h and J
        if max_weights is None:
            max_weights = self.max_weights
        for i in range(self.size):
            for j in np.arange(i + 1, self.size):
                if i < j and (i >= self.Ssize or j >= self.Ssize):
                    self.J[i,
                           j] = (np.random.rand(1) * 2 - 1) * self.max_weights

    # Update the position of the agent
    def Move(self):
        self.previous_speed = self.observation[1]
        action = np.sum(self.s[-self.Msize:]) / self.Msize
        observation, reward, done, info = self.env.step(action)

        self.observation = self.env.state
        self.positionX = np.cos(self.env.state[0]) * self.env.l
        self.positionY = np.sin(self.env.state[0]) * self.env.l

        self.speed = self.env.state[1]

    # Transform the sensor input into an integer index
    def SensorIndex(self, x, xmax):
        return int(
            np.floor((x + xmax) / (2 * xmax + 10 * np.finfo(float).eps) *
                     2**self.Ssize))

    # Update the state of the sensor
    def UpdateSensors(self):
        self.speed_ind = self.SensorIndex(self.speed, self.maxspeed)
        self.sensors = 2 * bitfield(self.speed_ind, self.Ssize) - 1

    # Execute step of the Glauber algorithm to update the state of one unit
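    # The flip test below implements the Glauber rule: unit i flips with probability
    # 1 / (1 + exp(Beta * eDiff)); drawing r ~ U(0, 1) and flipping when
    # Beta * eDiff < log(1/r - 1) samples exactly that probability.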
    def GlauberStep(self, i=None):
        if i is None:
            i = np.random.randint(self.size)

        I = 0
        if i < self.Ssize:
            I = self.sensors[i]
        eDiff = 2 * self.s[i] * (self.h[i] + I +
                                 np.dot(self.J[i, :] + self.J[:, i], self.s))
        if eDiff * self.Beta < np.log(1 / np.random.rand() - 1):  # Glauber
            self.s[i] = -self.s[i]

    # Update random unit of the agent
    def Update(self, i=None):
        if i is None:
            i = np.random.randint(-1, self.size)
        if i == -1:
            self.Move()
            self.UpdateSensors()
        else:
            self.GlauberStep(i)

    # Sequentially update state of all units of the agent in random order
    def SequentialUpdate(self):
        for i in np.random.permutation(range(-1, self.size)):
            self.Update(i)

    # Step of the learning algorithm to adjust correlations to the critical regime
    def AdjustCorrelations(self, T=None):
        if T is None:
            T = self.defaultT
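        # note: self.m1 and self.Cint (the target mean and correlation statistics)
        # are assumed to be set elsewhere before this method is called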

        self.m = np.zeros(self.size)
        self.c = np.zeros((self.size, self.size))
        self.C = np.zeros((self.size, self.size))

        # Main simulation loop:
        samples = []
        for t in range(T):

            self.SequentialUpdate()
            #self.x[t] = self.position
            self.m += self.s
            for i in range(self.size):
                self.c[i, i + 1:] += self.s[i] * self.s[i + 1:]
        self.m /= T
        self.c /= T
        for i in range(self.size):
            self.C[i, i + 1:] = self.c[i, i + 1:] - self.m[i] * self.m[i + 1:]

        c1 = np.zeros((self.size, self.size))
        for i in range(self.size):
            inds = np.array([], int)
            c = np.array([])
            for j in range(self.size):
                if not i == j:
                    inds = np.append(inds, [j])
                if i < j:
                    c = np.append(c, [self.c[i, j]])
                elif i > j:
                    c = np.append(c, [self.c[j, i]])
            order = np.argsort(c)[::-1]
            c1[i, inds[order]] = self.Cint[i, :]
        self.c1 = np.triu(c1 + c1.T, 1)
        self.c1 *= 0.5

        self.m[0:self.Ssize] = 0
        self.m1[0:self.Ssize] = 0
        self.c[0:self.Ssize, 0:self.Ssize] = 0
        self.c[-self.Msize:, -self.Msize:] = 0
        self.c[0:self.Ssize, -self.Msize:] = 0
        self.c1[0:self.Ssize, 0:self.Ssize] = 0
        self.c1[-self.Msize:, -self.Msize:] = 0
        self.c1[0:self.Ssize, -self.Msize:] = 0
        dh = self.m1 - self.m
        dJ = self.c1 - self.c
        #print(self.m1,self.m)
        #print("dh, dJ:",dh,",",dJ)
        return dh, dJ

    # Algorithm for poising an agent in a critical regime
    def CriticalLearning(self, Iterations, T=None):
        u = 0.01
        count = 0
        dh, dJ = self.AdjustCorrelations(T)
        fit = max(np.max(np.abs(self.c1 - self.c)),
                  np.max(np.abs(self.m1 - self.m)))
        for it in range(Iterations):
            count += 1
            self.h += u * dh
            self.J += u * dJ
            if it % 10 == 0:
                self.randomize_state()
                self.randomize_position()
            Vmax = self.max_weights
            for i in range(self.size):
                if np.abs(self.h[i]) > Vmax:
                    self.h[i] = Vmax * np.sign(self.h[i])
                for j in np.arange(i + 1, self.size):
                    if np.abs(self.J[i, j]) > Vmax:
                        self.J[i, j] = Vmax * np.sign(self.J[i, j])

            dh, dJ = self.AdjustCorrelations(T)
            fit = np.max(np.abs(self.c1 - self.c))


class Simulator():

    def __init__(self, data_dictionary=None, policy_name=None, policy_directory=None):

        self.save_directory = 'rory_data'

        self.trainer = QLearning()
        self.data = data_dictionary

        self.num_actions = self.trainer.num_avail_actions
        self.num_positions = self.trainer.num_avail_positions
        self.num_velocities = self.trainer.num_avail_velocities

        if data_dictionary is None:
            if policy_directory is None:
                self.load_directory = self.trainer.save_directory
            else:
                self.load_directory = policy_directory

            if policy_name is None:
                self.policy_name = self.grab_newest_policy()
            else:
                self.policy_name = policy_name

            # goal_theta_num, goal_theta_den = self.get_goal_theta(self.policy_name)

            # self.goal_theta = goal_theta_num / goal_theta_den
            self.goal_theta = np.pi / 4

            self.file = os.path.join(self.load_directory, self.policy_name)
            self.policy = self.load_policy()
            self.data = dict()
            

        else:
            self.goal_theta = self.data['goal_theta']
            self.policy = self.data['policy']
            self.file = self.data['fname']
            self.policy_name = self.file

        self.env = PendulumEnv(goal_theta=self.goal_theta)

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_actions)

        self.dummy_q = self.trainer.q_matrix
        self.num_useful_actions = 0

        self.torques = []
        self.theta_errors = []

    
    def load_policy(self):
        policy = np.load(self.file, allow_pickle=True)
        policy = policy.item().get('policy')
        return policy

    
    def grab_newest_policy(self):
        all_policies = os.listdir(self.load_directory)

        file_count = 0
        theta_dict = {}
        name_dict = {}

        for i, policy in enumerate(all_policies):

            if not policy.startswith('.'): # ignore hidden files such as .DS_Store on macOS
                fname, _ = os.path.splitext(policy)
                
                try:
                    gtheta = '_'.join(fname.split('_')[-2:])
                    fname = '_'.join(fname.split('_')[:-2])
                    time_components = np.array(list(map(int, fname.split('_'))))
                    file_count += 1
                except ValueError:
                    continue

                theta_dict[i] = gtheta

                if file_count == 1:
                    name_array = time_components
                else:
                    name_array = np.row_stack((name_array, time_components))

                name_dict[fname] = i

        while len(name_array.shape) > 1:
            col_idx_diff =  np.any(name_array != name_array[0,:], axis = 0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]
        
        newest_policy = name_array
        newest_policy = '_'.join(map(str, newest_policy))
        suffix_theta = theta_dict[name_dict[newest_policy]]
        newest_policy += '_' + suffix_theta + '.npy'
            
        return newest_policy
    

    def get_goal_theta(self, pol_name):
        # same parsing logic as grab_newest_policy(), except it extracts the goal theta
        # from the last two underscore-separated fields of the file name
        fname, _ = os.path.splitext(pol_name)

        if 'pi' in fname:
            if not fname.endswith('pi'): # 'pi' is the numerator field
                num = np.pi
            else: # 'pi' is the denominator (last) field
                den = np.pi

            fname = fname.replace('pi', '555')

        if 'ip' in fname:
            if not fname.endswith('ip'): # 'ip' is the numerator field
                num = -np.pi
            else: # 'ip' is the denominator (last) field
                den = -np.pi

            fname = fname.replace('ip', '555')
                       
        time_components = np.array(list(map(int, fname.split('_'))))
        name_array = time_components

        while len(name_array.shape) > 1:
            col_idx_diff =  np.any(name_array != name_array[0,:], axis = 0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]
        
        newest_policy = name_array
        temp_num = newest_policy[-2]
        temp_den = newest_policy[-1]

        if temp_num != 555:
            num = temp_num
        if temp_den != 555:
            den = temp_den

        return num, den


    def getQMatrixIdx(self, th, thdot, torque):
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()

        return torIdx, thIdx, thdotIdx


    def getMaxQValue(self, th, thdot):
        # returns the action (torque) index with the highest q-value for the given th, thdot indices, and that q-value
        maxQValIdx = self.policy[:, th, thdot].argmax()
        maxQVal = self.policy[maxQValIdx, th, thdot]

        return maxQValIdx, maxQVal


    def get_action(self, th, thdot):
        _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
        chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)

        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)

        return u

    
    def save_precious_simulated_data(self):
        self.data["simulated_episodes"] = self.num_episodes
        self.data["simulated_iterations"] = self.num_iterations
        self.data["avg_sim_cost"] = self.avg_cost
        self.data["perc_useful_actions"] = self.num_useful_actions
        self.data["torque_arr"] = self.torques
        self.data["theta_error_arr"] = self.theta_errors

        fname = self.policy_name

        save_path = os.path.join(self.save_directory, fname)

        if not os.path.exists(self.save_directory):
            os.mkdir(self.save_directory)

        np.save(save_path, self.data)

        print(f'saved precious data: {fname}')


    def update_dummy_q(self,torI,thI,thDotI):
        if self.dummy_q[torI,thI,thDotI] != 1000:
            self.dummy_q[torI,thI,thDotI] = 1000

    
    def simulate(self, ep_num=500, iter_num=150, start_pos=None, start_vel=None):
        print(f'Running simulation using policy: {self.file}')

        self.num_episodes = ep_num
        self.num_iterations = iter_num
        total_total_cost = 0

        try:

            for i in range(self.num_episodes):

                th, thdot = self.env.reset()

                if start_pos is not None:
                    self.env.state[0] = start_pos
                    th = start_pos
                
                if start_vel is not None:
                    self.env.state[1] = start_vel
                    thdot = start_vel

                for _ in range(self.num_iterations):

                    self.env.render()

                    self.theta_errors.append(self.goal_theta - th)

                    u = self.get_action(th, thdot)
                    self.torques.append(u)

                    torIdx,thIdx,thDotIdx = self.getQMatrixIdx(th, thdot, u)

                    self.update_dummy_q(torIdx,thIdx,thDotIdx)

                    nextTh, nextThdot, reward = self.env.step(u)
                    total_total_cost += reward

                    th = nextTh
                    thdot = nextThdot
                
                    time.sleep(.05)

                if i != self.num_episodes-1:
                    self.torques = []
                    self.theta_errors = []
        
        except KeyboardInterrupt:
            pass

        self.avg_cost = total_total_cost / self.num_episodes
        self.num_useful_actions = (self.dummy_q == 1000).sum() / self.dummy_q.size
        
        self.env.close()
Example #15
import numpy as np
from pendulum import PendulumEnv
from scipy.linalg import solve_continuous_are as riccati

def lqr(A, B, Q, R, N=None):
    # Solve the continuous-time algebraic Riccati equation
    # (N is the optional state-input cross-term, passed to scipy as `s`)
    P = riccati(A, B, Q, R, s=N)
    R_inv = np.linalg.inv(R)
    if N is not None:
        K = np.dot(R_inv, np.dot(B.T, P) + N.T)
    else:
        K = np.dot(R_inv, np.dot(B.T, P))
    return K

# create environment
env = PendulumEnv()

# reset environment
state = env.reset()
done = False

# Get linearized dynamics
A,B = env._linearize(np.pi)

# create cost matrices for LQR
Q = np.array([[1,0], [0,10]])
R = np.array([[0.001]])

# Compute gain matrix K
K = lqr(A,B,Q,R)
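
Example #15 computes the gain matrix K but never applies it. A minimal closed-loop sketch is shown below; it assumes this project's PendulumEnv exposes a gym-style step() returning (state, reward, done, info), a [theta, theta_dot] state with the upright equilibrium at theta = pi, and the max_torque / action_space attributes used in the other examples:

# hypothetical rollout applying the state-feedback law u = -K (x - x_ref)
x_ref = np.array([np.pi, 0.0])  # the linearization point used for A, B above
for _ in range(200):
    x = np.asarray(state).flatten()
    u = -K @ (x - x_ref)  # K has shape (1, 2), so u is a length-1 torque array
    u = np.clip(u, -env.max_torque, env.max_torque)
    state, reward, done, _ = env.step(u.astype(env.action_space.dtype))
    env.render()
env.close()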
Example #16
    N_rk4 = 10  # RK4 steps
    dt = 0.1  # time step in seconds
    N = 500  # horizon length
    gamma = 0.99

    costs = []
    acc_costs = []
    u_opt_f = []
    th_opt_f = []
    thdot_opt_f = []
    g_1_f = []
    g_2_f = []
    iter_f = []
    iter_total = 0

    with contextlib.closing(PendulumEnv(N_rk4=N_rk4, DT=dt)) as env:
        s = env.reset(fixed=True)  #fixed start at pi,0
        x0bar = np.array(s)
        q = 0
        for i in range(N):
            env.render()
            u_opt, th_opt, thdot_opt, g_1, g_2, l, iter_count = simultaneous_opt(
                s, q, env, N - i, gamma)
            s, c, d, _ = env.step(u_opt[0], noise=True)  #state noise
            q = c + gamma * q
            u_opt_f.append(u_opt[0])
            th_opt_f.append(th_opt[0])
            thdot_opt_f.append(thdot_opt[0])
            costs.append(c)  #gamma is implicit
            g_1_f.append(g_1[0])
            g_2_f.append(g_2[0])
Example #17
File: TROP.py  Project: milad88/Thesis
import sys
from pendulum import PendulumEnv
from Actor_Network_TRPO import *
from Critic_Network_TRPO import *
from utility import *

if __name__ == "__main__":
    print("start")
    env = PendulumEnv()
    action_space = np.arange(-2, 2.01, 0.01)
    num_actions = len(action_space)
    action_dim = 1
    action_bound = env.action_space.high
    state_dim = 3
    batch_size = 32
    learning_rate = 0.001
    delta = 0.01
    discount_factor = 0.99
    num_episodes = 10
    len_episode = 100
    epsilon = 0.1
    load = False
    if not load:

        g_stat = []

        config = tf.ConfigProto()
        config.intra_op_parallelism_threads = 8
        config.gpu_options.per_process_gpu_memory_fraction = 0.33

        with tf.Session(config=config) as sess: