from gym.spaces import Discrete, Box
from ourgym import RelativeDiscreteActionMap
from ourgym.RobotArmInvPendulum import SingleMotorActionMap
from simulation.robot_arm_simulation import RobotArmEnvironment
from rl import DQNAgent, ACAgent
from time import sleep, time

import numpy as np

number_of_episodes = 10000
max_iterations_per_episode = 500

if __name__ == '__main__':
    agent = DQNAgent(6, 9, 10000, 1.0, 0.05, 9000, 0.99, 0.00001, 2, (10, 10),
                     1000)
    # agent.epsilon = 0.05
    # agent.load('backup/weights_1515613961.468759')

    with RobotArmEnvironment(sim_ticks_per_step=15) as env:
        # FOR ACCELERATION CONTROL
        # env.action_space = Discrete(9)
        # env.action_map = RelativeDiscreteActionMap(9, -100, 101, 100)
        # env.observation_space = Box(np.array([0, -1, 0, -1, 0, -1]),
        #                             np.array([1, 1, 1, 1, 1, 1]))

        # FOR SINGLE MOTOR CONTROL
        env.action_space = Discrete(9)
        env.action_map = SingleMotorActionMap(9, 45, 135)
        env.observation_space = Box(np.array([0, -1, 0, -1, 0, -1]),
                                    np.array([1, 1, 1, 1, 1, 1]))
import gym
import numpy as np

from rl import DQNAgent
# ActionMap (mapping a discrete action index to a continuous torque) is assumed
# to be defined or imported elsewhere in this module.

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    observation = env.reset()
    print(observation, type(observation), observation.shape)
    print(env.action_space)
    print(env.action_space.sample)

    dim_action = 40
    dim_state = 3
    am = ActionMap(dim_action)
    agent = DQNAgent(dim_state, dim_action, am)

    # observation bounds of Pendulum-v0: (cos(theta), sin(theta), theta_dot)
    high = np.array([1., 1., 8.])
    low = -high

    # print("mean 100 episode reward before learning: {}".format(calculate_mean_reward(agent, env)))

    episodes = 1000
    for i in range(episodes):
        print(i)
        observation = env.reset()
        while True:
            env.render(mode="human")
            action = agent.act(observation)
            # assuming am.get(...) maps the discrete index to a torque
            new_observation, reward, done, info = env.step(am.get(action))
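# The ActionMap used above is not shown in this snippet. A minimal sketch of what
# such a map could look like for Pendulum-v0 (whose action space is a single torque
# in [-2, 2]) is given below; the class name and interface are assumptions chosen to
# match how `am` is used here, not the project's actual implementation.
import numpy as np


class DiscretizedTorqueMap:
    """Maps a discrete action index onto evenly spaced torques in [-2, 2]."""

    def __init__(self, n_actions, low=-2.0, high=2.0):
        self.torques = np.linspace(low, high, n_actions)

    def get(self, index):
        # Pendulum-v0 expects an array-like action of shape (1,).
        return np.array([self.torques[index]])

    def getIndex(self, torque):
        # Return the index of the closest discretized torque.
        return int(np.argmin(np.abs(self.torques - torque)))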
def run_experiments(index):
    # changes reward and done function
    task_index = index
    random_run = True

    # common parameters
    num_episodes = 5000
    num_steps = 200
    memory_size = 10000
    batch_size = 64
    e_start = 1.0
    e_finish = 0.05
    e_decay_steps = 4500
    dr = 0.995
    lr = 0.0001
    layers = 2
    nodes = (20, 20)
    frequency_updates = 0

    # if index % 2 == 0:
    #     task_index = 1
    #     if index >= 4:
    #         num_episodes = 20000
    #         e_decay_steps = 18000
    # else:
    #     task_index = 2
    #     if index >= 4:
    #         num_episodes = 20000
    #         e_decay_steps = 18000

    while True:
        # create directory if it does not exist
        directory_path = "../experiments_{}_{}/{}_{}/".format(
            task_index, "random" if random_run else "",
            datetime.now().strftime("%d-%m-%Y_%H-%M-%S"), uuid.uuid4())
        if not os.path.exists(os.path.dirname(directory_path)):
            try:
                os.makedirs(os.path.dirname(directory_path))
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise

        try:
            nr_actions_per_motor = 9
            lower_bound = 45
            upper_bound = 135
            simulation_init_state = (0, 0, np.pi, 0, np.pi, 0)
            reset_with_noise = False

            if task_index == 1:
                nr_actions_per_motor = 9
                lower_bound = 70
                upper_bound = 110
                simulation_init_state = (np.pi, 0, np.pi, 0, np.pi, 0)
                reset_with_noise = True
            elif task_index == 2:
                nr_actions_per_motor = 5
                lower_bound = 45
                upper_bound = 135
                simulation_init_state = (0, 0, np.pi, 0, np.pi, 0)
                reset_with_noise = False

            env = RobotArmEnvironment(
                reward_function_index=task_index,
                done_function_index=task_index,
                simulation_init_state=simulation_init_state,
                reset_with_noise=reset_with_noise,
                sim_ticks_per_step=6)
            env.action_space = Discrete(nr_actions_per_motor**2)
            env.action_map = AbsoluteDiscreteActionMap(lower_bound, upper_bound,
                                                       nr_actions_per_motor)

            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.n
            agent = DQNAgent(env, state_dim, action_dim, memory_size, e_start,
                             e_finish, e_decay_steps, dr, lr, layers, nodes,
                             frequency_updates)

            run(env, agent, num_episodes, num_steps, batch_size, directory_path,
                random_run)
        except KeyboardInterrupt as e:
            # for f in os.listdir(os.path.dirname(directory_path)):
            #     if re.search(file_path, f):
            #         os.remove(os.path.join(directory_path, f))
            break
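# AbsoluteDiscreteActionMap comes from the project's own action-map package and is
# not shown here. Based on how it is constructed above (Discrete(nr_actions_per_motor**2))
# and indexed in run() below (get(i) returns a pair of motor angles), a minimal sketch
# of the idea could look like this; the class name and details are assumptions, not the
# project's actual code.
import numpy as np


class TwoMotorAbsoluteMap:
    """Maps an index in [0, n*n) onto a pair of absolute motor angles."""

    def __init__(self, lower_bound, upper_bound, n_actions_per_motor):
        self.angles = np.linspace(lower_bound, upper_bound, n_actions_per_motor)
        self.n = n_actions_per_motor

    def get(self, index):
        # Decompose the flat index into one angle index per motor.
        i, j = divmod(index, self.n)
        return (self.angles[i], self.angles[j])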
def run(env: RobotArmEnvironment,
        agent: DQNAgent,
        num_episodes: int,
        max_num_steps: int,
        batch_size: int,
        directory_path: str,
        random_run: bool = False):
    reward_history_file_name = directory_path + "reward.csv"
    action_history_file_name = directory_path + "action.csv"
    max_q_history_file_name = directory_path + "max-q.csv"
    state_history_file_name = directory_path + "state.csv"

    # Parse these files with:
    # with open(file_name, "r") as f:
    #     reader = csv.reader(f, delimiter=" ")
    #     for row in reader:
    #         for col in row:
    #             col = ast.literal_eval(col)
    # (nan values have to be checked for)

    previous = time.time()
    for episode_idx in range(num_episodes):
        state = env.reset()

        for step_idx in range(max_num_steps):
            with open(state_history_file_name, "a") as f:
                f.write(("(" + ("{}," * 6) + ") ").format(*env.simulation.state))

            if episode_idx % 100 == 0:
                env.render()
                time.sleep(1 / 10)

            # take an action
            if random_run:
                action = env.action_space.sample()
            else:
                max_q, action, prediction = agent.act(state)

            if not random_run:
                with open(max_q_history_file_name, "a") as f:
                    f.write("{} ".format(max_q))
            with open(action_history_file_name, "a") as f:
                f.write("({},{}) ".format(
                    env.action_map.get(int(action))[0],
                    env.action_map.get(int(action))[1]))

            # observe effect of action and remember
            new_state, reward, done, info = env.step(action)
            if not random_run:
                agent.remember(state, action, reward, new_state, done)
            with open(reward_history_file_name, "a") as f:
                f.write("{} ".format(float(reward)))

            # store new state
            state = new_state

            if done:
                break

        if not random_run:
            agent.replay(batch_size)

        # new line in all data files
        with open(action_history_file_name, "a") as f:
            f.write("\n")
        with open(reward_history_file_name, "a") as f:
            f.write("\n")
        if not random_run:
            with open(max_q_history_file_name, "a") as f:
                f.write("\n")
        with open(state_history_file_name, "a") as f:
            f.write(("(" + ("{}," * 6) + ") \n").format(*env.simulation.state))

        if not random_run and episode_idx % 50 == 0:
            agent.save(directory_path + "weights-ep-{}".format(episode_idx))

        current = time.time()
        print("{}: episode {:3}/{:3} completed in {:4}s".format(
            os.getpid(), episode_idx, num_episodes, current - previous))
        previous = current
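# The comment inside run() sketches how the space-separated log files can be read
# back. A self-contained version of that parsing loop is given below; the function
# name is just an example, and the nan handling follows the note in the original
# comment (nan is not a Python literal, so such entries are skipped).
import ast
import csv


def load_history(file_name):
    """Parse one of the logs written by run() into a list of episodes."""
    episodes = []
    with open(file_name, "r") as f:
        reader = csv.reader(f, delimiter=" ")
        for row in reader:
            episode = []
            for col in row:
                if not col:
                    continue  # the trailing space produces an empty column
                if "nan" in col:
                    continue  # nan values have to be checked for
                episode.append(ast.literal_eval(col))
            episodes.append(episode)
    return episodes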
# ensure files are downloaded
if not os.path.isfile(CVAE_DATA_PATH):
    download_blob(CVAE_DATA_BLOB_NAME, CVAE_DATA_PATH)
if not os.path.isfile(CVAE_MODEL_PATH):
    download_blob(CVAE_MODEL_BLOB_NAME, CVAE_MODEL_PATH)
if not os.path.isfile(FULL_RL_MODEL_PATH):
    download_blob(FULL_RL_MODEL_BLOB_NAME, FULL_RL_MODEL_PATH)

# load files
with open(CVAE_DATA_PATH, 'rb') as f:
    CVAE_DATA = pickle.load(f)

CVAE_MODEL = CVAE(data_dim=EMBEDDING_DIM * 2,
                  label_dim=9,
                  model_path=CVAE_MODEL_PATH)
FULL_RL_MODEL = DQNAgent(action_size=3, load_model=True, no_op_steps=0)


def simple_sample(n_real, n_fake):
    '''
    Generates a mixed dataset of simulated and real embedded samples.
    Samples are "embedded" because we've used transfer learning.
    Sampling is "simple" because the GAN is not fit with each sample.
    '''
    ## sample real data
    real_data = []
    if n_real > 0:
        real_data = __sample_real_data(n_real)

    ## sample fake data
    fake_data = []
    if n_fake > 0:
import time

import gym

from rl import DQNAgent


class ActionMap:
    """Identity action map: the discrete index is used as the action directly."""

    def get(self, index):
        return index

    def getIndex(self, action):
        return action


if __name__ == '__main__':
    env = gym.make('BipedalWalker-v2')
    observation = env.reset()
    print(observation, type(observation), observation.shape)
    print(env.action_space)
    print(env.action_space.sample)

    agent = DQNAgent(24, 4, ActionMap())

    while True:
        env.render(mode="human", close=False)
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print(action, reward, done)
        if done:
            break
        time.sleep(1 / 60)
e_start = 1
e_finish = 0.05
e_decay = 400
dr = 0.99
lr = 0.00001
layers = 2
nodes = 20
frequency_updates = 0

agent = DQNAgent(
    env,
    state_dim,
    action_dim,
    memory_size,
    e_start,
    e_finish,
    e_decay,
    dr,
    lr,
    layers,
    (nodes, nodes),
    frequency_updates,
)

for episode in range(num_episodes):
    state = env.reset()
    tr = 0

    for step in range(num_steps):
        action = agent.act(state)[1]
        print(step, flush=True, end=" ")
""" A simple example to run the DQN algorithm on a toy example. """ import gym import tensorflow as tf from rl import DQNAgent from keras.layers import Dense, Input, merge, Activation, Flatten from keras.models import Model env_name = 'CartPole-v0' num_actions = 2 def make_model(): i = Input((4, )) x = i x = Dense(128, activation='relu')(x) policy = Dense(num_actions, activation='softmax')(x) value = Dense(1, activation='linear')(x) return Model([i], [value]) with tf.Session() as sess, tf.device('/cpu:0'): agent = DQNAgent(make_model) agent.compile(sess) agent.train(sess, lambda: gym.make('CartPole-v0'))
def run_experiments(reward_index):
    if reward_index < 0 or reward_index > 1:
        raise ValueError()

    num_episodes = number_of_episodes
    num_steps = max_iterations_per_episode
    batchsize = 32
    state_size = 6
    action_size = 81
    memory_size = 100000
    epsilon_start = 1
    epsilon_min = 0.1
    epsilon_decay_per_step = 10000
    lr = 0.00001
    dr = 0.99
    amount_layers = 2
    amount_nodes_layer = 40
    frequency_updates = 1000

    parameters = {}
    parameters['num_episodes'] = num_episodes
    parameters['num_steps'] = num_steps
    parameters['batchsize'] = batchsize
    parameters['state_size'] = state_size
    parameters['action_size'] = action_size
    parameters['memory_size'] = memory_size
    parameters['epsilon_start'] = epsilon_start
    parameters['epsilon_min'] = epsilon_min
    parameters['epsilon_decay_episodes_required'] = epsilon_decay_per_step
    parameters['learning_rate'] = lr
    parameters['discount_rate'] = dr
    parameters['amount_layers'] = amount_layers
    parameters['amount_nodes_layer'] = amount_nodes_layer
    parameters['frequency_update_target_model'] = frequency_updates

    agent = DQNAgent(6, 81, num_steps, epsilon_start, epsilon_min,
                     epsilon_decay_per_step, dr, lr, amount_layers,
                     (amount_nodes_layer, amount_nodes_layer), frequency_updates)

    with RobotArmEnvironment(reward_function_index=reward_index,
                             reward_function_params=(1 / 6 * np.pi, 2 * np.pi, 1,
                                                     10, 0.05, 0.1, 2, 0.001,
                                                     1)) as env:
        ah = list()
        rh = list()

        for episode_idx in range(number_of_episodes):
            state = env.reset()
            tr = 0
            ct = time.time()
            ah.append(list())
            rh.append(list())

            for i in range(max_iterations_per_episode):
                action = agent.act(state)
                ah[episode_idx].append(env.action_map.get(int(action)))

                next_state, reward, done, _ = env.step(action)
                rh[episode_idx].append(float(reward))
                agent.remember(state, action, reward, next_state, done)

                state = next_state
                tr += reward

                if done:
                    break

            agent.replay(32)

            print("episode {}/{}, average reward {}, epsilon {}, time taken {}s".
                  format(episode_idx + 1, number_of_episodes, tr,
                         agent.get_epsilon(), time.time() - ct))
            agent._update_epsilon()

            if episode_idx % 100 == 0 and episode_idx != 0:
                agent.safe()
                save_info(episode_idx, reward_index, parameters,
                          env.action_map.to_json_object(), rh, ah,
                          env.to_json_object())
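# The agent's epsilon schedule itself is not shown in these scripts; only
# get_epsilon() and _update_epsilon() are called. A common choice consistent with the
# (epsilon_start, epsilon_min, epsilon_decay_steps) parameters used throughout is a
# linear anneal; the sketch below is an assumption about that schedule, not the
# project's actual implementation.
class LinearEpsilonSchedule:
    """Linearly anneals epsilon from a start value to a minimum over a fixed number of updates."""

    def __init__(self, epsilon_start, epsilon_min, decay_steps):
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_min
        self.step_size = (epsilon_start - epsilon_min) / decay_steps

    def update(self):
        # Called once per episode (or per step, depending on the experiment).
        self.epsilon = max(self.epsilon_min, self.epsilon - self.step_size)

    def get(self):
        return self.epsilon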