def loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    # observation_space = env.observation_space.shape[0]

    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            # TODO: run pi-adjust here
            action = utils.policy(env, pilco, state, False)
            # TODO: when running pi-adjust, comment out the next line
            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
    env.env.close()

def plot_pilco_source_learning_curve():
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)
    pilcos = ['initial'] + [str(i) for i in range(6)]
    rewards = []
    for i, p in enumerate(pilcos):
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=bf, max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller, reward=R, sparse=False)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        xs = []
        angles = []
        while True:
            xs.append(state[0])
            angles.append(state[2])
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        rewards.append(step)
        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches="tight")
        plt.close()
    env.close()
    plt.plot([i for i, _ in enumerate(pilcos)], rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches="tight")
    plt.close()
    return rewards, xs, angles

def cartpole():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])  # reshape to a (1, observation_space) row vector
        step = 0
        while True:
            step += 1
            action = dqn_solver.act(state)
            state_next, reward, terminal, _ = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) +
                      ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()

def msPacman():
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    print('start', dqn_solver.exploration_rate)
    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [480, observation_space])
        step = 0
        while True:
            step += 1
            # env.render()
            action = dqn_solver.act(state)
            state_next, reward, done, info = env.step(action)
            reward = reward if not done else -reward
            state_next = np.reshape(state_next, [480, observation_space])
            dqn_solver.remember(state, action, reward, state_next, done)
            state = state_next
            if done:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) +
                      ", score: " + str(step))
                score_logger.add_score(step, run, dqn_solver.exploration_rate)
                print(step, run)
                dqn_solver.updateExploration_rate()
                break
            dqn_solver.experience_replay()

def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl', adjust=True):
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name, controller=controller, reward=R, sparse=False)
    rewards = []
    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            if adjust:
                pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # ENABLE THIS TO SEE IT RUN WITHOUT THE ADJUSTMENT
            state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        rewards.append(step)
    env.close()
    return rewards

def source_loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    env = gym.make('continuous-cartpole-v99')
    pi_adjust = None  # no adjustment is applied; this evaluates the raw source policy
    score_logger = ScoreLogger('PI ADJUST ANALYSIS')

    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if run % 20 == 0:
            print('run: ', run)
        state = env.reset()
        step = 0
        while True:
            step += 1
            # env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                score_logger.add_score(step, run)
                avg_reward = avg_reward + step
                break
    avg_reward = avg_reward / run
    env.env.close()
    return avg_reward

def true_loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)
    # with open('10_pi_adj.pkl', 'rb') as inp2:
    #     good_pi = pickle.load(inp2)
    score_logger = ScoreLogger('PI ADJUST ANALYSIS')

    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            state_next, reward, terminal, info = env.step(action + u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
    env.env.close()

def loader():
    with open('CartPole-v1_dqn_solver.pkl', 'rb') as inp:
        dqn_solver = pickle.load(inp)
    env = gym.make(ENV_NAME)
    env.seed(73)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n

    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) +
                      ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
        if run % 50 == 0:
            save_object(dqn_solver, 'v1_in_v99_dqn_solver.pkl')
    save_object(dqn_solver, 'v1_in_v99_dqn_solver.pkl')
    env.env.close()

def cartpole():
    env = gym.make(ENV_NAME)
    env.seed(73)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = cartpole_agent_dqn(observation_space, action_space)

    run = 0
    while True:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) +
                      ", score: " + str(step))
                score_logger.add_score(step, run)
                break
            dqn_solver.experience_replay()
        if run % 50 == 0:
            save_object(dqn_solver, ENV_NAME + '_' + 'dqn_solver.pkl')
    save_object(dqn_solver, ENV_NAME + '_' + 'dqn_solver.pkl')
    env.env.close()

def run(solver='static'):
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    run = 0
    if solver == 'static':
        static_solver = StaticSolver()
        while True:
            run += 1
            state = env.reset()
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                env.render()
                action = static_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                state_next = np.reshape(state_next, [1, observation_space])
                state = state_next
                if terminal:
                    print('Run: ' + str(run) + ', score: ' + str(step))
                    score_logger.add_score(step, run)
                    break
    elif solver == 'dqn':
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        while True:
            run += 1
            state = env.reset()
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                env.render()
                action = dqn_solver.act(state)
                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print('Run: ' + str(run) + ', exploration: ' + str(dqn_solver.exploration_rate) +
                          ', score: ' + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()

class DQNAgent:
    def __init__(self, dir_path=None):
        def initialize():
            # create environment and initial parameters
            self.env = gym.make(self.env_name)
            self.env.seed(self.seed)
            self.env_eval = gym.make(self.env_name)
            self.observation_space_size = self.env.observation_space.shape[0]
            self.action_space_size = self.env.action_space.n
            self.reward_threshold = self.env.spec.reward_threshold
            self.score_max = self.env.spec.max_episode_steps
            self.exploration_rate = self.exploration_max
            self.memory = deque(maxlen=self.memory_size)
            self.tnet_counter = 0
            self.step_counter = 0
            # create ScoreLogger
            self.score_logger = ScoreLogger(self.dir_path, self.window_size, self.reward_threshold)

        if dir_path is None:
            # settings
            self.env_name = ENV_NAME
            self.exploration_max = EXPLORATION_MAX
            self.exploration_min = EXPLORATION_MIN
            self.exploration_decay = EXPLORATION_DECAY
            self.memory_size = MEMORY_SIZE
            self.memory_min = MEMORY_MIN
            self.minibatch_size = MINIBATCH_SIZE
            self.batch_size = BATCH_SIZE
            self.learning_rate = LEARNING_RATE
            self.gamma = GAMMA
            self.window_size = WINDOW_SIZE
            self.seed = SEED
            self.update_target_q_after_n_steps = UPDATE_TARGET_Q_AFTER_N_STEPS
            self.tau = TAU
            self.num_episodes_eval = NUM_EPISODES_EVAL
            self.steps_per_eval = STEPS_PER_EVAL
            self.exploration_rate_eval = EXPLORATION_RATE_EVAL
            self.seed_eval = SEED_EVAL
            self.frames_per_step = FRAMES_PER_STEP

            # create new directory to store settings and results
            run = 0
            while True:
                run += 1
                if not os.path.exists(f"./experiments/{ENV_NAME}_{run}"):
                    self.dir_path = f"./experiments/{ENV_NAME}_{run}"
                    os.mkdir(self.dir_path)
                    break

            # save settings
            with open(os.path.join(self.dir_path, "settings.json"), "w") as file:
                json.dump(self.__dict__, file)

            initialize()
            self.score_logger.log(f"Results of experiments stored in: {self.dir_path}")

            # create model and store model and visualization
            # self.qnet is the online model, self.tnet is the target model
            self.qnet = Sequential()
            self.qnet.add(Dense(256, input_shape=(self.observation_space_size,), activation='relu'))
            self.qnet.add(Dense(self.action_space_size, activation='linear'))
            self.qnet.compile(loss="huber_loss", optimizer=Adam(learning_rate=self.learning_rate))
            self.qnet.save(os.path.join(self.dir_path, "model.HDF5"))
            plot_model(self.qnet, to_file=os.path.join(self.dir_path, "model.png"), show_shapes=True)
            self.tnet = clone_model(self.qnet)
            self.tnet.set_weights(self.qnet.get_weights())
        else:
            with open(os.path.join(dir_path, "settings.json"), "r") as file:
                self.__dict__ = json.load(file)
            initialize()
            model_name = "model_best_5840.HDF5"
            self.qnet = load_model(os.path.join(self.dir_path, model_name))
            self.score_logger.log(f"{os.path.join(self.dir_path, model_name)} loaded")

    def train(self):
        episode = 0
        episode_train = 0
        frame = 0
        temp = True
        while True:
            state = self.env.reset()
            state = np.reshape(state, (1, self.observation_space_size))
            episode += 1
            score = 0
            done = False
            while not done:
                action = self.act(state)
                state_new, reward, done, info = self.env.step(action)
                state_new = np.reshape(state_new, (1, self.observation_space_size))
                score += reward
                frame += 1
                if score >= self.score_max:
                    # episode ended by the time limit, not by failure, so do not mark it terminal
                    self.memory.append((state, action, reward, state_new, not done))
                else:
                    self.memory.append((state, action, reward, state_new, done))
                state = state_new
                if len(self.memory) >= self.memory_min:
                    if frame % self.frames_per_step == 0:
                        temp = True
                        self.experience_replay()
                    if self.step_counter % self.steps_per_eval == 0 and temp:
                        temp = False
                        self.evaluate()
            if len(self.memory) >= self.memory_min:
                episode_train += 1
                self.score_logger.log(f"\nEpisode: {episode_train} ({episode}), "
                                      f"exploration: {self.exploration_rate}, score: {score}")
                self.score_logger.add_score(score, episode, episode_train)
                if episode_train % 64 == 0:
                    self.qnet.save(os.path.join(self.dir_path, "model.HDF5"))
                    self.score_logger.log("Model Saved")
                if self.score_logger.save_best_model:
                    self.qnet.save(os.path.join(self.dir_path, "model_best.HDF5"))
                    self.score_logger.save_best_model = False
                    self.score_logger.log("Best model replaced")
                self.score_logger.solved()

    def act(self, state, exploration_rate=None):
        if exploration_rate is None:
            exploration_rate = self.exploration_rate
        if np.random.rand() < exploration_rate:
            return self.env.action_space.sample()
        q_values = self.qnet.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        batch = random.sample(self.memory, self.minibatch_size)
        x = np.zeros((self.minibatch_size, self.observation_space_size))
        y = np.zeros((self.minibatch_size, self.action_space_size))
        for i, (state, action, reward, state_new, done) in enumerate(batch):
            target = self.qnet.predict(state)
            if done:
                target[0][action] = reward
            else:
                # Double-DQN target: the online net picks the action, the target net evaluates it
                target[0][action] = reward + \
                    self.gamma * self.tnet.predict(state_new)[0][np.argmax(self.qnet.predict(state_new)[0])]
            x[i, :] = state[0]
            y[i, :] = target[0]
        self.qnet.fit(x, y, batch_size=self.batch_size, verbose=0)
        if self.tnet_counter >= self.update_target_q_after_n_steps:
            # soft (Polyak) update of the target network
            w_qnet = self.qnet.get_weights()
            w_tnet = self.tnet.get_weights()
            for i in range(len(w_tnet)):
                w_tnet[i] = w_qnet[i] * self.tau + w_tnet[i] * (1 - self.tau)
            self.tnet.set_weights(w_tnet)
            self.tnet_counter = 0
        self.tnet_counter += 1
        self.exploration_rate = np.amax((self.exploration_rate * self.exploration_decay,
                                         self.exploration_min))
        self.step_counter += 1

    def evaluate(self):
        self.env_eval.seed(self.seed_eval)
        scores = []
        for i in range(self.num_episodes_eval):
            state = self.env_eval.reset()
            state = np.reshape(state, (1, self.observation_space_size))
            score = 0
            done = False
            while not done:
                action = self.act(state, self.exploration_rate_eval)
                state, reward, done, info = self.env_eval.step(action)
                state = np.reshape(state, (1, self.observation_space_size))
                score += reward
            scores.append(score)
        self.score_logger.add_evaluation(scores, self.step_counter)

    def simulate(self, exploration_rate=0.0, verbose=False):
        state = self.env.reset()
        state = np.reshape(state, (1, self.observation_space_size))
        score = 0
        while True:
            self.env.render()
            action = self.act(state, exploration_rate)
            if verbose:
                with np.printoptions(precision=5, sign=' ', floatmode='fixed', suppress=True):
                    self.score_logger.log(f"State: {state[0]}, Output model: {self.qnet.predict(state)[0]}, "
                                          f"Action: {action}, score: {score}")
            state, reward, done, info = self.env.step(action)
            score += reward
            state = np.reshape(state, (1, self.observation_space_size))
            time.sleep(0.02)
            if done:
                self.score_logger.log(f"Episode finished, score: {score}")
                break
        self.env.close()

def piadjust(NT):
    with open('GOODv1.pkl', 'rb') as inp:
        dqn_solver = pickle.load(inp)

    env_S = gym.make(ENV_NAME)
    env_S.seed(73)
    score_logger_S = ScoreLogger(ENV_NAME)
    observation_space_S = env_S.observation_space.shape[0]

    env_T = gym.make(ENV_NAMET)
    env_T.seed(73)
    score_logger_T = ScoreLogger(ENV_NAMET)
    observation_space_T = env_T.observation_space.shape[0]

    # sample transitions from the source environment and perturb them
    D_S = sampler(dqn_solver, env_S, 1000)
    D_S = noiser(D_S, [0, 2])
    print('D_S sampling done')

    D_T = None
    i = 0
    pi_adj = dqn_solver
    while i < NT:
        D_adj = []
        # collect target-environment transitions with the current policy
        if i == 0:
            D_i_T = sampler(dqn_solver, env_T, 1000)
        else:
            D_i_T = sampler_adj(pi_adj, dqn_solver, env_T, 1000)
        if D_T is not None:
            D_T = np.concatenate((D_i_T, D_T))
        else:
            D_T = D_i_T

        # fit the inverse dynamics model of the target environment
        print('Goin for inverse dyn')
        gpr = inverse_dyn(D_T)
        print('inverse dyn done')

        # relabel the source samples with the action the target dynamics would require
        for samp in D_S:
            x_s = np.ndarray.tolist(samp[0])[0]
            x_s1 = np.ndarray.tolist(samp[2])[0]
            u_t_S = samp[1]
            a = np.ndarray.tolist(samp[0])[0]
            a.extend(np.ndarray.tolist(samp[2])[0])
            u_t_T = gpr.predict(np.array(a).reshape(1, 8), return_std=False)
            if u_t_T > 0:
                u_t_T = 1
            elif u_t_T < 0:
                u_t_T = 0
            D_adj.append((x_s, u_t_S, u_t_T))

        # learn the adjustment policy from the relabelled data
        print('Goin for L3')
        pi_adj = L3(D_adj)
        print('L3 Done')

        print(i)
        i = i + 1
        # checkpoint the adjustment policy after every iteration
        save_object(pi_adj, str(i) + '_pi_adj.pkl')

    env_S.env.close()
    env_T.env.close()
    return pi_adj

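# Hedged illustration (not called by piadjust itself): the thresholding inlined above
# maps the inverse-dynamics regressor's continuous output onto CartPole's two discrete
# actions, pushing right for positive predictions and left for negative ones. The helper
# name is an assumption introduced only to make that convention explicit; unlike the
# original inline code, it also collapses an exact zero prediction to "left".
def to_discrete_action(u_continuous):
    u = float(u_continuous)
    if u > 0:
        return 1  # push the cart to the right
    return 0      # push the cart to the left
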
def connect4dqn(folder):
    env = Connect4()
    os.chdir(folder)
    score_logger_random = ScoreLogger('AI_vs_random', average_score_to_solve=1000)
    # only 10 games are played per evaluation, but the ScoreLogger would stop early after
    # reaching a score of 10 ten times in a row, so the threshold is set to 11
    score_logger_ai = ScoreLogger('AI_vs_{}'.format(EVAL_AI), average_score_to_solve=11)
    observation_space = env.reset().shape
    action_space = env.validMoves().size

    # assign a GPU on the DGX
    config = tf.ConfigProto(device_count={'GPU': 1})
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    set_session(sess)

    solver = getattr(import_module('{}.dqn'.format(folder)), 'DQNSolver')
    dqn_solver = solver(observation_space, action_space)

    run = 0
    state = env.reset()  # moved one loop up, otherwise player two cannot start if player one wins
    while True:
        # workaround: create an empty board without touching the turn counter,
        # so the loser of the previous game starts the next round
        state = env.soft_reset()
        run += 1
        if run % SAVE_EVERY_K_GAMES == 0:
            print('Saving weights and starting evaluation...')
            dqn_solver.save()
            score, ties = evaluate.ai_vs_random(env, dqn_solver, eval_ctr=run,
                                                numberOfGames=NUMBER_OF_EVAL_GAMES,
                                                games_recorded_per_eval=GAMES_RECORDED_PER_EVAL)
            score_logger_random.add_score(score + ties, run)  # ties are logged as success
            eval_solver = getattr(import_module('{}.dqn'.format(EVAL_AI)), 'DQNSolver')
            eval_dqn_solver = eval_solver(observation_space, action_space)
            eval_dqn_solver.exploration_rate = 0
            ai1_win, ai2_win, tie_counter = evaluate.ai_vs_ai(env, ai1=dqn_solver, ai1_name=folder,
                                                              ai2=eval_dqn_solver, ai2_name=EVAL_AI,
                                                              eval_ctr=run,
                                                              numberOfGames=NUMBER_OF_AI_EVAL_GAMES,
                                                              games_recorded_per_eval=GAMES_RECORDED_PER_EVAL)
            del eval_dqn_solver
            score_logger_ai.add_score(ai1_win + tie_counter, run)  # ties are logged as success
        step = 0
        while True:
            step += 1
            player = env.getNextPlayer()
            if player == 1:
                action_player1 = dqn_solver.act(state, env)
                state_next, reward_player1, terminal, info = env.makeMove(player, action_player1, DEMO_MODE)
                state_copy = np.copy(state)
                state_next_copy = np.copy(state_next)
                if terminal:
                    # if player 1 wins, pop player 2's last move and give it a negative reward
                    dqn_solver.pop()
                    dqn_solver.remember(normalized_state, action_player2, reward_player1 * -1,
                                        normalized_state_next, terminal)
                dqn_solver.remember(state, action_player1, reward_player1, state_next, terminal)
                state = state_next
            else:
                normalized_state = np.roll(state, 1, axis=-1)
                action_player2 = dqn_solver.act(normalized_state, env)
                state_next, reward_player2, terminal, info = env.makeMove(player, action_player2, DEMO_MODE)
                normalized_state_next = np.roll(state_next, 1, axis=-1)
                if terminal:
                    # if player 2 wins, pop player 1's last move and give it a negative reward
                    dqn_solver.pop()
                    dqn_solver.remember(state_copy, action_player1, reward_player2 * -1,
                                        state_next_copy, terminal)
                dqn_solver.remember(normalized_state, action_player2, reward_player2,
                                    normalized_state_next, terminal)
                state = state_next
            if terminal:
                print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) +
                      ", moves: " + str(step))
                break
            dqn_solver.experience_replay()

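# Hedged illustration of the np.roll(state, 1, axis=-1) trick used above: rolling the last
# axis of a two-plane board tensor swaps the "own stones" and "opponent stones" planes, so
# a single network always sees the board from the perspective of the player to move. The
# (6, 7, 2) board shape here is an assumption for this sketch, not taken from Connect4().
import numpy as np

board = np.zeros((6, 7, 2))
board[5, 3, 0] = 1                    # a player-1 stone stored in plane 0
swapped = np.roll(board, 1, axis=-1)  # plane 0 <-> plane 1 when the axis has length 2
assert swapped[5, 3, 1] == 1          # the same stone now appears in plane 1
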
y_i = r_i + 𝛾 * max(Q(next_state, action; 𝜃_target))
Loss: (y_i - Q(state, action; 𝜃))^2
Every C steps, 𝜃_target <- 𝜃
"""
import os
import numpy as np
import tensorflow as tf
import random
from collections import deque
from typing import List

import deep_q_network as dqn
from point_and_click_env import Env
from score_logger import ScoreLogger

env = Env()
score_logger = ScoreLogger('mouse model', 1000, 100000)

# Constants defining our neural network
INPUT_SIZE = env.observation_space.shape[0]
OUTPUT_SIZE = env.action_space.n

DISCOUNT_RATE = 0.95
REPLAY_MEMORY = 100000
BATCH_SIZE = 32
TARGET_UPDATE_FREQUENCY = 1000
MAX_EPISODES = 4000000
SAVE_PERIOD = 10000
LOG_PERIOD = 10000
E_DECAY = 0.9998
E_MIN = 0.05

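# Hedged sketch, not part of the training code below: the target from the docstring,
#   y_i = r_i + 𝛾 * max_a Q(next_state, a; 𝜃_target),
# computed for a batch with plain numpy. The argument names (rewards, next_q_target, dones)
# are assumptions for this illustration only; terminal transitions keep the bare reward.
def td_targets(rewards: np.ndarray, next_q_target: np.ndarray, dones: np.ndarray) -> np.ndarray:
    return rewards + (1.0 - dones.astype(np.float32)) * DISCOUNT_RATE * np.max(next_q_target, axis=1)
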
def loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    for pick in range(1, 11):
        env = gym.make('continuous-cartpole-v99')
        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('PI ADJUST ANALYSIS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if run % 20 == 0:
                print('run: ', run)
            state = env.reset()
            step = 0
            while True:
                step += 1
                # env.render()
                u_action = utils.policy(env, pilco, state, False)
                state_copy = state
                a = np.ndarray.tolist(state_copy)
                a.extend(np.ndarray.tolist(u_action))
                action = pi_adjust.predict(np.array(a).reshape(1, -1))
                action = action[0]
                # clip the adjusted action to the environment's [-1, 1] bound
                if action[0] > 1:
                    action[0] = 1
                elif action[0] < -1:
                    action[0] = -1
                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state = state_next
                if terminal:
                    score_logger.add_score(step, run)
                    avg_reward = avg_reward + step
                    break
        avg_reward = avg_reward / run
        env.env.close()
        Rs[0][pick - 1] = avg_reward
    return Rs