import gym
import gym_flappy_bird
import datetime

from stable_baselines.deepq.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecFrameStack
from env_wrapper import make_flappy_env
from stable_baselines import DQN

ENV_ID = 'flappy-bird-v0'

env = make_flappy_env(ENV_ID, num_env=1, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = DQN(CnnPolicy, env, verbose=1, tensorboard_log='./dqn/dqn_2300k_timetest')

start_time = datetime.datetime.now()
model.learn(total_timesteps=2300000)
print(datetime.datetime.now() - start_time)

model.save("dqn_2300k")
print('Finished')
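# A minimal follow-up sketch (not part of the original script): reload the saved
# agent and score it with stable-baselines' evaluate_policy helper. Assumes
# stable-baselines >= 2.10 (where evaluate_policy lives) and that
# make_flappy_env returns a single-env VecEnv as above.
from stable_baselines.common.evaluation import evaluate_policy

eval_env = VecFrameStack(make_flappy_env(ENV_ID, num_env=1, seed=1), n_stack=4)
trained = DQN.load("dqn_2300k")
mean_reward, std_reward = evaluate_policy(trained, eval_env, n_eval_episodes=10)
print("mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))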
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

from stable_baselines import DQN
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.atari_wrappers import FrameStack, WarpFrame, MaxAndSkipEnv, EpisodicLifeEnv

import tensorflow as tf
# Suppress warnings
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import hyperparams as hp

env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = EpisodicLifeEnv(env)
env = WarpFrame(env)
env = FrameStack(env, n_frames=hp.FRAME_STACK)
env = MaxAndSkipEnv(env, skip=hp.FRAME_SKIP)

model = DQN.load("models/round3/best_model")

obs = env.reset()
# cr = 0
# while True:
#     action, _states = model.predict(obs, deterministic=False)
#     obs, rewards, done, info = env.step(action)
#     cr += rewards
#     print("Reward: {}\t\t".format(cr), end='\r')
#     env.render()
#     if done:
#         print("Finished an episode with total reward: ", cr)
#         cr = 0
#         break
def launchAgent(env_name: int, model_name: str, test_mode=False, filepath=None):
    """
    :param test_mode: whether to load the agent in test mode and only let it drive.
        In that case no training takes place; the agent only drives.
    :param env_name: name of the environment to load.
        1: environment without the minimap image, using point-to-point distance calculations.
        2: model that uses the minimap image, with an updated reward.
        other values (default): the model currently in use; uses the minimap image,
            with the reward updated once more.
    :param model_name: name of the model to set up.
        DQN: loads a DQN model.
        HER: loads a HER model.
        other values (default): loads a PPO2 model.
    :return: the model that ran the last episode.
    """
    from stable_baselines import DQN, HER, PPO2

    if env_name == 1:
        from Reinforcement_AI.env.a_env import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 2:
        from Reinforcement_AI.env.d_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv1
        kart_env = DetailedMiniMapEnv1()
        policy = "CnnPolicy"
    elif env_name == 3:
        from Reinforcement_AI.env.a_env2 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 4:
        from Reinforcement_AI.env.a_env3 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    else:  # env_name == "detailed_minimap_enhanced" or env_name == "4"
        from Reinforcement_AI.env.e_enhanced_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv2
        kart_env = DetailedMiniMapEnv2()
        policy = "CnnPolicy"

    if model_name == "DQN":
        model = DQN(policy=policy, env=kart_env, double_q=True, prioritized_replay=True, verbose=1)
    elif model_name == "HER":
        model = HER(policy=policy, env=kart_env, model_class=DQN, verbose=1)
    else:  # model_name == "PPO2"
        model = PPO2(policy=policy, learning_rate=0.0001, env=kart_env, verbose=1)

    if test_mode:  # in test mode, load the agent and let it drive
        model = model.load(filepath)  # load() is a classmethod that returns a new model
        kart_env.set_continuos(True)
        while True:
            observation = kart_env.reset()
            while True:
                action, _states = model.predict(observation)
                observation, rewards, dones, info = kart_env.step(action)
                if dones:
                    break
    else:
        for i in range(1000):
            model.learn(total_timesteps=12500)
            model.save(str(env_name) + "_" + model_name + "_" + str(i + 1))
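# A hypothetical usage sketch (not in the original source): drive a saved DQN
# agent on the enhanced-minimap environment. The checkpoint name follows the
# save pattern above and is an assumption.
# launchAgent(env_name=0, model_name="DQN", test_mode=True, filepath="0_DQN_1000")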
if MODEL == 'DQN':
    from stable_baselines.deepq.policies import LnCnnPolicy, MlpPolicy
    if ENVIRONMENT in ['rgbd', 'rgb', 'rgbdsparse']:
        model = DQN(LnCnnPolicy, env, verbose=1,
                    tensorboard_log=(log_dir + "tensorboard_%s_%s_%s/") % (MODEL, ENVIRONMENT, DATE),
                    gamma=args.discount, learning_rate=args.lr, buffer_size=50000,
                    exploration_fraction=0.1, exploration_final_eps=0.02,
                    train_freq=1, batch_size=32, double_q=True,
                    learning_starts=1000, target_network_update_freq=500,
                    prioritized_replay=True, prioritized_replay_alpha=0.6,
                    prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
                    prioritized_replay_eps=1e-06, param_noise=False,
                    _init_setup_model=True, policy_kwargs=None,
                    full_tensorboard_log=False)
    elif ENVIRONMENT == 'possensor':  # `in 'possensor'` would match any substring, e.g. 'pos'
        model = DQN(MlpPolicy, env,
tensorboard_folder = './tensorboard/Bomberman/base/'
model_folder = './models/Bomberman/base/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = 'Cnn'
model_tag = 'Cnn'
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv()])
env = VecFrameStack(env, 2)

model = DQN(CustomCnnPolicy, env, verbose=0, tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='DQN' + model_tag)
model.save(model_folder + "DQN" + model_tag)

del model
model = DQN.load(model_folder + "DQN" + model_tag)

done = False
states = None
obs = env.reset()
while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
# Create log dir
log_dir = args.log_path
os.makedirs(log_dir, exist_ok=True)

env = gym_gvgai.make(args.env)
env = WarpFrame(env)
env = Monitor(env, log_dir, allow_early_resets=True)
if args.save_video_interval != 0:
    env = gym.wrappers.Monitor(
        env, os.path.join(log_dir, "videos"),
        video_callable=(lambda ep: ep % args.save_video_interval == 0),
        force=True)

model = DQN(CnnPolicy, env, verbose=1,
            exploration_fraction=args.exploration_fraction,
            exploration_final_eps=args.exploration_final_eps,
            tensorboard_log="tensorboard_log",
            prioritized_replay=bool(args.double_q),
            double_q=bool(args.double_q),
            buffer_size=int(args.buffer_size),
            train_freq=args.train_freq,
            batch_size=args.batch_size,
            seed=args.seed)
model.learn(total_timesteps=int(args.num_timesteps), callback=callback)
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import DQN

from PaddleEnv import PaddleEnv

model = DQN.load("model.h5")

env = DummyVecEnv([lambda: PaddleEnv()])
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, _ = env.step(action)
env = DummyVecEnv([lambda: env2])
env4 = DummyVecEnv([lambda: env3])
check_env(env2, warn=True)

# Define callbacks
# This callback stops training once the mean reward reaches the threshold
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=env2.calculate_threshold(), verbose=1)
# This callback saves the currently best model
eval_callback = EvalCallback(env4, callback_on_new_best=callback_on_best, verbose=1,
                             best_model_save_path='./DQN_Models/best/')
checkpoint_callback = CheckpointCallback(save_freq=int(1e4), save_path='./model_checkpoints/')

# Uncomment to train a fresh model; otherwise an already trained model is trained further
#model = DQN(MlpPolicy, env, verbose=2, tensorboard_log="./logs/progress_tensorboard/")

# Load current best model
model = DQN.load("DQN_Models/dqn_5x5_3_SingleShot.zip", verbose=2, env=env,
                 tensorboard_log="./logs/progress_tensorboard/")

# Train model
model.learn(total_timesteps=1000000, callback=[checkpoint_callback, eval_callback])

# Delete current model and load the best model
del model
model = DQN.load("./DQN_Models/best/best_model.zip", verbose=2, env=env,
                 tensorboard_log="./logs/progress_tensorboard/")

# Test trained model
results = []
for iteration in range(100):
    score = 0
    print('Iteration', iteration)
    # Observed player board
    observation = env.reset()
def test_dqn(name):
    model_path = os.path.join('models', name)
    model = DQN.load(model_path)
    return model
# if cfg["--train"]: # policy_kwargs = {"net_arch": [512, 512]} # model = PPO2(MlpPolicy, env, # verbose=1, # policy_kwargs=policy_kwargs, # n_steps=cfg["--n_steps"], # learning_rate=cfg["--learning_rate"], # tensorboard_log="./logs/") # model.learn(total_timesteps=int(cfg["--steps"])) # model.save("ppo2_intersection") # # if cfg["--test"]: # model = PPO2.load("ppo2_intersection") # obs = env.reset() # while True: # action, _states = model.predict(obs) # obs, rewards, dones, info = env.step(action) # env.render() if cfg["--train"]: policy_kwargs = {} model = DQN(DQNMlp, env, verbose=1, policy_kwargs=policy_kwargs, batch_size=cfg["--batch_size"], exploration_fraction=0.3, learning_rate=cfg["--learning_rate"], tensorboard_log="./logs/") model.learn(total_timesteps=int(cfg["--steps"])) model.save("deepq_intersection")
import gym

from stable_baselines import DQN
from space_lander.envs.spacex_lander import *

# Create environment
env_names = ['SpaceXLander-v0', 'LunarLanderv2-v0']
env_name = env_names[0]
env = gym.make(env_name)

# Instantiate the agent
model = DQN(policy='MlpPolicy',
            env=env,
            learning_rate=1e-3,
            prioritized_replay=True,
            verbose=1,
            tensorboard_log=f"./{env_name}")

# Train the agent
obs = env.reset()


def eval_and_show(*args, **kwargs):
    if args[0]['t'] % 10000 == 0:
        print('Evaluating', args[0]['t'])
        done = False
        while not done:
            action, _states = model.predict(args[0]['obs'])
            obs, reward, done, info = env.step(action)
            env.render()
        # env.close()
def train_single(self, env_name="Merging-v0"):
    """Directly trains on env_name."""
    for seed in [201, 202, 203, 204, 205]:
        print(f"\ntraining with bsize {self.bs}, seed{seed}")
        self.seed = seed
        self.experiment_name = f"B{self.bs}R{seed}"
        print("EXPT NAME: ", self.experiment_dir1, self.experiment_name)
        self.experiment_dir = os.path.join(self.experiment_dir1, self.experiment_name)
        self.create_eval_dir()
        self.model = None

        env = gym.make(env_name)
        eval_env = gym.make(env_name)
        env._set_barrier_size(self.bs)
        env._set_homotopy_class('right')
        eval_env._set_barrier_size(self.bs)
        eval_env._set_homotopy_class('right')

        if self.model_type == "PPO":
            if self.is_save:
                ### DEEPER NETWORK
                #policy_kwargs = dict(net_arch=[dict(pi=[64, 64, 64, 64],
                #                                    vf=[64, 64, 64, 64])]
                #                     )
                #self.PPO = PPO2('MlpPolicy', env, verbose=1, seed=self.seed, learning_rate=1e-3,
                #                policy_kwargs=policy_kwargs)
                ### DROPOUT
                #self.PPO = PPO2(MlpGeneralPolicy1, env, verbose=1, seed=self.seed, learning_rate=1e-3)
                ### REGULAR
                self.PPO = PPO2('MlpPolicy', env, verbose=1, seed=self.seed, learning_rate=1e-3)
            else:
                self.PPO = PPO2('MlpPolicy', env, verbose=1, seed=self.seed, learning_rate=1e-3)
            self.model = train(self.PPO, eval_env, self.timesteps, self.experiment_dir,
                               self.is_save, self.eval_save_period, self.rets_path, 0)
        elif self.model_type == "DQN":
            if self.is_save:
                self.DQN = DQN('MlpPolicy', env, verbose=1, seed=self.seed,
                               prioritized_replay=True, learning_rate=1e-3,
                               tensorboard_log="./Gridworldv1_tensorboard/" + self.experiment_name,
                               full_tensorboard_log=True)
            else:
                self.DQN = DQN('MlpPolicy', env, verbose=1, seed=self.seed,
                               prioritized_replay=True, learning_rate=1e-3)
            self.model = train(self.DQN, eval_env, self.timesteps, self.experiment_dir,
                               self.is_save, self.eval_save_period, self.rets_path, 0)
        elif self.model_type == "HER":
            env = HERGoalEnvWrapper(env)
            eval_env = HERGoalEnvWrapper(eval_env)
            print("bs: ", env.env.barrier_size)
            print("hc: ", env.env.homotopy_class)
            self.HER = HER('MlpPolicy', env, DDPG, n_sampled_goal=4,
                           goal_selection_strategy="future", seed=self.seed, verbose=1)
            self.model = train(self.HER, eval_env, self.timesteps, self.experiment_dir,
                               self.is_save, self.eval_save_period, self.rets_path, 0)
if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    env = VisualizationEnv(
        env,
        steps_lookback=10000,
        episodic=True,
        features_names=[
            'Cart Position', 'Cart Velocity', 'Pole Angle',
            'Pole Velocity At Tip'
        ],
        actions_names=['Push cart to the left', 'Push cart to the right'])

    model = DQN(CustomDQNPolicy, env, verbose=1,
                learning_rate=1e-3,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                prioritized_replay=True)
    model.learn(total_timesteps=100000)

    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

    env.close()
    env.envs[0].join()
def main(game, method, pixels, tca, runname, run):
    def obj(variable, actions_0, obs):
        perturbation = np.zeros([1, 84, 84, 4])
        for i in range(len(variable) // 3):
            x = int(np.round(variable[3 * i] * 83))
            y = int(np.round(variable[3 * i + 1] * 83))
            pixel_attack = int(np.round(variable[3 * i + 2] * 254))
            x = np.clip(x, 0, 83)
            y = np.clip(y, 0, 83)
            pixel_attack = np.clip(pixel_attack, 0, 254)
            perturbation[:, x, y, :] = pixel_attack
        perturbation = np.clip(perturbation, 0, 254)  # np.clip is not in-place; assign the result
        obs_new = obs + perturbation
        actions_new = model.action_probability(obs_new)
        fitness_value = max_max_distance(actions_new, actions_0)
        return fitness_value

    def evaluate(variable, obs):
        perturbation = np.zeros([1, 84, 84, 4])
        for i in range(len(variable) // 3):
            x = int(np.round(variable[3 * i] * 83))
            y = int(np.round(variable[3 * i + 1] * 83))
            pixel_attack = int(np.round(variable[3 * i + 2] * 254))
            x = np.clip(x, 0, 83)
            y = np.clip(y, 0, 83)
            pixel_attack = np.clip(pixel_attack, 0, 254)
            perturbation[:, x, y, :] = pixel_attack
        perturbation = np.clip(perturbation, 0, 254)  # assign the clipped result
        obs_new = obs + perturbation
        actions = model.action_probability(obs)
        actions_new = model.action_probability(obs_new)
        action, _states = model.predict(obs_new)
        obs_candi, rewards, dones, infos = env.step(action)
        return obs_candi, rewards, dones, infos, obs_new, actions_new, perturbation

    def minmax_distance(actions_new, actions_0):
        arg_max = np.argmax(actions_0[0])
        arg_min = np.argmin(actions_0[0])
        minmax_dist = actions_new[0][arg_min] - actions_new[0][arg_max]
        return minmax_dist

    def max_max_distance(actions_new, actions_0):
        arg_max = np.argmax(actions_0[0])
        a_candid = list(actions_new[0])
        a_candid.remove(a_candid[arg_max])
        maxmax_dist = np.max(a_candid) - actions_new[0][arg_max]
        return maxmax_dist

    def calculate_entropy(actions):
        entropy_actions = [
            -probs * np.log(probs) / np.log(len(actions)) for probs in actions
        ]
        entropy = np.sum(entropy_actions)
        return entropy

    alg = GA
    model = DQN.load("trained_agents/{}/{}NoFrameskip-v4".format(method, game))

    Episode_Reward = []
    Episode_Lenth = []
    Attack_times = []

    dir_name = 'results/{}/{}/{}/FSA_{}_TCA_{}'.format(runname, method, game, pixels, tca)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

    atk_num = pixels
    bounds = [[0, 1], [0, 1], [0, 1]] * atk_num

    env = make_atari_env('{}NoFrameskip-v4'.format(game), num_env=1, seed=run,
                         wrapper_kwargs=None, start_index=0,
                         allow_early_resets=True, start_method=None)
    env = VecFrameStack(env, n_stack=4)
    env.reset()
    model.set_env(env)
    obs = env.reset()

    x0 = [0.5, 0.5, 0.5] * atk_num
    atk_time = 0
    TrueS_array = []
    Delta_array = []
    CleanS_array = []

    for i in range(5000):
        actions = model.action_probability(obs)
        attack_significance = calculate_entropy(actions[0])
        CleanS_array.append((obs[0, :, :, 3]).astype('uint8'))
        if attack_significance <= tca:
            atk_time = atk_time + 1
            l = alg(lambda variable: obj(variable, actions, obs), x0,
                    xBound=bounds, verbose=False)
            l.maximize = True
            l.maxEvaluations = 400
            res = l.learn()
            solution = list(res)[0]
            obs, rewards, dones, infos, obs_new, actions_new, perturbation = evaluate(
                solution, obs)
            obs_store = np.int_(obs_new)
            true_state = (obs_store[0, :, :, 3]).astype('uint8')
            TrueS_array.append(true_state)
            Delta_array.append(perturbation[0, :, :, 3].astype('uint8'))
        else:
            obs = np.int_(obs)
            true_state = (obs[0, :, :, 3]).astype('uint8')
            TrueS_array.append(true_state)
            Delta_array.append(np.zeros([84, 84]).astype('uint8'))
            action, _states = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)

        episode_infos = infos[0].get('episode')
        if episode_infos is not None:
            print("Atari Episode Score: {:.2f}".format(episode_infos['r']))
            print("Atari Episode Length", episode_infos['l'])
            REWARD = episode_infos['r']
            Lenth = episode_infos['l']
            break

    size = (84, 84)
    video_dir = 'results/{}_videos/{}/{}/FSA_{}_TCA_{}'.format(
        runname, method, game, pixels, tca)
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)

    fps = 10
    out_true = cv2.VideoWriter('{}/true_run_{}.avi'.format(video_dir, run),
                               cv2.VideoWriter_fourcc(*'XVID'), fps, size)  # *'PIM1'
    out_delta = cv2.VideoWriter('{}/delta_run_{}.avi'.format(video_dir, run),
                                cv2.VideoWriter_fourcc(*'XVID'), fps, size)
    out_clean = cv2.VideoWriter('{}/clean_run_{}.avi'.format(video_dir, run),
                                cv2.VideoWriter_fourcc(*'XVID'), fps, size)

    for i in range(len(TrueS_array)):
        image_true = TrueS_array[i]
        x_true = np.repeat(image_true, 3, axis=1)
        x_true = x_true.reshape(84, 84, 3)
        x_true[:, :, 0] = 150 * np.ones((84, 84), dtype=int)
        x_true[:, :, 1] = 150 * np.ones((84, 84), dtype=int)
        out_true.write(x_true)

        image_delta = Delta_array[i]
        x_delta = np.repeat(image_delta, 3, axis=1)
        x_delta = x_delta.reshape(84, 84, 3)
        x_delta[:, :, 0] = 150 * np.ones((84, 84), dtype=int)
        x_delta[:, :, 1] = 150 * np.ones((84, 84), dtype=int)
        out_delta.write(x_delta)

        image_clean = CleanS_array[i]
        x_clean = np.repeat(image_clean, 3, axis=1)
        x_clean = x_clean.reshape(84, 84, 3)
        x_clean[:, :, 0] = 150 * np.ones((84, 84), dtype=int)
        x_clean[:, :, 1] = 150 * np.ones((84, 84), dtype=int)
        out_clean.write(x_clean)

    cv2.destroyAllWindows()
    out_true.release()
    out_delta.release()
    out_clean.release()

    Episode_Reward.append(REWARD)
    Episode_Lenth.append(Lenth)
    Attack_times.append(atk_time)

    data = np.column_stack((Episode_Reward, Attack_times, Episode_Lenth))
    np.savetxt('{}/run_{}.dat'.format(dir_name, run), data)
    def reset(self):
        return reset_b(self.jlenv)

    @property
    def observation_space(self):
        return Box(low=-1.0, high=1.0, shape=(2,))

    @property
    def action_space(self):
        # if you want a continuous action space, use a box
        # return Box(low=-1.0, high=1.0, shape=(1,))
        return Discrete(3)


from stable_baselines import DQN
from stable_baselines.common.vec_env import DummyVecEnv

dqn = DQN('MlpPolicy', DummyVecEnv([lambda: MCEnv(deepcopy(mc))]),
          verbose=1, exploration_fraction=0.1)
print("getting ready to learn...")
dqn.learn(total_timesteps=10)

from julia.DMUStudent import evaluate
from julia.Base import convert, Function, Float64


def policy_function(s):
    act, st = dqn.predict(s)
    jl_act = [-1.0, 0.0, 1.0][act]  # careful that this matches the action decoding above!!
    return convert(Float64, jl_act)


evaluate(convert(Function, policy_function), "hw4", n_episodes=100)
"prioritized_replay": True, "total_timesteps": 10**7, "layers": [7, 7] }, ] for e in experiments: print(e) # Create log dir log_dir = "/tmp/" + e["name"] + "/" os.makedirs(log_dir, exist_ok=True) b_program_settings["n_blue_cars"] = e["n"] env = gym_env_generator(episode_timeout=30) env = Monitor(env, log_dir) policy_kwargs = dict(layers=e["layers"]) model = DQN("MlpPolicy", env, verbose=1, exploration_fraction=0.9, exploration_final_eps=0, learning_rate=0.001, learning_starts=100, policy_kwargs=policy_kwargs, double_q=e["double_q"], prioritized_replay=e["prioritized_replay"]) model.learn(total_timesteps=e["total_timesteps"]) model.save(log_dir + e["name"]) del model # remove to demonstrate saving and loading model = DQN.load(log_dir + e["name"]) evaluate_model(model) env.close()
if __name__ == "__main__": env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux" worker_id = 19 env = UnityEnv(env_id, worker_id=worker_id, use_visual=False, no_graphics=True) # Create log dir time_int = int(time.time()) log_dir = "stable_results/basic_env_{}/".format(time_int) os.makedirs(log_dir, exist_ok=True) env = Monitor(env, log_dir, allow_early_resets=True) env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized environment to run num_env = 2 #env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)]) model = DQN(MlpPolicy, env, verbose=1) model.learn(total_timesteps=30000) model.save(log_dir+"model") #evaluate agent episodes = 100 ep_r = [] ep_l = [] for e in range(episodes): obs = env.reset() total_r = 0. total_l = 0. while True: action, _states = model.predict(obs) obs, rewards, dones, infos = env.step(action) total_l += 1.
    #def render(self, mode='human'):
    #def close(self):


from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN
from stable_baselines.common.callbacks import CheckpointCallback

# Save a checkpoint every 500000 steps
checkpoint_callback = CheckpointCallback(save_freq=500000, save_path='./logs/',
                                         name_prefix='dqn_model')

env = CustomEnv(size=4, score_to_win=None, rate_2=0.5, random=False,
                enable_rewrite_board=False)

#model = DQN(MlpPolicy, env, verbose=1)
model = DQN.load("./DQN5")
model.set_env(env)
model.learn(total_timesteps=5000000, callback=checkpoint_callback)
model.save("./DQN6")

#del model  # remove to demonstrate saving and loading
#model = DQN.load("./deepq_2048")
from stable_baselines import DQN
from stable_baselines.gail import generate_expert_traj


def test_generate_cartpole():
    model = DQN('MlpPolicy', 'CartPole-v1', verbose=0)
    generate_expert_traj(model, 'expert_cartpole', n_timesteps=1000, n_episodes=10)
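# A hedged follow-up sketch (not from the original test): the .npz archive that
# generate_expert_traj writes ('expert_cartpole.npz') can be fed back into
# behavior cloning via ExpertDataset and model.pretrain(), per the
# stable-baselines 2.x pretraining API.
from stable_baselines.gail import ExpertDataset


def pretrain_from_traj():
    dataset = ExpertDataset(expert_path='expert_cartpole.npz',
                            traj_limitation=-1, batch_size=64)
    model = DQN('MlpPolicy', 'CartPole-v1', verbose=0)
    model.pretrain(dataset, n_epochs=1000)
    return model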
import gym

from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = gym.make('CartPole-v1')

model = DQN(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("deepq_cartpole")

del model  # remove to demonstrate saving and loading

model = DQN.load("deepq_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
                 gamma=config['gamma'],
                 policy_kwargs=config['policy_kwargs'],
                 verbose=1,
                 tensorboard_log=save_path)
elif config['algorithm'] == 'PPO2':
    env = make_vec_env(lambda: env, n_envs=1)
    model = PPO2(config['policy_network'], env,
                 learning_rate=config['learning_rate'],
                 gamma=config['gamma'],
                 policy_kwargs=config['policy_kwargs'],
                 verbose=1,
                 tensorboard_log=save_path)
elif config['algorithm'] == 'DQN':
    model = DQN(config['policy_network'], env,
                learning_rate=config['learning_rate'],
                buffer_size=config['buffer_size'],
                target_network_update_freq=64,
                gamma=config['gamma'],
                # policy_kwargs=config['policy_kwargs'],
                verbose=1,
                tensorboard_log=save_path)

model.learn(config['total_steps'], callback=callback)
model.save(os.path.join(save_path, 'model'))
env.close()
import time

import gym

from stable_baselines import DQN
from stable_baselines.deepq.policies import FeedForwardPolicy

env = gym.make('MountainCar-v0')


# Custom MLP policy of two layers of size 16 each
class CustomDQNPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDQNPolicy, self).__init__(*args, **kwargs,
                                              layers=[16, 16],
                                              layer_norm=False,
                                              feature_extraction="mlp")


model = DQN(CustomDQNPolicy, env, verbose=1)
#model.learn(total_timesteps=25000)
#generate_expert_traj(model, "I:\Code\BachelorThesis\cartpole\data\expert_cartpole", n_episodes=10)

# Test it
reward_sum = 0.0
obs = env.reset()
for i in range(0, 10):
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
    obs = env.reset()  # start the next episode from a fresh state
def launchAgent():
    from stable_baselines import DQN
    import Reinforcement_AI.env.c_seperate_env as sep_env
    from queue import Queue
    from threading import Thread

    minimap_env = sep_env.MinimapEnv()
    allenv = sep_env.AllEnv()

    minimap_model = DQN(
        "CnnPolicy",              # policy
        minimap_env,              # environment
        double_q=True,            # enable double Q-learning
        prioritized_replay=True,  # enable prioritized replay buffer
        verbose=0                 # log printing
    )
    allenv_model = DQN(
        "MlpPolicy",
        allenv,
        double_q=True,
        prioritized_replay=True,
        verbose=0
    )

    for i in range(100):
        if i != 0:
            minimap_model = DQN.load("KR_minimap_" + str(i))
            allenv_model = DQN.load("KR_allenv_" + str(i))

        que = Queue()

        minimap_model.set_env(minimap_env)
        allenv_model.set_env(allenv)

        # minimap_thread = Thread(target=minimap_model.learn, args=[50000])
        # allenv_thread = Thread(target=allenv_model.learn, args=[50000])
        allenv_thread = Thread(target=lambda q, arg1: q.put(allenv_model.learn(arg1)),
                               args=(que, 50000))
        # test = Pool(processes=1)

        # minimap_thread.start()
        allenv_thread.start()
        # test_result = test.apply_async(allenv_model.learn, (50000, None, 100, "DQN", True, None))

        minimap_model.learn(total_timesteps=50000)
        # allenv_model.learn(total_timesteps=50000)

        # minimap_thread.join()
        allenv_thread.join()
        allenv_model = que.get()
        # return_val = test_result.get()

        minimap_model.save("KR_minimap_" + str(i + 1))
        allenv_model.save("KR_allenv_" + str(i + 1))
import gym

from game_env_gym import GameEnv
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = GameEnv()
#env = gym.make('CartPole-v1')

model = DQN(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=100000)
model.save("deepqrcina")

obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    env.render()
print(total_reward)
totalSteps = hparams['training']['totalSteps']
initLrnRate = hparams['training']['initLrnRate']
lr_schedule = PiecewiseSchedule([
    (0, initLrnRate),
    (1 * totalSteps // 2, initLrnRate * .1),
    (3 * totalSteps // 4, initLrnRate * .01)
])

model = DQN(
    policy=CustomPolicy,
    env=env,
    verbose=1,
    #learning_rate=lr_schedule.value(step_id),
    learning_rate=initLrnRate,
    buffer_size=hparams['training']['bufferSize'],
    gamma=hparams['training']['gamma'],
    batch_size=hparams['training']['batchSize'],
    learning_starts=hparams['training']['learningStarts'],
    exploration_fraction=.95,
    exploration_final_eps=.0,
    param_noise=False,
    prioritized_replay=False,
    tensorboard_log=pathToLog,
    full_tensorboard_log=True,
    seed=args.seed,
    n_cpu_tf_sess=args.nproc)

model.learn(
    total_timesteps=hparams['training']['totalSteps'],
    log_interval=hparams['training']['totalSteps'] // 50,
    callback=callback,
    tb_log_name=args.params)

model.save(pathToLastModel)
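# Side note (a sketch, not in the original): PiecewiseSchedule from
# stable_baselines.common.schedules linearly interpolates between the endpoints
# above, so the schedule can be probed directly. Assuming, for illustration,
# initLrnRate = 1e-3 and totalSteps = 1_000_000:
#   lr_schedule.value(0)                    -> 1e-3
#   lr_schedule.value(totalSteps // 2)      -> 1e-4
#   lr_schedule.value(3 * totalSteps // 4)  -> 1e-5
# The commented learning_rate line hints at wiring this in manually; DQN in
# stable-baselines 2.x otherwise takes a constant float learning rate.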
#        best_mean_reward = mean_reward
#        # Example for saving best model
#        print("Saving new best model")
#        _locals["self"].save("./models/best_model_dqn.pkl")
#    print("-" * 90)
#    n_steps += 1
    return True


if __name__ == "__main__":
    steering_angles = np.array([-0.65, -0.5, -0.25, -0.1, 0.0, 1.0, 0.25, 0.5, 0.65])
    env = AirSimGym(continuous=False, off_road_dist=2.9, max_speed=4.1,
                    scale_reward=True, steering_angles=steering_angles)
    #env = Monitor(env, log_dir, allow_early_resets=True)
    #env = DummyVecEnv([lambda: env])

    model = DQN(MlpPolicy,
                env,
                buffer_size=80000,
                learning_rate=0.001,
                train_freq=2,
                batch_size=64,
                exploration_fraction=0.1,
                exploration_final_eps=0.02)

    start_date = datetime.now()
    #model = DQN.load(models + "best_model_dqn.pkl", env=env)
    model.learn(total_timesteps=500000, log_interval=200, callback=callback)
    end_date = datetime.now()
    hours = int((end_date - start_date).total_seconds()) // 3600

    model.save(f"./models/dqn_final_ver{VER_NO}_{hours}hrs.pkl")
env.close()
HTML(show_env(frames))

# ## Deep Q-Learning

# +
# This is example code from https://github.com/hill-a/stable-baselines
# -

# Create environment
env = gym.make('LunarLander-v2')

# Instantiate the agent
model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)

# Train the agent
model.learn(total_timesteps=int(2e5))

# Save the agent
model.save("dqn_lunar_new")
del model  # delete trained model to demonstrate loading

# +
# Load the trained agent
model = DQN.load("dqn_lunar")

# Enjoy trained agent
obs = env.reset()
frames = []
import os

import gym
import gym_donkeycar
import numpy as np

from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

# SET UP ENVIRONMENT
os.environ['DONKEY_SIM_PATH'] = f"./DonkeySimMac/donkey_sim.app/Contents/MacOS/donkey_sim"
os.environ['DONKEY_SIM_PORT'] = str(9091)
os.environ['DONKEY_SIM_HEADLESS'] = str(1)  # "1" is headless

env = gym.make("donkey-warehouse-v0")
#gym.make("donkey-generated-roads-v0")

timesteps = 100000  # Set this to a reasonable number
model_name = "dqn_model"  # Change the model name to your preferences
training = True  # Change this to test or use the model

if training:
    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=timesteps)
    model.save(model_name)
else:
    model = DQN.load(model_name)

obv = env.reset()
for t in range(10000):
    action, _states = model.predict(obv)  # drive straight with small speed
    # execute the action
    obv, reward, done, info = env.step(action)
# Create environment
n_cpu = 1  # gotta be 1 (controlling a single Minecraft agent..)
#env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])
env = VecFrameStack(
    DummyVecEnv([lambda: RemoteEnv(args.url) for i in range(n_cpu)]),
    args.frame_stack)

#TODO: use warnings module
if args.save_path is None:
    print("Warning: no save_path provided. Model will not be saved.")

if args.load_path is not None:
    # Load model
    print("Loading '{}'...".format(args.load_path))
    #model = PPO2.load(args.load_path, env, verbose=0)
    model = DQN.load(args.load_path, env, verbose=0)
else:
    # Create new model
    #model = PPO2(MlpLstmPolicy, env, verbose=0, nminibatches=1)  # have to set nminibatches to 1
    #model = PPO2(MlpLnLstmPolicy, env, verbose=0, nminibatches=1)
    #model = PPO2(MlpPolicy, env, verbose=0)
    model = DQN(MlpPolicy, env, verbose=0)
    # ...and immediately save
    save_model()

sys.stdout.flush()

# Some large number
fluct_life = 999999999999
training_step_counter = 0
"prioritized_replay": False, "total_timesteps": 10**5, "layers": [5] } log_dir = "/tmp/" + e["name"] + "/" os.makedirs(log_dir, exist_ok=True) b_program_settings["n_blue_cars"] = e["n"] env = gym_env_generator(episode_timeout=30) env = Monitor(env, log_dir) policy_kwargs = dict(layers=e["layers"]) model = DQN("MlpPolicy", env, verbose=1, exploration_fraction=0.9, exploration_final_eps=0, learning_rate=0.001, learning_starts=100, policy_kwargs=policy_kwargs, double_q=e["double_q"], prioritized_replay=e["prioritized_replay"]) env = gym_env_generator(episode_timeout=100) observation = env.reset() print(observation) observation = np.array(observation) vectorized_env = model._is_vectorized_observation(observation, model.observation_space) observation = observation.reshape((-1, ) + model.observation_space.shape) with model.sess.as_default(): actions, a, b = model.step_model.step(observation, deterministic=True) print(actions)