Example #1
import gym
import gym_flappy_bird
import datetime

from stable_baselines.deepq.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecFrameStack

from env_wrapper import make_flappy_env
from stable_baselines import DQN

ENV_ID = 'flappy-bird-v0'

env = make_flappy_env(ENV_ID, num_env=1, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = DQN(CnnPolicy,
            env,
            verbose=1,
            tensorboard_log='./dqn/dqn_2300k_timetest')

start_time = datetime.datetime.now()

model.learn(total_timesteps=2300000)

print(datetime.datetime.now() - start_time)

model.save("dqn_2300k")

print('Finished')
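# --- Added sketch (not part of the original example) ---
# Reload the checkpoint saved above and run it in the same frame-stacked env.
# Assumes the make_flappy_env / VecFrameStack setup from this example.
loaded_model = DQN.load("dqn_2300k")
obs = env.reset()
for _ in range(1000):
    action, _states = loaded_model.predict(obs)
    obs, rewards, dones, info = env.step(action)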
Example #2
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

from stable_baselines import DQN
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.atari_wrappers import FrameStack, WarpFrame, MaxAndSkipEnv, EpisodicLifeEnv
import tensorflow as tf
# Suppress warnings
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import hyperparams as hp

env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = EpisodicLifeEnv(env)
env = WarpFrame(env)
env = FrameStack(env, n_frames=hp.FRAME_STACK)
env = MaxAndSkipEnv(env, skip=hp.FRAME_SKIP)

model = DQN.load("models/round3/best_model")

obs = env.reset()

# cr = 0
# while True:
#     action, _states = model.predict(obs, deterministic=False)
#     obs, rewards, done, info = env.step(action)
#     cr += rewards
#     print("Reward: {}\t\t".format(cr), end='\r')
#     env.render()
#     if (done):
#         print("Finished an episode with total reward: ", cr)
#         cr = 0
#         break
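# --- Added sketch (not part of the original example) ---
# evaluate_policy is imported above but never used; this is one way it could
# score the loaded model on the wrapped environment.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print("Mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))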
Example #3
def launchAgent(env_name: int,
                model_name: str,
                test_mode=False,
                filepath=None):
    """
    :param test_mode: 에이전트를 테스트 모드로 불러와 주행시킬지를 확인하는 모드입니다. 이럴 시에 학습은 이루어지지 않으며, 주행만 이루어집니다.
    :param env_name: 불러올 환경의 이름입니다.
        1 : 미니맵 이미지를 사용하지 않은, 점 사이의 거리 계산을 한 환경입니다.
        2 : 미니맵 이미지를 사용하고, 보상을 업데이트한 모델입니다.
        다른 값(기본) : 현재 쓰는 모델입니다. 미니맵 이미지를 사용하고, 보상을 다시 업데이트한 모델입니다.
    :param model_name: 설정할 모델의 이름입니다.
        DQN : DQN 모델을 불러옵니다.
        HER : HER 모델을 불러옵니다.
        다른 값(기본) : PPO2 모델을 불러옵니다.
    :return: 마지막으로 episode를 수행한 모델을 return합니다.
    """

    from stable_baselines import DQN, HER, PPO2

    if env_name == 1:
        from Reinforcement_AI.env.a_env import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 2:
        from Reinforcement_AI.env.d_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv1
        kart_env = DetailedMiniMapEnv1()
        policy = "CnnPolicy"
    elif env_name == 3:
        from Reinforcement_AI.env.a_env2 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 4:
        from Reinforcement_AI.env.a_env3 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    else:  #env_name == "detailed_minimap_enhanced" or env_name == "4":
        from Reinforcement_AI.env.e_enhanced_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv2
        kart_env = DetailedMiniMapEnv2()
        policy = "CnnPolicy"

    if model_name == "DQN":
        model = DQN(policy=policy,
                    env=kart_env,
                    double_q=True,
                    prioritized_replay=True,
                    verbose=1)
    elif model_name == "HER":
        model = HER(policy=policy, env=kart_env, model_class=DQN, verbose=1)
    else:  # model_name == "PPO2"
        model = PPO2(policy=policy,
                     learning_rate=0.0001,
                     env=kart_env,
                     verbose=1)

    if test_mode:  # In test mode, load a saved agent and run it
        model = model.load(filepath, env=kart_env)  # load() is a classmethod: rebind the returned model
        kart_env.set_continuos(True)

        while True:
            observation = kart_env.reset()
            while True:
                action, _states = model.predict(observation)
                observation, rewards, dones, info = kart_env.step(action)
                if dones:
                    break

    else:
        for i in range(1000):
            model.learn(total_timesteps=12500)
            model.save(str(env_name) + "_" + model_name + "_" + str(i + 1))
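    # Added to match the docstring above: hand back the last trained model.
    return model


# --- Added usage sketch (not part of the original) ---
# Hypothetical call; the checkpoint name "2_DQN_100" is a placeholder that
# follows the save pattern used in the training branch above.
# launchAgent(env_name=2, model_name="DQN", test_mode=True, filepath="2_DQN_100")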
Example #4
if MODEL == 'DQN':
    from stable_baselines.deepq.policies import LnCnnPolicy, MlpPolicy

    if ENVIRONMENT in ['rgbd', 'rgb', 'rgbdsparse']:
        model = DQN(LnCnnPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=(log_dir + "tensorboard_%s_%s_%s/") %
                    (MODEL, ENVIRONMENT, DATE),
                    gamma=args.discount,
                    learning_rate=args.lr,
                    buffer_size=50000,
                    exploration_fraction=0.1,
                    exploration_final_eps=0.02,
                    train_freq=1,
                    batch_size=32,
                    double_q=True,
                    learning_starts=1000,
                    target_network_update_freq=500,
                    prioritized_replay=True,
                    prioritized_replay_alpha=0.6,
                    prioritized_replay_beta0=0.4,
                    prioritized_replay_beta_iters=None,
                    prioritized_replay_eps=1e-06,
                    param_noise=False,
                    _init_setup_model=True,
                    policy_kwargs=None,
                    full_tensorboard_log=False)

    elif ENVIRONMENT in 'possensor':
        model = DQN(MlpPolicy,
                    env,
Example #5
tensorboard_folder = './tensorboard/Bomberman/base/'
model_folder = './models/Bomberman/base/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = 'Cnn'
model_tag = 'Cnn'
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: BaseEnv()])
env = VecFrameStack(env, 2)

model = DQN(CustomCnnPolicy, env, verbose=0, tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=10000000, tb_log_name='DQN' + model_tag)

model.save(model_folder + "DQN" + model_tag)
del model
model = DQN.load(model_folder + "DQN" + model_tag)

done = False
states = None
obs = env.reset()

while not done:
    action, states = model.predict(obs, states)
    obs, _, done, info = env.step(action)
    env.render()
Example #6
# Create log dir
log_dir = args.log_path
os.makedirs(log_dir, exist_ok=True)

env = gym_gvgai.make(args.env)
env = WarpFrame(env)
env = Monitor(env, log_dir, allow_early_resets=True)

if args.save_video_interval != 0:
    env = gym.wrappers.Monitor(
        env,
        os.path.join(log_dir, "videos"),
        video_callable=(lambda ep: ep % args.save_video_interval == 0),
        force=True)

model = DQN(CnnPolicy,
            env,
            verbose=1,
            exploration_fraction=args.exploration_fraction,
            exploration_final_eps=args.exploration_final_eps,
            tensorboard_log="tensorboard_log",
            prioritized_replay=bool(args.double_q),
            double_q=bool(args.double_q),
            buffer_size=int(args.buffer_size),
            train_freq=args.train_freq,
            batch_size=args.batch_size,
            seed=args.seed)

model.learn(total_timesteps=int(args.num_timesteps), callback=callback)
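# --- Added sketch (not part of the original example) ---
# Persist the trained model; the file name below is a placeholder.
model.save(os.path.join(log_dir, "dqn_gvgai"))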
Example #7
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import DQN
from PaddleEnv import PaddleEnv

model = DQN.load("model.h5")
env = DummyVecEnv([lambda: PaddleEnv()])
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, _ = env.step(action)
Example #8
env = DummyVecEnv([lambda: env2])
env4 = DummyVecEnv([lambda: env3])
check_env(env2, warn=True)

# Define callbacks
# Callback stops training once the mean reward reaches the threshold
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=env2.calculate_threshold(), verbose=1)
# Callback saves the current best model
eval_callback = EvalCallback(env4, callback_on_new_best=callback_on_best, verbose=1, best_model_save_path='./DQN_Models/best/')
checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./model_checkpoints/')

# Uncomment to train a fresh model from scratch; otherwise the already trained model loaded below is trained further
#model = DQN(MlpPolicy, env, verbose=2, tensorboard_log="./logs/progress_tensorboard/")

# Load current best model
model = DQN.load("DQN_Models/dqn_5x5_3_SingleShot.zip", verbose=2, env=env, tensorboard_log="./logs/progress_tensorboard/")

# Train model
model.learn(total_timesteps=1000000, callback=[checkpoint_callback, eval_callback])

#Delete current model and load the best model
del model
model = DQN.load("./DQN_Models/best/best_model.zip", verbose=2, env=env, tensorboard_log="./logs/progress_tensorboard/")

# Test trained model
results = []
for iteration in range(100):
    score = 0
    print('Iteration', iteration)
    # Observed Player board
    observation = env.reset()
Example #9
def test_dqn(name):
    model_path = os.path.join('models', name)
    model = DQN.load(model_path)
    return model
    # if cfg["--train"]:
    #     policy_kwargs = {"net_arch": [512, 512]}
    #     model = PPO2(MlpPolicy, env,
    #                  verbose=1,
    #                  policy_kwargs=policy_kwargs,
    #                  n_steps=cfg["--n_steps"],
    #                  learning_rate=cfg["--learning_rate"],
    #                  tensorboard_log="./logs/")
    #     model.learn(total_timesteps=int(cfg["--steps"]))
    #     model.save("ppo2_intersection")
    #
    # if cfg["--test"]:
    #     model = PPO2.load("ppo2_intersection")
    #     obs = env.reset()
    #     while True:
    #         action, _states = model.predict(obs)
    #         obs, rewards, dones, info = env.step(action)
    #         env.render()

    if cfg["--train"]:
        policy_kwargs = {}
        model = DQN(DQNMlp, env,
                    verbose=1,
                    policy_kwargs=policy_kwargs,
                    batch_size=cfg["--batch_size"],
                    exploration_fraction=0.3,
                    learning_rate=cfg["--learning_rate"],
                    tensorboard_log="./logs/")
        model.learn(total_timesteps=int(cfg["--steps"]))
        model.save("deepq_intersection")
Example #10
from stable_baselines import DQN

from space_lander.envs.spacex_lander import *

# Create environment
env_names = ['SpaceXLander-v0', 'LunarLanderv2-v0']
env_name = env_names[0]
env = gym.make(env_name)

# Instantiate the agent
model = DQN(policy='MlpPolicy',
            env=env,
            learning_rate=1e-3,
            prioritized_replay=True,
            verbose=1,
            tensorboard_log=f"./{env_name}")

# Train the agent
obs = env.reset()


def eval_and_show(*args, **kwargs):
    if args[0]['t'] % 10000 == 0:
        print('Evaluating', args[0]['t'])
        done = False
        while not done:
            action, _states = model.predict(args[0]['obs'])
            obs, reward, done, info = env.step(action)
            env.render()
        # env.close()
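# --- Added sketch (not part of the original example) ---
# eval_and_show is defined above but never attached; Stable Baselines 2 passes
# the training loop's local variables to functional callbacks, which is what
# args[0] indexes into. The step count below is an arbitrary placeholder.
model.learn(total_timesteps=int(1e5), callback=eval_and_show)
model.save(f"dqn_{env_name}")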
Example #12
    def train_single(self, env_name="Merging-v0"):
        """
        Directly trains on env_name
        """
        for seed in [201, 202, 203, 204, 205]:
            print(f"\ntraining with bsize {self.bs}, seed{seed}")
            self.seed = seed
            self.experiment_name = f"B{self.bs}R{seed}"
            print("EXPT NAME: ", self.experiment_dir1, self.experiment_name)
            self.experiment_dir = os.path.join(self.experiment_dir1,
                                               self.experiment_name)
            self.create_eval_dir()
            self.model = None
            env = gym.make(env_name)
            eval_env = gym.make(env_name)
            env._set_barrier_size(self.bs)
            env._set_homotopy_class('right')
            eval_env._set_barrier_size(self.bs)
            eval_env._set_homotopy_class('right')
            if self.model_type == "PPO":
                if self.is_save:
                    ### DEEPER NETWORK
                    #policy_kwargs = dict(net_arch=[dict(pi=[64, 64, 64, 64],
                    #                                    vf=[64, 64, 64, 64])]
                    #                                    )
                    #self.PPO = PPO2('MlpPolicy', env, verbose=1, seed=self.seed, learning_rate=1e-3,
                    #                policy_kwargs=policy_kwargs)
                    ### DROPOUT
                    #self.PPO = PPO2(MlpGeneralPolicy1, env, verbose=1, seed=self.seed, learning_rate=1e-3)
                    ### REGULAR
                    self.PPO = PPO2('MlpPolicy',
                                    env,
                                    verbose=1,
                                    seed=self.seed,
                                    learning_rate=1e-3)
                else:
                    self.PPO = PPO2('MlpPolicy',
                                    env,
                                    verbose=1,
                                    seed=self.seed,
                                    learning_rate=1e-3)

                self.model = train(self.PPO, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, 0)
            elif self.model_type == "DQN":
                if self.is_save:
                    self.DQN = DQN(
                        'MlpPolicy',
                        env,
                        verbose=1,
                        seed=self.seed,
                        prioritized_replay=True,
                        learning_rate=1e-3,
                        tensorboard_log="./Gridworldv1_tensorboard/" +
                        self.experiment_name,
                        full_tensorboard_log=True)
                else:
                    self.DQN = DQN('MlpPolicy',
                                   env,
                                   verbose=1,
                                   seed=self.seed,
                                   prioritized_replay=True,
                                   learning_rate=1e-3)
                self.model = train(self.DQN, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, 0)
            elif self.model_type == "HER":
                env = HERGoalEnvWrapper(env)
                eval_env = HERGoalEnvWrapper(eval_env)
                print("bs: ", env.env.barrier_size)
                print("hc: ", env.env.homotopy_class)
                self.HER = HER('MlpPolicy',
                               env,
                               DDPG,
                               n_sampled_goal=4,
                               goal_selection_strategy="future",
                               seed=self.seed,
                               verbose=1)
                self.model = train(self.HER, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, 0)
Example #13
if __name__ == '__main__':

    env = gym.make('CartPole-v1')

    env = VisualizationEnv(
        env,
        steps_lookback=10000,
        episodic=True,
        features_names=[
            'Cart Position', 'Cart Velocity', 'Pole Angle',
            'Pole Velocity At Tip'
        ],
        actions_names=['Push cart to the left', 'Push cart to the right'])

    model = DQN(CustomDQNPolicy,
                env,
                verbose=1,
                learning_rate=1e-3,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                prioritized_replay=True)
    model.learn(total_timesteps=100000)

    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

    env.close()
    env.envs[0].join()
Example #14
def main(game, method, pixels, tca, runname, run):
    def obj(variable, actions_0, obs):
        perturbation = np.zeros([1, 84, 84, 4])
        for i in range(len(variable) // 3):
            x = int(np.round(variable[3 * i] * 83))
            y = int(np.round(variable[3 * i + 1] * 83))
            pixel_attack = int(np.round(variable[3 * i + 2] * 254))
            x = np.clip(x, 0, 83)
            y = np.clip(y, 0, 83)
            pixel_attack = np.clip(pixel_attack, 0, 254)
            perturbation[:, x, y, :] = pixel_attack
        perturbation = np.clip(perturbation, 0, 254)  # np.clip returns a new array; assign the result
        obs_new = obs + perturbation
        actions_new = model.action_probability(obs_new)
        fitness_value = max_max_distance(actions_new, actions_0)
        return fitness_value

    def evaluate(variable, obs):
        perturbation = np.zeros([1, 84, 84, 4])
        for i in range(len(variable) // 3):
            x = int(np.round(variable[3 * i] * 83))
            y = int(np.round(variable[3 * i + 1] * 83))
            pixel_attack = int(np.round(variable[3 * i + 2] * 254))
            x = np.clip(x, 0, 83)
            y = np.clip(y, 0, 83)
            pixel_attack = np.clip(pixel_attack, 0, 254)
            perturbation[:, x, y, :] = pixel_attack
        perturbation = np.clip(perturbation, 0, 254)  # np.clip returns a new array; assign the result
        obs_new = obs + perturbation
        actions = model.action_probability(obs)
        actions_new = model.action_probability(obs_new)
        action, _states = model.predict(obs_new)
        obs_candi, rewards, dones, infos = env.step(action)
        return obs_candi, rewards, dones, infos, obs_new, actions_new, perturbation

    def minmax_distance(actions_new, actions_0):
        arg_max = np.argmax(actions_0[0])
        arg_min = np.argmin(actions_0[0])
        minmax_dist = actions_new[0][arg_min] - actions_new[0][arg_max]
        return minmax_dist

    def max_max_distance(actions_new, actions_0):
        arg_max = np.argmax(actions_0[0])
        a_candid = list(actions_new[0])
        a_candid.remove(a_candid[arg_max])
        maxmax_dist = np.max(a_candid) - actions_new[0][arg_max]
        return maxmax_dist

    def calculate_entropy(actions):
        entropy_actions = [
            -probs * np.log(probs) / np.log(len(actions)) for probs in actions
        ]
        entropy = np.sum(entropy_actions)
        return entropy

    alg = GA
    model = DQN.load("trained_agents/{}/{}NoFrameskip-v4".format(method, game))
    Episode_Reward = []
    Episode_Lenth = []
    Attack_times = []
    dir_name = 'results/{}/{}/{}/FSA_{}_TCA_{}'.format(runname, method, game,
                                                       pixels, tca)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    atk_num = pixels
    bounds = [[0, 1], [0, 1], [0, 1]] * atk_num
    env = make_atari_env('{}NoFrameskip-v4'.format(game),
                         num_env=1,
                         seed=run,
                         wrapper_kwargs=None,
                         start_index=0,
                         allow_early_resets=True,
                         start_method=None)
    env = VecFrameStack(env, n_stack=4)
    env.reset()
    model.set_env(env)
    obs = env.reset()
    x0 = [0.5, 0.5, 0.5] * atk_num
    atk_time = 0
    TrueS_array = []
    Delta_array = []
    CleanS_array = []
    for i in range(5000):
        actions = model.action_probability(obs)
        attack_significance = calculate_entropy(actions[0])
        CleanS_array.append((obs[0, :, :, 3]).astype('uint8'))
        if attack_significance <= tca:
            atk_time = atk_time + 1
            l = alg(lambda variable: obj(variable, actions, obs),
                    x0,
                    xBound=bounds,
                    verbose=False)
            l.maximize = True
            l.maxEvaluations = 400
            res = l.learn()
            solution = list(res)[0]
            obs, rewards, dones, infos, obs_new, actions_new, perturbation = evaluate(
                solution, obs)
            obs_store = np.int_(obs_new)
            true_state = (obs_store[0, :, :, 3]).astype('uint8')
            TrueS_array.append(true_state)
            Delta_array.append(perturbation[0, :, :, 3].astype('uint8'))
        else:
            obs = np.int_(obs)
            true_state = (obs[0, :, :, 3]).astype('uint8')
            TrueS_array.append(true_state)
            Delta_array.append(np.zeros([84, 84]).astype('uint8'))
            action, _states = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)

        episode_infos = infos[0].get('episode')
        if episode_infos is not None:
            print("Atari Episode Score: {:.2f}".format(episode_infos['r']))
            print("Atari Episode Length", episode_infos['l'])
            REWARD = episode_infos['r']
            Lenth = episode_infos['l']
            break
    size = (84, 84)
    video_dir = 'results/{}_videos/{}/{}/FSA_{}_TCA_{}'.format(
        runname, method, game, pixels, tca)
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)
    fps = 10
    out_true = cv2.VideoWriter('{}/true_run_{}.avi'.format(video_dir, run),
                               cv2.VideoWriter_fourcc(*'XVID'), fps,
                               size)  #*'PIM1'
    out_delta = cv2.VideoWriter('{}/delta_run_{}.avi'.format(video_dir, run),
                                cv2.VideoWriter_fourcc(*'XVID'), fps, size)
    out_clean = cv2.VideoWriter('{}/clean_run_{}.avi'.format(video_dir, run),
                                cv2.VideoWriter_fourcc(*'XVID'), fps, size)
    for i in range(len(TrueS_array)):
        image_true = TrueS_array[i]
        x_true = np.repeat(image_true, 3, axis=1)
        x_true = x_true.reshape(84, 84, 3)
        x_true[:, :, 0] = 150 * np.ones((84, 84), dtype=int)
        x_true[:, :, 1] = 150 * np.ones((84, 84), dtype=int)
        out_true.write(x_true)
        image_delta = Delta_array[i]
        x_delta = np.repeat(image_delta, 3, axis=1)
        x_delta = x_delta.reshape(84, 84, 3)
        x_delta[:, :, 0] = 150 * np.ones((84, 84), dtype=int)
        x_delta[:, :, 1] = 150 * np.ones((84, 84), dtype=int)
        out_delta.write(x_delta)
        image_clean = CleanS_array[i]
        x_clean = np.repeat(image_clean, 3, axis=1)
        x_clean = x_clean.reshape(84, 84, 3)
        x_clean[:, :, 0] = 150 * np.ones((84, 84), dtype=int)
        x_clean[:, :, 1] = 150 * np.ones((84, 84), dtype=int)
        out_clean.write(x_clean)
    cv2.destroyAllWindows()
    out_true.release()
    out_delta.release()
    out_clean.release()
    Episode_Reward.append(REWARD)
    Episode_Lenth.append(Lenth)
    Attack_times.append(atk_time)
    data = np.column_stack((Episode_Reward, Attack_times, Episode_Lenth))
    np.savetxt('{}/run_{}.dat'.format(dir_name, run), data)
Example #15
    def reset(self):
        return reset_b(self.jlenv)

    @property
    def observation_space(self):
        return Box(low=-1.0, high=1.0, shape=(2,))

    @property
    def action_space(self):
        # if you want a continuous action space, use a box
        # return Box(low=-1.0, high=1.0, shape=(1,))
        return Discrete(3)
        
from stable_baselines import DQN
from stable_baselines.common.vec_env import DummyVecEnv

dqn = DQN('MlpPolicy',  DummyVecEnv([lambda: MCEnv(deepcopy(mc))]), verbose=1, exploration_fraction=0.1)

print("getting ready to learn...")
dqn.learn(total_timesteps=10)

from julia.DMUStudent import evaluate
from julia.Base import convert, Function, Float64

def policy_function(s):
    act, st = dqn.predict(s)
    jl_act = [-1.0, 0.0, 1.0][act] # careful that this matches action decoding above!!
    return convert(Float64, jl_act)
    
evaluate(convert(Function, policy_function), "hw4", n_episodes=100)
        "prioritized_replay": True,
        "total_timesteps": 10**7,
        "layers": [7, 7]
    },
]

for e in experiments:
    print(e)
    # Create log dir
    log_dir = "/tmp/" + e["name"] + "/"
    os.makedirs(log_dir, exist_ok=True)
    b_program_settings["n_blue_cars"] = e["n"]
    env = gym_env_generator(episode_timeout=30)
    env = Monitor(env, log_dir)
    policy_kwargs = dict(layers=e["layers"])
    model = DQN("MlpPolicy",
                env,
                verbose=1,
                exploration_fraction=0.9,
                exploration_final_eps=0,
                learning_rate=0.001,
                learning_starts=100,
                policy_kwargs=policy_kwargs,
                double_q=e["double_q"],
                prioritized_replay=e["prioritized_replay"])
    model.learn(total_timesteps=e["total_timesteps"])
    model.save(log_dir + e["name"])
    del model  # remove to demonstrate saving and loading
    model = DQN.load(log_dir + e["name"])
    evaluate_model(model)
    env.close()
if __name__ == "__main__":
    env_id = "/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux"
    worker_id = 19
    env = UnityEnv(env_id, worker_id=worker_id, use_visual=False, no_graphics=True)
    # Create log dir
    time_int = int(time.time())
    log_dir = "stable_results/basic_env_{}/".format(time_int)
    os.makedirs(log_dir, exist_ok=True)

    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    num_env = 2

    #env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])

    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=30000)
    model.save(log_dir+"model")

    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)
            total_l += 1.
Example #18
    #def render(self, mode='human'):

    #def close (self):


from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN
from stable_baselines.common.callbacks import CheckpointCallback
# Save a checkpoint every 1000 steps
checkpoint_callback = CheckpointCallback(save_freq=500000,
                                         save_path='./logs/',
                                         name_prefix='dqn_model')

env = CustomEnv(size=4,
                score_to_win=None,
                rate_2=0.5,
                random=False,
                enable_rewrite_board=False)

#model = DQN(MlpPolicy, env, verbose=1)
model = DQN.load("./DQN5")
model.set_env(env)
model.learn(total_timesteps=5000000, callback=checkpoint_callback)
model.save("./DQN6")

#del model # remove to demonstrate saving and loading

#model = DQN.load("./deepq_2048")
Example #19
from stable_baselines import DQN
from stable_baselines.gail import generate_expert_traj

def test_generate_cartpole():
    model = DQN('MlpPolicy', 'CartPole-v1', verbose=0)
    generate_expert_traj(model, 'expert_cartpole', n_timesteps=1000, n_episodes=10)
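# --- Added sketch (not part of the original test) ---
# generate_expert_traj writes expert_cartpole.npz; the usual follow-up is to
# load it back and behaviour-clone a fresh model with pretrain().
def test_pretrain_cartpole():
    from stable_baselines.gail import ExpertDataset
    dataset = ExpertDataset(expert_path='expert_cartpole.npz',
                            traj_limitation=-1, batch_size=128)
    model = DQN('MlpPolicy', 'CartPole-v1', verbose=0)
    model.pretrain(dataset, n_epochs=10)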
Example #20
import gym

from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = gym.make('CartPole-v1')

model = DQN(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("deepq_cartpole")

del model  # remove to demonstrate saving and loading

model = DQN.load("deepq_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
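# --- Added sketch (not part of the original example) ---
# Instead of the endless render loop above, evaluate_policy gives a quick
# quantitative check of the reloaded model.
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print("Mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))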
Example #21
                  gamma=config['gamma'],
                  policy_kwargs=config['policy_kwargs'],
                  verbose=1,
                  tensorboard_log=save_path)

elif config['algorithm'] == 'PPO2':
    env = make_vec_env(lambda: env, n_envs=1)
    model = PPO2(config['policy_network'],
                 env,
                 learning_rate=config['learning_rate'],
                 gamma=config['gamma'],
                 policy_kwargs=config['policy_kwargs'],
                 verbose=1,
                 tensorboard_log=save_path)

elif config['algorithm'] == 'DQN':
    model = DQN(
        config['policy_network'],
        env,
        learning_rate=config['learning_rate'],
        buffer_size=config['buffer_size'],
        target_network_update_freq=64,
        gamma=config['gamma'],  # policy_kwargs = config['policy_kwargs'],
        verbose=1,
        tensorboard_log=save_path)

model.learn(config['total_steps'], callback=callback)
model.save(os.path.join(save_path, 'model'))

env.close()
Example #22
import time
import gym
from stable_baselines import DQN
from stable_baselines.deepq.policies import FeedForwardPolicy

env = gym.make('MountainCar-v0')


# Custom MLP policy of two layers of size 32 each
class CustomDQNPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDQNPolicy, self).__init__(*args,
                                              **kwargs,
                                              layers=[16, 16],
                                              layer_norm=False,
                                              feature_extraction="mlp")


model = DQN(CustomDQNPolicy, env, verbose=1)

#model.learn(total_timesteps=25000)

#generate_expert_traj(model, "I:\Code\BachelorThesis\cartpole\data\expert_cartpole", n_episodes=10)

#test it
reward_sum = 0.0
obs = env.reset()
for i in range(0, 10):
    obs = env.reset()  # reset at the start of every episode
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        env.render()
Example #23
def launchAgent():
    from stable_baselines import DQN
    import Reinforcement_AI.env.c_seperate_env as sep_env
    from queue import Queue
    from threading import Thread

    minimap_env = sep_env.MinimapEnv()
    allenv = sep_env.AllEnv()

    minimap_model = DQN(
        "CnnPolicy",  # policy
        minimap_env,  # environment
        double_q=True,  # Double Q enable
        prioritized_replay=True,  # Replay buffer enabled
        verbose=0  # log print
    )

    allenv_model = DQN(
        "MlpPolicy",
        allenv,
        double_q=True,
        prioritized_replay=True,
        verbose=0
    )

    for i in range(100):
        if i != 0:
            minimap_model = DQN.load("KR_minimap_" + str(i))
            allenv_model = DQN.load("KR_allenv_" + str(i))

        que = Queue()

        minimap_model.set_env(minimap_env)
        allenv_model.set_env(allenv)

        # minimap_thread = Thread(target=minimap_model.learn, args=[50000])
        # allenv_thread = Thread(target=allenv_model.learn, args=[50000])
        allenv_thread = Thread(target=lambda q, arg1: q.put(allenv_model.learn(arg1)), args=(que, 50000))
        # test = Pool(processes=1)

        # minimap_thread.start()
        allenv_thread.start()
        # test_result = test.apply_async(allenv_model.learn, (50000, None, 100, "DQN", True, None))
        minimap_model.learn(total_timesteps=50000)

        # allenv_model.learn(total_timesteps=50000)

        # minimap_thread.join()
        allenv_thread.join()

        allenv_model = que.get()
        # return_val = test_result.get()

        minimap_model.save("KR_minimap_" + str(i + 1))
        allenv_model.save("KR_allenv_" + str(i + 1))
Example #24
import gym
from game_env_gym import GameEnv
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

env = GameEnv()
#env = gym.make('CartPole-v1')

model = DQN(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=100000)
model.save("deepqrcina")

obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    env.render()
print(total_reward)
Example #25
totalSteps  = hparams['training']['totalSteps']
initLrnRate = hparams['training']['initLrnRate']
lr_schedule = PiecewiseSchedule(([
                (0, initLrnRate),
                (1*totalSteps // 2, initLrnRate * .1),
                (3*totalSteps // 4, initLrnRate * .01)
]))
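# Note (added): lr_schedule.value(t) linearly interpolates between the endpoints
# above, e.g. value(0) == initLrnRate and value(totalSteps // 2) == initLrnRate * .1.
# The schedule is currently unused because learning_rate is set to the constant
# initLrnRate below.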
model   = DQN(
    policy                  = CustomPolicy,
    env                     = env,
    verbose                 = 1,
    #learning_rate           = lr_schedule.value(step_id),
    learning_rate           = initLrnRate,
    buffer_size             = hparams['training']['bufferSize'],
    gamma                   = hparams['training']['gamma'],
    batch_size              = hparams['training']['batchSize'],
    learning_starts         = hparams['training']['learningStarts'],
    exploration_fraction    = .95,
    exploration_final_eps   = .0,
    param_noise             = False,
    prioritized_replay      = False,
    tensorboard_log         = pathToLog,
    full_tensorboard_log    = True,
    seed                    = args.seed,
    n_cpu_tf_sess           = args.nproc)
model.learn(
    total_timesteps = hparams['training']['totalSteps'],
    log_interval    = hparams['training']['totalSteps'] // 50,
    callback        = callback,
    tb_log_name     = args.params)
model.save(pathToLastModel)
Example #26
    #             best_mean_reward = mean_reward
    #             # Example for saving best model
    #             print("Saving new best model")
    #             _locals["self"].save("./models/best_model_dqn.pkl")
            
    #         print("-" * 90)
    # n_steps += 1
    return True

if __name__ == "__main__":
    steering_angles = np.array([-0.65, -0.5, -0.25, -0.1, 0.0, 1.0, 0.25, 0.5, 0.65])
    env = AirSimGym(continuous=False, off_road_dist=2.9, max_speed=4.1, scale_reward=True, steering_angles=steering_angles)
    #env = Monitor(env, log_dir, allow_early_resets=True)
    #env = DummyVecEnv([lambda: env])

    model = DQN(MlpPolicy,
                env,
                buffer_size=80000,
                learning_rate=0.001,
                train_freq=2,
                batch_size=64,
                exploration_fraction=0.1,
                exploration_final_eps=0.02)

    start_date = datetime.now()
    #model = DQN.load(models + "best_model_dqn.pkl", env=env)
    model.learn(total_timesteps=500000, log_interval=200, callback=callback)
    end_date = datetime.now()
    hours = int((end_date - start_date).total_seconds()) // 3600
    model.save(f"./models/dqn_final_ver{VER_NO}_{hours}hrs.pkl")
Example #27
env.close()
HTML(show_env(frames))

# ## Deep Q-Learning

# +
# This is example code from https://github.com/hill-a/stable-baselines
# -

# Create environment
env = gym.make('LunarLander-v2')

# Instantiate the agent
model = DQN('MlpPolicy',
            env,
            learning_rate=1e-3,
            prioritized_replay=True,
            verbose=1)
# Train the agent
model.learn(total_timesteps=int(2e5))
# Save the agent
model.save("dqn_lunar_new")
del model  # delete trained model to demonstrate loading

# +
# Load the trained agent
model = DQN.load("dqn_lunar")

# Enjoy trained agent
obs = env.reset()
frames = []
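# --- Added sketch (not part of the original example) ---
# Play the loaded agent and collect frames, mirroring the show_env(frames)
# usage at the top of the example. rgb_array rendering is assumed to be
# available for LunarLander-v2.
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    frames.append(env.render(mode='rgb_array'))
    if done:
        obs = env.reset()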
Example #28
import os
import gym
import gym_donkeycar
import numpy as np
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN

#SET UP ENVIRONMENT
os.environ[
    'DONKEY_SIM_PATH'] = f"./DonkeySimMac/donkey_sim.app/Contents/MacOS/donkey_sim"
os.environ['DONKEY_SIM_PORT'] = str(9091)
os.environ['DONKEY_SIM_HEADLESS'] = str(1)  # "1" is headless

env = gym.make("donkey-warehouse-v0")
#gym.make("donkey-generated-roads-v0")

timesteps = 100000  # Set this to a reasonable number
model_name = "dqn_model"  # Change the model name to your preferences
training = True  # Change this to test or use the model

if training:
    model = DQN(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=timesteps)
    model.save(model_name)
else:
    model = DQN.load(model_name)
    obv = env.reset()
    for t in range(10000):
        action, _states = model.predict(obv)  # drive straight with small speed
        # execute the action
        obv, reward, done, info = env.step(action)
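        # Added (not in the original): reset when an episode ends so the test
        # rollout keeps going for the full 10000 steps.
        if done:
            obv = env.reset()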
Example #29
#create environment
n_cpu = 1  #gotta be 1 (controlling single minecraft agent..)
#env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])
env = VecFrameStack(
    DummyVecEnv([lambda: RemoteEnv(args.url) for i in range(n_cpu)]),
    args.frame_stack)

#TODO: use warnings module
if args.save_path is None:
    print("Warning: no save_path provided. Model will not be saved.")

if args.load_path is not None:
    #load model
    print("Loading '{}'...".format(args.load_path))
    #model = PPO2.load(args.load_path, env, verbose=0)
    model = DQN.load(args.load_path, env, verbose=0)
else:
    #create new model
    #model = PPO2(MlpLstmPolicy, env, verbose=0, nminibatches=1)#have to set minibatches to 1
    #model = PPO2(MlpLnLstmPolicy, env, verbose=0, nminibatches=1)
    #model = PPO2(MlpPolicy, env, verbose=0)
    model = DQN(MlpPolicy, env, verbose=0)
    #and immediately save
    save_model()

sys.stdout.flush()

#some large number
fluct_life = 999999999999
training_step_counter = 0
Example #30
    "prioritized_replay": False,
    "total_timesteps": 10**5,
    "layers": [5]
}

log_dir = "/tmp/" + e["name"] + "/"
os.makedirs(log_dir, exist_ok=True)
b_program_settings["n_blue_cars"] = e["n"]
env = gym_env_generator(episode_timeout=30)
env = Monitor(env, log_dir)
policy_kwargs = dict(layers=e["layers"])
model = DQN("MlpPolicy",
            env,
            verbose=1,
            exploration_fraction=0.9,
            exploration_final_eps=0,
            learning_rate=0.001,
            learning_starts=100,
            policy_kwargs=policy_kwargs,
            double_q=e["double_q"],
            prioritized_replay=e["prioritized_replay"])

env = gym_env_generator(episode_timeout=100)
observation = env.reset()
print(observation)
observation = np.array(observation)
vectorized_env = model._is_vectorized_observation(observation,
                                                  model.observation_space)
observation = observation.reshape((-1, ) + model.observation_space.shape)
with model.sess.as_default():
    actions, a, b = model.step_model.step(observation, deterministic=True)
print(actions)