Example #1
def test_identity():

    with tf.Graph().as_default():
        env = IdentityEnv(10)
        random.seed(0)

        tf.set_random_seed(0)

        param_noise = False
        model = deepq.models.mlp([32])
        act = deepq.learn(
            env,
            q_func=model,
            lr=1e-3,
            max_timesteps=10000,
            buffer_size=50000,
            exploration_fraction=0.1,
            exploration_final_eps=0.02,
            print_freq=10,
            param_noise=param_noise,
        )

        tf.set_random_seed(0)

        N_TRIALS = 1000
        sum_rew = 0
        obs = env.reset()
        for i in range(N_TRIALS):
            obs, rew, done, _ = env.step(act([obs]))
            sum_rew += rew

        assert sum_rew > 0.9 * N_TRIALS
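The test above is shown without its imports; a plausible header, hedged because `IdentityEnv` has lived under different module paths across baselines releases, would be:

import random

import tensorflow as tf

from baselines import deepq
from baselines.common.identity_env import IdentityEnv  # path is an assumption; newer releases moved this module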
Example #2
def main():
    logger.configure()
    env = make_atari('PongNoFrameskip-v4')
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        lr=1e-4,
        total_timesteps=int(1e7),
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )

    model.save('pong_model.pkl')
    env.close()
Example #3
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()
Example #4
def main():
    env = gym.make("CartPole-v0")
    act = deepq.learn(env, network='mlp', total_timesteps=0, load_path="cartpole_model.pkl")

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #5
def main():

  env = CartPoleBulletEnv(renders=False)
  model = deepq.models.mlp([64])
  act = deepq.learn(env,
                    q_func=model,
                    lr=1e-3,
                    max_timesteps=100000,
                    buffer_size=50000,
                    exploration_fraction=0.1,
                    exploration_final_eps=0.02,
                    print_freq=10,
                    callback=callback)
  print("Saving model to cartpole_model.pkl")
  act.save("cartpole_model.pkl")
Example #6
def main():
    env = gym.make("CartPole-v0")
    act = deepq.learn(
        env,
        network='mlp',
        lr=1e-3,
        total_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
Example #7
def main():
    env = gym.make("Wavefollower-v0")
    model = deepq.models.mlp([64,64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=2500000,
        buffer_size=50000,
        exploration_fraction=0.4,
        exploration_final_eps=0.02,
        print_freq=1
    )
    print("Saving model to wavefollower_model.pkl")
    act.save("wavefollower_model.pkl")
Example #8
def main():

  env = KukaGymEnv(renders=False, isDiscrete=True)
  model = deepq.models.mlp([64])
  act = deepq.learn(env,
                    q_func=model,
                    lr=1e-3,
                    max_timesteps=10000000,
                    buffer_size=50000,
                    exploration_fraction=0.1,
                    exploration_final_eps=0.02,
                    print_freq=10,
                    callback=callback)
  print("Saving model to kuka_model.pkl")
  act.save("kuka_model.pkl")
Example #9
def main():
    env = gym.make("MountainCar-v0")
    # Enabling layer_norm here is important for parameter space noise!
    act = deepq.learn(
        env,
        network=models.mlp(num_hidden=64, num_layers=1),
        lr=1e-3,
        total_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.1,
        print_freq=10,
        param_noise=True
    )
    print("Saving model to mountaincar_model.pkl")
    act.save("mountaincar_model.pkl")
Example #10
def main():
    env = gym.make("MountainCar-v0")
    act = deepq.learn(
        env,
        network=models.mlp(num_layers=1, num_hidden=64),
        total_timesteps=0,
        load_path='mountaincar_model.pkl'
    )

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #11
def main():

  env = KukaCamGymEnv(renders=False, isDiscrete=True)
  model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                                  hiddens=[256],
                                  dueling=False)
  act = deepq.learn(env,
                    q_func=model,
                    lr=1e-3,
                    max_timesteps=10000000,
                    buffer_size=50000,
                    exploration_fraction=0.1,
                    exploration_final_eps=0.02,
                    print_freq=10,
                    callback=callback)
  print("Saving model to kuka_cam_model.pkl")
  act.save("kuka_cam_model.pkl")
Example #12
def main():
    env = gym.make("MountainCar-v0")
    # Enabling layer_norm here is important for parameter space noise!
    model = deepq.models.mlp([64], layer_norm=True)
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.1,
        print_freq=10,
        param_noise=True
    )
    print("Saving model to mountaincar_model.pkl")
    act.save("mountaincar_model.pkl")
Example #13
def main(learning_rate, trainmaxsteps, nsimudays, npricedays):

    # Reset the default graph to avoid conflicts with existing parameters
    # (not recommended when you need to reuse parameters).
    tf.reset_default_graph()

    basetraindaysets = [
        3, 7, 12, 33, 43, 62, 69, 80, 91, 97, 98, 108, 116, 123, 126, 136, 144,
        153, 161, 174, 192, 199, 225, 230, 234, 247, 261, 274, 281, 287, 295,
        305, 313, 320, 327, 332, 345, 348, 357, 350, 360
    ]
    basedatasetlen = len(basetraindaysets)
    selectdays = basetraindaysets[
        dataset_start:basedatasetlen:dataset_interval]
    selectdaysfortrain = []
    for iday in selectdays:
        selectdaysfortrain.append(iday)
        selectdaysfortrain.append(iday + 1)
        selectdaysfortrain.append(iday + 2)

    startday = 3
    #nsimudays = 1
    #npricedays = 1
    print('---------------selectdaysfortrain: ---------------')
    print(selectdaysfortrain)
    env = SimpleBatterySimEnv(Lmpfile, batteryEtini, startday, nsimudays,
                              npricedays, selectdaysfortrain)
    model = deepq.models.mlp([256, 256])

    act = deepq.learn(env,
                      q_func=model,
                      lr=learning_rate,
                      max_timesteps=trainmaxsteps,
                      buffer_size=50000,
                      checkpoint_freq=100,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)

    print("Saving final model to %s_lr_%s_%dw.pkl" %
          (model_name, str(learning_rate), int(trainmaxsteps / 10000)))
    act.save(savedModel + "/" + model_name + "_lr_%s_%dw.pkl" %
             (str(learning_rate), int(trainmaxsteps / 10000)))
Example #14
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--env', help='environment ID', default='BeamRiderNoFrameskip-v4'
    )  # TODO: changed to BeamRider since it gives larger rewards, making progress easy to see
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int,
                        default=0)  # TODO: set to 0 (False) because the code was complaining
    parser.add_argument('--dueling', type=int,
                        default=0)  # TODO: set to 0 (False) for code simplicity
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()
    logger.configure("./log/BeamRider")  # TODO log results under BeamRider
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
        nbins=1000,  # TODO number of bins
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        min_Val=-50,  # TODO min value of Q values
        max_Val=50,  # TODO max value of Q values
        nbins=1000  # TODO number of bins
    )
    # act.save("pong_model.pkl") XXX
    env.close()
Example #15
def main(args):    
    env_name = "CartPole-v0"
    
    env = CartPoleEnv(max_ep_len=args.ep_len, seed=args.seed, append=False)
    model = deepq.models.mlp([64])
    max_timesteps = args.steps
    act = deepq.learn(
        env,
        env_name=env_name,
        q_func=model,
        lr=args.lr,
        max_timesteps=max_timesteps,
        buffer_size=50000,
        exploration_fraction=0.0001,
        exploration_final_eps=0.02,
        print_freq=1,
        callback=callback,
        eval=False,
    )
Example #16
def main():
    env = gym.make("CartPole-v0")
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback
    )
    print("Saving model to CartPole_model.pkl")
    act.save("CartPole_model.pkl")
    
if __name__ == '__main__':
    main()
Example #17
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v0')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--bufferSize', type=int, default=10000)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--learningRate', type=float, default=5e-4)
    parser.add_argument('--epsStart', type=float, default=1.0)
    parser.add_argument('--epsEnd', type=float, default=.05)
    parser.add_argument('--learningStart', type=int, default=int(1000))
    parser.add_argument('--targetNetworkUpdate', type=int, default=int(500))

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    act = deepq.learn(env,
                      q_func=model,
                      lr=args.learningRate,
                      max_timesteps=args.num_timesteps,
                      buffer_size=args.bufferSize,
                      exploration_fraction=args.epsStart,
                      exploration_final_eps=args.epsEnd,
                      train_freq=4,
                      learning_starts=args.learningStart,
                      target_network_update_freq=args.targetNetworkUpdate,
                      gamma=0.99,
                      prioritized_replay=bool(args.prioritized))
    act.save()
    env.close()
Example #18
def compare_exploration_p():
    # Probability of ignoring action
    ignore_probs = [0, .01, .03, .05, .1, .2, .3]
    empirical_metrics = {}

    for p in ignore_probs:
        steps = int(3e4)
        # Create environment with this ignore prob
        env = GridworldEnv(p_ignore=p)

        # Train policy on this environment
        with tf.Graph().as_default():
            policy = deepq.learn(env,
                                 network='mlp',
                                 lr=1e-3,
                                 total_timesteps=int(steps),
                                 buffer_size=int(2e4),
                                 exploration_fraction=.1,
                                 exploration_final_eps=0,
                                 print_freq=100,
                                 num_layers=2,
                                 num_hidden=64,
                                 activation=tf.nn.relu)
            policy.save('policy_ignore_p_{}.pkl'.format(p))

            # Run trained policy on validation set and
            # collect metrics
            episodes = collect_metrics(policy)
            all_eps_metrics = [ep_metrics(ep_states) for ep_states in episodes]
            mac_rate, avg_ep_len = metrics(all_eps_metrics)

            empirical_metrics[p] = {
                'mac_rate': mac_rate,
                'avg_ep_len': avg_ep_len
            }

        tf.reset_default_graph()

    with open('metrics.pkl', 'wb') as f:
        pickle.dump(empirical_metrics, f)

    return empirical_metrics
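Reading the dumped metrics back for later analysis only needs the standard pickle module, for example:

import pickle

with open('metrics.pkl', 'rb') as f:
    empirical_metrics = pickle.load(f)
for p, m in sorted(empirical_metrics.items()):
    print(p, m['mac_rate'], m['avg_ep_len'])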
Example #19
def main():
    args = setup_utils.setup_and_load()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    seed = int(time.time()) % 10000
    set_global_seeds(seed * 100 + rank)

    main_utils.setup_mpi_gpus()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    env = main_utils.Scalarize(main_utils.make_general_env(1, seed=rank))
    print("load path:")
    print("{}/saved_models/{}.pkl".format(Config.SAVE_PATH, Config.RUN_ID))
    act = deepq.learn(
        env,
        network="conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        total_timesteps=0,
        load_path="{}/saved_models/{}.pkl".format(Config.SAVE_PATH,
                                                  Config.RUN_ID)
        # load_path="{}/ckpts/{}/model".format(Config.SAVE_PATH, Config.RUN_ID)
    )

    num_episodes = 500
    # while True:
    episode_rew_ls = []
    for i in range(num_episodes):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if Config.RENDER:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        episode_rew_ls.append(episode_rew)
        print("Episode reward", episode_rew)
    print("Avg episode reward", np.mean(episode_rew_ls))
    print("Var episode reward", np.std(episode_rew_ls))
Example #20
def main():
    env = Env(64, 64)
    env = WarpFrame(env)
    env = ScaledFloatFrame(env)
    env = FrameStack(env, 1)
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()
    logger.configure()
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (32, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    act = deepq.learn(env,
                      q_func=model,
                      lr=1e-4,
                      max_timesteps=args.num_timesteps,
                      buffer_size=10000,
                      exploration_fraction=0.25,
                      exploration_final_eps=0.01,
                      train_freq=4,
                      learning_starts=10000,
                      target_network_update_freq=1000,
                      gamma=0.99,
                      prioritized_replay=bool(args.prioritized),
                      restore=True)
    for _ in range(100):
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            sleep(0.01)
            env.render()
            action = act(np.array(obs)[None])[0]
            obs, rew, done, _ = env.step(action)
            episode_rew += rew
            # print(action, rew)
        print("Episode reward", episode_rew)
Example #21
def main():
    global episode_rewards
    env = gym.make("LunarLander-v2")
    max_timesteps_env = env.env._spec.__dict__['tags']['wrapper_config.TimeLimit.max_episode_steps']
    model = deepq.models.mlp([256, 128], activation_fn=tf.nn.tanh)
    act = deepq.learn(
        env,
        lr=1e-5,
        q_func=model,
        target_network_update_freq=1,
        batch_size=32,
        max_timesteps=max_timesteps_env*10000,
        buffer_size=500,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback
    )
    print('done')
    return episode_rewards
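The global episode_rewards returned above is presumably filled by the callback, which is not shown; one hedged guess, copying the reward history that deepq.learn exposes through its locals, is:

episode_rewards = []

def callback(lcl, _glb):
    # Keep a copy of the reward history tracked inside deepq.learn.
    global episode_rewards
    episode_rewards = list(lcl['episode_rewards'])
    return False  # never stop training early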
Example #22
def main():
    env = gym.make("imgreg_train-v4")
    data_path = 'data/KLA/train/3.h5'
    env.loadData(data_path)
    model = deepq.models.cnn_to_mlp([(16, 8, 4), (32, 8, 4), (64, 4, 2),
                                     (32, 3, 1)], [256])
    act = deepq.learn(env,
                      q_func=model,
                      lr=1e-3,
                      max_timesteps=50000,
                      checkpoint_freq=1000,
                      buffer_size=50000,
                      exploration_fraction=0.3,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      gamma=0.95,
                      batch_size=32,
                      load_model='models/KLA/2.pkl')
    print("Saving model")
    act.save("models/KLA/3.pkl")
Example #23
def main(args):
    """
    train and save the DeepQ model, for the mountain car problem

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make("MountainCar-v0")
    # Enabling layer_norm here is important for parameter space noise!
    model = deepq.models.mlp([64], layer_norm=True)
    act = deepq.learn(env,
                      q_func=model,
                      learning_rate=1e-3,
                      max_timesteps=args.max_timesteps,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.1,
                      print_freq=10,
                      param_noise=True)
    print("Saving model to mountaincar_model.pkl")
    act.save("mountaincar_model.pkl")
Example #24
def main():
    env = gym.make("PongNoFrameskip-v4")
    env = deepq.wrap_atari_dqn(env)
    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        total_timesteps=0
    )

    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(model(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
Example #25
def play():
    engine_configuration_channel = EngineConfigurationChannel()
    # Set the simulation time scale to 10x
    engine_configuration_channel.set_configuration_parameters(time_scale=10.0)
    unity_env = UnityEnvironment("./ml-agents/Project/PushBlock",
                                 side_channels=[engine_configuration_channel])
    env = UnityToGymWrapper(unity_env, 0, flatten_branched=True)
    # Load the trained model
    model = deepq.learn(env, "mlp", total_timesteps=0, load_path="./model")

    obs = env.reset()
    obs = np.expand_dims(np.array(obs), axis=0)

    while True:
        action, _, _, _ = model.step(tf.constant(obs))
        action = action[0].numpy()
        obs, rew, done, _ = env.step(action)
        if done:
            obs = env.reset()
        obs = np.expand_dims(np.array(obs), axis=0)
Example #26
def main(learning_rate):

    # Reset the default graph to avoid conflicts with existing parameters
    # (not recommended when you need to reuse parameters).
    tf.reset_default_graph()
    env = PowerDynSimEnv(case_files_array, dyn_config_file, rl_config_file,
                         java_port)
    model = deepq.models.mlp([128, 128])

    act = deepq.learn(env,
                      q_func=model,
                      lr=learning_rate,
                      max_timesteps=900000,
                      buffer_size=50000,
                      checkpoint_freq=1000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)
    print("Saving final model to power_model_multistep498_508_lr_%s_90w.pkl" %
          (str(learning_rate)))
Example #27
def main():
    env = gym.make("Pendulum-v0")
    model = deepq.models.mlp([256,256])

    exp_name = 'half_up'

    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=350000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        exp_name=exp_name,
        callback=callback
    )
    print("Saving model to pendulum_model.pkl")
    act.save("pendulum_model_{}.pkl".format(exp_name))
Example #28
def main():
    env = gym.make("PongNoFrameskip-v3")
    env = ScaledFloatFrame(wrap_dqn(env))
    model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                                    hiddens=[256],
                                    dueling=True)
    act = deepq.learn(env,
                      q_func=model,
                      lr=1e-4,
                      max_timesteps=2000000,
                      buffer_size=10000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.01,
                      train_freq=4,
                      learning_starts=10000,
                      target_network_update_freq=1000,
                      gamma=0.99,
                      prioritized_replay=True)
    act.save("pong_model.pkl")
    env.close()
Example #29
def main():
    env = gym.make('State-Based-MDP-Navigation-2d-Map0-Goal0-KnownGoalPosition-v0')
    #env = gym.make('Image-Based-Navigation-2d-Map0-Goal0-v0')
    env.action_space = spaces.Discrete(100)
    model = deepq.models.mlp([64])

    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=1,
        callback=callback
    )

    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
Example #30
def main():
    # create the environment
    env = gym.make("balancebot-v0")  # <-- this we need to create

    # create the learning agent
    model = deepq.models.mlp([16, 16])

    # train the agent on the environment
    act = deepq.learn(env,
                      q_func=model,
                      lr=1e-3,
                      max_timesteps=200000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)

    # save trained model
    act.save("balance.pkl")
Example #31
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env',
                        help='environment ID',
                        default='snake-single-v0')
    parser.add_argument('--seed', help='Random seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default='./single-dqn/')

    args = parser.parse_args()

    # make_session first argument : num of cpus
    with U.make_session(8):
        env = gym.make(args.env)
        env = FrameStack(env, 4)
        print("observation space is ", env.observation_space)
        print("action space is ", env.action_space)
        model = deepq.models.cnn_to_mlp(convs=[(32, 8, 4), (64, 4, 2),
                                               (64, 3, 1)],
                                        hiddens=[512],
                                        dueling=bool(args.dueling))

        act = deepq.learn(env,
                          q_func=model,
                          lr=1e-4,
                          max_timesteps=10000000,
                          buffer_size=50000,
                          train_freq=4,
                          exploration_fraction=0.1,
                          exploration_final_eps=0.02,
                          gamma=0.99,
                          print_freq=10,
                          checkpoint_freq=args.checkpoint_freq,
                          checkpoint_path=args.checkpoint_path,
                          param_noise=True)
        act.save("../models/single-dqn/single_dqn_model_final.pkl")
Example #32
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env_id',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=0)
    #parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(args.seed)
    v_func = deepq.models.mlp(hiddens=[200, 200])
    l_func = deepq.models.mlp(hiddens=[200, 200])
    mu_func = deepq.models.mlp(hiddens=[200, 200])

    stddev = 0.3
    nb_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                sigma=float(stddev) *
                                                np.ones(nb_actions))

    act = deepq.learn(env,
                      models=[mu_func, v_func, l_func],
                      action_noise=action_noise,
                      lr=1e-3,
                      max_timesteps=args.num_timesteps,
                      buffer_size=100000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.01,
                      train_freq=4,
                      learning_starts=10000,
                      target_network_update_freq=1000,
                      gamma=0.99,
                      prioritized_replay=bool(args.prioritized))
    # act.save("pong_model.pkl") XXX
    env.close()
Example #33
def main():
    env = gym.make("imgreg_train-v3")
    data_path = 'data/train.h5'
    env.loadData(data_path)
    model = deepq.models.cnn_to_mlp([(16, 8, 4), (16, 4, 1), (32, 4, 2)], [256])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=5000000,
        checkpoint_freq=1000,
        buffer_size=50000,
        exploration_fraction=0.2,
        exploration_final_eps=0.02,
        print_freq=10,
        gamma=0.95,
        batch_size=64,
        load_model=None
    )
    print("Saving model")
    act.save("models/iter_6.1.pkl")
Example #34
    def play(self):
        config = self.config

        env = self.get_player()
        model = deepq.learn(
            env,
            config['MODEL']['TYPE'],
            **config['MODEL']['ARGS'],
            **config['LOAD_PATH'],
            dueling=config['DUELING'],
            total_timesteps=0
        )

        while True:
            obs, done = env.reset(), False
            episode_rew = 0
            while not done:
                env.render()
                obs, rew, done, _ = env.step(model(obs[None])[0])
                episode_rew += rew
            print("Episode reward", episode_rew)
Example #35
def main(env_name, seed, exp_name):
    data_dir = osp.join(
        osp.dirname(
            osp.dirname(
                osp.dirname(osp.dirname(osp.dirname(osp.abspath(__file__)))))),
        'spinup_data',
        datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S_") +
        exp_name)
    logger.configure(dir=data_dir)
    env = gym.make(env_name)
    act = deepq.learn(env,
                      network='mlp',
                      seed=seed,
                      lr=1e-3,
                      total_timesteps=100000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10)
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
Example #36
def main(args):
    """
    train and save the DeepQ model, for the cartpole problem

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make("CartPole-v0")
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        learning_rate=1e-3,
        max_timesteps=args.max_timesteps,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
Example #37
def main():
    env = gym.make("imgreg_train-v5")
    data_paths = ['data/train/1.h5', 'data/train/2.h5', 'data/train/3.h5', 'data/train/4.h5', 'data/train/5.h5']
    env.loadData(data_paths)
    model = deepq.models.cnn_to_mlp([(16, 8, 4), (32, 4, 2), (32, 3, 1)], [256])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        checkpoint_freq=1000,
        buffer_size=10000,
        exploration_fraction=0.3,
        exploration_final_eps=0.02,
        print_freq=10,
        gamma=0.95,
        batch_size=64,
        load_model=None
    )
    print("Saving model")
    act.save("models/2.1.pkl")
Example #38
def main():
    env = gym.make("AirSimCarEnv-v0")
    model = deepq.models.mlp([64], layer_norm=True)

    print("\n======= Training session starts for DQN Car =======")
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=1.0,  #0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        param_noise=True,
        checkpoint_freq=2,
        learning_starts=5,
        callback=callback)
    trainedModel = "car.pkl"
    print("\nSaving model to", trainedModel)
    act.save(trainedModel)
Example #39
    def learn(self):
        def callback(lcl, _glb):
            is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
            return is_solved

        self.logger.configure()

        config = self.config
        env = self.get_player(train=True)
        model = deepq.learn(
            env,
            config['MODEL']['TYPE'],
            **config['MODEL']['ARGS'],
            **config['LOAD_PATH'],
            dueling=config['DUELING'],
            lr=config['LEARNING_RATE'],
            total_timesteps=config['TOTAL_TIMESTEPS'],
            buffer_size=config['BUFFER_SIZE'],
            exploration_fraction=config['EXPLORATION_FRACTION'],
            exploration_final_eps=config['EXPLORATION_FINAL_EPS'],
            train_freq=config['TRAIN_FREQ'],
            learning_starts=config['NO_OP_STEPS'],
            target_network_update_freq=config['TARGET_UPDATE_FREQ'],
            gamma=config['GAMMA'],
            seed=config['SEED'],
            batch_size=config['BATCH_SIZE'],
            print_freq=config['PRINT_FREQ'],
            checkpoint_freq=config['CHECKPOINT_FREQ'],
            checkpoint_path=config['CHECKPOINT_PATH_PREFIX'],
            prioritized_replay=config['PRIORITIZED_REPLAY'],
            prioritized_replay_alpha=config['PRIORITIZED_REPLAY_ALPHA'],
            prioritized_replay_beta0=config['PRIORITIZED_REPLAY_BETA'],
            prioritized_replay_beta_iters=config['PRIORITIZED_REPLAY_BETA_ITERS'],
            prioritized_replay_eps=config['PRIORITIZED_REPLAY_EPS'],
            param_noise=config['PARAM_NOISE'],
            callback=callback
        )

        model.save(config['CHECKPOINT_PATH_PREFIX'] + config['ENV_NAME'] + '.pkl')
        env.close()
Example #40
def train():
    set_global_seeds(args.seed)
    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env,
                  datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        raise ValueError("The directory already exists: %s" % directory)
    json.dump(vars(args),
              open(os.path.join(directory, 'learning_prop.json'), 'w'))

    env = gym.make(args.env)

    with tf.device(args.device):
        model = deepq.models.mlp([64])
        act, records = deepq.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            print_freq=10,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            gamma=args.gamma,
            callback=None,  #callback,
            epoch_steps=args.nb_epoch_steps,
            gpu_memory=args.gpu_memory,
            directory=directory,
            double_q=args.double_q,
            nb_test_steps=args.nb_test_steps,
        )
        print("Saving model to model.pkl")
        act.save(os.path.join(directory, "model.pkl"))
    plot(records, directory)
Example #41
def main():
    parser = argsparser()
    args = parser.parse_args()
    logger.configure(dir=args.log_dir)
    
    env = gym.make(args.env_id)
    env.seed(args.seed)
    set_global_seeds(args.seed)
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=1000000,
        buffer_size=50000,
        exploration_fraction=0.01,
        exploration_final_eps=0.02,
        print_freq=10
        #callback=callback
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
Example #42
def main():
    # Load orderbook
    orderbook = Orderbook()
    orderbook.loadFromEvents('ob-1-small.tsv')


    env = gym.make("ctc-executioner-v0")
    env.configure(orderbook)
    model = deepq.models.cnn_to_mlp(convs=[(1, 10, 20)], hiddens=[200])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=100000,
        buffer_size=5000,
        exploration_fraction=0.1,
        exploration_final_eps=0.1,
        target_network_update_freq=1,
        print_freq=10,
    )
    print("Saving model as ctc-executioner-v0.pkl")
    act.save("ctc-executioner-v0.pkl")
Example #43
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(3 * 10e6))
    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    import time

    current_milli_time = lambda: int(round(time.time() * 1000))

    env = Env(64, 44)
    env = WarpFrame(env)
    env = ScaledFloatFrame(env)

    model = deepq.models.cnn_to_mlp(
        convs=[(16, 8, 4), (16, 4, 2), (32, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )
    act = deepq.learn(env,
                      q_func=model,
                      lr=5e-4,
                      max_timesteps=args.num_timesteps,
                      buffer_size=100000,
                      exploration_fraction=0.05,
                      exploration_final_eps=0.01,
                      train_freq=2,
                      learning_starts=10000,
                      target_network_update_freq=1000,
                      gamma=0.99,
                      print_freq=30,
                      checkpoint_freq=200000,
                      prioritized_replay=bool(args.prioritized))
    act.save("draw_model.pkl")
    env.close()
Example #44
def main():
    env = gym.make("PongNoFrameskip-v4")
    env = ScaledFloatFrame(wrap_dqn(env))
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True
    )
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=2000000,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=True
    )
    act.save("pong_model.pkl")
    env.close()
Example #45
env = gym.make("CartPole-v0")

# # set up the logger
# logdir = '/tmp/experiments/discrete/DQN/'
# logger.configure(os.path.abspath(logdir))
# print("logger.get_dir(): ", logger.get_dir() and os.path.join(logger.get_dir()))

# models = [[64], [64,64], [128,128], [256,256]]
models = [[64], [128], [64,64], [128,128], [256,256]]

for m in models:
    g = tf.Graph()
    with g.as_default():
        # tf.reset_default_graph()
        act = deepq.learn(
            env,
            q_func=deepq.models.mlp(m),
            lr=1e-3,
            max_timesteps=10000,
            buffer_size=50000,
            exploration_fraction=0.1,
            exploration_final_eps=0.02,
            print_freq=10,
            callback=callback,
            outdir="/tmp/experiments/discrete/DQN/"+str(m)
        )
        act.save("models/cartpole_model_DQN_"+str(m)+".pkl")

# print("Saving model to cartpole_model.pkl")
# act.save("cartpole_model.pkl")
Example #46
import gym
from baselines import deepq

env = gym.make("MountainCar-v0")
# env = gym.make("MountainCarContinuous-v0")
print(env.action_space.n)
# Enabling layer_norm here is important for parameter space noise!
model = deepq.models.mlp([64], layer_norm=True)
act = deepq.learn(
    env,
    q_func=model,
    lr=1e-3,
    max_timesteps=100000,
    buffer_size=50000,
    exploration_fraction=0.1,
    exploration_final_eps=0.1,
    print_freq=10,
    param_noise=False
)
print("Saving model to mountaincar_model.pkl")
act.save("mountaincar_model.pkl")