Example #1
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--predictor', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="parallel_trpo", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=10000, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('-x', '--human_labels', default=1000, type=int)
    args = parser.parse_args()

    print("Setting things up...")

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)

    env = make_with_torque_removed(env_id)

    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        # Default to a quarter of the total label budget; guard against n_labels being None.
        pretrain_labels = args.pretrain_labels if args.pretrain_labels else (args.n_labels // 4 if args.n_labels else 0)

        if args.n_labels:
            label_schedule = LabelAnnealer(agent_logger,
                                           final_timesteps=num_timesteps,
                                           final_labels=args.n_labels,
                                           pretrain_labels=pretrain_labels)
        else:
            print(
                "No label limit given. We will request one label every few seconds."
            )
            label_schedule = ConstantLabelSchedule(
                pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector(
                run_name, args.human_labels)

        elif args.predictor == "human":
            bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
            assert bucket and bucket.startswith("gs://"), \
                "env variable RL_TEACHER_GCS_BUCKET must be set and start with gs://"
            comparison_collector = HumanComparisonCollector(
                env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        print(
            "Starting random rollouts to generate pretraining segments. No learning will take place..."
        )
        pretrain_segments = segments_from_rand_rollout(
            env_id,
            make_with_torque_removed,
            n_desired_segments=pretrain_labels * 2,
            clip_length_in_seconds=CLIP_LENGTH,
            workers=args.workers)
        # Turn our random segments into comparisons
        for i in range(pretrain_labels):
            comparison_collector.add_segment_pair(
                pretrain_segments[i], pretrain_segments[i + pretrain_labels])

        # Sleep until the human has labeled most of the pretraining comparisons
        while len(comparison_collector.labeled_comparisons) < int(
                pretrain_labels * 0.75):
            comparison_collector.label_unlabeled_comparisons()
            if args.predictor == "synth":
                print("%s synthetic labels generated... " %
                      (len(comparison_collector.labeled_comparisons)))
            elif args.predictor == "human":
                print(
                    "%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... "
                    % (len(comparison_collector.labeled_comparisons),
                       pretrain_labels))
                sleep(5)

        # Start the actual training
        for i in range(args.pretrain_iters):
            predictor.train_predictor()  # Train on pretraining labels
            if i % 100 == 0:
                print("%s/%s predictor pretraining iters... " %
                      (i, args.pretrain_iters))

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor,
                                         env,
                                         save_dir=osp.join(
                                             '/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":

        def make_env():
            return make_with_torque_removed(env_id)

        train_pposgd_mpi(make_env,
                         num_timesteps=num_timesteps,
                         seed=args.seed,
                         predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" %
                         args.agent)
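
Example #1 above wires the predictor to one of two label schedules: LabelAnnealer spreads a fixed label budget over the whole run, while ConstantLabelSchedule keeps requesting labels at a steady rate. Below is a minimal sketch of the annealing idea, assuming a simple linear ramp; the class and method names are illustrative, not rl-teacher's actual API.

# Sketch of a linear label-annealing schedule, loosely modeled on the
# LabelAnnealer used above. Names here are illustrative only.
class LinearLabelSchedule:
    def __init__(self, final_timesteps, final_labels, pretrain_labels):
        self.final_timesteps = final_timesteps
        self.final_labels = final_labels
        self.pretrain_labels = pretrain_labels

    def n_desired_labels(self, timesteps_elapsed):
        # Start from the pretraining budget and grow linearly toward
        # the total budget as training progresses.
        frac = min(timesteps_elapsed / self.final_timesteps, 1.0)
        return int(self.pretrain_labels + frac * (self.final_labels - self.pretrain_labels))

schedule = LinearLabelSchedule(final_timesteps=5_000_000, final_labels=1000, pretrain_labels=250)
print([schedule.n_desired_labels(t) for t in (0, 2_500_000, 5_000_000)])  # [250, 625, 1000]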
Example #2
def main():
    # Tensorflow is not fork-safe, so we must use spawn instead
    # https://github.com/tensorflow/tensorflow/issues/5448#issuecomment-258934405
    multiprocessing.set_start_method('spawn')
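    # With 'spawn', child processes re-import the entry module, so the script
    # that invokes main() must be guarded by `if __name__ == "__main__":`.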

    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', required=True)
    parser.add_argument('-p', '--reward_model', required=True)
    parser.add_argument('-n', '--name', required=True)
    parser.add_argument('-s', '--seed', default=1, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=None, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="ga3c", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=5000, type=int)
    parser.add_argument('-b', '--starting_beta', default=0.1, type=float)
    parser.add_argument('-c', '--clip_length', default=1.5, type=float)
    parser.add_argument('-f', '--stacked_frames', default=4, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('--force_new_environment_clips', action="store_true")
    parser.add_argument('--force_new_training_labels', action="store_true")
    parser.add_argument('--force_new_reward_model', action="store_true")
    parser.add_argument('--force_new_agent_model', action="store_true")
    args = parser.parse_args()

    env_id = args.env_id
    experiment_name = slugify(args.name)

    # Potentially erase old data
    if args.force_new_environment_clips:
        existing_clips = [x for x in os.listdir('clips') if x.startswith(env_id)]
        if len(existing_clips):
            print("Found {} old clips".format(len(existing_clips)))
            print("Are you sure you want to erase them and start fresh?")
            print("Warning: This will invalidate all training labels made from these clips!")
            if input("> ").lower().startswith('y'):
                for clip in existing_clips:
                    os.remove(os.path.join('clips', clip))
                from human_feedback_api import Clip
                Clip.objects.filter(environment_id=env_id).delete()
                # Also erase all label data for this experiment
                from human_feedback_api import SortTree
                SortTree.objects.filter(experiment_name=experiment_name).delete()
                from human_feedback_api import Comparison
                Comparison.objects.filter(experiment_name=experiment_name).delete()
            else:
                print("Quitting...")
                return

    if args.force_new_training_labels:
        from human_feedback_api import SortTree
        from human_feedback_api import Comparison
        all_tree_nodes = SortTree.objects.filter(experiment_name=experiment_name)
        if all_tree_nodes:
            print("Found a sorting tree with {} nodes".format(len(all_tree_nodes)))
            print("Are you sure you want to erase all the comparison data associated with this tree?")
            if input("> ").lower().startswith('y'):
                all_tree_nodes.delete()
                Comparison.objects.filter(experiment_name=experiment_name).delete()
            else:
                print("Quitting...")
                return

    print("Setting things up...")
    run_name = "%s/%s-%s" % (env_id, experiment_name, int(time()))
    env = make_env(env_id)
    n_pretrain_labels = args.pretrain_labels if args.pretrain_labels else (args.n_labels // 4 if args.n_labels else 0)

    episode_logger = EpisodeLogger(run_name)
    schedule = make_label_schedule(n_pretrain_labels, args.n_labels, args.num_timesteps, episode_logger)

    os.makedirs('checkpoints/reward_model', exist_ok=True)
    os.makedirs('clips', exist_ok=True)

    # Make reward model
    if args.reward_model == "rl":
        reward_model = OriginalEnvironmentReward(episode_logger)
        args.pretrain_iters = 0  # Don't bother pre-training a traditional RL agent
    else:
        reward_model = OrdinalRewardModel(
            args.reward_model, env, env_id, make_env, experiment_name, episode_logger, schedule,
            n_pretrain_labels, args.clip_length, args.stacked_frames, args.workers)

    if not args.force_new_reward_model:
        reward_model.try_to_load_model_from_checkpoint()

    reward_model.train(args.pretrain_iters, report_frequency=25)
    reward_model.save_model_checkpoint()

    # Wrap the reward model to capture videos every so often:
    if not args.no_videos:
        video_path = os.path.join('/tmp/rl_teacher_vids', run_name)
        checkpoint_interval = 20 if args.agent == "ga3c" else 200
        reward_model = SegmentVideoRecorder(reward_model, env, save_dir=video_path, checkpoint_interval=checkpoint_interval)

    print("Starting joint training of reward model and agent")
    if args.agent == "ga3c":
        from ga3c.Server import Server as Ga3cServer
        from ga3c.Config import Config as Ga3cConfig
        Ga3cConfig.ATARI_GAME = env_id
        Ga3cConfig.MAKE_ENV_FUNCTION = make_env
        Ga3cConfig.NETWORK_NAME = experiment_name
        Ga3cConfig.SAVE_FREQUENCY = 200
        Ga3cConfig.TENSORBOARD = True
        Ga3cConfig.LOG_WRITER = episode_logger
        Ga3cConfig.AGENTS = args.workers
        Ga3cConfig.LOAD_CHECKPOINT = not args.force_new_agent_model
        Ga3cConfig.STACKED_FRAMES = args.stacked_frames
        Ga3cConfig.BETA_START = args.starting_beta
        Ga3cConfig.BETA_END = args.starting_beta * 0.1
        Ga3cServer(reward_model).main()
    elif args.agent == "parallel_trpo":
        from parallel_trpo.train import train_parallel_trpo
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_env,
            stacked_frames=args.stacked_frames,
            predictor=reward_model,
            summary_writer=episode_logger,
            workers=args.workers,
            runtime=(args.num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":
        from pposgd_mpi.run_mujoco import train_pposgd_mpi
        train_pposgd_mpi(lambda: make_env(env_id), num_timesteps=args.num_timesteps, seed=args.seed, predictor=reward_model)
    elif args.agent == "ppo_atari":
        from pposgd_mpi.run_atari import train_atari
        # TODO: Add Multi-CPU support!
        train_atari(env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=1, predictor=reward_model)
    else:
        raise ValueError("%s is not a valid choice for args.agent" % args.agent)
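
The two --force_new_* branches in Example #2 repeat the same confirm-then-erase prompt. A tiny helper like the one below (hypothetical, not part of the repo) would factor out that pattern.

def confirm(*warning_lines):
    # Print the warning, then treat any answer starting with 'y'
    # (case-insensitive) as consent.
    for line in warning_lines:
        print(line)
    return input("> ").lower().startswith('y')

# Usage, mirroring the --force_new_training_labels branch:
# if all_tree_nodes and confirm(
#         "Found a sorting tree with {} nodes".format(len(all_tree_nodes)),
#         "Are you sure you want to erase all the comparison data associated with this tree?"):
#     all_tree_nodes.delete()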
Example #3
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--env_id', default="ShortHopper-v1", type=str)
    parser.add_argument('-p', '--predictor', default="human", type=str)
    parser.add_argument('-n', '--name', default="human-175-hopper", type=str)
    parser.add_argument('-s', '--seed', default=6, type=int)
    parser.add_argument('-w', '--workers', default=4, type=int)
    parser.add_argument('-l', '--n_labels', default=None, type=int)
    parser.add_argument('-L', '--pretrain_labels', default=20, type=int)
    parser.add_argument('-t', '--num_timesteps', default=5e6, type=int)
    parser.add_argument('-a', '--agent', default="pposgd_mpi", type=str)
    parser.add_argument('-i', '--pretrain_iters', default=1, type=int)
    parser.add_argument('-V', '--no_videos', action="store_true")
    parser.add_argument('--log_path',
                        help='Directory to save learning curve data.',
                        default='tmp/openaiTest',
                        type=str)
    args = parser.parse_args()

    print("Setting things up...")

    env_id = args.env_id
    run_name = "%s/%s-%s" % (env_id, args.name, int(time()))
    summary_writer = make_summary_writer(run_name)

    env = make_with_torque_removed(env_id)

    num_timesteps = int(args.num_timesteps)
    experiment_name = slugify(args.name)

    if args.predictor == "rl":
        predictor = TraditionalRLRewardPredictor(summary_writer)
    else:
        agent_logger = AgentLogger(summary_writer)

        # Default to a quarter of the total label budget; guard against n_labels being None.
        pretrain_labels = args.pretrain_labels if args.pretrain_labels else (args.n_labels // 4 if args.n_labels else 0)
        # Choose between an annealed schedule (fixed label budget) and a constant one (open-ended online labeling)
        if args.n_labels:
            label_schedule = LabelAnnealer(agent_logger,
                                           final_timesteps=num_timesteps,
                                           final_labels=args.n_labels,
                                           pretrain_labels=pretrain_labels)
        else:
            print(
                "No label limit given. We will request one label every few seconds."
            )
            label_schedule = ConstantLabelSchedule(
                pretrain_labels=pretrain_labels)

        if args.predictor == "synth":
            comparison_collector = SyntheticComparisonCollector()

        elif args.predictor == "human":
            bucket = os.environ.get('RL_TEACHER_GCS_BUCKET')
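            # NOTE: the hardcoded bucket below overrides the env var above,
            # and the gs:// sanity check is left commented out in this example.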
            bucket = "gs://rl-teacher-preference"
            #assert bucket and bucket.startswith("gs://"), "env variable RL_TEACHER_GCS_BUCKET must start with gs://"
            comparison_collector = HumanComparisonCollector(
                env_id, experiment_name=experiment_name)
        else:
            raise ValueError("Bad value for --predictor: %s" % args.predictor)

        predictor = ComparisonRewardPredictor(
            env,
            summary_writer,
            comparison_collector=comparison_collector,
            agent_logger=agent_logger,
            label_schedule=label_schedule,
        )

        # print("Starting random rollouts to generate pretraining segments. No learning will take place...")
        # pretrain_segments = segments_from_rand_rollout(
        #     env_id, make_with_torque_removed, n_desired_segments=pretrain_labels * 2,
        #     clip_length_in_seconds=CLIP_LENGTH, workers=args.workers)
        # for i in range(pretrain_labels):  # Turn our random segments into comparisons
        #     comparison_collector.add_segment_pair(pretrain_segments[i], pretrain_segments[i + pretrain_labels])
        #
        # # Sleep until the human has labeled most of the pretraining comparisons
        # while len(comparison_collector.labeled_comparisons) < int(pretrain_labels * 0.75):
        #     comparison_collector.label_unlabeled_comparisons()
        #     if args.predictor == "synth":
        #         print("%s synthetic labels generated... " % (len(comparison_collector.labeled_comparisons)))
        #     elif args.predictor == "human":
        #         print("%s/%s comparisons labeled. Please add labels w/ the human-feedback-api. Sleeping... " % (
        #             len(comparison_collector.labeled_comparisons), pretrain_labels))
        #         sleep(5)
        #
        # # Start the actual training
        #
        # for i in range(args.pretrain_iters):
        #     predictor.train_predictor()  # Train on pretraining labels
        #     if i % 10 == 0:
        #         print("%s/%s predictor pretraining iters... " % (i, args.pretrain_iters))
        #saver = tf.train.Saver(max_to_keep=5)
        #save_path = saver.save(sess, "/tmp/GAN/GAN_preference_based_model.ckpt")
        #print("Model saved in path: %s" % save_path)

    # Wrap the predictor to capture videos every so often:
    if not args.no_videos:
        predictor = SegmentVideoRecorder(predictor,
                                         env,
                                         save_dir=osp.join(
                                             '/tmp/rl_teacher_vids', run_name))

    # We use a vanilla agent from openai/baselines that contains a single change that blinds it to the true reward
    # The single changed section is in `rl_teacher/agent/trpo/core.py`
    print("Starting joint training of predictor and agent")
    if args.agent == "parallel_trpo":
        train_parallel_trpo(
            env_id=env_id,
            make_env=make_with_torque_removed,
            predictor=predictor,
            summary_writer=summary_writer,
            workers=args.workers,
            runtime=(num_timesteps / 1000),
            max_timesteps_per_episode=get_timesteps_per_episode(env),
            timesteps_per_batch=8000,
            max_kl=0.001,
            seed=args.seed,
        )
    elif args.agent == "pposgd_mpi":

        def make_env():
            return make_with_torque_removed(env_id)

        try:
            from mpi4py import MPI
        except ImportError:
            MPI = None

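        # `logger` below is assumed to be baselines' logging module,
        # imported at module level in the original file.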
        def configure_logger(log_path, **kwargs):
            if log_path is not None:
                logger.configure(log_path)
            else:
                logger.configure(**kwargs)

        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            rank = 0
            configure_logger(args.log_path)
        else:
            rank = MPI.COMM_WORLD.Get_rank()
            configure_logger(args.log_path, format_strs=[])

        train_pposgd_mpi(make_env,
                         num_timesteps=num_timesteps,
                         seed=args.seed,
                         predictor=predictor)
    else:
        raise ValueError("%s is not a valid choice for args.agent" %
                         args.agent)
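
The pposgd_mpi branch of Example #3 configures a full logger only on MPI rank 0 and passes format_strs=[] to silence the worker ranks. The same rank-0-only pattern in isolation, as a sketch with print standing in for baselines' logger:

# Rank-0-only logging: the root process reports, workers stay silent.
try:
    from mpi4py import MPI
    rank = MPI.COMM_WORLD.Get_rank()
except ImportError:
    rank = 0  # no MPI installed: behave like the single-process root

def log(msg):
    if rank == 0:
        print(msg)

log("only rank 0 prints this")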