Example #1
0
def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    if params['env_name'] == 'GazeboWAMemptyEnv-v2':
        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            'compute_Q': True,
            'rollout_batch_size': 1,
            #'render': bool(render),
        }

        for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
            eval_params[name] = params[name]

        madeEnv = config.cached_make_env(params['make_env'])
        evaluator = RolloutWorker(madeEnv, params['make_env'], policy, dims,
                                  logger, **eval_params)
        evaluator.seed(seed)
    else:
        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            'compute_Q': True,
            'rollout_batch_size': 1,
            'render': bool(render),
        }

        for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
            eval_params[name] = params[name]

        evaluator = RolloutWorkerOriginal(params['make_env'], policy, dims,
                                          logger, **eval_params)
        evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
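

# A CLI wrapper sketch in the style of the upstream baselines play.py entry
# point; the option defaults below are illustrative assumptions, not values
# taken from the original script.
import click


@click.command()
@click.argument('policy_file', type=str)
@click.option('--seed', type=int, default=0)
@click.option('--n_test_rollouts', type=int, default=10)
@click.option('--render', type=int, default=1)
def cli(policy_file, seed, n_test_rollouts, render):
    main(policy_file, seed, n_test_rollouts, render)


if __name__ == '__main__':
    # Hypothetical usage: python play.py /path/to/policy_best.pkl --n_test_rollouts 5
    cli()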
Example #2
0
def main(policy_file, seed, n_test_rollouts, render, level, dimo, env_name):
    set_global_seeds(seed)

    PGGD.DIMO = dimo
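    # The line above overrides PGGD's module-level observation dimensionality
    # before the pickled policy is restored (how DIMO is consumed inside PGGD is
    # an assumption here, inferred from the 'dimo' argument).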
    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    if env_name == '':
        env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': False,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutStudent(params['make_env'], policy, None, dims, logger,
                               **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    # Set the evaluator to the corresponding difficulty level
    for _ in range(level):
        evaluator.increase_difficulty()

    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts(render=True, test=True, exploit=True)

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
Example #3
0
def launch(env_name,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           override_params={},
           save_policies=True,
           render=False,
           max_test=True,
           expert_file="",
           policy_file="",
           level=0,
           curriculum=True,
           train_render=False):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
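    # e.g. with seed=0, ranks 0, 1 and 2 are seeded with 0, 1000000 and 2000000,
    # so each MPI worker gets a distinct seed (the concrete values are only an
    # illustration).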
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)

    if policy_file == "":
        policy = config.configure_pggd(dims=dims,
                                       params=params,
                                       clip_return=clip_return)
    else:
        # Load policy.
        with open(policy_file, 'rb') as f:
            policy = pickle.load(f)
        fn = config.configure_her(params)
        # print(fn)
        policy.set_sample_transitions(fn)
        # print(dir(policy))
        policy.set_obs_size(dims)

    if expert_file != "":
        with open(expert_file, 'rb') as f:
            expert = pickle.load(f)
    else:
        expert = None

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'beta_final': params['beta_final'],
        'annealing_coeff': params['annealing_coeff']
    }
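
    # rollout_params (above) drives exploration rollouts with stochastic actions
    # from the online network; eval_params (below) evaluates greedily
    # ('exploit': True) and, when test_with_polyak is set, acts with the
    # polyak-averaged target network. This reading follows the baselines HER
    # RolloutWorker semantics.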

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': False,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutStudent(params['make_env'], policy, expert, dims,
                                    logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    examiner = RolloutStudent(params['make_env'], policy, None, dims, logger,
                              **eval_params)
    examiner.seed(rank_seed)

    evaluator = RolloutStudent(params['make_env'], policy, None, dims, logger,
                               **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          examiner=examiner,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          render=render,
          level=level,
          curriculum=curriculum,
          max_test=max_test,
          train_render=train_render)
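

# Minimal entry-point sketch for this launcher; the environment id, log
# directory and hyperparameter values below are illustrative assumptions, not
# defaults taken from the original script.
if __name__ == '__main__':
    launch(env_name='FetchReach-v1',
           logdir='/tmp/pggd_logs',
           n_epochs=50,
           num_cpu=1,
           seed=0,
           replay_strategy='future',
           policy_save_interval=5,
           clip_return=True)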
Example #4
0
def learn(
        *,
        network,
        env,
        total_timesteps,  ### 4
        seed=None,
        eval_env=None,
        replay_strategy='future',
        policy_save_interval=5,
        clip_return=True,
        demo_file=None,
        override_params=None,
        load_path=None,
        save_path=None,
        **kwargs):

    print(
        "-------------------JW Debug learn func @ her.py with hrl baseline merge ----------------------"
    )
    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()
    else:
        # Fall back to single-process values when mpi4py is unavailable.
        rank = 0
        num_cpu = 1

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.specs[0].id
    params['env_name'] = env_name
    # print(env_name)

    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)  ### 5

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')  ### 6
        logger.warn()

    dims = config.configure_dims(params)
    # policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return, FLAGS=FLAGS, agent_params=agent_params)
    #===============================#
    FLAGS = parse_options()  ## Prepare params for HAC.

    FLAGS.layers = 2  # Enter number of levels in agent hierarchy

    FLAGS.time_scale = 10  # Enter max sequence length in which each policy will specialize

    # Enter max number of atomic actions.
    # This will typically be FLAGS.time_scale**(FLAGS.layers).
    # However, in the UR5 Reacher task, we use a shorter episode length.
    # max_actions = FLAGS.time_scale**(FLAGS.layers-1)*6
    max_actions = 1000
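    # With FLAGS.layers = 2 and FLAGS.time_scale = 10 as set above, the
    # time_scale**layers heuristic would give 100; the hard-coded 1000 allows
    # longer episodes.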

    timesteps_per_action = 15  # Provide the number of time steps per atomic action.

    agent_params = {}

    # Define the fraction of actions with which a subgoal level (i.e. level i > 0)
    # will test its subgoal actions.
    agent_params["subgoal_test_perc"] = 0.3

    # Define the subgoal penalty for missing a subgoal. Note that by default the
    # Q-value target for missed subgoals does not include the Q-value of the next
    # state (i.e., discount rate = 0), so the Q-value target for a missed subgoal
    # simply equals the penalty. For instance, in the 3-level UR5 implementation,
    # if a level proposes a subgoal and misses it, the Q target for that action
    # would be -10. To incorporate the next state in the penalty, go to the
    # "penalize_subgoal" method in the "layer.py" file.
    agent_params["subgoal_penalty"] = -FLAGS.time_scale

    # Define the exploration noise added to both subgoal actions and atomic
    # actions. The noise added is Gaussian: N(0, noise_percentage * action_dim_range).
    agent_params["atomic_noise"] = [0.1 for i in range(3)]
    agent_params["subgoal_noise"] = [0.03 for i in range(6)]

    # Define number of episodes of transitions to be stored by each level of the hierarchy
    agent_params["episodes_to_store"] = 500

    # Provide training schedule for agent.
    # Training by default will alternate between exploration and testing.
    # Hyperparameter below indicates number of exploration episodes.
    # Testing occurs for 100 episodes.  To change the number of testing episodes, go to "run_HAC.py".
    agent_params["num_exploration_episodes"] = 50
    # policy = config.configure_ddpg(params, FLAGS, dims, reuse, use_mpi, clip_return)  # how should this be handled?

    # def configure_ddpg(dims, params, FLAGS, agent_params, reuse=False, use_mpi=True, clip_return=True):

    # policy = []  ## build one policy per hierarchy layer
    # for i in range(0, FLAGS.layers):
    #     print("!!!!!!!!!!!!!!!!!!!!!!!!!! i={}".format(i))
    #     policy[i] = config.configure_ddpg(dims=dims, params=params, FLAGS=FLAGS, agent_params=agent_params, reuse=False, use_mpi=True, clip_return=True)  # how should this be handled?
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   FLAGS=FLAGS,
                                   agent_params=agent_params,
                                   reuse=False,
                                   use_mpi=True,
                                   clip_return=True)  # how should this be handled?
    # originally: dims, params, reuse=False, use_mpi=True, clip_return=True

    # agent = design_agent_and_env(FLAGS, env, dims=dims, params=params, clip_return=clip_return) ## make agent(TD3) for HAC.
    # policy = design_agent_and_env(FLAGS, env, dims=dims, params=params, clip_return=clip_return) ## make agent(TD3) for HAC.
    #===============================#
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        ############hrl################

        ###############################
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in [
            'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
    ]:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env
    print("================")
    print(FLAGS)
    print(type(FLAGS))
    print("================")
    ## Done with prepare
    # run_HAC(FLAGS, agent)
    # agent = design_agent_and_env(FLAGS, env, dims=dims, params=params, clip_return=clip_return) ## the original call
    agent = design_agent_and_env(FLAGS=FLAGS,
                                 env=env,
                                 policy=policy,
                                 dims=dims,
                                 logger=logger,
                                 rollout_params=rollout_params,
                                 eval_params=eval_params,
                                 agent_params=agent_params,
                                 monitor=True)
    # agent = design_agent_and_env(FLAGS, env, dims, policy, rollout_params, eval_params, agent_params, monitor=True)
    '''
    FLAGS : Namespace(layers=2, time_scale=10), <class 'argparse.Namespace'>
    env : <baselines.common.vec_env.dummy_vec_env.DummyVecEnv object at 0x1c22644ef0>, <class 'baselines.common.vec_env.dummy_vec_env.DummyVecEnv'>
    dims : {'o': 10, 'u': 4, 'g': 3, 'info_is_success': 1}, dict
    policy : <baselines.her.ddpg.DDPG object at 0x1c2a83de48>, <class 'baselines.her.ddpg.DDPG'>
    logger : <module 'baselines.logger' from '/Users/ryujiwon/rl-robotarm-final/baselines/baselines/logger.py'>, <class 'module'>
    '''
    # rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    # ##
    # # rollout_worker_high = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    # ##
    # evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)  ## what is this for?

    n_cycles = params['n_cycles']
    n_epochs = (total_timesteps // n_cycles
                // agent.layers[0].rollout_worker.T
                // agent.layers[0].rollout_worker.rollout_batch_size)
    # n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size
    # print("#######################################n_epoch = {}".format(n_epochs)) ### 7

    # return train(
    #     save_path=save_path,
    #     env_name=env_name, #jw
    #     agent=agent, #jw
    #     policy=policy, rollout_worker=rollout_worker,
    #     evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
    #     n_cycles=params['n_cycles'], n_batches=params['n_batches'],
    #     policy_save_interval=policy_save_interval, demo_file=demo_file, FLAGS=FLAGS)
    '''
    def HAC_train(*, env, agent, policy,
          n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval,
          save_path, demo_file, FLAGS, **kwargs):
    '''
    print(
        "@ her.py learn, env={}, agent={}, policy={}, n_epochs={}, n_test_rollouts={}, n_cycles={}, n_batches={}, policy_save_interval={}, save_path={}, demo_file={}, FLAGS={}"
        .format(
            env,
            agent,  ## A.R
            policy,
            n_epochs,
            params['n_test_rollouts'],
            params['n_cycles'],
            params['n_batches'],
            policy_save_interval,
            save_path,
            demo_file,
            FLAGS))

    return HAC_train(
        # env_name=env_name, #jw
        env=env,  ## A.R
        agent=agent,  ## A.R
        policy=policy,
        n_epochs=n_epochs,
        n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'],
        n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval,
        save_path=save_path,
        demo_file=demo_file,
        FLAGS=FLAGS)
Example #5
0
def launch(env,
           logdir,
           n_epochs,
           num_cpu,
           seed,
           replay_strategy,
           policy_save_interval,
           clip_return,
           bc_loss,
           q_filter,
           num_demo,
           override_params={},
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)
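    # Raise this process's open-file limit (soft and hard both requested at 65536).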
    resource.setrlimit(resource.RLIMIT_NOFILE, (65536, 65536))

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the '
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) '
            'were obtained with --num_cpu 19. This makes a significant difference and if you '
            'are looking to reproduce those results, be aware of this. Please also refer to '
            'https://github.com/openai/baselines/issues/314 for further details.'
        )
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims,
                                   params=params,
                                   clip_return=clip_return,
                                   bc_loss=bc_loss,
                                   q_filter=q_filter,
                                   num_demo=num_demo)

    if params['env_name'] == 'GazeboWAMemptyEnv-v2':

        demoFileName = '/home/rjangir/wamObjectDemoData/data_wam_double_random_100_40_25.npz'
        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'T': params['T'],
            #'render': 1,
        }

        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            #'use_demo_states': False,
            'compute_Q': True,
            'T': params['T'],
            'rollout_batch_size': 1,
            #'render': 1,
        }

        for name in [
                'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
        ]:
            rollout_params[name] = params[name]
            eval_params[name] = params[name]

        madeEnv = config.cached_make_env(params['make_env'])
        rollout_worker = RolloutWorker(madeEnv, params['make_env'], policy,
                                       dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

        evaluator = RolloutWorker(madeEnv, params['make_env'], policy, dims,
                                  logger, **eval_params)
        evaluator.seed(rank_seed)
    else:

        demoFileName = '/home/rjangir/fetchDemoData/data_fetch_random_100.npz'
        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'T': params['T'],
            #'render': 1,
        }

        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            #'use_demo_states': False,
            'compute_Q': True,
            'T': params['T'],
            #'render': 1,
        }

        for name in [
                'T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps'
        ]:
            rollout_params[name] = params[name]
            eval_params[name] = params[name]

        rollout_worker = RolloutWorkerOriginal(params['make_env'], policy,
                                               dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

        evaluator = RolloutWorkerOriginal(params['make_env'], policy, dims,
                                          logger, **eval_params)
        evaluator.seed(rank_seed)

    train(logdir=logdir,
          policy=policy,
          rollout_worker=rollout_worker,
          evaluator=evaluator,
          n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'],
          n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval,
          save_policies=save_policies,
          demo_file_name=demoFileName)
Example #6
0
def launch(
    env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return, demo_file,
    override_params={}, save_policies=True
):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)
    resource.setrlimit(resource.RLIMIT_NOFILE, (65536, 65536))

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    if params['env_name'] == 'FetchPickAndPlace-v0':
        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'T': params['T'],
            'render': 1,
        }

        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            #'use_demo_states': False,
            'compute_Q': True,
            'T': params['T'],
            'rollout_batch_size': 1,
            'render': 1,
        }

        for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
            rollout_params[name] = params[name]
            eval_params[name] = params[name]

        madeEnv = config.cached_make_env(params['make_env'])
        rollout_worker = RolloutWorker(madeEnv, params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

        evaluator = RolloutWorker(madeEnv, params['make_env'], policy, dims, logger, **eval_params)
        evaluator.seed(rank_seed)
    else:
        rollout_params = {
            'exploit': False,
            'use_target_net': False,
            'use_demo_states': True,
            'compute_Q': False,
            'T': params['T'],
            'render': 1,
        }

        eval_params = {
            'exploit': True,
            'use_target_net': params['test_with_polyak'],
            #'use_demo_states': False,
            'compute_Q': True,
            'T': params['T'],
            'render': 1,
        }

        for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
            rollout_params[name] = params[name]
            eval_params[name] = params[name]

        rollout_worker = RolloutWorkerOriginal(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

        evaluator = RolloutWorkerOriginal(params['make_env'], policy, dims, logger, **eval_params)
        evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies, demo_file=demo_file)