Example #1
File: run.py  Project: greedforgood/Python
def mp_evaluator(args, comm_eva, agent_id=0):
    args.init_before_training(if_main=False)

    if True:
        '''basic arguments'''
        cwd = args.cwd
        agent = args.agent

        env = args.env
        state_dim = env.state_dim
        action_dim = env.action_dim
        # if_discrete = env.if_discrete

        '''training arguments'''
        net_dim = args.net_dim
        # max_memo = args.max_memo
        break_step = args.break_step
        # batch_size = args.batch_size
        # target_step = args.target_step
        # repeat_times = args.repeat_times
        learning_rate = args.learning_rate
        if_break_early = args.if_allow_break

        # gamma = args.gamma
        # reward_scale = args.reward_scale
        if_per_or_gae = args.if_per_or_gae
        # soft_update_tau = args.soft_update_tau

        '''evaluating arguments'''
        show_gap = args.eval_gap
        eval_env = args.eval_env
        eval_times1 = args.eval_times1
        eval_times2 = args.eval_times2
        del args

    '''init: Agent'''
    agent.init(net_dim, state_dim, action_dim, learning_rate, if_per_or_gae, gpu_id=-1)
    agent.save_or_load_agent(cwd, if_save=False)

    act_cpu = agent.act.to(torch.device("cpu"))
    act_cpu.eval()
    for param in act_cpu.parameters():
        param.requires_grad = False  # the evaluation copy never needs gradients
    del agent

    '''init Evaluator'''
    eval_env = deepcopy_or_rebuild_env(env) if eval_env is None else eval_env
    evaluator = Evaluator(cwd=cwd, agent_id=agent_id, device=torch.device("cpu"), env=eval_env,
                          eval_times1=eval_times1, eval_times2=eval_times2, eval_gap=show_gap)  # build Evaluator
    evaluator.save_or_load_recoder(if_save=False)

    if_train = True
    with torch.no_grad():
        while if_train:
            if_train = comm_eva.evaluate_and_save0(act_cpu, evaluator, if_break_early, break_step, cwd)

    print(f'| UsedTime: {time.time() - evaluator.start_time:.0f} | SavedDir: {cwd}')
    evaluator.save_or_load_recoder(if_save=True)
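
The CPU hand-off in this evaluator (move the actor to the CPU, switch it to eval mode, freeze its parameters, and run it under torch.no_grad) is a generic PyTorch pattern. Below is a minimal self-contained sketch with a toy network; ToyActor and its sizes are hypothetical stand-ins, not part of the project:

import torch
import torch.nn as nn


class ToyActor(nn.Module):  # hypothetical stand-in for agent.act
    def __init__(self, state_dim=4, action_dim=2):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, 32), nn.ReLU(),
                                 nn.Linear(32, action_dim))

    def forward(self, state):
        return self.net(state).tanh()


act_cpu = ToyActor().to(torch.device("cpu"))  # evaluation copy lives on the CPU
act_cpu.eval()  # inference mode for dropout/batch-norm layers
for param in act_cpu.parameters():
    param.requires_grad = False  # evaluation never back-propagates

with torch.no_grad():
    action = act_cpu(torch.zeros(1, 4))  # same call pattern the Evaluator uses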
Example #2
File: run.py  Project: greedforgood/Python
def train_and_evaluate(args):
    args.init_before_training()
    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id
    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    learning_rate = args.learning_rate
    if_break_early = args.if_allow_break

    gamma = args.gamma
    reward_scale = args.reward_scale
    if_per_or_gae = args.if_per_or_gae
    soft_update_tau = args.soft_update_tau
    '''evaluating arguments'''
    show_gap = args.eval_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    if_vec_env = getattr(env, 'env_num', 1) > 1
    env_eval = deepcopy_or_rebuild_env(
        env) if args.env_eval is None else args.env_eval
    del args  # In order to show these hyper-parameters clearly, I put them above.
    '''init: environment'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    env_eval = deepcopy(env) if env_eval is None else deepcopy(env_eval)
    '''init: Agent, ReplayBuffer, Evaluator'''
    agent.init(net_dim, state_dim, action_dim, learning_rate, if_per_or_gae)
    if_on_policy = getattr(agent, 'if_on_policy', False)

    buffer = tuple() if if_on_policy else ReplayBuffer(
        max_len=max_memo,
        state_dim=state_dim,
        action_dim=action_dim,
        if_use_per=if_per_or_gae)  # on-policy keeps a plain tuple; off-policy needs a real ReplayBuffer

    evaluator = Evaluator(cwd=cwd,
                          agent_id=gpu_id,
                          device=agent.device,
                          env=env_eval,
                          eval_times1=eval_times1,
                          eval_times2=eval_times2,
                          eval_gap=show_gap)  # build Evaluator
    '''prepare for training'''
    agent.state = env.reset()
    total_step = 0
    '''start training'''
    if_train = True
    while if_train:
        with torch.no_grad():
            if if_on_policy:
                buffer_tuple1 = agent.explore_env(env, target_step,
                                                  reward_scale, gamma)
                buffer_tuple2 = agent.prepare_buffer(buffer_tuple1)
                steps = buffer_tuple2[0].size(0)

                buffer = buffer_tuple2
            else:
                trajectory_list = agent.explore_env(env, target_step,
                                                    reward_scale, gamma)
                steps = len(trajectory_list)

                buffer.extend_buffer_from_list(trajectory_list)
        total_step += steps

        # assert if_on_policy and isinstance(buffer, tuple)
        # assert (not if_on_policy) and isinstance(buffer, ReplayBuffer)
        logging_tuple = agent.update_net(buffer, batch_size, repeat_times,
                                         soft_update_tau)

        with torch.no_grad():
            if_reach_goal = evaluator.evaluate_save(agent.act, steps,
                                                    logging_tuple)
            if_train = not ((if_break_early and if_reach_goal)
                            or total_step > break_step
                            or os.path.exists(f'{cwd}/stop'))

    print(
        f'| UsedTime: {time.time() - evaluator.start_time:.0f} | SavedDir: {cwd}'
    )
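
The exit condition of this loop combines three signals: the evaluator reporting that the goal reward was reached (honored only when if_allow_break is set), a hard step budget (break_step), and a sentinel file named stop inside the working directory, so training can be ended from the shell by creating that file. A minimal sketch of the same check in isolation; the variable values are hypothetical placeholders:

import os

if_break_early = True  # hypothetical value of args.if_allow_break
break_step = int(1e6)  # hypothetical hard budget on environment steps
cwd = './demo_run'     # hypothetical working directory


def keep_training(if_reach_goal, total_step):
    # same expression as the loop: stop on goal, on budget, or on the stop file
    return not ((if_break_early and if_reach_goal)
                or total_step > break_step
                or os.path.exists(f'{cwd}/stop'))


print(keep_training(if_reach_goal=False, total_step=12_345))  # True: keep training
print(keep_training(if_reach_goal=True, total_step=12_345))   # False: goal reached and early break allowed

Creating the empty stop file from another terminal therefore ends training gracefully after the current update instead of killing the process.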
Example #3
File: run.py  Project: greedforgood/Python
def mp_evaluator(args, pipe_eva):
    args.init_before_training(process_id=-1)

    if True:
        '''arguments: basic'''
        cwd = args.cwd
        env = args.env
        agent = args.agent
        gpu_id = args.gpu_id
        # worker_num = args.worker_num
        '''arguments: train'''
        # net_dim = args.net_dim
        # max_memo = args.max_memo
        break_step = args.break_step
        # batch_size = args.batch_size
        # target_step = args.target_step
        # repeat_times = args.repeat_times
        # learning_rate = args.learning_rate
        if_break_early = args.if_allow_break

        # gamma = args.gamma
        # reward_scale = args.reward_scale
        # if_per_or_gae = args.if_per_or_gae
        # soft_update_tau = args.soft_update_tau
        '''arguments: evaluate'''
        show_gap = args.eval_gap
        eval_times1 = args.eval_times1
        eval_times2 = args.eval_times2
        env_eval = deepcopy_or_rebuild_env(
            env) if args.env_eval is None else args.env_eval
        '''arguments: environment'''
        # max_step = env.max_step
        # state_dim = env.state_dim
        # action_dim = env.action_dim
        # if_discrete = env.if_discrete
        del args  # In order to show these hyper-parameters clearly, I put them above.
    '''init: Evaluator'''
    evaluator = Evaluator(cwd=cwd,
                          agent_id=gpu_id,
                          device=agent.device,
                          env=env_eval,
                          eval_times1=eval_times1,
                          eval_times2=eval_times2,
                          eval_gap=show_gap)  # build Evaluator

    # pipe_eva[1].send((act_cpu, steps))
    act_cpu, steps = pipe_eva[0].recv()
    '''start training'''
    sum_step = steps
    if_train = True
    while if_train:
        # pipe_eva[1].send((act_state_dict, steps, logging_tuple))
        act_state_dict, steps, logging_tuple = pipe_eva[0].recv()

        sum_step += steps
        if act_state_dict is not None:
            act_cpu.load_state_dict(act_state_dict)

            if_reach_goal = evaluator.evaluate_save(act_cpu, sum_step,
                                                    logging_tuple)
            sum_step = 0

            if_train = not ((if_break_early and if_reach_goal)
                            or evaluator.total_step > break_step
                            or os.path.exists(f'{cwd}/stop'))

    print(f'| SavedDir: {cwd}\n'
          f'| UsedTime: {time.time() - evaluator.start_time:.0f}')
    pipe_eva[0].send(if_train)
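
The learner/evaluator split in this example is driven by a plain multiprocessing Pipe: the learner pushes (act_state_dict, steps, logging_tuple) through one end, and the evaluator loads the state dict into its own CPU copy of the actor and eventually sends if_train back. Below is a minimal runnable sketch of that hand-off with a toy linear actor; it performs a single exchange, whereas the real loop repeats it until training stops, and all names besides the Pipe indexing are hypothetical:

import torch.multiprocessing as mp
import torch.nn as nn


def evaluator_proc(conn):
    act_cpu = nn.Linear(4, 2)  # hypothetical stand-in for the evaluator's actor copy
    act_state_dict, steps, logging_tuple = conn.recv()
    if act_state_dict is not None:
        act_cpu.load_state_dict(act_state_dict)
    print(f'| evaluator received {steps} steps, logging_tuple={logging_tuple}')
    conn.send(False)  # tell the learner to stop, like pipe_eva[0].send(if_train)


if __name__ == '__main__':
    pipe_eva = mp.Pipe()  # two duplex ends; index 0 for the evaluator, index 1 for the learner
    proc = mp.Process(target=evaluator_proc, args=(pipe_eva[0],))
    proc.start()

    act = nn.Linear(4, 2)  # learner-side actor
    pipe_eva[1].send((act.state_dict(), 1024, (0.0, 0.0)))  # (act_state_dict, steps, logging_tuple)
    if_train = pipe_eva[1].recv()  # learner learns whether to keep training
    proc.join()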
Example #4
File: run.py  Project: greedforgood/Python
def train_and_evaluate(args, agent_id=0):
    args.init_before_training(if_main=True)

    if True:
        '''basic arguments'''
        cwd = args.cwd
        agent = args.agent

        env = args.env
        state_dim = env.state_dim
        action_dim = env.action_dim
        if_discrete = env.if_discrete

        '''training arguments'''
        net_dim = args.net_dim
        max_memo = args.max_memo
        break_step = args.break_step
        batch_size = args.batch_size
        target_step = args.target_step
        repeat_times = args.repeat_times
        learning_rate = args.learning_rate
        if_break_early = args.if_allow_break

        gamma = args.gamma
        reward_scale = args.reward_scale
        if_per_or_gae = args.if_per_or_gae
        soft_update_tau = args.soft_update_tau

        '''evaluating arguments'''
        show_gap = args.eval_gap
        eval_env = args.eval_env
        eval_times1 = args.eval_times1
        eval_times2 = args.eval_times2
        del args

    '''init: Agent'''
    agent.init(net_dim, state_dim, action_dim, learning_rate, if_per_or_gae)
    agent.save_or_load_agent(cwd, if_save=False)
    if_on_policy = agent.if_on_policy

    '''init Evaluator'''
    eval_env = deepcopy_or_rebuild_env(env) if eval_env is None else eval_env
    evaluator = Evaluator(cwd=cwd, agent_id=agent_id, device=agent.device, env=eval_env,
                          eval_times1=eval_times1, eval_times2=eval_times2, eval_gap=show_gap)  # build Evaluator
    evaluator.save_or_load_recoder(if_save=False)

    '''init ReplayBuffer'''
    if if_on_policy:
        buffer = list()

        def update_buffer(_trajectory_list):
            buffer[:] = agent.prepare_buffer(_trajectory_list)  # buffer = (state, action, r_sum, logprob, advantage)

            _steps = buffer[2].size(0)  # buffer[2] = r_sum
            _r_exp = buffer[2].mean().item()  # buffer[2] = r_sum
            return _steps, _r_exp

        assert isinstance(buffer, list)
    else:
        buffer = ReplayBuffer(state_dim=state_dim, action_dim=1 if if_discrete else action_dim,
                              max_len=max_memo, if_use_per=if_per_or_gae)
        buffer.save_or_load_history(cwd, if_save=False)

        def update_buffer(_trajectory_list):
            _state = torch.as_tensor([item[0] for item in _trajectory_list], dtype=torch.float32)
            _other = torch.as_tensor([item[1] for item in _trajectory_list], dtype=torch.float32)
            buffer.extend_buffer(_state, _other)

            _steps = _other.size()[0]
            _r_exp = _other[:, 0].mean().item()  # other = (reward, mask, ...)
            return _steps, _r_exp

        assert isinstance(buffer, ReplayBuffer)

    '''start training'''
    if if_on_policy:
        agent.state = env.reset()
    elif buffer.max_len != 0:  # if_off_policy
        agent.state = env.reset()
    else:  # if_off_policy
        with torch.no_grad():  # update replay buffer
            trajectory_list = explore_before_training(env, target_step, reward_scale, gamma)
            steps, r_exp = update_buffer(trajectory_list)
            agent.state = trajectory_list[-1][0]  # trajectory_list[-1][0] = (state, other)[0] = state

        agent.update_net(buffer, target_step, batch_size, repeat_times)

        if agent.if_use_act_target:
            agent.act_target.load_state_dict(agent.act.state_dict())
        if agent.if_use_cri_target:
            agent.cri_target.load_state_dict(agent.cri.state_dict())
        evaluator.total_step += steps

    if_train = True
    while if_train:
        with torch.no_grad():
            trajectory_list = agent.explore_env(env, target_step, reward_scale, gamma)
            steps, r_exp = update_buffer(trajectory_list)

        logging_tuple = agent.update_net(buffer, batch_size, repeat_times, soft_update_tau)

        with torch.no_grad():
            if_reach_goal = evaluator.evaluate_and_save(agent.act, steps, r_exp, logging_tuple)
            if_train = not ((if_break_early and if_reach_goal)
                            or evaluator.total_step > break_step
                            or os.path.exists(f'{cwd}/stop'))

    print(f'| UsedTime: {time.time() - evaluator.start_time:.0f} | SavedDir: {cwd}')

    agent.save_or_load_agent(cwd, if_save=True)
    if not if_on_policy:
        buffer.save_or_load_history(cwd, if_save=True)
    evaluator.save_or_load_recoder(if_save=True)
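
The off-policy update_buffer helper above stacks a trajectory of (state, other) pairs into two tensors, where, per the example's own comment, column 0 of other holds the scaled reward and column 1 the mask. That conversion is plain PyTorch; here is a toy sketch with made-up data (the 3-step trajectory and its dimensions are purely illustrative):

import torch

state_dim = 4
# each entry: (state, other) with other = (reward, mask, ...); the trailing columns are made-up actions
trajectory_list = [
    ([0.1] * state_dim, [1.0, 0.99, 0.3, -0.2]),
    ([0.2] * state_dim, [0.5, 0.99, 0.1, 0.4]),
    ([0.3] * state_dim, [2.0, 0.00, -0.7, 0.6]),  # mask 0.0 marks the episode end
]

_state = torch.as_tensor([item[0] for item in trajectory_list], dtype=torch.float32)
_other = torch.as_tensor([item[1] for item in trajectory_list], dtype=torch.float32)

steps = _other.size(0)              # number of transitions collected
r_exp = _other[:, 0].mean().item()  # mean scaled reward, read from column 0 of other
print(steps, round(r_exp, 3))       # 3 1.167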
Example #5
def train_and_evaluate(args):
    args.init_before_training()
    '''basic arguments'''
    cwd = args.cwd
    env = args.env
    agent = args.agent
    gpu_id = args.gpu_id
    '''training arguments'''
    net_dim = args.net_dim
    max_memo = args.max_memo
    break_step = args.break_step
    batch_size = args.batch_size
    target_step = args.target_step
    repeat_times = args.repeat_times
    learning_rate = args.learning_rate
    if_break_early = args.if_allow_break

    gamma = args.gamma
    reward_scale = args.reward_scale
    if_per_or_gae = args.if_per_or_gae
    soft_update_tau = args.soft_update_tau
    '''evaluating arguments'''
    show_gap = args.eval_gap
    eval_times1 = args.eval_times1
    eval_times2 = args.eval_times2
    if_vec_env = getattr(env, 'env_num', 1) > 1
    env_eval = deepcopy_or_rebuild_env(
        env) if args.env_eval is None else args.env_eval
    del args  # In order to show these hyper-parameters clearly, I put them above.
    '''init: environment'''
    state_dim = env.state_dim
    action_dim = env.action_dim
    if_discrete = env.if_discrete
    '''init: Agent, ReplayBuffer, Evaluator'''

    agent.init(net_dim, state_dim, action_dim, learning_rate, if_per_or_gae)
    if_on_policy = agent.if_on_policy
    '''init: ReplayBuffer'''
    agent.state = env.reset()
    buffer = ReplayBuffer(
        max_len=target_step if if_on_policy else max_memo,
        if_on_policy=if_on_policy,
        if_per_or_gae=if_per_or_gae,
        state_dim=state_dim,
        action_dim=action_dim,
        if_discrete=if_discrete,
    )
    if if_on_policy:
        steps = 0
    else:  # explore_before_training for off-policy
        with torch.no_grad():  # update replay buffer
            trajectory_list, state = explore_before_training(
                env, target_step, reward_scale, gamma)
        agent.state = state
        steps = len(trajectory_list)
        buffer.extend_buffer_from_list(trajectory_list)
        agent.update_net(buffer, target_step, batch_size,
                         repeat_times)  # pre-training and hard update

        # hard update for the first time
        if getattr(agent, 'act_target', None):
            agent.act_target.load_state_dict(agent.act.state_dict())
        if getattr(agent, 'cri_target', None):
            agent.cri_target.load_state_dict(agent.cri.state_dict())
    total_step = steps
    '''init: Evaluator'''
    evaluator = Evaluator(cwd=cwd,
                          agent_id=gpu_id,
                          device=agent.device,
                          env=env_eval,
                          eval_times1=eval_times1,
                          eval_times2=eval_times2,
                          eval_gap=show_gap)  # build Evaluator
    '''start training'''
    if_train = True
    if if_vec_env:
        while if_train:
            with torch.no_grad():
                buffer = agent.explore_envs(env, target_step, reward_scale,
                                            gamma)
            steps = buffer[0].size(0) * buffer[0].size(1)
            total_step += steps

            buffer = agent.prepare_buffers(buffer)
            logging_tuple = agent.update_net(buffer, batch_size, repeat_times,
                                             soft_update_tau)

            with torch.no_grad():  # speed up running
                if_reach_goal = evaluator.evaluate_save(
                    agent.act, steps, logging_tuple)
            if_train = not ((if_break_early and if_reach_goal)
                            or total_step > break_step
                            or os.path.exists(f'{cwd}/stop'))
    else:
        while if_train:
            with torch.no_grad():
                trajectory_list = agent.explore_env(env, target_step,
                                                    reward_scale, gamma)
            steps = len(trajectory_list)
            total_step += steps

            buffer.extend_buffer_from_list(trajectory_list)
            logging_tuple = agent.update_net(buffer, batch_size, repeat_times,
                                             soft_update_tau)

            with torch.no_grad():  # speed up running
                if_reach_goal = evaluator.evaluate_save(
                    agent.act, steps, logging_tuple)
            if_train = not ((if_break_early and if_reach_goal)
                            or total_step > break_step
                            or os.path.exists(f'{cwd}/stop'))
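
The off-policy branch above ends its pre-training exploration with a hard update, copying the online network's weights into its target copy via load_state_dict. In isolation that is a single call; a minimal sketch with toy critics (the names and sizes are hypothetical):

import torch
import torch.nn as nn

cri = nn.Linear(6, 1)         # toy stand-in for the online critic
cri_target = nn.Linear(6, 1)  # its target copy, randomly initialized

cri_target.load_state_dict(cri.state_dict())  # hard update: target <- online

x = torch.randn(1, 6)
assert torch.allclose(cri(x), cri_target(x))  # both copies now produce the same output

During the main loop, the soft_update_tau argument passed to update_net suggests the targets are instead nudged toward the online networks with a Polyak-style soft update rather than fully overwritten at every iteration.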