Example #1
File: workers.py Project: dasimagin/rita
def test_worker(args, shared_model, total_steps, optimizer):
    args.environment.clip_rewards = False
    env = make_env(args.environment)

    log_path = '{}/{}'.format(args.train.experiment_folder, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.info("STARTED TRAINING PROCESS {}".format(time.strftime("%Y.%m.%d_%H:%M", time.localtime())))

    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if (args.train.use_pixel_control or
            args.train.use_reward_prediction):
        model = ExperienceWrapper(model)
    if args.train.use_pixel_control:
        model = PixelControlWrapper(model, args.train.gamma, args.train.pc_coef)
    if args.train.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.train.rp_coef)
    if args.train.use_value_replay:
        model = ValueReplayWrapper(model)
    model.config = args
    model.eval()

    start_time = time.time()

    reward_history = []
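    # Evaluation loop: pull the latest shared weights, checkpoint periodically,
    # play one episode, log the running statistics, then wait a minute.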
    while True:
        model.load_state_dict(shared_model.state_dict())
        if (len(reward_history) + 1) % args.train.save_frequency == 0:
            save_progress(args, model, optimizer, total_steps.value)
        stats = play_game(model, env)
        reward_history.append(stats['total_reward'])

        log_message = (
            'Time {}, num steps {}, FPS {:.0f}, '
            'curr episode reward {:.2f}, mean episode reward: {:.2f}, '
            'mean policy loss {:.2f}, mean value loss {:.2f}, '
            'mean entropy percentage {:.2f}'
        ).format(
            time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
            total_steps.value,
            total_steps.value / (time.time() - start_time),
            stats['total_reward'],
            np.mean(reward_history[-60:]),
            stats['policy_loss'],
            stats['value_loss'],
            stats['entropy'],
        )
        if args.train.use_pixel_control:
            log_message += ', pixel control loss {:.2f}'.format(stats['pc_loss'])
        if args.train.use_reward_prediction:
            log_message += ', reward prediction loss {:.2f}'.format(stats['rp_loss'])
        if args.train.use_value_replay:
            log_message += ', value replay loss {:.2f}'.format(stats['vr_loss'])
        print(log_message)
        logging.info(log_message)
        time.sleep(60)
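
A hypothetical launch sketch (not part of the project code): a monitoring worker like test_worker is usually started in its own process with torch.multiprocessing, sharing the model weights and a step counter with the training processes. The helper name launch_test_worker and the exact argument plumbing are assumptions for illustration only.

import torch.multiprocessing as mp

def launch_test_worker(args, shared_model, optimizer):
    # Make the shared model's parameters visible to every process.
    shared_model.share_memory()
    # Shared integer counter that the training processes increment and
    # test_worker reads as total_steps.value.
    total_steps = mp.Value('i', 0)
    process = mp.Process(target=test_worker,
                         args=(args, shared_model, total_steps, optimizer))
    process.start()
    return process, total_steps
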
Example #2
            config.environment.episode_length_sec, 60)
        config.environment.prev_frame_h = config.environment.frame_h
        config.environment.prev_frame_w = config.environment.frame_w
        config.environment.frame_h = max(config.environment.frame_h, 256)
        config.environment.frame_w = max(config.environment.frame_w, 256)
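    # Build the environment in recording mode and re-assemble the wrapped model
    # for evaluation.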
    env = make_env(config.environment, recording=True)
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if (config.train.use_pixel_control or config.train.use_reward_prediction):
        model = ExperienceWrapper(model)
    if config.train.use_pixel_control:
        model = PixelControlWrapper(model, config.train.gamma,
                                    config.train.pc_coef)
    if config.train.use_reward_prediction:
        model = RewardPredictionWrapper(model, config.train.rp_coef)
    if config.train.use_value_replay:
        model = ValueReplayWrapper(model)
    model.config = config
    if cmd_args.pretrained_weights is not None:
        model.load_state_dict(torch.load(cmd_args.pretrained_weights))
    else:
        print(
            "You have not specified a path to the model weights; random play will be performed"
        )
    model.eval()
    results = record_video(model, env)
    log_message = "evaluated on pretrained weights: {}, results: {}".format(
        cmd_args.pretrained_weights, results)
    print(log_message)
    logging.info(log_message)
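
Both examples assemble the model the same way: a base ActorCritic is wrapped layer by layer, and each optional wrapper adds one auxiliary objective. A minimal sketch of that pattern, assuming a decorator-style nn.Module; the class name LossWrapper and its loss/auxiliary_loss methods are illustrative, not the project's actual API:

import torch.nn as nn

class LossWrapper(nn.Module):
    def __init__(self, model, coef=1.0):
        super().__init__()
        self.model = model  # inner model, possibly already wrapped
        self.coef = coef    # weight of this wrapper's auxiliary loss

    def forward(self, *inputs):
        # Delegate the forward pass to the wrapped model unchanged.
        return self.model(*inputs)

    def loss(self, batch):
        # Total loss = inner model's loss + this wrapper's weighted auxiliary term.
        return self.model.loss(batch) + self.coef * self.auxiliary_loss(batch)

    def auxiliary_loss(self, batch):
        # Concrete wrappers (e.g. pixel control, reward prediction) implement
        # their specific objective here.
        raise NotImplementedError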