def test_worker(args, shared_model, total_steps, optimizer):
    # Evaluation worker: periodically syncs with the shared model, plays an
    # episode in an unclipped-reward environment and logs progress.
    args.environment.clip_rewards = False
    env = make_env(args.environment)

    log_path = '{}/{}'.format(args.train.experiment_folder, 'log.txt')
    logging.basicConfig(filename=log_path, level=logging.INFO)
    logging.info("STARTED TRAINING PROCESS {}".format(
        time.strftime("%Y.%m.%d_%H:%M", time.localtime())))

    # Base actor-critic model plus the auxiliary-task wrappers enabled in the
    # config (pixel control, reward prediction, value replay).
    model = ActorCritic(env.observation_space.shape, env.action_space.n)
    model = BaseWrapper(model)
    if args.train.use_pixel_control or args.train.use_reward_prediction:
        model = ExperienceWrapper(model)
    if args.train.use_pixel_control:
        model = PixelControlWrapper(model, args.train.gamma, args.train.pc_coef)
    if args.train.use_reward_prediction:
        model = RewardPredictionWrapper(model, args.train.rp_coef)
    if args.train.use_value_replay:
        model = ValueReplayWrapper(model)
    model.config = args
    model.eval()

    start_time = time.time()
    reward_history = []

    while True:
        # Pull the latest weights from the shared (training) model.
        model.load_state_dict(shared_model.state_dict())
        if (len(reward_history) + 1) % args.train.save_frequency == 0:
            save_progress(args, model, optimizer, total_steps.value)

        stats = play_game(model, env)
        reward_history.append(stats['total_reward'])

        log_message = (
            'Time {}, num steps {}, FPS {:.0f}, '
            'curr episode reward {:.2f}, mean episode reward: {:.2f}, '
            'mean policy loss {:.2f}, mean value loss {:.2f}, '
            'mean entropy percentage {:.2f}'
        ).format(
            time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
            total_steps.value,
            total_steps.value / (time.time() - start_time),
            stats['total_reward'],
            np.mean(reward_history[-60:]),
            stats['policy_loss'],
            stats['value_loss'],
            stats['entropy'],
        )
        if args.train.use_pixel_control:
            log_message += ', pixel control loss {:.2f}'.format(stats['pc_loss'])
        if args.train.use_reward_prediction:
            log_message += ', reward prediction loss {:.2f}'.format(stats['rp_loss'])
        if args.train.use_value_replay:
            log_message += ', value replay loss {:.2f}'.format(stats['vr_loss'])

        print(log_message)
        logging.info(log_message)
        time.sleep(60)
config.environment.episode_length_sec, 60)
config.environment.prev_frame_h = config.environment.frame_h
config.environment.prev_frame_w = config.environment.frame_w
# Record at a resolution of at least 256x256.
config.environment.frame_h = max(config.environment.frame_h, 256)
config.environment.frame_w = max(config.environment.frame_w, 256)
env = make_env(config.environment, recording=True)

# Same model construction as in training: base actor-critic plus the
# auxiliary-task wrappers enabled in the config.
model = ActorCritic(env.observation_space.shape, env.action_space.n)
model = BaseWrapper(model)
if config.train.use_pixel_control or config.train.use_reward_prediction:
    model = ExperienceWrapper(model)
if config.train.use_pixel_control:
    model = PixelControlWrapper(model, config.train.gamma, config.train.pc_coef)
if config.train.use_reward_prediction:
    model = RewardPredictionWrapper(model, config.train.rp_coef)
if config.train.use_value_replay:
    model = ValueReplayWrapper(model)
model.config = config

if cmd_args.pretrained_weights is not None:
    model.load_state_dict(torch.load(cmd_args.pretrained_weights))
else:
    print("You have not specified a path to model weights; "
          "random play will be performed")
model.eval()

results = record_video(model, env)
log_message = "evaluated on pretrained weights: {}, results: {}".format(
    cmd_args.pretrained_weights, results)
print(log_message)
logging.info(log_message)