shuffle_gradients=args.reinforce_shuffle) count = 0 for iteration in range(args.agg_iters): agent.train(args.num_epochs, iteration) for traj in range(args.traj_per_agg): timestep = env.reset() rewards = [] if args.record: recorder = Recorder(args.experiment_name, count) for t in tqdm(range(args.traj_length), desc='Generating episode'): if args.record: recorder.record_frame(env.physics.render(camera_id=0), t) state = env.physics.state() action = agent.choose_action(state) timestep = env.step(action) new_state, reward = env.physics.state(), timestep.reward agent.D_RL.pushTrajectory([state, action, reward, new_state]) rewards.append(reward) print('Trajectory done. Total reward: {}'.format(sum(rewards))) writer.add_scalar('total_reward', sum(rewards), count) val_loss = agent.validation_loss() logger.log([count, sum(rewards), val_loss[0], val_loss[1]]) print(count, sum(rewards), val_loss[0], val_loss[1]) #agent.saveifbest(sum(rewards), args.experiment_name) if args.reinforce: agent.REINFORCE(rewards) if args.record: recorder.make_movie() count += 1