Пример #1
0
              shuffle_gradients=args.reinforce_shuffle)

# Running index of generated trajectories across all aggregation iterations.
# It is handed to the recorder, the scalar writer and the logger below.
count = 0
for iteration in range(args.agg_iters):
    # Train first, then roll out a new batch of trajectories with the agent.
    agent.train(args.num_epochs, iteration)
    for _traj in range(args.traj_per_agg):
        timestep = env.reset()
        rewards = []
        if args.record:
            recorder = Recorder(args.experiment_name, count)

        for step in tqdm(range(args.traj_length), desc='Generating episode'):
            if args.record:
                # Capture the frame before acting so it matches `prev_state`.
                frame = env.physics.render(camera_id=0)
                recorder.record_frame(frame, step)

            prev_state = env.physics.state()
            action = agent.choose_action(prev_state)
            timestep = env.step(action)
            next_state = env.physics.state()
            reward = timestep.reward

            # Store the transition as [state, action, reward, next state].
            agent.D_RL.pushTrajectory([prev_state, action, reward, next_state])
            rewards.append(reward)

        # Sum once; the same value goes to stdout, the writer and the logger.
        episode_return = sum(rewards)
        print('Trajectory done. Total reward: {}'.format(episode_return))
        writer.add_scalar('total_reward', episode_return, count)

        val_loss = agent.validation_loss()
        logger.log([count, episode_return, val_loss[0], val_loss[1]])
        print(count, episode_return, val_loss[0], val_loss[1])

        # NOTE(review): best-checkpoint saving is disabled here; it used to be
        # agent.saveifbest(sum(rewards), args.experiment_name).
        if args.reinforce:
            agent.REINFORCE(rewards)
        if args.record:
            recorder.make_movie()
        count += 1