np.column_stack((np.arange(timesteps),
                                                    np.concatenate(actions))),
                                   advantages_pl:
                                   np.concatenate(advantages),
                                   learning_rate_pl:
                                   learning_rate
                               })
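            # the update above weights each taken action's log-probability by its advantage
            # (assuming loss_f/train_f were built that way earlier; the (row, action) pairs fed to
            # actions_pl let the graph index the probability of each action actually taken)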
            # validation
            #val_rewards = [get_rollout(sess, env, rollout_limit, stochastic=True, seed=(epochs+i))[2] for i in range(10)]
            # store and print training statistics
            #mvr = np.mean([np.sum(r) for r in val_rewards])
            mtr = np.mean([np.sum(r) for r in rewards])
            #mvr = np.mean(np.sort([np.sum(r) for r in val_rewards])[5:-5])
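            # one statistics row per epoch: epoch index, env.get_nbactions() (presumably the
            # number of actions taken in the environment), mean training return, loss, and the
            # most recently measured win rate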
            statistics.append(
                [epoch, env.get_nbactions(), mtr, loss, win_rate])
            if epoch % 10 == 0:
                print('Epoch %4d. training reward: %6.2f, loss: %7.4f' %
                      (epoch + 1, mtr, loss))

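            # periodic checkpoint every 100 epochs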
            if epoch % 100 == 0:
                saver.save(sess, "{}/{}.ckpt".format(model, model))

            if epoch % 400 == 0:
                #Get win-rate
                win_rate = get_winrate(sess, env)
                print("Win Rate:", win_rate)

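                # keep a separate "best" checkpoint for the highest win rate seen so far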
                if win_rate > win_rate_best:
                    win_rate_best = win_rate  # remember the new best so only improvements overwrite the checkpoint
                    saver.save(sess, "{}/{}_best.ckpt".format(model, model))

        print('done')