"""
m = 1000  # number of trajectories to generate for checking accuracy of learned algorithm
n = 1000  # number of time steps for each trajectory
lrz.dt = 0.01  # set default time step to 0.01
xstart = np.zeros((m, 3))  # stores the initial state of each of the m trajectories
xend = np.zeros((m, 3))  # stores the final state of each of the m trajectories
task = np.zeros((m, 1), dtype=float)
# task[j, 0] is 1.0 if trajectory j achieves the control objective, 0.0 otherwise

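# roll out m test trajectories and record, for each one, whether the control objective was achieved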
for j in range(m):
    xstart[j, :] = lrz.reset()  # reset() draws a random initial state
    lrz.trajectory(n, 0)  # generate a trajectory of n time steps
    xend[j, :] = lrz.state
    if lrz.reward() > -0.15:  # final reward above -0.15 counts as success
        task[j, 0] = 1.0

print('Efficiency of the learning algorithm is',
      100 * np.sum(task) / m, '%')

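# 3-D scatter plot comparing the initial and final states of the test trajectories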
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xstart[:, 0],
           xstart[:, 1],
           xstart[:, 2],
           c='k',
           marker='x',
           label="starting states")
ax.scatter(xend[:, 0],
           xend[:, 1],
           xend[:, 2],
           c='r',  # color/marker for the final states chosen here for contrast
           marker='o',
           label="final states")
ax.legend()
plt.show()

r = 1.5  # Lorenz system parameter r (the relative Rayleigh number)
lrz = Lorenz(sigma, b, r)  # initialize Lorenz object with the given parameters

n_samples = 1000  # number of training samples
lrz.X = np.zeros((n_samples, 3))  # training inputs: the sampled states
lrz.U = np.zeros((n_samples, 1))  # training targets: the control chosen for each state
"""  Training  
randomly initialize the state of the lorenz object and set lrz.X[i, :] to the initial state
lorenz object takes one step with -ve control and gets reward r1
reset the lorenz state back to starting state and take another step with +ve control which gives reward r2
Set policy lrz.U[i, 0] to -1 or 1 depending upon which policy maximizes reward
"""
for i in range(n_samples):
    lrz.X[i, :] = lrz.reset()  # sample a random starting state
    lrz.step(-lrz.max_control)  # one step with the negative extreme control
    r1 = lrz.reward()
    lrz.state = lrz.X[i, :]  # restore the starting state
    lrz.step(lrz.max_control)  # one step with the positive extreme control
    r2 = lrz.reward()
    lrz.U[i, 0] = 2 * np.argmax([r1, r2]) - 1  # -1 if r1 >= r2, +1 otherwise
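# lrz.X (states) and lrz.U (labels in {-1, +1}) form the training set saved below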

data = {
    'sigma': sigma,
    'b': b,
    'r': r,
    'n_samples': n_samples,
    'X': lrz.X,
    'U': lrz.U
}
# write the training data to a file
f = open("learning_algorithm2_training_data", "wb")
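# A minimal sketch of serializing the dict, assuming the standard-library pickle
# module is intended (consistent with the binary "wb" mode above); the import
# would normally sit with the other imports at the top of the script.
import pickle

pickle.dump(data, f)
f.close()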