def test_differentiate_univariate_vector_function(self):
        def func(x):
            f1 = x.logistic()
            f2 = x.log(base=2)
            f3 = x**2
            f4 = 1 - x
            return np.array([f1, f2, f3, f4])

        val, der = differentiate(func, np.array([2]))

        val_expected = np.array([1 / (1 + np.exp(-2)), np.log2(2), 4, -1])

        f1_der = [np.exp(-2) / (1 + np.exp(-2))**2]
        f2_der = [1 / (np.log(2) * 2)]
        f3_der = [2 * 2]
        f4_der = [-1]
        derivative_expected = np.array([f1_der, f2_der, f3_der, f4_der])

        np.testing.assert_array_almost_equal(val, val_expected)
        np.testing.assert_array_almost_equal(der, derivative_expected)

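        # The same function rewritten to take a single vector argument, exercised with scalar=False.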
        def func(x):
            f1 = x[0].logistic()
            f2 = x[0].log(base=2)
            f3 = x[0]**2
            f4 = 1 - x[0]
            return np.array([f1, f2, f3, f4])

        val, der = differentiate(func, np.array([2]), scalar=False)

        np.testing.assert_array_almost_equal(val, val_expected)
        np.testing.assert_array_almost_equal(der, derivative_expected)

    def test_differentiate_univariate_scalar_function(self):
        def func(x):
            return 3 * x**2 + 4 / x + np.sin(x**2)

        val, der = differentiate(func, np.array([2]))
        x = 2
        self.assertAlmostEqual(val, func(x))
        derivative_expected = np.array(
            [6 * x - 4 / x**2 + np.cos(x**2) * 2 * x])
        np.testing.assert_array_almost_equal(der, derivative_expected)

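        # The same function rewritten to take a vector argument, exercised with scalar=False.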
        def func(x):
            return 3 * x[0]**2 + 4 / x[0] + np.sin(x[0]**2)

        val, der = differentiate(func, np.array([x]), scalar=False)
        self.assertAlmostEqual(val, func(np.array([x])))
        np.testing.assert_array_almost_equal(der, derivative_expected)

    def test_differentiate_multivariate_vector_function(self):
        def func(x, y, z):
            f1 = 3 * x**2 + 4 * z / x + np.log(x + y + z)
            f2 = x * y * z
            f3 = 1 / (x - z)
            f4 = y
            return np.array([f1, f2, f3, f4])

        x = 2
        y = 3
        z = 1
        x_arr = np.array([x, y, z])
        val, der = differentiate(func, x_arr)

        f1_der = [
            6 * x - 4 * z / x**2 + 1 / (x + y + z), 1 / (x + y + z),
            (5 * x + 4 * z + 4 * y) / (x * (x + y + z))
        ]
        f2_der = [y * z, x * z, x * y]
        f3_der = [-1 / (x - z)**2, 0, 1 / (x - z)**2]
        f4_der = [0, 1, 0]
        derivative_expected = np.array([f1_der, f2_der, f3_der, f4_der])

        np.testing.assert_array_almost_equal(val, func(x, y, z))
        np.testing.assert_array_almost_equal(der, derivative_expected)

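        # The same multivariate function rewritten to take one vector argument instead of
        # separate scalars, exercised with scalar=False.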
        def func(x):
            f1 = 3 * x[0]**2 + 4 * x[2] / x[0] + np.log(x[0] + x[1] + x[2])
            f2 = x[0] * x[1] * x[2]
            f3 = 1 / (x[0] - x[2])
            f4 = x[1]
            return np.array([f1, f2, f3, f4])

        val, der = differentiate(func, x_arr, scalar=False)

        np.testing.assert_array_almost_equal(val, func(x_arr))
        np.testing.assert_array_almost_equal(der, derivative_expected)

    def test_differentiate_multivariate_scalar_function(self):
        def func(x, y, z):
            return 3 * x**2 + 4 * z / x + np.log(x + y + z)

        x = 2
        y = 3
        z = 1
        x_arr = np.array([x, y, z])

        val, der = differentiate(func, x_arr)
        self.assertAlmostEqual(val, func(x, y, z))
        derivative_expected = np.array([
            6 * x - 4 * z / x**2 + 1 / (x + y + z), 1 / (x + y + z),
            (5 * x + 4 * z + 4 * y) / (x * (x + y + z))
        ])
        np.testing.assert_array_almost_equal(der, derivative_expected)

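        # The same multivariate function rewritten to take one vector argument (scalar=False).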
        def func(x):
            return 3 * x[0]**2 + 4 * x[2] / x[0] + np.log(x[0] + x[1] + x[2])

        val, der = differentiate(func, x_arr, scalar=False)

        self.assertAlmostEqual(val, func(x_arr))
        np.testing.assert_array_almost_equal(der, derivative_expected)

    def bfgs_optimize(self, num_iterations=1000, learning_rate=0.01, tolerance=None):
        """
        Method that performs quasi-Newton optimization of the objective function with BFGS updates
        to an approximate inverse Hessian.

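        Update rule (as implemented below): with step s = x_new - x_old, gradient change
        y = g_new - g_old, and rho = 1 / (y^T s), the inverse-Hessian approximation H is updated as
            H <- (I - rho * s y^T) H (I - rho * y s^T) + rho * s s^T
        and the variables move by -learning_rate * H g.
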
        INPUTS
        =======
        - num_iterations: an int specifying the maximum number of BFGS iterations; Default is 1000
        - learning_rate: a float/int specifying the step size applied to each quasi-Newton update; Default is 0.01
        - tolerance: a float specifying the smallest tolerance for the updates to the variables. If the L2 norm
                of the update step is smaller than this value, the optimizer will terminate; Default is None
                (no tolerance check is used)

        RETURNS
        ========
        - val: the minimum value of the objective_function that was found (float)
        - cur_variable_values: the values for the inputs to objective_function that gave the
                minimum objective_value found. (1D array of floats with the same size as the number of
                inputs to the objective function)


        EXAMPLES
        =========

        # multivariate function with scalars as input
        >>> import numpy as np
        >>> f = lambda x, y: x**2 + y**2
        >>> op = Optimizer(f, np.array([1, -1]))
        >>> op.bfgs_optimize(num_iterations=1000, learning_rate=0.1)
        (4.82773951620493e-92, array([ 1.55366333e-46, -1.55366333e-46]))

        # multivariate function with a vector as input
        >>> import numpy as np
        >>> f = lambda x: x[0]**2 + x[1]**2
        >>> op = Optimizer(f, np.array([1, -1]), scalar=False)
        >>> op.bfgs_optimize(num_iterations=1000, learning_rate=0.1)
        (4.82773951620493e-92, array([ 1.55366333e-46, -1.55366333e-46]))

        # univariate function with scalar as input
        >>> import numpy as np
        >>> f = lambda x: x**2
        >>> op = Optimizer(f, np.array([1]))
        >>> op.bfgs_optimize(num_iterations=1000, learning_rate=0.1)
        (2.4138697581024885e-92, array([1.55366333e-46]))

        """

        num_variables = len(self.variable_initialization)
        cur_variable_values = self.variable_initialization
        cur_inv_hessian = np.eye(num_variables)
        val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history = [val]

        for i in range(num_iterations):
            delta_var = -learning_rate * cur_inv_hessian @ der
            cur_variable_values = cur_variable_values + delta_var
            val, der2 = differentiate(self.objective_function, cur_variable_values, self.scalar)
            self.val_history.append(val)
            identity = np.eye(num_variables)
            y = (der2 - der).reshape(-1, 1)
            s = delta_var.reshape(-1, 1)
            denominator = y.T @ s
            t1 = identity - s @ y.T / denominator
            t2 = identity - y @ s.T / denominator
            t3 = s @ s.T / denominator
            cur_inv_hessian = t1 @ cur_inv_hessian @ t2 + t3
            der = der2

            self._print_updates(i, val)

            if self._tolerance_check(tolerance, delta_var):
                break
        
        return val, cur_variable_values

    def adam_optimize(self, num_iterations=1000, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, tolerance=None):
        """
        Method that performs Adaptive Moment Estimation (Adam) optimization of the objective function.
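
        Update rule (as implemented below, with gradient g at step t):
            v <- beta1 * v + (1 - beta1) * g;      v_hat = v / (1 - beta1**t)
            s <- beta2 * s + (1 - beta2) * g**2;   s_hat = s / (1 - beta2**t)
            x <- x - learning_rate * v_hat / (sqrt(s_hat) + epsilon)
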
        INPUTS
        =======
        Default parameters follow those provided in the original paper (Kingma & Ba, 2015).
        - num_iterations: an int specifying the maximum number of iterations; Default is 1000
        - learning_rate: a float/int specifying the learning rate for gradient descent; Default value 0.001.
        - beta1: Exponential decay hyperparameter for the first moment estimates. Default value 0.9
        - beta2: Exponential decay hyperparameter for the second moment estimates. Default 0.999
        - epsilon: Hyperparameter preventing division by zero. Default value 1e-8.
        - tolerance: a float specifying the smallest tolerance for the updates to the variables. If the L2 norm
                of the update step is smaller than this value, the optimizer will terminate; Default is None
                (no tolerance check is used)
        
        RETURNS
        ========
        - objective_value: the minimum value of the objective_function that was found (float)
        - cur_variable_values: the values for the inputs to objective_function that gave the
                minimum objective_value found. (1D array of floats with the same size as the number of
                inputs to the objective function)
        EXAMPLES
        =========
        # multivariate function with scalars as input
        >>> import numpy as np
        >>> f = lambda x, y: x**3 + y**2
        >>> op = Optimizer(f, np.array([1, -1]))
        >>> op.adam_optimize(learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8)
        (6.03886825409073e-06, array([1.82103595e-02, 1.81385270e-21]))

        # multivariate function with a vector as input
        >>> import numpy as np
        >>> f = lambda x: x[0]**2 + x[1]**2
        >>> op = Optimizer(f, np.array([1, -1]), scalar=False)
        >>> op.adam_optimize(learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8)
        (7.701661519998926e-49, array([-6.20550623e-25,  6.20550623e-25]))

        # univariate function with scalar as input
        >>> import numpy as np
        >>> f = lambda x: x**2
        >>> op = Optimizer(f, np.array([1]))
        >>> op.adam_optimize(learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8)
        (3.850830759999463e-49, array([-6.20550623e-25]))

        """

        if not 0 <= beta1 < 1 or not 0 <= beta2 < 1:
            raise ValueError("The value of beta (sample weight) should be between 0 and 1 (excluding 1).")
        cur_variable_values = self.variable_initialization
        val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history = [val]
        v, s, v_corrected, s_corrected = 0, 0, 0, 0
        
        for i in range(num_iterations):
            # Compute the moving average of the gradients.
            v = beta1 * v + (1 - beta1) * der
            # Compute bias-corrected first moment estimate.
            v_corrected = v / (1 - np.power(beta1, i + 1))
            # Moving average of the squared gradients.
            s = beta2 * s + (1 - beta2) * der**2
            # Compute bias-corrected second raw moment estimate.
            s_corrected = s / (1 - np.power(beta2, i + 1))
            # Compute the update step from the bias-corrected moments.
            delta_var = learning_rate * v_corrected / (np.sqrt(s_corrected) + epsilon)
            cur_variable_values = cur_variable_values - delta_var
            val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
            self.val_history.append(val)

            self._print_updates(i, val)

            if self._tolerance_check(tolerance, delta_var):
                break
        return val, cur_variable_values

    def rmsprop_optimize(self, num_iterations=1000, learning_rate=0.001, epsilon=1e-7, beta=0.9, tolerance=None):
        """
        Method that performs RMSProp gradient descent optimization of the objective function.
        This is an enhancement to Adagrad that adjusts the learning rate alpha by dividing it by the square
        root of an exponential moving average of the squared gradients.

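        Update rule (as implemented below, with gradient g):
            s <- beta * s + (1 - beta) * g**2
            x <- x - learning_rate * g / sqrt(s + epsilon)
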
        INPUTS
        =======
        - num_iterations: an int specifying the maximum number of iterations of gradient descent; Default is 1000
        - learning_rate: a float/int specifying the learning rate for gradient descent; Default is 0.001
        - epsilon: A float to prevent division by zero during optimization; Default is 1e-7
        - beta: A float between 0 and 1 specifying the decay rate for the exponential moving average of
                squared gradients; Default is 0.9
        - tolerance: a float specifying the smallest tolerance for the updates to the variables. If the L2 norm
                       of the update step is smaller than this value, gradient descent will terminate; Default is None
                       (no tolerance check is used)

        RETURNS
        ========
        - objective_value: the minimum value of the objective_function that was found (float)
        - cur_variable_values: the values for the inputs to objective_function that gave the
                       minimum objective_value found. (1D array of floats with the same size as the number of
                       inputs to the objective function)


        EXAMPLES
        =========
        # Univariate objective function with scalar inputs.
        >>> import numpy as np
        >>> g = lambda x: x**4 - x
        >>> op = Optimizer(g, np.array([1]))
        >>> op.rmsprop_optimize(num_iterations=1000, learning_rate=0.01)
        (-0.4724703937105774, array([0.62996052]))

        # Multivariate objective function with scalar inputs.
        >>> import numpy as np
        >>> g = lambda x, y: x**2 + y**2 + 12
        >>> op = Optimizer(g, np.array([0.5, 0.88]))
        >>> op.rmsprop_optimize(num_iterations=10000, learning_rate=0.01)
        (12.00004995, array([ 0.0049975, -0.0049975]))

        # Multivariate objective function with vector inputs.
        >>> import numpy as np
        >>> g = lambda x: x[0]**2 + 2*x[1]**2 + 12
        >>> op = Optimizer(g, np.array([0.5, 0.88]), scalar=False)
        >>> op.rmsprop_optimize(num_iterations=10000, learning_rate=0.01)
        (12.0000749625, array([ 0.0049975 , -0.00499937]))

        """
        if not 0 <= beta <= 1:
            raise ValueError("The value of beta (decay rate) should be between 0 and 1.")
        cur_variable_values = self.variable_initialization
        val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history = [val]
        _exp_average_gradient = 0

        for i in range(num_iterations):
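            # Exponential moving average of the squared gradients scales the step size per coordinate.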
            _exp_average_gradient = (beta * _exp_average_gradient) + ((1 - beta) * der**2)
            delta_var = (learning_rate * der) / np.sqrt(_exp_average_gradient + epsilon)
            cur_variable_values = cur_variable_values - delta_var
            val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
            self.val_history.append(val)

            self._print_updates(i, val)

            if self._tolerance_check(tolerance, delta_var):
                break

        return val, cur_variable_values

    def adagrad_optimize(self, num_iterations=1000, learning_rate=0.01, epsilon=1e-7, tolerance=None):
        """
        Method that performs adaptive gradient descent optimization of the objective function. Adagrad adjusts
        the learning rate alpha by dividing it by the square root of the cumulative sum of current and past
        squared gradients.

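        Update rule (as implemented below, with gradient g):
            G <- G + g**2
            x <- x - learning_rate * g / sqrt(G + epsilon)
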
        INPUTS
        =======
        - num_iterations: an int specifying the maximum number of iterations of gradient descent; Default is 1000
        - learning_rate: a float/int specifying the learning rate for gradient descent; Default is 0.01
        - epsilon: A float to prevent division by zero during optimization; Default is 1e-7
        - tolerance: a float specifying the smallest tolerance for the updates to the variables. If the L2 norm
                       of the update step is smaller than this value, gradient descent will terminate; Default is None
                       (no tolerance check is used)

        RETURNS
        ========
        - objective_value: the minimum value of the objective_function that was found (float)
        - cur_variable_values: the values for the inputs to objective_function that gave the
                       minimum objective_value found. (1D array of floats with the same size as the number of
                       inputs to the objective function)


        EXAMPLES
        =========
        # Univariate objective function with scalar inputs.
        >>> import numpy as np
        >>> g = lambda x: x**4 - x
        >>> op = Optimizer(g, np.array([1]))
        >>> op.adagrad_optimize(num_iterations=1000, learning_rate=0.01)
        (-0.4705616040471904, array([0.65786042]))

        # Multivariate objective function with scalar inputs.
        >>> import numpy as np
        >>> g = lambda x, y: x**2 + y**2 + 12
        >>> op = Optimizer(g, np.array([0.5, 0.88]))
        >>> op.adagrad_optimize(num_iterations=10000, learning_rate=0.01)
        (12.000013226920059, array([8.13318093e-08, 3.63688329e-03]))

        # Multivariate objective function with vector inputs.
        >>> import numpy as np
        >>> g = lambda x: x[0]**2 + 2*x[1]**2 + 12
        >>> op = Optimizer(g, np.array([0.5, 0.88]), scalar=False)
        >>> op.adagrad_optimize(num_iterations=10000, learning_rate=0.01)
        (12.000026453839908, array([8.13318093e-08, 3.63688327e-03]))

        """
        cur_variable_values = self.variable_initialization
        val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history = [val]
        _cumsum_gradient = 0

        for i in range(num_iterations):
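            # Accumulate the sum of squared gradients over all iterations so far.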
            _cumsum_gradient = _cumsum_gradient + (der**2)
            delta_var = (learning_rate * der) / np.sqrt(_cumsum_gradient + epsilon)
            cur_variable_values = cur_variable_values - delta_var
            val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
            self.val_history.append(val)

            self._print_updates(i, val)

            if self._tolerance_check(tolerance, delta_var):
                break

        return val, cur_variable_values

    def momentum_optimize(self, num_iterations=1000, learning_rate=0.01, beta=0.9, tolerance=None):
        """
        Method that performs momentum gradient descent optimization of the objective function. It augments
        each update with a momentum term, an exponential moving average of current and past gradients.

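        Update rule (as implemented below, with gradient g):
            m <- beta * m + (1 - beta) * g
            x <- x - learning_rate * m
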
        INPUTS
        =======
        - num_iterations: an int specifying the maximum number of iterations of gradient descent; Default is 1000
        - learning_rate: a float/int specifying the learning rate for gradient descent; Default is 0.01
        - beta: A float between 0 and 1 specifying the decay rate for the exponential moving average of
                gradients (the momentum term); Default is 0.9
        - tolerance: a float specifying the smallest tolerance for the updates to the variables. If the L2 norm
                       of the update step is smaller than this value, gradient descent will terminate; Default is None
                       (no tolerance check is used)

        RETURNS
        ========
        - objective_value: the minimum value of the objective_function that was found (float)
        - cur_variable_values: the values for the inputs to objective_function that gave the
                       minimum objective_value found. (1D array of floats with the same size as the number of
                       inputs to the objective function)


        EXAMPLES
        =========
        # Univariate objective function with scalar inputs.
        >>> import numpy as np
        >>> g = lambda x: x**4 - x
        >>> op = Optimizer(g, np.array([1]))
        >>> op.momentum_optimize(num_iterations=1000, learning_rate=0.01)
        (-0.4724703937105774, array([0.62996052]))

        # Multivariate objective function with scalar inputs.
        >>> import numpy as np
        >>> g = lambda x, y: x**3 + 2*y**2 + 12
        >>> op = Optimizer(g, np.array([0.5, 0.88]))
        >>> op.momentum_optimize(num_iterations=10000, learning_rate=0.01)
        (12.000000035335317, array([ 3.28147927e-003, -1.79857502e-230]))

        # Multivariate objective function with vector inputs.
        >>> import numpy as np
        >>> g = lambda x: x[0]**3 + 2*x[1]**2 + 12
        >>> op = Optimizer(g, np.array([0.5, 0.88]), scalar=False)
        >>> op.momentum_optimize(num_iterations=1000, learning_rate=0.01)
        (12.00002667493136, array([2.98791178e-02, 1.51990528e-23]))

        """
        if not 0 <= beta <= 1:
            raise ValueError("The value of beta (decay rate) should be between 0 and 1.")
        
        cur_variable_values = self.variable_initialization
        val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history = [val]
        _current_momentum = 0

        for i in range(num_iterations):
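            # Exponential moving average of current and past gradients (the momentum term).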
            _current_momentum = (beta * _current_momentum) + ((1 - beta) * der)
            delta_var = learning_rate * _current_momentum
            cur_variable_values = cur_variable_values - delta_var
            val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
            self.val_history.append(val)

            self._print_updates(i, val)

            if self._tolerance_check(tolerance, delta_var):
                break

        return val, cur_variable_values

    def gd_optimize(self, num_iterations=1000, learning_rate=0.01, tolerance=None):
        """
        Method that performs vanilla gradient descent optimization of the objective function.

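        Update rule (as implemented below, with gradient g):
            x <- x - learning_rate * g
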
        INPUTS
        =======
        - num_iterations: an int specifying the maximum number of iterations of gradient descent; Default is 1000
        - learning_rate: a float/int specifying the learning rate for gradient descent; Default is 0.01
        - tolerance: a float specifying the smallest tolerance for the updates to the variables. If the L2 norm
                of the update step is smaller than this value, gradient descent will terminate; Default is None 
                (no tolerance check is used)

        RETURNS
        ========
        - val: the minimum value of the objective_function that was found (float)
        - cur_variable_values: the values for the inputs to objective_function that gave the
                minimum objective_value found. (1D array of floats with the same size as the number of
                inputs to the objective function)


        EXAMPLES
        =========

        # multivariate function with scalars as input
        >>> import numpy as np
        >>> f = lambda x, y: x**2 + y**2
        >>> op = Optimizer(f, np.array([1, -1]))
        >>> op.gd_optimize(num_iterations=1000, learning_rate=0.1)
        (3.026941164608489e-194, array([ 1.23023192e-97, -1.23023192e-97]))

        # multivariate function with a vector as input
        >>> import numpy as np
        >>> f = lambda x: x[0]**2 + x[1]**2
        >>> op = Optimizer(f, np.array([1, -1]), scalar=False)
        >>> op.gd_optimize(num_iterations=1000, learning_rate=0.1)
        (3.026941164608489e-194, array([ 1.23023192e-97, -1.23023192e-97]))

        # univariate function with scalar as input
        >>> import numpy as np
        >>> f = lambda x: x**2
        >>> op = Optimizer(f, np.array([1]))
        >>> op.gd_optimize(num_iterations=1000, learning_rate=0.1)
        (1.5134705823042444e-194, array([1.23023192e-97]))

        """

        cur_variable_values = self.variable_initialization
        val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
        self.val_history = [val]

        for i in range(num_iterations):
            delta_var = learning_rate * der
            cur_variable_values = cur_variable_values - delta_var
            val, der = differentiate(self.objective_function, cur_variable_values, self.scalar)
            self.val_history.append(val)

            self._print_updates(i, val)

            if self._tolerance_check(tolerance, delta_var):
                break

        return val, cur_variable_values