def __init__(self, x, y=None):
    r"""
    Store the observation data and prepare the covariance buffers.

    Parameters
    ----------
    x : array_like
        Observation data. Converted with the private function _create_array.
    y : array_like, optional
        Additional observation vector(s). When given they are converted with
        _create_array, kept as ``self.y`` and appended column-wise to ``x``.

    Notes
    -----
    ``self.n`` and ``self.m`` are unpacked from the shape of the combined
    array, so the combined data must be two-dimensional. ``self.cov`` is an
    uninitialized ``m x m`` array (``np.empty``) that is expected to be filled
    in later.

    """
    data = _create_array(x)[0]

    if y is not None:
        self.y = _create_array(y)[0]
        data = np.column_stack([data, self.y])

    self.x = data
    self.n, self.m = data.shape
    self.method = 'two_pass_covariance'
    self.cov = np.empty([self.m, self.m])
def isskewsymmetric(x):
    r"""
    Tests if a matrix is skew symmetric (antisymmetric).

    Parameters
    ----------
    x : array_like
        Accepts a numpy array, nested list, dictionary, or pandas DataFrame.
        The private function _create_array is called to convert x to a numpy
        array.

    Returns
    -------
    bool
        True if the matrix is square and equal (within ``np.allclose``
        tolerance) to its negative transpose, False otherwise.

    Notes
    -----
    A skew symmetric matrix satisfies

    .. math::

        A = -A^T

    so :math:`a_{ij} = -a_{ji}` and every diagonal entry must be 0:

    .. math::

        \begin{bmatrix}
          0 & a_{12} & a_{13} \\
          -a_{12} & 0 & a_{23} \\
          -a_{13} & -a_{23} & 0
        \end{bmatrix}

    Examples
    --------
    >>> m = pd.DataFrame({0: [0,-2,1], 1: [2,0,4], 2: [-1,-4,0]})
    >>> isskewsymmetric(m)
    True
    >>> m2 = pd.DataFrame({0: [2,-1,0], 1: [-1,2,-1], 2: [0,-1,2]})
    >>> isskewsymmetric(m2)
    False

    References
    ----------
    Rowland, Todd and Weisstein, Eric W. "Antisymmetric Matrix." From MathWorld--A Wolfram Web Resource.
        http://mathworld.wolfram.com/AntisymmetricMatrix.html

    """
    mat = _create_array(x)[0]
    n_rows, n_cols = mat.shape[0], mat.shape[1]

    # Only square matrices can be skew symmetric.
    if n_rows != n_cols:
        return False

    return bool(np.allclose(mat, -np.transpose(mat)))
def var_cond(x):
    r"""
    Calculates the condition number, denoted :math:`\kappa`, which measures the
    sensitivity of the variance :math:`S` of a sample vector :math:`x` as defined
    by Chan and Lewis (as cited in Chan, Golub, & Leveque, 1983). Given a machine
    accuracy :math:`u`, the value :math:`\kappa u` can be used to judge the
    accuracy of the different variance computation algorithms.

    Parameters
    ----------
    x : array_like
        Accepts a numpy array, nested list, dictionary, or pandas DataFrame.
        The private function _create_array is called to convert x to a numpy
        array.

    Returns
    -------
    kap_cond : float or numpy ndarray
        For 1D input, the single value ``norm(x) / sqrt(var(x))``. For 2D
        input, a 1D array holding that value for each column.

    Raises
    ------
    ValueError
        If the converted array is neither 1D nor 2D.

    Notes
    -----
    The condition number is defined as:

    .. math::

        \kappa = \frac{||x||_2}{\sqrt{S}} = \sqrt{1 + \bar{x}^2 N / S}

    ``norm`` and ``var`` are helpers defined elsewhere in this package.
    NOTE(review): the second equality above holds when :math:`S` is the sum of
    squared deviations; confirm that ``var`` returns the quantity intended here.

    References
    ----------
    Chan, T., Golub, G., & Leveque, R. (1983). Algorithms for Computing the Sample Variance:
        Analysis and Recommendations. The American Statistician, 37(3), 242-247.
        http://dx.doi.org/10.1080/00031305.1983.10483115

    """
    data = _create_array(x)[0]

    if data.ndim == 1:
        return norm(data) / np.sqrt(var(data))

    if data.ndim != 2:
        raise ValueError('array must be 1D or 2D')

    # One condition number per column of the 2D input.
    kappa = np.empty(data.shape[1])
    for idx, column in enumerate(data.T):
        kappa[idx] = norm(column) / np.sqrt(var(column))

    return kappa
def __init__(self, x):
    r"""
    Store the input matrix and allocate the Q and R factors.

    Parameters
    ----------
    x : array_like
        Matrix to decompose. Converted with the private function
        _create_array; it must be two-dimensional because both entries of its
        shape are read.

    Notes
    -----
    With ``m`` rows and ``n`` columns, ``self.q`` is allocated as an
    ``m x n`` zero matrix and ``self.r`` as an ``n x n`` zero matrix.
    ``self.method`` is set to ``'householder'``.

    """
    data = _create_array(x)[0]
    n_rows, n_cols = data.shape[0], data.shape[1]

    self.x = data
    self.m = n_rows
    self.n = n_cols
    self.r = np.zeros((n_cols, n_cols))
    self.q = np.zeros((n_rows, n_cols))
    self.method = 'householder'
def __init__(self, x, factors=None, rotate=None, covar=False):
    r"""
    Initializes the FactorAnalysis class.

    Parameters
    ----------
    x : array-like
        Numpy ndarray, pandas DataFrame, list of lists or dictionary (keys are column
        names and corresponding values are the column values) representing observation
        vectors. Converted with the private function _create_array.
    factors : int
        Number of underlying hypothetical factors. The value is passed through
        ``int()``.
        NOTE(review): the declared default of None is rejected by ``int()``
        with a TypeError, so a value must currently always be supplied.
    rotate : str, default None
        Rotation to use when performing the factor analysis. Stored only;
        described as currently unused.
    covar : boolean, default False
        Stored as ``self.covar``. Documented as: if False (default), use the
        covariance matrix; if True, use the correlation matrix.
        NOTE(review): the parameter name suggests the opposite mapping -
        confirm against the method that consumes ``self.covar``.

    Raises
    ------
    ValueError
        If ``factors`` exceeds the number of columns in ``x``.

    """
    data = _create_array(x)[0]
    n_factors = int(factors)

    self.x = data
    self.factors = n_factors

    if n_factors > data.shape[1]:
        raise ValueError('number of factors cannot exceed number of observation vectors')

    self.rotate = rotate
    self.covar = covar
    self.method = 'principal_component'
def issymmetric(x):
    r"""
    Tests if a matrix is symmetric.

    Parameters
    ----------
    x : array_like
        Accepts a numpy array, nested list, dictionary, or pandas DataFrame.
        The private function _create_array is called to convert x to a numpy
        array.

    Returns
    -------
    bool
        True if the matrix is square and equal (within ``np.allclose``
        tolerance) to its transpose, False otherwise.

    Notes
    -----
    A symmetric matrix is a square matrix that is equal to its transpose.

    .. math::

        A \in \mathbb{R}^{n \times n} \qquad A^T = A

    It has the following form:

    .. math::

        \begin{bmatrix}
          a_{11} & a_{12} & \cdots & a_{1n} \\
          a_{12} & a_{22} & \cdots & a_{2n} \\
          \vdots & \vdots & \ddots & \vdots \\
          a_{1n} & a_{2n} & \cdots & a_{nn}
        \end{bmatrix}

    Examples
    --------
    >>> m = pd.DataFrame({0: [2,-1,0], 1: [-1,2,-1], 2: [0,-1,2]})
    >>> issymmetric(m)
    True

    References
    ----------
    Golub, G., & Van Loan, C. (2013). Matrix computations (3rd ed.). Baltimore (MD): Johns Hopkins U.P.
    Weisstein, Eric W. "Symmetric Matrix." From MathWorld--A Wolfram Web Resource.
        http://mathworld.wolfram.com/SymmetricMatrix.html

    """
    mat = _create_array(x)[0]
    n_rows, n_cols = mat.shape[0], mat.shape[1]

    # A non-square matrix can never equal its transpose.
    if n_rows != n_cols:
        return False

    return bool(np.allclose(np.transpose(mat), mat))
def __init__(self, x):
    r"""
    Record the input data and its basic properties.

    Parameters
    ----------
    x : array_like
        Input data. Converted with the private function _create_array, whose
        two return values are stored as ``self.x`` and ``self.cols``.

    Raises
    ------
    ValueError
        If the converted array has more than two dimensions.

    Notes
    -----
    ``self.type`` keeps the class name of the object originally passed in,
    ``self.dim`` the number of dimensions of the converted array and
    ``self.n`` the length of its first axis. ``self.method`` is set to
    ``'corrected_two_pass'``.

    """
    self.type = x.__class__.__name__

    data, columns = _create_array(x)
    self.x, self.cols = data, columns

    if data.ndim > 2:
        raise ValueError('array must be 1D or 2D')

    self.dim = data.ndim
    self.n = data.shape[0]
    self.method = 'corrected_two_pass'
def isorthogonal(x):
    r"""
    Tests if a matrix is orthogonal.

    Parameters
    ----------
    x : array_like
        Accepts a numpy array, nested list, dictionary, or pandas DataFrame.
        The private function _create_array is called to convert x to a numpy
        array.

    Returns
    -------
    bool
        True if the matrix is square and :math:`AA^T` equals the identity
        (within ``np.allclose`` tolerance), False otherwise. A non-square
        matrix yields False; it previously yielded a non-empty string, which
        evaluates as true in a boolean context.

    Notes
    -----
    A square matrix :math:`A` is said to be orthogonal if:

    .. math::

        AA^T = I

    Where :math:`A^T` is the transpose of :math:`A` and :math:`I` is the identity matrix.
    The following matrix is orthogonal:

    .. math::

        A = \begin{bmatrix}
              \frac{1}{3} & -\frac{2}{3} & \frac{2}{3} \\
              \frac{2}{3} & -\frac{1}{3} & -\frac{2}{3} \\
              \frac{2}{3} & \frac{2}{3} & \frac{1}{3}
            \end{bmatrix}

    Examples
    --------
    >>> a = pd.DataFrame({0: [1/3, 2/3, 2/3], 1: [-2/3, -1/3, 2/3], 2: [2/3,-2/3,1/3]})
    >>> isorthogonal(a)
    True

    References
    ----------
    Rowland, Todd. "Orthogonal Matrix." From MathWorld--A Wolfram Web Resource, created by Eric W. Weisstein.
        http://mathworld.wolfram.com/OrthogonalMatrix.html

    """
    mat = _create_array(x)[0]

    # Orthogonality is only defined for square matrices; answer with a real
    # boolean so callers can safely write ``if isorthogonal(m):``.
    if mat.shape[0] != mat.shape[1]:
        return False

    return bool(np.allclose(np.dot(mat, mat.T), np.eye(mat.shape[0])))
def add_noise(cor, epsilon=None, M=None):
    r"""
    Adds noise to a correlation matrix.

    Parameters
    ----------
    cor : array_like
        Correlation matrix. Converted with the private function _create_array
        and copied as a floating point array, so the caller's object is left
        untouched.
    epsilon : float, default None
        Noise level; the diagonal is set to ``1 - epsilon``. Defaults to 0.05
        when None.
    M : int, default None
        Passed through to ``_CorMatrixSim._generate_noise``. Defaults to 2
        when None.

    Returns
    -------
    The value returned by ``_CorMatrixSim._generate_noise`` for the adjusted
    matrix.

    Notes
    -----
    The diagonal was previously written into the raw ``cor`` argument rather
    than the converted array, which modified the caller's data in place and
    failed for inputs ``np.fill_diagonal`` cannot write to (for example nested
    lists).

    """
    # dtype=float forces a private copy and keeps 1 - epsilon from being
    # truncated if the input happens to hold integers.
    x = np.array(_create_array(cor)[0], dtype=float)
    n = x.shape[1]

    if epsilon is None:
        epsilon = 0.05
    if M is None:
        M = 2

    np.fill_diagonal(x, 1 - epsilon)

    return _CorMatrixSim._generate_noise(x, n, M, epsilon)
def lagrange_interpolate(x, y):
    r"""
    Interpolates a polynomial given a set of equal-length x and y values using
    Lagrangian interpolation.

    Parameters
    ----------
    x
        One-dimensional array of x values. Can be a pandas DataFrame or Series, list,
        dictionary (first key-value pair is used if there are more than one), or numpy
        array. Must be same length as y.
    y
        One-dimensional array of y values. Can be a pandas DataFrame or Series, list,
        dictionary (first key-value pair is used if there are more than one), or numpy
        array. Must be same length as x.

    Returns
    -------
    Symbolic representation of the interpolating polynomial, as returned by
    ``simplify`` for the assembled expression string.

    Raises
    ------
    ValueError
        If x and y differ in length.

    Notes
    -----
    For a set of data points :math:`(x_0, y_0), (x_1, y_1), \cdots, (x_n, y_n)` with
    no duplicate :math:`x`, and a function :math:`f` that evaluates to these points,
    there is a unique polynomial :math:`P(x)` of degree :math:`\leq n` given by:

    .. math::

        P(x) = f(x_0)L_{n,0}(x) + \cdots + f(x_n)L_{n,n}(x) = \sum^n_{k=0} f(x_k) L_{n,k}(x)

    Where, for each :math:`k = 0, 1, \cdots, n`:

    .. math::

        L_{n,k} = \underset{i \neq k}{\prod^n_{i=0}} \frac{(x - x_i)}{(x_k - x_i)}

    A single data point produces the constant polynomial :math:`y_0`; this case
    previously failed with a TypeError because an integer was concatenated to a
    string.

    Examples
    --------
    >>> x, y = [0, 2, 3, 4], [7, 11, 28, 63]
    >>> lagrange_interpolate(x, y)
    x**3 - 2*x + 7

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.
    Cheney, E. W., & Kincaid, D. (2013). Numerical mathematics and computing (6th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length to evaluate polynomial')

    terms = []
    for k in range(len(x)):
        others = np.delete(x, k)
        node = str(x[k])

        # The leading '1' keeps both products valid expressions even when
        # there are no other nodes (single data point).
        numer = '*'.join(['1'] + ['(x - ' + str(xi) + ')' for xi in others])
        denom = '*'.join(['1'] + ['(' + node + ' - ' + str(xi) + ')' for xi in others])

        terms.append(str(y[k]) + '*(' + numer + ')/(' + denom + ')')

    poly = '+'.join(['0'] + terms)

    return simplify(poly)
def pearson(x, y=None):
    r"""
    Computes the Pearson product-moment correlation coefficients of the given variables.

    Parameters
    ----------
    x : array-like
        Can be Pandas DataFrame, Pandas Series, numpy ndarray, list, list of lists, or
        dictionary representing a 1D or 2D array containing the variables and their
        respective observation vectors. The input is concatenated column-wise with the
        parameter y if given.
    y : array-like, optional
        Same accepted types as x.

    Returns
    -------
    numpy ndarray
        The correlation coefficient matrix of the inputted variables.

    Notes
    -----
    Pearson's product-moment correlation coefficient is the covariance of two random
    variables divided by the product of their standard deviations and is typically
    represented by :math:`\rho`:

    .. math::

        \rho_{x, y} = \frac{cov(X, Y)}{\sigma_X \sigma_Y}

    The correlation matrix :math:`R` is obtained from the covariance matrix :math:`C`
    (computed by the package function ``covar``) as:

    .. math::

        R_{ij} = \frac{C_{ij}}{\sqrt{C_{ii} * C_{jj}}}

    The scaling is applied to the whole matrix at once; each entry is computed
    with the same operations as an element-by-element loop.

    Examples
    --------
    >>> h = np.array([[16,4,8,4], [4,10,8,4], [8,8,12,10], [4,4,10,12]])
    >>> pearson(h)
    array([[ 1.        , -0.47140452, -0.24618298, -0.45732956],
           [-0.47140452,  1.        ,  0.05802589, -0.29643243],
           [-0.24618298,  0.05802589,  1.        ,  0.80218063],
           [-0.45732956, -0.29643243,  0.80218063,  1.        ]])
    >>> pearson(h[:, 0:1], h[:, 1:])
    array([[ 1.        , -0.47140452, -0.24618298, -0.45732956],
           [-0.47140452,  1.        ,  0.05802589, -0.29643243],
           [-0.24618298,  0.05802589,  1.        ,  0.80218063],
           [-0.45732956, -0.29643243,  0.80218063,  1.        ]])
    >>> pearson(h[:, 1], h[:, 2])
    array([[ 1.        ,  0.05802589],
           [ 0.05802589,  1.        ]])

    References
    ----------
    Pearson correlation coefficient. (2017, July 12). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Pearson_correlation_coefficient&oldid=790217169
    Rencher, A. (n.d.). Methods of Multivariate Analysis (2nd ed.).
        Brigham Young University: John Wiley & Sons, Inc.

    """
    x = _create_array(x)[0]

    if y is not None:
        y = _create_array(y)[0]
        x = np.column_stack([x, y])

    covmat = np.asarray(covar(x), dtype=float)

    # sqrt(C_ii * C_jj) for every (i, j) pair in one shot.
    variances = np.diag(covmat)
    scale = np.sqrt(np.outer(variances, variances))

    return covmat / scale
def ttest(y1, y2=None, mu=None, var_equal=False):
    r"""
    Performs one and two-sample t-tests.

    Parameters
    ----------
    y1
        First sample to test
    y2
        Second sample. Optional
    mu
        Optional, sets the mean for comparison in the one sample t-test. Default 0.
        Ignored when a second sample is given.
    var_equal
        Optional, default False. If False, Welch's t-test for unequal variance and
        sample sizes is used. If True, equal variance between samples is assumed
        and Student's t-test is used.

    Returns
    -------
    namedtuple
        Namedtuple ``tTestResult`` with fields ``tvalue``, ``dof``, ``pvalue``,
        ``confint`` and ``x_mean``; ``y_mean`` is included as well when a second
        sample is given.

    Notes
    -----
    Welch's t-test is an adaption of Student's t test and is more performant when the
    sample variances and size are unequal. The test still depends on the assumption of
    the underlying population distributions being normally distributed.

    Welch's t test is defined as:

    .. math::

        t = \frac{\bar{X_1} - \bar{X_2}}{\sqrt{\frac{s_{1}^{2}}{N_1} + \frac{s_{2}^{2}}{N_2}}}

    where :math:`\bar{X}` is the sample mean, :math:`s^2` is the sample variance and
    :math:`N` is the sample size.

    If the :code:`var_equal` argument is True, Student's t-test is used, which assumes
    the two samples have equal variance. The t statistic is computed as:

    .. math::

        t = \frac{\bar{X}_1 - \bar{X}_2}{s_p \sqrt{\frac{1}{n_1} + \frac{1}{n_2}}}

    where:

    .. math::

        s_p = \sqrt{\frac{(n_1 - 1)s^2_{X_1} + (n_2 - 1)s^2_{X_2}}{n_1 + n_2 - 2}}

    The degrees of freedom follow the selected test: ``var_equal`` is forwarded to
    :code:`degrees_of_freedom`, so Student's test uses :math:`n_1 + n_2 - 2` rather
    than the Welch-Satterthwaite approximation.

    References
    ----------
    Rencher, A. C., & Christensen, W. F. (2012). Methods of multivariate analysis (3rd Edition).
    Student's t-test. (2017, June 20). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Student%27s_t-test&oldid=786562367

    """
    y1 = _create_array(y1)[0]
    n1 = len(y1)
    s1 = var(y1)
    ybar1 = np.mean(y1)

    if y2 is not None:
        y2 = _create_array(y2)[0]
        n2 = len(y2)
        s2 = var(y2)
        ybar2 = np.mean(y2)

        if var_equal is False:
            tval = float((ybar1 - ybar2) / np.sqrt(s1 / n1 + s2 / n2))
        else:
            # Pooled standard deviation for Student's equal-variance test.
            sp = np.sqrt(((n1 - 1.) * s1 + (n2 - 1.) * s2) / (n1 + n2 - 2.))
            tval = float((ybar1 - ybar2) / (sp * np.sqrt(1. / n1 + 1. / n2)))

    else:
        # Placeholder second-sample statistics for the one-sample case.
        ybar2, n2, s2 = 0.0, 1.0, 0.0
        if mu is None:
            mu = 0.0

        tval = float((ybar1 - mu) / np.sqrt(s1 / n1))

    # The degrees of freedom must match the test that produced tval.
    dof = degrees_of_freedom(y1, y2, var_equal=var_equal)
    pvalue = _student_t_pvalue(np.absolute(tval), dof)
    intervals = _t_conf_int((ybar1, n1, s1), dof=dof, y=(ybar2, n2, s2))

    if y2 is not None:
        tTestResult = namedtuple(
            'tTestResult',
            ['tvalue', 'dof', 'pvalue', 'confint', 'x_mean', 'y_mean'])

        tt = tTestResult(tvalue=tval, dof=dof, pvalue=pvalue, confint=intervals,
                         x_mean=ybar1, y_mean=ybar2)

    else:
        tTestResult = namedtuple(
            'tTestResult',
            ['tvalue', 'dof', 'pvalue', 'confint', 'x_mean'])

        tt = tTestResult(tvalue=tval, dof=dof, pvalue=pvalue, confint=intervals,
                         x_mean=ybar1)

    return tt
def degrees_of_freedom(y1, y2=None, var_equal=False):
    r"""
    Computes the degrees of freedom of one or two samples.

    Parameters
    ----------
    y1
        First sample to test
    y2
        Second sample. Optional.
    var_equal
        Optional, default False. If False, the Welch-Satterthwaite approximation for
        unequal variances and sample sizes is used. Otherwise equal variance between
        samples is assumed.

    Returns
    -------
    float
        the degrees of freedom

    Notes
    -----
    With one sample the degrees of freedom are:

    .. math::

        v = n - 1

    With two samples assumed to have equal variance:

    .. math::

        v = n_1 + n_2 - 2

    Otherwise the Welch-Satterthwaite equation is applied, with
    :math:`v_i = N_i - 1`:

    .. math::

        v \approx \frac{\left(\frac{s_{1}^2}{N_1} + \frac{s_{2}^2}{N_2}\right)^2}
                       {\frac{\left(\frac{s_1^2}{N_1}\right)^2}{v_1} +
                        \frac{\left(\frac{s_2^2}{N_2}\right)^2}{v_2}}

    References
    ----------
    Rencher, A. C., & Christensen, W. F. (2012). Methods of multivariate analysis (3rd Edition).
    Welch's t-test. (2017, June 16). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Welch%27s_t-test&oldid=785961228

    """
    y1 = _create_array(y1)[0]
    n1 = len(y1)
    s1 = var(y1)

    # One-sample case.
    if y2 is None:
        return float(n1 - 1)

    y2 = _create_array(y2)[0]
    n2 = len(y2)
    s2 = var(y2)

    # Equal-variance (Student) case.
    if var_equal is not False:
        return float(n1 + n2 - 2)

    # Welch-Satterthwaite approximation.
    se1, se2 = s1 / n1, s2 / n2
    numerator = np.power((se1 + se2), 2)
    denominator = np.power(se1, 2) / (n1 - 1) + np.power(se2, 2) / (n2 - 1)

    return float(numerator / denominator)
def anova_oneway(group, x, *args):
    r"""
    Performs one-way analysis of variance (ANOVA) of one measurement and a grouping variable

    Parameters
    ----------
    group
        One-dimensional array (Numpy ndarray, Pandas Series, list) that defines the group
        membership of the dependent variable(s). Must be the same length as the x parameter.
    x
        One or two-dimensional array (Numpy ndarray, Pandas DataFrame, list of lists) that
        defines the observation vectors of the dependent variables. Must be the same length
        as the group parameter.
    *args
        Optional additional observation vectors; they are appended column-wise to x.

    Returns
    -------
    namedtuple
        Namedtuple ``AnovaResult`` representing an ANOVA table:

        residualdf: Residuals degrees of freedom (:math:`n - k`)
        groupdf: Group degrees of freedom (:math:`k - 1`)
        fvalue: Computed F-value of the ANOVA procedure
        pvalue: Resulting p-value
        groupSumSq: SST
        groupMeanSq: MST
        resSumSq: SSE
        resMeanSq: MSE

    Notes
    -----
    One-way ANOVA can be considered an extension of the t-test when more than two groups
    are being tested. The factor, or categorical variable, is often referred to as the
    'treatment' in the ANOVA setting. ANOVA partitions the data's total variation into
    variation between and within groups.

    The hypothesis can be defined formally as:

    :math:`H_O: \mu_1 = \mu_2 = \cdots = \mu_k`
    :math:`H_A:` Not all population means are equal

    The sum of squares for treatments is :math:`SST`, for error :math:`SSE` and the
    total :math:`TotalSS`:

    .. math::

        SST = \sum_{i=1}^k n_i(\bar{y_{i}} - \bar{y})^2

    .. math::

        SSE = \sum_{i=1}^k (n_i - 1)s_i^2

    .. math::

        TotalSS = \sum_{i=1}^k \sum_{j=1}^{n_i} (y_{ij} - \bar{y})^2

    The mean squares are the sum of squares divided by the degrees of freedom.

    .. math::

        MST = \frac{SST}{k - 1}

    .. math::

        MSE = \frac{SSE}{n - k}

    The F-statistic is defined as:

    .. math::

        f = \frac{MST}{MSE}

    When the combined observation data has more than one column, the rows are summed
    across columns before the analysis.

    ``groupdf`` reports :math:`k - 1`, the same value used for MST and the p-value;
    it previously reported the number of groups :math:`k`.

    References
    ----------
    Rencher, A. (n.d.). Methods of Multivariate Analysis (2nd ed.).
        Brigham Young University: John Wiley & Sons, Inc.

    """
    # Truthiness test instead of an identity comparison against a tuple literal.
    if args:
        x = np.column_stack((x,) + tuple(args))

    x = _create_array(x)[0]

    if x.ndim > 1:
        x = np.sum(x, axis=1)

    grouparr, _ = _create_array(group)
    groupnames = np.unique(grouparr)

    data = np.column_stack([grouparr, x])

    xmeans = npi.group_by(data[:, 0], data[:, 1], np.mean)
    xn = npi.group_by(data[:, 0], data[:, 1], len)
    xvars = npi.group_by(data[:, 0], data[:, 1], var)

    sst, sse = _sst(xn, xmeans, np.mean(data[:, 1])), _sse(xn, xvars)

    k = len(groupnames)
    group_dof = k - 1
    res_dof = len(x) - k

    mst = sst / group_dof
    mse = sse / res_dof

    fval = mst / mse
    pval = _f_p_value(fval, group_dof, res_dof)

    AnovaResult = namedtuple('AnovaResult', [
        'residualdf', 'groupdf', 'fvalue', 'pvalue',
        'groupSumSq', 'groupMeanSq', 'resSumSq', 'resMeanSq'
    ])

    aov = AnovaResult(residualdf=res_dof, groupdf=group_dof, fvalue=fval, pvalue=pval,
                      groupSumSq=sst, groupMeanSq=mst, resSumSq=sse, resMeanSq=mse)

    return aov
def _sym_eig(x):
    r"""
    Return the eigenvalues of ``x`` as computed by ``np.linalg.eigvals``.

    Parameters
    ----------
    x : array_like
        Input matrix. Converted with the private function _create_array.

    Returns
    -------
    numpy ndarray
        The eigenvalues of the converted matrix.

    """
    return np.linalg.eigvals(_create_array(x)[0])
def __init__(self, x):
    r"""
    Store the input data and select the default norm.

    Parameters
    ----------
    x : array_like
        Input data. Converted with the private function _create_array.

    Notes
    -----
    ``self.order`` is set to ``'norm2'``.

    """
    data = _create_array(x)[0]

    self.x = data
    self.order = 'norm2'
def central_difference(x, y):
    r"""
    Approximates the derivative of an unknown function given a set of x and y = f(x)
    data points using the central-difference approximation method.

    Parameters
    ----------
    x : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of x values
    y : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of values of the
        function at x.

    Returns
    -------
    dict
        One entry per value of x. Keys have the form ``'f(<x value>)'`` and the values
        are the approximated derivatives at the corresponding x.

    Raises
    ------
    ValueError
        If x and y differ in length.

    Notes
    -----
    The derivative of a function :math:`f` at a value :math:`x_0` is defined by:

    .. math::

        f^\prime(x_0) = \underset{h \rightarrow 0}{lim} \frac{f(x_0 + h) - f(x_0)}{h}

    The central-difference formula is defined as:

    .. math::

        f^\prime (x_i) = \frac{f(x + h) - f(x - h)}{2h}

    which, for tabulated data, is evaluated at every interior point as:

    .. math::

        f^\prime (x_i) \approx \frac{y_{i+1} - y_{i-1}}{x_{i+1} - x_{i-1}}

    The first and last points have only one neighbour, so a forward and a backward
    difference are used there, respectively. The x-values should be equally spaced for
    the central formula to be accurate.

    The interior formula previously reduced algebraically to a one-sided backward
    difference; it now uses both neighbours.

    Examples
    --------
    For ``x, y = [0.0, 0.2, 0.4], [0.00000, 0.74140, 1.3718]`` the result is
    approximately ``{'f(0.0)': 3.707, 'f(0.2)': 3.4295, 'f(0.4)': 3.152}``.

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.
    Finite difference. (2017, June 9). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Finite_difference&oldid=784585490

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length')

    n = len(x)
    last = n - 1

    fdx = {}

    # First point: only a right-hand neighbour exists (forward difference).
    fdx['f(' + str(x[0]) + ')'] = (y[1] - y[0]) / (x[1] - x[0])

    # Interior points: symmetric difference across both neighbours.
    for i in range(1, last):
        fdx['f(' + str(x[i]) + ')'] = (y[i + 1] - y[i - 1]) / (x[i + 1] - x[i - 1])

    # Last point: only a left-hand neighbour exists (backward difference).
    fdx['f(' + str(x[last]) + ')'] = (y[last] - y[last - 1]) / (x[last] - x[last - 1])

    return fdx
def forward_difference(x, y):
    r"""
    Approximates the derivative of an unknown function given a set of x and y = f(x)
    data points using the forward-difference approximation method.

    Parameters
    ----------
    x : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of x values
    y : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of values of the
        function at x.

    Returns
    -------
    dict
        One entry per value of x. Keys have the form ``'f(<x value>)'`` and the values
        are the approximated derivatives at the corresponding x.

    Raises
    ------
    ValueError
        If x and y differ in length.

    Notes
    -----
    The derivative of a function :math:`f` at a value :math:`x_0` is defined by:

    .. math::

        f^\prime(x_0) = \underset{h \rightarrow 0}{lim} \frac{f(x_0 + h) - f(x_0)}{h}

    If the function is unknown, the derivative can still be approximated given a set of
    points :math:`(x_1, y_1), (x_2, y_2), \cdots, (x_n, y_n)`. The forward difference
    approximation is defined as:

    .. math::

        f^\prime (x_i) = y^\prime_i \approx \frac{y_{i+1} - y_i}{x_{i+1} - x_i}

    The last point has no successor, so the backward difference with its predecessor
    is used there.

    Every point after the first previously used the backward difference; all points
    except the last now use the forward formula above.

    Examples
    --------
    For ``x, y = [0.0, 0.2, 0.4], [0.00000, 0.74140, 1.3718]`` the result is
    approximately ``{'f(0.0)': 3.707, 'f(0.2)': 3.152, 'f(0.4)': 3.152}``.

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.
    Finite difference. (2017, June 9). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Finite_difference&oldid=784585490

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length')

    n = len(x)
    last = n - 1

    fdx = {}

    # Forward difference at the first point.
    fdx['f(' + str(x[0]) + ')'] = (y[1] - y[0]) / (x[1] - x[0])

    # Forward difference at every remaining point that has a successor.
    for i in range(1, last):
        fdx['f(' + str(x[i]) + ')'] = (y[i + 1] - y[i]) / (x[i + 1] - x[i])

    # No successor exists for the last point; fall back to the backward difference.
    fdx['f(' + str(x[last]) + ')'] = (y[last] - y[last - 1]) / (x[last] - x[last - 1])

    return fdx
def lu(a):
    r"""
    Computes the LU decomposition of a square matrix :math:`A`.

    Parameters
    ----------
    a : array_like
        Accepts a list, nested list, dictionary, pandas DataFrame or pandas Series.
        The private function _create_array is called to convert a to a numpy array;
        the decomposition then works on a private floating point copy.

    Returns
    -------
    l_u : tuple
        Returns a tuple containing the lower triangular matrix :math:`L` (unit
        diagonal) and the upper triangular matrix :math:`U`.

    Raises
    ------
    ValueError
        If the matrix is not square.

    Notes
    -----
    LU Decomposition factors a square matrix (:math:`n \times n`) into the product of
    a 'lower' and 'upper' triangular matrix:

    .. math::

        A = LU

    For example, for a :math:`3 \times 3` matrix:

    .. math::

        \begin{bmatrix}
          a_{11} & a_{12} & a_{13} \\
          a_{21} & a_{22} & a_{23} \\
          a_{31} & a_{32} & a_{33}
        \end{bmatrix} =
        \begin{bmatrix}
          l_{11} & 0 & 0 \\
          l_{21} & l_{22} & 0 \\
          l_{31} & l_{32} & l_{33}
        \end{bmatrix}
        \begin{bmatrix}
          u_{11} & u_{12} & u_{13} \\
          0 & u_{22} & u_{23} \\
          0 & 0 & u_{33}
        \end{bmatrix}

    No pivoting is performed, so a zero pivot leads to division by zero.

    Integer input is converted to floating point before elimination; previously
    the intermediate updates were written back into an integer array and any
    fractional part was silently truncated.

    Examples
    --------
    >>> a = pd.DataFrame({0: [16, 4, 8, 4], 2: [4, 10, 8, 4], 3: [8, 8, 12, 10], 4: [4, 4, 10, 12]})
    >>> l, u = lu(a)
    >>> print(l, u)
    [[ 1.          0.          0.          0.        ]
     [ 0.25        1.          0.          0.        ]
     [ 0.5         0.66666667  1.          0.        ]
     [ 0.25        0.33333333  1.5         1.        ]]
    [[ 16.   4.   8.   4.]
     [  0.   9.   6.   3.]
     [  0.   0.   4.   6.]
     [  0.   0.   0.   1.]]
    >>> np.dot(l, u)
    array([[ 16.,   4.,   8.,   4.],
           [  4.,  10.,   8.,   4.],
           [  8.,   8.,  12.,  10.],
           [  4.,   4.,  10.,  12.]])

    References
    ----------
    Cormen, T., Leiserson, C., Rivest, R., & Stein, C. (2009). Introduction to algorithms
        (3rd ed., pp. 819-822). Cambridge (Inglaterra): Mit Press.

    """
    x = _create_array(a)[0]

    # Work on a private copy that can hold fractional intermediate values.
    if np.issubdtype(x.dtype, np.inexact):
        x = x.copy()
    else:
        x = x.astype(float)

    n, m = x.shape

    if n != m:
        raise ValueError('Matrix must be square to perform LU decomposition')

    l, u = np.eye(n), np.zeros((n, n))

    for k in range(n):
        u[k, k] = x[k, k]

        # Column k of L (multipliers) and row k of U.
        l[k + 1:, k] = x[k + 1:, k] / u[k, k]
        u[k, k + 1:] = x[k, k + 1:]

        # Eliminate: subtract the rank-one update from the trailing submatrix.
        x[k + 1:, k + 1:] -= np.outer(l[k + 1:, k], u[k, k + 1:])

    l_u = (l, u)

    return l_u
def cholesky(a):
    r"""
    Function for computing the Cholesky decomposition of a symmetric, positive definite
    matrix.

    Parameters
    ----------
    a : array_like
        Accepts a list, nested list, dictionary, pandas DataFrame or pandas Series.
        The private function _create_array is called to convert a to a numpy array;
        the decomposition then works on a private floating point copy.

    Returns
    -------
    llt : tuple
        The lower-triangular matrix L and its transpose, the upper-triangular
        matrix L^T. The second element is the transpose of the first array
        (``x.T``), not an independent copy.

    Raises
    ------
    ValueError
        If ``ispositivedefinite`` reports that the matrix is not positive definite.

    Notes
    -----
    Cholesky decomposition is a special case of :math:`LU` decomposition for symmetric,
    positive definite matrices and is preferred when applicable as it is more
    efficient. For a symmetric, positive definite matrix :math:`A`:

    .. math::

        A = LL^T

    In component notation:

    .. math::

        L_{ii} = \sqrt{a_{ii} - \sum^{i-1}_{k=0} L^2_{ik}}

    .. math::

        L_{ji} = \frac{1}{L_{ii}} (a_{ij} - \sum^{i-1}_{k=0} L_{ik} L_{jk}) \qquad j = i + 1, i + 2, \cdots, N - 1

    Integer input is converted to floating point first; previously square roots and
    quotients were written back into an integer array and truncated, which gave
    wrong factors whenever they were not whole numbers. As a consequence the result
    is floating point even for integer input.

    Examples
    --------
    >>> h = pd.DataFrame({0: [16, 4, 8, 4], 2: [4, 10, 8, 4], 3: [8, 8, 12, 10], 4: [4, 4, 10, 12]})
    >>> l, lt = cholesky(h)
    >>> l
    array([[ 4.,  0.,  0.,  0.],
           [ 1.,  3.,  0.,  0.],
           [ 2.,  2.,  2.,  0.],
           [ 1.,  1.,  3.,  1.]])
    >>> lt
    array([[ 4.,  1.,  2.,  1.],
           [ 0.,  3.,  2.,  1.],
           [ 0.,  0.,  2.,  3.],
           [ 0.,  0.,  0.,  1.]])

    References
    ----------
    Press, W. (2007). Numerical Recipes 3rd Edition: The Art of Scientific Computing (3rd ed.).
        New York: Cambridge University Press.
    Watkins, D. (2010). Fundamentals of Matrix Computations, 3rd Edition. John Wiley & Sons.

    """
    x = _create_array(a)[0]

    # Work on a private copy that can hold non-integer factors.
    if np.issubdtype(x.dtype, np.inexact):
        x = x.copy()
    else:
        x = x.astype(float)

    n, m = x.shape

    if ispositivedefinite(x) is False:
        raise ValueError('Matrix is not positive definite')

    for j in range(n):
        # Diagonal entry, then the entries below it in column j.
        x[j, j] = np.sqrt(x[j, j] - np.dot(x[j, 0:j], x[j, 0:j]))

        for i in range(j + 1, n):
            x[i, j] = (x[i, j] - np.dot(x[i, 0:j], x[j, 0:j])) / x[j, j]

    # The factor was built in place; clear what is left of A above the diagonal.
    for j in range(1, n):
        x[0:j, j] = 0.0

    llt = (x, x.T)

    return llt
def neville(x, y, x0):
    r"""
    Evaluates an interpolated polynomial at a particular :math:`x` value given a set of
    :math:`x` and corresponding :math:`y` values.

    Parameters
    ----------
    x
        One-dimensional array of x values. Can be a pandas DataFrame or Series, list,
        dictionary (first key-value pair is used if there are more than one), or numpy
        array. Must be same length as y.
    y
        One-dimensional array of y values. Same accepted types as x. Must be same
        length as x.
    x0
        Desired value at which to interpolate and approximate the polynomial.

    Returns
    -------
    tuple
        The approximated value of the interpolated polynomial at x0 as a float, and a
        numpy array holding the iterated Neville table of intermediate values.

    Raises
    ------
    ValueError
        If x and y differ in length.

    Notes
    -----
    Neville's method evaluates the polynomial that passes through a given set of
    :math:`x` and :math:`y` points at a particular value, and is based on the divided
    differences recursion relation.

    Let a function :math:`f` be defined at points :math:`x_0, x_1, \cdots, x_k` where
    :math:`x_j` and :math:`x_i` are two distinct members. The :math:`k`th Lagrange
    polynomial interpolating :math:`f` at those :math:`k + 1` points is:

    .. math::

        P(x) = \frac{(x - x_j) P_{0,1,\cdots,j-1,j+1,\cdots,k}(x) - (x - x_i) P_{0,1,\cdots,i-1,i+1,\cdots,k}(x)}{(x_i - x_j)}

    Writing :math:`\hat{Q}` and :math:`Q` for the two lower-order polynomials:

    .. math::

        P(x) = \frac{(x - x_j) \hat{Q}(x) - (x - x_i) Q(x)}{(x_i - x_j)}

    Examples
    --------
    >>> x, y = [8.1, 8.3, 8.6, 8.7], [16.9446, 17.56492, 18.50515, 18.82091]
    >>> value, table = neville(x, y, 8.4)
    >>> value
    17.8770925

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.
    Cheney, E. W., & Kincaid, D. (2013). Numerical mathematics and computing (6th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.
    Neville's algorithm. (2016, January 2). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Neville%27s_algorithm&oldid=697870140

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length to evaluate polynomial')

    n = len(x)

    # Column 0 of the table holds the data; each later column combines two
    # neighbouring entries of the previous column.
    table = np.zeros((n, n))
    table[:, 0] = y

    for col in range(1, n):
        for row in range(col, n):
            upper = (x0 - x[row - col]) * table[row, col - 1]
            lower = (x0 - x[row]) * table[row - 1, col - 1]
            table[row, col] = (upper - lower) / (x[row] - x[row - col])

    return float(table[n - 1, n - 1]), table
def __init__(self, x):
    r"""
    Store the input matrix and select the default norm.

    Parameters
    ----------
    x : array_like
        Input matrix. Converted with the private function _create_array; it must
        be two-dimensional because its shape is unpacked into two values.

    Notes
    -----
    ``self.n`` and ``self.m`` hold the first and second entry of the matrix
    shape. ``self.order`` is set to ``'frobenius'``.

    """
    data = _create_array(x)[0]
    n_rows, n_cols = data.shape

    self.x = data
    self.order = 'frobenius'
    self.n, self.m = n_rows, n_cols
def divided_differences(x, y, x0=None):
    r"""
    Constructs an interpolating polynomial that passes through given x and y points using
    the divided differences method.

    Parameters
    ----------
    x
        One-dimensional array of x values. Can be a pandas DataFrame or Series, list,
        dictionary, or numpy array. Converted with the private function
        _create_array. Must be the same length as y.
    y
        One-dimensional array of y values. Can be a pandas DataFrame or Series, list,
        dictionary, or numpy array. Converted with the private function
        _create_array. Must be the same length as x.
    x0
        Optional. Value at which the interpolating polynomial is evaluated. When
        omitted, only the polynomial and the divided differences table are returned.

    Returns
    -------
    dict
        dict object containing the following entries:
        'Approximated Value of Interpolated Polynomial' (only when x0 is given)
        'Interpolated Function': the polynomial in Newton form, as a string
        'Divided Differences Table': numpy array of the divided differences

    Raises
    ------
    ValueError
        If x and y do not have the same length.

    Notes
    -----
    The divided differences method is a numerical procedure for interpolating a polynomial
    given a set of points. Unlike Neville's method, which is used to approximate the value
    of an interpolating polynomial at a given point, the divided differences method
    constructs the interpolating polynomial in Newton form.

    Assume that :math:`P_n(x)` is the :math:`n`th Lagrangian polynomial that corresponds
    with the function :math:`f` at a set of :math:`x` data points. The polynomial
    :math:`P_n(x)` can be expressed using the divided differences of the function
    :math:`f` with respect to the :math:`x`-values.

    .. math::

        P_n(x) = a_0 + a_1(x - x_0) + a_2(x - x_0)(x - x_1) + \cdots +
        a_n(x - x_0) \cdots (x - x_{n-1})

    The constants :math:`a_0, a_1, \cdots, a_n` are found by recursively generating the
    divided differences. The zeroth divided difference of :math:`f` with respect to
    :math:`x_i` is the value of :math:`f` at that point:

    .. math::

        f[x_i] = f(x_i)

    The first divided difference is then:

    .. math::

        f[x_i, x_{i+1}] = \frac{f[x_{i+1}] - f[x_i]}{x_{i+1} - x_i}

    The second divided difference follows:

    .. math::

        f[x_i, x_{i+1}, x_{i+2}] = \frac{f[x_{i+1},x_{i+2}] - f[x_i, x_{i+1}]}{x_{i+2} - x_i}

    This iteration continues until the :math:`n`th divided difference:

    .. math::

        f[x_0, x_1, \cdots, x_n] = \frac{f[x_1, x_2, \cdots, x_n] -
        f[x_0, x_1, \cdots, x_{n-1}]}{x_n - x_0}

    Thus the interpolating polynomial takes the form:

    .. math::

        P_n(x) = f[x_0] + f[x_0, x_1](x - x_0) + f[x_0, x_1, x_2](x - x_0)(x - x_1) +
        \cdots + f[x_0, x_1, x_2, \cdots, x_n](x - x_0)(x - x_1) \cdots (x - x_{n-1})

    The coefficients written into the 'Interpolated Function' string are rounded to five
    decimals, and the approximated value is obtained by evaluating that string, so it
    reflects the rounded coefficients rather than the full-precision table.

    Examples
    --------
    >>> x, y = [8.1, 8.3, 8.6, 8.7], [16.9446, 17.56492, 18.50515, 18.82091]
    >>> divided_differences(x, y, 8.4)
    {'Approximated Value of Interpolated Polynomial': 17.8770925200000,
     'Divided Differences Table': array([[  1.69446000e+01,   0.00000000e+00,   0.00000000e+00,
               0.00000000e+00],
            [  1.75649200e+01,   3.10160000e+00,   0.00000000e+00,
               0.00000000e+00],
            [  1.85051500e+01,   3.13410000e+00,   6.50000000e-02,
               0.00000000e+00],
            [  1.88209100e+01,   3.15760000e+00,   5.87500000e-02,
              -1.04166667e-02]]),
     'Interpolated Function': '16.9446 + 3.1016*(x - 8.1) + 0.065*(x - 8.1)*(x - 8.3) + -0.01042*(x - 8.1)*(x - 8.3)*(x - 8.6)'}
    >>> divided_differences(x, y)['Interpolated Function']
    '16.9446 + 3.1016*(x - 8.1) + 0.065*(x - 8.1)*(x - 8.3) + -0.01042*(x - 8.1)*(x - 8.3)*(x - 8.6)'

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.

    """
    xs, ys = _create_array(x)[0], _create_array(y)[0]

    if len(xs) != len(ys):
        raise ValueError('x and y must be the same length to evaluate polynomial')

    n = len(xs)

    q = np.zeros((n, n))
    q[:, 0] = ys

    # f accumulates the Newton-form polynomial as text; fi is the running product
    # '*(x - x_0)*(x - x_1)...' attached to the next coefficient.
    f = str(np.round(q[0, 0], 5))
    fi = ''

    for i in np.arange(1, n):
        for j in np.arange(i, n):
            q[j, i] = (q[j, i - 1] - q[j - 1, i - 1]) / (xs[j] - xs[j - i])

        # The diagonal entry q[i, i] is the coefficient of the i-th Newton term.
        fi = fi + '*(x - ' + str(xs[i - 1]) + ')'
        f = f + ' + ' + str(np.round(q[i, i], 5)) + fi

    res = {}
    # x0 is optional: only evaluate the polynomial when a point was supplied.
    if x0 is not None:
        res['Approximated Value of Interpolated Polynomial'] = \
            parse_expr(f).evalf(subs={Symbol('x'): x0})
    res['Interpolated Function'] = f
    res['Divided Differences Table'] = q

    return res
def spearman(x, y=None):
    r"""
    Computes the Spearman correlation coefficients of the given variables.

    Parameters
    ----------
    x : array-like
        Can be Pandas DataFrame, Pandas Series, numpy ndarray, list, list of lists, or
        dictionary representing a 1D or 2D array containing the variables and their
        respective observation vectors. The input is concatenated column-wise with the
        parameter y if given.
    y : array-like, optional
        Can be Pandas DataFrame, Pandas Series, numpy ndarray, list, list of lists, or
        dictionary representing a 1D or 2D array containing the variables and their
        respective observation vectors.

    Returns
    -------
    numpy ndarray
        The correlation coefficient matrix of the inputted variables, as returned by
        the function pearson applied to the column-wise ranks.

    Notes
    -----
    Spearman's :math:`\rho`, often denoted :math:`r_s` is a nonparametric measure of
    correlation. While Pearson's product-moment correlation coefficient represents the
    linear relationship between two variables, Spearman's correlation measures the
    monotonicity of two variables. Put more simply, Spearman's correlation is Pearson's
    correlation performed on ranked variables.

    Two random variables :math:`X` and :math:`Y` and their respective observation vectors
    :math:`x_1, x_2, \cdots, x_n` and :math:`y_1, y_2, \cdots, y_n` are converted to
    ranked variables (the ranks of identical values are averaged), often denoted
    :math:`rg_X` and :math:`rg_Y`, and the correlation :math:`r_s` is computed as:

    .. math::

        r_s = \rho_{rg_X, rg_Y} = \frac{cov(rg_X, rg_Y)}{\sigma_{rg_X} \sigma_{rg_Y}}

    Where :math:`\rho` is the Pearson correlation coefficient applied to the ranked
    variables, :math:`cov(rg_X, rg_Y)` is the covariance of the ranked variables and
    :math:`\sigma_{rg_X}` and :math:`\sigma_{rg_Y}` are the standard deviations of the
    ranked variables.

    Examples
    --------
    >>> h = np.array([[16,4,8,4], [4,10,8,4], [8,8,12,10], [4,4,10,12]])
    >>> spearman(h)
    array([[ 1.        , -0.38888889, -0.05555556, -0.38888889],
           [-0.38888889,  1.        , -0.05555556, -0.38888889],
           [-0.05555556, -0.05555556,  1.        ,  0.77777778],
           [-0.38888889, -0.38888889,  0.77777778,  1.        ]])
    >>> spearman(h[:, 0:1], h[:, 1:])
    array([[ 1.        , -0.38888889, -0.05555556, -0.38888889],
           [-0.38888889,  1.        , -0.05555556, -0.38888889],
           [-0.05555556, -0.05555556,  1.        ,  0.77777778],
           [-0.38888889, -0.38888889,  0.77777778,  1.        ]])
    >>> spearman(h[:, 0], h[:, 1])
    array([[ 1.        , -0.38888889],
           [-0.38888889,  1.        ]])

    References
    ----------
    Spearman's rank correlation coefficient. (2017, June 24). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Spearman%27s_rank_correlation_coefficient&oldid=787350680

    """
    x = _create_array(x)[0]

    if y is not None:
        y = _create_array(y)[0]
        x = np.column_stack([x, y])

    # Ranks of tied values are fractional (e.g. 1.5). They must be stored in a float
    # array: writing them into a copy of an integer input would silently truncate them.
    ranked = np.empty(x.shape, dtype=float)
    for i in np.arange(ranked.shape[1]):
        ranked[:, i] = rankdata(x[:, i], 'average')

    return pearson(ranked)
def backward_difference(x, y):
    r"""
    Approximates the derivative of an unknown function given a set of x and y = f(x)
    data points using first-order difference quotients of adjacent points.

    Parameters
    ----------
    x : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of x values.
        Must contain at least two points and be the same length as y.
    y : array-like
        Pandas DataFrame or Series, Numpy array, list or dictionary of values of the
        function at x. Must be the same length as x.

    Returns
    -------
    dict
        Maps the string ``'f(<x value>)'`` to the approximated derivative for each
        value in x, so the dict has one entry per distinct x value.

    Raises
    ------
    ValueError
        If x and y differ in length, or if fewer than two data points are given.

    Notes
    -----
    The derivative of a function :math:`f` at a value :math:`x_0` is defined by:

    .. math::

        f^\prime(x_0) = \underset{h \rightarrow 0}{lim} \frac{f(x_0 + h) - f(x_0)}{h}

    The backward difference approximation of a derivative from a set of data points is
    defined as:

    .. math::

        f^\prime (x_i) = y^\prime_i \approx \frac{y_i - y_{i-1}}{x_i - x_{i-1}}

    NOTE(review): this implementation stores the quotient of each pair
    :math:`(x_{i-1}, x_i)` under the key of the left point :math:`x_{i-1}` and reuses
    the last quotient for the final point. That keying corresponds to a forward
    difference everywhere except the last point. It is kept unchanged so existing
    results (see Examples) do not shift; confirm the intended convention.

    Examples
    --------
    >>> x, y = [0.0, 0.2, 0.4], [0.00000, 0.74140, 1.3718]
    >>> backward_difference(x, y)
    {'f(0.0)': 3.7069999999999994,
     'f(0.2)': 3.1519999999999997,
     'f(0.4)': 3.1519999999999997}
    >>> backward_difference([0.5,0.6,0.7], [0.4794,0.5646,0.6442])
    {'f(0.5)': 0.8520000000000002,
     'f(0.6)': 0.79600000000000026,
     'f(0.7)': 0.79600000000000026}

    References
    ----------
    Burden, R. L., & Faires, J. D. (2011). Numerical analysis (9th ed.).
        Boston, MA: Brooks/Cole, Cengage Learning.
    Finite difference. (2017, June 9). In Wikipedia, The Free Encyclopedia.
        From https://en.wikipedia.org/w/index.php?title=Finite_difference&oldid=784585490

    """
    x, y = _create_array(x)[0], _create_array(y)[0]

    if len(x) != len(y):
        raise ValueError('x and y must be the same length')

    n = len(x)
    # A difference quotient needs two points; without this check the end-point
    # handling below fails with an unrelated KeyError or IndexError.
    if n < 2:
        raise ValueError('at least two data points are required to approximate the derivative')

    fdx = {}
    for i in np.arange(1, n):
        fdx['f(' + str(x[i - 1]) + ')'] = (y[i] - y[i - 1]) / (x[i] - x[i - 1])

    # The last point has no following interval, so it reuses the preceding quotient.
    fdx['f(' + str(x[n - 1]) + ')'] = fdx['f(' + str(x[n - 2]) + ')']

    return fdx
def manova_oneway(group, x, *args):
    r"""
    Performs Multivariate Analysis of Variance (MANOVA) of one grouping variable and n
    dependent variables.

    Parameters
    ----------
    group
        One-dimensional array (Numpy ndarray, Pandas Series, list) that defines the group
        membership of the dependent variable(s). Must be the same length as the x
        parameter.
    x
        One or two-dimensional array (Numpy ndarray, Pandas DataFrame, list of lists) that
        defines the observation vectors of the dependent variables. Must be the same
        length as the group parameter.
    *args
        Optional additional one or two-dimensional arrays of dependent variables. They
        are stacked column-wise onto x, in the order given.

    Returns
    -------
    namedtuple
        ManovaResult namedtuple with the following fields representing a MANOVA table:
        groupdf: Group Vector Degrees of Freedom
        residualdf: Residuals Degrees of Freedom
        numdf: Numerator Degrees of Freedom
        denomdf: Denominator Degrees of Freedom
        pillai: Pillai Test Statistic
        wilks: Wilks' Lambda
        t2: T^2 statistic, also known as Lawley-Hotelling statistic
        roy: Reported value from Roy's Test
        pillai_f: Approximated F-Value of Pillai statistic
        wilks_f: Approximated F-Value of Wilks' Lambda
        t2_f: Approximated F-Value of T^2
        roy_f: Approximated F-Value of Roy's Test statistic
        pillai_p: p-value of approximated Pillai F-Value with numdf and denomdf
        wilks_p: p-value of approximated Wilks' Lambda F-Value with numdf and denomdf
        t2_p: p-value of approximated Lawley-Hotelling F-Value with numdf and denomdf
        roy_p: p-value of approximated Roy's Test F-Value with numdf and denomdf

    Notes
    -----
    MANOVA is an extension of Analysis of Variance (ANOVA) to several dependent
    variables. The approach to MANOVA is similar to ANOVA in many regards and requires
    the same assumptions (normally distributed dependent variables with equal covariance
    matrices).

    In the MANOVA setting, each observation vector can have a model denoted as:

    .. math::

        y_{ij} = \mu_i + \epsilon_{ij} \qquad i = 1, 2, \cdots, k; \qquad j = 1, 2, \cdots, n

    An 'observation vector' is a set of observations measured over several variables.
    With :math:`p` variables, :math:`y_{ij}` becomes:

    .. math::

        \begin{bmatrix} y_{ij1} \\ y_{ij2} \\ \vdots \\ y_{ijp} \end{bmatrix} =
        \begin{bmatrix} \mu_{i1} \\ \mu_{i2} \\ \vdots \\ \mu_{ip} \end{bmatrix} +
        \begin{bmatrix} \epsilon_{ij1} \\ \epsilon_{ij2} \\ \vdots \\ \epsilon_{ijp} \end{bmatrix}

    As in ANOVA, the goal is to compare the groups to see if there are any significant
    differences. However, instead of a single variable, the comparisons are made with the
    mean vectors of the samples. The null hypothesis :math:`H_0` is:

    .. math::

        H_0: \mu_1 = \mu_2 = \dots = \mu_k

    With an alternative hypothesis :math:`H_a` that at least two :math:`\mu` are unequal.
    There are :math:`p(k - 1)` equalities, where :math:`k` is the number of groups in the
    data, that must be true for :math:`H_0` to be accepted.

    The data's total variation is partitioned into variation between and within groups
    for every pair of the :math:`p` variables. This results in the :math:`H` "hypothesis
    matrix" and :math:`E` "error matrix." The :math:`H` matrix is a square
    :math:`p \times p` matrix with the form:

    .. math::

        H = \begin{bmatrix} SSH_{11} & SPH_{12} & \dots & SPH_{1p} \\
        SPH_{12} & SSH_{22} & \dots & SPH_{2p} \\
        \vdots & \vdots & & \vdots \\
        SPH_{1p} & SPH_{2p} & \cdots & SSH_{pp} \end{bmatrix}

    Each entry weights every group's deviation from the overall mean by that group's own
    number of observations, so groups of unequal size are supported.

    The error matrix :math:`E` is also :math:`p \times p`:

    .. math::

        E = \begin{bmatrix} SSE_{11} & SPE_{12} & \cdots & SPE_{1p} \\
        SPE_{12} & SSE_{22} & \cdots & SPE_{2p} \\
        \vdots & \vdots & & \vdots \\
        SPE_{1p} & SPE_{2p} & \cdots & SSE_{pp} \end{bmatrix}

    Once the :math:`H` and :math:`E` matrices are constructed, the mean vectors can be
    compared to determine if significant differences exist. There are several test
    statistics, of which the most common are Wilks' lambda, Roy's test, Pillai, and
    Lawley-Hotelling, that can be employed to test for significant differences. Each test
    statistic has specific properties and power.

    References
    ----------
    Rencher, A. (n.d.). Methods of Multivariate Analysis (2nd ed.).
        Brigham Young University: John Wiley & Sons, Inc.

    """
    if args:
        # Append any extra dependent variables as further columns of x.
        x = np.column_stack((x,) + tuple(args))

    x = _create_array(x)[0]
    grouparr = _create_array(group)[0]

    kn = len(np.unique(grouparr))

    # First column holds the group label, remaining columns the dependent variables.
    data = np.column_stack((grouparr, x))

    xmeans = data[:, 1:].mean(axis=0)
    xn = len(xmeans)

    # Per-group means and per-group observation blocks, both in the group order
    # produced by npi.group_by on the label column.
    groupmeans = npi.group_by(data[:, 0]).mean(data)[1][:, 1:]
    groups = npi.group_by(data[:, 0], data[:, 1:])[1]
    groupsizes = np.array([len(g) for g in groups], dtype=float)

    h, e = np.zeros((xn, xn)), np.zeros((xn, xn))

    for i in np.arange(xn):
        for j in np.arange(i + 1):
            # Between-group cross product: each group's term is weighted by that
            # group's own size (not by a size looked up with the variable index).
            h[i, j] = h[j, i] = np.sum(
                groupsizes * (groupmeans[:, i] - xmeans[i]) * (groupmeans[:, j] - xmeans[j]))

            # Within-group cross product, summed over the groups.
            within = [np.sum((k[:, i] - np.mean(k[:, i])) * (k[:, j] - np.mean(k[:, j])))
                      for k in groups]
            e[i, j] = e[j, i] = np.sum(within)

    vh, ve, pillai, pillai_f, wilks_lambda, wilks_lambda_f, t2, t2_f, roy, roy_f = \
        _manova_statistics(h, e, kn, len(x))

    num_df, denom_df = vh * xn, ve * xn

    pillai_pval = _f_p_value(pillai_f, num_df, denom_df)
    wilks_pval = _f_p_value(wilks_lambda_f, num_df, denom_df)
    t2_pval = _f_p_value(t2_f, num_df, denom_df)
    roy_pval = _f_p_value(roy_f, num_df, denom_df)

    ManovaResult = namedtuple('ManovaResult', [
        'groupdf', 'residualdf', 'numdf', 'denomdf',
        'pillai', 'wilks', 't2', 'roy',
        'pillai_f', 'wilks_f', 't2_f', 'roy_f',
        'pillai_p', 'wilks_p', 't2_p', 'roy_p',
    ])

    return ManovaResult(groupdf=vh, residualdf=ve, numdf=num_df, denomdf=denom_df,
                        pillai=pillai, wilks=wilks_lambda, t2=t2, roy=roy,
                        pillai_f=pillai_f, wilks_f=wilks_lambda_f, t2_f=t2_f, roy_f=roy_f,
                        pillai_p=pillai_pval, wilks_p=wilks_pval, t2_p=t2_pval,
                        roy_p=roy_pval)