def nearest_neighbor_connections(distances: numpy.ndarray, k: int = 1, symmetric: bool = True) -> Connections:
    """
    connect each point to its k nearest neighbors

    individual points can be connected to more than k points because of ties and because a pair are not
    necessarily each other's nearest neighbors

    if k is at least as large as the number of other points, each point is simply connected to every
    other point

    :param distances: an n x n matrix containing the distances among a set of points
    :param k: the number of nearest neighbors to connect
    :param symmetric: should connections always be symmetric (default = True) or allow asymmetric connections
                      because the nearest neighbor of one point A does not necessarily have A as its nearest
                      neighbor; the flag is passed straight to the Connections constructor
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n, symmetric)
    for i in range(n):
        # all other points, ordered from closest to farthest (equal distances are ordered by point index)
        dists = sorted((distances[i, j], j) for j in range(n) if j != i)
        n_others = len(dists)
        c = min(k, n_others)  # cannot connect to more points than actually exist
        # account for ties: keep extending while the next candidate is exactly as far away as the last
        # neighbor already included, never reading past the end of the list
        while 0 < c < n_others and dists[c][0] == dists[c - 1][0]:
            c += 1
        for p in range(c):  # connect the c closest points to the ith point
            output.store(i, dists[p][1])
    return output
def residuals_from_simple_matrix_regression(y: numpy.ndarray, x: numpy.ndarray) -> numpy.ndarray:
    """
    regress matrix y on matrix x with a simple linear regression and return the matrix of residuals

    the diagonals of both input matrices are assumed to be all zeros; diagonal elements are left out of the
    element count and the diagonal of the returned matrix is reset to zero

    :param y: matrix of dependent variables, as a square numpy.ndarray
    :param x: matrix of independent variables, as a square numpy.ndarray
    :return: matrix of residuals of the simple linear regression of y on x
    """
    size = check_for_square_matrix(y)
    n_off_diagonal = size**2 - size

    # with zero diagonals, whole-matrix sums are the same as sums over the off-diagonal elements, so the
    # means below are effectively means of the off-diagonal elements only
    mean_x = numpy.sum(x) / n_off_diagonal
    mean_y = numpy.sum(y) / n_off_diagonal
    sum_xx = numpy.sum(numpy.square(x))
    sum_xy = numpy.sum(numpy.multiply(x, y))

    slope = (sum_xy - n_off_diagonal*mean_x*mean_y) / (sum_xx - n_off_diagonal*mean_x**2)
    intercept = mean_y - slope*mean_x

    result = y - intercept - slope*x
    numpy.fill_diagonal(result, 0)  # the subtraction of the intercept disturbed the zero diagonal
    return result
def minimum_spanning_tree(distances: numpy.ndarray) -> Connections:
    """
    calculate connections among points based on a minimum spanning tree

    The algorithm was devised independently by the original author, who notes that it sort of follows the
    suggestion made in Kruskal, Joseph B., Jr. 1956. On the shortest spanning subtree of a graph and the
    traveling salesman problem. Proceedings of the American Mathematical Society 7(1):48-50.

    The tree is grown from the first point: at every step the shortest link between a point already in the
    tree and a point not yet in the tree is added.

    :param distances: an n x n matrix containing distances among points
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    # order[:n_in_tree] are the points already in the tree; order[n_in_tree:] are those still to be added
    order = list(range(n))
    for n_in_tree in range(1, n):
        candidates = ((inside, outside) for inside in range(n_in_tree) for outside in range(n_in_tree, n))
        # min() keeps the first of any equally short links, scanning in the same order as nested loops would
        inside, outside = min(candidates, key=lambda pair: distances[order[pair[0]], order[pair[1]]])
        output.store(order[inside], order[outside])
        # move the newly connected point into the "in tree" part of the list
        order[n_in_tree], order[outside] = order[outside], order[n_in_tree]
    return output
def bearing_analysis(data: numpy.ndarray, distances: numpy.ndarray, angles: numpy.ndarray, nbearings: int = 36,
                     npermutations: int = 0) -> Tuple[list, list]:
    """
    Conduct a bearing analysis to test for anisotropic patterns in scattered data, method originally
    described in:

    Falsetti, A.B., and R.R. Sokal. 1993. Genetic structure of human populations in the British Isles.
    Annals of Human Biology 20:215-229.

    For each tested bearing, the geographic distances are weighted by the squared cosine of the difference
    between the pairwise angle and the bearing, and a Mantel test is run between the data matrix and the
    weighted distance matrix.

    :param data: an n x n matrix representing distances among data values
    :param distances: an n x n matrix representing geographic distances among data points
    :param angles: an n x n matrix representing geographic angles among data points
    :param nbearings: the number of bearings to test; the default is 36 (every 5 degrees)
    :param npermutations: the number of random permutations used to test the correlations; the default is 0
    :return: a tuple containing a list of output values and a list of text output; each row of output values
             holds the bearing (in degrees), the correlation, and its probability, followed by the permuted
             probability when npermutations is greater than zero
    """
    n = check_for_square_matrix(data)
    if (n != check_for_square_matrix(distances)) or (n != check_for_square_matrix(angles)):
        raise ValueError("input matrices must be the same size")

    with_permutations = npermutations > 0
    angle_width = pi / nbearings
    output = []
    for a in range(nbearings):
        test_angle = a * angle_width
        b_matrix = distances * numpy.square(numpy.cos(angles - test_angle))
        r, p_value, _, _, _, rand_p, _, _ = pyssage.mantel.mantel(data, b_matrix, [], npermutations)
        row = [a*180/nbearings, r, p_value]
        if with_permutations:
            row.append(rand_p)
        output.append(row)

    # create basic output text
    output_text = ["Bearing Analysis", "", "Tested {} vectors".format(nbearings), ""]
    col_headers = ["Bearing", "Correlation", "Prob"]
    col_formats = ["f", "f", "f"]
    if with_permutations:
        col_headers.append("RandProb")
        col_formats.append("f")
    create_output_table(output_text, output, col_headers, col_formats)

    bearing_output = namedtuple("bearing_output", ["output_values", "output_text"])
    return bearing_output(output, output_text)
def square_matrix_covariance(x: numpy.ndarray, y: numpy.ndarray) -> float:
    """
    calculate the covariance of two square matrices, assuming both have all-zero diagonals

    the diagonal elements are not counted: the number of observations is taken to be n**2 - n

    :param x: first matrix, as a square numpy.ndarray
    :param y: second matrix, as a square numpy.ndarray
    :return: the covariance of the two matrices
    """
    size = check_for_square_matrix(y)
    n_off_diagonal = size**2 - size
    product_of_sums = numpy.sum(x) * numpy.sum(y)
    sum_of_products = numpy.sum(numpy.multiply(x, y))
    return (sum_of_products - product_of_sums/n_off_diagonal) / (n_off_diagonal - 1)
def residuals_from_multi_matrix_regression(y: numpy.ndarray, x_list: list) -> numpy.ndarray:
    """
    regress matrix y on all of the matrices in x_list with a multiple linear regression and return the
    residuals

    each matrix is converted to a single column with flatten_without_diagonal; the regression includes an
    intercept and is solved through the normal equations; the residuals are converted back into an n x n
    matrix with deflatten_without_diagonal

    :param y: matrix of dependent variables, as a square numpy.ndarray
    :param x_list: list of matrices of independent variables (presumably each the same size as y; this is
                   not checked here)
    :return: the residuals of the regression, as returned by deflatten_without_diagonal
    """
    n = check_for_square_matrix(y)
    y_column = flatten_without_diagonal(y)

    # design matrix: a leading column of ones (the intercept) followed by one column per x matrix
    design = numpy.ones((len(y_column), len(x_list) + 1), dtype=float)
    for column, x in enumerate(x_list, start=1):
        design[:, column] = flatten_without_diagonal(x)

    # b = (X'X)^-1 X'y
    xtx_inverse = numpy.linalg.inv(numpy.matmul(design.T, design))
    coefficients = numpy.matmul(numpy.matmul(xtx_inverse, design.T), y_column)
    fitted = numpy.matmul(design, coefficients)
    return deflatten_without_diagonal(y_column - fitted, n)
def create_windrose_connections(distances: numpy.ndarray, angles: numpy.ndarray, annulus: int, sector: int,
                                a: int, c: float, d: float, e: float) -> Tuple[Connections, float, float]:
    """
    connect the pairs of points which fall within a single sector of a single annulus of a windrose

    the annulus covers distances from c*annulus**2 + d*annulus + e (inclusive) up to the same quadratic
    evaluated at annulus + 1 (exclusive); these bounds are also recorded as min_scale and max_scale on the
    returned Connections object

    the sector covers angles from sector*breadth (inclusive) up to (sector + 1)*breadth (exclusive), where
    breadth is pi divided by the value returned from windrose_sectors_per_annulus(a, annulus)

    :param distances: an n x n matrix containing distances among points
    :param angles: an n x n matrix containing angles among points
    :param annulus: the index of the annulus (distance ring) to use
    :param sector: the index of the sector within the annulus to use
    :param a: passed through to windrose_sectors_per_annulus (presumably controls how many sectors each
              annulus has -- confirm against that function)
    :param c: quadratic coefficient of the annulus boundary function
    :param d: linear coefficient of the annulus boundary function
    :param e: constant term of the annulus boundary function
    :return: a tuple containing the Connections object, the minimum angle of the sector, and the maximum
             angle of the sector
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    next_annulus = annulus + 1
    output.min_scale = c * annulus**2 + d * annulus + e
    output.max_scale = c * next_annulus**2 + d * next_annulus + e

    sector_breadth = pi / windrose_sectors_per_annulus(a, annulus)
    sector_min = sector * sector_breadth
    sector_max = (sector + 1) * sector_breadth

    # only the lower triangle is examined, so each pair of points is considered once
    for i in range(1, n):
        for j in range(i):
            if not (output.min_scale <= distances[i, j] < output.max_scale):
                continue
            if sector_min <= angles[i, j] < sector_max:
                output.store(i, j)
    return output, sector_min, sector_max
def relative_neighborhood_network(distances: numpy.ndarray) -> Connections:
    """
    calculate connections among points based on a relative neighborhood network

    two points are connected unless some third point is closer to both of them than they are to each other

    :param distances: an n x n matrix containing distances among points
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    for i in range(1, n):
        for j in range(i):
            pair_distance = distances[i, j]
            blocked = any(
                (distances[other, j] < pair_distance) and (distances[other, i] < pair_distance)
                for other in range(n)
                if other != i and other != j
            )
            if not blocked:
                output.store(i, j)
    return output
def gabriel_network(distances: numpy.ndarray) -> Connections:
    """
    calculate connections among points based on a Gabriel network

    two points are connected unless, for some third point, the squared distance between the pair exceeds
    the sum of the squared distances from the third point to each member of the pair

    :param distances: an n x n matrix containing distances among points
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    squared = numpy.square(distances)
    for i in range(1, n):
        for j in range(i):
            pair_squared = squared[i, j]
            blocked = any(
                pair_squared > squared[other, j] + squared[other, i]
                for other in range(n)
                if other != i and other != j
            )
            if not blocked:
                output.store(i, j)
    return output
def connect_distance_range(distances: numpy.ndarray, maxdist: float, mindist: float = 0) -> Connections:
    """
    calculate connections based on a distance range, defined by maxdist and mindist

    points are not connected to themselves, even with a distance of zero, because only pairs of distinct
    points are examined; the range is also recorded as min_scale and max_scale on the returned object

    :param distances: an n x n matrix containing distances among points
    :param maxdist: the maximum distance between points to connect. this distance is exclusive
    :param mindist: the minimum distance between points to connect (default = 0). this distance is inclusive
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    output.min_scale = mindist
    output.max_scale = maxdist
    # walk the lower triangle only, so every pair of distinct points is tested exactly once
    for i in range(1, n):
        for j in range(i):
            in_range = mindist <= distances[i, j] < maxdist
            if in_range:
                output.store(i, j)
    return output
def _line_through(x, y, a: int, b: int) -> tuple:
    """return the (slope, intercept) of the non-vertical line passing through points a and b"""
    slope = (y[a] - y[b]) / (x[a] - x[b])
    return slope, y[a] - slope * x[a]


def _segments_conflict(x, y, i: int, j: int, p: int, q: int) -> bool:
    """
    determine whether the segment joining points i and j crosses or overlaps the segment joining p and q

    the caller is expected to have already skipped segments which share an end point
    """
    vertical1 = x[i] == x[j]
    vertical2 = x[p] == x[q]

    if vertical1 and vertical2:
        # two vertical segments can only overlap when they lie on the same vertical line and an end point of
        # the second falls strictly between the end points of the first
        if x[i] != x[p]:
            return False
        low, high = min(y[i], y[j]), max(y[i], y[j])
        return bool((low < y[p] < high) or (low < y[q] < high))

    if vertical1:
        # first segment is vertical; project the second line onto that x position
        m2, b2 = _line_through(x, y, p, q)
        xc = x[i]
        yc = m2 * xc + b2
    elif vertical2:
        # second segment is vertical; project the first line onto that x position
        m1, b1 = _line_through(x, y, i, j)
        xc = x[p]
        yc = m1 * xc + b1
    else:
        m1, b1 = _line_through(x, y, i, j)
        m2, b2 = _line_through(x, y, p, q)
        if m1 == m2:
            # parallel segments can only overlap if they lie on the same line (identical intercepts)
            if b1 != b2:
                return False
            # compare along x rather than y: for a sloped line the two are equivalent, but for horizontal
            # segments every y is identical and only x can reveal an overlap
            low, high = min(x[i], x[j]), max(x[i], x[j])
            return bool((low < x[p] < high) or (low < x[q] < high))
        xc = (b2 - b1) / (m1 - m2)
        yc = m1 * xc + b1

    # (xc, yc) is the projected crossing point of the two lines; the segments cross if this point falls
    # within the bounds of both segments
    return bool(
        (min(x[i], x[j]) <= xc <= max(x[i], x[j])) and (min(y[i], y[j]) <= yc <= max(y[i], y[j])) and
        (min(x[p], x[q]) <= xc <= max(x[p], x[q])) and (min(y[p], y[q]) <= yc <= max(y[p], y[q]))
    )


def least_diagonal_network(x: numpy.ndarray, y: numpy.ndarray, distances: numpy.ndarray) -> Connections:
    """
    calculate connections among points based on a least diagonal network

    pairs of points are examined from the closest pair to the farthest pair; a pair is connected unless the
    line segment joining it would cross or overlap the segment of a previously connected pair (segments
    which share an end point are never considered to conflict)

    :param x: the x coordinates of n points
    :param y: the y coordinates of n points
    :param distances: an n x n matrix containing the distances among the points defined by x and y
    :return: returns a Connections object
    :raises ValueError: if the coordinate arrays do not have the same length as the distance matrix
    """
    n = check_for_square_matrix(distances)
    if (n != len(x)) or (n != len(y)):
        raise ValueError("The coordinate arrays and the distance matrix must have the same length")
    output = Connections(n)

    # flatten distances into one dimension (half matrix only), keeping track of the position in the matrix,
    # then order from closest to farthest
    candidates = sorted((distances[i, j], i, j) for i in range(n) for j in range(i))

    accepted = []
    for _, i, j in candidates:  # only the point indices are needed, not the actual distance
        conflict = any(
            _segments_conflict(x, y, i, j, p, q)
            for p, q in accepted
            if (i not in (p, q)) and (j not in (p, q))
        )
        if not conflict:
            accepted.append((i, j))
            output.store(i, j)
    return output
def mantel(input_matrix1: numpy.ndarray, input_matrix2: numpy.ndarray, partial, permutations: int = 0,
           tail: str = "both") -> Tuple[float, float, list, float, float, float, list, float]:
    """
    perform a Mantel test of the association between two square matrices, optionally holding one or more
    additional matrices constant (a partial Mantel test) and optionally testing significance by permutation

    the matrices are assumed to have all-zero diagonals

    :param input_matrix1: the first matrix, as a square numpy.ndarray
    :param input_matrix2: the second matrix, as a square numpy.ndarray, the same size as the first
    :param partial: a list of matrices to hold constant; use an empty list for a standard Mantel test. when
                    not empty, both input matrices are replaced by their residuals from a regression on
                    these matrices
    :param permutations: the number of permutations for the permutation test (default = 0, no permutation
                         test); the observed data are counted as the first permutation
    :param tail: the tail of the test; "both" (the default) gives a two-tailed probability and "right" a
                 right-tailed probability, while any other value accepted by check_tail gives the
                 left-tailed probability (presumably "left" -- confirm against check_tail)
    :return: a named tuple (mantel_output) containing
             r: the correlation between the (residual) matrices
             p_value: the probability from the normal approximation, for the requested tail
             output_text: a list of strings summarizing the analysis
             permuted_left_p: the left-tailed probability from the permutation test (1 if not performed)
             permuted_right_p: the right-tailed probability from the permutation test (1 if not performed)
             permuted_two_p: the two-tailed probability from the permutation test (1 if not performed or if
                             tail is not "both")
             permuted_r_list: the observed correlation followed by the correlation of each permutation
             z_score: the standardized value of the observed Z statistic
    :raises ValueError: if the matrices are not all the same size
    """
    check_tail(tail)
    n = check_for_square_matrix(input_matrix1)
    if n != check_for_square_matrix(input_matrix2):
        raise ValueError("input matrices must be the same size")
    if len(partial) > 0:
        for x in partial:
            if n != check_for_square_matrix(x):
                raise ValueError("input matrices must be the same size")
        matrix1 = residuals_from_matrix_regression(input_matrix1, partial)
        matrix2 = residuals_from_matrix_regression(input_matrix2, partial)
    else:
        matrix1 = numpy.copy(input_matrix1)
        matrix2 = numpy.copy(input_matrix2)

    observed_z = numpy.sum(numpy.multiply(matrix1, matrix2))  # assumes diagonals are all zeros

    # for non-partial version the denominator of r is constant no matter the permutation, so only calculate
    # once to save time on two-tailed tests
    sq_cov2 = square_matrix_covariance(matrix2, matrix2)
    sqxy = sqrt(square_matrix_covariance(matrix1, matrix1) * sq_cov2)
    r = square_matrix_covariance(matrix1, matrix2) / sqxy

    observed_mu, observed_std = mantel_moments(matrix1, matrix2)
    z_score = (observed_z - observed_mu) / observed_std
    p_value = scipy.stats.norm.cdf(z_score)  # left-tailed probability; converted to requested tail below

    # create basic output text
    # TODO: report the names of the input matrices (and those held constant) if matrix names become available
    output_text = list()
    output_text.append("Mantel Test")
    output_text.append("")
    output_text.append("Matrices are {0} x {0}".format(n))
    if len(partial) > 0:
        output_text.append("{} matrices held constant".format(len(partial)))
    output_text.append("")
    output_text.append("Observed Z = " + format(observed_z, OUT_FRMT))
    output_text.append("Correlation = " + format(r, OUT_FRMT))
    output_text.append("t = " + format(z_score, OUT_FRMT))
    output_text.append("Left-tailed p = " + format(p_value, OUT_FRMT))
    output_text.append("Right-tailed p = " + format(1 - p_value, OUT_FRMT))
    output_text.append("Two-tailed p = " + format(2*min(p_value, 1 - p_value), OUT_FRMT))
    output_text.append("")

    # change p_value to requested tail
    if tail == "both":
        p_value = 2*min(p_value, 1 - p_value)
    elif tail == "right":
        p_value = 1 - p_value

    # perform permutation tests
    permuted_r_list = [r]
    if permutations > 0:
        cumulative_left = 0
        cumulative_right = 0
        cumulative_equal = 1  # the observed value counts as equal to itself
        cumulative_total = 1
        for _ in range(permutations - 1):  # count observed as first permutation
            # create permuted version of matrix 1, permuting rows and columns in tandem
            rand_order = numpy.arange(n)
            numpy.random.shuffle(rand_order)
            matrix1 = input_matrix1[numpy.ix_(rand_order, rand_order)]
            if len(partial) > 0:
                # if partial, calculate residuals for permuted matrix
                matrix1 = residuals_from_matrix_regression(matrix1, partial)

            # the correlation is calculated for every permutation (rather than the quicker Z statistic) so
            # that the permuted values can be saved and returned
            numerator = square_matrix_covariance(matrix1, matrix2)
            if len(partial) > 0:
                denominator = sqrt(square_matrix_covariance(matrix1, matrix1) * sq_cov2)
            else:
                # for non-partial tests can save computation as denominator is fixed
                denominator = sqxy
            permuted_r = numerator / denominator
            if permuted_r < r:
                cumulative_left += 1
            elif permuted_r > r:
                cumulative_right += 1
            else:
                cumulative_equal += 1
            if abs(permuted_r) >= abs(r):
                cumulative_total += 1
            permuted_r_list.append(permuted_r)

        permuted_right_p = (cumulative_equal + cumulative_right) / permutations
        permuted_left_p = (cumulative_equal + cumulative_left) / permutations
        if tail == "both":
            permuted_two_p = cumulative_total / permutations
        else:
            permuted_two_p = 1

        output_text.append("Probability results from {} permutation".format(permutations))
        output_text.append("# of permutations < observed = {}".format(cumulative_left))
        output_text.append("# of permutations > observed = {}".format(cumulative_right))
        if tail == "both":
            output_text.append("# of permutations >= |observed| = {}".format(cumulative_total))
        output_text.append("# of permutations = observed = {}".format(cumulative_equal))
        output_text.append("")
        output_text.append("Left-tailed p = " + format(permuted_left_p, OUT_FRMT))
        # labels corrected: these lines report probabilities, not counts of permutations equal to observed
        output_text.append("Right-tailed p = " + format(permuted_right_p, OUT_FRMT))
        if tail == "both":
            output_text.append("Two-tailed p = " + format(permuted_two_p, OUT_FRMT))
        output_text.append("")
    else:
        permuted_left_p, permuted_right_p, permuted_two_p = 1, 1, 1

    mantel_output = namedtuple("mantel_output", ["r", "p_value", "output_text", "permuted_left_p",
                                                 "permuted_right_p", "permuted_two_p", "permuted_r_list",
                                                 "z_score"])
    return mantel_output(r, p_value, output_text, permuted_left_p, permuted_right_p, permuted_two_p,
                         permuted_r_list, z_score)