예제 #1
0
def nearest_neighbor_connections(distances: numpy.ndarray,
                                 k: int = 1,
                                 symmetric: bool = True) -> Connections:
    """
    connect each point to its k nearest neighbors

    individual points can be connected to more than k points because of ties and because a pair are not necessarily
    each other's nearest neighbors

    :param distances: an n x n matrix containing the distances among a set of points
    :param k: the number of nearest neighbors to connect; values larger than n - 1 connect every other point and
              values of zero or less connect nothing
    :param symmetric: should connections always be symmetric (default = True) or allow asymmetric connections because
                      the nearest neighbor of one point A does not necessarily have A as its nearest neighbor
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n, symmetric)
    for i in range(n):
        # every other point, ordered from closest to farthest (ties broken by point index)
        dists = sorted((distances[i, j], j) for j in range(n) if j != i)
        # never ask for more neighbors than exist, and never for a negative number of them
        c = max(0, min(k, len(dists)))
        # this accounts for ties: while the next candidate is exactly as far away as the last neighbor already
        # selected, include it as well; the bounds check stops the scan at the end of the list
        while 0 < c < len(dists) and dists[c][0] == dists[c - 1][0]:
            c += 1
        for _, neighbor in dists[:c]:  # connect the c closest points to the ith point
            output.store(i, neighbor)
    return output
예제 #2
0
def residuals_from_simple_matrix_regression(y: numpy.ndarray, x: numpy.ndarray) -> numpy.ndarray:
    """
    regress matrix y on matrix x with a single-predictor linear model and return the matrix of residuals

    the diagonals of both input matrices are assumed to be all zeros, so the sums over the full matrices equal the
    sums over the off-diagonal cells; only the off-diagonal cells are counted as observations

    :param y: matrix of dependent variables, as a square numpy.ndarray
    :param x: matrix of independent variables, as a square numpy.ndarray
    :return: matrix of residuals of the simple linear regression of y on x, with its diagonal set to zero
    """
    size = check_for_square_matrix(y)
    n_cells = size**2 - size  # off-diagonal cells only

    total_x = numpy.sum(x)
    total_y = numpy.sum(y)
    total_xx = numpy.sum(numpy.square(x))
    total_xy = numpy.sum(numpy.multiply(x, y))

    # the means deliberately exclude the diagonal cells from the denominator
    mean_x = total_x / n_cells
    mean_y = total_y / n_cells

    slope = (total_xy - n_cells*mean_x*mean_y) / (total_xx - n_cells*mean_x**2)
    intercept = mean_y - slope*mean_x

    resid = y - intercept - slope*x
    # subtracting the intercept made the diagonal non-zero; put the zeros back
    numpy.fill_diagonal(resid, 0)
    return resid
예제 #3
0
def minimum_spanning_tree(distances: numpy.ndarray) -> Connections:
    """
    build a minimum spanning tree over the points and return it as a set of connections

    the tree is grown one point at a time: starting from point 0, each pass finds the shortest distance between a
    point already in the tree and a point not yet in it, and adds that link. The original author cites Kruskal,
    Joseph B., Jr. 1956. On the shortest spanning subtree of a graph and the traveling salesman problem.
    Proceedings of the American Mathematical Society 7(1):48-50 as the loose inspiration.

    :param distances: an n x n matrix containing distances among points
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    # order[:n_in_tree] holds the points already in the tree, order[n_in_tree:] the points still outside it
    order = list(range(n))
    for n_in_tree in range(1, n):
        best_in, best_out = 0, n_in_tree
        shortest = distances[order[best_in], order[best_out]]
        for inside in range(n_in_tree):
            for outside in range(n_in_tree, n):
                candidate = distances[order[inside], order[outside]]
                if candidate < shortest:
                    best_in, best_out = inside, outside
                    shortest = candidate
        # make connection
        output.store(order[best_in], order[best_out])
        # move the newly attached point to the end of the "in tree" section
        order[n_in_tree], order[best_out] = order[best_out], order[n_in_tree]
    return output
예제 #4
0
def bearing_analysis(data: numpy.ndarray, distances: numpy.ndarray, angles: numpy.ndarray, nbearings: int = 36,
                     npermutations: int = 0) -> Tuple[list, list]:
    """
    Run a bearing analysis, which tests scattered data for anisotropic (direction-dependent) pattern. The method
    was originally described in:

    Falsetti, A.B., and R.R. Sokal. 1993. Genetic structure of human populations in the British Isles. Annals of
    Human Biology 20:215-229.

    For each tested bearing the geographic distances are weighted by the squared cosine of the difference between
    the pairwise angle and the bearing, and the weighted matrix is compared to the data matrix with a Mantel test.

    :param data: an n x n matrix representing distances among data values
    :param distances: an n x n matrix representing geographic distances among data points
    :param angles: an n x n matrix representing geographic angles among data points
    :param nbearings: the number of bearings to test, spread evenly over 180 degrees; the default is 36 (every 5
                      degrees)
    :param npermutations: the number of random permutations used to test the correlations; the default is 0
    :return: a named tuple (output_values, output_text); each row of output_values is [bearing in degrees,
             correlation, probability], with the permutation probability appended when npermutations > 0
    """
    n = check_for_square_matrix(data)
    for other in (distances, angles):
        if n != check_for_square_matrix(other):
            raise ValueError("input matrices must be the same size")

    use_permutations = npermutations > 0
    angle_width = pi / nbearings
    output = []
    for a in range(nbearings):
        test_angle = a * angle_width
        weights = numpy.square(numpy.cos(angles - test_angle))
        b_matrix = distances * weights
        r, p_value, _, _, _, rand_p, _, _ = pyssage.mantel.mantel(data, b_matrix, [], npermutations)
        row = [a*180/nbearings, r, p_value]
        if use_permutations:
            row.append(rand_p)
        output.append(row)

    # create basic output text
    output_text = [
        "Bearing Analysis",
        "",
        "Tested {} vectors".format(nbearings),
        "",
    ]
    col_headers = ["Bearing", "Correlation", "Prob"]
    if use_permutations:
        col_headers.append("RandProb")
    col_formats = ["f"] * len(col_headers)
    create_output_table(output_text, output, col_headers, col_formats)

    bearing_output = namedtuple("bearing_output", ["output_values", "output_text"])
    return bearing_output(output, output_text)
예제 #5
0
def square_matrix_covariance(x: numpy.ndarray, y: numpy.ndarray) -> float:
    """
    compute the covariance between the cells of two square matrices

    both diagonals are assumed to be all zeros, so only the n**2 - n off-diagonal cells are counted as observations;
    the matrix size is taken from y

    :param x: first matrix, as a square numpy.ndarray
    :param y: second matrix, as a square numpy.ndarray
    :return: the covariance of the two matrices
    """
    size = check_for_square_matrix(y)
    total_x = numpy.sum(x)
    total_y = numpy.sum(y)
    total_xy = numpy.sum(numpy.multiply(x, y))
    n_cells = size**2 - size  # number of off-diagonal elements

    sum_of_cross_products = total_xy - total_x*total_y/n_cells
    return sum_of_cross_products / (n_cells - 1)
예제 #6
0
def residuals_from_multi_matrix_regression(y: numpy.ndarray, x_list: list) -> numpy.ndarray:
    """
    regress matrix y on all of the matrices in x_list at once (multiple linear regression) and return the residuals

    each matrix is flattened with flatten_without_diagonal; the design matrix is a column of ones (the intercept)
    followed by one column per matrix in x_list, and the coefficients come from the normal equations

    :param y: matrix of dependent variables, as a square numpy.ndarray
    :param x_list: list of predictor matrices; each must flatten to the same length as y
    :return: the residuals, rebuilt into matrix form by deflatten_without_diagonal
    """
    n = check_for_square_matrix(y)
    # the response as a single column of values
    response = flatten_without_diagonal(y)
    n_obs, n_terms = len(response), len(x_list) + 1
    # column 0 stays as ones for the intercept; columns 1.. hold the predictors
    design = numpy.ones((n_obs, n_terms), dtype=float)
    for col, predictor in enumerate(x_list, start=1):
        design[:, col] = flatten_without_diagonal(predictor)
    design_t = design.T
    coefficients = numpy.linalg.inv(design_t @ design) @ design_t @ response
    fitted = design @ coefficients
    return deflatten_without_diagonal(response - fitted, n)
예제 #7
0
def create_windrose_connections(distances: numpy.ndarray,
                                angles: numpy.ndarray, annulus: int,
                                sector: int, a: int, c: float, d: float,
                                e: float) -> Tuple[Connections, float, float]:
    """
    connect every pair of points whose distance and angle fall inside one sector of one windrose annulus

    the annulus spans distances from c*annulus**2 + d*annulus + e (inclusive) up to the same expression evaluated at
    annulus + 1 (exclusive); the sector spans angles from sector*breadth (inclusive) to (sector + 1)*breadth
    (exclusive), where breadth is pi divided by windrose_sectors_per_annulus(a, annulus)

    :param distances: an n x n matrix containing distances among points
    :param angles: an n x n matrix containing angles among points
    :param annulus: index of the distance ring to use
    :param sector: index of the angular sector within the ring
    :param a: passed to windrose_sectors_per_annulus to determine the number of sectors in this annulus
    :param c: quadratic coefficient of the annulus boundary function
    :param d: linear coefficient of the annulus boundary function
    :param e: constant term of the annulus boundary function
    :return: a tuple of the Connections object (with min_scale and max_scale set to the annulus bounds), the lower
             angle bound of the sector, and the upper angle bound of the sector
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    inner = c * annulus**2 + d * annulus + e
    outer = c * (annulus + 1)**2 + d * (annulus + 1) + e
    output.min_scale = inner
    output.max_scale = outer
    sector_breadth = pi / windrose_sectors_per_annulus(a, annulus)
    sector_min = sector * sector_breadth
    sector_max = (sector + 1) * sector_breadth
    # only the lower triangle is scanned, so each unordered pair is tested once
    for i in range(1, n):
        for j in range(i):
            if not (inner <= distances[i, j] < outer):
                continue
            if sector_min <= angles[i, j] < sector_max:
                output.store(i, j)
    return output, sector_min, sector_max
예제 #8
0
def relative_neighborhood_network(distances: numpy.ndarray) -> Connections:
    """
    build a relative neighborhood network from a distance matrix

    two points are connected unless some third point is strictly closer to both of them than they are to each other

    :param distances: an n x n matrix containing distances among points
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    for i in range(1, n):
        for j in range(i):
            span = distances[i, j]
            blocked = any(
                k != i and k != j and distances[k, j] < span and distances[k, i] < span
                for k in range(n)
            )
            if not blocked:
                output.store(i, j)
    return output
예제 #9
0
def gabriel_network(distances: numpy.ndarray) -> Connections:
    """
    build a Gabriel network from a distance matrix

    two points are connected unless, for some third point, the squared distance between the pair exceeds the sum of
    the squared distances from the third point to each member of the pair

    :param distances: an n x n matrix containing distances among points
    :return: returns a Connections object
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    sq_distances = numpy.square(distances)
    for i in range(1, n):
        for j in range(i):
            pair_sq = sq_distances[i, j]
            blocked = any(
                k != i and k != j and pair_sq > sq_distances[k, j] + sq_distances[k, i]
                for k in range(n)
            )
            if not blocked:
                output.store(i, j)
    return output
예제 #10
0
def connect_distance_range(distances: numpy.ndarray,
                           maxdist: float,
                           mindist: float = 0) -> Connections:
    """
    connect every pair of points whose distance lies in the half-open range [mindist, maxdist)

    only pairs of distinct points are examined, so a point is never connected to itself even when mindist is zero

    :param distances: an n x n matrix containing distances among points
    :param maxdist: the maximum distance between points to connect. this distance is exclusive
    :param mindist: the minimum distance between points to connect (default = 0). this distance is inclusive
    :return: returns a Connections object, with min_scale and max_scale set to mindist and maxdist
    """
    n = check_for_square_matrix(distances)
    output = Connections(n)
    output.min_scale, output.max_scale = mindist, maxdist
    # walk the lower triangle so each unordered pair is tested once
    lower_triangle = ((row, col) for row in range(n) for col in range(row))
    for row, col in lower_triangle:
        if mindist <= distances[row, col] < maxdist:
            output.store(row, col)
    return output
예제 #11
0
def _fit_line(x, y, a: int, b: int) -> tuple:
    """return (is_vertical, slope, intercept) of the line through points a and b; slope and intercept are None
    when the line is vertical"""
    if x[a] == x[b]:
        return True, None, None
    slope = (y[a] - y[b]) / (x[a] - x[b])
    return False, slope, y[a] - slope * x[a]


def _strictly_between(value, end1, end2) -> bool:
    """return True if value lies strictly between end1 and end2, whichever order they are in"""
    return (end1 < value < end2) or (end1 > value > end2)


def _within(value, end1, end2) -> bool:
    """return True if value lies between end1 and end2 inclusive, whichever order they are in"""
    return (end1 <= value <= end2) or (end1 >= value >= end2)


def _segments_conflict(x, y, seg1: tuple, line1: tuple, seg2: tuple, line2: tuple) -> bool:
    """return True if segment seg1 crosses or overlaps segment seg2; line1 and line2 are their _fit_line results"""
    i, j = seg1
    p, q = seg2
    vertical1, m1, b1 = line1
    vertical2, m2, b2 = line2

    if vertical1 and vertical2:
        # two vertical segments can only overlap when they share an x position and an end point of the second
        # segment lies between the end points of the first
        if x[i] != x[p]:
            return False
        return _strictly_between(y[p], y[i], y[j]) or _strictly_between(y[q], y[i], y[j])

    if vertical1:
        # one segment is vertical; calculate the y of the other line at that x position
        xc = x[i]
        yc = m2 * xc + b2
    elif vertical2:
        xc = x[p]
        yc = m1 * xc + b1
    elif m1 == m2:
        # parallel segments can only overlap if they lie on the same line (identical intercepts)
        if b1 != b2:
            return False
        # neither segment is vertical, so position along the shared line is ordered by x; comparing x rather
        # than y also handles horizontal segments, where every y value is identical
        return _strictly_between(x[p], x[i], x[j]) or _strictly_between(x[q], x[i], x[j])
    else:
        xc = (b2 - b1) / (m1 - m2)
        yc = m1 * xc + b1

    # xc, yc is the projected crossing point of the two lines; the segments cross if it falls within both
    return (_within(xc, x[i], x[j]) and _within(yc, y[i], y[j]) and
            _within(xc, x[p], x[q]) and _within(yc, y[p], y[q]))


def least_diagonal_network(x: numpy.ndarray, y: numpy.ndarray,
                           distances: numpy.ndarray) -> Connections:
    """
    calculate connections among points based on a least diagonal network

    all pairs of points are visited from closest to farthest; a pair is connected unless the line segment joining
    it would cross or overlap the segment of a pair that has already been connected. Pairs that share a point with
    the candidate pair are not tested against it.

    :param x: the x coordinates of n points
    :param y: the y coordinates of n points
    :param distances: an n x n matrix containing the distances among the points defined by x and y
    :return: returns a Connections object
    :raises ValueError: if x or y does not have one coordinate per row of the distance matrix
    """
    n = check_for_square_matrix(distances)
    if (n != len(x)) or (n != len(y)):
        raise ValueError(
            "The coordinate arrays and the distance matrix must have the same length"
        )
    output = Connections(n)
    # flatten distances into one dimension (half matrix only), but also track position in matrix, then order
    # the pairs from closest to farthest
    candidates = sorted((distances[i, j], i, j) for i in range(n) for j in range(i))

    accepted = []  # (segment, fitted line) for every pair connected so far
    for _, i, j in candidates:  # need the point indices, not the actual distance
        segment = (i, j)
        line = _fit_line(x, y, i, j)
        # compare to previously added links; links that share an end point with the candidate cannot cross it
        conflict = any(
            i not in other and j not in other and _segments_conflict(x, y, segment, line, other, other_line)
            for other, other_line in accepted
        )
        if not conflict:
            accepted.append((segment, line))

    for (i, j), _ in accepted:
        output.store(i, j)
    return output
예제 #12
0
def mantel(input_matrix1: numpy.ndarray, input_matrix2: numpy.ndarray, partial, permutations: int = 0,
           tail: str = "both") -> Tuple[float, float, list, float, float, float, list, float]:
    """
    perform a Mantel test (or partial Mantel test) of the association between two square matrices

    the observed Z is the sum of the element-wise products of the two matrices and the correlation r is their
    covariance divided by the square root of the product of their self-covariances; both calculations assume the
    matrix diagonals are all zeros. The asymptotic probability comes from the normal distribution, using the
    expected mean and standard deviation returned by mantel_moments. When partial matrices are given, both input
    matrices are first replaced by their residuals from residuals_from_matrix_regression on those matrices.

    :param input_matrix1: the first matrix, as a square numpy.ndarray
    :param input_matrix2: the second matrix, as a square numpy.ndarray of the same size
    :param partial: a sequence of matrices to hold constant; pass an empty sequence for a standard Mantel test
    :param permutations: the number of permutations for the randomization test; the observed arrangement is counted
                         as the first one, so permutations - 1 random rearrangements are drawn. The default of 0
                         skips the permutation test
    :param tail: which tail the returned asymptotic p_value refers to; "both" (default) gives the two-tailed value,
                 "right" the right tail, and any other value accepted by check_tail the left tail
    :return: a named tuple with the fields r, p_value, output_text, permuted_left_p, permuted_right_p,
             permuted_two_p, permuted_r_list and z_score; the permuted probabilities are all 1 when no
             permutations are requested, and permuted_r_list always starts with the observed r
    :raises ValueError: if the input or partial matrices are not all the same size
    """
    check_tail(tail)
    n = check_for_square_matrix(input_matrix1)
    if n != check_for_square_matrix(input_matrix2):
        raise ValueError("input matrices must be the same size")

    is_partial = len(partial) > 0
    if is_partial:
        for x in partial:
            if n != check_for_square_matrix(x):
                raise ValueError("input matrices must be the same size")
        matrix1 = residuals_from_matrix_regression(input_matrix1, partial)
        matrix2 = residuals_from_matrix_regression(input_matrix2, partial)
    else:
        matrix1 = numpy.copy(input_matrix1)
        matrix2 = numpy.copy(input_matrix2)

    observed_z = numpy.sum(numpy.multiply(matrix1, matrix2))  # assumes diagonals are all zeros

    # for non-partial version the denominator of r is constant no matter the permutation, so only calculate once to
    # save time on two-tailed tests
    sq_cov2 = square_matrix_covariance(matrix2, matrix2)
    sqxy = sqrt(square_matrix_covariance(matrix1, matrix1) * sq_cov2)

    r = square_matrix_covariance(matrix1, matrix2) / sqxy
    observed_mu, observed_std = mantel_moments(matrix1, matrix2)
    z_score = (observed_z - observed_mu) / observed_std
    p_value = scipy.stats.norm.cdf(z_score)  # left-tailed probability

    # create basic output text
    # TODO: the matrix names are not reported because none are available to this function
    output_text = list()
    output_text.append("Mantel Test")
    output_text.append("")
    output_text.append("Matrices are {0} x {0}".format(n))
    if is_partial:
        output_text.append("{} matrices held constant".format(len(partial)))
    output_text.append("")
    output_text.append("Observed Z = " + format(observed_z, OUT_FRMT))
    output_text.append("Correlation = " + format(r, OUT_FRMT))
    output_text.append("t = " + format(z_score, OUT_FRMT))
    output_text.append("Left-tailed p = " + format(p_value, OUT_FRMT))
    output_text.append("Right-tailed p = " + format(1 - p_value, OUT_FRMT))
    output_text.append("Two-tailed p = " + format(2*min(p_value, 1 - p_value), OUT_FRMT))
    output_text.append("")

    # change p_value to requested tail
    if tail == "both":
        p_value = 2*min(p_value, 1 - p_value)
    elif tail == "right":
        p_value = 1 - p_value

    # perform permutation tests
    permuted_r_list = [r]
    if permutations > 0:
        cumulative_left = 0
        cumulative_right = 0
        cumulative_equal = 1  # the observed arrangement equals itself
        cumulative_total = 1
        for _ in range(permutations - 1):  # count observed as first permutation
            # created permuted version of matrix 1, permuting rows and columns in tandem
            rand_order = numpy.arange(n)
            numpy.random.shuffle(rand_order)
            matrix1 = input_matrix1[numpy.ix_(rand_order, rand_order)]

            if is_partial:  # if partial, calculate residuals for permuted matrix
                matrix1 = residuals_from_matrix_regression(matrix1, partial)

            # r (rather than the cheaper Z) is computed for every permutation so the permuted values can be
            # returned to the caller
            numerator = square_matrix_covariance(matrix1, matrix2)
            if is_partial:
                denominator = sqrt(square_matrix_covariance(matrix1, matrix1) * sq_cov2)
            else:  # for non-partial tests can save computation as denominator is fixed
                denominator = sqxy
            permuted_r = numerator / denominator
            if permuted_r < r:
                cumulative_left += 1
            elif permuted_r > r:
                cumulative_right += 1
            else:
                cumulative_equal += 1
            if abs(permuted_r) >= abs(r):
                cumulative_total += 1
            permuted_r_list.append(permuted_r)

        permuted_right_p = (cumulative_equal + cumulative_right) / permutations
        permuted_left_p = (cumulative_equal + cumulative_left) / permutations
        if tail == "both":
            permuted_two_p = cumulative_total / permutations
        else:
            permuted_two_p = 1
        output_text.append("Probability results from {} permutation".format(permutations))
        output_text.append("# of permutations < observed = {}".format(cumulative_left))
        output_text.append("# of permutations > observed = {}".format(cumulative_right))
        if tail == "both":
            output_text.append("# of permutations >= |observed| = {}".format(cumulative_total))
        output_text.append("# of permutations = observed = {}".format(cumulative_equal))
        output_text.append("")
        output_text.append("Left-tailed p = " + format(permuted_left_p, OUT_FRMT))
        output_text.append("Right-tailed p = " + format(permuted_right_p, OUT_FRMT))
        if tail == "both":
            output_text.append("Two-tailed p = " + format(permuted_two_p, OUT_FRMT))
        output_text.append("")
    else:
        permuted_left_p, permuted_right_p, permuted_two_p = 1, 1, 1

    mantel_output = namedtuple("mantel_output", ["r", "p_value", "output_text", "permuted_left_p", "permuted_right_p",
                                                 "permuted_two_p", "permuted_r_list", "z_score"])
    return mantel_output(r, p_value, output_text, permuted_left_p, permuted_right_p, permuted_two_p, permuted_r_list,
                         z_score)