def snpWeights(L, Pi, f, K, t):
    """Compute SNP weights from a truncated SVD of the standardized dosages.

    Parameters
    ----------
    L : array passed through to ``covariance_cy.updatePCAngsd``.
    Pi : 2-D array whose shape ``(n, m)`` fixes the size of the dosage matrix.
    f : array passed through to ``covariance_cy.standardizeE``.
    K : number of singular triplets requested from ``svds``.
    t : passed through to the ``covariance_cy`` helpers
        (presumably a thread count -- confirm against those helpers).

    Returns
    -------
    Array with one row per column of ``Pi`` and ``K`` columns: the right
    singular vectors scaled by ``s**2 / m``.
    """
    num_ind, num_sites = Pi.shape

    # Scratch matrix that the Cython helpers fill and rescale in place.
    dosages = np.empty((num_ind, num_sites), dtype=np.float32)
    covariance_cy.updatePCAngsd(L, Pi, dosages, t)
    covariance_cy.standardizeE(dosages, f, t)

    # Truncated SVD of the normalized expected genotypes; only the singular
    # values and right singular vectors are needed.
    _, sing_vals, right_vecs = svds(dosages, k=K)

    # Both are reversed together so component order stays matched
    # (presumably svds yields ascending singular values -- confirm).
    eig_scale = sing_vals[::-1] ** 2
    return right_vecs[::-1, :].T * eig_scale / num_sites
def selectionScan(L, Pi, f, K, t):
    """Compute per-site selection statistics for the top ``K`` components.

    Parameters
    ----------
    L : array passed through to ``covariance_cy.updatePCAngsd``.
    Pi : 2-D array whose shape ``(n, m)`` fixes the size of the dosage matrix.
    f : array passed through to ``covariance_cy.standardizeE``.
    K : number of singular triplets requested from ``svds``.
    t : passed through to the ``covariance_cy`` helpers
        (presumably a thread count -- confirm against those helpers).

    Returns
    -------
    ``(m, K)`` float32 array filled in place by ``shared.computeD``.
    """
    num_ind, num_sites = Pi.shape

    # Output buffer, populated by shared.computeD below.
    stats = np.empty((num_sites, K), dtype=np.float32)

    # Scratch matrix that the Cython helpers fill and rescale in place.
    dosages = np.empty((num_ind, num_sites), dtype=np.float32)
    covariance_cy.updatePCAngsd(L, Pi, dosages, t)
    covariance_cy.standardizeE(dosages, f, t)

    # Only the right singular vectors of the normalized expected genotypes
    # are used; their row order is reversed before being handed on.
    right_vecs = svds(dosages, k=K)[2]
    right_vecs = right_vecs[::-1, :]

    shared.computeD(right_vecs, stats)
    return stats
def kinshipConomos(L, Pi, t):
    """Estimate a pairwise kinship matrix as numerator / (4 * denominator).

    Parameters
    ----------
    L : array passed through to the ``covariance_cy`` / ``kinship_cy`` helpers.
    Pi : 2-D array of shape ``(n, m)``; fixes all buffer sizes.
    t : passed through to the Cython helpers
        (presumably a thread count -- confirm against those helpers).

    Returns
    -------
    ``(n, n)`` float32 array whose off-diagonal entries are the ratio of the
    two Gram matrices and whose diagonal is taken from
    ``kinship_cy.diagKinship``.
    """
    num_ind, num_sites = Pi.shape

    # Work buffers
    work = np.empty((num_ind, num_sites), dtype=np.float32)
    diag = np.empty(num_ind, dtype=np.float32)
    numer = np.empty((num_ind, num_ind), dtype=np.float32)
    denom = np.empty((num_ind, num_ind), dtype=np.float32)

    # Dosages
    covariance_cy.updatePCAngsd(L, Pi, work, t)

    # Diagonal is computed separately and inserted at the end.
    kinship_cy.diagKinship(L, Pi, diag, t)

    # NOTE: the order of the next four calls matters. The same `work` buffer
    # is passed to numeratorKin and then to denominatorKin, each followed by
    # a Gram product (presumably both helpers rewrite `work` in place).
    kinship_cy.numeratorKin(work, Pi, t)
    np.dot(work, work.T, out=numer)
    kinship_cy.denominatorKin(work, Pi, t)
    np.dot(work, work.T, out=denom)

    # kinship = numerator / (4 * denominator), computed in place
    np.multiply(denom, 4, out=denom)
    np.divide(numer, denom, out=numer)

    np.fill_diagonal(numer, diag)  # Insert correct diagonal
    return numer
def _mapTestK(C):
    """Choose the number of principal components with Velicer's MAP test.

    ``C`` is a symmetric ``(n, n)`` matrix. At most ``min(n - 1, 15)``
    components are considered; the returned value is at least 1.
    """
    n = C.shape[0]
    maxK = min(n - 1, 15)

    eigVals, eigVecs = eigsh(C, k=maxK)  # Eigendecomposition (Symmetric) - ARPACK
    eigVals = eigVals[::-1]  # Sorted eigenvalues
    eigVals[eigVals < 0] = 0
    eigVecs = eigVecs[:, ::-1]  # Sorted eigenvectors
    loadings = eigVecs * np.sqrt(eigVals)

    # One entry per candidate component, matching the k requested from eigsh.
    mapTest = np.empty(maxK, dtype=np.float32)

    # Loop over the candidate eigenvalues (Shriner implementation)
    for eig in range(maxK):
        top = loadings[:, 0:(eig + 1)]
        partcov = C - np.dot(top, top.T)
        d = np.diag(partcov)

        if np.any(np.isnan(d)) or np.any(d <= 0):
            # Partial correlations are undefined; 1 is the largest value the
            # normalized statistic below can take, so this K is never chosen
            # over a valid one.
            mapTest[eig] = 1
        else:
            d = np.diagflat(1 / np.sqrt(d))
            pr = np.dot(d, np.dot(partcov, d))
            # Average squared off-diagonal partial correlation. pr is
            # (n, n): n unit diagonal entries and n * (n - 1) off-diagonal
            # ones, so normalize by the matrix dimension, not the number of
            # sites.
            mapTest[eig] = (np.sum(pr**2) - n) / (n * (n - 1))

    return max([1, np.argmin(mapTest) + 1])  # Number of principal components retained


def pcaEM(L, e, f, m_iter, m_tole, no_std, t):
    """Iteratively estimate individual allele frequencies and a covariance matrix.

    Parameters
    ----------
    L : 2-D likelihood matrix with three rows per individual and one column
        per site (``L.shape[0] // 3`` is used as the number of individuals).
    e : number of principal components to use; ``0`` selects it
        automatically with Velicer's MAP test.
    f : array passed to the ``covariance_cy`` helpers
        (presumably population allele frequencies -- confirm with caller).
    m_iter : maximum number of estimation rounds. With ``e == 0`` and
        ``m_iter == 0`` the initial covariance matrix is returned directly.
    m_tole : convergence threshold on the RMSE between successive ``W``
        returned by ``estimateSVD``.
    no_std : if true, use ``covPCAngsdNoStd`` + ``centerE`` for the final
        covariance matrix instead of ``covPCAngsd`` + ``standardizeE``.
    t : passed through to the ``covariance_cy`` helpers
        (presumably a thread count -- confirm against those helpers).

    Returns
    -------
    ``(C, Pi, K)``: the ``(n, n)`` covariance matrix, the ``(n, m)``
    individual allele frequency matrix (``None`` on the early return), and
    the number of principal components used.
    """
    n, m = L.shape  # Dimension of likelihood matrix
    n //= 3  # Number of individuals
    K = e

    # Initiate matrices
    E = np.empty((n, m), dtype=np.float32)
    dCov = np.zeros(n, dtype=np.float32)

    # Estimate covariance matrix (Fumagalli) and infer number of PCs
    if K == 0:
        # Prepare dosages and diagonal
        covariance_cy.covFumagalli(L, f, E, dCov, t)
        covariance_cy.standardizeE(E, f, t)
        C = np.dot(E, E.T) / m
        np.fill_diagonal(C, dCov)

        if m_iter == 0:
            print("Returning with ngsTools covariance matrix!")
            return C, None, K

        # Velicer's Minimum Average Partial (MAP) Test
        K = _mapTestK(C)
        print("Using " + str(K) + " principal components (MAP test)")
    else:
        print("Using " + str(K) + " principal components (manually selected)")

    # Prepare dosages for both branches: E was standardized above when the
    # MAP test ran (presumably updateFumagalli refills E from L and f).
    covariance_cy.updateFumagalli(L, f, E, t)

    # Estimate individual allele frequencies
    Pi = np.empty((n, m), dtype=np.float32)
    covariance_cy.centerE(E, f, t)
    W = estimateSVD(E, K, Pi)
    covariance_cy.updatePi(Pi, f, t)
    prevW = np.copy(W)
    print("Individual allele frequencies estimated (1)")

    # Iterative estimation (round 1 was done above, hence the start at 2)
    for iteration in range(2, m_iter + 1):
        covariance_cy.updatePCAngsd(L, Pi, E, t)

        # Estimate individual allele frequencies
        covariance_cy.centerE(E, f, t)
        W = estimateSVD(E, K, Pi)
        covariance_cy.updatePi(Pi, f, t)

        # Break iterative update if converged
        diff = covariance_cy.rmse2d_eig(W, prevW)
        print("Individual allele frequencies estimated (" + str(iteration) + "). RMSE=" + str(diff))
        if diff < m_tole:
            print("Estimation of individual allele frequencies has converged.")
            break
        prevW = np.copy(W)
    del W, prevW

    # Estimate covariance matrix (PCAngsd)
    if no_std:
        covariance_cy.covPCAngsdNoStd(L, f, Pi, E, dCov, t)
        covariance_cy.centerE(E, f, t)
    else:
        covariance_cy.covPCAngsd(L, f, Pi, E, dCov, t)
        covariance_cy.standardizeE(E, f, t)
    C = np.dot(E, E.T) / m
    np.fill_diagonal(C, dCov)
    del E  # Release the dosage buffer before returning
    return C, Pi, K
# Optional outputs. Each array is converted with astype(float) before saving;
# the messages below report the ".npy" suffix on the written file names.
if args.maf_save:
    np.save(args.o + ".maf", f.astype(float))
    print("Saved population allele frequencies as " + str(args.o) + ".maf.npy (Binary)\n")

if args.indf_save:
    np.save(args.o + ".indf", Pi.astype(float))
    print("Saved individual allele frequencies as " + str(args.o) + ".indf.npy (Binary)\n")

if args.dosage_save:
    # Imported lazily: only needed when dosages are requested.
    import covariance_cy
    # L holds three rows per individual, so the dosage matrix has
    # L.shape[0] // 3 rows and one column per column of L.
    E = np.empty((L.shape[0] // 3, L.shape[1]), dtype=np.float32) # Dosage matrix
    covariance_cy.updatePCAngsd(L, Pi, E, args.threads)
    np.save(args.o + ".dosage", E.astype(float))
    print("Saved genotype dosages as " + str(args.o) + ".dosage.npy (Binary)\n")
    del E

if args.post_save or args.sites_save:
    print("Loading site information")
    # Imported lazily: pandas is only needed for the site information table.
    import pandas as pd
    if args.beagle is not None:
        # Read only the first three columns of the gzipped, tab-separated
        # Beagle file, keeping every value as a string.
        infoDF = pd.read_csv(args.beagle, sep="\t", header=0, usecols=[0, 1, 2], compression="gzip", dtype=str)