# Intersect the genotype sample IDs (fam['iid']) with the phenotype sample IDs
# (pheno['IID']).
# NOTE(review): `intersect_mtlb` is defined outside this chunk; presumably it
# returns the shared values plus index vectors into each input -- confirm.
c, ia, ib = intersect_mtlb(fam['iid'], pheno['IID'])
logger.info(str(len(ia)) + ' subjects found to have genotype data\n')

# Final sample assignment: keep the phenotype rows selected by `ib`, renumber
# them 0..n-1, and keep the genotype columns selected by `ia`.
pheno = pheno.iloc[ib].reset_index(drop=True)
geno_ia = geno[:, ia]

# Null model: Cox proportional-hazards fit on time, event and covariates only.
logger.info('Generating Null models\n')
null_model_cols = [T_name, event_name] + covname
cph = CoxPHFitter()
cph.fit(pheno[null_model_cols], T_name, event_col=event_name)

# Martingale residuals of the null model, sorted by index before the column is
# extracted (presumably so that rows line up with `pheno` -- confirm).
# An alternative using 'deviance' residuals is currently disabled.
martingale_resid = cph.compute_residuals(pheno[null_model_cols], 'martingale')
res_surv = martingale_resid.sort_index()['martingale']

# This is the most memory intensive part. It might need to change if we are
# dealing with biobank-scale data.
if args.apr_flag == 'N':
    logger.info('Calculating Null covariance matrix\n')

    # Predicted cumulative hazard; its index is compared against each
    # subject's time below, and it has one column per subject.
    mat = cph.predict_cumulative_hazard(pheno)

    # Row-to-row increments of the cumulative hazard.
    P = np.diff(mat, axis=0)

    # For each subject, zero every increment from the row whose index value is
    # closest to that subject's time onwards.
    subject_times = pheno[T_name]
    for isubj in range(P.shape[1]):
        idx = np.abs(mat.index - subject_times[isubj]).argmin()
        P[idx:, isubj] = 0

    # V = diag(event - martingale residual) - P'P
    V = da.diag(np.array(pheno[event_name] - res_surv)) - da.dot(P.T, P)

    # C = V - V X (X' V X)^-1 X' V, with X the covariate matrix.
    X = np.array(pheno[covname])
    V_X = da.matmul(V, X)
    Xt_V = da.matmul(X.T, V)
    Xt_V_X_inv = da.linalg.inv(da.matmul(Xt_V, X))
    C = V - da.matmul(da.matmul(V_X, Xt_V_X_inv), Xt_V)
else:
    logger.info('Using first order approximations for testing statistics\n')