def cell_type_specific(args, log):
    '''Cell type specific analysis.

    For every non-blank line of the file named by ``args.ref_ld_chr_cts``
    (two whitespace-separated fields: a name and a comma-separated list of
    LD score prefixes) the cell-type LD scores are stacked in front of the LD
    scores returned by ``_read_ld_sumstats`` and a ``reg.Hsq`` regression is
    fitted. The first coefficient of each fit, its standard error and the
    upper-tail normal P-value ``stats.norm.sf(coef / se)`` are collected,
    sorted by P-value and written, tab-separated, to
    ``<args.out>.cell_type_results.txt``.

    Parameters
    ----------
    args : argparse-style namespace
        Reads ``intercept_h2``, ``no_intercept``, ``h2_cts``, ``n_blocks``,
        ``chisq_max``, ``ref_ld_chr_cts``, ``not_M_5_50``, ``print_all_cts``
        and ``out``. A deep copy is modified; the caller's object is not.
    log : object exposing ``log(str)``

    Raises
    ------
    ValueError
        If a cell-type LD score is missing for a retained SNP, or a
        non-blank line does not have exactly two fields.
    '''
    args = copy.deepcopy(args)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    M_annot_all_regr, w_ld_cname, ref_ld_cnames_all_regr, sumstats, novar_cols = \
        _read_ld_sumstats(args, log, args.h2_cts)
    _check_ld_condnum(args, log, ref_ld_cnames_all_regr)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    if args.chisq_max is None:
        chisq_max = max(0.001 * sumstats.N.max(), 80)
    else:
        chisq_max = args.chisq_max

    ii = np.ravel(sumstats.Z**2 < chisq_max)
    # DataFrame.ix no longer exists in current pandas; .loc accepts the mask.
    sumstats = sumstats.loc[ii, :]
    log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
        C=chisq_max, N=np.sum(ii), M=n_snp - np.sum(ii)))
    n_snp = np.sum(ii)  # lambdas are late-binding, so this works
    ref_ld_all_regr = np.array(sumstats[ref_ld_cnames_all_regr]).reshape(
        (len(sumstats), -1))
    chisq = np.array(sumstats.Z**2)
    keep_snps = sumstats[['SNP']]

    s = lambda x: np.array(x).reshape((n_snp, 1))
    results_columns = ['Name', 'Coefficient', 'Coefficient_std_error',
                       'Coefficient_P_value']
    results_data = []

    # Read the whole listing up front so the handle is closed before the
    # (potentially long) regressions start; blank lines are ignored.
    with open(args.ref_ld_chr_cts) as cts_fh:
        cts_entries = [line.split() for line in cts_fh if line.strip()]

    for (name, ct_ld_chr) in cts_entries:
        ref_ld_cts_allsnps = _read_chr_split_files(
            ct_ld_chr, None, log, 'cts reference panel LD Score',
            ps.ldscore_fromlist)
        log.log('Performing regression.')
        # Left-merge on SNP keeps the retained SNP order; column 0 is SNP.
        merged = pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP', how='left')
        ref_ld_cts = np.array(merged.iloc[:, 1:])
        if np.any(np.isnan(ref_ld_cts)):
            raise ValueError('Missing some LD scores from cts files. Are you sure all SNPs in ref-ld-chr are also in ref-ld-chr-cts')

        ref_ld = np.hstack([ref_ld_cts, ref_ld_all_regr])
        M_cts = ps.M_fromlist(
            _splitp(ct_ld_chr), _N_CHR, common=(not args.not_M_5_50))
        M_annot = np.hstack([M_cts, M_annot_all_regr])
        hsqhat = reg.Hsq(s(chisq), ref_ld, s(sumstats[w_ld_cname]),
                         s(sumstats.N), M_annot, n_blocks=n_blocks,
                         intercept=args.intercept_h2, twostep=None,
                         old_weights=True)
        coef, coef_se = hsqhat.coef[0], hsqhat.coef_se[0]
        results_data.append(
            (name, coef, coef_se, stats.norm.sf(coef / coef_se)))
        if args.print_all_cts:
            # One extra row per additional comma-separated prefix.
            for i in range(1, len(ct_ld_chr.split(','))):
                coef, coef_se = hsqhat.coef[i], hsqhat.coef_se[i]
                results_data.append((name + '_' + str(i), coef, coef_se,
                                     stats.norm.sf(coef / coef_se)))

    df_results = pd.DataFrame(data=results_data, columns=results_columns)
    df_results.sort_values(by='Coefficient_P_value', inplace=True)
    df_results.to_csv(args.out + '.cell_type_results.txt', sep='\t',
                      index=False)
    log.log('Results printed to ' + args.out + '.cell_type_results.txt')
def estimate_h2(args, log):
    '''Estimate h2 and partitioned h2.

    Reads LD scores and summary statistics through ``_read_ld_sumstats``,
    optionally drops SNPs whose chi^2 (``Z**2``) is not below the cutoff,
    fits ``reg.Hsq`` and logs its summary.

    The chi^2 cutoff is ``args.chisq_max`` when given. Otherwise, when the
    number of LD score columns is not 1, it defaults to
    ``max(0.001 * max(N), 80)``; with exactly one column and no explicit
    cutoff, no SNPs are filtered. With exactly one column and neither
    ``args.two_step`` nor ``args.intercept_h2`` set, ``two_step`` defaults
    to 30.

    Parameters
    ----------
    args : argparse-style namespace
        Reads ``samp_prev``, ``pop_prev``, ``intercept_h2``, ``no_intercept``,
        ``h2``, ``n_blocks``, ``chisq_max``, ``two_step``, ``print_cov``,
        ``print_delete_vals``, ``overlap_annot``, ``print_coefficients`` and
        ``out``. A deep copy is modified; the caller's object is not.
    log : object exposing ``log(str)``

    Returns
    -------
    The object returned by ``reg.Hsq``.
    '''
    args = copy.deepcopy(args)
    if args.samp_prev is not None and args.pop_prev is not None:
        args.samp_prev, args.pop_prev = map(
            float, [args.samp_prev, args.pop_prev])
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = _read_ld_sumstats(
        args, log, args.h2)
    ref_ld = np.array(sumstats[ref_ld_cnames])
    _check_ld_condnum(args, log, ref_ld_cnames)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    n_annot = len(ref_ld_cnames)
    chisq_max = args.chisq_max
    old_weights = False
    if n_annot == 1:
        if args.two_step is None and args.intercept_h2 is None:
            args.two_step = 30
    else:
        old_weights = True
        if args.chisq_max is None:
            chisq_max = max(0.001 * sumstats.N.max(), 80)

    # Reshape helper: reads n_snp at call time, so it follows the filter below.
    s = lambda x: np.array(x).reshape((n_snp, 1))
    chisq = s(sumstats.Z**2)
    if chisq_max is not None:
        ii = np.ravel(chisq < chisq_max)
        # DataFrame.ix no longer exists in current pandas; .loc accepts the mask.
        sumstats = sumstats.loc[ii, :]
        log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
            C=chisq_max, N=np.sum(ii), M=n_snp - np.sum(ii)))
        n_snp = np.sum(ii)  # lambdas are late-binding, so this works
        ref_ld = np.array(sumstats[ref_ld_cnames])
        chisq = chisq[ii].reshape((n_snp, 1))

    if args.two_step is not None:
        log.log('Using two-step estimator with cutoff at {M}.'.format(
            M=args.two_step))

    hsqhat = reg.Hsq(chisq, ref_ld, s(sumstats[w_ld_cname]), s(sumstats.N),
                     M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                     twostep=args.two_step, old_weights=old_weights)

    if args.print_cov:
        _print_cov(hsqhat, args.out + '.cov', log)
    if args.print_delete_vals:
        _print_delete_values(hsqhat, args.out + '.delete', log)

    log.log(hsqhat.summary(ref_ld_cnames, P=args.samp_prev, K=args.pop_prev,
                           overlap=args.overlap_annot))
    if args.overlap_annot:
        overlap_matrix, M_tot = _read_annot(args, log)
        df_results = hsqhat._overlap_output(
            ref_ld_cnames, overlap_matrix, M_annot, M_tot,
            args.print_coefficients)
        df_results.to_csv(args.out + '.results', sep="\t", index=False)
        log.log('Results printed to ' + args.out + '.results')

    return hsqhat
def estimate_h2(args, log):
    '''Estimate h2 and partitioned h2 for one or more summary-statistics files.

    ``args.h2`` is iterated as a sequence of sumstats inputs. When it holds
    more than one entry, the reference LD scores, M values and regression
    weights are read once up front and a copy is handed to
    ``_read_ld_sumstats`` for each entry; the annot overlap matrix is also
    pre-read, but only when ``args.overlap_annot`` is set (it is not used
    otherwise).

    For each entry the same steps as the single-file estimator are run:
    optional chi^2 filtering, ``reg.Hsq`` fit, optional ``.cov`` /
    ``.delete`` / ``.part_delete`` outputs, summary logging and, with
    ``args.overlap_annot``, a results table. All tables go to the single
    file ``<args.out>.results``: the first one truncates the file and writes
    the header, later ones are appended without a header.

    Parameters
    ----------
    args : argparse-style namespace
        A deep copy is modified; the caller's object is not.
    log : object exposing ``log(str)``

    Returns
    -------
    The single ``reg.Hsq`` result when exactly one was produced, otherwise
    the list of results in input order.
    '''
    args = copy.deepcopy(args)
    if args.samp_prev is not None and args.pop_prev is not None:
        args.samp_prev, args.pop_prev = map(
            float, [args.samp_prev, args.pop_prev])
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    # None means "not pre-read"; _read_ld_sumstats / the overlap branch below
    # are then responsible for reading the data themselves.
    orig_ref_ld = None
    orig_n_annot = None
    orig_M_annot = None
    orig_novar_cols = None
    orig_w_ld = None
    overlap_matrix = None
    M_tot = None
    if len(args.h2) > 1:
        log.log("Pre-reading files to use across all sumstats")
        orig_ref_ld = _read_ref_ld(args, log)
        orig_n_annot = len(orig_ref_ld.columns) - 1
        orig_M_annot = _read_M(args, log, orig_n_annot)
        orig_M_annot, orig_ref_ld, orig_novar_cols = _check_variance(
            log, orig_M_annot, orig_ref_ld)
        orig_w_ld = _read_w_ld(args, log)
        if args.overlap_annot:
            # Only needed for the overlap output; skip the read otherwise.
            overlap_matrix, M_tot = _read_annot(args, log)

    hsqhats = []
    for idx, h2_fh in enumerate(args.h2):
        # Work on copies so one iteration cannot alter the shared originals.
        ref_ld = copy.copy(orig_ref_ld)
        M_annot = copy.copy(orig_M_annot)
        n_annot = copy.copy(orig_n_annot)
        novar_cols = copy.copy(orig_novar_cols)
        w_ld = copy.copy(orig_w_ld)
        M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = _read_ld_sumstats(
            args, log, h2_fh, ref_ld=ref_ld, n_annot=n_annot,
            M_annot=M_annot, novar_cols=novar_cols, w_ld=w_ld)
        ref_ld = np.array(sumstats[ref_ld_cnames])
        _check_ld_condnum(args, log, ref_ld_cnames)
        _warn_length(log, sumstats)
        n_snp = len(sumstats)
        n_blocks = min(n_snp, args.n_blocks)
        n_annot = len(ref_ld_cnames)
        chisq_max = args.chisq_max
        old_weights = False
        if n_annot == 1:
            if args.two_step is None and args.intercept_h2 is None:
                args.two_step = 30
        else:
            old_weights = True
            if args.chisq_max is None:
                chisq_max = max(0.001 * sumstats.N.max(), 80)

        # Reshape helper: reads n_snp at call time, so it follows the filter.
        s = lambda x: np.array(x).reshape((n_snp, 1))
        chisq = s(sumstats.Z**2)
        if chisq_max is not None:
            ii = np.ravel(chisq < chisq_max)
            # DataFrame.ix no longer exists in current pandas; use .loc.
            sumstats = sumstats.loc[ii, :]
            log.log(
                'Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
                    C=chisq_max, N=np.sum(ii), M=n_snp - np.sum(ii)))
            n_snp = np.sum(ii)  # lambdas are late-binding, so this works
            ref_ld = np.array(sumstats[ref_ld_cnames])
            chisq = chisq[ii].reshape((n_snp, 1))

        if args.two_step is not None:
            log.log('Using two-step estimator with cutoff at {M}.'.format(
                M=args.two_step))

        hsqhat = reg.Hsq(chisq, ref_ld, s(sumstats[w_ld_cname]),
                         s(sumstats.N), M_annot, n_blocks=n_blocks,
                         intercept=args.intercept_h2, twostep=args.two_step,
                         old_weights=old_weights)

        if args.print_cov:
            _print_cov(hsqhat, args.out + '.cov', log)
        if args.print_delete_vals:
            _print_delete_values(hsqhat, args.out + '.delete', log)
            _print_part_delete_values(hsqhat, args.out + '.part_delete', log)

        log.log(hsqhat.summary(ref_ld_cnames, P=args.samp_prev,
                               K=args.pop_prev, overlap=args.overlap_annot))
        if args.overlap_annot:
            if overlap_matrix is None or M_tot is None:
                overlap_matrix, M_tot = _read_annot(args, log)
            else:
                log.log("Re-using annot matrix")

            df_results = hsqhat._overlap_output(
                ref_ld_cnames, overlap_matrix, M_annot, M_tot,
                args.print_coefficients)
            # First table creates the file with a header; the rest append.
            with open(args.out + '.results', 'a' if idx > 0 else 'w') as op:
                df_results.to_csv(
                    op,
                    sep="\t",
                    index=False,
                    header=(idx == 0),
                )
            log.log('Results printed to ' + args.out + '.results')

        hsqhats.append(hsqhat)

    return hsqhats if len(hsqhats) > 1 else hsqhats[0]
def cell_type_specific(args, log):
    '''Cell type specific analysis (best-effort per cell type).

    Each line of the file named by ``args.ref_ld_chr_cts`` must hold exactly
    two whitespace-separated fields: a name and a comma-separated list of LD
    score prefixes. For every line the cell-type LD scores are stacked in
    front of the LD scores returned by ``_read_ld_sumstats`` and a
    ``reg.Hsq`` regression is fitted.

    Any exception raised while handling one line (malformed line, missing LD
    scores, a failing regression, ...) is logged with a ``*CTS ERROR*``
    prefix and the loop moves on to the next line. After every successful
    regression the results collected so far are written to
    ``<args.out>.cell_type_results.tmp.txt``; the final table, sorted by
    P-value, goes to ``<args.out>.cell_type_results.txt``.

    Parameters
    ----------
    args : argparse-style namespace
        Reads ``intercept_h2``, ``no_intercept``, ``h2_cts``, ``n_blocks``,
        ``chisq_max``, ``ref_ld_chr_cts``, ``not_M_5_50``, ``print_all_cts``
        and ``out``. A deep copy is modified; the caller's object is not.
    log : object exposing ``log(str)``
    '''
    args = copy.deepcopy(args)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    M_annot_all_regr, w_ld_cname, ref_ld_cnames_all_regr, sumstats, novar_cols = \
        _read_ld_sumstats(args, log, args.h2_cts)
    _check_ld_condnum(args, log, ref_ld_cnames_all_regr)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    if args.chisq_max is None:
        chisq_max = max(0.001 * sumstats.N.max(), 80)
    else:
        chisq_max = args.chisq_max

    ii = np.ravel(sumstats.Z**2 < chisq_max)
    # DataFrame.ix no longer exists in current pandas; .loc accepts the mask.
    sumstats = sumstats.loc[ii, :]
    log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
        C=chisq_max, N=np.sum(ii), M=n_snp - np.sum(ii)))
    n_snp = np.sum(ii)  # lambdas are late-binding, so this works
    ref_ld_all_regr = np.array(sumstats[ref_ld_cnames_all_regr]).reshape(
        (len(sumstats), -1))
    chisq = np.array(sumstats.Z**2)
    keep_snps = sumstats[['SNP']]

    s = lambda x: np.array(x).reshape((n_snp, 1))
    results_columns = [
        'Name', 'Coefficient', 'Coefficient_std_error', 'Coefficient_P_value'
    ]
    results_data = []

    # Read the listing up front so the handle is closed before the loop and
    # the total count is available for progress messages.
    with open(args.ref_ld_chr_cts) as cts_fh:
        cts_lines = cts_fh.readlines()
    n_cts = len(cts_lines)

    for cts_linenum, cts_line in enumerate(cts_lines, start=1):
        # Reset per iteration: if the line cannot be parsed, the error
        # message must not hit an unbound name or reuse the previous one.
        name = '<unparsed line>'
        try:
            # Whitespace-delimited with ONLY two columns; anything else
            # raises ValueError on unpacking and is reported below.
            name, ct_ld_chr = cts_line.split()
            ref_ld_cts_allsnps = _read_chr_split_files(
                ct_ld_chr, None, log, 'cts reference panel LD Score',
                ps.ldscore_fromlist)
            log.log('Performing regression #{}/#{}. CTS name is {}'.format(
                cts_linenum, n_cts, name))
            sys.stdout.flush()
            # Left-merge on SNP keeps the retained SNP order; column 0 is SNP.
            merged = pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP',
                              how='left')
            ref_ld_cts = np.array(merged.iloc[:, 1:])
            if np.any(np.isnan(ref_ld_cts)):
                raise ValueError(
                    'Missing some LD scores from cts files. Are you sure all SNPs in ref-ld-chr are also in ref-ld-chr-cts'
                )

            ref_ld = np.hstack([ref_ld_cts, ref_ld_all_regr])
            M_cts = ps.M_fromlist(_splitp(ct_ld_chr),
                                  _N_CHR,
                                  common=(not args.not_M_5_50))
            M_annot = np.hstack([M_cts, M_annot_all_regr])
            hsqhat = reg.Hsq(s(chisq),
                             ref_ld,
                             s(sumstats[w_ld_cname]),
                             s(sumstats.N),
                             M_annot,
                             n_blocks=n_blocks,
                             intercept=args.intercept_h2,
                             twostep=None,
                             old_weights=True)
            coef, coef_se = hsqhat.coef[0], hsqhat.coef_se[0]
            results_data.append(
                (name, coef, coef_se, stats.norm.sf(coef / coef_se)))
            # Checkpoint: keep partial results on disk in case a later
            # regression takes the whole process down.
            df_results_tmp = pd.DataFrame(data=results_data,
                                          columns=results_columns)
            df_results_tmp.to_csv(args.out + '.cell_type_results.tmp.txt',
                                  sep='\t',
                                  index=False)
            if args.print_all_cts:
                # One extra row per additional comma-separated prefix.
                for i in range(1, len(ct_ld_chr.split(','))):
                    coef, coef_se = hsqhat.coef[i], hsqhat.coef_se[i]
                    results_data.append((name + '_' + str(i), coef, coef_se,
                                         stats.norm.sf(coef / coef_se)))
        except Exception as e:
            # Deliberately broad (e.g. numpy LinAlgError: Singular matrix):
            # one failing cell type must not abort the remaining ones.
            log.log(
                '*CTS ERROR* Caught exception during regression #{}/#{}. CTS name is {}. Exception:\n{}'
                .format(cts_linenum, n_cts, name, e))
            sys.stdout.flush()

    df_results = pd.DataFrame(data=results_data, columns=results_columns)
    df_results.sort_values(by='Coefficient_P_value', inplace=True)
    df_results.to_csv(args.out + '.cell_type_results.txt',
                      sep='\t',
                      index=False)
    log.log('Results printed to ' + args.out + '.cell_type_results.txt')
def estimate_h2(args, log):
    '''Estimate h2 and partitioned h2 (with ridge / non-negativity options).

    Reads LD scores and summary statistics through ``_read_ld_sumstats``,
    optionally drops SNPs whose chi^2 (``Z**2``) is not below the cutoff,
    fits ``reg.Hsq`` with the ridge and non-negativity settings taken from
    ``args``, logs the out-of-chromosome score (``hsqhat.ooc_score``) and the
    fit summary, and writes the optional ``.cov`` / ``.delete`` /
    ``.part_delete`` / ``.results`` outputs.

    The chi^2 cutoff is ``args.chisq_max`` when given. Otherwise, when the
    number of LD score columns is not 1, it defaults to
    ``max(0.001 * max(N), 80)``. With exactly one column, ``two_step``
    defaults to 30 unless ``two_step``, ``intercept_h2`` or ``force_nonneg``
    is set.

    Parameters
    ----------
    args : argparse-style namespace
        A deep copy is modified; the caller's object is not.
    log : object exposing ``log(str)``

    Returns
    -------
    The object returned by ``reg.Hsq``.
    '''
    args = copy.deepcopy(args)
    if args.samp_prev is not None and args.pop_prev is not None:
        args.samp_prev, args.pop_prev = map(
            float, [args.samp_prev, args.pop_prev])
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        args.intercept_h2 = 1

    M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = _read_ld_sumstats(
        args, log, args.h2)
    ref_ld = np.array(sumstats[ref_ld_cnames])
    _check_ld_condnum(args, log, ref_ld_cnames)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    n_blocks = min(n_snp, args.n_blocks)
    n_annot = len(ref_ld_cnames)
    chisq_max = args.chisq_max
    old_weights = False
    if n_annot == 1:
        if args.two_step is None and args.intercept_h2 is None and not args.force_nonneg:
            args.two_step = 30
    else:
        old_weights = True
        if args.chisq_max is None:
            chisq_max = max(0.001 * sumstats.N.max(), 80)

    # Reshape helper: reads n_snp at call time, so it follows the filter below.
    s = lambda x: np.array(x).reshape((n_snp, 1))
    chisq = s(sumstats.Z**2)
    if chisq_max is not None:
        ii = np.ravel(chisq < chisq_max)
        # DataFrame.ix no longer exists in current pandas; .loc accepts the mask.
        sumstats = sumstats.loc[ii, :]
        log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
            C=chisq_max, N=np.sum(ii), M=n_snp - np.sum(ii)))
        n_snp = np.sum(ii)  # lambdas are late-binding, so this works
        ref_ld = np.array(sumstats[ref_ld_cnames])
        chisq = chisq[ii].reshape((n_snp, 1))

    if args.two_step is not None:
        log.log('Using two-step estimator with cutoff at {M}.'.format(
            M=args.two_step))

    if (args.ridge and args.intercept_h2 is not None):
        log.log("WARNING: Ridge regression with no intercept can be numerically unstable")

    if args.force_nonneg:
        log.log("Reading non-negativity constraints")
        nonneg_constraints = _read_nonneg_constraints(
            args, log, snp_names=sumstats['SNP'], ref_ld_cnames=ref_ld_cnames)
        if not args.ridge:
            # Ridge is switched on with lambda fixed at 0 and no
            # standardisation when the user did not request ridge.
            # NOTE(review): presumably the constraints are only honoured on
            # the ridge code path of reg.Hsq -- confirm; also confirm that
            # all four assignments belong under this condition.
            args.ridge = True
            args.ridge_lambda = 0
            args.reestimate_lambdas = False
            args.no_standardize_ridge = True
    else:
        nonneg_constraints = None

    hsqhat = reg.Hsq(chisq, ref_ld, s(sumstats[w_ld_cname]), s(sumstats.N),
                     M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                     twostep=args.two_step, old_weights=old_weights,
                     chr=sumstats['CHR'],
                     ridge=args.ridge,
                     ridge_lambda=args.ridge_lambda,
                     standardize_ridge=not args.no_standardize_ridge,
                     approx_ridge=not args.reestimate_lambdas,
                     nonneg_constraints=nonneg_constraints
                     )
    log.log('OOC chi^2 prediction score: %0.4e'%(hsqhat.ooc_score))

    if args.print_cov:
        _print_cov(hsqhat, args.out + '.cov', log)
    if args.print_delete_vals:
        _print_delete_values(hsqhat, args.out + '.delete', log)
        _print_part_delete_values(hsqhat, args.out + '.part_delete', log)

    log.log(hsqhat.summary(ref_ld_cnames, P=args.samp_prev, K=args.pop_prev,
                           overlap=args.overlap_annot))
    if args.overlap_annot:
        overlap_matrix, M_tot = _read_annot(args, log)
        df_results = hsqhat._overlap_output(
            ref_ld_cnames, overlap_matrix, M_annot, M_tot,
            args.print_coefficients)
        df_results.to_csv(args.out + '.results', sep="\t", index=False,
                          na_rep='NA')
        log.log('Results printed to ' + args.out + '.results')

    return hsqhat