Пример #1
0
def cell_type_specific(args, log):
    '''Cell type specific analysis.

    Reads the summary statistics and the baseline LD scores for
    ``args.h2_cts`` through ``_read_ld_sumstats``, keeps only the SNPs whose
    chi^2 (``Z**2``) is below the cutoff and then, for every entry of the
    file ``args.ref_ld_chr_cts``, fits ``reg.Hsq`` on the entry's LD scores
    stacked in front of the baseline LD scores.  The coefficient of the
    first stacked column, its standard error and the upper-tail normal
    p-value of ``coef / coef_se`` are collected; the table is sorted by
    p-value and written tab-separated to
    ``args.out + '.cell_type_results.txt'``.

    Parameters
    ----------
    args : namespace-like object
        Deep-copied before use.  Attributes read here: ``intercept_h2``,
        ``no_intercept``, ``h2_cts``, ``n_blocks``, ``chisq_max``,
        ``ref_ld_chr_cts``, ``not_M_5_50``, ``print_all_cts`` and ``out``.
    log : object
        Logger exposing a ``log(message)`` method.

    Raises
    ------
    ValueError
        If a kept SNP has no LD score in one of the cts files, or if a
        non-blank line of ``args.ref_ld_chr_cts`` does not consist of
        exactly two whitespace-separated fields.
    '''
    args = copy.deepcopy(args)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        # A fixed intercept of 1 is how "no intercept" is passed to reg.Hsq.
        args.intercept_h2 = 1

    M_annot_all_regr, w_ld_cname, ref_ld_cnames_all_regr, sumstats, _ = \
        _read_ld_sumstats(args, log, args.h2_cts)
    _check_ld_condnum(args, log, ref_ld_cnames_all_regr)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    # The block count is capped by the SNP count *before* chi^2 filtering.
    n_blocks = min(n_snp, args.n_blocks)
    if args.chisq_max is None:
        chisq_max = max(0.001 * sumstats.N.max(), 80)
    else:
        chisq_max = args.chisq_max

    ii = np.ravel(sumstats.Z**2 < chisq_max)
    # Boolean-mask .loc replaces DataFrame.ix, which pandas 1.0 removed.
    sumstats = sumstats.loc[ii, :]
    log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
            C=chisq_max, N=np.sum(ii), M=n_snp-np.sum(ii)))
    n_snp = np.sum(ii)  # lambdas are late-binding, so this works
    ref_ld_all_regr = np.array(sumstats[ref_ld_cnames_all_regr]).reshape(
        (len(sumstats), -1))
    chisq = np.array(sumstats.Z**2)
    keep_snps = sumstats[['SNP']]

    # Reshape any per-SNP vector into an (n_snp, 1) column.
    s = lambda x: np.array(x).reshape((n_snp, 1))
    results_columns = ['Name', 'Coefficient', 'Coefficient_std_error', 'Coefficient_P_value']
    results_data = []

    # Read the whole cts list up front and close the file right away.  Blank
    # lines are skipped so that a stray empty line cannot abort the run after
    # earlier regressions have already been computed.
    with open(args.ref_ld_chr_cts) as cts_fh:
        cts_entries = [line.split() for line in cts_fh if line.strip()]

    for (name, ct_ld_chr) in cts_entries:
        ref_ld_cts_allsnps = _read_chr_split_files(ct_ld_chr, None, log,
                                   'cts reference panel LD Score', ps.ldscore_fromlist)
        log.log('Performing regression.')
        # Left-merge on SNP keeps the row order of keep_snps; column 0 of the
        # merge result is SNP itself, so everything after it is LD scores.
        ref_ld_cts = np.array(
            pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP', how='left').iloc[:, 1:])
        if np.any(np.isnan(ref_ld_cts)):
            raise ValueError ('Missing some LD scores from cts files. Are you sure all SNPs in ref-ld-chr are also in ref-ld-chr-cts')

        # Cell-type columns go first so that index 0 is the cell-type term.
        ref_ld = np.hstack([ref_ld_cts, ref_ld_all_regr])
        M_cts = ps.M_fromlist(
                _splitp(ct_ld_chr), _N_CHR, common=(not args.not_M_5_50))
        M_annot = np.hstack([M_cts, M_annot_all_regr])
        hsqhat = reg.Hsq(s(chisq), ref_ld, s(sumstats[w_ld_cname]), s(sumstats.N),
                     M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                     twostep=None, old_weights=True)
        coef, coef_se = hsqhat.coef[0], hsqhat.coef_se[0]
        results_data.append((name, coef, coef_se, stats.norm.sf(coef/coef_se)))
        if args.print_all_cts:
            # One extra row per additional comma-separated cts entry.
            for i in range(1, len(ct_ld_chr.split(','))):
                coef, coef_se = hsqhat.coef[i], hsqhat.coef_se[i]
                results_data.append((name+'_'+str(i), coef, coef_se, stats.norm.sf(coef/coef_se)))

    df_results = pd.DataFrame(data = results_data, columns = results_columns)
    df_results.sort_values(by = 'Coefficient_P_value', inplace=True)
    df_results.to_csv(args.out+'.cell_type_results.txt', sep='\t', index=False)
    log.log('Results printed to '+args.out+'.cell_type_results.txt')
Пример #2
0
def estimate_h2(args, log):
    '''Estimate h2 and partitioned h2.

    Reads summary statistics and LD scores for ``args.h2`` through
    ``_read_ld_sumstats``, optionally drops SNPs whose chi^2 (``Z**2``) is
    not below a cutoff, fits ``reg.Hsq`` and logs its summary.

    Parameters
    ----------
    args : namespace-like object
        Deep-copied before use.  Attributes read here: ``samp_prev``,
        ``pop_prev``, ``intercept_h2``, ``no_intercept``, ``h2``,
        ``n_blocks``, ``chisq_max``, ``two_step``, ``print_cov``,
        ``print_delete_vals``, ``overlap_annot``, ``print_coefficients``
        and ``out``.
    log : object
        Logger exposing a ``log(message)`` method.

    Returns
    -------
    The fitted ``reg.Hsq`` object.

    Side effects
    ------------
    Depending on the flags, writes ``args.out + '.cov'``,
    ``args.out + '.delete'`` and ``args.out + '.results'``.
    '''
    args = copy.deepcopy(args)
    if args.samp_prev is not None and args.pop_prev is not None:
        args.samp_prev, args.pop_prev = map(float,
                                            [args.samp_prev, args.pop_prev])
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        # A fixed intercept of 1 is how "no intercept" is passed to reg.Hsq.
        args.intercept_h2 = 1
    M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = _read_ld_sumstats(
        args, log, args.h2)
    ref_ld = np.array(sumstats[ref_ld_cnames])
    _check_ld_condnum(args, log, ref_ld_cnames)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    # The block count is capped by the SNP count *before* chi^2 filtering.
    n_blocks = min(n_snp, args.n_blocks)
    n_annot = len(ref_ld_cnames)
    chisq_max = args.chisq_max
    old_weights = False
    if n_annot == 1:
        # Single annotation with a free intercept: default to the two-step
        # estimator with a cutoff of 30 unless the caller chose otherwise.
        if args.two_step is None and args.intercept_h2 is None:
            args.two_step = 30
    else:
        # Several annotations: use the old weighting scheme and, unless a
        # cutoff was given, filter at max(0.001 * max N, 80).
        old_weights = True
        if args.chisq_max is None:
            chisq_max = max(0.001 * sumstats.N.max(), 80)

    # Reshape any per-SNP vector into an (n_snp, 1) column; n_snp is looked
    # up at call time, so the helper follows the filtering below.
    s = lambda x: np.array(x).reshape((n_snp, 1))
    chisq = s(sumstats.Z**2)
    if chisq_max is not None:
        ii = np.ravel(chisq < chisq_max)
        # Boolean-mask .loc replaces DataFrame.ix, which pandas 1.0 removed.
        sumstats = sumstats.loc[ii, :]
        log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
            C=chisq_max, N=np.sum(ii), M=n_snp - np.sum(ii)))
        n_snp = np.sum(ii)  # lambdas are late-binding, so this works
        ref_ld = np.array(sumstats[ref_ld_cnames])
        chisq = chisq[ii].reshape((n_snp, 1))

    if args.two_step is not None:
        log.log('Using two-step estimator with cutoff at {M}.'.format(
            M=args.two_step))

    hsqhat = reg.Hsq(chisq,
                     ref_ld,
                     s(sumstats[w_ld_cname]),
                     s(sumstats.N),
                     M_annot,
                     n_blocks=n_blocks,
                     intercept=args.intercept_h2,
                     twostep=args.two_step,
                     old_weights=old_weights)

    if args.print_cov:
        _print_cov(hsqhat, args.out + '.cov', log)
    if args.print_delete_vals:
        _print_delete_values(hsqhat, args.out + '.delete', log)

    log.log(
        hsqhat.summary(ref_ld_cnames,
                       P=args.samp_prev,
                       K=args.pop_prev,
                       overlap=args.overlap_annot))
    if args.overlap_annot:
        overlap_matrix, M_tot = _read_annot(args, log)

        # overlap_matrix = overlap_matrix[np.array(~novar_cols), np.array(~novar_cols)]#np.logical_not
        df_results = hsqhat._overlap_output(ref_ld_cnames, overlap_matrix,
                                            M_annot, M_tot,
                                            args.print_coefficients)
        df_results.to_csv(args.out + '.results', sep="\t", index=False)
        log.log('Results printed to ' + args.out + '.results')

    return hsqhat
Пример #3
0
def estimate_h2(args, log):
    '''Estimate h2 and partitioned h2 for every sumstats file in ``args.h2``.

    When more than one sumstats file is given, the reference LD scores, the
    M values, the regression weights and (only if ``args.overlap_annot`` is
    set) the annot matrices are read once and shallow copies are handed to
    ``_read_ld_sumstats`` for each file.  With a single file all of these
    are passed as ``None``.

    Parameters
    ----------
    args : namespace-like object
        Deep-copied before use.  Attributes read here: ``samp_prev``,
        ``pop_prev``, ``intercept_h2``, ``no_intercept``, ``h2`` (a sequence
        of sumstats inputs), ``n_blocks``, ``chisq_max``, ``two_step``,
        ``print_cov``, ``print_delete_vals``, ``overlap_annot``,
        ``print_coefficients`` and ``out``.
    log : object
        Logger exposing a ``log(message)`` method.

    Returns
    -------
    A list of fitted ``reg.Hsq`` objects when more than one was produced,
    otherwise the single ``reg.Hsq`` object itself.

    Side effects
    ------------
    Depending on the flags, writes ``args.out + '.cov'``,
    ``args.out + '.delete'``, ``args.out + '.part_delete'`` and
    ``args.out + '.results'``.  The ``.results`` file is truncated for the
    first sumstats file and appended to (without header) for later ones.
    '''
    args = copy.deepcopy(args)
    if args.samp_prev is not None and args.pop_prev is not None:
        args.samp_prev, args.pop_prev = map(float,
                                            [args.samp_prev, args.pop_prev])
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        # A fixed intercept of 1 is how "no intercept" is passed to reg.Hsq.
        args.intercept_h2 = 1

    orig_ref_ld = None
    orig_n_annot = None
    orig_M_annot = None
    orig_novar_cols = None
    orig_w_ld = None
    overlap_matrix = None
    M_tot = None
    if len(args.h2) > 1:
        log.log("Pre-reading files to use across all sumstats")
        orig_ref_ld = _read_ref_ld(args, log)
        # One column of ref_ld is not an annotation, hence the -1.
        orig_n_annot = len(orig_ref_ld.columns) - 1
        orig_M_annot = _read_M(args, log, orig_n_annot)
        orig_M_annot, orig_ref_ld, orig_novar_cols = _check_variance(
            log, orig_M_annot, orig_ref_ld)
        orig_w_ld = _read_w_ld(args, log)
        # The annot matrices are only consumed under args.overlap_annot, so
        # do not read them when that output was not requested.
        if args.overlap_annot:
            overlap_matrix, M_tot = _read_annot(args, log)

    hsqhats = []
    # NOTE(review): the .cov / .delete / .part_delete outputs use the same
    # path for every sumstats file, so later files overwrite earlier ones.
    for it, h2_fh in enumerate(args.h2):

        # Shallow copies so each pass starts from the pre-read objects.
        ref_ld = copy.copy(orig_ref_ld)
        M_annot = copy.copy(orig_M_annot)
        n_annot = copy.copy(orig_n_annot)
        novar_cols = copy.copy(orig_novar_cols)
        w_ld = copy.copy(orig_w_ld)

        M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = _read_ld_sumstats(
            args,
            log,
            h2_fh,
            ref_ld=ref_ld,
            n_annot=n_annot,
            M_annot=M_annot,
            novar_cols=novar_cols,
            w_ld=w_ld)

        ref_ld = np.array(sumstats[ref_ld_cnames])
        _check_ld_condnum(args, log, ref_ld_cnames)
        _warn_length(log, sumstats)
        n_snp = len(sumstats)
        # The block count is capped by the SNP count *before* filtering.
        n_blocks = min(n_snp, args.n_blocks)
        n_annot = len(ref_ld_cnames)
        chisq_max = args.chisq_max
        old_weights = False
        if n_annot == 1:
            # Single annotation with a free intercept: default to the
            # two-step estimator with a cutoff of 30.  The value is stored
            # on args and therefore also applies to later sumstats files.
            if args.two_step is None and args.intercept_h2 is None:
                args.two_step = 30
        else:
            old_weights = True
            if args.chisq_max is None:
                chisq_max = max(0.001 * sumstats.N.max(), 80)

        # Reshape any per-SNP vector into an (n_snp, 1) column; n_snp is
        # looked up at call time, so the helper follows the filtering below.
        s = lambda x: np.array(x).reshape((n_snp, 1))
        chisq = s(sumstats.Z**2)
        if chisq_max is not None:
            ii = np.ravel(chisq < chisq_max)
            # Boolean-mask .loc replaces DataFrame.ix (removed in pandas 1.0).
            sumstats = sumstats.loc[ii, :]
            log.log(
                'Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
                    C=chisq_max, N=np.sum(ii), M=n_snp - np.sum(ii)))
            n_snp = np.sum(ii)  # lambdas are late-binding, so this works
            ref_ld = np.array(sumstats[ref_ld_cnames])
            chisq = chisq[ii].reshape((n_snp, 1))

        if args.two_step is not None:
            log.log('Using two-step estimator with cutoff at {M}.'.format(
                M=args.two_step))

        hsqhat = reg.Hsq(chisq,
                         ref_ld,
                         s(sumstats[w_ld_cname]),
                         s(sumstats.N),
                         M_annot,
                         n_blocks=n_blocks,
                         intercept=args.intercept_h2,
                         twostep=args.two_step,
                         old_weights=old_weights)

        if args.print_cov:
            _print_cov(hsqhat, args.out + '.cov', log)
        if args.print_delete_vals:
            _print_delete_values(hsqhat, args.out + '.delete', log)
            _print_part_delete_values(hsqhat, args.out + '.part_delete', log)

        log.log(
            hsqhat.summary(ref_ld_cnames,
                           P=args.samp_prev,
                           K=args.pop_prev,
                           overlap=args.overlap_annot))
        if args.overlap_annot:
            if overlap_matrix is None or M_tot is None:
                overlap_matrix, M_tot = _read_annot(args, log)
            else:
                log.log("Re-using annot matrix")

            # overlap_matrix = overlap_matrix[np.array(~novar_cols), np.array(~novar_cols)]#np.logical_not
            df_results = hsqhat._overlap_output(ref_ld_cnames, overlap_matrix,
                                                M_annot, M_tot,
                                                args.print_coefficients)
            # First file creates the table with a header; later files append.
            with open(args.out + '.results', 'a' if it > 0 else 'w') as op:
                df_results.to_csv(
                    op,
                    sep="\t",
                    index=False,
                    header=(it == 0),
                )
            log.log('Results printed to ' + args.out + '.results')

        hsqhats.append(hsqhat)

    return hsqhats if len(hsqhats) > 1 else hsqhats[0]
Пример #4
0
def cell_type_specific(args, log):
    '''Cell type specific analysis (tolerant of per-entry failures).

    Reads the summary statistics and the baseline LD scores for
    ``args.h2_cts`` through ``_read_ld_sumstats``, keeps only the SNPs whose
    chi^2 (``Z**2``) is below the cutoff and then, for every line of the
    file ``args.ref_ld_chr_cts``, fits ``reg.Hsq`` on that line's LD scores
    stacked in front of the baseline LD scores.  Any exception raised while
    handling one line is logged and the loop moves on to the next line.

    Parameters
    ----------
    args : namespace-like object
        Deep-copied before use.  Attributes read here: ``intercept_h2``,
        ``no_intercept``, ``h2_cts``, ``n_blocks``, ``chisq_max``,
        ``ref_ld_chr_cts``, ``not_M_5_50``, ``print_all_cts`` and ``out``.
    log : object
        Logger exposing a ``log(message)`` method.

    Side effects
    ------------
    After each successful regression the results so far are written to
    ``args.out + '.cell_type_results.tmp.txt'``; the final table, sorted by
    p-value, goes to ``args.out + '.cell_type_results.txt'``.
    '''
    args = copy.deepcopy(args)
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        # A fixed intercept of 1 is how "no intercept" is passed to reg.Hsq.
        args.intercept_h2 = 1

    M_annot_all_regr, w_ld_cname, ref_ld_cnames_all_regr, sumstats, _ = \
        _read_ld_sumstats(args, log, args.h2_cts)
    _check_ld_condnum(args, log, ref_ld_cnames_all_regr)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    # The block count is capped by the SNP count *before* chi^2 filtering.
    n_blocks = min(n_snp, args.n_blocks)
    if args.chisq_max is None:
        chisq_max = max(0.001 * sumstats.N.max(), 80)
    else:
        chisq_max = args.chisq_max

    ii = np.ravel(sumstats.Z**2 < chisq_max)
    # Boolean-mask .loc replaces DataFrame.ix, which pandas 1.0 removed.
    sumstats = sumstats.loc[ii, :]
    log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
        C=chisq_max, N=np.sum(ii), M=n_snp - np.sum(ii)))
    n_snp = np.sum(ii)  # lambdas are late-binding, so this works
    ref_ld_all_regr = np.array(sumstats[ref_ld_cnames_all_regr]).reshape(
        (len(sumstats), -1))
    chisq = np.array(sumstats.Z**2)
    keep_snps = sumstats[['SNP']]

    # Reshape any per-SNP vector into an (n_snp, 1) column.
    s = lambda x: np.array(x).reshape((n_snp, 1))
    results_columns = [
        'Name', 'Coefficient', 'Coefficient_std_error', 'Coefficient_P_value'
    ]
    results_data = []
    # Read every line up front (the total is needed for progress messages)
    # and close the file immediately.
    with open(args.ref_ld_chr_cts) as cts_fh:
        cts_lines = cts_fh.readlines()
    n_cts = len(cts_lines)
    for cts_linenum, cts_line in enumerate(cts_lines, start=1):
        # Placeholder so the error message below can always be formatted:
        # if the line cannot be split into two fields, `name` would otherwise
        # be unbound (first line) or left over from the previous line.
        name = '<unparsed line {}>'.format(cts_linenum)
        try:
            # Whitespace-delimited line with exactly two fields; any other
            # field count raises ValueError here and is reported below.
            (name, ct_ld_chr) = cts_line.split()
            ref_ld_cts_allsnps = _read_chr_split_files(
                ct_ld_chr, None, log, 'cts reference panel LD Score',
                ps.ldscore_fromlist)
            log.log('Performing regression #{}/#{}. CTS name is {}'.format(
                cts_linenum, n_cts, name))
            sys.stdout.flush()
            # Left-merge on SNP keeps the row order of keep_snps; column 0
            # of the merge result is SNP itself, the rest are LD scores.
            ref_ld_cts = np.array(
                pd.merge(keep_snps, ref_ld_cts_allsnps, on='SNP',
                         how='left').iloc[:, 1:])
            if np.any(np.isnan(ref_ld_cts)):
                raise ValueError(
                    'Missing some LD scores from cts files. Are you sure all SNPs in ref-ld-chr are also in ref-ld-chr-cts'
                )

            # Cell-type columns go first so that index 0 is the cell-type
            # term.
            ref_ld = np.hstack([ref_ld_cts, ref_ld_all_regr])
            M_cts = ps.M_fromlist(_splitp(ct_ld_chr),
                                  _N_CHR,
                                  common=(not args.not_M_5_50))
            M_annot = np.hstack([M_cts, M_annot_all_regr])
            hsqhat = reg.Hsq(s(chisq),
                             ref_ld,
                             s(sumstats[w_ld_cname]),
                             s(sumstats.N),
                             M_annot,
                             n_blocks=n_blocks,
                             intercept=args.intercept_h2,
                             twostep=None,
                             old_weights=True)
            coef, coef_se = hsqhat.coef[0], hsqhat.coef_se[0]
            results_data.append(
                (name, coef, coef_se, stats.norm.sf(coef / coef_se)))
            # Checkpoint the (unsorted) results gathered so far.
            df_results_tmp = pd.DataFrame(data=results_data,
                                          columns=results_columns)
            df_results_tmp.to_csv(args.out + '.cell_type_results.tmp.txt',
                                  sep='\t',
                                  index=False)
            if args.print_all_cts:
                # One extra row per additional comma-separated cts entry.
                for i in range(1, len(ct_ld_chr.split(','))):
                    coef, coef_se = hsqhat.coef[i], hsqhat.coef_se[i]
                    results_data.append((name + '_' + str(i), coef, coef_se,
                                         stats.norm.sf(coef / coef_se)))
        except Exception as e:  # e.g. numpy LinAlgError: Singular matrix.
            # Deliberate best effort: report the failure and keep going.
            log.log(
                '*CTS ERROR* Caught exception during regression #{}/#{}. CTS name is {}. Exception:\n{}'
                .format(cts_linenum, n_cts, name, e))
            sys.stdout.flush()

    df_results = pd.DataFrame(data=results_data, columns=results_columns)
    df_results.sort_values(by='Coefficient_P_value', inplace=True)
    df_results.to_csv(args.out + '.cell_type_results.txt',
                      sep='\t',
                      index=False)
    log.log('Results printed to ' + args.out + '.cell_type_results.txt')
Пример #5
0
def estimate_h2(args, log):
    '''Estimate h2 and partitioned h2.

    Variant with optional ridge regression and non-negativity constraints.
    Reads summary statistics and LD scores for ``args.h2`` through
    ``_read_ld_sumstats``, optionally drops SNPs whose chi^2 (``Z**2``) is
    not below a cutoff, fits ``reg.Hsq`` and logs its out-of-chromosome
    prediction score (``hsqhat.ooc_score``) and summary.

    Parameters
    ----------
    args : namespace-like object
        Deep-copied before use.  Attributes read here: ``samp_prev``,
        ``pop_prev``, ``intercept_h2``, ``no_intercept``, ``h2``,
        ``n_blocks``, ``chisq_max``, ``two_step``, ``force_nonneg``,
        ``ridge``, ``ridge_lambda``, ``reestimate_lambdas``,
        ``no_standardize_ridge``, ``print_cov``, ``print_delete_vals``,
        ``overlap_annot``, ``print_coefficients`` and ``out``.
    log : object
        Logger exposing a ``log(message)`` method.

    Returns
    -------
    The fitted ``reg.Hsq`` object.

    Side effects
    ------------
    Depending on the flags, writes ``args.out + '.cov'``,
    ``args.out + '.delete'``, ``args.out + '.part_delete'`` and
    ``args.out + '.results'``.
    '''
    args = copy.deepcopy(args)
    if args.samp_prev is not None and args.pop_prev is not None:
        args.samp_prev, args.pop_prev = map(
            float, [args.samp_prev, args.pop_prev])
    if args.intercept_h2 is not None:
        args.intercept_h2 = float(args.intercept_h2)
    if args.no_intercept:
        # A fixed intercept of 1 is how "no intercept" is passed to reg.Hsq.
        args.intercept_h2 = 1
    M_annot, w_ld_cname, ref_ld_cnames, sumstats, novar_cols = _read_ld_sumstats(
        args, log, args.h2)
    ref_ld = np.array(sumstats[ref_ld_cnames])
    _check_ld_condnum(args, log, ref_ld_cnames)
    _warn_length(log, sumstats)
    n_snp = len(sumstats)
    # The block count is capped by the SNP count *before* chi^2 filtering.
    n_blocks = min(n_snp, args.n_blocks)
    n_annot = len(ref_ld_cnames)
    chisq_max = args.chisq_max
    old_weights = False
    if n_annot == 1:
        # Single annotation, free intercept, no non-negativity constraint:
        # default to the two-step estimator with a cutoff of 30.
        if args.two_step is None and args.intercept_h2 is None and not args.force_nonneg:
            args.two_step = 30
    else:
        # Several annotations: use the old weighting scheme and, unless a
        # cutoff was given, filter at max(0.001 * max N, 80).
        old_weights = True
        if args.chisq_max is None:
            chisq_max = max(0.001*sumstats.N.max(), 80)

    # Reshape any per-SNP vector into an (n_snp, 1) column; n_snp is looked
    # up at call time, so the helper follows the filtering below.
    s = lambda x: np.array(x).reshape((n_snp, 1))
    chisq = s(sumstats.Z**2)
    if chisq_max is not None:
        ii = np.ravel(chisq < chisq_max)
        # Boolean-mask .loc replaces DataFrame.ix, which pandas 1.0 removed.
        sumstats = sumstats.loc[ii, :]
        log.log('Removed {M} SNPs with chi^2 > {C} ({N} SNPs remain)'.format(
                C=chisq_max, N=np.sum(ii), M=n_snp-np.sum(ii)))
        n_snp = np.sum(ii)  # lambdas are late-binding, so this works
        ref_ld = np.array(sumstats[ref_ld_cnames])
        chisq = chisq[ii].reshape((n_snp, 1))

    if args.two_step is not None:
        log.log('Using two-step estimator with cutoff at {M}.'.format(M=args.two_step))

    # The warning is emitted whenever ridge is combined with a fixed
    # intercept (which includes --no-intercept, mapped to 1 above).
    if (args.ridge and args.intercept_h2 is not None):
        log.log("WARNING: Ridge regression with no intercept can be numerically unstable")

    if args.force_nonneg:
        log.log("Reading non-negativity constraints")
        nonneg_constraints = _read_nonneg_constraints(args, log, snp_names=sumstats['SNP'], ref_ld_cnames=ref_ld_cnames)
        if not args.ridge:
            # The constraints are passed through the ridge options of
            # reg.Hsq: switch ridge on with lambda 0, no lambda
            # re-estimation and no standardisation.
            args.ridge=True
            args.ridge_lambda=0
            args.reestimate_lambdas=False
            args.no_standardize_ridge=True
    else:
        nonneg_constraints = None

    hsqhat = reg.Hsq(chisq, ref_ld, s(sumstats[w_ld_cname]), s(sumstats.N),
                    M_annot, n_blocks=n_blocks, intercept=args.intercept_h2,
                    twostep=args.two_step, old_weights=old_weights,
                    chr=sumstats['CHR'],
                    ridge=args.ridge, ridge_lambda=args.ridge_lambda,
                    standardize_ridge=not args.no_standardize_ridge,
                    approx_ridge=not args.reestimate_lambdas,
                    nonneg_constraints=nonneg_constraints
                    )

    log.log('OOC chi^2 prediction score: %0.4e'%(hsqhat.ooc_score))

    if args.print_cov:
        _print_cov(hsqhat, args.out + '.cov', log)
    if args.print_delete_vals:
        _print_delete_values(hsqhat, args.out + '.delete', log)
        _print_part_delete_values(hsqhat, args.out + '.part_delete', log)

    log.log(hsqhat.summary(ref_ld_cnames, P=args.samp_prev, K=args.pop_prev, overlap = args.overlap_annot))
    if args.overlap_annot:
        overlap_matrix, M_tot = _read_annot(args, log)

        # overlap_matrix = overlap_matrix[np.array(~novar_cols), np.array(~novar_cols)]#np.logical_not
        df_results = hsqhat._overlap_output(ref_ld_cnames, overlap_matrix, M_annot, M_tot, args.print_coefficients)
        df_results.to_csv(args.out+'.results', sep="\t", index=False, na_rep='NA')
        log.log('Results printed to '+args.out+'.results')

    return hsqhat