示例#1
0
def main(p_dict):
    """Run the coordination step and store the result in a new HDF5 file.

    Selects the BIM file used for the validation SNPs (``vbim`` first, then
    ``vgf`` + '.bim', then ``gf`` + '.bim'), refuses to overwrite an existing
    output file, and then parses the summary statistics and coordinates them
    with the genotype data inside the HDF5 file at ``p_dict['out']``.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'vbim', 'vgf', 'gf',
            'out', 'ssf', 'max_freq_discrep', 'maf', 'skip_coordination' and
            'debug'.  The whole dictionary is also handed to
            ``sum_stats_parsers.parse_sum_stats``.

    Raises:
        Exception: If ``p_dict['out']`` already exists as a file.
    """
    # Choose the BIM file in order of precedence.
    bimfile = None
    if p_dict['vbim'] is not None:
        bimfile = p_dict['vbim']
    elif p_dict['vgf'] is not None:
        bimfile = p_dict['vgf'] + '.bim'
    elif p_dict['gf'] is not None:
        bimfile = p_dict['gf'] + '.bim'
    else:
        # Only reported, not raised: bimfile stays None and is passed on as is.
        print('Set of validation SNPs is missing!  Please specify either a validation PLINK genotype file, ' \
              'or a PLINK BIM file with the SNPs of interest.')
    if os.path.isfile(p_dict['out']):
        print('Output file (%s) already exists!  Delete, rename it, or use a different output file.'\
              % (p_dict['out']))
        raise Exception('Output file already exists!')

    summary_dict = {}
    summary_dict[0]={'name':'Summary statistics filename:','value':p_dict['ssf']}
    summary_dict[1]={'name':'LD reference genotypes filename:','value':p_dict['gf']}
    summary_dict[3]={'name':'Coordinated data output filename:','value':p_dict['out']}
    if p_dict['vgf'] is not None:
        summary_dict[2]={'name':'Validation genotypes filename:','value':p_dict['vgf']}

    # The context manager closes the HDF5 file even when parsing or
    # coordination raises, so no open handle is left behind on failure.
    with h5py.File(p_dict['out'], 'w') as h5f:
        sum_stats_parsers.parse_sum_stats(h5f, p_dict, bimfile, summary_dict)
        coordinate_datasets(p_dict['gf'], h5f,summary_dict,
                            validation_genotype_file=p_dict['vgf'], 
                            max_freq_discrep=p_dict['max_freq_discrep'],
                            min_maf=p_dict['maf'], 
                            skip_coordination=p_dict['skip_coordination'], 
                            debug=p_dict['debug'])
    reporting.print_summary(summary_dict, 'Summary of coordination step')
示例#2
0
def main(p_dict):
    """Compute LD information, run the LDpred Gibbs sampler, print a summary.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'cf', 'out', 'ldf',
            'ldr', 'debug', 'no_ld_compression', 'hickle_ld', 'f', 'N',
            'n_iter', 'n_burn_in', 'h2' and 'use_gw_h2'.
    """
    def _fmt_runtime(seconds):
        # Render an elapsed time the way the summary report shows it.
        return '%d min and %0.2f secs'% (seconds / 60, seconds % 60)

    summary_dict = {
        0: {'name':'Coordinated data filename','value':p_dict['cf']},
        0.1: {'name':'SNP weights output file (prefix)', 'value':p_dict['out']},
        0.2: {'name':'LD data filename (prefix)', 'value':p_dict['ldf']},
        1: {'name':'LD radius used','value':str(p_dict['ldr'])},
    }

    # --- LD information ---
    ld_started = time.time()
    summary_dict[1.09] = {'name':'dash', 'value':'LD information'}
    ld_dict = ld.get_ld_dict(
        p_dict['cf'],
        p_dict['ldf'],
        p_dict['ldr'],
        verbose=p_dict['debug'],
        compressed=not p_dict['no_ld_compression'],
        use_hickle=p_dict['hickle_ld'],
        summary_dict=summary_dict,
    )
    ld_elapsed = time.time() - ld_started
    summary_dict[1.2] = {
        'name':'Running time for calculating LD information:',
        'value': _fmt_runtime(ld_elapsed),
    }

    # --- Gibbs sampler ---
    gibbs_started = time.time()
    summary_dict[1.9] = {'name':'dash', 'value':'LDpred Gibbs sampler'}
    ldpred_genomewide(
        data_file=p_dict['cf'],
        out_file_prefix=p_dict['out'],
        ps=p_dict['f'],
        ld_radius=p_dict['ldr'],
        ld_dict=ld_dict,
        n=p_dict['N'],
        num_iter=p_dict['n_iter'],
        burn_in=p_dict['n_burn_in'],
        h2=p_dict['h2'],
        use_gw_h2=p_dict['use_gw_h2'],
        verbose=p_dict['debug'],
        summary_dict=summary_dict,
    )
    gibbs_elapsed = time.time() - gibbs_started
    summary_dict[2.2] = {
        'name':'Running time for Gibbs sampler(s):',
        'value': _fmt_runtime(gibbs_elapsed),
    }

    reporting.print_summary(summary_dict, 'Summary of LDpred Gibbs')

        

        
示例#3
0
def main(p_dict):
    """Coordinate summary statistics with genotypes into a new HDF5 file.

    The BIM file for the validation SNPs is taken from ``vbim`` if given,
    otherwise derived from ``vgf`` or, failing that, ``gf`` by appending
    '.bim'.  An already existing output file is never overwritten.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'vbim', 'vgf', 'gf',
            'out', 'ssf', 'max_freq_discrep', 'maf', 'skip_coordination' and
            'debug'.  The whole dictionary is also handed to
            ``sum_stats_parsers.parse_sum_stats``.

    Raises:
        Exception: If ``p_dict['out']`` already exists as a file.
    """
    bimfile = None
    if p_dict['vbim'] is not None:
        bimfile = p_dict['vbim']
    elif p_dict['vgf'] is not None:
        bimfile = p_dict['vgf'] + '.bim'
    elif p_dict['gf'] is not None:
        bimfile = p_dict['gf'] + '.bim'
    else:
        # Missing SNP set is only reported; bimfile remains None downstream.
        print('Set of validation SNPs is missing!  Please specify either a validation PLINK genotype file, ' \
              'or a PLINK BIM file with the SNPs of interest.')
    if os.path.isfile(p_dict['out']):
        print('Output file (%s) already exists!  Delete, rename it, or use a different output file.'\
              % (p_dict['out']))
        raise Exception('Output file already exists!')

    summary_dict = {}
    summary_dict[0]={'name':'Summary statistics filename:','value':p_dict['ssf']}
    summary_dict[1]={'name':'LD reference genotypes filename:','value':p_dict['gf']}
    summary_dict[3]={'name':'Coordinated data output filename:','value':p_dict['out']}
    if p_dict['vgf'] is not None:
        summary_dict[2]={'name':'Validation genotypes filename:','value':p_dict['vgf']}

    # Open the output through a context manager so the HDF5 handle is
    # released even if parsing or coordination fails part-way.
    with h5py.File(p_dict['out'], 'w') as h5f:
        sum_stats_parsers.parse_sum_stats(h5f, p_dict, bimfile, summary_dict)
        coordinate_datasets(p_dict['gf'], h5f,summary_dict,
                            validation_genotype_file=p_dict['vgf'], 
                            max_freq_discrep=p_dict['max_freq_discrep'],
                            min_maf=p_dict['maf'], 
                            skip_coordination=p_dict['skip_coordination'], 
                            debug=p_dict['debug'])
    reporting.print_summary(summary_dict, 'Summary of coordination step')
示例#4
0
def main(p_dict):
    """Run LDpred-fast genome-wide and print a summary of the run.

    LD information is only computed when the effect type reported by
    ``get_eff_type`` is not 'BLUP'; for 'BLUP' effects the LD step is skipped
    and ``ld_dict`` is passed on as ``None``.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'cf', 'out', 'ldf',
            'ldr', 'f', 'N', 'h2', 'use_gw_h2' and 'debug'.  The whole
            dictionary is also handed to ``ld.get_ld_dict_using_p_dict``.
    """
    summary_dict = {}
    summary_dict[0] = {
        'name': 'Coordinated data filename',
        'value': p_dict['cf']
    }
    summary_dict[0.1] = {
        'name': 'SNP weights output file (prefix)',
        'value': p_dict['out']
    }

    eff_type = get_eff_type(p_dict['cf'])

    # Defined up front so the BLUP path (which skips the LD step) does not hit
    # an UnboundLocalError when ld_dict is passed on below.
    # NOTE(review): assumes ldpred_fast_genomewide accepts ld_dict=None for
    # BLUP effects -- confirm against its implementation.
    ld_dict = None

    # If already BLUP betas, then skip LD calculation
    if eff_type != 'BLUP':
        summary_dict[0.2] = {
            'name': 'LD data filename (prefix)',
            'value': p_dict['ldf']
        }
        summary_dict[1.01] = {
            'name': 'LD radius used',
            'value': str(p_dict['ldr'])
        }
        summary_dict[1] = {'name': 'dash', 'value': 'LD information'}
        t0 = time.time()
        ld_dict = ld.get_ld_dict_using_p_dict(p_dict, summary_dict)
        t1 = time.time()
        t = (t1 - t0)
        summary_dict[1.2] = {
            'name': 'Running time for calculating LD information:',
            'value': '%d min and %0.2f secs' % (t / 60, t % 60)
        }

    # Start the LDpred-fast timer on every path, not only after the LD step.
    t0 = time.time()
    summary_dict[1.9] = {'name': 'dash', 'value': 'LDpred-fast'}
    ldpred_fast_genomewide(
        data_file=p_dict['cf'],
        out_file_prefix=p_dict['out'],
        ps=p_dict['f'],
        ld_radius=p_dict['ldr'],
        ld_dict=ld_dict,
        n=p_dict['N'],
        h2=p_dict['h2'],
        use_gw_h2=p_dict['use_gw_h2'],
        eff_type=eff_type,
        summary_dict=summary_dict,
        debug=p_dict['debug'],
    )
    t1 = time.time()
    t = (t1 - t0)
    summary_dict[3] = {
        'name': 'Running time for LDpred-fast:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }
    reporting.print_summary(summary_dict, 'Summary of LDpred-fast')
示例#5
0
def main(p_dict):
    """Run the LDpred infinitesimal model genome-wide and print a summary.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'cf', 'out', 'ldf',
            'ldr', 'N', 'h2', 'use_gw_h2' and 'debug'.  The whole dictionary
            is also handed to ``ld.get_ld_dict_using_p_dict``.
    """
    summary_dict = {}
    summary_dict[0] = {
        'name': 'Coordinated data filename',
        'value': p_dict['cf']
    }
    summary_dict[0.1] = {
        'name': 'SNP weights output file (prefix)',
        'value': p_dict['out']
    }
    summary_dict[0.2] = {
        'name': 'LD data filename (prefix)',
        'value': p_dict['ldf']
    }
    summary_dict[1] = {'name': 'LD radius used', 'value': str(p_dict['ldr'])}
    t0 = time.time()
    summary_dict[1.09] = {'name': 'dash', 'value': 'LD information'}
    # Pass the real summary dict (not a throwaway {}) so anything the LD step
    # records ends up in the printed report under the 'LD information' header.
    ld_dict = ld.get_ld_dict_using_p_dict(p_dict, summary_dict)
    t1 = time.time()
    t = (t1 - t0)
    summary_dict[1.2] = {
        'name': 'Running time for calculating LD information:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }
    t0 = time.time()
    summary_dict[1.9] = {'name': 'dash', 'value': 'LDpred infinitesimal model'}

    ldpred_inf_genomewide(data_file=p_dict['cf'],
                          out_file_prefix=p_dict['out'],
                          ld_radius=p_dict['ldr'],
                          ld_dict=ld_dict,
                          n=p_dict['N'],
                          h2=p_dict['h2'],
                          use_gw_h2=p_dict['use_gw_h2'],
                          verbose=p_dict['debug'],
                          summary_dict=summary_dict)
    t1 = time.time()
    t = (t1 - t0)
    summary_dict[2.2] = {
        'name': 'Running time for LDpred-inf:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }
    reporting.print_summary(summary_dict, 'Summary of LDpred-inf')
示例#6
0
def main(p_dict):
    """Compute LD information, run the LDpred Gibbs sampler, print a summary.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'cf', 'out', 'ldf',
            'ldr', 'debug', 'no_ld_compression', 'hickle_ld', 'f', 'N',
            'n_iter', 'n_burn_in' and 'h2'.
    """
    def _runtime_text(seconds):
        # Elapsed time formatted as shown in the summary report.
        return '%d min and %0.2f secs'% (seconds / 60, seconds % 60)

    summary_dict = {
        0: {'name':'Coordinated data filename','value':p_dict['cf']},
        0.1: {'name':'SNP weights output file (prefix)', 'value':p_dict['out']},
        0.2: {'name':'LD data filename (prefix)', 'value':p_dict['ldf']},
        1: {'name':'LD radius used','value':str(p_dict['ldr'])},
    }

    # Stage 1: LD information.
    stage_start = time.time()
    summary_dict[1.09] = {'name':'dash', 'value':'LD information'}
    ld_dict = ld.get_ld_dict(
        p_dict['cf'],
        p_dict['ldf'],
        p_dict['ldr'],
        verbose=p_dict['debug'],
        compressed=not p_dict['no_ld_compression'],
        use_hickle=p_dict['hickle_ld'],
        summary_dict=summary_dict,
    )
    stage_secs = time.time() - stage_start
    summary_dict[1.2] = {
        'name':'Running time for calculating LD information:',
        'value': _runtime_text(stage_secs),
    }

    # Stage 2: Gibbs sampler.
    stage_start = time.time()
    summary_dict[1.9] = {'name':'dash', 'value':'LDpred Gibbs sampler'}
    ldpred_genomewide(
        data_file=p_dict['cf'],
        out_file_prefix=p_dict['out'],
        ps=p_dict['f'],
        ld_radius=p_dict['ldr'],
        ld_dict=ld_dict,
        n=p_dict['N'],
        num_iter=p_dict['n_iter'],
        burn_in=p_dict['n_burn_in'],
        h2=p_dict['h2'],
        verbose=p_dict['debug'],
        summary_dict=summary_dict,
    )
    stage_secs = time.time() - stage_start
    summary_dict[2.2] = {
        'name':'Running time for Gibbs sampler(s):',
        'value': _runtime_text(stage_secs),
    }

    reporting.print_summary(summary_dict, 'Summary of LDpred Gibbs')
示例#7
0
def main(p_dict):
    """Run LD-pruning + thresholding (P+T) and print a summary of the run.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'cf', 'out' and 'ldr'.
            The whole dictionary is also handed to ``run_pt``.
    """
    summary_dict = {}
    summary_dict[0] = {
        'name': 'Coordinated data filename',
        'value': p_dict['cf']
    }
    summary_dict[0.1] = {
        'name': 'SNP weights output file (prefix)',
        'value': p_dict['out']
    }
    summary_dict[1] = {'name': 'LD radius used', 'value': str(p_dict['ldr'])}
    t0 = time.time()
    summary_dict[1.1] = {'name': 'dash', 'value': 'LD-pruning + Thresholding'}
    run_pt(p_dict, summary_dict)
    t1 = time.time()
    t = (t1 - t0)
    summary_dict[2] = {
        'name': 'Running time for calculating P+T:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }

    # Title spelling fixed ('Tresholding' -> 'Thresholding') to match the
    # section header used above.
    reporting.print_summary(summary_dict,
                            'Summary of LD-pruning + Thresholding')
示例#8
0
def main(p_dict):
    """Calculate polygenic risk scores for every SNP-weights file found.

    Unless ``only_score`` is set, phenotypes are parsed first (from
    ``pf`` if given, otherwise from ``gf`` + '.fam'), optionally extended with
    covariates and PCs.  Then, depending on ``rf_format`` ('LDPRED', 'P+T' or
    'ANY'), each existing weights file derived from the ``rf`` prefix is
    scored with ``calc_risk_scores`` and the best unadjusted R2 per method is
    recorded in the printed summary.

    P+T score files and result keys include both the r2 and the p-value
    threshold, so runs with several r2 thresholds no longer overwrite each
    other.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'debug', 'gf', 'rf',
            'out', 'only_score', 'pf', 'pf_format', 'cov_file', 'pcs_file',
            'rf_format', 'split_by_chrom', 'f', 'r2' and 'p'.

    Raises:
        Exception: If phenotypes are required but neither 'pf' nor 'gf' is set.
        AssertionError: If no phenotypes were parsed, or if no SNP weights
            file was found for the requested format(s).
    """
    summary_dict = {}
    verbose = p_dict['debug']

    t0 = time.time()

    summary_dict[0] = {
        'name': 'Validation genotype file (prefix):',
        'value': p_dict['gf']
    }
    summary_dict[0.1] = {
        'name': 'Input weight file(s) (prefix):',
        'value': p_dict['rf']
    }
    summary_dict[0.2] = {
        'name': 'Output scores file(s) (prefix):',
        'value': p_dict['out']
    }

    adjust_for_pcs = False
    adjust_for_covs = False

    if not p_dict['only_score']:
        summary_dict[0.9] = {'name': 'dash', 'value': 'Phenotypes'}
        print('Parsing phenotypes')
        if p_dict['pf'] is None:
            if p_dict['gf'] is not None:
                # Fall back to the phenotype column of the PLINK .fam file.
                phen_map = parse_phen_file(p_dict['gf'] + '.fam',
                                           'FAM',
                                           verbose=verbose,
                                           summary_dict=summary_dict)
            else:
                raise Exception('Validation phenotypes were not found.')
        else:
            phen_map = parse_phen_file(p_dict['pf'],
                                       p_dict['pf_format'],
                                       verbose=verbose,
                                       summary_dict=summary_dict)
        t1 = time.time()
        t = (t1 - t0)
        summary_dict[1.1] = {
            'name': 'Individuals with phenotype information:',
            'value': len(phen_map)
        }
        summary_dict[1.2] = {
            'name': 'Running time for parsing phenotypes:',
            'value': '%d min and %0.2f secs' % (t / 60, t % 60)
        }

        if p_dict['cov_file'] is not None:
            adjust_for_covs = True
            if verbose:
                print('Parsing additional covariates')

            # Covariate file: first column is the individual ID, the
            # remaining columns are numeric covariates.
            with open(p_dict['cov_file'], 'r') as f:
                num_missing = 0
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Skip blank lines instead of failing on fields[0].
                        continue
                    iid = fields[0]
                    if iid in phen_map:
                        covariates = list(map(float, fields[1:]))
                        phen_map[iid]['covariates'] = covariates
                    else:
                        num_missing += 1
                if num_missing > 0:
                    summary_dict[2.1] = {
                        'name': 'Individuals w missing covariate information:',
                        'value': num_missing
                    }
                    if verbose:
                        print('Unable to find %d iids in phen file!' %
                              num_missing)
            summary_dict[2] = {
                'name': 'Parsed covariates file:',
                'value': p_dict['cov_file']
            }

        if p_dict['pcs_file']:
            adjust_for_pcs = True
            if verbose:
                print('Parsing PCs')

            # PCs file: second column is the individual ID, the PCs start at
            # the third column.
            with open(p_dict['pcs_file'], 'r') as f:
                num_missing = 0
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Skip blank lines instead of failing on fields[1].
                        continue
                    iid = fields[1]
                    if iid in phen_map:
                        pcs = list(map(float, fields[2:]))
                        phen_map[iid]['pcs'] = pcs
                    else:
                        num_missing += 1
                if num_missing > 0:
                    summary_dict[3.1] = {
                        'name': 'Individuals w missing PCs:',
                        'value': num_missing
                    }
                    if verbose:
                        print('Unable to find %d iids in phen file!' %
                              num_missing)
            summary_dict[3] = {
                'name': 'Parsed PCs file:',
                'value': p_dict['pcs_file']
            }

        num_individs = len(phen_map)
        assert num_individs > 0, 'No phenotypes were found!'
    else:
        phen_map = None

    # Restart the clock: from here on only the scoring itself is timed.
    t0 = time.time()
    prs_file_is_missing = True
    res_dict = {}
    if p_dict['rf_format'] == 'LDPRED' or p_dict['rf_format'] == 'ANY':
        weights_file = '%s_LDpred-inf.txt' % (p_dict['rf'])

        if os.path.isfile(weights_file):
            print('')
            print('Calculating LDpred-inf risk scores')
            rs_id_map = parse_ldpred_res(weights_file)
            out_file = '%s_LDpred-inf.txt' % (p_dict['out'])
            res_dict['LDpred_inf'] = calc_risk_scores(
                p_dict['gf'],
                rs_id_map,
                phen_map,
                out_file=out_file,
                split_by_chrom=p_dict['split_by_chrom'],
                adjust_for_pcs=adjust_for_pcs,
                adjust_for_covariates=adjust_for_covs,
                only_score=p_dict['only_score'],
                verbose=verbose,
                summary_dict=summary_dict)
            if not p_dict['only_score']:
                summary_dict[5.2] = {
                    'name': 'LDpred_inf (unadjusted) Pearson R2:',
                    'value': '%0.4f' % res_dict['LDpred_inf']['pred_r2']
                }
            prs_file_is_missing = False

        best_ldpred_pred_r2 = 0
        best_p = None
        for p in p_dict['f']:
            weights_file = '%s_LDpred_p%0.4e.txt' % (p_dict['rf'], p)
            if os.path.isfile(weights_file):
                print('')
                print('Calculating LDpred risk scores using f=%0.3e' % p)
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_LDpred_p%0.4e.txt' % (p_dict['out'], p)
                method_str = 'LDpred_p%0.4e' % (p)
                res_dict[method_str] = calc_risk_scores(
                    p_dict['gf'],
                    rs_id_map,
                    phen_map,
                    out_file=out_file,
                    split_by_chrom=p_dict['split_by_chrom'],
                    adjust_for_pcs=adjust_for_pcs,
                    adjust_for_covariates=adjust_for_covs,
                    only_score=p_dict['only_score'],
                    verbose=verbose,
                    summary_dict=summary_dict)
                if len(res_dict[method_str]) and (
                        res_dict[method_str]['pred_r2']) > best_ldpred_pred_r2:
                    best_ldpred_pred_r2 = res_dict[method_str]['pred_r2']
                    best_p = p

                prs_file_is_missing = False
        if best_ldpred_pred_r2 > 0 and not p_dict['only_score']:
            summary_dict[5.3] = {
                'name': 'Best LDpred (f=%0.2e) (unadjusted) R2:' % (best_p),
                'value': '%0.4f' % best_ldpred_pred_r2
            }

        # Plot results?

    if p_dict['rf_format'] == 'P+T' or p_dict['rf_format'] == 'ANY':

        best_pt_pred_r2 = 0
        best_t = None
        best_r2 = None
        for max_r2 in p_dict['r2']:
            for p_thres in p_dict['p']:
                weights_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (p_dict['rf'],
                                                             max_r2, p_thres)
                if os.path.isfile(weights_file):
                    print(
                        'Calculating P+T risk scores using p-value threshold of %0.3e, and r2 threshold of %0.2f'
                        % (p_thres, max_r2))
                    rs_id_map, non_zero_chromosomes = parse_pt_res(
                        weights_file)
                    if len(rs_id_map) > 0:
                        # Include the r2 threshold in the output name and the
                        # result key; keyed by p-value alone, every r2 value
                        # would overwrite the previous one's file and entry.
                        out_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (
                            p_dict['out'], max_r2, p_thres)
                        method_str = 'P+T_r%0.2f_p%0.4e' % (max_r2, p_thres)
                        res_dict[method_str] = calc_risk_scores(
                            p_dict['gf'],
                            rs_id_map,
                            phen_map,
                            out_file=out_file,
                            split_by_chrom=p_dict['split_by_chrom'],
                            non_zero_chromosomes=non_zero_chromosomes,
                            adjust_for_pcs=adjust_for_pcs,
                            adjust_for_covariates=adjust_for_covs,
                            only_score=p_dict['only_score'],
                            verbose=verbose,
                            summary_dict=summary_dict)
                        if len(res_dict[method_str]) and (res_dict[method_str][
                                'pred_r2']) > best_pt_pred_r2:
                            best_pt_pred_r2 = res_dict[method_str]['pred_r2']
                            best_t = p_thres
                            best_r2 = max_r2
                    else:
                        print(
                            'No SNPs found with p-values below the given threshold.'
                        )
                    prs_file_is_missing = False
        if best_pt_pred_r2 > 0 and not p_dict['only_score']:
            summary_dict[5.4] = {
                'name':
                'Best P+T (r2=%0.2f, p=%0.2e) (unadjusted) R2:' %
                (best_r2, best_t),
                'value':
                '%0.4f' % best_pt_pred_r2
            }

    # Plot results?
    assert not prs_file_is_missing, 'No SNP weights file was found.  A prefix to these should be provided via the --rf flag. Note that the prefix should exclude the _LDpred_.. extension or file ending. '

    #Identifying the best prediction
    if not p_dict['only_score']:
        best_pred_r2 = 0
        best_method_str = None
        for method_str in res_dict:
            if len(res_dict[method_str]) and (
                    res_dict[method_str]['pred_r2']) > best_pred_r2:
                best_pred_r2 = res_dict[method_str]['pred_r2']
                best_method_str = method_str
        if best_method_str is not None:
            print(
                'The highest (unadjusted) Pearson R2 was %0.4f, and provided by %s'
                % (best_pred_r2, best_method_str))
            summary_dict[5.99] = {
                'name': 'dash',
                'value': 'Optimal polygenic score'
            }
            summary_dict[6] = {
                'name': 'Method with highest (unadjusted) Pearson R2:',
                'value': best_method_str
            }
            summary_dict[6.1] = {
                'name': 'Best (unadjusted) Pearson R2:',
                'value': '%0.4f' % best_pred_r2
            }
    t1 = time.time()
    t = (t1 - t0)
    summary_dict[4.9] = {'name': 'dash', 'value': 'Scoring'}
    summary_dict[5.9] = {
        'name': 'Running time for calculating scores:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }

    # Only reachable when assertions are disabled (python -O).
    if prs_file_is_missing:
        print(
            'SNP weights files were not found.  This could be due to a mis-specified --rf flag, or other issues.'
        )

    reporting.print_summary(summary_dict, 'Scoring Summary')
示例#9
0
def main(p_dict):
    """Calculate polygenic risk scores for every SNP-weights file found.

    Unless ``only_score`` is set, phenotypes are parsed first (from ``pf`` if
    given, otherwise from ``gf`` + '.fam') and optionally extended with
    covariates and PCs.  Depending on ``rf_format`` ('LDPRED', 'P+T' or
    'ANY'), each existing weights file derived from the ``rf`` prefix is then
    scored with ``calc_risk_scores``; the best unadjusted R2 values are added
    to the printed summary.

    P+T score files and result keys include both the r2 and the p-value
    threshold, so several r2 thresholds no longer overwrite one another.

    Args:
        p_dict: Parameter dictionary.  Keys read here: 'debug', 'gf', 'rf',
            'out', 'only_score', 'pf', 'pf_format', 'cov_file', 'pcs_file',
            'rf_format', 'split_by_chrom', 'f', 'r2' and 'p'.

    Raises:
        Exception: If phenotypes are required but neither 'pf' nor 'gf' is set.
        AssertionError: If no phenotypes were parsed, or if no SNP weights
            file was found for the requested format(s).
    """
    summary_dict = {}
    verbose = p_dict['debug']

    t0 = time.time()

    summary_dict[0]={'name':'Validation genotype file (prefix):','value':p_dict['gf']}
    summary_dict[0.1]={'name':'Input weight file(s) (prefix):','value':p_dict['rf']}
    summary_dict[0.2]={'name':'Output scores file(s) (prefix):','value':p_dict['out']}

    adjust_for_pcs=False
    adjust_for_covs=False

    if not p_dict['only_score']:
        summary_dict[0.9]={'name':'dash', 'value':'Phenotypes'}
        print('Parsing phenotypes')
        if p_dict['pf'] is None:
            if p_dict['gf'] is not None:
                # Fall back to the phenotype column of the PLINK .fam file.
                phen_map = parse_phen_file(p_dict['gf'] + '.fam', 'FAM', verbose=verbose, summary_dict=summary_dict)
            else:
                raise Exception('Validation phenotypes were not found.')
        else:
            phen_map = parse_phen_file(p_dict['pf'], p_dict['pf_format'], verbose=verbose, summary_dict=summary_dict)
        t1 = time.time()
        t = (t1 - t0)
        summary_dict[1.1]={'name':'Individuals with phenotype information:','value':len(phen_map)}
        summary_dict[1.2]={'name':'Running time for parsing phenotypes:','value':'%d min and %0.2f secs'% (t / 60, t % 60)}

        if p_dict['cov_file'] is not None:
            adjust_for_covs=True
            if verbose:
                print('Parsing additional covariates')

            # Covariate file: column 1 is the individual ID, the rest are
            # numeric covariates.
            with open(p_dict['cov_file'], 'r') as f:
                num_missing = 0
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Skip blank lines instead of failing on fields[0].
                        continue
                    iid = fields[0]
                    if iid in phen_map:
                        covariates = list(map(float, fields[1:]))
                        phen_map[iid]['covariates'] = covariates
                    else:
                        num_missing += 1
                if num_missing > 0:
                    summary_dict[2.1]={'name':'Individuals w missing covariate information:','value':num_missing}
                    if verbose:
                        print('Unable to find %d iids in phen file!' % num_missing)
            summary_dict[2]={'name':'Parsed covariates file:','value':p_dict['cov_file']}

        if p_dict['pcs_file']:
            adjust_for_pcs=True
            if verbose:
                print('Parsing PCs')

            # PCs file: column 2 is the individual ID, PCs start at column 3.
            with open(p_dict['pcs_file'], 'r') as f:
                num_missing = 0
                for line in f:
                    fields = line.split()
                    if not fields:
                        # Skip blank lines instead of failing on fields[1].
                        continue
                    iid = fields[1]
                    if iid in phen_map:
                        pcs = list(map(float, fields[2:]))
                        phen_map[iid]['pcs'] = pcs
                    else:
                        num_missing += 1
                if num_missing > 0:
                    summary_dict[3.1]={'name':'Individuals w missing PCs:','value':num_missing}
                    if verbose:
                        print('Unable to find %d iids in phen file!' % num_missing)
            summary_dict[3]={'name':'Parsed PCs file:','value':p_dict['pcs_file']}

        num_individs = len(phen_map)
        assert num_individs > 0, 'No phenotypes were found!'
    else:
        phen_map = None

    # Restart the clock: from here on only the scoring itself is timed.
    t0 = time.time()
    prs_file_is_missing = True
    res_dict = {}
    if p_dict['rf_format'] == 'LDPRED' or p_dict['rf_format']=='ANY':
        weights_file = '%s_LDpred-inf.txt' % (p_dict['rf'])

        if os.path.isfile(weights_file):
            print('')
            print('Calculating LDpred-inf risk scores')
            rs_id_map = parse_ldpred_res(weights_file)
            out_file = '%s_LDpred-inf.txt' % (p_dict['out'])
            res_dict['LDpred_inf'] = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map, out_file=out_file, 
                             split_by_chrom=p_dict['split_by_chrom'],
                             adjust_for_pcs=adjust_for_pcs,
                             adjust_for_covariates=adjust_for_covs,
                             only_score=p_dict['only_score'],
                             verbose=verbose, summary_dict=summary_dict)
            if not p_dict['only_score']:
                summary_dict[5.2]={'name':'LDpred_inf (unadjusted) Pearson R2:','value':'%0.4f'%res_dict['LDpred_inf']['pred_r2']}
            prs_file_is_missing = False

        best_ldpred_pred_r2 = 0
        best_p = None
        for p in p_dict['f']:
            weights_file = '%s_LDpred_p%0.4e.txt' % (p_dict['rf'], p)
            if os.path.isfile(weights_file):
                print('')
                print('Calculating LDpred risk scores using f=%0.3e' % p)
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_LDpred_p%0.4e.txt' % (p_dict['out'], p)
                method_str = 'LDpred_p%0.4e' % (p)
                res_dict[method_str] = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map, out_file=out_file,
                                                        split_by_chrom=p_dict['split_by_chrom'],
                                                        adjust_for_pcs=adjust_for_pcs,
                                                        adjust_for_covariates=adjust_for_covs,
                                                        only_score=p_dict['only_score'],
                                                        verbose=verbose, summary_dict=summary_dict)
                if len(res_dict[method_str]) and (res_dict[method_str]['pred_r2']) >best_ldpred_pred_r2:
                    best_ldpred_pred_r2 = res_dict[method_str]['pred_r2']
                    best_p = p

                prs_file_is_missing=False
        if best_ldpred_pred_r2>0 and not p_dict['only_score']:
            summary_dict[5.3]={'name':'Best LDpred (f=%0.2e) (unadjusted) R2:'%(best_p),'value':'%0.4f'%best_ldpred_pred_r2}

        # Plot results?

    if p_dict['rf_format'] == 'P+T' or p_dict['rf_format']=='ANY':

        best_pt_pred_r2 = 0
        best_t = None
        best_r2 = None
        for max_r2 in p_dict['r2']:
            for p_thres in p_dict['p']:
                weights_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (p_dict['rf'], max_r2, p_thres)
                if os.path.isfile(weights_file):
                    print('')
                    print('Calculating P+T risk scores using p-value threshold of %0.3e, and r2 threshold of %0.2f' % (p_thres, max_r2))
                    rs_id_map, non_zero_chromosomes = parse_pt_res(weights_file)
                    # Include the r2 threshold in the output name and the
                    # result key; keyed by p-value alone, every r2 value would
                    # overwrite the previous one's file and res_dict entry.
                    out_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (p_dict['out'], max_r2, p_thres)
                    method_str = 'P+T_r%0.2f_p%0.4e' % (max_r2, p_thres)
                    res_dict[method_str] = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map, out_file=out_file,
                                                            split_by_chrom=p_dict['split_by_chrom'],
                                                            non_zero_chromosomes=non_zero_chromosomes, 
                                                            adjust_for_pcs=adjust_for_pcs,
                                                            adjust_for_covariates=adjust_for_covs,
                                                            only_score=p_dict['only_score'],
                                                            verbose=verbose, summary_dict=summary_dict)
                    if len(res_dict[method_str]) and (res_dict[method_str]['pred_r2']) >best_pt_pred_r2:
                        best_pt_pred_r2 = res_dict[method_str]['pred_r2']
                        best_t = p_thres
                        best_r2 = max_r2
                    prs_file_is_missing=False
        if best_pt_pred_r2>0 and not p_dict['only_score']:
            summary_dict[5.4]={'name':'Best P+T (r2=%0.2f, p=%0.2e) (unadjusted) R2:'%(best_r2, best_t),'value':'%0.4f'%best_pt_pred_r2}

    # Plot results?
    assert not prs_file_is_missing, 'No SNP weights file was found.  A prefix to these should be provided via the --rf flag. Note that the prefix should exclude the _LDpred_.. extension or file ending. '

    #Identifying the best prediction
    if not p_dict['only_score']:
        best_pred_r2 = 0
        best_method_str = None
        for method_str in res_dict:
            if len(res_dict[method_str]) and (res_dict[method_str]['pred_r2']) >best_pred_r2:
                best_pred_r2 = res_dict[method_str]['pred_r2']
                best_method_str = method_str
        if best_method_str is not None:
            print('The highest (unadjusted) Pearson R2 was %0.4f, and provided by %s'%(best_pred_r2,best_method_str))
            summary_dict[5.99]={'name':'dash', 'value':'Optimal polygenic score'}
            summary_dict[6]={'name':'Method with highest (unadjusted) Pearson R2:','value':best_method_str}
            summary_dict[6.1]={'name':'Best (unadjusted) Pearson R2:','value':'%0.4f'%best_pred_r2}
    t1 = time.time()
    t = (t1 - t0)
    summary_dict[4.9]={'name':'dash', 'value':'Scoring'}
    summary_dict[5.9]={'name':'Running time for calculating scores:','value':'%d min and %0.2f secs'% (t / 60, t % 60)}

    # Only reachable when assertions are disabled (python -O).
    if prs_file_is_missing:
        print('SNP weights files were not found.  This could be due to a mis-specified --rf flag, or other issues.')

    reporting.print_summary(summary_dict,'Scoring Summary')
示例#10
0
def main(p_dict):
    """Score validation genotypes with every SNP-weights file that can be found.

    Reads the following keys from ``p_dict``: ``summary_file``, ``only_score``,
    ``debug``, ``gf``, ``rf``, ``out``, ``pf``, ``pf_format``, ``cov_file``,
    ``pcs_file``, ``rf_format``, ``f``, ``r2``, ``p`` and ``split_by_chrom``.

    Steps:
      1. Unless ``only_score`` is set, parse phenotypes (from ``pf`` or, when
         that is None, from ``<gf>.fam``) and optionally covariates / PCs.
      2. For ``rf_format`` 'LDPRED' or 'ANY', look for ``<rf>_LDpred-inf.txt``,
         ``<rf>_LDpred_p*.txt`` and ``<rf>_LDpred_fast_p*.txt`` (one per value
         in ``f``); for 'P+T' or 'ANY', look for ``<rf>_P+T_r*_p*.txt`` (one
         per combination of ``r2`` and ``p``).  Every file found is scored via
         ``calc_risk_scores`` and the result is stored in a per-method dict.
      3. Optionally write a per-method results table to ``summary_file``.
      4. Record the best-performing method and timing in the summary and print
         it through ``reporting.print_summary``.

    Result dicts returned by ``calc_risk_scores`` are treated as possibly empty
    (they are always checked with ``len()`` before their statistics are read);
    empty results are skipped when reporting.

    Raises:
        AssertionError: if ``summary_file`` is given together with
            ``only_score``, if no phenotypes were parsed, or if no SNP weights
            file was found for the requested ``rf_format``.
        Exception: if phenotypes are needed but neither ``pf`` nor ``gf`` is
            available.
    """
    assert p_dict['summary_file'] is None or not p_dict[
        'only_score'], 'Prediction summary file cannot be produced when the --only-score flag is set.'

    summary_dict = {}
    verbose = p_dict['debug']

    t0 = time.time()

    summary_dict[0] = {
        'name': 'Validation genotype file (prefix):',
        'value': p_dict['gf']
    }
    summary_dict[0.1] = {
        'name': 'Input weight file(s) (prefix):',
        'value': p_dict['rf']
    }
    summary_dict[0.2] = {
        'name': 'Output scores file(s) (prefix):',
        'value': p_dict['out']
    }

    adjust_for_pcs = False
    adjust_for_covs = False

    if not p_dict['only_score']:
        summary_dict[0.9] = {'name': 'dash', 'value': 'Phenotypes'}
        if verbose:
            print('Parsing phenotypes')
        if p_dict['pf'] is None:
            # No dedicated phenotype file: fall back to the PLINK FAM file.
            if p_dict['gf'] is not None:
                phen_map = parse_phen_file(p_dict['gf'] + '.fam',
                                           'FAM',
                                           verbose=verbose,
                                           summary_dict=summary_dict)
            else:
                raise Exception('Validation phenotypes were not found.')
        else:
            phen_map = parse_phen_file(p_dict['pf'],
                                       p_dict['pf_format'],
                                       verbose=verbose,
                                       summary_dict=summary_dict)
        t1 = time.time()
        t = (t1 - t0)
        summary_dict[1.1] = {
            'name': 'Individuals with phenotype information:',
            'value': len(phen_map)
        }
        summary_dict[1.2] = {
            'name': 'Running time for parsing phenotypes:',
            'value': '%d min and %0.2f secs' % (t / 60, t % 60)
        }

        if p_dict['cov_file'] is not None:
            adjust_for_covs = True
            if verbose:
                print('Parsing additional covariates')
            parse_covariates(p_dict, phen_map, summary_dict, verbose)

        if p_dict['pcs_file']:
            adjust_for_pcs = True
            if verbose:
                print('Parsing PCs')
            parse_pcs(p_dict, phen_map, summary_dict, verbose)

        num_individs = len(phen_map)
        assert num_individs > 0, 'No phenotypes were found!'
    else:
        phen_map = None

    def score_weights(rs_id_map, out_file, **extra_kwargs):
        # Single place for the calc_risk_scores call shared by all methods;
        # extra_kwargs carries method-specific options (P+T only).
        return calc_risk_scores(
            p_dict['gf'],
            rs_id_map,
            phen_map,
            out_file=out_file,
            split_by_chrom=p_dict['split_by_chrom'],
            adjust_for_pcs=adjust_for_pcs,
            adjust_for_covariates=adjust_for_covs,
            only_score=p_dict['only_score'],
            verbose=verbose,
            summary_dict=summary_dict,
            **extra_kwargs)

    # Restart the clock so that the reported scoring time excludes parsing.
    t0 = time.time()
    prs_file_is_missing = True
    res_dict = {}
    if p_dict['rf_format'] == 'LDPRED' or p_dict['rf_format'] == 'ANY':
        weights_file = '%s_LDpred-inf.txt' % (p_dict['rf'])

        if os.path.isfile(weights_file):
            print('')
            print('Calculating LDpred-inf risk scores')
            rs_id_map = parse_ldpred_res(weights_file)
            out_file = '%s_LDpred-inf.txt' % (p_dict['out'])
            inf_res = score_weights(rs_id_map, out_file)
            res_dict['LDpred_inf'] = inf_res
            # Same emptiness guard as the other methods, so an empty result
            # does not raise KeyError here.
            if not p_dict['only_score'] and len(inf_res):
                summary_dict[5.2] = {
                    'name': 'LDpred_inf (unadjusted) Pearson R2:',
                    'value': '%0.4f' % inf_res['pred_r2']
                }
            prs_file_is_missing = False

        # LDpred and LDpred-fast only differ in the file tag, the label that
        # is printed and the summary slot, so they share one loop.
        ldpred_variants = (
            ('LDpred', 'LDpred', 5.3),
            ('LDpred_fast', 'LDpred-fast', 5.4),
        )
        for file_tag, label, summary_key in ldpred_variants:
            best_variant_r2 = 0
            best_p = None
            for p in p_dict['f']:
                weights_file = '%s_%s_p%0.4e.txt' % (p_dict['rf'], file_tag, p)
                if not os.path.isfile(weights_file):
                    continue
                print('')
                print('Calculating %s risk scores using f=%0.3e' % (label, p))
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_%s_p%0.4e.txt' % (p_dict['out'], file_tag, p)
                method_str = '%s_p%0.4e' % (file_tag, p)
                res = score_weights(rs_id_map, out_file)
                res_dict[method_str] = res
                if len(res) and res['pred_r2'] > best_variant_r2:
                    best_variant_r2 = res['pred_r2']
                    best_p = p

                prs_file_is_missing = False
            if best_variant_r2 > 0 and not p_dict['only_score']:
                summary_dict[summary_key] = {
                    'name':
                    'Best %s (f=%0.2e) (unadjusted) R2:' % (label, best_p),
                    'value': '%0.4f' % best_variant_r2
                }

    if p_dict['rf_format'] == 'P+T' or p_dict['rf_format'] == 'ANY':

        best_pt_pred_r2 = 0
        best_t = None
        best_r2 = None
        for max_r2 in p_dict['r2']:
            for p_thres in p_dict['p']:
                weights_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (p_dict['rf'],
                                                             max_r2, p_thres)
                if not os.path.isfile(weights_file):
                    continue
                print(
                    'Calculating P+T risk scores using p-value threshold of %0.3e, and r2 threshold of %0.2f'
                    % (p_thres, max_r2))
                rs_id_map, non_zero_chromosomes = parse_pt_res(weights_file)
                if len(rs_id_map) > 0:
                    out_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (
                        p_dict['out'], max_r2, p_thres)
                    method_str = 'P+T_r%0.2f_p%0.4e' % (max_r2, p_thres)
                    res = score_weights(
                        rs_id_map,
                        out_file,
                        non_zero_chromosomes=non_zero_chromosomes)
                    res_dict[method_str] = res
                    if len(res) and res['pred_r2'] > best_pt_pred_r2:
                        best_pt_pred_r2 = res['pred_r2']
                        best_t = p_thres
                        best_r2 = max_r2
                else:
                    print(
                        'No SNPs found with p-values below the given threshold.'
                    )
                # The file exists even if it held no usable SNPs.
                prs_file_is_missing = False
        if best_pt_pred_r2 > 0 and not p_dict['only_score']:
            summary_dict[5.5] = {
                'name':
                'Best P+T (r2=%0.2f, p=%0.2e) (unadjusted) R2:' %
                (best_r2, best_t),
                'value':
                '%0.4f' % best_pt_pred_r2
            }

    # Plot results?
    assert not prs_file_is_missing, 'No SNP weights file was found.  A prefix to these should be provided via the --rf flag. Note that the prefix should exclude the _LDpred_.. extension or file ending. '

    res_summary_file = p_dict['summary_file']
    if res_summary_file is not None and not p_dict['only_score']:
        with open(res_summary_file, 'w') as f:
            if verbose:
                print('Writing Results Summary to file %s' % res_summary_file)
            f.write('Pred_Method    Pred_corr    Pred_R2    SNPs_used\n')
            for method_str in sorted(res_dict):
                res = res_dict[method_str]
                if not len(res):
                    # No statistics were produced for this method; skip it
                    # rather than fail on the missing keys.
                    continue
                f.write('%s    %0.4f    %0.4f    %i\n' %
                        (method_str, res['corr_r2'], res['pred_r2'],
                         res['num_snps']))

    # Identifying the best prediction
    if not p_dict['only_score']:
        best_pred_r2 = 0
        best_method_str = None
        for method_str in res_dict:
            res = res_dict[method_str]
            if len(res) and res['pred_r2'] > best_pred_r2:
                best_pred_r2 = res['pred_r2']
                best_method_str = method_str

        if best_method_str is not None:
            print(
                'The highest (unadjusted) Pearson R2 was %0.4f, and provided by %s'
                % (best_pred_r2, best_method_str))
            summary_dict[5.99] = {
                'name': 'dash',
                'value': 'Optimal polygenic score'
            }
            summary_dict[6] = {
                'name': 'Method with highest (unadjusted) Pearson R2:',
                'value': best_method_str
            }
            summary_dict[6.1] = {
                'name': 'Best (unadjusted) Pearson R2:',
                'value': '%0.4f' % best_pred_r2
            }
            if verbose:
                best_res = res_dict[best_method_str]
                summary_dict[6.2] = {
                    'name': 'Number of SNPs used',
                    'value': '%d' % best_res['num_snps']
                }
                summary_dict[6.3] = {
                    'name': 'Number of SNPs flipped',
                    'value': '%d' % best_res['num_flipped_nts']
                }
                summary_dict[6.4] = {
                    'name': 'Fraction of SNPs not found in validation data',
                    'value': '%0.4f' % best_res['perc_missing']
                }
                summary_dict[6.5] = {
                    'name': 'Number of duplicated SNPs',
                    'value': '%d' % best_res['duplicated_snps']
                }
                summary_dict[6.6] = {
                    'name': 'Number of non-matching nucleotides SNPs',
                    'value': '%d' % best_res['num_non_matching_nts']
                }
    t1 = time.time()
    t = (t1 - t0)
    summary_dict[4.9] = {'name': 'dash', 'value': 'Scoring'}
    summary_dict[5.9] = {
        'name': 'Running time for calculating scores:',
        'value': '%d min and %0.2f secs' % (t / 60, t % 60)
    }

    # Only reachable when assertions are disabled (python -O); kept as a
    # fallback notice for that case.
    if prs_file_is_missing:
        print(
            'SNP weights files were not found.  This could be due to a mis-specified --rf flag, or other issues.'
        )

    reporting.print_summary(summary_dict, 'Scoring Summary')