Exemplo n.º 1
0
                                      b=load_list(
                                          args.individual_list_exclude))

    indiv_list = sorted(indiv_list)

    df_pheno = rearrange_rows(df_pheno, indiv_list)
    df_pheno = transpose_df(df_pheno, col='indiv')
    if df_covar is not None:
        df_covar = rearrange_rows(df_covar, indiv_list)
        df_covar.set_index('indiv', inplace=True)
    logging.info('There are {} individauls being included.'.format(
        df_pheno.shape[1]))

    logging.info('Loading genotypes.')
    pr = genotypeio.PlinkReader(args.geno_bed_prefix,
                                select_samples=df_pheno.columns,
                                dtype=np.int8)
    variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]
    genotype_df = pd.DataFrame(pr.load_genotypes(),
                               index=pr.bim['snp'],
                               columns=pr.fam['iid'])

    if args.map_trans_params is None:
        map_args = {}
    else:
        map_args = read_yaml(args.map_trans_params)
    logging.info('Arguments used for tensorqtl: ' + dict_to_str(map_args))

    logging.info('Start mapping.')
    pairs_df = trans.map_trans(genotype_df,
                               df_pheno,
Exemplo n.º 2
0
def _build_arg_parser():
    """Build the command-line argument parser for the tensorQTL entry point."""
    parser = argparse.ArgumentParser(
        description='tensorQTL: GPU-based QTL mapper')
    parser.add_argument('genotype_path', help='Genotypes in PLINK format')
    parser.add_argument('phenotype_bed', help='Phenotypes in BED format')
    parser.add_argument('prefix', help='Prefix for output file names')
    parser.add_argument(
        '--mode',
        default='cis',
        choices=['cis', 'cis_nominal', 'cis_independent', 'trans'],
        help='Mapping mode. Default: cis')
    parser.add_argument(
        '--covariates',
        default=None,
        help='Covariates file, tab-delimited, covariates x samples')
    parser.add_argument('--permutations',
                        type=int,
                        default=10000,
                        help='Number of permutations. Default: 10000')
    parser.add_argument('--interaction',
                        default=None,
                        type=str,
                        help='Interaction term')
    parser.add_argument(
        '--cis_output',
        default=None,
        type=str,
        help=
        "Output from 'cis' mode with q-values. Required for independent cis-QTL mapping."
    )
    parser.add_argument(
        '--phenotype_groups',
        default=None,
        type=str,
        help=
        'Phenotype groups. Header-less TSV with two columns: phenotype_id, group_id'
    )
    parser.add_argument('--window',
                        default=1000000,
                        type=np.int32,
                        help='Cis-window size, in bases. Default: 1000000.')
    parser.add_argument(
        '--pval_threshold',
        default=None,
        type=np.float64,
        help=
        'Output only significant phenotype-variant pairs with a p-value below threshold. Default: 1e-5 for trans-QTL'
    )
    parser.add_argument(
        '--maf_threshold',
        default=None,
        type=np.float64,
        help=
        'Include only genotypes with minor allele frequency >= maf_threshold. Default: 0'
    )
    parser.add_argument(
        '--maf_threshold_interaction',
        default=0.05,
        type=np.float64,
        help=
        'MAF threshold for interactions, applied to lower and upper half of samples'
    )
    parser.add_argument('--return_dense',
                        action='store_true',
                        help='Return dense output for trans-QTL.')
    parser.add_argument('--return_r2',
                        action='store_true',
                        help='Return r2 (only for sparse trans-QTL output)')
    parser.add_argument(
        '--best_only',
        action='store_true',
        help=
        'Only write lead association for each phenotype (interaction mode only)'
    )
    parser.add_argument(
        '--output_text',
        action='store_true',
        help=
        'Write output in txt.gz format instead of parquet (trans-QTL mode only)'
    )
    parser.add_argument(
        '--batch_size',
        type=int,
        default=20000,
        help='Batch size. Reduce this if encountering OOM errors.')
    parser.add_argument(
        '--load_split',
        action='store_true',
        help='Load genotypes into memory separately for each chromosome.')
    parser.add_argument('--fdr',
                        default=0.05,
                        type=np.float64,
                        help='FDR for cis-QTLs')
    parser.add_argument('--qvalue_lambda',
                        default=None,
                        type=np.float64,
                        help='lambda parameter for pi0est in qvalue.')
    parser.add_argument('--seed',
                        default=None,
                        type=int,
                        help='Seed for permutations.')
    parser.add_argument('-o',
                        '--output_dir',
                        default='.',
                        help='Output directory')
    return parser


def _read_headerless_series(path):
    """Read a header-less, tab-delimited file, using its first column as the index.

    A file with a single remaining column is returned as a Series.
    """
    # `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
    # `DataFrame.squeeze('columns')` is the documented replacement.
    return pd.read_csv(path, sep='\t', index_col=0,
                       header=None).squeeze('columns')


def main():
    """Command-line entry point: parse arguments, load inputs, run QTL mapping.

    Depending on ``--mode`` this runs ``cis.map_cis``, ``cis.map_nominal``,
    ``cis.map_independent`` or ``trans.map_trans`` and writes the results
    into ``--output_dir`` using ``prefix`` for the file names. Progress is
    written to a ``<prefix>.tensorQTL.<mode>.log`` file via ``SimpleLogger``.

    Raises:
        ValueError: if ``cis_independent`` mode is requested without an
            existing ``--cis_output`` file, if sample IDs of the covariates
            or interaction inputs do not match the phenotypes, or if the
            phenotype groups do not match the phenotype sort order.
    """
    args = _build_arg_parser().parse_args()

    # check inputs
    if args.mode == 'cis_independent' and (
            args.cis_output is None or not os.path.exists(args.cis_output)):
        raise ValueError("Output from 'cis' mode must be provided.")

    logger = SimpleLogger(
        os.path.join(args.output_dir,
                     args.prefix + '.tensorQTL.{}.log'.format(args.mode)))
    logger.write('[{}] Running TensorQTL: {}-QTL mapping'.format(
        datetime.now().strftime("%b %d %H:%M:%S"),
        args.mode.split('_')[0]))
    if torch.cuda.is_available():
        logger.write('  * using GPU ({})'.format(
            torch.cuda.get_device_name(torch.cuda.current_device())))
    else:
        logger.write('  * WARNING: using CPU!')
    if args.seed is not None:
        logger.write('  * using seed {}'.format(args.seed))

    # load inputs
    logger.write('  * reading phenotypes ({})'.format(args.phenotype_bed))
    phenotype_df, phenotype_pos_df = read_phenotype_bed(args.phenotype_bed)

    tss_dict = phenotype_pos_df.T.to_dict()
    if args.covariates is not None:
        logger.write('  * reading covariates ({})'.format(args.covariates))
        covariates_df = pd.read_csv(args.covariates, sep='\t', index_col=0).T
        # Explicit raise instead of `assert`, which is stripped under -O.
        if not np.all(phenotype_df.columns == covariates_df.index):
            raise ValueError(
                'Sample IDs in the covariates file do not match the phenotype file (check order).'
            )
    else:
        # --covariates is optional; bind the name so the calls below do not
        # fail with NameError.
        # NOTE(review): assumes the cis/trans mappers accept
        # covariates_df=None — confirm against their signatures.
        covariates_df = None

    if args.interaction is not None:
        logger.write('  * reading interaction term ({})'.format(
            args.interaction))
        interaction_s = _read_headerless_series(args.interaction)
        # Without covariates, fall back to the phenotype sample order (the two
        # are verified to be identical whenever covariates are provided).
        sample_ids = (covariates_df.index
                      if covariates_df is not None else phenotype_df.columns)
        if not sample_ids.isin(interaction_s.index).all():
            raise ValueError(
                'Interaction term file is missing values for some samples.')
        interaction_s = interaction_s.loc[sample_ids].astype(np.float32)
    else:
        interaction_s = None

    # Only trans mode consumes maf_threshold below; its implicit default
    # differs from the one used for the other modes.
    if args.maf_threshold is not None:
        maf_threshold = args.maf_threshold
    elif args.mode == 'trans':
        maf_threshold = 0.05
    else:
        maf_threshold = 0

    if args.phenotype_groups is not None:
        group_s = _read_headerless_series(args.phenotype_groups)
        # verify sort order: walking the phenotypes in file order, the number
        # of group changes must equal the number of distinct groups, i.e.
        # members of each group must be contiguous.
        group_dict = group_s.to_dict()
        previous_group = ''
        parsed_groups = 0
        for i in phenotype_df.index:
            if group_dict[i] != previous_group:
                parsed_groups += 1
                previous_group = group_dict[i]
        if parsed_groups != len(group_s.unique()):
            raise ValueError(
                'Groups defined in input do not match phenotype file (check sort order).'
            )
    else:
        group_s = None

    # load genotypes
    pr = genotypeio.PlinkReader(args.genotype_path,
                                select_samples=phenotype_df.columns,
                                dtype=np.int8)
    variant_df = pr.bim.set_index('snp')[['chrom', 'pos']]
    if args.mode != 'cis_nominal' or not args.load_split:  # load all genotypes into memory
        genotype_df = pd.DataFrame(pr.load_genotypes(),
                                   index=pr.bim['snp'],
                                   columns=pr.fam['iid'])

    if args.mode == 'cis':
        res_df = cis.map_cis(genotype_df,
                             variant_df,
                             phenotype_df,
                             phenotype_pos_df,
                             covariates_df,
                             group_s=group_s,
                             nperm=args.permutations,
                             window=args.window,
                             logger=logger,
                             seed=args.seed,
                             verbose=True)
        logger.write('  * writing output')
        out_file = os.path.join(args.output_dir,
                                args.prefix + '.cis_qtl.txt.gz')
        # Write the table without q-values first, so a result file exists
        # before the q-value step runs.
        res_df.to_csv(out_file, sep='\t', float_format='%.6g')
        if has_rpy2:
            # presumably annotates res_df in place (its return value is
            # unused here) — verify against calculate_qvalues
            calculate_qvalues(res_df,
                              fdr=args.fdr,
                              qvalue_lambda=args.qvalue_lambda,
                              logger=logger)
            res_df.to_csv(out_file, sep='\t', float_format='%.6g')

    elif args.mode == 'cis_nominal':
        # Keyword arguments shared by the all-at-once and per-chromosome calls.
        nominal_kwargs = dict(
            covariates_df=covariates_df,
            interaction_s=interaction_s,
            maf_threshold_interaction=args.maf_threshold_interaction,
            group_s=None,
            window=args.window,
            run_eigenmt=True,
            output_dir=args.output_dir,
            write_top=True,
            write_stats=not args.best_only,
            logger=logger,
            verbose=True)
        if not args.load_split:
            cis.map_nominal(genotype_df, variant_df, phenotype_df,
                            phenotype_pos_df, args.prefix, **nominal_kwargs)
        else:  # load genotypes for each chromosome separately
            top_dfs = []
            for chrom in pr.chrs:
                g, pos_s = pr.get_region(chrom)
                genotype_df = pd.DataFrame(
                    g, index=pos_s.index,
                    columns=pr.fam['iid'])[phenotype_df.columns]
                on_chrom = phenotype_pos_df['chr'] == chrom
                chr_df = cis.map_nominal(
                    genotype_df,
                    variant_df[variant_df['chrom'] == chrom],
                    phenotype_df[on_chrom],
                    phenotype_pos_df[on_chrom],
                    args.prefix,
                    **nominal_kwargs)
                top_dfs.append(chr_df)
            # The per-chromosome results are only combined and written when an
            # interaction term was supplied.
            if interaction_s is not None:
                top_df = pd.concat(top_dfs)
                top_df.to_csv(os.path.join(
                    args.output_dir,
                    '{}.cis_qtl_top_assoc.txt.gz'.format(args.prefix)),
                              sep='\t',
                              float_format='%.6g')

    elif args.mode == 'cis_independent':
        summary_df = pd.read_csv(args.cis_output, sep='\t', index_col=0)
        summary_df.rename(columns={
            'minor_allele_samples': 'ma_samples',
            'minor_allele_count': 'ma_count'
        },
                          inplace=True)
        res_df = cis.map_independent(genotype_df,
                                     variant_df,
                                     summary_df,
                                     phenotype_df,
                                     phenotype_pos_df,
                                     covariates_df,
                                     group_s=group_s,
                                     fdr=args.fdr,
                                     nperm=args.permutations,
                                     window=args.window,
                                     logger=logger,
                                     seed=args.seed,
                                     verbose=True)
        logger.write('  * writing output')
        out_file = os.path.join(
            args.output_dir, args.prefix + '.cis_independent_qtl.txt.gz')
        res_df.to_csv(out_file, sep='\t', index=False, float_format='%.6g')

    elif args.mode == 'trans':
        return_sparse = not args.return_dense
        pval_threshold = args.pval_threshold
        if pval_threshold is None and return_sparse:
            pval_threshold = 1e-5
            logger.write(
                '  * p-value threshold: {:.2g}'.format(pval_threshold))

        pairs_df = trans.map_trans(genotype_df,
                                   phenotype_df,
                                   covariates_df,
                                   interaction_s=interaction_s,
                                   return_sparse=return_sparse,
                                   pval_threshold=pval_threshold,
                                   maf_threshold=maf_threshold,
                                   batch_size=args.batch_size,
                                   return_r2=args.return_r2,
                                   logger=logger)

        logger.write('  * filtering out cis-QTLs (within +/-5Mb)')
        pairs_df = trans.filter_cis(pairs_df,
                                    tss_dict,
                                    variant_df,
                                    window=5000000)

        logger.write('  * writing output')
        if not args.output_text:
            pairs_df.to_parquet(
                os.path.join(args.output_dir,
                             args.prefix + '.trans_qtl_pairs.parquet'))
        else:
            out_file = os.path.join(args.output_dir,
                                    args.prefix + '.trans_qtl_pairs.txt.gz')
            pairs_df.to_csv(out_file,
                            sep='\t',
                            index=False,
                            float_format='%.6g')

    logger.write('[{}] Finished mapping'.format(
        datetime.now().strftime("%b %d %H:%M:%S")))
Exemplo n.º 3
0
# Input loading: phenotypes first, then the optional covariates and the
# optional conditioning-expression table, and finally the genotypes.
logger.write(f'  * reading phenotypes ({args.input})')
phenotype_df, phenotype_pos_df = read_phenotype_bed(args.input)

# Covariates are optional. The file is transposed after reading so that its
# index can be compared with the phenotype columns.
# NOTE(review): when args.cov is None, covariates_df is not assigned here;
# confirm it is defined earlier, since it is passed to the mapper below.
if args.cov is not None:
    logger.write(f'  * reading covariates ({args.cov})')
    covariates_df = pd.read_csv(args.cov, sep='\t', index_col=0).transpose()
    assert (phenotype_df.columns == covariates_df.index).all()

# Optional expression table used as a condition; its columns must line up
# with the phenotype columns.
if args.cond_exp is not None:
    express_df, express_pos_df = read_phenotype_bed(args.cond_exp)
    assert (phenotype_df.columns == express_df.columns).all()

# Genotypes come from the PLINK prefix, with the chromosomes in
# excluded_chr_list passed as `exclude_chrs`.
pr = genotypeio.PlinkReader(plink_prefix_path, exclude_chrs=excluded_chr_list)
genotype_df = pr.load_genotypes()
# Variant table indexed by SNP ID, keeping only chromosome and position.
variant_df = pr.bim.set_index('snp').loc[:, ['chrom', 'pos']]

if mode == 'cis':
    # cis-QTL: empirical p-values for phenotypes
    if args.cond_exp:  # gene expression as condition
        if excluded_chr_list:
            cis_df = cis_exon.map_cis(
                genotype_df,
                variant_df,
                phenotype_df.loc[phenotype_pos_df['chr'] == chr_id],
                phenotype_pos_df.loc[phenotype_pos_df['chr'] == chr_id],
                covariates_df=covariates_df,
                seed=args.seed,
                express_df=express_df,