Exemplo n.º 1
0
def _add_gwas_options(parser):
    """Register every genome-wide analysis option on *parser*.

    Options are grouped by the task they configure.  Every destination that
    starts with ``filt_`` is later forwarded to the GWAS program object as a
    genotype filter (see ``_apply_genotype_filters``).
    """

    # --- program and input files ---------------------------------------
    parser.add_option("--program",
                      dest="program",
                      type="choice",
                      choices=["plink2", "gcta", "plinkdev"],
                      help="program to execute genome-wide analysis")

    parser.add_option("--input-file-pattern",
                      dest="infile_pattern",
                      type="string",
                      help="file prefix that identifies a group of files")

    parser.add_option("--input-file-format",
                      dest="file_format",
                      type="choice",
                      choices=[
                          "plink", "plink_binary", "oxford", "oxford_binary",
                          "vcf", "GRM_binary", "GRM_gz", "GRM_plink"
                      ],
                      help="format of input files")

    # --- phenotypes and covariates -------------------------------------
    parser.add_option("--phenotypes-file",
                      dest="pheno_file",
                      type="string",
                      help="text file of additional phenotypes")

    parser.add_option("--pheno",
                      dest="pheno",
                      type="string",
                      help="either phenotype file column header or number")

    parser.add_option("--covariates-file",
                      dest="covariate_file",
                      type="string",
                      help="file containing covariates.  Used as the "
                      "continuous covariates in GCTA-based analyses")

    parser.add_option("--covariate-column",
                      dest="covar_col",
                      type="string",
                      help="column number(s) or header(s) to include in "
                      "association model")

    parser.add_option("--discrete-covariates-file",
                      dest="covariate_discrete",
                      type="string",
                      help="file containing discrete covariates "
                      "to adjust for in GCTA-based analyses")

    # --- analysis selection --------------------------------------------
    parser.add_option("--association-model",
                      dest="assoc_model",
                      type="choice",
                      choices=["recessive", "dominant", "genotype"],
                      help="model to report from association analysis")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=[
                          "association", "summary", "format", "matrix", "reml",
                          "bivariate_reml", "pca", "lmm", "simulation",
                          "epistasis", "ld", "estimate_haplotypes"
                      ],
                      help="method to apply to genome-wide data")

    # --- REML / linear mixed models ------------------------------------
    parser.add_option("--reml-method",
                      dest="reml_method",
                      type="choice",
                      choices=[
                          "standard", "priors", "reml_algorithm",
                          "unconstrained", "GxE", "LRT", "BLUP_EBV", "snpBLUP",
                          "no_residual", "fixed_cor"
                      ],
                      help="method for REML estimate of heritability method "
                      "including either single or dual phenotypes")

    parser.add_option("--reml-parameters",
                      dest="reml_param",
                      type="string",
                      help="comma separated list of parameters to pass to "
                      "REML variance components analysis")

    parser.add_option("--prevalence",
                      dest="prevalence",
                      type="float",
                      help="binary trait prevalence in a cohort study. "
                      "Used to estimate h2 on the liability threshold "
                      "scale.")

    parser.add_option("--lmm-method",
                      dest="lmm_method",
                      type="choice",
                      choices=["standard", "loco", "no_covar"],
                      help="type of linear mixed model analysis to run")

    parser.add_option("--grm-prefix",
                      dest="grm_prefix",
                      type="string",
                      help="prefix of the pre-computed GRM files to use "
                      "in the linear mixed model analysis")

    # --- epistasis -----------------------------------------------------
    parser.add_option(
        "--epistasis-method",
        dest="epi_method",
        type="choice",
        choices=["fast_epistasis", "epistasis", "two_locus", "adjusted"],
        help="epistasis method to use")

    parser.add_option("--epistasis-parameter",
                      dest="epi_param",
                      type="string",
                      help="modifiers of epistasis functions")

    parser.add_option("--epistasis-threshold",
                      dest="epi_sig",
                      type="string",
                      help="statistical significance threshold for counting "
                      "interactions")

    parser.add_option("--epistasis-report-threshold",
                      dest="epi_report",
                      type="string",
                      help="threshold used to count the "
                      "proportion of statistically significant interactions")

    parser.add_option("--set-file",
                      dest="set_file",
                      type="string",
                      help="file containing variant sets as per Plink "
                      ".set file specification")

    parser.add_option("--set-method",
                      dest="set_method",
                      type="choice",
                      choices=["set-by-all", "set-by-set"],
                      help="set method to use when `set_file` provided")

    # --- PCA and relationship/distance matrices ------------------------
    parser.add_option("--principal-components",
                      dest="num_pcs",
                      type="int",
                      help="the number of principal components to output")

    parser.add_option("--matrix-shape",
                      dest="matrix_shape",
                      type="choice",
                      choices=["triangle", "square", "square0"],
                      help="output matrix shape.",
                      default="triangle")

    parser.add_option("--matrix-compression",
                      dest="matrix_compress",
                      type="choice",
                      choices=["gz", "bin", "bin4"],
                      help="compression to apply to output matrix")

    parser.add_option("--matrix-form",
                      dest="matrix_form",
                      type="choice",
                      choices=["distance", "grm"],
                      help="type of relationship matrix to calculate")

    parser.add_option(
        "--matrix-metric",
        dest="matrix_metric",
        type="choice",
        choices=["fhat", "cov", "ibc2", "ibc3", "ibs", "genomic", "hamming"],
        help="value to calculate for diagonal elements of the "
        "grm. Default is fhat for grm and hamming for distance.")

    parser.add_option(
        "--matrix-options",
        dest="matrix_options",
        type="string",
        help="modifiers of matrix output, see plink documentation "
        "for details")

    # --- association testing -------------------------------------------
    parser.add_option("--association-method",
                      dest="assoc_method",
                      type="choice",
                      choices=["linear", "logistic", "assoc", "qassoc"],
                      help="association analysis to run")

    parser.add_option(
        "--permutation",
        dest="permutation",
        action="store_true",
        help="perform association testing by permutation analysis")

    parser.add_option("--repeats",
                      dest="n_perms",
                      type="int",
                      help="number of repetitions for permutation analysis")

    parser.add_option("--association-options",
                      dest="assoc_option",
                      type="string",
                      help="association analysis modifiers")

    # --- file formatting -----------------------------------------------
    parser.add_option("--format-method",
                      dest="format_method",
                      type="choice",
                      choices=[
                          "change_format", "change_missing_values",
                          "update_variants", "update_samples", "flip_strands",
                          "flip_scan", "sort", "merge", "find_duplicates"
                      ],
                      help="file formatting to apply to input files")

    parser.add_option("--format-parameter",
                      dest="format_param",
                      type="string",
                      help="formatting parameter, where appropriate")

    parser.add_option(
        "--reformat-type",
        dest="reformat",
        type="choice",
        choices=["plink", "plink_binary", "oxford", "oxford_binary", "raw"],
        help="new format of input files to be reformatted to")

    parser.add_option("--apply-missing",
                      dest="apply_missing",
                      type="choice",
                      choices=["genotype", "phenotype"],
                      help="genotype or phenotype missing values to alter")

    parser.add_option("--update-variant-attribute",
                      dest="variant_update",
                      type="choice",
                      choices=[
                          "variant_ids", "missing_id", "chromosome",
                          "centimorgan", "name", "alleles", "map"
                      ],
                      help="update variant attributes")

    parser.add_option("--update-sample-attribute",
                      dest="sample_update",
                      type="choice",
                      choices=["sample_ids", "parents", "gender"],
                      help="sample attributes to be updated")

    parser.add_option("--strand-flip-subset",
                      dest="flip_subset",
                      action="store_true",
                      help="apply strand flipping to a subset of samples")

    parser.add_option("--flip-scan-type",
                      dest="scan_param",
                      type="choice",
                      choices=["default", "window", "threshold"],
                      help="strand flipping scan to apply to SNPs")

    parser.add_option("--sort-type",
                      dest="sort_type",
                      type="choice",
                      choices=["none", "natural", "ascii", "file"],
                      help="sort type to input files")

    parser.add_option("--merge-file-format",
                      dest="merge_format",
                      type="choice",
                      choices=["plink", "plink_binary"],
                      help="format of input files to be merged")

    parser.add_option(
        "--merge-mode",
        dest="merge_mode",
        type="choice",
        choices=[
            "default", "original_missing", "new_nonmissing", "no_overwrite",
            "force", "report_all", "report_nonmissing"
        ],
        help="merge mode to apply to dealing with merge conflicts")

    parser.add_option("--duplicates-method",
                      dest="dup_method",
                      type="choice",
                      choices=["same_ref", "id_match", "suppress_first"],
                      help="method for identifying and dealing with duplicate "
                      "variants")

    # --- summary statistics --------------------------------------------
    parser.add_option("--summary-method",
                      dest="summary_method",
                      type="choice",
                      choices=[
                          "allele_frequency", "missing_data", "hardy_weinberg",
                          "mendel_errors", "inbreeding", "gender_checker",
                          "wrights_fst", "case_control_fst"
                      ],
                      help="summary statistics to calculate")

    parser.add_option("--summary-parameter",
                      dest="sum_param",
                      type="string",
                      help="optional parameters that can be passed to summary "
                      "statistics methods")

    # --- haplotypes and linkage disequilibrium -------------------------
    parser.add_option("--haplotype-frequency",
                      dest="filt_haplotype_frequency",
                      type="string",
                      help="min allele frequency for SNPs to be "
                      "considered for a haplotype")

    parser.add_option("--haplotype-size",
                      dest="filt_haplotype_size",
                      type="string",
                      help="maximum genomic size of "
                      "haplotypes")

    parser.add_option("--ld-statistic",
                      dest="ld_stat",
                      type="choice",
                      choices=["r", "r2"],
                      help="compute either the raw "
                      "inter variant allele count correlation, R, or the "
                      "squared correlation, R^2")

    parser.add_option("--ld-min",
                      dest="ld_min",
                      type="string",
                      help="minimum value to report for pair-wise LD "
                      "calculations.  Beware output files may be very "
                      "large if `ld_min` is very small.")

    parser.add_option("--ld-window",
                      dest="ld_window",
                      type="string",
                      help="distance between SNPs, beyond which LD will "
                      "not be calculated")

    parser.add_option("--ld-format-output",
                      dest="ld_shape",
                      type="choice",
                      choices=["square", "table", "triangle", "square0"],
                      help="output either as table, or matrix format with a "
                      "specific shape.")

    # --- genotype / sample filters (dest prefix ``filt_``) -------------
    parser.add_option(
        "--genotype-rate",
        dest="filt_genotype_rate",
        type="string",
        help="genotyping rate threshold.  SNPs below this threshold "
        "will be excluded from analysis")

    parser.add_option("--indiv-missing",
                      dest="filt_missingness",
                      type="string",
                      help="individual missingness rate.  Individuals below "
                      "this threshold will be excluded from analysis")

    parser.add_option("--hardy-weinberg",
                      dest="filt_hwe",
                      type="string",
                      help="hardy-weinberg p-value threshold for SNPs.  SNPs "
                      "with a 2df chisquared p-value below this will be "
                      "filtered out")

    parser.add_option(
        "--min-allele-frequency",
        dest="filt_min_allele_frequency",
        type="string",
        help="only include SNPs with an allele frequency equal to "
        "or above this threshold")

    parser.add_option(
        "--max-allele-frequency",
        dest="filt_max_allele_frequency",
        type="string",
        help="only include SNPs with an allele frequency equal to "
        "or below this threshold")

    parser.add_option(
        "--mendelian-error",
        dest="filt_mendelian_error",
        type="string",
        help="exclude individuals/trios with mendelian errors that "
        "exceed this value")

    parser.add_option("--keep-individuals",
                      dest="filt_keep",
                      type="string",
                      help="a file containing individuals IDs to keep, "
                      "one per row")

    parser.add_option("--remove-individuals",
                      dest="filt_remove",
                      type="string",
                      help="a file of individual IDs to remove, one per row")

    # NOTE(review): the destination is spelled "qaul" while the sibling
    # option below uses "qual"; the filter is therefore forwarded as
    # "min_qaul_score".  Left untouched because the name expected by
    # apply_filters() is not visible here -- confirm and align.
    parser.add_option("--min-quality-score",
                      dest="filt_min_qaul_score",
                      type="string",
                      help="reset the minimum low bound of quality scores for "
                      "variants in a VCF file.  Default is 0")

    parser.add_option(
        "--max-quality-score",
        dest="filt_max_qual_score",
        type="string",
        help="reset the maximum upper bound of quality scores for "
        "a VCCF file.  Default is Inf")

    parser.add_option("--allow-no-gender",
                      dest="filt_allow_no_sex",
                      type="string",
                      help="allow individuals with gender missing")

    parser.add_option("--enforce-gender",
                      dest="filt_enforce_sex",
                      type="string",
                      help="only include individuals with non-missing gender "
                      "information")

    parser.add_option("--subset-filter",
                      dest="filt_subset_filter",
                      type="choice",
                      choices=[
                          "cases", "controls", "males", "females", "founders",
                          "nonfounders"
                      ],
                      help="only apply filters to the specific subset of "
                      "individuals supplied")

    parser.add_option(
        "--extract-snps",
        dest="filt_extract",
        type="string",
        help="text file of variant IDs to include in the analysis, "
        "ignoring all others")

    parser.add_option("--exclude-snps",
                      dest="filt_exclude",
                      type="string",
                      help="a file of variant IDs to exclude from analysis")

    parser.add_option("--restrict-chromosome",
                      dest="filt_chromosome",
                      type="string",
                      help="restict analysis to either a single chromosome, "
                      "or a comma-separated list of chromosomes")

    parser.add_option("--exclude-chromosomes",
                      dest="filt_exclude_chromosome",
                      type="string",
                      help="exclude all variants on these "
                      "chromosome(s)")

    parser.add_option(
        "--autosome-only",
        dest="filt_autosome",
        action="store_true",
        help="if present only autosomal variants will be analysed")

    parser.add_option(
        "--pseudo-autosome",
        dest="filt_pseudo_autosome",
        action="store_true",
        help="include on the pseudo-autosomal region of chromosome X")

    parser.add_option("--ignore-indels",
                      dest="filt_ignore_indels",
                      action="store_true",
                      help="only include bi-allelic single nucleotide "
                      "variants in analysis")

    parser.add_option(
        "--snp-range",
        dest="filt_snp_bp_range",
        type="string",
        help="comma separated list of from, to genome co-ordinates "
        "within which to include variants for analysis")

    parser.add_option(
        "--conditional-snp",
        dest="filt_conditional_snp",
        type="string",
        help="condition the analysis on this SNP ID.  Can only be "
        "used in the linear and logistic regression models.")

    parser.add_option("--snp-id-range",
                      dest="filt_snp_id_range",
                      type="string",
                      help="comma separate list of IDs from, to within which "
                      "to include variants for analysis.")

    parser.add_option("--snp-id",
                      dest="filt_specific_snp",
                      type="string",
                      help="include a single snp in the analysis given by "
                      "it's variant ID.")

    parser.add_option("--exclude-variant",
                      dest="filt_exclude_snp",
                      type="string",
                      help="exclude a single variant from the analysis, "
                      "given by it's variant ID")

    parser.add_option(
        "--covariate-filter",
        dest="filt_covariate_filter",
        type="string",
        help="covariate column headers or column numbers on which "
        "to filter on. Requries --covariate-file")

    parser.add_option(
        "--filter-parameter",
        dest="param",
        type="string",
        help="parameter values to be passed to filtering function")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="string",
                      help="alters the behaviour of the --snp-range and "
                      "--include/exclude snp options.  variants within +/- "
                      "half * window_size (kb) are included")

    parser.add_option(
        "--range-resolution",
        dest="filt_range_resolution",
        type="choice",
        choices=["bp", "kb", "mb"],
        help="alters the (from, to) range resolution to either bp, "
        "kb or mb")

    # --- output and resources ------------------------------------------
    parser.add_option(
        "--output-file-pattern",
        dest="out_pattern",
        type="string",
        help="output file pattern prefix. file suffixes are dependent "
        "on the task executed")

    parser.add_option("--threads",
                      dest="threads",
                      type="int",
                      help="the number of threads to use for multi-threaded "
                      "processes")

    parser.add_option("--memory",
                      dest="memory",
                      type="string",
                      help="amount of memory to reserve for the task")

    parser.add_option("--parallel",
                      dest="parallel",
                      type="int",
                      help="number of jobs to split task into")


def _create_gwas_object(program, geno_files, outfile):
    """Instantiate the ``gwas`` wrapper for *program* and start its call.

    Raises ``ValueError`` when *program* is not one of the supported names
    (which includes ``None``, i.e. ``--program`` was not supplied).
    """
    # Map the command-line name to the class name in the gwas module.  The
    # class is resolved lazily so only the requested one is looked up.
    class_names = {"plink2": "Plink2",
                   "plinkdev": "PlinkDev",
                   "gcta": "GCTA"}
    if program not in class_names:
        # Without a program object every later step would fail with an
        # unhelpful UnboundLocalError, so fail early with a clear message.
        raise ValueError(
            "--program is required; choose one of plink2, gcta, plinkdev")

    gwas_object = getattr(gwas, class_names[program])(files=geno_files)
    gwas_object.program_call(infiles=geno_files, outfile=outfile)
    return gwas_object


def _apply_genotype_filters(gwas_object, opt_dict):
    """Forward every supplied ``filt_*`` option to *gwas_object*.

    *opt_dict* maps option destinations to parsed values.  Only
    destinations that start with ``filt_`` and carry a truthy value are
    applied; the prefix is removed to obtain the filter name.
    """
    prefix = "filt_"
    for dest, value in opt_dict.items():
        if not dest.startswith(prefix) or not value:
            continue
        # Slice the prefix off.  str.lstrip("filt_") would strip any leading
        # run of the characters f/i/l/t/_ and turn "filt_ignore_indels"
        # into "gnore_indels".
        gwas_object.apply_filters(filter_type=dest[len(prefix):],
                                  filter_value=value)


def _run_format_method(gwas_object, options):
    """Queue the file re-formatting task selected by ``--format-method``."""
    fmt = options.format_method
    param = options.format_param

    if fmt == "change_format":
        # adding filtering options to plink requires the --make-bed flag
        update_samples = vars(options).get("sample_update")
        if update_samples:
            E.info("updating samples from %s" % options.format_param)
        gwas_object._run_tasks(change_format=options.reformat,
                               parameter=param)
        if update_samples:
            gwas_object._run_tasks(update_samples=options.sample_update,
                                   parameter=param)
    elif fmt == "change_missing_values":
        gwas_object._run_tasks(change_missing_values=options.apply_missing,
                               parameter=param)
    elif fmt == "update_variants":
        gwas_object._run_tasks(update_variants=options.variant_update,
                               parameter=param)
        gwas_object._run_tasks(change_format=options.file_format)
    elif fmt == "update_samples":
        gwas_object._run_tasks(update_samples=options.sample_update,
                               parameter=param)
    elif fmt == "flip_strands":
        flip_target = "subset" if options.flip_subset else "all_samples"
        gwas_object._run_tasks(flip_strands=flip_target, parameter=param)
    elif fmt == "flip_scan":
        gwas_object._run_tasks(flip_scan=options.scan_param,
                               parameter=param)
    elif fmt == "sort":
        gwas_object._run_tasks(sort=options.sort_type, parameter=param)
    elif fmt == "merge":
        # an explicit merge mode takes precedence over the merge format
        if options.merge_mode:
            gwas_object._run_tasks(merge_mode=options.merge_mode,
                                   parameter=param)
        else:
            gwas_object._run_tasks(merge=options.merge_format,
                                   parameter=param)
    elif fmt == "find_duplicates":
        gwas_object._run_tasks(find_duplicates=options.dup_method,
                               parameter=param)


def _run_matrix_method(gwas_object, options):
    """Queue the distance or relationship matrix selected by the options."""
    if options.matrix_form == "distance":
        # Only these three metrics have a distance-matrix method; any other
        # metric (or none) is silently ignored, as before.
        distance_methods = {"hamming": "hamming_matrix",
                            "ibs": "ibs_matrix",
                            "genomic": "genome_matrix"}
        method_name = distance_methods.get(options.matrix_metric)
        if method_name is not None:
            getattr(gwas_object, method_name)(
                shape=options.matrix_shape,
                compression=options.matrix_compress,
                options=options.matrix_options)
    elif options.matrix_form == "grm":
        gwas_object.genetic_relationship_matrix(
            shape=options.matrix_shape,
            compression=options.matrix_compress,
            metric=options.matrix_metric,
            options=options.matrix_options)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The function builds a ``gwas.FileGroup`` from the input files, wraps it
    in the program object chosen with ``--program``, applies every supplied
    ``filt_*`` option as a filter, queues the task chosen with ``--method``
    and finally calls ``build_statement`` on the program object.

    Raises ``ValueError`` if ``--program`` is not supplied.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    _add_gwas_options(parser)

    # Defaults must be registered BEFORE the command line is parsed:
    # optparse copies them into the returned values object at parse time,
    # so calling set_defaults() afterwards never reaches ``options``.
    # NOTE(review): no option above declares ``random_seed``; it exists on
    # ``options`` only through this default (unless E.Start registers such
    # an option itself -- confirm).
    parser.set_defaults(sum_param=None,
                        dup_method="same_ref",
                        n_perms=None,
                        permutation=False,
                        matrix_shape="triangle",
                        matrix_options=None,
                        matrix_compress="gz",
                        random_seed=random.randint(0, 19999),
                        sample_update=None,
                        memory="60G",
                        parallel=None,
                        covariate_file=None,
                        covar_col=None,
                        epi_report=0.001,
                        epi_sig=0.001)

    # add common options (-h/--help, ...) and parse command line
    options, _ = E.Start(parser, argv=argv)

    # NOTE(review): the fallback yields a list of names while
    # --input-file-pattern yields a plain string; verify that
    # gwas.FileGroup accepts both.
    if not options.infile_pattern:
        infiles = (argv[-1]).split(",")
    else:
        infiles = options.infile_pattern

    # create a new filegroup object
    geno_files = gwas.FileGroup(files=infiles,
                                file_format=options.file_format,
                                genotype_format="imputed")
    if options.pheno_file:
        geno_files.set_phenotype(pheno_file=options.pheno_file,
                                 pheno=options.pheno)

    # add FileGroup object to the gwas program object
    gwas_object = _create_gwas_object(options.program,
                                      geno_files,
                                      options.out_pattern)

    # iteratively add genotype filters to GWASProgram object
    _apply_genotype_filters(gwas_object, vars(options))

    method = options.method
    if method == "summary":
        # Each summary method name doubles as the keyword argument that
        # _output_statistics() expects, so dispatch by keyword.
        if options.summary_method:
            gwas_object._output_statistics(
                **{options.summary_method: options.sum_param})
    elif method == "pca":
        gwas_object.PCA(n_pcs=options.num_pcs)
    elif method == "ld":
        # NOTE(review): --ld-min has no default, so float() raises
        # TypeError here when the option is omitted.
        gwas_object.calc_ld(ld_statistic=options.ld_stat,
                            ld_threshold=float(options.ld_min),
                            ld_shape=options.ld_shape)
    elif method == "association":
        gwas_object.run_association(association=options.assoc_method,
                                    permutation=options.permutation,
                                    n_perms=options.n_perms,
                                    random_seed=options.random_seed,
                                    covariates_file=options.covariate_file,
                                    covariates=options.covar_col)
    elif method == "estimate_haplotypes":
        gwas_object._run_tasks(estimate_haplotypes="haplotype")
    elif method == "lmm":
        gwas_object.mixed_model(lmm_method=options.lmm_method,
                                grm=options.grm_prefix,
                                qcovar=options.covariate_file,
                                dcovar=options.covariate_discrete)
    elif method == "epistasis":
        gwas_object._detect_interactions(
            method=options.epi_method,
            modifier=options.epi_param,
            set_file=options.set_file,
            set_mode=options.set_method,
            report_threshold=options.epi_report,
            sig_threshold=options.epi_sig,
            covariates_file=options.covariate_file,
            covariates=options.covar_col)
    elif method == "reml":
        gwas_object.reml_analysis(method=options.reml_method,
                                  parameters=options.reml_param,
                                  prevalence=options.prevalence,
                                  qcovariates=options.covariate_file,
                                  discrete_covar=options.covariate_discrete)
    elif method == "format":
        _run_format_method(gwas_object, options)
    elif method == "matrix":
        _run_matrix_method(gwas_object, options)
    # any other method (or none) queues no task; only the statement is built

    gwas_object.build_statement(infiles=geno_files,
                                outfile=options.out_pattern,
                                threads=options.threads,
                                memory=options.memory,
                                parallel=options.parallel)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 2
0
def main(argv=None):
    """Plot genome-wide association results.

    Parses command line options in sys.argv, unless *argv* is given.

    The last command line argument is read as the association results
    file, or as a comma-separated list of such files.  Depending on
    ``--plot-type`` a Manhattan, QQ or epistasis plot is requested from
    a ``gwas.GWASResults`` object.  Only the Manhattan branch keeps the
    value returned by the plotting call; that value is written as a
    tab-separated table to ``options.stdout``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--plot-type",
                      dest="plot_type",
                      type="choice",
                      choices=["manhattan", "qqplot", "epistasis"],
                      help="plot type to generate")

    parser.add_option("--resolution",
                      dest="resolution",
                      type="choice",
                      choices=["genome_wide", "chromosome", "fine_map"],
                      help="the resolution of plotting, wether the plot "
                      "depicts the whole genome, a single chromosome or "
                      "a specific locus")

    parser.add_option("--file-format",
                      dest="file_format",
                      type="choice",
                      choices=["plink", "cassi", "cassi_covar"],
                      help="input file format, used to parse the file "
                      "properly")

    parser.add_option("--save-path",
                      dest="save_path",
                      type="string",
                      help="path and filename to save image to")

    # defaults have to be registered *before* the command line is parsed;
    # setting them afterwards leaves the already-built ``options`` untouched
    parser.set_defaults(resolution="genome_wide",
                        plot_type="manhattan",
                        file_format="plink")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # if the input is a list of files, split them
    infile = argv[-1]
    infiles = infile.split(",")

    # need to parse epistasis output slightly differently
    epi = options.plot_type == "epistasis"

    # str.split() always returns at least one item, so there are only two
    # cases: several files (pass the list) or one file (pass the string)
    assoc_file = infiles if len(infiles) > 1 else infile
    results = gwas.GWASResults(assoc_file=assoc_file,
                               epistasis=epi,
                               file_format=options.file_format)

    df = None
    if options.plot_type == "manhattan":
        df = results.plotManhattan(resolution=options.resolution,
                                   save_path=options.save_path)
    elif options.plot_type == "qqplot":
        results.plotQQ(save_path=options.save_path,
                       resolution=options.resolution)
    elif options.plot_type == "epistasis":
        results.plotEpistasis(save_path=options.save_path,
                              resolution=options.resolution)

    # only output appended results for Manhattan plot, not qqplot
    if df is not None:
        df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 3
0
def main(argv=None):
    """Build a quality-control statement for genome-wide genotype data.

    Parses command line options in sys.argv, unless *argv* is given.

    The input files are taken from ``--input-file-pattern`` or, when that
    is not supplied, from the last command line argument split on commas.
    They are wrapped in a ``gwas.FileGroup``, handed to the program
    wrapper selected with ``--program``, every supplied ``filt_*`` option
    is forwarded to ``apply_filters``, the task chosen with ``--method``
    is registered, and finally ``build_statement`` is called.

    Raises
    ------
    ValueError
        if ``--program`` was not supplied.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--program", dest="program", type="choice",
                      choices=["plink2", "gcta", "plinkdev"],
                      help="program to execute genome-wide analysis")

    parser.add_option("--input-file-pattern", dest="infile_pattern", type="string",
                      help="file prefix that identifies a group of files")

    parser.add_option("--input-file-format", dest="file_format", type="choice",
                      choices=["plink", "plink_binary", "oxford",
                               "oxford_binary", "vcf", "GRM_binary",
                               "GRM_gz"],
                      help="format of input files")

    parser.add_option("--phenotypes-file", dest="pheno_file", type="string",
                      help="text file of additional phenotypes")

    parser.add_option("--pheno", dest="pheno", type="string",
                      help="either phenotype file column header or number")

    parser.add_option("--covariates-file", dest="covariate_file", type="string",
                      help="file containing covariates")

    parser.add_option("--covariate-column", dest="covar_col", type="string",
                      help="column number(s) or header(s) to include in "
                      "association model")

    parser.add_option("--method", dest="method", type="choice",
                      choices=["ld_prune", "summary", "flag_hets",
                               "remove_relations", "check_gender",
                               "IBD"],
                      help="method to apply to genome-wide data")

    parser.add_option("--IBD-parameter", dest="ibd_param", type="choice",
                      choices=["norm", "relatives", "full"], help="param "
                      "to pass to IBD calculations")

    parser.add_option("--principal-components", dest="num_pcs", type="int",
                      help="the number of principal components to output")

    parser.add_option("--matrix-shape", dest="matrix_shape", type="choice",
                      choices=["triangle", "square", "square0"],
                      help="output matrix shape.", default="triangle")

    parser.add_option("--matrix-compression", dest="matrix_compress", type="choice",
                      choices=["gz", "bin", "bin4"],
                      help="compression to apply to output matrix file",
                      default="gz")

    parser.add_option("--matrix-form", dest="matrix_form", type="choice",
                      choices=["distance", "grm"],
                      help="type of relationship matrix to calculate")

    parser.add_option("--matrix-metric", dest="matrix_metric", type="choice",
                      choices=["fhat", "cov", "ibc2", "ibc3", "ibs",
                               "genomic", "hamming"],
                      help="value to calculate for diagonal elements of the "
                      "grm. Default is fhat for grm and hamming for distance.")

    parser.add_option("--matrix-options", dest="matrix_options", type="string",
                      help="modifiers of matrix output, see plink documentation "
                      "for details")

    parser.add_option("--strand-flip-subset", dest="flip_subset", action="store_true",
                      help="apply strand flipping to a subset of samples")

    parser.add_option("--flip-scan-type", dest="scan_param", type="choice",
                      choices=["default", "window", "threshold"],
                      help="strand flipping scan to apply to SNPs")

    parser.add_option("--sort-type", dest="sort_type", type="choice",
                      choices=["none", "natural", "ascii", "file"],
                      help="sort type to input files")

    parser.add_option("--merge-file-format", dest="merge_format", type="choice",
                      choices=["plink", "binary_plink"],
                      help="format of input files to be merged")

    parser.add_option("--merge-mode", dest="merge_mode", type="choice",
                      choices=["default", "original_missing", "new_nonmissing",
                               "no_overwrite", "force", "report_all",
                               "report_nonmissing"],
                      help="merge mode to apply to dealing with merge conflicts")

    parser.add_option("--duplicates-method", dest="dup_method", type="choice",
                      choices=["same_ref", "id_match", "suppress_first"],
                      help="method for identifying and dealing with duplicate "
                      "variants")

    parser.add_option("--summary-method", dest="summary_method", type="choice",
                      choices=["allele_frequency", "missing_data", "hardy_weinberg",
                               "mendel_errors", "inbreeding", "inbreeding_coef",
                               "gender_checker", "wrights_fst"],
                      help="summary statistics to calculate")

    parser.add_option("--summary-parameter", dest="sum_param", type="string",
                      help="optional parameters that can be passed to summary "
                      "statistics methods")

    parser.add_option("--genotype-rate", dest="filt_genotype_rate", type="string",
                      help="genotyping rate threshold.  SNPs below this threshold "
                      "will be excluded from analysis")

    parser.add_option("--indiv-missing", dest="filt_missingness", type="string",
                      help="individual missingness rate.  Individuals below "
                      "this threshold will be excluded from analysis")

    parser.add_option("--hardy-weinberg", dest="filt_hwe", type="string",
                      help="hardy-weinberg p-value threshold for SNPs.  SNPs "
                      "with a 2df chisquared p-value below this will be "
                      "filtered out")

    parser.add_option("--min-allele-frequency", dest="filt_min_allele_frequency",
                      type="string",
                      help="only include SNPs with an allele frequency equal to "
                      "or above this threshold")

    parser.add_option("--max-allele-frequency", dest="filt_max_allele_frequency",
                      type="string",
                      help="only include SNPs with an allele frequency equal to "
                      "or below this threshold")

    parser.add_option("--mendelian-error", dest="filt_mendelian_error", type="string",
                      help="exclude individuals/trios with mendelian errors that "
                      "exceed this value")

    # NOTE(review): dest is spelled "qaul" here but "qual" for the matching
    # --max-quality-score option; the filter name forwarded below is derived
    # from dest, so confirm which spelling apply_filters() expects.
    parser.add_option("--min-quality-score", dest="filt_min_qaul_score", type="string",
                      help="reset the minimum low bound of quality scores for "
                      "variants in a VCF file.  Default is 0")

    parser.add_option("--max-quality-score", dest="filt_max_qual_score", type="string",
                      help="reset the maximum upper bound of quality scores for "
                      "a VCCF file.  Default is Inf")

    parser.add_option("--allow-no-gender", dest="filt_allow_no_sex", type="string",
                      help="allow individuals with gender missing")

    parser.add_option("--enforce-gender", dest="filt_enforce_sex", type="string",
                      help="only include individuals with non-missing gender "
                      "information")

    parser.add_option("--keep-individuals", dest="filt_keep", type="string",
                      help="a file containing individuals IDs to keep, "
                      "one per row")

    parser.add_option("--remove-individuals", dest="filt_remove", type="string",
                      help="a file of individual IDs to remove, one per row")

    parser.add_option("--subset-filter", dest="filt_subset_filter", type="choice",
                      choices=["cases", "controls", "males", "females",
                               "founders", "nonfounders"],
                      help="only apply filters to the specific subset of "
                      "individuals supplied")

    parser.add_option("--extract-snps", dest="filt_extract", type="string",
                      help="text file of variant IDs to include in the analysis, "
                      "ignoring all others")

    parser.add_option("--exclude-snps", dest="filt_exclude", type="string",
                      help="a file of variant IDs to exclude from analysis")

    parser.add_option("--restrict-chromosome", dest="filt_chromosome", type="string",
                      help="restict analysis to either a single chromosome, "
                      "or a comma-separated list of chromosomes")

    parser.add_option("--exclude-chromosomes", dest="filt_exclude_chromosome",
                      type="string", help="exclude all variants on these "
                      "chromosome(s)")

    parser.add_option("--autosome-only", dest="filt_autosome", action="store_true",
                      help="if present only autosomal variants will be analysed")

    parser.add_option("--pseudo-autosome", dest="filt_pseudo_autosome", action="store_true",
                      help="include on the pseudo-autosomal region of chromosome X")

    parser.add_option("--ignore-indels", dest="filt_ignore_indels", action="store_true",
                      help="only include bi-allelic single nucleotide "
                      "variants in analysis")

    parser.add_option("--snp-range", dest="filt_snp_bp_range", type="string",
                      help="comma separated list of from, to genome co-ordinates "
                      "within which to include variants for analysis")

    parser.add_option("--snp-id-range", dest="filt_snp_id_range", type="string",
                      help="comma separate list of IDs from, to within which "
                      "to include variants for analysis.")

    parser.add_option("--snp-id", dest="filt_specific_snp", type="string",
                      help="include a single snp in the analysis given by "
                      "it's variant ID.")

    parser.add_option("--exclude-variant", dest="filt_exclude_snp", type="string",
                      help="exclude a single variant from the analysis, "
                      "given by it's variant ID")

    parser.add_option("--covariate-filter", dest="filt_covariate_filter", type="string",
                      help="covariate column headers or column numbers on which "
                      "to filter on. Requries --covariate-file")

    parser.add_option("--filter-parameter", dest="param", type="string",
                      help="parameter values to be passed to filtering function")

    parser.add_option("--window-size", dest="window_size", type="string",
                      help="alters the behaviour of the --snp-range and "
                      "--include/exclude snp options.  variants within +/- "
                      "half * window_size (kb) are included")

    parser.add_option("--range-resolution", dest="filt_range_resolution", type="choice",
                      choices=["bp", "kb", "mb"],
                      help="alters the (from, to) range resolution to either bp, "
                      "kb or mb")

    parser.add_option("--output-file-pattern", dest="out_pattern", type="string",
                      help="output file pattern prefix. file suffixes are dependent "
                      "on the task executed")

    parser.add_option("--threads", dest="threads", type="int",
                      help="the number of threads to use for multi-threaded "
                      "processes")

    parser.add_option("--use-kb", dest="kb", action="store_true",
                      help="if present uses a kb sized window for LD pruning")

    parser.add_option("--prune-method", dest="prune_method", type="choice",
                      choices=["R2", "VIF"], help="type of LD pruning to "
                      "perform, pair-wise LD or variance inflation factor")

    parser.add_option("--step-size", dest="step", type="string",
                      help="step size to advance window by")

    parser.add_option("--threshold", dest="threshold", type="string",
                      help="threshold on which to filter results")

    parser.add_option("--parallel", dest="parallel", type="int",
                      help="number of jobs to split task into")

    parser.add_option("--memory", dest="memory", type="string",
                      help="amount of memory to reserve for the task")

    # defaults have to be registered *before* the command line is parsed;
    # setting them afterwards leaves the already-built ``options`` untouched
    # (e.g. the "60G" memory default would never be seen)
    parser.set_defaults(sum_param=None,
                        dup_method="same_ref",
                        matrix_shape="triangle",
                        matrix_options=None,
                        matrix_compress="gz",
                        kb=False,
                        random_seed=random.randint(0, 19999),
                        memory="60G",
                        parallel=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.infile_pattern:
        infiles = options.infile_pattern
    else:
        infiles = (argv[-1]).split(",")

    # create a new filegroup object
    geno_files = gwas.FileGroup(files=infiles,
                                file_format=options.file_format,
                                genotype_format="imputed")
    if options.pheno_file:
        geno_files.set_phenotype(pheno_file=options.pheno_file,
                                 pheno=options.pheno)

    # add FileGroup object to the gwas program object; class names are
    # resolved lazily so only the selected wrapper has to exist
    program_classes = {"plink2": "Plink2",
                       "plinkdev": "PlinkDev",
                       "gcta": "GCTA"}
    if options.program not in program_classes:
        # every later step needs a program object, so fail early and clearly
        raise ValueError("no analysis program selected, please supply "
                         "--program (one of plink2, plinkdev or gcta)")
    program_class = getattr(gwas, program_classes[options.program])
    gwas_object = program_class(files=geno_files)
    gwas_object.program_call(infiles=geno_files,
                             outfile=options.out_pattern)

    # collect filtering options from options: every parsed attribute whose
    # name contains "filt" and that carries a truthy value
    opt_dict = options.__dict__
    filter_dict = {key: value for key, value in opt_dict.items()
                   if "filt" in key and value}

    # iteratively add all filters to GWASProgram object
    for fkey, filter_value in filter_dict.items():
        gwas_object.apply_filters(filter_type=fkey.replace("filt_", ""),
                                  filter_value=filter_value)

    # summary statistics that map one-to-one onto a keyword argument of
    # _output_statistics()
    summary_methods = ("allele_frequency", "hardy_weinberg", "missing_data",
                       "mendel_errors", "inbreeding", "inbreeding_coef",
                       "gender_checker", "wrights_fst")

    # handle summary statistics
    if options.method == "ld_prune":
        # NOTE(review): kb is hard-coded to True although --use-kb is
        # parsed into options.kb; confirm whether that is intended.
        gwas_object._qc_methods(ld_prune=options.prune_method,
                                kb=True,
                                window=options.window_size,
                                step=options.step,
                                threshold=options.threshold)
    elif options.method == "IBD":
        # the IBD parameter is one of norm, relatives or full
        gwas_object._qc_methods(ibd=options.ibd_param)
    elif options.method == "summary":
        if options.summary_method in summary_methods:
            gwas_object._output_statistics(
                **{options.summary_method: options.sum_param})
    elif options.method == "remove_relations":
        gwas_object._run_tasks(remove_relations="cutoff",
                               parameter=options.threshold)
    elif options.method == "check_gender":
        gwas_object._run_tasks(check_gender="")

    gwas_object.build_statement(infiles=geno_files,
                                outfile=options.out_pattern,
                                threads=options.threads,
                                memory=options.memory,
                                parallel=options.parallel)

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 4
0
def _top_hit(region):
    """Return ``(BP, SNP)`` of the highest ranked row of *region*.

    Rows are ranked by ``CHISQ`` when that column is present.  Otherwise
    the absolute value of ``STAT`` (or of ``T`` when there is no ``STAT``
    column) is stored in ``region["STAT"]`` and used for ranking.  Note
    that this writes to *region* in place, so the caller's table carries
    the modified ``STAT`` column afterwards.

    Raises KeyError if none of the ranking columns, or ``BP``/``SNP``,
    is present.
    """
    columns = region.columns
    if "CHISQ" in columns:
        ranked = region.sort_values(by="CHISQ", ascending=False)
    else:
        source = "STAT" if "STAT" in columns else "T"
        region.loc[:, "STAT"] = abs(region[source])
        # rank on the magnitude just stored, not on the signed statistic
        ranked = region.sort_values(by="STAT", ascending=False)

    top_row = ranked.iloc[0]
    return top_row["BP"], top_row["SNP"]


def main(argv=None):
    """Post-process genome-wide association results.

    Parses command line options in sys.argv, unless *argv* is given.

    The last command line argument is read as the association results
    file, or as a comma-separated list of such files.  ``--task`` selects
    what is done with them:

    * ``get_hits`` - write one tab-separated file per region returned by
      ``getHits`` into ``--output-directory``
    * ``extract_results`` - write the results for the SNPs listed in
      ``--snp-set`` to ``options.stdout``
    * ``merge_freq`` - merge the results with the ``.frq`` files found in
      ``--frequency-directory`` and write the table to ``options.stdout``
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--task", dest="task", type="choice",
                      choices=["get_hits", "extract_results",
                               "merge_freq"],
                      help="task to perform")

    parser.add_option("--p-threshold", dest="p_threshold", type="float",
                      help="threshold for association p-value, below "
                      "which results will be output")

    parser.add_option("--output-directory", dest="outdir", type="string",
                      help="output file directory")

    parser.add_option("--snp-set", dest="snpset", type="string",
                      help="file containing list of SNP per row to "
                      "extract from GWAS results")

    parser.add_option("--frequency-directory", dest="freq_dir", type="string",
                      help="Directory containing plink .frq files corresponding"
                      " to all chromosomes")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # if the input is a list of files, split them
    infile = argv[-1]
    infiles = infile.split(",")

    # str.split() always returns at least one item, so there are only two
    # cases: several files (pass the list) or one file (pass the string)
    assoc_file = infiles if len(infiles) > 1 else infile
    results = gwas.GWASResults(assoc_file=assoc_file)

    if options.task == "get_hits":
        hits = results.getHits(float(options.p_threshold))
        for name, region in hits:
            top_bp, top_snp = _top_hit(region)

            outname = "_".join(["chr%s" % str(name),
                                str(top_bp),
                                top_snp,
                                "significant"])

            outfile = outname + ".tsv"
            out_file = "/".join([options.outdir, outfile])
            E.info("output association results from Chr%s to %s" %
                   (str(name), out_file))
            # the first column tends to come out as "Unnamed: 0"; drop it
            # unless the table already starts with the A1 column
            try:
                if region.columns[0] != "A1":
                    region.drop([region.columns[0]], inplace=True, axis=1)
            except Exception as error:
                # tidying the table is best-effort only: report the problem
                # and still write the region out
                E.info("could not drop leading column for Chr%s: %s" %
                       (str(name), error))

            region.to_csv(out_file, sep="\t", index=None)

    elif options.task == "extract_results":
        with IOTools.open_file(options.snpset, "r") as sfile:
            snpset = [snp.rstrip("\n") for snp in sfile]

        snp_df = results.extractSNPs(snpset)
        snp_df.dropna(axis=0, how='all', inplace=True)
        snp_df.drop_duplicates(subset=["SNP"], inplace=True)
        snp_df.to_csv(options.stdout, sep="\t", index=None)

    elif options.task == "merge_freq":
        # sequentially merge GWAS result with frequency data
        # to make file for GCTA joint analysis
        # raw string: "\S" in a plain literal is an invalid escape sequence
        regex = re.compile(r"(\S+).frq$")
        cojo_df = results.mergeFrequencyResults(options.freq_dir,
                                                file_regex=regex)
        cojo_df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 5
0
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--score-method",
                      dest="method",
                      type="choice",
                      choices=[
                          "PICS", "LDscore", "ABF", "R2_rank", "get_eigen",
                          "calc_prior", "credible_set", "summarise"
                      ],
                      help="SNP scoring/prioritisation method to apply.")

    parser.add_option("--database",
                      dest="database",
                      type="string",
                      help="SQL database containing LD information "
                      "in table format. Expects columns SNP_A, "
                      "SNP_B, R2, BP_A and BP_B (Plink --r2 output)")

    parser.add_option("--ld-directory",
                      dest="ld_dir",
                      type="string",
                      help="directory containing tabix-index BGZIP "
                      "LD files.  Assumes Plink used to calculate LD")

    parser.add_option("--table-name",
                      dest="table",
                      type="string",
                      help="name of the SQL table containing the LD"
                      "values")

    parser.add_option("--chromosome",
                      dest="chromosome",
                      type="string",
                      help="chromosome to subset the association results "
                      "file on")

    parser.add_option("--ld-threshold",
                      dest="ld_threshold",
                      type="float",
                      help="the threshold of LD above which variants will "
                      "be taken forward.")

    parser.add_option("--rank-threshold",
                      dest="rank_threshold",
                      type="float",
                      help="the threshold in terms of the top n% SNPs to "
                      "output based on the ranking metric. e.g. "
                      "--rank-threshold=0.01 is the top 1% SNPs")

    parser.add_option("--credible-interval",
                      dest="interval",
                      type="float",
                      help="The credible set interval size to generate the "
                      "credible set of SNPs")

    parser.add_option("--prior-variance",
                      dest="prior_var",
                      type="float",
                      help="the prior variance used to weight the SNP "
                      "variance")

    parser.add_option("--fine-map-window",
                      dest="map_window",
                      type="int",
                      help="the region size to included around the index "
                      "SNP as the fine-mapping region.")

    parser.add_option("--eigen-score-directory",
                      dest="eigen_dir",
                      type="string",
                      help="PATH to directory containing tabix indexed "
                      "eigen score files")

    parser.add_option("--flat-prior",
                      dest="flat_prior",
                      action="store_true",
                      help="Ignore functional annotation information and "
                      "use an uninformative prior on each SNP")

    parser.add_option("--snp-set",
                      dest="snp_set",
                      type="string",
                      help="Pre-defined SNP set as a list of SNP IDs."
                      "If used to calculate priors contains column of scores.")

    parser.add_option(
        "--distribution",
        dest="dist",
        type="choice",
        choices=["normal", "t", "gamma", "lognormal", "exponential"],
        help="distribution from which to draw prior "
        "probabilities")

    parser.add_option("--distribution-parameters",
                      dest="dist_params",
                      type="string",
                      help="distribution parameters as a comma-separated list")

    parser.add_option("--lead-snp-id",
                      dest="lead_snp",
                      type="int",
                      help="0-based item number in filename")

    parser.add_option("--filename-separator",
                      dest="separator",
                      type="string",
                      help="filename separator to extract information")

    parser.add_option("--snp-column",
                      dest="snp_col",
                      type="int",
                      help="0-based index of SNP ID column number")

    parser.add_option("--probability-column",
                      dest="prob_col",
                      type="int",
                      help="0-based index of posterior probabilities column"
                      " number")

    parser.set_defaults(
        ld_dir=None,
        dist="normal",
        dist_params=None,
        snp_set=None,
        prior_var=0.04,
        interval=0.99,
        eigen_dir=None,
        map_window=100000,
        ld_threshold=0.5,
        database=None,
        table=None,
        flat_prior=False,
        lead_snp=2,
        separator="_",
        snp_col=0,
        prob_col=1,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    infile = argv[-1]

    if len(infile.split(",")) > 1:
        pass
    else:
        peek = pd.read_table(infile, nrows=5, sep="\s*", header=0)
        try:
            if len(peek["TEST"] != "ADD"):
                clean = False
            else:
                clean = True
        except KeyError:
            clean = True

    if options.method == "LDscore":
        snpscores = gwas.snpPriorityScore(gwas_results=infile,
                                          database=options.database,
                                          table_name=options.table,
                                          chromosome=options.chromosome,
                                          ld_dir=options.ld_dir,
                                          clean=clean)
        # take top 1%, all SNPs doesn't achieve anything useful
        ranks = int(len(snpscores.index) * 0.01)
        snpscores = snpscores.iloc[:ranks]

    elif options.method == "PICS":
        snp_list = {}
        if options.snp_set and not options.flat_prior:
            with IOTools.open_file(options.snp_set, "r") as sfile:
                for line in sfile.readlines():
                    snp = line.split("\t")[0]
                    try:
                        score = float(line.split("\t")[-1].rstrip("\n"))
                    except ValueError:
                        score = 0
                    snp_list[snp] = float(score)

            # get the parameter estimates for the distribution
            # if they have not been provided
            if not options.dist_params:
                dist_params = gwas.estimateDistributionParameters(
                    data=snp_list.values(), distribution=options.dist)
            else:
                dist_params = tuple(
                    [float(fx) for fx in options.dist_params.split(",")])

            E.info("Calculating priors on SNPs")
            priors = gwas.calcPriorsOnSnps(snp_list=snp_list,
                                           distribution=options.dist,
                                           params=dist_params)

        elif options.snp_set and options.flat_prior:
            with IOTools.open_file(options.snp_set, "r") as sfile:
                for line in sfile.readlines():
                    snp = line.split("\t")[0]
                    snp_list[snp] = 1.0

            priors = snp_list

        else:
            # allow for no priors or scores to be set,
            # use of priors will be ignored,
            # i.e. when prior and likelihood are not from
            # conjugate distributions
            priors = None

        # PICS scores expects the gwas results file to
        # only contain the region of interest, which
        # represents an independent association signal
        # if a SNP has not been genotyped,
        # but it is in strong LD, it will cause problems
        # downstream <- only allow SNPs that
        # are present in the analysis
        snpscores = gwas.PICSscore(gwas_results=infile,
                                   database=options.database,
                                   table_name=options.table,
                                   chromosome=options.chromosome,
                                   priors=priors,
                                   clean=clean,
                                   ld_dir=options.ld_dir,
                                   ld_threshold=options.ld_threshold)

        snpscores.columns = ["SNP", "PICS"]
        posterior_sum = 0
        snpscores.sort_values(ascending=False, inplace=True)
        post_snps = []
        for snp in snpscores.index:
            if posterior_sum < 99.0:
                posterior_sum += snpscores.loc[snp]
                post_snps.append(snp)
            else:
                break

        snpscores = snpscores.loc[post_snps]

        snpscores.drop_duplicates(inplace=True)

    elif options.method == "R2_rank":
        # rank SNPs based on their LD with the lead
        # SNP, take the top n% SNPs
        snpscores = gwas.LdRank(gwas_results=infile,
                                database=options.database,
                                table_name=options.table,
                                ld_dir=options.ld_dir,
                                chromosome=options.chromosome,
                                ld_threshold=options.ld_threshold,
                                top_snps=options.rank_threshold,
                                clean=clean)

    elif options.method == "ABF":
        snpscores = gwas.ABFScore(gwas_results=infile,
                                  region_size=options.map_window,
                                  chromosome=options.chromosome,
                                  prior_variance=options.prior_var,
                                  clean=clean)
    elif options.method == "get_eigen":
        E.info("Fetching Eigen scores")
        snpscores = gwas.getEigenScores(eigen_dir=options.eigen_dir,
                                        bim_file=infile,
                                        snp_file=options.snp_set)
        snpscores = pd.DataFrame(snpscores).T

    elif options.method == "credible_set":
        E.info("Creating credible set")

        snpscores = gwas.makeCredibleSet(probs_file=infile,
                                         credible_set=options.interval,
                                         lead_snp_indx=options.lead_snp,
                                         filename_sep=options.separator,
                                         snp_column=options.snp_col,
                                         probs_column=options.prob_col)

    elif options.method == "summarise":
        E.info("Collating SNP prioritisation resuslts")
        file_list = infile.split(",")
        snpscores = gwas.summariseResults(file_list=file_list)

    snpscores.to_csv(options.stdout, index_label="SNP", sep="\t")

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the GWAS results table (``--gwas-file``), the ped file
    (``--ped-file``) and the map file (``--map-file``), optionally flips
    genotypes to risk alleles (``--flip-alleles``), counts risk alleles
    per individual with ``gwas.countRiskAlleles``, plots the statistic
    selected by ``--plot-statistic`` and writes the per-bin table to
    ``options.stdout`` as tab-separated text.

    Raises
    ------
    ValueError
        if ``--plot-statistic`` was not supplied; the output table is
        only created by the plotting call, so there is nothing to write
        without it.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--method", dest="method", type="choice",
                      choices=["cases_explained", "probability_phenotype"],
                      help="Which results to report, either the proportion "
                      "of cases explained or the probability of the "
                      "phenotype given the number of alleles carried")

    parser.add_option("--map-file", dest="map_file", type="string",
                      help="plink .map file with SNP positions")

    parser.add_option("--ped-file", dest="ped_file", type="string",
                      help="plink ped file with phenotype and "
                      "genotype data - A2 major allele coded")

    parser.add_option("--gwas-file", dest="gwas", type="string",
                      help="gwas results file, assumes Plink "
                      "output format.  Must contain SNP, BP, "
                      "OR column headers.  Assumes results relate "
                      "to the A1 allele")

    parser.add_option("--flip-alleles", dest="flip", action="store_true",
                      help="force alleles to flip if OR < 1")

    parser.add_option("--plot-statistic", dest="plot_stat", type="choice",
                      choices=["frequency", "cumulative"],
                      help="plot either cases frequency or cumulative "
                      "frequency of cases")

    parser.add_option("--plot-path", dest="plot_path", type="string",
                      help="save path for plot")

    parser.add_option("--flag-explained-recessive", dest="explained",
                      action="store_true",
                      help="flag individuals explained by carriage of "
                      "2 risk alleles - NOT IMPLIMENTED")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # the plotting call below is the only place the output table is
    # created; without --plot-statistic the script used to die with a
    # NameError after all the input had been processed. Fail early with
    # an actionable message instead.
    if options.plot_stat is None:
        raise ValueError(
            "--plot-statistic must be one of 'frequency' or 'cumulative'")

    # required files are .ped file, .map file and gwas results file
    E.info("reading GWAS results file: %s" % options.gwas)
    snp_df = pd.read_table(options.gwas, sep="\t", header=0,
                           index_col=None)
    snp_list = snp_df["SNP"].values

    # parse ped file
    E.info("Reading ped file: %s" % options.ped_file)
    ped_df = gwas.parsePed(options.ped_file,
                           compound_geno=True)

    # parse map file and get SNP indices that correspond to
    # ped file genotypes
    E.info("Fetching SNPs from map file: %s" % options.map_file)
    snp_index = gwas.getSNPs(options.map_file,
                             snp_list)

    E.info("SNPs found: %i" % len(snp_index))
    # extract SNPs and ORs as key, value pairs
    or_dict = snp_df.loc[:, ["SNP", "OR"]].to_dict(orient='list')
    snp_or = dict(zip(or_dict["SNP"], or_dict["OR"]))

    if options.flip:
        E.info("Flipping major alleles to risk alleles")
        flipped_genos = gwas.flipRiskAlleles(snp_index=snp_index,
                                             snp_results=snp_or,
                                             genos=ped_df["GENOS"].tolist())
        # merge flipped genotypes with pedigree frame to get phenotypes
        geno_df = pd.DataFrame(flipped_genos, index=ped_df["FID"])
    else:
        # split genos into a dataframe
        genos = np.array(ped_df["GENOS"].tolist())
        geno_df = pd.DataFrame(genos, index=ped_df["FID"])

    merged = pd.merge(geno_df, ped_df, left_index=True, right_on="FID")

    # need to discount missing genotypes > 1%

    # frequencies of number of risk alleles by trait frequency
    E.info("count #risk alleles per individual")
    risk_results = gwas.countRiskAlleles(ped_frame=merged,
                                         snp_index=snp_index.values(),
                                         report=options.method,
                                         flag=options.explained)
    risk_freqs = risk_results["freqs"]
    cumulative = risk_results["cumulative"]

    # select results up to and including the first cumulative freq of
    # exactly 1.0.  If no entry compares equal to 1.0 (e.g. floating
    # point rounding) keep every bin rather than raising an IndexError.
    max_indx = next(
        (ix for ix, freq in enumerate(cumulative) if freq == 1.0),
        len(cumulative) - 1)
    upto = max_indx + 1

    max_freqs = risk_freqs[:upto]
    max_cum = cumulative[:upto]
    cases = risk_results["cases"][:upto]
    bins = list(range(upto))

    # plot!
    # need to add number of individuals into each bin as point size
    if options.plot_stat == "frequency":
        E.info("Generating plot of #risk alleles vs. P(Phenotype)")
        frequencies, ytitle = max_freqs, "P(Phenotype)"
    else:
        # "cumulative" is the only other value the choice option accepts
        E.info("Generating plot of #risk alleles vs. cumulative frequency")
        frequencies, ytitle = max_cum, "Cumulative frequency cases"

    hist_df = gwas.plotRiskFrequency(bins=bins,
                                     frequencies=frequencies,
                                     counts=cases,
                                     savepath=options.plot_path,
                                     ytitle=ytitle)

    # attach the per-bin statistics to the table returned by the plot call
    hist_df["freq"] = risk_results["freqs"][:upto]
    hist_df["cumulative"] = risk_results["cumulative"][:upto]
    hist_df["cases"] = risk_results["cases"][:upto]
    hist_df["controls"] = risk_results["controls"][:upto]
    hist_df["total"] = hist_df["cases"] + hist_df["controls"]
    hist_df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.Stop()
Exemplo n.º 7
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The ``--task`` option selects what is run:

    * ``mafs`` - ``gwas.countByVariantAllele`` on the ped/map files,
      result written to ``options.stdout``.
    * ``penetrance`` - ``gwas.calcPenetrance``; the per-SNP table goes to
      ``options.stdout`` and the summary to ``penetrance_summary.txt``
      in the current working directory.
    * ``allele_diff`` - ``gwas.calcMaxAlleleFreqDiff`` between the
      ``--test-label`` and ``--ref-label`` groups, written to
      ``options.stdout``.
    * ``detect_duplicates`` - ``gwas.findDuplicateVariants`` on the file
      given as the last command line argument; three lists are written
      to ``<outfile-pattern>.triallelic``, ``.duplicates`` and
      ``.overlapping``.

    Raises
    ------
    ValueError
        if ``detect_duplicates`` is requested without
        ``--outfile-pattern``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option(
        "--task",
        dest="task",
        type="choice",
        choices=["mafs", "penetrance", "detect_duplicates", "allele_diff"],
        help="task to perform")

    parser.add_option("--ped-file",
                      dest="ped_file",
                      type="string",
                      help="plink format .ped file")

    parser.add_option("--map-file",
                      dest="map_file",
                      type="string",
                      help="plink format .map file")

    parser.add_option("--freq-file",
                      dest="mafs",
                      type="string",
                      help="text file containing populations minor "
                      "allele frequencies of variants.  One row per "
                      "variant with ID MAF")

    parser.add_option("--groups-file",
                      dest="group_file",
                      type="string",
                      help="file containing group labels for individuals "
                      "in the provided ped file")

    parser.add_option("--ref-label",
                      dest="ref_label",
                      type="string",
                      help="group label to be used as the reference case")

    parser.add_option("--test-label",
                      dest="test_label",
                      type="string",
                      help="group label to be used as the test case")

    parser.add_option("--subset",
                      dest="subset",
                      type="choice",
                      choices=["cases", "gender"],
                      help="subset the "
                      "data by either case/control or gender")

    parser.add_option("--take-last",
                      dest="take",
                      action="store_true",
                      help="if use duplicates will take the last variant, "
                      "default behaviour is to take the first")

    parser.add_option("--outfile-pattern",
                      dest="out_pattern",
                      type="string",
                      help="outfile pattern to use for finding duplicates "
                      "and triallelic variants")

    parser.add_option("--snp-set",
                      dest="snp_subset",
                      type="string",
                      help="list of SNPs to include")

    # defaults must be registered before parsing to have any effect, and
    # must be keyed by the option's dest ("take", not "take_last")
    parser.set_defaults(mafs=None, subset=None, take=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.task == "mafs":
        mafs = gwas.countByVariantAllele(options.ped_file, options.map_file)

        # `index_col` is a read_csv/read_table argument, not a to_csv
        # one, and raises TypeError on current pandas; the index is
        # written as before.
        # NOTE(review): confirm whether the index was meant to be
        # suppressed here (index=False).
        mafs.to_csv(options.stdout, sep="\t")

    elif options.task == "penetrance":
        summary, pens = gwas.calcPenetrance(options.ped_file,
                                            options.map_file,
                                            subset=options.subset,
                                            mafs=options.mafs,
                                            snpset=options.snp_subset)

        pens.to_csv(options.stdout, sep="\t", index_label="SNP")
        summary.to_csv(os.path.join(os.getcwd(), "penetrance_summary.txt"),
                       sep="\t",
                       index_label="SNP")

    elif options.task == "allele_diff":
        allele_diffs = gwas.calcMaxAlleleFreqDiff(
            ped_file=options.ped_file,
            map_file=options.map_file,
            group_file=options.group_file,
            test=options.test_label,
            ref=options.ref_label)

        allele_diffs.to_csv(options.stdout, sep="\t")

    elif options.task == "detect_duplicates":
        # find variants with duplicated position and shared reference
        # allele indicative of triallelic variants - also same ID
        # output to a filter list
        if options.out_pattern is None:
            raise ValueError(
                "--outfile-pattern is required for task detect_duplicates")

        # the bim file is taken from the last command line argument
        infile = argv[-1]
        dups, tris, oves = gwas.findDuplicateVariants(bim_file=infile,
                                                      take_last=options.take)

        # absolute patterns are used verbatim, relative ones are
        # resolved against the current working directory
        if os.path.isabs(options.out_pattern):
            outpattern = options.out_pattern
        else:
            outpattern = os.path.abspath(options.out_pattern)

        # one variant per line, one file per category
        outputs = ((".triallelic", tris),
                   (".duplicates", dups),
                   (".overlapping", oves))
        for suffix, variants in outputs:
            with open(outpattern + suffix, "w") as outfile:
                outfile.writelines("%s\n" % variant for variant in variants)

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 8
0
def main(argv=None):
    """Entry point for the sample-level QC tasks.

    Command line options are taken from *argv*, falling back to
    sys.argv when it is not supplied.  ``--task`` selects which ``gwas``
    QC helper is run; every task except ``flag_relations`` writes its
    result to ``options.stdout`` as a tab-separated table without the
    index.  ``flag_relations`` only calls ``gwas.flagRelated`` and
    discards the return value.
    """
    argv = sys.argv if argv is None else argv

    # build the command line parser
    parser = E.OptionParser(usage=globals()["__doc__"],
                            version="%prog version: $Id$")

    # (flags, keyword settings) for each script option, registered in order
    option_specs = [
        (("-t", "--test"),
         dict(dest="test", type="string", help="supply help")),
        (("--task",),
         dict(dest="task", type="choice",
              choices=["merge_exclusions", "flag_hets", "find_inbreds",
                       "flag_relations", "discordant_gender"],
              help="task to execute on phenotype file(s)")),
        (("--gender-check-file",),
         dict(dest="gender_check", type="string",
              help="output from gender checking by Plink, "
                   "suffix should be .sexcheck")),
        (("--relationship-file",),
         dict(dest="relations", type="string",
              help="output file from IBS calculation.  "
                   "Should contain all pairwise relationships.")),
        (("--inbreeding-coef-file",),
         dict(dest="inbreed_file", type="string",
              help="file containing either Plink or GCTA estimates "
                   "of F, inbreeding coefficient")),
        (("--inbreeding-coefficient",),
         dict(dest="inbred_coeff", type="choice",
              choices=["Fhat1", "Fhat2", "Fhat3", "F", "ibc"],
              help="inbreeding coefficient to use to identify "
                   "highly inbred individuals")),
        (("--inbred-cutoff",),
         dict(dest="inbred_cutoff", type="float",
              help="threshold above which individuals are "
                   "classed as inbred.")),
        (("--ibs-cutoff",),
         dict(dest="ibs_cutoff", type="float",
              help="IBS threshold to flag individuals as "
                   "being closely related")),
        (("--trimmed-relationships",),
         dict(dest="rel_cutoff", type="string",
              help="output file from Plink --rel-cutoff with trimmed "
                   "data set of unrelated individuals.")),
        (("--heterozygotes-file",),
         dict(dest="hets_file", type="string",
              help="file from heterozygote analysis containing "
                   "observed homozygosity and F coefficients")),
        (("--auxillary-file",),
         dict(dest="aux_file", type="string",
              help="a file of IIDs and FIDs for individuals that "
                   "are to be removed from analysis, unrelated to QC")),
        (("--plotting-path",),
         dict(dest="plot_path", type="string",
              help="PATH to save any plots to")),
    ]
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    # add the common options (-h/--help, ...) and parse the command line
    options, args = E.start(parser, argv=argv)

    def write_table(frame):
        # all reporting tasks share the same output format
        frame.to_csv(options.stdout, index=None, sep="\t")

    task = options.task
    figure_dir = options.plot_path

    if task == "flag_hets":
        # calculate heterozygosity rates, find and flag
        # individuals > 3 s.d. away from mean value
        # rate = (nonmissing - homs) / nonmissing
        # i.e. non-homozygote rate
        write_table(gwas.flagExcessHets(options.hets_file,
                                        plot=True,
                                        plot_path=figure_dir))

    elif task == "merge_exclusions":
        write_table(gwas.mergeQcExclusions(
            hets_file=options.hets_file,
            inbred_file=options.inbreed_file,
            related_file=options.relations,
            gender_file=options.gender_check,
            mask_file=options.aux_file))

    elif task == "find_inbreds":
        write_table(gwas.flagInbred(
            inbred_file=options.inbreed_file,
            inbreeding_coefficient=options.inbred_coeff,
            ibc_threshold=options.inbred_cutoff,
            plot=True,
            plot_path=figure_dir))

    elif task == "flag_relations":
        # the input file is likely to be huge, hence the chunked read;
        # the return value is not written anywhere by this script
        gwas.flagRelated(ibd_file=options.relations,
                         chunk_size=500000,
                         threshold=options.ibs_cutoff,
                         plot=True,
                         plotting_path=figure_dir)

    elif task == "discordant_gender":
        write_table(gwas.flagGender(gender_file=options.gender_check,
                                    plot=True,
                                    plot_path=figure_dir))

    # any other task value (including none given) is a no-op

    # write footer and output benchmark information.
    E.stop()
Exemplo n.º 9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The last command line argument is taken as the input file.  The
    ``--plot-type`` option selects the plot:

    * ``map`` - the input table and the ``--coordinate-file`` table are
      read and passed to ``gwas.plotMapPhenotype``.
    * ``pca`` - the input is parsed with ``gwas.parseFlashPCA`` together
      with ``--fam-file`` and passed to ``gwas.plotPCA``.
    * anything else - the input table is read and passed to
      ``gwas.plotPhenotype`` with the requested plot type.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("-p",
                      "--plot-type",
                      dest="plot_type",
                      type="choice",
                      choices=[
                          "histogram", "barplot", "density", "boxplot",
                          "scatter", "map", "pca"
                      ],
                      help="the plot type to generate")

    parser.add_option("--plot-n-pc",
                      dest="n_pcs",
                      type="int",
                      help="The number of principal components to "
                      "plot")

    parser.add_option("-g",
                      "--group-by",
                      dest="group_by",
                      type="string",
                      help="column header to group observations by")

    parser.add_option("-x",
                      "--x-column",
                      dest="x_col",
                      type="string",
                      help="column to plot on X axis")

    parser.add_option("-y",
                      "--y-column",
                      dest="y_col",
                      type="string",
                      help="column to plot on y axis")

    parser.add_option("-i",
                      "--index_column",
                      dest="indx",
                      type="string",
                      help="column number that refers to the row index")

    parser.add_option("--output-file",
                      dest="outfile",
                      type="string",
                      help="path and filename to save plot to")

    parser.add_option("--labels",
                      dest="labels",
                      type="string",
                      help="a comma-separated list of axis labels. "
                      "The first 2 correspond to the X and Y-axis, "
                      "respectively, and the third is the plot title")

    parser.add_option("--metadata-file",
                      dest="meta_file",
                      type="string",
                      help="file containing metadata for annotating "
                      "plots with. Use `--group-labels` to define table "
                      "columns to use")

    parser.add_option("--fam-file",
                      dest="fam_file",
                      type="string",
                      help="Plink .fam file containing file IDs")

    parser.add_option("--xvar-labels",
                      dest="xvar_labs",
                      type="string",
                      help="a comma-separated list of variable X labels"
                      "only applies when X is a discrete or categorical "
                      "variable. The labels must be in the correct order")

    parser.add_option("--group-labels",
                      dest="group_labs",
                      type="string",
                      help="a comma-separated list of grouping variable "
                      "labels.  Can only apply when the grouping variable "
                      "is discrete or categorical.  The labels must be "
                      "input in the order of the data")

    parser.add_option("--yvar-labels",
                      dest="yvar_labs",
                      type="string",
                      help="a comma-separated list of variable Y labels"
                      "only applies when Y is a discrete or categorical "
                      "variable")

    parser.add_option("--var-type",
                      dest="var_type",
                      type="choice",
                      choices=["continuous", "categorical", "integer"],
                      help="The data type of the variables to be plotted."
                      "The default is continuous")

    parser.add_option("--coordinate-file",
                      dest="coordinates",
                      type="string",
                      help="file containing co-ordinates data")

    parser.add_option("--coords-id-col",
                      dest="coord_ids",
                      type="string",
                      help="column header containing individual IDs")

    parser.add_option("--lattitude-column",
                      dest="lat_col",
                      type="string",
                      help="column header containing lattitude co-ordinates")

    parser.add_option("--longitude-column",
                      dest="long_col",
                      type="string",
                      help="column header containing longitude co-ordinates")

    parser.add_option("--reference-value",
                      dest="ref_val",
                      type="string",
                      help="categorical variable level to dichotomise on")

    # defaults have to be registered before the command line is parsed;
    # set afterwards they never reach `options`, which left `labels` and
    # `var_type` as None despite the documented defaults
    parser.set_defaults(y_col=None,
                        group_by=None,
                        indx=None,
                        labels="X,Y,title",
                        xvar_labs=None,
                        yvar_labs=None,
                        var_type="continuous")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # the input file is the last command line argument
    infile = argv[-1]

    if options.plot_type == "map":
        # the input table is read once, and only by the tasks that use it
        df = pd.read_table(infile, sep="\t", index_col=options.indx, header=0)

        coords_df = pd.read_table(options.coordinates,
                                  sep="\t",
                                  header=0,
                                  index_col=options.indx)
        gwas.plotMapPhenotype(data=df,
                              coords=coords_df,
                              coord_id_col=options.coord_ids,
                              lat_col=options.lat_col,
                              long_col=options.long_col,
                              save_path=options.outfile,
                              xvar=options.x_col,
                              var_type=options.var_type,
                              xlabels=options.xvar_labs,
                              level=options.ref_val)

    elif options.plot_type == "pca":
        # the PCA input is parsed by gwas.parseFlashPCA, so it is not
        # loaded with pd.read_table here
        data = gwas.parseFlashPCA(pcs_file=infile, fam_file=options.fam_file)

        gwas.plotPCA(data=data,
                     nPCs=options.n_pcs,
                     point_labels=options.group_labs,
                     save_path=options.outfile,
                     headers=False,
                     metadata=options.meta_file,
                     multiplot=True)
    else:
        df = pd.read_table(infile, sep="\t", index_col=options.indx, header=0)

        gwas.plotPhenotype(data=df,
                           plot_type=options.plot_type,
                           x=options.x_col,
                           y=options.y_col,
                           group=options.group_by,
                           save_path=options.outfile,
                           labels=options.labels,
                           xlabels=options.xvar_labs,
                           ylabels=options.yvar_labs,
                           glabels=options.group_labs,
                           var_type=options.var_type)

    # write footer and output benchmark information.
    E.stop()