def main(argv=None):
    """Script entry point: plot phenotype data from a tab-separated table.

    Parses command line options from ``sys.argv`` unless *argv* is given.
    The input file is taken as the last element of *argv*.  Depending on
    ``--plot-type`` the work is delegated to ``gwas.plotMapPhenotype``
    ("map"), ``gwas.parseFlashPCA``/``gwas.plotPCA`` ("pca") or
    ``gwas.plotPhenotype`` (every other plot type).

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; defaults to ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("-p", "--plot-type", dest="plot_type", type="choice",
                      choices=["histogram", "barplot", "density",
                               "boxplot", "scatter", "map", "pca"],
                      help="the plot type to generate")

    parser.add_option("--plot-n-pc", dest="n_pcs", type="int",
                      help="The number of principal components to "
                      "plot")

    parser.add_option("-g", "--group-by", dest="group_by", type="string",
                      help="column header to group observations by")

    parser.add_option("-x", "--x-column", dest="x_col", type="string",
                      help="column to plot on X axis")

    parser.add_option("-y", "--y-column", dest="y_col", type="string",
                      help="column to plot on y axis")

    parser.add_option("-i", "--index_column", dest="indx", type="string",
                      help="column number that refers to the row index")

    parser.add_option("--output-file", dest="outfile", type="string",
                      help="path and filename to save plot to")

    parser.add_option("--labels", dest="labels", type="string",
                      help="a comma-separated list of axis labels. "
                      "The first 2 correspond to the X and Y-axis, "
                      "respectively, and the third is the plot title")

    parser.add_option("--metadata-file", dest="meta_file", type="string",
                      help="file containing metadata for annotating "
                      "plots with. Use `--group-labels` to define table "
                      "columns to use")

    parser.add_option("--fam-file", dest="fam_file", type="string",
                      help="Plink .fam file containing file IDs")

    parser.add_option("--xvar-labels", dest="xvar_labs", type="string",
                      help="a comma-separated list of variable X labels"
                      "only applies when X is a discrete or categorical "
                      "variable. The labels must be in the correct order")

    parser.add_option("--group-labels", dest="group_labs", type="string",
                      help="a comma-separated list of grouping variable "
                      "labels. Can only apply when the grouping variable "
                      "is discrete or categorical. The labels must be "
                      "input in the order of the data")

    parser.add_option("--yvar-labels", dest="yvar_labs", type="string",
                      help="a comma-separated list of variable Y labels"
                      "only applies when Y is a discrete or categorical "
                      "variable")

    parser.add_option("--var-type", dest="var_type", type="choice",
                      choices=["continuous", "categorical", "integer"],
                      help="The data type of the variables to be plotted."
                      "The default is continuous")

    parser.add_option("--coordinate-file", dest="coordinates",
                      type="string",
                      help="file containing co-ordinates data")

    parser.add_option("--coords-id-col", dest="coord_ids", type="string",
                      help="column header containing individual IDs")

    parser.add_option("--lattitude-column", dest="lat_col", type="string",
                      help="column header containing lattitude co-ordinates")

    parser.add_option("--longitude-column", dest="long_col", type="string",
                      help="column header containing longitude co-ordinates")

    parser.add_option("--reference-value", dest="ref_val", type="string",
                      help="categorical variable level to dichotomise on")

    # Defaults must be registered BEFORE parsing: optparse copies them into
    # the options object at parse time, so setting them afterwards (as the
    # code used to) silently left labels/var_type as None.
    parser.set_defaults(y_col=None,
                        group_by=None,
                        indx=None,
                        labels="X,Y,title",
                        xvar_labs=None,
                        yvar_labs=None,
                        var_type="continuous")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # the input file is the last command line argument
    infile = argv[-1]

    if options.plot_type == "map":
        df = pd.read_table(infile, sep="\t", index_col=options.indx,
                           header=0)
        coords_df = pd.read_table(options.coordinates, sep="\t",
                                  header=0, index_col=options.indx)
        gwas.plotMapPhenotype(data=df,
                              coords=coords_df,
                              coord_id_col=options.coord_ids,
                              lat_col=options.lat_col,
                              long_col=options.long_col,
                              save_path=options.outfile,
                              xvar=options.x_col,
                              var_type=options.var_type,
                              xlabels=options.xvar_labs,
                              level=options.ref_val)

    elif options.plot_type == "pca":
        # the PCA branch parses the input itself via gwas.parseFlashPCA,
        # so the table is deliberately not loaded with pandas here
        data = gwas.parseFlashPCA(pcs_file=infile,
                                  fam_file=options.fam_file)
        gwas.plotPCA(data=data,
                     nPCs=options.n_pcs,
                     point_labels=options.group_labs,
                     save_path=options.outfile,
                     headers=False,
                     metadata=options.meta_file,
                     multiplot=True)

    else:
        df = pd.read_table(infile, sep="\t", index_col=options.indx,
                           header=0)
        gwas.plotPhenotype(data=df,
                           plot_type=options.plot_type,
                           x=options.x_col,
                           y=options.y_col,
                           group=options.group_by,
                           save_path=options.outfile,
                           labels=options.labels,
                           xlabels=options.xvar_labs,
                           ylabels=options.yvar_labs,
                           glabels=options.group_labs,
                           var_type=options.var_type)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Script entry point: plot GWAS association or epistasis results.

    Parses command line options from ``sys.argv`` unless *argv* is given.
    The last element of *argv* is treated as the input: either a single
    results file or a comma-separated list of files.  The results are
    loaded with ``gwas.GWASResults`` and plotted according to
    ``--plot-type``.  Only the Manhattan plot returns a table, which is
    written tab-separated to ``options.stdout``.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; defaults to ``sys.argv``.

    Raises
    ------
    IOError
        If the last command line argument contains no file names.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--plot-type", dest="plot_type", type="choice",
                      choices=["manhattan", "qqplot", "epistasis"],
                      help="plot type to generate")

    parser.add_option("--resolution", dest="resolution", type="choice",
                      choices=["genome_wide", "chromosome", "fine_map"],
                      help="the resolution of plotting, wether the plot "
                      "depicts the whole genome, a single chromosome or "
                      "a specific locus")

    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=["plink", "cassi", "cassi_covar"],
                      help="input file format, used to parse the file "
                      "properly")

    parser.add_option("--save-path", dest="save_path", type="string",
                      help="path and filename to save image to")

    # Defaults must be registered BEFORE parsing, otherwise they never reach
    # the parsed options object and plot_type/resolution/file_format stay
    # None when the flags are omitted.
    parser.set_defaults(resolution="genome_wide",
                        plot_type="manhattan",
                        file_format="plink")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # if the input is a list of files, split them; empty names (e.g. from a
    # trailing comma) are dropped so the "no input" check below can fire
    infiles = [fname for fname in argv[-1].split(",") if fname]
    if not infiles:
        raise IOError("no input files detected, please specifiy association "
                      "results files as the last command line argument")

    # need to parse epistasis output slightly differently
    epi = options.plot_type == "epistasis"

    # several files are passed as a list, a single file as a plain string
    assoc_file = infiles if len(infiles) > 1 else infiles[0]
    results = gwas.GWASResults(assoc_file=assoc_file,
                               epistasis=epi,
                               file_format=options.file_format)

    # only the Manhattan plot yields a table of appended results
    df = None
    if options.plot_type == "manhattan":
        df = results.plotManhattan(resolution=options.resolution,
                                   save_path=options.save_path)
    elif options.plot_type == "qqplot":
        results.plotQQ(save_path=options.save_path,
                       resolution=options.resolution)
    elif options.plot_type == "epistasis":
        results.plotEpistasis(save_path=options.save_path,
                              resolution=options.resolution)

    # only output appended results for Manhattan plot, not qqplot
    if df is not None:
        df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Script entry point: build and run a genome-wide analysis command.

    Parses command line options from ``sys.argv`` unless *argv* is given.
    The input files (``--input-file-pattern`` or, failing that, the
    comma-separated last element of *argv*) are wrapped in a
    ``gwas.FileGroup`` and handed to the program wrapper selected by
    ``--program``.  Every supplied ``filt_*`` option is applied as a filter,
    the task selected by ``--method`` is configured, and finally
    ``build_statement`` is called on the program object.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; defaults to ``sys.argv``.

    Raises
    ------
    ValueError
        If ``--program`` is missing, or ``--method=ld`` is requested
        without ``--ld-min``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--program", dest="program", type="choice",
                      choices=["plink2", "gcta", "plinkdev"],
                      help="program to execute genome-wide analysis")

    parser.add_option("--input-file-pattern", dest="infile_pattern",
                      type="string",
                      help="file prefix that identifies a group of files")

    parser.add_option("--input-file-format", dest="file_format",
                      type="choice",
                      choices=["plink", "plink_binary", "oxford",
                               "oxford_binary", "vcf", "GRM_binary",
                               "GRM_gz", "GRM_plink"],
                      help="format of input files")

    parser.add_option("--phenotypes-file", dest="pheno_file", type="string",
                      help="text file of additional phenotypes")

    parser.add_option("--pheno", dest="pheno", type="string",
                      help="either phenotype file column header or number")

    parser.add_option("--covariates-file", dest="covariate_file",
                      type="string",
                      help="file containing covariates. Used as the "
                      "continuous covariates in GCTA-based analyses")

    parser.add_option("--covariate-column", dest="covar_col", type="string",
                      help="column number(s) or header(s) to include in "
                      "association model")

    parser.add_option("--discrete-covariates-file",
                      dest="covariate_discrete", type="string",
                      help="file containing discrete covariates "
                      "to adjust for in GCTA-based analyses")

    parser.add_option("--association-model", dest="assoc_model",
                      type="choice",
                      choices=["recessive", "dominant", "genotype"],
                      help="model to report from association analysis")

    parser.add_option("--method", dest="method", type="choice",
                      choices=["association", "summary", "format", "matrix",
                               "reml", "bivariate_reml", "pca", "lmm",
                               "simulation", "epistasis", "ld",
                               "estimate_haplotypes"],
                      help="method to apply to genome-wide data")

    parser.add_option("--reml-method", dest="reml_method", type="choice",
                      choices=["standard", "priors", "reml_algorithm",
                               "unconstrained", "GxE", "LRT", "BLUP_EBV",
                               "snpBLUP", "no_residual", "fixed_cor"],
                      help="method for REML estimate of heritability method "
                      "including either single or dual phenotypes")

    parser.add_option("--reml-parameters", dest="reml_param", type="string",
                      help="comma separated list of parameters to pass to "
                      "REML variance components analysis")

    parser.add_option("--prevalence", dest="prevalence", type="float",
                      help="binary trait prevalence in a cohort study. "
                      "Used to estimate h2 on the liability threshold "
                      "scale.")

    parser.add_option("--lmm-method", dest="lmm_method", type="choice",
                      choices=["standard", "loco", "no_covar"],
                      help="type of linear mixed model analysis to run")

    parser.add_option("--grm-prefix", dest="grm_prefix", type="string",
                      help="prefix of the pre-computed GRM files to use "
                      "in the linear mixed model analysis")

    parser.add_option("--epistasis-method", dest="epi_method", type="choice",
                      choices=["fast_epistasis", "epistasis", "two_locus",
                               "adjusted"],
                      help="epistasis method to use")

    parser.add_option("--epistasis-parameter", dest="epi_param",
                      type="string",
                      help="modifiers of epistasis functions")

    parser.add_option("--epistasis-threshold", dest="epi_sig", type="string",
                      help="statistical significance threshold for counting "
                      "interactions")

    parser.add_option("--epistasis-report-threshold", dest="epi_report",
                      type="string",
                      help="threshold used to count the "
                      "proportion of statistically significant interactions")

    parser.add_option("--set-file", dest="set_file", type="string",
                      help="file containing variant sets as per Plink "
                      ".set file specification")

    parser.add_option("--set-method", dest="set_method", type="choice",
                      choices=["set-by-all", "set-by-set"],
                      help="set method to use when `set_file` provided")

    parser.add_option("--principal-components", dest="num_pcs", type="int",
                      help="the number of principal components to output")

    parser.add_option("--matrix-shape", dest="matrix_shape", type="choice",
                      choices=["triangle", "square", "square0"],
                      help="output matrix shape.", default="triangle")

    parser.add_option("--matrix-compression", dest="matrix_compress",
                      type="choice",
                      choices=["gz", "bin", "bin4"],
                      help="compression to apply to output matrix")

    parser.add_option("--matrix-form", dest="matrix_form", type="choice",
                      choices=["distance", "grm"],
                      help="type of relationship matrix to calculate")

    parser.add_option("--matrix-metric", dest="matrix_metric", type="choice",
                      choices=["fhat", "cov", "ibc2", "ibc3", "ibs",
                               "genomic", "hamming"],
                      help="value to calculate for diagonal elements of the "
                      "grm. Default is fhat for grm and hamming for distance.")

    parser.add_option("--matrix-options", dest="matrix_options",
                      type="string",
                      help="modifiers of matrix output, see plink documentation "
                      "for details")

    parser.add_option("--association-method", dest="assoc_method",
                      type="choice",
                      choices=["linear", "logistic", "assoc", "qassoc"],
                      help="association analysis to run")

    parser.add_option("--permutation", dest="permutation",
                      action="store_true",
                      help="perform association testing by permutation analysis")

    parser.add_option("--repeats", dest="n_perms", type="int",
                      help="number of repetitions for permutation analysis")

    parser.add_option("--association-options", dest="assoc_option",
                      type="string",
                      help="association analysis modifiers")

    parser.add_option("--format-method", dest="format_method", type="choice",
                      choices=["change_format", "change_missing_values",
                               "update_variants", "update_samples",
                               "flip_strands", "flip_scan", "sort",
                               "merge", "find_duplicates"],
                      help="file formatting to apply to input files")

    parser.add_option("--format-parameter", dest="format_param",
                      type="string",
                      help="formatting parameter, where appropriate")

    parser.add_option("--reformat-type", dest="reformat", type="choice",
                      choices=["plink", "plink_binary", "oxford",
                               "oxford_binary", "raw"],
                      help="new format of input files to be reformatted to")

    parser.add_option("--apply-missing", dest="apply_missing", type="choice",
                      choices=["genotype", "phenotype"],
                      help="genotype or phenotype missing values to alter")

    parser.add_option("--update-variant-attribute", dest="variant_update",
                      type="choice",
                      choices=["variant_ids", "missing_id", "chromosome",
                               "centimorgan", "name", "alleles", "map"],
                      help="update variant attributes")

    parser.add_option("--update-sample-attribute", dest="sample_update",
                      type="choice",
                      choices=["sample_ids", "parents", "gender"],
                      help="sample attributes to be updated")

    parser.add_option("--strand-flip-subset", dest="flip_subset",
                      action="store_true",
                      help="apply strand flipping to a subset of samples")

    parser.add_option("--flip-scan-type", dest="scan_param", type="choice",
                      choices=["default", "window", "threshold"],
                      help="strand flipping scan to apply to SNPs")

    parser.add_option("--sort-type", dest="sort_type", type="choice",
                      choices=["none", "natural", "ascii", "file"],
                      help="sort type to input files")

    parser.add_option("--merge-file-format", dest="merge_format",
                      type="choice",
                      choices=["plink", "plink_binary"],
                      help="format of input files to be merged")

    parser.add_option("--merge-mode", dest="merge_mode", type="choice",
                      choices=["default", "original_missing",
                               "new_nonmissing", "no_overwrite", "force",
                               "report_all", "report_nonmissing"],
                      help="merge mode to apply to dealing with merge conflicts")

    parser.add_option("--duplicates-method", dest="dup_method",
                      type="choice",
                      choices=["same_ref", "id_match", "suppress_first"],
                      help="method for identifying and dealing with duplicate "
                      "variants")

    parser.add_option("--summary-method", dest="summary_method",
                      type="choice",
                      choices=["allele_frequency", "missing_data",
                               "hardy_weinberg", "mendel_errors",
                               "inbreeding", "gender_checker",
                               "wrights_fst", "case_control_fst"],
                      help="summary statistics to calculate")

    parser.add_option("--summary-parameter", dest="sum_param", type="string",
                      help="optional parameters that can be passed to summary "
                      "statistics methods")

    parser.add_option("--haplotype-frequency",
                      dest="filt_haplotype_frequency", type="string",
                      help="min allele frequency for SNPs to be "
                      "considered for a haplotype")

    parser.add_option("--haplotype-size", dest="filt_haplotype_size",
                      type="string",
                      help="maximum genomic size of "
                      "haplotypes")

    parser.add_option("--ld-statistic", dest="ld_stat", type="choice",
                      choices=["r", "r2"],
                      help="compute either the raw "
                      "inter variant allele count correlation, R, or the "
                      "squared correlation, R^2")

    parser.add_option("--ld-min", dest="ld_min", type="string",
                      help="minimum value to report for pair-wise LD "
                      "calculations. Beware output files may be very "
                      "large if `ld_min` is very small.")

    parser.add_option("--ld-window", dest="ld_window", type="string",
                      help="distance between SNPs, beyond which LD will "
                      "not be calculated")

    parser.add_option("--ld-format-output", dest="ld_shape", type="choice",
                      choices=["square", "table", "triangle", "square0"],
                      help="output either as table, or matrix format with a "
                      "specific shape.")

    parser.add_option("--genotype-rate", dest="filt_genotype_rate",
                      type="string",
                      help="genotyping rate threshold. SNPs below this threshold "
                      "will be excluded from analysis")

    parser.add_option("--indiv-missing", dest="filt_missingness",
                      type="string",
                      help="individual missingness rate. Individuals below "
                      "this threshold will be excluded from analysis")

    parser.add_option("--hardy-weinberg", dest="filt_hwe", type="string",
                      help="hardy-weinberg p-value threshold for SNPs. SNPs "
                      "with a 2df chisquared p-value below this will be "
                      "filtered out")

    parser.add_option("--min-allele-frequency",
                      dest="filt_min_allele_frequency", type="string",
                      help="only include SNPs with an allele frequency equal to "
                      "or above this threshold")

    parser.add_option("--max-allele-frequency",
                      dest="filt_max_allele_frequency", type="string",
                      help="only include SNPs with an allele frequency equal to "
                      "or below this threshold")

    parser.add_option("--mendelian-error", dest="filt_mendelian_error",
                      type="string",
                      help="exclude individuals/trios with mendelian errors that "
                      "exceed this value")

    parser.add_option("--keep-individuals", dest="filt_keep", type="string",
                      help="a file containing individuals IDs to keep, "
                      "one per row")

    parser.add_option("--remove-individuals", dest="filt_remove",
                      type="string",
                      help="a file of individual IDs to remove, one per row")

    # NOTE(review): the dest below is spelt "qaul" while its sibling uses
    # "qual"; left untouched because the filter name expected by
    # apply_filters is not visible here - confirm against gwas module.
    parser.add_option("--min-quality-score", dest="filt_min_qaul_score",
                      type="string",
                      help="reset the minimum low bound of quality scores for "
                      "variants in a VCF file. Default is 0")

    parser.add_option("--max-quality-score", dest="filt_max_qual_score",
                      type="string",
                      help="reset the maximum upper bound of quality scores for "
                      "a VCCF file. Default is Inf")

    parser.add_option("--allow-no-gender", dest="filt_allow_no_sex",
                      type="string",
                      help="allow individuals with gender missing")

    parser.add_option("--enforce-gender", dest="filt_enforce_sex",
                      type="string",
                      help="only include individuals with non-missing gender "
                      "information")

    parser.add_option("--subset-filter", dest="filt_subset_filter",
                      type="choice",
                      choices=["cases", "controls", "males", "females",
                               "founders", "nonfounders"],
                      help="only apply filters to the specific subset of "
                      "individuals supplied")

    parser.add_option("--extract-snps", dest="filt_extract", type="string",
                      help="text file of variant IDs to include in the analysis, "
                      "ignoring all others")

    parser.add_option("--exclude-snps", dest="filt_exclude", type="string",
                      help="a file of variant IDs to exclude from analysis")

    parser.add_option("--restrict-chromosome", dest="filt_chromosome",
                      type="string",
                      help="restict analysis to either a single chromosome, "
                      "or a comma-separated list of chromosomes")

    parser.add_option("--exclude-chromosomes",
                      dest="filt_exclude_chromosome", type="string",
                      help="exclude all variants on these "
                      "chromosome(s)")

    parser.add_option("--autosome-only", dest="filt_autosome",
                      action="store_true",
                      help="if present only autosomal variants will be analysed")

    parser.add_option("--pseudo-autosome", dest="filt_pseudo_autosome",
                      action="store_true",
                      help="include on the pseudo-autosomal region of chromosome X")

    parser.add_option("--ignore-indels", dest="filt_ignore_indels",
                      action="store_true",
                      help="only include bi-allelic single nucleotide "
                      "variants in analysis")

    parser.add_option("--snp-range", dest="filt_snp_bp_range", type="string",
                      help="comma separated list of from, to genome co-ordinates "
                      "within which to include variants for analysis")

    parser.add_option("--conditional-snp", dest="filt_conditional_snp",
                      type="string",
                      help="condition the analysis on this SNP ID. Can only be "
                      "used in the linear and logistic regression models.")

    parser.add_option("--snp-id-range", dest="filt_snp_id_range",
                      type="string",
                      help="comma separate list of IDs from, to within which "
                      "to include variants for analysis.")

    parser.add_option("--snp-id", dest="filt_specific_snp", type="string",
                      help="include a single snp in the analysis given by "
                      "it's variant ID.")

    parser.add_option("--exclude-variant", dest="filt_exclude_snp",
                      type="string",
                      help="exclude a single variant from the analysis, "
                      "given by it's variant ID")

    parser.add_option("--covariate-filter", dest="filt_covariate_filter",
                      type="string",
                      help="covariate column headers or column numbers on which "
                      "to filter on. Requries --covariate-file")

    parser.add_option("--filter-parameter", dest="param", type="string",
                      help="parameter values to be passed to filtering function")

    parser.add_option("--window-size", dest="window_size", type="string",
                      help="alters the behaviour of the --snp-range and "
                      "--include/exclude snp options. variants within +/- "
                      "half * window_size (kb) are included")

    parser.add_option("--range-resolution", dest="filt_range_resolution",
                      type="choice",
                      choices=["bp", "kb", "mb"],
                      help="alters the (from, to) range resolution to either bp, "
                      "kb or mb")

    parser.add_option("--output-file-pattern", dest="out_pattern",
                      type="string",
                      help="output file pattern prefix. file suffixes are dependent "
                      "on the task executed")

    parser.add_option("--threads", dest="threads", type="int",
                      help="the number of threads to use for multi-threaded "
                      "processes")

    parser.add_option("--memory", dest="memory", type="string",
                      help="amount of memory to reserve for the task")

    parser.add_option("--parallel", dest="parallel", type="int",
                      help="number of jobs to split task into")

    # Defaults must be registered BEFORE parsing: optparse copies them into
    # the options object at parse time, so setting them afterwards (as the
    # code used to) meant memory, matrix_compress, dup_method, random_seed
    # and the epistasis thresholds never took effect.
    parser.set_defaults(sum_param=None,
                        dup_method="same_ref",
                        n_perms=None,
                        permutation=False,
                        matrix_shape="triangle",
                        matrix_options=None,
                        matrix_compress="gz",
                        random_seed=random.randint(0, 19999),
                        sample_update=None,
                        memory="60G",
                        parallel=None,
                        covariate_file=None,
                        covar_col=None,
                        epi_report=0.001,
                        epi_sig=0.001)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # an explicit file pattern wins; otherwise the last argument is a
    # comma-separated list of input files
    if not options.infile_pattern:
        infiles = (argv[-1]).split(",")
    else:
        infiles = options.infile_pattern

    # create a new filegroup object
    geno_files = gwas.FileGroup(files=infiles,
                                file_format=options.file_format,
                                genotype_format="imputed")
    if options.pheno_file:
        geno_files.set_phenotype(pheno_file=options.pheno_file,
                                 pheno=options.pheno)

    # add FileGroup object to the gwas program object
    program_classes = {"plink2": gwas.Plink2,
                       "plinkdev": gwas.PlinkDev,
                       "gcta": gwas.GCTA}
    if options.program not in program_classes:
        # previously this fell through and failed later with an unbound
        # local variable; fail early with an actionable message instead
        raise ValueError("--program is required, choose one of: plink2, "
                         "gcta, plinkdev")
    gwas_object = program_classes[options.program](files=geno_files)
    gwas_object.program_call(infiles=geno_files,
                             outfile=options.out_pattern)

    # collect filtering options from options; only filters that were
    # actually supplied (truthy values) are kept
    opt_dict = options.__dict__
    prefix = "filt_"
    filter_dict = {key: value for key, value in opt_dict.items()
                   if key.startswith(prefix) and value}

    # iteratively add genotype filters to GWASProgram object
    for fkey, filter_value in filter_dict.items():
        # slice the prefix off: str.lstrip("filt_") strips a *set of
        # characters* and mangled e.g. "filt_ignore_indels" into
        # "gnore_indels"
        gwas_object.apply_filters(filter_type=fkey[len(prefix):],
                                  filter_value=filter_value)

    # handle summary statistics
    if options.method == "summary":
        if options.summary_method:
            # every summary method is passed as the keyword of the same name
            gwas_object._output_statistics(
                **{options.summary_method: options.sum_param})

    elif options.method == "pca":
        gwas_object.PCA(n_pcs=options.num_pcs)

    elif options.method == "ld":
        if options.ld_min is None:
            raise ValueError("--ld-min is required when --method=ld")
        gwas_object.calc_ld(ld_statistic=options.ld_stat,
                            ld_threshold=float(options.ld_min),
                            ld_shape=options.ld_shape)

    elif options.method == "association":
        gwas_object.run_association(association=options.assoc_method,
                                    permutation=options.permutation,
                                    n_perms=options.n_perms,
                                    random_seed=options.random_seed,
                                    covariates_file=options.covariate_file,
                                    covariates=options.covar_col)

    elif options.method == "estimate_haplotypes":
        gwas_object._run_tasks(estimate_haplotypes="haplotype")

    elif options.method == "lmm":
        gwas_object.mixed_model(lmm_method=options.lmm_method,
                                grm=options.grm_prefix,
                                qcovar=options.covariate_file,
                                dcovar=options.covariate_discrete)

    elif options.method == "epistasis":
        gwas_object._detect_interactions(
            method=options.epi_method,
            modifier=options.epi_param,
            set_file=options.set_file,
            set_mode=options.set_method,
            report_threshold=options.epi_report,
            sig_threshold=options.epi_sig,
            covariates_file=options.covariate_file,
            covariates=options.covar_col)

    elif options.method == "reml":
        gwas_object.reml_analysis(method=options.reml_method,
                                  parameters=options.reml_param,
                                  prevalence=options.prevalence,
                                  qcovariates=options.covariate_file,
                                  discrete_covar=options.covariate_discrete)

    elif options.method == "format":
        if options.format_method == "change_format":
            # adding filtering options to plink requires the --make-bed flag
            if options.sample_update:
                E.info("updating samples from %s" % options.format_param)
            gwas_object._run_tasks(change_format=options.reformat,
                                   parameter=options.format_param)
            if options.sample_update:
                gwas_object._run_tasks(
                    update_samples=options.sample_update,
                    parameter=options.format_param)

        elif options.format_method == "change_missing_values":
            gwas_object._run_tasks(
                change_missing_values=options.apply_missing,
                parameter=options.format_param)

        elif options.format_method == "update_variants":
            gwas_object._run_tasks(update_variants=options.variant_update,
                                   parameter=options.format_param)
            gwas_object._run_tasks(change_format=options.file_format)

        elif options.format_method == "update_samples":
            gwas_object._run_tasks(update_samples=options.sample_update,
                                   parameter=options.format_param)

        elif options.format_method == "flip_strands":
            flip_target = "subset" if options.flip_subset else "all_samples"
            gwas_object._run_tasks(flip_strands=flip_target,
                                   parameter=options.format_param)

        elif options.format_method == "flip_scan":
            gwas_object._run_tasks(flip_scan=options.scan_param,
                                   parameter=options.format_param)

        elif options.format_method == "sort":
            gwas_object._run_tasks(sort=options.sort_type,
                                   parameter=options.format_param)

        elif options.format_method == "merge":
            if options.merge_mode:
                gwas_object._run_tasks(merge_mode=options.merge_mode,
                                       parameter=options.format_param)
            else:
                gwas_object._run_tasks(merge=options.merge_format,
                                       parameter=options.format_param)

        elif options.format_method == "find_duplicates":
            gwas_object._run_tasks(find_duplicates=options.dup_method,
                                   parameter=options.format_param)

    elif options.method == "matrix":
        if options.matrix_form == "distance":
            if options.matrix_metric == "hamming":
                gwas_object.hamming_matrix(
                    shape=options.matrix_shape,
                    compression=options.matrix_compress,
                    options=options.matrix_options)
            elif options.matrix_metric == "ibs":
                gwas_object.ibs_matrix(
                    shape=options.matrix_shape,
                    compression=options.matrix_compress,
                    options=options.matrix_options)
            elif options.matrix_metric == "genomic":
                gwas_object.genome_matrix(
                    shape=options.matrix_shape,
                    compression=options.matrix_compress,
                    options=options.matrix_options)
        elif options.matrix_form == "grm":
            gwas_object.genetic_relationship_matrix(
                shape=options.matrix_shape,
                compression=options.matrix_compress,
                metric=options.matrix_metric,
                options=options.matrix_options)

    gwas_object.build_statement(infiles=geno_files,
                                outfile=options.out_pattern,
                                threads=options.threads,
                                memory=options.memory,
                                parallel=options.parallel)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Script entry point: report risk-allele carriage against phenotype.

    Parses command line options in ``sys.argv`` unless *argv* is given.
    Reads a GWAS results table (``--gwas-file``), a ped file and a map
    file, optionally flips genotypes to risk alleles, counts risk alleles
    per individual with ``gwas.countRiskAlleles`` and plots the result with
    ``gwas.plotRiskFrequency``.  The frequency table is written
    tab-separated to ``options.stdout``.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; defaults to ``sys.argv``.

    Raises
    ------
    ValueError
        If ``--plot-statistic`` is not supplied.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--method", dest="method", type="choice",
                      choices=["cases_explained", "probability_phenotype"],
                      help="Which results to report, either the proportion "
                      "of cases explained or the probability of the "
                      "phenotype given the number of alleles carried")

    parser.add_option("--map-file", dest="map_file", type="string",
                      help="plink .map file with SNP positions")

    parser.add_option("--ped-file", dest="ped_file", type="string",
                      help="plink ped file with phenotype and "
                      "genotype data - A2 major allele coded")

    parser.add_option("--gwas-file", dest="gwas", type="string",
                      help="gwas results file, assumes Plink "
                      "output format. Must contain SNP, BP, "
                      "OR column headers. Assumes results relate "
                      "to the A1 allele")

    parser.add_option("--flip-alleles", dest="flip", action="store_true",
                      help="force alleles to flip if OR < 1")

    parser.add_option("--plot-statistic", dest="plot_stat", type="choice",
                      choices=["frequency", "cumulative"],
                      help="plot either cases frequency or cumulative "
                      "frequency of cases")

    parser.add_option("--plot-path", dest="plot_path", type="string",
                      help="save path for plot")

    parser.add_option("--flag-explained-recessive", dest="explained",
                      action="store_true",
                      help="flag individuals explained by carriage of "
                      "2 risk alleles - NOT IMPLIMENTED")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # the output table is built from the plotting call, so without a plot
    # statistic there is nothing to report; fail before doing any work
    # rather than with an unbound variable at the very end
    if options.plot_stat is None:
        raise ValueError("--plot-statistic is required, choose one of: "
                         "frequency, cumulative")

    # required files are .ped file, .map file and gwas results file
    E.info("reading GWAS results file: %s" % options.gwas)
    snp_df = pd.read_table(options.gwas, sep="\t", header=0,
                           index_col=None)
    snp_list = snp_df["SNP"].values

    # parse ped file
    E.info("Reading ped file: %s" % options.ped_file)
    ped_df = gwas.parsePed(options.ped_file, compound_geno=True)

    # parse map file and get SNP indices that correspond to
    # ped file genotypes
    E.info("Fetching SNPs from map file: %s" % options.map_file)
    snp_index = gwas.getSNPs(options.map_file, snp_list)
    E.info("SNPs found: %i" % len(snp_index))

    # extract SNPs and ORs as key, value pairs
    or_dict = snp_df.loc[:, ["SNP", "OR"]].to_dict(orient='list')
    snp_or = dict(zip(or_dict["SNP"], or_dict["OR"]))

    if options.flip:
        E.info("Flipping major alleles to risk alleles")
        flipped_genos = gwas.flipRiskAlleles(snp_index=snp_index,
                                             snp_results=snp_or,
                                             genos=ped_df["GENOS"].tolist())
        # merge flipped genotypes with pedigree frame to get phenotypes
        geno_df = pd.DataFrame(flipped_genos, index=ped_df["FID"])
    else:
        # split genos into a dataframe
        genos = np.array(ped_df["GENOS"].tolist())
        geno_df = pd.DataFrame(genos, index=ped_df["FID"])

    merged = pd.merge(geno_df, ped_df, left_index=True, right_on="FID")

    # need to discount missing genotypes > 1%
    # frequencies of number of risk alleles by trait frequency
    E.info("count #risk alleles per individual")
    risk_results = gwas.countRiskAlleles(ped_frame=merged,
                                         snp_index=snp_index.values(),
                                         report=options.method,
                                         flag=options.explained)

    risk_freqs = risk_results["freqs"]
    cumulative = risk_results["cumulative"]

    # select results upto and including cumulative freq = 1.0.  A running
    # float sum rarely lands on exactly 1.0, so compare with a small
    # tolerance; if the threshold is never reached keep every bin instead
    # of failing with an IndexError.
    tolerance = 1e-9
    reached = [ix for ix, freq in enumerate(cumulative)
               if freq >= 1.0 - tolerance]
    max_indx = reached[0] if reached else len(cumulative) - 1
    upto = max_indx + 1

    max_freqs = risk_freqs[:upto]
    max_cum = cumulative[:upto]
    bins = list(range(upto))

    # plot!
    # need to add number of individuals into each bin as point size
    if options.plot_stat == "frequency":
        E.info("Generating plot of #risk alleles vs. P(Phenotype)")
        hist_df = gwas.plotRiskFrequency(
            bins=bins,
            frequencies=max_freqs,
            counts=risk_results["cases"][:upto],
            savepath=options.plot_path,
            ytitle="P(Phenotype)")

    elif options.plot_stat == "cumulative":
        E.info("Generating plot of #risk alleles vs. cumulative frequency")
        hist_df = gwas.plotRiskFrequency(
            bins=bins,
            frequencies=max_cum,
            counts=risk_results["cases"][:upto],
            savepath=options.plot_path,
            ytitle="Cumulative frequency cases")

    # annotate the table returned by the plotting call with the raw counts
    hist_df["freq"] = risk_results["freqs"][:upto]
    hist_df["cumulative"] = risk_results["cumulative"][:upto]
    hist_df["cases"] = risk_results["cases"][:upto]
    hist_df["controls"] = risk_results["controls"][:upto]
    hist_df["total"] = hist_df["cases"] + hist_df["controls"]

    hist_df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Score and prioritise SNPs from GWAS results.

    Parses command line options in sys.argv, unless *argv* is given.
    The last element of *argv* is used as the input file (or a
    comma-separated list of files for ``summarise``). The method selected
    with ``--score-method`` is applied and the resulting table is written
    to ``options.stdout`` as tab-separated text.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; defaults to ``sys.argv``.

    Raises
    ------
    ValueError
        If ``--score-method`` is missing or names a method that has no
        implementation in this script.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--score-method", dest="method", type="choice",
                      choices=["PICS", "LDscore", "ABF", "R2_rank",
                               "get_eigen", "calc_prior", "credible_set",
                               "summarise"],
                      help="SNP scoring/prioritisation method to apply.")

    parser.add_option("--database", dest="database", type="string",
                      help="SQL database containing LD information "
                      "in table format. Expects columns SNP_A, "
                      "SNP_B, R2, BP_A and BP_B (Plink --r2 output)")

    parser.add_option("--ld-directory", dest="ld_dir", type="string",
                      help="directory containing tabix-index BGZIP "
                      "LD files. Assumes Plink used to calculate LD")

    parser.add_option("--table-name", dest="table", type="string",
                      help="name of the SQL table containing the LD"
                      "values")

    parser.add_option("--chromosome", dest="chromosome", type="string",
                      help="chromosome to subset the association results "
                      "file on")

    parser.add_option("--ld-threshold", dest="ld_threshold", type="float",
                      help="the threshold of LD above which variants will "
                      "be taken forward.")

    parser.add_option("--rank-threshold", dest="rank_threshold",
                      type="float",
                      help="the threshold in terms of the top n% SNPs to "
                      "output based on the ranking metric. e.g. "
                      "--rank-threshold=0.01 is the top 1% SNPs")

    parser.add_option("--credible-interval", dest="interval", type="float",
                      help="The credible set interval size to generate the "
                      "credible set of SNPs")

    parser.add_option("--prior-variance", dest="prior_var", type="float",
                      help="the prior variance used to weight the SNP "
                      "variance")

    parser.add_option("--fine-map-window", dest="map_window", type="int",
                      help="the region size to included around the index "
                      "SNP as the fine-mapping region.")

    parser.add_option("--eigen-score-directory", dest="eigen_dir",
                      type="string",
                      help="PATH to directory containing tabix indexed "
                      "eigen score files")

    parser.add_option("--flat-prior", dest="flat_prior",
                      action="store_true",
                      help="Ignore functional annotation information and "
                      "use an uninformative prior on each SNP")

    parser.add_option("--snp-set", dest="snp_set", type="string",
                      help="Pre-defined SNP set as a list of SNP IDs."
                      "If used to calculate priors contains column of scores.")

    parser.add_option("--distribution", dest="dist", type="choice",
                      choices=["normal", "t", "gamma", "lognormal",
                               "exponential"],
                      help="distribution from which to draw prior "
                      "probabilities")

    parser.add_option("--distribution-parameters", dest="dist_params",
                      type="string",
                      help="distribution parameters as a comma-separated list")

    parser.add_option("--lead-snp-id", dest="lead_snp", type="int",
                      help="0-based item number in filename")

    parser.add_option("--filename-separator", dest="separator",
                      type="string",
                      help="filename separator to extract information")

    parser.add_option("--snp-column", dest="snp_col", type="int",
                      help="0-based index of SNP ID column number")

    parser.add_option("--probability-column", dest="prob_col", type="int",
                      help="0-based index of posterior probabilities column"
                      " number")

    parser.set_defaults(ld_dir=None,
                        dist="normal",
                        dist_params=None,
                        snp_set=None,
                        prior_var=0.04,
                        interval=0.99,
                        eigen_dir=None,
                        map_window=100000,
                        ld_threshold=0.5,
                        database=None,
                        table=None,
                        flat_prior=False,
                        lead_snp=2,
                        separator="_",
                        snp_col=0,
                        prob_col=1)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    # NOTE: `clean` is only assigned for single-file input; with a
    # comma-separated file list only the "summarise" branch below avoids
    # referencing it.
    if len(infile.split(",")) > 1:
        pass
    else:
        # peek at the first rows to see whether the results contain
        # tests other than the additive model ("ADD").
        # r"\s+" rather than "\s*": a separator pattern that can match
        # the empty string splits between every character on
        # Python >= 3.7.
        peek = pd.read_table(infile, nrows=5, sep=r"\s+", header=0)
        try:
            # the file is only "clean" when no peeked row is a non-ADD
            # test (the previous len() check was true for any non-empty
            # table)
            clean = not (peek["TEST"] != "ADD").any()
        except KeyError:
            # no TEST column at all
            clean = True

    if options.method == "LDscore":
        snpscores = gwas.snpPriorityScore(gwas_results=infile,
                                          database=options.database,
                                          table_name=options.table,
                                          chromosome=options.chromosome,
                                          ld_dir=options.ld_dir,
                                          clean=clean)
        # take top 1%, all SNPs doesn't achieve anything useful
        ranks = int(len(snpscores.index) * 0.01)
        snpscores = snpscores.iloc[:ranks]

    elif options.method == "PICS":
        # maps SNP ID -> score (or flat prior)
        snp_list = {}
        if options.snp_set and not options.flat_prior:
            with IOTools.openFile(options.snp_set, "r") as sfile:
                for line in sfile:
                    fields = line.split("\t")
                    snp = fields[0]
                    try:
                        score = float(fields[-1].rstrip("\n"))
                    except ValueError:
                        # non-numeric score column
                        score = 0
                    snp_list[snp] = float(score)

            # get the parameter estimates for the distribution
            # if they have not been provided
            if not options.dist_params:
                dist_params = gwas.estimateDistributionParameters(
                    data=snp_list.values(),
                    distribution=options.dist)
            else:
                dist_params = tuple(
                    [float(fx) for fx in options.dist_params.split(",")])

            E.info("Calculating priors on SNPs")
            priors = gwas.calcPriorsOnSnps(snp_list=snp_list,
                                           distribution=options.dist,
                                           params=dist_params)

        elif options.snp_set and options.flat_prior:
            with IOTools.openFile(options.snp_set, "r") as sfile:
                for line in sfile:
                    snp = line.split("\t")[0]
                    snp_list[snp] = 1.0

            priors = snp_list

        else:
            # allow for no priors or scores to be set,
            # use of priors will be ignored,
            # i.e. when prior and likelihood are not from
            # conjugate distributions
            priors = None

        # PICS scores expects the gwas results file to
        # only contain the region of interest, which
        # represents an independent association signal
        # if a SNP has not been genotyped,
        # but it is in strong LD, it will cause problems
        # downstream <- only allow SNPs that
        # are present in the analysis
        snpscores = gwas.PICSscore(gwas_results=infile,
                                   database=options.database,
                                   table_name=options.table,
                                   chromosome=options.chromosome,
                                   priors=priors,
                                   clean=clean,
                                   ld_dir=options.ld_dir,
                                   ld_threshold=options.ld_threshold)

        snpscores.columns = ["SNP", "PICS"]
        posterior_sum = 0
        snpscores.sort_values(ascending=False, inplace=True)
        post_snps = []
        # keep SNPs, in descending score order, until the accumulated
        # score reaches 99.0
        for snp in snpscores.index:
            if posterior_sum < 99.0:
                posterior_sum += snpscores.loc[snp]
                post_snps.append(snp)
            else:
                break

        snpscores = snpscores.loc[post_snps]
        snpscores.drop_duplicates(inplace=True)

    elif options.method == "R2_rank":
        # rank SNPs based on their LD with the lead
        # SNP, take the top n% SNPs
        snpscores = gwas.LdRank(gwas_results=infile,
                                database=options.database,
                                table_name=options.table,
                                ld_dir=options.ld_dir,
                                chromosome=options.chromosome,
                                ld_threshold=options.ld_threshold,
                                top_snps=options.rank_threshold,
                                clean=clean)

    elif options.method == "ABF":
        snpscores = gwas.ABFScore(gwas_results=infile,
                                  region_size=options.map_window,
                                  chromosome=options.chromosome,
                                  prior_variance=options.prior_var,
                                  clean=clean)

    elif options.method == "get_eigen":
        E.info("Fetching Eigen scores")
        snpscores = gwas.getEigenScores(eigen_dir=options.eigen_dir,
                                        bim_file=infile,
                                        snp_file=options.snp_set)
        snpscores = pd.DataFrame(snpscores).T

    elif options.method == "credible_set":
        E.info("Creating credible set")
        snpscores = gwas.makeCredibleSet(probs_file=infile,
                                         credible_set=options.interval,
                                         lead_snp_indx=options.lead_snp,
                                         filename_sep=options.separator,
                                         snp_column=options.snp_col,
                                         probs_column=options.prob_col)

    elif options.method == "summarise":
        E.info("Collating SNP prioritisation resuslts")
        file_list = infile.split(",")
        snpscores = gwas.summariseResults(file_list=file_list)

    else:
        # e.g. "calc_prior" is an accepted choice but has no
        # implementation here; fail clearly instead of with a NameError
        # on `snpscores` below
        raise ValueError("unsupported --score-method: %s" % options.method)

    snpscores.to_csv(options.stdout, index_label="SNP", sep="\t")

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Run a variant-level task on Plink ped/map (or bim) files.

    Parses command line options in sys.argv, unless *argv* is given.
    ``--task`` selects one of:

    * ``mafs`` - allele counts per variant, written to ``options.stdout``
    * ``penetrance`` - penetrance table to ``options.stdout`` plus
      ``penetrance_summary.txt`` in the current working directory
    * ``allele_diff`` - allele frequency differences between two groups
    * ``detect_duplicates`` - writes ``.triallelic``, ``.duplicates`` and
      ``.overlapping`` lists next to ``--outfile-pattern``; the bim file
      is taken from the last element of *argv*

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; defaults to ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--task", dest="task", type="choice",
                      choices=["mafs", "penetrance",
                               "detect_duplicates", "allele_diff"],
                      help="task to perform")

    parser.add_option("--ped-file", dest="ped_file", type="string",
                      help="plink format .ped file")

    parser.add_option("--map-file", dest="map_file", type="string",
                      help="plink format .map file")

    parser.add_option("--freq-file", dest="mafs", type="string",
                      help="text file containing populations minor "
                      "allele frequencies of variants. One row per "
                      "variant with ID MAF")

    parser.add_option("--groups-file", dest="group_file", type="string",
                      help="file containing group labels for individuals "
                      "in the provided ped file")

    parser.add_option("--ref-label", dest="ref_label", type="string",
                      help="group label to be used as the reference case")

    parser.add_option("--test-label", dest="test_label", type="string",
                      help="group label to be used as the test case")

    parser.add_option("--subset", dest="subset", type="choice",
                      choices=["cases", "gender"],
                      help="subset the "
                      "data by either case/control or gender")

    parser.add_option("--take-last", dest="take", action="store_true",
                      help="if use duplicates will take the last variant, "
                      "default behaviour is to take the first")

    parser.add_option("--outfile-pattern", dest="out_pattern",
                      type="string",
                      help="outfile pattern to use for finding duplicates "
                      "and triallelic variants")

    parser.add_option("--snp-set", dest="snp_subset", type="string",
                      help="list of SNPs to include")

    # Defaults only take effect when registered before parsing, and the
    # key must match the option's dest ("take", not "take_last").
    parser.set_defaults(mafs=None,
                        subset=None,
                        take=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.task == "mafs":
        mafs = gwas.countByVariantAllele(options.ped_file,
                                         options.map_file)

        # `index_col` is a read_csv argument, not a to_csv one; it was
        # dropped so the call works on current pandas (index is written)
        mafs.to_csv(options.stdout, sep="\t")

    elif options.task == "penetrance":
        summary, pens = gwas.calcPenetrance(options.ped_file,
                                            options.map_file,
                                            subset=options.subset,
                                            mafs=options.mafs,
                                            snpset=options.snp_subset)

        pens.to_csv(options.stdout, sep="\t", index_label="SNP")
        summary.to_csv("/".join([os.getcwd(), "penetrance_summary.txt"]),
                       sep="\t", index_label="SNP")

    elif options.task == "allele_diff":
        allele_diffs = gwas.calcMaxAlleleFreqDiff(
            ped_file=options.ped_file,
            map_file=options.map_file,
            group_file=options.group_file,
            test=options.test_label,
            ref=options.ref_label)

        allele_diffs.to_csv(options.stdout, sep="\t")

    elif options.task == "detect_duplicates":
        # find variants with duplicated position and shared reference
        # allele indicative of triallelic variants - also same ID
        # ouput to a filter list
        infile = argv[-1]
        dups, tris, oves = gwas.findDuplicateVariants(
            bim_file=infile,
            take_last=options.take)

        # absolute patterns are used verbatim, relative ones are
        # resolved against the current working directory
        if os.path.isabs(options.out_pattern):
            outpattern = options.out_pattern
        else:
            outpattern = os.path.abspath(options.out_pattern)

        # one variant per line, one file per category
        for suffix, variants in ((".triallelic", tris),
                                 (".duplicates", dups),
                                 (".overlapping", oves)):
            with open(outpattern + suffix, "w") as ofile:
                for var in variants:
                    ofile.write("%s\n" % var)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Entry point for the sample-level QC tasks.

    Reads options from *argv* (``sys.argv`` when *argv* is None) and runs
    the ``gwas`` routine selected by ``--task``. Every task except
    ``flag_relations`` writes its result table to ``options.stdout`` as
    tab-separated text without an index column.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(usage=globals()["__doc__"],
                            version="%prog version: $Id$")

    # (flags, keyword arguments) for every script-specific option,
    # registered in this order
    option_table = (
        (("-t", "--test"),
         dict(dest="test", type="string", help="supply help")),
        (("--task",),
         dict(dest="task", type="choice",
              choices=["merge_exclusions", "flag_hets", "find_inbreds",
                       "flag_relations", "discordant_gender"],
              help="task to execute on phenotype file(s)")),
        (("--gender-check-file",),
         dict(dest="gender_check", type="string",
              help="output from gender checking "
              "by Plink, suffix should be .sexcheck")),
        (("--relationship-file",),
         dict(dest="relations", type="string",
              help="output file from IBS "
              "calculation. Should contain all pairwise "
              "relationships.")),
        (("--inbreeding-coef-file",),
         dict(dest="inbreed_file", type="string",
              help="file containing either Plink "
              "or GCTA estimates of F, inbreeding coefficient")),
        (("--inbreeding-coefficient",),
         dict(dest="inbred_coeff", type="choice",
              choices=["Fhat1", "Fhat2", "Fhat3", "F", "ibc"],
              help="inbreeding coefficient "
              "to use to identify highly inbred individuals")),
        (("--inbred-cutoff",),
         dict(dest="inbred_cutoff", type="float",
              help="threshold above which individuals are classed "
              "as inbred.")),
        (("--ibs-cutoff",),
         dict(dest="ibs_cutoff", type="float",
              help="IBS threshold to flag individuals as being "
              "closely related")),
        (("--trimmed-relationships",),
         dict(dest="rel_cutoff", type="string",
              help="output file from Plink "
              "--rel-cutoff with trimmed data set of unrelated "
              "individuals.")),
        (("--heterozygotes-file",),
         dict(dest="hets_file", type="string",
              help="file from heterozygote analysis containing observed "
              "homozygosity and F coefficients")),
        (("--auxillary-file",),
         dict(dest="aux_file", type="string",
              help="a file of IIDs and FIDs for individuals that are "
              "to be removed from analysis, unrelated to QC")),
        (("--plotting-path",),
         dict(dest="plot_path", type="string",
              help="PATH to save any plots to")),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    # add common options (-h/--help, ...) and parse command line
    options, args = E.Start(parser, argv=argv)

    task = options.task
    out = options.stdout

    if task == "flag_hets":
        # Original author's note: heterozygosity rate is
        # (nonmissing - homs) / nonmissing, and individuals more than
        # 3 s.d. from the mean are flagged (behaviour lives in
        # gwas.flagExcessHets).
        het_flags = gwas.flagExcessHets(options.hets_file,
                                        plot=True,
                                        plot_path=options.plot_path)
        het_flags.to_csv(out, index=None, sep="\t")

    elif task == "merge_exclusions":
        merged = gwas.mergeQcExclusions(hets_file=options.hets_file,
                                        inbred_file=options.inbreed_file,
                                        related_file=options.relations,
                                        gender_file=options.gender_check,
                                        mask_file=options.aux_file)
        merged.to_csv(out, index=None, sep="\t")

    elif task == "find_inbreds":
        inbred_flags = gwas.flagInbred(
            inbred_file=options.inbreed_file,
            inbreeding_coefficient=options.inbred_coeff,
            ibc_threshold=options.inbred_cutoff,
            plot=True,
            plot_path=options.plot_path)
        inbred_flags.to_csv(out, sep="\t", index=None)

    elif task == "flag_relations":
        # The relationship file is expected to be very large, hence the
        # chunked read. The return value is not used or written here.
        gwas.flagRelated(ibd_file=options.relations,
                         chunk_size=500000,
                         threshold=options.ibs_cutoff,
                         plot=True,
                         plotting_path=options.plot_path)

    elif task == "discordant_gender":
        discordant = gwas.flagGender(gender_file=options.gender_check,
                                     plot=True,
                                     plot_path=options.plot_path)
        discordant.to_csv(out, index=None, sep="\t")

    # any other task value: nothing to do

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Build a genome-wide QC/summary command for Plink or GCTA.

    Parses command line options in sys.argv, unless *argv* is given.
    Input files come from ``--input-file-pattern`` or, when that is not
    set, from the comma-separated last element of *argv*. They are
    wrapped in a ``gwas.FileGroup`` and handed to the program object
    selected with ``--program``. Every truthy ``filt_*`` option is
    applied as a filter, the task selected by ``--method`` is configured
    and finally ``build_statement`` is called on the program object.

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; defaults to ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # summary statistics understood by --summary-method; each name is
    # also the keyword passed to `_output_statistics`
    summary_methods = ["allele_frequency", "missing_data",
                       "hardy_weinberg", "mendel_errors",
                       "inbreeding", "inbreeding_coef",
                       "gender_checker", "wrights_fst"]

    # --program value -> name of the class in the gwas module
    program_classes = {"plink2": "Plink2",
                       "plinkdev": "PlinkDev",
                       "gcta": "GCTA"}

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--program", dest="program", type="choice",
                      choices=["plink2", "gcta", "plinkdev"],
                      help="program to execute genome-wide analysis")

    parser.add_option("--input-file-pattern", dest="infile_pattern",
                      type="string",
                      help="file prefix that identifies a group of files")

    parser.add_option("--input-file-format", dest="file_format",
                      type="choice",
                      choices=["plink", "plink_binary", "oxford",
                               "oxford_binary", "vcf", "GRM_binary",
                               "GRM_gz"],
                      help="format of input files")

    parser.add_option("--phenotypes-file", dest="pheno_file",
                      type="string",
                      help="text file of additional phenotypes")

    parser.add_option("--pheno", dest="pheno", type="string",
                      help="either phenotype file column header or number")

    parser.add_option("--covariates-file", dest="covariate_file",
                      type="string",
                      help="file containing covariates")

    parser.add_option("--covariate-column", dest="covar_col",
                      type="string",
                      help="column number(s) or header(s) to include in "
                      "association model")

    parser.add_option("--method", dest="method", type="choice",
                      choices=["ld_prune", "summary", "flag_hets",
                               "remove_relations", "check_gender", "IBD"],
                      help="method to apply to genome-wide data")

    parser.add_option("--IBD-parameter", dest="ibd_param", type="choice",
                      choices=["norm", "relatives", "full"],
                      help="param "
                      "to pass to IBD calculations")

    parser.add_option("--principal-components", dest="num_pcs", type="int",
                      help="the number of principal components to output")

    parser.add_option("--matrix-shape", dest="matrix_shape", type="choice",
                      choices=["triangle", "square", "square0"],
                      help="output matrix shape.", default="triangle")

    parser.add_option("--matrix-compression", dest="matrix_compress",
                      type="choice", choices=["gz", "bin", "bin4"],
                      help="compression to apply to output matrix file",
                      default="gz")

    parser.add_option("--matrix-form", dest="matrix_form", type="choice",
                      choices=["distance", "grm"],
                      help="type of relationship matrix to calculate")

    parser.add_option("--matrix-metric", dest="matrix_metric",
                      type="choice",
                      choices=["fhat", "cov", "ibc2", "ibc3", "ibs",
                               "genomic", "hamming"],
                      help="value to calculate for diagonal elements of the "
                      "grm. Default is fhat for grm and hamming for distance.")

    parser.add_option("--matrix-options", dest="matrix_options",
                      type="string",
                      help="modifiers of matrix output, see plink documentation "
                      "for details")

    parser.add_option("--strand-flip-subset", dest="flip_subset",
                      action="store_true",
                      help="apply strand flipping to a subset of samples")

    parser.add_option("--flip-scan-type", dest="scan_param", type="choice",
                      choices=["default", "window", "threshold"],
                      help="strand flipping scan to apply to SNPs")

    parser.add_option("--sort-type", dest="sort_type", type="choice",
                      choices=["none", "natural", "ascii", "file"],
                      help="sort type to input files")

    parser.add_option("--merge-file-format", dest="merge_format",
                      type="choice", choices=["plink", "binary_plink"],
                      help="format of input files to be merged")

    parser.add_option("--merge-mode", dest="merge_mode", type="choice",
                      choices=["default", "original_missing",
                               "new_nonmissing", "no_overwrite", "force",
                               "report_all", "report_nonmissing"],
                      help="merge mode to apply to dealing with merge conflicts")

    parser.add_option("--duplicates-method", dest="dup_method",
                      type="choice",
                      choices=["same_ref", "id_match", "suppress_first"],
                      help="method for identifying and dealing with duplicate "
                      "variants")

    parser.add_option("--summary-method", dest="summary_method",
                      type="choice", choices=summary_methods,
                      help="summary statistics to calculate")

    parser.add_option("--summary-parameter", dest="sum_param",
                      type="string",
                      help="optional parameters that can be passed to summary "
                      "statistics methods")

    parser.add_option("--genotype-rate", dest="filt_genotype_rate",
                      type="string",
                      help="genotyping rate threshold. SNPs below this threshold "
                      "will be excluded from analysis")

    parser.add_option("--indiv-missing", dest="filt_missingness",
                      type="string",
                      help="individual missingness rate. Individuals below "
                      "this threshold will be excluded from analysis")

    parser.add_option("--hardy-weinberg", dest="filt_hwe", type="string",
                      help="hardy-weinberg p-value threshold for SNPs. SNPs "
                      "with a 2df chisquared p-value below this will be "
                      "filtered out")

    parser.add_option("--min-allele-frequency",
                      dest="filt_min_allele_frequency", type="string",
                      help="only include SNPs with an allele frequency equal to "
                      "or above this threshold")

    parser.add_option("--max-allele-frequency",
                      dest="filt_max_allele_frequency", type="string",
                      help="only include SNPs with an allele frequency equal to "
                      "or below this threshold")

    parser.add_option("--mendelian-error", dest="filt_mendelian_error",
                      type="string",
                      help="exclude individuals/trios with mendelian errors that "
                      "exceed this value")

    # NOTE(review): dest is spelt "qaul" while the max counterpart uses
    # "qual"; the filter type passed on is therefore "min_qaul_score" -
    # confirm against gwas.apply_filters before renaming.
    parser.add_option("--min-quality-score", dest="filt_min_qaul_score",
                      type="string",
                      help="reset the minimum low bound of quality scores for "
                      "variants in a VCF file. Default is 0")

    parser.add_option("--max-quality-score", dest="filt_max_qual_score",
                      type="string",
                      help="reset the maximum upper bound of quality scores for "
                      "a VCCF file. Default is Inf")

    parser.add_option("--allow-no-gender", dest="filt_allow_no_sex",
                      type="string",
                      help="allow individuals with gender missing")

    parser.add_option("--enforce-gender", dest="filt_enforce_sex",
                      type="string",
                      help="only include individuals with non-missing gender "
                      "information")

    parser.add_option("--keep-individuals", dest="filt_keep",
                      type="string",
                      help="a file containing individuals IDs to keep, "
                      "one per row")

    parser.add_option("--remove-individuals", dest="filt_remove",
                      type="string",
                      help="a file of individual IDs to remove, one per row")

    parser.add_option("--subset-filter", dest="filt_subset_filter",
                      type="choice",
                      choices=["cases", "controls", "males", "females",
                               "founders", "nonfounders"],
                      help="only apply filters to the specific subset of "
                      "individuals supplied")

    parser.add_option("--extract-snps", dest="filt_extract", type="string",
                      help="text file of variant IDs to include in the analysis, "
                      "ignoring all others")

    parser.add_option("--exclude-snps", dest="filt_exclude", type="string",
                      help="a file of variant IDs to exclude from analysis")

    parser.add_option("--restrict-chromosome", dest="filt_chromosome",
                      type="string",
                      help="restict analysis to either a single chromosome, "
                      "or a comma-separated list of chromosomes")

    parser.add_option("--exclude-chromosomes",
                      dest="filt_exclude_chromosome", type="string",
                      help="exclude all variants on these "
                      "chromosome(s)")

    parser.add_option("--autosome-only", dest="filt_autosome",
                      action="store_true",
                      help="if present only autosomal variants will be analysed")

    parser.add_option("--pseudo-autosome", dest="filt_pseudo_autosome",
                      action="store_true",
                      help="include on the pseudo-autosomal region of chromosome X")

    parser.add_option("--ignore-indels", dest="filt_ignore_indels",
                      action="store_true",
                      help="only include bi-allelic single nucleotide "
                      "variants in analysis")

    parser.add_option("--snp-range", dest="filt_snp_bp_range",
                      type="string",
                      help="comma separated list of from, to genome co-ordinates "
                      "within which to include variants for analysis")

    parser.add_option("--snp-id-range", dest="filt_snp_id_range",
                      type="string",
                      help="comma separate list of IDs from, to within which "
                      "to include variants for analysis.")

    parser.add_option("--snp-id", dest="filt_specific_snp", type="string",
                      help="include a single snp in the analysis given by "
                      "it's variant ID.")

    parser.add_option("--exclude-variant", dest="filt_exclude_snp",
                      type="string",
                      help="exclude a single variant from the analysis, "
                      "given by it's variant ID")

    parser.add_option("--covariate-filter", dest="filt_covariate_filter",
                      type="string",
                      help="covariate column headers or column numbers on which "
                      "to filter on. Requries --covariate-file")

    parser.add_option("--filter-parameter", dest="param", type="string",
                      help="parameter values to be passed to filtering function")

    parser.add_option("--window-size", dest="window_size", type="string",
                      help="alters the behaviour of the --snp-range and "
                      "--include/exclude snp options. variants within +/- "
                      "half * window_size (kb) are included")

    parser.add_option("--range-resolution", dest="filt_range_resolution",
                      type="choice", choices=["bp", "kb", "mb"],
                      help="alters the (from, to) range resolution to either bp, "
                      "kb or mb")

    parser.add_option("--output-file-pattern", dest="out_pattern",
                      type="string",
                      help="output file pattern prefix. file suffixes are dependent "
                      "on the task executed")

    parser.add_option("--threads", dest="threads", type="int",
                      help="the number of threads to use for multi-threaded "
                      "processes")

    parser.add_option("--use-kb", dest="kb", action="store_true",
                      help="if present uses a kb sized window for LD pruning")

    parser.add_option("--prune-method", dest="prune_method", type="choice",
                      choices=["R2", "VIF"],
                      help="type of LD pruning to "
                      "perform, pair-wise LD or variance inflation factor")

    parser.add_option("--step-size", dest="step", type="string",
                      help="step size to advance window by")

    parser.add_option("--threshold", dest="threshold", type="string",
                      help="threshold on which to filter results")

    parser.add_option("--parallel", dest="parallel", type="int",
                      help="number of jobs to split task into")

    parser.add_option("--memory", dest="memory", type="string",
                      help="amount of memory to reserve for the task")

    # Defaults have to be registered before the command line is parsed;
    # set after parsing they never reach `options` (e.g. memory stayed
    # None instead of "60G").
    parser.set_defaults(sum_param=None,
                        dup_method="same_ref",
                        matrix_shape="triangle",
                        matrix_options=None,
                        matrix_compress="gz",
                        kb=False,
                        random_seed=random.randint(0, 19999),
                        memory="60G",
                        parallel=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if not options.infile_pattern:
        infiles = (argv[-1]).split(",")
    else:
        infiles = options.infile_pattern

    # create a new filegroup object
    geno_files = gwas.FileGroup(files=infiles,
                                file_format=options.file_format,
                                genotype_format="imputed")
    if options.pheno_file:
        geno_files.set_phenotype(pheno_file=options.pheno_file,
                                 pheno=options.pheno)

    # add FileGroup object to the gwas program object.
    # NOTE: without a recognised --program no `gwas_object` is created
    # and the calls further down fail with a NameError.
    if options.program in program_classes:
        program_class = getattr(gwas, program_classes[options.program])
        gwas_object = program_class(files=geno_files)
        gwas_object.program_call(infiles=geno_files,
                                 outfile=options.out_pattern)

    # collect filtering options: every option whose dest contains "filt"
    # and that carries a truthy value
    filter_dict = {key: value for key, value in vars(options).items()
                   if "filt" in key and value}

    # iteratively add all filters to GWASProgram object
    for fkey, filter_value in filter_dict.items():
        gwas_object.apply_filters(filter_type=fkey.replace("filt_", ""),
                                  filter_value=filter_value)

    # handle summary statistics
    if options.method == "ld_prune":
        # NOTE(review): --use-kb (options.kb) is parsed, but kb=True is
        # passed unconditionally here - confirm this is intended.
        gwas_object._qc_methods(ld_prune=options.prune_method,
                                kb=True,
                                window=options.window_size,
                                step=options.step,
                                threshold=options.threshold)

    elif options.method == "IBD":
        # --IBD-parameter is passed straight through:
        # norm, full or relatives
        gwas_object._qc_methods(ibd=options.ibd_param)

    elif options.method == "summary":
        # the statistic name doubles as the keyword argument
        if options.summary_method in summary_methods:
            gwas_object._output_statistics(
                **{options.summary_method: options.sum_param})

    elif options.method == "remove_relations":
        gwas_object._run_tasks(remove_relations="cutoff",
                               parameter=options.threshold)

    elif options.method == "check_gender":
        gwas_object._run_tasks(check_gender="")

    gwas_object.build_statement(infiles=geno_files,
                                outfile=options.out_pattern,
                                threads=options.threads,
                                memory=options.memory,
                                parallel=options.parallel)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """Extract hits or subsets from GWAS association results.

    Parses command line options in sys.argv, unless *argv* is given.
    The last element of *argv* is the association results file, or a
    comma-separated list of files. ``--task`` selects one of:

    * ``get_hits`` - write one ``.tsv`` per hit region to
      ``--output-directory``, named after the region's top SNP
    * ``extract_results`` - write the results for the SNPs listed in
      ``--snp-set`` to ``options.stdout``
    * ``merge_freq`` - merge results with the ``.frq`` files found in
      ``--frequency-directory`` and write them to ``options.stdout``

    Parameters
    ----------
    argv : list of str, optional
        Command line arguments; defaults to ``sys.argv``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--task", dest="task", type="choice",
                      choices=["get_hits", "extract_results",
                               "merge_freq"],
                      help="task to perform")

    parser.add_option("--p-threshold", dest="p_threshold", type="float",
                      help="threshold for association p-value, below "
                      "which results will be output")

    parser.add_option("--output-directory", dest="outdir", type="string",
                      help="output file directory")

    parser.add_option("--snp-set", dest="snpset", type="string",
                      help="file containing list of SNP per row to "
                      "extract from GWAS results")

    parser.add_option("--frequency-directory", dest="freq_dir",
                      type="string",
                      help="Directory containing plink .frq files corresponding"
                      " to all chromosomes")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # if the input is a list of files, split them
    infile = argv[-1]
    infiles = infile.split(",")
    if len(infiles) > 1:
        results = gwas.GWASResults(assoc_file=infiles)
    elif len(infiles) == 1:
        results = gwas.GWASResults(assoc_file=infile)
    else:
        # defensive only: str.split always returns at least one element
        raise IOError("no input files detected, please specifiy association "
                      "results files as the last command line argument")

    if options.task == "get_hits":
        hits = results.getHits(float(options.p_threshold))
        for name, region in hits:
            # Rank the region by CHISQ when present, otherwise by the
            # absolute test statistic taken from STAT or, failing that,
            # from T.
            try:
                top_reg = region.sort_values(by="CHISQ", ascending=False)
            except KeyError:
                # NOTE: top_reg is the same object as region, so the
                # absolute values stored in "STAT" also end up in the
                # file written below.
                top_reg = region
                try:
                    top_reg.loc[:, "STAT"] = abs(top_reg["STAT"])
                except KeyError:
                    top_reg.loc[:, "STAT"] = abs(top_reg["T"])
                # sort on the absolute statistic in both fallbacks; the
                # T fallback used to sort on the signed "T" column and
                # so missed strongly negative hits
                top_reg = top_reg.sort_values(by="STAT", ascending=False)

            top_bp = top_reg.iloc[0]["BP"]
            top_snp = top_reg.iloc[0]["SNP"]

            outname = "_".join(["chr%s" % str(name),
                                str(top_bp),
                                top_snp,
                                "significant"])

            outfile = outname + ".tsv"
            out_file = "/".join([options.outdir, outfile])
            E.info("output association results from Chr%s to %s" %
                   (str(name), out_file))
            # this keeps outputing the first column as unamed: 0,
            # need to remove this
            try:
                if region.columns[0] != "A1":
                    region.drop([region.columns[0]], inplace=True, axis=1)
            except Exception:
                # best effort only; never blocks writing the region, but
                # no longer swallows KeyboardInterrupt/SystemExit
                pass

            region.to_csv(out_file, sep="\t", index=None)

    elif options.task == "extract_results":
        with IOTools.openFile(options.snpset, "r") as sfile:
            snpset = sfile.readlines()
            snpset = [snp.rstrip("\n") for snp in snpset]

        snp_df = results.extractSNPs(snpset)
        snp_df.dropna(axis=0, how='all', inplace=True)
        snp_df.drop_duplicates(subset=["SNP"], inplace=True)

        snp_df.to_csv(options.stdout, sep="\t", index=None)

    elif options.task == "merge_freq":
        # sequentially merge GWAS result with frequency data
        # to make file for GCTA joint analysis
        # raw string: same pattern, no invalid "\S" escape in the literal
        regex = re.compile(r"(\S+).frq$")
        cojo_df = results.mergeFrequencyResults(options.freq_dir,
                                                file_regex=regex)
        cojo_df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.Stop()