def testPedSnpBoundaryTPed(self):
    """Load a TPED dataset limited to rs0001-rs0003 and verify the loci.

    With BoundaryCheck.chrom set to 1, exactly three loci are expected.
    Each one is compared against the matching row of the TPED file and
    against the fixture's heterozygote frequencies and genotypes.
    """
    pheno_covar = PhenoCovar()
    parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0001-rs0003"])
    BoundaryCheck.chrom = 1
    parser.load_tfam(pheno_covar)
    parser.load_genotypes()

    tped_rows = libgwas.get_lines(self.tped_filename, split=True)

    # First pass: locus metadata reported by get_loci()
    for row_idx, locus in enumerate(parser.get_loci()):
        row = tped_rows[row_idx]
        self.assertEqual(int(row[0]), locus.chr)
        self.assertEqual(int(row[3]), locus.pos)
        self.assertAlmostEqual(self.hetero_freq_tped[row_idx],
                               locus.hetero_freq,
                               places=4)
    self.assertEqual(3, parser.locus_count)

    # Second pass: iterate the parser itself and check the genotype payload
    visited = 0
    for locus in parser:
        row = tped_rows[visited]
        self.assertEqual(int(row[0]), locus.chr)
        self.assertEqual(int(row[3]), locus.pos)
        self.assertEqual(row[1], locus.rsid)
        self.assertEqual(self.genotypes[visited], list(locus.genotype_data))
        visited += 1
    self.assertEqual(3, visited)
def testBoundaryInitSpaceAsSnp(self):
    """A SNP list holding only an empty string must yield an invalid boundary."""
    # No boundaries are found in [''], so the check reports itself as invalid.
    # This exists to simplify command line parsing.
    BoundaryCheck.chrom = -1
    empty_boundary = SnpBoundaryCheck(snps=[''])
    self.assertEqual(False, empty_boundary.valid)
def testPedSnpBoundary2TPed(self):
    """Load a TPED dataset limited to rs0005-rs0006 and verify the loci.

    With BoundaryCheck.chrom set to 2, exactly two loci are expected.
    They are compared against rows 4 and 5 of the TPED file (and the
    fixture lists at the same indices), so the running index starts at 4
    and must end at 6.
    """
    pc = PhenoCovar()
    ped_parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0005-rs0006"])
    BoundaryCheck.chrom = 2
    ped_parser.load_tfam(pc)
    ped_parser.load_genotypes()

    # Read the raw TPED rows inside a context manager so the handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(self.tped_filename) as tped_file:
        pedigree = [line.split() for line in tped_file]

    first_row = 4
    for index, snp in enumerate(ped_parser.get_loci(), start=first_row):
        self.assertEqual(int(pedigree[index][0]), snp.chr)
        self.assertEqual(int(pedigree[index][3]), snp.pos)
        self.assertAlmostEqual(self.hetero_freq_tped[index],
                               snp.hetero_freq,
                               places=4)
    self.assertEqual(2, ped_parser.locus_count)

    index = first_row
    for snp in ped_parser:
        self.assertEqual(int(pedigree[index][0]), snp.chr)
        self.assertEqual(int(pedigree[index][3]), snp.pos)
        self.assertEqual(pedigree[index][1], snp.rsid)
        self.assertEqual(self.genotypes[index], list(snp.genotype_data))
        index += 1
    # Two loci consumed starting from row 4
    self.assertEqual(6, index)
def testPedRegionBoundaryWithExclusionsTPed(self):
    """Load TPED loci rs0005-rs0007 with rs0007 excluded and verify them.

    Loci are compared against the TPED rows starting at row 4; the final
    index of 6 shows that two loci were iterated. Loci whose genotype
    extraction raises TooMuchMissing or InvalidFrequency are counted but
    not compared.
    """
    pheno_covar = PhenoCovar()
    parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0005-rs0007"])
    DataParser.boundary.LoadExclusions(snps=["rs0007"])
    BoundaryCheck.chrom = 2
    parser.load_tfam(pheno_covar)
    parser.load_genotypes()

    tped_rows = get_lines(self.tped_filename, split=True)

    for row_idx, locus in enumerate(parser.get_loci(), start=4):
        self.assertEqual(int(tped_rows[row_idx][0]), locus.chr)
        self.assertEqual(int(tped_rows[row_idx][3]), locus.pos)

    row_idx = 4
    for locus in parser:
        # All-True mask: keep every entry of the locus
        keep_all = numpy.ones(locus.missing_genotypes.shape[0], dtype=bool)
        try:
            genodata = locus.get_genotype_data(keep_all)
            row = tped_rows[row_idx]
            self.assertEqual(int(row[0]), locus.chr)
            self.assertEqual(int(row[3]), locus.pos)
            self.assertEqual(row[1], locus.rsid)
            self.assertEqual(self.genotypes[row_idx], list(genodata.genotypes))
        except (TooMuchMissing, InvalidFrequency):
            pass
        row_idx += 1
    self.assertEqual(6, row_idx)
def testRegionBoundaryWithExclusions(self):
    """Load loci rs0005-rs0007 with rs0007 excluded and verify the remainder.

    Two loci are expected (locus_count == 2). They are compared against
    self.nonmissing_mapdata and self.genotypes starting at index 4, and the
    running index must end at 6.
    """
    pc = PhenoCovar()
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0005-rs0007"])
    DataParser.boundary.LoadExclusions(snps=["rs0007"])
    BoundaryCheck.chrom = 2
    parser = Parser(self.nonmissing, data_field='GT')
    parser.init_subjects(pc)
    parser.load_genotypes()
    # Fixture rows are compared starting at index 4
    index = 4
    self.assertEqual(2, parser.locus_count)
    for snp in parser:
        for y in pc:
            # All-True mask (ones == 1): keep every entry of the locus
            snp_filter = numpy.ones(snp.missing_genotypes.shape[0]) == 1
            try:
                genodata = snp.get_genotype_data(snp_filter)
                self.assertEqual(int(self.nonmissing_mapdata[index][0]), snp.chr)
                self.assertEqual(int(self.nonmissing_mapdata[index][1]), snp.pos)
                self.assertEqual(self.nonmissing_mapdata[index][2], snp.rsid)
                self.assertEqual(self.genotypes[index], list(genodata.genotypes))
            # Loci rejected by get_genotype_data are skipped, not failed
            except TooMuchMissing as e:
                pass
            except InvalidFrequency as e:
                pass
            # NOTE(review): nesting reconstructed from a flattened source; the
            # final assertion only holds here if `pc` yields exactly one item
            # per locus -- confirm this increment belongs to the inner loop.
            index += 1
    self.assertEqual(6, index)
def testPedRegionBoundaryWithExclusionsTPed(self):
    """Load loci rs0005-rs0007 with rs0007 excluded via bed_parser and verify.

    Two loci are expected (locus_count == 2). They are compared against
    self.nonmissing_mapdata and self.genotypes starting at index 4, and the
    running index must end at 6.
    """
    pc = PhenoCovar()
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0005-rs0007"])
    DataParser.boundary.LoadExclusions(snps=["rs0007"])
    BoundaryCheck.chrom = 2
    ped_parser = bed_parser.Parser(self.nonmissing_fam, self.nonmissing_bim, self.nonmissing_bed)
    ped_parser.load_fam(pc)
    ped_parser.load_bim(map3=False)
    ped_parser.load_genotypes()
    pedigree = self.nonmissing_mapdata
    # Fixture rows are compared starting at index 4
    index = 4
    self.assertEqual(2, ped_parser.locus_count)
    for snp in ped_parser:
        for y in pc:
            # Only the third element of the tuple (nonmissing) is used below
            (pheno, covars, nonmissing) = y.get_variables(snp.missing_genotypes)
            try:
                genodata = snp.get_genotype_data(nonmissing)
                self.assertEqual(int(pedigree[index][0]), snp.chr)
                self.assertEqual(int(pedigree[index][3]), snp.pos)
                self.assertEqual(pedigree[index][1], snp.rsid)
                self.assertEqual(self.genotypes[index], list(genodata.genotypes))
            # Loci rejected by get_genotype_data are skipped, not failed
            except TooMuchMissing as e:
                pass
            except InvalidFrequency as e:
                pass
            # NOTE(review): nesting reconstructed from a flattened source; the
            # final assertion only holds here if `pc` yields exactly one item
            # per locus -- confirm this increment belongs to the inner loop.
            index += 1
    self.assertEqual(6, index)
def testPedRegionBoundaryWithExclusionsTPed(self):
    """Load TPED loci rs0005-rs0007 with rs0007 excluded and verify them.

    The remaining loci are compared against the TPED rows starting at
    row 4; the final index of 6 shows that two loci were iterated.
    """
    pc = PhenoCovar()
    ped_parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0005-rs0007"])
    DataParser.boundary.LoadExclusions(snps=["rs0007"])
    BoundaryCheck.chrom = 2
    ped_parser.load_tfam(pc)
    ped_parser.load_genotypes()

    # Read the raw TPED rows inside a context manager so the handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(self.tped_filename) as tped_file:
        pedigree = [line.split() for line in tped_file]

    first_row = 4
    for index, snp in enumerate(ped_parser.get_loci(), start=first_row):
        self.assertEqual(int(pedigree[index][0]), snp.chr)
        self.assertEqual(int(pedigree[index][3]), snp.pos)

    index = first_row
    for snp in ped_parser:
        self.assertEqual(int(pedigree[index][0]), snp.chr)
        self.assertEqual(int(pedigree[index][3]), snp.pos)
        self.assertEqual(pedigree[index][1], snp.rsid)
        self.assertEqual(self.genotypes[index], list(snp.genotype_data))
        index += 1
    # Two loci consumed starting from row 4
    self.assertEqual(6, index)
def testInfoFileUseNoChrPos(self):
    """Parse MACH data with explicit info files and chrpos encoding disabled.

    Every locus must report "NA" for chr and pos, carry a "chrom:pos"
    rsid built from the fixture lists, and match the fixture dosages to
    three decimal places. Twenty loci are expected in total.
    """
    mach_parser.Parser.chrpos_encoding = False
    DataParser.boundary = SnpBoundaryCheck(self.locus_labels)
    # An invalid gen_ext guarantees the explicitly provided files are the
    # ones being used.
    mach_parser.Parser.gen_ext = 'asdf'
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()
    dosage_parser = mach_parser.Parser(
        [self.gen_file, self.gen_file2],
        info_files=[self.info_file1, self.info_file2])
    dosage_parser.load_family_details(pheno_covar)
    dosage_parser.load_genotypes()

    seen = 0
    for locus in dosage_parser:
        self.assertEqual("NA", locus.pos)
        self.assertEqual("NA", locus.chr)
        expected_rsid = "%s:%s" % (self.chroms[seen], self.positions[seen])
        self.assertEqual(expected_rsid, locus.rsid)
        for slot, expected_dosage in enumerate(self.dosage_encoding[seen]):
            self.assertAlmostEqual(expected_dosage,
                                   locus.genotype_data[slot],
                                   places=3)
        seen += 1
    self.assertEqual(20, seen)
def testMapFileWithSnpBoundary(self):
    """Load a map file limited to rs0001-rs0003 and inspect markers and mask."""
    BoundaryCheck.chrom = 1
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0001-rs0003"])
    map_parser = PedigreeParser(self.map_filename, self.ped_filename)
    map_parser.load_mapfile()

    self.assertEqual(3, len(map_parser.markers))
    mask = map_parser.snp_mask
    self.assertEqual(7, len(mask))
    self.assertEqual(3, map_parser.locus_count)

    # Masks are filters, so we should have 7 entries, but 4 will be 1
    masked_total = numpy.sum(mask[:, 0])
    self.assertEqual(4, masked_total)
    self.assertEqual(0, mask[0, 0])
    self.assertEqual(0, mask[1, 1])
    self.assertEqual(0, mask[2, 1])
def testChromosomesNoChrPos(self):
    """Parse MACH data with chrpos encoding disabled and check locus labels.

    Every locus must report "NA" for chr and pos and carry a "chrom:pos"
    rsid built from the fixture lists. Twenty loci are expected.
    """
    mach_parser.Parser.chrpos_encoding = False
    DataParser.boundary = SnpBoundaryCheck(self.locus_labels)
    pheno_covar = PhenoCovar()
    dosage_parser = mach_parser.Parser([self.gen_file, self.gen_file2])
    dosage_parser.load_family_details(pheno_covar)
    dosage_parser.load_genotypes()

    seen = 0
    for locus in dosage_parser:
        self.assertEqual("NA", locus.pos)
        self.assertEqual("NA", locus.chr)
        expected_rsid = "%d:%d" % (self.chroms[seen], self.positions[seen])
        self.assertEqual(expected_rsid, locus.rsid)
        seen += 1
    self.assertEqual(20, seen)
def testBedSnpBounded(self):
    """Run the analysis on bed_parser data limited to rs1000-rs3000.

    Checks the chromosome and position of the first result, then the
    p_mvtest and lmpv values of the first three results to six places.
    """
    BoundaryCheck.chrom = 1
    DataParser.boundary = SnpBoundaryCheck(snps=["rs1000-rs3000"])
    pheno_covar = PhenoCovar()
    bed_dataset = bed_parser.Parser(self.nonmissing_fam, self.nonmissing_bim,
                                    self.nonmissing_bed)
    bed_dataset.load_fam(pheno_covar)
    bed_dataset.load_bim(map3=False)
    bed_dataset.load_genotypes()

    results = list(mv_esteq.RunAnalysis(bed_dataset, pheno_covar))

    first = results[0]
    self.assertEqual(1, first.chr)
    self.assertEqual(1000, first.pos)

    # (p_mvtest, lmpv) expected for results[0], results[1], results[2]
    expected = (
        (0.00347562, 0.00085539),
        (0.5777812, 0.42212155),
        (0.44661276, 0.61386344),
    )
    for position, (p_mvtest, lmpv) in enumerate(expected):
        outcome = results[position]
        self.assertAlmostEqual(p_mvtest, outcome.p_mvtest, places=6)
        self.assertAlmostEqual(lmpv, outcome.lmpv, places=6)
def testPedSnpBoundaryBed(self):
    """Load loci rs0001-rs0003 via bed_parser and verify each one.

    Three loci are expected (locus_count == 3). Each is compared against
    self.nonmissing_mapdata and self.genotypes starting at index 0, and
    all three must pass genotype extraction (valid_loci == 3).
    """
    pc = PhenoCovar()
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0001-rs0003"])
    BoundaryCheck.chrom = 1
    ped_parser = bed_parser.Parser(self.nonmissing_fam, self.nonmissing_bim, self.nonmissing_bed)
    ped_parser.load_fam(pc)
    ped_parser.load_bim(map3=False)
    ped_parser.load_genotypes()
    pedigree = self.nonmissing_mapdata
    index = 0
    # Counts loci whose genotype extraction and comparisons all succeeded
    valid_loci = 0
    self.assertEqual(3, ped_parser.locus_count)
    for snp in ped_parser:
        for y in pc:
            # Only the third element of the tuple (nonmissing) is used below
            (pheno, covars, nonmissing) = y.get_variables(snp.missing_genotypes)
            try:
                genodata = snp.get_genotype_data(nonmissing)
                self.assertEqual(int(pedigree[index][0]), snp.chr)
                self.assertEqual(int(pedigree[index][3]), snp.pos)
                self.assertEqual(pedigree[index][1], snp.rsid)
                self.assertEqual(self.genotypes[index], list(genodata.genotypes))
                valid_loci += 1
            # Rejected loci are not failed here; the valid_loci assertion
            # below catches any that were skipped.
            except TooMuchMissing as e:
                pass
            except InvalidFrequency as e:
                pass
            # NOTE(review): nesting reconstructed from a flattened source; the
            # final assertions only hold here if `pc` yields exactly one item
            # per locus -- confirm this increment belongs to the inner loop.
            index += 1
    self.assertEqual(3, valid_loci)  # we have selected only the first 3
    self.assertEqual(3, index)
def testPedSnpBoundaryTPed(self):
    """Load a TPED dataset limited to rs0001-rs0003 and verify the loci.

    Three loci are expected. Each is compared against the matching TPED
    row and against the fixture heterozygote frequencies and genotypes.
    Loci whose genotype extraction raises TooMuchMissing or
    InvalidFrequency are counted but not compared.
    """
    pheno_covar = PhenoCovar()
    parser = TransposedPedigreeParser(self.tfam_filename, self.tped_filename)
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0001-rs0003"])
    BoundaryCheck.chrom = 1
    parser.load_tfam(pheno_covar)
    parser.load_genotypes()

    tped_rows = get_lines(self.tped_filename, split=True)

    for row_idx, locus in enumerate(parser.get_loci()):
        self.assertEqual(int(tped_rows[row_idx][0]), locus.chr)
        self.assertEqual(int(tped_rows[row_idx][3]), locus.pos)
    self.assertEqual(3, parser.locus_count)

    row_idx = 0
    for locus in parser:
        # All-True mask: keep every entry of the locus
        keep_all = numpy.ones(locus.missing_genotypes.shape[0], dtype=bool)
        try:
            genodata = locus.get_genotype_data(keep_all)
            row = tped_rows[row_idx]
            self.assertEqual(int(row[0]), locus.chr)
            self.assertEqual(int(row[3]), locus.pos)
            self.assertEqual(row[1], locus.rsid)
            self.assertAlmostEqual(self.hetero_freq_tped[row_idx],
                                   genodata.hetero_freq,
                                   places=4)
            self.assertEqual(self.genotypes[row_idx], list(genodata.genotypes))
        except (TooMuchMissing, InvalidFrequency):
            pass
        row_idx += 1
    self.assertEqual(3, row_idx)
def testTpedSnpBounded(self):
    """Run the analysis on TPED data limited to rs1000-rs3000.

    Checks the chromosome and position of the first result, then for the
    first three results the p_mvtest value and the beta, standard error
    and p-value at coefficient indices 1 and 3, all to six places.
    """
    BoundaryCheck.chrom = 1
    DataParser.boundary = SnpBoundaryCheck(snps=["rs1000-rs3000"])
    pheno_covar = PhenoCovar()
    tped_dataset = TransposedPedigreeParser(self.tfam_filename,
                                            self.tped_filename)
    tped_dataset.load_tfam(pheno_covar)
    tped_dataset.load_genotypes()

    results = list(mv_esteq.RunAnalysis(tped_dataset, pheno_covar))

    self.assertEqual(1, results[0].chr)
    self.assertEqual(1000, results[0].pos)

    # One entry per result:
    #   (p_mvtest, ((coef_index, beta, beta_stderr, beta_pvalue), ...))
    expected = (
        (0.0034756155, (
            (1, 0.1134684009, 0.0337649965541, 0.0007779211),
            (3, -0.0033479839, 0.0492050029324, 0.9457525716),
        )),
        (0.57778118, (
            (1, 0.02798537, 0.033790691857, 0.40755865),
            (3, 0.03275892, 0.0475661, 0.49101013),
        )),
        (0.44661276, (
            (1, 0.01663975, 0.03443300, 0.62891811),
            (3, 0.05712017, 0.04783608, 0.232446188),
        )),
    )
    for position, (p_mvtest, coefficients) in enumerate(expected):
        outcome = results[position]
        self.assertAlmostEqual(p_mvtest, outcome.p_mvtest, places=6)
        for coef, beta, stderr, pvalue in coefficients:
            self.assertAlmostEqual(beta, outcome.betas[coef], places=6)
            self.assertAlmostEqual(stderr, outcome.beta_stderr[coef], places=6)
            self.assertAlmostEqual(pvalue, outcome.beta_pvalues[coef], places=6)
def LoadCmdLine(self, args=sys.argv[1:]):
    """Parse user arguments using argparse and set up components.

    Configures the class-level settings on BoundaryCheck, DataParser,
    PhenoCovar and the format-specific parsers from the command line, then
    builds and loads the dataset for whichever input format was requested
    (pedigree, transposed pedigree, binary pedigree, IMPUTE or MACH).

    :param args: list of command line tokens (defaults to sys.argv[1:] as
        captured when the function was defined)
    :return: tuple (dataset, pheno_covar) -- the loaded parser object and
        the PhenoCovar instance populated alongside it

    Invalid or inconsistent arguments are reported on stderr followed by
    sys.exit(1), or handed to libgwas.Exit / libgwas.ExitIf (presumably
    terminating as well -- confirm against libgwas). -v and --vall print
    version details and call sys.exit(0).
    """
    parser = argparse.ArgumentParser(description="MV Test: " + __version__,
                                     epilog="""
mvtest.py is uses many of the same arguments as plink, but there are a few
differences, so please consider the list above carefully.
""")

    parser.add_argument("-v", action='store_true',
                        help="Print version number")
    parser.add_argument(
        "--vall", action='store_true',
        help="Print version number along with each dependency")

    # --- SNP / region selection -------------------------------------------
    parser.add_argument("--chr", type=int, default=-1, metavar="N",
                        help="Select Chromosome")
    parser.add_argument(
        "--snps", type=str, default="",
        help="Comma-delimited list of SNP(s): rs1,rs2,rs3-rs6")
    parser.add_argument("--from-bp", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-bp", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument("--from-kb", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-kb", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument("--from-mb", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-mb", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument(
        "--exclude", type=str, default="",
        help="Comma-delimited list of rsids to be excluded")

    # For now, I'm not implementing keep, since we don't have any real
    # meaningful need for analyzing individuals.
    # PLINK does, but we don't do the QC stuff they do.
    parser.add_argument(
        "--keep", type=str, default="",
        help="Comma-delimited list of individuals to be analyzed")
    parser.add_argument(
        "--remove", type=str, default="",
        help="Comma-delimited list of individuals to be removed from analysis")

    # --- Pedigree input ---------------------------------------------------
    parser.add_argument("--file", type=str,
                        help="Prefix for .ped and .map files")
    parser.add_argument("--ped", type=argparse.FileType('r'),
                        help="PLINK compatible .ped file")
    parser.add_argument("--map", type=argparse.FileType('r'),
                        help="PLINK compatible .map file")
    parser.add_argument("--map3", action='store_true',
                        help="MAP file has only 3 columns")
    parser.add_argument("--no-sex", action='store_true',
                        help="Pedigree file doesn't have column 5 (sex)")
    parser.add_argument(
        "--no-parents", action="store_true",
        help="Pedigree file doesn't have columns 3 and 4 (parents)")
    parser.add_argument(
        "--no-fid", action="store_true",
        help="Pedigree file doesn't have column 1 (family ID)")
    parser.add_argument(
        "--no-pheno", action="store_true",
        help="Pedigree file doesn't have column 6 (phenotype")
    parser.add_argument("--liability", action="store_true",
                        help="Pedigree file has column 7 (liability)")

    # --- Binary pedigree input --------------------------------------------
    parser.add_argument("--bfile", type=str,
                        help="Prefix for .bed, .bim and .fam files")
    parser.add_argument("--bed", type=argparse.FileType('r'),
                        help="Binary Ped file (.bed)")
    parser.add_argument("--bim", type=argparse.FileType('r'),
                        help="Binary ped marker file (.bim)")
    parser.add_argument("--fam", type=argparse.FileType('r'),
                        help="Binary ped family file (.fam)")

    # --- Transposed pedigree input ----------------------------------------
    parser.add_argument("--tfile", type=str,
                        help="Prefix for .tped and .tfam files")
    parser.add_argument("--tped", type=argparse.FileType('r'),
                        help="Transposed Pedigree file (.tped)")
    parser.add_argument("--tfam", type=argparse.FileType('r'),
                        help="Transposed pedigre Family file (.tfam)")
    parser.add_argument(
        "--compressed", action="store_true",
        help="Ped/TPed compressed with gzip (named .ped.tgz or .tped.tgz)")

    # --- IMPUTE input -----------------------------------------------------
    parser.add_argument(
        "--impute", type=argparse.FileType('r'),
        help="File containing list of impute output for analysis")
    parser.add_argument(
        "--impute-fam", type=argparse.FileType('r'),
        help="File containing family details for impute data")
    parser.add_argument(
        "--impute-offset", type=int, default=-1,
        help="Impute file index (1 based) to begin analysis")
    parser.add_argument(
        "--impute-count", type=int, default=-1,
        help="Number of impute files to process (for this node)")
    parser.add_argument(
        "--impute-uncompressed", action="store_true",
        help="Indicate that the impute input is not gzipped, but plain text")
    parser.add_argument(
        "--impute-encoding", type=str,
        choices=['additive', 'dominant', 'recessive', 'genotype'],
        default='additive', help='Genetic model to be used')
    parser.add_argument("--impute-info-ext", type=str, default='info',
                        help="Portion of filename denotes info filename")
    parser.add_argument("--impute-gen-ext", type=str, default='gen.gz',
                        help="Portion of filename that denotes gen file")
    parser.add_argument(
        "--impute-info-thresh", type=float, default=0.4,
        help="Threshold for filtering imputed SNPs with poor 'info' values")

    # --- MACH input -------------------------------------------------------
    parser.add_argument(
        "--mach", type=argparse.FileType('r'),
        help="File containing list of MACH output for analysis")
    parser.add_argument("--mach-offset", type=int, default=-1,
                        help="Mach file index (1 based) to begin analysis")
    parser.add_argument(
        "--mach-count", type=int, default=-1,
        help="Number of mach files to process (for this node)")
    parser.add_argument("--mach-uncompressed", action="store_true",
                        help="Indicate that the mach input is not gzipped")
    parser.add_argument(
        "--mach-chunk-size", type=int, default=100000,
        help="Max number of loci to load at once (higher increases memory requirements with some speed benefits)")
    parser.add_argument("--mach-info-ext", type=str, default="info.gz",
                        help="Portion of filename denotes info filenames")
    parser.add_argument("--mach-dose-ext", type=str, default="dose.gz",
                        help="Portion of filename that denotes dose files")
    parser.add_argument("--mach-min-rsquared", type=float, default=0.3,
                        help="Filter out loci with RSquared < this value")
    parser.add_argument(
        "--mach-chrpos", action="store_true",
        help="When true, first col in .info file must be chr:pos (additional pieces allowed)")

    # --- Phenotypes and covariates ----------------------------------------
    parser.add_argument("--pheno", type=argparse.FileType('r'),
                        help="File containing phenotypes")
    parser.add_argument("--sample-pheno", type=argparse.FileType('r'),
                        help="(Mach) Sample file containing phenotypes")
    parser.add_argument(
        "--mphenos", type=str, default="",
        help="Column number(s) for phenotype to be analyzed if number of columns > 1")
    parser.add_argument(
        "--pheno-names", type=str, default="",
        help="Name for phenotype(s) to be analyzed (must be in --pheno file)")
    parser.add_argument("--all-pheno", action="store_true",
                        help="Analyze all columns from the phenotype file")
    parser.add_argument("--covar", type=argparse.FileType('r'),
                        help="File containing covariates")
    parser.add_argument("--sample-covar", type=argparse.FileType('r'),
                        help="(Mach) Sample file containing covariates")
    parser.add_argument("--covar-numbers", type=str, default="",
                        help="Comma-separated list of covariate indices")
    parser.add_argument("--covar-names", type=str, default="",
                        help="Comma-separated list of covariate names")
    parser.add_argument(
        "--sex", action='store_true',
        help="Use sex from the pedigree file as a covariate")
    parser.add_argument("--missing-phenotype", type=float, default=-9.0,
                        help="Encoding for missing phenotypes")

    # --- Quality filters --------------------------------------------------
    parser.add_argument("--maf", type=float, default=0.0,
                        help="Minimum MAF allowed for analysis")
    parser.add_argument("--max-maf", type=float, default=1.0,
                        help="MAX MAF allowed for analysis")
    parser.add_argument("--geno", type=float, default=1.0,
                        help="MAX per-SNP missing for analysis")
    parser.add_argument("--mind", type=float, default=1.0,
                        help="MAX per-person missing")

    parser.add_argument("--verbose", action='store_true',
                        help="Output additional data details")

    parser.set_defaults(all_pheno=False, sex=False, mach_chrpos=False)
    args = parser.parse_args(args)

    # Report version, if requested, and exit
    if args.v:
        print("%s: %s" % (os.path.basename(__file__), __version__),
              file=sys.stderr)
        sys.exit(0)

    if args.vall:
        print("%s: %s" % (os.path.basename(__file__), __version__),
              file=sys.stderr)
        print("%s: %s" % (os.path.dirname(libgwas.__file__),
                          libgwas.__version__), file=sys.stderr)
        print("%s: %s" % (os.path.dirname(scipy.__file__),
                          scipy.__version__), file=sys.stderr)
        print("%s: %s" % (os.path.dirname(numpy.__file__),
                          numpy.__version__), file=sys.stderr)
        sys.exit(0)

    ###########################################################################
    # Here we deal with the various ways we filter SNPs in and out of analysis
    # We might handle MACH files differently. We'll default the chromosome
    # to be "NA" which is how those can be returned.
    if args.mach is None or args.mach_chrpos:
        BoundaryCheck.chrom = args.chr
    else:
        if args.chr != -1:
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))
        BoundaryCheck.chrom = "NA"
    snps = args.snps.split(",")
    try:
        b = BoundaryCheck(bp=(args.from_bp, args.to_bp),
                          kb=(args.from_kb, args.to_kb),
                          mb=(args.from_mb, args.to_mb))
    except InvalidBoundarySpec as e:
        print("Invalid boundary spec associated: %s" % (e.malformed_boundary),
              file=sys.stderr)
        sys.exit(1)

    try:
        s = SnpBoundaryCheck(snps=snps)
    except InvalidBoundarySpec as e:
        print("Invalid SNP boundary defined: %s" % (e.malformed_boundary),
              file=sys.stderr)
        print(
            "SNPs must be either single or have be a range such as rs123-rs345",
            file=sys.stderr)
        sys.exit(1)

    if b.valid and s.valid:
        print(
            "Only one type of boundary conditions is permitted. Either use --from-bp, etc. or rs123-rs345. ",
            file=sys.stderr)
        sys.exit(1)

    if len(b.bounds) > 0 and not b.valid:
        if BoundaryCheck.chrom == "NA":
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))

    if s.valid:
        DataParser.boundary = s
    else:
        # If b isn't valid, we still want to potentially allow for chr and
        # SNPs, it just won't have any actual boundary listings
        b.LoadSNPs(snps)
        DataParser.boundary = b
    DataParser.boundary.LoadExclusions(snps=args.exclude.split(","))

    ###########################################################################
    # Setup the various Dataset filter criteria
    DataParser.min_maf = args.maf
    DataParser.max_maf = args.max_maf
    DataParser.snp_miss_tol = args.geno
    DataParser.ind_miss_tol = args.mind

    DataParser.ind_exclusions = ParseIndList(args.remove)

    PhenoCovar.sex_as_covariate = args.sex

    # Only ever switched on here; the imputed branches below overwrite it
    if args.compressed:
        DataParser.compressed_pedigree = True

    DataParser.has_sex = not args.no_sex
    DataParser.has_parents = not args.no_parents
    DataParser.has_fid = not args.no_fid
    DataParser.has_pheno = not args.no_pheno
    DataParser.has_liability = args.liability

    pheno_covar = PhenoCovar()
    self.verbose = False
    if args.verbose:
        self.verbose = True

    if args.file is not None or args.ped or args.map:
        if args.ped and not args.map or args.map and not args.ped:
            print(
                "When analyzing pedigree data, both .map and .ped must be specified",
                file=sys.stderr)
            sys.exit(1)
        if args.ped:
            dataset = pedigree_parser.Parser(args.map.name, args.ped.name)
        else:
            dataset = pedigree_parser.Parser("%s.map" % (args.file),
                                             "%s.ped" % (args.file))

        dataset.load_mapfile(map3=args.map3)
        dataset.load_genotypes(pheno_covar)
    elif args.tfile is not None or args.tped or args.tfam:
        if args.tped and not args.tfam or args.tfam and not args.tped:
            print(
                "When analyzing transposed pedigree data, both .tfam and .tped must be specified",
                file=sys.stderr)
            sys.exit(1)
        if args.tped:
            dataset = transposed_pedigree_parser.Parser(
                args.tfam.name, args.tped.name)
        else:
            dataset = transposed_pedigree_parser.Parser(
                "%s.tfam" % (args.tfile), "%s.tped" % (args.tfile))
        dataset.load_tfam(pheno_covar)
        dataset.load_genotypes()
    elif args.bfile is not None:
        dataset = bed_parser.Parser("%s.fam" % (args.bfile),
                                    "%s.bim" % (args.bfile),
                                    "%s.bed" % (args.bfile))
        dataset.load_bim(map3=args.map3)
        dataset.load_fam(pheno_covar)
        dataset.load_genotypes()
    elif args.bed or args.bim or args.fam:
        # All three files are required. This is equivalent to the previous
        # precedence-dependent expression, stated directly.
        if not (args.bed and args.bim and args.fam):
            print(
                "When analyzing binary pedigree data, .bed, .bim and .fam files must be provided",
                file=sys.stderr)
            sys.exit(1)
        # argparse.FileType yields open file objects; hand the parser the
        # paths, as every other branch does (previously the file objects
        # themselves were passed).
        dataset = bed_parser.Parser(args.fam.name, args.bim.name,
                                    args.bed.name)
        dataset.load_bim(map3=args.map3)
        dataset.load_fam(pheno_covar)
        dataset.load_genotypes()
    elif args.impute:
        DataParser.compressed_pedigree = not args.impute_uncompressed
        if (args.impute_offset > 0 and args.impute_count == -1) or (
                args.impute_offset == -1 and args.impute_count > 0):
            print(
                "--impute-count and --impute_offset must both > 0 if one is set other than -1. ",
                file=sys.stderr)
            sys.exit(1)
        if DataParser.snp_miss_tol != 1.0:
            print("--geno does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if DataParser.ind_miss_tol != 1.0:
            print("--mind does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        impute_parser.SetEncoding(args.impute_encoding)
        impute_parser.Parser.info_ext = args.impute_info_ext
        impute_parser.Parser.info_threshold = args.impute_info_thresh
        libgwas.ExitIf(
            "--impute-fam is required for when processing imputed data",
            args.impute_fam is None)
        archives, chroms, infos = self.ParseImputeFile(
            args.impute.name, args.impute_offset, args.impute_count)
        dataset = impute_parser.Parser(args.impute_fam.name, archives,
                                       chroms, infos)
        dataset.load_family_details(pheno_covar)
        dataset.load_genotypes()
    elif args.mach:
        DataParser.compressed_pedigree = not args.mach_uncompressed
        # Offset and count must be supplied together. The second clause
        # previously tested args.impute_count, so a lone --mach-count was
        # never rejected.
        if (args.mach_offset > 0 and args.mach_count == -1) or (
                args.mach_offset == -1 and args.mach_count > 0):
            print(
                "--mach-count and --mach_offset must both be > 0 if one is set other than -1. ",
                file=sys.stderr)
            sys.exit(1)
        if DataParser.snp_miss_tol != 1.0:
            print("--geno does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if DataParser.ind_miss_tol != 1.0:
            print("--mind does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if BoundaryCheck.chrom != "NA" and not args.mach_chrpos:
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))
        mach_parser.Parser.chrpos_encoding = args.mach_chrpos
        mach_parser.Parser.info_ext = args.mach_info_ext
        mach_parser.Parser.dosage_ext = args.mach_dose_ext
        mach_parser.Parser.chunk_stride = args.mach_chunk_size
        mach_parser.Parser.min_rsquared = args.mach_min_rsquared
        archives, infos = self.ParseMachFile(args.mach.name,
                                             args.mach_offset,
                                             args.mach_count)
        dataset = mach_parser.Parser(archives, infos)
        dataset.load_family_details(pheno_covar)
        dataset.load_genotypes()
    else:
        parser.print_usage(sys.stderr)
        print(
            "\nNo data has been specified. Users must specify either pedigree or transposed pedigree to continue",
            file=sys.stderr)
        sys.exit(1)

    if args.pheno or args.sample_pheno:
        # An empty option must become an empty list; "".split(",") would
        # otherwise yield [""]
        mphenos = []
        if args.mphenos != "":
            mphenos = args.mphenos.split(",")

        nphenos = []
        if args.pheno_names != "":
            nphenos = args.pheno_names.split(",")

        if len(mphenos) + len(nphenos) == 0 and not args.all_pheno:
            # NOTE(review): this message looks truncated -- confirm the
            # intended wording before changing it.
            libgwas.Exit("You must select one or more phenotypes when ")

        # --sample-pheno takes precedence over --pheno when both are given
        sample_file = False
        pheno_filename = args.pheno
        if args.sample_pheno:
            pheno_filename = args.sample_pheno
            sample_file = True
        pheno_covar.load_phenofile(pheno_filename, mphenos, nphenos,
                                   sample_file)

    if args.covar:
        pheno_covar.load_covarfile(args.covar,
                                   args.covar_numbers.split(","),
                                   args.covar_names.split(","))
    pheno_covar.do_standardize_variables = True
    return dataset, pheno_covar
class MVTestApplication(object): """Basic application wrapper. Parses the command line and sets the various flags associated with the user's preference and then reports the final settings in use. """ def LoadCmdLine(self, args=sys.argv[1:]): """Parse user arguments using argparse and set up components""" parser = argparse.ArgumentParser(description="MV Test: " + __version__, epilog=""" mvtest.py is uses many of the same arguments as plink, but there are a few differences, so please consider the list above carefully. """) parser.add_argument("-v", action='store_true', help="Print version number") parser.add_argument( "--vall", action='store_true', help="Print version number along with each dependency") parser.add_argument("--chr", type=int, default=-1, metavar="N", help="Select Chromosome") parser.add_argument( "--snps", type=str, default="", help="Comma-delimited list of SNP(s): rs1,rs2,rs3-rs6") parser.add_argument("--from-bp", type=int, metavar="START", help="SNP range start") parser.add_argument("--to-bp", type=int, metavar="END", help="SNP range end") parser.add_argument("--from-kb", type=int, metavar="START", help="SNP range start") parser.add_argument("--to-kb", type=int, metavar="END", help="SNP range end") parser.add_argument("--from-mb", type=int, metavar="START", help="SNP range start") parser.add_argument("--to-mb", type=int, metavar="END", help="SNP range end") parser.add_argument( "--exclude", type=str, default="", help="Comma-delimited list of rsids to be excluded") # For now, I'm not implementing keep, since we don't have any real meaningful need for analyzing individuals # PLINK does, but we don't do the QC stuff they do. 
parser.add_argument( "--keep", type=str, default="", help="Comma-delimited list of individuals to be analyzed") parser.add_argument( "--remove", type=str, default="", help= "Comma-delimited list of individuals to be removed from analysis") parser.add_argument("--file", type=str, help="Prefix for .ped and .map files") parser.add_argument("--ped", type=argparse.FileType('r'), help="PLINK compatible .ped file") parser.add_argument("--map", type=argparse.FileType('r'), help="PLINK compatible .map file") parser.add_argument("--map3", action='store_true', help="MAP file has only 3 columns") parser.add_argument("--no-sex", action='store_true', help="Pedigree file doesn't have column 5 (sex)") parser.add_argument( "--no-parents", action="store_true", help="Pedigree file doesn't have columns 3 and 4 (parents)") parser.add_argument( "--no-fid", action="store_true", help="Pedigree file doesn't have column 1 (family ID)") parser.add_argument( "--no-pheno", action="store_true", help="Pedigree file doesn't have column 6 (phenotype") parser.add_argument("--liability", action="store_true", help="Pedigree file has column 7 (liability)") parser.add_argument("--bfile", type=str, help="Prefix for .bed, .bim and .fam files") parser.add_argument("--bed", type=argparse.FileType('r'), help="Binary Ped file (.bed)") parser.add_argument("--bim", type=argparse.FileType('r'), help="Binary ped marker file (.bim)") parser.add_argument("--fam", type=argparse.FileType('r'), help="Binary ped family file (.fam)") parser.add_argument("--tfile", type=str, help="Prefix for .tped and .tfam files") parser.add_argument("--tped", type=argparse.FileType('r'), help="Transposed Pedigree file (.tped)") parser.add_argument("--tfam", type=argparse.FileType('r'), help="Transposed pedigre Family file (.tfam)") parser.add_argument( "--compressed", action="store_true", help="Ped/TPed compressed with gzip (named .ped.tgz or .tped.tgz)") parser.add_argument( "--impute", type=argparse.FileType('r'), help="File 
containing list of impute output for analysis") parser.add_argument( "--impute-fam", type=argparse.FileType('r'), help="File containing family details for impute data") parser.add_argument( "--impute-offset", type=int, default=-1, help="Impute file index (1 based) to begin analysis") parser.add_argument( "--impute-count", type=int, default=-1, help="Number of impute files to process (for this node)") parser.add_argument( "--impute-uncompressed", action="store_true", help="Indicate that the impute input is not gzipped, but plain text" ) parser.add_argument( "--impute-encoding", type=str, choices=['additive', 'dominant', 'recessive', 'genotype'], default='additive', help='Genetic model to be used') parser.add_argument("--impute-info-ext", type=str, default='info', help="Portion of filename denotes info filename") parser.add_argument("--impute-gen-ext", type=str, default='gen.gz', help="Portion of filename that denotes gen file") parser.add_argument( "--impute-info-thresh", type=float, default=0.4, help="Threshold for filtering imputed SNPs with poor 'info' values" ) parser.add_argument( "--mach", type=argparse.FileType('r'), help="File containing list of MACH output for analysis") parser.add_argument("--mach-offset", type=int, default=-1, help="Mach file index (1 based) to begin analysis") parser.add_argument( "--mach-count", type=int, default=-1, help="Number of mach files to process (for this node)") parser.add_argument("--mach-uncompressed", action="store_true", help="Indicate that the mach input is not gzipped") parser.add_argument( "--mach-chunk-size", type=int, default=100000, help= "Max number of loci to load at once (higher increases memory requirements with some speed benefits)" ) parser.add_argument("--mach-info-ext", type=str, default="info.gz", help="Portion of filename denotes info filenames") parser.add_argument("--mach-dose-ext", type=str, default="dose.gz", help="Portion of filename that denotes dose files") parser.add_argument("--mach-min-rsquared", 
type=float, default=0.3, help="Filter out loci with RSquared < this value") parser.add_argument( "--mach-chrpos", action="store_true", help= "When true, first col in .info file must be chr:pos (additional pieces allowed)" ) parser.add_argument("--pheno", type=argparse.FileType('r'), help="File containing phenotypes") parser.add_argument("--sample-pheno", type=argparse.FileType('r'), help="(Mach) Sample file containing phenotypes") parser.add_argument( "--mphenos", type=str, default="", help= "Column number(s) for phenotype to be analyzed if number of columns > 1" ) parser.add_argument( "--pheno-names", type=str, default="", help= "Name for phenotype(s) to be analyzed (must be in --pheno file)") parser.add_argument("--all-pheno", action="store_true", help="Analyze all columns from the phenotype file") #parser.add_argument("--all-pheno", action='store_true', help="Analyze each phenotype") parser.add_argument("--covar", type=argparse.FileType('r'), help="File containing covariates") parser.add_argument("--sample-covar", type=argparse.FileType('r'), help="(Mach) Sample file containing covariates") parser.add_argument("--covar-numbers", type=str, default="", help="Comma-separated list of covariate indices") parser.add_argument("--covar-names", type=str, default="", help="Comma-separated list of covariate names") parser.add_argument( "--sex", action='store_true', help="Use sex from the pedigree file as a covariate") parser.add_argument("--missing-phenotype", type=float, default=-9.0, help="Encoding for missing phenotypes") parser.add_argument("--maf", type=float, default=0.0, help="Minimum MAF allowed for analysis") parser.add_argument("--max-maf", type=float, default=1.0, help="MAX MAF allowed for analysis") parser.add_argument("--geno", type=float, default=1.0, help="MAX per-SNP missing for analysis") parser.add_argument("--mind", type=float, default=1.0, help="MAX per-person missing") parser.add_argument("--verbose", action='store_true', help="Output additional data 
details") parser.set_defaults(all_pheno=False, sex=False, mach_chrpos=False) args = parser.parse_args(args) # Report version, if requested, and exit if args.v: print >> sys.stderr, "%s: %s" % (os.path.basename(__file__), __version__) sys.exit(0) if args.vall: print >> sys.stderr, "%s: %s" % (os.path.basename(__file__), __version__) print >> sys.stderr, "%s: %s" % (os.path.dirname( libgwas.__file__), libgwas.__version__) print >> sys.stderr, "%s: %s" % (os.path.dirname( scipy.__file__), scipy.__version__) print >> sys.stderr, "%s: %s" % (os.path.dirname( numpy.__file__), numpy.__version__) sys.exit(0) ############################################################################################################### # Here we deal with the various ways we filter SNPs in and out of anlysis # We might handle MACH files differently. We'll default the chromosome # to be "NA" which is how those can be returned. if args.mach is None or args.mach_chrpos: BoundaryCheck.chrom = args.chr else: if args.chr != -1: libgwas.Exit( ("Positional based filtering (--chr, --from/--to)" + " only work with mach_chrpos. See manual for details.")) BoundaryCheck.chrom = "NA" snps = args.snps.split(",") try: b = BoundaryCheck(bp=(args.from_bp, args.to_bp), kb=(args.from_kb, args.to_kb), mb=(args.from_mb, args.to_mb)) except InvalidBoundarySpec, e: print >> sys.stderr, "Invalid boundary spec associated: %s" % ( e.malformed_boundary) sys.exit(1) try: s = SnpBoundaryCheck(snps=snps) except InvalidBoundarySpec, e: print >> sys.stderr, "Invalid SNP boundary defined: %s" % ( e.malformed_boundary) print >> sys.stderr, "SNPs must be either single or have be a range such as rs123-rs345" sys.exit(1)
def testSnpRangesWithExclusions(self):
    # Drives a SnpBoundaryCheck built from one rs-style range, one chr:pos-style
    # range and a single chr:pos ID through an ordered series of TestBoundary
    # calls, with BoundaryCheck.chrom set to "NA" and two IDs marked as ignored.
    #
    # The same ID is expected to give different answers at different points in
    # the sequence (rs500 and 1:650 below), so the order of the calls must be
    # kept exactly as written.
    #
    # NOTE(review): another method named testSnpRangesWithExclusions appears
    # later in this file; if both end up in the same class the later definition
    # replaces this one -- confirm they belong to different test classes.
    BoundaryCheck.chrom = "NA"
    checker = SnpBoundaryCheck(
        snps=["rs1-rs500", "1:600-1:650", "1:987654321"])
    checker.ignored_rs = ["1:751", "1:501"]

    def accepts(chrom, pos, rsid):
        self.assertTrue(checker.TestBoundary(chrom, pos, rsid))

    def rejects(chrom, pos, rsid):
        self.assertFalse(checker.TestBoundary(chrom, pos, rsid))

    self.assertFalse(checker.NoExclusions())
    self.assertTrue(checker.valid)

    # First call uses chromosome 1 rather than "NA" and is expected to fail
    rejects(1, "NA", "rs1")
    rejects("NA", "NA", "rs750")
    accepts("NA", "NA", "rs1")
    rejects("NA", "NA", "1:751")
    accepts("NA", "NA", "1:50")
    accepts("NA", "NA", "rs1")

    # Only the boundary IDs matter; the IDs seen in between are arbitrary
    accepts("NA", "NA", "rs1")
    rejects("NA", "NA", "1:501")
    accepts("NA", "NA", "rs500")
    rejects("NA", "NA", "rs499")
    rejects("NA", "NA", "rs500")

    accepts("NA", "NA", "1:600")
    accepts("NA", "NA", "1:625")
    accepts("NA", "NA", "1:20000")
    accepts("NA", "NA", "1:650")
    rejects("NA", "NA", "1:650")

    rejects(21, "NA", "1:987654321")
    accepts("NA", "NA", "1:987654321")
def testSnpRanges(self):
    # Drives a SnpBoundaryCheck built from two rs ranges and one single rs ID
    # through an ordered series of TestBoundary calls with BoundaryCheck.chrom
    # set to "NA".
    #
    # The same ID is expected to give different answers at different points in
    # the sequence (rs500 and rs650 below), so the order of the calls must be
    # kept exactly as written.
    BoundaryCheck.chrom = "NA"
    checker = SnpBoundaryCheck(
        snps=["rs1-rs500", "rs600-rs650", "rs987654321"])

    def accepts(chrom, pos, rsid):
        self.assertTrue(checker.TestBoundary(chrom, pos, rsid))

    def rejects(chrom, pos, rsid):
        self.assertFalse(checker.TestBoundary(chrom, pos, rsid))

    self.assertFalse(checker.NoExclusions())
    self.assertTrue(checker.valid)

    # First call uses chromosome 1 rather than "NA" and is expected to fail
    rejects(1, 10, "rs1")
    rejects("NA", "NA", "rs750")
    accepts("NA", "NA", "rs1")
    accepts("NA", "NA", "rs50")
    accepts("NA", "NA", "rs1")

    # Only the boundary IDs matter; the IDs seen in between are arbitrary
    accepts("NA", "NA", "rs1")
    accepts("NA", "NA", "rs500")
    rejects("NA", "NA", "rs499")
    rejects(22, 23001, "rs500")

    accepts("NA", "NA", "rs600")
    # Position is deliberately passed as a string here, as in the original
    accepts("NA", "23003", "rs625")
    accepts("NA", "NA", "rs20000")
    accepts("NA", "NA", "rs650")
    rejects("NA", "NA", "rs650")

    rejects(21, "NA", "rs987654321")
    accepts("NA", "NA", "rs987654321")
def testSnpRangesWithExclusions(self):
    # Drives a SnpBoundaryCheck built from two rs ranges and one single rs ID
    # through an ordered series of TestBoundary calls, with BoundaryCheck.chrom
    # set to 22 and two rs IDs marked as ignored.
    #
    # The same ID is expected to give different answers at different points in
    # the sequence (rs500 and rs650 below), so the order of the calls must be
    # kept exactly as written.
    #
    # NOTE(review): another method named testSnpRangesWithExclusions appears
    # earlier in this file; if both end up in the same class this definition
    # replaces the earlier one -- confirm they belong to different test classes.
    BoundaryCheck.chrom = 22
    checker = SnpBoundaryCheck(
        snps=["rs1-rs500", "rs600-rs650", "rs987654321"])
    checker.ignored_rs = ["rs751", "rs501"]

    def accepts(chrom, pos, rsid):
        self.assertTrue(checker.TestBoundary(chrom, pos, rsid))

    def rejects(chrom, pos, rsid):
        self.assertFalse(checker.TestBoundary(chrom, pos, rsid))

    self.assertFalse(checker.NoExclusions())
    self.assertTrue(checker.valid)

    # First call uses chromosome 1 rather than 22 and is expected to fail
    rejects(1, 10, "rs1")
    rejects(22, 20000, "rs750")
    accepts(22, 20001, "rs1")
    rejects(22, 20002, "rs751")
    accepts(22, 22003, "rs50")
    accepts(22, 22005, "rs1")

    # Only the boundary IDs matter; the IDs seen in between are arbitrary
    accepts(22, 22010, "rs1")
    rejects(22, 22011, "rs501")
    accepts(22, 22012, "rs500")
    rejects(22, 23000, "rs499")
    rejects(22, 23002, "rs500")

    accepts(22, 23003, "rs600")
    accepts(22, 23004, "rs625")
    accepts(22, 23010, "rs20000")
    accepts(22, 24000, "rs650")
    rejects(22, 25000, "rs650")

    rejects(21, 2500000, "rs987654321")
    accepts(22, 2500000, "rs987654321")