def testBedAnalysis(self):
    """Run the MV analysis on the non-missing BED fixture and check p-values.

    ``BoundaryCheck.chrom`` is set to 1 and a default ``BoundaryCheck`` is
    installed on ``DataParser`` before the fixture is loaded.  The first nine
    results are compared with stored ``p_mvtest`` / ``lmpv`` reference values
    to six decimal places.
    """
    # We'll start with the correct phenotype with the genotypes, so we'll use
    # a boundary to restrict us to only use the first SNP
    # NOTE(review): nine results are asserted below, which does not match
    # "only the first SNP" -- confirm what the boundary is meant to select.
    BoundaryCheck.chrom = 1
    DataParser.boundary = BoundaryCheck()

    phenotypes = PhenoCovar()
    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    results = list(mv_esteq.RunAnalysis(parser, phenotypes))

    # (expected p_mvtest, expected lmpv) for each result, in result order.
    expected = (
        (0.00347562, 0.00085539),
        (0.5777812, 0.42212155),
        (0.44661276, 0.61386344),
        (0.13555597, 0.59682217),
        (0.54029842, 0.60475964),
        (0.03547514, 0.86663730),
        (0.79249216, 0.67678089),
        (0.20973300, 0.14431260),
        (0.81471528, 0.56378497),
    )
    for position, (p_mvtest, lmpv) in enumerate(expected):
        self.assertAlmostEqual(p_mvtest, results[position].p_mvtest, places=6)
        self.assertAlmostEqual(lmpv, results[position].lmpv, places=6)
def testPedRegionBoundaryWithExclusionsTPed(self):
    """Check a SNP-range boundary combined with a SNP exclusion.

    The boundary is ``rs0005-rs0007`` with ``rs0007`` excluded and
    ``BoundaryCheck.chrom`` set to 2.  The parser must report two loci, and
    each yielded locus is compared against the stored map/genotype fixtures
    starting at fixture row 4.
    """
    phenotypes = PhenoCovar()
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0005-rs0007"])
    DataParser.boundary.LoadExclusions(snps=["rs0007"])
    BoundaryCheck.chrom = 2

    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.nonmissing_mapdata
    row = 4  # fixture row of the first locus expected inside the boundary
    self.assertEqual(2, parser.locus_count)

    for locus in parser:
        for trait in phenotypes:
            _, _, nonmissing = trait.get_variables(locus.missing_genotypes)
            try:
                genodata = locus.get_genotype_data(nonmissing)
                marker = expected_map[row]
                self.assertEqual(int(marker[0]), locus.chr)
                self.assertEqual(int(marker[3]), locus.pos)
                self.assertEqual(marker[1], locus.rsid)
                self.assertEqual(self.genotypes[row],
                                 list(genodata.genotypes))
            except (TooMuchMissing, InvalidFrequency):
                # Loci rejected by the parser's filters are simply skipped.
                pass
            row += 1
    self.assertEqual(6, row)
def testBedAnalysisCov(self):
    """Run the MV analysis with sex enabled as a covariate and check p-values.

    ``PhenoCovar.sex_as_covariate`` is switched on before the phenotype
    container is built.  Nine results are compared against stored reference
    values; the precision used for each comparison is listed in the table.
    """
    PhenoCovar.sex_as_covariate = True
    DataParser.boundary = BoundaryCheck()

    phenotypes = PhenoCovar()
    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    results = list(mv_esteq.RunAnalysis(parser, phenotypes))

    # (expected p_mvtest, expected lmpv, decimal places for the lmpv check);
    # p_mvtest is always checked to six places.
    expected = (
        (0.0034238, 0.0143949, 6),
        (0.58495059, 0.65786, 5),
        (0.45178985, 0.83956, 5),
        (0.133661, 0.82169, 5),
        (0.541391, 0.83595, 5),
        (0.035665, 0.94900, 5),
        (0.784660, 0.59324, 5),
        (0.2137434, 0.18069, 5),
        (0.8160148, 0.79734, 5),
    )
    for position, (p_mvtest, lmpv, lmpv_places) in enumerate(expected):
        self.assertAlmostEqual(p_mvtest, results[position].p_mvtest, places=6)
        self.assertAlmostEqual(lmpv, results[position].lmpv,
                               places=lmpv_places)
def testPedWithMissingComplete(self):
    """Iterate the BED fixture that contains missing data and verify each locus.

    Every locus yielded by the parser is compared with the stored map rows
    and ``self.genotypes_w_missing``.  Loci that raise ``TooMuchMissing``,
    ``InvalidFrequency`` or ``InvariantVar`` are skipped but still counted,
    and seven loci must have been visited in total.
    """
    phenotypes = PhenoCovar()
    parser = bed_parser.Parser(self.missing_fam,
                               self.missing_bim,
                               self.missing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.missing_mapdata
    row = 0
    for locus in parser:
        try:
            for trait in phenotypes:
                _, _, nonmissing = trait.get_variables(locus.missing_genotypes)
                genodata = locus.get_genotype_data(nonmissing)
                marker = expected_map[row]
                self.assertEqual(int(marker[0]), locus.chr)
                self.assertEqual(int(marker[3]), locus.pos)
                self.assertEqual(marker[1], locus.rsid)
                self.assertEqual(self.genotypes_w_missing[row],
                                 list(genodata.genotypes))
        except (TooMuchMissing, InvalidFrequency, InvariantVar):
            # A rejected locus is not an error for this test.
            pass
        row += 1
    self.assertEqual(7, row)
def testTPedPhenoComplete(self):
    """Load the non-missing BED fixture with sex as a covariate and verify it.

    Checks the sizes of the covariate and phenotype data (12 entries each,
    one phenotype name), then compares every yielded locus against the
    stored map rows and genotypes.  Seven loci must be visited.
    """
    PhenoCovar.sex_as_covariate = True
    phenotypes = PhenoCovar()
    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    self.assertEqual(12, len(phenotypes.covariate_data[0]))
    self.assertEqual(12, len(phenotypes.phenotype_data[0]))
    self.assertEqual(1, len(phenotypes.phenotype_names))

    expected_map = self.nonmissing_mapdata
    row = 0
    for locus in parser:
        for trait in phenotypes:
            _, _, nonmissing = trait.get_variables(locus.missing_genotypes)
            try:
                genodata = locus.get_genotype_data(nonmissing)
                marker = expected_map[row]
                self.assertEqual(int(marker[0]), locus.chr)
                self.assertEqual(int(marker[3]), locus.pos)
                self.assertEqual(marker[1], locus.rsid)
                self.assertEqual(self.genotypes[row],
                                 list(genodata.genotypes))
            except (TooMuchMissing, InvalidFrequency):
                # Filtered loci are skipped without failing the test.
                pass
            row += 1
    self.assertEqual(7, row)
def testBedBounded(self):
    """Run the MV analysis under a base-pair boundary of [2000, 3000].

    With ``BoundaryCheck.chrom`` set to 1, the first result must be on
    chromosome 1 at position 2000, and the first two results must match the
    stored ``p_mvtest`` / ``lmpv`` reference values to six decimal places.
    """
    BoundaryCheck.chrom = 1
    DataParser.boundary = BoundaryCheck(bp=[2000, 3000])

    phenotypes = PhenoCovar()
    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    results = list(mv_esteq.RunAnalysis(parser, phenotypes))

    first = results[0]
    self.assertEqual(1, first.chr)
    self.assertEqual(2000, first.pos)

    # (expected p_mvtest, expected lmpv) for each result, in result order.
    expected = (
        (0.5777811, 0.4221215),
        (0.4466128, 0.6138634),
    )
    for position, (p_mvtest, lmpv) in enumerate(expected):
        self.assertAlmostEqual(p_mvtest, results[position].p_mvtest, places=6)
        self.assertAlmostEqual(lmpv, results[position].lmpv, places=6)
def testPedWithMissingMxBothComplete(self):
    """Verify loading with both missingness tolerances set to 0.5.

    ``DataParser.snp_miss_tol`` and ``DataParser.ind_miss_tol`` are set to
    0.5 before the fixture with missing data is loaded.  Each yielded locus
    is checked against the local expected-genotype table; seven loci must
    be visited and six of them must pass all checks.
    """
    phenotypes = PhenoCovar()
    DataParser.snp_miss_tol = 0.5       # We should only lose 1
    DataParser.ind_miss_tol = 0.5       # We should only lose 1

    parser = bed_parser.Parser(self.missing_fam,
                               self.missing_bim,
                               self.missing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.missing_mapdata
    expected_genotypes = [
        [0, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0],
        [1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1],
        [0, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0],
        [0, 2, 1, 1, 0, 0, 1, 2, 1, 1, 0],
        [1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
    ]

    row = 0
    passed = 0
    for locus in parser:
        try:
            for trait in phenotypes:
                _, _, nonmissing = trait.get_variables(locus.missing_genotypes)
                genodata = locus.get_genotype_data(nonmissing)
                marker = expected_map[row]
                self.assertEqual(int(marker[0]), locus.chr)
                self.assertEqual(int(marker[3]), locus.pos)
                self.assertEqual(marker[1], locus.rsid)
                self.assertEqual(expected_genotypes[row],
                                 list(genodata.genotypes))
                passed += 1
        except (TooMuchMissing, InvalidFrequency, InvariantVar):
            # Rejected loci are counted in ``row`` but not in ``passed``.
            pass
        row += 1
    self.assertEqual(6, passed)
    self.assertEqual(7, row)
def testMissingWithExclusions(self):
    """Verify the missing-data fixture with individuals 2:2 and 3:3 excluded.

    ``DataParser.ind_exclusions`` is set before loading.  The parser must
    report seven loci, and each yielded locus is compared against the map
    fixture and the local expected-genotype table.
    """
    DataParser.ind_exclusions = ["2:2", "3:3"]
    expected_genotypes = [
        [0, -1, -1, -1, -1, -1, -1, -1, -1, 1],
        [1, 0, 0, 1, 1, 1, 0, 0, 0, 1],
        [0, 1, 0, 0, 0, 2, 1, 1, 0, 0],
        [0, 1, 1, 0, 0, 1, 2, 1, 1, 0],
        [1, 1, 0, 0, 1, 2, 0, 1, 0, 0],
        [1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
    ]

    phenotypes = PhenoCovar()
    parser = bed_parser.Parser(self.missing_fam,
                               self.missing_bim,
                               self.missing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.missing_mapdata
    self.assertEqual(7, parser.locus_count)

    row = 0
    for locus in parser:
        try:
            for trait in phenotypes:
                _, _, nonmissing = trait.get_variables(locus.missing_genotypes)
                genodata = locus.get_genotype_data(nonmissing)
                marker = expected_map[row]
                self.assertEqual(int(marker[0]), locus.chr)
                self.assertEqual(int(marker[3]), locus.pos)
                self.assertEqual(marker[1], locus.rsid)
                self.assertEqual(expected_genotypes[row],
                                 list(genodata.genotypes))
        except (TooMuchMissing, InvalidFrequency, InvariantVar):
            # A rejected locus is skipped; it still advances ``row``.
            pass
        row += 1
    self.assertEqual(7, row)
def testBedSnpBounded(self):
    """Run the MV analysis under the SNP-range boundary ``rs1000-rs3000``.

    With ``BoundaryCheck.chrom`` set to 1, the first result must be on
    chromosome 1 at position 1000, and the first three results must match
    the stored ``p_mvtest`` / ``lmpv`` reference values to six places.
    """
    BoundaryCheck.chrom = 1
    DataParser.boundary = SnpBoundaryCheck(snps=["rs1000-rs3000"])

    phenotypes = PhenoCovar()
    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    results = list(mv_esteq.RunAnalysis(parser, phenotypes))

    first = results[0]
    self.assertEqual(1, first.chr)
    self.assertEqual(1000, first.pos)

    # (expected p_mvtest, expected lmpv) for each result, in result order.
    expected = (
        (0.00347562, 0.00085539),
        (0.5777812, 0.42212155),
        (0.44661276, 0.61386344),
    )
    for position, (p_mvtest, lmpv) in enumerate(expected):
        self.assertAlmostEqual(p_mvtest, results[position].p_mvtest, places=6)
        self.assertAlmostEqual(lmpv, results[position].lmpv, places=6)
def testPedSnpBoundaryBed(self):
    """Check that the SNP-range boundary ``rs0001-rs0003`` yields three loci.

    ``BoundaryCheck.chrom`` is set to 1.  The parser must report three loci,
    each must match the stored map rows and genotypes, and all three must
    pass the checks.
    """
    phenotypes = PhenoCovar()
    DataParser.boundary = SnpBoundaryCheck(snps=["rs0001-rs0003"])
    BoundaryCheck.chrom = 1

    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.nonmissing_mapdata
    row = 0
    passed = 0
    self.assertEqual(3, parser.locus_count)

    for locus in parser:
        for trait in phenotypes:
            _, _, nonmissing = trait.get_variables(locus.missing_genotypes)
            try:
                genodata = locus.get_genotype_data(nonmissing)
                marker = expected_map[row]
                self.assertEqual(int(marker[0]), locus.chr)
                self.assertEqual(int(marker[3]), locus.pos)
                self.assertEqual(marker[1], locus.rsid)
                self.assertEqual(self.genotypes[row],
                                 list(genodata.genotypes))
                passed += 1
            except (TooMuchMissing, InvalidFrequency):
                # Filtered loci advance ``row`` but are not counted as passed.
                pass
            row += 1
    self.assertEqual(3, passed)
    # we have selected only the first 3
    self.assertEqual(3, row)
def testRegionBoundaryWithExclusions(self):
    """Verify loci when three individuals are excluded and chrom is set to 2.

    ``DataParser.ind_exclusions`` removes 1:1, 2:2 and 3:3, so the expected
    genotype rows below are shorter than the shared fixture's.  Comparison
    starts at fixture row 4 and must end with the row counter at 7.
    """
    DataParser.ind_exclusions = ["1:1", "2:2", "3:3"]
    expected_genotypes = [
        [0, 1, 0, 0, 1, 0, 0, 1, 0],
        [0, 0, 1, 1, 1, 0, 0, 0, 1],
        [1, 0, 0, 0, 2, 1, 1, 0, 0],
        [1, 1, 0, 0, 1, 2, 1, 1, 0],
        [1, 0, 0, 1, 2, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 0, 0],
    ]
    BoundaryCheck.chrom = 2

    phenotypes = PhenoCovar()
    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.nonmissing_mapdata
    row = 4  # fixture row of the first locus expected from the parser
    for locus in parser:
        for trait in phenotypes:
            _, _, nonmissing = trait.get_variables(locus.missing_genotypes)
            try:
                genodata = locus.get_genotype_data(nonmissing)
                marker = expected_map[row]
                self.assertEqual(int(marker[0]), locus.chr)
                self.assertEqual(int(marker[3]), locus.pos)
                self.assertEqual(marker[1], locus.rsid)
                self.assertEqual(expected_genotypes[row],
                                 list(genodata.genotypes))
            except (TooMuchMissing, InvalidFrequency):
                # Filtered loci are skipped without failing the test.
                pass
            row += 1
    self.assertEqual(7, row)
def testPedBoundaryBed(self):
    """Check a default boundary with ``BoundaryCheck.chrom`` set to 2.

    Loci yielded by the parser are compared with the stored map rows and
    genotypes starting at fixture row 4.  Three loci must pass the checks
    and the row counter must end at 7.
    """
    phenotypes = PhenoCovar()
    DataParser.boundary = BoundaryCheck()
    BoundaryCheck.chrom = 2

    parser = bed_parser.Parser(self.nonmissing_fam,
                               self.nonmissing_bim,
                               self.nonmissing_bed)
    parser.load_fam(phenotypes)
    parser.load_bim(map3=False)
    parser.load_genotypes()

    expected_map = self.nonmissing_mapdata
    row = 4  # fixture row of the first locus expected from the parser
    passed = 0
    for locus in parser:
        for trait in phenotypes:
            _, _, nonmissing = trait.get_variables(locus.missing_genotypes)
            try:
                genodata = locus.get_genotype_data(nonmissing)
                marker = expected_map[row]
                self.assertEqual(int(marker[0]), locus.chr)
                self.assertEqual(int(marker[3]), locus.pos)
                self.assertEqual(marker[1], locus.rsid)
                self.assertEqual(self.genotypes[row],
                                 list(genodata.genotypes))
                passed += 1
            except (TooMuchMissing, InvalidFrequency):
                # Filtered loci advance ``row`` but are not counted as passed.
                pass
            row += 1
    self.assertEqual(3, passed)
    self.assertEqual(7, row)
def LoadCmdLine(self, args=sys.argv[1:]):
    """Parse user arguments using argparse and set up components.

    Builds the command-line parser, applies the SNP/individual filters to
    the ``BoundaryCheck``, ``DataParser`` and ``PhenoCovar`` class settings,
    creates the dataset parser that matches the supplied input files and
    loads any phenotype/covariate files.

    :param args: list of command-line tokens (without the program name)
    :return: tuple ``(dataset, pheno_covar)`` -- the loaded dataset parser
             and the ``PhenoCovar`` object populated alongside it

    The process is terminated (``sys.exit`` or ``libgwas.Exit``) when a
    version flag is given, when option combinations are invalid, or when no
    input data is specified.  ``self.verbose`` is set from ``--verbose``.

    NOTE(review): ``--keep``, ``--missing-phenotype``, ``--sample-covar``
    and ``--impute-gen-ext`` are accepted by the parser but never read in
    this function.
    """
    parser = argparse.ArgumentParser(
        description="MV Test: " + __version__,
        # argparse re-wraps the epilog, so it is kept as one flat string.
        epilog=(" mvtest.py is uses many of the same arguments as plink, "
                "but there are a few differences, so please consider the "
                "list above carefully. "))

    parser.add_argument("-v", action='store_true',
                        help="Print version number")
    parser.add_argument(
        "--vall", action='store_true',
        help="Print version number along with each dependency")

    # ---- SNP selection --------------------------------------------------
    parser.add_argument("--chr", type=int, default=-1, metavar="N",
                        help="Select Chromosome")
    parser.add_argument(
        "--snps", type=str, default="",
        help="Comma-delimited list of SNP(s): rs1,rs2,rs3-rs6")
    parser.add_argument("--from-bp", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-bp", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument("--from-kb", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-kb", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument("--from-mb", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-mb", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument(
        "--exclude", type=str, default="",
        help="Comma-delimited list of rsids to be excluded")

    # ---- Individual selection -------------------------------------------
    # For now, I'm not implementing keep, since we don't have any real
    # meaningful need for analyzing individuals
    # PLINK does, but we don't do the QC stuff they do.
    parser.add_argument(
        "--keep", type=str, default="",
        help="Comma-delimited list of individuals to be analyzed")
    parser.add_argument(
        "--remove", type=str, default="",
        help="Comma-delimited list of individuals to be removed from analysis")

    # ---- Pedigree input -------------------------------------------------
    parser.add_argument("--file", type=str,
                        help="Prefix for .ped and .map files")
    parser.add_argument("--ped", type=argparse.FileType('r'),
                        help="PLINK compatible .ped file")
    parser.add_argument("--map", type=argparse.FileType('r'),
                        help="PLINK compatible .map file")
    parser.add_argument("--map3", action='store_true',
                        help="MAP file has only 3 columns")
    parser.add_argument("--no-sex", action='store_true',
                        help="Pedigree file doesn't have column 5 (sex)")
    parser.add_argument(
        "--no-parents", action="store_true",
        help="Pedigree file doesn't have columns 3 and 4 (parents)")
    parser.add_argument(
        "--no-fid", action="store_true",
        help="Pedigree file doesn't have column 1 (family ID)")
    parser.add_argument(
        "--no-pheno", action="store_true",
        help="Pedigree file doesn't have column 6 (phenotype")
    parser.add_argument("--liability", action="store_true",
                        help="Pedigree file has column 7 (liability)")

    # ---- Binary pedigree input ------------------------------------------
    parser.add_argument("--bfile", type=str,
                        help="Prefix for .bed, .bim and .fam files")
    parser.add_argument("--bed", type=argparse.FileType('r'),
                        help="Binary Ped file (.bed)")
    parser.add_argument("--bim", type=argparse.FileType('r'),
                        help="Binary ped marker file (.bim)")
    parser.add_argument("--fam", type=argparse.FileType('r'),
                        help="Binary ped family file (.fam)")

    # ---- Transposed pedigree input --------------------------------------
    parser.add_argument("--tfile", type=str,
                        help="Prefix for .tped and .tfam files")
    parser.add_argument("--tped", type=argparse.FileType('r'),
                        help="Transposed Pedigree file (.tped)")
    parser.add_argument("--tfam", type=argparse.FileType('r'),
                        help="Transposed pedigre Family file (.tfam)")
    parser.add_argument(
        "--compressed", action="store_true",
        help="Ped/TPed compressed with gzip (named .ped.tgz or .tped.tgz)")

    # ---- IMPUTE input ---------------------------------------------------
    parser.add_argument(
        "--impute", type=argparse.FileType('r'),
        help="File containing list of impute output for analysis")
    parser.add_argument(
        "--impute-fam", type=argparse.FileType('r'),
        help="File containing family details for impute data")
    parser.add_argument(
        "--impute-offset", type=int, default=-1,
        help="Impute file index (1 based) to begin analysis")
    parser.add_argument(
        "--impute-count", type=int, default=-1,
        help="Number of impute files to process (for this node)")
    parser.add_argument(
        "--impute-uncompressed", action="store_true",
        help="Indicate that the impute input is not gzipped, but plain text")
    parser.add_argument(
        "--impute-encoding", type=str,
        choices=['additive', 'dominant', 'recessive', 'genotype'],
        default='additive', help='Genetic model to be used')
    parser.add_argument("--impute-info-ext", type=str, default='info',
                        help="Portion of filename denotes info filename")
    parser.add_argument("--impute-gen-ext", type=str, default='gen.gz',
                        help="Portion of filename that denotes gen file")
    parser.add_argument(
        "--impute-info-thresh", type=float, default=0.4,
        help="Threshold for filtering imputed SNPs with poor 'info' values")

    # ---- MACH input -----------------------------------------------------
    parser.add_argument(
        "--mach", type=argparse.FileType('r'),
        help="File containing list of MACH output for analysis")
    parser.add_argument("--mach-offset", type=int, default=-1,
                        help="Mach file index (1 based) to begin analysis")
    parser.add_argument(
        "--mach-count", type=int, default=-1,
        help="Number of mach files to process (for this node)")
    parser.add_argument("--mach-uncompressed", action="store_true",
                        help="Indicate that the mach input is not gzipped")
    parser.add_argument(
        "--mach-chunk-size", type=int, default=100000,
        help="Max number of loci to load at once (higher increases memory requirements with some speed benefits)")
    parser.add_argument("--mach-info-ext", type=str, default="info.gz",
                        help="Portion of filename denotes info filenames")
    parser.add_argument("--mach-dose-ext", type=str, default="dose.gz",
                        help="Portion of filename that denotes dose files")
    parser.add_argument("--mach-min-rsquared", type=float, default=0.3,
                        help="Filter out loci with RSquared < this value")
    parser.add_argument(
        "--mach-chrpos", action="store_true",
        help="When true, first col in .info file must be chr:pos (additional pieces allowed)")

    # ---- Phenotypes and covariates --------------------------------------
    parser.add_argument("--pheno", type=argparse.FileType('r'),
                        help="File containing phenotypes")
    parser.add_argument("--sample-pheno", type=argparse.FileType('r'),
                        help="(Mach) Sample file containing phenotypes")
    parser.add_argument(
        "--mphenos", type=str, default="",
        help="Column number(s) for phenotype to be analyzed if number of columns > 1")
    parser.add_argument(
        "--pheno-names", type=str, default="",
        help="Name for phenotype(s) to be analyzed (must be in --pheno file)")
    parser.add_argument("--all-pheno", action="store_true",
                        help="Analyze all columns from the phenotype file")
    parser.add_argument("--covar", type=argparse.FileType('r'),
                        help="File containing covariates")
    parser.add_argument("--sample-covar", type=argparse.FileType('r'),
                        help="(Mach) Sample file containing covariates")
    parser.add_argument("--covar-numbers", type=str, default="",
                        help="Comma-separated list of covariate indices")
    parser.add_argument("--covar-names", type=str, default="",
                        help="Comma-separated list of covariate names")
    parser.add_argument(
        "--sex", action='store_true',
        help="Use sex from the pedigree file as a covariate")
    parser.add_argument("--missing-phenotype", type=float, default=-9.0,
                        help="Encoding for missing phenotypes")

    # ---- Quality filters ------------------------------------------------
    parser.add_argument("--maf", type=float, default=0.0,
                        help="Minimum MAF allowed for analysis")
    parser.add_argument("--max-maf", type=float, default=1.0,
                        help="MAX MAF allowed for analysis")
    parser.add_argument("--geno", type=float, default=1.0,
                        help="MAX per-SNP missing for analysis")
    parser.add_argument("--mind", type=float, default=1.0,
                        help="MAX per-person missing")
    parser.add_argument("--verbose", action='store_true',
                        help="Output additional data details")
    parser.set_defaults(all_pheno=False, sex=False, mach_chrpos=False)

    args = parser.parse_args(args)

    # Report version, if requested, and exit
    if args.v:
        print("%s: %s" % (os.path.basename(__file__), __version__),
              file=sys.stderr)
        sys.exit(0)
    if args.vall:
        print("%s: %s" % (os.path.basename(__file__), __version__),
              file=sys.stderr)
        print("%s: %s" % (os.path.dirname(libgwas.__file__),
                          libgwas.__version__), file=sys.stderr)
        print("%s: %s" % (os.path.dirname(scipy.__file__),
                          scipy.__version__), file=sys.stderr)
        print("%s: %s" % (os.path.dirname(numpy.__file__),
                          numpy.__version__), file=sys.stderr)
        sys.exit(0)

    ###########################################################################
    # Here we deal with the various ways we filter SNPs in and out of analysis
    # We might handle MACH files differently. We'll default the chromosome
    # to be "NA" which is how those can be returned.
    if args.mach is None or args.mach_chrpos:
        BoundaryCheck.chrom = args.chr
    else:
        if args.chr != -1:
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))
        BoundaryCheck.chrom = "NA"

    snps = args.snps.split(",")
    try:
        b = BoundaryCheck(bp=(args.from_bp, args.to_bp),
                          kb=(args.from_kb, args.to_kb),
                          mb=(args.from_mb, args.to_mb))
    except InvalidBoundarySpec as e:
        print("Invalid boundary spec associated: %s" % (e.malformed_boundary),
              file=sys.stderr)
        sys.exit(1)
    try:
        s = SnpBoundaryCheck(snps=snps)
    except InvalidBoundarySpec as e:
        print("Invalid SNP boundary defined: %s" % (e.malformed_boundary),
              file=sys.stderr)
        print(
            "SNPs must be either single or have be a range such as rs123-rs345",
            file=sys.stderr)
        sys.exit(1)

    if b.valid and s.valid:
        print(
            "Only one type of boundary conditions is permitted. Either use --from-bp, etc. or rs123-rs345. ",
            file=sys.stderr)
        sys.exit(1)

    if len(b.bounds) > 0 and not b.valid:
        if BoundaryCheck.chrom == "NA":
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))

    if s.valid:
        DataParser.boundary = s
    else:
        # If b isn't valid, we still want to potentially allow for chr and
        # SNPs, it just won't have any actual boundary listings
        b.LoadSNPs(snps)
        DataParser.boundary = b
    DataParser.boundary.LoadExclusions(snps=args.exclude.split(","))

    ###########################################################################
    # Setup the various Dataset filter criteria
    DataParser.min_maf = args.maf
    DataParser.max_maf = args.max_maf
    DataParser.snp_miss_tol = args.geno
    DataParser.ind_miss_tol = args.mind
    DataParser.ind_exclusions = ParseIndList(args.remove)
    PhenoCovar.sex_as_covariate = args.sex
    if args.compressed:
        DataParser.compressed_pedigree = True
    DataParser.has_sex = not args.no_sex
    DataParser.has_parents = not args.no_parents
    DataParser.has_fid = not args.no_fid
    DataParser.has_pheno = not args.no_pheno
    DataParser.has_liability = args.liability

    pheno_covar = PhenoCovar()
    self.verbose = False
    if args.verbose:
        self.verbose = True

    ###########################################################################
    # Build the dataset parser matching whichever input format was supplied
    if args.file is not None or args.ped or args.map:
        # --ped and --map must be given together (exactly one is an error)
        if bool(args.ped) != bool(args.map):
            print(
                "When analyzing pedigree data, both .map and .ped must be specified",
                file=sys.stderr)
            sys.exit(1)
        if args.ped:
            dataset = pedigree_parser.Parser(args.map.name, args.ped.name)
        else:
            dataset = pedigree_parser.Parser("%s.map" % (args.file),
                                             "%s.ped" % (args.file))
        dataset.load_mapfile(map3=args.map3)
        dataset.load_genotypes(pheno_covar)
    elif args.tfile is not None or args.tped or args.tfam:
        # --tped and --tfam must be given together
        if bool(args.tped) != bool(args.tfam):
            print(
                "When analyzing transposed pedigree data, both .tfam and .tped must be specified",
                file=sys.stderr)
            sys.exit(1)
        if args.tped:
            dataset = transposed_pedigree_parser.Parser(
                args.tfam.name, args.tped.name)
        else:
            dataset = transposed_pedigree_parser.Parser(
                "%s.tfam" % (args.tfile), "%s.tped" % (args.tfile))
        dataset.load_tfam(pheno_covar)
        dataset.load_genotypes()
    elif args.bfile is not None:
        dataset = bed_parser.Parser("%s.fam" % (args.bfile),
                                    "%s.bim" % (args.bfile),
                                    "%s.bed" % (args.bfile))
        dataset.load_bim(map3=args.map3)
        dataset.load_fam(pheno_covar)
        dataset.load_genotypes()
    elif args.bed or args.bim or args.fam:
        # All three binary pedigree files are required.  The original
        # and/or expression lacked grouping; this states the intent directly.
        if not (args.bed and args.bim and args.fam):
            print(
                "When analyzing binary pedigree data, .bed, .bim and .fam files must be provided",
                file=sys.stderr)
            sys.exit(1)
        # Pass filenames (not the argparse file objects), exactly as the
        # --bfile branch and every other explicit-file branch does.
        dataset = bed_parser.Parser(args.fam.name, args.bim.name,
                                    args.bed.name)
        dataset.load_bim(map3=args.map3)
        dataset.load_fam(pheno_covar)
        dataset.load_genotypes()
    elif args.impute:
        DataParser.compressed_pedigree = not args.impute_uncompressed
        if (args.impute_offset > 0 and args.impute_count == -1) or (
                args.impute_offset == -1 and args.impute_count > 0):
            print(
                "--impute-count and --impute_offset must both > 0 if one is set other than -1. ",
                file=sys.stderr)
            sys.exit(1)
        if DataParser.snp_miss_tol != 1.0:
            print("--geno does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if DataParser.ind_miss_tol != 1.0:
            print("--mind does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        impute_parser.SetEncoding(args.impute_encoding)
        impute_parser.Parser.info_ext = args.impute_info_ext
        impute_parser.Parser.info_threshold = args.impute_info_thresh
        libgwas.ExitIf(
            "--impute-fam is required for when processing imputed data",
            args.impute_fam is None)
        archives, chroms, infos = self.ParseImputeFile(
            args.impute.name, args.impute_offset, args.impute_count)
        dataset = impute_parser.Parser(args.impute_fam.name, archives,
                                       chroms, infos)
        dataset.load_family_details(pheno_covar)
        dataset.load_genotypes()
    elif args.mach:
        DataParser.compressed_pedigree = not args.mach_uncompressed
        # Offset and count must be set together.  The second half used to
        # test args.impute_count, so a lone --mach-count went undetected.
        if (args.mach_offset > 0 and args.mach_count == -1) or (
                args.mach_offset == -1 and args.mach_count > 0):
            print(
                "--mach-count and --mach_offset must both be > 0 if one is set other than -1. ",
                file=sys.stderr)
            sys.exit(1)
        if DataParser.snp_miss_tol != 1.0:
            print("--geno does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if DataParser.ind_miss_tol != 1.0:
            print("--mind does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if BoundaryCheck.chrom != "NA" and not args.mach_chrpos:
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))
        mach_parser.Parser.chrpos_encoding = args.mach_chrpos
        mach_parser.Parser.info_ext = args.mach_info_ext
        mach_parser.Parser.dosage_ext = args.mach_dose_ext
        mach_parser.Parser.chunk_stride = args.mach_chunk_size
        mach_parser.Parser.min_rsquared = args.mach_min_rsquared
        archives, infos = self.ParseMachFile(args.mach.name,
                                             args.mach_offset,
                                             args.mach_count)
        dataset = mach_parser.Parser(archives, infos)
        dataset.load_family_details(pheno_covar)
        dataset.load_genotypes()
    else:
        parser.print_usage(sys.stderr)
        print(
            "\nNo data has been specified. Users must specify either pedigree or transposed pedigree to continue",
            file=sys.stderr)
        sys.exit(1)

    ###########################################################################
    # Optional external phenotype / covariate files
    if args.pheno or args.sample_pheno:
        mphenos = []
        if args.mphenos != "":
            mphenos = args.mphenos.split(",")
        nphenos = []
        if args.pheno_names != "":
            nphenos = args.pheno_names.split(",")
        if len(mphenos) + len(nphenos) == 0 and not args.all_pheno:
            # The original message stopped mid-sentence after "when ".
            libgwas.Exit(
                "You must select one or more phenotypes when " +
                "using a phenotype file " +
                "(see --mphenos, --pheno-names and --all-pheno)")
        # --sample-pheno takes precedence over --pheno when both are given
        sample_file = False
        pheno_filename = args.pheno
        if args.sample_pheno:
            pheno_filename = args.sample_pheno
            sample_file = True
        pheno_covar.load_phenofile(pheno_filename, mphenos, nphenos,
                                   sample_file)

    if args.covar:
        pheno_covar.load_covarfile(args.covar,
                                   args.covar_numbers.split(","),
                                   args.covar_names.split(","))
    pheno_covar.do_standardize_variables = True
    return dataset, pheno_covar
dataset.load_genotypes(pheno_covar) elif args.tfile != None or args.tped or args.tfam: if args.tped and not args.tfam or args.tfam and not args.tped: print >> sys.stderr, "When analyzing transposed pedigree data, both .tfam and .tped must be specified" sys.exit(1) if args.tped: dataset = transposed_pedigree_parser.Parser( args.tfam.name, args.tped.name) else: dataset = transposed_pedigree_parser.Parser( "%s.tfam" % (args.tfile), "%s.tped" % (args.tfile)) dataset.load_tfam(pheno_covar) dataset.load_genotypes() elif args.bfile != None: dataset = bed_parser.Parser("%s.fam" % (args.bfile), "%s.bim" % (args.bfile), "%s.bed" % (args.bfile)) dataset.load_bim(map3=args.map3) dataset.load_fam(pheno_covar) dataset.load_genotypes() elif args.bed or args.bim or args.fam: if (args.bed and not args.fam or not args.bim) or ( args.bim and not args.bed or not args.fam) or (args.fam and not args.bed or not args.bim): print >> sys.stderr, "When analyzing binary pedigree data, .bed, .bim and .fam files must be provided" sys.exit(1) dataset = bed_parser.Parser(args.fam, args.bim, args.bed) dataset.load_bim(map3=args.map3) dataset.load_fam(pheno_covar) dataset.load_genotypes()