def testLongerList(self):
    """Iterate both archives three times over and verify recessive dosages.

    The archive list and chromosome list are each repeated three times, so
    the parser is expected to yield 60 loci whose positions and genotype
    values match three stacked copies of the fixture data.
    """
    impute_parser.encoding = impute_parser.Encoding.Recessive
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    archives = [self.gen_file, self.gen_file2] * 3
    parser = impute_parser.Parser(self.fam_file,
                                  archives,
                                  chroms=["3", "4"] * 3)
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    # Expected data is simply the single-pass fixture repeated three times.
    expected_values = numpy.vstack((self.recessive_encoding, ) * 3)
    expected_positions = self.positions * 3

    locus_count = 0
    for snp in parser:
        self.assertEqual(expected_positions[locus_count], snp.pos)
        for i, expected in enumerate(expected_values[locus_count]):
            self.assertAlmostEqual(expected, snp.genotype_data[i], places=3)
        locus_count += 1

    self.assertEqual(60, locus_count)
def testBoundariedMiddle(self):
    """Restrict to a bp window on chromosome 4 and verify the survivors.

    Fixture loci whose position falls outside [30734, 33528] are counted as
    dropped; 12 such loci are expected to be skipped before or between the
    loci the parser yields.
    """
    lower_bp, upper_bp = 30734, 33528
    BoundaryCheck.chrom = 4
    DataParser.boundary = BoundaryCheck(bp=[lower_bp, upper_bp])
    impute_parser.encoding = impute_parser.Encoding.Recessive
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.gen_file, self.gen_file2],
                                  chroms=[3, 4])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    cursor = 0
    dropped = 0
    for snp in parser:
        # Advance past fixture loci that lie outside the requested window.
        while not (lower_bp <= self.positions[cursor] <= upper_bp):
            cursor += 1
            dropped += 1

        self.assertEqual(self.positions[cursor], snp.pos)
        for i, expected in enumerate(self.recessive_encoding[cursor]):
            self.assertAlmostEqual(expected, snp.genotype_data[i], places=3)
        cursor += 1

    self.assertEqual(12, dropped)
def testGenotypeValues(self):
    """Verify the Genotype encoding picks the most probable genotype call.

    For each individual the expected call is 1 when the heterozygote
    probability is at least as large as both others, else 0 when the AA
    probability is, else 2.
    """
    impute_parser.encoding = impute_parser.Encoding.Genotype
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.gen_file, self.gen_file2],
                                  chroms=["3", "4"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    locus_count = 0
    for snp in parser:
        self.assertEqual(self.positions[locus_count], snp.pos)
        for i, probabilities in enumerate(self.raw[locus_count]):
            AA, Aa, aa = probabilities
            # Ties favour the heterozygote first, then the AA homozygote.
            if Aa >= AA and Aa >= aa:
                expected_call = 1
            elif AA >= Aa and AA >= aa:
                expected_call = 0
            else:
                expected_call = 2
            self.assertAlmostEqual(expected_call,
                                   snp.genotype_data[i],
                                   places=3)
        locus_count += 1

    self.assertEqual(20, locus_count)
def testRawValuesUncompressed(self):
    """Read the plain-text gen files and verify raw genotype probabilities.

    Also checks that each locus reports the chromosome of the archive it
    came from.
    """
    DataParser.compressed_pedigree = False
    impute_parser.Parser.gen_ext = "gen"
    impute_parser.encoding = impute_parser.Encoding.Raw
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.uncmp_1, self.uncmp_2],
                                  chroms=["3", "4"])
    expected_chroms = (['3'] * 10 + ['4'] * 10) * 2
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    locus_count = 0
    for snp in parser:
        self.assertEqual(self.positions[locus_count], snp.pos)
        self.assertEqual(expected_chroms[locus_count], snp.chr)
        for i, probabilities in enumerate(self.raw[locus_count]):
            # Compare the AA, Aa and aa probabilities one by one.
            for j in range(3):
                self.assertAlmostEqual(probabilities[j],
                                       snp.genotype_data[i][j],
                                       places=3)
        locus_count += 1

    self.assertEqual(20, locus_count)
def testFamilyData(self):
    """Verify pedigree IDs, phenotypes and sex covariate loaded from the sample file."""
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.gen_file, self.gen_file2],
                                  chroms=["3", "4"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    for idx, ind_id in enumerate(self.ind_ids):
        self.assertTrue(ind_id in pheno_covar.pedigree_data)
        self.assertEqual(self.phenotypes[idx],
                         pheno_covar.phenotype_data[0][idx])
        self.assertEqual(self.sex[idx], pheno_covar.covariate_data[0][idx])
def testAlelles(self):
    """Verify position, major/minor alleles and rsid for the first archive."""
    pheno_covar = PhenoCovar()
    parser = impute_parser.Parser(self.fam_file, [self.gen_file],
                                  chroms=["3"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    locus_count = 0
    for snp in parser:
        expected_pos = self.positions[locus_count]
        self.assertEqual(expected_pos, snp.pos)
        self.assertEqual(snp.major_allele, self.allele_1[locus_count])
        self.assertEqual(snp.minor_allele, self.allele_2[locus_count])
        self.assertEqual(snp.rsid, self.rsids[locus_count])
        locus_count += 1

    self.assertEqual(10, locus_count)
def testFilterMAF(self):
    """Set a minimum MAF and verify only loci at or above it are yielded."""
    DataParser.min_maf = 0.45
    pheno_covar = PhenoCovar()
    parser = impute_parser.Parser(self.fam_file, [self.gen_file],
                                  chroms=["3"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    cursor = 0
    for snp in parser:
        # Skip fixture loci whose frequency is below the configured minimum.
        while numpy.mean(self.mafs[cursor]) < DataParser.min_maf:
            cursor += 1

        self.assertEqual(self.positions[cursor], snp.pos)
        self.assertEqual(snp.major_allele, self.allele_1[cursor])
        self.assertEqual(snp.minor_allele, self.allele_2[cursor])
        self.assertEqual(snp.rsid, self.rsids[cursor])
        cursor += 1
def testMAF(self):
    """Verify the reported MAF for each locus of the first archive.

    The expected frequency is folded so that it never exceeds 0.5.
    """
    pheno_covar = PhenoCovar()
    parser = impute_parser.Parser(self.fam_file, [self.gen_file],
                                  chroms=["3"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    locus_count = 0
    for snp in parser:
        self.assertEqual(self.positions[locus_count], snp.pos)
        expected_maf = numpy.mean(self.mafs[locus_count])
        if expected_maf > 0.5:
            expected_maf = 1.0 - expected_maf
        self.assertAlmostEqual(expected_maf, snp.maf, places=3)
        locus_count += 1

    self.assertEqual(10, locus_count)
def testChromosomes(self):
    """Verify each locus reports the chromosome assigned to its archive."""
    impute_parser.encoding = impute_parser.Encoding.Raw
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.gen_file, self.gen_file2],
                                  chroms=["3", "4"])
    # Ten loci are expected from each archive, in archive order.
    expected_chroms = ['3'] * 10 + ['4'] * 10
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    locus_count = 0
    for snp in parser:
        self.assertEqual(self.positions[locus_count], snp.pos)
        self.assertEqual(expected_chroms[locus_count], snp.chr)
        locus_count += 1

    self.assertEqual(20, locus_count)
def testAdditiveValuesFilterInfo(self):
    """Apply an info threshold of 0.4 and verify the additive dosages.

    Comparison starts at fixture index 4, i.e. the first four loci of the
    archive are expected to be filtered out.
    """
    impute_parser.Parser.info_threshold = 0.4
    impute_parser.encoding = impute_parser.Encoding.Additive
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file, [self.gen_file],
                                  chroms=["3"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    cursor = 4
    for snp in parser:
        self.assertEqual(self.positions[cursor], snp.pos)
        for i, expected in enumerate(self.additive_encoding[cursor]):
            self.assertAlmostEqual(expected, snp.genotype_data[i], places=3)
        cursor += 1

    self.assertEqual(10, cursor)
def testRecessiveValues(self):
    """Verify recessive-encoded dosages across both compressed archives."""
    impute_parser.encoding = impute_parser.Encoding.Recessive
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.gen_file, self.gen_file2],
                                  chroms=["3", "4"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    locus_count = 0
    for snp in parser:
        self.assertEqual(self.positions[locus_count], snp.pos)
        for i, expected in enumerate(self.recessive_encoding[locus_count]):
            self.assertAlmostEqual(expected, snp.genotype_data[i], places=3)
        locus_count += 1

    self.assertEqual(20, locus_count)
def testBoundariedUpper(self):
    """Restrict to bp 21000-50000 on chromosome 3 and verify the survivors.

    Comparison starts at fixture index 6, the first locus expected to fall
    inside the window.
    """
    BoundaryCheck.chrom = 3
    DataParser.boundary = BoundaryCheck(bp=[21000, 50000])
    impute_parser.encoding = impute_parser.Encoding.Recessive
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.gen_file, self.gen_file2],
                                  chroms=[3, 4])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    cursor = 6
    for snp in parser:
        self.assertEqual(self.positions[cursor], snp.pos)
        for i, expected in enumerate(self.recessive_encoding[cursor]):
            self.assertAlmostEqual(expected, snp.genotype_data[i], places=3)
        cursor += 1
def testFilterSNP(self):
    """Exclude two rsids and verify they are the only loci skipped."""
    DataParser.boundary.LoadExclusions(snps=["rs132670", "rs132938"])
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.gen_file, self.gen_file2],
                                  chroms=["3", "4"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    cursor = 0
    dropped = 0
    for snp in parser:
        # Step over fixture loci that were explicitly excluded.
        while self.rsids[cursor] in DataParser.boundary.ignored_rs:
            dropped += 1
            cursor += 1

        self.assertEqual(self.positions[cursor], snp.pos)
        self.assertEqual(snp.major_allele, self.allele_1[cursor])
        self.assertEqual(snp.minor_allele, self.allele_2[cursor])
        self.assertEqual(snp.rsid, self.rsids[cursor])
        cursor += 1

    self.assertEqual(2, dropped)
def testAdditiveValuesUncompressed(self):
    """Verify additive dosages when reading the plain-text gen files."""
    impute_parser.Parser.gen_ext = "gen"
    DataParser.compressed_pedigree = False
    impute_parser.encoding = impute_parser.Encoding.Additive
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(self.fam_file,
                                  [self.uncmp_1, self.uncmp_2],
                                  chroms=["3", "4"])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    locus_count = 0
    for snp in parser:
        self.assertEqual(self.positions[locus_count], snp.pos)
        for i, expected in enumerate(self.additive_encoding[locus_count]):
            self.assertAlmostEqual(expected, snp.genotype_data[i], places=3)
        locus_count += 1

    self.assertEqual(20, locus_count)
def testInfoFileUse(self):
    """Verify explicitly supplied info files are used, with dominant encoding."""
    # A bogus gen_ext guarantees the info filenames cannot be derived from
    # the gen filenames, so success proves the provided files were used.
    impute_parser.Parser.gen_ext = 'asdf'
    impute_parser.encoding = impute_parser.Encoding.Dominant
    PhenoCovar.sex_as_covariate = True
    pheno_covar = PhenoCovar()

    parser = impute_parser.Parser(
        self.fam_file,
        [self.gen_file, self.gen_file2],
        chroms=["3", "4"],
        info_files=[self.info_file1, self.info_file2])
    parser.load_family_details(pheno_covar)
    parser.load_genotypes()

    locus_count = 0
    for snp in parser:
        self.assertEqual(self.positions[locus_count], snp.pos)
        for i, expected in enumerate(self.dominant_encoding[locus_count]):
            self.assertAlmostEqual(expected, snp.genotype_data[i], places=3)
        locus_count += 1

    self.assertEqual(20, locus_count)
def LoadCmdLine(self, args=None):
    """Parse user arguments using argparse and set up components

    Builds the command line parser, applies the SNP/individual filtering
    options to the class-level settings of ``BoundaryCheck``, ``DataParser``
    and ``PhenoCovar``, instantiates the dataset parser that matches the
    supplied input files, and loads any phenotype/covariate files.

    :param args: list of command line tokens to parse. ``None`` (the
        default) makes argparse read ``sys.argv[1:]`` at call time.
    :return: tuple ``(dataset, pheno_covar)`` -- the loaded dataset parser
        and the ``PhenoCovar`` instance populated alongside it.

    The process is terminated (``sys.exit`` / ``libgwas.Exit``) when the
    version is requested, when options conflict, or when no input data is
    specified. As a side effect ``self.verbose`` is set.
    """
    parser = argparse.ArgumentParser(
        description="MV Test: " + __version__,
        epilog=(" mvtest.py is uses many of the same arguments as plink, "
                "but there are a few differences, so please consider the "
                "list above carefully. "))

    parser.add_argument("-v", action='store_true',
                        help="Print version number")
    parser.add_argument(
        "--vall", action='store_true',
        help="Print version number along with each dependency")

    # ---- SNP / positional filtering
    parser.add_argument("--chr", type=int, default=-1, metavar="N",
                        help="Select Chromosome")
    parser.add_argument(
        "--snps", type=str, default="",
        help="Comma-delimited list of SNP(s): rs1,rs2,rs3-rs6")
    parser.add_argument("--from-bp", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-bp", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument("--from-kb", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-kb", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument("--from-mb", type=int, metavar="START",
                        help="SNP range start")
    parser.add_argument("--to-mb", type=int, metavar="END",
                        help="SNP range end")
    parser.add_argument(
        "--exclude", type=str, default="",
        help="Comma-delimited list of rsids to be excluded")

    # For now, I'm not implementing keep, since we don't have any real
    # meaningful need for analyzing individuals. PLINK does, but we don't
    # do the QC stuff they do. (The option is accepted but not acted on.)
    parser.add_argument(
        "--keep", type=str, default="",
        help="Comma-delimited list of individuals to be analyzed")
    parser.add_argument(
        "--remove", type=str, default="",
        help="Comma-delimited list of individuals to be removed from analysis")

    # ---- Standard pedigree input
    parser.add_argument("--file", type=str,
                        help="Prefix for .ped and .map files")
    parser.add_argument("--ped", type=argparse.FileType('r'),
                        help="PLINK compatible .ped file")
    parser.add_argument("--map", type=argparse.FileType('r'),
                        help="PLINK compatible .map file")
    parser.add_argument("--map3", action='store_true',
                        help="MAP file has only 3 columns")
    parser.add_argument("--no-sex", action='store_true',
                        help="Pedigree file doesn't have column 5 (sex)")
    parser.add_argument(
        "--no-parents", action="store_true",
        help="Pedigree file doesn't have columns 3 and 4 (parents)")
    parser.add_argument(
        "--no-fid", action="store_true",
        help="Pedigree file doesn't have column 1 (family ID)")
    parser.add_argument(
        "--no-pheno", action="store_true",
        help="Pedigree file doesn't have column 6 (phenotype")
    parser.add_argument("--liability", action="store_true",
                        help="Pedigree file has column 7 (liability)")

    # ---- Binary pedigree input
    parser.add_argument("--bfile", type=str,
                        help="Prefix for .bed, .bim and .fam files")
    parser.add_argument("--bed", type=argparse.FileType('r'),
                        help="Binary Ped file (.bed)")
    parser.add_argument("--bim", type=argparse.FileType('r'),
                        help="Binary ped marker file (.bim)")
    parser.add_argument("--fam", type=argparse.FileType('r'),
                        help="Binary ped family file (.fam)")

    # ---- Transposed pedigree input
    parser.add_argument("--tfile", type=str,
                        help="Prefix for .tped and .tfam files")
    parser.add_argument("--tped", type=argparse.FileType('r'),
                        help="Transposed Pedigree file (.tped)")
    parser.add_argument("--tfam", type=argparse.FileType('r'),
                        help="Transposed pedigre Family file (.tfam)")
    parser.add_argument(
        "--compressed", action="store_true",
        help="Ped/TPed compressed with gzip (named .ped.tgz or .tped.tgz)")

    # ---- IMPUTE input
    parser.add_argument(
        "--impute", type=argparse.FileType('r'),
        help="File containing list of impute output for analysis")
    parser.add_argument(
        "--impute-fam", type=argparse.FileType('r'),
        help="File containing family details for impute data")
    parser.add_argument(
        "--impute-offset", type=int, default=-1,
        help="Impute file index (1 based) to begin analysis")
    parser.add_argument(
        "--impute-count", type=int, default=-1,
        help="Number of impute files to process (for this node)")
    parser.add_argument(
        "--impute-uncompressed", action="store_true",
        help="Indicate that the impute input is not gzipped, but plain text")
    parser.add_argument(
        "--impute-encoding", type=str,
        choices=['additive', 'dominant', 'recessive', 'genotype'],
        default='additive',
        help='Genetic model to be used')
    parser.add_argument("--impute-info-ext", type=str, default='info',
                        help="Portion of filename denotes info filename")
    parser.add_argument("--impute-gen-ext", type=str, default='gen.gz',
                        help="Portion of filename that denotes gen file")
    parser.add_argument(
        "--impute-info-thresh", type=float, default=0.4,
        help="Threshold for filtering imputed SNPs with poor 'info' values")

    # ---- MACH input
    parser.add_argument(
        "--mach", type=argparse.FileType('r'),
        help="File containing list of MACH output for analysis")
    parser.add_argument("--mach-offset", type=int, default=-1,
                        help="Mach file index (1 based) to begin analysis")
    parser.add_argument(
        "--mach-count", type=int, default=-1,
        help="Number of mach files to process (for this node)")
    parser.add_argument("--mach-uncompressed", action="store_true",
                        help="Indicate that the mach input is not gzipped")
    parser.add_argument(
        "--mach-chunk-size", type=int, default=100000,
        help="Max number of loci to load at once (higher increases memory requirements with some speed benefits)")
    parser.add_argument("--mach-info-ext", type=str, default="info.gz",
                        help="Portion of filename denotes info filenames")
    parser.add_argument("--mach-dose-ext", type=str, default="dose.gz",
                        help="Portion of filename that denotes dose files")
    parser.add_argument("--mach-min-rsquared", type=float, default=0.3,
                        help="Filter out loci with RSquared < this value")
    parser.add_argument(
        "--mach-chrpos", action="store_true",
        help="When true, first col in .info file must be chr:pos (additional pieces allowed)")

    # ---- Phenotypes and covariates
    parser.add_argument("--pheno", type=argparse.FileType('r'),
                        help="File containing phenotypes")
    parser.add_argument("--sample-pheno", type=argparse.FileType('r'),
                        help="(Mach) Sample file containing phenotypes")
    parser.add_argument(
        "--mphenos", type=str, default="",
        help="Column number(s) for phenotype to be analyzed if number of columns > 1")
    parser.add_argument(
        "--pheno-names", type=str, default="",
        help="Name for phenotype(s) to be analyzed (must be in --pheno file)")
    parser.add_argument("--all-pheno", action="store_true",
                        help="Analyze all columns from the phenotype file")
    parser.add_argument("--covar", type=argparse.FileType('r'),
                        help="File containing covariates")
    # NOTE(review): --sample-covar is accepted but nothing below reads
    # args.sample_covar -- confirm whether it should be loaded.
    parser.add_argument("--sample-covar", type=argparse.FileType('r'),
                        help="(Mach) Sample file containing covariates")
    parser.add_argument("--covar-numbers", type=str, default="",
                        help="Comma-separated list of covariate indices")
    parser.add_argument("--covar-names", type=str, default="",
                        help="Comma-separated list of covariate names")
    parser.add_argument(
        "--sex", action='store_true',
        help="Use sex from the pedigree file as a covariate")
    parser.add_argument("--missing-phenotype", type=float, default=-9.0,
                        help="Encoding for missing phenotypes")

    # ---- Quality filters
    parser.add_argument("--maf", type=float, default=0.0,
                        help="Minimum MAF allowed for analysis")
    parser.add_argument("--max-maf", type=float, default=1.0,
                        help="MAX MAF allowed for analysis")
    parser.add_argument("--geno", type=float, default=1.0,
                        help="MAX per-SNP missing for analysis")
    parser.add_argument("--mind", type=float, default=1.0,
                        help="MAX per-person missing")
    parser.add_argument("--verbose", action='store_true',
                        help="Output additional data details")

    parser.set_defaults(all_pheno=False, sex=False, mach_chrpos=False)
    # argparse falls back to sys.argv[1:] when handed None, so the command
    # line is read when the method is called rather than at import time.
    args = parser.parse_args(args)

    # Report version, if requested, and exit
    if args.v:
        print("%s: %s" % (os.path.basename(__file__), __version__),
              file=sys.stderr)
        sys.exit(0)

    if args.vall:
        print("%s: %s" % (os.path.basename(__file__), __version__),
              file=sys.stderr)
        print("%s: %s" % (os.path.dirname(libgwas.__file__),
                          libgwas.__version__), file=sys.stderr)
        print("%s: %s" % (os.path.dirname(scipy.__file__),
                          scipy.__version__), file=sys.stderr)
        print("%s: %s" % (os.path.dirname(numpy.__file__),
                          numpy.__version__), file=sys.stderr)
        sys.exit(0)

    ###########################################################################
    # Here we deal with the various ways we filter SNPs in and out of analysis
    # We might handle MACH files differently. We'll default the chromosome
    # to be "NA" which is how those can be returned.
    if args.mach is None or args.mach_chrpos:
        BoundaryCheck.chrom = args.chr
    else:
        if args.chr != -1:
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))
        BoundaryCheck.chrom = "NA"

    snps = args.snps.split(",")
    try:
        b = BoundaryCheck(bp=(args.from_bp, args.to_bp),
                          kb=(args.from_kb, args.to_kb),
                          mb=(args.from_mb, args.to_mb))
    except InvalidBoundarySpec as e:
        print("Invalid boundary spec associated: %s" % (e.malformed_boundary),
              file=sys.stderr)
        sys.exit(1)

    try:
        s = SnpBoundaryCheck(snps=snps)
    except InvalidBoundarySpec as e:
        print("Invalid SNP boundary defined: %s" % (e.malformed_boundary),
              file=sys.stderr)
        print(
            "SNPs must be either single or have be a range such as rs123-rs345",
            file=sys.stderr)
        sys.exit(1)

    if b.valid and s.valid:
        print(
            "Only one type of boundary conditions is permitted. Either use --from-bp, etc. or rs123-rs345. ",
            file=sys.stderr)
        sys.exit(1)

    if len(b.bounds) > 0 and not b.valid:
        if BoundaryCheck.chrom == "NA":
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))

    if s.valid:
        DataParser.boundary = s
    else:
        # If b isn't valid, we still want to potentially allow for chr and
        # SNPs, it just won't have any actual boundary listings
        b.LoadSNPs(snps)
        DataParser.boundary = b
    DataParser.boundary.LoadExclusions(snps=args.exclude.split(","))

    ###########################################################################
    # Setup the various Dataset filter criteria
    DataParser.min_maf = args.maf
    DataParser.max_maf = args.max_maf
    DataParser.snp_miss_tol = args.geno
    DataParser.ind_miss_tol = args.mind

    DataParser.ind_exclusions = ParseIndList(args.remove)

    PhenoCovar.sex_as_covariate = args.sex

    # Only ever switched on here; an existing True setting is left alone.
    if args.compressed:
        DataParser.compressed_pedigree = True

    DataParser.has_sex = not args.no_sex
    DataParser.has_parents = not args.no_parents
    DataParser.has_fid = not args.no_fid
    DataParser.has_pheno = not args.no_pheno
    DataParser.has_liability = args.liability

    pheno_covar = PhenoCovar()
    self.verbose = bool(args.verbose)

    if args.file is not None or args.ped or args.map:
        # --ped and --map must be given together (or not at all).
        if bool(args.ped) != bool(args.map):
            print(
                "When analyzing pedigree data, both .map and .ped must be specified",
                file=sys.stderr)
            sys.exit(1)
        if args.ped:
            dataset = pedigree_parser.Parser(args.map.name, args.ped.name)
        else:
            dataset = pedigree_parser.Parser("%s.map" % (args.file),
                                             "%s.ped" % (args.file))

        dataset.load_mapfile(map3=args.map3)
        dataset.load_genotypes(pheno_covar)
    elif args.tfile is not None or args.tped or args.tfam:
        # --tped and --tfam must be given together (or not at all).
        if bool(args.tped) != bool(args.tfam):
            print(
                "When analyzing transposed pedigree data, both .tfam and .tped must be specified",
                file=sys.stderr)
            sys.exit(1)
        if args.tped:
            dataset = transposed_pedigree_parser.Parser(
                args.tfam.name, args.tped.name)
        else:
            dataset = transposed_pedigree_parser.Parser(
                "%s.tfam" % (args.tfile), "%s.tped" % (args.tfile))
        dataset.load_tfam(pheno_covar)
        dataset.load_genotypes()
    elif args.bfile is not None:
        dataset = bed_parser.Parser("%s.fam" % (args.bfile),
                                    "%s.bim" % (args.bfile),
                                    "%s.bed" % (args.bfile))
        dataset.load_bim(map3=args.map3)
        dataset.load_fam(pheno_covar)
        dataset.load_genotypes()
    elif args.bed or args.bim or args.fam:
        # All three files are required once any one of them is supplied.
        if not (args.bed and args.bim and args.fam):
            print(
                "When analyzing binary pedigree data, .bed, .bim and .fam files must be provided",
                file=sys.stderr)
            sys.exit(1)
        # argparse hands back open file objects; pass their names so this
        # branch matches --bfile (and every other parser), which get paths.
        dataset = bed_parser.Parser(args.fam.name, args.bim.name,
                                    args.bed.name)
        dataset.load_bim(map3=args.map3)
        dataset.load_fam(pheno_covar)
        dataset.load_genotypes()
    elif args.impute:
        DataParser.compressed_pedigree = not args.impute_uncompressed
        if (args.impute_offset > 0 and args.impute_count == -1) or (
                args.impute_offset == -1 and args.impute_count > 0):
            print(
                "--impute-count and --impute_offset must both > 0 if one is set other than -1. ",
                file=sys.stderr)
            sys.exit(1)
        if DataParser.snp_miss_tol != 1.0:
            print("--geno does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if DataParser.ind_miss_tol != 1.0:
            print("--mind does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        impute_parser.SetEncoding(args.impute_encoding)
        impute_parser.Parser.info_ext = args.impute_info_ext
        # Previously parsed but never applied, so --impute-gen-ext was
        # silently ignored.
        impute_parser.Parser.gen_ext = args.impute_gen_ext
        impute_parser.Parser.info_threshold = args.impute_info_thresh
        libgwas.ExitIf(
            "--impute-fam is required for when processing imputed data",
            args.impute_fam is None)
        archives, chroms, infos = self.ParseImputeFile(
            args.impute.name, args.impute_offset, args.impute_count)
        dataset = impute_parser.Parser(args.impute_fam.name, archives,
                                       chroms, infos)
        dataset.load_family_details(pheno_covar)
        dataset.load_genotypes()
    elif args.mach:
        DataParser.compressed_pedigree = not args.mach_uncompressed
        # Offset and count must be supplied together. (This used to test
        # --impute-count by mistake.)
        if (args.mach_offset > 0 and args.mach_count == -1) or (
                args.mach_offset == -1 and args.mach_count > 0):
            print(
                "--mach-count and --mach_offset must both be > 0 if one is set other than -1. ",
                file=sys.stderr)
            sys.exit(1)
        if DataParser.snp_miss_tol != 1.0:
            print("--geno does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if DataParser.ind_miss_tol != 1.0:
            print("--mind does not have any impact on imputed data",
                  file=sys.stderr)
            sys.exit(1)
        if BoundaryCheck.chrom != "NA" and not args.mach_chrpos:
            libgwas.Exit(
                ("Positional based filtering (--chr, --from/--to)" +
                 " only work with mach_chrpos. See manual for details."))
        mach_parser.Parser.chrpos_encoding = args.mach_chrpos
        mach_parser.Parser.info_ext = args.mach_info_ext
        mach_parser.Parser.dosage_ext = args.mach_dose_ext
        mach_parser.Parser.chunk_stride = args.mach_chunk_size
        mach_parser.Parser.min_rsquared = args.mach_min_rsquared
        archives, infos = self.ParseMachFile(args.mach.name,
                                             args.mach_offset,
                                             args.mach_count)
        dataset = mach_parser.Parser(archives, infos)
        dataset.load_family_details(pheno_covar)
        dataset.load_genotypes()
    else:
        parser.print_usage(sys.stderr)
        print(
            "\nNo data has been specified. Users must specify either pedigree or transposed pedigree to continue",
            file=sys.stderr)
        sys.exit(1)

    if args.pheno or args.sample_pheno:
        mphenos = args.mphenos.split(",") if args.mphenos != "" else []
        nphenos = args.pheno_names.split(",") if args.pheno_names != "" else []

        if not mphenos and not nphenos and not args.all_pheno:
            # NOTE(review): this message looks truncated -- confirm the
            # intended wording before changing it.
            libgwas.Exit("You must select one or more phenotypes when ")

        # A sample-format phenotype file takes precedence over --pheno.
        if args.sample_pheno:
            pheno_filename = args.sample_pheno
            sample_file = True
        else:
            pheno_filename = args.pheno
            sample_file = False
        pheno_covar.load_phenofile(pheno_filename, mphenos, nphenos,
                                   sample_file)

    if args.covar:
        pheno_covar.load_covarfile(args.covar,
                                   args.covar_numbers.split(","),
                                   args.covar_names.split(","))
    pheno_covar.do_standardize_variables = True
    return dataset, pheno_covar
sys.exit(1) if DataParser.snp_miss_tol != 1.0: print >> sys.stderr, "--geno does not have any impact on imputed data" sys.exit(1) if DataParser.ind_miss_tol != 1.0: print >> sys.stderr, "--mind does not have any impact on imputed data" sys.exit(1) impute_parser.SetEncoding(args.impute_encoding) impute_parser.Parser.info_ext = args.impute_info_ext impute_parser.Parser.info_threshold = args.impute_info_thresh libgwas.ExitIf( "--impute-fam is required for when processing imputed data", args.impute_fam == None) archives, chroms, infos = self.ParseImputeFile( args.impute.name, args.impute_offset, args.impute_count) dataset = impute_parser.Parser(args.impute_fam.name, archives, chroms, infos) dataset.load_family_details(pheno_covar) dataset.load_genotypes() elif args.mach: DataParser.compressed_pedigree = not args.mach_uncompressed if (args.mach_offset > 0 and args.mach_count == -1) or (args.mach_offset == -1 and args.impute_count > 0): print >> sys.stderr, "--mach-count and --mach_offset must both be > 0 if one is set other than -1. " sys.exit(1) if DataParser.snp_miss_tol != 1.0: print >> sys.stderr, "--geno does not have any impact on imputed data" sys.exit(1) if DataParser.ind_miss_tol != 1.0: print >> sys.stderr, "--mind does not have any impact on imputed data"
def WriteTestFiles(self, prefix="__test_imputed"):
    """Write the IMPUTE fixture files and record the expected values on self.

    Files written (all named from *prefix*): a ``.gen_samples`` sample file,
    two gzipped ``.gen.gz`` archives, their plain-text ``.gen`` twins and two
    ``.info`` files. Genotype probabilities are drawn with
    ``numpy.random.normal`` around each entry of ``base_freq``; the first
    data set is written with six decimals, the second with four.

    Attributes set: ``fam_file``, ``ind_ids``, ``gen_file``, ``gen_file2``,
    ``info_file1``, ``info_file2``, ``uncmp_1``, ``uncmp_2``,
    ``additive_encoding``, ``dominant_encoding``, ``recessive_encoding``,
    ``raw``, ``positions``, ``mafs``, ``rsids`` and ``impute_parser``.

    Relies on ``base_freq`` and on ``self.allele_1`` / ``self.allele_2``
    being defined before the call (not visible here; presumably ten base
    frequencies and at least twenty alleles each, given the 20-row arrays).

    :param prefix: filename prefix shared by every fixture file.
    """
    self.fam_file = "%s.gen_samples" % (prefix)
    # One record per line: header, column-type row, then one row per person.
    sample_rows = [
        "ID_1 ID_2 missing father mother sex plink_pheno",
        "0 0 0 D D D B",
        "ID0001 FAM001 0 0 0 1 0.1",
        "ID0002 FAM002 0 0 0 2 0.4",
        "ID0003 FAM003 0 0 0 1 1.0",
        "ID0004 FAM004 0 0 0 1 0.5",
        "ID0005 FAM005 0 0 0 2 0.9",
        "ID0006 FAM006 0 0 0 2 1.0",
        "ID0007 FAM007 0 0 0 1 0.1",
        "ID0008 FAM008 0 0 0 1 0.4",
        "ID0009 FAM009 0 0 0 2 1.0",
        "ID0010 FAM010 0 0 0 2 0.5",
        "ID0011 FAM011 0 0 0 2 0.9",
        "ID0012 FAM012 0 0 0 1 1.0",
    ]
    with open(self.fam_file, 'w') as fam_file:
        print("\n".join(sample_rows), file=fam_file)

    self.ind_ids = [
        "ID0001:FAM001", "ID0002:FAM002", "ID0003:FAM003", "ID0004:FAM004",
        "ID0005:FAM005", "ID0006:FAM006", "ID0007:FAM007", "ID0008:FAM008",
        "ID0009:FAM009", "ID0010:FAM010", "ID0011:FAM011", "ID0012:FAM012"
    ]
    self.gen_file = "%s.gen.gz" % (prefix)
    self.gen_file2 = "%s-2.gen.gz" % (prefix)
    self.info_file1 = "%s.info" % (prefix)
    self.info_file2 = "%s-2.info" % (prefix)
    self.uncmp_1 = "%s.gen" % (prefix)
    self.uncmp_2 = "%s-2.gen" % (prefix)

    self.additive_encoding = numpy.zeros((20, 12))
    self.dominant_encoding = numpy.zeros((20, 12))
    self.recessive_encoding = numpy.zeros((20, 12))
    self.raw = numpy.zeros((20, 12, 3))
    self.positions = []
    self.mafs = []
    self.rsids = []

    info = [0.1 * (x % 10) + 0.05 for x in range(0, 20)]
    certainty = [0.2 * (x % 5) + 0.05 for x in range(0, 20)]
    info_header = ("snp_id rs_id position exp_freq_a1 info certainty type "
                   "info_type0 concord_type0 r2_type0")

    def write_records(gen_out, uncmp_out, info_out, prob_format, idx):
        """Write one locus per base frequency; return the next free index."""
        print(info_header, file=info_out)
        for base in base_freq:
            f = numpy.random.normal(loc=base, scale=0.1, size=12)
            # Clamp the sampled frequencies into [0, 1].
            f[f > 1.0] = 1.0
            f[f < 0] = 0.0
            maf = 1.0 - f
            AA = f * f
            Aa = 2 * f * maf
            aa = maf * maf

            # One (AA, Aa, aa) probability triple per individual.
            probs = numpy.hstack(
                (AA.reshape(-1, 1), Aa.reshape(-1, 1), aa.reshape(-1, 1)))
            self.raw[idx] = probs
            self.additive_encoding[idx] = Aa + 2 * aa
            self.mafs.append(numpy.mean(self.additive_encoding[idx] / 2))
            self.dominant_encoding[idx] = Aa + aa
            self.recessive_encoding[idx] = aa

            self.positions.append((10 + idx) * 1397)
            self.rsids.append("rs132%d" % (idx * 67))

            locus_fields = [
                "--", self.rsids[-1], str(self.positions[-1]),
                self.allele_1[idx], self.allele_2[idx]
            ]
            gen_line = "\t".join(
                locus_fields +
                [prob_format % (x) for x in probs.reshape(-1)])
            # The compressed and plain-text archives carry identical rows.
            print(gen_line, file=gen_out)
            print(gen_line, file=uncmp_out)
            print(" ".join([
                "--", self.rsids[-1], str(self.positions[-1]),
                str(self.mafs[-1]), str(info[idx]), str(certainty[idx]),
                "0", "-1", "-1", "-1"
            ]), file=info_out)
            idx += 1
        return idx

    idx = 0
    # gzip is opened in text mode because the records are str, not bytes.
    with gzip.open(self.gen_file, 'wt') as gen_file, \
            open(self.uncmp_1, 'w') as uncmp_file, \
            open(self.info_file1, 'w') as info_file:
        idx = write_records(gen_file, uncmp_file, info_file, "%0.6f", idx)

    # The second data set is written with only four decimals of precision.
    with gzip.open(self.gen_file2, 'wt') as gen_file, \
            open(self.uncmp_2, 'w') as uncmp_file, \
            open(self.info_file2, 'w') as info_file:
        idx = write_records(gen_file, uncmp_file, info_file, "%0.4f", idx)

    self.impute_parser = impute_parser.Parser(self.fam_file, [self.gen_file],
                                              chroms=["3"])