with open(args.regionsFile, "rt") as intFile: intervals = genomics.Intervals( tuples=[line.split() for line in intFile]) nIntervals = len(intervals.chroms) sys.stderr.write("Recording SFS for {} intervals\n".format(nIntervals)) else: intervals = None nIntervals = 1 '###############################################################################################################' #parse pop and individual data, if necessary if args.inputType == "genotypes": genoFileReader = genomics.GenoFileReader( inputFile, headerLine=args.header, scafCol=args.scafCol, posCol=args.posCol, firstSampleCol=args.firstSampleCol) popDict = {} popNames = [] if args.pop or args.FSpops: if args.pop: for pop in args.pop: popNames.append(pop[0]) popDict[pop[0]] = [] if len(pop) == 1 else pop[1].split(",") if args.FSpops: for pop in [p for pops in args.FSpops for p in pops]: if pop not in popNames: popNames.append(pop) popDict[pop] = []
except: sys.stderr.write( "WARNING: Could not parse fai file, vcf header will not contain contig entries...\n" ) scafs_lengths = None with gzip.open(args.reference, "r") if args.reference.endswith(".gz") else open( args.reference, "r") as ref: refDict = dict(zip(*genomics.parseFasta(ref.read()))) else: refDict = None ######################################################################################### genoFileReader = genomics.GenoFileReader(genoFile) allNames = genoFileReader.names if not args.samples: namesToUse = allNames else: namesToUse = args.samples.split(",") outFile.write("##fileformat=VCFv4.2\n") if refDict: outFile.write("##reference=file:{}\n".format( args.reference.split("/")[-1])) if scafs_lengths: for x in range(len(scafs_lengths)): outFile.write("##contig=<ID={},length={}>\n".format(
cdsEnds = [geneData[scaffold][mRNA]["cdsEnds"][x] for x in order] #for each exon we extract the sequence for all individuals #and build a dictionary of position and genotype for each individual for i in range(geneData[scaffold][mRNA]["exons"]): #extract the sequence data for this exon genoStream = tabixStream(args.genoFile, chrom=scaffold, start=cdsStarts[i], end=cdsEnds[i], header=True) reader = genomics.GenoFileReader(genoStream, splitPhased=args.split, ploidy=args.ploidy) #if this is the first exon, initialise dictionary of genotype for each pos for each seqName if i == 0: nSeqs = len(reader.names) empty = ["N"] * nSeqs siteGTdict = defaultdict(lambda: empty) #now update if there is data for this window for siteData in reader.siteBySite( asDict=True if args.samples else False): GTs = [siteData["GTs"][name] for name in args.samples ] if args.samples else siteData["GTs"] siteGTdict[siteData["position"]] = [ genomics.complement(gt) for gt in GTs