Пример #1
0
        with open(args.regionsFile, "rt") as intFile:
            intervals = genomics.Intervals(
                tuples=[line.split() for line in intFile])
    nIntervals = len(intervals.chroms)
    sys.stderr.write("Recording SFS for {} intervals\n".format(nIntervals))
else:
    intervals = None
    nIntervals = 1

'###############################################################################################################'

#parse pop and individual data, if necessary
# Only genotype input needs a reader and a population -> samples mapping.
if args.inputType == "genotypes":
    # Reader over the input file; header/column settings are taken from args.
    genoFileReader = genomics.GenoFileReader(inputFile,
                                             headerLine=args.header,
                                             scafCol=args.scafCol,
                                             posCol=args.posCol,
                                             firstSampleCol=args.firstSampleCol)
    popNames = []  # population names, in order of first appearance
    popDict = {}  # population name -> list of sample names (possibly empty)
    if args.pop or args.FSpops:
        if args.pop:
            # Each entry is [name] or [name, "sample1,sample2,..."]; a lone
            # name gets an empty sample list.
            for pop in args.pop:
                popNames.append(pop[0])
                if len(pop) == 1:
                    popDict[pop[0]] = []
                else:
                    popDict[pop[0]] = pop[1].split(",")

        if args.FSpops:
            # Flatten the groups in args.FSpops and register every population
            # not already declared via args.pop, with no samples attached.
            for pop in (p for pops in args.FSpops for p in pops):
                if pop not in popNames:
                    popNames.append(pop)
                    popDict[pop] = []
Пример #2
0
    except:
        sys.stderr.write(
            "WARNING: Could not parse fai file, vcf header will not contain contig entries...\n"
        )
        scafs_lengths = None

    with gzip.open(args.reference,
                   "r") if args.reference.endswith(".gz") else open(
                       args.reference, "r") as ref:
        refDict = dict(zip(*genomics.parseFasta(ref.read())))

else:
    refDict = None
#########################################################################################

# Open the genotype file and collect the names reported by the reader
# (presumably sample names, given the args.samples override below).
genoFileReader = genomics.GenoFileReader(genoFile)
allNames = genoFileReader.names

# Use every name unless a comma-separated subset was given in args.samples.
namesToUse = args.samples.split(",") if args.samples else allNames

# First line of the output: the VCF file-format header.
outFile.write("##fileformat=VCFv4.2\n")

if refDict:
    outFile.write("##reference=file:{}\n".format(
        args.reference.split("/")[-1]))

    if scafs_lengths:
        for x in range(len(scafs_lengths)):
            outFile.write("##contig=<ID={},length={}>\n".format(
Пример #3
0
        cdsEnds = [geneData[scaffold][mRNA]["cdsEnds"][x] for x in order]

        #for each exon we extract the sequence for all individuals
        #and build a dictionary of position and genotype for each individual
        for i in range(geneData[scaffold][mRNA]["exons"]):

            #extract the sequence data for this exon

            genoStream = tabixStream(args.genoFile,
                                     chrom=scaffold,
                                     start=cdsStarts[i],
                                     end=cdsEnds[i],
                                     header=True)

            reader = genomics.GenoFileReader(genoStream,
                                             splitPhased=args.split,
                                             ploidy=args.ploidy)

            #if this is the first exon, initialise dictionary of genotype for each pos for each seqName
            if i == 0:
                nSeqs = len(reader.names)
                empty = ["N"] * nSeqs
                siteGTdict = defaultdict(lambda: empty)

            #now update if there is data for this window
            for siteData in reader.siteBySite(
                    asDict=True if args.samples else False):
                GTs = [siteData["GTs"][name] for name in args.samples
                       ] if args.samples else siteData["GTs"]
                siteGTdict[siteData["position"]] = [
                    genomics.complement(gt) for gt in GTs