def processSummary(con):
    """Walk the summary named by sys.argv[2] and tally coding-site counts.

    For every site accepted by isCoding(), the gene ID is looked up in the
    ``genes`` table (until one has been found), sites with ALT_NUM > 0 are
    counted as SNPs and sites carrying the '4fold' type code are counted.

    con -- database connection exposing execute(); presumably a sqlite3
           connection, since sqlite3.Error is what gets handled here.

    The code visible here returns nothing; the tallies stay local.
    """
    myReader = summary.Reader(open(sys.argv[2], 'rb'))
    gene = None
    ffold = 0
    snps = 0
    #not used by the code visible in this function
    introns = []
    exons = []
    exon = []
    intron = []
    for site in myReader:
        if isCoding(myReader, site):
            if gene is None:
                #get the gene ID
                try:
                    #bound parameters: the scaffold name is a string and
                    #must not be pasted into the SQL text unquoted
                    res = con.execute(
                        "SELECT gene_id FROM genes WHERE scaf=? AND (start=? OR stop=?)",
                        (site.CHROM, site.POS, site.POS))
                    #execute() hands back a cursor; the value is in its row
                    row = res.fetchone()
                    if row is not None:
                        gene = row[0]
                except sqlite3.Error as e:
                    sys.stderr.write(
                        "Database Error: %s\nCould not get gene ID for gene at %s %s."
                        % (e.args[0], site.CHROM, site.POS))
            if site.ALT_NUM > 0:
                snps += 1
            if myReader.TypeToCode['4fold'] in site.Types:
                ffold += 1
def __main__():
    """Add divergence values from a pickled lookup to every summary site.

    sys.argv[1] is the summary file and sys.argv[2] a pickle that is indexed
    as div[CHROM][POS].  Each site is printed with DIVERGENCE set from that
    lookup, or -1 (with a note on stderr) when the scaffold or position is
    absent.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 3:
        usage()
    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1], "rb"))
    #NOTE(review): pickle.load executes whatever the file describes; only
    #feed this script divergence files you produced yourself.
    with open(sys.argv[2], "rb") as divFile:
        div = pickle.load(divFile)

    print(summaryReader.Header)

    #read the infile file...
    for site in summaryReader:
        #'in' instead of has_key(), which no longer exists in Python 3
        if site.CHROM in div:
            try:
                site.DIVERGENCE = div[site.CHROM][site.POS]
            except (IndexError, KeyError):
                #IndexError for list-like scaffolds, KeyError for dict-like
                sys.stderr.write(
                    "No divergence for site %s %s in divergence file. Assigning a -1.\n"
                    % (site.CHROM, site.POS))
                site.DIVERGENCE = -1
        else:
            sys.stderr.write(
                "No divergence for site %s %s in divergence file (Missing CHROM in divergence). Assigning a -1.\n"
                % (site.CHROM, site.POS))
            site.DIVERGENCE = -1
        print(site)
def __main__():
    """Fill hapcut haplotype blocks with the unphased sites of one sample.

    Walks the summary (_args.summary) alongside the blocks produced by
    getNextBlock() over the hapcut file (_args.hapcut).  Sites inside the
    current block that hapcut did not phase are added with addSNP(): indels
    and heterozygotes as "N", homozygotes with their real alleles.  A block
    is written with outputBlock() once its last position is reached or
    passed.
    """
    processArgs()

    hapcutReader = hapcut.Reader(open(_args.hapcut))
    summaryReader = summary.Reader(open(_args.summary))

    myGenerator = getNextBlock(hapcutReader)
    #next(gen, None) works on Python 2 and 3 and turns an exhausted
    #generator into the None the checks below already expect
    block = next(myGenerator, None)
    if block is None:
        return
    pSite = None

    for site in summaryReader:
        if site.CHROM != block.chrom:
            sys.stderr.write(
                "Not on the correct chromosome. Skipping ahead. (Summary: %s Haplotype: %s)\n"
                % (site.CHROM, block.chrom))
            continue

        if block.start <= site.POS <= block.end and _args.name in site.Genotypes:
            #within the block
            if site.POS in block.SNPs:
                #site already phased NOTE the same qual cutoffs may NOT have happened here
                continue
            genotype = summaryReader.Genotypes[site.Genotypes[_args.name]]
            if len(site.REF) > 1 or len(site.ALT) > 1:
                #indel: mask it, and do NOT fall through to the genotype
                #branches (that used to add the same position twice)
                block.addSNP(site.CHROM, site.POS, "N", "N", 1, 0, 0)
            elif genotype == "heterozygote":
                block.addSNP(site.CHROM, site.POS, "N", "N", 1, 0, 0)
            elif genotype == "homozygote reference":
                block.addSNP(site.CHROM, site.POS, site.REF, site.ALT, 0, 0, 0)
            elif genotype == "homozygote alternate":
                block.addSNP(site.CHROM, site.POS, site.REF, site.ALT, 1, 1, 0)
            else:  #N
                continue

        if site.POS == block.end:
            #output this block
            outputBlock(block)
            block = next(myGenerator, None)
            if block is None:
                break

        if site.POS > block.end:
            #we missed something
            #add N's to fill it out
            fillN(block, pSite, site.POS)
            outputBlock(block)
            block = next(myGenerator, None)
            if block is None:
                break

        pSite = site.POS
def _printIntron(chrom, start, stop, div, afs):
    """Print one intron row: chrom, start, stop, len, divergence counts, AFS."""
    #scaf, start, stop, length, divergenceChecked, divergence, afs
    afsStr = "\t".join(map(str, afs))
    print("%s\t%s\t%s\t%s\t%s\t%s\t%s" %
          (chrom, start, stop, stop - start, div[0], div[1], afsStr))


def __main__():
    """Print an allele frequency spectrum and divergence tally per intron.

    Reads the summary in sys.argv[1].  Consecutive records whose type names
    include 'intron' form one intron; only records with TOTAL == _N
    contribute to the AFS (indexed by ALT_NUM) and to the divergence counts
    (sites checked, sites diverged).  One tab-separated row is printed when
    the intron ends, including an intron still open at end of file.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 2:
        usage()
    processArgs(2)

    reader = summary.Reader(open(sys.argv[1], "rb"))

    afs = [0] * (_N + 1)
    div = [0, 0]  #checked, diverged
    start = 0  #0 means "not inside an intron"
    pSite = 0
    chrom = None  #scaffold of the intron being collected

    #join, so the header has no trailing tab and lines up with the rows
    columns = ["chrom", "start", "stop", "len", "divergenceSites", "divergence"]
    columns.extend("afs_%s" % i for i in range(_N + 1))
    print("\t".join(columns))

    for record in reader:
        if _v:
            sys.stderr.write("On line:\t%s\n" % record.prettyStr())
        if 'intron' in record.typeNames():
            if start == 0:
                if _v:
                    sys.stderr.write("Start intron: %s.\n" % record.prettyStr())
                start = record.POS
                chrom = record.CHROM
            pSite = record.POS
            if record.TOTAL != _N:
                continue
            afs[record.ALT_NUM] += 1
            if record.DIVERGENCE != -1:
                div[0] += 1
                if record.DIVERGENCE:
                    div[1] += 1
        elif pSite != 0:
            if _v:
                sys.stderr.write("End of intron at %s.\n" % (record.prettyStr()))
            #report the intron's own scaffold, not that of the record
            #that happened to end it
            _printIntron(chrom, start, pSite, div, afs)
            pSite = 0
            start = 0
            afs = [0] * (_N + 1)
            div = [0, 0]

    if pSite != 0:
        #the file ended inside an intron; do not drop it
        _printIntron(chrom, start, pSite, div, afs)
def __main__():
    """Run one of the sfs / dfealpha / diversity reports on a summary.

    sys.argv[1] is the summary, sys.argv[2] the output path and sys.argv[3]
    the report name.  The chosen function receives the reader, the open
    output file, _d and the result of summary.typesAsInt().
    """
    mySum = summary.Reader(open(sys.argv[1], 'rb'))
    out = open(sys.argv[2], 'w')
    #out.write("pac piS piN Ssite# Nsite#\n")
    myTest = sys.argv[3]
    processArgs(4)
    count = _d

    siteDic = mySum.summary.typesAsInt()

    try:
        if myTest == "sfs":
            sfs(mySum, out, count, siteDic)
        elif myTest == "dfealpha":
            dfealpha(mySum, out, count, siteDic)
        elif myTest == "diversity":
            diversity(mySum, out, count, siteDic)
        else:
            #previously an unknown name produced an empty file in silence
            sys.stderr.write(
                "Unknown test '%s'. Expected sfs, dfealpha or diversity.\n" %
                myTest)
    finally:
        #make sure buffered output reaches the file
        out.close()
def __main__():
    """Downsample each site to _N alleles, one allele per IBD individual.

    sys.argv[1] is the summary; sys.argv[2] lists regions as four
    whitespace-separated columns (chrom, start, end, individual).  For each
    site, individuals returned by getUnsafeInds() contribute one allele and
    everyone else two; _N alleles are then drawn with random.sample() and
    REF_NUM / ALT_NUM / TOTAL are rewritten from the draw before the site is
    printed.

    random.sample() raises ValueError when a site offers fewer than _N
    alleles.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 3:
        usage()
    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1], "r"))

    #read the regions
    #regions{IND{SCAF{(start, end)}}}
    regionsList = {}
    for ind in summaryReader.summary.Samples:
        regionsList[ind] = {}

    with open(sys.argv[2], "r") as regions:
        for line in regions:
            line = line.rstrip()
            sline = line.split()
            if line.startswith("CHROM"):
                continue
            if len(sline) != 4:
                sys.stderr.write("Weird line:\n\t%s\n" % line)
                continue
            chrom = sline[0]
            start = int(sline[1])
            end = int(sline[2])
            ind = sline[3]
            if ind not in regionsList:
                #individual is not a sample of this summary; report it
                #instead of dying with a KeyError
                sys.stderr.write(
                    "Unknown individual %s in regions file, skipping line:\n\t%s\n"
                    % (ind, line))
                continue
            regionsList[ind].setdefault(chrom, []).append((start, end))

    header = summaryReader.summary.prettyHeader()
    #cut the header right after the DIVERGENCE column (10 == len("DIVERGENCE"))
    header = header[0:header.index("DIVERGENCE") + 10]
    print(header)

    genoReverse = {v: k for k, v in summaryReader.summary.Genotypes.items()}

    #read the infile file...
    for site in summaryReader:
        ibdInds = getUnsafeInds(site, regionsList)
        alleles = []
        for ind, geno in site.Genotypes.items():
            if ind in ibdInds:
                if geno == genoReverse["homozygote reference"]:
                    alleles.append(site.REF)
                elif geno == genoReverse["homozygote alternate"]:
                    alleles.append(site.ALT)
                elif geno == genoReverse["heterozygote"]:
                    #assume ref for all IBD individuals with data
                    alleles.append(site.REF)
                elif geno == genoReverse["unknown"]:
                    alleles.append(genoReverse["unknown"])
            else:
                if geno == genoReverse["homozygote reference"]:
                    alleles.append(site.REF)
                    alleles.append(site.REF)
                elif geno == genoReverse["homozygote alternate"]:
                    alleles.append(site.ALT)
                    alleles.append(site.ALT)
                elif geno == genoReverse["heterozygote"]:
                    alleles.append(site.REF)
                    alleles.append(site.ALT)
                elif geno == genoReverse["unknown"]:
                    alleles.append(genoReverse["unknown"])
                    alleles.append(genoReverse["unknown"])

        newAlleles = random.sample(alleles, _N)
        ref = newAlleles.count(site.REF)
        alt = newAlleles.count(site.ALT)
        site.REF_NUM = ref
        site.ALT_NUM = alt
        if site.TOTAL != 0:
            #if total was 0 for some reason, like being filtered, keep it that way
            site.TOTAL = ref + alt
        site.Genos = []
        print(site)
def __main__():
    """Feed the summary to processWindow() in windows of _w sites.

    Reads sys.argv[1].  A window is handed to processWindow() when it holds
    _w sites or the scaffold changes, together with the number of sites
    whose TOTAL equals _N and the optional region output file (_o).  A site
    carrying both a coding (_coding) and a non-coding (_ncoding) type has
    its types replaced by 'unknown' before it joins the window.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 2:
        usage()
    processArgs(2)

    args = {
        "fraction coding": _c,
        "window size": _w,
        "sample size": _N,
        "region output file": _o
    }
    sys.stderr.write("".join(
        "#%s\t%s\n" % (name, value) for name, value in args.items()))

    myReader = summary.Reader(open(sys.argv[1], "rb"))

    typeToCode = myReader.TypeToCode
    codingN = set(typeToCode[t] for t in _coding if t in typeToCode)
    ncodingN = set(typeToCode[t] for t in _ncoding if t in typeToCode)

    if _o:
        filteredRegions = open(_o, "w")
    else:
        filteredRegions = None

    try:
        window = []
        safe_sites = 0.0
        print(myReader.Header)
        pScaf = ""
        #read the infile file...
        for site in myReader:
            #an empty window never triggers a flush, so the very first
            #site needs no special handling of pScaf
            if (len(window) == _w or pScaf != site.CHROM) and window:
                processWindow(window, safe_sites, filteredRegions)
                window = []
                safe_sites = 0.0
            #check if all individuals are represented
            if site.TOTAL == _N:
                safe_sites += 1
            #check if a site is both a coding site and a noncoding site
            #if it is, make the site not safe, set type to unknown
            siteTypes = set(site.Types)
            if siteTypes & codingN and siteTypes & ncodingN:
                site.Types = [typeToCode['unknown']]
            window.append(site)
            pScaf = site.CHROM

        if window:
            #get the last window (nothing to do for an empty summary)
            processWindow(window, safe_sites, filteredRegions)
    finally:
        if filteredRegions is not None:
            filteredRegions.close()
def _closeCNC(cncStart, cncEnd, cncDiv, cncDivTot, cncWindow):
    """Return the (start, stop, div, pi) tuple for a finished CNC stretch."""
    if cncDivTot:
        div = float(cncDiv) / cncDivTot
    else:
        div = "NA"
    return (cncStart, cncEnd, div, calcPi(cncWindow))


def __main__():
    """Report every stretch of CNC sites with its divergence and pi.

    Reads the summary in sys.argv[1].  Consecutive sites accepted by isCNC()
    form one stretch; it ends at the first site that is not a CNC, at a
    scaffold change or at end of file.  Finished stretches are handed to
    outputCNCs() together with the position of the last gene site seen on
    the scaffold and of the next one (None when there is none).
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 2:
        usage()
    processArgs(2)

    summaryReader = summary.Reader(open(sys.argv[1], "rb"))

    #read the infile file...
    lastGeneSite = None  # only count exon, 4, and 0 fold sites
    currentCNCs = []  # list of tuples (start, stop, div, pi)
    cncStart = None  #start of current stretch of CNCs
    cncDiv = 0
    cncDivTot = 0
    cncWindow = []
    pScaf = None
    pSite = None
    typeToCode = summaryReader.summary.typeToCode
    geneTypes = [typeToCode['exon'], typeToCode['4fold'], typeToCode['0fold']]
    cncTypes = [typeToCode['cnc']]
    for t in _t:
        #was append[...], which raises TypeError as soon as _t is non-empty
        cncTypes.append(typeToCode[t])

    print("CHROM\tCNC_START\tCNC_END\tCNC_DIV\tCNC_PI\tUP_DIST\tDOWN_DIST")
    for site in summaryReader:
        if pScaf is None:
            pScaf = site.CHROM
        if pScaf != site.CHROM:
            #on next scaf, clean up
            if cncStart is not None:
                #end CNC at last site
                currentCNCs.append(
                    _closeCNC(cncStart, pSite, cncDiv, cncDivTot, cncWindow))
            outputCNCs(pScaf, currentCNCs, lastGeneSite, None)
            cncStart = None
            cncWindow = []
            cncDiv = 0
            cncDivTot = 0
            currentCNCs = []
            #a gene on the previous scaffold says nothing about this one
            lastGeneSite = None
            pScaf = site.CHROM

        if isCNC(cncTypes, site.Types):
            if cncStart is None:
                cncStart = site.POS
            if site.REF_NUM + site.ALT_NUM != _N or site.TOTAL != _N:
                #really ignore it: keep it out of pi and divergence
                sys.stderr.write("Bad data at site %s %s, ignoring it.\n" %
                                 (site.CHROM, site.POS))
            else:
                cncWindow.append(site.REF_NUM)
                if site.DIVERGENCE == 1:
                    cncDiv += 1
                    cncDivTot += 1
                elif site.DIVERGENCE == 0:
                    cncDivTot += 1
        elif isGene(geneTypes, site.Types):
            if cncStart is not None:
                currentCNCs.append(
                    _closeCNC(cncStart, pSite, cncDiv, cncDivTot, cncWindow))
                cncStart = None
                cncWindow = []
                cncDiv = 0
                cncDivTot = 0
            if currentCNCs:
                outputCNCs(pScaf, currentCNCs, lastGeneSite, site.POS)
                currentCNCs = []
            lastGeneSite = site.POS
        else:
            if cncStart is not None:
                currentCNCs.append(
                    _closeCNC(cncStart, pSite, cncDiv, cncDivTot, cncWindow))
                cncStart = None
                cncWindow = []
                cncDiv = 0
                cncDivTot = 0
        pSite = site.POS

    #end of file: flush what the last scaffold still holds
    if cncStart is not None:
        currentCNCs.append(
            _closeCNC(cncStart, pSite, cncDiv, cncDivTot, cncWindow))
    if currentCNCs:
        outputCNCs(pScaf, currentCNCs, lastGeneSite, None)
# Script: scan a summary (sys.argv[1]) for duplicated positions, which this
# code treats as indels, and zero the TOTAL of the buffered records around
# them.  sys.argv[2] is the window size.
import sys
import summary

reader = summary.Reader(open(sys.argv[1]))
_w = int(sys.argv[2])#this is a dumb way to do this!
print(reader.Header)
prev = []  # buffer of recently read records, printed once _w+1 are held
skip = 0  # set below for indels; not read in the part of the script shown here
for record in reader:
    dup = False  # set for the duplicate indel record; not read in the part shown here
    #sys.stderr.write("Checking %s %s.\n"%(skip,record))
    if prev and record.POS == prev[-1].POS:#reached an indel
        #do stuff to fix it
        #clear the buffer, because we're skipping stuff
        for r in prev:
            r.TOTAL = 0
        #is it a deletion
        if len(record.REF) > 1:
            skip = len(record.REF)+_w
        else:#it is an insertion
            skip = _w
        #DO NOT add this record to the list, because it is a duplicate
        #the previous non-indel version is already in there with a total of 0
        dup = True
    # NOTE(review): nothing in the visible code appends to prev, so this
    # branch cannot fire as shown -- the script looks truncated; confirm
    # against the full file.
    if len(prev) == _w+1:#+1 because we want to keep one more than window size, to throw out the duplicate site too
        print(prev.pop(0))
def _printUntagged(site, tagCode):
    """Print a site that is not listed; with _d set, replace an existing tag by 0."""
    if _d and tagCode in site.Types:
        site.Types.remove(tagCode)
        site.Types.append(0)
    print(site)


def __main__():
    """Tag the summary sites listed in a sites file with type _t.

    sys.argv[1] is the summary and sys.argv[2] the sites file, read through
    getNextSite(), which yields (chrom, pos) pairs and (None, None) once the
    file is exhausted.  Both inputs must be sorted the same way.  A listed
    site loses every type named in _r and gains the code for _t; every other
    site is printed unchanged, except that with _d set an existing _t code
    is replaced by 0.  Listed sites absent from the summary are reported on
    stderr.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 3:
        usage()
    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1], "rb"))
    if _t not in summaryReader.Types:
        intCodes = map(int, summaryReader.Codes)
        summaryReader.addCode(max(intCodes) + 1, _t)
    print(summaryReader.Header)

    #open() raises rather than returning None, and the file in question
    #is argv[2], not argv[1]
    try:
        sites = open(sys.argv[2], "r")
    except IOError:
        sys.stderr.write("Bad sites file name: " + sys.argv[2] + "\n")
        sys.exit(1)

    tagCode = summaryReader.TypeToCode[_t]
    exhausted = (None, None)
    try:
        next_site = getNextSite(sites)

        #read the infile file...
        for site in summaryReader:
            #drop listed sites that sort before the current summary site.
            #The scaffold is checked on every step, so catching up on POS
            #can no longer run into the next chromosome, and None is never
            #compared with a position.
            while next_site != exhausted:
                if site.CHROM > next_site[0]:
                    sys.stderr.write("Missed a site (CHROM): " +
                                     str(next_site) + "\n")
                elif site.CHROM == next_site[0] and site.POS > next_site[1]:
                    sys.stderr.write("Missed a site (POS): " +
                                     str(next_site) + "\n")
                else:
                    break
                next_site = getNextSite(sites)
                if next_site == exhausted:
                    sys.stderr.write("Ran out of sites\n")

            if next_site[0] == site.CHROM and next_site[1] == site.POS:
                #at the site
                if _r:
                    for r in _r:
                        code = summaryReader.TypeToCode[r]
                        if code in site.Types:
                            site.Types.remove(code)
                if tagCode not in site.Types:
                    site.Types.append(tagCode)
                print(site)
                next_site = getNextSite(sites)
            else:
                #list exhausted, or its next entry lies further along
                _printUntagged(site, tagCode)
    finally:
        sites.close()
#    gffDic[i] = [0]*2000

#make a dic of all the gene locations
#each line of sys.argv[2]: scaffold, gene, start, end (whitespace separated)
with open(sys.argv[2],'r') as geneFile:
    #geneFile = open('../test_gff_gene_flatfile','r')
    for line in geneFile:
        fields = line.split()
        if not fields:
            #tolerate blank lines instead of failing on the unpack below
            continue
        scaf, gene, start, end = fields
        scafNum = int(scaf.split('_')[1])
        for i in range(int(start), int(end)+1):
            #not that worried about 0 or 1 based coordinates because we're only dealing with 0 and 4 fold sites.
            gffDic[scafNum][i] = gene

#myOut = open('../../0fold4fold.withgenes.summary','w'))
#sumRead = summary.Reader(open('/data/youngwha.lee/189_genomes/UG_all_vars/recal_vcfs/vcfsummaries/downsampled320/sc8_down_320_4','rb'))
sumRead = summary.Reader(open(sys.argv[1],'rb'))
sumRead.addGenes()
print(sumRead.Header)

#print every site that falls inside a gene, with GENE filled in
for site in sumRead:
    geneName = gffDic[int(site.CHROM.split('_')[1])][site.POS]
    if geneName == 0:
        #not in a gene?
        sys.stderr.write("err not in gene "+site.prettyStr()+"\n")
        continue
    site.GENE = geneName
    print(site)
    #print(" ".join([site.__str__(),geneName,"NA"]))
def _printNoGene(site):
    """Print a site that has no entry in the sites file (GENE "N", DIR "0")."""
    site.GENE = "N"
    site.DIR = "0"
    print(site)


def __main__():
    """Add GENE and DIR to every summary site from a sorted sites file.

    sys.argv[1] is the summary and sys.argv[2] the sites file, read through
    getNextSite(), which yields 5-tuples (chrom at index 0, position at 1,
    gene at 3, direction at 4) and a tuple of five None once exhausted.
    Sites without an entry get GENE "N" and DIR "0".  Entries that the
    summary does not contain are reported on stderr.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 3:
        usage()
    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1],"r"))
    sites = open(sys.argv[2],"r")
    summaryReader.addGenes()
    print(summaryReader.Header)

    exhausted = (None, None, None, None, None)
    try:
        next_site = getNextSite(sites)

        #read the infile file...
        for site in summaryReader:
            #drop entries that sort before the current summary site.  The
            #scaffold is checked on every step, so catching up on POS
            #cannot run into the next chromosome, and None is never
            #compared with a position.
            while next_site != exhausted:
                if site.CHROM > next_site[0]:
                    sys.stderr.write("Missed a site (CHROM): %s %s\n" %
                                     (next_site[0], next_site[1]))
                elif site.CHROM == next_site[0] and site.POS > next_site[1]:
                    sys.stderr.write("Missed a site (POS): "+str(next_site)+"\n")
                else:
                    break
                next_site = getNextSite(sites)

            if next_site[0] == site.CHROM and next_site[1] == site.POS:
                #at the site
                site.GENE = next_site[3]
                site.DIR = next_site[4]
                print(site)
                next_site = getNextSite(sites)
            else:
                #list exhausted, or its next entry lies further along
                _printNoGene(site)
    finally:
        sites.close()
def __main__():
    """Write per-sample haplotype sequences for every annotated gene.

    sys.argv[1] is the summary, sys.argv[2] the annotation.  Genes come from
    getNextGene(); for each summary site inside the gene's current exon, two
    bases per sample are appended according to the sample's genotype ("N"
    when it is unknown).  addNs() is called for every site with the previous
    and current position.  When the last exon of a gene is finished the
    sequences go to outputFasta() and the next gene is fetched.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 3:
        usage()
    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1], "rb"))
    annotationReader = annotation.Reader(open(sys.argv[2], "rb"))
    annotIter = annotationReader.__iter__()
    myGene = getNextGene(annotIter)
    if myGene is None:
        sys.stderr.write("No genes listed in annotation. Exiting.\n")
        sys.exit(0)

    if len(summaryReader.summary.Samples) == 0:
        sys.stderr.write(
            "No individual genotype info in this Summary file. Cannot make fastas.\n"
        )
        sys.exit(0)

    seqs = {}
    for samp in summaryReader.summary.Samples:
        seqs[samp] = [[], []]

    doneScafs = set()  #scaffolds the summary has already moved past
    genoDict = {v: k for k, v in summaryReader.summary.Genotypes.items()}
    pScaf = ""
    pSite = 0

    #read the infile file...
    for site in summaryReader:
        #sys.stderr.write("Processing site %s\n" % site.prettyStr())
        if site.CHROM != myGene.scaf:
            #the annotation may run out while skipping missed genes;
            #stop cleanly instead of dereferencing None
            while myGene is not None and myGene.scaf in doneScafs:
                sys.stderr.write(
                    "Missed gene %s on scaf %s. Make sure file is sorted correctly.\n"
                    % (myGene.name, myGene.scaf))
                myGene = getNextGene(annotIter)
                for samp in summaryReader.summary.Samples:
                    seqs[samp] = [[], []]
            if myGene is None:
                break
            if site.CHROM != myGene.scaf:
                #haven't reached this scaf yet, skip along in the summary
                continue

        if pScaf == "" or pScaf != site.CHROM:
            doneScafs.add(pScaf)
            pScaf = site.CHROM
            pSite = 0

        myGene = addNs(seqs, pSite, site.POS, myGene, annotIter)
        if myGene is None:
            #ran out of genes
            break

        if site.POS < myGene.exons[0][0]:
            pass
        elif site.POS <= myGene.exons[0][1]:
            for samp, geno in site.Genotypes.items():
                if geno == genoDict['heterozygote']:
                    seqs[samp][0].append(site.REF)
                    seqs[samp][1].append(site.ALT)
                elif geno == genoDict['homozygote reference']:
                    seqs[samp][0].append(site.REF)
                    seqs[samp][1].append(site.REF)
                elif geno == genoDict['homozygote alternate']:
                    seqs[samp][0].append(site.ALT)
                    seqs[samp][1].append(site.ALT)
                else:
                    seqs[samp][0].append("N")
                    seqs[samp][1].append("N")

        if site.POS >= myGene.exons[0][1]:
            #reached the end of this exon
            myGene.exons.pop(0)
            if not myGene.exons:
                #print the seqs to a file
                outputFasta(myGene, seqs)
                #grab new gene
                myGene = getNextGene(annotIter)
                #check it
                if myGene is None:
                    break
                #reset seqs
                for samp in summaryReader.summary.Samples:
                    seqs[samp] = [[], []]
        pSite = site.POS
def _finishGene(gene, direction, introns, intronAF, intronCNC):
    """Close the intron in progress, if any, and hand the gene to processGene()."""
    if intronAF:
        introns.append(list(zip(intronAF, intronCNC)))
    processGene(gene, direction, introns)


def __main__():
    """Collect per-gene intron site lists and pass them to processGene().

    Reads the summary in sys.argv[1], which must carry the GENE field.  For
    every gene (GENE != "N") each intron becomes a list of
    (minor allele count, is-CNC) pairs, one per site; the count is -1 where
    TOTAL differs from _N.  processGene() is called once a gene is complete,
    including the one still open at end of file.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 2:
        usage()
    processArgs(2)

    reader = summary.Reader(open(sys.argv[1], 'rb'))

    if "GENE" not in reader.summary.Fields:
        sys.stderr.write(
            "Your summary does not have gene names. Please add gene info to this summary and try again.\n"
        )
        sys.exit()

    myHeader = "GENE DIR IntronNum IntronLen Window CNCnum " + "".join(
        "afs_%s " % i for i in range(int(floor(_N / 2.0) + 1)))
    print(myHeader)

    currentGene = ""  #"" means "not inside a gene"
    direction = ""
    introns = []
    currentIntronAF = []
    currentIntronCNC = []

    if _v:
        sys.stderr.write("Reading in Summary...\n")
    for record in reader:
        if record.GENE != "N":
            #we are in a gene
            if record.GENE != currentGene:
                if currentGene:
                    #finish off the previous gene (only if there was one;
                    #the empty start-up state must not be processed)
                    _finishGene(currentGene, direction, introns,
                                currentIntronAF, currentIntronCNC)
                #start this new gene
                currentGene = record.GENE
                direction = record.DIR
                introns = []
                currentIntronAF = []
                currentIntronCNC = []

            if reader.TypeToCode[_intron] in record.Types:
                #in an intron
                if record.TOTAL == _N:
                    currentIntronAF.append(min(record.REF_NUM, record.ALT_NUM))
                else:
                    #no data
                    currentIntronAF.append(-1)
                if reader.TypeToCode[_cnc] in record.Types:
                    #also a CNC site
                    currentIntronCNC.append(1)
                else:
                    #not a CNC
                    currentIntronCNC.append(0)
            elif currentIntronAF:
                #not in an intron: we have passed the previous intron
                #list(): zip() is a one-shot iterator on Python 3
                introns.append(list(zip(currentIntronAF, currentIntronCNC)))
                currentIntronAF = []
                currentIntronCNC = []
        elif currentGene:
            #we arent in a gene any more
            _finishGene(currentGene, direction, introns, currentIntronAF,
                        currentIntronCNC)
            currentGene = ""
            direction = ""
            introns = []
            currentIntronAF = []
            currentIntronCNC = []

    if currentGene:
        #the file ended inside a gene; an intron still in progress counts
        _finishGene(currentGene, direction, introns, currentIntronAF,
                    currentIntronCNC)
def __main__():
    """Build per-region folded AFS tables and bootstrap over the regions.

    The summary (_args.summary) is cut into regions of _args.window counted
    sites, also cut at scaffold changes.  Each region keeps, per site type,
    a folded allele frequency spectrum and a [sites, diverged sites] pair.
    The real data are written with output_bootstrap() under the label
    "real", followed by _args.bootstraps resamples of the regions drawn with
    replacement.
    """
    #check arguments
    processArgs()

    summaryReader = summary.Reader(open(_args.summary, "rb"))
    info = summaryReader.summary
    sample_size = _args.sample_size

    #number of classes in the folded spectrum
    if sample_size % 2 == 0:
        folded_size = int(sample_size / 2 + 1)
    else:
        folded_size = int(ceil(float(sample_size) / 2))

    #Read in the file and build the regions of each size. Calculate the local AFS for each region.
    regions = {}
    currscaf = None
    afs = resetAFS({}, info.typeToCode, folded_size)
    #a dictionary for each type with a list of length 2: num sites and num div sites
    div = resetAFS({}, info.typeToCode, 2)
    added = 0
    for site in summaryReader:
        if currscaf == None:
            currscaf = site.CHROM

        if added >= _args.window or currscaf != site.CHROM:
            #close the region; it is named after the site that ended it
            regions[currscaf + "_" + str(site.POS)] = (afs, div)
            afs = resetAFS({}, info.typeToCode, folded_size)
            div = resetAFS({}, info.typeToCode, 2)
            added = 0
            currscaf = site.CHROM

        if site.TOTAL != sample_size or site.TOTAL != site.REF_NUM + site.ALT_NUM:
            #not enough alleles at this locus, or total is wrong (filtered data)
            sys.stderr.write(
                "Skipping site, either not enough alleles or the total number of alleles does not match the sum of alleles.\n\t%s\n"
                % site)
            continue

        af = min(site.REF_NUM, site.ALT_NUM)
        #if in SNP mode only add to the window size if we are not at a fixed site
        if (af != 0 and af != 1) or not _args.snps:
            added += 1

        for t in site.Types:
            if t not in afs:
                sys.stderr.write("Weird site type encountered: \n\t" +
                                 str(site) + "\n")
                continue
            afs[t][af] += 1
            #store divergence info for the window
            if site.DIVERGENCE != -1:
                div[t][0] += 1
                if site.DIVERGENCE == 1:
                    div[t][1] += 1

    #the last region is deliberately not stored: it is usually not long enough to be useful

    #make sure all wanted types are actually present
    safe = []
    for wanted in _args.site_types:
        if wanted in info.Types:
            safe.append(wanted)
        else:
            sys.stderr.write(
                "Wanted type not present in codes: %s. Excluding it from analysis.\n"
                % wanted)
    _args.site_types = safe

    counts = resetAFS({}, info.typeToCode, 1)
    for regionAFS, _regionDiv in regions.values():
        for t in info.Types:
            counts[t][0] += sum(regionAFS[t])

    sys.stderr.write("Numbers of each site type in whole analysis:\n")
    for name, code in info.typeToCode.items():
        sys.stderr.write("%s %s\n" % (name, counts[code][0]))

    region_names = list(regions.keys())
    num_regions = len(region_names)

    #print out the non-bootstrapped afs
    output_bootstrap(info.typeToCode, info.Types, folded_size, region_names,
                     regions, "real")

    #perform the bootstraps
    for rep in range(_args.bootstraps):
        boot_regions = [choice(region_names) for _ in range(num_regions)]
        #output the AFS for this bootstrap
        output_bootstrap(info.typeToCode, info.Types, folded_size,
                         boot_regions, regions, str(rep))
def main():
    """Print one allele frequency spectrum per intron of the summary.

    Reads args.summary.  The sample size N is ploidy times the number of
    samples when the summary lists samples, otherwise args.sample_size
    (required in that case).  Consecutive records carrying the 'intron' code
    form one intron; a change of GENE inside such a run starts a new one.
    Only records with TOTAL == N are counted, at index ALT_NUM.  Every
    finished intron is written with outputIntron().
    """
    reader = summary.Reader(open(args.summary, 'rb'))
    info = reader.summary

    if info.Samples:
        N = info.Ploidy * len(info.Samples)
    elif args.sample_size:
        N = args.sample_size
    else:
        sys.stderr.write(
            "No samples listed in this summary. You must provide a sample size with -N.\n"
        )
        sys.exit()

    if "GENE" not in info.Fields:
        sys.stderr.write(
            "The gene names are not in this summary. All gene names will be blank if you use this file.\n"
        )

    try:
        intronCode = info.typeToCode['intron']
    except KeyError:
        sys.stderr.write(
            "Summary has no 'intron' codes, you need to add them or use a different summary.\n"
        )
        sys.exit()

    #output header
    outputIntron(N=N)

    currentIntron = None
    for record in reader:
        if intronCode not in record.Types:
            if currentIntron:
                #we reached the end on this one
                outputIntron(N, currentIntron)
                currentIntron = None
            continue

        if not currentIntron:
            #new one, starting with an empty spectrum of N + 1 classes
            currentIntron = Intron(record.GENE, record.CHROM, record.POS,
                                   [0] * (N + 1))
        elif currentIntron.gene != record.GENE:
            #end of previous one, we're on a new one
            #this should really not happen
            sys.stderr.write(
                "Reached a new intron adjacent to an old one at %s %s, you may want to check your annotation.\n"
                % (record.CHROM, record.POS))
            outputIntron(N, currentIntron)
            currentIntron = Intron(record.GENE, record.CHROM, record.POS,
                                   [0] * (N + 1))

        #doesn't count AF at sites with too few alleles
        if record.TOTAL == N:
            try:
                currentIntron.afs[record.ALT_NUM] += 1
            except IndexError:
                sys.stderr.write(
                    "At %s %s there is a problem with the ALT_NUM. It is out of acceptable ranges. Is there a problem with your summary? Or with the provided sample size?\n"
                    % (record.CHROM, record.POS))
        currentIntron.end = record.POS

    #get the last one
    if currentIntron:
        outputIntron(N, currentIntron)
def __main__():
    """Count sites of the requested types in windows along each scaffold.

    Reads the summary in sys.argv[1].  For each type named in _t that the
    summary knows, a 1/0 flag per site is collected; printWindow() receives
    the flags together with the window midpoint and scaffold whenever the
    window spans at least _w positions, the scaffold changes, or the file
    ends.  _t is replaced by the list of valid type codes.
    """
    global _t

    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()
    if len(sys.argv) < 2:
        usage()
    processArgs(2)

    summaryReader = summary.Reader(open(sys.argv[1], "rb"))
    typeToCode = summaryReader.summary.typeToCode

    safeT = []
    myCounts = {}
    for t in _t:
        if t not in typeToCode:
            sys.stderr.write(
                "Type %s not found in summary. It will be ignored. The valid options are: %s.\n"
                % (t, typeToCode.keys()))
        else:
            safeT.append(typeToCode[t])
            myCounts[typeToCode[t]] = []
    _t = safeT

    if len(_t) == 0:
        sys.stderr.write(
            "No valid types to count were provided. Use the -t option to specify sites of interest.\n"
        )
        sys.exit(0)

    start = None
    pScaf = None
    pSite = None

    print("TYPE\tCHROM\tMIDPOINT\tCOUNT")
    #read the infile file...
    for site in summaryReader:
        if start is None:
            start = site.POS
            pScaf = site.CHROM
            pSite = site.POS
        if pScaf != site.CHROM:
            #scaffold ended: close its last window at the last site seen
            printWindow(myCounts, summaryReader.summary,
                        (pSite + start) / 2.0, pScaf)
            for t in _t:
                myCounts[t] = []
            start = site.POS
            pScaf = site.CHROM
        if site.POS - start >= _w:
            printWindow(myCounts, summaryReader.summary,
                        (site.POS + start) / 2.0, pScaf)
            for t in _t:
                myCounts[t] = []
            start = site.POS
        for t in _t:
            if t in site.Types:
                myCounts[t].append(1)
            else:
                myCounts[t].append(0)
        pSite = site.POS

    if start is not None:
        #last window; the midpoint is the mean of both ends (it used to be
        #half their difference), and an empty summary has no window at all
        printWindow(myCounts, summaryReader.summary, (pSite + start) / 2.0,
                    pScaf)