Пример #1
0
def processSummary(con):
    """Scan the summary file named by ``sys.argv[2]`` and tally coding SNPs.

    For the first coding site seen (as judged by ``isCoding``) the gene ID is
    looked up in the ``genes`` table through ``con``.  Every coding site with
    at least one alternate allele is counted as a SNP, and those carrying the
    '4fold' type code are additionally counted as four-fold sites.

    Args:
        con: database connection whose ``execute`` accepts ``?`` placeholders
            and returns a cursor (presumably a ``sqlite3.Connection``, given
            the ``sqlite3.Error`` handler below -- confirm against the caller).
    """
    gene = None
    ffold = 0
    snps = 0
    # NOTE(review): the four lists below are not used by the code visible in
    # this function; they are kept so that nothing relying on them breaks.
    introns = []
    exons = []
    exon = []
    intron = []

    with open(sys.argv[2], 'rb') as summaryFile:
        myReader = summary.Reader(summaryFile)
        for site in myReader:
            if not isCoding(myReader, site):
                continue

            if gene is None:  #get the gene ID
                try:
                    # Bound parameters instead of string formatting: the
                    # scaffold name is text and must not be spliced into SQL.
                    res = con.execute(
                        "SELECT gene_id FROM genes WHERE scaf=? AND (start=? OR stop=?)",
                        (site.CHROM, site.POS, site.POS))
                    row = res.fetchone()
                    if row is not None:
                        gene = row[0]
                except sqlite3.Error as e:
                    sys.stderr.write(
                        "Database Error: %s\nCould not get gene ID for gene at %s %s."
                        % (e.args[0], site.CHROM, site.POS))

            if site.ALT_NUM > 0:
                snps += 1
                if myReader.TypeToCode['4fold'] in site.Types:
                    ffold += 1
Пример #2
0
def __main__():
    """Attach a divergence value to every summary site and print the result.

    ``sys.argv[1]`` is the summary file and ``sys.argv[2]`` a pickled mapping
    of CHROM -> per-position divergence values.  Sites whose chromosome or
    position is missing from that mapping get ``DIVERGENCE = -1`` and a
    warning on stderr.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 3:
        usage()

    processArgs(3)

    with open(sys.argv[1], "rb") as summaryFile:
        summaryReader = summary.Reader(summaryFile)

        # NOTE(review): pickle.load can execute arbitrary code; only feed this
        # script divergence files that you produced yourself.
        with open(sys.argv[2], "rb") as divFile:
            div = pickle.load(divFile)

        print(summaryReader.Header)

        #read the infile file...
        for site in summaryReader:
            if site.CHROM in div:
                try:
                    site.DIVERGENCE = div[site.CHROM][site.POS]
                except (IndexError, KeyError):
                    # IndexError for list-like per-chromosome data, KeyError
                    # for dict-like data; either way the position is missing.
                    sys.stderr.write(
                        "No divergence for site %s %s in divergence file. Assigning a -1.\n"
                        % (site.CHROM, site.POS))
                    site.DIVERGENCE = -1
            else:
                sys.stderr.write(
                    "No divergence for site %s %s in divergence file (Missing CHROM in divergence). Assigning a -1.\n"
                    % (site.CHROM, site.POS))
                site.DIVERGENCE = -1
            print(site)
Пример #3
0
def __main__():
    """Fill hapcut haplotype blocks with the sample's genotypes from a summary.

    Walks the summary file in step with the blocks produced by
    ``getNextBlock``.  Sites inside the current block that are not already
    phased are added to it according to the genotype of ``_args.name``; a
    block is written with ``outputBlock`` once its end is reached or passed.
    """
    processArgs()

    with open(_args.hapcut) as hapcutFile, open(_args.summary) as summaryFile:
        hapcutReader = hapcut.Reader(hapcutFile)
        summaryReader = summary.Reader(summaryFile)

        myGenerator = getNextBlock(hapcutReader)
        # next(gen, None) yields None on exhaustion instead of raising
        # StopIteration, which is what the "no more blocks" checks expect.
        block = next(myGenerator, None)
        if block is None:
            return

        pSite = None
        for site in summaryReader:
            if site.CHROM != block.chrom:
                sys.stderr.write(
                    "Not on the correct chromosome. Skipping ahead. (Summary: %s Haplotype: %s)\n"
                    % (site.CHROM, block.chrom))
                continue

            if (block.start <= site.POS <= block.end
                    and _args.name in site.Genotypes):  #within the block
                if site.POS in block.SNPs:
                    #site already phased NOTE the same qual cutoffs may NOT have happened here
                    continue

                genotype = summaryReader.Genotypes[site.Genotypes[_args.name]]
                isIndel = len(site.REF) > 1 or len(site.ALT) > 1

                # An indel is recorded once, as N; the genotype branches below
                # must not add the same position a second time.
                if isIndel:
                    block.addSNP(site.CHROM, site.POS, "N", "N", 1, 0, 0)

                if genotype == "heterozygote":
                    if not isIndel:
                        block.addSNP(site.CHROM, site.POS, "N", "N", 1, 0, 0)
                elif genotype == "homozygote reference":
                    if not isIndel:
                        block.addSNP(site.CHROM, site.POS, site.REF, site.ALT,
                                     0, 0, 0)
                elif genotype == "homozygote alternate":
                    if not isIndel:
                        block.addSNP(site.CHROM, site.POS, site.REF, site.ALT,
                                     1, 1, 0)
                else:  #N
                    continue

            if site.POS == block.end:  #output this block
                outputBlock(block)
                block = next(myGenerator, None)
                if block is None:
                    break

            if site.POS > block.end:  #we missed something
                #add N's to fill it out
                fillN(block, pSite, site.POS)
                outputBlock(block)
                block = next(myGenerator, None)
                if block is None:
                    break

            pSite = site.POS
Пример #4
0
def __main__():
    """Print one line of divergence and allele-frequency counts per intron.

    Consecutive records typed 'intron' form one intron.  For each intron the
    output holds chrom, start, stop, length, the number of sites with a
    divergence call, the number of diverged sites, and the allele frequency
    spectrum over sites where ``TOTAL == _N``.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 2:
        usage()

    processArgs(2)

    reader = summary.Reader(open(sys.argv[1], "rb"))

    def emitIntron(chrom, start, stop, div, afs):
        #scaf, start, stop, length, divergenceChecked, divergence, afs
        print("%s\t%s\t%s\t%s\t%s\t%s\t%s" %
              (chrom, start, stop, stop - start, div[0], div[1],
               "\t".join(map(str, afs))))

    afs = [0] * (_N + 1)
    div = [0, 0]  #checked, diverged
    start = 0  # 0 means "not inside an intron"
    pSite = 0
    chrom = None  # chromosome of the intron currently being collected

    # Joined rather than built up and truncated, so there is no trailing tab.
    columns = ["chrom", "start", "stop", "len", "divergenceSites", "divergence"]
    columns.extend("afs_%s" % i for i in range(_N + 1))
    print("\t".join(columns))

    for record in reader:
        if _v: sys.stderr.write("On line:\t%s\n" % record.prettyStr())

        if 'intron' in record.typeNames():
            if start == 0:
                if _v:
                    sys.stderr.write("Start intron: %s.\n" %
                                     record.prettyStr())
                start = record.POS
                chrom = record.CHROM

            pSite = record.POS

            if record.TOTAL != _N:
                continue

            afs[record.ALT_NUM] += 1

            if record.DIVERGENCE != -1:
                div[0] += 1
                if record.DIVERGENCE:
                    div[1] += 1
        elif pSite != 0:
            if _v:
                sys.stderr.write("End of intron at %s.\n" %
                                 (record.prettyStr()))
            # Report the intron's own chromosome, not that of the record
            # that happens to follow it.
            emitIntron(chrom, start, pSite, div, afs)
            pSite = 0
            start = 0
            afs = [0] * (_N + 1)
            div = [0, 0]

    # The file may end while still inside an intron; do not lose it.
    if pSite != 0:
        emitIntron(chrom, start, pSite, div, afs)
Пример #5
0
def __main__():
	"""Run one analysis ('sfs', 'dfealpha' or 'diversity') on a summary file.

	``sys.argv[1]`` is the summary, ``sys.argv[2]`` the output path and
	``sys.argv[3]`` the name of the analysis.  An unrecognised name is
	reported on stderr instead of being silently ignored.
	"""
	mySum = summary.Reader(open(sys.argv[1],'rb'))
	# The context manager makes sure the output is flushed and closed.
	with open(sys.argv[2],'w') as out:
		myTest = sys.argv[3]
		processArgs(4)
		count = _d
		siteDic = mySum.summary.typesAsInt()
		if myTest == "sfs":
			sfs(mySum, out, count, siteDic)
		elif myTest == "dfealpha":
			dfealpha(mySum, out, count, siteDic)
		elif myTest == "diversity":
			diversity(mySum, out, count, siteDic)
		else:
			sys.stderr.write(
				"Unknown test '%s'; expected one of: sfs, dfealpha, diversity.\n"
				% myTest)
Пример #6
0
def __main__():
    """Down-sample each summary site to ``_N`` alleles, honouring listed regions.

    ``sys.argv[2]`` lists regions as ``CHROM start end individual``.  For every
    site, individuals returned by ``getUnsafeInds`` contribute a single allele
    (heterozygotes count as reference); all others contribute two.  ``_N``
    alleles are then drawn at random and the site's REF/ALT counts are
    rewritten before it is printed.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 3:
        usage()

    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1], "r"))

    #read the regions
    #regions{IND{SCAF{(start, end)}}}
    regionsList = {}
    for ind in summaryReader.summary.Samples:
        regionsList[ind] = {}

    with open(sys.argv[2], "r") as regions:
        for line in regions:
            line = line.rstrip()
            sline = line.split()

            if line.startswith("CHROM"):
                continue

            if len(sline) != 4:
                sys.stderr.write("Weird line:\n\t%s\n" % line)
                continue

            chrom = sline[0]
            start = int(sline[1])
            end = int(sline[2])
            ind = sline[3]

            if ind not in regionsList:
                # An individual absent from the summary cannot be used.
                sys.stderr.write(
                    "Individual %s is not in the summary, skipping line:\n\t%s\n"
                    % (ind, line))
                continue

            regionsList[ind].setdefault(chrom, []).append((start, end))

    header = summaryReader.summary.prettyHeader()
    header = header[0:header.index("DIVERGENCE") + len("DIVERGENCE")]
    print(header)

    genoReverse = {v: k for k, v in summaryReader.summary.Genotypes.items()}
    #read the infile file...
    for site in summaryReader:
        ibdInds = getUnsafeInds(site, regionsList)
        alleles = []
        for ind in site.Genotypes.keys():
            geno = site.Genotypes[ind]
            # individuals in a listed region contribute one allele, others two
            copies = 1 if ind in ibdInds else 2

            if geno == genoReverse["homozygote reference"]:
                alleles.extend([site.REF] * copies)
            elif geno == genoReverse["homozygote alternate"]:
                alleles.extend([site.ALT] * copies)
            elif geno == genoReverse["heterozygote"]:
                if copies == 1:
                    #assume ref for all IBD individuals with data
                    alleles.append(site.REF)
                else:
                    alleles.append(site.REF)
                    alleles.append(site.ALT)
            elif geno == genoReverse["unknown"]:
                alleles.extend([genoReverse["unknown"]] * copies)

        # random.sample raises ValueError when fewer than _N alleles exist.
        newAlleles = random.sample(alleles, _N)
        ref = newAlleles.count(site.REF)
        alt = newAlleles.count(site.ALT)
        site.REF_NUM = ref
        site.ALT_NUM = alt

        if site.TOTAL != 0:  #if total was 0 for some reason, like being filtered, keep it that way
            site.TOTAL = ref + alt

        site.Genos = []

        print(site)
Пример #7
0
def __main__():
    """Stream a summary file through ``processWindow`` in fixed-size windows.

    Sites are collected into windows of ``_w`` sites that never span two
    scaffolds.  Sites with ``TOTAL == _N`` are counted as safe, and a site
    that carries both a coding and a non-coding type is retyped as 'unknown'.
    Each full window, and the final partial one, is handed to
    ``processWindow`` together with the safe-site count and the optional
    region output file.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 2:
        usage()

    processArgs(2)

    args = {
        "fraction coding": _c,
        "window size": _w,
        "sample size": _N,
        "region output file": _o
    }

    sys.stderr.write("".join("#%s\t%s\n" % (arg, args[arg])
                             for arg in args.keys()))

    myReader = summary.Reader(open(sys.argv[1], "rb"))

    codingN = set(myReader.TypeToCode[t] for t in _coding
                  if t in myReader.TypeToCode.keys())
    ncodingN = set(myReader.TypeToCode[t] for t in _ncoding
                   if t in myReader.TypeToCode.keys())

    if _o:
        filteredRegions = open(_o, "w")
    else:
        filteredRegions = None

    try:
        window = []
        safe_sites = 0.0

        print(myReader.Header)
        pScaf = ""

        #read the infile file...
        for site in myReader:
            if pScaf == "":
                # first site: there is no previous scaffold yet
                pScaf = site.CHROM

            if (len(window) == _w or pScaf != site.CHROM) and window:
                processWindow(window, safe_sites, filteredRegions)
                window = []
                safe_sites = 0.0

            #check if all individuals are represented
            if site.TOTAL == _N:
                safe_sites += 1

            #check if a site is both a coding site and a noncoding site
            #if it is, make the site not safe, set type to unknown
            siteTypes = set(site.Types)
            if len(siteTypes & codingN) and len(siteTypes & ncodingN):
                site.Types = [myReader.TypeToCode['unknown']]

            window.append(site)

            pScaf = site.CHROM

        processWindow(window, safe_sites,
                      filteredRegions)  #get the last window
    finally:
        # close the region file even if a window fails to process
        if filteredRegions is not None:
            filteredRegions.close()
Пример #8
0
def _closeCNC(cncStart, cncEnd, cncDiv, cncDivTot, cncWindow):
    """Return the (start, stop, div, pi) tuple for a finished CNC stretch."""
    if cncDivTot:
        div = float(cncDiv) / cncDivTot
    else:
        div = "NA"  # no site in the stretch had a divergence call
    return (cncStart, cncEnd, div, calcPi(cncWindow))


def __main__():
    """Report each CNC stretch with its divergence, pi and gene distances.

    Consecutive sites accepted by ``isCNC`` form one stretch.  Stretches are
    collected until the next gene site (or the end of the scaffold or file)
    and then passed to ``outputCNCs`` together with the positions of the
    flanking gene sites.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 2:
        usage()

    processArgs(2)

    if _N % 2 == 0:
        folded_size = int(_N / 2 + 1)
    else:
        folded_size = int(ceil(float(_N) / 2))

    summaryReader = summary.Reader(open(sys.argv[1], "rb"))

    #read the infile file...
    lastGeneSite = None  # only count exon, 4, and 0 fold sites
    currentCNCs = []  # list of tuples (start, stop, div, pi)
    cncStart = None  #start of current stretch of CNCs
    cncDiv = 0
    cncDivTot = 0
    cncWindow = []
    pScaf = None
    pSite = None

    geneTypes = [
        summaryReader.summary.typeToCode['exon'],
        summaryReader.summary.typeToCode['4fold'],
        summaryReader.summary.typeToCode['0fold']
    ]
    cncTypes = [summaryReader.summary.typeToCode['cnc']]

    for t in _t:
        # append is a method call; subscripting it raised TypeError
        cncTypes.append(summaryReader.summary.typeToCode[t])

    print("CHROM\tCNC_START\tCNC_END\tCNC_DIV\tCNC_PI\tUP_DIST\tDOWN_DIST")

    for site in summaryReader:
        if pScaf is None:
            pScaf = site.CHROM

        if pScaf != site.CHROM:  #on next scaf, clean up
            if cncStart is not None:  #end CNC at last site
                currentCNCs.append(
                    _closeCNC(cncStart, pSite, cncDiv, cncDivTot, cncWindow))
                outputCNCs(pScaf, currentCNCs, lastGeneSite, None)
            cncStart = None
            cncWindow = []
            cncDiv = 0
            cncDivTot = 0
            currentCNCs = []
            pScaf = site.CHROM

        if isCNC(cncTypes, site.Types):
            if cncStart is None:
                cncStart = site.POS

            if site.REF_NUM + site.ALT_NUM != _N or site.TOTAL != _N:
                sys.stderr.write("Bad data at site %s %s, ignoring it.\n" %
                                 (site.CHROM, site.POS))

            cncWindow.append(site.REF_NUM)

            if site.DIVERGENCE == 1:
                cncDiv += 1
                cncDivTot += 1
            elif site.DIVERGENCE == 0:
                cncDivTot += 1
        else:
            # any non-CNC site ends the stretch that is currently open
            if cncStart is not None:
                currentCNCs.append(
                    _closeCNC(cncStart, pSite, cncDiv, cncDivTot, cncWindow))
                cncStart = None
                cncWindow = []
                cncDiv = 0
                cncDivTot = 0

            if isGene(geneTypes, site.Types):
                if currentCNCs:
                    outputCNCs(pScaf, currentCNCs, lastGeneSite, site.POS)
                    currentCNCs = []

                lastGeneSite = site.POS

        pSite = site.POS

    # End of file is the last scaffold boundary: apply the same clean-up so
    # a stretch still open on the final scaffold is not lost.
    if cncStart is not None:
        currentCNCs.append(
            _closeCNC(cncStart, pSite, cncDiv, cncDivTot, cncWindow))
        outputCNCs(pScaf, currentCNCs, lastGeneSite, None)
Пример #9
0
import sys
import summary

# Read the summary named by sys.argv[1] and stream its records through a
# buffer of recent records; sys.argv[2] gives the window size.
reader = summary.Reader(open(sys.argv[1]))

_w = int(sys.argv[2])# window size, taken positionally (a crude way to parse arguments)

print(reader.Header)

prev = []  # buffer of the most recently read records, oldest first
skip = 0
for record in reader:
    dup = False
    #sys.stderr.write("Checking %s  %s.\n"%(skip,record))
    if prev and record.POS == prev[-1].POS:# same position as the last buffered record: reached an indel
        # Mask everything currently buffered by zeroing its TOTAL
        for r in prev:
            r.TOTAL = 0
        # Is it a deletion? (REF longer than one base)
        if len(record.REF) > 1:
            skip = len(record.REF)+_w
        else:# it is an insertion
            skip = _w
        # Do NOT add this record to the list, because it is a duplicate;
        # the previous non-indel version is already in there with a total of 0
        dup = True

    if len(prev) == _w+1:# +1 because we keep one more than the window size, to throw out the duplicate site too
        print(prev.pop(0))
Пример #10
0
def __main__():
    """Add the site type ``_t`` to every summary site listed in a sites file.

    ``sys.argv[1]`` is the summary and ``sys.argv[2]`` the list of sites, read
    through ``getNextSite`` as ``(chrom, pos)`` tuples with ``(None, None)``
    marking the end.  Both inputs are walked in parallel; listed sites that
    the summary has already passed are reported as missed.  At a listed site
    the types named in ``_r`` are removed and ``_t`` is added.  At any other
    site, when ``_d`` is set, an existing ``_t`` is replaced by type 0.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 3:
        usage()

    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1], "rb"))

    if _t not in summaryReader.Types:
        intCodes = map(int, summaryReader.Codes)
        summaryReader.addCode(max(intCodes) + 1, _t)

    print(summaryReader.Header)

    try:
        sites = open(sys.argv[2], "r")
    except IOError:
        # open() raises rather than returning None on a bad file name
        sys.stderr.write("Bad sites file name: " + sys.argv[2] + "\n")
        sys.exit()

    try:
        next_site = getNextSite(sites)
        #read the infile file...
        for site in summaryReader:
            hadSites = next_site[0] is not None

            # Skip listed sites the summary has already passed.  Comparing
            # CHROM and POS together keeps this loop from running into the
            # next chromosome and discarding its sites.
            while next_site[0] is not None:
                if site.CHROM > next_site[0]:
                    sys.stderr.write("Missed a site (CHROM): " +
                                     str(next_site) + "\n")
                elif site.CHROM == next_site[0] and site.POS > next_site[1]:
                    sys.stderr.write("Missed a site (POS): " +
                                     str(next_site) + "\n")
                else:
                    break
                next_site = getNextSite(sites)

            if hadSites and next_site[0] is None:
                sys.stderr.write("Ran out of sites\n")

            if next_site[0] == site.CHROM and next_site[1] == site.POS:
                #at the site
                if _r:
                    for r in _r:
                        if summaryReader.TypeToCode[r] in site.Types:
                            site.Types.remove(summaryReader.TypeToCode[r])

                site.Types.append(summaryReader.TypeToCode[_t])
                print(site)
                next_site = getNextSite(sites)
            else:
                #not a listed site: optionally strip the type, then pass it on
                if _d and summaryReader.TypeToCode[_t] in site.Types:
                    site.Types.remove(summaryReader.TypeToCode[_t])
                    site.Types.append(0)
                print(site)
    finally:
        sites.close()
Пример #11
0
#	gffDic[i] = [0]*2000

#make a dict of all the gene locations: gffDic[scaffold number][position] = gene
with open(sys.argv[2],'r') as geneFile:
	for line in geneFile:
		scaf, gene, start, end = line.split()
		scafNum = int(scaf.split('_')[1])
		#not that worried about 0 or 1 based coordinates because we're only dealing with 0 and 4 fold sites.
		for i in range(int(start), int(end)+1):
			gffDic[scafNum][i] = gene

#print every summary site that falls inside a gene, with its GENE field set
with open(sys.argv[1],'rb') as sumFile:
	sumRead = summary.Reader(sumFile)

	sumRead.addGenes()
	print(sumRead.Header)
	for site in sumRead:
		geneName = gffDic[int(site.CHROM.split('_')[1])][site.POS]
		if geneName == 0: #not in a gene?
			sys.stderr.write("err not in gene "+site.prettyStr()+"\n")
			continue
		site.GENE = geneName
		print(site)
	

Пример #12
0
def __main__():
    """Annotate every summary site with a gene name and direction.

    ``sys.argv[1]`` is the summary and ``sys.argv[2]`` a list of sites read
    through ``getNextSite`` as 5-tuples whose items 0, 1, 3 and 4 are chrom,
    pos, gene and direction; a tuple of five ``None`` marks the end.  Both
    inputs are walked in parallel.  A summary site that matches a listed site
    gets that gene and direction; every other site gets GENE "N" and DIR "0".
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 3:
        usage()

    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1],"r"))

    with open(sys.argv[2],"r") as sites:
        summaryReader.addGenes()
        print(summaryReader.Header)

        next_site = getNextSite(sites)
        #read the infile file...
        for site in summaryReader:
            # Skip listed sites the summary has already passed.  Comparing
            # CHROM and POS together keeps this loop from running into the
            # next chromosome and discarding its sites.
            while next_site[0] is not None:
                if site.CHROM > next_site[0]:
                    sys.stderr.write("Missed a site (CHROM): %s %s\n" % (next_site[0], next_site[1]))
                elif site.CHROM == next_site[0] and site.POS > next_site[1]:
                    sys.stderr.write("Missed a site (POS): "+str(next_site)+"\n")
                else:
                    break
                next_site = getNextSite(sites)

            if next_site[0] == site.CHROM and next_site[1] == site.POS:#at the site
                site.GENE = next_site[3]
                site.DIR = next_site[4]
                print(site)
                next_site = getNextSite(sites)
            else:#no listed site here (not reached yet, or none left)
                site.GENE = "N"
                site.DIR = "0"
                print(site)
Пример #13
0
def __main__():
    """Build two haplotype sequences per sample for each annotated gene.

    ``sys.argv[1]`` is a summary with per-sample genotypes and ``sys.argv[2]``
    an annotation.  Summary sites inside the current gene's exons are appended
    to every sample's two sequences according to its genotype; when a gene's
    exons are used up the sequences are written with ``outputFasta`` and the
    next gene is fetched.  Positions between sites are handled by ``addNs``.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 3:
        usage()

    processArgs(3)

    summaryReader = summary.Reader(open(sys.argv[1], "rb"))

    annotationReader = annotation.Reader(open(sys.argv[2], "rb"))
    annotIter = iter(annotationReader)

    myGene = getNextGene(annotIter)

    if myGene is None:
        sys.stderr.write("No genes listed in annotation. Exiting.\n")
        sys.exit(0)

    if len(summaryReader.summary.Samples) == 0:
        sys.stderr.write(
            "No individual genotype info in this Summary file. Cannot make fastas.\n"
        )
        sys.exit(0)

    seqs = {}

    def resetSeqs():
        # refill the same dict in place; it is shared with addNs/outputFasta
        for samp in summaryReader.summary.Samples:
            seqs[samp] = [[], []]

    resetSeqs()

    doneScafs = set()  # scaffolds the summary has already moved past

    genoDict = {v: k for k, v in summaryReader.summary.Genotypes.items()}
    pScaf = ""
    pSite = 0
    #read the infile file...
    for site in summaryReader:
        if site.CHROM != myGene.scaf:
            while myGene is not None and myGene.scaf in doneScafs:
                sys.stderr.write(
                    "Missed gene %s on scaf %s. Make sure file is sorted correctly.\n"
                    % (myGene.name, myGene.scaf))
                myGene = getNextGene(annotIter)

            if myGene is None:
                # every remaining gene was on an already finished scaffold
                break

            resetSeqs()

            if site.CHROM != myGene.scaf:  #haven't reached this scaf yet, skip along in the summary
                continue

        if pScaf == "" or pScaf != site.CHROM:
            doneScafs.add(pScaf)
            pScaf = site.CHROM
            pSite = 0

        myGene = addNs(seqs, pSite, site.POS, myGene, annotIter)

        if myGene is None:
            break

        if site.POS < myGene.exons[0][0]:
            pass

        elif site.POS <= myGene.exons[0][1]:
            for samp, geno in site.Genotypes.items():
                if geno == genoDict['heterozygote']:
                    first, second = site.REF, site.ALT
                elif geno == genoDict['homozygote reference']:
                    first, second = site.REF, site.REF
                elif geno == genoDict['homozygote alternate']:
                    first, second = site.ALT, site.ALT
                else:
                    first, second = "N", "N"
                seqs[samp][0].append(first)
                seqs[samp][1].append(second)

        if site.POS >= myGene.exons[0][1]:
            myGene.exons.pop(0)

        if not myGene.exons:
            #print the seqs to a file
            outputFasta(myGene, seqs)
            #grab new gene
            myGene = getNextGene(annotIter)
            #check it
            if myGene is None:
                break
            #reset seqs
            resetSeqs()

        pSite = site.POS
Пример #14
0
def __main__():
    """Collect per-intron allele frequencies for each gene in a summary.

    Requires a summary with a GENE field.  For each gene, every run of
    consecutive intron sites becomes a list of ``(minor allele count, is CNC)``
    pairs, with -1 as the count where ``TOTAL != _N``.  The gene name, its
    direction and the list of introns are passed to ``processGene`` whenever
    the gene ends.
    """
    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 2:
        usage()

    processArgs(2)

    reader = summary.Reader(open(sys.argv[1], 'rb'))

    if "GENE" not in reader.summary.Fields:
        sys.stderr.write(
            "Your summary does not have gene names. Please add gene info to this summary and try again.\n"
        )
        sys.exit()

    myHeader = "GENE DIR IntronNum IntronLen Window CNCnum "
    myHeader += "".join("afs_%s " % i
                        for i in range(int(floor(_N / 2.0) + 1)))

    print(myHeader)

    currentGene = ""  # "" means "not inside a gene"
    direction = ""
    introns = []
    currentIntronAF = []
    currentIntronCNC = []
    if _v: sys.stderr.write("Reading in Summary...\n")
    for record in reader:
        if record.GENE != "N":  #we are in a gene
            if record.GENE != currentGene:
                if currentGene:  #finish off the previous gene
                    if currentIntronAF:
                        introns.append(
                            list(zip(currentIntronAF, currentIntronCNC)))
                    processGene(currentGene, direction, introns)
                # Start the new gene.  Nothing is processed when there was no
                # previous gene, so processGene never sees an empty name.
                currentGene = record.GENE
                direction = record.DIR
                introns = []
                currentIntronAF = []
                currentIntronCNC = []

            if reader.TypeToCode[_intron] in record.Types:  #in an intron
                if record.TOTAL == _N:
                    currentIntronAF.append(min(record.REF_NUM, record.ALT_NUM))
                else:  #no data
                    currentIntronAF.append(-1)

                if reader.TypeToCode[_cnc] in record.Types:  #also a CNC site
                    currentIntronCNC.append(1)
                else:  #not a CNC
                    currentIntronCNC.append(0)

            else:  #not in an intron
                if currentIntronAF:  #we have passed the previous intron
                    introns.append(
                        list(zip(currentIntronAF, currentIntronCNC)))
                    currentIntronAF = []
                    currentIntronCNC = []

        else:  #we arent in a gene
            if currentGene:
                if currentIntronAF:
                    introns.append(
                        list(zip(currentIntronAF, currentIntronCNC)))
                processGene(currentGene, direction, introns)
                currentGene = ""
                direction = ""
                introns = []
                currentIntronAF = []
                currentIntronCNC = []

    # get the last set of introns, if any -- including a gene whose only
    # intron was still open when the file ended
    if introns or currentIntronAF:
        if currentIntronAF:
            introns.append(list(zip(currentIntronAF, currentIntronCNC)))
        processGene(currentGene, direction, introns)
Пример #15
0
def __main__():
    """Compute windowed folded allele frequency spectra and bootstrap them.

    The summary is cut into regions of ``_args.window`` counted sites (a
    region also ends when the scaffold changes).  Each region stores, per
    site type, a folded AFS and a two-slot divergence tally.  The spectrum of
    the real data is written with ``output_bootstrap``, followed by
    ``_args.bootstraps`` replicates built by drawing regions with replacement.
    """
    #check arguments
    processArgs()

    summaryReader = summary.Reader(open(_args.summary, "rb"))

    # number of classes in the folded spectrum
    folded_size = (int(_args.sample_size / 2 + 1)
                   if _args.sample_size % 2 == 0 else
                   int(ceil(float(_args.sample_size) / 2)))

    regions = {}
    scaffold = None
    spectrum = resetAFS({}, summaryReader.summary.typeToCode, folded_size)
    # per type: [sites with a divergence call, diverged sites]
    divergence = resetAFS({}, summaryReader.summary.typeToCode, 2)
    counted = 0
    for site in summaryReader:
        if scaffold is None:
            scaffold = site.CHROM

        # close the running region when it is full or the scaffold changed
        if counted >= _args.window or scaffold != site.CHROM:
            regions[scaffold + "_" + str(site.POS)] = (spectrum, divergence)
            spectrum = resetAFS({}, summaryReader.summary.typeToCode,
                                folded_size)
            divergence = resetAFS({}, summaryReader.summary.typeToCode, 2)
            counted = 0
            scaffold = site.CHROM

        #not enough alleles at this locus, or total is wrong (filtered data)
        if (site.TOTAL != _args.sample_size
                or site.TOTAL != site.REF_NUM + site.ALT_NUM):
            sys.stderr.write(
                "Skipping site, either not enough alleles or the total number of alleles does not match the sum of alleles.\n\t%s\n"
                % site)
            continue

        minor = min(site.REF_NUM, site.ALT_NUM)

        #in SNP mode only sites with a minor count other than 0 or 1 advance the window
        if (minor != 0 and minor != 1) or not _args.snps:
            counted += 1

        for code in site.Types:
            if code not in spectrum.keys():
                sys.stderr.write("Weird site type encountered: \n\t" +
                                 str(site) + "\n")
                continue

            spectrum[code][minor] += 1

            #store divergence info for the window
            if site.DIVERGENCE != -1:
                divergence[code][0] += 1

            if site.DIVERGENCE == 1:
                divergence[code][1] += 1

    #the last, unfinished region is deliberately not stored: it is usually
    #not long enough to be useful

    #make sure all wanted types are actually present
    present = []
    for wanted in _args.site_types:
        if wanted in summaryReader.summary.Types.keys():
            present.append(wanted)
        else:
            sys.stderr.write(
                "Wanted type not present in codes: %s. Excluding it from analysis.\n"
                % wanted)
    _args.site_types = present

    totals = resetAFS({}, summaryReader.summary.typeToCode, 1)
    for regionAFS, _regionDiv in regions.values():
        for code in summaryReader.summary.Types.keys():
            totals[code][0] += sum(regionAFS[code])

    sys.stderr.write("Numbers of each site type in whole analysis:\n")
    for typeName, code in summaryReader.summary.typeToCode.items():
        sys.stderr.write("%s %s\n" % (typeName, totals[code][0]))

    region_names = list(regions.keys())
    num_regions = len(region_names)

    #print out the non-bootstrapped afs
    output_bootstrap(summaryReader.summary.typeToCode,
                     summaryReader.summary.Types, folded_size, region_names,
                     regions, "real")

    #perform the bootstraps
    for replicate in range(_args.bootstraps):
        # draw as many regions as exist, with replacement
        boot_regions = [choice(region_names) for _ in range(num_regions)]

        #output the AFS for this bootstrap
        output_bootstrap(summaryReader.summary.typeToCode,
                         summaryReader.summary.Types, folded_size,
                         boot_regions, regions, str(replicate))
Пример #16
0
def main():
    """Emit one allele-frequency spectrum per intron found in a summary file.

    Opens the summary named by ``args.summary`` and walks its records in
    order. Consecutive records whose ``Types`` contain the 'intron' code are
    accumulated into a single ``Intron``; the intron is handed to
    ``outputIntron`` when a non-intron record is reached, when the gene name
    changes between adjacent intron records, or when the input ends.

    The sample size ``N`` is ``Ploidy * len(Samples)`` when the summary lists
    samples, otherwise ``args.sample_size``. Only records with
    ``TOTAL == N`` contribute to the spectrum.

    Exits with status 1 (after writing a message to stderr) when no sample
    size can be determined or when the summary has no 'intron' type code.
    """
    # The reader is iterated further down, so the file has to stay open for
    # the whole run; the with-block also closes it if we bail out early.
    with open(args.summary, 'rb') as handle:
        reader = summary.Reader(handle)
        if reader.summary.Samples:
            N = reader.summary.Ploidy * len(reader.summary.Samples)
        elif not args.sample_size:
            sys.stderr.write(
                "No samples listed in this summary. You must provide a sample size with -N.\n"
            )
            sys.exit(1)
        else:
            N = args.sample_size

        def genAFS():
            # fresh zeroed spectrum with one bin per ALT count 0..N
            return [0] * (N + 1)

        if "GENE" not in reader.summary.Fields:
            sys.stderr.write(
                "The gene names are not in this summary. All gene names will be blank if you use this file.\n"
            )

        try:
            intronCode = reader.summary.typeToCode['intron']
        except KeyError:
            sys.stderr.write(
                "Summary has no 'intron' codes, you need to add them or use a different summary.\n"
            )
            sys.exit(1)

        #output header
        outputIntron(N=N)

        currentIntron = None
        for record in reader:
            if intronCode in record.Types:
                if not currentIntron:
                    #new one
                    currentIntron = Intron(record.GENE, record.CHROM,
                                           record.POS, genAFS())
                elif currentIntron.gene != record.GENE:
                    #end of previous one, we're on a new one
                    #this should really not happen
                    sys.stderr.write(
                        "Reached a new intron adjacent to an old one at %s %s, you may want to check your annotation.\n"
                        % (record.CHROM, record.POS))
                    outputIntron(N, currentIntron)
                    currentIntron = Intron(record.GENE, record.CHROM,
                                           record.POS, genAFS())

                #doesn't count AF at sites with too few alleles
                if record.TOTAL == N:
                    try:
                        currentIntron.afs[record.ALT_NUM] += 1
                    except IndexError:
                        sys.stderr.write(
                            "At %s %s there is a problem with the ALT_NUM. It is out of acceptable ranges. Is there a problem with your summary? Or with the provided sample size?\n"
                            % (record.CHROM, record.POS))
                # every intron record extends the intron, counted or not
                currentIntron.end = record.POS
            elif currentIntron:
                #we reached the end on this one
                outputIntron(N, currentIntron)
                currentIntron = None

        #get the last one
        if currentIntron:
            outputIntron(N, currentIntron)
Пример #17
0
def __main__():
    """Print windowed counts of the requested site types from a summary file.

    Reads the summary named by ``sys.argv[1]`` and, for every type in the
    module-level ``_t`` that exists in the summary's ``typeToCode`` table,
    records a 1/0 flag per site. The accumulated flags are passed to
    ``printWindow`` whenever the chromosome changes, whenever the current
    site is at least ``_w`` positions past the window start, and once more
    at the end of the input. ``_t`` is replaced by the list of valid type
    codes.

    ``_t`` and ``_w`` are presumably populated by ``processArgs`` -- confirm
    against that function.

    Exits early when no arguments are given or when none of the requested
    types exist in the summary.
    """
    global _t

    #check arguments
    if len(sys.argv) == 1:
        details()
        sys.exit()

    if len(sys.argv) < 2:
        usage()

    processArgs(2)

    # The reader is iterated below, so keep the file open until we are done;
    # the with-block also closes it on the early sys.exit path.
    with open(sys.argv[1], "rb") as handle:
        summaryReader = summary.Reader(handle)
        typeToCode = summaryReader.summary.typeToCode

        safeT = []
        myCounts = {}

        for t in _t:
            if t not in typeToCode:
                sys.stderr.write(
                    "Type %s not found in summary. It will be ignored. The valid options are: %s.\n"
                    % (t, typeToCode.keys()))
            else:
                safeT.append(typeToCode[t])
                myCounts[typeToCode[t]] = []

        _t = safeT

        if len(_t) == 0:
            sys.stderr.write(
                "No valid types to count were provided. Use the -t option to specify sites of interest.\n"
            )
            sys.exit(0)

        start = None  # position of the first site in the current window
        pScaf = None  # chromosome of the current window
        pSite = None  # position of the previously processed site

        print("TYPE\tCHROM\tMIDPOINT\tCOUNT")

        #read the infile file...
        for site in summaryReader:
            if start is None:
                start = site.POS
                pScaf = site.CHROM
                pSite = site.POS

            if pScaf != site.CHROM:
                # chromosome changed: flush the window that ended at pSite
                printWindow(myCounts, summaryReader.summary,
                            (pSite + start) / 2.0, pScaf)
                for t in _t:
                    myCounts[t] = []
                start = site.POS
                pScaf = site.CHROM

            if site.POS - start >= _w:
                printWindow(myCounts, summaryReader.summary,
                            (site.POS + start) / 2.0, pScaf)
                for t in _t:
                    myCounts[t] = []
                start = site.POS

            for t in _t:
                myCounts[t].append(1 if t in site.Types else 0)

            pSite = site.POS

        # A summary without any sites leaves nothing to report (and no
        # positions to average), so only flush when something was read.
        if start is not None:
            # midpoint of the trailing window, same formula as the flushes
            # inside the loop
            printWindow(myCounts, summaryReader.summary,
                        (pSite + start) / 2.0, pScaf)