def main(): parser = OptionParser(prog="pyRAD", usage="%prog [options]", version="%prog 3.0.6") parser.add_option('-p', action="store", type="string", dest="params", help="input file for within sample filtering and clustering\n") parser.add_option('-s', action="store", dest="steps", help="""perform step-wise parts of within analysis\n 1 = barcode sorting \ 2 = filter/edit raw sequences \ 3 = within-sample clustering \ 4 = estimate pi and e \ 5 = consensus calling \ 6 = cluster consensus \ 7 = align & create output files """ ) parser.add_option('-d', action="store", type="string", dest="dtest", help="""input file for D-test of introgression, can iterate over multiple samples """ ) parser.add_option('-n', action="store_true", dest="newparamsfile", help="""creates a new empty input params.txt file """ ) parser.add_option('-D', action="store_true", dest="newDtestfile", help="""creates a new empty Dtest input file """ ) (options, args) = parser.parse_args() if not any([options.params,options.dtest,options.newparamsfile,options.newDtestfile]): print "\n\tmust include option of -p, -d, -D or -n\n" sys.exit() if options.params: sys.stderr.write('\n\n'+' '*5+'---'*20+'\n'+\ ' '*6+'pyRAD : RADseq for phylogenetics & introgression analyses\n'+\ ' '*5+'---'*20+'\n\n') readin = [line.strip().split('##')[0].strip() for line in open(options.params).readlines()] if "==** " not in str(readin[0]): print "\n\twarning: update params input file format to latest version\n"; sys.exit() WORK = str(readin[1]) GLOB = str(readin[2]) Bcode = str(readin[3]) vsearch = str(readin[4]) muscle = str(readin[5]) CUT = str(readin[6]) parallel = int(readin[7]) mindepth = int(readin[8]) pN = str(readin[9]) wclust = str(readin[10]) datatype = str(readin[11]) minsamp = int(readin[12]) maxpoly = str(readin[13]) outname = str(readin[14]) ########################### ## 15 is separator line ########################### subset = str(readin[16]) outgroup = str(readin[17]) exclude = str(readin[18]) Floc = str(readin[19]) try: maxmismatch = int(readin[20]) except (ValueError,IndexError): maxmismatch = 1 try: Q = int(readin[21]) except (ValueError,IndexError): Q = 33 try: strict = int(readin[22]) except (ValueError, IndexError): strict = 0 try: E,H = str(readin[23]).strip().split(",") except ValueError: E = ""; H = "" try: maxN = int(readin[24]) except ValueError: maxN = 5 try: maxH = int(readin[25]) except ValueError: maxH = 5 try: haplos = int(readin[26]) except ValueError: haplos = 2 maxSNP = str(readin[27]) if maxSNP == "": maxSNP = "99" max_inserts = str(readin[28]) if max_inserts == "": max_inserts = "3" try: seed = int(readin[29]) except ValueError: seed = 112233 try: overhang = [int(i) for i in str(readin[30]).strip().split(',')] except (ValueError,IndexError): overhang = [0,0] try: outform = str(readin[31]) except (ValueError,IndexError): outform = "" try: lowcounts = int(readin[32]) except (ValueError, IndexError): lowcounts = mindepth ##mergepairs = str(readin[31]) ##if mergepairs in [0,""]: mergepairs = 0 try: trimkeep = int(readin[33]) except ValueError: trimkeep = 0 try: maxstack = int(readin[34]) except ValueError: maxstack = "2SD" try: minuniq = int(readin[35]) except ValueError: minuniq = 0 try: hierarch = int(readin[36]) except ValueError: hierarch = 0 try: MASK = int(readin[37]) except ValueError: MASK = 'dust' if MASK == 1: MASK='dust' else: MASK='none' try: threads = int(readin[38]) except ValueError: threads = 6 ############################### ## 39 is separator line ############################### try: clustprefix = 
readin[40:] except IndexError: clustprefix = "" clustprefix = [i for i in clustprefix if i] """ expand ./ ~ and ../ designators in location names """ def expander(namepath): if "~" in namepath: namepath = namepath.replace("~",os.path.expanduser("~")) if "../" in namepath: a,b = namepath.split("../") namepath = os.path.abspath(os.path.join(os.path.dirname( "" ), '..', b)) elif "./" in namepath: a,b = namepath.split("./") namepath = os.path.abspath("")+"/"+b return namepath if WORK == "": WORK = os.path.abspath("")+"/" else: WORK = expander(WORK) if WORK[-1] != "/": WORK = WORK+"/" stripped = 0 if Floc: if Floc[0] == "@": stripped = 1 Floc = expander(Floc[1:]) else: Floc = expander(Floc) if GLOB: GLOB = expander(GLOB) if Bcode: Bcode = expander(Bcode) if vsearch: vsearch = expander(vsearch) if options.dtest: options.dtest = expander(options.dtest) """ find location of vsearch (or usearch) and muscle """ def cmd_exists(cmd): return subprocess.call("type " + cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 # " check platform: mac v linux " # if 'linux' in sys.platform: # vsearch = "vsearch-1.0.3-linux-x86_64" # else: # vsearch = "vsearch-1.0.3-mac-x86_64" # " find vsearch and muscle in user's lib/" # PYRADPATH = os.path.dirname(os.path.realpath(__file__)) # vsearch = PYRADPATH+"/lib/"+vsearch # muscle = PYRADPATH+"/lib/muscle" " threads = 1 for usearch" if 'vsearch' not in vsearch: threads = 1 if not cmd_exists(vsearch): print "\tcannot find vsearch (or usearch), edit path in param file" sys.exit() if not cmd_exists(muscle): print "\tcannot find muscle, edit path in input file" sys.exit() """ expand clustprefix cluster groups """ gids = [] groups = [] minhits = [] "hierarchical clustering " for line in clustprefix: gid, hits, inds = line.strip().split() gids.append(gid) minhits.append(hits) if "," in inds: thisgroup = [] ii = inds.split(",") for i in ii: if "*" in i: expanded = glob.glob(WORK+"clust"+wclust+"/"+i+".consens*") [thisgroup.append(i) for i in expanded] else: thisgroup.append(WORK+"clust"+wclust+"/"+i+".consens.gz") groups.append(thisgroup) else: if "*" in inds: expanded = glob.glob(WORK+"clust"+wclust+"/"+inds+".consens*") groups.append(expanded) else: inds = inds.split(",") groups.append([WORK+"clust"+wclust+"/"+i+".consens.gz" for i in inds]) "TODO check for size=1 " if not gids: gids = "" " step of the analysis " k = tuple('1234567') if options.steps: k = tuple(str(options.steps)) " check that the data type was entered correctly " datopts = ['rad','gbs','ddrad','pairgbs','pairddrad','merged','2brad'] if datatype not in datopts: print "\t datatype argument (line 11) not recognized " sys.exit() # if datatype == 'merged': # print "specify mergetype in params file, ex: mergeddrad or mergegbs " # sys.exit() " parse max_inserts argument " w1=3 w2=6 a1=a2=99 if 'pair' in datatype: if "," in max_inserts: wargs = max_inserts.strip().split(",") if len(wargs) == 2: w1 = w2 = wargs[0] a1 = a2 = wargs[1] elif len(wargs) == 4: w1,w2,a1,a2 = wargs else: print "\n\tmax_inserts parameter not recognized. 
see documentation" sys.exit() else: if "," in max_inserts: w1,a1 = map(int,max_inserts.split(",")) ######### Begin analysis ################################################### if '1' in k: " expand Barcode file name if necessary " if "*" in Bcode: try: Bcode = glob.glob(Bcode)[0] except IndexError: print "\tcould not find barcodes file ",Bcode, "\n\tcomment out line 3 of params file or edit path to barcodes file" sys.exit() if Floc: print "\tskipping step 1: line 18 of input file shows seqs already sorted" else: " if directory as input select all inside" if GLOB: if GLOB[-1] == "/": GLOB = GLOB+"*" sortandcheck2.main(Bcode,GLOB,CUT,datatype,parallel,maxmismatch,WORK) ### step 2 ################### if '2' in k: if Floc: print >>sys.stderr, "\tsorted .fastq from %s being used" % Floc if len(glob.glob(Floc))<1: sys.stderr.write("\t... no files found in line 18 location, check required file name formatting\n") sys.exit() FQs = Floc if stripped: print "\tbarcode & restriction site are already stripped off of sequences" CUT = "" if strict: print "\tApplying step 2 filter (param 19) is not recommended for data that is stripped (w/ @) \n" else: " default location " FQs = WORK+"fastq/"+subset+"*.fq.gz" " if directory as input select all inside" if FQs[-1] == "/": FQs = FQs+"*" " if not paired filter only read 1 " if 'pair' not in datatype: # in ['rad','ddrad','gbs','merged','2brad']: editraw_rads.main(parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype) else: #elif datatype in ['pairddrad','pairgbs']: " check for both CUT sites in pairddrad" if datatype == 'pairddrad': if "," not in CUT: print "\n\tyou must enter two restriction sites for pair ddRAD data" sys.exit() editraw_pairs.main(parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype) #elif "merge" in datatype: # editraw_merges.main(parallel, WORK, FQs, CUT, # pN, Q, strict, trimkeep) ### step 3 #################### if '3' in k: cluster7dp.main(WORK, parallel, wclust, mindepth, subset, datatype, w1, w2, minuniq, MASK, muscle, vsearch, threads, remake=0) ### step 4 #################### if '4' in k: " if using low depth option still use a reasonable limit for parameter estimates" if mindepth < 5: tempmindepth = 5 else: tempmindepth = mindepth H_err_dp.main(parallel, wclust, tempmindepth, subset, haplos, WORK, CUT, datatype) ### step 5 #################### if '5' in k: if not E: try: Pi = open(WORK+"stats/Pi_E_estimate.txt").readlines() except IOError: Pi = "" if Pi: El = [] Hl = [] for line in Pi[1:]: try: _,h,e = line.strip().split("\t") except IndexError: None Hl.append(float(h)) El.append(float(e)) if len(Hl) == 0: print "\n\terror in step 4, no estimates in file stats/Pi_E_estimate.txt" sys.exit() H = sum(Hl)/len(Hl) E = sum(El)/len(El) else: E = 0.001 H = 0.01 print "\n\tstep 4 values not detected, using E=0.001, H=0.01" if 'pair' in datatype: " call consensus on each pair separately " consens_pairs.main(parallel, float(E), float(H), wclust, mindepth, subset+"*", maxN, maxH, haplos, CUT, datatype, lowcounts, strict, WORK, maxstack) else: " call consensus on single end clusters " consensdp.main(parallel, float(E), float(H), wclust, mindepth, subset+"*", maxN, maxH, haplos, CUT, datatype, lowcounts, strict, WORK, maxstack) ### step 6 #################### if '6' in k: if not hierarch: gids = "" if "," in subset: inlist = [WORK+"clust"+wclust+"/"+i+".consens*" for i in subset.strip().split(",")] else: inlist = glob.glob(WORK+"clust"+wclust+"/"+subset+"*.consens*") cluster_cons7_shuf.main(vsearch, wclust, datatype, outgroup, seed, 
gids, minhits, inlist, WORK, MASK, 0) print "\n\tfinished clustering" else: """ re-expand clustprefix cluster groups in case no -s """ Hgids = [] Hgroups = {} Hminhits = [] "hierarchical clustering " for line in clustprefix: Hgid, Hhits, Hinds = line.strip().split() Hgids.append(Hgid) Hminhits.append(Hhits) Hgroups[Hgid] = [] if "," in Hinds: Hinds = Hinds.split(",") for Hind in Hinds: if "*" in Hind: expanded = glob.glob(WORK+"clust"+wclust+"/"+Hind+".consens*") Hgroups[Hgid] += expanded #.append(expanded) else: Hgroups[Hgid].append(WORK+"clust"+wclust+"/"+Hind+".consens.gz") else: if "*" in Hinds: expanded = glob.glob(WORK+"clust"+wclust+"/"+Hinds+".consens*") Hgroups[Hgid] += expanded #.append(expanded) else: Hgroups[Hgid].append(WORK+"clust"+wclust+"/"+Hinds+".consens.gz") for i,j in zip(Hgids,Hminhits): for cons in Hgroups[i]: if cons not in glob.glob(WORK+"clust"+wclust+"/*.consens.gz"): print "\n\tsample name",cons,"in group",i,"does not match any filenames" sys.exit() preclusts = [] for i in Hgroups.values(): preclusts += i for cons in glob.glob(WORK+"clust"+wclust+"/*.consens.gz"): if cons not in preclusts: print "\n\twarning: sample",cons,"not assigned to a cluster group" #if not gids: # gids = "" " make prefix directory " if not os.path.exists(WORK+'prefix/'): os.makedirs(WORK+'prefix') ########### TODO #################################### # if os.path.exists(WORK+"prefix/cat.clust_.gz"): # print "\tRemaking clusters from existing clustprefix files "+\ # "using minmatches: ",minmatch # print "\t(To completely re-start hierarchical clustering delete the prefix/ directory)\n" # # for (gid,minhit,inlist) in zip(gids,minhits,groups): # handle = WORK+"clust"+wclust+"/cat.haplos_"+gid # #cluster_cons7_shuf.makeclust(handle, datatype, pre, pre, minm, WORK, 1) # #tier2clust.makeclust(wclust, datatype, WORK) ####################################################### " queue up jobs " work_queue = multiprocessing.Queue() result_queue = multiprocessing.Queue() " submit jobs " for (Hgid,Hminhit) in zip(Hgids,Hminhits): inlist = Hgroups[Hgid] work_queue.put([vsearch, wclust, datatype, outgroup, seed, Hgid, Hminhit, inlist, WORK, MASK, 1 ]) " execute first tier jobs " jobs = [] for i in range(parallel): worker = Worker(work_queue, result_queue, cluster_cons7_shuf.main) jobs.append(worker) worker.start() for j in jobs: j.join() " cluster second tier " tier2clust.main(vsearch, wclust, datatype, Hgids, seed, WORK, MASK) print "\n\tfinished clustering\n" " cleanup " #for ff in glob.glob(WORK+"clust"+wclust+"/cat.consens_*.gz"): # os.remove(ff) #for ff in glob.glob(WORK+"clust"+wclust+"/cat.u*"): # os.remove(ff) if '7' in k: if minsamp < 2: print "\n\tminimum minCov setting is <2: changing to 2" minsamp = 2 if gids: inclustfile = WORK+"prefix/cat.clust_.gz" else: inclustfile = WORK+'clust'+wclust+"/cat.clust_.gz" if not os.path.exists(inclustfile): #sys.stderr.write("\n\t didn't find hierarchically clustered subset: \n\t"+inclustfile) #sys.stderr.write("\n\t looking for default full cluster file") if os.path.exists(WORK+'clust'+wclust+"/cat.clust_.gz"): inclustfile = WORK+'clust'+wclust+"/cat.clust_.gz" sys.stderr.write("\n\tCluster input file: using \n\t"+inclustfile+"\n\n") else: print "\tnot found" #print "\tcat.clust_ file is selected based on line 15 subset argument " #print "\n\t if you wish to exclude samples from an existing cat.clust file "+\ # "\n\t in your output alignments list exclude names on line 17 of the params file.\n " sys.exit() #if any([i in outform for i in ['t','m']]): # if 
gids: # print "\tgroups for 't' or 'm' outputs:", gids taxadict = OrderedDict(zip(gids,groups)) alignable.main(outgroup, minsamp, outname, inclustfile, maxpoly, parallel, maxSNP, muscle, exclude, overhang, outform, WORK, gids, CUT, a1, a2, datatype, subset, parser.version.split(" ")[1], mindepth, taxadict, minhits, seed, haplos) if '8' in k: cluster7dp.main(WORK, parallel, wclust, mindepth, subset, datatype, w1, w2, minuniq, MASK, muscle, vsearch, threads, remake=1) if options.dtest: readin = [line.strip() for line in open(options.dtest).readlines()] nboots = int(readin[0].split("##")[0].strip()) alignfile = str(readin[1].split("##")[0].strip()) outfile = str(readin[2].split("##")[0].strip()) ntax = str(readin[3].split("##")[0].strip()) nproc = int(readin[4].split("##")[0].strip()) makesort = int(readin[5].split("##")[0].strip()) makeboots = int(readin[6].split("##")[0].strip()) tests = [] for line in readin[8:]: if line: notes = "" if "##" in line: tax,notes = line.strip().split("##")[0], line.strip().split("##")[-1], if tax: tests.append([tax.strip().split(), notes.strip()]) #.split("\t"),notes.strip()]) else: tests.append(line.strip().split()) # "\t")) if ntax == '4': Dtest.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots) elif ntax == 'part': Dtest_5.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots) elif ntax == 'foil': Dtest_foil.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots,0) elif ntax == 'foilalt': Dtest_foil.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots,1) else: print "error in input file" if options.newparamsfile: if os.path.exists("./params.txt"): print "\tfile params.txt already exists" sys.exit() else: createfile.main(parser.version.split(" ")[1]) if options.newDtestfile: outstring = """200 ## N bootstrap replicates test.loci ## loc/path to input .loci file dstats/test1_res ## output file path/name (no suffix) 4 ## which test: 4,part,foil,foilalt 2 ## N cores (execute jobs [lines below] in parallel 0 ## output ABBA/BABA loci to files (0=no,1,2=verbose) 0 ## output bootstrap Ds to files (0=no,1=yes) -----------------------------------------------------------\n""" sys.stdout.write(outstring)
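## Illustrative sketch (not part of pyRAD itself): the -d input file written by the
## -D template above is read back in main() with the same "##" comment-stripping
## used for params.txt. A minimal standalone reader under that assumption could
## look like this; the helper name _read_dtest_input is hypothetical.
def _read_dtest_input(path):
    "return a header dict and the list of taxon test rows from a D-test input file"
    lines = [l.rstrip("\n") for l in open(path).readlines()]
    getval = lambda line: line.split("##")[0].strip()
    header = {"nboots":    int(getval(lines[0])),
              "alignfile": getval(lines[1]),
              "outfile":   getval(lines[2]),
              "ntax":      getval(lines[3]),      ## one of: 4, part, foil, foilalt
              "nproc":     int(getval(lines[4])),
              "makesort":  int(getval(lines[5])),
              "makeboots": int(getval(lines[6]))}
    tests = []
    for line in lines[8:]:                         ## taxon rows follow the separator line
        if line.strip():
            tests.append(getval(line).split())
    return header, tests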
def writefunc(GLOB, Parallel, Bcode, CUT, datatype, maxmismatch, WORK): "create barcode dictionary" codetable = open(Bcode, 'r') codes = [line.strip().split() for line in codetable.readlines()] C = {} for line in codes: if line[0]: C[line[1].strip().upper()] = line[0] " find longest barcode " keylens = map(len, C.keys()) if len(set(keylens)) == 1: longB = (keylens[0], 'same') else: longB = (max(keylens), 'diff') " check for CUT in barcodes " CCC = unambig(CUT) if len(CCC) > 1: for cut in CCC: if any([cut in i for i in C.keys()]): print "\n\twarning: CUT site matches within one of the barcodes, "+\ "I suggest double \n\tchecking the file to make sure it properly demultiplexes" else: if any([CUT in i for i in C.keys()]): print "\n\twarning: CUT site matches within one of the barcodes, "+\ "I suggest double \n\tchecking the file to make sure it properly demultiplexes" " read in sequence files " if len(glob.glob(GLOB)) > 1: FS = [f for f in glob.glob(GLOB)] else: FS = glob.glob(GLOB) if 'pair' in datatype: Raws = combinefiles(GLOB) else: Raws = FS "send jobs to multiprocess queue" num = 0 work_queue = multiprocessing.Queue() submitted = 0 for fs in Raws: if 'pair' in datatype: work_queue.put([ C, [fs[0], fs[1]], CUT, datatype, num, maxmismatch, WORK, longB ]) submitted += 1 else: work_queue.put( [C, fs, CUT, datatype, num, maxmismatch, WORK, longB]) submitted += 1 num += 1 result_queue = multiprocessing.Queue() "spawn workers, give function" jobs = [] for i in range(min(Parallel, submitted)): worker = Worker(work_queue, result_queue, barmatch) worker.start() jobs.append(worker) for job in jobs: job.join() Ms = {} if len(glob.glob(WORK + "fastq/.*.pickle")) > 1: for pick in glob.glob(WORK + "fastq/.*.pickle"): pickin = open(pick, "rb") M = pickle.load(pickin) pickin.close() for key in M: if key not in Ms: Ms[key] = M[key] else: Ms[key] += M[key] os.remove(pick) elif len(glob.glob(WORK + "fastq/.*.pickle")) == 1: pick = glob.glob(WORK + "fastq/.*.pickle")[0] pickin = open(pick, 'rb') Ms = pickle.load(pickin) pickin.close() os.remove(pick) else: print "\nno stats file generated" Mkeys = Ms.keys() Mkeys.sort(key=lambda x: Ms[x], reverse=True) statout = open(WORK + "stats/s1.sorting.txt", 'a') statout.write("\n\n") statout.write("sample\ttrue_bar\tobs_bars\tN_obs\n") Cnames = C.keys() Cnames.sort() try: maxl = max(map(len, map(str, Ms.values()))) except ValueError: maxl = 5 hits = [] for bar in Cnames: for barcode in Mkeys: if matching(bar, barcode, maxmismatch): print >> statout, "%s \t%s \t%s\t%s" % ( C[bar], bar, barcode, str(Ms[barcode]) + " " * (maxl + 3 - len(str(Ms[barcode])))) hits.append(barcode) statout.write("\n") maxl = max(map(len, Mkeys)) for barcode in Mkeys: if barcode not in hits: print >> statout, "nomatch \t%s \t%i" % ( barcode + " " * (maxl + 3 - len(barcode)), Ms[barcode]) statout.close()
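## The s1.sorting.txt stats block above relies on a matching() helper (defined
## elsewhere in this module) to decide whether an observed barcode should be
## counted toward a true barcode given maxmismatch. A minimal sketch of such a
## check, assuming a simple per-site mismatch count with a length guard, is shown
## below; the real helper may differ in detail.
def matching_sketch(barcode1, barcode2, maxmismatch):
    "return True if the two barcodes differ at no more than maxmismatch sites"
    if len(barcode1) != len(barcode2):
        return False
    mismatches = sum(1 for a, b in zip(barcode1, barcode2) if a != b)
    return mismatches <= maxmismatch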
def main(WORK, parallel, wclust, mindepth, subset, datatype, w1, w2, minuniq, MASK, muscle, vsearch, threads, remake): " find .edit files in edits/ directory " if not os.path.exists(WORK+'edits/'): print "\terror: could not find edits/ folder in working directory" sys.exit() " make output folder for clusters" if not os.path.exists(WORK+'clust'+wclust): os.makedirs(WORK+'clust'+wclust) outfolder = WORK+'clust'+str(wclust) if not os.path.exists(WORK+'stats'): os.makedirs(WORK+'stats') " remake option... in development" if remake: for ufile in glob.glob(outfolder+"/*.u"): infile = open(ufile).readlines() cmd = "/bin/sed '$d' < " + ufile + " > tempfile" os.system(cmd) cmd = "/bin/mv "+ufile+" "+ufile+".backup" os.system(cmd) cmd = "/bin/mv tempfile "+ufile os.system(cmd) FS = [] " if not only 1 sample " if len(glob.glob(WORK+"edits/"+subset+"*.edit*")) > 1: for f in glob.glob(WORK+"edits/"+subset+"*.edit*"): " append files to list if not already clustered or empty" if not os.path.exists(outfolder+"/"+f.replace(".edit",".clustS.gz")): size = os.stat(f) if size.st_size > 0: FS.append(f) else: print "excluding "+str(f)+" file is empty" else: print f.replace(".edit",".clustS")+" already exists" " arranges files by decreasing size for fast clustering order" for i in range(len(FS)): statinfo = os.stat(FS[i]) FS[i] = FS[i],statinfo.st_size FS.sort(key=operator.itemgetter(1), reverse = True) FS = [i[0] for i in FS] elif len(glob.glob(WORK+"edits/"+subset+"*.edit*")) == 1: f = glob.glob(WORK+"edits/"+subset+"*.edit*") size = os.stat(f[0]) if size.st_size > 0: FS = f else: print "excluding "+f[0]+" file is empty" else: print "\tNo .edit files found in edits/ dir." sys.stderr.write("\n\tde-replicating files for clustering...\n") """ do not split big files if using 64-bit Usearch, or if using Vsearch, else do it to avoid 4GB limit of 32-bit usearch""" if "vsearch" not in vsearch: print '\n\tsplitting big files' splitbigfilesforderep(FS, vsearch, datatype, minuniq) " load work queue" work_queue = multiprocessing.Queue() result_queue = multiprocessing.Queue() " perform function 'final' on files in FS list " submitted = {} fileno = 1 if not remake: if threads == 0: nthreads = 'all' else: nthreads =threads np = min(parallel,len(FS)) sys.stderr.write("\n\tstep 3: within-sample clustering of "+\ `len(FS)`+" samples at \n\t "+`wclust`+\ " similarity. 
Running "+`np`+" parallel jobs\n\t"+\ " \twith up to "+`nthreads`+" threads per job."+\ " If needed, \n\t\tadjust to avoid CPU and MEM limits\n\n") else: sys.stderr.write("\n\tstep 3: rebuilding clusters from unfinished step 3 files\n") for handle in FS: if outfolder+"/"+handle.split("/")[-1].replace(".edit",".clustS.gz") not in glob.glob(outfolder+"/*"): work_queue.put([vsearch,outfolder,handle,wclust,mindepth, parallel,muscle,datatype,fileno, w1, w2, WORK, minuniq, MASK, threads, remake]) submitted[handle] = 1 fileno += 1 else: print "\tskipping "+handle.split("/")[-1].replace(".edit",".clustS.gz")+\ ' already exists in '+WORK+outfolder.split("/")[-1] " create a queue to pass to workers to store the results" jobs = [] for i in range( min(submitted,parallel) ): worker = Worker(work_queue, result_queue, final) jobs.append(worker) worker.start() for j in jobs: j.join() " output statistics on depth of coverage" outstats = open(WORK+"stats/s3.clusters.txt",'a') print >>outstats, '\n'+'\t'.join(['taxa','total','dpt.me', 'dpt.sd','d>'+`mindepth-1`+'.tot', 'd>'+`mindepth-1`+'.me', 'd>'+`mindepth-1`+'.sd', 'badpairs']) RES = [] HISTO = [] #for ff in glob.glob(outfolder+"/.temp.*"): for ff in FS: end = ff.split("/")[-1].replace(".edit","") ff = outfolder+"/.temp."+end if os.path.exists(ff): line = open(ff).readlines() RES.append(line[0].strip().split("\t")) HISTO.append([line[0].split("\t")[0],"".join(line[1:])]) os.remove(ff) RES.sort(key=lambda x:x[0]) HISTO.sort(key=lambda x:x[0]) for i in RES: print >>outstats, "\t".join(i) print >>outstats, """ ## total = total number of clusters, including singletons ## dpt.me = mean depth of clusters ## dpt.sd = standard deviation of cluster depth ## >N.tot = number of clusters with depth greater than N ## >N.me = mean depth of clusters with depth greater than N ## >N.sd = standard deviation of cluster depth for clusters with depth greater than N ## badpairs = mismatched 1st & 2nd reads (only for paired ddRAD data)\n\nHISTOGRAMS\n """ for i in HISTO: print >>outstats, "sample: "+i[0]+"\n"+i[1] outstats.close() for handle in FS: nothere = 0 try: submitted[handle] except KeyError: nothere = 1 if not nothere: if submitted[handle]: if os.path.exists(outfolder+"/"+handle.split("/")[-1].replace(".edit",".clust.gz")): cmd = "/bin/rm "+outfolder+"/"+handle.split("/")[-1].replace(".edit",".clust.gz") subprocess.call(cmd, shell=True)
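## Each step above fills a multiprocessing work queue with per-sample argument
## lists and drains it with Worker processes. Worker is defined elsewhere in
## pyRAD; a minimal sketch of the pattern it implements (pull an argument list,
## apply the target function, store the result) might look like the following.
## The class name WorkerSketch is used here to avoid shadowing the real class.
import multiprocessing
import Queue   ## "queue" in Python 3

class WorkerSketch(multiprocessing.Process):
    "apply func to argument lists taken from work_queue until the queue is empty"
    def __init__(self, work_queue, result_queue, func):
        multiprocessing.Process.__init__(self)
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.func = func

    def run(self):
        while True:
            try:
                args = self.work_queue.get(block=False)
            except Queue.Empty:
                break
            self.result_queue.put(self.func(*args))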
def multiproc_it(tests, alignfile, outfile, nboots, nproc, namelen, makesort, makeboots): " submit jobs to processors " work_queue = multiprocessing.Queue() result_queue = multiprocessing.Queue() submitted = 0 Notes = [] for rep in tests: notes = "" if len(rep) == 2: rep,notes = rep p1,p2,p3,o = rep if any(["[" in i for i in rep]): p1 = p1[1:-1].split(",") p2 = p2[1:-1].split(",") p3 = p3[1:-1].split(",") o = o[1:-1].split(",") taxalist = list(itertools.chain(*[p1+p2+p3+o])) if checktaxa(taxalist,alignfile): work_queue.put([alignfile,[p1,p2,p3,o],nboots,1, submitted]) submitted += 1 else: print 'a taxon name was found that is not in the sequence file' else: if checktaxa([p1,p2,p3,o],alignfile): work_queue.put([alignfile,[p1,p2,p3,o],nboots,0, submitted]) submitted += 1 else: print 'a taxon name was found that is not in the sequence file' Notes.append(notes) jobs = [] for i in range(nproc): worker = Worker(work_queue, result_queue, runtest) jobs.append(worker) worker.start() for j in jobs: j.join() " read results back in " #Results = [result_queue.get() for i in range(submitted)] Results = [pickle.load(open(".save.D4temp"+str(i),'rb')) for i in xrange(submitted)] Results.sort(key = lambda x:x[8]) "setup results file " outs = open(outfile+".D4.txt", 'w') header = "\t".join([ 'P1'+" "*(namelen[0]-2), 'P2'+" "*(namelen[1]-2), 'P3'+" "*(namelen[2]-2), 'O'+" "*(namelen[3]-1), 'D','std(D)','Z', 'BABA','ABBA', 'nloci','nboot','pdisc', 'notes']) print >>outs, header for i in range(len(Results)): ps,D,STD,Z,nloci,ABBA,BABA,pdisc,sub,ABBAloci,BABAloci,boots = Results[i] ps = [str(x).replace("['","[").replace("']","]").replace("', '",",").replace(">","") for x in ps] print >>outs, "%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.2f\t%s" % (ps[0]+" "*(namelen[0]-len(ps[0])), ps[1]+" "*(namelen[1]-len(ps[1])), ps[2]+" "*(namelen[2]-len(ps[2])), ps[3]+" "*(namelen[3]-len(ps[3])), D,STD,Z, BABA,ABBA, nloci,nboots, pdisc,Notes[i]) loci = open(alignfile).read().strip().split("|")[:-1] if makesort: makesortfiles('ABBA',ABBAloci,4,loci,outfile,makesort,sub,ps) makesortfiles('BABA',BABAloci,4,loci,outfile,makesort,sub,ps) if makeboots: with open(outfile+"_"+str(sub+1)+".boots",'w') as out: out.write(",".join(map(str,boots))) for oldpickle in glob.glob(".save.D4temp*"): os.remove(oldpickle)
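## runtest() (not shown here) produces the D, std(D), and Z columns written above
## for each four-taxon test. For reference, the standard ABBA-BABA statistic those
## columns report is D = (ABBA - BABA) / (ABBA + BABA), with Z = |D| / std(D) and
## std(D) taken from bootstrap resampling over loci. The sketch below illustrates
## that calculation only; count_abba_baba is a hypothetical helper returning the
## (ABBA, BABA) site counts for a set of loci, and this is not the exact code path
## used by runtest().
import random

def d_stat(abba, baba):
    "four-taxon D statistic from ABBA/BABA site counts"
    return (abba - baba) / float(abba + baba) if (abba + baba) else 0.0

def bootstrap_z(loci, nboots, count_abba_baba):
    "return (D, std(D), Z) with std(D) estimated by resampling loci with replacement"
    abba, baba = count_abba_baba(loci)
    observed = d_stat(abba, baba)
    boots = []
    for _ in range(nboots):
        resample = [random.choice(loci) for _ in loci]
        boots.append(d_stat(*count_abba_baba(resample)))
    mean = sum(boots) / float(len(boots))
    std = (sum((b - mean)**2 for b in boots) / float(len(boots)))**0.5
    return observed, std, (abs(observed) / std if std else 0.0)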
def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots,noterminals): work_queue = multiprocessing.Queue() result_queue = multiprocessing.Queue() submitted = 0 Notes = [] for rep in subtests: notes = "" if len(rep) == 2: rep,notes = rep p1,p2,p3a,p3b,o = rep if all(["[" in i for i in rep[1:]]): p1 = p1[1:-1].split(",") p2 = p2[1:-1].split(",") p3a = p3a[1:-1].split(",") p3b = p3b[1:-1].split(",") o = o[1:-1].split(",") if checktaxa([p1,p2,p3a,p3b,o],alignfile): work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted, noterminals]) submitted += 1 else: print 'a taxon name was found that is not in the sequence file' else: if checktaxa([p1,p2,p3a,p3b,o],alignfile): work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted, noterminals]) submitted += 1 else: print 'a taxon name was found that is not in the sequence file' Notes.append(notes) jobs = [] for i in range(min(submitted,nproc)): worker = Worker(work_queue, result_queue, runtest) jobs.append(worker) worker.start() for j in jobs: j.join() " read results back in " Results = [result_queue.get() for i in range(submitted)] Results.sort(key = lambda x:x[15]) " setup results file " if noterminals: outs = open(outfile+".Dfoilalt.txt", 'w') else: outs = open(outfile+".Dfoil.txt", 'w') header = "\t".join([ 'p1'+" "*(namelen[0]-2), 'p2'+" "*(namelen[1]-2), 'p3'+" "*(namelen[2]-2), 'p4'+" "*(namelen[3]-2), 'O'+" "*(namelen[4]-1), 'Dfo','Dil','Dfi','Dol', 'Z_fo','Z_il','Z_fi','Z_ol', 'BABBA','ABBBA', 'BABAA','ABBAA', 'BAABA','ABABA', 'BBBAA','BBABA', 'AABAA','AAABA', 'BAAAA','ABAAA', 'nloci','sign', 'notes']) print >>outs, header for i in range(len(Results)): L,DFO,ZFO,DIL,ZIL,DFI,ZFI,DOL,ZOL,nloc,BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA,pdisc,sub,BBFO,BBIL,BBFI,BBOL = Results[i] L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L] sign = [] for s,d in zip([ZFO,ZIL,ZFI,ZOL],[DFO,DIL,DFI,DOL]): if s>3.5: if d>0: sign.append("+") else: sign.append("-") else: sign.append("0") #print sign resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))), str(L[1])+" "*(namelen[1]-len(str(L[1]))), str(L[2])+" "*(namelen[2]-len(str(L[2]))), str(L[3])+" "*(namelen[3]-len(str(L[3]))), str(L[4])+" "*(namelen[4]-len(str(L[4]))), DFO,DIL,DFI,DOL, ZFO,ZIL,ZFI,ZOL, BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA, nloc, "".join(sign), Notes[i]]) print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%s\t%s" % resin loci = open(alignfile).read().strip().split("|")[:-1] if makesort: None # makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L) # makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L) # makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L) # makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L) # makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L) # makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L) if makeboots: None
def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots): work_queue = multiprocessing.Queue() result_queue = multiprocessing.Queue() submitted = 0 Notes = [] for rep in subtests: notes = "" if len(rep) == 2: rep,notes = rep p1,p2,p3a,p3b,o = rep if all(["[" in i for i in rep[1:]]): p1 = p1[1:-1].split(",") p2 = p2[1:-1].split(",") p3a = p3a[1:-1].split(",") p3b = p3b[1:-1].split(",") o = o[1:-1].split(",") if checktaxa([p1,p2,p3a,p3b,o],alignfile): work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted]) submitted += 1 else: print 'a taxon name was found that is not in the sequence file' else: if checktaxa([p1,p2,p3a,p3b,o],alignfile): work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted]) submitted += 1 else: print 'a taxon name was found that is not in the sequence file' Notes.append(notes) jobs = [] for i in range(min(submitted,nproc)): worker = Worker(work_queue, result_queue, runtest) jobs.append(worker) worker.start() for j in jobs: j.join() " read results back in " #Results = [result_queue.get() for i in range(submitted)] Results = [pickle.load(open(".save."+str(i),'rb')) for i in range(submitted)] Results.sort(key = lambda x:x[15]) " setup results file " outs = open(outfile+".partD.txt", 'w') header = "\t".join([ 'p1'+" "*(namelen[0]-2), 'p2'+" "*(namelen[1]-2), 'p3_1'+" "*(namelen[2]-4), 'p3_2'+" "*(namelen[3]-4), 'O'+" "*(namelen[4]-1), 'D_12','D_1','D_2', 'Z_12','Z_1','Z_2', 'BABBA','ABBBA', 'BABAA','ABBAA', 'BAABA','ABABA', 'nloci','pdisc', 'notes']) print >>outs, header for i in range(len(Results)): L,D12,Z12,D1,Z1,D2,Z2,nloc,ABBBA,BABBA,ABBAA,BABAA,ABABA,BAABA,pdisc,sub,ABBBAloci,BABBAloci,ABBAAloci,BABAAloci,ABABAloci,BAABAloci,BB12,BB1,BB2 = Results[i] L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L] resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))), str(L[1])+" "*(namelen[1]-len(str(L[1]))), str(L[2])+" "*(namelen[2]-len(str(L[2]))), str(L[3])+" "*(namelen[3]-len(str(L[3]))), str(L[4])+" "*(namelen[4]-len(str(L[4]))), D12, D1, D2, Z12, Z1, Z2, BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA, nloc, pdisc, Notes[i]]) print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%s" % resin loci = open(alignfile).read().strip().split("|")[:-1] if makesort: makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L) makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L) makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L) makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L) makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L) makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L) if makeboots: with open(outfile+"_"+str(sub+1)+".boots_D12",'w') as out: out.write(",".join(map(str,BB12))) with open(outfile+"_"+str(sub+1)+".boots_D1",'w') as out: out.write(",".join(map(str,BB1))) with open(outfile+"_"+str(sub+1)+".boots_D2",'w') as out: out.write(",".join(map(str,BB2)))
def main(Parallel, E, H, ID, mindepth, subset, maxN, maxH, haplos,
         CUT, datatype, lowcounts, strict, WORK, maxstack):

    " find clust.xx directory "
    if not os.path.exists(WORK+'clust'+ID):
        print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
              "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
              "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()

    " load up work queue"
    work_queue = multiprocessing.Queue()

    " iterate over files"
    outfolder = WORK+'clust'+str(ID)
    HH = glob.glob(outfolder+"/"+subset+".clustS*")
    stringout = "\n\tstep 5: creating consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" \
                % (len(HH), round(H, 5), round(E, 5))
    sys.stderr.write(stringout)

    if len(HH) > 1:
        " sort files by size"
        for i in xrange(len(HH)):
            statinfo = os.stat(HH[i])
            HH[i] = HH[i], statinfo.st_size
        HH.sort(key=operator.itemgetter(1))
        FS = [f[0] for f in HH][::-1]
    else:
        FS = HH

    ## exclude concatenated cat.* files; use the full outfolder path rather than
    ## the relative 'clust'+ID, which only worked when WORK was the current dir
    REMOVE = glob.glob(outfolder+"/cat.*")
    FS = [f for f in FS if f not in REMOVE]

    submitted = 0
    for handle in FS:
        if handle.replace('.clustS', '.consens').replace('.clust', '.consens') not in glob.glob(outfolder+"/*"):
            m, sd = upSD(handle, mindepth)
            if maxstack == "2SD":
                upperSD = max(500, m+(sd*2.5))
            else:
                upperSD = int(maxstack)
            work_queue.put([handle, E, H, mindepth, maxN, maxH, datatype,
                            haplos, CUT, upperSD, strict, lowcounts])
            submitted += 1
        else:
            print "\tskipping "+handle.replace(".clustS", ".consens")+\
                  ', it already exists in '+outfolder+"/"

    " create a queue to pass to workers to store the results"
    result_queue = multiprocessing.Queue()

    " spawn workers"
    jobs = []
    for i in xrange(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, consensus)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " get results"
    stats = open(WORK+'stats/s5.consens.txt', 'a+')
    print >>stats, "taxon \tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly"
    for i in range(submitted):
        a, b, c, d, e, f, g = result_queue.get()
        print >>stats, "\t".join(map(str, [a.replace(".clustS.gz", "")+" "*(10-len(a)), b, c, d, e, f, g]))
    print >>stats, """
    ## nloci = number of loci
    ## f1loci = number of loci with >N depth coverage
    ## f2loci = number of loci with >N depth and passed paralog filter
    ## nsites = number of sites across f loci
    ## npoly = number of polymorphic sites in nsites
    ## poly = frequency of polymorphic sites"""
    stats.close()
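## The maxstack argument above is either a fixed integer from the params file or
## the string "2SD", in which case an upper depth bound is derived from the mean
## and standard deviation returned by upSD() (defined elsewhere). The rule visible
## in the code is max(500, mean + 2.5*sd); a tiny standalone illustration, given a
## list of cluster depths, follows.
def upper_depth_bound(depths, maxstack="2SD"):
    "return the maximum allowed cluster depth under the '2SD' rule or a fixed cap"
    if maxstack != "2SD":
        return int(maxstack)
    mean = sum(depths) / float(len(depths))
    sd = (sum((d - mean)**2 for d in depths) / float(len(depths)))**0.5
    return max(500, mean + 2.5*sd)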
def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype): print >>sys.stderr, "\tstep 2: editing raw reads \n\t", " create output directories " if not os.path.exists(WORK+'stats'): os.makedirs(WORK+'stats') if not os.path.exists(WORK+'edits'): os.makedirs(WORK+'edits') " load up work queue " submitted = 0 work_queue = multiprocessing.Queue() if len(glob.glob(FQs)) > 1: FS = glob.glob(FQs) " order files by size " for i in range(len(FS)): statinfo = os.stat(FS[i]) FS[i] = FS[i],statinfo.st_size FS.sort(key=operator.itemgetter(1)) FS = [i[0] for i in FS][::-1] " submit jobs to queue " for handle in FS: finder = WORK+'edits/'+handle.split("/")[-1] while finder.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]: finder = finder.replace('.'+finder.split(".")[-1], "").replace("_R1","") if finder+".edit" not in glob.glob(WORK+"edits/*"): if os.stat(handle).st_size > 0: ## exclude empty files args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q, datatype] work_queue.put(args) submitted += 1 else: print "skipping",handle,", file is empty" else: print "\t"+finder+" already in edits/" elif len(glob.glob(FQs)) == 1: " if only one file " work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q, datatype]) submitted += 1 else: print "\tNo demultiplexed files found. Check path." sys.exit() " create a queue to pass to workers to store the results " result_queue = multiprocessing.Queue() " spawn workers, give function " jobs = [] for i in range( min(Parallel,submitted) ): worker = Worker(work_queue, result_queue, rawedit) worker.start() jobs.append(worker) for job in jobs: job.join() " collect the results off the queue " outstats = open(WORK+"stats/s2.rawedit.txt",'a') print >> outstats, "\t".join(["sample ","Nreads","passed","passed.w.trim","passed.total"]) STATS = [] for i in range(submitted): STATS.append(result_queue.get()) STATS.sort(key = lambda x: x[0]) for i in range(submitted): a,b,c,d = STATS[i] print >> outstats, "\t".join([a,b,c,d,str(int(c)+int(d))]) print >>outstats, """ Nreads = total number of reads for a sample passed = retained reads that passed quality filtering at full length passed.w.trim= retained reads that were trimmed due to detection of adapters passed.total = total kept reads of sufficient length note: you can set the option in params file to include trimmed reads of xx length. """ outstats.close()
def main(Parallel, E, H, ID, mindepth, subset, maxN, maxH, ploidy, CUT, datatype, lowcounts, strict, WORK, maxstack): " find clust.xx directory " if not os.path.exists(WORK+'clust'+ID): print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \ "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \ "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold" sys.exit() " create work queue" work_queue = multiprocessing.Queue() " iterate over files" outfolder = WORK+'clust'+str(ID) HH = glob.glob(outfolder+"/"+subset+".clustS*") stringout = "\n\tstep 5: created consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5)) sys.stderr.write(stringout) if len(HH) > 1: " sort files by size" for i in range(len(HH)): statinfo = os.stat(HH[i]) HH[i] = HH[i],statinfo.st_size HH.sort(key=operator.itemgetter(1)) FS = [f[0] for f in HH][::-1] else: FS = HH REMOVE = glob.glob('clust'+ID+"/cat.*") FS = [f for f in FS if f not in REMOVE] submitted = 0 for handle in FS: if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"): m,sd = upSD(handle,mindepth) if maxstack == "2SD": upperSD = max(500,m+(sd*2.5)) else: upperSD = int(maxstack) work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype, ploidy,CUT,upperSD,strict,lowcounts]) submitted += 1 else: print "\tskipping "+handle.replace(".clustS",".consens")+\ ', it already exists in '+outfolder+"/" " create a queue to pass to workers to store the results" result_queue = multiprocessing.Queue() " spawn workers" jobs = [] for i in range( min(Parallel,submitted) ): worker = Worker(work_queue, result_queue, consensus) jobs.append(worker) worker.start() for j in jobs: j.join() " get results" stats = open(WORK+'stats/s5.consens.txt','a+') print >>stats, "taxon\tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly" for i in range(submitted): a,b,c,d,e,f,g = result_queue.get() nn = a.replace(".clustS.gz","") print >> stats, "\t".join(map(str,[nn,b,c,d,e,f,g])) print >>stats, """ ## nloci = number of loci ## f1loci = number of loci with >N depth coverage ## f2loci = number of loci with >N depth and passed paralog filter ## nsites = number of sites across f loci ## npoly = number of polymorphic sites in nsites ## poly = frequency of polymorphic sites""" stats.close()
def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype): print >> sys.stderr, "\n\tstep 2: quality filtering \n\t", " create output directories " if not os.path.exists(WORK + 'stats'): os.makedirs(WORK + 'stats') if not os.path.exists(WORK + 'edits'): os.makedirs(WORK + 'edits') " load up work queue " submitted = 0 work_queue = multiprocessing.Queue() " do not select merged or discarded reads if PEAR was used on data" FQs = glob.glob(FQs) fqs = [ i for i in FQs if not any([j in i for j in ["discarded", ".assembled."]]) ] if len(fqs) > 1: " subselect only the first reads " if any([".unassembled.forward." in i for i in fqs]): FS = [i for i in fqs if '.forward.' in i] else: FS = [i for i in fqs if '_R1.' in i] " order files by size " for i in range(len(FS)): statinfo = os.stat(FS[i]) FS[i] = FS[i], statinfo.st_size FS.sort(key=operator.itemgetter(1)) FS = [i[0] for i in FS][::-1] " submit jobs to queue " for handle in FS: n = handle.split('/')[-1] while n.split(".")[-1] in [ "fastq", "fastQ", "gz", "fq", "FastQ", "nomerge" ]: n = n.replace('.' + n.split(".")[-1], "") if '.forward.' in n: n = n.split(".forward")[0] None else: "_".join(n.split('_R')[:-1]) if WORK + "edits/" + n + ".edit" not in glob.glob(WORK + "edits/*"): if os.stat(handle).st_size > 0: ## exclude empty files args = [ WORK, handle, CUT, float(pN), trimkeep, strict, Q, datatype ] work_queue.put(args) submitted += 1 else: print 'skipping', handle, ", file is empty" else: print "\t" + n + '.edit' + " already in edits/" elif len(fqs) == 1: " if only one file " work_queue.put([ WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q, datatype ]) submitted += 1 else: print "no _paired_ de-multiplexed files found in this location." sys.exit() " create a queue to pass to workers to store the results " result_queue = multiprocessing.Queue() " spawn workers, give function " jobs = [] for i in range(min(Parallel, submitted)): worker = Worker(work_queue, result_queue, rawedit) worker.start() jobs.append(worker) for job in jobs: job.join() " collect the results off the queue " outstats = open(WORK + "stats/s2.rawedit.txt", 'a') print >> outstats, "\t".join( ["sample", "Nreads", "exclude", "trimmed", "passed"]) for i in range(submitted): a, b, c, d = result_queue.get() print >> outstats, "\t".join([a, b, str(int(b) - int(d)), c, d]) print >> outstats, """ Nreads = total number of reads for a sample exclude = reads that were excluded trimmed = reads that had adapter trimmed but were kept passed = total kept reads """ outstats.close()
def main(WORK, UCLUST, FQs, match, Q, Parallel):
    " create output directories "
    if not os.path.exists(WORK+'fastq/'):
        os.makedirs(WORK+'fastq')
    if not os.path.exists(WORK+'mergedreads'):
        os.makedirs(WORK+'mergedreads')
    if not os.path.exists(WORK+'stats'):
        os.makedirs(WORK+'stats')

    submitted = 0
    work_queue = multiprocessing.Queue()
    names = [i for i in glob.glob(FQs) if "_R1.fq" in i]

    " submit jobs to queue "
    if len(names) > 1:
        for handle in names:
            if "nomerge." not in handle:
                n = str(handle.split('/')[-1]).replace("_R1.", ".")
                while n.split(".")[-1] in ["fastq", "fastQ", "gz", "fq", "FastQ"]:
                    n = n.replace('.'+n.split(".")[-1], "")
                finder = WORK+'edits/'+n+".edit"
                if finder not in glob.glob(WORK+"edits/*"):
                    if os.stat(handle).st_size > 0:    ## exclude empty files
                        if os.path.exists(handle.replace("_R1.", "_R2.")):
                            if not os.path.exists(handle.replace(".fq", ".nomerge.fq")):
                                args = [WORK, UCLUST, handle, match, Q]
                                work_queue.put(args)
                                submitted += 1
                            else:
                                print "merge file already created for", handle.split("/")[-1]
                        else:
                            print "cannot find 2nd read file for", handle.split("/")[-1]
                else:
                    print "\t"+finder+" already in edits/"
    else:
        if not names:
            if [i for i in glob.glob(FQs) if "_R1_." in i]:
                print "\n\tfile names should have _R1. not _R1_."
            print "\n\tcannot find input files"
            sys.exit()
        else:
            work_queue.put([WORK, UCLUST, names[0], match, Q])
            submitted += 1

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()

    " spawn workers, give function "
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, mergepairs)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    if submitted > 0:
        statout = open(WORK+"stats/s2.mergedreads.txt", 'w')
        print >>statout, "\t".join(["taxon", "mergedreads"])
        for i in range(submitted):
            stat = result_queue.get()
            a, b = stat
            n = a.strip().split("/")[-1].replace(".nomerge.gz", "")
            print >>statout, "\t".join([n, str(b)])
        print >>statout, "\nmerged reads written to", WORK+"mergedreads/ "
        statout.close()
def main(Parallel,ID,minsamp,subset,haplos,WORK,CUT,datatype): sys.stderr.write("\n\tstep 4: estimating error rate and heterozygosity\n\t") " find clust.xx directory " if not os.path.exists(WORK+'clust'+ID): print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \ "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \ "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold" sys.exit() # warning message for low minsamp if minsamp < 5: sys.stderr.write("""\n\t warning: Mindepth < 5 is not recommended for this step.\n If you intend to make low coverage base calls use a high mindepth in step 4 to accurately infer H & E parameters, and then use a low mindepth in conjunction with the line 31 params file option to make low coverage base calls""") # if haploid data if haplos == 1: sys.stderr.write("\n\tapplying haploid-based test (infer E while H is fixed to 0)\n\t") # if double digest use first cut site if "," in CUT: CUT1, CUT2 = CUT.strip().split(",") else: CUT1 = CUT2 = CUT # load up work queue work_queue = multiprocessing.Queue() # iterate over files HH = glob.glob(WORK+"clust"+ID+"/"+subset+"*.clustS*") submitted = 0 FS = [] if len(HH) > 1: ## sort files by size for i in range(len(HH)): statinfo = os.stat(HH[i]) if statinfo.st_size > 1000: FS.append((HH[i],statinfo.st_size)) else: print "excluding ",HH[i],"file is too small\n" FS.sort(key=lambda x: x[1]) FS = [i[0] for i in FS] else: FS = HH REMOVE = glob.glob(WORK+'clust'+ID+"/cat.*") FS = [f for f in FS if f not in REMOVE] for handle in FS: work_queue.put([WORK,handle, minsamp, CUT1, CUT2, datatype, haplos]) submitted += 1 " remove temp files if previous run " for ff in FS: end = ff.split("/")[-1].replace(".clustS.gz","") ff = WORK+"stats/."+end+".temp" if os.path.exists(ff): os.remove(ff) " create a queue to pass to workers to store the results " result_queue = multiprocessing.Queue() results = [] " spawn workers " jobs = [] for i in range( min(Parallel,submitted) ): worker = Worker(work_queue, result_queue, optim) worker.start() jobs.append(worker) for job in jobs: job.join() " write results to stats file " if not os.path.exists(WORK+"stats/Pi_E_estimate.txt"): outstats = open(WORK+"stats/Pi_E_estimate.txt",'w') outstats.write("taxa\tH\tE\n") else: outstats = open(WORK+"stats/Pi_E_estimate.txt",'a') for ff in FS: end = ff.split("/")[-1].replace(".clustS.gz","") ft = WORK+"stats/."+end+".temp" line = open(ft).readlines() outstats.write(line[0]) os.remove(ft) # n,h,e = line[0].strip().split("\t") # H.append(float(h)) # E.append(float(e)) #outstats.write(" ".join(["mean E =",str(numpy.mean(E))])+"\n") #outstats.write(" ".join(["mean H =",str(numpy.mean(H))])) outstats.close()
def main(Parallel, ID, minsamp, subset, haplos, WORK, CUT, datatype):
    sys.stderr.write("\n\tstep 4: estimating error rate and heterozygosity\n\t")

    " find clust.xx directory "
    if not os.path.exists(WORK + 'clust' + ID):
        print "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
              "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
              "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()

    # warning message for low minsamp
    if minsamp < 5:
        sys.stderr.write("""\n\t warning: Mindepth < 5 is not recommended for this step.\n
        If you intend to make low coverage base calls use a high mindepth in
        step 4 to accurately infer H & E parameters, and then use a low mindepth
        in conjunction with the line 31 params file option to make low coverage base calls""")

    # if haploid data
    if haplos == 1:
        sys.stderr.write("\n\tapplying haploid-based test (infer E while H is fixed to 0)\n\t")

    # if double digest use first cut site
    if "," in CUT:
        CUT1, CUT2 = CUT.strip().split(",")
    else:
        CUT1 = CUT2 = CUT

    # load up work queue
    work_queue = multiprocessing.Queue()

    # iterate over files
    HH = glob.glob(WORK + "clust" + ID + "/" + subset + "*.clustS*")
    submitted = 0
    FS = []
    if len(HH) > 1:
        ## sort files by size
        for i in range(len(HH)):
            statinfo = os.stat(HH[i])
            if statinfo.st_size > 1000:
                FS.append((HH[i], statinfo.st_size))
            else:
                print "excluding ", HH[i], "file is too small\n"
        FS.sort(key=lambda x: x[1])
        FS = [i[0] for i in FS]
    else:
        FS = HH
    REMOVE = glob.glob(WORK + 'clust' + ID + "/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    for handle in FS:
        work_queue.put([WORK, handle, minsamp, CUT1, CUT2, datatype, haplos])
        submitted += 1

    " remove temp files if previous run "
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz", "")
        ff = WORK + "stats/." + end + ".temp"
        if os.path.exists(ff):
            os.remove(ff)

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()
    results = []

    " spawn workers "
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, optim)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " write results to stats file "
    if not os.path.exists(WORK + "stats/Pi_E_estimate.txt"):
        outstats = open(WORK + "stats/Pi_E_estimate.txt", 'w')
        outstats.write("taxa\tH\tE\n")
    else:
        outstats = open(WORK + "stats/Pi_E_estimate.txt", 'a')
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz", "")
        ft = WORK + "stats/." + end + ".temp"
        line = open(ft).readlines()
        outstats.write(line[0])
        os.remove(ft)
        # n,h,e = line[0].strip().split("\t")
        # H.append(float(h))
        # E.append(float(e))
    #outstats.write(" ".join(["mean E =",str(numpy.mean(E))])+"\n")
    #outstats.write(" ".join(["mean H =",str(numpy.mean(H))]))
    outstats.close()
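# --- usage sketch (illustration only, kept commented out) -------------------
# How this step-4 estimator might be run on its own rather than through -s 4.
# Parameter values and the working directory below are hypothetical.
#
#   main(Parallel=4,
#        ID=".88",          ## clustering threshold suffix, i.e. reads from WORK+"clust.88/"
#        minsamp=6,         ## mindepth used while inferring H and E
#        subset="",
#        haplos=2,          ## 1 fixes H to 0 and infers E only (haploid test)
#        WORK="./", CUT="TGCAG", datatype="rad")
#
# Per-sample H and E estimates are appended to WORK+"stats/Pi_E_estimate.txt".
# ----------------------------------------------------------------------------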
def main():
    parser = OptionParser(prog="pyRAD",
                          usage="%prog [options]",
                          version="%prog 3.0.61")
    parser.add_option('-p', action="store", type="string", dest="params",
                      help="input file for within sample filtering and clustering\n")
    parser.add_option('-s', action="store", dest="steps",
                      help="""perform step-wise parts of within analysis\n
                      1 = barcode sorting             \
                      2 = filter/edit raw sequences   \
                      3 = within-sample clustering    \
                      4 = estimate pi and e           \
                      5 = consensus calling           \
                      6 = cluster consensus           \
                      7 = align & create output files """)
    parser.add_option('-d', action="store", type="string", dest="dtest",
                      help="""input file for D-test of introgression,
                      can iterate over multiple samples """)
    parser.add_option('-n', action="store_true", dest="newparamsfile",
                      help="""creates a new empty input params.txt file """)
    parser.add_option('-D', action="store_true", dest="newDtestfile",
                      help="""creates a new empty Dtest input file """)

    (options, args) = parser.parse_args()

    if not any([options.params, options.dtest,
                options.newparamsfile, options.newDtestfile]):
        print "\n\tmust include option of -p, -d, -D or -n\n"
        sys.exit()

    if options.params:
        sys.stderr.write('\n\n'+' '*5+'---'*20+'\n'+\
                         ' '*6+'pyRAD : RADseq for phylogenetics & introgression analyses\n'+\
                         ' '*5+'---'*20+'\n\n')

        readin = [line.strip().split('##')[0].strip()
                  for line in open(options.params).readlines()]
        if "==** " not in str(readin[0]):
            print "\n\twarning: update params input file format to latest version\n"
            sys.exit()

        WORK = str(readin[1])
        GLOB = str(readin[2])
        Bcode = str(readin[3])
        vsearch = str(readin[4])
        muscle = str(readin[5])
        CUT = str(readin[6])
        parallel = int(readin[7])
        mindepth = int(readin[8])
        pN = str(readin[9])
        wclust = str(readin[10])
        datatype = str(readin[11])
        minsamp = int(readin[12])
        maxpoly = str(readin[13])
        outname = str(readin[14])
        ###########################
        ## 15 is separator line
        ###########################
        subset = str(readin[16])
        outgroup = str(readin[17])
        exclude = str(readin[18])
        Floc = str(readin[19])
        try:
            maxmismatch = int(readin[20])
        except (ValueError, IndexError):
            maxmismatch = 1
        try:
            Q = int(readin[21])
        except (ValueError, IndexError):
            Q = 33
        try:
            strict = int(readin[22])
        except (ValueError, IndexError):
            strict = 0
        try:
            E, H = str(readin[23]).strip().split(",")
        except ValueError:
            E = ""
            H = ""
        try:
            maxN = int(readin[24])
        except ValueError:
            maxN = 5
        try:
            maxH = int(readin[25])
        except ValueError:
            maxH = 5
        try:
            haplos = int(readin[26])
        except ValueError:
            haplos = 2
        maxSNP = str(readin[27])
        if maxSNP == "":
            maxSNP = "99"
        max_inserts = str(readin[28])
        if max_inserts == "":
            max_inserts = "3"
        try:
            seed = int(readin[29])
        except ValueError:
            seed = 112233
        try:
            overhang = [int(i) for i in str(readin[30]).strip().split(',')]
        except (ValueError, IndexError):
            overhang = [0, 0]
        try:
            outform = str(readin[31])
        except (ValueError, IndexError):
            outform = ""
        try:
            lowcounts = int(readin[32])
        except (ValueError, IndexError):
            lowcounts = mindepth
        ##mergepairs = str(readin[31])
        ##if mergepairs in [0,""]: mergepairs = 0
        try:
            trimkeep = int(readin[33])
        except ValueError:
            trimkeep = 0
        try:
            maxstack = int(readin[34])
        except ValueError:
            maxstack = "2SD"
        try:
            minuniq = int(readin[35])
        except ValueError:
            minuniq = 0
        try:
            hierarch = int(readin[36])
        except ValueError:
            hierarch = 0
        try:
            MASK = int(readin[37])
        except ValueError:
            MASK = 'dust'
        if MASK == 1:
            MASK = 'dust'
        else:
            MASK = 'none'
        try:
            threads = int(readin[38])
        except ValueError:
            threads = 6
        ###############################
        ## 39 is separator line
        ###############################
        try:
            clustprefix = readin[40:]
        except IndexError:
            clustprefix = ""
        clustprefix = [i for i in clustprefix if i]

        """ expand ./ ~ and ../ designators in location names """
        def expander(namepath):
            if "~" in namepath:
                namepath = namepath.replace("~", os.path.expanduser("~"))
            if "../" in namepath:
                a, b = namepath.split("../")
                namepath = os.path.abspath(os.path.join(os.path.dirname(""), '..', b))
            elif "./" in namepath:
                a, b = namepath.split("./")
                namepath = os.path.abspath("") + "/" + b
            return namepath

        if WORK == "":
            WORK = os.path.abspath("") + "/"
        else:
            WORK = expander(WORK)
        if WORK[-1] != "/":
            WORK = WORK + "/"

        stripped = 0
        if Floc:
            if Floc[0] == "@":
                stripped = 1
                Floc = expander(Floc[1:])
            else:
                Floc = expander(Floc)
        if GLOB:
            GLOB = expander(GLOB)
        if Bcode:
            Bcode = expander(Bcode)
        if vsearch:
            vsearch = expander(vsearch)
        if options.dtest:
            options.dtest = expander(options.dtest)

        """ find location of vsearch (or usearch) and muscle """
        def cmd_exists(cmd):
            return subprocess.call("type " + cmd,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE) == 0

        # " check platform: mac v linux "
        # if 'linux' in sys.platform:
        #     vsearch = "vsearch-1.0.3-linux-x86_64"
        # else:
        #     vsearch = "vsearch-1.0.3-mac-x86_64"
        # " find vsearch and muscle in user's lib/"
        # PYRADPATH = os.path.dirname(os.path.realpath(__file__))
        # vsearch = PYRADPATH+"/lib/"+vsearch
        # muscle = PYRADPATH+"/lib/muscle"

        " threads = 1 for usearch"
        if 'vsearch' not in vsearch:
            threads = 1

        if not cmd_exists(vsearch):
            print "\tcannot find vsearch (or usearch), edit path in param file"
            sys.exit()
        if not cmd_exists(muscle):
            print "\tcannot find muscle, edit path in input file"
            sys.exit()

        """ expand clustprefix cluster groups """
        gids = []
        groups = []
        minhits = []
        "hierarchical clustering "
        for line in clustprefix:
            gid, hits, inds = line.strip().split()
            gids.append(gid)
            minhits.append(hits)
            if "," in inds:
                thisgroup = []
                ii = inds.split(",")
                for i in ii:
                    if "*" in i:
                        expanded = glob.glob(WORK + "clust" + wclust + "/" + i + ".consens*")
                        [thisgroup.append(i) for i in expanded]
                    else:
                        thisgroup.append(WORK + "clust" + wclust + "/" + i + ".consens.gz")
                groups.append(thisgroup)
            else:
                if "*" in inds:
                    expanded = glob.glob(WORK + "clust" + wclust + "/" + inds + ".consens*")
                    groups.append(expanded)
                else:
                    inds = inds.split(",")
                    groups.append([WORK + "clust" + wclust + "/" + i + ".consens.gz"
                                   for i in inds])
        "TODO check for size=1 "

        if not gids:
            gids = ""

        " step of the analysis "
        k = tuple('1234567')
        if options.steps:
            k = tuple(str(options.steps))

        " check that the data type was entered correctly "
        datopts = ['rad', 'gbs', 'ddrad', 'pairgbs',
                   'pairddrad', 'merged', '2brad']
        if datatype not in datopts:
            print "\t datatype argument (line 11) not recognized "
            sys.exit()
        # if datatype == 'merged':
        #     print "specify mergetype in params file, ex: mergeddrad or mergegbs "
        #     sys.exit()

        " parse max_inserts argument "
        w1 = 3
        w2 = 6
        a1 = a2 = 99
        if 'pair' in datatype:
            if "," in max_inserts:
                wargs = max_inserts.strip().split(",")
                if len(wargs) == 2:
                    w1 = w2 = wargs[0]
                    a1 = a2 = wargs[1]
                elif len(wargs) == 4:
                    w1, w2, a1, a2 = wargs
                else:
                    print "\n\tmax_inserts parameter not recognized. see documentation"
                    sys.exit()
see documentation" sys.exit() else: if "," in max_inserts: w1, a1 = map(int, max_inserts.split(",")) ######### Begin analysis ################################################### if '1' in k: " expand Barcode file name if necessary " if "*" in Bcode: try: Bcode = glob.glob(Bcode)[0] except IndexError: print "\tcould not find barcodes file ", Bcode, "\n\tcomment out line 3 of params file or edit path to barcodes file" sys.exit() if Floc: print "\tskipping step 1: line 18 of input file shows seqs already sorted" else: " if directory as input select all inside" if GLOB: if GLOB[-1] == "/": GLOB = GLOB + "*" sortandcheck2.main(Bcode, GLOB, CUT, datatype, parallel, maxmismatch, WORK) ### step 2 ################### if '2' in k: if Floc: print >> sys.stderr, "\tsorted .fastq from %s being used" % Floc if len(glob.glob(Floc)) < 1: sys.stderr.write( "\t... no files found in line 18 location, check required file name formatting\n" ) sys.exit() FQs = Floc if stripped: print "\tbarcode & restriction site are already stripped off of sequences" CUT = "" if strict: print "\tApplying step 2 filter (param 19) is not recommended for data that is stripped (w/ @) \n" else: " default location " FQs = WORK + "fastq/" + subset + "*.fq.gz" " if directory as input select all inside" if FQs[-1] == "/": FQs = FQs + "*" " if not paired filter only read 1 " if 'pair' not in datatype: # in ['rad','ddrad','gbs','merged','2brad']: editraw_rads.main(parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype) else: #elif datatype in ['pairddrad','pairgbs']: " check for both CUT sites in pairddrad" if datatype == 'pairddrad': if "," not in CUT: print "\n\tyou must enter two restriction sites for pair ddRAD data" sys.exit() editraw_pairs.main(parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype) #elif "merge" in datatype: # editraw_merges.main(parallel, WORK, FQs, CUT, # pN, Q, strict, trimkeep) ### step 3 #################### if '3' in k: cluster7dp.main(WORK, parallel, wclust, mindepth, subset, datatype, w1, w2, minuniq, MASK, muscle, vsearch, threads, remake=0) ### step 4 #################### if '4' in k: " if using low depth option still use a reasonable limit for parameter estimates" if mindepth < 5: tempmindepth = 5 else: tempmindepth = mindepth H_err_dp.main(parallel, wclust, tempmindepth, subset, haplos, WORK, CUT, datatype) ### step 5 #################### if '5' in k: if not E: try: Pi = open(WORK + "stats/Pi_E_estimate.txt").readlines() except IOError: Pi = "" if Pi: El = [] Hl = [] for line in Pi[1:]: try: _, h, e = line.strip().split("\t") except IndexError: None Hl.append(float(h)) El.append(float(e)) if len(Hl) == 0: print "\n\terror in step 4, no estimates in file stats/Pi_E_estimate.txt" sys.exit() H = sum(Hl) / len(Hl) E = sum(El) / len(El) else: E = 0.001 H = 0.01 print "\n\tstep 4 values not detected, using E=0.001, H=0.01" if 'pair' in datatype: " call consensus on each pair separately " consens_pairs.main(parallel, float(E), float(H), wclust, mindepth, subset + "*", maxN, maxH, haplos, CUT, datatype, lowcounts, strict, WORK, maxstack) else: " call consensus on single end clusters " consensdp.main(parallel, float(E), float(H), wclust, mindepth, subset + "*", maxN, maxH, haplos, CUT, datatype, lowcounts, strict, WORK, maxstack) ### step 6 #################### if '6' in k: if not hierarch: gids = "" if "," in subset: inlist = [ WORK + "clust" + wclust + "/" + i + ".consens*" for i in subset.strip().split(",") ] else: inlist = glob.glob(WORK + "clust" + wclust + "/" + subset + "*.consens*") 
                cluster_cons7_shuf.main(vsearch, wclust, datatype, outgroup,
                                        seed, gids, minhits, inlist, WORK,
                                        MASK, 0)
                print "\n\tfinished clustering"
            else:
                """ re-expand clustprefix cluster groups in case no -s """
                Hgids = []
                Hgroups = {}
                Hminhits = []
                "hierarchical clustering "
                for line in clustprefix:
                    Hgid, Hhits, Hinds = line.strip().split()
                    Hgids.append(Hgid)
                    Hminhits.append(Hhits)
                    Hgroups[Hgid] = []
                    if "," in Hinds:
                        Hinds = Hinds.split(",")
                        for Hind in Hinds:
                            if "*" in Hind:
                                expanded = glob.glob(WORK + "clust" + wclust + "/" + Hind + ".consens*")
                                Hgroups[Hgid] += expanded   #.append(expanded)
                            else:
                                Hgroups[Hgid].append(WORK + "clust" + wclust + "/" + Hind + ".consens.gz")
                    else:
                        if "*" in Hinds:
                            expanded = glob.glob(WORK + "clust" + wclust + "/" + Hinds + ".consens*")
                            Hgroups[Hgid] += expanded   #.append(expanded)
                        else:
                            Hgroups[Hgid].append(WORK + "clust" + wclust + "/" + Hinds + ".consens.gz")

                for i, j in zip(Hgids, Hminhits):
                    for cons in Hgroups[i]:
                        if cons not in glob.glob(WORK + "clust" + wclust + "/*.consens.gz"):
                            print "\n\tsample name", cons, "in group", i, "does not match any filenames"
                            sys.exit()

                preclusts = []
                for i in Hgroups.values():
                    preclusts += i
                for cons in glob.glob(WORK + "clust" + wclust + "/*.consens.gz"):
                    if cons not in preclusts:
                        print "\n\twarning: sample", cons, "not assigned to a cluster group"

                #if not gids:
                #    gids = ""

                " make prefix directory "
                if not os.path.exists(WORK + 'prefix/'):
                    os.makedirs(WORK + 'prefix')

                ########### TODO ####################################
                # if os.path.exists(WORK+"prefix/cat.clust_.gz"):
                #     print "\tRemaking clusters from existing clustprefix files "+\
                #           "using minmatches: ",minmatch
                #     print "\t(To completely re-start hierarchical clustering delete the prefix/ directory)\n"
                #
                #     for (gid,minhit,inlist) in zip(gids,minhits,groups):
                #         handle = WORK+"clust"+wclust+"/cat.haplos_"+gid
                #         #cluster_cons7_shuf.makeclust(handle, datatype, pre, pre, minm, WORK, 1)
                #         #tier2clust.makeclust(wclust, datatype, WORK)
                #######################################################

                " queue up jobs "
                work_queue = multiprocessing.Queue()
                result_queue = multiprocessing.Queue()

                " submit jobs "
                for (Hgid, Hminhit) in zip(Hgids, Hminhits):
                    inlist = Hgroups[Hgid]
                    work_queue.put([vsearch, wclust, datatype, outgroup, seed,
                                    Hgid, Hminhit, inlist, WORK, MASK, 1])

                " execute first tier jobs "
                jobs = []
                for i in range(parallel):
                    worker = Worker(work_queue, result_queue, cluster_cons7_shuf.main)
                    jobs.append(worker)
                    worker.start()
                for j in jobs:
                    j.join()

                " cluster second tier "
                tier2clust.main(vsearch, wclust, datatype, Hgids, seed, WORK, MASK)
                print "\n\tfinished clustering\n"

                " cleanup "
                #for ff in glob.glob(WORK+"clust"+wclust+"/cat.consens_*.gz"):
                #    os.remove(ff)
                #for ff in glob.glob(WORK+"clust"+wclust+"/cat.u*"):
                #    os.remove(ff)

        if '7' in k:
            if minsamp < 2:
                print "\n\tminimum minCov setting is <2: changing to 2"
                minsamp = 2
            if gids:
                inclustfile = WORK + "prefix/cat.clust_.gz"
            else:
                inclustfile = WORK + 'clust' + wclust + "/cat.clust_.gz"
            if not os.path.exists(inclustfile):
                #sys.stderr.write("\n\t didn't find hierarchically clustered subset: \n\t"+inclustfile)
                #sys.stderr.write("\n\t looking for default full cluster file")
                if os.path.exists(WORK + 'clust' + wclust + "/cat.clust_.gz"):
                    inclustfile = WORK + 'clust' + wclust + "/cat.clust_.gz"
                    sys.stderr.write("\n\tCluster input file: using \n\t" + inclustfile + "\n\n")
                else:
                    print "\tnot found"
                    #print "\tcat.clust_ file is selected based on line 15 subset argument "
file "+\ # "\n\t in your output alignments list exclude names on line 17 of the params file.\n " sys.exit() #if any([i in outform for i in ['t','m']]): # if gids: # print "\tgroups for 't' or 'm' outputs:", gids taxadict = OrderedDict(zip(gids, groups)) alignable.main(outgroup, minsamp, outname, inclustfile, maxpoly, parallel, maxSNP, muscle, exclude, overhang, outform, WORK, gids, CUT, a1, a2, datatype, subset, parser.version.split(" ")[1], mindepth, taxadict, minhits, seed, haplos) if '8' in k: cluster7dp.main(WORK, parallel, wclust, mindepth, subset, datatype, w1, w2, minuniq, MASK, muscle, vsearch, threads, remake=1) if options.dtest: readin = [line.strip() for line in open(options.dtest).readlines()] nboots = int(readin[0].split("##")[0].strip()) alignfile = str(readin[1].split("##")[0].strip()) outfile = str(readin[2].split("##")[0].strip()) ntax = str(readin[3].split("##")[0].strip()) nproc = int(readin[4].split("##")[0].strip()) makesort = int(readin[5].split("##")[0].strip()) makeboots = int(readin[6].split("##")[0].strip()) tests = [] for line in readin[8:]: if line: notes = "" if "##" in line: tax, notes = line.strip().split( "##")[0], line.strip().split("##")[-1], if tax: tests.append([tax.strip().split(), notes.strip() ]) #.split("\t"),notes.strip()]) else: tests.append(line.strip().split()) # "\t")) if ntax == '4': Dtest.main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots) elif ntax == 'part': Dtest_5.main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots) elif ntax == 'foil': Dtest_foil.main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots, 0) elif ntax == 'foilalt': Dtest_foil.main(tests, alignfile, outfile, nboots, nproc, makesort, makeboots, 1) else: print "error in input file" if options.newparamsfile: if os.path.exists("./params.txt"): print "\tfile params.txt already exists" sys.exit() else: createfile.main(parser.version.split(" ")[1]) if options.newDtestfile: outstring = """200 ## N bootstrap replicates test.loci ## loc/path to input .loci file dstats/test1_res ## output file path/name (no suffix) 4 ## which test: 4,part,foil,foilalt 2 ## N cores (execute jobs [lines below] in parallel 0 ## output ABBA/BABA loci to files (0=no,1,2=verbose) 0 ## output bootstrap Ds to files (0=no,1=yes) -----------------------------------------------------------\n""" sys.stdout.write(outstring)
def writefunc(GLOB, Parallel, Bcode, CUT, datatype, maxmismatch, WORK):
    "create barcode dictionary"
    codetable = open(Bcode, 'r')
    codes = [line.strip().split() for line in codetable.readlines()]
    C = {}
    for line in codes:
        if line:
            C[line[1].strip().upper()] = line[0]

    " find longest barcode "
    keylens = map(len, C.keys())
    if len(set(keylens)) == 1:
        longB = (keylens[0], 'same')
    else:
        longB = (max(keylens), 'diff')

    " check for CUT in barcodes "
    CCC = unambig(CUT)
    if len(CCC) > 1:
        for cut in CCC:
            if any([cut in i for i in C.keys()]):
                print "\n\twarning: CUT site matches within one of the barcodes, "+\
                      "I suggest double \n\tchecking the file to make sure it properly demultiplexes"
    else:
        if any([CUT in i for i in C.keys()]):
            print "\n\twarning: CUT site matches within one of the barcodes, "+\
                  "I suggest double \n\tchecking the file to make sure it properly demultiplexes"

    " read in sequence files "
    if len(glob.glob(GLOB)) > 1:
        FS = [f for f in glob.glob(GLOB)]
    else:
        FS = glob.glob(GLOB)
    if 'pair' in datatype:
        Raws = combinefiles(GLOB)
    else:
        Raws = FS

    "send jobs to multiprocess queue"
    num = 0
    work_queue = multiprocessing.Queue()
    submitted = 0
    for fs in Raws:
        if 'pair' in datatype:
            work_queue.put([C, [fs[0], fs[1]], CUT, datatype,
                            num, maxmismatch, WORK, longB])
            submitted += 1
        else:
            work_queue.put([C, fs, CUT, datatype,
                            num, maxmismatch, WORK, longB])
            submitted += 1
        num += 1

    result_queue = multiprocessing.Queue()

    "spawn workers, give function"
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, barmatch)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    Ms = {}
    if len(glob.glob(WORK + "fastq/.*.pickle")) > 1:
        for pick in glob.glob(WORK + "fastq/.*.pickle"):
            pickin = open(pick, "rb")
            M = pickle.load(pickin)
            pickin.close()
            for key in M:
                if key not in Ms:
                    Ms[key] = M[key]
                else:
                    Ms[key] += M[key]
            os.remove(pick)
    elif len(glob.glob(WORK + "fastq/.*.pickle")) == 1:
        pick = glob.glob(WORK + "fastq/.*.pickle")[0]
        pickin = open(pick, 'rb')
        Ms = pickle.load(pickin)
        pickin.close()
        os.remove(pick)
    else:
        print "\nno stats file generated"

    Mkeys = Ms.keys()
    Mkeys.sort(key=lambda x: Ms[x], reverse=True)

    statout = open(WORK + "stats/s1.sorting.txt", 'a')
    statout.write("\n\n")
    statout.write("sample\ttrue_bar\tobs_bars\tN_obs\n")

    Cnames = C.keys()
    Cnames.sort()
    try:
        maxl = max(map(len, map(str, Ms.values())))
    except ValueError:
        maxl = 5

    hits = []
    for bar in Cnames:
        for barcode in Mkeys:
            if matching(bar, barcode, maxmismatch):
                print >> statout, "%s \t%s \t%s\t%s" % (
                    C[bar], bar, barcode,
                    str(Ms[barcode]) + " " * (maxl + 3 - len(str(Ms[barcode]))))
                hits.append(barcode)

    statout.write("\n")
    maxl = max(map(len, Mkeys))
    for barcode in Mkeys:
        if barcode not in hits:
            print >> statout, "nomatch \t%s \t%i" % (
                barcode + " " * (maxl + 3 - len(barcode)), Ms[barcode])
    statout.close()
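# --- usage sketch (illustration only, kept commented out) -------------------
# Shape of the whitespace-delimited barcodes file read by writefunc() above:
# sample name in column 1, barcode in column 2. Names and barcodes below are
# hypothetical.
#
#   sample1    TTCCAG
#   sample2    AGGTCA
#   sample3    CCGGAT
#
# writefunc() builds a {barcode: sample} lookup from columns 2 and 1, warns if
# the CUT site occurs inside any barcode, and appends per-barcode observation
# counts to WORK+"stats/s1.sorting.txt".
# ----------------------------------------------------------------------------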