import os
import gzip
import itertools

import h5py
import numpy as np
import pandas as pd

## clustdealer, basecaller, removerepeats, the nfilter* functions,
## IPyradError, and LOGGER are defined elsewhere in this module.


def chunk_clusters(data, sample):
    """ split job into bits and pass to the client """

    ## counter for split job submission
    num = 0

    ## set optim size for chunks in N clusters. The first few chunks take
    ## longer because they contain larger clusters, so the total is divided
    ## across processors to keep engine run times roughly even.
    optim = int((sample.stats.clusters_total // data.cpus) +
                (sample.stats.clusters_total % data.cpus))

    ## break up the file into smaller tmp files for each engine
    ## chunking by cluster is a bit trickier than chunking by N lines
    chunkslist = []

    ## open the clusters file
    with gzip.open(sample.files.clusters, 'rb') as clusters:
        ## create iterator to sample 2 lines at a time
        pairdealer = itertools.izip(*[iter(clusters)] * 2)

        ## use iterator to sample til end of clusters
        done = 0
        while not done:
            ## grab optim clusters and write to file
            done, chunk = clustdealer(pairdealer, optim)
            chunkhandle = os.path.join(
                data.dirs.clusts,
                "tmp_" + str(sample.name) + "." + str(num * optim))
            if chunk:
                chunkslist.append((optim, chunkhandle))
                with open(chunkhandle, 'wb') as outchunk:
                    outchunk.write("//\n//\n".join(chunk) + "//\n//\n")
                num += 1

    return chunkslist
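## `clustdealer` is called above but defined elsewhere in this module. The
## following is a minimal sketch consistent with how it is called here (it
## consumes 2-line pairs from `pairdealer` until the "//" separator that
## closes each cluster, and returns a (done, chunk) tuple), not necessarily
## the exact implementation.
def clustdealer(pairdealer, optim):
    """ return (done, chunk) with up to optim clusters from the pair iter """
    ccnt = 0
    chunk = []
    while ccnt < optim:
        ## take line-pairs until the "//" separator that ends a cluster;
        ## the separator pair itself is consumed and discarded
        taker = itertools.takewhile(lambda x: x[0] != "//\n", pairdealer)
        try:
            ## grab the first pair; StopIteration means the file is done
            oneclust = ["".join(next(taker))]
        except StopIteration:
            return 1, chunk
        ## append the remaining pairs of this cluster
        for pair in taker:
            oneclust.append("".join(pair))
        chunk.append("".join(oneclust))
        ccnt += 1
    return 0, chunk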
def newconsensus(data, sample, tmpchunk, optim):
    """ new faster replacement to consensus """

    ## do reference map funcs?
    isref = "reference" in data.paramsdict["assembly_method"]

    ## temporarily store the mean estimates to Assembly
    data._este = data.stats.error_est.mean()
    data._esth = data.stats.hetero_est.mean()

    ## get number relative to tmp file
    tmpnum = int(tmpchunk.split(".")[-1])

    ## prepare data for reading
    clusters = open(tmpchunk, 'rb')
    pairdealer = itertools.izip(*[iter(clusters)] * 2)
    maxlen = data._hackersonly["max_fragment_length"]

    ## write tmp cons to file to be combined later
    consenshandle = os.path.join(
        data.dirs.consens, sample.name + "_tmpcons." + str(tmpnum))
    tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
    with h5py.File(tmp5, 'w') as io5:
        io5.create_dataset("cats", (optim, maxlen, 4), dtype=np.uint32)
        io5.create_dataset("alls", (optim, ), dtype=np.uint8)
        io5.create_dataset("chroms", (optim, 3), dtype=np.int64)

        ## local copies to use to fill the arrays
        catarr = io5["cats"][:]
        nallel = io5["alls"][:]
        refarr = io5["chroms"][:]

    ## if reference-mapped then parse the fai to get index number of chroms
    if isref:
        fai = pd.read_csv(
            data.paramsdict["reference_sequence"] + ".fai",
            names=['scaffold', 'size', 'sumsize', 'a', 'b'],
            sep="\t")
        faidict = {j: i for i, j in enumerate(fai.scaffold)}

    ## store data for stats counters
    counters = {"name": tmpnum,
                "heteros": 0,
                "nsites": 0,
                "nconsens": 0}

    ## store data for what got filtered
    filters = {"depth": 0,
               "maxh": 0,
               "maxn": 0}

    ## store data for writing
    storeseq = {}

    ## set max limits
    if 'pair' in data.paramsdict["datatype"]:
        maxhet = sum(data.paramsdict["max_Hs_consens"])
        maxn = sum(data.paramsdict["max_Ns_consens"])
    else:
        maxhet = data.paramsdict["max_Hs_consens"][0]
        maxn = data.paramsdict["max_Ns_consens"][0]

    ## iterate over clusters one at a time until the chunk file is exhausted
    done = 0
    while not done:
        try:
            done, chunk = clustdealer(pairdealer, 1)
        except IndexError:
            raise IPyradError(
                "clustfile formatting error in {}".format(tmpchunk))

        if chunk:
            ## get names and seqs
            piece = chunk[0].strip().split("\n")
            names = piece[0::2]
            seqs = piece[1::2]

            ## pull replicate read info from seqs
            reps = [int(sname.split(";")[-2][5:]) for sname in names]

            ## IF this is a reference mapped read store the chrom and pos
            ## info. -1 defaults to indicating an anonymous locus, since we
            ## are using the faidict as 0 indexed. If chrompos fails it
            ## defaults to -1.
            ref_position = (-1, 0, 0)
            if isref:
                try:
                    ## parse position from name string
                    name, _, _ = names[0].rsplit(";", 2)
                    chrom, pos0, pos1 = name.rsplit(":", 2)

                    ## pull idx from .fai reference dict
                    chromint = faidict[chrom] + 1
                    ref_position = (int(chromint), int(pos0), int(pos1))
                except Exception as inst:
                    LOGGER.debug(
                        "Reference sequence chrom/pos failed for {}"
                        .format(names[0]))
                    LOGGER.debug(inst)

            ## apply read depth filter
            if nfilter1(data, reps):

                ## get stacks of base counts
                sseqs = [list(seq) for seq in seqs]
                arrayed = np.concatenate(
                    [[seq] * rep for seq, rep in zip(sseqs, reps)])
                arrayed = arrayed[:, :maxlen]

                ## get consens call for each site, applies paralog-x-site
                ## filter
                #consens = np.apply_along_axis(basecall, 0, arrayed, data)
                consens = basecaller(
                    arrayed,
                    data.paramsdict["mindepth_majrule"],
                    data.paramsdict["mindepth_statistical"],
                    data._esth,
                    data._este,
                )
                ## apply a filter to remove low coverage sites/Ns that are
                ## likely sequence repeat errors. This is only applied to
                ## clusters that already passed the read-depth filter (1).
                if "N" in consens:
                    try:
                        consens, arrayed = removerepeats(consens, arrayed)
                    except ValueError:
                        LOGGER.info("Caught a bad chunk w/ all Ns. Skip it.")
                        continue

                ## get hetero sites
                hidx = [i for (i, j) in enumerate(consens)
                        if j in list("RKSYWM")]
                nheteros = len(hidx)

                ## filter for max number of hetero sites
                if nfilter2(nheteros, maxhet):
                    ## filter for maxN, & minlen
                    if nfilter3(consens, maxn):
                        ## counter right now
                        current = counters["nconsens"]

                        ## get N alleles and get lower case in consens
                        consens, nhaps = nfilter4(consens, hidx, arrayed)

                        ## store the number of alleles observed
                        nallel[current] = nhaps

                        ## store a reduced array with only CATG
                        catg = np.array(
                            [np.sum(arrayed == i, axis=0)
                             for i in list("CATG")],
                            dtype='uint32').T
                        catarr[current, :catg.shape[0], :] = catg
                        refarr[current] = ref_position

                        ## store the seqdata for tmpchunk
                        storeseq[counters["name"]] = "".join(list(consens))
                        counters["name"] += 1
                        counters["nconsens"] += 1
                        counters["heteros"] += nheteros
                    else:
                        #LOGGER.debug("@haplo")
                        filters['maxn'] += 1
                else:
                    #LOGGER.debug("@hetero")
                    filters['maxh'] += 1
            else:
                #LOGGER.debug("@depth")
                filters['depth'] += 1

    ## close infile io
    clusters.close()

    ## write final consens string chunk
    if storeseq:
        with open(consenshandle, 'wb') as outfile:
            outfile.write(
                "\n".join([">" + sample.name + "_" + str(key) + "\n" +
                           str(storeseq[key]) for key in storeseq]))

    ## write to h5 array, this can be a bit slow on big data sets and is
    ## not currently covered by progressbar movement.
    with h5py.File(tmp5, 'a') as io5:
        io5["cats"][:] = catarr
        io5["alls"][:] = nallel
        io5["chroms"][:] = refarr
    del catarr
    del nallel
    del refarr

    ## return stats
    counters['nsites'] = sum([len(i) for i in storeseq.itervalues()])
    return counters, filters
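## A hypothetical driver sketching how the two functions above chain
## together. `lbview` is assumed to be an ipyparallel LoadBalancedView,
## as used for job submission elsewhere in this codebase; the name
## `run_consens` is illustrative and not part of the original module.
def run_consens(data, sample, lbview):
    """ sketch: chunk a sample's clusters and farm out consensus calls """
    ## split this sample's clusters into per-engine tmp files
    chunkslist = chunk_clusters(data, sample)

    ## submit one newconsensus job per chunk to the engines
    asyncs = []
    for optim, chunkhandle in chunkslist:
        asyncs.append(
            lbview.apply_async(newconsensus, data, sample, chunkhandle, optim))

    ## block until all jobs finish and gather (counters, filters) tuples
    return [job.get() for job in asyncs]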