def get_read_limit(fasta, readLimit, verbose):
    """Return the number of mates to align per library.

    The limit is a fraction of the assembly size: `readLimit` multiplied by
    the third tab-separated field of the `fasta_stats` report for `fasta`,
    truncated to an integer.

    fasta     -- path of the FASTA file used as size reference
    readLimit -- fraction of the assembly size; a falsy value (0, None)
                 disables the computation and 0 is returned without
                 touching `fasta`
    verbose   -- if true, report the computed limit on stderr
    """
    if not readLimit:
        return 0
    # close the handle as soon as the stats are computed
    with open(fasta) as handle:
        stats = fasta_stats(handle)
    fastaSize = int(stats.split("\t")[2])
    limit = int(readLimit * fastaSize)
    if verbose:
        sys.stderr.write(" Aligning %s mates per library...\n" % limit)
    return limit
def run_scaffolding(outdir, scaffoldsFname, fastq, libraries, reducedFname, mapq, threads,
                    joins, limit, iters, sspacebin, verbose,
                    identity, overlap, minLength, lib=""):
    """Execute scaffolding step.

    Every library is used in turn for `iters` rounds of `fastq2sspace`; the
    output of each round is the input of the next one. After a round, gaps
    are closed with `run_gapclosing` if they exceed 1% of the assembly size.
    Once all rounds for a library are done, the libraries are re-estimated
    with `get_libraries` against the newest assembly.

    outdir         -- directory receiving the `_sspace.<library>.<iteration>` outputs
    scaffoldsFname -- path symlinked to the last assembly produced
                      (or to `reducedFname` if there is no library)
    fastq          -- forwarded to `get_libraries`
    libraries      -- sequence of 6-item entries
                      (libnames, libFs, libRs, orients, libIS, libISStDev);
                      NOTE(review): the original comment says they are
                      processed by increasing insert size -- ordering is
                      presumably done by the caller / `get_libraries`
    reducedFname   -- FASTA used as input of the very first round
    mapq, threads, joins, limit, sspacebin -- forwarded to the helpers
    iters          -- number of scaffolding rounds per library
    verbose        -- if true, report progress on stderr
    identity, overlap, minLength -- currently unused (they served a
                      reduction step that is disabled)
    lib            -- currently ignored: an empty string is always passed
                      to `fastq2sspace`, as in the original implementation

    Return the libraries as last re-estimated (the input `libraries` when
    it is empty).
    """
    pout = reducedFname
    idx = 0
    # `libraries` is rebound at the end of every pass, so its length has to
    # be re-checked each time instead of iterating over it directly
    while idx < len(libraries):
        library = libraries[idx]
        libnames, libFs, libRs, orients, libIS, libISStDev = library
        idx += 1  # from here on idx is the 1-based library number used in file names
        for j in range(1, iters + 1):
            if verbose:
                sys.stderr.write(" iteration %s.%s ...\n" % (idx, j))
            out = os.path.join(outdir, "_sspace.%s.%s" % (idx, j))
            # run fastq scaffolding; the input handle is closed once the call returns
            with open(pout) as handle:
                fastq2sspace(out, handle, "", libnames, libFs, libRs, orients,
                             libIS, libISStDev, threads, mapq, limit, joins,
                             sspacebin, verbose=0)
            # store out info
            pout = out + ".fa"
            # link output ie out/_sspace.1.1/_sspace.1.1.scaffolds.fasta --> out/_sspace.1.1.scaffolds.fasta
            targetout = os.path.join(os.path.basename(out),
                                     os.path.basename(out + ".final.scaffolds.fasta"))
            symlink(targetout, pout)
            # if number of gaps larger than 1%, run gap closer
            with open(pout) as handle:
                fields = fasta_stats(handle).split("\t")
            fastaSize = int(fields[2])
            gapSize = int(fields[-2])
            if 1.0 * gapSize / fastaSize > 0.01:
                if verbose:
                    sys.stderr.write(" closing gaps ...\n")
                # pout always ends with ".fa" here, so only that suffix is replaced
                nogapsFname = os.path.splitext(pout)[0] + ".filled.fa"
                basename = "_sspace.%s.%s._gapcloser" % (idx, j)
                run_gapclosing(outdir, mapq, [library], nogapsFname, pout,
                               threads, limit, 1, 0, basename)
                pout = nogapsFname
        # update library insert size estimation, especially for mate-pairs
        libraries = get_libraries(fastq, pout, mapq, threads, verbose=0)
    # create symlink to final scaffolds or pout
    symlink(pout, scaffoldsFname)
    return libraries
def redundants(fastq, fasta, outdir, mapq, threads, identity, overlap, minLength,
               joins, readLimit, iters, sspacebin,
               reduction=1, scaffolding=1, gapclosing=1, cleaning=1,
               verbose=1, log=sys.stderr):
    """Launch redundans pipeline.

    Steps, each writing into `outdir`:
      1. reduction   -> contigs.reduced.fa   (`fasta2homozygous`)
      2. scaffolding -> scaffolds.fa         (`run_scaffolding`)
      3. gap closing -> scaffolds.filled.fa  (`run_gapclosing`)
      4. statistics of all produced FASTA files on stderr
      5. clean-up of every file not ending with .fa, .fasta or .stats
    A step that is switched off (or gap closing without libraries) is
    replaced by a symlink from its input to its expected output.

    fastq, fasta -- input reads (forwarded to `get_libraries`) and contigs
    outdir       -- output directory; it must NOT exist, otherwise a message
                    is written to stderr and the process exits with status 1
    mapq, threads, identity, overlap, minLength, joins, iters, sspacebin
                 -- forwarded to the step helpers
    readLimit    -- fraction of assembly size used by `get_read_limit`
    reduction, scaffolding, gapclosing, cleaning -- switches for the steps
    verbose      -- if true, report progress on stderr
    log          -- currently unused, all messages go to sys.stderr
    """
    # prepare outdir or quit if exists
    if os.path.isdir(outdir):
        sys.stderr.write("Directory %s exists!\n" % outdir)
        sys.exit(1)
    os.makedirs(outdir)

    # REDUCTION
    contigsFname = os.path.join(outdir, "contigs.fa")
    reducedFname = os.path.join(outdir, "contigs.reduced.fa")
    # link contigs & genome
    symlink(fasta, contigsFname)
    # get read limit & libraries
    limit = get_read_limit(contigsFname, readLimit, verbose)
    libraries = get_libraries(fastq, contigsFname, mapq, threads, verbose)
    if reduction:
        if verbose:
            sys.stderr.write("%sReduction...\n" % timestamp())
            sys.stderr.write(
                "#file name\tgenome size\tcontigs\theterozygous size\t[%]\theterozygous contigs\t[%]\tidentity [%]\tpossible joins\thomozygous size\t[%]\thomozygous contigs\t[%]\n"
            )
        # both handles are released as soon as the reduction is done
        with open(reducedFname, "w") as out:
            with open(contigsFname) as contigs:
                fasta2homozygous(out, contigs, identity, overlap, minLength,
                                 libraries, limit, threads)
    else:
        symlink(contigsFname, reducedFname)
    # update fasta list
    fastas = [contigsFname, reducedFname]
    # update read limit using reduced assembly as reference
    limit = get_read_limit(reducedFname, readLimit, verbose)

    # SCAFFOLDING
    scaffoldsFname = os.path.join(outdir, "scaffolds.fa")
    if scaffolding:
        if verbose:
            sys.stderr.write("%sScaffolding...\n" % timestamp())
        libraries = run_scaffolding(outdir, scaffoldsFname, fastq, libraries,
                                    reducedFname, mapq, threads, joins, limit,
                                    iters, sspacebin, verbose,
                                    identity, overlap, minLength)
    else:
        symlink(reducedFname, scaffoldsFname)
    # update fasta list
    fastas += sorted(glob.glob(os.path.join(outdir, "_sspace.*.fa")))
    fastas.append(scaffoldsFname)

    # GAP CLOSING
    ## gapclosing is only necessary after scaffolding
    nogapsFname = os.path.join(outdir, "scaffolds.filled.fa")
    if gapclosing and libraries:
        if verbose:
            sys.stderr.write("%sGap closing...\n" % timestamp())
        run_gapclosing(outdir, mapq, libraries, nogapsFname, scaffoldsFname,
                       threads, limit, iters, verbose)
    else:
        symlink(scaffoldsFname, nogapsFname)
    # update fasta list
    fastas += sorted(glob.glob(os.path.join(outdir, "_gap*.fa")))
    fastas.append(nogapsFname)

    # FASTA STATS
    if verbose:
        sys.stderr.write("%sReporting statistics...\n" % timestamp())
    # report stats; the table itself is printed regardless of verbosity
    sys.stderr.write("#fname\tcontigs\tbases\tGC [%]\tcontigs >1kb\tbases in contigs >1kb\tN50\tN90\tNs\tlongest\n")
    for fn in fastas:
        with open(fn) as handle:
            sys.stderr.write(fasta_stats(handle))

    # Clean-up: keep only assemblies and statistics
    if cleaning:
        if verbose:
            sys.stderr.write("%sCleaning-up...\n" % timestamp())
        kept_suffixes = (".fa", ".fasta", ".stats")
        for root, _dirs, fnames in os.walk(outdir):
            for fn in fnames:
                if not fn.endswith(kept_suffixes):
                    os.unlink(os.path.join(root, fn))