예제 #1
0
def get_read_limit(fasta, readLimit, verbose):
    """Return the number of reads to align per library.

    The limit is `readLimit` multiplied by the assembly size, where the size
    is read from the third tab-separated field of the `fasta_stats` output.

    Args:
        fasta: path of the FASTA file used as reference.
        readLimit: multiplier applied to the assembly size; a falsy value
            disables the limit.
        verbose: when true, the computed limit is reported on stderr.

    Returns:
        int: the read limit, or 0 when `readLimit` is falsy.
    """
    # limit no. of reads to align as fraction of genome size
    if not readLimit:
        return 0
    # close the handle as soon as the stats have been computed
    with open(fasta) as handle:
        stats = fasta_stats(handle)
    fastaSize = int(stats.split("\t")[2])
    limit = int(readLimit * fastaSize)
    if verbose:
        sys.stderr.write(" Aligning %s mates per library...\n" % limit)
    return limit
예제 #2
0
def run_scaffolding(outdir, scaffoldsFname, fastq, libraries, reducedFname, mapq, threads, \
                    joins, limit, iters, sspacebin, verbose, \
                    identity, overlap, minLength, lib=""):
    """Execute scaffolding step.

    Each library is used for `iters` rounds of `fastq2sspace`; the output of
    one round is the input of the next. Whenever gaps make up more than 1% of
    the assembly, `run_gapclosing` is run with the current library before
    continuing. After each library has been processed, the libraries are
    re-estimated with `get_libraries` against the latest assembly.

    Args:
        outdir: directory in which the intermediate `_sspace.*` files are created.
        scaffoldsFname: path at which a symlink to the final assembly is created.
        fastq: passed to `get_libraries` when libraries are re-estimated.
        libraries: sequence of 6-item entries
            (libnames, libFs, libRs, orients, libIS, libISStDev).
        reducedFname: path of the assembly the first round starts from.
        mapq, threads: passed through to the helpers.
        joins, limit, sspacebin: passed through to `fastq2sspace`
            (`limit` also to `run_gapclosing`).
        iters: number of scaffolding rounds per library.
        verbose: when true, progress is reported on stderr.
        identity, overlap, minLength: currently unused; they were only needed
            by the reduction step that is disabled below.
        lib: ignored; an empty string is always passed to `fastq2sspace`.

    Returns:
        The libraries as returned by the last `get_libraries` call, or the
        input `libraries` unchanged if it was empty.
    """
    # run scaffolding using libraries with increasing insert size in multiple iterations
    pout = reducedFname
    # the caller-supplied value has always been overwritten by an empty string
    lib = ""
    i = 0
    # `libraries` is replaced at the end of every pass, so iterate by index
    # rather than over the (stale) original sequence
    while i < len(libraries):
        libnames, libFs, libRs, orients, libIS, libISStDev = libraries[i]
        i += 1
        for j in range(1, iters+1):
            if verbose:
                sys.stderr.write(" iteration %s.%s ...\n"%(i,j))
            out = os.path.join(outdir, "_sspace.%s.%s"%(i, j))
            # run fastq scaffolding; the input handle is closed once the call returns
            with open(pout) as handle:
                fastq2sspace(out, handle, lib, libnames, libFs, libRs, orients, \
                             libIS, libISStDev, threads, mapq, limit, joins, \
                             sspacebin, verbose=0)
            # store out info
            pout = out+".fa"
            # link output ie out/_sspace.1.1/_sspace.1.1.scaffolds.fasta --> out/_sspace.1.1.scaffolds.fasta
            targetout = os.path.join(os.path.basename(out), os.path.basename(out+".final.scaffolds.fasta"))
            symlink(targetout, pout)
            # if number of gaps larger than 1%, run gap closer
            with open(pout) as handle:
                fields = fasta_stats(handle).split('\t')
            fastaSize = int(fields[2])
            gapSize   = int(fields[-2])
            # an empty assembly has no gaps to close; also avoids division by zero
            if fastaSize and 1.0 * gapSize / fastaSize > 0.01:
                # close gaps
                if verbose:
                    sys.stderr.write("  closing gaps ...\n")
                nogapsFname = ".".join(pout.split(".")[:-1]) + ".filled.fa"
                basename    = "_sspace.%s.%s._gapcloser"%(i, j)
                run_gapclosing(outdir, mapq, [libraries[i-1],], nogapsFname, pout, \
                               threads, limit, 1, 0, basename)
                pout = nogapsFname
                # NOTE: reduction of the gap-closed assembly (fasta2homozygous)
                # is disabled at this point; the gap-closed file is used as is.
        # update library insert size estimation, especially for mate-pairs
        libraries = get_libraries(fastq, pout, mapq, threads, verbose=0)
    # create symlink to final scaffolds or pout
    symlink(pout, scaffoldsFname)

    return libraries
예제 #3
0
def redundants(
    fastq,
    fasta,
    outdir,
    mapq,
    threads,
    identity,
    overlap,
    minLength,
    joins,
    readLimit,
    iters,
    sspacebin,
    reduction=1,
    scaffolding=1,
    gapclosing=1,
    cleaning=1,
    verbose=1,
    log=sys.stderr,
):
    """Launch redundans pipeline.

    Steps, each of which can be switched off: reduction of the input contigs,
    scaffolding, gap closing, statistics reporting and clean-up. When a step
    is disabled, its output file is created as a symlink to its input so the
    following steps find the file they expect.

    Args:
        fastq: passed to `get_libraries` / `run_scaffolding`.
        fasta: path of the input assembly; linked as `contigs.fa` in `outdir`.
        outdir: output directory; must not exist yet.
        mapq, threads: passed through to the helpers.
        identity, overlap, minLength: passed to `fasta2homozygous` and
            `run_scaffolding`.
        joins, sspacebin: passed to `run_scaffolding`.
        readLimit: multiplier passed to `get_read_limit`.
        iters: number of iterations for scaffolding and gap closing.
        reduction, scaffolding, gapclosing, cleaning: truthy to enable the
            respective step.
        verbose: when true, progress is reported on stderr.
        log: currently unused (stderr redirection is commented out).

    Raises:
        SystemExit: with code 1 if `outdir` already exists.
    """
    # redirect stderr
    # sys.stderr = log

    # prepare outdir or quit if exists
    if os.path.isdir(outdir):
        sys.stderr.write("Directory %s exists!\n" % outdir)
        sys.exit(1)
    os.makedirs(outdir)

    # REDUCTION
    contigsFname = os.path.join(outdir, "contigs.fa")
    reducedFname = os.path.join(outdir, "contigs.reduced.fa")
    # link contigs & genome
    symlink(fasta, contigsFname)
    # get read limit & libraries
    limit = get_read_limit(contigsFname, readLimit, verbose)
    libraries = get_libraries(fastq, contigsFname, mapq, threads, verbose)
    if reduction:
        if verbose:
            sys.stderr.write("%sReduction...\n" % timestamp())
            sys.stderr.write(
                "#file name\tgenome size\tcontigs\theterozygous size\t[%]\theterozygous contigs\t[%]\tidentity [%]\tpossible joins\thomozygous size\t[%]\thomozygous contigs\t[%]\n"
            )
        # both handles are closed as soon as the reduction finishes
        with open(reducedFname, "w") as out, open(contigsFname) as contigs:
            fasta2homozygous(out, contigs, identity, overlap, minLength, libraries, limit, threads)
    else:
        symlink(contigsFname, reducedFname)
    # update fasta list
    fastas = [contigsFname, reducedFname]

    # update read limit using reduced assembly as reference
    limit = get_read_limit(reducedFname, readLimit, verbose)
    # SCAFFOLDING
    scaffoldsFname = os.path.join(outdir, "scaffolds.fa")
    if scaffolding:
        if verbose:
            sys.stderr.write("%sScaffolding...\n" % timestamp())
        # scaffolding returns the re-estimated libraries, used for gap closing
        libraries = run_scaffolding(
            outdir,
            scaffoldsFname,
            fastq,
            libraries,
            reducedFname,
            mapq,
            threads,
            joins,
            limit,
            iters,
            sspacebin,
            verbose,
            identity,
            overlap,
            minLength,
        )
    else:
        symlink(reducedFname, scaffoldsFname)
    # update fasta list
    fastas += sorted(glob.glob(os.path.join(outdir, "_sspace.*.fa")))
    fastas.append(scaffoldsFname)

    # GAP CLOSING
    ## gapclosing is only necessary after scaffolding
    nogapsFname = os.path.join(outdir, "scaffolds.filled.fa")
    if gapclosing and libraries:
        if verbose:
            sys.stderr.write("%sGap closing...\n" % timestamp())
        run_gapclosing(outdir, mapq, libraries, nogapsFname, scaffoldsFname, threads, limit, iters, verbose)
    else:
        symlink(scaffoldsFname, nogapsFname)
    # update fasta list
    fastas += sorted(glob.glob(os.path.join(outdir, "_gap*.fa")))
    fastas.append(nogapsFname)

    # FASTA STATS
    if verbose:
        sys.stderr.write("%sReporting statistics...\n" % timestamp())
    # report stats
    sys.stderr.write("#fname\tcontigs\tbases\tGC [%]\tcontigs >1kb\tbases in contigs >1kb\tN50\tN90\tNs\tlongest\n")
    for fn in fastas:
        with open(fn) as handle:
            sys.stderr.write(fasta_stats(handle))

    # Clean-up
    # remove everything except FASTA and stats files (e.g. fq.is.txt)
    if cleaning:
        if verbose:
            sys.stderr.write("%sCleaning-up...\n" % timestamp())
        keep = (".fa", ".fasta", ".stats")
        for root, dirs, fnames in os.walk(outdir):
            for fn in fnames:
                if not fn.endswith(keep):
                    os.unlink(os.path.join(root, fn))