def assemble(samples, data_dir, results_dir, seed_fa):
    """Submit one SSAKE assembly job per sample.

    For every sample, the single ``<sample>.jnd.fa.gz`` file found in
    ``data_dir`` is extracted (if not already present) into
    ``<results_dir>/<sample>/`` and an SSAKE run seeded with ``seed_fa``
    is submitted through ``bsub`` with that directory as its working
    directory.

    Args:
        samples: iterable of sample name strings.
        data_dir: directory searched for the joined, gzipped fasta files.
        results_dir: parent directory holding one sub-directory per sample.
        seed_fa: path to the seed fasta handed to SSAKE via ``-s``.

    Returns:
        A list with the value returned by each ``bsub`` submission, in the
        order of ``samples`` (empty when ``samples`` is empty).

    Raises:
        AssertionError: if a sample does not have exactly one matching
            ``.jnd.fa.gz`` file in ``data_dir``.
    """
    jobs = []
    for sample in samples:
        # The lookup must use the ``data_dir`` parameter; the previous code
        # referenced an undefined name ``datadir``.
        fastas = ngseq.getfilelist(data_dir, sample + ".jnd.fa.gz")
        assert len(fastas) == 1, (
            "expected exactly one joined fasta for %s, found %d"
            % (sample, len(fastas))
        )
        gzipfasta = fastas[0]

        outdir = "%s/%s" % (results_dir, sample)
        # splitext drops only the last extension, so "x.jnd.fa.gz" -> "x.jnd.fa".
        fasta = outdir + "/" + op.splitext(op.basename(gzipfasta))[0]
        if not op.exists(fasta):
            # NOTE(review): presumably blocks until the extraction job is
            # done so SSAKE never sees a partial file -- confirm against bsub.
            bsub.poll(ngseq.extract(gzipfasta, fasta))

        cmd = (
            "SSAKE -f " + fasta
            + " -s " + seed_fa
            + " -m 40 -o 50 -r 0.8 -b " + sample
            + " -p 1 -v 1 -d 200 -e 0.75 -k 10 -a 0.5 -x 50"
        )
        submit = bsub(
            "3prime_seed_extension",
            cwd=outdir,
            R="select[mem>16] rusage[mem=16] span[hosts=1]",
            verbose=True,
        )
        jobs.append(submit(cmd))
    return jobs
def join(samples, datadir, script):
    """Submit jobs that join paired-end reads into SSAKE input format.

    For each sample, the ``<sample>_*.trm.fq.gz`` files in ``datadir`` are
    listed and sorted, and ``script`` is run on them with its output
    gzipped into ``<datadir>/<sample>.jnd.fa.gz``. Samples whose output
    (or that path with an extra ``.gz`` suffix) already exists are skipped.

    Args:
        samples: iterable of sample name strings.
        datadir: directory holding the trimmed fastq files; also receives
            the joined output.
        script: path of the python script that joins the two read files.

    Returns:
        A list with the value returned by each job submission, one per
        sample that was not skipped.

    Raises:
        AssertionError: if a sample that still needs joining does not have
            exactly two matching fastq files.
    """
    submit = bsub("join_reads", verbose=True)
    submitted = []
    for name in samples:
        # Sorting puts R1 ahead of R2.
        reads = ngseq.getfilelist(datadir, name + "_*.trm.fq.gz")
        reads = sorted(reads)

        target = "/".join((datadir, name + ".jnd.fa.gz"))
        already_done = op.exists(target) or op.exists(target + ".gz")
        if not already_done:
            assert len(reads) == 2
            # usage: join_reads.py R1 R2 --insert 200
            invocation = " ".join(["python", script] + reads)
            submitted.append(submit(invocation + " | gzip -c > " + target))
    return submitted