def _aln(ref, fastq, tmp="/tmp", threads=8, threshold=0.05):
    """Run ``bwa aln`` on one reads file and return the resulting ``.sai`` path.

    Args:
        ref: reference/index path handed to ``bwa aln``.
        fastq: reads file to align.
        tmp: directory in which the randomly named ``.sai`` file is placed.
        threads: value for the ``-t`` option of ``bwa aln``.
        threshold: value for the ``-n`` option of ``bwa aln``.

    Returns:
        str: path of the ``.sai`` file inside ``tmp``.
    """
    # randrange() needs integer bounds: a float such as 1e10 is rejected with
    # TypeError by current Python versions.
    sai = os.path.join(tmp, '%09d.sai' % random.randrange(0, 10 ** 10))
    with file_transaction(sai) as tx:
        # bwa writes to the transaction path, not to the final name.
        cmd = ("bwa aln -n {threshold} -t {threads} "
               "{ref} {fastq} > {tx}").format(threshold=threshold,
                                              threads=threads,
                                              ref=ref,
                                              fastq=fastq,
                                              tx=tx)
        run(cmd)
    return sai
def chop_reads(fastq, out_file, length, minlength, cores=10):
    """Chop the reads of ``fastq`` with ``chop_read`` and write them out.

    Records from ``readfx(fastq)`` are passed to ``chop_read`` (together with
    ``length`` and ``minlength``) through a worker pool. Every fastq string
    that comes back is written to the output file and the reads it contains
    are tallied (count, total bases, read-length standard deviation).

    Args:
        fastq: input path handed to ``readfx``.
        out_file: output path; ``'.gz'`` is appended when missing.
        length: forwarded unchanged to ``chop_read``.
        minlength: forwarded unchanged to ``chop_read``.
        cores: size of the worker pool, also passed to ``pigz_file``.

    Returns:
        The existing ``out_file`` when ``file_exists`` reports it is already
        there, otherwise the value returned by ``pigz_file``.

    Side effects:
        Sets the pandas ``display.float_format`` option globally, prints the
        summary, and writes it tab-separated to the returned path with
        ``"fastq.gz"`` replaced by ``"chop_data"``.
    """
    if not out_file.endswith('.gz'):
        out_file = out_file + '.gz'
    if file_exists(out_file):
        print("output file exists, chop reads will not repeat itself.")
        return out_file

    pd.set_option('display.float_format', lambda x: '%.0f' % x)
    p = multiprocessing.Pool(cores)
    # NOTE(review): original_count goes up once per fastq string written, not
    # once per input read -- confirm this is the intended meaning.
    original_count = 0
    readcount = 0
    bpcount = 0
    size_var = OnlineVariance(ddof=0)

    try:
        with file_transaction(out_file) as tx_outfile:
            with open(tx_outfile, "w") as txoh:
                for result in multiprocess(chop_read, readfx(fastq), length,
                                           minlength, pool=p):
                    # chop_read results may be one fastq string or a list of them
                    chunks = result if isinstance(result, list) else [result]
                    for chunk in chunks:
                        print(chunk, file=txoh)
                        original_count += 1
                        for name, seq, qual in read_fq_string(chunk.split("\n")):
                            readcount += 1
                            bpcount += len(seq)
                            size_var.include(len(seq))
    finally:
        # Release the worker processes even when chopping fails part-way.
        p.terminate()
        p.join()

    # An input that yields no reads must not end in ZeroDivisionError.
    if readcount:
        meanbp = bpcount / readcount
        readbp_std = size_var.std
    else:
        meanbp = 0
        readbp_std = 0

    out_file = pigz_file(out_file, cores)
    outdata = pd.Series(
        data=[original_count, readcount, bpcount, meanbp, readbp_std],
        index=[
            'original_count', 'read_count', 'bp_count', 'mean_read_len',
            'read_len_std'
        ])
    print(outdata)
    outdata.to_csv(out_file.replace("fastq.gz", "chop_data"), sep="\t")
    return out_file
def samse_aln(ref, reads, bamout, tmp="/tmp", threads=8, threshold=0.05):
    """Align single-end reads with ``bwa aln``/``bwa samse`` into a sorted bam.

    Args:
        ref: reference passed to ``bwa_index``.
        reads: reads file to align.
        bamout: path given to ``samtools sort`` as its output.
        tmp: directory for the intermediate ``.sai`` file.
        threads: thread count forwarded to ``_aln``.
        threshold: ``bwa aln -n`` value forwarded to ``_aln``.

    Returns:
        ``bamout``.
    """
    with bwa_index(ref) as bwaidx:
        r_sai = _aln(bwaidx, reads, tmp, threads, threshold)
        # The template has to be formatted before it is run, and the output
        # name is the ``bamout`` parameter (there is no ``bam_sorted`` here).
        samse = ("bwa samse {ref} {r_sai} {reads} | samtools view -bSF0x0004 - "
                 "| samtools sort -f -m 8 - {bam_sorted}").format(
                     ref=bwaidx, r_sai=r_sai, reads=reads, bam_sorted=bamout)
        # NOTE(review): as in sampe_aln, the command targets the final path
        # rather than the transaction path ``tx`` -- confirm this is intended.
        with file_transaction(bamout) as tx:
            run(samse)
    return bamout
def combine_fasta_qual(fas, qual, outfile, cores=8):
    """Merge a fasta file and its quality file into one compressed fastq.

    Args:
        fas: path of the fasta file.
        qual: path of the matching quality file.
        outfile: output path; ``".gz"`` is appended when missing.
        cores: passed to ``pigz_file``.

    Returns:
        The value returned by ``pigz_file``.
    """
    # The suffix must be the string ".gz"; a bare ``gz`` name is undefined.
    if not outfile.endswith(".gz"):
        outfile = outfile + ".gz"
    with file_transaction(outfile) as tx_out:
        with open(fas) as fin, open(qual) as qin, open(tx_out, "w") as oh:
            for rec in PairedFastaQualIterator(fin, qin):
                SeqIO.write(rec, oh, "fastq")
    # Same compression helper that chop_reads uses.
    outfile = pigz_file(outfile, cores)
    return outfile
def index_bam(bam_file):
    """
    Run ``samtools index`` for a bam file when no index is present yet.

    parameters
        bam_file : path of the alignment file

    returns
        path of the index, i.e. ``bam_file`` + '.bai' : string
    """
    bam_index = bam_file + '.bai'
    if file_exists(bam_index):
        # nothing to do, hand back the existing index
        return bam_index
    with file_transaction(bam_index) as tx_out_file:
        index_cmd = 'samtools index %s %s' % (bam_file, tx_out_file)
        run(index_cmd)
    return bam_index
def sampe_aln(ref, reads, bam_sorted, tmp="/tmp", threads=1, threshold=0.05):
    """Align paired reads with ``bwa aln``/``bwa sampe`` into a sorted bam.

    ``reads`` is split into two files by ``tmp_split_reads``; each one is
    aligned with ``_aln`` and the two ``.sai`` results are combined by
    ``bwa sampe``, filtered with ``samtools view`` and sorted.

    Args:
        ref: reference passed to ``bwa_index``.
        reads: reads file handed to ``tmp_split_reads``.
        bam_sorted: path given to ``samtools sort`` as its output.
        tmp: directory for the split reads and ``.sai`` files.
        threads: thread count forwarded to ``_aln``.
        threshold: ``bwa aln -n`` value forwarded to ``_aln``.

    Returns:
        ``bam_sorted``.
    """
    mate1, mate2 = tmp_split_reads(reads, tmp)
    with bwa_index(ref) as bwaidx:
        mate1_sai = _aln(bwaidx, mate1, tmp, threads, threshold)
        mate2_sai = _aln(bwaidx, mate2, tmp, threads, threshold)
        fields = {
            "ref": bwaidx,
            "r1_sai": mate1_sai,
            "r2_sai": mate2_sai,
            "r1": mate1,
            "r2": mate2,
            "bam_sorted": bam_sorted,
        }
        template = ("bwa sampe {ref} {r1_sai} {r2_sai} {r1} {r2} "
                    "| samtools view -bSF0x0004 - "
                    "| samtools sort -f -m 8 - {bam_sorted}")
        sampe = template.format(**fields)
        # NOTE(review): the command targets bam_sorted itself; the transaction
        # path is not used by it -- confirm this is intended.
        with file_transaction(bam_sorted) as tx:
            run(sampe)
    return bam_sorted
def get_coverage(bam_file, bedout=None):
    '''
    Create per base coverage from a sorted bam with ``bedtools genomecov -dz``.

    Args:
        bam_file (string): path to the bam alignment file
        bedout (string): output path; defaults to ``bam_file`` with its
            extension replaced by ".genomecoverage"

    Returns:
        bedout (string): path of the coverage file. An already existing file
        is returned untouched.
    '''
    if bedout is None:
        filename, ext = op.splitext(bam_file)
        bedout = filename + ".genomecoverage"
    if op.exists(bedout):
        return bedout
    with file_transaction(bedout) as tx_oh:
        # Name the template fields explicitly rather than exposing every local.
        cmd = ("bedtools genomecov -dz -ibam {bam_file} > {tx_oh}").format(
            bam_file=bam_file, tx_oh=tx_oh)
        subprocess.check_call(cmd, shell=True)
    return bedout
def extract_fastq(bam, out_fastq):
    '''
    Extract the reads of a bam file into a fastq with ``bedtools bamtofastq``.

    Args:
        bam (string): path to bam alignment file
        out_fastq (string): fastq path to write

    Returns:
        out_fastq (string): the path that was passed in

    Example:
        >> extract_fastq('alignment.bam', 'testout.fastq')
        'testout.fastq'
    '''
    template = "bedtools bamtofastq -i {bam} -fq {fastq}"
    with file_transaction(out_fastq) as temp_oh:
        # bedtools writes to the transaction path
        run(template.format(bam=bam, fastq=temp_oh))
    return out_fastq
def run_seqtk_sample(fastq, outfile, n, seed=37):
    """Subsample a fastq to `n` reads with ``seqtk sample``.

    Args:
        fastq (str): path to the input fastq
        outfile (str): path of the output; the seqtk output is always piped
            through gzip
        n (int): number of reads to keep
        seed (int): seed given to ``seqtk sample -s``

    Returns:
        str: ``outfile``
    """
    if not file_exists(outfile):
        logger.info("Subsampling to %d reads" % n)
        template = "seqtk sample -s {seed} {fastq} {number} | gzip > {out}"
        with file_transaction(outfile) as tx:
            run(template.format(seed=seed, fastq=fastq, number=n, out=tx))
        print("%s created" % outfile)
    return outfile
def bwa_mem(fastq, out_file, reference, options, cores=1):
    """
    Map reads with ``bwa mem`` and write a filtered, sorted bam.

    parameters
        fastq : path to reads
        out_file : path of the resulting bam
        reference : reference handed to ``bwa_index``
        options : extra bwa mem options, or None; passed through
            ``filter_options`` together with the predefined ``-t`` entry
        cores : int, value for ``bwa mem -t``

    returns
        output file path : string
    """
    if file_exists(out_file):
        return out_file

    predefined_options = [('-t', False)]
    opts = ""
    if options is not None:
        options = filter_options(options, predefined_options)
        opts = " ".join(options)

    logger.info("Mapping %s to %s using bwa mem" % (fastq, reference))
    # NOTE(review): bwa_index is used as a context manager by the bwa aln
    # wrappers but called as a plain function here -- confirm which is right.
    reference = bwa_index(reference)

    template = ("bwa mem -t {cores} {options} {index} {fastq} | samtools view "
                "-ShuF4q2 - | samtools sort -o -m 8G - tmp > {result}")
    with file_transaction(out_file) as tx_out_file:
        cmd = template.format(cores=cores,
                              options=opts,
                              index=reference,
                              fastq=fastq,
                              result=tx_out_file)
        run(cmd)
        # NOTE(review): the index is built for the transaction path; presumably
        # file_transaction carries the .bai over to out_file -- verify.
        index_bam(tx_out_file)
    return out_file