def wrapper_bedtools_intersect2(bedfile1, bedfile2, outfile=None):
    """
    Intersect two bed files with bedtools and return the result file name.

    Each input is first sorted with ``bedtools sort`` into ``<input>_s``;
    the two sorted copies are then intersected with ``-wa -wb``.

    NOTE(review): the cleanup step passes the two *input* files, together
    with the sorted intermediates, to ``del_files`` -- presumably they are
    removed from disk; confirm before passing files that must be kept.

    :param bedfile1: path of the first bed file (used as ``-a``)
    :param bedfile2: path of the second bed file (used as ``-b``)
    :param outfile: path for the intersection output; when None it is
        ``<location of bedfile1>/<prefix1>_<prefix2>.bed``
    :return: the path the intersection was written to
    """
    if outfile is None:
        stem = "_".join([get_file_prefix(bedfile1), get_file_prefix(bedfile2)])
        outfile = get_file_location(bedfile1) + "/" + stem + ".bed"

    inputs = (bedfile1, bedfile2)

    # sorted copies are written next to the inputs with an "_s" suffix
    sort_template = "bedtools sort -i {bed} > {bed}_s"
    sort_cmds = [sort_template.format(bed=bed) for bed in inputs]
    for sort_cmd in sort_cmds:
        myexe(sort_cmd)

    # generate the intersection bed file from the sorted copies
    intersect_cmd = "bedtools intersect -wa -wb -a {bedfile1}_s -b {bedfile2}_s>{out}".format(
        bedfile1=bedfile1, bedfile2=bedfile2, out=outfile)
    myexe(intersect_cmd)

    # cleanup: inputs first, then their sorted copies
    sorted_copies = [bed + "_s" for bed in inputs]
    del_files(list(inputs) + sorted_copies)
    return outfile
def bwa_index_wrapper(ref_file):
    """
    Run ``bwa index`` on a reference file.

    :param ref_file: path of the reference file handed to ``bwa index``
    :return: the same ``ref_file`` value that was passed in
    """
    myexe("bwa index {ref}".format(ref=ref_file))
    return ref_file
def adaptor_blast(query, dbpatch="adaptor.fasta"):
    """
    Blast ``query`` against a nucleotide database built from ``dbpatch``.

    The database is rebuilt with ``makeblastdb`` on every call; checking
    whether it already exists first would be better.

    :param query: passed to the blastn command line as its stdin
    :param dbpatch: path of the fasta file the blast database is built from
    :return: the object produced by ``NCBIXML.read`` on the blastn XML output
    """
    # Strip the extension from the file name only. Splitting the whole path on
    # "." turned e.g. "./adaptor.fasta" into an empty database name.
    db_dir, db_file = os.path.split(dbpatch)
    db = os.path.join(db_dir, db_file.split(".")[0])

    # print(...) with a single argument behaves the same on Python 2 and 3
    print(myexe("makeblastdb -in %s -dbtype nucl -input_type fasta -out %s" % (dbpatch, db)))

    # outfmt=5 asks blastn for XML, which is what NCBIXML parses
    blastn_cline = NcbiblastnCommandline(db=db, outfmt=5)
    out, err = blastn_cline(stdin=query)

    # NOTE(review): an earlier comment described the result as a generator;
    # NCBIXML.read yields one record (NCBIXML.parse is the iterator form) --
    # confirm what callers expect.
    blast_records = NCBIXML.read(StringIO(out))
    return blast_records
def wrapper_samtools_merge(work_dir, ref_file, bam_list, out=None):
    """
    Merge several bam files with ``samtools merge``, then sort and index.

    :param work_dir: directory used for the default output name
    :param ref_file: reference file path; its file name up to the first dot
        becomes the default output stem
    :param bam_list: bam file paths, joined with spaces as merge inputs
    :param out: path of the merged bam; when None it is
        ``<work_dir>/<ref stem>.bam``
    :return: whatever ``sort_index_wrapper`` returns for the merged bam
    """
    if out is None:
        ref_name = ref_file.rpartition("/")[2]
        ref_stem = ref_name.partition(".")[0]
        out = os.path.join(work_dir, ref_stem + ".bam")

    inputs = " ".join(bam_list)
    merge_cmd = "samtools merge {out_bam} {in_bam}".format(in_bam=inputs, out_bam=out)
    print(merge_cmd)
    myexe(merge_cmd)

    return sort_index_wrapper(out)
def sort_index_wrapper(bamfile, core=1, bam_sorted=None):
    """
    Sort a bam file with ``samtools sort`` and index the sorted result.

    :param bamfile: path of the bam file to sort
    :param core: value passed to ``samtools sort -@``
    :param bam_sorted: path of the sorted bam; when None it is the input's
        file name up to its first dot plus ``_s.bam``, in the same directory
    :return: the path of the sorted bam file
    """
    if bam_sorted is None:
        # Cut at the first dot of the file name only. Splitting the whole path
        # mis-handled directories containing a dot, such as "./work/x.bam".
        bam_dir, bam_name = os.path.split(bamfile)
        bam_sorted = os.path.join(bam_dir, bam_name.split(".")[0] + "_s.bam")

    sort_cmd = "samtools sort {bamfile} -@ {core} -o {bam_sorted}".format(
        bamfile=bamfile, core=core, bam_sorted=bam_sorted)
    index_cmd = "samtools index {bam_sorted}".format(bam_sorted=bam_sorted)

    myexe(sort_cmd)
    myexe(index_cmd)
    return bam_sorted
def fq_subseq(fqin, namefile, fqout=None):
    """
    Extract a subset of reads from a fastq file with ``seqtk subseq``.

    used in fq_subset_main()

    :param fqin: path of the input fastq file
    :param namefile: file handed to ``seqtk subseq`` as the name list
    :param fqout: path of the output fastq; when None it is the input's file
        name up to its first dot plus ``_s.fq``, in the same directory
    :return: the path of the output fastq file
    """
    if fqout is None:
        # Cut at the first dot of the file name only, so that a dot in a
        # directory component (e.g. "./reads/x.fq") does not truncate the path.
        fq_dir, fq_name = os.path.split(fqin)
        fqout = os.path.join(fq_dir, fq_name.split(".")[0] + "_s.fq")

    cmd_subseq = "seqtk subseq {fqin} {namefile} > {fq_out}".format(
        fqin=fqin, namefile=namefile, fq_out=fqout)
    print(cmd_subseq)
    myexe(cmd_subseq)
    return fqout
def exonerate_wrapper(query, target, outfile=False, geneticcode=5, score=100, bestn=None):
    """
    Run exonerate for ``query`` against ``target`` and return its output.

    :param query: path of the query sequence file
    :param target: path of the target sequence file
    :param outfile: when true, the output is also written to
        ``<query file name up to its first dot>.exonerate`` in the current
        working directory
    :param geneticcode: value for ``--geneticcode`` (default 5)
    :param score: value for ``--score``
    :param bestn: value for ``--bestn``; when None, the number of entries
        returned by ``fasta2dic(target)`` is used
    :return: whatever ``myexe`` returns for the exonerate command

    todo: using stringIO to hinder the file IO
    """
    if bestn is None:
        # default, output one region for one query
        bestn = len(fasta2dic(target))

    exonerate_cmd = (
        "exonerate {query} {target} "
        "--geneticcode {geneticcode} "
        "--score {score} "
        "--bestn {bestn} "
    ).format(
        query=query,
        target=target,
        geneticcode=geneticcode,
        score=score,
        bestn=bestn,
    )
    out = myexe(exonerate_cmd)

    # trigger to write the output to disk
    if outfile:
        outname = query.split("/")[-1].split(".")[0] + ".exonerate"
        with open(outname, "w") as fw:
            # write the exonerate output itself; this used to write the
            # file's own name into the file
            fw.write(out)
    return out
def sra2fq_wrapper(sra_file, outdir):
    """
    Convert an sra file to fastq with ``fastq-dump --split-files``.

    The command and whatever ``myexe`` returns for it are both printed.

    :param sra_file: the sra input handed to ``fastq-dump``
    :param outdir: directory passed as ``--outdir``
    :return: the same ``outdir`` value that was passed in
    """
    sra_cmd = "fastq-dump --split-files {} --outdir {}".format(sra_file, outdir)
    print(sra_cmd)
    dump_log = myexe(sra_cmd)
    print(dump_log)
    return outdir
def bwa_mem_wrapper(ref_file, fq_str, core=25, min_seed_length=20, band_width=2000, out="mapped.bam"):
    """
    a bwa mem mapper that only collects the mapped reads

    The bwa output is piped into ``samtools view -F 4 -b``.

    :param ref_file: reference handed to ``bwa mem``
    :param fq_str: the fastq file name(s), either as one space-separated
        string or as a list/tuple of names
    :param core: bwa mem ``-t``
    :param min_seed_length: bwa mem ``-k``
    :param band_width: bwa mem ``-w``
    :param out: path of the bam file written by samtools
    :return: the out bam file
    """
    # A list/tuple would otherwise be formatted as its repr ("['a', 'b']")
    # and break the command line, so join it into the plain string form.
    if isinstance(fq_str, (list, tuple)):
        fq_str = " ".join(fq_str)

    cmd_bwa = (
        "bwa mem -k {seed} -w {band} -t {core} {ref} "
        "{fq_str} "
        "| samtools view -F 4 -b -o {out}"
    ).format(
        seed=min_seed_length, band=band_width, core=core,
        ref=ref_file, fq_str=fq_str, out=out)
    print(cmd_bwa)
    myexe(cmd_bwa)
    return out
def spades_wrapper(fq_name_dict, outdir="spades_out", core=12, rna_model=False):
    """
    Assemble reads with SPAdes (``--only-assembler``) and return result paths.

    :param fq_name_dict: a dict with one entry per read library. Each value is
        iterated as a collection of fastq file names, so it must be a list
        (a bare string would be walked character by character), e.g.
        {"pe1": ["ERRxxxx_1_s.fq", "ERRxxxx_2_s.fq"], "s2": ["ERRxxxx_1.fq"]}.
        A value with two or more names is treated as a paired-end library,
        otherwise as single reads. The keys are not used.
    :param outdir: directory passed to SPAdes as ``-o``
    :param core: value passed to SPAdes as ``-t``
    :param rna_model: run ``rnaspades.py`` instead of ``spades.py`` when true
    :return: the file position of the scaf fasta and the fastg file
    """
    fq_str_list = []  # pieces of the readpool string used for spades
    lib_num = 1

    # values() exists on both Python 2 and 3 dicts; iteritems() is 2-only
    for v in fq_name_dict.values():
        read_type = "pe" if len(v) >= 2 else "s"
        for fq_one_name in v:
            if read_type == "s":
                str_one = "--{read_type}{lib_num} {fq_one_name}".format(
                    read_type=read_type, lib_num=lib_num, fq_one_name=fq_one_name)
            elif read_type == "pe":
                # the mate is guessed from a marker anywhere in the name
                if "_1" in fq_one_name or "_F" in fq_one_name:
                    fq_pos_ind = "-1"
                elif "_2" in fq_one_name or "_R" in fq_one_name:
                    fq_pos_ind = "-2"
                elif "_3" in fq_one_name:
                    fq_pos_ind = "-s"
                else:
                    fq_pos_ind = "-s"
                    print(
                        "Treat the PE reads as single reads!"
                        "Please re-check the origin fastq file, make sure _1, _2 or _F, _R in the paired file."
                    )
                str_one = "--{read_type}{lib_num}{fq_pos_ind} {fq_one_name}".format(
                    read_type=read_type, lib_num=lib_num,
                    fq_pos_ind=fq_pos_ind, fq_one_name=fq_one_name)
            fq_str_list.append(str_one)
        # one library number per dict entry, shared by all of its files
        lib_num += 1

    fq_str = " ".join(fq_str_list)
    spades_bin = "rnaspades.py" if rna_model else "spades.py"
    spades_cmd = "{spades_bin} --only-assembler -t {core} {readpool} -o {outdir}".format(
        spades_bin=spades_bin, core=core, readpool=fq_str, outdir=outdir)
    print(spades_cmd)
    print(myexe(spades_cmd))

    scaf_fasta = os.path.join(outdir, "scaffolds.fasta")
    scaf_fastg = os.path.join(outdir, "assembly_graph.fastg")
    return scaf_fasta, scaf_fastg
def wrapper_bedtools_intersect2_v2(beddf1, beddf2, outfile=None):
    """
    Intersect two sorted bed dataframes with bedtools and return a dataframe.

    Both frames are written to temporary files, intersected with
    ``bedtools intersect -wa -wb -s -sorted`` (strand-aware; ``-sorted`` is
    used to save memory) and the result is read back with pandas.

    :param beddf1: sorted bed dataframe used as ``-a``
    :param beddf2: sorted bed dataframe used as ``-b``
    :param outfile: not used; the intersection always goes to a temporary
        file. Kept so the signature stays compatible with existing callers.
    :return: dataframe of the intersection without a header row, or an empty
        dataframe when the bedtools output could not be read
    """
    # All three temporary files are removed when the with-block exits, also
    # when an exception is raised in between.
    with NamedTemporaryFile('w+t') as bedfile1, \
            NamedTemporaryFile('w+t') as bedfile2, \
            NamedTemporaryFile('w+t') as tmp_out:
        beddf1.to_csv(bedfile1, sep="\t", header=False, index=False)
        beddf2.to_csv(bedfile2, sep="\t", header=False, index=False)
        # bedtools opens the files by name, so buffered rows must be on disk
        # before the command runs
        bedfile1.flush()
        bedfile2.flush()

        cmd = "bedtools intersect -wa -wb -s -sorted -a {bedfile1} -b {bedfile2} > {out}".format(
            bedfile1=bedfile1.name, bedfile2=bedfile2.name, out=tmp_out.name)
        myexe(cmd)

        try:
            intersect_df = pandas.read_csv(
                tmp_out.name, sep="\t", header=None,
                dtype={
                    0: str, 1: int, 2: int, 3: object, 4: str, 5: str,
                    6: str, 7: int, 8: int, 9: str, 10: str, 11: str,
                })
        except Exception:
            # Best effort, e.g. an empty bedtools output makes read_csv raise:
            # return an empty dataframe. Unlike a bare except, this does not
            # swallow KeyboardInterrupt/SystemExit.
            intersect_df = pandas.DataFrame()

    return intersect_df
def mitfi_wrapper_trna(fastafile, MITFIPATH=None, prefix=None):
    """
    Run ``mitfi.jar`` on a fasta file and save its output as text.

    :param fastafile: path of the fasta file handed to mitfi
    :param MITFIPATH: directory holding ``mitfi.jar``; when None it is
        ``bins/mitfi`` next to this module
    :param prefix: stem of the output file; when None it is the fasta file
        name with its last dot-separated part removed
    :return: the file name of the mitfi run, ``<prefix>_trna.txt``
    """
    if MITFIPATH is None:
        MITFIPATH = os.path.join(os.path.dirname(__file__), "bins", "mitfi")

    mitfi_cmd = "java -jar {jarfile} {fastafile}".format(
        fastafile=fastafile, jarfile=os.path.join(MITFIPATH, "mitfi.jar"))
    trna_out = myexe(mitfi_cmd)
    print(trna_out)

    if prefix is None:
        name_parts = fastafile.split("/")[-1].split(".")
        prefix = ".".join(name_parts[:-1])

    out_name = prefix + "_trna.txt"
    with open(out_name, "w") as fw:
        fw.write(trna_out)
    return out_name
def _cmsearch_wrapper_rrna(fastafile, MITFIPATH=None):
    """
    Run ``mitfi.jar`` with the rRNA covariance model and save its output.

    todo: too slow to be practical, maybe change to INFERNAL 1.1 and try

    :param fastafile: path of the fasta file handed to mitfi
    :param MITFIPATH: directory holding ``mitfi.jar``; when None it is
        ``bins/mitfi`` next to this module
    :return: the file name of the mitfi run, ``<prefix>_rrna.txt`` where the
        prefix is the fasta file name up to its first dot
    """
    if MITFIPATH is None:
        path = os.path.dirname(__file__)
        MITFIPATH = os.path.join(path, "bins", "mitfi")

    jarfile = os.path.join(MITFIPATH, "mitfi.jar")
    # NOTE(review): the covariance model is always taken from this module's
    # bins/mitfi directory, even when MITFIPATH is overridden -- confirm
    # that this is intended.
    rrna_cm = os.path.join(os.path.dirname(__file__), "bins", "mitfi", "r_rna.cm")

    mitfi_cmd = "java -jar {jarfile} -cm {rrna_cm} -top {fastafile}".format(
        jarfile=jarfile, fastafile=fastafile, rrna_cm=rrna_cm)
    rrna_out = myexe(mitfi_cmd)
    # print(...) with a single argument behaves the same on Python 2 and 3
    print(rrna_out)

    prefix = fastafile.split("/")[-1].split(".")[0]
    out_name = prefix + "_rrna.txt"
    with open(out_name, "w") as fw:
        fw.write(rrna_out)
    return out_name