def transrate(submitted, trinitydir, transrate_dir, transrate_out,
              trinity_fasta, sample, trimdir, sra, mmetsp):
    # Evaluate a Trinity assembly with transrate against the trimmed reads,
    # copy the assemblies.csv report out of /tmp, and submit the job via qsub.
    trim_1P = trimdir + sra + ".trim_1P.fq"
    trim_2P = trimdir + sra + ".trim_2P.fq"
    if os.path.isfile(trim_1P) and os.path.isfile(trim_2P):
        transrate_command = """
        transrate --assembly={} --threads=8 \
        --left={}{}.trim_1P.fq \
        --right={}{}.trim_2P.fq \
        --output=/tmp/transrate_out.{}
        cp /tmp/transrate_out.{}/assemblies.csv {}{}.assemblies.csv
        rm -rf /tmp/transrate_out.{}
        """.format(trinity_fasta, trimdir, sra, trimdir, sra, sample,
                   sample, transrate_dir, mmetsp, sample)
        print(transrate_command)
        commands = [transrate_command]
        process_name = "transrate"
        module_name_list = ""
        filename = mmetsp
        submitted.append(mmetsp)
        clusterfunc_py3.qsub_file(transrate_dir, process_name,
                                  module_name_list, filename, commands)
    else:
        print("trimfiles not present:", trim_1P, trim_2P)
    return submitted
def get_sourmash_command(mmetsp):
    sourmash_command = """
    sourmash compute --dna --protein /mnt/home/ljcohen/mmetsp_assemblies_trinity2.2.0/{}.trinity_out_2.2.0.Trinity.fasta -k 21 --name-from-first
    """.format(mmetsp)
    commands = [sourmash_command]
    process_name = "sourmash"
    module_name_list = [""]
    filename = mmetsp
    clusterfunc_py3.qsub_file("/mnt/home/ljcohen/mmetsp_sourmash/",
                              process_name, module_name_list, filename,
                              commands)
def consolidate(mmetsp_dir, item):
    combine_orphaned_string = combine_orphaned(mmetsp_dir, item)
    rename_pe_string = rename_pe(mmetsp_dir, item)
    split_reads_string = split_reads(mmetsp_dir, item)
    combine_string = combine(mmetsp_dir, item)
    consolidate_commands = [combine_orphaned_string, rename_pe_string,
                            split_reads_string, combine_string]
    process_name = "consolidate"
    module_name_list = ["GNU/4.8.3", "khmer/2.0"]
    clusterfunc_py3.qsub_file(mmetsp_dir, process_name, module_name_list,
                              item, consolidate_commands)
def run_move_files(trimdir, sra):
    orphan_string = make_orphans(trimdir, sra)
    mv_string1, mv_string2 = move_files(trimdir, sra)
    commands = [orphan_string, mv_string1, mv_string2]
    process_name = "move"
    module_name_list = ""
    filename = sra
    clusterfunc_py3.qsub_file(trimdir, process_name, module_name_list,
                              filename, commands)
def run_trimmomatic_TruSeq(missing, trimmed, remaining, trimdir, file1, file2, sra):
    # Submit a Trimmomatic PE job unless the qsub log shows the run already
    # completed; track samples in the missing/trimmed/remaining lists.
    bash_filename = trimdir + sra + ".trim.TruSeq.sh"
    clusterfunc_py3.check_dir(trimdir + "qsub_files/")
    listoffile = os.listdir(trimdir + "qsub_files/")
    trim_file = trimdir + "qsub_files/" + "trim." + sra + ".log"
    matching = [s for s in listoffile if "trim." + sra + ".log" in s]
    matching_string = "TrimmomaticPE: Completed successfully"
    if os.path.isfile(trim_file):
        with open(trim_file) as f:
            content = f.readlines()
        if len(matching) != 0:
            trim_complete = [m for m in content if matching_string in m]
            if len(trim_complete) != 0:
                print("Already trimmed:", matching)
                trimmed.append(sra)
            else:
                missing.append(trimdir)
                j = """
                java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\
                -baseout {}.trim.fq \\
                {} {} \\
                ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\
                SLIDINGWINDOW:4:2 \\
                LEADING:2 \\
                TRAILING:2 \\
                MINLEN:25 &> trim.{}.log
                """.format(sra, file1, file2, sra)
                orphan_string = make_orphans(trimdir, sra)
                commands = [j, orphan_string]
                process_name = "trim"
                module_name_list = ""
                filename = sra
                clusterfunc_py3.qsub_file(trimdir, process_name,
                                          module_name_list, filename, commands)
    else:
        remaining.append(trimdir)
        j = """
        java -jar /mnt/home/ljcohen/bin/Trimmomatic-0.33/trimmomatic-0.33.jar PE \\
        -baseout {}.trim.fq \\
        {} {} \\
        ILLUMINACLIP:/mnt/home/ljcohen/bin/Trimmomatic-0.33/adapters/combined.fa:2:40:15 \\
        SLIDINGWINDOW:4:2 \\
        LEADING:2 \\
        TRAILING:2 \\
        MINLEN:25 &> trim.{}.log
        """.format(sra, file1, file2, sra)
        orphan_string = make_orphans(trimdir, sra)
        commands = [j, orphan_string]
        process_name = "trim"
        module_name_list = ""
        filename = sra
        clusterfunc_py3.qsub_file(trimdir, process_name, module_name_list,
                                  filename, commands)
    return missing, trimmed, remaining
def run_filter_abund(diginormdir, sra):
    keep_dir = diginormdir + "qsub_files/"
    filter_string = """
    filter-abund.py -V -Z 18 {}norm.C20k20.ct {}*.keep
    """.format(diginormdir, keep_dir)
    extract_paired_string = extract_paired(diginormdir)
    commands = [filter_string, extract_paired_string]
    process_name = "filtabund"
    module_name_list = ["GNU/4.8.3", "khmer/2.0"]
    filename = sra
    clusterfunc_py3.qsub_file(diginormdir, process_name, module_name_list,
                              filename, commands)
def run_diginorm(diginormdir, interleavedir, trimdir, sra):
    normalize_median_string = """
    normalize-by-median.py -p -k 20 -C 20 -M 4e9 \\
    --savegraph {}norm.C20k20.ct \\
    -u {}orphans.fq.gz \\
    {}*.fq
    """.format(diginormdir, trimdir, interleavedir)
    normalize_median_command = [normalize_median_string]
    process_name = "diginorm"
    module_name_list = ["GNU/4.8.3", "khmer/2.0"]
    filename = sra
    clusterfunc_py3.qsub_file(diginormdir, process_name, module_name_list,
                              filename, normalize_median_command)
def run_diginorm(mmetsp_dir, mmetsp):
    normalize_median_string = """
    normalize-by-median.py -p -k 20 -C 20 -M 4e9 \\
    --savegraph {}norm.C20k20.ct \\
    -u {}orphans.fq.gz \\
    {}*.interleaved.fq
    """.format(mmetsp_dir, mmetsp_dir, mmetsp_dir)
    # s = subprocess.Popen("cat diginorm.sh", shell=True)
    # s.wait()
    normalize_median_command = [normalize_median_string]
    process_name = "diginorm"
    module_name_list = ["GNU/4.8.3", "khmer/2.0"]
    filename = mmetsp
    clusterfunc_py3.qsub_file(mmetsp_dir, process_name, module_name_list,
                              filename, normalize_median_command)
def transrate(transrate_dir, sample, trinity_fasta, mmetsp_assemblies_dir, filename):
    # Reference-based transrate: score trinity_fasta against the reference
    # given by `filename`, keeping the assemblies.csv summary.
    transrate_command = """
    transrate -o /tmp/{} \\
    --assembly {} \\
    --reference {} \\
    --threads 8
    cp /tmp/{}/assemblies.csv {}{}.assemblies.csv
    rm -rf /tmp/{}*
    """.format(sample, trinity_fasta, filename, sample, transrate_dir,
               sample, sample)
    commands = [transrate_command]
    process_name = "trans_ref"
    module_name_list = ""
    filename = sample
    # print(transrate_command)
    clusterfunc_py3.qsub_file(mmetsp_assemblies_dir, process_name,
                              module_name_list, filename, commands)
def transrate(transrate_dir, sample, trinity_fasta, mmetsp_assemblies_dir, filename):
    # Same reference-based transrate run, but keeps the per-contig
    # contigs.csv output instead of the assemblies.csv summary.
    transrate_command = """
    transrate -o /tmp/{}_forw \\
    --assembly {} \\
    --reference {} \\
    --threads 8
    cp /tmp/{}_forw/{}*/contigs.csv {}{}.contigs.csv
    rm -rf /tmp/{}_forw*
    """.format(sample, trinity_fasta, filename, sample, sample,
               transrate_dir, sample, sample)
    commands = [transrate_command]
    process_name = "trans_ref"
    module_name_list = ""
    filename = sample
    # print(transrate_command)
    clusterfunc_py3.qsub_file(mmetsp_assemblies_dir, process_name,
                              module_name_list, filename, commands)
def interleave_reads(trimdir, sra, interleavedir):
    interleavefile = interleavedir + sra + ".trimmed.interleaved.fq"
    if os.path.isfile(interleavefile):
        print("already interleaved")
    else:
        interleave_string = "interleave-reads.py " + trimdir + sra + \
            ".trim_1P.fq " + trimdir + sra + ".trim_2P.fq > " + interleavefile
        print(interleave_string)
        interleave_command = [interleave_string]
        process_name = "interleave"
        module_name_list = ["GNU/4.8.3", "khmer/2.0"]
        filename = sra
        clusterfunc_py3.qsub_file(interleavedir, process_name,
                                  module_name_list, filename,
                                  interleave_command)
def transrate_reverse(transrate_dir, sample, trinity_fasta, mmetsp_assemblies_dir, filename):
    # Reverse comparison: `filename` is scored as the assembly and the
    # Trinity assembly is used as the reference.
    transrate_command = """
    transrate -o /tmp/{}_rev \\
    --assembly {} \\
    --reference {} \\
    --threads 8
    cp /tmp/{}_rev/assemblies.csv {}{}.assemblies.csv
    rm -rf /tmp/{}_rev*
    """.format(sample, filename, trinity_fasta, sample, transrate_dir,
               sample, sample)
    # print("This is the reverse transrate command:")
    commands = [transrate_command]
    process_name = "trans_ref_reverse"
    module_name_list = ""
    filename = sample
    print(transrate_command)
    clusterfunc_py3.qsub_file(mmetsp_assemblies_dir, process_name,
                              module_name_list, filename, commands)
def run_trinity(trinitydir, left, right, mmetsp):
    # Assemble with Trinity 2.2.0 in /tmp, copy the finished assembly to the
    # shared assemblies directory, then clean up the temporary files.
    trinity_command = """
    set -x
    # stops execution if there is an error
    set -e
    if [ -f {}trinity_out_2.2.0.Trinity.fasta ]; then exit 0 ; fi
    Trinity --left {} \\
    --right {} --output /tmp/{}.trinity_out_2.2.0 --full_cleanup --seqType fq --max_memory 50G --CPU 16
    cp /tmp/{}.trinity_out_2.2.0.Trinity.fasta /mnt/home/ljcohen/oysterriver_assemblies/
    rm -rf /tmp/{}.trinity_out_2.2.0*
    """.format(trinitydir, left, right, mmetsp, mmetsp, mmetsp)
    commands = [trinity_command]
    process_name = "trinity_2.2.0"
    module_name_list = ["trinity/2.2.0"]
    filename = mmetsp
    clusterfunc_py3.qsub_file(trinitydir, process_name, module_name_list,
                              filename, commands)
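# A hypothetical driver for run_trinity(), only to illustrate how the function
# above might be called; the base directory and sample IDs below are made-up
# examples, not paths from the original pipeline.
def submit_trinity_jobs(basedir, mmetsp_ids):
    for mmetsp in mmetsp_ids:
        trinitydir = basedir + mmetsp + "/trinity/"    # assumed layout
        left = trinitydir + mmetsp + ".left.fq"        # as written by rename_files()
        right = trinitydir + mmetsp + ".right.fq"
        run_trinity(trinitydir, left, right, mmetsp)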
def rename_files(trinitydir, diginormdir, diginormfile, SRA):
    # Takes the diginorm output file, splits it back into paired reads, and
    # writes the Trinity input files (orphans are appended to the left file).
    rename_orphans = combine_orphans(diginormdir)
    split_paired = "split-paired-reads.py -d " + diginormdir + " " + diginormfile
    rename_string1 = "cat " + diginormdir + "*.1 > " + trinitydir + SRA + ".left.fq"
    rename_string2 = "cat " + diginormdir + "*.2 > " + trinitydir + SRA + ".right.fq"
    rename_string3 = ("gunzip -c " + diginormdir + "orphans.keep.abundfilt.fq.gz >> " +
                      trinitydir + SRA + ".left.fq")
    commands = [rename_orphans, split_paired, rename_string1,
                rename_string2, rename_string3]
    process_name = "rename"
    module_name_list = ["GNU/4.8.3", "khmer/2.0"]
    filename = SRA
    clusterfunc_py3.qsub_file(diginormdir, process_name, module_name_list,
                              filename, commands)
def fastqc_report(fastq_file_list, newdir, fastqcdir, filename):
    # Generate FastQC reports for every file in the list, unless reports
    # already exist for this sample.
    print(fastq_file_list)
    print(fastqcdir + filename)
    if glob.glob(fastqcdir + filename + "_*_fastqc.zip"):
        print("fastqc already complete:", filename)
    else:
        file_string = " ".join(fastq_file_list)
        fastqc_string = "fastqc -o " + fastqcdir + " " + file_string
        print("fastqc reports being generated for: " + str(fastq_file_list))
        fastqc_command = [fastqc_string]
        process_name = "fastqc"
        module_name_list = ""
        clusterfunc_py3.qsub_file(fastqcdir, process_name, module_name_list,
                                  filename, fastqc_command)
def run_busco(busco_dir, sample, basedir, filename):
    # Run BUSCO v3 in transcriptome mode against the eukaryota_odb9 lineage
    # (protists_ensembl is the other lineage used in this pipeline).
    busco_command = """
    source ~/.bashrc
    module load GNU/4.8.3
    module unload python
    module load parallel
    source activate busco_v3
    python /mnt/home/ljcohen/bin/busco/scripts/run_BUSCO.py \
    -i {}{} \
    -o {} -l /mnt/home/ljcohen/bin/busco/eukaryota_odb9 \
    -m tran --cpu 8
    """.format(basedir, filename, sample)
    print(busco_command)
    commands = [busco_command]
    process_name = "busco_euk"
    module_name_list = ""
    filename = sample
    clusterfunc_py3.qsub_file(busco_dir, process_name, module_name_list,
                              filename, commands)
def run_streaming_diginorm(trimdir, SRA, diginormdir):
    # From Jessica's streaming protocol: interleave, trim low-abundance k-mers
    # with digital normalization, and split paired/single reads in one stream.
    diginormfile = diginormdir + SRA + ".stream.diginorm.sh"
    # os.chdir(diginormdir)
    stream_string = """#!/bin/bash
    (interleave-reads.py {}{}.trim_1P.fq {}{}.trim_2P.fq && zcat {}orphans.fq.gz) | \\
    (trim-low-abund.py -V -k 20 -Z 18 -C 2 - -o - -M 4e9 --diginorm --diginorm-coverage=20) | \\
    (extract-paired-reads.py --gzip -p {}{}.paired.gz -s {}{}.single.gz) > /dev/null
    """.format(trimdir, SRA, trimdir, SRA, trimdir, diginormdir, SRA,
               diginormdir, SRA)
    print(stream_string)
    # with open(diginormfile, "w") as diginorm_script:
    #     diginorm_script.write(stream_string)
    # s = subprocess.Popen("sudo bash " + diginormfile, shell=True)
    # s.wait()
    # print("file written:", diginormfile)
    # os.chdir("/home/ubuntu/MMETSP/")
    streaming_diginorm_command = [stream_string]
    module_load_list = []
    process_name = "diginorm_stream"
    clusterfunc_py3.qsub_file(diginormdir, process_name, module_load_list,
                              SRA, streaming_diginorm_command)
def interleave_reads(mmetsp_dir, mmetsp):
    interleave_string = """
    cd {}
    for filename in *.trim_1P.fq
    do
        base=$(basename $filename .fq)
        echo $base
        base2=${{base/_1P/_2P}}
        echo $base2
        output=${{base/_1P/}}.interleaved.fq
        # echo $output
        (interleave-reads.py ${{base}}.fq ${{base2}}.fq | gzip > $output)
    done
    """.format(mmetsp_dir)
    print(interleave_string)
    interleave_command = [interleave_string]
    process_name = "interleave"
    module_name_list = ["GNU/4.8.3", "khmer/2.0"]
    filename = mmetsp
    clusterfunc_py3.qsub_file(mmetsp_dir, process_name, module_name_list,
                              filename, interleave_command)
def quant_salmon(salmon_indexdir, salmondir, sra, mmetsp, newdir, trinity_fasta):
    # Build a salmon index for the assembly and quantify the trimmed paired
    # reads for this SRA run.
    file1 = newdir + "trim/" + sra + ".trim_1P.fq"
    file2 = newdir + "trim/" + sra + ".trim_2P.fq"
    if os.path.isfile(file1):
        print("file exists:", file1)
    else:
        print("missing:", file1)
    if os.path.isfile(file2):
        print("file exists:", file2)
    index, salmon_index_string = salmon_index(salmondir, salmon_indexdir,
                                              sra, trinity_fasta)
    salmon_string = ("salmon quant -i " + index + " --libType IU -1 " + file1 +
                     " -2 " + file2 + " -o " + salmondir + mmetsp + "_" + sra +
                     ".quant --dumpEq --auxDir aux")
    commands = [salmon_index_string, salmon_string]
    print(salmon_index_string)
    print(salmon_string)
    process_name = "salmon"
    module_name_list = ""
    filename = sra
    clusterfunc_py3.qsub_file(salmondir, process_name, module_name_list,
                              filename, commands)
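# salmon_index() is defined elsewhere in these scripts; the sketch below is an
# assumption about its shape, inferred from the call in quant_salmon() above:
# it returns the index directory plus the "salmon index" command that builds it.
def salmon_index_sketch(salmondir, salmon_indexdir, sra, trinity_fasta):
    index = salmon_indexdir + sra + ".salmon.index"   # assumed naming scheme
    salmon_index_string = "salmon index -t " + trinity_fasta + " -i " + index
    return index, salmon_index_string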
def run_dammit(dammit_string, dammitdir, mmetsp):
    # Submit a pre-built dammit annotation command as a qsub job.
    dammit_command = [dammit_string]
    process_name = "dammit"
    module_name_list = []
    filename = mmetsp
    clusterfunc_py3.qsub_file(dammitdir, process_name, module_name_list,
                              filename, dammit_command)
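# clusterfunc_py3 is the author's custom cluster-submission module and is not
# shown in this section. The sketch below is only an assumption of what its
# qsub_file() might do, inferred from how it is called throughout these
# functions: write a submission script that loads the requested modules, runs
# the given commands, and hand the script to qsub.
def qsub_file_sketch(workdir, process_name, module_name_list, filename, commands):
    qsub_dir = os.path.join(workdir, "qsub_files")
    os.makedirs(qsub_dir, exist_ok=True)
    script = os.path.join(qsub_dir, process_name + "." + filename + ".qsub")
    with open(script, "w") as out:
        out.write("#!/bin/bash\n")
        for module in module_name_list:
            if module:  # module_name_list is sometimes "" or [""]
                out.write("module load " + module + "\n")
        for command in commands:
            out.write(command + "\n")
    subprocess.Popen("qsub " + script, shell=True).wait()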
import os
import os.path
from os.path import basename
import subprocess
from subprocess import Popen, PIPE
import glob
# custom Lisa module
import clusterfunc_py3


def fastqc_report(fastq_file_list, newdir, fastqcdir, filename):
    # Generate FastQC reports for every file in the list and submit via qsub.
    print(fastq_file_list)
    print(fastqcdir + filename)
    file_string = " ".join(fastq_file_list)
    fastqc_string = "fastqc -o " + fastqcdir + " " + file_string
    print("fastqc reports being generated for: " + str(fastq_file_list))
    fastqc_command = [fastqc_string]
    process_name = "fastqc"
    module_name_list = ""
    clusterfunc_py3.qsub_file(fastqcdir, process_name, module_name_list,
                              filename, fastqc_command)


with open("~/trimmed_files.txt") as