def step_07_combine_tsv(self, input_dir):
    """
    Concatenates the per-chunk InterProScan .tsv outputs into one
    combined .tsv file named after the first input file.

    :param input_dir: string path to directory containing *.tsv files
    :return: string path to output directory
    :raises PipelineException: when input_dir contains no .tsv files
    """
    log, output_dir = self.initialize_step()
    start_time = time.time()
    if len(os.listdir(output_dir)) > 0:
        log.warning(
            'output directory "%s" is not empty, this step will be skipped',
            output_dir)
    else:
        input_fp_list = glob.glob(f"{input_dir}/*.tsv")
        if len(input_fp_list) == 0:
            # the raise is sufficient; the unreachable exit() that
            # followed it was dead code and has been removed
            raise PipelineException(
                f'found no tsv files in directory "{input_dir}"')
        log.info(f"input files = {input_fp_list}")
        # Derive the combined basename from the first input file by
        # dropping its last two underscore-separated components
        # (presumably the chunk index and suffix — TODO confirm).
        out_basename = "_".join(
            os.path.basename(input_fp_list[0]).split("_")[0:-2])
        out_fp = os.path.join(output_dir,
                              f"{out_basename}_interpro_combined.tsv")
        log.info(f"out_fp = {out_fp}")
        with open(out_fp, "w") as out:
            for fp in input_fp_list:
                with open(fp, "r") as src:
                    # copy every line verbatim into the combined file
                    for line in src:
                        out.write(line)
    end_time = time.time()
    log.info(f"Time taken for this step: {int((end_time - start_time))}s")
    self.complete_step(log, output_dir)
    return output_dir
def step_05_chunk_reads(self, input_dir):
    """
    Splits each protein FASTA (.faa) file into chunks of at most
    10,000 records, rewriting '*' characters in sequences as 'X'
    (presumably because a downstream tool rejects '*' — TODO confirm).

    :param input_dir: string path to directory containing input .faa files
    :return: string path to output directory
    :raises PipelineException: when no matching .faa files are found
    """
    log, output_dir = self.initialize_step()
    start_time = time.time()
    if len(os.listdir(output_dir)) > 0:
        log.info(
            'output directory "%s" is not empty, this step will be skipped',
            output_dir)
    else:
        log.info('input directory listing:\n\t%s',
                 '\n\t'.join(os.listdir(input_dir)))
        input_files_glob = os.path.join(
            input_dir,
            f'*.ee{self.vsearch_filter_maxee}minlen{self.vsearch_filter_minlen}*.faa'
        )
        input_fp_list = sorted(glob.glob(input_files_glob))
        if len(input_fp_list) == 0:
            raise PipelineException(
                f'found no .ee{self.vsearch_filter_maxee}minlen{self.vsearch_filter_minlen}.faa files in directory "{input_dir}"'
            )
        log.info(f"input file list: {input_fp_list}")
        chunk_size = 10000
        for input_fp in input_fp_list:
            chunk_index = 0
            log.info(f"reading input file {input_fp}")
            fname, ext = input_fp.rsplit('.', 1)
            _, fname = os.path.split(fname)
            outfile = None
            try:
                with open(input_fp, "r") as infile:
                    outfilepath = f"{output_dir}/{fname}_{chunk_index}.{ext}"
                    log.info(f"writing chunk to {outfilepath}")
                    records_in_chunk = 0
                    outfile = open(outfilepath, 'w')
                    for record in SeqIO.parse(infile, "fasta"):
                        if records_in_chunk == chunk_size:
                            # current chunk is full; roll over to a new file
                            outfile.close()
                            chunk_index += 1
                            outfilepath = f"{output_dir}/{fname}_{chunk_index}.{ext}"
                            log.info(f"writing chunk to {outfilepath}")
                            outfile = open(outfilepath, 'w')
                            records_in_chunk = 0
                        # str.replace instead of re.sub("\*", ...): no
                        # regex needed for a literal single-char substitution
                        seq_str = str(record.seq).replace("*", "X")
                        outfile.write(f">{record.id}\n{seq_str}\n")
                        records_in_chunk += 1
            finally:
                # replaces the old bare `except: pass`; close the last
                # chunk deterministically even if parsing raised
                if outfile is not None:
                    outfile.close()
    end_time = time.time()
    log.info(f"Time taken for this step: {int((end_time - start_time))}s")
    self.complete_step(log, output_dir)
    return output_dir
def complete_step(self, log, output_dir):
    """
    Verifies that a pipeline step produced at least one output file.

    :param log: logger for the step (not used by this check)
    :param output_dir: directory whose contents are verified
    :return: None
    :raises PipelineException: when output_dir contains no files
    """
    if not sorted(os.listdir(output_dir)):
        raise PipelineException(
            'ERROR: no output files in directory "{}"'.format(output_dir))
    return
def step_04_get_gene_reads(self, input_dir):
    """
    Uses FragGeneScan to find reads containing fragments of genes.

    :param input_dir: string path to input files
    :return: string path to output directory
    :raises PipelineException: when input_dir contains no .fasta files
    """
    log, output_dir = self.initialize_step()
    start_time = time.time()
    if len(os.listdir(output_dir)) > 0:
        log.warning(
            'output directory "%s" is not empty, this step will be skipped',
            output_dir)
    else:
        # TODO CHANGE BACK: earlier revisions globbed *.fastq.gz here
        input_fp_list = glob.glob(f"{input_dir}/*.fasta")
        if len(input_fp_list) == 0:
            raise PipelineException(
                f'found no fasta files in directory "{input_dir}"')
        log.info(f"input files = {input_fp_list}")
        for fp in input_fp_list:
            # map foo.fasta -> foo.frags (raw string for the regex)
            out_fp = os.path.join(
                output_dir,
                re.sub(string=os.path.basename(fp),
                       pattern=r'\.fasta',
                       repl='.frags'))
            log.info(f"writing output of {fp} to {out_fp}")
            run_cmd(
                [
                    self.frag_executable_fp,
                    f"-genome={fp}",
                    f"-out={out_fp}",
                    "-complete=0",
                    f"-train={self.frag_train_file}",
                    # BUG FIX: was "thread=..." (missing leading dash),
                    # so FragGeneScan never saw the thread option
                    f"-thread={self.threads}"
                    # INCLUDE MORE ARGS
                ],
                log_file=os.path.join(output_dir, 'log'),
                debug=self.debug)
    end_time = time.time()
    log.info(f"Time taken for this step: {int((end_time - start_time))}s")
    self.complete_step(log, output_dir)
    return output_dir
def step_06_get_orfs(self, input_dir):
    """
    Uses Interproscan (Pfam application) to get ORFS and connect them
    to GO terms.

    :param input_dir: string path to input files
    :return: string path to output directory
    :raises PipelineException: when input_dir contains no .faa files
    """
    log, output_dir = self.initialize_step()
    start_time = time.time()
    if len(os.listdir(output_dir)) > 0:
        log.warning(
            'output directory "%s" is not empty, this step will be skipped',
            output_dir)
    else:
        input_fp_list = glob.glob(f"{input_dir}/*.faa")
        if len(input_fp_list) == 0:
            raise PipelineException(
                f'found no faa files in directory "{input_dir}"')
        log.info(f"input files = {input_fp_list}")
        for fp in input_fp_list:
            # map foo.faa -> foo_interpro; raw string fixes the
            # invalid '\.' escape in the old non-raw pattern
            out_basename = os.path.join(
                output_dir,
                re.sub(string=os.path.basename(fp),
                       pattern=r'\.faa',
                       repl='_interpro'))
            log.info(f"writing output of {fp} to {out_basename}")
            run_cmd(
                [
                    self.interproscan_executable_fp,
                    "-appl", "Pfam",
                    "-i", fp,
                    "-b", out_basename,
                    "-goterms",
                    "-iprlookup",
                    "-dra",
                    "-cpu", str(self.threads)
                    # INCLUDE MORE ARGS
                ],
                log_file=os.path.join(output_dir, 'log'),
                debug=self.debug)
    end_time = time.time()
    log.info(f"Time taken for this step: {int((end_time - start_time))}s")
    self.complete_step(log, output_dir)
    return output_dir
def step_02_qc_reads_with_vsearch(self, input_dir):
    """
    Quality-filters reads with vsearch (max expected errors and minimum
    length) and converts them from FASTQ to FASTA, then parses the
    vsearch log to sanity-check how many sequences survived.

    :param input_dir: string path to directory containing input .fastq files
    :return: string path to output directory
    :raises PipelineException: when input_dir contains no .fastq files
    """
    log, output_dir = self.initialize_step()
    start_time = time.time()
    if len(os.listdir(output_dir)) > 0:
        log.warning(
            'output directory "%s" is not empty, this step will be skipped',
            output_dir)
    else:
        input_files_glob = os.path.join(input_dir, '*.fastq*')
        input_fp_list = glob.glob(input_files_glob)
        if len(input_fp_list) == 0:
            raise PipelineException(
                'found no .fastq files in directory "{}"'.format(input_dir))
        log.info(f"input file list: {input_fp_list}")
        for input_fastq_fp in input_fp_list:
            # Uncompress if it is still compressed; endswith() is clearer
            # and safer than slicing the last three characters
            if input_fastq_fp.endswith(".gz"):
                uncompressed_input_fp = ungzip_files(input_fastq_fp,
                                                     target_dir=input_dir,
                                                     debug=self.debug)[0]
                os.remove(input_fastq_fp)
                input_fastq_fp = uncompressed_input_fp
            input_file_basename = os.path.basename(input_fastq_fp)
            # raw, escaped pattern: the old '\.fastq*' made the final 'q'
            # optional/repeatable instead of matching the literal extension
            output_file_basename = re.sub(
                string=input_file_basename,
                pattern=r'\.fastq',
                repl='.ee{}minlen{}.fasta'.format(self.vsearch_filter_maxee,
                                                 self.vsearch_filter_minlen))
            output_fasta_fp = os.path.join(output_dir, output_file_basename)
            log.info('vsearch executable: "%s"', self.vsearch_executable_fp)
            log.info('filtering "%s"', input_fastq_fp)
            run_cmd(
                [
                    self.vsearch_executable_fp,
                    '-fastq_filter', input_fastq_fp,
                    '-fastaout', output_fasta_fp,
                    '-fastq_maxee', str(self.vsearch_filter_maxee),
                    '-fastq_minlen', str(self.vsearch_filter_minlen),
                    '-threads', str(self.threads)
                ],
                log_file=os.path.join(output_dir, 'log'),
                debug=self.debug)
        # Parse the accumulated vsearch log; the last 'sequences kept'
        # and 'executing' lines win.
        with open(os.path.join(output_dir, 'log'), 'r') as logcheck:
            kept_num = 0
            discarded_num = 0
            ran_fp = ''  # was unbound (NameError) if no 'executing' line appeared
            for line in logcheck:
                if 'sequences kept' in line:
                    line_arr = line.split(' ')
                    kept_num = int(line_arr[0])
                    discarded_num = int(line_arr[7])
                if 'executing' in line:
                    line_arr = line.split(' ')
                    ran_fp = line_arr[3]
            log.info("kept_num = {}, discarded_num = {}".format(
                kept_num, discarded_num))
            if kept_num == 0:
                log.error(
                    "No sequences kept by vsearch qc for input file '{}'... Exiting"
                    .format(ran_fp))
                exit(1)
            if discarded_num > kept_num:
                log.warning(
                    "More sequences discarded than kept by vsearch qc for input file '{}'"
                    .format(ran_fp))
    end_time = time.time()
    log.info(f"Time taken for this step: {int((end_time - start_time))}s")
    self.complete_step(log, output_dir)
    return output_dir
def step_01_trimming(self, input_file):
    """
    Uses Trimmomatic (single-end mode) to trim adapters and
    low-quality bases from reads, then parses the Trimmomatic log to
    verify that enough reads survived.

    :param input_file: string path to the input fastq file
    :return: string path to output directory
    :raises PipelineException: when input_file does not exist
    """
    log, output_dir = self.initialize_step()
    start_time = time.time()
    if len(os.listdir(output_dir)) > 0:
        log.warning(
            'output directory "%s" is not empty, this step will be skipped',
            output_dir)
    else:
        if not os.path.isfile(input_file):
            # the raise is sufficient; the unreachable exit() that
            # followed it was dead code and has been removed
            raise PipelineException(
                f'input file {input_file} is not a file or does not exist')
        run_arr = [self.java_executable_fp, "-jar", self.trim_executable_fp]
        trim_log = f"{output_dir}/trim_log"
        # NOTE(review): paired-end (PE) handling was commented out in the
        # original; only single-end mode is supported here.
        run_arr.append("SE")
        # escape the dot and anchor at the end: the old pattern
        # '.(fastq|fq)' let the unescaped '.' match any character
        # anywhere in the name (e.g. 'myfastq.fq' was mangled)
        out_base = re.sub(string=os.path.basename(input_file),
                          pattern=r'\.(fastq|fq)$',
                          repl=".fastq")
        run_arr.extend([
            "-threads", str(self.threads), "-trimlog", trim_log, input_file,
            os.path.join(output_dir, out_base)
        ])
        illuminaclip_str = (f"ILLUMINACLIP:{self.trim_adapter_fasta}:"
                            f"{self.trim_seed_mismatches}:"
                            f"{self.trim_palindrome_clip_thresh}:"
                            f"{self.trim_simple_clip_thresh}:"
                            f"{self.trim_min_adapter_length}:"
                            f"{self.trim_keep_both_reads}")
        run_arr.append(illuminaclip_str)
        run_arr.append(f"LEADING:{self.trim_min_quality}")
        run_arr.append(f"TRAILING:{self.trim_min_quality}")
        run_arr.append(f"MINLEN:{self.trim_min_len}")
        log.info(f"writing output of {input_file} to {output_dir}/{out_base}")
        run_cmd(run_arr,
                log_file=os.path.join(output_dir, 'log'),
                debug=self.debug)
        # Check the log to make sure most reads were trimmed properly
        with open(os.path.join(output_dir, 'log'), 'r') as logcheck:
            percent_surviving = 0.0
            for line in logcheck:
                if "Surviving" in line:
                    larr = line.split(" ")
                    # SE summary token looks like "(95.00%)"; strip the
                    # parentheses and percent sign
                    percent_surviving = float(larr[5][1:-2])
            if percent_surviving < 10.0:
                log.error(
                    f"Fewer than 10% ({percent_surviving}%) of reads "
                    f"survived after Trimmomatic trimming for {input_file}... "
                    f"Exiting")
                exit(1)
            elif percent_surviving < 50.0:
                log.warning(
                    f"Fewer than 50% ({percent_surviving}%) of reads "
                    f"survived after Trimmomatic trimming for {input_file}")
            else:
                log.info(f"{percent_surviving}% of reads survived after "
                         f"Trimmomatic trimming for {input_file}")
    end_time = time.time()
    log.info(f"Time taken for this step: {int((end_time - start_time))}s")
    self.complete_step(log, output_dir)
    return output_dir