示例#1
0
 def step_07_combine_tsv(self, input_dir):
     """
     Concatenates all InterProScan .tsv chunk outputs in input_dir into a
     single combined .tsv file in this step's output directory.
     :param input_dir: string path to directory containing *.tsv files
     :return: string path to output directory
     :raises PipelineException: if input_dir contains no .tsv files
     """
     log, output_dir = self.initialize_step()
     start_time = time.time()
     if len(os.listdir(output_dir)) > 0:
         log.warning(
             'output directory "%s" is not empty, this step will be skipped',
             output_dir)
     else:
         input_fp_list = glob.glob(f"{input_dir}/*.tsv")
         if len(input_fp_list) == 0:
             # the unreachable exit() that followed this raise was removed
             raise PipelineException(
                 f'found no tsv files in directory "{input_dir}"')
         log.info(f"input files = {input_fp_list}")
         # drop the last two underscore-separated fields of the first
         # input's basename to recover the shared sample prefix
         out_basename = "_".join(
             os.path.basename(input_fp_list[0]).split("_")[0:-2])
         out_fp = os.path.join(output_dir,
                               f"{out_basename}_interpro_combined.tsv")
         log.info(f"out_fp = {out_fp}")
         with open(out_fp, "w") as out:
             for fp in input_fp_list:
                 with open(fp, "r") as f:
                     # stream line-by-line to avoid loading whole files
                     for line in f:
                         out.write(line)
     end_time = time.time()
     log.info(f"Time taken for this step: {int((end_time - start_time))}s")
     self.complete_step(log, output_dir)
     return output_dir
示例#2
0
 def step_05_chunk_reads(self, input_dir):
     """
     Splits each quality-filtered .faa file into chunk files of at most
     10000 FASTA records each, masking '*' characters in sequences as 'X'.
     :param input_dir: string path to input files
     :return: string path to output directory
     :raises PipelineException: if no matching .faa files are found
     """
     log, output_dir = self.initialize_step()
     start_time = time.time()
     if len(os.listdir(output_dir)) > 0:
         log.info(
             'output directory "%s" is not empty, this step will be skipped',
             output_dir)
     else:
         log.info('input directory listing:\n\t%s',
                  '\n\t'.join(os.listdir(input_dir)))
         input_files_glob = os.path.join(
             input_dir,
             f'*.ee{self.vsearch_filter_maxee}minlen{self.vsearch_filter_minlen}*.faa'
         )
         input_fp_list = sorted(glob.glob(input_files_glob))
         if len(input_fp_list) == 0:
             raise PipelineException(
                 f'found no .ee{self.vsearch_filter_maxee}minlen{self.vsearch_filter_minlen}.faa files in directory "{input_dir}"'
             )
         log.info(f"input file list: {input_fp_list}")
         chunk_size = 10000
         for input_fp in input_fp_list:
             chunk_index = 0
             log.info(f"reading input file {input_fp}")
             fname, ext = input_fp.rsplit('.', 1)
             _, fname = os.path.split(fname)
             outfile = None
             try:
                 with open(input_fp, "r") as infile:
                     outfilepath = f"{output_dir}/{fname}_{chunk_index}.{ext}"
                     log.info(f"writing chunk to {outfilepath}")
                     record_count = 0
                     outfile = open(outfilepath, 'w')
                     for record in SeqIO.parse(infile, "fasta"):
                         if record_count == chunk_size:
                             # current chunk is full; roll to the next one
                             outfile.close()
                             chunk_index += 1
                             outfilepath = f"{output_dir}/{fname}_{chunk_index}.{ext}"
                             log.info(f"writing chunk to {outfilepath}")
                             outfile = open(outfilepath, 'w')
                             record_count = 0
                         # mask '*' as 'X' (str.replace does the same work
                         # as the original re.sub without a regex)
                         tmp_seq = str(record.seq).replace("*", "X")
                         outfile.write(f">{record.id}\n{tmp_seq}\n")
                         record_count += 1
             finally:
                 # guarantee the last chunk file is closed even if parsing
                 # fails; replaces the original bare "except: pass"
                 if outfile is not None:
                     outfile.close()
     end_time = time.time()
     log.info(f"Time taken for this step: {int((end_time - start_time))}s")
     self.complete_step(log, output_dir)
     return output_dir
示例#3
0
 def complete_step(self, log, output_dir):
     """
     Verifies that the finished step produced at least one output file.
     :param log: step logger (not used on the success path)
     :param output_dir: directory whose contents are checked
     :return: None
     :raises PipelineException: if output_dir contains no files
     """
     if not os.listdir(output_dir):
         raise PipelineException(
             'ERROR: no output files in directory "{}"'.format(output_dir))
示例#4
0
 def step_04_get_gene_reads(self, input_dir):
     """
     Uses FragGeneScan to find reads containing fragments of genes
     :param input_dir: string path to input files
     :return: string path to output directory
     :raises PipelineException: if input_dir contains no .fasta files
     """
     log, output_dir = self.initialize_step()
     start_time = time.time()
     if len(os.listdir(output_dir)) > 0:
         log.warning(
             'output directory "%s" is not empty, this step will be skipped',
             output_dir)
     else:
         input_fp_list = glob.glob(f"{input_dir}/*.fasta")
         if len(input_fp_list) == 0:
             raise PipelineException(
                 f'found no fasta files in directory "{input_dir}"')
         log.info(f"input files = {input_fp_list}")
         for fp in input_fp_list:
             # FragGeneScan output files share this ".frags" prefix
             out_fp = os.path.join(
                 output_dir,
                 re.sub(string=os.path.basename(fp),
                        pattern=r'\.fasta',
                        repl='.frags'))
             log.info(f"writing output of {fp} to {out_fp}")
             run_cmd(
                 [
                     self.frag_executable_fp, f"-genome={fp}",
                     f"-out={out_fp}", "-complete=0",
                     f"-train={self.frag_train_file}",
                     # fixed: original passed "thread=N" without the
                     # leading dash, so the option was never applied
                     f"-thread={self.threads}"
                 ],
                 log_file=os.path.join(output_dir, 'log'),
                 debug=self.debug)
     end_time = time.time()
     log.info(f"Time taken for this step: {int((end_time - start_time))}s")
     self.complete_step(log, output_dir)
     return output_dir
示例#5
0
 def step_06_get_orfs(self, input_dir):
     """
     Uses Interproscan to get ORFS and connect them to GO terms
     :param input_dir: string path to input files
     :return: string path to output directory
     :raises PipelineException: if input_dir contains no .faa files
     """
     log, output_dir = self.initialize_step()
     start_time = time.time()
     if len(os.listdir(output_dir)) > 0:
         log.warning(
             'output directory "%s" is not empty, this step will be skipped',
             output_dir)
     else:
         input_fp_list = glob.glob(f"{input_dir}/*.faa")
         if len(input_fp_list) == 0:
             raise PipelineException(
                 f'found no faa files in directory "{input_dir}"')
         log.info(f"input files = {input_fp_list}")
         for fp in input_fp_list:
             # raw string fixes the invalid '\.' escape warning; the
             # pattern itself is unchanged
             out_basename = os.path.join(
                 output_dir,
                 re.sub(string=os.path.basename(fp),
                        pattern=r'\.faa',
                        repl='_interpro'))
             log.info(f"writing output of {fp} to {out_basename}")
             run_cmd(
                 [
                     self.interproscan_executable_fp, "-appl", "Pfam", "-i",
                     fp, "-b", out_basename, "-goterms", "-iprlookup",
                     "-dra", "-cpu",
                     str(self.threads)
                     # INCLUDE MORE ARGS
                 ],
                 log_file=os.path.join(output_dir, 'log'),
                 debug=self.debug)
     end_time = time.time()
     log.info(f"Time taken for this step: {int((end_time - start_time))}s")
     self.complete_step(log, output_dir)
     return output_dir
示例#6
0
 def step_02_qc_reads_with_vsearch(self, input_dir):
     """
     Quality-filters reads with vsearch (max expected error / min length),
     writing .fasta output, then scans the vsearch log: exits if a file
     kept no sequences, warns if more were discarded than kept.
     :param input_dir: string path to input files
     :return: string path to output directory
     :raises PipelineException: if input_dir contains no .fastq files
     """
     log, output_dir = self.initialize_step()
     start_time = time.time()
     if len(os.listdir(output_dir)) > 0:
         log.warning(
             'output directory "%s" is not empty, this step will be skipped',
             output_dir)
     else:
         input_files_glob = os.path.join(input_dir, '*.fastq*')
         input_fp_list = glob.glob(input_files_glob)
         if len(input_fp_list) == 0:
             raise PipelineException(
                 'found no .fastq files in directory "{}"'.format(
                     input_dir))
         log.info(f"input file list: {input_fp_list}")
         for input_fastq_fp in input_fp_list:
             # uncompress the input first if it is still gzipped
             if input_fastq_fp[-3:] == ".gz":
                 uncompressed_input_fp = ungzip_files(input_fastq_fp,
                                                      target_dir=input_dir,
                                                      debug=self.debug)[0]
                 os.remove(input_fastq_fp)
                 input_fastq_fp = uncompressed_input_fp
             input_file_basename = os.path.basename(input_fastq_fp)
             # raw string fixes the invalid '\.' escape warning; the
             # pattern itself is unchanged
             output_file_basename = re.sub(
                 string=input_file_basename,
                 pattern=r'\.fastq*',
                 repl='.ee{}minlen{}.fasta'.format(
                     self.vsearch_filter_maxee, self.vsearch_filter_minlen))
             output_fasta_fp = os.path.join(output_dir,
                                            output_file_basename)
             log.info('vsearch executable: "%s"',
                      self.vsearch_executable_fp)
             log.info('filtering "%s"', input_fastq_fp)
             run_cmd(
                 [
                     self.vsearch_executable_fp,
                     '-fastq_filter',
                     input_fastq_fp,
                     '-fastaout',
                     output_fasta_fp,
                     '-fastq_maxee',
                     str(self.vsearch_filter_maxee),
                     '-fastq_minlen',
                     str(self.vsearch_filter_minlen),
                     '-threads',
                     str(self.threads)
                 ],
                 log_file=os.path.join(output_dir, 'log'),
                 debug=self.debug)
         # parse the shared vsearch log: a "sequences kept" line carries
         # the counts, and a later "executing" line names the input file
         with open(os.path.join(output_dir, 'log'), 'r') as logcheck:
             kept_num = 0
             discarded_num = 0
             for line in logcheck:
                 if 'sequences kept' in line:
                     line_arr = line.split(' ')
                     kept_num = int(line_arr[0])
                     discarded_num = int(line_arr[7])
                 if 'executing' in line:
                     line_arr = line.split(' ')
                     ran_fp = line_arr[3]
                     log.info("kept_num = {}, discarded_num = {}".format(
                         kept_num, discarded_num))
                     if kept_num == 0:
                         log.error(
                             "No sequences kept by vsearch qc for input file '{}'... Exiting"
                             .format(ran_fp))
                         exit(1)
                     if discarded_num > kept_num:
                         log.warning(
                             "More sequences discarded than kept by vsearch qc for input file '{}'"
                             .format(ran_fp))
     end_time = time.time()
     log.info(f"Time taken for this step: {int((end_time - start_time))}s")
     self.complete_step(log, output_dir)
     return output_dir
示例#7
0
    def step_01_trimming(self, input_file):
        """
        Uses Trimmomatic (single-end mode) to trim reads, then parses the
        Trimmomatic log and exits if fewer than 10% of reads survived.
        :param input_file: path to the input fastq file
        :return: path to output directory
        :raises PipelineException: if input_file does not exist
        """
        log, output_dir = self.initialize_step()
        start_time = time.time()
        if len(os.listdir(output_dir)) > 0:
            log.warning(
                'output directory "%s" is not empty, this step will be skipped',
                output_dir)
        else:
            if not os.path.isfile(input_file):
                # the unreachable exit() that followed this raise was removed
                raise PipelineException(
                    f'input file {input_file} is not a file or does not exist')
            run_arr = [
                self.java_executable_fp, "-jar", self.trim_executable_fp
            ]
            trim_log = f"{output_dir}/trim_log"
            # NOTE: paired-end ("PE") support is currently disabled;
            # only single-end mode is run
            run_arr.append("SE")
            # escape the dot so it matches a literal '.'; the original
            # pattern r'.(fastq|fq)' let '.' match any character
            out_base = re.sub(string=os.path.basename(input_file),
                              pattern=r'\.(fastq|fq)',
                              repl=".fastq")
            run_arr.extend([
                "-threads",
                str(self.threads), "-trimlog", trim_log, input_file,
                os.path.join(output_dir, out_base)
            ])
            illuminaclip_str = (f"ILLUMINACLIP:{self.trim_adapter_fasta}:"
                                f"{self.trim_seed_mismatches}:"
                                f"{self.trim_palindrome_clip_thresh}:"
                                f"{self.trim_simple_clip_thresh}:"
                                f"{self.trim_min_adapter_length}:"
                                f"{self.trim_keep_both_reads}")
            leading_str = f"LEADING:{self.trim_min_quality}"
            trailing_str = f"TRAILING:{self.trim_min_quality}"
            minlen_str = f"MINLEN:{self.trim_min_len}"
            run_arr.extend(
                [illuminaclip_str, leading_str, trailing_str, minlen_str])
            log.info(
                f"writing output of {input_file} to {output_dir}/{out_base}")
            run_cmd(run_arr,
                    log_file=os.path.join(output_dir, 'log'),
                    debug=self.debug)
            # Check the log to make sure most reads were trimmed properly
            with open(os.path.join(output_dir, 'log'), 'r') as logcheck:
                percent_surviving = 0.0
                for line in logcheck:
                    if "Surviving" in line:
                        larr = line.split(" ")
                        # single-end log format: the percentage is field 5,
                        # e.g. "(97.50%)" -> strip "(" and "%)"
                        percent_surviving = float(larr[5][1:-2])
                if percent_surviving < 10.0:
                    log.error(
                        f"Fewer than 10% ({percent_surviving}%) of reads "
                        f"survived after Trimmomatic trimming for {input_file}... "
                        f"Exiting")
                    exit(1)
                elif percent_surviving < 50.0:
                    log.warning(
                        f"Fewer than 50% ({percent_surviving}%) of reads "
                        f"survived after Trimmomatic trimming for {input_file}"
                    )
                else:
                    log.info(f"{percent_surviving}% of reads survived after "
                             f"Trimmomatic trimming for {input_file}")

        end_time = time.time()
        log.info(f"Time taken for this step: {int((end_time - start_time))}s")
        self.complete_step(log, output_dir)
        return output_dir