示例#1
0
    def step_04_fasta_format(self, input_dir):
        """
        Convert the uncompressed FASTQ files in ``input_dir`` to FASTA.

        fastq_to_fasta does not read gzipped files but it will write them,
        but in this step it writes uncompressed output files and they are
        separately compressed. The next step will use the uncompressed files
        and then delete them.

        This step reads uncompressed files from the previous step and
        then deletes them.

        :param input_dir: directory of input files
        :return: directory of output files
        """
        log, output_dir = self.initialize_step()

        print('begin FASTA format step')

        fastq_file_glob = os.path.join(input_dir,
                                       '{}*.fastq'.format(self.prefix))
        log.info('FASTQ file glob: %s', fastq_file_glob)
        ungzipped_fastq_file_list = glob.glob(fastq_file_glob)
        log.info('FASTQ file list:\n\t%s',
                 '\n\t'.join(ungzipped_fastq_file_list))

        # every fastq_to_fasta invocation is given the same log file path
        cmd_log_fp = os.path.join(output_dir, 'log')

        fasta_output_file_list = []
        for fastq_fp in ungzipped_fastq_file_list:
            # raw string so that '\.' reaches the regex engine without relying
            # on Python passing an unrecognized escape sequence through
            fasta_name = re.sub(pattern=r'\.fastq$',
                                repl='.fasta',
                                string=os.path.basename(fastq_fp))
            fasta_fp = os.path.join(output_dir, fasta_name)

            # NOTE(review): per FASTX-Toolkit usage -n keeps reads containing
            # N, -r renames identifiers to numbers, -v is verbose -- confirm
            # against the installed version
            run_cmd([
                'fastq_to_fasta', '-i', fastq_fp, '-o', fasta_fp, '-n', '-v',
                '-r'
            ],
                    log_file=cmd_log_fp)
            fasta_output_file_list.append(fasta_fp)

        # the uncompressed FASTQ inputs are removed only after every file has
        # been converted
        delete_files(*ungzipped_fastq_file_list)

        gzip_files(*fasta_output_file_list)

        self.complete_step(log=log, output_dir=output_dir)
        return output_dir
示例#2
0
    def step_02_join_paired_end_reads(self, input_dir):
        """
        Join the trimmed forward and reverse reads with fastq-join.

        The two gzipped files matching ``{prefix}*.trim[12]p.fastq.gz`` in
        ``input_dir`` are uncompressed and handed to fastq-join. The
        fastq-join output files are gzipped, then the uncompressed inputs and
        the uncompressed outputs are deleted.

        :param input_dir: directory of input files
        :return: directory of output files
        :raises ValueError: if the glob does not match exactly two files
        """
        log, output_dir = self.initialize_step()

        print('begin joined paired ends step')

        trimmed_reads_file_glob = os.path.join(
            input_dir, '{}*.trim[12]p.fastq.gz'.format(self.prefix))
        log.info('trimmed reads file glob: %s', trimmed_reads_file_glob)
        trimmed_reads_files = glob.glob(trimmed_reads_file_glob)
        log.info('trimmed reads files:\n\t%s',
                 '\n\t'.join(trimmed_reads_files))

        # fail with a message that names the glob instead of the bare
        # "not enough/too many values to unpack" ValueError
        if len(trimmed_reads_files) != 2:
            raise ValueError(
                'expected exactly 2 trimmed reads files matching "{}" '
                'but found {}'.format(trimmed_reads_file_glob,
                                      len(trimmed_reads_files)))

        # sorting puts the trim1p file before the trim2p file
        trimmed_forward_reads_fp, trimmed_reverse_reads_fp = sorted(
            trimmed_reads_files)

        (uncompressed_trimmed_forward_reads_fp,
         uncompressed_trimmed_reverse_reads_fp) = ungzip_files(
             trimmed_forward_reads_fp, trimmed_reverse_reads_fp)

        # '%' is left in the output name for fastq-join to fill in; both dots
        # of the suffix are escaped so that only a literal '.trim1p.fastq'
        # is rewritten
        joined_reads_pattern_fp = os.path.join(
            output_dir,
            re.sub(
                string=os.path.basename(uncompressed_trimmed_forward_reads_fp),
                pattern=r'\.trim1p\.fastq$',
                repl='.trim.%.fastq'))

        # NOTE(review): '-m 20' is presumably the minimum overlap -- confirm
        # against the fastq-join documentation
        run_cmd([
            'fastq-join', uncompressed_trimmed_forward_reads_fp,
            uncompressed_trimmed_reverse_reads_fp, '-m', '20', '-o',
            joined_reads_pattern_fp
        ],
                log_file=os.path.join(output_dir, 'log'))

        output_file_glob = os.path.join(output_dir,
                                        '{}*.trim.*.fastq'.format(self.prefix))
        log.info('fastq-join output file glob: %s', output_file_glob)
        output_file_list = glob.glob(output_file_glob)
        log.info('fastq-join output files:\n\t%s',
                 '\n\t'.join(output_file_list))

        gzip_files(*output_file_list)

        # remove the uncompressed inputs and the uncompressed fastq-join
        # outputs now that the outputs have been gzipped
        delete_files(uncompressed_trimmed_forward_reads_fp,
                     uncompressed_trimmed_reverse_reads_fp, *output_file_list)

        self.complete_step(log, output_dir)
        return output_dir
示例#3
0
    def step_05_length_filter(self, input_dir):
        """
        Run fastx_clipper with a length threshold on each FASTA file.

        Reads the uncompressed ``{prefix}*.fasta`` files in ``input_dir``,
        writes one ``.length.fasta`` file per input to the output directory,
        deletes the inputs, gzips the outputs and finally deletes the
        uncompressed outputs.

        :param input_dir: directory of input files
        :return: directory of output files
        """
        log, output_dir = self.initialize_step()

        # this message previously repeated the FASTA format step's message
        print('begin length filter step')

        fasta_file_glob = os.path.join(input_dir,
                                       '{}*.fasta'.format(self.prefix))
        log.info('FASTA file glob: %s', fasta_file_glob)
        fasta_file_list = glob.glob(fasta_file_glob)
        log.info('FASTA file list:\n\t%s', '\n\t'.join(fasta_file_list))

        # value passed to fastx_clipper's -l option
        # NOTE(review): presumably the minimum sequence length to keep --
        # confirm against the FASTX-Toolkit documentation
        min_length = 50
        cmd_log_fp = os.path.join(output_dir, 'log')

        length_filtered_file_list = []
        for fasta_fp in fasta_file_list:
            # raw string so that '\.' reaches the regex engine without relying
            # on Python passing an unrecognized escape sequence through
            length_filtered_name = re.sub(pattern=r'\.fasta$',
                                          repl='.length.fasta',
                                          string=os.path.basename(fasta_fp))
            length_filtered_fp = os.path.join(output_dir,
                                              length_filtered_name)

            run_cmd([
                'fastx_clipper',
                '-i',
                fasta_fp,
                '-o',
                length_filtered_fp,
                '-l',
                str(min_length),
                '-n',
                '-v',
            ],
                    log_file=cmd_log_fp)

            length_filtered_file_list.append(length_filtered_fp)

        delete_files(*fasta_file_list)
        # the uncompressed outputs are deleted after they have been gzipped
        gzip_files(*length_filtered_file_list)
        delete_files(*length_filtered_file_list)

        self.complete_step(log=log, output_dir=output_dir)
        return output_dir
示例#4
0
def test_step_06_rewrite_sequence_ids():
    """
    Run step 06 on one gzipped FASTA file and check that the single output
    file is 'unittest.quality.id.fasta.gz' with '>unittest_1' as its first
    line.
    """
    with tempfile.TemporaryDirectory() as input_dir:
        with tempfile.TemporaryDirectory() as work_dir:
            fasta_fp = write_test_input(
                input_dir=input_dir,
                file_name='unittest.quality.fasta',
                content='>1\n{}\n'.format('A' * 100))

            gzip_files(fasta_fp)

            pipeline = get_pipeline(work_dir=work_dir)
            output_dir = pipeline.step_06_rewrite_sequence_ids(
                input_dir=input_dir)

            assert os.path.exists(output_dir)
            assert os.path.isdir(output_dir)

            output_entries = get_sorted_file_list(output_dir)
            assert len(output_entries) == 1
            assert output_entries[0].name == 'unittest.quality.id.fasta.gz'

            rewritten_fp = os.path.join(output_dir, output_entries[0].name)
            with gzip.open(rewritten_fp, 'rt') as rewritten_file:
                # only the first record header is checked
                assert rewritten_file.readlines()[0] == '>unittest_1\n'
示例#5
0
def test_step_02_join_paired_end_reads():
    """
    fastq-join is expected to produce three files:
        'unittest.trim.join.fastq', 'unittest.trim.un1.fastq', 'unittest.trim.un2.fastq'

    The step output directory should hold their gzipped versions plus 'log'.
    """
    fastq_record = '@read_1 {}\n{}\n+\n{}\n'.format

    with tempfile.TemporaryDirectory() as input_dir:
        with tempfile.TemporaryDirectory() as work_dir:
            forward_fp = write_test_input(
                input_dir=input_dir,
                file_name='unittest.trim1p.fastq',
                content=fastq_record('forward', 'A' * 100, 'a' * 100))
            reverse_fp = write_test_input(
                input_dir=input_dir,
                file_name='unittest.trim2p.fastq',
                content=fastq_record('reverse', 'T' * 100, 'a' * 100))

            gzip_files(forward_fp, reverse_fp)

            pipeline = get_pipeline(work_dir=work_dir)
            output_dir = pipeline.step_02_join_paired_end_reads(
                input_dir=input_dir)

            assert os.path.exists(output_dir)
            assert os.path.isdir(output_dir)

            output_entries = get_sorted_file_list(output_dir)
            assert len(output_entries) == 4

            # check each sorted position against its expected file name
            expected_names = (
                'log',
                'unittest.trim.join.fastq.gz',
                'unittest.trim.un1.fastq.gz',
                'unittest.trim.un2.fastq.gz',
            )
            for position, expected_name in enumerate(expected_names):
                assert output_entries[position].name == expected_name

            check_for_fastq_results(output_dir)
示例#6
0
    def step_03_quality_filter(self, input_dir):
        """
        Run fastq_quality_filter on the joined reads file.

        The first file matching ``{prefix}*.join.fastq*`` in ``input_dir`` is
        uncompressed and filtered into a ``.quality.fastq`` file in the output
        directory. The uncompressed input is then deleted and the filtered
        file is passed to ``gzip_files``.

        :param input_dir: directory of input files
        :return: directory of output files
        :raises IndexError: if no file matches the joined reads glob
        """
        log, output_dir = self.initialize_step()

        print('begin quality filtering step')

        joined_reads_file_glob = os.path.join(
            input_dir, '{}*.join.fastq*'.format(self.prefix))
        # label corrected: this glob selects the joined reads, not the
        # trimmed reads
        log.info('joined reads file glob: %s', joined_reads_file_glob)

        # NOTE(review): the trailing '*' also matches an uncompressed
        # '.join.fastq' file, and only the first match is used
        joined_reads_matches = glob.glob(joined_reads_file_glob)
        if not joined_reads_matches:
            # IndexError is what the bare [0] lookup raised; the message now
            # names the glob that matched nothing
            raise IndexError('no joined reads file matches "{}"'.format(
                joined_reads_file_glob))
        joined_reads_fp = joined_reads_matches[0]
        log.info('joined reads file: %s', joined_reads_fp)

        ungzipped_joined_reads_fp, *_ = ungzip_files(joined_reads_fp)

        quality_filtered_reads_fp = os.path.join(
            output_dir,
            re.sub(string=os.path.basename(ungzipped_joined_reads_fp),
                   pattern=r'\.fastq$',
                   repl='.quality.fastq'))

        # values passed to fastq_quality_filter's -q and -p options
        # NOTE(review): presumably the minimum quality score and the minimum
        # percentage of bases that must reach it -- confirm against the
        # FASTX-Toolkit documentation
        quality_cutoff = 30
        min_percentage = 90

        run_cmd([
            'fastq_quality_filter', '-i', ungzipped_joined_reads_fp, '-o',
            quality_filtered_reads_fp, '-v', '-q',
            str(quality_cutoff), '-p',
            str(min_percentage), '-Q{}'.format(self.phred)
        ],
                log_file=os.path.join(output_dir, 'log'))

        delete_files(ungzipped_joined_reads_fp)

        gzip_files(quality_filtered_reads_fp)

        self.complete_step(log, output_dir)
        return output_dir