Пример #1
0
    def test_sortmerna_map_sam_alignments(self):
        """ SortMeRNA version 2.0 for mapping sequences onto a reference
            outputting Blast and SAM alignments

            Builds a fresh index for the reference sequences, runs the
            mapper with ``output_sam=True`` and checks that the Blast, SAM
            and log files exist and that the SAM file holds the expected
            records.
        """

        # Rebuild the index
        sortmerna_db, db_files_to_remove = build_database_sortmerna(
            abspath(self.file_reference_seq_fp),
            max_pos=250,
            output_dir=self.output_dir)

        # Files created by indexdb_rna to be deleted
        self.files_to_remove.extend(db_files_to_remove)

        # Run SortMeRNA mapper
        app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                                   output_dir=self.output_dir,
                                   refseqs_fp=self.file_reference_seq_fp,
                                   sortmerna_db=sortmerna_db,
                                   output_sam=True)

        # Check all sortmerna output files exist
        for fname in ('sortmerna_map.blast',
                      'sortmerna_map.sam',
                      'sortmerna_map.log'):
            fp = join(self.output_dir, fname)
            self.assertTrue(exists(fp), "missing output file: %s" % fp)

        sam_alignments_fp = app_result['SAMAlignments'].name

        # Index every SAM line by its first tab-separated field. Default
        # text mode is used because the 'U' flag was removed in Python 3.11.
        with open(sam_alignments_fp) as sam_actual:
            entries = (line.strip().split('\t') for line in sam_actual)
            actual_alignments = {r[0]: r[1:] for r in entries}

        # 30 alignments expected (1 per read) + 2 lines for the @HD and
        # @PG fields
        self.assertEqual(32, len(actual_alignments))

        # Check this alignment exists; indices are relative to the fields
        # that follow the read id
        aligned_id = "HMPMockV1.2.Staggered2.673827_47"
        self.assertIn(aligned_id, actual_alignments)
        self.assertEqual("295053", actual_alignments[aligned_id][1])
        self.assertEqual("AS:i:418", actual_alignments[aligned_id][10])

        # Check alignment for random read is NULL
        random_id = "simulated_random_reads.fa.000000000"
        self.assertIn(random_id, actual_alignments)
        self.assertEqual("*", actual_alignments[random_id][1])
Пример #2
0
    def test_sortmerna_map_num_alignments(self):
        """ SortMeRNA version 2.0 for mapping sequences onto a reference
            outputting first INT num_alignments passing the E-value threshold
            (rather than first INT best alignments)

            Builds a fresh index, runs the mapper with ``num_alignments=1``
            and checks that the Blast and log files exist and that the
            Blast file holds one row per read.
        """

        # Rebuild the index
        sortmerna_db, db_files_to_remove = build_database_sortmerna(
            abspath(self.file_reference_seq_fp),
            max_pos=250,
            output_dir=self.output_dir)

        # Files created by indexdb_rna to be deleted
        self.files_to_remove.extend(db_files_to_remove)

        # Run SortMeRNA mapper
        app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                                   output_dir=self.output_dir,
                                   refseqs_fp=self.file_reference_seq_fp,
                                   sortmerna_db=sortmerna_db,
                                   num_alignments=1)

        # Check all sortmerna output files exist
        for fname in ('sortmerna_map.blast', 'sortmerna_map.log'):
            fp = join(self.output_dir, fname)
            self.assertTrue(exists(fp), "missing output file: %s" % fp)

        blast_alignments_fp = app_result['BlastAlignments'].name

        # Index every Blast row by its first tab-separated field. Default
        # text mode is used because the 'U' flag was removed in Python 3.11.
        with open(blast_alignments_fp) as blast_actual:
            entries = (line.strip().split('\t') for line in blast_actual)
            actual_alignments = {r[0]: r[1:] for r in entries}

        # Check there are 30 alignments (1 per read)
        self.assertEqual(30, len(actual_alignments))

        # Check this alignment exists; indices are relative to the fields
        # that follow the read id
        aligned_id = "HMPMockV1.2.Staggered2.673827_47"
        self.assertIn(aligned_id, actual_alignments)
        self.assertEqual("97.3", actual_alignments[aligned_id][1])
        self.assertEqual("100", actual_alignments[aligned_id][12])

        # Check alignment for random read is NULL
        random_id = "simulated_random_reads.fa.000000000"
        self.assertIn(random_id, actual_alignments)
        self.assertEqual("*", actual_alignments[random_id][0])
Пример #3
0
    def test_sortmerna_map_sam_alignments_with_tags(self):
        """ SortMeRNA version 2.0 for mapping sequences onto a reference
            outputting SAM alignments with @SQ tags

            Builds a fresh index, runs the mapper with ``output_sam=True``,
            ``sam_SQ_tags=True`` and ``blast_format=None``, then checks that
            the SAM and log files exist and that the SAM file contains the
            expected number of lines and every expected @SQ line.
        """

        # Rebuild the index
        sortmerna_db, db_files_to_remove = build_database_sortmerna(
            abspath(self.file_reference_seq_fp),
            max_pos=250,
            output_dir=self.output_dir)

        # Files created by indexdb_rna to be deleted
        self.files_to_remove.extend(db_files_to_remove)

        # Run SortMeRNA mapper
        app_result = sortmerna_map(seq_path=self.file_read_seqs_fp,
                                   output_dir=self.output_dir,
                                   refseqs_fp=self.file_reference_seq_fp,
                                   sortmerna_db=sortmerna_db,
                                   output_sam=True,
                                   sam_SQ_tags=True,
                                   blast_format=None)

        # Check all sortmerna output files exist
        for fname in ('sortmerna_map.sam', 'sortmerna_map.log'):
            fp = join(self.output_dir, fname)
            self.assertTrue(exists(fp), "missing output file: %s" % fp)

        sam_alignments_fp = app_result['SAMAlignments'].name

        # Split every SAM line into its tab-separated fields. Default text
        # mode is used because the 'U' flag was removed in Python 3.11.
        with open(sam_alignments_fp) as sam_actual:
            actual_entries = [line.strip().split('\t') for line in sam_actual]

        # 30 alignments expected (1 per read) + 2 lines for the @HD and
        # @PG fields + 5 lines for the @SQ tags
        self.assertEqual(37, len(actual_entries))

        # Check all expected @SQ tags have been included
        expected_sq_lines = [['@SQ', 'SN:42684', 'LN:1501'],
                             ['@SQ', 'SN:342684', 'LN:1486'],
                             ['@SQ', 'SN:426848', 'LN:1486'],
                             ['@SQ', 'SN:295053', 'LN:1389'],
                             ['@SQ', 'SN:879972', 'LN:1371']]
        for sq_line in expected_sq_lines:
            self.assertIn(sq_line, actual_entries)
Пример #4
0
def remove_artifacts_seqs(seqs_fp,
                          ref_fp,
                          output_fp,
                          ref_db_fp=None,
                          negate=False,
                          threads=1):
    """Remove artifacts from FASTA file using SortMeRNA.

    Every input sequence is mapped against each reference database. The
    ids of reads that align to at least one database are collected, and
    the input FASTA is then filtered on that set and written to
    ``output_fp``. Each written record's header is reduced to the first
    whitespace-delimited token of the original label.

    A ``working_dir`` directory is created next to ``output_fp`` with one
    sub-directory per reference database (named after the database file
    without its extension).

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: string or tuple
        file path(s) to FASTA database file
    output_fp: string
        file path to store output results
    ref_db_fp: string or tuple, optional
        file path(s) to indexed FASTA database; when omitted (or empty) an
        index is built for each reference database and removed again once
        that database has been processed
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to its standard error file; the
        content of that file is printed before raising
    """
    # A single path given as a non-empty string would otherwise be walked
    # (ref_fp) or indexed (ref_db_fp) one character at a time.
    if ref_fp and isinstance(ref_fp, str):
        ref_fp = (ref_fp,)
    if ref_db_fp and isinstance(ref_db_fp, str):
        ref_db_fp = (ref_db_fp,)

    working_dir = join(dirname(output_fp), "working_dir")
    if not exists(working_dir):
        makedirs(working_dir)

    aligned_seq_ids = set()

    for i, db in enumerate(ref_fp):
        # create working directory for each
        # reference database
        db_dir = join(working_dir, splitext(basename(db))[0])
        if not exists(db_dir):
            makedirs(db_dir)

        # Only index files built in this iteration are ours to delete
        files_to_remove = []
        try:
            if ref_db_fp:
                sortmerna_db = ref_db_fp[i]
            else:
                # build index
                sortmerna_db, files_to_remove = build_database_sortmerna(
                    fasta_path=db,
                    max_pos=10000,
                    output_dir=db_dir)

            # run SortMeRNA
            app_result = sortmerna_map(seq_path=seqs_fp,
                                       output_dir=db_dir,
                                       refseqs_fp=db,
                                       sortmerna_db=sortmerna_db,
                                       threads=threads,
                                       best=1)

            # Print SortMeRNA errors
            stderr_fp = app_result['StdErr'].name
            if stat(stderr_fp).st_size != 0:
                with open(stderr_fp) as stderr_f:
                    for line in stderr_f:
                        # drop the newline so the output is not
                        # double-spaced
                        print(line.rstrip('\n'))
                raise ValueError("Could not run SortMeRNA.")

            for line in app_result['BlastAlignments']:
                fields = line.strip().split('\t')
                # Blank/short rows carry no alignment; rows whose second
                # field is '*' are skipped as unaligned reads
                if len(fields) > 1 and fields[1] != '*':
                    aligned_seq_ids.add(fields[0])
        finally:
            # remove indexed database files, also when SortMeRNA failed
            remove_files(files_to_remove, error_on_missing=False)

    discard_aligned = bool(negate)

    # if negate = False, only output sequences
    # matching to at least one of the databases
    with open(seqs_fp) as seqs_f, open(output_fp, 'w') as out_f:
        for label, seq in parse_fasta(seqs_f):
            label = label.split()[0]
            if (label in aligned_seq_ids) != discard_aligned:
                out_f.write(">%s\n%s\n" % (label, seq))
Пример #5
0
def remove_artifacts_seqs(seqs_fp,
                          ref_fp,
                          output_fp,
                          ref_db_fp=None,
                          negate=False,
                          threads=1):
    """Remove artifacts from FASTA file using SortMeRNA.

    Every input sequence is mapped against each reference database. The
    ids of reads that align to at least one database are collected, and
    the input FASTA is then filtered on that set and written to
    ``output_fp``. Each written record's header is reduced to the first
    whitespace-delimited token of the original label.

    A ``working_dir`` directory is created next to ``output_fp`` with one
    sub-directory per reference database (named after the database file
    without its extension).

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: string or tuple
        file path(s) to FASTA database file
    output_fp: string
        file path to store output results
    ref_db_fp: string or tuple, optional
        file path(s) to indexed FASTA database; when omitted (or empty) an
        index is built for each reference database and removed again once
        that database has been processed
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to its standard error file; the
        content of that file is printed before raising
    """
    # A single path given as a non-empty string would otherwise be walked
    # (ref_fp) or indexed (ref_db_fp) one character at a time.
    if ref_fp and isinstance(ref_fp, str):
        ref_fp = (ref_fp,)
    if ref_db_fp and isinstance(ref_db_fp, str):
        ref_db_fp = (ref_db_fp,)

    working_dir = join(dirname(output_fp), "working_dir")
    if not exists(working_dir):
        makedirs(working_dir)

    aligned_seq_ids = set()

    for i, db in enumerate(ref_fp):
        # create working directory for each
        # reference database
        db_dir = join(working_dir, splitext(basename(db))[0])
        if not exists(db_dir):
            makedirs(db_dir)

        # Only index files built in this iteration are ours to delete
        files_to_remove = []
        try:
            if ref_db_fp:
                sortmerna_db = ref_db_fp[i]
            else:
                # build index
                sortmerna_db, files_to_remove = build_database_sortmerna(
                    fasta_path=db,
                    max_pos=10000,
                    output_dir=db_dir)

            # run SortMeRNA
            app_result = sortmerna_map(
                seq_path=seqs_fp,
                output_dir=db_dir,
                refseqs_fp=db,
                sortmerna_db=sortmerna_db,
                threads=threads,
                best=1)

            # Print SortMeRNA errors
            stderr_fp = app_result['StdErr'].name
            if stat(stderr_fp).st_size != 0:
                with open(stderr_fp) as stderr_f:
                    for line in stderr_f:
                        # drop the newline so the output is not
                        # double-spaced
                        print(line.rstrip('\n'))
                raise ValueError("Could not run SortMeRNA.")

            for line in app_result['BlastAlignments']:
                fields = line.strip().split('\t')
                # Blank/short rows carry no alignment; rows whose second
                # field is '*' are skipped as unaligned reads
                if len(fields) > 1 and fields[1] != '*':
                    aligned_seq_ids.add(fields[0])
        finally:
            # remove indexed database files, also when SortMeRNA failed
            remove_files(files_to_remove, error_on_missing=False)

    discard_aligned = bool(negate)

    # if negate = False, only output sequences
    # matching to at least one of the databases
    with open(seqs_fp) as seqs_f, open(output_fp, 'w') as out_f:
        for label, seq in parse_fasta(seqs_f):
            label = label.split()[0]
            if (label in aligned_seq_ids) != discard_aligned:
                out_f.write(">%s\n%s\n" % (label, seq))
Пример #6
0
def remove_artifacts_seqs(seqs_fp,
                          ref_fp,
                          working_dir,
                          ref_db_fp,
                          negate=False,
                          threads=1,
                          verbose=False):
    """Remove artifacts from FASTA file using SortMeRNA.

    Every input sequence is mapped against each reference database using
    the matching pre-built index in ``ref_db_fp``. The ids of reads that
    align to at least one database are collected, and the input FASTA is
    then filtered on that set. The result is written to
    ``<working_dir>/<basename of seqs_fp>.no_artifacts``; each written
    record's header is reduced to the first whitespace-delimited token of
    the original label.

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: tuple
        file path(s) to FASTA database file
    working_dir: string
        working directory path
    ref_db_fp: tuple
        file path(s) to indexed FASTA database, in the same order as
        ref_fp
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA
    verbose: boolean, optional
        If true, output SortMeRNA errors

    Returns
    -------
    string
        file path of the filtered FASTA file

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to its standard error file
    """
    output_fp = join(working_dir, "%s.no_artifacts" % basename(seqs_fp))
    aligned_seq_ids = set()

    for i, db in enumerate(ref_fp):
        # run SortMeRNA
        app_result = sortmerna_map(seq_path=seqs_fp,
                                   output_dir=working_dir,
                                   refseqs_fp=db,
                                   sortmerna_db=ref_db_fp[i],
                                   threads=threads,
                                   best=1)

        # Print SortMeRNA errors
        stderr_fp = app_result['StdErr'].name
        if stat(stderr_fp).st_size != 0:
            if verbose:
                # Default text mode: the 'U' flag was removed in
                # Python 3.11
                with open(stderr_fp) as stderr_f:
                    for line in stderr_f:
                        # drop the newline so the output is not
                        # double-spaced
                        print(line.rstrip('\n'))
            raise ValueError("Could not run SortMeRNA.")

        for line in app_result['BlastAlignments']:
            fields = line.strip().split('\t')
            # Blank/short rows carry no alignment; rows whose second
            # field is '*' are skipped as unaligned reads
            if len(fields) > 1 and fields[1] != '*':
                aligned_seq_ids.add(fields[0])

    discard_aligned = bool(negate)

    # if negate = False, only output sequences
    # matching to at least one of the databases
    with open(seqs_fp) as seqs_f, open(output_fp, 'w') as out_f:
        for label, seq in parse_fasta(seqs_f):
            label = label.split()[0]
            if (label in aligned_seq_ids) != discard_aligned:
                out_f.write(">%s\n%s\n" % (label, seq))
    return output_fp