Example #1
import re
import numpy as np

def coverage_pbp(file, reference_length=None) -> np.ndarray:
    """
    Reads a SAM file and computes the per-base coverage
    from the aligned blocks.
    In particular, if a position of the reference is consistently
    a 'deletion' relative to the matched reads, it is counted as zero.

    If `reference_length` is not specified, it is inferred from
    the mapped reads, but an uncovered residual at the 'right'
    end of the reference then goes undetected.
    """

    from humdum.io import from_sam

    zeros = (lambda n: np.zeros(n, dtype=int))
    counts = zeros(reference_length or 0)
    for read in from_sam(file):
        a = read.pos
        for (n, op) in re.findall(r"([0-9]+)([XIDSM=])", read.cigar):
            b = a + int(n)
            assert (a < b), "Only expect positive numbers in CIGAR."
            if (op in '=M'):
                # Grow the array if the aligned block runs past its current end
                if (b > len(counts)):
                    counts = np.concatenate([counts, zeros(b - len(counts))])
                counts[a:b] += 1
            # Only operations that consume the reference advance the position;
            # insertions (I) and soft clips (S) consume the query only
            if (op in '=MXD'):
                a = b

    return counts
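A minimal usage sketch for coverage_pbp; the SAM path "alignment.sam" is a hypothetical placeholder, not a file from the repository.

# Hypothetical input file; any SAM file accepted by humdum.io.from_sam works here
coverage = coverage_pbp("alignment.sam")

print("Mean coverage:          ", coverage.mean())
print("Zero-coverage positions:", int(np.sum(coverage == 0)))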
Example #2
    def test_from_sam(self):
        from humdum.io import from_sam

        alignment = list(from_sam(file)).pop()

        last_read = "CACCATCCAGAACAGTGCCTCTTGCAGAGTCTCCTTGGGAAACTTACCAAGTCTGATGGTAGCAGGGGCATGGGACCATCCTAACTGGGAAGACAAAAAGGCTGAGACCTTCCCAGAGTCACCTT"
        self.assertEqual(alignment.seq, last_read)
Example #3
def tlen_hist(file):
    """
    Returns a structure containing the fields
        length
        counts
    where
        counts[i] is the number of
        transcripts of template length length[i].

    Expects a SAM file `file`.

    Only counts reads with tlen in the range 0 to 10000.
    """

    import numpy
    from humdum.io import from_sam
    from collections import Counter

    tlens_counts = numpy.asarray(list(Counter([
        read.tlen
        for read in from_sam(file)
        if (0 <= read.tlen <= 10000)
    ]).items())).T

    # Lightweight namespace holding the two parallel arrays
    class _:
        length = tlens_counts[0]
        counts = tlens_counts[1]

    return _
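A possible way to inspect the result, assuming matplotlib is installed; the SAM path "alignment.sam" is again a hypothetical placeholder.

# Plot the template-length histogram returned by tlen_hist (sketch)
import matplotlib.pyplot as plt

hist = tlen_hist("alignment.sam")
plt.bar(hist.length, hist.counts, width=1)
plt.xlabel("Template length (tlen)")
plt.ylabel("Number of reads")
plt.show()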
Example #4
    def test_sw_on_data_small(self, verbose=0):

        fa = Path(__file__).parent / "data_for_tests/data_small/genome.chr22.5K.fa"

        reference = str(unlist1(list(from_fasta(fa))).seq)

        in_file = list((Path(__file__).parent / "data_for_tests/data_small/").glob("*.sam")).pop()
        max_reads = 2
        for (read, __) in zip(from_sam(in_file), range(max_reads)):
            read: Read
            ref = reference
            query = read.seq
            aligner = SmithWaterman()
            for alignment in aligner(ref=ref, query=query):
                if verbose:
                    print(alignment.cigar, ' vs ', read.cigar)
                    print(read.mapq, ' vs ', alignment.score)
                    x, y, z = alignment.visualize(ref=ref, query=query)
                    print(x)
                    print(y)
                    print(z)
                    print(alignment.matching_subsegments(), ' vs ', read.cigar)
                self.assertEqual(
                    alignment.cigar, read.cigar,
                    f'{alignment.cigar} is not equal to cigar from sam file {read.cigar}'
                )
Example #5
    def test_against_pysam(self):
        from itertools import count
        from humdum.io import from_sam, AlignedSegment
        from humdum.io import from_sam_pysam
        import pysam
        for (have, want, n) in zip(from_sam(file), from_sam_pysam(file),
                                   count()):
            self.assertIsInstance(have, AlignedSegment)
            self.assertIsInstance(want, pysam.AlignedSegment)
            self.assertEqual(have.cigar, want.cigarstring)

        self.assertEqual(n, 1169)
Example #6
    def test_on_data_small(self):
        (read_file1, read_file2) = sorted(source_path.glob("*.fq"))
        genome_file = unlist1(source_path.glob("genome*.fa"))

        sam = AllTheKingsHorses.from_files(fa=genome_file,
                                           fq1=read_file1,
                                           fq2=read_file2)

        mine: AlignedSegment
        theirs: AlignedSegment
        for ((mine, theirs), n) in zip(
                zip(sam.alignments,
                    from_sam(unlist1(source_path.glob("*.sam")))), count()):
            # See io/sam.py for the explanations
            self.assertEqual(mine.flag.is_minus_strand,
                             bool(theirs.flag.value & 16))
            self.assertEqual(mine.flag.is_secondary_alignment,
                             bool(theirs.flag.value & 256))

            cigar_match = (mine.cigar == theirs.cigar)
            pos_match = (mine.pos == theirs.pos)
            tlen_match = (mine.tlen == theirs.tlen)

            if cigar_match and pos_match:
                print(F"Read {mine.qname} looks good.")
            else:
                print(F"Read {mine.qname} does not match.")
                print(F"Mine:  ", mine.cigar, "at", mine.pos)
                print(F"Theirs:", theirs.cigar, "at", theirs.pos)
                print(F"Read:  ", mine.seq)
                # print(F"Neighborhood:  ", aligned_segments.ref_genome[(mine.pos - 10):(mine.pos + 10 + len(mine.seq))])

            if not tlen_match:
                print(
                    F"tlen mismatch: {mine.tlen} (mine) vs {theirs.tlen} (theirs)"
                )
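For reference, the bit tests in the example above follow the standard SAM FLAG definitions; below is a small standalone sketch of the same decoding (the helper name describe_flag is ours, not part of humdum).

# SAM FLAG bits used in the test above:
#   0x10  (16)  -> read mapped to the reverse (minus) strand
#   0x100 (256) -> secondary alignment
def describe_flag(flag: int) -> dict:
    return {
        "is_minus_strand": bool(flag & 0x10),
        "is_secondary_alignment": bool(flag & 0x100),
    }

assert describe_flag(16 + 256) == {"is_minus_strand": True, "is_secondary_alignment": True}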