示例#1
0
    def read_or_make(cls, *, path_to_genome, path_to_index=None):
        """
        Create an index for the genome and write to file.
        Attempt to read from file instead if it already exists.
        Default `path_to_index` appends the suffix ".index".
        Returns the index.

        Parameters:
            path_to_genome: path (str or Path) of the genome FASTA file.
            path_to_index: optional path of the index file; if falsy,
                the genome path with ".index" appended is used.

        Raises:
            FileNotFoundError: if `path_to_genome` is not an existing file.

        RA, 2020-10-23
        """

        from pathlib import Path

        DEFAULT_SUFFIX = ".index"

        path_to_genome = Path(path_to_genome)
        path_to_index = Path(path_to_index
                             or (str(path_to_genome) + DEFAULT_SUFFIX))

        # Explicit check rather than `assert`:
        # assertions are removed when Python runs with -O.
        if not path_to_genome.is_file():
            raise FileNotFoundError(
                f"Genome file not found: {path_to_genome}")

        # An existing index file takes precedence over rebuilding.
        if path_to_index.is_file():
            return cls.read(path_to_index)

        # Imported only when the index actually has to be built.
        from humdum.io import from_fasta
        from humdum.utils import unlist1

        # NOTE(review): unlist1 presumably unwraps a single-element list,
        # i.e. the FASTA file is expected to hold one record -- confirm.
        record = unlist1(list(from_fasta(path_to_genome)))

        # NOTE(review): relies on write(...) returning the index -- confirm.
        return cls(record.seq).write(path_to_index)
    def test_sw_on_data_small(self, verbose=0):
        """
        Align reads of the small data set against the reference
        with SmithWaterman and require that each alignment carries
        the same CIGAR as the corresponding read in the SAM file.

        A truthy `verbose` additionally prints every alignment.
        """

        here = Path(__file__).parent

        fa = here / "data_for_tests/data_small/genome.chr22.5K.fa"
        reference = str(unlist1(list(from_fasta(fa))).seq)

        sam_candidates = list(
            (here / "data_for_tests/data_small/").glob("*.sam"))
        in_file = sam_candidates.pop()

        max_reads = 2

        # Zipping against a range caps the number of reads processed
        for (read, __) in zip(from_sam(in_file), range(max_reads)):
            read: Read
            (ref, query) = (reference, read.seq)

            aligner = SmithWaterman()

            for alignment in aligner(ref=ref, query=query):
                if verbose:
                    print(alignment.cigar, ' vs ', read.cigar)
                    print(read.mapq, ' vs ', alignment.score)
                    (top, middle, bottom) = alignment.visualize(ref=ref,
                                                                query=query)
                    for row in (top, middle, bottom):
                        print(row)
                    print(alignment.matching_subsegments(), ' vs ', read.cigar)

                self.assertEqual(
                    alignment.cigar,
                    read.cigar,
                    f'{alignment.cigar} is not equal to cigar from sam file {read.cigar}',
                )
    def test_on_data_small(self):
        """
        Map the paired reads of the small data set and compare each
        resulting alignment with the matching record of the SAM file.

        The minus-strand and secondary-alignment flags are asserted;
        differences in cigar, pos and tlen are only printed.
        """
        (read_file1, read_file2) = sorted(source_path.glob("*.fq"))
        genome_file = unlist1(source_path.glob("genome*.fa"))

        sam = AllTheKingsHorses.from_files(
            fa=genome_file,
            fq1=read_file1,
            fq2=read_file2,
        )

        computed = sam.alignments
        expected = from_sam(unlist1(source_path.glob("*.sam")))

        mine: AlignedSegment
        theirs: AlignedSegment
        for (mine, theirs) in zip(computed, expected):
            # See io/sam.py for the explanations
            self.assertEqual(
                mine.flag.is_minus_strand,
                bool(theirs.flag.value & 16),
            )
            self.assertEqual(
                mine.flag.is_secondary_alignment,
                bool(theirs.flag.value & 256),
            )

            same_cigar = (mine.cigar == theirs.cigar)
            same_pos = (mine.pos == theirs.pos)
            same_tlen = (mine.tlen == theirs.tlen)

            if not (same_cigar and same_pos):
                print(f"Read {mine.qname} does not match.")
                print("Mine:  ", mine.cigar, "at", mine.pos)
                print("Theirs:", theirs.cigar, "at", theirs.pos)
                print("Read:  ", mine.seq)
                # print(F"Neighborhood:  ", aligned_segments.ref_genome[(mine.pos - 10):(mine.pos + 10 + len(mine.seq))])
            else:
                print(f"Read {mine.qname} looks good.")

            if same_tlen:
                continue

            print(
                f"tlen mismatch: {mine.tlen} (mine) vs {theirs.tlen} (theirs)"
            )
示例#4
0
    def from_files(cls, *, fa, fq1, fq2):
        """
        Set up AllTheKingsHorses from files and start the paired mapping.

        `fa` is the reference genome file,
        `fq1` and `fq2` are the FASTQ files.

        Returns a small holder class with two attributes:
        `headers`, the result of headers(), and
        `alignments`, the result of map_paired(fq1, fq2).
        """

        genome_record = unlist1(from_fasta(fa))

        # Keyword arguments are evaluated left to right:
        # the index is obtained before the aligner is constructed.
        atkh = AllTheKingsHorses(
            genome_index=GenomeIndex.read_or_make(path_to_genome=fa),
            sequence_aligner=SequenceAligner(),
            ref_genome=genome_record,
        )

        # Anonymous class used as a plain attribute container
        class _:
            headers = atkh.headers()
            alignments = atkh.map_paired(fq1, fq2)

        return _
示例#5
0
# RA, 2020-10-13

from pathlib import Path
from unittest import TestCase
from itertools import count

from humdum.utils import unlist1, at_most_n

data_root = Path(__file__).parent / "data_for_tests"
source_path = data_root / "data_small"
file = unlist1(sorted(source_path.glob("*.sam")))


class TestIoSam(TestCase):
    """Smoke tests for the SAM reader `from_sam` of `humdum.io`."""

    def test_import(self):
        """`from_sam` can be imported from `humdum.io`."""
        from humdum.io import from_sam

    def test_no_import(self):
        """
        Check that `from_sam_pysam` is no longer
        importable from `humdum.io`.
        """
        with self.assertRaises(ImportError):
            from humdum.io import from_sam_pysam

    def test_from_sam(self):
        """Read all records of the SAM file and take the last one."""
        from humdum.io import from_sam

        records = list(from_sam(file))
        alignment = records.pop()
# RA, 2020-10-23

from unittest import TestCase
from pathlib import Path

from humdum.main import AllTheKingsHorses
from humdum.utils import relpath, unlist1, at_most_n

from humdum.io import AlignedSegment
from humdum.io import from_sam

from itertools import count

data_root = Path(__file__).parent / "data_for_tests"
source_path = data_root / "data"
genome_file = unlist1(source_path.glob("genome*.fa.gz"))


class TestATKH(TestCase):
    """Runs AllTheKingsHorses on the files found under `source_path`."""

    def test_on_data_large_5xCov(self):
        """
        Map the pair of "5xCov" read files against `genome_file` and
        print the alignments passed through `at_most_n(..., 50)`.
        """
        fastq_pair = sorted(source_path.glob("*5xCov*.fq*"))
        # Exactly two read files are expected
        (first_reads, second_reads) = fastq_pair

        result = AllTheKingsHorses.from_files(
            fa=genome_file,
            fq1=first_reads,
            fq2=second_reads,
        )

        for record in at_most_n(result.alignments, 50):
            print(record)
import time
from unittest import TestCase
from pathlib import Path

from humdum.io import open_maybe_gz
from humdum.utils import unlist1
from humdum.index import FmIndex as GenomeIndex

data_root = Path(__file__).parent / "data_for_tests/data"
genome_file = unlist1(data_root.glob("*.fa.gz"))


class TestFm(TestCase):
    """
    Tests that read the gzipped genome file (`genome_file`)
    through `open_maybe_gz`.
    """
    def test_open_and_read(self):
        """Open the genome file and read its first three lines."""
        with open_maybe_gz(genome_file) as fd:
            fd.readline()
            fd.readline()
            fd.readline()

    def test_init_write(self):
        """
        Accumulate lines of the genome file into the string `genome`.

        NOTE(review): no loop exit and no further use of `genome` are
        visible in this excerpt -- the method appears to be cut off;
        confirm against the complete file.
        """

        genome = ""

        with open_maybe_gz(genome_file) as fd:

            # The first line read is discarded (overwritten right below);
            # the second one, stripped of trailing whitespace, is kept.
            line = fd.readline()
            line = fd.readline().rstrip()
            while True:

                # Append the current line to the accumulated sequence
                genome += line
 def read_or_make(cls, *, path_to_genome, ignored=None):
     """
     Construct `cls` from the `seq` of the record read
     from the FASTA file at `path_to_genome`.

     The `ignored` argument is accepted but not used.
     """
     records = list(from_fasta(path_to_genome))
     record = unlist1(records)
     return cls(record.seq)