def read_or_make(cls, *, path_to_genome, path_to_index=None):
    """
    Load the genome index from file, or build it and write it to file.

    If `path_to_index` is not given (or is falsy), the index location is
    the genome path with the suffix ".index" appended. If a file already
    exists at that location it is loaded with `cls.read`. Otherwise the
    genome FASTA file is read, an instance of `cls` is created from its
    sequence, and the result of calling `.write(path_to_index)` on that
    instance is returned.

    An assertion fails if `path_to_genome` is not an existing file.

    RA, 2020-10-23
    """
    from pathlib import Path

    DEFAULT_SUFFIX = ".index"

    path_to_genome = Path(path_to_genome)

    # Fall back to "<genome path>.index" when no index location is supplied.
    if not path_to_index:
        path_to_index = str(path_to_genome) + DEFAULT_SUFFIX
    path_to_index = Path(path_to_index)

    assert path_to_genome.is_file()

    # An index file that is already on disk is reused as is.
    if path_to_index.is_file():
        return cls.read(path_to_index)

    # Project helpers are only needed when the index has to be built.
    from humdum.io import from_fasta
    from humdum.utils import unlist1

    records = list(from_fasta(path_to_genome))
    # NOTE(review): unlist1 presumably unwraps a one-element list -- confirm.
    genome = unlist1(records)
    return cls(genome.seq).write(path_to_index)
def test_sw_on_data_small(self, verbose=0):
    """
    Align the first few reads of the small SAM file against the small
    reference with SmithWaterman, and require every alignment's CIGAR
    to equal the CIGAR recorded for that read in the SAM file.

    A truthy `verbose` additionally prints the details of each alignment.
    """
    here = Path(__file__).parent

    fa = here / "data_for_tests/data_small/genome.chr22.5K.fa"
    fasta_records = list(from_fasta(fa))
    reference = str(unlist1(fasta_records).seq)

    sam_files = list((here / "data_for_tests/data_small/").glob("*.sam"))
    in_file = sam_files.pop()

    # Only the first `max_reads` reads of the SAM file are checked.
    max_reads = 2

    for (read, __) in zip(from_sam(in_file), range(max_reads)):
        read: Read

        ref = reference
        query = read.seq
        aligner = SmithWaterman()

        for alignment in aligner(ref=ref, query=query):
            if verbose:
                print(alignment.cigar, ' vs ', read.cigar)
                print(read.mapq, ' vs ', alignment.score)
                top, middle, bottom = alignment.visualize(ref=ref, query=query)
                print(top, middle, bottom, sep='\n')
                print(alignment.matching_subsegments(), ' vs ', read.cigar)

            self.assertEqual(
                alignment.cigar,
                read.cigar,
                f'{alignment.cigar} is not equal to cigar from sam file {read.cigar}'
            )
def test_on_data_big(self):
    """
    For every record of every "*.fa.gz" file under `data_root / "data"`,
    check the sequence length and the end of the sequence once the
    surrounding "N" characters are stripped.
    """
    fa_gz_files = [*(data_root / "data").glob("*.fa.gz")]
    # The test is meaningless without at least one data file.
    assert fa_gz_files

    expected_length = 51304566
    expected_tail = "CGGATT"

    for fa_gz in fa_gz_files:
        for genome in from_fasta(fa_gz):
            self.assertEqual(len(genome.seq), expected_length)
            self.assertTrue(genome.seq.strip("N").endswith(expected_tail))
def test_reads_well(self):
    """
    Write one FASTA record whose sequence spans two lines to a temporary
    file, read it back, and check that the description is intact and
    that the sequence lines come back concatenated.
    """
    desc = "Hello"
    seq1, seq2 = "ABC", "DEF"

    with NamedTemporaryFile(mode='w') as fn:
        fasta_lines = [">" + desc, seq1, seq2]
        # Flush so that the content is on disk before reading it by name.
        fn.write("\n".join(fasta_lines) + "\n")
        fn.flush()

        record = first(from_fasta(fn.name))

        self.assertEqual(record.desc, desc)
        self.assertEqual(record.seq, seq1 + seq2)
def test_data_small_vs_biopython(self):
    """
    Compare `from_fasta` with Biopython's `SeqIO.parse` on every "*.fa"
    file under `data_root / "data_small"`: same number of records, and
    the same sequence and description for each pair of records.
    """
    fasta_paths = list((data_root / "data_small").glob("*.fa"))
    # The test is meaningless without at least one data file.
    assert fasta_paths

    from Bio import SeqIO

    for fasta_path in fasta_paths:
        expected_records = list(SeqIO.parse(fasta_path, format='fasta'))
        actual_records = list(from_fasta(fasta_path))

        self.assertEqual(len(expected_records), len(actual_records))

        for (reference, candidate) in zip(expected_records, actual_records):
            self.assertIsInstance(reference, SeqIO.SeqRecord)
            self.assertIsInstance(candidate, Sequence)

            self.assertEqual(str(reference.seq), candidate.seq)
            self.assertEqual(reference.description, candidate.desc)
def from_files(cls, *, fa, fq1, fq2):
    """
    Set up an AllTheKingsHorses mapper from files and start a paired mapping.

    `fa` is the reference genome file; `fq1` and `fq2` are the FASTQ files.

    Returns a small class (used as a plain namespace) with two attributes:
    `headers`, the result of the mapper's `headers()`, and
    `alignments`, the result of its `map_paired(fq1, fq2)`.
    """
    mapper = AllTheKingsHorses(
        genome_index=GenomeIndex.read_or_make(path_to_genome=fa),
        sequence_aligner=SequenceAligner(),
        ref_genome=unlist1(from_fasta(fa)),
    ) if False else None

    # Build the collaborators in the same order as they are needed below.
    reference = unlist1(from_fasta(fa))
    genome_index = GenomeIndex.read_or_make(path_to_genome=fa)
    sequence_aligner = SequenceAligner()

    mapper = AllTheKingsHorses(
        genome_index=genome_index,
        sequence_aligner=sequence_aligner,
        ref_genome=reference,
    )

    # Evaluate the headers before starting the paired mapping.
    sam_headers = mapper.headers()
    paired_alignments = mapper.map_paired(fq1, fq2)

    class _:
        headers = sam_headers
        alignments = paired_alignments

    return _
def test_fails_when_many(self):
    """
    Reading a FASTA file that holds two records (">A" and ">B") is
    expected to raise an AssertionError.
    """
    two_records = [">A", "N", ">B", "N"]

    with NamedTemporaryFile(mode='w') as fn:
        # Flush so that the content is on disk before reading it by name.
        fn.write("\n".join(two_records) + "\n")
        fn.flush()

        with self.assertRaises(AssertionError):
            list(from_fasta(fn.name))
def read_or_make(cls, *, path_to_genome, ignored=None):
    """
    Create an instance of `cls` from the sequence in the genome FASTA file.

    Nothing is read from or written to an index file here.
    The `ignored` argument is accepted but not used.
    """
    records = list(from_fasta(path_to_genome))
    # NOTE(review): unlist1 presumably unwraps a one-element list -- confirm.
    genome = unlist1(records)
    return cls(genome.seq)