def testSaveToFileDescriptor(self):
    """
    When C{save} is given a file-like object instead of a string
    filename, a Reads instance must write its output to that object.
    """
    reads = Reads()
    for read in (Read('id1', 'AT'), Read('id2', 'AC')):
        reads.add(read)

    output = StringIO()
    reads.save(output)

    expected = '>id1\nAT\n>id2\nAC\n'
    self.assertEqual(expected, output.getvalue())
def testSaveAsFASTA(self):
    """
    Saving a Reads instance with the 'fasta' format must write each read
    as a FASTA record.
    """
    first = Read('id1', 'AT')
    second = Read('id2', 'AC')
    reads = Reads()
    reads.add(first)
    reads.add(second)

    mockOpener = mockOpen()
    with patch('__builtin__.open', mockOpener, create=True):
        reads.save('filename', 'fasta')

    # Calling the opener mock returns the file handle the save wrote to.
    expectedWrites = [call('>id1\nAT\n'), call('>id2\nAC\n')]
    self.assertEqual(expectedWrites, mockOpener().write.call_args_list)
def testSaveWithUppercaseFormat(self):
    """
    Passing the format name in upper case ('FASTA') must not stop a Reads
    instance from saving correctly.
    """
    reads = Reads()
    for read in (Read('id1', 'AT'), Read('id2', 'AC')):
        reads.add(read)

    mockOpener = mockOpen()
    with patch('__builtin__.open', mockOpener, create=True):
        reads.save('filename', 'FASTA')

    writes = mockOpener().write.call_args_list
    self.assertEqual([call('>id1\nAT\n'), call('>id2\nAC\n')], writes)
def add(self, pathogenName, sampleName):
    """
    Register a (pathogen name, sample name) pair and return the name of
    the FASTA/FASTQ file holding its reads.

    The file is written only the first time a pair is seen; afterwards the
    remembered file name is returned. When the file is written, the value
    returned by the save is stored as the 'uniqueReadCount' for the pair
    in C{self._proteinGrouper}.

    @param pathogenName: A C{str} pathogen name.
    @param sampleName: A C{str} sample name.
    @return: A C{str} giving the FASTA/FASTQ file name holding all the
        reads (without duplicates, by id) from the sample that matched the
        proteins in the given pathogen.
    """
    pathogenIndex = self._pathogens.setdefault(pathogenName,
                                               len(self._pathogens))
    sampleIndex = self._samples.setdefault(sampleName, len(self._samples))
    key = (pathogenIndex, sampleIndex)

    try:
        return self._readsFilenames[key]
    except KeyError:
        # First time we see this combination: fall through and write
        # the file below.
        pass

    reads = Reads()
    sampleInfo = self._proteinGrouper.pathogenNames[pathogenName][sampleName]
    for proteinMatch in sampleInfo['proteins'].values():
        for read in self._readsClass(proteinMatch['readsFilename']):
            reads.add(read)

    # The output directory is taken from the last protein match iterated.
    outDir = proteinMatch['outDir']
    basename = 'pathogen-%d-sample-%d.%s' % (
        pathogenIndex, sampleIndex, self._format)
    saveFilename = join(outDir, basename)

    reads.filter(removeDuplicatesById=True)
    nReads = reads.save(saveFilename, format_=self._format)

    # Save the unique read count into self._proteinGrouper
    self._proteinGrouper.pathogenNames[pathogenName][sampleName][
        'uniqueReadCount'] = nReads
    self._readsFilenames[key] = saveFilename
    return saveFilename
def _writeFASTA(self, i, image):
    """
    Save, as FASTA, the reads from the alignments of an image's title.

    The file is written to C{self._outputDir} and is named after the
    image number.

    @param i: The number of the image in self._images.
    @param image: A member of self._images.
    @return: A C{dark.reads.Reads} instance holding the reads for the
        image title.
    """
    reads = Reads()
    for titleAlignment in self._titlesAlignments[image['title']]:
        reads.add(titleAlignment.read)

    reads.save('%s/%d.fasta' % (self._outputDir, i), 'fasta')
    return reads
def add(self, pathogenName, sampleName):
    """
    Look up (or create) the FASTA/FASTQ reads file for a (pathogen name,
    sample name) combination.

    If the combination has been added before, the previously stored file
    name is returned. Otherwise the reads of every matching protein are
    collected, filtered with C{removeDuplicatesById=True}, saved, and the
    count returned by the save is recorded as 'uniqueReadCount' in
    C{self._proteinGrouper}.

    @param pathogenName: A C{str} pathogen name.
    @param sampleName: A C{str} sample name.
    @return: A C{str} giving the FASTA/FASTQ file name holding all the
        reads (without duplicates, by id) from the sample that matched the
        proteins in the given pathogen.
    """
    nPathogens = len(self._pathogens)
    pathogenIndex = self._pathogens.setdefault(pathogenName, nPathogens)
    nSamples = len(self._samples)
    sampleIndex = self._samples.setdefault(sampleName, nSamples)
    indices = (pathogenIndex, sampleIndex)

    try:
        return self._readsFilenames[indices]
    except KeyError:
        # Unseen combination: its file is written below.
        pass

    collected = Reads()
    proteins = self._proteinGrouper.pathogenNames[
        pathogenName][sampleName]['proteins']
    for proteinMatch in proteins.values():
        for read in self._readsClass(proteinMatch['readsFilename']):
            collected.add(read)

    # Uses the 'outDir' of whichever protein match was iterated last.
    saveFilename = join(
        proteinMatch['outDir'],
        'pathogen-%d-sample-%d.%s' % (indices + (self._format,)))

    collected.filter(removeDuplicatesById=True)
    uniqueCount = collected.save(saveFilename, format_=self._format)

    # Save the unique read count into self._proteinGrouper
    sampleEntry = self._proteinGrouper.pathogenNames[pathogenName][sampleName]
    sampleEntry['uniqueReadCount'] = uniqueCount
    self._readsFilenames[indices] = saveFilename
    return saveFilename
if args.alignmentFile: args.align = True if args.align: len1, len2 = map(len, reads) if len1 == len2: print('Pre-alignment, sequence lengths were identical: %s' % len1) else: print('Pre-alignment, sequence lengths: %d, %d (difference %d)' % (len1, len2, abs(len1 - len2))) # Align. reads = needle(reads) if args.alignmentFile: assert reads.save(args.alignmentFile) == 2 offsets = (parseRangeString(args.sites, convertToZeroBased=True) if args.sites else None) read1, read2 = reads len1, len2 = map(len, reads) identicalLengths = len1 == len2 # Sanity check. if args.align: assert identicalLengths match = compareDNAReads(read1, read2, matchAmbiguous=(not args.strict),
# Asking for the alignment to be saved implies that we must align.
if args.alignmentFile:
    args.align = True

if args.align:
    len1, len2 = map(len, reads)
    if len1 == len2:
        print('Pre-alignment, sequence lengths were identical: %s' % len1)
    else:
        print('Pre-alignment, sequence lengths: %d, %d (difference %d)' % (
            len1, len2, abs(len1 - len2)))

    # Align.
    reads = needle(reads)

    if args.alignmentFile:
        # Do the save outside the assert. Assert statements are removed
        # when Python runs with -O, which would silently skip writing the
        # alignment file if the save call were part of the assertion.
        nSaved = reads.save(args.alignmentFile)
        assert nSaved == 2

# Convert the (optional) sites specification into zero-based offsets.
offsets = (parseRangeString(args.sites, convertToZeroBased=True)
           if args.sites else None)

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check.
if args.align:
    assert identicalLengths

match = compareDNAReads(read1, read2, matchAmbiguous=(not args.strict),
                        offsets=offsets)
def makeBAM(template, bamReferences=None, fastaReferences=None):
    """
    A context manager decorator to make a simple BAM file from a template.
    Note that this code invokes samtools.

    NOTE(review): this is a generator function; it is presumably wrapped by
    a context manager decorator outside this block - confirm.

    @param template: A sequence (it is indexed and its length is taken) of
        C{str} sequences. The first will be treated as the reference, and
        then subsequent pairs (if any) will be treated as read and quality
        strings. Reads and quality strings can be indented with spaces to
        show where the read aligns with the reference.
    @param bamReferences: If not C{None}, an indexable collection of
        references to list in the SAM header. The first one must have the
        same sequence as the template reference (with gaps removed). If
        C{None}, a single reference with id C{REF_ID} is made from the
        template.
    @param fastaReferences: If not C{None}, the references to write to the
        FASTA file. If C{None}, the BAM references are written.
    @raise ValueError: If the template has an even number of strings.
    @return: A context manager that produces a 2-tuple containing the
        C{Path} of the FASTA references file and the C{Path} of the BAM
        file. Both live in a temporary directory that is removed when the
        context exits.
    """
    if len(template) % 2 != 1:
        raise ValueError(
            'The template must have an odd number of strings, specifying the '
            'reference sequence, then zero or more read/quality pairs.')

    leftPaddedReference = template[0]
    templateSequence = leftPaddedReference.lstrip().replace('-', '')

    if bamReferences is None:
        matchedReference = DNARead(REF_ID, templateSequence)
        bamReferences = Reads([matchedReference])
    else:
        matchedReference = bamReferences[0]
        # Sanity check: The first BAM reference must have the same sequence
        # as the template.
        assert matchedReference.sequence == templateSequence
        bamReferences = Reads(bamReferences)

    fastaReferences = Reads(
        bamReferences if fastaReferences is None else fastaReferences)

    # Everything after the reference comes in (read, quality) pairs.
    nSeqs = (len(template) - 1) // 2

    dirname = mkdtemp(prefix='test-consensus-')
    e = Executor()

    try:
        fastaFile = Path(dirname) / 'references.fasta'
        samFile = Path(dirname) / 'file.sam'
        bamFile = Path(dirname) / 'file.bam'

        fastaReferences.save(fastaFile)

        with open(samFile, 'w') as fp:
            # SAM header: one @SQ line per reference.
            for reference in bamReferences:
                print(f'@SQ\tSN:{reference.id}\tLN:{len(reference)}', file=fp)

            for count in range(nSeqs):
                leftPaddedQuery = template[count * 2 + 1].rstrip()
                leftPaddedQuality = template[count * 2 + 2].rstrip()
                assert len(leftPaddedQuery) == len(leftPaddedQuality)
                query = leftPaddedQuery.lstrip()
                quality = leftPaddedQuality.lstrip()

                # Drop gap positions (and their qualities) from the query.
                ungapped = [(queryBase, qualityBase)
                            for queryBase, qualityBase in zip(query, quality)
                            if queryBase != '-']
                queryNoGaps = ''.join(base for base, _ in ungapped)
                qualityNoGaps = ''.join(qual for _, qual in ungapped)

                fields = (
                    f'read{count}',  # QNAME (query name)
                    0,  # FLAGS
                    matchedReference.id,  # RNAME (reference name)
                    # POS (the match offset, plus one).
                    matchOffset(leftPaddedReference, leftPaddedQuery) + 1,
                    30,  # MAPQ (mapping quality)
                    makeCigar(leftPaddedReference, leftPaddedQuery),  # CIGAR
                    '*',  # MRNM (mate reference name)
                    0,  # MPOS (mate position)
                    0,  # ISIZE (insert size)
                    queryNoGaps,  # SEQ
                    qualityNoGaps,  # QUAL
                )
                print('\t'.join(map(str, fields)), file=fp)

        # The SAM file is closed at this point, so samtools can read it.
        e.execute(f'samtools sort -O BAM --write-index -o {str(bamFile)!r} '
                  f'{str(samFile)!r}')

        yield (fastaFile, bamFile)
    finally:
        # Always remove the temporary directory, even if something failed.
        e.execute(f'rm -fr {dirname!r}')