def test_write_pbcore_records(self):
    """write_pbcore_records should write the given records as plain FASTA.

    The output goes into a private temporary directory that is removed
    afterwards. Previously the test re-used the ``.name`` of a
    NamedTemporaryFile object that was discarded immediately, so the path
    was no longer reserved and the written file was never cleaned up.
    """
    import os
    import shutil

    records = [FastaRecord("chr1", "acgt"), FastaRecord("chr2", "tgca")]
    tmp_dir = tempfile.mkdtemp()
    try:
        tmp_fasta = os.path.join(tmp_dir, "records.fasta")
        write_pbcore_records(FastaWriter, records, tmp_fasta)
        with open(tmp_fasta) as fasta_in:
            lines = fasta_in.read().splitlines()
    finally:
        # Remove the directory and everything written into it
        shutil.rmtree(tmp_dir, ignore_errors=True)
    assert lines == [">chr1", "acgt", ">chr2", "tgca"]
def test_write_contigset_records(self):
    """write_contigset_records should produce a ContigSet with the records.

    The dataset is written inside a private temporary directory that is
    removed afterwards, so any file created next to the XML is cleaned up
    as well. Previously the test re-used the ``.name`` of a discarded
    NamedTemporaryFile, which no longer reserved the path and left the
    output behind.
    """
    import os
    import shutil

    records = [FastaRecord("chr1", "acgt"), FastaRecord("chr2", "tgca")]
    tmp_dir = tempfile.mkdtemp()
    try:
        # Keep the ".contigset.xml" suffix the original test used
        tmp_contigs = os.path.join(tmp_dir, "records.contigset.xml")
        write_contigset_records(FastaWriter, records, tmp_contigs)
        with ContigSet(tmp_contigs) as ds_in:
            rec2 = [(rec.id, rec.sequence) for rec in ds_in]
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
    assert rec2 == [("chr1", "acgt"), ("chr2", "tgca")]
def trimSequenceData(self, sequenceData, blasrHits):
    """Cut the Blasr hit region out of each sequence record.

    Records whose id has no entry in blasrHits are kept unchanged. For a
    record with a hit, the sequence before ``hit.qstart`` is kept as
    '<name>_5p' and the sequence from ``hit.qend`` onward as '<name>_3p',
    each only when that flank is longer than ``self.minLength``.

    Returns the list of kept and newly created records.
    """
    print("Trimming out vector sequence...")
    kept = []
    for recId, seqRecord in sequenceData.iteritems():
        try:
            vectorHit = blasrHits[recId]
        except KeyError:
            # No hit for this record: keep the sequence as-is
            kept.append(seqRecord)
        else:
            hitStart = int(vectorHit.qstart)
            hitEnd = int(vectorHit.qend)
            # Flank in front of the hit
            if hitStart > self.minLength:
                kept.append(FastaRecord(seqRecord.name + '_5p',
                                        seqRecord.sequence[:hitStart]))
            # Flank behind the hit
            if len(seqRecord.sequence) - hitEnd > self.minLength:
                kept.append(FastaRecord(seqRecord.name + '_3p',
                                        seqRecord.sequence[hitEnd:]))
    return kept
def _extract_from_bash5(bash5_file, min_length, max_length, min_score, min_snr, white_list):
    """
    Collect the subreads of a BasH5 or BaxH5 file that pass every filter

    A ZMW is skipped when a non-empty white_list does not contain its
    '<movieName>/<holeNumber>' id, when its readScore is below min_score,
    or when its lowest 'HQRegionSNR' value is below min_snr. Of the
    remaining ZMWs, subreads shorter than min_length or longer than
    max_length are dropped. Returns a list of FastaRecords.
    """
    base_name = os.path.basename(bash5_file)
    log.info("Extracting subreads from %s" % base_name)
    passing = []
    for zmw in BasH5Reader(bash5_file):
        zmw_id = '%s/%s' % (zmw.baxH5.movieName, zmw.holeNumber)
        # ZMW-level filters, evaluated in the same order as before
        if ((white_list and zmw_id not in white_list)
                or zmw.readScore < min_score
                or min(zmw.zmwMetric('HQRegionSNR')) < min_snr):
            continue
        # Subread-level length window
        passing.extend(
            FastaRecord(subread.readName, subread.basecalls())
            for subread in zmw.subreads
            if not (len(subread) < min_length or len(subread) > max_length)
        )
    log.info('Found %s subreads that passed filters' % len(passing))
    return passing
def reverse_complement(fasta_record):
    """
    Build a new FastaRecord with the same name whose sequence is the
    input sequence reversed and translated through COMPLEMENT
    """
    flipped = fasta_record.sequence[::-1].translate(COMPLEMENT)
    return FastaRecord(fasta_record.name, flipped)
def __getitem__(self, k):
    """Return the FastaRecord stored under key k.

    ``self.d`` maps each key to an offset in the open file ``self.f``.
    The file is positioned at that offset and lines are collected
    (stripped of surrounding whitespace) until a line starting with '>'
    or the end of the file is reached.

    NOTE(review): the stored offset presumably points just past the
    record's header line; if it pointed at the header itself the loop
    would stop at once and yield an empty sequence -- confirm against the
    code that fills ``self.d``.

    Raises:
        Exception: if k is not a key of ``self.d``.
    """
    if k not in self.d:
        # Call form instead of the old "raise Exception, msg" statement,
        # which is a syntax error outside Python 2
        raise Exception("key {0} not in dictionary!".format(k))
    self.f.seek(self.d[k])
    # Gather the pieces and join once rather than growing a string
    chunks = []
    for line in self.f:
        if line.startswith('>'):
            break
        chunks.append(line.strip())
    return FastaRecord(k, ''.join(chunks))
def _get_record(self, k):
    """Read the record for key k from one of the open file handles.

    ``self.d[k]`` holds a pair (index into ``self.fhandlers``, file
    offset). Lines are read from that offset, stripped and concatenated
    until a line starting with '>' or the end of the file.
    """
    handle_index, offset = self.d[k]
    handle = self.fhandlers[handle_index]
    handle.seek(offset)
    pieces = []
    for line in handle:
        if line.startswith('>'):
            break
        pieces.append(line.strip())
    return FastaRecord(header=k, sequence=''.join(pieces))
def trim_fasta_record(record, start, end):
    """Return a FastaRecord named like record holding sequence[start:end].

    Either bound may be None, which leaves that side of the slice open;
    with both None the whole sequence is used.
    """
    # A None bound in a slice means "open-ended", which covers every
    # start/end combination in one expression
    return FastaRecord(record.name, record.sequence[start:end])
def outputClusterFasta(self, reads, count):
    """Write the reads of a cluster to 'cluster<count>.fasta'.

    Each read is written as a FastaRecord built from its name and
    sequence. If the file already exists it is left untouched. Returns
    the file name in both cases.
    """
    fastaFile = 'cluster%s.fasta' % count
    if not os.path.exists(fastaFile):
        with FastaWriter(fastaFile) as writer:
            for read in reads:
                writer.writeRecord(FastaRecord(read.name, read.sequence))
    return fastaFile
def apply_trims(records, trims):
    """Apply per-sequence (start, end) trims to a series of records.

    The lookup key is the first whitespace-delimited token of the record
    name. A record with an entry in trims is replaced by a FastaRecord
    named with that token and holding sequence[start:end]; any other
    record is passed through unchanged. Returns a list in input order.
    """
    def _trim_one(record):
        short_name = record.name.split()[0]
        if short_name not in trims:
            return record
        start, end = trims[short_name]
        return FastaRecord(short_name, record.sequence[start:end])

    return [_trim_one(record) for record in records]
def __getitem__(self, args):
    """Slice the alignment by record and by sequence position.

    ``slice_2d(args)`` yields a record slice and a sequence slice. Every
    selected record is rebuilt as a FastaRecord with its sequence sliced;
    records whose sliced sequence contains at most one distinct character
    are dropped. The remainder is returned as a FastaAlignment.
    """
    rec_slice, seq_slice = slice_2d(args)
    kept = []
    for rec in self.records[rec_slice]:
        sliced = FastaRecord(rec.name, rec.sequence[seq_slice])
        if len(set(sliced.sequence)) > 1:
            kept.append(sliced)
    return FastaAlignment(kept)
def outputReferenceFasta(self, reference, count):
    """Write the reference of a cluster to 'cluster<count>_ref.fasta'.

    The record is named 'cluster<count>_reference', a tab, then the
    original reference name. If the file already exists it is left
    untouched. Returns the file name in both cases.
    """
    print("Creating reference sequence for Cluster #%s" % count)
    referenceFile = 'cluster%s_ref.fasta' % count
    reference_desc = 'cluster{0}_reference\t{1}'.format(count, reference.name)
    if not os.path.exists(referenceFile):
        with FastaWriter(referenceFile) as writer:
            writer.writeRecord(FastaRecord(reference_desc, reference.sequence))
    return referenceFile
def __getitem__(self, k):
    """Return the FastaRecord stored under key k.

    ``self.d`` maps keys to offsets in the open file ``self.f``. Lines
    are read from the stored offset, stripped and concatenated until a
    line starting with '>' or the end of the file.

    Raises:
        ValueError: if k is not a key of ``self.d``.
    """
    if k not in self.d:
        raise ValueError("key {k} not in {f}!".format(k=k, f=self.f.name))
    self.f.seek(self.d[k])
    pieces = []
    for line in self.f:
        if line.startswith('>'):
            break
        pieces.append(line.strip())
    return FastaRecord(name=k, sequence=''.join(pieces))
def get_temp_fasta_record(record):
    """
    Return record as a FastaRecord

    A FastaRecord is returned unchanged. Anything else is converted by
    building a FastaRecord from its stripped ``name`` and ``sequence``.

    Raises:
        TypeError: if the conversion fails for any reason (the error is
            logged first).
    """
    if isinstance(record, FastaRecord):
        return record
    try:
        return FastaRecord(record.name.strip(), record.sequence.strip())
    except Exception:
        # "except Exception" rather than a bare "except" so that
        # KeyboardInterrupt and SystemExit are not turned into TypeError
        msg = 'Unrecognized sequence record type'
        log.error(msg)
        raise TypeError(msg)
def reverseComplement(cls, record):
    """Reverse-complement a plain string, a FastaRecord or a FastqRecord.

    A str is reversed and translated through ``cls.DNA_TRANSLATOR``. For
    a record, a new record of the same type and name is returned with the
    sequence reverse-complemented; a FastqRecord also gets its quality
    reversed.

    Raises:
        ValueError: if record is none of the supported types.
    """
    if isinstance(record, str):
        flipped = record[::-1]
        return flipped.translate(cls.DNA_TRANSLATOR)
    if isinstance(record, FastaRecord):
        return FastaRecord(record.name, cls.reverseComplement(record.sequence))
    if isinstance(record, FastqRecord):
        return FastqRecord(record.name,
                           cls.reverseComplement(record.sequence),
                           record.quality[::-1])
    raise ValueError("Record must be either FASTA or FASTQ")
def _slice_record(record, slice):
    """
    Return a new record of the same type holding the sliced region

    The sequence is sliced first; a FastqRecord additionally gets its
    ``quality`` sliced the same way. Any other record type is logged and
    rejected with a TypeError.
    """
    sequence = record.sequence[slice]
    if isinstance(record, FastaRecord):
        return FastaRecord(record.name, sequence)
    if isinstance(record, FastqRecord):
        return FastqRecord(record.name, sequence, record.quality[slice])
    msg = 'Invalid sequence record type'
    log.error(msg)
    raise TypeError(msg)
def _extract_exon_record(record, exon_num, start, end):
    """
    Cut sequence[start:end] out of a record as an exon record

    The new record is named '<record name>_exon<exon_num>' and has the
    same type as the input; for a FastqRecord the matching stretch of
    ``qualityString`` is carried over. Other types are logged and
    rejected with a TypeError.
    """
    exon_name = '%s_exon%s' % (record.name, exon_num)
    exon_sequence = record.sequence[start:end]
    if not isinstance(record, (FastaRecord, FastqRecord)):
        msg = 'Record must be either FastaRecord or FastqRecord'
        log.error(msg)
        raise TypeError(msg)
    if isinstance(record, FastaRecord):
        return FastaRecord(exon_name, exon_sequence)
    return FastqRecord(exon_name, exon_sequence,
                       qualityString=record.qualityString[start:end])
def write_temp_fasta(record):
    """
    Write a sequence record to a temporary Fasta file

    A FastaRecord is written as-is; a FastqRecord is first reduced to a
    FastaRecord of its name and sequence. The record type is checked
    before the temporary file is created, so an unsupported record no
    longer leaves an orphaned file and open handle behind.

    Returns:
        The NamedTemporaryFile object (created with delete=False, so the
        file is not removed automatically when the object is closed).

    Raises:
        TypeError: if record is neither a FastaRecord nor a FastqRecord.
    """
    if isinstance(record, FastaRecord):
        fasta = record
    elif isinstance(record, FastqRecord):
        fasta = FastaRecord(record.name, record.sequence)
    else:
        msg = 'Sequence record must be either Fasta or Fastq'
        log.error(msg)
        raise TypeError(msg)
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    write_fasta(fasta, temp.name)
    return temp
def _write_temp_fasta(record):
    """
    Write a sequence record out to a temporary Fasta file

    A FastaRecord is written as-is; a FastqRecord is first reduced to a
    FastaRecord of its name and sequence. The record type is checked
    before the temporary file is created, so an unsupported record no
    longer leaves an orphaned file behind.

    Returns:
        The path of the temporary file. It was created with delete=False,
        so it is not removed automatically.

    Raises:
        TypeError: if record is neither a FastaRecord nor a FastqRecord.
    """
    if isinstance(record, FastaRecord):
        fasta_record = record
    elif isinstance(record, FastqRecord):
        fasta_record = FastaRecord(record.name, record.sequence)
    else:
        msg = 'Record must be either FastaRecord or FastqRecord'
        log.error(msg)
        raise TypeError(msg)
    temp = tempfile.NamedTemporaryFile(suffix='.fasta', delete=False)
    # Only the path is needed; close the handle instead of leaking it
    # (delete=False keeps the file on disk)
    temp.close()
    write_fasta([fasta_record], temp.name)
    return temp.name
def _combine_records(records):
    """
    Concatenate an ordered series of Exon records into one cDNA record

    The name is the first record's name with its last '_'-separated field
    removed. Sequences are concatenated in order, as are the
    ``qualityString`` values of the records that have one. A FastqRecord
    is returned when the combined quality is as long as the combined
    sequence, otherwise a FastaRecord.
    """
    name = '_'.join(records[0].name.split('_')[:-1])
    sequence_parts = []
    quality_parts = []
    for record in records:
        sequence_parts.append(record.sequence)
        if hasattr(record, 'qualityString'):
            quality_parts.append(record.qualityString)
    cDNA_sequence = ''.join(sequence_parts)
    cDNA_quality = ''.join(quality_parts)
    if len(cDNA_sequence) != len(cDNA_quality):
        return FastaRecord(name, cDNA_sequence)
    return FastqRecord(name, cDNA_sequence, qualityString=cDNA_quality)
def _extract_from_bash5(bash5_file, min_length, min_score):
    """
    Yield the longest qualifying subread of each ZMW in a BasH5/BaxH5 file

    ZMWs with a readScore below min_score are skipped. Of the remaining
    ZMWs, only subreads whose basecalls are longer than min_length are
    considered, and the longest one (the first, on ties) is yielded as a
    FastaRecord of its readName and basecalls.

    Once the generator is exhausted, the number of yielded subreads is
    logged. The previous version logged the length of a list that was
    never filled, so it always reported 0.
    """
    filename = os.path.basename(bash5_file)
    log.info("Extracting subreads from %s" % filename)
    found = 0
    for zmw in BasH5Reader(bash5_file):
        if zmw.readScore < min_score:
            continue
        long_subreads = [s for s in zmw.subreads
                         if len(s.basecalls()) > min_length]
        if not long_subreads:
            continue
        # max() returns the first of equally long subreads, the same pick
        # as a stable descending sort followed by [0]
        subread = max(long_subreads, key=lambda s: len(s.basecalls()))
        found += 1
        yield FastaRecord(subread.readName, subread.basecalls())
    log.info('Found %s subreads that passed filters' % found)
def _multislice_record(record, slices):
    """
    Cut several regions out of a Fasta or Fastq record and join them

    ``slices`` is a series of (name, slice) pairs; only the slice part is
    used. Each region is extracted with _slice_record and the pieces are
    concatenated in order into a new record of the input's type and name.
    Other record types are logged and rejected with a TypeError.
    """
    pieces = [_slice_record(record, region) for _label, region in slices]
    sequence = ''.join(piece.sequence for piece in pieces)
    if isinstance(record, FastaRecord):
        return FastaRecord(record.name, sequence)
    if isinstance(record, FastqRecord):
        quality_str = ''.join(piece.qualityString for piece in pieces)
        return FastqRecord(record.name, sequence, qualityString=quality_str)
    msg = 'Invalid sequence record type'
    log.error(msg)
    raise TypeError(msg)
def _trim_sequences(records, trim):
    """Trim `trim` bases from each end of each sequence.

    Returns a list of new records of the same type and name as the
    inputs. For a FastqRecord the quality values are trimmed the same
    way. A trim of 0 leaves the sequences unchanged.

    Raises:
        TypeError: if a record is neither a FastaRecord nor a FastqRecord.
    """
    # seq[trim:-trim] is seq[0:0] -- an empty sequence -- when trim is 0,
    # so use an open-ended slice in that case
    end = -trim if trim else None
    trimmed = []
    for record in records:
        if isinstance(record, FastaRecord):
            trimmed_record = FastaRecord(record.name,
                                         record.sequence[trim:end])
        elif isinstance(record, FastqRecord):
            trimmed_record = FastqRecord(record.name,
                                         record.sequence[trim:end],
                                         record.quality[trim:end])
        else:
            raise TypeError(
                "Only FastaRecord and FastqRecords support, not '%s'" % type(record))
        trimmed.append(trimmed_record)
    return trimmed
def reverse_complement(record):
    """
    Reverse complement a FastaRecord or FastqRecord

    The sequence is reversed and translated through COMPLEMENT; for a
    FastqRecord the ``qualityString`` is reversed as well. The result is
    a new record of the same type and name. Other types are logged and
    rejected with a TypeError.
    """
    rev_com_seq = record.sequence[::-1].translate(COMPLEMENT)
    if isinstance(record, FastaRecord):
        return FastaRecord(record.name, rev_com_seq)
    if isinstance(record, FastqRecord):
        return FastqRecord(record.name, rev_com_seq,
                           qualityString=record.qualityString[::-1])
    msg = 'Record must be either Fasta or Fastq'
    log.error(msg)
    raise TypeError(msg)
def rename_fasta(input_file, output_file, name_key):
    """
    Copy a Fasta file, renaming each sequence through a name key

    The key is loaded with read_dict_file and looked up by the first
    whitespace-delimited token of each record name. An unknown name is
    logged and raises KeyError. The output is passed to
    check_output_file and its path is returned.
    """
    name_map = read_dict_file(name_key)
    with FastaWriter(output_file) as writer:
        for record in FastaReader(input_file):
            current_name = record.name.split()[0]
            try:
                replacement = name_map[current_name]
            except KeyError:
                msg = "Sequence name not found!"
                log.error(msg)
                raise KeyError(msg)
            writer.writeRecord(FastaRecord(replacement, record.sequence))
    check_output_file(output_file)
    return output_file
def __getitem__(self, k):
    """
    Look up records by <movie>/<zmw> or <movie>/<zmw>/<start_end>

    A key with exactly two '/' is treated as a subread and looked up in
    ``self.d``; any other key is treated as a ZMW and looked up in
    ``self.zmw_d``. Either way a list of FastaRecords is returned: every
    record of the ZMW, or the single subread. An unknown key raises
    ValueError.
    """
    is_subread = k.count('/') == 2
    index = self.d if is_subread else self.zmw_d
    if k not in index:
        raise ValueError("key {0} not in dictionary!".format(k))
    # A subread entry is a single (seqid, offset) pair; a ZMW entry is
    # already a collection of such pairs
    locations = [index[k]] if is_subread else index[k]
    output = []
    for seqid, offset in locations:
        self.f.seek(offset)
        pieces = []
        for line in self.f:
            if line.startswith('>'):
                break
            pieces.append(line.strip())
        output.append(FastaRecord(name=seqid, sequence=''.join(pieces)))
    return output
def createUnalignedRecord(cls, seqParts, zmw):
    """Build a FastaRecord named zmw from the string parts of seqParts.

    Only elements whose type is exactly ``str`` are used; they are
    concatenated in order and everything else is ignored.
    """
    unalignedSequence = ''.join(
        part for part in seqParts if type(part) is str)
    return FastaRecord(zmw, unalignedSequence)
def convertFastqToFasta(cls, fastqRecord):
    """Return a FastaRecord with the name and sequence of fastqRecord."""
    name = fastqRecord.name
    sequence = fastqRecord.sequence
    return FastaRecord(name, sequence)
def _extract_fasta_region(records, region):
    """Yield sequence[start:end] of every record as a new FastaRecord.

    ``region`` is a (name, start, end) triple. Its three values are
    printed, space-separated, when iteration starts; the name is not
    otherwise used. Each yielded record keeps its input's name.
    """
    name, start, end = region
    print("%s %s %s" % (name, start, end))
    window = slice(start, end)
    for record in records:
        yield FastaRecord(record.name, record.sequence[window])