def column_filter(seq_iter, character_list=['?','-'], max_frequency=1.0):
    """
    Yield a copy of every sequence with selected columns removed.

    `seqstats.column_frequencies` is asked for one value per column
    together with the sequences to iterate over. A column is kept only
    when its value is strictly below `max_frequency`; with the default of
    1.0 only columns whose value reaches 1.0 are dropped.

    NOTE(review): the per-column value is presumably the fraction of
    sequences whose character at that position is in `character_list`
    -- confirm against `seqstats.column_frequencies`.

    `character_list` is handed to `seqstats` untouched; this function
    never modifies it. Each yielded object comes from
    `sequtils.copy_seq_metadata` with the filtered string as `new_seq`.
    """
    frequencies, records = seqstats.column_frequencies(
        seq_iter, character_list=character_list)

    # Boolean mask, positionally paired with the characters of each
    # sequence by itertools.compress below.
    keep_mask = [freq < max_frequency for freq in frequencies]

    for record in records:
        kept_chars = itertools.compress(str(record.seq), keep_mask)
        filtered = ''.join(kept_chars)
        yield sequtils.copy_seq_metadata(record, new_seq=filtered)
def align_pair(seq_record1, seq_record2, tools=['mafft', 'muscle']):
    """
    Align two `SeqRecord` objects and return the aligned pair.

    `tools` is a prioritized list of external alignment programs and is
    passed to `get_aligner`. With the default `['mafft', 'muscle']`,
    mafft is used when its executable is found in PATH, otherwise muscle
    is tried. When no listed program is available (or `tools` is empty
    or None) the much slower built-in `global_align` function is used
    instead, and a warning is logged if tools had been requested.

    Returns a 2-tuple: the aligned counterpart of `seq_record1` followed
    by the aligned counterpart of `seq_record2`.
    """
    aligner = get_aligner(tools=tools)

    if aligner:
        aligned = list(aligner.align([seq_record1, seq_record2]))
        assert len(aligned) == 2
        # Look results up by id so the return order follows the argument
        # order rather than whatever order the aligner emitted.
        # NOTE: if both inputs share an id the mapping holds one entry
        # and the same aligned record is returned twice.
        by_id = {rec.id: rec for rec in aligned}
        return by_id[seq_record1.id], by_id[seq_record2.id]

    # No external aligner available: fall back to the built-in function.
    # The warning is skipped when the caller asked for no tools at all.
    if tools:
        _LOG.warning('WARNING: external alignment tools not found; '
                     'using (slow) built-in alignment function.')
    aligned1, aligned2 = global_align(seq_record1, seq_record2)
    return (sequtils.copy_seq_metadata(seq_record1, aligned1),
            sequtils.copy_seq_metadata(seq_record2, aligned2))
def longest_reading_frames(seq_iter, gap_characters=['-'], table=1,
                           allow_partial=True,
                           require_start_after_stop=True):
    """
    Yield one record per input sequence holding its longest reading frame.

    Every sequence is first passed through `remove_gaps` using
    `gap_characters`. `sequtils.get_longest_reading_frames` is then
    called on the result with `table`, `allow_partial` and
    `require_start_after_stop` forwarded unchanged. If it returns
    anything truthy, the first element is yielded; otherwise a copy of
    the ungapped sequence's metadata with an empty sequence is yielded,
    so the output has one item per input.

    NOTE(review): `table` is presumably a translation-table identifier
    -- confirm against `sequtils.get_longest_reading_frames`.
    """
    for ungapped in remove_gaps(seq_iter, gap_characters=gap_characters):
        frames = sequtils.get_longest_reading_frames(
            ungapped,
            table=table,
            allow_partial=allow_partial,
            require_start_after_stop=require_start_after_stop)
        if not frames:
            # Nothing found: keep the record's place with an empty sequence.
            yield sequtils.copy_seq_metadata(ungapped, '')
            continue
        yield frames[0]
def align_pair(seq_record1, seq_record2, tools=['mafft', 'muscle']):
    """
    Return aligned copies of the `SeqRecord`s `seq_record1` and
    `seq_record2`, in that order.

    `tools` lists external alignment programs in order of preference and
    is forwarded to `get_aligner`. For the default `['mafft', 'muscle']`
    mafft is used if its executable is found in PATH; failing that,
    muscle is tried. If none of the listed programs can be found (or
    `tools` is an empty list or None), the much slower built-in
    `global_align` function performs the alignment.
    """
    aligner = get_aligner(tools=tools)

    if not aligner:
        # Only warn when the caller actually requested external tools.
        if tools:
            _LOG.warning('WARNING: external alignment tools not found; '
                         'using (slow) built-in alignment function.')
        gapped1, gapped2 = global_align(seq_record1, seq_record2)
        copy1 = sequtils.copy_seq_metadata(seq_record1, gapped1)
        copy2 = sequtils.copy_seq_metadata(seq_record2, gapped2)
        return copy1, copy2

    results = list(aligner.align([seq_record1, seq_record2]))
    assert len(results) == 2

    # Index the aligner output by record id; the aligner's output order
    # is not relied upon. Records sharing an id overwrite one another.
    by_id = {}
    for rec in results:
        by_id[rec.id] = rec
    return by_id[seq_record1.id], by_id[seq_record2.id]
def seq_mod(seq_iter, from_chars='', to_chars='', del_chars=''):
    '''
    Translate and/or delete characters in every sequence of `seq_iter`.

    The i-th character of `from_chars` is replaced by the i-th character
    of `to_chars`, and every character listed in `del_chars` is removed.
    Each result is yielded as a copy made by
    `sequtils.copy_seq_metadata` with the modified string as `new_seq`.

    Raises ValueError when `from_chars` and `to_chars` differ in length.
    Because this is a generator, that check runs when iteration starts,
    not when `seq_mod` is called.

    NOTE(review): the bare `maketrans` and the two-argument
    `translate(table, del_chars)` call are the Python 2 string APIs --
    confirm the target interpreter.
    '''
    if len(from_chars) != len(to_chars):
        raise ValueError('from and to characters must have same length')

    # A table of None means "no substitution, deletion only".
    table = maketrans(from_chars, to_chars) if len(from_chars) > 0 else None

    for record in seq_iter:
        modified = str(record.seq).translate(table, del_chars)
        yield sequtils.copy_seq_metadata(record, new_seq=modified)
def longest_reading_frames(seq_iter, gap_characters=['-'], table=1,
                           allow_partial=True,
                           require_start_after_stop=True):
    """
    For each sequence in `seq_iter`, yield its longest reading frame.

    Gaps are stripped first via `remove_gaps(seq_iter, gap_characters=...)`.
    The remaining keyword arguments are forwarded verbatim to
    `sequtils.get_longest_reading_frames`. The first element of its
    result is yielded when the result is non-empty; when it is empty, a
    metadata copy of the ungapped sequence carrying an empty sequence
    string is yielded in its place.
    """
    finder_options = dict(
        table=table,
        allow_partial=allow_partial,
        require_start_after_stop=require_start_after_stop)

    for stripped in remove_gaps(seq_iter, gap_characters=gap_characters):
        candidates = sequtils.get_longest_reading_frames(
            stripped, **finder_options)
        # Fall back to an empty-sequence placeholder so that output
        # stays one-to-one with input.
        yield (candidates[0] if candidates
               else sequtils.copy_seq_metadata(stripped, ''))
def seq_mod(seq_iter, from_chars='', to_chars='', del_chars=''):
    '''
    Modify sequences by character substitution and deletion.

    For each sequence in `seq_iter`, characters found in `from_chars`
    are mapped to the character at the same position in `to_chars`, and
    characters found in `del_chars` are dropped. A copy produced by
    `sequtils.copy_seq_metadata` (with the modified string passed as
    `new_seq`) is yielded for every input sequence.

    A ValueError is raised if `from_chars` and `to_chars` have different
    lengths; as this is a generator the error surfaces on the first
    iteration step.

    NOTE(review): relies on a bare `maketrans` and on
    `translate(table, deletechars)`, i.e. the Python 2 `str` API --
    confirm the interpreter this module targets.
    '''
    if len(from_chars) != len(to_chars):
        raise ValueError('from and to characters must have same length')

    if len(from_chars) > 0:
        table = maketrans(from_chars, to_chars)
    else:
        # No substitutions requested; translate() then only deletes.
        table = None

    for record in seq_iter:
        text = str(record.seq)
        yield sequtils.copy_seq_metadata(
            record, new_seq=text.translate(table, del_chars))
def constant_column_filter(seq_iter):
    """
    Yield a copy of every sequence restricted to the columns selected by
    `seqstats.variable_columns`.

    `seqstats.variable_columns(seq_iter)` returns a per-position selector
    sequence and the sequences to iterate over. Characters whose selector
    is truthy are kept, all others are dropped, and the result is passed
    as `new_seq` to `sequtils.copy_seq_metadata`.

    NOTE(review): going by the names, the selectors presumably flag
    columns that vary between sequences, so constant columns are removed
    -- confirm against `seqstats.variable_columns`.
    """
    keep_mask, records = seqstats.variable_columns(seq_iter)
    for record in records:
        kept_chars = itertools.compress(str(record.seq), keep_mask)
        yield sequtils.copy_seq_metadata(record, new_seq=''.join(kept_chars))
def dice(seq_iter, slices_to_keep):
    """
    Yield a copy of each sequence containing only the requested slices.

    `slices_to_keep` is an iterable of `(left, right)` pairs. For every
    sequence, `seq.seq[left:right]` is taken for each pair in order and
    the pieces are concatenated; the joined string is passed as
    `new_seq` to `sequtils.copy_seq_metadata`.

    `slices_to_keep` is iterated once per sequence, so it must be
    re-iterable (e.g. a list) for all sequences to be sliced alike.
    """
    for record in seq_iter:
        pieces = [str(record.seq[left:right])
                  for left, right in slices_to_keep]
        yield sequtils.copy_seq_metadata(record, new_seq=''.join(pieces))
def test_seq_record(self):
    # Passing a SeqRecord as the new sequence: the result must carry the
    # metadata of self.seq, the sequence of the replacement, and be a
    # distinct object from self.seq.
    replacement = SeqRecord(Seq('AGCT'), id='1')
    copied = sequtils.copy_seq_metadata(self.seq, replacement)

    self.assertSameMetadata(self.seq, copied)
    self.assertEqual(str(copied.seq), 'AGCT')
    # Identity check: a new object must have been returned.
    self.assertFalse(copied is self.seq)
def test_string(self):
    # Passing a plain string as the new sequence: metadata comes from
    # self.seq, the sequence text from the string.
    new_sequence = 'AGCT'
    copied = sequtils.copy_seq_metadata(self.seq, new_sequence)

    self.assertSameMetadata(self.seq, copied)
    self.assertEqual(str(copied.seq), 'AGCT')
    # Identity check: a new object must have been returned.
    self.assertFalse(copied is self.seq)
def test_empty_seq(self):
    # Omitting the new sequence: the copy keeps the metadata of self.seq
    # and its sequence renders as the empty string.
    copied = sequtils.copy_seq_metadata(self.seq)

    self.assertSameMetadata(self.seq, copied)
    self.assertEqual(str(copied.seq), '')
    # Identity check: a new object must have been returned.
    self.assertFalse(copied is self.seq)
def test_seq_record(self):
    # Passing a SeqRecord as the new sequence: the result must carry the
    # metadata of self.seq and the sequence of the replacement record.
    s = sequtils.copy_seq_metadata(self.seq,
                                   SeqRecord(Seq('AGCT'), id='1'))
    self.assertSameMetadata(self.seq, s)
    self.assertEqual(str(s.seq), 'AGCT')
    # The point of this check is that a *new object* was returned, so
    # compare identity. assertNotEqual would invoke rich comparison,
    # which Biopython's SeqRecord deliberately does not implement.
    self.assertFalse(s is self.seq)
def test_string(self):
    # Passing a plain string as the new sequence: metadata comes from
    # self.seq, the sequence text from the string.
    s = sequtils.copy_seq_metadata(self.seq, 'AGCT')
    self.assertSameMetadata(self.seq, s)
    self.assertEqual(str(s.seq), 'AGCT')
    # The point of this check is that a *new object* was returned, so
    # compare identity. assertNotEqual would invoke rich comparison,
    # which Biopython's SeqRecord deliberately does not implement.
    self.assertFalse(s is self.seq)
def test_empty_seq(self):
    # Omitting the new sequence: the copy keeps the metadata of self.seq
    # and its sequence renders as the empty string.
    s = sequtils.copy_seq_metadata(self.seq)
    self.assertSameMetadata(self.seq, s)
    self.assertEqual(str(s.seq), '')
    # The point of this check is that a *new object* was returned, so
    # compare identity. assertNotEqual would invoke rich comparison,
    # which Biopython's SeqRecord deliberately does not implement.
    self.assertFalse(s is self.seq)
def test_seq(self):
    # Passing a bare Seq as the new sequence: metadata comes from
    # self.seq, the sequence content from the Seq object.
    replacement = Seq('AGCT')
    copied = sequtils.copy_seq_metadata(self.seq, replacement)

    self.assertSameMetadata(self.seq, copied)
    self.assertEqual(str(copied.seq), 'AGCT')
    # Identity check: a new object must have been returned.
    self.assertFalse(copied is self.seq)