def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits a sequence iterator with alternating paired reads in two.

    The reads are taken from the iterator two at a time: the first one of
    every couple is written to out_fhand1 and the second one to out_fhand2,
    both in out_format.

    It will fail if forward and reverse reads are not alternating.
    It raises an InterleaveError if the number of sequences is odd.
    Both output file handles are flushed once the input is consumed.
    '''
    while True:
        # The builtin next() with a default value works with any iterator in
        # python 2.6+ and python 3, the .next() method is python 2 only
        seq1 = next(seqs, None)
        seq2 = next(seqs, None)
        if seq1 is None:
            break   # we have consumed the input iterator completely
        if seq2 is None:
            msg = 'The file had an odd number of sequences'
            raise InterleaveError(msg)
        _check_name_and_direction_match(seq1, seq2)
        write_seqrecords([seq1], out_fhand1, out_format)
        write_seqrecords([seq2], out_fhand2, out_format)
    out_fhand1.flush()
    out_fhand2.flush()
def test_write_empy_seq(self):
    'It checks that None items and empty sequences are not written'
    out_fhand = StringIO()
    valid_seq = SeqRecord(Seq('ACTG'), id='seq1')
    empty_seq = SeqRecord(Seq(''), id='seq2')
    # Only the record with residues is expected in the output
    to_write = [valid_seq, None, empty_seq]
    write_seqrecords(to_write, out_fhand, file_format='fasta')
    out_fhand.flush()
    expected = '>seq1\nACTG\n'
    assert out_fhand.getvalue() == expected
def match_pairs_unordered(seq_fpath, out_fhand, orphan_out_fhand, out_format):
    '''It matches the seq pairs of a sequence file and splits the orphans.

    The file is indexed, its titles are classified as paired or orphan and
    the sequences of each group are written, in out_format, to out_fhand and
    to orphan_out_fhand respectively.
    '''
    seq_index = _index_seq_file(seq_fpath)
    paired_titles, orphan_titles = _get_paired_and_orphan(seq_index)

    # The paired seqs are written first and the orphans afterwards
    destinations = ((paired_titles, out_fhand),
                    (orphan_titles, orphan_out_fhand))
    for titles, fhand in destinations:
        seqs = (seq_index[title] for title in titles)
        write_seqrecords(seqs, fhand, out_format)
def __call__(self, seqs):
    '''It splits the given sequences using the matches with the linkers.

    The sequences are written to a fasta file that is given, together with
    the linkers, to a BlastMatcher. The reads with matched segments are
    split by _split_by_mate_linker, the rest are returned as they are.
    It returns a list with the resulting sequences.
    '''
    counters = self._stats
    counters[PROCESSED_PACKETS] += 1

    fasta_fhand = write_seqrecords(seqs, file_format='fasta')
    fasta_fhand.flush()

    # Thresholds used to filter the blast matches
    min_len = 17
    min_identity = 87.0
    length_filter = {'kind': 'min_length',
                     'min_num_residues': min_len,
                     'length_in_query': False,
                     'filter_match_parts': True}
    identity_filter = {'kind': 'score_threshold',
                       'score_key': 'identity',
                       'min_score': min_identity}

    matcher = BlastMatcher(fasta_fhand.name, self.linkers,
                           program='blastn',
                           filters=[length_filter, identity_filter],
                           params={'task': 'blastn-short'},
                           elongate_for_global=True)

    result = []
    for seqrec in seqs:
        counters[PROCESSED_SEQS] += 1
        segments = matcher.get_matched_segments_for_read(seqrec.id)
        if segments is None:
            # No match with the linkers, the read is kept untouched
            pieces = [seqrec]
        else:
            pieces = self._split_by_mate_linker(seqrec, segments)
        for piece in pieces:
            result.append(piece)
            counters[YIELDED_SEQS] += 1
    return result
def _run_estscan(seqrecords, pep_out_fpath, dna_out_fpath, matrix_fpath):
    '''It runs estscan in the input seqs.

    The seqrecords are written to a fasta file that is given to the estscan
    binary as input. pep_out_fpath is passed to estscan with -t,
    dna_out_fpath with -o and matrix_fpath with -M.

    The fasta file handle is closed once estscan is done, also when running
    or checking the process fails.
    '''
    seq_fhand = write_seqrecords(seqrecords, file_format='fasta')
    try:
        seq_fhand.flush()
        binary = get_binary_path('estscan')
        cmd = [binary, '-t', pep_out_fpath, '-o', dna_out_fpath,
               '-M', matrix_fpath, seq_fhand.name]
        process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        check_process_finishes(process, binary=cmd[0])
    finally:
        # Without the finally the handle would be leaked when estscan fails
        seq_fhand.close()
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=DEFAULT_SEQS_IN_MEM_LIMIT):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The reads without a mate seen so far are kept in two buffers, one per
    direction. When the mate of a buffered read arrives both reads are
    written to out_fhand, and the reads buffered before the matched one,
    together with the buffered reads of the other direction, are written to
    orphan_out_fhand. The reads still buffered when the input ends are
    written as orphans too.

    It raises a MaxNumReadsInMem if memory_limit is not None and the number
    of buffered reads reaches it.
    Both output file handles are flushed at the end.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    # They are bound here to be able to write the final orphans when the
    # input has no seqs at all
    buf1, buf2 = buf_rev, buf_fwd
    for seq in seqs:
        seq_name, direction = _parse_pair_direction_and_name(seq)
        # buf1 holds the reads that could be the mate of this seq,
        # buf2 the reads with the same direction than this seq
        if direction == FWD:
            buf1 = buf_rev
            buf2 = buf_fwd
        else:
            buf1 = buf_fwd
            buf2 = buf_rev

        matching_seq_index = buf1['index'].get(seq_name)

        if matching_seq_index is None:
            # add to buff
            buf2['items'].append(seq)
            buf2['index'][seq_name] = len(buf2['items']) - 1
            # check mem limit, there is no need to build a new list for that
            sum_items = len(buf1['items']) + len(buf2['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit'
                raise MaxNumReadsInMem(error_msg)
        else:
            # write seqs from buffer1
            orphan_seqs = buf1['items'][:matching_seq_index]
            matching_seq = buf1['items'][matching_seq_index]
            write_seqrecords(orphan_seqs, orphan_out_fhand, out_format)
            write_seqrecords([matching_seq, seq], out_fhand, out_format)
            # fix buffers 1
            remaining = buf1['items'][matching_seq_index + 1:]
            buf1['items'] = remaining
            # The index has to be keyed by the seq name, that is the key
            # used by the lookups, otherwise the remaining reads would never
            # be matched
            buf1['index'] = {_parse_pair_direction_and_name(item)[0]: idx
                             for idx, item in enumerate(remaining)}
            # writes seqs from buffer 2 and fix buffer2
            write_seqrecords(buf2['items'], orphan_out_fhand, out_format)
            buf2['items'] = []
            buf2['index'] = {}

    # Everything that remains in the buffers has no mate
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqrecords(orphan_seqs, orphan_out_fhand, out_format)
    orphan_out_fhand.flush()
    out_fhand.flush()
def __call__(self, seqrecords):
    '''It adds VECTOR trim segments to the seqrecords matched by the oligos.

    The seqrecords are written to a fasta file that is given, together with
    the oligos, to a BlastMatcher. For every read with matched segments the
    first item of those segments is passed to _add_trim_segments.
    It returns the same seqrecords that it was given.
    '''
    counters = self.stats
    fasta_fhand = write_seqrecords(seqrecords, file_format='fasta')
    fasta_fhand.flush()

    identity_filter = {'kind': 'score_threshold',
                       'score_key': 'identity',
                       'min_score': 89}
    length_filter = {'kind': 'min_length',
                     'min_num_residues': 13,
                     'length_in_query': False}
    blast_params = {'task': 'blastn-short', 'expect': '0.0001'}

    matcher = BlastMatcher(fasta_fhand.name, self.oligos,
                           program='blastn',
                           filters=[identity_filter, length_filter],
                           params=blast_params,
                           elongate_for_global=True)

    for record in seqrecords:
        counters[PROCESSED_SEQS] += 1
        matched = matcher.get_matched_segments_for_read(record.id)
        if matched is not None:
            _add_trim_segments(matched[0], record, kind=VECTOR)
        # Every read is yielded, with or without trim segments
        counters[YIELDED_SEQS] += 1
    return seqrecords
def _do_blast_2(db_fpath, queries, program, blast_format=None, params=None):
    """It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields.
    params is a dict with extra blast parameters. It is not modified, the
    outfmt built from blast_format is set in a copy.

    It returns a tuple with the blast parser and the file handle of the
    temporary blast output, so the caller decides when to close it.
    """
    query_fhand = write_seqrecords(queries, file_format="fasta")
    query_fhand.flush()
    blastdb = get_or_create_blastdb(db_fpath, dbtype=NUCL)

    if blast_format is None:
        blast_format = [
            "query",
            "subject",
            "query_length",
            "subject_length",
            "query_start",
            "query_end",
            "subject_start",
            "subject_end",
            "expect",
            "identity",
        ]
    fmt = generate_tabblast_format(blast_format)

    # We work with a copy, otherwise the outfmt would be added to the dict
    # owned by the caller
    params = {} if params is None else dict(params)
    params["outfmt"] = fmt

    blast_fhand = tempfile.NamedTemporaryFile(suffix=".blast")
    do_blast(query_fhand.name, blastdb, program, blast_fhand.name, params)

    blasts = TabularBlastParser(blast_fhand, blast_format)
    return blasts, blast_fhand