def match_pairs(reads, out_fhand, orphan_out_fhand, out_format, ordered=True,
                check_order_buffer_size=0, max_reads_memory=None,
                temp_dir=None):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.'''
    counts = 0
    check_order_buffer = KeyedSet()
    for pair in _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                       temp_dir):
        if len(pair) == 1:
            write_seqs(pair, orphan_out_fhand, out_format)
            try:
                name = _parse_pair_direction_and_name(pair[0])[0]
            except PairDirectionError:
                name = get_name(pair[0])
            if ordered and counts < check_order_buffer_size:
                counts += 1
                if not check_order_buffer.check_add(name):
                    msg = 'Reads are not ordered by pairs. Use unordered option'
                    raise ItemsNotSortedError(msg)
            elif ordered and counts >= check_order_buffer_size:
                if name in check_order_buffer:
                    msg = 'Reads are not ordered by pairs. Use unordered option'
                    raise ItemsNotSortedError(msg)
        elif len(pair) == 2:
            write_seqs(pair, out_fhand, out_format)
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)

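# A minimal usage sketch for match_pairs (not part of the library): the file
# names are hypothetical and read_seqs is assumed to be the sequence reader
# from the same package (its import is not shown in this listing).
def _example_match_pairs():
    with open('reads.fastq') as in_fhand, \
            open('paired.fastq', 'w') as out_fhand, \
            open('orphans.fastq', 'w') as orphan_fhand:
        reads = read_seqs([in_fhand])
        # ordered=False means the input reads are not required to come
        # grouped by pair
        match_pairs(reads, out_fhand, orphan_fhand, out_format='fastq',
                    ordered=False)
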
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits a sequence iterator with alternating paired reads in two.

    It will fail if forward and reverse reads are not alternating.
    '''
    for pair in group_pairs(seqs, n_seqs_in_pair=2):
        write_seqs((pair[0],), out_fhand1, out_format)
        write_seqs((pair[1],), out_fhand2, out_format)
    out_fhand1.flush()
    out_fhand2.flush()

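# A usage sketch for deinterleave_pairs (hypothetical file names; read_seqs
# is assumed to be the reader from the same package). Forward and reverse
# reads must alternate in the input file.
def _example_deinterleave_pairs():
    with open('interleaved.fastq') as in_fhand, \
            open('reads_1.fastq', 'w') as fwd_fhand, \
            open('reads_2.fastq', 'w') as rev_fhand:
        seqs = read_seqs([in_fhand])
        deinterleave_pairs(seqs, fwd_fhand, rev_fhand, out_format='fastq')
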
def _pre_trim(self, trim_packet):
    seqs = [s for seqs in trim_packet[SEQS_PASSED] for s in seqs]
    reads_fhand = NamedTemporaryFile(dir=self._tempdir, suffix='.trimming')
    write_seqs(seqs, reads_fhand)
    reads_fhand.flush()
    bwa = map_with_bwamem(self._index_fpath,
                          interleave_fpath=reads_fhand.name)
    bam_fhand = NamedTemporaryFile(dir=self._tempdir)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=self._tempdir)
    self._bam_fhand = bam_fhand
    reads_fhand.close()

def filter_duplicates(in_fhands, out_fhand, paired_reads, use_length=None,
                      n_seqs_packet=None, tempdir=None):
    if not in_fhands:
        raise ValueError('At least one input fhand is required')
    pairs = _read_pairs(in_fhands, paired_reads)
    get_pair_key = _PairKeyGetter(use_length=use_length)
    if n_seqs_packet is None:
        unique_pairs = unique_unordered(pairs, key=get_pair_key)
    else:
        sorted_pairs = sorted_items(pairs, key=get_pair_key, tempdir=tempdir,
                                    max_items_in_memory=n_seqs_packet)
        unique_pairs = unique(sorted_pairs, key=get_pair_key)
    for pair in unique_pairs:
        write_seqs(pair, out_fhand)

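# A usage sketch for filter_duplicates (hypothetical file names). Leaving
# n_seqs_packet as None keeps the deduplication in memory (unique_unordered);
# giving it a number switches to the external-sort path used above.
def _example_filter_duplicates():
    with open('reads_1.fastq') as fhand1, \
            open('reads_2.fastq') as fhand2, \
            open('unique.fastq', 'w') as out_fhand:
        filter_duplicates([fhand1, fhand2], out_fhand, paired_reads=True)
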
def __call__(self, seqs):
    'It splits a list of sequences with the provided linkers'
    seq_fhand = write_seqs(seqs, file_format='fasta')
    seq_fhand.flush()
    min_identity = 87.0
    min_len = 13
    filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                'length_in_query': False, 'filter_match_parts': True},
               {'kind': 'score_threshold', 'score_key': 'identity',
                'min_score': min_identity}]
    matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                    program='blastn', filters=filters,
                                    params={'task': 'blastn-short'},
                                    elongate_for_global=True,
                                    seqs_type=NUCL)
    new_seqs = []
    for seq in seqs:
        segments = matcher.get_matched_segments_for_read(get_name(seq))
        if segments is not None:
            split_seqs = self._split_by_mate_linker(seq, segments)
        else:
            split_seqs = [seq]
        for seq in split_seqs:
            new_seqs.append(seq)
    return new_seqs

def _do_blast_2(db_fpath, queries, program, dbtype=None, blast_format=None,
                params=None, remote=False):
    '''It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields.
    '''
    query_fhand = write_seqs(queries, file_format='fasta')
    query_fhand.flush()
    if remote:
        blastdb = db_fpath
        fmt = 'XML' if blast_format is None else blast_format.upper()
    else:
        blastdb = get_or_create_blastdb(db_fpath, dbtype=dbtype)
        if blast_format is None:
            blast_format = ['query', 'subject', 'query_length',
                            'subject_length', 'query_start', 'query_end',
                            'subject_start', 'subject_end', 'expect',
                            'identity']
        fmt = generate_tabblast_format(blast_format)
    if params is None:
        params = {}
    params['outfmt'] = fmt
    blast_fhand = tempfile.NamedTemporaryFile(suffix='.blast')
    do_blast(query_fhand.name, blastdb, program, blast_fhand.name, params,
             remote=remote)
    if remote:
        blasts = BlastParser(blast_fhand)
    else:
        blasts = TabularBlastParser(blast_fhand, blast_format)
    return blasts, blast_fhand

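# A usage sketch for _do_blast_2 (hypothetical paths). queries would be a
# list of Biopython SeqRecord objects, as the docstring states; the example
# assumes the returned parser is iterable and closes blast_fhand afterwards.
def _example_do_blast_2(queries):
    blasts, blast_fhand = _do_blast_2('subjects.fasta', queries,
                                      program='blastn', dbtype='nucl')
    try:
        for blast_result in blasts:
            pass  # inspect each parsed blast result here
    finally:
        blast_fhand.close()
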
def _run_estscan(seqs, pep_out_fpath, dna_out_fpath, matrix_fpath):
    'It runs estscan on the input seqs'
    seq_fhand = write_seqs(seqs, file_format='fasta')
    seq_fhand.flush()
    binary = get_binary_path('estscan')
    cmd = [binary, '-t', pep_out_fpath, '-o', dna_out_fpath, '-M',
           matrix_fpath, seq_fhand.name]
    process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    check_process_finishes(process, binary=cmd[0])
    seq_fhand.close()

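# A usage sketch for _run_estscan (hypothetical output and matrix paths).
# ESTScan requires an organism-specific scoring matrix file.
def _example_run_estscan(seqs):
    _run_estscan(seqs, pep_out_fpath='orfs.pep', dna_out_fpath='orfs.dna',
                 matrix_fpath='/path/to/estscan_matrix.smat')
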
def test_seqitems_io(self):
    'It checks the different seq class streams IO'
    fhand = StringIO('>s1\nACTG\n>s2 desc\nACTG\n')
    seqs = list(read_seqs([fhand], prefered_seq_classes=[SEQITEM]))
    assert seqs[0].kind == SEQITEM
    fhand = StringIO()
    write_seqs(seqs, fhand)
    assert fhand.getvalue() == '>s1\nACTG\n>s2 desc\nACTG\n'
    assert seqs[0].object.name == 's1'

    # SeqRecord
    fhand = StringIO('>s1\nACTG\n>s2 desc\nACTG\n')
    seqs = list(read_seqs([fhand], prefered_seq_classes=[SEQRECORD]))
    assert seqs[0].kind == SEQRECORD
    fhand = StringIO()
    write_seqs(seqs, fhand, 'fasta')
    assert fhand.getvalue() == '>s1\nACTG\n>s2 desc\nACTG\n'

    # seqitem not possible with different input and output formats
    fhand = StringIO('>s1\nACTG\n>s2 desc\nACTG\n')
    try:
        seqs = list(read_seqs([fhand], out_format='fastq',
                              prefered_seq_classes=[SEQITEM]))
        self.fail('ValueError expected')
    except ValueError:
        pass

    fhand = StringIO('>s1\nACTG\n>s2 desc\nACTG\n')
    seqs = list(read_seqs([fhand], out_format='fasta',
                          prefered_seq_classes=[SEQITEM]))
    fhand = StringIO()
    write_seqs(seqs, fhand)
    assert fhand.getvalue() == '>s1\nACTG\n>s2 desc\nACTG\n'

def _setup_checks(self, filterpacket):
    index_fpath = self._index_fpath
    seqs = [s for seqs in filterpacket[SEQS_PASSED] for s in seqs]
    seq_class = seqs[0].kind
    extra_params = []
    # Which format do we need for the bowtie2 input read file, fasta or
    # fastq?
    if seq_class == SEQRECORD:
        if 'phred_quality' in seqs[0].object.letter_annotations.viewkeys():
            file_format = 'fastq'
        else:
            extra_params.append('-f')
            file_format = 'fasta'
    elif seq_class == SEQITEM:
        file_format = get_file_format(seqs[0])
        if 'illumina' in file_format:
            extra_params.append('--phred64')
        elif 'fasta' in file_format:
            extra_params.append('-f')
        elif 'fastq' in file_format:
            pass
        else:
            msg = 'For FilterBowtie2Match and SeqItems fastq or fasta '
            msg += 'files are required'
            raise RuntimeError(msg)
    else:
        raise NotImplementedError()
    reads_fhand = NamedTemporaryFile(suffix=file_format)
    write_seqs(seqs, reads_fhand, file_format=file_format)
    reads_fhand.flush()
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    map_process = map_with_bowtie2(index_fpath,
                                   unpaired_fpath=reads_fhand.name,
                                   extra_params=extra_params,
                                   threads=self.threads)
    map_process_to_bam(map_process, bam_fhand.name)
    self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)

def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps sequences from input files, sorts them and writes them to
    output files according to their classification.'''
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=extra_params)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                             tempdir=tempdir)
    for pair, kind in classify_mapped_reads(bam_fhand, settings=settings,
                                            mate_distance=mate_distance):
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind is CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
        elif kind is UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)

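# A usage sketch for classify_chimeras (hypothetical paths and values). The
# input is an interleaved mate-pair file, index_fpath points to an existing
# BWA index and mate_distance is the expected distance between mates.
def _example_classify_chimeras():
    with open('mates.fastq') as in_fhand, \
            open('non_chimeric.fastq', 'w') as out_fhand, \
            open('chimeric.fastq', 'w') as chimeras_fhand:
        classify_chimeras(in_fhand, index_fpath='reference_index',
                          mate_distance=3000, out_fhand=out_fhand,
                          chimeras_fhand=chimeras_fhand)
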
def _pre_trim(self, trim_packet):
    seqs = [s for seqs in trim_packet[SEQS_PASSED] for s in seqs]
    db_fhand = write_seqs(seqs, file_format='fasta')
    db_fhand.flush()
    params = {'task': 'blastn-short', 'expect': '0.0001'}
    filters = [{'kind': 'score_threshold', 'score_key': 'identity',
                'min_score': 87},
               {'kind': 'min_length', 'min_num_residues': 13,
                'length_in_query': False}]
    self._matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                          program='blastn', filters=filters,
                                          params=params,
                                          elongate_for_global=True)

def _setup_checks(self, filterpacket):
    seqs = [s for seqs in filterpacket[SEQS_PASSED] for s in seqs]
    # we create a blastdb for these reads and then we use the oligos
    # as the blast query
    db_fhand = write_seqs(seqs, file_format='fasta')
    db_fhand.flush()
    params = {'task': 'blastn-short', 'expect': '0.0001'}
    filters = [{'kind': 'score_threshold', 'score_key': 'identity',
                'min_score': 87},
               {'kind': 'min_length', 'min_num_residues': 13,
                'length_in_query': False}]
    self._matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                          program='blastn', filters=filters,
                                          params=params,
                                          elongate_for_global=False)