예제 #1
0
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits a sequence iterable with alternating paired reads in two.

    The reads are consumed two at a time: the first read of every pair is
    written to out_fhand1 and the second one to out_fhand2, both with
    write_seqrecords and in out_format.

    Every pair is handed to _check_name_and_direction_match before being
    written, so it will fail if forward and reverse reads are not
    alternating. It raises an InterleaveError if the input holds an odd
    number of reads. Both output handles are flushed at the end.
    '''
    # The next() builtin works with any iterator (the .next() method is only
    # found in Python 2 iterators) and iter() lets callers pass a plain list.
    seqs = iter(seqs)
    while True:
        seq1 = next(seqs, None)
        if seq1 is None:
            break   # we have consumed the input iterator completely
        seq2 = next(seqs, None)
        if seq2 is None:
            msg = 'The file had an odd number of sequences'
            raise InterleaveError(msg)
        _check_name_and_direction_match(seq1, seq2)
        write_seqrecords([seq1], out_fhand1, out_format)
        write_seqrecords([seq2], out_fhand2, out_format)
    out_fhand1.flush()
    out_fhand2.flush()
 def test_write_empy_seq(self):
     'Only the seq with residues reaches the output, None and empty do not'
     out_buffer = StringIO()
     valid_rec = SeqRecord(Seq('ACTG'), id='seq1')
     empty_rec = SeqRecord(Seq(''), id='seq2')
     records = [valid_rec, None, empty_rec]
     write_seqrecords(records, out_buffer, file_format='fasta')
     out_buffer.flush()
     written = out_buffer.getvalue()
     assert written == '>seq1\nACTG\n'
예제 #3
0
def match_pairs_unordered(seq_fpath, out_fhand, orphan_out_fhand, out_format):
    '''It separates the paired seqs of a file from the orphan ones.

    The file in seq_fpath is indexed by _index_seq_file and the index is
    classified by _get_paired_and_orphan. The paired seqs are written to
    out_fhand and the orphans to orphan_out_fhand, both in out_format.
    '''
    seq_index = _index_seq_file(seq_fpath)
    paired_titles, orphan_titles = _get_paired_and_orphan(seq_index)

    destinations = ((paired_titles, out_fhand),
                    (orphan_titles, orphan_out_fhand))
    # the paired seqs are written first, the orphans afterwards
    for titles, fhand in destinations:
        records = (seq_index[title] for title in titles)
        write_seqrecords(records, fhand, out_format)
예제 #4
0
    def __call__(self, seqs):
        '''It splits the given seqs wherever the linkers match.

        The seqs are written in fasta format with write_seqrecords and the
        name of the resulting handle is given, with self.linkers, to a
        blastn BlastMatcher (blastn-short task). The seqs for which the
        matcher reports segments are split by _split_by_mate_linker, the
        others are kept as they are.

        It returns a new list with the resulting seqs and it updates the
        PROCESSED_PACKETS, PROCESSED_SEQS and YIELDED_SEQS counters found
        in self._stats.
        '''
        counters = self._stats
        counters[PROCESSED_PACKETS] += 1
        fasta_fhand = write_seqrecords(seqs, file_format='fasta')
        fasta_fhand.flush()

        min_len = 17
        min_identity = 87.0
        length_filter = {'kind': 'min_length', 'min_num_residues': min_len,
                         'length_in_query': False, 'filter_match_parts': True}
        identity_filter = {'kind': 'score_threshold', 'score_key': 'identity',
                           'min_score': min_identity}

        matcher = BlastMatcher(fasta_fhand.name, self.linkers,
                               program='blastn',
                               filters=[length_filter, identity_filter],
                               params={'task': 'blastn-short'},
                               elongate_for_global=True)
        result = []
        for record in seqs:
            counters[PROCESSED_SEQS] += 1
            matched = matcher.get_matched_segments_for_read(record.id)
            if matched is None:
                # nothing matched, the seq goes through untouched
                pieces = [record]
            else:
                pieces = self._split_by_mate_linker(record, matched)
            for piece in pieces:
                result.append(piece)
                counters[YIELDED_SEQS] += 1
        return result
예제 #5
0
def _run_estscan(seqrecords, pep_out_fpath, dna_out_fpath, matrix_fpath):
    '''It runs estscan in the input seqs.

    The seqrecords are written in fasta format with write_seqrecords and the
    name of the resulting handle is the input given to the estscan binary.
    pep_out_fpath is passed with the -t option, dna_out_fpath with -o and
    matrix_fpath with -M.

    The process is handed to check_process_finishes; the handle with the
    input seqs is always closed before returning, also when something fails.
    '''
    seq_fhand = write_seqrecords(seqrecords, file_format='fasta')
    try:
        seq_fhand.flush()
        binary = get_binary_path('estscan')

        cmd = [binary, '-t', pep_out_fpath, '-o', dna_out_fpath, '-M',
               matrix_fpath, seq_fhand.name]
        process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        check_process_finishes(process, binary=cmd[0])
    finally:
        # Close the input handle also when the binary is not found or the
        # run fails, otherwise it would be leaked in the error paths.
        seq_fhand.close()
예제 #6
0
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=DEFAULT_SEQS_IN_MEM_LIMIT):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The seqs without a mate are kept in two buffers, one per direction,
    until their mate shows up. When a seq finds its mate in the buffer of
    the opposite direction the pair is written to out_fhand, and the seqs
    buffered before the mate, as well as the ones buffered in the direction
    of the current seq, are written to orphan_out_fhand. The seqs still
    buffered when the input ends are written as orphans too.

    If memory_limit is not None a MaxNumReadsInMem error is raised as soon
    as the number of buffered seqs reaches it. Both output handles are
    flushed at the end.
    '''
    # In each buffer 'items' holds the seqs in arrival order and 'index'
    # maps the seq name to the position of the seq in 'items'.
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    # Bound before the loop, so the final orphan dump also works when seqs
    # is empty.
    buf1, buf2 = buf_fwd, buf_rev
    for seq in seqs:
        seq_name, direction = _parse_pair_direction_and_name(seq)
        # buf1 is the buffer where the mate is looked for, buf2 is the
        # buffer for the direction of the current seq
        if direction == FWD:
            buf1, buf2 = buf_rev, buf_fwd
        else:
            buf1, buf2 = buf_fwd, buf_rev

        # the stored positions are ints, so None only means "not buffered"
        matching_seq_index = buf1['index'].get(seq_name)

        if matching_seq_index is None:
            #add to buff
            buf2['items'].append(seq)
            buf2['index'][seq_name] = len(buf2['items']) - 1
            #check mem limit, counting without building a joined list
            sum_items = len(buf1['items']) + len(buf2['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit'
                raise MaxNumReadsInMem(error_msg)
        else:
            # write seqs from buffer1
            orphan_seqs = buf1['items'][:matching_seq_index]
            matching_seq = buf1['items'][matching_seq_index]
            write_seqrecords(orphan_seqs, orphan_out_fhand, out_format)
            write_seqrecords([matching_seq, seq], out_fhand, out_format)
            # fix buffer 1: drop the written seqs and shift the positions of
            # the remaining ones. The index has to stay keyed by seq name,
            # otherwise the mates of the remaining seqs would never be found.
            n_removed = matching_seq_index + 1
            buf1['items'] = buf1['items'][n_removed:]
            buf1['index'] = {name: idx - n_removed
                             for name, idx in buf1['index'].items()
                             if idx >= n_removed}

            # writes seqs from buffer 2 and fix buffer2
            write_seqrecords(buf2['items'], orphan_out_fhand, out_format)
            buf2['items'] = []
            buf2['index'] = {}

    # whatever is left in the buffers never found its mate
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqrecords(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    out_fhand.flush()
예제 #7
0
 def __call__(self, seqrecords):
     '''It registers the matches with the oligos as segments to trim.

     The seqrecords are written in fasta format with write_seqrecords and
     the name of the resulting handle is given, with self.oligos, to a
     blastn BlastMatcher (blastn-short task). For every seqrecord with
     matched segments the first item of those segments is passed to
     _add_trim_segments with the VECTOR kind.

     It returns the given seqrecords object and it updates the
     PROCESSED_SEQS and YIELDED_SEQS counters found in self.stats.
     '''
     counters = self.stats
     fasta_fhand = write_seqrecords(seqrecords, file_format='fasta')
     fasta_fhand.flush()
     identity_filter = {'kind': 'score_threshold', 'score_key': 'identity',
                        'min_score': 89}
     length_filter = {'kind': 'min_length', 'min_num_residues': 13,
                      'length_in_query': False}
     blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
     matcher = BlastMatcher(fasta_fhand.name, self.oligos, program='blastn',
                            filters=[identity_filter, length_filter],
                            params=blast_params, elongate_for_global=True)
     for record in seqrecords:
         counters[PROCESSED_SEQS] += 1
         matched = matcher.get_matched_segments_for_read(record.id)
         if matched is not None:
             _add_trim_segments(matched[0], record, kind=VECTOR)
         # every seqrecord is yielded, with or without matches
         counters[YIELDED_SEQS] += 1
     return seqrecords
예제 #8
0
def _do_blast_2(db_fpath, queries, program, blast_format=None, params=None):
    """It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields.
    params is an optional dict with extra blast parameters. It is not
    modified: the "outfmt" parameter is set in a private copy.

    It returns a tuple with the TabularBlastParser built on the blast output
    and the temporary file handle that holds that output.
    """

    query_fhand = write_seqrecords(queries, file_format="fasta")
    query_fhand.flush()

    blastdb = get_or_create_blastdb(db_fpath, dbtype=NUCL)

    if blast_format is None:
        blast_format = [
            "query",
            "subject",
            "query_length",
            "subject_length",
            "query_start",
            "query_end",
            "subject_start",
            "subject_end",
            "expect",
            "identity",
        ]
    fmt = generate_tabblast_format(blast_format)
    # Work on a copy: setting "outfmt" in place would leak the value into the
    # dict of the caller and into any later call that reuses that dict.
    params = {} if params is None else dict(params)
    params["outfmt"] = fmt

    blast_fhand = tempfile.NamedTemporaryFile(suffix=".blast")
    do_blast(query_fhand.name, blastdb, program, blast_fhand.name, params)

    blasts = TabularBlastParser(blast_fhand, blast_format)

    # NOTE(review): blast_fhand is presumably returned so the caller keeps
    # the temporary file alive while the parser is in use; confirm with the
    # callers.
    return blasts, blast_fhand