Пример #1
0
def match_pairs(
    reads,
    out_fhand,
    orphan_out_fhand,
    out_format,
    ordered=True,
    check_order_buffer_size=0,
    max_reads_memory=None,
    temp_dir=None,
):
    """Split the reads into a paired output and an orphan output.

    Every group produced by _get_paired_and_orphan is inspected: groups of
    two reads are written to out_fhand, groups of one read are written to
    orphan_out_fhand and groups of any other size are skipped.

    When ordered is true the orphan names are used to detect unsorted input.
    The first check_order_buffer_size orphan names are stored in a KeyedSet
    and ItemsNotSortedError is raised when check_add returns a false value
    (presumably because the name was already stored -- confirm against
    KeyedSet). For later orphans the error is raised when the name is
    already in the set.

    Both output handles are flushed with flush_fhand once every group has
    been written.
    """
    unsorted_msg = "Reads are not ordered by pairs.Use unordered option"
    n_buffered = 0
    orphan_names = KeyedSet()
    groups = _get_paired_and_orphan(reads, ordered, max_reads_memory, temp_dir)
    for group in groups:
        group_size = len(group)
        if group_size == 2:
            write_seqs(group, out_fhand, out_format)
            continue
        if group_size != 1:
            continue

        write_seqs(group, orphan_out_fhand, out_format)
        orphan = group[0]
        try:
            orphan_name = _parse_pair_direction_and_name(orphan)[0]
        except PairDirectionError:
            # The direction parser rejected the read: fall back to get_name.
            orphan_name = get_name(orphan)

        if not ordered:
            continue
        if n_buffered < check_order_buffer_size:
            n_buffered += 1
            out_of_order = not orphan_names.check_add(orphan_name)
        else:
            out_of_order = orphan_name in orphan_names
        if out_of_order:
            raise ItemsNotSortedError(unsorted_msg)
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
Пример #2
0
def match_pairs(reads,
                out_fhand,
                orphan_out_fhand,
                out_format,
                ordered=True,
                check_order_buffer_size=0,
                max_reads_memory=None,
                temp_dir=None):
    '''Write the paired reads to out_fhand and the lone reads to the orphans.

    The groups come from _get_paired_and_orphan. A group with two reads is a
    pair, a group with one read is an orphan, anything else is not written.

    With ordered set, the orphan names double as a sort check: up to
    check_order_buffer_size names are added to a KeyedSet (a false result
    from check_add raises ItemsNotSortedError) and, once that many names
    have been added, an orphan whose name is in the set raises the same
    error. What check_add returns for a new name is not visible here --
    presumably a true value; confirm against KeyedSet.
    '''

    def _orphan_name(read):
        # Prefer the name without the direction tag, else the raw name.
        try:
            return _parse_pair_direction_and_name(read)[0]
        except PairDirectionError:
            return get_name(read)

    names_seen = KeyedSet()
    names_added = 0
    for reads_group in _get_paired_and_orphan(reads, ordered,
                                              max_reads_memory, temp_dir):
        n_reads = len(reads_group)
        if n_reads == 1:
            write_seqs(reads_group, orphan_out_fhand, out_format)
            name = _orphan_name(reads_group[0])
            if ordered:
                if names_added < check_order_buffer_size:
                    names_added += 1
                    is_sorted = names_seen.check_add(name)
                else:
                    is_sorted = name not in names_seen
                if not is_sorted:
                    msg = 'Reads are not ordered by pairs.Use unordered option'
                    raise ItemsNotSortedError(msg)
        elif n_reads == 2:
            write_seqs(reads_group, out_fhand, out_format)
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
Пример #3
0
def filter_duplicates(in_fhands, out_fhand, paired_reads,
                      n_seqs_packet=None, tempdir=None):
    """Write to out_fhand the read groups that survive duplicate removal.

    The groups read by _read_pairs are sorted with sorted_items, using
    _get_pair_key as key, n_seqs_packet as max_items_in_memory and tempdir
    as temporary directory. The sorted stream is then passed through unique
    with the same key and every resulting group is written with write_seqs.

    Raises ValueError when in_fhands is empty or None.
    """
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    key = _get_pair_key
    read_groups = _read_pairs(in_fhands, paired_reads)
    ordered_groups = sorted_items(read_groups, key=key, tempdir=tempdir,
                                  max_items_in_memory=n_seqs_packet)
    deduplicated = unique(ordered_groups, key=key)
    for group in deduplicated:
        write_seqs(group, out_fhand)
Пример #4
0
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    """Send the first read of each pair to one file and the second to another.

    The reads are grouped two by two with group_pairs. For every group the
    item at position 0 is written to out_fhand1 and the item at position 1
    to out_fhand2, both in out_format. Any validation of the pairing is left
    to group_pairs. Both handles are flushed at the end.
    """
    destinations = (out_fhand1, out_fhand2)
    for mates in group_pairs(seqs, n_seqs_in_pair=2):
        for position, fhand in enumerate(destinations):
            write_seqs((mates[position],), fhand, out_format)
    for fhand in destinations:
        fhand.flush()
Пример #5
0
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''Split an interleaved read stream into two output files.

    group_pairs yields the reads in groups of two; the first member of each
    group goes to out_fhand1 and the second one to out_fhand2. Checking that
    the reads really alternate is left to group_pairs.
    '''
    groups = group_pairs(seqs, n_seqs_in_pair=2)
    for group in groups:
        first_read = (group[0], )
        write_seqs(first_read, out_fhand1, out_format)
        second_read = (group[1], )
        write_seqs(second_read, out_fhand2, out_format)
    out_fhand1.flush()
    out_fhand2.flush()
Пример #6
0
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    """Match the seq pairs in an iterator and split off the orphan seqs.

    Reads whose name and direction can not be parsed (PairDirectionError)
    are written straight to orphan_out_fhand. The other reads are kept in
    one buffer per direction until a read with the same name shows up in the
    opposite direction. When that happens the two mates are written to
    out_fhand and every other read that was buffered before the match is
    written to orphan_out_fhand.

    seqs: iterable with the reads.
    out_fhand: handle that receives the matched pairs.
    orphan_out_fhand: handle that receives the reads without a mate.
    out_format: format forwarded to write_seqs.
    memory_limit: maximum number of buffered reads; None disables the check.

    Raises MaxNumReadsInMem when the number of buffered reads reaches
    memory_limit.
    """
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_fwd, buf_rev   # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 is the buffer where the mate is looked for, buf2 is the
        # buffer where this read is stored when the mate is not there yet.
        if direction == FWD:
            buf1, buf2 = buf_rev, buf_fwd
        else:
            buf1, buf2 = buf_fwd, buf_rev

        matching_seq_index = buf1['index'].get(seq_name)

        if matching_seq_index is None:
            # add to buff
            buf2['items'].append(seq)
            buf2['index'][seq_name] = len(buf2['items']) - 1
            # check mem limit (no need to build a joined list to count)
            sum_items = len(buf1['items']) + len(buf2['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit'
                raise MaxNumReadsInMem(error_msg)
            continue

        # write seqs from buffer1: the reads stored before the mate are
        # orphans, the mate goes to the paired output with this read
        orphan_seqs = buf1['items'][:matching_seq_index]
        matching_seq = buf1['items'][matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)

        # fix buffer 1. The index has to be keyed by read name, like the
        # lookups and the insertions are; keying it by the read object made
        # the remaining reads impossible to match afterwards. Every
        # buffered read was parsed successfully before being stored, so
        # parsing it again is not expected to fail.
        remaining = buf1['items'][matching_seq_index + 1:]
        buf1['items'] = remaining
        buf1['index'] = {_parse_pair_direction_and_name(item)[0]: position
                         for position, item in enumerate(remaining)}

        # writes seqs from buffer 2 and fix buffer2
        write_seqs(buf2['items'], orphan_out_fhand, out_format)
        buf2['items'] = []
        buf2['index'] = {}

    # whatever is still buffered at the end of the input has no mate
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
Пример #7
0
def match_pairs_unordered(seq_fpath, out_fhand, orphan_out_fhand, out_format):
    """Split the reads of a sequence file into paired and orphan outputs.

    The file is indexed with _index_seq_file and _get_paired_and_orphan
    classifies the index titles. The indexed records are wrapped as
    SEQRECORD SeqWrappers; the paired ones are written to out_fhand and
    then the orphans to orphan_out_fhand.
    """
    seq_index = _index_seq_file(seq_fpath)
    paired_titles, orphan_titles = _get_paired_and_orphan(seq_index)

    destinations = ((paired_titles, out_fhand),
                    (orphan_titles, orphan_out_fhand))
    for titles, fhand in destinations:
        wrapped = (SeqWrapper(SEQRECORD, seq_index[title], None)
                   for title in titles)
        write_seqs(wrapped, fhand, out_format)
Пример #8
0
def match_pairs_unordered(seq_fpath, out_fhand, orphan_out_fhand, out_format):
    """Write the paired reads and the orphan reads of a file separately.

    _index_seq_file builds an index of the file and _get_paired_and_orphan
    returns the titles of the paired and of the orphan reads. Each indexed
    record is wrapped in a SeqWrapper of kind SEQRECORD before being
    written with write_seqs.
    """
    records_by_title = _index_seq_file(seq_fpath)
    paired, orphans = _get_paired_and_orphan(records_by_title)

    # write paired
    paired_seqs = (SeqWrapper(SEQRECORD, records_by_title[title], None)
                   for title in paired)
    write_seqs(paired_seqs, out_fhand, out_format)

    # orphans
    orphan_seqs = (SeqWrapper(SEQRECORD, records_by_title[title], None)
                   for title in orphans)
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)
Пример #9
0
    def _pre_trim(self, trim_packet):
        """Map the reads of the packet with bwa mem and keep the sorted bam.

        The reads found under SEQS_PASSED are flattened and written to a
        temporary file that is given to map_with_bwamem as interleave_fpath.
        The mapping is sorted by queryname into a second temporary file,
        which is stored in self._bam_fhand.

        The temporary reads file is closed even when writing or mapping
        fails, so it is not left behind on the error path.
        """
        seqs = [s for seqs in trim_packet[SEQS_PASSED] for s in seqs]
        reads_fhand = NamedTemporaryFile(dir=self._tempdir, suffix='.trimming')
        try:
            write_seqs(seqs, reads_fhand)
            # bwa reads the file by name, so the content has to be on disk
            reads_fhand.flush()
            bwa = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=reads_fhand.name)
            bam_fhand = NamedTemporaryFile(dir=self._tempdir)
            map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname',
                                     tempdir=self._tempdir)
        finally:
            reads_fhand.close()

        self._bam_fhand = bam_fhand
Пример #10
0
def filter_duplicates(in_fhands, out_fhand, paired_reads, use_length=None,
                      n_seqs_packet=None, tempdir=None):
    """Write to out_fhand the read groups left after removing duplicates.

    The groups come from _read_pairs and are compared with a _PairKeyGetter
    built with use_length. When n_seqs_packet is None the duplicates are
    removed with unique_unordered; otherwise the groups are first sorted
    with sorted_items (n_seqs_packet as max_items_in_memory, tempdir as
    temporary directory) and then filtered with unique.

    Raises ValueError when in_fhands is empty or None.
    """
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    read_groups = _read_pairs(in_fhands, paired_reads)
    key = _PairKeyGetter(use_length=use_length)
    if n_seqs_packet is not None:
        ordered_groups = sorted_items(read_groups, key=key, tempdir=tempdir,
                                      max_items_in_memory=n_seqs_packet)
        kept_groups = unique(ordered_groups, key=key)
    else:
        kept_groups = unique_unordered(read_groups, key=key)

    for group in kept_groups:
        write_seqs(group, out_fhand)
Пример #11
0
    def __call__(self, seqs):
        """Split the given sequences wherever one of self.linkers matches.

        The sequences are written to a fasta file that is searched against
        the linkers with BlasterForFewSubjects (blastn, task blastn-short).
        Reads with matched segments are replaced by the result of
        _split_by_mate_linker; the other reads are returned unchanged.

        seqs is iterated twice (once to write it, once to split it), so it
        should be a list and not a one-shot iterator.
        """
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        length_filter = {'kind': 'min_length', 'min_num_residues': min_len,
                         'length_in_query': False, 'filter_match_parts': True}
        identity_filter = {'kind': 'score_threshold', 'score_key': 'identity',
                           'min_score': min_identity}

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn',
                                        filters=[length_filter,
                                                 identity_filter],
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is None:
                new_seqs.append(seq)
            else:
                new_seqs.extend(self._split_by_mate_linker(seq, segments))
        return new_seqs
Пример #12
0
    def __call__(self, seqs):
        """Return the sequences split by the linkers stored in self.linkers.

        A fasta file with the sequences is created with write_seqs and used
        to build a BlasterForFewSubjects matcher. Every read is looked up in
        the matcher by name: reads without matched segments are kept as they
        are, the rest are replaced by the pieces that _split_by_mate_linker
        returns.
        """
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        filters = [
            {'kind': 'min_length', 'min_num_residues': min_len,
             'length_in_query': False, 'filter_match_parts': True},
            {'kind': 'score_threshold', 'score_key': 'identity',
             'min_score': min_identity},
        ]
        blast_params = {'task': 'blastn-short'}

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params=blast_params,
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        split_result = []
        for read in seqs:
            read_name = get_name(read)
            segments = matcher.get_matched_segments_for_read(read_name)
            if segments is not None:
                pieces = self._split_by_mate_linker(read, segments)
            else:
                pieces = [read]
            split_result.extend(pieces)
        return split_result
Пример #13
0
    def test_all_orphan():
        """Reads without a mate all end up in the orphan output."""
        records = [SeqRecord(Seq("ACT"), id=name) for name in ("seq1", "seq2")]
        seqs = list(assing_kind_to_seqs(SEQRECORD, records, None))

        # ordered matching keeps the input order in the orphan file
        paired_out = StringIO()
        orphan_out = StringIO()
        match_pairs(seqs, paired_out, orphan_out, out_format="fasta")
        assert orphan_out.getvalue() == ">seq1\nACT\n>seq2\nACT\n"

        # unordered matching works from a file and the order is not checked
        seq_fhand = NamedTemporaryFile(suffix=".fasta")
        write_seqs(seqs, seq_fhand, file_format="fasta")
        seq_fhand.flush()
        paired_out = StringIO()
        orphan_out = StringIO()
        match_pairs_unordered(seq_fhand.name, paired_out, orphan_out, out_format="fasta")
        for expected in (">seq1\nACT\n", ">seq2\nACT\n"):
            assert expected in orphan_out.getvalue()
Пример #14
0
def _do_blast_2(db_fpath,
                queries,
                program,
                dbtype=None,
                blast_format=None,
                params=None,
                remote=False):
    '''It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields. For a remote blast, blast_format is
    upper-cased and used directly as the output format (XML by default).

    params are extra blast parameters. The dict given by the caller is not
    modified: the output format is set on a copy.

    It returns a tuple with the blast parser and the temporary file that
    holds the blast output.
    '''

    query_fhand = write_seqs(queries, file_format='fasta')
    query_fhand.flush()

    if remote:
        blastdb = db_fpath
        fmt = 'XML' if blast_format is None else blast_format.upper()
    else:
        blastdb = get_or_create_blastdb(db_fpath, dbtype=dbtype)
        if blast_format is None:
            blast_format = [
                'query',
                'subject',
                'query_length',
                'subject_length',
                'query_start',
                'query_end',
                'subject_start',
                'subject_end',
                'expect',
                'identity',
            ]
        fmt = generate_tabblast_format(blast_format)

    # Work on a copy so the caller's dict does not silently gain an
    # 'outfmt' key.
    params = {} if params is None else dict(params)
    params['outfmt'] = fmt

    blast_fhand = tempfile.NamedTemporaryFile(suffix='.blast')
    do_blast(query_fhand.name,
             blastdb,
             program,
             blast_fhand.name,
             params,
             remote=remote)
    if remote:
        blasts = BlastParser(blast_fhand)
    else:
        blasts = TabularBlastParser(blast_fhand, blast_format)

    return blasts, blast_fhand
Пример #15
0
    def test_all_orphan():
        """Two reads with different names must both be reported as orphans."""
        first = SeqRecord(Seq('ACT'), id='seq1')
        second = SeqRecord(Seq('ACT'), id='seq2')
        seqs = list(assing_kind_to_seqs(SEQRECORD, [first, second], None))

        out_fhand, orphan_out_fhand = StringIO(), StringIO()
        match_pairs(seqs, out_fhand, orphan_out_fhand, out_format='fasta')
        assert orphan_out_fhand.getvalue() == '>seq1\nACT\n>seq2\nACT\n'

        # the unordered implementation reads the sequences from a file
        seq_fhand = NamedTemporaryFile(suffix='.fasta')
        write_seqs(seqs, seq_fhand, file_format='fasta')
        seq_fhand.flush()
        out_fhand, orphan_out_fhand = StringIO(), StringIO()
        match_pairs_unordered(seq_fhand.name, out_fhand, orphan_out_fhand,
                              out_format='fasta')
        assert '>seq1\nACT\n' in orphan_out_fhand.getvalue()
        assert '>seq2\nACT\n' in orphan_out_fhand.getvalue()
Пример #16
0
def _run_estscan(seqs, pep_out_fpath, dna_out_fpath, matrix_fpath):
    """Run estscan on the input seqs.

    The seqs are written to a fasta file whose path is the last argument of
    the estscan command line. pep_out_fpath follows the -t option,
    dna_out_fpath the -o option and matrix_fpath the -M option.

    The fasta file is closed even when the process check raises, so it is
    not left open on the error path.
    """
    seq_fhand = write_seqs(seqs, file_format='fasta')
    try:
        # estscan reads the file by name, so the content has to be on disk
        seq_fhand.flush()
        binary = get_binary_path('estscan')

        cmd = [binary, '-t', pep_out_fpath, '-o', dna_out_fpath, '-M',
               matrix_fpath, seq_fhand.name]
        process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        check_process_finishes(process, binary=cmd[0])
    finally:
        seq_fhand.close()
Пример #17
0
def filter_duplicates(in_fhands,
                      out_fhand,
                      paired_reads,
                      use_length=None,
                      n_seqs_packet=None,
                      tempdir=None):
    """Remove the duplicated read groups and write the rest to out_fhand.

    Groups are read with _read_pairs and compared through a _PairKeyGetter
    created with use_length. With n_seqs_packet set to None the filtering is
    done by unique_unordered. Otherwise sorted_items sorts the groups first
    (using tempdir and keeping at most n_seqs_packet items in memory) and
    unique removes the duplicates from the sorted stream.

    Raises ValueError when in_fhands is empty or None.
    """
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    pair_key = _PairKeyGetter(use_length=use_length)
    candidates = _read_pairs(in_fhands, paired_reads)
    sort_first = n_seqs_packet is not None
    if sort_first:
        candidates = sorted_items(candidates,
                                  key=pair_key,
                                  tempdir=tempdir,
                                  max_items_in_memory=n_seqs_packet)
        survivors = unique(candidates, key=pair_key)
    else:
        survivors = unique_unordered(candidates, key=pair_key)

    for survivor in survivors:
        write_seqs(survivor, out_fhand)
Пример #18
0
def _run_estscan(seqs, pep_out_fpath, dna_out_fpath, matrix_fpath):
    """Run the estscan binary on the given seqs.

    write_seqs stores the seqs in a fasta file and its path is passed to
    estscan as the input, together with the peptide output path (-t), the
    dna output path (-o) and the matrix path (-M).

    The fasta file is closed in a finally clause so that a failure
    reported by check_process_finishes does not leave it open.
    """
    seq_fhand = write_seqs(seqs, file_format='fasta')
    try:
        # the binary opens the file by path: flush before launching it
        seq_fhand.flush()
        binary = get_binary_path('estscan')

        cmd = [binary, '-t', pep_out_fpath, '-o', dna_out_fpath, '-M',
               matrix_fpath, seq_fhand.name]
        process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        check_process_finishes(process, binary=cmd[0])
    finally:
        seq_fhand.close()
Пример #19
0
 def _pre_trim(self, trim_packet):
     """Build the blast matcher used to look for self.oligos in the reads.

     The reads stored under SEQS_PASSED are flattened, written to a fasta
     file and given to BlasterForFewSubjects together with self.oligos.
     The matcher is stored in self._matcher.
     """
     seqs = []
     for seq_group in trim_packet[SEQS_PASSED]:
         seqs.extend(seq_group)

     db_fhand = write_seqs(seqs, file_format="fasta")
     db_fhand.flush()

     identity_filter = {"kind": "score_threshold", "score_key": "identity", "min_score": 87}
     length_filter = {"kind": "min_length", "min_num_residues": 13, "length_in_query": False}
     blast_params = {"task": "blastn-short", "expect": "0.0001"}
     self._matcher = BlasterForFewSubjects(
         db_fhand.name,
         self.oligos,
         program="blastn",
         filters=[identity_filter, length_filter],
         params=blast_params,
         elongate_for_global=True,
     )
Пример #20
0
    def test_seqitems_io(self):
        """Check the read/write round trip for the different seq classes."""
        fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

        # SeqItem
        seqs = list(read_seqs([StringIO(fasta)],
                              prefered_seq_classes=[SEQITEM]))
        assert seqs[0].kind == SEQITEM
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
        assert seqs[0].object.name == 's1'

        # SeqRecord
        seqs = list(read_seqs([StringIO(fasta)],
                              prefered_seq_classes=[SEQRECORD]))
        assert seqs[0].kind == SEQRECORD
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand, 'fasta')
        assert out_fhand.getvalue() == fasta

        # seqitem not possible with different input and output formats
        try:
            list(read_seqs([StringIO(fasta)], out_format='fastq',
                           prefered_seq_classes=[SEQITEM]))
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # seqitem is fine when the input and output formats are the same
        seqs = list(read_seqs([StringIO(fasta)], out_format='fasta',
                              prefered_seq_classes=[SEQITEM]))
        out_fhand = StringIO()
        write_seqs(seqs, out_fhand)
        assert out_fhand.getvalue() == fasta
Пример #21
0
    def _setup_checks(self, filterpacket):
        """Map the reads of the packet with bowtie2 and store the mapped ones.

        The reads found under SEQS_PASSED are flattened and written to a
        temporary file in fasta or fastq, according to the kind of the
        first read. map_with_bowtie2 writes the mapping to a temporary bam
        file and the result of _get_mapped_reads for that bam and
        self.min_mapq is stored in self.mapped_reads.

        Raises RuntimeError when a SeqItem has a file format that is
        neither fasta nor fastq, and NotImplementedError for any other kind
        of sequence.
        """
        index_fpath = self._index_fpath
        get_or_create_bowtie2_index(index_fpath)
        seqs = [s for seqs in filterpacket[SEQS_PASSED] for s in seqs]
        first_seq = seqs[0]
        seq_class = first_seq.kind
        extra_params = []
        # Which format do we need for the bowtie2 input read file fasta or
        # fastq?
        if seq_class == SEQRECORD:
            # Plain membership works in every Python version, whereas
            # dict.viewkeys() only exists in Python 2.7.
            if 'phred_quality' in first_seq.object.letter_annotations:
                file_format = 'fastq'
            else:
                extra_params.append('-f')
                file_format = 'fasta'
        elif seq_class == SEQITEM:
            file_format = get_file_format(first_seq)
            if 'illumina' in file_format:
                extra_params.append('--phred64')
            elif 'fasta' in file_format:
                extra_params.append('-f')
            elif 'fastq' in file_format:
                pass
            else:
                msg = 'For FilterBowtie2Match and SeqItems fastq or fasta '
                msg += 'files are required'
                raise RuntimeError(msg)
        else:
            raise NotImplementedError()

        reads_fhand = NamedTemporaryFile(suffix=file_format)
        write_seqs(seqs, reads_fhand, file_format=file_format)
        # bowtie2 opens the file by name, so the content has to be on disk
        reads_fhand.flush()

        bam_fhand = NamedTemporaryFile(suffix='.bam')
        map_with_bowtie2(index_fpath,
                         bam_fhand.name,
                         unpaired_fpaths=[reads_fhand.name],
                         extra_params=extra_params)

        self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)
Пример #22
0
def classify_chimeras(in_fhand,
                      index_fpath,
                      mate_distance,
                      out_fhand,
                      chimeras_fhand=None,
                      unknown_fhand=None,
                      tempdir=None,
                      threads=None,
                      settings=get_setting('CHIMERAS_SETTINGS')):
    '''Map the reads of in_fhand and write each pair according to its kind.

    The file named by in_fhand.name is mapped with map_with_bwamem as an
    interleaved file and the mapping is sorted by queryname into a
    temporary bam. classify_mapped_reads then yields (pair, kind) tuples:
    NON_CHIMERIC pairs are written to out_fhand, CHIMERA pairs to
    chimeras_fhand and UNKNOWN pairs to unknown_fhand. The last two kinds
    are dropped when their handle is None.

    threads is accepted but it is not used in this function.
    '''
    sorted_bam = NamedTemporaryFile(suffix='.bam')
    bwa_process = map_with_bwamem(index_fpath,
                                  interleave_fpath=in_fhand.name,
                                  extra_params=['-a', '-M'])
    map_process_to_sortedbam(bwa_process,
                             sorted_bam.name,
                             key='queryname',
                             tempdir=tempdir)

    classified = classify_mapped_reads(sorted_bam,
                                       settings=settings,
                                       mate_distance=mate_distance)
    for pair, kind in classified:
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
            continue
        if kind is CHIMERA and chimeras_fhand is not None:
            write_seqs(pair, chimeras_fhand)
            continue
        if kind is UNKNOWN and unknown_fhand is not None:
            write_seqs(pair, unknown_fhand)
Пример #23
0
    def _setup_checks(self, filterpacket):
        """Map the packet reads with bowtie2 and keep the mapped reads.

        The reads under SEQS_PASSED are flattened and written to a
        temporary fasta or fastq file, chosen from the kind of the first
        read. The bowtie2 process returned by map_with_bowtie2 is converted
        to a temporary bam with map_process_to_bam, and the result of
        _get_mapped_reads for that bam and self.min_mapq is stored in
        self.mapped_reads.

        Raises RuntimeError when a SeqItem has a file format that is
        neither fasta nor fastq, and NotImplementedError for any other kind
        of sequence.
        """
        index_fpath = self._index_fpath
        get_or_create_bowtie2_index(index_fpath)
        seqs = [s for seqs in filterpacket[SEQS_PASSED] for s in seqs]
        first_seq = seqs[0]
        seq_class = first_seq.kind
        extra_params = []
        # Which format do we need for the bowtie2 input read file fasta or
        # fastq?
        if seq_class == SEQRECORD:
            # dict.viewkeys() is Python 2.7 only; testing membership on the
            # mapping itself is equivalent and works everywhere.
            if 'phred_quality' in first_seq.object.letter_annotations:
                file_format = 'fastq'
            else:
                extra_params.append('-f')
                file_format = 'fasta'
        elif seq_class == SEQITEM:
            file_format = get_file_format(first_seq)
            if 'illumina' in file_format:
                extra_params.append('--phred64')
            elif 'fasta' in file_format:
                extra_params.append('-f')
            elif 'fastq' in file_format:
                pass
            else:
                msg = 'For FilterBowtie2Match and SeqItems fastq or fasta '
                msg += 'files are required'
                raise RuntimeError(msg)
        else:
            raise NotImplementedError()

        reads_fhand = NamedTemporaryFile(suffix=file_format)
        write_seqs(seqs, reads_fhand, file_format=file_format)
        # bowtie2 opens the file by name, so the content has to be on disk
        reads_fhand.flush()

        bam_fhand = NamedTemporaryFile(suffix='.bam')
        map_process = map_with_bowtie2(index_fpath,
                                       unpaired_fpaths=[reads_fhand.name],
                                       extra_params=extra_params)
        map_process_to_bam(map_process, bam_fhand.name)

        self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)
Пример #24
0
 def _pre_trim(self, trim_packet):
     """Create the matcher that looks for self.oligos in the packet reads.

     The reads under SEQS_PASSED are flattened and written to a fasta file;
     BlasterForFewSubjects receives the path of that file and the oligos.
     The resulting matcher is kept in self._matcher.
     """
     seqs = [seq for group in trim_packet[SEQS_PASSED] for seq in group]
     db_fhand = write_seqs(seqs, file_format='fasta')
     db_fhand.flush()

     filters = [
         {'kind': 'score_threshold', 'score_key': 'identity',
          'min_score': 87},
         {'kind': 'min_length', 'min_num_residues': 13,
          'length_in_query': False},
     ]
     matcher = BlasterForFewSubjects(
         db_fhand.name, self.oligos, program='blastn', filters=filters,
         params={'task': 'blastn-short', 'expect': '0.0001'},
         elongate_for_global=True)
     self._matcher = matcher
Пример #25
0
 def _pre_trim(self, trim_packet):
     """Prepare self._matcher with the blast of self.oligos on the reads.

     All the reads stored under SEQS_PASSED are written to one fasta file,
     whose path is handed to BlasterForFewSubjects along with the oligos,
     the blastn-short parameters and the identity and length filters.
     """
     seqs = []
     for group in trim_packet[SEQS_PASSED]:
         for seq in group:
             seqs.append(seq)

     db_fhand = write_seqs(seqs, file_format='fasta')
     db_fhand.flush()

     min_identity = {'kind': 'score_threshold', 'score_key': 'identity',
                     'min_score': 87}
     min_length = {'kind': 'min_length', 'min_num_residues': 13,
                   'length_in_query': False}
     matcher_kwargs = {
         'program': 'blastn',
         'filters': [min_identity, min_length],
         'params': {'task': 'blastn-short', 'expect': '0.0001'},
         'elongate_for_global': True,
     }
     self._matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                           **matcher_kwargs)
Пример #26
0
    def test_all_orphan():
        """Check that unpaired reads are sent to the orphan output."""
        expected_records = ['>seq1\nACT\n', '>seq2\nACT\n']
        seqs = [
            SeqRecord(Seq('ACT'), id='seq1'),
            SeqRecord(Seq('ACT'), id='seq2')
        ]
        seqs = list(assing_kind_to_seqs(SEQRECORD, seqs, None))

        # ordered version: the orphans keep the input order
        pairs_fhand = StringIO()
        orphans_fhand = StringIO()
        match_pairs(seqs, pairs_fhand, orphans_fhand, out_format='fasta')
        assert orphans_fhand.getvalue() == ''.join(expected_records)

        # unordered version: it needs the path of a sequence file
        seq_fhand = NamedTemporaryFile(suffix='.fasta')
        write_seqs(seqs, seq_fhand, file_format='fasta')
        seq_fhand.flush()
        pairs_fhand = StringIO()
        orphans_fhand = StringIO()
        match_pairs_unordered(seq_fhand.name,
                              pairs_fhand,
                              orphans_fhand,
                              out_format='fasta')
        for record in expected_records:
            assert record in orphans_fhand.getvalue()
Пример #27
0
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits a sequence iterator with alternating paired reads in two.

    The reads are taken two at a time: the first one is written to
    out_fhand1 and the second one to out_fhand2, both in out_format. Each
    couple is checked with _check_name_and_direction_match before it is
    written. Both output handles are flushed at the end.

    seqs can be any iterable. The builtin next() is used instead of the
    Python 2 only .next() method, so the function also runs on Python 3.

    It raises InterleaveError if the input has an odd number of sequences.
    '''
    seqs = iter(seqs)
    while True:
        # None marks an exhausted input, as in the original implementation
        seq1 = next(seqs, None)
        seq2 = next(seqs, None)
        if seq1 is None:
            break  # we have consumed the input iterator completely
        if seq2 is None:
            msg = 'The file had an odd number of sequences'
            raise InterleaveError(msg)
        _check_name_and_direction_match(seq1, seq2)
        write_seqs([seq1], out_fhand1, out_format)
        write_seqs([seq2], out_fhand2, out_format)
    out_fhand1.flush()
    out_fhand2.flush()
Пример #28
0
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits a sequence iterator with alternating paired reads in two.

    Reads are consumed in couples. After _check_name_and_direction_match
    has checked the couple, its first read goes to out_fhand1 and its
    second read to out_fhand2, written in out_format. The two handles are
    flushed once the input is exhausted.

    Any iterable is accepted and the reads are fetched with the builtin
    next(), which exists in Python 2.6+ and in Python 3, unlike the
    .next() method.

    It raises InterleaveError if the input has an odd number of sequences.
    '''
    reads = iter(seqs)
    exhausted = None  # sentinel returned by next() when there are no reads
    while True:
        seq1 = next(reads, exhausted)
        seq2 = next(reads, exhausted)
        if seq1 is exhausted:
            break  # we have consumed the input iterator completely
        if seq2 is exhausted:
            msg = 'The file had an odd number of sequences'
            raise InterleaveError(msg)
        _check_name_and_direction_match(seq1, seq2)
        write_seqs([seq1], out_fhand1, out_format)
        write_seqs([seq2], out_fhand2, out_format)
    out_fhand1.flush()
    out_fhand2.flush()
Пример #29
0
    def _setup_checks(self, filterpacket):
        """Build the blast matcher of self.oligos against the packet reads.

        The reads under SEQS_PASSED are flattened and written to a fasta
        file that is used as the blast database, with the oligos as the
        query. The matcher is stored in self._matcher.
        """
        seqs = []
        for group in filterpacket[SEQS_PASSED]:
            seqs.extend(group)

        # we create a blastdb for these reads and then we use the oligos
        # as the blast query
        db_fhand = write_seqs(seqs, file_format='fasta')
        db_fhand.flush()

        identity_filter = {'kind': 'score_threshold', 'score_key': 'identity',
                           'min_score': 87}
        length_filter = {'kind': 'min_length', 'min_num_residues': 13,
                         'length_in_query': False}
        blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
        self._matcher = BlasterForFewSubjects(
            db_fhand.name, self.oligos, program='blastn',
            filters=[identity_filter, length_filter], params=blast_params,
            elongate_for_global=False)
Пример #30
0
    def _setup_checks(self, filterpacket):
        """Prepare self._matcher for the reads stored in the filter packet.

        Every read found under SEQS_PASSED is written to a single fasta
        file. BlasterForFewSubjects gets the path of that file and
        self.oligos, plus the blastn-short parameters and the identity and
        length filters.
        """
        seqs = [seq for group in filterpacket[SEQS_PASSED] for seq in group]

        # the reads become the blast database and the oligos the query
        db_fhand = write_seqs(seqs, file_format='fasta')
        db_fhand.flush()

        matcher_kwargs = {
            'program': 'blastn',
            'filters': [
                {'kind': 'score_threshold', 'score_key': 'identity',
                 'min_score': 87},
                {'kind': 'min_length', 'min_num_residues': 13,
                 'length_in_query': False},
            ],
            'params': {'task': 'blastn-short', 'expect': '0.0001'},
            'elongate_for_global': False,
        }
        matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                        **matcher_kwargs)
        self._matcher = matcher
Пример #31
0
def _do_blast_2(db_fpath, queries, program, dbtype=None, blast_format=None, params=None, remote=False):
    """It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields. For a remote blast, blast_format is
    upper-cased and used directly as the output format (XML by default).

    params holds extra blast parameters. It is copied before the output
    format is added, so the dict owned by the caller is left untouched.

    The return value is a tuple with the blast parser and the temporary
    file where the blast output was written.
    """

    query_fhand = write_seqs(queries, file_format="fasta")
    query_fhand.flush()

    if remote:
        blastdb = db_fpath
        fmt = "XML" if blast_format is None else blast_format.upper()
    else:
        blastdb = get_or_create_blastdb(db_fpath, dbtype=dbtype)
        if blast_format is None:
            blast_format = [
                "query",
                "subject",
                "query_length",
                "subject_length",
                "query_start",
                "query_end",
                "subject_start",
                "subject_end",
                "expect",
                "identity",
            ]
        fmt = generate_tabblast_format(blast_format)

    # Copy before adding outfmt: mutating the argument would leak the
    # format into the caller's dict and into any later call that reuses it.
    params = {} if params is None else dict(params)
    params["outfmt"] = fmt

    blast_fhand = tempfile.NamedTemporaryFile(suffix=".blast")
    do_blast(query_fhand.name, blastdb, program, blast_fhand.name, params, remote=remote)
    if remote:
        blasts = BlastParser(blast_fhand)
    else:
        blasts = TabularBlastParser(blast_fhand, blast_format)

    return blasts, blast_fhand
Пример #32
0
 def __call__(self, seqs):
     """Add VECTOR trim segments to the reads that match self.oligos.

     The reads are written to a fasta file that BlasterForFewSubjects
     searches with the oligos. For every read with matched segments the
     first item of the result is passed to _add_trim_segments with kind
     VECTOR. The same seqs object that was received is returned.
     """
     db_fhand = write_seqs(seqs, file_format='fasta')
     db_fhand.flush()

     identity_filter = {'kind': 'score_threshold', 'score_key': 'identity',
                        'min_score': 89}
     length_filter = {'kind': 'min_length', 'min_num_residues': 13,
                      'length_in_query': False}
     blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
     matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                     program='blastn',
                                     filters=[identity_filter, length_filter],
                                     params=blast_params,
                                     elongate_for_global=True)
     for read in seqs:
         segments = matcher.get_matched_segments_for_read(get_name(read))
         if segments is None:
             continue
         _add_trim_segments(segments[0], read, kind=VECTOR)
     return seqs
Пример #33
0
    def test_seqitems_io(self):
        """Check the fasta round trip for the SeqItem and SeqRecord classes."""
        content = '>s1\nACTG\n>s2 desc\nACTG\n'

        def _read(**kwargs):
            # every case reads the same fasta content from a fresh handle
            return list(read_seqs([StringIO(content)], 'fasta', **kwargs))

        # SeqItem
        seqs = _read(prefered_seq_classes=[SEQITEM])
        assert seqs[0].kind == SEQITEM
        written = StringIO()
        write_seqs(seqs, written)
        assert written.getvalue() == content
        assert seqs[0].object.name == 's1'

        # SeqRecord
        seqs = _read(prefered_seq_classes=[SEQRECORD])
        assert seqs[0].kind == SEQRECORD
        written = StringIO()
        write_seqs(seqs, written, 'fasta')
        assert written.getvalue() == content

        # seqitem not possible with different input and output formats
        try:
            _read(out_format='fastq', prefered_seq_classes=[SEQITEM])
        except ValueError:
            pass
        else:
            self.fail('ValueError expected')

        # seqitem is accepted when both formats are the same
        seqs = _read(out_format='fasta', prefered_seq_classes=[SEQITEM])
        written = StringIO()
        write_seqs(seqs, written)
        assert written.getvalue() == content
Пример #34
0
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the reads of a file and splits them according to their kind.

    The file behind in_fhand is handed to map_with_bwamem as the interleaved
    input together with index_fpath. The result is stored by
    map_process_to_sortedbam (key='queryname') in a temporary bam file and
    every (pair, kind) yielded by classify_mapped_reads for that bam is
    written to one of the output handles:

        - NON_CHIMERIC pairs go to out_fhand
        - CHIMERA pairs go to chimeras_fhand
        - UNKNOWN pairs go to unknown_fhand

    The chimeric and the unknown pairs are dropped when their handle is
    None. The threads argument is accepted but it is not used here.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    bwa_process = map_with_bwamem(index_fpath,
                                  interleave_fpath=in_fhand.name,
                                  extra_params=['-a', '-M'])
    map_process_to_sortedbam(bwa_process, sorted_bam_fhand.name,
                             key='queryname', tempdir=tempdir)

    classified_pairs = classify_mapped_reads(sorted_bam_fhand,
                                             settings=settings,
                                             mate_distance=mate_distance)
    for pair, kind in classified_pairs:
        # Choose the destination first, pairs without one are skipped
        if kind is NON_CHIMERIC:
            dest_fhand = out_fhand
        elif kind is CHIMERA and chimeras_fhand is not None:
            dest_fhand = chimeras_fhand
        elif kind is UNKNOWN and unknown_fhand is not None:
            dest_fhand = unknown_fhand
        else:
            continue
        write_seqs(pair, dest_fhand)
Пример #35
0
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands,
                    unknown_fhand, unpaired=False, paired_result=True,
                    settings=get_setting('CHIMERAS_SETTINGS'),
                    min_seed_len=None, directory=None):
    '''It maps the reads, splits them by kind and prints a summary.

    The format of the input is taken from the first handle of in_fhands.
    When unpaired is True the paths of in_fhands are mapped as they are,
    otherwise the reads are first deinterleaved into two temporary files
    that are mapped as a pair of files. The resulting bam is classified by
    classify_mapped_reads and every (pair, kind) is written according to
    its kind:

        - NON_CHIMERIC pairs go to out_fhand
        - CHIMERA pairs go to chimeras_fhand
        - UNKNOWN pairs go to unknown_fhand

    The chimeric and the unknown pairs are dropped when their handle is
    None, but they are counted as chimeric or unknown anyway, so the
    non-chimeric count of the summary never includes them.

    The summary (total, chimeric, unknown and non-chimeric counts with the
    fraction of the total they represent) is printed to the standard
    output. The fractions are reported as 0.0 when no pair was classified.
    '''
    file_format = get_format(in_fhands[0])
    if unpaired:
        unpaired_fpaths = [fhand.name for fhand in in_fhands]
        paired_fpaths = None
    else:
        # The temporary files are removed when their handles are closed, so
        # the handles are kept referenced until the function returns
        f_fhand = NamedTemporaryFile()
        r_fhand = NamedTemporaryFile()
        seqs = read_seqs(in_fhands)
        deinterleave_pairs(seqs, f_fhand, r_fhand, file_format)
        paired_fpaths = [f_fhand.name, r_fhand.name]
        unpaired_fpaths = None
    bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths,
                                   directory, file_format, min_seed_len)

    total = 0
    chimeric = 0
    unknown = 0
    for pair, kind in classify_mapped_reads(bamfile, settings=settings,
                                            paired_result=paired_result,
                                            file_format=file_format):
        total += 1
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind is CHIMERA:
            # counted even when there is no file to write the pair to
            chimeric += 1
            if chimeras_fhand is not None:
                write_seqs(pair, chimeras_fhand)
        elif kind is UNKNOWN:
            unknown += 1
            if unknown_fhand is not None:
                write_seqs(pair, unknown_fhand)
    mapped = total - chimeric - unknown

    # With no pairs every count is 0, dividing by 1.0 reports 0.0 fractions
    # instead of failing with a ZeroDivisionError
    divisor = float(total) if total else 1.0

    # A print with one parenthesized string works both as a Python 2
    # statement and as a Python 3 function call
    print('Total pairs analyzed:  %s' % total)
    print('Chimeric pairs filtered:  %s \t%s' % (chimeric,
                                                 chimeric / divisor))
    print('Unknown pairs found:  %s \t%s' % (unknown, unknown / divisor))
    print('Non-chimeric pairs:  %s \t%s' % (mapped, mapped / divisor))
Пример #36
0
def match_pairs(seqs,
                out_fhand,
                orphan_out_fhand,
                out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The reads are expected to be sorted and interleaved. Every read waits
    in the buffer of its direction until its mate, a read with the same
    name and the other direction, shows up. Then both are written to
    out_fhand and all the other buffered reads are written to
    orphan_out_fhand. The reads that are still buffered when the input is
    exhausted are orphans too.

    The reads for which _parse_pair_direction_and_name raises a
    PairDirectionError are written to orphan_out_fhand right away.

    It raises a MaxNumReadsInMem error when the number of buffered reads
    reaches memory_limit (no limit if it is None) and a MalformedFile error
    when the mate found is not the last read buffered for its direction.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_rev, buf_fwd  # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 -> buffer for the reads with the same orientation as the
        # current one
        # buf2 -> buffer for the reads with the reverse orientation as the
        # current one
        if direction == FWD:
            buf1, buf2 = buf_fwd, buf_rev
        else:
            buf1, buf2 = buf_rev, buf_fwd

        # position of the mate in the other buffer, None if not seen yet
        matching_seq_index = buf2['index'].get(seq_name)

        if matching_seq_index is None:
            # no mate yet, the read has to wait in its buffer
            buf1['items'].append(seq)
            buf1['index'][seq_name] = len(buf1['items']) - 1
            # check mem limit, the lists are counted without joining them
            sum_items = len(buf1['items']) + len(buf2['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit.'
                error_msg += 'Are you sure that the reads are sorted and '
                error_msg += 'interleaved?. You could try with the unordered'
                error_msg += ' algorith'
                raise MaxNumReadsInMem(error_msg)
            continue

        # the reads buffered before the mate have lost their chance
        orphan_seqs = buf2['items'][:matching_seq_index]
        matching_seq = buf2['items'][matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)
        if matching_seq_index != len(buf2['items']) - 1:
            msg = 'The given files are not sorted (ordered) and '
            msg += 'interleaved. You could try with the unordered algorithm'
            raise MalformedFile(msg)

        # buf1 and buf2 are just aliases of buf_fwd and buf_rev, so the
        # buffers have to be emptied in place. Rebinding the aliases would
        # leave the reads already written inside the real buffers and they
        # would be written again as orphans later on
        buf2['items'] = []
        buf2['index'] = {}

        # the reads buffered with the orientation of the current one are
        # orphans too
        write_seqs(buf1['items'], orphan_out_fhand, out_format)
        buf1['items'] = []
        buf1['index'] = {}

    # whatever remains buffered at the end of the input has no mate
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    # both outputs are flushed through the same helper
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
Пример #37
0
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The reads are expected to be sorted and interleaved. Every read waits
    in the buffer of its direction until its mate, a read with the same
    name and the other direction, shows up. Then both are written to
    out_fhand, all the other buffered reads are written to
    orphan_out_fhand and both buffers start empty again. The reads that are
    still buffered when the input is exhausted are orphans too.

    The reads for which _parse_pair_direction_and_name raises a
    PairDirectionError are written to orphan_out_fhand right away.

    It raises a MaxNumReadsInMem error when the number of buffered reads
    reaches memory_limit (no limit if it is None) and a MalformedFile error
    when the mate found is not the last read buffered for its direction.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_rev, buf_fwd  # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 -> buffer for the reads with the same orientation as the
        # current one
        # buf2 -> buffer for the reads with the reverse orientation as the
        # current one
        is_fwd = direction == FWD
        if is_fwd:
            buf1, buf2 = buf_fwd, buf_rev
        else:
            buf1, buf2 = buf_rev, buf_fwd

        # position of the mate in the other buffer, None if not seen yet
        matching_seq_index = buf2['index'].get(seq_name)

        if matching_seq_index is None:
            # no mate yet, the read has to wait in its buffer
            buf1['items'].append(seq)
            buf1['index'][seq_name] = len(buf1['items']) - 1
            # check mem limit, the lists are counted without joining them
            sum_items = len(buf1['items']) + len(buf2['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit.'
                error_msg += 'Are you sure that the reads are sorted and '
                error_msg += 'interleaved?. You could try with the unordered'
                error_msg += ' algorith'
                raise MaxNumReadsInMem(error_msg)
        else:
            # the reads buffered before the mate have lost their chance
            orphan_seqs = buf2['items'][:matching_seq_index]
            matching_seq = buf2['items'][matching_seq_index]
            write_seqs(orphan_seqs, orphan_out_fhand, out_format)
            write_seqs([matching_seq, seq], out_fhand, out_format)
            if matching_seq_index != len(buf2['items']) - 1:
                msg = 'The given files are not sorted (ordered) and '
                msg += 'interleaved. You could try with the unordered algorithm'
                raise MalformedFile(msg)
            buf2 = {'index': {}, 'items': []}
            # the reads buffered with the orientation of the current one
            # are orphans too
            write_seqs(buf1['items'], orphan_out_fhand, out_format)
            buf1 = {'index': {}, 'items': []}

        # buf1 and buf2 may have been replaced by new empty buffers, so
        # they are stored back as the buffers of their directions
        if is_fwd:
            buf_fwd, buf_rev = buf1, buf2
        else:
            buf_rev, buf_fwd = buf1, buf2

    # whatever remains buffered at the end of the input has no mate
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    # both outputs are flushed through the same helper
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)