Exemplo n.º 1
0
def test_database_populate_fasta():
    """Check that loading a FASTA stream with ``rc=False`` inserts one record
    per sequence and that each record can be loaded back as its sequence,
    both from an explicit handle and through a patched ``open``."""
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    database = DB(':memory:', alphabet)
    database.initialize()

    # In-memory stand-in for a FASTA file; the name is what the records are
    # expected to report as their source file.
    stream = StringIO()
    stream.name = '/x.fasta'
    write_fasta(stream, [seq_s, seq_t])
    stream.seek(0)

    records = database.load_fasta(stream, rc=False)
    assert len(records) == 2
    for record in records:
        assert isinstance(record, Record)
    for record in records:
        assert record.source_file == stream.name, \
            'source file of sequence records must be set'

    reloaded = [database.load_from_record(record, stream)
                for record in records]
    assert reloaded == [seq_s, seq_t], \
        'should be able to retrieve sequences by position in source'

    # Without a handle the database module has to open the source itself;
    # route that ``open`` call back to the in-memory stream.
    with patch('biseqt.database.open', create=True) as open_mock:
        open_mock.return_value = MagicMock(spec=file, wraps=stream)
        first_loaded = database.load_from_record(records[0])
        assert first_loaded == seq_s, \
            'load_from_record should work without an open file handle'
Exemplo n.º 2
0
def seed_index():
    """Build and return a seed index backed by an in-memory database.

    The database uses the ``ACGT`` alphabet and is populated, via a FASTA
    stream, with 3 random sequences of length 100 named ``#0``, ``#1`` and
    ``#2``. The seed index wraps a kmer index of word length 5 and has
    ``index_seeds()`` called on it before being returned.
    """
    alphabet = Alphabet('ACGT')
    num_seqs, seq_len, wordlen = 3, 100, 5

    db = DB(':memory:', alphabet)
    kmer_index = KmerIndex(db, wordlen)
    index = SeedIndex(kmer_index)
    index.db.initialize()

    def named_random_seqs():
        # Sequences are produced lazily, one at a time, as they are written.
        for i in range(num_seqs):
            yield rand_seq(alphabet, seq_len).to_named('#%d' % i)

    fasta = StringIO()
    write_fasta(fasta, named_random_seqs())
    fasta.seek(0)

    db.load_fasta(fasta)
    index.index_seeds()
    return index
Exemplo n.º 3
0
def test_database_populate_fasta_rc():
    """Check that loading a FASTA stream with ``rc=True`` also inserts a
    reverse complement record for every sequence, that each such record
    points at its origin, and that it loads back as the reverse
    complement."""
    alphabet = Alphabet('ACGT')
    seq_s = alphabet.parse('AACT', name='S')
    seq_t = alphabet.parse('GCAT', name='T')

    database = DB(':memory:', alphabet)
    database.initialize()
    stream = StringIO()
    write_fasta(stream, [seq_s, seq_t])
    stream.seek(0)
    records = database.load_fasta(stream, rc=True)

    # Two sequences plus one reverse complement each.
    assert len(records) == 4
    origins = [rec.attrs['rc_of'] for rec in records if 'rc_of' in rec.attrs]
    assert origins == [seq_s.content_id, seq_t.content_id], \
        'reverse complements should know what their origin is'

    def is_rc_of_t(record):
        return record.attrs.get('rc_of') == seq_t.content_id

    rc_record = next(database.find(condition=is_rc_of_t))
    rc_name = '(rc) ' + seq_t.name
    expected_rc = seq_t.reverse().transform(['AT', 'CG'], name=rc_name)
    assert database.load_from_record(rc_record, stream) == expected_rc, \
        'reverse complements should load properly from a record'
Exemplo n.º 4
0
class ReadMapper(object):
    """Maps reads against each other or against reference sequences.

    Bundles a sequence database with a kmer index and a seed index built on
    top of it, and offers helpers to compare the resulting mappings with
    those found in a SAM file.

    Args:
        alphabet: The alphabet handed to the underlying ``DB``.
        wordlen: The word length handed to the ``KmerIndex``.
        db_path: The path handed to the underlying ``DB``.
    """

    def __init__(self, alphabet, wordlen, db_path):
        self.db = DB(db_path, alphabet)
        self.kmer_index = KmerIndex(self.db, wordlen)
        self.seed_index = SeedIndex(self.kmer_index)
        # Set by index_bands(); the map_all_* methods refuse to run before.
        self.bands_indexed = False

    def log(self, *args, **kwargs):
        """Forwards all arguments to the database's ``log`` method."""
        self.db.log(*args, **kwargs)

    def initialize(self, reads_fa, refs_fa=None, num_reads=-1):
        """Initializes the database and loads sequences into it.

        Reads are loaded with their reverse complements (``rc=True``),
        references without (``rc=False``); :func:`load_reads` and
        :func:`load_refs` rely on this to tell the two apart.

        Args:
            reads_fa (str): Path to the FASTA file of reads.
            refs_fa (str): Optional path to the FASTA file of references.
            num_reads (int): Passed as ``num`` to ``DB.load_fasta`` for the
                reads file; default is -1.
        """
        self.db.initialize()
        with open(reads_fa) as f:
            self.db.load_fasta(f, num=num_reads, rc=True)
        if refs_fa is not None:
            with open(refs_fa) as f:
                self.db.load_fasta(f, rc=False)

    def index_bands(self, **kw):
        """Scores kmers and diagonals and marks bands as indexed.

        Args:
            **kw: Passed through to ``SeedIndex.score_diagonals``.
        """
        self.kmer_index.score_kmers()
        self.seed_index.score_diagonals(**kw)
        self.bands_indexed = True

    def load_reads(self):
        """Loads all reads from the database.

        A read is built for every record carrying an ``rc_of`` attribute by
        pairing it with the record whose content id that attribute names.

        Returns:
            list: ``Read`` objects sorted by ``read.record.id``.

        Raises:
            KeyError: If an ``rc_of`` attribute names a content id that is
                not in the database.
        """
        recs_by_content_id = {r.content_id: r for r in self.db.find()}
        reads = []
        for record in recs_by_content_id.values():
            if 'rc_of' in record.attrs:
                pair = (recs_by_content_id[record.attrs['rc_of']], record)
                reads.append(Read(self.seed_index, *pair))
        return sorted(reads, key=lambda read: read.record.id)

    def load_refs(self):
        """Loads all reference records from the database.

        A record counts as a reference if it neither has an ``rc_of``
        attribute nor is named by the ``rc_of`` attribute of another record.

        Returns:
            list: The reference records.
        """
        recs_by_content_id = {r.content_id: r for r in self.db.find()}
        # Collect the content ids to exclude first instead of popping from
        # the dict while iterating over it, which raises RuntimeError on
        # Python 3.
        read_content_ids = set()
        for record in recs_by_content_id.values():
            if 'rc_of' in record.attrs:
                read_content_ids.add(record.attrs['rc_of'])
                read_content_ids.add(record.content_id)
        return [record
                for content_id, record in recs_by_content_id.items()
                if content_id not in read_content_ids]

    def map_all_to_all(self, min_band_score, **aligner_kw):
        """Maps every read against every read with a larger record id.

        Args:
            min_band_score: Passed through to ``Read.map``.
            **aligner_kw: Passed through to ``Read.map``.

        Yields:
            tuple:
                The 3-tuple returned by ``Read.map`` for each compared pair,
                skipping those whose first element is ``None``.
        """
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against each other')
        reads = self.load_reads()  # NOTE comes in sorted order of id
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        # Finish the indicator even if the consumer stops early or mapping
        # raises.
        try:
            for read in reads:
                indic.progress()
                # NOTE only compare to reads after us
                others = (r.record for r in reads
                          if r.record.id > read.record.id)
                for other in others:
                    rec, target_rec, aln = read.map(
                        other, min_band_score=min_band_score, **aligner_kw)
                    if rec is None:
                        continue
                    yield rec, target_rec, aln
        finally:
            indic.finish()

    def map_all_to_refs(self, min_band_score, **aligner_kw):
        """Maps every read against the reference records.

        Args:
            min_band_score: Passed through to ``Read.map``.
            **aligner_kw: Passed through to ``Read.map``.

        Yields:
            tuple:
                The 3-tuple returned by ``Read.map`` for each read, skipping
                those whose first element is ``None``.
        """
        # FIXME it would be nice to only calculate bands for read v. ref not
        # all pairwise of reads too
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against reference sequences')
        reads, refs = self.load_reads(), self.load_refs()
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        # Finish the indicator even if the consumer stops early or mapping
        # raises.
        try:
            for read in reads:
                indic.progress()
                # NOTE(review): Read.map receives the whole collection of
                # reference records here but a single record in
                # map_all_to_all -- confirm it accepts both.
                rec, target_rec, aln = read.map(
                    refs, min_band_score=min_band_score, **aligner_kw)
                if rec is not None:
                    yield rec, target_rec, aln
        finally:
            indic.finish()

    def mappings_from_sam(self, sampath):
        """Loads mappings from a SAM mapping file and matches query names
        to the reads stored in the database. Mappings whose query name does
        not match the ``name`` attribute of any read are skipped.

        Args:
            sampath (str): The path to SAM mappings file.

        Yields:
            tuple:
                A 3-tuple containing the read, the reference name and
                the ``pysam.calignedsegment.AlignedSegment`` mapping it to the
                reference.
        """
        self.log('Loading SAM mappings from %s.' % sampath)
        reads_by_name = {r.record.attrs['name']: r for r in self.load_reads()}
        samfile = pysam.AlignmentFile(sampath)
        # Close the file once the generator is exhausted or closed.
        try:
            for mapping in samfile.fetch():
                qname, rname = mapping.query_name, mapping.reference_name
                # NOTE this is because BLASR does a weird thing with sequence
                # names
                qname = qname.rsplit('/', 1)[0]
                if qname not in reads_by_name:
                    continue
                yield reads_by_name[qname], rname, mapping
        finally:
            samfile.close()

    def overlaps_from_sam_mappings(self, sampath, min_overlap=-1):
        """Finds all pairs of overlapping sequences based on their mappings to
        a reference.

        Args:
            sampath (str): The path to SAM mappings file.
            min_overlap (int): The minimum required length for overlaps to be
                reported; default is -1 in which case no overlap is excluded.

        Yields:
            tuple:
                A pair of records deemed as overlapping based on SAM
                mappings. Two pairs are yielded for each overlapping pair of
                reads (taken in increasing order of record id): record with
                record and rc with rc if both mappings have the same
                ``is_reverse`` value, record with rc and rc with record
                otherwise.
        """
        self.log('Finding overlaps from SAM mappings.')
        # Keyed by record id, so a later mapping of the same read replaces an
        # earlier one.
        mappings = {read.record.id: (read, ref, mapping)
                    for read, ref, mapping in self.mappings_from_sam(sampath)}
        seqids = sorted(mappings)
        for id0, id1 in combinations(seqids, 2):
            (r0, ref0, map0), (r1, ref1, map1) = mappings[id0], mappings[id1]
            if ref0 != ref1:
                continue
            # TODO ignoring query_alignment_start and query_alignment_end
            overlap_len = min(map0.reference_end, map1.reference_end) - \
                max(map0.reference_start, map1.reference_start)
            if overlap_len <= 0 or overlap_len < min_overlap:
                continue
            # FIXME the second thing we yield is not reported by our own
            # map_all_to_all.
            if map0.is_reverse == map1.is_reverse:
                yield r0.record, r1.record
                yield r0.rc_record, r1.rc_record
            else:
                yield r0.record, r1.rc_record
                yield r0.rc_record, r1.record