Exemplo n.º 1
0
def test_database_insert():
    """Inserting sequences should populate both the returned record and the
    ``sequence`` table, and assign distinct ids to distinct sequences."""
    A = Alphabet('ACGT')
    S = A.parse('AACT', name='foo')
    db = DB(':memory:', A)
    db.initialize()
    attrs = {'key': 'value'}
    rec = db.insert(S, source_file='source.fa', source_pos=10, attrs=attrs)
    assert isinstance(rec.id, int)
    assert rec.content_id == S.content_id
    assert rec.source_pos == 10
    assert rec.source_file == 'source.fa'
    assert 'key' in rec.attrs and rec.attrs['key'] == 'value', \
        'attributes must be populated correctly'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT content_id FROM sequence WHERE id = ?',
                       (rec.id,))
        # NOTE for some reason if we just say next(cursor) ==  ...
        # the cursor remains open after the context is over (which should
        # not happen as per docs). This leads to BusyError further down.
        assert cursor.fetchall() == [(S.content_id,)], \
            'content identifier is properly populated'

    # add a second sequence
    T = A.parse('GCTG', name='bar')
    new_rec = db.insert(T)
    assert new_rec.id != rec.id, 'new ids are assigned to new sequences'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT content_id FROM sequence WHERE id = ?',
                       (new_rec.id,))
        # Drain the cursor with fetchall() here as well, for the same reason
        # as in the first query: next(cursor) would leave it open once the
        # connection context exits. This also checks that exactly one row
        # matches the new id.
        assert cursor.fetchall() == [(T.content_id,)], \
            'correct id must be populated'
Exemplo n.º 2
0
def test_database_populate_fasta():
    """Loading a FASTA stream (without reverse complements) should yield one
    record per sequence, each retrievable again from its source."""
    alphabet = Alphabet('ACGT')
    seq_S = alphabet.parse('AACT', name='S')
    seq_T = alphabet.parse('GCAT', name='T')
    expected = [seq_S, seq_T]

    db = DB(':memory:', alphabet)
    db.initialize()

    # an in-memory FASTA "file"; the name is what records report as source
    handle = StringIO()
    handle.name = '/x.fasta'
    write_fasta(handle, expected)
    handle.seek(0)

    records = db.load_fasta(handle, rc=False)
    assert len(records) == 2
    assert all(isinstance(record, Record) for record in records)
    assert all(record.source_file == handle.name for record in records), \
        'source file of sequence records must be set'

    reloaded = [db.load_from_record(record, handle) for record in records]
    assert reloaded == expected, \
        'should be able to retrieve sequences by position in source'

    # without an explicit handle the database opens the source file itself;
    # redirect that open() call to our in-memory stream.
    with patch('biseqt.database.open', create=True) as fake_open:
        fake_open.return_value = MagicMock(spec=file, wraps=handle)
        first = db.load_from_record(records[0])
        assert first == seq_S, \
            'load_from_record should work without an open file handle'
Exemplo n.º 3
0
def test_database_basic():
    """A database can be initialized (repeatedly) and rejects bad paths."""
    alphabet = Alphabet('ACGT')
    db = DB(':memory:', alphabet)
    # initialize() should be safe to call more than once
    for _ in range(2):
        db.initialize()

    with db.connection() as conn:
        # querying the sequence table only succeeds if it has been created
        conn.cursor().execute('SELECT * FROM sequence LIMIT 1;')

    bad_path = '/cannot/possibly/exist/directory/'
    with pytest.raises(AssertionError):
        DB(bad_path, alphabet)
Exemplo n.º 4
0
def test_database_overwrite():
    """Re-inserting a sequence with an already seen content id is a no-op:
    the originally stored source file must be kept."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='foo')
    db = DB(':memory:', alphabet)
    db.initialize()
    # same sequence twice, differing only in the reported source file
    for source in ('old_source.fa', 'new_source.fa'):
        db.insert(seq, source_file=source)

    query = 'SELECT source_file FROM sequence WHERE content_id = ?'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(query, (seq.content_id,))
        sources = [row[0] for row in cursor]
        assert sources == ['old_source.fa'], \
            'Sequences with observed content id should be ignored'
Exemplo n.º 5
0
def test_database_find():
    """find() should filter records by a raw SQL condition and by a Python
    callable."""
    alphabet = Alphabet('ACGT')
    seq_foo = alphabet.parse('AACT', name='foo')
    seq_bar = alphabet.parse('GGCT', name='bar')
    db = DB(':memory:', alphabet)
    db.initialize()
    for seq in (seq_foo, seq_bar):
        db.insert(seq)

    # match on the serialized attrs column
    sql_condition = "attrs LIKE '%s'" % '%"name": "bar"%'
    by_sql = list(db.find(sql_condition=sql_condition))
    assert len(by_sql) == 1 and by_sql[0].content_id == seq_bar.content_id, \
        'find() should work with sql_condition'

    def is_named_foo(record):
        return record.attrs['name'] == 'foo'

    by_callable = list(db.find(condition=is_named_foo))
    assert len(by_callable) == 1 and \
        by_callable[0].content_id == seq_foo.content_id, \
        'find() should work with callable condition'
Exemplo n.º 6
0
def test_database_events():
    """Listeners registered on a database should fire once on initialization
    and once per inserted sequence."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='S')

    # NOTE python 2 has no ``nonlocal``; keep the call counter as an attribute
    # of this test function so the nested callback can update it.
    test_database_events.callback_called = 0

    def count_call(self, *args):
        test_database_events.callback_called += 1

    db = DB(':memory:', alphabet)
    for event in ('db-initialized', 'sequence-inserted'):
        db.add_event_listener(event, count_call)

    db.initialize()
    assert test_database_events.callback_called == 1, \
        'event callbacks for "initialize" should be executed'

    db.insert(seq)
    assert test_database_events.callback_called == 2, \
        'event callbacks for "insert-sequence" should be executed'
Exemplo n.º 7
0
def test_database_populate_fasta_rc():
    """Loading a FASTA stream with ``rc=True`` should also store reverse
    complements, each pointing back at its origin through ``rc_of``."""
    alphabet = Alphabet('ACGT')
    seq_S = alphabet.parse('AACT', name='S')
    seq_T = alphabet.parse('GCAT', name='T')

    db = DB(':memory:', alphabet)
    db.initialize()
    handle = StringIO()
    write_fasta(handle, [seq_S, seq_T])
    handle.seek(0)
    records = db.load_fasta(handle, rc=True)

    # two sequences plus their two reverse complements
    assert len(records) == 4
    origins = [rec.attrs['rc_of'] for rec in records if 'rc_of' in rec.attrs]
    assert origins == [seq_S.content_id, seq_T.content_id], \
        'reverse complements should know what their origin is'

    def is_rc_of_T(rec):
        return rec.attrs.get('rc_of', None) == seq_T.content_id

    rc_record = next(db.find(condition=is_rc_of_T))
    # build the expected reverse complement by hand: reverse, then swap A<->T
    # and C<->G
    expected = seq_T.reverse().transform(['AT', 'CG'],
                                         name='(rc) ' + seq_T.name)
    assert db.load_from_record(rc_record, handle) == expected, \
        'reverse complements should load properly from a record'
Exemplo n.º 8
0
class ReadMapper(object):
    """Maps reads stored in a sequence database against each other or against
    reference sequences.

    The mapper owns a :class:`DB` together with a :class:`KmerIndex` and a
    :class:`SeedIndex` built on top of it. Reads are the records loaded with
    their reverse complements (i.e. those that have a partner record carrying
    an ``rc_of`` attribute); every other record is treated as a reference.

    Attributes:
        db (DB): The underlying sequence database.
        kmer_index (KmerIndex): Kmer index over ``db``.
        seed_index (SeedIndex): Seed index over ``kmer_index``.
        bands_indexed (bool): Whether :func:`index_bands` has been called;
            the ``map_all_to_*`` methods require this.
    """
    def __init__(self, alphabet, wordlen, db_path):
        """
        Args:
            alphabet: The alphabet handed to the database.
            wordlen: The word length handed to the kmer index.
            db_path: The path handed to the database.
        """
        self.db = DB(db_path, alphabet)
        self.kmer_index = KmerIndex(self.db, wordlen)
        self.seed_index = SeedIndex(self.kmer_index)
        self.bands_indexed = False

    def log(self, *args, **kwargs):
        """Forwards all arguments to the database's ``log`` method."""
        self.db.log(*args, **kwargs)

    def initialize(self, reads_fa, refs_fa=None, num_reads=-1):
        """Initializes the database and loads sequences into it.

        Args:
            reads_fa (str): Path to the FASTA file of reads; these are loaded
                together with their reverse complements.
            refs_fa (str|None): Optional path to a FASTA file of reference
                sequences; these are loaded without reverse complements.
            num_reads (int): Passed as ``num`` to ``load_fasta`` for the reads
                file (presumably -1 means no limit -- confirm against
                ``DB.load_fasta``).
        """
        self.db.initialize()
        with open(reads_fa) as f:
            self.db.load_fasta(f, num=num_reads, rc=True)
        if refs_fa is not None:
            with open(refs_fa) as f:
                self.db.load_fasta(f, rc=False)

    def index_bands(self, **kw):
        """Scores kmers and diagonals and marks bands as indexed.

        Args:
            **kw: Passed through to ``SeedIndex.score_diagonals``.
        """
        self.kmer_index.score_kmers()
        self.seed_index.score_diagonals(**kw)
        self.bands_indexed = True

    def load_reads(self):
        """Loads all reads, i.e. pairs of a record and its reverse complement.

        Returns:
            list: :class:`Read` objects, one per record that has an ``rc_of``
            attribute, sorted by ``read.record.id``.

        Raises:
            KeyError: If the record named by an ``rc_of`` attribute is not in
                the database.
        """
        recs_by_content_id = {r.content_id: r for r in self.db.find()}
        reads = [
            Read(self.seed_index,
                 recs_by_content_id[record.attrs['rc_of']], record)
            for record in recs_by_content_id.values()
            if 'rc_of' in record.attrs
        ]
        return sorted(reads, key=lambda read: read.record.id)

    def load_refs(self):
        """Loads all reference records, i.e. those that are neither a reverse
        complement nor the origin of one.

        Returns:
            list: The records left after removing every record that has an
            ``rc_of`` attribute along with the record it points to.
        """
        recs_by_content_id = {r.content_id: r for r in self.db.find()}
        # Iterate over a snapshot: entries are removed from the dict while we
        # walk it, which is an error on a live dict view.
        for record in list(recs_by_content_id.values()):
            if 'rc_of' in record.attrs:
                # tolerate an origin that is missing or was already removed
                recs_by_content_id.pop(record.attrs['rc_of'], None)
                recs_by_content_id.pop(record.content_id, None)
        return list(recs_by_content_id.values())

    def map_all_to_all(self, min_band_score, **aligner_kw):
        """Maps every read against every read with a larger id.

        Args:
            min_band_score: Passed through to ``Read.map``.
            **aligner_kw: Passed through to ``Read.map``.

        Yields:
            tuple:
                The 3-tuple ``(rec, target_rec, aln)`` returned by
                ``Read.map`` for each pair where ``rec`` is not ``None``.
        """
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against each other')
        reads = self.load_reads()  # NOTE comes in sorted order of id
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        for read in reads:
            indic.progress()
            # NOTE only compare to reads after us
            others = (r.record for r in reads if r.record.id > read.record.id)
            for other in others:
                # NOTE(review): a single record is passed here whereas
                # map_all_to_refs passes a whole collection of records;
                # confirm Read.map accepts both.
                rec, target_rec, aln = read.map(other,
                                                min_band_score=min_band_score,
                                                **aligner_kw)
                if rec is None:
                    continue
                yield rec, target_rec, aln
        indic.finish()

    def map_all_to_refs(self, min_band_score, **aligner_kw):
        """Maps every read against the reference sequences.

        Args:
            min_band_score: Passed through to ``Read.map``.
            **aligner_kw: Passed through to ``Read.map``.

        Yields:
            tuple:
                The 3-tuple ``(rec, target_rec, aln)`` returned by
                ``Read.map`` for each read where ``rec`` is not ``None``.
        """
        # FIXME it would be nice to only calculate bands for read v. ref not
        # all pairwise of reads too
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against reference sequences')
        reads, refs = self.load_reads(), self.load_refs()
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        for read in reads:
            indic.progress()
            rec, target_rec, aln = read.map(refs,
                                            min_band_score=min_band_score,
                                            **aligner_kw)
            if rec is not None:
                yield rec, target_rec, aln
        indic.finish()

    def mappings_from_sam(self, sampath):
        """Loads mappings from a SAM mapping file and translates query names
        to the reads stored in this mapper's database. Mappings whose query
        name does not match the ``name`` attribute of any read are skipped.

        Args:
            sampath (str): The path to SAM mappings file.

        Yields:
            tuple:
                A 3-tuple containing the :class:`Read`, the reference name and
                the ``pysam.calignedsegment.AlignedSegment`` mapping it to the
                reference.
        """
        self.log('Loading SAM mappings from %s.' % sampath)
        reads_by_name = {r.record.attrs['name']: r for r in self.load_reads()}
        samfile = pysam.AlignmentFile(sampath)
        try:
            for mapping in samfile.fetch():
                qname, rname = mapping.query_name, mapping.reference_name
                # NOTE this is because BLASR does a weird thing with sequence
                # names
                qname = qname.rsplit('/', 1)[0]
                if qname not in reads_by_name:
                    continue
                yield reads_by_name[qname], rname, mapping
        finally:
            # release the file handle even if the consumer stops early
            samfile.close()

    def overlaps_from_sam_mappings(self, sampath, min_overlap=-1):
        """Finds all pairs of overlapping sequences based on their mappings to
        a reference.

        Only one mapping per read is considered: if the SAM file contains
        several mappings for the same read, the last one wins.

        Args:
            sampath (str): The path to SAM mappings file.
            min_overlap (int): The minimum required length for overlaps to be
                reported; default is -1 in which case no overlap is excluded.

        Yields:
            tuple:
                A pair of sequence records deemed as overlapping based on SAM
                mappings. For each overlapping pair of reads (taken in
                increasing order of record id) two pairs are yielded: same
                strand with same strand when both reads map in the same
                orientation, and forward with reverse complement otherwise.
        """
        self.log('Finding overlaps from SAM mappings.')
        mappings = {read.record.id: (read, ref, mapping)
                    for read, ref, mapping in self.mappings_from_sam(sampath)}
        seqids = sorted(mappings)
        for id0, id1 in combinations(seqids, 2):
            (r0, ref0, map0), (r1, ref1, map1) = mappings[id0], mappings[id1]
            if ref0 != ref1:
                continue
            # TODO ignoring query_alignment_start and query_alignment_end
            overlap_len = min(map0.reference_end, map1.reference_end) - \
                max(map0.reference_start, map1.reference_start)
            if overlap_len <= 0 or overlap_len < min_overlap:
                continue
            # FIXME the second thing we yield is not reported by our own
            # map_all_to_all.
            if map0.is_reverse == map1.is_reverse:
                yield r0.record, r1.record
                yield r0.rc_record, r1.rc_record
            else:
                yield r0.record, r1.rc_record
                yield r0.rc_record, r1.record