Exemplo n.º 1
0
def sequencing_sample(request):
    """Creates a random sequence, generates reads, with parameterized mutation
    probabilities, of equal length starting at whole multiples of half of read
    length. It is expected that successive reads have an overlap starting at
    their halfway position.

    Returns:
        tuple:
            A 5-tuple containing the full genome, the list of mutated reads,
            the database records inserted for the reads, the gap probability,
            and the seed index.
    """
    A = Alphabet('ACGT')
    # parameterized by pytest: (gap probability, substitution probs, word length)
    gap_prob, subst_prob, wordlen = request.param
    seq_len, read_len = 2000, 500
    seq = rand_seq(A, seq_len).to_named('genome')
    # gap open and gap extend probabilities are deliberately identical
    mutation_process = MutationProcess(A, subst_probs=subst_prob,
                                       go_prob=gap_prob, ge_prob=gap_prob)
    reads = []
    # reads start at multiples of half the read length so that consecutive
    # reads overlap by exactly half their length
    for i in range(0, seq_len - read_len, int(read_len/2)):
        read, _ = mutation_process.mutate(seq[i: i + read_len])
        reads += [read.to_named('read#%d' % i)]

    db = DB(':memory:', A)
    kmer_index = KmerIndex(db, wordlen)
    seed_index = SeedIndex(kmer_index)
    seed_index.db.initialize()
    records = [db.insert(r) for r in reads]
    return seq, reads, records, gap_prob, seed_index
Exemplo n.º 2
0
def test_database_basic():
    """Initialization must be idempotent and bad paths must be rejected."""
    alphabet = Alphabet('ACGT')
    database = DB(':memory:', alphabet)
    # calling initialize more than once must be harmless
    for _ in range(2):
        database.initialize()
    with database.connection() as conn:
        # the sequence table must exist after initialization
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM sequence LIMIT 1;')

    with pytest.raises(AssertionError):
        DB('/cannot/possibly/exist/directory/', alphabet)
Exemplo n.º 3
0
def test_database_insert():
    """Inserted sequences must yield records mirrored in the sequence table."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='foo')
    db = DB(':memory:', alphabet)
    db.initialize()
    rec = db.insert(seq, source_file='source.fa', source_pos=10,
                    attrs={'key': 'value'})
    assert isinstance(rec.id, int)
    assert rec.content_id == seq.content_id
    assert rec.source_pos == 10
    assert rec.source_file == 'source.fa'
    assert 'key' in rec.attrs and rec.attrs['key'] == 'value', \
        'attributes must be populated correctly'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT content_id FROM sequence WHERE id = ?',
                       (rec.id,))
        # NOTE for some reason if we just say next(cursor) ==  ...
        # the cursor remains open after the context is over (which should
        # not happen as per docs). This leads to BusyError further down.
        assert cursor.fetchall() == [(seq.content_id,)], \
            'content identifier is properly populated'

    # add a second sequence
    other = alphabet.parse('GCTG', name='bar')
    new_rec = db.insert(other)
    assert new_rec.id != rec.id, 'new ids are assigned to new sequences'
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute('SELECT content_id FROM sequence WHERE id = ?',
                       (new_rec.id,))
        assert next(cursor) == (other.content_id,), \
            'correct id must be populated'
Exemplo n.º 4
0
def seed_index():
    """Creates a database, a kmer index, and a seed index with word length 5
    stored in memory and returns the seed index. The database is populated
    with 3 random sequences of length 100 and all kmers and seeds are
    indexed."""
    alphabet = Alphabet('ACGT')
    wordlen, num_seqs, seq_len = 5, 3, 100

    database = DB(':memory:', alphabet)
    index = SeedIndex(KmerIndex(database, wordlen))
    index.db.initialize()

    # write the random sequences to an in-memory FASTA stream
    fasta = StringIO()
    write_fasta(fasta, (rand_seq(alphabet, seq_len).to_named('#%d' % i)
                        for i in range(num_seqs)))
    fasta.seek(0)

    database.load_fasta(fasta)
    index.index_seeds()
    return index
Exemplo n.º 5
0
def test_database_populate_fasta():
    """FASTA loading must create records retrievable by source position."""
    alphabet = Alphabet('ACGT')
    seqs = [alphabet.parse('AACT', name='S'), alphabet.parse('GCAT', name='T')]

    db = DB(':memory:', alphabet)
    db.initialize()

    fasta = StringIO()
    fasta.name = '/x.fasta'

    write_fasta(fasta, seqs)
    fasta.seek(0)
    inserted = db.load_fasta(fasta, rc=False)
    assert len(inserted) == 2
    assert all(isinstance(r, Record) for r in inserted)
    assert all(rec.source_file == fasta.name for rec in inserted), \
        'source file of sequence records must be set'
    assert [db.load_from_record(rec, fasta) for rec in inserted] == seqs, \
        'should be able to retrieve sequences by position in source'

    # patch open() inside the database module so load_from_record can
    # reopen the (fake) source file on its own
    with patch('biseqt.database.open', create=True) as open_mock:
        open_mock.return_value = MagicMock(spec=file, wraps=fasta)
        assert db.load_from_record(inserted[0]) == seqs[0], \
            'load_from_record should work without an open file handle'
Exemplo n.º 6
0
def test_database_events():
    """Event listeners must be invoked when their registered events fire."""
    A = Alphabet('ACGT')
    S = A.parse('AACT', name='S')

    # NOTE python 2 does not support non-local, non-global variables, put it in
    # the function object.
    test_database_events.callback_called = 0

    def callback(self, *args):
        test_database_events.callback_called += 1

    db = DB(':memory:', A)
    db.add_event_listener('db-initialized', callback)
    db.add_event_listener('sequence-inserted', callback)
    db.initialize()
    # FIX: assertion messages previously named nonexistent events
    # ("initialize", "insert-sequence"); they now match the registered names.
    assert test_database_events.callback_called == 1, \
        'event callbacks for "db-initialized" should be executed'

    db.insert(S)
    assert test_database_events.callback_called == 2, \
        'event callbacks for "sequence-inserted" should be executed'
Exemplo n.º 7
0
def test_database_find():
    """find() must accept both raw SQL conditions and python callables."""
    alphabet = Alphabet('ACGT')
    foo = alphabet.parse('AACT', name='foo')
    bar = alphabet.parse('GGCT', name='bar')
    db = DB(':memory:', alphabet)
    db.initialize()
    for seq in (foo, bar):
        db.insert(seq)

    # match on the JSON-serialized attrs column
    sql_condition = "attrs LIKE '%s'" % '%"name": "bar"%'
    found = list(db.find(sql_condition=sql_condition))
    assert len(found) == 1 and found[0].content_id == bar.content_id, \
        'find() should work with sql_condition'

    found = list(db.find(condition=lambda rec: rec.attrs['name'] == 'foo'))
    assert len(found) == 1 and found[0].content_id == foo.content_id, \
        'find() should work with callable condition'
Exemplo n.º 8
0
def test_database_overwrite():
    """Re-inserting a sequence with a known content id must be ignored."""
    alphabet = Alphabet('ACGT')
    seq = alphabet.parse('AACT', name='foo')
    db = DB(':memory:', alphabet)
    db.initialize()
    for source in ('old_source.fa', 'new_source.fa'):
        db.insert(seq, source_file=source)
    with db.connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT source_file FROM sequence WHERE content_id = ?',
            (seq.content_id,)
        )
        sources = [row[0] for row in cursor]
        assert len(sources) == 1 and sources[0] == 'old_source.fa', \
            'Sequences with observed content id should be ignored'
Exemplo n.º 9
0
def test_database_populate_fasta_rc():
    """Loading with rc=True must also insert reverse complement records."""
    alphabet = Alphabet('ACGT')
    S = alphabet.parse('AACT', name='S')
    T = alphabet.parse('GCAT', name='T')

    db = DB(':memory:', alphabet)
    db.initialize()
    fasta = StringIO()
    write_fasta(fasta, [S, T])
    fasta.seek(0)
    inserted = db.load_fasta(fasta, rc=True)

    # two originals plus two reverse complements
    assert len(inserted) == 4
    origins = [rec.attrs['rc_of'] for rec in inserted if 'rc_of' in rec.attrs]
    assert origins == [S.content_id, T.content_id], \
        'reverse complements should know what their origin is'

    found_T_rc = next(db.find(
        condition=lambda r: r.attrs.get('rc_of', None) == T.content_id))
    T_rc = T.reverse().transform(['AT', 'CG'], name='(rc) ' + T.name)
    assert db.load_from_record(found_T_rc, fasta) == T_rc, \
        'reverse complements should load properly from a record'
Exemplo n.º 10
0
 def __init__(self, alphabet, wordlen, db_path):
     """Sets up the database, kmer index, and seed index backing the mapper.

     Args:
         alphabet: alphabet of the sequences to be indexed.
         wordlen: word length used by the kmer index.
         db_path: path to the SQLite database file (or ':memory:').
     """
     self.db = DB(db_path, alphabet)
     self.kmer_index = KmerIndex(self.db, wordlen)
     self.seed_index = SeedIndex(self.kmer_index)
     # presumably flipped to True once diagonal bands are scored — confirm
     # against the owning class
     self.bands_indexed = False
Exemplo n.º 11
0
class ReadMapper(object):
    """Maps sequencing reads against each other and against reference
    sequences using a seed index built on top of a kmer index, all backed
    by a single database."""

    def __init__(self, alphabet, wordlen, db_path):
        """Sets up the database, kmer index, and seed index for mapping.

        Args:
            alphabet: alphabet of the sequences to be indexed.
            wordlen: word length used by the kmer index.
            db_path: path to the SQLite database file (or ':memory:').
        """
        self.db = DB(db_path, alphabet)
        self.kmer_index = KmerIndex(self.db, wordlen)
        self.seed_index = SeedIndex(self.kmer_index)
        # set to True by index_bands(); the map_* methods assert on it
        self.bands_indexed = False

    def log(self, *args, **kwargs):
        """Delegates logging to the underlying database."""
        self.db.log(*args, **kwargs)

    def initialize(self, reads_fa, refs_fa=None, num_reads=-1):
        """Initializes the database and loads reads (with their reverse
        complements) and, optionally, reference sequences from FASTA files.

        Args:
            reads_fa (str): path to the FASTA file containing reads.
            refs_fa (str): optional path to a FASTA file of reference
                sequences, loaded without reverse complements.
            num_reads (int): maximum number of reads to load; -1 loads all.
        """
        self.db.initialize()
        with open(reads_fa) as f:
            self.db.load_fasta(f, num=num_reads, rc=True)
        if refs_fa is not None:
            with open(refs_fa) as f:
                self.db.load_fasta(f, rc=False)

    def index_bands(self, **kw):
        """Scores all kmers and diagonal bands; must be called before any
        mapping. Keyword arguments are passed to score_diagonals()."""
        self.kmer_index.score_kmers()
        self.seed_index.score_diagonals(**kw)
        self.bands_indexed = True

    def load_reads(self):
        """Loads all reads from the database, pairing each forward record
        with its reverse complement record.

        Returns:
            list: Read objects sorted by the forward record's id.
        """
        recs_by_content_id = {r.content_id: r for r in list(self.db.find())}
        reads = []
        for record in recs_by_content_id.values():
            if 'rc_of' in record.attrs:
                # record is a reverse complement; look up its origin record
                pair = (recs_by_content_id[record.attrs['rc_of']], record)
                reads.append(Read(self.seed_index, *pair))
        return sorted(reads, key=lambda read: read.record.id)

    def load_refs(self):
        """Loads reference records by removing every forward/reverse
        complement read pair from the full set of database records.

        Returns:
            records of sequences that are neither a reverse complement nor
            have one (i.e. those loaded with rc=False).
        """
        recs_by_content_id = {r.content_id: r for r in list(self.db.find())}
        for record in recs_by_content_id.values():
            if 'rc_of' in record.attrs:
                # NOTE popping while iterating is safe in python 2, where
                # values() returns a list copy; this would break in python 3
                recs_by_content_id.pop(record.attrs['rc_of'])
                recs_by_content_id.pop(record.content_id)
        return recs_by_content_id.values()

    def map_all_to_all(self, min_band_score, **aligner_kw):
        """Maps every read against every other read, considering each
        unordered pair once, and yields only successful mappings.

        Args:
            min_band_score: minimum diagonal band score, passed to
                Read.map().
            aligner_kw: keyword arguments passed through to Read.map().

        Yields:
            tuple: the read record, the target record, and the alignment.
        """
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against each other')
        reads = self.load_reads()  # NOTE comes in sorted order of id
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        for read in reads:
            indic.progress()
            # NOTE only compare to reads after us
            others = (r.record for r in reads if r.record.id > read.record.id)
            for other in others:
                rec, target_rec, aln = read.map(other,
                                                min_band_score=min_band_score,
                                                **aligner_kw)
                if rec is None:
                    continue
                yield rec, target_rec, aln
        indic.finish()

    def map_all_to_refs(self, min_band_score, **aligner_kw):
        """Maps every read against the reference sequences and yields only
        successful mappings.

        Args:
            min_band_score: minimum diagonal band score, passed to
                Read.map().
            aligner_kw: keyword arguments passed through to Read.map().

        Yields:
            tuple: the read record, the target record, and the alignment.
        """
        # FIXME it would be nice to only calculate bands for read v. ref not
        # all pairwise of reads too
        assert self.bands_indexed, 'Bands must be indexed first'
        self.log('Mapping all reads against reference sequences')
        reads, refs = self.load_reads(), self.load_refs()
        indic = ProgressIndicator(num_total=len(reads))
        indic.start()
        for read in reads:
            indic.progress()
            rec, target_rec, aln = read.map(refs,
                                            min_band_score=min_band_score,
                                            **aligner_kw)
            if rec is not None:
                yield rec, target_rec, aln
        indic.finish()

    def mappings_from_sam(self, sampath):
        """Loads mappings from a SAM mapping file and translates sequence
        names to integer identifiers as stored by :class:`biseqt.database.DB`.

        Args:
            sampath (str): The path to SAM mappings file.

        Yields:
            tuple:
                A 3-tuple containing the read record, the reference name and
                the ``pysam.calignedsegment.AlignedSegment`` mapping it to the
                reference.
        """
        self.log('Loading SAM mappings from %s.' % sampath)
        reads_by_name = {r.record.attrs['name']: r for r in self.load_reads()}
        samfile = pysam.AlignmentFile(sampath)
        for mapping in samfile.fetch():
            qname, rname = mapping.query_name, mapping.reference_name
            # NOTE this is because BLASR does a weird thing with sequence names
            qname = qname.rsplit('/', 1)[0]
            # skip mappings for reads that are not in our database
            if qname not in reads_by_name:
                continue
            yield reads_by_name[qname], rname, mapping

    def overlaps_from_sam_mappings(self, sampath, min_overlap=-1):
        """Finds all pairs of overlapping sequences based on their mappings to
        a reference.

        Args:
            sampath (str): The path to SAM mappings file.
            min_overlap (int): The minimum required length for overlaps to be
                reported; default is -1 in which case no overlap is excluded.

        Yields:
            tuple:
                A tuple of sequence integer ids (in increasing order) that are
                deemed as overlapping based on SAM mappings.
        """
        self.log('Finding overlaps from SAM mappings.')
        mappings = {read.record.id: (read, ref, mapping)
                    for read, ref, mapping in self.mappings_from_sam(sampath)}
        seqids = sorted(mappings.keys())
        for id0, id1 in combinations(seqids, 2):
            (r0, ref0, map0), (r1, ref1, map1) = mappings[id0], mappings[id1]
            # reads mapped to different references cannot overlap
            if ref0 != ref1:
                continue
            # TODO ignoring query_alignment_start and query_alignment_end
            overlap_len = min(map0.reference_end, map1.reference_end) - \
                max(map0.reference_start, map1.reference_start)
            if overlap_len <= 0 or overlap_len < min_overlap:
                continue
            # FIXME the second thing we yield is not reported by our own
            # map_all_to_all.
            if map0.is_reverse == map1.is_reverse:
                # same strand: forward overlaps forward, rc overlaps rc
                yield r0.record, r1.record
                yield r0.rc_record, r1.rc_record
            else:
                # opposite strands: each forward overlaps the other's rc
                yield r0.record, r1.rc_record
                yield r0.rc_record, r1.record