Exemplo n.º 1
0
    def _region_to_seqs(self, track, extend_up=0, extend_down=0):
        """Yield one ``Sequence`` per ``chrom:start-end`` region in *track*.

        *track* is either a list of region strings or the path of a text
        file holding one region per line.  For every region the start is
        shifted by +1 (presumably converting a 0-based start to the
        coordinates ``self.get_seq`` expects -- TODO confirm), then
        ``extend_up`` is subtracted from the start and ``extend_down`` is
        added to the end before ``self.get_seq`` is called.

        Names taken from a list are used verbatim.  Names read from a file
        have surrounding whitespace stripped, and blank lines are skipped
        instead of aborting the whole run.

        Raises:
            ValueError: if a region does not have the form
                ``chrom:start-end`` with integer coordinates.
        """

        def region_sequence(name):
            # Shared by both input kinds so the coordinate handling cannot
            # drift apart between the list and the file branch.
            chrom, coords = name.split(":")
            start, end = (int(c) for c in coords.split("-"))
            seq = self.get_seq(chrom, start + 1 - extend_up, end + extend_down)
            return Sequence(name, seq.seq)

        if isinstance(track, list):
            for name in track:
                yield region_sequence(name)
            return

        # Iterating the file object streams it lazily, line by line, so no
        # manual buffering is needed.
        with open(track) as fin:
            for line in fin:
                name = line.strip()
                if name:
                    yield region_sequence(name)
Exemplo n.º 2
0
 def _variant_to_sequence(variants):
     """
     Yield a ``(ref, alt)`` pair of `pyfaidx.Sequence` objects per variant.

     Each item of *variants* must expose ``chrom``, ``start``, ``ref`` and
     ``alt`` (the original docstring names `cyvcf2.Variant` as the
     expected type).  Both sequences are named after ``chrom`` and begin
     at ``start``; each ends at ``start`` plus the length of its own
     allele string.
     """
     for variant in variants:
         chrom, begin = variant.chrom, variant.start
         ref_allele, alt_allele = variant.ref, variant.alt
         reference = Sequence(name=chrom, seq=ref_allele,
                              start=begin, end=begin + len(ref_allele))
         alternative = Sequence(name=chrom, seq=alt_allele,
                                start=begin, end=begin + len(alt_allele))
         yield reference, alternative
Exemplo n.º 3
0
def test_interval_seq_builder_concat(interval_seq_builder):
    """concat() raises TypeError before restore() and joins the pieces after."""
    # The builder has not been restored yet, so concatenation must fail.
    with pytest.raises(TypeError):
        interval_seq_builder.concat()

    restored_from = Sequence(seq='CCCCATCGNN', start=10, end=20)
    interval_seq_builder.restore(restored_from)

    joined = interval_seq_builder.concat()
    assert joined == 'CCCCTAGCNN'
Exemplo n.º 4
0
def interval_seq_builder():
    """Build an ``IntervalSeqBuilder`` from three intervals and one sequence.

    The pieces cover chr1 coordinates 10 to 20 back to back; the 14-18
    piece is a ready-made ``Sequence`` ('TAGC'), the others are
    ``Interval`` objects.
    """
    pieces = [
        Interval('chr1', 10, 13),
        Interval('chr1', 13, 14),
        Sequence(seq='TAGC', start=14, end=18),
        Interval('chr1', 18, 20),
    ]
    return IntervalSeqBuilder(pieces)
Exemplo n.º 5
0
    def _bed_to_seqs(self, track, stranded=False, extend_up=0, extend_down=0):
        """Yield one ``Sequence`` per feature line of the BED file *track*.

        Lines starting with ``#`` or ``track`` and blank lines are skipped.
        Each remaining line is split on tabs:

        * columns 1-3 give chrom, start and end;
        * column 4, when present, is appended to the sequence name
          (``"chrom:start-end name"``);
        * column 6 is only consulted when *stranded* is true; ``-`` makes
          the sequence reverse-complemented;
        * lines with exactly 12 columns are treated as BED12 and spliced
          from their blocks (columns 11 and 12).

        ``extend_up`` / ``extend_down`` widen the feature: on the forward
        strand they move the first block start / last block end, on the
        reverse strand the roles are swapped.

        Raises:
            ValueError: if a coordinate or block field is not an integer.
            IndexError: if a feature line has fewer than three columns.
        """
        with open(track) as fin:
            # Iterating the file object streams it lazily, line by line.
            for line in fin:
                if line.startswith(("#", "track")):
                    continue

                stripped = line.strip()
                if not stripped:
                    continue

                vals = stripped.split("\t")
                chrom = vals[0]
                start, end = int(vals[1]), int(vals[2])

                rc = bool(stranded) and len(vals) > 5 and vals[5] == "-"

                starts = [start]
                ends = [end]

                # BED12
                if len(vals) == 12:
                    # The trailing comma of the block lists is optional in
                    # BED files, so drop empty items rather than blindly
                    # cutting off the last element.
                    offsets = [int(x) for x in vals[11].split(",") if x]
                    sizes = [int(x) for x in vals[10].split(",") if x]
                    starts = [start + offset for offset in offsets]
                    ends = [
                        block_start + size
                        for block_start, size in zip(starts, sizes)
                    ]

                name = "{}:{}-{}".format(chrom, start, end)
                if len(vals) > 3:
                    name = " ".join((name, vals[3]))

                # Shift every block start by one (presumably 0-based BED
                # starts to the coordinates get_spliced_seq expects --
                # TODO confirm).
                starts = [block_start + 1 for block_start in starts]

                # extend
                if extend_up:
                    if rc:
                        ends[-1] += extend_up
                    else:
                        starts[0] -= extend_up
                if extend_down:
                    if rc:
                        starts[0] -= extend_down
                    else:
                        ends[-1] += extend_down

                intervals = zip(starts, ends)
                seq = self.get_spliced_seq(chrom, intervals, rc)
                yield Sequence(name, seq.seq)
Exemplo n.º 6
0
def locate(args):
    """Write every k-mer occurrence found in a FASTA database to a TSV file.

    Attributes read from *args*:
        kmer: comma-separated k-mer sequences to search for.
        db:   path handed to ``Fasta``.
        out:  path of the TSV file to write.
        fg:   path of a file whose first whitespace-separated column lists
              the sequence ids to scan; ``''`` scans every sequence in the
              database.  A first column equal to ``gid`` (header) and ids
              missing from the database are ignored.

    The search pattern contains each k-mer and its reverse complement.
    Output columns are ``kmer`` (matched text), ``sid``, ``start``
    (match start + 1), ``end`` (match end) and ``srd`` (``+`` when the
    matched text is one of the given k-mers, otherwise ``-``).
    """
    kmers, fd, fo = args.kmer, args.db, args.out
    fg = args.fg
    db = Fasta(fd)

    kseqs = kmers.split(',')
    kseqs2 = [Sequence(name='kmer', seq=kseq).reverse.complement.seq for kseq in kseqs]
    ptn = re.compile("|".join("(" + k + ")" for k in kseqs + kseqs2))
    forward_kmers = set(kseqs)

    if fg != '':
        seqs = []
        # Context manager: the id list used to be left open.
        with open(fg, 'r') as fhg:
            for line in fhg:
                fields = line.rstrip("\n").split()
                if not fields:
                    continue
                gid = fields[0]
                if gid == 'gid' or gid not in db:
                    continue
                seqs.append(gid)
    else:
        seqs = db.keys()

    with open(fo, 'w') as fho:
        fho.write('kmer\tsid\tstart\tend\tsrd\n')
        for seqid in seqs:
            seq = db[seqid][0:].seq
            for m in ptn.finditer(seq):
                hit = m.group(0)
                start, end = m.start() + 1, m.end()
                srd = "+" if hit in forward_kmers else "-"
                fho.write(f"{hit}\t{seqid}\t{start}\t{end}\t{srd}\n")
Exemplo n.º 7
0
 def _variant_to_sequence(self, variants):
     """
     Yield a ``(ref, alt)`` pair of `pyfaidx.Sequence` objects per variant.

     Each item of *variants* must expose ``CHROM``, ``start``, ``REF`` and
     ``ALT`` (the original docstring names `cyvcf2.Variant` as the
     expected type).  Both sequences are named after ``CHROM`` and begin
     at ``start``; each ends at ``start`` plus the length of its allele.
     """
     for variant in variants:
         chrom, begin = variant.CHROM, variant.start
         ref_allele = variant.REF
         # TODO: only the first alternative allele is used; any further
         # alleles in ALT are ignored.
         alt_allele = variant.ALT[0]

         reference = Sequence(name=chrom, seq=ref_allele,
                              start=begin, end=begin + len(ref_allele))
         alternative = Sequence(name=chrom, seq=alt_allele,
                                start=begin, end=begin + len(alt_allele))
         yield reference, alternative
Exemplo n.º 8
0
    def _regions_to_seqs(self, track, extend_up=0, extend_down=0):
        """Yield one ``Sequence`` per region named in *track*.

        *track* is either a list of region strings or the path of a text
        file with one region per line.  Every entry is stripped of
        surrounding whitespace, resolved through ``self._region_to_seq``
        (which receives ``extend_up`` and ``extend_down`` unchanged) and
        yielded as ``Sequence(name, seq)``.
        """

        def sequences(regions):
            for region in regions:
                name = region.strip()
                seq = self._region_to_seq(name, extend_up, extend_down)
                yield Sequence(name, seq)

        if isinstance(track, list):
            for sequence in sequences(track):
                yield sequence
        else:
            # Stream the file instead of appending to a list while looping
            # over it; the old unbounded readlines() pulled the whole file
            # into memory after the first region.
            with open(track) as fin:
                for sequence in sequences(fin):
                    yield sequence
Exemplo n.º 9
0
def test__split_overlapping(variant_seq_extractor):
    """_split_overlapping cuts a (ref, alt) pair into two (ref, alt) pieces.

    The expectations are consistent with the second argument being the
    coordinate at which both sequences are cut.
    """

    def first_two_pieces(pair, split_pos):
        pieces = list(variant_seq_extractor._split_overlapping([pair], split_pos))
        return [(piece[0].seq, piece[1].seq) for piece in pieces[:2]]

    # Reference longer than the alternative: the alt runs out first.
    deletion = (Sequence(seq='AAA', start=3, end=6),
                Sequence(seq='T', start=3, end=4))
    assert first_two_pieces(deletion, 5) == [('AA', 'T'), ('A', '')]

    # Alternative longer than the reference.
    insertion = (Sequence(seq='TT', start=3, end=5),
                 Sequence(seq='AAA', start=3, end=6))
    assert first_two_pieces(insertion, 4) == [('T', 'A'), ('T', 'AA')]
Exemplo n.º 10
0
    def _bed_to_seqs(self, track, stranded=False, extend_up=0, extend_down=0):
        """Yield one ``Sequence`` per feature line of the BED file *track*.

        Lines starting with ``#`` or ``track`` and blank lines are skipped.
        Each remaining line is split on tabs:

        * columns 1-3 give chrom, start and end;
        * column 4, when present, is appended to the sequence name
          (``"chrom:start-end name"``);
        * column 6 is only consulted when *stranded* is true; ``-`` makes
          the sequence reverse-complemented;
        * with 12 or more columns the sequence is spliced from the BED12
          blocks (columns 11 and 12).

        ``extend_up`` / ``extend_down`` widen the feature: on the forward
        strand they move the first block start / last block end, on the
        reverse strand the roles are swapped.

        Raises:
            ValueError: if a coordinate or block field is not an integer.
            IndexError: if a feature line has fewer than three columns.
        """
        with open(track) as fin:
            # Stream the file instead of appending to a list while looping
            # over it.
            for line in fin:
                if line.startswith(("#", "track")):
                    continue

                stripped = line.strip()
                if not stripped:
                    continue

                vals = stripped.split("\t")
                chrom, start, end = str(vals[0]), int(vals[1]), int(vals[2])
                name = f"{chrom}:{start}-{end}"

                # there might be more...
                starts = [start]
                ends = [end]

                # BED4: add name column to name
                if len(vals) >= 4:
                    name = " ".join((name, vals[3]))

                # BED6: check strandedness
                rc = False
                if stranded and len(vals) >= 6:
                    rc = vals[5] == "-"

                # BED12: get all blocks
                if len(vals) >= 12:
                    # The trailing comma of the block lists is optional in
                    # BED files, so drop empty items rather than blindly
                    # cutting off the last element.
                    offsets = [int(x) for x in vals[11].split(",") if x]
                    sizes = [int(x) for x in vals[10].split(",") if x]
                    starts = [start + offset for offset in offsets]
                    ends = [
                        block_start + size
                        for block_start, size in zip(starts, sizes)
                    ]
                # convert to 1-based counting
                starts = [block_start + 1 for block_start in starts]

                # extend
                if extend_up:
                    if rc:
                        ends[-1] += extend_up
                    else:
                        starts[0] -= extend_up
                if extend_down:
                    if rc:
                        starts[0] -= extend_down
                    else:
                        ends[-1] += extend_down

                intervals = zip(starts, ends)
                seq = self.get_spliced_seq(chrom, intervals, rc)
                yield Sequence(name, seq.seq)
Exemplo n.º 11
0
    def get_spliced_seq(self, name, intervals, rc=False):
        """Return the concatenation of several intervals of record *name*.

        *intervals* is an iterable of ``(start, end)`` pairs; each pair is
        passed unchanged to ``self.faidx.fetch``.  The original docstring
        described the coordinates as 0-based, end-exclusive --
        NOTE(review): confirm against the convention of ``fetch``.

        With ``rc`` true every chunk is negated and the chunks are joined
        in reverse order.  The returned ``Sequence`` carries the ``start``
        of the first fetched chunk and the ``end`` of the last one.

        Raises ``IndexError`` when *intervals* is empty.
        """
        pieces = []
        for begin, stop in intervals:
            pieces.append(self.faidx.fetch(name, begin, stop))

        span_start = pieces[0].start
        span_end = pieces[-1].end

        # reverse complement: negate each chunk, last chunk first
        if rc:
            ordered = [(-piece).seq for piece in reversed(pieces)]
        else:
            ordered = [piece.seq for piece in pieces]

        return Sequence(
            name=name, seq="".join(ordered), start=span_start, end=span_end
        )
Exemplo n.º 12
0
def regions_to_seqs(self, track, extend_up=0, extend_down=0):
    """Yield a ``Sequence`` for every usable region in *track*.

    When *track* is a string it is handed to ``parse_file`` and the result
    is iterated; otherwise *track* itself is iterated.  The first
    whitespace-separated token of each entry is the region name.  Regions
    flagged by ``bad_coords`` are skipped silently; regions for which
    ``bad_coords`` or ``region_to_seq`` raise ``ValueError`` or
    ``IndexError`` are skipped with a logged warning.
    """
    from_file = isinstance(track, str)
    # if track is a file, loop over its lines
    lines = parse_file(track) if from_file else track

    for line in lines:
        name = line.split()[0]
        try:
            if bad_coords(name, track):
                continue
            seq = region_to_seq(self, name, extend_up, extend_down)
        except (ValueError, IndexError):
            if from_file:
                msg = f"Skipping region from '{os.path.basename(track)}' that cannot be parsed: '{name}'"
            else:
                msg = f"Skipping region that cannot be parsed: '{name}'"
            logger.warning(msg)
            continue
        yield Sequence(name, seq)
Exemplo n.º 13
0
def bed_to_seq(self, vals, stranded=False, extend_up=0, extend_down=0):
    """Return the ``Sequence`` for one BED feature given as a column list.

    *vals* holds the columns of a BED line:

    * columns 1-3 give chrom, start and end;
    * column 4, when present, is appended to the sequence name
      (``"chrom:start-end name"``);
    * column 6 is only consulted when *stranded* is true; ``-`` makes the
      sequence reverse-complemented;
    * with 12 or more columns the sequence is spliced from the BED12
      blocks (columns 11 and 12).

    ``extend_up`` / ``extend_down`` widen the feature: on the forward
    strand they move the first block start / last block end, on the
    reverse strand the roles are swapped.

    Raises:
        ValueError: if a coordinate or block field is not an integer.
        IndexError: if *vals* has fewer than three columns.
    """
    chrom, start, end = str(vals[0]), int(vals[1]), int(vals[2])
    name = f"{chrom}:{start}-{end}"

    # there might be more...
    starts = [start]
    ends = [end]

    # BED4: add name column to name
    if len(vals) >= 4:
        name = " ".join((name, vals[3]))

    # BED6: check strandedness
    rc = False
    if stranded and len(vals) >= 6:
        rc = vals[5] == "-"

    # BED12: get all blocks
    if len(vals) >= 12:
        # The trailing comma of the block lists is optional in BED files,
        # so drop empty items rather than blindly cutting off the last
        # element.
        offsets = [int(x) for x in vals[11].split(",") if x]
        sizes = [int(x) for x in vals[10].split(",") if x]
        starts = [start + offset for offset in offsets]
        ends = [block_start + size for block_start, size in zip(starts, sizes)]
    # convert to 1-based counting
    starts = [block_start + 1 for block_start in starts]

    # extend
    if extend_up:
        if rc:
            ends[-1] += extend_up
        else:
            starts[0] -= extend_up
    if extend_down:
        if rc:
            starts[0] -= extend_down
        else:
            ends[-1] += extend_down

    intervals = zip(starts, ends)
    seq = self.get_spliced_seq(chrom, intervals, rc).seq
    return Sequence(name, seq)
Exemplo n.º 14
0
def test_interval_seq_builder_restore(interval_seq_builder):
    """restore() fills the builder's pieces from a sequence spanning 10-20.

    Intervals appended afterwards that lie outside that span, or whose
    start is greater than their end, must come back as empty strings.
    """
    sequence = Sequence(seq='CCCCATCGTT', start=10, end=20)

    interval_seq_builder.restore(sequence)
    for index, expected in enumerate(['CCC', 'C', 'TAGC', 'TT']):
        assert interval_seq_builder[index].seq == expected

    # (start, end) pairs: before the span, after it, and two reversed ones.
    empty_cases = [(5, 10), (20, 25), (10, 5), (25, 20)]
    for index, (start, end) in enumerate(empty_cases, start=4):
        interval_seq_builder.append(Interval('chr1', start, end))
        interval_seq_builder.restore(sequence)
        assert interval_seq_builder[index].seq == ''
Exemplo n.º 15
0
def read_kmer(fk, nfea):
    """Read k-mer feature rows from *fk* and build a search pattern for each.

    Every non-empty line must have at least eight whitespace-separated
    columns: ``i opt bin epi pval fid fname kmers``.  A line whose first
    column is the literal ``i`` is treated as a header and skipped.

    *nfea* may be one of ``top5``, ``top10``, ``top30``, ``top50``,
    ``top100``, ``top200``, ``top300`` or ``top500``; reading then stops at
    the first row whose ``i`` exceeds that number.  Any other value reads
    the whole file.

    Returns:
        A list of ``[fid, pattern, kmer_set]`` entries, where ``pattern``
        is a regex alternation of the row's k-mers followed by their
        reverse complements, and ``kmer_set`` is the set of the row's
        k-mers as written in the file.

    Raises:
        ValueError: if a line has fewer than eight columns or ``i`` is not
            an integer.
    """
    top_limits = {
        'top5': 5, 'top10': 10, 'top30': 30, 'top50': 50,
        'top100': 100, 'top200': 200, 'top300': 300, 'top500': 500,
    }
    limit = top_limits.get(nfea)

    kms = []
    # Context manager so the file is closed even when a row fails to parse.
    with open(fk, 'r') as fhk:
        for line in fhk:
            line = line.rstrip("\n")
            if not line:
                continue
            rank, _opt, _bin, _epi, _pval, fid, _fname, kmers = line.split()[:8]
            if rank == 'i':
                continue
            rank = int(rank)
            # Stopping here relies on the rows being ordered by rank.
            if limit is not None and rank > limit:
                break
            kseqs = kmers.split(',')
            kseqs2 = [Sequence(name='kmer', seq=kseq).reverse.complement.seq for kseq in kseqs]
            ptn = "|".join(["(" + k + ")" for k in kseqs + kseqs2])
            kms.append([fid, ptn, set(kseqs)])
    return kms
Exemplo n.º 16
0
from pyfaidx import Sequence, complement
from nose.tools import assert_raises, raises

# Well-formed nucleotide record shared by the tests below.
seq = Sequence(
    name='gi|557361099|gb|KF435150.1|',
    seq='TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA',
    start=100,
    end=150,
)

# Same record, except that the sequence carries a 'P' and an 'N' in place
# of two of the original bases.
seq_invalid = Sequence(
    name='gi|557361099|gb|KF435150.1|',
    seq='TTGAAGATTTPGCATGCAGCAGGTGCGCAAGGTGAAATNTTCACTGTTAAA',
    start=100,
    end=150,
)

# Plain strings; not referenced in the visible part of this file.
comp_valid = 'TTGAAGATTTnGCATGCAGCAGGtgccaAGGTGAAATGTTNACTGTTAAA'

comp_invalid = 'TTGAAGATTTnGCATGCAGCPQGtgccaAGGTGAAATGTTNACTGTTAAA'

def test_negate():
    """Unary minus gives the same string as complementing then reversing."""
    negated = -seq
    reversed_complement = seq.complement[::-1]
    assert str(negated) == str(reversed_complement)

def test_negate_metadata():
    """Negation yields the same repr as complementing then reversing."""
    # Negate should affect __repr__ the same way as reverse and complement
    negated_repr = repr(-seq)
    expected_repr = repr(seq.complement[::-1])
    assert negated_repr == expected_repr

def test_seq_invalid():
    """Accessing ``complement`` on the invalid record raises ValueError."""
    with assert_raises(ValueError):
        seq_invalid.complement

def test_integer_index():
    """Integer indexing returns an object whose ``seq`` is that one base."""
    second_base = seq[1]
    assert second_base.seq == 'T'

def test_slice_index():
    """Slicing the first ten positions returns the first ten bases."""
    first_ten = seq[0:10]
    assert first_ten.seq == 'TTGAAGATTT'

@raises(ValueError)
Exemplo n.º 17
0
def test_check_coordinates():
    """Slice a record whose sequence is longer than its start/end span.

    NOTE(review): the ValueError expectation presumably comes from a
    ``@raises(ValueError)`` decorator on this function -- confirm it is
    attached.
    """
    mismatched = Sequence(
        name='gi|557361099|gb|KF435150.1|',
        seq='TTGAAGATTTTGCATGCAGCAGGTGCGCAAGGTGAAATGTTCACTGTTAAA',
        start=100,
        end=110,
    )
    mismatched[:]
Exemplo n.º 18
0
 def _fetch(self, interval, istart, iend):
     """Return a ``Sequence`` for ``istart``..``iend`` on the interval's chromosome.

     Only ``interval.chrom`` is taken from *interval*; a fresh ``Interval``
     is built without a strand, so the strand of *interval* is ignored.
     The bases come from ``self.ref_seq_extractor.extract``.
     """
     region = Interval(interval.chrom, istart, iend)
     bases = self.ref_seq_extractor.extract(region)
     return Sequence(name=interval.chrom, seq=bases, start=istart, end=iend)
Exemplo n.º 19
0
 def _fetch(self, interval, istart, iend):
     """Return a ``Sequence`` for ``istart``..``iend`` on the interval's chromosome.

     Only ``interval.chrom`` is taken from *interval*; the bases come from
     ``self.fasta.extract`` called with a fresh ``Interval``.
     """
     region = Interval(interval.chrom, istart, iend)
     bases = self.fasta.extract(region)
     return Sequence(name=interval.chrom, seq=bases, start=istart, end=iend)