예제 #1
0
def main(argv=None):
    parser = make_parser()
    args = parser.parse_args(argv)
    genome = SequenceFileDB(args.genome)


    pwm = [IUPAC_SCORES[l] for l in args.consensus]
    pwm.extend([REQUIRED_SCORES[l] for l in args.required_3p_seq])
    pwm = motility.PWM(pwm)

    # find all matches
    with open(args.outfile, 'w') as outfile:
        for chrom in genome.keys():
            chromseq = str(genome[chrom])
            print "searching ", chrom, "of length", len(chromseq)
            if len(chromseq) < len(pwm):
                print 'chromosome/fragment', chrom, 'is too short'
                continue
            matches = pwm.find(chromseq, -args.mismatches)
            for start, stop, strand, seq in matches:
                score = pwm.calc_score(seq)
                outfile.write('\t'.join(
                    [chrom, str(start), str(stop), seq, str(score),
                     '+' if strand == 1 else '-']) + '\n')
예제 #2
0
class SequenceFileDB_Test(unittest.TestCase):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB',
    among other things.
    """

    def setUp(self):
        "Test setup"
        dnaseq = testutil.datafile('dnaseq.fasta')
        self.db = SequenceFileDB(dnaseq) # contains 'seq1', 'seq2'

        self.db._weakValueDict.clear()   # clear the cache

    def tearDown(self):
        self.db.close() # must close SequenceFileDB!

    def test_len(self):
        assert len(self.db) == 2

    def test_seqInfoDict_len(self):
        assert len(self.db.seqInfoDict) == 2

    def test_no_file_given(self):
        "Make sure that a TypeError is raised when no file is available"
        try:
            db = SequenceFileDB()
            assert 0, "should not reach this point"
        except TypeError:
            pass

    def test_seq_descriptor(self):
        "Check the '.seq' attribute (tied to a descriptor)"
        s = self.db['seq1']
        assert str(s) == str(s.seq)

    def test_cache(self):
        "SequenceDB cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # cache functions?
        seq1_try2 = self.db['seq1']
        assert seq1 is seq1_try2

    def test_clear_cache(self):
        "SequenceDB clear_cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # clear_cache functions?
        self.db.clear_cache()
        seq1_try3 = self.db['seq1']
        assert seq1 is not seq1_try3

    def test_keys(self):
        "SequenceFileDB keys"
        k = self.db.keys()
        k.sort()
        assert k == ['seq1', 'seq2']

    def test_contains(self):
        "SequenceFileDB contains"
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_invert_class(self):
        "SequenceFileDB __invert__"
        seq = self.db['seq1']
        inversedb = ~self.db
        assert inversedb[seq] == 'seq1'
        assert seq in inversedb
        assert 'foo' not in inversedb

    def test_keys_info(self):
        "SequenceFileDB keys info"
        k = self.db.seqInfoDict.keys()
        k.sort()
        assert k == ['seq1', 'seq2']

    def test_contains_info(self):
        "SequenceFileDB contains info"
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict

    def test_has_key(self):
        "SequenceFileDB has key"
        assert 'seq1' in self.db
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_get(self):
        "SequenceFileDB get"
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')

    def test_items(self):
        "SequenceFileDB items"
        i = [k for (k, v) in self.db.items()]
        i.sort()
        assert i == ['seq1', 'seq2']

    def test_iterkeys(self):
        "SequenceFileDB iterkeys"
        kk = self.db.keys()
        kk.sort()
        ik = list(self.db.iterkeys())
        ik.sort()
        assert kk == ik

    def test_itervalues(self):
        "SequenceFileDB itervalues"
        kv = self.db.values()
        kv.sort()
        iv = list(self.db.itervalues())
        iv.sort()
        assert kv == iv

    def test_iteritems(self):
        "SequenceFileDB iteritems"
        ki = self.db.items()
        ki.sort()
        ii = list(self.db.iteritems())
        ii.sort()
        assert ki == ii

    def test_readonly(self):
        "SequenceFileDB readonly"
        try:
            self.db.copy()          # what should 'copy' do on SequenceFileDB?
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.clear()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.setdefault('foo')
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.pop()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.popitem()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.update({})
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass

    # test some things other than dict behavior
    def test_keyerror(self):
        """SequenceFileDB keyerror.
        Make sure that the SequenceFileDB KeyError is informative."""
        try:
            self.db['foo']
        except KeyError, e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)
예제 #3
0
class Hgvs2:
    def __init__(self, refseq_fn=fileconfig.FILECONFIG['REFGENE']):
        """ register hg19 reference genome sequence and NCBI RefSeq transcript coordinates"""
        self.genome_fn = fileconfig.FILECONFIG['REFGENOME_UCSC']
        self.genome = None
        if not os.path.exists(self.genome_fn):
            raise IOError(
                'Reference Genome Sequence (UCSC format) for %s is not found' %
                self.genome_fn)

        self.refseq_fn = refseq_fn
        self.refseq = None
        if not os.path.exists(self.refseq_fn):
            raise IOError('NCBI RefSeq transcript for %s is not found' %
                          self.refseq_fn)

    def get_transcript(self, name):
        if self.refseq:
            return self.refseq.get(name)
        else:
            raise RuntimeError(
                'first, load a transcript coordinate file![%s]' %
                self.refseq_fn)

    def load_resource(self):
        #load genome sequence
        print 'loading the genome sequence [%s] for HGVS...' % self.genome_fn
        self.genome = SequenceFileDB(self.genome_fn)
        print 'done.'

        #load refseq into dic
        print 'loading the refseq transcript [%s] for HGVS...' % self.refseq_fn
        fp = open(self.refseq_fn, 'r')
        self.refseq = pyhgvs.utils.read_transcripts(fp)
        fp.close()
        print 'done.'

    def close_resource(self):
        self.genome.close()
        self.refseq = None

    def to_cDNA(self, chrom, offset, ref, alt, refseq_acc):
        """ convert to HGVS nomenclature """
        transcript = self.get_transcript(refseq_acc)

        if not chrom.startswith('chr'):
            chrom = 'chr%s' % chrom
            if chrom not in CHROMOSOMES:
                return None

        if not chrom in self.genome.keys():
            return None

        cdna = pyhgvs.format_hgvs_name(chrom, offset, ref, alt, self.genome,
                                       transcript)
        if cdna:
            itms = cdna.split(':')
            if len(itms) > 1:
                cdna = itms[1]
            else:
                cdna = cdna
            return cdna
        else:
            return None

    def gdna_to_vcf(self, gdna):
        return pyhgvs.gdna_to_vcf(gdna, self.genome)

    def to_chrom_coordinate(self, cDNA):
        try:
            chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(cDNA, self.genome, \
                                       get_transcript = self.get_transcript)

            return chrom, offset, ref, alt
        except:
            print "[%s] cannot be coverted to chromosome coordinate" % cDNA
            return None, None, None, None
예제 #4
0
파일: seqdb_test.py 프로젝트: ctb/pygr
class SequenceFileDB_Test(object):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB'.
    """
    def setup(self):
        self.db = SequenceFileDB('dnaseq')     # contains 'seq1', 'seq2'
    def keys_test(self):
        k = self.db.keys()
        k.sort()
        assert k == ['seq1', 'seq2']
    def contains_test(self):
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db
    def keys_info_test(self):
        k = self.db.seqInfoDict.keys()
        k.sort()
        assert k == ['seq1', 'seq2']
    def contains_info_test(self):
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict
    def has_key_test(self):
        assert self.db.has_key('seq1')
        assert self.db.has_key('seq2')
        assert not self.db.has_key('foo')
    def get_test(self):
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')
    def items_test(self):
        i = [ k for (k,v) in self.db.items() ]
        i.sort()
        assert i == ['seq1', 'seq2']
    def iterkeys_test(self):
        kk = self.db.keys()
        kk.sort()
        ik = list(self.db.iterkeys())
        ik.sort()
        assert kk == ik
    def itervalues_test(self):
        kv = self.db.values()
        kv.sort()
        iv = list(self.db.itervalues())
        iv.sort()
        assert kv == iv
    def iteritems_test(self):
        ki = self.db.items()
        ki.sort()
        ii = list(self.db.iteritems())
        ii.sort()
        assert ki == ii
    def readonly_test(self):
        try:
            self.db.copy()              # what should 'copy' do on SequenceFileDB?
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.clear()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.setdefault('foo')
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.pop()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.popitem()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.update({})
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
            
    # test some things other than dict behavior
    def keyerror_test(self):
        "Make sure that the SequenceFileDB KeyError is informative."
        try:
            self.db['foo']
        except KeyError, e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)
예제 #5
0
class SequenceFileDB_Test(unittest.TestCase):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB',
    among other things.
    """
    def setUp(self):
        "Test setup"
        dnaseq = testutil.datafile('dnaseq.fasta')
        self.db = SequenceFileDB(dnaseq)  # contains 'seq1', 'seq2'

        self.db._weakValueDict.clear()  # clear the cache

    def tearDown(self):
        self.db.close()  # must close SequenceFileDB!

    def test_len(self):
        assert len(self.db) == 2

    def test_seqInfoDict_len(self):
        assert len(self.db.seqInfoDict) == 2

    def test_no_file_given(self):
        "Make sure that a TypeError is raised when no file is available"
        try:
            db = SequenceFileDB()
            assert 0, "should not reach this point"
        except TypeError:
            pass

    def test_seq_descriptor(self):
        "Check the '.seq' attribute (tied to a descriptor)"
        s = self.db['seq1']
        assert str(s) == str(s.seq)

    def test_cache(self):
        "SequenceDB cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # cache functions?
        seq1_try2 = self.db['seq1']
        assert seq1 is seq1_try2

    def test_clear_cache(self):
        "SequenceDB clear_cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # clear_cache functions?
        self.db.clear_cache()
        seq1_try3 = self.db['seq1']
        assert seq1 is not seq1_try3

    def test_keys(self):
        "SequenceFileDB keys"
        k = self.db.keys()
        k.sort()
        assert k == ['seq1', 'seq2']

    def test_contains(self):
        "SequenceFileDB contains"
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_invert_class(self):
        "SequenceFileDB __invert__"
        seq = self.db['seq1']
        inversedb = ~self.db
        assert inversedb[seq] == 'seq1'
        assert seq in inversedb
        assert 'foo' not in inversedb

    def test_keys_info(self):
        "SequenceFileDB keys info"
        k = self.db.seqInfoDict.keys()
        k.sort()
        assert k == ['seq1', 'seq2']

    def test_contains_info(self):
        "SequenceFileDB contains info"
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict

    def test_has_key(self):
        "SequenceFileDB has key"
        assert 'seq1' in self.db
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_get(self):
        "SequenceFileDB get"
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')

    def test_items(self):
        "SequenceFileDB items"
        i = [k for (k, v) in self.db.items()]
        i.sort()
        assert i == ['seq1', 'seq2']

    def test_iterkeys(self):
        "SequenceFileDB iterkeys"
        kk = self.db.keys()
        kk.sort()
        ik = list(self.db.iterkeys())
        ik.sort()
        assert kk == ik

    def test_itervalues(self):
        "SequenceFileDB itervalues"
        kv = self.db.values()
        kv.sort()
        iv = list(self.db.itervalues())
        iv.sort()
        assert kv == iv

    def test_iteritems(self):
        "SequenceFileDB iteritems"
        ki = self.db.items()
        ki.sort()
        ii = list(self.db.iteritems())
        ii.sort()
        assert ki == ii

    def test_readonly(self):
        "SequenceFileDB readonly"
        try:
            self.db.copy()  # what should 'copy' do on SequenceFileDB?
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.clear()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.setdefault('foo')
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.pop()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.popitem()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.update({})
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass

    # test some things other than dict behavior
    def test_keyerror(self):
        """SequenceFileDB keyerror.
        Make sure that the SequenceFileDB KeyError is informative."""
        try:
            self.db['foo']
        except KeyError, e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)