예제 #1
0
 def test_basic_construction(self):
     """Open the database at self.dbfile and spot-check both sequences."""
     seq_db = SequenceFileDB(self.dbfile)
     try:
         first = str(seq_db.get('seq1'))
         assert first.startswith('atggtgtca')
         second = str(seq_db.get('seq2'))
         assert second.startswith('GTGTTGAA')
     finally:
         # Always release the database, even when an assertion fails.
         seq_db.close()
예제 #2
0
    def test_build_seqLenDict_with_reader(self):
        "Check that a database can be rebuilt through a user-supplied reader."

        class InfoBag(object):
            "Plain record whose attributes are the keyword arguments given."

            def __init__(self, **kw):
                for attr, value in kw.items():
                    setattr(self, attr, value)

        # Pass 1: open the database normally and snapshot every record.
        db = SequenceFileDB(self.dbfile)
        try:
            records = [InfoBag(id=seq_id, length=len(seq), sequence=str(seq))
                       for seq_id, seq in db.items()]
        finally:
            db.close()
        # Erase the existing intermediate files before recreating the db.
        self.trash_intermediate_files()

        # Reader stub that ignores its file arguments and replays the
        # records captured above.
        def my_fake_reader(fp, filename, info_list=records):
            return info_list

        # Pass 2: construct the database again, this time via the stub.
        db = SequenceFileDB(self.dbfile, reader=my_fake_reader)
        try:
            first = str(db.get('seq1'))
            assert first.startswith('atggtgtca')
            second = str(db.get('seq2'))
            assert second.startswith('GTGTTGAA')
        finally:
            db.close()
예제 #3
0
 def test_basic_construction(self):
     """Build a SequenceFileDB from self.dbfile and verify sequence prefixes."""
     handle = SequenceFileDB(self.dbfile)
     try:
         # (sequence id, expected leading bases), checked in this order
         expected = (('seq1', 'atggtgtca'), ('seq2', 'GTGTTGAA'))
         for seq_id, prefix in expected:
             assert str(handle.get(seq_id)).startswith(prefix)
     finally:
         handle.close()
예제 #4
0
    def test_inverse_add_behavior(self):
        """Look up a sequence from a separate db in the inverse of self.db.

        The test makes no assertion; it only requires that the inverse
        lookup completes without raising.
        """
        fasta_path = testutil.datafile('dnaseq.fasta')
        other_db = SequenceFileDB(fasta_path)
        try:
            seq1 = other_db['seq1']
            inverse = ~self.db
            name = inverse[seq1]
        finally:
            # 'finally' runs on success and on failure alike.
            other_db.close()
예제 #5
0
    def test_inverse_add_behavior(self):
        """Exercise inverse lookup on self.db with a foreign sequence.

        No assertion is made on the resulting name; the lookup merely has
        to succeed.
        """
        source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            foreign_seq = source_db['seq1']
            name = (~self.db)[foreign_seq]
        finally:
            # Executed unconditionally so the db is closed in every case.
            source_db.close()
예제 #6
0
 def test_funny_key2(self):
     "Fetch an ID that itself contains several separator characters."
     fasta_path = testutil.datafile('funnyseq.fasta')
     funny_db = SequenceFileDB(fasta_path)  # contains 'seq1', 'seq2'
     try:
         union = PrefixUnionDict({'prefix': funny_db})
         # Only the first '.' separates the prefix; the rest is the key.
         seq = union['prefix.seq.2.even.longer']
     finally:
         funny_db.close()
예제 #7
0
 def test_funny_key2(self):
     "Retrieve a key with multiple separators through a PrefixUnionDict."
     seqdb = SequenceFileDB(testutil.datafile('funnyseq.fasta'))
     try:
         prefix_map = {'prefix': seqdb}
         pudb = PrefixUnionDict(prefix_map)
         # The lookup just has to succeed; its result is not inspected.
         seq = pudb['prefix.seq.2.even.longer']
     finally:
         seqdb.close()
예제 #8
0
    def test_cache(self):
        """Sequence slice cache mechanics.

        Registers a coordinate hint for 'seq1' through db.cacheHint, checks
        what db._cache holds before and after a slice is stringified, and
        finally checks that the cache empties once the owner object is
        deleted.  The order of the 'del' statements and gc.collect() is part
        of what is being tested, so statements must not be reordered.
        """

        dnaseq = testutil.datafile('dnaseq.fasta')
        db = SequenceFileDB(dnaseq)

        try:
            # create cache components
            cacheDict = {}
            cacheHint = db.cacheHint

            # get seq1
            seq1 = db['seq1']

            # _cache is only created on first cache attempt
            assert not hasattr(db, '_cache')

            # build an 'owner' object
            class AnonymousOwner(object):
                pass

            owner = AnonymousOwner()

            # save seq1 in cache
            cacheDict['seq1'] = (seq1.start, seq1.stop)
            cacheHint(cacheDict, owner)
            del cacheDict  # 'owner' now holds reference

            # peek into _cache and assert that only the ival coordinates
            # are stored
            # NOTE(review): indexing .values() assumes it returns a list
            # (Python 2 dict semantics) -- confirm before porting.
            v = db._cache.values()[0]
            assert len(v['seq1']) == 2
            del v

            # force a cache access & check that now we've stored actual string
            ival = str(seq1[5:10])
            v = db._cache.values()[0]
            # ...check that we've stored actual string
            assert len(v['seq1']) == 3

            # again force cache access, this time to the stored sequence string
            ival = str(seq1[5:10])

            # now, eliminate all references to the cache proxy dict
            del owner

            # trash unused objects - not strictly necessary, because there are
            # no islands of circular references & so all objects are already
            # deallocated, but that's implementation dependent.
            gc.collect()

            # ok, cached values should now be gone.
            v = db._cache.values()
            assert len(v) == 0
        finally:
            db.close()
예제 #9
0
    def test_cache(self):
        """Sequence slice cache mechanics.

        Walks through the life cycle of a cache entry: a coordinate hint is
        registered with db.cacheHint, db._cache is inspected before and after
        a slice of seq1 is converted to a string, and the cache is expected
        to be empty once 'owner' is deleted.  Reference lifetimes matter
        here, so keep the 'del' statements where they are.
        """

        dnaseq = testutil.datafile('dnaseq.fasta')
        db = SequenceFileDB(dnaseq)

        try:
            # create cache components
            cacheDict = {}
            cacheHint = db.cacheHint

            # get seq1
            seq1 = db['seq1']

            # _cache is only created on first cache attempt
            assert not hasattr(db, '_cache')

            # build an 'owner' object
            class AnonymousOwner(object):
                pass
            owner = AnonymousOwner()

            # save seq1 in cache
            cacheDict['seq1'] = (seq1.start, seq1.stop)
            cacheHint(cacheDict, owner)
            del cacheDict                   # 'owner' now holds reference

            # peek into _cache and assert that only the ival coordinates
            # are stored
            # NOTE(review): .values()[0] relies on .values() returning a
            # list, as Python 2 dicts do -- confirm before porting.
            v = db._cache.values()[0]
            assert len(v['seq1']) == 2
            del v

            # force a cache access & check that now we've stored actual string
            ival = str(seq1[5:10])
            v = db._cache.values()[0]
            # ...check that we've stored actual string
            assert len(v['seq1']) == 3

            # again force cache access, this time to the stored sequence string
            ival = str(seq1[5:10])

            # now, eliminate all references to the cache proxy dict
            del owner

            # trash unused objects - not strictly necessary, because there are
            # no islands of circular references & so all objects are already
            # deallocated, but that's implementation dependent.
            gc.collect()

            # ok, cached values should now be gone.
            v = db._cache.values()
            assert len(v) == 0
        finally:
            db.close()
예제 #10
0
    def test_basic_iadd(self):
        """Add sequences to self.db with += and check their inverse names.

        The first sequence is expected to be named after the FASTA file
        ('dnaseq.seq1'); the second comes from a db whose filepath has been
        overwritten with 'foo' and is expected to be named 'foo.seq1'.
        """
        fasta_path = testutil.datafile('dnaseq.fasta')
        first_db = SequenceFileDB(fasta_path)
        try:
            seq_a = first_db['seq1']
            self.db += seq_a

            assert seq_a in self.db
            name_a = (~self.db)[seq_a]
            assert name_a == 'dnaseq.seq1', name_a

            # Second database over the same file, with its filepath
            # overridden before the sequence is fetched.
            second_db = SequenceFileDB(fasta_path)
            try:
                second_db.filepath = 'foo'
                seq_b = second_db['seq1']

                self.db += seq_b
                name_b = (~self.db)[seq_b]
                assert name_b == 'foo.seq1', name_b
            finally:
                second_db.close()
        finally:
            first_db.close()
예제 #11
0
    def test_build_seqLenDict_with_bad_reader(self):
        "Check that a reader reporting zero-length records is rejected."

        class InfoBag(object):
            "Plain record whose attributes are the keyword arguments given."

            def __init__(self, **kw):
                for attr, value in kw.items():
                    setattr(self, attr, value)

        # Snapshot every record, but deliberately report its length as 0.
        db = SequenceFileDB(self.dbfile)
        try:
            records = [InfoBag(id=seq_id, length=0, sequence=str(seq))
                       for seq_id, seq in db.items()]
        finally:
            db.close()
        # Erase the existing intermediate files before recreating the db.
        self.trash_intermediate_files()

        # Reader stub that replays the (bad) records captured above.
        def my_fake_reader(fp, filename, info_list=records):
            return info_list

        # Construction through the stub is expected to raise ValueError.
        try:
            db = SequenceFileDB(self.dbfile, reader=my_fake_reader)
            try:
                assert 0, "should not reach here; db construction should fail!"
            finally:
                db.close()
        except ValueError:
            pass  # this is the expected outcome
예제 #12
0
    def test_nlmsaslice_cache(self):
        """NLMSASlice sequence caching & removal

        Builds an in-memory pairwise NLMSA over two sequences, checks that
        fetching a slice from it creates exactly one db._cache entry, and
        checks that the entry (and later db._weakValueDict) empties once the
        referencing objects are deleted.  The 'del' / gc.collect() ordering
        is what is under test; do not reorder these statements.
        """

        # set up sequences
        dnaseq = testutil.datafile('dnaseq.fasta')

        db = SequenceFileDB(dnaseq, autoGC=-1)  # use pure WeakValueDict...
        try:
            gc.collect()
            assert len(
                db._weakValueDict) == 0, '_weakValueDict should be empty'
            seq1, seq2 = db['seq1'], db['seq2']
            assert len(db._weakValueDict)==2, \
                    '_weakValueDict should have 2 seqs'

            # build referencing NLMSA
            mymap = NLMSA('test', 'memory', db, pairwiseMode=True)
            mymap += seq1
            mymap[seq1] += seq2
            mymap.build()

            # check: no cache
            assert not hasattr(db, '_cache'), 'should be no cache yet'

            seq1, seq2 = db['seq1'], db['seq2']  # re-retrieve
            # now retrieve a NLMSASlice, forcing entry of seq into cache
            ival = seq1[5:10]
            x = mymap[ival]

            assert len(db._cache.values()) != 0

            n1 = len(db._cache)
            assert n1 == 1, "should be exactly one cache entry, not %d" % \
                    (n1, )

            # ok, now trash referencing arguments & make sure of cleanup
            del x
            gc.collect()

            assert len(db._cache.values()) == 0

            n2 = len(db._cache)
            assert n2 == 0, '%d objects remain; cache memory leak!' % n2
            # FAIL because of __dealloc__ error in cnestedlist.NLMSASlice.

            # Drop our references, the cache should empty.
            del mymap, ival, seq1, seq2
            gc.collect()
            # check that db._weakValueDict cache is empty
            assert len(
                db._weakValueDict) == 0, '_weakValueDict should be empty'
        finally:
            # close the database whether or not an assertion failed
            db.close()
예제 #13
0
 def test_headerfile_create_conflict(self):
     "A header file combined with a non-empty prefixDict must raise TypeError."
     subdb = SequenceFileDB(self.dbfile)
     try:
         header_path = testutil.datafile('prefixUnionDict-1.txt')
         prefixes = {'foo': subdb}
         try:
             db = PrefixUnionDict(filename=header_path, prefixDict=prefixes)
         except TypeError:
             pass  # the conflict was detected, as expected
         else:
             assert 0, "should not get here"
     finally:
         subdb.close()
예제 #14
0
    def load_resource(self):
        """Load the genome and the RefSeq transcripts used for HGVS work.

        Opens self.genome_fn as a SequenceFileDB stored in self.genome, then
        parses self.refseq_fn with pyhgvs.utils.read_transcripts and stores
        the result in self.refseq.  Progress messages go to stdout.
        """
        # load genome sequence
        print('loading the genome sequence [%s] for HGVS...' % self.genome_fn)
        self.genome = SequenceFileDB(self.genome_fn)
        print('done.')

        # load refseq into dict
        print('loading the refseq transcript [%s] for HGVS...'
              % self.refseq_fn)
        # 'with' closes the file even if parsing the transcripts raises.
        with open(self.refseq_fn, 'r') as fp:
            self.refseq = pyhgvs.utils.read_transcripts(fp)
        print('done.')
예제 #15
0
    def test_nlmsaslice_cache(self):
        """NLMSASlice sequence caching & removal

        Creates an in-memory pairwise NLMSA over seq1/seq2, verifies that
        retrieving a slice from it adds exactly one entry to db._cache, and
        verifies that db._cache and db._weakValueDict empty again once the
        referencing objects are deleted and garbage collected.  Statement
        order (especially 'del' and gc.collect()) is significant.
        """

        # set up sequences
        dnaseq = testutil.datafile('dnaseq.fasta')

        db = SequenceFileDB(dnaseq, autoGC=-1) # use pure WeakValueDict...
        try:
            gc.collect()
            assert len(db._weakValueDict)==0, '_weakValueDict should be empty'
            seq1, seq2 = db['seq1'], db['seq2']
            assert len(db._weakValueDict)==2, \
                    '_weakValueDict should have 2 seqs'

            # build referencing NLMSA
            mymap = NLMSA('test', 'memory', db, pairwiseMode=True)
            mymap += seq1
            mymap[seq1] += seq2
            mymap.build()

            # check: no cache
            assert not hasattr(db, '_cache'), 'should be no cache yet'

            seq1, seq2 = db['seq1'], db['seq2'] # re-retrieve
            # now retrieve a NLMSASlice, forcing entry of seq into cache
            ival = seq1[5:10]
            x = mymap[ival]

            assert len(db._cache.values()) != 0

            n1 = len(db._cache)
            assert n1 == 1, "should be exactly one cache entry, not %d" % \
                    (n1, )

            # ok, now trash referencing arguments & make sure of cleanup
            del x
            gc.collect()

            assert len(db._cache.values()) == 0


            n2 = len(db._cache)
            assert n2 == 0, '%d objects remain; cache memory leak!' % n2
            # FAIL because of __dealloc__ error in cnestedlist.NLMSASlice.

            # Drop our references, the cache should empty.
            del mymap, ival, seq1, seq2
            gc.collect()
            # check that db._weakValueDict cache is empty
            assert len(db._weakValueDict)==0, '_weakValueDict should be empty'
        finally:
            # close the database whether or not an assertion failed
            db.close()
예제 #16
0
 def test_headerfile_create_conflict(self):
     "Passing both a PUD header file and a prefixDict must raise TypeError."
     sub_db = SequenceFileDB(self.dbfile)
     try:
         header_file = testutil.datafile('prefixUnionDict-1.txt')
         try:
             # Both sources of prefixes at once is the conflict under test.
             db = PrefixUnionDict(prefixDict={'foo': sub_db},
                                  filename=header_file)
             assert 0, "should not get here"
         except TypeError:
             pass
     finally:
         sub_db.close()
예제 #17
0
    def test_iadd_db_twice(self):
        "Adding the same sequence to self.db twice must leave its name alone."
        fasta_path = testutil.datafile('dnaseq.fasta')
        seqdb = SequenceFileDB(fasta_path)
        try:
            new_seq = seqdb['seq1']

            names = []
            for _ in range(2):
                # The second += should do nothing...
                self.db += new_seq
                names.append((~self.db)[new_seq])

            # ...leaving the sequence with the same name both times.
            assert names[0] == names[1]
        finally:
            seqdb.close()
예제 #18
0
    def test_build_seqLenDict_with_reader(self):
        "Rebuild the database via an explicit reader and verify its content."

        class InfoBag(object):
            "Record object: keyword arguments become instance attributes."

            def __init__(self, **kw):
                self.__dict__.update(kw)

        # Step 1: read the existing db and remember id/length/sequence.
        saved = []
        original_db = SequenceFileDB(self.dbfile)
        try:
            for seq_id, seq in original_db.items():
                saved.append(
                    InfoBag(id=seq_id, length=len(seq), sequence=str(seq)))
        finally:
            original_db.close()

        # Step 2: erase the existing intermediate files.
        self.trash_intermediate_files()

        # Step 3: a reader that hands back the remembered records.
        def my_fake_reader(fp, filename, info_list=saved):
            return info_list

        # Step 4: recreate the db through that reader and check it.
        rebuilt_db = SequenceFileDB(self.dbfile, reader=my_fake_reader)
        try:
            for seq_id, prefix in (('seq1', 'atggtgtca'),
                                   ('seq2', 'GTGTTGAA')):
                assert str(rebuilt_db.get(seq_id)).startswith(prefix)
        finally:
            rebuilt_db.close()
예제 #19
0
    def test_iadd_db_twice(self):
        "A repeated += of one sequence must not change its inverse name."
        source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = source_db['seq1']

            self.db += seq
            first_name = (~self.db)[seq]

            # Second addition of the very same object: expected to be a no-op.
            self.db += seq
            second_name = (~self.db)[seq]

            assert first_name == second_name
        finally:
            source_db.close()
예제 #20
0
    def test_no_db_info(self):
        """Add a sequence whose db has neither _persistent_id nor filepath.

        With no identifying information on the source db, the sequence is
        expected to be registered under the name 'noname0.seq1'.
        """
        fasta_path = testutil.datafile('dnaseq.fasta')
        seqdb = SequenceFileDB(fasta_path)
        try:
            new_seq = seqdb['seq1']

            # Strip the db of anything that could name it.
            assert getattr(seqdb, '_persistent_id', None) is None
            delattr(seqdb, 'filepath')

            self.db += new_seq
            inverse = ~self.db
            name = inverse[new_seq]
            assert name == 'noname0.seq1'
        finally:
            seqdb.close()
예제 #21
0
    def test_no_db_info(self):
        """Check the fallback name given to a sequence from an anonymous db.

        The source db has no _persistent_id and its filepath attribute is
        removed, so the expected name is 'noname0.seq1'.
        """
        anonymous_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = anonymous_db['seq1']

            persistent_id = getattr(anonymous_db, '_persistent_id', None)
            assert persistent_id is None
            del anonymous_db.filepath

            self.db += seq
            name = (~self.db)[seq]
            assert name == 'noname0.seq1'
        finally:
            anonymous_db.close()
예제 #22
0
    def test_inverse_noadd_behavior(self):
        """With addAll=False, inverse lookup of an unknown seq raises KeyError.

        Compare with test_inverse_add_behavior.
        """
        db = SeqPrefixUnionDict(addAll=False)
        fasta_path = testutil.datafile('dnaseq.fasta')
        seqdb = SequenceFileDB(fasta_path)
        try:
            seq = seqdb['seq1']

            try:
                name = (~db)[seq]
            except KeyError:
                pass  # the sequence was not added, as expected
            else:
                assert 0, "should not get here"
        finally:
            seqdb.close()
예제 #23
0
    def test_inverse_noadd_behavior(self):
        """Inverse lookup must fail with KeyError when addAll is False.

        Counterpart of test_inverse_add_behavior.
        """
        union = SeqPrefixUnionDict(addAll=False)
        source_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            foreign_seq = source_db['seq1']
            inverse = ~union

            try:
                name = inverse[foreign_seq]
                assert 0, "should not get here"
            except KeyError:
                pass
        finally:
            source_db.close()
예제 #24
0
 def test_no_file_given(self):
     "SequenceFileDB() called without a file must raise TypeError."
     try:
         db = SequenceFileDB()
     except TypeError:
         pass  # expected: there is no file to open
     else:
         assert 0, "should not reach this point"
예제 #25
0
    def test_headerfile_write_fail(self):
        """writeHeaderFile is expected to fail for a sub-db with no filepath.

        The sub-database has its 'filepath' attribute deleted before it is
        wrapped in a PrefixUnionDict; writing a header file for that union
        must then raise AttributeError.
        """
        subdb = SequenceFileDB(self.dbfile)
        try:
            del subdb.filepath  # remove 'filepath' attribute for test
            db = PrefixUnionDict({'prefix': subdb})

            assert len(db) == 2
            assert 'prefix.seq1' in db

            output = testutil.tempdatafile('prefixUnionDict-write-fail.txt')
            try:
                db.writeHeaderFile(output)
            except AttributeError:
                pass  # the expected failure
            else:
                # Without this the test could never fail: a write that
                # silently succeeded used to be reported as a pass.
                assert 0, "should not get here"
        finally:
            subdb.close()
예제 #26
0
def translate(variant, transcripts, get_transcript):
    """Parse an HGVS name against hg19 and return its genomic coordinates.

    variant: the HGVS name passed to hgvs.parse_hgvs_name.
    transcripts: accepted for interface compatibility; not used here.
    get_transcript: callable handed to hgvs.parse_hgvs_name for transcript
        lookup.

    Returns the (chrom, offset, ref, alt) tuple on success, or the integer 1
    if parsing raises.
    """
    genome = SequenceFileDB('hg19.fa')  # pip install bsddb3 is required
    try:
        chrom, offset, ref, alt = hgvs.parse_hgvs_name(
            variant, genome, get_transcript=get_transcript)
    except Exception:
        # Keep the historical "return 1 on failure" contract, but no longer
        # swallow KeyboardInterrupt / SystemExit as the bare except did.
        return 1
    finally:
        # Release the genome database on every path.
        genome.close()
    return chrom, offset, ref, alt
예제 #27
0
    def __init__(self,
                 lookup=None,
                 filename=None,
                 db_filename=None,
                 default_seq=None):
        """
        Create a mock genome object with a pygr-compatible interface.

        lookup: a list of ((chrom, start, end), seq) values that define
            a lookup table for genome sequence requests.
        filename: a stream or filename containing a lookup table.
        db_filename: a fasta file to use for genome sequence requests.  All
            requests are recorded and can be written to a lookup table file
            using the `write` method.
        default_seq: if given, this base will always be returned if
            region is unavailable.

        Raises ValueError if db_filename is given but pygr could not be
        imported (SequenceFileDB is None).
        """
        if lookup is None:
            lookup = {}

        self._chroms = {}
        self._lookup = lookup
        self._genome = None
        self._default_seq = default_seq

        # A real genome database takes precedence over a lookup-table file.
        if db_filename:
            if SequenceFileDB is None:
                raise ValueError('pygr is not available.')
            self._genome = SequenceFileDB(db_filename)
            return

        if filename:
            # Read genome sequence from lookup table.
            self.read(filename)
예제 #28
0
    def test_headerfile_write_fail(self):
        """Writing a header file must fail when a sub-db has no filepath.

        'filepath' is deleted from the sub-database before the union is
        built; writeHeaderFile on that union is then expected to raise
        AttributeError.
        """
        subdb = SequenceFileDB(self.dbfile)
        try:
            del subdb.filepath  # remove 'filepath' attribute for test
            db = PrefixUnionDict({'prefix': subdb})

            assert len(db) == 2
            assert 'prefix.seq1' in db

            output = testutil.tempdatafile('prefixUnionDict-write-fail.txt')
            try:
                db.writeHeaderFile(output)
            except AttributeError:
                pass  # the expected failure
            else:
                # Previously a write that succeeded was silently accepted,
                # so this test could not detect a regression.
                assert 0, "should not get here"
        finally:
            subdb.close()
예제 #29
0
def codetest():
    """Test the code here before adding to doctest @CTB

    Opens data/partial-yeast.fasta, fetches 'chr02' and slices it with a
    start coordinate larger than the stop coordinate.  Nothing is returned
    or asserted; the function is a scratch pad.
    """
    import pygr
    from pygr.seqdb import SequenceFileDB
    db = SequenceFileDB(os.path.join('data', 'partial-yeast.fasta'))
    try:
        chr02 = db['chr02']
        # NOTE(review): start > stop looks deliberate (probing how such a
        # slice is handled) -- confirm with the author.
        start, stop = (87787, 86719)
        x = chr02[start:stop]
    finally:
        # Close the database even if the slice above raises.
        db.close()
예제 #30
0
def main():
    """Convert a tab-separated variant file, adding genomic coordinates.

    Command-line options:
      -i  path of the input file to read
      -o  path of the output file to write
      -g  path of the genome FASTA file opened with SequenceFileDB
      -r  path of the refGene file, stored in the module-level REFGENE

    The first input line is treated as the header and is used to locate the
    columns named in COLUMNS_TO_SAVE.  Every following line is cleaned up,
    converted with convert_HGVS and written to the output with the genomic
    coordinate inserted as the second column and the protein HGVS appended.

    Raises whatever convert_HGVS raises for a row whose HGVS value does not
    contain ';'.
    """
    global REFGENE

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--readable_input',
                        help='readable input file for conversion.')
    parser.add_argument('-o', '--writable_output',
                        help='writable output file for conversion.')
    parser.add_argument('-g', '--genome_path', help='Link to hg38.fa.')
    parser.add_argument('-r', '--reference_genome',
                        default='./hg38.BRCA.refGene.txt',
                        help='Link to hg38.BRCA.refgene.txt.')

    args = parser.parse_args()
    GENOME = SequenceFileDB(args.genome_path)
    REFGENE = args.reference_genome

    # A tab together with any spaces padding it; compiled once, not per line.
    tab_padding = re.compile("( )*\t( )*")

    try:
        # 'with' closes both files even if a row fails to convert.
        with open(args.readable_input, "r") as f_in, \
                open(args.writable_output, "w") as f_out:
            f_out.write("\t".join(OUTPUT_COLUMNS) + "\n")
            for index, line in enumerate(f_in):
                # Remove leading or trailing spaces adjacent to tabs.
                line = tab_padding.sub("\t", line)
                # dtype=object keeps full-length Python strings; a default
                # fixed-width string array would silently truncate any
                # value assigned below that is longer than the row's
                # longest field.
                items = np.array(line.rstrip().split("\t"), dtype=object)
                if index == 0:
                    # Handle column names
                    columns = np.array([i.replace(" ", "_") for i in items])
                    index_to_save = [np.where(columns == i)[0][0]
                                     for i in COLUMNS_TO_SAVE]
                    column_idx = dict(zip(COLUMNS_TO_SAVE, index_to_save))
                    continue

                # In the date last evaluated field, drop the time part if
                # one is provided.
                date_idx = column_idx["Date_last_evaluated"]
                items[date_idx] = items[date_idx].split(' ')[0]

                OMIM_id_index = column_idx["Condition_ID_value"]
                items[OMIM_id_index] = convert_OMIM_id(items[OMIM_id_index])

                ref_idx = column_idx["Reference_sequence"]
                hgvs_idx = column_idx["HGVS"]
                items[hgvs_idx] = cleanup_HGVS(items[ref_idx],
                                               items[hgvs_idx], HP, EVM)
                HGVS_cDNA = items[ref_idx] + ":" + items[hgvs_idx]
                print("%s %s %s" % (items[ref_idx], items[hgvs_idx],
                                    HGVS_cDNA))

                try:
                    genome_coor, HGVS_p = convert_HGVS(HGVS_cDNA, GENOME)
                except Exception:
                    # Only HGVS values containing ';' get placeholder
                    # output.  Any other failure used to fall through and
                    # write the previous row's coordinates (or hit a
                    # NameError on the first row); re-raise instead.
                    if ";" not in items[hgvs_idx]:
                        raise
                    genome_coor, HGVS_p = create_None_filler()

                aa_abrev_index = column_idx["Abbrev_AA_change"]
                if HGVS_p not in ["p.?", "p.(=)", "None"]:
                    if items[aa_abrev_index] == '':
                        items[aa_abrev_index] = HGVS_p_to_AA_abrev(HGVS_p)

                final_items = list(items[index_to_save])
                final_items.insert(1, genome_coor)
                final_items.append(HGVS_p)
                f_out.write("\t".join(final_items) + "\n")
    finally:
        GENOME.close()
예제 #31
0
    def test_build_seqLenDict_with_bad_reader(self):
        "Construction must raise ValueError when the reader reports length 0."

        class InfoBag(object):
            "Record object: keyword arguments become instance attributes."

            def __init__(self, **kw):
                self.__dict__.update(kw)

        # Step 1: read the existing db, recording a bogus length of 0.
        bad_records = []
        original_db = SequenceFileDB(self.dbfile)
        try:
            for seq_id, seq in original_db.items():
                bad_records.append(
                    InfoBag(id=seq_id, length=0, sequence=str(seq)))
        finally:
            original_db.close()

        # Step 2: erase the existing intermediate files.
        self.trash_intermediate_files()

        # Step 3: a reader that hands back the bogus records.
        def my_fake_reader(fp, filename, info_list=bad_records):
            return info_list

        # Step 4: building through that reader is expected to fail.
        try:
            db = SequenceFileDB(self.dbfile, reader=my_fake_reader)
            try:
                assert 0, "should not reach here; db construction should fail!"
            finally:
                db.close()
        except ValueError:
            pass  # ValueError is the expected result
예제 #32
0
    def test_basic_iadd(self):
        """Verify the names assigned to sequences added to self.db with +=.

        A sequence from the dnaseq FASTA db should be named 'dnaseq.seq1';
        one from a db whose filepath was changed to 'foo' should be named
        'foo.seq1'.
        """
        dnaseq = testutil.datafile('dnaseq.fasta')
        seqdb = SequenceFileDB(dnaseq)
        try:
            new_seq = seqdb['seq1']
            self.db += new_seq
            assert new_seq in self.db

            inverse = ~self.db
            name = inverse[new_seq]
            assert name == 'dnaseq.seq1', name

            # Repeat with a second db whose filepath is munged for testing.
            seqdb2 = SequenceFileDB(dnaseq)
            try:
                seqdb2.filepath = 'foo'
                new_seq2 = seqdb2['seq1']
                self.db += new_seq2

                inverse2 = ~self.db
                name2 = inverse2[new_seq2]
                assert name2 == 'foo.seq1', name2
            finally:
                seqdb2.close()
        finally:
            seqdb.close()
예제 #33
0
def initialize(hg_fasta, snpeff_predictor_bin):
    '''Populate the module-level genome and transcript globals.

    hg_fasta: path opened with SequenceFileDB and stored in __GENOME__.
    snpeff_predictor_bin: gzip-compressed file read with
        read_snpeff_transcripts; the result is stored in __TRANSCRIPTS__.
    '''
    global __GENOME__, __TRANSCRIPTS__

    __GENOME__ = SequenceFileDB(hg_fasta)
    with gzip.open(snpeff_predictor_bin) as predictor_file:
        __TRANSCRIPTS__ = read_snpeff_transcripts(predictor_file)
예제 #34
0
def test_invalid_coordinates():
    """
    Regression test for 17

    Parses a genomic deletion whose start coordinate is larger than its end
    coordinate.  No result is asserted; the parse only has to complete.
    The test is skipped when SequenceFileDB is unavailable.
    """
    if not SequenceFileDB:
        raise nose.SkipTest

    genome = SequenceFileDB('pyhgvs/tests/data/test_refseqs.fa')
    try:
        hgvs_name = 'NC_000005.10:g.177421339_177421327delACTCGAGTGCTCC'
        parse_hgvs_name(hgvs_name, genome, get_transcript=get_transcript)
    finally:
        # Release the genome database even if parsing raises.
        genome.close()
예제 #35
0
def main():
    """Convert a tab-separated variant file, adding genomic coordinates.

    Command-line options:
      -i  input file (opened for reading by argparse)
      -o  output file (opened for writing by argparse)
      -g  path of the genome FASTA file opened with SequenceFileDB
      -r  path of the refGene file, stored in the module-level REFGENE

    The first input line is the header and is used to locate the columns
    named in COLUMNS_TO_SAVE.  Each following line is converted with
    convert_HGVS and written out with the genomic coordinate inserted as the
    second column and the protein HGVS appended.

    Raises whatever convert_HGVS raises for a row whose HGVS value does not
    contain ';'.
    """
    global REFGENE

    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        '--readable_input',
                        type=argparse.FileType('r'),
                        help='Opened readable input file for conversion.')
    parser.add_argument('-o',
                        '--writable_output',
                        type=argparse.FileType('w'),
                        help='Opened writable output file for conversion.')
    parser.add_argument('-g', '--genome_path', help='Link to hg38.fa.')
    parser.add_argument('-r',
                        '--reference_genome',
                        default='./hg38.BRCA.refGene.txt',
                        help='Link to hg38.BRCA.refgene.txt.')

    args = parser.parse_args()
    GENOME = SequenceFileDB(args.genome_path)
    REFGENE = args.reference_genome

    f_out = args.writable_output
    f_in = args.readable_input
    try:
        f_out.write("\t".join(OUTPUT_COLUMNS) + "\n")
        for index, line in enumerate(f_in):
            # dtype=object keeps full-length Python strings; a default
            # fixed-width string array would silently truncate any value
            # assigned below that is longer than the row's longest field.
            items = np.array(line.rstrip().split("\t"), dtype=object)
            if index == 0:
                # Handle column names
                columns = np.array([i.replace(" ", "_") for i in items])
                index_to_save = [
                    np.where(columns == i)[0][0] for i in COLUMNS_TO_SAVE
                ]
                column_idx = dict(zip(COLUMNS_TO_SAVE, index_to_save))
                continue

            OMIM_id_index = column_idx["Condition_ID_value"]
            items[OMIM_id_index] = convert_OMIM_id(items[OMIM_id_index])

            hgvs_idx = column_idx["HGVS"]
            HGVS_cDNA = (items[column_idx["Reference_sequence"]] + ":" +
                         items[hgvs_idx])
            try:
                genome_coor, HGVS_p = convert_HGVS(HGVS_cDNA, GENOME)
            except Exception:
                # Only HGVS values containing ';' get placeholder output.
                # Any other failure used to fall through and write the
                # previous row's coordinates (or hit a NameError on the
                # first row); re-raise instead.
                if ";" not in items[hgvs_idx]:
                    raise
                genome_coor, HGVS_p = create_None_filler()

            aa_abrev_index = column_idx["Abbrev_AA_change"]
            if HGVS_p not in ["p.?", "p.(=)", "None"]:
                if items[aa_abrev_index] == '':
                    items[aa_abrev_index] = HGVS_p_to_AA_abrev(HGVS_p)

            final_items = list(items[index_to_save])
            final_items.insert(1, genome_coor)
            final_items.append(HGVS_p)
            f_out.write("\t".join(final_items) + "\n")
    finally:
        # Close everything on success and on failure alike.
        for handle in (f_in, f_out):
            if handle is not None:
                handle.close()
        GENOME.close()
예제 #36
0
 def __init__(self, inFile, genome, vkey=False, verbose=False,
              log=sys.stderr):
     """Store the run options and open the genome database.

     inFile: stored unchanged as self.inFile.
     genome: passed to SequenceFileDB; the opened db becomes self.genome.
     vkey, verbose: flags stored unchanged on the instance.
     log: stream stored as self.log (sys.stderr by default).
     """
     self.inFile = inFile
     self.verbose = verbose
     self.log = log
     self.vkey = vkey
     self.genome = SequenceFileDB(genome)
     # Prefix for messages: the concrete class name in square brackets.
     self.infoHeader = "[%s]" % self.__class__.__name__
예제 #37
0
def get_genome_coor(hgvs_c):
    """Convert an HGVS name into a 'chrom:offset:ref>alt' string.

    hgvs_c: the HGVS name handed to pyhgvs.parse_hgvs_name.

    The hg19 genome and the refGene transcript table are loaded on every
    call.
    """
    genome = SequenceFileDB('data/hg19.fa')
    try:
        # NOTE(review): absolute path to one developer's machine -- this
        # will not resolve anywhere else; consider making it configurable.
        refGene = "/Users/Molly/Desktop/web-dev/hgvs_counsyl/hgvs/pyhgvs/data/genes.refGene"
        with open(refGene) as infile:
            transcripts = pyhgvs_utils.read_transcripts(infile)

        def get_transcript(name):
            return transcripts.get(name)

        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
            hgvs_c, genome, get_transcript=get_transcript)
    finally:
        # Release the genome database whether or not parsing succeeded.
        genome.close()
    return chrom + ":" + str(offset) + ":" + ref + ">" + alt
예제 #38
0
def HGVS_to_GenomeCoor(HGVS):
    """Convert an HGVS name into a 'chrom:offset:ref>alt' string.

    Uses counsyl pyhgvs for the conversion.  The hg19 genome and the BRCA1/2
    refGene table are loaded from ../data on every call.

    HGVS: the HGVS name handed to pyhgvs.parse_hgvs_name.
    """
    genome = SequenceFileDB('../data/hg19.fa')
    try:
        refGene = "../data/BRCA12.refGene.txt"
        with open(refGene) as infile:
            transcripts = pyhgvs_utils.read_transcripts(infile)

        def get_transcript(name):
            return transcripts.get(name)

        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
            HGVS, genome, get_transcript=get_transcript)
    finally:
        # Release the genome database whether or not parsing succeeded.
        genome.close()
    genome_coordinate = chrom + ":" + str(offset) + ":" + ref + ">" + alt
    return genome_coordinate
예제 #39
0
    def __init__(self):
        """
        Load the hg19 reference genome and the RefSeq transcript table.

        Both files are looked up in the 'resources' directory next to this
        module; the results are stored in self.genome and self.transcripts.
        """
        resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
        genome_path = os.path.join(resources_dir, 'hg19.fa')
        refseq_path = os.path.join(resources_dir, 'genes.refGene')

        # Genome sequence, read using pygr.
        self.genome = SequenceFileDB(genome_path)

        # RefSeq transcripts, read into a python dict.
        with open(refseq_path) as refseq_file:
            self.transcripts = hgvs.utils.read_transcripts(refseq_file)
예제 #40
0
def main(argv=None):
    """Scan every sequence of a genome for matches to a position weight matrix.

    argv: argument list for the parser built by make_parser (None lets
        argparse fall back to sys.argv).

    The matrix is assembled from args.consensus (scored via IUPAC_SCORES)
    followed by args.required_3p_seq (scored via REQUIRED_SCORES).  Each
    match is written to args.outfile as a tab-separated line: sequence name,
    start, stop, matched sequence, score, strand.
    """
    parser = make_parser()
    args = parser.parse_args(argv)
    genome = SequenceFileDB(args.genome)

    try:
        scores = [IUPAC_SCORES[l] for l in args.consensus]
        scores.extend(REQUIRED_SCORES[l] for l in args.required_3p_seq)
        pwm = motility.PWM(scores)

        # find all matches
        with open(args.outfile, 'w') as outfile:
            for chrom in genome.keys():
                chromseq = str(genome[chrom])
                print("searching  %s of length %s" % (chrom, len(chromseq)))
                if len(chromseq) < len(pwm):
                    print('chromosome/fragment %s is too short' % (chrom,))
                    continue
                matches = pwm.find(chromseq, -args.mismatches)
                for start, stop, strand, seq in matches:
                    score = pwm.calc_score(seq)
                    outfile.write('\t'.join(
                        [chrom, str(start), str(stop), seq, str(score),
                         '+' if strand == 1 else '-']) + '\n')
    finally:
        # Release the genome database even if the scan fails part-way.
        genome.close()
예제 #41
0
    def test_iadd_duplicate_seqdb(self):
        """Adding 'seq1' from a second db over the same file raises ValueError.

        Two independent SequenceFileDB objects are opened on the same FASTA
        file; after the first one's 'seq1' has been added to self.db, adding
        the second one's 'seq1' is expected to fail.
        """
        fasta_path = testutil.datafile('dnaseq.fasta')
        first_db = SequenceFileDB(fasta_path)
        try:
            second_db = SequenceFileDB(fasta_path)
            try:
                seq_a = first_db['seq1']
                seq_b = second_db['seq1']

                self.db += seq_a
                try:
                    self.db += seq_b
                except ValueError:
                    pass  # the duplicate was rejected, as expected
                else:
                    assert 0, "should never reach this point"
            finally:
                second_db.close()
        finally:
            first_db.close()
예제 #42
0
def test_name_to_variant_refseqs():
    """
    Convert HGVS names to variant coordinates using refseqs directly.
    """
    if not SequenceFileDB:
        print 'skip test_name_to_variant_refseqs'
        return
    genome = SequenceFileDB('pyhgvs/tests/data/test_refseqs.fa')

    for hgvs_name, variant, name_canonical, var_canonical in _name_variants:
        if not var_canonical or 'NM_' not in hgvs_name:
            # Only test transcript HGVS names.
            continue
        hgvs_variant = parse_hgvs_name(hgvs_name,
                                       genome,
                                       get_transcript=get_transcript)
        nose.tools.assert_equal(hgvs_variant, variant,
                                repr([hgvs_name, variant, hgvs_variant]))
예제 #43
0
    def test_pyhgvs_cdna_coordinate_correct(self):
        """Check that each record's cDNA name parses to its genomic coordinate.

        For every record in self.data, 'pyhgvs_cDNA' is parsed with pyhgvs
        and the result, formatted as 'chrom:g.offset:ref>alt', must equal
        the record's 'pyhgvs_Genomic_Coordinate_38' value.
        """
        # Neither the genome nor the transcript table depends on the record
        # being checked, so both are loaded once up front instead of once
        # per record (genome) and once per lookup (transcripts).
        genome = SequenceFileDB('../reference_genome/hg38/hg38.fa')
        try:
            with open("../refgene38_brca.txt") as infile:
                transcripts = pyhgvs_utils.read_transcripts(infile)

            def get_transcript(name):
                return transcripts.get(name)

            for i in self.data:
                pyhgvs_coord = i['pyhgvs_Genomic_Coordinate_38']
                pyhgvs_cDNA = i['pyhgvs_cDNA']

                chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                    pyhgvs_cDNA, genome, get_transcript=get_transcript)
                test_coord = chrom + ":" + "g." + str(
                    offset) + ":" + ref + ">" + alt

                self.assertEqual(pyhgvs_coord, test_coord)
        finally:
            # Release the genome file even if an assertion fails.
            genome.close()
예제 #44
0
    def __init__(self):
        """
        Initializes hg19 reference and reference transcripts
        """
        #genome_path = '/Users/charlesmarkello/leidenv1.0/resources/hg38.fa'
        #refseq_path = '/Users/charlesmarkello/leidenv1.0/resources/refGene.txt'
        #genome_path = '/Users/charlesmarkello/leidenv1.0/resources/hg19.fa'
        #refseq_path = '/Users/charlesmarkello/leidenv1.0/resources/genes.refGene'
        genome_path = os.path.join(os.path.dirname(__file__), 'resources',
                                   'hg19.fa')
        refseq_path = os.path.join(os.path.dirname(__file__), 'resources',
                                   'genes.refGene')
        print 'genome_path: ', genome_path

        # Read genome sequence using pygr.
        self.genome = SequenceFileDB(genome_path)

        # Read RefSeq transcripts into a python dict.
        with open(refseq_path) as infile:
            self.transcripts = pyhgvs.utils.read_transcripts(infile)
예제 #45
0
    def test_iadd_duplicate_seqdb(self):
        "A second db on the same file may not contribute a duplicate 'seq1'."
        datafile_path = testutil.datafile('dnaseq.fasta')
        db_one = SequenceFileDB(datafile_path)
        try:
            db_two = SequenceFileDB(datafile_path)
            try:
                seq_one = db_one['seq1']
                seq_two = db_two['seq1']

                self.db += seq_one
                try:
                    self.db += seq_two
                except ValueError:
                    pass    # rejected, as it should be
                else:
                    assert 0, "should never reach this point"
            finally:
                db_two.close()
        finally:
            db_one.close()
예제 #46
0
class SequenceFileDB_Test(unittest.TestCase):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB',
    among other things.
    """

    def setUp(self):
        "Test setup"
        dnaseq = testutil.datafile('dnaseq.fasta')
        self.db = SequenceFileDB(dnaseq) # contains 'seq1', 'seq2'

        self.db._weakValueDict.clear()   # clear the cache

    def tearDown(self):
        self.db.close() # must close SequenceFileDB!

    def test_len(self):
        assert len(self.db) == 2

    def test_seqInfoDict_len(self):
        assert len(self.db.seqInfoDict) == 2

    def test_no_file_given(self):
        "Make sure that a TypeError is raised when no file is available"
        try:
            db = SequenceFileDB()
            assert 0, "should not reach this point"
        except TypeError:
            pass

    def test_seq_descriptor(self):
        "Check the '.seq' attribute (tied to a descriptor)"
        s = self.db['seq1']
        assert str(s) == str(s.seq)

    def test_cache(self):
        "SequenceDB cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # cache functions?
        seq1_try2 = self.db['seq1']
        assert seq1 is seq1_try2

    def test_clear_cache(self):
        "SequenceDB clear_cache test"
        assert len(self.db._weakValueDict) == 0
        seq1 = self.db['seq1']

        # cache populated?
        assert len(self.db._weakValueDict) == 1
        assert 'seq1' in self.db._weakValueDict

        # clear_cache functions?
        self.db.clear_cache()
        seq1_try3 = self.db['seq1']
        assert seq1 is not seq1_try3

    def test_keys(self):
        "SequenceFileDB keys"
        k = self.db.keys()
        k.sort()
        assert k == ['seq1', 'seq2']

    def test_contains(self):
        "SequenceFileDB contains"
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_invert_class(self):
        "SequenceFileDB __invert__"
        seq = self.db['seq1']
        inversedb = ~self.db
        assert inversedb[seq] == 'seq1'
        assert seq in inversedb
        assert 'foo' not in inversedb

    def test_keys_info(self):
        "SequenceFileDB keys info"
        k = self.db.seqInfoDict.keys()
        k.sort()
        assert k == ['seq1', 'seq2']

    def test_contains_info(self):
        "SequenceFileDB contains info"
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict

    def test_has_key(self):
        "SequenceFileDB has key"
        assert 'seq1' in self.db
        assert 'seq2' in self.db
        assert 'foo' not in self.db

    def test_get(self):
        "SequenceFileDB get"
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')

    def test_items(self):
        "SequenceFileDB items"
        i = [k for (k, v) in self.db.items()]
        i.sort()
        assert i == ['seq1', 'seq2']

    def test_iterkeys(self):
        "SequenceFileDB iterkeys"
        kk = self.db.keys()
        kk.sort()
        ik = list(self.db.iterkeys())
        ik.sort()
        assert kk == ik

    def test_itervalues(self):
        "SequenceFileDB itervalues"
        kv = self.db.values()
        kv.sort()
        iv = list(self.db.itervalues())
        iv.sort()
        assert kv == iv

    def test_iteritems(self):
        "SequenceFileDB iteritems"
        ki = self.db.items()
        ki.sort()
        ii = list(self.db.iteritems())
        ii.sort()
        assert ki == ii

    def test_readonly(self):
        "SequenceFileDB readonly"
        try:
            self.db.copy()          # what should 'copy' do on SequenceFileDB?
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.clear()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.setdefault('foo')
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.pop()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.popitem()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.update({})
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass

    # test some things other than dict behavior
    def test_keyerror(self):
        """SequenceFileDB keyerror.
        Make sure that the SequenceFileDB KeyError is informative."""
        try:
            self.db['foo']
        except KeyError, e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)
예제 #47
0
 def setUp(self):
     "Open the 'dnaseq.fasta' database and wrap it under the key 'prefix'."
     # The data file contains 'seq1' and 'seq2'.
     self.seqdb = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
     self.db = SeqPrefixUnionDict({'prefix': self.seqdb})
예제 #48
0
class SeqPrefixUnionDict_Test(unittest.TestCase):
    """
    Exercise SeqPrefixUnionDict: in-place addition of sequences ('+=') and
    lookup of sequence names through the inverse mapping ('~').
    """

    def setUp(self):
        "Open the 'dnaseq.fasta' database and wrap it under the key 'prefix'."
        # The data file contains 'seq1' and 'seq2'.
        self.seqdb = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        self.db = SeqPrefixUnionDict({'prefix': self.seqdb})

    def tearDown(self):
        "Close the database opened by setUp."
        self.seqdb.close()

    def test_basic_iadd(self):
        "Sequences added from different dbs end up under different names."
        fasta_path = testutil.datafile('dnaseq.fasta')
        first_db = SequenceFileDB(fasta_path)
        try:
            seq_a = first_db['seq1']
            self.db += seq_a

            assert seq_a in self.db
            name_a = (~self.db)[seq_a]
            assert name_a == 'dnaseq.seq1', name_a

            # Second database on the same file, with its filepath munged
            # for testing.
            second_db = SequenceFileDB(fasta_path)
            try:
                second_db.filepath = 'foo'
                seq_b = second_db['seq1']

                self.db += seq_b
                name_b = (~self.db)[seq_b]
                assert name_b == 'foo.seq1', name_b
            finally:
                second_db.close()
        finally:
            first_db.close()
        # NOTE: the specific names (which are based on filepath) matter
        # less than the fact that the sequences receive different names
        # when they are added.

    def test_iadd_db_twice(self):
        "Adding the same sequence twice leaves its name unchanged."
        other_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = other_db['seq1']

            self.db += seq
            name_before = (~self.db)[seq]

            self.db += seq              # the second add should do nothing...
            name_after = (~self.db)[seq]
            assert name_before == name_after    # ...so the name is the same.
        finally:
            other_db.close()

    def test_iadd_user_seq(self):
        "Stand-alone Sequence objects are named with a 'user' prefix."
        first = Sequence('ATGGCAGG', 'foo')
        self.db += first
        assert (~self.db)[first] == 'user.foo'      # created a new 'user' db.

        # A second user sequence must not wipe out the old 'user' db...
        second = Sequence('ATGGCAGG', 'foo2')
        self.db += second
        assert (~self.db)[second] == 'user.foo2'

        # ...so the first sequence is still there under its name.
        assert (~self.db)[first] == 'user.foo'

    def test_iadd_duplicate_seqdb(self):
        "Adding 'seq1' from a second db opened on the same file must fail."
        fasta_path = testutil.datafile('dnaseq.fasta')
        first_db = SequenceFileDB(fasta_path)
        try:
            second_db = SequenceFileDB(fasta_path)
            try:
                seq_a = first_db['seq1']
                seq_b = second_db['seq1']

                self.db += seq_a
                try:
                    self.db += seq_b
                except ValueError:
                    pass    # the expected outcome
                else:
                    assert 0, "should never reach this point"
            finally:
                second_db.close()
        finally:
            first_db.close()

    def test_no_db_info(self):
        "A db with no persistent id and no filepath gets a 'noname' name."
        nameless_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = nameless_db['seq1']

            # Confirm there is no persistent id, then drop the filepath too.
            assert getattr(nameless_db, '_persistent_id', None) is None
            del nameless_db.filepath

            self.db += seq
            assert (~self.db)[seq] == 'noname0.seq1'
        finally:
            nameless_db.close()

    def test_inverse_add_behavior(self):
        "Inverse lookup of a sequence that was never added must not raise."
        other_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = other_db['seq1']

            # Only the absence of an exception is checked here.
            (~self.db)[seq]
        finally:
            other_db.close()

    def test_inverse_noadd_behavior(self):
        "With addAll=False, inverse lookup of an unknown sequence fails."
        # compare with test_inverse_add_behavior...
        union = SeqPrefixUnionDict(addAll=False)
        other_db = SequenceFileDB(testutil.datafile('dnaseq.fasta'))
        try:
            seq = other_db['seq1']

            try:
                (~union)[seq]
            except KeyError:
                pass    # the expected outcome
            else:
                assert 0, "should not get here"
        finally:
            other_db.close()
예제 #49
0
    def setUp(self):
        "Open the 'dnaseq.fasta' test database and empty its sequence cache."
        fasta_path = testutil.datafile('dnaseq.fasta')
        # The data file contains 'seq1' and 'seq2'.
        self.db = SequenceFileDB(fasta_path)

        # Start every test with an empty cache.
        cache = self.db._weakValueDict
        cache.clear()
예제 #50
0
파일: seqdb_test.py 프로젝트: ctb/pygr
 def setup(self):
     "Open the 'dnaseq' sequence database used by the tests."
     seq_db = SequenceFileDB('dnaseq')
     # The database contains 'seq1' and 'seq2'.
     self.db = seq_db
예제 #51
0
파일: seqdb_test.py 프로젝트: ctb/pygr
class SequenceFileDB_Test(object):
    """
    Test for all of the basic dictionary functions on 'SequenceFileDB'.
    """
    def setup(self):
        self.db = SequenceFileDB('dnaseq')     # contains 'seq1', 'seq2'
    def keys_test(self):
        k = self.db.keys()
        k.sort()
        assert k == ['seq1', 'seq2']
    def contains_test(self):
        assert 'seq1' in self.db, self.db.keys()
        assert 'seq2' in self.db
        assert 'foo' not in self.db
    def keys_info_test(self):
        k = self.db.seqInfoDict.keys()
        k.sort()
        assert k == ['seq1', 'seq2']
    def contains_info_test(self):
        assert 'seq1' in self.db.seqInfoDict
        assert 'seq2' in self.db.seqInfoDict
        assert 'foo' not in self.db.seqInfoDict
    def has_key_test(self):
        assert self.db.has_key('seq1')
        assert self.db.has_key('seq2')
        assert not self.db.has_key('foo')
    def get_test(self):
        assert self.db.get('foo') is None
        assert self.db.get('seq1') is not None
        assert str(self.db.get('seq1')).startswith('atggtgtca')
        assert self.db.get('seq2') is not None
        assert str(self.db.get('seq2')).startswith('GTGTTGAA')
    def items_test(self):
        i = [ k for (k,v) in self.db.items() ]
        i.sort()
        assert i == ['seq1', 'seq2']
    def iterkeys_test(self):
        kk = self.db.keys()
        kk.sort()
        ik = list(self.db.iterkeys())
        ik.sort()
        assert kk == ik
    def itervalues_test(self):
        kv = self.db.values()
        kv.sort()
        iv = list(self.db.itervalues())
        iv.sort()
        assert kv == iv
    def iteritems_test(self):
        ki = self.db.items()
        ki.sort()
        ii = list(self.db.iteritems())
        ii.sort()
        assert ki == ii
    def readonly_test(self):
        try:
            self.db.copy()              # what should 'copy' do on SequenceFileDB?
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.clear()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.setdefault('foo')
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.pop()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.popitem()
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
        try:
            self.db.update({})
            assert 0, 'this method should raise NotImplementedError'
        except NotImplementedError:
            pass
            
    # test some things other than dict behavior
    def keyerror_test(self):
        "Make sure that the SequenceFileDB KeyError is informative."
        try:
            self.db['foo']
        except KeyError, e:
            assert "no key 'foo' in database <SequenceFileDB" in str(e), str(e)