Exemplo n.º 1
0
    def test_build(self):
        """Test building an NLMSA and querying results

        Builds the dm2_multiz15way NLMSA from MAF files, stores it in
        pygr.Data and reloads it.  For every interval line of the
        splice-site input file the two bases at the interval start and the
        two bases at the interval end are queried, the resulting alignment
        edges are written as a tab-separated report, and the MD5 digest of
        that report is compared with the stored expected report.

        The NLMSA is then dumped with cnestedlist.dump_textfile, the files
        matching msaname* are removed, the binaries are recreated with
        cnestedlist.textfile_to_binaries, and the same query/compare pass
        is repeated on the reopened alignment.

        Raises AssertionError if a generated report differs from the
        expected one.
        """
        from pygr import seqdb, cnestedlist

        species = 'dm2'  # prefix used to look up query sequences in seqDict

        def read_text(path):
            """Return the whole content of path; the file is always closed."""
            infile = open(path, 'r')
            try:
                return infile.read()
            finally:
                infile.close()

        def edge_rows(msa, edges):
            """Format the edges whose src and dest are both 2 letters long."""
            seqNames = ~msa.seqDict  # inverse lookup, taken once per call
            rows = []
            for src, dest, _edge in edges:
                if len(str(src)) != 2 or len(str(dest)) != 2:
                    continue
                # Names are split at their first '.' into species and id.
                srcspecies, src1 = seqNames[src].split('.', 1)
                destspecies, dest1 = seqNames[dest].split('.', 1)
                wlist = (str(src), srcspecies, src1, src.start, src.stop,
                         str(dest), destspecies, dest1, dest.start,
                         dest.stop)
                rows.append('\t'.join(map(str, wlist)) + '\n')
            return rows

        def interval_rows(msa, line):
            """Return all report rows produced by one input line."""
            # Four tab-separated columns are required; the last one (nobs)
            # is not used in the report.
            chrid, intstart, intend, _nobs = line.strip().split('\t')
            intstart, intend = int(intstart), int(intend)
            seq = msa.seqDict[species + '.' + chrid]
            sites = ((seq[intstart:intstart + 2], intstart, intstart + 2),
                     (seq[intend - 2:intend], intend - 2, intend))
            # Query both sites before producing any output, as before.
            edgeLists = [msa[site].edges() for site, _start, _stop in sites]
            rows = []
            for (site, start, stop), edges in zip(sites, edgeLists):
                if len(edges) == 0:  # EMPTY EDGES
                    wlist = (str(site), species, chrid, start, stop,
                             '', '', '', '', '')
                    rows.append('\t'.join(map(str, wlist)) + '\n')
            saveList = []
            for edges in edgeLists:
                saveList.extend(edge_rows(msa, edges))
            saveList.sort()  # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
            return rows + saveList

        def write_report(msa, inputName, reportName):
            """Write the report for every line of inputName to reportName."""
            outfile = open(reportName, 'w')
            try:
                infile = open(inputName, 'r')
                try:
                    for line in infile:
                        outfile.writelines(interval_rows(msa, line))
                finally:
                    infile.close()
            finally:
                outfile.close()

        def check_report(msa, reportName):
            """Generate the report from msa and compare it by MD5 digest."""
            tmpInputName = self.copyFile(outfileName)
            tmpOutputName = self.copyFile(outputName)
            write_report(msa, tmpInputName, reportName)
            generated = hashlib.md5(read_text(reportName)).digest()
            expected = hashlib.md5(read_text(tmpOutputName)).digest()
            assert generated == expected, \
                'report %s differs from expected %s' % (reportName,
                                                        tmpOutputName)

        genomedict = {}
        for orgstr in msaSpeciesList:
            genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.' +
                                                       orgstr)
        uniondict = seqdb.PrefixUnionDict(genomedict)
        if smallSampleKey:
            maflist = (os.path.join(mafDir, smallSampleKey + '.maf'), )
        else:
            maflist = glob.glob(os.path.join(mafDir, '*.maf'))
            maflist.sort()
        msaname = os.path.join(self.path, 'dm2_multiz15way')
        # 500MB VERSION
        msa1 = cnestedlist.NLMSA(msaname,
                                 'w',
                                 uniondict,
                                 maflist,
                                 maxlen=536870912,
                                 maxint=22369620)
        msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
        outfileName = os.path.join(testInputDir,
                                   'splicesite_dm2%s.txt' % smallSamplePostfix)
        outputName = os.path.join(
            testInputDir,
            'splicesite_dm2%s_multiz15way.txt' % smallSamplePostfix)
        check_report(msa, os.path.join(self.path, 'splicesite_new1.txt'))

        # TEXT<->BINARY TEST
        msafilelist = glob.glob(msaname + '*')
        msa.save_seq_dict()
        cnestedlist.dump_textfile(
            msaname, os.path.join(self.path, 'dm2_multiz15way.txt'))
        for filename in msafilelist:
            os.remove(filename)
        runPath = os.path.realpath(os.curdir)
        os.chdir(self.path)
        try:
            cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
        finally:
            # Always return to the original directory, even on failure, so
            # later tests do not run from self.path.
            os.chdir(runPath)

        msa1 = cnestedlist.NLMSA(msaname, 'r')
        msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
        check_report(msa, os.path.join(self.path, 'splicesite_new2.txt'))
    def test_build(self):
        """Test building an NLMSA and querying results

        Builds the hg18_pairwise5way NLMSA from axtNet files, stores it in
        pygr.Data and reloads it.  For every interval line of the
        splice-site input file the two bases at the interval start and the
        two bases at the interval end are queried, the resulting alignment
        edges are written as a tab-separated report, and the MD5 digest of
        that report is compared with the stored expected report.

        The NLMSA is then dumped with cnestedlist.dump_textfile, the files
        matching msaname* are removed, the binaries are recreated with
        cnestedlist.textfile_to_binaries, and the same query/compare pass
        is repeated on the reopened alignment.

        Raises AssertionError if a generated report differs from the
        expected one.
        """
        from pygr import seqdb, cnestedlist

        species = 'hg18'  # prefix used to look up query sequences in seqDict

        def read_text(path):
            """Return the whole content of path; the file is always closed."""
            infile = open(path, 'r')
            try:
                return infile.read()
            finally:
                infile.close()

        def edge_rows(msa, edges):
            """Format the edges whose src and dest are both 2 letters long."""
            seqNames = ~msa.seqDict  # inverse lookup, taken once per call
            rows = []
            for src, dest, _edge in edges:
                if len(str(src)) != 2 or len(str(dest)) != 2:
                    continue
                # Names are split at their first '.' into species and id.
                srcspecies, src1 = seqNames[src].split('.', 1)
                destspecies, dest1 = seqNames[dest].split('.', 1)
                wlist = (str(src), srcspecies, src1, src.start, src.stop,
                         str(dest), destspecies, dest1, dest.start,
                         dest.stop)
                rows.append('\t'.join(map(str, wlist)) + '\n')
            return rows

        def interval_rows(msa, line):
            """Return all report rows produced by one input line."""
            # Four tab-separated columns are required; the last one (nobs)
            # is not used in the report.
            chrid, intstart, intend, _nobs = line.strip().split('\t')
            intstart, intend = int(intstart), int(intend)
            seq = msa.seqDict[species + '.' + chrid]
            sites = ((seq[intstart:intstart + 2], intstart, intstart + 2),
                     (seq[intend - 2:intend], intend - 2, intend))
            # Query both sites before producing any output, as before.
            edgeLists = [msa[site].edges() for site, _start, _stop in sites]
            rows = []
            for (site, start, stop), edges in zip(sites, edgeLists):
                if len(edges) == 0:  # EMPTY EDGES
                    wlist = (str(site), species, chrid, start, stop,
                             '', '', '', '', '')
                    rows.append('\t'.join(map(str, wlist)) + '\n')
            saveList = []
            for edges in edgeLists:
                saveList.extend(edge_rows(msa, edges))
            saveList.sort()  # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
            return rows + saveList

        def write_report(msa, inputName, reportName):
            """Write the report for every line of inputName to reportName."""
            outfile = open(reportName, 'w')
            try:
                infile = open(inputName, 'r')
                try:
                    for line in infile:
                        outfile.writelines(interval_rows(msa, line))
                finally:
                    infile.close()
            finally:
                outfile.close()

        def check_report(msa, newOutputName):
            """Generate the report from msa and compare it by MD5 digest."""
            tmpInputName = self.copyFile(outfileName)
            tmpOutputName = self.copyFile(outputName)
            tmpNewOutputName = os.path.join(self.path, newOutputName)
            write_report(msa, tmpInputName, tmpNewOutputName)
            generated = hashlib.md5(read_text(tmpNewOutputName)).digest()
            expected = hashlib.md5(read_text(tmpOutputName)).digest()
            assert generated == expected, \
                'report %s differs from expected %s' % (tmpNewOutputName,
                                                        tmpOutputName)

        genomedict = {}
        for orgstr in msaSpeciesList:
            genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.'
                                                       + orgstr)
        uniondict = seqdb.PrefixUnionDict(genomedict)
        if smallSampleKey:
            axtlist = glob.glob(os.path.join(axtDir, '*' + os.sep
                                             + smallSampleKey + '.*.net.axt'))
        else:
            axtlist = glob.glob(os.path.join(axtDir, '*' + os.sep
                                             + '*.*.net.axt'))
        axtlist.sort()
        msaname = os.path.join(self.path, 'hg18_pairwise5way')
        # 500MB VERSION
        msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, axtFiles=axtlist,
                                 maxlen=536870912, maxint=22369620)
        msa1.__doc__ = 'TEST NLMSA for hg18 pairwise5way'
        pygr.Data.addResource('TEST.MSA.UCSC.hg18_pairwise5way', msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource('TEST.MSA.UCSC.hg18_pairwise5way')
        outfileName = os.path.join(testInputDir, 'splicesite_hg18%s.txt'
                                   % smallSamplePostfix)
        outputName = os.path.join(testInputDir,
                                  'splicesite_hg18%s_pairwise5way.txt'
                                  % smallSamplePostfix)
        check_report(msa, 'splicesite_new1.txt')

        # TEXT<->BINARY TEST
        msafilelist = glob.glob(msaname + '*')
        msa.save_seq_dict()
        cnestedlist.dump_textfile(msaname, os.path.join(self.path,
                                                      'hg18_pairwise5way.txt'))
        for filename in msafilelist:
            os.remove(filename)
        runPath = os.path.realpath(os.curdir)
        os.chdir(self.path)
        try:
            cnestedlist.textfile_to_binaries('hg18_pairwise5way.txt')
        finally:
            # Always return to the original directory, even on failure, so
            # later tests do not run from self.path.
            os.chdir(runPath)

        msa1 = cnestedlist.NLMSA(msaname, 'r')
        msa1.__doc__ = 'TEST NLMSA for hg18 pairwise5way'
        pygr.Data.addResource('TEST.MSA.UCSC.hg18_pairwise5way', msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource('TEST.MSA.UCSC.hg18_pairwise5way')
        check_report(msa, 'splicesite_new2.txt')
Exemplo n.º 3
0
    def build_test(self): # BUILD NLMSA AND QUERY RESULT COMPARISON
        """Build the dm2_multiz15way NLMSA from chr4h.maf and check queries.

        The NLMSA is built, its sequence dictionary saved, and the alignment
        is stored in pygr.Data and reloaded.  For every interval line of
        splicesite_dm2_chr4h.txt the two bases at the interval start and the
        two bases at the interval end are queried, the resulting alignment
        edges are written as a tab-separated report, and the MD5 digest of
        that report is compared with the stored expected report.

        The NLMSA is then dumped with cnestedlist.dump_textfile, the files
        matching msaname* are removed, the binaries are recreated with
        cnestedlist.textfile_to_binaries, and the same query/compare pass
        is repeated on the reopened alignment.

        Raises AssertionError if a generated report differs from the
        expected one.
        """
        from pygr import seqdb, cnestedlist
        import glob

        species = 'dm2'  # prefix used to look up query sequences in seqDict

        def read_text(path):
            """Return the whole content of path; the file is always closed."""
            infile = open(path, 'r')
            try:
                return infile.read()
            finally:
                infile.close()

        def edge_rows(msa, edges):
            """Format the edges whose src and dest are both 2 letters long."""
            seqNames = ~msa.seqDict  # inverse lookup, taken once per call
            rows = []
            for src, dest, _edge in edges:
                if len(str(src)) != 2 or len(str(dest)) != 2:
                    continue
                # Names are split at their first '.' into species and id.
                srcspecies, src1 = seqNames[src].split('.', 1)
                destspecies, dest1 = seqNames[dest].split('.', 1)
                wlist = (str(src), srcspecies, src1, src.start, src.stop,
                         str(dest), destspecies, dest1, dest.start, dest.stop)
                rows.append('\t'.join(map(str, wlist)) + '\n')
            return rows

        def interval_rows(msa, line):
            """Return all report rows produced by one input line."""
            # Four tab-separated columns are required; the last one (nobs)
            # is not used in the report.
            chrid, intstart, intend, _nobs = line.strip().split('\t')
            intstart, intend = int(intstart), int(intend)
            seq = msa.seqDict[species + '.' + chrid]
            sites = ((seq[intstart:intstart+2], intstart, intstart+2),
                     (seq[intend-2:intend], intend-2, intend))
            # Query both sites before producing any output, as before.
            edgeLists = [msa[site].edges() for site, _start, _stop in sites]
            rows = []
            for (site, start, stop), edges in zip(sites, edgeLists):
                if len(edges) == 0: # EMPTY EDGES
                    wlist = str(site), species, chrid, start, stop, '', '', '', '', ''
                    rows.append('\t'.join(map(str, wlist)) + '\n')
            saveList = []
            for edges in edgeLists:
                saveList.extend(edge_rows(msa, edges))
            saveList.sort() # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
            return rows + saveList

        def write_report(msa, inputName, reportName):
            """Write the report for every line of inputName to reportName."""
            outfile = open(reportName, 'w')
            try:
                infile = open(inputName, 'r')
                try:
                    for line in infile:
                        outfile.writelines(interval_rows(msa, line))
                finally:
                    infile.close()
            finally:
                outfile.close()

        def check_report(msa, reportName):
            """Generate the report from msa and compare it by MD5 digest."""
            tmpInputName = self.copyFile(outfileName)
            tmpOutputName = self.copyFile(outputName)
            write_report(msa, tmpInputName, reportName)
            # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
            generated = hashlib.md5(read_text(reportName)).digest()
            expected = hashlib.md5(read_text(tmpOutputName)).digest()
            assert generated == expected, \
                'report %s differs from expected %s' % (reportName, tmpOutputName)

        genomedict = {}
        for orgstr in msaSpeciesList:
            genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.' + orgstr)
        uniondict = seqdb.PrefixUnionDict(genomedict)
        maflist = glob.glob(os.path.join(mafDir, 'chr4h.maf')) # CHR4H TESTING
        maflist.sort()
        msaname = os.path.join(self.path, 'dm2_multiz15way')
        # 500MB VERSION
        msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist,
                                 maxlen = 536870912, maxint = 22369620)
        msa1.save_seq_dict()
        msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
        outfileName = os.path.join(testInputDir, 'splicesite_dm2_chr4h.txt') # CHR4H TESTING
        outputName = os.path.join(testInputDir, 'splicesite_dm2_chr4h_multiz15way.txt') # CHR4H TESTING
        check_report(msa, os.path.join(self.path, 'splicesite_new1.txt'))

        # TEXT<->BINARY TEST
        msafilelist = glob.glob(msaname + '*')
        cnestedlist.dump_textfile(msaname, os.path.join(self.path, 'dm2_multiz15way.txt'))
        for filename in msafilelist:
            os.remove(filename)
        runPath = os.path.realpath(os.curdir)
        os.chdir(self.path)
        try:
            cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
        finally:
            # Always return to the original directory, even on failure, so
            # later tests do not run from self.path.
            os.chdir(runPath)

        msa1 = cnestedlist.NLMSA(msaname, 'r')
        msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource('TEST.MSA.UCSC.dm2_multiz15way', msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource('TEST.MSA.UCSC.dm2_multiz15way')
        check_report(msa, os.path.join(self.path, 'splicesite_new2.txt'))
Exemplo n.º 4
0
    def build_test(self):  # BUILD NLMSA AND QUERY RESULT COMPARISON
        """Build the hg18_multiz28way NLMSA from MAF files and check queries.

        The NLMSA is built, its sequence dictionary saved, and the alignment
        is stored in pygr.Data and reloaded.  For every interval line of the
        splice-site input file the two bases at the interval start and the
        two bases at the interval end are queried, the resulting alignment
        edges are written as a tab-separated report, and the MD5 digest of
        that report is compared with the stored expected report.

        The NLMSA is then dumped with cnestedlist.dump_textfile, the files
        matching msaname* are removed, the binaries are recreated with
        cnestedlist.textfile_to_binaries, and the same query/compare pass
        is repeated on the reopened alignment.

        Raises AssertionError if a generated report differs from the
        expected one.
        """
        # hashlib replaces the deprecated md5 module; imported locally
        # because the original block did not rely on a file-level import.
        import hashlib

        from pygr import seqdb, cnestedlist

        species = "hg18"  # prefix used to look up query sequences in seqDict

        def read_text(path):
            """Return the whole content of path; the file is always closed."""
            infile = open(path, "r")
            try:
                return infile.read()
            finally:
                infile.close()

        def edge_rows(msa, edges):
            """Format the edges whose src and dest are both 2 letters long."""
            seqNames = ~msa.seqDict  # inverse lookup, taken once per call
            rows = []
            for src, dest, _edge in edges:
                if len(str(src)) != 2 or len(str(dest)) != 2:
                    continue
                # Names are split at their first '.' into species and id.
                srcspecies, src1 = seqNames[src].split(".", 1)
                destspecies, dest1 = seqNames[dest].split(".", 1)
                wlist = (
                    str(src),
                    srcspecies,
                    src1,
                    src.start,
                    src.stop,
                    str(dest),
                    destspecies,
                    dest1,
                    dest.start,
                    dest.stop,
                )
                rows.append("\t".join(map(str, wlist)) + "\n")
            return rows

        def interval_rows(msa, line):
            """Return all report rows produced by one input line."""
            # Four tab-separated columns are required; the last one (nobs)
            # is not used in the report.
            chrid, intstart, intend, _nobs = line.strip().split("\t")
            intstart, intend = int(intstart), int(intend)
            seq = msa.seqDict[species + "." + chrid]
            sites = (
                (seq[intstart : intstart + 2], intstart, intstart + 2),
                (seq[intend - 2 : intend], intend - 2, intend),
            )
            # Query both sites before producing any output, as before.
            edgeLists = [msa[site].edges() for site, _start, _stop in sites]
            rows = []
            for (site, start, stop), edges in zip(sites, edgeLists):
                if len(edges) == 0:  # EMPTY EDGES
                    wlist = str(site), species, chrid, start, stop, "", "", "", "", ""
                    rows.append("\t".join(map(str, wlist)) + "\n")
            saveList = []
            for edges in edgeLists:
                saveList.extend(edge_rows(msa, edges))
            saveList.sort()  # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
            return rows + saveList

        def write_report(msa, inputName, reportName):
            """Write the report for every line of inputName to reportName."""
            outfile = open(reportName, "w")
            try:
                infile = open(inputName, "r")
                try:
                    for line in infile:
                        outfile.writelines(interval_rows(msa, line))
                finally:
                    infile.close()
            finally:
                outfile.close()

        def check_report(msa, reportName):
            """Generate the report from msa and compare it by MD5 digest."""
            tmpInputName = self.copyFile(outfileName)
            tmpOutputName = self.copyFile(outputName)
            write_report(msa, tmpInputName, reportName)
            # MD5 COMPARISON INSTEAD OF COMPARING EACH CONTENTS
            generated = hashlib.md5(read_text(reportName)).digest()
            expected = hashlib.md5(read_text(tmpOutputName)).digest()
            assert generated == expected, "report %s differs from expected %s" % (
                reportName,
                tmpOutputName,
            )

        genomedict = {}
        for orgstr in msaSpeciesList:
            genomedict[orgstr] = pygr.Data.getResource("TEST.Seq.Genome." + orgstr)
        uniondict = seqdb.PrefixUnionDict(genomedict)
        if smallSampleKey:
            maflist = (os.path.join(mafDir, smallSampleKey + ".maf"),)
        else:
            maflist = glob.glob(os.path.join(mafDir, "*.maf"))
            maflist.sort()
        msaname = os.path.join(self.path, "hg18_multiz28way")
        # 500MB VERSION
        msa1 = cnestedlist.NLMSA(msaname, "w", uniondict, maflist, maxlen=536870912, maxint=22369620)
        msa1.save_seq_dict()
        msa1.__doc__ = "TEST NLMSA for hg18 multiz28way"
        pygr.Data.addResource("TEST.MSA.UCSC.hg18_multiz28way", msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource("TEST.MSA.UCSC.hg18_multiz28way")
        outfileName = os.path.join(testInputDir, "splicesite_hg18%s.txt" % smallSamplePostfix)
        outputName = os.path.join(testInputDir, "splicesite_hg18%s_multiz28way.txt" % smallSamplePostfix)
        check_report(msa, os.path.join(self.path, "splicesite_new1.txt"))

        # TEXT<->BINARY TEST
        msafilelist = glob.glob(msaname + "*")
        cnestedlist.dump_textfile(msaname, os.path.join(self.path, "hg18_multiz28way.txt"))
        for filename in msafilelist:
            os.remove(filename)
        runPath = os.path.realpath(os.curdir)
        os.chdir(self.path)
        try:
            cnestedlist.textfile_to_binaries("hg18_multiz28way.txt")
        finally:
            # Always return to the original directory, even on failure, so
            # later tests do not run from self.path.
            os.chdir(runPath)

        msa1 = cnestedlist.NLMSA(msaname, "r")
        msa1.__doc__ = "TEST NLMSA for hg18 multiz28way"
        pygr.Data.addResource("TEST.MSA.UCSC.hg18_multiz28way", msa1)
        pygr.Data.save()
        msa = pygr.Data.getResource("TEST.MSA.UCSC.hg18_multiz28way")
        check_report(msa, os.path.join(self.path, "splicesite_new2.txt"))