def test_build(self):
    """Test building an NLMSA and querying results

    The dm2 multiz15way NLMSA is built from the MAF files and registered in
    pygr.Data.  For every interval listed in the splice-site input file the
    alignment is queried with the 2-position slices at the start and at the
    end of the interval, and the formatted edges are written to a new file
    whose MD5 digest must equal that of the stored expected output.

    The alignment is then dumped to a text file, the previously existing
    alignment files are removed, the dump is converted back with
    ``textfile_to_binaries``, and the same comparison is repeated on the
    reopened alignment (TEXT<->BINARY TEST).

    Raises AssertionError when a generated file differs from the expected
    output.
    """
    from pygr import seqdb, cnestedlist

    msaID = 'TEST.MSA.UCSC.dm2_multiz15way'

    def publish(nlmsa):
        'Store nlmsa in pygr.Data under msaID and return it as fetched back'
        nlmsa.__doc__ = 'TEST NLMSA for dm2 multiz15way'
        pygr.Data.addResource(msaID, nlmsa)
        pygr.Data.save()
        return pygr.Data.getResource(msaID)

    def fileDigest(path):
        'Return the MD5 digest of the contents of the file at path'
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    def formatRow(fields):
        'Join fields into one tab-separated, newline-terminated line'
        return '\t'.join(map(str, fields)) + '\n'

    def splitName(fullName):
        'Split a PREFIX.ID name at its first dot into (PREFIX, ID)'
        dotindex = fullName.index('.')
        return fullName[:dotindex], fullName[dotindex + 1:]

    def edgeRows(idDict, edges):
        'Return one formatted line per edge whose two ends are 2 letters long'
        rows = []
        for src, dest, e in edges:
            if len(str(src)) != 2 or len(str(dest)) != 2:
                continue
            srcspecies, src1 = splitName(idDict[src])
            destspecies, dest1 = splitName(idDict[dest])
            rows.append(formatRow((
                str(src), srcspecies, src1, src.start, src.stop,
                str(dest), destspecies, dest1, dest.start, dest.stop)))
        return rows

    def checkSpliceSites(msa, inputName, expectedName, newOutputName):
        'Query msa for every input interval and compare with expected output'
        tmpInputName = self.copyFile(inputName)
        tmpOutputName = self.copyFile(expectedName)
        idDict = ~msa.seqDict  # inverse mapping: sequence -> PREFIX.ID name
        outfile = open(newOutputName, 'w')
        try:
            infile = open(tmpInputName, 'r')
            try:
                for line in infile:
                    # Four tab-separated columns are required; the fourth
                    # (nobs) is read but not used by this test.
                    chrid, intstart, intend, nobs = line.strip().split('\t')
                    intstart, intend = int(intstart), int(intend)
                    chrom = msa.seqDict['dm2' + '.' + chrid]
                    site1 = chrom[intstart:intstart + 2]
                    site2 = chrom[intend - 2:intend]
                    queries = (
                        (site1, intstart, intstart + 2, msa[site1].edges()),
                        (site2, intend - 2, intend, msa[site2].edges()),
                    )
                    # Sites without any edge are reported first, unsorted,
                    # with the five destination columns left blank.
                    for site, start, stop, edges in queries:
                        if len(edges) == 0:  # EMPTY EDGES
                            outfile.write(formatRow((
                                str(site), 'dm2', chrid, start, stop,
                                '', '', '', '', '')))
                    saveList = []
                    for site, start, stop, edges in queries:
                        saveList.extend(edgeRows(idDict, edges))
                    # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
                    saveList.sort()
                    outfile.writelines(saveList)
            finally:
                infile.close()
        finally:
            outfile.close()
        assert fileDigest(newOutputName) == fileDigest(tmpOutputName), \
            '%s differs from expected output %s' % (newOutputName,
                                                    tmpOutputName)

    genomedict = {}
    for orgstr in msaSpeciesList:
        genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.'
                                                   + orgstr)
    uniondict = seqdb.PrefixUnionDict(genomedict)

    if smallSampleKey:
        maflist = (os.path.join(mafDir, smallSampleKey + '.maf'), )
    else:
        maflist = glob.glob(os.path.join(mafDir, '*.maf'))
        maflist.sort()

    msaname = os.path.join(self.path, 'dm2_multiz15way')
    # 500MB VERSION
    msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist,
                             maxlen=536870912, maxint=22369620)
    msa = publish(msa1)

    outfileName = os.path.join(
        testInputDir, 'splicesite_dm2%s.txt' % smallSamplePostfix)
    outputName = os.path.join(
        testInputDir,
        'splicesite_dm2%s_multiz15way.txt' % smallSamplePostfix)
    checkSpliceSites(msa, outfileName, outputName,
                     os.path.join(self.path, 'splicesite_new1.txt'))

    # TEXT<->BINARY TEST
    # The file list is taken before the dump is written, so the text dump
    # (whose name starts with the same prefix) is not among the files removed.
    msafilelist = glob.glob(msaname + '*')
    msa.save_seq_dict()
    cnestedlist.dump_textfile(
        msaname, os.path.join(self.path, 'dm2_multiz15way.txt'))
    for filename in msafilelist:
        os.remove(filename)

    # textfile_to_binaries is given a bare file name, so run it from
    # self.path and always return to the original working directory.
    runPath = os.path.realpath(os.curdir)
    os.chdir(self.path)
    try:
        cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
    finally:
        os.chdir(runPath)

    msa1 = cnestedlist.NLMSA(msaname, 'r')
    msa = publish(msa1)
    checkSpliceSites(msa, outfileName, outputName,
                     os.path.join(self.path, 'splicesite_new2.txt'))