def setUp(self, **kwargs):
    """Create the sequence, annotation and annotation-map test resources.

    Opens ``dnaseq.fasta`` as a BlastDB, layers a three-row AnnotationDB on
    top of it, indexes those annotations in an on-disk NLMSA, and registers
    all three under ``Bio.Test`` together with a many-to-many schema
    relation bound as ``exons``.  The metabase is then committed and its
    cache cleared.  The NLMSA and the BlastDB are closed on the way out even
    if one of the steps fails.  ``kwargs`` is accepted but not used here.
    """
    TestBase.setUp(self)
    fasta_path = testutil.datafile('dnaseq.fasta')
    nlmsa_path = testutil.tempdatafile('tryannot')

    # Raw annotation rows: (sequence id, start, stop, name).
    slice_rows = {
        1: ('seq1', 5, 10, 'fred'),
        2: ('seq1', -60, -50, 'bob'),
        3: ('seq2', -20, -10, 'mary'),
    }
    column_map = dict(id=0, start=1, stop=2, name=3)

    dna_db = seqdb.BlastDB(fasta_path)
    try:
        dna_db.__doc__ = 'little dna'
        self.pygrData.Bio.Test.dna = dna_db

        annot_db = seqdb.AnnotationDB(slice_rows, dna_db,
                                      sliceAttrDict=column_map)
        annot_db.__doc__ = 'trivial annotation'
        self.pygrData.Bio.Test.annoDB = annot_db

        annot_map = cnestedlist.NLMSA(nlmsa_path, 'w', pairwiseMode=True,
                                      bidirectional=False)
        try:
            for annot_key in annot_db:
                annot_map.addAnnotation(annot_db[annot_key])
            annot_map.build()
            annot_map.__doc__ = 'trivial map'
            self.pygrData.Bio.Test.map = annot_map

            relation = metabase.ManyToManyRelation(dna_db, annot_db,
                                                   bindAttrs=('exons', ))
            self.schema.Bio.Test.map = relation
            self.metabase.commit()
            self.metabase.clear_cache()
        finally:
            annot_map.close()
    finally:
        dna_db.close()
def makeNLMSA(annotDBList, dataPath='memory'):
    """Build one pairwise NLMSA holding every annotation of several databases.

    annotDBList -- iterable of annotation databases; each must provide
                   ``values()`` and a ``__doc__`` string.
    dataPath    -- 'memory' (default) builds the map in memory; any other
                   value is used as the on-disk path, opened for writing and
                   built with ``saveSeqDict=True``.

    Returns the built NLMSA.  Its ``__doc__`` lists the source databases.
    Progress messages are printed to stdout.
    """
    in_memory = (dataPath == 'memory')
    if in_memory:
        open_mode = 'memory'
    else:
        open_mode = 'w'
    nlmsa = cnestedlist.NLMSA(dataPath, open_mode, pairwiseMode=True)

    nlmsa.__doc__ = 'NLMSA built against '
    for source_db in annotDBList:
        nlmsa.__doc__ = nlmsa.__doc__ + ' %s, ' % (source_db.__doc__)
        print('# Adding annotations to NLMSA from %s...' % source_db.__doc__)
        for annotation_obj in source_db.values():
            nlmsa.addAnnotation(annotation_obj)

    print('# Building annotation map...')
    # Only an on-disk map needs its sequence dictionary saved alongside it.
    build_options = {}
    if not in_memory:
        build_options['saveSeqDict'] = True
    nlmsa.build(**build_options)
    return nlmsa
def bed2pygr(dbprefix, referencefile, bedfile, indir):
    """Index a BED file as an on-disk pygr NLMSA next to a genome marker file.

    dbprefix      -- path prefix of the NLMSA to create; the marker file is
                     written to ``dbprefix + '.genome'``.
    referencefile -- sequence file opened as the reference SequenceFileDB.
    bedfile       -- BED file handed to ``load_bed``.
    indir         -- not used by this function; kept so existing callers
                     keep working.

    Returns None.  Side effects: builds the NLMSA (with its sequence
    dictionary saved) and writes the reference file's base name, minus its
    last extension, as a single line into the ``.genome`` marker file.
    """
    collision_counter = defaultdict(int)
    chrdb = seqdb.SequenceFileDB(referencefile)
    annodb = annotation.AnnotationDB({}, chrdb)
    al = cnestedlist.NLMSA(dbprefix, 'w', pairwiseMode=True)
    load_bed(al, annodb, bedfile, collision_counter)
    al.build(saveSeqDict=True)

    genomeprefix = os.path.basename(referencefile).rsplit('.', 1)[0]
    # Close the marker file explicitly: the original relied on the anonymous
    # file object being garbage-collected to flush the write.
    with open(dbprefix + '.genome', 'w') as genome_file:
        genome_file.write(genomeprefix + '\n')
def populate_swissprot():
    """Fill the current worldbase with a small set of swissprot resources.

    Registers, in this order: the ``sp_hbb1`` BlastDB, a sequence fragment,
    a mapping of the database onto itself (with a one-to-many ``buddy``
    schema), and a one-exon annotation database with its NLMSA map (with a
    many-to-many ``exons`` schema).
    """
    # Sequence database built from the test data file.
    swissprot = seqdb.BlastDB(testutil.datafile('sp_hbb1'))
    swissprot.__doc__ = 'little swissprot'
    worldbase.Bio.Seq.Swissprot.sp42 = swissprot

    # A slice of one sequence, stored as a resource of its own.
    hbb_seq = swissprot['HBB1_TORMA']
    fragment = hbb_seq[10:35]
    fragment.__doc__ = 'fragment'
    worldbase.Bio.Seq.frag = fragment

    # Mapping from the database onto itself: HBB1_TORMA -> PRCA_ANAVA.
    self_map = mapping.Mapping(sourceDB=swissprot, targetDB=swissprot)
    self_map[hbb_seq] = swissprot['PRCA_ANAVA']
    self_map.__doc__ = 'map sp to itself'
    worldbase.Bio.Seq.spmap = self_map
    worldbase.schema.Bio.Seq.spmap = metabase.OneToManyRelation(
        swissprot, swissprot, bindAttrs=('buddy', ))

    # Annotation database holding a single exon on HBB1_TORMA.
    exon_rows = {1: ('HBB1_TORMA', 10, 50)}
    exon_db = seqdb.AnnotationDB(exon_rows, swissprot,
                                 sliceAttrDict=dict(id=0, start=1, stop=2))
    first_exon = exon_db[1]

    # Keep the TempDir object referenced for the rest of the function, as
    # the original did, rather than chaining the calls.
    scratch_dir = testutil.TempDir('exonAnnot')
    map_path = scratch_dir.subfile('cnested')
    exon_map = cnestedlist.NLMSA(map_path, 'w', pairwiseMode=True,
                                 bidirectional=False)
    exon_map.addAnnotation(first_exon)
    exon_map.build()

    exon_db.__doc__ = 'a little annotation db'
    exon_map.__doc__ = 'a little map'
    worldbase.Bio.Annotation.annoDB = exon_db
    worldbase.Bio.Annotation.map = exon_map
    worldbase.schema.Bio.Annotation.map = metabase.ManyToManyRelation(
        swissprot, exon_db, bindAttrs=('exons', ))
def read_genbank_annots(gbfile, fastafile=None, featureType='CDS',
                        geneQualifier='gene'):
    '''construct annotation DB for gene CDS intervals.
    NB: this assumes each gene consists of ONE interval.
    This cannot be used for multi-exon genes!

    gbfile        -- path of the GenBank file to parse.
    fastafile     -- path of the matching FASTA file; defaults to the
                     GenBank file name truncated at the first dot of its
                     base name, plus ".fna", in the same directory.
    featureType   -- only features of this type are annotated.
    geneQualifier -- qualifier whose first value names the annotation;
                     features lacking it are kept under a generated
                     "unlabeled_<featureType>_<n>" name and a warning is
                     issued.

    Returns (annodb, al, genome): the annotation database, the in-memory
    NLMSA indexing it, and the sequence database.'''
    import os.path

    try:
        gbparse = SeqIO.parse(gbfile, 'genbank')
    except TypeError:  # SeqIO changed its interface?
        ifile = open(gbfile)
        try:
            gbparse = SeqIO.parse(ifile, 'genbank')
            # Must be consumed while the file is still open.
            gbseqs = list(gbparse)
        finally:
            ifile.close()
    else:
        gbseqs = list(gbparse)

    if fastafile is None:
        # Split only the base name on '.', so that './x.gbk' or a directory
        # containing a dot no longer truncates the path itself.  For paths
        # without dots in the directory part the result is unchanged.
        directory, basename = os.path.split(gbfile)
        fastafile = os.path.join(directory, basename.split('.')[0] + '.fna')

    genome = seqdb.SequenceFileDB(fastafile)
    genomeIndex = blast.BlastIDIndex(genome)  # handle NCBI ID blobs properly
    annodb = annotation.AnnotationDB({}, genome,
                                     sliceAttrDict=dict(id=0, start=1, stop=2,
                                                        orientation=3))
    i = 0  # counter used only to number unlabeled annotations
    for s in gbseqs:
        seqID = genomeIndex[s.id].id  # find the right seq and get its actual ID
        for f in s.features:
            if f.type != featureType:
                continue
            try:
                name = f.qualifiers[geneQualifier][0]
            except KeyError:  # keep the annotation even if label missing
                warnings.warn('Missing gene qualifier "%s" on %s annotation'
                              % (geneQualifier, featureType))
                name = 'unlabeled_%s_%d' % (featureType, i)
                i += 1
            annodb.new_annotation(name,
                                  (seqID, f.location.start.position,
                                   f.location.end.position, f.strand))

    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True)
    for a in annodb.itervalues():
        al.addAnnotation(a)
    al.build()
    return annodb, al, genome
def read_exon_annots(genome, genesFile='knownGene.txt'):
    '''Load a multi-exon transcript set and index its exons.

    genome    -- sequence database the exon annotations refer to.
    genesFile -- file passed to read_known_genes().

    Returns (annodb, al, exonGene, totalSize, geneLengths): the exon
    annotation database, the in-memory NLMSA over it, a dict mapping each
    integer exon id to its gene id, the sum of all gene lengths, and the
    per-gene length mapping from get_gene_maxlengths().'''
    exon_table, gene_table, transcript_lengths = read_known_genes(genesFile)
    gene_lengths = get_gene_maxlengths(gene_table, transcript_lengths)
    total_size = sum(gene_lengths.values())

    column_map = dict(id=0, orientation=1, start=2, stop=3)
    exon_db = annotation.AnnotationDB({}, genome, sliceAttrDict=column_map)
    exon_map = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True,
                                 maxlen=1000000000)

    # Exons get consecutive integer ids in dictionary iteration order.
    exon_to_gene = {}
    for exon_id, (exon_tuple, gene_id) in enumerate(exon_table.iteritems()):
        exon_annot = exon_db.new_annotation(exon_id, exon_tuple)
        exon_to_gene[exon_id] = gene_id
        exon_map.addAnnotation(exon_annot)

    exon_map.build()
    return exon_db, exon_map, exon_to_gene, total_size, gene_lengths
def bedToNLMSA(bedlines, genome,
               field_locations=dict(id=0, start=1, stop=2, name=3, score=4,
                                    orientation=-1)):
    """Build a pygr annotation database and NLMSA from BED-formatted lines.

    bedlines        -- iterable of tab-separated BED lines; empty and
                       whitespace-only lines are ignored.
    genome          -- sequence database the annotations refer to.
    field_locations -- sliceAttrDict mapping attribute names to column
                       indexes.  The computed orientation is appended after
                       the BED columns, which is why it defaults to -1.

    Orientation is +1 when the line has fewer than six columns or its sixth
    column is '+', otherwise -1.  Rows that raise KeyError while being
    annotated are reported on stdout and skipped.

    Returns (annotDB, nlmsa).
    """
    annotDB = annotation.AnnotationDB(None, genome, verbose=False,
                                      sliceAttrDict=field_locations)
    nlmsa = cnestedlist.NLMSA('tmp_bed', mode='memory', pairwiseMode=True,
                              bidirectional=False)
    index = 0
    for line in bedlines:
        # Strip before testing so that a bare newline counts as empty too.
        line = line.strip()
        if not line:
            continue
        fields = line.split('\t')
        orientation = 1 if len(fields) < 6 or fields[5] == '+' else -1
        try:
            curAnnot = annotDB.new_annotation(index, fields + [orientation])
            nlmsa.addAnnotation(curAnnot)
            index += 1
        except KeyError as e:
            # The original handler referenced an undefined name ``row`` and
            # therefore died with NameError instead of skipping the row.
            print(('Skipping row without matching chromosome: %s,' +
                   'message: %s') % (fields[0], e))
    nlmsa.build()
    return annotDB, nlmsa
def test_build(self):
    'Test building an NLMSA and querying results'
    # Two passes over the same splice-site queries:
    #   1. against the NLMSA freshly built from the MAF files;
    #   2. against the same NLMSA after a text dump / binary rebuild.
    # Each pass must reproduce the stored reference output byte for byte
    # (compared through MD5 digests).
    from pygr import seqdb, cnestedlist

    resourceName = 'TEST.MSA.UCSC.dm2_multiz15way'

    def fileDigest(path):
        'Return the MD5 digest of a file, closing the file afterwards.'
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    def writeSpliceSites(msa, inputName, resultName):
        'Query both 2-nt splice sites of every input row; write the edges.'

        def speciesAndSeqID(ival):
            # Names in the union dict look like "<species>.<seqid>".
            fullName = (~msa.seqDict)[ival]
            dotindex = fullName.index('.')
            return fullName[:dotindex], fullName[dotindex + 1:]

        def emptyRow(site, start, stop):
            wlist = str(site), 'dm2', chrid, start, stop, \
                '', '', '', '', ''
            return '\t'.join(map(str, wlist)) + '\n'

        outfile = open(resultName, 'w')
        try:
            infile = open(inputName, 'r')
            try:
                for lines in infile:
                    chrid, intstart, intend, nobs = lines.strip().split('\t')
                    # nobs is converted only to validate it, as before.
                    intstart, intend, nobs = \
                        int(intstart), int(intend), int(nobs)
                    chrom = msa.seqDict['dm2' + '.' + chrid]
                    site1 = chrom[intstart:intstart + 2]
                    site2 = chrom[intend - 2:intend]
                    edges1 = msa[site1].edges()
                    edges2 = msa[site2].edges()
                    if len(edges1) == 0:  # EMPTY EDGES
                        outfile.write(emptyRow(site1, intstart, intstart + 2))
                    if len(edges2) == 0:  # EMPTY EDGES
                        outfile.write(emptyRow(site2, intend - 2, intend))
                    saveList = []
                    for edges in (edges1, edges2):
                        for src, dest, e in edges:
                            # Keep only ungapped 2-nt to 2-nt alignments.
                            if len(str(src)) != 2 or len(str(dest)) != 2:
                                continue
                            srcspecies, src1 = speciesAndSeqID(src)
                            destspecies, dest1 = speciesAndSeqID(dest)
                            wlist = str(src), srcspecies, src1, \
                                src.start, src.stop, str(dest), \
                                destspecies, dest1, dest.start, dest.stop
                            saveList.append('\t'.join(map(str, wlist)) + '\n')
                    # SORTED IN ORDER TO COMPARE WITH PREVIOUS RESULTS
                    saveList.sort()
                    for saveline in saveList:
                        outfile.write(saveline)
            finally:
                infile.close()
        finally:
            outfile.close()

    def queryAndCompare(msa, resultBaseName):
        'Run the splice-site query and compare with the reference output.'
        newOutputName = os.path.join(self.path, resultBaseName)
        tmpInputName = self.copyFile(outfileName)
        tmpOutputName = self.copyFile(outputName)
        writeSpliceSites(msa, tmpInputName, newOutputName)
        assert fileDigest(newOutputName) == fileDigest(tmpOutputName)

    genomedict = {}
    for orgstr in msaSpeciesList:
        genomedict[orgstr] = pygr.Data.getResource('TEST.Seq.Genome.'
                                                   + orgstr)
    uniondict = seqdb.PrefixUnionDict(genomedict)
    if smallSampleKey:
        maflist = (os.path.join(mafDir, smallSampleKey + '.maf'), )
    else:
        maflist = glob.glob(os.path.join(mafDir, '*.maf'))
        maflist.sort()
    msaname = os.path.join(self.path, 'dm2_multiz15way')
    # 500MB VERSION
    msa1 = cnestedlist.NLMSA(msaname, 'w', uniondict, maflist,
                             maxlen=536870912, maxint=22369620)
    msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
    pygr.Data.addResource(resourceName, msa1)
    pygr.Data.save()
    msa = pygr.Data.getResource(resourceName)

    outfileName = os.path.join(testInputDir, 'splicesite_dm2%s.txt'
                               % smallSamplePostfix)
    outputName = os.path.join(testInputDir,
                              'splicesite_dm2%s_multiz15way.txt'
                              % smallSamplePostfix)
    queryAndCompare(msa, 'splicesite_new1.txt')

    # TEXT<->BINARY TEST
    msafilelist = glob.glob(msaname + '*')
    msa.save_seq_dict()
    cnestedlist.dump_textfile(msaname,
                              os.path.join(self.path, 'dm2_multiz15way.txt'))
    for filename in msafilelist:
        os.remove(filename)
    runPath = os.path.realpath(os.curdir)
    os.chdir(self.path)
    try:
        cnestedlist.textfile_to_binaries('dm2_multiz15way.txt')
    finally:
        # Restore the working directory even if the rebuild fails, so a
        # failure here cannot leak into later tests.
        os.chdir(runPath)

    msa1 = cnestedlist.NLMSA(msaname, 'r')
    msa1.__doc__ = 'TEST NLMSA for dm2 multiz15way'
    pygr.Data.addResource(resourceName, msa1)
    pygr.Data.save()
    msa = pygr.Data.getResource(resourceName)
    queryAndCompare(msa, 'splicesite_new2.txt')
def main():
    """Cluster query and target sequences connected by alignments.

    Command line: argv[1] is the query sequence file, argv[2] the target
    sequence file and argv[3] the alignment file.  Fewer than three
    arguments exits silently via SystemExit.

    Every parsed alignment is added to an in-memory pairwise NLMSA; the
    aligned sequence names become edges of an undirected graph; for each
    graph node not already visited, ``write_sequence`` writes the cluster to
    ``cluster_<n>_targets`` / ``cluster_<n>_queries``.  One line per cluster
    member (cluster, name, length) is appended to ``assemgraph.log``.
    Progress is reported on stderr.
    """
    if len(sys.argv) < 4:
        raise SystemExit
    try:
        # NOTE(review): this binds a function-local name that is never read
        # again, so the value has no effect as written.  Presumably a
        # module-level MIN_SCORE was meant to be overridden (and possibly
        # from argv[4]) -- confirm before changing.
        MIN_SCORE = float(sys.argv[5])
    except IndexError:
        pass

    print >> sys.stderr, 'Reading sequence databases...'
    queries = seqdb.SequenceFileDB(sys.argv[1])
    targets = seqdb.SequenceFileDB(sys.argv[2])
    print >> sys.stderr, len(queries), len(targets)

    try:
        align_file = open(sys.argv[3])
    except IOError:
        print >> sys.stderr, 'Error: check alignment file.'
        raise  # bare raise keeps the original traceback

    aligndb = cnestedlist.NLMSA('alignment', mode='memory',
                                pairwiseMode=True)
    print >> sys.stderr, 'Adding sequences to an alignment database...'
    target_list = set()
    try:
        for c, al in enumerate(parse_alignments(align_file)):
            aligndb += targets[al.target]
            target_list.add(al.target)
            add_alignment(aligndb, al, targets, queries)
            if c % 100 == 0:
                print >> sys.stderr, '...', c
    finally:
        # The original never closed the alignment file.
        align_file.close()

    print >> sys.stderr, 'Building the alignment database...'
    aligndb.build()

    print >> sys.stderr, 'Constructing alignment graphs...'
    graph = nx.Graph()
    for c, target in enumerate(target_list):
        try:
            sub_ival = targets[target]
            for src, dest, edge in aligndb[sub_ival].edges():
                # Node name: the interval repr up to '[' with any leading
                # '-' characters removed.
                source = repr(src).split('[')[0].lstrip('-')
                destination = repr(dest).split('[')[0].lstrip('-')
                graph.add_edge(source, destination)
        except KeyError:
            pass
        if c % 100 == 0:
            print >> sys.stderr, '...', c

    logfile = open('assemgraph.log', 'w')
    try:
        visited_nodes = set()
        cluster_no = 0
        for node in graph.nodes():
            if node in visited_nodes:
                continue
            print >> sys.stderr, \
                'Writing cluster %d to a file...' % cluster_no,
            # The context managers close both cluster files even when
            # write_sequence raises.
            with open('cluster_%d_targets' % cluster_no, 'w') as ofile1:
                with open('cluster_%d_queries' % cluster_no, 'w') as ofile2:
                    vnodes, max_length = write_sequence(
                        node, graph, targets, queries, ofile1, ofile2)
            visited_nodes.update(vnodes)
            for n in vnodes:
                size = len(targets[n]) if n in targets else len(queries[n])
                print >> logfile, 'cluster_%d\t%s\t%d' % (cluster_no, n, size)
            print >> sys.stderr, '\ttotal nodes = %d' % len(vnodes)
            cluster_no += 1
        print >> logfile, '***finished***'
    finally:
        logfile.close()
def main(): als, als_chrDic, strDict = [], {}, {} #---------------------------------------------------------- # Required parameters #---------------------------------------------------------- infile = '' indir = '' outprefix = '' bl2seqPATH = '' #---------------------------------------------------------- # Optional parameters #---------------------------------------------------------- min_pair = 1 # Minimum # of discordant read-pairs min_span = 2 # Minimum # of fusion spanning reads min_cov = 10 # Minimum # of base-pairs for both genes. min_shift = 1 # Minimum shifting pattern(bp) around fusion point. if len(sys.argv) == 1: print print "GFP --- A tool to detect fusion genes using RNA-Seq" print "\nRequired parameters" print "\t-i <string> GSNAP result file." print "\t-d <string> Pre-built exon index directory." print "\t-o <string> Output prefix." print "\t--bl2seq <string> bl2seq excutable path." print print "Optional parameters" print "\t--mpair <integer> Minimum # of discordant read-pairs, DEFAULT: %d." % min_pair print "\t--mspan <integer> Minumum # of fusion spanning reads, DEFAULT: %d." % min_span print "\t--mcov <integer> Minimum # of base-pairs for both genes, DEFAULT: %d." % min_cov print "\t--mshift <integer> Minimum # of shifting pattern(bp), DEFAULT: %d." 
% min_shift print sys.exit(1) opts, args = getopt.getopt( sys.argv[1:], "i:d:o:", ["bl2seq=", "mpair=", "mspan=", "mcov=", "mshift="]) for opt, arg in opts: if opt == "-i": infile = arg elif opt == "-d": indir = arg elif opt == "-o": outprefix = arg elif opt == "--bl2seq": bl2seqPATH = arg elif opt == "--mpair": min_pair = int(arg) elif opt == "--mspan": min_span = int(arg) elif opt == "--mcov": min_cov = int(arg) elif opt == "--mshift": min_shift = int(arg) # Generate strand dictionary for line in open(os.path.join(indir, "transcript.bed")).readlines(): fields = line.rstrip().split("\t") strDict[fields[3]] = fields[-1] # Preprocessing for pygr print str(datetime.now()) + "\tPreprocessing pygr requirements..." bedfiles = [] for file in os.listdir(indir): if file.endswith(".bed") and file != "transcript.bed": bedfiles.append(file) for i in range(len(bedfiles)): als.append( cnestedlist.NLMSA(os.path.join(indir, bedfiles[i].split('.')[0]), 'r', pairwiseMode=True)) als_chrDic[bedfiles[i].split('.')[0]] = i # Read GSNAP result print str(datetime.now() ) + "\tReading GSNAP result & extracting fusion evidence..." 
read1Exons, read2Exons = '', '' aligns1, aligns2 = [], [] poss1, poss2 = '', '' strand1, strand2 = '', '' fi = open(infile, 'r') fo = open(outprefix + "_raw.txt", 'w') while 1: line = fi.readline() if not line: break if line.startswith('>'): # Read1 aligns1 = [] read1Exons = '' if line.split()[1] != '1': continue # Skip multiply-mapped read while 1: line = fi.readline() if line == "\n": break aligns1.append(line) poss1, read1Exons, strand1 = exonMapper(aligns1, fo, indir, als, als_chrDic, strDict) if line.startswith('<'): # Read2 aligns2 = [] read2Exons = '' if line.split()[1] != '1': continue while 1: line = fi.readline() if line == "\n": break aligns2.append(line) poss2, read2Exons, strand2 = exonMapper(aligns2, fo, indir, als, als_chrDic, strDict) if read1Exons != '' and read2Exons != '': sameFlg = False sep = re.compile("[;|]+") for read1exon in re.split(sep, read1Exons): for read2exon in re.split(sep, read2Exons): if read1exon.split('.')[1] == read2exon.split('.')[1]: sameFlg = True if not sameFlg: # Putative fusion pair # Donor check module! ts1, ts2 = [], [] # Transcript strand read1/read2 for read1exon in re.split(sep, read1Exons): for strand in strDict[read1exon.split('.')[0]].split( '/'): if not strand in ts1: ts1.append(strand) for read2exon in re.split(sep, read2Exons): for strand in strDict[read2exon.split('.')[0]].split( '/'): if not strand in ts2: ts2.append(strand) donor = check_pDonor(ts1, ts2, strand1, strand2) if donor != "NA": ctext = "pINTER" for pos1 in poss1.split(';'): for pos2 in poss2.split(';'): if pos1.split(':')[0] == pos2.split(':')[0]: ctext = "pINTRA" # No swap! fo.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (ctext, poss1, poss2, read1Exons, read2Exons, strand1, strand2, donor)) fo.close() fi.close() strDict.clear() #--------------------------------------------------- # Extracting fusion evidences is terminated. # Generating gene fusion candidates will start. 
#--------------------------------------------------- print str( datetime.now()) + "\tEstimating homology & distance between genes..." txPos, txFASTA = {}, {} build_txPos(indir, txPos) build_txFASTA(indir, txFASTA) # Start reading "outprefix_raw.txt" sep = re.compile("[;|]+") # Set separators homologous_pairs = {} fi = open(outprefix + "_raw.txt", 'r') fusions = [] # Fusion class list while 1: line = fi.readline() if not line: break fields = line.rstrip().split("\t") hNames, tNames = [], [] for exon in re.split(sep, fields[3]): hNames.append(exon.split('.')[0]) for exon in re.split(sep, fields[4]): tNames.append(exon.split('.')[0]) # Sequence Homology Detection by bl2seq is_homologous = check_homology(hNames, tNames, bl2seqPATH, homologous_pairs, txFASTA, outprefix) # Estimate genes' distance minDist = estimate_dist(hNames, tNames, txPos) if minDist != "NA": if minDist < 0: continue # Overlapping genes # Update Fusion list if not is_homologous: # Putative gene fusion donor_acceptor = fields[3].split('.')[1] + "\t" + fields[4].split( '.')[1] if fields[-1] == "read2" or fields[-1] == "tail": donor_acceptor = fields[4].split( '.')[1] + "\t" + fields[3].split('.')[1] type, ctext = fields[0][:1], fields[0][1:] does_exist = False for fusion in fusions: if donor_acceptor == fusion.name: does_exist = True if type == 'p': fusion.nPairs += 1 else: fusion.nSpans += 1 if minDist != "NA": if fusion.minDist == "NA": fusion.minDist = str(minDist) elif minDist < int(fusion.minDist): fusion.minDist = str(minDist) fusion.evidences.append(line.rstrip()) if not does_exist: fusion = Fusion(donor_acceptor, ctext) if type == 'p': fusion.nPairs += 1 else: fusion.nSpans += 1 fusion.minDist = str(minDist) fusion.evidences.append(line.rstrip()) fusions.append(fusion) os.system("rm %s_temp1.fasta %s_temp2.fasta %s_temp.bl2seqout" % (outprefix, outprefix, outprefix)) fi.close() homologous_pairs.clear() txFASTA.clear() txPos.clear() #--------------------------------------------------- # 
Generating fusion gene candidates is terminated. # Further filtering cascade will be applied. #--------------------------------------------------- print str(datetime.now() ) + "\tApplying filtering steps & generating output files..." fusionNum = 0 fo_list = open(outprefix + "_fusionList.txt", 'w') fo_list.write("ID\tdonor\tacceptor\tcontext\tdist\tnum_pair\tnum_span\n") fo_evidence = open(outprefix + "_fusionEvidence.txt", 'w') fo_evidence.write( "ID\tevidence_type\tdonor_pos\tacceptor_pos\tdonor_exon\tacceptor_exon\n" ) for fusion in fusions: if fusion.nPairs < min_pair or fusion.nSpans < min_span: continue spans = [] for evidence in fusion.evidences: if evidence.split("\t")[0][:1] == 's': spans.append(evidence) spanClusters = cov_filter(spans, min_cov) for cluster in spanClusters: if len(cluster) < min_span: continue shift_pass, fusionNum = shift_filter(cluster, min_shift, fo_evidence, fusionNum) if not shift_pass: continue for evidence in fusion.evidences: evidence_fields = evidence.split("\t") if evidence_fields[0][:1] == 'p': donor = evidence_fields[-1] if donor == "read1": fo_evidence.write( "GF%d\tread-pair\t%s\t%s\t%s\t%s\n" % (fusionNum, evidence_fields[1], evidence_fields[2], evidence_fields[3], evidence_fields[4])) else: fo_evidence.write( "GF%d\tread-pair\t%s\t%s\t%s\t%s\n" % (fusionNum, evidence_fields[2], evidence_fields[1], evidence_fields[4], evidence_fields[3])) fo_list.write("GF%d\t%s\t%s\t%s\t%d\t%d\n" % (fusionNum, fusion.name, fusion.ctext, fusion.minDist, fusion.nPairs, len(cluster))) fo_evidence.close() fo_list.close() print str(datetime.now()) + "\tGFP is successfully terminated."
def test_mysqlannot(self):
    'Test building an AnnotationDB from MySQL'
    # Part 1 builds three SQL-backed annotation databases (exons, splices,
    # most-conserved elements), each with its NLMSA map and schema, and
    # saves them.  Part 2 reloads them, intersects exons and introns with
    # the conserved elements and the dm2 multiz alignment, and compares the
    # result with the stored reference files through MD5 digests.
    from pygr import seqdb, cnestedlist, sqlgraph
    dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')

    # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS: MYSQL VERSION
    exon_slices = sqlgraph.SQLTableClustered(
        '%s.pygr_refGene_exonAnnot%s_dm2' % (testInputDB,
                                             smallSamplePostfix),
        clusterKey='chromosome', maxCache=0)
    exon_db = seqdb.AnnotationDB(exon_slices, dm2,
                                 sliceAttrDict=dict(id='chromosome',
                                                    gene_id='name',
                                                    exon_id='exon_id'))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'refGene_exonAnnot_SQL_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    for annot_id in exon_db:
        msa.addAnnotation(exon_db[annot_id])
    exon_db.clear_cache()  # not really necessary; cache should autoGC
    exon_slices.clear_cache()
    msa.build()
    exon_db.__doc__ = 'SQL Exon Annotation Database for dm2'
    pygr.Data.addResource('TEST.Annotation.SQL.dm2.exons', exon_db)
    msa.__doc__ = 'SQL NLMSA Exon for dm2'
    pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.exons', msa)
    exon_schema = pygr.Data.ManyToManyRelation(dm2, exon_db,
                                               bindAttrs=('exon2', ))
    exon_schema.__doc__ = 'SQL Exon Schema for dm2'
    pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.exons', exon_schema)

    # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES: MYSQL VERSION
    splice_slices = sqlgraph.SQLTableClustered(
        '%s.pygr_refGene_spliceAnnot%s_dm2' % (testInputDB,
                                               smallSamplePostfix),
        clusterKey='chromosome', maxCache=0)
    splice_db = seqdb.AnnotationDB(splice_slices, dm2,
                                   sliceAttrDict=dict(id='chromosome',
                                                      gene_id='name',
                                                      splice_id='splice_id'))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'refGene_spliceAnnot_SQL_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    for annot_id in splice_db:
        msa.addAnnotation(splice_db[annot_id])
    splice_db.clear_cache()  # not really necessary; cache should autoGC
    splice_slices.clear_cache()
    msa.build()
    splice_db.__doc__ = 'SQL Splice Annotation Database for dm2'
    pygr.Data.addResource('TEST.Annotation.SQL.dm2.splices', splice_db)
    msa.__doc__ = 'SQL NLMSA Splice for dm2'
    pygr.Data.addResource('TEST.Annotation.NLMSA.SQL.dm2.splices', msa)
    splice_schema = pygr.Data.ManyToManyRelation(dm2, splice_db,
                                                 bindAttrs=('splice2', ))
    splice_schema.__doc__ = 'SQL Splice Schema for dm2'
    pygr.Data.addSchema('TEST.Annotation.NLMSA.SQL.dm2.splices',
                        splice_schema)

    # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC:
    # MYSQL VERSION
    ucsc_slices = sqlgraph.SQLTableClustered(
        '%s.pygr_phastConsElements15way%s_dm2' % (testInputDB,
                                                  smallSamplePostfix),
        clusterKey='chromosome', maxCache=0)
    ucsc_db = seqdb.AnnotationDB(ucsc_slices, dm2,
                                 sliceAttrDict=dict(id='chromosome',
                                                    gene_id='name',
                                                    ucsc_id='ucsc_id'))
    msa = cnestedlist.NLMSA(os.path.join(self.path,
                                         'phastConsElements15way_SQL_dm2'),
                            'w', pairwiseMode=True, bidirectional=False)
    for annot_id in ucsc_db:
        msa.addAnnotation(ucsc_db[annot_id])
    ucsc_db.clear_cache()  # not really necessary; cache should autoGC
    ucsc_slices.clear_cache()
    msa.build()
    ucsc_db.__doc__ = 'SQL Most Conserved Elements for dm2'
    pygr.Data.addResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved',
                          ucsc_db)
    msa.__doc__ = 'SQL NLMSA for Most Conserved Elements for dm2'
    pygr.Data.addResource(
        'TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved', msa)
    ucsc_schema = pygr.Data.ManyToManyRelation(dm2, ucsc_db,
                                               bindAttrs=('element2', ))
    ucsc_schema.__doc__ = \
        'SQL Schema for UCSC Most Conserved Elements for dm2'
    pygr.Data.addSchema('TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved',
                        ucsc_schema)
    pygr.Data.save()
    pygr.Data.clear_cache()

    # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
    dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
    exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.exons')
    splicemsa = \
        pygr.Data.getResource('TEST.Annotation.NLMSA.SQL.dm2.splices')
    conservedmsa = pygr.Data.getResource(
        'TEST.Annotation.UCSC.NLMSA.SQL.dm2.mostconserved')
    exons = pygr.Data.getResource('TEST.Annotation.SQL.dm2.exons')
    splices = pygr.Data.getResource('TEST.Annotation.SQL.dm2.splices')
    mostconserved = \
        pygr.Data.getResource('TEST.Annotation.UCSC.SQL.dm2.mostconserved')

    # OPEN DM2_MULTIZ15WAY NLMSA
    msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'), 'r',
                            trypath=[seqDir])

    exonAnnotFileName = os.path.join(
        testInputDir,
        'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix)
    intronAnnotFileName = os.path.join(
        testInputDir,
        'Annotation_ConservedElement_Introns%s_dm2.txt'
        % smallSamplePostfix)
    newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
    newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
    tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
    tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)

    if smallSampleKey:
        chrList = [smallSampleKey]
    else:
        chrList = dm2.seqLenDict.keys()
        chrList.sort()

    def fileDigest(path):
        'Return the MD5 digest of a file, closing the file afterwards.'
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    def writeConservedOverlaps(label, idAttr, annotmsa, annotdb, outName):
        '''Write alignment rows for the annotations in annotmsa that overlap
        a most-conserved element.  label is the first output column and
        idAttr names the annotation id attribute.'''
        outfile = open(outName, 'w')
        try:
            for chrid in chrList:
                chrom = dm2[chrid]
                try:
                    hits = annotmsa[chrom]
                except KeyError:
                    # Skip chromosomes with no entry in this map.  The
                    # intron pass used a bare "except:", which hid every
                    # other error as well.
                    continue
                annotlist = [(getattr(ix, idAttr), ix) for ix in hits.keys()]
                annotlist.sort()
                for ixx, annot in annotlist:
                    saveList = []
                    tmp = annot.sequence
                    tmpannot = annotdb[getattr(annot, idAttr)]
                    tmpslice = tmpannot.sequence  # FOR REAL COORDINATE
                    wlist1 = (label, chrid, getattr(tmpannot, idAttr),
                              tmpannot.gene_id, tmpslice.start,
                              tmpslice.stop)
                    try:
                        out1 = conservedmsa[tmp]
                    except KeyError:
                        continue
                    elementlist = [(ix.ucsc_id, ix) for ix in out1.keys()]
                    elementlist.sort()
                    for iyy, element in elementlist:
                        if element.stop - element.start < 100:
                            continue
                        # gene_id is split on '='; the part after it is
                        # used as an integer score.
                        score = int(element.gene_id.split('=')[1])
                        if score < 100:
                            continue
                        tmp2 = element.sequence
                        tmpelement = mostconserved[element.ucsc_id]
                        # FOR REAL ELEMENT COORDINATE
                        tmpslice2 = tmpelement.sequence
                        wlist2 = wlist1 + (tmpelement.ucsc_id,
                                           tmpelement.gene_id,
                                           tmpslice2.start, tmpslice2.stop)
                        slicestart = max(tmp.start, tmp2.start)
                        sliceend = min(tmp.stop, tmp2.stop)
                        tmp1 = msa.seqDict['dm2.' + chrid][slicestart:
                                                           sliceend]
                        for src, dest, e in msa[tmp1].edges():
                            if src.stop - src.start < 100:
                                continue
                            palign, pident = e.pAligned(), e.pIdentity()
                            if palign < 0.8 or pident < 0.8:
                                continue
                            palign, pident = '%.2f' % palign, \
                                '%.2f' % pident
                            wlist3 = wlist2 + ((~msa.seqDict)[src],
                                               str(src), src.start,
                                               src.stop,
                                               (~msa.seqDict)[dest],
                                               str(dest), dest.start,
                                               dest.stop, palign, pident)
                            saveList.append('\t'.join(map(str, wlist3))
                                            + '\n')
                    saveList.sort()
                    for saveline in saveList:
                        outfile.write(saveline)
        finally:
            outfile.close()

    writeConservedOverlaps('EXON', 'exon_id', exonmsa, exons,
                           newexonAnnotFileName)
    assert fileDigest(tmpexonAnnotFileName) == \
        fileDigest(newexonAnnotFileName)

    writeConservedOverlaps('INTRON', 'splice_id', splicemsa, splices,
                           newintronAnnotFileName)
    assert fileDigest(tmpintronAnnotFileName) == \
        fileDigest(newintronAnnotFileName)
def test_collectionannot(self):
    """Test building an AnnotationDB from file

    Builds three Collection-backed AnnotationDBs on dm2 (refGene exons,
    refGene splices and UCSC phastCons most-conserved elements), each with
    a pairwise NLMSA, and registers them with pygr.Data.  After saving and
    clearing the pygr.Data cache the resources are reloaded and used,
    together with the dm2_multiz15way alignment, to regenerate the exon and
    intron conserved-element reports.  Each regenerated report must have
    the same MD5 digest as the reference file copied from testInputDir.
    """
    from pygr import seqdb, cnestedlist, sqlgraph

    genome = pygr.Data.getResource('TEST.Seq.Genome.dm2')

    def build_annotation(base, id_attr, db_doc, db_id, msa_doc, msa_id,
                         bind_attr, schema_doc):
        'Load one annotation table into an AnnotationDB + NLMSA; register both'
        slices = Collection(
            filename=os.path.join(self.path, base + '_dm2.cdb'),
            intKeys=True, mode='cr', writeback=False)
        # column 1 of every input row holds the table-specific integer ID
        attr_map = dict(id=0, orientation=2, gene_id=3, start=4, stop=5)
        attr_map[id_attr] = 1
        annot_db = seqdb.AnnotationDB(slices, genome, sliceAttrDict=attr_map)
        annot_msa = cnestedlist.NLMSA(os.path.join(self.path, base + '_dm2'),
                                      'w', pairwiseMode=True,
                                      bidirectional=False)
        infile = open(os.path.join(testInputDir, '%s%s_dm2.txt'
                                   % (base, smallSamplePostfix)), 'r')
        try:
            for line in infile:
                row = line.split('\t')  # a list, so it is mutable
                row[1] = int(row[1])  # CONVERT FROM STRING TO INTEGER
                slices[row[1]] = row
                # GET THE ANNOTATION OBJECT AND SAVE IT TO GENOME MAPPING
                annot_msa.addAnnotation(annot_db[row[1]])
        finally:
            infile.close()
        annot_db.clear_cache()  # not really necessary; cache should autoGC
        # SHELVE SHOULD BE EXPLICITLY CLOSED IN ORDER TO SAVE CURRENT CONTENTS
        slices.close()
        annot_msa.build()  # FINALIZE GENOME ALIGNMENT INDEXES
        annot_db.__doc__ = db_doc
        pygr.Data.addResource(db_id, annot_db)
        annot_msa.__doc__ = msa_doc
        pygr.Data.addResource(msa_id, annot_msa)
        schema = pygr.Data.ManyToManyRelation(genome, annot_db,
                                              bindAttrs=(bind_attr, ))
        schema.__doc__ = schema_doc
        pygr.Data.addSchema(msa_id, schema)

    # BUILD ANNOTATION DATABASE FOR REFSEQ EXONS
    build_annotation('refGene_exonAnnot', 'exon_id',
                     'Exon Annotation Database for dm2',
                     'TEST.Annotation.dm2.exons',
                     'NLMSA Exon for dm2',
                     'TEST.Annotation.NLMSA.dm2.exons',
                     'exon1', 'Exon Schema for dm2')
    # BUILD ANNOTATION DATABASE FOR REFSEQ SPLICES
    build_annotation('refGene_spliceAnnot', 'splice_id',
                     'Splice Annotation Database for dm2',
                     'TEST.Annotation.dm2.splices',
                     'NLMSA Splice for dm2',
                     'TEST.Annotation.NLMSA.dm2.splices',
                     'splice1', 'Splice Schema for dm2')
    # BUILD ANNOTATION DATABASE FOR MOST CONSERVED ELEMENTS FROM UCSC
    build_annotation('phastConsElements15way', 'ucsc_id',
                     'Most Conserved Elements for dm2',
                     'TEST.Annotation.UCSC.dm2.mostconserved',
                     'NLMSA for Most Conserved Elements for dm2',
                     'TEST.Annotation.UCSC.NLMSA.dm2.mostconserved',
                     'element1',
                     'Schema for UCSC Most Conserved Elements for dm2')

    pygr.Data.save()
    pygr.Data.clear_cache()  # force resources to reload when requested

    # QUERY TO EXON AND SPLICES ANNOTATION DATABASE
    dm2 = pygr.Data.getResource('TEST.Seq.Genome.dm2')
    exonmsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.exons')
    splicemsa = pygr.Data.getResource('TEST.Annotation.NLMSA.dm2.splices')
    conservedmsa = \
        pygr.Data.getResource('TEST.Annotation.UCSC.NLMSA.dm2.mostconserved')
    exons = pygr.Data.getResource('TEST.Annotation.dm2.exons')
    splices = pygr.Data.getResource('TEST.Annotation.dm2.splices')
    mostconserved = \
        pygr.Data.getResource('TEST.Annotation.UCSC.dm2.mostconserved')

    # OPEN DM2_MULTIZ15WAY NLMSA
    msa = cnestedlist.NLMSA(os.path.join(msaDir, 'dm2_multiz15way'), 'r',
                            trypath=[seqDir])

    exonAnnotFileName = os.path.join(
        testInputDir,
        'Annotation_ConservedElement_Exons%s_dm2.txt' % smallSamplePostfix)
    intronAnnotFileName = os.path.join(
        testInputDir,
        'Annotation_ConservedElement_Introns%s_dm2.txt' % smallSamplePostfix)
    newexonAnnotFileName = os.path.join(self.path, 'new_Exons_dm2.txt')
    newintronAnnotFileName = os.path.join(self.path, 'new_Introns_dm2.txt')
    tmpexonAnnotFileName = self.copyFile(exonAnnotFileName)
    tmpintronAnnotFileName = self.copyFile(intronAnnotFileName)

    if smallSampleKey:
        chrList = [smallSampleKey]
    else:
        chrList = sorted(dm2.seqLenDict.keys())

    def write_report(out_path, label, annot_msa, annot_db, id_attr):
        'Write the conserved-element report for one annotation type'
        outfile = open(out_path, 'w')
        try:
            for chrid in chrList:
                chrom = dm2[chrid]
                try:
                    hits = annot_msa[chrom]
                except KeyError:  # no annotation on this chromosome
                    continue
                annots = sorted((getattr(ix, id_attr), ix)
                                for ix in hits.keys())
                for _, annot in annots:
                    saveList = []
                    tmp = annot.sequence
                    stored = annot_db[getattr(annot, id_attr)]
                    stored_seq = stored.sequence  # FOR REAL COORDINATE
                    wlist1 = (label, chrid, getattr(stored, id_attr),
                              stored.gene_id, stored_seq.start,
                              stored_seq.stop)
                    try:
                        out1 = conservedmsa[tmp]
                    except KeyError:  # no conserved element overlaps it
                        continue
                    elements = sorted((ix.ucsc_id, ix)
                                      for ix in out1.keys())
                    for _, element in elements:
                        if element.stop - element.start < 100:
                            continue
                        # the score is the part of gene_id after '='
                        score = int(element.gene_id.split('=')[1])
                        if score < 100:
                            continue
                        tmp2 = element.sequence
                        tmpelement = mostconserved[element.ucsc_id]
                        # FOR REAL ELEMENT COORDINATE
                        tmpslice2 = tmpelement.sequence
                        wlist2 = wlist1 + (tmpelement.ucsc_id,
                                           tmpelement.gene_id,
                                           tmpslice2.start, tmpslice2.stop)
                        # intersection of the annotation and the element
                        slicestart = max(tmp.start, tmp2.start)
                        sliceend = min(tmp.stop, tmp2.stop)
                        tmp1 = msa.seqDict['dm2.' + chrid][slicestart:
                                                           sliceend]
                        for src, dest, e in msa[tmp1].edges():
                            if src.stop - src.start < 100:
                                continue
                            palign, pident = e.pAligned(), e.pIdentity()
                            if palign < 0.8 or pident < 0.8:
                                continue
                            wlist3 = wlist2 + ((~msa.seqDict)[src],
                                               str(src), src.start,
                                               src.stop,
                                               (~msa.seqDict)[dest],
                                               str(dest), dest.start,
                                               dest.stop,
                                               '%.2f' % palign,
                                               '%.2f' % pident)
                            saveList.append('\t'.join(map(str, wlist3))
                                            + '\n')
                    # lines are sorted per annotation so output is stable
                    saveList.sort()
                    outfile.writelines(saveList)
        finally:
            outfile.close()

    def file_md5(path):
        'Return the MD5 digest of the whole file at path'
        handle = open(path, 'r')
        try:
            return hashlib.md5(handle.read()).digest()
        finally:
            handle.close()

    write_report(newexonAnnotFileName, 'EXON', exonmsa, exons, 'exon_id')
    assert file_md5(tmpexonAnnotFileName) == file_md5(newexonAnnotFileName)

    write_report(newintronAnnotFileName, 'INTRON', splicemsa, splices,
                 'splice_id')
    assert file_md5(tmpintronAnnotFileName) == \
        file_md5(newintronAnnotFileName)
def make_transcriptome(in_genes, out_files):
    """Splice UTR's and exons from gene annotations into a transcriptome.

    Creates a fasta-file of resulting genes and a gene to genome alignment.

    in_genes is the path of a tab-separated gene table; out_files is a
    3-tuple (out_fasta, out_db, out_msa).  The NLMSA is written to out_msa,
    spliced sequences to out_fasta, and a pickled {seq_id: Sequence} dict
    to out_db.
    """
    out_fasta, out_db, out_msa = out_files
    startCol = 1  # index of the gene-name column in each input line
    msa = cnestedlist.NLMSA(out_msa, mode='w', pairwiseMode=True)
    genome = get_genome(None, None, touch_file=False)
    for chrom_seq in genome.values():
        msa += chrom_seq
    # try/finally guarantees both files are closed even if a line fails
    outfile = open(out_fasta, 'w')
    try:
        genes_file = open(in_genes)
        try:
            gene_db = _splice_genes(genes_file, outfile, genome, msa,
                                    startCol)
        finally:
            genes_file.close()
    finally:
        outfile.close()
    msa.build(saveSeqDict=True)
    # pickle data should be written through a binary-mode file
    db_file = open(out_db, 'wb')
    try:
        pickle.dump(gene_db, db_file)
    finally:
        db_file.close()


def _parse_gene_line(line, start_col):
    """Split one tab-separated gene record into the fields used for splicing."""
    fields = line.split('\t')
    name, chrom, _strand = fields[start_col:start_col + 3]
    tx_start, tx_end, cds_start, cds_end = map(
        int, fields[start_col + 3:start_col + 7])
    # exon starts / ends are comma-separated lists; [:-1] drops the last
    # character of each field before splitting
    exons = list(zip(map(int, fields[start_col + 8][:-1].split(',')),
                     map(int, fields[start_col + 9][:-1].split(','))))
    name2 = fields[start_col + 11]
    return name, chrom, tx_start, tx_end, cds_start, cds_end, exons, name2


def _splice_genes(gene_lines, outfile, genome, msa, start_col):
    """Align every gene line to msa, write spliced genes, return the gene dict."""
    gene_db = {}
    for i, line in enumerate(gene_lines):
        print(i)
        (name, chrom, tx_start, tx_end,
         cds_start, cds_end, exons, name2) = _parse_gene_line(line, start_col)
        noncoding = name.startswith('NR_') or cds_start < 0 or cds_end < 0
        if 'hap' in chrom:
            continue
        # create a record for the gene
        seq_id = '%s_%s_%s' % (i, name, name2)
        if noncoding or not exons:
            # add entire tx region
            # NOTE(review): this branch does not write to the fasta file;
            # confirm that is intended
            region = genome[chrom][tx_start:tx_end]
            sequence = seqdb.Sequence(str(region), seq_id)
            msa[region] += sequence
        else:
            # genomic intervals making up the transcript, in order; the same
            # list drives both the spliced sequence and its alignment
            pieces = []
            if tx_start < cds_start:
                pieces.append((tx_start, cds_start))
            pieces.extend((e_start, e_end) for e_start, e_end in exons
                          if e_start < e_end)
            has_tail = exons[-1][1] < tx_end
            if has_tail:
                pieces.append((exons[-1][1], tx_end))
            regions = [genome[chrom][start:stop] for start, stop in pieces]
            sequence = seqdb.Sequence(''.join(str(r) for r in regions),
                                      seq_id)
            # save the sequence to a fasta file
            outfile.write('>%s\n%s\n' % (seq_id, str(sequence)))
            if has_tail:
                print('%s %s' % (exons[-1], tx_end))
            # make the alignment back to genomic coords
            p_start = 0
            for region in regions:
                msa[region] += sequence[p_start:p_start + len(region)]
                p_start += len(region)
        gene_db[seq_id] = sequence
    return gene_db