def parse(fileName, junctions):
    """Pass every alignment record of ``fileName`` to ``construct``.

    The file is opened for the duration of the scan and handed to
    ``psl_parser.read`` together with the literal ``'track'``
    (presumably the marker of header lines to skip -- confirm against
    ``psl_parser``).  For each record yielded, the ``tName``, ``tStarts``
    and ``blockSizes`` entries of its ``attrib`` mapping are forwarded to
    ``construct`` along with ``junctions``.

    Args:
        fileName: path of the alignment file to read.
        junctions: object forwarded unchanged to ``construct`` on every
            call (presumably an accumulator it fills in -- verify against
            ``construct``).

    Returns:
        None.
    """
    with open(fileName) as alignmentFile:
        for record in psl_parser.read(alignmentFile, 'track'):
            fields = record.attrib
            # Look the keys up in the same order as before so that a
            # malformed record fails on the same missing key.
            starts = fields['tStarts']
            sizes = fields['blockSizes']
            construct(fields['tName'], starts, sizes, junctions)
def main(options, args):
    """Build gene models from the alignments in ``options.infile`` and write them out.

    The steps run in this order, each announced on stderr:

    1. Read the alignments with ``psl_parser.read`` and feed each one to
       ``construct``, which updates the exon/cluster bookkeeping dicts and
       returns the next cluster ID.
    2. Merge clusters with ``mergeClusters``.
    3. Clean up each merged cluster with ``cleanUpLinkedExons`` and adjust
       transcript ends with ``findLongestEnd``.
    4. Build exon paths per cluster with ``buildPaths``.
    5. Wrap every path in an ``Isoform``, flag redundant ones with
       ``findRedundantSequence`` and run ``findORF`` on the rest.
    6. Write the models with ``writeBEDFile`` and the sequences with
       ``SeqIO.write`` to ``<basename>.dnas.fa``, ``<basename>.mrnas.fa``
       and ``<basename>.proteins.fa``.

    Args:
        options: object providing ``infile``, ``genome``, ``basename`` and
            ``minimumUTRLength`` attributes.
        args: positional arguments; not used by this function.

    Returns:
        None.
    """
    exons = {}
    clusters = {}
    newClusterID = 0
    clusterConnections = {}
    linkedExons = {}
    exonPositions = {}
    endExons = {}
    singleton = 0

    print >> sys.stderr, 'Minimum UTR length = ', options.minimumUTRLength
    print >> sys.stderr, 'Parsing and clustering exons..'

    # Open the input in a ``with`` block so the handle is closed once all
    # alignments have been consumed (previously it was never closed).
    with open(options.infile) as alignmentFile:
        for n, alnObj in enumerate(psl_parser.read(alignmentFile, 'track')):
            tStarts = alnObj.attrib['tStarts']
            blockSizes = alnObj.attrib['blockSizes']

            # An alignment with a single block is counted as a singleton.
            if len(blockSizes) == 1:
                singleton += 1

            tName = alnObj.attrib['tName']
            newClusterID = construct(tName, tStarts, blockSizes,
                                     exons, clusters, newClusterID,
                                     clusterConnections, linkedExons,
                                     exonPositions, endExons)

            if n % 1000 == 0:
                print >> sys.stderr, '...', n

    print >> sys.stderr, 'Total singletons = ', singleton

    # Count exons per reference; the keys of ``exons`` unpack as 2-tuples
    # whose first item is the reference name.
    sumExons = {}
    for ref, end in exons:
        sumExons[ref] = sumExons.get(ref, 0) + 1

    for ref in sorted(sumExons):
        print >> sys.stderr, '\t%s has %d exon(s).' % (ref, sumExons[ref])

    print >> sys.stderr, '\nTotal %d cluster(s) found.' % len(clusters)

    print >> sys.stderr, '\nMerging clusters..'
    mergedClusters = mergeClusters(clusters, clusterConnections)

    print >> sys.stderr, '\nCleaning up..'
    ignored = set()
    for cl in mergedClusters:
        allExons = mergedClusters[cl]
        cleanUpLinkedExons(allExons, linkedExons, exonPositions,
                           ignored, options.minimumUTRLength)

    print >> sys.stderr, 'Modifying the right end of each transcript..'
    for cl in mergedClusters:
        findLongestEnd(mergedClusters[cl], linkedExons, endExons,
                       exonPositions, ignored)

    print >> sys.stderr, '\nConstructing transcripts..'
    allPaths = {}
    visited = set()
    for n, cl in enumerate(mergedClusters):
        txExons = sorted(mergedClusters[cl])
        paths = buildPaths(linkedExons, txExons, allPaths, ignored, visited)
        allPaths[cl] = paths
        if n > 0 and n % 1000 == 0:
            print >> sys.stderr, '... %d built..' % n

    genome = seqdb.SequenceFileDB(options.genome, verbose=False)

    # Create isoform objects from allPaths and search for ORF.
    print >> sys.stderr, '\nBuilding gene models..'
    allGenes = {}
    n = 0
    for chrom, geneID in allPaths:
        n += 1
        isoformID = 0
        for isoExons in allPaths[(chrom, geneID)]:
            isoform = Isoform(chrom, geneID, isoformID, isoExons, genome)
            # allGenes maps chrom -> geneID -> list of isoforms.
            allGenes.setdefault(chrom, {}).setdefault(geneID, []).append(isoform)
            isoformID += 1

        if n % 1000 == 0:
            print >> sys.stderr, '...', n

    print >> sys.stderr, '\nRemoving redundant sequences..'
    findRedundantSequence(allGenes)

    # Create sequence records for each DNA, RNA and protein sequence.
    isoformDNASeqs = []
    isoformProteinSeqs = []
    isoformRNASeqs = []
    totalGenes = 0
    # Dedicated progress counter: the loop used to test the stale ``n``
    # left over from the gene-building loop, so the progress line was
    # printed either for every transcript or not at all.
    transcriptsDone = 0
    for chrom in allGenes:
        for geneID in allGenes[chrom]:
            totalGenes += 1
            isoformID = 0
            for isoform in allGenes[chrom][geneID]:
                if isoform.redundant:
                    continue

                # Renumber the surviving isoforms consecutively from 0.
                isoform.isoformID = isoformID
                isoformName = '%s:%d.%d' % (chrom, geneID, isoform.isoformID)
                DNARecord = SeqRecord(isoform.dnaSeq, id=isoformName)
                isoformDNASeqs.append(DNARecord)

                # Search for ORF for non-redundant sequences.
                print >> sys.stderr, 'searching ORF: %s:%d.%d' \
                    % (chrom, geneID, isoformID)
                findORF(isoform)

                # Protein and mRNA records are only emitted when
                # ``isoform.frame`` is truthy after the ORF search.
                if isoform.frame:
                    proteinRecord = SeqRecord(isoform.proteinSeq,
                                              id=isoformName)
                    RNARecord = SeqRecord(isoform.mrnaSeq, id=isoformName)
                    isoformProteinSeqs.append(proteinRecord)
                    isoformRNASeqs.append(RNARecord)
                isoformID += 1

                transcriptsDone += 1
                if transcriptsDone % 1000 == 0:
                    print >> sys.stderr, '...', transcriptsDone, \
                        'transcripts done.'

    # Apply the format with ``%``; the original passed ``totalGenes`` as a
    # second print argument, leaving a literal "%d" in the output.
    print >> sys.stderr, 'Total genes = %d\n\n' % totalGenes

    print >> sys.stderr, 'Writing gene models to file...'
    writeBEDFile(allGenes, options.basename)

    print >> sys.stderr, 'Writing DNA sequences to file...'
    SeqIO.write(isoformDNASeqs, options.basename + '.dnas.fa', 'fasta')

    print >> sys.stderr, 'Writing RNA sequences to file...'
    SeqIO.write(isoformRNASeqs, options.basename + '.mrnas.fa', 'fasta')

    print >> sys.stderr, 'Writing protein sequences to file...'
    SeqIO.write(isoformProteinSeqs, options.basename + '.proteins.fa', 'fasta')
if exon_end == intron_start: as_exons = [exon_start] coverage = len([alread for alread in samfile.fetch(chr, exon_start, exon_end)]) print '>%d\t%d\t%d' % (exon_start, exon_end, coverage) break for intron_end in junctions[chr][intron_start]: # es = intron end site ex_start = intron_end # intron end site = exon start site for ex_end in exons[chr][ex_start]: coverage = len([alread for alread in samfile.fetch(chr, ex_start , ex_end)]) print '%d\t%d\t%d' % (ex_start, ex_end, coverage) as_exons.append(ex_end) as_exons.sort() print '%d-%d' % (as_exons[0], as_exons[-1]) if __name__ == '__main__': fname = sys.argv[1] samfile = pysam.Samfile(sys.argv[2], 'rb') SNP_file = sys.argv[3] comment = 'track' all_transcripts = [] for line in psl_parser.read(open(fname), comment): all_transcripts.append(line) exons, junctions = construct(all_transcripts) #AS_coverage(exons, junctions, samfile) #cassette_exon(exons, junctions, samfile) export_junctions(junctions)