def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms,
                         fragment_size, totalcount, out_file):
    """
    Accumulate, chromosome by chromosome, the results of
    calculateExonIntrons_by_chrom for every gene in entrez_genes.

    entrez_genes: KnownEntrezGenes-style object; this function uses its
        `chroms` attribute and its `subset_by_chrom(chrom)` method.
    bedfile: path of the read file. It is split into per-chromosome files
        named chrom + "-<lib_name>.<suffix>1" when those do not exist yet.
    column_index: column handed to Utility_extended.separate_by_chrom_sort
        as the sort key.
    chroms: list of chromosome names to process.
    fragment_size, totalcount, out_file: passed through unchanged to
        calculateExonIntrons_by_chrom.

    return: (all_reads_on_shared_exons, all_reads_on_shared_introns,
        all_reads_on_merged_transcripts, all_summary), each one the
        dict.update() merge of the per-chromosome dictionaries:
        all_reads_on_shared_exons = {} # {entrezID:[((start, end), read_count)]}
        all_reads_on_shared_introns = {} # {entrezID:[((start, end), read_count)]}
        all_reads_on_merged_transcripts = {} # {entrezID:[((start, end), read_count)]}

    Exits the process with status 1 when bedfile cannot be found.
    """
    lib_name = bedfile.split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, extension) != 1:
        # Separate by chrom and sort by start
        print("%s %s %s" % (chroms, extension,
                            " files do not exist, separate by chroms. "))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension,
                                                [column_index])

    all_reads_on_shared_exons = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_shared_introns = {}  # {entrezID:[((start, end), read_count)]}
    all_reads_on_merged_transcripts = {}  # {entrezID:[((start, end), read_count)]}
    all_summary = {}

    for chrom in chroms:
        # Skip chromosomes that carry no genes, as the other per-chromosome
        # drivers in this file do.
        if chrom not in entrez_genes.chroms:
            continue
        chrombed = chrom + extension
        entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
            [chrom], entrez_genes.subset_by_chrom(chrom))
        (reads_on_shared_exons, reads_on_shared_introns,
         reads_on_merged_transcripts, summary) = calculateExonIntrons_by_chrom(
             entrez_genes_by_chrom, chrombed, fragment_size, totalcount,
             out_file)
        all_reads_on_shared_exons.update(reads_on_shared_exons)
        all_reads_on_shared_introns.update(reads_on_shared_introns)
        all_reads_on_merged_transcripts.update(reads_on_merged_transcripts)
        all_summary.update(summary)

    SeparateByChrom.cleanup(chroms, extension)
    # Return the accumulated summary. The per-chromosome `summary` only holds
    # the last chromosome and is undefined when no chromosome was processed.
    return (all_reads_on_shared_exons, all_reads_on_shared_introns,
            all_reads_on_merged_transcripts, all_summary)
def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold,
                       PAfile, extension, index):
    """
    entrez genes are made sure to be on one strand, the bed file are reads for that strand
    entrez_genes is a KnownEntrezGenes class object
    The raw read file needs to conform to bed format

    bedfile: path of the read file; it is split into per-chromosome files
        when those do not exist yet.
    chroms: list of chromosome names to process.
    outfile: path of the output file. It is opened in append mode, so
        results are added to an existing file instead of replacing it.
    threshold: passed through unchanged to CUTR_vs_AUTR.
    PAfile: tab-separated polyadenylation-site file; the first two columns
        of every line are collected as a tuple and handed to
        Mastertuplemaker.
    extension: passed through unchanged to Mastertuplemaker.
    index: column in bed file for sorting (passed as str(index) to
        Utility_extended.separate_by_chrom_sort).

    One line is written per gene whose true3UTRends (from Mastertuplemaker)
    has more than one entry:
        entrez_id, chrom, strand, basic_RUD, comma-joined RUDs

    Exits the process with status 1 when bedfile cannot be found.
    """
    # Separate reads by chrom
    rawreadslibName1 = bedfile.split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
        # Separate by chrom and sort by start
        print("%s %s %s" % (
            chroms, rawreadsextension1,
            " files do not exist, separate by chroms and sort each file according to the second column. "
        ))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                rawreadsextension1, str(index))

    # This part is to access the polyadenylation sites
    PAsiteslist = []
    with open(PAfile, 'r') as pa_handle:
        for pa_line in pa_handle:
            pa_fields = pa_line.strip('\n').split('\t')
            if len(pa_fields) < 2:
                # Blank or truncated line (e.g. a trailing empty line):
                # there is no second column to read, so skip it.
                continue
            PAsiteslist.append((pa_fields[0], pa_fields[1]))

    # Here the output is 'a', i.e. the output is appended to an existing file
    # instead of creating one
    with open(outfile, 'a') as outf:
        for chrom in chroms:
            if chrom not in entrez_genes.chroms:
                continue
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            chrom_read_file = chrom + rawreadsextension1
            if not Utility_extended.fileExists(chrom_read_file):
                # No reads for this chromosome: nothing to report.
                continue

            # Get the read locations.
            # make sure the extension is always 0, otherwise the rest of the
            # program might not work as intended
            with open(chrom_read_file, 'r') as f:
                tag_positions = [
                    associate_tags_with_regions.tag_position(
                        line.strip().split(), 0) for line in f
                ]
            if not Utility_extended.is_list_sorted(tag_positions):
                tag_positions.sort()
            # By this point tag_positions is a sorted list of all the reads
            # in this chromosome's read file

            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                # an EntrezGene class object
                gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
                # get_3UTRs gets the ENTREZ 3'UTR, which appears to generally
                # give the beginning of the 3'UTR and a site very close to
                # the most distal polyadenylation site
                three_UTRs = gene.get_3UTRs()
                # Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites
                # given to create the true data for the 3'UTR needed for
                # CUTR_vs_AUTR to work
                (true3UTRstarts, true3UTRends, UTRregion_start, UTRregion_end,
                 UTRbeginning) = Mastertuplemaker(three_UTRs, PAsiteslist,
                                                  chrom, gene.strand,
                                                  extension)
                # only 3'UTR with more than 1 polyA site need be considered
                if len(true3UTRends) > 1:
                    # find all reads inside the 3'UTR
                    inside_reads = associate_tags_with_3UTR(
                        tag_positions, UTRregion_start, UTRregion_end)
                    # finds reads in each region of the 3'UTR and calculates
                    # aUTR/cUTR for each of them
                    # PolyAsites potentially useful for output
                    RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(
                        true3UTRstarts, true3UTRends, inside_reads,
                        gene.strand, threshold)

                    # important if one wants to output gene_symbol
                    # information (not part of the current output line)
                    gene_symbol = []
                    for mytranscript in gene.transcripts:
                        symbol = mytranscript.additional_annotations[0]
                        if symbol not in gene_symbol:
                            gene_symbol.append(symbol)

                    # outline to use to output RUDs
                    outline = "\t".join([
                        str(entrez_id), chrom, gene.strand,
                        str(basic_RUD), ",".join(map(str, RUDs))
                    ]) + "\n"
                    # outline to use to output polyA information for a species
                    #outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(UTRbeginning) + "\t" + ",".join(map(str, PolyAsites)) + "\n"
                    outf.write(outline)
def getReadCount(KnownGenes, bedfile, chroms, fragment_size, region_type,
                 upstream_extension, downstream_extension, totalcount,
                 out_file):
    """
    Known genes are made sure to be on one strand, and the bed file are reads for that strand
    The raw read file needs to conform to bed format

    KnownGenes: gene collection; this function uses its keys() (chromosome
        names) and the region getter selected by region_type.
    bedfile: path of the read file; it is split into per-chromosome files
        (sorted on column 2) when those do not exist yet.
    chroms: list of chromosome names to process.
    fragment_size: passed to associate_tags_with_regions.tag_position for
        every read.
    region_type: one of 'Promoter', 'GeneBody', 'ExtendedGeneBody',
        'PromoterGenebody', 'GeneEnd', 'ExonicRegion', 'IntronicRegion',
        '5UTR', '3UTR'.
    upstream_extension, downstream_extension: handed to the region getter
        where that getter takes them.
    totalcount: total read count used to normalise the RPKM.
    out_file: path of the output file, opened in append mode; one line
        "name<TAB>rc<TAB>total_length<TAB>RPKM" is written per region.

    Return: dictionary keyed by region name, valued by (rc, length, rpkm).

    Exits the process with status 1 when bedfile cannot be found or
    region_type is not recognized.
    """
    ReadCount = {}  # keyed by name, valued by (rc, length, rpkm)

    # Separate by chrom reads
    rawreadslibName1 = bedfile.split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
        # Separate by chrom and sort by start
        print("%s %s %s" % (
            chroms, rawreadsextension1,
            " files do not exist, separate by chroms and sort each file according to the second column. "
        ))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                rawreadsextension1, [2])

    # dictionary has chrom as key and ucsc_lite object
    # (name, chrom, strand, txStart, txEnd) as values
    if region_type == 'Promoter':
        region_dic = KnownGenes.getPromoters(upstream_extension,
                                             downstream_extension)
    elif region_type == 'GeneBody':
        region_dic = KnownGenes.getGenebodys(downstream_extension)
    elif region_type == 'ExtendedGeneBody':
        region_dic = KnownGenes.getExtendedGenebodys(upstream_extension,
                                                     downstream_extension)
    elif region_type == 'PromoterGenebody':
        region_dic = KnownGenes.getPromotergenebodys(upstream_extension)
    elif region_type == 'GeneEnd':
        region_dic = KnownGenes.getGeneEnds(upstream_extension,
                                            downstream_extension)
    elif region_type == 'ExonicRegion':
        region_dic = KnownGenes.getExons()
    elif region_type == 'IntronicRegion':
        region_dic = KnownGenes.getIntrons()
    elif region_type == '5UTR':
        region_dic = KnownGenes.get5UTRs(upstream_extension,
                                         downstream_extension)
    elif region_type == '3UTR':
        region_dic = KnownGenes.get3UTRs(upstream_extension,
                                         downstream_extension)
    else:
        print("%s %s" % (region_type, "is not recognized"))
        # sys.exit, not the interactive-only `exit` helper, to match the
        # other error path of this function.
        sys.exit(1)

    with open(out_file, 'a') as outf:
        for chrom in chroms:
            chrombed = chrom + rawreadsextension1
            if not (Utility_extended.fileExists(chrombed)
                    and (chrom in KnownGenes.keys())):
                continue

            # Load every non-comment read of this chromosome.
            tag_position_list = []
            with open(chrombed, 'r') as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    sline = line.strip().split()
                    tag_position_list.append(
                        associate_tags_with_regions.tag_position(
                            sline, fragment_size))
            if Utility_extended.is_list_sorted(tag_position_list) != 1:
                tag_position_list.sort()

            for region in region_dic[chrom]:
                thisregion = [(region.txStart, region.txEnd)]
                (total_length, rc) = get_read_count_on_regions(
                    thisregion, tag_position_list)
                if total_length > 0:
                    # reads per 1000 bases of region per 1000000 total reads
                    RPKM = rc * (1000.0 / total_length) * (
                        1000000 / float(totalcount))
                else:
                    # a zero-length region is not expected to collect reads
                    assert rc < 0.01
                    RPKM = 0
                outline = str(region.name) + '\t' + str(rc) + '\t' + str(
                    total_length) + '\t' + str(RPKM) + '\n'
                outf.write(outline)
                ReadCount[region.name] = (rc, total_length, RPKM)

    #SeparateByChrom.cleanup(chroms, rawreadsextension1)
    return ReadCount
def get_read_count_on_onic_transcript(KnownGenes, bedfile, chroms,
                                      fragment_size, region_type, totalcount,
                                      out_file):
    """
    Count the reads on the exons ("ExonicTranscript") or introns
    ("IntronicTranscript") of every gene.

    KnownGenes: gene collection; this function uses its keys() (chromosome
        names) and KnownGenes[chrom], a sequence of gene objects providing
        `name`, getExons() and getIntrons().
    bedfile: path of the read file; it is split into per-chromosome files
        (sorted on column 2) when those do not exist yet.
    chroms: list of chromosome names to process.
    fragment_size: passed to associate_tags_with_regions.tag_position for
        every read.
    region_type: "ExonicTranscript" or "IntronicTranscript".
    totalcount: total read count used to normalise the RPKM.
    out_file: path of the output file, opened in append mode; one line
        "name<TAB>rc<TAB>total_length<TAB>RPKM" is written per gene.

    Return: a dictionary keyed by geneName valued by
        (TotalReadCount, TotalLength, RPKM)

    Exits the process with status 1 when bedfile cannot be found or
    region_type is not recognized.
    """
    ReadCount = {}  # keyed by name, valued by (rc, length, rpkm)

    # Separate by chrom reads
    rawreadslibName1 = bedfile.split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
        # Separate by chrom and sort by start
        print("%s %s %s" % (
            chroms, rawreadsextension1,
            " files do not exist, separate by chroms and sort each file according to the second column. "
        ))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                rawreadsextension1,
                                                [2])  # sort by start

    with open(out_file, 'a') as outf:
        for chrom in chroms:
            chrombed = chrom + rawreadsextension1
            if not (Utility_extended.fileExists(chrombed)
                    and (chrom in KnownGenes.keys())):
                continue

            # Load every non-comment read of this chromosome.
            tag_position_list = []
            with open(chrombed, 'r') as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    sline = line.strip().split()
                    tag_position_list.append(
                        associate_tags_with_regions.tag_position(
                            sline, fragment_size))
            if Utility_extended.is_list_sorted(tag_position_list) != 1:
                tag_position_list.sort()

            for gene in KnownGenes[chrom]:
                if region_type == "ExonicTranscript":
                    ons = gene.getExons()
                elif region_type == "IntronicTranscript":
                    ons = gene.getIntrons()
                else:
                    print("%s %s" % (region_type, "is not recognized."))
                    sys.exit(1)

                # Was `len(ons > 0)`: the comparison must be applied to the
                # length, not to the list itself.
                if len(ons) > 0:
                    (total_length, rc) = get_read_count_on_regions(
                        ons, tag_position_list)
                else:
                    total_length = 0
                    rc = 0
                if total_length > 0:
                    # reads per 1000 bases per 1000000 total reads
                    RPKM = rc * (1000.0 / total_length) * (
                        1000000 / float(totalcount))
                else:
                    # nothing to normalise by; avoids a division by zero
                    RPKM = 0

                outline = str(gene.name) + '\t' + str(rc) + '\t' + str(
                    total_length) + '\t' + str(RPKM) + '\n'
                outf.write(outline)
                # Was keyed by `region.name`, but no `region` exists in this
                # function; the record belongs to the current gene.
                ReadCount[gene.name] = (rc, total_length, RPKM)

    #SeparateByChrom.cleanup(chroms, rawreadsextension1)
    return ReadCount
def main(argv):
    """
    Command-line driver: flag every island of the bed file that
    assign_islands_to_REs assigns to at least one RE of the RE tree, and
    print the island / RE-island counts per chromosome and in total.

    argv: command-line argument list handed to OptionParser. Fewer than 12
        entries prints the help text and exits with status 1.

    Exits with status 1 as well when the species is not recognized or the
    bed file cannot be found. The per-chromosome files created from the bed
    file are removed at the end.
    """
    parser = OptionParser()
    parser.add_option("-b", "--bedfile", action="store", type="string",
                      dest="bedfile", metavar="<file>", help="island bed file")
    parser.add_option("-t", "--RE_tree_pickle_file", action="store",
                      type="string", dest="RE_Tree", metavar="<file>",
                      help="file with RE tree in pickle format")
    parser.add_option(
        "-l", "--RE_annotation_file_location", action="store", type="string",
        dest="RE_file_location", metavar="<file>",
        help="location of RE files named in repClass_repFamily_repName.txt")
    parser.add_option("-u", "--upstream_extension", action="store",
                      type="int", dest="upstream_extension",
                      help="upstream extension from start", metavar="<int>")
    parser.add_option("-d", "--downstream_extension", action="store",
                      type="int", dest="downstream_extension",
                      help="downstream extension from end", metavar="<int>")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # Separate_by_chrom on bedfile
    lib_name = (opt.bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(opt.bedfile):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)
    else:
        # Was `print bedfile, ...`: no `bedfile` name exists in this
        # function, so the error path itself raised NameError.
        print("%s %s" % (opt.bedfile, " is not found"))
        sys.exit(1)

    print("\nLoad the RE tree to get the RE file names")
    # NOTE: pickle.load can execute arbitrary code; only use RE tree files
    # from a trusted source.
    with open(opt.RE_Tree, 'rb') as re_tree_file:
        re_tree = pickle.load(re_tree_file)
    (numb_classes, numb_families,
     numb_names) = get_read_count_on_REs.numbers(re_tree)
    print("There are %d classes, %d family, and %d names." %
          (numb_classes, numb_families, numb_names))

    total_num_islands = 0
    total_num_RE_islands = 0
    min_re_length = 10

    # cycle through chrom
    for chrom in chroms:
        # Get the islands
        island_list = []
        print(chrom)
        chrom_length = chrom_lengths[chrom]
        chrombed = chrom + extension
        if Utility_extended.fileExists(chrombed):
            # load in each island
            with open(chrombed, 'r') as inf:
                for line in inf:
                    if line.startswith("#"):
                        continue
                    sline = line.strip().split()
                    island_list.append((int(sline[1]), int(sline[2])))
            if Utility_extended.is_tuplelist_sorted(island_list, 0) != 1:
                # sort by start, assume non-overlapping
                # (was `key = itemgetter[0]`, which subscripts the function
                # instead of calling it and raises TypeError)
                island_list.sort(key=lambda island: island[0])
        else:
            print("%s can not be found" % chrombed)

        # island_flags[i] becomes 1 once island i is assigned to any RE
        island_flags = [0 for island in island_list]
        for reClass in re_tree.keys():
            for reFamily in re_tree[reClass].keys():
                for reName in re_tree[reClass][reFamily]:
                    re_file_name = "_".join([reClass, reFamily, reName
                                             ]) + ".txt"
                    this_island_flags = assign_islands_to_REs(
                        opt.RE_file_location, re_file_name, chrom,
                        chrom_length, island_list, opt.upstream_extension,
                        opt.downstream_extension, min_re_length)
                    # Collect the results into island_flags
                    for i, flag in enumerate(this_island_flags):
                        if flag == 1:
                            island_flags[i] = 1

        print("There are %d island on %s" % (len(island_list), chrom))
        print("There are %d RE islands" % (sum(island_flags)))
        total_num_islands += len(island_list)
        total_num_RE_islands += sum(island_flags)

    SeparateByChrom.cleanup(chroms, extension)
    print("There are %d islands" % (total_num_islands))
    print("There are %d RE islands" % (total_num_RE_islands))
def main(argv):
    """
    Command-line driver: collect, for every RE named in the RE tree, the
    read-count dictionaries returned by get_read_count on each chromosome,
    then write them out through breakdown_and_output.

    argv: command-line argument list handed to OptionParser. Fewer than 16
        entries prints the help text and exits with status 1.

    Exits with status 1 as well when the species is not recognized or the
    bed file cannot be found. The per-chromosome files created from the bed
    file are removed at the end, and the elapsed time is printed.
    """
    parser = OptionParser()
    parser.add_option("-b", "--bedfile", action="store", type="string",
                      dest="bedfile", metavar="<file>",
                      help="ChIP seq read file")
    parser.add_option(
        "-f", "--fragment_size", action="store", type="int",
        dest="fragment_size",
        help="fragment_size determins the shift (half of fragment_size of ChIP-seq read position, in bps",
        metavar="<int>")
    parser.add_option("-t", "--RE_tree_pickle_file", action="store",
                      type="string", dest="RE_Tree", metavar="<file>",
                      help="file with RE tree in pickle format")
    parser.add_option(
        "-l", "--RE_annotation_file_location", action="store", type="string",
        dest="RE_file_location", metavar="<file>",
        help="location of RE files named in repClass_repFamily_repName.txt")
    parser.add_option("-u", "--upstream_extension", action="store",
                      type="int", dest="upstream_extension",
                      help="upstream extension from start", metavar="<int>")
    parser.add_option("-d", "--downstream_extension", action="store",
                      type="int", dest="downstream_extension",
                      help="downstream extension from end", metavar="<int>")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-n", "--feature_name", action="store", type="string",
                      dest="feature_name", help="name of the library",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 16:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        chrom_lengths = GenomeData.species_chrom_lengths[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    total_count = get_total_tag_counts.get_total_tag_counts(opt.bedfile)

    # Separate_by_chrom on bedfile
    lib_name = (opt.bedfile).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(opt.bedfile):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            SeparateByChrom.separateByChrom(chroms, opt.bedfile, extension)
    else:
        # Was `print bedfile, ...`: no `bedfile` name exists in this
        # function, so the error path itself raised NameError.
        print("%s %s" % (opt.bedfile, " is not found"))
        sys.exit(1)

    # load the RE tree to get the RE file names
    # NOTE: pickle.load can execute arbitrary code; only use RE tree files
    # from a trusted source.
    with open(opt.RE_Tree, 'rb') as re_tree_file:
        re_tree = pickle.load(re_tree_file)
    (numb_classes, numb_families, numb_names) = numbers(re_tree)
    print("There are %d classes, %d family, and %d names." %
          (numb_classes, numb_families, numb_names))

    # Prepare the summary
    # {reClass:{reFamily:{reName:{id:feature_name, value}}}}
    read_counts = {}
    for reClass in re_tree.keys():
        read_counts[reClass] = {}
        for reFamily in re_tree[reClass].keys():
            read_counts[reClass][reFamily] = {}
            for reName in re_tree[reClass][reFamily]:
                read_counts[reClass][reFamily][reName] = {}

    min_re_length = 10

    # cycle through chrom
    for chrom in chroms:
        print(chrom)
        chrom_length = chrom_lengths[chrom]
        chrombed = chrom + extension
        if not Utility_extended.fileExists(chrombed):
            continue

        # load in each read and shift
        tag_position_list = []
        with open(chrombed, 'r') as inf:
            for line in inf:
                if line.startswith("#"):
                    continue
                sline = line.strip().split()
                tag_position_list.append(
                    associate_tags_with_regions.tag_position(
                        sline, opt.fragment_size))
        if not Utility_extended.is_list_sorted(tag_position_list):
            tag_position_list.sort()  # [tag_positions]

        for reClass in re_tree.keys():
            for reFamily in re_tree[reClass].keys():
                for reName in re_tree[reClass][reFamily]:
                    re_file_name = "_".join([reClass, reFamily, reName
                                             ]) + ".txt"
                    # {id:{feature_name:value}}
                    rc_dic = get_read_count(
                        opt.RE_file_location, re_file_name, opt.feature_name,
                        chrom, chrom_length, tag_position_list, total_count,
                        opt.upstream_extension, opt.downstream_extension,
                        min_re_length)
                    # id is unique and updated only once, so this should be ok
                    read_counts[reClass][reFamily][reName].update(rc_dic)

    # feature_name include: feature_name + "_rc", feature_name + "_rpkm"
    # instead of outputing a huge one, let's output many small pieces
    breakdown_and_output(read_counts, opt.feature_name)

    # Spot-check output for one hard-coded RE.
    repClass = 'LTR'
    repFamily = 'ERV1'
    repName = 'RLTR4_Mm'
    outfile_name = lib_name + "_on_" + "_".join(
        [repClass, repFamily, repName]) + ".dat"
    test(read_counts, repClass, repFamily, repName, outfile_name)

    SeparateByChrom.cleanup(chroms, extension)

    print("%s %s %s" % ("it took", time.time() - startTime, "seconds."))
def main(argv):
    """
    Command-line driver: assign the Alu elements of the annotation file to
    shared introns, shared exons and merged transcripts of the curated
    entrez genes, and pickle the results.

    argv: command-line argument list handed to OptionParser. Fewer than 8
        entries prints the help text and exits with status 1.

    Five pickle files are written, each named opt.outfile + suffix:
        _Alu_distribution_in_shared_intron.pkl      {entrezID:[(region=(start, end), Alu_count)]}
        _Alus_in_shared_intron.pkl                  {entrezID:[(region, Alu_positions)]}
        _Alus_in_shared_exon.pkl                    {entrezID:[(region, Alu_positions)]}
        _Alu_distribution_in_merged_transcript.pkl  {entrezID:[(region, Alu_count)]}
        _Alus_in_merged_transcript.pkl              {entrezID:[(region, Alu_positions)]}

    Exits with status 1 as well when the species is not recognized or the
    Alu annotation file cannot be found.
    """
    parser = OptionParser()
    parser.add_option(
        "-a", "--AluElementsFile", action="store", type="string", dest="Alus",
        help="input Alu annotation file for non-strand specific analysis",
        metavar="<file>")
    parser.add_option(
        "-u", "--entrez_genes_file", action="store", type="string",
        dest="entrez_collection", metavar="<file>",
        help="file with curated known genes clustered by entrez ID in pickle format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="outfile", metavar="<file>",
                      help="output file name for genes and tag numbers")
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")

    (opt, args) = parser.parse_args(argv)
    if len(argv) < 8:
        parser.print_help()
        sys.exit(1)

    startTime = time.time()

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print("This species is not recognized, exiting")
        sys.exit(1)

    # entrez_collection is a dic (keyed by entrez_id) of lists of EntrezGene object
    # NOTE: pickle.load can execute arbitrary code; only use annotation files
    # from a trusted source.
    with open(opt.entrez_collection, 'rb') as annotation:
        temp = pickle.load(annotation)
        my_entrez_genes = Entrez.KnownEntrezGenes(chroms, temp)

    lib_name = (opt.Alus).split('/')[-1]  # remove directory
    suffix = lib_name.split('.')[-1]  # txt
    lib_name = lib_name.split('.')[0]
    extension = "-" + lib_name + '.' + suffix + "1"
    if Utility_extended.fileExists(opt.Alus):
        if Utility_extended.chrom_files_exist(chroms, extension) != 1:
            # Separate by chrom and sort by start
            print("%s %s %s" % (chroms, extension,
                                " files do not exist, separate by chroms. "))
            SeparateByChrom.separateByChrom(chroms, opt.Alus, extension)
    else:
        print("%s %s" % (opt.Alus, " is not found"))
        sys.exit(1)

    Alus_in_shared_intron = {}
    Alus_in_shared_exon = {}
    Alus_in_merged_transcript = {}

    for chrom in chroms:
        (shared_intron_Alus, shared_exon_Alus,
         merged_transcript_Alus) = assign_AluElements_to_intronexons_by_chrom(
             my_entrez_genes, chrom + extension, chrom)
        # Spot check on the first chromosome. Skipped when that chromosome
        # produced no entries, which previously raised IndexError.
        if chrom == chroms[0] and shared_intron_Alus:
            myid = next(iter(shared_intron_Alus))
            test(my_entrez_genes, shared_intron_Alus, myid)
        Alus_in_shared_intron.update(shared_intron_Alus)
        Alus_in_shared_exon.update(shared_exon_Alus)
        Alus_in_merged_transcript.update(merged_transcript_Alus)

    def dump_pickle(name_suffix, payload, gene_count):
        # Pickle payload to opt.outfile + name_suffix and report gene_count.
        outname = opt.outfile + name_suffix
        with open(outname, 'wb') as output:
            pickle.dump(payload, output)
        print("The number of genes output to %s is %d " %
              (outname, gene_count))

    # {entrezID:[(region=(start, end), Alu_count)]}
    Alus_in_shared_intron_dist = {}
    for myid in Alus_in_shared_intron.keys():
        Alus_in_shared_intron_dist[myid] = [
            (region_coord, len(Alu_positions))
            for (region_coord, Alu_positions) in Alus_in_shared_intron[myid]
        ]
    dump_pickle("_Alu_distribution_in_shared_intron.pkl",
                Alus_in_shared_intron_dist,
                len(Alus_in_shared_intron.keys()))

    # {entrezID:[(region, Alu_positions)]}
    dump_pickle("_Alus_in_shared_intron.pkl", Alus_in_shared_intron,
                len(Alus_in_shared_intron.keys()))

    # {entrezID:[(region, Alu_positions)]}
    dump_pickle("_Alus_in_shared_exon.pkl", Alus_in_shared_exon,
                len(Alus_in_shared_exon.keys()))

    # Though in this case the structure can be simpler:
    # {entrezID:(region, Alu_count)}, it is better to make the interface
    # uniform. {entrezID:[(region, Alu_count)]}
    Alus_in_merged_transcript_dist = {}
    for myid in Alus_in_merged_transcript.keys():
        assert len(Alus_in_merged_transcript[myid]) == 1
        region_coord, Alu_positions = (Alus_in_merged_transcript[myid])[0]
        Alus_in_merged_transcript_dist[myid] = [(region_coord,
                                                 len(Alu_positions))]
    dump_pickle("_Alu_distribution_in_merged_transcript.pkl",
                Alus_in_merged_transcript_dist,
                len(Alus_in_merged_transcript.keys()))

    # {entrezID:[(region, Alu_positions)]}
    dump_pickle("_Alus_in_merged_transcript.pkl", Alus_in_merged_transcript,
                len(Alus_in_merged_transcript.keys()))

    print("%s %s %s" % ("it took", time.time() - startTime, "seconds."))
def Calculate3UTRUsage(entrez_genes, bedfile, column_index, chroms,
                       fragment_size, downstream_extension, outfile):
    """
    entrez genes are made sure to be on one strand, the bed file are reads for that strand
    entrez_genes is a KnownEntrezGenes class object
    The raw read file needs to conform to bed format

    bedfile: path of the read file; it is split into per-chromosome files
        when those do not exist yet.
    column_index: column in bed file for sorting
    chroms: list of chromosome names to process.
    fragment_size: passed to associate_tags_with_regions.tag_position for
        every read.
    downstream_extension: passed to gene.get_3UTRs.
    outfile: path of the output file, opened in append mode.

    One tab-separated line is written per gene whose 3'UTRs merge into a
    single region:
        entrez_id, union length, RUD, read count inside the union,
        comma-joined transcript names, comma-joined gene symbols
    Genes with disjoint 3'UTRs, or with no 3'UTR at all, are reported on
    stdout and skipped.

    Exits the process with status 1 when bedfile cannot be found.
    """
    # Separate reads by chrom
    rawreadslibName1 = bedfile.split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
        # Separate by chrom and sort by start
        print("%s %s %s" % (
            chroms, rawreadsextension1,
            " files do not exist, separate by chroms and sort each file according to the second column. "
        ))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                rawreadsextension1,
                                                [column_index])

    # Here the output is 'a'
    with open(outfile, 'a') as outf:
        for chrom in chroms:
            if chrom not in entrez_genes.chroms:
                continue
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            chrom_read_file = chrom + rawreadsextension1
            if not Utility_extended.fileExists(chrom_read_file):
                # No reads for this chromosome: nothing to report.
                continue

            # Get the read locations
            with open(chrom_read_file, 'r') as f:
                tag_positions = [
                    associate_tags_with_regions.tag_position(
                        line.strip().split(), fragment_size) for line in f
                ]
            if not Utility_extended.is_list_sorted(tag_positions):
                tag_positions.sort()

            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                # an EntrezGene class object
                gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
                three_UTRs = gene.get_3UTRs(downstream_extension)
                print(three_UTRs)
                # Find the union of 3UTRs [(start, end)], returns a [(start,end)]
                union = Utility_extended.union(three_UTRs)
                if len(union) > 1:
                    print("There are disjoint 3UTRs in %s" % (str(entrez_id)))
                    continue
                if len(union) == 0:
                    # Without a 3'UTR region union[0] below would raise
                    # IndexError; report and skip like the disjoint case.
                    print("There is no 3UTR in %s" % (str(entrez_id)))
                    continue

                # returns [((start, end), [tag_positions])], [tag_positions] = return[0][1]
                inside_reads = (
                    Utility_extended.associate_simple_tags_with_regions(
                        tag_positions, union))[0][1]
                total_read_count = len(inside_reads)
                RUD = CUTR_vs_AUTR(three_UTRs, inside_reads, gene.strand)

                # Distinct gene symbols, in transcript order.
                gene_symbol = []
                for mytranscript in gene.transcripts:
                    symbol = mytranscript.additional_annotations[0]
                    if symbol not in gene_symbol:
                        gene_symbol.append(symbol)

                union_length = union[0][1] - union[0][0] + 1
                transcript_names = ','.join(
                    [transcript.name for transcript in gene.transcripts])
                outline = "\t".join([
                    str(entrez_id), str(union_length), str(RUD),
                    str(total_read_count), transcript_names,
                    ','.join(gene_symbol)
                ]) + "\n"
                outf.write(outline)
def calculateExonIntrons(entrez_genes, bedfile, column_index, chroms,
                         fragment_size, totalcount, out_file=None):
    """
    Run calculateExonIntrons_by_chrom on every chromosome that carries genes
    and merge the per-chromosome dictionaries.

    entrez_genes is a EntrezGenes class object. The core,
    entrez_genes.entrez_genes, is a dic (keyed by entrez_id) of lists of
    EntrezGene object.

    The bed file is split into per-chromosome files (sorted on column_index)
    when those are missing; they are cleaned up before returning. The
    process exits with status 1 when the bed file cannot be found.

    return: a 4-tuple of dictionaries
        all_reads_on_shared_exons        {entrezID:[((start, end), read_count)]}
        all_reads_on_shared_introns      {entrezID:[((start, end), read_count)]}
        all_reads_on_merged_transcripts  {entrezID:[((start, end), read_count)]}
        all_summary                      {entrezID:{attribute:value}}
    The attributes of all_summary[entrez_id] are:
        "merged_exons_rc", "merged_exon_RPKM", "merged_exons_total_length",
        "shared_exons_rc", "shared_exon_RPKM", "shared_exons_total_length",
        "shared_introns_rc", "shared_intron_RPKM", "shared_introns_total_length",
        "merged_transcript_rc", "merged_transcript_RPKM", "merged_transcript_length"
    """
    base_name = bedfile.split('/')[-1]  # drop the directory part
    name_parts = base_name.split('.')
    extension = "-" + name_parts[0] + '.' + name_parts[-1] + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, extension) != 1:
        # Per-chromosome files are missing: split and sort by start
        print("%s %s %s" % (chroms, extension,
                            " files do not exist, separate by chroms. "))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile, extension,
                                                [column_index])

    exon_reads = {}  # {entrezID:[((start, end), read_count)]}
    intron_reads = {}  # {entrezID:[((start, end), read_count)]}
    transcript_reads = {}  # {entrezID:[((start, end), read_count)]}
    summaries = {}  # {entrezID:{attributes}}

    for chrom in chroms:
        if chrom not in entrez_genes.chroms:
            continue
        genes_on_chrom = Entrez.KnownEntrezGenes(
            [chrom], entrez_genes.subset_by_chrom(chrom))
        (chrom_exon_reads, chrom_intron_reads, chrom_transcript_reads,
         chrom_summary) = calculateExonIntrons_by_chrom(
             genes_on_chrom, chrom + extension, fragment_size, totalcount,
             out_file)
        exon_reads.update(chrom_exon_reads)
        intron_reads.update(chrom_intron_reads)
        transcript_reads.update(chrom_transcript_reads)
        summaries.update(chrom_summary)

    print(len(summaries.keys()))
    SeparateByChrom.cleanup(chroms, extension)
    return (exon_reads, intron_reads, transcript_reads, summaries)
def Calculate3UTRUsage(entrez_genes, bedfile, chroms, outfile, threshold,
                       PAfile, extension, index):
    """
    entrez genes are made sure to be on one strand, the bed file are reads for that strand
    entrez_genes is a KnownEntrezGenes class object
    The raw read file needs to conform to bed format

    bedfile: path of the read file; it is split into per-chromosome files
        when those do not exist yet.
    chroms: list of chromosome names to process.
    outfile: path of the output file. It is opened in append mode, so
        results are added to an existing file instead of replacing it.
    threshold: passed through unchanged to CUTR_vs_AUTR.
    PAfile: tab-separated polyadenylation-site file; the first two columns
        of every line are collected as a tuple and handed to
        Mastertuplemaker.
    extension: passed through unchanged to Mastertuplemaker.
    index: column in bed file for sorting (passed as str(index) to
        Utility_extended.separate_by_chrom_sort).

    One line is written per gene whose true3UTRends (from Mastertuplemaker)
    has more than one entry:
        entrez_id, chrom, strand, basic_RUD, comma-joined RUDs

    Exits the process with status 1 when bedfile cannot be found.
    """
    # Separate reads by chrom
    rawreadslibName1 = bedfile.split('/')[-1]
    rawreadssuffix1 = rawreadslibName1.split('.')[-1]
    rawreadslibName1 = rawreadslibName1.split('.')[0]
    rawreadsextension1 = "-" + rawreadslibName1 + '.' + rawreadssuffix1 + "1"

    if not Utility_extended.fileExists(bedfile):
        print("%s %s" % (bedfile, " is not found"))
        sys.exit(1)
    if Utility_extended.chrom_files_exist(chroms, rawreadsextension1) != 1:
        # Separate by chrom and sort by start
        print("%s %s %s" % (
            chroms, rawreadsextension1,
            " files do not exist, separate by chroms and sort each file according to the second column. "
        ))
        Utility_extended.separate_by_chrom_sort(chroms, bedfile,
                                                rawreadsextension1, str(index))

    # This part is to access the polyadenylation sites
    PAsiteslist = []
    with open(PAfile, 'r') as pa_handle:
        for pa_line in pa_handle:
            pa_fields = pa_line.strip('\n').split('\t')
            if len(pa_fields) < 2:
                # Blank or truncated line (e.g. a trailing empty line):
                # there is no second column to read, so skip it.
                continue
            PAsiteslist.append((pa_fields[0], pa_fields[1]))

    # Here the output is 'a', i.e. the output is appended to an existing file
    # instead of creating one
    with open(outfile, 'a') as outf:
        for chrom in chroms:
            if chrom not in entrez_genes.chroms:
                continue
            # a KnownEntrezGenes object
            entrez_genes_by_chrom = Entrez.KnownEntrezGenes(
                [chrom], entrez_genes.subset_by_chrom(chrom))
            chrom_read_file = chrom + rawreadsextension1
            if not Utility_extended.fileExists(chrom_read_file):
                # No reads for this chromosome: nothing to report.
                continue

            # Get the read locations.
            # make sure the extension is always 0, otherwise the rest of the
            # program might not work as intended
            with open(chrom_read_file, 'r') as f:
                tag_positions = [
                    associate_tags_with_regions.tag_position(
                        line.strip().split(), 0) for line in f
                ]
            if not Utility_extended.is_list_sorted(tag_positions):
                tag_positions.sort()
            # By this point tag_positions is a sorted list of all the reads
            # in this chromosome's read file

            for entrez_id in entrez_genes_by_chrom.entrez_ids:
                # an EntrezGene class object
                gene = entrez_genes_by_chrom.entrez_genes[entrez_id]
                # get_3UTRs gets the ENTREZ 3'UTR, which appears to generally
                # give the beginning of the 3'UTR and a site very close to
                # the most distal polyadenylation site
                three_UTRs = gene.get_3UTRs()
                # Mastertuplemaker uses the ENTREZ 3'UTR and the polyA sites
                # given to create the true data for the 3'UTR needed for
                # CUTR_vs_AUTR to work
                (true3UTRstarts, true3UTRends, UTRregion_start, UTRregion_end,
                 UTRbeginning) = Mastertuplemaker(three_UTRs, PAsiteslist,
                                                  chrom, gene.strand,
                                                  extension)
                # only 3'UTR with more than 1 polyA site need be considered
                if len(true3UTRends) > 1:
                    # find all reads inside the 3'UTR
                    inside_reads = associate_tags_with_3UTR(
                        tag_positions, UTRregion_start, UTRregion_end)
                    # finds reads in each region of the 3'UTR and calculates
                    # aUTR/cUTR for each of them
                    # PolyAsites potentially useful for output
                    RUDs, basic_RUD, PolyAsites = CUTR_vs_AUTR(
                        true3UTRstarts, true3UTRends, inside_reads,
                        gene.strand, threshold)

                    # important if one wants to output gene_symbol
                    # information (not part of the current output line)
                    gene_symbol = []
                    for mytranscript in gene.transcripts:
                        symbol = mytranscript.additional_annotations[0]
                        if symbol not in gene_symbol:
                            gene_symbol.append(symbol)

                    # outline to use to output RUDs
                    outline = "\t".join([
                        str(entrez_id), chrom, gene.strand,
                        str(basic_RUD), ",".join(map(str, RUDs))
                    ]) + "\n"
                    # outline to use to output polyA information for a species
                    #outline = str(entrez_id) + "\t" + chrom + "\t" + gene.strand + "\t" + str(UTRbeginning) + "\t" + ",".join(map(str, PolyAsites)) + "\n"
                    outf.write(outline)