def combine_proximal_islands(islands, gap, window_size_buffer=3):
    """
    Merge islands that lie close enough to each other into single regions.

    islands: a list of BED_GRAPH objects (chrom, start, end, value). Only the
        `start`, `end` and `value` attributes are used here. If
        Utility.is_bed_sorted reports the list as unsorted, it is sorted in
        place by `start`.
    gap: the largest gap that may be bridged. If gap is not allowed, gap = 0;
        if one window is allowed, gap = window_size (e.g. 200).
    window_size_buffer: slack added to `gap`; two islands are merged when
        next.start - current.end <= gap + window_size_buffer.

    Return a new list of combined regions. The regions are the input objects
    themselves: the first island of every merged group has its `end`
    extended and its `value` increased by the values of the islands it
    absorbed.
    """
    proximal_island_dist = gap + window_size_buffer
    Final_islands = []
    if len(islands) > 0:
        if not Utility.is_bed_sorted(islands):
            islands.sort(key=operator.attrgetter('start'))
        current_island = islands[0]
        for island in islands[1:]:
            dist = island.start - current_island.end
            if dist <= proximal_island_dist:
                # Never let the merged region shrink: an island that is
                # nested in (or ends before) the current one must not pull
                # the end coordinate backwards.
                current_island.end = max(current_island.end, island.end)
                current_island.value += island.value
            else:
                Final_islands.append(current_island)
                current_island = island
        # The last island (also covers the single-island input).
        Final_islands.append(current_island)
    return Final_islands
def find_read_copy_distribution(sorted_bed_list):
    """
    Build the histogram of read copy numbers.

    Input:
        sorted_bed_list: a list of sorted bed6 objects. The tags are assumed
        to come from one chromosome and one direction; reads sharing the
        same `start` are treated as copies of one read.
    Return:
        a list h where h[k] is the number of distinct start positions that
        occur exactly k times. The list has at least 100 entries and grows
        as needed so that the largest observed copy number is a valid index.
    """
    assert (Utility.is_bed_sorted(sorted_bed_list) == 1)
    histogram = [0] * 100
    if len(sorted_bed_list) == 0:
        return histogram

    def tally(copies):
        # Grow the histogram so that `copies` is a valid index, then count.
        shortfall = copies - (len(histogram) - 1)
        if shortfall > 0:
            histogram.extend([0] * shortfall)
        histogram[copies] += 1

    run_start = sorted_bed_list[0].start
    run_length = 1
    for read in sorted_bed_list[1:]:
        if read.start != run_start:
            # A new position begins: close the finished run.
            tally(run_length)
            run_start = read.start
            run_length = 1
        else:
            run_length += 1
    # Close the run that was still open when the list ended.
    tally(run_length)
    return histogram
def find_multi_copy_reads(sorted_bed_list, threshold):
    """
    Collect every read whose start position is shared by many copies.

    Input:
        sorted_bed_list: a list of sorted bed6 objects. The tags are assumed
        to come from one chromosome and one direction.
        threshold: the minimum copy number for a read to be reported.
    Return:
        the list of BED6 reads (all copies, in input order) whose copy
        number is above or equal to threshold.
    """
    assert (Utility.is_bed_sorted(sorted_bed_list) == 1)
    selected = []
    if len(sorted_bed_list) == 0:
        return selected

    # `run` holds all copies seen so far at the current start position.
    run = [sorted_bed_list[0]]
    run_start = run[0].start
    for read in sorted_bed_list[1:]:
        if read.start != run_start:
            if len(run) >= threshold:
                selected.extend(run)
            run_start = read.start
            run = [read]
        else:
            run.append(read)
    # The final run is still pending when the loop ends.
    if len(run) >= threshold:
        selected.extend(run)
    return selected
def find_n_copy_reads(sorted_bed_list, n):
    """
    Collect every read that occurs exactly n times.

    Input:
        sorted_bed_list: a list of sorted bed6 objects. The tags are assumed
        to come from one chromosome and one direction; reads sharing the
        same `start` are treated as copies of one read.
        n: the copy number to select.
    Return:
        the list of BED6 reads (all copies, in input order) with copy number
        equal to n.
    """
    assert (Utility.is_bed_sorted(sorted_bed_list) == 1)
    n_copy_read_list = []
    if len(sorted_bed_list) == 0:
        return n_copy_read_list

    # `run` holds all copies seen so far at the current start position.
    run = [sorted_bed_list[0]]
    run_start = run[0].start
    for item in sorted_bed_list[1:]:
        if item.start != run_start:
            if len(run) == n:
                n_copy_read_list.extend(run)
            run_start = item.start
            run = [item]
        else:
            run.append(item)
    # Last run: compare against n. The previous code referenced an
    # undefined name `threshold` here and failed on any non-empty input.
    if len(run) == n:
        n_copy_read_list.extend(run)
    return n_copy_read_list
def filter_reads(sorted_bed_list, cutoff, outfile):
    """
    Write the reads to outfile with redundant copies above cutoff removed.

    A read (identified by its `start`) has n copies in sorted_bed_list.
    If n <= cutoff, n copies are written. If n > cutoff, only cutoff copies
    are written.

    Input:
        sorted_bed_list: a list of sorted bed objects (checked with
        Utility.is_bed_sorted).
        cutoff: the maximum number of copies retained per start position.
        outfile: path of the file to write. If the number of reads is zero,
        the file is not generated.
    Output:
        each retained copy is emitted through write(tag, out), where tag is
        the FIRST read of its group of identical starts, so the retained
        copies of one position are identical lines.
    Return:
        the number of reads retained.
    """
    assert (Utility.is_bed_sorted(sorted_bed_list) == 1)
    retained = 0
    if len(sorted_bed_list) != 0:
        out = open(outfile, 'w')
        # try/finally so the handle is released even if write() fails.
        try:
            current_tag = sorted_bed_list[0]
            copies_seen = 0
            for item in sorted_bed_list:
                if item.start != current_tag.start:
                    # A new position begins; restart the copy counter.
                    current_tag = item
                    copies_seen = 0
                copies_seen += 1
                if copies_seen <= cutoff:
                    write(current_tag, out)
                    retained += 1
        finally:
            out.close()
    return retained
def main(argv):
    """
    Count the reads of a library that fall on each island and write them out.

    Command-line options (parsed from argv): -s species, -a raw read file
    (BED), -f fragment size, -b island file (BED), -o output file. Help is
    printed and the program exits when argv has fewer than 10 entries, when
    the species is not in GenomeData.species_chroms, or when the read file
    does not exist.

    For every island the output file gets one tab-separated line:
    chrom, start, end, read count, read count / library size * 1000000.
    The per-chromosome '.bed1' files produced along the way are removed via
    SeparateByChrom.cleanup before returning.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string", dest="species", help="species, mm8, hg18", metavar="<str>")
    parser.add_option("-a", "--rawreadfile", action="store", type="string", dest="readfile", metavar="<file>", help="raw read file in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int", dest="fragment_size", metavar="<int>", help="average size of a fragment after CHIP experiment")
    parser.add_option("-b", "--islandfile", action="store", type="string", dest="islandfile", metavar="<file>", help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string", dest="out_file", metavar="<file>", help="island read count file")

    (opt, args) = parser.parse_args(argv)
    # 5 options, each a flag plus a value -> at least 10 argv entries.
    if len(argv) < 10:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species];
    else:
        print "This species is not recognized, exiting";
        sys.exit(1);

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0);
    # Split the read file into one '<chrom>.bed1' file per chromosome
    # (these are the files opened in the loop below).
    if Utility.fileExists(opt.readfile):
        SeparateByChrom.separateByChrom(chroms, opt.readfile, '.bed1');
    else:
        print opt.readfile, " not found";
        sys.exit(1)

    total = 0;
    library_size = get_total_tag_counts.get_total_tag_counts(opt.readfile);
    # Read counts are normalized to counts per `scaling_factor` reads.
    scaling_factor = 1000000;
    out = open(opt.out_file, 'w');
    for chrom in chroms:
        if chrom in islands.keys():
            island_list = islands[chrom];
            island_readcount_list=[0]*len(island_list);

            if Utility.is_bed_sorted(island_list) == 0:
                island_list.sort(key=operator.attrgetter('start'));

            # Parallel lists of island boundaries, passed to
            # find_readcount_on_islands for the lookup.
            island_start_list = []
            island_end_list = []
            for item in island_list:
                island_start_list.append(item.start)
                island_end_list.append(item.end)

            read_file = chrom + ".bed1";
            f = open(read_file,'r')
            for line in f:
                # Lines starting with '#' are skipped.
                if not re.match("#", line):
                    line = line.strip()
                    sline = line.split()
                    position = tag_position(sline, opt.fragment_size)
                    index = find_readcount_on_islands(island_start_list, island_end_list, position);
                    # NOTE(review): a negative index presumably means the
                    # read is not on any island - confirm against the helper.
                    if index >= 0:
                        island_readcount_list[index] += 1;
                        total += 1;
            f.close();

            for index in xrange(len(island_list)):
                item = island_list[index];
                normalized_read_count = island_readcount_list[index]/float(library_size) * scaling_factor;
                outline = item.chrom + "\t" + str(item.start) + "\t" + str(item.end) + "\t" + str(island_readcount_list[index]) + "\t" + str(normalized_read_count) + "\n";
                out.write(outline);

    SeparateByChrom.cleanup(chroms, '.bed1');
    out.close();
    print "Total number of reads on islands are: ", total;
def main(argv):
    """
    Score islands by comparing ChIP read counts against a control library.

    Command-line options (parsed from argv): -s species, -a raw ChIP read
    file, -b raw control read file, -f fragment size, -d island file,
    -o output summary file, -t mappable fraction of the genome size. Help is
    printed and the program exits when argv has fewer than 14 entries; the
    program also exits when the species is unknown or a read file is missing.

    For every island one tab-separated line is written:
    chrom, start, end, chip count, control count, p-value, fold change,
    and alpha = p-value * number of islands / rank of the p-value (capped
    at 1). The per-chromosome '.bed1' (ChIP) and '.bed2' (control) files
    are removed via SeparateByChrom.cleanup at the end.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18",
                      metavar="<str>")
    parser.add_option("-a", "--rawchipreadfile", action="store",
                      type="string", dest="chipreadfile", metavar="<file>",
                      help="raw read file from chip in bed format")
    parser.add_option("-b", "--rawcontrolreadfile", action="store",
                      type="string", dest="controlreadfile",
                      metavar="<file>",
                      help="raw read file from control in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after CHIP experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")
    parser.add_option("-t", "--mappable_fraction_of_genome_size ",
                      action="store", type="float", dest="fraction",
                      help="mapable fraction of genome size",
                      metavar="<float>")

    (opt, args) = parser.parse_args(argv)
    # 7 options, each a flag plus a value -> at least 14 argv entries.
    if len(argv) < 14:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
        # Effective genome size: sum of chromosome lengths scaled by the
        # user-supplied mappable fraction.
        genomesize = sum(
            GenomeData.species_chrom_lengths[opt.species].values())
        genomesize = opt.fraction * genomesize
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    chip_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.chipreadfile)
    control_library_size = get_total_tag_counts.get_total_tag_counts(
        opt.controlreadfile)
    print "chip library size ", chip_library_size
    print "control library size ", control_library_size

    totalchip = 0
    totalcontrol = 0

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the chip library
    if Utility.fileExists(opt.chipreadfile):
        SeparateByChrom.separateByChrom(chroms, opt.chipreadfile, '.bed1')
    else:
        print opt.chipreadfile, " not found"
        sys.exit(1)
    # separate by chrom the control library
    if Utility.fileExists(opt.controlreadfile):
        SeparateByChrom.separateByChrom(chroms, opt.controlreadfile, '.bed2')
    else:
        print opt.controlreadfile, " not found"
        sys.exit(1)

    # chrom -> list of per-island read counts, index-aligned with
    # islands[chrom] (after the optional sort below).
    island_chip_readcount = {}
    island_control_readcount = {}

    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                if Utility.is_bed_sorted(island_list) == 0:
                    island_list.sort(key=operator.attrgetter('start'))

                # Parallel lists of island boundaries used for the lookup.
                island_start_list = []
                island_end_list = []
                for item in island_list:
                    island_start_list.append(item.start)
                    island_end_list.append(item.end)

                # Count ChIP reads per island.
                island_chip_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed1"
                f = open(read_file, 'r')
                for line in f:
                    # Lines starting with '#' are skipped.
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        # NOTE(review): a negative index presumably means
                        # "not on any island" - confirm against the helper.
                        if index >= 0:
                            island_chip_readcount_list[index] += 1
                            totalchip += 1
                f.close()
                island_chip_readcount[chrom] = island_chip_readcount_list

                # Count control reads per island, same procedure.
                island_control_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed2"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_control_readcount_list[index] += 1
                            totalcontrol += 1
                f.close()
                island_control_readcount[chrom] = island_control_readcount_list

    # The background read counts are computed but only referenced by the
    # commented-out alternatives below.
    chip_background_read = chip_library_size - totalchip
    control_background_read = control_library_size - totalcontrol
    #scaling_factor = chip_background_read*1.0/control_background_read;
    # Control counts are rescaled by the ratio of the library sizes.
    scaling_factor = chip_library_size * 1.0 / control_library_size

    print "Total number of chip reads on islands is: ", totalchip
    print "Total number of control reads on islands is: ", totalcontrol

    #print "chip_background_read ", chip_background_read
    #print "control_background_read ", control_background_read

    out = open(opt.out_file, 'w')
    pvalue_list = []
    result_list = []
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    observation = (island_chip_readcount[chrom])[index]
                    control_tag = (island_control_readcount[chrom])[index]
                    if (island_control_readcount[chrom])[index] > 0:
                        #average = (island_control_readcount[chrom])[index] * scaling_factor;
                        average = control_tag * scaling_factor
                        fc = float(observation) / float(average)
                    else:
                        # No control reads on this island: fall back to the
                        # expected count from a uniform spread of the control
                        # library over the genome, limited to at most 0.25
                        # before scaling.
                        length = item.end - item.start + 1
                        average = length * control_library_size * 1.0 / genomesize
                        average = min(0.25, average) * scaling_factor
                        fc = float(observation) / float(average)
                    # Poisson survival function of the ChIP count given the
                    # expected count; islands not above expectation get 1.
                    if observation > average:
                        pvalue = scipy.stats.poisson.sf(
                            (island_chip_readcount[chrom])[index],
                            average)[()]
                    else:
                        pvalue = 1
                    pvalue_list.append(pvalue)
                    item_dic = {}
                    item_dic['chrom'] = item.chrom
                    item_dic['start'] = item.start
                    item_dic['end'] = item.end
                    item_dic['chip'] = observation
                    item_dic['control'] = control_tag
                    item_dic['pvalue'] = pvalue
                    item_dic['fc'] = fc
                    result_list.append(item_dic)

    # Rank the p-values, then scale each by (number of islands / rank).
    pvaluearray = scipy.array(pvalue_list)
    pvaluerankarray = scipy.stats.rankdata(pvaluearray)
    totalnumber = len(result_list)
    for i in range(totalnumber):
        item = result_list[i]
        alpha = pvalue_list[i] * totalnumber / pvaluerankarray[i]
        if alpha > 1:
            alpha = 1
        outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(
            item['end']) + "\t" + str(item['chip']) + "\t" + str(
                item['control']) + "\t" + str(item['pvalue']) + "\t" + str(
                    item['fc']) + "\t" + str(alpha) + "\n"
        out.write(outline)

    # Earlier list-index based variant of the loop above, kept for reference:
    #pvalue_list.sort()
    #for item in result_list:
        #pvalue = float(item['pvalue'])
        #alpha = pvalue * len(result_list) / (pvalue_list.index(pvalue) + 1)
        #if alpha > 1:
            #alpha = 1;
        #outline = item['chrom'] + "\t" + str(item['start']) + "\t" + str(item['end']) + "\t" + str(item['chip']) + "\t" + str(item['control']) + "\t" + str(item['pvalue']) + "\t" + str(item['fc']) + "\t" + str(alpha) + "\n";
        #out.write(outline);

    out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')
def main(argv):
    """
    Compare the read counts of two libraries (A and B) on a set of islands.

    Command-line options (parsed from argv): -s species, -a raw read file A,
    -b raw read file B, -f fragment size, -d island file, -o output summary
    file. Help is printed and the program exits when argv has fewer than 12
    entries; the program also exits when the species is unknown or a read
    file is missing.

    The output file starts with a '#chrom ...' header line and then holds,
    per island: chrom, start, end, read count and normalized read count for
    A and for B, and fold change, p-value and FDR in both directions
    (A vs B, B vs A). After the per-chromosome '.bed1'/'.bed2' files are
    cleaned up, the Pearson and Spearman correlations of the normalized
    counts are printed.
    """
    parser = OptionParser()
    parser.add_option("-s", "--species", action="store", type="string",
                      dest="species", help="species, mm8, hg18, etc",
                      metavar="<str>")
    parser.add_option("-a", "--rawreadfileA", action="store", type="string",
                      dest="readfileA", metavar="<file>",
                      help="raw read file A in bed format")
    parser.add_option("-b", "--rawreadfileB", action="store", type="string",
                      dest="readfileB", metavar="<file>",
                      help="raw read file B in bed format")
    parser.add_option("-f", "--fragment_size", action="store", type="int",
                      dest="fragment_size", metavar="<int>",
                      help="average size of a fragment after A experiment")
    parser.add_option("-d", "--islandfile", action="store", type="string",
                      dest="islandfile", metavar="<file>",
                      help="island file in BED format")
    parser.add_option("-o", "--outfile", action="store", type="string",
                      dest="out_file", metavar="<file>",
                      help="island read count summary file")

    (opt, args) = parser.parse_args(argv)
    # 6 options, each a flag plus a value -> at least 12 argv entries.
    if len(argv) < 12:
        parser.print_help()
        sys.exit(1)

    if opt.species in GenomeData.species_chroms.keys():
        chroms = GenomeData.species_chroms[opt.species]
    else:
        print "This species is not recognized, exiting"
        sys.exit(1)

    if not Utility.fileExists(opt.readfileA):
        print opt.readfileA, " not found"
        sys.exit(1)
    if not Utility.fileExists(opt.readfileB):
        print opt.readfileB, " not found"
        sys.exit(1)

    A_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileA)
    B_library_size = get_total_tag_counts.get_total_tag_counts(opt.readfileB)
    print "Library size of ", opt.readfileA, ": ", A_library_size
    print "Library size of ", opt.readfileB, ": ", B_library_size

    totalA = 0
    totalB = 0

    islands = BED.BED(opt.species, opt.islandfile, "BED3", 0)

    # separate by chrom the A library
    SeparateByChrom.separateByChrom(chroms, opt.readfileA, '.bed1')
    # separate by chrom the B library
    SeparateByChrom.separateByChrom(chroms, opt.readfileB, '.bed2')

    # chrom -> list of per-island read counts, index-aligned with
    # islands[chrom] (after the optional sort below).
    island_A_readcount = {}
    island_B_readcount = {}

    #Find read counts on the islands
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                if Utility.is_bed_sorted(island_list) == 0:
                    island_list.sort(key=operator.attrgetter('start'))

                # Parallel lists of island boundaries used for the lookup.
                island_start_list = []
                island_end_list = []
                for item in island_list:
                    island_start_list.append(item.start)
                    island_end_list.append(item.end)

                # Count library A reads per island.
                island_A_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed1"
                f = open(read_file, 'r')
                for line in f:
                    # Lines starting with '#' are skipped.
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        # NOTE(review): a negative index presumably means
                        # "not on any island" - confirm against the helper.
                        if index >= 0:
                            island_A_readcount_list[index] += 1
                            totalA += 1
                f.close()
                island_A_readcount[chrom] = island_A_readcount_list

                # Count library B reads per island, same procedure.
                island_B_readcount_list = [0] * len(island_list)
                read_file = chrom + ".bed2"
                f = open(read_file, 'r')
                for line in f:
                    if not re.match("#", line):
                        line = line.strip()
                        sline = line.split()
                        position = associate_tags_with_regions.tag_position(
                            sline, opt.fragment_size)
                        index = associate_tags_with_regions.find_readcount_on_islands(
                            island_start_list, island_end_list, position)
                        if index >= 0:
                            island_B_readcount_list[index] += 1
                            totalB += 1
                f.close()
                island_B_readcount[chrom] = island_B_readcount_list

    #A_background_read = A_library_size - totalA;
    #B_background_read = B_library_size - totalB;

    print "Total number of A reads on islands is: ", totalA
    print "Total number of B reads on islands is: ", totalB

    # Calculate the p value.
    library_scaling_factor = A_library_size * 1.0 / B_library_size  #A vs B
    # Added to both counts in the fold-change formulas below, which keeps
    # the denominators non-zero for empty islands.
    pseudo_count = 1
    pvalue_A_vs_B_list = []
    pvalue_B_vs_A_list = []
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    Acount = (island_A_readcount[chrom])[index]
                    Bcount = (island_B_readcount[chrom])[index]
                    # `pvaule` (sic) is a helper defined outside this block.
                    pvalue_A_vs_B = pvaule(Acount, Bcount,
                                           library_scaling_factor,
                                           pseudo_count)
                    pvalue_A_vs_B_list.append(pvalue_A_vs_B)
                    pvalue_B_vs_A = pvaule(Bcount, Acount,
                                           1 / library_scaling_factor,
                                           pseudo_count)
                    pvalue_B_vs_A_list.append(pvalue_B_vs_A)

    #Calculate the FDR
    fdr_A_vs_B_list = fdr(pvalue_A_vs_B_list)
    fdr_B_vs_A_list = fdr(pvalue_B_vs_A_list)

    #Output the islands read counts, normalized read counts, fc, pvalue both ways
    scaling_factor = 1000000
    out = open(opt.out_file, 'w')
    outline = '#chrom' + "\t" + 'start' + "\t" + 'end' + "\t" + "Readcount_A" + "\t" + 'Normalized_Readcount_A' + "\t" + 'ReadcountB' + "\t" + 'Normalized_Readcount_B' + "\t" + "Fc_A_vs_B" + "\t" + "pvalue_A_vs_B" + "\t" + "FDR_A_vs_B" + "\t" + "Fc_B_vs_A" + "\t" + "pvalue_B_vs_A" + "\t" + "FDR_B_vs_A" + "\n"
    out.write(outline)
    # `ii` walks the flat p-value/FDR lists; the loop order here matches
    # the loop that filled them, so the entries line up island by island.
    ii = 0
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                island_list = islands[chrom]
                for index in xrange(len(island_list)):
                    item = island_list[index]
                    Acount = (island_A_readcount[chrom])[index]
                    Bcount = (island_B_readcount[chrom])[index]
                    normalized_A = Acount / float(
                        A_library_size) * scaling_factor
                    normalized_B = Bcount / float(
                        B_library_size) * scaling_factor
                    fc_A_vs_B = (
                        (Acount + pseudo_count) * 1.0 /
                        (Bcount + pseudo_count)) / library_scaling_factor
                    fc_B_vs_A = (
                        (Bcount + pseudo_count) * 1.0 /
                        (Acount + pseudo_count)) * library_scaling_factor
                    outline = item.chrom + "\t" + str(item.start) + "\t" + str(
                        item.end) + "\t" + str(Acount) + "\t" + str(
                            normalized_A) + "\t" + str(Bcount) + "\t" + str(
                                normalized_B
                            ) + "\t" + str(fc_A_vs_B) + "\t" + str(
                                pvalue_A_vs_B_list[ii]) + "\t" + str(
                                    fdr_A_vs_B_list[ii]
                                ) + "\t" + str(fc_B_vs_A) + "\t" + str(
                                    pvalue_B_vs_A_list[ii]) + "\t" + str(
                                        fdr_B_vs_A_list[ii]) + "\n"
                    out.write(outline)
                    ii += 1
    out.close()

    SeparateByChrom.cleanup(chroms, '.bed1')
    SeparateByChrom.cleanup(chroms, '.bed2')

    # Calculate the correlations using normalized read counts
    # Each chromosome's counts are prepended to the running arrays; A and B
    # are built in the same order, so their entries stay paired.
    A_array = ()
    B_array = ()
    for chrom in chroms:
        if chrom in islands.keys():
            if len(islands[chrom]) != 0:
                temp_array = scipy.array(island_A_readcount[chrom])
                A_array = scipy.concatenate((temp_array, A_array))
                temp_array = scipy.array(island_B_readcount[chrom])
                B_array = scipy.concatenate((temp_array, B_array))
    #Normalization to reads per million
    A_array = A_array / float(A_library_size) * scaling_factor
    B_array = B_array / float(B_library_size) * scaling_factor
    pearson = scipy.stats.pearsonr(A_array, B_array)
    print "Pearson's correlation is: ", pearson[0], " with p-value ", pearson[
        1]
    spearman = scipy.stats.spearmanr(A_array, B_array)
    print "Spearman's correlation is: ", spearman[
        0], " with p-value ", spearman[1]