def pair_clusters_by_bin(bin_args):
    """Pair the forward and reverse clusters of one bin.

    The function takes a single tuple argument so it can be passed to
    map-style callers unchanged.

    bin_args -- tuple (key, fwd_clusters, rev_clusters, bam_file_name, verbose):
        key            label printed in the progress message
        fwd_clusters   forward clusters; filtered with remove_overlapping_clusters
        rev_clusters   reverse clusters; filtered the same way
        bam_file_name  BAM file opened with pysam; reads over each candidate
                       insertion interval are fetched from it
        verbose        when true, print the number of filtered clusters

    Returns (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters).
    """
    key, fwd_clusters, rev_clusters, bam_file_name, verbose = bin_args

    print("processing cluster pairs on %s" % (key))

    non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters)
    if verbose:
        print("non overlapping fwd clusters\t%d" % (len(non_overlapping_fwd_clusters)))

    non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters)
    if verbose:
        print("non overlapping rev clusters\t%d" % (len(non_overlapping_rev_clusters)))

    # Pair clusters by genomic location, remembering which indices were
    # paired so the unpaired ones can be picked out afterwards.  Sets keep
    # the later membership tests constant-time.
    cluster_pairs = []
    paired_fwd_indices = set()
    paired_rev_indices = set()

    proper_pair_bam = pysam.Samfile(bam_file_name, "rb")
    try:
        for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
            for rev_index, rev_cluster in enumerate(non_overlapping_rev_clusters):
                if not fwd_cluster.is_overlapping_strict(rev_cluster):
                    continue

                new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                reads = proper_pair_bam.fetch(
                    new_cluster_pair.get_chr(),
                    new_cluster_pair.get_insertion_int_start(),
                    new_cluster_pair.get_insertion_int_end())
                # NOTE(review): assumes calc_zygosity consumes `reads` before
                # returning, since the BAM handle is closed below -- confirm.
                new_cluster_pair.calc_zygosity(reads)

                if (new_cluster_pair.get_insertion_int_end()
                        < new_cluster_pair.get_insertion_int_start()):
                    # Inverted insertion interval: report it and leave both
                    # clusters unpaired.
                    print("cluster pair not paired!")
                    continue

                cluster_pairs.append(new_cluster_pair)
                paired_fwd_indices.add(fwd_index)
                paired_rev_indices.add(rev_index)
    finally:
        # The handle used to be leaked on every call.
        proper_pair_bam.close()

    # make lists of unpaired clusters
    unpaired_fwd_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_fwd_clusters)
        if index not in paired_fwd_indices
    ]
    unpaired_rev_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_rev_clusters)
        if index not in paired_rev_indices
    ]

    return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)
def pair_clusters_by_bin(bin_args):
    """Pair the forward and reverse clusters of one bin.

    The function takes a single tuple argument so it can be passed to
    map-style callers unchanged.

    bin_args -- tuple (key, fwd_clusters, rev_clusters, bam_file_name, verbose,
                       bed_file_handle, streaming, min_cluster_size):
        key               label printed in the progress message
        fwd_clusters      forward clusters; filtered with
                          remove_overlapping_clusters(..., min_cluster_size)
        rev_clusters      reverse clusters; filtered the same way
        bam_file_name     BAM file opened with pysam (only when not streaming)
        verbose           when true, print the number of filtered clusters
        bed_file_handle   not used by this function
        streaming         when true, skip the BAM lookup / zygosity step and
                          collect one BED line per overlapping pair instead
        min_cluster_size  forwarded to remove_overlapping_clusters

    Returns (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
    bed_string) when streaming, otherwise the first three elements only.
    Every BED line in bed_string is preceded by a newline, so a non-empty
    bed_string starts with a newline.
    """
    (key, fwd_clusters, rev_clusters, bam_file_name, verbose,
     bed_file_handle, streaming, min_cluster_size) = bin_args

    print("processing cluster pairs on %s" % (key))

    non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters, min_cluster_size)
    if verbose:
        print("non overlapping fwd clusters\t%d" % (len(non_overlapping_fwd_clusters)))

    non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters, min_cluster_size)
    if verbose:
        print("non overlapping rev clusters\t%d" % (len(non_overlapping_rev_clusters)))

    # The BAM file is only needed for the zygosity step, which streaming skips.
    proper_pair_bam = None
    if not streaming:
        proper_pair_bam = pysam.Samfile(bam_file_name, "rb")

    # Pair clusters by genomic location, remembering which indices were
    # paired so the unpaired ones can be picked out afterwards.  Sets keep
    # the later membership tests constant-time.
    cluster_pairs = []
    paired_fwd_indices = set()
    paired_rev_indices = set()
    bed_lines = []

    # NOTE(review): restarting the inner scan at the last overlapping reverse
    # index, together with the early break, looks like it relies on both
    # lists being ordered by position -- confirm that
    # remove_overlapping_clusters guarantees this.
    last_intersect = 0
    try:
        for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
            for rev_index in range(last_intersect, len(non_overlapping_rev_clusters)):
                rev_cluster = non_overlapping_rev_clusters[rev_index]

                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                    last_intersect = rev_index

                    if not streaming:
                        reads = proper_pair_bam.fetch(
                            new_cluster_pair.get_chr(),
                            new_cluster_pair.get_insertion_int_start(),
                            new_cluster_pair.get_insertion_int_end())
                        # NOTE(review): assumes calc_zygosity consumes `reads`
                        # before returning, since the handle is closed below.
                        new_cluster_pair.calc_zygosity(reads)
                    else:
                        bed_lines.append(new_cluster_pair.to_bed())

                    if (new_cluster_pair.get_insertion_int_end()
                            < new_cluster_pair.get_insertion_int_start()):
                        # Inverted insertion interval: report it and leave
                        # both clusters unpaired.
                        print("cluster pair not paired!")
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_indices.add(fwd_index)
                        paired_rev_indices.add(rev_index)
                elif fwd_cluster.intersection_end < rev_cluster.intersection_start:
                    # This reverse cluster starts after the forward cluster's
                    # intersection ends; stop scanning for this forward cluster.
                    break
    finally:
        # The handle used to be leaked on every non-streaming call.
        if proper_pair_bam is not None:
            proper_pair_bam.close()

    # Same text as appending "\n" + line for every pair, built in one pass.
    bed_string = "".join("\n" + bed_line for bed_line in bed_lines)

    # make lists of unpaired clusters
    unpaired_fwd_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_fwd_clusters)
        if index not in paired_fwd_indices
    ]
    unpaired_rev_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_rev_clusters)
        if index not in paired_rev_indices
    ]

    if streaming:
        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string)
    return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)
def generate_clusters(self, verbose, psorted_bamfile_name, bed_file_handle, streaming, min_cluster_size):
    """Cluster this object's read pairs and pair fwd clusters with rev clusters.

    Non-parallel version: read pairs in self.read_pair_list are split by
    interval_direction ("fwd" / "rev"), clustered with
    cluster_read_pairs_all, filtered with remove_overlapping_clusters and
    then paired wherever is_overlapping_strict holds.

    verbose               when true, print per-pair details after pairing
    psorted_bamfile_name  BAM file opened with pysam (only when not streaming)
                          to fetch reads over each candidate insertion interval
    bed_file_handle       not used by this function
    streaming             when true, skip the BAM lookup / zygosity step and
                          collect one BED line per overlapping pair instead
    min_cluster_size      forwarded to remove_overlapping_clusters

    Returns (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
    bed_string); bed_string stays "" when not streaming.
    """
    # cluster fwd intervals
    fwd_read_pairs = [rp for rp in self.read_pair_list if rp.interval_direction == "fwd"]
    fwd_clusters = cluster_read_pairs_all(fwd_read_pairs)
    print("******************total fwd clusters found: %d" % len(fwd_clusters))
    non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters, min_cluster_size)
    print("******************total fwd non-overlapping clusters found: %d" % len(non_overlapping_fwd_clusters))

    # cluster rev intervals
    rev_read_pairs = [rp for rp in self.read_pair_list if rp.interval_direction == "rev"]
    rev_clusters = cluster_read_pairs_all(rev_read_pairs)
    print("******************total rev clusters found: %d" % len(rev_clusters))
    non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters, min_cluster_size)
    print("******************total rev non-overlapping clusters found: %d" % len(non_overlapping_rev_clusters))

    # The BAM file is only read in the non-streaming branch, so it is only
    # opened there.
    psorted_bamfile = None
    if not streaming:
        psorted_bamfile = pysam.Samfile(psorted_bamfile_name, "rb")

    # Pair clusters by genomic location, remembering which indices were
    # paired so the unpaired ones can be picked out afterwards.  Sets keep
    # the later membership tests constant-time.
    cluster_pairs = []
    paired_fwd_indices = set()
    paired_rev_indices = set()
    bed_string = ""

    # NOTE(review): restarting the inner scan at the last overlapping reverse
    # index, together with the early break, looks like it relies on both
    # lists being ordered by position -- confirm that
    # remove_overlapping_clusters guarantees this.
    last_intersect = 0
    try:
        for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
            for rev_index in range(last_intersect, len(non_overlapping_rev_clusters)):
                rev_cluster = non_overlapping_rev_clusters[rev_index]

                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    last_intersect = rev_index
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)

                    if not streaming:
                        # Fetch from the handle opened above; the previous
                        # code referenced an undefined `proper_pair_bam`
                        # here and raised NameError.
                        reads = psorted_bamfile.fetch(
                            new_cluster_pair.get_chr(),
                            new_cluster_pair.get_insertion_int_start(),
                            new_cluster_pair.get_insertion_int_end())
                        # NOTE(review): assumes calc_zygosity consumes `reads`
                        # before returning, since the handle is closed below.
                        new_cluster_pair.calc_zygosity(reads)
                    else:
                        bed_line = new_cluster_pair.to_bed()
                        if bed_string == "":
                            bed_string = bed_line
                        else:
                            bed_string = bed_string + "\n" + bed_line

                    if new_cluster_pair.insertion_int_end < new_cluster_pair.insertion_int_start:
                        # Inverted insertion interval: report it and leave
                        # both clusters unpaired.
                        print("cluster pair not paired!")
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_indices.add(fwd_index)
                        paired_rev_indices.add(rev_index)
                elif fwd_cluster.intersection_end < rev_cluster.intersection_start:
                    # This reverse cluster starts after the forward cluster's
                    # intersection ends; stop scanning for this forward cluster.
                    break
    finally:
        if psorted_bamfile is not None:
            psorted_bamfile.close()

    # make lists of unpaired clusters
    unpaired_fwd_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_fwd_clusters)
        if index not in paired_fwd_indices
    ]
    unpaired_rev_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_rev_clusters)
        if index not in paired_rev_indices
    ]

    print("******************total cluster pairs found: %d" % len(cluster_pairs))

    if verbose:
        # NOTE(review): this unpacks every element of cluster_pairs into two
        # indexable sequences of reads, while the loop above appends
        # ClusterPair objects -- confirm ClusterPair supports this.
        for (fwd_cluster, rev_cluster) in cluster_pairs:
            print("*************************cluster_pair:**************************************")
            print("fwd cluster:")
            print("cluster coordinates: %s %d %d" % (fwd_cluster[0].interval_chr, fwd_cluster[0].interval_start, fwd_cluster[-1].interval_end))
            print(" ".join(read.str_int() for read in fwd_cluster))
            print(" ".join(read.str_TE_annot_list() for read in fwd_cluster))
            print("rev cluster:")
            print("cluster coordinates: %s %d %d" % (rev_cluster[0].interval_chr, rev_cluster[0].interval_start, rev_cluster[-1].interval_end))
            print(" ".join(read.str_int() for read in rev_cluster))
            print(" ".join(read.str_TE_annot_list() for read in rev_cluster))

    return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string)
def pair_clusters_by_bin(bin_args):
    """Pair the forward and reverse clusters of one bin.

    The function takes a single tuple argument so it can be passed to
    map-style callers unchanged.

    bin_args -- tuple (key, fwd_clusters, rev_clusters, bam_file_name, verbose,
                       bed_file_handle, streaming, min_cluster_size):
        key               label printed in the progress message
        fwd_clusters      forward clusters; filtered with
                          remove_overlapping_clusters(..., min_cluster_size)
        rev_clusters      reverse clusters; filtered the same way
        bam_file_name     BAM file opened with pysam (only when not streaming)
        verbose           when true, print the number of filtered clusters
        bed_file_handle   not used by this function
        streaming         when true, skip the BAM lookup / zygosity step and
                          collect one BED line per overlapping pair instead
        min_cluster_size  forwarded to remove_overlapping_clusters

    Returns (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
    bed_string) when streaming, otherwise the first three elements only.
    Every BED line in bed_string is preceded by a newline, so a non-empty
    bed_string starts with a newline.
    """
    (key, fwd_clusters, rev_clusters, bam_file_name, verbose,
     bed_file_handle, streaming, min_cluster_size) = bin_args

    print("processing cluster pairs on %s" % (key))

    non_overlapping_fwd_clusters = remove_overlapping_clusters(
        fwd_clusters, min_cluster_size)
    if verbose:
        print("non overlapping fwd clusters\t%d" % (
            len(non_overlapping_fwd_clusters)))

    non_overlapping_rev_clusters = remove_overlapping_clusters(
        rev_clusters, min_cluster_size)
    if verbose:
        print("non overlapping rev clusters\t%d" % (
            len(non_overlapping_rev_clusters)))

    # The BAM file is only needed for the zygosity step, which streaming skips.
    proper_pair_bam = None
    if not streaming:
        proper_pair_bam = pysam.Samfile(bam_file_name, "rb")

    # Pair clusters by genomic location, remembering which indices were
    # paired so the unpaired ones can be picked out afterwards.  Sets keep
    # the later membership tests constant-time.
    cluster_pairs = []
    paired_fwd_indices = set()
    paired_rev_indices = set()
    bed_lines = []

    # NOTE(review): restarting the inner scan at the last overlapping reverse
    # index, together with the early break, looks like it relies on both
    # lists being ordered by position -- confirm that
    # remove_overlapping_clusters guarantees this.
    last_intersect = 0
    try:
        for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
            for rev_index in range(last_intersect,
                                   len(non_overlapping_rev_clusters)):
                rev_cluster = non_overlapping_rev_clusters[rev_index]

                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                    last_intersect = rev_index

                    if not streaming:
                        reads = proper_pair_bam.fetch(
                            new_cluster_pair.get_chr(),
                            new_cluster_pair.get_insertion_int_start(),
                            new_cluster_pair.get_insertion_int_end())
                        # NOTE(review): assumes calc_zygosity consumes `reads`
                        # before returning, since the handle is closed below.
                        new_cluster_pair.calc_zygosity(reads)
                    else:
                        bed_lines.append(new_cluster_pair.to_bed())

                    if (new_cluster_pair.get_insertion_int_end()
                            < new_cluster_pair.get_insertion_int_start()):
                        # Inverted insertion interval: report it and leave
                        # both clusters unpaired.
                        print("cluster pair not paired!")
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_indices.add(fwd_index)
                        paired_rev_indices.add(rev_index)
                elif fwd_cluster.intersection_end < rev_cluster.intersection_start:
                    # This reverse cluster starts after the forward cluster's
                    # intersection ends; stop scanning for this forward cluster.
                    break
    finally:
        # The handle used to be leaked on every non-streaming call.
        if proper_pair_bam is not None:
            proper_pair_bam.close()

    # Same text as appending "\n" + line for every pair, built in one pass.
    bed_string = "".join("\n" + bed_line for bed_line in bed_lines)

    # make lists of unpaired clusters
    unpaired_fwd_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_fwd_clusters)
        if index not in paired_fwd_indices
    ]
    unpaired_rev_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_rev_clusters)
        if index not in paired_rev_indices
    ]

    if streaming:
        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
                bed_string)
    return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)
def generate_clusters(self, verbose, psorted_bamfile_name, bed_file_handle,
                      streaming, min_cluster_size):
    """Cluster this object's read pairs and pair fwd clusters with rev clusters.

    Non-parallel version: read pairs in self.read_pair_list are split by
    interval_direction ("fwd" / "rev"), clustered with
    cluster_read_pairs_all, filtered with remove_overlapping_clusters and
    then paired wherever is_overlapping_strict holds.

    verbose               when true, print per-pair details after pairing
    psorted_bamfile_name  BAM file opened with pysam (only when not streaming)
                          to fetch reads over each candidate insertion interval
    bed_file_handle       not used by this function
    streaming             when true, skip the BAM lookup / zygosity step and
                          collect one BED line per overlapping pair instead
    min_cluster_size      forwarded to remove_overlapping_clusters

    Returns (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
    bed_string); bed_string stays "" when not streaming.
    """
    # cluster fwd intervals
    fwd_read_pairs = [
        rp for rp in self.read_pair_list
        if rp.interval_direction == "fwd"
    ]
    fwd_clusters = cluster_read_pairs_all(fwd_read_pairs)
    print("******************total fwd clusters found: %d" % len(
        fwd_clusters))
    non_overlapping_fwd_clusters = remove_overlapping_clusters(
        fwd_clusters, min_cluster_size)
    print("******************total fwd non-overlapping clusters found: %d" % len(
        non_overlapping_fwd_clusters))

    # cluster rev intervals
    rev_read_pairs = [
        rp for rp in self.read_pair_list
        if rp.interval_direction == "rev"
    ]
    rev_clusters = cluster_read_pairs_all(rev_read_pairs)
    print("******************total rev clusters found: %d" % len(
        rev_clusters))
    non_overlapping_rev_clusters = remove_overlapping_clusters(
        rev_clusters, min_cluster_size)
    print("******************total rev non-overlapping clusters found: %d" % len(
        non_overlapping_rev_clusters))

    # The BAM file is only read in the non-streaming branch, so it is only
    # opened there.
    psorted_bamfile = None
    if not streaming:
        psorted_bamfile = pysam.Samfile(psorted_bamfile_name, "rb")

    # Pair clusters by genomic location, remembering which indices were
    # paired so the unpaired ones can be picked out afterwards.  Sets keep
    # the later membership tests constant-time.
    cluster_pairs = []
    paired_fwd_indices = set()
    paired_rev_indices = set()
    bed_string = ""

    # NOTE(review): restarting the inner scan at the last overlapping reverse
    # index, together with the early break, looks like it relies on both
    # lists being ordered by position -- confirm that
    # remove_overlapping_clusters guarantees this.
    last_intersect = 0
    try:
        for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
            for rev_index in range(last_intersect,
                                   len(non_overlapping_rev_clusters)):
                rev_cluster = non_overlapping_rev_clusters[rev_index]

                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    last_intersect = rev_index
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)

                    if not streaming:
                        # Fetch from the handle opened above; the previous
                        # code referenced an undefined `proper_pair_bam`
                        # here and raised NameError.
                        reads = psorted_bamfile.fetch(
                            new_cluster_pair.get_chr(),
                            new_cluster_pair.get_insertion_int_start(),
                            new_cluster_pair.get_insertion_int_end())
                        # NOTE(review): assumes calc_zygosity consumes `reads`
                        # before returning, since the handle is closed below.
                        new_cluster_pair.calc_zygosity(reads)
                    else:
                        bed_line = new_cluster_pair.to_bed()
                        if bed_string == "":
                            bed_string = bed_line
                        else:
                            bed_string = bed_string + "\n" + bed_line

                    if new_cluster_pair.insertion_int_end < new_cluster_pair.insertion_int_start:
                        # Inverted insertion interval: report it and leave
                        # both clusters unpaired.
                        print("cluster pair not paired!")
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_indices.add(fwd_index)
                        paired_rev_indices.add(rev_index)
                elif fwd_cluster.intersection_end < rev_cluster.intersection_start:
                    # This reverse cluster starts after the forward cluster's
                    # intersection ends; stop scanning for this forward cluster.
                    break
    finally:
        if psorted_bamfile is not None:
            psorted_bamfile.close()

    # make lists of unpaired clusters
    unpaired_fwd_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_fwd_clusters)
        if index not in paired_fwd_indices
    ]
    unpaired_rev_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_rev_clusters)
        if index not in paired_rev_indices
    ]

    print("******************total cluster pairs found: %d" % len(
        cluster_pairs))

    if verbose:
        # NOTE(review): this unpacks every element of cluster_pairs into two
        # indexable sequences of reads, while the loop above appends
        # ClusterPair objects -- confirm ClusterPair supports this.
        for (fwd_cluster, rev_cluster) in cluster_pairs:
            print("*************************cluster_pair:**************************************")
            print("fwd cluster:")
            print("cluster coordinates: %s %d %d" % (
                fwd_cluster[0].interval_chr, fwd_cluster[0].interval_start,
                fwd_cluster[-1].interval_end))
            print(" ".join(read.str_int() for read in fwd_cluster))
            print(" ".join(read.str_TE_annot_list() for read in fwd_cluster))
            print("rev cluster:")
            print("cluster coordinates: %s %d %d" % (
                rev_cluster[0].interval_chr, rev_cluster[0].interval_start,
                rev_cluster[-1].interval_end))
            print(" ".join(read.str_int() for read in rev_cluster))
            print(" ".join(read.str_TE_annot_list() for read in rev_cluster))

    return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
            bed_string)
def generate_clusters(self, verbose, psorted_bamfile_name):
    """Cluster this object's read pairs and pair fwd clusters with rev clusters.

    Non-parallel version: read pairs in self.read_pair_list are split by
    interval_direction ("fwd" / "rev"), clustered with
    cluster_read_pairs_all, filtered with remove_overlapping_clusters and
    then every forward cluster is compared against every reverse cluster;
    those for which is_overlapping_strict holds are paired.

    verbose               when true, print per-pair details after pairing
    psorted_bamfile_name  BAM file opened with pysam; reads over each
                          candidate insertion interval are fetched from it

    Returns (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters).
    """
    # cluster fwd intervals
    fwd_read_pairs = [rp for rp in self.read_pair_list if rp.interval_direction == "fwd"]
    fwd_clusters = cluster_read_pairs_all(fwd_read_pairs)
    print("******************total fwd clusters found: %d" % len(fwd_clusters))
    non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters)
    print("******************total fwd non-overlapping clusters found: %d" % len(non_overlapping_fwd_clusters))

    # cluster rev intervals
    rev_read_pairs = [rp for rp in self.read_pair_list if rp.interval_direction == "rev"]
    rev_clusters = cluster_read_pairs_all(rev_read_pairs)
    print("******************total rev clusters found: %d" % len(rev_clusters))
    non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters)
    print("******************total rev non-overlapping clusters found: %d" % len(non_overlapping_rev_clusters))

    # Pair clusters by genomic location, remembering which indices were
    # paired so the unpaired ones can be picked out afterwards.  Sets keep
    # the later membership tests constant-time.
    cluster_pairs = []
    paired_fwd_indices = set()
    paired_rev_indices = set()

    psorted_bamfile = pysam.Samfile(psorted_bamfile_name, "rb")
    try:
        for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
            for rev_index, rev_cluster in enumerate(non_overlapping_rev_clusters):
                if not fwd_cluster.is_overlapping_strict(rev_cluster):
                    continue

                new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                reads = psorted_bamfile.fetch(
                    new_cluster_pair.get_chr(),
                    new_cluster_pair.get_insertion_int_start(),
                    new_cluster_pair.get_insertion_int_end())
                # NOTE(review): assumes calc_zygosity consumes `reads` before
                # returning, since the BAM handle is closed below -- confirm.
                new_cluster_pair.calc_zygosity(reads)

                if new_cluster_pair.insertion_int_end < new_cluster_pair.insertion_int_start:
                    # Inverted insertion interval: report it and leave both
                    # clusters unpaired.
                    print("cluster pair not paired!")
                    continue

                cluster_pairs.append(new_cluster_pair)
                paired_fwd_indices.add(fwd_index)
                paired_rev_indices.add(rev_index)
    finally:
        # The handle used to be leaked on every call.
        psorted_bamfile.close()

    # make lists of unpaired clusters
    unpaired_fwd_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_fwd_clusters)
        if index not in paired_fwd_indices
    ]
    unpaired_rev_clusters = [
        cluster
        for index, cluster in enumerate(non_overlapping_rev_clusters)
        if index not in paired_rev_indices
    ]

    print("******************total cluster pairs found: %d" % len(cluster_pairs))

    if verbose:
        # NOTE(review): this unpacks every element of cluster_pairs into two
        # indexable sequences of reads, while the loop above appends
        # ClusterPair objects -- confirm ClusterPair supports this.
        for (fwd_cluster, rev_cluster) in cluster_pairs:
            print("*************************cluster_pair:**************************************")
            print("fwd cluster:")
            print("cluster coordinates: %s %d %d" % (fwd_cluster[0].interval_chr, fwd_cluster[0].interval_start, fwd_cluster[-1].interval_end))
            print(" ".join(read.str_int() for read in fwd_cluster))
            print(" ".join(read.str_TE_annot_list() for read in fwd_cluster))
            print("rev cluster:")
            print("cluster coordinates: %s %d %d" % (rev_cluster[0].interval_chr, rev_cluster[0].interval_start, rev_cluster[-1].interval_end))
            print(" ".join(read.str_int() for read in rev_cluster))
            print(" ".join(read.str_TE_annot_list() for read in rev_cluster))

    return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)