def remove_duplicates(self, args):
    """
    Merge fusions of this dataset that match each other and rebuild the dataset.

    - First create a table of those that overlap
    - Then create merged entries based on the overlap matrix

    Every list stored in the two-level `self.index` is scanned pairwise with
    `CompareFusionsBySpanningGenes.match_fusions`. Fusions without annotated
    genes on the left or on the right side are dropped. Afterwards the
    dataset is emptied with `self.flush()` and the surviving fusions are
    added again with `self.add_fusion()`.

    Parameters:
        args: object providing `matching_method` ("overlap", "subset" or
              "egm"); it is passed on unchanged to
              `CompareFusionsBySpanningGenes`.

    Returns:
        The number of fusions in the dataset after removal (`len(self)`).

    Raises:
        Exception: when the dataset has no gene annotations, or when
                   `args.matching_method` is not a supported method.
    """
    if not self.genes_spanning_left_junction or not self.genes_spanning_right_junction:
        raise Exception("Gene annotations on dataset '" + self.name + "' were not found")

    old_count = len(self)

    # Names containing "vs." are not reported
    # (presumably these are comparison results rather than input datasets - TODO confirm)
    if "vs." not in self.name:
        self.logger.info("Duplication removal: " + self.name + " (" + str(old_count) + " fusions)")

    if args.matching_method not in ("overlap", "subset", "egm"):
        raise Exception("Unknown overlap method for removing duplicates: '" + args.matching_method + "' for dataset " + self.name)

    # Imported here, as in the original code, so the module is only loaded when needed
    from CompareFusionsBySpanningGenes import CompareFusionsBySpanningGenes
    overlap = CompareFusionsBySpanningGenes(False, False, args)

    stats_non_gene_spanning = 0
    fusions_to_add = []

    for _left_key, right_index in self.index.items():
        for _right_key, all_fusions in right_index.items():
            n = len(all_fusions)

            # Slots are set to False once a fusion is discarded or merged into another one
            queue = range(n)
            while queue:
                duplicates = []

                for i in queue:
                    fusion_1 = all_fusions[i]
                    if not fusion_1:
                        continue

                    if len(fusion_1.get_annotated_genes_left()) == 0 or len(fusion_1.get_annotated_genes_right()) == 0:
                        stats_non_gene_spanning += 1
                        all_fusions[i] = False
                        continue

                    is_duplicate = False
                    for j in range(i + 1, n):
                        fusion_2 = all_fusions[j]
                        if not fusion_2:
                            continue

                        match = overlap.match_fusions(fusion_1, fusion_2, False)
                        if match:
                            # The returned object takes over slot i; slot j is cleared
                            fusion_1 = match
                            all_fusions[i] = match
                            all_fusions[j] = False
                            is_duplicate = True

                    if is_duplicate:
                        # A changed entry is compared again in the next pass
                        duplicates.append(i)

                queue = duplicates

            fusions_to_add.extend(fusion for fusion in all_fusions if fusion)

    self.flush()
    for fusion in fusions_to_add:
        self.add_fusion(fusion)

    if "vs." not in self.name:
        self.logger.info("* Full: " + str(old_count))
        self.logger.info("* Gene-spanning: " + str(old_count - stats_non_gene_spanning))
        self.logger.info("* Unique: " + str(len(self)))

    return len(self)
def overlay_fusions(self, sparse=True, export_dir=False, args=None):
    """
    Compare the datasets combination by combination and record their overlap.

    Every dataset is first stored in `self.matrix_tmp` under its 1-based
    position ("1", "2", ...). For each group yielded by
    `self.find_combination_table()`, every combination in that group is
    translated into three keys by `self.create_keys()`: the entries stored
    under the first two keys are compared with
    `CompareFusionsBySpanningGenes(...).find_overlap()` and the first element
    of the result is stored under the third key. Its length is recorded in
    `self.matches_total`.

    The SPARSE variable should only be True if the output format is
    'summary'. NOTE(review): the memory clean-up the original note referred
    to (space complexity 0.5(n^2) => 2n) is not active in this method; here
    `sparse` only decides whether the "extensive" export is written.

    Parameters:
        sparse:     when False (and `export_dir` is set and `args.format` is
                    "extensive") every comparison result is exported with
                    `export_to_CG_Junctions_file`.
        export_dir: False to disable exporting. For the "list" format it is
                    used as a writable file-like object (`.write()`); for the
                    "extensive" format it is used as a directory path.
        args:       object providing `format` and, for the "list" format,
                    `long_gene_size`. Despite the default of None it is
                    required: `args.format` is always read.

    Returns:
        The result of the last `find_overlap()` call, or None when
        `find_combination_table()` yields no combinations.
    """
    dataset_count = len(self.datasets)

    self.logger.info("Determining the overlap of fusion genes in " + str(dataset_count) + " datasets")

    # Each dataset starts out in the working matrix under its 1-based position
    self.matrix_tmp = {}
    for position, dataset in enumerate(self.datasets, start=1):
        self.matrix_tmp[str(position)] = dataset

    list_format = (args.format == "list")

    if list_format and export_dir is not False:
        if args.long_gene_size > 0:
            large_genes = "Spans large gene (>" + str(args.long_gene_size) + "bp)"
        else:
            large_genes = "Spans large gene (feature disabled)"

        export_dir.write("Left-genes\tRight-genes\t" + large_genes + "\t" + "\t".join(self.dataset_names) + "\n")

    # Both names are only assigned inside the loop; give them a value up front so that
    # an empty combination table ends in a normal return instead of an UnboundLocalError
    matches = None
    r_0 = None

    for ri, r in enumerate(self.find_combination_table(dataset_count)):
        r_0 = self.find_combination_table_r_i(dataset_count, ri, 0)
        matches_this_iteration = set()

        # NOTE: an earlier clean-up step that dropped matrix entries not needed by this
        # round is disabled; entries are only removed by the "list" export below

        # Run the analysis
        for c in r:
            keys = self.create_keys(c)

            comparison = CompareFusionsBySpanningGenes(self.matrix_tmp[keys[0]], self.matrix_tmp[keys[1]], args)
            matches = comparison.find_overlap()
            matches_this_iteration |= matches[3]

            if not sparse and export_dir and args.format == "extensive":
                matches[0].export_to_CG_Junctions_file(export_dir + "/" + matches[0].name + ".CG-junctions.txt")

            self.matrix_tmp[keys[2]] = matches[0]
            self.matches_total[keys[2]] = len(matches[0])

        if list_format:
            # Write those that are not marked to go to the next iteration to a file
            if len(r_0) > 2:
                # Export the entries of the previous round and release them afterwards
                for combination in self.find_combination_table_r(dataset_count, ri - 1):
                    export_key = '.'.join(str(x) for x in combination)

                    self.matrix_tmp[export_key].export_to_list(export_dir, self.dataset_names, matches_this_iteration, args)
                    del self.matrix_tmp[export_key]
            else:
                # Export the individual datasets; they stay in the matrix
                for position in range(1, dataset_count + 1):
                    self.matrix_tmp[str(position)].export_to_list(export_dir, self.dataset_names, matches_this_iteration, args)

    if list_format and export_dir is not False and r_0 is not None:
        # The entry belonging to the last round has not been written yet
        export_key = '.'.join(str(x) for x in r_0)
        self.matrix_tmp[export_key].export_to_list(export_dir, self.dataset_names, set(), args)

    return matches
def overlay_fusions(self, sparse=True, export_dir=False, args=None):
    """
    Compare the datasets combination by combination and record their overlap.

    Every dataset is first stored in `self.matrix_tmp` under its 1-based
    position ("1", "2", ...). For each group yielded by
    `self.find_combination_table()`, every combination in that group is
    translated into three keys by `self.create_keys()`: the entries stored
    under the first two keys are compared with
    `CompareFusionsBySpanningGenes(...).find_overlap()` and the first element
    of the result is stored under the third key. Its length is recorded in
    `self.matches_total`.

    The SPARSE variable should only be True if the output format is
    'summary'. NOTE(review): the memory clean-up the original note referred
    to (space complexity 0.5(n^2) => 2n) is not active in this method; here
    `sparse` only decides whether the "extensive" export is written.

    Parameters:
        sparse:     when False (and `export_dir` is set and `args.format` is
                    "extensive") every comparison result is exported with
                    `export_to_CG_Junctions_file`.
        export_dir: False to disable exporting. For the "list" format it is
                    used as a writable file-like object (`.write()`); for the
                    "extensive" format it is used as a directory path.
        args:       object providing `format` and, for the "list" format,
                    `long_gene_size`. Despite the default of None it is
                    required: `args.format` is always read.

    Returns:
        The result of the last `find_overlap()` call, or None when
        `find_combination_table()` yields no combinations.
    """
    dataset_count = len(self.datasets)

    self.logger.info("Determining the overlap of fusion genes in "+str(dataset_count)+" datasets")

    # Each dataset starts out in the working matrix under its 1-based position
    self.matrix_tmp = {}
    for position, dataset in enumerate(self.datasets, start=1):
        self.matrix_tmp[str(position)] = dataset

    list_format = (args.format == "list")

    if list_format and export_dir is not False:
        if args.long_gene_size > 0:
            large_genes = "Spans large gene (>"+str(args.long_gene_size)+"bp)"
        else:
            large_genes = "Spans large gene (feature disabled)"

        export_dir.write("Left-genes\tRight-genes\t"+large_genes+"\t"+"\t".join(self.dataset_names)+"\n")

    # Both names are only assigned inside the loop; give them a value up front so that
    # an empty combination table ends in a normal return instead of an UnboundLocalError
    matches = None
    r_0 = None

    for ri, r in enumerate(self.find_combination_table(dataset_count)):
        r_0 = self.find_combination_table_r_i(dataset_count, ri, 0)
        matches_this_iteration = set()

        # NOTE: an earlier clean-up step that dropped matrix entries not needed by this
        # round is disabled; entries are only removed by the "list" export below

        # Run the analysis
        for c in r:
            keys = self.create_keys(c)

            comparison = CompareFusionsBySpanningGenes(self.matrix_tmp[keys[0]], self.matrix_tmp[keys[1]], args)
            matches = comparison.find_overlap()
            matches_this_iteration |= matches[3]

            if not sparse and export_dir and args.format == "extensive":
                matches[0].export_to_CG_Junctions_file(export_dir+"/"+matches[0].name+".CG-junctions.txt")

            self.matrix_tmp[keys[2]] = matches[0]
            self.matches_total[keys[2]] = len(matches[0])

        if list_format:
            # Write those that are not marked to go to the next iteration to a file
            if len(r_0) > 2:
                # Export the entries of the previous round and release them afterwards
                for combination in self.find_combination_table_r(dataset_count, ri-1):
                    export_key = '.'.join(str(x) for x in combination)

                    self.matrix_tmp[export_key].export_to_list(export_dir, self.dataset_names, matches_this_iteration, args)
                    del self.matrix_tmp[export_key]
            else:
                # Export the individual datasets; they stay in the matrix
                for position in range(1, dataset_count+1):
                    self.matrix_tmp[str(position)].export_to_list(export_dir, self.dataset_names, matches_this_iteration, args)

    if list_format and export_dir is not False and r_0 is not None:
        # The entry belonging to the last round has not been written yet
        export_key = '.'.join(str(x) for x in r_0)
        self.matrix_tmp[export_key].export_to_list(export_dir, self.dataset_names, set(), args)

    return matches
def remove_duplicates(self, args):
    """
    Merge fusions of this dataset that match each other and rebuild the dataset.

    - First create a table of those that overlap
    - Then create merged entries based on the overlap matrix

    Every list stored in the two-level `self.index` is scanned pairwise with
    `CompareFusionsBySpanningGenes.match_fusions`. Fusions without annotated
    genes on the left or on the right side are dropped. When two fusions
    match, the first one is updated in place with the properties of the
    match and the second one is discarded. Afterwards the dataset is emptied
    with `self.flush()` and the surviving fusions are added again with
    `self.add_fusion()`.

    Parameters:
        args: object providing `matching_method` ("overlap", "subset" or
              "egm"); it is passed on unchanged to
              `CompareFusionsBySpanningGenes`.

    Returns:
        The number of fusions in the dataset after removal (`len(self)`).

    Raises:
        Exception: when the dataset has no gene annotations, or when
                   `args.matching_method` is not a supported method.
    """
    if not self.genes_spanning_left_junction or not self.genes_spanning_right_junction:
        raise Exception("Gene annotations on dataset '" + self.name + "' were not found")

    old_count = len(self)

    # Names containing "vs." are not reported
    # (presumably these are comparison results rather than input datasets - TODO confirm)
    if "vs." not in self.name:
        self.logger.info("Duplication removal: " + self.name + " (" + str(old_count) + " fusions)")

    if args.matching_method not in ("overlap", "subset", "egm"):
        raise Exception(
            "Unknown overlap method for removing duplicates: '" +
            args.matching_method + "' for dataset " + self.name)

    # Imported here, as in the original code, so the module is only loaded when needed
    from CompareFusionsBySpanningGenes import CompareFusionsBySpanningGenes
    overlap = CompareFusionsBySpanningGenes(False, False, args)

    stats_non_gene_spanning = 0
    fusions_to_add = []

    for _left_key, right_index in self.index.items():
        for _right_key, all_fusions in right_index.items():
            n = len(all_fusions)

            # Slots are set to False once a fusion is discarded or merged into another one
            queue = range(n)
            while queue:
                duplicates = []

                for i in queue:
                    fusion_1 = all_fusions[i]
                    if not fusion_1:
                        continue

                    if (len(fusion_1.get_annotated_genes_left(False)) == 0 or
                            len(fusion_1.get_annotated_genes_right(False)) == 0):
                        stats_non_gene_spanning += 1
                        all_fusions[i] = False
                        continue

                    is_duplicate = False
                    for j in range(i + 1, n):
                        fusion_2 = all_fusions[j]
                        if not fusion_2:
                            continue

                        match = overlap.match_fusions(fusion_1, fusion_2, False)
                        if not match:
                            continue

                        # Fold the match into fusion_1 instead of keeping the new object
                        fusion_1.matches = fusion_1.matches | fusion_2.matches
                        fusion_1.acceptor_donor_direction = match.acceptor_donor_direction
                        fusion_1.left_strand = match.left_strand
                        fusion_1.right_strand = match.right_strand
                        fusion_1.annotated_genes_left = match.annotated_genes_left
                        fusion_1.annotated_genes_right = match.annotated_genes_right

                        all_fusions[i] = fusion_1
                        all_fusions[j] = False
                        is_duplicate = True

                        # The temporary match object is no longer needed
                        # (prepare_deletion presumably releases what it holds - TODO confirm)
                        match.prepare_deletion()
                        del match

                    if is_duplicate:
                        # A changed entry is compared again in the next pass
                        duplicates.append(i)

                queue = duplicates

            fusions_to_add.extend(fusion for fusion in all_fusions if fusion)

    self.flush()
    for fusion in fusions_to_add:
        self.add_fusion(fusion)

    if "vs." not in self.name:
        self.logger.debug("* Full: " + str(old_count))
        self.logger.debug("* Gene-spanning: " + str(old_count - stats_non_gene_spanning))
        self.logger.debug("* Unique: " + str(len(self)))

    return len(self)