def filter_gff(hits):
    """Drop the lower-scoring hit from each pair of adjacent overlapping genes.

    Each hit is a sequence of GFF-style rows indexed by column number. This
    function reads column 2 as the feature type, columns 3 and 4 as integer
    start/end, and column 5 as an integer score.

    Hits are first sorted by the start (column 3) of their first row. The
    gene rows are then reduced with ``range_connectivity`` and every pair of
    neighbouring kept genes is tested with ``tuple_overlap``.

    Args:
        hits: iterable of hits, each a sequence of rows as described above.
            Assumes every hit contributes exactly one ``'gene'`` row, so that
            a gene's position also identifies its hit -- TODO confirm
            against the callers.

    Returns:
        A list of the retained hits. Genes that do not overlap their
        successor come first, then the winners of overlapping pairs, then the
        last gene if it does not overlap its predecessor.
    """
    hits = sorted(hits, key=lambda hit: int(hit[0][3]))
    # Get only the features annotated as genes
    gene_annotations = [row for hit in hits for row in hit if row[2] == 'gene']
    # Get the start and end for each gene annotation
    range_list = [(int(row[3]), int(row[4])) for row in gene_annotations]
    # presumably the indices into range_list worth keeping -- verify against
    # range_connectivity
    kept_indicies = range_connectivity(range_list)
    if len(kept_indicies) <= 1:
        return [hits[x] for x in kept_indicies]

    # Compare the ranges that were actually kept. Indexing range_list by loop
    # position would test the wrong genes as soon as an entry is dropped.
    kept_range_list = [range_list[x] for x in kept_indicies]

    overlapping_indicies = []
    non_overlapping_indicies = []
    for ix in range(len(kept_indicies) - 1):
        if tuple_overlap(kept_range_list[ix], kept_range_list[ix + 1]):
            overlapping_indicies.append(
                (kept_indicies[ix], kept_indicies[ix + 1]))
        else:
            # NOTE(review): this gene may still have overlapped its
            # predecessor; it is kept regardless, as in the original logic.
            non_overlapping_indicies.append(kept_indicies[ix])

    for first, second in overlapping_indicies:
        if int(gene_annotations[first][5]) > int(gene_annotations[second][5]):
            winner = first
        else:
            winner = second
        # The same gene can win two consecutive pairs; return it only once.
        if winner not in non_overlapping_indicies:
            non_overlapping_indicies.append(winner)

    # The loop above never records the final gene on its own.
    if not tuple_overlap(kept_range_list[-2], kept_range_list[-1]):
        non_overlapping_indicies.append(kept_indicies[-1])
    return [hits[x] for x in non_overlapping_indicies]
def filter_gff(hits, merge=True):
    """Filter hits whose gene annotations overlap, optionally merging them.

    Hits are sorted by the start (column 3) of their first row. Gene rows
    (column 2 == 'gene') supply (start, end) ranges from columns 3 and 4,
    which are reduced by ``range_connectivity``. Neighbouring kept ranges are
    then tested with ``tuple_overlap``.

    When overlapping genes exist, ``score_filter`` is asked for a winner. If
    its result is falsy, the rows of *all* hits are either merged per feature
    type with ``join_zones`` (``merge=True``) or ``longest_hit`` is consulted
    (``merge=False``).

    Args:
        hits: iterable of hits, each a sequence of rows indexable by column.
        merge: if true, fall back to merging annotations when no best score
            is found; otherwise fall back to the longest hit.

    Returns:
        A list of hits. With at most one kept index, the hits at those
        indices; otherwise the hits selected through the collected indices.
    """
    hits = sorted(hits, key=lambda hit: int(hit[0][3]))
    # Gene rows of every hit, in sorted hit order
    gene_annotations = [row for hit in hits for row in hit if row[2] == 'gene']
    # (start, end) of each gene row
    range_list = [(int(row[3]), int(row[4])) for row in gene_annotations]
    kept_indicies = range_connectivity(range_list)
    kept_range_list = [range_list[i] for i in kept_indicies]

    if len(kept_indicies) <= 1:
        return [hits[i] for i in kept_indicies]

    overlapping_indicies = []
    non_overlapping_indicies = []
    for pos in range(len(kept_indicies) - 1):
        current = kept_indicies[pos]
        following = kept_indicies[pos + 1]
        if not tuple_overlap(kept_range_list[pos], kept_range_list[pos + 1]):
            non_overlapping_indicies.append(current)
            continue
        # Record both members of the overlapping pair, each only once
        for candidate in (current, following):
            if candidate not in overlapping_indicies:
                overlapping_indicies.append(candidate)

    if overlapping_indicies:
        best_score = score_filter([hits[i] for i in overlapping_indicies])
        if best_score:
            non_overlapping_indicies.append(best_score)
        elif merge:
            # Bucket every row of every hit by its feature type
            rows_by_type = {
                'gene': [],
                'cds': [],
                'exon': [],
                'intron': [],
                'similarity': [],
            }
            misc_anno = []
            for hit in hits:
                for row in hit:
                    rows_by_type.get(row[2], misc_anno).append(row)

            print("Merging {} annotations".format(len(rows_by_type['gene'])))
            joined_gene = join_zones(rows_by_type['gene'])
            joined_cds = join_zones(rows_by_type['cds'])
            joined_exon = join_zones(rows_by_type['exon'])
            joined_similarity = join_zones(rows_by_type['similarity'])

            joined_hit = [joined_gene, joined_cds, joined_exon]
            if rows_by_type['intron']:
                joined_hit.append(join_zones(rows_by_type['intron']))
            joined_hit.append(joined_similarity)
            joined_hit.extend(misc_anno)

            # The merged hit goes at the end of hits; its position in
            # kept_indicies is what gets recorded for the final lookup.
            hits.append(joined_hit)
            kept_indicies.append(len(hits) - 1)
            non_overlapping_indicies.append(len(kept_indicies) - 1)
        else:
            longest = longest_hit([hits[i] for i in overlapping_indicies])
            if longest:
                non_overlapping_indicies.append(longest)

    # NOTE(review): entries are looked up through kept_indicies here, although
    # the loop above stores kept_indicies values -- confirm this is intended.
    return [hits[kept_indicies[i]] for i in sorted(non_overlapping_indicies)]
def _merge_hit_annotations(hits):
    """Join the rows of all hits, grouped by feature type, into one hit."""
    gene_anno = []
    cds_anno = []
    exon_anno = []
    intron_anno = []
    similarity_anno = []
    misc_anno = []
    for hit in hits:
        for row in hit:
            if row[2] == 'gene':
                gene_anno.append(row)
            elif row[2] == "cds":
                cds_anno.append(row)
            elif row[2] == "exon":
                exon_anno.append(row)
            elif row[2] == "intron":
                intron_anno.append(row)
            elif row[2] == "similarity":
                similarity_anno.append(row)
            else:
                misc_anno.append(row)

    print("Merging {} annotations".format(len(gene_anno)))
    joined_gene = join_zones(gene_anno)
    joined_cds = join_zones(cds_anno)
    joined_exon = join_zones(exon_anno)
    joined_similarity = join_zones(similarity_anno)

    joined_hit = [joined_gene, joined_cds, joined_exon]
    # Introns are optional; only join them when at least one row exists
    if intron_anno:
        joined_hit.append(join_zones(intron_anno))
    joined_hit.append(joined_similarity)
    # Rows of any other feature type are carried over unjoined
    joined_hit += misc_anno
    return joined_hit


def filter_gff(hits, merge=True):
    """Filter hits whose gene annotations overlap, optionally merging them.

    Hits are sorted by the start (column 3) of their first row. Gene rows
    (column 2 == 'gene') supply (start, end) ranges from columns 3 and 4,
    which are reduced by ``range_connectivity``. Neighbouring kept ranges are
    then tested with ``tuple_overlap``.

    When overlapping genes exist, ``score_filter`` is asked for a winner. If
    its result is falsy, the rows of *all* hits are either merged per feature
    type (``merge=True``) or ``longest_hit`` is consulted (``merge=False``).

    Args:
        hits: iterable of hits, each a sequence of rows indexable by column.
        merge: if true, fall back to merging annotations when no best score
            is found; otherwise fall back to the longest hit.

    Returns:
        A list of the selected hits. Entries whose lookup fails are reported
        on stdout and skipped. Returns ``None`` if the selection itself fails
        (the error is printed).
    """
    hits = sorted(hits, key=lambda hit: int(hit[0][3]))
    # Get only the features annotated as genes
    gene_annotations = [row for hit in hits for row in hit if row[2] == 'gene']
    # Get the start and end for each gene annotation
    range_list = [(int(row[3]), int(row[4])) for row in gene_annotations]
    kept_indicies = range_connectivity(range_list)
    kept_range_list = [range_list[x] for x in kept_indicies]

    # With zero or one kept gene there is nothing to compare; return the kept
    # hits directly, as the other revisions of this function do.
    if len(kept_indicies) <= 1:
        return [hits[x] for x in kept_indicies]

    overlapping_indicies = []
    non_overlapping_indicies = []
    for ix in range(len(kept_indicies) - 1):
        if tuple_overlap(kept_range_list[ix], kept_range_list[ix + 1]):
            # Record both members of the overlapping pair, each only once
            if kept_indicies[ix] not in overlapping_indicies:
                overlapping_indicies.append(kept_indicies[ix])
            if kept_indicies[ix + 1] not in overlapping_indicies:
                overlapping_indicies.append(kept_indicies[ix + 1])
        else:
            non_overlapping_indicies.append(kept_indicies[ix])

    if overlapping_indicies:
        best_score = score_filter([hits[x] for x in overlapping_indicies])
        if best_score:
            non_overlapping_indicies.append(best_score)
        elif merge:
            # The merged hit goes at the end of hits; its position in
            # kept_indicies is what gets recorded for the final lookup.
            hits.append(_merge_hit_annotations(hits))
            kept_indicies.append(len(hits) - 1)
            non_overlapping_indicies.append(len(kept_indicies) - 1)
        else:
            longest = longest_hit([hits[x] for x in overlapping_indicies])
            if longest:
                non_overlapping_indicies.append(longest)

    # NOTE(review): entries are looked up through kept_indicies here, although
    # the loop above stores kept_indicies values -- confirm this is intended.
    try:
        return_value = []
        for x in sorted(non_overlapping_indicies):
            try:
                return_value.append(hits[kept_indicies[x]])
            except Exception as e:
                # Best-effort: report the failing lookup and keep going
                print(e)
                print("x is: {}".format(x))
                print("kept_indicies is: {}".format(kept_indicies))
                print("hits is: {}".format(hits))
        return return_value
    except Exception as e:
        print(e)
        return None