예제 #1
0
def filter_gff(hits):
    """Resolve overlapping gene annotations, keeping the better-scoring hit.

    Each hit is a sequence of feature rows. Rows whose ``row[2]`` equals
    ``'gene'`` supply the start (``row[3]``), end (``row[4]``) and score
    (``row[5]``) used for filtering.

    Args:
        hits: Sequence of hits. ``hit[0][3]`` must be convertible to ``int``;
            it is the key the hits are ordered by before filtering.

    Returns:
        A list of the retained hits in ascending index order of the sorted
        hits (i.e. by start coordinate). Hits that overlap no neighbouring
        kept hit are always retained; for every overlapping adjacent pair
        the hit with the strictly higher score is retained, the second of
        the pair winning ties.

    Raises:
        ValueError: If a start/end is not integer-like or a score is not
            numeric.

    NOTE(review): indexing ``hits`` and ``gene_annotations`` with the same
    index assumes exactly one ``'gene'`` row per hit -- confirm with the
    caller.
    """
    hits = sorted(hits, key=lambda x: int(x[0][3]))
    # Get only the features annotated as genes
    gene_annotations = [x for y in hits for x in y if x[2] == 'gene']
    # Get the start and end for each gene annotation
    range_list = [(int(x[3]), int(x[4])) for x in gene_annotations]
    # Indices into range_list of the ranges worth keeping
    # (presumably ranges nested inside another are dropped -- TODO confirm
    # against range_connectivity).
    kept_indicies = range_connectivity(range_list)
    if len(kept_indicies) <= 1:
        return [hits[x] for x in kept_indicies]

    # Overlap must be tested on the ranges that were actually kept; indexing
    # range_list by loop position compares the wrong ranges as soon as
    # range_connectivity drops anything.
    kept_range_list = [range_list[x] for x in kept_indicies]
    overlaps_next = [
        tuple_overlap(kept_range_list[ix], kept_range_list[ix + 1])
        for ix in range(len(kept_indicies) - 1)
    ]

    # A set avoids returning the same hit twice when it wins two pairs.
    selected = set()
    last = len(kept_indicies) - 1
    for pos, hit_ix in enumerate(kept_indicies):
        overlaps_previous = pos > 0 and overlaps_next[pos - 1]
        overlaps_following = pos < last and overlaps_next[pos]
        # Only hits free of overlap on both sides are kept unconditionally.
        if not (overlaps_previous or overlaps_following):
            selected.add(hit_ix)

    for pos, overlapping in enumerate(overlaps_next):
        if not overlapping:
            continue
        first, second = kept_indicies[pos], kept_indicies[pos + 1]
        # float() accepts everything int() did plus decimal scores.
        if float(gene_annotations[first][5]) > float(gene_annotations[second][5]):
            selected.add(first)
        else:
            selected.add(second)

    return [hits[x] for x in sorted(selected)]
예제 #2
0
def _merge_hit_features(hits):
    """Join the features of every hit in *hits* into one merged hit.

    Rows are grouped by their type column (``row[2]``). The gene, cds, exon,
    similarity and (when present) intron groups are each collapsed with
    ``join_zones``; rows of any other type are carried over unchanged.

    Returns:
        ``[gene, cds, exon, (intron,) similarity] + other rows``.
    """
    by_type = {"gene": [], "cds": [], "exon": [], "intron": [], "similarity": []}
    misc_anno = []
    for hit in hits:
        for row in hit:
            by_type.get(row[2], misc_anno).append(row)

    print("Merging {} annotations".format(len(by_type["gene"])))
    joined_hit = [
        join_zones(by_type["gene"]),
        join_zones(by_type["cds"]),
        join_zones(by_type["exon"]),
    ]
    joined_similarity = join_zones(by_type["similarity"])
    # Introns are optional; the other groups are joined unconditionally.
    if by_type["intron"]:
        joined_hit.append(join_zones(by_type["intron"]))
    joined_hit.append(joined_similarity)
    joined_hit += misc_anno
    return joined_hit


def filter_gff(hits, merge=True):
    """Filter hits so that overlapping gene annotations are resolved.

    Hits are ordered by ``int(hit[0][3])``. The ``'gene'`` rows
    (``row[2] == 'gene'``) provide ``(start, end)`` ranges from ``row[3]``
    and ``row[4]``. ``range_connectivity`` selects which ranges to consider,
    and adjacent kept ranges are tested with ``tuple_overlap``.

    Hits that overlap neither neighbour are always returned. If any hits
    overlap, ``score_filter`` is asked for a winner among them. When it
    returns a falsy value, the hits are either merged into one new hit
    (``merge=True``) or ``longest_hit`` is consulted (``merge=False``).

    Args:
        hits: Sequence of hits, each a sequence of feature rows.
        merge: When true, fall back to merging if ``score_filter`` yields no
            winner; otherwise fall back to ``longest_hit``.

    Returns:
        List of retained hits, ordered by their position among the kept
        hits. A merged hit, if created, comes last.

    NOTE(review): the values returned by ``score_filter`` and
    ``longest_hit`` are used directly as positions into ``kept_indicies``
    and are skipped when falsy, so a result of ``0`` is treated as "no
    result". Their definitions are not visible here -- confirm both
    conventions.
    NOTE(review): the merge folds in the features of *every* hit, including
    hits that overlap nothing and are also returned individually -- confirm
    this is intended.
    """
    hits = sorted(hits, key=lambda x: int(x[0][3]))
    # Get only the features annotated as genes
    gene_annotations = [x for y in hits for x in y if x[2] == 'gene']
    # Get the start and end for each gene annotation
    range_list = [(int(x[3]), int(x[4])) for x in gene_annotations]
    # Copy so that appending the merged hit below never mutates the helper's
    # own return value.
    kept_indicies = list(range_connectivity(range_list))
    kept_range_list = [range_list[x] for x in kept_indicies]
    if len(kept_indicies) <= 1:
        return [hits[x] for x in kept_indicies]

    # Positions (into kept_indicies) of hits overlapping an adjacent kept
    # hit, in first-seen order.
    overlapping_positions = []
    seen = set()
    for ix in range(len(kept_indicies) - 1):
        if tuple_overlap(kept_range_list[ix], kept_range_list[ix + 1]):
            for pos in (ix, ix + 1):
                if pos not in seen:
                    seen.add(pos)
                    overlapping_positions.append(pos)

    # Everything below stores *positions into kept_indicies*, which is what
    # the final lookup expects. Classifying every position (including the
    # last one) keeps a trailing stand-alone hit and stops a hit that
    # overlaps only its predecessor from being kept unconditionally.
    non_overlapping_indicies = [
        pos for pos in range(len(kept_indicies)) if pos not in seen
    ]
    overlapping_indicies = [kept_indicies[pos] for pos in overlapping_positions]

    if overlapping_indicies:
        best_score = score_filter([hits[x] for x in overlapping_indicies])
        if best_score:
            non_overlapping_indicies.append(best_score)
        elif merge:
            # Register the merged hit as a new kept hit and select it.
            hits.append(_merge_hit_features(hits))
            kept_indicies.append(len(hits) - 1)
            non_overlapping_indicies.append(len(kept_indicies) - 1)
        else:
            longest = longest_hit([hits[x] for x in overlapping_indicies])
            if longest:
                non_overlapping_indicies.append(longest)

    return [hits[kept_indicies[x]] for x in sorted(non_overlapping_indicies)]
예제 #3
0
def filter_gff(hits, merge=True):
    """Select the hits to keep when gene annotations overlap.

    The hits are sorted by ``int(hit[0][3])``. The rows with
    ``row[2] == 'gene'`` give ``(int(row[3]), int(row[4]))`` ranges.
    ``range_connectivity`` picks the ranges to consider, and neighbouring
    kept ranges are compared with ``tuple_overlap``.

    With more than one kept hit:
      * hits overlapping neither neighbour are always selected;
      * for the overlapping hits ``score_filter`` is consulted first; if its
        result is falsy, the hits are merged into one new hit
        (``merge=True``) or ``longest_hit`` is consulted (``merge=False``).

    Args:
        hits: Sequence of hits, each a sequence of feature rows.
        merge: Choose merging (true) or ``longest_hit`` (false) as the
            fallback when ``score_filter`` yields nothing.

    Returns:
        List of selected hits, ordered by position among the kept hits. A
        selection that cannot be looked up is reported with ``print`` and
        skipped rather than raised.

    NOTE(review): no branch in the code shown here handles
    ``len(kept_indicies) <= 1``.
    NOTE(review): the results of ``score_filter`` / ``longest_hit`` are used
    as positions into ``kept_indicies`` and ignored when falsy (so ``0`` is
    ignored). Their definitions are not visible here -- confirm.
    NOTE(review): merging combines the features of every hit, not only the
    overlapping ones -- confirm this is intended.
    """
    hits = sorted(hits, key=lambda x: int(x[0][3]))
    # Get only the features annotated as genes
    gene_annotations = [x for y in hits for x in y if x[2] == 'gene']
    # Get the start and end for each gene annotation
    range_list = [(int(x[3]), int(x[4])) for x in gene_annotations]
    # Copy: the merged hit is appended below and must not alter the list
    # owned by range_connectivity.
    kept_indicies = list(range_connectivity(range_list))
    kept_range_list = [range_list[x] for x in kept_indicies]
    if len(kept_indicies) > 1:
        # Positions (into kept_indicies) of hits that overlap a neighbour.
        overlapping_positions = []
        seen = set()
        for ix in range(len(kept_indicies) - 1):
            if tuple_overlap(kept_range_list[ix], kept_range_list[ix + 1]):
                for pos in (ix, ix + 1):
                    if pos not in seen:
                        seen.add(pos)
                        overlapping_positions.append(pos)

        # The final lookup is hits[kept_indicies[x]], so selections are kept
        # as positions into kept_indicies throughout. Every position,
        # including the last, is classified here.
        non_overlapping_indicies = [
            pos for pos in range(len(kept_indicies)) if pos not in seen
        ]
        overlapping_indicies = [
            kept_indicies[pos] for pos in overlapping_positions
        ]

        if overlapping_indicies:
            best_score = score_filter([hits[x] for x in overlapping_indicies])
            if best_score:
                non_overlapping_indicies.append(best_score)
            elif merge:
                # Group every feature row by its type column.
                by_type = {
                    "gene": [],
                    "cds": [],
                    "exon": [],
                    "intron": [],
                    "similarity": [],
                }
                misc_anno = []
                for hit in hits:
                    for row in hit:
                        by_type.get(row[2], misc_anno).append(row)
                print("Merging {} annotations".format(len(by_type["gene"])))
                joined_hit = [
                    join_zones(by_type["gene"]),
                    join_zones(by_type["cds"]),
                    join_zones(by_type["exon"]),
                ]
                joined_similarity = join_zones(by_type["similarity"])
                # Introns are optional; other groups are always joined.
                if by_type["intron"]:
                    joined_hit.append(join_zones(by_type["intron"]))
                joined_hit.append(joined_similarity)
                joined_hit += misc_anno
                # Register the merged hit as a new kept hit and select it.
                hits.append(joined_hit)
                kept_indicies.append(len(hits) - 1)
                non_overlapping_indicies.append(len(kept_indicies) - 1)
            else:
                longest = longest_hit(
                    [hits[x] for x in overlapping_indicies])
                if longest:
                    non_overlapping_indicies.append(longest)

        # "except ... as e" is valid on Python 2.6+ and Python 3; the
        # comma form is a SyntaxError on Python 3.
        try:
            return_value = []
            for x in sorted(non_overlapping_indicies):
                try:
                    return_value.append(hits[kept_indicies[x]])
                except Exception as e:
                    # Best effort: report the bad selection and carry on.
                    print(e)
                    print("x is: {}".format(x))
                    print("kept_indicies is: {}".format(kept_indicies))
                    print("hits is: {}".format(hits))
            return return_value
        except Exception as e:
            print(e)