def cns_opp_strand(cnss, qgene, sgene):
    """Filter crossing CNSs for hits lying on the opposite strand.

    Every hit is passed through ``change_orient``, the subject gene
    coordinates are negated, and the result is filtered with
    ``remove_crossing_cnss``.  The subject coordinates (indices 2 and 3)
    of the surviving hits are negated again before they are returned.

    NOTE: ``sgene`` is negated IN PLACE, so the caller's list is modified
    when this function returns.  Pass a copy if that is not wanted.

    Returns a list of 5-tuples ``(c[0], c[1], -c[2], -c[3], c[-2])``
    built from the tuples yielded by ``remove_crossing_cnss``.
    """
    reoriented = [change_orient(cns) for cns in cnss]

    # In-place sign flip of both subject gene coordinates (see NOTE above).
    for idx in (0, 1):
        sgene[idx] *= -1

    kept = remove_crossing_cnss(reoriented, qgene, sgene)
    # Keep the four coordinates plus the second-to-last field, undoing the
    # sign flip on the subject coordinates in the same pass.
    return [(c[0], c[1], -c[2], -c[3], c[-2]) for c in kept]
def remove_crossing_hits(exon_hits, qfeat, sfeat):
    """Remove overlapping and crossing hits using the CNS crossing filter.

    exon_hits: iterable of hit sequences; indices 0-1 are treated as the
               query coordinates and 2-3 as the subject coordinates.
    qfeat, sfeat: mappings providing 'start', 'end' and 'strand'.

    When the two features are on different strands the subject coordinates
    of every hit (and of the subject gene) are negated before filtering
    with ``remove_crossing_cnss`` and negated back afterwards, so the
    returned coordinates always have their original sign.

    Returns a list of 5-tuples ``(c[0], c[1], c[2], c[3], c[-2])`` taken
    from the tuples produced by ``remove_crossing_cnss``.
    """
    qgene = [qfeat['start'], qfeat['end']]
    sgene = [sfeat['start'], sfeat['end']]
    opposite_strand = qfeat['strand'] != sfeat['strand']

    exon_hits = list(exon_hits)
    if opposite_strand:
        flipped_hits = []
        for hit in exon_hits:
            hit = list(hit)
            hit[2] = -hit[2]
            hit[3] = -hit[3]
            flipped_hits.append(tuple(hit))
        exon_hits = flipped_hits
        sgene = [-sgene[0], -sgene[1]]

    non_crossing_hits = [
        (c[0], c[1], c[2], c[3], c[-2])
        for c in remove_crossing_cnss(exon_hits, qgene, sgene)
    ]

    if opposite_strand:
        # The original used `==` here, which compared and discarded the
        # flipped list, so negative subject coordinates leaked out to the
        # caller.  Assign the flipped-back hits instead; c[-1] is the field
        # that was kept from c[-2] above.
        non_crossing_hits = [
            (c[0], c[1], -c[2], -c[3], c[-1]) for c in non_crossing_hits
        ]
    return non_crossing_hits
def protein_parse(hit, gene, gene_bed, hit_bed):
    """Write FASTA files for a hit and a gene, blastx them and parse the result.

    hit, gene: feature mappings providing 'accn', 'start' and 'end'.
    gene_bed, hit_bed: bed-like objects exposing ``row_cds_sequence(accn)``.

    Returns ``("masked", "masked", "masked")`` when the gene CDS sequence
    contains an 'X'.  Otherwise returns ``(hit_len, gene_len, frame_shift)``
    where ``hit_len`` and ``gene_len`` are the summed lengths of the
    non-crossing alignments divided by the reference CDS length (the gene
    length is divided by 3 first -- presumably because blastx subject
    coordinates are in amino acids; confirm).  ``frame_shift`` is currently
    always False.

    Diagnostic output is printed to stderr.  FASTA files are written to a
    hard-coded data directory and blastx is run from a hard-coded path.
    """
    data_dir = '/Users/gturco/code/freeling_lab/pseudo/data/rice_v6_setaria64/'
    hit_fasta = "{0}q.fasta".format(data_dir)
    gene_fasta = "{0}s.fasta".format(data_dir)

    if re.search('X', gene_bed.row_cds_sequence(gene['accn'])) is not None:
        return "masked", "masked", "masked"

    protein_fasta(hit_bed, hit, False, hit_fasta)
    protein_fasta(gene_bed, gene, True, gene_fasta)

    # Output columns requested via -outfmt: gaps, qframe, then the 12
    # standard tabular columns, hence the index offsets used below.
    cmd = ("/Users/gturco/ncbi-blast-2.2.25+/bin/blastx -gapopen 11 "
           "-gapextend 1 -word_size 3 -evalue 0.001 "
           "-outfmt '7 gaps qframe std' -query {0} -subject {1} "
           "| grep -v '#' | grep -v 'WARNING' | grep -v 'ERROR'"
           ).format(hit_fasta, gene_fasta)
    res = commands.getoutput(cmd)
    print >>sys.stderr, res

    # Per-reading-frame bookkeeping.  It is collected but not yet used for
    # the return value (frame-shift detection is not implemented).
    frame_dict = dict(
        (frame, {"alignment": [], "qstart": [], "gaps": []})
        for frame in ('1', '2', '3', '-1', '-2', '-3')
    )

    qhit = [hit['start'], hit['end']]
    sgene = [gene['start'], gene['end']]

    # A set, so duplicate alignments collapse.  The original created a list
    # and then called .update() on it, which raises AttributeError on the
    # first hit.
    locs_list = set()
    for line in res.split("\n"):
        if not line:
            continue
        if "WARNING:" in line:
            continue
        if "ERROR" in line:
            continue
        line = line.split("\t")
        locs = [int(v) for v in line[8:12]]
        locs.extend(float(v) for v in line[12:])

        frame = line[1]
        gaps = line[0]
        qstart = min(locs[0], locs[1])
        length = line[5]
        frame_dict[frame]["alignment"].append(length)
        frame_dict[frame]["qstart"].append(qstart)
        frame_dict[frame]["gaps"].append(gaps)

        locs_list.add(tuple(locs))

    non_crossing = [
        (c[0], c[1], c[2], c[3], c[4])
        for c in remove_crossing_cnss(list(locs_list), qhit, sgene)
    ]

    # NOTE(review): the original set this to False both before and inside
    # an `if len(non_crossing) > 1` branch, so it was always False.  Kept
    # as-is until real frame-shift detection is written.
    frame_shift = False

    total_hit_len = sum(abs(q_start - q_end)
                        for q_start, q_end, s_start, s_end, evalu in non_crossing)
    total_gene_len = sum(abs(s_start - s_end)
                         for q_start, q_end, s_start, s_end, evalu in non_crossing)
    print >>sys.stderr, non_crossing

    ref_hit_len = len(hit_bed.row_cds_sequence(hit['accn']))
    ref_gene_len = len(gene_bed.row_cds_sequence(gene['accn']))
    print >>sys.stderr, "hit_total {0} \n gene_len {1}".format(total_hit_len, ref_hit_len)

    hit_len = total_hit_len / float(ref_hit_len)
    gene_len = total_gene_len / float(ref_gene_len / 3)
    return hit_len, gene_len, frame_shift
def parse_blast(blast_str, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta):
    """Parse tabular BLAST output into a list of non-crossing CNSs.

    blast_str: tab-delimited BLAST output, one hit per line.  Columns 6-9
               are read as integer coordinates (query pair, subject pair)
               and the remaining columns as floats; the last column is
               thresholded at 29.5 (presumably the bitscore -- confirm).
    orient:    1 when the query and subject genes have the same
               orientation, -1 otherwise.
    qfeat, sfeat: feature mappings providing 'start' and 'end'.
    qbed, sbed, qpad, spad, unmasked_fasta: passed through to
               ``get_feats_nearby`` / ``find_inversions``.

    Returns a list of 5-tuples ``(qstart, qend, sstart, send, last_field)``.

    Steps:
      1. Drop hits below the score threshold and hits whose centre lies on
         opposite sides of the gene centres in query and subject.
      2. Keep hits that fall in both genes, drop hits that touch the gene
         space of only one, and drop hits overlapping a neighbouring
         feature.
      3. With two or more hits left, bucket them by the inversion groups
         from ``find_inversions`` (hits whose subject start falls in no
         group are dropped).  Per group, split by strand and remove
         crossing hits, keeping whichever strand set retains more hits
         unless the subject gene itself lies in that group.
    """
    slope = orient
    qgene = [qfeat['start'], qfeat['end']]
    sgene = [sfeat['start'], sfeat['end']]
    sgene = sgene[::slope]

    center = sum(qgene)/2., sum(sgene)/2.
    intercept = center[1] - slope * center[0]
    x = np.linspace(qgene[0] - qpad, qgene[1] + qpad, 50)
    y = slope * x + intercept

    feats_nearby = get_feats_nearby(qgene, sgene, qfeat, sfeat, x, y, qbed, sbed)
    qgene_space_poly, qgene_poly, sgene_space_poly, sgene_poly = get_genespace(qfeat, sfeat, qgene, sgene)

    # Gene centres do not change while reading hits, so compute them once.
    qcenter = sum(qgene)/2
    scenter = sum(sgene)/2 * orient

    cnss = set()
    for line in blast_str.split("\n"):
        if "WARNING:" in line:
            continue
        if "ERROR" in line:
            continue
        if line == '':
            continue
        line = line.split("\t")
        if float(line[-1]) < 29.5:
            continue  # finds 15/15 match
        locs = [int(v) for v in line[6:10]]
        locs.extend(float(v) for v in line[10:])

        xx = locs[:2]
        yy = locs[2:4]

        # MAIZE BOWTIE: keep only hits on the same side (5' or 3') of the
        # gene centre in both query and subject.
        qcns_center = sum(xx)/2
        scns_center = sum(yy)/2 * orient
        if scns_center > scenter and qcns_center < qcenter:
            continue
        # The original compared qcns_center with itself, so this mirror
        # case was never filtered.
        if qcns_center > qcenter and scns_center < scenter:
            continue

        # To be saved, a hit must either be in an intron in both genes,
        # or in neither.
        xls = LineString([(0, locs[0]), (0, locs[1])])
        yls = LineString([(0, locs[2]), (0, locs[3])])

        locs = tuple(locs)  # make it hashable.
        if qgene_poly.intersects(xls) and sgene_poly.intersects(yls):
            cnss.add(locs)
            continue

        # has to be both or neither.
        if qgene_space_poly.intersects(xls) or sgene_space_poly.intersects(yls):
            continue

        # For all other genes: if the hit overlaps one, we don't keep it.
        overlaps_other_gene = False
        for sub, (start, stop) in (('q', locs[:2]), ('s', locs[2:4])):
            feats = feats_nearby[sub]
            if feats is None:
                continue
            if feats.contains(Point(0, start)) or feats.contains(Point(0, stop)):
                overlaps_other_gene = True
                break
        if overlaps_other_gene:
            continue

        cnss.add(locs)

    # Can't cross with < 2 cnss; just strip the intermediate score columns.
    if len(cnss) < 2:
        return [(c[0], c[1], c[2], c[3], c[-1]) for c in cnss]

    cnss = list(cnss)

    # Split the CNSs into groups keyed by inversion range.  A CNS belongs
    # to a group when its subject start lies in [group[0], group[1]).
    inversion_groups = find_inversions(unmasked_fasta, sfeat, spad)
    cns_groups = {}
    for group in inversion_groups:
        for cns in cnss:
            if group[0] <= cns[2] < group[1]:
                cns_groups.setdefault(group, []).append(cns)

    cns_by_group = []
    for key, values in cns_groups.items():
        # Within a group, split by strand relative to the gene orientation.
        opp_strand = []
        same_strand = []
        for cns in values:
            if slope == 1 and cns[2] > cns[3]:
                opp_strand.append(cns)
            elif slope == -1 and cns[2] < cns[3]:
                opp_strand.append(cns)
            else:
                same_strand.append(cns)

        # Need to flip to negative so the overlapping stuff still works.
        # Use a fresh per-group copy of the subject gene: the original
        # negated the shared list on every iteration (and again inside
        # cns_opp_strand), so its sign depended on how many groups had
        # already been processed.
        if orient == -1:
            same_strand = [change_orient(c) for c in same_strand]
            opp_strand = [change_orient(c) for c in opp_strand]
            group_sgene = [-sgene[0], -sgene[1]]
        else:
            group_sgene = list(sgene)

        cnss_same_strand = [
            (c[0], c[1], c[2], c[3], c[-1])
            for c in remove_crossing_cnss(same_strand, qgene, group_sgene)
        ]

        if key[0] <= abs(group_sgene[1]) < key[1]:
            # The gene lies in this group, so the group is known to be on
            # the gene's strand; no need to try the opposite strand.
            cns_by_group.extend(cnss_same_strand)
            continue

        # Alternative for CNSs on the opposite strand.  cns_opp_strand
        # negates the gene list it is given, so hand it a copy.
        # NOTE(review): it keeps c[-2] while the same-strand path keeps
        # c[-1]; confirm which score column is intended.
        cnss_opp_strand = cns_opp_strand(opp_strand, qgene, list(group_sgene))
        if len(cnss_same_strand) < len(cnss_opp_strand):
            cns_by_group.extend(cnss_opp_strand)
        else:
            # On a tie, prefer the non reverse-complemented set.
            cns_by_group.extend(cnss_same_strand)

    if orient == -1:
        cns_by_group = [(c[0], c[1], -c[2], -c[3], c[-1]) for c in cns_by_group]
    return cns_by_group