def SNP_MIP_Gap(chrome, start, end, min_hom_length, max_hom_length, tm_min, tm_max, gc_threshold_min, gc_threshold_max, ref, alt):
    """Design gap-fill MIPs with the SNP at every position of a 1..gap_num base gap.

    New code on 1/21/2015.  SNP on the gap fill (use + for the SNP position):
    if the gap fill is 2 bases it will do -+, +-; if it is 3 bases it will do
    --+, -+-, +--.

    For each gap size ``n`` in ``1..gap_num`` (``gap_num`` is read from module
    scope) and each offset ``j`` in ``0..n-1``, the gap starts at
    ``start - (n - j - 1)`` and the upstream/downstream arms are fetched on
    either side of it.  Both strands are emitted for every configuration.

    Parameters
    ----------
    chrome : chromosome name; any "chr" substring is removed before calling
        ``nibFragger``, the original name is used in the position columns.
    start, end : integer coordinates of the variant.
    min_hom_length, max_hom_length, tm_min, tm_max, gc_threshold_min,
    gc_threshold_max : forwarded to ``get_seq`` as arm filters;
        ``max_hom_length`` is also the length of sequence fetched per arm.
    ref : unused here; kept for signature parity with the sibling designers.
    alt : replacement base written into the gap-fill sequence.

    Returns
    -------
    str
        Concatenated ``make_Hom_pairs`` output; each call's output has its
        trailing newline(s) replaced by three extra tab-separated columns
        (upstream position, downstream position, ``getName(n, j)``) and "\n".

    Notes
    -----
    * Nesting was reconstructed from a whitespace-mangled source; both strand
      emissions are assumed to sit inside the inner loop -- TODO confirm.
    * Element ``[0][3]`` of each ``get_seq`` result is used as an arm length
      when building the position columns -- presumably the length of the first
      candidate; verify against ``get_seq``.
    """
    chrom_id = chrome.replace("chr", "")
    arm_filters = dict(MIN_LENGTH=min_hom_length, MIN_TM=tm_min, MAX_TM=tm_max,
                       GC_MIN=gc_threshold_min, GC_MAX=gc_threshold_max)
    records = []

    for n in range(1, gap_num + 1):
        for j in range(0, n):
            snp_index = n - j - 1            # index of the SNP inside the gap
            gap_start = start - snp_index    # first base of the gap
            down_start = end + 1 + j         # first base of the downstream arm

            ## Fetch sequences
            upstream_seq = nibFragger(chrom_id, gap_start - max_hom_length, max_hom_length).lower()
            downstream_seq = nibFragger(chrom_id, down_start, max_hom_length).lower()
            upstream_list = get_seq(upstream_seq, right2left=False, **arm_filters)
            downstream_list = get_seq(downstream_seq, right2left=True, **arm_filters)

            # Without a candidate on both sides there is nothing to pair, and
            # the position columns below would raise IndexError on [0][3].
            if len(upstream_list) == 0 or len(downstream_list) == 0:
                continue

            gapfill_W = nibFragger(chrom_id, gap_start, n)
            gap_bases = list(gapfill_W)
            gap_bases[snp_index] = alt
            gapfill_M = "".join(gap_bases)

            upstream_pos = chrome + ":" + str(gap_start - upstream_list[0][3]) + "-" + str(gap_start - 1)
            downstream_pos = chrome + ":" + str(down_start) + "-" + str(down_start + downstream_list[0][3] - 1)

            strand_fills = (
                ("+", gapfill_M, gapfill_W),
                ("-", revcomp(gapfill_M), revcomp(gapfill_W)),
            )
            for strand, fill_M, fill_W in strand_fills:
                pair_text = make_Hom_pairs(upstream_list, downstream_list, hom_strand=strand,
                                           MIP_Mut_Alignment="SNP_on_GF",
                                           GapFillBase_M=fill_M, GapFillBase_W=fill_W)
                # Strip only this call's own trailing newline(s).  Stripping the
                # whole accumulator would glue the annotation onto the previous
                # record whenever this call produced no text.
                pair_text = pair_text.rstrip("\n")
                if not pair_text:
                    continue
                records.append(pair_text + "\t" + upstream_pos + "\t" + downstream_pos
                               + "\t" + getName(n, j) + "\n")

    return "".join(records)
continue else: #tmp = line.rstrip().split("\t") mipName, upstream,gapfill,downstream = line.rstrip().split("\t")[0:4] strand = "+" mm, count = mipName.split("-") count=int(count) if count % 2 == 0: strand = "-" ##based on upstream coordinate to excat seq upStart,upStop = upstream.split("-") downStart,downStop = downstream.split("-") myUpSeq = nibFragger(chrom,upStart,int(upStop)-int(upStart)+1) myDownSeq = nibFragger(chrom,downStart,int(downStop)-int(downStart)+1) h2pos="" h1pos="" H1Seq="" H2Seq="" if strand == "+": h2pos=upstream h1pos=downstream H2Seq = myUpSeq H1Seq = myDownSeq else: h1pos=upstream
def MNP_MIP(chrome, start, end, min_hom_length, max_hom_length, tm_min, tm_max, gc_threshold_min, gc_threshold_max, ref, alt):
    """Design MIP homology-arm pairs around a multi-nucleotide variant.

    Three families of designs are emitted, in this order:

    1. the variant written into the end of the upstream arm
       ("MNP_on_H2" on "+", "MNP_on_H1" on "-"), gap base taken at ``end + 1``;
    2. the variant written into the start of the downstream arm
       ("MNP_on_H2" on "-", "MNP_on_H1" on "+"), gap base taken at ``start - 1``;
    3. for every position of ``ref``, that position as a one-base gap with the
       remaining variant bases written into whichever arm(s) they overlap.

    ``chrome`` has any "chr" substring removed before it is passed to
    ``nibFragger``.  The length/Tm/GC arguments are forwarded unchanged to
    ``get_seq``.  The return value is the concatenation of every
    ``make_Hom_pairs`` result.

    NOTE(review): ``replaceString(seq, bases, first=...)`` is assumed to
    overwrite the leading (``first=True``) or trailing (``first=False``) bases
    of ``seq`` -- confirm against its definition.
    """
    contig = chrome.replace("chr", "")
    arm_filters = dict(MIN_LENGTH=min_hom_length, MIN_TM=tm_min, MAX_TM=tm_max,
                       GC_MIN=gc_threshold_min, GC_MAX=gc_threshold_max)

    def fetch_arm(position):
        # Lower-cased reference sequence of max_hom_length bases from `position`.
        return nibFragger(contig, position, max_hom_length).lower()

    def arm_candidates(left_arm, right_arm):
        # Upstream candidates first, then downstream, as in every section.
        left = get_seq(left_arm, right2left=False, **arm_filters)
        right = get_seq(right_arm, right2left=True, **arm_filters)
        return left, right

    chunks = []

    # MNP on the H2 forward and H1 reverse
    left_arm = fetch_arm(end - max_hom_length + 1)
    right_arm = fetch_arm(end + 2)
    gap_base = nibFragger(contig, end + 1, 1)
    left_arm = replaceString(left_arm, alt, first=False)
    left_list, right_list = arm_candidates(left_arm, right_arm)
    chunks.append(make_Hom_pairs(left_list, right_list, hom_strand="+",
                                 MIP_Mut_Alignment="MNP_on_H2",
                                 GapFillBase_M=gap_base, GapFillBase_W=gap_base))
    chunks.append(make_Hom_pairs(left_list, right_list, hom_strand="-",
                                 MIP_Mut_Alignment="MNP_on_H1",
                                 GapFillBase_M=revcomp(gap_base),
                                 GapFillBase_W=revcomp(gap_base)))

    # MNP on the H2 reverse strand or H1 forward strand
    left_arm = fetch_arm(start - max_hom_length - 1)
    right_arm = fetch_arm(start)
    gap_base = nibFragger(contig, start - 1, 1)
    # replace the first characters of the downstream arm
    right_arm_mutated = replaceString(right_arm, alt, first=True)
    left_list, right_list = arm_candidates(left_arm, right_arm_mutated)
    chunks.append(make_Hom_pairs(left_list, right_list, hom_strand="-",
                                 MIP_Mut_Alignment="MNP_on_H2",
                                 GapFillBase_M=revcomp(gap_base),
                                 GapFillBase_W=revcomp(gap_base)))
    chunks.append(make_Hom_pairs(left_list, right_list, hom_strand="+",
                                 MIP_Mut_Alignment="MNP_on_H1",
                                 GapFillBase_M=gap_base, GapFillBase_W=gap_base))

    # Do MNP_on_H2GF: each variant position in turn becomes the gap base
    for idx in range(len(ref)):
        left_arm = fetch_arm(start - max_hom_length + idx)
        right_arm = fetch_arm(start + idx + 1)
        gap_mut = alt[idx]
        gap_wt = ref[idx]
        bases_in_left = idx                  # variant bases before the gap
        bases_in_right = len(ref) - idx - 1  # variant bases after the gap

        if bases_in_left > 0:
            left_arm = replaceString(left_arm, alt[:idx], first=False)
        if bases_in_right > 0:
            right_arm = replaceString(right_arm, alt[idx + 1:], first=True)
        left_list, right_list = arm_candidates(left_arm, right_arm)

        plus_name = ("MNP_on_"
                     + ("H2" if bases_in_left > 0 else "")
                     + "Gap"
                     + ("H1" if bases_in_right > 0 else ""))
        chunks.append(make_Hom_pairs(left_list, right_list, hom_strand="+",
                                     MIP_Mut_Alignment=plus_name,
                                     GapFillBase_M=gap_mut, GapFillBase_W=gap_wt))

        # On the reverse strand the H1/H2 roles of the two arms are swapped.
        minus_name = ("MNP_on_"
                      + ("H2" if bases_in_right > 0 else "")
                      + "Gap"
                      + ("H1" if bases_in_left > 0 else ""))
        chunks.append(make_Hom_pairs(left_list, right_list, hom_strand="-",
                                     MIP_Mut_Alignment=minus_name,
                                     GapFillBase_M=revcomp(gap_mut),
                                     GapFillBase_W=revcomp(gap_wt)))

    return "".join(chunks)
# Walk the tab-separated mutation table and design MIPs for each SNP row.
# NOTE(review): nesting was reconstructed from a whitespace-mangled source and
# the loop body may continue beyond this chunk, so every script-level name the
# original assigned is kept unchanged.
for line in open(input_file):
    parts = line.rstrip().split("\t")
    # Skip the header row.
    if line.startswith("Gene"):
        continue
    info("Doing " + "|".join(parts[:3]))

    # Column layout of one record (columns 4 and 5 are integer coordinates).
    gene = parts[0]
    mutation_AA = parts[1]
    mutation_cDNA = parts[2]
    chrome = parts[3]
    start = int(parts[4])
    end = int(parts[5])
    ref = parts[6]
    alt = parts[7]
    gene_strand = parts[8]
    cosmic = parts[9]
    tumor_type = parts[10]

    ## Only works for SNP
    ## Situation 1: SNP_on_GF (snp in gap fill)
    # 50 bases of context on each side of the variant.
    chrom_id = chrome.replace("chr", "")
    flank5 = nibFragger(chrom_id, start - 50, 50)
    flank3 = nibFragger(chrom_id, end + 1, 50)
    forwardSeq = "".join(
        [flank5.lower(), "[", ref, "/", alt, "]", flank3.lower()])
    reverseSeq = "".join(
        [revcomp(flank3).lower(), "[", revcomp(ref), "/", revcomp(alt), "]",
         revcomp(flank5).lower()])

    resultStr = ""
    mType = determineMutationType(ref, alt)
    if mType != "SNP":
        print("Right now, it only supports SNP")
        continue
    resultStr = SNP_MIP_Gap(chrome, start, end, min_hom_length,
                            max_hom_length, tm_min, tm_max,
                            gc_threshold_min, gc_threshold_max, ref, alt)
def SNP_MIP_original(chrome, start, end, min_hom_length, max_hom_length, tm_min, tm_max, gc_threshold_min, gc_threshold_max, ref, alt):
    """Design MIP homology-arm pairs for a SNP (original, single-base-gap version).

    Designs are emitted in this order:

    1. "SNP_on_GF" -- the SNP is the gap-fill base.  Only produced when one of
       ``ref``/``alt`` contains A or T and the other contains C or G.
    2. SNP as the last base of the upstream arm, mutant allele
       ("SNP_on_H2_M" on "+", "SNP_on_H1_M" on "-"), gap base at ``end + 1``.
    3. Same arm placement with the wild-type allele ("..._W").
    4. SNP as the first base of the downstream arm, mutant allele
       ("SNP_on_H2_M" on "-", "SNP_on_H1_M" on "+"), gap base at ``end - 1``.
    5. Same arm placement with the wild-type allele ("..._W").

    ``chrome`` has any "chr" substring removed before it is passed to
    ``nibFragger``; the length/Tm/GC arguments are forwarded to ``get_seq``.
    Returns the concatenation of every ``make_Hom_pairs`` result.

    Fix relative to the earlier code: designs 4 and 5 fetched the gap base at
    ``end - 1`` into an unused variable and passed the stale base from
    ``end + 1`` (left over from designs 2/3) to ``make_Hom_pairs``.  The base
    fetched at ``end - 1`` is now the one reported.

    NOTE(review): nesting was reconstructed from a whitespace-mangled source;
    the AT<->CG test is assumed to guard only design 1 -- TODO confirm.
    NOTE(review): ``replaceString(seq, base, first=...)`` is assumed to
    overwrite the first/last base of ``seq`` -- confirm against its definition.
    """
    chrom_id = chrome.replace("chr", "")
    arm_filters = dict(MIN_LENGTH=min_hom_length, MIN_TM=tm_min, MAX_TM=tm_max,
                       GC_MIN=gc_threshold_min, GC_MAX=gc_threshold_max)

    def fetch_arm(position):
        # Lower-cased reference sequence of max_hom_length bases from `position`.
        return nibFragger(chrom_id, position, max_hom_length).lower()

    def arm_candidates(upstream_seq, downstream_seq):
        upstream_list = get_seq(upstream_seq, right2left=False, **arm_filters)
        downstream_list = get_seq(downstream_seq, right2left=True, **arm_filters)
        return upstream_list, downstream_list

    pieces = []

    # SNP on the gap fill
    if (re.search(r"[AT]", ref) and re.search(r"[CG]", alt)) or (re.search(r"[AT]", alt) and re.search(r"[CG]", ref)):
        upstream_list, downstream_list = arm_candidates(
            fetch_arm(start - max_hom_length), fetch_arm(end + 1))
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="+",
                                     MIP_Mut_Alignment="SNP_on_GF",
                                     GapFillBase_M=alt, GapFillBase_W=ref))
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="-",
                                     MIP_Mut_Alignment="SNP_on_GF",
                                     GapFillBase_M=revcomp(alt), GapFillBase_W=revcomp(ref)))

    # (allele, label suffix): mutant designs first, then wild type.
    alleles = ((alt, "M"), (ref, "W"))

    ## SNP on the H2 forward strand and H1 reverse
    # The reference sequences are identical for both alleles: fetch them once.
    upstream_seq = fetch_arm(start - max_hom_length + 1)
    downstream_seq = fetch_arm(end + 2)
    gapfill = nibFragger(chrom_id, end + 1, 1)
    for allele, tag in alleles:
        ### replace the last character of the upstream arm
        upstream_seq_replaced = replaceString(upstream_seq, allele, first=False)
        upstream_list, downstream_list = arm_candidates(upstream_seq_replaced, downstream_seq)
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="+",
                                     MIP_Mut_Alignment="SNP_on_H2_" + tag,
                                     GapFillBase_M=gapfill, GapFillBase_W=gapfill))
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="-",
                                     MIP_Mut_Alignment="SNP_on_H1_" + tag,
                                     GapFillBase_M=revcomp(gapfill),
                                     GapFillBase_W=revcomp(gapfill)))

    ## SNP on the H2 reverse strand or H1 forward strand
    upstream_seq = fetch_arm(start - max_hom_length - 1)
    downstream_seq = fetch_arm(end)
    # Gap base for THIS arm placement (the base just before the SNP); it must
    # not be confused with the end + 1 base used by the previous section.
    gapfill = nibFragger(chrom_id, end - 1, 1)
    for allele, tag in alleles:
        ### replace the first character of the downstream arm
        downstream_seq_replaced = replaceString(downstream_seq, allele, first=True)
        upstream_list, downstream_list = arm_candidates(upstream_seq, downstream_seq_replaced)
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="-",
                                     MIP_Mut_Alignment="SNP_on_H2_" + tag,
                                     GapFillBase_M=revcomp(gapfill),
                                     GapFillBase_W=revcomp(gapfill)))
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="+",
                                     MIP_Mut_Alignment="SNP_on_H1_" + tag,
                                     GapFillBase_M=gapfill, GapFillBase_W=gapfill))

    return "".join(pieces)
# Scan every target region for candidate homology arms and write one
# tab-separated row per arm that passes the length / Tm / GC thresholds.
# NOTE(review): nesting was reconstructed from a whitespace-mangled source and
# the script may continue beyond this chunk (out_fasta / out_bed are opened
# but not written here), so every script-level name is kept unchanged.
output = open(output_file, "w")
output.write("hom_name\tregion_index\tchrom\tregion_start\tregion_stop\thom_start\thom_stop\tseq\tseq_tm\tgc_count\tgc_pct\tdust_scoreH1\tdust_scoreH2\tdust_pct_H1\tdust_pct_H2\thp_run\tSNPs\tSMs\n")
out_fasta = open(output_fasta, "w")
out_bed = open(output_bed, "w")

# One output row; the first two fields together form the hom_name column.
hom_row_template = "{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{}\t{:.2f}\t{}\t{}\t{:.2f}\t{:.2f}\t{}\t{}\t{}\n"

info("There are total of {} exons".format(str(len(target_regions))))
for region_index, (chrom, start, stop) in enumerate(target_regions):
    # For every position in the index, design a hom - extend mip to the right
    i = region_index + 1
    bp = str(abs(stop - start + 1))
    info("Finding homs on exon {} ({}-{}| {}bp)".format(str(i), str(start), str(stop), bp))

    first_position = start - mip_offset - 20
    last_position_exclusive = stop + mip_offset
    for hom_position in range(first_position, last_position_exclusive):
        hom_seq = nibFragger(chrom.replace("chr", ""), hom_position, 35)
        opt_seq, opt_tm = optimize_seq(hom_seq)
        gc_count = opt_seq.count("C") + opt_seq.count("G")
        gc_content = gc_count / float(len(opt_seq))
        hp_run, dust_score_H1, dust_score_H2, dust_pct_H1, dust_pct_H2 = score_dust(opt_seq)

        # Disqualify homs based on thresholds (length, then Tm, then GC).
        if (len(opt_seq) > max_hom_length or len(opt_seq) < min_hom_length
                or opt_tm < tm_min or opt_tm > tm_max
                or gc_content > gc_threshold_max or gc_content < gc_threshold_min):
            continue

        end_position = hom_position + len(opt_seq) - 1
        SNPs = snp_finder.local_snps(hom_position, end_position)
        SMs = snp_finder.local_sms(hom_position, end_position)
        output.write(hom_row_template.format(
            region_index, hom_position, region_index, chrom, start, stop,
            hom_position, end_position, opt_seq, opt_tm,
            gc_count, gc_content, dust_score_H1, dust_score_H2,
            dust_pct_H1, dust_pct_H2, hp_run, SNPs, SMs))