def format_main(args):
    """Write deletions and tandem duplications as REF/ALT allele records.

    Every retained data row of ``args.result_file`` becomes one tab-separated
    line in ``args.output`` with the columns: chromosome, position, '.',
    reference allele, alternative allele, '.', 'PASS', '.'.  (The layout
    resembles a VCF body line; no header is written.)

    Rows are skipped when they

    * start with '#',
    * are the column-header row (it is only used to set up ``header_info``),
    * have a variant type other than "deletion" or "tandem_duplication"
      (this covers inversions and translocations; previously any *other*
      unexpected type silently re-used the alleles of the preceding record,
      or raised a NameError when it was the first record), or
    * have breakpoints further apart than ``args.max_size_thres``.

    Args:
        args: namespace providing ``result_file``, ``output``, ``reference``
            and ``max_size_thres``.
    """
    # Output is opened first, as before, so it is created even when the
    # input cannot be opened; both handles are closed on any exit path.
    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:
            if line.startswith("#"):
                continue
            line = line.rstrip('\n')
            if utils.header_check(line):
                header_info.read(line)
                continue

            F = line.split('\t')
            variant_type = F[header_info.variant_type]
            if variant_type not in ("deletion", "tandem_duplication"):
                continue

            pos_1 = int(F[header_info.pos_1])
            pos_2 = int(F[header_info.pos_2])
            if abs(pos_1 - pos_2) > int(args.max_size_thres):
                continue

            chrom = F[header_info.chr_1]
            # "---" is the placeholder for "no inserted sequence".
            inserted_seq = F[header_info.inserted_seq]
            if inserted_seq == "---":
                inserted_seq = ""

            if variant_type == "deletion":
                # REF is the reference stretch pos_1 .. pos_2 - 1, ALT keeps
                # only its first base (plus any inserted sequence).
                ref_seq = my_seq.get_seq(args.reference, chrom, pos_1, pos_2 - 1)
                alt_seq = ref_seq[0] + inserted_seq
                pos = F[header_info.pos_1]
            else:
                # tandem duplication: ALT is the anchor base at pos_1 - 1
                # followed by the duplicated stretch (plus any inserted
                # sequence); REF is the anchor base alone.
                alt_seq = my_seq.get_seq(args.reference, chrom, pos_1 - 1, pos_2) + inserted_seq
                ref_seq = alt_seq[0]
                pos = str(pos_1 - 1)

            print('\t'.join([chrom, pos, '.', ref_seq, alt_seq, '.', "PASS", '.']), file = hout)
def nonB_DB_main(args):
    """Annotate each SV with distances to non-B DNA motifs at both breakpoints.

    For every data row of ``args.result_file`` two columns per motif class
    (``<type>_dist1`` and ``<type>_dist2``) are appended, holding whatever
    ``nonB_DB.nonB_DB_dist_check`` returns for breakpoint 1 and breakpoint 2.
    Chromosome names are given a "chr" prefix before the lookup when they do
    not already carry one.

    Line handling:

    * the column-header row is extended with the new column names,
    * other rows whose first field starts with '#' are copied unchanged,
    * rows for which ``utils.check_atypical_chromosomes`` is true are
      reported on stderr and dropped.

    Args:
        args: namespace providing ``result_file``, ``output`` and ``nonB_DB``
            (path handed to ``pysam.TabixFile``).

    Raises:
        ValueError: if ``args.result_file`` does not exist.
    """
    from . import nonB_DB

    all_nonB_DB_type = ["A_Phased_Repeat", "Direct_Repeat", "G_Quadruplex_Motif", "Inverted_Repeat",
                        "Mirror_Repeat", "Short_Tandem_Repeat", "Z_DNA_Motif"]

    if not os.path.exists(args.result_file):
        raise ValueError("file not exists: " + args.result_file)

    nonB_DB_tb = pysam.TabixFile(args.nonB_DB)
    try:
        with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
            for line in hin:
                line = line.rstrip('\n')

                if utils.header_check(line):
                    header_info.read(line)
                    new_columns = [x + "_dist1" + '\t' + x + "_dist2" for x in all_nonB_DB_type]
                    print(line + '\t' + '\t'.join(new_columns), file = hout)
                    continue

                F = line.split('\t')

                # meta information lines are passed through untouched
                if F[0].startswith("#"):
                    print('\t'.join(F), file = hout)
                    continue

                chr_1, pos_1, dir_1 = F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1]
                chr_2, pos_2, dir_2 = F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]

                if utils.check_atypical_chromosomes(chr_1, chr_2):
                    print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                          (chr_1, pos_1, dir_1, chr_2, pos_2, dir_2), file = sys.stderr)
                    continue

                chr_ucsc1 = chr_1 if chr_1.startswith("chr") else "chr" + chr_1
                chr_ucsc2 = chr_2 if chr_2.startswith("chr") else "chr" + chr_2

                # Collect the values and join once instead of growing a
                # string inside the loop.
                dist_columns = []
                for nonB_DB_type in all_nonB_DB_type:
                    dist_columns.append(str(nonB_DB.nonB_DB_dist_check(chr_ucsc1, int(pos_1), dir_1, nonB_DB_tb, nonB_DB_type)))
                    dist_columns.append(str(nonB_DB.nonB_DB_dist_check(chr_ucsc2, int(pos_2), dir_2, nonB_DB_tb, nonB_DB_type)))

                print('\t'.join(F + dist_columns), file = hout)
    finally:
        # The tabix handle was previously never released.
        nonB_DB_tb.close()
def AID_main(args):
    """Annotate each SV with CG and WGCW motif positions around both breakpoints.

    For each breakpoint the reference window
    ``[pos - args.check_size, pos + args.check_size]`` is fetched with
    ``my_seq.get_seq`` and searched for the patterns ``CG`` and
    ``[AT]GC[AT]``.  Four columns are appended to every data row
    (``CG_motif_info_1/2``, ``WGCW_motif_info_1/2``), each a comma-separated
    list of match offsets relative to the breakpoint, or "---" when nothing
    matched.

    The offset is ``match.start() - args.check_size``.  The previous code
    subtracted a hard-coded 10, which is only correct when
    ``args.check_size`` is 10.

    Lines starting with '#' are dropped, the column-header row is extended,
    and rows for which ``utils.check_atypical_chromosomes`` is true are
    reported on stderr and dropped.  The output directory is created when
    missing.

    Args:
        args: namespace providing ``result_file``, ``output``, ``reference``
            and the integer ``check_size``.
    """
    out_dir = os.path.dirname(args.output)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    check_size = args.check_size
    cg_motif = re.compile(r'CG')
    wgcw_motif = re.compile(r'[AT]GC[AT]')

    def fetch_window(chrom, pos):
        # reference sequence spanning check_size bases either side of pos
        return my_seq.get_seq(args.reference, chrom, int(pos) - check_size, int(pos) + check_size)

    def motif_offsets(motif, seq):
        # comma-joined match offsets relative to the window centre, or "---"
        offsets = [str(match.start() - check_size) for match in motif.finditer(seq)]
        return ','.join(offsets) if offsets else "---"

    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:
            if line.startswith("#"):
                continue
            line = line.rstrip('\n')
            if utils.header_check(line):
                header_info.read(line)
                print(line + '\t' + "CG_motif_info_1" + '\t' + "CG_motif_info_2" + '\t' + \
                      "WGCW_motif_info_1" + '\t' + "WGCW_motif_info_2", file = hout)
                continue

            F = line.split('\t')
            if utils.check_atypical_chromosomes(F[header_info.chr_1], F[header_info.chr_2]):
                print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                      (F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1], \
                       F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]), file = sys.stderr)
                continue

            seq1 = fetch_window(F[header_info.chr_1], F[header_info.pos_1])
            seq2 = fetch_window(F[header_info.chr_2], F[header_info.pos_2])

            print('\t'.join(F + [motif_offsets(cg_motif, seq1), motif_offsets(cg_motif, seq2),
                                 motif_offsets(wgcw_motif, seq1), motif_offsets(wgcw_motif, seq2)]), file = hout)
def homology_main(args):
    """Append a ``Homology_Match`` column computed by ``homology.check_homology``.

    For every data row the two breakpoints (chromosome, position, direction)
    and the reference are handed to ``homology.check_homology`` together with
    a size limit of ``min(var_size, 100)``, where ``var_size`` is

    * ``pos_2 - pos_1 - 1`` for deletions,
    * ``pos_2 - pos_1 + 1`` for tandem duplications,
    * 500000 for every other variant type (so the limit is 100).

    Lines starting with '#' are dropped.  The original body also contained a
    later branch meant to copy such lines to the output, but it could never
    run because those lines were already skipped; it has been removed, so the
    output is unchanged.  The column-header row is extended, and rows for
    which ``utils.check_atypical_chromosomes`` is true are reported on stderr
    and dropped.

    Args:
        args: namespace providing ``result_file``, ``output`` and
            ``reference``.
    """
    from . import homology

    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:
            if line.startswith("#"):
                continue
            line = line.rstrip('\n')
            if utils.header_check(line):
                header_info.read(line)
                print(line + '\t' + "Homology_Match", file = hout)
                continue

            F = line.split('\t')
            chr_1, pos_1, dir_1 = F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1]
            chr_2, pos_2, dir_2 = F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]

            if utils.check_atypical_chromosomes(chr_1, chr_2):
                print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                      (chr_1, pos_1, dir_1, chr_2, pos_2, dir_2), file = sys.stderr)
                continue

            variant_type = F[header_info.variant_type]
            if variant_type == "deletion":
                var_size = int(pos_2) - int(pos_1) - 1
            elif variant_type == "tandem_duplication":
                var_size = int(pos_2) - int(pos_1) + 1
            else:
                # large sentinel: the min() below then always yields 100
                var_size = 500000

            # positions are passed through as the strings read from the file
            homology_match = homology.check_homology(chr_1, pos_1, dir_1, chr_2, pos_2, dir_2,
                                                     args.reference, min(var_size, 100))

            print('\t'.join(F) + '\t' + str(homology_match), file = hout)
def RSS_main(args):
    """Append RSS motif scores for the sequence around both breakpoints.

    For each breakpoint the reference window
    ``[pos - args.check_size, pos + args.check_size]`` is fetched with
    ``my_seq.get_seq`` and scored with ``my_seq.get_max_rss_score`` using the
    two matrices returned by ``my_seq.generate_rss_pwm``.  Four columns are
    appended to every data row:

    * ``RSS_score_N``: element 0 of the result, rounded to 3 decimals,
    * ``RSS_info_N``: elements 1-5 joined by ';', with element 3 shifted by
      ``-args.check_size`` and truncated to int.

    The shift used to be a hard-coded 50, which matches the window only when
    ``args.check_size`` is 50.
    NOTE(review): this assumes element 3 is an index into the fetched window;
    confirm against ``my_seq.get_max_rss_score``.

    Lines starting with '#' are dropped, the column-header row is extended,
    and rows for which ``utils.check_atypical_chromosomes`` is true are
    reported on stderr and dropped.  The output directory is created when
    missing.

    Args:
        args: namespace providing ``result_file``, ``output``, ``reference``
            and the integer ``check_size``.
    """
    out_dir = os.path.dirname(args.output)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    check_size = args.check_size
    rss_pwm = my_seq.generate_rss_pwm()

    def fetch_window(chrom, pos):
        # reference sequence spanning check_size bases either side of pos
        return my_seq.get_seq(args.reference, chrom, int(pos) - check_size, int(pos) + check_size)

    def rss_columns(seq):
        # [score, info] output fields for one breakpoint window
        info = my_seq.get_max_rss_score(seq, rss_pwm[0], rss_pwm[1])
        detail = ';'.join([info[1], info[2], str(int(info[3] - check_size)), str(info[4]), str(info[5])])
        return [str(round(info[0], 3)), detail]

    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        for line in hin:
            if line.startswith("#"):
                continue
            line = line.rstrip('\n')
            if utils.header_check(line):
                header_info.read(line)
                print(line + '\t' + "RSS_score_1" + '\t' + "RSS_info_1" + '\t' + \
                      "RSS_score_2" + '\t' + "RSS_info_2", file = hout)
                continue

            F = line.split('\t')
            if utils.check_atypical_chromosomes(F[header_info.chr_1], F[header_info.chr_2]):
                print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                      (F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1], \
                       F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]), file = sys.stderr)
                continue

            seq1 = fetch_window(F[header_info.chr_1], F[header_info.pos_1])
            seq2 = fetch_window(F[header_info.chr_2], F[header_info.pos_2])

            print('\t'.join(F + rss_columns(seq1) + rss_columns(seq2)), file = hout)
def primer_main(args):
    """Design up to five PCR primer pairs across each SV junction.

    For every data row ``realignmentFunction.getRefAltForSV`` writes contig
    sequences to ``<output>.contig.tmp.fa``.  For each contig in that file
    whose header starts with '>' and ends with "alt", primer3 is run on the
    following line and one output row is written: the original fields plus
    five columns ``Primer1``..``Primer5``.  Each column is
    ``left_seq;right_seq;left_Tm;right_Tm;product_size`` (Tm rounded to 3
    decimals) or "---" when primer3 did not return that pair.

    Lines starting with '#' are dropped, the column-header row is extended,
    and rows for which ``utils.check_atypical_chromosomes`` is true are
    reported on stderr and dropped.  The output directory is created when
    missing.

    Changes relative to the previous version: the output handle is closed and
    the temporary FASTA removed on every exit path (and without spawning
    ``rm``); an "alt" header that is the last line of the FASTA is skipped
    instead of raising IndexError; an unused ``param`` dict was removed.

    Args:
        args: namespace providing ``result_file``, ``output`` and
            ``reference``.
    """
    from genomon_sv import realignmentFunction
    from primer3 import bindings

    out_dir = os.path.dirname(args.output)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    contig_fa = args.output + ".contig.tmp.fa"
    num_pairs = 5

    try:
        with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
            for line in hin:
                if line.startswith("#"):
                    continue
                line = line.rstrip('\n')
                if utils.header_check(line):
                    header_info.read(line)
                    print(line + '\t' + "Primer1" + '\t' + "Primer2" + '\t' + "Primer3" + '\t' + \
                          "Primer4" + '\t' + "Primer5", file = hout)
                    continue

                F = line.split('\t')
                chr1, pos1, dir1 = F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1]
                chr2, pos2, dir2 = F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]
                junc_seq = F[header_info.inserted_seq]

                if utils.check_atypical_chromosomes(chr1, chr2):
                    print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                          (chr1, pos1, dir1, chr2, pos2, dir2), file = sys.stderr)
                    continue

                # "---" is the placeholder for "no inserted sequence"
                junc_seq_len = 0 if junc_seq == "---" else len(junc_seq)

                realignmentFunction.getRefAltForSV(contig_fa, chr1, pos1, dir1, chr2, pos2, dir2,
                                                   junc_seq, args.reference, 1000, 250)

                with open(contig_fa) as hin2:
                    contig_lines = [x.rstrip('\n') for x in hin2]

                # Pair every line with its successor: (header, sequence).
                for name, seq in zip(contig_lines, contig_lines[1:]):
                    if not (name.startswith('>') and name.endswith("alt")):
                        continue

                    primer = bindings.designPrimers(
                        {
                            'SEQUENCE_ID': 'MH1000',
                            'SEQUENCE_TEMPLATE': seq,
                            # widened by the inserted-sequence length
                            'SEQUENCE_TARGET': [225, 50 + junc_seq_len],
                            'SEQUENCE_INCLUDED_REGION': [10, len(seq) - 20]
                        },
                        {
                            'PRIMER_PRODUCT_SIZE_RANGE': [[150,250],[100,300],[301,400],[401,500]],
                        })

                    primer_left_right = ["---"] * num_pairs
                    for k in range(num_pairs):
                        left_seq = "PRIMER_LEFT_" + str(k) + "_SEQUENCE"
                        right_seq = "PRIMER_RIGHT_" + str(k) + "_SEQUENCE"
                        left_tm = "PRIMER_LEFT_" + str(k) + "_TM"
                        right_tm = "PRIMER_RIGHT_" + str(k) + "_TM"
                        product_size = "PRIMER_PAIR_" + str(k) + "_PRODUCT_SIZE"

                        # report a pair only when primer3 returned all of its fields
                        if all(key in primer for key in (left_seq, right_seq, left_tm, right_tm, product_size)):
                            primer_left_right[k] = ';'.join([primer[left_seq], primer[right_seq],
                                                             str(round(primer[left_tm], 3)),
                                                             str(round(primer[right_tm], 3)),
                                                             str(primer[product_size])])

                    print('\t'.join(F) + '\t' + '\t'.join(primer_left_right), file = hout)
    finally:
        if os.path.exists(contig_fa):
            os.remove(contig_fa)
def realign_main(args):
    """Re-count supporting reads for each SV by realignment and append the result.

    Steps:

    1. Write every usable data row of ``args.result_file`` to
       ``<output>.tmp1.bedpe`` (24 columns: both breakpoints as 0-based
       start / 1-based end, an id, the inserted sequence, both directions
       and 14 "---" placeholders).
    2. Run ``filterFunction.validateByRealignment`` on it, producing
       ``<output>.tmp2.bedpe``.
    3. Key each row of that file by its first seven fields and derive the
       tumor variant allele frequency ``F[8] / (F[7] + F[8])`` and, when a
       control BAM was supplied, the control frequency
       ``F[10] / (F[9] + F[10])``.  A zero denominator gives 0.0; values are
       rounded to 4 decimals.
    4. Re-read ``args.result_file`` and write ``args.output``: the header row
       gets the new column names; a data row is written only when its first
       seven fields match a key from step 3.

    NOTE(review): step 4 matches on the first seven columns by position
    rather than through ``header_info``; this presumably relies on the result
    file starting with chr_1, pos_1, dir_1, chr_2, pos_2, dir_2 and the
    inserted sequence.  Confirm against ``validateByRealignment``.

    Lines starting with '#' are dropped.  Rows for which
    ``utils.check_atypical_chromosomes`` is true are reported on stderr and
    dropped.  The process exits with status 1 when ``args.tumor_bam`` is
    None.  ``args.control_bam`` is replaced by "" when it is None.

    Changes relative to the previous version: the placeholder list no longer
    reuses the record counter's name (under Python 2 the comprehension
    variable overwrote the counter, giving duplicate ids); file handles are
    closed and both temporary files removed on every exit path, without
    spawning ``rm``.

    Args:
        args: namespace providing ``result_file``, ``output``, ``reference``,
            ``tumor_bam`` and ``control_bam``.
    """
    from genomon_sv import filterFunction

    if args.tumor_bam is None:
        print("tumor_bam file should be input", file = sys.stderr)
        sys.exit(1)

    out_dir = os.path.dirname(args.output)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    matchedControlFlag = args.control_bam is not None
    if args.control_bam is None:
        args.control_bam = ""

    tmp_in_bedpe = args.output + ".tmp1.bedpe"
    tmp_out_bedpe = args.output + ".tmp2.bedpe"

    def allele_freq(num_ref, num_var):
        # variant fraction as a string rounded to 4 decimals; 0.0 when no reads
        total = float(num_ref) + float(num_var)
        freq = float(num_var) / total if total > 0 else 0.0
        return str(round(freq, 4))

    try:
        # step 1: bedpe input for the realignment
        with open(tmp_in_bedpe, 'w') as hout, open(args.result_file, 'r') as hin:
            num_sv = 0
            for line in hin:
                if line.startswith("#"):
                    continue
                line = line.rstrip('\n')
                if utils.header_check(line):
                    header_info.read(line)
                    continue

                F = line.split('\t')
                if utils.check_atypical_chromosomes(F[header_info.chr_1], F[header_info.chr_2]):
                    print("Skip a SV incolving atypical chromosomes: %s,%s,%s,%s,%s,%s" % \
                          (F[header_info.chr_1], F[header_info.pos_1], F[header_info.dir_1], \
                           F[header_info.chr_2], F[header_info.pos_2], F[header_info.dir_2]), file = sys.stderr)
                    continue

                print('\t'.join([F[header_info.chr_1], str(int(F[header_info.pos_1]) - 1), F[header_info.pos_1], \
                                 F[header_info.chr_2], str(int(F[header_info.pos_2]) - 1), F[header_info.pos_2], \
                                 "genoemonSV_" + str(num_sv), F[header_info.inserted_seq], \
                                 F[header_info.dir_1], F[header_info.dir_2]] + ["---"] * 14), file = hout)
                num_sv = num_sv + 1

        # step 2
        filterFunction.validateByRealignment(tmp_in_bedpe, tmp_out_bedpe,
                                             args.tumor_bam, args.control_bam, args.reference,
                                             "-stepSize=5 -repMatch=2253", 500, 5000, 1000, 5, 1000, 1000)

        # step 3: collect the realignment counts per SV
        key2AF_info = {}
        with open(tmp_out_bedpe, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                key = '\t'.join(F[:7])
                tumorAF = allele_freq(F[7], F[8])
                if matchedControlFlag:
                    normalAF = allele_freq(F[9], F[10])
                    key2AF_info[key] = '\t'.join([F[7], F[8], tumorAF, F[9], F[10], normalAF, F[11]])
                else:
                    key2AF_info[key] = '\t'.join([F[7], F[8], tumorAF])

        # step 4: annotated result
        with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
            for line in hin:
                if line.startswith("#"):
                    continue
                line = line.rstrip('\n')
                if utils.header_check(line):
                    if matchedControlFlag:
                        print(line + '\t' + "Num_Tumor_Ref_Read_Pair_re" + '\t' + "Num_Tumor_Var_Read_Pair_re" + '\t' + "Tumor_VAF_re" + '\t' + \
                              "Num_Control_Ref_Read_Pair_re" + '\t'+ "Num_Control_Var_Read_Pair_re" + '\t' + "Control_VAF_re" + '\t' + \
                              "Minus_Log_Fisher_P_value_re", file = hout)
                    else:
                        print(line + '\t' + "Num_Tumor_Ref_Read_Pair_re" + '\t' + "Num_Tumor_Var_Read_Pair_re" + '\t' + "Tumor_VAF_re", file = hout)
                    continue

                F = line.split('\t')
                key = '\t'.join(F[:7])
                if key not in key2AF_info:
                    continue

                print('\t'.join(F) + '\t' + key2AF_info[key], file = hout)
    finally:
        for tmp_path in (tmp_in_bedpe, tmp_out_bedpe):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def merge_control_main(args):
    """Aggregate the junctions of several result files into one merged file.

    ``args.result_list`` is a tab-separated file with three columns per line:
    label, tumor type and path to a result file.  Every data row of every
    listed result file is written to a temporary bedpe-like file (both
    breakpoints as 0-based start / 1-based end, ``junction_<n>`` numbered per
    result file, inserted-sequence length, both directions, the label and
    "1").  That file is then passed through ``genomon_sv.utils.sortBedpe``,
    ``genomon_sv.mergeFunction.organizeControl`` (third argument 20),
    ``sortBedpe`` again and finally ``genomon_sv.utils.compress_index_bed``,
    which writes ``args.output_file``.

    Lines starting with '#' and column-header rows in the result files are
    not copied.  The output directory is created when missing.

    Changes relative to the previous version: the inner file handle and loop
    variable no longer shadow the ones iterating the list file; two unused
    dictionaries were removed; blank lines in the list file are skipped
    instead of failing the three-way unpack; file handles are closed and all
    intermediate files removed on every exit path, without spawning ``rm``.

    Args:
        args: namespace providing ``result_list`` and ``output_file``.

    Raises:
        ValueError: if a listed result file does not exist, or a non-blank
            line of the list does not have exactly three tab-separated fields.
    """
    import genomon_sv.mergeFunction
    import genomon_sv.utils

    out_dir = os.path.dirname(args.output_file)
    if out_dir != "" and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    tmp_raw = args.output_file + ".temp"
    tmp_sorted = args.output_file + ".temp.sort"
    tmp_merged = args.output_file + ".temp.merged"
    tmp_merged_sorted = args.output_file + ".temp.merged.sort"

    try:
        with open(tmp_raw, 'w') as hout, open(args.result_list, 'r') as hlist:
            for entry in hlist:
                if not entry.strip():
                    continue

                # the tumor-type column must be present but is not used here
                label, _tumor_type, result_file = entry.rstrip('\n').split('\t')

                if not os.path.exists(result_file):
                    raise ValueError("file not exists: " + result_file)

                num = 1
                with open(result_file, 'r') as hin:
                    for line in hin:
                        if line.startswith("#"):
                            continue
                        line = line.rstrip('\n')
                        if utils.header_check(line):
                            header_info.read(line)
                            continue

                        F = line.split('\t')

                        # "---" is the placeholder for "no inserted sequence"
                        inserted_seq = F[header_info.inserted_seq]
                        inseqLen = len(inserted_seq) if inserted_seq != "---" else 0

                        print('\t'.join([F[header_info.chr_1], str(int(F[header_info.pos_1]) - 1), F[header_info.pos_1], \
                                         F[header_info.chr_2], str(int(F[header_info.pos_2]) - 1), F[header_info.pos_2], \
                                         "junction_" + str(num), str(inseqLen), \
                                         F[header_info.dir_1], F[header_info.dir_2], label, "1"]), file = hout)
                        num = num + 1

        # sort the aggregated junction file
        genomon_sv.utils.sortBedpe(tmp_raw, tmp_sorted)

        # merge the same junctions in the aggregated junction file
        genomon_sv.mergeFunction.organizeControl(tmp_sorted, tmp_merged, 20)

        # sort the merged junction file
        genomon_sv.utils.sortBedpe(tmp_merged, tmp_merged_sorted)

        # compress and index the merged junction file
        genomon_sv.utils.compress_index_bed(tmp_merged_sorted, args.output_file)
    finally:
        # remove intermediate files
        for tmp_path in (tmp_raw, tmp_sorted, tmp_merged, tmp_merged_sorted):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)