def vcf_main(args):
    """Write deletions and tandem duplications from a result table as VCF-style rows.

    Reads the tab-separated file ``args.result_file``.  The line that starts
    with ``Chr_1<TAB>Pos_1`` is handed to ``header_info.read`` so the column
    indices (``header_info.chr_1``, ``pos_1``, ``pos_2``, ``variant_type``,
    ``inserted_seq``) can be resolved; every other line is treated as a
    record.

    Records are skipped when:
      * the variant type is "inversion" or "translocation";
      * ``abs(pos_1 - pos_2)`` exceeds ``int(args.max_size_thres)``;
      * the variant type is anything other than "deletion" or
        "tandem_duplication" (previously such a row re-emitted the values
        left over from the preceding row, or raised NameError).

    Each remaining record becomes one line in ``args.output`` with eight
    tab-separated fields: chromosome, position, ".", reference allele,
    alternative allele, ".", "PASS", "." (the order of the fixed VCF
    columns; no header lines are written).

    Args:
        args: object providing ``output``, ``result_file``, ``reference``
            and ``max_size_thres`` attributes (presumably an
            ``argparse.Namespace`` -- confirm against the caller).
    """
    # The output is opened first, as before, so it is created/truncated even
    # when the input cannot be opened.  ``with`` guarantees both handles are
    # closed if an exception escapes the loop.
    with open(args.output, 'w') as hout, open(args.result_file, 'r') as hin:
        max_size = int(args.max_size_thres)

        for line in hin:
            if line.startswith("Chr_1" + '\t' + "Pos_1"):
                header_info.read(line.rstrip('\n'))
                continue

            F = line.rstrip('\n').split('\t')
            variant_type = F[header_info.variant_type]
            if variant_type in ("inversion", "translocation"):
                continue

            pos_1 = int(F[header_info.pos_1])
            pos_2 = int(F[header_info.pos_2])
            if abs(pos_1 - pos_2) > max_size:
                continue

            chrom = F[header_info.chr_1]

            if variant_type == "deletion":
                # my_seq.get_seq presumably returns the reference bases of
                # chrom:pos_1-(pos_2 - 1) -- confirm against my_seq.
                ref_seq = my_seq.get_seq(args.reference, chrom, pos_1, pos_2 - 1)
                # ALT keeps only the first fetched base, plus any inserted
                # sequence ("---" marks "no inserted sequence").
                alt_seq = ref_seq[0]
                if F[header_info.inserted_seq] != "---":
                    alt_seq = alt_seq + F[header_info.inserted_seq]
                # Use the header-resolved column (was the literal index 1),
                # consistent with every other field access in this function.
                pos = F[header_info.pos_1]
            elif variant_type == "tandem_duplication":
                alt_seq = my_seq.get_seq(args.reference, chrom, pos_1 - 1, pos_2)
                if F[header_info.inserted_seq] != "---":
                    alt_seq = alt_seq + F[header_info.inserted_seq]
                # REF is the single base at pos_1 - 1; ALT is that base
                # followed by the duplicated segment.
                ref_seq = alt_seq[0]
                pos = str(pos_1 - 1)
            else:
                # Unknown variant type: nothing sensible to emit.
                continue

            hout.write('\t'.join([chrom, pos, '.', ref_seq, alt_seq, '.', "PASS", '.']) + '\n')
def generate_contig(input_file, output_file, tumor_bp_file, tumor_bam, reference_genome, min_contig_length):
    """Assemble a contig for each breakpoint in ``input_file`` and append it as a new column.

    Steps:
      1. For every non-header line of ``input_file`` (header lines have
         "Chr" in the first column) look the breakpoint up in the
         tabix-indexed ``tumor_bp_file`` and collect the supporting read
         IDs (column 6 of the matching record, ';'-separated, with a
         trailing "/<digit>" removed).  The key of a breakpoint is its first
         four columns joined by ','.
      2. Scan ``tumor_bam`` and write ``key<TAB>read_id/1|2<TAB>sequence``
         for every collected read to ``output_file + ".tmp2.contig.unsorted"``,
         skipping secondary, supplementary and duplicate alignments.
      3. Sort that file by key with the external ``sort`` command into
         ``output_file + ".tmp2.contig.sorted"``.
      4. For each key, call ``assemble_seq`` with the reads of that key and
         a 20-base-window junction sequence fetched from
         ``reference_genome`` (reverse-complemented unless the third key
         field is "+").
      5. Re-read ``input_file`` and write to ``output_file`` every line that
         has a contig of at least ``min_contig_length`` characters, with the
         contig appended as an extra tab-separated column.

    The two ``.tmp2.contig.*`` files are left on disk, as before.

    Args:
        input_file: tab-separated breakpoint table.
        output_file: path of the result; also the prefix of the temp files
            and the third argument passed to ``assemble_seq``.
        tumor_bp_file: tabix-indexed breakpoint file.
        tumor_bam: BAM file scanned for the supporting reads.
        reference_genome: passed through to ``my_seq.get_seq``.
        min_contig_length: minimum contig length to report.
    """
    # SAM FLAG bits (see the SAM specification).
    flag_first_in_pair = 0x40
    flag_secondary = 0x100
    flag_duplicate = 0x400
    flag_supplementary = 0x800

    unsorted_path = output_file + ".tmp2.contig.unsorted"
    sorted_path = output_file + ".tmp2.contig.sorted"

    # ---- 1. read ID -> breakpoint key -------------------------------------
    readid2key = {}
    tumor_bp_db = pysam.TabixFile(tumor_bp_file)
    try:
        with open(input_file, 'r') as hin:
            for line in hin:
                F = line.rstrip('\n').split('\t')
                if F[0] == "Chr":
                    continue

                # Best effort, as in the original: a failed lookup (e.g. a
                # contig absent from the index) is reported and the
                # breakpoint is skipped rather than aborting the run.
                try:
                    records = tumor_bp_db.fetch(F[0], int(F[1]) - 1, int(F[1]) + 1)
                except Exception as inst:
                    sys.stderr.write("%s: %s\n" % (type(inst), inst.args))
                    continue

                key = ','.join(F[:4])
                for record_line in records:
                    record = record_line.split('\t')
                    # The record position is compared after adding 1
                    # (presumably it is 0-based -- confirm).
                    if (record[0] == F[0] and (int(record[1]) + 1) == int(F[1])
                            and record[3] == F[2] and record[4] == F[3]):
                        for readid in record[5].split(';'):
                            readid2key[re.sub(r'/\d$', '', readid)] = key
    finally:
        tumor_bp_db.close()

    # ---- 2. dump the supporting reads -------------------------------------
    bamfile = pysam.Samfile(tumor_bam, "rb")
    try:
        with open(unsorted_path, 'w') as hout:
            for read in bamfile.fetch():
                if read.qname not in readid2key:
                    continue

                flag = int(read.flag)
                # skip secondary / supplementary alignments
                if flag & (flag_secondary | flag_supplementary):
                    continue
                # skip duplicated reads
                if flag & flag_duplicate:
                    continue

                mate_suffix = "/1" if flag & flag_first_in_pair else "/2"
                hout.write(readid2key[read.qname] + '\t' + read.qname + mate_suffix
                           + '\t' + read.query_sequence + '\n')
    finally:
        bamfile.close()

    # ---- 3. group reads by key via an external sort -----------------------
    with open(sorted_path, 'w') as hout:
        sort_status = subprocess.call(["sort", "-k1,1", unsorted_path], stdout=hout)
    if sort_status != 0:
        # Previously ignored; report it so a truncated result is noticed.
        sys.stderr.write("sort exited with status %d for %s\n" % (sort_status, unsorted_path))

    # ---- 4. assemble one contig per key -----------------------------------
    temp_key = ""
    temp_id2seq = {}
    temp_junc_seq = ""
    key2contig = {}
    with open(sorted_path) as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')
            if temp_key != F[0]:
                # Key changed: flush the reads gathered for the previous key.
                if len(temp_id2seq) > 0:
                    key2contig[temp_key] = assemble_seq(temp_id2seq, temp_junc_seq, output_file)

                temp_key = F[0]
                temp_id2seq = {}
                FF = temp_key.split(',')
                if FF[2] == "+":
                    temp_junc_seq = my_seq.get_seq(reference_genome, FF[0],
                                                   int(FF[1]) - 20, int(FF[1]))
                else:
                    temp_junc_seq = my_seq.reverse_complement(
                        my_seq.get_seq(reference_genome, FF[0], int(FF[1]), int(FF[1]) + 20))

            temp_id2seq[F[1]] = F[2]

    # Flush the last group.
    if len(temp_id2seq) > 0:
        key2contig[temp_key] = assemble_seq(temp_id2seq, temp_junc_seq, output_file)

    # ---- 5. annotate the input with the contigs ---------------------------
    with open(output_file, 'w') as hout, open(input_file, 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')
            key = ','.join(F[:4])
            # Header lines and breakpoints without reads have no contig.
            if key not in key2contig:
                continue
            contig = key2contig[key]
            if len(contig) < min_contig_length:
                continue

            hout.write('\t'.join(F) + '\t' + contig + '\n')
def generate_template_seq(output_file, reference, mut_chr, mut_start, mut_end, mut_ref, mut_alt,
                          motif_chr, motif_start, motif_end, motif_type, motif_strand,
                          junc_list, donor_size, acceptor_size, template_size):
    """Write template sequences around a splicing motif, with and without a mutation, as FASTA.

    The following records are written to ``output_file`` in sorted key order:

      * ``splice_junction_negative_<n>``: for the n-th junction (sorted,
        de-duplicated), ``template_size`` reference bases ending at the
        junction start followed by ``template_size`` bases starting at the
        junction end.
      * ``splice_junction_positive_<n>``: the same with the mutation applied
        to the exonic side; only produced when the mutation lies entirely
        inside the exonic part of the motif.
      * ``intron_retention_negative``: ``template_size`` reference bases on
        each side of the exon/intron boundary of the motif.
      * ``intron_retention_positive``: the same window with the mutation
        applied.

    Args:
        output_file: path of the FASTA file to write.
        reference: passed through to ``my_seq.get_seq``.
        mut_chr: mutation chromosome (only used in the error report).
        mut_start, mut_end: integer mutation coordinates.
        mut_ref, mut_alt: alleles; ``'-'`` as ``mut_ref`` denotes an
            insertion and ``'-'`` as ``mut_alt`` a deletion.
        motif_chr, motif_end: accepted but not used by this function.
        motif_start: integer start of the motif.
        motif_type: "donor"; any other value is handled as an acceptor.
        motif_strand: '+'; any other value is handled as the minus strand
            when laying out the motif.
        junc_list: comma-separated junctions of the form ``chr:start-end``.
        donor_size: "<exon>,<intron>" sizes as a comma-separated string.
        acceptor_size: "<intron>,<exon>" sizes as a comma-separated string.
        template_size: number of bases taken on each side.

    Raises:
        ValueError: if a junction does not look like ``chr:start-end``.
        SystemExit: (status 1) if the fetched reference bases disagree with
            ``mut_ref``.

    NOTE(review): ``mut_seq_margin`` is not defined inside this function; it
    is expected to be available as a module-level name -- confirm.
    """
    donor_size_exon, donor_size_intron = [int(x) for x in donor_size.split(',')]
    acceptor_size_intron, acceptor_size_exon = [int(x) for x in acceptor_size.split(',')]

    # Lay out the exonic and intronic parts of the motif.
    if motif_type == "donor":
        if motif_strand == '+':
            motif_exon_start, motif_exon_end = motif_start, motif_start + donor_size_exon - 1
            motif_intron_start = motif_start + donor_size_exon
            motif_intron_end = motif_start + donor_size_exon + donor_size_intron - 1
        else:
            motif_intron_start, motif_intron_end = motif_start, motif_start + donor_size_intron - 1
            motif_exon_start = motif_start + donor_size_intron
            motif_exon_end = motif_start + donor_size_exon + donor_size_intron - 1
    else:  # acceptor
        if motif_strand == '+':
            motif_intron_start, motif_intron_end = motif_start, motif_start + acceptor_size_intron - 1
            motif_exon_start = motif_start + acceptor_size_intron
            motif_exon_end = motif_start + acceptor_size_exon + acceptor_size_intron - 1
        else:
            motif_exon_start, motif_exon_end = motif_start, motif_start + acceptor_size_exon - 1
            motif_intron_start = motif_start + acceptor_size_exon
            motif_intron_end = motif_start + acceptor_size_exon + acceptor_size_intron - 1

    # True when the exon lies on the left (lower-coordinate) side of the
    # exon/intron boundary.
    exon_on_left = ((motif_type == "donor" and motif_strand == '+')
                    or (motif_type == "acceptor" and motif_strand == '-'))

    # Parse every junction once (the original parsed them in both loops).
    junctions = []
    for junc in sorted(set(junc_list.split(','))):
        junc_match = re.match(r'([^ \t\n\r\f\v,]+)\:(\d+)\-(\d+)', junc)
        if junc_match is None:
            raise ValueError("malformed junction (expected chr:start-end): %r" % (junc,))
        junctions.append((junc, junc_match.group(1),
                          int(junc_match.group(2)), int(junc_match.group(3))))

    key2seq = {}

    # annotated splice junction without mutation
    for cnum, (junc, junc_chr, junc_start, junc_end) in enumerate(junctions):
        seq = my_seq.get_seq(reference, junc_chr, junc_start - template_size + 1, junc_start) + \
            my_seq.get_seq(reference, junc_chr, junc_end, junc_end + template_size - 1)
        key2seq["splice_junction_negative_" + str(cnum)] = seq

    # annotated splice junction with mutation
    # (only when mutations occur within exonic motif region)
    if mut_start >= motif_exon_start and mut_end <= motif_exon_end:
        for cnum, (junc, junc_chr, junc_start, junc_end) in enumerate(junctions):
            if exon_on_left:
                mut_seq_tmp = my_seq.get_seq(reference, junc_chr,
                                             junc_start - mut_seq_margin + 1, junc_start)
                mut_start_rel = mut_start - junc_start + mut_seq_margin - 1
                mut_end_rel = mut_end - junc_start + mut_seq_margin - 1
            else:
                mut_seq_tmp = my_seq.get_seq(reference, junc_chr,
                                             junc_end, junc_end + mut_seq_margin - 1)
                mut_start_rel, mut_end_rel = mut_start - junc_end, mut_end - junc_end

            # Sanity check: the reference must carry mut_ref at that position.
            if mut_ref != '-' and mut_seq_tmp[mut_start_rel:(mut_end_rel + 1)] != mut_ref:
                sys.stderr.write('\t'.join([mut_chr, str(mut_start), str(mut_end),
                                            mut_ref, mut_alt, junc]) + '\n')
                sys.stderr.write('\t'.join([mut_seq_tmp[mut_start_rel:(mut_end_rel + 1)],
                                            mut_ref]) + '\n')
                sys.stderr.write("mutation inconsistent!!!" + '\n')
                sys.exit(1)

            # The three cases are deliberately independent ``if`` statements,
            # exactly as in the original.
            # SNV
            if mut_ref != '-' and mut_alt != '-':
                mut_seq_tmp = mut_seq_tmp[:mut_start_rel] + mut_alt + mut_seq_tmp[(mut_end_rel + 1):]
            # deletion
            if mut_alt == '-':
                mut_seq_tmp = mut_seq_tmp[:mut_start_rel] + mut_seq_tmp[(mut_end_rel + 1):]
            # insertion
            if mut_ref == '-':
                mut_seq_tmp = (mut_seq_tmp[:(mut_start_rel + 1)] + mut_alt
                               + mut_seq_tmp[(mut_start_rel + 1):])

            if exon_on_left:
                seq = mut_seq_tmp[(-template_size):] + \
                    my_seq.get_seq(reference, junc_chr, junc_end, junc_end + template_size - 1)
            else:
                seq = my_seq.get_seq(reference, junc_chr, junc_start - template_size + 1, junc_start) + \
                    mut_seq_tmp[:template_size]

            key2seq["splice_junction_positive_" + str(cnum)] = seq

    # The intron-retention sequences are fetched from the chromosome of the
    # last junction in sorted order.  The original relied on the loop
    # variable leaking out of the loops above; this keeps that behaviour.
    # NOTE(review): motif_chr looks like the intended chromosome -- confirm.
    junc_chr = junctions[-1][1]

    # intron retention without mutation
    if exon_on_left:
        seq_left_tmp = my_seq.get_seq(reference, junc_chr,
                                      motif_exon_end - mut_seq_margin + 1, motif_exon_end)
        seq_right_tmp = my_seq.get_seq(reference, junc_chr,
                                       motif_intron_start, motif_intron_start + mut_seq_margin - 1)
        boundary_pos = motif_exon_end
    else:
        seq_left_tmp = my_seq.get_seq(reference, junc_chr,
                                      motif_intron_end - mut_seq_margin + 1, motif_intron_end)
        seq_right_tmp = my_seq.get_seq(reference, junc_chr,
                                       motif_exon_start, motif_exon_start + mut_seq_margin - 1)
        boundary_pos = motif_intron_end

    key2seq["intron_retention_negative"] = seq_left_tmp[(-template_size):] + seq_right_tmp[:template_size]

    # intron retention with mutation
    # in this case, we remove nucleotides from concatenated sequences
    mut_start_rel = mut_start - boundary_pos + mut_seq_margin - 1
    mut_end_rel = mut_end - boundary_pos + mut_seq_margin - 1
    mut_seq_tmp = seq_left_tmp + seq_right_tmp

    if mut_ref != '-' and mut_alt != '-':  # SNV
        if mut_seq_tmp[mut_start_rel] != mut_ref:
            sys.stderr.write("mutation inconsistent!!!" + '\n')
            sys.exit(1)

        mut_seq_tmp = mut_seq_tmp[:mut_start_rel] + mut_alt + mut_seq_tmp[(mut_end_rel + 1):]
        mut_seq_start_pos = mut_seq_margin - template_size
        mut_seq_end_pos = mut_seq_start_pos + 2 * template_size
        key2seq["intron_retention_positive"] = mut_seq_tmp[mut_seq_start_pos:mut_seq_end_pos]

    elif mut_alt == '-':  # deletion
        # Same test as the original chained ``a != mut_ref != '-'``.
        if mut_ref != '-' and mut_seq_tmp[mut_start_rel:(mut_end_rel + 1)] != mut_ref:
            sys.stderr.write("mutation inconsistent!!!" + '\n')
            sys.exit(1)

        mut_seq_tmp = mut_seq_tmp[:mut_start_rel] + mut_seq_tmp[(mut_end_rel + 1):]
        # The window start is shifted left by this amount for deletions.
        del_size_left = max(0, (min(boundary_pos, mut_end) - mut_start))
        mut_seq_start_pos = mut_seq_margin - template_size - del_size_left
        mut_seq_end_pos = mut_seq_start_pos + 2 * template_size
        key2seq["intron_retention_positive"] = mut_seq_tmp[mut_seq_start_pos:mut_seq_end_pos]

    elif mut_ref == '-':  # insertion
        mut_seq_tmp = (mut_seq_tmp[:(mut_start_rel + 1)] + mut_alt
                       + mut_seq_tmp[(mut_start_rel + 1):])
        ins_size_left = len(mut_alt)
        mut_seq_start_pos = mut_seq_margin - template_size + ins_size_left
        mut_seq_end_pos = mut_seq_start_pos + 2 * template_size
        key2seq["intron_retention_positive"] = mut_seq_tmp[mut_seq_start_pos:mut_seq_end_pos]

    with open(output_file, 'w') as hout:
        for key in sorted(key2seq):
            hout.write(">" + key + '\n' + key2seq[key] + '\n')
def _variant_ratio(ref_count, var_count):
    """Return var / (ref + var) rounded to 3 decimals as a string ("0.0" when both are zero)."""
    ref_num, var_num = float(ref_count), float(var_count)
    total = ref_num + var_num
    if total == 0:
        # No read pairs at all: report 0.0 instead of dividing by zero.
        return str(0.0)
    return str(round(var_num / total, 3))


def make_mut_db(input_file, output_file_prefix, reference):
    """Build a bgzip-compressed, tabix-indexed BED of long deletions and tandem duplications.

    Reads the tab-separated mutation table ``input_file``; its first line is
    a header from which the columns "Ref", "Alt", "readPairNum_tumor",
    "variantPairNum_tumor", "readPairNum_normal", "variantPairNum_normal"
    and "P-value(fhsher_realignment)" are located by name.  Columns 1-3 of
    each row are used as chromosome, position and end.

    A row is written to ``output_file_prefix + ".bed"`` when

      * its Ref allele has at least 10 characters (reported as "deletion"), or
      * its Alt allele has at least 10 characters and matches, over its full
        length according to ``my_seq.exact_alignment``, the reference
        sequence immediately after or immediately before the position
        (reported as "tandem_duplication").

    Each output line is: BED key (chr, pos - 1, end), eight breakpoint
    description fields, four "---" gene-annotation placeholders, and the
    read counts / variant ratios of tumor and normal followed by the P-value
    column.  Afterwards ``bgzip -f`` and ``tabix -p bed`` are run on the file.

    Args:
        input_file: path of the mutation table.
        output_file_prefix: prefix of the ".bed" / ".bed.gz" output.
        reference: passed through to ``my_seq.get_seq``.
    """
    bed_path = output_file_prefix + ".bed"
    no_gene_annotation = "\t".join(["---"] * 4)

    with open(bed_path, "w") as hout, open(input_file, "r") as hin:
        header = hin.readline().rstrip("\n").split("\t")
        # Later duplicates override earlier ones, as in the original scan.
        column_of = dict((name, i) for i, name in enumerate(header))

        required = ["Ref", "Alt", "readPairNum_tumor", "variantPairNum_tumor",
                    "readPairNum_normal", "variantPairNum_normal",
                    "P-value(fhsher_realignment)"]
        missing = [name for name in required if name not in column_of]
        if missing:
            # The historical fallback (index -1, i.e. the last column) is
            # kept, but it is no longer silent.
            sys.stderr.write("make_mut_db: header of %s lacks column(s): %s\n"
                             % (input_file, ", ".join(missing)))

        ref_ind = column_of.get("Ref", -1)
        alt_ind = column_of.get("Alt", -1)
        tum_ref_ind = column_of.get("readPairNum_tumor", -1)
        tum_var_ind = column_of.get("variantPairNum_tumor", -1)
        nor_ref_ind = column_of.get("readPairNum_normal", -1)
        nor_var_ind = column_of.get("variantPairNum_normal", -1)
        fisher_ind = column_of.get("P-value(fhsher_realignment)", -1)

        for line in hin:
            F = line.rstrip("\n").split("\t")

            is_deletion = len(F[ref_ind]) >= 10
            if not is_deletion and len(F[alt_ind]) < 10:
                continue

            chrom, pos = F[0], int(F[1])
            bed_key = chrom + "\t" + str(pos - 1) + "\t" + F[2]
            read_info = "\t".join([
                F[tum_ref_ind],
                F[tum_var_ind],
                _variant_ratio(F[tum_ref_ind], F[tum_var_ind]),
                F[nor_ref_ind],
                F[nor_var_ind],
                _variant_ratio(F[nor_ref_ind], F[nor_var_ind]),
                F[fisher_ind],
            ])

            if is_deletion:
                var_info = "\t".join([
                    chrom, str(pos - 1), "+",
                    chrom, str(pos + len(F[ref_ind])), "-",
                    "---", "deletion",
                ])
            else:
                # Tandem-duplication check: the inserted sequence must match
                # the reference right after or right before the position.
                alt = F[alt_ind]
                flanking_seq_1 = my_seq.get_seq(reference, chrom, pos + 1, pos + len(alt))
                flanking_seq_1_match = my_seq.exact_alignment(alt, flanking_seq_1)
                flanking_seq_2 = my_seq.get_seq(reference, chrom, pos - len(alt) + 1, pos)
                flanking_seq_2_match = my_seq.exact_alignment(alt, flanking_seq_2)

                if flanking_seq_1_match != len(alt) and flanking_seq_2_match != len(alt):
                    continue

                var_info = "\t".join([
                    chrom, str(pos + 1), "-",
                    chrom, str(pos + len(alt)), "+",
                    "---", "tandem_duplication",
                ])

            hout.write(bed_key + "\t" + var_info + "\t" + no_gene_annotation
                       + "\t" + read_info + "\n")

    # The BED file is closed (and flushed) before the external tools run.
    subprocess.call(["bgzip", "-f", bed_path])
    subprocess.call(["tabix", "-p", "bed", bed_path + ".gz"])