def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample VCF annotated with copy-number (CN) values.

    Streams ``vcf_file`` and writes to ``vcf_out``. The header is re-emitted
    with a ``CN`` FORMAT definition added and the ``#CHROM`` line cut down to
    the nine fixed columns plus ``sample``. Every record is reduced to the
    same ten columns. A record in which none of those columns contains the
    text ``SVTYPE=BND`` consumes the next entry of ``cn_list`` and stores it
    in the sample's ``CN`` subfield (appending ``CN`` to FORMAT when absent);
    other records are written without touching ``cn_list``.

    Args:
        vcf_file: iterable of VCF text lines.
        sample: sample name; must appear in the ``#CHROM`` header line.
        vcf_out: writable file-like object. It is closed before returning.
        cn_list: indexable collection of copy-number values, one per
            non-BND record, in file order.

    Exits the process with status 1 (after a message on stderr) when
    ``sample`` is not a column of the ``#CHROM`` line, or when the first
    record is reached without a ``#CHROM`` line having been seen.
    """
    in_header = True
    header = []
    vcf = Vcf()
    i = 0
    s_index = -1

    def emit_header():
        # Register the CN FORMAT definition and flush the collected header.
        vcf.add_header(header)
        vcf.add_format('CN', 1, 'Float',
                       'Copy number of structural variant segment.')
        vcf_out.write(vcf.get_header() + '\n')

    for line in vcf_file:
        if in_header:
            # startswith() avoids the IndexError that line[1] raised on a
            # one-character line.
            if line.startswith('##'):
                header.append(line)
                continue
            if line.startswith('#'):
                columns = line.rstrip().split('\t')
                try:
                    s_index = columns.index(sample)
                except ValueError:
                    sys.stderr.write(
                        "Please input valid VCF, format field for " + sample +
                        " not found in VCF\n")
                    sys.exit(1)
                # Keep only the fixed columns plus the requested sample.
                header.append('\t'.join(map(str, columns[:9] + [sample])))
                continue
            # First record line: the header is complete, emit it once.
            in_header = False
            emit_header()

        v = line.rstrip().split('\t')
        # Reached when records start without any #CHROM line before them.
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample +
                             " not found in a provided VCF\n")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            # Compare whole FORMAT keys; a substring test would treat keys
            # such as "CNQ" as CN and then fail in index().
            format_keys = v[8].rstrip().split(":")
            if "CN" not in format_keys:
                v[8] = v[8] + ":CN"
                v[9] = v[9] + ":" + str(cn_list[i])
            else:
                cn_index = format_keys.index("CN")
                gts = v[9].rstrip().split(":")
                # Trailing sample subfields may be omitted; pad with the
                # missing-value marker so the CN slot exists.
                if len(gts) <= cn_index:
                    gts.extend(['.'] * (cn_index + 1 - len(gts)))
                gts[cn_index] = str(cn_list[i])
                v[9] = ":".join(gts)
            i += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')

    # A VCF with header lines but no records never reaches the emit above;
    # still produce the header so the output is a valid (empty) VCF.
    if in_header and header:
        emit_header()
    vcf_out.close()
    return
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample VCF annotated with copy-number (CN) values.

    Streams ``vcf_file`` and writes to ``vcf_out``. The header is re-emitted
    with a ``CN`` FORMAT definition added and the ``#CHROM`` line cut down to
    the nine fixed columns plus ``sample``. Every record is reduced to the
    same ten columns. For each record in which none of those columns contains
    the text ``SVTYPE=BND``, ``update_line_copynumber(v, cn_list, i)`` is
    called with a running index ``i`` (that helper is defined elsewhere;
    presumably it stores ``cn_list[i]`` in the record's CN subfield -- confirm
    against its definition).

    If ``cn_list`` contains ``-1``, a warning is written to stderr and every
    value is replaced by ``'.'`` before processing.

    Args:
        vcf_file: iterable of VCF text lines.
        sample: sample name; must appear in the ``#CHROM`` header line.
        vcf_out: writable file-like object. It is closed before returning.
        cn_list: list of copy-number values, one per non-BND record, in file
            order.

    Exits the process with status 1 (after a message on stderr) when
    ``sample`` is not a column of the ``#CHROM`` line, or when the first
    record is reached without a ``#CHROM`` line having been seen.
    """
    in_header = True
    header = []
    vcf = Vcf()
    i = 0
    s_index = -1

    if -1 in cn_list:
        # Newline-terminated so later stderr output does not run into the
        # warning (execution continues after this message).
        sys.stderr.write(
            'cnvnator was unable to produce a copynumber value for one or more chromosomes. All copynumber values will be set to missing.\n'
        )
        cn_list = ['.'] * len(cn_list)

    def emit_header():
        # Register the CN FORMAT definition and flush the collected header.
        vcf.add_header(header)
        vcf.add_format('CN', 1, 'Float',
                       'Copy number of structural variant segment.')
        vcf_out.write(vcf.get_header() + '\n')

    for line in vcf_file:
        if in_header:
            # startswith() avoids the IndexError that line[1] raised on a
            # one-character line.
            if line.startswith('##'):
                header.append(line)
                continue
            if line.startswith('#'):
                columns = line.rstrip().split('\t')
                try:
                    s_index = columns.index(sample)
                except ValueError:
                    sys.stderr.write(
                        "Please input valid VCF, format field for {0} not found in VCF\n"
                        .format(sample))
                    sys.exit(1)
                # Keep only the fixed columns plus the requested sample.
                header.append('\t'.join(map(str, columns[:9] + [sample])))
                continue
            # First record line: the header is complete, emit it once.
            in_header = False
            emit_header()

        v = line.rstrip().split('\t')
        # Reached when records start without any #CHROM line before them.
        if s_index == -1:
            sys.stderr.write(
                "Input a valid sample name: {0} not found in a provided VCF\n".
                format(sample))
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            update_line_copynumber(v, cn_list, i)
            i += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')

    # A VCF with header lines but no records never reaches the emit above;
    # still produce the header so the output is a valid (empty) VCF.
    if in_header and header:
        emit_header()
    vcf_out.close()
    return
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample VCF annotated with copy-number (CN) values.

    Streams ``vcf_file`` and writes to ``vcf_out``. The header is re-emitted
    with a ``CN`` FORMAT definition added and the ``#CHROM`` line cut down to
    the nine fixed columns plus ``sample``. Every record is reduced to the
    same ten columns. A record in which none of those columns contains the
    text ``SVTYPE=BND`` consumes the next entry of ``cn_list`` and stores it
    in the sample's ``CN`` subfield (appending ``CN`` to FORMAT when absent);
    other records are written without touching ``cn_list``.

    Args:
        vcf_file: iterable of VCF text lines.
        sample: sample name; must appear in the ``#CHROM`` header line.
        vcf_out: writable file-like object. It is closed before returning.
        cn_list: indexable collection of copy-number values, one per
            non-BND record, in file order.

    Exits the process with status 1 (after a message on stderr) when
    ``sample`` is not a column of the ``#CHROM`` line, or when the first
    record is reached without a ``#CHROM`` line having been seen.
    """
    in_header = True
    header = []
    vcf = Vcf()
    cn_pos = 0  # index of the next unused entry of cn_list
    s_index = -1

    def emit_header():
        # Register the CN FORMAT definition and flush the collected header.
        vcf.add_header(header)
        vcf.add_format('CN', 1, 'Float',
                       'Copy number of structural variant segment.')
        vcf_out.write(vcf.get_header() + '\n')

    for line in vcf_file:
        if in_header:
            # startswith() avoids the IndexError that line[1] raised on a
            # one-character line.
            if line.startswith('##'):
                header.append(line)
                continue
            if line.startswith('#'):
                columns = line.rstrip().split('\t')
                try:
                    s_index = columns.index(sample)
                except ValueError:
                    sys.stderr.write(
                        "Please input valid VCF, format field for " + sample +
                        " not found in VCF\n")
                    sys.exit(1)
                # Keep only the fixed columns plus the requested sample.
                header.append('\t'.join(map(str, columns[:9] + [sample])))
                continue
            # First record line: the header is complete, emit it once.
            in_header = False
            emit_header()

        v = line.rstrip().split('\t')
        # Reached when records start without any #CHROM line before them.
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample +
                             " not found in a provided VCF\n")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            cn_value = str(cn_list[cn_pos])
            # Compare whole FORMAT keys; a substring test would treat keys
            # such as "CNQ" as CN and then fail in index().
            format_keys = v[8].rstrip().split(":")
            if "CN" in format_keys:
                cn_index = format_keys.index("CN")
                gts = v[9].rstrip().split(":")
                # Trailing sample subfields may be omitted; pad with the
                # missing-value marker so the CN slot exists.
                if len(gts) <= cn_index:
                    gts.extend(['.'] * (cn_index + 1 - len(gts)))
                gts[cn_index] = cn_value
                v[9] = ":".join(gts)
            else:
                v[8] = v[8] + ":CN"
                v[9] = v[9] + ":" + cn_value
            cn_pos += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')

    # A VCF with header lines but no records never reaches the emit above;
    # still produce the header so the output is a valid (empty) VCF.
    if in_header and header:
        emit_header()
    vcf_out.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Add refined genotypes (GTR/GQR) to DEL records of a VCF.

    The header of ``vcf_in`` is re-emitted to ``vcf_out`` with the MEDGQR and
    Q10GQR INFO definitions and the GQR and GTR FORMAT definitions added.
    Records whose SVTYPE is not ``DEL``, and DEL records whose ``AF`` INFO
    value is below 0.01, are copied through unchanged. For the remaining
    records the result of ``recluster(load_df(var, exclude, sex))`` is
    appended to the diagnostics CSV and used to set MEDGQR/Q10GQR and each
    sample's GTR/GQR; samples absent from that result get GTR ``./.`` and
    GQR ``0``. ``load_df`` and ``recluster`` are defined elsewhere; the
    result is used here as a pandas-style DataFrame with the columns
    ``sample``, ``GTR``, ``gq_re``, ``med_gq_re`` and ``q10_gq_re``.

    Args:
        vcf_in: open VCF input; iterated line by line and closed on exit.
        vcf_out: writable file-like object; closed on exit.
        diag_outfile: path of the diagnostics CSV to create.
        gender_file: open file of tab-separated lines; the first field is
            the key and the second is parsed as an int. Closed on exit.
        exclude_file: open file with one entry per line, or None. Closed on
            exit when given.
    """
    vcf = Vcf()
    header = []
    in_header = True

    def emit_header():
        # Declare the added INFO/FORMAT fields and flush the header.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
        vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
        vcf_out.write(vcf.get_header() + '\n')

    # try/finally + with: every handle is released even when parsing or
    # reclustering raises part-way through the file.
    try:
        sex = {}
        for line in gender_file:
            v = line.rstrip().split('\t')
            sex[v[0]] = int(v[1])

        exclude = []
        if exclude_file is not None:
            exclude = [line.rstrip() for line in exclude_file]

        with open(diag_outfile, 'w', 4096) as outf:
            # The CSV column header is written only with the first table.
            diag_header_pending = True
            for line in vcf_in:
                if in_header:
                    if line[0] == "#":
                        header.append(line)
                        continue
                    in_header = False
                    emit_header()

                v = line.rstrip().split('\t')
                svtype = None
                for x in v[7].split(';'):
                    if x.startswith('SVTYPE='):
                        svtype = x.split('=')[1]
                        break

                # Only DEL records are refined; all others pass through.
                if svtype not in ['DEL']:
                    vcf_out.write(line)
                    continue

                var = Variant(v, vcf)
                af = float(var.get_info('AF'))
                sys.stderr.write("%s\n" % var.var_id)
                sys.stderr.write("%f\n" % af)
                if af < 0.01:
                    vcf_out.write(line)
                    continue

                df = load_df(var, exclude, sex)
                recdf = recluster(df)
                recdf.to_csv(outf, header=diag_header_pending)
                diag_header_pending = False

                first_row = recdf.iloc[0, :]
                var.set_info("MEDGQR",
                             '{:.2f}'.format(first_row.loc['med_gq_re']))
                var.set_info("Q10GQR",
                             '{:.2f}'.format(first_row.loc['q10_gq_re']))
                recdf.set_index('sample', inplace=True)
                for s in var.sample_list:
                    if s in recdf.index:
                        var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                        var.genotype(s).set_format(
                            "GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                    else:
                        var.genotype(s).set_format("GTR", "./.")
                        var.genotype(s).set_format("GQR", 0)
                vcf_out.write(
                    var.get_var_string(use_cached_gt_string=False) + '\n')

            # Header-only input never reaches the emit inside the loop.
            if in_header and header:
                emit_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if exclude_file is not None:
            exclude_file.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Add refined genotypes (GTR/GQR) to DEL records of a VCF.

    The header of ``vcf_in`` is re-emitted to ``vcf_out`` with the MEDGQR and
    Q10GQR INFO definitions and the GQR and GTR FORMAT definitions added.
    Records whose SVTYPE is not ``DEL``, and DEL records whose ``AF`` INFO
    value is below 0.01, are copied through unchanged. For the remaining
    records the result of ``recluster(load_df(var, exclude, sex))`` is
    appended to the diagnostics CSV and used to set MEDGQR/Q10GQR and each
    sample's GTR/GQR; samples absent from that result get GTR ``./.`` and
    GQR ``0``. ``load_df`` and ``recluster`` are defined elsewhere; the
    result is used here as a pandas-style DataFrame with the columns
    ``sample``, ``GTR``, ``gq_re``, ``med_gq_re`` and ``q10_gq_re``.

    Args:
        vcf_in: open VCF input; iterated line by line and closed on exit.
        vcf_out: writable file-like object; closed on exit.
        diag_outfile: path of the diagnostics CSV to create.
        gender_file: open file of tab-separated lines; the first field is
            the key and the second is parsed as an int. Closed on exit.
        exclude_file: open file with one entry per line, or None. Closed on
            exit when given.
    """
    vcf = Vcf()
    header = []
    in_header = True

    def emit_header():
        # Declare the added INFO/FORMAT fields and flush the header.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
        vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
        vcf_out.write(vcf.get_header() + '\n')

    # The finally clause releases the caller-supplied handles and the with
    # block releases the diagnostics file, even if a record fails to parse
    # or reclustering raises.
    try:
        sex = {}
        for line in gender_file:
            fields = line.rstrip().split('\t')
            sex[fields[0]] = int(fields[1])

        exclude = []
        if exclude_file is not None:
            for line in exclude_file:
                exclude.append(line.rstrip())

        with open(diag_outfile, 'w', 4096) as outf:
            # Only the first diagnostics table carries the CSV column header.
            first_table = True
            for line in vcf_in:
                if in_header:
                    if line[0] == "#":
                        header.append(line)
                        continue
                    in_header = False
                    emit_header()

                v = line.rstrip().split('\t')
                svtype = None
                for x in v[7].split(';'):
                    if x.startswith('SVTYPE='):
                        svtype = x.split('=')[1]
                        break

                # Only DEL records are refined; all others pass through.
                if svtype not in ['DEL']:
                    vcf_out.write(line)
                    continue

                var = Variant(v, vcf)
                # Parse AF once; it is both logged and used as the filter.
                allele_freq = float(var.get_info('AF'))
                sys.stderr.write("%s\n" % var.var_id)
                sys.stderr.write("%f\n" % allele_freq)
                if allele_freq < 0.01:
                    vcf_out.write(line)
                    continue

                recdf = recluster(load_df(var, exclude, sex))
                recdf.to_csv(outf, header=first_table)
                first_table = False

                summary = recdf.iloc[0, :]
                var.set_info("MEDGQR",
                             '{:.2f}'.format(summary.loc['med_gq_re']))
                var.set_info("Q10GQR",
                             '{:.2f}'.format(summary.loc['q10_gq_re']))
                recdf.set_index('sample', inplace=True)
                for s in var.sample_list:
                    if s in recdf.index:
                        var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                        var.genotype(s).set_format(
                            "GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                    else:
                        var.genotype(s).set_format("GTR", "./.")
                        var.genotype(s).set_format("GQR", 0)
                vcf_out.write(
                    var.get_var_string(use_cached_gt_string=False) + '\n')

            # Header-only input never reaches the emit inside the loop.
            if in_header and header:
                emit_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if exclude_file is not None:
            exclude_file.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file, batch_file):
    """Replace GT/GQ of DEL and MEI records with refined genotypes.

    The header of ``vcf_in`` is re-emitted to ``vcf_out`` with the MEDGQR and
    Q10GQR INFO definitions and the GQO and GTO FORMAT definitions added.
    Records whose SVTYPE is neither ``DEL`` nor ``MEI``, and DEL/MEI records
    whose ``AF`` INFO value is below 0.01, are copied through unchanged. For
    the remaining records the result of
    ``recluster(load_df(var, exclude, sex, batch))`` is appended to the
    diagnostics CSV and used to set MEDGQR/Q10GQR. For every sample the
    current GT and GQ are first copied to GTO and GQO; GT/GQ are then
    overwritten from the result, or set to ``./.`` and ``0`` for samples
    absent from it. ``load_df`` and ``recluster`` are defined elsewhere; the
    result is used here as a pandas-style DataFrame with the columns
    ``sample``, ``GTR``, ``gq_re``, ``med_gq_re`` and ``q10_gq_re``.

    Args:
        vcf_in: open VCF input; iterated line by line and closed on exit.
        vcf_out: writable file-like object; closed on exit.
        diag_outfile: path of the diagnostics CSV to create.
        gender_file: open file of tab-separated lines; the first field is
            the key and the second is parsed as an int. Closed on exit.
        exclude_file: open file with one entry per line, or None. Closed on
            exit when given.
        batch_file: open file of tab-separated ``key<TAB>label`` lines, or
            None. Closed on exit when given.

    Raises:
        RuntimeError: if ``batch_file`` contains the label ``None``.
    """
    vcf = Vcf()
    header = []
    in_header = True

    def emit_header():
        # Declare the added INFO/FORMAT fields and flush the header.
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQO', 1, 'Integer', 'Quality of original genotype')
        vcf.add_format('GTO', 1, 'String', 'Genotype before refinement')
        vcf_out.write(vcf.get_header() + '\n')

    # try/finally + with: every handle is released even when parsing or
    # reclustering raises part-way through the file.
    try:
        sex = {}
        for line in gender_file:
            v = line.rstrip().split('\t')
            sex[v[0]] = int(v[1])

        exclude = []
        if exclude_file is not None:
            exclude = [line.rstrip() for line in exclude_file]

        batch = dict()
        if batch_file is not None:
            for line in batch_file:
                fields = line.rstrip().split('\t')
                if fields[1] == 'None':
                    raise RuntimeError('Batch file contains a batch label of None. This label is reserved.')
                batch[fields[0]] = fields[1]

        with open(diag_outfile, 'w', 4096) as outf:
            # The CSV column header is written only with the first table.
            diag_header_pending = True
            for line in vcf_in:
                if in_header:
                    if line[0] == "#":
                        header.append(line)
                        continue
                    in_header = False
                    emit_header()

                v = line.rstrip().split('\t')
                svtype = None
                for x in v[7].split(';'):
                    if x.startswith('SVTYPE='):
                        svtype = x.split('=')[1]
                        break

                # bail if not DEL or MEI prior to reclassification
                # DUPs can be quite complicated in their allelic structure
                # and thus less amenable to refinement by clustering in many
                # cases. INV and BNDs are also unclear.
                # See earlier commits for code of previous attempts to refine
                # these.
                if svtype not in ['DEL', 'MEI']:
                    vcf_out.write(line)
                    continue

                var = Variant(v, vcf)
                af = float(var.get_info('AF'))
                sys.stderr.write("%s\n" % var.var_id)
                sys.stderr.write("%f\n" % af)
                if af < 0.01:
                    vcf_out.write(line)
                    continue

                df = load_df(var, exclude, sex, batch)
                recdf = recluster(df)
                recdf.to_csv(outf, header=diag_header_pending)
                diag_header_pending = False

                first_row = recdf.iloc[0, :]
                var.set_info("MEDGQR",
                             '{:.2f}'.format(first_row.loc['med_gq_re']))
                var.set_info("Q10GQR",
                             '{:.2f}'.format(first_row.loc['q10_gq_re']))
                recdf.set_index('sample', inplace=True)
                for s in var.sample_list:
                    g = var.genotype(s)
                    # Preserve the original call before overwriting it.
                    g.set_format("GTO", g.get_format("GT"))
                    g.set_format("GQO", g.get_format("GQ"))
                    if s in recdf.index:
                        g.set_format("GT", recdf.loc[s, 'GTR'])
                        g.set_format("GQ",
                                     '{:.0f}'.format(recdf.loc[s, 'gq_re']))
                    else:
                        g.set_format("GT", "./.")
                        g.set_format("GQ", 0)
                vcf_out.write(
                    var.get_var_string(use_cached_gt_string=False) + '\n')

            # Header-only input never reaches the emit inside the loop.
            if in_header and header:
                emit_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if exclude_file is not None:
            exclude_file.close()
        # Previously left open, unlike every other input handle.
        if batch_file is not None:
            batch_file.close()
    return