def test_eight_column_vcf(self):
    """Check that adding a sample to an eight-column header adds FORMAT and the sample column."""
    header_lines = [
        '##fileformat=VCFv4.2',
        '##fileDate=20090805',
        '##source=myImputationProgramV3.1',
        '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
        '##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>',
        '##phasing=partial',
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
        '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
        '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
        '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
        '##ALT=<ID=DEL,Description="DELETION">',
        '##FILTER=<ID=q10,Description="Quality below 10">',
        '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
        '#CHROM POS ID REF ALT QUAL FILTER INFO',
    ]
    vcf_header = Vcf()
    vcf_header.add_header(header_lines)

    # The rendered header is expected to carry today's date rather than the
    # date that was parsed, so patch the date line of the input list in place.
    header_lines[1] = '##fileDate=' + time.strftime('%Y%m%d')
    self.assertEqual(vcf_header.get_header(), '\n'.join(header_lines))

    vcf_header.add_sample('ScottPilgrim')
    self.assertEqual(vcf_header.sample_to_col('ScottPilgrim'), 9)

    # Same meta lines as before; only the column line gains FORMAT + sample.
    with_sample = list(header_lines)
    with_sample[1] = '##fileDate=' + time.strftime('%Y%m%d')
    with_sample[-1] = '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ScottPilgrim'
    self.assertEqual(vcf_header.get_header(), '\n'.join(with_sample))
def test_eight_column_vcf(self):
    """An eight-column header must grow FORMAT and sample columns after add_sample."""
    meta_lines = [
        '##fileformat=VCFv4.2',
        '##fileDate=20090805',
        '##source=myImputationProgramV3.1',
        '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
        '##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="H**o sapiens",taxonomy=x>',
        '##phasing=partial',
        '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
        '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">',
        '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">',
        '##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">',
        '##ALT=<ID=DEL,Description="DELETION">',
        '##FILTER=<ID=q10,Description="Quality below 10">',
        '##FILTER=<ID=s50,Description="Less than 50% of samples have data">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
        '##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">',
    ]
    eight_columns = '#CHROM POS ID REF ALT QUAL FILTER INFO'
    header_lines = meta_lines + [eight_columns]

    v = Vcf()
    v.add_header(header_lines)

    # Expected output uses the current date on the fileDate line; the list
    # handed to add_header is updated in place, as in the original test.
    header_lines[1] = '##fileDate=' + time.strftime('%Y%m%d')
    self.assertEqual(v.get_header(), '\n'.join(header_lines))

    v.add_sample('ScottPilgrim')
    self.assertEqual(v.sample_to_col('ScottPilgrim'), 9)

    expected_after_sample = (
        [meta_lines[0], '##fileDate=' + time.strftime('%Y%m%d')]
        + meta_lines[2:]
        + [eight_columns + ' FORMAT ScottPilgrim']
    )
    self.assertEqual(v.get_header(), '\n'.join(expected_after_sample))
def sname_filter(input_stream, filter_file, output_stream, complement):
    '''
    Filter a VCF stream by comparing each variant's SNAME ids with a filter file.

    Header lines are collected up to and including the #CHROM line, a FOUND
    INFO definition is added, and the header is written. For every following
    line the SNAME INFO value is compared against the loaded filter list; the
    variant is written (with FOUND set to the comma-joined matches) when
    "has matches" differs from `complement`.
    '''
    filter_list = load_filter_file(filter_file)
    vcf = Vcf()
    header_lines = []
    reading_header = True

    for record in input_stream:
        if not reading_header:
            variant = Variant(record.rstrip().split('\t'), vcf)
            names = set_from_string(variant.get_info('SNAME'))
            matches = overlapping_ids(names, filter_list)
            if bool(matches) != complement:
                variant.set_info('FOUND', ','.join(matches))
                output_stream.write(variant.get_var_string() + '\n')
            continue

        header_lines.append(record)
        if record.startswith('#CHROM'):
            # The column line ends the header: register FOUND and emit it.
            reading_header = False
            vcf.add_header(header_lines)
            vcf.add_info('FOUND', '.', 'String', 'Variant id in other file')
            output_stream.write(vcf.get_header() + '\n')
def bedpeToVcf(bedpe_file, vcf_out):
    """Convert a BEDPE stream to VCF and write it to `vcf_out`.

    Lines starting with '##' are copied into the VCF header. The single-'#'
    column line is replaced by the eight fixed VCF columns followed by
    whatever follows the 20th tab of the BEDPE column line. Every other line
    is parsed as a Bedpe record and converted with BedpeToVcfConverter.

    The header is written once, just before the first data record. If the
    input holds no data records at all, the header is still written after the
    loop so the output is never empty. `vcf_out` is closed before returning.
    """
    myvcf = Vcf()
    converter = BedpeToVcfConverter(myvcf)

    in_header = True
    # parse the bedpe data
    header = list()
    for line in bedpe_file:
        if in_header:
            if line[0:2] == '##':
                header.append(line)
                continue
            elif line[0] == '#' and line[1] != '#':
                sample_list_str = line.rstrip().split('\t', 20)[-1]
                header.append('\t'.join([
                    '#CHROM',
                    'POS',
                    'ID',
                    'REF',
                    'ALT',
                    'QUAL',
                    'FILTER',
                    'INFO',
                    sample_list_str
                ]))
                continue
            else:
                # First data record: the header is complete, emit it.
                in_header = False
                myvcf.add_header(header)
                myvcf.file_format = 'VCFv4.2'
                vcf_out.write(myvcf.get_header() + '\n')

        bedpe = Bedpe(line.rstrip().split('\t'))
        variants = converter.convert(bedpe)
        for v in variants:
            vcf_out.write(v.get_var_string() + '\n')

    # No data record was seen, so the header has not been written yet.
    if in_header:
        myvcf.add_header(header)
        myvcf.file_format = 'VCFv4.2'
        vcf_out.write(myvcf.get_header() + '\n')

    # close the VCF output file
    vcf_out.close()

    return
def run_from_args(args):
    """Stream the VCF named by args.manta_vcf to stdout with extra INFO definitions.

    Header lines are buffered up to the #CHROM line; the INFO definitions
    listed below are then added and the header is written. Each later line is
    parsed as a Variant, passed through convert_variant with args.max_ins,
    and written out.
    """
    # (id, number, type, description) tuples, registered in this order.
    extra_info_fields = (
        ('PRPOS', '1', 'String', 'Breakpoint probability dist'),
        ('PREND', '1', 'String', 'Breakpoint probability dist'),
        ('STRANDS', '.', 'String', 'Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--'),
        ('SU', '.', 'Integer', 'Number of pieces of evidence supporting the variant across all samples'),
        ('PE', '.', 'Integer', 'Number of paired-end reads supporting the variant across all samples'),
        ('SR', '.', 'Integer', 'Number of split reads supporting the variant across all samples'),
        ('INSLEN_ORIG', '.', 'Integer', 'Original insertion length'),
        ('CIPOS95', '2', 'Integer', 'Confidence interval (95%) around POS for imprecise variants'),
        ('CIEND95', '2', 'Integer', 'Confidence interval (95%) around END for imprecise variants'),
        ('SECONDARY', '0', 'Flag', 'Secondary breakend in a multi-line variant'),
    )

    vcf = Vcf()
    vcf_out = sys.stdout
    header_lines = []
    header_done = False

    with su.InputStream(args.manta_vcf) as input_stream:
        for line in input_stream:
            if header_done:
                variant = Variant(line.rstrip().split('\t'), vcf)
                convert_variant(variant, args.max_ins)
                vcf_out.write(variant.get_var_string() + "\n")
                continue

            header_lines.append(line)
            if line.startswith('#CHROM'):
                header_done = True
                vcf.add_header(header_lines)
                for field in extra_info_fields:
                    vcf.add_info(*field)
                vcf_out.write(vcf.get_header() + '\n')
def bedpeToVcf(bedpe_file, vcf_out):
    """Translate BEDPE input into VCF output.

    '##' lines become VCF header lines, the '#' column line is rewritten as
    the eight fixed VCF columns plus the text after the 20th tab, and each
    remaining line is converted through BedpeToVcfConverter. The header is
    emitted before the first record, or after the loop when the input holds
    no records. `vcf_out` is closed at the end.
    """
    myvcf = Vcf()
    converter = BedpeToVcfConverter(myvcf)
    header = []

    def emit_header():
        # Finalise the collected header and write it out as VCFv4.2.
        myvcf.add_header(header)
        myvcf.file_format = 'VCFv4.2'
        vcf_out.write(myvcf.get_header() + '\n')

    in_header = True
    for line in bedpe_file:
        if in_header:
            if line.startswith('##'):
                header.append(line)
                continue
            if line[0] == '#' and line[1] != '#':
                sample_list_str = line.rstrip().split('\t', 20)[-1]
                vcf_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT',
                               'QUAL', 'FILTER', 'INFO', sample_list_str]
                header.append('\t'.join(vcf_columns))
                continue
            # First non-header line reached.
            in_header = False
            emit_header()

        record = Bedpe(line.rstrip().split('\t'))
        for variant in converter.convert(record):
            vcf_out.write(variant.get_var_string() + '\n')

    # Header-only input: nothing triggered the header write above.
    if in_header:
        emit_header()
    vcf_out.close()
    return
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample VCF with copy-number values applied to non-BND records.

    If any entry of `cn_list` is -1, a warning is written to stderr and every
    entry is replaced by '.'. The output keeps the first nine columns plus
    the column of `sample`. For each record that does not contain
    "SVTYPE=BND", update_line_copynumber is called with the next index into
    `cn_list`. Exits with status 1 when `sample` is missing from the #CHROM
    line. `vcf_out` is closed before returning.
    """
    header = []
    vcf = Vcf()
    in_header = True
    cn_position = 0
    s_index = -1

    if -1 in cn_list:
        sys.stderr.write(
            'cnvnator was unable to produce a copynumber value for one or more chromosomes. All copynumber values will be set to missing.'
        )
        cn_list = ['.'] * len(cn_list)

    for line in vcf_file:
        if in_header:
            if line[0] == '#':
                if line[1] == '#':
                    header.append(line)
                    continue
                # Column line: find the sample, then keep only its column.
                columns = line.rstrip().split('\t')
                try:
                    s_index = columns.index(sample)
                except ValueError:
                    sys.stderr.write(
                        "Please input valid VCF, format field for {0} not found in VCF"
                        .format(sample))
                    sys.exit(1)
                header.append('\t'.join(map(str, columns[:9] + [sample])))
                continue
            in_header = False
            vcf.add_header(header)
            vcf.add_format('CN', 1, 'Float',
                           'Copy number of structural variant segment.')
            vcf_out.write(vcf.get_header() + '\n')

        fields = line.rstrip().split('\t')
        # XXX Is this second check necessary? Wouldn't this be handled above? Missing header would hit this?
        if s_index == -1:
            sys.stderr.write(
                "Input a valid sample name: {0} not found in a provided VCF".
                format(sample))
            sys.exit(1)
        fields = fields[:9] + [fields[s_index]]
        is_bnd = any("SVTYPE=BND" in field for field in fields)
        if not is_bnd:
            update_line_copynumber(fields, cn_list, cn_position)
            cn_position += 1
        # write the VCF
        vcf_out.write('\t'.join(fields) + '\n')

    vcf_out.close()
    return
def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample VCF with a CN FORMAT value on every non-BND record.

    The output keeps the first nine VCF columns plus the column of `sample`.
    For each record that does not contain "SVTYPE=BND", the next value of
    `cn_list` is stored under the CN key. The key is appended to FORMAT when
    absent, otherwise the existing value is overwritten. Exits with status 1
    when `sample` is not found on the #CHROM line. `vcf_out` is closed before
    returning.
    """
    #go through the VCF and add the read depth annotations
    in_header = True
    header = []
    vcf = Vcf()
    i = 0
    s_index = -1
    for line in vcf_file:
        if in_header:
            if line[0] == '#' and line[1] == '#':
                header.append(line)
                continue
            if line[0] == '#' and line[1] != '#':
                try:
                    s_index = line.rstrip().split('\t').index(sample)
                except ValueError:
                    sys.stderr.write(
                        "Please input valid VCF, format field for " + sample +
                        " not found in VCF")
                    sys.exit(1)
                line = '\t'.join(
                    map(str, line.rstrip().split('\t')[:9] + [sample]))
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_format('CN', 1, 'Float',
                               'Copy number of structural variant segment.')
                vcf_out.write(vcf.get_header() + '\n')
        v = line.rstrip().split('\t')
        # XXX Is this second check necessary? Wouldn't this be handled above? Missing header would hit this?
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample +
                             " not found in a provided VCF")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            # Test membership on the split key list. A substring test would
            # treat keys such as "CNQ" as if CN were present and then fail
            # in index("CN").
            format_keys = v[8].rstrip().split(":")
            if "CN" not in format_keys:
                v[8] = v[8] + ":CN"
                v[9] = v[9] + ":" + str(cn_list[i])
            else:
                gts = v[9].rstrip().split(":")
                gts[format_keys.index("CN")] = str(cn_list[i])
                v[9] = ":".join(gts)
            i += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')
    vcf_out.close()
    return
def test_add_info_after(self):
    """add_info_after must place the new INFO line directly after the named INFO line."""
    header_lines = [
        '##fileformat=VCFv4.2',
        '##fileDate=20090805',
        '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
        '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003',
    ]
    extra_line = '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">'

    # (existing INFO id, index at which the new line is expected to appear)
    cases = [('DP', 4), ('AF', 5)]
    for anchor_id, insert_at in cases:
        vcf_header = Vcf()
        vcf_header.add_header(header_lines)
        vcf_header.add_info_after(anchor_id, 'DB', 0, 'Flag', 'dbSNP membership, build 129')

        expected = header_lines[0:insert_at] + [extra_line] + header_lines[insert_at:]
        # The rendered header is expected to carry today's date.
        expected[1] = '##fileDate=' + time.strftime('%Y%m%d')
        self.assertEqual(vcf_header.get_header(), '\n'.join(expected))
def test_add_info_after(self):
    """Insert a DB INFO line after DP, then after AF, and compare the rendered header."""
    header_lines = [
        '##fileformat=VCFv4.2',
        '##fileDate=20090805',
        '##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta',
        '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">',
        '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003',
    ]
    extra_line = '##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">'

    def check_insert(after_id, position):
        # Build a fresh header, insert DB after `after_id`, compare output.
        parsed = Vcf()
        parsed.add_header(header_lines)
        parsed.add_info_after(after_id, 'DB', 0, 'Flag', 'dbSNP membership, build 129')
        wanted = header_lines[0:position] + [extra_line] + header_lines[position:]
        wanted[1] = '##fileDate=' + time.strftime('%Y%m%d')
        self.assertEqual(parsed.get_header(), '\n'.join(wanted))

    check_insert('DP', 4)
    check_insert('AF', 5)
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    """Annotate every variant with SIL_GT_AVG and dump the per-variant tables as CSV.

    `gender_file` supplies tab-separated "sample<TAB>integer" rows. For each
    variant line a table is built with load_df, passed to get_silhouette, and
    the first row's 'sil_gt_avg' value is stored (two decimals) in the
    SIL_GT_AVG INFO field. The silhouette table is appended to
    `diag_outfile`; only the first table written carries the CSV header row.
    All four file handles are closed before returning.
    """
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    for gender_row in gender_file:
        sample_name, code = gender_row.rstrip().split('\t')[:2]
        sex[sample_name] = int(code)

    outf = open(diag_outfile, 'w', 4096)
    csv_needs_header = True

    for line in vcf_in:
        if in_header:
            if line[0] == "#":
                header.append(line)
                continue
            # First record: finish and emit the header.
            in_header = False
            vcf.add_header(header)
            vcf.add_info('SIL_GT_AVG', '1', 'Float',
                         'Average silhouette of genotype clusters')
            vcf_out.write(vcf.get_header() + '\n')

        var = Variant(line.rstrip().split('\t'), vcf)
        silhouettes = get_silhouette(load_df(var, sex))
        avg_column = silhouettes.columns.get_loc('sil_gt_avg')
        var.info['SIL_GT_AVG'] = '%0.2f' % silhouettes.iloc[0, avg_column]
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

        silhouettes.to_csv(outf, header=csv_needs_header)
        csv_needs_header = False

    vcf_out.close()
    vcf_in.close()
    outf.close()
    gender_file.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    """Add the SIL_GT_AVG INFO value to each variant and log silhouette tables.

    Reads sample -> integer sex codes from `gender_file`, then streams
    `vcf_in`. The header gains a SIL_GT_AVG definition and is written before
    the first variant. Each variant's silhouette table (get_silhouette over
    load_df) is written to `diag_outfile` as CSV, with column names on the
    first table only. Closes every handle it was given plus the diagnostics
    file.
    """
    vcf = Vcf()
    header = []

    sex = {}
    for row in gender_file:
        parts = row.rstrip().split('\t')
        sex[parts[0]] = int(parts[1])

    outf = open(diag_outfile, 'w', 4096)
    tables_written = 0
    header_pending = True

    for line in vcf_in:
        if header_pending and line[0] == "#":
            header.append(line)
            continue
        if header_pending:
            header_pending = False
            vcf.add_header(header)
            vcf.add_info('SIL_GT_AVG', '1', 'Float',
                         'Average silhouette of genotype clusters')
            vcf_out.write(vcf.get_header() + '\n')

        var = Variant(line.rstrip().split('\t'), vcf)
        df = load_df(var, sex)
        df1 = get_silhouette(df)
        sil_avg = df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
        var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

        # Column names go out with the first table only.
        df1.to_csv(outf, header=(tables_written == 0))
        tables_written += 1

    for handle in (vcf_out, vcf_in, outf, gender_file):
        handle.close()
    return
def _set_sample_copynumber(fields, copynumber):
    """Store `copynumber` under the CN key of a ten-column record, in place.

    fields[8] is the colon-separated FORMAT key list and fields[9] the
    matching sample values. CN is appended when absent and overwritten when
    present. Presence is tested on the split key list: a substring test would
    mistake keys like "CNQ" for CN and then fail in index("CN").
    """
    format_keys = fields[8].rstrip().split(":")
    if "CN" in format_keys:
        sample_values = fields[9].rstrip().split(":")
        sample_values[format_keys.index("CN")] = str(copynumber)
        fields[9] = ":".join(sample_values)
    else:
        fields[8] = fields[8] + ":CN"
        fields[9] = fields[9] + ":" + str(copynumber)


def write_copynumber(vcf_file, sample, vcf_out, cn_list):
    """Write a single-sample VCF annotated with copy-number (CN) values.

    Only the first nine columns and the column of `sample` are kept. Records
    containing "SVTYPE=BND" are copied without a CN value; every other record
    consumes the next entry of `cn_list`. Exits with status 1 when `sample`
    is not on the #CHROM line. `vcf_out` is closed before returning.
    """
    #go through the VCF and add the read depth annotations
    in_header = True
    header = []
    vcf = Vcf()
    i = 0
    s_index = -1
    for line in vcf_file:
        if in_header:
            if line[0] == '#' and line[1] == '#':
                header.append(line)
                continue
            if line[0] == '#' and line[1] != '#':
                try:
                    s_index = line.rstrip().split('\t').index(sample)
                except ValueError:
                    sys.stderr.write("Please input valid VCF, format field for " + sample + " not found in VCF")
                    sys.exit(1)
                line = '\t'.join(map(str, line.rstrip().split('\t')[:9] + [sample]))
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_format('CN', 1, 'Float', 'Copy number of structural variant segment.')
                vcf_out.write(vcf.get_header() + '\n')
        v = line.rstrip().split('\t')
        # XXX Is this second check necessary? Wouldn't this be handled above? Missing header would hit this?
        if s_index == -1:
            sys.stderr.write("Input a valid sample name: " + sample + " not found in a provided VCF")
            sys.exit(1)
        v = v[:9] + [v[s_index]]
        if not any("SVTYPE=BND" in s for s in v):
            _set_sample_copynumber(v, cn_list[i])
            i += 1
        # write the VCF
        vcf_out.write('\t'.join(v) + '\n')
    vcf_out.close()
    return
def sv_classify(vcf_in, vcf_out, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, p_cnv, het_del_fit, hom_del_fit, params, diag_outfile, method):
    """Reclassify DEL/DUP calls in a VCF using read-depth support.

    Records whose SVTYPE is not DEL or DUP are copied unchanged. A DEL that
    intersects an entry of `ae_dict` (when given) is rewritten as an MEI. For
    the remaining records with at least one genotype other than "./." or
    "0/0", `method` selects the support test: 'large_sample', 'naive_bayes'
    or 'hybrid'. Any other value leaves the record unsupported. Supported
    records are copied unchanged; unsupported ones are replaced by the
    strings from to_bnd_strings. When `diag_outfile` is given, one
    tab-separated diagnostics row is written per tested record.

    All handles passed in, and the diagnostics file, are closed on return.
    """
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    # read sample genders
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)
        outf.write("varid\torig_svtype\tsvlen\tnum_pos_samps\tnb_support\tls_support\thybrid_support\thas_rd_support\n")

    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf_out.write(vcf.get_header() + '\n')

        # Cheap SVTYPE check on the raw INFO column before full parsing.
        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break

        # bail if not DEL or DUP prior to reclassification
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue

        var = Variant(v, vcf)

        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        # count positively genotyped samples
        num_pos_samps = 0
        for s in var.sample_list:
            if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                num_pos_samps += 1

        nb_support = False
        ls_support = False
        hybrid_support = False
        has_rd_support = False

        if num_pos_samps == 0:
            vcf_out.write(line)
        else:
            df = load_df(var, exclude, sex)
            if method == 'large_sample':
                ls_support = has_rd_support_by_ls(df, slope_threshold, rsquared_threshold, num_pos_samps)
                has_rd_support = ls_support
            elif method == 'naive_bayes':
                nb_support = has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params, p_cnv)
                has_rd_support = nb_support
            elif method == 'hybrid':
                ls_support, nb_support, hybrid_support = has_rd_support_hybrid(df, het_del_fit, hom_del_fit, params, p_cnv, slope_threshold, rsquared_threshold, num_pos_samps)
                has_rd_support = hybrid_support

            if has_rd_support:
                vcf_out.write(line)
            else:
                for m_var in to_bnd_strings(var, True):
                    vcf_out.write(m_var + '\n')

            if diag_outfile is not None:
                svlen = df['svlen'][0]
                outf.write(var.var_id+"\t"+svtype+"\t"+str(svlen)+"\t"+str(num_pos_samps)+"\t"+str(nb_support)+"\t"+str(ls_support)+"\t"+str(hybrid_support)+"\t"+str(has_rd_support)+"\n")

    vcf_out.close()
    if diag_outfile is not None:
        outf.close()
    vcf_in.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()
    return
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold):
    """Reclassify DEL/DUP records from `vcf_in` and write the result to stdout.

    Records with any other SVTYPE are copied unchanged. A DEL intersecting
    `ae_dict` (when given) is rewritten as an MEI. Otherwise the number of
    non-excluded samples with a genotype other than "./." or "0/0" picks the
    depth test: fewer than 10 uses has_low_freq_depth_support, otherwise
    has_high_freq_depth_support. Supported records are copied unchanged;
    unsupported ones are replaced by the strings from to_bnd_strings.
    """
    vcf_out = sys.stdout
    vcf = Vcf()
    header = []
    in_header = True
    min_pos_samps_for_regression = 10

    # read sample genders
    gender = {}
    for gender_row in gender_file:
        cols = gender_row.rstrip().split('\t')
        gender[cols[0]] = int(cols[1])

    exclude = []
    if exclude_file is not None:
        exclude = [entry.rstrip() for entry in exclude_file]

    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            in_header = False
            vcf.add_header(header)
            # write the output header
            vcf_out.write(vcf.get_header() + '\n')

        # Cheap SVTYPE lookup on the raw INFO column before full parsing.
        v = line.rstrip().split('\t')
        svtype = next(
            (item.split('=')[1] for item in v[7].split(';') if item.startswith('SVTYPE=')),
            None)

        # Anything other than DEL or DUP passes through untouched.
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue

        # parse the VCF line
        var = Variant(v, vcf, True)

        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        # annotate based on read depth
        if var.info['SVTYPE'] not in ['DEL', 'DUP']:
            continue

        # count the number of positively genotyped samples
        num_pos_samps = 0
        for s in var.sample_list:
            if s in exclude:
                continue
            if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                num_pos_samps += 1

        if num_pos_samps < min_pos_samps_for_regression:
            supported = has_low_freq_depth_support(var, gender, exclude)
        else:
            supported = has_high_freq_depth_support(var, gender, exclude, slope_threshold, rsquared_threshold)

        if supported:
            # Keep the original line verbatim.
            vcf_out.write(line)
        else:
            for m_var in to_bnd_strings(var):
                vcf_out.write(m_var + '\n')

    vcf_out.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file, batch_file):
    """Refine DEL/MEI genotypes by reclustering and write the updated VCF.

    `gender_file` holds tab-separated "sample<TAB>integer" rows.
    `exclude_file` (optional) lists one entry per line. `batch_file`
    (optional) holds "sample<TAB>batch" rows.

    Records whose SVTYPE is not DEL or MEI, or whose AF INFO value is below
    0.01, are copied unchanged. For the rest, recluster(load_df(...)) yields
    a table that is appended to `diag_outfile` as CSV. MEDGQR/Q10GQR are set
    from its first row. Each sample's original GT/GQ are saved as GTO/GQO and
    replaced by the table's 'GTR'/'gq_re' values, or "./." and 0 for samples
    missing from the table.

    Every handle passed in, including `batch_file`, is closed on return.

    Raises:
        RuntimeError: if a batch label is the literal string 'None'.
    """
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    for line in gender_file:
        v = line.rstrip().split('\t')
        sex[v[0]] = int(v[1])

    exclude = []
    if exclude_file is not None:
        for line in exclude_file:
            exclude.append(line.rstrip())

    batch = dict()
    if batch_file is not None:
        for line in batch_file:
            fields = line.rstrip().split('\t')
            if fields[1] == 'None':
                raise RuntimeError('Batch file contains a batch label of None. This label is reserved.')
            batch[fields[0]] = fields[1]

    outf = open(diag_outfile, 'w', 4096)
    ct = 1

    for line in vcf_in:
        if in_header:
            if line[0] == "#":
                header.append(line)
                continue
            else:
                in_header = False
                vcf.add_header(header)
                vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
                vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
                vcf.add_format('GQO', 1, 'Integer', 'Quality of original genotype')
                vcf.add_format('GTO', 1, 'String', 'Genotype before refinement')
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        info = v[7].split(';')
        svtype = None
        for x in info:
            if x.startswith('SVTYPE='):
                svtype = x.split('=')[1]
                break
        # bail if not DEL prior to reclassification
        # DUPs can be quite complicated in their allelic structure
        # and thus less amenable to refinement by clustering in many cases
        # INV and BNDs are also unclear.
        # See earlier commits for code of previous attempts to refine these.
        if svtype not in ['DEL', 'MEI']:
            vcf_out.write(line)
            continue

        var = Variant(v, vcf)
        sys.stderr.write("%s\n" % var.var_id)
        sys.stderr.write("%f\n" % float(var.get_info('AF')))
        if float(var.get_info('AF')) < 0.01:
            vcf_out.write(line)
        else:
            df = load_df(var, exclude, sex, batch)
            recdf = recluster(df)
            # Only the first table written carries the CSV header row.
            if ct == 1:
                recdf.to_csv(outf, header=True)
                ct += 1
            else:
                recdf.to_csv(outf, header=False)
            var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
            var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
            recdf.set_index('sample', inplace=True)
            for s in var.sample_list:
                g = var.genotype(s)
                # Preserve the original call before overwriting it.
                g.set_format("GTO", g.get_format("GT"))
                g.set_format("GQO", g.get_format("GQ"))
                if s in recdf.index:
                    var.genotype(s).set_format("GT", recdf.loc[s, 'GTR'])
                    var.genotype(s).set_format("GQ", '{:.0f}'.format(recdf.loc[s, 'gq_re']))
                else:
                    var.genotype(s).set_format("GT", "./.")
                    var.genotype(s).set_format("GQ", 0)
            vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    outf.close()
    if exclude_file is not None:
        exclude_file.close()
    # batch_file is consumed above like the other inputs; close it as well.
    if batch_file is not None:
        batch_file.close()
    return
def sv_classify(vcf_in, gender_file, exclude_file, ae_dict, f_overlap, slope_threshold, rsquared_threshold, het_del_fit, hom_del_fit, params, diag_outfile):
    """Reclassify DEL/DUP records from `vcf_in`, writing the VCF to stdout.

    Records with another SVTYPE, and records on chromosomes 'X' or 'Y', are
    copied unchanged. A DEL intersecting `ae_dict` (when given) is rewritten
    as an MEI. For other records with at least one positively genotyped,
    non-excluded sample, the low-frequency test (fewer than 10 such samples)
    or the high-frequency test decides whether the line is kept or replaced
    by the strings from to_bnd_strings. The naive Bayes result is computed
    but only reported in the optional diagnostics file.
    """
    vcf_out = sys.stdout
    vcf = Vcf()
    header = []
    in_header = True
    min_pos_samps_for_regression = 10

    # read sample genders
    sex = {}
    for gender_row in gender_file:
        cols = gender_row.rstrip().split('\t')
        sex[cols[0]] = int(cols[1])

    exclude = []
    if exclude_file is not None:
        exclude = [entry.rstrip() for entry in exclude_file]

    if diag_outfile is not None:
        outf = open(diag_outfile, 'w', 4096)

    for line in vcf_in:
        if in_header:
            if line[0] == '#':
                header.append(line)
                continue
            in_header = False
            vcf.add_header(header)
            vcf_out.write(vcf.get_header() + '\n')

        # Cheap SVTYPE lookup on the raw INFO column before full parsing.
        v = line.rstrip().split('\t')
        svtype = next(
            (item.split('=')[1] for item in v[7].split(';') if item.startswith('SVTYPE=')),
            None)

        # Anything other than DEL or DUP passes through untouched.
        if svtype not in ['DEL', 'DUP']:
            vcf_out.write(line)
            continue

        # parse the VCF line
        var = Variant(v, vcf, True)

        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] in ['DEL']:
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith('SINE') or ae.startswith('LINE') or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        # for now, don't worry about sex chromosomes
        if var.chrom == 'X' or var.chrom == 'Y':
            vcf_out.write(line)
            continue

        # count positively genotyped samples
        num_pos_samps = 0
        for s in var.sample_list:
            if s in exclude:
                continue
            if var.genotype(s).get_format('GT') not in ["./.", "0/0"]:
                num_pos_samps += 1

        if num_pos_samps == 0:
            vcf_out.write(line)
            continue

        high_freq_support = False
        low_freq_support = False
        nb_support = False

        df = load_df(var, exclude, sex)
        if has_rd_support_by_nb(df, het_del_fit, hom_del_fit, params):
            nb_support = True

        if num_pos_samps < min_pos_samps_for_regression:
            if has_low_freq_depth_support(df):
                low_freq_support = True
        else:
            if has_high_freq_depth_support(df, slope_threshold, rsquared_threshold):
                high_freq_support = True

        if low_freq_support or high_freq_support:
            vcf_out.write(line)
        else:
            for m_var in to_bnd_strings(var, True):
                vcf_out.write(m_var + '\n')

        if diag_outfile is not None:
            svlen = df['svlen'][0]
            outf.write(var.var_id + "\t" + svtype + "\t" + str(svlen) + "\t"
                       + str(num_pos_samps) + "\t" + str(nb_support) + "\t"
                       + str(high_freq_support) + "\t" + str(low_freq_support) + "\n")

    vcf_out.close()
    if diag_outfile is not None:
        outf.close()
    return
def l_cluster_by_line(file_name, tempdir, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
    """Group consecutive breakpoints from a VCF and pass each group to r_cluster.

    The header is rebuilt and written to stdout: sample names are read from
    '##SAMPLE' lines (skipping 'VARIOUS'), and the column line is cut to
    eight columns, or to nine plus the sample names when `include_genotypes`
    is set. An ALG INFO definition is added.

    A breakpoint joins the current group when the group is empty, or when it
    has the same left chromosome and SV type and its left start does not
    exceed the group's largest left end. Otherwise the group is flushed
    through r_cluster and a new one is started. The final group is flushed
    after the input is exhausted.
    """
    v_id = 0
    header = []
    vcf = Vcf()
    vcf_out = sys.stdout
    in_header = True

    with InputStream(file_name, tempdir) as vcf_stream:
        BP_l = []
        BP_sv_type = ''
        BP_max_end_l = -1
        BP_chr_l = ''
        sample_order = []

        for line in vcf_stream:
            if in_header:
                if line.startswith('##'):
                    header.append(line)
                    continue
                elif line.startswith('#CHROM'):
                    columns = line.rstrip().split('\t')
                    # Collect sample names from the ##SAMPLE meta lines.
                    for headline in header:
                        if headline[:8] != '##SAMPLE':
                            continue
                        sample_name = headline.rstrip()[13:-1]
                        if sample_name != 'VARIOUS':
                            sample_order.append(sample_name)

                    if include_genotypes:
                        # Drop any existing sample columns (e.g. VARIOUS).
                        columns = columns[:9]
                        columns.extend(sample_order)
                    else:
                        # No FORMAT column without genotypes.
                        columns = columns[:8]
                    header.append('\t'.join(columns))

                    in_header = False
                    vcf.add_header(header)
                    vcf.add_info('ALG', '1', 'String',
                                 'Algorithm used to merge this breakpoint')
                    if include_genotypes:
                        vcf_out.write(vcf.get_header() + '\n')
                    else:
                        vcf_out.write(vcf.get_header(include_samples=False) + '\n')
                    continue

            b = Breakpoint(l_bp.parse_vcf_record(line),
                           percent_slop=percent_slop,
                           fixed_slop=fixed_slop)

            extends_group = (not BP_l) or (
                (b.left.start <= BP_max_end_l)
                and (b.left.chrom == BP_chr_l)
                and (b.sv_type == BP_sv_type))

            if extends_group:
                BP_l.append(b)
                BP_max_end_l = max(BP_max_end_l, b.left.end)
            else:
                v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf,
                                 vcf_out, include_genotypes, weighting_scheme)
                BP_l = [b]
                BP_max_end_l = b.left.end
            BP_chr_l = b.left.chrom
            BP_sv_type = b.sv_type

        if BP_l:
            v_id = r_cluster(BP_l, sample_order, v_id, use_product, vcf,
                             vcf_out, include_genotypes, weighting_scheme)
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Add refined genotypes (GTR/GQR) to DEL records by reclustering.

    Records whose SVTYPE is not DEL, or whose AF INFO value is below 0.01,
    are copied unchanged. For the others, recluster(load_df(...)) yields a
    table that is appended to `diag_outfile` as CSV (column names on the
    first table only). MEDGQR/Q10GQR are set from its first row, and every
    sample receives GTR/GQR from the table, or "./." and 0 when it is absent
    from it. All handles passed in, and the diagnostics file, are closed.
    """
    vcf = Vcf()
    header = []
    in_header = True

    sex = {}
    for gender_row in gender_file:
        cols = gender_row.rstrip().split('\t')
        sex[cols[0]] = int(cols[1])

    exclude = []
    if exclude_file is not None:
        exclude = [entry.rstrip() for entry in exclude_file]

    outf = open(diag_outfile, 'w', 4096)
    csv_needs_header = True

    for line in vcf_in:
        if in_header:
            if line[0] == "#":
                header.append(line)
                continue
            in_header = False
            vcf.add_header(header)
            vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
            vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
            vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
            vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
            vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        svtype = next(
            (item.split('=')[1] for item in v[7].split(';') if item.startswith('SVTYPE=')),
            None)

        # Only DEL records are refined; everything else passes through.
        if svtype not in ['DEL']:
            vcf_out.write(line)
            continue

        var = Variant(v, vcf)
        sys.stderr.write("%s\n" % var.var_id)
        sys.stderr.write("%f\n" % float(var.get_info('AF')))

        if float(var.get_info('AF')) < 0.01:
            vcf_out.write(line)
            continue

        recdf = recluster(load_df(var, exclude, sex))
        recdf.to_csv(outf, header=csv_needs_header)
        csv_needs_header = False

        first_row = recdf.iloc[0, :]
        var.set_info("MEDGQR", '{:.2f}'.format(first_row.loc['med_gq_re']))
        var.set_info("Q10GQR", '{:.2f}'.format(first_row.loc['q10_gq_re']))
        recdf.set_index('sample', inplace=True)

        for s in var.sample_list:
            if s in recdf.index:
                refined_gt = recdf.loc[s, 'GTR']
                refined_gq = '{:.2f}'.format(recdf.loc[s, 'gq_re'])
            else:
                refined_gt = "./."
                refined_gq = 0
            var.genotype(s).set_format("GTR", refined_gt)
            var.genotype(s).set_format("GQR", refined_gq)

        vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    outf.close()
    if exclude_file is not None:
        exclude_file.close()
    return
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    """Annotate the BEDPE records of ``aFile`` against those of ``bFile``.

    Every non-header record of ``bFile`` is loaded into memory.  The header
    lines of ``aFile`` are re-emitted on ``bedpe_out`` with ``<cohort>_AF``
    and ``<cohort>_VarID`` INFO definitions added, followed by a BEDPE
    column line.  Each ``aFile`` record is then passed through ``add``
    against every ``bFile`` record and written using ``get_var_string``.

    Args:
        aFile: path of the query BEDPE file, or the literal string "stdin".
            Paths ending in ``.gz`` are opened with gzip.
        bFile: path of the cohort BEDPE file; same conventions as aFile.
        bedpe_out: writable file-like object.  It is not closed here.
        max_distance: forwarded unchanged to ``add``.
        pass_prefix: lines starting with this prefix are treated as header
            lines.  None disables header detection for both inputs.
        cohort_name: label used for the added INFO tags; when None the
            basename of bFile is used.

    Exits with status 1 if a record in either file has no allele frequency.
    """
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!

    def _open_input(path):
        # "stdin" is a sentinel value, not a file name
        if path == "stdin":
            return sys.stdin
        if path.endswith('.gz'):
            return gzip.open(path, 'rb')
        return open(path, 'r')

    def _close_input(handle):
        # never close the process-wide stdin
        if handle is not sys.stdin:
            handle.close()

    bList = list()
    headerObj = Vcf()  # co-opt the VCF header object
    b_basename = str(str(bFile).split('/')[-1])
    if cohort_name is None:
        cohort_name = b_basename

    bData = _open_input(bFile)
    try:
        for bLine in bData:
            # guard against pass_prefix=None, as the aFile loop below does
            if pass_prefix is not None and bLine.startswith(pass_prefix):
                continue
            bentry = Bedpe(bLine.rstrip().split('\t'))
            if bentry.af is None:
                sys.stderr.write('No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            bList.append(bentry)
    finally:
        _close_input(bData)

    aData = _open_input(aFile)
    try:
        in_header = True
        header_lines = []
        # tab-joined columns that follow the 14 fixed BEDPE columns, if any
        sample_list = ''
        for aLine in aData:
            if pass_prefix is not None and aLine.startswith(pass_prefix):
                if aLine.startswith('#') and not aLine.startswith('##'):
                    columns = aLine.rstrip().split('\t', 14)
                    # only take the remainder when it exists; otherwise the
                    # last fixed column would be duplicated in the output
                    sample_list = columns[14] if len(columns) > 14 else ''
                else:
                    header_lines.append(aLine)
                continue

            if in_header:
                # first record reached: emit the header exactly once
                headerObj.add_header(header_lines)
                headerObj.add_info(cohort_name + '_AF', '.', 'Float', 'Allele frequency(ies) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_basename + ')')
                headerObj.add_info(cohort_name + '_VarID', '.', 'Integer', 'List of Variant ID(s) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_basename + ')')
                header = headerObj.get_header()
                # drop the last header line; the BEDPE column line replaces it
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                bedpe_columns = ['#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B', 'END_B', 'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER', 'INFO_A', 'INFO_B']
                if sample_list:
                    bedpe_columns.append(sample_list)
                bedpe_out.write('\t'.join(bedpe_columns) + '\n')
                in_header = False

            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write('No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            for b in bList:
                add(a, b, max_distance)
            bedpe_out.write(get_var_string(a, cohort_name))
    finally:
        _close_input(aData)
def execute(self, output_handle=sys.stdout):
    """Annotate every record of ``self.vcf_stream`` with AF, NSAMP and MSQ.

    The header is written to ``output_handle`` once the ``#CHROM`` line is
    reached, with the three INFO definitions added.  For each record the GT
    of every sample is tallied per allele index; AF holds the frequency of
    each ALT allele (or '.' when no allele was counted), NSAMP the number of
    samples whose allele indexes sum to more than zero, and MSQ the value
    returned by ``self.calc_msq``.  ``output_handle`` is closed at the end.
    """
    vcf = Vcf()
    vcf_out = output_handle
    header = []
    in_header = True

    # read input VCF
    for line in self.vcf_stream:
        if in_header:
            if line.startswith('##'):
                header.append(line)
                continue
            if line.startswith('#CHROM'):
                header.append(line.rstrip())
                in_header = False
                vcf.add_header(header)
                vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                # write header
                vcf_out.write(vcf.get_header() + '\n')
                continue

        fields = line.rstrip().split('\t')
        var = Variant(fields, vcf, fixed_genotypes=True)

        # one counter per allele index: slot 0 is REF, then one per ALT
        num_alt = len(var.alt.split(','))
        alleles = [0] * (num_alt + 1)
        num_samp = 0

        gt_strings = [var.genotype(s).get_format('GT') for s in var.sample_list]
        for gt_string in gt_strings:
            # skip genotypes with any missing allele
            if '.' in gt_string:
                continue
            parts = gt_string.split('/')
            if len(parts) == 1:
                # no '/' separator found, try the '|' separator
                parts = gt_string.split('|')
            indexes = [int(p) for p in parts]

            for idx in indexes:
                alleles[idx] += 1

            # iterate the number of non-reference samples
            if sum(indexes) > 0:
                num_samp += 1

        # populate AF
        allele_sum = float(sum(alleles))
        if allele_sum > 0:
            var.info['AF'] = ','.join(
                '%.4g' % (count / allele_sum) for count in alleles[1:])
        else:
            var.info['AF'] = ','.join(['.'] * num_alt)

        # populate NSAMP
        var.info['NSAMP'] = num_samp
        var.info['MSQ'] = self.calc_msq(var)

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

    vcf_out.close()
def execute(self, output_handle=sys.stdout):
    """Annotate every record of ``self.vcf_stream`` with AF, NSAMP and MSQ.

    The header is written to ``output_handle`` once the ``#CHROM`` line is
    reached, with the three INFO definitions added.  For each record the GT
    of every genotype is converted to allele indexes with
    ``self.numeric_alleles`` and tallied.  AF holds the frequency of each
    ALT allele (or '.' when no allele was counted), NSAMP the number of
    genotypes whose indexes sum to more than zero, and MSQ the mean SQ of
    those genotypes ('.' when there are none).  ``output_handle`` is closed
    at the end.
    """
    vcf = Vcf()
    vcf_out = output_handle
    header = []
    in_header = True

    # read input VCF
    for line in self.vcf_stream:
        if in_header:
            if line.startswith('##'):
                header.append(line)
                continue
            if line.startswith('#CHROM'):
                header.append(line.rstrip())
                in_header = False
                vcf.add_header(header)
                vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
                vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
                vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')

                # write header
                vcf_out.write(vcf.get_header() + '\n')
                continue

        fields = line.rstrip().split('\t')
        var = Variant(fields, vcf)

        # one counter per allele index: slot 0 is REF, then one per ALT
        num_alt = len(var.alt.split(','))
        alleles = [0] * (num_alt + 1)
        num_samp = 0
        sum_sq = 0.0

        for genotype in var.genotypes():
            gt_string = genotype.get_format('GT')
            # skip genotypes with any missing allele
            if '.' in gt_string:
                continue

            indexes = self.numeric_alleles(gt_string)
            for idx in indexes:
                alleles[idx] += 1

            # iterate the number of non-reference samples
            if sum(indexes) > 0:
                num_samp += 1
                try:
                    sum_sq += float(genotype.get_format('SQ'))
                except KeyError:
                    # genotype carries no SQ value; it adds nothing to the sum
                    pass

        # populate AF
        allele_sum = float(sum(alleles))
        if allele_sum > 0:
            var.info['AF'] = ','.join(
                '%.4g' % (count / allele_sum) for count in alleles[1:])
        else:
            var.info['AF'] = ','.join(['.'] * num_alt)

        # populate NSAMP and MSQ
        var.info['NSAMP'] = num_samp
        var.info['MSQ'] = '%0.2f' % (sum_sq / num_samp) if num_samp > 0 else '.'

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

    vcf_out.close()
def l_cluster_by_line(file_name, percent_slop=0, fixed_slop=0,
                      use_product=False, include_genotypes=False,
                      weighting_scheme='unweighted'):
    """Group consecutive breakpoints of a VCF and hand each group to r_cluster.

    The header is written to stdout when the ``#CHROM`` line is reached,
    with an ``ALG`` INFO definition added.  Each later line becomes a
    ``Breakpoint``; it joins the current group when the group is empty or
    when its left start does not exceed the group's maximum left end and its
    chromosome and SV type match the group's.  Otherwise the current group
    is passed to ``r_cluster`` and a new group is started.  The final group
    is flushed after the last line.
    """
    vcf = Vcf()
    vcf_out = sys.stdout
    header = []
    in_header = True
    v_id = 0

    with InputStream(file_name) as vcf_stream:
        group = []
        group_sv_type = ''
        group_max_end = -1
        group_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if in_header:
                if line.startswith('##'):
                    header.append(line)
                    continue
                if line.startswith('#CHROM'):
                    columns = line.rstrip().split('\t')
                    # presumably drops '##SAMPLE=<ID=' (13 chars) and the
                    # trailing '>' -- verify against the header format
                    sample_order.extend(
                        h.rstrip()[13:-1] for h in header if h[:8] == '##SAMPLE')
                    if include_genotypes:
                        columns.extend(sample_order)
                    else:
                        columns = columns[:8]
                    header.append('\t'.join(columns))

                    in_header = False
                    vcf.add_header(header)
                    vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')
                    if include_genotypes:
                        header_text = vcf.get_header()
                    else:
                        header_text = vcf.get_header(False)
                    vcf_out.write(header_text + '\n')
                    continue

            b = Breakpoint(l_bp.parse_vcf_record(line),
                           percent_slop=percent_slop,
                           fixed_slop=fixed_slop)

            extends_group = (not group) or (
                b.left.start <= group_max_end
                and b.left.chrom == group_chrom
                and b.sv_type == group_sv_type
            )
            if extends_group:
                group.append(b)
                group_max_end = max(group_max_end, b.left.end)
            else:
                # current group is complete: cluster it and start a new one
                v_id = r_cluster(group, sample_order, v_id, use_product, vcf,
                                 vcf_out, include_genotypes, weighting_scheme)
                group = [b]
                group_max_end = b.left.end
            group_chrom = b.left.chrom
            group_sv_type = b.sv_type

        # flush whatever is left after the last line
        if len(group) > 0:
            v_id = r_cluster(group, sample_order, v_id, use_product, vcf,
                             vcf_out, include_genotypes, weighting_scheme)
def sv_classify(vcf_in, vcf_out, gender_file, sex_chrom_names, exclude_file,
                ae_dict, f_overlap, slope_threshold, rsquared_threshold,
                p_cnv, het_del_fit, hom_del_fit, params, diag_outfile, method):
    """Reclassify DEL/DUP records of a VCF by read-depth support.

    Records other than DEL/DUP are copied through.  A DEL that intersects an
    annotation (when ``ae_dict`` is given) is rewritten as an MEI.  For the
    remaining records the positively genotyped samples are counted; records
    with none are copied through, the others are tested with the routine
    selected by ``method`` ('large_sample', 'naive_bayes' or 'hybrid').
    Supported records are copied through, unsupported ones are replaced by
    the strings returned from ``to_bnd_strings``.  When ``diag_outfile`` is
    given, one tab-separated diagnostics row is written per tested record.

    All file handles passed in are closed before returning.
    """
    vcf = Vcf()
    header_lines = []
    reading_header = True

    # read sample genders: first column -> integer value of the second
    sex = {}
    for row in gender_file:
        cols = row.rstrip().split('\t')
        sex[cols[0]] = int(cols[1])

    exclude = []
    if exclude_file is not None:
        for row in exclude_file:
            exclude.append(row.rstrip())

    write_diag = diag_outfile is not None
    if write_diag:
        outf = open(diag_outfile, 'w', 4096)
        outf.write("varid\torig_svtype\tsvlen\tnum_pos_samps\tnb_support\tls_support\thybrid_support\thas_rd_support\n")

    for line in vcf_in:
        if reading_header:
            if line[0] == '#':
                header_lines.append(line)
                continue
            # first record reached: emit the header
            reading_header = False
            vcf.add_header(header_lines)
            vcf_out.write(vcf.get_header() + '\n')

        fields = line.rstrip().split('\t')
        svtype = next(
            (item.split('=')[1]
             for item in fields[7].split(';')
             if item.startswith('SVTYPE=')),
            None
        )

        # bail if not DEL or DUP prior to reclassification
        if svtype not in ('DEL', 'DUP'):
            vcf_out.write(line)
            continue

        var = Variant(fields, vcf)

        # check intersection with mobile elements
        if ae_dict is not None and var.info['SVTYPE'] == 'DEL':
            ae = annotation_intersect(var, ae_dict, f_overlap)
            if ae is not None:
                if ae.startswith(('SINE', 'LINE')) or ae.split('|')[2].startswith('SVA'):
                    ae = 'ME:' + ae
                var.alt = '<DEL:%s>' % ae
                var.info['SVTYPE'] = 'MEI'
                vcf_out.write(var.get_var_string(True) + '\n')
                continue

        # count positively genotyped samples
        num_pos_samps = sum(
            1 for s in var.sample_list
            if var.genotype(s).get_format('GT') not in ("./.", "0/0")
        )

        if num_pos_samps == 0:
            vcf_out.write(line)
            continue

        nb_support = False
        ls_support = False
        hybrid_support = False
        has_rd_support = False

        df = load_df(var, exclude, sex, sex_chrom_names)
        if method == 'large_sample':
            ls_support = has_rd_support_by_ls(
                df, slope_threshold, rsquared_threshold, num_pos_samps)
            has_rd_support = ls_support
        elif method == 'naive_bayes':
            nb_support = has_rd_support_by_nb(
                df, het_del_fit, hom_del_fit, params, p_cnv)
            has_rd_support = nb_support
        elif method == 'hybrid':
            ls_support, nb_support, hybrid_support = has_rd_support_hybrid(
                df, het_del_fit, hom_del_fit, params, p_cnv,
                slope_threshold, rsquared_threshold, num_pos_samps)
            has_rd_support = hybrid_support

        if has_rd_support:
            vcf_out.write(line)
        else:
            for m_var in to_bnd_strings(var, True):
                vcf_out.write(m_var + '\n')

        # NOTE(review): diagnostics are emitted only for records that
        # reached load_df above, since the row needs df -- confirm against
        # the upstream layout.
        if write_diag:
            svlen = df['svlen'][0]
            row = (
                var.var_id,
                svtype,
                str(svlen),
                str(num_pos_samps),
                str(nb_support),
                str(ls_support),
                str(hybrid_support),
                str(has_rd_support),
            )
            outf.write('\t'.join(row) + "\n")

    vcf_out.close()
    if write_diag:
        outf.close()
    vcf_in.close()
    vcf_out.close()
    gender_file.close()
    if exclude_file is not None:
        exclude_file.close()
    return
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    """Annotate the BEDPE records of ``aFile`` against those of ``bFile``.

    Every non-header record of ``bFile`` is loaded into memory.  The header
    lines of ``aFile`` are re-emitted on ``bedpe_out`` with ``<cohort>_AF``
    and ``<cohort>_VarID`` INFO definitions added, followed by a BEDPE
    column line.  Each ``aFile`` record is then passed through ``add``
    against every ``bFile`` record and written, newline-terminated, using
    ``get_var_string``.

    Args:
        aFile: path of the query BEDPE file, or the literal string "stdin".
            Paths ending in ``.gz`` are opened with gzip.
        bFile: path of the cohort BEDPE file; same conventions as aFile.
        bedpe_out: writable file-like object.  It is not closed here.
        max_distance: forwarded unchanged to ``add``.
        pass_prefix: lines starting with this prefix are treated as header
            lines.  None disables header detection for both inputs.
        cohort_name: label used for the added INFO tags; when None the
            basename of bFile is used.

    Exits with status 1 if a record in either file has no allele frequency.
    """
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!

    def _open_input(path):
        # "stdin" is a sentinel value, not a file name
        if path == "stdin":
            return sys.stdin
        if path.endswith('.gz'):
            return gzip.open(path, 'rb')
        return open(path, 'r')

    def _close_input(handle):
        # never close the process-wide stdin
        if handle is not sys.stdin:
            handle.close()

    bList = list()
    headerObj = Vcf()  # co-opt the VCF header object
    b_basename = str(str(bFile).split('/')[-1])
    if cohort_name is None:
        cohort_name = b_basename

    bData = _open_input(bFile)
    try:
        for bLine in bData:
            # guard against pass_prefix=None, as the aFile loop below does
            if pass_prefix is not None and bLine.startswith(pass_prefix):
                continue
            bentry = Bedpe(bLine.rstrip().split('\t'))
            if bentry.af is None:
                sys.stderr.write(
                    'No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n'
                )
                sys.exit(1)
            bList.append(bentry)
    finally:
        _close_input(bData)

    aData = _open_input(aFile)
    try:
        in_header = True
        header_lines = []
        # tab-joined columns that follow the 14 fixed BEDPE columns, if any
        sample_list = ''
        for aLine in aData:
            if pass_prefix is not None and aLine.startswith(pass_prefix):
                if aLine.startswith('#') and not aLine.startswith('##'):
                    columns = aLine.rstrip().split('\t', 14)
                    # only take the remainder when it exists; otherwise the
                    # last fixed column would be duplicated in the output
                    sample_list = columns[14] if len(columns) > 14 else ''
                else:
                    header_lines.append(aLine)
                continue

            if in_header:
                # first record reached: emit the header exactly once
                headerObj.add_header(header_lines)
                headerObj.add_info(
                    cohort_name + '_AF', '.', 'Float',
                    'Allele frequency(ies) for matching variants found in the '
                    + cohort_name + ' vcf' + ' (' + b_basename + ')')
                headerObj.add_info(
                    cohort_name + '_VarID', '.', 'Integer',
                    'List of Variant ID(s) for matching variants found in the '
                    + cohort_name + ' vcf' + ' (' + b_basename + ')')
                header = headerObj.get_header()
                # drop the last header line; the BEDPE column line replaces it
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                bedpe_columns = [
                    '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B',
                    'END_B', 'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE',
                    'FILTER', 'INFO_A', 'INFO_B'
                ]
                if sample_list:
                    bedpe_columns.append(sample_list)
                bedpe_out.write('\t'.join(bedpe_columns) + '\n')
                in_header = False

            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write(
                    'No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n'
                )
                sys.exit(1)
            for b in bList:
                add(a, b, max_distance)
            bedpe_out.write(get_var_string(a, cohort_name) + '\n')
    finally:
        _close_input(aData)
def l_cluster_by_line(file_name, tempdir, percent_slop=0, fixed_slop=0,
                      use_product=False, include_genotypes=False,
                      weighting_scheme='unweighted'):
    """Group consecutive breakpoints of a VCF and hand each group to r_cluster.

    The input is opened through ``InputStream(file_name, tempdir)``.  The
    header is written to stdout when the ``#CHROM`` line is reached, with an
    ``ALG`` INFO definition added.  Each later line becomes a
    ``Breakpoint``; it joins the current group when the group is empty or
    when its left start does not exceed the group's maximum left end and its
    chromosome and SV type match the group's.  Otherwise the current group
    is passed to ``r_cluster`` and a new group is started.  The final group
    is flushed after the last line.
    """
    vcf = Vcf()
    vcf_out = sys.stdout
    header = []
    in_header = True
    v_id = 0

    with InputStream(file_name, tempdir) as vcf_stream:
        group = []
        group_sv_type = ''
        group_max_end = -1
        group_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if in_header:
                if line.startswith('##'):
                    header.append(line)
                    continue
                if line.startswith('#CHROM'):
                    columns = line.rstrip().split('\t')
                    # presumably drops '##SAMPLE=<ID=' (13 chars) and the
                    # trailing '>' -- verify against the header format
                    sample_order.extend(
                        h.rstrip()[13:-1] for h in header if h[:8] == '##SAMPLE')
                    if include_genotypes:
                        columns.extend(sample_order)
                    else:
                        columns = columns[:8]
                    header.append('\t'.join(columns))

                    in_header = False
                    vcf.add_header(header)
                    vcf.add_info('ALG', '1', 'String',
                                 'Algorithm used to merge this breakpoint')
                    if include_genotypes:
                        header_text = vcf.get_header()
                    else:
                        header_text = vcf.get_header(False)
                    vcf_out.write(header_text + '\n')
                    continue

            b = Breakpoint(l_bp.parse_vcf_record(line),
                           percent_slop=percent_slop,
                           fixed_slop=fixed_slop)

            extends_group = (not group) or (
                b.left.start <= group_max_end
                and b.left.chrom == group_chrom
                and b.sv_type == group_sv_type
            )
            if extends_group:
                group.append(b)
                group_max_end = max(group_max_end, b.left.end)
            else:
                # current group is complete: cluster it and start a new one
                v_id = r_cluster(group, sample_order, v_id, use_product, vcf,
                                 vcf_out, include_genotypes, weighting_scheme)
                group = [b]
                group_max_end = b.left.end
            group_chrom = b.left.chrom
            group_sv_type = b.sv_type

        # flush whatever is left after the last line
        if len(group) > 0:
            v_id = r_cluster(group, sample_order, v_id, use_product, vcf,
                             vcf_out, include_genotypes, weighting_scheme)
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    """Add refined genotype annotations to the deletions in a VCF stream.

    Header lines are buffered until the first record is seen; the header is
    then written to ``vcf_out`` with MEDGQR/Q10GQR INFO and GQR/GTR FORMAT
    definitions added.  Records whose SVTYPE is not DEL, or whose AF is
    below 0.01, are copied through unchanged.  Every other record is run
    through ``load_df`` and ``recluster``; the resulting table is appended
    to ``diag_outfile`` as CSV and its values are written back onto the
    record as MEDGQR, Q10GQR, GTR and GQR.

    ``vcf_in``, ``vcf_out``, ``gender_file`` and ``exclude_file`` (when not
    None) are closed before returning.
    """
    vcf = Vcf()
    header_lines = []
    reading_header = True

    # first column -> integer value of the second column
    sex = {}
    for row in gender_file:
        cols = row.rstrip().split('\t')
        sex[cols[0]] = int(cols[1])

    exclude = []
    if exclude_file is not None:
        for row in exclude_file:
            exclude.append(row.rstrip())

    diag = open(diag_outfile, 'w', 4096)
    # only the first table written to the diagnostics file gets a CSV header
    write_csv_header = True

    for line in vcf_in:
        if reading_header:
            if line[0] == "#":
                header_lines.append(line)
                continue
            # first record reached: finalize and emit the header
            reading_header = False
            vcf.add_header(header_lines)
            vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
            vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
            vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
            vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
            vcf_out.write(vcf.get_header() + '\n')

        fields = line.rstrip().split('\t')
        svtype = next(
            (item.split('=')[1]
             for item in fields[7].split(';')
             if item.startswith('SVTYPE=')),
            None
        )

        # bail if not DEL prior to reclassification
        if svtype != 'DEL':
            vcf_out.write(line)
            continue

        var = Variant(fields, vcf)
        sys.stderr.write("%s\n" % var.var_id)
        sys.stderr.write("%f\n" % float(var.get_info('AF')))

        if float(var.get_info('AF')) < 0.01:
            vcf_out.write(line)
            continue

        recdf = recluster(load_df(var, exclude, sex))
        recdf.to_csv(diag, header=write_csv_header)
        write_csv_header = False

        var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
        var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
        recdf.set_index('sample', inplace=True)

        for sample in var.sample_list:
            if sample in recdf.index:
                var.genotype(sample).set_format("GTR", recdf.loc[sample, 'GTR'])
                var.genotype(sample).set_format(
                    "GQR", '{:.2f}'.format(recdf.loc[sample, 'gq_re']))
            else:
                # sample absent from the reclustered table
                var.genotype(sample).set_format("GTR", "./.")
                var.genotype(sample).set_format("GQR", 0)

        vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

    vcf_out.close()
    vcf_in.close()
    gender_file.close()
    diag.close()
    if exclude_file is not None:
        exclude_file.close()
    return
def bedpeToVcf(bedpe_file, vcf_out):
    """Convert the records of a BEDPE stream into VCF records.

    ``##`` lines are carried over as header lines and the single-``#``
    column line is replaced by a VCF column line that keeps whatever
    followed the fixed BEDPE columns.  The header is written when the first
    record is reached.  A BND record yields two mated records (``<name>_1``
    and ``<name>_2``); for the second one the STRANDS value is reversed,
    CIPOS/CIEND and CIPOS95/CIEND95 are swapped, MATEID is pointed at
    ``<name>_2`` and ``;SECONDARY;`` is appended after the EVENT field.
    ``bedpe.malformedFlag`` selects which of the two is written.  Any other
    record yields one VCF record.  ``vcf_out`` is closed at the end.
    """

    def _info_field(info, key):
        # concatenation of every ';'-separated item that contains `key`
        return ''.join(item for item in info.split(";") if key in item)

    # ALT templates keyed by (strand of this breakend, strand of its mate)
    bnd_alt_templates = {
        ('+', '-'): '%(ref)s[%(chrom)s:%(pos)s[',
        ('+', '+'): '%(ref)s]%(chrom)s:%(pos)s]',
        ('-', '+'): ']%(chrom)s:%(pos)s]%(ref)s',
        ('-', '-'): '[%(chrom)s:%(pos)s[%(ref)s',
    }

    myvcf = Vcf()
    header = list()
    in_header = True

    # parse the bedpe data
    for line in bedpe_file:
        if in_header:
            if line[0:2] == '##':
                header.append(line)
                continue
            elif line[0] == '#' and line[1] != '#':
                sample_list_str = line.rstrip().split('\t', 14)[-1]
                vcf_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL',
                               'FILTER', 'INFO', sample_list_str]
                header.append('\t'.join(vcf_columns))
                continue
            else:
                # first record reached: emit the header
                in_header = False
                myvcf.add_header(header)
                myvcf.file_format = 'VCFv4.2'
                vcf_out.write(myvcf.get_header() + '\n')

        bedpe = Bedpe(line.rstrip().split('\t'))
        symbolic_alt = '<' + str(bedpe.svtype) + '>'

        if bedpe.svtype != 'BND':
            # set VCF info elements for simple events
            simple_fields = [
                bedpe.c1,  # chrom1
                bedpe.b1 + 1,
                bedpe.name,  # ID
                'N',
                symbolic_alt,  # ALT
                bedpe.score,
                bedpe.filter,
            ]
            simple_fields.extend(bedpe.misc)
            var = Variant(simple_fields, myvcf)
            # write the record to the VCF output file
            vcf_out.write(var.get_var_string() + '\n')
            continue

        # first breakend
        first_fields = [
            bedpe.c1,
            bedpe.b1 + 1,
            bedpe.name + '_1',  # ID
            'N',
            symbolic_alt,  # ALT
            bedpe.score,
            bedpe.filter,
        ]
        first_fields.extend(bedpe.misc)
        var1 = Variant(first_fields, myvcf)
        template = bnd_alt_templates.get((bedpe.o1, bedpe.o2))
        if template is not None:
            var1.alt = template % {
                'ref': var1.ref, 'chrom': bedpe.c2, 'pos': bedpe.b2 + 1}

        # every lookup below reads the untouched first misc entry
        misc = copy.deepcopy(bedpe.misc)
        source_info = bedpe.misc[0]

        # reverse the STRANDS value for the mate
        strands_field = _info_field(source_info, 'STRANDS=')
        strands = re.split('=|:', strands_field)
        strands_str = (str(strands[0]) + '=' + str(strands[1][::-1])
                       + ':' + str(strands[2]))
        misc[0] = misc[0].replace(strands_field, strands_str)

        # add the cipos ciend,cipos95 and ciend95 variables
        swaps = (
            ('CIPOS=', 'CIEND='),
            ('CIEND=', 'CIPOS='),
            ('CIPOS95=', 'CIEND95='),
            ('CIEND95=', 'CIPOS95='),
        )
        for target_key, donor_key in swaps:
            donor_value = re.split('=', _info_field(source_info, donor_key))[1]
            misc[0] = misc[0].replace(
                _info_field(source_info, target_key), target_key + donor_value)

        # Change MATEID
        misc[0] = misc[0].replace(
            _info_field(source_info, 'MATEID='), 'MATEID=' + bedpe.name + '_2')

        # ADD IDENTIFIER FOR SECONDARY BREAKEND MATE
        event_field = _info_field(source_info, 'EVENT=')
        misc[0] = misc[0].replace(event_field, event_field + ';SECONDARY;')

        # second breakend
        second_fields = [
            bedpe.c2,  # chrom1
            bedpe.b2 + 1,
            bedpe.name + '_2',  # ID
            'N',
            symbolic_alt,  # ALT
            bedpe.score,
            bedpe.filter,
        ]
        second_fields.extend(misc)
        var2 = Variant(second_fields, myvcf)

        # add the strands field. For variant 2 must switch the order
        template = bnd_alt_templates.get((bedpe.o2, bedpe.o1))
        if template is not None:
            var2.alt = template % {
                'ref': var2.ref, 'chrom': bedpe.c1, 'pos': bedpe.b1 + 1}

        if bedpe.malformedFlag == 0:
            vcf_out.write(var1.get_var_string() + '\n')
            vcf_out.write(var2.get_var_string() + '\n')
        elif bedpe.malformedFlag == 1:
            vcf_out.write(var2.get_var_string() + '\n')
        elif bedpe.malformedFlag == 2:
            vcf_out.write(var1.get_var_string() + '\n')

    # close the VCF output file
    vcf_out.close()

    return