def sname_filter(input_stream, filter_file, output_stream, complement):
    '''
    Copy a VCF stream to output_stream, selecting variants by their SNAME.

    Lines are buffered as header up to and including the '#CHROM' line; at
    that point the FOUND INFO definition is registered and the header is
    written. For each later line, the SNAME INFO value is turned into a set
    and compared with the entries loaded from filter_file through
    overlapping_ids(). The variant is written, with FOUND set to the
    comma-joined matches, only when "has at least one match" differs from
    `complement`.
    '''
    filter_list = load_filter_file(filter_file)
    vcf = Vcf()
    pending_header = []
    header_done = False

    for line in input_stream:
        if header_done:
            variant = Variant(line.rstrip().split('\t'), vcf)
            names = set_from_string(variant.get_info('SNAME'))
            matches = overlapping_ids(names, filter_list)
            if bool(matches) == complement:
                # Either matched while complementing, or unmatched while not.
                continue
            variant.set_info('FOUND', ','.join(matches))
            output_stream.write(variant.get_var_string() + '\n')
            continue

        pending_header.append(line)
        if line[0:6] != '#CHROM':
            continue

        # The '#CHROM' line closes the header block.
        header_done = True
        vcf.add_header(pending_header)
        vcf.add_info('FOUND', '.', 'String', 'Variant id in other file')
        output_stream.write(vcf.get_header() + '\n')
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    '''
    Annotate every variant of a VCF stream with SIL_GT_AVG and log diagnostics.

    Lines starting with '#' are buffered as the header. The header, with the
    SIL_GT_AVG INFO definition added, is written to vcf_out when the first
    non-header line is reached (or at end of input when the stream holds a
    header but no variants). Each variant line is parsed into a Variant,
    passed through load_df() and get_silhouette(), and written back with
    INFO SIL_GT_AVG set to the 'sil_gt_avg' value of the first row of the
    returned frame, formatted with two decimals. Every frame is appended to
    diag_outfile as CSV; only the first one carries the column names.

    Arguments:
        vcf_in       -- iterable of VCF lines
        vcf_out      -- writable handle receiving the annotated VCF
        diag_outfile -- path of the CSV diagnostics file (opened with 'w')
        gender_file  -- iterable of tab-separated lines; the first column is
                        used as key and the second is converted with int()

    vcf_in, vcf_out and gender_file are closed before returning, also when
    an exception interrupts processing.
    '''
    vcf = Vcf()
    header = []
    in_header = True
    write_csv_header = True

    def write_header():
        vcf.add_header(header)
        vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
        vcf_out.write(vcf.get_header() + '\n')

    try:
        sex = {}
        for line in gender_file:
            fields = line.rstrip().split('\t')
            sex[fields[0]] = int(fields[1])

        with open(diag_outfile, 'w', 4096) as outf:
            for line in vcf_in:
                if in_header:
                    if line[0] == "#":
                        header.append(line)
                        continue
                    in_header = False
                    write_header()

                var = Variant(line.rstrip().split('\t'), vcf)
                df1 = get_silhouette(load_df(var, sex))
                sil_avg = df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
                var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
                vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

                # Column names go out once, with the first frame only.
                df1.to_csv(outf, header=write_csv_header)
                write_csv_header = False

            # A header-only input never reaches the branch above; still emit
            # the header so the output remains a usable VCF.
            if in_header and header:
                write_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file):
    '''
    Annotate every variant of a VCF stream with SIL_GT_AVG and log diagnostics.

    Lines starting with '#' are buffered as the header. The header, with the
    SIL_GT_AVG INFO definition added, is written to vcf_out when the first
    non-header line is reached (or at end of input when the stream holds a
    header but no variants). Each variant line is parsed into a Variant,
    passed through load_df() and get_silhouette(), and written back with
    INFO SIL_GT_AVG set to the 'sil_gt_avg' value of the first row of the
    returned frame, formatted with two decimals. Every frame is appended to
    diag_outfile as CSV; only the first one carries the column names.

    Arguments:
        vcf_in       -- iterable of VCF lines
        vcf_out      -- writable handle receiving the annotated VCF
        diag_outfile -- path of the CSV diagnostics file (opened with 'w')
        gender_file  -- iterable of tab-separated lines; the first column is
                        used as key and the second is converted with int()

    vcf_in, vcf_out and gender_file are closed before returning, also when
    an exception interrupts processing.
    '''
    vcf = Vcf()
    header = []
    in_header = True
    write_csv_header = True

    def write_header():
        vcf.add_header(header)
        vcf.add_info('SIL_GT_AVG', '1', 'Float', 'Average silhouette of genotype clusters')
        vcf_out.write(vcf.get_header() + '\n')

    try:
        sex = {}
        for line in gender_file:
            fields = line.rstrip().split('\t')
            sex[fields[0]] = int(fields[1])

        with open(diag_outfile, 'w', 4096) as outf:
            for line in vcf_in:
                if in_header:
                    if line[0] == "#":
                        header.append(line)
                        continue
                    in_header = False
                    write_header()

                var = Variant(line.rstrip().split('\t'), vcf)
                df1 = get_silhouette(load_df(var, sex))
                sil_avg = df1.iloc[0, df1.columns.get_loc('sil_gt_avg')]
                var.info['SIL_GT_AVG'] = '%0.2f' % sil_avg
                vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

                # Column names go out once, with the first frame only.
                df1.to_csv(outf, header=write_csv_header)
                write_csv_header = False

            # A header-only input never reaches the branch above; still emit
            # the header so the output remains a usable VCF.
            if in_header and header:
                write_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
    return
def execute(self, output_handle=sys.stdout):
    '''
    Stream self.vcf_stream to output_handle, adding AF, NSAMP and MSQ.

    '##' lines are buffered; the '#CHROM' line completes the header, which is
    written after the three INFO definitions are registered. For each
    variant line the GT of every genotype is read; genotypes containing '.'
    are skipped. Allele indexes come from self.numeric_alleles(). AF holds
    one value per ALT allele ('.' when no allele was counted), NSAMP the
    number of genotypes whose allele indexes sum above zero, and MSQ the SQ
    total of those genotypes divided by NSAMP ('.' when NSAMP is zero).
    Genotypes without an SQ value contribute nothing to the total but are
    still counted in NSAMP. output_handle is closed at the end.
    '''
    vcf = Vcf()
    vcf_out = output_handle
    header = []
    in_header = True

    for line in self.vcf_stream:
        if in_header and line.startswith('##'):
            header.append(line)
            continue
        if in_header and line.startswith('#CHROM'):
            header.append('\t'.join(line.rstrip().split('\t')))
            in_header = False
            vcf.add_header(header)
            vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
            vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
            vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
            vcf_out.write(vcf.get_header() + '\n')
            continue

        var = Variant(line.rstrip().split('\t'), vcf)

        # Slot 0 counts the reference allele, slots 1.. the ALT alleles.
        allele_counts = [0] * (len(var.alt.split(',')) + 1)
        nonref_samples = 0
        sq_total = 0.0

        for genotype in var.genotypes():
            gt_string = genotype.get_format('GT')
            if '.' in gt_string:
                continue

            indexes = self.numeric_alleles(gt_string)
            for allele_index in indexes:
                allele_counts[allele_index] += 1

            if sum(indexes) <= 0:
                continue
            nonref_samples += 1
            try:
                sq_total += float(genotype.get_format('SQ'))
            except KeyError:
                # No SQ for this genotype: it adds nothing to the total.
                pass

        total_alleles = float(sum(allele_counts))
        if total_alleles > 0:
            af_fields = ['%.4g' % (count / total_alleles) for count in allele_counts[1:]]
        else:
            af_fields = ['.'] * (len(allele_counts) - 1)
        var.info['AF'] = ','.join(af_fields)

        var.info['NSAMP'] = nonref_samples
        var.info['MSQ'] = ('%0.2f' % (sq_total / nonref_samples)) if nonref_samples > 0 else '.'

        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

    vcf_out.close()
def l_cluster_by_line(file_name, tempdir, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
    '''
    Group consecutive breakpoints by their left side and hand each group to r_cluster().

    The input is read through InputStream(file_name, tempdir). '##' lines are
    buffered; when the '#CHROM' line arrives, sample names are sliced out of
    the buffered '##SAMPLE' lines, the column line is rebuilt (first eight
    columns only, or the full line plus the sample names when
    include_genotypes is set), the ALG INFO definition is added and the
    header is printed to stdout.

    Every other line becomes a Breakpoint. It joins the current group when
    the group is empty, or when it has the same left chromosome and SV type
    and its left start does not exceed the largest left end seen in the
    group. Otherwise the group is flushed through r_cluster() and a new one
    starts. The last group is flushed after the input is exhausted.
    '''
    vcf = Vcf()
    vcf_out = sys.stdout
    header = []
    in_header = True
    v_id = 0

    with InputStream(file_name, tempdir) as vcf_stream:
        group = []
        group_sv_type = ''
        group_max_end = -1
        group_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if in_header and line.startswith('##'):
                header.append(line)
                continue
            if in_header and line.startswith('#CHROM'):
                columns = line.rstrip().split('\t')
                # Characters 13..-1 of a '##SAMPLE' line are taken as the sample name.
                sample_order.extend(
                    entry.rstrip()[13:-1] for entry in header if entry[:8] == '##SAMPLE'
                )
                if include_genotypes:
                    columns.extend(sample_order)
                else:
                    columns = columns[:8]
                header.append('\t'.join(columns))
                in_header = False
                vcf.add_header(header)
                vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')
                if include_genotypes:
                    vcf_out.write(vcf.get_header() + '\n')
                else:
                    vcf_out.write(vcf.get_header(False) + '\n')
                continue

            b = Breakpoint(l_bp.parse_vcf_record(line), percent_slop=percent_slop, fixed_slop=fixed_slop)

            joins_group = (not group) or (
                b.left.start <= group_max_end
                and b.left.chrom == group_chrom
                and b.sv_type == group_sv_type
            )
            if joins_group:
                group.append(b)
                group_max_end = max(group_max_end, b.left.end)
            else:
                v_id = r_cluster(group, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
                group = [b]
                group_max_end = b.left.end
            group_chrom = b.left.chrom
            group_sv_type = b.sv_type

        if group:
            v_id = r_cluster(group, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    '''
    Annotate the BEDPE records of aFile with matches found in bFile.

    All records of bFile are loaded first. Then aFile is streamed: lines
    starting with pass_prefix are treated as header, the header (extended
    with the <cohort>_AF and <cohort>_VarID INFO definitions) is written
    before the first record, and each record is passed together with every
    B record to add() before being written via get_var_string().

    Arguments:
        aFile, bFile  -- path, path ending in '.gz', or the string "stdin"
        bedpe_out     -- writable handle for the annotated BEDPE
        max_distance  -- forwarded unchanged to add()
        pass_prefix   -- prefix marking header lines; None disables header
                         detection for both files
        cohort_name   -- label used in the INFO ids; defaults to the last
                         '/'-separated component of bFile

    Exits with status 1 when a record in either file has no allele
    frequency (its `af` attribute is None). Files opened here are closed
    before returning; sys.stdin is left open.
    '''
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
    headerObj = Vcf()  # co-opt the VCF header object
    b_basename = str(bFile).split('/')[-1]
    if cohort_name is None:
        cohort_name = b_basename

    if bFile == "stdin":
        bData = sys.stdin
    elif bFile.endswith('.gz'):
        bData = gzip.open(bFile, 'rb')
    else:
        bData = open(bFile, 'r')

    bList = []
    try:
        for bLine in bData:
            # Guard against pass_prefix=None exactly like the A loop below;
            # str.startswith(None) raises TypeError.
            if pass_prefix is not None and bLine.startswith(pass_prefix):
                continue
            bentry = Bedpe(bLine.rstrip().split('\t'))
            if bentry.af is None:
                sys.stderr.write(
                    'No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n'
                )
                sys.exit(1)
            bList.append(bentry)
    finally:
        if bData is not sys.stdin:
            bData.close()

    if aFile == "stdin":
        aData = sys.stdin
    elif aFile.endswith('.gz'):
        aData = gzip.open(aFile, 'rb')
    else:
        aData = open(aFile, 'r')

    in_header = True
    header_lines = []
    # Everything after the 14 fixed BEDPE columns of the column-header line;
    # stays empty when there is no such line or it has no extra columns.
    sample_list = ''
    try:
        for aLine in aData:
            if pass_prefix is not None and aLine.startswith(pass_prefix):
                if aLine[0] == '#' and aLine[1] != '#':
                    columns = aLine.rstrip().split('\t', 14)
                    # Taking [-1] unconditionally would pick up the last
                    # fixed column when nothing follows it.
                    sample_list = columns[14] if len(columns) > 14 else ''
                else:
                    header_lines.append(aLine)
                continue

            if in_header:
                headerObj.add_header(header_lines)
                headerObj.add_info(
                    cohort_name + '_AF', '.', 'Float',
                    'Allele frequency(ies) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_basename + ')')
                headerObj.add_info(
                    cohort_name + '_VarID', '.', 'Integer',
                    'List of Variant ID(s) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_basename + ')')
                header = headerObj.get_header()
                # Drop the final line of the generated header; the BEDPE
                # column line below takes its place.
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                column_names = [
                    '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B', 'END_B',
                    'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
                    'INFO_A', 'INFO_B'
                ]
                if sample_list:
                    column_names.append(sample_list)
                bedpe_out.write('\t'.join(column_names) + '\n')
                in_header = False

            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write(
                    'No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n'
                )
                sys.exit(1)
            for b in bList:
                add(a, b, max_distance)
            bedpe_out.write(get_var_string(a, cohort_name) + '\n')
    finally:
        if aData is not sys.stdin:
            aData.close()
def varLookup(aFile, bFile, bedpe_out, max_distance, pass_prefix, cohort_name):
    '''
    Annotate the BEDPE records of aFile with matches found in bFile.

    All records of bFile are loaded first. Then aFile is streamed: lines
    starting with pass_prefix are treated as header, the header (extended
    with the <cohort>_AF and <cohort>_VarID INFO definitions) is written
    before the first record, and each record is passed together with every
    B record to add() before being written via get_var_string().

    Arguments:
        aFile, bFile  -- path, path ending in '.gz', or the string "stdin"
        bedpe_out     -- writable handle for the annotated BEDPE
        max_distance  -- forwarded unchanged to add()
        pass_prefix   -- prefix marking header lines; None disables header
                         detection for both files
        cohort_name   -- label used in the INFO ids; defaults to the last
                         '/'-separated component of bFile

    Exits with status 1 when a record in either file has no allele
    frequency (its `af` attribute is None). Files opened here are closed
    before returning; sys.stdin is left open.
    '''
    # FIXME The following code is heavily duplicated with vcftobedpe and bedpetovcf. Harmonize!!!
    headerObj = Vcf()  # co-opt the VCF header object
    b_basename = str(bFile).split('/')[-1]
    if cohort_name is None:
        cohort_name = b_basename

    if bFile == "stdin":
        bData = sys.stdin
    elif bFile.endswith('.gz'):
        bData = gzip.open(bFile, 'rb')
    else:
        bData = open(bFile, 'r')

    bList = []
    try:
        for bLine in bData:
            # Guard against pass_prefix=None exactly like the A loop below;
            # str.startswith(None) raises TypeError.
            if pass_prefix is not None and bLine.startswith(pass_prefix):
                continue
            bentry = Bedpe(bLine.rstrip().split('\t'))
            if bentry.af is None:
                sys.stderr.write('No allele frequency for variant found in -b file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            bList.append(bentry)
    finally:
        if bData is not sys.stdin:
            bData.close()

    if aFile == "stdin":
        aData = sys.stdin
    elif aFile.endswith('.gz'):
        aData = gzip.open(aFile, 'rb')
    else:
        aData = open(aFile, 'r')

    in_header = True
    header_lines = []
    # Everything after the 14 fixed BEDPE columns of the column-header line;
    # stays empty when there is no such line or it has no extra columns.
    sample_list = ''
    try:
        for aLine in aData:
            if pass_prefix is not None and aLine.startswith(pass_prefix):
                if aLine[0] == '#' and aLine[1] != '#':
                    columns = aLine.rstrip().split('\t', 14)
                    # Taking [-1] unconditionally would pick up the last
                    # fixed column when nothing follows it.
                    sample_list = columns[14] if len(columns) > 14 else ''
                else:
                    header_lines.append(aLine)
                continue

            if in_header:
                headerObj.add_header(header_lines)
                headerObj.add_info(cohort_name + '_AF', '.', 'Float', 'Allele frequency(ies) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_basename + ')')
                headerObj.add_info(cohort_name + '_VarID', '.', 'Integer', 'List of Variant ID(s) for matching variants found in the ' + cohort_name + ' vcf' + ' (' + b_basename + ')')
                header = headerObj.get_header()
                # Drop the final line of the generated header; the BEDPE
                # column line below takes its place.
                bedpe_out.write(header[:header.rfind('\n')] + '\n')
                column_names = [
                    '#CHROM_A', 'START_A', 'END_A', 'CHROM_B', 'START_B', 'END_B',
                    'ID', 'QUAL', 'STRAND_A', 'STRAND_B', 'TYPE', 'FILTER',
                    'INFO_A', 'INFO_B'
                ]
                if sample_list:
                    column_names.append(sample_list)
                bedpe_out.write('\t'.join(column_names) + '\n')
                in_header = False

            a = Bedpe(aLine.rstrip().split('\t'))
            if a.af is None:
                sys.stderr.write('No allele frequency for variant found in -a file. This tool requires allele frequency information to function. Please add with svtools afreq and rerun\n')
                sys.exit(1)
            for b in bList:
                add(a, b, max_distance)
            # NOTE(review): no newline is appended here; presumably this
            # get_var_string() already terminates the record - confirm.
            bedpe_out.write(get_var_string(a, cohort_name))
    finally:
        if aData is not sys.stdin:
            aData.close()
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    '''
    Add refined genotypes (GTR/GQR) to the DEL variants of a VCF stream.

    Lines starting with '#' are buffered as the header. The header, extended
    with the MEDGQR/Q10GQR INFO and GQR/GTR FORMAT definitions, is written
    when the first non-header line is reached (or at end of input when the
    stream holds a header but no variants).

    A variant line is copied through unchanged when its SVTYPE is not 'DEL'
    or its AF is below 0.01. Otherwise it goes through load_df() and
    recluster(); the resulting frame is appended to diag_outfile as CSV
    (column names only with the first frame), MEDGQR and Q10GQR are taken
    from the first row, and every sample gets GTR/GQR from its row in the
    frame, or "./." and 0 when the frame has no row for it.

    Arguments:
        vcf_in       -- iterable of VCF lines
        vcf_out      -- writable handle receiving the output VCF
        diag_outfile -- path of the CSV diagnostics file (opened with 'w')
        gender_file  -- iterable of tab-separated lines; the first column is
                        used as key and the second is converted with int()
        exclude_file -- iterable of lines, each stripped and collected into
                        the list given to load_df(); may be None

    All handles passed in are closed before returning, also when an
    exception interrupts processing. The variant id and AF of every DEL are
    echoed to stderr.
    '''
    vcf = Vcf()
    header = []
    in_header = True
    write_csv_header = True

    def write_header():
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
        vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
        vcf_out.write(vcf.get_header() + '\n')

    try:
        sex = {}
        for line in gender_file:
            fields = line.rstrip().split('\t')
            sex[fields[0]] = int(fields[1])

        exclude = []
        if exclude_file is not None:
            exclude = [line.rstrip() for line in exclude_file]

        with open(diag_outfile, 'w', 4096) as outf:
            for line in vcf_in:
                if in_header:
                    if line[0] == "#":
                        header.append(line)
                        continue
                    in_header = False
                    write_header()

                v = line.rstrip().split('\t')
                svtype = None
                for entry in v[7].split(';'):
                    if entry.startswith('SVTYPE='):
                        svtype = entry.split('=')[1]
                        break

                # Only DEL records are refined; everything else passes through.
                if svtype != 'DEL':
                    vcf_out.write(line)
                    continue

                var = Variant(v, vcf)
                sys.stderr.write("%s\n" % var.var_id)
                af = float(var.get_info('AF'))
                sys.stderr.write("%f\n" % af)

                if af < 0.01:
                    vcf_out.write(line)
                    continue

                recdf = recluster(load_df(var, exclude, sex))

                # Column names go out once, with the first frame only.
                recdf.to_csv(outf, header=write_csv_header)
                write_csv_header = False

                var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
                var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
                recdf.set_index('sample', inplace=True)
                for s in var.sample_list:
                    if s in recdf.index:
                        var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                        var.genotype(s).set_format("GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                    else:
                        var.genotype(s).set_format("GTR", "./.")
                        var.genotype(s).set_format("GQR", 0)
                vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

            # A header-only input never reaches the branch above; still emit
            # the header so the output remains a usable VCF.
            if in_header and header:
                write_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if exclude_file is not None:
            exclude_file.close()
    return
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file):
    '''
    Add refined genotypes (GTR/GQR) to the DEL variants of a VCF stream.

    Lines starting with '#' are buffered as the header. The header, extended
    with the MEDGQR/Q10GQR INFO and GQR/GTR FORMAT definitions, is written
    when the first non-header line is reached (or at end of input when the
    stream holds a header but no variants).

    A variant line is copied through unchanged when its SVTYPE is not 'DEL'
    or its AF is below 0.01. Otherwise it goes through load_df() and
    recluster(); the resulting frame is appended to diag_outfile as CSV
    (column names only with the first frame), MEDGQR and Q10GQR are taken
    from the first row, and every sample gets GTR/GQR from its row in the
    frame, or "./." and 0 when the frame has no row for it.

    Arguments:
        vcf_in       -- iterable of VCF lines
        vcf_out      -- writable handle receiving the output VCF
        diag_outfile -- path of the CSV diagnostics file (opened with 'w')
        gender_file  -- iterable of tab-separated lines; the first column is
                        used as key and the second is converted with int()
        exclude_file -- iterable of lines, each stripped and collected into
                        the list given to load_df(); may be None

    All handles passed in are closed before returning, also when an
    exception interrupts processing. The variant id and AF of every DEL are
    echoed to stderr.
    '''
    vcf = Vcf()
    header = []
    in_header = True
    write_csv_header = True

    def write_header():
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQR', 1, 'Float', 'Quality of refined genotype.')
        vcf.add_format('GTR', 1, 'String', 'Refined genotype.')
        vcf_out.write(vcf.get_header() + '\n')

    try:
        sex = {}
        for line in gender_file:
            fields = line.rstrip().split('\t')
            sex[fields[0]] = int(fields[1])

        exclude = []
        if exclude_file is not None:
            exclude = [line.rstrip() for line in exclude_file]

        with open(diag_outfile, 'w', 4096) as outf:
            for line in vcf_in:
                if in_header:
                    if line[0] == "#":
                        header.append(line)
                        continue
                    in_header = False
                    write_header()

                v = line.rstrip().split('\t')
                svtype = None
                for entry in v[7].split(';'):
                    if entry.startswith('SVTYPE='):
                        svtype = entry.split('=')[1]
                        break

                # Only DEL records are refined; everything else passes through.
                if svtype != 'DEL':
                    vcf_out.write(line)
                    continue

                var = Variant(v, vcf)
                sys.stderr.write("%s\n" % var.var_id)
                af = float(var.get_info('AF'))
                sys.stderr.write("%f\n" % af)

                if af < 0.01:
                    vcf_out.write(line)
                    continue

                recdf = recluster(load_df(var, exclude, sex))

                # Column names go out once, with the first frame only.
                recdf.to_csv(outf, header=write_csv_header)
                write_csv_header = False

                var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
                var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
                recdf.set_index('sample', inplace=True)
                for s in var.sample_list:
                    if s in recdf.index:
                        var.genotype(s).set_format("GTR", recdf.loc[s, 'GTR'])
                        var.genotype(s).set_format("GQR", '{:.2f}'.format(recdf.loc[s, 'gq_re']))
                    else:
                        var.genotype(s).set_format("GTR", "./.")
                        var.genotype(s).set_format("GQR", 0)
                vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

            # A header-only input never reaches the branch above; still emit
            # the header so the output remains a usable VCF.
            if in_header and header:
                write_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if exclude_file is not None:
            exclude_file.close()
    return
def run_from_args(args):
    '''
    Convert the VCF named by args.manta_vcf and print the result to stdout.

    Lines are buffered as header up to and including the '#CHROM' line; the
    header is then printed with the INFO definitions listed below added.
    Every later line is parsed into a Variant, passed to convert_variant()
    together with args.max_ins, and printed.
    '''
    # (id, number, type, description) tuples, registered in this order.
    info_definitions = (
        ('PRPOS', '1', 'String', 'Breakpoint probability dist'),
        ('PREND', '1', 'String', 'Breakpoint probability dist'),
        ('STRANDS', '.', 'String', 'Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--'),
        ('SU', '.', 'Integer', 'Number of pieces of evidence supporting the variant across all samples'),
        ('PE', '.', 'Integer', 'Number of paired-end reads supporting the variant across all samples'),
        ('SR', '.', 'Integer', 'Number of split reads supporting the variant across all samples'),
        ('INSLEN_ORIG', '.', 'Integer', 'Original insertion length'),
        ('CIPOS95', '2', 'Integer', 'Confidence interval (95%) around POS for imprecise variants'),
        ('CIEND95', '2', 'Integer', 'Confidence interval (95%) around END for imprecise variants'),
        ('SECONDARY', '0', 'Flag', 'Secondary breakend in a multi-line variant'),
    )

    vcf = Vcf()
    vcf_out = sys.stdout
    header_lines = []
    header_done = False

    with su.InputStream(args.manta_vcf) as input_stream:
        for line in input_stream:
            if header_done:
                variant = Variant(line.rstrip().split('\t'), vcf)
                convert_variant(variant, args.max_ins)
                vcf_out.write(variant.get_var_string() + "\n")
                continue

            header_lines.append(line)
            if line[0:6] != '#CHROM':
                continue

            # The '#CHROM' line closes the header block.
            header_done = True
            vcf.add_header(header_lines)
            for info_id, number, info_type, description in info_definitions:
                vcf.add_info(info_id, number, info_type, description)
            vcf_out.write(vcf.get_header() + '\n')
def l_cluster_by_line(file_name, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
    '''
    Group consecutive breakpoints by their left side and hand each group to r_cluster().

    The input is read through InputStream(file_name). '##' lines are
    buffered; when the '#CHROM' line arrives, sample names are sliced out of
    the buffered '##SAMPLE' lines, the column line is rebuilt (first eight
    columns only, or the full line plus the sample names when
    include_genotypes is set), the ALG INFO definition is added and the
    header is printed to stdout.

    Every other line becomes a Breakpoint. It joins the current group when
    the group is empty, or when it has the same left chromosome and SV type
    and its left start does not exceed the largest left end seen in the
    group. Otherwise the group is flushed through r_cluster() and a new one
    starts. The last group is flushed after the input is exhausted.
    '''
    vcf = Vcf()
    vcf_out = sys.stdout
    header = []
    in_header = True
    v_id = 0

    with InputStream(file_name) as vcf_stream:
        pending = []
        pending_sv_type = ''
        pending_max_end = -1
        pending_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if in_header and line.startswith('##'):
                header.append(line)
                continue
            if in_header and line.startswith('#CHROM'):
                column_line = line.rstrip().split('\t')
                # Characters 13..-1 of a '##SAMPLE' line are taken as the sample name.
                for meta_line in header:
                    if meta_line[:8] == '##SAMPLE':
                        sample_order.append(meta_line.rstrip()[13:-1])
                if include_genotypes:
                    column_line.extend(sample_order)
                else:
                    column_line = column_line[:8]
                header.append('\t'.join(column_line))
                in_header = False
                vcf.add_header(header)
                vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')
                if include_genotypes:
                    vcf_out.write(vcf.get_header() + '\n')
                else:
                    vcf_out.write(vcf.get_header(False) + '\n')
                continue

            b = Breakpoint(l_bp.parse_vcf_record(line), percent_slop=percent_slop, fixed_slop=fixed_slop)

            starts_new_group = bool(pending) and not (
                b.left.start <= pending_max_end
                and b.left.chrom == pending_chrom
                and b.sv_type == pending_sv_type
            )
            if starts_new_group:
                v_id = r_cluster(pending, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
                pending = [b]
                pending_max_end = b.left.end
            else:
                pending.append(b)
                pending_max_end = max(pending_max_end, b.left.end)
            pending_chrom = b.left.chrom
            pending_sv_type = b.sv_type

        if pending:
            v_id = r_cluster(pending, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
def execute(self, output_handle=sys.stdout):
    '''
    Stream self.vcf_stream to output_handle, adding AF, NSAMP and MSQ.

    '##' lines are buffered; the '#CHROM' line completes the header, which is
    written after the three INFO definitions are registered. For each
    variant line the GT of every sample in var.sample_list is read;
    genotypes containing '.' are skipped, the rest are split on '/' (or on
    '|' when '/' yields a single piece) and converted to integer allele
    indexes. AF holds one value per ALT allele ('.' when no allele was
    counted), NSAMP the number of samples whose allele indexes sum above
    zero, and MSQ whatever self.calc_msq(var) returns. output_handle is
    closed at the end.
    '''
    vcf = Vcf()
    vcf_out = output_handle
    header = []
    in_header = True

    for line in self.vcf_stream:
        if in_header and line.startswith('##'):
            header.append(line)
            continue
        if in_header and line.startswith('#CHROM'):
            header.append('\t'.join(line.rstrip().split('\t')))
            in_header = False
            vcf.add_header(header)
            vcf.add_info('AF', 'A', 'Float', 'Allele Frequency, for each ALT allele, in the same order as listed')
            vcf.add_info('NSAMP', '1', 'Integer', 'Number of samples with non-reference genotypes')
            vcf.add_info('MSQ', '1', 'Float', 'Mean sample quality of positively genotyped samples')
            vcf_out.write(vcf.get_header() + '\n')
            continue

        var = Variant(line.rstrip().split('\t'), vcf, fixed_genotypes=True)

        # Slot 0 counts the reference allele, slots 1.. the ALT alleles.
        allele_counts = [0] * (len(var.alt.split(',')) + 1)
        nonref_samples = 0

        gt_strings = [var.genotype(s).get_format('GT') for s in var.sample_list]
        for gt_string in gt_strings:
            if '.' in gt_string:
                continue

            pieces = gt_string.split('/')
            if len(pieces) == 1:
                pieces = gt_string.split('|')
            called = [int(piece) for piece in pieces]

            for allele_index in called:
                allele_counts[allele_index] += 1
            if sum(called) > 0:
                nonref_samples += 1

        total_alleles = float(sum(allele_counts))
        if total_alleles > 0:
            af_fields = ['%.4g' % (count / total_alleles) for count in allele_counts[1:]]
        else:
            af_fields = ['.'] * (len(allele_counts) - 1)
        var.info['AF'] = ','.join(af_fields)

        var.info['NSAMP'] = nonref_samples
        var.info['MSQ'] = self.calc_msq(var)

        vcf_out.write(var.get_var_string(use_cached_gt_string=True) + '\n')

    vcf_out.close()
def l_cluster_by_line(file_name, tempdir, percent_slop=0, fixed_slop=0, use_product=False, include_genotypes=False, weighting_scheme='unweighted'):
    '''
    Group consecutive breakpoints by their left side and hand each group to r_cluster().

    The input is read through InputStream(file_name, tempdir). '##' lines are
    buffered; when the '#CHROM' line arrives, sample names are sliced out of
    the buffered '##SAMPLE' lines (the name 'VARIOUS' is ignored) and the
    column line is rebuilt: its first nine columns plus the sample names
    when include_genotypes is set, otherwise its first eight columns. The
    ALG INFO definition is added and the header is printed to stdout.

    Every other line becomes a Breakpoint. It joins the current group when
    the group is empty, or when it has the same left chromosome and SV type
    and its left start does not exceed the largest left end seen in the
    group. Otherwise the group is flushed through r_cluster() and a new one
    starts. The last group is flushed after the input is exhausted.
    '''
    vcf = Vcf()
    vcf_out = sys.stdout
    header = []
    in_header = True
    v_id = 0

    with InputStream(file_name, tempdir) as vcf_stream:
        members = []
        members_sv_type = ''
        members_max_end = -1
        members_chrom = ''
        sample_order = []

        for line in vcf_stream:
            if in_header and line.startswith('##'):
                header.append(line)
                continue
            if in_header and line.startswith('#CHROM'):
                column_line = line.rstrip().split('\t')
                for meta_line in header:
                    if meta_line[:8] != '##SAMPLE':
                        continue
                    # Characters 13..-1 of a '##SAMPLE' line are taken as the sample name.
                    sample_name = meta_line.rstrip()[13:-1]
                    if sample_name != 'VARIOUS':
                        sample_order.append(sample_name)
                if include_genotypes:
                    # Keep the fixed columns through the ninth, then list the samples.
                    column_line = column_line[:9]
                    column_line.extend(sample_order)
                else:
                    # Eight columns only: no genotype columns are emitted.
                    column_line = column_line[:8]
                header.append('\t'.join(column_line))
                in_header = False
                vcf.add_header(header)
                vcf.add_info('ALG', '1', 'String', 'Algorithm used to merge this breakpoint')
                if include_genotypes:
                    vcf_out.write(vcf.get_header() + '\n')
                else:
                    vcf_out.write(vcf.get_header(include_samples=False) + '\n')
                continue

            b = Breakpoint(l_bp.parse_vcf_record(line), percent_slop=percent_slop, fixed_slop=fixed_slop)

            same_group = (not members) or (
                b.left.start <= members_max_end
                and b.left.chrom == members_chrom
                and b.sv_type == members_sv_type
            )
            if same_group:
                members.append(b)
                members_max_end = max(members_max_end, b.left.end)
            else:
                v_id = r_cluster(members, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
                members = [b]
                members_max_end = b.left.end
            members_chrom = b.left.chrom
            members_sv_type = b.sv_type

        if members:
            v_id = r_cluster(members, sample_order, v_id, use_product, vcf, vcf_out, include_genotypes, weighting_scheme)
def run_gt_refine(vcf_in, vcf_out, diag_outfile, gender_file, exclude_file, batch_file):
    '''
    Replace GT/GQ of DEL and MEI variants with refined values, keeping the originals.

    Lines starting with '#' are buffered as the header. The header, extended
    with the MEDGQR/Q10GQR INFO and GQO/GTO FORMAT definitions, is written
    when the first non-header line is reached (or at end of input when the
    stream holds a header but no variants).

    A variant line is copied through unchanged when its SVTYPE is neither
    'DEL' nor 'MEI', or its AF is below 0.01. Otherwise it goes through
    load_df() and recluster(); the resulting frame is appended to
    diag_outfile as CSV (column names only with the first frame), MEDGQR and
    Q10GQR are taken from the first row, and for every sample the current
    GT/GQ are copied to GTO/GQO before GT/GQ are overwritten from the
    sample's row in the frame, or with "./." and 0 when the frame has no row
    for it.

    Arguments:
        vcf_in       -- iterable of VCF lines
        vcf_out      -- writable handle receiving the output VCF
        diag_outfile -- path of the CSV diagnostics file (opened with 'w')
        gender_file  -- iterable of tab-separated lines; the first column is
                        used as key and the second is converted with int()
        exclude_file -- iterable of lines, each stripped and collected into
                        the list given to load_df(); may be None
        batch_file   -- iterable of tab-separated lines mapping the first
                        column to the batch label in the second; may be None

    Raises:
        RuntimeError -- when batch_file contains the reserved label 'None'

    All handles passed in are closed before returning, also when an
    exception interrupts processing. The variant id and AF of every refined
    candidate are echoed to stderr.
    '''
    vcf = Vcf()
    header = []
    in_header = True
    write_csv_header = True

    def write_header():
        vcf.add_header(header)
        vcf.add_info('MEDGQR', '1', 'Float', 'Median quality for refined GT')
        vcf.add_info('Q10GQR', '1', 'Float', 'Q10 quality for refined GT')
        vcf.add_format('GQO', 1, 'Integer', 'Quality of original genotype')
        vcf.add_format('GTO', 1, 'String', 'Genotype before refinement')
        vcf_out.write(vcf.get_header() + '\n')

    try:
        sex = {}
        for line in gender_file:
            fields = line.rstrip().split('\t')
            sex[fields[0]] = int(fields[1])

        exclude = []
        if exclude_file is not None:
            exclude = [line.rstrip() for line in exclude_file]

        batch = dict()
        if batch_file is not None:
            for line in batch_file:
                fields = line.rstrip().split('\t')
                if fields[1] == 'None':
                    raise RuntimeError('Batch file contains a batch label of None. This label is reserved.')
                batch[fields[0]] = fields[1]

        with open(diag_outfile, 'w', 4096) as outf:
            for line in vcf_in:
                if in_header:
                    if line[0] == "#":
                        header.append(line)
                        continue
                    in_header = False
                    write_header()

                v = line.rstrip().split('\t')
                svtype = None
                for entry in v[7].split(';'):
                    if entry.startswith('SVTYPE='):
                        svtype = entry.split('=')[1]
                        break

                # bail if not DEL or MEI prior to reclassification
                # DUPs can be quite complicated in their allelic structure
                # and thus less amenable to refinement by clustering in many cases
                # INV and BNDs are also unclear.
                # See earlier commits for code of previous attempts to refine these.
                if svtype not in ('DEL', 'MEI'):
                    vcf_out.write(line)
                    continue

                var = Variant(v, vcf)
                sys.stderr.write("%s\n" % var.var_id)
                af = float(var.get_info('AF'))
                sys.stderr.write("%f\n" % af)

                if af < 0.01:
                    vcf_out.write(line)
                    continue

                recdf = recluster(load_df(var, exclude, sex, batch))

                # Column names go out once, with the first frame only.
                recdf.to_csv(outf, header=write_csv_header)
                write_csv_header = False

                var.set_info("MEDGQR", '{:.2f}'.format(recdf.iloc[0, :].loc['med_gq_re']))
                var.set_info("Q10GQR", '{:.2f}'.format(recdf.iloc[0, :].loc['q10_gq_re']))
                recdf.set_index('sample', inplace=True)
                for s in var.sample_list:
                    # Preserve the incoming call before it is overwritten.
                    g = var.genotype(s)
                    g.set_format("GTO", g.get_format("GT"))
                    g.set_format("GQO", g.get_format("GQ"))
                    if s in recdf.index:
                        var.genotype(s).set_format("GT", recdf.loc[s, 'GTR'])
                        var.genotype(s).set_format("GQ", '{:.0f}'.format(recdf.loc[s, 'gq_re']))
                    else:
                        var.genotype(s).set_format("GT", "./.")
                        var.genotype(s).set_format("GQ", 0)
                vcf_out.write(var.get_var_string(use_cached_gt_string=False) + '\n')

            # A header-only input never reaches the branch above; still emit
            # the header so the output remains a usable VCF.
            if in_header and header:
                write_header()
    finally:
        vcf_out.close()
        vcf_in.close()
        gender_file.close()
        if exclude_file is not None:
            exclude_file.close()
        if batch_file is not None:
            batch_file.close()
    return