def canonicalize(filename, output_name):
    """Canonicalize a VCF into `output_name`.

    Pipeline: (1) left-align indels and decompose complex variants into
    constituent primitives with vcfallelicprimitives, (2) repair the invalid
    INFO fields that tool emits by round-tripping through `bcftools filter`,
    (3) flag records whose genotype contains '.' with an UNSUPPORTED_GENOTYPE
    filter, then (4) sort the result into `output_name`.

    Intermediate files (output_name + "tmp.vcf"/"tmp2.vcf"/"tmp3.vcf") are
    left on disk, matching the original behavior.
    """
    # Left-align indels and output variants as constituent indels.
    with open(output_name + "tmp.vcf", 'w') as canon_file:
        tenkit.log_subprocess.check_call([
            'vcfallelicprimitives', '--keep-info', '-t',
            'VCFALLELICPRIMITIVE', filename
        ], stdout=canon_file)

    # vcfallelicprimitives corrupts some INFO fields (KEY=.; becomes KEY=;,
    # which is invalid VCF). `bcftools filter` with no expression rewrites the
    # file into valid form without actually filtering anything out.
    with open(output_name + "tmp2.vcf", 'w') as fixed_vcf:
        tenkit.log_subprocess.check_call(
            ['bcftools', 'filter', output_name + "tmp.vcf"], stdout=fixed_vcf)

    with open(output_name + "tmp3.vcf", 'w') as unphased_file:
        vcf_in = tk_io.VariantFileReader(output_name + "tmp2.vcf")
        unsupported_genotype_filter = [(
            "UNSUPPORTED_GENOTYPE",
            "If genotype field contains '.' we assume that this is due to making a single sample vcf from a multiple sample vcf in which this sample does not contain the variant."
        )]
        tenx = ('TENX', '0', 'Flag', "called by 10X", None, None)
        # Keep the template handle open for the writer's lifetime and close it
        # deterministically afterwards (the original leaked this file handle).
        with open(output_name + "tmp2.vcf") as template_file:
            vcf_out = tk_io.VariantFileWriter(
                unphased_file,
                template_file=template_file,
                new_info_fields=[tenx],
                new_filters=unsupported_genotype_filter)
            for record in vcf_in.record_getter():
                # Only the first sample is inspected; assumes a single-sample
                # VCF (see the filter description above).
                sample = record.samples[0]
                unsupported_genotype = False
                try:
                    if len(sample.gt_alleles) > 0:
                        genotype1 = sample.gt_alleles[0]
                        if genotype1 == '.':
                            unsupported_genotype = True
                    else:
                        # No alleles at all: also unrepresentable.
                        unsupported_genotype = True
                    if len(sample.gt_alleles) > 1:
                        genotype2 = sample.gt_alleles[1]
                        # BUGFIX: was `genotype2 is '.'` -- identity comparison
                        # with a string literal is implementation-dependent and
                        # not guaranteed to match an equal string.
                        if genotype2 == '.':
                            unsupported_genotype = True
                except Exception:
                    # Malformed genotype field: mark unsupported rather than
                    # abort. (Was a bare `except:`, which also trapped
                    # KeyboardInterrupt/SystemExit.)
                    unsupported_genotype = True
                if unsupported_genotype:
                    record.FILTER = ["UNSUPPORTED_GENOTYPE"]
                vcf_out.write_record(record)
    tk_tabix.sort_vcf(output_name + "tmp3.vcf", output_name)
def combine_vcfs(output_filename, input_vcf_filenames):
    """Concatenate several VCFs into one sorted, indexed VCF.

    The header is taken from the first input; for every subsequent input all
    header lines (lines starting with '#') are dropped and only the records
    are appended -- the same semantics as the original
    `cat first` / `grep -v "^#" rest` shell pipeline. The combined file is
    then sorted into `output_filename` and tabix-indexed.

    BUGFIX: the original built shell command lines by string concatenation
    (`shell=True`), which broke on filenames containing spaces or shell
    metacharacters and only partially checked grep's exit status. Copying the
    files in pure Python removes the shell entirely.
    """
    tmp_filename = output_filename + ".tmp"
    with open(tmp_filename, 'w') as tmp_file:
        for i, fn in enumerate(input_vcf_filenames):
            with open(fn) as in_file:
                for line in in_file:
                    # First file: keep everything (including the header).
                    # Later files: records only.
                    if i == 0 or not line.startswith('#'):
                        tmp_file.write(line)
    # Sort and index the files
    tk_tabix.sort_vcf(tmp_filename, output_filename)
    tk_tabix.index_vcf(output_filename)
    os.remove(tmp_filename)
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk VCF subsets into one sorted, indexed output VCF.

    Chunk outputs are grouped by cluster ID, each group is merged into a
    per-cluster VCF, and the per-cluster VCFs are then merged with
    `vcf-merge`, sorted, and tabix-indexed into outs.variants.
    """
    # Group each chunk's variant subset under its cluster ID.
    cluster_vcfs = collections.defaultdict(list)
    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        cluster_vcfs[chunk_def.cluster_id].append(chunk_out.variant_subset)

    # Merge the subsets of each cluster into a single per-cluster VCF.
    per_cluster = []
    for cluster_id, subset_list in cluster_vcfs.iteritems():
        cluster_vcf = martian.make_path('{}.vcf'.format(cluster_id))
        tk_io.combine_vcfs(cluster_vcf, subset_list)
        per_cluster.append(cluster_vcf + '.gz')

    # Run vcf-merge over all per-cluster VCFs into a temporary file.
    tmp_path = martian.make_path('tmp.vcf')
    with open(tmp_path, 'w') as merged_out:
        subprocess.check_call(['vcf-merge'] + per_cluster, stdout=merged_out)

    # Sort and index the final output, then discard the temporary file.
    final_vcf = outs.variants.replace('.gz', '')
    tk_tabix.sort_vcf(tmp_path, final_vcf)
    tk_tabix.index_vcf(final_vcf)
    os.remove(tmp_path)
def main_sort_variants(args, outs):
    """Sort (and index) the input VCF(s) into outs.default.

    args.input may be None / [None] (no input: outs.default is set to None),
    a list of VCF paths (combined, sorted, and indexed via
    tk_io.combine_vcfs), or a single VCF path (sorted and indexed directly).
    outs.default is expected to end in ".gz"; the uncompressed sort target is
    that path with the suffix stripped.
    """
    if args.input is None or args.input == [None]:
        outs.default = None
    else:
        outs.coerce_strings()
        # Strip the trailing ".gz" to get the plain-VCF sort target.
        sort_filename = outs.default[:-3]
        # List of inputs
        if isinstance(args.input, list):
            files = [f for f in args.input if os.path.isfile(f)]
            if len(files) == 0:
                # BUGFIX: was `outs.defaut = None` (typo), which set a junk
                # attribute and left outs.default untouched.
                outs.default = None
                return
            cat_filename = outs.default[:-3]
            # BUGFIX: pass the existence-filtered list; the original filtered
            # into `files` but then combined the raw args.input, so a missing
            # file would make the combine step fail.
            tk_io.combine_vcfs(cat_filename, files)
        # Single input
        else:
            if not os.path.exists(args.input):
                outs.default = None
                return
            tk_tabix.sort_vcf(args.input, sort_filename)
            tk_tabix.index_vcf(sort_filename)