예제 #1
0
def _has_unsupported_genotype(sample):
    """Return True when the sample's genotype cannot be used downstream.

    A genotype is treated as unsupported when it has no alleles, when either
    of its first two alleles is '.', or when the allele list cannot be read
    at all.
    """
    try:
        alleles = sample.gt_alleles
        if len(alleles) == 0:
            return True
        if alleles[0] == '.':
            return True
        # Compare by value: identity ("is") on a string literal only works by
        # accident of interning.
        if len(alleles) > 1 and alleles[1] == '.':
            return True
    except Exception:
        # Deliberate best-effort: a call whose genotype cannot be parsed is
        # flagged instead of aborting the whole conversion.
        return True
    return False


def canonicalize(filename, output_name):
    """Normalize a VCF into sorted, primitive-allele form.

    Steps:
      1. Run ``vcfallelicprimitives`` on ``filename``.
      2. Pass the result through ``bcftools filter`` (with no filter
         expression) to repair INFO fields.
      3. Rewrite every record, marking records whose first sample has an
         unusable genotype with the ``UNSUPPORTED_GENOTYPE`` filter.
      4. Sort the result into ``output_name`` via ``tk_tabix.sort_vcf``.

    The intermediate files ``<output_name>tmp.vcf``, ``<output_name>tmp2.vcf``
    and ``<output_name>tmp3.vcf`` are written next to the output and are not
    removed here.

    Args:
        filename: path of the input VCF.
        output_name: path of the final sorted VCF; also used as the prefix
            for the intermediate files.
    """
    primitives_vcf = output_name + "tmp.vcf"
    fixed_vcf_name = output_name + "tmp2.vcf"
    unphased_vcf = output_name + "tmp3.vcf"

    with open(primitives_vcf, 'w') as canon_file:
        # Left-align indels and output variants as constituent indels
        tenkit.log_subprocess.check_call(
            [
                'vcfallelicprimitives', '--keep-info', '-t',
                'VCFALLELICPRIMITIVE', filename
            ],
            stdout=canon_file)

    with open(fixed_vcf_name, 'w') as fixed_vcf:
        # vcfallelicprimitives corrupts some INFO fields: it rewrites tags
        # from KEY=.; to KEY=; which is invalid. bcftools repairs this on the
        # way through; no filter expression is given because nothing should
        # actually be filtered out.
        tenkit.log_subprocess.check_call(
            ['bcftools', 'filter', primitives_vcf], stdout=fixed_vcf)

    # Open the header template in a context manager so the handle is closed
    # instead of being left to the garbage collector.
    with open(unphased_vcf, 'w') as unphased_file, \
            open(fixed_vcf_name) as template_file:
        vcf_in = tk_io.VariantFileReader(fixed_vcf_name)
        unsupported_genotype_filter = [(
            "UNSUPPORTED_GENOTYPE",
            "If genotype field contains '.' we assume that this is due to making a single sample vcf from a multiple sample vcf in which this sample does not contain the variant."
        )]
        tenx = ('TENX', '0', 'Flag', "called by 10X", None, None)
        vcf_out = tk_io.VariantFileWriter(
            unphased_file,
            template_file=template_file,
            new_info_fields=[tenx],
            new_filters=unsupported_genotype_filter)
        for record in vcf_in.record_getter():
            # Only the first sample is inspected.
            if _has_unsupported_genotype(record.samples[0]):
                record.FILTER = ["UNSUPPORTED_GENOTYPE"]
            vcf_out.write_record(record)

    tk_tabix.sort_vcf(unphased_vcf, output_name)
예제 #2
0
def combine_vcfs(output_filename, input_vcf_filenames):
    """Concatenate several VCFs into one sorted, indexed VCF.

    The first input is copied whole (header included); every later input
    contributes only its non-header lines (those not starting with '#').
    The concatenation is written to ``output_filename + ".tmp"``, which is
    then sorted into ``output_filename``, indexed, and removed.

    Args:
        output_filename: path handed to ``tk_tabix.sort_vcf`` /
            ``tk_tabix.index_vcf`` as the destination.
        input_vcf_filenames: ordered list of input VCF paths.

    Raises:
        subprocess.CalledProcessError: if copying the first file fails.
        Exception: if ``grep`` reports an error on a later file.
    """
    tmp_filename = output_filename + ".tmp"

    # Commands are passed as argument lists and output goes through a file
    # handle, so filenames containing spaces or shell metacharacters are
    # handled literally rather than being interpreted by a shell.
    with open(tmp_filename, 'w') as tmp_file:
        for i, fn in enumerate(input_vcf_filenames):
            if i == 0:
                subprocess.check_call(['cat', fn], stdout=tmp_file)
            else:
                ret = subprocess.call(['grep', '-v', '^#', fn],
                                      stdout=tmp_file)
                # grep exits 1 when no line matched (a header-only VCF),
                # which is not an error; anything else but 0 is a failure.
                if ret not in (0, 1):
                    raise Exception(
                        "grep call failed: " + 'grep -v "^#" ' + fn)

    # Sort and index the files
    tk_tabix.sort_vcf(tmp_filename, output_filename)
    tk_tabix.index_vcf(output_filename)

    os.remove(tmp_filename)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk VCF subsets into a single sorted, indexed VCF.

    Chunk outputs are grouped by the ``cluster_id`` of their chunk
    definition, each group is combined into one per-cluster VCF, and the
    per-cluster files are merged with ``vcf-merge``. The merged result is
    sorted and indexed at ``outs.variants`` with every '.gz' removed from
    the path.

    Args:
        args: stage arguments (not used here).
        outs: stage outputs; ``outs.variants`` names the final file.
        chunk_defs: chunk definitions, each exposing ``cluster_id``.
        chunk_outs: chunk outputs, each exposing ``variant_subset``;
            paired positionally with ``chunk_defs``.
    """
    # Group the subset VCFs by the cluster they belong to.
    vcfs_by_cluster = collections.defaultdict(list)
    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        vcfs_by_cluster[chunk_def.cluster_id].append(chunk_out.variant_subset)

    # Build one combined VCF per cluster.
    per_cluster_vcfs = []
    for cluster_id, subset_vcfs in vcfs_by_cluster.iteritems():
        cluster_vcf = martian.make_path('{}.vcf'.format(cluster_id))
        tk_io.combine_vcfs(cluster_vcf, subset_vcfs)
        # NOTE(review): presumably combine_vcfs leaves a compressed file at
        # <path>.gz -- confirm against tk_io.
        per_cluster_vcfs.append(cluster_vcf + '.gz')

    # Merge all clusters into one temporary VCF.
    merge_tmp = martian.make_path('tmp.vcf')
    merge_cmd = ['vcf-merge']
    merge_cmd = merge_cmd + per_cluster_vcfs
    with open(merge_tmp, 'w') as merged_out:
        subprocess.check_call(merge_cmd, stdout=merged_out)

    # Sort and index the merged file, then drop the temporary.
    tk_tabix.sort_vcf(merge_tmp, outs.variants.replace('.gz', ''))
    tk_tabix.index_vcf(outs.variants.replace('.gz', ''))
    os.remove(merge_tmp)
예제 #4
0
def main_sort_variants(args, outs):
    """Sort (and, for several inputs, concatenate) VCF input into outs.default.

    ``args.input`` may be ``None``, ``[None]``, a list of paths, or a single
    path. When there is nothing usable to sort, ``outs.default`` is set to
    ``None``. Otherwise the result is written to ``outs.default`` with its
    last three characters removed (presumably a ".gz" suffix -- confirm
    against the stage definition).

    Args:
        args: stage arguments; only ``args.input`` is read.
        outs: stage outputs; ``outs.default`` is read and possibly cleared.
    """
    if args.input is None or args.input == [None]:
        outs.default = None
        return

    outs.coerce_strings()

    # Destination path without the trailing three characters.
    sort_filename = outs.default[:-3]

    # List of inputs
    if isinstance(args.input, list):
        files = [f for f in args.input if os.path.isfile(f)]
        if not files:
            # Previously misspelled as "outs.defaut", which left
            # outs.default pointing at a file that was never produced.
            outs.default = None
            return
        # Combine only the inputs that exist; passing the unfiltered list
        # would fail on any missing path.
        tk_io.combine_vcfs(sort_filename, files)

    # Single input
    else:
        if not os.path.exists(args.input):
            outs.default = None
            return
        tk_tabix.sort_vcf(args.input, sort_filename)
        tk_tabix.index_vcf(sort_filename)