예제 #1
0
def split(args):
    """Plan the chunks (and per-chunk resources) for variant calling.

    The calling mode, variant caller, precalled VCF path and GATK jar path
    are all obtained from ``tk_io.get_vc_mode(args.vc_precalled,
    args.variant_mode)``.

    * In the "precalled" and "precalled_plus" modes the precalled VCF is
      copied to ``precalled_vcf.vcf`` in the stage directory, indexed with
      ``tk_tabix.index_vcf``, and the ``.gz`` path is handed to every chunk
      as ``split_input``.
    * In every mode other than "precalled" the variant caller must be
      'freebayes' or 'gatk'; for 'gatk' the jar path must be given and exist.

    Args:
        args: Stage arguments. Reads ``vc_precalled``, ``variant_mode``,
            ``reference_path``, ``restrict_locus`` and ``input``.

    Returns:
        dict: ``{'chunks': [...]}``. When ``args.restrict_locus`` is None the
        chunks come from ``tk_chunks.get_sized_bam_chunks``; otherwise a
        single chunk ``{'locus': args.restrict_locus}`` is returned.

    Raises:
        NotSupportedException: if variants must be called and the variant
            caller is neither 'freebayes' nor 'gatk'.
    """
    vc_mode, variant_caller, precalled_filename, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)

    precalled_file = None
    if vc_mode in ("precalled", "precalled_plus"):
        mem_gb = 8
        threads = 1
        precalled_file = martian.make_path("precalled_vcf.vcf")
        tenkit.log_subprocess.check_call(
            ['cp', precalled_filename, precalled_file])
        tk_tabix.index_vcf(precalled_file)
        # Chunks are given the ".gz" path; presumably index_vcf leaves a
        # compressed copy next to the input -- confirm in tk_tabix.
        precalled_file = precalled_file + ".gz"

    if vc_mode != "precalled":
        # A caller is going to run, so its resource request overrides the
        # one chosen above for "precalled_plus".
        if variant_caller == 'freebayes':
            mem_gb = 5
            threads = 1
        elif variant_caller == "gatk":
            mem_gb = 8
            threads = 2
            # make sure the gatk jar file exists
            if gatk_path is None:
                martian.throw(
                    "variant_caller 'gatk' selected, must supply path to gatk jar file -- e.g. \"gatk:/path/to/GenomeAnalysisTK.jar\""
                )

            if not os.path.exists(gatk_path):
                martian.throw(
                    "variant_caller 'gatk' selected, gatk jar file does not exist: %s"
                    % gatk_path)
        else:
            # Report the value that was actually rejected (the caller), and
            # keep the mode for context. Tuple formatting also copes with a
            # caller of None, which string concatenation would not.
            raise NotSupportedException(
                'Variant caller not supported: %s (vc_mode: %s)' %
                (variant_caller, vc_mode))

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    bam_chunk_size_gb = 3.0

    if args.restrict_locus is None:
        chunk_args = {
            '__mem_gb': mem_gb,
            '__threads': threads,
            'split_input': precalled_file,
        }
        loci = tk_chunks.get_sized_bam_chunks(args.input,
                                              bam_chunk_size_gb,
                                              contig_whitelist=primary_contigs,
                                              extra_args=chunk_args)
    else:
        # NOTE(review): this single chunk carries no '__mem_gb', '__threads'
        # or 'split_input' keys, unlike the chunks built above -- confirm
        # that is intended for restricted-locus runs.
        loci = [{'locus': args.restrict_locus}]

    return {'chunks': loci}
예제 #2
0
def combine_vcfs(output_filename, input_vcf_filenames):
    """Concatenate VCF files, then sort and index the result.

    The first input is copied in full (header included). For every later
    input, lines starting with ``#`` are dropped so that only the first
    file's header ends up in the combined file. The concatenation is written
    to ``output_filename + ".tmp"``, passed to ``tk_tabix.sort_vcf`` to
    produce ``output_filename``, which is then indexed with
    ``tk_tabix.index_vcf``; the temporary file is removed afterwards.

    With an empty ``input_vcf_filenames`` the temporary file is created
    empty and still handed to ``tk_tabix.sort_vcf``.

    Args:
        output_filename: Path of the sorted VCF to produce.
        input_vcf_filenames: Paths of the VCF files to concatenate, in order.

    Raises:
        subprocess.CalledProcessError: if ``cat`` fails on the first input.
        Exception: if ``grep`` exits with status 2 on a later input.
    """
    tmp_filename = output_filename + ".tmp"

    # Run the commands from argument lists with stdout pointed at the open
    # file instead of building shell strings: filenames containing spaces or
    # shell metacharacters are then passed through verbatim. The children
    # share this file descriptor, so their output is laid down sequentially.
    with open(tmp_filename, 'w') as tmp_file:
        for i, fn in enumerate(input_vcf_filenames):
            if i == 0:
                subprocess.check_call(['cat', fn], stdout=tmp_file)
            else:
                # grep exits with 1 when no line is selected (e.g. a
                # header-only VCF), which is fine; only status 2 is treated
                # as a failure.
                ret = subprocess.call(['grep', '-v', '^#', fn],
                                      stdout=tmp_file)
                if ret == 2:
                    raise Exception('grep call failed: grep -v "^#" ' + fn)

    # Sort and index the files
    tk_tabix.sort_vcf(tmp_filename, output_filename)
    tk_tabix.index_vcf(output_filename)

    os.remove(tmp_filename)
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the per-chunk VCF subsets into one sorted, indexed VCF.

    Chunk outputs are grouped by the ``cluster_id`` of their chunk
    definition. Each group is combined with ``tk_io.combine_vcfs`` into
    ``<cluster_id>.vcf`` in the stage directory, and the resulting
    ``<cluster_id>.vcf.gz`` files are merged with the external ``vcf-merge``
    command. The merged output is sorted and indexed at ``outs.variants``
    with a trailing ``.gz`` removed.

    Args:
        args: Stage arguments (not read here).
        outs: Stage outputs; ``outs.variants`` is the final VCF path.
        chunk_defs: Chunk definitions; ``cluster_id`` is read from each.
        chunk_outs: Chunk outputs, parallel to ``chunk_defs``;
            ``variant_subset`` is read from each.

    Raises:
        subprocess.CalledProcessError: if ``vcf-merge`` exits non-zero.
    """
    # mapping of cluster ID -> VCFs
    to_merge = collections.defaultdict(list)
    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        to_merge[chunk_def.cluster_id].append(chunk_out.variant_subset)

    # merge each VCF subset for a cluster
    merged_vcfs = []
    for cluster_id, vcf_list in to_merge.items():
        merged_vcf = martian.make_path('{}.vcf'.format(cluster_id))
        tk_io.combine_vcfs(merged_vcf, vcf_list)
        # presumably combine_vcfs leaves its indexed result at "<path>.gz"
        # -- confirm against tk_io / tk_tabix.
        merged_vcfs.append(merged_vcf + '.gz')

    # Strip only a trailing ".gz"; a blanket str.replace would also mangle
    # any directory or file stem that happens to contain ".gz".
    variants_vcf = outs.variants
    if variants_vcf.endswith('.gz'):
        variants_vcf = variants_vcf[:-len('.gz')]

    # final merge to make one combined VCF
    tmp = martian.make_path('tmp.vcf')
    cmd = ['vcf-merge'] + merged_vcfs
    with open(tmp, 'w') as outf:
        subprocess.check_call(cmd, stdout=outf)
    # Sort and index the files
    tk_tabix.sort_vcf(tmp, variants_vcf)
    tk_tabix.index_vcf(variants_vcf)
    os.remove(tmp)
예제 #4
0
def main_sort_variants(args, outs):
    """Sort and index the input VCF(s) into the stage's default output.

    ``args.input`` may be None, a single path, or a list of paths:

    * None or ``[None]``: ``outs.default`` is set to None.
    * list: entries that are not existing files are dropped. If none
      remain, ``outs.default`` is set to None; otherwise the remaining files
      are passed to ``tk_io.combine_vcfs``.
    * single path: if it does not exist, ``outs.default`` is set to None;
      otherwise it is sorted with ``tk_tabix.sort_vcf`` and indexed with
      ``tk_tabix.index_vcf``.

    Args:
        args: Stage arguments; only ``input`` is read.
        outs: Stage outputs; ``default`` is the output path and is reset to
            None when there is nothing to sort.
    """
    if args.input is None or args.input == [None]:
        outs.default = None
        return

    outs.coerce_strings()

    # The working filename is the output path minus its last three
    # characters (presumably a ".gz" suffix -- confirm against the stage
    # definition).
    vcf_filename = outs.default[:-3]

    # List of inputs
    if isinstance(args.input, list):
        # Skip None entries explicitly: os.path.isfile(None) raises
        # TypeError rather than returning False.
        files = [f for f in args.input
                 if f is not None and os.path.isfile(f)]
        if not files:
            outs.default = None
            return
        # Combine only the files that exist, not the unfiltered input list.
        tk_io.combine_vcfs(vcf_filename, files)
        return

    # Single input
    if not os.path.exists(args.input):
        outs.default = None
        return
    tk_tabix.sort_vcf(args.input, vcf_filename)
    tk_tabix.index_vcf(vcf_filename)