Example #1
def varsim_somatic_main():
    main_parser = argparse.ArgumentParser(
        description="VarSim: somatic workflow",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    main_parser.add_argument("--out_dir",
                             metavar="Out directory",
                             help="Output directory",
                             default="somatic_out")
    main_parser.add_argument("--work_dir",
                             metavar="Work directory",
                             help="Work directory",
                             default="somatic_work")
    main_parser.add_argument("--log_dir",
                             metavar="Log directory",
                             help="Directory to log to",
                             default="somatic_log")
    main_parser.add_argument("--reference",
                             metavar="FASTA",
                             help="Reference genome",
                             required=True,
                             type=file)
    main_parser.add_argument("--seed",
                             metavar="INT",
                             help="Random number seed",
                             type=int,
                             default=0)
    main_parser.add_argument("--sex",
                             metavar="Sex",
                             help="Sex of the person (MALE/FEMALE)",
                             required=False,
                             type=str,
                             choices=["MALE", "FEMALE"],
                             default="MALE")
    main_parser.add_argument("--id",
                             metavar="id",
                             help="Sample ID",
                             required=True)
    main_parser.add_argument("--simulator",
                             metavar="simulator",
                             help="Read simulator to use",
                             required=False,
                             type=str,
                             choices=["art", "dwgsim"],
                             default="art")
    main_parser.add_argument(
        "--simulator_executable",
        metavar="PATH",
        help="Path to the executable of the read simulator chosen",
        required=True,
        type=file)
    main_parser.add_argument("--varsim_jar",
                             metavar="PATH",
                             help="Path to VarSim.jar (deprecated)",
                             type=file,
                             default=None,
                             required=False)
    main_parser.add_argument("--read_length",
                             metavar="INT",
                             help="Length of read to simulate",
                             default=100,
                             type=int)
    main_parser.add_argument(
        "--nlanes",
        metavar="INT",
        help=
        "Number of lanes to generate; coverage will be divided evenly over the lanes. Simulation is parallelized over lanes, and each lane gets its own pair of files",
        default=3,
        type=int)
    main_parser.add_argument("--total_coverage",
                             metavar="FLOAT",
                             help="Total coverage to simulate",
                             default=1.0,
                             type=float)
    main_parser.add_argument("--mean_fragment_size",
                             metavar="INT",
                             help="Mean fragment size",
                             default=350,
                             type=int)
    main_parser.add_argument("--sd_fragment_size",
                             metavar="INT",
                             help="Standard deviation of fragment size",
                             default=50,
                             type=int)

    main_parser.add_argument("--force_five_base_encoding",
                             action="store_true",
                             help="Force bases to be ACTGN")
    main_parser.add_argument("--filter",
                             action="store_true",
                             help="Only use PASS variants")
    main_parser.add_argument("--keep_temp",
                             action="store_true",
                             help="Keep temporary files")
    main_parser.add_argument("--java_max_mem",
                             metavar="XMX",
                             help="max java memory",
                             default="10g",
                             type=str)
    main_parser.add_argument("--java",
                             metavar="PATH",
                             help="path to java",
                             default="java",
                             type=str)
    main_parser.add_argument("--python",
                             metavar="PATH",
                             help="path to python",
                             default="python",
                             type=str)
    main_parser.add_argument('--version',
                             action='version',
                             version=get_version())

    input_vcf_group = main_parser.add_argument_group("Input VCFs options")
    input_vcf_group.add_argument(
        "--cosmic_vcf",
        metavar="VCF",
        help=
        "COSMIC database VCF; must be specified when random COSMIC sampling is enabled."
    )
    input_vcf_group.add_argument("--normal_vcf",
                                 metavar="VCF",
                                 help="Normal VCF from previous VarSim run",
                                 required=True)
    input_vcf_group.add_argument("--somatic_vcfs",
                                 metavar="VCF",
                                 nargs="+",
                                 help="Somatic VCF",
                                 default=[])
    input_vcf_group.add_argument(
        "--merge_priority",
        choices=["sn", "ns"],
        help=
        "Priority of merging (lowest first) somatic (s) and normal truth (n).",
        default="sn")

    pipeline_control_group = main_parser.add_argument_group(
        "Pipeline control options. Disable parts of the pipeline.")
    pipeline_control_group.add_argument("--disable_rand_vcf",
                                        action="store_true",
                                        help="Disable RandVCF2VCF somatic")
    pipeline_control_group.add_argument("--disable_vcf2diploid",
                                        action="store_true",
                                        help="Disable vcf2diploid")
    pipeline_control_group.add_argument("--disable_sim",
                                        action="store_true",
                                        help="Disable read simulation")

    # RandVCF2VCF seed num_SNP num_INS num_DEL num_MNP num_COMPLEX percent_novel min_length_lim max_length_lim reference_file file.vcf
    rand_vcf_group = main_parser.add_argument_group(
        "RandVCF2VCF somatic options")
    rand_vcf_group.add_argument("--som_num_snp",
                                metavar="INT",
                                help="Number of somatic SNPs",
                                default=9000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_ins",
                                metavar="INT",
                                help="Number of somatic insertions",
                                default=1000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_del",
                                metavar="INT",
                                help="Number of somatic deletions",
                                default=1000,
                                type=int)
    rand_vcf_group.add_argument("--som_num_mnp",
                                metavar="INT",
                                help="Number of somatic MNPs",
                                default=100,
                                type=int)
    rand_vcf_group.add_argument("--som_num_complex",
                                metavar="INT",
                                help="Number of somatic complex variants",
                                default=100,
                                type=int)
    # rand_vcf_group.add_argument("--som_percent_novel", metavar="percent_novel", help="Percent novel", default=0, type=float)
    rand_vcf_group.add_argument("--som_min_length_lim",
                                metavar="INT",
                                help="Min length lim",
                                default=0,
                                type=int)
    rand_vcf_group.add_argument("--som_max_length_lim",
                                metavar="INT",
                                help="Max length lim",
                                default=49,
                                type=int)
    # rand_vcf_group.add_argument("--som_vcf", metavar="in_vcf", help="Input somatic variant database VCF", type=file, required=False)
    rand_vcf_group.add_argument(
        "--som_prop_het",
        metavar="FLOAT",
        help="Proportion of somatic heterozygous variants",
        default=1.0,
        type=float)
    rand_vcf_group.add_argument(
        "--sv_insert_seq",
        metavar="FILE",
        help=
        "Path to file containing concatenation of real insertion sequences",
        type=file,
        required=True)

    dwgsim_group = main_parser.add_argument_group("DWGSIM options")
    dwgsim_group.add_argument("--dwgsim_start_e",
                              metavar="first_base_error_rate",
                              help="Error rate on the first base",
                              default=0.0001,
                              type=float)
    dwgsim_group.add_argument("--dwgsim_end_e",
                              metavar="last_base_error_rate",
                              help="Error rate on the last base",
                              default=0.0015,
                              type=float)
    dwgsim_group.add_argument("--dwgsim_options",
                              help="DWGSIM command-line options",
                              default="",
                              required=False)

    art_group = main_parser.add_argument_group("ART options")
    art_group.add_argument("--profile_1",
                           metavar="profile_file1",
                           help="Profile for first end",
                           default=None,
                           type=file)
    art_group.add_argument("--profile_2",
                           metavar="profile_file2",
                           help="Profile for second end",
                           default=None,
                           type=file)
    art_group.add_argument("--art_options",
                           help="ART command-line options",
                           default="",
                           required=False)

    args = main_parser.parse_args()

    args.java = utils.get_java(args.java)
    check_java(args.java)
    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    makedirs([args.log_dir, args.out_dir])

    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"),
                        filemode="w",
                        level=logging.DEBUG,
                        format=FORMAT)
    logger = logging.getLogger(varsim_somatic_main.__name__)

    if not args.disable_sim:
        if not args.simulator_executable:
            logger.error(
                "Please specify %s binary with --simulator_executable option" %
                args.simulator)
            sys.exit(os.EX_USAGE)
        check_executable(args.simulator_executable.name)

    t_s = time.time()

    cosmic_sampled_vcfs = []
    if not args.disable_rand_vcf:
        if not args.cosmic_vcf:
            logger.error(
                "COSMIC database VCF not specified using --cosmic_vcf")
            sys.exit(os.EX_USAGE)
        rand_vcf_stdout = open(os.path.join(args.out_dir, "random.cosmic.vcf"),
                               "w")
        rand_vcf_stderr = open(os.path.join(args.log_dir, "random.cosmic.err"),
                               "w")
        cosmic_sampled_vcfs = [rand_vcf_stdout.name]

        # Novel variants are not yet supported for COSMIC sampling, so percent_novel is set to 0 below
        randvcf_options = RandVCFOptions(args.som_num_snp, args.som_num_ins,
                                         args.som_num_del, args.som_num_mnp,
                                         args.som_num_complex, 0,
                                         args.som_min_length_lim,
                                         args.som_max_length_lim,
                                         args.som_prop_het)
        run_randvcf(os.path.realpath(args.cosmic_vcf), rand_vcf_stdout,
                    rand_vcf_stderr, args.seed, args.sex, randvcf_options,
                    args.reference.name, args.java)

    normal_vcfs = [args.normal_vcf]
    somatic_vcfs = cosmic_sampled_vcfs + args.somatic_vcfs
    fixed_somatic_vcfs = []
    if somatic_vcfs:
        vcfs_dir = os.path.join(args.out_dir, "somatic_vcfs")
        makedirs([vcfs_dir])
        count = 0
        for index, vcf in enumerate(somatic_vcfs):
            copied_vcf = os.path.join(vcfs_dir, "%d.vcf" % index)
            logger.info(
                "Copying somatic VCF %s to %s and adding VARSIMSOMATIC id to entries if missing"
                % (vcf, copied_vcf))
            with open(vcf, "r") as vcf_fd, open(copied_vcf,
                                                "w") as copied_vcf_fd:
                for line in vcf_fd:
                    if line.startswith("#"):
                        copied_vcf_fd.write(line)
                    else:
                        line_fields = line.split("\t")
                        line_fields[2] = (
                            "VARSIMSOMATIC%d" %
                            count) if line_fields[2] == "." else (
                                "%s,VARSIMSOMATIC%d" % (line_fields[2], count))
                        copied_vcf_fd.write("\t".join(line_fields))
                        count += 1
            fixed_somatic_vcfs.append(copied_vcf)

    vcf_files = (fixed_somatic_vcfs +
                 normal_vcfs) if args.merge_priority == "sn" else (
                     normal_vcfs + fixed_somatic_vcfs)
    vcf_files = map(os.path.realpath, filter(None, vcf_files))

    processes = run_vcfstats(vcf_files, args.out_dir, args.log_dir, args.java)

    # Run VarSim
    varsim_stdout = open(os.path.join(args.log_dir, "som_varsim.out"), "w")
    varsim_stderr = open(os.path.join(args.log_dir, "som_varsim.log"), "w")

    vcf_arg_list = ["--vcfs"] + vcf_files

    # Re-expand the parsed store_true flags into argument tokens for the varsim.py command below
    filter_arg_list = ["--filter"] if args.filter else []
    disable_sim_arg_list = ["--disable_sim"] if args.disable_sim else []
    force_five_base_encoding_arg_list = [
        "--force_five_base_encoding"
    ] if args.force_five_base_encoding else []
    keep_temp_arg_list = ["--keep_temp"] if args.keep_temp else []
    profile_1_arg_list = ["--profile_1", args.profile_1.name
                          ] if args.profile_1 is not None else []
    profile_2_arg_list = ["--profile_2", args.profile_2.name
                          ] if args.profile_2 is not None else []
    other_varsim_opts = []
    if args.simulator == "dwgsim":
        other_varsim_opts = [
            "--dwgsim_start_e",
            str(args.dwgsim_start_e), "--dwgsim_end_e",
            str(args.dwgsim_end_e)
        ]
        if args.dwgsim_options:
            other_varsim_opts += ["--dwgsim_options", str(args.dwgsim_options)]
    elif args.simulator == "art" and args.art_options:
        other_varsim_opts += ["--art_options", args.art_options]

    args.python = utils.get_python(args.python)
    varsim_command = [args.python, os.path.realpath(VARSIM_PY),
                      "--out_dir", str(os.path.realpath(args.out_dir)),
                      "--work_dir", str(os.path.realpath(args.work_dir)),
                      "--log_dir", str(os.path.realpath(os.path.join(args.log_dir, "varsim"))),
                      "--reference", str(os.path.realpath(args.reference.name)),
                      "--seed", str(args.seed),
                      "--sex", str(args.sex),
                      "--id", str(args.id),
                      "--simulator", str(args.simulator),
                      "--simulator_executable", str(args.simulator_executable.name),
                      "--read_length", str(args.read_length),
                      "--nlanes", str(args.nlanes),
                      "--total_coverage", str(args.total_coverage),
                      "--mean_fragment_size", str(args.mean_fragment_size),
                      "--sd_fragment_size", str(args.sd_fragment_size),
                      "--disable_rand_vcf",
                      "--disable_rand_dgv",
        "--sv_insert_seq", args.sv_insert_seq.name] + other_varsim_opts + vcf_arg_list + filter_arg_list + disable_sim_arg_list \
                     + force_five_base_encoding_arg_list + keep_temp_arg_list + profile_1_arg_list + profile_2_arg_list
    varsim_command = " ".join(varsim_command)
    p_varsim = subprocess.Popen(varsim_command,
                                stdout=varsim_stdout,
                                stderr=varsim_stderr,
                                shell=True)
    logger.info("Executing command " + varsim_command + " with pid " +
                str(p_varsim.pid))
    processes.append(p_varsim)

    processes = monitor_processes(processes)

    # Split the tumor truth VCF into normal variants and somatic variants
    tumor_vcf = os.path.realpath(
        os.path.join(args.out_dir, "%s.truth.vcf" % args.id))
    normal_vcf = os.path.join(args.out_dir, "%s_norm.vcf" % args.id)
    somatic_vcf = os.path.join(args.out_dir, "%s_somatic.vcf" % args.id)
    logger.info("Splitting the truth VCF %s into normal and somatic VCFs" %
                tumor_vcf)
    with open(tumor_vcf, "r") as tumor_truth_fd, \
        open(normal_vcf, "w") as normal_vcf_fd, \
        open(somatic_vcf, "w") as somatic_vcf_fd:
        for line in tumor_truth_fd:
            if line.startswith("#"):
                somatic_vcf_fd.write(line)
                normal_vcf_fd.write(line)
                continue
            if line.find("VARSIMSOMATIC") >= 0:
                somatic_vcf_fd.write(line)
            else:
                normal_vcf_fd.write(line)

    run_vcfstats([normal_vcf, somatic_vcf], args.out_dir, args.log_dir,
                 args.java)

    logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))
Example #2
def varsim_main(
        reference,
        simulator,  # use None to disable simulation
        simulator_exe,
        total_coverage,
        variant_vcfs=[],
        sampling_vcf=None,
        dgv_file=None,
        randvcf_options=None,  # use None to disable RandVCF
        randdgv_options=None,  # use None to disable RandDGV 
        nlanes=1,
        simulator_options="",
        sample_id="VarSim_Sample",
        log_dir="log",
        out_dir="out",
        sv_insert_seq=None,
        seed=0,
        sex="MALE",
        remove_filtered=False,
        keep_temp=False,
        force_five_base_encoding=False,
        lift_ref=False,
        disable_vcf2diploid=False,
        java="java"):

    check_java(java)

    # make the directories we need
    makedirs([log_dir, out_dir])

    logger = logging.getLogger(varsim_main.__name__)

    # Make sure we can actually execute the executable
    if simulator:
        if simulator not in ["dwgsim", "art", "longislnd"]:
            raise NotImplementedError(
                "Simulation method {} not implemented".format(simulator))
        check_executable(simulator_exe)

    processes = []

    t_s = time.time()

    variant_vcfs = map(os.path.realpath, variant_vcfs)

    if sv_insert_seq:
        in_vcfs = []
        for i, vcf in enumerate(variant_vcfs):
            tool_work_dir = os.path.join(out_dir, "filled_in", str(i))
            makedirs([tool_work_dir])
            in_vcfs.append(
                fill_missing_sequences(vcf, sample_id,
                                       os.path.realpath(sv_insert_seq),
                                       reference, tool_work_dir, tool_work_dir,
                                       java))
        variant_vcfs = map(os.path.realpath, in_vcfs)
    else:
        logger.warn(
            "Not filling in SV sequences since no insert sequence file provided"
        )

    open_fds = []
    if randvcf_options:
        if not sampling_vcf:
            logger.error("Need to provide the VCF for random sampling")
            raise ValueError("Sampling VCF missing")

        rand_vcf_out_fd = open(os.path.join(out_dir, "random.vc.vcf"), "w")
        rand_vcf_log_fd = open(os.path.join(log_dir, "RandVCF2VCF.err"), "w")
        variant_vcfs.append(os.path.realpath(rand_vcf_out_fd.name))
        run_randvcf(os.path.realpath(sampling_vcf), rand_vcf_out_fd,
                    rand_vcf_log_fd, seed, sex, randvcf_options, reference,
                    java)
        open_fds += [rand_vcf_out_fd, rand_vcf_log_fd]

    if randdgv_options:
        if not sv_insert_seq:
            raise ValueError("Need SV sequence file to fill in SV sequences")

        if not dgv_file:
            logger.error("Need to provide the DGV file for random sampling")
            raise ValueError("DGV file missing")

        rand_dgv_stdout = open(os.path.join(out_dir, "random.sv.vcf"), "w")
        rand_dgv_stderr = open(os.path.join(log_dir, "RandDGV2VCF.err"), "w")
        variant_vcfs.append(os.path.realpath(rand_dgv_stdout.name))
        run_randdgv(dgv_file, rand_dgv_stdout, rand_dgv_stderr, seed, sex,
                    randdgv_options, reference, sv_insert_seq, java)
        open_fds += [rand_dgv_stdout, rand_dgv_stderr]

    processes = monitor_processes(processes)
    for open_fd in open_fds:
        open_fd.close()

    merged_reference = os.path.join(out_dir, "%s.fa" % (sample_id))
    merged_truth_vcf = os.path.join(out_dir, "%s.truth.vcf" % (sample_id))
    merged_map = os.path.join(out_dir, "%s.map" % (sample_id))

    processes = run_vcfstats(variant_vcfs, out_dir, log_dir, java)

    if not disable_vcf2diploid:
        logger.info("vcf2diploid started")
        vcf2diploid_stdout = open(os.path.join(out_dir, "vcf2diploid.out"),
                                  "w")
        vcf2diploid_stderr = open(os.path.join(log_dir, "vcf2diploid.err"),
                                  "w")
        vcf_arg_list = sum([["-vcf", v] for v in variant_vcfs], [])
        filter_arg_list = ["-pass"] if remove_filtered else []
        vcf2diploid_command = [
            java, utils.JAVA_XMX, "-jar", VARSIMJAR, "vcf2diploid", "-t", sex,
            "-id", sample_id, "-chr",
            os.path.realpath(reference)
        ] + filter_arg_list + vcf_arg_list + ["-no_contig_id"]

        logger.info("Executing command " + " ".join(vcf2diploid_command))
        subprocess.check_call(vcf2diploid_command,
                              stdout=vcf2diploid_stdout,
                              stderr=vcf2diploid_stderr,
                              cwd=out_dir)

        processes = monitor_processes(processes)

        # Now concatenate the .fa from vcf2diploid
        contigs = get_contigs_list(reference)
        contig_fastas = map(
            lambda (x, y): os.path.join(out_dir, "%s_%s_%s.fa" %
                                        (x, sample_id, y)),
            itertools.product(contigs, ["maternal", "paternal"]))
        fastas_to_cat = filter(os.path.isfile, contig_fastas)
        concatenate_files(fastas_to_cat,
                          merged_reference,
                          remove_original=True)

        if os.path.getsize(merged_reference) == 0:
            logger.error(
                "Merged FASTA is empty. Something bad happened. Exiting")
            raise RuntimeError("Empty FASTA generated by vcf2diploid")

        # concatenate the per-contig truth VCFs
        vcfs_to_cat = filter(
            os.path.isfile,
            map(lambda x: os.path.join(out_dir, "%s_%s.vcf" % (x, sample_id)),
                contigs))
        concatenate_files(vcfs_to_cat,
                          merged_truth_vcf,
                          header_str="#",
                          simple_cat=False,
                          remove_original=True)

        run_vcfstats([merged_truth_vcf], out_dir, log_dir, java)
        logger.info("vcf2diploid done")

        if lift_ref:
            lifted_dir = os.path.join(out_dir, "lifted")
            makedirs([lifted_dir])
            # quick fix for the CN (copy number) representation issue
            convertCN([merged_truth_vcf], "two2one")
            merged_truth_vcf = lift_vcfs([merged_truth_vcf],
                                         os.path.join(lifted_dir, "truth.vcf"),
                                         None,
                                         tabix_index=False)
            # quick fix for the CN (copy number) representation issue
            convertCN([merged_truth_vcf], "one2two")
            pysam.tabix_index(merged_truth_vcf, force=True, preset='vcf')
            merged_map = lift_maps([merged_map],
                                   os.path.join(lifted_dir, "truth.map"))

    if processes:
        processes = monitor_processes(processes)

    # Now generate the reads using art/pbsim/dwgsim
    tmp_files = []
    if simulator:
        fifos = []
        fastqs = []
        sim_ts = time.time()
        coverage_per_lane = total_coverage * 0.5 / nlanes
        processes = []

        fifo_src_dst = []
        if simulator == "dwgsim":
            for i, end in itertools.product(xrange(nlanes), [1, 2]):
                fifo_src_dst.append(
                    ("simulated.lane%d.read%d.fastq" % (i, end),
                     "simulated.lane%d.read%d.fq.gz" % (i, end)))
        elif simulator == "art":
            for i, end, suffix in itertools.product(xrange(nlanes), [1, 2],
                                                    ["fq", "aln"]):
                fifo_src_dst.append(
                    ("simulated.lane%d.read%d.%s" % (i, end, suffix),
                     "simulated.lane%d.read%d.%s.gz" % (i, end, suffix)))
        else:  # simulator == "longislnd":
            pass

        for fifo_name, dst in fifo_src_dst:
            fifos.append(os.path.join(out_dir, fifo_name))
            if os.path.exists(fifos[-1]): os.remove(fifos[-1])
            os.mkfifo(fifos[-1])

            gzip_stderr = open(os.path.join(log_dir, "gzip.%s" % (fifo_name)),
                               "w")
            gzip_command = "cat %s | gzip -2 > %s" % (
                fifos[-1], os.path.join(out_dir, dst))
            logger.info("Executing command %s" % (gzip_command))
            gzip_p = subprocess.Popen(gzip_command,
                                      stdout=None,
                                      stderr=gzip_stderr,
                                      shell=True)
            logger.info(" with pid " + str(gzip_p.pid))
            processes.append(gzip_p)
            tmp_files.append(os.path.join(out_dir, dst))

        simulator_commands_files = []
        if simulator == "dwgsim":
            for i in xrange(nlanes):
                simulator_command = "{} {} -C {} -z {} {} {}".format(
                    os.path.realpath(simulator_exe), simulator_options,
                    coverage_per_lane, seed + i, merged_reference,
                    os.path.join(out_dir, "simulated.lane%d" % (i)))
                simulator_commands_files.append(
                    (simulator_command,
                     os.path.join(log_dir, "dwgsim.lane%d.out" % (i)),
                     os.path.join(log_dir, "dwgsim.lane%d.err" % (i))))
        elif simulator == "art":
            for i in xrange(nlanes):
                simulator_command = "{} {} -i {} -f {} -rs {} -o {}".format(
                    simulator_exe, simulator_options, merged_reference,
                    coverage_per_lane, seed + i,
                    os.path.join(out_dir, "simulated.lane%d.read" % (i)))
                simulator_commands_files.append(
                    (simulator_command,
                     os.path.join(log_dir, "art.lane%d.out" % (i)),
                     os.path.join(log_dir, "art.lane%d.err" % (i))))
        else:  # simulator == "longislnd":
            simulator_command = "{} {} --coverage {} --out {} --fasta {}".format(
                simulator_exe, simulator_options, total_coverage * 0.5,
                os.path.join(out_dir, "longislnd_sim"), merged_reference)
            simulator_commands_files.append(
                (simulator_command, os.path.join(log_dir, "longislnd.out"),
                 os.path.join(log_dir, "longislnd.err")))

        simulator_fds = []
        for command, stdout, stderr in simulator_commands_files:
            stdout_fd = open(stdout, "w")
            stderr_fd = open(stderr, "w")
            process = subprocess.Popen(command,
                                       stdout=stdout_fd,
                                       stderr=stderr_fd,
                                       shell=True,
                                       close_fds=True)
            logger.info("Executing command {} with pid {}".format(
                command, process.pid))
            processes.append(process)
            simulator_fds += [stdout_fd, stderr_fd]

        monitor_processes(processes)

        for fd in simulator_fds:
            fd.close()

        processes = []

        logger.info("Read generation took %g seconds" % (time.time() - sim_ts))

        sim_t_liftover = time.time()

        # Now start lifting over the gzipped files
        if simulator != "longislnd":
            for i in xrange(nlanes):
                liftover_stdout = open(
                    os.path.join(log_dir, "lane%d.out" % (i)), "w")
                liftover_stderr = open(
                    os.path.join(log_dir, "liftover%d.log" % (i)), "w")
                fastq_liftover_command = "%s -server %s -jar %s fastq_liftover -map %s -id %d " \
                                         "-fastq <(gunzip -c %s/simulated.lane%d.read1.fq.gz) " \
                                         "-fastq <(gunzip -c %s/simulated.lane%d.read2.fq.gz) " \
                                         "-out >(gzip -1 > %s/lane%d.read1.fq.gz) " \
                                         "-out >(gzip -1 > %s/lane%d.read2.fq.gz)" % (
                                             java,
                                             utils.JAVA_XMX,
                                             VARSIMJAR, merged_map, i, out_dir, i,
                                             out_dir, i, out_dir, i,
                                             out_dir, i)
                if force_five_base_encoding:
                    fastq_liftover_command += " -force_five_base_encoding "
                if simulator == "art":
                    fastq_liftover_command += " -type art " \
                                              "-aln <(gunzip -c %s/simulated.lane%d.read1.aln.gz) " \
                                              "-aln <(gunzip -c %s/simulated.lane%d.read2.aln.gz)" % (
                                                  out_dir, i, out_dir, i)
                elif simulator == "pbsim":
                    fastq_liftover_command += " -type pbsim " \
                                              "-maf <(gunzip -c %s/simulated.lane%d.read1.maf.gz) " \
                                              "-ref %s/simulated.lane%d.ref " % (out_dir, i, out_dir, i)
                fastq_liftover_command = "bash -c \"%s\"" % (
                    fastq_liftover_command)
                logger.info("Executing command " + fastq_liftover_command)
                subprocess.check_call(fastq_liftover_command,
                                      stdout=liftover_stdout,
                                      stderr=liftover_stderr,
                                      shell=True)
                # collect both lifted FASTQs for this lane
                for end in (1, 2):
                    fastqs.append(
                        os.path.join(out_dir, "lane%d.read%d.fq.gz" % (i, end)))
        else:
            # liftover the read map files
            read_map_files = list(
                glob.glob(os.path.join(out_dir, "longislnd_sim", "*.bed")))
            merged_raw_readmap = os.path.join(out_dir, "longislnd_sim",
                                              "merged_readmap.bed")
            concatenate_files(read_map_files, merged_raw_readmap)
            read_maps = "-longislnd %s" % merged_raw_readmap
            read_map_liftover_command = "%s %s -server -jar %s longislnd_liftover " % (
                java, utils.JAVA_XMX, VARSIMJAR
            ) + read_maps + " -map %s " % merged_map + " -out %s" % (
                os.path.join(out_dir, sample_id + ".truth.map"))
            read_map_liftover_stderr = open(
                os.path.join(log_dir, "longislnd_liftover.err"), "w")
            logger.info("Executing command " + read_map_liftover_command)
            subprocess.check_call(read_map_liftover_command,
                                  stdout=None,
                                  stderr=read_map_liftover_stderr,
                                  shell=True)

        monitor_processes(processes)

        logger.info("Liftover took %g seconds" %
                    (time.time() - sim_t_liftover))

        sim_te = max(sim_ts + 1, time.time())
        bytes_written = sum([os.path.getsize(fastq) for fastq in fastqs])
        logger.info("Took %g seconds, %ld Mbytes written, %g MB/s" %
                    (sim_te - sim_ts, bytes_written / 1024.0 / 1024.0,
                     bytes_written / 1024.0 / 1024.0 / (sim_te - sim_ts)))

        for fifo in fifos:
            os.remove(fifo)

    if not keep_temp:
        logger.info("Cleaning up intermediate files")
        for f in tmp_files:
            os.remove(f)
    logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))
Example #3
File: varsim.py | Project: bioinform/varsim
        bytes_written = sum([os.path.getsize(fastq) for fastq in fastqs])
        logger.info("Took %g seconds, %ld Mbytes written, %g MB/s" % (
            sim_te - sim_ts, bytes_written / 1024.0 / 1024.0, bytes_written / 1024.0 / 1024.0 / (sim_te - sim_ts)))

        for fifo in fifos:
            os.remove(fifo)

    if not keep_temp:
        logger.info("Cleaning up intermediate files")
        for f in tmp_files:
            os.remove(f)
    logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))


if __name__ == "__main__":
    check_java()

    main_parser = argparse.ArgumentParser(description="VarSim: A high-fidelity simulation validation framework",
                                          formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    main_parser.add_argument("--out_dir", metavar="DIR",
                             help="Output directory for the simulated genome, reads and variants", required=False,
                             default="out")
    main_parser.add_argument("--work_dir", metavar="DIR", help="Work directory, currently not used", required=False,
                             default="work")
    main_parser.add_argument("--log_dir", metavar="DIR", help="Log files of all steps are kept here", required=False,
                             default="log")
    main_parser.add_argument("--reference", metavar="FASTA", help="Reference genome that variants will be inserted into",
                             required=True)
    main_parser.add_argument("--seed", metavar="seed", help="Random number seed for reproducibility", type=int, default=0)
    main_parser.add_argument("--sex", metavar="Sex", help="Sex of the person (MALE/FEMALE)", required=False, type=str,
                             choices=["MALE", "FEMALE"], default="MALE")
Example #4
File: varsim.py | Project: bioinform/varsim
def varsim_main(reference,
                simulator, # use None to disable simulation
                simulator_exe,
                total_coverage,
                variant_vcfs=[],
                sampling_vcf=None,
                dgv_file=None,
                randvcf_options=None, # use None to disable RandVCF
                randdgv_options=None, # use None to disable RandDGV 
                nlanes=1,
                simulator_options="",
                sample_id="VarSim_Sample",
                log_dir="log",
                out_dir="out",
                sv_insert_seq=None,
                seed=0,
                sex="MALE",
                remove_filtered=False,
                keep_temp=False,
                force_five_base_encoding=False,
                lift_ref=False,
                disable_vcf2diploid=False):
    check_java()

    # make the directories we need
    makedirs([log_dir, out_dir])

    logger = logging.getLogger(varsim_main.__name__)

    # Make sure we can actually execute the executable
    if simulator:
        if simulator not in ["dwgsim", "art", "longislnd"]:
            raise NotImplementedError("Simulation method {} not implemented".format(simulator))
        check_executable(simulator_exe)

    processes = []

    t_s = time.time()

    variant_vcfs = map(os.path.realpath, variant_vcfs)

    if sv_insert_seq:
        in_vcfs = []
        for i, vcf in enumerate(variant_vcfs):
            tool_work_dir = os.path.join(out_dir, "filled_in", str(i))
            makedirs([tool_work_dir])
            in_vcfs.append(fill_missing_sequences(vcf, sample_id, os.path.realpath(sv_insert_seq), reference, tool_work_dir, tool_work_dir))
        variant_vcfs = map(os.path.realpath, in_vcfs)
    else:
        logger.warn("Not filling in SV sequences since no insert sequence file provided")

    open_fds = []
    if randvcf_options:
        if not sampling_vcf:
            logger.error("Need to provide the VCF for random sampling")
            raise ValueError("Sampling VCF missing")

        rand_vcf_out_fd = open(os.path.join(out_dir, "random.vc.vcf"), "w")
        rand_vcf_log_fd = open(os.path.join(log_dir, "RandVCF2VCF.err"), "w")
        variant_vcfs.append(os.path.realpath(rand_vcf_out_fd.name))
        run_randvcf(os.path.realpath(sampling_vcf), rand_vcf_out_fd, rand_vcf_log_fd, seed, sex, randvcf_options, reference)
        open_fds += [rand_vcf_out_fd, rand_vcf_log_fd]

    if randdgv_options:
        if not sv_insert_seq:
            raise ValueError("Need SV sequence file to fill in SV sequences")

        if not dgv_file:
            logger.error("Need to provide the DGV file for random sampling")
            raise ValueError("DGV file missing")

        rand_dgv_stdout = open(os.path.join(out_dir, "random.sv.vcf"), "w")
        rand_dgv_stderr = open(os.path.join(log_dir, "RandDGV2VCF.err"), "w")
        variant_vcfs.append(os.path.realpath(rand_dgv_stdout.name))
        run_randdgv(dgv_file, rand_dgv_stdout, rand_dgv_stderr, seed, sex, randdgv_options, reference, sv_insert_seq)
        open_fds += [rand_dgv_stdout, rand_dgv_stderr]

    processes = monitor_processes(processes)
    for open_fd in open_fds:
        open_fd.close()

    merged_reference = os.path.join(out_dir, "%s.fa" % (sample_id))
    merged_truth_vcf = os.path.join(out_dir, "%s.truth.vcf" % (sample_id))
    merged_map = os.path.join(out_dir, "%s.map" % (sample_id))

    processes = run_vcfstats(variant_vcfs, out_dir, log_dir)

    if not disable_vcf2diploid:
        logger.info("vcf2diploid started")
        vcf2diploid_stdout = open(os.path.join(out_dir, "vcf2diploid.out"), "w")
        vcf2diploid_stderr = open(os.path.join(log_dir, "vcf2diploid.err"), "w")
        vcf_arg_list = sum([["-vcf", v] for v in variant_vcfs], [])
        filter_arg_list = ["-pass"] if remove_filtered else []
        vcf2diploid_command = ["java", utils.JAVA_XMX, "-jar", VARSIMJAR, "vcf2diploid",
                               "-t", sex,
                               "-id", sample_id,
                               "-chr", os.path.realpath(reference)] + filter_arg_list + vcf_arg_list + ["-no_contig_id"]

        logger.info("Executing command " + " ".join(vcf2diploid_command))
        subprocess.check_call(vcf2diploid_command, stdout=vcf2diploid_stdout, stderr=vcf2diploid_stderr,
                                         cwd=out_dir)

        processes = monitor_processes(processes)

        # Now concatenate the .fa from vcf2diploid
        contigs = get_contigs_list(reference)
        contig_fastas = map(lambda (x, y): os.path.join(out_dir, "%s_%s_%s.fa" % (x, sample_id, y)), itertools.product(contigs, ["maternal", "paternal"]))
        fastas_to_cat = filter(os.path.isfile, contig_fastas)
        concatenate_files(fastas_to_cat, merged_reference, remove_original=True)

        if os.path.getsize(merged_reference) == 0:
            logger.error("Merged FASTA is empty. Something bad happened. Exiting")
            raise RuntimeError("Empty FASTA generated by vcf2diploid")

        # concatenate the per-contig truth VCFs
        vcfs_to_cat = filter(os.path.isfile, map(lambda x: os.path.join(out_dir, "%s_%s.vcf" % (x, sample_id)), contigs))
        concatenate_files(vcfs_to_cat, merged_truth_vcf, header_str="#", simple_cat=False, remove_original=True)

        monitor_processes(run_vcfstats([merged_truth_vcf], out_dir, log_dir))
        logger.info("vcf2diploid done")

        if lift_ref:
            lifted_dir = os.path.join(out_dir, "lifted")
            makedirs([lifted_dir])
            # quick fix for the CN (copy number) representation issue
            convertCN([merged_truth_vcf], "two2one")
            merged_truth_vcf = lift_vcfs([merged_truth_vcf], os.path.join(lifted_dir, "truth.vcf"), None, tabix_index=False)
            # quick fix for the CN (copy number) representation issue
            convertCN([merged_truth_vcf], "one2two")
            pysam.tabix_index(merged_truth_vcf, force=True, preset='vcf')
            merged_map = lift_maps([merged_map], os.path.join(lifted_dir, "truth.map"))

    if processes:
        processes = monitor_processes(processes)

    # Now generate the reads using art/pbsim/dwgsim
    tmp_files = []
    if simulator:
        fifos = []
        fastqs = []
        sim_ts = time.time()
        coverage_per_lane = total_coverage * 0.5 / nlanes
        processes = []

        fifo_src_dst = []
        if simulator == "dwgsim":
            for i, end in itertools.product(xrange(nlanes), [1, 2]):
                fifo_src_dst.append(
                    ("simulated.lane%d.read%d.fastq" % (i, end),
                     "simulated.lane%d.read%d.fq.gz" % (i, end)))
        elif simulator == "art":
            for i, end, suffix in itertools.product(xrange(nlanes), [1, 2], ["fq", "aln"]):
                fifo_src_dst.append(("simulated.lane%d.read%d.%s" % (i, end, suffix),
                                     "simulated.lane%d.read%d.%s.gz" % (i, end, suffix)))
        else: # simulator == "longislnd":
            pass

        for fifo_name, dst in fifo_src_dst:
            fifos.append(os.path.join(out_dir, fifo_name))
            if os.path.exists(fifos[-1]): os.remove(fifos[-1])
            os.mkfifo(fifos[-1])

            gzip_stderr = open(os.path.join(log_dir, "gzip.%s" % (fifo_name)), "w")
            gzip_command = "cat %s | gzip -2 > %s" % (fifos[-1], os.path.join(out_dir, dst))
            logger.info("Executing command %s" % (gzip_command) )
            gzip_p = subprocess.Popen(gzip_command, stdout = None, stderr = gzip_stderr, shell = True)
            logger.info( " with pid " + str(gzip_p.pid))
            processes.append(gzip_p)
            tmp_files.append(os.path.join(out_dir, dst))

        simulator_commands_files = []
        if simulator == "dwgsim":
            for i in xrange(nlanes):
                simulator_command = "{} {} -C {} -z {} {} {}".format(os.path.realpath(simulator_exe), simulator_options, coverage_per_lane, seed + i, merged_reference, os.path.join(out_dir, "simulated.lane%d" % (i)))
                simulator_commands_files.append((simulator_command, os.path.join(log_dir, "dwgsim.lane%d.out" % (i)), os.path.join(log_dir, "dwgsim.lane%d.err" % (i))))
        elif simulator == "art":
            for i in xrange(nlanes):
                simulator_command = "{} {} -i {} -f {} -rs {} -o {}".format(simulator_exe, simulator_options, merged_reference, coverage_per_lane, seed + i, os.path.join(out_dir, "simulated.lane%d.read" % (i)))
                simulator_commands_files.append((simulator_command, os.path.join(log_dir, "art.lane%d.out" % (i)), os.path.join(log_dir, "art.lane%d.err" % (i))))
        else: # simulator == "longislnd":
            simulator_command = "{} {} --coverage {} --out {} --fasta {}".format(simulator_exe, simulator_options, total_coverage * 0.5, os.path.join(out_dir, "longislnd_sim"), merged_reference)
            simulator_commands_files.append((simulator_command, os.path.join(log_dir, "longislnd.out"), os.path.join(log_dir, "longislnd.err")))

        simulator_fds = []
        for command, stdout, stderr in simulator_commands_files:
            stdout_fd = open(stdout, "w")
            stderr_fd = open(stderr, "w")
            process = subprocess.Popen(command, stdout=stdout_fd, stderr=stderr_fd, shell=True, close_fds=True)
            logger.info("Executing command {} with pid {}".format(command, process.pid))
            processes.append(process)
            simulator_fds += [stdout_fd, stderr_fd]

        monitor_processes(processes)

        for fd in simulator_fds:
            fd.close()

        processes = []

        logger.info("Read generation took %g seconds" % (time.time() - sim_ts))

        sim_t_liftover = time.time()

        # Now start lifting over the gzipped files
        if simulator != "longislnd":
            for i in xrange(nlanes):
                liftover_stdout = open(os.path.join(log_dir, "lane%d.out" % (i)), "w")
                liftover_stderr = open(os.path.join(log_dir, "liftover%d.log" % (i)), "w")
                fastq_liftover_command = "java -server %s -jar %s fastq_liftover -map %s -id %d " \
                                         "-fastq <(gunzip -c %s/simulated.lane%d.read1.fq.gz) " \
                                         "-fastq <(gunzip -c %s/simulated.lane%d.read2.fq.gz) " \
                                         "-out >(gzip -1 > %s/lane%d.read1.fq.gz) " \
                                         "-out >(gzip -1 > %s/lane%d.read2.fq.gz)" % (
                                             utils.JAVA_XMX, 
                                             VARSIMJAR, merged_map, i, out_dir, i,
                                             out_dir, i, out_dir, i,
                                             out_dir, i)
                if force_five_base_encoding:
                    fastq_liftover_command += " -force_five_base_encoding "
                if simulator == "art":
                    fastq_liftover_command += " -type art " \
                                              "-aln <(gunzip -c %s/simulated.lane%d.read1.aln.gz) " \
                                              "-aln <(gunzip -c %s/simulated.lane%d.read2.aln.gz)" % (
                                                  out_dir, i, out_dir, i)
                elif simulator == "pbsim":
                    fastq_liftover_command += " -type pbsim " \
                                              "-maf <(gunzip -c %s/simulated.lane%d.read1.maf.gz) " \
                                              "-ref %s/simulated.lane%d.ref " % (out_dir, i, out_dir, i)
                fastq_liftover_command = "bash -c \"%s\"" % (fastq_liftover_command)
                logger.info("Executing command " + fastq_liftover_command)
                subprocess.check_call(fastq_liftover_command, stdout = liftover_stdout, stderr = liftover_stderr, shell = True)
                # collect both lifted FASTQs for this lane
                for end in (1, 2):
                    fastqs.append(os.path.join(out_dir, "lane%d.read%d.fq.gz" % (i, end)))
        else:
            # liftover the read map files
            read_map_files = list(glob.glob(os.path.join(out_dir, "longislnd_sim", "*.bed")))
            merged_raw_readmap = os.path.join(out_dir, "longislnd_sim", "merged_readmap.bed")
            concatenate_files(read_map_files, merged_raw_readmap)
            read_maps = "-longislnd %s" % merged_raw_readmap 
            read_map_liftover_command = "java %s -server -jar %s longislnd_liftover " % (utils.JAVA_XMX, VARSIMJAR) + read_maps + " -map %s " % merged_map + " -out %s" % (os.path.join(out_dir, sample_id + ".truth.map"))
            read_map_liftover_stderr = open(os.path.join(log_dir, "longislnd_liftover.err"), "w")
            logger.info("Executing command " + read_map_liftover_command )
            subprocess.check_call(read_map_liftover_command, stdout = None, stderr = read_map_liftover_stderr, shell = True)

        monitor_processes(processes)

        logger.info("Liftover took %g seconds" % (time.time() - sim_t_liftover))

        sim_te = max(sim_ts + 1, time.time())
        bytes_written = sum([os.path.getsize(fastq) for fastq in fastqs])
        logger.info("Took %g seconds, %ld Mbytes written, %g MB/s" % (
            sim_te - sim_ts, bytes_written / 1024.0 / 1024.0, bytes_written / 1024.0 / 1024.0 / (sim_te - sim_ts)))

        for fifo in fifos:
            os.remove(fifo)

    if not keep_temp:
        logger.info("Cleaning up intermediate files")
        for f in tmp_files:
            os.remove(f)
    logger.info("Done! (%g hours)" % ((time.time() - t_s) / 3600.0))
Example #5
    parse_jsons(jsonfile, stats)
    print("Non-SV stats")
    print_stats(stats)
    sv_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, sv_stats, count_sv=True)
    print("SV stats")
    print_stats(sv_stats)
    all_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, all_stats, count_all=True)
    print("Overall stats")
    print_stats(all_stats)
    return tp, fn, fp, t


if __name__ == "__main__":
    utils.check_java()

    main_parser = argparse.ArgumentParser(
        description="VarSim: A high-fidelity simulation validation framework",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    main_parser.add_argument("--reference",
                             metavar="FASTA",
                             help="reference filename",
                             required=True,
                             type=str)
    main_parser.add_argument("--sdf",
                             metavar="SDF",
                             help="SDF formatted reference folder",
                             required=False,
                             type=str,
                             default='')
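The stats tables built above are nested dicts keyed by variant type and then by metric; parse_jsons and print_stats themselves are not part of this snippet. A minimal sketch of the table shape together with a stand-in printer (the metric and type names are assumptions, not VarSim's exact output):

var_types = ["SNP", "Insertion", "Deletion", "Complex"]  # assumed subset of types
metrics = ["tp", "fn", "fp", "t"]                        # assumed metric names

stats = {var_type: {metric: 0 for metric in metrics} for var_type in var_types}
stats["SNP"]["tp"] = 42  # example value; parse_jsons would fill this in from JSON reports

def print_stats_sketch(table):
    # stand-in for print_stats: one row per variant type
    for var_type in sorted(table):
        row = " ".join("%s=%d" % (m, table[var_type][m]) for m in metrics)
        print("%-10s %s" % (var_type, row))

print_stats_sketch(stats)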
Example #6
def process(args):
    '''
    Compare prediction VCFs against the truth VCF with VarSim's comparator,
    re-check the resulting false calls with RTG vcfeval, then merge and
    summarize the augmented results.
    :param args: parsed command-line arguments
    :return: None
    '''
    args.java = utils.get_java(args.java)
    utils.check_java(args.java)

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    if args.log_to_file:
        logging.basicConfig(filename=args.log_to_file,
                            filemode="w",
                            level=loglevel,
                            format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    if len(args.vcfs) > 1:
        raise NotImplementedError(
            'Only one prediction VCF is supported right now. Quick workaround: src/sort_vcf.sh vcf1 vcf2 > merged.vcf'
        )

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('working hard ...')

    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    args.out_dir = os.path.abspath(args.out_dir)
    args.reference = os.path.abspath(args.reference)
    utils.makedirs([args.out_dir])

    varsim_prefix = os.path.join(args.out_dir, 'varsim_compare_results')
    varsim_comparator = VarSimVCFComparator(
        prefix=varsim_prefix,
        true_vcf=args.true_vcf,
        reference=args.reference,
        regions=None,
        sample=args.sample,
        vcfs=args.vcfs,
        exclude_filtered=args.exclude_filtered,
        disallow_partial_fp=args.disallow_partial_fp,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfcompare_options,
        java=args.java)
    varsim_tp, varsim_fn, varsim_fp = varsim_comparator.get_tp(
    ), varsim_comparator.get_fn(), varsim_comparator.get_fp()
    varsim_tp = utils.sort_and_compress(varsim_tp)
    varsim_fn = utils.sort_and_compress(varsim_fn)
    varsim_fp = utils.sort_and_compress(varsim_fp)
    # Run RTG vcfeval next; it needs an SDF-formatted reference
    sdf = args.sdf
    if not sdf:
        LOGGER.info(
            "user did not supply SDF-formatted reference, trying to generate one..."
        )
        sdf = generate_sdf(args.reference, args.log_to_file, java=args.java)
    # Notes for vcfeval:
    #   - the sample column must be present and not empty
    #   - for a single-sample VCF, vcfeval does not check that sample names match
    #     between truth and call
    #   - for a multi-sample VCF, the sample name must be specified
    vcfeval_prefix = os.path.join(args.out_dir, 'vcfeval_compare_results')
    if os.path.exists(vcfeval_prefix):
        LOGGER.warn('{0} exists, removing ...'.format(vcfeval_prefix))
        shutil.rmtree(vcfeval_prefix)
    vcfeval_comparator = RTGVCFComparator(
        prefix=vcfeval_prefix,
        true_vcf=varsim_fn,
        reference=sdf,
        regions=None,
        sample=args.sample,
        vcfs=[varsim_fp],
        exclude_filtered=args.exclude_filtered,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfeval_options,
        java=args.java)
    vcfeval_tp, vcfeval_tp_predict = vcfeval_comparator.get_tp(
    ), vcfeval_comparator.get_tp_predict()
    augmented_tp, augmented_fn, augmented_fp, augmented_t = merge_results(
        outdir=args.out_dir,
        varsim_tp=varsim_tp,
        varsim_fn=varsim_fn,
        vcfeval_tp=vcfeval_tp,
        varsim_fp=varsim_fp,
        vcfeval_tp_predict=vcfeval_tp_predict)
    augmented_tp, augmented_fn, augmented_fp, augmented_t = summarize_results(
        os.path.join(args.out_dir, "augmented"),
        augmented_tp,
        augmented_fn,
        augmented_fp,
        augmented_t,
        var_types=args.var_types,
        sv_length=args.sv_length,
        regions=args.regions,
        bed_either=args.bed_either,
        java=args.java)

    if args.master_vcf and args.call_vcf:
        match_false(augmented_fp,
                    [args.call_vcf, args.master_vcf, augmented_fn],
                    args.out_dir, args.sample, args.log_to_file,
                    args.vcfeval_options, sdf, args.java)
        match_false(augmented_fn, [args.call_vcf], args.out_dir, args.sample,
                    args.log_to_file, args.vcfeval_options, sdf, args.java)

    LOGGER.info(
        "Variant comparison done.\nTrue positive: {0}\nFalse negative: {1}\nFalse positive: {2}\n"
        .format(augmented_tp, augmented_fn, augmented_fp))
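process() passes the --loglevel string through utils.get_loglevel before configuring logging; that helper is not shown in this snippet. A minimal stand-in with the behavior one would expect from it (an assumption, not VarSim's actual implementation):

import logging

def get_loglevel_sketch(name, default=logging.INFO):
    # hypothetical stand-in for utils.get_loglevel: map 'debug'/'INFO'/... to a logging constant
    return getattr(logging, str(name).upper(), default) if name else default

FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
logging.basicConfig(level=get_loglevel_sketch("debug"), format=FORMAT)
logging.getLogger(__name__).info("logging configured")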
Example #7
        "Path to file containing concatenation of real insertion sequences",
        required=False)
    rand_dgv_group.add_argument("--sv_dgv",
                                metavar="DGV_FILE",
                                help="DGV file containing structural variants",
                                required=False)
    rand_dgv_group.add_argument(
        "--sv_prop_het",
        metavar="FLOAT",
        help="Proportion of heterozygous structural variants",
        default=0.6,
        type=float)

    args = main_parser.parse_args()
    args.java = utils.get_java(args.java)
    check_java(args.java)

    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    makedirs([args.out_dir])

    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = get_loglevel(args.loglevel)
    if not args.log_to_stderr:
        logging.basicConfig(filename=os.path.join(args.out_dir, "varsim.log"),
                            filemode="w",
                            level=loglevel,
                            format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)
Example #8
    parse_jsons(jsonfile, stats)
    print("Non-SV stats")
    print_stats(stats)
    sv_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, sv_stats, count_sv=True)
    print("SV stats")
    print_stats(sv_stats)
    all_stats = {k: {ii: 0 for ii in metrics} for k in var_types}
    parse_jsons(jsonfile, all_stats, count_all=True)
    print("Overall stats")
    print_stats(all_stats)
    return tp, fn, fp, t


if __name__ == "__main__":
    utils.check_java()

    main_parser = argparse.ArgumentParser(description="VarSim: A high-fidelity simulation validation framework",
                                          formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    main_parser.add_argument("--reference", metavar="FASTA", help="reference filename", required=True, type=str)
    main_parser.add_argument("--sdf", metavar="SDF", help="SDF formatted reference folder", required=False, type=str, default='')
    main_parser.add_argument("--out_dir", metavar="OUTDIR", help="output folder", required=True, type=str)
    main_parser.add_argument("--vcfs", metavar="VCF", help="variant calls to be evaluated", nargs="+", default=[], required = True)
    main_parser.add_argument("--var_types", metavar="VARTYPE", help="variant types", nargs="+",
                             default=['SNP','Insertion','Complex','Deletion'],
                             choices = ['SNP', 'Deletion', 'Insertion', 'Inversion', 'TandemDup',
                                       'Complex', 'TransDup', 'TansDel', 'InterDup', 'Translocation'], required = False)
    main_parser.add_argument("--true_vcf", metavar="VCF", help="Input small variant sampling VCF, usually dbSNP", required = True)
    main_parser.add_argument("--regions", help="BED file to restrict analysis [Optional]", required = False, type=str)
    main_parser.add_argument("--sample", metavar = "SAMPLE", help="sample name", required = False, type=str)
    main_parser.add_argument("--exclude_filtered", action = 'store_true', help="only consider variants with PASS or . in FILTER column", required = False)