type=str) longislnd_group = main_parser.add_argument_group("LongISLND options") longislnd_group.add_argument("--longislnd_options", help="LongISLND options", default="") args = main_parser.parse_args() args.java = utils.get_java(args.java) utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem makedirs([args.log_dir, args.out_dir]) # Setup logging FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' loglevel = get_loglevel(args.loglevel) if not args.log_to_stderr: logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"), filemode="w", level=loglevel, format=FORMAT) else: logging.basicConfig(level=loglevel, format=FORMAT) simulator = None if args.disable_sim else args.simulator simulator_opts = "" if args.simulator == "dwgsim": simulator_opts = "-e {1},{2} -E {1},{2} -d {3} -s {4} -1 {5} -2 {5} {6}".format( args.dwgsim_start_e, args.dwgsim_end_e, args.mean_fragment_size, args.sd_fragment_size, args.read_length, args.dwgsim_options) elif args.simulator == "art":
art_group.add_argument("--art_options", help="ART command-line options", default="") pbsim_group = main_parser.add_argument_group("PBSIM options") pbsim_group.add_argument("--model_qc", metavar="model_qc", help="PBSIM QC model", default=None, type=str) longislnd_group = main_parser.add_argument_group("LongISLND options") longislnd_group.add_argument("--longislnd_options", help="LongISLND options", default="") args = main_parser.parse_args() utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem makedirs([args.log_dir, args.out_dir]) # Setup logging FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' loglevel = get_loglevel(args.loglevel) if not args.log_to_stderr: logging.basicConfig(filename=os.path.join(args.log_dir, "varsim.log"), filemode="w", level=loglevel, format=FORMAT) else: logging.basicConfig(level=loglevel, format=FORMAT) simulator = None if args.disable_sim else args.simulator simulator_opts = "" if args.simulator == "dwgsim": simulator_opts = "-e {1},{2} -E {1},{2} -d {3} -s {4} -1 {5} -2 {5} {6}".format(args.dwgsim_start_e, args.dwgsim_end_e, args.mean_fragment_size, args.sd_fragment_size, args.read_length, args.dwgsim_options) elif args.simulator == "art": profile_opts = "-1 {} -2 {}".format(args.profile_1, args.profile_2) if (args.profile_1 and args.profile_2) else "" simulator_opts = "-p -l {} -m {} -s {} {} {}".format(args.read_length, args.mean_fragment_size, args.sd_fragment_size, profile_opts, args.art_options) elif args.simulator == "longislnd": simulator_opts = args.longislnd_options elif args.simulator == "pbsim":
def process(args):
    """Combine the input VCFs named in ``args`` into one output VCF.

    Each input is first brought to a bgzipped + indexed state:

    * ``*.gz`` with a ``.tbi`` next to it is used as is,
    * ``*.gz`` without an index is indexed via ``utils.index_vcf_gz``,
    * anything else goes through ``utils.sort_and_compress``.

    With exactly one input the prepared file (and its index) is copied to
    ``<output_prefix>.vcf.gz``; otherwise the inputs are handed to
    ``utils.combine_vcf`` writing to ``<output_prefix>.vcf``.

    :param args: parsed command-line namespace; reads ``loglevel``, ``mode``,
        ``vcfs``, ``output_prefix`` and ``overwrite``.
    :return: None
    :raises ValueError: if ``args.mode`` is not one of ``first_duplicate``,
        ``all_duplicate`` or ``no_duplicate``.
    """
    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    logging.basicConfig(level=loglevel, format=FORMAT)

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('running {}'.format(' '.join(sys.argv)))

    if args.mode == 'first_duplicate':
        dup_mode = utils.COMBINE_KEEP_FIRST_DUPLICATE
    elif args.mode == 'all_duplicate':
        dup_mode = utils.COMBINE_KEEP_ALL_DUPLICATE
    elif args.mode == 'no_duplicate':
        dup_mode = utils.COMBINE_KEEP_NO_DUPLICATE
    else:
        raise ValueError(
            'unknown duplicate handling mode: {!r}'.format(args.mode))

    # Input scenarios handled below:
    #   vcf
    #   vcf.gz
    #   vcf.gz + vcf.gz.tbi
    # Build a fresh list so the caller's args.vcfs is not modified in place.
    input_vcfs = []
    for current_vcf in args.vcfs:
        if current_vcf.endswith(".gz"):
            if not os.path.isfile(current_vcf + ".tbi"):
                LOGGER.info('indexing {}'.format(current_vcf))
                utils.index_vcf_gz(current_vcf)
            input_vcfs.append(current_vcf)
        else:
            LOGGER.info('sort and index {}'.format(current_vcf))
            input_vcfs.append(
                utils.sort_and_compress(current_vcf, mode=2,
                                        overwrite=args.overwrite))

    output_vcf = args.output_prefix + '.vcf'
    if len(input_vcfs) == 1:
        # Nothing to merge: the prepared input becomes the output.
        output_vcf = output_vcf + '.gz'
        output_vcf_idx = output_vcf + '.tbi'
        if (not args.overwrite) and \
                (os.path.isfile(output_vcf) or os.path.isfile(output_vcf_idx)):
            LOGGER.warning(
                '{} or {} exists, use --overwrite otherwise do nothing.'.
                format(output_vcf, output_vcf_idx))
        else:
            shutil.copyfile(input_vcfs[0], output_vcf)
            # The index must come from the input's .tbi; copying the VCF
            # itself onto the .tbi path produces an unusable index.
            input_vcf_idx = input_vcfs[0] + '.tbi'
            if os.path.isfile(input_vcf_idx):
                shutil.copyfile(input_vcf_idx, output_vcf_idx)
            else:
                # NOTE(review): assumes utils.index_vcf_gz writes
                # <path>.tbi next to a bgzipped VCF -- confirm.
                utils.index_vcf_gz(output_vcf)
    else:
        output_vcf = utils.combine_vcf(output_vcf, input_vcfs,
                                       duplicate_handling_mode=dup_mode)
    LOGGER.info('{} done'.format(output_vcf))
    return
def process(args):
    """Compare one prediction VCF against a truth VCF and log the summary.

    Steps, in order:

    1. configure logging (to ``args.log_to_file`` when set, else the
       ``logging.basicConfig`` default handler);
    2. run ``VarSimVCFComparator`` on ``args.true_vcf`` vs ``args.vcfs`` and
       pass its TP/FN/FP outputs through ``utils.sort_and_compress``;
    3. run ``RTGVCFComparator`` using VarSim's FN as the truth set and
       VarSim's FP as the call set;
    4. combine both results with ``merge_results`` and ``summarize_results``
       and log the resulting TP/FN/FP values.

    :param args: parsed command-line namespace; reads ``loglevel``,
        ``log_to_file``, ``vcfs``, ``java_max_mem``, ``out_dir``,
        ``reference``, ``true_vcf``, ``sample``, ``exclude_filtered``,
        ``disallow_partial_fp``, ``match_geno``, ``vcfcompare_options``,
        ``sdf``, ``vcfeval_options``, ``var_types``, ``sv_length``,
        ``regions`` and ``bed_either``. ``out_dir`` and ``reference`` are
        rewritten to absolute paths on the namespace.
    :return: None
    :raises NotImplementedError: if more than one prediction VCF is given.
    """
    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    if args.log_to_file:
        logging.basicConfig(filename=args.log_to_file, filemode="w",
                            level=loglevel, format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    if len(args.vcfs) > 1:
        raise NotImplementedError(
            'right now only support one prediction VCF. Quick workaround: src/sort_vcf.sh vcf1 vcf2 > merged.vcf'
        )

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('working hard ...')

    # Appends the user-supplied memory value to the module-level setting.
    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    args.out_dir = os.path.abspath(args.out_dir)
    args.reference = os.path.abspath(args.reference)
    utils.makedirs([args.out_dir])

    # First pass: VarSim's own comparator.
    varsim_prefix = os.path.join(args.out_dir, 'varsim_compare_results')
    varsim_comparator = VarSimVCFComparator(
        prefix=varsim_prefix,
        true_vcf=args.true_vcf,
        reference=args.reference,
        regions=None,
        sample=args.sample,
        vcfs=args.vcfs,
        exclude_filtered=args.exclude_filtered,
        disallow_partial_fp=args.disallow_partial_fp,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfcompare_options)
    varsim_tp = varsim_comparator.get_tp()
    varsim_fn = varsim_comparator.get_fn()
    varsim_fp = varsim_comparator.get_fp()
    varsim_tp = utils.sort_and_compress(varsim_tp)
    varsim_fn = utils.sort_and_compress(varsim_fn)
    varsim_fp = utils.sort_and_compress(varsim_fp)

    # run vcfeval
    sdf = args.sdf
    if not sdf:
        LOGGER.info(
            "user did not supply SDF-formatted reference, trying to generate one..."
        )
        sdf = generate_sdf(args.reference, args.log_to_file)

    # for vcfeval
    # sample column must be present, and not empty
    # if single-sample vcf, vcfeval doesn't check if samples match in truth
    # and call
    # in multi-sample vcf, sample name must be specified
    # right now
    vcfeval_prefix = os.path.join(args.out_dir, 'vcfeval_compare_results')
    if os.path.exists(vcfeval_prefix):
        LOGGER.warning('{0} exists, removing ...'.format(vcfeval_prefix))
        shutil.rmtree(vcfeval_prefix)

    # Second pass: only what VarSim could not match (its FN as truth, its FP
    # as calls) is handed to vcfeval.
    vcfeval_comparator = RTGVCFComparator(
        prefix=vcfeval_prefix,
        true_vcf=varsim_fn,
        reference=sdf,
        regions=None,
        sample=args.sample,
        vcfs=[varsim_fp],
        exclude_filtered=args.exclude_filtered,
        match_geno=args.match_geno,
        log_to_file=args.log_to_file,
        opts=args.vcfeval_options)
    vcfeval_tp = vcfeval_comparator.get_tp()
    vcfeval_tp_predict = vcfeval_comparator.get_tp_predict()

    augmented_tp, augmented_fn, augmented_fp, augmented_t = merge_results(
        outdir=args.out_dir,
        varsim_tp=varsim_tp,
        varsim_fn=varsim_fn,
        vcfeval_tp=vcfeval_tp,
        varsim_fp=varsim_fp,
        vcfeval_tp_predict=vcfeval_tp_predict)
    augmented_tp, augmented_fn, augmented_fp, augmented_t = summarize_results(
        os.path.join(args.out_dir, "augmented"),
        augmented_tp,
        augmented_fn,
        augmented_fp,
        augmented_t,
        var_types=args.var_types,
        sv_length=args.sv_length,
        regions=args.regions,
        bed_either=args.bed_either)

    LOGGER.info(
        "Variant comparison done.\nTrue positive: {0}\nFalse negative: {1}\nFalse positive: {2}\n"
        .format(augmented_tp, augmented_fn, augmented_fp))
def process(args):
    """Compare one prediction VCF against a truth VCF and log the summary.

    The comparison is done in two passes: ``VarSimVCFComparator`` runs first
    on ``args.true_vcf`` vs ``args.vcfs``; its false negatives (as truth) and
    false positives (as calls) are then given to ``RTGVCFComparator``. The
    two result sets are combined by ``merge_results`` and
    ``summarize_results``, and the final TP/FN/FP values are logged.

    :param args: parsed command-line namespace; reads ``loglevel``,
        ``log_to_file``, ``vcfs``, ``java_max_mem``, ``out_dir``,
        ``reference``, ``true_vcf``, ``sample``, ``exclude_filtered``,
        ``disallow_partial_fp``, ``match_geno``, ``vcfcompare_options``,
        ``sdf``, ``vcfeval_options``, ``var_types``, ``sv_length``,
        ``regions`` and ``bed_either``. ``out_dir`` and ``reference`` are
        replaced by their absolute paths on the namespace.
    :return: None
    :raises NotImplementedError: if more than one prediction VCF is given.
    """
    # Setup logging
    FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
    loglevel = utils.get_loglevel(args.loglevel)
    if args.log_to_file:
        logging.basicConfig(filename=args.log_to_file, filemode="w", level=loglevel, format=FORMAT)
    else:
        logging.basicConfig(level=loglevel, format=FORMAT)

    if len(args.vcfs) > 1:
        raise NotImplementedError('right now only support one prediction VCF. Quick workaround: src/sort_vcf.sh vcf1 vcf2 > merged.vcf')

    global LOGGER
    LOGGER = logging.getLogger(__name__)
    LOGGER.info('working hard ...')

    # Appends the user-supplied memory value to the module-level setting.
    utils.JAVA_XMX = utils.JAVA_XMX + args.java_max_mem
    args.out_dir = os.path.abspath(args.out_dir)
    args.reference = os.path.abspath(args.reference)
    utils.makedirs([args.out_dir])

    # Pass 1: VarSim's own comparator.
    varsim_prefix = os.path.join(args.out_dir, 'varsim_compare_results')
    varsim_comparator = VarSimVCFComparator(prefix=varsim_prefix,
                                            true_vcf=args.true_vcf,
                                            reference=args.reference,
                                            regions=None,
                                            sample=args.sample,
                                            vcfs=args.vcfs,
                                            exclude_filtered=args.exclude_filtered,
                                            disallow_partial_fp=args.disallow_partial_fp,
                                            match_geno=args.match_geno,
                                            log_to_file=args.log_to_file,
                                            opts=args.vcfcompare_options)
    varsim_tp, varsim_fn, varsim_fp = varsim_comparator.get_tp(), varsim_comparator.get_fn(), varsim_comparator.get_fp()
    varsim_tp = utils.sort_and_compress(varsim_tp)
    varsim_fn = utils.sort_and_compress(varsim_fn)
    varsim_fp = utils.sort_and_compress(varsim_fp)

    # run vcfeval
    sdf = args.sdf
    if not sdf:
        LOGGER.info("user did not supply SDF-formatted reference, trying to generate one...")
        sdf = generate_sdf(args.reference, args.log_to_file)

    # for vcfeval
    # sample column must be present, and not empty
    # if single-sample vcf, vcfeval doesn't check if samples match in truth
    # and call
    # in multi-sample vcf, sample name must be specified
    # right now
    vcfeval_prefix = os.path.join(args.out_dir, 'vcfeval_compare_results')
    if os.path.exists(vcfeval_prefix):
        LOGGER.warning('{0} exists, removing ...'.format(vcfeval_prefix))
        shutil.rmtree(vcfeval_prefix)

    # Pass 2: VarSim's FN act as truth and its FP as calls for vcfeval.
    vcfeval_comparator = RTGVCFComparator(prefix=vcfeval_prefix,
                                          true_vcf=varsim_fn,
                                          reference=sdf,
                                          regions=None,
                                          sample=args.sample,
                                          vcfs=[varsim_fp],
                                          exclude_filtered=args.exclude_filtered,
                                          match_geno=args.match_geno,
                                          log_to_file=args.log_to_file,
                                          opts=args.vcfeval_options)
    vcfeval_tp, vcfeval_tp_predict = vcfeval_comparator.get_tp(), vcfeval_comparator.get_tp_predict()

    augmented_tp, augmented_fn, augmented_fp, augmented_t = merge_results(
        outdir=args.out_dir,
        varsim_tp=varsim_tp, varsim_fn=varsim_fn,
        vcfeval_tp=vcfeval_tp, varsim_fp=varsim_fp,
        vcfeval_tp_predict=vcfeval_tp_predict)
    augmented_tp, augmented_fn, augmented_fp, augmented_t = summarize_results(
        os.path.join(args.out_dir, "augmented"),
        augmented_tp, augmented_fn, augmented_fp, augmented_t,
        var_types=args.var_types, sv_length=args.sv_length,
        regions=args.regions, bed_either=args.bed_either)

    LOGGER.info("Variant comparison done.\nTrue positive: {0}\nFalse negative: {1}\nFalse positive: {2}\n".
                format(augmented_tp, augmented_fn, augmented_fp))