def _main(args):
    """Validate coverage thresholds and compute per-site mod thresholds.

    Clamps sub-1 coverage minimums, verifies contig naming is consistent
    between the ground truth BED and the per-read mod database, then runs
    batch processing.
    """
    logging.init_logger(log_fn=args.log_filename)

    # Coverage minimums below one are meaningless; clamp with a warning.
    if args.ground_truth_cov_min < 1:
        LOGGER.warning(
            "--ground-truth-cov-min must be 1 or greater. Setting to 1."
        )
        args.ground_truth_cov_min = 1
    if args.nanopore_cov_min < 1:
        LOGGER.warning(
            "--nanopore-cov-min must be 1 or greater. Setting to 1."
        )
        args.nanopore_cov_min = 1

    LOGGER.info("Checking for consistent contig names")
    per_read_db_fn = mh.get_megalodon_fn(
        args.megalodon_results_dir, mh.PR_MOD_NAME
    )
    check_matching_attrs(
        args.ground_truth_bed,
        args.strand_offset,
        per_read_db_fn,
        args.mod_bases,
    )

    LOGGER.info("Processing batches")
    process_all_batches(
        args.processes,
        args.batch_size,
        args.ground_truth_bed,
        args.out_low_coverage_sites,
        args.out_per_site_mod_thresholds,
        per_read_db_fn,
        args.strand_offset,
        args.ground_truth_cov_min,
        args.nanopore_cov_min,
        args.mod_bases,
        args.valid_sites,
    )
def _main(args):
    """Compare two bedMethyl sample sets and emit heatmap/coverage plots.

    Text output goes to stdout unless --out-filename is supplied; plots go
    to the PDF at --out-pdf.
    """
    logging.init_logger()
    pdf_fp = PdfPages(args.out_pdf)
    # Only open a file when a name was given; stdout otherwise.
    if args.out_filename is None:
        out_fp = sys.stdout
    else:
        out_fp = open(args.out_filename, 'w')

    (cov1, mod_cov1, all_cov1, cov2, mod_cov2, all_cov2,
     valid_pos) = parse_inputs(
        args.sample1_bed_methyl_files, args.sample2_bed_methyl_files,
        args.strand_offset, args.sample_names, args.valid_positions, out_fp)
    mod_pct1, mod_pct2, valid_cov1, valid_cov2 = compute_filt_mod_pct(
        cov1, mod_cov1, cov2, mod_cov2, valid_pos, args.coverage_threshold,
        args.sample_names, out_fp)

    plot_hm(mod_pct1, mod_pct2, args.heatmap_num_bins, args.sample_names,
            out_fp, pdf_fp)
    plot_cov(all_cov1, all_cov2, valid_cov1, valid_cov2, args.sample_names,
             pdf_fp)

    pdf_fp.close()
    # Never close the process-wide stdout stream.
    if out_fp is not sys.stdout:
        out_fp.close()
def _main(args):
    """Compute per-read modified-base LLR scores from a mapped signal file.

    Prepares the argument namespace for the guppy backend, loads the model,
    computes diff scores over motif sites, logs a per-mod summary, and
    writes the score data out.
    """
    logging.init_logger(log_fn=args.log_filename, quiet=args.quiet)

    # Attributes required for loading guppy but not exposed as options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting "
            "guppy logs."
        )
    args = add_trim_guppy_none(args)
    args.outputs = [mh.PR_MOD_NAME]

    # Keep edge_buffer >= context_bases to simplify processing.
    if args.edge_buffer < args.mod_context_bases:
        LOGGER.warning(
            "[--edge-buffer] less than [--mod-context-bases]. Setting "
            "[--edge-buffer] to value from [--mod-context-bases]"
        )
        args.edge_buffer = args.mod_context_bases

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        check_map_sig_alphabet(model_info, args.mapped_signal_file)
        motifs = parse_motifs(args.motif, model_info, args.modified_bases_set)
        can_labs, mod_labs = extract_label_conversions(model_info)
        can_post_indices = model_info.can_indices.astype(np.uintp)
        all_mod_llrs, all_can_llrs = compute_diff_scores(
            args.mapped_signal_file,
            model_info,
            args.mod_context_bases,
            args.edge_buffer,
            args.num_reads,
            motifs,
            can_labs,
            mod_labs,
            can_post_indices,
        )

        # Count modified/canonical observations for each mod base seen in
        # either score set.
        mod_summary = []
        for mod in set(all_mod_llrs).union(all_can_llrs):
            n_mod = len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0
            n_can = len(all_can_llrs[mod]) if mod in all_can_llrs else 0
            mod_summary.append((mod, n_mod, n_can))
        LOGGER.info(
            "Data summary:\n\tmod\tmod_N\tcan_N\n"
            + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary)
        )
        output_mods_data(all_mod_llrs, all_can_llrs, args.out_filename)
def _main(args):
    """Merge a diploid called-variant VCF with two haplotype variant files.

    Iterates the three indexed VCFs in lockstep per contig, writing one
    output record per variant via write_var, then indexes the output VCF.
    """
    logging.init_logger()
    LOGGER.info("Opening VCF files.")
    source_vars = pysam.VariantFile(args.diploid_called_variants)
    h1_vars = pysam.VariantFile(args.haplotype1_variants)
    h2_vars = pysam.VariantFile(args.haplotype2_variants)
    try:
        # fetch() requires an index; probe one contig from each file so a
        # missing index fails fast with an actionable message.
        contigs0 = list(source_vars.header.contigs.keys())
        source_vars.fetch(contigs0[0])
        h1_vars.fetch(next(iter(h1_vars.header.contigs.keys())))
        h2_vars.fetch(next(iter(h2_vars.header.contigs.keys())))
    except ValueError:
        raise mh.MegaError(
            "Variant files must be indexed. Use bgzip and tabix.")

    LOGGER.info("Processing variants.")
    out_vars = open(args.out_vcf, "w")
    # Contig header lines are derived from the source (diploid) VCF header.
    out_vars.write(
        HEADER.format("\n".join(
            (CONTIG_HEADER_LINE.format(ctg.name, ctg.length)
             for ctg in source_vars.header.contigs.values()))))
    # Outer bar tracks contigs; a nested, transient bar tracks variants.
    bar = tqdm(
        total=len(contigs0),
        smoothing=0,
        unit=" contigs",
        dynamic_ncols=True,
        desc="Variant Processing",
        mininterval=0,
    )
    for contig in contigs0:
        for curr_s_rec, curr_h1_rec, curr_h2_rec in tqdm(
            iter_contig_vars(
                get_contig_iter(source_vars, contig),
                get_contig_iter(h1_vars, contig),
                get_contig_iter(h2_vars, contig),
                contig,
                bar,
                args.force_invalid_variant_processing,
            ),
            smoothing=0,
            unit=" variants",
            dynamic_ncols=True,
            leave=False,
            desc="{} Variants".format(contig),
        ):
            write_var(curr_s_rec, curr_h1_rec, curr_h2_rec, out_vars, contig)
        # One tick per completed contig on the outer bar.
        bar.update(1)
    out_vars.close()
    variants.index_variants(args.out_vcf)
def _main(args):
    """Extract per-read modified-base statistics for calibration.

    Requires either --ground-truth-data (mod/canonical site lists) or
    --control-megalodon-results-dir (canonical control sample). Exits with
    a non-zero status when neither is provided.
    """
    logging.init_logger(quiet=args.quiet)
    if (args.ground_truth_data is None
            and args.control_megalodon_results_dir is None):
        LOGGER.error(
            "Must provide either --control-megalodon-results-dir or "
            "--ground-truth-data")
        # Bug fix: bare sys.exit() exits with status 0; report failure to
        # the shell instead.
        sys.exit(1)

    db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
    if args.ground_truth_data is not None:
        LOGGER.info("Parsing ground truth data")
        gt_mod_pos, gt_can_pos = mh.parse_ground_truth_file(
            args.ground_truth_data, include_strand=args.strand_specific_sites)
        LOGGER.info(
            "Loaded ground truth data with {} modified sites and {} "
            "canonical sites.".format(len(gt_mod_pos), len(gt_can_pos)))
        LOGGER.info(
            "Reading ground truth modified base statistics from database.")
        all_mod_llrs, all_can_llrs = mods.extract_stats_at_valid_sites(
            db_fn,
            [gt_mod_pos, gt_can_pos],
            include_strand=args.strand_specific_sites,
        )
    else:
        # Bug fix: these branches read the main-sample and control-sample
        # databases, not ground truth data; messages previously mislabeled
        # both as "ground truth".
        LOGGER.info("Reading modified base statistics from database")
        all_mod_llrs = mods.extract_all_stats(db_fn)
        LOGGER.info(
            "Reading modified base statistics from canonical sample database")
        all_can_llrs = mods.extract_all_stats(
            mh.get_megalodon_fn(
                args.control_megalodon_results_dir, mh.PR_MOD_NAME))

    # Per-mod counts of modified and canonical observations for the log.
    mod_summary = [(
        mod,
        len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
        len(all_can_llrs[mod]) if mod in all_can_llrs else 0,
    ) for mod in set(all_mod_llrs).union(all_can_llrs)]
    LOGGER.info(
        "Data summary:\n\tmod\tmod_N\tcan_N\n"
        + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary))
    output_mods_data(
        all_mod_llrs,
        all_can_llrs,
        args.modified_bases_set,
        args.exclude_modified_bases,
        args.out_filename,
    )
def _main(args):
    """Merge multiple modified-base calibration files into one .npz file.

    Files are processed from last to first so that per-mod-base calibration
    tables from earlier-listed files overwrite those from later-listed ones.

    Raises:
        mh.MegaError: when input files disagree on stratification type or
            the number of smoothing values.
    """
    logging.init_logger()
    mh.prep_out_fn(args.out_filename, args.overwrite)

    LOGGER.info("Processing {}".format(
        args.modified_base_calibration_files[-1]))
    calib_data = np.load(args.modified_base_calibration_files[-1])
    stratify_type = str(calib_data[calibration.MOD_STRAT_TYPE_TXT])
    # Bug fix: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int behaves identically here.
    num_calib_vals = int(calib_data[calibration.SMOOTH_NVALS_TXT])
    mod_calibs = {}
    for mod_base in calib_data[calibration.MOD_BASES_TXT]:
        LOGGER.info("\tUsing {} calibration".format(mod_base))
        mod_calibs[mod_base] = (
            calib_data[mod_base + calibration.LLR_RANGE_SUFFIX].copy(),
            calib_data[mod_base + calibration.CALIB_TABLE_SUFFIX].copy(),
        )
    for mod_calib_fn in args.modified_base_calibration_files[-2::-1]:
        LOGGER.info("Processing {}".format(mod_calib_fn))
        calib_data = np.load(mod_calib_fn)
        # Robustness fix: asserts are stripped under `python -O`; raise
        # explicit errors so inconsistent inputs always fail clearly.
        if stratify_type != str(calib_data[calibration.MOD_STRAT_TYPE_TXT]):
            raise mh.MegaError(
                "Stratification type does not match in {}".format(
                    mod_calib_fn))
        if num_calib_vals != int(calib_data[calibration.SMOOTH_NVALS_TXT]):
            raise mh.MegaError(
                "Number of smoothing values does not match in {}".format(
                    mod_calib_fn))
        for mod_base in calib_data[calibration.MOD_BASES_TXT]:
            # overwrite calibration data with files passed earlier
            if mod_base in mod_calibs:
                LOGGER.info("\tOverwriting {} calibration".format(mod_base))
            else:
                LOGGER.info("\tUsing {} calibration".format(mod_base))
            mod_calibs[mod_base] = (
                calib_data[mod_base + calibration.LLR_RANGE_SUFFIX].copy(),
                calib_data[mod_base + calibration.CALIB_TABLE_SUFFIX].copy(),
            )

    save_kwargs = {}
    for mod_base, (mod_llr_range, mod_calib) in mod_calibs.items():
        save_kwargs[mod_base + calibration.LLR_RANGE_SUFFIX] = mod_llr_range
        save_kwargs[mod_base + calibration.CALIB_TABLE_SUFFIX] = mod_calib

    # save calibration table for reading into mod calibration table
    LOGGER.info("Saving calibrations to file.")
    np.savez(
        args.out_filename,
        stratify_type=stratify_type,
        smooth_nvals=num_calib_vals,
        mod_bases=list(mod_calibs.keys()),
        **save_kwargs,
    )