def _main(args):
    log_suffix = ('aggregation' if args.output_suffix is None
                  else 'aggregation.' + args.output_suffix)
    logging.init_logger(args.megalodon_directory, out_suffix=log_suffix)
    LOGGER.debug('Command: """' + ' '.join(sys.argv) + '"""')

    # build modified-base aggregation settings (EM or binary-threshold method)
    if args.mod_aggregate_method == mh.MOD_EM_NAME:
        mod_agg_info = mods.AGG_INFO(mh.MOD_EM_NAME, None)
    elif args.mod_aggregate_method == mh.MOD_BIN_THRESH_NAME:
        mod_agg_info = mods.AGG_INFO(
            mh.MOD_BIN_THRESH_NAME, args.mod_binary_threshold)
    valid_read_ids = mh.parse_read_ids(args.read_ids_filename)
    aggregate.aggregate_stats(
        args.outputs, args.megalodon_directory, args.processes,
        args.write_vcf_log_probs, args.heterozygous_factors,
        variants.HAPLIOD_MODE if args.haploid else variants.DIPLOID_MODE,
        mod_agg_info, args.write_mod_log_probs, args.mod_output_formats,
        args.suppress_progress, valid_read_ids, args.output_suffix,
        args.aggregate_batch_size)

    # sort and index the aggregated variant calls so they can be queried
    if mh.VAR_NAME in args.outputs:
        LOGGER.info('Sorting output variant file')
        variant_fn = mh.add_fn_suffix(
            mh.get_megalodon_fn(args.megalodon_directory, mh.VAR_NAME),
            args.output_suffix)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        variants.sort_variants(variant_fn, sort_variant_fn)
        LOGGER.info('Indexing output variant file')
        variants.index_variants(sort_variant_fn)
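
# Hedged sketch, not Megalodon's implementation: variants.sort_variants and
# variants.index_variants are library internals. The standalone helper below
# (a hypothetical name) illustrates the equivalent sort-then-index steps with
# plain pysam, loading all records into memory for simplicity.
def _example_sort_and_index_vcf(in_vcf_fn, sorted_vcf_fn):
    import pysam

    with pysam.VariantFile(in_vcf_fn) as in_vcf:
        # order records by header contig order, then by position
        ctg_order = {name: i for i, name in enumerate(in_vcf.header.contigs)}
        recs = sorted(in_vcf, key=lambda rec: (ctg_order[rec.chrom], rec.pos))
        with pysam.VariantFile(
                sorted_vcf_fn, "w", header=in_vcf.header) as out_vcf:
            for rec in recs:
                out_vcf.write(rec)
    # bgzip-compress and tabix-index the sorted VCF so it can be queried;
    # returns the path to the compressed output
    return pysam.tabix_index(sorted_vcf_fn, preset="vcf", force=True)
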
def main():
    args = get_parser().parse_args()

    vars0_idx = pysam.VariantFile(args.diploid_called_variants)
    vars1_idx = pysam.VariantFile(args.haplotype1_variants)
    vars2_idx = pysam.VariantFile(args.haplotype2_variants)
    # probe each file with an empty fetch; pysam raises ValueError if the
    # VCF is not bgzip-compressed and tabix-indexed
    try:
        contigs0 = list(vars0_idx.header.contigs.keys())
        vars0_idx.fetch(next(iter(contigs0)), 0, 0)
        contigs1 = list(vars1_idx.header.contigs.keys())
        vars1_idx.fetch(next(iter(contigs1)), 0, 0)
        contigs2 = list(vars2_idx.header.contigs.keys())
        vars2_idx.fetch(next(iter(contigs2)), 0, 0)
    except ValueError:
        raise mh.MegaError(
            'Variants file must be indexed. Use bgzip and tabix.')

    out_vars = open(args.out_vcf, 'w')
    out_vars.write(HEADER.format('\n'.join(
        CONTIG_HEADER_LINE.format(ctg.name, ctg.length)
        for ctg in vars0_idx.header.contigs.values())))

    # walk only contigs present in all three inputs, merging the diploid
    # calls with the per-haplotype calls record by record
    for contig in set(contigs0).intersection(contigs1).intersection(contigs2):
        for curr_v0_rec, curr_v1_rec, curr_v2_rec in iter_contig_vars(
                iter(vars0_idx.fetch(contig)), iter(vars1_idx.fetch(contig)),
                iter(vars2_idx.fetch(contig))):
            if curr_v0_rec is None:
                continue
            write_var(curr_v0_rec, curr_v1_rec, curr_v2_rec, out_vars, contig)

    out_vars.close()
    variants.index_variants(args.out_vcf)
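
# Hedged illustration, not the iter_contig_vars implementation: because each
# fetch(contig) call yields records sorted by position, several indexed VCFs
# can be walked over one contig as a single position-sorted stream. The
# helper name and file arguments below are hypothetical.
def _example_merged_contig_stream(contig, *vcf_fns):
    import heapq
    import pysam

    readers = [pysam.VariantFile(fn) for fn in vcf_fns]
    try:
        # tag each record with its source index; ties on position are broken
        # by that index, so the records themselves are never compared
        streams = [((rec.pos, src, rec) for rec in rdr.fetch(contig))
                   for src, rdr in enumerate(readers)]
        for _pos, src, rec in heapq.merge(*streams):
            yield src, rec
    finally:
        for rdr in readers:
            rdr.close()
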
def _main(args): logging.init_logger() LOGGER.info("Loading reference") aligner = mappy.Aligner(str(args.reference), preset=str("map-ont"), best_n=1) LOGGER.info("Loading variants") var_data = variants.VarInfo(args.in_vcf, aligner, args.max_indel_size, keep_var_fp_open=True) contigs = var_data.variants_idx.header.contigs.values() LOGGER.info("Atomizing variants") with open(args.out_vcf, "w") as out_vars: # preprocess contigs to set contig lengths for VCF header ctg_lens = {} for ctg in contigs: chrm_seq = aligner.seq(ctg.name) if len(chrm_seq) != ctg.length: LOGGER.warning( ("Mismatched contig lengths ({}) between " + "reference ({}) and input VCF ({}) using length from " "reference").format(ctg.name, len(chrm_seq), ctg.length)) ctg_lens[ctg.name] = len(chrm_seq) out_vars.write("\n".join(HEADER + [ CONTIG_HEADER_LINE.format(ctg, ctg_len) for ctg, ctg_len in ctg_lens.items() ] + [ variants.CONTEXT_BASE_MI_LINE, COMMAND_HEADER_LINE.format(" ".join(sys.argv)), FIELDS_LINE, ]) + "\n") for ctg in contigs: chrm_seq = aligner.seq(ctg.name) map_pos = mapping.MAP_POS( chrm=ctg.name, strand=None, start=0, end=len(chrm_seq), q_trim_start=None, q_trim_end=None, ) for var in var_data.fetch_read_variants(map_pos, mh.seq_to_int(chrm_seq)): out_vars.write( RECORD_LINE.format( chrm=ctg.name, pos=var.ref_start + 1, rid=var.id, ref=var.ref, alts=",".join(var.alts), info=variants.HAS_CONTEXT_BASE_TAG if var.has_context_base else ".", )) LOGGER.info("Indexing output variant file") variants.index_variants(args.out_vcf)
def _main(args): logging.init_logger() LOGGER.info("Opening VCF files.") source_vars = pysam.VariantFile(args.diploid_called_variants) h1_vars = pysam.VariantFile(args.haplotype1_variants) h2_vars = pysam.VariantFile(args.haplotype2_variants) try: contigs0 = list(source_vars.header.contigs.keys()) source_vars.fetch(contigs0[0]) h1_vars.fetch(next(iter(h1_vars.header.contigs.keys()))) h2_vars.fetch(next(iter(h2_vars.header.contigs.keys()))) except ValueError: raise mh.MegaError( "Variant files must be indexed. Use bgzip and tabix.") LOGGER.info("Processing variants.") out_vars = open(args.out_vcf, "w") out_vars.write( HEADER.format("\n".join( (CONTIG_HEADER_LINE.format(ctg.name, ctg.length) for ctg in source_vars.header.contigs.values())))) bar = tqdm( total=len(contigs0), smoothing=0, unit=" contigs", dynamic_ncols=True, desc="Variant Processing", mininterval=0, ) for contig in contigs0: for curr_s_rec, curr_h1_rec, curr_h2_rec in tqdm( iter_contig_vars( get_contig_iter(source_vars, contig), get_contig_iter(h1_vars, contig), get_contig_iter(h2_vars, contig), contig, bar, args.force_invalid_variant_processing, ), smoothing=0, unit=" variants", dynamic_ncols=True, leave=False, desc="{} Variants".format(contig), ): write_var(curr_s_rec, curr_h1_rec, curr_h2_rec, out_vars, contig) bar.update(1) out_vars.close() variants.index_variants(args.out_vcf)
def _main(args):
    logging.init_logger()
    LOGGER.info('Loading reference')
    aligner = mapping.alignerPlus(
        str(args.reference), preset=str('map-ont'), best_n=1)
    aligner.add_ref_lens()

    LOGGER.info('Loading variants')
    var_data = variants.VarData(
        args.in_vcf, args.max_indel_size, keep_var_fp_open=True,
        aligner=aligner)
    contigs = var_data.variants_idx.header.contigs.values()

    LOGGER.info('Atomizing variants')
    with open(args.out_vcf, 'w') as out_vars:
        out_vars.write('\n'.join(
            HEADER +
            [CONTIG_HEADER_LINE.format(ctg.name, ctg.length)
             for ctg in contigs] +
            [variants.CONTEXT_BASE_MI_LINE,
             COMMAND_HEADER_LINE.format(' '.join(sys.argv)),
             FIELDS_LINE]) + '\n')
        for ctg in contigs:
            chrm_seq = aligner.seq(ctg.name)
            if len(chrm_seq) != ctg.length:
                LOGGER.warning(
                    'Mismatched contig lengths ({}) between reference ({}) '
                    'and input VCF ({})'.format(
                        ctg.name, len(chrm_seq), ctg.length))
            map_pos = mapping.MAP_POS(
                chrm=ctg.name, strand=None, start=0, end=len(chrm_seq),
                q_trim_start=None, q_trim_end=None)
            for var in var_data.fetch_read_variants(
                    map_pos, mh.seq_to_int(chrm_seq)):
                out_vars.write(RECORD_LINE.format(
                    chrm=ctg.name, pos=var.ref_start + 1, rid=var.id,
                    ref=var.ref, alts=','.join(var.alts),
                    info=variants.HAS_CONTEXT_BASE_TAG
                    if var.has_context_base else '.'))

    LOGGER.info('Indexing output variant file')
    variants.index_variants(args.out_vcf)