def call_variant(
        can_post, post_mapped_start, r_var_pos, rl_cumsum, r_to_q_poss,
        var_ref_seq, var_alt_seq, context_bases, all_paths,
        np_ref_seq=None, ref_seq=None):
    """Score a proposed variant against the reference at one read position.

    The reference and alternative sequences are each padded with up to
    ``context_bases`` flanking reference bases, scored over the same block of
    ``can_post`` with ``variants.score_seq`` and the difference is returned.

    Args:
        can_post: posterior array handed unchanged to ``variants.score_seq``.
        post_mapped_start: offset added to the block bounds before scoring.
        r_var_pos: 0-based variant position within the read's reference
            sequence.
        rl_cumsum: array indexed by query position to obtain block bounds.
        r_to_q_poss: lookup from reference position to query position.
        var_ref_seq: reference allele (string).
        var_alt_seq: alternative allele (string).
        context_bases: two-element sequence; element 0 is used when the
            alleles have equal length, element 1 otherwise.
        all_paths: flag forwarded to ``variants.score_seq``.
        np_ref_seq: integer encoded reference; used when ``ref_seq`` is None.
        ref_seq: string reference; takes precedence over ``np_ref_seq`` and is
            converted with ``mh.seq_to_int``.

    Returns:
        Reference score minus alternative score, or ``np.nan`` when the
        selected block is shorter than the longer of the two padded
        sequences.
    """
    var_ref_len = len(var_ref_seq)
    # equal length alleles use the first context size, all others the second
    var_context_bases = (
        context_bases[0] if var_ref_len == len(var_alt_seq)
        else context_bases[1])

    # number of context bases before (bb) and after (ab) the variant, clipped
    # so the context never extends past either end of the reference
    pos_bb = min(var_context_bases, r_var_pos)
    if ref_seq is None:
        pos_ab = min(
            var_context_bases,
            np_ref_seq.shape[0] - r_var_pos - var_ref_len)
        pos_ref_seq = np_ref_seq[
            r_var_pos - pos_bb:r_var_pos + pos_ab + var_ref_len]
    else:
        pos_ab = min(
            var_context_bases, len(ref_seq) - r_var_pos - var_ref_len)
        pos_ref_seq = mh.seq_to_int(
            ref_seq[r_var_pos - pos_bb:r_var_pos + pos_ab + var_ref_len])

    # alternative sequence: same flanks with the reference allele swapped out
    pos_alt_seq = np.concatenate([
        pos_ref_seq[:pos_bb],
        mh.seq_to_int(var_alt_seq),
        pos_ref_seq[pos_bb + var_ref_len:]])

    blk_start = rl_cumsum[r_to_q_poss[r_var_pos - pos_bb]]
    blk_end = rl_cumsum[r_to_q_poss[r_var_pos + pos_ab] + 1]
    if blk_end - blk_start < max(len(pos_ref_seq), len(pos_alt_seq)):
        # np.nan rather than np.NAN: the upper-case alias was removed in
        # NumPy 2.0; np.nan is valid on every NumPy version
        return np.nan

    loc_ref_score = variants.score_seq(
        can_post, pos_ref_seq, post_mapped_start + blk_start,
        post_mapped_start + blk_end, all_paths)
    loc_alt_score = variants.score_seq(
        can_post, pos_alt_seq, post_mapped_start + blk_start,
        post_mapped_start + blk_end, all_paths)
    return loc_ref_score - loc_alt_score
def _main(args):
    """Atomize the variants of ``args.in_vcf`` into ``args.out_vcf``.

    Loads the reference given by ``args.reference``, writes a VCF header whose
    contig lengths come from the reference (warning when the input VCF
    disagrees), writes one record per variant returned by
    ``fetch_read_variants`` for each contig and finally indexes the output
    file.
    """
    logging.init_logger()

    LOGGER.info("Loading reference")
    aligner = mappy.Aligner(
        str(args.reference), preset=str("map-ont"), best_n=1)

    LOGGER.info("Loading variants")
    var_data = variants.VarInfo(
        args.in_vcf, aligner, args.max_indel_size, keep_var_fp_open=True)
    contigs = var_data.variants_idx.header.contigs.values()

    LOGGER.info("Atomizing variants")
    with open(args.out_vcf, "w") as vcf_out:
        # first pass over contigs: collect reference lengths for the header
        ref_lens = {}
        for contig in contigs:
            ref_len = len(aligner.seq(contig.name))
            if ref_len != contig.length:
                LOGGER.warning(
                    (
                        "Mismatched contig lengths ({}) between "
                        "reference ({}) and input VCF ({}) using length from "
                        "reference"
                    ).format(contig.name, ref_len, contig.length)
                )
            ref_lens[contig.name] = ref_len

        contig_lines = [
            CONTIG_HEADER_LINE.format(name, length)
            for name, length in ref_lens.items()
        ]
        trailing_lines = [
            variants.CONTEXT_BASE_MI_LINE,
            COMMAND_HEADER_LINE.format(" ".join(sys.argv)),
            FIELDS_LINE,
        ]
        vcf_out.write(
            "\n".join(HEADER + contig_lines + trailing_lines) + "\n")

        # second pass: emit the variants found over each whole contig
        for contig in contigs:
            contig_seq = aligner.seq(contig.name)
            whole_contig = mapping.MAP_POS(
                chrm=contig.name,
                strand=None,
                start=0,
                end=len(contig_seq),
                q_trim_start=None,
                q_trim_end=None,
            )
            contig_vars = var_data.fetch_read_variants(
                whole_contig, mh.seq_to_int(contig_seq))
            for var in contig_vars:
                if var.has_context_base:
                    info_field = variants.HAS_CONTEXT_BASE_TAG
                else:
                    info_field = "."
                vcf_out.write(
                    RECORD_LINE.format(
                        chrm=contig.name,
                        pos=var.ref_start + 1,
                        rid=var.id,
                        ref=var.ref,
                        alts=",".join(var.alts),
                        info=info_field,
                    ))

    LOGGER.info("Indexing output variant file")
    variants.index_variants(args.out_vcf)
def _main(args):
    """Atomize the variants of ``args.in_vcf`` into ``args.out_vcf``.

    Loads the reference given by ``args.reference``, writes a VCF header using
    the contig names and lengths from the input VCF header, writes one record
    per variant returned by ``fetch_read_variants`` for each contig (warning
    when the reference and VCF contig lengths differ) and finally indexes the
    output file.
    """
    logging.init_logger()

    LOGGER.info('Loading reference')
    aligner = mapping.alignerPlus(
        str(args.reference), preset=str('map-ont'), best_n=1)
    aligner.add_ref_lens()

    LOGGER.info('Loading variants')
    var_data = variants.VarData(
        args.in_vcf, args.max_indel_size, keep_var_fp_open=True,
        aligner=aligner)
    contigs = var_data.variants_idx.header.contigs.values()

    LOGGER.info('Atomizing variants')
    with open(args.out_vcf, 'w') as vcf_out:
        contig_lines = [
            CONTIG_HEADER_LINE.format(contig.name, contig.length)
            for contig in contigs
        ]
        trailing_lines = [
            variants.CONTEXT_BASE_MI_LINE,
            COMMAND_HEADER_LINE.format(' '.join(sys.argv)),
            FIELDS_LINE,
        ]
        vcf_out.write(
            '\n'.join(HEADER + contig_lines + trailing_lines) + '\n')

        for contig in contigs:
            contig_seq = aligner.seq(contig.name)
            ref_len = len(contig_seq)
            if ref_len != contig.length:
                LOGGER.warning(
                    (
                        'Mismatched contig lengths ({}) between '
                        'reference ({}) and input VCF ({})'
                    ).format(contig.name, ref_len, contig.length)
                )

            # variants are fetched over the whole contig in one request
            whole_contig = mapping.MAP_POS(
                chrm=contig.name, strand=None, start=0, end=ref_len,
                q_trim_start=None, q_trim_end=None)
            contig_vars = var_data.fetch_read_variants(
                whole_contig, mh.seq_to_int(contig_seq))
            for var in contig_vars:
                if var.has_context_base:
                    info_field = variants.HAS_CONTEXT_BASE_TAG
                else:
                    info_field = '.'
                vcf_out.write(
                    RECORD_LINE.format(
                        chrm=contig.name,
                        pos=var.ref_start + 1,
                        rid=var.id,
                        ref=var.ref,
                        alts=','.join(var.alts),
                        info=info_field))

    LOGGER.info('Indexing output variant file')
    variants.index_variants(args.out_vcf)
def process_read(
    bc_res,
    caller_conn,
    map_thr_buf,
    do_false_ref,
    context_bases=mh.DEFAULT_VAR_CONTEXT_BASES,
    edge_buffer=mh.DEFAULT_EDGE_BUFFER,
    max_indel_len=MAX_INDEL_LEN,
    all_paths=ALL_PATHS,
    every_n=TEST_EVERY_N_LOCS,
    max_pos_per_read=MAX_POS_PER_READ,
):
    """Compute variant scores at sampled positions along one mapped read.

    Args:
        bc_res: 4-tuple ``(sig_info, called_read, rl_cumsum, can_post)``.
        caller_conn: connection passed to ``mapping.map_read``.
        map_thr_buf: buffer passed to ``call_alt_true_indel``.
        do_false_ref: when true, also score variants via
            ``call_alt_true_indel`` (recorded with a ``False`` flag).
        context_bases: context sizes forwarded to the scoring functions;
            ``max(context_bases)`` bounds the N-check window.
        edge_buffer: number of reference positions skipped at each end of the
            mapping.
        max_indel_len: largest indel size tested.
        all_paths: flag forwarded to the scoring functions.
        every_n: step between tested reference positions.
        max_pos_per_read: maximum number of positions tested.

    Returns:
        List of ``(is_ref_correct, score, var_ref_seq, var_alt_seq)`` tuples.

    Raises:
        NotImplementedError: if the mapped reference sequence is shorter than
            ``2 * edge_buffer``.
    """
    sig_info, called_read, rl_cumsum, can_post = bc_res
    r_ref_seq, r_to_q_poss, r_ref_pos, _, _ = mapping.map_read(
        caller_conn, called_read, backends.SIGNAL_DATA(*sig_info))[0]
    np_ref_seq = mh.seq_to_int(r_ref_seq)
    if np_ref_seq.shape[0] < edge_buffer * 2:
        raise NotImplementedError(
            "Mapping too short for calibration statistic computation.")

    # get mapped start in post and run len to mapped bit of output
    post_mapped_start = rl_cumsum[r_ref_pos.q_trim_start]
    mapped_rl_cumsum = (
        rl_cumsum[r_ref_pos.q_trim_start:r_ref_pos.q_trim_end + 1]
        - post_mapped_start)

    # candidate variant locations within a read
    var_poss = list(range(
        edge_buffer, np_ref_seq.shape[0] - edge_buffer, every_n
    ))[:max_pos_per_read]
    read_var_calls = []

    def record_false_ref(indel_size, r_var_pos):
        # Score one call_alt_true_indel variant (0 = single base swap,
        # positive/negative = the two indel directions). A MegaError means
        # the introduced error caused the read not to map, mapping trimmed
        # the location of interest, or the ref/alt sequence was invalid; the
        # test is skipped in that case.
        try:
            score, var_ref_seq, var_alt_seq = call_alt_true_indel(
                indel_size, r_var_pos, r_ref_seq, called_read.seq,
                map_thr_buf, context_bases, can_post, rl_cumsum, all_paths)
        except mh.MegaError:
            return
        read_var_calls.append((False, score, var_ref_seq, var_alt_seq))

    def record_true_ref(r_var_pos, var_ref_seq, var_alt_seq):
        # Score one variant against the mapped (correct) reference; skipped
        # on MegaError (invalid reference or alternative sequence).
        try:
            score = call_variant(
                can_post, post_mapped_start, r_var_pos, mapped_rl_cumsum,
                r_to_q_poss, var_ref_seq, var_alt_seq, context_bases,
                all_paths, np_ref_seq=np_ref_seq)
        except mh.MegaError:
            return
        read_var_calls.append((True, score, var_ref_seq, var_alt_seq))

    if do_false_ref:
        # first process reference false calls (need to spoof an incorrect
        # reference for mapping and signal remapping)
        for r_var_pos in var_poss:
            # single base swap first, then both indel directions per size
            record_false_ref(0, r_var_pos)
            for indel_size in range(1, max_indel_len + 1):
                record_false_ref(indel_size, r_var_pos)
                record_false_ref(-indel_size, r_var_pos)

    # now test reference correct variants
    max_context = max(context_bases)
    for r_var_pos in var_poss:
        # Clamp the window start at 0: a negative start would be interpreted
        # as an index from the end of the sequence and silently shrink or
        # empty the window, letting positions with N context through.
        context_start = max(0, r_var_pos - max_context)
        context_end = r_var_pos + max_indel_len + 1 + max_context
        if set(r_ref_seq[context_start:context_end]).difference(
                CAN_BASES_SET):
            # skip reference positions with N's in any context
            continue

        # test simple SNPs first
        ref_base = r_ref_seq[r_var_pos]
        for alt_base in CAN_BASES_SET.difference(ref_base):
            record_true_ref(r_var_pos, ref_base, alt_base)

        # then test indels; the deletion and insertion are attempted
        # independently so a failed deletion does not suppress the insertion
        # test of the same size
        for indel_size in range(1, max_indel_len + 1):
            # deletion of indel_size bases following the anchor base
            record_true_ref(
                r_var_pos,
                r_ref_seq[r_var_pos:r_var_pos + indel_size + 1],
                ref_base)
            # insertion of indel_size random bases after the anchor base
            inserted = "".join(
                choice(CAN_BASES) for _ in range(indel_size))
            record_true_ref(r_var_pos, ref_base, ref_base + inserted)

    return read_var_calls