def variant_map_from(var_fn, tree, is_tree_empty):
    Y = {}
    truth_alt_dict = {}
    miss_variant_set = set()

    if var_fn is None:
        return Y, miss_variant_set, truth_alt_dict

    f = subprocess_popen(shlex.split("gzip -fdc %s" % (var_fn)))
    for row in f.stdout:
        if row[0] == "#":
            continue
        columns = row.strip().split()
        ctg_name, position_str, ref_base, alt_base, genotype1, genotype2 = columns
        key = ctg_name + ":" + position_str

        # a genotype of -1 marks a truth variant lost after read downsampling;
        # record it so downstream steps can skip the position
        if genotype1 == '-1' or genotype2 == '-1':
            miss_variant_set.add(key)
            continue
        if not (is_tree_empty or is_region_in(tree, ctg_name, int(position_str))):
            continue

        Y[key] = output_labels_from_vcf_columns(columns)
        # keep the unified truth ref/alt bases for later read-support checking
        ref_base_list, alt_base_list = decode_alt(ref_base, alt_base)
        truth_alt_dict[int(position_str)] = (ref_base_list, alt_base_list)

    f.stdout.close()
    f.wait()

    return Y, miss_variant_set, truth_alt_dict
def variant_map_from(var_fn, tree, is_tree_empty):
    Y = {}

    if var_fn is None:
        return Y

    f = subprocess_popen(shlex.split("gzip -fdc %s" % (var_fn)))
    for row in f.stdout:
        columns = row.split()
        ctg_name, position_str = columns[0], columns[1]
        if not (is_tree_empty or is_region_in(tree, ctg_name, int(position_str))):
            continue
        key = ctg_name + ":" + position_str
        Y[key] = output_labels_from_vcf_columns(columns)

    f.stdout.close()
    f.wait()

    return Y
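# --- Hedged usage sketch (illustrative only; not part of the original module).
# Both variant_map_from() variants above key their outputs as "contig:position",
# parsed from whitespace-separated truth-variant rows. The row below is made up.
def _demo_variant_key():
    row = "chr20\t999999\tA\tG\t1\t1"  # hypothetical record: contig, pos, ref, alt, GT1, GT2
    ctg_name, position_str = row.split()[:2]
    return ctg_name + ":" + position_str  # -> "chr20:999999"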
def Run(args):
    tree = bed_tree_from(bed_file_path=args.bed_fn)

    logging.info("Counting the number of Truth Variants in %s ..." % args.tensor_var_fn)
    v = 0
    d = {}
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (args.tensor_var_fn)))
    for row in f.stdout:
        row = row.strip().split()
        ctgName = row[0]
        pos = int(row[1])
        key = "-".join([ctgName, str(pos)])
        v += 1
        d[key] = 1
    f.stdout.close()
    f.wait()
    logging.info("%d Truth Variants" % v)

    # pick args.amp non-variants per truth variant
    t = v * args.amp
    logging.info("%d non-variants to be picked" % t)

    logging.info("Counting the number of usable non-variants in %s ..." % args.tensor_can_fn)
    c = 0
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (args.tensor_can_fn)))
    for row in f.stdout:
        row = row.strip().split()
        ctgName = row[0]
        pos = int(row[1])
        if args.bed_fn is not None:
            if not is_region_in(tree, ctgName, pos):
                continue
        key = "-".join([ctgName, str(pos)])
        if key in d:
            continue
        c += 1
    f.stdout.close()
    f.wait()
    logging.info("%d usable non-variants" % c)

    # keep each non-variant with probability r = min(t / c, 1)
    r = float(t) / c
    r = r if r <= 1 else 1
    logging.info("%.2f of all non-variants are selected" % r)

    o1 = 0
    o2 = 0
    output_fpo = open(args.output_fn, "wb")
    output_fh = subprocess_popen(shlex.split("gzip -c"), stdin=PIPE, stdout=output_fpo)

    # copy all truth variants to the output
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (args.tensor_var_fn)))
    for row in f.stdout:
        row = row.strip()
        output_fh.stdin.write(row)
        output_fh.stdin.write("\n")
        o1 += 1
    f.stdout.close()
    f.wait()

    # randomly subsample the usable non-variants
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (args.tensor_can_fn)))
    for row in f.stdout:
        rawRow = row.strip()
        row = rawRow.split()
        ctgName = row[0]
        pos = int(row[1])
        if args.bed_fn is not None:
            if not is_region_in(tree, ctgName, pos):
                continue
        key = "-".join([ctgName, str(pos)])
        if key in d:
            continue
        if random() < r:
            output_fh.stdin.write(rawRow)
            output_fh.stdin.write("\n")
            o2 += 1
    f.stdout.close()
    f.wait()

    output_fh.stdin.close()
    output_fh.wait()
    output_fpo.close()

    logging.info("%d/%d Truth Variants/Non-variants outputted" % (o1, o2))
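# --- Hedged sketch of the subsampling math in Run() above (illustrative only;
# all numbers are made up). Non-variants are kept with probability
# r = min(t / c, 1), so the expected number emitted is r * c, i.e. about
# t = truth_variants * args.amp.
def _demo_subsample_ratio(truth_variants=10000, amp=2, usable_non_variants=1000000):
    t = truth_variants * amp                       # non-variants wanted
    r = min(float(t) / usable_non_variants, 1.0)   # keep probability
    expected_output = r * usable_non_variants
    return r, expected_output                      # -> (0.02, 20000.0)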
def Run(args):
    basedir = os.path.dirname(__file__)
    callVarBamBin = basedir + "/../clair.py callVarBam"
    pypyBin = executable_command_string_from(args.pypy, exit_on_not_found=True)
    samtoolsBin = executable_command_string_from(args.samtools, exit_on_not_found=True)

    chkpnt_fn = file_path_from(args.chkpnt_fn, suffix=".meta", exit_on_not_found=True)
    bam_fn = file_path_from(args.bam_fn, exit_on_not_found=True)
    ref_fn = file_path_from(args.ref_fn, exit_on_not_found=True)
    fai_fn = file_path_from(args.ref_fn + ".fai", exit_on_not_found=True)
    bed_fn = file_path_from(args.bed_fn)
    vcf_fn = file_path_from(args.vcf_fn)

    output_prefix = args.output_prefix
    af_threshold = args.threshold

    tree = bed_tree_from(bed_file_path=bed_fn)

    minCoverage = args.minCoverage
    sampleName = args.sampleName
    delay = args.delay
    threads = args.tensorflowThreads
    qual = args.qual
    is_include_all_contigs = args.includingAllContigs
    region_chunk_size = args.refChunkSize

    stop_consider_left_edge = command_option_from(args.stop_consider_left_edge, 'stop_consider_left_edge')
    log_path = command_option_from(args.log_path, 'log_path', option_value=args.log_path)
    pysam_for_all_indel_bases = command_option_from(args.pysam_for_all_indel_bases, 'pysam_for_all_indel_bases')
    haploid_mode = command_option_from(args.haploid, 'haploid')
    output_for_ensemble = command_option_from(args.output_for_ensemble, 'output_for_ensemble')
    debug = command_option_from(args.debug, 'debug')
    qual = command_option_from(args.qual, 'qual', option_value=args.qual)
    fast_plotting = command_option_from(args.fast_plotting, 'fast_plotting')

    call_var_bam_command_options = [
        ExecuteCommand('python3', callVarBamBin),
        CommandOption('chkpnt_fn', chkpnt_fn),
        CommandOption('ref_fn', ref_fn),
        CommandOption('bam_fn', bam_fn),
        CommandOption('threshold', af_threshold),
        CommandOption('minCoverage', minCoverage),
        CommandOption('pypy', pypyBin),
        CommandOption('samtools', samtoolsBin),
        CommandOption('delay', delay),
        CommandOption('threads', threads),
        CommandOption('sampleName', sampleName),
        # optional command options
        CommandOption('vcf_fn', vcf_fn) if vcf_fn is not None else None,
        qual,
        stop_consider_left_edge,
        debug,
        pysam_for_all_indel_bases,
        haploid_mode,
        output_for_ensemble,
    ]

    activation_only_command_options = [
        CommandOptionWithNoValue('activation_only'),
        log_path,
        CommandOption('max_plot', args.max_plot),
        CommandOption('parallel_level', args.parallel_level),
        CommandOption('workers', args.workers),
        fast_plotting,
    ] if args.activation_only else []

    is_bed_file_provided = bed_fn is not None
    command_string = command_string_from(call_var_bam_command_options + activation_only_command_options)

    with open(fai_fn, 'r') as fai_fp:
        for row in fai_fp:
            columns = row.strip().split("\t")
            contig_name = columns[0]
            if not is_include_all_contigs and str(contig_name) not in major_contigs:
                continue

            region_start, region_end = 0, 0
            contig_length = int(columns[1])
            while region_end < contig_length:
                region_start = region_end
                region_end = region_start + region_chunk_size
                if region_end > contig_length:
                    region_end = contig_length
                output_fn = "%s.%s_%d_%d.vcf" % (output_prefix, contig_name, region_start, region_end)

                is_region_in_bed = is_bed_file_provided and is_region_in(tree, contig_name, region_start, region_end)
                need_output_command = not is_bed_file_provided or is_region_in_bed
                if not need_output_command:
                    continue

                additional_command_options = [
                    CommandOption('ctgName', contig_name),
                    CommandOption('ctgStart', region_start),
                    CommandOption('ctgEnd', region_end),
                    CommandOption('call_fn', output_fn),
                    CommandOption('bed_fn', bed_fn) if is_region_in_bed else None
                ]
                print(command_string + " " + command_string_from(additional_command_options))
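# --- Hypothetical output of Run() above (paths and values invented for
# illustration). One line is printed per region chunk that survives the BED
# filter, e.g.:
#
#   python3 /opt/clair/clair.py callVarBam --chkpnt_fn model --ref_fn ref.fa \
#       --bam_fn aln.bam --threshold 0.125 --minCoverage 4 --pypy pypy3 \
#       --samtools samtools --delay 10 --threads 4 --sampleName sample \
#       --ctgName chr20 --ctgStart 0 --ctgEnd 10000000 \
#       --call_fn prefix.chr20_0_10000000.vcf
#
# The printed lines are typically piped to a job scheduler such as GNU parallel.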
def CreateTensorPileup(args):
    """
    Create a pileup tensor for pileup model training or calling.
    Use a sliding window to scan the whole candidate region, keep all candidates
    over the specified minimum allelic frequency and minimum depth, and use
    samtools mpileup output to generate the pileup tensors. The candidate region
    is scanned only once, so all variant candidates are collected in a single pass.
    """
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    samtools_execute_command = args.samtools
    bam_file_path = args.bam_fn
    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
    chunk_num = args.chunk_num
    tensor_can_output_path = args.tensor_can_fn
    minimum_af_for_candidate = args.min_af
    minimum_snp_af_for_candidate = args.snp_min_af
    minimum_indel_af_for_candidate = args.indel_min_af
    min_coverage = args.minCoverage
    platform = args.platform
    confident_bed_fn = args.bed_fn
    is_confident_bed_file_given = confident_bed_fn is not None
    alt_fn = args.indel_fn
    extend_bed = args.extend_bed
    is_extend_bed_file_given = extend_bed is not None
    min_mapping_quality = args.minMQ
    min_base_quality = args.minBQ
    fast_mode = args.fast_mode
    vcf_fn = args.vcf_fn
    is_known_vcf_file_provided = vcf_fn is not None
    call_snp_only = args.call_snp_only

    global test_pos
    test_pos = None

    # 1-based regions [start, end] (start and end inclusive)
    ref_regions = []
    reads_regions = []
    known_variants_set = set()

    tree, bed_start, bed_end = bed_tree_from(bed_file_path=extend_bed,
                                             contig_name=ctg_name,
                                             return_bed_region=True)
    fai_fn = file_path_from(fasta_file_path, suffix=".fai", exit_on_not_found=True, sep='.')

    if not is_confident_bed_file_given and chunk_id is not None:
        contig_length = 0
        with open(fai_fn, 'r') as fai_fp:
            for row in fai_fp:
                columns = row.strip().split("\t")
                contig_name = columns[0]
                if contig_name != ctg_name:
                    continue
                contig_length = int(columns[1])
        chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
        ctg_start = chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    if is_confident_bed_file_given and chunk_id is not None:
        chunk_size = (bed_end - bed_start) // chunk_num + 1 if (bed_end - bed_start) % chunk_num \
            else (bed_end - bed_start) // chunk_num
        ctg_start = bed_start + 1 + chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    if is_known_vcf_file_provided and chunk_id is not None:
        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name)
        total_variants_size = len(known_variants_list)
        chunk_variants_size = total_variants_size // chunk_num if total_variants_size % chunk_num == 0 \
            else total_variants_size // chunk_num + 1
        chunk_start_pos = chunk_id * chunk_variants_size
        known_variants_set = set(known_variants_list[chunk_start_pos:chunk_start_pos + chunk_variants_size])
        if len(known_variants_set) == 0:
            return
        ctg_start, ctg_end = min(known_variants_set), max(known_variants_set)

    is_ctg_name_given = ctg_name is not None
    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None
    if is_ctg_range_given:
        extend_start = ctg_start - no_of_positions
        extend_end = ctg_end + no_of_positions
        reads_regions.append(region_from(ctg_name=ctg_name, ctg_start=extend_start, ctg_end=extend_end))
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        ref_regions.append(region_from(ctg_name=ctg_name, ctg_start=reference_start, ctg_end=reference_end))
    elif is_ctg_name_given:
        reads_regions.append(region_from(ctg_name=ctg_name))
        ref_regions.append(region_from(ctg_name=ctg_name))
        reference_start = 1

    reference_sequence = reference_sequence_from(
        samtools_execute_command=samtools_execute_command,
        fasta_file_path=fasta_file_path,
        regions=ref_regions)
    if reference_sequence is None or len(reference_sequence) == 0:
        sys.exit(log_error("[ERROR] Failed to load reference sequence from file ({}).".format(fasta_file_path)))

    if is_confident_bed_file_given and ctg_name not in tree:
        sys.exit(log_error("[ERROR] ctg_name {} does not exist in the bed file ({}).".format(ctg_name, confident_bed_fn)))

    # samtools mpileup options
    # reverse-del: deletions on the forward/reverse strand are marked as '*'/'#'
    min_base_quality = 0 if args.gvcf else min_base_quality
    max_depth = param.max_depth_dict[args.platform] if args.platform else args.max_depth
    mq_option = ' --min-MQ {}'.format(min_mapping_quality)
    bq_option = ' --min-BQ {}'.format(min_base_quality)
    flags_option = ' --excl-flags {}'.format(param.SAMTOOLS_VIEW_FILTER_FLAG)
    max_depth_option = ' --max-depth {}'.format(max_depth)
    bed_option = ' -l {}'.format(extend_bed) if is_extend_bed_file_given else ""
    gvcf_option = ' -a' if args.gvcf else ""
    samtools_mpileup_process = subprocess_popen(
        shlex.split("{} mpileup {} -r {} --reverse-del".format(samtools_execute_command,
                                                               bam_file_path,
                                                               " ".join(reads_regions))
                    + mq_option + bq_option + bed_option + flags_option + max_depth_option + gvcf_option))

    if tensor_can_output_path != "PIPE":
        tensor_can_fpo = open(tensor_can_output_path, "wb")
        tensor_can_fp = subprocess_popen(shlex.split("{} -c".format(param.zstd)), stdin=PIPE, stdout=tensor_can_fpo)
    else:
        tensor_can_fp = TensorStdout(sys.stdout)

    # whether to save all alternative information, only for debug mode
    if alt_fn:
        alt_fp = open(alt_fn, 'w')

    pos_offset = 0
    pre_pos = -1
    tensor = [[]] * sliding_window_size
    candidate_position = []
    all_alt_dict = {}
    depth_dict = {}
    af_dict = {}

    # to generate a gvcf, whole-genome statistical information needs to be recorded
    if args.gvcf:
        nonVariantCaller = variantInfoCalculator(
            gvcfWritePath=args.temp_file_dir,
            ref_path=args.ref_fn,
            bp_resolution=args.bp_resolution,
            ctgName=ctg_name,
            sample_name='.'.join([args.sampleName, ctg_name, str(ctg_start), str(ctg_end)]),
            p_err=args.base_err,
            gq_bin_size=args.gq_bin_size)

    confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn,
                                       contig_name=ctg_name,
                                       bed_ctg_start=extend_start,
                                       bed_ctg_end=extend_end)

    empty_pileup_flag = True
    for row in samtools_mpileup_process.stdout:
        empty_pileup_flag = False
        columns = row.strip().split('\t', maxsplit=5)
        pos = int(columns[1])
        pileup_bases = columns[4]
        reference_base = reference_sequence[pos - reference_start].upper()
        valid_reference_flag = True
        within_flag = True
        if args.gvcf:
            if not valid_reference_flag:
                nonVariantCaller.make_gvcf_online({}, push_current=True)
            if ctg_start is not None and ctg_end is not None:
                within_flag = pos >= ctg_start and pos < ctg_end
            elif ctg_start is not None and ctg_end is None:
                within_flag = pos >= ctg_start
            elif ctg_start is None and ctg_end is not None:
                within_flag = pos <= ctg_end
            else:
                within_flag = True
            if columns[3] == '0' and within_flag and valid_reference_flag:
                cur_site_info = {'chr': columns[0], 'pos': pos, 'ref': reference_base, 'n_total': 0, 'n_ref': 0}
                nonVariantCaller.make_gvcf_online(cur_site_info)
                continue

        # starting a new region: clear all sliding-window caches to avoid stale entries
        if pre_pos + 1 != pos:
            pos_offset = 0
            tensor = [[]] * sliding_window_size
            candidate_position = []
        pre_pos = pos

        # a condition to skip tensor creation at some positions while still
        # returning the allele summary from the allele-count function
        pileup_tensor, alt_dict, af, depth, pass_af, pileup_list, max_del_length = generate_tensor(
            pos=pos,
            pileup_bases=pileup_bases,
            reference_sequence=reference_sequence,
            reference_start=reference_start,
            reference_base=reference_base,
            minimum_af_for_candidate=minimum_af_for_candidate,
            minimum_snp_af_for_candidate=minimum_snp_af_for_candidate,
            minimum_indel_af_for_candidate=minimum_indel_af_for_candidate,
            platform=platform,
            fast_mode=fast_mode,
            call_snp_only=call_snp_only)

        if args.gvcf and within_flag and valid_reference_flag:
            cur_n_total = 0
            cur_n_ref = 0
            for _key, _value in pileup_list:
                if (_key == reference_base):
                    cur_n_ref = _value
                cur_n_total += _value
            cur_site_info = {'chr': columns[0], 'pos': pos, 'ref': reference_base,
                             'n_total': cur_n_total, 'n_ref': cur_n_ref}
            nonVariantCaller.make_gvcf_online(cur_site_info)

        pass_confident_bed = not is_confident_bed_file_given or is_region_in(
            tree=confident_bed_tree,
            contig_name=ctg_name,
            region_start=pos - 1,
            region_end=pos + max_del_length + 1)  # 0-based
        if (pass_confident_bed and reference_base in 'ACGT' and (pass_af and depth >= min_coverage)
                and not is_known_vcf_file_provided) or (is_known_vcf_file_provided and pos in known_variants_set):
            candidate_position.append(pos)
            all_alt_dict[pos] = alt_dict
            depth_dict[pos] = depth
            af_dict[pos] = af
        tensor[pos_offset] = pileup_tensor

        # save a pileup tensor for each candidate position with flanking_base_num bp of context on each side
        pos_offset = (pos_offset + 1) % sliding_window_size
        if len(candidate_position) and pos - candidate_position[0] == flanking_base_num:
            center = candidate_position.pop(0)
            has_empty_tensor = sum([True for item in tensor if not len(item)])
            if not has_empty_tensor:
                depth = depth_dict[center]
                ref_seq = reference_sequence[center - flanking_base_num - reference_start:
                                             center + flanking_base_num + 1 - reference_start]
                concat_tensor = tensor[pos_offset:] + tensor[0:pos_offset]
                alt_info = str(depth) + '-' + ' '.join(
                    [' '.join([item[0], str(item[1])]) for item in list(all_alt_dict[center].items())])
                l = "%s\t%d\t%s\t%s\t%s" % (
                    ctg_name,
                    center,
                    ref_seq,
                    " ".join(" ".join("%d" % x for x in innerlist) for innerlist in concat_tensor),
                    alt_info)
                tensor_can_fp.stdin.write(l)
                tensor_can_fp.stdin.write("\n")
                if alt_fn:
                    alt_info = ' '.join([' '.join([item[0], str(item[1])])
                                         for item in list(all_alt_dict[center].items())])
                    alt_fp.write('\t'.join([ctg_name + ' ' + str(center), str(depth), alt_info,
                                            str(af_dict[center])]) + '\n')
            del all_alt_dict[center], depth_dict[center], af_dict[center]

    if args.gvcf and len(nonVariantCaller.current_block) != 0:
        nonVariantCaller.write_to_gvcf_batch(nonVariantCaller.current_block,
                                             nonVariantCaller.cur_min_DP,
                                             nonVariantCaller.cur_raw_gq)

    if args.gvcf and empty_pileup_flag:
        nonVariantCaller.write_empty_pileup(ctg_name, ctg_start, ctg_end)
    if args.gvcf:
        nonVariantCaller.close_vcf_writer()

    samtools_mpileup_process.stdout.close()
    samtools_mpileup_process.wait()

    if tensor_can_output_path != "PIPE":
        tensor_can_fp.stdin.close()
        tensor_can_fp.wait()
        tensor_can_fpo.close()

    if alt_fn:
        alt_fp.close()
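# --- Minimal sketch of the ring-buffer window used above (illustrative only;
# the default window size is assumed here, not taken from param). The function
# stores one pileup row per position in a fixed-size list and rotates a write
# cursor; concatenating the two halves at the cursor restores left-to-right
# genomic order without ever shifting elements.
def _demo_ring_buffer(sliding_window_size=33):
    tensor = [[]] * sliding_window_size
    pos_offset = 0
    for pos in range(100, 100 + sliding_window_size + 5):  # made-up positions
        tensor[pos_offset] = [pos]                         # stand-in pileup row
        pos_offset = (pos_offset + 1) % sliding_window_size
    # pos_offset now points at the oldest entry, so this is oldest-to-newest
    ordered = tensor[pos_offset:] + tensor[0:pos_offset]
    return ordered  # rows for the last sliding_window_size positions, in order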
def make_candidates(args):
    gen4Training = args.gen4Training
    variant_file_path = args.var_fn
    bed_file_path = args.bed_fn
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    output_probability = args.outputProb
    samtools_execute_command = args.samtools
    minimum_depth_for_candidate = args.minCoverage
    minimum_af_for_candidate = args.threshold
    minimum_mapping_quality = args.minMQ
    bam_file_path = args.bam_fn
    candidate_output_path = args.can_fn
    is_using_stdout_for_output_candidate = candidate_output_path == "PIPE"

    is_building_training_dataset = gen4Training == True
    is_variant_file_given = variant_file_path is not None
    is_bed_file_given = bed_file_path is not None
    is_ctg_name_given = ctg_name is not None
    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None

    if is_building_training_dataset:
        # minimum_depth_for_candidate = 0
        minimum_af_for_candidate = 0

    # preparation for candidates near variants
    need_consider_candidates_near_variant = is_building_training_dataset and is_variant_file_given
    variants_map = variants_map_from(variant_file_path) if need_consider_candidates_near_variant else {}
    non_variants_map = non_variants_map_near_variants_from(variants_map)
    no_of_candidates_near_variant = 0
    no_of_candidates_outside_variant = 0

    # update output probabilities for candidates near variants
    # original: (7000000.0 * 2.0 / 3000000000)
    ratio_of_candidates_near_variant_to_candidates_outside_variant = 1.0
    output_probability_near_variant = (
        3500000.0 * ratio_of_candidates_near_variant_to_candidates_outside_variant *
        RATIO_OF_NON_VARIANT_TO_VARIANT / 14000000)
    output_probability_outside_variant = 3500000.0 * RATIO_OF_NON_VARIANT_TO_VARIANT / (3000000000 - 14000000)

    if not isfile("{}.fai".format(fasta_file_path)):
        print("Fasta index {}.fai doesn't exist.".format(fasta_file_path), file=sys.stderr)
        sys.exit(1)

    # 1-based regions [start, end] (start and end inclusive)
    regions = []
    reference_start, reference_end = None, None
    if is_ctg_range_given:
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        regions.append(region_from(ctg_name=ctg_name, ctg_start=reference_start, ctg_end=reference_end))
    elif is_ctg_name_given:
        regions.append(region_from(ctg_name=ctg_name))

    reference_sequence = reference_sequence_from(
        samtools_execute_command=samtools_execute_command,
        fasta_file_path=fasta_file_path,
        regions=regions)

    if reference_sequence is None or len(reference_sequence) == 0:
        print("[ERROR] Failed to load reference sequence from file ({}).".format(fasta_file_path),
              file=sys.stderr)
        sys.exit(1)

    tree = bed_tree_from(bed_file_path=bed_file_path)
    if is_bed_file_given and ctg_name not in tree:
        print("[ERROR] ctg_name ({}) does not exist in the bed file ({}).".format(ctg_name, bed_file_path),
              file=sys.stderr)
        sys.exit(1)

    samtools_view_process = subprocess_popen(
        shlex.split("{} view -F {} {} {}".format(samtools_execute_command,
                                                 param.SAMTOOLS_VIEW_FILTER_FLAG,
                                                 bam_file_path,
                                                 " ".join(regions))))

    if is_using_stdout_for_output_candidate:
        can_fp = CandidateStdout(sys.stdout)
    else:
        can_fpo = open(candidate_output_path, "wb")
        can_fp = subprocess_popen(shlex.split("gzip -c"), stdin=PIPE, stdout=can_fpo)

    pileup = defaultdict(lambda: {"A": 0, "C": 0, "G": 0, "T": 0, "I": 0, "D": 0, "N": 0})
    POS = 0
    number_of_reads_processed = 0

    while True:
        row = samtools_view_process.stdout.readline()
        is_finish_reading_output = row == '' and samtools_view_process.poll() is not None

        if row:
            columns = row.strip().split()
            if columns[0][0] == "@":
                continue

            RNAME = columns[2]
            if RNAME != ctg_name:
                continue

            POS = int(columns[3]) - 1  # switch from 1-based to 0-based to match sequence index
            MAPQ = int(columns[4])
            CIGAR = columns[5]
            SEQ = columns[9].upper()  # uppercase for SEQ (regexp is \*|[A-Za-z=.]+)

            reference_position = POS
            query_position = 0

            if MAPQ < minimum_mapping_quality:
                continue
            if CIGAR == "*" or is_too_many_soft_clipped_bases_for_a_read_from(CIGAR):
                continue

            number_of_reads_processed += 1

            advance = 0
            for c in str(CIGAR):
                if c.isdigit():
                    advance = advance * 10 + int(c)
                    continue

                if c == "S":
                    query_position += advance
                elif c == "M" or c == "=" or c == "X":
                    for _ in range(advance):
                        base = evc_base_from(SEQ[query_position])
                        pileup[reference_position][base] += 1
                        # these CIGAR operations consume both query and reference
                        reference_position += 1
                        query_position += 1
                elif c == "I":
                    pileup[reference_position - 1]["I"] += 1
                    # insertion consumes the query
                    query_position += advance
                elif c == "D":
                    pileup[reference_position - 1]["D"] += 1
                    # deletion consumes the reference
                    reference_position += advance

                # reset advance
                advance = 0

        positions = [x for x in pileup.keys() if x < POS] if not is_finish_reading_output else list(pileup.keys())
        positions.sort()
        for zero_based_position in positions:
            base_count = depth = reference_base = temp_key = None

            # ctg and bed checking (region [ctg_start, ctg_end] is 1-based, with inclusive start and end)
            pass_ctg = not is_ctg_range_given or ctg_start <= zero_based_position + 1 <= ctg_end
            pass_bed = not is_bed_file_given or is_region_in(tree, ctg_name, zero_based_position)
            if not pass_bed or not pass_ctg:
                continue

            # output probability checking
            pass_output_probability = True
            if is_building_training_dataset and is_variant_file_given:
                temp_key = ctg_name + ":" + str(zero_based_position + 1)
                pass_output_probability = (
                    temp_key not in variants_map and
                    ((temp_key in non_variants_map and
                      random.uniform(0, 1) <= output_probability_near_variant) or
                     (temp_key not in non_variants_map and
                      random.uniform(0, 1) <= output_probability_outside_variant)))
            elif is_building_training_dataset:
                pass_output_probability = random.uniform(0, 1) <= output_probability
            if not pass_output_probability:
                continue

            # for depth checking and af checking
            try:
                reference_base = evc_base_from(reference_sequence[
                    zero_based_position - (0 if reference_start is None else (reference_start - 1))])
                position_dict = pileup[zero_based_position]
            except:
                continue

            # depth checking
            base_count = list(position_dict.items())
            depth = sum(x[1] for x in base_count) - position_dict["I"] - position_dict["D"]
            if depth < minimum_depth_for_candidate:
                continue

            # af checking
            denominator = depth if depth > 0 else 1
            base_count.sort(key=lambda x: -x[1])  # sort base_count in descending order
            pass_af = (base_count[0][0] != reference_base or
                       (float(base_count[1][1]) / denominator) >= minimum_af_for_candidate)
            if not pass_af:
                continue

            # output 1-based candidate
            if temp_key is not None and temp_key in non_variants_map:
                no_of_candidates_near_variant += 1
            elif temp_key is not None and temp_key not in non_variants_map:
                no_of_candidates_outside_variant += 1

            output = [ctg_name, zero_based_position + 1, reference_base, depth]
            output.extend(["%s %d" % x for x in base_count])
            output = " ".join([str(x) for x in output]) + "\n"
            can_fp.stdin.write(output)

        for zero_based_position in positions:
            del pileup[zero_based_position]

        if is_finish_reading_output:
            break

    if need_consider_candidates_near_variant:
        print("# of candidates near variant: ", no_of_candidates_near_variant)
        print("# of candidates outside variant: ", no_of_candidates_outside_variant)

    samtools_view_process.stdout.close()
    samtools_view_process.wait()

    if not is_using_stdout_for_output_candidate:
        can_fp.stdin.close()
        can_fp.wait()
        can_fpo.close()

    if number_of_reads_processed == 0:
        print("No read has been processed: either the genome region you specified has no read coverage, "
              "or please check the correctness of your BAM input (%s)." % (bam_file_path),
              file=sys.stderr)
        sys.exit(0)
def get_training_array(tensor_fn, var_fn, bed_fn, shuffle=True, is_allow_duplicate_chr_pos=False):
    tree = bed_tree_from(bed_file_path=bed_fn)
    is_tree_empty = len(tree.keys()) == 0

    Y = variant_map_from(var_fn, tree, is_tree_empty)

    X = {}
    f = subprocess_popen(shlex.split("gzip -fdc %s" % (tensor_fn)))
    total = 0
    mat = np.empty(input_tensor_size, dtype=np.float32)  # preallocated; overwritten per record below
    for row in f.stdout:
        chrom, coord, seq, mat = unpack_a_tensor_record(*(row.split()))
        if not (is_tree_empty or is_region_in(tree, chrom, int(coord))):
            continue
        seq = seq.upper()
        if seq[param.flankingBaseNum] not in BASIC_BASES:
            continue
        key = chrom + ":" + coord

        x = np.reshape(mat, (no_of_positions, matrix_row, matrix_num))
        # subtract the reference channel from every non-reference channel
        for i in range(1, matrix_num):
            x[:, :, i] -= x[:, :, 0]

        if key not in X:
            X[key] = np.copy(x)
        elif is_allow_duplicate_chr_pos:
            new_key = ""
            for character in PREFIX_CHAR_STR:
                tmp_key = character + key
                if tmp_key not in X:
                    new_key = tmp_key
                    break
            if len(new_key) > 0:
                X[new_key] = np.copy(x)

        is_reference = key not in Y
        if is_reference:
            Y[key] = output_labels_from_reference(BASE2ACGT[seq[param.flankingBaseNum]])

        total += 1
        if total % 100000 == 0:
            print("Processed %d tensors" % total, file=sys.stderr)
    f.stdout.close()
    f.wait()

    # print("[INFO] size of X: {}, size of Y: {}".format(len(X), len(Y)))
    all_chr_pos = sorted(X.keys())
    if shuffle == True:
        np.random.shuffle(all_chr_pos)

    X_compressed, Y_compressed, pos_compressed = [], [], []
    X_array, Y_array, pos_array = [], [], []
    count = 0
    total = 0
    for key in all_chr_pos:
        total += 1

        X_array.append(X[key])
        del X[key]

        if key in Y:
            Y_array.append(Y[key])
            pos_array.append(key)
            if not is_allow_duplicate_chr_pos:
                del Y[key]
        elif is_allow_duplicate_chr_pos:
            # a duplicate key carries a one-character prefix; strip it to find the label
            tmp_key = key[1:]
            Y_array.append(Y[tmp_key])
            pos_array.append(tmp_key)

        count += 1
        if count == param.bloscBlockSize:
            X_compressed.append(blosc_pack_array(np.array(X_array)))
            Y_compressed.append(blosc_pack_array(np.array(Y_array)))
            pos_compressed.append(blosc_pack_array(np.array(pos_array)))
            X_array, Y_array, pos_array = [], [], []
            count = 0

        if total % 50000 == 0:
            print("Compressed %d/%d tensors" % (total, len(all_chr_pos)), file=sys.stderr)

    # flush the final, possibly partial, block
    if count > 0:
        X_compressed.append(blosc_pack_array(np.array(X_array)))
        Y_compressed.append(blosc_pack_array(np.array(Y_array)))
        pos_compressed.append(blosc_pack_array(np.array(pos_array)))

    return total, X_compressed, Y_compressed, pos_compressed
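# --- Hedged sketch of the compression round-trip (illustrative only; assumes
# blosc_pack_array above wraps blosc.pack_array from the python-blosc package,
# and that the reader side calls blosc.unpack_array, which is not shown in
# this module).
def _demo_blosc_roundtrip():
    import blosc
    import numpy as np
    block = np.arange(12, dtype=np.float32).reshape(3, 4)  # stand-in tensor block
    packed = blosc.pack_array(block)                       # compressed bytes
    restored = blosc.unpack_array(packed)
    assert np.array_equal(block, restored)
    return len(packed)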
def MergeVcf_illumina(args):
    # Region-based VCF merging for Illumina: read realignment can shift
    # candidate variants or make them go missing.
    bed_fn_prefix = args.bed_fn_prefix
    output_fn = args.output_fn
    full_alignment_vcf_fn = args.full_alignment_vcf_fn
    pileup_vcf_fn = args.pileup_vcf_fn  # true vcf var
    contig_name = args.ctgName
    QUAL = args.qual
    bed_fn = None

    if not os.path.exists(bed_fn_prefix):
        exit(log_error("[ERROR] Input directory: {} does not exist!".format(bed_fn_prefix)))

    all_files = os.listdir(bed_fn_prefix)
    all_files = [item for item in all_files if item.startswith(contig_name + '.')]
    if len(all_files) != 0:
        # concatenate the per-region BED files for this contig into one file
        bed_fn = os.path.join(bed_fn_prefix, "full_aln_regions_{}".format(contig_name))
        with open(bed_fn, 'w') as output_file:
            for file in all_files:
                with open(os.path.join(bed_fn_prefix, file)) as f:
                    output_file.write(f.read())

    is_haploid_precise_mode_enabled = args.haploid_precise
    is_haploid_sensitive_mode_enabled = args.haploid_sensitive
    print_ref = args.print_ref_calls

    tree = bed_tree_from(bed_file_path=bed_fn, padding=param.no_of_positions, contig_name=contig_name)

    unzip_process = subprocess_popen(shlex.split("gzip -fdc %s" % (pileup_vcf_fn)))
    output_dict = {}
    header = []
    pileup_count = 0
    for row in unzip_process.stdout:
        if row[0] == '#':
            header.append(row)
            continue
        columns = row.strip().split()
        ctg_name = columns[0]
        if contig_name is not None and ctg_name != contig_name:
            continue
        pos = int(columns[1])
        qual = float(columns[5])
        pass_bed = is_region_in(tree, ctg_name, pos)
        ref_base, alt_base = columns[3], columns[4]
        is_reference = (alt_base == "." or ref_base == alt_base)

        if is_haploid_precise_mode_enabled:
            row = update_haploid_precise_genotype(columns)
        if is_haploid_sensitive_mode_enabled:
            row = update_haploid_sensitive_genotype(columns)

        # keep pileup calls only outside the realigned regions
        if not pass_bed:
            if not is_reference:
                row = MarkLowQual(row, QUAL, qual)
                output_dict[pos] = row
                pileup_count += 1
            elif print_ref:
                output_dict[pos] = row
                pileup_count += 1

    unzip_process.stdout.close()
    unzip_process.wait()

    realigned_vcf_unzip_process = subprocess_popen(shlex.split("gzip -fdc %s" % (full_alignment_vcf_fn)))
    realigned_count = 0
    for row in realigned_vcf_unzip_process.stdout:
        if row[0] == '#':
            continue
        columns = row.strip().split()
        ctg_name = columns[0]
        if contig_name is not None and ctg_name != contig_name:
            continue
        pos = int(columns[1])
        qual = float(columns[5])
        ref_base, alt_base = columns[3], columns[4]
        is_reference = (alt_base == "." or ref_base == alt_base)

        if is_haploid_precise_mode_enabled:
            row = update_haploid_precise_genotype(columns)
        if is_haploid_sensitive_mode_enabled:
            row = update_haploid_sensitive_genotype(columns)

        # keep full-alignment calls only inside the realigned regions
        if is_region_in(tree, ctg_name, pos):
            if not is_reference:
                row = MarkLowQual(row, QUAL, qual)
                output_dict[pos] = row
                realigned_count += 1
            elif print_ref:
                output_dict[pos] = row
                realigned_count += 1

    logging.info('[INFO] Pileup variants processed in {}: {}'.format(contig_name, pileup_count))
    logging.info('[INFO] Realigned variants processed in {}: {}'.format(contig_name, realigned_count))

    realigned_vcf_unzip_process.stdout.close()
    realigned_vcf_unzip_process.wait()

    with open(output_fn, 'w') as output_file:
        output_list = header + [output_dict[pos] for pos in sorted(output_dict.keys())]
        output_file.write(''.join(output_list))
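# --- Minimal sketch of the merge precedence in MergeVcf_illumina() above
# (illustrative only). Pileup rows are written into output_dict first and
# full-alignment rows afterwards, so at a shared position the later, realigned
# call overwrites the pileup call.
def _demo_merge_precedence():
    output_dict = {}
    output_dict[100] = "pileup_call"           # first pass: pileup VCF
    output_dict[100] = "full_alignment_call"   # second pass wins at the same pos
    return [output_dict[pos] for pos in sorted(output_dict)]  # -> ["full_alignment_call"]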
def bin_reader_generator_from(tensor_fn,
                              Y_true_var,
                              Y,
                              is_tree_empty,
                              tree,
                              miss_variant_set,
                              truth_alt_dict,
                              is_allow_duplicate_chr_pos=False,
                              maximum_non_variant_ratio=None):
    """
    Bin reader generator for bin file generation.
    tensor_fn: tensor file.
    Y_true_var: dictionary (contig name: label information) containing all true variant information (should not be changed).
    Y: dictionary (contig name: label information) to store all variant and non-variant information.
    tree: dictionary (contig name: intervaltree) for quick region querying.
    miss_variant_set: true variants that may be missing after read downsampling.
    truth_alt_dict: unified truth reference and alternative bases, used to find read support.
    is_allow_duplicate_chr_pos: whether to allow duplicate positions when training; for downsampled data, a lower-depth duplicate gets a random prefix character.
    maximum_non_variant_ratio: maximum non-variant ratio for training. Using more non-variant data is generally preferable, but it greatly increases training time, especially for ONT data; a variant:non-variant candidate ratio of 1:1 or 1:2 is typical.
    """
    X = {}
    ref_list = []
    total = 0
    variant_set_with_read_support = set()
    variants_without_read_support = 0
    for row_idx, row in enumerate(tensor_fn):
        chrom, coord, seq, string, alt_info = row.split("\t")
        alt_info = alt_info.rstrip()
        if not (is_tree_empty or is_region_in(tree, chrom, int(coord))):
            continue
        seq = seq.upper()
        if seq[param.flankingBaseNum] not in 'ACGT':
            continue
        key = chrom + ":" + coord
        is_reference = key not in Y_true_var

        if key in miss_variant_set:
            continue

        have_read_support = find_read_support(pos=coord, truth_alt_dict=truth_alt_dict, alt_info=alt_info)
        if have_read_support is not None and not have_read_support:
            miss_variant_set.add(key)
            variants_without_read_support += 1
            continue

        variant_set_with_read_support.add(key)

        if key not in X:
            X[key] = (string, alt_info, seq)
            if is_reference:
                ref_list.append(key)
        elif is_allow_duplicate_chr_pos:
            new_key = ""
            for character in PREFIX_CHAR_STR:
                tmp_key = character + key
                if tmp_key not in X:
                    new_key = tmp_key
                    break
            if len(new_key) > 0:
                X[new_key] = (string, alt_info, seq)
            if is_reference:
                ref_list.append(new_key)

        if is_reference and key not in Y:
            Y[key] = output_labels_from_reference(BASE2BASE[seq[param.flankingBaseNum]])

        # emit a full bin and start a new one
        if len(X) == shuffle_bin_size:
            if maximum_non_variant_ratio is not None:
                _filter_non_variants(X, ref_list, maximum_non_variant_ratio)
            yield X, total, False
            X = {}
            ref_list = []

        total += 1
        if total % 100000 == 0:
            print("[INFO] Processed %d tensors" % total, file=sys.stderr)

    print("[INFO] Variants with read support/variants without read support: {}/{}".format(
        len(variant_set_with_read_support), variants_without_read_support))
    if maximum_non_variant_ratio is not None:
        _filter_non_variants(X, ref_list, maximum_non_variant_ratio)
    yield X, total, True
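# --- Hedged usage sketch for bin_reader_generator_from() (illustrative only;
# tensor_fn is expected to be an iterable of tab-separated rows, e.g. the
# stdout of a decompression subprocess).
def _demo_consume_bins(bin_reader_generator):
    bins = []
    for X, total, is_last_bin in bin_reader_generator:
        bins.append(len(X))      # each X maps "contig:pos" -> (string, alt_info, seq)
        if is_last_bin:          # the final, possibly partial, bin is flagged True
            return bins, total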
def CreateTensorPileup(args):
    """
    Create a pileup tensor for pileup model training or calling.
    Use a sliding window to scan the whole candidate region, keep all candidates
    over the specified minimum allelic frequency and minimum depth, and use
    samtools mpileup output to generate the pileup tensors. The candidate region
    is scanned only once, so all variant candidates are collected in a single pass.
    """
    ctg_start = args.ctgStart
    ctg_end = args.ctgEnd
    fasta_file_path = args.ref_fn
    ctg_name = args.ctgName
    bam_file_path = args.bam_fn
    chunk_id = args.chunk_id - 1 if args.chunk_id else None  # 1-base to 0-base
    chunk_num = args.chunk_num
    minimum_snp_af_for_candidate = args.snp_min_af
    minimum_indel_af_for_candidate = args.indel_min_af
    min_coverage = args.minCoverage
    min_mapping_quality = args.minMQ
    platform = args.platform

    vcf_fn = file_path_from(args.vcf_fn)
    is_known_vcf_file_provided = vcf_fn is not None
    confident_bed_fn = file_path_from(args.extend_bed)
    is_confident_bed_file_given = confident_bed_fn is not None
    extend_bed = file_path_from(args.extend_bed)
    is_extend_bed_file_given = extend_bed is not None
    fast_mode = args.fast_mode
    call_snp_only = args.call_snp_only
    enable_long_indel = args.enable_long_indel

    # 1-based regions [start, end] (start and end inclusive)
    tree, bed_start, bed_end = bed_tree_from(bed_file_path=extend_bed,
                                             contig_name=ctg_name,
                                             return_bed_region=True)
    fai_fn = file_path_from(fasta_file_path, suffix=".fai", exit_on_not_found=True, sep='.')

    fast_mode = platform == 'ont' and fast_mode
    minimum_snp_af_for_candidate = max(minimum_snp_af_for_candidate, param.min_af_dict[platform]) \
        if fast_mode else minimum_snp_af_for_candidate
    min_coverage = max(min_coverage, 4) if fast_mode else min_coverage
    max_indel_length = param.maximum_variant_length_that_need_infer if not enable_long_indel \
        else param.maximum_variant_length_that_need_infer_include_long_indel

    if not is_confident_bed_file_given and chunk_id is not None:
        contig_length = 0
        with open(fai_fn, 'r') as fai_fp:
            for row in fai_fp:
                columns = row.strip().split("\t")
                contig_name = columns[0]
                if contig_name != ctg_name:
                    continue
                contig_length = int(columns[1])
        chunk_size = contig_length // chunk_num + 1 if contig_length % chunk_num else contig_length // chunk_num
        ctg_start = chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    if is_confident_bed_file_given and chunk_id is not None:
        chunk_size = (bed_end - bed_start) // chunk_num + 1 if (bed_end - bed_start) % chunk_num \
            else (bed_end - bed_start) // chunk_num
        ctg_start = bed_start + 1 + chunk_size * chunk_id  # 0-base to 1-base
        ctg_end = ctg_start + chunk_size

    if is_known_vcf_file_provided and chunk_id is not None:
        known_variants_list = vcf_candidates_from(vcf_fn=vcf_fn, contig_name=ctg_name)
        total_variants_size = len(known_variants_list)
        chunk_variants_size = total_variants_size // chunk_num if total_variants_size % chunk_num == 0 \
            else total_variants_size // chunk_num + 1
        chunk_start_pos = chunk_id * chunk_variants_size
        known_variants_set = set(known_variants_list[chunk_start_pos:chunk_start_pos + chunk_variants_size])
        if len(known_variants_set) == 0:
            return [], [], []
        ctg_start, ctg_end = min(known_variants_set), max(known_variants_set)

    is_ctg_name_given = ctg_name is not None
    is_ctg_range_given = is_ctg_name_given and ctg_start is not None and ctg_end is not None
    if is_ctg_range_given:
        ctg_start = max(1, ctg_start)
        extend_start = max(1, ctg_start - no_of_positions)
        extend_end = ctg_end + no_of_positions

    region_str = "{}:{}-{}".format(ctg_name, extend_start, extend_end)
    region = Region.from_string(region_str)

    confident_bed_tree = bed_tree_from(bed_file_path=confident_bed_fn,
                                       contig_name=ctg_name,
                                       bed_ctg_start=extend_start,
                                       bed_ctg_end=extend_end)

    if args.gvcf:
        from preprocess.utils import variantInfoCalculator
        nonVariantCaller = variantInfoCalculator(
            gvcfWritePath=args.temp_file_dir,
            ref_path=args.ref_fn,
            bp_resolution=args.bp_resolution,
            ctgName=ctg_name,
            sample_name='.'.join([args.sampleName, ctg_name, str(ctg_start), str(ctg_end)]),
            p_err=args.base_err,
            gq_bin_size=args.gq_bin_size)

    chunk_result, all_alt_info_list, gvcf_output = pileup_counts_clair3(
        region,
        bam=bam_file_path,
        fasta=fasta_file_path,
        min_depth=min_coverage,
        min_snp_af=minimum_snp_af_for_candidate,
        min_indel_af=minimum_indel_af_for_candidate,
        min_mq=min_mapping_quality,
        max_indel_length=max_indel_length,
        call_snp_only=call_snp_only,
        max_depth=param.max_depth,
        gvcf=args.gvcf)

    # slice the candidate tensors according to the alternative information
    np_pileup_data, all_position_info, all_alt_info = [], [], []
    for idx, (pos, pos_info, alt_info) in enumerate(all_alt_info_list):
        pos = int(pos)
        pass_confident_bed = not is_confident_bed_file_given or is_region_in(
            tree=confident_bed_tree,
            contig_name=ctg_name,
            region_start=pos - 1,
            region_end=pos + 1)
        pass_vcf_region = not is_known_vcf_file_provided or (
            is_known_vcf_file_provided and pos in known_variants_set)
        if not pass_confident_bed or not pass_vcf_region:
            continue

        start, end = pos - flanking_base_num, pos + flanking_base_num + 1
        for result in chunk_result:
            if start - 1 >= result[1][0][0] and end <= result[1][-1][0]:
                offset = start - result[1][0][0] - 1
                tensor = result[0][offset:offset + no_of_positions]
                # mainly because of no coverage in the flanking window
                if tensor.shape != (no_of_positions, channel_size):
                    continue
                # skip candidates whose flanking window has an empty (all-zero) column
                if np.sum(np.sum(tensor == 0, axis=1) == channel_size) > 0:
                    continue
                np_pileup_data.append(tensor)
                all_position_info.append(pos_info)
                all_alt_info.append(alt_info)
    np_pileup_data = np.array(np_pileup_data, dtype=np.int32)

    if args.gvcf:
        from shared.utils import reference_sequence_from, region_from
        samtools_execute_command = args.samtools
        ref_regions = []
        reference_start, reference_end = ctg_start - param.expandReferenceRegion, ctg_end + param.expandReferenceRegion
        reference_start = 1 if reference_start < 1 else reference_start
        ref_regions.append(region_from(ctg_name=ctg_name, ctg_start=reference_start, ctg_end=reference_end))
        reference_sequence = reference_sequence_from(
            samtools_execute_command=samtools_execute_command,
            fasta_file_path=fasta_file_path,
            regions=ref_regions)

        offset = 0 if ctg_start == 1 else 1
        empty_pileup_flag = False
        start = ctg_start - extend_start + offset
        end = ctg_end + 1 - extend_start + offset
        if sum(gvcf_output[1][start:end]) == 0:
            empty_pileup_flag = True
        for pos in range(ctg_start, ctg_end):
            if empty_pileup_flag:
                break
            ref_count = gvcf_output[0][pos - extend_start + offset]
            total_count = gvcf_output[1][pos - extend_start + offset]
            if pos - reference_start >= len(reference_sequence):
                continue
            reference_base = reference_sequence[pos - reference_start]
            if (ref_count == 0 and total_count == 0):
                cur_site_info = {'chr': ctg_name, 'pos': pos, 'ref': reference_base,
                                 'n_total': 0, 'n_ref': 0}
                nonVariantCaller.make_gvcf_online(cur_site_info)
                continue
            cur_site_info = {'chr': ctg_name, 'pos': pos, 'ref': reference_base,
                             'n_total': total_count, 'n_ref': ref_count}
            nonVariantCaller.make_gvcf_online(cur_site_info)

        if len(nonVariantCaller.current_block) != 0:
            nonVariantCaller.write_to_gvcf_batch(nonVariantCaller.current_block,
                                                 nonVariantCaller.cur_min_DP,
                                                 nonVariantCaller.cur_raw_gq)
        if empty_pileup_flag:
            nonVariantCaller.write_empty_pileup(ctg_name, ctg_start, ctg_end)
        nonVariantCaller.close_vcf_writer()

    return np_pileup_data, all_position_info, all_alt_info
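# --- Minimal sketch of the flanking-window slice in CreateTensorPileup() above
# (illustrative only; the channel count and chunk layout are made up, not taken
# from the module). A candidate at pos is kept only if the full window of
# no_of_positions rows around it fits inside one chunk and contains no all-zero
# column.
def _demo_window_slice(pos=5000, flanking_base_num=16, chunk_first_pos=4900):
    import numpy as np
    no_of_positions = 2 * flanking_base_num + 1
    chunk = np.ones((200, 18), dtype=np.int32)   # stand-in chunk: 200 rows, 18 channels
    offset = (pos - flanking_base_num) - chunk_first_pos
    tensor = chunk[offset:offset + no_of_positions]
    has_empty_column = np.sum(np.sum(tensor == 0, axis=1) == tensor.shape[1]) > 0
    return tensor.shape, has_empty_column        # -> ((33, 18), False)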