def filter_bad_variants(self, reference_file, input_vcf, output_prefix,
                        snp_filter_name='ambiguous_snp', snp_QD=2.0, snp_FS=60.0, snp_MQ=40.0,
                        snp_HaplotypeScore=13.0, snp_MappingQualityRankSum=-12.5, snp_ReadPosRankSum=-8.0,
                        indel_filter_name='ambiguous_indel', indel_QD=2.0, indel_ReadPosRankSum=-20.0,
                        indel_FS=200.0, combine_vcf=False, sequence_dict_file=None,
                        picard_memory="1g", picard_dir=None):
    from RouToolPa.Tools.GATK4 import SelectVariants4
    from RouToolPa.Tools.Picard import SortVcf

    snp_raw_vcf = "%s.snp.raw.vcf" % output_prefix
    indel_raw_vcf = "%s.indel.raw.vcf" % output_prefix
    snp_filtered_vcf = "%s.snp.with_filters.vcf" % output_prefix
    indel_filtered_vcf = "%s.indel.with_filters.vcf" % output_prefix
    snp_good_vcf = "%s.snp.good.vcf" % output_prefix
    indel_good_vcf = "%s.indel.good.vcf" % output_prefix
    unsorted_combined_filtered_vcf = "%s.combined.with_filters.unsorted.vcf" % output_prefix
    unsorted_combined_good_vcf = "%s.combined.good.unsorted.vcf" % output_prefix
    combined_filtered_vcf = "%s.combined.with_filters.sorted.vcf" % output_prefix
    combined_good_vcf = "%s.combined.good.sorted.vcf" % output_prefix

    SelectVariants4.path = self.path
    SortVcf.jar_path = picard_dir
    SortVcf.max_memory = picard_memory
    #CombineVariants.jar_path = self.jar_path

    # Split the input VCF into SNPs and indels
    SelectVariants4.get_SNP(reference_file, input_vcf, snp_raw_vcf)
    SelectVariants4.get_indel(reference_file, input_vcf, indel_raw_vcf)

    # Apply hard filters separately to SNPs and indels
    self.filter_bad_SNP(reference_file, snp_raw_vcf, snp_filtered_vcf, filter_name=snp_filter_name,
                        QD=snp_QD, FS=snp_FS, MQ=snp_MQ,
                        #HaplotypeScore=snp_HaplotypeScore,
                        MappingQualityRankSum=snp_MappingQualityRankSum,
                        ReadPosRankSum=snp_ReadPosRankSum)
    self.filter_bad_indel(reference_file, indel_raw_vcf, indel_filtered_vcf, filter_name=indel_filter_name,
                          QD=indel_QD, ReadPosRankSum=indel_ReadPosRankSum, FS=indel_FS)

    # Keep only entries that passed the filters
    SelectVariants4.remove_entries_with_filters(reference_file, snp_filtered_vcf, snp_good_vcf)
    SelectVariants4.remove_entries_with_filters(reference_file, indel_filtered_vcf, indel_good_vcf)

    if combine_vcf:
        VCFRoutines.combine_same_samples_vcfs(unsorted_combined_filtered_vcf,
                                              vcf_list=[snp_filtered_vcf, indel_filtered_vcf],
                                              order_vcf_files=False, sort=True,
                                              chunk_folder=None, chunk_prefix=None, chunk_suffix=None,
                                              starting_chunk=None, chunk_number_list=None,
                                              close_fd_after=False, extension_list=[".vcf", ])
        VCFRoutines.combine_same_samples_vcfs(unsorted_combined_good_vcf,
                                              vcf_list=[snp_good_vcf, indel_good_vcf],
                                              order_vcf_files=False, sort=True,
                                              chunk_folder=None, chunk_prefix=None, chunk_suffix=None,
                                              starting_chunk=None, chunk_number_list=None,
                                              close_fd_after=False, extension_list=[".vcf", ])
        if sequence_dict_file:
            # Sort combined files according to the sequence dictionary via Picard
            SortVcf.sort_vcf(unsorted_combined_filtered_vcf, combined_filtered_vcf, seq_dict=sequence_dict_file)
            SortVcf.sort_vcf(unsorted_combined_good_vcf, combined_good_vcf, seq_dict=sequence_dict_file)
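
# Usage sketch (illustrative, not from the source): filter_bad_variants() is an instance
# method and the class that defines it is not shown in this excerpt, so `filtration_tool`
# and all file paths below are placeholders.
#
#   filtration_tool.filter_bad_variants("reference.fasta",
#                                       "sample.raw.vcf",
#                                       "sample",                         # output_prefix
#                                       combine_vcf=True,
#                                       sequence_dict_file="reference.dict",
#                                       picard_dir="/path/to/picard")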
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import sys
import argparse

from RouToolPa.Routines import VCFRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-o", "--output", action="store", dest="output", default=sys.stdout,
                    help="Output file. Default: stdout")
parser.add_argument("-i", "--vcf_list", action="store", dest="vcf_list", required=True,
                    type=VCFRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of vcf files")
parser.add_argument("-s", "--sort", action="store_true", dest="sort", default=False,
                    help="Sort vcf files. Default: False")
parser.add_argument("-r", "--order_vcf_files", action="store_true", dest="order_vcf_files", default=False,
                    help="Order vcf files by name using natural sorting. Default: False")

args = parser.parse_args()

VCFRoutines.combine_same_samples_vcfs(args.output,
                                      vcf_list=args.vcf_list,
                                      close_fd_after=False,
                                      extension_list=[".vcf", ],
                                      order_vcf_files=args.order_vcf_files,
                                      sort=args.sort)
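
# Example invocation (script and file names are hypothetical; options mirror the argparse
# definitions above):
#   python combine_same_samples_vcfs.py -i sample.snp.vcf,sample.indel.vcf \
#          --sort --order_vcf_files -o sample.combined.vcf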
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import VCFRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gvcf", action="store", dest="input_gvcf", required=True,
                    help="Input gvcf file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")
parser.add_argument("-r", "--reference", action="store", dest="reference", required=True,
                    help="Fasta with reference genome")

args = parser.parse_args()

VCFRoutines.check_gvcf_integrity(args.input_gvcf, args.output_prefix, reference=args.reference,
                                 length_dict=None, parsing_mode="parse")
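
# Example invocation (script and file names are hypothetical):
#   python check_gvcf_integrity.py -i sample.g.vcf -r reference.fasta -o sample.integrity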
def parallel_call(self, reference, alignment, output_dir, output_prefix, stand_call_conf=30,
                  max_region_length=1000000, max_seqs_per_region=100, length_dict=None,
                  parsing_mode="parse", region_list=None, region_file_format='simple',
                  remove_intermediate_files=False, cpus_per_task=1, handling_mode="local",
                  job_name=None, log_prefix=None, error_log_prefix=None, max_running_jobs=None,
                  max_running_time=None, max_memmory_per_cpu=None, modules_list=None,
                  environment_variables_dict=None, black_list_scaffold_id_file=None,
                  gvcf_mode=False, ignore_softclipped_bases=False):
    splited_dir = "%s/splited/" % output_dir
    regions_dir = "%s/regions/" % output_dir

    from RouToolPa.Tools.GATK4 import SortVcf4

    sequence_dict = reference[:-5] + "dict"
    SortVcf4.max_memory = self.max_memory
    SortVcf4.path = self.path

    for directory in output_dir, splited_dir:
        self.safe_mkdir(directory)

    if black_list_scaffold_id_file:
        if isinstance(black_list_scaffold_id_file, str):
            black_scaffolds_list = IdList(filename=black_list_scaffold_id_file)
        else:
            black_scaffolds_list = black_list_scaffold_id_file
    else:
        black_scaffolds_list = []

    # Split the reference into chunks of regions unless an explicit region list was supplied
    region_list, \
        scaffold_to_region_correspondence_dict = self.prepare_region_list_by_length(max_length=max_region_length,
                                                                                    max_seq_number=max_seqs_per_region,
                                                                                    length_dict=length_dict,
                                                                                    reference=None if length_dict is not None else reference,
                                                                                    parsing_mode=parsing_mode,
                                                                                    output_dir=regions_dir,
                                                                                    black_list_scaffolds=black_scaffolds_list,
                                                                                    region_file_format=region_file_format if handling_mode != "slurm" else 'GATK') if region_list is None else region_list

    options = self.parse_options_for_parallel_run(reference, alignment,
                                                  stand_call_conf=stand_call_conf,
                                                  gvcf_mode=gvcf_mode,
                                                  ignore_softclipped_bases=ignore_softclipped_bases)
    #options += " -nct 1"
    options_list = []
    output_index = 1
    output_file_list = []
    output_extension = "g.vcf" if gvcf_mode else "vcf"

    if handling_mode == 'local':
        for regions in region_list:
            output_file = "%s/%s_%i.%s" % (splited_dir, output_prefix, output_index, output_extension)
            region_options = " -O %s" % output_file
            output_file_list.append(output_file)

            for region in regions:
                if isinstance(region, str):
                    region_options += " -L %s" % region
                elif len(region) == 1:
                    region_options += " -L %s" % region[0]
                elif len(region) == 3:
                    region_options += " -L %s:%i-%i" % (region[0], region[1], region[2])

            options_list.append(options + region_options)
            output_index += 1

        print("Variant calling...")
        self.parallel_execute(options_list,
                              cmd=("gatk --java-options -Xmx%s HaplotypeCaller" % self.max_memory) if self.max_memory else None)

        unsorted_combined_vcf = "%s/%s.unsorted.%s" % (output_dir, output_prefix, output_extension)
        sorted_combined_vcf = "%s/%s.%s" % (output_dir, output_prefix, output_extension)

        print("Combining variants...")
        VCFRoutines.combine_same_samples_vcfs(unsorted_combined_vcf,
                                              vcf_list=output_file_list,
                                              order_vcf_files=True,
                                              close_fd_after=False,
                                              extension_list=[".vcf", ])
        print("Sorting...")
        SortVcf4.sort_vcf(unsorted_combined_vcf, sorted_combined_vcf, sequence_dict)

        # Remove per-region intermediate files if requested
        if remove_intermediate_files:
            shutil.rmtree(splited_dir)
            shutil.rmtree(regions_dir)
            os.remove(unsorted_combined_vcf)

    elif handling_mode == 'slurm':
        number_of_regions = len(region_list)

        # Per-task region and output files are resolved by SLURM via the array task id
        region_file = "%s/splited/region_${SLURM_ARRAY_TASK_ID}.list" % regions_dir
        output_file = "%s/%s_${SLURM_ARRAY_TASK_ID}.%s" % (splited_dir, output_prefix, output_extension)

        options += " -O %s" % output_file
        options += " -L %s" % region_file

        slurm_cmd = ("gatk --java-options -Xmx%s HaplotypeCaller" % self.max_memory) if self.max_memory else "gatk HaplotypeCaller"
        slurm_cmd += " %s" % options

        last_job_id = self.slurm_run_job(job_name, log_prefix, slurm_cmd, error_log_prefix,
                                         "%s/%s.slurm" % (output_dir, output_prefix),
                                         task_index_list=None,
                                         start_task_index=1,
                                         end_task_index=number_of_regions,
                                         max_running_jobs=max_running_jobs,
                                         max_running_time=max_running_time,
                                         cpus_per_task=cpus_per_task,
                                         max_memmory_per_cpu=max_memmory_per_cpu,
                                         modules_list=modules_list,
                                         environment_variables_dict=environment_variables_dict)

        print("Submitted job %s" % last_job_id)
    else:
        print("ERROR!!! Unrecognized handling mode!")
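
# Usage sketch (illustrative, not from the source): parallel_call() is an instance method of
# the GATK4 HaplotypeCaller wrapper this excerpt belongs to; the wrapper class itself is not
# shown, so `caller` and all paths below are placeholders.
#
#   caller.parallel_call("reference.fasta", "sample.bam",
#                        "hc_output", "sample",
#                        max_region_length=1000000,
#                        handling_mode="local",
#                        gvcf_mode=True)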
def parallel_gvcf_call(self, reference, alignment, output_dir, output_prefix, output,
                       genotyping_mode="DISCOVERY", stand_call_conf=30,
                       max_region_length=1000000, max_seqs_per_region=100, length_dict=None,
                       parsing_mode="parse", region_list=None, region_file_format='simple',
                       remove_intermediate_files=False, gvcf_extension_list=["g.vcf", ],
                       cpus_per_task=1, handling_mode="local", job_name=None, log_prefix=None,
                       error_log_prefix=None, max_running_jobs=None, max_running_time=None,
                       max_memmory_per_cpu=None, modules_list=None, environment_variables_dict=None):
    splited_dir = "%s/splited_gvcf/" % output_dir
    regions_dir = "%s/regions/" % output_dir

    for directory in output_dir, splited_dir:
        self.safe_mkdir(directory)

    # Split the reference into chunks of regions unless an explicit region list was supplied
    region_list, \
        scaffold_to_region_correspondence_dict = self.prepare_region_list_by_length(max_length=max_region_length,
                                                                                    max_seq_number=max_seqs_per_region,
                                                                                    length_dict=length_dict,
                                                                                    reference=None if length_dict is not None else reference,
                                                                                    parsing_mode=parsing_mode,
                                                                                    output_dir=regions_dir,
                                                                                    region_file_format=region_file_format if handling_mode != "slurm" else 'GATK') if region_list is None else region_list

    options = self.parse_options_for_parallel_run(reference, alignment,
                                                  genotyping_mode=genotyping_mode,
                                                  stand_call_conf=stand_call_conf,
                                                  gvcf_mode=True)
    options += " -nct 1"
    options_list = []
    output_index = 1
    output_file_list = []

    if handling_mode == 'local':
        for regions in region_list:
            output_file = "%s/%s_%i.g.vcf" % (splited_dir, output_prefix, output_index)
            region_options = " -o %s" % output_file
            output_file_list.append(output_file)

            for region in regions:
                if isinstance(region, str):
                    region_options += " -L %s" % region
                elif len(region) == 1:
                    region_options += " -L %s" % region[0]
                elif len(region) == 3:
                    region_options += " -L %s:%i-%i" % (region[0], region[1], region[2])

            options_list.append(options + region_options)
            output_index += 1

        self.parallel_execute(options_list)

        VCFRoutines.combine_same_samples_vcfs(output,
                                              vcf_list=output_file_list,
                                              order_vcf_files=True,
                                              close_fd_after=False,
                                              extension_list=gvcf_extension_list)
    elif handling_mode == 'slurm':
        number_of_regions = len(region_list)

        # Per-task region and output files are resolved by SLURM via the array task id
        region_file = "%s/splited/region_${SLURM_ARRAY_TASK_ID}.list" % regions_dir
        output_file = "%s/%s_${SLURM_ARRAY_TASK_ID}.g.vcf" % (splited_dir, output_prefix)

        options += " -o %s" % output_file
        options += " -L %s" % region_file

        slurm_cmd = "java -Xmx%s -jar %s/%s %s" % (self.max_memory, self.jar_path, self.jar, options)

        last_job_id = self.slurm_run_job(job_name, log_prefix, slurm_cmd, error_log_prefix,
                                         "%s/%s.slurm" % (output_dir, output_prefix),
                                         task_index_list=None,
                                         start_task_index=1,
                                         end_task_index=number_of_regions,
                                         max_running_jobs=max_running_jobs,
                                         max_running_time=max_running_time,
                                         cpus_per_task=cpus_per_task,
                                         max_memmory_per_cpu=max_memmory_per_cpu,
                                         modules_list=modules_list,
                                         environment_variables_dict=environment_variables_dict)

        print("Submitted job %s" % last_job_id)
    else:
        print("ERROR!!! Unrecognized handling mode!")
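
# Usage sketch (illustrative, not from the source): parallel_gvcf_call() drives the
# GATK3-style jar invocation above; `caller` is a placeholder for an instance with
# max_memory, jar_path and jar already configured, and all paths are hypothetical.
#
#   caller.parallel_gvcf_call("reference.fasta", "sample.bam",
#                             "gvcf_output", "sample", "sample.g.vcf",
#                             handling_mode="local")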
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import VCFRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input vcf file with mutations")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")
parser.add_argument("-m", "--mode", action="store", dest="mode", default="one",
                    help="Operation mode. Allowed: 'one' (default) - variant will be treated as heterozygous if "
                         "there is at least one heterozygous sample, 'all' - all samples have to be heterozygous")

args = parser.parse_args()

VCFRoutines.extract_heterozygous_variants(args.input, args.output_prefix, mode=args.mode, verbose=True)
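
# Example invocation (script and file names are hypothetical):
#   python extract_heterozygous_variants.py -i sample.vcf -o sample.het -m one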
def parallel_genotype(self, reference, gvcf_list, splited_dir, splited_prefix, output_vcf,
                      max_total_scaffold_length_per_chunk=100000, max_scaffold_number_per_chunk=5,
                      length_dict=None, parsing_mode="parse", region_list=None,
                      extension_list=["g.vcf", ],
                      #disable_auto_index_creation_and_locking_when_reading_rods=True,
                      max_alternate_alleles=None, picard_jar_path=None):
    from RouToolPa.Tools.Picard import SortVcf

    self.safe_mkdir(splited_dir)

    # Split the reference into chunks of regions unless an explicit region list was supplied
    regions_list, \
        scaffold_to_region_correspondence_dict = self.prepare_region_list_by_length(max_length=max_total_scaffold_length_per_chunk,
                                                                                    max_seq_number=max_scaffold_number_per_chunk,
                                                                                    length_dict=length_dict,
                                                                                    reference=None if length_dict is not None else reference,
                                                                                    parsing_mode=parsing_mode,
                                                                                    output_dir="%s/regions/" % splited_dir,
                                                                                    split_scaffolds=False) if region_list is None else region_list

    options = self.parse_options_for_parallel_run(reference, gvcf_list,
                                                  extension_list=extension_list,
                                                  max_alternate_alleles=max_alternate_alleles,
                                                  #disable_auto_index_creation_and_locking_when_reading_rods=disable_auto_index_creation_and_locking_when_reading_rods
                                                  )
    output_index = 1
    options_list = []
    region_vcf_list = []

    for regions in regions_list:
        region_options = " -O %s/%s_%i.vcf" % (splited_dir, splited_prefix, output_index)
        region_vcf_list.append("%s/%s_%i.vcf" % (splited_dir, splited_prefix, output_index))

        for region in regions:
            if isinstance(region, str):
                region_options += " -L %s" % region
            elif len(region) == 1:
                region_options += " -L %s" % region[0]
            elif len(region) == 3:
                region_options += " -L %s:%i-%i" % (region[0], region[1], region[2])

        options_list.append(options + region_options)
        output_index += 1

    self.parallel_execute(options_list)

    # Combine per-region VCFs and sort the result according to the reference sequence dictionary
    unsorted_vcf = "%s.unsorted.vcf" % output_vcf
    VCFRoutines.combine_same_samples_vcfs(unsorted_vcf,
                                          vcf_list=region_vcf_list,
                                          order_vcf_files=True,
                                          close_fd_after=False,
                                          extension_list=[".vcf", ])
    sequence_dict = reference[:-6] + ".dict"
    SortVcf.jar_path = picard_jar_path
    SortVcf.sort_vcf(unsorted_vcf, output_vcf, seq_dict=sequence_dict)
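
# Usage sketch (illustrative, not from the source): parallel_genotype() is an instance
# method; the defining class is not shown in this excerpt, so `genotyper` and all paths
# below are placeholders.
#
#   genotyper.parallel_genotype("reference.fasta",
#                               ["sample1.g.vcf", "sample2.g.vcf"],
#                               "genotyping/splited", "chunk",
#                               "cohort.vcf",
#                               picard_jar_path="/path/to/picard")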