def transcriptome_regions_path(self, alignment_path, parameters, min_coverage=2):
    """Return the path to a BED file of transcriptome regions covered by the
    alignment, creating it from the aligned BAM on first use.

    Parameters:
        alignment_path: directory prefix holding the alignment outputs
        parameters: pipeline parameter dict (must contain "destination")
        min_coverage: minimum per-base coverage for a position to be kept
            in the BED regions (was previously hard-coded to 2)

    Returns the path to "aligned_coverage_regions.bed" under alignment_path.
    """
    transcriptome_regions_path = alignment_path + "aligned_coverage_regions.bed"
    if not os.path.exists(transcriptome_regions_path):
        bam_path = alignment_path + "Out.bam"
        coverage_path = alignment_path + "Out.base_coverage"

        # Create per-base coverage file (bedtools writes to stdout, hence
        # log_is_output + out_file_path redirection)
        command = "bedtools genomecov -d -ibam /{}".format(bam_path)
        output_parameters = {
            "log_is_output": True,
            "out_file_path": coverage_path,
            "log_file_path": parameters["destination"] + "Coverage.log"
        }
        self.run_docker(command, parameters, output_parameters)
        file_utils.validate_file_content(coverage_path)

        # Create BED from coverage file, keeping positions with at least
        # min_coverage reads
        command = "python base_coverage_to_bed.py /{} {} /{}".format(
            coverage_path, str(min_coverage), transcriptome_regions_path)
        self.run_docker(command, parameters, log_file_name="CoverageToBed.log")
        file_utils.validate_file_content(transcriptome_regions_path)
    return transcriptome_regions_path
def run(self, parameters):
    """Deduplicate the aligned BAM and split reads spanning introns.

    Reads <input>/Out.bam, writes <destination>Out.bam, and deletes the
    intermediate deduplicated BAM once SplitNCigarReads has succeeded.
    """
    destination = parameters["destination"]
    experiment = parameters["experiment"]
    data_handler = parameters["data_handler"]
    in_file_path = experiment.get_input_directory(self.id) + "Out.bam"
    reference_path = data_handler.reference_path(experiment)

    # Remove duplicates
    dedup_bam = destination + "Deduplicated.bam"
    dedup_metrics = destination + "Deduplicate.metrics"
    mark_duplicates = "gatk MarkDuplicates -I /{} -O /{} -M /{} " \
        "--VALIDATION_STRINGENCY=SILENT".format(
            in_file_path, dedup_bam, dedup_metrics
        )
    self.run_docker(mark_duplicates, parameters,
                    {"log_file_path": destination + "Deduplicate.log"})
    file_utils.validate_file_content(dedup_bam)

    # Remove introns
    out_file_path = destination + "Out.bam"
    split_n = "gatk SplitNCigarReads -R /{} -I /{} -O /{} --tmp-dir /{}".format(
        reference_path, dedup_bam, out_file_path, destination)
    self.run_docker(split_n, parameters,
                    {"log_file_path": destination + "SplitN.log"})
    file_utils.validate_file_content(out_file_path)
    file_utils.delete(dedup_bam)
def align(self, parameters, sam_file_path):
    """Run the aligner in docker, finalize its output, and validate it."""
    output_parameters = {
        "log_is_output": not self.creates_output,
        "out_file_path": sam_file_path
    }
    self.run_docker(self.alignment_command(parameters), parameters,
                    output_parameters)
    self.conclude_alignment(parameters, sam_file_path)
    file_utils.validate_file_content(sam_file_path)
def alignment_command(self, parameters):
    """Assemble the novoalign command line for this dataset and index."""
    genome_index_path = parameters["genome_index_path"]
    file_utils.validate_file_content(genome_index_path)

    pieces = ["novoalign -o SAM -f"]
    for specification in parameters["dataset"].get("data").values():
        pieces.append("/{}".format(specification["path"]))
    pieces.append("-d /{}".format(genome_index_path))
    pieces.append("-r All 10")  # report max. 10 alignments per read
    # group junction and exon sequences together
    pieces.append("-v 0 70 70 '[>]([^:]*)'")
    if self.__fasta_input(parameters):
        pieces.append("-F FA")
    return " ".join(pieces)
def run(self, parameters):
    """Run Opossum on the aligned BAM to prepare it for variant calling."""
    experiment = parameters["experiment"]
    input_bam = experiment.get_input_directory(self.id) + "Out.bam"
    output_bam = parameters["destination"] + "Out.bam"
    command = (
        "python Opossum.py --BamFile=/{} --OutFile=/{} "
        "--SoftClipsExist={}"
    ).format(input_bam, output_bam, experiment.get_aligner_soft_clips())
    self.run_docker(command, parameters)
    file_utils.validate_file_content(output_bam)
    self.__post_process(parameters, output_bam)
def run(self, parameters):
    """Evaluate the called variants against GIAB confidence calls via hap.py.

    The comparison is restricted to the GIAB confidence regions intersected
    with the regions actually covered by the transcriptome alignment; empty
    evaluation output files are removed afterwards.
    """
    experiment = parameters["experiment"]
    reference_id = experiment.get("reference")
    destination = parameters["destination"]
    alignment_path = experiment.get("pipeline")["alignment"]["directory"]
    # The file name has no placeholder, so the previous
    # '.format(reference_id)' call was a no-op and has been dropped.
    confidence_regions_path = alignment_path + "confidence_calls.bed"

    # Intersect confidence regions with transcriptome regions if not already done
    if not os.path.exists(confidence_regions_path):
        confidence_genome_regions_path = "data/giab/{}/confidence_calls.bed".format(
            reference_id)
        transcriptome_regions_path = self.transcriptome_regions_path(
            alignment_path, parameters)
        self.bedtools("intersect", confidence_genome_regions_path,
                      transcriptome_regions_path, confidence_regions_path,
                      parameters)
        file_utils.validate_file_content(confidence_regions_path)

    # Filter data if necessary
    action_handler = parameters["action_handler"]
    additional_commands = ""
    if hasattr(action_handler, "chromosomes"):
        # Escape spaces for bash
        space_escape = "%%"
        additional_commands = "--location{}{}".format(
            space_escape, ",".join(action_handler.chromosomes))

    # BUG FIX: the template previously ended in "--location {3}", which left
    # a bare "--location" with no value when no chromosome filter was set and
    # a duplicated flag when one was — additional_commands already carries
    # its own "--location" prefix.
    command = "./hap.py /data/giab/{0}/confidence_calls.vcf /{1}Out.vcf " \
              "-f /{2} " \
              "-o /{1}Evaluation " \
              "-r /data/references/{0}.fa " \
              "{3}".format(
                  reference_id,
                  destination,
                  confidence_regions_path,
                  additional_commands
              )
    output_parameters = {"log_file_path": destination + "Evaluation.log"}
    self.run_docker(command, parameters, output_parameters)

    # Remove evaluation artifacts that came out empty
    for file_name in os.listdir(destination):
        if file_name.startswith("Evaluation"):
            file_path = destination + file_name
            if not file_utils.file_has_content(file_path):
                file_utils.delete(file_path)
def run(self, parameters):
    """Apply GATK VariantFiltration (FS and QD hard filters) to the VCF."""
    destination = parameters["destination"]
    experiment = parameters["experiment"]
    data_handler = parameters["data_handler"]
    vcf_in = experiment.get_input_directory(self.id) + "Out.vcf"
    vcf_out = destination + "Out.vcf"
    reference = data_handler.reference_path(experiment)
    command = (
        "gatk VariantFiltration -R /{} -V /{} -O /{} "
        "-window 35 -cluster 3 --filter-name FS -filter 'FS > 30.0' "
        " --filter-name QD -filter 'QD < 2.0'"
    ).format(reference, vcf_in, vcf_out)
    self.run_docker(command, parameters)
    file_utils.validate_file_content(vcf_out)
def post_process(self, parameters, sam_file_path, bam_file_path):
    """Convert the aligner's SAM to a sorted, read-grouped BAM and ensure
    the reference index (.fa.fai) and sequence dictionary (.dict) exist.

    Deletes the SAM unless a BEERS evaluation still needs it.
    """
    destination = parameters["destination"]
    dataset = parameters["dataset"]

    # Convert to BAM, add read groups and sort
    # NOTE(review): --TMP_DIR is passed without the leading "/" the other
    # in-container paths use — confirm this is intentional.
    command = "gatk AddOrReplaceReadGroups -I /{} -O /{} -SO coordinate " \
              "-ID foo -LB bar -PL illumina -SM Sample1 -PU foo.bar " \
              "--TMP_DIR {} " \
              "--CREATE_INDEX".format(
                  sam_file_path,
                  bam_file_path,
                  destination
              )
    output_parameters = {"log_file_path": destination + "Conversion.log"}
    self.run_docker(command, parameters, output_parameters)
    file_utils.validate_file_content(bam_file_path)

    # Delete SAM file if not needed in evaluation (which is for BEERS sets)
    evaluation = dataset.get("evaluation")
    # "is None" instead of "== None": identity check is the correct idiom
    if evaluation is None or evaluation["type"] != "beers":
        file_utils.delete(sam_file_path)

    # Create reference indices
    data_handler = parameters["data_handler"]
    experiment = parameters["experiment"]
    reference_path = data_handler.reference_path(experiment)
    reference_index_path = data_handler.reference_path(
        experiment, alternate_file_ending=".fa.fai")
    reference_dict_path = data_handler.reference_path(
        experiment, alternate_file_ending=".dict")

    # Generate index of reference if not there
    if not os.path.exists(reference_index_path):
        command = "samtools faidx /{}".format(reference_path)
        output_parameters = {"log_file_path": destination + "Index.log"}
        self.run_docker(command, parameters, output_parameters)

    # Generate dict of reference if not there
    if not os.path.exists(reference_dict_path):
        command = "gatk CreateSequenceDictionary -R /{} -O /{}".format(
            reference_path, reference_dict_path)
        output_parameters = {"log_file_path": destination + "Dict.log"}
        self.run_docker(command, parameters, output_parameters)
def run(self, parameters, in_file_path=None):
    """Call variants with GATK HaplotypeCaller, writing <destination>Out.vcf.

    Parameters:
        parameters: pipeline parameter dict
        in_file_path: BAM to call on; defaults to the step input's Out.bam

    Cleanup: the unused "docker_client" lookup was removed, and the command
    now reuses the reference_path local instead of recomputing it.
    """
    experiment = parameters["experiment"]
    destination = parameters["destination"]
    data_handler = parameters["data_handler"]
    reference_path = data_handler.reference_path(experiment)

    # Run variant calling
    in_file_path = in_file_path or experiment.get_input_directory(
        self.id) + "Out.bam"
    out_file_path = destination + "Out.vcf"
    command = "gatk HaplotypeCaller -I /{} -O /{} -R /{} " \
              "--dont-use-soft-clipped-bases " \
              "--standard-min-confidence-threshold-for-calling 20".format(
                  in_file_path,
                  out_file_path,
                  reference_path
              )
    command = self.add_filters(command)
    self.run_docker(command, parameters)
    file_utils.validate_file_content(out_file_path)