def run(self, parameters):
    destination = parameters["destination"]
    experiment = parameters["experiment"]
    data_handler = parameters["data_handler"]
    in_file_path = experiment.get_input_directory(self.id) + "Out.bam"
    reference_path = data_handler.reference_path(experiment)

    # Remove duplicates
    deduplicated_path = destination + "Deduplicated.bam"
    metrics_path = destination + "Deduplicate.metrics"
    command = "gatk MarkDuplicates -I /{} -O /{} -M /{} " \
              "--VALIDATION_STRINGENCY=SILENT".format(
                  in_file_path,
                  deduplicated_path,
                  metrics_path
              )
    output_parameters = {"log_file_path": destination + "Deduplicate.log"}
    self.run_docker(command, parameters, output_parameters)
    file_utils.validate_file_content(deduplicated_path)

    # Split reads that span introns (SplitNCigarReads)
    out_file_path = destination + "Out.bam"
    command = "gatk SplitNCigarReads -R /{} -I /{} -O /{} --tmp-dir /{}".format(
        reference_path, deduplicated_path, out_file_path, destination)
    output_parameters = {"log_file_path": destination + "SplitN.log"}
    self.run_docker(command, parameters, output_parameters)
    file_utils.validate_file_content(out_file_path)
    file_utils.delete(deduplicated_path)
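# The helper below is NOT part of this repository; it is a minimal sketch of
# what file_utils.validate_file_content is assumed to do after each GATK step:
# fail fast if the expected output file is missing or empty.
import os

def validate_file_content(file_path):
    # Assumed behavior: raise so the calling pipeline step is marked as errored
    if not os.path.isfile(file_path) or os.path.getsize(file_path) == 0:
        raise RuntimeError("Expected output {} is missing or empty".format(file_path))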
def get_p_falciparum(genome_id, file_path):
    url = "http://bp1.s3.amazonaws.com/malaria.tar.bz2"
    download_path = reference_directory + "malaria.tar.bz2"
    file_utils.download(url, download_path)
    print("Unzipping {}...".format(genome_id), flush=True)
    unzipped_directory = file_utils.unzip(download_path)
    os.rename(unzipped_directory + "/genome_sequence_pfal.fa", file_path)
    file_utils.delete(download_path)
    file_utils.delete(unzipped_directory)
def run(self, parameters):
    docker_client = parameters["docker_client"]
    data_handler = parameters["data_handler"]
    experiment = parameters["experiment"]
    destination = parameters["destination"]
    dataset = data_handler.datasets.select(experiment.get("dataset"))
    sam_file_path = destination + "Out.sam"
    bam_file_path = destination + "Out.bam"

    # Define genome index path and temp path (will be renamed if successful)
    parameters["reference_id"] = experiment.get("reference")
    genome_index_path = data_handler.genome_index_path(experiment, self.id)
    temp_genome_index_path = genome_index_path + ".running"

    # If necessary, build genome index
    if not os.path.exists(genome_index_path):
        try:
            index_parameters = {
                "docker_client": docker_client,
                "destination": destination,
                "genome_index_path": temp_genome_index_path,
                "reference_path": data_handler.reference_path(experiment),
                "dataset": dataset,
                "reference_base_path": data_handler.reference_directory,
                "reference_id": parameters["reference_id"]
            }
            self.build_genome_index(index_parameters)
        except:
            file_utils.delete(temp_genome_index_path)
            raise
        os.rename(temp_genome_index_path, genome_index_path)

    # Run alignment
    alignment_parameters = {
        "docker_client": docker_client,
        "destination": destination,
        "genome_index_path": genome_index_path,
        "dataset": dataset,
        "reference_id": parameters["reference_id"],
        "reference_base_path": data_handler.reference_directory
    }
    self.align(alignment_parameters, sam_file_path)

    # Create sorted BAM file from SAM file
    post_processing_parameters = {
        "docker_client": docker_client,
        "docker_image": "gatk",
        "destination": destination,
        "data_handler": data_handler,
        "experiment": experiment,
        "dataset": dataset
    }
    self.post_process(post_processing_parameters, sam_file_path, bam_file_path)
def setup(self):
    super().setup()
    os.mkdir(self.directory)
    try:
        self.content["error"] = False
        self.__store_data()
    except Exception as error:
        file_utils.delete(self.directory)
        file_utils.delete(self.path)
        self.content["error"] = True
        raise error
def get_human_genome(genome_id, file_path):
    url = "http://hgdownload.soe.ucsc.edu/goldenPath/"
    url += "{0}/bigZips/{0}.2bit".format(genome_id)
    two_bit_path = file_path + ".2bit"
    started_tasks.append(two_bit_path)
    file_utils.download(url, two_bit_path)
    finished_tasks.append(two_bit_path)

    # Convert .2bit file to .fa
    print("Extracting {} from 2bit file...".format(genome_id), flush=True)
    os.system("chmod +x {0}twoBitToFa && {0}twoBitToFa {1} {2}".format(
        reference_directory, two_bit_path, file_path))
    file_utils.delete(two_bit_path)
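# Minimal sketch of the file_utils.download helper used throughout this section
# (an assumption, not the project's actual implementation): stream the URL to
# the given path so large genome archives do not have to fit in memory.
import shutil
import urllib.request

def download(url, download_path):
    # Stream the HTTP response straight to disk
    with urllib.request.urlopen(url) as response, open(download_path, "wb") as target:
        shutil.copyfileobj(response, target)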
def run(self, parameters):
    dataset = parameters["dataset"]
    destination = parameters["destination"]
    command = "bash evaluate_alignment.sh {} {} /{}".format(
        dataset.get("readLength"),
        destination,
        dataset.get("evaluation")["truth_file"]["path"])
    output_parameters = {"log_file_path": destination + "Evaluation.log"}
    self.run_docker(command, parameters, output_parameters)

    # Drop evaluation outputs that turned out empty
    for file_name in ["Evaluation.multi.txt", "Evaluation.txt"]:
        file_path = destination + file_name
        if not file_utils.file_has_content(file_path):
            file_utils.delete(file_path)
def run(self, parameters):
    experiment = parameters["experiment"]
    reference_id = experiment.get("reference")
    destination = parameters["destination"]
    vcf_file_path = destination + "Out.vcf"
    alignment_path = experiment.get("pipeline")["alignment"]["directory"]
    confidence_regions_path = alignment_path + "confidence_calls.bed"

    # Intersect confidence regions with transcriptome regions if not already done
    if not os.path.exists(confidence_regions_path):
        confidence_genome_regions_path = "data/giab/{}/confidence_calls.bed".format(
            reference_id)
        transcriptome_regions_path = self.transcriptome_regions_path(
            alignment_path, parameters)
        self.bedtools("intersect", confidence_genome_regions_path,
                      transcriptome_regions_path, confidence_regions_path,
                      parameters)
        file_utils.validate_file_content(confidence_regions_path)

    # Filter data if necessary
    action_handler = parameters["action_handler"]
    additional_commands = ""
    if hasattr(action_handler, "chromosomes"):
        # Escape spaces for bash
        space_escape = "%%"
        additional_commands = "--location{}{}".format(
            space_escape, ",".join(action_handler.chromosomes))

    command = "./hap.py /data/giab/{0}/confidence_calls.vcf /{1}Out.vcf " \
              "-f /{2} " \
              "-o /{1}Evaluation " \
              "-r /data/references/{0}.fa " \
              "{3}".format(
                  reference_id,
                  destination,
                  confidence_regions_path,
                  additional_commands
              )
    output_parameters = {"log_file_path": destination + "Evaluation.log"}
    self.run_docker(command, parameters, output_parameters)

    # Remove empty evaluation files
    for file_name in os.listdir(destination):
        if file_name.startswith("Evaluation"):
            file_path = destination + file_name
            if not file_utils.file_has_content(file_path):
                file_utils.delete(file_path)
def get_file(file_id, direction, directory):
    print("Downloading {} file...".format(direction), flush=True)
    zip_name = "{}.fastq.gz".format(file_id)
    url = "https://www.encodeproject.org/files/{}/@@download/{}".format(
        file_id, zip_name)
    download_path = directory + "/" + zip_name
    file_utils.download(url, download_path)

    print("Unzipping {} file...".format(direction), flush=True)
    file_utils.unzip(download_path)
    file_utils.delete(download_path)

    original_name = "{}.fastq".format(file_id)
    file_origin = "{}/{}".format(directory, original_name)
    file_destination = "{}/{}{}".format(directory, direction, fastq_file_ending)
    os.rename(file_origin, file_destination)
    return original_name, file_destination
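# Hypothetical sketch of file_utils.unzip as it is used in this section (an
# assumption, not the repository's code): extract tar archives and return the
# extraction directory (as in get_p_falciparum), or decompress a plain .gz
# file next to it (as in get_file above).
import gzip
import shutil
import tarfile

def unzip(archive_path):
    if tarfile.is_tarfile(archive_path):
        extract_directory = archive_path.split(".tar")[0]
        with tarfile.open(archive_path) as archive:
            archive.extractall(extract_directory)
        return extract_directory
    # Plain gzip file, e.g. "<id>.fastq.gz" -> "<id>.fastq"
    target_path = archive_path[:-len(".gz")]
    with gzip.open(archive_path, "rb") as source, open(target_path, "wb") as target:
        shutil.copyfileobj(source, target)
    return target_path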
def post_process(self, parameters, sam_file_path, bam_file_path):
    destination = parameters["destination"]
    dataset = parameters["dataset"]

    # Convert to BAM, add read groups and sort
    command = "gatk AddOrReplaceReadGroups -I /{} -O /{} -SO coordinate " \
              "-ID foo -LB bar -PL illumina -SM Sample1 -PU foo.bar " \
              "--TMP_DIR {} " \
              "--CREATE_INDEX".format(
                  sam_file_path,
                  bam_file_path,
                  destination
              )
    output_parameters = {"log_file_path": destination + "Conversion.log"}
    self.run_docker(command, parameters, output_parameters)
    file_utils.validate_file_content(bam_file_path)

    # Delete SAM file unless it is still needed for evaluation (BEERS datasets)
    evaluation = dataset.get("evaluation")
    if evaluation is None or evaluation["type"] != "beers":
        file_utils.delete(sam_file_path)

    # Create reference indices
    data_handler = parameters["data_handler"]
    experiment = parameters["experiment"]
    reference_path = data_handler.reference_path(experiment)
    reference_index_path = data_handler.reference_path(
        experiment, alternate_file_ending=".fa.fai")
    reference_dict_path = data_handler.reference_path(
        experiment, alternate_file_ending=".dict")

    # Generate index of reference if not there
    if not os.path.exists(reference_index_path):
        command = "samtools faidx /{}".format(reference_path)
        output_parameters = {"log_file_path": destination + "Index.log"}
        self.run_docker(command, parameters, output_parameters)

    # Generate dict of reference if not there
    if not os.path.exists(reference_dict_path):
        command = "gatk CreateSequenceDictionary -R /{} -O /{}".format(
            reference_path, reference_dict_path)
        output_parameters = {"log_file_path": destination + "Dict.log"}
        self.run_docker(command, parameters, output_parameters)
def log_output(docker_container):
    if stdout_file_path is not None:
        with open(stdout_file_path, "ab") as out_file:
            for line in docker_container.logs(stdout=True, stderr=False,
                                              stream=True):
                out_file.write(line)
    if stderr_file_path is not None:
        with open(stderr_file_path, "ab") as log_file:
            for line in docker_container.logs(stdout=False, stderr=True,
                                              stream=True):
                log_file.write(line)

    docker_container.reload()
    if docker_container.status != "exited":
        docker_container.stop()

    # If the log file is empty, delete it. If only one file is written, it is
    # expected to be the log file; if both are written, stderr_file_path is
    # expected to be the log file.
    log_file_path = stderr_file_path or stdout_file_path
    if not file_utils.file_has_content(log_file_path):
        file_utils.delete(log_file_path)
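# Sketch (assumption, not the repository's code) of the file_utils.file_has_content
# check used above and in the evaluation steps: a file "has content" if it
# exists and is non-empty.
import os

def file_has_content(file_path):
    return os.path.isfile(file_path) and os.path.getsize(file_path) > 0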
def clean_up(self):
    # In case of a server stop, clean up references and experiments
    for reference in os.listdir(self.reference_directory):
        if reference.endswith(".running"):
            file_utils.delete(
                os.path.join(self.reference_directory, reference))

    for experiment_id, experiment in self.experiments.all().items():
        status = experiment.get("status")
        pipeline = experiment.get("pipeline")
        error_message = "Server stopped unexpectedly"
        errored_action = list(pipeline.keys())[0]
        if status == self.constants["experiment"]["WAITING"]:
            experiment.mark_error(errored_action, error_message)
        if status == self.constants["experiment"]["RUNNING"]:
            for action, pipeline_step in pipeline.items():
                started = "started" in pipeline_step and pipeline_step["started"]
                completed = "completed" in pipeline_step and pipeline_step["completed"]
                if started and not completed:
                    errored_action = action
                    self.cache.clean_up(experiment, action)
            experiment.mark_error(errored_action, error_message)
def get_baruzzo(dataset, directory):
    zip_name = "{}.tar.bz2".format(dataset["file_name"])
    url = "http://bp1.s3.amazonaws.com/{}".format(zip_name)
    download_path = directory + "/" + zip_name
    file_utils.download(url, download_path)
    print("Unzipping {}...".format(dataset["name"]), flush=True)
    file_utils.unzip(download_path)

    # Move files to /beers directory
    beers_directory = directory + "/beers/"
    file_utils.create_directory(beers_directory)
    for file_name in os.listdir(directory):
        file_path = directory + "/" + file_name
        if not os.path.isdir(file_path) and file_path != download_path:
            shutil.move(file_path, beers_directory + file_name)

    # Move FASTQ files to root and rename
    def setup_file(direction):
        file_name = "{}.{}.fa".format(dataset["id"], direction)
        file_origin = beers_directory + file_name
        file_destination = "{}/{}{}".format(directory, direction,
                                            fastq_file_ending)
        os.rename(file_origin, file_destination)
        return file_name, file_destination

    forward_file_name, forward_file_path = setup_file(
        constants["dataset"]["FORWARD"])
    reverse_file_name, reverse_file_path = setup_file(
        constants["dataset"]["REVERSE"])

    # Move CIG file to root and rename
    truth_file_name = "{}.cig".format(dataset["id"])
    truth_file_path = directory + "/truth.cig"
    os.rename(beers_directory + truth_file_name, truth_file_path)

    file_utils.delete(download_path)
    file_utils.delete(beers_directory)

    write_dataset_json({
        "id": dataset["id"],
        "name": dataset["name"],
        "readLength": "100",
        "data": {
            constants["dataset"]["FORWARD"]: {
                "name": forward_file_name,
                "path": forward_file_path,
            },
            constants["dataset"]["REVERSE"]: {
                "name": reverse_file_name,
                "path": reverse_file_path,
            }
        },
        "evaluation": {
            "type": "beers",
            "truth_file": {
                "name": truth_file_name,
                "path": truth_file_path
            }
        }
    })
def remove_tools():
    for tool_name in tools:
        tool_path = reference_directory + tool_name
        file_utils.delete(tool_path)
if not os.path.isdir(dataset_directory):
    file_utils.create_directory(dataset_directory)
    log_task_start(dataset["name"], dataset_directory)
    dataset_getter(dataset, dataset_directory)
    log_task_end(dataset["name"], dataset_directory)
else:
    log_data_present(dataset["name"])


###################
# SCRIPT EXECUTION
###################

print("", flush=True)
print("Downloading data", flush=True)
print("", flush=True)

file_utils.create_directory(reference_directory)
file_utils.create_directory(datasets_directory)

try:
    get_tools()
    get_genomes()
    get_datasets()
    remove_tools()
finally:
    # Delete any partially downloaded or partially built artifacts
    for path in started_tasks:
        if path not in finished_tasks:
            print("An error occurred, deleting {}".format(path))
            file_utils.delete(path)
def clean_up(self):
    file_utils.delete(self.directory)
def delete(self):
    file_utils.delete(self.path)
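# The snippets above pass both files and directories to file_utils.delete; a
# minimal sketch under that assumption (the real helper may behave differently):
import os
import shutil

def delete(path):
    # Ignore paths that were already removed or never created
    if os.path.isdir(path):
        shutil.rmtree(path, ignore_errors=True)
    elif os.path.exists(path):
        os.remove(path)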