def __execute_step(self, action, experiment):
    action_handler_id = experiment.get("pipeline")[action]["id"]
    action_handler = next(
        (service for service in self.services if service.id == action_handler_id),
        None
    )
    file_path = self.data_handler.cache.lookup(experiment, action)
    if file_path:
        # Step is cached
        # Check if evaluation was run, maybe run evaluation
        evaluation_handler = self.__evaluation_handler(experiment, action)
        has_evaluation = next(
            (file for file in os.listdir(file_path) if file.startswith("Evaluation")),
            False
        )
        if evaluation_handler and not has_evaluation:
            experiment.start_action(action)
            experiment.mark_cached(action)
            self.__run_evaluation(file_path, experiment, evaluation_handler, action_handler)
            experiment.complete_action(action)
        else:
            experiment.start_action(action, cached=True)
    else:
        # Run step
        experiment.start_action(action)
        file_path = self.data_handler.cache.create_path(experiment, action)
        file_utils.create_directory(file_path)
        action_handler.run({
            "docker_client": self.docker_client,
            "data_handler": self.data_handler,
            "experiment": experiment,
            "destination": file_path
        })
        self.__run_evaluation_if_specified(file_path, experiment, action, action_handler)
        experiment.complete_action(action)
    experiment.add_download(action, file_path)
    return experiment
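# Side note: the `next((item for item in seq if cond), default)` idiom used
# twice above returns the first match or a fallback instead of raising
# StopIteration. A self-contained sketch of the same pattern (the Service
# type here is a hypothetical stand-in, not this repository's service class):
from collections import namedtuple

Service = namedtuple("Service", ["id"])
services = [Service("star"), Service("hisat2")]
assert next((s for s in services if s.id == "hisat2"), None).id == "hisat2"
assert next((s for s in services if s.id == "bwa"), None) is None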
def get_datasets():
    for dataset in rna_seq_data:
        if ONLY_SIMULATED and not dataset["id"].startswith("simulated"):
            print("Skipping {} (only simulated)".format(dataset["name"]))
            continue
        if ONLY_GIAB and dataset["id"] != "GM12878":
            print("Skipping {} (only giab)".format(dataset["name"]))
            continue
        dataset_directory = datasets_directory + dataset["id"]
        dataset_getter = dataset["getter"]
        if not os.path.isdir(dataset_directory):
            file_utils.create_directory(dataset_directory)
            log_task_start(dataset["name"], dataset_directory)
            dataset_getter(dataset, dataset_directory)
            log_task_end(dataset["name"], dataset_directory)
        else:
            log_data_present(dataset["name"])
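# For orientation: get_datasets() expects each `rna_seq_data` entry to carry
# at least "id", "name", and a "getter" callable taking (dataset, directory).
# A hypothetical entry with illustrative values only (not taken from the
# repository's actual dataset list):
example_dataset = {
    "id": "simulated_human_t1",       # ids starting with "simulated" pass the ONLY_SIMULATED filter
    "name": "Simulated human reads, set 1",
    "file_name": "human_t1r1",        # consumed by getters such as get_baruzzo
    "getter": get_baruzzo,            # called as getter(dataset, dataset_directory)
}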
def prepare_indexing(self, parameters):
    file_utils.create_directory(parameters["genome_index_path"])
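# file_utils.create_directory is called throughout before writing output. A
# minimal sketch of what such a helper presumably does (an assumption -- the
# repository's own implementation may differ, e.g. in error handling):
import os

def create_directory(path):
    # Create the directory and any missing parents; do nothing if it exists.
    os.makedirs(path, exist_ok=True)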
def get_baruzzo(dataset, directory):
    zip_name = "{}.tar.bz2".format(dataset["file_name"])
    url = "http://bp1.s3.amazonaws.com/{}".format(zip_name)
    download_path = directory + "/" + zip_name
    file_utils.download(url, download_path)
    print("Unzipping {}...".format(dataset["name"]), flush=True)
    file_utils.unzip(download_path)

    # Move files to /beers directory
    beers_directory = directory + "/beers/"
    file_utils.create_directory(beers_directory)
    for file_name in os.listdir(directory):
        file_path = directory + "/" + file_name
        if not os.path.isdir(file_path) and not file_path == download_path:
            shutil.move(file_path, beers_directory + file_name)

    # Move FASTQ files to root and rename
    def setup_file(direction):
        file_name = "{}.{}.fa".format(dataset["id"], direction)
        file_origin = beers_directory + file_name
        file_destination = "{}/{}{}".format(directory, direction, fastq_file_ending)
        os.rename(file_origin, file_destination)
        return file_name, file_destination

    forward_file_name, forward_file_path = setup_file(
        constants["dataset"]["FORWARD"])
    reverse_file_name, reverse_file_path = setup_file(
        constants["dataset"]["REVERSE"])

    # Move CIG file to root and rename
    truth_file_name = "{}.cig".format(dataset["id"])
    truth_file_path = directory + "/truth.cig"
    os.rename(beers_directory + truth_file_name, truth_file_path)
    file_utils.delete(download_path)
    file_utils.delete(beers_directory)

    write_dataset_json({
        "id": dataset["id"],
        "name": dataset["name"],
        "readLength": "100",
        "data": {
            constants["dataset"]["FORWARD"]: {
                "name": forward_file_name,
                "path": forward_file_path,
            },
            constants["dataset"]["REVERSE"]: {
                "name": reverse_file_name,
                "path": reverse_file_path,
            }
        },
        "evaluation": {
            "type": "beers",
            "truth_file": {
                "name": truth_file_name,
                "path": truth_file_path
            }
        }
    })
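# write_dataset_json persists the metadata above so later pipeline stages can
# locate the reads and the truth file. A minimal sketch, assuming the helper
# simply serializes the dict into the dataset directory (the destination file
# name "dataset.json" is a guess, not confirmed by this excerpt):
import json
import os

def write_dataset_json(metadata):
    # Assumed destination: <datasets_directory>/<id>/dataset.json
    destination = os.path.join(datasets_directory, metadata["id"], "dataset.json")
    with open(destination, "w") as json_file:
        json.dump(metadata, json_file, indent=2)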
###################
# SCRIPT EXECUTION
###################

print("", flush=True)
print("Downloading data", flush=True)
print("", flush=True)

file_utils.create_directory(reference_directory)
file_utils.create_directory(datasets_directory)
try:
    get_tools()
    get_genomes()
    get_datasets()
    remove_tools()
finally:
    for path in started_tasks:
        if path not in finished_tasks:
            print("An error occurred, deleting {}".format(path))
            file_utils.delete(path)
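# The cleanup in the finally block relies on log_task_start/log_task_end
# keeping `started_tasks` and `finished_tasks` in sync, so that any task that
# started but never finished is rolled back. A minimal sketch of that
# bookkeeping (assumed behavior; the real helpers may log differently):
started_tasks = []
finished_tasks = []

def log_task_start(name, path):
    print("Downloading {} to {}".format(name, path), flush=True)
    started_tasks.append(path)

def log_task_end(name, path):
    print("Finished {}".format(name), flush=True)
    finished_tasks.append(path)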
def __init__(self, directory, Instance):
    self.directory = directory
    self.Instance = Instance
    file_utils.create_directory(self.directory)
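# The constructor stores a class object (`Instance`) so the container can
# instantiate entries itself later via self.Instance(...). A hypothetical
# usage, since the excerpt does not show the enclosing class or its callers:
#
#     store = ExperimentStore("data/experiments/", Experiment)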