def align_sampled_data(
    self,
    pipeline_input: PipelineInput,
    fraction: float,
    method: SamplingMethod,
    fraction_to_samples_dir: t.Dict[t.Any, t.Any],
    sample_member_names: t.List[str],
):
    """
    Create (or reuse) the alignment of the sampled sequences for the given
    sampling fraction and method, recording its path in ``self.samples_info``.

    :param pipeline_input: pipeline configuration
    :param fraction: sampling fraction the sample was drawn with
    :param method: sampling method the sample was drawn with
    :param fraction_to_samples_dir: maps each fraction to its samples directory
    :param sample_member_names: names of the sequences included in the sample
    """
    sample_info = self.samples_info[fraction][method.value]
    aligned_path = f"{fraction_to_samples_dir[fraction]}aligned_method_{method.value}.fasta"
    sample_info["aligned_sequence_data_path"] = aligned_path
    # Reuse an existing alignment only if it is at least as large as the
    # unaligned input; a smaller file suggests a truncated or failed run.
    if os.path.exists(aligned_path) and os.path.getsize(
            aligned_path) >= os.path.getsize(
                sample_info["unaligned_sequence_data_path"]):
        logger.info(
            f"Alignment of sample of fraction {fraction} using method {method.value} already exists."
        )
    else:
        if pipeline_input.use_full_alignment_in_sample:
            # Derive the sample alignment by restricting the full alignment
            # to the sampled members instead of re-aligning from scratch.
            logger.info(
                "Trimming full alignment to include only the sampled sequences"
            )
            kept_records = [
                record
                for record in SeqIO.parse(self.aligned_sequence_data_path, "fasta")
                if record.name in sample_member_names
            ]
            SeqIO.write(kept_records, aligned_path, "fasta")
        else:
            logger.info(
                f"Aligning the sampled data to {sample_info['aligned_sequence_data_path']}"
            )
            BaseTools.align(
                sample_info["unaligned_sequence_data_path"],
                aligned_path,
                pipeline_input.sequence_data_type,
                pipeline_input.samples_alignment_method,
                pipeline_input.samples_alignment_params,
            )
        logger.info(
            f"Aligned sample records written to {sample_info['aligned_sequence_data_path']}"
        )
def set_command(
    self,
    input_path: str,
    output_path: str,
    additional_params: t.Optional[t.Dict[str, str]],
    parallelize: bool,
    cluster_data_dir: str,
    sequence_data_type: SequenceDataType = SequenceDataType.CODON,
    input_tree_path: t.Optional[str] = None,
) -> t.List[str]:
    """
    Build the shell commands that run hyphy (BUSTED) on the given alignment.

    :param input_path: path to the input of the program
    :param output_path: path to the output of the program
    :param additional_params: additional parameters to run the program with (maps parameter name to value)
    :param parallelize: boolean indicating whether program execution should be parallelized
    :param cluster_data_dir: directory to concat to directory arguments in case of parallelization on the cluster
    :param sequence_data_type: indicates the type of sequence data (codon by default)
    :param input_tree_path: path in which the input tree for hyphy will be generated
    :return: a list of strings representing the commands: the hyphy invocation,
             then a copy of the generated BUSTED json to the output path
    """
    # Translate container-local paths to cluster paths when parallelized.
    program_input_path = (
        input_path if not parallelize else input_path.replace(
            os.environ["container_data_dir"], cluster_data_dir))
    program_output_path = (
        output_path if not parallelize else output_path.replace(
            os.environ["container_data_dir"], cluster_data_dir))
    # Default tree location: next to the program input.
    if not input_tree_path:
        input_tree_path = f"{os.path.dirname(program_input_path)}/hyphy_tree.nwk"
    self.set_additional_params(additional_params,
                               parallelize,
                               cluster_data_dir,
                               return_as_str=False)
    # An explicitly supplied tree overrides the default path.
    if additional_params and "input_tree_path" in additional_params:
        input_tree_path = additional_params["input_tree_path"]
    BaseTools.build_tree(
        input_path,
        input_tree_path,
        sequence_data_type,
        self.tree_reconstruction_method,
        self.tree_reconstruction_prams,
    )
    # Answers to hyphy's interactive menu: "1" then "5" select the analysis
    # (presumably BUSTED — confirm against the installed hyphy version), then
    # the input alignment and tree paths answer the file prompts.
    cmd = f"printf '1\\n5\\n{program_input_path}\\n{input_tree_path}\\n' | hyphy"
    return [
        cmd,
        f"cp -r {program_input_path}.BUSTED.json {program_output_path}"
    ]
def write_output_to_simulation_pipeline_json(
    program_output: t.Dict[str, t.Any],
    output_path: str,
    additional_simulation_parameters: t.Dict[str, t.Any],
):
    """
    Translate program output into simulation pipeline input parameters and
    write them as json.

    :param program_output: output of the program to translate to simulation params
    :param output_path: output path for simulation pipeline input json
    :param additional_simulation_parameters: additional parameters
    :return: None; writes the simulation input parameters to ``output_path``
    """
    # Assemble the simulation parameters; caller-supplied extras override the
    # defaults below.
    simulation_parameters = {
        "substitution_model": "",
        "substitution_model_params": {
            "kappa": program_output["kappa"],
            "selection_parameters": program_output["selection_parameters"],
        },
        "states_frequencies": program_output["states_frequencies"],
        "tree_random": False,
        "tree_length": program_output["tree_length"],
        "simulation_tree_path": program_output["tree_path"],
        "pinv": 0,
        "alpha": 0,
        "ngamcat": 0,
        **additional_simulation_parameters,
    }
    encoded = BaseTools.jsonable_encoder(SimulationInput(**simulation_parameters))
    # Drop unset fields so the json carries only explicit values.
    without_nones = {key: value for key, value in encoded.items() if value is not None}
    with open(output_path, "w") as output_file:
        json.dump(obj=without_nones, fp=output_file)
def run_program(
    program_name: ProgramName,
    sequence_data_path: str,
    alignment_path: str,
    sequence_data_type: SequenceDataType,
    program_output_path: str,
    additional_params: t.Optional[t.Dict] = None,
) -> str:
    """
    Align the sequence data (if not already aligned) and submit the requested
    inference program on the alignment.

    :param program_name: name of program to execute
    :param sequence_data_path: unaligned sequence data
    :param alignment_path: path in which an alignment will be created
    :param sequence_data_type: sequence data type
    :param program_output_path: path to which the program output will be written
    :param additional_params: additional program parameters, if needed
    :return: path to job completion validator file
    """
    # align the data once; an existing alignment is reused as-is
    if not os.path.exists(alignment_path):
        BaseTools.align(
            input_path=sequence_data_path,
            output_path=alignment_path,
            sequence_data_type=sequence_data_type,
            alignment_method=AlignmentMethod.MAFFT,
        )
    # create a program instance
    program_to_exec = program_to_callable[program_name]()
    # run the inference program (in the case of paml, the control file will be
    # generated in the default directory)
    completion_validator_path = program_to_exec.exec(
        input_path=alignment_path,
        output_path=program_output_path,
        aux_dir=f"{os.path.dirname(sequence_data_path)}/",
        additional_params=additional_params,
        parallelize=True,
        cluster_data_dir=os.path.dirname(alignment_path),
        priority=0,
        queue="itaym",
        wait_until_complete=False,
        get_completion_validator=True,
    )
    return completion_validator_path
def build_sampled_tree(
    self,
    pipeline_input: PipelineInput,
    fraction: float,
    method: SamplingMethod,
    fraction_to_samples_dir: t.Dict[t.Any, t.Any],
    sample_member_names: t.List[str],
):
    """
    Create (or reuse) the tree over the sampled sequences for the given
    sampling fraction and method, recording its path in ``self.samples_info``.

    :param pipeline_input: pipeline configuration
    :param fraction: sampling fraction the sample was drawn with
    :param method: sampling method the sample was drawn with
    :param fraction_to_samples_dir: maps each fraction to its samples directory
    :param sample_member_names: names of the sequences included in the sample
    """
    sample_info = self.samples_info[fraction][method.value]
    tree_path = f"{fraction_to_samples_dir[fraction]}method_{method.value}_tree.nwk"
    sample_info["tree_path"] = tree_path
    # Reuse an existing, non-empty tree file.
    if os.path.exists(tree_path) and os.path.getsize(tree_path) > 0:
        logger.info(
            f"Tree of sample of fraction {fraction} using method {method.value} already exists."
        )
    else:
        if pipeline_input.use_full_tree_in_sample:
            # Derive the sample tree by pruning the full tree down to the
            # sampled members instead of reconstructing it from scratch.
            logger.info(
                "Pruning full tree to include only the sampled sequences")
            pruned_tree = Tree(self.tree_path)
            pruned_tree.prune(sample_member_names)
            pruned_tree.write(outfile=tree_path)
        else:
            logger.info(
                f"Building tree based on sampled data to {sample_info['tree_path']}"
            )
            BaseTools.build_tree(
                sample_info["aligned_sequence_data_path"],
                sample_info["tree_path"],
                pipeline_input.sequence_data_type,
                pipeline_input.tree_reconstruction_method,
                pipeline_input.tree_reconstruction_params,
            )
        logger.info(
            f"Tree of sample records written to {sample_info['tree_path']}"
        )
def write_output_to_simulation_pipeline_json(
    program_output: t.Dict[str, t.Any],
    output_path: str,
    additional_simulation_parameters: t.Dict[str, t.Any],
):
    """
    Translate hyphy (BUSTED) output into simulation pipeline input parameters
    and write them as json.

    :param program_output: output of the program to translate to simulation params
    :param output_path: output path for simulation pipeline input json
    :param additional_simulation_parameters: additional parameters
    :return: none, writes simulation input parameters to json
    """
    gtr_rates = program_output["fits"]["Nucleotide GTR"]["Rate Distributions"]
    transitions_rates = sum(
        gtr_rates[
            f"Substitution rate from nucleotide {source} to nucleotide {target}"]
        for source, target in [("A", "G"), ("C", "T")])
    transversions_rates = sum(
        gtr_rates[
            f"Substitution rate from nucleotide {source} to nucleotide {target}"]
        for source, target in [("A", "C"), ("A", "T"), ("C", "G"), ("G", "T")])
    # BUGFIX: kappa is the transition/transversion rate ratio (as in the
    # HKY85/K80 models); the previous code computed the inverse
    # (transversions / transitions).
    kappa = transitions_rates / transversions_rates
    hyphy_selection_parameters = program_output["fits"][
        "Unconstrained model"]["Rate Distributions"]["Test"]
    simulation_selection_parameters = {
        int(cat): {
            "prop": hyphy_selection_parameters[cat]["proportion"],
            "w": hyphy_selection_parameters[cat]["omega"],
        }
        for cat in hyphy_selection_parameters
    }
    # Category 2 presumably holds the omega > 1 (positive selection) class in
    # the BUSTED output — confirm against the hyphy json schema. Guard against
    # outputs that lack it rather than raising a KeyError.
    positive_selection_class = simulation_selection_parameters.get(2)
    if positive_selection_class is not None and positive_selection_class["prop"] > 0.05:
        logger.info(
            f"{output_path} is a fitting candidate for a simulation study involving positive selection"
        )
    # Caller-supplied extras override the defaults below.
    simulation_input_parameters = {
        "substitution_model": "",
        "substitution_model_params": {
            "kappa": kappa,
            "selection_parameters": simulation_selection_parameters,
        },
        "states_frequencies": program_output["fits"]
        ["MG94xREV with separate rates for branch sets"]
        ["Equilibrium frequencies"],
        "tree_random": False,
        "tree_length": program_output["tree_length"],
        "simulation_tree_path": program_output["tree_path"],
        "pinv": 0,
        "alpha": 0,
        "ngamcat": 0,
    }
    simulation_input_parameters.update(additional_simulation_parameters)
    simulation_input = BaseTools.jsonable_encoder(
        SimulationInput(**simulation_input_parameters))
    # Drop unset fields so the json carries only explicit values.
    clean_simulation_input = {
        k: v
        for k, v in simulation_input.items() if v is not None
    }
    with open(output_path, "w") as output_file:
        json.dump(obj=clean_simulation_input, fp=output_file)
def set_command(
    self,
    input_path: str,
    output_path: str,
    additional_params: t.Optional[t.Dict[str, str]],
    parallelize: bool,
    cluster_data_dir: str,
    sequence_data_type: SequenceDataType = SequenceDataType.CODON,
    control_file_path: str = f"{os.getcwd()}/paml.ctl",
    input_tree_path: str = f"{os.getcwd()}/paml_tree.nwk",
) -> t.List[str]:
    """
    Build the shell commands that run paml on the given alignment.

    :param input_path: path to the input of the program
    :param output_path: path to the output of the program
    :param additional_params: additional parameters to run the program with (maps parameter name to value)
    :param parallelize: boolean indicating whether program execution should be parallelized
    :param cluster_data_dir: directory to concat to directory arguments in case of parallelization on the cluster
    :param sequence_data_type: indicates the type of sequence data (codon by default)
    :param control_file_path: path in which a control file will be generated
    :param input_tree_path: path in which the input tree for paml will be generated
    :return: a list of strings representing the command
    """
    # Translate container-local paths to cluster paths when parallelized.
    # NOTE(review): os.environ values are strings, so `not os.environ["in_container"]`
    # is True only for the empty string — a value like "False" is still truthy;
    # confirm the intended convention for this flag.
    program_input_path = (
        input_path if not parallelize or not os.environ["in_container"] else
        input_path.replace(os.environ["container_data_dir"],
                           cluster_data_dir))
    program_output_path = (
        output_path if not parallelize or not os.environ["in_container"] else
        output_path.replace(os.environ["container_data_dir"],
                            cluster_data_dir))
    self.set_additional_params(additional_params,
                               parallelize,
                               cluster_data_dir,
                               return_as_str=False)
    # An explicitly supplied tree overrides the default tree path.
    if additional_params and "input_tree_path" in additional_params:
        input_tree_path = additional_params["input_tree_path"]
    BaseTools.build_tree(
        input_path,
        input_tree_path,
        sequence_data_type,
        self.tree_reconstruction_method,
        self.tree_reconstruction_prams,
    )
    # Shorten file paths for paml (gives up upon receiving file paths > 120 chars)
    # by cd-ing into the longest shared prefix and using relative paths.
    # NOTE(review): os.path.commonprefix is character-based, not path-component
    # based, and str.replace substitutes every occurrence — verify the inputs
    # cannot repeat the shared prefix mid-string.
    shared_dir = os.path.commonprefix([
        program_input_path,
        input_tree_path,
        program_output_path,
        control_file_path,
    ])
    program_input_path = program_input_path.replace(shared_dir,
                                                    "./").replace("//", "/")
    input_tree_path = input_tree_path.replace(shared_dir,
                                              "./").replace("//", "/")
    program_output_path = program_output_path.replace(shared_dir,
                                                      "./").replace("//", "/")
    # The control file references the already-shortened data paths; the control
    # file's own path is shortened afterwards for the returned command line.
    self.set_control_file(
        program_input_path,
        input_tree_path,
        program_output_path,
        control_file_path,
        additional_params,
        sequence_data_type=sequence_data_type,
    )
    control_file_path = control_file_path.replace(shared_dir,
                                                  "./").replace("//", "/")
    return [f"cd {shared_dir}", f"{self.program_exe} {control_file_path}"]
def __init__(self, pipeline_input: PipelineInput):
    """
    Prepare all pipeline input data under ``<pipeline_dir>/input_data/``:
    name-simplified unaligned sequences, an alignment, a tree, and the
    bookkeeping structure for samples.

    :param pipeline_input: parsed pipeline configuration
    """
    # set initial parameters
    # NOTE(review): the pattern is greedy and not a raw string; "(.*)\." matches
    # up to the LAST dot, so "a.b.fasta" yields dataset name "a.b".
    dataset_name_regex = re.compile("(.*)\.")
    dataset_name = dataset_name_regex.search(
        os.path.basename(
            pipeline_input.unaligned_sequence_data_path)).group(1)
    self.pipeline_dir = pipeline_input.pipeline_dir
    self.sequence_data_type = pipeline_input.sequence_data_type
    # prepare input for pipeline
    processed_data_dir = f"{self.pipeline_dir}/input_data/"
    os.makedirs(processed_data_dir, exist_ok=True)
    logger.info(f"Setting input for pipeline at {processed_data_dir}")
    # save unaligned sequence data
    self.unaligned_sequence_data_path = (
        f"{processed_data_dir}{dataset_name}_unaligned.fasta")
    # simplify input sequences names, and persist the new->original name map
    # so original names can be restored later
    self.new_to_orig_names_map = BaseTools.simplify_names(
        input_path=pipeline_input.unaligned_sequence_data_path,
        output_path=self.unaligned_sequence_data_path,
    )
    new_to_orig_names_map_path = (
        f"{pipeline_input.pipeline_dir}/new_to_orig_names_map.pickle")
    with open(new_to_orig_names_map_path, "wb") as outfile:
        pickle.dump(self.new_to_orig_names_map, outfile)
    logger.info(
        f"Unaligned sequence data saved at {self.unaligned_sequence_data_path}"
    )
    self.unaligned_sequence_data = list(
        SeqIO.parse(self.unaligned_sequence_data_path, "fasta"))
    logger.info(
        f"Processed sequence data of size {len(self.unaligned_sequence_data)}"
    )
    # save aligned sequence data: reuse a provided alignment (with simplified
    # names) when available, otherwise align from scratch
    self.aligned_sequence_data_path = (
        f"{processed_data_dir}{dataset_name}_aligned.fasta")
    if pipeline_input.aligned_sequence_data_path:
        BaseTools.simplify_names(
            pipeline_input.aligned_sequence_data_path,
            self.aligned_sequence_data_path,
            self.new_to_orig_names_map,
        )
        logger.info(
            f"Aligned data saved at {self.aligned_sequence_data_path}")
    else:
        BaseTools.align(
            self.unaligned_sequence_data_path,
            self.aligned_sequence_data_path,
            pipeline_input.sequence_data_type,
            pipeline_input.alignment_method,
            pipeline_input.alignment_params,
        )
        logger.info(
            f"Alignment generated successfully using {pipeline_input.alignment_method.value}"
        )
    # save tree: reuse a provided tree (with simplified names) when available,
    # otherwise reconstruct one from the alignment
    self.tree_path = f"{processed_data_dir}{dataset_name}_tree.nwk"
    if pipeline_input.tree_path:
        BaseTools.simplify_names(pipeline_input.tree_path, self.tree_path,
                                 self.new_to_orig_names_map)
        logger.info(f"Tree saved at {self.tree_path}")
    else:
        BaseTools.build_tree(
            self.aligned_sequence_data_path,
            self.tree_path,
            self.sequence_data_type,
            pipeline_input.tree_reconstruction_method,
            pipeline_input.tree_reconstruction_params,
        )
        logger.info(
            f"Tree reconstructed successfully at {self.tree_path} using {pipeline_input.tree_reconstruction_method.value}"
        )
    # set sampling info structure:
    # fraction -> method -> sample paths + per-program performance records,
    # all initialized empty and filled in by later pipeline steps
    self.samples_info = dict()
    for fraction in pipeline_input.sampling_fractions:
        self.samples_info[fraction] = dict()
        for method in pipeline_input.sampling_methods:
            self.samples_info[fraction][method.value] = {
                "unaligned_sequence_data_path": None,
                "aligned_sequence_data_path": None,
                "tree_path": None,
                "aux_dir": None,
                "programs_performance": dict(),
            }
            for program_name in pipeline_input.programs:
                self.samples_info[fraction][method.value][
                    "programs_performance"][program_name.value] = {
                        "input_path": None,
                        "output_path": None,
                        "aux_dir": None,
                        "result": dict(),
                        "full_data_result": None,
                        "reference_data": None,
                    }