示例#1
0
    def align_sampled_data(
        self,
        pipeline_input: PipelineInput,
        fraction: float,
        method: SamplingMethod,
        fraction_to_samples_dir: t.Dict[t.Any, t.Any],
        sample_member_names: t.List[str],
    ):
        """Create an alignment for the sample of the given fraction and method.

        Reuses an existing alignment file when it is at least as large as the
        unaligned input; otherwise either trims the full alignment down to the
        sampled members or aligns the sampled data from scratch.
        """
        sample_info = self.samples_info[fraction][method.value]
        aligned_path = f"{fraction_to_samples_dir[fraction]}aligned_method_{method.value}.fasta"
        sample_info["aligned_sequence_data_path"] = aligned_path

        # an alignment at least as large as its unaligned source is treated as complete
        if os.path.exists(aligned_path) and os.path.getsize(
                aligned_path) >= os.path.getsize(
                    sample_info["unaligned_sequence_data_path"]):
            logger.info(
                f"Alignment of sample of fraction {fraction} using method {method.value} already exists."
            )
            return

        if pipeline_input.use_full_alignment_in_sample:
            logger.info(
                "Trimming full alignment to include only the sampled sequences"
            )
            # keep only the records whose names were sampled
            kept_records = [
                record
                for record in SeqIO.parse(self.aligned_sequence_data_path,
                                          "fasta")
                if record.name in sample_member_names
            ]
            SeqIO.write(kept_records, aligned_path, "fasta")
        else:
            logger.info(
                f"Aligning the sampled data to {aligned_path}"
            )
            BaseTools.align(
                sample_info["unaligned_sequence_data_path"],
                aligned_path,
                pipeline_input.sequence_data_type,
                pipeline_input.samples_alignment_method,
                pipeline_input.samples_alignment_params,
            )
            logger.info(
                f"Aligned sample records written to {aligned_path}"
            )
示例#2
0
    def set_command(
        self,
        input_path: str,
        output_path: str,
        additional_params: t.Optional[t.Dict[str, str]],
        parallelize: bool,
        cluster_data_dir: str,
        sequence_data_type: SequenceDataType = SequenceDataType.CODON,
        input_tree_path: t.Optional[str] = None,
    ) -> t.List[str]:
        """
        :param input_path: path to the input of the program
        :param output_path: path to the output of the program
        :param additional_params: additional parameters to run the program with (maps parameter name to value)
        :param parallelize: boolean indicating whether program execution should be parallelized
        :param cluster_data_dir: directory to concat to directory arguments in case of parallelization on the cluster
        :param sequence_data_type: indicates the type of the sequence data
        :param input_tree_path: path in which the input tree for hyphy will be generated
        :return: a list of strings representing the commands to execute
        """
        # when parallelizing, remap container paths to their cluster equivalents
        if parallelize:
            container_dir = os.environ["container_data_dir"]
            program_input_path = input_path.replace(container_dir,
                                                    cluster_data_dir)
            program_output_path = output_path.replace(container_dir,
                                                      cluster_data_dir)
        else:
            program_input_path = input_path
            program_output_path = output_path

        if not input_tree_path:
            input_tree_path = f"{os.path.dirname(program_input_path)}/hyphy_tree.nwk"
        # program_output_path = program_output_path if os.path.isdir(program_output_path) else os.path.dirname(program_output_path)
        self.set_additional_params(additional_params,
                                   parallelize,
                                   cluster_data_dir,
                                   return_as_str=False)

        # an explicitly supplied tree path overrides the default location
        if additional_params and "input_tree_path" in additional_params:
            input_tree_path = additional_params["input_tree_path"]
        BaseTools.build_tree(
            input_path,
            input_tree_path,
            sequence_data_type,
            self.tree_reconstruction_method,
            self.tree_reconstruction_prams,
        )

        run_cmd = f"printf '1\\n5\\n{program_input_path}\\n{input_tree_path}\\n' | hyphy"
        copy_cmd = f"cp -r {program_input_path}.BUSTED.json {program_output_path}"
        return [run_cmd, copy_cmd]
 def write_output_to_simulation_pipeline_json(
     program_output: t.Dict[str, t.Any],
     output_path: str,
     additional_simulation_parameters: t.Dict[str, t.Any],
 ):
     """Translate program output into a simulation-pipeline input JSON file.

     :param program_output: output of the program to translate to simulation params
     :param output_path: output path for simulation pipeline input json
     :param additional_simulation_parameters: additional parameters
     :return: None; writes the simulation input parameters to ``output_path``
     """
     simulation_input_parameters = {
         "substitution_model": "",
         "substitution_model_params": {
             "kappa": program_output["kappa"],
             "selection_parameters": program_output["selection_parameters"],
         },
         "states_frequencies": program_output["states_frequencies"],
         "tree_random": False,
         "tree_length": program_output["tree_length"],
         "simulation_tree_path": program_output["tree_path"],
         "pinv": 0,
         "alpha": 0,
         "ngamcat": 0,
     }
     # caller-supplied parameters take precedence over the derived defaults
     simulation_input_parameters.update(additional_simulation_parameters)
     simulation_input = BaseTools.jsonable_encoder(
         SimulationInput(**simulation_input_parameters))
     # drop unset (None) fields so they do not override simulator defaults
     pruned_input = {
         key: value
         for key, value in simulation_input.items() if value is not None
     }
     with open(output_path, "w") as output_file:
         json.dump(obj=pruned_input, fp=output_file)
def run_program(
    program_name: ProgramName,
    sequence_data_path: click.Path,
    alignment_path: str,
    sequence_data_type: SequenceDataType,
    program_output_path: str,
    additional_params: t.Optional[t.Dict] = None,
) -> str:
    """Align the input data (if needed) and submit the inference program.

    :param program_name: name of program to execute
    :param sequence_data_path: unaligned sequence data
    :param alignment_path: path in which an alignment will be created
    :param program_output_path: path to which the program output will be written
    :param sequence_data_type: sequence data type
    :param additional_params: additional program parameters, if needed
    :return: path to job completion validator file
    """
    # NOTE: return annotation fixed from t.Union[str, str, str, str] — a
    # Union of identical members collapses to plain str and was misleading.

    # align the data (skipped when an alignment already exists at the path)
    if not os.path.exists(alignment_path):
        BaseTools.align(
            input_path=sequence_data_path,
            output_path=alignment_path,
            sequence_data_type=sequence_data_type,
            alignment_method=AlignmentMethod.MAFFT,
        )

    # create a program instance
    program_to_exec = program_to_callable[program_name]()

    # run the inference program (in the case of paml, the control file will be
    # generated in the default directory)
    completion_validator_path = program_to_exec.exec(
        input_path=alignment_path,
        output_path=program_output_path,
        aux_dir=f"{os.path.dirname(sequence_data_path)}/",
        additional_params=additional_params,
        parallelize=True,
        cluster_data_dir=os.path.dirname(alignment_path),
        priority=0,
        queue="itaym",
        wait_until_complete=False,
        get_completion_validator=True,
    )

    return completion_validator_path
示例#5
0
 def build_sampled_tree(
     self,
     pipeline_input: PipelineInput,
     fraction: float,
     method: SamplingMethod,
     fraction_to_samples_dir: t.Dict[t.Any, t.Any],
     sample_member_names: t.List[str],
 ):
     """Provide a tree for the sample of the given fraction and method.

     Reuses an existing non-empty tree file; otherwise either prunes the
     full tree to the sampled members or reconstructs a tree from the
     sample's alignment.
     """
     sample_info = self.samples_info[fraction][method.value]
     tree_path = f"{fraction_to_samples_dir[fraction]}method_{method.value}_tree.nwk"
     sample_info["tree_path"] = tree_path

     # a non-empty tree file is treated as complete
     if os.path.exists(tree_path) and os.path.getsize(tree_path) > 0:
         logger.info(
             f"Tree of sample of fraction {fraction} using method {method.value} already exists."
         )
         return

     if pipeline_input.use_full_tree_in_sample:
         logger.info(
             "Pruning full tree to include only the sampled sequences")
         pruned_tree = Tree(self.tree_path)
         pruned_tree.prune(sample_member_names)
         pruned_tree.write(outfile=tree_path)
     else:
         logger.info(
             f"Building tree based on sampled data to {tree_path}"
         )
         BaseTools.build_tree(
             sample_info["aligned_sequence_data_path"],
             tree_path,
             pipeline_input.sequence_data_type,
             pipeline_input.tree_reconstruction_method,
             pipeline_input.tree_reconstruction_params,
         )
         logger.info(
             f"Tree of sample records written to {tree_path}"
         )
示例#6
0
 def write_output_to_simulation_pipeline_json(
     program_output: t.Dict[str, t.Any],
     output_path: str,
     additional_simulation_parameters: t.Dict[str, t.Any],
 ):
     """Translate hyphy (BUSTED) output into simulation-pipeline input JSON.

     :param program_output: output of the program to translate to simulation params
     :param output_path: output path for simulation pipeline input json
     :param additional_simulation_parameters: additional parameters
     :return: none, writes simulation input parameters to json
     """
     gtr_rates = program_output["fits"]["Nucleotide GTR"]["Rate Distributions"]
     transitions_rates = sum(
         gtr_rates[
             f"Substitution rate from nucleotide {source} to nucleotide {target}"]
         for source, target in [("A", "G"), ("C", "T")])
     transversions_rates = sum(
         gtr_rates[
             f"Substitution rate from nucleotide {source} to nucleotide {target}"]
         for source, target in [("A", "C"), ("A", "T"), ("C", "G"),
                                ("G", "T")])
     # kappa is conventionally the transition/transversion rate ratio (ts/tv,
     # as in HKY-style models); the previous code computed its inverse (tv/ts).
     kappa = transitions_rates / transversions_rates
     hyphy_selection_parameters = program_output["fits"][
         "Unconstrained model"]["Rate Distributions"]["Test"]
     simulation_selection_parameters = {
         int(cat): {
             "prop": hyphy_selection_parameters[cat]["proportion"],
             "w": hyphy_selection_parameters[cat]["omega"],
         }
         for cat in hyphy_selection_parameters
     }
     # category 2 holds the positively-selected class; guard against outputs
     # that report fewer rate categories instead of raising KeyError
     positive_selection_class = simulation_selection_parameters.get(2)
     if positive_selection_class and positive_selection_class["prop"] > 0.05:
         logger.info(
             f"{output_path} is a fitting candidate for a simulation study involving positive selection"
         )
     simulation_input_parameters = {
         "substitution_model": "",
         "substitution_model_params": {
             "kappa": kappa,
             "selection_parameters": simulation_selection_parameters,
         },
         "states_frequencies": program_output["fits"][
             "MG94xREV with separate rates for branch sets"][
                 "Equilibrium frequencies"],
         "tree_random": False,
         "tree_length": program_output["tree_length"],
         "simulation_tree_path": program_output["tree_path"],
         "pinv": 0,
         "alpha": 0,
         "ngamcat": 0,
     }
     # caller-supplied parameters take precedence over the derived defaults
     simulation_input_parameters.update(additional_simulation_parameters)
     simulation_input = BaseTools.jsonable_encoder(
         SimulationInput(**simulation_input_parameters))
     # drop unset (None) fields so they do not override simulator defaults
     clean_simulation_input = {
         k: v
         for k, v in simulation_input.items() if v is not None
     }
     with open(output_path, "w") as output_file:
         json.dump(obj=clean_simulation_input, fp=output_file)
    def set_command(
        self,
        input_path: str,
        output_path: str,
        additional_params: t.Optional[t.Dict[str, str]],
        parallelize: bool,
        cluster_data_dir: str,
        sequence_data_type: SequenceDataType = SequenceDataType.CODON,
        control_file_path: str = f"{os.getcwd()}/paml.ctl",
        input_tree_path: str = f"{os.getcwd()}/paml_tree.nwk",
    ) -> t.List[str]:
        """Build the shell commands that run paml on the given input.

        :param input_path: path to the input of the program
        :param output_path: path to the output of the program
        :param additional_params: additional parameters to run the program with (maps parameter name to value)
        :param parallelize: boolean indicating whether program execution should be parallelized
        :param cluster_data_dir: directory to concat to directory arguments in case of parallelization on the cluster
        :param sequence_data_type: indicates the type of sequence data
        :param control_file_path: path in which a control file will be generated
        :param input_tree_path: path in which the input tree for paml will be generated
        :return: a list of strings representing the command
        """
        # NOTE(review): os.environ values are strings, so any non-empty value
        # of "in_container" (including "false") is truthy here, and a missing
        # key raises KeyError — confirm this is the intended contract.
        program_input_path = (
            input_path if not parallelize
            or not os.environ["in_container"] else input_path.replace(
                os.environ["container_data_dir"], cluster_data_dir))
        program_output_path = (
            output_path if not parallelize
            or not os.environ["in_container"] else output_path.replace(
                os.environ["container_data_dir"], cluster_data_dir))
        self.set_additional_params(additional_params,
                                   parallelize,
                                   cluster_data_dir,
                                   return_as_str=False)

        # an explicitly provided tree path overrides the default location
        if additional_params and "input_tree_path" in additional_params:
            input_tree_path = additional_params["input_tree_path"]
        BaseTools.build_tree(
            input_path,
            input_tree_path,
            sequence_data_type,
            self.tree_reconstruction_method,
            self.tree_reconstruction_prams,
        )
        # shorten file paths for paml (gives up upon receiving file paths > 120 chars)
        # NOTE(review): commonprefix is character-wise, so shared_dir may end
        # mid-component; replace() also substitutes every occurrence of the
        # prefix, not just the leading one — verify inputs make this safe.
        shared_dir = os.path.commonprefix([
            program_input_path,
            input_tree_path,
            program_output_path,
            control_file_path,
        ])
        program_input_path = program_input_path.replace(shared_dir,
                                                        "./").replace(
                                                            "//", "/")
        input_tree_path = input_tree_path.replace(shared_dir,
                                                  "./").replace("//", "/")
        program_output_path = program_output_path.replace(shared_dir,
                                                          "./").replace(
                                                              "//", "/")

        # the control file references the shortened (relative) paths above
        self.set_control_file(
            program_input_path,
            input_tree_path,
            program_output_path,
            control_file_path,
            additional_params,
            sequence_data_type=sequence_data_type,
        )
        control_file_path = control_file_path.replace(shared_dir,
                                                      "./").replace("//", "/")
        # run paml from the shared directory so the relative paths resolve
        return [f"cd {shared_dir}", f"{self.program_exe} {control_file_path}"]
示例#8
0
    def __init__(self, pipeline_input: PipelineInput):
        """Prepare the pipeline's input data.

        Simplifies sequence names, writes the unaligned data, produces (or
        reuses) an alignment and a tree, and initializes the per-sample
        bookkeeping structure.

        :param pipeline_input: the parsed pipeline input configuration
        """

        # set initial parameters
        # raw string fixes the invalid "\." escape (DeprecationWarning /
        # SyntaxWarning on modern Python); captures the dataset name up to
        # the last dot of the file name
        dataset_name_regex = re.compile(r"(.*)\.")
        dataset_name = dataset_name_regex.search(
            os.path.basename(
                pipeline_input.unaligned_sequence_data_path)).group(1)
        self.pipeline_dir = pipeline_input.pipeline_dir
        self.sequence_data_type = pipeline_input.sequence_data_type

        # prepare input for pipeline
        processed_data_dir = f"{self.pipeline_dir}/input_data/"
        os.makedirs(processed_data_dir, exist_ok=True)
        logger.info(f"Setting input for pipeline at {processed_data_dir}")

        # save unaligned sequence data
        self.unaligned_sequence_data_path = (
            f"{processed_data_dir}{dataset_name}_unaligned.fasta")
        # simplify input sequences names
        self.new_to_orig_names_map = BaseTools.simplify_names(
            input_path=pipeline_input.unaligned_sequence_data_path,
            output_path=self.unaligned_sequence_data_path,
        )
        # persist the name map so original names can be restored later
        new_to_orig_names_map_path = (
            f"{pipeline_input.pipeline_dir}/new_to_orig_names_map.pickle")
        with open(new_to_orig_names_map_path, "wb") as outfile:
            pickle.dump(self.new_to_orig_names_map, outfile)
        logger.info(
            f"Unaligned sequence data saved at {self.unaligned_sequence_data_path}"
        )
        self.unaligned_sequence_data = list(
            SeqIO.parse(self.unaligned_sequence_data_path, "fasta"))
        logger.info(
            f"Processed sequence data of size {len(self.unaligned_sequence_data)}"
        )

        # save aligned sequence data: reuse a provided alignment (with
        # simplified names) or align the data from scratch
        self.aligned_sequence_data_path = (
            f"{processed_data_dir}{dataset_name}_aligned.fasta")
        if pipeline_input.aligned_sequence_data_path:
            BaseTools.simplify_names(
                pipeline_input.aligned_sequence_data_path,
                self.aligned_sequence_data_path,
                self.new_to_orig_names_map,
            )
            logger.info(
                f"Aligned data saved at {self.aligned_sequence_data_path}")
        else:
            BaseTools.align(
                self.unaligned_sequence_data_path,
                self.aligned_sequence_data_path,
                pipeline_input.sequence_data_type,
                pipeline_input.alignment_method,
                pipeline_input.alignment_params,
            )
            logger.info(
                f"Alignment generated successfully using {pipeline_input.alignment_method.value}"
            )

        # save tree: reuse a provided tree (with simplified names) or
        # reconstruct one from the alignment
        self.tree_path = f"{processed_data_dir}{dataset_name}_tree.nwk"
        if pipeline_input.tree_path:
            BaseTools.simplify_names(pipeline_input.tree_path, self.tree_path,
                                     self.new_to_orig_names_map)
            logger.info(f"Tree saved at {self.tree_path}")
        else:
            BaseTools.build_tree(
                self.aligned_sequence_data_path,
                self.tree_path,
                self.sequence_data_type,
                pipeline_input.tree_reconstruction_method,
                pipeline_input.tree_reconstruction_params,
            )
            logger.info(
                f"Tree reconstructed successfully at {self.tree_path} using {pipeline_input.tree_reconstruction_method.value}"
            )

        # set sampling info structure: fraction -> method -> paths/results
        self.samples_info = dict()
        for fraction in pipeline_input.sampling_fractions:
            self.samples_info[fraction] = dict()
            for method in pipeline_input.sampling_methods:
                self.samples_info[fraction][method.value] = {
                    "unaligned_sequence_data_path": None,
                    "aligned_sequence_data_path": None,
                    "tree_path": None,
                    "aux_dir": None,
                    "programs_performance": dict(),
                }
                for program_name in pipeline_input.programs:
                    self.samples_info[fraction][method.value][
                        "programs_performance"][program_name.value] = {
                            "input_path": None,
                            "output_path": None,
                            "aux_dir": None,
                            "result": dict(),
                            "full_data_result": None,
                            "reference_data": None,
                        }