Example #1
class TestMutation(TestCase):
    def __init__(self, *args, **kwargs):
        super(TestMutation, self).__init__(*args, **kwargs)

        self.model_file = "{}/couplings/RASH_HUMAN_b03.model".format(MONOMER_PATH)
        self.c = CouplingsModel(self.model_file)
        self.c0 = self.c.to_independent_model()

        self.singles = pd.read_csv(
            "{}/mutate/RASH_HUMAN_b03_single_mutant_matrix.csv".format(MONOMER_PATH)
        )
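A short sketch of how these fixture objects are typically used, mirroring the assertions in Example #4 below. The file paths are placeholders and the import locations are assumptions about the package layout, not taken from this snippet:

import pandas as pd
from evcouplings.couplings import CouplingsModel
from evcouplings.mutate.calculations import predict_mutation_table

# load the pairwise (epistatic) model and derive a site-independent model
c = CouplingsModel("RASH_HUMAN_b03.model")  # placeholder path
c0 = c.to_independent_model()

# score an existing table of single mutants with both models
singles = pd.read_csv("single_mutant_matrix.csv")  # placeholder path
singles = predict_mutation_table(c, singles, output_column="prediction_epistatic")
singles = predict_mutation_table(c0, singles, output_column="prediction_independent")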
Example #2
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc. Use complex protocol
    for heteromultimeric complexes instead.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call in the code below,
        as well as infer_plmc()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    # for additional required parameters, see infer_plmc()
    check_required(kwargs, [
        "prefix",
        "min_sequence_distance",
    ])

    prefix = kwargs["prefix"]

    # infer ECs and load them
    outcfg, ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(outcfg["model_file"])

    # following computations are mostly specific to monomer pipeline
    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **_postprocess_inference(ecs,
                                 kwargs,
                                 model,
                                 outcfg,
                                 prefix,
                                 generate_enrichment=is_single_segment,
                                 generate_line_plot=is_single_segment)
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
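A hypothetical invocation of this protocol with illustrative values; the complete set of required keys is the union of the check_required list above and the parameters consumed by infer_plmc() (inlined in Example #6):

outcfg = standard(
    prefix="output/RASH_HUMAN",           # illustrative paths and values
    alignment_file="output/RASH_HUMAN.a2m",
    focus_mode=True,
    focus_sequence="RASH_HUMAN",
    theta=0.8,
    alphabet=None,                        # default protein alphabet
    segments=None,
    ignore_gaps=False,
    iterations=100,
    lambda_h=0.01,
    lambda_J=0.01,
    lambda_J_times_Lq=True,
    lambda_group=None,
    scale_clusters=None,
    cpu=2,
    plmc="plmc",                          # path to plmc binary
    reuse_ecs=False,
    min_sequence_distance=6,
)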
Example #3
def complex(**kwargs):
    """
    Protocol:

    Infer ECs for protein complexes from alignment using plmc.
    Allows user to select scoring protocol.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call in the code below,
        as well as infer_plmc()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    # for additional required parameters, see infer_plmc()
    check_required(
        kwargs,
        [
            "prefix", "min_sequence_distance",
            "scoring_model", "use_all_ecs_for_scoring",
        ]
    )

    prefix = kwargs["prefix"]

    # infer ECs and load them
    outcfg, ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(outcfg["model_file"])

    # following computations are mostly specific to complex pipeline

    # add mixture model probability
    if kwargs["scoring_model"] in SCORING_MODELS:
        if kwargs["use_all_ecs_for_scoring"] is not None:
            use_all_ecs = kwargs["use_all_ecs_for_scoring"]
        else:
            use_all_ecs = False

        ecs = complex_probability(
            ecs, kwargs["scoring_model"], use_all_ecs
        )

    else:
        raise InvalidParameterError(
            "Invalid scoring_model parameter: " +
            "{}. Valid options are: {}".format(
                kwargs["protocol"], ", ".join(SCORING_MODELS)
            )
        )

    # also create line-drawing script (for multiple chains)
    # by convention, we map first segment to chain A,
    # second to B, and so forth.
    chain_mapping = dict(
        zip(
            [s.segment_id for s in segments],
            string.ascii_uppercase,
        )
    )
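    # sketch of the resulting mapping, assuming two segments with
    # hypothetical IDs "A_1" and "B_1":
    #   chain_mapping == {"A_1": "A", "B_1": "B"}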

    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_line_plot=True,
            generate_enrichment=False,
            ec_filter="segment_i != segment_j or abs(i - j) >= {}",
            chain=chain_mapping
        )
    }
    
    # save just the inter protein ECs
    ## TODO: eventually have this accomplished by _postprocess_inference
    ## right now avoiding a second call with a different ec_filter
    ecs = pd.read_csv(outcfg["ec_file"])
    outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv"
    inter_ecs = ecs.query("segment_i != segment_j")
    inter_ecs.to_csv(outcfg["inter_ec_file"], index=False)

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_complex.outcfg", outcfg)

    # TODO: make the following complex-ready
    # EC enrichment:
    #
    # 1) think about making EC enrichment complex-ready and add
    # it back here - so far it only makes sense if all ECs are
    # on one segment
    #
    # EVzoom:
    #
    # 1) at the moment, EVzoom will use numbering before remapping
    # we should eventually get this to a point where segments + residue
    # index are displayed on EVzoom
    #
    # 2) note that this will currently use the default mixture model
    # selection for determining the EC cutoff, rather than the selection
    # used for the EC table above

    return outcfg
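The inter-protein extraction above is plain pandas; a toy illustration of the query, with column names following the EC table written by this protocol and hypothetical segment IDs:

import pandas as pd

ecs = pd.DataFrame({
    "i": [5, 12], "j": [40, 7],
    "segment_i": ["A_1", "A_1"],
    "segment_j": ["B_1", "A_1"],
    "score": [1.2, 0.9],
})
inter_ecs = ecs.query("segment_i != segment_j")  # keeps only the first row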
Example #4
class TestMutation(TestCase):
    def __init__(self, *args, **kwargs):
        super(TestMutation, self).__init__(*args, **kwargs)

        self.model_file = "{}/couplings/RASH_HUMAN_b03.model".format(
            TRAVIS_PATH)
        self.c = CouplingsModel(self.model_file)
        self.c0 = self.c.to_independent_model()

        self.singles = pd.read_csv(
            "{}/mutate/RASH_HUMAN_b03_single_mutant_matrix.csv".format(
                TRAVIS_PATH))

    def test_extract_mutations_normal(self):
        """
        tests whether extract mutations correctly separates mutation strings
        :return:
        """
        mutation_string = "A143K,M100K"
        mutation_target = [(143, "A", "K"), (100, "M", "K")]
        mutations = extract_mutations(mutation_string)
        self.assertEqual(mutations, mutation_target)

    def test_extract_mutations_null_input(self):
        """
        test whether extract mutations returns nothing if given an incorrect string
        :return:
        """
        mutation_string = ""
        mutation_target = []
        mutations = extract_mutations(mutation_string)
        self.assertEqual(mutations, mutation_target)

    def test_extract_mutations_wt(self):
        """
        test whether extract mutations returns wt value if given "wt" string
        :return:
        """
        mutation_string = "wt"
        mutation_target = []
        mutations = extract_mutations(mutation_string)
        self.assertEqual(mutations, mutation_target)

    def test_predict_mutation_table(self):
        """
        tests whether predict mutation table returns a correct table of mutations
        :return:
        """
        singles = self.singles.drop("prediction_independent", axis=1)

        _singles = predict_mutation_table(
            self.c0, singles, output_column="prediction_independent")

        pd.testing.assert_frame_equal(self.singles, _singles)

    def test_single_mutant_matrix(self):
        """
        tests whether single mutant matrix returns the correct pd.DataFrame
        :return:
        """
        _singles = single_mutant_matrix(self.c,
                                        output_column="prediction_epistatic")

        singles = self.singles.drop("prediction_independent", axis=1)

        pd.testing.assert_frame_equal(singles, _singles)

    def test_split_mutants_single(self):
        """

        :return:
        """
        mutations = ["A124K", "M122L", "A156V"]

        data = pd.DataFrame({"mutations": mutations})

        output = pd.DataFrame(
            {
                "pos": [124, 122, 156],
                "wt": ["A", "M", "A"],
                "subs": ["K", "L", "V"],
                "mutations": mutations,
                "num_mutations": [1, 1, 1]
            },
            dtype=object)

        output = output[["mutations", "num_mutations", "pos", "wt", "subs"]]
        output["num_mutations"] = output["num_mutations"].astype(int)
        output["pos"] = output["pos"].astype(str)

        split_mutations = split_mutants(data, "mutations")

        pd.testing.assert_frame_equal(output, split_mutations)

    def test_split_mutants_double(self):
        """

        :return:
        """
        mutations = ["A124K,W145Y", "M122L", "A156V"]

        data = pd.DataFrame({"mutations": mutations})

        output = pd.DataFrame(
            {
                "pos": ["124,145", "122", "156"],
                "wt": ["A,W", "M", "A"],
                "subs": ["K,Y", "L", "V"],
                "mutations": mutations,
                "num_mutations": [2, 1, 1]
            },
            dtype=object)

        output = output[["mutations", "num_mutations", "pos", "wt", "subs"]]
        output["num_mutations"] = output["num_mutations"].astype(int)
        output["pos"] = output["pos"].astype(str)

        split_mutations = split_mutants(data, "mutations")

        pd.testing.assert_frame_equal(output, split_mutations)
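A minimal re-implementation sketch of the behavior these tests pin down; hypothetical code for illustration, not the library's actual implementation:

import pandas as pd

def extract_mutations_sketch(mutation_string):
    # "" and "wt" both denote wild type -> empty mutation list
    if mutation_string in ("", "wt"):
        return []
    # "A143K" -> (143, "A", "K"); multiple mutations separated by commas
    return [
        (int(m[1:-1]), m[0], m[-1])
        for m in mutation_string.split(",")
    ]

def split_mutants_sketch(data, mutation_column):
    # multi-mutants join their per-mutation fields with commas,
    # e.g. "A124K,W145Y" -> pos "124,145", wt "A,W", subs "K,Y"
    rows = []
    for s in data[mutation_column]:
        muts = extract_mutations_sketch(s)
        rows.append({
            "mutations": s,
            "num_mutations": len(muts),
            "pos": ",".join(str(pos) for pos, wt, subs in muts),
            "wt": ",".join(wt for pos, wt, subs in muts),
            "subs": ",".join(subs for pos, wt, subs in muts),
        })
    return pd.DataFrame(
        rows, columns=["mutations", "num_mutations", "pos", "wt", "subs"]
    )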
Example #5
def standard(**kwargs):
    """
    Protocol:

    Predict the effects of single mutations with both the full epistatic
    couplings model and a site-independent model derived from it

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call in the code below

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * mutation_matrix_plot_files
        * mutations_epistatic_pml_files
        * [mutation_dataset_predicted_file]
    """
    check_required(kwargs, [
        "prefix",
        "model_file",
        "mutation_dataset_file",
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources("Model parameter file does not exist",
                     kwargs["model_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load couplings object, and create independent model
    c = CouplingsModel(kwargs["model_file"])
    c0 = c.to_independent_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent")]:
        # interactive plot using bokeh
        filename = prefix + "_{}_model".format(type_.lower())
        output_file(filename + ".html", "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh")
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add prediction by independent model and
    # save to file
    singles = single_mutant_matrix(c, output_column="prediction_epistatic")

    singles = predict_mutation_table(c0, singles, "prediction_independent")

    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts
    outcfg["mutations_epistatic_pml_files"] = []
    for model in ["epistatic", "independent"]:
        pml_filename = prefix + "_{}_model.pml".format(model)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles, pml_filename, effect_column="prediction_" + model)
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#")

        # add epistatic model prediction
        data_pred = predict_mutation_table(c, data, "prediction_epistatic")

        # add independent model prediction
        data_pred = predict_mutation_table(c0, data_pred,
                                           "prediction_independent")

        outcfg["mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv"
        data_pred.to_csv(outcfg["mutation_dataset_predicted_file"],
                         index=False)

    return outcfg
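A hypothetical call to this protocol; the paths are placeholders:

outcfg = standard(
    prefix="output/RASH_HUMAN",
    model_file="output/RASH_HUMAN.model",
    mutation_dataset_file=None,  # or the path to an experimental mutation CSV
)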
Example #6
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the check_required call in the code below

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance", # "save_model",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it as a
        # plmc parameter: it is the default and has some
        # implementation advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)
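        # worked example (illustrative numbers, not from a real run):
        # the default protein alphabet has 21 symbols including the gap;
        # with ignore_gaps, num_symbols = 20. For L = 200 match columns
        # and lambda_J = 0.01: 0.01 * (20 - 1) * (200 - 1) = 37.81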

    # run plmc... or reuse pre-existing results from previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether to reuse previous results; only possible if
    # they were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)
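    # (assumption: this appends a per-EC posterior probability column
    #  to the table; the exact column name is not shown in this snippet)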

    if segments is not None:  # and (len(segments) > 1 or not kwargs["focus_mode"]):
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # load parameters
            c = CouplingsModel(outcfg["model_file"])

            # create JSON output and write to file
            f.write(
                evzoom_json(c) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
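A short sketch of consuming this protocol's main outputs; the i/j column names follow the queries used above, everything else is illustrative:

import pandas as pd

outcfg = standard(**config)  # config as sketched after Example #2
ecs = pd.read_csv(outcfg["ec_file"])
# top-ranked long-range pairs, same distance convention as above
top = ecs.query("abs(i - j) >= 6").head(50)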