Example #1
def verify_prefix(verify_subdir=True, **config):
    """
    Check if the configuration contains a prefix,
    and that the prefix points to a location on the
    filesystem that we can write to
    
    Parameters
    ----------
    verify_subdir : bool, optional (default: True)
        Check if we can create subdirectory containing
        full prefix. Set this to False for outer evcouplings
        app loop.
    **config
        Input configuration for pipeline
        
    Returns
    -------
    prefix : str
        Verified prefix
    """
    # check we have a prefix entry, otherwise all hope is lost...
    try:
        prefix = config["global"]["prefix"]
    except KeyError:
        raise InvalidParameterError(
            "Configuration does not include 'prefix' setting in "
            "'global' section")

    # make sure prefix is also specified
    if prefix is None:
        raise InvalidParameterError(
            "'prefix' must be specified and cannot be None")

    # verify that the prefix is usable on the filesystem
    try:
        # make prefix folder
        create_prefix_folders(prefix)

        # check that we can write in the folder
        with open(prefix + ".test__", "w") as f:
            pass

        # get rid of the file again
        os.remove(prefix + ".test__")

        if verify_subdir:
            # make sure we can create a subdirectory
            sub_prefix = insert_dir(prefix, "test__")
            create_prefix_folders(sub_prefix)

            # remove again
            os.rmdir(path.dirname(sub_prefix))

    except OSError as e:
        raise InvalidParameterError(
            "Not a valid prefix: {}".format(prefix)) from e

    return prefix
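
# Hypothetical usage sketch (not part of the original module): the config
# dictionary below is made up for illustration; any pipeline configuration
# with a "global" -> "prefix" entry would behave the same way. Helper
# functions such as create_prefix_folders are assumed to come from the
# surrounding evcouplings utilities.
example_config = {"global": {"prefix": "output/example_run/example_run"}}
verified_prefix = verify_prefix(verify_subdir=False, **example_config)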
Example #2
def run_hhfilter(input_file,
                 output_file,
                 threshold=95,
                 columns="a2m",
                 binary="hhfilter"):
    """
    Redundancy-reduce a sequence alignment using hhfilter
    from the HHsuite alignment suite.

    Parameters
    ----------
    input_file : str
        Path to input alignment in A2M/FASTA format
    output_file : str
        Path to output alignment (will be in A3M format)
    threshold : int, optional (default: 95)
        Sequence identity threshold for maximum pairwise
        identity (between 0 and 100)
    columns : {"first", "a2m"}, optional (default: "a2m")
        Definition of match columns (based on first sequence
        or upper-case columns (a2m))
    binary : str
        Path to hhfilter binary

    Returns
    -------
    str
        output_file

    Raises
    ------
    ResourceError
        If output alignment is non-existent/empty
    ValueError
        Upon invalid value of columns parameter
    """
    if columns not in ["first", "a2m"]:
        raise ValueError("Invalid column selection: {}".format(columns))

    verify_resources("Alignment file does not exist or is empty", input_file)

    create_prefix_folders(output_file)

    cmd = [
        binary, "-i", input_file, "-o", output_file, "-id",
        str(threshold), "-M", columns, "-v",
        str(2)
    ]

    return_code, stdout, stderr = run(cmd)

    verify_resources(
        "hhfilter returned empty alignment: "
        "stdout={} stderr={} file={}".format(stdout, stderr, output_file),
        output_file)

    return output_file
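
# Hypothetical usage sketch (paths are placeholders): filter an alignment
# down to at most 90% pairwise identity, defining match columns by the
# first sequence. Assumes the hhfilter binary is on PATH, as in the
# default argument above.
filtered_alignment = run_hhfilter(
    "alignments/example_raw_focus.fasta",
    "alignments/example_filtered.a3m",
    threshold=90,
    columns="first",
)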
Example #3
def inter_dists(sifts_result_i, sifts_result_j, structures=None,
                atom_filter=None, intersect=False, output_prefix=None,
                model=0, raise_missing=True):
    """
    Compute inter-chain distances (between different entities)
    in a PDB file. The resulting distance map is typically not
    symmetric, with each axis corresponding to one of the two chains.
    Inter-distances are calculated for all combinations of chains
    that share the same PDB id in sifts_result_i and sifts_result_j.

    Parameters
    ----------
    sifts_result_i : SIFTSResult
        Input structures and mapping to use
        for first axis of computed distance map
    sifts_result_j : SIFTSResult
        Input structures and mapping to use
        for second axis of computed distance map
    structures : str or dict, optional (default: None)

        * If str: Load structures from directory this string
          points to. Missing structures will be fetched
          from web.

        * If dict: dictionary with lower-case PDB ids as keys
          and PDB objects as values. This dictionary has to
          contain all necessary structures, missing ones will
          not be fetched. This dictionary can be created using
          pdb.load_structures.

    atom_filter : str, optional (default: None)
        Filter coordinates to contain only these atoms. E.g.
        set to "CA" to compute C_alpha - C_alpha distances
        instead of minimum atom distance over all atoms in
        both residues.
    intersect : bool, optional (default: False)
        If True, intersect indices of the given
        distance maps. Otherwise, union of indices
        will be used.
    output_prefix : str, optional (default: None)
        If given, save individual and final contact maps
        to files prefixed with this string. The appended
        file suffixes map to row index in sifts_results.hits
    model : int, optional (default: 0)
        Index of model in PDB structure that should be used
    raise_missing : bool, optional (default: True)
        Raise a ResourceError if any of the input structures can
        not be loaded; otherwise, ignore missing entries.

    Returns
    -------
    DistanceMap
        Computed aggregated distance map
        across all input structures

    Raises
    ------
    ValueError
        If sifts_result_i or sifts_result_j is empty
        (no structure hits)
    ResourceError
        If any structure could not be loaded and raise_missing is True
    """
    def _get_chains(sifts_result):
        return {
            i: _prepare_chain(
                structures, r["pdb_id"], r["pdb_chain"],
                atom_filter, sifts_result.mapping[r["mapping_index"]],
                model
            )
            for i, r in sifts_result.hits.iterrows()
            if raise_missing or r["pdb_id"] in structures
        }

    if len(sifts_result_i.hits) == 0 or len(sifts_result_j.hits) == 0:
        raise ValueError(
            "sifts_result_i or sifts_result_j is empty "
            "(no structure hits, but at least one required)"
        )

    # if structures are not given, or given as a path, load them first
    structures = _prepare_structures(
        structures,
        sifts_result_i.hits.pdb_id.append(
            sifts_result_j.hits.pdb_id
        ),
        raise_missing
    )

    # aggregated distance map
    agg_distmap = None

    # create output folder if necessary
    if output_prefix is not None:
        create_prefix_folders(output_prefix)

    # determine which combinations of chains to look at
    # (anything that has same PDB identifier)
    combis = sifts_result_i.hits.reset_index().merge(
        sifts_result_j.hits.reset_index(),
        on="pdb_id", suffixes=("_i", "_j")
    )

    # extract chains for each subunit
    chains_i = _get_chains(sifts_result_i)
    chains_j = _get_chains(sifts_result_j)

    # go through all chain combinations
    for i, r in combis.iterrows():
        # skip missing structures
        if not raise_missing and r["pdb_id"] not in structures:
            continue

        index_i = r["index_i"]
        index_j = r["index_j"]

        # skip empty chains
        if (len(chains_i[index_i].residues) == 0 or
                len(chains_j[index_j].residues) == 0):
            continue

        # compute distance map for current chain pair
        distmap = DistanceMap.from_coords(
            chains_i[index_i],
            chains_j[index_j],
        )

        # save individual distance map
        if output_prefix is not None:
            distmap.to_file("{}_{}_{}".format(
                output_prefix, index_i, index_j)
            )

        # aggregate with other chain combinations
        if agg_distmap is None:
            agg_distmap = distmap
        else:
            agg_distmap = DistanceMap.aggregate(
                agg_distmap, distmap, intersect=intersect
            )

    return agg_distmap
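
# Hypothetical usage sketch: sifts_i and sifts_j stand for SIFTSResult
# objects produced by an earlier structure identification step; the
# directory and prefix strings are placeholders.
def _example_inter_dists(sifts_i, sifts_j):
    return inter_dists(
        sifts_i, sifts_j,
        structures="structures/",        # fetch missing entries from the web
        atom_filter="CA",                # C_alpha - C_alpha distances
        output_prefix="aux/example_inter_distmap",
        raise_missing=False,
    )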
Example #4
def standard(**kwargs):
    """
    Protocol:
    Compare ECs for single proteins (or domains)
    to 3D structure information

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * ec_compared_all_file
        * ec_compared_longrange_file
        * pdb_structure_hits_file
        * distmap_monomer
        * distmap_multimer
        * contact_map_files
        * remapped_pdb_files
    """
    check_required(kwargs, [
        "prefix",
        "ec_file",
        "min_sequence_distance",
        "pdb_mmtf_dir",
        "atom_filter",
        "compare_multimer",
        "distance_cutoff",
        "target_sequence_file",
        "scale_sizes",
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file":
        prefix + "_CouplingScoresCompared_longrange.csv",
        "pdb_structure_hits_file": prefix + "_structure_hits.csv",
        "pdb_structure_hits_unfiltered_file":
        prefix + "_structure_hits_unfiltered.csv",
        # cannot have the distmap files end with "_file" because there are
        # two files (.npy and .csv), which would cause problems with automatic
        # checking if those files exist
        "distmap_monomer": prefix + "_distance_map_monomer",
        "distmap_multimer": prefix + "_distance_map_multimer",
    }

    # make sure EC file exists
    verify_resources("EC file does not exist", kwargs["ec_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much for average user)
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    # Step 1: Identify 3D structures for comparison
    sifts_map, sifts_map_full = _identify_structures(
        **{
            **kwargs,
            "prefix": aux_prefix,
        })

    # save selected PDB hits
    sifts_map.hits.to_csv(outcfg["pdb_structure_hits_file"], index=False)

    # also save full list of hits
    sifts_map_full.hits.to_csv(outcfg["pdb_structure_hits_unfiltered_file"],
                               index=False)

    # Step 2: Compute distance maps

    # load all structures at once
    structures = load_structures(sifts_map.hits.pdb_id,
                                 kwargs["pdb_mmtf_dir"],
                                 raise_missing=False)

    # compute distance maps and save
    # (but only if we found some structure)
    if len(sifts_map.hits) > 0:
        d_intra = intra_dists(sifts_map,
                              structures,
                              atom_filter=kwargs["atom_filter"],
                              output_prefix=aux_prefix + "_distmap_intra")
        d_intra.to_file(outcfg["distmap_monomer"])

        # save contacts to separate file
        outcfg["monomer_contacts_file"] = prefix + "_contacts_monomer.csv"
        d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
            outcfg["monomer_contacts_file"], index=False)

        # compute multimer distances, if requested;
        # note that d_multimer can be None if there
        # are no structures with multiple chains
        if kwargs["compare_multimer"]:
            d_multimer = multimer_dists(sifts_map,
                                        structures,
                                        atom_filter=kwargs["atom_filter"],
                                        output_prefix=aux_prefix +
                                        "_distmap_multimer")
        else:
            d_multimer = None

        # if we have a multimer distance map in the end, save it
        if d_multimer is not None:
            d_multimer.to_file(outcfg["distmap_multimer"])
            outcfg[
                "multimer_contacts_file"] = prefix + "_contacts_multimer.csv"

            # save contacts to separate file
            d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg["multimer_contacts_file"], index=False)
        else:
            outcfg["distmap_multimer"] = None

        # at this point, also create remapped structures (e.g. for
        # later comparison of folding results)
        verify_resources("Target sequence file does not exist",
                         kwargs["target_sequence_file"])

        # create target sequence map for remapping structure
        with open(kwargs["target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # remap structures, swap mapping index and filename in
        # dictionary so we have a list of files in the dict keys
        outcfg["remapped_pdb_files"] = {
            filename: mapping_index
            for mapping_index, filename in remap_chains(
                sifts_map, aux_prefix, seqmap).items()
        }
    else:
        # if no structures, can not compute distance maps
        d_intra = None
        d_multimer = None
        outcfg["distmap_monomer"] = None
        outcfg["distmap_multimer"] = None
        outcfg["remapped_pdb_files"] = None

    # Step 3: Compare ECs to distance maps

    ec_table = pd.read_csv(kwargs["ec_file"])

    # identify number of sites in EC model
    num_sites = len(
        set.union(set(ec_table.i.unique()), set(ec_table.j.unique())))

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:
        # compare ECs only if we minimally have intra distance map
        if d_intra is not None:
            coupling_scores_compared(ec_table,
                                     d_intra,
                                     d_multimer,
                                     dist_cutoff=kwargs["distance_cutoff"],
                                     output_file=outcfg[out_file],
                                     min_sequence_dist=min_seq_dist)
        else:
            outcfg[out_file] = None

    # also create line-drawing script if we made the csv
    if outcfg["ec_compared_longrange_file"] is not None:
        ecs_longrange = pd.read_csv(outcfg["ec_compared_longrange_file"])

        outcfg[
            "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml"
        pairs.ec_lines_pymol_script(ecs_longrange.iloc[:num_sites, :],
                                    outcfg["ec_lines_compared_pml_file"],
                                    distance_cutoff=kwargs["distance_cutoff"])

    # Step 4: Make contact map plots
    # if no structures available, defaults to EC-only plot

    outcfg["contact_map_files"] = _make_contact_maps(ec_table, d_intra,
                                                     d_multimer, **kwargs)

    return outcfg
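
# Hypothetical usage sketch: the keyword values below are placeholders
# that mirror the keys listed in check_required above; in the real
# pipeline they come from the configuration file, and helper steps such
# as _identify_structures and _make_contact_maps may consume additional
# keys not shown here.
def _example_compare_standard():
    return standard(
        prefix="output/example/compare/example",
        ec_file="output/example/couplings/example_CouplingScores.csv",
        min_sequence_distance=6,
        pdb_mmtf_dir=None,
        atom_filter=None,
        compare_multimer=True,
        distance_cutoff=5,
        target_sequence_file="output/example/align/example.fa",
        scale_sizes=True,
    )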
Example #5
def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start,
                     **kwargs):
    """
    Apply pairwise identity filtering, fragment filtering, and exclusion
    of columns with too many gaps to a sequence alignment. Also generates
    files describing properties of the alignment such as frequency distributions,
    conservation, and "old-style" alignment statistics files.

    .. note::

        assumes focus alignment (otherwise unprocessed) as input.

    .. todo::

        come up with something more clever than a fixed width cutoff to filter
        fragments (e.g. use the 95% quantile of the length distribution as reference point)

    Parameters
    ----------
    focus_ali : Alignment
        Focus-mode input alignment
    target_seq_index : int
        Index of target sequence in alignment
    target_seq_id : str
        Identifier of target sequence (without range)
    region_start : int
        Index of first sequence position in target sequence
    kwargs : See required arguments in source code

    Returns
    -------
    outcfg : Dict
        File products generated by the function:

        * alignment_file
        * statistics_file
        * frequencies_file
        * identities_file
        * raw_focus_alignment_file
    ali : Alignment
        Final processed alignment
    """
    check_required(kwargs, [
        "prefix",
        "seqid_filter",
        "hhfilter",
        "minimum_sequence_coverage",
        "minimum_column_coverage",
        "compute_num_effective_seqs",
        "theta",
    ])

    prefix = kwargs["prefix"]

    create_prefix_folders(prefix)

    focus_fasta_file = prefix + "_raw_focus.fasta"

    outcfg = {
        "alignment_file": prefix + ".a2m",
        "statistics_file": prefix + "_alignment_statistics.csv",
        "frequencies_file": prefix + "_frequencies.csv",
        "identities_file": prefix + "_identities.csv",
        "raw_focus_alignment_file": focus_fasta_file,
    }

    # swap target sequence to first position if it is not
    # the first sequence in alignment;
    # this is particularly important for hhfilter run
    # because target sequence might otherwise be filtered out
    if target_seq_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = target_seq_index
        indices[target_seq_index] = 0
        target_seq_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    # apply pairwise identity filter (using hhfilter)
    if kwargs["seqid_filter"] is not None:
        filtered_file = prefix + "_filtered.a3m"

        at.run_hhfilter(focus_fasta_file,
                        filtered_file,
                        threshold=kwargs["seqid_filter"],
                        columns="first",
                        binary=kwargs["hhfilter"])

        with open(filtered_file) as f:
            focus_ali = Alignment.from_file(f, "a3m")

        # final FASTA alignment before applying A2M format modifications
        filtered_fasta_file = prefix + "_raw_focus_filtered.fasta"
        with open(filtered_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

    ali = focus_ali

    # filter fragments
    # come up with something more clever here than fixed width
    # (e.g. use 95% quantile of length distribution as reference point)
    min_cov = kwargs["minimum_sequence_coverage"]
    if min_cov is not None:
        if isinstance(min_cov, int):
            min_cov /= 100

        keep_seqs = (1 - ali.count("-", axis="seq")) >= min_cov
        ali = ali.select(sequences=keep_seqs)

    # Calculate frequencies, conservation and identity to query
    # on final alignment (except for lowercase modification)
    # Note: running hhfilter might cause a loss of the target sequence
    # if it is not the first sequence in the file! To be sure that
    # nothing goes wrong, target_seq_index should always be 0.
    describe_seq_identities(ali, target_seq_index=target_seq_index).to_csv(
        outcfg["identities_file"], float_format="%.3f", index=False)

    describe_frequencies(ali, region_start,
                         target_seq_index=target_seq_index).to_csv(
                             outcfg["frequencies_file"],
                             float_format="%.3f",
                             index=False)

    coverage_stats = describe_coverage(ali, prefix, region_start,
                                       kwargs["minimum_column_coverage"])

    # keep list of uppercase sequence positions in alignment
    pos_list = np.arange(region_start, region_start + ali.L, dtype="int32")

    # Make columns with too many gaps lowercase
    min_col_cov = kwargs["minimum_column_coverage"]
    if min_col_cov is not None:
        if isinstance(min_col_cov, int):
            min_col_cov /= 100

        lc_cols = ali.count(ali._match_gap, axis="pos") > 1 - min_col_cov
        ali = ali.lowercase_columns(lc_cols)

        # if we remove columns, we have to update list of positions
        pos_list = pos_list[~lc_cols]
    else:
        lc_cols = None

    # compute effective number of sequences
    # (this is intended for cases where coupling stage is
    # not run, but this number is wanted nonetheless)
    if kwargs["compute_num_effective_seqs"]:
        # make sure we only compute N_eff on the columns
        # that would be used for model inference, dispose
        # the rest
        if lc_cols is None:
            cut_ali = ali
        else:
            cut_ali = ali.select(columns=~lc_cols)

        # compute sequence weights
        cut_ali.set_weights(kwargs["theta"])

        # N_eff := sum of all sequence weights
        n_eff = float(cut_ali.weights.sum())

        # patch into coverage statistics (N_eff column)
        coverage_stats.loc[:, "N_eff"] = n_eff
    else:
        n_eff = None

    # save coverage statistics to file
    coverage_stats.to_csv(outcfg["statistics_file"],
                          float_format="%.3f",
                          index=False)

    # store description of final sequence alignment in outcfg
    # (note these parameters will be updated by couplings protocol)
    outcfg.update({
        "num_sites": len(pos_list),
        "num_sequences": len(ali),
        "effective_sequences": n_eff,
        "region_start": region_start,
    })

    # create segment in outcfg
    outcfg["segments"] = [
        Segment("aa", target_seq_id, region_start, region_start + ali.L - 1,
                pos_list).to_list()
    ]

    with open(outcfg["alignment_file"], "w") as f:
        ali.write(f, "fasta")

    return outcfg, ali
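
# Hypothetical usage sketch: focus_ali stands for an Alignment object that
# is already in focus mode; all other values are placeholders mirroring
# the keys listed in check_required above.
def _example_modify_alignment(focus_ali):
    return modify_alignment(
        focus_ali,
        target_seq_index=0,
        target_seq_id="EXAMPLE_ID",
        region_start=1,
        prefix="output/example/align/example",
        seqid_filter=None,            # skip the hhfilter step
        hhfilter=None,
        minimum_sequence_coverage=50,
        minimum_column_coverage=70,
        compute_num_effective_seqs=False,
        theta=0.8,
    )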
Example #6
def complex(**kwargs):
    """
    Protocol:

    Run monomer alignment protocol and postprocess it for
    EVcomplex calculations

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the alignment protocol, and
        the following additional field:

        genome_location_file : path to file containing
            the genomic locations for CDSs corresponding to
            identifiers in the alignment.

    """
    check_required(kwargs, [
        "prefix", "alignment_protocol", "uniprot_to_embl_table",
        "ena_genome_location_table"
    ])

    verify_resources("Uniprot to EMBL mapping table does not exist",
                     kwargs["uniprot_to_embl_table"])

    verify_resources("ENA genome location table does not exist",
                     kwargs["ena_genome_location_table"])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # run the regular alignment protocol
    # (standard, existing, ...)
    alignment_protocol = kwargs["alignment_protocol"]

    if alignment_protocol not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid choice for alignment protocol: {}".format(
                alignment_protocol))

    outcfg = PROTOCOLS[kwargs["alignment_protocol"]](**kwargs)

    # if the user selected the existing alignment protocol
    # they can supply an input annotation file
    # which overwrites the annotation file generated by the existing protocol
    if alignment_protocol == "existing":
        check_required(kwargs, ["override_annotation_file"])

        if kwargs["override_annotation_file"] is not None:
            verify_resources("Override annotation file does not exist",
                             kwargs["override_annotation_file"])

            outcfg["annotation_file"] = prefix + "_annotation.csv"
            annotation_data = pd.read_csv(kwargs["override_annotation_file"])
            annotation_data.to_csv(outcfg["annotation_file"])

    # extract cds identifiers for alignment uniprot IDs
    cds_ids = extract_cds_ids(outcfg["alignment_file"],
                              kwargs["uniprot_to_embl_table"])

    # extract genome location information from ENA
    genome_location_filename = prefix + "_genome_location.csv"

    genome_location_table = extract_embl_annotation(
        cds_ids, kwargs["ena_genome_location_table"], genome_location_filename)

    genome_location_table = add_full_header(genome_location_table,
                                            outcfg["alignment_file"])

    genome_location_table.to_csv(genome_location_filename)
    outcfg["genome_location_file"] = genome_location_filename

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_complex.outcfg", outcfg)

    return outcfg
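
# Hypothetical usage sketch: paths are placeholders. Only the keys listed
# in check_required above are shown explicitly; monomer_kwargs stands for
# the additional keys required by the chosen monomer alignment protocol
# and must not repeat the keys given here.
def _example_complex_alignment(**monomer_kwargs):
    return complex(
        prefix="output/example_complex/align/example",
        alignment_protocol="standard",
        uniprot_to_embl_table="databases/idmapping_uniprot_embl.txt",
        ena_genome_location_table="databases/cds_pro.txt",
        **monomer_kwargs,
    )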
Example #7
def hmmbuild_and_search(**kwargs):
    """
    Protocol:

    Build HMM from sequence alignment using hmmbuild and 
    search against a sequence database using hmmsearch.
    
    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists

        verify_resources("Input alignment does not exist",
                         input_alignment_file)

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file))

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined")

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id))

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given.")

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end

    # define the gap threshold for inclusion of columns in HMMs built by hmmbuild
    SYMFRAC_HMMBUILD = 0.0

    # check for required options
    check_required(kwargs, [
        "prefix", "sequence_id", "alignment_file", "use_bitscores",
        "domain_threshold", "sequence_threshold", "database", "cpu", "nobias",
        "reuse_alignment", "hmmbuild", "hmmsearch"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # prepare input alignment for hmmbuild
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            kwargs["alignment_file"], **kwargs
        )

    # run hmmbuild_and_search... allow reusing a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg"

    # determine whether to reuse previous results; only possible if they
    # were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for hmmsearch
        sequence_length = region_end - region_start + 1
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], sequence_length)

        # create the hmm
        hmmbuild_result = at.run_hmmbuild(
            alignment_file=focus_fasta_file,
            prefix=prefix,
            symfrac=SYMFRAC_HMMBUILD,
            cpu=kwargs["cpu"],
            binary=kwargs["hmmbuild"],
        )
        hmmfile = hmmbuild_result.hmmfile

        # run the alignment from the hmm
        ali = at.run_hmmsearch(
            hmmfile=hmmfile,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            binary=kwargs["hmmsearch"],
        )

        # get rid of huge stdout log file immediately
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())
        # only item from hmmsearch_result to save is the hmmfile
        ali["hmmfile"] = hmmfile

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "input_raw_focus_alignment": focus_fasta_file,
        "target_sequence_file": target_sequence_file,
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # convert the raw output alignment to fasta format
    # and add the appropriate query sequence
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix)
    outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
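
# Hypothetical usage sketch: file names and thresholds are placeholders
# mirroring the keys listed in check_required above. Note that "database"
# must name another kwarg that holds the actual database path (see
# kwargs[kwargs["database"]] in the protocol body).
def _example_hmmbuild_and_search():
    return hmmbuild_and_search(
        prefix="output/example/align/example",
        sequence_id="EXAMPLE_ID",
        alignment_file="input/example_alignment.fasta",
        first_index=1,
        use_bitscores=True,
        domain_threshold=0.5,
        sequence_threshold=0.5,
        database="sequence_database",
        sequence_database="databases/uniref100.fasta",
        cpu=2,
        nobias=False,
        reuse_alignment=True,
        hmmbuild="hmmbuild",
        hmmsearch="hmmsearch",
    )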
Example #8
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
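
# Hypothetical usage sketch: values are placeholders mirroring the keys
# listed in check_required above; alignment_file must point to a focus
# mode alignment in FASTA/A2M format.
def _example_mean_field():
    return mean_field(
        prefix="output/example/couplings/example",
        alignment_file="output/example/align/example.a2m",
        segments=None,
        focus_mode=True,
        focus_sequence="EXAMPLE_ID/1-100",
        theta=0.8,
        pseudo_count=0.5,
        alphabet=None,
        min_sequence_distance=6,
    )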
Example #9
def run_jobs(configs, global_config, overwrite=False, workdir=None):
    """
    Submit config to pipeline

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if there is only one job,
        the contents of this dictionary will be
        equal to the single entry of configs)
    overwrite : bool, optional (default: False)
        If True, overwrite the results of a previous run
        with the same prefix instead of raising an
        InvalidParameterError
    workdir : str, optional (default: None)
        Working directory that relative output prefixes
        will be joined to before any files are written
    """
    python = executable
    pipeline_path = path.abspath(pipeline.__file__)
    summarize_path = path.abspath(summarize.__file__)

    cmd_base = "{} {}".format(python, pipeline_path)
    summ_base = "{} {}".format(python, summarize_path)

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage)."
        )

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to the working
    # directory (above, we allow submitted jobs to run in an arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(
        summ_base,
        global_config["pipeline"],
        global_config["global"]["prefix"],
        " ".join(config_files)
    )

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(
        global_config["environment"]["engine"],
        db_path=out_prefix + "_job_database.txt"
    )

    # collect individual submitted jobs here
    commands = []

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # set job status in database to pending
        pipeline.update_job_status(job_cfg, status=database.EStatus.PEND)

        # create submission command
        env = job_cfg["environment"]
        cmd = utils.Command(
            [
                "{} {}".format(cmd_base, job_cfg_file),
                summ_cmd
            ],
            name=job_prefix,
            environment=env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            }
        )

        # store job for later dependency creation
        commands.append(cmd)

        # finally, submit job
        submitter.submit(cmd)

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()
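
# Hypothetical usage sketch: sub_cfgs stands for the per-subjob
# configuration dictionaries and global_cfg for the already-loaded master
# configuration, both produced earlier in the application; the call simply
# forwards them with overwrite protection enabled.
def _example_run_jobs(sub_cfgs, global_cfg):
    run_jobs(sub_cfgs, global_cfg, overwrite=False, workdir=None)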
Example #10
def run_hmmscan(query,
                database,
                prefix,
                use_model_threshold=True,
                threshold_type="cut_ga",
                use_bitscores=True,
                domain_threshold=None,
                seq_threshold=None,
                nobias=False,
                cpu=None,
                stdout_redirect=None,
                binary="hmmscan"):
    """
    Run hmmscan to search the sequences in query against the
    profile HMMs in database and identify matching HMMs.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence(s)
    database : str
        File containing HMM database (prepared with hmmpress)
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_model_threshold : bool, optional (default: True)
        Use model-specific inclusion thresholds from
        HMM database rather than global bitscore/E-value
        thresholds (use_bitscores, domain_threshold and
        seq_threshold are overridden by this flag).
    threshold_type : {"cut_ga", "cut_nc", "cut_tc"}, optional (default: "cut_ga")
        Use gathering (default), noise or trusted cutoff
        to define scan hits. Please refer to the HMMER manual
        for details.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
        Overridden by the use_model_threshold flag.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    binary : str (default: "hmmscan")
        Path to hmmscan binary (put in PATH for
        default to work)

    Returns
    -------
    HmmscanResult
        namedtuple with fields corresponding to the different
        output files (prefix, output, tblout, domtblout, pfamtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources("Input file does not exist or is empty", query, database)

    create_prefix_folders(prefix)

    result = HmmscanResult(
        prefix,
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout", prefix + ".domtblout", prefix + ".pfamtblout")

    cmd = [
        binary,
        "-o",
        result.output,
        "--tblout",
        result.tblout,
        "--domtblout",
        result.domtblout,
        "--pfamtblout",
        result.pfamtblout,
        "--notextw",
        "--acc",
    ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # either use model-specific threshold, or custom
    # bitscore/E-value thresholds
    if use_model_threshold:
        THRESHOLD_CHOICES = ["cut_ga", "cut_nc", "cut_tc"]
        if threshold_type not in THRESHOLD_CHOICES:
            raise ValueError("Invalid model threshold, valid choices are: " +
                             ", ".join(THRESHOLD_CHOICES))

        cmd += ["--" + threshold_type]
    else:
        if seq_threshold is None or domain_threshold is None:
            raise ValueError("Must define sequence- and domain-level reporting"
                             "thresholds, or use gathering threshold instead.")

        if use_bitscores:
            cmd += [
                "-T",
                str(seq_threshold),
                "--domT",
                str(domain_threshold),
            ]
        else:
            cmd += [
                "-E",
                str(seq_threshold),
                "--domE",
                str(domain_threshold),
            ]

    cmd += [database, query]

    return_code, stdout, stderr = run(cmd)

    # also check we actually created a table with hits
    verify_resources(
        "hmmscan did not return results: "
        "stdout={} stderr={} file={}".format(stdout, stderr, result.domtblout),
        result.domtblout)

    return result
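
# Hypothetical usage sketch (paths are placeholders): scan query sequences
# against an HMM database prepared with hmmpress, using the model-specific
# gathering thresholds. Assumes the hmmscan binary is on PATH, as in the
# default argument above.
scan_result = run_hmmscan(
    query="input/example_targets.fasta",
    database="databases/Pfam-A.hmm",
    prefix="output/example/hmmscan/example",
    use_model_threshold=True,
    threshold_type="cut_ga",
    cpu=2,
)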
Example #11
def run_jackhmmer(query,
                  database,
                  prefix,
                  use_bitscores,
                  domain_threshold,
                  seq_threshold,
                  iterations=5,
                  nobias=False,
                  cpu=None,
                  stdout_redirect=None,
                  checkpoints_hmm=False,
                  checkpoints_ali=False,
                  binary="jackhmmer"):
    """
    Run jackhmmer sequence search against target database.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    iterations : int, optional (default: 5)
        Number of jackhmmer search iterations
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing
        with rest of results (use "/dev/null" to dispose)
    checkpoints_hmm : bool, optional (default: False)
        Store checkpoint HMMs to prefix.<iter>.hmm
    checkpoints_ali : bool, optional (default: False)
        Store checkpoint alignments to prefix.<iter>.sto
    binary : str (default: "jackhmmer")
        Path to jackhmmer binary (put in PATH for
        default to work)

    Returns
    -------
    JackhmmerResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources("Input file does not exist or is empty", query, database)

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the
    # function.
    result = JackhmmerResult(
        prefix, prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout", prefix + ".domtblout")

    cmd = [
        binary, "-N",
        str(iterations), "-o", result.output, "-A", result.alignment,
        "--tblout", result.tblout, "--domtblout", result.domtblout, "--noali",
        "--notextw"
    ]

    # reporting thresholds are set according to the
    # inclusion thresholds to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T",
            str(seq_threshold), "--domT",
            str(domain_threshold), "--incT",
            str(seq_threshold), "--incdomT",
            str(domain_threshold)
        ]
    else:
        cmd += [
            "-E",
            str(seq_threshold), "--domE",
            str(domain_threshold), "--incE",
            str(seq_threshold), "--incdomE",
            str(domain_threshold)
        ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # save checkpoints for alignments and HMMs?
    if checkpoints_ali:
        cmd += ["--chkali", prefix]
    if checkpoints_hmm:
        cmd += ["--chkhmm", prefix]

    cmd += [query, database]

    return_code, stdout, stderr = run(cmd)

    # also check we actually created some sort of alignment
    verify_resources(
        "jackhmmer returned empty alignment: "
        "stdout={} stderr={} file={}".format(stdout, stderr, result.alignment),
        result.alignment)

    return result
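
# Hypothetical usage sketch (paths are placeholders): run a jackhmmer
# search with bitscore-based inclusion thresholds against a sequence
# database, keeping the default five iterations. Assumes the jackhmmer
# binary is on PATH, as in the default argument above.
search_result = run_jackhmmer(
    query="input/example_target.fasta",
    database="databases/uniref90.fasta",
    prefix="output/example/align/example",
    use_bitscores=True,
    domain_threshold=50,
    seq_threshold=50,
    cpu=2,
)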
Example #12
def complex(**kwargs):
    """
    Protocol:
    Mutation effect prediction and visualization for protein complexes

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * [mutation_dataset_predicted_file]
    """
    check_required(
        kwargs, ["prefix", "model_file", "mutation_dataset_file", "segments"])

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources("Model parameter file does not exist",
                     kwargs["model_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load segments to create couplings object
    segment_objects = []
    for segment_list in kwargs["segments"]:
        segment_objects.append(Segment.from_list(segment_list))

    first_segment_name = Segment.from_list(kwargs["segments"][0]).segment_id
    second_segment_name = Segment.from_list(kwargs["segments"][1]).segment_id

    first_chain_name = Segment.from_list(
        kwargs["segments"][0]).default_chain_name()
    second_chain_name = Segment.from_list(
        kwargs["segments"][1]).default_chain_name()

    # load couplings object
    c = MultiSegmentCouplingsModel(kwargs["model_file"], *segment_objects)

    # create the independent model
    c0 = c.to_independent_model()

    # create the inter-protein only Jij model
    ci = c.to_inter_segment_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent"),
                         (ci, "Inter_segment")]:
        # interactive plot using bokeh
        filename = prefix + "_{}_model".format(type_.lower(), )
        output_file(filename + ".html", "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh")
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add prediction by independent model and
    # save to file
    singles = single_mutant_matrix(c, output_column="prediction_epistatic")

    singles = predict_mutation_table(c0, singles, "prediction_independent")

    singles = predict_mutation_table(ci, singles, "prediction_inter_segment")

    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts
    outcfg["mutations_epistatic_pml_files"] = []
    for model in ["epistatic", "independent", "inter_segment"]:
        pml_filename = prefix + "_{}_model.pml".format(model)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles,
            pml_filename,
            effect_column="prediction_" + model,
            segment_to_chain_mapping={
                first_segment_name: first_chain_name,
                second_segment_name: second_chain_name
            })
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#", sep=",")

        if "segment" not in data.columns:
            raise ValueError("Input mutation dataset file does not contain "
                             "a column called 'segment' to specify the "
                             "protein of origin for each mutation")

        # add epistatic model prediction
        data_pred = predict_mutation_table(c, data, "prediction_epistatic")

        # add independent model prediction
        data_pred = predict_mutation_table(c0, data_pred,
                                           "prediction_independent")

        data_pred = predict_mutation_table(ci, data_pred, "inter_segment")

        outcfg[
            "mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv"
        data_pred.to_csv(outcfg["mutation_dataset_predicted_file"],
                         index=False)

    return outcfg
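
# Hypothetical usage sketch: paths are placeholders mirroring the keys
# listed in check_required above; segment_lists stands for the list of
# (at least two) segment definitions stored by the complex couplings
# stage, from which the Segment objects above are rebuilt.
def _example_complex_mutate(segment_lists):
    return complex(
        prefix="output/example_complex/mutate/example",
        model_file="output/example_complex/couplings/example.model",
        mutation_dataset_file=None,
        segments=segment_lists,
    )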
Example #13
def standard(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using plmc.

    .. todo::

        1. make EC enrichment calculation segment-ready
        2. explain meaning of parameters in detail.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
            "min_sequence_distance", # "save_model",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it as a
        # plmc parameter, since the default has some implementation
        # advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
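        # worked example (assuming defaults): the protein alphabet has 21
        # symbols including the gap, so for L = 200 match columns a base
        # lambda_J of 0.01 becomes 0.01 * (21 - 1) * (200 - 1) = 39.8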
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from a previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether we can reuse previous results; this is only
    # possible if they were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    # add mixture model probability
    ecs = pairs.add_mixture_probability(ecs)

    if segments is not None:  # and (len(segments) > 1 or not kwargs["focus_mode"]):
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    # write updated table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"]
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs)
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # load parameters
            c = CouplingsModel(outcfg["model_file"])

            # create JSON output and write to file
            f.write(
                evzoom_json(c) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
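A minimal sketch of driving this protocol directly rather than through the pipeline; the import path mirrors the upstream package layout and every value below is a placeholder assumption:

# minimal sketch, assuming the protocol module path used by the upstream
# package; all paths and parameter values are placeholders
from evcouplings.couplings.protocol import standard

couplings_kwargs = {
    "prefix": "output/example",
    "alignment_file": "output/example.a2m",
    "focus_mode": True,
    "focus_sequence": "EXAMPLE_SEQ/1-100",
    "theta": 0.8,
    "alphabet": None,             # None -> default protein alphabet
    "segments": None,
    "ignore_gaps": False,
    "iterations": 100,
    "lambda_h": 0.01,
    "lambda_J": 0.01,
    "lambda_J_times_Lq": True,    # scale lambda_J as shown above
    "lambda_group": None,
    "scale_clusters": None,
    "cpu": 2,
    "plmc": "plmc",               # path to plmc binary
    "reuse_ecs": False,
    "min_sequence_distance": 6,
}

outcfg = standard(**couplings_kwargs)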
Example #14
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are simply passed through this stage;
        # decide whether they need to be returned here at all
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    # write the sorted ECs table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"],
                score_column="cn"  # "di
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs, score="cn")  # "di"
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # create JSON output and write to file
            f.write(
                evzoom_json(model) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
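The mean-field core used above can also be exercised on its own; a minimal sketch reusing only the calls shown in this protocol (import paths and file names are assumptions):

# minimal sketch of the mean-field core; import paths are assumptions
from evcouplings.align import Alignment
from evcouplings.couplings import MeanFieldDCA

# focus-mode a2m alignment (mean-field DCA is focus mode only, see above)
with open("example.a2m") as f:
    aln = Alignment.from_file(f, format="fasta")

mf_dca = MeanFieldDCA(aln)
model = mf_dca.fit(theta=0.8, pseudo_count=0.5)

# write raw EC scores and fitted parameters (plmc_v2 format)
model.to_raw_ec_file("example_ECs.txt")
model.to_file("example.model", file_format="plmc_v2")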
Example #15
def genome_distance(**kwargs):
    """
    Protocol:

    Concatenate alignments based on genomic distance

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * alignment_file
        * raw_alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * frequencies_file
        * identities_file
        * num_sequences
        * num_sites
        * raw_focus_alignment_file
        * statistics_file

    """

    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_region_start", "second_region_start",
            "first_segments", "second_segments",
            "genome_distance_threshold",
            "first_genome_location_file",
            "second_genome_location_file",
            "first_annotation_file",
            "second_annotation_file"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    verify_resources(
        "Genome location file does not exist",
        kwargs["first_genome_location_file"],
        kwargs["second_genome_location_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load the information for each monomer alignment
    alignment_1 = kwargs["first_alignment_file"]
    alignment_2 = kwargs["second_alignment_file"]

    genome_location_filename_1 = kwargs["first_genome_location_file"]
    genome_location_filename_2 = kwargs["second_genome_location_file"]

    gene_location_table_1 = pd.read_csv(genome_location_filename_1, header=0)
    gene_location_table_2 = pd.read_csv(genome_location_filename_2, header=0)

    # find all possible matches
    possible_partners = find_possible_partners(
        gene_location_table_1, gene_location_table_2
    )

    # find the best reciprocal matches
    id_pairing_unfiltered = best_reciprocal_matching(possible_partners)

    # filter best reciprocal matches by genome distance threshold
    if kwargs["genome_distance_threshold"]:
        distance_threshold = kwargs["genome_distance_threshold"]
        id_pairing = id_pairing_unfiltered.query("distance < @distance_threshold")
    else:
        id_pairing = id_pairing_unfiltered

    id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"]
    id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"]

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            id_pairing,
            alignment_1,
            alignment_2,
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    # filter the alignment
    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    # plot the genome distance distribution
    outcfg["distance_plot_file"] = prefix + "_distplot.pdf"
    plot_distance_distribution(id_pairing_unfiltered, outcfg["distance_plot_file"])

    return outcfg
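The distance filter and identifier renaming above boil down to a small pandas operation; a self-contained toy illustration:

import pandas as pd

# toy stand-in for id_pairing_unfiltered as produced by best_reciprocal_matching
id_pairing_unfiltered = pd.DataFrame({
    "uniprot_id_1": ["P1", "P2", "P3"],
    "uniprot_id_2": ["Q1", "Q2", "Q3"],
    "distance": [120, 8000, 450],
})

distance_threshold = 1000
# .copy() just avoids pandas' chained-assignment warning in this toy example
id_pairing = id_pairing_unfiltered.query("distance < @distance_threshold").copy()

# rename to the generic identifier columns used downstream
id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"]
id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"]
# -> pairs (P1, Q1) and (P3, Q3) remain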
Example #16
def run_jobs(configs,
             global_config,
             overwrite=False,
             workdir=None,
             abort_on_error=True,
             environment=None):
    """
    Submit config to pipeline

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if only one job,
        the contents of this dictionary will be
        equal to the single element of config_files)
    overwrite : bool, optional (default: False)
        If True, allows overwriting previous run of the same
        config, otherwise will fail if results from previous
        execution are present
    workdir : str, optional (default: None)
        Workdir in which to run job (will combine
        workdir and prefix in joint path)
    abort_on_error : bool, optional (default: True)
        Abort entire job submission if error occurs for
        one of the jobs by propagating RuntimeError
    environment : str, optional (default: None)
        Allows passing a value for the submitter's environment
        parameter, overriding environment.configuration from
        global_config (e.g., for setting environment variables
        like passwords)

    Returns
    -------
    job_ids : dict
        Mapping from subjob prefix (keys in configs parameter)
        to identifier returned by submitter for each of the jobs
        that was *successfully* submitted (i.e. missing keys from
        configs param indicate these jobs could not be submitted).

    Raises
    ------
    RuntimeError
        If error encountered during submission and abort_on_error
        is True
    """
    cmd_base = environ.get("EVCOUPLINGS_RUNCFG_APP") or "evcouplings_runcfg"
    summ_base = environ.get(
        "EVCOUPLINGS_SUMMARIZE_APP") or "evcouplings_summarize"

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage).")

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to the working
    # directory (above, we allow submitted jobs to run in an arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(summ_base, global_config["pipeline"],
                                    global_config["global"]["prefix"],
                                    " ".join(config_files))

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(global_config["environment"]["engine"],
                                       db_path=out_prefix +
                                       "_job_database.txt")

    # collect individual submitted jobs here
    commands = []

    # record subjob IDs returned by submitter for each job
    job_ids = {}

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # create submission command
        env = job_cfg["environment"]
        cmd = utils.Command(
            ["{} {}".format(cmd_base, job_cfg_file), summ_cmd],
            name=job_prefix,
            environment=environment or env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            })

        # store job for later dependency creation
        commands.append(cmd)

        tracker = get_result_tracker(job_cfg)

        try:
            # finally, submit job
            current_job_id = submitter.submit(cmd)

            # store run identifier returned by submitter
            # TODO: consider storing current_job_id using tracker right away
            job_ids[job] = current_job_id

            # set job status in database to pending
            tracker.update(status=EStatus.PEND)

        except RuntimeError as e:
            # set job as failed in database
            tracker.update(status=EStatus.FAIL, message=str(e))

            # fail entire job submission if requested
            if abort_on_error:
                raise

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()

    # return job identifiers
    return job_ids
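A schematic of the inputs run_jobs expects, derived purely from the dictionary accesses above; the engine name, pipeline name, and all values are placeholder assumptions, and the call itself is left commented out since it would actually submit jobs:

# schematic input for run_jobs; values are placeholder assumptions
subjob_env = {
    "engine": "local",        # submission engine for SubmitterFactory
    "configuration": None,    # extra environment setup passed to the submitter
    "queue": None,
    "time": None,
    "memory": None,
    "cores": 2,
}

job_cfg = {
    "global": {"prefix": "run/example_1"},
    "environment": subjob_env,
    # ... remaining sections of the unrolled subjob configuration
}

global_config = {
    "pipeline": "protein_monomer",   # assumed pipeline name
    "global": {"prefix": "run/example"},
    "environment": subjob_env,
    # ... remaining sections shared across subjobs
}

# job_ids = run_jobs({"run/example_1": job_cfg}, global_config)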
Example #17
def best_hit(**kwargs):
    """
    Protocol:

    Concatenate alignments based on the best hit 
    to the focus sequence in each species

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        alignment_file
        raw_alignment_file
        focus_mode
        focus_sequence
        segments
        frequencies_file
        identities_file
        num_sequences
        num_sites
        raw_focus_alignment_file
        statistics_file
    """
    check_required(
        kwargs,
        [
            "prefix",
            "first_alignment_file", "second_alignment_file",
            "first_focus_sequence", "second_focus_sequence",
            "first_focus_mode", "second_focus_mode",
            "first_segments", "second_segments",
            "first_identities_file", "second_identities_file",
            "first_annotation_file", "second_annotation_file",
            "use_best_reciprocal", "paralog_identity_threshold"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure input alignments exist
    verify_resources(
        "Input alignment does not exist",
        kwargs["first_alignment_file"], kwargs["second_alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    def _load_monomer_info(annotations_file, identities_file,
                           target_sequence, alignment_file,
                           use_best_reciprocal, identity_threshold):

        # read in the annotation file and rename the appropriate column
        annotation_table = read_species_annotation_table(annotations_file)

        # read identity file
        similarities = pd.read_csv(identities_file)

        # create a pd.DataFrame containing the best hit in each organism
        most_similar_in_species = most_similar_by_organism(similarities, annotation_table)

        if use_best_reciprocal:
            paralogs = find_paralogs(
                target_sequence, annotation_table, similarities,
                identity_threshold
            )

            most_similar_in_species = filter_best_reciprocal(
                alignment_file, paralogs, most_similar_in_species
            )

        return most_similar_in_species

    # load the information about each monomer alignment
    most_similar_in_species_1 = _load_monomer_info(
        kwargs["first_annotation_file"],
        kwargs["first_identities_file"],
        kwargs["first_focus_sequence"],
        kwargs["first_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    most_similar_in_species_2 = _load_monomer_info(
        kwargs["second_annotation_file"],
        kwargs["second_identities_file"],
        kwargs["second_focus_sequence"],
        kwargs["second_alignment_file"],
        kwargs["use_best_reciprocal"],
        kwargs["paralog_identity_threshold"]
    )

    # merge the two dataframes to get all species found in 
    # both alignments
    species_intersection = most_similar_in_species_1.merge(
        most_similar_in_species_2,
        how="inner",  # takes the intersection
        on="species",  # merges on species identifiers
        suffixes=("_1", "_2")
    )

    # write concatenated alignment with distance filtering
    # TODO: save monomer alignments?
    target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \
        write_concatenated_alignment(
            species_intersection,
            kwargs["first_alignment_file"],
            kwargs["second_alignment_file"],
            kwargs["first_focus_sequence"],
            kwargs["second_focus_sequence"]
        )

    # save the alignment files
    raw_alignment_file = prefix + "_raw.fasta"
    with open(raw_alignment_file, "w") as of:
        raw_ali.write(of)

    mon_alignment_file_1 = prefix + "_monomer_1.fasta"
    with open(mon_alignment_file_1, "w") as of:
        mon_ali_1.write(of)

    mon_alignment_file_2 = prefix + "_monomer_2.fasta"
    with open(mon_alignment_file_2, "w") as of:
        mon_ali_2.write(of)

    aln_outcfg, _ = modify_alignment(
        raw_ali,
        target_seq_index,
        target_seq_id,
        kwargs["first_region_start"],
        **kwargs
    )

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f
    outcfg = aln_outcfg
    outcfg["raw_alignment_file"] = raw_alignment_file
    outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1
    outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2
    outcfg["focus_sequence"] = target_seq_id

    # Update the segments
    outcfg = modify_complex_segments(outcfg, **kwargs)

    # Describe the statistics of the concatenation
    outcfg = _run_describe_concatenation(outcfg, **kwargs)

    return outcfg
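The species intersection above is a plain inner merge; a self-contained toy example showing how the _1/_2 suffixes arise:

import pandas as pd

# toy stand-ins for the per-monomer "best hit per species" tables
most_similar_in_species_1 = pd.DataFrame({
    "species": ["Escherichia coli", "Bacillus subtilis"],
    "id": ["A1", "A2"],
})
most_similar_in_species_2 = pd.DataFrame({
    "species": ["Escherichia coli", "Homo sapiens"],
    "id": ["B1", "B2"],
})

# inner merge keeps only species present in both alignments;
# clashing column names receive the _1/_2 suffixes
species_intersection = most_similar_in_species_1.merge(
    most_similar_in_species_2, how="inner", on="species", suffixes=("_1", "_2")
)
# -> single row for Escherichia coli with columns species, id_1, id_2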
Example #18
def standard(**kwargs):
    """
    Protocol:
    Predict the effects of point mutations from a couplings
    model (epistatic and site-independent predictions)

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * [mutation_dataset_predicted_file]
    """
    check_required(kwargs, [
        "prefix",
        "model_file",
        "mutation_dataset_file",
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources("Model parameter file does not exist",
                     kwargs["model_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load couplings object, and create independent model
    c = CouplingsModel(kwargs["model_file"])
    c0 = c.to_independent_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent")]:
        # interactive plot using bokeh
        filename = prefix + "_{}_model".format(type_.lower())
        output_file(filename + ".html", "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh")
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add prediction by independent model and
    # save to file
    singles = single_mutant_matrix(c, output_column="prediction_epistatic")

    singles = predict_mutation_table(c0, singles, "prediction_independent")

    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # Pymol scripts
    outcfg["mutations_epistatic_pml_files"] = []
    for model in ["epistatic", "independent"]:
        pml_filename = prefix + "_{}_model.pml".format(model)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles, pml_filename, effect_column="prediction_" + model)
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#")

        # add epistatic model prediction
        data_pred = predict_mutation_table(c, data, "prediction_epistatic")

        # add independent model prediction
        data_pred = predict_mutation_table(c0, data_pred,
                                           "prediction_independent")

        outcfg["mutation_dataset_predicted_file"] = (
            prefix + "_dataset_predicted.csv"
        )
        data_pred.to_csv(outcfg["mutation_dataset_predicted_file"],
                         index=False)

    return outcfg
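A minimal sketch of the core mutation-effect calls used above, outside the pipeline; the import paths are assumptions and the model file is a placeholder:

# minimal sketch; import paths are assumptions based on the package layout
from evcouplings.couplings import CouplingsModel
from evcouplings.mutate import single_mutant_matrix, predict_mutation_table

c = CouplingsModel("example.model")
c0 = c.to_independent_model()

# all single substitutions scored by the epistatic model ...
singles = single_mutant_matrix(c, output_column="prediction_epistatic")

# ... plus the corresponding site-independent predictions
singles = predict_mutation_table(c0, singles, "prediction_independent")
singles.to_csv("single_mutant_matrix.csv", index=False)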
Example #19
def execute(**config):
    """
    Execute a pipeline configuration

    Parameters
    ----------
    **config
        Input configuration for pipeline
        (see pipeline config files for
        example of how this should look like)

    Returns
    -------
    global_state : dict
        Global output state of pipeline
    """
    check_required(config, ["pipeline", "stages", "global"])

    # check if valid pipeline was selected
    if config["pipeline"] not in PIPELINES:
        raise InvalidParameterError("Not a valid pipeline selection. "
                                    "Valid choices are:\n{}".format(", ".join(
                                        PIPELINES.keys())))

    stages = config["stages"]
    if stages is None:
        raise InvalidParameterError("No stages defined, need at least one.")

    # get definition of selected pipeline
    pipeline = PIPELINES[config["pipeline"]]
    prefix = config["global"]["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this is the global state of results as
    # we move through different stages of
    # the pipeline
    global_state = config["global"]

    # keep track of how many stages are still
    # to be run, so we can leave out stages at
    # the end of workflow below
    num_stages_to_run = len(stages)

    # get job tracker
    tracker = get_result_tracker(config)

    # set job status to running and also initialize global state
    tracker.update(status=EStatus.RUN, results=global_state)

    # iterate through individual stages
    for (stage, runner, key_prefix) in pipeline:
        # check if anything else is left to
        # run, otherwise skip
        if num_stages_to_run == 0:
            break

        # check if config for stage is there
        check_required(config, [stage])

        # output files for stage into an individual folder
        stage_prefix = insert_dir(prefix, stage)
        create_prefix_folders(stage_prefix)

        # config files for input and output of stage
        stage_incfg = "{}_{}.incfg".format(stage_prefix, stage)
        stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage)

        # update current stage of job
        tracker.update(stage=stage)

        # check if stage should be executed
        if stage in stages:
            # global state inserted at end, overrides any
            # stage-specific settings (except for custom prefix)
            incfg = {
                **config["tools"],
                **config["databases"],
                **config[stage],
                **global_state, "prefix": stage_prefix
            }
            # save input of stage in config file
            write_config_file(stage_incfg, incfg)

            # run stage
            outcfg = runner(**incfg)

            # prefix output keys if this parameter is
            # given in stage configuration, to avoid
            # name clashes if same protocol run multiple times
            if key_prefix is not None:
                outcfg = {key_prefix + k: v for k, v in outcfg.items()}

            # save output of stage in config file
            write_config_file(stage_outcfg, outcfg)

            # one less stage to put through after we ran this...
            num_stages_to_run -= 1
        else:
            # skip stage by injecting output state from the previous run
            verify_resources(
                "Trying to skip, but output configuration "
                "for stage '{}' does not exist. Has it already "
                "been run?".format(stage), stage_outcfg)

            # read output configuration
            outcfg = read_config_file(stage_outcfg)

            # verify all the output files are there
            outfiles = [
                filepath for f, filepath in outcfg.items()
                if f.endswith("_file") and filepath is not None
            ]

            verify_resources(
                "Output files from stage '{}' "
                "missing".format(stage), *outfiles)

        # update global state with outputs of stage
        global_state = {**global_state, **outcfg}

        # update state in tracker accordingly
        tracker.update(results=outcfg)

    # create results archive
    archive_file = create_archive(config, global_state, prefix)

    # only store results archive if a result file was created
    if archive_file is not None:
        global_state["archive_file"] = archive_file

        # prepare update for tracker, but only store in last
        # go when job is set to done
        tracker_archive_update = {"archive_file": archive_file}
    else:
        tracker_archive_update = None

    # set job status to done and transfer archive if selected for syncing
    tracker.update(status=EStatus.DONE, results=tracker_archive_update)

    # delete selected output files if requested;
    # tracker does not need to update here since it won't
    # sync entries of delete list in the first place
    global_state = delete_outputs(config, global_state)

    # write final global state of pipeline
    write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state)

    return global_state
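A schematic of the configuration dictionary execute() consumes, reconstructed from the accesses above; section contents are placeholders and real runs are normally driven from the full sample configuration files:

# schematic configuration for execute(); all values are placeholders
config = {
    "pipeline": "protein_monomer",       # must be a key in PIPELINES
    "stages": ["align", "couplings"],    # stages to actually (re)run
    "global": {"prefix": "output/example"},
    "tools": {},        # paths to external binaries (jackhmmer, plmc, ...)
    "databases": {},    # paths to sequence databases
    "align": {},        # per-stage parameter sections
    "couplings": {},
    # ... management/environment sections used by the result tracker
}

# global_state = execute(**config)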
Example #20
def standard(**kwargs):
    """
    Protocol:
    Predict 3D structure from evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * folded_structure_files
    """
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query("abs(i - j) > {}".format(
        kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise derive it from the EC values (could be misleading
            # if the EC list is truncated, so this is only the fallback)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]
    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs,
                     "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of plots to make
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (
                ecs_fold.iloc[:c],
                "_{}".format(c)
            )
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory; after folding,
    # final models will be moved to the main folding dir. Depending
    # on the cleanup setting, the aux directory will then be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    results = pool.starmap(folder, folding_runs)

    # make double sure that the pool is cleaned up,
    # or SIGTERM upon exit will interfere with
    # interrupt signal interception
    pool.close()
    pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for name, file_path in folded_files.items():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # store comparison file, mapped back to the original entry from remapped_pdb_files
            ind_comp_files[ind_file] = experimental_files[filename]
            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
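The count-based sub-runs above turn fractional cutoffs into absolute EC numbers; a self-contained illustration with assumed parameter values:

from math import ceil

num_sites = 100  # hypothetical number of positions covered by ECs

def discrete_count(x):
    # fractions of the number of sites become absolute EC counts
    if isinstance(x, float):
        x = ceil(x * num_sites)
    return int(x)

lowest = discrete_count(0.5)    # fold_lowest_count  -> 50 ECs
highest = discrete_count(1.3)   # fold_highest_count -> 130 ECs
step = discrete_count(0.05)     # fold_increase      -> 5 ECs

ec_counts = list(range(lowest, highest + 1, step))
# -> one folding run each for the top 50, 55, ..., 130 ECs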
Example #21
def infer_plmc(**kwargs):
    """
    Run EC computation on alignment. This function contains
    the functionality shared between monomer and complex EC
    inference.
    
    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required
    
    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        raw_ec_file
        model_file
        num_sites
        num_valid_sequences
        effective_sequences

        focus_mode (passed through)
        focus_sequence (passed through)
        segments (passed through)
    ecs : pandas.DataFrame
        Table of evolutionary couplings read from raw_ec_file
        (segment-mapped if segments were given)
    segments : list of mapping.Segment, or None
        Parsed segment definitions
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "focus_mode", "focus_sequence", "theta",
            "alphabet", "segments", "ignore_gaps", "iterations",
            "lambda_h", "lambda_J", "lambda_group",
            "scale_clusters",
            "cpu", "plmc", "reuse_ecs",
        ]
    )

    prefix = kwargs["prefix"]

    # for now disable option to not save model, since
    # otherwise mutate stage will crash. To remove model
    # file at end, use delete option in management section.
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # the following are passed through stage...
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # regularization strength on couplings J_ij
    lambda_J = kwargs["lambda_J"]

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # first determine size of alphabet;
    # default is amino acid alphabet
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
        alphabet_setting = None
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

        # if we have the protein alphabet, do not pass it as a
        # plmc parameter, since the default has some implementation
        # advantages for focus mode
        if alphabet == ALPHABET_PROTEIN:
            alphabet_setting = None
        else:
            alphabet_setting = alphabet

    # scale lambda_J to proportionally compensate
    # for higher number of J_ij compared to h_i?
    if kwargs["lambda_J_times_Lq"]:
        num_symbols = len(alphabet)

        # if we ignore gaps, there is one character less
        if kwargs["ignore_gaps"]:
            num_symbols -= 1

        # second, determine number of uppercase positions
        # that are included in the calculation
        with open(kwargs["alignment_file"]) as f:
            seq_id, seq = next(read_fasta(f))

        # gap character is by convention first char in alphabet
        gap = alphabet[0]
        uppercase = [
            c for c in seq if c == c.upper() or c == gap
        ]
        L = len(uppercase)

        # finally, scale lambda_J
        lambda_J *= (num_symbols - 1) * (L - 1)

    # run plmc... or reuse pre-existing results from a previous run
    plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg"

    # determine whether we can reuse previous results; this is only
    # possible if they were stored in plm_outcfg_file
    if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file):
        plmc_result = read_config_file(plm_outcfg_file)

        # check if the EC/parameter files are there
        required_files = [outcfg["raw_ec_file"]]

        if outcfg["model_file"] is not None:
            required_files += [outcfg["model_file"]]

        verify_resources(
            "Tried to reuse ECs, but empty or "
            "does not exist",
            *required_files
        )

    else:
        # run plmc binary
        plmc_result = ct.run_plmc(
            kwargs["alignment_file"],
            outcfg["raw_ec_file"],
            outcfg["model_file"],
            focus_seq=kwargs["focus_sequence"],
            alphabet=alphabet_setting,
            theta=kwargs["theta"],
            scale=kwargs["scale_clusters"],
            ignore_gaps=kwargs["ignore_gaps"],
            iterations=kwargs["iterations"],
            lambda_h=kwargs["lambda_h"],
            lambda_J=lambda_J,
            lambda_g=kwargs["lambda_group"],
            cpu=kwargs["cpu"],
            binary=kwargs["plmc"],
        )

        # save iteration table to file
        iter_table_file = prefix + "_iteration_table.csv"
        plmc_result.iteration_table.to_csv(
            iter_table_file
        )

        # turn namedtuple into dictionary to make
        # restarting code nicer
        plmc_result = dict(plmc_result._asdict())

        # then replace table with filename so
        # we can store results in config file
        plmc_result["iteration_table"] = iter_table_file

        # save results of search for possible restart
        write_config_file(plm_outcfg_file, plmc_result)

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": plmc_result["num_valid_sites"],
        "num_valid_sequences": plmc_result["num_valid_seqs"],
        "effective_sequences": plmc_result["effective_samples"],
        "region_start": plmc_result["region_start"],
    })

    # read and sort ECs
    ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"])

    if segments is not None:
        # create index mapping
        seg_mapper = mapping.SegmentIndexMapper(
            kwargs["focus_mode"], outcfg["region_start"], *segments
        )

        # apply to EC table
        ecs = mapping.segment_map_ecs(ecs, seg_mapper)

    return outcfg, ecs, segments
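Unlike the monomer protocol above, infer_plmc hands the EC table and segments back to its caller; a minimal, hedged sketch of the remaining steps such a caller would perform (the pairs import path is an assumption, and couplings_kwargs stands for a kwargs dictionary like the one sketched earlier):

# sketch only: couplings_kwargs stands for a kwargs dict as in the
# standard() example above; the import path is an assumption
from evcouplings.couplings import pairs

outcfg, ecs, segments = infer_plmc(**couplings_kwargs)

# the caller adds scores/probabilities and writes the final EC table itself
ecs = pairs.add_mixture_probability(ecs)
ecs.to_csv(outcfg["ec_file"], index=False)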
Example #22
def secondary_structure(**kwargs):
    """
    Predict or load secondary structure for an
    input sequence

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    residues : pandas.DataFrame
        Table with sequence and secondary structure
        in columns i, A_i and sec_struct_3state
    """
    check_required(
        kwargs,
        [
            "prefix", "target_sequence_file",
            "segments", "sec_struct_method",
            "sec_struct_file", "psipred",
        ]
    )

    prefix = kwargs["prefix"]
    create_prefix_folders(prefix)

    secstruct_file = kwargs["sec_struct_file"]
    if secstruct_file is not None:
        verify_resources(
            "Secondary structure prediction file does not exist/is empty",
            secstruct_file
        )
        residues = pd.read_csv(secstruct_file)
    else:
        # make sure target sequence file is there so we can
        # predict secondary structure
        target_seq_file = kwargs["target_sequence_file"]
        verify_resources(
            "Sequence file does not exist/is empty", target_seq_file
        )

        # we need to figure out what the index of the first residue
        # in the target sequence is; obtain first index from segment
        # information if possible
        if kwargs["segments"] is not None:
            s = Segment.from_list(kwargs["segments"][0])
            first_index = s.region_start
        else:
            # otherwise try to get it from sequence file
            first_index = None

            with open(target_seq_file) as f:
                header, _ = next(read_fasta(f))
                if header is not None:
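                    # header is assumed to follow the id/start-end convention,
                    # e.g. "EXAMPLE_SEQ/25-300" would yield first_index = 25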
                    _, first_index, _ = parse_header(header)

                # if we cannot identify first index from header,
                # do not make guesses but fail
                if first_index is None:
                    raise InvalidParameterError(
                        "Could not unambiguously identify sequence range from "
                        "FASTA header, needs to specified as id/start-end: {}".format(
                            header
                        )
                    )

        # finally, run secondary structure prediction
        if kwargs["sec_struct_method"] == "psipred":
            # store psipred output in a separate directory
            output_dir = path.join(path.dirname(prefix), "psipred")

            # run psipred
            ss2_file, horiz_file = run_psipred(
                target_seq_file, output_dir, binary=kwargs["psipred"]
            )

            # parse output, renumber to first index
            residues = read_psipred_prediction(
                horiz_file, first_index=first_index
            )
        else:
            raise InvalidParameterError(
                "Secondary structure prediction method not implemented: "
                "{}. Valid choices: psipred".format(kwargs["sec_struct_method"])
            )

    # return predicted table
    return residues
Example #23
def standard(**kwargs):
    """
    Protocol:

    Standard buildali4 workflow (run iterative jackhmmer
    search against sequence database, then determine which
    sequences and columns to include in the calculation based
    on coverage and maximum gap thresholds).

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * alignment_file
        * raw_alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * target_sequence_file
        * sequence_file
        * annotation_file
        * frequencies_file
        * identities_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments

    ali : Alignment
        Final sequence alignment

    """
    check_required(kwargs, [
        "prefix",
        "extract_annotation",
    ])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # first step of protocol is to get alignment using
    # jackhmmer; initialize output configuration with
    # results of this search
    jackhmmer_outcfg = jackhmmer_search(**kwargs)
    stockholm_file = jackhmmer_outcfg["raw_alignment_file"]

    segment = Segment.from_list(jackhmmer_outcfg["segments"][0])
    target_seq_id = segment.sequence_id
    region_start = segment.region_start
    region_end = segment.region_end

    # read in stockholm format (with full annotation)
    with open(stockholm_file) as a:
        ali_raw = Alignment.from_file(a, "stockholm")

    # and store as FASTA file first (disabled for now,
    # since equivalent information can easily be obtained
    # from the Stockholm file)
    """
    ali_raw_fasta_file = prefix + "_raw.fasta"
    with open(ali_raw_fasta_file, "w") as f:
        ali_raw.write(f, "fasta")
    """

    # initialize annotation file to None so the merge into outcfg
    # below is safe even if annotation extraction is disabled
    annotation_file = None

    # save annotation in sequence headers (species etc.)
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        annotation = extract_header_annotation(ali_raw)
        annotation.to_csv(annotation_file, index=False)

    # center alignment around focus/search sequence
    focus_cols = np.array([c != "-" for c in ali_raw[0]])
    focus_ali = ali_raw.select(columns=focus_cols)

    target_seq_index = 0
    mod_outcfg, ali = modify_alignment(focus_ali, target_seq_index,
                                       target_seq_id, region_start, **kwargs)

    # merge results of jackhmmer_search and modify_alignment stage
    outcfg = {
        **jackhmmer_outcfg,
        **mod_outcfg,
        "annotation_file": annotation_file
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_standard.outcfg", outcfg)

    # return results of protocol
    return outcfg
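A minimal sketch (hypothetical prefix; assumes the protocol above has been run) of inspecting the output configuration it dumps as YAML:

import yaml

prefix = "output/example"  # hypothetical prefix used for the protocol run

with open(prefix + ".align_standard.outcfg") as f:
    outcfg = yaml.safe_load(f)

print(outcfg["alignment_file"], outcfg["focus_sequence"])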
示例#24
0
def genome_distance(**kwargs):
    """
    Protocol:

    Concatenate alignments based on genomic distance

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list in the check_required call below

        .. todo::

            Explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        .. todo::

            this is the full list normally returned by alignment protocol, decide which ones to keep.
            Mandatory:

            * alignment_file
            * focus_sequence
            * focus_mode
            * segments

        * alignment_file
        * [raw_alignment_file]
        * statistics_file
        * target_sequence_file
        * sequence_file
        * [annotation_file]
        * frequencies_file
        * identities_file
        * [hittable_file]
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix",
        "first_raw_focus_alignment_file",
        "second_raw_focus_alignment_file",
        "first_focus_sequence",
        "second_focus_sequence",
        "first_focus_mode",
        "second_focus_mode",
        "first_segments",
        "second_segments",
    ])

    prefix = kwargs["prefix"]

    # make sure input alignment files exist
    verify_resources("Input alignment does not exist",
                     kwargs["first_raw_focus_alignment_file"],
                     kwargs["second_raw_focus_alignment_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # -------------------------------------------------
    # TODO: implement concatenation functionality and
    # postprocessing functionality here
    # -------------------------------------------------

    def _modify_segments(seg_list, seg_prefix):
        # extract segments from list representation into objects
        segs = [Segment.from_list(s) for s in seg_list]
        # update segment IDs
        for i, s in enumerate(segs, start=1):
            s.segment_id = "{}_{}".format(seg_prefix, i)

        return segs

    # merge segments - this allows more than one segment per
    # "monomer" alignment
    segments_1 = _modify_segments(kwargs["first_segments"], "A")
    segments_2 = _modify_segments(kwargs["second_segments"], "B")
    segments_complex = segments_1 + segments_2

    # make sure we return all the necessary information:
    # * alignment_file: final concatenated alignment that will go into plmc
    # * focus_sequence: this is the identifier of the concatenated target
    #   sequence which will be passed into plmc with -f

    outcfg = {
        "alignment_file": None,  # TODO: specify
        "focus_mode": True,
        "focus_sequence": None,  # TODO: specify
        "segments": [s.to_list() for s in segments_complex],
        # optional but good to have:
        "num_sites": None,
        "num_sequences": None,
        # "effective_sequences": n_eff # TODO: could compute this like in align stage
        # TODO: there are more outputs that we could add here (not mandatory),
        # e.g. single column frequencies in concatenated alignment
    }

    return outcfg
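A simplified sketch of the segment re-labelling done by _modify_segments above, using plain lists as stand-ins for Segment objects (the list layout is illustrative only, all values invented):

# hypothetical monomer segments in a simplified list form
first_segments = [["aa", "P12345", 1, 100], ["aa", "P12345", 120, 180]]
second_segments = [["aa", "Q67890", 5, 90]]

def relabel(seg_list, seg_prefix):
    # mimic _modify_segments: A_1, A_2, ... / B_1, B_2, ...
    return [
        ["{}_{}".format(seg_prefix, i)] + seg
        for i, seg in enumerate(seg_list, start=1)
    ]

segments_complex = relabel(first_segments, "A") + relabel(second_segments, "B")
print([s[0] for s in segments_complex])  # ['A_1', 'A_2', 'B_1']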
示例#25
0
def existing(**kwargs):
    """
    Protocol:

    Use external sequence alignment and extract all relevant
    information from there (e.g. sequence, region, etc.),
    then apply gap & fragment filtering as usual

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list in the check_required call below

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * sequence_file
        * first_index
        * target_sequence_file
        * annotation_file (None)
        * frequencies_file
        * identities_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "input_alignment", "sequence_id", "first_index",
        "extract_annotation"
    ])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this file is starting point of pipeline;
    # check if input alignment actually exists
    input_alignment = kwargs["input_alignment"]
    verify_resources("Input alignment does not exist", input_alignment)

    # first try to autodetect format of alignment
    with open(input_alignment) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment))

    with open(input_alignment) as f:
        ali_raw = Alignment.from_file(f, format)

    # save annotation in sequence headers (species etc.)
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        from_anno_line = (format == "stockholm")
        annotation = extract_header_annotation(ali_raw,
                                               from_annotation=from_anno_line)
        annotation.to_csv(annotation_file, index=False)

    # Target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # First, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id))

    # identify what columns (non-gap) to keep for focus
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array([
        c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq
    ])

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given.")

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)

    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # apply sequence identity and fragment filters,
    # and gap threshold
    mod_outcfg, ali = modify_alignment(focus_ali, focus_index, id_,
                                       region_start, **kwargs)

    # generate output configuration of protocol
    outcfg = {
        **mod_outcfg,
        "sequence_id": sequence_id,
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "target_sequence_file": target_sequence_file,
        "focus_sequence": header,
        "focus_mode": True,
    }

    if annotation_file is not None:
        outcfg["annotation_file"] = annotation_file

    # dump config to YAML file for debugging/logging
    write_config_file(prefix + ".align_existing.outcfg", outcfg)

    # return results of protocol
    return outcfg
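A simplified illustration (not the package's parse_header) of the id/start-end header convention the protocol above relies on when autodetecting the sequence region:

def split_focus_header(header):
    """Split e.g. 'P12345/20-180' into (id, start, end); range may be absent."""
    id_, _, region = header.partition("/")
    if "-" not in region:
        return id_, None, None
    start, _, end = region.partition("-")
    return id_, int(start), int(end)

print(split_focus_header("P12345/20-180"))  # ('P12345', 20, 180)
print(split_focus_header("P12345"))         # ('P12345', None, None)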
示例#26
0
def run_plmc(alignment, couplings_file, param_file=None,
             focus_seq=None, alphabet=None, theta=None,
             scale=None, ignore_gaps=False, iterations=None,
             lambda_h=None, lambda_J=None, lambda_g=None,
             cpu=None, binary="plmc"):
    """
    Run plmc on sequence alignment and store
    files with model parameters and pair couplings.

    Parameters
    ----------
    alignment : str
        Path to input sequence alignment
    couplings_file : str
        Output path for file with evolutionary couplings
        (folder will be created)
    param_file : str
        Output path for binary file containing model
        parameters (folder will be created)
    focus_seq : str, optional (default: None)
        Name of focus sequence, if None, non-focus mode
        will be used
    alphabet : str, optional (default: None)
        Alphabet for model inference. If None, standard
        amino acid alphabet including gap will be used.
        First character in string corresponds to gap
        character (relevant for ignore_gaps).
    theta : float, optional (default: None)
        Sequences with pairwise identity >= theta
        will be clustered and their sequence weights
        downweighted as 1 / num_cluster_members.
        Important: Note that plmc will be parametrized using
        1 - theta. If None, default value in plmc will be used,
        which corresponds to theta=0.8 (plmc setting 0.2).
    scale : float, optional (default: None)
        Scale weights of clusters by this value.
        If None, default value in plmc (1.0) will be used
    ignore_gaps : bool, optional (default: False)
        Exclude gaps from parameter inference. Gap
        character is first character of alphabet
        parameter.
    iterations : int, optional (default: None)
        Maximum iterations for optimization.
    lambda_h : float, optional (default: None)
        L2 regularization strength on fields.
        If None, plmc default will be used.
    lambda_J : float, optional (default: None)
        L2 regularization strength on couplings.
        If None, plmc default will be used.
    lambda_g : float, optional (default: None)
        Group L1 regularization strength on couplings.
        If None, plmc default will be used.
    cpu : int, optional (default: None)
        Number of cores to use for running plmc.
        Note that plmc has to be compiled in OpenMP
        mode to run with multiple cores.
        Can also be set to "max".
    binary : str, optional (default: "plmc")
        Path to plmc binary

    Returns
    -------
    PlmcResult
        namedtuple containing output files and
        parsed fields from console output of plmc

    Raises
    ------
    ExternalToolError
    """
    create_prefix_folders(couplings_file)

    # Make sure input alignment exists
    verify_resources(
        "Alignment file does not exist", alignment
    )

    cmd = [
        binary,
        "-c", couplings_file,
    ]

    # store eij file if explicitly requested
    if param_file is not None:
        create_prefix_folders(param_file)
        cmd += ["-o", param_file]

    # focus sequence mode and ID
    if focus_seq is not None:
        # TODO: for now split exclude sequence
        # region from focus seq name, otherwise
        # plmc does not remap names. If this
        # behaviour changes in plmc, remove the
        # following line.
        focus_seq = focus_seq.split("/")[0]
        cmd += ["-f", focus_seq]

    # exclude gaps from calculation?
    if ignore_gaps:
        cmd += ["-g"]

    # maximum number of iterations, can also be "max"
    if iterations is not None:
        cmd += ["-m", str(iterations)]

    # set custom alphabet
    # (first character is gap by default in nogap mode)
    if alphabet is not None:
        cmd += ["-a", alphabet]

    # sequence reweighting
    if theta is not None:
        # transform into plmc convention (1-theta)
        theta = 1.0 - theta
        cmd += ["-t", str(theta)]

    # cluster weight
    if scale is not None:
        cmd += ["-s", str(scale)]

    # L2 regularization weight for fields
    if lambda_h is not None:
        cmd += ["-lh", str(lambda_h)]

    # L2 regularization weight for pair couplings
    if lambda_J is not None:
        cmd += ["-le", str(lambda_J)]

    # Group L1 regularization weight for pair couplings
    if lambda_g is not None:
        cmd += ["-lg", str(lambda_g)]

    # Number of cores to use for calculation
    if cpu is not None:
        cmd += ["-n", str(cpu)]

    # finally also add input alignment (main parameter)
    cmd += [alignment]

    # TODO: for now do not check returncode because sometimes
    # returncode == -11 (segfault) despite successful calculation
    return_code, stdout, stderr = run(cmd, check_returncode=False)

    # TODO: remove this segfault-hunting output once fixed
    if return_code != 0:
        # if not a segfault, still raise exception
        if return_code != -11:
            from evcouplings.utils.system import ExternalToolError
            raise ExternalToolError(
                "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
                    cmd, return_code, stdout, stderr
                )
            )

        print("PLMC NON-ZERO RETURNCODE:", return_code)
        print(cmd)
        print(" ".join(cmd))
        print("stdout:", stdout)
        print("stderr:", stderr)

    iter_df, out_fields = parse_plmc_log(stderr)

    # also check we actually calculated couplings...
    if not valid_file(couplings_file):
        raise ResourceError(
            "plmc returned no couplings: stdout={} stderr={} file={}".format(
                stdout, stderr, couplings_file
            )
        )

    # ... and parameter file, if requested
    if param_file and not valid_file(param_file):
        raise ResourceError(
            "plmc returned no parameter file: stdout={} stderr={} file={}".format(
                stdout, stderr, param_file
            )
        )

    return PlmcResult(
        couplings_file, param_file,
        iter_df, *out_fields
    )
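A hypothetical usage sketch for run_plmc (paths and parameter values invented; assumes the function above is in scope and a plmc binary is installed):

result = run_plmc(
    alignment="output/example.a2m",            # hypothetical input alignment
    couplings_file="output/example_ECs.txt",   # hypothetical output path
    param_file="output/example.model_params",
    focus_seq="P12345/20-180",                 # region is stripped before -f
    theta=0.8,                                 # plmc is called with -t 0.2
    lambda_J=16.2,
    iterations=100,
    cpu=2,
    binary="plmc",
)

# first two namedtuple fields are the output file paths (see return statement above)
print(result[0], result[1])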
示例#27
0
def jackhmmer_search(**kwargs):
    """
    Protocol:

    Iterative jackhmmer search against a sequence database.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list in the check_required call below

    .. todo::
        explain meaning of parameters in detail.

    Returns
    -------
    outcfg : dict
        Output configuration of the protocol, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * target_sequence_file
        * sequence_file
        * raw_alignment_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "sequence_id", "sequence_file", "sequence_download_url",
        "region", "first_index", "use_bitscores", "domain_threshold",
        "sequence_threshold", "database", "iterations", "cpu", "nobias",
        "reuse_alignment", "checkpoints_hmm", "checkpoints_ali", "jackhmmer",
        "extract_annotation"
    ])
    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store search sequence file here
    target_sequence_file = prefix + ".fa"
    full_sequence_file = prefix + "_full.fa"

    # make sure search sequence is defined and load it
    full_seq_file, (full_seq_id, full_seq) = fetch_sequence(
        kwargs["sequence_id"], kwargs["sequence_file"],
        kwargs["sequence_download_url"], full_sequence_file)

    # cut sequence to target region and save in sequence_file
    # (this is the main sequence file used downstream)
    (region_start, region_end), cut_seq = cut_sequence(full_seq,
                                                       kwargs["sequence_id"],
                                                       kwargs["region"],
                                                       kwargs["first_index"],
                                                       target_sequence_file)

    # run jackhmmer... allow reuse of a pre-existing
    # Stockholm alignment file here
    ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg"

    # determine whether we can reuse a previous run; only possible
    # if previous results were stored in ali_outcfg_file
    if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file):
        ali = read_config_file(ali_outcfg_file)

        # check if the alignment file itself is also there
        verify_resources(
            "Tried to reuse alignment, but empty or "
            "does not exist", ali["alignment"], ali["domtblout"])
    else:
        # otherwise, we have to run the alignment
        # modify search thresholds to be suitable for jackhmmer
        seq_threshold, domain_threshold = search_thresholds(
            kwargs["use_bitscores"], kwargs["sequence_threshold"],
            kwargs["domain_threshold"], len(cut_seq))

        # run search process
        ali = at.run_jackhmmer(
            query=target_sequence_file,
            database=kwargs[kwargs["database"]],
            prefix=prefix,
            use_bitscores=kwargs["use_bitscores"],
            domain_threshold=domain_threshold,
            seq_threshold=seq_threshold,
            iterations=kwargs["iterations"],
            nobias=kwargs["nobias"],
            cpu=kwargs["cpu"],
            checkpoints_hmm=kwargs["checkpoints_hmm"],
            checkpoints_ali=kwargs["checkpoints_ali"],
            binary=kwargs["jackhmmer"],
        )

        # get rid of huge stdout log file immediately
        # (do not use /dev/null option of jackhmmer function
        # to make no assumption about operating system)
        try:
            os.remove(ali.output)
        except OSError:
            pass

        # turn namedtuple into dictionary to make
        # restarting code nicer
        ali = dict(ali._asdict())

        # save results of search for possible restart
        write_config_file(ali_outcfg_file, ali)

    # prepare output dictionary with result files
    outcfg = {
        "sequence_id": kwargs["sequence_id"],
        "target_sequence_file": target_sequence_file,
        "sequence_file": full_sequence_file,
        "first_index": kwargs["first_index"],
        "focus_mode": True,
        "raw_alignment_file": ali["alignment"],
        "hittable_file": ali["domtblout"],
    }

    # define a single protein segment based on target sequence
    outcfg["segments"] = [
        Segment("aa", kwargs["sequence_id"], region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]

    outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"],
                                                 region_start, region_end)

    return outcfg
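A hypothetical configuration sketch for the protocol above (all values invented; assumes jackhmmer_search is in scope). Note that the "database" entry names another config key, which is resolved via kwargs[kwargs["database"]] to the actual database path:

config = {
    "prefix": "output/example",
    "sequence_id": "P12345",
    "sequence_file": None,       # fetch by sequence_id from the URL below instead
    "sequence_download_url": "https://rest.uniprot.org/uniprotkb/{}.fasta",
    "region": None,              # use full sequence
    "first_index": 1,
    "use_bitscores": True,
    "domain_threshold": 0.5,     # bits per residue
    "sequence_threshold": 0.5,
    "database": "uniref90",
    "uniref90": "/databases/uniref90.fasta",   # resolved via the "database" key
    "iterations": 5,
    "cpu": 2,
    "nobias": False,
    "reuse_alignment": True,
    "checkpoints_hmm": False,
    "checkpoints_ali": False,
    "jackhmmer": "jackhmmer",
    "extract_annotation": True,
}

outcfg = jackhmmer_search(**config)
print(outcfg["raw_alignment_file"], outcfg["segments"])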
示例#28
0
def cns_dgsa_fold(residues,
                  ec_pairs,
                  prefix,
                  config_file=None,
                  secstruct_column="sec_struct_3state",
                  num_structures=20,
                  min_cycles=5,
                  log_level=None,
                  binary="cns"):
    """
    Predict 3D structure coordinates using distance geometry
    and a simulated annealing-based folding protocol

    Parameters
    ----------
    residues : pandas.DataFrame
        Table containing positions (column i), residue
        type (column A_i), and secondary structure for
        each position
    ec_pairs : pandas.DataFrame
        Table with EC pairs that will be turned
        into distance restraints
        (with columns i, j, A_i, A_j)
    prefix : str
        Prefix for output files (can include directories).
        Folders will be created automatically.
    config_file : str, optional (default: None)
        Path to config file with folding settings. If None,
        will use default settings included in package
        (restraints.yml)
    secstruct_column : str, optional (default: sec_struct_3state)
        Column name in residues dataframe from which secondary
        structure will be extracted (has to be H, E, or C).
    num_structures : int, optional (default: 20)
        Number of trial structures to generate
    min_cycles : int, optional (default: 5)
        Number of minimization cycles at end of protocol
    log_level : {None, "quiet", "verbose"}, optional (default: None)
        Don't keep CNS log files, or switch to different degrees
        of verbosity ("verbose" needed to obtain violation information)
    binary : str, optional (default: "cns")
        Path of CNS binary

    Returns
    -------
    final_models : dict
        Mapping from model name to path of model
    """
    def _run_inp(inp_str, output_prefix):
        with open(output_prefix + ".inp", "w") as f:
            f.write(inp_str)

        if log_level is not None:
            log_file = output_prefix + ".log"
        else:
            log_file = None

        run_cns(inp_str, log_file=log_file, binary=binary)

    # make sure output directory exists
    create_prefix_folders(prefix)

    # CNS doesn't like paths above a certain length, so we
    # will change into working directory and keep paths short.
    # For this reason, extract path and filename prefix
    dir_, rootname = path.split(prefix)
    cwd = os.getcwd()

    if dir_ != "":
        os.chdir(dir_)

    # create restraints (EC pairs and secondary structure-based)
    ec_tbl = rootname + "_couplings.tbl"
    ss_dist_tbl = rootname + "_ss_distance.tbl"
    ss_angle_tbl = rootname + "_ss_angle.tbl"

    ec_dist_restraints(ec_pairs, ec_tbl, cns_dist_restraint, config_file)

    secstruct_dist_restraints(residues, ss_dist_tbl, cns_dist_restraint,
                              config_file, secstruct_column)

    secstruct_angle_restraints(residues, ss_angle_tbl, cns_dihedral_restraint,
                               config_file, secstruct_column)

    # create sequence file
    seq = "".join(residues.A_i)
    seq_file = rootname + ".seq"
    cns_seq_file(seq, seq_file)

    # set up input files for folding
    # make molecular topology file (will be written to mtf_file)
    mtf_file = rootname + ".mtf"
    _run_inp(
        cns_mtf_inp(seq_file,
                    mtf_file,
                    first_index=residues.i.min(),
                    disulfide_bridges=None), mtf_file)

    # make extended PDB file (will be in extended_file)
    extended_file = rootname + "_extended.pdb"
    _run_inp(cns_extended_inp(mtf_file, extended_file), extended_file)

    # fold using dg_sa protocol (filenames will have suffixes _1, _2, ...)

    # have to pass either quiet or verbose to CNS (but will not store
    # log file if log_level is None).
    if log_level is None:
        dgsa_log_level = "quiet"
    else:
        dgsa_log_level = log_level

    _run_inp(
        cns_dgsa_inp(extended_file,
                     mtf_file,
                     rootname,
                     ec_tbl,
                     ss_dist_tbl,
                     ss_angle_tbl,
                     num_structures=num_structures,
                     log_level=dgsa_log_level), rootname + "_dgsa")

    # add hydrogen atoms and minimize (for all
    # generated candidate structures from dg_sa)

    # keep track of final predicted structures
    final_models = {}

    for i in range(1, num_structures + 1):
        input_root = "{}_{}".format(rootname, i)
        input_model = input_root + ".pdb"

        # check if we actually got the model from dg_sa
        if not valid_file(input_model):
            continue

        # run generate_easy protocol to add hydrogen atoms
        easy_pdb = input_root + "_h.pdb"
        easy_mtf = input_root + "_h.mtf"
        _run_inp(cns_generate_easy_inp(input_model, easy_pdb, easy_mtf),
                 input_root + "_h")

        # then minimize
        min_pdb = input_root + "_hMIN.pdb"

        _run_inp(
            cns_minimize_inp(easy_pdb,
                             easy_mtf,
                             min_pdb,
                             num_cycles=min_cycles), input_root + "_hMIN")

        if valid_file(min_pdb):
            final_models[min_pdb] = path.join(dir_, min_pdb)

    # change back into original directory
    os.chdir(cwd)

    return final_models
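A hypothetical usage sketch for cns_dgsa_fold (toy tables, invented paths; assumes the function above is in scope and a CNS binary is available):

import pandas as pd

residues = pd.DataFrame({
    "i": [1, 2, 3, 4, 5],
    "A_i": ["M", "K", "L", "V", "A"],
    "sec_struct_3state": ["C", "H", "H", "H", "C"],
})

ec_pairs = pd.DataFrame({
    "i": [1, 2], "A_i": ["M", "K"],
    "j": [4, 5], "A_j": ["V", "A"],
})

final_models = cns_dgsa_fold(
    residues, ec_pairs,
    prefix="output/fold/example",   # hypothetical prefix
    num_structures=5,
    log_level=None,                 # discard CNS log files
    binary="cns",
)
print(sorted(final_models))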
示例#29
0
def complex(**kwargs):
    """
    Protocol:
    Compare ECs for a complex to
    3D structure

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list in the check_required call below

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * ec_compared_all_file
        * ec_compared_longrange_file
        * ec_compared_inter_file
        * first_pdb_structure_hits_file / second_pdb_structure_hits_file
        * first_distmap_monomer / second_distmap_monomer
        * first_distmap_multimer / second_distmap_multimer
        * distmap_inter
        * inter_contacts_file
        * contact_map_files
        * first_remapped_pdb_files / second_remapped_pdb_files
    """
    check_required(kwargs, [
        "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir",
        "atom_filter", "first_compare_multimer", "second_compare_multimer",
        "distance_cutoff", "segments", "first_sequence_id",
        "second_sequence_id", "first_sequence_file", "second_sequence_file",
        "first_segments", "second_segments", "first_target_sequence_file",
        "second_target_sequence_file", "scale_sizes", "raise_missing",
        "plot_highest_count"
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        # initialize output EC files
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file":
        prefix + "_CouplingScoresCompared_longrange.csv",
        "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv",

        # initialize output inter distancemap files
        "distmap_inter": prefix + "_distmap_inter",
        "inter_contacts_file": prefix + "_inter_contacts_file"
    }

    # Add PDB comparison files for first and second monomer
    for monomer_prefix in ["first", "second"]:
        outcfg = {
            **outcfg,
            monomer_prefix + "_pdb_structure_hits_file":
            "{}_{}_structure_hits.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_pdb_structure_hits_unfiltered_file":
            "{}_{}_structure_hits_unfitered.csv".format(
                prefix, monomer_prefix),
            monomer_prefix + "_distmap_monomer":
            "{}_{}_distance_map_monomer".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_multimer":
            "{}_{}_distance_map_multimer".format(prefix, monomer_prefix),
        }

    # make sure EC file exists
    verify_resources("EC file does not exist", kwargs["ec_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much for average user)
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    # store auxiliary files for the first monomer here
    first_aux_prefix = insert_dir(aux_prefix,
                                  "first_monomer",
                                  rootname_subdir=False)
    create_prefix_folders(first_aux_prefix)

    # store auxiliary files for the second monomer here
    second_aux_prefix = insert_dir(aux_prefix,
                                   "second_monomer",
                                   rootname_subdir=False)
    create_prefix_folders(second_aux_prefix)

    # Step 1: Identify 3D structures for comparison
    def _identify_monomer_structures(name_prefix, outcfg, aux_prefix):
        # create a dictionary with kwargs for just the current monomer
        # remove the "prefix" kwargs so that we can replace with the
        # aux prefix when calling _identify_structures
        # only replace first occurrence of name_prefix
        monomer_kwargs = {
            k.replace(name_prefix + "_", "", 1): v
            for k, v in kwargs.items() if "prefix" not in k
        }

        # these fields need to be set explicitly, otherwise they are
        # overwritten by the files of the concatenated alignment
        monomer_kwargs["alignment_file"] = kwargs[
            name_prefix + "_alignment_file"
        ]
        monomer_kwargs["raw_focus_alignment_file"] = kwargs[
            name_prefix + "_raw_focus_alignment_file"
        ]

        # identify structures for that monomer
        sifts_map, sifts_map_full = _identify_structures(**monomer_kwargs,
                                                         prefix=aux_prefix)

        # save selected PDB hits
        sifts_map.hits.to_csv(outcfg[name_prefix + "_pdb_structure_hits_file"],
                              index=False)

        # also save full list of hits
        sifts_map_full.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"],
            index=False)
        return outcfg, sifts_map

    outcfg, first_sifts_map = _identify_monomer_structures(
        "first", outcfg, first_aux_prefix)
    outcfg, second_sifts_map = _identify_monomer_structures(
        "second", outcfg, second_aux_prefix)

    # get the segment names from the kwargs
    segment_list = kwargs["segments"]

    # Make sure user provided exactly two segments
    if len(segment_list) != 2:
        raise InvalidParameterError(
            "Compare stage for protein complexes requires exactly two segments"
        )

    first_segment_name = kwargs["segments"][0][0]
    second_segment_name = kwargs["segments"][1][0]

    # Step 2: Compute distance maps
    def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name):

        # prepare a sequence map to remap the structures we have found
        verify_resources("Target sequence file does not exist",
                         kwargs[name_prefix + "_target_sequence_file"])

        # read in the target sequence
        with open(kwargs[name_prefix + "_target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        # create target sequence map for remapping structure
        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # compute distance maps and save
        # (but only if we found some structure)
        if len(sifts_map.hits) > 0:
            d_intra = intra_dists(sifts_map,
                                  structures,
                                  atom_filter=kwargs["atom_filter"],
                                  output_prefix=aux_prefix + "_" +
                                  name_prefix + "_distmap_intra")
            d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"])

            # save contacts to separate file
            outcfg[
                name_prefix +
                "_monomer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_monomer.csv"
            d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg[name_prefix + "_monomer_contacts_file"], index=False)

            # compute multimer distances, if requested;
            # note that d_multimer can be None if there
            # are no structures with multiple chains
            if kwargs[name_prefix + "_compare_multimer"]:
                d_multimer = multimer_dists(sifts_map,
                                            structures,
                                            atom_filter=kwargs["atom_filter"],
                                            output_prefix=aux_prefix + "_" +
                                            name_prefix + "_distmap_multimer")
            else:
                d_multimer = None

            # if we have a multimer contact map, save it
            if d_multimer is not None:
                d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"])
                outcfg[
                    name_prefix +
                    "_multimer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_multimer.csv"

                # save contacts to separate file
                d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                    outcfg[name_prefix + "_multimer_contacts_file"],
                    index=False)
            else:
                outcfg[name_prefix + "_distmap_multimer"] = None

            # create remapped structures (e.g. for
            # later comparison of folding results)
            # remap structures, swap mapping index and filename in
            # dictionary so we have a list of files in the dict keys
            outcfg[name_prefix + "_remapped_pdb_files"] = {
                filename: mapping_index
                for mapping_index, filename in remap_chains(
                    sifts_map,
                    aux_prefix,
                    seqmap,
                    chain_name=chain_name,
                    raise_missing=kwargs["raise_missing"]).items()
            }

        else:
            # if no structures, cannot compute distance maps
            d_intra = None
            d_multimer = None
            outcfg[name_prefix + "_distmap_monomer"] = None
            outcfg[name_prefix + "_distmap_multimer"] = None
            outcfg[name_prefix + "remapped_pdb_files"] = None

        return d_intra, d_multimer, seqmap

    # load all structures for both monomers
    all_structures = set(first_sifts_map.hits.pdb_id).union(
        set(second_sifts_map.hits.pdb_id))
    structures = load_structures(all_structures,
                                 kwargs["pdb_mmtf_dir"],
                                 raise_missing=False)

    d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps(
        first_sifts_map, "first", "A")
    d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps(
        second_sifts_map, "second", "B")

    # compute inter distance map if sifts map for each monomer exists
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        d_inter = inter_dists(first_sifts_map,
                              second_sifts_map,
                              raise_missing=kwargs["raise_missing"])
        # if there were overlapping PDBs, save the results
        if d_inter is not None:
            d_inter.to_file(outcfg["distmap_inter"])

            # save contacts to separate file
            d_inter.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg["inter_contacts_file"], index=False)

    else:
        outcfg["inter_contacts_file"] = None
        d_inter = None

    # Step 3: Compare ECs to distance maps
    ec_table = pd.read_csv(kwargs["ec_file"])

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:

        # compare ECs only if we have an intra distance map
        # for at least one monomer - inter can't exist unless
        # we have both monomers
        if (d_intra_i is not None) or (d_intra_j is not None):
            # compare distances individually for each segment pair
            ecs_intra_i = ec_table.query(
                "segment_i == segment_j == @first_segment_name")
            if d_intra_i is not None:
                ecs_intra_i_compared = coupling_scores_compared(
                    ecs_intra_i,
                    d_intra_i,
                    d_multimer_i,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist)
            else:
                # If no distance map, the distance is saved as np.nan
                ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan)

            ecs_intra_j = ec_table.query(
                "segment_i == segment_j == @second_segment_name")
            if d_intra_j is not None:
                ecs_intra_j_compared = coupling_scores_compared(
                    ecs_intra_j,
                    d_intra_j,
                    d_multimer_j,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist)
            else:
                ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan)

            ecs_inter = ec_table.query("segment_i != segment_j")
            if d_inter is not None:
                ecs_inter_compared = coupling_scores_compared(
                    ecs_inter,
                    d_inter,
                    dist_map_multimer=None,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    # min_sequence_dist does not apply for inter-protein ECs
                    min_sequence_dist=None
                )
            else:
                ecs_inter_compared = ecs_inter.assign(dist=np.nan)

            # combine the tables
            ec_table_compared = pd.concat([
                ecs_inter_compared, ecs_intra_i_compared, ecs_intra_j_compared
            ])

            # rename the precision column to "segmentwise_precision"
            # because we calculated precision for each segment independently
            ec_table_compared = ec_table_compared.rename(
                columns={"precision": "segmentwise_precision"})
            # TODO: change "cn" to "score" eventually
            ec_table_compared = ec_table_compared.sort_values("cn",
                                                              ascending=False)

            # add the total precision
            # TODO: implement different cutoffs for intra vs inter contacts
            ec_table_compared = add_precision(
                ec_table_compared, dist_cutoff=kwargs["distance_cutoff"])

            # save to file
            # all ecs
            ec_table_compared.to_csv(outcfg[out_file])

            # save the inter ECs to a file
            ecs_inter_compared.to_csv(outcfg["ec_compared_inter_file"])

    # create the inter-ecs line drawing script
    if outcfg["ec_compared_inter_file"] is not None and kwargs[
            "plot_highest_count"] is not None:
        inter_ecs = ec_table.query("segment_i != segment_j")

        outcfg[
            "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml"

        pairs.ec_lines_pymol_script(
            inter_ecs.iloc[:kwargs["plot_highest_count"], :],
            outcfg["ec_lines_compared_pml_file"],
            distance_cutoff=kwargs["distance_cutoff"],
            chain={
                first_segment_name: "A",
                second_segment_name: "B"
            })

    # Remap the complex crystal structures, if available
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        outcfg["complex_remapped_pdb_files"] = {
            filename: mapping_index
            for mapping_index, filename in remap_complex_chains(
                first_sifts_map,
                second_sifts_map,
                seqmap_i,
                seqmap_j,
                output_prefix=aux_prefix,
                raise_missing=kwargs["raise_missing"]).items()
        }

    # Step 4: Make contact map plots
    # if no structures available, defaults to EC-only plot
    outcfg["contact_map_files"] = _make_complex_contact_maps(
        ec_table, d_intra_i, d_multimer_i, d_intra_j, d_multimer_j, d_inter,
        first_segment_name, second_segment_name, **kwargs)

    return outcfg
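A minimal sketch (hypothetical prefix) of inspecting the compared EC tables written by the protocol above:

import pandas as pd

prefix = "output/complex_example"  # hypothetical prefix

ecs = pd.read_csv(prefix + "_CouplingScoresCompared_longrange.csv")

# inter-protein ECs connect different segments; dist is NaN where no structure was found
inter = ecs.query("segment_i != segment_j")
print(inter[["i", "j", "cn", "dist"]].head())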
示例#30
0
def multimer_dists(sifts_result, structures=None, atom_filter=None,
                   intersect=False, output_prefix=None, model=0,
                   raise_missing=True):
    """
    Compute homomultimer distances (between repeated copies of the
    same entity) in a PDB file. The resulting distance matrix is made
    symmetric by taking the minimum over the upper and lower triangle,
    even if the complex structure itself is not symmetric.

    Parameters
    ----------
    sifts_result : SIFTSResult
        Input structures and mapping to use
        for distance map calculation
    structures : str or dict, optional (default: None)
        If str: Load structures from directory this string
        points to. Missing structures will be fetched
        from web.

        If dict: dictionary with lower-case PDB ids as keys
        and PDB objects as values. This dictionary has to
        contain all necessary structures, missing ones will
        not be fetched. This dictionary can be created using
        pdb.load_structures.
    atom_filter : str, optional (default: None)
        Filter coordinates to contain only these atoms. E.g.
        set to "CA" to compute C_alpha - C_alpha distances
        instead of minimum atom distance over all atoms in
        both residues.
    intersect : bool, optional (default: False)
        If True, intersect indices of the given
        distance maps. Otherwise, union of indices
        will be used.
    output_prefix : str, optional (default: None)
        If given, save the individual distance maps
        to files prefixed with this string. The appended
        file suffixes map to row indices in sifts_result.hits
    model : int, optional (default: 0)
        Index of model in PDB structure that should be used
    raise_missing : bool, optional (default: True)
        Raise a ResourceError if any of the input structures cannot
        be loaded; otherwise, ignore missing entries.

    Returns
    -------
    DistanceMap
        Computed aggregated distance map
        across all input structures

    Raises
    ------
    ValueError
        If sifts_result is empty (no structure hits)
    ResourceError
        If any structure could not be loaded and raise_missing is True
    """
    if len(sifts_result.hits) == 0:
        raise ValueError(
            "sifts_result is empty (no structure hits, but at least one required)"
        )

    # if no structures given, or path to files, load first
    structures = _prepare_structures(
        structures, sifts_result.hits.pdb_id, raise_missing
    )

    # aggregated distance map
    agg_distmap = None

    # create output folder if necessary
    if output_prefix is not None:
        create_prefix_folders(output_prefix)

    # go through each structure
    for pdb_id, grp in sifts_result.hits.reset_index().groupby("pdb_id"):
        # skip missing structures
        if not raise_missing and pdb_id not in structures:
            continue

        # extract all chains for this structure
        chains = [
            (
                r["index"],
                _prepare_chain(
                    structures, r["pdb_id"], r["pdb_chain"],
                    atom_filter, sifts_result.mapping[r["mapping_index"]],
                    model
                )
            )
            for i, r in grp.iterrows()
        ]

        # compare all possible pairs of chains
        for (index_i, ch_i), (index_j, ch_j) in combinations(chains, 2):
            # skip empty chains (e.g. residues lost during remapping)
            if len(ch_i.residues) == 0 or len(ch_j.residues) == 0:
                continue

            distmap = DistanceMap.from_coords(ch_i, ch_j)

            # symmetrize matrix (for ECs we only care whether a pair
            # is close in some chain combination)
            distmap_sym = DistanceMap.aggregate(
                distmap, distmap.transpose(), intersect=intersect
            )
            distmap_sym.symmetric = True

            # save individual distance map
            if output_prefix is not None:
                distmap_sym.to_file("{}_{}_{}".format(
                    output_prefix, index_i, index_j)
                )

            # aggregate with other chain combinations
            if agg_distmap is None:
                agg_distmap = distmap_sym
            else:
                agg_distmap = DistanceMap.aggregate(
                    agg_distmap, distmap_sym, intersect=intersect
                )

    return agg_distmap
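A toy numpy illustration of the symmetrization step above: each pair distance is minimized over both chain orderings, so the result is symmetric even if the underlying chain-vs-chain distances are not:

import numpy as np

# hypothetical asymmetric chain-pair distance matrix
d = np.array([[1.0, 7.0],
              [3.0, 2.0]])

d_sym = np.minimum(d, d.T)
print(d_sym)  # [[1. 3.]
              #  [3. 2.]]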