示例#1
0
def remove_clan_overlaps(pfam_table):
    """
    Remove overlapping Pfam hits from same Pfam clan
    (equivalent of PfamScan.pl). Currently only
    allows to remove overlaps by domain bitscore.

    Within each (sequence, clan) group, hits are visited from highest
    to lowest score, and a hit is discarded only if it overlaps a hit
    that has already been *kept*. A hit that merely overlaps another
    discarded hit is retained, so that removing one hit never cascades
    into removing hits that do not overlap anything in the final table.
    Hits with identical scores are resolved in favour of the hit that
    comes first in the input table.

    .. todo::

        is bitscore the most sensible choice if different length hits?

    Parameters
    ----------
    pfam_table : pd.DataFrame
        Pfam hit table as generated by pfam_hits() function
        (must contain Pfam clan annotation). Rows are dropped by
        index label, so the index is expected to be unique.

    Returns
    -------
    pd.DataFrame
        Pfam hit table with lower-scoring overlaps removed
    """
    # could make this a parameter; if switching to E-values
    # we would have to change the sorting order of the hits
    # within each group below (lowest E-value first).
    score = "domain_score"

    # group by sequence ID and clan to resolve overlaps; rows without
    # a clan annotation (missing value) do not form a group
    grouped = pfam_table.groupby(
        by=["query_name", "clan_id"], sort=False
    )

    # store index value of all entries to discard
    remove_hits = []

    for (_, clan_name), grp in grouped:
        # safety check here that we are not grouping hits that are
        # not in the same clan (missing value) if pandas ever changed
        # the behaviour of groupby to iterate through groups
        # with missing values. Otherwise, we would have to skip group.
        assert clan_name.startswith("CL")

        # order hits best-first by the *numeric* score (the column may
        # hold strings); sorted() is stable, so ties keep table order
        candidates = sorted(
            zip(grp.index, grp[score], grp["ali_from"], grp["ali_to"]),
            key=lambda hit: float(hit[1]),
            reverse=True,
        )

        # half-open ranges of the hits retained so far in this group
        kept_ranges = []

        for idx, _, ali_from, ali_to in candidates:
            # add 1 to end since range_overlap treats ends as exclusive,
            # while alignment coordinates here are inclusive
            current = (int(ali_from), int(ali_to) + 1)

            if any(range_overlap(current, kept) > 0 for kept in kept_ranges):
                remove_hits.append(idx)
            else:
                kept_ranges.append(current)

    return pfam_table.loc[~pfam_table.index.isin(remove_hits)]
示例#2
0
    def by_alignment(self, min_overlap=20, reduce_chains=False, **kwargs):
        """
        Find structures by sequence alignment between
        query sequence and sequences in PDB.

        Parameters
        ----------
        min_overlap : int, optional (default: 20)
            Require at least this many aligned positions
            with the target structure
        reduce_chains : bool, optional (default: False)
            If true, keep only first chain per PDB ID
            (i.e. remove redundant occurrences of same
            protein in PDB structures). Should be set to
            False to identify homomultimeric contacts.
        **kwargs
            Defines the behaviour of find_homologs() function
            used to find homologs by sequence alignment:
            - which alignment method is used
              (pdb_alignment_method: {"jackhmmer", "hmmsearch"},
              default: "jackhmmer"),
            - parameters passed into the protocol for the selected
              alignment method (evcouplings.align.jackhmmer_search or
              evcouplings.align.hmmbuild_and_search).

              Default parameters are set in the HMMER_CONFIG string in this
              module, other parameters will need to be overridden; these
              minimally are:
              - for pdb_alignment_method == "jackhmmer":
                - sequence_id : str, identifier of target sequence
                - jackhmmer : str, path to jackhmmer binary if not on path
              - for pdb_alignment_method == "hmmsearch":
                - sequence_id : str, identifier of target sequence
                - raw_focus_alignment_file : str, path to input alignment file
                - hmmbuild : str, path to hmmbuild binary if not on path
                - hmmsearch : str, path to search binary if not on path
            - additionally, if "prefix" is given,
              individual mappings will be saved to files suffixed
              by the respective key in mapping table.

        Returns
        -------
        SIFTSResult
            Record of hits and mappings found for this
            query sequence by alignment. See by_pdb_id()
            for detailed explanation of fields.

        Raises
        ------
        ValueError
            If no SIFTS sequence file is available
            (self.sequence_file is None)
        """
        def _create_mapping(r):
            # compute index mapping for one hit segment (row r of hit table);
            # returns (dict seqres index -> query index, joined mapping table)
            _, query_start, query_end = parse_header(ali.ids[0])

            # create mapping from query into PDB Uniprot sequence
            # A_i will be query sequence indices, A_j Uniprot sequence indices
            m = map_indices(ali[0], query_start, query_end,
                            ali[r["alignment_id"]], r["alignment_start"],
                            r["alignment_end"])

            # create mapping from PDB Uniprot into seqres numbering
            # j will be Uniprot sequence index, k seqres index;
            # indices are created as strings right away since the other
            # mapping has indices as strings (avoids writing strings into
            # integer columns after the fact)
            n = pd.DataFrame({
                "j": [
                    str(pos) for pos in
                    range(r["uniprot_start"], r["uniprot_end"] + 1)
                ],
                "k": [
                    str(pos) for pos in
                    range(r["resseq_start"], r["resseq_end"] + 1)
                ],
            })

            # join over Uniprot indices (i.e. j);
            # get rid of any position that is not aligned
            mn = m.merge(n, on="j", how="inner").dropna()

            # extract final mapping from seqres (k) to query (i)
            map_ = dict(zip(mn.k, mn.i))

            return map_, mn

        if self.sequence_file is None:
            raise ValueError("Need to have SIFTS sequence file. "
                             "Create using create_sequence_file() "
                             "method or constructor.")

        ali, hits = find_homologs(sequence_database=self.sequence_file,
                                  **kwargs)

        # merge with internal table to identify overlap of
        # aligned regions and regions with structural coverage
        hits = hits.merge(self.table, on="uniprot_ac", suffixes=("", "_"))

        # add 1 to end of range since overlap function treats
        # ends as exclusive, while ends here are inclusive
        hits["overlap"] = [
            range_overlap((r["uniprot_start"], r["uniprot_end"] + 1),
                          (r["alignment_start"], r["alignment_end"] + 1))
            for _, r in hits.iterrows()
        ]

        # index mappings are only written to file if a prefix is given
        prefix = kwargs.get("prefix", None)

        # collect complete index mappings in here...
        mappings = {}
        # ... as well as dataframe rows for assignment of hit to mapping
        mapping_rows = []

        # complication: if there are multiple segments per hit and chain, we should
        # reduce these into a single mapping (even though split mappings
        # are possible in principle) so we can count unique number of hits etc.
        hit_columns = ["alignment_id", "pdb_id", "pdb_chain"]
        for i, (hit, grp) in enumerate(hits.groupby(hit_columns)):
            agg_mapping = {}
            segment_tables = []
            # go through each segment
            for _, r in grp.iterrows():
                # compute mapping for that particular segment
                map_j, map_j_df = _create_mapping(r)

                # add to overall mapping dictionary for this hit
                agg_mapping.update(map_j)
                segment_tables.append(map_j_df)

            # store assignment of group to mapping index
            mapping_rows.append(list(hit) + [i, len(grp) > 1])

            mappings[i] = agg_mapping

            # store index mappings if filename prefix is given
            if prefix is not None:
                # DataFrame.append() is not available in pandas >= 2.0,
                # so concatenate all segment tables in one go
                agg_df = pd.concat(segment_tables).rename(
                    columns={
                        "j": "uniprot_of_pdb_index",
                        "A_j": "uniprot_of_pdb_residue",
                        "k": "pdb_seqres_index",
                    })

                agg_df.to_csv("{}_mapping{}.csv".format(prefix, i),
                              index=False)

        # create dataframe from mapping rows
        mapping_df = pd.DataFrame(mapping_rows,
                                  columns=hit_columns + [
                                      "mapping_index",
                                      "grouped_segments",
                                  ])

        # now group again, to aggregate full hit dataframe
        def _agg_type(x):
            # choose aggregation function based on column name
            if x == "overlap":
                return "sum"
            elif x.endswith("_start"):
                return "min"
            elif x.endswith("end"):
                return "max"
            else:
                return "first"

        agg_types = OrderedDict([(c, _agg_type(c)) for c in hits.columns
                                 if c not in hit_columns])

        # only aggregate if we have anything to aggregate,
        # otherwise pandas drops the index columns
        # alignment_id, pdb_id, pdb_chain and things go
        # wrong horribly in the following join
        if len(hits) > 0:
            hits_grouped = hits.groupby(hit_columns).agg(
                agg_types).reset_index()
        else:
            hits_grouped = hits

        # join with mapping information
        hits_grouped = hits_grouped.merge(mapping_df, on=hit_columns)

        # remove hits with too little residue coverage; take an explicit
        # copy so the column assignment below acts on an independent table
        hits_grouped = hits_grouped.loc[
            hits_grouped["overlap"] >= min_overlap
        ].copy()

        hits_grouped["bitscore"] = pd.to_numeric(
            hits_grouped["bitscore"], errors="coerce")
        hits_grouped = hits_grouped.sort_values(by="bitscore", ascending=False)

        # if requested, only keep one chain per PDB;
        # sort by score before this to keep best hit
        if reduce_chains:
            # NOTE(review): GroupBy.first() takes the first non-null value
            # per column, so rows with missing values may be filled from
            # a lower-scoring chain of the same PDB entry
            hits_grouped = hits_grouped.groupby("pdb_id").first().reset_index()
            # sort again, just to be sure...
            hits_grouped = hits_grouped.sort_values(by="bitscore",
                                                    ascending=False)

        # remove any zombie mappings we did not keep in table
        kept_indices = set(hits_grouped["mapping_index"])
        mappings = {
            idx: map_
            for idx, map_ in mappings.items()
            if idx in kept_indices
        }

        return SIFTSResult(hits_grouped, mappings)