Example #1
    @classmethod
    def from_file(cls, filename, file_format="pdb"):
        """
        Initialize structure from PDB/mmCIF file

        Parameters
        ----------
        filename : str
            Path of file
        file_format : {"pdb", "cif"}, optional (default: "pdb")
            Format of structure (old PDB format or mmCIF)

        Returns
        -------
        ClassicPDB
            Initialized PDB structure
        """
        try:
            if file_format == "pdb":
                from Bio.PDB import PDBParser
                parser = PDBParser(QUIET=True)
            elif file_format == "cif":
                from Bio.PDB import FastMMCIFParser
                parser = FastMMCIFParser(QUIET=True)
            else:
                raise InvalidParameterError(
                    "Invalid file_format, valid options are: pdb, cif"
                )

            structure = parser.get_structure("", filename)
            return cls(structure)
        except FileNotFoundError as e:
            raise ResourceError(
                "Could not find file {}".format(filename)
            ) from e
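A minimal usage sketch, assuming ClassicPDB is the owning class and the file paths are illustrative (neither appears in the snippet itself):

# hypothetical usage; class name and paths are assumptions
structure = ClassicPDB.from_file("data/1hzx.pdb", file_format="pdb")
cif_structure = ClassicPDB.from_file("data/1hzx.cif", file_format="cif")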
Example #2
    @classmethod
    def from_id(cls, pdb_id):
        """
        Initialize structure by PDB ID (fetches
        structure from RCSB servers)

        Parameters
        ----------
        pdb_id : str
            PDB identifier (e.g. 1hzx)

        Returns
        -------
        PDB
            Initialized PDB structure
        """
        from urllib.error import URLError
        from Bio.PDB import PDBList
        pdblist = PDBList()

        try:
            # download PDB file to temporary directory
            pdb_file = pdblist.retrieve_pdb_file(pdb_id, pdir=tempdir())
            return cls.from_file(pdb_file, file_format="pdb")
        except URLError as e:
            raise ResourceError(
                "Could not fetch PDB data for {}".format(pdb_id)
            ) from e
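A usage sketch; tempdir is assumed to be a helper from the surrounding package that returns a temporary directory, and the owning class is assumed to expose the from_file classmethod from Example #1:

# hypothetical usage; downloads 1hzx from RCSB into a temporary
# directory, then parses it with the PDB parser
structure = PDB.from_id("1hzx")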
Example #3
    def _insert_file(self, filename, parent_id):
        """
        Insert file from filesystem into database

        Parameters
        ----------
        filename : str
            Path to file that is to be inserted
        parent_id : bson.ObjectId
            MongoDB identifier of job document this
            file is linked to

        Returns
        -------
        dict
            Dictionary with keys "filename" (original file
            path) and "fs_id" (ObjectId of inserted file
            in GridFS)
        """
        def _insert():
            with open(filename, "rb") as f:
                return self.fs.put(f,
                                   parent_id=parent_id,
                                   job_id=self.job_id,
                                   filename=filename,
                                   time_saved=datetime.utcnow())

        try:
            id_ = self._retry_query(_insert)
        except OSError as e:
            raise ResourceError(
                "Could not read {} for storing in MongoDB backend".format(
                    filename)) from e

        return {"filename": filename, "fs_id": id_}
Example #4
def fetch_sequence(sequence_id, sequence_file, sequence_download_url,
                   out_file):
    """
    Fetch sequence either from database based on identifier, or from
    input sequence file.

    Parameters
    ----------
    sequence_id : str
        Identifier of sequence that should be retrieved
    sequence_file : str
        File containing sequence. If None, sequence will
        be downloaded from sequence_download_url
    sequence_download_url : str
        URL from which to download missing sequence. Must
        contain "{}" at the position where sequence ID will
        be inserted into download URL (using str.format).
    out_file : str
        Output file in which sequence will be stored, if
        sequence_file does not exist.

    Returns
    -------
    str
        Path of file with stored sequence (can be sequence_file
        or out_file)
    tuple (str, str)
        Identifier of sequence as stored in file, and sequence
    """
    if sequence_file is None:
        get(sequence_download_url.format(sequence_id),
            out_file,
            allow_redirects=True)
    else:
        # if we have sequence file, try to copy it
        try:
            copy(sequence_file, out_file)
        except FileNotFoundError as e:
            raise ResourceError(
                "sequence_file does not exist: {}".format(sequence_file)
            ) from e

    # also make sure input file has something in it
    verify_resources("Input sequence missing", out_file)

    with open(out_file) as f:
        seq = next(read_fasta(f))

    return out_file, seq
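A usage sketch; the download URL is a placeholder, and get, copy, verify_resources and read_fasta are assumed to come from the surrounding package:

# hypothetical usage; no local file given, so the sequence is downloaded
out_path, (seq_id, sequence) = fetch_sequence(
    sequence_id="P00533",
    sequence_file=None,
    sequence_download_url="https://example.org/sequences/{}.fasta",
    out_file="P00533.fasta",
)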
Example #5
    @classmethod
    def from_file(cls, filename):
        """
        Initialize structure from MMTF file

        Parameters
        ----------
        filename : str
            Path of MMTF file

        Returns
        -------
        PDB
            Initialized PDB structure
        """
        try:
            return cls(parse(filename))
        except FileNotFoundError as e:
            raise ResourceError(
                "Could not find file {}".format(filename)
            ) from e
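A usage sketch, assuming parse is mmtf.parse (imported at module level) and the path is illustrative:

# hypothetical usage; reads a local MMTF file
structure = PDB.from_file("data/1hzx.mmtf")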
Example #6
    @classmethod
    def from_id(cls, pdb_id):
        """
        Initialize structure by PDB ID (fetches
        structure from RCSB servers)

        Parameters
        ----------
        pdb_id : str
            PDB identifier (e.g. 1hzx)

        Returns
        -------
        PDB
            Initialized PDB structure
        """
        try:
            return cls(fetch(pdb_id))
        except HTTPError as e:
            raise ResourceError(
                "Could not fetch MMTF data for {}".format(pdb_id)
            ) from e
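A usage sketch, assuming fetch is mmtf.fetch and HTTPError is urllib.error.HTTPError, both imported at module level:

# hypothetical usage; retrieves the MMTF record for 1hzx over HTTP
structure = PDB.from_id("1hzx")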
Example #7
def fetch_uniprot_mapping(ids, from_="ACC", to="ACC", format="fasta"):
    """
    Fetch data from UniProt ID mapping service
    (e.g. download set of sequences)

    Parameters
    ----------
    ids : list(str)
        List of UniProt identifiers for which to
        retrieve mapping
    from_ : str, optional (default: "ACC")
        Source identifier (i.e. contained in "ids" list)
    to : str, optional (default: "ACC")
        Target identifier (to which source should be mapped)
    format : str, optional (default: "fasta")
        Output format to request from Uniprot server

    Returns
    -------
    str
        Response from UniProt server
    """
    params = {
        "from": from_,
        "to": to,
        "format": format,
        "query": " ".join(ids)
    }
    url = UNIPROT_MAPPING_URL
    r = requests.post(url, data=params)

    if r.status_code != requests.codes.ok:
        raise ResourceError("Invalid status code ({}) for URL: {}".format(
            r.status_code, url))

    return r.text
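A usage sketch; UNIPROT_MAPPING_URL is assumed to be a module-level constant for a UniProt mapping endpoint that accepts these POST parameters:

# hypothetical usage; fetch FASTA records for two accessions
fasta_text = fetch_uniprot_mapping(["P00533", "P01112"])
print(fasta_text.splitlines()[0])  # e.g. a ">sp|..." header line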
Example #8
def run_plmc(alignment, couplings_file, param_file=None,
             focus_seq=None, alphabet=None, theta=None,
             scale=None, ignore_gaps=False, iterations=None,
             lambda_h=None, lambda_J=None, lambda_g=None,
             cpu=None, binary="plmc"):
    """
    Run plmc on sequence alignment and store
    files with model parameters and pair couplings.

    Parameters
    ----------
    alignment : str
        Path to input sequence alignment
    couplings_file : str
        Output path for file with evolutionary couplings
        (folder will be created)
    param_file : str
        Output path for binary file containing model
        parameters (folder will be created)
    focus_seq : str, optional (default: None)
        Name of focus sequence, if None, non-focus mode
        will be used
    alphabet : str, optional (default: None)
        Alphabet for model inference. If None, standard
        amino acid alphabet including gap will be used.
        First character in string corresponds to gap
        character (relevant for ignore_gaps).
    theta : float, optional (default: None)
        Sequences with pairwise identity >= theta
        will be clustered and their sequence weights
        downweighted as 1 / num_cluster_members.
        Important: Note that plmc will be parametrized using
        1 - theta. If None, default value in plmc will be used,
        which corresponds to theta=0.8 (plmc setting 0.2).
    scale : float, optional (default: None)
        Scale weights of clusters by this value.
        If None, default value in plmc (1.0) will be used
    ignore_gaps : bool, optional (default: False)
        Exclude gaps from parameter inference. Gap
        character is first character of alphabet
        parameter.
    iterations : int, optional (default: None)
        Maximum iterations for optimization.
    lambda_h : float, optional (default: None)
        L2 regularization strength on fields.
        If None, plmc default will be used.
    lambda_J : float, optional (default: None)
        L2 regularization strength on couplings.
        If None, plmc default will be used.
    lambda_g : float, optional (default: None)
        Group L1 regularization strength on couplings.
        If None, plmc default will be used.
    cpu : int or str, optional (default: None)
        Number of cores to use for running plmc.
        Note that plmc has to be compiled in OpenMP
        mode to be runnable with multiple cores.
        Can also be set to "max".
    binary : str, optional (default: "plmc")
        Path to plmc binary

    Returns
    -------
    PlmcResult
        namedtuple containing output files and
        parsed fields from console output of plmc

    Raises
    ------
    ExternalToolError
    """
    create_prefix_folders(couplings_file)

    # Make sure input alignment exists
    verify_resources(
        "Alignment file does not exist", alignment
    )

    cmd = [
        binary,
        "-c", couplings_file,
    ]

    # store eij file if explicitly requested
    if param_file is not None:
        create_prefix_folders(param_file)
        cmd += ["-o", param_file]

    # focus sequence mode and ID
    if focus_seq is not None:
        # TODO: for now split exclude sequence
        # region from focus seq name, otherwise
        # plmc does not remap names. If this
        # behaviour changes in plmc, remove the
        # following line.
        focus_seq = focus_seq.split("/")[0]
        cmd += ["-f", focus_seq]

    # exclude gaps from calculation?
    if ignore_gaps:
        cmd += ["-g"]

    # maximum number of iterations, can also be "max"
    if iterations is not None:
        cmd += ["-m", str(iterations)]

    # set custom alphabet
    # (first character is gap by default in nogap mode)
    if alphabet is not None:
        cmd += ["-a", alphabet]

    # sequence reweighting
    if theta is not None:
        # transform into plmc convention (1-theta)
        theta = 1.0 - theta
        cmd += ["-t", str(theta)]

    # cluster weight
    if scale is not None:
        cmd += ["-s", str(scale)]

    # L2 regularization weight for fields
    if lambda_h is not None:
        cmd += ["-lh", str(lambda_h)]

    # L2 regularization weight for pair couplings
    if lambda_J is not None:
        cmd += ["-le", str(lambda_J)]

    # Group L1 regularization weight for pair couplings
    if lambda_g is not None:
        cmd += ["-lg", str(lambda_g)]

    # Number of cores to use for calculation
    if cpu is not None:
        cmd += ["-n", str(cpu)]

    # finally also add input alignment (main parameter)
    cmd += [alignment]

    # TODO: for now do not check returncode because sometimes
    # returncode == -11 (segfault) despite successful calculation
    return_code, stdout, stderr = run(cmd, check_returncode=False)

    # TODO: remove this segfault-hunting output once fixed
    if return_code != 0:
        # if not a segfault, still raise exception
        if return_code != -11:
            from evcouplings.utils.system import ExternalToolError
            raise ExternalToolError(
                "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
                    cmd, return_code, stdout, stderr
                )
            )

        print("PLMC NON-ZERO RETURNCODE:", return_code)
        print(cmd)
        print(" ".join(cmd))
        print("stdout:", stdout)
        print("stderr:", stderr)

    iter_df, out_fields = parse_plmc_log(stderr)

    # also check we actually calculated couplings...
    if not valid_file(couplings_file):
        raise ResourceError(
            "plmc returned no couplings: stdout={} stderr={} file={}".format(
                stdout, stderr, couplings_file
            )
        )

    # ... and parameter file, if requested
    if param_file and not valid_file(param_file):
        raise ResourceError(
            "plmc returned no parameter file: stdout={} stderr={} file={}".format(
                stdout, stderr, param_file
            )
        )

    return PlmcResult(
        couplings_file, param_file,
        iter_df, *out_fields
    )
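A usage sketch with illustrative paths and hyperparameters (all values are assumptions, not recommended settings); note how the two conventions documented above play out:

# hypothetical usage; theta=0.8 is passed to plmc as "-t 0.2", and the
# "/1-166" region suffix is stripped from focus_seq before the call
result = run_plmc(
    alignment="alignments/RASH_HUMAN.a2m",
    couplings_file="couplings/RASH_HUMAN.txt",
    param_file="couplings/RASH_HUMAN.params",
    focus_seq="RASH_HUMAN/1-166",
    theta=0.8,
    lambda_J=16.0,
    cpu=4,
)
print(result.couplings_file)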
Example #9
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted
        into configuration file

    Returns
    -------
    dict
        Updated configuration
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file))

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # If alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        m = re.search(r"(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123):".format(region))

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]
        # check if we have a predefined sequence database
        # if so, use it; otherwise, interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are exclusively set
    if kwargs.get("bitscores", None) is not None and kwargs.get(
            "evalues", None) is not None:
        raise InvalidParameterError(
            "Can not specify bitscore and E-value threshold at the same time.")

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [(float(t) if "." in t else int(t)) for t in T]
        except ValueError:
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds))

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
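A usage sketch mirroring the command-line flags in CONFIG_MAP; the config file name is an assumption, and the file must already contain the sections referenced above (global, align, environment, databases). Passing two comma-separated bitscores exercises the batch branch:

# hypothetical usage; creates one sub-job per bitscore threshold
config = substitute_config(
    config="base_config.yml",
    prefix="output/run1",
    protein="P00533",
    region="5-123",
    stages="align, couplings",
    bitscores="0.3, 0.5",
)
# config["batch"] now contains keys "_b0.3" and "_b0.5"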
Example #10
    def create_sequence_file(self,
                             output_file,
                             chunk_size=1000,
                             max_retries=100):
        """
        Create FASTA sequence file containing all UniProt
        sequences of proteins in SIFTS. This file is required
        for homology-based structure identification and
        index remapping.
        This function will also automatically associate
        the sequence file with the SIFTS object.

        Parameters
        ----------
        output_file : str
            Path at which to store sequence file
        chunk_size : int, optional (default: 1000)
            Retrieve sequences from UniProt in chunks of this size
            (chunks that are too large cause the mapping service to stall)
        max_retries : int, optional (default: 100)
            Allow this many retries when fetching sequences
            from UniProt ID mapping service, which unfortunately
            often suffers from connection failures.
        """
        ids = self.table.uniprot_ac.unique().tolist()

        # retrieve sequences in chunks since ID mapping service
        # tends to fail on large requests
        id_chunks = [
            ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)
        ]

        # store individual retrieved chunks as list of strings
        seq_chunks = []

        # keep track of how many retries were necessary and
        # abort if number exceeds max_retries
        num_retries = 0

        for ch in id_chunks:
            # fetch sequence chunk;
            # if there is a problem retry as long as we stay within
            # maximum number of retries
            while True:
                try:
                    seqs = fetch_uniprot_mapping(ch)
                    break
                except requests.ConnectionError as e:
                    # count as failed try
                    num_retries += 1

                    # if we retried too often, abort
                    if num_retries > max_retries:
                        raise ResourceError(
                            "Could not fetch sequences for SIFTS mapping tables from UniProt since "
                            "maximum number of retries after connection errors was exceeded. Retry "
                            "at a later time, or call SIFTS.create_sequence_file() with a higher value "
                            "for max_retries.") from e

            # rename identifiers in sequence file, so
            # we can circumvent Uniprot sequence identifiers
            # being prefixed by hmmer if a hit has exactly the
            # same identifier as the query sequence
            seqs = seqs.replace(
                ">sp|",
                ">evsp|",
            ).replace(
                ">tr|",
                ">evtr|",
            )

            assert seqs.endswith("\n")

            # store for writing
            seq_chunks.append(seqs)

        # store sequences to FASTA file in one go at the end
        with open(output_file, "w") as f:
            f.write("".join(seq_chunks))

        self.sequence_file = output_file

        # add Uniprot ID column to SIFTS table
        self._add_uniprot_ids()
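A usage sketch; SIFTS is assumed to be the owning class, constructed from a mapping table that provides the uniprot_ac column used above:

# hypothetical usage; writes all mapped UniProt sequences to one FASTA file
s = SIFTS("pdb_chain_uniprot.csv")
s.create_sequence_file("sifts_sequences.fasta", chunk_size=500)
print(s.sequence_file)  # "sifts_sequences.fasta"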