Example #1
    def __init__(self, **kwargs):
        """
        Create new SQL-based tracker. For now, this tracker will ignore file_list
        and store all file paths in the database except for those in delete_list.

        Parameters
        ----------
        connection_string : str
            SQLite connection URI. Must include database name,
            and username/password if authentication is used.
        job_id : str
            Unique job identifier of job which should be tracked
        prefix : str
            Prefix of pipeline job
        pipeline : str
            Name of pipeline that is running
        file_list : list(str)
            List of file item keys from outconfig that should
            be stored in database. For now, this parameter has no
            effect and all file paths will be stored in database.
        delete_list : list(str)
            List of file item keys from outconfig that will be deleted
            after run is finished. These files cannot be stored as paths
            to the pipeline result in the output.
        config : dict(str)
            Entire configuration dictionary of job
        retry_max_number : int, optional (default: None)
            Maximum number of attempts to perform database queries / updates.
            If None, will try forever.
        retry_wait : int, optional (default: None)
            Time in seconds between retries to connect to database
        """
        super().__init__(**kwargs)

        # for SQL tracker, job ID may not be longer than 255 chars to not interfere with older SQL DBs
        if len(self.job_id) > 255:
            raise InvalidParameterError(
                "Length of job_id for SQL tracker may not exceed 255 characters for database compatibility reasons"
            )

        # create SQLAlchemy engine and session maker to
        # instantiate later sessions
        self._engine = create_engine(self.connection_string)
        self._Session = sessionmaker(bind=self._engine)

        # Make sure all tables are there in database
        Base.metadata.create_all(bind=self._engine)
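
The following is a hedged usage sketch (not part of the source): it shows how the keyword arguments documented above could be passed when constructing the tracker directly. The SQLTracker import path follows the get_result_tracker() example further below; all concrete values are placeholders.

from evcouplings.utils.tracker.sql import SQLTracker

tracker = SQLTracker(
    connection_string="sqlite:///jobs.db",  # SQLAlchemy-style URI
    job_id="myjob_2024_001",                # must not exceed 255 characters
    prefix="output/myjob",
    pipeline="protein_monomer",             # placeholder pipeline name
    file_list=None,                         # currently ignored, all paths are stored
    delete_list=["raw_alignment_file"],     # files removed after the run finishes
    config={},                              # full job configuration dictionary
    retry_max_number=5,
    retry_wait=10,
)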
Example #2
def run(**kwargs):
    """
    Run inference protocol to calculate ECs from
    input sequence alignment.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: EC protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of couplings stage
        Dictionary with results in following fields:
        (in brackets: not mandatory)

         ec_file
         effective_sequences
         [enrichment_file]
         focus_mode
         focus_sequence
         model_file
         num_sequences
         num_sites
         raw_ec_file
         region_start
         segments
    """
    check_required(kwargs, ["protocol"])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[kwargs["protocol"]](**kwargs)
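
The run() entry points in this and the following stage examples share one pattern: validate the protocol selection against a module-level PROTOCOLS dict, then forward all keyword arguments to the selected protocol function. A minimal call sketch (placeholder values; "standard" is an assumed protocol name, and the remaining required kwargs depend on the chosen protocol):

outcfg = run(
    protocol="standard",
    prefix="output/myjob/couplings/myjob",
    # ... protocol-specific parameters (alignment_file, theta, ...)
)
print(outcfg["ec_file"])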
Example #3
def run(**kwargs):
    """
    Run alignment concatenation protocol

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: concatenation protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of concatenation stage
        Dictionary with results in following fields:
        (in brackets: not mandatory)

        alignment_file
        raw_alignment_file
        focus_mode
        focus_sequence
        segments
        frequencies_file
        identities_file
        num_sequences
        num_sites
        raw_focus_alignment_file
        statistics_file

    """
    check_required(kwargs, ["protocol"])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[kwargs["protocol"]](**kwargs)
Example #4
def run(**kwargs):
    """
    Run alignment protocol to generate multiple sequence
    alignment from input sequence.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: Alignment protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of alignment stage
        Dictionary with results in following fields
        (in brackets: not returned by all protocols):

        * alignment_file
        * [raw_alignment_file]
        * statistics_file
        * target_sequence_file
        * sequence_file
        * [annotation_file]
        * frequencies_file
        * identities_file
        * [hittable_file]
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, ["protocol"])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(kwargs["protocol"], ", ".join(
                PROTOCOLS.keys())))

    return PROTOCOLS[kwargs["protocol"]](**kwargs)
Example #5
def run(**kwargs):
    """
    Run alignment concatenation protocol

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: concatenation protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of concatenation stage
        Dictionary with results in following fields (in brackets: not mandatory):

        .. todo::

            to be finalized after implementing protocols

        * alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * num_sites
        * num_sequences
    """
    check_required(kwargs, ["protocol"])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(kwargs["protocol"], ", ".join(
                PROTOCOLS.keys())))

    return PROTOCOLS[kwargs["protocol"]](**kwargs)
Example #6
def execute(**config):
    """
    Execute a pipeline configuration

    Parameters
    ----------
    **config
        Input configuration for pipeline
        (see pipeline config files for
        example of what this should look like)

    Returns
    -------
    global_state : dict
        Global output state of pipeline
    """
    check_required(config, ["pipeline", "stages", "global"])

    # check if valid pipeline was selected
    if config["pipeline"] not in PIPELINES:
        raise InvalidParameterError("Not a valid pipeline selection. "
                                    "Valid choices are:\n{}".format(", ".join(
                                        PIPELINES.keys())))

    stages = config["stages"]
    if stages is None:
        raise InvalidParameterError("No stages defined, need at least one.")

    # get definition of selected pipeline
    pipeline = PIPELINES[config["pipeline"]]
    prefix = config["global"]["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this is the global state of results as
    # we move through different stages of
    # the pipeline
    global_state = config["global"]

    # keep track of how many stages are still
    # to be run, so we can leave out stages at
    # the end of workflow below
    num_stages_to_run = len(stages)

    # get job tracker
    tracker = get_result_tracker(config)

    # set job status to running and also initialize global state
    tracker.update(status=EStatus.RUN, results=global_state)

    # iterate through individual stages
    for (stage, runner, key_prefix) in pipeline:
        # check if anything else is left to
        # run, otherwise skip
        if num_stages_to_run == 0:
            break

        # check if config for stage is there
        check_required(config, [stage])

        # output files for stage into an individual folder
        stage_prefix = insert_dir(prefix, stage)
        create_prefix_folders(stage_prefix)

        # config files for input and output of stage
        stage_incfg = "{}_{}.incfg".format(stage_prefix, stage)
        stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage)

        # update current stage of job
        tracker.update(stage=stage)

        # check if stage should be executed
        if stage in stages:
            # global state inserted at end, overrides any
            # stage-specific settings (except for custom prefix)
            incfg = {
                **config["tools"],
                **config["databases"],
                **config[stage],
                **global_state, "prefix": stage_prefix
            }
            # save input of stage in config file
            write_config_file(stage_incfg, incfg)

            # run stage
            outcfg = runner(**incfg)

            # prefix output keys if this parameter is
            # given in stage configuration, to avoid
            # name clashes if same protocol run multiple times
            if key_prefix is not None:
                outcfg = {key_prefix + k: v for k, v in outcfg.items()}

            # save output of stage in config file
            write_config_file(stage_outcfg, outcfg)

            # one less stage to put through after we ran this...
            num_stages_to_run -= 1
        else:
            # skip stage by injecting output state from previous run
            verify_resources(
                "Trying to skip, but output configuration "
                "for stage '{}' does not exist. Has it already "
                "been run?".format(stage, stage), stage_outcfg)

            # read output configuration
            outcfg = read_config_file(stage_outcfg)

            # verify all the output files are there
            outfiles = [
                filepath for f, filepath in outcfg.items()
                if f.endswith("_file") and filepath is not None
            ]

            verify_resources(
                "Output files from stage '{}' "
                "missing".format(stage), *outfiles)

        # update global state with outputs of stage
        global_state = {**global_state, **outcfg}

        # update state in tracker accordingly
        tracker.update(results=outcfg)

    # create results archive
    archive_file = create_archive(config, global_state, prefix)

    # only store results archive if a result file was created
    if archive_file is not None:
        global_state["archive_file"] = archive_file

        # prepare update for tracker, but only store in last
        # go when job is set to done
        tracker_archive_update = {"archive_file": archive_file}
    else:
        tracker_archive_update = None

    # set job status to done and transfer archive if selected for syncing
    tracker.update(status=EStatus.DONE, results=tracker_archive_update)

    # delete selected output files if requested;
    # tracker does not need to update here since it won't
    # sync entries of delete list in the first place
    global_state = delete_outputs(config, global_state)

    # write final global state of pipeline
    write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state)

    return global_state
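
A hedged sketch of the configuration shape execute() reads, reconstructed from the keys accessed above; pipeline and stage names as well as all values are placeholders.

config = {
    "pipeline": "protein_monomer",          # key into PIPELINES
    "stages": ["align", "couplings"],       # stages that will actually be run
    "global": {"prefix": "output/myjob"},   # seeds the initial global_state
    "tools": {},                            # external tool paths, merged into each stage input
    "databases": {},                        # database paths, merged into each stage input
    "management": {},                       # tracker / archiving settings
    # one configuration section per stage of the selected pipeline:
    "align": {"protocol": "standard"},
    "couplings": {"protocol": "standard"},
    # ... sections for any remaining pipeline stages
}

global_state = execute(**config)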
Example #7
def create_archive(config, outcfg, prefix):
    """
    Create archive of files generated by pipeline

    Parameters
    ----------
    config : dict-like
        Input configuration of job. Uses
        config["management"]["archive"] (list of key
        used to index outcfg) to determine
        which files should be added to archive
    outcfg : dict-like
        Output configuration of job
    prefix : str
        Prefix of job, will be used to define archive
        file path (prefix + archive type-specific extension)

    Returns
    -------
    str or None
        Path of the created archive file, or None if no files
        were selected for archiving or none of them exist
    """
    # allowed output archive formats
    ALLOWED_FORMATS = ["targz", "zip"]

    # determine selected output format, default is .tar.gz
    archive_format = config.get("management", {}).get("archive_format",
                                                      "targz")

    # determine keys (corresponding to files) in
    # outcfg that should be stored
    archive_keys = config.get("management", {}).get("archive", None)

    # if no files selected for archiving, return immediately
    if archive_keys is None:
        return

    # check if selected format is valid
    if archive_format not in ALLOWED_FORMATS:
        raise InvalidParameterError(
            "Invalid format for output archive: {}. ".format(archive_format) +
            "Valid options are: " + ", ".join(ALLOWED_FORMATS))

    # create explicit list of files that would go into archive and are valid files
    archive_files = [
        (file_path, file_key, idx)
        for (file_path, file_key, idx) in iterate_files(outcfg, subset=archive_keys)
        if valid_file(file_path)
    ]

    # if there are no files, don't create archive
    if len(archive_files) == 0:
        return

    if archive_format == "targz":
        final_archive_file = prefix + ".tar.gz"
        with tarfile.open(final_archive_file, "w:gz") as tar:
            for (file_path, file_key, idx) in archive_files:
                tar.add(file_path)
    elif archive_format == "zip":
        final_archive_file = prefix + ".zip"
        with zipfile.ZipFile(final_archive_file, "w",
                             zipfile.ZIP_DEFLATED) as zip_:
            for (file_path, file_key, idx) in archive_files:
                zip_.write(file_path)

    return final_archive_file
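
An illustrative management section (assumed key values) showing how archiving is controlled: "archive" lists the outcfg keys whose files go into the archive, and "archive_format" picks one of the ALLOWED_FORMATS above.

config = {
    "management": {
        "archive": ["ec_file", "alignment_file", "statistics_file"],
        "archive_format": "zip",            # default is "targz" -> <prefix>.tar.gz
    }
}

outcfg = {
    "ec_file": "output/myjob_CouplingScores.csv",
    "alignment_file": "output/myjob.a2m",
}

# returns the archive path, or None if nothing was selected or no file exists
archive_file = create_archive(config, outcfg, prefix="output/myjob")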
Example #8
def run_cns(inp_script=None, inp_file=None, log_file=None, binary="cns"):
    """
    Run CNSsolve 1.21 (without worrying about environment setup)

    Note that the user is responsible for verifying the output products
    of CNS, since their paths are determined by .inp scripts and
    are hard to check automatically in a general way.

    Either inp_script or inp_file has to be specified.

    Parameters
    ----------
    inp_script : str, optional (default: None)
        CNS ".inp" input script (actual commands, not file)
    inp_file : str, optional (default: None)
        Path to .inp input script file. Will override
        inp_script if also specified.
    log_file : str, optional (default: None)
        Save CNS stdout output to this file
    binary : str, optional (default: "cns")
        Absolute path of CNS binary

    Raises
    ------
    ExternalToolError
        If call to CNS fails
    InvalidParameterError
        If no input script (file or string) given
    """
    # make sure we have absolute path
    binary = path.abspath(binary)

    # extract main installation directory
    cns_main_dir = binary
    for i in range(3):
        cns_main_dir = path.dirname(cns_main_dir)

    # create environment
    env = deepcopy(os.environ)
    library_dir = path.join(cns_main_dir, "libraries")
    module_dir = path.join(cns_main_dir, "modules")

    env["CNS_SOLVE"] = cns_main_dir
    env["CNS_LIB"] = library_dir
    env["CNS_MODULE"] = module_dir
    env["CNS_HELPLIB"] = path.join(cns_main_dir, "helplip")

    for var, subdir in [
        ("CNS_TOPPAR", "toppar"),
        ("CNS_CONFDB", "confdb"),
        ("CNS_XTALLIB", "xtal"),
        ("CNS_NMRLIB", "nmr"),
        ("CNS_XRAYLIB", "xray"),
    ]:
        env[var] = path.join(library_dir, subdir)

    for var, subdir in [
        ("CNS_XTALMODULE", "xtal"),
        ("CNS_NMRMODULE", "nmr"),
    ]:
        env[var] = path.join(module_dir, subdir)

    if inp_script is None and inp_file is None:
        raise InvalidParameterError(
            "Must specify either input_script or input_file")

    # read input script, this is fed into CNS using stdin
    if inp_file is not None:
        with open(inp_file) as f:
            inp_script = "".join(f.readlines())

    # run and store output
    return_code, stdout, stderr = run(binary, stdin=inp_script)

    # write stdout output to log file
    if log_file is not None:
        with open(log_file, "w") as f:
            f.write(stdout)
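
A minimal usage sketch (assumed paths and script content): feed an inline CNS input script via stdin and capture the CNS log. The binary path determines the installation directory, since run_cns() strips three path components from it to locate libraries and modules.

inp = """
! placeholder CNS input script
stop
"""

run_cns(
    inp_script=inp,
    log_file="output/myjob_cns.log",
    binary="/opt/cns_solve_1.21/intel-x86_64bit-linux/bin/cns",  # placeholder install path
)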
Example #9
def run_cns_13(inp_script=None,
               inp_file=None,
               log_file=None,
               source_script=None,
               binary="cns"):
    """
    Run CNSsolve 1.3

    Note that the user is responsible for verifying the output products
    of CNS, since their paths are determined by .inp scripts and
    are hard to check automatically in a general way.

    Either inp_script or inp_file has to be specified.

    Parameters
    ----------
    inp_script : str, optional (default: None)
        CNS ".inp" input script (actual commands, not file)
    inp_file : str, optional (default: None)
        Path to .inp input script file. Will override
        inp_script if also specified.
    log_file : str, optional (default: None)
        Save CNS stdout output to this file
    source_script : str, optional (default: None)
        Script to set CNS environment variables.
        This should typically point to .cns_solve_env_sh
        in the CNS installation main directory (the
        shell script itself needs to be edited to
        contain the path of the installation)
    binary : str, optional (default: "cns")
        Name of CNS binary

    Raises
    ------
    ExternalToolError
        If call to CNS fails
    InvalidParameterError
        If no input script (file or string) given
    """
    # usually need to source script to set up environment for CNS
    if source_script is not None:
        cmd = "source {};".format(source_script)
    else:
        cmd = ""

    cmd += binary

    if inp_script is None and inp_file is None:
        raise InvalidParameterError(
            "Must specify either input_script or input_file")

    # read input script, this is fed into CNS using stdin
    if inp_file is not None:
        with open(inp_file) as f:
            inp_script = "".join(f.readlines())

    # run and store output
    return_code, stdout, stderr = run(cmd, stdin=inp_script, shell=True)

    # write stdout output to log file
    if log_file is not None:
        with open(log_file, "w") as f:
            f.write(stdout)
Example #10
def get_result_tracker(config):
    """
    Create result tracker from configuration

    Parameters
    ----------
    config : dict
        Complete job configuration, including
        "global" and "management" sections.

    Returns
    -------
    evcouplings.utils.tracker.ResultTracker
        Job tracker instance according to config
    """
    # first make copy of config so tracker can't influence
    # job in any way by accident
    config = deepcopy(config)

    management = config.get("management", {})
    tracker_type = management.get("tracker_type")

    # if no tracker selected, return NullTracker right away
    # and don't bother with all parameter setup below
    if tracker_type is None:
        return NullTracker()

    # connection string for database (or the like)
    connection_string = management.get("connection_string")

    # get unique job ID, job prefix and pipeline
    job_id = management.get("job_id", None)
    prefix = config.get("global", {}).get("prefix", None)
    pipeline = config.get("pipeline")

    # list of files that tracker should store
    file_list = management.get("tracker_file_list", None)

    # list of files that pipeline will delete
    delete_list = management.get("delete", [])

    # if we don't have these settings, cannot track job
    if connection_string is None:
        raise InvalidParameterError(
            "Must provide parameter 'connection_string' in management section "
            "of config when using a tracker.")

    if job_id is None:
        raise InvalidParameterError(
            "Must provide unique 'job_id' in management section "
            "of config when using a tracker.")

    # see if we have authentication information in the
    # environment variables (for careful people...)
    # Default is to authenticate using username/password
    # in URI
    env_tracker_username = environ.get(TRACKER_USERNAME_KEY)
    env_tracker_password = environ.get(TRACKER_PASSWORD_KEY)

    # substitute username/password into connection string
    # (will only have an effect if these are present)
    if connection_string is not None:
        connection_string = connection_string.format(
            username=env_tracker_username, password=env_tracker_password)

    # retry settings
    retry_max_number = management.get("tracker_max_retries",
                                      TRACKER_MAX_NUM_RETRIES)
    retry_wait = management.get("tracker_retry_wait", TRACKER_RETRY_WAIT)

    kwargs = {
        "connection_string": connection_string,
        "job_id": job_id,
        "prefix": prefix,
        "pipeline": pipeline,
        "file_list": file_list,
        "delete_list": delete_list,
        "config": config,
        "retry_max_number": retry_max_number,
        "retry_wait": retry_wait
    }

    # all fields that go into database itself rather than inside config
    # are extracted from config object by now; config param as such only serves
    # as record of the entire configuration and shouldn't be accessed inside
    # tracker to extract any sort of parametrization of the tracker
    if tracker_type == "mongodb":
        from evcouplings.utils.tracker.mongodb import MongoDBTracker
        return MongoDBTracker(**kwargs)
    elif tracker_type == "sql":
        from evcouplings.utils.tracker.sql import SQLTracker
        return SQLTracker(**kwargs)
    else:
        raise InvalidParameterError(
            "Not a valid job result tracker: '{}'. "
            "Valid options are: None, 'sql', 'mongodb'".format(tracker_type))
Example #11
def complex(**kwargs):
    """
    Protocol:

    Run monomer alignment protocol and postprocess it for
    EVcomplex calculations

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the alignment protocol, and
        the following additional field:

        genome_location_file : path to file containing
            the genomic locations of CDSs corresponding to
            identifiers in the alignment.

    """
    check_required(kwargs, [
        "prefix", "alignment_protocol", "uniprot_to_embl_table",
        "ena_genome_location_table"
    ])

    verify_resources("Uniprot to EMBL mapping table does not exist",
                     kwargs["uniprot_to_embl_table"])

    verify_resources("ENA genome location table does not exist",
                     kwargs["ena_genome_location_table"])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # run the regular alignment protocol
    # (standard, existing, ...)
    alignment_protocol = kwargs["alignment_protocol"]

    if alignment_protocol not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid choice for alignment protocol: {}".format(
                alignment_protocol))

    outcfg = PROTOCOLS[kwargs["alignment_protocol"]](**kwargs)

    # if the user selected the existing alignment protocol
    # they can supply an input annotation file
    # which overwrites the annotation file generated by the existing protocol
    if alignment_protocol == "existing":
        check_required(kwargs, ["override_annotation_file"])

        if kwargs["override_annotation_file"] is not None:
            verify_resources("Override annotation file does not exist",
                             kwargs["override_annotation_file"])

            outcfg["annotation_file"] = prefix + "_annotation.csv"
            annotation_data = pd.read_csv(kwargs["override_annotation_file"])
            annotation_data.to_csv(outcfg["annotation_file"])

    # extract cds identifiers for alignment uniprot IDs
    cds_ids = extract_cds_ids(outcfg["alignment_file"],
                              kwargs["uniprot_to_embl_table"])

    # extract genome location information from ENA
    genome_location_filename = prefix + "_genome_location.csv"

    genome_location_table = extract_embl_annotation(
        cds_ids, kwargs["ena_genome_location_table"], genome_location_filename)

    genome_location_table = add_full_header(genome_location_table,
                                            outcfg["alignment_file"])

    genome_location_table.to_csv(genome_location_filename)
    outcfg["genome_location_file"] = genome_location_filename

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_complex.outcfg", outcfg)

    return outcfg
Example #12
def run_jobs(configs,
             global_config,
             overwrite=False,
             workdir=None,
             abort_on_error=True,
             environment=None):
    """
    Submit config to pipeline

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if only one job,
        the contents of this dictionary will be
        equal to the single element of config_files)
    overwrite : bool, optional (default: False)
        If True, allows overwriting previous run of the same
        config, otherwise will fail if results from previous
        execution are present
    workdir : str, optional (default: None)
        Workdir in which to run job (will combine
        workdir and prefix in joint path)
    abort_on_error : bool, optional (default: True)
        Abort entire job submission if error occurs for
        one of the jobs by propagating RuntimeError
    environment : str, optional (default: None)
        Allows passing a value for the environment parameter
        of the submitter; will override environment.configuration
        from global_config (e.g., for setting environment
        variables like passwords)

    Returns
    -------
    job_ids : dict
        Mapping from subjob prefix (keys in configs parameter)
        to identifier returned by submitter for each of the jobs
        that was *successfully* submitted (i.e. missing keys from
        configs param indicate these jobs could not be submitted).

    Raises
    ------
    RuntimeError
        If error encountered during submission and abort_on_error
        is True
    """
    cmd_base = environ.get("EVCOUPLINGS_RUNCFG_APP") or "evcouplings_runcfg"
    summ_base = environ.get(
        "EVCOUPLINGS_SUMMARIZE_APP") or "evcouplings_summarize"

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage).")

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to working
    # directory (above, we allow submitted jobs to run in an arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(summ_base, global_config["pipeline"],
                                    global_config["global"]["prefix"],
                                    " ".join(config_files))

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(
        global_config["environment"]["engine"],
        db_path=out_prefix + "_job_database.txt"
    )

    # collect individual submitted jobs here
    commands = []

    # record subjob IDs returned by submitter for each job
    job_ids = {}

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # create submission command
        env = job_cfg["environment"]
        cmd = utils.Command(
            ["{} {}".format(cmd_base, job_cfg_file), summ_cmd],
            name=job_prefix,
            environment=environment or env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            })

        # store job for later dependency creation
        commands.append(cmd)

        tracker = get_result_tracker(job_cfg)

        try:
            # finally, submit job
            current_job_id = submitter.submit(cmd)

            # store run identifier returned by submitter
            # TODO: consider storing current_job_id using tracker right away
            job_ids[job] = current_job_id

            # set job status in database to pending
            tracker.update(status=EStatus.PEND)

        except RuntimeError as e:
            # set job as failed in database
            tracker.update(status=EStatus.FAIL, message=str(e))

            # fail entire job submission if requested
            if abort_on_error:
                raise

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()

    # return job identifiers
    return job_ids
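
A hedged sketch of the inputs run_jobs() expects: one configuration dict per subjob (keyed by subjob prefix) plus the master configuration. All values are placeholders; the environment section must contain the keys read above (engine, queue, time, memory, cores, configuration).

env = {
    "engine": "slurm",            # placeholder submission engine
    "queue": "short",
    "time": "24:00",
    "cores": 4,
    "memory": 8000,
    "configuration": None,
}

global_config = {
    "pipeline": "protein_monomer",
    "global": {"prefix": "output/myjob"},
    "environment": env,
}

configs = {
    "output/myjob_b0.3": {
        "global": {"prefix": "output/myjob_b0.3"},
        "environment": env,
        "management": {},         # no tracker_type -> NullTracker for this subjob
    },
}

job_ids = run_jobs(configs, global_config, overwrite=True)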
Example #13
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted
        into configuration file

    Returns
    -------
    dict
        Updated configuration
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file))

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # If alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        m = re.search("(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123):".format(region))

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]
        # check if we have a predefined sequence database
        # if so, use it; otherwise, interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are exclusively set
    if kwargs.get("bitscores", None) is not None and kwargs.get(
            "evalues", None) is not None:
        raise InvalidParameterError(
            "Can not specify bitscore and E-value threshold at the same time.")

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [(float(t) if "." in t else int(t)) for t in T]
        except ValueError:
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds))

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
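
An illustrative call (assumed config file name and parameter values): with several comma-separated bitscore thresholds, substitute_config() writes a "batch" section with one subjob per threshold instead of setting a single domain/sequence threshold.

config = substitute_config(
    config="monomer_config.txt",  # placeholder path to an existing config file
    prefix="output/myjob",
    protein="P0A9X9",
    bitscores="0.3,0.5,0.7",
    # remaining command line parameters default to None and are left untouched
)

# resulting batch section (schematically):
# config["batch"] == {
#     "_b0.3": {"align": {"domain_threshold": 0.3, "sequence_threshold": 0.3}},
#     "_b0.5": {"align": {"domain_threshold": 0.5, "sequence_threshold": 0.5}},
#     "_b0.7": {"align": {"domain_threshold": 0.7, "sequence_threshold": 0.7}},
# }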
Example #14
def run_jobs(configs, global_config, overwrite=False, workdir=None):
    """
    Submit config to pipeline

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if only one job,
        the contents of this dictionary will be
        equal to the single element of config_files)
    """
    python = executable
    pipeline_path = path.abspath(pipeline.__file__)
    summarize_path = path.abspath(summarize.__file__)

    cmd_base = "{} {}".format(python, pipeline_path)
    summ_base = "{} {}".format(python, summarize_path)

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage)."
        )

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to working
    # directory (above, we allow submitted jobs to run in an arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(
        summ_base,
        global_config["pipeline"],
        global_config["global"]["prefix"],
        " ".join(config_files)
    )

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(
        global_config["environment"]["engine"],
        db_path=out_prefix + "_job_database.txt"
    )

    # collect individual submitted jobs here
    commands = []

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # set job status in database to pending
        pipeline.update_job_status(job_cfg, status=database.EStatus.PEND)

        # create submission command
        env = job_cfg["environment"]
        cmd = utils.Command(
            [
                "{} {}".format(cmd_base, job_cfg_file),
                summ_cmd
            ],
            name=job_prefix,
            environment=env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            }
        )

        # store job for later dependency creation
        commands.append(cmd)

        # finally, submit job
        submitter.submit(cmd)

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()
Example #15
def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
    """
    Identify homologs using jackhmmer or hmmbuild/hmmsearch

    Parameters
    ----------
    pdb_alignment_method : {"jackhmmer", "hmmsearch"}, 
             optional (default: "jackhmmer")
        Sequence alignment method used for searching the PDB
    **kwargs
        Passed into jackhmmer / hmmbuild_and_search protocol
        (see documentation for available options)

    Returns
    -------
    ali : evcouplings.align.Alignment
        Alignment of homologs of query sequence
        in sequence database
    hits : pandas.DataFrame
        Tabular representation of hits
    """

    # load default configuration
    config = parse_config(HMMER_CONFIG)

    # update with overrides from kwargs
    config = {
        **config,
        **kwargs,
    }

    # create temporary output if no prefix is given
    if config["prefix"] is None:
        config["prefix"] = path.join(tempdir(), "compare")

    check_required(config, ["prefix"])

    # run hmmsearch (possibly preceded by hmmbuild)
    if pdb_alignment_method == "hmmsearch":
        # set up config to run hmmbuild_and_search on the unfiltered alignment file
        updated_config = deepcopy(config)
        updated_config["alignment_file"] = config.get(
            "raw_focus_alignment_file")
        ar = hmmbuild_and_search(**updated_config)

        # For hmmbuild and search, we have to read the raw focus alignment file
        # to guarantee that the query sequence is present
        with open(ar["raw_focus_alignment_file"]) as a:
            ali = Alignment.from_file(a, "fasta")

    # run jackhmmer against sequence database
    # (any other value for pdb_alignment_method is
    # rejected by the else branch below)
    elif pdb_alignment_method == "jackhmmer":
        ar = jackhmmer_search(**config)

        with open(ar["raw_alignment_file"]) as a:
            ali = Alignment.from_file(a, "stockholm")

        # write alignment as FASTA file for easier checking by hand,
        # if necessary
        with open(config["prefix"] + "_raw.fasta", "w") as f:
            ali.write(f)
    else:
        raise InvalidParameterError(
            "Invalid pdb_alignment_method selected. Valid options are: " +
            ", ".join(["jackhmmer", "hmmsearch"]))

    # read hmmer hittable and simplify
    hits = read_hmmer_domtbl(ar["hittable_file"])

    hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[1])
    hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[2])

    hits = hits.rename(
        columns={
            "domain_score": "bitscore",
            "domain_i_Evalue": "e_value",
            "ali_from": "alignment_start",
            "ali_to": "alignment_end",
            "hmm_from": "hmm_start",
            "hmm_to": "hmm_end",
        })

    hits.loc[:, "alignment_start"] = pd.to_numeric(
        hits.alignment_start).astype(int)
    hits.loc[:,
             "alignment_end"] = pd.to_numeric(hits.alignment_end).astype(int)

    hits.loc[:, "alignment_id"] = (hits.target_name + "/" +
                                   hits.alignment_start.astype(str) + "-" +
                                   hits.alignment_end.astype(str))

    hits = hits.loc[:, [
        "alignment_id", "uniprot_ac", "uniprot_id", "alignment_start",
        "alignment_end", "bitscore", "e_value"
    ]]

    return ali, hits
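
An illustrative call with assumed keyword names (anything passed as kwargs simply overrides the defaults parsed from HMMER_CONFIG, so the exact keys depend on that configuration): run the jackhmmer-based search and filter the simplified hit table.

ali, hits = find_homologs(
    pdb_alignment_method="jackhmmer",
    prefix="output/compare/myjob",
    sequence_id="P0A9X9",                 # assumed override keys
    sequence_file="output/myjob.fasta",
)

significant_hits = hits.query("bitscore > 50")
print(significant_hits[["alignment_id", "bitscore", "e_value"]].head())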
Example #16
def verify_prefix(verify_subdir=True, **config):
    """
    Check if configuration contains a prefix,
    and that prefix is a valid directory we
    can write to on the filesystem
    
    Parameters
    ----------
    verify_subdir : bool, optional (default: True)
        Check if we can create subdirectory containing
        full prefix. Set this to False for outer evcouplings
        app loop.
    **config
        Input configuration for pipeline
        
    Returns
    -------
    prefix : str
        Verified prefix
    """
    # check we have a prefix entry, otherwise all hope is lost...
    try:
        prefix = config["global"]["prefix"]
    except KeyError:
        raise InvalidParameterError(
            "Configuration does not include 'prefix' setting in "
            "'global' section"
        )

    # make sure prefix is also specified
    if prefix is None:
        raise InvalidParameterError(
            "'prefix' must be specified and cannot be None"
        )

    # verify that prefix is workable in terms
    # of filesystem
    try:
        # make prefix folder
        create_prefix_folders(prefix)

        # try if we can write in the folder
        with open(prefix + ".test__", "w") as f:
            pass

        # get rid of the file again
        os.remove(prefix + ".test__")

        if verify_subdir:
            # make sure we can create a subdirectory
            sub_prefix = insert_dir(prefix, "test__")
            create_prefix_folders(sub_prefix)

            # remove again
            os.rmdir(path.dirname(sub_prefix))

    except OSError as e:
        raise InvalidParameterError(
            "Not a valid prefix: {}".format(prefix)
        ) from e

    return prefix
Example #17
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    # write the sorted ECs table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments)
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"],
                score_column="cn"  # "di
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs, score="cn")  # "di"
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # create JSON output and write to file
            f.write(
                evzoom_json(model) + "\n"
            )

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
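
A minimal call sketch using exactly the keyword arguments checked by check_required() above (all values are placeholders; the focus sequence header format is an assumption):

outcfg = mean_field(
    prefix="output/myjob/couplings/myjob",
    alignment_file="output/myjob.a2m",
    segments=None,
    focus_mode=True,                      # mean field DCA requires focus mode
    focus_sequence="MYPROT/1-100",        # placeholder focus sequence header
    theta=0.8,
    pseudo_count=0.5,
    alphabet=None,                        # defaults to protein alphabet
    min_sequence_distance=6,
)
print(outcfg["ec_file"], outcfg["effective_sequences"])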
Example #18
def dihedral_ranking(structure_files,
                     residues,
                     chain=None,
                     sec_struct_column="sec_struct_3state",
                     model=0):
    """
    Assess quality of a set of structure models by
    twist of predicted alpha-helices and beta-sheets.

    This function re-implements the final score table
    computed in make_alpha_beta_score_table.m from
    the original pipeline. Some of the implementation
    details were however modified, possibly leading
    to differences in the final computed ranking scores.

    Parameters
    ----------
    structure_files : list(str)
        Paths to PDB files that will be ranked
        (have to be in .pdb format)
    residues : pandas.DataFrame
        Residue table with secondary structure predictions
        (columns i, A_i and secondary structure column)
    chain : str, optional (default: None)
        Use this chain in each structure for the calculation.
        If None, will pick the only existing chain (if there
        are multiple chains, an InvalidParameterError will be
        raised).
    sec_struct_column : str, optional (default: sec_struct_3state)
        Column in residues dataframe that contains predicted
        secondary structure (H, E, C)
    model : int, optional (default: 0)
        Use this model from each PDB structure
        
    Returns
    -------
    pandas.DataFrame
        Table with final ranking scores (column ranking_score)
        and alpha and beta dihedral scores, as well as the
        maximum possible score (equal to the number
        of computed dihedrals).
        
    Raises
    ------
    InvalidParameterError
        If chain is None but multiple chains exist in any
        of the structures
    """
    res = []
    for filename in structure_files:
        # load structure
        struc = ClassicPDB.from_file(filename)

        # see if we can select the right chain
        if chain is None:
            chains = struc.model_to_chains[model]
            if len(chains) != 1:
                raise InvalidParameterError(
                    "Model has more than one chain, need to "
                    "specify chain parameter to disambiguate.")
            chain = chains[0]

        # extract chain from structure
        sel_chain = struc.get_chain(chain)

        # compute dihedral ranking score
        x = dihedral_ranking_score(sel_chain,
                                   residues,
                                   sec_struct_column,
                                   original=False)
        res.append((filename, *x))

    r = pd.DataFrame(res,
                     columns=[
                         "filename", "num_alpha_dihedrals",
                         "alpha_dihedral_score", "num_beta_dihedrals",
                         "beta_dihedral_score"
                     ])

    # maximum score we could have obtained for either set of dihedrals
    max_alpha = r.num_alpha_dihedrals.max()

    # note that, unlike for alpha dihedrals, beta dihedrals are dependent
    # on what contacts between strands are made in 3D structure models
    max_beta = r.num_beta_dihedrals.max()

    # compute final ranking score
    # this computation is somewhat different from original implementation:
    # - normalization for alpha helices is through number of possible dihedrals
    #   not the number of residues with helix secondary structure
    # - scores for helix and beta dihedrals are not adjusted to values < 0 in
    #   borderline cases

    # make sure we do not divide by 0 if we didn't count any dihedrals at all
    max_val = max(1, max_alpha + max_beta)

    r.loc[:, "ranking_score"] = (
        (r.alpha_dihedral_score + r.beta_dihedral_score) / max_val)

    return r
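
A usage sketch with assumed data: the residues table needs columns i, A_i and the predicted secondary structure column (H/E/C), one row per position; structure paths are placeholders.

import pandas as pd

residues = pd.DataFrame({
    "i": [1, 2, 3, 4, 5],
    "A_i": ["M", "K", "L", "V", "A"],
    "sec_struct_3state": ["C", "H", "H", "H", "C"],
})

ranking = dihedral_ranking(
    ["output/model_1.pdb", "output/model_2.pdb"],  # placeholder structure models
    residues,
    chain="A",
)
print(ranking.sort_values("ranking_score", ascending=False))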
Example #19
def complex(**kwargs):
    """
    Protocol:

    Infer ECs for protein complexes from alignment using plmc.
    Allows user to select scoring protocol.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required
        and infer_plmc()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        raw_ec_file
        model_file
        num_sites
        num_sequences
        effective_sequences

        focus_mode (passed through)
        focus_sequence (passed through)
        segments (passed through)
    """
    # for additional required parameters, see infer_plmc()
    check_required(
        kwargs,
        [
            "prefix", "min_sequence_distance",
            "scoring_model", "use_all_ecs_for_scoring",
        ]
    )

    prefix = kwargs["prefix"]

    # infer ECs and load them
    outcfg, ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(outcfg["model_file"])

    # following computations are mostly specific to complex pipeline

    # add mixture model probability
    if kwargs["scoring_model"] in SCORING_MODELS:
        if kwargs["use_all_ecs_for_scoring"] is not None:
            use_all_ecs = kwargs["use_all_ecs_for_scoring"]
        else:
            use_all_ecs = False

        ecs = complex_probability(
            ecs, kwargs["scoring_model"], use_all_ecs
        )

    else:
        raise InvalidParameterError(
            "Invalid scoring_model parameter: " +
            "{}. Valid options are: {}".format(
                kwargs["protocol"], ", ".join(SCORING_MODELS)
            )
        )

    # also create line-drawing script (for multiple chains)
    # by convention, we map first segment to chain A,
    # second to B, a.s.f.
    chain_mapping = dict(
        zip(
            [s.segment_id for s in segments],
            string.ascii_uppercase,
        )
    )

    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_line_plot=True,
            generate_enrichment=False,
            ec_filter="segment_i != segment_j or abs(i - j) >= {}",
            chain=chain_mapping
        )
    }
    
    # save just the inter protein ECs
    ## TODO: eventually have this accomplished by _postprocess_inference
    ## right now avoiding a second call with a different ec_filter
    ecs = pd.read_csv(outcfg["ec_file"])
    outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv"
    inter_ecs = ecs.query("segment_i != segment_j")
    inter_ecs.to_csv(outcfg["inter_ec_file"], index=False)

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_complex.outcfg", outcfg)

    # TODO: make the following complex-ready
    # EC enrichment:
    #
    # 1) think about making EC enrichment complex-ready and add
    # it back here - so far it only makes sense if all ECs are
    # on one segment
    #
    # EVzoom:
    #
    # 1) at the moment, EVzoom will use numbering before remapping
    # we should eventually get this to a point where segments + residue
    # index are displayed on EVzoom
    #
    # 2) note that this will currently use the default mixture model
    # selection for determining the EC cutoff, rather than the selection
    # used for the EC table above

    return outcfg
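A small sketch of the segment-to-chain mapping convention used for the line-drawing script above; the segment identifiers are hypothetical:

import string

# first segment is drawn as chain A, second as chain B, and so on
segment_ids = ["A_1", "B_1"]  # hypothetical segment identifiers
chain_mapping = dict(zip(segment_ids, string.ascii_uppercase))
# {'A_1': 'A', 'B_1': 'B'}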
Example #20
0
def cut_sequence(sequence,
                 sequence_id,
                 region=None,
                 first_index=None,
                 out_file=None):
    """
    Cut a given sequence to sub-range and save it in a file

    Parameters
    ----------
    sequence : str
        Full sequence that will be cut
    sequence_id : str
        Identifier of sequence, used to construct header
        in output file
    region : tuple(int, int), optional (default: None)
        Region that will be cut out of full sequence.
        If None, full sequence will be returned.
    first_index : int, optional (default: None)
        Define index of first position in sequence.
        Will be set to 1 if None.
    out_file : str, optional (default: None)
        Save sequence in a FASTA file (header:
        >sequence_id/start_region-end_region)

    Returns
    -------
    tuple(int, int)
        Region. If no input region is given, this will be
        (first_index, first_index + len(sequence) - 1);
        otherwise, the input region is returned.
    str
        Subsequence contained in region

    Raises
    ------
    InvalidParameterError
        Upon invalid region specification (violating boundaries
        of sequence)
    """
    cut_seq = None

    # (not using 1 as default value to allow parameter
    # to be unspecified in config file)
    if first_index is None:
        first_index = 1

    # last index is *inclusive*!
    if region is None:
        region = (first_index, first_index + len(sequence) - 1)
        cut_seq = sequence
    else:
        start, end = region
        str_start = start - first_index
        str_end = end - first_index + 1
        cut_seq = sequence[str_start:str_end]

        # make sure bounds are valid given the sequence that we have
        if str_start < 0 or str_end > len(sequence):
            raise InvalidParameterError(
                "Invalid sequence range: "
                "region={} first_index={} len(sequence)={}".format(
                    region, first_index, len(sequence)))

    # save sequence to file
    if out_file is not None:
        with open(out_file, "w") as f:
            header = "{}/{}-{}".format(sequence_id, *region)
            write_fasta([(header, cut_seq)], f)

    return region, cut_seq
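A minimal usage sketch of cut_sequence; the sequence and identifier are hypothetical:

# cut residues 3-5 (end inclusive) from a sequence whose first position has index 1
region, cut_seq = cut_sequence(
    sequence="MKVLATGHE",
    sequence_id="QUERY_SEQ",
    region=(3, 5),
    first_index=1,
)
# region == (3, 5), cut_seq == "VLA"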
Example #21
0
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn",
        ascending=False
    )

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
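A sketch of how this protocol might be invoked; paths and parameter values are hypothetical, and the downstream post-processing step may require additional configuration keys beyond those checked by check_required:

outcfg = mean_field(
    prefix="output/example",               # hypothetical job prefix
    alignment_file="output/example.a2m",   # focus-mode alignment in FASTA/a2m format
    segments=None,
    focus_mode=True,                       # mean field DCA requires focus mode
    focus_sequence="QUERY/1-100",
    theta=0.8,                             # sequence clustering threshold
    pseudo_count=0.5,
    alphabet=None,                         # None -> default protein alphabet
    min_sequence_distance=6,
)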
Example #22
0
def _identify_structures(**kwargs):
    """
    Identify set of 3D structures for comparison

    Parameters
    ----------
    **kwargs
        See check_required in code below

    Returns
    -------
    SIFTSResult
        Identified structures and residue index mappings
    """
    def _filter_by_id(x, id_list):
        x = deepcopy(x)
        x.hits = x.hits.loc[x.hits.pdb_id.isin(id_list)]
        return x

    check_required(kwargs, [
        "prefix", "pdb_ids", "compare_multimer", "max_num_hits",
        "max_num_structures", "pdb_mmtf_dir", "sifts_mapping_table",
        "sifts_sequence_db", "by_alignment", "pdb_alignment_method",
        "alignment_min_overlap", "sequence_id", "sequence_file", "region",
        "use_bitscores", "domain_threshold", "sequence_threshold"
    ])
    # get SIFTS mapping object/sequence DB
    s = SIFTS(kwargs["sifts_mapping_table"], kwargs["sifts_sequence_db"])

    reduce_chains = not kwargs["compare_multimer"]

    # determine if we need to find structures
    # by sequence search or just fetching
    # based on Uniprot/PDB identifier
    if kwargs["by_alignment"]:

        # if searching by alignment, verify that
        # user selected jackhmmer or hmmsearch
        SEARCH_METHODS = ["jackhmmer", "hmmsearch"]

        if kwargs["pdb_alignment_method"] not in SEARCH_METHODS:
            raise InvalidParameterError("Invalid pdb search method: " +
                                        "{}. Valid selections are: {}".format(
                                            ", ".join(SEARCH_METHODS.keys())))

        sifts_map = s.by_alignment(reduce_chains=reduce_chains,
                                   min_overlap=kwargs["alignment_min_overlap"],
                                   **kwargs)
    else:
        sifts_map = s.by_uniprot_id(kwargs["sequence_id"],
                                    reduce_chains=reduce_chains)

    sifts_map_full = deepcopy(sifts_map)

    # filter ID list down to manually selected PDB entries
    if kwargs["pdb_ids"] is not None:
        pdb_ids = kwargs["pdb_ids"]

        # make sure we have a list of PDB IDs
        if not isinstance(pdb_ids, list):
            pdb_ids = [pdb_ids]

        pdb_ids = [x.lower() for x in pdb_ids]

        sifts_map = _filter_by_id(sifts_map, pdb_ids)

    # limit number of hits and structures
    if kwargs["max_num_hits"] is not None:
        sifts_map.hits = sifts_map.hits.iloc[:kwargs["max_num_hits"]]

    if kwargs["max_num_structures"] is not None:
        keep_ids = sifts_map.hits.pdb_id.unique()
        keep_ids = keep_ids[:kwargs["max_num_structures"]]
        sifts_map = _filter_by_id(sifts_map, keep_ids)

    return sifts_map, sifts_map_full
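A compact sketch of the PDB ID filtering applied above, on a hypothetical SIFTS hits table:

import pandas as pd

hits = pd.DataFrame({
    "pdb_id": ["1abc", "2xyz", "3def"],
    "pdb_chain": ["A", "B", "A"],
})

# user-supplied IDs are lower-cased before filtering, as in the code above
pdb_ids = [x.lower() for x in ["1ABC", "3DEF"]]
filtered_hits = hits.loc[hits.pdb_id.isin(pdb_ids)]
# keeps only the rows for 1abc and 3def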
Example #23
0
    def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
        # this file is starting point of pipeline;
        # check if input alignment actually exists

        verify_resources("Input alignment does not exist",
                         input_alignment_file)

        # first try to autodetect format of alignment
        with open(input_alignment_file) as f:
            format = detect_format(f)
            if format is None:
                raise InvalidParameterError(
                    "Format of input alignment {} could not be "
                    "automatically detected.".format(input_alignment_file))

        with open(input_alignment_file) as f:
            ali_raw = Alignment.from_file(f, format)

        # Target sequence of alignment
        sequence_id = kwargs["sequence_id"]

        if sequence_id is None:
            raise InvalidParameterError(
                "Parameter sequence_id must be defined")

        # First, find focus sequence in alignment
        focus_index = None
        for i, id_ in enumerate(ali_raw.ids):
            if id_.startswith(sequence_id):
                focus_index = i
                break

        # if we didn't find it, cannot continue
        if focus_index is None:
            raise InvalidParameterError(
                "Target sequence {} could not be found in alignment".format(
                    sequence_id))

        # identify what columns (non-gap) to keep for focus
        # this should be all columns in the raw_focus_alignment_file
        # but checking anyway
        focus_seq = ali_raw[focus_index]
        focus_cols = np.array([
            c not in [ali_raw._match_gap, ali_raw._insert_gap]
            for c in focus_seq
        ])

        # extract focus alignment
        focus_ali = ali_raw.select(columns=focus_cols)
        focus_seq_nogap = "".join(focus_ali[focus_index])

        # determine region of sequence. If first_index is given,
        # use that in any case, otherwise try to autodetect
        full_focus_header = ali_raw.ids[focus_index]
        focus_id = full_focus_header.split()[0]

        # try to extract region from sequence header
        id_, region_start, region_end = parse_header(focus_id)

        # override with first_index if given
        if kwargs["first_index"] is not None:
            region_start = kwargs["first_index"]
            region_end = region_start + len(focus_seq_nogap) - 1

        if region_start is None or region_end is None:
            raise InvalidParameterError(
                "Could not extract region information " +
                "from sequence header {} ".format(full_focus_header) +
                "and first_index parameter is not given.")

        # resubstitute full sequence ID from identifier
        # and region information
        header = "{}/{}-{}".format(id_, region_start, region_end)

        focus_ali.ids[focus_index] = header

        # write target sequence to file
        target_sequence_file = prefix + ".fa"
        with open(target_sequence_file, "w") as f:
            write_fasta([(header, focus_seq_nogap)], f)

        # swap target sequence to first position if it is not
        # the first sequence in alignment;
        # this is particularly important for hhfilter run
        # because target sequence might otherwise be filtered out
        if focus_index != 0:
            indices = np.arange(0, len(focus_ali))
            indices[0] = focus_index
            indices[focus_index] = 0
            focus_index = 0
            focus_ali = focus_ali.select(sequences=indices)

        # write the raw focus alignment for hmmbuild
        focus_fasta_file = prefix + "_raw_focus_input.fasta"
        with open(focus_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

        return focus_fasta_file, target_sequence_file, region_start, region_end
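A short sketch of the region handling above; parse_header is assumed to split identifiers of the form "ID/start-end", and all values here are hypothetical:

# header without a range: parse_header would return (id_, None, None),
# so first_index from the config must supply the region start
id_, region_start, region_end = "QUERY", None, None
first_index = 20                 # hypothetical first_index from kwargs
focus_seq_nogap = "A" * 101      # hypothetical ungapped focus sequence

if first_index is not None:
    region_start = first_index
    region_end = region_start + len(focus_seq_nogap) - 1

header = "{}/{}-{}".format(id_, region_start, region_end)
# "QUERY/20-120"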
Example #24
0
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences

        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file", "segments",
            "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance", # "save_model",
            "ec_score_type",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    # Note: this now deviates from the original EC format
    # file because it has 4 score columns to accommodate
    # MI (raw), MI (APC-corrected), DI, CN;
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        names=["i", "A_i", "j", "A_j", "mi_raw", "mi_apc", "di", "cn"]
    )

    # select target score;
    # by default select CN score, since it allows computing probabilities etc.
    ec_score_type = kwargs.get("ec_score_type", "cn")
    valid_ec_type_choices = ["cn", "di", "mi_raw", "mi_apc"]

    if ec_score_type not in valid_ec_type_choices:
        raise InvalidParameterError(
            "Invalid choice for valid_ec_type: {}, valid options are: {}".format(
                ec_score_type, ", ".join(valid_ec_type_choices)
            )
        )

    # perform rescoring if CN score is selected, otherwise cannot rescore
    # since all models are based on distribution shapes generated by CN score
    if ec_score_type == "cn":
        # perform EC rescoring starting from CN score output by plmc;
        # outconfig update will be merged further down in final outcfg merge

        # returned list is already sorted
        ecs, rescorer_outcfg_update = rescore_cn_score_ecs(
            ecs, segments, outcfg, kwargs, score="cn"
        )
    else:
        # If MI or DI, cannot apply distribution-based rescoring approaches,
        # so just set score column and add dummy probability value for compatibility
        # with downstream code
        ecs = ecs.assign(
            score=ecs[ec_score_type],
            probability=np.nan
        ).sort_values(
            by="score",
            ascending=False
        )

        # no additional values to be updated in outcfg in this case
        rescorer_outcfg_update = {}

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **rescorer_outcfg_update,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment,
            score="score"
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
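A compact sketch of the score-column selection in the non-CN branch above, on a hypothetical EC table:

import numpy as np
import pandas as pd

ecs = pd.DataFrame({
    "i": [1, 2], "A_i": ["A", "L"], "j": [5, 9], "A_j": ["V", "F"],
    "mi_raw": [0.10, 0.30], "mi_apc": [0.05, 0.25],
    "di": [0.20, 0.40], "cn": [0.30, 0.60],
})

ec_score_type = "di"   # e.g. selected via the ec_score_type config option

# set a unified score column plus a dummy probability, then sort by score
ecs = ecs.assign(
    score=ecs[ec_score_type], probability=np.nan
).sort_values(by="score", ascending=False)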
Example #25
0
def compare_models_maxcluster(experiments, predictions, norm_by_intersection=True,
                              distance_cutoff=None, binary="maxcluster"):
    """
    Compare predicted models to a set of experimental structures
    using maxcluster
    
    Parameters
    ----------
    experiments : list(str)
        Paths to files with experimental structures
    predictions : list(str)
        Paths to files with predicted structures
    norm_by_intersection : bool, optional (default: True)
        If True, use the number of positions that exist
        in both experiment and predictions for normalizing
        TM scores (assumes all predicted structures have the
        same positions). If False, use length of experimental
        structure.
    distance_cutoff : float, optional (default: None)
        Distance cutoff for MaxSub search (-d option of maxcluster).
        If None, will use maxcluster auto-calibration.
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    full_result : pandas.DataFrame
        Comparison results across all experimental structures
    single_results : dict
        Mapping from experimental structure filename to
        a pandas.DataFrame containing the comparison
        result for that particular structure.
    """
    # determine list of positions in a structure
    # (maxcluster can only handle single model, single chain
    # structures, so we check that here and fail otherwise)
    def _determine_pos(filename):
        structure = ClassicPDB.from_file(filename)
        if len(structure.model_to_chains) == 0:
            raise InvalidParameterError(
                "Structure contains no model (is empty): " +
                filename +
                " - please verify that no problems occurred during structure mapping"
            )
        elif len(structure.model_to_chains) > 1:
            raise InvalidParameterError(
                "Structure contains more than one model: " +
                filename
            )

        model = list(structure.model_to_chains.keys())[0]
        chains = structure.model_to_chains[model]
        if len(chains) != 1:
            raise InvalidParameterError(
                "Structure must contain exactly one chain, but contains: " +
                ",".join(chains)
            )
        chain_name = chains[0]
        chain = structure.get_chain(chain_name, model)
        return chain.residues.id.astype(str).values, chain

    # remove alternative atom locations since maxcluster
    # can only handle one unique location per atom
    def _eliminate_altloc(chain):
        # if multiple locations, select the one with the
        # highest occupancy
        chain.coords = chain.coords.loc[
            chain.coords.groupby(
                ["residue_index", "atom_name"]
            ).occupancy.idxmax()
        ]

        # save cut chain to temporary file
        temp_filename = temp()
        with open(temp_filename, "w") as f:
            chain.to_file(f)
        return temp_filename

    # check we have at least one prediction
    if len(predictions) == 0:
        raise InvalidParameterError(
            "Need at least one predicted structure."
        )

    # determine positions in predicted structure from first model
    pred_pos, _ = _determine_pos(predictions[0])

    # collect results of all comparisons here
    full_result = pd.DataFrame()
    single_results = {}

    for exp_file in experiments:
        # determine the number of positions to normalize the
        # TM score over (either the full experiment, or only positions
        # that were modelled and are also present in the experiment)
        exp_pos, exp_chain = _determine_pos(exp_file)

        # remove alternative atom locations
        exp_file_cleaned = _eliminate_altloc(exp_chain)

        # compute set of positions present in both prediction and experiment
        joint_pos = set(exp_pos).intersection(pred_pos)

        if norm_by_intersection:
            normalization_length = len(joint_pos)
        else:
            normalization_length = len(exp_pos)

        # run comparison
        comp = run_maxcluster_compare(
            predictions, exp_file_cleaned,
            normalization_length=normalization_length,
            distance_cutoff=distance_cutoff, binary=binary
        )

        # store lengths of experiment, prediction,
        # and what was used for computing TM scores
        comp.loc[:, "filename_experimental"] = exp_file
        comp.loc[:, "L_experiment"] = len(exp_pos)
        comp.loc[:, "L_prediction"] = len(pred_pos)
        comp.loc[:, "L_joint"] = len(joint_pos)
        comp.loc[:, "L_normalization"] = normalization_length

        comp = comp.sort_values("tm", ascending=False)
        single_results[exp_file] = comp

        full_result = full_result.append(comp)

    return full_result, single_results
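A minimal usage sketch of compare_models_maxcluster; the file paths are hypothetical and a working maxcluster binary on the PATH is assumed:

full_result, single_results = compare_models_maxcluster(
    experiments=["experimental_structure.pdb"],
    predictions=["model_1.pdb", "model_2.pdb"],
    norm_by_intersection=True,
    distance_cutoff=None,     # let maxcluster auto-calibrate
    binary="maxcluster",
)

# best-scoring prediction per experimental structure (tables are sorted by TM score)
best_models = {exp: table.iloc[0] for exp, table in single_results.items()}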
Example #26
0
def read_psipred_prediction(filename, first_index=1):
    """
    Read a psipred secondary structure prediction file
    in horizontal or vertical format (auto-detected).

    Parameters
    ----------
    filename : str
        Path to prediction output file
    first_index : int, optional (default: 1)
        Index of first position in predicted sequence

    Returns
    -------
    pred : pandas.DataFrame
        Table containing secondary structure prediction,
        with the following columns:

        * i: position

        * A_i: amino acid

        * sec_struct_3state: prediction (H, E, C)

        If reading vformat, also contains columns
        for the individual (score_coil/helix/strand)

        If reading hformat, also contains confidence
        score between 1 and 9 (sec_struct_conf)
    """
    # detect file format
    file_format = None
    with open(filename) as f:
        for line in f:
            if line.startswith("# PSIPRED HFORMAT"):
                file_format = "hformat"
            elif line.startswith("# PSIPRED VFORMAT"):
                file_format = "vformat"

    if file_format == "vformat":
        # read in output file
        pred = pd.read_csv(
            filename,
            skip_blank_lines=True,
            comment="#",
            delim_whitespace=True,
            names=[
                "i", "A_i", "sec_struct_3state", "score_coil", "score_helix",
                "score_strand"
            ],
        )
    elif file_format == "hformat":
        content = defaultdict(str)
        with open(filename) as f:
            # go through file and assemble Conf, Pred, and AA lines
            # into single strings
            for line in f:
                line = line.rstrip().replace(" ", "")
                if ":" in line:
                    key, _, value = line.partition(":")
                    content[key] += value

        pred = pd.DataFrame({
            "A_i": list(content["AA"]),
            "sec_struct_3state": list(content["Pred"]),
            "sec_struct_conf": list(map(int, content["Conf"])),
        })
        pred.loc[:, "i"] = list(range(1, len(pred) + 1))
    else:
        raise InvalidParameterError(
            "Input file is not a valid psipred prediciton file")

    # shift indices if first_index != 1
    pred.loc[:, "i"] += (first_index - 1)

    return pred
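A minimal usage sketch; the psipred output path is hypothetical:

pred = read_psipred_prediction("query.horiz", first_index=20)
# columns: i (renumbered to start at 20), A_i, sec_struct_3state, plus either
# score_coil/score_helix/score_strand (vformat) or sec_struct_conf (hformat)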
Example #27
0
def existing(**kwargs):
    """
    Protocol:

    Use external sequence alignment and extract all relevant
    information from there (e.g. sequence, region, etc.),
    then apply gap & fragment filtering as usual

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * sequence_file
        * first_index
        * target_sequence_file
        * annotation_file (None)
        * frequencies_file
        * identities_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "input_alignment", "sequence_id", "first_index",
        "extract_annotation"
    ])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this file is starting point of pipeline;
    # check if input alignment actually exists
    input_alignment = kwargs["input_alignment"]
    verify_resources("Input alignment does not exist", input_alignment)

    # first try to autodetect format of alignment
    with open(input_alignment) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment))

    with open(input_alignment) as f:
        ali_raw = Alignment.from_file(f, format)

    # save annotation in sequence headers (species etc.)
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        from_anno_line = (format == "stockholm")
        annotation = extract_header_annotation(ali_raw,
                                               from_annotation=from_anno_line)
        annotation.to_csv(annotation_file, index=False)

    # Target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # First, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id))

    # identify what columns (non-gap) to keep for focus
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array([
        c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq
    ])

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given.")

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)

    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # apply sequence identity and fragment filters,
    # and gap threshold
    mod_outcfg, ali = modify_alignment(focus_ali, focus_index, id_,
                                       region_start, **kwargs)

    # generate output configuration of protocol
    outcfg = {
        **mod_outcfg,
        "sequence_id": sequence_id,
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "target_sequence_file": target_sequence_file,
        "focus_sequence": header,
        "focus_mode": True,
    }

    if annotation_file is not None:
        outcfg["annotation_file"] = annotation_file

    # dump config to YAML file for debugging/logging
    write_config_file(prefix + ".align_existing.outcfg", outcfg)

    # return results of protocol
    return outcfg
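A sketch of how this protocol might be invoked; paths and identifiers are hypothetical, and modify_alignment may require further filtering parameters from the job config:

outcfg = existing(
    prefix="output/align_example",        # hypothetical job prefix
    input_alignment="input/family.sto",   # externally computed alignment
    sequence_id="QUERY_SEQUENCE",
    first_index=1,
    extract_annotation=True,
)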
Example #28
0
def standard(**kwargs):
    """
    Protocol:
    Predict 3D structure from evolutionary couplings

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * folded_structure_files
    """
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query("abs(i - j) > {}".format(
        kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise get from EC values (could be misleading if
            # EC list is truncated, so only second option)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]
    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs,
                     "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of plots to make
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (
                ecs_fold.iloc[:c],
                "_{}".format(c)
            )
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory, after folding
    # final models will be moved to main folding dir. Depending
    # on cleanup setting, the aux directory will be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    results = pool.starmap(folder, folding_runs)

    # make double sure that the pool is cleaned up,
    # or SIGTERM upon exit will interfere with
    # interrupt signal interception
    pool.close()
    pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for name, file_path in folded_files.items():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # map back to original key from remapped_pdb_files as a key for this list
            ind_comp_files[ind_file] = experimental_files[filename]
            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
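A short illustration of how the fold_lowest_count / fold_highest_count / fold_increase settings above translate into the per-run EC counts; the numbers are hypothetical:

from math import ceil

num_sites = 150                  # hypothetical number of coupled positions
flc, fhc, fi = 0.5, 1.3, 0.05    # fractions of num_sites, as allowed by the config

def _discrete_count(x):
    # fractions are scaled by the number of sites, integer inputs are used as-is
    if isinstance(x, float):
        x = ceil(x * num_sites)
    return int(x)

lowest, highest, step = (_discrete_count(x) for x in (flc, fhc, fi))
ec_counts = list(range(lowest, highest + 1, step))
# [75, 83, 91, ..., 195] -> one folding run per EC count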
Example #29
0
def complex(**kwargs):
    """
    Protocol:
    Compare ECs for a complex to
    3D structure

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * ec_file_compared_all
        * ec_file_compared_all_longrange
        * pdb_structure_hits
        * distmap_monomer
        * distmap_multimer
        * contact_map_files
        * remapped_pdb_files
    """
    check_required(kwargs, [
        "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir",
        "atom_filter", "first_compare_multimer", "second_compare_multimer",
        "distance_cutoff", "first_sequence_id", "second_sequence_id",
        "first_sequence_file", "second_sequence_file", "first_segments",
        "second_segments", "first_target_sequence_file",
        "second_target_sequence_file", "scale_sizes"
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        # initialize output EC files
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file":
        prefix + "_CouplingScoresCompared_longrange.csv",
        "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv",

        # initialize output inter distancemap files
        "distmap_inter": prefix + "_distmap_inter",
        "inter_contacts_file": prefix + "_inter_contacts_file"
    }

    # Add PDB comparison files for first and second monomer
    for monomer_prefix in ["first", "second"]:
        outcfg = {
            **outcfg,
            monomer_prefix + "_pdb_structure_hits_file":
            "{}_{}_structure_hits.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_pdb_structure_hits_unfiltered_file":
            "{}_{}_structure_hits_unfitered.csv".format(
                prefix, monomer_prefix),
            monomer_prefix + "_distmap_monomer":
            "{}_{}_distance_map_monomer".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_multimer":
            "{}_{}_distance_map_multimer".format(prefix, monomer_prefix),
        }

    # make sure EC file exists
    verify_resources("EC file does not exist", kwargs["ec_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much for average user)
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    # store auxiliary files here (too much for average user)
    first_aux_prefix = insert_dir(aux_prefix,
                                  "first_monomer",
                                  rootname_subdir=False)
    create_prefix_folders(first_aux_prefix)

    # store auxiliary files here (too much for average user)
    second_aux_prefix = insert_dir(aux_prefix,
                                   "second_monomer",
                                   rootname_subdir=False)
    create_prefix_folders(second_aux_prefix)

    # Step 1: Identify 3D structures for comparison
    def _identify_monomer_structures(name_prefix, outcfg, aux_prefix):
        # create a dictionary with kwargs for just the current monomer
        # remove the "prefix" kwargs so that we can replace with the
        # aux prefix when calling _identify_structures
        # only replace first occurrence of name_prefix
        monomer_kwargs = {
            k.replace(name_prefix + "_", "", 1): v
            for k, v in kwargs.items() if "prefix" not in k
        }

        # this field needs to be set explicitly else it gets overwritten by concatenated file
        monomer_kwargs["alignment_file"] = kwargs[name_prefix +
                                                  "_alignment_file"]
        monomer_kwargs["raw_focus_alignment_file"] = kwargs[
            name_prefix + "_raw_focus_alignment_file"]

        # identify structures for that monomer
        sifts_map, sifts_map_full = _identify_structures(**monomer_kwargs,
                                                         prefix=aux_prefix)

        # save selected PDB hits
        sifts_map.hits.to_csv(outcfg[name_prefix + "_pdb_structure_hits_file"],
                              index=False)

        # also save full list of hits
        sifts_map_full.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"],
            index=False)
        return outcfg, sifts_map

    outcfg, first_sifts_map = _identify_monomer_structures(
        "first", outcfg, first_aux_prefix)
    outcfg, second_sifts_map = _identify_monomer_structures(
        "second", outcfg, second_aux_prefix)

    # get the segment names from the kwargs
    segment_list = kwargs["segments"]

    # Make sure user provided exactly two segments
    if len(segment_list) != 2:
        raise InvalidParameterError(
            "Compare stage for protein complexes requires exactly two segments"
        )

    first_segment_name = kwargs["segments"][0][0]
    second_segment_name = kwargs["segments"][1][0]

    # Step 2: Compute distance maps
    def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name):

        # prepare a sequence map to remap the structures we have found
        verify_resources("Target sequence file does not exist",
                         kwargs[name_prefix + "_target_sequence_file"])

        # create target sequence map for remapping structure
        with open(kwargs[name_prefix + "_target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        # parse region from header and build position -> residue map
        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # compute distance maps and save
        # (but only if we found some structure)
        if len(sifts_map.hits) > 0:
            d_intra = intra_dists(sifts_map,
                                  structures,
                                  atom_filter=kwargs["atom_filter"],
                                  output_prefix=aux_prefix + "_" +
                                  name_prefix + "_distmap_intra")
            d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"])

            # save contacts to separate file
            outcfg[
                name_prefix +
                "_monomer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_monomer.csv"
            d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg[name_prefix + "_monomer_contacts_file"], index=False)

            # compute multimer distances, if requested;
            # note that d_multimer can be None if there
            # are no structures with multiple chains
            if kwargs[name_prefix + "_compare_multimer"]:
                d_multimer = multimer_dists(sifts_map,
                                            structures,
                                            atom_filter=kwargs["atom_filter"],
                                            output_prefix=aux_prefix + "_" +
                                            name_prefix + "_distmap_multimer")
            else:
                d_multimer = None

            # if we have a multimer contact map, save it
            if d_multimer is not None:
                d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"])
                outcfg[
                    name_prefix +
                    "_multimer_contacts_file"] = prefix + name_prefix + "_contacts_multimer.csv"

                # save contacts to separate file
                d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                    outcfg[name_prefix + "_multimer_contacts_file"],
                    index=False)
            else:
                outcfg[name_prefix + "_distmap_multimer"] = None

            # create remapped structures (e.g. for
            # later comparison of folding results)
            # remap structures, swap mapping index and filename in
            # dictionary so we have a list of files in the dict keys
            outcfg[name_prefix + "_remapped_pdb_files"] = {
                filename: mapping_index
                for mapping_index, filename in remap_chains(
                    sifts_map,
                    aux_prefix,
                    seqmap,
                    chain_name=chain_name,
                    raise_missing=kwargs["raise_missing"]).items()
            }

        else:
            # if no structures, cannot compute distance maps
            d_intra = None
            d_multimer = None
            outcfg[name_prefix + "_distmap_monomer"] = None
            outcfg[name_prefix + "_distmap_multimer"] = None
            outcfg[name_prefix + "remapped_pdb_files"] = None

        return d_intra, d_multimer, seqmap

    # load all structures for both monomers
    all_structures = set(first_sifts_map.hits.pdb_id).union(
        set(second_sifts_map.hits.pdb_id))
    structures = load_structures(all_structures,
                                 kwargs["pdb_mmtf_dir"],
                                 raise_missing=False)

    d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps(
        first_sifts_map, "first", "A")
    d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps(
        second_sifts_map, "second", "B")

    # compute inter distance map if sifts map for each monomer exists
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        d_inter = inter_dists(first_sifts_map,
                              second_sifts_map,
                              raise_missing=kwargs["raise_missing"])
        # if there were overlapping PDBs, save the results
        if d_inter is not None:
            d_inter.to_file(outcfg["distmap_inter"])

            # save contacts to separate file
            d_inter.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg["inter_contacts_file"], index=False)

    else:
        outcfg["inter_contacts_file"] = None
        d_inter = None

    # Step 3: Compare ECs to distance maps
    ec_table = pd.read_csv(kwargs["ec_file"])

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:

        # compare ECs only if we have an intra distance map
        # for at least one monomer - inter can't exist unless
        # we have both monomers
        if (d_intra_i is not None) or (d_intra_j is not None):
            # compare distances individually for each segment pair
            ecs_intra_i = ec_table.query(
                "segment_i == segment_j == @first_segment_name")
            if d_intra_i is not None:
                ecs_intra_i_compared = coupling_scores_compared(
                    ecs_intra_i,
                    d_intra_i,
                    d_multimer_i,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist)
            else:
                # If no distance map, the distance is saved as np.nan
                ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan)

            ecs_intra_j = ec_table.query(
                "segment_i == segment_j == @second_segment_name")
            if d_intra_j is not None:
                ecs_intra_j_compared = coupling_scores_compared(
                    ecs_intra_j,
                    d_intra_j,
                    d_multimer_j,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist)
            else:
                ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan)

            ecs_inter = ec_table.query("segment_i != segment_j")
            if d_inter is not None:
                ecs_inter_compared = coupling_scores_compared(
                    ecs_inter,
                    d_inter,
                    dist_map_multimer=None,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=None  # does not apply for inter-protein ECs
                )
            else:
                ecs_inter_compared = ecs_inter.assign(dist=np.nan)

            # combine the tables
            ec_table_compared = pd.concat([
                ecs_inter_compared, ecs_intra_i_compared, ecs_intra_j_compared
            ])

            # rename the precision column to "segmentwise_precision"
            # because we calculated precision for each segment independently
            ec_table_compared = ec_table_compared.rename(
                columns={"precision": "segmentwise_precision"})
            # TODO: change "cn" to "score" eventually
            ec_table_compared = ec_table_compared.sort_values("cn",
                                                              ascending=False)

            # add the total precision
            # TODO: implement different cutoffs for intra vs inter contacts
            ec_table_compared = add_precision(
                ec_table_compared, dist_cutoff=kwargs["distance_cutoff"])

            # save to file
            # all ecs
            ec_table_compared.to_csv(outcfg[out_file])

            # save the inter ECs to a file
            ecs_inter_compared.to_csv(outcfg["ec_compared_inter_file"])

    # create the inter-ecs line drawing script
    if outcfg["ec_compared_inter_file"] is not None and kwargs[
            "plot_highest_count"] is not None:
        inter_ecs = ec_table.query("segment_i != segment_j")

        outcfg[
            "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml"

        pairs.ec_lines_pymol_script(
            inter_ecs.iloc[:kwargs["plot_highest_count"], :],
            outcfg["ec_lines_compared_pml_file"],
            distance_cutoff=kwargs["distance_cutoff"],
            chain={
                first_segment_name: "A",
                second_segment_name: "B"
            })

    # Remap the complex crystal structures, if available
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        outcfg["complex_remapped_pdb_files"] = {
            filename: mapping_index
            for mapping_index, filename in remap_complex_chains(
                first_sifts_map,
                second_sifts_map,
                seqmap_i,
                seqmap_j,
                output_prefix=aux_prefix,
                raise_missing=kwargs["raise_missing"]).items()
        }

    # Step 4: Make contact map plots
    # if no structures available, defaults to EC-only plot
    outcfg["contact_map_files"] = _make_complex_contact_maps(
        ec_table, d_intra_i, d_multimer_i, d_intra_j, d_multimer_j, d_inter,
        first_segment_name, second_segment_name, **kwargs)

    return outcfg
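
A minimal, self-contained sketch of the split/compare/concatenate pattern used above: ECs are partitioned by segment pair with DataFrame.query, each partition gets a dist column (np.nan where no structure is available), and the partitions are merged back into a single table ranked by coupling score. The column names follow the snippet above, but the toy values and the hard-coded distance are illustrative only and stand in for real pipeline output and for coupling_scores_compared.

import numpy as np
import pandas as pd

# toy EC table with the column names used above (values are made up)
ec_table = pd.DataFrame({
    "segment_i": ["A_1", "A_1", "A_1", "B_1"],
    "segment_j": ["A_1", "A_1", "B_1", "B_1"],
    "i":  [3, 5, 9, 2],
    "j":  [17, 40, 33, 12],
    "cn": [1.2, 0.9, 0.7, 1.5],
})

# intra ECs of the first segment, inter ECs between the two segments
ecs_intra_i = ec_table.query("segment_i == segment_j == 'A_1'")
ecs_inter = ec_table.query("segment_i != segment_j")

# no structure available for the intra part -> distance recorded as np.nan
ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan)

# pretend the single inter EC was compared against a complex structure
ecs_inter_compared = ecs_inter.assign(dist=[4.9])

# merge back into one table, ranked by coupling score
combined = pd.concat(
    [ecs_inter_compared, ecs_intra_i_compared]
).sort_values("cn", ascending=False)

print(combined)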
Example #30
def secondary_structure(**kwargs):
    """
    Predict or load secondary structure for an
    input sequence

    Parameters
    ----------
    Mandatory kwargs arguments:
        See the list passed to check_required in the code below

    Returns
    -------
    residues : pandas.DataFrame
        Table with sequence and secondary structure
        in columns i, A_i and sec_struct_3state
    """
    check_required(
        kwargs,
        [
            "prefix", "target_sequence_file",
            "segments", "sec_struct_method",
            "sec_struct_file", "psipred",
        ]
    )

    prefix = kwargs["prefix"]
    create_prefix_folders(prefix)

    secstruct_file = kwargs["sec_struct_file"]
    if secstruct_file is not None:
        verify_resources(
            "Secondary structure prediction file does not exist/is empty",
            secstruct_file
        )
        residues = pd.read_csv(secstruct_file)
    else:
        # make sure target sequence file is there so we can
        # predict secondary structure
        target_seq_file = kwargs["target_sequence_file"]
        verify_resources(
            "Sequence file does not exist/is empty", target_seq_file
        )

        # we need to figure out what the index of the first residue
        # in the target sequence is; obtain first index from segment
        # information if possible
        if kwargs["segments"] is not None:
            s = Segment.from_list(kwargs["segments"][0])
            first_index = s.region_start
        else:
            # otherwise try to get it from sequence file
            first_index = None

            with open(target_seq_file) as f:
                header, _ = next(read_fasta(f))
                if header is not None:
                    _, first_index, _ = parse_header(header)
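                    # e.g. a header formatted as "Q9XYZ0/20-180" (id/start-end, the
                    # format the error below asks for) is expected to give
                    # first_index == 20; the identifier here is illustrative only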

                # if we cannot identify first index from header,
                # do not make guesses but fail
                if first_index is None:
                    raise InvalidParameterError(
                        "Could not unambiguously identify sequence range from "
                        "FASTA header, needs to specified as id/start-end: {}".format(
                            header
                        )
                    )

        # finally, run secondary structure prediction
        if kwargs["sec_struct_method"] == "psipred":
            # store psipred output in a separate directory
            output_dir = path.join(path.dirname(prefix), "psipred")

            # run psipred
            ss2_file, horiz_file = run_psipred(
                target_seq_file, output_dir, binary=kwargs["psipred"]
            )

            # parse output, renumber to first index
            residues = read_psipred_prediction(
                horiz_file, first_index=first_index
            )
        else:
            raise InvalidParameterError(
                "Secondary structure prediction method not implemented: "
                "{}. Valid choices: psipred".format(kwargs["sec_struct_method"])
            )

    # return predicted table
    return residues
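
A hypothetical invocation sketch for the loader branch above, assuming a precomputed secondary-structure table is available so that PSIPRED does not have to be run. All file paths are placeholders; per the docstring, the loaded CSV is expected to provide the columns i, A_i and sec_struct_3state.

# all paths below are placeholders (assumed layout, not real pipeline output)
residues = secondary_structure(
    prefix="output/myjob",
    target_sequence_file="output/myjob_target.fasta",
    segments=None,
    sec_struct_method="psipred",
    sec_struct_file="output/myjob_secstruct.csv",  # precomputed table -> loaded, not predicted
    psipred=None,  # path to PSIPRED binary; unused when sec_struct_file is given
)

print(residues.head())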