def __init__(self, **kwargs):
    """
    Create new SQL-based tracker. For now, this tracker will ignore
    file_list and store all file paths in the database except for
    those in delete_list.

    Parameters
    ----------
    connection_string : str
        SQLAlchemy database connection URI. Must include database name,
        and username/password if authentication is used.
    job_id : str
        Unique identifier of the job that should be tracked
    prefix : str
        Prefix of pipeline job
    pipeline : str
        Name of pipeline that is running
    file_list : list(str)
        List of file item keys from outconfig that should be stored in
        database. For now, this parameter has no effect and all file
        paths will be stored in database.
    delete_list : list(str)
        List of file item keys from outconfig that will be deleted after
        run is finished. These files cannot be stored as paths to the
        pipeline result in the output.
    config : dict(str)
        Entire configuration dictionary of job
    retry_max_number : int, optional (default: None)
        Maximum number of attempts to perform database queries / updates.
        If None, will try forever.
    retry_wait : int, optional (default: None)
        Time in seconds between retries to connect to database
    """
    super().__init__(**kwargs)

    # for SQL tracker, job ID may not be longer than 255 chars
    # to not interfere with older SQL DBs
    if len(self.job_id) > 255:
        raise InvalidParameterError(
            "Length of job_id for SQL tracker may not exceed "
            "255 characters for database compatibility reasons"
        )

    # create SQLAlchemy engine and session maker to
    # instantiate later sessions
    self._engine = create_engine(self.connection_string)
    self._Session = sessionmaker(bind=self._engine)

    # make sure all tables are there in database
    Base.metadata.create_all(bind=self._engine)
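# Minimal standalone sketch of the engine/sessionmaker/create_all pattern used
# in the tracker __init__ above, assuming SQLAlchemy 1.4+. The "jobs" table,
# its columns, and the SQLite file name are hypothetical stand-ins, not the
# actual schema defined in the package.
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.orm import sessionmaker, declarative_base

Base = declarative_base()

class Job(Base):
    __tablename__ = "jobs"
    id = Column(Integer, primary_key=True)
    job_id = Column(String(255), unique=True)  # mirrors the 255-char limit checked above
    status = Column(String(32))

# file-based SQLite database; any SQLAlchemy connection URI works here
engine = create_engine("sqlite:///tracker_example.db")
Session = sessionmaker(bind=engine)

# create all tables defined on Base (no-op if they already exist)
Base.metadata.create_all(bind=engine)

with Session() as session:
    session.add(Job(job_id="example_job", status="pending"))
    session.commit()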
def run(**kwargs): """ Run inference protocol to calculate ECs from input sequence alignment. Parameters ---------- Mandatory kwargs arguments: protocol: EC protocol to run prefix: Output prefix for all generated files Returns ------- outcfg : dict Output configuration of couplings stage Dictionary with results in following fields: (in brackets: not mandatory) ec_file effective_sequences [enrichment_file] focus_mode focus_sequence model_file num_sequences num_sites raw_ec_file region_start segments """ check_required(kwargs, ["protocol"]) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: " + "{}. Valid protocols are: {}".format( kwargs["protocol"], ", ".join(PROTOCOLS.keys()) ) ) return PROTOCOLS[kwargs["protocol"]](**kwargs)
def run(**kwargs): """ Run alignment concatenation protocol Parameters ---------- Mandatory kwargs arguments: protocol: concatenation protocol to run prefix: Output prefix for all generated files Returns ------- outcfg : dict Output configuration of concatenation stage Dictionary with results in following fields: (in brackets: not mandatory) alignment_file raw_alignment_file focus_mode focus_sequence segments frequencies_file identities_file num_sequences num_sites raw_focus_alignment_file statistics_file """ check_required(kwargs, ["protocol"]) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: " + "{}. Valid protocols are: {}".format( kwargs["protocol"], ", ".join(PROTOCOLS.keys()) ) ) return PROTOCOLS[kwargs["protocol"]](**kwargs)
def run(**kwargs):
    """
    Run alignment protocol to generate multiple sequence
    alignment from input sequence.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: Alignment protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of alignment stage.
        Dictionary with results of stage in following fields
        (in brackets - not returned by all protocols):

        * alignment_file
        * [raw_alignment_file]
        * statistics_file
        * target_sequence_file
        * sequence_file
        * [annotation_file]
        * frequencies_file
        * identities_file
        * [hittable_file]
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, ["protocol"])

    if kwargs["protocol"] not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: " +
            "{}. Valid protocols are: {}".format(
                kwargs["protocol"], ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[kwargs["protocol"]](**kwargs)
def run(**kwargs): """ Run alignment concatenation protocol Parameters ---------- Mandatory kwargs arguments: protocol: concatenation protocol to run prefix: Output prefix for all generated files Returns ------- outcfg : dict Output configuration of concatenation stage Dictionary with results in following fields (in brackets: not mandatory): .. todo:: to be finalized after implementing protocols * alignment_file * focus_mode * focus_sequence * segments * num_sites * num_sequences """ check_required(kwargs, ["protocol"]) if kwargs["protocol"] not in PROTOCOLS: raise InvalidParameterError( "Invalid protocol selection: " + "{}. Valid protocols are: {}".format(kwargs["protocol"], ", ".join( PROTOCOLS.keys()))) return PROTOCOLS[kwargs["protocol"]](**kwargs)
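# Self-contained sketch of the registry/dispatch pattern shared by the run()
# entry points above. The protocol names, their bodies, and the use of
# ValueError (standing in for InvalidParameterError) are illustrative only,
# not the protocols actually registered in the package.
def standard(**kwargs):
    return {"prefix": kwargs["prefix"], "protocol_used": "standard"}

def existing(**kwargs):
    return {"prefix": kwargs["prefix"], "protocol_used": "existing"}

# registry mapping protocol name -> implementation
PROTOCOLS = {
    "standard": standard,
    "existing": existing,
}

def run(**kwargs):
    if kwargs.get("protocol") not in PROTOCOLS:
        raise ValueError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                kwargs.get("protocol"), ", ".join(PROTOCOLS)
            )
        )
    return PROTOCOLS[kwargs["protocol"]](**kwargs)

print(run(protocol="standard", prefix="out/test"))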
def execute(**config): """ Execute a pipeline configuration Parameters ---------- **config Input configuration for pipeline (see pipeline config files for example of how this should look like) Returns ------- global_state : dict Global output state of pipeline """ check_required(config, ["pipeline", "stages", "global"]) # check if valid pipeline was selected if config["pipeline"] not in PIPELINES: raise InvalidParameterError("Not a valid pipeline selection. " "Valid choices are:\n{}".format(", ".join( PIPELINES.keys()))) stages = config["stages"] if stages is None: raise InvalidParameterError("No stages defined, need at least one.") # get definition of selected pipeline pipeline = PIPELINES[config["pipeline"]] prefix = config["global"]["prefix"] # make sure output directory exists create_prefix_folders(prefix) # this is the global state of results as # we move through different stages of # the pipeline global_state = config["global"] # keep track of how many stages are still # to be run, so we can leave out stages at # the end of workflow below num_stages_to_run = len(stages) # get job tracker tracker = get_result_tracker(config) # set job status to running and also initalize global state tracker.update(status=EStatus.RUN, results=global_state) # iterate through individual stages for (stage, runner, key_prefix) in pipeline: # check if anything else is left to # run, otherwise skip if num_stages_to_run == 0: break # check if config for stage is there check_required(config, [stage]) # output files for stage into an individual folder stage_prefix = insert_dir(prefix, stage) create_prefix_folders(stage_prefix) # config files for input and output of stage stage_incfg = "{}_{}.incfg".format(stage_prefix, stage) stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage) # update current stage of job tracker.update(stage=stage) # check if stage should be executed if stage in stages: # global state inserted at end, overrides any # stage-specific settings (except for custom prefix) incfg = { **config["tools"], **config["databases"], **config[stage], **global_state, "prefix": stage_prefix } # save input of stage in config file write_config_file(stage_incfg, incfg) # run stage outcfg = runner(**incfg) # prefix output keys if this parameter is # given in stage configuration, to avoid # name clashes if same protocol run multiple times if key_prefix is not None: outcfg = {key_prefix + k: v for k, v in outcfg.items()} # save output of stage in config file write_config_file(stage_outcfg, outcfg) # one less stage to put through after we ran this... num_stages_to_run -= 1 else: # skip state by injecting state from previous run verify_resources( "Trying to skip, but output configuration " "for stage '{}' does not exist. 
Has it already " "been run?".format(stage, stage), stage_outcfg) # read output configuration outcfg = read_config_file(stage_outcfg) # verify all the output files are there outfiles = [ filepath for f, filepath in outcfg.items() if f.endswith("_file") and filepath is not None ] verify_resources( "Output files from stage '{}' " "missing".format(stage), *outfiles) # update global state with outputs of stage global_state = {**global_state, **outcfg} # update state in tracker accordingly tracker.update(results=outcfg) # create results archive archive_file = create_archive(config, global_state, prefix) # only store results archive if a result file was created if archive_file is not None: global_state["archive_file"] = archive_file # prepare update for tracker, but only store in last # go when job is set to done tracker_archive_update = {"archive_file": archive_file} else: tracker_archive_update = None # set job status to done and transfer archive if selected for syncing tracker.update(status=EStatus.DONE, results=tracker_archive_update) # delete selected output files if requested; # tracker does not need to update here since it won't # sync entries of delete list in the first place global_state = delete_outputs(config, global_state) # write final global state of pipeline write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state) return global_state
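# Toy illustration of two details of the stage loop in execute() above:
# prefixing a stage's output keys to avoid name clashes, and folding the
# stage output into the running global state. Keys and values are invented
# for the example.
stage_outcfg = {"alignment_file": "align/test.a2m", "num_sequences": 1240}
key_prefix = "first_"

# prefix output keys if requested in the stage configuration
prefixed = {key_prefix + k: v for k, v in stage_outcfg.items()}

global_state = {"prefix": "out/test"}
# later stages see earlier outputs through the merged global state
global_state = {**global_state, **prefixed}

print(global_state)
# {'prefix': 'out/test', 'first_alignment_file': 'align/test.a2m', 'first_num_sequences': 1240}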
def create_archive(config, outcfg, prefix): """ Create archive of files generated by pipeline Parameters ---------- config : dict-like Input configuration of job. Uses config["management"]["archive"] (list of key used to index outcfg) to determine which files should be added to archive outcfg : dict-like Output configuration of job prefix : str Prefix of job, will be used to define archive file path (prefix + archive type-specific extension) """ # allowed output archive formats ALLOWED_FORMATS = ["targz", "zip"] # determine selected output format, default is .tar.gz archive_format = config.get("management", {}).get("archive_format", "targz") # determine keys (corresponding to files) in # outcfg that should be stored archive_keys = config.get("management", {}).get("archive", None) # if no files selected for archiving, return immediately if archive_keys is None: return # check if selected format is valid if archive_format not in ALLOWED_FORMATS: raise InvalidParameterError( "Invalid format for output archive: {}. ".format(archive_format) + "Valid options are: " + ", ".join(ALLOWED_FORMATS)) # create explicit list of files that would go into archive and are valid files archive_files = [(file_path, file_key, idx) for (file_path, file_key, idx) in iterate_files(outcfg, subset=archive_keys) if valid_file(file_path)] # if there are no file, don't create archive if len(archive_files) == 0: return if archive_format == "targz": final_archive_file = prefix + ".tar.gz" with tarfile.open(final_archive_file, "w:gz") as tar: for (file_path, file_key, idx) in archive_files: tar.add(file_path) elif archive_format == "zip": final_archive_file = prefix + ".zip" with zipfile.ZipFile(final_archive_file, "w", zipfile.ZIP_DEFLATED) as zip_: for (file_path, file_key, idx) in archive_files: zip_.write(file_path) return final_archive_file
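# Standalone sketch of the targz/zip branches in create_archive above, using
# only the standard library. The file list and prefix are placeholders; the
# valid_file() filter is approximated with pathlib.
import tarfile
import zipfile
from pathlib import Path

archive_files = ["out/test_ECs.txt", "out/test_CouplingScores.csv"]
prefix = "out/test"
archive_format = "targz"  # or "zip"

# only keep files that actually exist, mirroring the valid_file() filter
existing = [f for f in archive_files if Path(f).is_file()]

if existing:
    if archive_format == "targz":
        final_archive_file = prefix + ".tar.gz"
        with tarfile.open(final_archive_file, "w:gz") as tar:
            for file_path in existing:
                tar.add(file_path)
    elif archive_format == "zip":
        final_archive_file = prefix + ".zip"
        with zipfile.ZipFile(final_archive_file, "w", zipfile.ZIP_DEFLATED) as zip_:
            for file_path in existing:
                zip_.write(file_path)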
def run_cns(inp_script=None, inp_file=None, log_file=None, binary="cns"):
    """
    Run CNSsolve 1.21 (without worrying about environment setup)

    Note that the user is responsible for verifying the output products
    of CNS, since their paths are determined by .inp scripts and hard to
    check automatically and in a general way.

    Either inp_script or inp_file has to be specified.

    Parameters
    ----------
    inp_script : str, optional (default: None)
        CNS ".inp" input script (actual commands, not a file path)
    inp_file : str, optional (default: None)
        Path to .inp input script file. Will override inp_script
        if also specified.
    log_file : str, optional (default: None)
        Save CNS stdout output to this file
    binary : str, optional (default: "cns")
        Absolute path of CNS binary

    Raises
    ------
    ExternalToolError
        If call to CNS fails
    InvalidParameterError
        If no input script (file or string) is given
    """
    # make sure we have an absolute path
    binary = path.abspath(binary)

    # extract main installation directory
    cns_main_dir = binary
    for i in range(3):
        cns_main_dir = path.dirname(cns_main_dir)

    # create environment
    env = deepcopy(os.environ)
    library_dir = path.join(cns_main_dir, "libraries")
    module_dir = path.join(cns_main_dir, "modules")
    env["CNS_SOLVE"] = cns_main_dir
    env["CNS_LIB"] = library_dir
    env["CNS_MODULE"] = module_dir
    env["CNS_HELPLIB"] = path.join(cns_main_dir, "helplib")

    for var, subdir in [
        ("CNS_TOPPAR", "toppar"),
        ("CNS_CONFDB", "confdb"),
        ("CNS_XTALLIB", "xtal"),
        ("CNS_NMRLIB", "nmr"),
        ("CNS_XRAYLIB", "xray"),
    ]:
        env[var] = path.join(library_dir, subdir)

    for var, subdir in [
        ("CNS_XTALMODULE", "xtal"),
        ("CNS_NMRMODULE", "nmr"),
    ]:
        env[var] = path.join(module_dir, subdir)

    if inp_script is None and inp_file is None:
        raise InvalidParameterError(
            "Must specify either inp_script or inp_file"
        )

    # read input script, this is fed into CNS using stdin
    if inp_file is not None:
        with open(inp_file) as f:
            inp_script = "".join(f.readlines())

    # run and store output
    return_code, stdout, stderr = run(binary, stdin=inp_script)

    # write stdout output to log file
    if log_file is not None:
        with open(log_file, "w") as f:
            f.write(stdout)
def run_cns_13(inp_script=None, inp_file=None, log_file=None, source_script=None, binary="cns"): """ Run CNSsolve 1.3 Note that the user is responsible for verifying the output products of CNS, since their paths are determined by .inp scripts and hard to check automatically and in a general way. Either input_script or input_file has to be specified. Parameters ---------- inp_script : str, optional (default: None) CNS ".inp" input script (actual commands, not file) inp_file : str, optional (default: None) Path to .inp input script file. Will override inp_script if also specified. log_file : str, optional (default: None) Save CNS stdout output to this file source_script : str, optional (default: None) Script to set CNS environment variables. This should typically point to .cns_solve_env_sh in the CNS installation main directory (the shell script itself needs to be edited to contain the path of the installation) binary : str, optional (default: "cns") Name of CNS binary Raises ------ ExternalToolError If call to CNS fails InvalidParameterError If no input script (file or string) given """ # usually need to source script to set up environment for CNS if source_script is not None: cmd = "source {};".format(source_script) else: cmd = "" cmd += binary if inp_script is None and inp_file is None: raise InvalidParameterError( "Must specify either input_script or input_file") # read input script, this is fed into CNS using stdin if inp_file is not None: with open(inp_file) as f: inp_script = "".join(f.readlines()) # run and store output return_code, stdout, stderr = run(cmd, stdin=inp_script, shell=True) # write stdout output to log file if log_file is not None: with open(log_file, "w") as f: f.write(stdout)
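# Standalone sketch of the "source environment script, then feed the .inp
# script via stdin" pattern from run_cns_13 above, using subprocess directly.
# The installation path and the trivial "stop" input are placeholders; bash
# is requested explicitly so that "source" is available.
import subprocess

source_script = "/opt/cns_solve_1.3/cns_solve_env_sh"  # hypothetical install path
binary = "cns"
inp_script = "stop\n"  # trivial CNS input that exits immediately

cmd = "source {}; {}".format(source_script, binary)

proc = subprocess.run(
    cmd,
    input=inp_script,        # goes to the process's stdin
    shell=True,              # needed so "source ...;" is interpreted by a shell
    executable="/bin/bash",  # use bash, since plain sh may not support "source"
    capture_output=True,
    text=True,
)

print(proc.returncode)
print(proc.stdout[:200])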
def get_result_tracker(config): """ Create result tracker from configuration Parameters ---------- config : dict Complete job configuration, including "global" and "management" sections. Returns ------- evcouplings.utils.tracker.ResultTracker Job tracker instance according to config """ # first make copy of config so tracker can't influence # job in any way by accident config = deepcopy(config) management = config.get("management", {}) tracker_type = management.get("tracker_type") # if no tracker selected, return NullTracker right away # and don't bother with all parameter setup below if tracker_type is None: return NullTracker() # connection string for database (or the like) connection_string = management.get("connection_string") # get unique job ID, job prefix and pipeline job_id = management.get("job_id", None) prefix = config.get("global", {}).get("prefix", None) pipeline = config.get("pipeline") # list of files that tracker should store file_list = management.get("tracker_file_list", None) # list of files that pipeline will delete delete_list = management.get("delete", []) # if we don't have these settings, cannot track job if connection_string is None: raise InvalidParameterError( "Must provide parameter 'connection_string' in management section " "of config when using a tracker.") if job_id is None: raise InvalidParameterError( "Must provide unique 'job_id' in management section " "of config when using a tracker.") # see if we have authentication information in the # environment variables (for careful people...) # Default is to authenticate using username/password # in URI env_tracker_username = environ.get(TRACKER_USERNAME_KEY) env_tracker_password = environ.get(TRACKER_PASSWORD_KEY) # substitute username/password into connection string # (will only have an effect if these are present) if connection_string is not None: connection_string = connection_string.format( username=env_tracker_username, password=env_tracker_password) # retry settings retry_max_number = management.get("tracker_max_retries", TRACKER_MAX_NUM_RETRIES) retry_wait = management.get("tracker_retry_wait", TRACKER_RETRY_WAIT) kwargs = { "connection_string": connection_string, "job_id": job_id, "prefix": prefix, "pipeline": pipeline, "file_list": file_list, "delete_list": delete_list, "config": config, "retry_max_number": retry_max_number, "retry_wait": retry_wait } # all fields that go into database itself rather than inside config # are extracted from config object by now; config param as such only serves # as record of the entire configuration and shouldn't be accessed inside # tracker to extract any sort of parametrization of the tracker if tracker_type == "mongodb": from evcouplings.utils.tracker.mongodb import MongoDBTracker return MongoDBTracker(**kwargs) elif tracker_type == "sql": from evcouplings.utils.tracker.sql import SQLTracker return SQLTracker(**kwargs) else: raise InvalidParameterError( "Not a valid job result tracker: '{}'. " "Valid options are: None, 'sql', 'mongodb'".format(tracker_type))
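# Small sketch of the credential substitution performed in get_result_tracker
# above: username/password are read from environment variables and substituted
# into the connection string if placeholders are present. The environment
# variable names here are illustrative placeholders for TRACKER_USERNAME_KEY /
# TRACKER_PASSWORD_KEY, and the connection string is made up.
import os

username = os.environ.get("EVCOUPLINGS_TRACKER_USERNAME")
password = os.environ.get("EVCOUPLINGS_TRACKER_PASSWORD")

# connection string from the "management" section of the job config;
# {username}/{password} are only filled in if the placeholders are present
connection_string = "mysql://{username}:{password}@localhost:3306/evcouplings_jobs"
connection_string = connection_string.format(username=username, password=password)
print(connection_string)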
def complex(**kwargs): """ Protocol: Run monomer alignment protocol and postprocess it for EVcomplex calculations Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the alignment protocol, and the following additional field: genome_location_file : path to file containing the genomic locations for CDs's corresponding to identifiers in the alignment. """ check_required(kwargs, [ "prefix", "alignment_protocol", "uniprot_to_embl_table", "ena_genome_location_table" ]) verify_resources("Uniprot to EMBL mapping table does not exist", kwargs["uniprot_to_embl_table"]) verify_resources("ENA genome location table does not exist", kwargs["ena_genome_location_table"]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # run the regular alignment protocol # (standard, existing, ...) alignment_protocol = kwargs["alignment_protocol"] if alignment_protocol not in PROTOCOLS: raise InvalidParameterError( "Invalid choice for alignment protocol: {}".format( alignment_protocol)) outcfg = PROTOCOLS[kwargs["alignment_protocol"]](**kwargs) # if the user selected the existing alignment protocol # they can supply an input annotation file # which overwrites the annotation file generated by the existing protocol if alignment_protocol == "existing": check_required(kwargs, ["override_annotation_file"]) if kwargs["override_annotation_file"] is not None: verify_resources("Override annotation file does not exist", kwargs["override_annotation_file"]) outcfg["annotation_file"] = prefix + "_annotation.csv" annotation_data = pd.read_csv(kwargs["override_annotation_file"]) annotation_data.to_csv(outcfg["annotation_file"]) # extract cds identifiers for alignment uniprot IDs cds_ids = extract_cds_ids(outcfg["alignment_file"], kwargs["uniprot_to_embl_table"]) # extract genome location information from ENA genome_location_filename = prefix + "_genome_location.csv" genome_location_table = extract_embl_annotation( cds_ids, kwargs["ena_genome_location_table"], genome_location_filename) genome_location_table = add_full_header(genome_location_table, outcfg["alignment_file"]) genome_location_table.to_csv(genome_location_filename) outcfg["genome_location_file"] = genome_location_filename # dump output config to YAML file for debugging/logging write_config_file(prefix + ".align_complex.outcfg", outcfg) return outcfg
def run_jobs(configs, global_config, overwrite=False, workdir=None, abort_on_error=True, environment=None): """ Submit config to pipeline Parameters ---------- configs : dict Configurations for individual subjobs global_config : dict Master configuration (if only one job, the contents of this dictionary will be equal to the single element of config_files) overwrite : bool, optional (default: False) If True, allows overwriting previous run of the same config, otherwise will fail if results from previous execution are present workdir : str, optional (default: None) Workdir in which to run job (will combine workdir and prefix in joint path) abort_on_error : bool, optional (default: True) Abort entire job submission if error occurs for one of the jobs by propagating RuntimeError environment : str, optional (default: None) Allow to pass value for environment parameter of submitter, will override environment.configuration from global_config (e.g., for setting environment variables like passwords) Returns ------- job_ids : dict Mapping from subjob prefix (keys in configs parameter) to identifier returned by submitter for each of the jobs that was *successfully* submitted (i.e. missing keys from configs param indicate these jobs could not be submitted). Raises ------ RuntimeError If error encountered during submission and abort_on_error is True """ cmd_base = environ.get("EVCOUPLINGS_RUNCFG_APP") or "evcouplings_runcfg" summ_base = environ.get( "EVCOUPLINGS_SUMMARIZE_APP") or "evcouplings_summarize" # determine output directory for config files prefix = global_config["global"]["prefix"] # integrate working directory into output prefix # if it is given; if prefix contains an absolute path, # this will override the workdir according to # implementation of path.join() if workdir is not None: out_prefix = path.join(workdir, prefix) else: out_prefix = prefix # save configuration file, make sure we do not overwrite previous run # if overwrite protection is activated # (but only if it is a valid configuration file with contents) cfg_filename = CONFIG_NAME.format(out_prefix) if not overwrite and valid_file(cfg_filename): raise InvalidParameterError( "Existing configuration file {} ".format(cfg_filename) + "indicates current prefix {} ".format(prefix) + "would overwrite existing results. Use --yolo " + "flag to deactivate overwrite protection (e.g. 
for " "restarting a job or running a different stage).") # make sure working directory exists create_prefix_folders(cfg_filename) # write global config file write_config_file(cfg_filename, global_config) # also write individual subjob configuration files # (we have to write these before submitting, since # the job summarizer needs the paths to all files) for subjob_prefix, subjob_cfg in configs.items(): # determine working dir for each subjob, since subjob # prefix may contain slashes leading to subfolder creation if workdir is not None: subjob_out_prefix = path.join(workdir, subjob_prefix) else: subjob_out_prefix = subjob_prefix subcfg_filename = CONFIG_NAME.format(subjob_out_prefix) # make sure output subfolder exists create_prefix_folders(subcfg_filename) # write subjob configuration file write_config_file(subcfg_filename, subjob_cfg) # now create list of subjob config files relative to working # directory (above, we allow to run submitted in arbitrary directory) config_files = [ CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs ] # create command for summarizer (needs to know all subjob config files) summ_cmd = "{} {} {} {}".format(summ_base, global_config["pipeline"], global_config["global"]["prefix"], " ".join(config_files)) # create submitter from global (pre-unrolling) configuration submitter = utils.SubmitterFactory(global_config["environment"]["engine"], db_path=out_prefix + "_job_database.txt") # collect individual submitted jobs here commands = [] # record subjob IDs returned by submitter for each job job_ids = {} # prepare individual jobs for submission for job, job_cfg in configs.items(): job_prefix = job_cfg["global"]["prefix"] job_cfg_file = CONFIG_NAME.format(job) # create submission command env = job_cfg["environment"] cmd = utils.Command( ["{} {}".format(cmd_base, job_cfg_file), summ_cmd], name=job_prefix, environment=environment or env["configuration"], workdir=workdir, resources={ utils.EResource.queue: env["queue"], utils.EResource.time: env["time"], utils.EResource.mem: env["memory"], utils.EResource.nodes: env["cores"], utils.EResource.out: job_prefix + "_stdout.log", utils.EResource.error: job_prefix + "_stderr.log", }) # store job for later dependency creation commands.append(cmd) tracker = get_result_tracker(job_cfg) try: # finally, submit job current_job_id = submitter.submit(cmd) # store run identifier returned by submitter # TODO: consider storing current_job_id using tracker right away job_ids[job] = current_job_id # set job status in database to pending tracker.update(status=EStatus.PEND) except RuntimeError as e: # set job as failed in database tracker.update(status=EStatus.FAIL, message=str(e)) # fail entire job submission if requested if abort_on_error: raise # submit final summarizer # (hold for now - summarizer is run after each subjob finishes) # wait for all runs to finish (but only if blocking) submitter.join() # return job identifiers return job_ids
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted
        into configuration file

    Returns
    -------
    dict
        Updated configuration
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file)
        )

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # if alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        m = re.search(r"(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123): {}".format(region)
            )

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]

        # check if we have a predefined sequence database;
        # if so, use it, otherwise interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are exclusively set
    if kwargs.get("bitscores", None) is not None and kwargs.get("evalues", None) is not None:
        raise InvalidParameterError(
            "Cannot specify bitscore and E-value threshold at the same time."
        )

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [(float(t) if "." in t else int(t)) for t in T]
        except ValueError:
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds)
            )

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
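# Standalone illustration of the threshold handling at the end of
# substitute_config: a comma-separated threshold list is parsed into numbers
# and expanded into one batch sub-job per value. The threshold values are
# example inputs only.
thresholds = "0.3, 0.5, 0.7"
bitscore = True

T = thresholds.replace(" ", "").split(",")
x_cast = [(float(t) if "." in t else int(t)) for t in T]

batch = {}
for t in x_cast:
    sub_prefix = ("_b" if bitscore else "_e") + str(t)
    batch[sub_prefix] = {
        "align": {"domain_threshold": t, "sequence_threshold": t}
    }

print(batch.keys())  # dict_keys(['_b0.3', '_b0.5', '_b0.7'])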
def run_jobs(configs, global_config, overwrite=False, workdir=None): """ Submit config to pipeline Parameters ---------- configs : dict Configurations for individual subjobs global_config : dict Master configuration (if only one job, the contents of this dictionary will be equal to the single element of config_files) """ python = executable pipeline_path = path.abspath(pipeline.__file__) summarize_path = path.abspath(summarize.__file__) cmd_base = "{} {}".format(python, pipeline_path) summ_base = "{} {}".format(python, summarize_path) # determine output directory for config files prefix = global_config["global"]["prefix"] # integrate working directory into output prefix # if it is given; if prefix contains an absolute path, # this will override the workdir according to # implementation of path.join() if workdir is not None: out_prefix = path.join(workdir, prefix) else: out_prefix = prefix # save configuration file, make sure we do not overwrite previous run # if overwrite protection is activated # (but only if it is a valid configuration file with contents) cfg_filename = CONFIG_NAME.format(out_prefix) if not overwrite and valid_file(cfg_filename): raise InvalidParameterError( "Existing configuration file {} ".format(cfg_filename) + "indicates current prefix {} ".format(prefix) + "would overwrite existing results. Use --yolo " + "flag to deactivate overwrite protection (e.g. for " "restarting a job or running a different stage)." ) # make sure working directory exists create_prefix_folders(cfg_filename) # write global config file write_config_file(cfg_filename, global_config) # also write individual subjob configuration files # (we have to write these before submitting, since # the job summarizer needs the paths to all files) for subjob_prefix, subjob_cfg in configs.items(): # determine working dir for each subjob, since subjob # prefix may contain slashes leading to subfolder creation if workdir is not None: subjob_out_prefix = path.join(workdir, subjob_prefix) else: subjob_out_prefix = subjob_prefix subcfg_filename = CONFIG_NAME.format(subjob_out_prefix) # make sure output subfolder exists create_prefix_folders(subcfg_filename) # write subjob configuration file write_config_file(subcfg_filename, subjob_cfg) # now create list of subjob config files relative to working # directory (above, we allow to run submitted in arbitrary directory) config_files = [ CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs ] # create command for summarizer (needs to know all subjob config files) summ_cmd = "{} {} {} {}".format( summ_base, global_config["pipeline"], global_config["global"]["prefix"], " ".join(config_files) ) # create submitter from global (pre-unrolling) configuration submitter = utils.SubmitterFactory( global_config["environment"]["engine"], db_path=out_prefix + "_job_database.txt" ) # collect individual submitted jobs here commands = [] # prepare individual jobs for submission for job, job_cfg in configs.items(): job_prefix = job_cfg["global"]["prefix"] job_cfg_file = CONFIG_NAME.format(job) # set job status in database to pending pipeline.update_job_status(job_cfg, status=database.EStatus.PEND) # create submission command env = job_cfg["environment"] cmd = utils.Command( [ "{} {}".format(cmd_base, job_cfg_file), summ_cmd ], name=job_prefix, environment=env["configuration"], workdir=workdir, resources={ utils.EResource.queue: env["queue"], utils.EResource.time: env["time"], utils.EResource.mem: env["memory"], utils.EResource.nodes: env["cores"], utils.EResource.out: job_prefix + 
"_stdout.log", utils.EResource.error: job_prefix + "_stderr.log", } ) # store job for later dependency creation commands.append(cmd) # finally, submit job submitter.submit(cmd) # submit final summarizer # (hold for now - summarizer is run after each subjob finishes) # wait for all runs to finish (but only if blocking) submitter.join()
def find_homologs(pdb_alignment_method="jackhmmer", **kwargs): """ Identify homologs using jackhmmer or hmmbuild/hmmsearch Parameters ---------- pdb_alignment_method : {"jackhmmer", "hmmsearch"}, optional (default: "jackhmmer") Sequence alignment method used for searching the PDB **kwargs Passed into jackhmmer / hmmbuild_and_search protocol (see documentation for available options) Returns ------- ali : evcouplings.align.Alignment Alignment of homologs of query sequence in sequence database hits : pandas.DataFrame Tabular representation of hits """ # load default configuration config = parse_config(HMMER_CONFIG) # update with overrides from kwargs config = { **config, **kwargs, } # create temporary output if no prefix is given if config["prefix"] is None: config["prefix"] = path.join(tempdir(), "compare") check_required(config, ["prefix"]) # run hmmsearch (possibly preceded by hmmbuild) if pdb_alignment_method == "hmmsearch": # set up config to run hmmbuild_and_search on the unfiltered alignment file updated_config = deepcopy(config) updated_config["alignment_file"] = config.get( "raw_focus_alignment_file") ar = hmmbuild_and_search(**updated_config) # For hmmbuild and search, we have to read the raw focus alignment file # to guarantee that the query sequence is present with open(ar["raw_focus_alignment_file"]) as a: ali = Alignment.from_file(a, "fasta") # run jackhmmer against sequence database # at this point we have already checked to ensure # that the input is either jackhmmer or hmmsearch elif pdb_alignment_method == "jackhmmer": ar = jackhmmer_search(**config) with open(ar["raw_alignment_file"]) as a: ali = Alignment.from_file(a, "stockholm") # write alignment as FASTA file for easier checking by hand, # if necessary with open(config["prefix"] + "_raw.fasta", "w") as f: ali.write(f) else: raise InvalidParameterError( "Invalid pdb_alignment_method selected. Valid options are: " + ", ".join(["jackhmmer", "hmmsearch"])) # read hmmer hittable and simplify hits = read_hmmer_domtbl(ar["hittable_file"]) hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map( lambda x: x.split("|")[1]) hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map( lambda x: x.split("|")[2]) hits = hits.rename( columns={ "domain_score": "bitscore", "domain_i_Evalue": "e_value", "ali_from": "alignment_start", "ali_to": "alignment_end", "hmm_from": "hmm_start", "hmm_to": "hmm_end", }) hits.loc[:, "alignment_start"] = pd.to_numeric( hits.alignment_start).astype(int) hits.loc[:, "alignment_end"] = pd.to_numeric(hits.alignment_end).astype(int) hits.loc[:, "alignment_id"] = (hits.target_name + "/" + hits.alignment_start.astype(str) + "-" + hits.alignment_end.astype(str)) hits = hits.loc[:, [ "alignment_id", "uniprot_ac", "uniprot_id", "alignment_start", "alignment_end", "bitscore", "e_value" ]] return ali, hits
def verify_prefix(verify_subdir=True, **config): """ Check if configuration contains a prefix, and that prefix is a valid directory we can write to on the filesystem Parameters ---------- verify_subdir : bool, optional (default: True) Check if we can create subdirectory containing full prefix. Set this to False for outer evcouplings app loop. **config Input configuration for pipeline Returns ------- prefix : str Verified prefix """ # check we have a prefix entry, otherwise all hope is lost... try: prefix = config["global"]["prefix"] except KeyError: raise InvalidParameterError( "Configuration does not include 'prefix' setting in " "'global' section" ) # make sure prefix is also specified if prefix is None: raise InvalidParameterError( "'prefix' must be specified and cannot be None" ) # verify that prefix is workable in terms # of filesystem try: # make prefix folder create_prefix_folders(prefix) # try if we can write in the folder with open(prefix + ".test__", "w") as f: pass # get rid of the file again os.remove(prefix + ".test__") if verify_subdir: # make sure we can create a subdirectory sub_prefix = insert_dir(prefix, "test__") create_prefix_folders(sub_prefix) # remove again os.rmdir(path.dirname(sub_prefix)) except OSError as e: raise InvalidParameterError( "Not a valid prefix: {}".format(prefix) ) from e return prefix
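# Standalone sketch of the filesystem checks performed by verify_prefix above:
# create the prefix folder, probe that it is writable, and clean up. The
# prefix value is a placeholder; pathlib stands in for create_prefix_folders.
import os
from pathlib import Path

prefix = "output/test_job/test"

# make sure the prefix folder exists (equivalent of create_prefix_folders)
Path(prefix).parent.mkdir(parents=True, exist_ok=True)

# check that we can write into the folder, then remove the probe file again
probe = prefix + ".test__"
with open(probe, "w"):
    pass
os.remove(probe)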
def mean_field(**kwargs): """ Protocol: Infer ECs from alignment using mean field direct coupling analysis. For now, mean field DCA can only be run in focus mode, gaps included. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "segments", "focus_mode", "focus_sequence", "theta", "pseudo_count", "alphabet", "min_sequence_distance", # "save_model", ] ) if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." ) prefix = kwargs["prefix"] # option to save model disabled """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists alignment_file = kwargs["alignment_file"] verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # determine alphabet # default is protein if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # read in a2m alignment with open(alignment_file) as f: input_alignment = Alignment.from_file( f, alphabet=alphabet, format="fasta" ) # init mean field direct coupling analysis mf_dca = MeanFieldDCA(input_alignment) # run mean field approximation model = mf_dca.fit( theta=kwargs["theta"], pseudo_count=kwargs["pseudo_count"] ) # write ECs to file model.to_raw_ec_file( outcfg["raw_ec_file"] ) # write model file if outcfg["model_file"] is not None: model.to_file( outcfg["model_file"], file_format="plmc_v2" ) # store useful information about model in outcfg outcfg.update({ "num_sites": model.L, "num_sequences": model.N_valid, "effective_sequences": float(round(model.N_eff, 1)), "region_start": int(model.index_list[0]), }) # read and sort ECs ecs = pd.read_csv( outcfg["raw_ec_file"], sep=" ", # for now, call the last two columns # "fn" and "cn" to prevent compare # stage from crashing names=["i", "A_i", "j", "A_j", "fn", "cn"] # names=["i", "A_i", "j", "A_j", "mi", "di"] ).sort_values( by="cn", ascending=False ) # write the sorted ECs table to csv file ecs.to_csv(outcfg["ec_file"], index=False) # also store longrange ECs as convenience output if kwargs["min_sequence_distance"] is not None: outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv" ecs_longrange = ecs.query( "abs(i - j) >= {}".format(kwargs["min_sequence_distance"]) ) ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False) # also create line-drawing script (for now, only for single segments) if segments is None or len(segments) == 1: outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml" L = outcfg["num_sites"] ec_lines_pymol_script( 
ecs_longrange.iloc[:L, :], outcfg["ec_lines_pml_file"], score_column="cn" # "di ) # compute EC enrichment (for now, for single segments # only since enrichment code cannot handle multiple segments) if segments is None or len(segments) == 1: outcfg["enrichment_file"] = prefix + "_enrichment.csv" ecs_enriched = pairs.enrichment(ecs, score="cn") # "di" ecs_enriched.to_csv(outcfg["enrichment_file"], index=False) # create corresponding enrichment pymol scripts outcfg["enrichment_pml_files"] = [] for sphere_view, pml_suffix in [ (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml") ]: pml_file = prefix + pml_suffix enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view) outcfg["enrichment_pml_files"].append(pml_file) # output EVzoom JSON file if we have stored model file if outcfg.get("model_file", None) is not None: outcfg["evzoom_file"] = prefix + "_evzoom.json" with open(outcfg["evzoom_file"], "w") as f: # create JSON output and write to file f.write( evzoom_json(model) + "\n" ) # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_standard.outcfg", outcfg) return outcfg
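# Tiny pandas example of the long-range filter used above
# (|i - j| >= min_sequence_distance); the EC table values are made up.
import pandas as pd

ecs = pd.DataFrame({
    "i": [3, 10, 25],
    "j": [5, 40, 90],
    "cn": [0.9, 0.7, 0.5],
})

min_sequence_distance = 6
ecs_longrange = ecs.query("abs(i - j) >= {}".format(min_sequence_distance))
print(ecs_longrange)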
def dihedral_ranking(structure_files, residues, chain=None,
                     sec_struct_column="sec_struct_3state", model=0):
    """
    Assess quality of a set of structure models by twist of
    predicted alpha-helices and beta-sheets.

    This function re-implements the final score table computed in
    make_alpha_beta_score_table.m from the original pipeline. Some of the
    implementation details were, however, modified, possibly leading to
    differences in the final computed ranking scores.

    Parameters
    ----------
    structure_files : list(str)
        Paths to PDB files that will be ranked (have to be in .pdb format)
    residues : pandas.DataFrame
        Residue table with secondary structure predictions
        (columns i, A_i and secondary structure column)
    chain : str, optional (default: None)
        Use this chain in each structure for the calculation. If None,
        will pick the only existing chain (if there are multiple chains,
        an InvalidParameterError will be raised).
    sec_struct_column : str, optional (default: sec_struct_3state)
        Column in residues dataframe that contains predicted secondary
        structure (H, E, C)
    model : int, optional (default: 0)
        Use this model from each PDB structure

    Returns
    -------
    pandas.DataFrame
        Table with final ranking scores (column ranking_score) and alpha
        and beta dihedral scores, as well as the possible maximum score
        (equal to the number of computed dihedrals).

    Raises
    ------
    InvalidParameterError
        If chain is None but multiple chains exist in any of the structures
    """
    res = []
    for filename in structure_files:
        # load structure
        struc = ClassicPDB.from_file(filename)

        # see if we can select the right chain
        if chain is None:
            chains = struc.model_to_chains[model]
            if len(chains) != 1:
                raise InvalidParameterError(
                    "Model has more than one chain, need to "
                    "specify chain parameter to disambiguate."
                )
            chain = chains[0]

        # extract chain from structure
        sel_chain = struc.get_chain(chain)

        # compute dihedral ranking score
        x = dihedral_ranking_score(
            sel_chain, residues, sec_struct_column, original=False
        )
        res.append((filename, *x))

    r = pd.DataFrame(res, columns=[
        "filename",
        "num_alpha_dihedrals", "alpha_dihedral_score",
        "num_beta_dihedrals", "beta_dihedral_score"
    ])

    # maximum score we could have obtained for either set of dihedrals
    max_alpha = r.num_alpha_dihedrals.max()

    # note that, unlike for alpha dihedrals, beta dihedrals are dependent
    # on what contacts between strands are made in 3D structure models
    max_beta = r.num_beta_dihedrals.max()

    # compute final ranking score
    # this computation is somewhat different from the original implementation:
    # - normalization for alpha helices is through number of possible dihedrals,
    #   not the number of residues with helix secondary structure
    # - scores for helix and beta dihedrals are not adjusted to values < 0 in
    #   borderline cases

    # make sure we do not divide by 0 if we didn't count any dihedrals at all
    max_val = max(1, max_alpha + max_beta)

    r.loc[:, "ranking_score"] = (
        (r.alpha_dihedral_score + r.beta_dihedral_score) / max_val
    )

    return r
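# Toy worked example of the normalization step at the end of dihedral_ranking:
# the summed dihedral scores are divided by the maximum attainable number of
# dihedrals across models. All counts and scores here are invented.
import pandas as pd

r = pd.DataFrame({
    "filename": ["model_1.pdb", "model_2.pdb"],
    "num_alpha_dihedrals": [40, 40],
    "alpha_dihedral_score": [31.0, 22.0],
    "num_beta_dihedrals": [12, 18],
    "beta_dihedral_score": [8.0, 15.0],
})

max_val = max(1, r.num_alpha_dihedrals.max() + r.num_beta_dihedrals.max())
r["ranking_score"] = (r.alpha_dihedral_score + r.beta_dihedral_score) / max_val
print(r[["filename", "ranking_score"]])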
def complex(**kwargs):
    """
    Protocol:
    Infer ECs for protein complexes from alignment using plmc.
    Allows user to select scoring protocol.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required and infer_plmc()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    # for additional required parameters, see infer_plmc()
    check_required(
        kwargs,
        [
            "prefix", "min_sequence_distance",
            "scoring_model", "use_all_ecs_for_scoring",
        ]
    )

    prefix = kwargs["prefix"]

    # infer ECs and load them
    outcfg, ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(outcfg["model_file"])

    # following computations are mostly specific to complex pipeline

    # add mixture model probability
    if kwargs["scoring_model"] in SCORING_MODELS:
        if kwargs["use_all_ecs_for_scoring"] is not None:
            use_all_ecs = kwargs["use_all_ecs_for_scoring"]
        else:
            use_all_ecs = False

        ecs = complex_probability(
            ecs, kwargs["scoring_model"], use_all_ecs
        )
    else:
        raise InvalidParameterError(
            "Invalid scoring_model parameter: " +
            "{}. Valid options are: {}".format(
                kwargs["scoring_model"], ", ".join(SCORING_MODELS)
            )
        )

    # also create line-drawing script (for multiple chains);
    # by convention, we map first segment to chain A,
    # second to B, a.s.f.
    chain_mapping = dict(
        zip(
            [s.segment_id for s in segments],
            string.ascii_uppercase,
        )
    )

    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_line_plot=True,
            generate_enrichment=False,
            ec_filter="segment_i != segment_j or abs(i - j) >= {}",
            chain=chain_mapping
        )
    }

    # save just the inter-protein ECs
    # TODO: eventually have this accomplished by _postprocess_inference;
    # right now avoiding a second call with a different ec_filter
    ecs = pd.read_csv(outcfg["ec_file"])
    outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv"
    inter_ecs = ecs.query("segment_i != segment_j")
    inter_ecs.to_csv(outcfg["inter_ec_file"], index=False)

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_complex.outcfg", outcfg)

    # TODO: make the following complex-ready
    #
    # EC enrichment:
    # 1) think about making EC enrichment complex-ready and add it back
    #    here - so far it only makes sense if all ECs are on one segment
    #
    # EVzoom:
    # 1) at the moment, EVzoom will use numbering before remapping;
    #    we should eventually get this to a point where segments + residue
    #    index are displayed on EVzoom
    # 2) note that this will currently use the default mixture model
    #    selection for determining the EC cutoff, rather than the selection
    #    used for the EC table above

    return outcfg
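# Minimal pandas example of the inter-protein EC filter used above
# (rows where the two positions lie on different segments); segment labels
# and scores are fabricated.
import pandas as pd

ecs = pd.DataFrame({
    "i": [12, 40, 7],
    "j": [55, 130, 61],
    "segment_i": ["A_1", "A_1", "B_1"],
    "segment_j": ["A_1", "B_1", "B_1"],
    "cn": [0.8, 0.6, 0.4],
})

inter_ecs = ecs.query("segment_i != segment_j")
print(inter_ecs)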
def cut_sequence(sequence, sequence_id, region=None, first_index=None, out_file=None):
    """
    Cut a given sequence to a sub-range and save it in a file

    Parameters
    ----------
    sequence : str
        Full sequence that will be cut
    sequence_id : str
        Identifier of sequence, used to construct header in output file
    region : tuple(int, int), optional (default: None)
        Region that will be cut out of full sequence. If None, full
        sequence will be returned.
    first_index : int, optional (default: None)
        Define index of first position in sequence.
        Will be set to 1 if None.
    out_file : str, optional (default: None)
        Save sequence in a FASTA file
        (header: >sequence_id/start_region-end_region)

    Returns
    -------
    tuple(int, int)
        Region. If no input region is given, this will be
        (first_index, first_index + len(sequence) - 1); otherwise,
        the input region is returned.
    str
        Subsequence contained in region

    Raises
    ------
    InvalidParameterError
        Upon invalid region specification
        (violating boundaries of sequence)
    """
    # (not using 1 as default value to allow parameter
    # to be unspecified in config file)
    if first_index is None:
        first_index = 1

    # last index is *inclusive*!
    if region is None:
        region = (first_index, first_index + len(sequence) - 1)
        cut_seq = sequence
    else:
        start, end = region
        str_start = start - first_index
        str_end = end - first_index + 1
        cut_seq = sequence[str_start:str_end]

        # make sure bounds are valid given the sequence that we have
        if str_start < 0 or str_end > len(sequence):
            raise InvalidParameterError(
                "Invalid sequence range: "
                "region={} first_index={} len(sequence)={}".format(
                    region, first_index, len(sequence)
                )
            )

    # save sequence to file
    if out_file is not None:
        with open(out_file, "w") as f:
            header = "{}/{}-{}".format(sequence_id, *region)
            write_fasta([(header, cut_seq)], f)

    return region, cut_seq
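# Small usage example of cut_sequence as defined above (no out_file, so only
# the cutting behavior is exercised). The sequence fragment and the identifier
# "EXAMPLE_PROTEIN" are invented for illustration.
seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"

# cut positions 5-10 (inclusive), numbering starting at 1
region, sub_seq = cut_sequence(seq, "EXAMPLE_PROTEIN", region=(5, 10))
print(region, sub_seq)   # (5, 10) YIAKQR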
def mean_field(**kwargs): """ Protocol: Infer ECs from alignment using mean field direct coupling analysis. For now, mean field DCA can only be run in focus mode, gaps included. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "segments", "focus_mode", "focus_sequence", "theta", "pseudo_count", "alphabet", "min_sequence_distance", # "save_model", ] ) if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." ) prefix = kwargs["prefix"] # option to save model disabled """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists alignment_file = kwargs["alignment_file"] verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # determine alphabet # default is protein if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # read in a2m alignment with open(alignment_file) as f: input_alignment = Alignment.from_file( f, alphabet=alphabet, format="fasta" ) # init mean field direct coupling analysis mf_dca = MeanFieldDCA(input_alignment) # run mean field approximation model = mf_dca.fit( theta=kwargs["theta"], pseudo_count=kwargs["pseudo_count"] ) # write ECs to file model.to_raw_ec_file( outcfg["raw_ec_file"] ) # write model file if outcfg["model_file"] is not None: model.to_file( outcfg["model_file"], file_format="plmc_v2" ) # store useful information about model in outcfg outcfg.update({ "num_sites": model.L, "num_valid_sequences": model.N_valid, "effective_sequences": float(round(model.N_eff, 1)), "region_start": int(model.index_list[0]), }) # read and sort ECs ecs = pd.read_csv( outcfg["raw_ec_file"], sep=" ", # for now, call the last two columns # "fn" and "cn" to prevent compare # stage from crashing names=["i", "A_i", "j", "A_j", "fn", "cn"] # names=["i", "A_i", "j", "A_j", "mi", "di"] ).sort_values( by="cn", ascending=False ) is_single_segment = segments is None or len(segments) == 1 outcfg = { **outcfg, **_postprocess_inference( ecs, kwargs, model, outcfg, prefix, generate_enrichment=is_single_segment, generate_line_plot=is_single_segment ) } # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg) return outcfg
def _identify_structures(**kwargs):
    """
    Identify set of 3D structures for comparison

    Parameters
    ----------
    **kwargs
        See check_required in code below

    Returns
    -------
    sifts_map : SIFTSResult
        Identified structures and residue index mappings, filtered by
        pdb_ids, max_num_hits and max_num_structures
    sifts_map_full : SIFTSResult
        Unfiltered set of identified structures and residue index mappings
    """

    def _filter_by_id(x, id_list):
        x = deepcopy(x)
        x.hits = x.hits.loc[x.hits.pdb_id.isin(id_list)]
        return x

    check_required(kwargs, [
        "prefix", "pdb_ids", "compare_multimer",
        "max_num_hits", "max_num_structures",
        "pdb_mmtf_dir", "sifts_mapping_table", "sifts_sequence_db",
        "by_alignment", "pdb_alignment_method", "alignment_min_overlap",
        "sequence_id", "sequence_file", "region",
        "use_bitscores", "domain_threshold", "sequence_threshold"
    ])

    # get SIFTS mapping object/sequence DB
    s = SIFTS(kwargs["sifts_mapping_table"], kwargs["sifts_sequence_db"])

    reduce_chains = not kwargs["compare_multimer"]

    # determine if we need to find structures
    # by sequence search or just fetching
    # based on Uniprot/PDB identifier
    if kwargs["by_alignment"]:
        # if searching by alignment, verify that
        # user selected jackhmmer or hmmsearch
        SEARCH_METHODS = ["jackhmmer", "hmmsearch"]

        if kwargs["pdb_alignment_method"] not in SEARCH_METHODS:
            raise InvalidParameterError(
                "Invalid pdb search method: "
                "{}. Valid selections are: {}".format(
                    kwargs["pdb_alignment_method"], ", ".join(SEARCH_METHODS)
                )
            )

        sifts_map = s.by_alignment(
            reduce_chains=reduce_chains,
            min_overlap=kwargs["alignment_min_overlap"],
            **kwargs
        )
    else:
        sifts_map = s.by_uniprot_id(
            kwargs["sequence_id"], reduce_chains=reduce_chains
        )

    sifts_map_full = deepcopy(sifts_map)

    # filter ID list down to manually selected PDB entries
    if kwargs["pdb_ids"] is not None:
        pdb_ids = kwargs["pdb_ids"]

        # make sure we have a list of PDB IDs
        if not isinstance(pdb_ids, list):
            pdb_ids = [pdb_ids]

        pdb_ids = [x.lower() for x in pdb_ids]

        sifts_map = _filter_by_id(sifts_map, pdb_ids)

    # limit number of hits and structures
    if kwargs["max_num_hits"] is not None:
        sifts_map.hits = sifts_map.hits.iloc[:kwargs["max_num_hits"]]

    if kwargs["max_num_structures"] is not None:
        keep_ids = sifts_map.hits.pdb_id.unique()
        keep_ids = keep_ids[:kwargs["max_num_structures"]]
        sifts_map = _filter_by_id(sifts_map, keep_ids)

    return sifts_map, sifts_map_full
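# Standalone sketch of the PDB-ID filtering and structure limiting performed
# above, on a plain DataFrame standing in for SIFTSResult.hits. The hit table
# and the selected IDs are fabricated.
import pandas as pd

hits = pd.DataFrame({
    "pdb_id": ["1abc", "1abc", "2xyz", "3def"],
    "pdb_chain": ["A", "B", "A", "A"],
})

# manual PDB selection (lower-cased, as above)
pdb_ids = [x.lower() for x in ["1ABC", "3DEF"]]
hits = hits.loc[hits.pdb_id.isin(pdb_ids)]

# keep at most N distinct structures
max_num_structures = 1
keep_ids = hits.pdb_id.unique()[:max_num_structures]
hits = hits.loc[hits.pdb_id.isin(keep_ids)]
print(hits)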
def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
    # this file is the starting point of the pipeline;
    # check if input alignment actually exists
    verify_resources(
        "Input alignment does not exist", input_alignment_file
    )

    # output prefix for all generated files (from job configuration)
    prefix = kwargs["prefix"]

    # first try to autodetect format of alignment
    with open(input_alignment_file) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment_file)
            )

    with open(input_alignment_file) as f:
        ali_raw = Alignment.from_file(f, format)

    # target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError(
            "Parameter sequence_id must be defined"
        )

    # first, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id
            )
        )

    # identify what columns (non-gap) to keep for focus;
    # this should be all columns in the raw_focus_alignment_file,
    # but checking anyway
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array([
        c not in [ali_raw._match_gap, ali_raw._insert_gap]
        for c in focus_seq
    ])

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given."
        )

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)
    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # swap target sequence to first position if it is not
    # the first sequence in alignment;
    # this is particularly important for hhfilter run
    # because target sequence might otherwise be filtered out
    if focus_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = focus_index
        indices[focus_index] = 0
        focus_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    # write the raw focus alignment for hmmbuild
    focus_fasta_file = prefix + "_raw_focus_input.fasta"
    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    return focus_fasta_file, target_sequence_file, region_start, region_end
def mean_field(**kwargs): """ Protocol: Infer ECs from alignment using mean field direct coupling analysis. For now, mean field DCA can only be run in focus mode, gaps included. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "segments", "focus_mode", "focus_sequence", "theta", "pseudo_count", "alphabet", "min_sequence_distance", # "save_model", "ec_score_type", ] ) if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." ) prefix = kwargs["prefix"] # option to save model disabled """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists alignment_file = kwargs["alignment_file"] verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # determine alphabet # default is protein if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # read in a2m alignment with open(alignment_file) as f: input_alignment = Alignment.from_file( f, alphabet=alphabet, format="fasta" ) # init mean field direct coupling analysis mf_dca = MeanFieldDCA(input_alignment) # run mean field approximation model = mf_dca.fit( theta=kwargs["theta"], pseudo_count=kwargs["pseudo_count"] ) # write ECs to file model.to_raw_ec_file( outcfg["raw_ec_file"] ) # write model file if outcfg["model_file"] is not None: model.to_file( outcfg["model_file"], file_format="plmc_v2" ) # store useful information about model in outcfg outcfg.update({ "num_sites": model.L, "num_valid_sequences": model.N_valid, "effective_sequences": float(round(model.N_eff, 1)), "region_start": int(model.index_list[0]), }) # read and sort ECs # Note: this now deviates from the original EC format # file because it has 4 score columns to accomodate # MI (raw), MI (APC-corrected), DI, CN; ecs = pd.read_csv( outcfg["raw_ec_file"], sep=" ", names=["i", "A_i", "j", "A_j", "mi_raw", "mi_apc", "di", "cn"] ) # select target score; # by default select CN score, since it allows to compute probabilities etc. 
    ec_score_type = kwargs.get("ec_score_type", "cn")
    valid_ec_type_choices = ["cn", "di", "mi_raw", "mi_apc"]

    if ec_score_type not in valid_ec_type_choices:
        raise InvalidParameterError(
            "Invalid choice for ec_score_type: {}, valid options are: {}".format(
                ec_score_type, ", ".join(valid_ec_type_choices)
            )
        )

    # perform rescoring if CN score is selected, otherwise cannot rescore
    # since all models are based on distribution shapes generated by CN score
    if ec_score_type == "cn":
        # perform EC rescoring starting from the CN score in the raw EC file;
        # outconfig update will be merged further down in final outcfg merge;
        # returned list is already sorted
        ecs, rescorer_outcfg_update = rescore_cn_score_ecs(
            ecs, segments, outcfg, kwargs, score="cn"
        )
    else:
        # if MI or DI, cannot apply distribution-based rescoring approaches,
        # so just set score column and add dummy probability value for
        # compatibility with downstream code
        ecs = ecs.assign(
            score=ecs[ec_score_type],
            probability=np.nan
        ).sort_values(
            by="score", ascending=False
        )

        # no additional values to be updated in outcfg in this case
        rescorer_outcfg_update = {}

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **rescorer_outcfg_update,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment,
            score="score"
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
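# Small sketch (illustration only, not part of the pipeline) of the non-CN
# fallback above: copy the selected score column into a generic "score"
# column, attach a placeholder probability, and sort descending. Column names
# follow the raw EC table described above; the function name is hypothetical.
def _example_select_score(ecs, ec_score_type="mi_apc"):
    """Copy chosen score column into "score" and sort descending (sketch)."""
    return ecs.assign(
        score=ecs[ec_score_type],
        probability=np.nan   # dummy value for downstream compatibility
    ).sort_values(by="score", ascending=False)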
def compare_models_maxcluster(experiments, predictions, norm_by_intersection=True, distance_cutoff=None, binary="maxcluster"): """ Compare predicted models to a set of experimental structures using maxcluster Parameters ---------- experiments : list(str) Paths to files with experimental structures predictions : list(str) Paths to files with predicted structures norm_by_intersection : bool, optional (default: True) If True, use the number of positions that exist in both experiment and predictions for normalizing TM scores (assumes all predicted structures have the same positions). If False, use length of experimental structure. distance_cutoff : float, optional (default: None) Distance cutoff for MaxSub search (-d option of maxcluster). If None, will use maxcluster auto-calibration. binary : str, optional (default: "maxcluster") Path to maxcluster binary Returns ------- full_result : pandas.DataFrame Comparison results across all experimental structures single_results : dict Mapping from experimental structure filename to a pandas.DataFrame containing the comparison result for that particular structure. """ # determine list of positions in a structure # (maxcluster can only handle single model, single chain # structures, so we check that here and fail otherwise) def _determine_pos(filename): structure = ClassicPDB.from_file(filename) if len(structure.model_to_chains) == 0: raise InvalidParameterError( "Structure contains no model (is empty): " + filename + " - please verify that no problems occurred during structure mapping" ) elif len(structure.model_to_chains) > 1: raise InvalidParameterError( "Structure contains more than one model: " + filename ) model = list(structure.model_to_chains.keys())[0] chains = structure.model_to_chains[model] if len(chains) != 1: raise InvalidParameterError( "Structure must contain exactly one chain, but contains: " + ",".join(chains) ) chain_name = chains[0] chain = structure.get_chain(chain_name, model) return chain.residues.id.astype(str).values, chain # remove alternative atom locations since maxcluster # can only handle one unique atoms def _eliminate_altloc(chain): # if multiple locations, select the one with the # highest occupancy chain.coords = chain.coords.loc[ chain.coords.groupby( ["residue_index", "atom_name"] ).occupancy.idxmax() ] # save cut chain to temporary file temp_filename = temp() with open(temp_filename, "w") as f: chain.to_file(f) return temp_filename # check we have at least one prediction if len(predictions) == 0: raise InvalidParameterError( "Need at least one predicted structure." 
        )

    # determine positions in predicted structure from first model
    pred_pos, _ = _determine_pos(predictions[0])

    # collect results of all comparisons here
    full_result = pd.DataFrame()
    single_results = {}

    for exp_file in experiments:
        # determine what number of positions to normalize
        # TM score over (either experiment, or only positions
        # that were modelled and are also present in experiment)
        exp_pos, exp_chain = _determine_pos(exp_file)

        # remove alternative atom locations
        exp_file_cleaned = _eliminate_altloc(exp_chain)

        # compute set of positions both in prediction and experiment
        joint_pos = set(exp_pos).intersection(pred_pos)

        if norm_by_intersection:
            normalization_length = len(joint_pos)
        else:
            normalization_length = len(exp_pos)

        # run comparison
        comp = run_maxcluster_compare(
            predictions, exp_file_cleaned,
            normalization_length=normalization_length,
            distance_cutoff=distance_cutoff,
            binary=binary
        )

        # store lengths of experiment, prediction,
        # and what was used for computing TM scores
        comp.loc[:, "filename_experimental"] = exp_file
        comp.loc[:, "L_experiment"] = len(exp_pos)
        comp.loc[:, "L_prediction"] = len(pred_pos)
        comp.loc[:, "L_joint"] = len(joint_pos)
        comp.loc[:, "L_normalization"] = normalization_length

        comp = comp.sort_values("tm", ascending=False)

        single_results[exp_file] = comp
        full_result = full_result.append(comp)

    return full_result, single_results
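# Minimal sketch (not part of the pipeline source) of the alternative-location
# filter used in _eliminate_altloc above: for each (residue_index, atom_name)
# pair, keep only the coordinate row with the highest occupancy. The column
# layout mirrors the chain.coords table assumed above; the helper name is
# hypothetical.
def _example_keep_highest_occupancy(coords):
    return coords.loc[
        coords.groupby(["residue_index", "atom_name"]).occupancy.idxmax()
    ]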
def read_psipred_prediction(filename, first_index=1):
    """
    Read a psipred secondary structure prediction file in
    horizontal or vertical format (auto-detected).

    Parameters
    ----------
    filename : str
        Path to prediction output file
    first_index : int, optional (default: 1)
        Index of first position in predicted sequence

    Returns
    -------
    pred : pandas.DataFrame
        Table containing secondary structure prediction,
        with the following columns:

        * i: position
        * A_i: amino acid
        * sec_struct_3state: prediction (H, E, C)

        If reading vformat, also contains columns with the individual
        class scores (score_coil/helix/strand).
        If reading hformat, also contains a confidence score between
        1 and 9 (sec_struct_conf).
    """
    # detect file format
    file_format = None
    with open(filename) as f:
        for line in f:
            if line.startswith("# PSIPRED HFORMAT"):
                file_format = "hformat"
            elif line.startswith("# PSIPRED VFORMAT"):
                file_format = "vformat"

    if file_format == "vformat":
        # read in output file
        pred = pd.read_csv(
            filename, skip_blank_lines=True, comment="#",
            delim_whitespace=True,
            names=[
                "i", "A_i", "sec_struct_3state",
                "score_coil", "score_helix", "score_strand"
            ],
        )
    elif file_format == "hformat":
        content = defaultdict(str)
        with open(filename) as f:
            # go through file and assemble Conf, Pred, and AA lines
            # into single strings
            for line in f:
                line = line.rstrip().replace(" ", "")
                if ":" in line:
                    key, _, value = line.partition(":")
                    content[key] += value

        pred = pd.DataFrame({
            "A_i": list(content["AA"]),
            "sec_struct_3state": list(content["Pred"]),
            "sec_struct_conf": list(map(int, content["Conf"])),
        })
        pred.loc[:, "i"] = list(range(1, len(pred) + 1))
    else:
        raise InvalidParameterError(
            "Input file is not a valid psipred prediction file"
        )

    # shift indices if first_index != 1
    pred.loc[:, "i"] += (first_index - 1)

    return pred
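# Tiny sketch (illustrative, not from the pipeline) of the hformat assembly
# logic above: concatenate the values of repeated "Conf:", "Pred:" and "AA:"
# lines into single strings keyed by line type. The input lines in the example
# call are made-up data.
from collections import defaultdict

def _example_assemble_hformat(lines):
    content = defaultdict(str)
    for line in lines:
        line = line.rstrip().replace(" ", "")
        if ":" in line:
            key, _, value = line.partition(":")
            content[key] += value
    return content

# _example_assemble_hformat([
#     "Conf: 987", "Pred: CHH", "  AA: MKL",
#     "Conf: 999", "Pred: HHH", "  AA: AVI",
# ]) -> {"Conf": "987999", "Pred": "CHHHHH", "AA": "MKLAVI"}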
def existing(**kwargs): """ Protocol: Use external sequence alignment and extract all relevant information from there (e.g. sequence, region, etc.), then apply gap & fragment filtering as usual Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * sequence_id (passed through from input) * alignment_file * raw_focus_alignment_file * statistics_file * sequence_file * first_index * target_sequence_file * annotation_file (None) * frequencies_file * identities_file * focus_mode * focus_sequence * segments """ check_required(kwargs, [ "prefix", "input_alignment", "sequence_id", "first_index", "extract_annotation" ]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # this file is starting point of pipeline; # check if input alignment actually exists input_alignment = kwargs["input_alignment"] verify_resources("Input alignment does not exist", input_alignment) # first try to autodetect format of alignment with open(input_alignment) as f: format = detect_format(f) if format is None: raise InvalidParameterError( "Format of input alignment {} could not be " "automatically detected.".format(input_alignment)) with open(input_alignment) as f: ali_raw = Alignment.from_file(f, format) # save annotation in sequence headers (species etc.) annotation_file = None if kwargs["extract_annotation"]: annotation_file = prefix + "_annotation.csv" from_anno_line = (format == "stockholm") annotation = extract_header_annotation(ali_raw, from_annotation=from_anno_line) annotation.to_csv(annotation_file, index=False) # Target sequence of alignment sequence_id = kwargs["sequence_id"] if sequence_id is None: raise InvalidParameterError("Parameter sequence_id must be defined") # First, find focus sequence in alignment focus_index = None for i, id_ in enumerate(ali_raw.ids): if id_.startswith(sequence_id): focus_index = i break # if we didn't find it, cannot continue if focus_index is None: raise InvalidParameterError( "Target sequence {} could not be found in alignment".format( sequence_id)) # identify what columns (non-gap) to keep for focus focus_seq = ali_raw[focus_index] focus_cols = np.array([ c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq ]) # extract focus alignment focus_ali = ali_raw.select(columns=focus_cols) focus_seq_nogap = "".join(focus_ali[focus_index]) # determine region of sequence. 
If first_index is given, # use that in any case, otherwise try to autodetect full_focus_header = ali_raw.ids[focus_index] focus_id = full_focus_header.split()[0] # try to extract region from sequence header id_, region_start, region_end = parse_header(focus_id) # override with first_index if given if kwargs["first_index"] is not None: region_start = kwargs["first_index"] region_end = region_start + len(focus_seq_nogap) - 1 if region_start is None or region_end is None: raise InvalidParameterError( "Could not extract region information " + "from sequence header {} ".format(full_focus_header) + "and first_index parameter is not given.") # resubstitute full sequence ID from identifier # and region information header = "{}/{}-{}".format(id_, region_start, region_end) focus_ali.ids[focus_index] = header # write target sequence to file target_sequence_file = prefix + ".fa" with open(target_sequence_file, "w") as f: write_fasta([(header, focus_seq_nogap)], f) # apply sequence identity and fragment filters, # and gap threshold mod_outcfg, ali = modify_alignment(focus_ali, focus_index, id_, region_start, **kwargs) # generate output configuration of protocol outcfg = { **mod_outcfg, "sequence_id": sequence_id, "sequence_file": target_sequence_file, "first_index": region_start, "target_sequence_file": target_sequence_file, "focus_sequence": header, "focus_mode": True, } if annotation_file is not None: outcfg["annotation_file"] = annotation_file # dump config to YAML file for debugging/logging write_config_file(prefix + ".align_existing.outcfg", outcfg) # return results of protocol return outcfg
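# Minimal sketch (assumption, for illustration only) of the region handling in
# the protocol above: a focus sequence header of the form "ID/start-end" is
# parsed, first_index (if given) overrides the start, and the header is then
# re-substituted from identifier and region. The regex-based parser below is a
# simplification; the pipeline uses its own parse_header() helper.
import re

def _example_region_from_header(header, seq_length, first_index=None):
    m = re.match(r"^(.+)/(\d+)-(\d+)$", header)
    if m:
        id_, start, end = m.group(1), int(m.group(2)), int(m.group(3))
    else:
        id_, start, end = header, None, None

    if first_index is not None:
        start = first_index
        end = start + seq_length - 1

    if start is None or end is None:
        raise ValueError("No region information in header and no first_index given")

    return id_, start, end, "{}/{}-{}".format(id_, start, end)

# _example_region_from_header("Q9XYZ1/20-119", seq_length=100)
# -> ("Q9XYZ1", 20, 119, "Q9XYZ1/20-119")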
def standard(**kwargs): """ Protocol: Predict 3D structure from evolutionary couplings Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * sec_struct_file * folding_ec_file * folded_structure_files """ check_required( kwargs, [ "prefix", "engine", "ec_file", "target_sequence_file", "segments", "folding_config_file", "cut_to_alignment_region", "sec_struct_method", "reuse_sec_struct", "sec_struct_file", "filter_sec_struct_clashes", "min_sequence_distance", "fold_probability_cutoffs", "fold_lowest_count", "fold_highest_count", "fold_increase", "num_models", "psipred", "cpu", "remapped_pdb_files", "cleanup", ] ) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) outcfg = { "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv", "sec_struct_file": prefix + "_secondary_structure.csv", } # get secondary structure prediction # check if we should (and can) reuse output file from previous run if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]): residues = pd.read_csv(outcfg["sec_struct_file"]) else: residues = secondary_structure(**kwargs) # make pymol secondary structure assignment script outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml" pymol_secondary_structure( residues, outcfg["secondary_structure_pml_file"] ) # load ECs and filter for long-range pairs verify_resources( "EC file does not exist", kwargs["ec_file"] ) ecs_all = pd.read_csv(kwargs["ec_file"]) ecs = ecs_all.query("abs(i - j) > {}".format( kwargs["min_sequence_distance"]) ) # find secondary structure clashes ecs = secstruct_clashes(ecs, residues) ecs.to_csv(outcfg["folding_ec_file"], index=False) # if requested, filter clashes out before folding if kwargs["filter_sec_struct_clashes"]: ecs_fold = ecs.loc[~ecs.ss_clash] else: ecs_fold = ecs # cut modelled region to aligned region, if selected if kwargs["cut_to_alignment_region"]: segments = kwargs["segments"] # infer region from segment positions if we have it if segments is not None: positions = Segment.from_list(segments[0]).positions else: # otherwise get from EC values (could be misleading if # EC list is truncated, so only second option) positions = set(ecs.i.unique()).union(ecs.j.unique()) # limit modelled positions to covered region first_pos, last_pos = min(positions), max(positions) residues.loc[:, "in_model"] = False residues.loc[ (residues.i >= first_pos) & (residues.i <= last_pos), "in_model" ] = True else: # otherwise include all positions in model residues.loc[:, "in_model"] = True # save secondary structure prediction residues.to_csv(outcfg["sec_struct_file"], index=False) # only use the residues that will be in model for folding residues_fold = residues.loc[residues.in_model] # after all the setup, now fold the structures... # to speed things up, parallelize this to the number of # available CPUs num_procs = kwargs["cpu"] if num_procs is None: num_procs = 1 # first define all the sub-runs... folding_runs = [] # ... based on mixture model probability cutoffs = kwargs["fold_probability_cutoffs"] if cutoffs is not None and "probability" in ecs_fold.columns: if not isinstance(cutoffs, list): cutoffs = [cutoffs] for c in cutoffs: sig_ecs = ecs_fold.query("probability >= @c") if len(sig_ecs) > 0: folding_runs.append( (sig_ecs, "_significant_ECs_{}".format(c)) ) # ... 
and on simple EC counts/bins flc = kwargs["fold_lowest_count"] fhc = kwargs["fold_highest_count"] fi = kwargs["fold_increase"] if flc is not None and fhc is not None and fi is not None: num_sites = len( set.union(set(ecs.i.unique()), set(ecs.j.unique())) ) # transform fraction of number of sites into discrete number of ECs def _discrete_count(x): if isinstance(x, float): x = ceil(x * num_sites) return int(x) # range of plots to make lowest = _discrete_count(flc) highest = _discrete_count(fhc) step = _discrete_count(fi) # append to list of jobs to run folding_runs += [ ( ecs_fold.iloc[:c], "_{}".format(c) ) for c in range(lowest, highest + 1, step) ] # set up method to drive the folding of each job method = kwargs["engine"] # store structures in an auxiliary subdirectory, after folding # final models will be moved to main folding dir. Depending # on cleanup setting, the aux directory will be removed aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) aux_dir = path.dirname(aux_prefix) folding_runs = [ (job_ecs, aux_prefix + job_suffix) for (job_ecs, job_suffix) in folding_runs ] if method == "cns_dgsa": folder = partial( cns_dgsa_fold, residues_fold, config_file=kwargs["folding_config_file"], num_structures=kwargs["num_models"], log_level=None, binary=kwargs["cns"] ) else: raise InvalidParameterError( "Invalid folding engine: {} ".format(method) + "Valid selections are: cns_dgsa" ) # then apply folding function to each sub-run pool = mp.Pool(processes=num_procs) results = pool.starmap(folder, folding_runs) # make double sure that the pool is cleaned up, # or SIGTERM upon exit will interfere with # interrupt signal interception pool.close() pool.join() # merge result dictionaries into one dict folded_files = { k: v for subres in results for k, v in subres.items() } # move structures from aux into main folding dir fold_dir = path.dirname(prefix) prediction_files = [] for name, file_path in folded_files.items(): # move file (use copy to allow overwriting) shutil.copy(file_path, fold_dir) # update file path to main folding dir, # and put in a flat list of result files prediction_files.append( file_path.replace(aux_prefix, prefix) ) outcfg["folded_structure_files"] = prediction_files # remove aux dir if cleanup is requested if kwargs["cleanup"]: shutil.rmtree(aux_dir) # apply ranking to predicted models ranking = dihedral_ranking(prediction_files, residues) # apply clustering (all available methods), but only # if we have something to cluster if len(prediction_files) > 1: clustering = maxcluster_clustering_table( prediction_files, binary=kwargs["maxcluster"] ) # join ranking with clustering ranking = ranking.merge(clustering, on="filename", how="left") # sort by score (best models first) ranking = ranking.sort_values(by="ranking_score", ascending=False) # store as file outcfg["folding_ranking_file"] = prefix + "_ranking.csv" ranking.to_csv(outcfg["folding_ranking_file"], index=False) # apply comparison to existing structures if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0: experimental_files = kwargs["remapped_pdb_files"] comp_all, comp_singles = compare_models_maxcluster( list(experimental_files.keys()), prediction_files, norm_by_intersection=True, distance_cutoff=None, binary=kwargs["maxcluster"] ) # merge with ranking and save comparison = ranking.merge( comp_all, on="filename", how="left" ).sort_values(by="tm", ascending=False) outcfg["folding_comparison_file"] = prefix + "_comparison.csv" comparison.to_csv(outcfg["folding_comparison_file"], 
index=False) # also store comparison to structures in individual files ind_comp_files = {} for filename, comp_single in comp_singles.items(): comparison_s = ranking.merge( comp_single, on="filename", how="left" ).sort_values(by="tm", ascending=False) basename = path.splitext(path.split(filename)[1])[0] ind_file = path.join(fold_dir, basename + ".csv") # map back to original key from remapped_pdb_files as a key for this list ind_comp_files[ind_file] = experimental_files[filename] comparison_s.to_csv(ind_file, index=False) outcfg["folding_individual_comparison_files"] = ind_comp_files return outcfg
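# Illustrative sketch (not part of the pipeline source) of how the EC-count
# sub-runs in the folding protocol above are derived: fractional values are
# interpreted as a fraction of the number of sites and rounded up, integers
# are taken literally, and a range of EC cutoffs is generated from lowest to
# highest in the given step. Parameter values in the example call are made up.
from math import ceil

def _example_ec_count_range(num_sites, lowest, highest, increase):
    def _discrete(x):
        return int(ceil(x * num_sites)) if isinstance(x, float) else int(x)

    low, high, step = _discrete(lowest), _discrete(highest), _discrete(increase)
    return list(range(low, high + 1, step))

# _example_ec_count_range(num_sites=150, lowest=0.5, highest=1.3, increase=0.05)
# -> EC cutoffs 75, 83, 91, ..., 195 (step of ceil(0.05 * 150) = 8)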
def complex(**kwargs): """ Protocol: Compare ECs for a complex to 3D structure Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * ec_file_compared_all * ec_file_compared_all_longrange * pdb_structure_hits * distmap_monomer * distmap_multimer * contact_map_files * remapped_pdb_files """ check_required(kwargs, [ "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir", "atom_filter", "first_compare_multimer", "second_compare_multimer", "distance_cutoff", "first_sequence_id", "second_sequence_id", "first_sequence_file", "second_sequence_file", "first_segments", "second_segments", "first_target_sequence_file", "second_target_sequence_file", "scale_sizes" ]) prefix = kwargs["prefix"] outcfg = { # initialize output EC files "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv", # initialize output inter distancemap files "distmap_inter": prefix + "_distmap_inter", "inter_contacts_file": prefix + "_inter_contacts_file" } # Add PDB comparison files for first and second monomer for monomer_prefix in ["first", "second"]: outcfg = { **outcfg, monomer_prefix + "_pdb_structure_hits_file": "{}_{}_structure_hits.csv".format(prefix, monomer_prefix), monomer_prefix + "_pdb_structure_hits_unfiltered_file": "{}_{}_structure_hits_unfitered.csv".format( prefix, monomer_prefix), monomer_prefix + "_distmap_monomer": "{}_{}_distance_map_monomer".format(prefix, monomer_prefix), monomer_prefix + "_distmap_multimer": "{}_{}_distance_map_multimer".format(prefix, monomer_prefix), } # make sure EC file exists verify_resources("EC file does not exist", kwargs["ec_file"]) # make sure output directory exists create_prefix_folders(prefix) # store auxiliary files here (too much for average user) aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) create_prefix_folders(aux_prefix) # store auxiliary files here (too much for average user) first_aux_prefix = insert_dir(aux_prefix, "first_monomer", rootname_subdir=False) create_prefix_folders(first_aux_prefix) # store auxiliary files here (too much for average user) second_aux_prefix = insert_dir(aux_prefix, "second_monomer", rootname_subdir=False) create_prefix_folders(second_aux_prefix) # Step 1: Identify 3D structures for comparison def _identify_monomer_structures(name_prefix, outcfg, aux_prefix): # create a dictionary with kwargs for just the current monomer # remove the "prefix" kwargs so that we can replace with the # aux prefix when calling _identify_structures # only replace first occurrence of name_prefix monomer_kwargs = { k.replace(name_prefix + "_", "", 1): v for k, v in kwargs.items() if "prefix" not in k } # this field needs to be set explicitly else it gets overwritten by concatenated file monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"] monomer_kwargs["raw_focus_alignment_file"] = kwargs[ name_prefix + "_raw_focus_alignment_file"] # identify structures for that monomer sifts_map, sifts_map_full = _identify_structures(**monomer_kwargs, prefix=aux_prefix) # save selected PDB hits sifts_map.hits.to_csv(outcfg[name_prefix + "_pdb_structure_hits_file"], index=False) # also save full list of hits sifts_map_full.hits.to_csv( outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False) return outcfg, 
sifts_map outcfg, first_sifts_map = _identify_monomer_structures( "first", outcfg, first_aux_prefix) outcfg, second_sifts_map = _identify_monomer_structures( "second", outcfg, second_aux_prefix) # get the segment names from the kwargs segment_list = kwargs["segments"] # Make sure user provided exactly two segments if len(segment_list) != 2: raise InvalidParameterError( "Compare stage for protein complexes requires exactly two segments" ) first_segment_name = kwargs["segments"][0][0] second_segment_name = kwargs["segments"][1][0] # Step 2: Compute distance maps def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): # prepare a sequence map to remap the structures we have found verify_resources("Target sequence file does not exist", kwargs[name_prefix + "_target_sequence_file"]) # create target sequence map for remapping structure with open(kwargs[name_prefix + "_target_sequence_file"]) as f: header, seq = next(read_fasta(f)) # create target sequence map for remapping structure seq_id, seq_start, seq_end = parse_header(header) seqmap = dict(zip(range(seq_start, seq_end + 1), seq)) # compute distance maps and save # (but only if we found some structure) if len(sifts_map.hits) > 0: d_intra = intra_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_" + name_prefix + "_distmap_intra") d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"]) # save contacts to separate file outcfg[ name_prefix + "_monomer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_monomer.csv" d_intra.contacts(kwargs["distance_cutoff"]).to_csv( outcfg[name_prefix + "_monomer_contacts_file"], index=False) # compute multimer distances, if requested; # note that d_multimer can be None if there # are no structures with multiple chains if kwargs[name_prefix + "_compare_multimer"]: d_multimer = multimer_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_" + name_prefix + "_distmap_multimer") else: d_multimer = None # if we have a multimer contact map, save it if d_multimer is not None: d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"]) outcfg[ name_prefix + "_multimer_contacts_file"] = prefix + name_prefix + "_contacts_multimer.csv" # save contacts to separate file d_multimer.contacts(kwargs["distance_cutoff"]).to_csv( outcfg[name_prefix + "_multimer_contacts_file"], index=False) else: outcfg[name_prefix + "_distmap_multimer"] = None # create remapped structures (e.g. 
for # later comparison of folding results) # remap structures, swap mapping index and filename in # dictionary so we have a list of files in the dict keys outcfg[name_prefix + "_remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( sifts_map, aux_prefix, seqmap, chain_name=chain_name, raise_missing=kwargs["raise_missing"]).items() } else: # if no structures, cannot compute distance maps d_intra = None d_multimer = None outcfg[name_prefix + "_distmap_monomer"] = None outcfg[name_prefix + "_distmap_multimer"] = None outcfg[name_prefix + "remapped_pdb_files"] = None return d_intra, d_multimer, seqmap # load all structures for both monomers all_structures = set(first_sifts_map.hits.pdb_id).union( set(second_sifts_map.hits.pdb_id)) structures = load_structures(all_structures, kwargs["pdb_mmtf_dir"], raise_missing=False) d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps( first_sifts_map, "first", "A") d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps( second_sifts_map, "second", "B") # compute inter distance map if sifts map for each monomer exists if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0: d_inter = inter_dists(first_sifts_map, second_sifts_map, raise_missing=kwargs["raise_missing"]) # if there were overlapping PDBs, save the results if d_inter is not None: d_inter.to_file(outcfg["distmap_inter"]) # save contacts to separate file d_inter.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["inter_contacts_file"], index=False) else: outcfg["inter_contacts_file"] = None d_inter = None # # Step 3: Compare ECs to distance maps ec_table = pd.read_csv(kwargs["ec_file"]) for out_file, min_seq_dist in [ ("ec_compared_longrange_file", kwargs["min_sequence_distance"]), ("ec_compared_all_file", 0), ]: # compare ECs only if we have an intra distance map # for at least one monomer - inter can't exist unless # we have both monomers if (d_intra_i is not None) or (d_intra_j is not None): # compare distances individually for each segment pair ecs_intra_i = ec_table.query( "segment_i == segment_j == @first_segment_name") if d_intra_i is not None: ecs_intra_i_compared = coupling_scores_compared( ecs_intra_i, d_intra_i, d_multimer_i, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist=min_seq_dist) else: # If no distance map, the distance is saved as np.nan ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan) ecs_intra_j = ec_table.query( "segment_i == segment_j == @second_segment_name") if d_intra_j is not None: ecs_intra_j_compared = coupling_scores_compared( ecs_intra_j, d_intra_j, d_multimer_j, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist=min_seq_dist) else: ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan) ecs_inter = ec_table.query("segment_i != segment_j") if d_inter is not None: ecs_inter_compared = coupling_scores_compared( ecs_inter, d_inter, dist_map_multimer=None, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist= None # does not apply for inter-protein ECs ) else: ecs_inter_compared = ecs_inter.assign(dist=np.nan) # combine the tables ec_table_compared = pd.concat([ ecs_inter_compared, ecs_intra_i_compared, ecs_intra_j_compared ]) # rename the precision column to "segmentwise_precision" # because we calculated precision for each segment independently ec_table_compared = ec_table_compared.rename( columns={"precision": "segmentwise_precision"}) # TODO: change "cn" to "score" eventually ec_table_compared = 
ec_table_compared.sort_values("cn", ascending=False) # add the total precision # TODO: implement different cutoffs for intra vs inter contacts ec_table_compared = add_precision( ec_table_compared, dist_cutoff=kwargs["distance_cutoff"]) # save to file # all ecs ec_table_compared.to_csv(outcfg[out_file]) # save the inter ECs to a file ecs_inter_compared.to_csv(outcfg["ec_compared_inter_file"]) # create the inter-ecs line drawing script if outcfg["ec_compared_inter_file"] is not None and kwargs[ "plot_highest_count"] is not None: inter_ecs = ec_table.query("segment_i != segment_j") outcfg[ "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml" pairs.ec_lines_pymol_script( inter_ecs.iloc[:kwargs["plot_highest_count"], :], outcfg["ec_lines_compared_pml_file"], distance_cutoff=kwargs["distance_cutoff"], chain={ first_segment_name: "A", second_segment_name: "B" }) # Remap the complex crystal structures, if available if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0: outcfg["complex_remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_complex_chains( first_sifts_map, second_sifts_map, seqmap_i, seqmap_j, output_prefix=aux_prefix, raise_missing=kwargs["raise_missing"]).items() } # Step 4: Make contact map plots # if no structures available, defaults to EC-only plot outcfg["contact_map_files"] = _make_complex_contact_maps( ec_table, d_intra_i, d_multimer_i, d_intra_j, d_multimer_j, d_inter, first_segment_name, second_segment_name, **kwargs) return outcfg
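# Small sketch (illustration only) of the segment-based splitting of the EC
# table used in the complex comparison above: intra-segment ECs for each
# monomer are selected where both segment columns equal that segment name,
# inter ECs where they differ. The query strings mirror the
# pandas.DataFrame.query calls in the protocol; the helper name is hypothetical.
def _example_split_ecs_by_segment(ec_table, first_segment_name, second_segment_name):
    ecs_intra_i = ec_table.query("segment_i == segment_j == @first_segment_name")
    ecs_intra_j = ec_table.query("segment_i == segment_j == @second_segment_name")
    ecs_inter = ec_table.query("segment_i != segment_j")
    return ecs_intra_i, ecs_intra_j, ecs_inter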
def secondary_structure(**kwargs): """ Predict or load secondary structure for an input sequence Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- residues : pandas.DataFrame Table with sequence and secondary structure in columns i, A_i and sec_struct_3state """ check_required( kwargs, [ "prefix", "target_sequence_file", "segments", "sec_struct_method", "sec_struct_file", "psipred", ] ) prefix = kwargs["prefix"] create_prefix_folders(prefix) secstruct_file = kwargs["sec_struct_file"] if secstruct_file is not None: verify_resources( "Secondary structure prediction file does not exist/is empty", secstruct_file ) residues = pd.read_csv(secstruct_file) else: # make sure target sequence file is there so we can # predict secondary structure target_seq_file = kwargs["target_sequence_file"] verify_resources( "Sequence file does not exist/is empty", target_seq_file ) # we need to figure out what the index of the first residue # in the target sequence is; obtain first index from segment # information if possible if kwargs["segments"] is not None: s = Segment.from_list(kwargs["segments"][0]) first_index = s.region_start else: # otherwise try to get it from sequence file first_index = None with open(target_seq_file) as f: header, _ = next(read_fasta(f)) if header is not None: _, first_index, _ = parse_header(header) # if we cannot identify first index from header, # do not make guesses but fail if first_index is None: raise InvalidParameterError( "Could not unambiguously identify sequence range from " "FASTA header, needs to specified as id/start-end: {}".format( header ) ) # finally, run secondary structure prediction if kwargs["sec_struct_method"] == "psipred": # store psipred output in a separate directory output_dir = path.join(path.dirname(prefix), "psipred") # run psipred ss2_file, horiz_file = run_psipred( target_seq_file, output_dir, binary=kwargs["psipred"] ) # parse output, renumber to first index residues = read_psipred_prediction( horiz_file, first_index=first_index ) else: raise InvalidParameterError( "Secondary structure prediction method not implemented: " "{}. Valid choices: psipred".format(kwargs["sec_struct_method"]) ) # return predicted table return residues
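# Minimal sketch (assumption, for illustration) of the first-index logic in
# secondary_structure() above: prefer the region start from the first segment
# if segment information exists, otherwise fall back to parsing the
# "id/start-end" FASTA header of the target sequence. parse_region_start is a
# hypothetical stand-in for the header parsing done by parse_header(); the
# (name, start, end) segment layout assumed here is for illustration only.
def _example_first_index(segments, header, parse_region_start):
    if segments is not None:
        # assume a (name, start, end) layout for the segment list
        return segments[0][1]

    first_index = parse_region_start(header)
    if first_index is None:
        raise ValueError(
            "Could not determine sequence range, header must be id/start-end: "
            + str(header)
        )
    return first_index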