def _docking_config(config_file=None):
    """
    Load docking configuration

    Parameters
    ----------
    config_file : str, optional (default: None)
        Path to configuration file. If None, loads default
        configuration included with package.

    Returns
    -------
    dict
        Loaded configuration
    """
    if config_file is None:
        # get path of config within package
        config_file = resource_filename(
            __name__, "cns_templates/haddock_restraints.yml"
        )

    # check if config file exists and read
    verify_resources(
        "Docking config file does not exist or is empty", config_file
    )

    return read_config_file(config_file)
def run_hhfilter(input_file, output_file, threshold=95,
                 columns="a2m", binary="hhfilter"):
    """
    Redundancy-reduce a sequence alignment using hhfilter
    from the HHsuite alignment suite.

    Parameters
    ----------
    input_file : str
        Path to input alignment in A2M/FASTA format
    output_file : str
        Path to output alignment (will be in A3M format)
    threshold : int, optional (default: 95)
        Sequence identity threshold for maximum pairwise
        identity (between 0 and 100)
    columns : {"first", "a2m"}, optional (default: "a2m")
        Definition of match columns (based on first sequence
        or upper-case columns (a2m))
    binary : str
        Path to hhfilter binary

    Returns
    -------
    str
        output_file

    Raises
    ------
    ResourceError
        If output alignment is non-existent/empty
    ValueError
        Upon invalid value of columns parameter
    """
    if columns not in ["first", "a2m"]:
        raise ValueError("Invalid column selection: {}".format(columns))

    verify_resources(
        "Alignment file does not exist or is empty", input_file
    )

    create_prefix_folders(output_file)

    cmd = [
        binary,
        "-i", input_file,
        "-o", output_file,
        "-id", str(threshold),
        "-M", columns,
        "-v", str(2)
    ]

    return_code, stdout, stderr = run(cmd)

    verify_resources(
        "hhfilter returned empty alignment: "
        "stdout={} stderr={} file={}".format(stdout, stderr, output_file),
        output_file
    )

    return output_file
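# Usage sketch for run_hhfilter (paths are hypothetical; assumes the
# hhfilter binary from HH-suite is installed and on PATH):
#
# >>> filtered = run_hhfilter(
# ...     "aln/query.a2m", "aln/query_filtered.a3m",
# ...     threshold=90, columns="a2m"
# ... )
# >>> filtered
# 'aln/query_filtered.a3m'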
def fetch_sequence(sequence_id, sequence_file,
                   sequence_download_url, out_file):
    """
    Fetch sequence either from database based on identifier, or from
    input sequence file.

    Parameters
    ----------
    sequence_id : str
        Identifier of sequence that should be retrieved
    sequence_file : str
        File containing sequence. If None, sequence will be
        downloaded from sequence_download_url
    sequence_download_url : str
        URL from which to download missing sequence. Must contain
        "{}" at the position where the sequence ID will be inserted
        into the download URL (using str.format).
    out_file : str
        Output file in which sequence will be stored, if
        sequence_file does not exist.

    Returns
    -------
    str
        Path of file with stored sequence (can be sequence_file
        or out_file)
    tuple (str, str)
        Identifier of sequence as stored in file, and sequence
    """
    if sequence_file is None:
        get(
            sequence_download_url.format(sequence_id),
            out_file,
            allow_redirects=True
        )
    else:
        # if we have a sequence file, try to copy it
        try:
            copy(sequence_file, out_file)
        except FileNotFoundError:
            raise ResourceError(
                "sequence_file does not exist: {}".format(sequence_file)
            )

    # also make sure input file has something in it
    verify_resources("Input sequence missing", out_file)

    with open(out_file) as f:
        seq = next(read_fasta(f))

    return out_file, seq
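# Usage sketch for fetch_sequence (identifier, URL template and paths
# are hypothetical). With sequence_file=None, the sequence is downloaded
# by substituting the ID into the URL template:
#
# >>> out_file, (seq_id, seq) = fetch_sequence(
# ...     "P0A6X3", None,
# ...     "https://www.uniprot.org/uniprot/{}.fasta",
# ...     "output/P0A6X3.fa"
# ... )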
def run(**kwargs):
    """
    EVcouplings pipeline execution from a configuration file
    (single thread, no batch or environment configuration)

    Parameters
    ----------
    kwargs
        See click.option decorators for app()
    """
    config_file = kwargs["config"]
    verify_resources(
        "Config file does not exist or is empty.", config_file
    )

    # read configuration and execute
    config = read_config_file(config_file)

    # execute configuration in "wrapped" mode
    # that handles exceptions and internal interrupts
    return execute_wrapped(**config)
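# Equivalent manual invocation of this entry point (sketch; the config
# file path is hypothetical):
#
# >>> config = read_config_file("config/monomer_config.txt")
# >>> global_state = execute_wrapped(**config)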
def _cns_render_template(template_name, mapping):
    """
    Render an included CNS template .inp

    Parameters
    ----------
    template_name : str
        Name of CNS template (e.g. dg_sa)
    mapping : dict
        Values to be substituted into template

    Returns
    -------
    str
        Rendered template
    """
    # get path of template within package
    template_file = resource_filename(
        __name__, "cns_templates/{}.inp".format(template_name)
    )

    verify_resources(
        "CNS template does not exist: {}".format(template_file),
        template_file
    )

    return render_template(template_file, mapping)
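# Usage sketch (the template name "dg_sa" is from the docstring example;
# the mapping keys below are hypothetical and depend on the placeholders
# actually used in the .inp template):
#
# >>> inp_script = _cns_render_template(
# ...     "dg_sa", {"pdb_infile": "model_input.pdb", "num_structures": 10}
# ... )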
def execute(**config):
    """
    Execute a pipeline configuration

    Parameters
    ----------
    **config
        Input configuration for pipeline
        (see pipeline config files for examples
        of what this should look like)

    Returns
    -------
    global_state : dict
        Global output state of pipeline
    """
    check_required(config, ["pipeline", "stages", "global"])

    # check if valid pipeline was selected
    if config["pipeline"] not in PIPELINES:
        raise InvalidParameterError(
            "Not a valid pipeline selection. "
            "Valid choices are:\n{}".format(
                ", ".join(PIPELINES.keys())
            )
        )

    stages = config["stages"]
    if stages is None:
        raise InvalidParameterError("No stages defined, need at least one.")

    # get definition of selected pipeline
    pipeline = PIPELINES[config["pipeline"]]
    prefix = config["global"]["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this is the global state of results as
    # we move through different stages of
    # the pipeline
    global_state = config["global"]

    # keep track of how many stages are still
    # to be run, so we can leave out stages at
    # the end of the workflow below
    num_stages_to_run = len(stages)

    # get job tracker
    tracker = get_result_tracker(config)

    # set job status to running and also initialize global state
    tracker.update(status=EStatus.RUN, results=global_state)

    # iterate through individual stages
    for (stage, runner, key_prefix) in pipeline:
        # check if anything else is left to
        # run, otherwise skip
        if num_stages_to_run == 0:
            break

        # check if config for stage is there
        check_required(config, [stage])

        # output files for stage into an individual folder
        stage_prefix = insert_dir(prefix, stage)
        create_prefix_folders(stage_prefix)

        # config files for input and output of stage
        stage_incfg = "{}_{}.incfg".format(stage_prefix, stage)
        stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage)

        # update current stage of job
        tracker.update(stage=stage)

        # check if stage should be executed
        if stage in stages:
            # global state inserted at end, overrides any
            # stage-specific settings (except for custom prefix)
            incfg = {
                **config["tools"],
                **config["databases"],
                **config[stage],
                **global_state,
                "prefix": stage_prefix
            }

            # save input of stage in config file
            write_config_file(stage_incfg, incfg)

            # run stage
            outcfg = runner(**incfg)

            # prefix output keys if this parameter is
            # given in stage configuration, to avoid
            # name clashes if same protocol run multiple times
            if key_prefix is not None:
                outcfg = {key_prefix + k: v for k, v in outcfg.items()}

            # save output of stage in config file
            write_config_file(stage_outcfg, outcfg)

            # one less stage to put through after we ran this...
            num_stages_to_run -= 1
        else:
            # skip stage by injecting state from previous run
            verify_resources(
                "Trying to skip, but output configuration "
                "for stage '{}' does not exist. Has it already "
                "been run?".format(stage),
                stage_outcfg
            )

            # read output configuration
            outcfg = read_config_file(stage_outcfg)

            # verify all the output files are there
            outfiles = [
                filepath for f, filepath in outcfg.items()
                if f.endswith("_file") and filepath is not None
            ]

            verify_resources(
                "Output files from stage '{}' missing".format(stage),
                *outfiles
            )

        # update global state with outputs of stage
        global_state = {**global_state, **outcfg}

        # update state in tracker accordingly
        tracker.update(results=outcfg)

    # create results archive
    archive_file = create_archive(config, global_state, prefix)

    # only store results archive if a result file was created
    if archive_file is not None:
        global_state["archive_file"] = archive_file

        # prepare update for tracker, but only store in last
        # go when job is set to done
        tracker_archive_update = {"archive_file": archive_file}
    else:
        tracker_archive_update = None

    # set job status to done and transfer archive if selected for syncing
    tracker.update(status=EStatus.DONE, results=tracker_archive_update)

    # delete selected output files if requested;
    # tracker does not need to update here since it won't
    # sync entries of delete list in the first place
    global_state = delete_outputs(config, global_state)

    # write final global state of pipeline
    write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state)

    return global_state
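# Sketch of the per-stage file layout created by execute() above,
# assuming prefix="output/run1/run1" and a stage named "align", and
# assuming insert_dir places a stage subdirectory under the run
# directory (illustrative only):
#
# >>> stage_prefix = insert_dir("output/run1/run1", "align")
# >>> "{}_{}.incfg".format(stage_prefix, "align")
# 'output/run1/align/run1_align.incfg'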
def mean_field(**kwargs): """ Protocol: Infer ECs from alignment using mean field direct coupling analysis. For now, mean field DCA can only be run in focus mode, gaps included. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "segments", "focus_mode", "focus_sequence", "theta", "pseudo_count", "alphabet", "min_sequence_distance", # "save_model", ] ) if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." ) prefix = kwargs["prefix"] # option to save model disabled """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists alignment_file = kwargs["alignment_file"] verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # determine alphabet # default is protein if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # read in a2m alignment with open(alignment_file) as f: input_alignment = Alignment.from_file( f, alphabet=alphabet, format="fasta" ) # init mean field direct coupling analysis mf_dca = MeanFieldDCA(input_alignment) # run mean field approximation model = mf_dca.fit( theta=kwargs["theta"], pseudo_count=kwargs["pseudo_count"] ) # write ECs to file model.to_raw_ec_file( outcfg["raw_ec_file"] ) # write model file if outcfg["model_file"] is not None: model.to_file( outcfg["model_file"], file_format="plmc_v2" ) # store useful information about model in outcfg outcfg.update({ "num_sites": model.L, "num_valid_sequences": model.N_valid, "effective_sequences": float(round(model.N_eff, 1)), "region_start": int(model.index_list[0]), }) # read and sort ECs ecs = pd.read_csv( outcfg["raw_ec_file"], sep=" ", # for now, call the last two columns # "fn" and "cn" to prevent compare # stage from crashing names=["i", "A_i", "j", "A_j", "fn", "cn"] # names=["i", "A_i", "j", "A_j", "mi", "di"] ).sort_values( by="cn", ascending=False ) is_single_segment = segments is None or len(segments) == 1 outcfg = { **outcfg, **_postprocess_inference( ecs, kwargs, model, outcfg, prefix, generate_enrichment=is_single_segment, generate_line_plot=is_single_segment ) } # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg) return outcfg
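# Minimal invocation sketch for the mean-field protocol (values are
# illustrative only; see the check_required call above for the full
# list of mandatory parameters):
#
# >>> outcfg = mean_field(
# ...     prefix="output/couplings/run1",
# ...     alignment_file="align/run1.a2m",
# ...     segments=None,
# ...     focus_mode=True,
# ...     focus_sequence="QUERY/1-100",
# ...     theta=0.8,
# ...     pseudo_count=0.5,
# ...     alphabet=None,
# ...     min_sequence_distance=6,
# ... )
# >>> outcfg["ec_file"]
# 'output/couplings/run1_CouplingScores.csv'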
def best_hit(**kwargs): """ Protocol: Concatenate alignments based on the best hit to the focus sequence in each species Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: alignment_file raw_alignment_file focus_mode focus_sequence segments frequencies_file identities_file num_sequences num_sites raw_focus_alignment_file statistics_file """ check_required( kwargs, [ "prefix", "first_alignment_file", "second_alignment_file", "first_focus_sequence", "second_focus_sequence", "first_focus_mode", "second_focus_mode", "first_segments", "second_segments", "first_identities_file", "second_identities_file", "first_annotation_file", "second_annotation_file", "use_best_reciprocal", "paralog_identity_threshold" ] ) prefix = kwargs["prefix"] # make sure input alignments verify_resources( "Input alignment does not exist", kwargs["first_alignment_file"], kwargs["second_alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) def _load_monomer_info(annotations_file, identities_file, target_sequence, alignment_file, use_best_reciprocal, identity_threshold): # read in annotation to a file and rename the appropriate column annotation_table = read_species_annotation_table(annotations_file) # read identity file similarities = pd.read_csv(identities_file) # create a pd.DataFrame containing the best hit in each organism most_similar_in_species = most_similar_by_organism(similarities, annotation_table) if use_best_reciprocal: paralogs = find_paralogs( target_sequence, annotation_table, similarities, identity_threshold ) most_similar_in_species = filter_best_reciprocal( alignment_file, paralogs, most_similar_in_species ) return most_similar_in_species # load the information about each monomer alignment most_similar_in_species_1 = _load_monomer_info( kwargs["first_annotation_file"], kwargs["first_identities_file"], kwargs["first_focus_sequence"], kwargs["first_alignment_file"], kwargs["use_best_reciprocal"], kwargs["paralog_identity_threshold"] ) most_similar_in_species_2 = _load_monomer_info( kwargs["second_annotation_file"], kwargs["second_identities_file"], kwargs["second_focus_sequence"], kwargs["second_alignment_file"], kwargs["use_best_reciprocal"], kwargs["paralog_identity_threshold"] ) # merge the two dataframes to get all species found in # both alignments species_intersection = most_similar_in_species_1.merge( most_similar_in_species_2, how="inner", # takes the intersection on="species", # merges on species identifiers suffixes=("_1", "_2") ) # write concatenated alignment with distance filtering # TODO: save monomer alignments? 
target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \ write_concatenated_alignment( species_intersection, kwargs["first_alignment_file"], kwargs["second_alignment_file"], kwargs["first_focus_sequence"], kwargs["second_focus_sequence"] ) # save the alignment files raw_alignment_file = prefix + "_raw.fasta" with open(raw_alignment_file, "w") as of: raw_ali.write(of) mon_alignment_file_1 = prefix + "_monomer_1.fasta" with open(mon_alignment_file_1, "w") as of: mon_ali_1.write(of) mon_alignment_file_2 = prefix + "_monomer_2.fasta" with open(mon_alignment_file_2, "w") as of: mon_ali_2.write(of) aln_outcfg, _ = modify_alignment( raw_ali, target_seq_index, target_seq_id, kwargs["first_region_start"], **kwargs ) # make sure we return all the necessary information: # * alignment_file: final concatenated alignment that will go into plmc # * focus_sequence: this is the identifier of the concatenated target # sequence which will be passed into plmc with -f outcfg = aln_outcfg outcfg["raw_alignment_file"] = raw_alignment_file outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1 outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2 outcfg["focus_sequence"] = target_seq_id # Update the segments outcfg = modify_complex_segments(outcfg, **kwargs) # Describe the statistics of the concatenation outcfg = _run_describe_concatenation(outcfg, **kwargs) return outcfg
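# Sketch of the species-level inner merge performed in best_hit above
# (columns illustrative): only species present in both monomer
# alignments survive, and overlapping per-monomer columns are
# disambiguated by the ("_1", "_2") suffixes:
#
# >>> import pandas as pd
# >>> a = pd.DataFrame({"species": ["ECOLI", "HUMAN"], "id": ["x1", "x2"]})
# >>> b = pd.DataFrame({"species": ["ECOLI"], "id": ["y1"]})
# >>> a.merge(b, how="inner", on="species", suffixes=("_1", "_2"))
#   species id_1 id_2
# 0   ECOLI   x1   y1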
def standard(**kwargs): """ Protocol: Infer ECs from alignment using plmc. .. todo:: 1. make EC enrichment calculation segment-ready 2. explain meaning of parameters in detail. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "focus_mode", "focus_sequence", "theta", "alphabet", "segments", "ignore_gaps", "iterations", "lambda_h", "lambda_J", "lambda_group", "scale_clusters", "cpu", "plmc", "reuse_ecs", "min_sequence_distance", # "save_model", ] ) prefix = kwargs["prefix"] # for now disable option to not save model, since # otherwise mutate stage will crash. To remove model # file at end, use delete option in management section. """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) # regularization strength on couplings J_ij lambda_J = kwargs["lambda_J"] segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # first determine size of alphabet; # default is amino acid alphabet if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN alphabet_setting = None else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # if we have protein alphabet, do not set # as plmc parameter since default parameter, # has some implementation advantages for focus mode if alphabet == ALPHABET_PROTEIN: alphabet_setting = None else: alphabet_setting = alphabet # scale lambda_J to proportionally compensate # for higher number of J_ij compared to h_i? if kwargs["lambda_J_times_Lq"]: num_symbols = len(alphabet) # if we ignore gaps, there is one character less if kwargs["ignore_gaps"]: num_symbols -= 1 # second, determine number of uppercase positions # that are included in the calculation with open(kwargs["alignment_file"]) as f: seq_id, seq = next(read_fasta(f)) # gap character is by convention first char in alphabet gap = alphabet[0] uppercase = [ c for c in seq if c == c.upper() or c == gap ] L = len(uppercase) # finally, scale lambda_J lambda_J *= (num_symbols - 1) * (L - 1) # run plmc... 
or reuse pre-exisiting results from previous run plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg" # determine if to rerun, only possible if previous results # were stored in ali_outcfg_file if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file): plmc_result = read_config_file(plm_outcfg_file) # check if the EC/parameter files are there required_files = [outcfg["raw_ec_file"]] if outcfg["model_file"] is not None: required_files += [outcfg["model_file"]] verify_resources( "Tried to reuse ECs, but empty or " "does not exist", *required_files ) else: # run plmc binary plmc_result = ct.run_plmc( kwargs["alignment_file"], outcfg["raw_ec_file"], outcfg["model_file"], focus_seq=kwargs["focus_sequence"], alphabet=alphabet_setting, theta=kwargs["theta"], scale=kwargs["scale_clusters"], ignore_gaps=kwargs["ignore_gaps"], iterations=kwargs["iterations"], lambda_h=kwargs["lambda_h"], lambda_J=lambda_J, lambda_g=kwargs["lambda_group"], cpu=kwargs["cpu"], binary=kwargs["plmc"], ) # save iteration table to file iter_table_file = prefix + "_iteration_table.csv" plmc_result.iteration_table.to_csv( iter_table_file ) # turn namedtuple into dictionary to make # restarting code nicer plmc_result = dict(plmc_result._asdict()) # then replace table with filename so # we can store results in config file plmc_result["iteration_table"] = iter_table_file # save results of search for possible restart write_config_file(plm_outcfg_file, plmc_result) # store useful information about model in outcfg outcfg.update({ "num_sites": plmc_result["num_valid_sites"], "num_sequences": plmc_result["num_valid_seqs"], "effective_sequences": plmc_result["effective_samples"], "region_start": plmc_result["region_start"], }) # read and sort ECs ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"]) # add mixture model probability ecs = pairs.add_mixture_probability(ecs) if segments is not None: # and (len(segments) > 1 or not kwargs["focus_mode"]): # create index mapping seg_mapper = mapping.SegmentIndexMapper( kwargs["focus_mode"], outcfg["region_start"], *segments ) # apply to EC table ecs = mapping.segment_map_ecs(ecs, seg_mapper) # write updated table to csv file ecs.to_csv(outcfg["ec_file"], index=False) # also store longrange ECs as convenience output if kwargs["min_sequence_distance"] is not None: outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv" ecs_longrange = ecs.query( "abs(i - j) >= {}".format(kwargs["min_sequence_distance"]) ) ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False) # also create line-drawing script (for now, only for single segments) if segments is None or len(segments) == 1: outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml" L = outcfg["num_sites"] ec_lines_pymol_script( ecs_longrange.iloc[:L, :], outcfg["ec_lines_pml_file"] ) # compute EC enrichment (for now, for single segments # only since enrichment code cannot handle multiple segments) if segments is None or len(segments) == 1: outcfg["enrichment_file"] = prefix + "_enrichment.csv" ecs_enriched = pairs.enrichment(ecs) ecs_enriched.to_csv(outcfg["enrichment_file"], index=False) # create corresponding enrichment pymol scripts outcfg["enrichment_pml_files"] = [] for sphere_view, pml_suffix in [ (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml") ]: pml_file = prefix + pml_suffix enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view) outcfg["enrichment_pml_files"].append(pml_file) # output EVzoom JSON file if we have stored model file if outcfg.get("model_file", None) is not 
None: outcfg["evzoom_file"] = prefix + "_evzoom.json" with open(outcfg["evzoom_file"], "w") as f: # load parameters c = CouplingsModel(outcfg["model_file"]) # create JSON output and write to file f.write( evzoom_json(c) + "\n" ) # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_standard.outcfg", outcfg) return outcfg
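# Worked example of the lambda_J_times_Lq scaling in standard() above:
# with the default protein alphabet (21 symbols including the gap),
# ignore_gaps=False, L=150 match columns and lambda_J=0.01, the
# effective coupling regularization strength becomes
#
# >>> 0.01 * (21 - 1) * (150 - 1)
# 29.8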
def complex(**kwargs):
    """
    Protocol:
    Run monomer alignment protocol and postprocess it for
    EVcomplex calculations

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the alignment protocol, and
        the following additional field:

        genome_location_file : path to file containing the genomic
            locations for CDSs corresponding to identifiers in the
            alignment.
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_protocol",
            "uniprot_to_embl_table", "ena_genome_location_table"
        ]
    )

    verify_resources(
        "Uniprot to EMBL mapping table does not exist",
        kwargs["uniprot_to_embl_table"]
    )

    verify_resources(
        "ENA genome location table does not exist",
        kwargs["ena_genome_location_table"]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # run the regular alignment protocol
    # (standard, existing, ...)
    alignment_protocol = kwargs["alignment_protocol"]

    if alignment_protocol not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid choice for alignment protocol: {}".format(
                alignment_protocol
            )
        )

    outcfg = PROTOCOLS[kwargs["alignment_protocol"]](**kwargs)

    # if the user selected the existing alignment protocol,
    # they can supply an input annotation file which overwrites
    # the annotation file generated by the existing protocol
    if alignment_protocol == "existing":
        check_required(kwargs, ["override_annotation_file"])

        if kwargs["override_annotation_file"] is not None:
            verify_resources(
                "Override annotation file does not exist",
                kwargs["override_annotation_file"]
            )

            outcfg["annotation_file"] = prefix + "_annotation.csv"
            annotation_data = pd.read_csv(kwargs["override_annotation_file"])
            annotation_data.to_csv(outcfg["annotation_file"])

    # extract CDS identifiers for alignment UniProt IDs
    cds_ids = extract_cds_ids(
        outcfg["alignment_file"],
        kwargs["uniprot_to_embl_table"]
    )

    # extract genome location information from ENA
    genome_location_filename = prefix + "_genome_location.csv"

    genome_location_table = extract_embl_annotation(
        cds_ids,
        kwargs["ena_genome_location_table"],
        genome_location_filename
    )

    genome_location_table = add_full_header(
        genome_location_table, outcfg["alignment_file"]
    )

    genome_location_table.to_csv(genome_location_filename)
    outcfg["genome_location_file"] = genome_location_filename

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_complex.outcfg", outcfg)

    return outcfg
def run_jackhmmer(query, database, prefix,
                  use_bitscores, domain_threshold, seq_threshold,
                  iterations=5, nobias=False, cpu=None,
                  stdout_redirect=None, checkpoints_hmm=False,
                  checkpoints_ali=False, binary="jackhmmer"):
    """
    Run jackhmmer sequence search against target database.
    Refer to HMMER Userguide for explanation of these parameters.

    Parameters
    ----------
    query : str
        File containing query sequence
    database : str
        File containing sequence database
    prefix : str
        Prefix path for output files. Folder structure in
        the prefix will be created if not existing.
    use_bitscores : bool
        Use bitscore inclusion thresholds rather than E-values.
    domain_threshold : int or float or str
        Inclusion threshold applied on the domain level
        (e.g. "1E-03" or 0.001 or 50)
    seq_threshold : int or float or str
        Inclusion threshold applied on the sequence level
        (e.g. "1E-03" or 0.001 or 50)
    iterations : int, optional (default: 5)
        Number of jackhmmer search iterations
    nobias : bool, optional (default: False)
        Turn off bias correction
    cpu : int, optional (default: None)
        Number of CPUs to use for search. Uses all if None.
    stdout_redirect : str, optional (default: None)
        Redirect bulky stdout instead of storing with rest
        of results (use "/dev/null" to dispose)
    checkpoints_hmm : bool, optional (default: False)
        Store checkpoint HMMs to prefix.<iter>.hmm
    checkpoints_ali : bool, optional (default: False)
        Store checkpoint alignments to prefix.<iter>.sto
    binary : str (default: "jackhmmer")
        Path to jackhmmer binary (put in PATH for default to work)

    Returns
    -------
    JackhmmerResult
        namedtuple with fields corresponding to the different
        output files (prefix, alignment, output, tblout, domtblout)

    Raises
    ------
    ExternalToolError, ResourceError
    """
    verify_resources(
        "Input file does not exist or is empty", query, database
    )

    create_prefix_folders(prefix)

    # store filenames of all individual results;
    # these will be returned as result of the function
    result = JackhmmerResult(
        prefix,
        prefix + ".sto",
        prefix + ".output" if stdout_redirect is None else stdout_redirect,
        prefix + ".tblout",
        prefix + ".domtblout"
    )

    cmd = [
        binary,
        "-N", str(iterations),
        "-o", result.output,
        "-A", result.alignment,
        "--tblout", result.tblout,
        "--domtblout", result.domtblout,
        "--noali",
        "--notextw"
    ]

    # reporting thresholds are set according to
    # inclusion thresholds to reduce memory footprint
    if use_bitscores:
        cmd += [
            "-T", str(seq_threshold),
            "--domT", str(domain_threshold),
            "--incT", str(seq_threshold),
            "--incdomT", str(domain_threshold)
        ]
    else:
        cmd += [
            "-E", str(seq_threshold),
            "--domE", str(domain_threshold),
            "--incE", str(seq_threshold),
            "--incdomE", str(domain_threshold)
        ]

    # number of CPUs
    if cpu is not None:
        cmd += ["--cpu", str(cpu)]

    # bias correction filter
    if nobias:
        cmd += ["--nobias"]

    # save checkpoints for alignments and HMMs?
    if checkpoints_ali:
        cmd += ["--chkali", prefix]
    if checkpoints_hmm:
        cmd += ["--chkhmm", prefix]

    cmd += [query, database]

    return_code, stdout, stderr = run(cmd)

    # also check we actually created some sort of alignment
    verify_resources(
        "jackhmmer returned empty alignment: "
        "stdout={} stderr={} file={}".format(
            stdout, stderr, result.alignment
        ),
        result.alignment
    )

    return result
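# Usage sketch (paths hypothetical; jackhmmer assumed on PATH). With
# use_bitscores=True, the thresholds are interpreted as bitscores:
#
# >>> result = run_jackhmmer(
# ...     query="output/query.fa",
# ...     database="db/uniref100.fasta",
# ...     prefix="output/search/query",
# ...     use_bitscores=True,
# ...     domain_threshold=50,
# ...     seq_threshold=50,
# ... )
# >>> result.alignment
# 'output/search/query.sto'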
def run_plmc(alignment, couplings_file, param_file=None, focus_seq=None, alphabet=None, theta=None, scale=None, ignore_gaps=False, iterations=None, lambda_h=None, lambda_J=None, lambda_g=None, cpu=None, binary="plmc"): """ Run plmc on sequence alignment and store files with model parameters and pair couplings. Parameters ---------- alignment : str Path to input sequence alignment couplings_file : str Output path for file with evolutionary couplings (folder will be created) param_file : str Output path for binary file containing model parameters (folder will be created) focus_seq : str, optional (default: None) Name of focus sequence, if None, non-focus mode will be used alphabet : str, optional (default: None) Alphabet for model inference. If None, standard amino acid alphabet including gap will be used. First character in string corresponds to gap character (relevant for ignore_gaps). theta : float, optional (default: None) Sequences with pairwise identity >= theta will be clustered and their sequence weights downweighted as 1 / num_cluster_members. Important: Note that plmc will be parametrized using 1 - theta. If None, default value in plmc will be used, which corresponds to theta=0.8 (plmc setting 0.2). scale : float, optional (default: None) Scale weights of clusters by this value. If None, default value in plmc (1.0) will be used ignore_gaps : bool, optional (default: False) Exclude gaps from parameter inference. Gap character is first character of alphabet parameter. iterations : int, optional (default: None) Maximum iterations for optimization. lambda_h : float, optional (default: None) l2 regularization strength on fields. If None, plmc default will be used. lambda_J : float, optional (default: None) l2-regularization strength on couplings. If None, plmc default will be used lambda_g : float, optional (default: None) group l1-regularization strength on couplings If None, plmc default will be used. cpu : Number of cores to use for running plmc. Note that plmc has to be compiled in openmp mode to runnable with multiple cores. Can also be set to "max". binary : str, optional (default: "plmc") Path to plmc binary Returns ------- PlmcResult namedtuple containing output files and parsed fields from console output of plmc Raises ------ ExternalToolError """ create_prefix_folders(couplings_file) # Make sure input alignment exists verify_resources( "Alignment file does not exist", alignment ) cmd = [ binary, "-c", couplings_file, ] # store eij file if explicitly requested if param_file is not None: create_prefix_folders(param_file) cmd += ["-o", param_file] # focus sequence mode and ID if focus_seq is not None: # TODO: for now split exclude sequence # region from focus seq name, otherwise # plmc does not remap names. If this # behaviour changes in plmc, remove the # following line. focus_seq = focus_seq.split("/")[0] cmd += ["-f", focus_seq] # exclude gaps from calculation? 
if ignore_gaps: cmd += ["-g"] # maximum number of iterations, can also be "max" if iterations is not None: cmd += ["-m", str(iterations)] # set custom alphabet # (first character is gap by default in nogap mode) if alphabet is not None: cmd += ["-a", alphabet] # sequence reweighting if theta is not None: # transform into plmc convention (1-theta) theta = 1.0 - theta cmd += ["-t", str(theta)] # cluster weight if scale is not None: cmd += ["-s", str(scale)] # L2 regularization weight for fields if lambda_h is not None: cmd += ["-lh", str(lambda_h)] # L2 regularization weight for pair couplings if lambda_J is not None: cmd += ["-le", str(lambda_J)] # Group L1 regularization weight for pair couplings if lambda_g is not None: cmd += ["-lg", str(lambda_g)] # Number of cores to use for calculation if cpu is not None: cmd += ["-n", str(cpu)] # finally also add input alignment (main parameter) cmd += [alignment] # TODO: for now do not check returncode because sometimes # returncode == -11 (segfault) despite successful calculation return_code, stdout, stderr = run(cmd, check_returncode=False) # TODO: remove this segfault-hunting output once fixed if return_code != 0: # if not a segfault, still raise exception if return_code != -11: from evcouplings.utils.system import ExternalToolError raise ExternalToolError( "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format( cmd, return_code, stdout, stderr ) ) print("PLMC NON-ZERO RETURNCODE:", return_code) print(cmd) print(" ".join(cmd)) print("stdout:", stdout) print("stderr:", stderr) iter_df, out_fields = parse_plmc_log(stderr) # also check we actually calculated couplings... if not valid_file(couplings_file): raise ResourceError( "plmc returned no couplings: stdout={} stderr={} file={}".format( stdout, stderr, couplings_file ) ) # ... and parameter file, if requested if param_file and not valid_file(param_file): raise ResourceError( "plmc returned no parameter file: stdout={} stderr={} file={}".format( stdout, stderr, param_file ) ) return PlmcResult( couplings_file, param_file, iter_df, *out_fields )
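# Usage sketch for run_plmc (paths hypothetical). Note the theta
# convention described in the docstring: passing theta=0.8 here puts
# "-t 0.2" on the plmc command line:
#
# >>> res = run_plmc(
# ...     "align/query.a2m",
# ...     "couplings/query_ECs.txt",
# ...     param_file="couplings/query.model",
# ...     focus_seq="QUERY/1-100",
# ...     theta=0.8,
# ...     lambda_J=16.0,
# ... )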
def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
    # this file is the starting point of the pipeline;
    # check if input alignment actually exists
    verify_resources(
        "Input alignment does not exist", input_alignment_file
    )

    # output prefix for the files created below
    # (note: "prefix" was referenced but never defined in this function;
    # assuming it is supplied via kwargs like in the protocol functions)
    prefix = kwargs["prefix"]

    # first try to autodetect format of alignment
    with open(input_alignment_file) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment_file)
            )

    with open(input_alignment_file) as f:
        ali_raw = Alignment.from_file(f, format)

    # target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # first, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id
            )
        )

    # identify which (non-gap) columns to keep for focus;
    # this should be all columns in the raw_focus_alignment_file,
    # but checking anyway
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array([
        c not in [ali_raw._match_gap, ali_raw._insert_gap]
        for c in focus_seq
    ])

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information "
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given."
        )

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)
    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # swap target sequence to first position if it is not
    # the first sequence in alignment; this is particularly
    # important for the hhfilter run, because the target
    # sequence might otherwise be filtered out
    if focus_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = focus_index
        indices[focus_index] = 0
        focus_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    # write the raw focus alignment for hmmbuild
    focus_fasta_file = prefix + "_raw_focus_input.fasta"
    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    return focus_fasta_file, target_sequence_file, region_start, region_end
def secondary_structure(**kwargs): """ Predict or load secondary structure for an input sequence Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- residues : pandas.DataFrame Table with sequence and secondary structure in columns i, A_i and sec_struct_3state """ check_required( kwargs, [ "prefix", "target_sequence_file", "segments", "sec_struct_method", "sec_struct_file", "psipred", ] ) prefix = kwargs["prefix"] create_prefix_folders(prefix) secstruct_file = kwargs["sec_struct_file"] if secstruct_file is not None: verify_resources( "Secondary structure prediction file does not exist/is empty", secstruct_file ) residues = pd.read_csv(secstruct_file) else: # make sure target sequence file is there so we can # predict secondary structure target_seq_file = kwargs["target_sequence_file"] verify_resources( "Sequence file does not exist/is empty", target_seq_file ) # we need to figure out what the index of the first residue # in the target sequence is; obtain first index from segment # information if possible if kwargs["segments"] is not None: s = Segment.from_list(kwargs["segments"][0]) first_index = s.region_start else: # otherwise try to get it from sequence file first_index = None with open(target_seq_file) as f: header, _ = next(read_fasta(f)) if header is not None: _, first_index, _ = parse_header(header) # if we cannot identify first index from header, # do not make guesses but fail if first_index is None: raise InvalidParameterError( "Could not unambiguously identify sequence range from " "FASTA header, needs to specified as id/start-end: {}".format( header ) ) # finally, run secondary structure prediction if kwargs["sec_struct_method"] == "psipred": # store psipred output in a separate directory output_dir = path.join(path.dirname(prefix), "psipred") # run psipred ss2_file, horiz_file = run_psipred( target_seq_file, output_dir, binary=kwargs["psipred"] ) # parse output, renumber to first index residues = read_psipred_prediction( horiz_file, first_index=first_index ) else: raise InvalidParameterError( "Secondary structure prediction method not implemented: " "{}. Valid choices: psipred".format(kwargs["sec_struct_method"]) ) # return predicted table return residues
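# Sketch of the first-index detection above: a FASTA header of the
# form "QUERY/20-120" yields first_index=20 via parse_header (assuming
# parse_header returns an (id, start, end) tuple, as the unpacking
# above implies):
#
# >>> parse_header("QUERY/20-120")
# ('QUERY', 20, 120)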
def genome_distance(**kwargs): """ Protocol: Concatenate alignments based on genomic distance Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required .. todo:: Explain meaning of parameters in detail. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: .. todo:: this is the full list normally returned by alignment protocol, decide which ones to keep. Mandatory: * alignment_file * focus_sequence * focus_mode * segments * alignment_file * [raw_alignment_file] * statistics_file * target_sequence_file * sequence_file * [annotation_file] * frequencies_file * identities_file * [hittable_file] * focus_mode * focus_sequence * segments """ check_required(kwargs, [ "prefix", "first_raw_focus_alignment_file", "second_raw_focus_alignment_file", "first_focus_sequence", "second_focus_sequence", "first_focus_mode", "second_focus_mode", "first_segments", "second_segments", ]) prefix = kwargs["prefix"] # make sure input alignments verify_resources("Input alignment does not exist", kwargs["first_alignment_file"], kwargs["second_alignment_file"]) # make sure output directory exists create_prefix_folders(prefix) # ------------------------------------------------- # TODO: implement concatenation functionality and # postprocessing functionality here # ------------------------------------------------- def _modify_segments(seg_list, seg_prefix): # extract segments from list representation into objects segs = [Segment.from_list(s) for s in seg_list] # update segment IDs for i, s in enumerate(segs, start=1): s.segment_id = "{}_{}".format(seg_prefix, i) return segs # merge segments - this allows to have more than one segment per # "monomer" alignment segments_1 = _modify_segments(kwargs["first_segments"], "A") segments_2 = _modify_segments(kwargs["second_segments"], "B") segments_complex = segments_1 + segments_2 # make sure we return all the necessary information: # * alignment_file: final concatenated alignment that will go into plmc # * focus_sequence: this is the identifier of the concatenated target # sequence which will be passed into plmc with -f outcfg = { "alignment_file": None, # TODO: specify "focus_mode": True, "focus_sequence": None, # TODO: specify "segments": [s.to_list() for s in segments_complex], # optional but good to have: "num_sites": None, "num_sequences": None, # "effective_sequences": n_eff # TODO: could compute this like in align stage # TODO: there are more outputs that we could add here (not mandatory), # e.g. single column frequencies in concatenated alignment } return outcfg
def standard(**kwargs): """ Protocol: Predict 3D structure from evolutionary couplings Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * sec_struct_file * folding_ec_file * folded_structure_files """ check_required( kwargs, [ "prefix", "engine", "ec_file", "target_sequence_file", "segments", "folding_config_file", "cut_to_alignment_region", "sec_struct_method", "reuse_sec_struct", "sec_struct_file", "filter_sec_struct_clashes", "min_sequence_distance", "fold_probability_cutoffs", "fold_lowest_count", "fold_highest_count", "fold_increase", "num_models", "psipred", "cpu", "remapped_pdb_files", "cleanup", ] ) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) outcfg = { "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv", "sec_struct_file": prefix + "_secondary_structure.csv", } # get secondary structure prediction # check if we should (and can) reuse output file from previous run if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]): residues = pd.read_csv(outcfg["sec_struct_file"]) else: residues = secondary_structure(**kwargs) # make pymol secondary structure assignment script outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml" pymol_secondary_structure( residues, outcfg["secondary_structure_pml_file"] ) # load ECs and filter for long-range pairs verify_resources( "EC file does not exist", kwargs["ec_file"] ) ecs_all = pd.read_csv(kwargs["ec_file"]) ecs = ecs_all.query("abs(i - j) > {}".format( kwargs["min_sequence_distance"]) ) # find secondary structure clashes ecs = secstruct_clashes(ecs, residues) ecs.to_csv(outcfg["folding_ec_file"], index=False) # if requested, filter clashes out before folding if kwargs["filter_sec_struct_clashes"]: ecs_fold = ecs.loc[~ecs.ss_clash] else: ecs_fold = ecs # cut modelled region to aligned region, if selected if kwargs["cut_to_alignment_region"]: segments = kwargs["segments"] # infer region from segment positions if we have it if segments is not None: positions = Segment.from_list(segments[0]).positions else: # otherwise get from EC values (could be misleading if # EC list is truncated, so only second option) positions = set(ecs.i.unique()).union(ecs.j.unique()) # limit modelled positions to covered region first_pos, last_pos = min(positions), max(positions) residues.loc[:, "in_model"] = False residues.loc[ (residues.i >= first_pos) & (residues.i <= last_pos), "in_model" ] = True else: # otherwise include all positions in model residues.loc[:, "in_model"] = True # save secondary structure prediction residues.to_csv(outcfg["sec_struct_file"], index=False) # only use the residues that will be in model for folding residues_fold = residues.loc[residues.in_model] # after all the setup, now fold the structures... # to speed things up, parallelize this to the number of # available CPUs num_procs = kwargs["cpu"] if num_procs is None: num_procs = 1 # first define all the sub-runs... folding_runs = [] # ... based on mixture model probability cutoffs = kwargs["fold_probability_cutoffs"] if cutoffs is not None and "probability" in ecs_fold.columns: if not isinstance(cutoffs, list): cutoffs = [cutoffs] for c in cutoffs: sig_ecs = ecs_fold.query("probability >= @c") if len(sig_ecs) > 0: folding_runs.append( (sig_ecs, "_significant_ECs_{}".format(c)) ) # ... 
and on simple EC counts/bins flc = kwargs["fold_lowest_count"] fhc = kwargs["fold_highest_count"] fi = kwargs["fold_increase"] if flc is not None and fhc is not None and fi is not None: num_sites = len( set.union(set(ecs.i.unique()), set(ecs.j.unique())) ) # transform fraction of number of sites into discrete number of ECs def _discrete_count(x): if isinstance(x, float): x = ceil(x * num_sites) return int(x) # range of plots to make lowest = _discrete_count(flc) highest = _discrete_count(fhc) step = _discrete_count(fi) # append to list of jobs to run folding_runs += [ ( ecs_fold.iloc[:c], "_{}".format(c) ) for c in range(lowest, highest + 1, step) ] # set up method to drive the folding of each job method = kwargs["engine"] # store structures in an auxiliary subdirectory, after folding # final models will be moved to main folding dir. Depending # on cleanup setting, the aux directory will be removed aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) aux_dir = path.dirname(aux_prefix) folding_runs = [ (job_ecs, aux_prefix + job_suffix) for (job_ecs, job_suffix) in folding_runs ] if method == "cns_dgsa": folder = partial( cns_dgsa_fold, residues_fold, config_file=kwargs["folding_config_file"], num_structures=kwargs["num_models"], log_level=None, binary=kwargs["cns"] ) else: raise InvalidParameterError( "Invalid folding engine: {} ".format(method) + "Valid selections are: cns_dgsa" ) # then apply folding function to each sub-run pool = mp.Pool(processes=num_procs) results = pool.starmap(folder, folding_runs) # make double sure that the pool is cleaned up, # or SIGTERM upon exit will interfere with # interrupt signal interception pool.close() pool.join() # merge result dictionaries into one dict folded_files = { k: v for subres in results for k, v in subres.items() } # move structures from aux into main folding dir fold_dir = path.dirname(prefix) prediction_files = [] for name, file_path in folded_files.items(): # move file (use copy to allow overwriting) shutil.copy(file_path, fold_dir) # update file path to main folding dir, # and put in a flat list of result files prediction_files.append( file_path.replace(aux_prefix, prefix) ) outcfg["folded_structure_files"] = prediction_files # remove aux dir if cleanup is requested if kwargs["cleanup"]: shutil.rmtree(aux_dir) # apply ranking to predicted models ranking = dihedral_ranking(prediction_files, residues) # apply clustering (all available methods), but only # if we have something to cluster if len(prediction_files) > 1: clustering = maxcluster_clustering_table( prediction_files, binary=kwargs["maxcluster"] ) # join ranking with clustering ranking = ranking.merge(clustering, on="filename", how="left") # sort by score (best models first) ranking = ranking.sort_values(by="ranking_score", ascending=False) # store as file outcfg["folding_ranking_file"] = prefix + "_ranking.csv" ranking.to_csv(outcfg["folding_ranking_file"], index=False) # apply comparison to existing structures if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0: experimental_files = kwargs["remapped_pdb_files"] comp_all, comp_singles = compare_models_maxcluster( list(experimental_files.keys()), prediction_files, norm_by_intersection=True, distance_cutoff=None, binary=kwargs["maxcluster"] ) # merge with ranking and save comparison = ranking.merge( comp_all, on="filename", how="left" ).sort_values(by="tm", ascending=False) outcfg["folding_comparison_file"] = prefix + "_comparison.csv" comparison.to_csv(outcfg["folding_comparison_file"], 
index=False) # also store comparison to structures in individual files ind_comp_files = {} for filename, comp_single in comp_singles.items(): comparison_s = ranking.merge( comp_single, on="filename", how="left" ).sort_values(by="tm", ascending=False) basename = path.splitext(path.split(filename)[1])[0] ind_file = path.join(fold_dir, basename + ".csv") # map back to original key from remapped_pdb_files as a key for this list ind_comp_files[ind_file] = experimental_files[filename] comparison_s.to_csv(ind_file, index=False) outcfg["folding_individual_comparison_files"] = ind_comp_files return outcfg
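# Worked example of the _discrete_count conversion used above: with
# num_sites=120 and fractional settings fold_lowest_count=0.5,
# fold_highest_count=1.3, fold_increase=0.05, the EC-count sweep runs
# from 60 to 156 ECs in steps of 6:
#
# >>> from math import ceil
# >>> [ceil(x * 120) for x in (0.5, 1.3, 0.05)]
# [60, 156, 6]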
def standard(**kwargs):
    """
    Protocol:
    Mutation effect prediction and visualization for
    single proteins (or domains)

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * mutation_matrix_file
        * [mutation_dataset_predicted_file]
    """
    check_required(
        kwargs,
        [
            "prefix", "model_file", "mutation_dataset_file",
        ]
    )

    prefix = kwargs["prefix"]

    outcfg = {
        "mutation_matrix_file": prefix + "_single_mutant_matrix.csv",
        "mutation_matrix_plot_files": [],
    }

    # make sure model file exists
    verify_resources(
        "Model parameter file does not exist", kwargs["model_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # load couplings object, and create independent model
    c = CouplingsModel(kwargs["model_file"])
    c0 = c.to_independent_model()

    for model, type_ in [(c, "Epistatic"), (c0, "Independent")]:
        # interactive plot using bokeh
        filename = prefix + "_{}_model".format(type_.lower())
        output_file(filename + ".html", "{} model".format(type_))
        fig = evcouplings.visualize.mutations.plot_mutation_matrix(
            model, engine="bokeh"
        )
        save(fig)
        outcfg["mutation_matrix_plot_files"].append(filename + ".html")

        # static matplotlib plot
        evcouplings.visualize.mutations.plot_mutation_matrix(model)
        plt.savefig(filename + ".pdf", bbox_inches="tight")
        outcfg["mutation_matrix_plot_files"].append(filename + ".pdf")

    # create single mutation matrix table,
    # add prediction by independent model and
    # save to file
    singles = single_mutant_matrix(
        c, output_column="prediction_epistatic"
    )

    singles = predict_mutation_table(c0, singles, "prediction_independent")
    singles.to_csv(outcfg["mutation_matrix_file"], index=False)

    # PyMOL scripts
    outcfg["mutations_epistatic_pml_files"] = []
    for model in ["epistatic", "independent"]:
        pml_filename = prefix + "_{}_model.pml".format(model)
        evcouplings.visualize.mutations.mutation_pymol_script(
            singles, pml_filename,
            effect_column="prediction_" + model
        )
        outcfg["mutations_epistatic_pml_files"].append(pml_filename)

    # predict experimental dataset if given
    dataset_file = kwargs["mutation_dataset_file"]
    if dataset_file is not None:
        verify_resources("Dataset file does not exist", dataset_file)
        data = pd.read_csv(dataset_file, comment="#")

        # add epistatic model prediction
        data_pred = predict_mutation_table(
            c, data, "prediction_epistatic"
        )

        # add independent model prediction
        data_pred = predict_mutation_table(
            c0, data_pred, "prediction_independent"
        )

        outcfg["mutation_dataset_predicted_file"] = \
            prefix + "_dataset_predicted.csv"
        data_pred.to_csv(
            outcfg["mutation_dataset_predicted_file"], index=False
        )

    return outcfg
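# Minimal invocation sketch (paths hypothetical; the model file must be
# a plmc parameter file readable by CouplingsModel):
#
# >>> outcfg = standard(
# ...     prefix="output/mutate/run1",
# ...     model_file="couplings/run1.model",
# ...     mutation_dataset_file=None,
# ... )
# >>> outcfg["mutation_matrix_file"]
# 'output/mutate/run1_single_mutant_matrix.csv'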
def complex(**kwargs): """ Protocol: Mutation effect prediction and visualization for protein complexes Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * mutation_matrix_file * [mutation_dataset_predicted_file] """ check_required( kwargs, ["prefix", "model_file", "mutation_dataset_file", "segments"]) prefix = kwargs["prefix"] outcfg = { "mutation_matrix_file": prefix + "_single_mutant_matrix.csv", "mutation_matrix_plot_files": [], } # make sure model file exists verify_resources("Model parameter file does not exist", kwargs["model_file"]) # make sure output directory exists create_prefix_folders(prefix) # load segments to create couplings object segment_objects = [] for segment_list in kwargs["segments"]: segment_objects.append(Segment.from_list(segment_list)) first_segment_name = Segment.from_list(kwargs["segments"][0]).segment_id second_segment_name = Segment.from_list(kwargs["segments"][1]).segment_id first_chain_name = Segment.from_list( kwargs["segments"][0]).default_chain_name() second_chain_name = Segment.from_list( kwargs["segments"][1]).default_chain_name() # load couplings object c = MultiSegmentCouplingsModel(kwargs["model_file"], *segment_objects) # create the independent model c0 = c.to_independent_model() # create the inter-protein only Jij model ci = c.to_inter_segment_model() for model, type_ in [(c, "Epistatic"), (c0, "Independent"), (ci, "Inter_segment")]: # interactive plot using bokeh filename = prefix + "_{}_model".format(type_.lower(), ) output_file(filename + ".html", "{} model".format(type_)) fig = evcouplings.visualize.mutations.plot_mutation_matrix( model, engine="bokeh") save(fig) outcfg["mutation_matrix_plot_files"].append(filename + ".html") # static matplotlib plot evcouplings.visualize.mutations.plot_mutation_matrix(model) plt.savefig(filename + ".pdf", bbox_inches="tight") outcfg["mutation_matrix_plot_files"].append(filename + ".pdf") # create single mutation matrix table, # add prediction by independent model and # save to file singles = single_mutant_matrix(c, output_column="prediction_epistatic") singles = predict_mutation_table(c0, singles, "prediction_independent") singles = predict_mutation_table(ci, singles, "prediction_inter_segment") singles.to_csv(outcfg["mutation_matrix_file"], index=False) # Pymol scripts outcfg["mutations_epistatic_pml_files"] = [] for model in ["epistatic", "independent", "inter_segment"]: pml_filename = prefix + "_{}_model.pml".format(model) evcouplings.visualize.mutations.mutation_pymol_script( singles, pml_filename, effect_column="prediction_" + model, segment_to_chain_mapping={ first_segment_name: first_chain_name, second_segment_name: second_chain_name }) outcfg["mutations_epistatic_pml_files"].append(pml_filename) # predict experimental dataset if given dataset_file = kwargs["mutation_dataset_file"] if dataset_file is not None: verify_resources("Dataset file does not exist", dataset_file) data = pd.read_csv(dataset_file, comment="#", sep=",") if "segment" not in data.columns: raise ValueError("Input mutation dataset file does not contain " "a column called 'segment' to specify the " "protein of origin for each mutation") # add epistatic model prediction data_pred = predict_mutation_table(c, data, "prediction_epistatic") # add independent model prediction data_pred = predict_mutation_table(c0, data_pred, "prediction_independent") data_pred = predict_mutation_table(ci, 
data_pred, "inter_segment") outcfg[ "mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv" data_pred.to_csv(outcfg["mutation_dataset_predicted_file"], index=False) return outcfg
def infer_plmc(**kwargs): """ Run EC computation on alignment. This function contains the functionality shared between monomer and complex EC inference. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: raw_ec_file model_file num_sites num_sequences effective_sequences focus_mode (passed through) focus_sequence (passed through) segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "focus_mode", "focus_sequence", "theta", "alphabet", "segments", "ignore_gaps", "iterations", "lambda_h", "lambda_J", "lambda_group", "scale_clusters", "cpu", "plmc", "reuse_ecs", ] ) prefix = kwargs["prefix"] # for now disable option to not save model, since # otherwise mutate stage will crash. To remove model # file at end, use delete option in management section. """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # the following are passed through stage... "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) # regularization strength on couplings J_ij lambda_J = kwargs["lambda_J"] segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # first determine size of alphabet; # default is amino acid alphabet if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN alphabet_setting = None else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # if we have protein alphabet, do not set # as plmc parameter since default parameter, # has some implementation advantages for focus mode if alphabet == ALPHABET_PROTEIN: alphabet_setting = None else: alphabet_setting = alphabet # scale lambda_J to proportionally compensate # for higher number of J_ij compared to h_i? if kwargs["lambda_J_times_Lq"]: num_symbols = len(alphabet) # if we ignore gaps, there is one character less if kwargs["ignore_gaps"]: num_symbols -= 1 # second, determine number of uppercase positions # that are included in the calculation with open(kwargs["alignment_file"]) as f: seq_id, seq = next(read_fasta(f)) # gap character is by convention first char in alphabet gap = alphabet[0] uppercase = [ c for c in seq if c == c.upper() or c == gap ] L = len(uppercase) # finally, scale lambda_J lambda_J *= (num_symbols - 1) * (L - 1) # run plmc... 
or reuse pre-existing results from previous run plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg" # determine whether to rerun, only possible if previous results # were stored in plm_outcfg_file if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file): plmc_result = read_config_file(plm_outcfg_file) # check if the EC/parameter files are there required_files = [outcfg["raw_ec_file"]] if outcfg["model_file"] is not None: required_files += [outcfg["model_file"]] verify_resources( "Tried to reuse ECs, but file is empty or " "does not exist", *required_files ) else: # run plmc binary plmc_result = ct.run_plmc( kwargs["alignment_file"], outcfg["raw_ec_file"], outcfg["model_file"], focus_seq=kwargs["focus_sequence"], alphabet=alphabet_setting, theta=kwargs["theta"], scale=kwargs["scale_clusters"], ignore_gaps=kwargs["ignore_gaps"], iterations=kwargs["iterations"], lambda_h=kwargs["lambda_h"], lambda_J=lambda_J, lambda_g=kwargs["lambda_group"], cpu=kwargs["cpu"], binary=kwargs["plmc"], ) # save iteration table to file iter_table_file = prefix + "_iteration_table.csv" plmc_result.iteration_table.to_csv( iter_table_file ) # turn namedtuple into dictionary to make # restarting code nicer plmc_result = dict(plmc_result._asdict()) # then replace table with filename so # we can store results in config file plmc_result["iteration_table"] = iter_table_file # save results of search for possible restart write_config_file(plm_outcfg_file, plmc_result) # store useful information about model in outcfg outcfg.update({ "num_sites": plmc_result["num_valid_sites"], "num_valid_sequences": plmc_result["num_valid_seqs"], "effective_sequences": plmc_result["effective_samples"], "region_start": plmc_result["region_start"], }) # read and sort ECs ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"]) if segments is not None: # create index mapping seg_mapper = mapping.SegmentIndexMapper( kwargs["focus_mode"], outcfg["region_start"], *segments ) # apply to EC table ecs = mapping.segment_map_ecs(ecs, seg_mapper) return outcfg, ecs, segments
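# Illustrative sketch of the lambda_J scaling applied above: with
# lambda_J_times_Lq set, the coupling regularization strength is
# multiplied by (q - 1) * (L - 1), where q is the alphabet size (one
# less if gaps are ignored) and L the number of match (uppercase)
# columns. A minimal standalone version with hypothetical defaults:
def _example_scale_lambda_J(lambda_J=0.01, alphabet=ALPHABET_PROTEIN,
                            ignore_gaps=False, num_match_columns=150):
    """Sketch: reproduce the lambda_J scaling used in infer_plmc."""
    num_symbols = len(alphabet) - (1 if ignore_gaps else 0)
    return lambda_J * (num_symbols - 1) * (num_match_columns - 1)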
def jackhmmer_search(**kwargs): """ Protocol: Iterative jackhmmer search against a sequence database. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required .. todo:: explain meaning of parameters in detail. Returns ------- outcfg : dict Output configuration of the protocol, including the following fields: * sequence_id (passed through from input) * first_index (passed through from input) * target_sequence_file * sequence_file * raw_alignment_file * hittable_file * focus_mode * focus_sequence * segments """ check_required(kwargs, [ "prefix", "sequence_id", "sequence_file", "sequence_download_url", "region", "first_index", "use_bitscores", "domain_threshold", "sequence_threshold", "database", "iterations", "cpu", "nobias", "reuse_alignment", "checkpoints_hmm", "checkpoints_ali", "jackhmmer", "extract_annotation" ]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # store search sequence file here target_sequence_file = prefix + ".fa" full_sequence_file = prefix + "_full.fa" # make sure search sequence is defined and load it full_seq_file, (full_seq_id, full_seq) = fetch_sequence( kwargs["sequence_id"], kwargs["sequence_file"], kwargs["sequence_download_url"], full_sequence_file) # cut sequence to target region and save in sequence_file # (this is the main sequence file used downstream) (region_start, region_end), cut_seq = cut_sequence(full_seq, kwargs["sequence_id"], kwargs["region"], kwargs["first_index"], target_sequence_file) # run jackhmmer... allow reuse of pre-existing # Stockholm alignment file here ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg" # determine whether to rerun, only possible if previous results # were stored in ali_outcfg_file if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file): ali = read_config_file(ali_outcfg_file) # check if the alignment file itself is also there verify_resources( "Tried to reuse alignment, but file is empty or " "does not exist", ali["alignment"], ali["domtblout"]) else: # otherwise, we have to run the alignment # modify search thresholds to be suitable for jackhmmer seq_threshold, domain_threshold = search_thresholds( kwargs["use_bitscores"], kwargs["sequence_threshold"], kwargs["domain_threshold"], len(cut_seq)) # run search process ali = at.run_jackhmmer( query=target_sequence_file, database=kwargs[kwargs["database"]], prefix=prefix, use_bitscores=kwargs["use_bitscores"], domain_threshold=domain_threshold, seq_threshold=seq_threshold, iterations=kwargs["iterations"], nobias=kwargs["nobias"], cpu=kwargs["cpu"], checkpoints_hmm=kwargs["checkpoints_hmm"], checkpoints_ali=kwargs["checkpoints_ali"], binary=kwargs["jackhmmer"], ) # get rid of huge stdout log file immediately # (do not use /dev/null option of jackhmmer function # to make no assumption about operating system) try: os.remove(ali.output) except OSError: pass # turn namedtuple into dictionary to make # restarting code nicer ali = dict(ali._asdict()) # save results of search for possible restart write_config_file(ali_outcfg_file, ali) # prepare output dictionary with result files outcfg = { "sequence_id": kwargs["sequence_id"], "target_sequence_file": target_sequence_file, "sequence_file": full_sequence_file, "first_index": kwargs["first_index"], "focus_mode": True, "raw_alignment_file": ali["alignment"], "hittable_file": ali["domtblout"], } # define a single protein segment based on target sequence outcfg["segments"] = [ Segment("aa", kwargs["sequence_id"], region_start, region_end, 
range(region_start, region_end + 1)).to_list() ] outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"], region_start, region_end) return outcfg
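# Illustrative sketch of the focus-mode conventions above: the target is
# described by a single Segment and a "<sequence_id>/<start>-<end>"
# focus_sequence identifier. Standalone reproduction with hypothetical
# values:
def _example_focus_outputs(sequence_id="P12345", region_start=20,
                           region_end=120):
    """Sketch: segment list and focus identifier as built by jackhmmer_search."""
    segments = [
        Segment("aa", sequence_id, region_start, region_end,
                range(region_start, region_end + 1)).to_list()
    ]
    focus_sequence = "{}/{}-{}".format(sequence_id, region_start, region_end)
    return segments, focus_sequence  # focus_sequence == "P12345/20-120"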
def hmmbuild_and_search(**kwargs): """ Protocol: Build HMM from sequence alignment using hmmbuild and search against a sequence database using hmmsearch. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the protocol, including the following fields: * target_sequence_file * sequence_file * raw_alignment_file * hittable_file * focus_mode * focus_sequence * segments """ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): # this file is starting point of pipeline; # check if input alignment actually exists verify_resources("Input alignment does not exist", input_alignment_file) # first try to autodetect format of alignment with open(input_alignment_file) as f: format = detect_format(f) if format is None: raise InvalidParameterError( "Format of input alignment {} could not be " "automatically detected.".format(input_alignment_file)) with open(input_alignment_file) as f: ali_raw = Alignment.from_file(f, format) # Target sequence of alignment sequence_id = kwargs["sequence_id"] if sequence_id is None: raise InvalidParameterError( "Parameter sequence_id must be defined") # First, find focus sequence in alignment focus_index = None for i, id_ in enumerate(ali_raw.ids): if id_.startswith(sequence_id): focus_index = i break # if we didn't find it, cannot continue if focus_index is None: raise InvalidParameterError( "Target sequence {} could not be found in alignment".format( sequence_id)) # identify what columns (non-gap) to keep for focus # this should be all columns in the raw_focus_alignment_file # but checking anyway focus_seq = ali_raw[focus_index] focus_cols = np.array([ c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq ]) # extract focus alignment focus_ali = ali_raw.select(columns=focus_cols) focus_seq_nogap = "".join(focus_ali[focus_index]) # determine region of sequence. 
If first_index is given, # use that in any case, otherwise try to autodetect full_focus_header = ali_raw.ids[focus_index] focus_id = full_focus_header.split()[0] # try to extract region from sequence header id_, region_start, region_end = parse_header(focus_id) # override with first_index if given if kwargs["first_index"] is not None: region_start = kwargs["first_index"] region_end = region_start + len(focus_seq_nogap) - 1 if region_start is None or region_end is None: raise InvalidParameterError( "Could not extract region information " + "from sequence header {} ".format(full_focus_header) + "and first_index parameter is not given.") # resubstitute full sequence ID from identifier # and region information header = "{}/{}-{}".format(id_, region_start, region_end) focus_ali.ids[focus_index] = header # write target sequence to file target_sequence_file = prefix + ".fa" with open(target_sequence_file, "w") as f: write_fasta([(header, focus_seq_nogap)], f) # swap target sequence to first position if it is not # the first sequence in alignment; # this is particularly important for hhfilter run # because target sequence might otherwise be filtered out if focus_index != 0: indices = np.arange(0, len(focus_ali)) indices[0] = focus_index indices[focus_index] = 0 focus_index = 0 focus_ali = focus_ali.select(sequences=indices) # write the raw focus alignment for hmmbuild focus_fasta_file = prefix + "_raw_focus_input.fasta" with open(focus_fasta_file, "w") as f: focus_ali.write(f, "fasta") return focus_fasta_file, target_sequence_file, region_start, region_end # define the gap threshold for inclusion in the HMMs built by hmmbuild. SYMFRAC_HMMBUILD = 0.0 # check for required options check_required(kwargs, [ "prefix", "sequence_id", "alignment_file", "use_bitscores", "domain_threshold", "sequence_threshold", "database", "cpu", "nobias", "reuse_alignment", "hmmbuild", "hmmsearch" ]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # prepare input alignment for hmmbuild focus_fasta_file, target_sequence_file, region_start, region_end = \ _format_alignment_for_hmmbuild( kwargs["alignment_file"], **kwargs ) # run hmmbuild_and_search... 
allow reuse of pre-existing # Stockholm alignment file here ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg" # determine whether to rerun, only possible if previous results # were stored in ali_outcfg_file if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file): ali = read_config_file(ali_outcfg_file) # check if the alignment file itself is also there verify_resources( "Tried to reuse alignment, but file is empty or " "does not exist", ali["alignment"], ali["domtblout"]) else: # otherwise, we have to run the alignment # modify search thresholds to be suitable for hmmsearch sequence_length = region_end - region_start + 1 seq_threshold, domain_threshold = search_thresholds( kwargs["use_bitscores"], kwargs["sequence_threshold"], kwargs["domain_threshold"], sequence_length) # create the hmm hmmbuild_result = at.run_hmmbuild( alignment_file=focus_fasta_file, prefix=prefix, symfrac=SYMFRAC_HMMBUILD, cpu=kwargs["cpu"], binary=kwargs["hmmbuild"], ) hmmfile = hmmbuild_result.hmmfile # run the alignment from the hmm ali = at.run_hmmsearch( hmmfile=hmmfile, database=kwargs[kwargs["database"]], prefix=prefix, use_bitscores=kwargs["use_bitscores"], domain_threshold=domain_threshold, seq_threshold=seq_threshold, nobias=kwargs["nobias"], cpu=kwargs["cpu"], binary=kwargs["hmmsearch"], ) # get rid of huge stdout log file immediately try: os.remove(ali.output) except OSError: pass # turn namedtuple into dictionary to make # restarting code nicer ali = dict(ali._asdict()) # only item from hmmsearch_result to save is the hmmfile ali["hmmfile"] = hmmfile # save results of search for possible restart write_config_file(ali_outcfg_file, ali) # prepare output dictionary with result files outcfg = { "sequence_file": target_sequence_file, "first_index": region_start, "input_raw_focus_alignment": focus_fasta_file, "target_sequence_file": target_sequence_file, "focus_mode": True, "raw_alignment_file": ali["alignment"], "hittable_file": ali["domtblout"], } # convert the raw output alignment to fasta format # and add the appropriate query sequence raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix) outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file # define a single protein segment based on target sequence outcfg["segments"] = [ Segment("aa", kwargs["sequence_id"], region_start, region_end, range(region_start, region_end + 1)).to_list() ] outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"], region_start, region_end) return outcfg
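# Illustrative sketch of the sequence swap performed in
# _format_alignment_for_hmmbuild above: the focus sequence is moved to
# the first alignment row via an index permutation before filtering, so
# it cannot be dropped by downstream tools. Same idea on a plain list
# (np is assumed to be imported at module level, as elsewhere in this
# file):
def _example_swap_focus_first(rows, focus_index):
    """Sketch: permutation that moves the focus row to position 0."""
    indices = np.arange(len(rows))
    indices[0], indices[focus_index] = focus_index, 0
    return [rows[i] for i in indices]
# _example_swap_focus_first(["s0", "s1", "s2"], 2) -> ["s2", "s1", "s0"]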
def complex(**kwargs): """ Protocol: Compare ECs for a complex to 3D structure Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * ec_file_compared_all * ec_file_compared_all_longrange * pdb_structure_hits * distmap_monomer * distmap_multimer * contact_map_files * remapped_pdb_files """ check_required(kwargs, [ "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir", "atom_filter", "first_compare_multimer", "second_compare_multimer", "distance_cutoff", "first_sequence_id", "second_sequence_id", "first_sequence_file", "second_sequence_file", "first_segments", "second_segments", "first_target_sequence_file", "second_target_sequence_file", "scale_sizes" ]) prefix = kwargs["prefix"] outcfg = { # initialize output EC files "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv", # initialize output inter distancemap files "distmap_inter": prefix + "_distmap_inter", "inter_contacts_file": prefix + "_inter_contacts_file" } # Add PDB comparison files for first and second monomer for monomer_prefix in ["first", "second"]: outcfg = { **outcfg, monomer_prefix + "_pdb_structure_hits_file": "{}_{}_structure_hits.csv".format(prefix, monomer_prefix), monomer_prefix + "_pdb_structure_hits_unfiltered_file": "{}_{}_structure_hits_unfitered.csv".format( prefix, monomer_prefix), monomer_prefix + "_distmap_monomer": "{}_{}_distance_map_monomer".format(prefix, monomer_prefix), monomer_prefix + "_distmap_multimer": "{}_{}_distance_map_multimer".format(prefix, monomer_prefix), } # make sure EC file exists verify_resources("EC file does not exist", kwargs["ec_file"]) # make sure output directory exists create_prefix_folders(prefix) # store auxiliary files here (too much for average user) aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) create_prefix_folders(aux_prefix) # store auxiliary files here (too much for average user) first_aux_prefix = insert_dir(aux_prefix, "first_monomer", rootname_subdir=False) create_prefix_folders(first_aux_prefix) # store auxiliary files here (too much for average user) second_aux_prefix = insert_dir(aux_prefix, "second_monomer", rootname_subdir=False) create_prefix_folders(second_aux_prefix) # Step 1: Identify 3D structures for comparison def _identify_monomer_structures(name_prefix, outcfg, aux_prefix): # create a dictionary with kwargs for just the current monomer # remove the "prefix" kwargs so that we can replace with the # aux prefix when calling _identify_structures # only replace first occurrence of name_prefix monomer_kwargs = { k.replace(name_prefix + "_", "", 1): v for k, v in kwargs.items() if "prefix" not in k } # this field needs to be set explicitly else it gets overwritten by concatenated file monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"] monomer_kwargs["raw_focus_alignment_file"] = kwargs[ name_prefix + "_raw_focus_alignment_file"] # identify structures for that monomer sifts_map, sifts_map_full = _identify_structures(**monomer_kwargs, prefix=aux_prefix) # save selected PDB hits sifts_map.hits.to_csv(outcfg[name_prefix + "_pdb_structure_hits_file"], index=False) # also save full list of hits sifts_map_full.hits.to_csv( outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False) return outcfg, 
sifts_map outcfg, first_sifts_map = _identify_monomer_structures( "first", outcfg, first_aux_prefix) outcfg, second_sifts_map = _identify_monomer_structures( "second", outcfg, second_aux_prefix) # get the segment names from the kwargs segment_list = kwargs["segments"] # Make sure user provided exactly two segments if len(segment_list) != 2: raise InvalidParameterError( "Compare stage for protein complexes requires exactly two segments" ) first_segment_name = kwargs["segments"][0][0] second_segment_name = kwargs["segments"][1][0] # Step 2: Compute distance maps def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): # prepare a sequence map to remap the structures we have found verify_resources("Target sequence file does not exist", kwargs[name_prefix + "_target_sequence_file"]) # create target sequence map for remapping structure with open(kwargs[name_prefix + "_target_sequence_file"]) as f: header, seq = next(read_fasta(f)) seq_id, seq_start, seq_end = parse_header(header) seqmap = dict(zip(range(seq_start, seq_end + 1), seq)) # compute distance maps and save # (but only if we found some structure) if len(sifts_map.hits) > 0: d_intra = intra_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_" + name_prefix + "_distmap_intra") d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"]) # save contacts to separate file outcfg[name_prefix + "_monomer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_monomer.csv" d_intra.contacts(kwargs["distance_cutoff"]).to_csv( outcfg[name_prefix + "_monomer_contacts_file"], index=False) # compute multimer distances, if requested; # note that d_multimer can be None if there # are no structures with multiple chains if kwargs[name_prefix + "_compare_multimer"]: d_multimer = multimer_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_" + name_prefix + "_distmap_multimer") else: d_multimer = None # if we have a multimer contact map, save it if d_multimer is not None: d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"]) outcfg[name_prefix + "_multimer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_multimer.csv" # save contacts to separate file d_multimer.contacts(kwargs["distance_cutoff"]).to_csv( outcfg[name_prefix + "_multimer_contacts_file"], index=False) else: outcfg[name_prefix + "_distmap_multimer"] = None # create remapped structures (e.g. 
for # later comparison of folding results) # remap structures, swap mapping index and filename in # dictionary so we have a list of files in the dict keys outcfg[name_prefix + "_remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( sifts_map, aux_prefix, seqmap, chain_name=chain_name, raise_missing=kwargs["raise_missing"]).items() } else: # if no structures, cannot compute distance maps d_intra = None d_multimer = None outcfg[name_prefix + "_distmap_monomer"] = None outcfg[name_prefix + "_distmap_multimer"] = None outcfg[name_prefix + "_remapped_pdb_files"] = None return d_intra, d_multimer, seqmap # load all structures for both monomers all_structures = set(first_sifts_map.hits.pdb_id).union( set(second_sifts_map.hits.pdb_id)) structures = load_structures(all_structures, kwargs["pdb_mmtf_dir"], raise_missing=False) d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps( first_sifts_map, "first", "A") d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps( second_sifts_map, "second", "B") # compute inter distance map if sifts map for each monomer exists if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0: d_inter = inter_dists(first_sifts_map, second_sifts_map, raise_missing=kwargs["raise_missing"]) # if there were overlapping PDBs, save the results if d_inter is not None: d_inter.to_file(outcfg["distmap_inter"]) # save contacts to separate file d_inter.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["inter_contacts_file"], index=False) else: outcfg["inter_contacts_file"] = None d_inter = None # Step 3: Compare ECs to distance maps ec_table = pd.read_csv(kwargs["ec_file"]) for out_file, min_seq_dist in [ ("ec_compared_longrange_file", kwargs["min_sequence_distance"]), ("ec_compared_all_file", 0), ]: # compare ECs only if we have an intra distance map # for at least one monomer - inter can't exist unless # we have both monomers if (d_intra_i is not None) or (d_intra_j is not None): # compare distances individually for each segment pair ecs_intra_i = ec_table.query( "segment_i == segment_j == @first_segment_name") if d_intra_i is not None: ecs_intra_i_compared = coupling_scores_compared( ecs_intra_i, d_intra_i, d_multimer_i, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist=min_seq_dist) else: # If no distance map, the distance is saved as np.nan ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan) ecs_intra_j = ec_table.query( "segment_i == segment_j == @second_segment_name") if d_intra_j is not None: ecs_intra_j_compared = coupling_scores_compared( ecs_intra_j, d_intra_j, d_multimer_j, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist=min_seq_dist) else: ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan) ecs_inter = ec_table.query("segment_i != segment_j") if d_inter is not None: ecs_inter_compared = coupling_scores_compared( ecs_inter, d_inter, dist_map_multimer=None, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist=None  # does not apply for inter-protein ECs ) else: ecs_inter_compared = ecs_inter.assign(dist=np.nan) # combine the tables ec_table_compared = pd.concat([ ecs_inter_compared, ecs_intra_i_compared, ecs_intra_j_compared ]) # rename the precision column to "segmentwise_precision" # because we calculated precision for each segment independently ec_table_compared = ec_table_compared.rename( columns={"precision": "segmentwise_precision"}) # TODO: change "cn" to "score" eventually ec_table_compared = 
ec_table_compared.sort_values("cn", ascending=False) # add the total precision # TODO: implement different cutoffs for intra vs inter contacts ec_table_compared = add_precision( ec_table_compared, dist_cutoff=kwargs["distance_cutoff"]) # save to file # all ecs ec_table_compared.to_csv(outcfg[out_file]) # save the inter ECs to a file ecs_inter_compared.to_csv(outcfg["ec_compared_inter_file"]) # create the inter-ecs line drawing script if outcfg["ec_compared_inter_file"] is not None and kwargs[ "plot_highest_count"] is not None: inter_ecs = ec_table.query("segment_i != segment_j") outcfg[ "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml" pairs.ec_lines_pymol_script( inter_ecs.iloc[:kwargs["plot_highest_count"], :], outcfg["ec_lines_compared_pml_file"], distance_cutoff=kwargs["distance_cutoff"], chain={ first_segment_name: "A", second_segment_name: "B" }) # Remap the complex crystal structures, if available if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0: outcfg["complex_remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_complex_chains( first_sifts_map, second_sifts_map, seqmap_i, seqmap_j, output_prefix=aux_prefix, raise_missing=kwargs["raise_missing"]).items() } # Step 4: Make contact map plots # if no structures available, defaults to EC-only plot outcfg["contact_map_files"] = _make_complex_contact_maps( ec_table, d_intra_i, d_multimer_i, d_intra_j, d_multimer_j, d_inter, first_segment_name, second_segment_name, **kwargs) return outcfg
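# Illustrative sketch of the EC-table splitting above: intra-segment and
# inter-segment pairs are separated with pandas query expressions on the
# segment columns. Toy reproduction with hypothetical segment names and
# scores:
def _example_split_ec_table():
    """Sketch: intra/inter EC split used in the complex compare protocol."""
    toy = pd.DataFrame({
        "segment_i": ["A_1", "A_1", "B_1"],
        "segment_j": ["A_1", "B_1", "B_1"],
        "cn": [0.9, 0.8, 0.7],
    })
    first_segment_name = "A_1"
    intra_first = toy.query("segment_i == segment_j == @first_segment_name")
    inter = toy.query("segment_i != segment_j")
    return intra_first, inter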
def mean_field(**kwargs): """ Protocol: Infer ECs from alignment using mean field direct coupling analysis. For now, mean field DCA can only be run in focus mode, gaps included. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "segments", "focus_mode", "focus_sequence", "theta", "pseudo_count", "alphabet", "min_sequence_distance", # "save_model", ] ) if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." ) prefix = kwargs["prefix"] # option to save model disabled """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists alignment_file = kwargs["alignment_file"] verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # determine alphabet # default is protein if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # read in a2m alignment with open(alignment_file) as f: input_alignment = Alignment.from_file( f, alphabet=alphabet, format="fasta" ) # init mean field direct coupling analysis mf_dca = MeanFieldDCA(input_alignment) # run mean field approximation model = mf_dca.fit( theta=kwargs["theta"], pseudo_count=kwargs["pseudo_count"] ) # write ECs to file model.to_raw_ec_file( outcfg["raw_ec_file"] ) # write model file if outcfg["model_file"] is not None: model.to_file( outcfg["model_file"], file_format="plmc_v2" ) # store useful information about model in outcfg outcfg.update({ "num_sites": model.L, "num_sequences": model.N_valid, "effective_sequences": float(round(model.N_eff, 1)), "region_start": int(model.index_list[0]), }) # read and sort ECs ecs = pd.read_csv( outcfg["raw_ec_file"], sep=" ", # for now, call the last two columns # "fn" and "cn" to prevent compare # stage from crashing names=["i", "A_i", "j", "A_j", "fn", "cn"] # names=["i", "A_i", "j", "A_j", "mi", "di"] ).sort_values( by="cn", ascending=False ) # write the sorted ECs table to csv file ecs.to_csv(outcfg["ec_file"], index=False) # also store longrange ECs as convenience output if kwargs["min_sequence_distance"] is not None: outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv" ecs_longrange = ecs.query( "abs(i - j) >= {}".format(kwargs["min_sequence_distance"]) ) ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False) # also create line-drawing script (for now, only for single segments) if segments is None or len(segments) == 1: outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml" L = outcfg["num_sites"] ec_lines_pymol_script( 
ecs_longrange.iloc[:L, :], outcfg["ec_lines_pml_file"], score_column="cn"  # or "di" ) # compute EC enrichment (for now, for single segments # only since enrichment code cannot handle multiple segments) if segments is None or len(segments) == 1: outcfg["enrichment_file"] = prefix + "_enrichment.csv" ecs_enriched = pairs.enrichment(ecs, score="cn")  # or "di" ecs_enriched.to_csv(outcfg["enrichment_file"], index=False) # create corresponding enrichment pymol scripts outcfg["enrichment_pml_files"] = [] for sphere_view, pml_suffix in [ (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml") ]: pml_file = prefix + pml_suffix enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view) outcfg["enrichment_pml_files"].append(pml_file) # output EVzoom JSON file if we have stored model file if outcfg.get("model_file", None) is not None: outcfg["evzoom_file"] = prefix + "_evzoom.json" with open(outcfg["evzoom_file"], "w") as f: # create JSON output and write to file f.write( evzoom_json(model) + "\n" ) # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_standard.outcfg", outcfg) return outcfg
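# Illustrative sketch of the long-range filter above, written with
# pandas' @-variable binding instead of the str.format call used in
# mean_field; behavior is the same for integer thresholds:
def _example_longrange_filter(ecs, min_sequence_distance=6):
    """Sketch: keep only EC pairs at least min_sequence_distance apart."""
    return ecs.query("abs(i - j) >= @min_sequence_distance")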
def run_psipred(fasta_file, output_dir, binary="runpsipred"): """ Run psipred secondary structure prediction psipred output file convention: run_psipred creates output files <rootname>.ss2 and <rootname>.horiz in the current working directory, where <rootname> is extracted from the basename of the input file (e.g. /home/test/<rootname>.fa) Parameters ---------- fasta_file : str Input sequence file in FASTA format output_dir : str Directory in which output will be saved binary : str, optional (default: "runpsipred") Path of psipred executable (runpsipred) Returns ------- ss2_file : str Absolute path to prediction output in "VFORMAT" horiz_file : str Absolute path to prediction output in "HFORMAT" Raises ------ ExternalToolError If call to psipred fails """ # make sure we have absolute path binary = path.abspath(binary) fasta_file = path.abspath(fasta_file) output_dir = path.abspath(output_dir) # make sure input file is valid verify_resources("Input FASTA file is invalid", fasta_file) # make sure output directory exists makedirs(output_dir) # execute psipred; # we have to start it from output directory so # result files end up there (this is hardcoded # in runpsipred) return_code, stdout, stderr = run( [binary, fasta_file], working_dir=output_dir, ) # determine where psipred will store output based # on logic from runpsipred script rootname, _ = path.splitext(path.basename(fasta_file)) output_prefix = path.join(output_dir, rootname) # construct paths to output files in vertical and horizontal formats ss2_file = output_prefix + ".ss2" horiz_file = output_prefix + ".horiz" # make sure we actually predicted something verify_resources("psipred output is invalid", ss2_file, horiz_file) return ss2_file, horiz_file
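# Illustrative sketch of the output-path logic above: runpsipred derives
# its output names from the basename of the input FASTA file, so the
# result paths can be predicted without running the tool. Paths below
# are hypothetical:
def _example_psipred_output_paths(fasta_file="/home/test/query.fa",
                                  output_dir="/tmp/psipred"):
    """Sketch: derive the .ss2/.horiz paths the way run_psipred does."""
    rootname, _ = path.splitext(path.basename(fasta_file))
    output_prefix = path.join(output_dir, rootname)
    return output_prefix + ".ss2", output_prefix + ".horiz"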
def existing(**kwargs): """ Protocol: Use external sequence alignment and extract all relevant information from there (e.g. sequence, region, etc.), then apply gap & fragment filtering as usual Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * sequence_id (passed through from input) * alignment_file * raw_focus_alignment_file * statistics_file * sequence_file * first_index * target_sequence_file * annotation_file (None) * frequencies_file * identities_file * focus_mode * focus_sequence * segments """ check_required(kwargs, [ "prefix", "input_alignment", "sequence_id", "first_index", "extract_annotation" ]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # this file is starting point of pipeline; # check if input alignment actually exists input_alignment = kwargs["input_alignment"] verify_resources("Input alignment does not exist", input_alignment) # first try to autodetect format of alignment with open(input_alignment) as f: format = detect_format(f) if format is None: raise InvalidParameterError( "Format of input alignment {} could not be " "automatically detected.".format(input_alignment)) with open(input_alignment) as f: ali_raw = Alignment.from_file(f, format) # save annotation in sequence headers (species etc.) annotation_file = None if kwargs["extract_annotation"]: annotation_file = prefix + "_annotation.csv" from_anno_line = (format == "stockholm") annotation = extract_header_annotation(ali_raw, from_annotation=from_anno_line) annotation.to_csv(annotation_file, index=False) # Target sequence of alignment sequence_id = kwargs["sequence_id"] if sequence_id is None: raise InvalidParameterError("Parameter sequence_id must be defined") # First, find focus sequence in alignment focus_index = None for i, id_ in enumerate(ali_raw.ids): if id_.startswith(sequence_id): focus_index = i break # if we didn't find it, cannot continue if focus_index is None: raise InvalidParameterError( "Target sequence {} could not be found in alignment".format( sequence_id)) # identify what columns (non-gap) to keep for focus focus_seq = ali_raw[focus_index] focus_cols = np.array([ c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq ]) # extract focus alignment focus_ali = ali_raw.select(columns=focus_cols) focus_seq_nogap = "".join(focus_ali[focus_index]) # determine region of sequence. 
If first_index is given, # use that in any case, otherwise try to autodetect full_focus_header = ali_raw.ids[focus_index] focus_id = full_focus_header.split()[0] # try to extract region from sequence header id_, region_start, region_end = parse_header(focus_id) # override with first_index if given if kwargs["first_index"] is not None: region_start = kwargs["first_index"] region_end = region_start + len(focus_seq_nogap) - 1 if region_start is None or region_end is None: raise InvalidParameterError( "Could not extract region information " + "from sequence header {} ".format(full_focus_header) + "and first_index parameter is not given.") # resubstitute full sequence ID from identifier # and region information header = "{}/{}-{}".format(id_, region_start, region_end) focus_ali.ids[focus_index] = header # write target sequence to file target_sequence_file = prefix + ".fa" with open(target_sequence_file, "w") as f: write_fasta([(header, focus_seq_nogap)], f) # apply sequence identity and fragment filters, # and gap threshold mod_outcfg, ali = modify_alignment(focus_ali, focus_index, id_, region_start, **kwargs) # generate output configuration of protocol outcfg = { **mod_outcfg, "sequence_id": sequence_id, "sequence_file": target_sequence_file, "first_index": region_start, "target_sequence_file": target_sequence_file, "focus_sequence": header, "focus_mode": True, } if annotation_file is not None: outcfg["annotation_file"] = annotation_file # dump config to YAML file for debugging/logging write_config_file(prefix + ".align_existing.outcfg", outcfg) # return results of protocol return outcfg
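# Illustrative sketch of the region autodetection above. parse_header
# appears to split an "ID/start-end" style identifier into its parts,
# returning None for start/end when no region is present; the header
# value below is a hypothetical example.
def _example_region_from_header(focus_id="P12345/20-120"):
    """Sketch: extract region bounds from a focus sequence identifier."""
    id_, region_start, region_end = parse_header(focus_id)
    return id_, region_start, region_end  # e.g. ("P12345", 20, 120)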
def run_hmmscan(query, database, prefix, use_model_threshold=True, threshold_type="cut_ga", use_bitscores=True, domain_threshold=None, seq_threshold=None, nobias=False, cpu=None, stdout_redirect=None, binary="hmmscan"): """ Run hmmscan of HMMs in database against sequences in query to identify matches of these HMMs. Refer to HMMER Userguide for explanation of these parameters. Parameters ---------- query : str File containing query sequence(s) database : str File containing HMM database (prepared with hmmpress) prefix : str Prefix path for output files. Folder structure in the prefix will be created if not existing. use_model_threshold : bool (default: True) Use model-specific inclusion thresholds from HMM database rather than global bitscore/E-value thresholds (use_bitscores, domain_threshold and seq_threshold are overridden by this flag). threshold_type : {"cut_ga", "cut_nc", "cut_tc"} (default: "cut_ga") Use gathering (default), noise or trusted cutoff to define scan hits. Please refer to HMMER manual for details. use_bitscores : bool Use bitscore inclusion thresholds rather than E-values. Overridden by use_model_threshold flag. domain_threshold : int or float or str Inclusion threshold applied on the domain level (e.g. "1E-03" or 0.001 or 50) seq_threshold : int or float or str Inclusion threshold applied on the sequence level (e.g. "1E-03" or 0.001 or 50) nobias : bool, optional (default: False) Turn off bias correction cpu : int, optional (default: None) Number of CPUs to use for search. Uses all if None. stdout_redirect : str, optional (default: None) Redirect bulky stdout instead of storing with rest of results (use "/dev/null" to dispose) binary : str (default: "hmmscan") Path to hmmscan binary (put in PATH for default to work) Returns ------- HmmscanResult namedtuple with fields corresponding to the different output files (prefix, output, tblout, domtblout, pfamtblout) Raises ------ ExternalToolError, ResourceError """ verify_resources("Input file does not exist or is empty", query, database) create_prefix_folders(prefix) result = HmmscanResult( prefix, prefix + ".output" if stdout_redirect is None else stdout_redirect, prefix + ".tblout", prefix + ".domtblout", prefix + ".pfamtblout") cmd = [ binary, "-o", result.output, "--tblout", result.tblout, "--domtblout", result.domtblout, "--pfamtblout", result.pfamtblout, "--notextw", "--acc", ] # number of CPUs if cpu is not None: cmd += ["--cpu", str(cpu)] # bias correction filter if nobias: cmd += ["--nobias"] # either use model-specific threshold, or custom # bitscore/E-value thresholds if use_model_threshold: THRESHOLD_CHOICES = ["cut_ga", "cut_nc", "cut_tc"] if threshold_type not in THRESHOLD_CHOICES: raise ValueError("Invalid model threshold, valid choices are: " + ", ".join(THRESHOLD_CHOICES)) cmd += ["--" + threshold_type] else: if seq_threshold is None or domain_threshold is None: raise ValueError("Must define sequence- and domain-level reporting " "thresholds, or use the gathering threshold instead.") if use_bitscores: cmd += [ "-T", str(seq_threshold), "--domT", str(domain_threshold), ] else: cmd += [ "-E", str(seq_threshold), "--domE", str(domain_threshold), ] cmd += [database, query] return_code, stdout, stderr = run(cmd) # also check we actually created a table with hits verify_resources( "hmmscan did not return results: " "stdout={} stderr={} file={}".format(stdout, stderr, result.domtblout), result.domtblout) return result
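# Illustrative usage sketch for run_hmmscan above, relying on the
# model-specific gathering thresholds (the default); database and paths
# are hypothetical placeholders:
def _example_hmmscan_call():
    """Sketch: scan a query against a pressed HMM database."""
    result = run_hmmscan(
        query="output/target.fa",
        database="/databases/pfam/Pfam-A.hmm",  # prepared with hmmpress
        prefix="output/hmmscan/target",
        stdout_redirect="/dev/null",  # discard bulky stdout
    )
    return result.domtblout  # per-domain hit table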
def standard(**kwargs): """ Protocol: Compare ECs for single proteins (or domains) to 3D structure information Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * ec_compared_all_file * ec_compared_longrange_file * pdb_structure_hits_file * pdb_structure_hits_unfiltered_file * distmap_monomer * distmap_multimer * contact_map_files * remapped_pdb_files """ check_required(kwargs, [ "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir", "atom_filter", "compare_multimer", "distance_cutoff", "target_sequence_file", "scale_sizes", ]) prefix = kwargs["prefix"] outcfg = { "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", "pdb_structure_hits_file": prefix + "_structure_hits.csv", "pdb_structure_hits_unfiltered_file": prefix + "_structure_hits_unfiltered.csv", # cannot have the distmap files end with "_file" because there are # two files (.npy and .csv), which would cause problems with automatic # checking if those files exist "distmap_monomer": prefix + "_distance_map_monomer", "distmap_multimer": prefix + "_distance_map_multimer", } # make sure EC file exists verify_resources("EC file does not exist", kwargs["ec_file"]) # make sure output directory exists create_prefix_folders(prefix) # store auxiliary files here (too much for average user) aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) create_prefix_folders(aux_prefix) # Step 1: Identify 3D structures for comparison sifts_map, sifts_map_full = _identify_structures( **{ **kwargs, "prefix": aux_prefix, }) # save selected PDB hits sifts_map.hits.to_csv(outcfg["pdb_structure_hits_file"], index=False) # also save full list of hits sifts_map_full.hits.to_csv(outcfg["pdb_structure_hits_unfiltered_file"], index=False) # Step 2: Compute distance maps # load all structures at once structures = load_structures(sifts_map.hits.pdb_id, kwargs["pdb_mmtf_dir"], raise_missing=False) # compute distance maps and save # (but only if we found some structure) if len(sifts_map.hits) > 0: d_intra = intra_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_distmap_intra") d_intra.to_file(outcfg["distmap_monomer"]) # save contacts to separate file outcfg["monomer_contacts_file"] = prefix + "_contacts_monomer.csv" d_intra.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["monomer_contacts_file"], index=False) # compute multimer distances, if requested; # note that d_multimer can be None if there # are no structures with multiple chains if kwargs["compare_multimer"]: d_multimer = multimer_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_distmap_multimer") else: d_multimer = None # if we have a multimer contact map in the end, save it if d_multimer is not None: d_multimer.to_file(outcfg["distmap_multimer"]) outcfg["multimer_contacts_file"] = prefix + "_contacts_multimer.csv" # save contacts to separate file d_multimer.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["multimer_contacts_file"], index=False) else: outcfg["distmap_multimer"] = None # at this point, also create remapped structures (e.g. 
for # later comparison of folding results) verify_resources("Target sequence file does not exist", kwargs["target_sequence_file"]) # create target sequence map for remapping structure with open(kwargs["target_sequence_file"]) as f: header, seq = next(read_fasta(f)) seq_id, seq_start, seq_end = parse_header(header) seqmap = dict(zip(range(seq_start, seq_end + 1), seq)) # remap structures, swap mapping index and filename in # dictionary so we have a list of files in the dict keys outcfg["remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( sifts_map, aux_prefix, seqmap).items() } else: # if no structures, cannot compute distance maps d_intra = None d_multimer = None outcfg["distmap_monomer"] = None outcfg["distmap_multimer"] = None outcfg["remapped_pdb_files"] = None # Step 3: Compare ECs to distance maps ec_table = pd.read_csv(kwargs["ec_file"]) # identify number of sites in EC model num_sites = len( set.union(set(ec_table.i.unique()), set(ec_table.j.unique()))) for out_file, min_seq_dist in [ ("ec_compared_longrange_file", kwargs["min_sequence_distance"]), ("ec_compared_all_file", 0), ]: # compare ECs only if we have at least an intra distance map if d_intra is not None: coupling_scores_compared(ec_table, d_intra, d_multimer, dist_cutoff=kwargs["distance_cutoff"], output_file=outcfg[out_file], min_sequence_dist=min_seq_dist) else: outcfg[out_file] = None # also create line-drawing script if we made the csv if outcfg["ec_compared_longrange_file"] is not None: ecs_longrange = pd.read_csv(outcfg["ec_compared_longrange_file"]) outcfg["ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml" pairs.ec_lines_pymol_script(ecs_longrange.iloc[:num_sites, :], outcfg["ec_lines_compared_pml_file"], distance_cutoff=kwargs["distance_cutoff"]) # Step 4: Make contact map plots # if no structures available, defaults to EC-only plot outcfg["contact_map_files"] = _make_contact_maps(ec_table, d_intra, d_multimer, **kwargs) return outcfg
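# Illustrative sketch of the num_sites computation above: the number of
# modeled positions is taken as the size of the union of all residue
# indices appearing in columns i and j of the EC table. Toy reproduction
# on a hypothetical table:
def _example_num_sites():
    """Sketch: count distinct residue positions in an EC table."""
    toy = pd.DataFrame({"i": [1, 1, 2], "j": [2, 3, 3]})
    return len(set(toy.i.unique()) | set(toy.j.unique()))  # -> 3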
def genome_distance(**kwargs): """ Protocol: Concatenate alignments based on genomic distance Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * alignment_file * raw_alignment_file * focus_mode * focus_sequence * segments * frequencies_file * identities_file * num_sequences * num_sites * raw_focus_alignment_file * statistics_file """ check_required( kwargs, [ "prefix", "first_alignment_file", "second_alignment_file", "first_focus_sequence", "second_focus_sequence", "first_focus_mode", "second_focus_mode", "first_region_start", "second_region_start", "first_segments", "second_segments", "genome_distance_threshold", "first_genome_location_file", "second_genome_location_file", "first_annotation_file", "second_annotation_file" ] ) prefix = kwargs["prefix"] # make sure input alignments exist verify_resources( "Input alignment does not exist", kwargs["first_alignment_file"], kwargs["second_alignment_file"] ) verify_resources( "Genome location file does not exist", kwargs["first_genome_location_file"], kwargs["second_genome_location_file"] ) # make sure output directory exists create_prefix_folders(prefix) # load the information for each monomer alignment alignment_1 = kwargs["first_alignment_file"] alignment_2 = kwargs["second_alignment_file"] genome_location_filename_1 = kwargs["first_genome_location_file"] genome_location_filename_2 = kwargs["second_genome_location_file"] gene_location_table_1 = pd.read_csv(genome_location_filename_1, header=0) gene_location_table_2 = pd.read_csv(genome_location_filename_2, header=0) # find all possible matches possible_partners = find_possible_partners( gene_location_table_1, gene_location_table_2 ) # find the best reciprocal matches id_pairing_unfiltered = best_reciprocal_matching(possible_partners) # filter best reciprocal matches by genome distance threshold if kwargs["genome_distance_threshold"]: distance_threshold = kwargs["genome_distance_threshold"] id_pairing = id_pairing_unfiltered.query("distance < @distance_threshold") else: id_pairing = id_pairing_unfiltered id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"] id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"] # write concatenated alignment with distance filtering 
target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \ write_concatenated_alignment( id_pairing, alignment_1, alignment_2, kwargs["first_focus_sequence"], kwargs["second_focus_sequence"] ) # save the alignment files raw_alignment_file = prefix + "_raw.fasta" with open(raw_alignment_file, "w") as of: raw_ali.write(of) mon_alignment_file_1 = prefix + "_monomer_1.fasta" with open(mon_alignment_file_1, "w") as of: mon_ali_1.write(of) mon_alignment_file_2 = prefix + "_monomer_2.fasta" with open(mon_alignment_file_2, "w") as of: mon_ali_2.write(of) # filter the alignment aln_outcfg, _ = modify_alignment( raw_ali, target_seq_index, target_seq_id, kwargs["first_region_start"], **kwargs ) # make sure we return all the necessary information: # * alignment_file: final concatenated alignment that will go into plmc # * focus_sequence: this is the identifier of the concatenated target # sequence which will be passed into plmc with -f outcfg = aln_outcfg outcfg["raw_alignment_file"] = raw_alignment_file outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1 outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2 outcfg["focus_sequence"] = target_seq_id # Update the segments outcfg = modify_complex_segments(outcfg, **kwargs) # Describe the statistics of the concatenation outcfg = _run_describe_concatenation(outcfg, **kwargs) # plot the genome distance distribution outcfg["distance_plot_file"] = prefix + "_distplot.pdf" plot_distance_distribution(id_pairing_unfiltered, outcfg["distance_plot_file"]) return outcfg
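# Illustrative sketch of the genomic-distance filtering above: best
# reciprocal hits are kept only if the two genes lie within the
# configured distance on the genome. Toy table with hypothetical
# identifiers and made-up distance values:
def _example_genome_distance_filter(threshold=10000):
    """Sketch: distance filter applied to best reciprocal gene pairs."""
    pairs_table = pd.DataFrame({
        "uniprot_id_1": ["A1", "A2"],
        "uniprot_id_2": ["B1", "B2"],
        "distance": [340, 52000],
    })
    return pairs_table.query("distance < @threshold")  # keeps only A1/B1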
def complex_dock(**kwargs): """ Protocol: Generate docking restraints from inter-protein evolutionary couplings Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * docking_restraint_files """ check_required(kwargs, [ "prefix", "ec_file", "segments", "dock_probability_cutoffs", "dock_lowest_count", "dock_highest_count", "dock_increase", ]) prefix = kwargs["prefix"] outcfg = {} # make sure output directory exists create_prefix_folders(prefix) verify_resources("EC file does not exist and/or is empty", kwargs["ec_file"]) ecs_all = pd.read_csv(kwargs["ec_file"]) ecs_dock = ecs_all.query("segment_i != segment_j") # define the sub-runs ... docking_runs = [] # ... based on mixture model probability cutoffs = kwargs["dock_probability_cutoffs"] if cutoffs is not None and "probability" in ecs_dock.columns: if not isinstance(cutoffs, list): cutoffs = [cutoffs] for c in cutoffs: sig_ecs = ecs_dock.query("probability >= @c") if len(sig_ecs) > 0: docking_runs.append( (sig_ecs, "_significant_ECs_{}_restraints.tbl".format(c))) # ... and on simple EC counts/bins flc = kwargs["dock_lowest_count"] fhc = kwargs["dock_highest_count"] fi = kwargs["dock_increase"] if flc is not None and fhc is not None and fi is not None: num_sites = len(set(ecs_dock.i.unique())) + len( set(ecs_dock.j.unique())) # transform fraction of number of sites into discrete number of ECs def _discrete_count(x): if isinstance(x, float): x = ceil(x * num_sites) return int(x) # range of restraint sets to generate lowest = _discrete_count(flc) highest = _discrete_count(fhc) step = _discrete_count(fi) # append to list of jobs to run docking_runs += [(ecs_dock.iloc[:c], "_{}_restraints.tbl".format(c)) for c in range(lowest, highest + 1, step)] outcfg["docking_restraint_files"] = [] for job_ecs, job_suffix in docking_runs: job_filename = prefix + job_suffix docking_restraints(job_ecs, job_filename, haddock_dist_restraint) outcfg["docking_restraint_files"].append(job_filename) return outcfg
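# Illustrative sketch of the _discrete_count helper above: float inputs
# are interpreted as a fraction of the number of sites and rounded up,
# while integer inputs pass through unchanged. Standalone version with a
# hypothetical site count (ceil is assumed to be imported at module
# level, as used above):
def _example_discrete_count(x, num_sites=200):
    """Sketch: fraction-of-sites to EC-count conversion from complex_dock."""
    if isinstance(x, float):
        x = ceil(x * num_sites)
    return int(x)
# _example_discrete_count(0.05) -> 10; _example_discrete_count(25) -> 25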