def verify_prefix(verify_subdir=True, **config):
    """
    Check that the configuration contains a prefix, and that the
    prefix points to a location on the filesystem we can write to.

    Parameters
    ----------
    verify_subdir : bool, optional (default: True)
        Also verify that a subdirectory containing the full prefix
        can be created. Set this to False for outer evcouplings
        app loop.
    **config
        Input configuration for pipeline

    Returns
    -------
    prefix : str
        Verified prefix

    Raises
    ------
    InvalidParameterError
        If prefix is missing, None, or not usable on the filesystem
    """
    # without a prefix entry there is nothing we can do
    try:
        prefix = config["global"]["prefix"]
    except KeyError:
        raise InvalidParameterError(
            "Configuration does not include 'prefix' setting in "
            "'global' section")

    # the entry must actually be set to a value
    if prefix is None:
        raise InvalidParameterError(
            "'prefix' must be specified and cannot be None")

    # probe the filesystem: create the prefix folder, then check
    # that files (and optionally subdirectories) can be created in it
    probe_file = prefix + ".test__"
    try:
        # make prefix folder
        create_prefix_folders(prefix)

        # verify the folder is writable by creating and
        # removing a scratch file
        with open(probe_file, "w"):
            pass
        os.remove(probe_file)

        if verify_subdir:
            # verify a nested subdirectory can be created as well
            sub_prefix = insert_dir(prefix, "test__")
            create_prefix_folders(sub_prefix)

            # clean up the scratch subdirectory again
            os.rmdir(path.dirname(sub_prefix))
    except OSError as e:
        raise InvalidParameterError(
            "Not a valid prefix: {}".format(prefix)) from e

    return prefix
def from_file(cls, filename, file_format="pdb"):
    """
    Initialize structure from PDB/mmCIF file

    Parameters
    ----------
    filename : str
        Path of file
    file_format : {"pdb", "cif"}, optional (default: "pdb")
        Format of structure (old PDB format or mmCIF)

    Returns
    -------
    ClassicPDB
        Initialized PDB structure

    Raises
    ------
    InvalidParameterError
        If file_format is not one of "pdb" or "cif"
    ResourceError
        If the structure file does not exist
    """
    try:
        # import the appropriate Biopython parser lazily, so the
        # dependency is only loaded for the format actually used
        if file_format == "pdb":
            from Bio.PDB import PDBParser
            structure_parser = PDBParser(QUIET=True)
        elif file_format == "cif":
            from Bio.PDB import FastMMCIFParser
            structure_parser = FastMMCIFParser(QUIET=True)
        else:
            raise InvalidParameterError(
                "Invalid file_format, valid options are: pdb, cif"
            )

        return cls(structure_parser.get_structure("", filename))
    except FileNotFoundError as e:
        raise ResourceError(
            "Could not find file {}".format(filename)
        ) from e
def run(**kwargs):
    """
    Exposes command line interface as a Python function.

    Parameters
    ----------
    kwargs
        See click.option decorators for app() function
    """
    # build full pipeline configuration from command line options
    config = substitute_config(**kwargs)

    # minimal sanity check of configuration contents
    check_required(config, ["pipeline", "stages", "global"])

    # verify that global prefix makes sense
    pipeline.verify_prefix(verify_subdir=False, **config)

    # a target sequence identifier is mandatory
    if config["global"].get("sequence_id", None) is None:
        raise InvalidParameterError(
            "Sequence identifier not defined (sequence_id).")

    # for convenience, turn on N_eff computation if the alignment
    # stage runs without the couplings stage (which would otherwise
    # compute it)
    stages = config["stages"]
    if "align" in stages and "couplings" not in stages:
        config["align"]["compute_num_effective_seqs"] = True

    # expand batch jobs into individual pipeline jobs
    job_configs = unroll_config(config)

    # run pipeline computation for each individual (unrolled) config
    run_jobs(
        job_configs, config,
        kwargs.get("yolo", False), kwargs.get("workdir", None)
    )
def run_maxcluster_cluster(predictions, method="average", rmsd=True,
                           clustering_threshold=None, binary="maxcluster"):
    """
    Cluster a set of predicted structures using maxcluster.

    (Docstring fixed: the previous version was copy-pasted from a
    structure-comparison helper and wrongly described this function
    as comparing predictions against an experimental structure.)

    Parameters
    ----------
    predictions : list(str)
        List of PDB files that should be clustered
    method : {"single", "average", "maximum", "pairs_min", "pairs_abs"},
            optional (default: "average")
        Clustering method (single / average / maximum linkage,
        or min / absolute size neighbour pairs clustering)
    rmsd : bool, optional (default: True)
        Use RMSD-based clustering (faster)
    clustering_threshold : float, optional (default: None)
        Initial clustering threshold (maxcluster -T option)
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    pandas.DataFrame
        Clustering result table (see parse_maxcluster_clustering
        for more detailed explanation)

    Raises
    ------
    InvalidParameterError
        If method is not a valid clustering method
    """
    # write list of input PDB files for maxcluster (-l option)
    list_file = temp()
    with open(list_file, "w") as f:
        for pred_file in predictions:
            f.write(pred_file + "\n")

    # map method name to maxcluster's numeric -C option
    method_map = {
        "single": 1,
        "average": 2,
        "maximum": 3,
        "pairs_min": 4,
        "pairs_abs": 5,
    }

    if method not in method_map:
        raise InvalidParameterError(
            "Method must be one of the following: " +
            ", ".join(method_map.keys())
        )

    cmd = [binary, "-l", list_file, "-C", str(method_map[method])]

    if rmsd:
        cmd += ["-rmsd"]

    if clustering_threshold is not None:
        cmd += ["-T", str(clustering_threshold)]

    return_code, stdout, stderr = run(cmd)

    # clustering result is printed on standard output
    return parse_maxcluster_clustering(stdout)
def run(**kwargs):
    """
    Run inference protocol to calculate ECs from
    input sequence alignment.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: EC protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of stage
        (see individual protocol for fields)
    """
    check_required(kwargs, ["protocol"])

    selected = kwargs["protocol"]

    # dispatch to the implementation registered for this protocol
    if selected not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                selected, ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[selected](**kwargs)
def _determine_pos(filename):
    """
    Load a structure file and extract residue position identifiers
    from its single chain.

    Parameters
    ----------
    filename : str
        Path of structure file (readable by ClassicPDB.from_file)

    Returns
    -------
    numpy.ndarray
        Residue identifiers of the chain, as strings
    Chain
        The single chain contained in the structure

    Raises
    ------
    InvalidParameterError
        If the structure does not contain exactly one model with
        exactly one chain
    """
    structure = ClassicPDB.from_file(filename)

    # fix: an empty structure (0 models) previously fell into the
    # "more than one model" branch with a misleading error message;
    # report it explicitly (consistent with the sibling implementation)
    if len(structure.model_to_chains) == 0:
        raise InvalidParameterError(
            "Structure contains no model (is empty): " + filename
        )
    elif len(structure.model_to_chains) > 1:
        raise InvalidParameterError(
            "Structure contains more than one model: " + filename
        )

    model = list(structure.model_to_chains.keys())[0]
    chains = structure.model_to_chains[model]
    if len(chains) != 1:
        raise InvalidParameterError(
            "Structure must contain exactly one chain, but contains: " +
            ",".join(chains)
        )

    chain_name = chains[0]
    chain = structure.get_chain(chain_name, model)
    return chain.residues.id.astype(str).values, chain
def _determine_pos(filename):
    """
    Load a structure file and return residue position identifiers
    together with the single chain contained in it.

    Parameters
    ----------
    filename : str
        Path of structure file (readable by ClassicPDB.from_file)

    Returns
    -------
    numpy.ndarray
        Residue identifiers of the chain, as strings
    Chain
        The single chain contained in the structure

    Raises
    ------
    InvalidParameterError
        If the structure does not contain exactly one model with
        exactly one chain
    """
    structure = ClassicPDB.from_file(filename)
    num_models = len(structure.model_to_chains)

    # structure must contain exactly one model
    if num_models == 0:
        raise InvalidParameterError(
            "Structure contains no model (is empty): " + filename +
            " - please verify that no problems occurred during structure mapping"
        )
    elif num_models > 1:
        raise InvalidParameterError(
            "Structure contains more than one model: " + filename)

    model = next(iter(structure.model_to_chains.keys()))
    chains = structure.model_to_chains[model]

    # ... and exactly one chain within that model
    if len(chains) != 1:
        raise InvalidParameterError(
            "Structure must contain exactly one chain, but contains: " +
            ",".join(chains))

    selected_chain = structure.get_chain(chains[0], model)
    return selected_chain.residues.id.astype(str).values, selected_chain
def run(**kwargs):
    """
    Create summary statistics for evcouplings pipeline runs
    """
    pipeline_name = kwargs["pipeline"]

    # look up the summarizer registered for this pipeline
    try:
        summarizer = PIPELINE_TO_SUMMARIZER[pipeline_name]
    except KeyError:
        raise InvalidParameterError(
            "Not a valid pipeline, valid selections are: {}".format(
                ",".join(PIPELINE_TO_SUMMARIZER.keys())
            )
        )

    summarizer(kwargs["prefix"], kwargs["configs"])
def cns_seq_file(sequence, output_file=None, residues_per_line=16):
    """
    Generate a CNS .seq file for a given protein sequence

    Parameters
    ----------
    sequence : str
        Amino acid sequence in one-letter code
    output_file : str, optional (default: None)
        Save 3-letter code sequence to this file
        (if None, will create temporary file)
    residues_per_line : int, optional (default: 16)
        Print this many residues on each line of .seq file

    Returns
    -------
    output_file : str
        Path to file with sequence
        (useful if temporary file was generated)

    Raises
    ------
    InvalidParameterError
        If sequence contains invalid symbol
    """
    if output_file is None:
        output_file = temp()

    with open(output_file, "w") as f:
        # walk through the sequence in chunks of residues_per_line
        for offset in range(0, len(sequence), residues_per_line):
            chunk = sequence[offset:offset + residues_per_line]

            # translate chunk into space-separated three-letter codes
            try:
                three_letter = " ".join(AA1_to_AA3[aa] for aa in chunk)
            except KeyError as e:
                raise InvalidParameterError(
                    "Invalid amino acid could not be mapped") from e

            f.write(three_letter + "\n")

    return output_file
def __init__(self, **kwargs):
    """
    Create new SQL-based tracker. For now, this tracker will ignore
    file_list and store all file paths in the database except for
    those in delete_list.

    Parameters
    ----------
    connection_string : str
        SQLite connection URI. Must include database name,
        and username/password if authentication is used.
    job_id : str
        Unique job identifier of job which should be tracked
    prefix : str
        Prefix of pipeline job
    pipeline : str
        Name of pipeline that is running
    file_list : list(str)
        List of file item keys from outconfig that should be stored
        in database. Currently has no effect; all file paths are stored.
    delete_list : list(str)
        List of file item keys from outconfig that will be deleted
        after the run is finished. These files cannot be stored as
        paths to the pipeline result in the output.
    config : dict(str)
        Entire configuration dictionary of job
    retry_max_number : int, optional (default: None)
        Maximum number of attempts to perform database queries /
        updates. If None, will try forever.
    retry_wait : int, optional (default: None)
        Time in seconds between retries to connect to database

    Raises
    ------
    InvalidParameterError
        If job_id is longer than 255 characters
    """
    super().__init__(**kwargs)

    # older SQL databases limit indexed VARCHAR columns to 255
    # characters, so reject job IDs that would not fit
    if len(self.job_id) > 255:
        raise InvalidParameterError(
            "Length of job_id for SQL tracker may not exceed 255 characters for database compatibility reasons"
        )

    # set up SQLAlchemy engine and session factory; sessions
    # themselves are instantiated lazily later on
    self._engine = create_engine(self.connection_string)
    self._Session = sessionmaker(bind=self._engine)

    # create any tables missing from the target database
    Base.metadata.create_all(bind=self._engine)
def run(**kwargs):
    """
    Run alignment concatenation protocol

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: concatenation protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of concatenation stage
        Dictionary with results in following fields:
        (in brackets: not mandatory)

        alignment_file
        raw_alignment_file
        focus_mode
        focus_sequence
        segments
        frequencies_file
        identities_file
        num_sequences
        num_sites
        raw_focus_alignment_file
        statistics_file
    """
    check_required(kwargs, ["protocol"])

    protocol_name = kwargs["protocol"]

    # only dispatch to protocols we actually know about
    if protocol_name not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                protocol_name, ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[protocol_name](**kwargs)
def run(**kwargs):
    """
    Run inference protocol to calculate ECs from
    input sequence alignment.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: EC protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of couplings stage
        Dictionary with results in following fields:
        (in brackets: not mandatory)

        ec_file
        effective_sequences
        [enrichment_file]
        focus_mode
        focus_sequence
        model_file
        num_sequences
        num_sites
        raw_ec_file
        region_start
        segments
    """
    check_required(kwargs, ["protocol"])

    chosen_protocol = kwargs["protocol"]

    # reject anything that is not a registered protocol
    if chosen_protocol not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                chosen_protocol, ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[chosen_protocol](**kwargs)
def run(**kwargs):
    """
    Run alignment protocol to generate multiple sequence alignment
    from input sequence.

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: Alignment protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    Alignment
        Dictionary with results of stage in following fields
        (in brackets - not returned by all protocols):

        * alignment_file
        * [raw_alignment_file]
        * statistics_file
        * target_sequence_file
        * sequence_file
        * [annotation_file]
        * frequencies_file
        * identities_file
        * [hittable_file]
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, ["protocol"])

    requested = kwargs["protocol"]

    # hand off to the registered protocol implementation
    if requested not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                requested, ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[requested](**kwargs)
def run(**kwargs):
    """
    Run alignment concatenation protocol

    Parameters
    ----------
    Mandatory kwargs arguments:
        protocol: concatenation protocol to run
        prefix: Output prefix for all generated files

    Returns
    -------
    outcfg : dict
        Output configuration of concatenation stage
        Dictionary with results in following fields
        (in brackets: not mandatory):

        .. todo::
            to be finalized after implementing protocols

        * alignment_file
        * focus_mode
        * focus_sequence
        * segments
        * num_sites
        * num_sequences
    """
    check_required(kwargs, ["protocol"])

    protocol_key = kwargs["protocol"]

    # verify the requested protocol exists before dispatching
    if protocol_key not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid protocol selection: {}. Valid protocols are: {}".format(
                protocol_key, ", ".join(PROTOCOLS.keys())
            )
        )

    return PROTOCOLS[protocol_key](**kwargs)
def existing(**kwargs):
    """
    Protocol:
    Use external sequence alignment and extract all relevant
    information from there (e.g. sequence, region, etc.),
    then apply gap & fragment filtering as usual

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * sequence_file
        * first_index
        * target_sequence_file
        * annotation_file (None)
        * frequencies_file
        * identities_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(kwargs, [
        "prefix", "input_alignment", "sequence_id", "first_index",
        "extract_annotation"
    ])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this file is starting point of pipeline;
    # check if input alignment actually exists
    input_alignment = kwargs["input_alignment"]
    verify_resources("Input alignment does not exist", input_alignment)

    # first try to autodetect format of alignment
    with open(input_alignment) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment))

    # reopen and parse alignment using the detected format
    with open(input_alignment) as f:
        ali_raw = Alignment.from_file(f, format)

    # save annotation in sequence headers (species etc.)
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        # for Stockholm alignments, annotation comes from
        # dedicated annotation lines rather than the header
        from_anno_line = (format == "stockholm")
        annotation = extract_header_annotation(ali_raw, from_annotation=from_anno_line)
        annotation.to_csv(annotation_file, index=False)

    # Target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # First, find focus sequence in alignment
    # (match by ID prefix since headers may carry region suffixes)
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id))

    # identify what columns (non-gap) to keep for focus
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array([
        c not in [ali_raw._match_gap, ali_raw._insert_gap]
        for c in focus_seq
    ])

    # extract focus alignment (restrict to non-gap target columns)
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given.")

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)
    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # apply sequence identity and fragment filters,
    # and gap threshold
    mod_outcfg, ali = modify_alignment(
        focus_ali, focus_index, id_, region_start, **kwargs)

    # generate output configuration of protocol
    outcfg = {
        **mod_outcfg,
        "sequence_id": sequence_id,
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "target_sequence_file": target_sequence_file,
        "focus_sequence": header,
        "focus_mode": True,
    }

    # only expose annotation file if one was actually written
    if annotation_file is not None:
        outcfg["annotation_file"] = annotation_file

    # dump config to YAML file for debugging/logging
    write_config_file(prefix + ".align_existing.outcfg", outcfg)

    # return results of protocol
    return outcfg
def cut_sequence(sequence, sequence_id, region=None, first_index=None, out_file=None):
    """
    Cut a given sequence to sub-range and save it in a file

    Parameters
    ----------
    sequence : str
        Full sequence that will be cut
    sequence_id : str
        Identifier of sequence, used to construct header
        in output file
    region : tuple(int, int), optional (default: None)
        Region that will be cut out of full sequence.
        If None, full sequence will be returned.
    first_index : int, optional (default: None)
        Define index of first position in sequence.
        Will be set to 1 if None.
    out_file : str, optional (default: None)
        Save sequence in a FASTA file (header:
        >sequence_id/start_region-end_region)

    Returns
    ------
    tuple(int, int)
        Region. If no input region is given, this will be
        (first_index, first_index + len(sequence) - 1);
        otherwise, the input region is returned.
    str
        Subsequence contained in region

    Raises
    ------
    InvalidParameterError
        Upon invalid region specification
        (violating boundaries of sequence)
    """
    # (not using 1 as default value to allow parameter
    # to be unspecified in config file)
    if first_index is None:
        first_index = 1

    # last index is *inclusive*!
    if region is None:
        region = (first_index, first_index + len(sequence) - 1)
        cut_seq = sequence
    else:
        start, end = region
        str_start = start - first_index
        str_end = end - first_index + 1

        # validate bounds *before* slicing: a negative str_start
        # would otherwise silently wrap around to the end of the
        # string instead of raising
        if str_start < 0 or str_end > len(sequence):
            raise InvalidParameterError(
                "Invalid sequence range: "
                "region={} first_index={} len(sequence)={}".format(
                    region, first_index, len(sequence)))

        cut_seq = sequence[str_start:str_end]

    # save sequence to file
    if out_file is not None:
        with open(out_file, "w") as f:
            header = "{}/{}-{}".format(sequence_id, *region)
            write_fasta([(header, cut_seq)], f)

    return region, cut_seq
def complex(**kwargs):
    """
    Protocol:
    Run monomer alignment protocol and postprocess it for
    EVcomplex calculations

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the alignment protocol, and
        the following additional field:

        genome_location_file : path to file containing
            the genomic locations for CDs's corresponding to
            identifiers in the alignment.
    """
    check_required(kwargs, [
        "prefix", "alignment_protocol", "uniprot_to_embl_table",
        "ena_genome_location_table"
    ])

    # both mapping tables are external resources and must exist
    verify_resources("Uniprot to EMBL mapping table does not exist",
                     kwargs["uniprot_to_embl_table"])

    verify_resources("ENA genome location table does not exist",
                     kwargs["ena_genome_location_table"])

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # run the regular alignment protocol
    # (standard, existing, ...)
    alignment_protocol = kwargs["alignment_protocol"]

    if alignment_protocol not in PROTOCOLS:
        raise InvalidParameterError(
            "Invalid choice for alignment protocol: {}".format(
                alignment_protocol))

    outcfg = PROTOCOLS[kwargs["alignment_protocol"]](**kwargs)

    # if the user selected the existing alignment protocol
    # they can supply an input annotation file
    # which overwrites the annotation file generated by the existing protocol
    if alignment_protocol == "existing":
        check_required(kwargs, ["override_annotation_file"])

        if kwargs["override_annotation_file"] is not None:
            verify_resources("Override annotation file does not exist",
                             kwargs["override_annotation_file"])

            # copy the user-supplied annotation into the standard
            # annotation file location for this prefix
            outcfg["annotation_file"] = prefix + "_annotation.csv"
            annotation_data = pd.read_csv(kwargs["override_annotation_file"])
            annotation_data.to_csv(outcfg["annotation_file"])

    # extract cds identifiers for alignment uniprot IDs
    cds_ids = extract_cds_ids(
        outcfg["alignment_file"], kwargs["uniprot_to_embl_table"])

    # extract genome location information from ENA
    genome_location_filename = prefix + "_genome_location.csv"

    genome_location_table = extract_embl_annotation(
        cds_ids, kwargs["ena_genome_location_table"],
        genome_location_filename)

    # augment the location table with the full alignment headers
    genome_location_table = add_full_header(
        genome_location_table, outcfg["alignment_file"])

    genome_location_table.to_csv(genome_location_filename)

    outcfg["genome_location_file"] = genome_location_filename

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_complex.outcfg", outcfg)

    return outcfg
def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
    """
    Prepare an externally supplied alignment for hmmbuild:
    locate the target sequence, reduce the alignment to the
    target's non-gap columns, move the target to the first
    row, and write the result as FASTA.

    Parameters
    ----------
    input_alignment_file : str
        Path of input alignment file
    **kwargs
        Must contain at least: prefix, sequence_id, first_index

    Returns
    -------
    focus_fasta_file : str
        Path of raw focus alignment written for hmmbuild
    target_sequence_file : str
        Path of FASTA file containing ungapped target sequence
    region_start : int
        First position of target sequence region
    region_end : int
        Last position of target sequence region

    Raises
    ------
    InvalidParameterError
        If alignment format cannot be detected, sequence_id is
        undefined or not found, or no region can be determined
    """
    # this file is starting point of pipeline;
    # check if input alignment actually exists
    verify_resources("Input alignment does not exist", input_alignment_file)

    # fix: "prefix" was used below (output file paths) without ever
    # being assigned in this function, causing a NameError at runtime;
    # take it from kwargs like the other parameters
    prefix = kwargs["prefix"]

    # first try to autodetect format of alignment
    with open(input_alignment_file) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment_file))

    with open(input_alignment_file) as f:
        ali_raw = Alignment.from_file(f, format)

    # Target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError(
            "Parameter sequence_id must be defined")

    # First, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id))

    # identify what columns (non-gap) to keep for focus
    # this should be all columns in the raw_focus_alignment_file
    # but checking anyway
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array([
        c not in [ali_raw._match_gap, ali_raw._insert_gap]
        for c in focus_seq
    ])

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given.")

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)
    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # swap target sequence to first position if it is not
    # the first sequence in alignment;
    # this is particularly important for hhfilter run
    # because target sequence might otherwise be filtered out
    if focus_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = focus_index
        indices[focus_index] = 0
        focus_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    # write the raw focus alignment for hmmbuild
    focus_fasta_file = prefix + "_raw_focus_input.fasta"
    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    return focus_fasta_file, target_sequence_file, region_start, region_end
def complex(**kwargs):
    """
    Protocol:
    Compare ECs for a complex to 3D structure

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * ec_file_compared_all
        * ec_file_compared_all_longrange
        * pdb_structure_hits
        * distmap_monomer
        * distmap_multimer
        * contact_map_files
        * remapped_pdb_files
    """
    check_required(kwargs, [
        "prefix", "ec_file", "min_sequence_distance",
        "pdb_mmtf_dir", "atom_filter",
        "first_compare_multimer", "second_compare_multimer",
        "distance_cutoff", "first_sequence_id", "second_sequence_id",
        "first_sequence_file", "second_sequence_file",
        "first_segments", "second_segments",
        "first_target_sequence_file", "second_target_sequence_file",
        "scale_sizes"
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        # initialize output EC files
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv",
        "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv",

        # initialize output inter distancemap files
        "distmap_inter": prefix + "_distmap_inter",
        "inter_contacts_file": prefix + "_inter_contacts_file"
    }

    # Add PDB comparison files for first and second monomer
    # (note: "unfitered" typo kept for backward compatibility of file names)
    for monomer_prefix in ["first", "second"]:
        outcfg = {
            **outcfg,
            monomer_prefix + "_pdb_structure_hits_file":
                "{}_{}_structure_hits.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_pdb_structure_hits_unfiltered_file":
                "{}_{}_structure_hits_unfitered.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_monomer":
                "{}_{}_distance_map_monomer".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_multimer":
                "{}_{}_distance_map_multimer".format(prefix, monomer_prefix),
        }

    # make sure EC file exists
    verify_resources("EC file does not exist", kwargs["ec_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much for average user)
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    # store auxiliary files here (too much for average user)
    first_aux_prefix = insert_dir(aux_prefix, "first_monomer", rootname_subdir=False)
    create_prefix_folders(first_aux_prefix)

    # store auxiliary files here (too much for average user)
    second_aux_prefix = insert_dir(aux_prefix, "second_monomer", rootname_subdir=False)
    create_prefix_folders(second_aux_prefix)

    # Step 1: Identify 3D structures for comparison
    def _identify_monomer_structures(name_prefix, outcfg, aux_prefix):
        # create a dictionary with kwargs for just the current monomer
        # remove the "prefix" kwargs so that we can replace with the
        # aux prefix when calling _identify_structures
        # only replace first occurrence of name_prefix
        monomer_kwargs = {
            k.replace(name_prefix + "_", "", 1): v
            for k, v in kwargs.items() if "prefix" not in k
        }

        # this field needs to be set explicitly else it gets overwritten
        # by concatenated file
        monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"]
        monomer_kwargs["raw_focus_alignment_file"] = kwargs[
            name_prefix + "_raw_focus_alignment_file"]

        # identify structures for that monomer
        sifts_map, sifts_map_full = _identify_structures(
            **monomer_kwargs, prefix=aux_prefix)

        # save selected PDB hits
        sifts_map.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_file"], index=False)

        # also save full list of hits
        sifts_map_full.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False)

        return outcfg, sifts_map

    outcfg, first_sifts_map = _identify_monomer_structures(
        "first", outcfg, first_aux_prefix)
    outcfg, second_sifts_map = _identify_monomer_structures(
        "second", outcfg, second_aux_prefix)

    # get the segment names from the kwargs
    segment_list = kwargs["segments"]

    # Make sure user provided exactly two segments
    if len(segment_list) != 2:
        raise InvalidParameterError(
            "Compare stage for protein complexes requires exactly two segments"
        )

    first_segment_name = kwargs["segments"][0][0]
    second_segment_name = kwargs["segments"][1][0]

    # Step 2: Compute distance maps
    def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name):
        # prepare a sequence map to remap the structures we have found
        verify_resources("Target sequence file does not exist",
                         kwargs[name_prefix + "_target_sequence_file"])

        # create target sequence map for remapping structure
        with open(kwargs[name_prefix + "_target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        # create target sequence map for remapping structure
        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # compute distance maps and save
        # (but only if we found some structure)
        if len(sifts_map.hits) > 0:
            d_intra = intra_dists(
                sifts_map, structures, atom_filter=kwargs["atom_filter"],
                output_prefix=aux_prefix + "_" + name_prefix + "_distmap_intra")
            d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"])

            # save contacts to separate file
            outcfg[name_prefix + "_monomer_contacts_file"] = \
                prefix + "_" + name_prefix + "_contacts_monomer.csv"
            d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg[name_prefix + "_monomer_contacts_file"], index=False)

            # compute multimer distances, if requested;
            # note that d_multimer can be None if there
            # are no structures with multiple chains
            if kwargs[name_prefix + "_compare_multimer"]:
                d_multimer = multimer_dists(
                    sifts_map, structures, atom_filter=kwargs["atom_filter"],
                    output_prefix=aux_prefix + "_" + name_prefix + "_distmap_multimer")
            else:
                d_multimer = None

            # if we have a multimer contact map, save it
            if d_multimer is not None:
                d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"])
                # fix: previously missing "_" separator between prefix and
                # name_prefix produced fused filenames (inconsistent with
                # the monomer contacts file above)
                outcfg[name_prefix + "_multimer_contacts_file"] = \
                    prefix + "_" + name_prefix + "_contacts_multimer.csv"

                # save contacts to separate file
                d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                    outcfg[name_prefix + "_multimer_contacts_file"], index=False)
            else:
                outcfg[name_prefix + "_distmap_multimer"] = None

            # create remapped structures (e.g. for
            # later comparison of folding results)
            # remap structures, swap mapping index and filename in
            # dictionary so we have a list of files in the dict keys
            outcfg[name_prefix + "_remapped_pdb_files"] = {
                filename: mapping_index
                for mapping_index, filename in remap_chains(
                    sifts_map, aux_prefix, seqmap, chain_name=chain_name,
                    raise_missing=kwargs["raise_missing"]).items()
            }
        else:
            # if no structures, cannot compute distance maps
            d_intra = None
            d_multimer = None
            outcfg[name_prefix + "_distmap_monomer"] = None
            outcfg[name_prefix + "_distmap_multimer"] = None
            # fix: key previously missed the "_" separator
            # ("firstremapped_pdb_files"), so the None marker was stored
            # under a different key than the one set in the branch above
            outcfg[name_prefix + "_remapped_pdb_files"] = None

        return d_intra, d_multimer, seqmap

    # load all structures for both monomers
    all_structures = set(first_sifts_map.hits.pdb_id).union(
        set(second_sifts_map.hits.pdb_id))
    structures = load_structures(
        all_structures, kwargs["pdb_mmtf_dir"], raise_missing=False)

    d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps(
        first_sifts_map, "first", "A")
    d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps(
        second_sifts_map, "second", "B")

    # compute inter distance map if sifts map for each monomer exists
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        d_inter = inter_dists(
            first_sifts_map, second_sifts_map,
            raise_missing=kwargs["raise_missing"])
        # if there were overlapping PDBs, save the results
        if d_inter is not None:
            d_inter.to_file(outcfg["distmap_inter"])

            # save contacts to separate file
            d_inter.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg["inter_contacts_file"], index=False)
    else:
        outcfg["inter_contacts_file"] = None
        d_inter = None

    # Step 3: Compare ECs to distance maps
    ec_table = pd.read_csv(kwargs["ec_file"])

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:
        # compare ECs only if we have an intra distance map
        # for at least one monomer - inter can't exist unless
        # we have both monomers
        if (d_intra_i is not None) or (d_intra_j is not None):
            # compare distances individually for each segment pair
            ecs_intra_i = ec_table.query(
                "segment_i == segment_j == @first_segment_name")
            if d_intra_i is not None:
                ecs_intra_i_compared = coupling_scores_compared(
                    ecs_intra_i, d_intra_i, d_multimer_i,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist)
            else:
                # If no distance map, the distance is saved as np.nan
                ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan)

            ecs_intra_j = ec_table.query(
                "segment_i == segment_j == @second_segment_name")
            if d_intra_j is not None:
                ecs_intra_j_compared = coupling_scores_compared(
                    ecs_intra_j, d_intra_j, d_multimer_j,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist)
            else:
                ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan)

            ecs_inter = ec_table.query("segment_i != segment_j")
            if d_inter is not None:
                ecs_inter_compared = coupling_scores_compared(
                    ecs_inter, d_inter, dist_map_multimer=None,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=None  # does not apply for inter-protein ECs
                )
            else:
                ecs_inter_compared = ecs_inter.assign(dist=np.nan)

            # combine the tables
            ec_table_compared = pd.concat([
                ecs_inter_compared,
                ecs_intra_i_compared,
                ecs_intra_j_compared
            ])

            # rename the precision column to "segmentwise_precision"
            # because we calculated precision for each segment independently
            ec_table_compared = ec_table_compared.rename(
                columns={"precision": "segmentwise_precision"})
            # TODO: change "cn" to "score" eventually
            ec_table_compared = ec_table_compared.sort_values(
                "cn", ascending=False)

            # add the total precision
            # TODO: implement different cutoffs for intra vs inter contacts
            ec_table_compared = add_precision(
                ec_table_compared, dist_cutoff=kwargs["distance_cutoff"])

            # save to file
            # all ecs
            ec_table_compared.to_csv(outcfg[out_file])

            # save the inter ECs to a file
            ecs_inter_compared.to_csv(outcfg["ec_compared_inter_file"])

    # create the inter-ecs line drawing script
    if outcfg["ec_compared_inter_file"] is not None and \
            kwargs["plot_highest_count"] is not None:
        inter_ecs = ec_table.query("segment_i != segment_j")
        outcfg["ec_lines_compared_pml_file"] = \
            prefix + "_draw_ec_lines_compared.pml"
        pairs.ec_lines_pymol_script(
            inter_ecs.iloc[:kwargs["plot_highest_count"], :],
            outcfg["ec_lines_compared_pml_file"],
            distance_cutoff=kwargs["distance_cutoff"],
            chain={
                first_segment_name: "A",
                second_segment_name: "B"
            })

    # Remap the complex crystal structures, if available
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        outcfg["complex_remapped_pdb_files"] = {
            filename: mapping_index
            for mapping_index, filename in remap_complex_chains(
                first_sifts_map, second_sifts_map,
                seqmap_i, seqmap_j, output_prefix=aux_prefix,
                raise_missing=kwargs["raise_missing"]).items()
        }

    # Step 4: Make contact map plots
    # if no structures available, defaults to EC-only plot
    outcfg["contact_map_files"] = _make_complex_contact_maps(
        ec_table, d_intra_i, d_multimer_i,
        d_intra_j, d_multimer_j, d_inter,
        first_segment_name, second_segment_name, **kwargs)

    return outcfg
def complex(**kwargs):
    """
    Protocol:
    Infer ECs for protein complexes from alignment using plmc.
    Allows user to select scoring protocol.

    Note: intentionally shadows the ``complex`` builtin — protocol
    functions are looked up by name from the pipeline configuration.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required
        and infer_plmc()

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        raw_ec_file
        model_file
        num_sites
        num_sequences
        effective_sequences
        focus_mode (passed through)
        focus_sequence (passed through)
        segments (passed through)

    Raises
    ------
    InvalidParameterError
        If scoring_model is not one of SCORING_MODELS
    """
    # for additional required parameters, see infer_plmc()
    check_required(
        kwargs,
        [
            "prefix", "min_sequence_distance",
            "scoring_model", "use_all_ecs_for_scoring",
        ]
    )

    prefix = kwargs["prefix"]

    # infer ECs and load them
    outcfg, ecs, segments = infer_plmc(**kwargs)
    model = CouplingsModel(outcfg["model_file"])

    # following computations are mostly specific to complex pipeline

    # add mixture model probability
    if kwargs["scoring_model"] in SCORING_MODELS:
        if kwargs["use_all_ecs_for_scoring"] is not None:
            use_all_ecs = kwargs["use_all_ecs_for_scoring"]
        else:
            use_all_ecs = False

        ecs = complex_probability(
            ecs, kwargs["scoring_model"], use_all_ecs
        )
    else:
        # bug fix: previously reported kwargs["protocol"] here, which is
        # not the parameter that failed validation
        raise InvalidParameterError(
            "Invalid scoring_model parameter: " +
            "{}. Valid options are: {}".format(
                kwargs["scoring_model"], ", ".join(SCORING_MODELS)
            )
        )

    # also create line-drawing script (for multiple chains)
    # by convention, we map first segment to chain A,
    # second to B, a.s.f.
    chain_mapping = dict(
        zip(
            [s.segment_id for s in segments],
            string.ascii_uppercase,
        )
    )

    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_line_plot=True,
            generate_enrichment=False,
            ec_filter="segment_i != segment_j or abs(i - j) >= {}",
            chain=chain_mapping
        )
    }

    # save just the inter protein ECs
    ## TODO: eventually have this accomplished by _postprocess_inference
    ## right now avoiding a second call with a different ec_filter
    ecs = pd.read_csv(outcfg["ec_file"])
    outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv"
    inter_ecs = ecs.query("segment_i != segment_j")
    inter_ecs.to_csv(outcfg["inter_ec_file"], index=False)

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_complex.outcfg", outcfg)

    # TODO: make the following complex-ready
    # EC enrichment:
    #
    # 1) think about making EC enrichment complex-ready and add
    # it back here - so far it only makes sense if all ECs are
    # on one segment
    #
    # EVzoom:
    #
    # 1) at the moment, EVzoom will use numbering before remapping
    # we should eventually get this to a point where segments + residue
    # index are displayed on EVzoom
    #
    # 2) note that this will currently use the default mixture model
    # selection for determining the EC cutoff, rather than the selection
    # used for the EC table above

    return outcfg
def execute(**config):
    """
    Execute a pipeline configuration

    Parameters
    ----------
    **config
        Input configuration for pipeline
        (see pipeline config files for example of how this
        should look like)

    Returns
    -------
    global_state : dict
        Global output state of pipeline

    Raises
    ------
    InvalidParameterError
        If pipeline selection is invalid or no stages are defined
    """
    check_required(config, ["pipeline", "stages", "global"])

    # check if valid pipeline was selected
    if config["pipeline"] not in PIPELINES:
        raise InvalidParameterError(
            "Not a valid pipeline selection. "
            "Valid choices are:\n{}".format(
                ", ".join(PIPELINES.keys())
            )
        )

    stages = config["stages"]
    if stages is None:
        raise InvalidParameterError("No stages defined, need at least one.")

    # get definition of selected pipeline
    pipeline = PIPELINES[config["pipeline"]]
    prefix = config["global"]["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this is the global state of results as
    # we move through different stages of
    # the pipeline
    global_state = config["global"]

    # keep track of how many stages are still
    # to be run, so we can leave out stages at
    # the end of workflow below
    num_stages_to_run = len(stages)

    # get job tracker
    tracker = get_result_tracker(config)

    # set job status to running and also initialize global state
    tracker.update(status=EStatus.RUN, results=global_state)

    # iterate through individual stages
    for (stage, runner, key_prefix) in pipeline:
        # check if anything else is left to
        # run, otherwise skip
        if num_stages_to_run == 0:
            break

        # check if config for stage is there
        check_required(config, [stage])

        # output files for stage into an individual folder
        stage_prefix = insert_dir(prefix, stage)
        create_prefix_folders(stage_prefix)

        # config files for input and output of stage
        stage_incfg = "{}_{}.incfg".format(stage_prefix, stage)
        stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage)

        # update current stage of job
        tracker.update(stage=stage)

        # check if stage should be executed
        if stage in stages:
            # global state inserted at end, overrides any
            # stage-specific settings (except for custom prefix)
            incfg = {
                **config["tools"],
                **config["databases"],
                **config[stage],
                **global_state,
                "prefix": stage_prefix
            }

            # save input of stage in config file
            write_config_file(stage_incfg, incfg)

            # run stage
            outcfg = runner(**incfg)

            # prefix output keys if this parameter is
            # given in stage configuration, to avoid
            # name clashes if same protocol run multiple times
            if key_prefix is not None:
                outcfg = {key_prefix + k: v for k, v in outcfg.items()}

            # save output of stage in config file
            write_config_file(stage_outcfg, outcfg)

            # one less stage to put through after we ran this...
            num_stages_to_run -= 1
        else:
            # skip stage by injecting state from previous run
            # (bug fix: format() previously received a redundant
            # second argument for the single placeholder)
            verify_resources(
                "Trying to skip, but output configuration "
                "for stage '{}' does not exist. Has it already "
                "been run?".format(stage),
                stage_outcfg
            )

            # read output configuration
            outcfg = read_config_file(stage_outcfg)

            # verify all the output files are there
            outfiles = [
                filepath for f, filepath in outcfg.items()
                if f.endswith("_file") and filepath is not None
            ]

            verify_resources(
                "Output files from stage '{}' "
                "missing".format(stage),
                *outfiles
            )

        # update global state with outputs of stage
        global_state = {**global_state, **outcfg}

        # update state in tracker accordingly
        tracker.update(results=outcfg)

    # create results archive
    archive_file = create_archive(config, global_state, prefix)

    # only store results archive if a result file was created
    if archive_file is not None:
        global_state["archive_file"] = archive_file

        # prepare update for tracker, but only store in last
        # go when job is set to done
        tracker_archive_update = {"archive_file": archive_file}
    else:
        tracker_archive_update = None

    # set job status to done and transfer archive if selected for syncing
    tracker.update(status=EStatus.DONE, results=tracker_archive_update)

    # delete selected output files if requested;
    # tracker does not need to update here since it won't
    # sync entries of delete list in the first place
    global_state = delete_outputs(config, global_state)

    # write final global state of pipeline
    write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state)

    return global_state
def run_jobs(configs, global_config, overwrite=False, workdir=None,
             abort_on_error=True, environment=None):
    """
    Submit config to pipeline

    Writes the global and per-subjob configuration files to disk
    before submitting anything (the summarizer needs the paths to
    all files), then submits one command per subjob via the
    configured submission engine.

    Parameters
    ----------
    configs : dict
        Configurations for individual subjobs
    global_config : dict
        Master configuration (if only one job,
        the contents of this dictionary will be
        equal to the single element of config_files)
    overwrite : bool, optional (default: False)
        If True, allows overwriting previous run of the same
        config, otherwise will fail if results from previous
        execution are present
    workdir : str, optional (default: None)
        Workdir in which to run job (will combine
        workdir and prefix in joint path)
    abort_on_error : bool, optional (default: True)
        Abort entire job submission if error occurs for one
        of the jobs by propagating RuntimeError
    environment : str, optional (default: None)
        Allow to pass value for environment parameter of
        submitter, will override environment.configuration
        from global_config (e.g., for setting environment
        variables like passwords)

    Returns
    -------
    job_ids : dict
        Mapping from subjob prefix (keys in configs parameter)
        to identifier returned by submitter for each of the jobs
        that was *successfully* submitted (i.e. missing keys from
        configs param indicate these jobs could not be submitted).

    Raises
    ------
    RuntimeError
        If error encountered during submission and
        abort_on_error is True
    """
    # executables may be overridden through environment variables,
    # otherwise fall back to the console-script names
    cmd_base = environ.get("EVCOUPLINGS_RUNCFG_APP") or "evcouplings_runcfg"
    summ_base = environ.get(
        "EVCOUPLINGS_SUMMARIZE_APP") or "evcouplings_summarize"

    # determine output directory for config files
    prefix = global_config["global"]["prefix"]

    # integrate working directory into output prefix
    # if it is given; if prefix contains an absolute path,
    # this will override the workdir according to
    # implementation of path.join()
    if workdir is not None:
        out_prefix = path.join(workdir, prefix)
    else:
        out_prefix = prefix

    # save configuration file, make sure we do not overwrite previous run
    # if overwrite protection is activated
    # (but only if it is a valid configuration file with contents)
    cfg_filename = CONFIG_NAME.format(out_prefix)

    if not overwrite and valid_file(cfg_filename):
        raise InvalidParameterError(
            "Existing configuration file {} ".format(cfg_filename) +
            "indicates current prefix {} ".format(prefix) +
            "would overwrite existing results. Use --yolo " +
            "flag to deactivate overwrite protection (e.g. for "
            "restarting a job or running a different stage)."
        )

    # make sure working directory exists
    create_prefix_folders(cfg_filename)

    # write global config file
    write_config_file(cfg_filename, global_config)

    # also write individual subjob configuration files
    # (we have to write these before submitting, since
    # the job summarizer needs the paths to all files)
    for subjob_prefix, subjob_cfg in configs.items():
        # determine working dir for each subjob, since subjob
        # prefix may contain slashes leading to subfolder creation
        if workdir is not None:
            subjob_out_prefix = path.join(workdir, subjob_prefix)
        else:
            subjob_out_prefix = subjob_prefix

        subcfg_filename = CONFIG_NAME.format(subjob_out_prefix)

        # make sure output subfolder exists
        create_prefix_folders(subcfg_filename)

        # write subjob configuration file
        write_config_file(subcfg_filename, subjob_cfg)

    # now create list of subjob config files relative to working
    # directory (above, we allow to run submitted in arbitrary directory)
    config_files = [
        CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs
    ]

    # create command for summarizer (needs to know all subjob config files)
    summ_cmd = "{} {} {} {}".format(
        summ_base,
        global_config["pipeline"],
        global_config["global"]["prefix"],
        " ".join(config_files)
    )

    # create submitter from global (pre-unrolling) configuration
    submitter = utils.SubmitterFactory(
        global_config["environment"]["engine"],
        db_path=out_prefix + "_job_database.txt"
    )

    # collect individual submitted jobs here
    # NOTE(review): commands is appended to but never read afterwards —
    # presumably kept for future dependency handling between jobs
    commands = []

    # record subjob IDs returned by submitter for each job
    job_ids = {}

    # prepare individual jobs for submission
    for job, job_cfg in configs.items():
        job_prefix = job_cfg["global"]["prefix"]
        job_cfg_file = CONFIG_NAME.format(job)

        # create submission command: run the subjob, then the
        # summarizer (so summary updates after every finished subjob)
        env = job_cfg["environment"]
        cmd = utils.Command(
            [
                "{} {}".format(cmd_base, job_cfg_file),
                summ_cmd
            ],
            name=job_prefix,
            environment=environment or env["configuration"],
            workdir=workdir,
            resources={
                utils.EResource.queue: env["queue"],
                utils.EResource.time: env["time"],
                utils.EResource.mem: env["memory"],
                utils.EResource.nodes: env["cores"],
                utils.EResource.out: job_prefix + "_stdout.log",
                utils.EResource.error: job_prefix + "_stderr.log",
            }
        )

        # store job for later dependency creation
        commands.append(cmd)

        # per-subjob tracker (distinct from any global tracker)
        tracker = get_result_tracker(job_cfg)

        try:
            # finally, submit job
            current_job_id = submitter.submit(cmd)

            # store run identifier returned by submitter
            # TODO: consider storing current_job_id using tracker right away
            job_ids[job] = current_job_id

            # set job status in database to pending
            tracker.update(status=EStatus.PEND)
        except RuntimeError as e:
            # set job as failed in database
            tracker.update(status=EStatus.FAIL, message=str(e))

            # fail entire job submission if requested;
            # otherwise continue with remaining jobs (failed job is
            # simply missing from job_ids)
            if abort_on_error:
                raise

    # submit final summarizer
    # (hold for now - summarizer is run after each subjob finishes)

    # wait for all runs to finish (but only if blocking)
    submitter.join()

    # return job identifiers
    return job_ids
def substitute_config(**kwargs):
    """
    Substitute command line arguments into config file

    Parameters
    ----------
    **kwargs
        Command line parameters to be substituted
        into configuration file

    Returns
    -------
    dict
        Updated configuration

    Raises
    ------
    ResourceError
        If config file does not exist or is empty
    InvalidParameterError
        If region or threshold strings are malformed, or both
        bitscore and E-value thresholds are given
    """
    # mapping of command line parameters to config file entries
    CONFIG_MAP = {
        "prefix": ("global", "prefix"),
        "protein": ("global", "sequence_id"),
        "seqfile": ("global", "sequence_file"),
        "alignment": ("align", "input_alignment"),
        "iterations": ("align", "iterations"),
        "id": ("align", "seqid_filter"),
        "seqcov": ("align", "minimum_sequence_coverage"),
        "colcov": ("align", "minimum_column_coverage"),
        "theta": ("global", "theta"),
        "plmiter": ("couplings", "iterations"),
        "queue": ("environment", "queue"),
        "time": ("environment", "time"),
        "cores": ("environment", "cores"),
        "memory": ("environment", "memory"),
    }

    # try to read in configuration
    config_file = kwargs["config"]
    if not valid_file(config_file):
        raise ResourceError(
            "Config file does not exist or is empty: {}".format(config_file)
        )

    config = read_config_file(config_file, preserve_order=True)

    # substitute command-line parameters into configuration
    # (if straightforward substitution)
    for param, value in kwargs.items():
        if param in CONFIG_MAP and value is not None:
            outer, inner = CONFIG_MAP[param]
            config[outer][inner] = value

    # make sure that number of CPUs requested by
    # programs within pipeline does not exceed
    # number of cores requested in environment
    if config["environment"]["cores"] is not None:
        config["global"]["cpu"] = config["environment"]["cores"]

    # handle the more complicated parameters

    # If alignment is given, run "existing" protocol
    if kwargs.get("alignment", None) is not None:
        # TODO: think about what to do if sequence_file is given
        # (will not be used)
        config["align"]["protocol"] = "existing"

    # subregion of protein
    if kwargs.get("region", None) is not None:
        region = kwargs["region"]
        # raw string to avoid invalid escape sequence warning
        m = re.search(r"(\d+)-(\d+)", region)
        if m:
            start, end = map(int, m.groups())
            config["global"]["region"] = [start, end]
        else:
            # bug fix: message previously had no placeholder, so the
            # offending region string was silently dropped
            raise InvalidParameterError(
                "Region string does not have format "
                "start-end (e.g. 5-123): {}".format(region)
            )

    # pipeline stages to run
    if kwargs.get("stages", None) is not None:
        config["stages"] = kwargs["stages"].replace(" ", "").split(",")

    # sequence alignment input database
    if kwargs.get("database", None) is not None:
        db = kwargs["database"]

        # check if we have a predefined sequence database
        # if so, use it; otherwise, interpret as file path
        if db in config["databases"]:
            config["align"]["database"] = db
        else:
            config["align"]["database"] = "custom"
            config["databases"]["custom"] = db

    # make sure bitscore and E-value thresholds are exclusively set
    if kwargs.get("bitscores", None) is not None and kwargs.get(
            "evalues", None) is not None:
        raise InvalidParameterError(
            "Can not specify bitscore and E-value threshold at the same time."
        )

    if kwargs.get("bitscores", None) is not None:
        thresholds = kwargs["bitscores"]
        bitscore = True
    elif kwargs.get("evalues", None) is not None:
        thresholds = kwargs["evalues"]
        bitscore = False
    else:
        thresholds = None

    if thresholds is not None:
        T = thresholds.replace(" ", "").split(",")
        try:
            x_cast = [(float(t) if "." in t else int(t)) for t in T]
        except ValueError as e:
            # chain cause for easier debugging (consistent with
            # error handling style elsewhere in this module)
            raise InvalidParameterError(
                "Bitscore/E-value threshold(s) must be numeric: "
                "{}".format(thresholds)
            ) from e

        config["align"]["use_bitscores"] = bitscore

        # check if we have a single threshold (single job)
        # or if we need to create an array of jobs
        if len(x_cast) == 1:
            config["align"]["domain_threshold"] = x_cast[0]
            config["align"]["sequence_threshold"] = x_cast[0]
        else:
            config["batch"] = {}
            for t in x_cast:
                sub_prefix = ("_b" if bitscore else "_e") + str(t)
                config["batch"][sub_prefix] = {
                    "align": {
                        "domain_threshold": t,
                        "sequence_threshold": t,
                    }
                }

    return config
def compare_models_maxcluster(experiments, predictions, norm_by_intersection=True,
                              distance_cutoff=None, binary="maxcluster"):
    """
    Compare predicted models to a set of experimental structures
    using maxcluster

    Parameters
    ----------
    experiments : list(str)
        Paths to files with experimental structures
    predictions : list(str)
        Paths to files with predicted structures
    norm_by_intersection : bool, optional (default: True)
        If True, use the number of positions that exist in both
        experiment and predictions for normalizing TM scores
        (assumes all predicted structures have the same positions).
        If False, use length of experimental structure.
    distance_cutoff : float, optional (default: None)
        Distance cutoff for MaxSub search (-d option of maxcluster).
        If None, will use maxcluster auto-calibration.
    binary : str, optional (default: "maxcluster")
        Path to maxcluster binary

    Returns
    -------
    full_result : pandas.DataFrame
        Comparison results across all experimental structures
    single_results : dict
        Mapping from experimental structure filename to
        a pandas.DataFrame containing the comparison result
        for that particular structure.

    Raises
    ------
    InvalidParameterError
        If predictions is empty, or any structure is not a
        single-model, single-chain structure
    """
    # determine list of positions in a structure
    # (maxcluster can only handle single model, single chain
    # structures, so we check that here and fail otherwise)
    def _determine_pos(filename):
        structure = ClassicPDB.from_file(filename)
        if len(structure.model_to_chains) == 0:
            raise InvalidParameterError(
                "Structure contains no model (is empty): " + filename +
                " - please verify that no problems occurred during structure mapping"
            )
        elif len(structure.model_to_chains) > 1:
            raise InvalidParameterError(
                "Structure contains more than one model: " + filename
            )

        model = list(structure.model_to_chains.keys())[0]
        chains = structure.model_to_chains[model]
        if len(chains) != 1:
            raise InvalidParameterError(
                "Structure must contain exactly one chain, but contains: " +
                ",".join(chains)
            )
        chain_name = chains[0]
        chain = structure.get_chain(chain_name, model)
        return chain.residues.id.astype(str).values, chain

    # remove alternative atom locations since maxcluster
    # can only handle one unique atom per position
    def _eliminate_altloc(chain):
        # if multiple locations, select the one with the
        # highest occupancy
        chain.coords = chain.coords.loc[
            chain.coords.groupby(
                ["residue_index", "atom_name"]
            ).occupancy.idxmax()
        ]

        # save cut chain to temporary file
        temp_filename = temp()
        with open(temp_filename, "w") as f:
            chain.to_file(f)
        return temp_filename

    # check we have at least one prediction
    if len(predictions) == 0:
        raise InvalidParameterError(
            "Need at least one predicted structure."
        )

    # determine positions in predicted structure from first model
    pred_pos, _ = _determine_pos(predictions[0])

    # collect results of all comparisons here
    all_results = []
    single_results = {}

    for exp_file in experiments:
        # determine what number of position to normalize
        # TM score over (either experiment, or only positions
        # that were modelled and are also present in experiment)
        exp_pos, exp_chain = _determine_pos(exp_file)

        # remove alternative atom locations
        exp_file_cleaned = _eliminate_altloc(exp_chain)

        # compute set of positions both in prediction and experiment
        joint_pos = set(exp_pos).intersection(pred_pos)
        if norm_by_intersection:
            normalization_length = len(joint_pos)
        else:
            normalization_length = len(exp_pos)

        # run comparison
        comp = run_maxcluster_compare(
            predictions, exp_file_cleaned,
            normalization_length=normalization_length,
            distance_cutoff=distance_cutoff,
            binary=binary
        )

        # store lengths of experiment, prediction,
        # and what was used for computing TM scores
        comp.loc[:, "filename_experimental"] = exp_file
        comp.loc[:, "L_experiment"] = len(exp_pos)
        comp.loc[:, "L_prediction"] = len(pred_pos)
        comp.loc[:, "L_joint"] = len(joint_pos)
        comp.loc[:, "L_normalization"] = normalization_length

        comp = comp.sort_values("tm", ascending=False)
        single_results[exp_file] = comp
        all_results.append(comp)

    # bug fix: DataFrame.append was removed in pandas 2.0;
    # collect per-structure tables and concatenate once
    if all_results:
        full_result = pd.concat(all_results)
    else:
        full_result = pd.DataFrame()

    return full_result, single_results
def standard(**kwargs):
    """
    Protocol:
    Predict 3D structure from evolutionary couplings

    Runs secondary structure prediction (or reuses a previous one),
    filters ECs by sequence distance and secondary structure clashes,
    defines a set of folding sub-runs (by probability cutoff and/or
    EC count bins), folds them in parallel, then ranks, clusters and
    optionally compares the resulting models to experimental
    structures.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sec_struct_file
        * folding_ec_file
        * folded_structure_files
    """
    check_required(
        kwargs,
        [
            "prefix", "engine", "ec_file", "target_sequence_file",
            "segments", "folding_config_file", "cut_to_alignment_region",
            "sec_struct_method", "reuse_sec_struct",
            "sec_struct_file", "filter_sec_struct_clashes",
            "min_sequence_distance", "fold_probability_cutoffs",
            "fold_lowest_count", "fold_highest_count", "fold_increase",
            "num_models", "psipred", "cpu", "remapped_pdb_files",
            "cleanup",
        ]
    )
    # NOTE(review): kwargs["cns"] and kwargs["maxcluster"] are read
    # further down but are not in the check_required list — confirm
    # they are guaranteed by the stage configuration

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    outcfg = {
        "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv",
        "sec_struct_file": prefix + "_secondary_structure.csv",
    }

    # get secondary structure prediction
    # check if we should (and can) reuse output file from previous run
    if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]):
        residues = pd.read_csv(outcfg["sec_struct_file"])
    else:
        residues = secondary_structure(**kwargs)

    # make pymol secondary structure assignment script
    outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml"
    pymol_secondary_structure(
        residues, outcfg["secondary_structure_pml_file"]
    )

    # load ECs and filter for long-range pairs
    verify_resources(
        "EC file does not exist", kwargs["ec_file"]
    )
    ecs_all = pd.read_csv(kwargs["ec_file"])
    ecs = ecs_all.query("abs(i - j) > {}".format(
        kwargs["min_sequence_distance"])
    )

    # find secondary structure clashes (adds ss_clash column)
    ecs = secstruct_clashes(ecs, residues)
    ecs.to_csv(outcfg["folding_ec_file"], index=False)

    # if requested, filter clashes out before folding
    if kwargs["filter_sec_struct_clashes"]:
        ecs_fold = ecs.loc[~ecs.ss_clash]
    else:
        ecs_fold = ecs

    # cut modelled region to aligned region, if selected
    if kwargs["cut_to_alignment_region"]:
        segments = kwargs["segments"]
        # infer region from segment positions if we have it
        if segments is not None:
            positions = Segment.from_list(segments[0]).positions
        else:
            # otherwise get from EC values (could be misleading if
            # EC list is truncated, so only second option)
            positions = set(ecs.i.unique()).union(ecs.j.unique())

        # limit modelled positions to covered region
        first_pos, last_pos = min(positions), max(positions)
        residues.loc[:, "in_model"] = False
        residues.loc[
            (residues.i >= first_pos) & (residues.i <= last_pos),
            "in_model"
        ] = True
    else:
        # otherwise include all positions in model
        residues.loc[:, "in_model"] = True

    # save secondary structure prediction
    residues.to_csv(outcfg["sec_struct_file"], index=False)

    # only use the residues that will be in model for folding
    residues_fold = residues.loc[residues.in_model]

    # after all the setup, now fold the structures...
    # to speed things up, parallelize this to the number of
    # available CPUs
    num_procs = kwargs["cpu"]
    if num_procs is None:
        num_procs = 1

    # first define all the sub-runs...
    folding_runs = []

    # ... based on mixture model probability
    cutoffs = kwargs["fold_probability_cutoffs"]
    if cutoffs is not None and "probability" in ecs_fold.columns:
        if not isinstance(cutoffs, list):
            cutoffs = [cutoffs]

        for c in cutoffs:
            sig_ecs = ecs_fold.query("probability >= @c")
            if len(sig_ecs) > 0:
                folding_runs.append(
                    (sig_ecs, "_significant_ECs_{}".format(c))
                )

    # ... and on simple EC counts/bins
    flc = kwargs["fold_lowest_count"]
    fhc = kwargs["fold_highest_count"]
    fi = kwargs["fold_increase"]
    if flc is not None and fhc is not None and fi is not None:
        num_sites = len(
            set.union(set(ecs.i.unique()), set(ecs.j.unique()))
        )

        # transform fraction of number of sites into discrete number of ECs
        def _discrete_count(x):
            # float values are interpreted as a fraction of num_sites
            if isinstance(x, float):
                x = ceil(x * num_sites)
            return int(x)

        # range of plots to make
        lowest = _discrete_count(flc)
        highest = _discrete_count(fhc)
        step = _discrete_count(fi)

        # append to list of jobs to run
        folding_runs += [
            (
                ecs_fold.iloc[:c],
                "_{}".format(c)
            )
            for c in range(lowest, highest + 1, step)
        ]

    # set up method to drive the folding of each job
    method = kwargs["engine"]

    # store structures in an auxiliary subdirectory, after folding
    # final models will be moved to main folding dir. Depending
    # on cleanup setting, the aux directory will be removed
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    aux_dir = path.dirname(aux_prefix)

    folding_runs = [
        (job_ecs, aux_prefix + job_suffix)
        for (job_ecs, job_suffix) in folding_runs
    ]

    if method == "cns_dgsa":
        # pre-bind all shared arguments; each sub-run only supplies
        # (ECs, output prefix) via pool.starmap below
        folder = partial(
            cns_dgsa_fold,
            residues_fold,
            config_file=kwargs["folding_config_file"],
            num_structures=kwargs["num_models"],
            log_level=None,
            binary=kwargs["cns"]
        )
    else:
        raise InvalidParameterError(
            "Invalid folding engine: {} ".format(method) +
            "Valid selections are: cns_dgsa"
        )

    # then apply folding function to each sub-run
    pool = mp.Pool(processes=num_procs)
    results = pool.starmap(folder, folding_runs)

    # make double sure that the pool is cleaned up,
    # or SIGTERM upon exit will interfere with
    # interrupt signal interception
    pool.close()
    pool.join()

    # merge result dictionaries into one dict
    folded_files = {
        k: v for subres in results for k, v in subres.items()
    }

    # move structures from aux into main folding dir
    fold_dir = path.dirname(prefix)
    prediction_files = []
    for name, file_path in folded_files.items():
        # move file (use copy to allow overwriting)
        shutil.copy(file_path, fold_dir)

        # update file path to main folding dir,
        # and put in a flat list of result files
        prediction_files.append(
            file_path.replace(aux_prefix, prefix)
        )

    outcfg["folded_structure_files"] = prediction_files

    # remove aux dir if cleanup is requested
    if kwargs["cleanup"]:
        shutil.rmtree(aux_dir)

    # apply ranking to predicted models
    ranking = dihedral_ranking(prediction_files, residues)

    # apply clustering (all available methods), but only
    # if we have something to cluster
    if len(prediction_files) > 1:
        clustering = maxcluster_clustering_table(
            prediction_files, binary=kwargs["maxcluster"]
        )

        # join ranking with clustering
        ranking = ranking.merge(clustering, on="filename", how="left")

    # sort by score (best models first)
    ranking = ranking.sort_values(by="ranking_score", ascending=False)

    # store as file
    outcfg["folding_ranking_file"] = prefix + "_ranking.csv"
    ranking.to_csv(outcfg["folding_ranking_file"], index=False)

    # apply comparison to existing structures
    if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0:
        experimental_files = kwargs["remapped_pdb_files"]

        comp_all, comp_singles = compare_models_maxcluster(
            list(experimental_files.keys()), prediction_files,
            norm_by_intersection=True, distance_cutoff=None,
            binary=kwargs["maxcluster"]
        )

        # merge with ranking and save
        comparison = ranking.merge(
            comp_all, on="filename", how="left"
        ).sort_values(by="tm", ascending=False)
        outcfg["folding_comparison_file"] = prefix + "_comparison.csv"
        comparison.to_csv(outcfg["folding_comparison_file"], index=False)

        # also store comparison to structures in individual files
        ind_comp_files = {}
        for filename, comp_single in comp_singles.items():
            comparison_s = ranking.merge(
                comp_single, on="filename", how="left"
            ).sort_values(by="tm", ascending=False)
            basename = path.splitext(path.split(filename)[1])[0]
            ind_file = path.join(fold_dir, basename + ".csv")

            # map back to original key from remapped_pdb_files
            # as a key for this list
            ind_comp_files[ind_file] = experimental_files[filename]
            comparison_s.to_csv(ind_file, index=False)

        outcfg["folding_individual_comparison_files"] = ind_comp_files

    return outcfg
def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
    """
    Identify homologs using jackhmmer or hmmbuild/hmmsearch

    Parameters
    ----------
    pdb_alignment_method : {"jackhmmer", "hmmsearch"},
            optional (default: "jackhmmer")
        Sequence alignment method used for searching the PDB
    **kwargs
        Passed into jackhmmer / hmmbuild_and_search protocol
        (see documentation for available options)

    Returns
    -------
    ali : evcouplings.align.Alignment
        Alignment of homologs of query sequence
        in sequence database
    hits : pandas.DataFrame
        Tabular representation of hits
    """
    # start from the default configuration and layer
    # caller-supplied overrides on top of it
    search_cfg = {
        **parse_config(HMMER_CONFIG),
        **kwargs,
    }

    # fall back to a temporary output location if no prefix given
    if search_cfg["prefix"] is None:
        search_cfg["prefix"] = path.join(tempdir(), "compare")

    check_required(search_cfg, ["prefix"])

    if pdb_alignment_method == "hmmsearch":
        # run hmmsearch preceded by hmmbuild, using the
        # unfiltered alignment file as HMM input
        hmm_cfg = deepcopy(search_cfg)
        hmm_cfg["alignment_file"] = search_cfg.get(
            "raw_focus_alignment_file")
        search_result = hmmbuild_and_search(**hmm_cfg)

        # For hmmbuild and search, we have to read the raw focus
        # alignment file to guarantee that the query sequence is present
        with open(search_result["raw_focus_alignment_file"]) as aln_handle:
            ali = Alignment.from_file(aln_handle, "fasta")
    elif pdb_alignment_method == "jackhmmer":
        # run jackhmmer against sequence database
        # (input validation above guarantees this is the only
        # remaining valid option)
        search_result = jackhmmer_search(**search_cfg)

        with open(search_result["raw_alignment_file"]) as aln_handle:
            ali = Alignment.from_file(aln_handle, "stockholm")

        # additionally dump a FASTA copy of the alignment for
        # easier checking by hand, if necessary
        with open(search_cfg["prefix"] + "_raw.fasta", "w") as fasta_handle:
            ali.write(fasta_handle)
    else:
        raise InvalidParameterError(
            "Invalid pdb_alignment_method selected. Valid options are: " +
            ", ".join(["jackhmmer", "hmmsearch"])
        )

    # read hmmer hittable and simplify
    hits = read_hmmer_domtbl(search_result["hittable_file"])

    # target names have the shape db|accession|id — split out parts
    hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[1]
    )
    hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[2]
    )

    # align column naming with the rest of the pipeline
    hits = hits.rename(
        columns={
            "domain_score": "bitscore",
            "domain_i_Evalue": "e_value",
            "ali_from": "alignment_start",
            "ali_to": "alignment_end",
            "hmm_from": "hmm_start",
            "hmm_to": "hmm_end",
        }
    )

    # alignment coordinates are parsed as strings; make them integers
    hits.loc[:, "alignment_start"] = pd.to_numeric(
        hits.alignment_start).astype(int)
    hits.loc[:, "alignment_end"] = pd.to_numeric(
        hits.alignment_end).astype(int)

    # build identifier of the form name/start-end
    hits.loc[:, "alignment_id"] = (
        hits.target_name + "/" +
        hits.alignment_start.astype(str) + "-" +
        hits.alignment_end.astype(str)
    )

    # restrict the table to the columns used downstream
    hits = hits.loc[
        :, [
            "alignment_id", "uniprot_ac", "uniprot_id",
            "alignment_start", "alignment_end",
            "bitscore", "e_value"
        ]
    ]

    return ali, hits
def _identify_structures(**kwargs):
    """
    Identify set of 3D structures for comparison

    Parameters
    ----------
    **kwargs
        See check_required in code below

    Returns
    -------
    sifts_map : SIFTSResult
        Identified structures and residue index mappings,
        filtered by pdb_ids / max_num_hits / max_num_structures
    sifts_map_full : SIFTSResult
        Unfiltered copy of the identified structures

    Raises
    ------
    InvalidParameterError
        If by_alignment is set but pdb_alignment_method is not
        a valid search method
    """
    # restrict hit table of a SIFTSResult to the given PDB ids
    def _filter_by_id(x, id_list):
        x = deepcopy(x)
        x.hits = x.hits.loc[x.hits.pdb_id.isin(id_list)]
        return x

    check_required(kwargs, [
        "prefix", "pdb_ids", "compare_multimer",
        "max_num_hits", "max_num_structures",
        "pdb_mmtf_dir",
        "sifts_mapping_table", "sifts_sequence_db",
        "by_alignment", "pdb_alignment_method",
        "alignment_min_overlap",
        "sequence_id", "sequence_file", "region",
        "use_bitscores", "domain_threshold",
        "sequence_threshold"
    ])

    # get SIFTS mapping object/sequence DB
    s = SIFTS(
        kwargs["sifts_mapping_table"],
        kwargs["sifts_sequence_db"]
    )

    reduce_chains = not kwargs["compare_multimer"]

    # determine if we need to find structures
    # by sequence search or just fetching
    # based on Uniprot/PDB identifier
    if kwargs["by_alignment"]:
        # if searching by alignment, verify that
        # user selected jackhmmer or hmmsearch
        SEARCH_METHODS = ["jackhmmer", "hmmsearch"]

        if kwargs["pdb_alignment_method"] not in SEARCH_METHODS:
            # bug fix: previously the first placeholder had no
            # argument and .keys() was called on a list, which
            # raised AttributeError instead of this error
            raise InvalidParameterError(
                "Invalid pdb search method: " +
                "{}. Valid selections are: {}".format(
                    kwargs["pdb_alignment_method"],
                    ", ".join(SEARCH_METHODS)
                )
            )

        sifts_map = s.by_alignment(
            reduce_chains=reduce_chains,
            min_overlap=kwargs["alignment_min_overlap"],
            **kwargs
        )
    else:
        sifts_map = s.by_uniprot_id(
            kwargs["sequence_id"], reduce_chains=reduce_chains
        )

    # keep unfiltered copy for later reference
    sifts_map_full = deepcopy(sifts_map)

    # filter ID list down to manually selected PDB entries
    if kwargs["pdb_ids"] is not None:
        pdb_ids = kwargs["pdb_ids"]

        # make sure we have a list of PDB IDs
        if not isinstance(pdb_ids, list):
            pdb_ids = [pdb_ids]

        # PDB ids in the SIFTS table are lowercase
        pdb_ids = [x.lower() for x in pdb_ids]

        sifts_map = _filter_by_id(sifts_map, pdb_ids)

    # limit number of hits and structures
    if kwargs["max_num_hits"] is not None:
        sifts_map.hits = sifts_map.hits.iloc[:kwargs["max_num_hits"]]

    if kwargs["max_num_structures"] is not None:
        keep_ids = sifts_map.hits.pdb_id.unique()
        keep_ids = keep_ids[:kwargs["max_num_structures"]]
        sifts_map = _filter_by_id(sifts_map, keep_ids)

    return sifts_map, sifts_map_full
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)

    Raises
    ------
    InvalidParameterError
        If focus_mode is disabled (only focus mode is supported).
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "segments", "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance",
            # "save_model",
        ]
    )

    # mean field inference currently assumes a focus alignment
    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    # output file paths; model_file/raw_ec_file are written below,
    # ec_file is produced by postprocessing
    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    # turn raw segment lists from config into Segment objects
    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s)
            for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet,
            format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    # (note: rebinds "model" from the file path above to the
    # fitted model object)
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(
        outcfg["raw_ec_file"]
    )

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn", ascending=False
    )

    # enrichment/line plots only make sense for a single segment
    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
def create_archive(config, outcfg, prefix): """ Create archive of files generated by pipeline Parameters ---------- config : dict-like Input configuration of job. Uses config["management"]["archive"] (list of key used to index outcfg) to determine which files should be added to archive outcfg : dict-like Output configuration of job prefix : str Prefix of job, will be used to define archive file path (prefix + archive type-specific extension) """ # allowed output archive formats ALLOWED_FORMATS = ["targz", "zip"] # determine selected output format, default is .tar.gz archive_format = config.get("management", {}).get("archive_format", "targz") # determine keys (corresponding to files) in # outcfg that should be stored archive_keys = config.get("management", {}).get("archive", None) # if no files selected for archiving, return immediately if archive_keys is None: return # check if selected format is valid if archive_format not in ALLOWED_FORMATS: raise InvalidParameterError( "Invalid format for output archive: {}. ".format(archive_format) + "Valid options are: " + ", ".join(ALLOWED_FORMATS)) # create explicit list of files that would go into archive and are valid files archive_files = [(file_path, file_key, idx) for (file_path, file_key, idx) in iterate_files(outcfg, subset=archive_keys) if valid_file(file_path)] # if there are no file, don't create archive if len(archive_files) == 0: return if archive_format == "targz": final_archive_file = prefix + ".tar.gz" with tarfile.open(final_archive_file, "w:gz") as tar: for (file_path, file_key, idx) in archive_files: tar.add(file_path) elif archive_format == "zip": final_archive_file = prefix + ".zip" with zipfile.ZipFile(final_archive_file, "w", zipfile.ZIP_DEFLATED) as zip_: for (file_path, file_key, idx) in archive_files: zip_.write(file_path) return final_archive_file
def secondary_structure(**kwargs):
    """
    Predict or load secondary structure for an input sequence

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    residues : pandas.DataFrame
        Table with sequence and secondary structure in columns i, A_i
        and sec_struct_3state
    """
    check_required(
        kwargs,
        [
            "prefix", "target_sequence_file",
            "segments", "sec_struct_method",
            "sec_struct_file", "psipred",
        ]
    )

    prefix = kwargs["prefix"]
    create_prefix_folders(prefix)

    # a precomputed prediction file takes precedence over running
    # any prediction method
    precomputed_file = kwargs["sec_struct_file"]
    if precomputed_file is not None:
        verify_resources(
            "Secondary structure prediction file does not exist/is empty",
            precomputed_file
        )
        return pd.read_csv(precomputed_file)

    # no precomputed file, so we need the target sequence to
    # predict secondary structure from
    seq_file = kwargs["target_sequence_file"]
    verify_resources(
        "Sequence file does not exist/is empty", seq_file
    )

    # we need to figure out what the index of the first residue
    # in the target sequence is; obtain first index from segment
    # information if possible
    if kwargs["segments"] is not None:
        first_index = Segment.from_list(kwargs["segments"][0]).region_start
    else:
        # otherwise try to get it from sequence file
        first_index = None

        with open(seq_file) as handle:
            header, _ = next(read_fasta(handle))
            if header is not None:
                _, first_index, _ = parse_header(header)

        # if we cannot identify first index from header,
        # do not make guesses but fail
        if first_index is None:
            raise InvalidParameterError(
                "Could not unambiguously identify sequence range from "
                "FASTA header, needs to specified as id/start-end: {}".format(
                    header
                )
            )

    # finally, run secondary structure prediction; only psipred is
    # implemented so far
    if kwargs["sec_struct_method"] != "psipred":
        raise InvalidParameterError(
            "Secondary structure prediction method not implemented: "
            "{}. Valid choices: psipred".format(kwargs["sec_struct_method"])
        )

    # store psipred output in a separate directory
    psipred_dir = path.join(path.dirname(prefix), "psipred")

    # run psipred
    ss2_file, horiz_file = run_psipred(
        seq_file, psipred_dir, binary=kwargs["psipred"]
    )

    # parse output, renumber to first index, and return table
    return read_psipred_prediction(
        horiz_file, first_index=first_index
    )