def verify_prefix(verify_subdir=True, **config): """ Check if configuration contains a prefix, and that prefix is a valid directory we can write to on the filesystem Parameters ---------- verify_subdir : bool, optional (default: True) Check if we can create subdirectory containing full prefix. Set this to False for outer evcouplings app loop. **config Input configuration for pipeline Returns ------- prefix : str Verified prefix """ # check we have a prefix entry, otherwise all hope is lost... try: prefix = config["global"]["prefix"] except KeyError: raise InvalidParameterError( "Configuration does not include 'prefix' setting in " "'global' section") # make sure prefix is also specified if prefix is None: raise InvalidParameterError( "'prefix' must be specified and cannot be None") # verify that prefix is workable in terms # of filesystem try: # make prefix folder create_prefix_folders(prefix) # try if we can write in the folder with open(prefix + ".test__", "w") as f: pass # get rid of the file again os.remove(prefix + ".test__") if verify_subdir: # make sure we can create a subdirectory sub_prefix = insert_dir(prefix, "test__") create_prefix_folders(sub_prefix) # remove again os.rmdir(path.dirname(sub_prefix)) except OSError as e: raise InvalidParameterError( "Not a valid prefix: {}".format(prefix)) from e return prefix
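# Example (sketch): driving verify_prefix from a minimal pipeline-style
# configuration. The directory name is hypothetical; all that is required is
# a "global" section containing a writable "prefix".
def _example_verify_prefix():
    example_config = {
        "global": {
            "prefix": "output/example_run/example_run"
        }
    }

    # raises InvalidParameterError if "prefix" is missing or None, or if the
    # prefix folder (and, with verify_subdir=True, a test subdirectory)
    # cannot be created and written to
    return verify_prefix(verify_subdir=True, **example_config)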
def run_hhfilter(input_file, output_file, threshold=95, columns="a2m", binary="hhfilter"): """ Redundancy-reduce a sequence alignment using hhfilter from the HHsuite alignment suite. Parameters ---------- input_file : str Path to input alignment in A2M/FASTA format output_file : str Path to output alignment (will be in A3M format) threshold : int, optional (default: 95) Sequence identity threshold for maximum pairwise identity (between 0 and 100) columns : {"first", "a2m"}, optional (default: "a2m") Definition of match columns (based on first sequence or upper-case columns (a2m)) binary : str Path to hhfilter binary Returns ------- str output_file Raises ------ ResourceError If output alignment is non-existent/empty ValueError Upon invalid value of columns parameter """ if columns not in ["first", "a2m"]: raise ValueError("Invalid column selection: {}".format(columns)) verify_resources("Alignment file does not exist or is empty", input_file) create_prefix_folders(output_file) cmd = [ binary, "-i", input_file, "-o", output_file, "-id", str(threshold), "-M", columns, "-v", str(2) ] return_code, stdout, stderr = run(cmd) verify_resources( "hhfilter returned empty alignment: " "stdout={} stderr={} file={}".format(stdout, stderr, output_file), output_file) return output_file
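# Example (sketch): redundancy-reducing an alignment to at most 90% pairwise
# identity with run_hhfilter. The file names are hypothetical and hhfilter is
# assumed to be on PATH; the output alignment is written in A3M format.
def _example_run_hhfilter():
    return run_hhfilter(
        input_file="example_prefix_raw_focus.fasta",
        output_file="example_prefix_filtered.a3m",
        threshold=90,       # maximum pairwise sequence identity (0-100)
        columns="first",    # define match columns based on the first sequence
        binary="hhfilter"
    )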
def inter_dists(sifts_result_i, sifts_result_j, structures=None, atom_filter=None, intersect=False, output_prefix=None, model=0, raise_missing=True): """ Compute inter-chain distances (between different entities) in PDB file. Resulting distance map is typically not symmetric, with either axis corresponding to either chain. Inter-distances are calculated on all combinations of chains that have the same PDB id in sifts_result_i and sifts_result_j. Parameters ---------- sifts_result_i : SIFTSResult Input structures and mapping to use for first axis of computed distance map sifts_result_j : SIFTSResult Input structures and mapping to use for second axis of computed distance map structures : str or dict, optional (default: None) * If str: Load structures from directory this string points to. Missing structures will be fetched from web. * If dict: dictionary with lower-case PDB ids as keys and PDB objects as values. This dictionary has to contain all necessary structures, missing ones will not be fetched. This dictionary can be created using pdb.load_structures. atom_filter : str, optional (default: None) Filter coordinates to contain only these atoms. E.g. set to "CA" to compute C_alpha - C_alpha distances instead of minimum atom distance over all atoms in both residues. intersect : bool, optional (default: False) If True, intersect indices of the given distance maps. Otherwise, union of indices will be used. output_prefix : str, optional (default: None) If given, save individual and final contact maps to files prefixed with this string. The appended file suffixes map to row index in sifts_results.hits model : int, optional (default: 0) Index of model in PDB structure that should be used raise_missing : bool, optional (default: True) Raise a ResourceError if any of the input structures can not be loaded; otherwise, ignore missing entries. 
Returns ------- DistanceMap Computed aggregated distance map across all input structures Raises ------ ValueError If sifts_result_i or sifts_result_j is empty (no structure hits) ResourceError If any structure could not be loaded and raise_missing is True """ def _get_chains(sifts_result): return { i: _prepare_chain( structures, r["pdb_id"], r["pdb_chain"], atom_filter, sifts_result.mapping[r["mapping_index"]], model ) for i, r in sifts_result.hits.iterrows() if raise_missing or r["pdb_id"] in structures } if len(sifts_result_i.hits) == 0 or len(sifts_result_j.hits) == 0: raise ValueError( "sifts_result_i or sifts_result_j is empty " "(no structure hits, but at least one required)" ) # if no structures given, or path to files, load first structures = _prepare_structures( structures, sifts_result_i.hits.pdb_id.append( sifts_result_j.hits.pdb_id ), raise_missing ) # aggegrated distance map agg_distmap = None # create output folder if necessary if output_prefix is not None: create_prefix_folders(output_prefix) # determine which combinations of chains to look at # (anything that has same PDB identifier) combis = sifts_result_i.hits.reset_index().merge( sifts_result_j.hits.reset_index(), on="pdb_id", suffixes=("_i", "_j") ) # extract chains for each subunit chains_i = _get_chains(sifts_result_i) chains_j = _get_chains(sifts_result_j) # go through all chain combinations for i, r in combis.iterrows(): # skip missing structures if not raise_missing and r["pdb_id"] not in structures: continue index_i = r["index_i"] index_j = r["index_j"] # skip empty chains if (len(chains_i[index_i].residues) == 0 or len(chains_j[index_j].residues) == 0): continue # compute distance map for current chain pair distmap = DistanceMap.from_coords( chains_i[index_i], chains_j[index_j], ) # save individual distance map if output_prefix is not None: distmap.to_file("{}_{}_{}".format( output_prefix, index_i, index_j) ) # aggregate with other chain combinations if agg_distmap is None: agg_distmap = distmap else: agg_distmap = DistanceMap.aggregate( agg_distmap, distmap, intersect=intersect ) return agg_distmap
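# Example (sketch): computing an inter-chain distance map for two SIFTS hit
# tables and saving contacts below a 5 A cutoff. The SIFTSResult objects are
# assumed to have been produced beforehand (e.g., by the structure
# identification step of the compare protocol); paths and the cutoff are
# illustrative only.
def _example_inter_dists(sifts_result_i, sifts_result_j):
    d_inter = inter_dists(
        sifts_result_i,
        sifts_result_j,
        structures="pdb_mmtf_dir/",   # directory; missing entries are fetched
        atom_filter=None,             # minimum atom distance over all atoms
        intersect=False,
        output_prefix="example_prefix_distmap_inter",
        raise_missing=False
    )

    # the aggregated map is None if no chain combination could be computed
    if d_inter is not None:
        d_inter.to_file("example_prefix_distance_map_inter")
        d_inter.contacts(5.0).to_csv(
            "example_prefix_contacts_inter.csv", index=False
        )

    return d_inter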
def standard(**kwargs): """ Protocol: Compare ECs for single proteins (or domains) to 3D structure information Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * ec_file_compared_all * ec_file_compared_all_longrange * pdb_structure_hits * distmap_monomer * distmap_multimer * contact_map_files * remapped_pdb_files """ check_required(kwargs, [ "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir", "atom_filter", "compare_multimer", "distance_cutoff", "target_sequence_file", "scale_sizes", ]) prefix = kwargs["prefix"] outcfg = { "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", "pdb_structure_hits_file": prefix + "_structure_hits.csv", "pdb_structure_hits_unfiltered_file": prefix + "_structure_hits_unfiltered.csv", # cannot have the distmap files end with "_file" because there are # two files (.npy and .csv), which would cause problems with automatic # checking if those files exist "distmap_monomer": prefix + "_distance_map_monomer", "distmap_multimer": prefix + "_distance_map_multimer", } # make sure EC file exists verify_resources("EC file does not exist", kwargs["ec_file"]) # make sure output directory exists create_prefix_folders(prefix) # store auxiliary files here (too much for average user) aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) create_prefix_folders(aux_prefix) # Step 1: Identify 3D structures for comparison sifts_map, sifts_map_full = _identify_structures( **{ **kwargs, "prefix": aux_prefix, }) # save selected PDB hits sifts_map.hits.to_csv(outcfg["pdb_structure_hits_file"], index=False) # also save full list of hits sifts_map_full.hits.to_csv(outcfg["pdb_structure_hits_unfiltered_file"], index=False) # Step 2: Compute distance maps # load all structures at once structures = load_structures(sifts_map.hits.pdb_id, kwargs["pdb_mmtf_dir"], raise_missing=False) # compute distance maps and save # (but only if we found some structure) if len(sifts_map.hits) > 0: d_intra = intra_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_distmap_intra") d_intra.to_file(outcfg["distmap_monomer"]) # save contacts to separate file outcfg["monomer_contacts_file"] = prefix + "_contacts_monomer.csv" d_intra.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["monomer_contacts_file"], index=False) # compute multimer distances, if requested; # note that d_multimer can be None if there # are no structures with multiple chains if kwargs["compare_multimer"]: d_multimer = multimer_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_distmap_multimer") else: d_multimer = None # if we have a multimer contact mapin the end, save it if d_multimer is not None: d_multimer.to_file(outcfg["distmap_multimer"]) outcfg[ "multimer_contacts_file"] = prefix + "_contacts_multimer.csv" # save contacts to separate file d_multimer.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["multimer_contacts_file"], index=False) else: outcfg["distmap_multimer"] = None # at this point, also create remapped structures (e.g. 
for # later comparison of folding results) verify_resources("Target sequence file does not exist", kwargs["target_sequence_file"]) # create target sequence map for remapping structure with open(kwargs["target_sequence_file"]) as f: header, seq = next(read_fasta(f)) seq_id, seq_start, seq_end = parse_header(header) seqmap = dict(zip(range(seq_start, seq_end + 1), seq)) # remap structures, swap mapping index and filename in # dictionary so we have a list of files in the dict keys outcfg["remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( sifts_map, aux_prefix, seqmap).items() } else: # if no structures, can not compute distance maps d_intra = None d_multimer = None outcfg["distmap_monomer"] = None outcfg["distmap_multimer"] = None outcfg["remapped_pdb_files"] = None # Step 3: Compare ECs to distance maps ec_table = pd.read_csv(kwargs["ec_file"]) # identify number of sites in EC model num_sites = len( set.union(set(ec_table.i.unique()), set(ec_table.j.unique()))) for out_file, min_seq_dist in [ ("ec_compared_longrange_file", kwargs["min_sequence_distance"]), ("ec_compared_all_file", 0), ]: # compare ECs only if we minimally have intra distance map if d_intra is not None: coupling_scores_compared(ec_table, d_intra, d_multimer, dist_cutoff=kwargs["distance_cutoff"], output_file=outcfg[out_file], min_sequence_dist=min_seq_dist) else: outcfg[out_file] = None # also create line-drawing script if we made the csv if outcfg["ec_compared_longrange_file"] is not None: ecs_longrange = pd.read_csv(outcfg["ec_compared_longrange_file"]) outcfg[ "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml" pairs.ec_lines_pymol_script(ecs_longrange.iloc[:num_sites, :], outcfg["ec_lines_compared_pml_file"], distance_cutoff=kwargs["distance_cutoff"]) # Step 4: Make contact map plots # if no structures available, defaults to EC-only plot outcfg["contact_map_files"] = _make_contact_maps(ec_table, d_intra, d_multimer, **kwargs) return outcfg
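# Example (sketch): minimal keyword arguments for the comparison protocol
# above, limited to the keys verified by check_required. A full run passes
# additional settings consumed by helpers such as _identify_structures and
# _make_contact_maps; all paths and values here are hypothetical.
def _example_compare_standard_config():
    return dict(
        prefix="output/example/example",
        ec_file="output/example/example_CouplingScores.csv",
        target_sequence_file="output/example/example.fa",
        min_sequence_distance=6,
        pdb_mmtf_dir="db/pdb_mmtf/",
        atom_filter=None,        # use minimum atom distance between residues
        compare_multimer=True,
        distance_cutoff=5,
        scale_sizes=True,
    )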
def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, **kwargs): """ Apply pairwise identity filtering, fragment filtering, and exclusion of columns with too many gaps to a sequence alignment. Also generates files describing properties of the alignment such as frequency distributions, conservation, and "old-style" alignment statistics files. .. note:: assumes focus alignment (otherwise unprocessed) as input. .. todo:: come up with something more clever to filter fragments than fixed width (e.g. use 95% quantile of length distribution as reference point) Parameters ---------- focus_ali : Alignment Focus-mode input alignment target_seq_index : int Index of target sequence in alignment target_seq_id : str Identifier of target sequence (without range) region_start : int Index of first sequence position in target sequence kwargs : See required arguments in source code Returns ------- outcfg : Dict File products generated by the function: * alignment_file * statistics_file * frequencies_file * identities_file * raw_focus_alignment_file ali : Alignment Final processed alignment """ check_required(kwargs, [ "prefix", "seqid_filter", "hhfilter", "minimum_sequence_coverage", "minimum_column_coverage", "compute_num_effective_seqs", "theta", ]) prefix = kwargs["prefix"] create_prefix_folders(prefix) focus_fasta_file = prefix + "_raw_focus.fasta" outcfg = { "alignment_file": prefix + ".a2m", "statistics_file": prefix + "_alignment_statistics.csv", "frequencies_file": prefix + "_frequencies.csv", "identities_file": prefix + "_identities.csv", "raw_focus_alignment_file": focus_fasta_file, } # swap target sequence to first position if it is not # the first sequence in alignment; # this is particularly important for hhfilter run # because target sequence might otherwise be filtered out if target_seq_index != 0: indices = np.arange(0, len(focus_ali)) indices[0] = target_seq_index indices[target_seq_index] = 0 target_seq_index = 0 focus_ali = focus_ali.select(sequences=indices) with open(focus_fasta_file, "w") as f: focus_ali.write(f, "fasta") # apply pairwise identity filter (using hhfilter) if kwargs["seqid_filter"] is not None: filtered_file = prefix + "_filtered.a3m" at.run_hhfilter(focus_fasta_file, filtered_file, threshold=kwargs["seqid_filter"], columns="first", binary=kwargs["hhfilter"]) with open(filtered_file) as f: focus_ali = Alignment.from_file(f, "a3m") # final FASTA alignment before applying A2M format modifications filtered_fasta_file = prefix + "_raw_focus_filtered.fasta" with open(filtered_fasta_file, "w") as f: focus_ali.write(f, "fasta") ali = focus_ali # filter fragments # come up with something more clever here than fixed width # (e.g. use 95% quantile of length distribution as reference point) min_cov = kwargs["minimum_sequence_coverage"] if min_cov is not None: if isinstance(min_cov, int): min_cov /= 100 keep_seqs = (1 - ali.count("-", axis="seq")) >= min_cov ali = ali.select(sequences=keep_seqs) # Calculate frequencies, conservation and identity to query # on final alignment (except for lowercase modification) # Note: running hhfilter might cause a loss of the target seque # if it is not the first sequence in the file! To be sure that # nothing goes wrong, target_seq_index should always be 0. 
describe_seq_identities(ali, target_seq_index=target_seq_index).to_csv( outcfg["identities_file"], float_format="%.3f", index=False) describe_frequencies(ali, region_start, target_seq_index=target_seq_index).to_csv( outcfg["frequencies_file"], float_format="%.3f", index=False) coverage_stats = describe_coverage(ali, prefix, region_start, kwargs["minimum_column_coverage"]) # keep list of uppercase sequence positions in alignment pos_list = np.arange(region_start, region_start + ali.L, dtype="int32") # Make columns with too many gaps lowercase min_col_cov = kwargs["minimum_column_coverage"] if min_col_cov is not None: if isinstance(min_col_cov, int): min_col_cov /= 100 lc_cols = ali.count(ali._match_gap, axis="pos") > 1 - min_col_cov ali = ali.lowercase_columns(lc_cols) # if we remove columns, we have to update list of positions pos_list = pos_list[~lc_cols] else: lc_cols = None # compute effective number of sequences # (this is intended for cases where coupling stage is # not run, but this number is wanted nonetheless) if kwargs["compute_num_effective_seqs"]: # make sure we only compute N_eff on the columns # that would be used for model inference, dispose # the rest if lc_cols is None: cut_ali = ali else: cut_ali = ali.select(columns=~lc_cols) # compute sequence weights cut_ali.set_weights(kwargs["theta"]) # N_eff := sum of all sequence weights n_eff = float(cut_ali.weights.sum()) # patch into coverage statistics (N_eff column) coverage_stats.loc[:, "N_eff"] = n_eff else: n_eff = None # save coverage statistics to file coverage_stats.to_csv(outcfg["statistics_file"], float_format="%.3f", index=False) # store description of final sequence alignment in outcfg # (note these parameters will be updated by couplings protocol) outcfg.update({ "num_sites": len(pos_list), "num_sequences": len(ali), "effective_sequences": n_eff, "region_start": region_start, }) # create segment in outcfg outcfg["segments"] = [ Segment("aa", target_seq_id, region_start, region_start + ali.L - 1, pos_list).to_list() ] with open(outcfg["alignment_file"], "w") as f: ali.write(f, "fasta") return outcfg, ali
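# Example (sketch): running modify_alignment on a focus alignment read from
# FASTA. The values mirror the keys listed in check_required above but are
# illustrative only; with seqid_filter=None the hhfilter step is skipped, and
# integer coverage values are interpreted as percentages (divided by 100).
def _example_modify_alignment():
    with open("example_prefix_raw_focus.fasta") as f:
        focus_ali = Alignment.from_file(f, "fasta")

    outcfg, ali = modify_alignment(
        focus_ali,
        target_seq_index=0,              # target assumed to be first sequence
        target_seq_id="EXAMPLE_TARGET",  # identifier without range
        region_start=1,
        prefix="output/example/example",
        seqid_filter=None,
        hhfilter="hhfilter",
        minimum_sequence_coverage=50,
        minimum_column_coverage=70,
        compute_num_effective_seqs=False,
        theta=0.8,
    )
    return outcfg, ali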
def complex(**kwargs): """ Protocol: Run monomer alignment protocol and postprocess it for EVcomplex calculations Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the alignment protocol, and the following additional field: genome_location_file : path to file containing the genomic locations for CDs's corresponding to identifiers in the alignment. """ check_required(kwargs, [ "prefix", "alignment_protocol", "uniprot_to_embl_table", "ena_genome_location_table" ]) verify_resources("Uniprot to EMBL mapping table does not exist", kwargs["uniprot_to_embl_table"]) verify_resources("ENA genome location table does not exist", kwargs["ena_genome_location_table"]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # run the regular alignment protocol # (standard, existing, ...) alignment_protocol = kwargs["alignment_protocol"] if alignment_protocol not in PROTOCOLS: raise InvalidParameterError( "Invalid choice for alignment protocol: {}".format( alignment_protocol)) outcfg = PROTOCOLS[kwargs["alignment_protocol"]](**kwargs) # if the user selected the existing alignment protocol # they can supply an input annotation file # which overwrites the annotation file generated by the existing protocol if alignment_protocol == "existing": check_required(kwargs, ["override_annotation_file"]) if kwargs["override_annotation_file"] is not None: verify_resources("Override annotation file does not exist", kwargs["override_annotation_file"]) outcfg["annotation_file"] = prefix + "_annotation.csv" annotation_data = pd.read_csv(kwargs["override_annotation_file"]) annotation_data.to_csv(outcfg["annotation_file"]) # extract cds identifiers for alignment uniprot IDs cds_ids = extract_cds_ids(outcfg["alignment_file"], kwargs["uniprot_to_embl_table"]) # extract genome location information from ENA genome_location_filename = prefix + "_genome_location.csv" genome_location_table = extract_embl_annotation( cds_ids, kwargs["ena_genome_location_table"], genome_location_filename) genome_location_table = add_full_header(genome_location_table, outcfg["alignment_file"]) genome_location_table.to_csv(genome_location_filename) outcfg["genome_location_file"] = genome_location_filename # dump output config to YAML file for debugging/logging write_config_file(prefix + ".align_complex.outcfg", outcfg) return outcfg
def hmmbuild_and_search(**kwargs): """ Protocol: Build HMM from sequence alignment using hmmbuild and search against a sequence database using hmmsearch. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the protocol, including the following fields: * target_sequence_file * sequence_file * raw_alignment_file * hittable_file * focus_mode * focus_sequence * segments """ def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): # this file is starting point of pipeline; # check if input alignment actually exists verify_resources("Input alignment does not exist", input_alignment_file) # first try to autodetect format of alignment with open(input_alignment_file) as f: format = detect_format(f) if format is None: raise InvalidParameterError( "Format of input alignment {} could not be " "automatically detected.".format(input_alignment_file)) with open(input_alignment_file) as f: ali_raw = Alignment.from_file(f, format) # Target sequence of alignment sequence_id = kwargs["sequence_id"] if sequence_id is None: raise InvalidParameterError( "Parameter sequence_id must be defined") # First, find focus sequence in alignment focus_index = None for i, id_ in enumerate(ali_raw.ids): if id_.startswith(sequence_id): focus_index = i break # if we didn't find it, cannot continue if focus_index is None: raise InvalidParameterError( "Target sequence {} could not be found in alignment".format( sequence_id)) # identify what columns (non-gap) to keep for focus # this should be all columns in the raw_focus_alignment_file # but checking anyway focus_seq = ali_raw[focus_index] focus_cols = np.array([ c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq ]) # extract focus alignment focus_ali = ali_raw.select(columns=focus_cols) focus_seq_nogap = "".join(focus_ali[focus_index]) # determine region of sequence. 
If first_index is given, # use that in any case, otherwise try to autodetect full_focus_header = ali_raw.ids[focus_index] focus_id = full_focus_header.split()[0] # try to extract region from sequence header id_, region_start, region_end = parse_header(focus_id) # override with first_index if given if kwargs["first_index"] is not None: region_start = kwargs["first_index"] region_end = region_start + len(focus_seq_nogap) - 1 if region_start is None or region_end is None: raise InvalidParameterError( "Could not extract region information " + "from sequence header {} ".format(full_focus_header) + "and first_index parameter is not given.") # resubstitute full sequence ID from identifier # and region information header = "{}/{}-{}".format(id_, region_start, region_end) focus_ali.ids[focus_index] = header # write target sequence to file target_sequence_file = prefix + ".fa" with open(target_sequence_file, "w") as f: write_fasta([(header, focus_seq_nogap)], f) # swap target sequence to first position if it is not # the first sequence in alignment; # this is particularly important for hhfilter run # because target sequence might otherwise be filtered out if focus_index != 0: indices = np.arange(0, len(focus_ali)) indices[0] = focus_index indices[focus_index] = 0 focus_index = 0 focus_ali = focus_ali.select(sequences=indices) # write the raw focus alignment for hmmbuild focus_fasta_file = prefix + "_raw_focus_input.fasta" with open(focus_fasta_file, "w") as f: focus_ali.write(f, "fasta") return focus_fasta_file, target_sequence_file, region_start, region_end # define the gap threshold for inclusion in HMM's build by HMMbuild. SYMFRAC_HMMBUILD = 0.0 # check for required options check_required(kwargs, [ "prefix", "sequence_id", "alignment_file", "use_bitscores", "domain_threshold", "sequence_threshold", "database", "cpu", "nobias", "reuse_alignment", "hmmbuild", "hmmsearch" ]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # prepare input alignment for hmmbuild focus_fasta_file, target_sequence_file, region_start, region_end = \ _format_alignment_for_hmmbuild( kwargs["alignment_file"], **kwargs ) # run hmmbuild_and_search... 
allow to reuse a pre-existing # Stockholm alignment file here ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg" # determine whether to rerun, only possible if previous results # were stored in ali_outcfg_file if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file): ali = read_config_file(ali_outcfg_file) # check if the alignment file itself is also there verify_resources( "Tried to reuse alignment, but empty or " "does not exist", ali["alignment"], ali["domtblout"]) else: # otherwise, we have to run the alignment # modify search thresholds to be suitable for hmmsearch sequence_length = region_end - region_start + 1 seq_threshold, domain_threshold = search_thresholds( kwargs["use_bitscores"], kwargs["sequence_threshold"], kwargs["domain_threshold"], sequence_length) # create the hmm hmmbuild_result = at.run_hmmbuild( alignment_file=focus_fasta_file, prefix=prefix, symfrac=SYMFRAC_HMMBUILD, cpu=kwargs["cpu"], binary=kwargs["hmmbuild"], ) hmmfile = hmmbuild_result.hmmfile # run the alignment from the hmm ali = at.run_hmmsearch( hmmfile=hmmfile, database=kwargs[kwargs["database"]], prefix=prefix, use_bitscores=kwargs["use_bitscores"], domain_threshold=domain_threshold, seq_threshold=seq_threshold, nobias=kwargs["nobias"], cpu=kwargs["cpu"], binary=kwargs["hmmsearch"], ) # get rid of huge stdout log file immediately try: os.remove(ali.output) except OSError: pass # turn namedtuple into dictionary to make # restarting code nicer ali = dict(ali._asdict()) # only item from hmmsearch_result to save is the hmmfile ali["hmmfile"] = hmmfile # save results of search for possible restart write_config_file(ali_outcfg_file, ali) # prepare output dictionary with result files outcfg = { "sequence_file": target_sequence_file, "first_index": region_start, "input_raw_focus_alignment": focus_fasta_file, "target_sequence_file": target_sequence_file, "focus_mode": True, "raw_alignment_file": ali["alignment"], "hittable_file": ali["domtblout"], } # convert the raw output alignment to fasta format # and add the appropriate query sequence raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix) outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file # define a single protein segment based on target sequence outcfg["segments"] = [ Segment("aa", kwargs["sequence_id"], region_start, region_end, range(region_start, region_end + 1)).to_list() ] outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"], region_start, region_end) return outcfg
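# Example (sketch): minimal keyword arguments for hmmbuild_and_search,
# corresponding to the check_required list above. Note the indirection for
# the sequence database: "database" names another kwargs entry that holds the
# actual path (see the kwargs[kwargs["database"]] lookup). The key
# "sequence_database" and all paths/values are hypothetical.
def _example_hmmbuild_and_search_config():
    return dict(
        prefix="output/example/example",
        sequence_id="EXAMPLE_TARGET",
        alignment_file="input/example_alignment.fasta",
        first_index=None,                 # autodetect region from the header
        use_bitscores=True,
        domain_threshold=0.5,             # rescaled by search_thresholds
        sequence_threshold=0.5,           # using the target sequence length
        database="sequence_database",     # points at the entry below
        sequence_database="db/uniref100.fasta",
        cpu=2,
        nobias=False,
        reuse_alignment=False,
        hmmbuild="hmmbuild",
        hmmsearch="hmmsearch",
    )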
def mean_field(**kwargs): """ Protocol: Infer ECs from alignment using mean field direct coupling analysis. For now, mean field DCA can only be run in focus mode, gaps included. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "segments", "focus_mode", "focus_sequence", "theta", "pseudo_count", "alphabet", "min_sequence_distance", # "save_model", ] ) if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." ) prefix = kwargs["prefix"] # option to save model disabled """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists alignment_file = kwargs["alignment_file"] verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # determine alphabet # default is protein if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # read in a2m alignment with open(alignment_file) as f: input_alignment = Alignment.from_file( f, alphabet=alphabet, format="fasta" ) # init mean field direct coupling analysis mf_dca = MeanFieldDCA(input_alignment) # run mean field approximation model = mf_dca.fit( theta=kwargs["theta"], pseudo_count=kwargs["pseudo_count"] ) # write ECs to file model.to_raw_ec_file( outcfg["raw_ec_file"] ) # write model file if outcfg["model_file"] is not None: model.to_file( outcfg["model_file"], file_format="plmc_v2" ) # store useful information about model in outcfg outcfg.update({ "num_sites": model.L, "num_valid_sequences": model.N_valid, "effective_sequences": float(round(model.N_eff, 1)), "region_start": int(model.index_list[0]), }) # read and sort ECs ecs = pd.read_csv( outcfg["raw_ec_file"], sep=" ", # for now, call the last two columns # "fn" and "cn" to prevent compare # stage from crashing names=["i", "A_i", "j", "A_j", "fn", "cn"] # names=["i", "A_i", "j", "A_j", "mi", "di"] ).sort_values( by="cn", ascending=False ) is_single_segment = segments is None or len(segments) == 1 outcfg = { **outcfg, **_postprocess_inference( ecs, kwargs, model, outcfg, prefix, generate_enrichment=is_single_segment, generate_line_plot=is_single_segment ) } # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg) return outcfg
def run_jobs(configs, global_config, overwrite=False, workdir=None): """ Submit config to pipeline Parameters ---------- configs : dict Configurations for individual subjobs global_config : dict Master configuration (if only one job, the contents of this dictionary will be equal to the single element of config_files) """ python = executable pipeline_path = path.abspath(pipeline.__file__) summarize_path = path.abspath(summarize.__file__) cmd_base = "{} {}".format(python, pipeline_path) summ_base = "{} {}".format(python, summarize_path) # determine output directory for config files prefix = global_config["global"]["prefix"] # integrate working directory into output prefix # if it is given; if prefix contains an absolute path, # this will override the workdir according to # implementation of path.join() if workdir is not None: out_prefix = path.join(workdir, prefix) else: out_prefix = prefix # save configuration file, make sure we do not overwrite previous run # if overwrite protection is activated # (but only if it is a valid configuration file with contents) cfg_filename = CONFIG_NAME.format(out_prefix) if not overwrite and valid_file(cfg_filename): raise InvalidParameterError( "Existing configuration file {} ".format(cfg_filename) + "indicates current prefix {} ".format(prefix) + "would overwrite existing results. Use --yolo " + "flag to deactivate overwrite protection (e.g. for " "restarting a job or running a different stage)." ) # make sure working directory exists create_prefix_folders(cfg_filename) # write global config file write_config_file(cfg_filename, global_config) # also write individual subjob configuration files # (we have to write these before submitting, since # the job summarizer needs the paths to all files) for subjob_prefix, subjob_cfg in configs.items(): # determine working dir for each subjob, since subjob # prefix may contain slashes leading to subfolder creation if workdir is not None: subjob_out_prefix = path.join(workdir, subjob_prefix) else: subjob_out_prefix = subjob_prefix subcfg_filename = CONFIG_NAME.format(subjob_out_prefix) # make sure output subfolder exists create_prefix_folders(subcfg_filename) # write subjob configuration file write_config_file(subcfg_filename, subjob_cfg) # now create list of subjob config files relative to working # directory (above, we allow to run submitted in arbitrary directory) config_files = [ CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs ] # create command for summarizer (needs to know all subjob config files) summ_cmd = "{} {} {} {}".format( summ_base, global_config["pipeline"], global_config["global"]["prefix"], " ".join(config_files) ) # create submitter from global (pre-unrolling) configuration submitter = utils.SubmitterFactory( global_config["environment"]["engine"], db_path=out_prefix + "_job_database.txt" ) # collect individual submitted jobs here commands = [] # prepare individual jobs for submission for job, job_cfg in configs.items(): job_prefix = job_cfg["global"]["prefix"] job_cfg_file = CONFIG_NAME.format(job) # set job status in database to pending pipeline.update_job_status(job_cfg, status=database.EStatus.PEND) # create submission command env = job_cfg["environment"] cmd = utils.Command( [ "{} {}".format(cmd_base, job_cfg_file), summ_cmd ], name=job_prefix, environment=env["configuration"], workdir=workdir, resources={ utils.EResource.queue: env["queue"], utils.EResource.time: env["time"], utils.EResource.mem: env["memory"], utils.EResource.nodes: env["cores"], utils.EResource.out: job_prefix + 
"_stdout.log", utils.EResource.error: job_prefix + "_stderr.log", } ) # store job for later dependency creation commands.append(cmd) # finally, submit job submitter.submit(cmd) # submit final summarizer # (hold for now - summarizer is run after each subjob finishes) # wait for all runs to finish (but only if blocking) submitter.join()
def run_hmmscan(query, database, prefix, use_model_threshold=True, threshold_type="cut_ga", use_bitscores=True, domain_threshold=None, seq_threshold=None, nobias=False, cpu=None, stdout_redirect=None, binary="hmmscan"): """ Run hmmscan of HMMs in database against sequences in query to identify matches of these HMMs. Refer to HMMER Userguide for explanation of these parameters. Parameters ---------- query : str File containing query sequence(s) database : str File containing HMM database (prepared with hmmpress) prefix : str Prefix path for output files. Folder structure in the prefix will be created if not existing. use_model_threshold : bool, optional (default: True) Use model-specific inclusion thresholds from HMM database rather than global bitscore/E-value thresholds (use_bitscores, domain_threshold and seq_threshold are overridden by this flag). threshold_type : {"cut_ga", "cut_nc", "cut_tc"}, optional (default: "cut_ga") Use gathering (default), noise or trusted cutoff to define scan hits. Please refer to the HMMER manual for details. use_bitscores : bool Use bitscore inclusion thresholds rather than E-values. Overridden by the use_model_threshold flag. domain_threshold : int or float or str Inclusion threshold applied on the domain level (e.g. "1E-03" or 0.001 or 50) seq_threshold : int or float or str Inclusion threshold applied on the sequence level (e.g. "1E-03" or 0.001 or 50) nobias : bool, optional (default: False) Turn off bias correction cpu : int, optional (default: None) Number of CPUs to use for search. Uses all if None. stdout_redirect : str, optional (default: None) Redirect bulky stdout instead of storing with rest of results (use "/dev/null" to dispose) binary : str (default: "hmmscan") Path to hmmscan binary (put in PATH for default to work) Returns ------- HmmscanResult namedtuple with fields corresponding to the different output files (prefix, output, tblout, domtblout, pfamtblout) Raises ------ ExternalToolError, ResourceError """ verify_resources("Input file does not exist or is empty", query, database) create_prefix_folders(prefix) result = HmmscanResult( prefix, prefix + ".output" if stdout_redirect is None else stdout_redirect, prefix + ".tblout", prefix + ".domtblout", prefix + ".pfamtblout") cmd = [ binary, "-o", result.output, "--tblout", result.tblout, "--domtblout", result.domtblout, "--pfamtblout", result.pfamtblout, "--notextw", "--acc", ] # number of CPUs if cpu is not None: cmd += ["--cpu", str(cpu)] # bias correction filter if nobias: cmd += ["--nobias"] # either use model-specific threshold, or custom # bitscore/E-value thresholds if use_model_threshold: THRESHOLD_CHOICES = ["cut_ga", "cut_nc", "cut_tc"] if threshold_type not in THRESHOLD_CHOICES: raise ValueError("Invalid model threshold, valid choices are: " + ", ".join(THRESHOLD_CHOICES)) cmd += ["--" + threshold_type] else: if seq_threshold is None or domain_threshold is None: raise ValueError("Must define sequence- and domain-level reporting " "thresholds, or use gathering threshold instead.") if use_bitscores: cmd += [ "-T", str(seq_threshold), "--domT", str(domain_threshold), ] else: cmd += [ "-E", str(seq_threshold), "--domE", str(domain_threshold), ] cmd += [database, query] return_code, stdout, stderr = run(cmd) # also check we actually created a table with hits verify_resources( "hmmscan did not return results: " "stdout={} stderr={} file={}".format(stdout, stderr, result.domtblout), result.domtblout) return result
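# Example (sketch): scanning query sequences against a pressed Pfam HMM
# database using the model-specific gathering thresholds. Paths are
# hypothetical; hmmscan is assumed to be on PATH and the database to have
# been prepared with hmmpress.
def _example_run_hmmscan():
    result = run_hmmscan(
        query="output/example/example.fa",
        database="db/Pfam-A.hmm",
        prefix="output/example/example_hmmscan",
        use_model_threshold=True,
        threshold_type="cut_ga",        # gathering cutoff
        nobias=False,
        cpu=2,
        stdout_redirect="/dev/null",    # discard the bulky stdout log
    )

    # result is a HmmscanResult namedtuple; the per-domain hit table is
    # usually the most relevant output
    return result.domtblout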
def run_jackhmmer(query, database, prefix, use_bitscores, domain_threshold, seq_threshold, iterations=5, nobias=False, cpu=None, stdout_redirect=None, checkpoints_hmm=False, checkpoints_ali=False, binary="jackhmmer"): """ Run jackhmmer sequence search against target database. Refer to HMMER Userguide for explanation of these parameters. Parameters ---------- query : str File containing query sequence database : str File containing sequence database prefix : str Prefix path for output files. Folder structure in the prefix will be created if not existing. use_bitscores : bool Use bitscore inclusion thresholds rather than E-values. domain_threshold : int or float or str Inclusion threshold applied on the domain level (e.g. "1E-03" or 0.001 or 50) seq_threshold : int or float or str Inclusion threshold applied on the sequence level (e.g. "1E-03" or 0.001 or 50) iterations : int, optional (default: 5) Number of jackhmmer search iterations nobias : bool, optional (default: False) Turn off bias correction cpu : int, optional (default: None) Number of CPUs to use for search. Uses all if None. stdout_redirect : str, optional (default: None) Redirect bulky stdout instead of storing with rest of results (use "/dev/null" to dispose) checkpoints_hmm : bool, optional (default: False) Store checkpoint HMMs to prefix.<iter>.hmm checkpoints_ali : bool, optional (default: False) Store checkpoint alignments to prefix.<iter>.sto binary : str (default: "jackhmmer") Path to jackhmmer binary (put in PATH for default to work) Returns ------- JackhmmerResult namedtuple with fields corresponding to the different output files (prefix, alignment, output, tblout, domtblout) Raises ------ ExternalToolError, ResourceError """ verify_resources("Input file does not exist or is empty", query, database) create_prefix_folders(prefix) # store filenames of all individual results; # these will be returned as result of the # function. result = JackhmmerResult( prefix, prefix + ".sto", prefix + ".output" if stdout_redirect is None else stdout_redirect, prefix + ".tblout", prefix + ".domtblout") cmd = [ binary, "-N", str(iterations), "-o", result.output, "-A", result.alignment, "--tblout", result.tblout, "--domtblout", result.domtblout, "--noali", "--notextw" ] # reporting thresholds are set according to the # inclusion thresholds to reduce memory footprint if use_bitscores: cmd += [ "-T", str(seq_threshold), "--domT", str(domain_threshold), "--incT", str(seq_threshold), "--incdomT", str(domain_threshold) ] else: cmd += [ "-E", str(seq_threshold), "--domE", str(domain_threshold), "--incE", str(seq_threshold), "--incdomE", str(domain_threshold) ] # number of CPUs if cpu is not None: cmd += ["--cpu", str(cpu)] # bias correction filter if nobias: cmd += ["--nobias"] # save checkpoints for alignments and HMMs? if checkpoints_ali: cmd += ["--chkali", prefix] if checkpoints_hmm: cmd += ["--chkhmm", prefix] cmd += [query, database] return_code, stdout, stderr = run(cmd) # also check we actually created some sort of alignment verify_resources( "jackhmmer returned empty alignment: " "stdout={} stderr={} file={}".format(stdout, stderr, result.alignment), result.alignment) return result
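# Example (sketch): a jackhmmer search with bitscore inclusion thresholds.
# Paths and threshold values are hypothetical; jackhmmer is assumed to be on
# PATH. The returned namedtuple points at the Stockholm alignment and the
# HMMER hit tables.
def _example_run_jackhmmer():
    result = run_jackhmmer(
        query="output/example/example.fa",
        database="db/uniref100.fasta",
        prefix="output/example/example_jackhmmer",
        use_bitscores=True,
        domain_threshold=100,       # bits, domain level
        seq_threshold=100,          # bits, sequence level
        iterations=5,
        nobias=False,
        cpu=4,
        stdout_redirect="/dev/null",
        checkpoints_hmm=False,
        checkpoints_ali=False,
    )

    return result.alignment         # prefix + ".sto"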
def complex(**kwargs): """ Protocol: Mutation effect prediction and visualization for protein complexes Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * mutation_matrix_file * [mutation_dataset_predicted_file] """ check_required( kwargs, ["prefix", "model_file", "mutation_dataset_file", "segments"]) prefix = kwargs["prefix"] outcfg = { "mutation_matrix_file": prefix + "_single_mutant_matrix.csv", "mutation_matrix_plot_files": [], } # make sure model file exists verify_resources("Model parameter file does not exist", kwargs["model_file"]) # make sure output directory exists create_prefix_folders(prefix) # load segments to create couplings object segment_objects = [] for segment_list in kwargs["segments"]: segment_objects.append(Segment.from_list(segment_list)) first_segment_name = Segment.from_list(kwargs["segments"][0]).segment_id second_segment_name = Segment.from_list(kwargs["segments"][1]).segment_id first_chain_name = Segment.from_list( kwargs["segments"][0]).default_chain_name() second_chain_name = Segment.from_list( kwargs["segments"][1]).default_chain_name() # load couplings object c = MultiSegmentCouplingsModel(kwargs["model_file"], *segment_objects) # create the independent model c0 = c.to_independent_model() # create the inter-protein only Jij model ci = c.to_inter_segment_model() for model, type_ in [(c, "Epistatic"), (c0, "Independent"), (ci, "Inter_segment")]: # interactive plot using bokeh filename = prefix + "_{}_model".format(type_.lower(), ) output_file(filename + ".html", "{} model".format(type_)) fig = evcouplings.visualize.mutations.plot_mutation_matrix( model, engine="bokeh") save(fig) outcfg["mutation_matrix_plot_files"].append(filename + ".html") # static matplotlib plot evcouplings.visualize.mutations.plot_mutation_matrix(model) plt.savefig(filename + ".pdf", bbox_inches="tight") outcfg["mutation_matrix_plot_files"].append(filename + ".pdf") # create single mutation matrix table, # add prediction by independent model and # save to file singles = single_mutant_matrix(c, output_column="prediction_epistatic") singles = predict_mutation_table(c0, singles, "prediction_independent") singles = predict_mutation_table(ci, singles, "prediction_inter_segment") singles.to_csv(outcfg["mutation_matrix_file"], index=False) # Pymol scripts outcfg["mutations_epistatic_pml_files"] = [] for model in ["epistatic", "independent", "inter_segment"]: pml_filename = prefix + "_{}_model.pml".format(model) evcouplings.visualize.mutations.mutation_pymol_script( singles, pml_filename, effect_column="prediction_" + model, segment_to_chain_mapping={ first_segment_name: first_chain_name, second_segment_name: second_chain_name }) outcfg["mutations_epistatic_pml_files"].append(pml_filename) # predict experimental dataset if given dataset_file = kwargs["mutation_dataset_file"] if dataset_file is not None: verify_resources("Dataset file does not exist", dataset_file) data = pd.read_csv(dataset_file, comment="#", sep=",") if "segment" not in data.columns: raise ValueError("Input mutation dataset file does not contain " "a column called 'segment' to specify the " "protein of origin for each mutation") # add epistatic model prediction data_pred = predict_mutation_table(c, data, "prediction_epistatic") # add independent model prediction data_pred = predict_mutation_table(c0, data_pred, "prediction_independent") data_pred = predict_mutation_table(ci, 
data_pred, "inter_segment") outcfg[ "mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv" data_pred.to_csv(outcfg["mutation_dataset_predicted_file"], index=False) return outcfg
def standard(**kwargs): """ Protocol: Infer ECs from alignment using plmc. .. todo:: 1. make EC enrichment calculation segment-ready 2. explain meaning of parameters in detail. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "focus_mode", "focus_sequence", "theta", "alphabet", "segments", "ignore_gaps", "iterations", "lambda_h", "lambda_J", "lambda_group", "scale_clusters", "cpu", "plmc", "reuse_ecs", "min_sequence_distance", # "save_model", ] ) prefix = kwargs["prefix"] # for now disable option to not save model, since # otherwise mutate stage will crash. To remove model # file at end, use delete option in management section. """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) # regularization strength on couplings J_ij lambda_J = kwargs["lambda_J"] segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # first determine size of alphabet; # default is amino acid alphabet if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN alphabet_setting = None else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # if we have protein alphabet, do not set # as plmc parameter since default parameter, # has some implementation advantages for focus mode if alphabet == ALPHABET_PROTEIN: alphabet_setting = None else: alphabet_setting = alphabet # scale lambda_J to proportionally compensate # for higher number of J_ij compared to h_i? if kwargs["lambda_J_times_Lq"]: num_symbols = len(alphabet) # if we ignore gaps, there is one character less if kwargs["ignore_gaps"]: num_symbols -= 1 # second, determine number of uppercase positions # that are included in the calculation with open(kwargs["alignment_file"]) as f: seq_id, seq = next(read_fasta(f)) # gap character is by convention first char in alphabet gap = alphabet[0] uppercase = [ c for c in seq if c == c.upper() or c == gap ] L = len(uppercase) # finally, scale lambda_J lambda_J *= (num_symbols - 1) * (L - 1) # run plmc... 
or reuse pre-exisiting results from previous run plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg" # determine if to rerun, only possible if previous results # were stored in ali_outcfg_file if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file): plmc_result = read_config_file(plm_outcfg_file) # check if the EC/parameter files are there required_files = [outcfg["raw_ec_file"]] if outcfg["model_file"] is not None: required_files += [outcfg["model_file"]] verify_resources( "Tried to reuse ECs, but empty or " "does not exist", *required_files ) else: # run plmc binary plmc_result = ct.run_plmc( kwargs["alignment_file"], outcfg["raw_ec_file"], outcfg["model_file"], focus_seq=kwargs["focus_sequence"], alphabet=alphabet_setting, theta=kwargs["theta"], scale=kwargs["scale_clusters"], ignore_gaps=kwargs["ignore_gaps"], iterations=kwargs["iterations"], lambda_h=kwargs["lambda_h"], lambda_J=lambda_J, lambda_g=kwargs["lambda_group"], cpu=kwargs["cpu"], binary=kwargs["plmc"], ) # save iteration table to file iter_table_file = prefix + "_iteration_table.csv" plmc_result.iteration_table.to_csv( iter_table_file ) # turn namedtuple into dictionary to make # restarting code nicer plmc_result = dict(plmc_result._asdict()) # then replace table with filename so # we can store results in config file plmc_result["iteration_table"] = iter_table_file # save results of search for possible restart write_config_file(plm_outcfg_file, plmc_result) # store useful information about model in outcfg outcfg.update({ "num_sites": plmc_result["num_valid_sites"], "num_sequences": plmc_result["num_valid_seqs"], "effective_sequences": plmc_result["effective_samples"], "region_start": plmc_result["region_start"], }) # read and sort ECs ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"]) # add mixture model probability ecs = pairs.add_mixture_probability(ecs) if segments is not None: # and (len(segments) > 1 or not kwargs["focus_mode"]): # create index mapping seg_mapper = mapping.SegmentIndexMapper( kwargs["focus_mode"], outcfg["region_start"], *segments ) # apply to EC table ecs = mapping.segment_map_ecs(ecs, seg_mapper) # write updated table to csv file ecs.to_csv(outcfg["ec_file"], index=False) # also store longrange ECs as convenience output if kwargs["min_sequence_distance"] is not None: outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv" ecs_longrange = ecs.query( "abs(i - j) >= {}".format(kwargs["min_sequence_distance"]) ) ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False) # also create line-drawing script (for now, only for single segments) if segments is None or len(segments) == 1: outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml" L = outcfg["num_sites"] ec_lines_pymol_script( ecs_longrange.iloc[:L, :], outcfg["ec_lines_pml_file"] ) # compute EC enrichment (for now, for single segments # only since enrichment code cannot handle multiple segments) if segments is None or len(segments) == 1: outcfg["enrichment_file"] = prefix + "_enrichment.csv" ecs_enriched = pairs.enrichment(ecs) ecs_enriched.to_csv(outcfg["enrichment_file"], index=False) # create corresponding enrichment pymol scripts outcfg["enrichment_pml_files"] = [] for sphere_view, pml_suffix in [ (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml") ]: pml_file = prefix + pml_suffix enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view) outcfg["enrichment_pml_files"].append(pml_file) # output EVzoom JSON file if we have stored model file if outcfg.get("model_file", None) is not 
None: outcfg["evzoom_file"] = prefix + "_evzoom.json" with open(outcfg["evzoom_file"], "w") as f: # load parameters c = CouplingsModel(outcfg["model_file"]) # create JSON output and write to file f.write( evzoom_json(c) + "\n" ) # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_standard.outcfg", outcfg) return outcfg
def mean_field(**kwargs): """ Protocol: Infer ECs from alignment using mean field direct coupling analysis. For now, mean field DCA can only be run in focus mode, gaps included. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * raw_ec_file * model_file * num_sites * num_sequences * effective_sequences * focus_mode (passed through) * focus_sequence (passed through) * segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "segments", "focus_mode", "focus_sequence", "theta", "pseudo_count", "alphabet", "min_sequence_distance", # "save_model", ] ) if not kwargs["focus_mode"]: raise InvalidParameterError( "For now, mean field DCA can only be run in focus mode." ) prefix = kwargs["prefix"] # option to save model disabled """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # TODO: the following are passed through stage... # keep this or unnecessary? "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists alignment_file = kwargs["alignment_file"] verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # determine alphabet # default is protein if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # read in a2m alignment with open(alignment_file) as f: input_alignment = Alignment.from_file( f, alphabet=alphabet, format="fasta" ) # init mean field direct coupling analysis mf_dca = MeanFieldDCA(input_alignment) # run mean field approximation model = mf_dca.fit( theta=kwargs["theta"], pseudo_count=kwargs["pseudo_count"] ) # write ECs to file model.to_raw_ec_file( outcfg["raw_ec_file"] ) # write model file if outcfg["model_file"] is not None: model.to_file( outcfg["model_file"], file_format="plmc_v2" ) # store useful information about model in outcfg outcfg.update({ "num_sites": model.L, "num_sequences": model.N_valid, "effective_sequences": float(round(model.N_eff, 1)), "region_start": int(model.index_list[0]), }) # read and sort ECs ecs = pd.read_csv( outcfg["raw_ec_file"], sep=" ", # for now, call the last two columns # "fn" and "cn" to prevent compare # stage from crashing names=["i", "A_i", "j", "A_j", "fn", "cn"] # names=["i", "A_i", "j", "A_j", "mi", "di"] ).sort_values( by="cn", ascending=False ) # write the sorted ECs table to csv file ecs.to_csv(outcfg["ec_file"], index=False) # also store longrange ECs as convenience output if kwargs["min_sequence_distance"] is not None: outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv" ecs_longrange = ecs.query( "abs(i - j) >= {}".format(kwargs["min_sequence_distance"]) ) ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False) # also create line-drawing script (for now, only for single segments) if segments is None or len(segments) == 1: outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml" L = outcfg["num_sites"] ec_lines_pymol_script( 
ecs_longrange.iloc[:L, :], outcfg["ec_lines_pml_file"], score_column="cn"  # alternative: "di" ) # compute EC enrichment (for now, for single segments # only since enrichment code cannot handle multiple segments) if segments is None or len(segments) == 1: outcfg["enrichment_file"] = prefix + "_enrichment.csv" ecs_enriched = pairs.enrichment(ecs, score="cn")  # alternative: "di" ecs_enriched.to_csv(outcfg["enrichment_file"], index=False) # create corresponding enrichment pymol scripts outcfg["enrichment_pml_files"] = [] for sphere_view, pml_suffix in [ (True, "_enrichment_spheres.pml"), (False, "_enrichment_sausage.pml") ]: pml_file = prefix + pml_suffix enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view) outcfg["enrichment_pml_files"].append(pml_file) # output EVzoom JSON file if we have stored model file if outcfg.get("model_file", None) is not None: outcfg["evzoom_file"] = prefix + "_evzoom.json" with open(outcfg["evzoom_file"], "w") as f: # create JSON output and write to file f.write( evzoom_json(model) + "\n" ) # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_standard.outcfg", outcfg) return outcfg
def genome_distance(**kwargs): """ Protocol: Concatenate alignments based on genomic distance Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * alignment_file * raw_alignment_file * focus_mode * focus_sequence * segments * frequencies_file * identities_file * num_sequences * num_sites * raw_focus_alignment_file * statistics_file """ check_required( kwargs, [ "prefix", "first_alignment_file", "second_alignment_file", "first_focus_sequence", "second_focus_sequence", "first_focus_mode", "second_focus_mode", "first_region_start", "second_region_start", "first_segments", "second_segments", "genome_distance_threshold", "first_genome_location_file", "second_genome_location_file", "first_annotation_file", "second_annotation_file" ] ) prefix = kwargs["prefix"] # make sure input alignments exist verify_resources( "Input alignment does not exist", kwargs["first_alignment_file"], kwargs["second_alignment_file"] ) verify_resources( "Genome location file does not exist", kwargs["first_genome_location_file"], kwargs["second_genome_location_file"] ) # make sure output directory exists create_prefix_folders(prefix) # load the information for each monomer alignment alignment_1 = kwargs["first_alignment_file"] alignment_2 = kwargs["second_alignment_file"] genome_location_filename_1 = kwargs["first_genome_location_file"] genome_location_filename_2 = kwargs["second_genome_location_file"] gene_location_table_1 = pd.read_csv(genome_location_filename_1, header=0) gene_location_table_2 = pd.read_csv(genome_location_filename_2, header=0) # find all possible matches possible_partners = find_possible_partners( gene_location_table_1, gene_location_table_2 ) # find the best reciprocal matches id_pairing_unfiltered = best_reciprocal_matching(possible_partners) # filter best reciprocal matches by genome distance threshold if kwargs["genome_distance_threshold"]: distance_threshold = kwargs["genome_distance_threshold"] id_pairing = id_pairing_unfiltered.query("distance < @distance_threshold") else: id_pairing = id_pairing_unfiltered id_pairing.loc[:, "id_1"] = id_pairing.loc[:, "uniprot_id_1"] id_pairing.loc[:, "id_2"] = id_pairing.loc[:, "uniprot_id_2"] # write concatenated alignment with distance filtering # TODO: save monomer alignments? 
target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \ write_concatenated_alignment( id_pairing, alignment_1, alignment_2, kwargs["first_focus_sequence"], kwargs["second_focus_sequence"] ) # save the alignment files raw_alignment_file = prefix + "_raw.fasta" with open(raw_alignment_file, "w") as of: raw_ali.write(of) mon_alignment_file_1 = prefix + "_monomer_1.fasta" with open(mon_alignment_file_1, "w") as of: mon_ali_1.write(of) mon_alignment_file_2 = prefix + "_monomer_2.fasta" with open(mon_alignment_file_2, "w") as of: mon_ali_2.write(of) # filter the alignment aln_outcfg, _ = modify_alignment( raw_ali, target_seq_index, target_seq_id, kwargs["first_region_start"], **kwargs ) # make sure we return all the necessary information: # * alignment_file: final concatenated alignment that will go into plmc # * focus_sequence: this is the identifier of the concatenated target # sequence which will be passed into plmc with -f outcfg = aln_outcfg outcfg["raw_alignment_file"] = raw_alignment_file outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1 outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2 outcfg["focus_sequence"] = target_seq_id # Update the segments outcfg = modify_complex_segments(outcfg, **kwargs) # Describe the statistics of the concatenation outcfg = _run_describe_concatenation(outcfg, **kwargs) # plot the genome distance distribution outcfg["distance_plot_file"] = prefix + "_distplot.pdf" plot_distance_distribution(id_pairing_unfiltered, outcfg["distance_plot_file"]) return outcfg
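# Minimal sketch of the genome-distance filter used above, on hypothetical data:
# pairs returned by best_reciprocal_matching carry a "distance" column, and
# DataFrame.query with an @-prefixed local variable keeps only close neighbours.
import pandas as pd

id_pairing_unfiltered = pd.DataFrame({
    "uniprot_id_1": ["A0A000", "A0A001"],   # hypothetical identifiers
    "uniprot_id_2": ["B0B000", "B0B001"],
    "distance": [850, 25000],               # genomic distance between gene pairs
})
distance_threshold = 10000
id_pairing = id_pairing_unfiltered.query("distance < @distance_threshold")
# -> only the first pair (distance 850) is kept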
def run_jobs(configs, global_config, overwrite=False, workdir=None, abort_on_error=True, environment=None): """ Submit config to pipeline Parameters ---------- configs : dict Configurations for individual subjobs global_config : dict Master configuration (if only one job, the contents of this dictionary will be equal to the single element of config_files) overwrite : bool, optional (default: False) If True, allows overwriting previous run of the same config, otherwise will fail if results from previous execution are present workdir : str, optional (default: None) Workdir in which to run job (will combine workdir and prefix in joint path) abort_on_error : bool, optional (default: True) Abort entire job submission if error occurs for one of the jobs by propagating RuntimeError environment : str, optional (default: None) Allow to pass value for environment parameter of submitter, will override environment.configuration from global_config (e.g., for setting environment variables like passwords) Returns ------- job_ids : dict Mapping from subjob prefix (keys in configs parameter) to identifier returned by submitter for each of the jobs that was *successfully* submitted (i.e. missing keys from configs param indicate these jobs could not be submitted). Raises ------ RuntimeError If error encountered during submission and abort_on_error is True """ cmd_base = environ.get("EVCOUPLINGS_RUNCFG_APP") or "evcouplings_runcfg" summ_base = environ.get( "EVCOUPLINGS_SUMMARIZE_APP") or "evcouplings_summarize" # determine output directory for config files prefix = global_config["global"]["prefix"] # integrate working directory into output prefix # if it is given; if prefix contains an absolute path, # this will override the workdir according to # implementation of path.join() if workdir is not None: out_prefix = path.join(workdir, prefix) else: out_prefix = prefix # save configuration file, make sure we do not overwrite previous run # if overwrite protection is activated # (but only if it is a valid configuration file with contents) cfg_filename = CONFIG_NAME.format(out_prefix) if not overwrite and valid_file(cfg_filename): raise InvalidParameterError( "Existing configuration file {} ".format(cfg_filename) + "indicates current prefix {} ".format(prefix) + "would overwrite existing results. Use --yolo " + "flag to deactivate overwrite protection (e.g. 
for " "restarting a job or running a different stage).") # make sure working directory exists create_prefix_folders(cfg_filename) # write global config file write_config_file(cfg_filename, global_config) # also write individual subjob configuration files # (we have to write these before submitting, since # the job summarizer needs the paths to all files) for subjob_prefix, subjob_cfg in configs.items(): # determine working dir for each subjob, since subjob # prefix may contain slashes leading to subfolder creation if workdir is not None: subjob_out_prefix = path.join(workdir, subjob_prefix) else: subjob_out_prefix = subjob_prefix subcfg_filename = CONFIG_NAME.format(subjob_out_prefix) # make sure output subfolder exists create_prefix_folders(subcfg_filename) # write subjob configuration file write_config_file(subcfg_filename, subjob_cfg) # now create list of subjob config files relative to working # directory (above, we allow to run submitted in arbitrary directory) config_files = [ CONFIG_NAME.format(subjob_prefix) for subjob_prefix in configs ] # create command for summarizer (needs to know all subjob config files) summ_cmd = "{} {} {} {}".format(summ_base, global_config["pipeline"], global_config["global"]["prefix"], " ".join(config_files)) # create submitter from global (pre-unrolling) configuration submitter = utils.SubmitterFactory(global_config["environment"]["engine"], db_path=out_prefix + "_job_database.txt") # collect individual submitted jobs here commands = [] # record subjob IDs returned by submitter for each job job_ids = {} # prepare individual jobs for submission for job, job_cfg in configs.items(): job_prefix = job_cfg["global"]["prefix"] job_cfg_file = CONFIG_NAME.format(job) # create submission command env = job_cfg["environment"] cmd = utils.Command( ["{} {}".format(cmd_base, job_cfg_file), summ_cmd], name=job_prefix, environment=environment or env["configuration"], workdir=workdir, resources={ utils.EResource.queue: env["queue"], utils.EResource.time: env["time"], utils.EResource.mem: env["memory"], utils.EResource.nodes: env["cores"], utils.EResource.out: job_prefix + "_stdout.log", utils.EResource.error: job_prefix + "_stderr.log", }) # store job for later dependency creation commands.append(cmd) tracker = get_result_tracker(job_cfg) try: # finally, submit job current_job_id = submitter.submit(cmd) # store run identifier returned by submitter # TODO: consider storing current_job_id using tracker right away job_ids[job] = current_job_id # set job status in database to pending tracker.update(status=EStatus.PEND) except RuntimeError as e: # set job as failed in database tracker.update(status=EStatus.FAIL, message=str(e)) # fail entire job submission if requested if abort_on_error: raise # submit final summarizer # (hold for now - summarizer is run after each subjob finishes) # wait for all runs to finish (but only if blocking) submitter.join() # return job identifiers return job_ids
def best_hit(**kwargs): """ Protocol: Concatenate alignments based on the best hit to the focus sequence in each species Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: alignment_file raw_alignment_file focus_mode focus_sequence segments frequencies_file identities_file num_sequences num_sites raw_focus_alignment_file statistics_file """ check_required( kwargs, [ "prefix", "first_alignment_file", "second_alignment_file", "first_focus_sequence", "second_focus_sequence", "first_focus_mode", "second_focus_mode", "first_segments", "second_segments", "first_identities_file", "second_identities_file", "first_annotation_file", "second_annotation_file", "use_best_reciprocal", "paralog_identity_threshold" ] ) prefix = kwargs["prefix"] # make sure input alignments verify_resources( "Input alignment does not exist", kwargs["first_alignment_file"], kwargs["second_alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) def _load_monomer_info(annotations_file, identities_file, target_sequence, alignment_file, use_best_reciprocal, identity_threshold): # read in annotation to a file and rename the appropriate column annotation_table = read_species_annotation_table(annotations_file) # read identity file similarities = pd.read_csv(identities_file) # create a pd.DataFrame containing the best hit in each organism most_similar_in_species = most_similar_by_organism(similarities, annotation_table) if use_best_reciprocal: paralogs = find_paralogs( target_sequence, annotation_table, similarities, identity_threshold ) most_similar_in_species = filter_best_reciprocal( alignment_file, paralogs, most_similar_in_species ) return most_similar_in_species # load the information about each monomer alignment most_similar_in_species_1 = _load_monomer_info( kwargs["first_annotation_file"], kwargs["first_identities_file"], kwargs["first_focus_sequence"], kwargs["first_alignment_file"], kwargs["use_best_reciprocal"], kwargs["paralog_identity_threshold"] ) most_similar_in_species_2 = _load_monomer_info( kwargs["second_annotation_file"], kwargs["second_identities_file"], kwargs["second_focus_sequence"], kwargs["second_alignment_file"], kwargs["use_best_reciprocal"], kwargs["paralog_identity_threshold"] ) # merge the two dataframes to get all species found in # both alignments species_intersection = most_similar_in_species_1.merge( most_similar_in_species_2, how="inner", # takes the intersection on="species", # merges on species identifiers suffixes=("_1", "_2") ) # write concatenated alignment with distance filtering # TODO: save monomer alignments? 
target_seq_id, target_seq_index, raw_ali, mon_ali_1, mon_ali_2 = \ write_concatenated_alignment( species_intersection, kwargs["first_alignment_file"], kwargs["second_alignment_file"], kwargs["first_focus_sequence"], kwargs["second_focus_sequence"] ) # save the alignment files raw_alignment_file = prefix + "_raw.fasta" with open(raw_alignment_file, "w") as of: raw_ali.write(of) mon_alignment_file_1 = prefix + "_monomer_1.fasta" with open(mon_alignment_file_1, "w") as of: mon_ali_1.write(of) mon_alignment_file_2 = prefix + "_monomer_2.fasta" with open(mon_alignment_file_2, "w") as of: mon_ali_2.write(of) aln_outcfg, _ = modify_alignment( raw_ali, target_seq_index, target_seq_id, kwargs["first_region_start"], **kwargs ) # make sure we return all the necessary information: # * alignment_file: final concatenated alignment that will go into plmc # * focus_sequence: this is the identifier of the concatenated target # sequence which will be passed into plmc with -f outcfg = aln_outcfg outcfg["raw_alignment_file"] = raw_alignment_file outcfg["first_concatenated_monomer_alignment_file"] = mon_alignment_file_1 outcfg["second_concatenated_monomer_alignment_file"] = mon_alignment_file_2 outcfg["focus_sequence"] = target_seq_id # Update the segments outcfg = modify_complex_segments(outcfg, **kwargs) # Describe the statistics of the concatenation outcfg = _run_describe_concatenation(outcfg, **kwargs) return outcfg
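# Small illustration of the species intersection above (hypothetical tables):
# an inner merge on "species" keeps only species present in both alignments and
# suffixes the remaining columns with _1 and _2.
import pandas as pd

hits_1 = pd.DataFrame({"species": ["Escherichia coli", "Bacillus subtilis"], "id": ["X1", "X2"]})
hits_2 = pd.DataFrame({"species": ["Escherichia coli", "Staphylococcus aureus"], "id": ["Y1", "Y2"]})
species_intersection = hits_1.merge(
    hits_2, how="inner", on="species", suffixes=("_1", "_2")
)
# -> a single row for "Escherichia coli" with columns id_1 and id_2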
def standard(**kwargs): """ Protocol: Compare ECs for single proteins (or domains) to 3D structure information Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * mutation_matrix_file * [mutation_dataset_predicted_file] """ check_required(kwargs, [ "prefix", "model_file", "mutation_dataset_file", ]) prefix = kwargs["prefix"] outcfg = { "mutation_matrix_file": prefix + "_single_mutant_matrix.csv", "mutation_matrix_plot_files": [], } # make sure model file exists verify_resources("Model parameter file does not exist", kwargs["model_file"]) # make sure output directory exists create_prefix_folders(prefix) # load couplings object, and create independent model c = CouplingsModel(kwargs["model_file"]) c0 = c.to_independent_model() for model, type_ in [(c, "Epistatic"), (c0, "Independent")]: # interactive plot using bokeh filename = prefix + "_{}_model".format(type_.lower(), ) output_file(filename + ".html", "{} model".format(type_)) fig = evcouplings.visualize.mutations.plot_mutation_matrix( model, engine="bokeh") save(fig) outcfg["mutation_matrix_plot_files"].append(filename + ".html") # static matplotlib plot evcouplings.visualize.mutations.plot_mutation_matrix(model) plt.savefig(filename + ".pdf", bbox_inches="tight") outcfg["mutation_matrix_plot_files"].append(filename + ".pdf") # create single mutation matrix table, # add prediction by independent model and # save to file singles = single_mutant_matrix(c, output_column="prediction_epistatic") singles = predict_mutation_table(c0, singles, "prediction_independent") singles.to_csv(outcfg["mutation_matrix_file"], index=False) # Pymol scripts outcfg["mutations_epistatic_pml_files"] = [] for model in ["epistatic", "independent"]: pml_filename = prefix + "_{}_model.pml".format(model) evcouplings.visualize.mutations.mutation_pymol_script( singles, pml_filename, effect_column="prediction_" + model) outcfg["mutations_epistatic_pml_files"].append(pml_filename) # predict experimental dataset if given dataset_file = kwargs["mutation_dataset_file"] if dataset_file is not None: verify_resources("Dataset file does not exist", dataset_file) data = pd.read_csv(dataset_file, comment="#") # add epistatic model prediction data_pred = predict_mutation_table(c, data, "prediction_epistatic") # add independent model prediction data_pred = predict_mutation_table(c0, data_pred, "prediction_independent") outcfg[ "mutation_dataset_predicted_file"] = prefix + "_dataset_predicted.csv" data_pred.to_csv(outcfg["mutation_dataset_predicted_file"], index=False) return outcfg
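# Hedged usage sketch of the mutation-effect calls above; the model path is
# hypothetical and the import locations are assumptions (the protocol uses these
# names without showing the modules they come from).
from evcouplings.couplings import CouplingsModel                              # assumed import path
from evcouplings.mutate import single_mutant_matrix, predict_mutation_table  # assumed import path

c = CouplingsModel("output/EXAMPLE.model")    # hypothetical model file
c0 = c.to_independent_model()
singles = single_mutant_matrix(c, output_column="prediction_epistatic")
singles = predict_mutation_table(c0, singles, "prediction_independent")
singles.to_csv("EXAMPLE_single_mutant_matrix.csv", index=False)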
def execute(**config): """ Execute a pipeline configuration Parameters ---------- **config Input configuration for pipeline (see pipeline config files for example of how this should look like) Returns ------- global_state : dict Global output state of pipeline """ check_required(config, ["pipeline", "stages", "global"]) # check if valid pipeline was selected if config["pipeline"] not in PIPELINES: raise InvalidParameterError("Not a valid pipeline selection. " "Valid choices are:\n{}".format(", ".join( PIPELINES.keys()))) stages = config["stages"] if stages is None: raise InvalidParameterError("No stages defined, need at least one.") # get definition of selected pipeline pipeline = PIPELINES[config["pipeline"]] prefix = config["global"]["prefix"] # make sure output directory exists create_prefix_folders(prefix) # this is the global state of results as # we move through different stages of # the pipeline global_state = config["global"] # keep track of how many stages are still # to be run, so we can leave out stages at # the end of workflow below num_stages_to_run = len(stages) # get job tracker tracker = get_result_tracker(config) # set job status to running and also initalize global state tracker.update(status=EStatus.RUN, results=global_state) # iterate through individual stages for (stage, runner, key_prefix) in pipeline: # check if anything else is left to # run, otherwise skip if num_stages_to_run == 0: break # check if config for stage is there check_required(config, [stage]) # output files for stage into an individual folder stage_prefix = insert_dir(prefix, stage) create_prefix_folders(stage_prefix) # config files for input and output of stage stage_incfg = "{}_{}.incfg".format(stage_prefix, stage) stage_outcfg = "{}_{}.outcfg".format(stage_prefix, stage) # update current stage of job tracker.update(stage=stage) # check if stage should be executed if stage in stages: # global state inserted at end, overrides any # stage-specific settings (except for custom prefix) incfg = { **config["tools"], **config["databases"], **config[stage], **global_state, "prefix": stage_prefix } # save input of stage in config file write_config_file(stage_incfg, incfg) # run stage outcfg = runner(**incfg) # prefix output keys if this parameter is # given in stage configuration, to avoid # name clashes if same protocol run multiple times if key_prefix is not None: outcfg = {key_prefix + k: v for k, v in outcfg.items()} # save output of stage in config file write_config_file(stage_outcfg, outcfg) # one less stage to put through after we ran this... num_stages_to_run -= 1 else: # skip state by injecting state from previous run verify_resources( "Trying to skip, but output configuration " "for stage '{}' does not exist. 
Has it already " "been run?".format(stage, stage), stage_outcfg) # read output configuration outcfg = read_config_file(stage_outcfg) # verify all the output files are there outfiles = [ filepath for f, filepath in outcfg.items() if f.endswith("_file") and filepath is not None ] verify_resources( "Output files from stage '{}' " "missing".format(stage), *outfiles) # update global state with outputs of stage global_state = {**global_state, **outcfg} # update state in tracker accordingly tracker.update(results=outcfg) # create results archive archive_file = create_archive(config, global_state, prefix) # only store results archive if a result file was created if archive_file is not None: global_state["archive_file"] = archive_file # prepare update for tracker, but only store in last # go when job is set to done tracker_archive_update = {"archive_file": archive_file} else: tracker_archive_update = None # set job status to done and transfer archive if selected for syncing tracker.update(status=EStatus.DONE, results=tracker_archive_update) # delete selected output files if requested; # tracker does not need to update here since it won't # sync entries of delete list in the first place global_state = delete_outputs(config, global_state) # write final global state of pipeline write_config_file(prefix + FINAL_CONFIG_SUFFIX, global_state) return global_state
def standard(**kwargs): """ Protocol: Predict 3D structure from evolutionary couplings Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * sec_struct_file * folding_ec_file * folded_structure_files """ check_required( kwargs, [ "prefix", "engine", "ec_file", "target_sequence_file", "segments", "folding_config_file", "cut_to_alignment_region", "sec_struct_method", "reuse_sec_struct", "sec_struct_file", "filter_sec_struct_clashes", "min_sequence_distance", "fold_probability_cutoffs", "fold_lowest_count", "fold_highest_count", "fold_increase", "num_models", "psipred", "cpu", "remapped_pdb_files", "cleanup", ] ) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) outcfg = { "folding_ec_file": prefix + "_CouplingScores_with_clashes.csv", "sec_struct_file": prefix + "_secondary_structure.csv", } # get secondary structure prediction # check if we should (and can) reuse output file from previous run if kwargs["reuse_sec_struct"] and valid_file(outcfg["sec_struct_file"]): residues = pd.read_csv(outcfg["sec_struct_file"]) else: residues = secondary_structure(**kwargs) # make pymol secondary structure assignment script outcfg["secondary_structure_pml_file"] = prefix + "_ss_draw.pml" pymol_secondary_structure( residues, outcfg["secondary_structure_pml_file"] ) # load ECs and filter for long-range pairs verify_resources( "EC file does not exist", kwargs["ec_file"] ) ecs_all = pd.read_csv(kwargs["ec_file"]) ecs = ecs_all.query("abs(i - j) > {}".format( kwargs["min_sequence_distance"]) ) # find secondary structure clashes ecs = secstruct_clashes(ecs, residues) ecs.to_csv(outcfg["folding_ec_file"], index=False) # if requested, filter clashes out before folding if kwargs["filter_sec_struct_clashes"]: ecs_fold = ecs.loc[~ecs.ss_clash] else: ecs_fold = ecs # cut modelled region to aligned region, if selected if kwargs["cut_to_alignment_region"]: segments = kwargs["segments"] # infer region from segment positions if we have it if segments is not None: positions = Segment.from_list(segments[0]).positions else: # otherwise get from EC values (could be misleading if # EC list is truncated, so only second option) positions = set(ecs.i.unique()).union(ecs.j.unique()) # limit modelled positions to covered region first_pos, last_pos = min(positions), max(positions) residues.loc[:, "in_model"] = False residues.loc[ (residues.i >= first_pos) & (residues.i <= last_pos), "in_model" ] = True else: # otherwise include all positions in model residues.loc[:, "in_model"] = True # save secondary structure prediction residues.to_csv(outcfg["sec_struct_file"], index=False) # only use the residues that will be in model for folding residues_fold = residues.loc[residues.in_model] # after all the setup, now fold the structures... # to speed things up, parallelize this to the number of # available CPUs num_procs = kwargs["cpu"] if num_procs is None: num_procs = 1 # first define all the sub-runs... folding_runs = [] # ... based on mixture model probability cutoffs = kwargs["fold_probability_cutoffs"] if cutoffs is not None and "probability" in ecs_fold.columns: if not isinstance(cutoffs, list): cutoffs = [cutoffs] for c in cutoffs: sig_ecs = ecs_fold.query("probability >= @c") if len(sig_ecs) > 0: folding_runs.append( (sig_ecs, "_significant_ECs_{}".format(c)) ) # ... 
and on simple EC counts/bins flc = kwargs["fold_lowest_count"] fhc = kwargs["fold_highest_count"] fi = kwargs["fold_increase"] if flc is not None and fhc is not None and fi is not None: num_sites = len( set.union(set(ecs.i.unique()), set(ecs.j.unique())) ) # transform fraction of number of sites into discrete number of ECs def _discrete_count(x): if isinstance(x, float): x = ceil(x * num_sites) return int(x) # range of plots to make lowest = _discrete_count(flc) highest = _discrete_count(fhc) step = _discrete_count(fi) # append to list of jobs to run folding_runs += [ ( ecs_fold.iloc[:c], "_{}".format(c) ) for c in range(lowest, highest + 1, step) ] # set up method to drive the folding of each job method = kwargs["engine"] # store structures in an auxiliary subdirectory, after folding # final models will be moved to main folding dir. Depending # on cleanup setting, the aux directory will be removed aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) aux_dir = path.dirname(aux_prefix) folding_runs = [ (job_ecs, aux_prefix + job_suffix) for (job_ecs, job_suffix) in folding_runs ] if method == "cns_dgsa": folder = partial( cns_dgsa_fold, residues_fold, config_file=kwargs["folding_config_file"], num_structures=kwargs["num_models"], log_level=None, binary=kwargs["cns"] ) else: raise InvalidParameterError( "Invalid folding engine: {} ".format(method) + "Valid selections are: cns_dgsa" ) # then apply folding function to each sub-run pool = mp.Pool(processes=num_procs) results = pool.starmap(folder, folding_runs) # make double sure that the pool is cleaned up, # or SIGTERM upon exit will interfere with # interrupt signal interception pool.close() pool.join() # merge result dictionaries into one dict folded_files = { k: v for subres in results for k, v in subres.items() } # move structures from aux into main folding dir fold_dir = path.dirname(prefix) prediction_files = [] for name, file_path in folded_files.items(): # move file (use copy to allow overwriting) shutil.copy(file_path, fold_dir) # update file path to main folding dir, # and put in a flat list of result files prediction_files.append( file_path.replace(aux_prefix, prefix) ) outcfg["folded_structure_files"] = prediction_files # remove aux dir if cleanup is requested if kwargs["cleanup"]: shutil.rmtree(aux_dir) # apply ranking to predicted models ranking = dihedral_ranking(prediction_files, residues) # apply clustering (all available methods), but only # if we have something to cluster if len(prediction_files) > 1: clustering = maxcluster_clustering_table( prediction_files, binary=kwargs["maxcluster"] ) # join ranking with clustering ranking = ranking.merge(clustering, on="filename", how="left") # sort by score (best models first) ranking = ranking.sort_values(by="ranking_score", ascending=False) # store as file outcfg["folding_ranking_file"] = prefix + "_ranking.csv" ranking.to_csv(outcfg["folding_ranking_file"], index=False) # apply comparison to existing structures if kwargs["remapped_pdb_files"] is not None and len(kwargs["remapped_pdb_files"]) > 0: experimental_files = kwargs["remapped_pdb_files"] comp_all, comp_singles = compare_models_maxcluster( list(experimental_files.keys()), prediction_files, norm_by_intersection=True, distance_cutoff=None, binary=kwargs["maxcluster"] ) # merge with ranking and save comparison = ranking.merge( comp_all, on="filename", how="left" ).sort_values(by="tm", ascending=False) outcfg["folding_comparison_file"] = prefix + "_comparison.csv" comparison.to_csv(outcfg["folding_comparison_file"], 
index=False) # also store comparison to structures in individual files ind_comp_files = {} for filename, comp_single in comp_singles.items(): comparison_s = ranking.merge( comp_single, on="filename", how="left" ).sort_values(by="tm", ascending=False) basename = path.splitext(path.split(filename)[1])[0] ind_file = path.join(fold_dir, basename + ".csv") # map back to original key from remapped_pdb_files as a key for this list ind_comp_files[ind_file] = experimental_files[filename] comparison_s.to_csv(ind_file, index=False) outcfg["folding_individual_comparison_files"] = ind_comp_files return outcfg
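# Worked example of the _discrete_count conversion used above (numbers illustrative):
# fractional cutoffs are multiplied by the number of EC sites and rounded up.
from math import ceil

num_sites = 150
lowest = ceil(0.5 * num_sites)     # fold_lowest_count = 0.5  -> 75 ECs
highest = ceil(1.3 * num_sites)    # fold_highest_count = 1.3 -> 195 ECs
step = ceil(0.05 * num_sites)      # fold_increase = 0.05     -> 8 ECs
ec_counts = list(range(lowest, highest + 1, step))   # 75, 83, ..., 195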
def infer_plmc(**kwargs): """ Run EC computation on alignment. This function contains the functionality shared between monomer and complex EC inference. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: raw_ec_file model_file num_sites num_sequences effective_sequences focus_mode (passed through) focus_sequence (passed through) segments (passed through) """ check_required( kwargs, [ "prefix", "alignment_file", "focus_mode", "focus_sequence", "theta", "alphabet", "segments", "ignore_gaps", "iterations", "lambda_h", "lambda_J", "lambda_group", "scale_clusters", "cpu", "plmc", "reuse_ecs", ] ) prefix = kwargs["prefix"] # for now disable option to not save model, since # otherwise mutate stage will crash. To remove model # file at end, use delete option in management section. """ if kwargs["save_model"]: model = prefix + ".model" else: model = None """ model = prefix + ".model" outcfg = { "model_file": model, "raw_ec_file": prefix + "_ECs.txt", "ec_file": prefix + "_CouplingScores.csv", # the following are passed through stage... "focus_mode": kwargs["focus_mode"], "focus_sequence": kwargs["focus_sequence"], "segments": kwargs["segments"], } # make sure input alignment exists verify_resources( "Input alignment does not exist", kwargs["alignment_file"] ) # make sure output directory exists create_prefix_folders(prefix) # regularization strength on couplings J_ij lambda_J = kwargs["lambda_J"] segments = kwargs["segments"] if segments is not None: segments = [ mapping.Segment.from_list(s) for s in segments ] # first determine size of alphabet; # default is amino acid alphabet if kwargs["alphabet"] is None: alphabet = ALPHABET_PROTEIN alphabet_setting = None else: alphabet = kwargs["alphabet"] # allow shortcuts for protein, DNA, RNA if alphabet in ALPHABET_MAP: alphabet = ALPHABET_MAP[alphabet] # if we have protein alphabet, do not set # as plmc parameter since default parameter, # has some implementation advantages for focus mode if alphabet == ALPHABET_PROTEIN: alphabet_setting = None else: alphabet_setting = alphabet # scale lambda_J to proportionally compensate # for higher number of J_ij compared to h_i? if kwargs["lambda_J_times_Lq"]: num_symbols = len(alphabet) # if we ignore gaps, there is one character less if kwargs["ignore_gaps"]: num_symbols -= 1 # second, determine number of uppercase positions # that are included in the calculation with open(kwargs["alignment_file"]) as f: seq_id, seq = next(read_fasta(f)) # gap character is by convention first char in alphabet gap = alphabet[0] uppercase = [ c for c in seq if c == c.upper() or c == gap ] L = len(uppercase) # finally, scale lambda_J lambda_J *= (num_symbols - 1) * (L - 1) # run plmc... 
or reuse pre-existing results from previous run plm_outcfg_file = prefix + ".couplings_standard_plmc.outcfg" # determine whether to rerun, only possible if previous results # were stored in plm_outcfg_file if kwargs["reuse_ecs"] and valid_file(plm_outcfg_file): plmc_result = read_config_file(plm_outcfg_file) # check if the EC/parameter files are there required_files = [outcfg["raw_ec_file"]] if outcfg["model_file"] is not None: required_files += [outcfg["model_file"]] verify_resources( "Tried to reuse ECs, but empty or " "does not exist", *required_files ) else: # run plmc binary plmc_result = ct.run_plmc( kwargs["alignment_file"], outcfg["raw_ec_file"], outcfg["model_file"], focus_seq=kwargs["focus_sequence"], alphabet=alphabet_setting, theta=kwargs["theta"], scale=kwargs["scale_clusters"], ignore_gaps=kwargs["ignore_gaps"], iterations=kwargs["iterations"], lambda_h=kwargs["lambda_h"], lambda_J=lambda_J, lambda_g=kwargs["lambda_group"], cpu=kwargs["cpu"], binary=kwargs["plmc"], ) # save iteration table to file iter_table_file = prefix + "_iteration_table.csv" plmc_result.iteration_table.to_csv( iter_table_file ) # turn namedtuple into dictionary to make # restarting code nicer plmc_result = dict(plmc_result._asdict()) # then replace table with filename so # we can store results in config file plmc_result["iteration_table"] = iter_table_file # save results of search for possible restart write_config_file(plm_outcfg_file, plmc_result) # store useful information about model in outcfg outcfg.update({ "num_sites": plmc_result["num_valid_sites"], "num_valid_sequences": plmc_result["num_valid_seqs"], "effective_sequences": plmc_result["effective_samples"], "region_start": plmc_result["region_start"], }) # read and sort ECs ecs = pairs.read_raw_ec_file(outcfg["raw_ec_file"]) if segments is not None: # create index mapping seg_mapper = mapping.SegmentIndexMapper( kwargs["focus_mode"], outcfg["region_start"], *segments ) # apply to EC table ecs = mapping.segment_map_ecs(ecs, seg_mapper) return outcfg, ecs, segments
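# Worked example of the lambda_J scaling above (illustrative numbers): with the
# default protein alphabet (gap plus 20 amino acids, i.e. 21 symbols),
# ignore_gaps=False and L = 200 match columns, a raw lambda_J of 0.01 becomes:
num_symbols, L, lambda_J = 21, 200, 0.01
lambda_J *= (num_symbols - 1) * (L - 1)    # 0.01 * 20 * 199 = 39.8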
def secondary_structure(**kwargs): """ Predict or load secondary structure for an input sequence Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- residues : pandas.DataFrame Table with sequence and secondary structure in columns i, A_i and sec_struct_3state """ check_required( kwargs, [ "prefix", "target_sequence_file", "segments", "sec_struct_method", "sec_struct_file", "psipred", ] ) prefix = kwargs["prefix"] create_prefix_folders(prefix) secstruct_file = kwargs["sec_struct_file"] if secstruct_file is not None: verify_resources( "Secondary structure prediction file does not exist/is empty", secstruct_file ) residues = pd.read_csv(secstruct_file) else: # make sure target sequence file is there so we can # predict secondary structure target_seq_file = kwargs["target_sequence_file"] verify_resources( "Sequence file does not exist/is empty", target_seq_file ) # we need to figure out what the index of the first residue # in the target sequence is; obtain first index from segment # information if possible if kwargs["segments"] is not None: s = Segment.from_list(kwargs["segments"][0]) first_index = s.region_start else: # otherwise try to get it from sequence file first_index = None with open(target_seq_file) as f: header, _ = next(read_fasta(f)) if header is not None: _, first_index, _ = parse_header(header) # if we cannot identify first index from header, # do not make guesses but fail if first_index is None: raise InvalidParameterError( "Could not unambiguously identify sequence range from " "FASTA header, needs to specified as id/start-end: {}".format( header ) ) # finally, run secondary structure prediction if kwargs["sec_struct_method"] == "psipred": # store psipred output in a separate directory output_dir = path.join(path.dirname(prefix), "psipred") # run psipred ss2_file, horiz_file = run_psipred( target_seq_file, output_dir, binary=kwargs["psipred"] ) # parse output, renumber to first index residues = read_psipred_prediction( horiz_file, first_index=first_index ) else: raise InvalidParameterError( "Secondary structure prediction method not implemented: " "{}. Valid choices: psipred".format(kwargs["sec_struct_method"]) ) # return predicted table return residues
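# The target FASTA header has to encode the sequence range as id/start-end.
# A simplified stand-in for what parse_header extracts above (example value):
header = "EXAMPLE_HUMAN/1-166"
seq_id, region = header.split("/")
first_index = int(region.split("-")[0])    # -> 1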
def standard(**kwargs): """ Protocol: Standard buildali4 workflow (run iterative jackhmmer search against sequence database, than determine which sequences and columns to include in the calculation based on coverage and maximum gap thresholds). Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * sequence_id (passed through from input) * first_index (passed through from input) * alignment_file * raw_alignment_file * raw_focus_alignment_file * statistics_file * target_sequence_file * sequence_file * annotation_file * frequencies_file * identities_file * hittable_file * focus_mode * focus_sequence * segments ali : Alignment Final sequence alignment """ check_required(kwargs, [ "prefix", "extract_annotation", ]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # first step of protocol is to get alignment using # jackhmmer; initialize output configuration with # results of this search jackhmmer_outcfg = jackhmmer_search(**kwargs) stockholm_file = jackhmmer_outcfg["raw_alignment_file"] segment = Segment.from_list(jackhmmer_outcfg["segments"][0]) target_seq_id = segment.sequence_id region_start = segment.region_start region_end = segment.region_end # read in stockholm format (with full annotation) with open(stockholm_file) as a: ali_raw = Alignment.from_file(a, "stockholm") # and store as FASTA file first (disabled for now # since equivalent information easily be obtained # from Stockholm file """ ali_raw_fasta_file = prefix + "_raw.fasta" with open(ali_raw_fasta_file, "w") as f: ali_raw.write(f, "fasta") """ # save annotation in sequence headers (species etc.) if kwargs["extract_annotation"]: annotation_file = prefix + "_annotation.csv" annotation = extract_header_annotation(ali_raw) annotation.to_csv(annotation_file, index=False) # center alignment around focus/search sequence focus_cols = np.array([c != "-" for c in ali_raw[0]]) focus_ali = ali_raw.select(columns=focus_cols) target_seq_index = 0 mod_outcfg, ali = modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, **kwargs) # merge results of jackhmmer_search and modify_alignment stage outcfg = { **jackhmmer_outcfg, **mod_outcfg, "annotation_file": annotation_file } # dump output config to YAML file for debugging/logging write_config_file(prefix + ".align_standard.outcfg", outcfg) # return results of protocol return outcfg
def genome_distance(**kwargs): """ Protocol: Concatenate alignments based on genomic distance Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required .. todo:: Explain meaning of parameters in detail. Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: .. todo:: this is the full list normally returned by alignment protocol, decide which ones to keep. Mandatory: * alignment_file * focus_sequence * focus_mode * segments * alignment_file * [raw_alignment_file] * statistics_file * target_sequence_file * sequence_file * [annotation_file] * frequencies_file * identities_file * [hittable_file] * focus_mode * focus_sequence * segments """ check_required(kwargs, [ "prefix", "first_raw_focus_alignment_file", "second_raw_focus_alignment_file", "first_focus_sequence", "second_focus_sequence", "first_focus_mode", "second_focus_mode", "first_segments", "second_segments", ]) prefix = kwargs["prefix"] # make sure input alignments verify_resources("Input alignment does not exist", kwargs["first_alignment_file"], kwargs["second_alignment_file"]) # make sure output directory exists create_prefix_folders(prefix) # ------------------------------------------------- # TODO: implement concatenation functionality and # postprocessing functionality here # ------------------------------------------------- def _modify_segments(seg_list, seg_prefix): # extract segments from list representation into objects segs = [Segment.from_list(s) for s in seg_list] # update segment IDs for i, s in enumerate(segs, start=1): s.segment_id = "{}_{}".format(seg_prefix, i) return segs # merge segments - this allows to have more than one segment per # "monomer" alignment segments_1 = _modify_segments(kwargs["first_segments"], "A") segments_2 = _modify_segments(kwargs["second_segments"], "B") segments_complex = segments_1 + segments_2 # make sure we return all the necessary information: # * alignment_file: final concatenated alignment that will go into plmc # * focus_sequence: this is the identifier of the concatenated target # sequence which will be passed into plmc with -f outcfg = { "alignment_file": None, # TODO: specify "focus_mode": True, "focus_sequence": None, # TODO: specify "segments": [s.to_list() for s in segments_complex], # optional but good to have: "num_sites": None, "num_sequences": None, # "effective_sequences": n_eff # TODO: could compute this like in align stage # TODO: there are more outputs that we could add here (not mandatory), # e.g. single column frequencies in concatenated alignment } return outcfg
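# Illustration of the segment renaming above (values hypothetical; the list layout
# mirrors the Segment(...).to_list() construction used in jackhmmer_search below):
first_segments = [["aa", "EXAMPLE1_HUMAN", 1, 166, list(range(1, 167))]]
second_segments = [["aa", "EXAMPLE2_HUMAN", 56, 131, list(range(56, 132))]]
# after _modify_segments, the two segments receive the IDs "A_1" and "B_1"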
def existing(**kwargs): """ Protocol: Use external sequence alignment and extract all relevant information from there (e.g. sequence, region, etc.), then apply gap & fragment filtering as usual Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * sequence_id (passed through from input) * alignment_file * raw_focus_alignment_file * statistics_file * sequence_file * first_index * target_sequence_file * annotation_file (None) * frequencies_file * identities_file * focus_mode * focus_sequence * segments """ check_required(kwargs, [ "prefix", "input_alignment", "sequence_id", "first_index", "extract_annotation" ]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # this file is starting point of pipeline; # check if input alignment actually exists input_alignment = kwargs["input_alignment"] verify_resources("Input alignment does not exist", input_alignment) # first try to autodetect format of alignment with open(input_alignment) as f: format = detect_format(f) if format is None: raise InvalidParameterError( "Format of input alignment {} could not be " "automatically detected.".format(input_alignment)) with open(input_alignment) as f: ali_raw = Alignment.from_file(f, format) # save annotation in sequence headers (species etc.) annotation_file = None if kwargs["extract_annotation"]: annotation_file = prefix + "_annotation.csv" from_anno_line = (format == "stockholm") annotation = extract_header_annotation(ali_raw, from_annotation=from_anno_line) annotation.to_csv(annotation_file, index=False) # Target sequence of alignment sequence_id = kwargs["sequence_id"] if sequence_id is None: raise InvalidParameterError("Parameter sequence_id must be defined") # First, find focus sequence in alignment focus_index = None for i, id_ in enumerate(ali_raw.ids): if id_.startswith(sequence_id): focus_index = i break # if we didn't find it, cannot continue if focus_index is None: raise InvalidParameterError( "Target sequence {} could not be found in alignment".format( sequence_id)) # identify what columns (non-gap) to keep for focus focus_seq = ali_raw[focus_index] focus_cols = np.array([ c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq ]) # extract focus alignment focus_ali = ali_raw.select(columns=focus_cols) focus_seq_nogap = "".join(focus_ali[focus_index]) # determine region of sequence. 
If first_index is given, # use that in any case, otherwise try to autodetect full_focus_header = ali_raw.ids[focus_index] focus_id = full_focus_header.split()[0] # try to extract region from sequence header id_, region_start, region_end = parse_header(focus_id) # override with first_index if given if kwargs["first_index"] is not None: region_start = kwargs["first_index"] region_end = region_start + len(focus_seq_nogap) - 1 if region_start is None or region_end is None: raise InvalidParameterError( "Could not extract region information " + "from sequence header {} ".format(full_focus_header) + "and first_index parameter is not given.") # resubstitute full sequence ID from identifier # and region information header = "{}/{}-{}".format(id_, region_start, region_end) focus_ali.ids[focus_index] = header # write target sequence to file target_sequence_file = prefix + ".fa" with open(target_sequence_file, "w") as f: write_fasta([(header, focus_seq_nogap)], f) # apply sequence identity and fragment filters, # and gap threshold mod_outcfg, ali = modify_alignment(focus_ali, focus_index, id_, region_start, **kwargs) # generate output configuration of protocol outcfg = { **mod_outcfg, "sequence_id": sequence_id, "sequence_file": target_sequence_file, "first_index": region_start, "target_sequence_file": target_sequence_file, "focus_sequence": header, "focus_mode": True, } if annotation_file is not None: outcfg["annotation_file"] = annotation_file # dump config to YAML file for debugging/logging write_config_file(prefix + ".align_existing.outcfg", outcfg) # return results of protocol return outcfg
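# Hedged sketch of running the 'existing' alignment protocol on a precomputed
# alignment (paths and IDs hypothetical); the gap/fragment-filter parameters
# consumed by modify_alignment are omitted here but needed in a real run.
outcfg = existing(
    prefix="output/align/EXAMPLE",
    input_alignment="input/EXAMPLE.a2m",
    sequence_id="EXAMPLE_HUMAN",
    first_index=1,
    extract_annotation=True,
    # ... additional alignment filtering parameters ...
)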
def run_plmc(alignment, couplings_file, param_file=None, focus_seq=None, alphabet=None, theta=None, scale=None, ignore_gaps=False, iterations=None, lambda_h=None, lambda_J=None, lambda_g=None, cpu=None, binary="plmc"): """ Run plmc on sequence alignment and store files with model parameters and pair couplings. Parameters ---------- alignment : str Path to input sequence alignment couplings_file : str Output path for file with evolutionary couplings (folder will be created) param_file : str Output path for binary file containing model parameters (folder will be created) focus_seq : str, optional (default: None) Name of focus sequence, if None, non-focus mode will be used alphabet : str, optional (default: None) Alphabet for model inference. If None, standard amino acid alphabet including gap will be used. First character in string corresponds to gap character (relevant for ignore_gaps). theta : float, optional (default: None) Sequences with pairwise identity >= theta will be clustered and their sequence weights downweighted as 1 / num_cluster_members. Important: Note that plmc will be parametrized using 1 - theta. If None, default value in plmc will be used, which corresponds to theta=0.8 (plmc setting 0.2). scale : float, optional (default: None) Scale weights of clusters by this value. If None, default value in plmc (1.0) will be used ignore_gaps : bool, optional (default: False) Exclude gaps from parameter inference. Gap character is first character of alphabet parameter. iterations : int, optional (default: None) Maximum iterations for optimization. lambda_h : float, optional (default: None) l2 regularization strength on fields. If None, plmc default will be used. lambda_J : float, optional (default: None) l2-regularization strength on couplings. If None, plmc default will be used lambda_g : float, optional (default: None) group l1-regularization strength on couplings If None, plmc default will be used. cpu : Number of cores to use for running plmc. Note that plmc has to be compiled in openmp mode to runnable with multiple cores. Can also be set to "max". binary : str, optional (default: "plmc") Path to plmc binary Returns ------- PlmcResult namedtuple containing output files and parsed fields from console output of plmc Raises ------ ExternalToolError """ create_prefix_folders(couplings_file) # Make sure input alignment exists verify_resources( "Alignment file does not exist", alignment ) cmd = [ binary, "-c", couplings_file, ] # store eij file if explicitly requested if param_file is not None: create_prefix_folders(param_file) cmd += ["-o", param_file] # focus sequence mode and ID if focus_seq is not None: # TODO: for now split exclude sequence # region from focus seq name, otherwise # plmc does not remap names. If this # behaviour changes in plmc, remove the # following line. focus_seq = focus_seq.split("/")[0] cmd += ["-f", focus_seq] # exclude gaps from calculation? 
if ignore_gaps: cmd += ["-g"] # maximum number of iterations, can also be "max" if iterations is not None: cmd += ["-m", str(iterations)] # set custom alphabet # (first character is gap by default in nogap mode) if alphabet is not None: cmd += ["-a", alphabet] # sequence reweighting if theta is not None: # transform into plmc convention (1-theta) theta = 1.0 - theta cmd += ["-t", str(theta)] # cluster weight if scale is not None: cmd += ["-s", str(scale)] # L2 regularization weight for fields if lambda_h is not None: cmd += ["-lh", str(lambda_h)] # L2 regularization weight for pair couplings if lambda_J is not None: cmd += ["-le", str(lambda_J)] # Group L1 regularization weight for pair couplings if lambda_g is not None: cmd += ["-lg", str(lambda_g)] # Number of cores to use for calculation if cpu is not None: cmd += ["-n", str(cpu)] # finally also add input alignment (main parameter) cmd += [alignment] # TODO: for now do not check returncode because sometimes # returncode == -11 (segfault) despite successful calculation return_code, stdout, stderr = run(cmd, check_returncode=False) # TODO: remove this segfault-hunting output once fixed if return_code != 0: # if not a segfault, still raise exception if return_code != -11: from evcouplings.utils.system import ExternalToolError raise ExternalToolError( "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format( cmd, return_code, stdout, stderr ) ) print("PLMC NON-ZERO RETURNCODE:", return_code) print(cmd) print(" ".join(cmd)) print("stdout:", stdout) print("stderr:", stderr) iter_df, out_fields = parse_plmc_log(stderr) # also check we actually calculated couplings... if not valid_file(couplings_file): raise ResourceError( "plmc returned no couplings: stdout={} stderr={} file={}".format( stdout, stderr, couplings_file ) ) # ... and parameter file, if requested if param_file and not valid_file(param_file): raise ResourceError( "plmc returned no parameter file: stdout={} stderr={} file={}".format( stdout, stderr, param_file ) ) return PlmcResult( couplings_file, param_file, iter_df, *out_fields )
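# Hedged usage sketch of run_plmc (paths hypothetical); note that theta is given
# in the convention documented above, i.e. plmc itself is invoked with 1 - theta.
result = run_plmc(
    "output/align/EXAMPLE.a2m",
    "output/couplings/EXAMPLE_ECs.txt",
    param_file="output/couplings/EXAMPLE.model",
    focus_seq="EXAMPLE_HUMAN/1-166",
    theta=0.8,
    lambda_h=0.01,
    lambda_J=16.0,
    lambda_g=0.0,
    cpu=2,
    binary="plmc",
)
# result is the PlmcResult namedtuple described in the docstring above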
def jackhmmer_search(**kwargs): """ Protocol: Iterative jackhmmer search against a sequence database. Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required .. todo:: explain meaning of parameters in detail. Returns ------- outcfg : dict Output configuration of the protocol, including the following fields: * sequence_id (passed through from input) * first_index (passed through from input) * target_sequence_file * sequence_file * raw_alignment_file * hittable_file * focus_mode * focus_sequence * segments """ check_required(kwargs, [ "prefix", "sequence_id", "sequence_file", "sequence_download_url", "region", "first_index", "use_bitscores", "domain_threshold", "sequence_threshold", "database", "iterations", "cpu", "nobias", "reuse_alignment", "checkpoints_hmm", "checkpoints_ali", "jackhmmer", "extract_annotation" ]) prefix = kwargs["prefix"] # make sure output directory exists create_prefix_folders(prefix) # store search sequence file here target_sequence_file = prefix + ".fa" full_sequence_file = prefix + "_full.fa" # make sure search sequence is defined and load it full_seq_file, (full_seq_id, full_seq) = fetch_sequence( kwargs["sequence_id"], kwargs["sequence_file"], kwargs["sequence_download_url"], full_sequence_file) # cut sequence to target region and save in sequence_file # (this is the main sequence file used downstream) (region_start, region_end), cut_seq = cut_sequence(full_seq, kwargs["sequence_id"], kwargs["region"], kwargs["first_index"], target_sequence_file) # run jackhmmer... allow to reuse pre-exisiting # Stockholm alignment file here ali_outcfg_file = prefix + ".align_jackhmmer_search.outcfg" # determine if to rerun, only possible if previous results # were stored in ali_outcfg_file if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file): ali = read_config_file(ali_outcfg_file) # check if the alignment file itself is also there verify_resources( "Tried to reuse alignment, but empty or " "does not exist", ali["alignment"], ali["domtblout"]) else: # otherwise, we have to run the alignment # modify search thresholds to be suitable for jackhmmer seq_threshold, domain_threshold = search_thresholds( kwargs["use_bitscores"], kwargs["sequence_threshold"], kwargs["domain_threshold"], len(cut_seq)) # run search process ali = at.run_jackhmmer( query=target_sequence_file, database=kwargs[kwargs["database"]], prefix=prefix, use_bitscores=kwargs["use_bitscores"], domain_threshold=domain_threshold, seq_threshold=seq_threshold, iterations=kwargs["iterations"], nobias=kwargs["nobias"], cpu=kwargs["cpu"], checkpoints_hmm=kwargs["checkpoints_hmm"], checkpoints_ali=kwargs["checkpoints_ali"], binary=kwargs["jackhmmer"], ) # get rid of huge stdout log file immediately # (do not use /dev/null option of jackhmmer function # to make no assumption about operating system) try: os.remove(ali.output) except OSError: pass # turn namedtuple into dictionary to make # restarting code nicer ali = dict(ali._asdict()) # save results of search for possible restart write_config_file(ali_outcfg_file, ali) # prepare output dictionary with result files outcfg = { "sequence_id": kwargs["sequence_id"], "target_sequence_file": target_sequence_file, "sequence_file": full_sequence_file, "first_index": kwargs["first_index"], "focus_mode": True, "raw_alignment_file": ali["alignment"], "hittable_file": ali["domtblout"], } # define a single protein segment based on target sequence outcfg["segments"] = [ Segment("aa", kwargs["sequence_id"], region_start, region_end, 
range(region_start, region_end + 1)).to_list() ] outcfg["focus_sequence"] = "{}/{}-{}".format(kwargs["sequence_id"], region_start, region_end) return outcfg
def cns_dgsa_fold(residues, ec_pairs, prefix, config_file=None, secstruct_column="sec_struct_3state", num_structures=20, min_cycles=5, log_level=None, binary="cns"): """ Predict 3D structure coordinates using distance geometry and simulated annealing-based folding protocol Parameters ---------- residues : pandas.DataFrame Table containing positions (column i), residue type (column A_i), and secondary structure for each position ec_pairs : pandas.DataFrame Table with EC pairs that will be turned into distance restraints (with columns i, j, A_i, A_j) prefix : str Prefix for output files (can include directories). Folders will be created automatically. config_file : str, optional (default: None) Path to config file with folding settings. If None, will use default settings included in package (restraints.yml) secstruct_column : str, optional (default: sec_struct_3state) Column name in residues dataframe from which secondary structure will be extracted (has to be H, E, or C). num_structures : int, optional (default: 20) Number of trial structures to generate min_cycles : int, optional (default: 5) Number of minimization cycles at end of protocol log_level : {None, "quiet", "verbose"}, optional (default: None) Don't keep CNS log files, or switch to different degrees of verbosity ("verbose" needed to obtain violation information) binary : str, optional (default: "cns") Path of CNS binary Returns ------- final_models : dict Mapping from model name to path of model """ def _run_inp(inp_str, output_prefix): with open(output_prefix + ".inp", "w") as f: f.write(inp_str) if log_level is not None: log_file = output_prefix + ".log" else: log_file = None run_cns(inp_str, log_file=log_file, binary=binary) # make sure output directory exists create_prefix_folders(prefix) # CNS doesn't like paths above a certain length, so we # will change into working directory and keep paths short. # For this reason, extract path and filename prefix dir_, rootname = path.split(prefix) cwd = os.getcwd() if dir_ != "": os.chdir(dir_) # create restraints (EC pairs and secondary structure-based) ec_tbl = rootname + "_couplings.tbl" ss_dist_tbl = rootname + "_ss_distance.tbl" ss_angle_tbl = rootname + "_ss_angle.tbl" ec_dist_restraints(ec_pairs, ec_tbl, cns_dist_restraint, config_file) secstruct_dist_restraints(residues, ss_dist_tbl, cns_dist_restraint, config_file, secstruct_column) secstruct_angle_restraints(residues, ss_angle_tbl, cns_dihedral_restraint, config_file, secstruct_column) # create sequence file seq = "".join(residues.A_i) seq_file = rootname + ".seq" cns_seq_file(seq, seq_file) # set up input files for folding # make molecular topology file (will be written to mtf_file) mtf_file = rootname + ".mtf" _run_inp( cns_mtf_inp(seq_file, mtf_file, first_index=residues.i.min(), disulfide_bridges=None), mtf_file) # make extended PDB file (will be in extended_file) extended_file = rootname + "_extended.pdb" _run_inp(cns_extended_inp(mtf_file, extended_file), extended_file) # fold using dg_sa protocol (filenames will have suffixes _1, _2, ...) # have to pass either quiet or verbose to CNS (but will not store # log file if log_level is None). 
if log_level is None: dgsa_log_level = "quiet" else: dgsa_log_level = log_level _run_inp( cns_dgsa_inp(extended_file, mtf_file, rootname, ec_tbl, ss_dist_tbl, ss_angle_tbl, num_structures=num_structures, log_level=dgsa_log_level), rootname + "_dgsa") # add hydrogen atoms and minimize (for all # generated candidate structures from dg_sa) # keep track of final predicted structures final_models = {} for i in range(1, num_structures + 1): input_root = "{}_{}".format(rootname, i) input_model = input_root + ".pdb" # check if we actually got the model from dg_sa if not valid_file(input_model): continue # run generate_easy protocol to add hydrogen atoms easy_pdb = input_root + "_h.pdb" easy_mtf = input_root + "_h.mtf" _run_inp(cns_generate_easy_inp(input_model, easy_pdb, easy_mtf), input_root + "_h") # then minimize min_pdb = input_root + "_hMIN.pdb" _run_inp( cns_minimize_inp(easy_pdb, easy_mtf, min_pdb, num_cycles=min_cycles), input_root + "_hMIN") if valid_file(min_pdb): final_models[min_pdb] = path.join(dir_, min_pdb) # change back into original directory os.chdir(cwd) return final_models
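# Hedged usage sketch (inputs hypothetical): residues and ec_pairs are DataFrames
# with the columns named in the docstring, and a working CNS binary is required.
final_models = cns_dgsa_fold(
    residues,                     # columns i, A_i, sec_struct_3state
    ec_pairs,                     # columns i, j, A_i, A_j
    "output/fold/EXAMPLE_70",     # hypothetical output prefix
    num_structures=10,
    log_level=None,
    binary="cns",
)
# maps model names such as "EXAMPLE_70_3_hMIN.pdb" to their paths in the fold directory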
def complex(**kwargs):
    """
    Protocol: Compare ECs for a complex to 3D structure

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including the following fields:

        * ec_compared_all_file
        * ec_compared_longrange_file
        * ec_compared_inter_file
        * first_pdb_structure_hits_file / second_pdb_structure_hits_file
        * first_distmap_monomer / second_distmap_monomer
        * first_distmap_multimer / second_distmap_multimer
        * distmap_inter
        * inter_contacts_file
        * contact_map_files
        * first_remapped_pdb_files / second_remapped_pdb_files
        * complex_remapped_pdb_files
    """
    check_required(kwargs, [
        "prefix", "ec_file", "min_sequence_distance",
        "pdb_mmtf_dir", "atom_filter",
        "first_compare_multimer", "second_compare_multimer",
        "distance_cutoff",
        "first_sequence_id", "second_sequence_id",
        "first_sequence_file", "second_sequence_file",
        "first_segments", "second_segments",
        "first_target_sequence_file", "second_target_sequence_file",
        "scale_sizes"
    ])

    prefix = kwargs["prefix"]

    outcfg = {
        # initialize output EC files
        "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv",
        "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv",
        "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv",

        # initialize output inter distance map files
        "distmap_inter": prefix + "_distmap_inter",
        "inter_contacts_file": prefix + "_inter_contacts_file"
    }

    # add PDB comparison files for first and second monomer
    for monomer_prefix in ["first", "second"]:
        outcfg = {
            **outcfg,
            monomer_prefix + "_pdb_structure_hits_file":
                "{}_{}_structure_hits.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_pdb_structure_hits_unfiltered_file":
                "{}_{}_structure_hits_unfiltered.csv".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_monomer":
                "{}_{}_distance_map_monomer".format(prefix, monomer_prefix),
            monomer_prefix + "_distmap_multimer":
                "{}_{}_distance_map_multimer".format(prefix, monomer_prefix),
        }

    # make sure EC file exists
    verify_resources("EC file does not exist", kwargs["ec_file"])

    # make sure output directory exists
    create_prefix_folders(prefix)

    # store auxiliary files here (too much detail for the average user)
    aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False)
    create_prefix_folders(aux_prefix)

    first_aux_prefix = insert_dir(aux_prefix, "first_monomer", rootname_subdir=False)
    create_prefix_folders(first_aux_prefix)

    second_aux_prefix = insert_dir(aux_prefix, "second_monomer", rootname_subdir=False)
    create_prefix_folders(second_aux_prefix)

    # Step 1: Identify 3D structures for comparison
    def _identify_monomer_structures(name_prefix, outcfg, aux_prefix):
        # create a dictionary with kwargs for just the current monomer;
        # drop the "prefix" kwargs so we can replace them with the aux
        # prefix when calling _identify_structures.
        # Only the first occurrence of name_prefix is replaced.
        monomer_kwargs = {
            k.replace(name_prefix + "_", "", 1): v
            for k, v in kwargs.items() if "prefix" not in k
        }

        # these fields need to be set explicitly, otherwise they get
        # overwritten by the concatenated alignment file
        monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"]
        monomer_kwargs["raw_focus_alignment_file"] = kwargs[
            name_prefix + "_raw_focus_alignment_file"
        ]

        # identify structures for that monomer
        sifts_map, sifts_map_full = _identify_structures(
            **monomer_kwargs, prefix=aux_prefix
        )

        # save selected PDB hits
        sifts_map.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_file"], index=False
        )

        # also save full list of hits
        sifts_map_full.hits.to_csv(
            outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False
        )

        return outcfg, sifts_map

    outcfg, first_sifts_map = _identify_monomer_structures(
        "first", outcfg, first_aux_prefix
    )
    outcfg, second_sifts_map = _identify_monomer_structures(
        "second", outcfg, second_aux_prefix
    )

    # get the segment names from the kwargs;
    # make sure the user provided exactly two segments
    segment_list = kwargs["segments"]
    if len(segment_list) != 2:
        raise InvalidParameterError(
            "Compare stage for protein complexes requires exactly two segments"
        )

    first_segment_name = segment_list[0][0]
    second_segment_name = segment_list[1][0]

    # Step 2: Compute distance maps
    def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name):
        # prepare a sequence map to remap the structures we have found
        verify_resources(
            "Target sequence file does not exist",
            kwargs[name_prefix + "_target_sequence_file"]
        )

        # create target sequence map for remapping structure
        with open(kwargs[name_prefix + "_target_sequence_file"]) as f:
            header, seq = next(read_fasta(f))

        seq_id, seq_start, seq_end = parse_header(header)
        seqmap = dict(zip(range(seq_start, seq_end + 1), seq))

        # compute distance maps and save
        # (but only if we found at least one structure)
        if len(sifts_map.hits) > 0:
            d_intra = intra_dists(
                sifts_map, structures, atom_filter=kwargs["atom_filter"],
                output_prefix=aux_prefix + "_" + name_prefix + "_distmap_intra"
            )
            d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"])

            # save contacts to separate file
            outcfg[name_prefix + "_monomer_contacts_file"] = \
                prefix + "_" + name_prefix + "_contacts_monomer.csv"
            d_intra.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg[name_prefix + "_monomer_contacts_file"], index=False
            )

            # compute multimer distances, if requested;
            # note that d_multimer can be None if there
            # are no structures with multiple chains
            if kwargs[name_prefix + "_compare_multimer"]:
                d_multimer = multimer_dists(
                    sifts_map, structures, atom_filter=kwargs["atom_filter"],
                    output_prefix=aux_prefix + "_" + name_prefix + "_distmap_multimer"
                )
            else:
                d_multimer = None

            # if we have a multimer distance map, save it
            if d_multimer is not None:
                d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"])
                outcfg[name_prefix + "_multimer_contacts_file"] = \
                    prefix + "_" + name_prefix + "_contacts_multimer.csv"

                # save contacts to separate file
                d_multimer.contacts(kwargs["distance_cutoff"]).to_csv(
                    outcfg[name_prefix + "_multimer_contacts_file"], index=False
                )
            else:
                outcfg[name_prefix + "_distmap_multimer"] = None

            # create remapped structures (e.g. for later comparison of
            # folding results); swap mapping index and filename in the
            # dictionary so we have the list of files in the dict keys
            outcfg[name_prefix + "_remapped_pdb_files"] = {
                filename: mapping_index
                for mapping_index, filename in remap_chains(
                    sifts_map, aux_prefix, seqmap, chain_name=chain_name,
                    raise_missing=kwargs["raise_missing"]
                ).items()
            }
        else:
            # if no structures, cannot compute distance maps
            d_intra = None
            d_multimer = None
            outcfg[name_prefix + "_distmap_monomer"] = None
            outcfg[name_prefix + "_distmap_multimer"] = None
            outcfg[name_prefix + "_remapped_pdb_files"] = None

        return d_intra, d_multimer, seqmap

    # load all structures for both monomers
    all_structures = set(first_sifts_map.hits.pdb_id).union(
        set(second_sifts_map.hits.pdb_id)
    )
    structures = load_structures(
        all_structures, kwargs["pdb_mmtf_dir"], raise_missing=False
    )

    d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps(
        first_sifts_map, "first", "A"
    )
    d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps(
        second_sifts_map, "second", "B"
    )

    # compute inter distance map if the sifts map for each monomer exists
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        d_inter = inter_dists(
            first_sifts_map, second_sifts_map,
            raise_missing=kwargs["raise_missing"]
        )
        # if there were overlapping PDBs, save the results
        if d_inter is not None:
            d_inter.to_file(outcfg["distmap_inter"])

            # save contacts to separate file
            d_inter.contacts(kwargs["distance_cutoff"]).to_csv(
                outcfg["inter_contacts_file"], index=False
            )
    else:
        outcfg["inter_contacts_file"] = None
        d_inter = None

    # Step 3: Compare ECs to distance maps
    ec_table = pd.read_csv(kwargs["ec_file"])

    for out_file, min_seq_dist in [
        ("ec_compared_longrange_file", kwargs["min_sequence_distance"]),
        ("ec_compared_all_file", 0),
    ]:
        # compare ECs only if we have an intra distance map for at least
        # one monomer - inter cannot exist unless we have both monomers
        if (d_intra_i is not None) or (d_intra_j is not None):
            # compare distances individually for each segment pair
            ecs_intra_i = ec_table.query(
                "segment_i == segment_j == @first_segment_name"
            )
            if d_intra_i is not None:
                ecs_intra_i_compared = coupling_scores_compared(
                    ecs_intra_i, d_intra_i, d_multimer_i,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist
                )
            else:
                # if there is no distance map, the distance is saved as np.nan
                ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan)

            ecs_intra_j = ec_table.query(
                "segment_i == segment_j == @second_segment_name"
            )
            if d_intra_j is not None:
                ecs_intra_j_compared = coupling_scores_compared(
                    ecs_intra_j, d_intra_j, d_multimer_j,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    min_sequence_dist=min_seq_dist
                )
            else:
                ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan)

            ecs_inter = ec_table.query("segment_i != segment_j")
            if d_inter is not None:
                ecs_inter_compared = coupling_scores_compared(
                    ecs_inter, d_inter, dist_map_multimer=None,
                    dist_cutoff=kwargs["distance_cutoff"],
                    output_file=None,
                    # minimum sequence distance does not apply to inter-protein ECs
                    min_sequence_dist=None
                )
            else:
                ecs_inter_compared = ecs_inter.assign(dist=np.nan)

            # combine the tables
            ec_table_compared = pd.concat([
                ecs_inter_compared, ecs_intra_i_compared, ecs_intra_j_compared
            ])

            # rename the precision column to "segmentwise_precision"
            # because precision was calculated for each segment independently
            ec_table_compared = ec_table_compared.rename(
                columns={"precision": "segmentwise_precision"}
            )
            # TODO: change "cn" to "score" eventually
            ec_table_compared = ec_table_compared.sort_values("cn", ascending=False)

            # add the total precision
            # TODO: implement different cutoffs for intra vs inter contacts
            ec_table_compared = add_precision(
                ec_table_compared, dist_cutoff=kwargs["distance_cutoff"]
            )

            # save all compared ECs to file
            ec_table_compared.to_csv(outcfg[out_file])

            # save the inter ECs to a separate file
            ecs_inter_compared.to_csv(outcfg["ec_compared_inter_file"])

    # create the inter-EC line drawing script
    if outcfg["ec_compared_inter_file"] is not None and \
            kwargs["plot_highest_count"] is not None:
        inter_ecs = ec_table.query("segment_i != segment_j")
        outcfg["ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml"
        pairs.ec_lines_pymol_script(
            inter_ecs.iloc[:kwargs["plot_highest_count"], :],
            outcfg["ec_lines_compared_pml_file"],
            distance_cutoff=kwargs["distance_cutoff"],
            chain={
                first_segment_name: "A",
                second_segment_name: "B"
            }
        )

    # remap the complex crystal structures, if available
    if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0:
        outcfg["complex_remapped_pdb_files"] = {
            filename: mapping_index
            for mapping_index, filename in remap_complex_chains(
                first_sifts_map, second_sifts_map, seqmap_i, seqmap_j,
                output_prefix=aux_prefix,
                raise_missing=kwargs["raise_missing"]
            ).items()
        }

    # Step 4: Make contact map plots
    # if no structures are available, this defaults to an EC-only plot
    outcfg["contact_map_files"] = _make_complex_contact_maps(
        ec_table, d_intra_i, d_multimer_i,
        d_intra_j, d_multimer_j,
        d_inter, first_segment_name, second_segment_name,
        **kwargs
    )

    return outcfg
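# Hedged usage sketch (illustrative only, not part of the protocol): once the
# complex protocol has run, the comparison tables written above can be
# inspected directly. The column names "cn" and "dist" are those produced by
# coupling_scores_compared in the code above; outcfg is the dict returned by
# complex(), and distance_cutoff mirrors kwargs["distance_cutoff"].
def _example_inspect_inter_ecs(outcfg, distance_cutoff):
    import pandas as pd

    inter_ecs = pd.read_csv(outcfg["ec_compared_inter_file"])

    # fraction of the 20 highest-scoring inter ECs within the distance cutoff
    top = inter_ecs.sort_values("cn", ascending=False).head(20)
    return (top["dist"] <= distance_cutoff).mean()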
def multimer_dists(sifts_result, structures=None, atom_filter=None,
                   intersect=False, output_prefix=None, model=0,
                   raise_missing=True):
    """
    Compute homomultimer distances (between repeated copies of the
    same entity) in PDB file.

    The resulting distance matrix is symmetric: it is obtained by
    minimizing over the upper and lower triangle of the matrix, even
    if the complex structure itself is not symmetric.

    Parameters
    ----------
    sifts_result : SIFTSResult
        Input structures and mapping to use for distance map calculation
    structures : str or dict, optional (default: None)
        * If str: Load structures from directory this string points to.
          Missing structures will be fetched from web.
        * If dict: dictionary with lower-case PDB ids as keys and PDB objects
          as values. This dictionary has to contain all necessary structures,
          missing ones will not be fetched. This dictionary can be created
          using pdb.load_structures.
    atom_filter : str, optional (default: None)
        Filter coordinates to contain only these atoms. E.g. set to "CA" to
        compute C_alpha - C_alpha distances instead of minimum atom distance
        over all atoms in both residues.
    intersect : bool, optional (default: False)
        If True, intersect indices of the given distance maps. Otherwise,
        union of indices will be used.
    output_prefix : str, optional (default: None)
        If given, save individual and final contact maps to files prefixed
        with this string. The appended file suffixes map to row indices in
        sifts_result.hits
    model : int, optional (default: 0)
        Index of model in PDB structure that should be used
    raise_missing : bool, optional (default: True)
        Raise a ResourceError if any of the input structures can not be
        loaded; otherwise, ignore missing entries.

    Returns
    -------
    DistanceMap
        Computed aggregated distance map across all input structures

    Raises
    ------
    ValueError
        If sifts_result is empty (no structure hits)
    ResourceError
        If any structure could not be loaded and raise_missing is True
    """
    if len(sifts_result.hits) == 0:
        raise ValueError(
            "sifts_result is empty (no structure hits, but at least one required)"
        )

    # if no structures were given, or only a path to files, load them first
    structures = _prepare_structures(
        structures, sifts_result.hits.pdb_id, raise_missing
    )

    # aggregated distance map
    agg_distmap = None

    # create output folder if necessary
    if output_prefix is not None:
        create_prefix_folders(output_prefix)

    # go through each structure
    for pdb_id, grp in sifts_result.hits.reset_index().groupby("pdb_id"):
        # skip missing structures
        if not raise_missing and pdb_id not in structures:
            continue

        # extract all chains for this structure
        chains = [
            (
                r["index"],
                _prepare_chain(
                    structures, r["pdb_id"], r["pdb_chain"], atom_filter,
                    sifts_result.mapping[r["mapping_index"]], model
                )
            )
            for i, r in grp.iterrows()
        ]

        # compare all possible pairs of chains
        for (index_i, ch_i), (index_j, ch_j) in combinations(chains, 2):
            # skip empty chains (e.g. residues lost during remapping)
            if len(ch_i.residues) == 0 or len(ch_j.residues) == 0:
                continue

            distmap = DistanceMap.from_coords(ch_i, ch_j)

            # symmetrize the matrix (for ECs we are only interested in
            # whether a pair is close in some chain combination)
            distmap_sym = DistanceMap.aggregate(
                distmap, distmap.transpose(), intersect=intersect
            )
            distmap_sym.symmetric = True

            # save individual distance map
            if output_prefix is not None:
                distmap_sym.to_file(
                    "{}_{}_{}".format(output_prefix, index_i, index_j)
                )

            # aggregate with other chain combinations
            if agg_distmap is None:
                agg_distmap = distmap_sym
            else:
                agg_distmap = DistanceMap.aggregate(
                    agg_distmap, distmap_sym, intersect=intersect
                )

    return agg_distmap
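# Hedged usage sketch (illustrative only): compute a homomultimer distance map
# for a previously obtained SIFTSResult and write contacts below 5 Å. The
# "sifts_map" argument and the file/directory names are assumptions standing
# in for objects produced by earlier pipeline steps.
def _example_multimer_contacts(sifts_map):
    d_multimer = multimer_dists(
        sifts_map,
        structures="structures/",  # directory of structures; missing ones are fetched
        atom_filter=None,          # minimum distance over all atoms per residue pair
        output_prefix="aux/multimer"
    )

    # d_multimer is None if no structure in sifts_map has more than one chain
    if d_multimer is not None:
        d_multimer.contacts(5.0).to_csv("multimer_contacts.csv", index=False)

    return d_multimer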