def modify_alignment(focus_ali, target_seq_index, target_seq_id, region_start, **kwargs):
    """
    Apply pairwise identity filtering, fragment filtering, and exclusion
    of columns with too many gaps to a sequence alignment. Also generates
    files describing properties of the alignment such as frequency
    distributions, conservation, and "old-style" alignment statistics files.

    .. note::

        assumes focus alignment (otherwise unprocessed) as input.

    .. todo::

        come up with something more clever to filter fragments than
        fixed width (e.g. use 95% quantile of length distribution as
        reference point)

    Parameters
    ----------
    focus_ali : Alignment
        Focus-mode input alignment
    target_seq_index : int
        Index of target sequence in alignment
    target_seq_id : str
        Identifier of target sequence (without range)
    region_start : int
        Index of first sequence position in target sequence
    kwargs : See required arguments in source code

    Returns
    -------
    outcfg : Dict
        File products generated by the function:

        * alignment_file
        * statistics_file
        * frequencies_file
        * identities_file
        * raw_focus_alignment_file

    ali : Alignment
        Final processed alignment
    """
    check_required(
        kwargs,
        [
            "prefix", "seqid_filter", "hhfilter",
            "minimum_sequence_coverage", "minimum_column_coverage",
            "compute_num_effective_seqs", "theta",
        ]
    )

    prefix = kwargs["prefix"]

    create_prefix_folders(prefix)

    focus_fasta_file = prefix + "_raw_focus.fasta"

    outcfg = {
        "alignment_file": prefix + ".a2m",
        "statistics_file": prefix + "_alignment_statistics.csv",
        "frequencies_file": prefix + "_frequencies.csv",
        "identities_file": prefix + "_identities.csv",
        "raw_focus_alignment_file": focus_fasta_file,
    }

    # swap target sequence to first position if it is not
    # the first sequence in alignment;
    # this is particularly important for hhfilter run
    # because target sequence might otherwise be filtered out
    if target_seq_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = target_seq_index
        indices[target_seq_index] = 0
        target_seq_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    # apply pairwise identity filter (using hhfilter)
    if kwargs["seqid_filter"] is not None:
        filtered_file = prefix + "_filtered.a3m"

        at.run_hhfilter(
            focus_fasta_file, filtered_file,
            threshold=kwargs["seqid_filter"],
            columns="first", binary=kwargs["hhfilter"]
        )

        with open(filtered_file) as f:
            focus_ali = Alignment.from_file(f, "a3m")

        # final FASTA alignment before applying A2M format modifications
        filtered_fasta_file = prefix + "_raw_focus_filtered.fasta"
        with open(filtered_fasta_file, "w") as f:
            focus_ali.write(f, "fasta")

    ali = focus_ali

    # filter fragments
    # come up with something more clever here than fixed width
    # (e.g. use 95% quantile of length distribution as reference point)
    min_cov = kwargs["minimum_sequence_coverage"]
    if min_cov is not None:
        if isinstance(min_cov, int):
            min_cov /= 100

        keep_seqs = (1 - ali.count("-", axis="seq")) >= min_cov
        ali = ali.select(sequences=keep_seqs)

    # Calculate frequencies, conservation and identity to query
    # on final alignment (except for lowercase modification)
    # Note: running hhfilter might cause a loss of the target sequence
    # if it is not the first sequence in the file! To be sure that
    # nothing goes wrong, target_seq_index should always be 0.
    describe_seq_identities(
        ali, target_seq_index=target_seq_index
    ).to_csv(
        outcfg["identities_file"], float_format="%.3f", index=False
    )

    describe_frequencies(
        ali, region_start, target_seq_index=target_seq_index
    ).to_csv(
        outcfg["frequencies_file"], float_format="%.3f", index=False
    )

    coverage_stats = describe_coverage(
        ali, prefix, region_start, kwargs["minimum_column_coverage"]
    )

    # keep list of uppercase sequence positions in alignment
    pos_list = np.arange(region_start, region_start + ali.L, dtype="int32")

    # Make columns with too many gaps lowercase
    min_col_cov = kwargs["minimum_column_coverage"]
    if min_col_cov is not None:
        if isinstance(min_col_cov, int):
            min_col_cov /= 100

        lc_cols = ali.count(ali._match_gap, axis="pos") > 1 - min_col_cov
        ali = ali.lowercase_columns(lc_cols)

        # if we remove columns, we have to update list of positions
        pos_list = pos_list[~lc_cols]
    else:
        lc_cols = None

    # compute effective number of sequences
    # (this is intended for cases where coupling stage is
    # not run, but this number is wanted nonetheless)
    if kwargs["compute_num_effective_seqs"]:
        # make sure we only compute N_eff on the columns
        # that would be used for model inference, dispose
        # the rest
        if lc_cols is None:
            cut_ali = ali
        else:
            cut_ali = ali.select(columns=~lc_cols)

        # compute sequence weights
        cut_ali.set_weights(kwargs["theta"])

        # N_eff := sum of all sequence weights
        n_eff = float(cut_ali.weights.sum())

        # patch into coverage statistics (N_eff column)
        coverage_stats.loc[:, "N_eff"] = n_eff
    else:
        n_eff = None

    # save coverage statistics to file
    coverage_stats.to_csv(
        outcfg["statistics_file"], float_format="%.3f", index=False
    )

    # store description of final sequence alignment in outcfg
    # (note these parameters will be updated by couplings protocol)
    outcfg.update(
        {
            "num_sites": len(pos_list),
            "num_sequences": len(ali),
            "effective_sequences": n_eff,
            "region_start": region_start,
        }
    )

    # create segment in outcfg
    outcfg["segments"] = [
        Segment(
            "aa", target_seq_id, region_start,
            region_start + ali.L - 1, pos_list
        ).to_list()
    ]

    with open(outcfg["alignment_file"], "w") as f:
        ali.write(f, "fasta")

    return outcfg, ali
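
# Hedged usage sketch for modify_alignment (not part of the pipeline itself):
# it demonstrates the required kwargs checked via check_required above. All
# file paths and parameter values here are hypothetical placeholders.
def _example_modify_alignment_usage():
    from evcouplings.align import Alignment

    # load an unprocessed focus-mode alignment (hypothetical path)
    with open("example_raw_focus.fasta") as f:
        focus_ali = Alignment.from_file(f, "fasta")

    outcfg, ali = modify_alignment(
        focus_ali,
        target_seq_index=0,            # target assumed to be first sequence
        target_seq_id="QUERY_ID",      # identifier without range suffix
        region_start=1,                # numbering of first target position
        # required kwargs (see check_required above):
        prefix="output/example",
        seqid_filter=95,               # % identity threshold for hhfilter, or None
        hhfilter="/usr/bin/hhfilter",  # hypothetical binary location
        minimum_sequence_coverage=50,  # int values are interpreted as percent
        minimum_column_coverage=70,
        compute_num_effective_seqs=True,
        theta=0.8,
    )
    return outcfg, ali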
def _make_hmmsearch_raw_fasta(alignment_result, prefix):
    """
    HMMsearch results do not contain the query sequence,
    so we must construct a raw_fasta file with the query
    sequence as the first hit, to ensure proper numbering.
    The search result is filtered to only contain the columns with
    match states to the HMM, which has a one-to-one mapping to the
    query sequence.

    Parameters
    ----------
    alignment_result : dict
        Alignment result dictionary, output by run_hmmsearch
    prefix : str
        Prefix for file creation

    Returns
    -------
    str
        Path to raw focus alignment file
    """
    def _add_gaps_to_query(query_sequence_ali, ali):
        # get the index of columns that do not contain match states
        # (indicated by an x)
        gap_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x != "x"
        ]
        # get the index of columns that contain match states (indicated by an x)
        match_index = [
            i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x == "x"
        ]

        # ensure that the length of the match states
        # matches the length of the sequence
        if len(match_index) != query_sequence_ali.L:
            raise ValueError(
                "HMMsearch result {} does not have a one-to-one"
                " mapping to the query sequence columns".format(
                    alignment_result["raw_alignment_file"]
                )
            )

        gapped_query_sequence = ""
        seq = list(query_sequence_ali.matrix[0, :])

        # loop through every position in the HMMsearch hits
        for i in range(len(ali.annotation["GC"]["RF"])):
            # if that position should be a gap, add a gap
            if i in gap_index:
                gapped_query_sequence += "-"
            # if that position should be a letter, pop the next
            # letter in the query sequence
            else:
                gapped_query_sequence += seq.pop(0)

        new_sequence_ali = Alignment.from_dict(
            {query_sequence_ali.ids[0]: gapped_query_sequence}
        )
        return new_sequence_ali

    # open the sequence file
    with open(alignment_result["target_sequence_file"]) as a:
        query_sequence_ali = Alignment.from_file(a, format="fasta")

    # if the provided alignment is empty, just return the target sequence
    raw_focus_alignment_file = prefix + "_raw.fasta"
    if not valid_file(alignment_result["raw_alignment_file"]):
        # write the query sequence to a fasta file
        with open(raw_focus_alignment_file, "w") as of:
            query_sequence_ali.write(of)

        # return as an alignment object
        return raw_focus_alignment_file

    # else, open the HMM search result
    with open(alignment_result["raw_alignment_file"]) as a:
        ali = Alignment.from_file(a, format="stockholm")

    # make sure that the stockholm alignment contains the match annotation
    if not ("GC" in ali.annotation and "RF" in ali.annotation["GC"]):
        raise ValueError(
            "Stockholm alignment {} missing RF"
            " annotation of match states".format(
                alignment_result["raw_alignment_file"]
            )
        )

    # add insertions to the query sequence in order to preserve correct
    # numbering of match sequences
    gapped_sequence_ali = _add_gaps_to_query(query_sequence_ali, ali)

    # write a new alignment file with the query sequence as
    # the first entry
    with open(raw_focus_alignment_file, "w") as of:
        gapped_sequence_ali.write(of)
        ali.write(of)

    return raw_focus_alignment_file
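
# Hedged sketch of how _make_hmmsearch_raw_fasta is meant to be fed: the
# alignment_result dict must carry "target_sequence_file" (FASTA with the
# query) and "raw_alignment_file" (Stockholm output of hmmsearch carrying
# GC RF match-state annotation). Paths below are hypothetical placeholders.
def _example_hmmsearch_raw_fasta_usage():
    alignment_result = {
        "target_sequence_file": "example_query.fa",
        "raw_alignment_file": "example_hmmsearch.sto",
    }
    raw_focus_alignment_file = _make_hmmsearch_raw_fasta(
        alignment_result, prefix="output/example"
    )
    return raw_focus_alignment_file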
def existing(**kwargs):
    """
    Protocol:

    Use external sequence alignment and extract all relevant
    information from there (e.g. sequence, region, etc.),
    then apply gap & fragment filtering as usual

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * sequence_file
        * first_index
        * target_sequence_file
        * annotation_file (None)
        * frequencies_file
        * identities_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(
        kwargs,
        [
            "prefix", "input_alignment",
            "sequence_id", "first_index",
            "extract_annotation"
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # this file is starting point of pipeline;
    # check if input alignment actually exists
    input_alignment = kwargs["input_alignment"]
    verify_resources("Input alignment does not exist", input_alignment)

    # first try to autodetect format of alignment
    with open(input_alignment) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment)
            )

    with open(input_alignment) as f:
        ali_raw = Alignment.from_file(f, format)

    # save annotation in sequence headers (species etc.)
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        from_anno_line = (format == "stockholm")
        annotation = extract_header_annotation(
            ali_raw, from_annotation=from_anno_line
        )
        annotation.to_csv(annotation_file, index=False)

    # Target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # First, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id
            )
        )

    # identify what columns (non-gap) to keep for focus
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array(
        [c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq]
    )

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given."
        )

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)
    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # apply sequence identity and fragment filters,
    # and gap threshold
    mod_outcfg, ali = modify_alignment(
        focus_ali, focus_index, id_, region_start, **kwargs
    )

    # generate output configuration of protocol
    outcfg = {
        **mod_outcfg,
        "sequence_id": sequence_id,
        "sequence_file": target_sequence_file,
        "first_index": region_start,
        "target_sequence_file": target_sequence_file,
        "focus_sequence": header,
        "focus_mode": True,
    }

    if annotation_file is not None:
        outcfg["annotation_file"] = annotation_file

    # dump config to YAML file for debugging/logging
    write_config_file(prefix + ".align_existing.outcfg", outcfg)

    # return results of protocol
    return outcfg
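
# Hedged usage sketch for the existing() protocol: the required keys mirror
# the check_required call above, plus the kwargs consumed downstream by
# modify_alignment. All values are hypothetical placeholders.
def _example_existing_protocol_usage():
    outcfg = existing(
        prefix="output/example",
        input_alignment="example_input.sto",  # format is autodetected
        sequence_id="QUERY_ID",               # must match a header prefix
        first_index=1,                        # or None to parse range from header
        extract_annotation=True,
        # forwarded to modify_alignment:
        seqid_filter=None,
        hhfilter=None,
        minimum_sequence_coverage=50,
        minimum_column_coverage=70,
        compute_num_effective_seqs=False,
        theta=0.8,
    )
    return outcfg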
def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs):
    # this file is starting point of pipeline;
    # check if input alignment actually exists
    verify_resources("Input alignment does not exist", input_alignment_file)

    # output files are stored under this prefix
    # (taken from kwargs, analogous to the other protocols)
    prefix = kwargs["prefix"]

    # first try to autodetect format of alignment
    with open(input_alignment_file) as f:
        format = detect_format(f)
        if format is None:
            raise InvalidParameterError(
                "Format of input alignment {} could not be "
                "automatically detected.".format(input_alignment_file)
            )

    with open(input_alignment_file) as f:
        ali_raw = Alignment.from_file(f, format)

    # Target sequence of alignment
    sequence_id = kwargs["sequence_id"]

    if sequence_id is None:
        raise InvalidParameterError("Parameter sequence_id must be defined")

    # First, find focus sequence in alignment
    focus_index = None
    for i, id_ in enumerate(ali_raw.ids):
        if id_.startswith(sequence_id):
            focus_index = i
            break

    # if we didn't find it, cannot continue
    if focus_index is None:
        raise InvalidParameterError(
            "Target sequence {} could not be found in alignment".format(
                sequence_id
            )
        )

    # identify what columns (non-gap) to keep for focus;
    # this should be all columns in the raw_focus_alignment_file
    # but checking anyway
    focus_seq = ali_raw[focus_index]
    focus_cols = np.array(
        [c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq]
    )

    # extract focus alignment
    focus_ali = ali_raw.select(columns=focus_cols)
    focus_seq_nogap = "".join(focus_ali[focus_index])

    # determine region of sequence. If first_index is given,
    # use that in any case, otherwise try to autodetect
    full_focus_header = ali_raw.ids[focus_index]
    focus_id = full_focus_header.split()[0]

    # try to extract region from sequence header
    id_, region_start, region_end = parse_header(focus_id)

    # override with first_index if given
    if kwargs["first_index"] is not None:
        region_start = kwargs["first_index"]
        region_end = region_start + len(focus_seq_nogap) - 1

    if region_start is None or region_end is None:
        raise InvalidParameterError(
            "Could not extract region information " +
            "from sequence header {} ".format(full_focus_header) +
            "and first_index parameter is not given."
        )

    # resubstitute full sequence ID from identifier
    # and region information
    header = "{}/{}-{}".format(id_, region_start, region_end)
    focus_ali.ids[focus_index] = header

    # write target sequence to file
    target_sequence_file = prefix + ".fa"
    with open(target_sequence_file, "w") as f:
        write_fasta([(header, focus_seq_nogap)], f)

    # swap target sequence to first position if it is not
    # the first sequence in alignment;
    # this is particularly important for hhfilter run
    # because target sequence might otherwise be filtered out
    if focus_index != 0:
        indices = np.arange(0, len(focus_ali))
        indices[0] = focus_index
        indices[focus_index] = 0
        focus_index = 0
        focus_ali = focus_ali.select(sequences=indices)

    # write the raw focus alignment for hmmbuild
    focus_fasta_file = prefix + "_raw_focus_input.fasta"
    with open(focus_fasta_file, "w") as f:
        focus_ali.write(f, "fasta")

    return focus_fasta_file, target_sequence_file, region_start, region_end
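
# Hedged sketch of calling _format_alignment_for_hmmbuild: it needs the input
# alignment path plus prefix/sequence_id/first_index kwargs, and returns the
# reordered focus FASTA for hmmbuild together with the target sequence file
# and region bounds. Values are hypothetical placeholders.
def _example_format_for_hmmbuild_usage():
    focus_fasta_file, target_sequence_file, region_start, region_end = \
        _format_alignment_for_hmmbuild(
            "example_input.fasta",
            prefix="output/example",
            sequence_id="QUERY_ID",
            first_index=None,  # fall back to range parsed from header
        )
    return focus_fasta_file, target_sequence_file, region_start, region_end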
def standard(**kwargs):
    """
    Protocol:

    Standard buildali4 workflow (run iterative jackhmmer
    search against sequence database, then determine which
    sequences and columns to include in the calculation based
    on coverage and maximum gap thresholds).

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * sequence_id (passed through from input)
        * first_index (passed through from input)
        * alignment_file
        * raw_alignment_file
        * raw_focus_alignment_file
        * statistics_file
        * target_sequence_file
        * sequence_file
        * annotation_file
        * frequencies_file
        * identities_file
        * hittable_file
        * focus_mode
        * focus_sequence
        * segments
    """
    check_required(
        kwargs,
        [
            "prefix", "extract_annotation",
        ]
    )

    prefix = kwargs["prefix"]

    # make sure output directory exists
    create_prefix_folders(prefix)

    # first step of protocol is to get alignment using
    # jackhmmer; initialize output configuration with
    # results of this search
    jackhmmer_outcfg = jackhmmer_search(**kwargs)
    stockholm_file = jackhmmer_outcfg["raw_alignment_file"]

    segment = Segment.from_list(jackhmmer_outcfg["segments"][0])
    target_seq_id = segment.sequence_id
    region_start = segment.region_start
    region_end = segment.region_end

    # read in stockholm format (with full annotation)
    with open(stockholm_file) as a:
        ali_raw = Alignment.from_file(a, "stockholm")

    # and store as FASTA file first (disabled for now
    # since equivalent information can easily be obtained
    # from Stockholm file)
    """
    ali_raw_fasta_file = prefix + "_raw.fasta"
    with open(ali_raw_fasta_file, "w") as f:
        ali_raw.write(f, "fasta")
    """

    # save annotation in sequence headers (species etc.);
    # initialize to None so the outcfg merge below does not
    # fail if extract_annotation is disabled
    annotation_file = None
    if kwargs["extract_annotation"]:
        annotation_file = prefix + "_annotation.csv"
        annotation = extract_header_annotation(ali_raw)
        annotation.to_csv(annotation_file, index=False)

    # center alignment around focus/search sequence
    focus_cols = np.array([c != "-" for c in ali_raw[0]])
    focus_ali = ali_raw.select(columns=focus_cols)

    target_seq_index = 0
    mod_outcfg, ali = modify_alignment(
        focus_ali, target_seq_index, target_seq_id, region_start, **kwargs
    )

    # merge results of jackhmmer_search and modify_alignment stage
    outcfg = {
        **jackhmmer_outcfg,
        **mod_outcfg,
        "annotation_file": annotation_file
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".align_standard.outcfg", outcfg)

    # return results of protocol
    return outcfg
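
# Hedged usage sketch for the standard() protocol. Beyond the two kwargs
# checked above, jackhmmer_search and modify_alignment consume additional
# required parameters via **kwargs; the values shown are hypothetical
# placeholders, not a complete configuration.
def _example_standard_protocol_usage():
    # parameters checked directly by standard():
    config = {
        "prefix": "output/example",
        "extract_annotation": True,
    }
    # modify_alignment's required kwargs (see its check_required above):
    config.update({
        "seqid_filter": None,
        "hhfilter": None,
        "minimum_sequence_coverage": 50,
        "minimum_column_coverage": 70,
        "compute_num_effective_seqs": False,
        "theta": 0.8,
    })
    # NOTE: jackhmmer_search requires further parameters (sequence and
    # database settings) that are not defined in this file and are
    # deliberately omitted from this sketch.
    return standard(**config)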
def find_homologs(pdb_alignment_method="jackhmmer", **kwargs):
    """
    Identify homologs using jackhmmer or hmmbuild/hmmsearch

    Parameters
    ----------
    pdb_alignment_method : {"jackhmmer", "hmmsearch"},
            optional (default: "jackhmmer")
        Sequence alignment method used for searching the PDB
    **kwargs
        Passed into jackhmmer / hmmbuild_and_search protocol
        (see documentation for available options)

    Returns
    -------
    ali : evcouplings.align.Alignment
        Alignment of homologs of query sequence
        in sequence database
    hits : pandas.DataFrame
        Tabular representation of hits
    """
    # load default configuration
    config = parse_config(HMMER_CONFIG)

    # update with overrides from kwargs
    config = {
        **config,
        **kwargs,
    }

    # create temporary output if no prefix is given
    if config["prefix"] is None:
        config["prefix"] = path.join(tempdir(), "compare")

    check_required(config, ["prefix"])

    # run hmmsearch (possibly preceded by hmmbuild)
    if pdb_alignment_method == "hmmsearch":
        # set up config to run hmmbuild_and_search on the
        # unfiltered alignment file
        updated_config = deepcopy(config)
        updated_config["alignment_file"] = config.get("raw_focus_alignment_file")
        ar = hmmbuild_and_search(**updated_config)

        # For hmmbuild and search, we have to read the raw focus alignment
        # file to guarantee that the query sequence is present
        with open(ar["raw_focus_alignment_file"]) as a:
            ali = Alignment.from_file(a, "fasta")

    # run jackhmmer against sequence database
    # at this point we have already checked to ensure
    # that the input is either jackhmmer or hmmsearch
    elif pdb_alignment_method == "jackhmmer":
        ar = jackhmmer_search(**config)

        with open(ar["raw_alignment_file"]) as a:
            ali = Alignment.from_file(a, "stockholm")

        # write alignment as FASTA file for easier checking by hand,
        # if necessary
        with open(config["prefix"] + "_raw.fasta", "w") as f:
            ali.write(f)
    else:
        raise InvalidParameterError(
            "Invalid pdb_alignment_method selected. Valid options are: " +
            ", ".join(["jackhmmer", "hmmsearch"])
        )

    # read hmmer hittable and simplify
    hits = read_hmmer_domtbl(ar["hittable_file"])

    hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[1]
    )
    hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[2]
    )

    hits = hits.rename(
        columns={
            "domain_score": "bitscore",
            "domain_i_Evalue": "e_value",
            "ali_from": "alignment_start",
            "ali_to": "alignment_end",
            "hmm_from": "hmm_start",
            "hmm_to": "hmm_end",
        }
    )

    hits.loc[:, "alignment_start"] = pd.to_numeric(hits.alignment_start).astype(int)
    hits.loc[:, "alignment_end"] = pd.to_numeric(hits.alignment_end).astype(int)

    hits.loc[:, "alignment_id"] = (
        hits.target_name + "/" +
        hits.alignment_start.astype(str) + "-" +
        hits.alignment_end.astype(str)
    )

    hits = hits.loc[
        :, [
            "alignment_id", "uniprot_ac", "uniprot_id",
            "alignment_start", "alignment_end",
            "bitscore", "e_value"
        ]
    ]

    return ali, hits
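
# Hedged usage sketch for find_homologs: kwargs are forwarded to the chosen
# search protocol, so the full parameter set depends on jackhmmer_search or
# hmmbuild_and_search and is omitted here.
def _example_find_homologs_usage():
    ali, hits = find_homologs(
        pdb_alignment_method="jackhmmer",
        prefix=None,  # None triggers creation of a temporary output location
        # NOTE: remaining search/database kwargs required by jackhmmer_search
        # are not defined in this file and are omitted from this sketch.
    )
    # hits is a simplified table; uniprot_ac/uniprot_id are parsed from
    # pipe-delimited target names, e.g. "sp|P12345|NAME_HUMAN"
    return ali, hits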
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "segments", "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance",
            # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet, format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(outcfg["raw_ec_file"])

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn", ascending=False
    )

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
def find_homologs_jackhmmer(**kwargs):
    """
    Identify homologs using jackhmmer

    Parameters
    ----------
    **kwargs
        Passed into jackhmmer_search protocol
        (see documentation for available options)

    Returns
    -------
    ali : evcouplings.align.Alignment
        Alignment of homologs of query sequence
        in sequence database
    hits : pandas.DataFrame
        Tabular representation of hits
    """
    # load default configuration
    config = parse_config(JACKHMMER_CONFIG)

    # update with overrides from kwargs
    config = {
        **config,
        **kwargs,
    }

    # create temporary output if no prefix is given
    if config["prefix"] is None:
        config["prefix"] = path.join(tempdir(), "compare")

    # run jackhmmer against sequence database
    ar = jackhmmer_search(**config)

    with open(ar["raw_alignment_file"]) as a:
        ali = Alignment.from_file(a, "stockholm")

    # write alignment as FASTA file for easier checking by hand,
    # if necessary
    with open(config["prefix"] + "_raw.fasta", "w") as f:
        ali.write(f)

    # read hmmer hittable and simplify
    hits = read_hmmer_domtbl(ar["hittable_file"])

    hits.loc[:, "uniprot_ac"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[1]
    )
    hits.loc[:, "uniprot_id"] = hits.loc[:, "target_name"].map(
        lambda x: x.split("|")[2]
    )

    hits = hits.rename(
        columns={
            "domain_score": "bitscore",
            "domain_i_Evalue": "e_value",
            "ali_from": "alignment_start",
            "ali_to": "alignment_end",
        }
    )

    hits.loc[:, "alignment_start"] = pd.to_numeric(hits.alignment_start).astype(int)
    hits.loc[:, "alignment_end"] = pd.to_numeric(hits.alignment_end).astype(int)

    hits.loc[:, "alignment_id"] = (
        hits.target_name + "/" +
        hits.alignment_start.astype(str) + "-" +
        hits.alignment_end.astype(str)
    )

    hits = hits.loc[
        :, [
            "alignment_id", "uniprot_ac", "uniprot_id",
            "alignment_start", "alignment_end",
            "bitscore", "e_value"
        ]
    ]

    return ali, hits
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "segments", "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance",
            # "save_model",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet, format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(outcfg["raw_ec_file"])

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        # for now, call the last two columns
        # "fn" and "cn" to prevent compare
        # stage from crashing
        names=["i", "A_i", "j", "A_j", "fn", "cn"]
        # names=["i", "A_i", "j", "A_j", "mi", "di"]
    ).sort_values(
        by="cn", ascending=False
    )

    # write the sorted ECs table to csv file
    ecs.to_csv(outcfg["ec_file"], index=False)

    # also store longrange ECs as convenience output
    if kwargs["min_sequence_distance"] is not None:
        outcfg["ec_longrange_file"] = prefix + "_CouplingScores_longrange.csv"
        ecs_longrange = ecs.query(
            "abs(i - j) >= {}".format(kwargs["min_sequence_distance"])
        )
        ecs_longrange.to_csv(outcfg["ec_longrange_file"], index=False)

        # also create line-drawing script (for now, only for single segments);
        # nested here since ecs_longrange is only defined in this branch
        if segments is None or len(segments) == 1:
            outcfg["ec_lines_pml_file"] = prefix + "_draw_ec_lines.pml"
            L = outcfg["num_sites"]
            ec_lines_pymol_script(
                ecs_longrange.iloc[:L, :],
                outcfg["ec_lines_pml_file"],
                score_column="cn"  # "di"
            )

    # compute EC enrichment (for now, for single segments
    # only since enrichment code cannot handle multiple segments)
    if segments is None or len(segments) == 1:
        outcfg["enrichment_file"] = prefix + "_enrichment.csv"
        ecs_enriched = pairs.enrichment(ecs, score="cn")  # "di"
        ecs_enriched.to_csv(outcfg["enrichment_file"], index=False)

        # create corresponding enrichment pymol scripts
        outcfg["enrichment_pml_files"] = []
        for sphere_view, pml_suffix in [
            (True, "_enrichment_spheres.pml"),
            (False, "_enrichment_sausage.pml")
        ]:
            pml_file = prefix + pml_suffix
            enrichment_pymol_script(ecs_enriched, pml_file, sphere_view=sphere_view)
            outcfg["enrichment_pml_files"].append(pml_file)

    # output EVzoom JSON file if we have stored model file
    if outcfg.get("model_file", None) is not None:
        outcfg["evzoom_file"] = prefix + "_evzoom.json"
        with open(outcfg["evzoom_file"], "w") as f:
            # create JSON output and write to file
            f.write(evzoom_json(model) + "\n")

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_standard.outcfg", outcfg)

    return outcfg
def filter_best_reciprocal(alignment, paralogs, most_similar_in_species,
                           allowed_error=0.02):
    """
    Takes in a table of the best hit to each genome and
    removes sequences that are not the best reciprocal hit
    to the query sequence.

    Parameters
    ----------
    alignment : str
        Path to sequence alignment file
    paralogs : pd.DataFrame
        Rows correspond to paralogs to the query sequence,
        created by find_paralogs() function
    most_similar_in_species : pd.DataFrame
        Contains the id, species name, and percent identity to query
        for each sequence that was the best hit to the query in its
        respective species
    allowed_error : float
        In order for a sequence to be filtered out of the alignment,
        it must be more identical to a paralog sequence than to the
        target sequence by at least this amount

    Returns
    -------
    pd.DataFrame
        Contains the id, species name, and percent identity to query
        for each sequence that was the best reciprocal hit to the
        query sequence
    """
    with open(alignment, "r") as inf:
        ali = Alignment.from_file(inf)

    # Create an n_paralogs x n_sequences ndarray
    # where entry i,j is percent identity of paralog i to sequence j
    # note the identity here will be different than for the unfiltered alignment

    # initialize the matrix
    identity_mat = np.zeros((len(paralogs), len(ali.ids)), dtype=float)

    for idx, paralog_id in enumerate(paralogs.id):
        # calculate the % identity of every seq in the alignment
        # to current paralog
        identities = ali.identities_to(ali[ali.id_to_index[paralog_id]])

        # save the results
        identity_mat[idx, :] = identities

    indices_to_keep = []
    # for every sequence in the alignment that is the most similar
    # to the query in its respective species...
    for index, row in most_similar_in_species.iterrows():
        # get the index of that sequence in the alignment
        alignment_index = ali.id_to_index[row.id]

        # Keep sequences if they are the best reciprocal hit -
        # i.e., that sequence is not more similar to any paralog
        # than it is to the query sequence
        if np.all(
            identity_mat[:, alignment_index] < row.identity_to_query + allowed_error
        ):
            indices_to_keep.append(index)

    return most_similar_in_species.loc[indices_to_keep, :]
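
# Hedged sketch of the inputs filter_best_reciprocal expects: paralogs needs
# an "id" column and most_similar_in_species needs "id" and
# "identity_to_query" columns (as produced by find_paralogs and the species
# best-hit logic). All ids must be present as headers in the alignment file;
# the data below are hypothetical placeholders.
def _example_filter_best_reciprocal_usage():
    import pandas as pd

    paralogs = pd.DataFrame({
        "id": ["PARALOG1/1-100", "PARALOG2/5-95"],
    })
    most_similar_in_species = pd.DataFrame({
        "id": ["HIT1/1-100", "HIT2/2-99"],
        "species": ["Escherichia coli", "Bacillus subtilis"],
        "identity_to_query": [0.85, 0.62],
    })
    return filter_best_reciprocal(
        "example_alignment.fasta", paralogs,
        most_similar_in_species, allowed_error=0.02
    )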
def alignment_index_mapping(alignment_file, format="stockholm",
                            target_seq=None):
    """
    Create index mapping table between sequence positions
    based on a sequence alignment.

    Parameters
    ----------
    alignment_file : str
        Path of alignment file containing sequences for
        which indices should be mapped
    format : {"stockholm", "fasta"}
        Format of alignment file
    target_seq : str, optional (default: None)
        Identifier of sequence around which the index
        mapping will be centered. If None, first sequence
        in alignment will be used.

    Returns
    -------
    pandas.DataFrame
        Mapping table containing assignment of

        1. index in target sequence (i)
        2. symbol in target sequence (A_i)

        For all other sequences in alignment, the following
        two columns:

        3. index in second sequence (j_<sequence id>)
        4. symbol in second sequence (A_j_<sequence_id>)
    """
    # read alignment that is basis of mapping
    with open(alignment_file) as a:
        ali = Alignment.from_file(a, format)

    # determine index of target sequence if necessary
    # (default: first sequence in alignment)
    if target_seq is None:
        target_seq_index = 0
    else:
        for i, full_id in enumerate(ali.ids):
            if full_id.startswith(target_seq):
                target_seq_index = i

    # get range and sequence of target
    id_, target_start, target_end = parse_header(ali.ids[target_seq_index])
    target_seq = ali.matrix[target_seq_index]

    # now map from target numbering to hit numbering
    full_map = None

    for i, full_id in enumerate(ali.ids):
        if i == target_seq_index:
            continue

        # extract information about sequence we are comparing to
        id_, region_start, region_end = parse_header(full_id)
        other_seq = ali.matrix[i]

        # compute mapping table
        map_df = map_indices(
            target_seq, target_start, target_end,
            other_seq, region_start, region_end,
            [ali._match_gap, ali._insert_gap]
        )

        # adjust column names for non-target sequence
        map_df = map_df.rename(
            columns={
                "j": "i_" + full_id,
                "A_j": "A_i_" + full_id,
            }
        )

        # add to full mapping table, left outer join
        # so all positions in target sequence are kept
        if full_map is None:
            full_map = map_df
        else:
            full_map = full_map.merge(map_df, on=("i", "A_i"), how="left")

    return full_map
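
# Hedged usage sketch for alignment_index_mapping: headers in the alignment
# must carry position ranges (e.g. "QUERY_ID/1-100") so parse_header can
# recover the region bounds. Path and identifier are hypothetical.
def _example_alignment_index_mapping_usage():
    mapping_table = alignment_index_mapping(
        "example_alignment.sto",
        format="stockholm",
        target_seq="QUERY_ID",  # None would center on the first sequence
    )
    # columns: i and A_i for the target, plus one index/symbol column pair
    # per other sequence (names follow the rename applied above)
    return mapping_table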
def mean_field(**kwargs):
    """
    Protocol:

    Infer ECs from alignment using mean field direct coupling analysis.

    For now, mean field DCA can only be run in focus mode, gaps
    included.

    Parameters
    ----------
    Mandatory kwargs arguments:
        See list below in code where calling check_required.

    Returns
    -------
    outcfg : dict
        Output configuration of the pipeline, including
        the following fields:

        * raw_ec_file
        * model_file
        * num_sites
        * num_valid_sequences
        * effective_sequences
        * focus_mode (passed through)
        * focus_sequence (passed through)
        * segments (passed through)
    """
    check_required(
        kwargs,
        [
            "prefix", "alignment_file",
            "segments", "focus_mode", "focus_sequence", "theta",
            "pseudo_count", "alphabet",
            "min_sequence_distance",
            # "save_model",
            "ec_score_type",
        ]
    )

    if not kwargs["focus_mode"]:
        raise InvalidParameterError(
            "For now, mean field DCA can only be run in focus mode."
        )

    prefix = kwargs["prefix"]

    # option to save model disabled
    """
    if kwargs["save_model"]:
        model = prefix + ".model"
    else:
        model = None
    """
    model = prefix + ".model"

    outcfg = {
        "model_file": model,
        "raw_ec_file": prefix + "_ECs.txt",
        "ec_file": prefix + "_CouplingScores.csv",
        # TODO: the following are passed through stage...
        # keep this or unnecessary?
        "focus_mode": kwargs["focus_mode"],
        "focus_sequence": kwargs["focus_sequence"],
        "segments": kwargs["segments"],
    }

    # make sure input alignment exists
    alignment_file = kwargs["alignment_file"]
    verify_resources(
        "Input alignment does not exist",
        kwargs["alignment_file"]
    )

    # make sure output directory exists
    create_prefix_folders(prefix)

    segments = kwargs["segments"]
    if segments is not None:
        segments = [
            mapping.Segment.from_list(s) for s in segments
        ]

    # determine alphabet
    # default is protein
    if kwargs["alphabet"] is None:
        alphabet = ALPHABET_PROTEIN
    else:
        alphabet = kwargs["alphabet"]

        # allow shortcuts for protein, DNA, RNA
        if alphabet in ALPHABET_MAP:
            alphabet = ALPHABET_MAP[alphabet]

    # read in a2m alignment
    with open(alignment_file) as f:
        input_alignment = Alignment.from_file(
            f, alphabet=alphabet, format="fasta"
        )

    # init mean field direct coupling analysis
    mf_dca = MeanFieldDCA(input_alignment)

    # run mean field approximation
    model = mf_dca.fit(
        theta=kwargs["theta"],
        pseudo_count=kwargs["pseudo_count"]
    )

    # write ECs to file
    model.to_raw_ec_file(outcfg["raw_ec_file"])

    # write model file
    if outcfg["model_file"] is not None:
        model.to_file(
            outcfg["model_file"],
            file_format="plmc_v2"
        )

    # store useful information about model in outcfg
    outcfg.update({
        "num_sites": model.L,
        "num_valid_sequences": model.N_valid,
        "effective_sequences": float(round(model.N_eff, 1)),
        "region_start": int(model.index_list[0]),
    })

    # read and sort ECs
    # Note: this now deviates from the original EC format
    # file because it has 4 score columns to accommodate
    # MI (raw), MI (APC-corrected), DI, CN;
    ecs = pd.read_csv(
        outcfg["raw_ec_file"], sep=" ",
        names=["i", "A_i", "j", "A_j", "mi_raw", "mi_apc", "di", "cn"]
    )

    # select target score;
    # by default select CN score, since it allows to compute probabilities etc.
    ec_score_type = kwargs.get("ec_score_type", "cn")
    valid_ec_type_choices = ["cn", "di", "mi_raw", "mi_apc"]

    if ec_score_type not in valid_ec_type_choices:
        raise InvalidParameterError(
            "Invalid choice for ec_score_type: {}, valid options are: {}".format(
                ec_score_type, ", ".join(valid_ec_type_choices)
            )
        )

    # perform rescoring if CN score is selected, otherwise cannot rescore
    # since all models are based on distribution shapes generated by CN score
    if ec_score_type == "cn":
        # perform EC rescoring starting from CN score output by plmc;
        # outconfig update will be merged further down in final outcfg merge;
        # returned list is already sorted
        ecs, rescorer_outcfg_update = rescore_cn_score_ecs(
            ecs, segments, outcfg, kwargs, score="cn"
        )
    else:
        # If MI or DI, cannot apply distribution-based rescoring approaches,
        # so just set score column and add dummy probability value for
        # compatibility with downstream code
        ecs = ecs.assign(
            score=ecs[ec_score_type],
            probability=np.nan
        ).sort_values(
            by="score", ascending=False
        )

        # no additional values to be updated in outcfg in this case
        rescorer_outcfg_update = {}

    is_single_segment = segments is None or len(segments) == 1
    outcfg = {
        **outcfg,
        **rescorer_outcfg_update,
        **_postprocess_inference(
            ecs, kwargs, model, outcfg, prefix,
            generate_enrichment=is_single_segment,
            generate_line_plot=is_single_segment,
            score="score"
        )
    }

    # dump output config to YAML file for debugging/logging
    write_config_file(prefix + ".couplings_meanfield.outcfg", outcfg)

    return outcfg
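
# Hedged usage sketch for the mean_field() protocol, listing the required
# kwargs checked above. Paths and values are hypothetical placeholders.
def _example_mean_field_usage():
    outcfg = mean_field(
        prefix="output/example",
        alignment_file="example.a2m",
        segments=None,                # single-segment run
        focus_mode=True,              # required: only focus mode is supported
        focus_sequence="QUERY_ID/1-100",
        theta=0.8,
        pseudo_count=0.5,
        alphabet=None,                # None -> default protein alphabet
        min_sequence_distance=6,
        ec_score_type="cn",           # one of cn, di, mi_raw, mi_apc
    )
    return outcfg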