def _prepare_structures(structures, pdb_id_list, raise_missing=True): """ Get structures ready for distance calculation Parameters ---------- structures : str or dict See intra_dists function for explanation pdb_id_list: List of PDB entries to load raise_missing : bool, optional (default: True) Raise a ResourceError if any of the input structures can not be loaded; otherwise, ignore missing entries. Returns ------- dict dictionary with lower-case PDB ids as keys and PDB objects as value """ # load structures if not yet done so if structures is None or isinstance(structures, str): structures = load_structures( pdb_id_list, structures, raise_missing ) return structures
def complex(**kwargs): """ Protocol: Compare ECs for a complex to 3D structure Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * ec_file_compared_all * ec_file_compared_all_longrange * pdb_structure_hits * distmap_monomer * distmap_multimer * contact_map_files * remapped_pdb_files """ check_required(kwargs, [ "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir", "atom_filter", "first_compare_multimer", "second_compare_multimer", "distance_cutoff", "first_sequence_id", "second_sequence_id", "first_sequence_file", "second_sequence_file", "first_segments", "second_segments", "first_target_sequence_file", "second_target_sequence_file", "scale_sizes" ]) prefix = kwargs["prefix"] outcfg = { # initialize output EC files "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv", # initialize output inter distancemap files "distmap_inter": prefix + "_distmap_inter", "inter_contacts_file": prefix + "_inter_contacts_file" } # Add PDB comparison files for first and second monomer for monomer_prefix in ["first", "second"]: outcfg = { **outcfg, monomer_prefix + "_pdb_structure_hits_file": "{}_{}_structure_hits.csv".format(prefix, monomer_prefix), monomer_prefix + "_pdb_structure_hits_unfiltered_file": "{}_{}_structure_hits_unfitered.csv".format( prefix, monomer_prefix), monomer_prefix + "_distmap_monomer": "{}_{}_distance_map_monomer".format(prefix, monomer_prefix), monomer_prefix + "_distmap_multimer": "{}_{}_distance_map_multimer".format(prefix, monomer_prefix), } # make sure EC file exists verify_resources("EC file does not exist", kwargs["ec_file"]) # make sure output directory exists create_prefix_folders(prefix) # store auxiliary files here (too much for average user) aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) create_prefix_folders(aux_prefix) # store auxiliary files here (too much for average user) first_aux_prefix = insert_dir(aux_prefix, "first_monomer", rootname_subdir=False) create_prefix_folders(first_aux_prefix) # store auxiliary files here (too much for average user) second_aux_prefix = insert_dir(aux_prefix, "second_monomer", rootname_subdir=False) create_prefix_folders(second_aux_prefix) # Step 1: Identify 3D structures for comparison def _identify_monomer_structures(name_prefix, outcfg, aux_prefix): # create a dictionary with kwargs for just the current monomer # remove the "prefix" kwargs so that we can replace with the # aux prefix when calling _identify_structures # only replace first occurrence of name_prefix monomer_kwargs = { k.replace(name_prefix + "_", "", 1): v for k, v in kwargs.items() if "prefix" not in k } # this field needs to be set explicitly else it gets overwritten by concatenated file monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"] monomer_kwargs["raw_focus_alignment_file"] = kwargs[ name_prefix + "_raw_focus_alignment_file"] # identify structures for that monomer sifts_map, sifts_map_full = _identify_structures(**monomer_kwargs, prefix=aux_prefix) # save selected PDB hits sifts_map.hits.to_csv(outcfg[name_prefix + "_pdb_structure_hits_file"], index=False) # also save full list of hits sifts_map_full.hits.to_csv( outcfg[name_prefix + "_pdb_structure_hits_unfiltered_file"], index=False) return outcfg, sifts_map outcfg, first_sifts_map = _identify_monomer_structures( "first", outcfg, first_aux_prefix) outcfg, second_sifts_map = _identify_monomer_structures( "second", outcfg, second_aux_prefix) # get the segment names from the kwargs segment_list = kwargs["segments"] # Make sure user provided exactly two segments if len(segment_list) != 2: raise InvalidParameterError( "Compare stage for protein complexes requires exactly two segments" ) first_segment_name = kwargs["segments"][0][0] second_segment_name = kwargs["segments"][1][0] # Step 2: Compute distance maps def _compute_monomer_distance_maps(sifts_map, name_prefix, chain_name): # prepare a sequence map to remap the structures we have found verify_resources("Target sequence file does not exist", kwargs[name_prefix + "_target_sequence_file"]) # create target sequence map for remapping structure with open(kwargs[name_prefix + "_target_sequence_file"]) as f: header, seq = next(read_fasta(f)) # create target sequence map for remapping structure seq_id, seq_start, seq_end = parse_header(header) seqmap = dict(zip(range(seq_start, seq_end + 1), seq)) # compute distance maps and save # (but only if we found some structure) if len(sifts_map.hits) > 0: d_intra = intra_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_" + name_prefix + "_distmap_intra") d_intra.to_file(outcfg[name_prefix + "_distmap_monomer"]) # save contacts to separate file outcfg[ name_prefix + "_monomer_contacts_file"] = prefix + "_" + name_prefix + "_contacts_monomer.csv" d_intra.contacts(kwargs["distance_cutoff"]).to_csv( outcfg[name_prefix + "_monomer_contacts_file"], index=False) # compute multimer distances, if requested; # note that d_multimer can be None if there # are no structures with multiple chains if kwargs[name_prefix + "_compare_multimer"]: d_multimer = multimer_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_" + name_prefix + "_distmap_multimer") else: d_multimer = None # if we have a multimer contact map, save it if d_multimer is not None: d_multimer.to_file(outcfg[name_prefix + "_distmap_multimer"]) outcfg[ name_prefix + "_multimer_contacts_file"] = prefix + name_prefix + "_contacts_multimer.csv" # save contacts to separate file d_multimer.contacts(kwargs["distance_cutoff"]).to_csv( outcfg[name_prefix + "_multimer_contacts_file"], index=False) else: outcfg[name_prefix + "_distmap_multimer"] = None # create remapped structures (e.g. for # later comparison of folding results) # remap structures, swap mapping index and filename in # dictionary so we have a list of files in the dict keys outcfg[name_prefix + "_remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( sifts_map, aux_prefix, seqmap, chain_name=chain_name, raise_missing=kwargs["raise_missing"]).items() } else: # if no structures, cannot compute distance maps d_intra = None d_multimer = None outcfg[name_prefix + "_distmap_monomer"] = None outcfg[name_prefix + "_distmap_multimer"] = None outcfg[name_prefix + "remapped_pdb_files"] = None return d_intra, d_multimer, seqmap # load all structures for both monomers all_structures = set(first_sifts_map.hits.pdb_id).union( set(second_sifts_map.hits.pdb_id)) structures = load_structures(all_structures, kwargs["pdb_mmtf_dir"], raise_missing=False) d_intra_i, d_multimer_i, seqmap_i = _compute_monomer_distance_maps( first_sifts_map, "first", "A") d_intra_j, d_multimer_j, seqmap_j = _compute_monomer_distance_maps( second_sifts_map, "second", "B") # compute inter distance map if sifts map for each monomer exists if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0: d_inter = inter_dists(first_sifts_map, second_sifts_map, raise_missing=kwargs["raise_missing"]) # if there were overlapping PDBs, save the results if d_inter is not None: d_inter.to_file(outcfg["distmap_inter"]) # save contacts to separate file d_inter.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["inter_contacts_file"], index=False) else: outcfg["inter_contacts_file"] = None d_inter = None # # Step 3: Compare ECs to distance maps ec_table = pd.read_csv(kwargs["ec_file"]) for out_file, min_seq_dist in [ ("ec_compared_longrange_file", kwargs["min_sequence_distance"]), ("ec_compared_all_file", 0), ]: # compare ECs only if we have an intra distance map # for at least one monomer - inter can't exist unless # we have both monomers if (d_intra_i is not None) or (d_intra_j is not None): # compare distances individually for each segment pair ecs_intra_i = ec_table.query( "segment_i == segment_j == @first_segment_name") if d_intra_i is not None: ecs_intra_i_compared = coupling_scores_compared( ecs_intra_i, d_intra_i, d_multimer_i, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist=min_seq_dist) else: # If no distance map, the distance is saved as np.nan ecs_intra_i_compared = ecs_intra_i.assign(dist=np.nan) ecs_intra_j = ec_table.query( "segment_i == segment_j == @second_segment_name") if d_intra_j is not None: ecs_intra_j_compared = coupling_scores_compared( ecs_intra_j, d_intra_j, d_multimer_j, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist=min_seq_dist) else: ecs_intra_j_compared = ecs_intra_j.assign(dist=np.nan) ecs_inter = ec_table.query("segment_i != segment_j") if d_inter is not None: ecs_inter_compared = coupling_scores_compared( ecs_inter, d_inter, dist_map_multimer=None, dist_cutoff=kwargs["distance_cutoff"], output_file=None, min_sequence_dist= None # does not apply for inter-protein ECs ) else: ecs_inter_compared = ecs_inter.assign(dist=np.nan) # combine the tables ec_table_compared = pd.concat([ ecs_inter_compared, ecs_intra_i_compared, ecs_intra_j_compared ]) # rename the precision column to "segmentwise_precision" # because we calculated precision for each segment independently ec_table_compared = ec_table_compared.rename( columns={"precision": "segmentwise_precision"}) # TODO: change "cn" to "score" eventually ec_table_compared = ec_table_compared.sort_values("cn", ascending=False) # add the total precision # TODO: implement different cutoffs for intra vs inter contacts ec_table_compared = add_precision( ec_table_compared, dist_cutoff=kwargs["distance_cutoff"]) # save to file # all ecs ec_table_compared.to_csv(outcfg[out_file]) # save the inter ECs to a file ecs_inter_compared.to_csv(outcfg["ec_compared_inter_file"]) # create the inter-ecs line drawing script if outcfg["ec_compared_inter_file"] is not None and kwargs[ "plot_highest_count"] is not None: inter_ecs = ec_table.query("segment_i != segment_j") outcfg[ "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml" pairs.ec_lines_pymol_script( inter_ecs.iloc[:kwargs["plot_highest_count"], :], outcfg["ec_lines_compared_pml_file"], distance_cutoff=kwargs["distance_cutoff"], chain={ first_segment_name: "A", second_segment_name: "B" }) # Remap the complex crystal structures, if available if len(first_sifts_map.hits) > 0 and len(second_sifts_map.hits) > 0: outcfg["complex_remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_complex_chains( first_sifts_map, second_sifts_map, seqmap_i, seqmap_j, output_prefix=aux_prefix, raise_missing=kwargs["raise_missing"]).items() } # Step 4: Make contact map plots # if no structures available, defaults to EC-only plot outcfg["contact_map_files"] = _make_complex_contact_maps( ec_table, d_intra_i, d_multimer_i, d_intra_j, d_multimer_j, d_inter, first_segment_name, second_segment_name, **kwargs) return outcfg
def standard(**kwargs): """ Protocol: Compare ECs for single proteins (or domains) to 3D structure information Parameters ---------- Mandatory kwargs arguments: See list below in code where calling check_required Returns ------- outcfg : dict Output configuration of the pipeline, including the following fields: * ec_file_compared_all * ec_file_compared_all_longrange * pdb_structure_hits * distmap_monomer * distmap_multimer * contact_map_files * remapped_pdb_files """ check_required(kwargs, [ "prefix", "ec_file", "min_sequence_distance", "pdb_mmtf_dir", "atom_filter", "compare_multimer", "distance_cutoff", "target_sequence_file", "scale_sizes", ]) prefix = kwargs["prefix"] outcfg = { "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", "pdb_structure_hits_file": prefix + "_structure_hits.csv", "pdb_structure_hits_unfiltered_file": prefix + "_structure_hits_unfiltered.csv", # cannot have the distmap files end with "_file" because there are # two files (.npy and .csv), which would cause problems with automatic # checking if those files exist "distmap_monomer": prefix + "_distance_map_monomer", "distmap_multimer": prefix + "_distance_map_multimer", } # make sure EC file exists verify_resources("EC file does not exist", kwargs["ec_file"]) # make sure output directory exists create_prefix_folders(prefix) # store auxiliary files here (too much for average user) aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) create_prefix_folders(aux_prefix) # Step 1: Identify 3D structures for comparison sifts_map, sifts_map_full = _identify_structures( **{ **kwargs, "prefix": aux_prefix, }) # save selected PDB hits sifts_map.hits.to_csv(outcfg["pdb_structure_hits_file"], index=False) # also save full list of hits sifts_map_full.hits.to_csv(outcfg["pdb_structure_hits_unfiltered_file"], index=False) # Step 2: Compute distance maps # load all structures at once structures = load_structures(sifts_map.hits.pdb_id, kwargs["pdb_mmtf_dir"], raise_missing=False) # compute distance maps and save # (but only if we found some structure) if len(sifts_map.hits) > 0: d_intra = intra_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_distmap_intra") d_intra.to_file(outcfg["distmap_monomer"]) # save contacts to separate file outcfg["monomer_contacts_file"] = prefix + "_contacts_monomer.csv" d_intra.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["monomer_contacts_file"], index=False) # compute multimer distances, if requested; # note that d_multimer can be None if there # are no structures with multiple chains if kwargs["compare_multimer"]: d_multimer = multimer_dists(sifts_map, structures, atom_filter=kwargs["atom_filter"], output_prefix=aux_prefix + "_distmap_multimer") else: d_multimer = None # if we have a multimer contact mapin the end, save it if d_multimer is not None: d_multimer.to_file(outcfg["distmap_multimer"]) outcfg[ "multimer_contacts_file"] = prefix + "_contacts_multimer.csv" # save contacts to separate file d_multimer.contacts(kwargs["distance_cutoff"]).to_csv( outcfg["multimer_contacts_file"], index=False) else: outcfg["distmap_multimer"] = None # at this point, also create remapped structures (e.g. for # later comparison of folding results) verify_resources("Target sequence file does not exist", kwargs["target_sequence_file"]) # create target sequence map for remapping structure with open(kwargs["target_sequence_file"]) as f: header, seq = next(read_fasta(f)) seq_id, seq_start, seq_end = parse_header(header) seqmap = dict(zip(range(seq_start, seq_end + 1), seq)) # remap structures, swap mapping index and filename in # dictionary so we have a list of files in the dict keys outcfg["remapped_pdb_files"] = { filename: mapping_index for mapping_index, filename in remap_chains( sifts_map, aux_prefix, seqmap).items() } else: # if no structures, can not compute distance maps d_intra = None d_multimer = None outcfg["distmap_monomer"] = None outcfg["distmap_multimer"] = None outcfg["remapped_pdb_files"] = None # Step 3: Compare ECs to distance maps ec_table = pd.read_csv(kwargs["ec_file"]) # identify number of sites in EC model num_sites = len( set.union(set(ec_table.i.unique()), set(ec_table.j.unique()))) for out_file, min_seq_dist in [ ("ec_compared_longrange_file", kwargs["min_sequence_distance"]), ("ec_compared_all_file", 0), ]: # compare ECs only if we minimally have intra distance map if d_intra is not None: coupling_scores_compared(ec_table, d_intra, d_multimer, dist_cutoff=kwargs["distance_cutoff"], output_file=outcfg[out_file], min_sequence_dist=min_seq_dist) else: outcfg[out_file] = None # also create line-drawing script if we made the csv if outcfg["ec_compared_longrange_file"] is not None: ecs_longrange = pd.read_csv(outcfg["ec_compared_longrange_file"]) outcfg[ "ec_lines_compared_pml_file"] = prefix + "_draw_ec_lines_compared.pml" pairs.ec_lines_pymol_script(ecs_longrange.iloc[:num_sites, :], outcfg["ec_lines_compared_pml_file"], distance_cutoff=kwargs["distance_cutoff"]) # Step 4: Make contact map plots # if no structures available, defaults to EC-only plot outcfg["contact_map_files"] = _make_contact_maps(ec_table, d_intra, d_multimer, **kwargs) return outcfg