def complex_to_pairs(complex, source_type, get_pairs, output_dir):
    """Write all pairs for a given complex to dill files, recording the complex in pairs.txt."""
    pairs_txt = output_dir + '/pairs.txt'
    name = complex.name
    logging.info("Working on {:}".format(name))
    pairs, num_subunits = get_pairs(complex)
    casp_capri_addon_message = '; selecting pair with most inter-chain interactions'
    logging_message = "For complex {:} found {:} pairs out of {:} chains"
    logging_message += casp_capri_addon_message \
        if source_type == 'casp_capri' and num_subunits > 1 else ''
    logging.info(logging_message.format(name, len(pairs), num_subunits))
    sub_dir = output_dir + '/' + db.get_pdb_code(name)[1:3]
    f = name
    if ('mut' in f) and ('mut' not in db.get_pdb_code(name)):
        # Give mutant complexes their own subdirectory by appending the
        # mutation suffix to the PDB code
        pdb = db.get_pdb_code(f) + f[f.rfind('_') + 1: f.find('.')]
        sub_dir = output_dir + '/' + pdb
    with sem:
        if len(pairs) > 0:
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)
            with open(pairs_txt, 'a') as f:
                f.write(name + '\n')
            if source_type == 'casp_capri':
                # For CASP-CAPRI targets, keep only the pair with the most
                # inter-chain interactions (i.e. positive residue indices)
                pair_with_most_interactions = pairs[0]
                for pair in pairs:
                    if len(pair.pos_idx) > len(pair_with_most_interactions.pos_idx):
                        pair_with_most_interactions = pair
                pairs = [pair_with_most_interactions]
                assert len(pairs) == 1, \
                    'For CASP-CAPRI complexes, the pair with the most interactions must be the only pair selected'
    for i, pair in enumerate(pairs):
        output_dill = "{:}/{:}_{:}.dill".format(sub_dir, name, i)
        write_pair_as_dill(pair, output_dill)
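# Hedged sketch (assumptions, not the repo's actual definitions):
# `complex_to_pairs` above relies on a module-level semaphore `sem` that
# serializes appends to the shared pairs.txt across worker processes, and on
# `write_pair_as_dill`, which is assumed to pickle one pair with the `dill`
# package, roughly like this:
import multiprocessing

import dill

sem = multiprocessing.Semaphore()


def write_pair_as_dill(pair, output_dill):
    """Serialize a single pair to `output_dill` using dill (assumed helper)."""
    with open(output_dill, 'wb') as f:
        dill.dump(pair, f)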
def find_of_type(pdb_name_query, pdb_dataset, receptor, bound, style):
    """Get matching partner of provided file."""
    pdb_code = db.get_pdb_code(pdb_name_query)
    results = None
    for pdb_name in db.get_structures_filenames(pdb_dataset):
        if db.get_pdb_code(pdb_name) == pdb_code:
            if is_of_type(pdb_name, style, receptor=receptor, bound=bound):
                results = pdb_name
    return results
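# Example usage (hypothetical dataset path): locate the bound receptor file
# that shares a PDB code with an unbound query in a DB5-style dataset.
#
#   bound_receptor = find_of_type('11as_r_u.pdb', '/path/to/db5',
#                                 receptor=True, bound=True, style='db5')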
def _generate_clean_unbound_bound(filename_u, filename_b, results_dir, style):
    """Perform alignment on unbound and bound files to standardize them."""
    b2r_chain, u2r_chain, b2r_res, u2r_res = \
        _get_chain_mapping(filename_u, filename_b, style)
    aug_pdb_code_u = get_pdb_code_with_partner_and_binding(filename_u)
    if style == 'db5':
        aug_pdb_code_b = get_pdb_code_with_partner_and_binding(filename_b)
        pdb_extension = 'pdb'
    elif style == 'dockground':
        partner = _get_partner(filename_u)
        aug_pdb_code_b = db.get_pdb_code(filename_b) + '_' + partner + '_b'
        pdb_extension = db.get_pdb_type(filename_b)
    output_filename_u = results_dir + '/' + aug_pdb_code_u + '_cleaned.' + \
        pdb_extension
    output_filename_b = results_dir + '/' + aug_pdb_code_b + '_cleaned.' + \
        pdb_extension
    _generate_reference(
        filename_b, b2r_chain, b2r_res, output_filename_b, style)
    _generate_reference(
        filename_u, u2r_chain, u2r_res, output_filename_u, style)
    output_mapping_u = results_dir + '/' + aug_pdb_code_u + '_toref.pkl'
    output_mapping_b = results_dir + '/' + aug_pdb_code_b + '_toref.pkl'
    _generate_mapping(u2r_chain, u2r_res, output_mapping_u)
    _generate_mapping(b2r_chain, b2r_res, output_mapping_b)
def get_pdb_code_with_binding(pdb_filename):
    """
    Get PDB code with binding state annotated.

    e.g. 11as_r_u.pdb would give 11as_u
    """
    return db.get_pdb_code(pdb_filename) + '_' + _get_binding(pdb_filename)
def get_pdb_code_with_partner(pdb_filename):
    """
    Get PDB code with partner annotated.

    e.g. 11as_r_u.pdb would give 11as_r
    """
    return db.get_pdb_code(pdb_filename) + '_' + _get_partner(pdb_filename)
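# Hedged sketch of the `_get_partner` and `_get_binding` helpers referenced
# above (assumed, based on the docstring examples): for DB5-style names such
# as `11as_r_u.pdb`, the first underscore-delimited token after the PDB code
# is the partner ('r' receptor / 'l' ligand) and the second is the binding
# state ('u' unbound / 'b' bound).
import os


def _get_partner(pdb_filename):
    """Return the partner token, e.g. 'r' for 11as_r_u.pdb (assumed helper)."""
    return os.path.basename(pdb_filename).split('.')[0].split('_')[1]


def _get_binding(pdb_filename):
    """Return the binding token, e.g. 'u' for 11as_r_u.pdb (assumed helper)."""
    return os.path.basename(pdb_filename).split('.')[0].split('_')[2]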
def map_all_pssms(pdb_dataset, blastdb, output_dir, num_cpus):
    """Compute PSSM features for all provided structures."""
    ext = '.pkl'
    requested_filenames = \
        db.get_structures_filenames(pdb_dataset, extension=ext)
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [
        x[0] for x in db.get_all_filenames(
            work_keys, pdb_dataset, extension=ext,
            keyer=lambda x: db.get_pdb_name(x))
    ]
    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
def parse_all(pdb_dataset, output_dir, num_cpus):
    """Parse pdb dataset (pdb files) to pandas dataframes."""
    requested_filenames = db.get_structures_filenames(pdb_dataset)
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [x[0] for x in db.get_all_filenames(
        work_keys, pdb_dataset, enforcement=2)]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys"
                 .format(len(requested_keys), len(produced_keys),
                         len(work_keys)))
    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")
    inputs = [(key, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(parse, inputs, num_cpus)
def main(pair_dir, tfrecord_dir, num_cpus):
    """Run pairs_to_tfrecord on all provided complexes."""
    requested_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = \
        db.get_structures_filenames(tfrecord_dir, extension='.tfrecord')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    work_filenames = [
        x[0] for x in db.get_all_filenames(
            work_keys, pair_dir, extension='.dill')
    ]
    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = tfrecord_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".tfrecord")
    inputs = [(i, o) for i, o in zip(work_filenames, output_filenames)]
    par.submit_jobs(pairs_to_tfrecord, inputs, num_cpus)
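# The drivers above share an idempotent work-discovery pattern: derive a key
# per requested file, subtract the keys already produced in the output
# directory, and only (re)process the remainder, so interrupted runs can be
# resumed safely. A minimal generic sketch of that pattern (illustrative;
# `find_pending_work` and `keyer` are hypothetical names):
def find_pending_work(requested_filenames, produced_filenames, keyer):
    """Return the requested files whose keys have not yet been produced."""
    produced = {keyer(f) for f in produced_filenames}
    return [f for f in requested_filenames if keyer(f) not in produced]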
def main(pair_dir, to_keep_dir, output_dir, num_cpus):
    """Run process_pairs_to_keep on all provided complexes."""
    to_keep_filenames = \
        db.get_structures_filenames(to_keep_dir, extension='.txt')
    if len(to_keep_filenames) == 0:
        logging.warning(
            "There is no to_keep file in {:}. All pair files from {:} "
            "will be copied into {:}".format(to_keep_dir, pair_dir,
                                             output_dir))
    to_keep_df = __load_to_keep_files_into_dataframe(to_keep_filenames)
    logging.info("There are {:} (rows, cols) in to_keep_df".format(
        to_keep_df.shape))
    logging.info("Looking for all pairs in {:}".format(pair_dir))
    work_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), pair_dir))
    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".dill")
    inputs = [(i, o, to_keep_df)
              for i, o in zip(work_filenames, output_filenames)]
    ncopied = 0
    ncopied += np.sum(
        par.submit_jobs(process_pairs_to_keep, inputs, num_cpus))
    logging.info("{:} out of {:} pairs were copied".format(
        ncopied, len(work_keys)))
def complex_to_pairs(complex, get_pairs, output_dir):
    """Write all pairs for a given complex to dill files, recording the complex in pairs.txt."""
    pairs_txt = output_dir + '/pairs.txt'
    name = complex.name
    logging.info("Working on {:}".format(name))
    pairs, num_subunits = get_pairs(complex)
    logging.info("For complex {:} found {:} pairs out of {:} chains"
                 .format(name, len(pairs), num_subunits))
    sub_dir = output_dir + '/' + db.get_pdb_code(name)[1:3]
    f = name
    if ('mut' in f) and ('mut' not in db.get_pdb_code(name)):
        # Give mutant complexes their own subdirectory by appending the
        # mutation suffix to the PDB code
        pdb = db.get_pdb_code(f) + f[f.rfind('_') + 1: f.find('.')]
        sub_dir = output_dir + '/' + pdb
    with sem:
        if len(pairs) > 0:
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir)
            with open(pairs_txt, 'a') as f:
                f.write(name + '\n')
    for i, pair in enumerate(pairs):
        output_dill = "{:}/{:}_{:}.dill".format(sub_dir, name, i)
        write_pair_as_dill(pair, output_dill)
def main(raw_pdb_dir, pruned_pairs_dir, output_dir, neighbor_def, cutoff,
         num_cpus):
    """Run postprocess_pruned_pairs on all provided complexes."""
    logging.info("Looking for all pairs in {:}".format(pruned_pairs_dir))
    work_filenames = \
        db.get_structures_filenames(pruned_pairs_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys),
                                                 pruned_pairs_dir))
    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".dill")
    inputs = [(raw_pdb_dir, neighbor_def, cutoff, i, o)
              for i, o in zip(work_filenames, output_filenames)]
    n_copied = 0
    n_copied += np.sum(
        par.submit_jobs(postprocess_pruned_pairs, inputs, num_cpus))
    logging.info("{:} out of {:} pairs were copied".format(
        n_copied, len(work_keys)))
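# Hedged sketch of what `par.submit_jobs` is assumed to do (the real helper
# lives elsewhere in the repo): fan each tuple of `inputs` out to `func` as
# positional arguments across `num_cpus` worker processes and collect the
# per-job results, which is why callers above can `np.sum` the return value.
import multiprocessing as mp


def submit_jobs(func, inputs, num_cpus):
    """Run `func(*args)` for every args tuple in `inputs` on a process pool."""
    with mp.Pool(processes=num_cpus) as pool:
        return pool.starmap(func, inputs)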
def get_complex_pdb_codes(pdb_dataset):
    """Get complexes in provided directory."""
    complexes = set()
    for structure in db.get_structures_filenames(pdb_dataset):
        complexes.add(db.get_pdb_code(structure))
    return complexes
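# Example usage (hypothetical path): count the unique PDB codes present in a
# dataset directory.
#
#   codes = get_complex_pdb_codes('/path/to/pdb_dataset')
#   print('{:} unique complexes'.format(len(codes)))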
def map_all_profile_hmms(pkl_dataset, pruned_dataset, output_dir, hhsuite_db,
                         num_cpu_jobs, num_cpus_per_job, source_type,
                         num_iter, rank, size, write_file):
    """Generate profile HMM features for all provided structures."""
    ext = '.pkl'
    if write_file:
        if source_type.lower() == 'rcsb':
            # Filter out pairs that did not survive pruning previously to reduce complexity
            pruned_pdb_names = [
                db.get_pdb_name(filename)
                for filename in db.get_structures_filenames(
                    pruned_dataset, extension='.dill')
            ]
            requested_filenames = [
                os.path.join(pkl_dataset,
                             db.get_pdb_code(pruned_pdb_name)[1:3],
                             pruned_pdb_name.split('_')[0] + ext)
                for pruned_pdb_name in pruned_pdb_names
            ]
        else:  # DB5 does not employ pair pruning, so there are no pairs to filter
            requested_filenames = [
                filename for filename in db.get_structures_filenames(
                    pkl_dataset, extension=ext)
            ]
        # Filter DB5 filenames to unbound type and get all work filenames
        requested_filenames = [
            filename for filename in requested_filenames
            if (source_type.lower() == 'db5' and '_u_' in filename) or
               (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri', 'input'])
        ]
        requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
        produced_filenames = db.get_structures_filenames(
            output_dir, extension='.pkl')
        produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
        work_keys = [key for key in requested_keys if key not in produced_keys]
        establish_pdb_code_case = lambda pdb_code, source_type: pdb_code.lower() \
            if source_type.lower() == 'casp_capri' \
            else pdb_code.upper()
        work_filenames = [
            os.path.join(
                pkl_dataset,
                establish_pdb_code_case(
                    db.get_pdb_code(work_key), source_type)[1:3],
                work_key + ext)
            for work_key in work_keys
        ]
        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))
        logging.info(
            "{:} requested keys, {:} produced keys, {:} work filenames".format(
                len(requested_keys), len(produced_keys), len(work_filenames)))
        if source_type.lower() == 'input':
            # Directly generate profile HMM features after aggregating input filenames
            logging.info("{:} work filenames".format(len(work_filenames)))
            output_filenames = []
            for pdb_filename in work_filenames:
                sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
                if not os.path.exists(sub_dir):
                    os.makedirs(sub_dir, exist_ok=True)
                output_filenames.append(
                    sub_dir + '/' + db.get_pdb_name(pdb_filename) + '.pkl')
            inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                       num_iter)
                      for key, output in zip(work_filenames, output_filenames)]
            par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
        else:
            # Write out a local file containing all work filenames
            temp_df = pd.DataFrame({'filename': work_filenames})
            temp_df.to_csv(f'{source_type}_work_filenames.csv')
            logging.info(
                'File containing work filenames written to storage. Exiting...')
    else:
        # Read from previously-created work filenames CSV
        work_filenames = pd.read_csv(
            f'{source_type}_work_filenames.csv').iloc[:, 1].to_list()
        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))
        # Reserve an equally-sized portion of the full workload for a given rank in the MPI world
        work_filename_rank_batches = slice_list(work_filenames, size)
        work_filenames = work_filename_rank_batches[rank]
        logging.info("{:} work filenames".format(len(work_filenames)))
        output_filenames = []
        for pdb_filename in work_filenames:
            sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)
            output_filenames.append(
                sub_dir + '/' + db.get_pdb_name(pdb_filename) + '.pkl')
        inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                   num_iter)
                  for key, output in zip(work_filenames, output_filenames)]
        par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
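# Hedged sketch of the assumed `slice_list` helper used above: partition a
# list into `n` roughly equal contiguous batches so that each MPI rank can
# claim its own share of the workload via `batches[rank]`.
def slice_list(input_list, n):
    """Split `input_list` into `n` nearly equal contiguous slices."""
    k, m = divmod(len(input_list), n)
    return [input_list[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in range(n)]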
def map_all_pssms(pkl_dataset, pruned_dataset, blastdb, output_dir, num_cpus,
                  source_type, rank, size):
    """Compute PSSM features for all provided structures, split across MPI ranks."""
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(
                pruned_dataset, extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:  # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(
                pkl_dataset, extension=ext)
        ]
    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
           (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    if source_type.lower() in ['rcsb', 'casp_capri']:
        work_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(work_key)[1:3],
                         work_key + ext) for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(work_key)[1:3].upper(),
                         work_key + ext) for work_key in work_keys
        ]
    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))
    # Reserve an equally-sized portion of the full workload for a given rank in the MPI world
    work_filename_rank_batches = slice_list(work_filenames, size)
    work_filenames = work_filename_rank_batches[rank]
    logging.info(
        "{:} requested keys, {:} produced keys, {:} work filenames".format(
            len(requested_keys), len(produced_keys), len(work_filenames)))
    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
def map_all_protrusion_indices(psaia_dir, psaia_config_file, pdb_dataset,
                               pkl_dataset, pruned_dataset, output_dir,
                               source_type):
    """Compute protrusion indices (via PSAIA) for all provided structures."""
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(
                pruned_dataset, extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:  # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(
                pkl_dataset, extension=ext)
        ]
    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
           (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri', 'input'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    requested_pdb_codes = [db.get_pdb_code(x) for x in requested_filenames]
    produced_filenames_path = os.path.join(output_dir, 'PSAIA',
                                           source_type.upper())
    produced_filenames = [
        path.as_posix()
        for path in Path(produced_filenames_path).rglob('*.tbl')
    ]
    produced_keys = [db.get_pdb_code(x) for x in produced_filenames]
    work_keys = [
        key for key, pdb_code in zip(requested_keys, requested_pdb_codes)
        if pdb_code not in produced_keys
    ]
    format_pdb_code_for_inputs = lambda pdb_code, source_type: pdb_code[1:3] \
        if source_type.lower() in ['input'] \
        else pdb_code.upper()
    if source_type.lower() in ['rcsb', 'casp_capri']:
        work_filenames = [
            os.path.join(pdb_dataset, db.get_pdb_code(work_key)[1:3],
                         work_key) for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(
                pdb_dataset,
                format_pdb_code_for_inputs(db.get_pdb_code(work_key),
                                           source_type), work_key)
            for work_key in work_keys
        ]
    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))
    # Log how many PDB files still need to be processed
    logging.info("{:} PDB files to process with PSAIA".format(
        len(work_filenames)))
    # Write a comprehensive filename list for PSAIA to process
    # single-threadedly for the requested features (e.g. protrusion)
    file_list_file = os.path.join(output_dir, 'PSAIA', source_type.upper(),
                                  'pdb_list.fls')
    with open(file_list_file, 'w') as file:
        for requested_pdb_filename in work_filenames:
            file.write(f'{requested_pdb_filename}\n')
    inputs = [(psaia_dir, psaia_config_file, file_list_file)]
    par.submit_jobs(map_protrusion_indices, inputs,
                    1)  # PSAIA is inherently single-threaded in execution
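# Illustrative note: the `pdb_list.fls` manifest written above is plain text
# with one absolute PDB path per line, e.g. (hypothetical paths):
#
#   /data/raw/pdb/2g/12gs.pdb1
#   /data/raw/pdb/4h/14hr.pdb1
#
# PSAIA reads this manifest and computes the requested features (e.g.,
# protrusion indices) for every listed structure in one single-threaded run.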