# Shared imports for the excerpted functions below. Project-local helpers
# (e.g. db, par, struct, sequ, pa, Complex, expected, and the worker functions
# submitted to par.submit_jobs) are assumed to be imported from the
# surrounding package and are not reproduced here.
import logging
import os
import pickle
import re
import timeit
from pathlib import Path

import Bio.PDB
import Bio.PDB.Polypeptide as poly
import numpy as np
import pandas as pd


def main(pair_dir, to_keep_dir, output_dir, num_cpus):
    """Copy each pair file that passes the to_keep filter into output_dir."""
    to_keep_filenames = \
        db.get_structures_filenames(to_keep_dir, extension='.txt')
    if len(to_keep_filenames) == 0:
        logging.warning(
            "There is no to_keep file in {:}. All pair files from {:} "
            "will be copied into {:}".format(to_keep_dir, pair_dir,
                                             output_dir))
    to_keep_df = __load_to_keep_files_into_dataframe(to_keep_filenames)
    logging.info("There are {:} rows, cols in to_keep_df".format(
        to_keep_df.shape))

    logging.info("Looking for all pairs in {:}".format(pair_dir))
    work_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), pair_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".dill")

    inputs = [(i, o, to_keep_df)
              for i, o in zip(work_filenames, output_filenames)]
    ncopied = 0
    ncopied += np.sum(
        par.submit_jobs(process_pairs_to_keep, inputs, num_cpus))
    logging.info("{:} out of {:} pairs were copied".format(
        ncopied, len(work_keys)))
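
# NOTE: process_pairs_to_keep is referenced above but not defined in this
# excerpt. The sketch below is a hypothetical reconstruction, assuming the
# worker copies a pair file to its output path whenever the to_keep filter
# (see __should_keep below) admits it, or when no to_keep files were
# provided; the real implementation may differ.
def process_pairs_to_keep(pair_filename, output_filename, to_keep_df):
    """Copy pair_filename to output_filename if it passes the to_keep filter.

    Returns 1 if the file was copied, 0 otherwise (hypothetical sketch).
    """
    import shutil
    if not to_keep_df.empty and not __should_keep(pair_filename, to_keep_df):
        return 0
    shutil.copyfile(pair_filename, output_filename)
    return 1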
def map_all_pssms(pdb_dataset, blastdb, output_dir, num_cpus):
    ext = '.pkl'
    requested_filenames = \
        db.get_structures_filenames(pdb_dataset, extension=ext)
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [
        x[0] for x in db.get_all_filenames(
            work_keys, pdb_dataset, extension=ext,
            keyer=lambda x: db.get_pdb_name(x))
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
def parse_all(pdb_dataset, output_dir, num_cpus):
    """Parse pdb dataset (pdb files) to pandas dataframes."""
    requested_filenames = db.get_structures_filenames(pdb_dataset)
    produced_filenames = db.get_structures_filenames(
        output_dir, extension='.pkl')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [x[0] for x in
                      db.get_all_filenames(work_keys, pdb_dataset,
                                           enforcement=2)]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys"
                 .format(len(requested_keys), len(produced_keys),
                         len(work_keys)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    inputs = [(key, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(parse, inputs, num_cpus)
def main(pair_dir, tfrecord_dir, num_cpus):
    """Convert all provided pair files to TFRecords."""
    requested_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = \
        db.get_structures_filenames(tfrecord_dir, extension='.tfrecord')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    work_filenames = [
        x[0] for x in db.get_all_filenames(work_keys, pair_dir,
                                           extension='.dill')
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = tfrecord_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".tfrecord")

    inputs = [(i, o) for i, o in zip(work_filenames, output_filenames)]
    par.submit_jobs(pairs_to_tfrecord, inputs, num_cpus)
def pdb_to_fasta(pdb_filename, fasta_filename, id_filename, separate):
    """Write a pdb file as a fasta file."""
    flat_map = {}
    pdb_name = db.get_pdb_name(pdb_filename)
    structure = pd.read_pickle(pdb_filename)
    fasta_name_to_chain = {}
    for (chain, residues) in struct.get_chain_to_valid_residues(structure):
        fasta_name = pdb_name + '-' + chain[-2] + '-' + chain[-1]
        flat_map[fasta_name] = residues
        fasta_name_to_chain[fasta_name] = chain

    names = []
    filenames = []
    id_filenames = []
    if not separate:
        write_fasta(flat_map, fasta_filename, id_out=id_filename)
        filenames.append(fasta_filename)
        id_filenames.append(id_filename)
        names.append('all')
    else:
        for (name, seq) in flat_map.items():
            new_dict = {name: seq}
            filename = fasta_filename.format(name)
            filename2 = id_filename.format(name)
            write_fasta(new_dict, filename, id_out=filename2)
            names.append(fasta_name_to_chain[name])
            filenames.append(filename)
            id_filenames.append(filename2)
    return (names, filenames, id_filenames)
def __should_keep(pair_filename, to_keep_df):
    assert (not to_keep_df.empty)
    # pair_name example: 20gs.pdb1_0
    pair_name_regex = re.compile(
        r'(?P<pdb_code>\w{4})(\.pdb(?P<struct_id>\d+))*(_(?P<pair_id>\d+))')
    pair_name = db.get_pdb_name(pair_filename)
    pair_metadata = pair_name_regex.match(pair_name).groupdict()

    # The order to check is: pdb_code, struct_id, pair_id, chain
    if pair_metadata['pdb_code'] not in set(to_keep_df.pdb_code):
        return False

    # Check if we need to select based on struct_id
    slice = to_keep_df[to_keep_df.pdb_code == pair_metadata['pdb_code']]
    if 'struct_id' in slice.columns:
        if pair_metadata['struct_id'] not in set(slice.struct_id):
            return False
        slice = slice[slice.struct_id == pair_metadata['struct_id']]

    # Check if we need to select based on pair_id
    if 'pair_id' in slice.columns:
        if pair_metadata['pair_id'] not in set(slice.pair_id):
            return False
        slice = slice[slice.pair_id == pair_metadata['pair_id']]

    # Check if we need to select based on chain
    if 'chain' in slice.columns:
        pair = pa.read_pair_from_dill(pair_filename)
        pair_chains = set(pair.df0.chain) | set(pair.df1.chain)
        # Convert chain names to lowercase
        pair_chains = set([c.lower() for c in pair_chains])
        # All chains in the pair need to be in to_keep_df to be valid
        if not pair_chains.issubset(set(slice.chain)):
            return False
    return True
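
# Illustrative example of the filter above (values are hypothetical). With a
# to_keep DataFrame that carries only a pdb_code column, any pair whose
# four-letter code appears in that column is kept and the finer-grained
# struct_id/pair_id/chain checks are skipped:
#
#   to_keep_df = pd.DataFrame({'pdb_code': ['20gs', '1a2k']})
#   __should_keep('/pairs/0g/20gs.pdb1_0.dill', to_keep_df)  # -> True
#   __should_keep('/pairs/c9/3c9a.pdb1_2.dill', to_keep_df)  # -> False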
def get_missing_sidechains(pdb_dataset, output_scwrl):
    """Get residues that are missing atoms."""
    for pdb_filename in db.get_structures_filenames(pdb_dataset):
        biopy_structure = db.parse_biopython_structure(pdb_filename)
        pdb_name = db.get_pdb_name(pdb_filename)
        missing = 0
        scwrl_list = []
        logging.info("Processing {:}".format(pdb_name))
        for model in biopy_structure:
            for chain in model:
                for i, residue in enumerate(chain):
                    res_name = residue.resname
                    if res_name not in expected:
                        logging.warning("Non-standard residue found: {:}. "
                                        "Skipping.".format(res_name))
                        continue
                    res_code = poly.three_to_one(res_name)
                    res_id = residue.id[1]
                    curr_count = len(
                        Bio.PDB.Selection.unfold_entities(residue, 'A'))
                    if curr_count != expected[res_name]:
                        logging.debug(
                            "Residue {:} at position {:} (with id {:}) has "
                            "{:} instead of the expected {:} atoms."
                            .format(res_name, i, res_id, curr_count,
                                    expected[res_name]))
                        missing += 1
                        scwrl_list.append(res_code.upper())
                    else:
                        scwrl_list.append(res_code.lower())
        logging.debug("{:} residues with missing atoms in total"
                      .format(missing))
        with open(output_scwrl, 'w') as f:
            f.write("".join(scwrl_list))
def complexes_from_pair_dir(pair_dir):
    """Get all complex names from provided pair directory."""
    filenames = db.get_structures_filenames(pair_dir, extension='.dill')
    # Remove per-chain identifier.
    # TODO: This could cause issues when only some of the pairs have been
    # written.
    return ['_'.join(db.get_pdb_name(x).split('_')[:-1]) for x in filenames]
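
# Illustrative example (hypothetical filenames): pair files named like
# '10gs.pdb1_0.dill' and '10gs.pdb1_1.dill' both map to the complex name
# '10gs.pdb1' once the trailing underscore-separated identifier is stripped.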
def is_of_type(pdb_name, style, receptor=None, bound=None):
    """Check if pdb_name is of requested type."""
    pdb_name = db.get_pdb_name(pdb_name, with_type=False)
    if receptor is None:
        if bound is None:
            return True
        elif bound:
            if style == 'db5':
                return _has_symbol('b', pdb_name)
            elif style == 'dockground':
                # Dockground only has pdb code.
                return len(pdb_name) == 4
        else:
            return _has_symbol('u', pdb_name)
    if bound is None:
        if receptor:
            return _has_symbol('r', pdb_name) or _has_symbol('2', pdb_name)
        else:
            return _has_symbol('l', pdb_name) or _has_symbol('1', pdb_name)
    if receptor and bound:
        return _has_symbol('r_b', pdb_name) or _has_symbol('2_b', pdb_name)
    elif receptor and not bound:
        return _has_symbol('r_u', pdb_name) or _has_symbol('2_u', pdb_name)
    elif not receptor and bound:
        return _has_symbol('l_b', pdb_name) or _has_symbol('1_b', pdb_name)
    else:
        return _has_symbol('l_u', pdb_name) or _has_symbol('1_u', pdb_name)
    return False
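
# Illustrative examples for is_of_type (filenames are hypothetical, and we
# assume _has_symbol simply checks whether the given marker appears in the
# underscore-separated name). DB5-style names carry receptor/ligand and
# bound/unbound markers such as '_r_u', while Dockground bound entries are
# bare four-letter codes:
#
#   is_of_type('1a2k_r_u.pdb', 'db5', receptor=True, bound=False)  # -> True
#   is_of_type('1a2k_l_b.pdb', 'db5', receptor=False, bound=True)  # -> True
#   is_of_type('1a2k.pdb', 'dockground', bound=True)               # -> True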
def map_pssms(pdb_filename, blastdb, output_filename):
    pdb_name = db.get_pdb_name(pdb_filename)
    start_time = timeit.default_timer()

    start_time_blasting = timeit.default_timer()
    pis = gen_pssm(pdb_filename, blastdb, output_filename)
    num_chains = len(pis.groupby(['pdb_name', 'model', 'chain']))
    elapsed_blasting = timeit.default_timer() - start_time_blasting

    parsed = pd.read_pickle(pdb_filename)
    parsed = parsed.merge(
        pis, on=['model', 'pdb_name', 'chain', 'residue'])

    start_time_writing = timeit.default_timer()
    parsed.to_pickle(output_filename)
    elapsed_writing = timeit.default_timer() - start_time_writing

    elapsed = timeit.default_timer() - start_time
    logging.info(
        ('For {:d} pssms generated from {} spent {:05.2f} blasting, '
         '{:05.2f} writing, and {:05.2f} overall.')
        .format(num_chains, pdb_name, elapsed_blasting, elapsed_writing,
                elapsed))
def get_chain(pdb_filename):
    """Get chain from split pdb filename."""
    pdb_name = db.get_pdb_name(pdb_filename, with_type=False)
    tokens = pdb_name.split("_")
    if len(tokens) < 3:
        return 0
    else:
        return tokens[1]
def _get_rcsb_complexes(filenames):
    """Get complexes for RCSB type dataset."""
    complexes = {}
    for filename in filenames:
        name = db.get_pdb_name(filename)
        complexes[name] = Complex(name=name,
                                  bound_filenames=[filename],
                                  unbound_filenames=[])
    return complexes
def _get_casp_capri_complexes(filenames, keyer=db.get_pdb_code):
    """Get complexes for CASP-CAPRI type dataset."""
    complexes = {}
    for filename in filenames:
        name = db.get_pdb_name(filename)
        complexes[name] = Complex(name=name,
                                  bound_filenames=[filename],
                                  unbound_filenames=[])
    return complexes
def get_model(pdb_filename):
    """Get model from split pdb filename."""
    pdb_name = db.get_pdb_name(pdb_filename, with_type=False)
    tokens = pdb_name.split("_")
    if len(tokens) < 3:
        return 0
    elif not tokens[2].isdigit():
        return 0
    else:
        return int(tokens[2])
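
# Illustrative example (hypothetical filename), assuming split pdb filenames
# follow a '<name>_<chain>_<model>' pattern as implied by get_chain and
# get_model above, and that db.get_pdb_name strips the directory and
# extension:
#
#   get_chain('/split/1abc_A_2.pdb')  # -> 'A'
#   get_model('/split/1abc_A_2.pdb')  # -> 2
#   get_model('/split/1abc.pdb')      # -> 0  (fewer than three tokens)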
def main(raw_pdb_dir, pruned_pairs_dir, output_dir, neighbor_def, cutoff,
         num_cpus):
    """Run postprocess_pruned_pairs on all provided complexes."""
    logging.info("Looking for all pairs in {:}".format(pruned_pairs_dir))
    work_filenames = \
        db.get_structures_filenames(pruned_pairs_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys),
                                                 pruned_pairs_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".dill")

    inputs = [(raw_pdb_dir, neighbor_def, cutoff, i, o)
              for i, o in zip(work_filenames, output_filenames)]
    n_copied = 0
    n_copied += np.sum(
        par.submit_jobs(postprocess_pruned_pairs, inputs, num_cpus))
    logging.info("{:} out of {:} pairs were copied".format(
        n_copied, len(work_keys)))
def _get_seq_and_atoms(filename):
    """Form dictionaries mapping each chain to its sequence and atoms."""
    seqs = {}
    all_atoms = {}
    structure = struct.parse_structure(filename)
    pdb_name = db.get_pdb_name(filename)
    for (chain, residues) in \
            struct.get_chain_to_valid_residues(structure, pdb_name):
        atoms = []
        for residue in residues:
            atoms.append(np.array(residue[['x', 'y', 'z']], dtype='f4'))
        if len(residues) != 0:
            # Ignore zero-length peptides.
            seqs[chain] = residues
            all_atoms[chain] = np.array(atoms)
    return all_atoms, seqs
def map_profile_hmms(num_cpus, pkl_filename, output_filename, hhsuite_db,
                     source_type, num_iter):
    pdb_name = db.get_pdb_name(pkl_filename)
    start_time = timeit.default_timer()

    start_time_blitsing = timeit.default_timer()
    profile_hmms, num_chains = gen_profile_hmm(num_cpus, pkl_filename,
                                               output_filename, hhsuite_db,
                                               source_type, num_iter)
    elapsed_blitsing = timeit.default_timer() - start_time_blitsing

    start_time_writing = timeit.default_timer()
    profile_hmms.to_pickle(output_filename)
    elapsed_writing = timeit.default_timer() - start_time_writing

    elapsed = timeit.default_timer() - start_time
    logging.info(
        ('For {:d} profile HMMs generated from {}, spent {:05.2f} blitsing,'
         ' {:05.2f} writing, and {:05.2f} overall.').format(
            num_chains, pdb_name, elapsed_blitsing, elapsed_writing, elapsed))
def _generate_reference(pdb_filename, s2r_chain, s2r_res, output_filename,
                        style):
    """Transform PDB structure to a reference structure."""
    biopy_structure = db.parse_biopython_structure(pdb_filename)
    pdb_name = db.get_pdb_name(pdb_filename)
    new_model = Bio.PDB.Model.Model('0')
    new_structure = Bio.PDB.Structure.Structure('')
    for (chain, residues) in \
            struct.get_chain_to_valid_residues(biopy_structure, pdb_name):
        if style == 'dockground' and chain not in s2r_chain:
            # If we are in dockground, we allow ourselves to remove unmapped
            # chains.
            continue
        ref_chain = s2r_chain[chain]
        if chain in s2r_res:
            # If we have an alignment for this chain.
            new_chain = Bio.PDB.Chain.Chain(ref_chain)
            for i, residue in enumerate(residues):
                if residue.id[0] != ' ':
                    continue
                residue.segid = ""
                residue.id = (' ', s2r_res[chain][i], residue.id[2])
                new_chain.add(residue)
        else:
            # Else, just remove segment ID.
            new_chain = Bio.PDB.Chain.Chain(ref_chain)
            for i, residue in enumerate(residues):
                residue.segid = ""
                new_chain.add(residue)
        new_model.add(new_chain)
    new_structure.add(new_model)
    w = Bio.PDB.PDBIO()
    w.set_structure(new_structure)
    w.save(output_filename)
def map_all_profile_hmms(pkl_dataset, pruned_dataset, output_dir, hhsuite_db,
                         num_cpu_jobs, num_cpus_per_job, source_type,
                         num_iter, rank, size, write_file):
    ext = '.pkl'
    if write_file:
        if source_type.lower() == 'rcsb':
            # Filter out pairs that did not survive pruning previously to
            # reduce complexity
            pruned_pdb_names = [
                db.get_pdb_name(filename)
                for filename in db.get_structures_filenames(
                    pruned_dataset, extension='.dill')
            ]
            requested_filenames = [
                os.path.join(pkl_dataset,
                             db.get_pdb_code(pruned_pdb_name)[1:3],
                             pruned_pdb_name.split('_')[0] + ext)
                for pruned_pdb_name in pruned_pdb_names
            ]
        else:
            # DB5 does not employ pair pruning, so there are no pairs to filter
            requested_filenames = [
                filename for filename in db.get_structures_filenames(
                    pkl_dataset, extension=ext)
            ]

        # Filter DB5 filenames to unbound type and get all work filenames
        requested_filenames = [
            filename for filename in requested_filenames
            if (source_type.lower() == 'db5' and '_u_' in filename) or
            (source_type.lower() in
             ['rcsb', 'evcoupling', 'casp_capri', 'input'])
        ]
        requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
        produced_filenames = db.get_structures_filenames(output_dir,
                                                         extension='.pkl')
        produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
        work_keys = [key for key in requested_keys
                     if key not in produced_keys]
        establish_pdb_code_case = lambda pdb_code, source_type: \
            pdb_code.lower() if source_type.lower() == 'casp_capri' \
            else pdb_code.upper()
        work_filenames = [
            os.path.join(
                pkl_dataset,
                establish_pdb_code_case(db.get_pdb_code(work_key),
                                        source_type)[1:3],
                work_key + ext) for work_key in work_keys
        ]

        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))
        logging.info(
            "{:} requested keys, {:} produced keys, {:} work filenames".format(
                len(requested_keys), len(produced_keys), len(work_filenames)))

        if source_type.lower() == 'input':
            # Directly generate profile HMM features after aggregating input
            # filenames
            logging.info("{:} work filenames".format(len(work_filenames)))
            output_filenames = []
            for pdb_filename in work_filenames:
                sub_dir = output_dir + '/' + \
                    db.get_pdb_code(pdb_filename)[1:3]
                if not os.path.exists(sub_dir):
                    os.makedirs(sub_dir, exist_ok=True)
                output_filenames.append(
                    sub_dir + '/' + db.get_pdb_name(pdb_filename) + '.pkl')

            inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                       num_iter)
                      for key, output in zip(work_filenames,
                                             output_filenames)]
            par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
        else:
            # Write out a local file containing all work filenames
            temp_df = pd.DataFrame({'filename': work_filenames})
            temp_df.to_csv(f'{source_type}_work_filenames.csv')
            logging.info('File containing work filenames written to storage. '
                         'Exiting...')
    else:
        # Read from previously-created work filenames CSV
        work_filenames = pd.read_csv(
            f'{source_type}_work_filenames.csv').iloc[:, 1].to_list()
        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))

        # Reserve an equally-sized portion of the full work load for a given
        # rank in the MPI world
        work_filename_rank_batches = slice_list(work_filenames, size)
        work_filenames = work_filename_rank_batches[rank]
        logging.info("{:} work filenames".format(len(work_filenames)))

        output_filenames = []
        for pdb_filename in work_filenames:
            sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)
            output_filenames.append(
                sub_dir + '/' + db.get_pdb_name(pdb_filename) + '.pkl')

        inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                   num_iter)
                  for key, output in zip(work_filenames, output_filenames)]
        par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
def gen_profile_hmm(num_cpus, pkl_filename, output_filename, hhsuite_db,
                    source_type, num_iter):
    """Generate profile HMM from sequence."""
    pdb_name = db.get_pdb_name(pkl_filename)
    out_dir = os.path.dirname(output_filename)
    work_dir = os.path.join(out_dir, 'work')
    if not os.path.exists(work_dir):
        os.makedirs(work_dir, exist_ok=True)
    fasta_format = work_dir + "/{:}.fa"
    id_format = work_dir + "/{:}.cpkl"

    # Get FASTA sequence-chain representations of PDB structures
    chains, chain_fasta_filenames, id_filenames = sequ.pdb_to_fasta(
        pkl_filename, fasta_format, id_format, True)

    # Process each profile HMM for a given PDB structure or complex
    num_chains = 0
    profile_hmms = []
    for chain, chain_fasta_filename, id_filename in zip(
            chains, chain_fasta_filenames, id_filenames):
        basename = os.path.splitext(chain_fasta_filename)[0]
        profile_hmm_filename = "{}.hhm".format(basename)
        hhblits_filename = "{}.a3m".format(basename)
        if not os.path.exists(profile_hmm_filename):
            logging.info("HHblits'ing {:}".format(chain_fasta_filename))
            _hhsuite(num_cpus, chain_fasta_filename, hhblits_filename,
                     profile_hmm_filename, hhsuite_db, num_iter)
        if not os.path.exists(profile_hmm_filename):
            logging.warning("No hits for {:}".format(chain_fasta_filename))
            # Create empty file
            open(profile_hmm_filename, 'w').close()

        if os.stat(profile_hmm_filename).st_size != 0:
            with open(chain_fasta_filename, 'r') as fasta:
                with open(profile_hmm_filename, 'r') as hmm:
                    sequence = ''
                    for seq_line in fasta.readlines()[1:]:
                        sequence += " ".join(seq_line.splitlines())
                    profile_hmm = extract_hmm_profile(hmm.read(), sequence)
        else:
            logging.warning(
                "No profile HMM found for {:} (model {:}, chain {:})".format(
                    pdb_name, chain[-2], chain[-1]))
            profile_hmm = None

        pdb_name = db.get_pdb_name(pkl_filename)
        key = pdb_name + '-' + chain[-2] + '-' + chain[-1]
        pos_to_res = pickle.load(open(id_filename, 'rb'))[key]

        if profile_hmm is not None:  # Skip if profile HMM was not found
            profile_hmm = pd.DataFrame(data=profile_hmm)
            profile_hmm.insert(0, 'pdb_name', db.get_pdb_name(pkl_filename))
            profile_hmm.insert(1, 'model', chain[0])
            profile_hmm.insert(2, 'chain', chain[1])
            profile_hmm.insert(3, 'residue', pos_to_res)
            profile_hmms.append(profile_hmm)

        # Keep track of how many chains have been processed
        num_chains += 1

    # Merge related DataFrames into a single one
    profile_hmms = pd.concat(profile_hmms)
    return profile_hmms, num_chains
def map_all_protrusion_indices(psaia_dir, psaia_config_file, pdb_dataset,
                               pkl_dataset, pruned_dataset, output_dir,
                               source_type):
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce
        # complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(pruned_dataset,
                                                        extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:
        # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(
                pkl_dataset, extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri', 'input'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    requested_pdb_codes = [db.get_pdb_code(x) for x in requested_filenames]
    produced_filenames_path = os.path.join(output_dir, 'PSAIA',
                                           source_type.upper())
    produced_filenames = [
        path.as_posix()
        for path in Path(produced_filenames_path).rglob('*.tbl')
    ]
    produced_keys = [db.get_pdb_code(x) for x in produced_filenames]
    work_keys = [
        key for key, pdb_code in zip(requested_keys, requested_pdb_codes)
        if pdb_code not in produced_keys
    ]

    format_pdb_code_for_inputs = lambda pdb_code, source_type: \
        pdb_code[1:3] if source_type.lower() in ['input'] \
        else pdb_code.upper()
    if source_type.lower() == 'rcsb' or source_type.lower() == 'casp_capri':
        work_filenames = [
            os.path.join(pdb_dataset, db.get_pdb_code(work_key)[1:3],
                         work_key) for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(
                pdb_dataset,
                format_pdb_code_for_inputs(db.get_pdb_code(work_key),
                                           source_type), work_key)
            for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))

    # Report how many PDB files still need to be processed
    logging.info("{:} PDB files to process with PSAIA".format(
        len(work_filenames)))

    # Create a comprehensive filename list for PSAIA to process
    # single-threadedly for the requested features (e.g. protrusion)
    file_list_file = os.path.join(output_dir, 'PSAIA', source_type.upper(),
                                  'pdb_list.fls')
    with open(file_list_file, 'w') as file:
        for requested_pdb_filename in work_filenames:
            file.write(f'{requested_pdb_filename}\n')

    inputs = [(psaia_dir, psaia_config_file, file_list_file)]
    # PSAIA is inherently single-threaded in execution
    par.submit_jobs(map_protrusion_indices, inputs, 1)
def parse_structure(structure_filename, concoord=False, one_model=False):
    """Parse a file into chain,model-to-residue mapping."""
    _, ext = os.path.splitext(structure_filename)
    detailed = ext == '.pkl'
    if detailed:
        # If detailed, we are reading a pandas pickle file outputted by
        # protprep.
        df = pd.read_pickle(structure_filename)
        # Set model to 0, because a multi-model file was either already split
        # into separate files (using the split command) or was pared down to a
        # single model by the autodock portion of the protprep pipeline.
        # This might need to be revisited if/when autodock is removed from
        # pipeline or we decide to actually keep track of correct model.
        df['model'] = get_model(structure_filename)
        # Remove hydrogens, for now, to maintain compatibility.
        df = df[df['maestro_atom_name'].apply(lambda x: x.strip()[0]) != 'H']
    else:
        # BioPython.PDB Structure extracted from PDB file.
        biopy_structure = db.parse_biopython_structure(structure_filename)
        pdb_name = db.get_pdb_name(structure_filename)
        if concoord:
            # Need to set model number to be correct (drawn from filename).
            # TODO: I (Raphael) moved this out of core Structure code, need to
            # make sure it is correct still for CONCOORD.
            # Re-parse so the freshly created Structure can be filled from it.
            parsed_structure = db.parse_biopython_structure(
                structure_filename)
            biopy_structure = \
                Bio.PDB.Structure.Structure(parsed_structure.id)
            chainmodel = pdb_name.split('_')[1]
            model_id = str(int(re.split(r'(\d+)', chainmodel)[1]) + 1)
            for model_obj in parsed_structure:
                new_model = Bio.PDB.Model.Model(model_id)
                for chain in model_obj:
                    new_model.add(chain)
                biopy_structure.add(new_model)
        if one_model:
            new_structure = Bio.PDB.Structure.Structure(biopy_structure.id)
            new_structure.add(biopy_structure[0])
            biopy_structure = new_structure

        atoms = []
        for residue in Bio.PDB.Selection.unfold_entities(biopy_structure,
                                                         'R'):
            # Prune out things that aren't actually residue atoms.
            if 'CA' in residue and residue.get_id()[0] == ' ':
                for atom in residue:
                    atoms.append(atom)
        df = pd.DataFrame(
            [(pdb_name,
              str(atom.get_parent().get_parent().get_parent().serial_num),
              atom.get_parent().get_full_id()[2],
              str(atom.get_parent().get_id()[1]) +
              atom.get_parent().get_id()[2],
              atom.get_parent().get_resname(),
              atom.get_coord()[0], atom.get_coord()[1], atom.get_coord()[2],
              atom.get_id()[0], atom.get_name(), str(atom.serial_number))
             for atom in atoms],
            columns=[
                'pdb_name', 'model', 'chain', 'residue', 'resname',
                'x', 'y', 'z', 'element', 'atom_name', 'aid'
            ])
    return df
def gen_pssm(pdb_filename, blastdb, output_filename):
    """Generate PSSM and PSFM from sequence."""
    pdb_name = db.get_pdb_name(pdb_filename)
    out_dir = os.path.dirname(output_filename)
    work_dir = os.path.join(out_dir, 'work')
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
    fasta_format = work_dir + "/{:}.fa"
    id_format = work_dir + "/{:}.cpkl"
    chains, chain_fasta_filenames, id_filenames = sequ.pdb_to_fasta(
        pdb_filename, fasta_format, id_format, True)

    pssms = []
    for chain, chain_fasta_filename, id_filename in \
            zip(chains, chain_fasta_filenames, id_filenames):
        basename = os.path.splitext(chain_fasta_filename)[0]
        pssm_filename = "{}.pssm".format(basename)
        blast_filename = "{}.blast".format(basename)
        clustal_filename = "{}.clustal".format(basename)
        al2co_filename = "{}.al2co".format(basename)
        if not os.path.exists(pssm_filename):
            logging.info("Blasting {:}".format(chain_fasta_filename))
            _blast(chain_fasta_filename, pssm_filename, blast_filename,
                   blastdb)
        if not os.path.exists(pssm_filename):
            logging.warning("No hits for {:}".format(chain_fasta_filename))
            # Create empty file.
            open(pssm_filename, 'w').close()
        if not os.path.exists(clustal_filename):
            logging.info("Converting {:}".format(blast_filename))
            _to_clustal(blast_filename, clustal_filename)
        if not os.path.exists(al2co_filename):
            logging.info("Al2co {:}".format(al2co_filename))
            _al2co(clustal_filename, al2co_filename)

        if os.stat(pssm_filename).st_size != 0:
            pssm = pd.read_csv(
                pssm_filename, skiprows=2, skipfooter=6,
                delim_whitespace=True, engine='python',
                usecols=range(20), index_col=[0, 1])
            pssm = pssm.reset_index()
            del pssm['level_0']
            pssm.rename(columns={'level_1': 'orig'}, inplace=True)

            pscm = pd.read_csv(
                pssm_filename, skiprows=2, skipfooter=6,
                delim_whitespace=True, engine='python',
                usecols=range(20, 40), index_col=[0, 1])
            psfm = pscm.applymap(lambda x: x / 100.)
            psfm = psfm.reset_index()
            del psfm['level_0']
            psfm.columns = pssm.columns
            del psfm['orig']
            del pssm['orig']

            # Combine both into one.
            psfm = psfm.add_prefix('psfm_')
            pssm = pssm.add_prefix('pssm_')
            al2co = pd.read_csv(al2co_filename, delim_whitespace=True,
                                usecols=[2], names=['al2co'])
            pssm = pd.concat([pssm, psfm, al2co], axis=1)
        else:
            logging.warning(
                "No pssm found for {:} (model {:}, chain {:})".format(
                    pdb_name, chain[-2], chain[-1]))
            pssm, psfm = None, None

        pdb_name = db.get_pdb_name(pdb_filename)
        key = pdb_name + '-' + chain[-2] + '-' + chain[-1]
        pos_to_res = pickle.load(open(id_filename, 'rb'))[key]

        if pssm is not None:  # Skip chains for which no PSSM was generated.
            pssm['pdb_name'] = db.get_pdb_name(pdb_filename)
            pssm['model'] = chain[0]
            pssm['chain'] = chain[1]
            pssm['residue'] = pos_to_res
            pssms.append(pssm)

    pssms = pd.concat(pssms)
    return pssms
def map_all_pssms(pkl_dataset, pruned_dataset, blastdb, output_dir, num_cpus,
                  source_type, rank, size):
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously to reduce
        # complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename)
            for filename in db.get_structures_filenames(pruned_dataset,
                                                        extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:
        # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in db.get_structures_filenames(
                pkl_dataset, extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    if source_type.lower() == 'rcsb' or source_type.lower() == 'casp_capri':
        work_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(work_key)[1:3],
                         work_key + ext) for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(work_key)[1:3].upper(),
                         work_key + ext) for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))

    # Reserve an equally-sized portion of the full work load for a given rank
    # in the MPI world
    work_filename_rank_batches = slice_list(work_filenames, size)
    work_filenames = work_filename_rank_batches[rank]

    logging.info(
        "{:} requested keys, {:} produced keys, {:} work filenames".format(
            len(requested_keys), len(produced_keys), len(work_filenames)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)