def main(data_dir, target_list, fastas_dir, pssms_dir, psfms_dir,
         secstructs_dir, num_cpus, blast_path, nr_path, psipred_path):
    """Run psipred to generate the PSSMs and secondary structure predictions
    for each residue in a protein structure."""
    logger = logging.getLogger(__name__)

    with open(target_list, 'r') as f:
        targets = [t.strip() for t in f.readlines()]
    logger.info("Running psipred on {:} structures in {:}".format(
        len(targets), target_list))

    def __run(id, blast_path, nr_path, psipred_path, pssms_dir, psfms_dir,
              secstructs_dir, tmp_dir, target, fasta_file):
        logger.info("Processing target {:} ({:}/{:})".format(
            target, id, len(targets)))
        run_psipred(blast_path, nr_path, psipred_path, pssms_dir, psfms_dir,
                    secstructs_dir, tmp_dir, target, fasta_file)

    # tmp_dir was referenced below but never defined; a scratch directory
    # under data_dir is an assumption.
    tmp_dir = os.path.join(data_dir, 'tmp')

    os.makedirs(pssms_dir, exist_ok=True)
    os.makedirs(psfms_dir, exist_ok=True)
    os.makedirs(secstructs_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    inputs = []
    for i, target in enumerate(targets):
        fasta_file = os.path.join(fastas_dir, '{:}.fasta'.format(target))
        if os.path.exists(fasta_file):
            inputs.append((i + 1, blast_path, nr_path, psipred_path,
                           pssms_dir, psfms_dir, secstructs_dir, tmp_dir,
                           target, fasta_file))
        else:
            logger.warning("FASTA for {:} does not exist".format(target))
    par.submit_jobs(__run, inputs, num_cpus)
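# Every function here funnels its work through par.submit_jobs. The helper
# below is a minimal sketch of the contract these call sites appear to
# assume -- it is NOT the actual `par` module: each element of `inputs` is a
# tuple of positional arguments for one call, calls may run in parallel
# across `num_cpus` workers, and per-call return values come back as a list
# (some callers np.sum() that list to count successes).
import multiprocessing


def submit_jobs_sketch(func, inputs, num_cpus):
    """Illustrative stand-in for par.submit_jobs (assumed semantics)."""
    with multiprocessing.Pool(processes=num_cpus) as pool:
        # starmap unpacks each input tuple into func's positional arguments
        # and preserves input order in the returned list.
        return pool.starmap(func, inputs)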
def map_all_pssms(pdb_dataset, blastdb, output_dir, num_cpus):
    ext = '.pkl'
    requested_filenames = \
        db.get_structures_filenames(pdb_dataset, extension=ext)
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [
        x[0] for x in db.get_all_filenames(work_keys, pdb_dataset,
                                           extension=ext,
                                           keyer=lambda x: db.get_pdb_name(x))
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)
def parse_all(pdb_dataset, output_dir, num_cpus):
    """Parse a PDB dataset (PDB files) into pandas dataframes."""
    requested_filenames = db.get_structures_filenames(pdb_dataset)
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    work_filenames = [x[0] for x in db.get_all_filenames(work_keys,
                                                         pdb_dataset,
                                                         enforcement=2)]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys"
                 .format(len(requested_keys), len(produced_keys),
                         len(work_keys)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")

    inputs = [(key, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(parse, inputs, num_cpus)
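# parse_all, map_all_pssms, and several functions below share the same
# idempotent work-selection idiom: enumerate the requested keys, subtract
# the keys already produced in the output directory, and submit only the
# difference, so reruns resume where they left off. A hypothetical distilled
# version of that step (the helper name is illustrative, not part of the
# original code):
def select_work_keys(requested_keys, produced_keys):
    """Return requested keys not yet produced, preserving request order."""
    # A set avoids the quadratic membership tests of the inline list version.
    produced = set(produced_keys)
    return [key for key in requested_keys if key not in produced]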
def main(data_dir, target_list, labels_dir, struct_format, num_cpus,
         overwrite, tmscore_exe):
    """Compute RMSD, TM-score, GDT-TS, and GDT-HA of decoy structures."""
    logger = logging.getLogger(__name__)
    logger.info("Computing rmsd, tm-score, gdt-ts, gdt-ha of decoys "
                "in {:}".format(data_dir))
    os.makedirs(labels_dir, exist_ok=True)

    with open(target_list, 'r') as f:
        requested_filenames = \
            [os.path.join(labels_dir, '{:}.dat'.format(x.strip())) for x in f]
    logger.info("{:} requested keys".format(len(requested_filenames)))

    produced_filenames = []
    if not overwrite:
        produced_filenames = [f for f in fi.find_files(labels_dir, 'dat')
                              if 'targets' not in f]
    logger.info("{:} produced keys".format(len(produced_filenames)))

    inputs = []
    for filename in requested_filenames:
        if filename in produced_filenames:
            continue
        target_name = util.get_target_name(filename)
        target_dir = os.path.join(data_dir, target_name)
        inputs.append((tmscore_exe, filename, target_name, target_dir,
                       struct_format))
    logger.info("{:} work keys".format(len(inputs)))
    par.submit_jobs(run_tmscore_per_target, inputs, num_cpus)
def main(pair_dir, tfrecord_dir, num_cpus):
    """Run write_pairs on all provided complexes."""
    requested_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = \
        db.get_structures_filenames(tfrecord_dir, extension='.tfrecord')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    work_filenames = [
        x[0] for x in db.get_all_filenames(work_keys, pair_dir,
                                           extension='.dill')
    ]

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = tfrecord_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".tfrecord")

    inputs = [(i, o) for i, o in zip(work_filenames, output_filenames)]
    par.submit_jobs(pairs_to_tfrecord, inputs, num_cpus)
def shard_envs(input_path, output_path, num_threads=8, subsample=True):
    input_sharded = sh.Sharded.load(input_path)
    keys = input_sharded.get_keys()
    if keys != ['ensemble']:
        raise RuntimeError('Can only apply to sharded by ensemble.')
    output_sharded = sh.Sharded(output_path, keys)
    input_num_shards = input_sharded.get_num_shards()

    tmp_path = output_sharded.get_prefix() + f'_tmp@{input_num_shards:}'
    tmp_sharded = sh.Sharded(tmp_path, keys)

    # Collect shard indices that have not yet been written, so reruns can
    # resume where they left off.
    not_written = []
    for i in range(input_num_shards):
        shard = output_sharded._get_shard(i)
        if not os.path.exists(shard):
            not_written.append(i)

    print(f'Using {num_threads:} threads')
    # Assumes the intent was to process only the pending shards; the
    # original hardcoded range(8) and never used not_written.
    inputs = [(input_sharded, tmp_sharded, shard_num, subsample)
              for shard_num in not_written]
    par.submit_jobs(_shard_envs, inputs, num_threads)
    sho.reshard(tmp_sharded, output_sharded)
    tmp_sharded.delete_files()
def save_graphs(sharded, out_dir, num_threads=8):
    num_shards = sharded.get_num_shards()
    inputs = [(sharded, shard_num, out_dir)
              for shard_num in range(num_shards)]
    par.submit_jobs(_save_graphs, inputs, num_threads)
    _rename(out_dir)
def bsa_db(sharded_path, output_bsa, num_threads):
    sharded = sh.Sharded.load(sharded_path)
    num_shards = sharded.get_num_shards()

    dirname = os.path.dirname(output_bsa)
    if dirname != '':
        os.makedirs(dirname, exist_ok=True)

    inputs = [(sharded, x, output_bsa) for x in range(num_shards)]
    logger.info(f'{num_shards:} shards to do.')
    logger.info(f'Using {num_threads:} threads')
    par.submit_jobs(_bsa_db, inputs, num_threads)
def all_complex_to_pairs(complexes, get_pairs, output_dir, num_cpus):
    """Reads in structures and produces appropriate pairings."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    requested_keys = complexes['data'].keys()
    produced_keys = complexes_from_pair_dir(output_dir)
    work_keys = [key for key in requested_keys if key not in produced_keys]
    inputs = [(complexes['data'][key], get_pairs, output_dir)
              for key in work_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    par.submit_jobs(complex_to_pairs, inputs, num_cpus)
def main(pair_dir, to_keep_dir, output_dir, num_cpus):
    """Run write_pairs on all provided complexes."""
    to_keep_filenames = \
        db.get_structures_filenames(to_keep_dir, extension='.txt')
    if len(to_keep_filenames) == 0:
        logging.warning(
            "There is no to_keep file in {:}. All pair files from {:} "
            "will be copied into {:}".format(to_keep_dir, pair_dir,
                                             output_dir))
    to_keep_df = __load_to_keep_files_into_dataframe(to_keep_filenames)
    logging.info("to_keep_df has shape (rows, cols) = {:}".format(
        to_keep_df.shape))

    logging.info("Looking for all pairs in {:}".format(pair_dir))
    work_filenames = \
        db.get_structures_filenames(pair_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys), pair_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".dill")

    inputs = [(i, o, to_keep_df)
              for i, o in zip(work_filenames, output_filenames)]
    ncopied = 0
    ncopied += np.sum(par.submit_jobs(process_pairs_to_keep, inputs,
                                      num_cpus))
    logging.info("{:} out of {:} pairs were copied".format(
        ncopied, len(work_keys)))
def generate_all_clean_complexes(args):
    """Clean all complexes in input_dir, writing them out to output_dir."""
    requested_keys = get_complex_pdb_codes(args.pdb_dataset)
    produced_filenames = db.get_structures_filenames(args.output_dir)
    produced_keys = []
    for pdb_code in requested_keys:
        res = get_files_for_complex(pdb_code, produced_filenames, 'db5')
        if len([x for x in res if x is None]) == 0:
            produced_keys.append(pdb_code)
    work_keys = [key for key in requested_keys if key not in produced_keys]
    logging.info("{:} requested keys, {:} produced keys, {:} work keys".format(
        len(requested_keys), len(produced_keys), len(work_keys)))
    inputs = [(pc, args.pdb_dataset, args.output_dir + '/' + pc, args.style)
              for pc in work_keys]
    par.submit_jobs(_generate_clean_complex, inputs, args.c)
def shard_pairs(input_path, output_path, cutoff, cutoff_type, num_threads):
    input_sharded = sh.Sharded.load(input_path)
    keys = input_sharded.get_keys()
    if keys != ['ensemble']:
        raise RuntimeError('Can only apply to sharded by ensemble.')
    output_sharded = sh.Sharded(output_path, keys)
    input_num_shards = input_sharded.get_num_shards()

    tmp_path = output_sharded.get_prefix() + f'_tmp@{input_num_shards:}'
    tmp_sharded = sh.Sharded(tmp_path, keys)

    logger.info(f'Using {num_threads:} threads')
    inputs = [(input_sharded, tmp_sharded, shard_num, cutoff, cutoff_type)
              for shard_num in range(input_num_shards)]
    par.submit_jobs(_shard_pairs, inputs, num_threads)
    sho.reshard(tmp_sharded, output_sharded)
    tmp_sharded.delete_files()
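# shard_envs and shard_pairs follow the same write-then-reshard pattern:
# workers write per-shard results into a temporary Sharded dataset sized to
# match the input, sho.reshard() then repacks it into the final output, and
# the temporary files are deleted. A hypothetical invocation (paths and
# cutoff values are illustrative; Sharded paths encode the shard count
# after the '@'):
shard_pairs('pairs_by_ensemble@32', 'pairs_neighbors@32',
            cutoff=8.0, cutoff_type='CA', num_threads=8)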
def gen_labels_sharded(sharded_path, data_csv, num_threads, overwrite):
    sharded = sh.Sharded.load(sharded_path)
    num_shards = sharded.get_num_shards()

    requested_shards = list(range(num_shards))
    if not overwrite:
        produced_shards = [
            x for x in requested_shards if sharded.has(x, 'labels')
        ]
    else:
        produced_shards = []
    work_shards = set(requested_shards).difference(produced_shards)
    logger.info(f'{len(requested_shards):} requested, '
                f'{len(produced_shards):} already produced, '
                f'{len(work_shards):} left to do.')
    logger.info(f'Using {num_threads:} threads')
    inputs = [(sharded, shard_num, data_csv) for shard_num in work_shards]
    par.submit_jobs(_gen_labels_shard, inputs, num_threads)
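# gen_labels_sharded resumes at shard granularity: sharded.has(x, 'labels')
# marks a shard as produced, so only unlabeled shards are resubmitted unless
# overwrite is set. A hypothetical invocation (path and CSV are illustrative):
gen_labels_sharded('lba_split@24', 'data/affinities.csv',
                   num_threads=8, overwrite=False)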
def main(raw_pdb_dir, pruned_pairs_dir, output_dir, neighbor_def, cutoff,
         num_cpus):
    """Run postprocess_pruned_pairs on all provided complexes."""
    logging.info("Looking for all pairs in {:}".format(pruned_pairs_dir))
    work_filenames = \
        db.get_structures_filenames(pruned_pairs_dir, extension='.dill')
    work_keys = [db.get_pdb_name(x) for x in work_filenames]
    logging.info("Found {:} pairs in {:}".format(len(work_keys),
                                                 pruned_pairs_dir))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".dill")

    inputs = [(raw_pdb_dir, neighbor_def, cutoff, i, o)
              for i, o in zip(work_filenames, output_filenames)]
    n_copied = 0
    n_copied += np.sum(
        par.submit_jobs(postprocess_pruned_pairs, inputs, num_cpus))
    logging.info("{:} out of {:} pairs were copied".format(
        n_copied, len(work_keys)))
def map_all_profile_hmms(pkl_dataset, pruned_dataset, output_dir, hhsuite_db,
                         num_cpu_jobs, num_cpus_per_job, source_type,
                         num_iter, rank, size, write_file):
    ext = '.pkl'
    if write_file:
        if source_type.lower() == 'rcsb':
            # Filter out pairs that did not survive pruning previously,
            # to reduce complexity
            pruned_pdb_names = [
                db.get_pdb_name(filename) for filename in
                db.get_structures_filenames(pruned_dataset, extension='.dill')
            ]
            requested_filenames = [
                os.path.join(pkl_dataset,
                             db.get_pdb_code(pruned_pdb_name)[1:3],
                             pruned_pdb_name.split('_')[0] + ext)
                for pruned_pdb_name in pruned_pdb_names
            ]
        else:
            # DB5 does not employ pair pruning, so there are no pairs to filter
            requested_filenames = [
                filename for filename in
                db.get_structures_filenames(pkl_dataset, extension=ext)
            ]

        # Filter DB5 filenames to unbound type and get all work filenames
        requested_filenames = [
            filename for filename in requested_filenames
            if (source_type.lower() == 'db5' and '_u_' in filename) or
            (source_type.lower() in
             ['rcsb', 'evcoupling', 'casp_capri', 'input'])
        ]
        requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
        produced_filenames = db.get_structures_filenames(output_dir,
                                                         extension='.pkl')
        produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
        work_keys = [key for key in requested_keys
                     if key not in produced_keys]

        # CASP-CAPRI codes are stored lowercase; all others uppercase.
        establish_pdb_code_case = lambda pdb_code, source_type: \
            pdb_code.lower() if source_type.lower() == 'casp_capri' \
            else pdb_code.upper()
        work_filenames = [
            os.path.join(
                pkl_dataset,
                establish_pdb_code_case(db.get_pdb_code(work_key),
                                        source_type)[1:3],
                work_key + ext) for work_key in work_keys
        ]

        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))
        logging.info(
            "{:} requested keys, {:} produced keys, {:} work filenames".format(
                len(requested_keys), len(produced_keys), len(work_filenames)))

        if source_type.lower() == 'input':
            # Directly generate profile HMM features after aggregating
            # input filenames
            logging.info("{:} work filenames".format(len(work_filenames)))
            output_filenames = []
            for pdb_filename in work_filenames:
                sub_dir = output_dir + '/' + \
                    db.get_pdb_code(pdb_filename)[1:3]
                if not os.path.exists(sub_dir):
                    os.makedirs(sub_dir, exist_ok=True)
                output_filenames.append(
                    sub_dir + '/' + db.get_pdb_name(pdb_filename) + '.pkl')
            inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                       num_iter)
                      for key, output in zip(work_filenames,
                                             output_filenames)]
            par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
        else:
            # Write out a local file containing all work filenames
            temp_df = pd.DataFrame({'filename': work_filenames})
            temp_df.to_csv(f'{source_type}_work_filenames.csv')
            logging.info('File containing work filenames written to storage. '
                         'Exiting...')
    else:
        # Read from the previously-created work filenames CSV
        work_filenames = pd.read_csv(
            f'{source_type}_work_filenames.csv').iloc[:, 1].to_list()
        # Remove any duplicate filenames
        work_filenames = list(set(work_filenames))

        # Reserve an equally-sized portion of the full workload for a given
        # rank in the MPI world
        work_filename_rank_batches = slice_list(work_filenames, size)
        work_filenames = work_filename_rank_batches[rank]
        logging.info("{:} work filenames".format(len(work_filenames)))

        output_filenames = []
        for pdb_filename in work_filenames:
            sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
            if not os.path.exists(sub_dir):
                os.makedirs(sub_dir, exist_ok=True)
            output_filenames.append(
                sub_dir + '/' + db.get_pdb_name(pdb_filename) + '.pkl')
        inputs = [(num_cpus_per_job, key, output, hhsuite_db, source_type,
                   num_iter)
                  for key, output in zip(work_filenames, output_filenames)]
        par.submit_jobs(map_profile_hmms, inputs, num_cpu_jobs)
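# map_all_profile_hmms (and map_all_pssms below) splits its workload across
# an MPI world via slice_list(work_filenames, size)[rank]. A minimal sketch
# of the assumed contract -- not the original implementation: partition a
# list into `size` contiguous, near-equal batches so every rank indexes
# exactly one.
def slice_list_sketch(items, num_batches):
    """Split items into num_batches contiguous, near-equal chunks."""
    quotient, remainder = divmod(len(items), num_batches)
    batches = []
    start = 0
    for i in range(num_batches):
        # The first `remainder` batches absorb one extra item each.
        end = start + quotient + (1 if i < remainder else 0)
        batches.append(items[start:end])
        start = end
    return batches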
def regenerate_edge(edge, filelist, root, dryrun, job_server):
    def fileListSize(files):
        size = 0
        for f in files:
            stat = os.stat(f)
            size += stat.st_size
        return size

    if not edge.check_sources(filelist, root):
        return
    tgtdir = '%(root)s/%(lang)s/%(to_what)s' % {
        'root': root,
        'lang': edge.target['lang'],
        'to_what': edge.target['tag']
    }
    # printout
    msg = "# Doing %s" % str(edge)
    msg += ' (%s %s)' % ('.'.join([str(i) for i in time.localtime()[:3]]),
                         ':'.join([str(i) for i in time.localtime()[3:6]]))

    if dryrun:
        print(msg)
        if not os.path.exists(tgtdir):
            print('mkdir %s 2>/dev/null' % tgtdir)
        if hasattr(edge, 'collector') and edge.collector:
            print('# Collecting before running giza')
        else:
            # regenerating files one by one
            for f in filelist:
                regenerate_file(filename=f, edge=edge, root=root,
                                dryrun=dryrun)
    else:
        logg(msg)
        # Starting daemon if needed
        if edge.daemon and not job_server:
            start_daemon(edge)
        if not os.path.exists(tgtdir):
            os.mkdir(tgtdir)
        # checking if edge is in "collector" mode
        if hasattr(edge, 'collector') and edge.collector:
            dirs_to_clear = [tgtdir]
            # collecting all the data first
            sourceDirs = edge.sourceDirs(root)
            for i, src in enumerate(edge.sources):
                dirs_to_clear.append(sourceDirs[i])
                collecting.collect(sourceDirs[i],
                                   suffix='.%s.%s' % (src['lang'],
                                                      src['tag']))
            # run the command on the collected data
            regenerate_file(filename=collecting.DATA, edge=edge, root=root,
                            dryrun=dryrun)
            # separating file
            src0dir = sourceDirs[0]
            src0_catalog = '%s/%s.%s.%s' % (src0dir, collecting.CATALOG,
                                            edge.sources[0]['lang'],
                                            edge.sources[0]['tag'])
            suffix = '.%s.%s' % (edge.target['lang'], edge.target['tag'])
            collecting.separate(tgtdir, suffix=suffix, catalog=src0_catalog)
            collecting.clear_dirs(dirs_to_clear)
        else:
            # regenerating files one by one
            if not job_server:
                sourceFiles0 = edge.sourceFiles(root, filelist)[0]
                targetSize = fileListSize(sourceFiles0)
                actualSize = 0
                for sourceFile, filename in zip(sourceFiles0, filelist):
                    regenerate_file(filename=filename, edge=edge, root=root,
                                    dryrun=dryrun)
                    actualSize += os.stat(sourceFile).st_size
                    try:
                        percent = float(actualSize) / float(targetSize) * 100.0
                    except ZeroDivisionError:
                        sys.stdout.write('Nothing to be done. '
                                         'Only empty files.')
                        break
                    sys.stdout.write('%.2f%% ' % percent)
                    sys.stdout.flush()
                sys.stdout.write('\n')
            else:
                jobs = []
                sourceFiles0 = edge.sourceFiles(root, filelist)[0]
                for sourceFile, filename in zip(sourceFiles0, filelist):
                    regenerate_file(filename=filename, edge=edge, root=root,
                                    dryrun=dryrun, job_server=job_server,
                                    jobs=jobs)
                if not edge.daemon:
                    for i, job in enumerate(jobs):
                        job()
                        percent = float(i) / float(len(jobs)) * 100.0
                        sys.stdout.write('%.2f%% ' % percent)
                        sys.stdout.flush()
                    sys.stdout.write('\n')
                else:
                    submit_jobs(job_server, edge.command, 6, edge.port, jobs)
def map_all_protrusion_indices(psaia_dir, psaia_config_file, pdb_dataset,
                               pkl_dataset, pruned_dataset, output_dir,
                               source_type):
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously,
        # to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename) for filename in
            db.get_structures_filenames(pruned_dataset, extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:
        # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in
            db.get_structures_filenames(pkl_dataset, extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri', 'input'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    requested_pdb_codes = [db.get_pdb_code(x) for x in requested_filenames]

    produced_filenames_path = os.path.join(output_dir, 'PSAIA',
                                           source_type.upper())
    produced_filenames = [
        path.as_posix()
        for path in Path(produced_filenames_path).rglob('*.tbl')
    ]
    produced_keys = [db.get_pdb_code(x) for x in produced_filenames]
    work_keys = [
        key for key, pdb_code in zip(requested_keys, requested_pdb_codes)
        if pdb_code not in produced_keys
    ]

    format_pdb_code_for_inputs = lambda pdb_code, source_type: \
        pdb_code[1:3] if source_type.lower() in ['input'] \
        else pdb_code.upper()
    if source_type.lower() in ['rcsb', 'casp_capri']:
        work_filenames = [
            os.path.join(pdb_dataset, db.get_pdb_code(work_key)[1:3],
                         work_key) for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(
                pdb_dataset,
                format_pdb_code_for_inputs(db.get_pdb_code(work_key),
                                           source_type), work_key)
            for work_key in work_keys
        ]

    # Remove any duplicate filenames
    work_filenames = list(set(work_filenames))
    logging.info("{:} PDB files to process with PSAIA".format(
        len(work_filenames)))

    # Create a comprehensive filename list for PSAIA to process
    # single-threadedly for the requested features (e.g. protrusion)
    file_list_file = os.path.join(output_dir, 'PSAIA', source_type.upper(),
                                  'pdb_list.fls')
    with open(file_list_file, 'w') as file:
        for work_pdb_filename in work_filenames:
            file.write(f'{work_pdb_filename}\n')

    # PSAIA is inherently single-threaded in execution
    inputs = [(psaia_dir, psaia_config_file, file_list_file)]
    par.submit_jobs(map_protrusion_indices, inputs, 1)
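# The .fls work list handed to PSAIA above is, per the write loop, a plain
# text file with one PDB path per line, e.g. (paths illustrative):
#
#   /data/RCSB/ga/2gaf.pdb1
#   /data/RCSB/he/4heq.pdb1
#
# PSAIA then walks this list sequentially, which is why the job is submitted
# with a parallelism of 1.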
def map_all_pssms(pkl_dataset, pruned_dataset, blastdb, output_dir, num_cpus,
                  source_type, rank, size):
    ext = '.pkl'
    if source_type.lower() == 'rcsb':
        # Filter out pairs that did not survive pruning previously,
        # to reduce complexity
        pruned_pdb_names = [
            db.get_pdb_name(filename) for filename in
            db.get_structures_filenames(pruned_dataset, extension='.dill')
        ]
        requested_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(pruned_pdb_name)[1:3],
                         pruned_pdb_name.split('_')[0] + ext)
            for pruned_pdb_name in pruned_pdb_names
        ]
    else:
        # DB5 does not employ pair pruning, so there are no pairs to filter
        requested_filenames = [
            filename for filename in
            db.get_structures_filenames(pkl_dataset, extension=ext)
        ]

    # Filter DB5 filenames to unbound type and get all work filenames
    requested_filenames = [
        filename for filename in requested_filenames
        if (source_type.lower() == 'db5' and '_u_' in filename) or
        (source_type.lower() in ['rcsb', 'evcoupling', 'casp_capri'])
    ]
    requested_keys = [db.get_pdb_name(x) for x in requested_filenames]
    produced_filenames = db.get_structures_filenames(output_dir,
                                                     extension='.pkl')
    produced_keys = [db.get_pdb_name(x) for x in produced_filenames]
    work_keys = [key for key in requested_keys if key not in produced_keys]

    if source_type.lower() in ['rcsb', 'casp_capri']:
        work_filenames = [
            os.path.join(pkl_dataset, db.get_pdb_code(work_key)[1:3],
                         work_key + ext) for work_key in work_keys
        ]
    else:
        work_filenames = [
            os.path.join(pkl_dataset,
                         db.get_pdb_code(work_key)[1:3].upper(),
                         work_key + ext) for work_key in work_keys
        ]

    # Remove any duplicate filenames, then reserve an equally-sized portion
    # of the full workload for a given rank in the MPI world
    work_filenames = list(set(work_filenames))
    work_filename_rank_batches = slice_list(work_filenames, size)
    work_filenames = work_filename_rank_batches[rank]
    logging.info(
        "{:} requested keys, {:} produced keys, {:} work filenames".format(
            len(requested_keys), len(produced_keys), len(work_filenames)))

    output_filenames = []
    for pdb_filename in work_filenames:
        sub_dir = output_dir + '/' + db.get_pdb_code(pdb_filename)[1:3]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir, exist_ok=True)
        output_filenames.append(
            sub_dir + '/' + db.get_pdb_name(pdb_filename) + ".pkl")
    inputs = [(key, blastdb, output)
              for key, output in zip(work_filenames, output_filenames)]
    par.submit_jobs(map_pssms, inputs, num_cpus)