# --- collector.py: scrape global quality scores from ProQ3D output ---
import os

import pandas as pd

import natural_sort


def main():
    # Gather every .pdb file in the current directory, in natural order
    complex_list = []
    for file in os.listdir():
        if file.endswith(".pdb"):
            complex_list.append(file)
    complex_list_sorted = natural_sort.main(complex_list)
    df_pQ3sc = pd.DataFrame(
        columns=['name', 'ProQ2D', 'ProQRosCenD', 'ProQRosFAD', 'ProQ3D'])
    # The second line of each .proq3.global file holds the four scores
    for i, cmplx in enumerate(complex_list_sorted):
        name = cmplx
        with open(name + '.proq3.global', 'r') as pq_sc:
            lines = pq_sc.readlines()
            scores = lines[1].split()
            df_pQ3sc.loc[i] = [
                name, scores[0], scores[1], scores[2], scores[3]
            ]
    return df_pQ3sc
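# natural_sort.main is a local helper used throughout this pipeline; its
# implementation is not shown in this source. As a sketch only, a natural
# (human-order) sort, so that complex_2.pdb precedes complex_10.pdb, usually
# looks like this (hypothetical reimplementation, not the author's module):
import re


def natural_sort_sketch(items):
    def key(s):
        # Split into digit / non-digit runs; compare digit runs numerically
        return [int(tok) if tok.isdigit() else tok.lower()
                for tok in re.split(r'(\d+)', s)]
    return sorted(items, key=key)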
# --- drewdock_biopandas.py: the main docking and scoring pipeline ---
import os
import re
import subprocess
import timeit
from pathlib import Path

import pandas as pd
import progressbar

import antibody_info_and_dock_prep
import bfactor_to_blocking_biopandas
import biopandas_cleanup
import collector
import decoygen
import natural_sort
import pandas_anarci_numberer
import zrank_processing


def main(structure, sab_dir, exec, ProQ3_dir, decoynum):
    # The .theano cache directory grows very large over time when running
    # ProQ3D, so clear it before each run
    subprocess.run(['rm', '-rf', '/home/drewaight/.theano'])
    start_time = timeit.default_timer()
    # Working directory where all the results are stored
    topdir = os.getcwd()
    scheme = 'chothia'
    # Make a folder and copy the target structure into it
    subprocess.run(['mkdir', Path(structure).stem])
    subprocess.run(['cp', sab_dir + structure, Path(structure).stem])
    os.chdir(Path(structure).stem)
    # The biopandas_cleanup module selects the best vL/vH/epitope complex
    # from a structure, using B-factors as the criterion. The best complex
    # is relabelled 'L', 'H', 'A', the antibody chains are renumbered by
    # ANARCI with the input scheme, and dataframes with the heavy- and
    # light-chain stats are returned. See biopandas_cleanup.py for details.
    struct_clean = Path(structure).stem + '_1.pdb'
    h_stats, l_stats = biopandas_cleanup.main(structure, scheme, struct_clean)
    # Classify the light chain as kappa or lambda from the ANARCI v_gene
    # stats (the SAbDab-style IgG kappa/lambda definition is pulled here)
    if re.search(r"IGK", str(l_stats['v_gene'])):
        light_chain = 'kappa'
    elif re.search(r"IGL", str(l_stats['v_gene'])):
        light_chain = 'lambda'
    else:
        light_chain = 'undefined'
    cwd = os.getcwd()
    # antibody_info_and_dock_prep.py calls structure_extract_relax.py, which
    # uses PyRosetta to run packing and minimization on the input structure
    # and extracts the relevant antibody CDR statistics and Dunbrack
    # clusters into a returned dataframe. The Aho-numbered CDRs + 3 stem
    # residues are then set to 1.0 in the B-factor column and all other
    # residues to 0.0. antibody_info_and_dock_prep then separates the LH and
    # A molecules, randomizes their orientation, and slides them back
    # together in preparation for docking.
    abdf = antibody_info_and_dock_prep.main(cwd, struct_clean, light_chain)
    h_stats.to_json('h_stats.json')
    l_stats.to_json('l_stats.json')
    abdf.to_json('abdf.json')
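    # structure_extract_relax.py is not included in this source; purely as
    # an illustration of the B-factor masking described above, a biopandas
    # version could look like the inline sketch below (hypothetical, never
    # called; it ignores chain ids for brevity, and cdr_residue_numbers
    # stands in for the Aho CDR + stem residue ids):
    def _mask_cdrs_sketch(pdb_in, pdb_out, cdr_residue_numbers):
        from biopandas.pdb import PandasPdb
        ppdb = PandasPdb().read_pdb(pdb_in)
        atoms = ppdb.df['ATOM']
        # CDR + stem residues get b_factor 1.0, everything else 0.0
        in_cdr = atoms['residue_number'].isin(cdr_residue_numbers)
        atoms['b_factor'] = in_cdr.astype(float)
        ppdb.to_pdb(path=pdb_out, records=['ATOM'])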
    # At this point we fork the two prepared structures: the broad CDR
    # definition above, and a targeted definition using Parapred. The
    # randomized LH structure is copied with the ppd extension to denote
    # Parapred scoring
    test_str = Path(struct_clean).stem + '_rand_LH.pdb'
    test2_str = Path(struct_clean).stem + '_ppd_rand_LH.pdb'
    subprocess.run(['cp', test_str, test2_str])
    recep_str = Path(struct_clean).stem + '_rand_A.pdb'
    init_str = Path(struct_clean).stem + '_init.pdb'
    print([test_str, recep_str, init_str])
    # Execute parapred on the copied structure
    subprocess.run(['parapred', 'pdb', test2_str])
    # bfactor_to_blocking_biopandas.py converts every residue with
    # bfactor == 0 to the residue type BLK, the standard ZDOCK/Megadock
    # convention for excluding residues from rigid-body docking
    bfactor_to_blocking_biopandas.main(test_str)
    bfactor_to_blocking_biopandas.main(test2_str)
    block_cdr = Path(test_str).stem + "_blocking.pdb"
    parap_cdr = Path(test2_str).stem + "_blocking.pdb"
    # Write the jobs table for the megadock program
    with open("SAMPLE.table", "w") as f:
        f.write("TITLE= sample jobs\n")
        f.write("PARAM= -R $1 -L $2\n")
        f.write(block_cdr + "\t" + recep_str + "\n")
        f.write(parap_cdr + "\t" + recep_str)
    # Execute megadock GPU docking on both the broad-CDR and the targeted
    # Parapred-treated structures
    subprocess.run([
        '/usr/bin/mpiexec', '-n', '16', '--use-hwthread-cpus',
        'megadock-gpu-dp', '-v', '1.0', '-D', '-t', '3', '-N', '54000',
        '-tb', 'SAMPLE.table'
    ])
    # subprocess.run(['/usr/bin/mpiexec', '-n', '16', '--use-hwthread-cpus',
    #                 'megadock-gpu-dp', '-tb', 'SAMPLE.table'])
    # Everything gets moved into the megadock folder
    subprocess.run(['mkdir', 'megadock'])
    subprocess.run([
        'mv', Path(struct_clean).stem + '_rand_LH_blocking-' +
        Path(struct_clean).stem + '_rand_A.out', 'megadock'
    ])
    subprocess.run([
        'mv', Path(struct_clean).stem + '_ppd_rand_LH_blocking-' +
        Path(struct_clean).stem + '_rand_A.out', 'megadock'
    ])
    subprocess.run(['cp', recep_str, 'megadock'])
    subprocess.run(['cp', test_str, 'megadock'])
    subprocess.run(['cp', test2_str, 'megadock'])
    subprocess.run(['cp', init_str, 'megadock'])
    os.chdir('megadock')
    subprocess.run(['mv', init_str, 'complex_0.pdb'])
    subprocess.run([
        'mv', Path(struct_clean).stem + '_rand_LH_blocking-' +
        Path(struct_clean).stem + '_rand_A.out',
        Path(struct_clean).stem + '_complex.out'
    ])
    subprocess.run([
        'mv', Path(struct_clean).stem + '_ppd_rand_LH_blocking-' +
        Path(struct_clean).stem + '_rand_A.out',
        Path(struct_clean).stem + '_ppd_complex.out'
    ])
    # zrank_processing.py rescores the megadock output files with the zrank
    # program; it calls the MPI-enabled module zrank_mpi.py and returns a
    # dataframe of the results sorted by ZRANK score
    infile = Path(struct_clean).stem + '_complex.out'
    infile2 = Path(struct_clean).stem + '_ppd_complex.out'
    df_zsc = zrank_processing.main(infile, recep_str, test_str, exec)
    df2_zsc = zrank_processing.main(infile2, recep_str, test2_str, exec)
    # We want to keep track of which method performs better, so add an
    # identifier column before merging and re-sorting the zranked datasets
    df_zsc['method'] = "CDR"
    df2_zsc['method'] = "Parapred"
    df_zsc = pd.concat([df_zsc, df2_zsc], ignore_index=True)
    df1 = df_zsc.sort_values(by=['zrank'])
    df1 = df1.reset_index(drop=True)
    # Reformat the merged zranked dataframes back into megadock format:
    # copy the four header lines, then append fixed-width columns
    outfile = Path(struct_clean).stem + '_complex_zranked.out'
    outfp = open(outfile, 'w')
    with open(infile, 'r') as fp:
        for i, line in enumerate(fp):
            if i < 4:
                outfp.write(line)
    df1['rot1'] = df1['rot1'].map(lambda x: '%.6f' % x)
    df1['rot2'] = df1['rot2'].map(lambda x: '%.6f' % x)
    df1['rot3'] = df1['rot3'].map(lambda x: '%.6f' % x)
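    # For illustration only: the BLK renaming performed above by
    # bfactor_to_blocking_biopandas could be sketched with biopandas as
    # below (hypothetical inline sketch, never called; not the author's
    # module):
    def _blocking_sketch(pdb_in):
        from biopandas.pdb import PandasPdb
        ppdb = PandasPdb().read_pdb(pdb_in)
        atoms = ppdb.df['ATOM']
        # Residues outside the CDR mask (b_factor == 0) become BLK
        atoms.loc[atoms['b_factor'] == 0.0, 'residue_name'] = 'BLK'
        out = Path(pdb_in).stem + '_blocking.pdb'
        ppdb.to_pdb(path=out, records=['ATOM'])
        return out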
    df1['score'] = df1['score'].map(lambda x: '%.2f' % x)
    df1['zrank'] = df1['zrank'].map(lambda x: '%.5f' % x)
    # We want to keep track of the zranked stats: that data is split off as
    # dfout, and the rest is re-written to the outfile for decoy generation
    df3 = df1.drop(['id', 'zrank', 'method'], axis=1)
    df3.to_csv(outfp, sep='\t', mode='a', header=False, index=False)
    outfp.close()
    dfout = df1.drop(
        ['rot1', 'rot2', 'rot3', 'vox1', 'vox2', 'vox3', 'id', 'score'],
        axis=1)
    dfout['complex'] = dfout.index + 1
    print(dfout.head())
    dfout.to_csv('zrank.csv')
    dfout.to_json('zrank.json')
    # decoygen.py converts the zranked list of rotational coordinates back
    # into PDB complexes. The number of complexes returned is set by the
    # decoynum variable; complex_0.pdb (the native complex) is added to the
    # beginning of the list as the reference.
    zinfile = Path(struct_clean).stem + '_complex_zranked.out'
    complexes = decoygen.main(recep_str, zinfile, decoynum, test_str)
    complexes.append('complex_0.pdb')
    complex_list = natural_sort.main(complexes)
    with open('relax_list.txt', 'w') as r:
        for cmplx in complex_list:
            r.write(cmplx + '\n')
    cwd = os.getcwd()
    jsonfile = "relaxed_list.json"
    reference = 'complex_0_relaxed.pdb'
    # relax_mpi.py uses MPI to run the drew_relax.py program over 16 cores.
    # The output is a json file of the relaxed and interface parameters of
    # each complex
    print('Running Relax: ' + Path(structure).stem)
    bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength).start()
    p = subprocess.Popen([
        'mpiexec', '-n', '16', 'python', exec + 'relax_mpi.py',
        'relax_list.txt', cwd
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    result = []
    # Stream the child's stdout line by line so the progress bar keeps
    # moving; an empty read means the pipe has closed
    while True:
        bar.update()
        line = p.stdout.readline()
        result.append(line.decode('UTF-8').rstrip('\r'))
        if not line:
            print("\n")
            break
    with open("relax_mpi.log", 'w') as f:
        f.write(''.join(result))
    # rmsd_align_mpi.py uses MPI to run rmsd.py, which aligns all of the
    # relaxed complexes by their antigen chain and calculates the RMSD from
    # the reference pose
    print('Align and Rmsd:')
    subprocess.run([
        'mpiexec', '-n', '16', 'python', exec + 'rmsd_align_mpi.py', cwd,
        jsonfile, reference
    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Read back the output json file from the MPI programs above
    ros_df = pd.read_json(cwd + '/relaxed_list_rmsd.json')
    ros_df = ros_df.drop(['dG_cross', 'dG_cross/dSASAx100', 'packstat'],
                         axis=1)
    # We want ProQ3D to score our relaxed structures: renumber each one and
    # write the list for it to process
    with open('proQ3_list.txt', 'w') as p:
        for cmplx in complex_list:
            pandas_anarci_numberer.main(
                Path(cmplx).stem + '_relaxed.pdb', 'chothia',
                Path(cmplx).stem + '_relaxed_cho.pdb')
            subprocess.run([
                'bash', exec + 'ProQDock_prepare.sh',
                Path(cmplx).stem + '_relaxed_cho.pdb'
            ])
            p.write(Path(cmplx).stem + '_relaxed_cho_reres.pdb' + '\n')
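    # relax_mpi.py and rmsd_align_mpi.py (called above) are not included in
    # this source. As a sketch of the MPI fan-out pattern they describe,
    # using mpi4py (an assumption; the actual modules' internals are not
    # shown), a rank-strided split of a work list looks like this
    # (hypothetical inline sketch, never called):
    def _mpi_fanout_sketch(work_items, run_one):
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank, size = comm.Get_rank(), comm.Get_size()
        # Each rank takes every size-th item starting at its own rank
        results = [run_one(item) for item in work_items[rank::size]]
        # Gather the per-rank result lists back on rank 0
        return comm.gather(results, root=0)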
    # ProQ3D is a big, complicated program. It is called through GNU
    # parallel via the proq3_all.sh script; if there is not yet a sequence
    # profile, one is created in the top-level working directory. All ProQ3D
    # output goes into the run_all directory
    try:
        print('ProQ3 processing')
        subprocess.run([
            ProQ3_dir + 'proq3_all.sh', './complex.fasta', 'proQ3_list.txt',
            topdir + '/' + Path(structure).stem + '_profile', 'run_all', '16'
        ], stdout=subprocess.PIPE)
    except Exception:
        print('ProQ3 parallel error')
    os.chdir('run_all')
    ProQ_list = []
    for file in os.listdir():
        if file.endswith(".pdb"):
            ProQ_list.append(file)
    ProQ_list_sorted = natural_sort.main(ProQ_list)
    # collector.py scrapes the scores from the .proq3.global files and
    # returns a dataframe
    total_sc = collector.main()
    # Join the rosetta scores
    total_sc = total_sc.join(ros_df)
    total_sc.to_json('total_sc.json')
    # Merge the zrank data into the total scores
    total_sc = total_sc.merge(dfout, how='left', on='complex')
    total_sc['structure'] = Path(structure).stem
    abdf['structure'] = Path(structure).stem
    # Merge in the antibody information: the per-antibody statistics,
    # including the CDR cluster classifications, are populated to all the
    # complexes
    total_sc = total_sc.merge(abdf, how='right')
    # Shuffle the important columns to the front
    col = total_sc.pop("zrank")
    total_sc.insert(0, col.name, col)
    col = total_sc.pop("method")
    total_sc.insert(0, col.name, col)
    col = total_sc.pop("structure")
    total_sc.insert(0, col.name, col)
    print(total_sc)
    os.chdir('../../')
    outfp = Path(structure).stem + '_TOTAL.csv'
    # Sort by rmsd and write the outputs
    total_sc = total_sc.sort_values(by=['rmsd'])
    total_sc.to_csv(outfp, sep='\t', mode='a')
    total_sc.to_json(Path(structure).stem + '_TOTAL.json')
    stop_time = timeit.default_timer()
    time_tot = str(stop_time - start_time)
    with open('time.txt', 'w') as t:
        t.write(time_tot)
    os.chdir('..')
    # Return the dataframe and that's it!
    return total_sc
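# The three pop/insert pairs above could be factored into a small helper; a
# refactoring sketch (hypothetical, not part of the original pipeline):
def front(df, cols):
    # Return df with the named columns first, keeping the rest in order,
    # e.g. total_sc = front(total_sc, ['structure', 'method', 'zrank'])
    rest = [c for c in df.columns if c not in cols]
    return df[list(cols) + rest]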
# --- Earlier variant of the docking pipeline (Parapred blocking only) ---
import os
import subprocess
import timeit
from pathlib import Path

import pandas as pd

import anarci_to_pdb
import chain_check
import collector
import decoygen
import docking_structure_randomizer
import natural_sort
import parapred_to_blocking
import rmsd
import zrank_processing


def main(structure, exec, ProQ3_dir, decoynum):
    start_time = timeit.default_timer()
    topdir = os.getcwd()
    # Count the chains with the external pdbcount helper; anything beyond
    # an LH/A complex is rejected
    chain = subprocess.run(['pdbcount', structure], stdout=subprocess.PIPE)
    counts = chain.stdout.split()
    if int(counts[1]) > 3:
        print("too many chains bud")
        return ()
    else:
        print(chain.stdout.decode('utf-8'))
    chain_check.main(structure)
    subprocess.run(['mkdir', Path(structure).stem])
    subprocess.run(['mv', structure, Path(structure).stem])
    os.chdir(Path(structure).stem)
    docking_structure_randomizer.main(structure)
    test_str = Path(structure).stem + '_rand_LH.pdb'
    print(test_str)
    recep_str = Path(structure).stem + '_rand_A.pdb'
    out_str = Path(test_str).stem + '_cho.pdb'
    print(out_str)
    scheme = 'chothia'
    anarci_to_pdb.main(test_str, scheme, out_str)
    subprocess.run(['parapred', 'pdb', out_str])
    parapred_to_blocking.main(out_str, recep_str)
    subprocess.run([
        '/usr/bin/mpiexec', '-n', '16', '--use-hwthread-cpus',
        'megadock-gpu-dp', '-v', '1.0', '-D', '-t', '3', '-N', '54000',
        '-tb', 'SAMPLE.table'
    ])
    # subprocess.run(['/usr/bin/mpiexec', '-n', '16', '--use-hwthread-cpus',
    #                 'megadock-gpu-dp', '-tb', 'SAMPLE.table'])
    subprocess.run(['mkdir', 'megadock'])
    subprocess.run([
        'mv', Path(structure).stem + '_rand_LH_cho_blocking-' +
        Path(structure).stem + '_rand_A.out', 'megadock'
    ])
    subprocess.run(['cp', recep_str, 'megadock'])
    subprocess.run(['cp', test_str, 'megadock'])
    subprocess.run(['cp', Path(structure).stem + '.clean.pdb', 'megadock'])
    os.chdir('megadock')
    subprocess.run(
        ['mv', Path(structure).stem + '.clean.pdb', 'complex_0.pdb'])
    subprocess.run([
        'mv', Path(structure).stem + '_rand_LH_cho_blocking-' +
        Path(structure).stem + '_rand_A.out',
        Path(structure).stem + '_complex.out'
    ])
    infile = Path(structure).stem + '_complex.out'
    df_zsc = zrank_processing.main(infile, recep_str, test_str, exec)
    print(df_zsc.describe())
    zinfile = Path(structure).stem + '_complex_zranked.out'
    complex_list = decoygen.main(recep_str, zinfile, decoynum, test_str)
    subprocess.run([exec + 'ProQDock_prepare.sh', 'complex_0.pdb'])
    with open('proQ3_list.txt', 'w') as p:
        p.write('complex_0_merge.pdb' + '\n')
        for cmplx in complex_list:
            subprocess.run([exec + 'ProQDock_prepare.sh', cmplx])
            p.write(Path(cmplx).stem + '_merge.pdb' + '\n')
    try:
        print('ProQ3 processing')
        subprocess.run([
            ProQ3_dir + 'proq3_all.sh', './complex.fasta', 'proQ3_list.txt',
            topdir + '/' + Path(structure).stem + '_profile', './run_all',
            '16'
        ], stdout=subprocess.PIPE)
    except Exception:
        print('ProQ3 parallel error')
    os.chdir('run_all')
    ProQ_list = []
    for file in os.listdir():
        if file.endswith(".pdb"):
            ProQ_list.append(file)
    ProQ_list_sorted = natural_sort.main(ProQ_list)
    total_sc = collector.main()
    rmsd_df = pd.DataFrame(columns=['rmsd', 'all_rmsd', 'sc_score', 'p2sc'])
    for i, p_cmplx in enumerate(ProQ_list_sorted):
        rmd_sc = rmsd.main('../../' + structure, p_cmplx)
        rmsd_df.loc[i] = [
            rmd_sc[0][0], rmd_sc[0][1], rmd_sc[0][2], rmd_sc[0][3]
        ]
    total_sc = total_sc.join(rmsd_df)
    os.chdir('../../')
    outfp = Path(structure).stem + '_TOTAL.csv'
    total_sc.to_csv(outfp, sep='\t', mode='a')
    stop_time = timeit.default_timer()
    time_tot = str(stop_time - start_time)
    with open('time.txt', 'w') as t:
        t.write(time_tot)
    os.chdir('..')
    return total_sc
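# 'pdbcount' above is an external helper whose stdout supplies a chain
# count. As an illustrative substitute only (hypothetical, not the actual
# tool), the same check can be done with biopandas:
def count_chains_sketch(pdb_file):
    from biopandas.pdb import PandasPdb
    ppdb = PandasPdb().read_pdb(pdb_file)
    # Number of unique chain identifiers among the ATOM records
    return ppdb.df['ATOM']['chain_id'].nunique()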
# --- Driver script: run the pipeline over every structure in sab_dir ---
import os
import re
from pathlib import Path

import pandas as pd

import drewdock_biopandas
import natural_sort

sab_dir = '/home/drewaight/hdd1/sabdab_short/'
exec = '/media/hdd1/proQ3drewdock_big/drewdock_exec/'
ProQ3_dir = '/home/drewaight/proq3/'
decoynum = 200
cwd = os.getcwd()
print(cwd)
score = pd.DataFrame()
master_list = []
for file in os.listdir(sab_dir):
    if file.endswith(".pdb"):
        master_list.append(file)
master_list_sorted = natural_sort.main(master_list)
for structure in master_list_sorted:
    # Skip structures that already have a results directory, so the run
    # can be resumed after an interruption
    if os.path.isdir(cwd + '/' + Path(structure).stem):
        print('Looks like you already did ' + Path(structure).stem + ' bud.')
        continue
    else:
        print("You didn't do " + Path(structure).stem + ' yet')
    try:
        print("############################# NOW WORKING ON " + structure +
              " ##########################################")
        sc = drewdock_biopandas.main(structure, sab_dir, exec, ProQ3_dir,
                                     decoynum)
        # Accumulate results with concat (DataFrame.append was removed in
        # pandas 2.0)
        score = pd.concat([score, sc], ignore_index=True)
        score.to_csv('TOTAL.csv', sep='\t', mode='a')
    except Exception:
        dir = exec + "../"
# --- Another variant of the pipeline main, using Parapred blocking only ---
import gc
import os
import subprocess
import timeit
from pathlib import Path

import pandas as pd

import biopandas_cleanup
import collector
import decoygen
import docking_structure_randomizer
import natural_sort
import parapred_to_blocking_biopandas
import rmsd
import zrank_processing


def main(structure, sab_dir, exec, ProQ3_dir, decoynum):
    # Clear the .theano cache, which grows very large under ProQ3D
    subprocess.run(['rm', '-rf', '/home/drewaight/.theano'])
    start_time = timeit.default_timer()
    topdir = os.getcwd()
    scheme = 'chothia'
    subprocess.run(['mkdir', Path(structure).stem])
    subprocess.run(['cp', sab_dir + structure, Path(structure).stem])
    os.chdir(Path(structure).stem)
    struct_clean = Path(structure).stem + '_1.pdb'
    h_stats, l_stats = biopandas_cleanup.main(structure, scheme, struct_clean)
    docking_structure_randomizer.main(struct_clean)
    test_str = Path(struct_clean).stem + '_rand_LH.pdb'
    recep_str = Path(struct_clean).stem + '_rand_A.pdb'
    init_str = Path(struct_clean).stem + '_init.pdb'
    print([test_str, recep_str, init_str])
    subprocess.run(['parapred', 'pdb', test_str])
    parapred_to_blocking_biopandas.main(test_str, recep_str)
    subprocess.run([
        '/usr/bin/mpiexec', '-n', '16', '--use-hwthread-cpus',
        'megadock-gpu-dp', '-v', '1.0', '-D', '-t', '3', '-N', '54000',
        '-tb', 'SAMPLE.table'
    ])
    # subprocess.run(['/usr/bin/mpiexec', '-n', '16', '--use-hwthread-cpus',
    #                 'megadock-gpu-dp', '-tb', 'SAMPLE.table'])
    subprocess.run(['mkdir', 'megadock'])
    subprocess.run([
        'mv', Path(struct_clean).stem + '_rand_LH_blocking-' +
        Path(struct_clean).stem + '_rand_A.out', 'megadock'
    ])
    subprocess.run(['cp', recep_str, 'megadock'])
    subprocess.run(['cp', test_str, 'megadock'])
    subprocess.run(['cp', init_str, 'megadock'])
    os.chdir('megadock')
    subprocess.run(['mv', init_str, 'complex_0.pdb'])
    subprocess.run([
        'mv', Path(struct_clean).stem + '_rand_LH_blocking-' +
        Path(struct_clean).stem + '_rand_A.out',
        Path(struct_clean).stem + '_complex.out'
    ])
    infile = Path(struct_clean).stem + '_complex.out'
    df_zsc = zrank_processing.main(infile, recep_str, test_str, exec)
    print(df_zsc.describe())
    zinfile = Path(struct_clean).stem + '_complex_zranked.out'
    complex_list = decoygen.main(recep_str, zinfile, decoynum, test_str)
    subprocess.run(['bash', exec + 'ProQDock_prepare.sh', 'complex_0.pdb'])
    with open('proQ3_list.txt', 'w') as p:
        p.write('complex_0_reres.pdb' + '\n')
        for cmplx in complex_list:
            subprocess.run(['bash', exec + 'ProQDock_prepare.sh', cmplx])
            p.write(Path(cmplx).stem + '_reres.pdb' + '\n')
    try:
        print('ProQ3 processing')
        subprocess.run([
            ProQ3_dir + 'proq3_all.sh', './complex.fasta', 'proQ3_list.txt',
            topdir + '/' + Path(structure).stem + '_profile', 'run_all', '16'
        ], stdout=subprocess.PIPE)
    except Exception:
        print('ProQ3 parallel error')
    os.chdir('run_all')
    ProQ_list = []
    for file in os.listdir():
        if file.endswith(".pdb"):
            ProQ_list.append(file)
    ProQ_list_sorted = natural_sort.main(ProQ_list)
    total_sc = collector.main()
    rmsd_df = pd.DataFrame(columns=['rmsd', 'all_rmsd', 'sc_score', 'p2sc'])
    for i, p_cmplx in enumerate(ProQ_list_sorted):
        rmd_sc = rmsd.main('../../' + struct_clean, p_cmplx)
        rmsd_df.loc[i] = [
            rmd_sc[0][0], rmd_sc[0][1], rmd_sc[0][2], rmd_sc[0][3]
        ]
    total_sc = total_sc.join(rmsd_df)
    os.chdir('../../')
    outfp = Path(structure).stem + '_TOTAL.csv'
    total_sc.to_csv(outfp, sep='\t', mode='a')
    h_stats.to_csv(Path(structure).stem + 'H_STATS.csv', sep='\t')
    l_stats.to_csv(Path(structure).stem + 'L_STATS.csv', sep='\t')
    stop_time = timeit.default_timer()
    time_tot = str(stop_time - start_time)
    with open('time.txt', 'w') as t:
        t.write(time_tot)
    os.chdir('..')
    # Release memory before handing back the results
    gc.collect()
    return total_sc
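# The start/stop timeit pattern repeats in every entry point above; a small
# context manager could centralize it. A refactoring sketch (hypothetical,
# not part of the original code):
import timeit as _timeit
from contextlib import contextmanager


@contextmanager
def wall_clock(outfile='time.txt'):
    start = _timeit.default_timer()
    try:
        yield
    finally:
        # Write the elapsed seconds on exit, even if the body raised
        with open(outfile, 'w') as t:
            t.write(str(_timeit.default_timer() - start))

# Usage sketch: `with wall_clock(): main(...)` replaces the manual
# start/stop bookkeeping.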