def main(hhblitsdb, jackhmmerdb, seqfile, n_cores=1, n_jobs_plm=1,
         n_jobs_psi=1, layers=5, pconsc1_flag=False):
    """Run the PconsC or PconsC2 prediction pipeline for one sequence file.

    The function creates the input alignments, runs PSICOV and plmDCA on
    them, hands the resulting contact files to the predictor script and,
    if the module-level ``plot_flag`` is truthy, plots the top L*1
    contacts with ``plot_map``.

    Relies on names defined outside this block: ``prep``, ``root``,
    ``check_output``, ``plot_flag`` and ``plot_map``.

    Args:
        hhblitsdb: Passed to ``prep.run_alignments`` and
            ``prep.run_pconsc2_dependencies``.
        jackhmmerdb: Passed to ``prep.run_alignments``.
        seqfile: Path of the input sequence file; the result file names
            are derived from it (``.pconsc.out`` / ``.pconsc2.out``).
        n_cores: Forwarded as ``n_cores`` to the alignment and contact
            prediction steps.
        n_jobs_plm: ``n_jobs`` value for the plmDCA contact prediction.
        n_jobs_psi: ``n_jobs`` value for the PSICOV contact prediction.
        layers: Accepted for interface compatibility; not used here.
        pconsc1_flag: If true run PconsC1 (``predict.py``), otherwise
            run PconsC2 (``predict2.py``, the default).
    """
    # Create input alignments for E-value thresholds 1, 10^-4, 10^-10
    # and 10^-40.
    prep.run_alignments(hhblitsdb, jackhmmerdb, seqfile, n_cores=n_cores)

    # Run PSICOV and plmDCA on every alignment and collect the output
    # file paths.
    psicovnames = prep.run_contact_pred(seqfile, 'psicov', n_cores=n_cores,
                                        n_jobs=n_jobs_psi)
    plmdcanames = prep.run_contact_pred(seqfile, 'plmdca', n_cores=n_cores,
                                        n_jobs=n_jobs_plm)

    # Merge both mappings; on a duplicate key the plmDCA entry wins,
    # exactly as with the former list concatenation.
    predictionnames = dict(psicovnames)
    predictionnames.update(plmdcanames)

    # Generate the PconsC/2 prediction command with all arguments in the
    # order expected by the predictor script.
    names = ['jhE4', 'jhE0', 'jhE10', 'jhE40',
             'hhE4', 'hhE0', 'hhE10', 'hhE40']
    command = [root + '/src/predict2.py']
    for key in names:
        command.append(predictionnames[key + 'psicov'])
        command.append(predictionnames[key + 'plmdca'])

    if pconsc1_flag:
        # Run PconsC1: the predictor prints its result on stdout, which
        # is captured and written to the result file here.
        sys.stderr.write("Running PconsC1...\n")
        sys.stderr.write("Predicting...\n")
        result_name = seqfile + '.pconsc.out'
        command[0] = root + '/src/predict.py'
        results = check_output(command)
        with open(result_name, 'w') as result_file:
            result_file.write(results)
    else:
        # Run PconsC2 (default): the result file name is passed as the
        # last argument of the predictor command.
        sys.stderr.write("Running PconsC2...\n")
        # NOTE(review): n_cores is fixed to 1 here instead of forwarding
        # the n_cores argument -- confirm this is intentional.
        netsurfpredictionname, sspredictionname, pssmaliname = \
            prep.run_pconsc2_dependencies(hhblitsdb, seqfile, n_cores=1)
        sys.stderr.write("Predicting...\n")
        result_name = seqfile + '.pconsc2.out'
        command.extend([netsurfpredictionname, sspredictionname,
                        pssmaliname, result_name])
        check_output(command)

    # Plot top L*1 contacts in a contact map (where L is the length of
    # the input sequence).
    # Those contacts are later used during protein folding.
    if plot_flag:
        # Only pass the optional inputs that are actually present.
        plot_kwargs = {}
        if os.path.exists('native.pdb'):
            plot_kwargs['pdb_filename'] = 'native.pdb'
        if os.path.exists(seqfile + '.horiz'):
            plot_kwargs['psipred_filename'] = seqfile + '.horiz'
        plot_map(seqfile, result_name, 1., **plot_kwargs)
def main(hhblitsdb, jackhmmerdb, seqfile, n_cores=1, n_jobs_plm=1,
         n_jobs_psi=1, layers=5, pconsc1_flag=False):
    """Predict contacts for ``seqfile`` with PconsC1 or PconsC2.

    Steps, in order: build the alignments, run PSICOV and plmDCA on each
    alignment, call the predictor script on the collected contact files,
    and plot the top L*1 contacts when the module-level ``plot_flag`` is
    truthy.

    Uses the following names from outside this block: ``prep``,
    ``root``, ``check_output``, ``plot_flag`` and ``plot_map``.

    Args:
        hhblitsdb: Forwarded to ``prep.run_alignments`` and
            ``prep.run_pconsc2_dependencies``.
        jackhmmerdb: Forwarded to ``prep.run_alignments``.
        seqfile: Input sequence file path; output names are built by
            appending ``.pconsc.out`` or ``.pconsc2.out`` to it.
        n_cores: ``n_cores`` value for the alignment and contact
            prediction steps.
        n_jobs_plm: ``n_jobs`` value used for plmDCA.
        n_jobs_psi: ``n_jobs`` value used for PSICOV.
        layers: Not used by this function; kept so existing callers that
            pass it keep working.
        pconsc1_flag: Select PconsC1 (``predict.py``) instead of the
            default PconsC2 (``predict2.py``).
    """
    # Create input alignments for E-value thresholds 1, 10^-4, 10^-10
    # and 10^-40.
    prep.run_alignments(hhblitsdb, jackhmmerdb, seqfile, n_cores=n_cores)

    # Run PSICOV and plmDCA on every alignment and collect the output
    # file paths.
    psicovnames = prep.run_contact_pred(
        seqfile, 'psicov', n_cores=n_cores, n_jobs=n_jobs_psi)
    plmdcanames = prep.run_contact_pred(
        seqfile, 'plmdca', n_cores=n_cores, n_jobs=n_jobs_plm)

    # Copy-then-update keeps the original precedence (plmDCA entries
    # override PSICOV entries on equal keys) without concatenating the
    # two item lists.
    predictionnames = dict(psicovnames)
    predictionnames.update(plmdcanames)

    # Generate the PconsC/2 prediction command with all arguments in
    # the correct order: PSICOV then plmDCA output for every alignment.
    names = [
        'jhE4', 'jhE0', 'jhE10', 'jhE40',
        'hhE4', 'hhE0', 'hhE10', 'hhE40',
    ]
    command = [root + '/src/predict2.py']
    for key in names:
        command.append(predictionnames[key + 'psicov'])
        command.append(predictionnames[key + 'plmdca'])

    if pconsc1_flag:
        # PconsC1: capture the predictor's stdout and store it.
        sys.stderr.write("Running PconsC1...\n")
        sys.stderr.write("Predicting...\n")
        result_name = seqfile + '.pconsc.out'
        command[0] = root + '/src/predict.py'
        results = check_output(command)
        with open(result_name, 'w') as result_file:
            result_file.write(results)
    else:
        # PconsC2 (default): the predictor receives the output path as
        # its final argument.
        sys.stderr.write("Running PconsC2...\n")
        # NOTE(review): the dependencies are always run with n_cores=1,
        # regardless of the n_cores argument -- confirm this is wanted.
        netsurfpredictionname, sspredictionname, pssmaliname = \
            prep.run_pconsc2_dependencies(hhblitsdb, seqfile, n_cores=1)
        sys.stderr.write("Predicting...\n")
        result_name = seqfile + '.pconsc2.out'
        command.extend([
            netsurfpredictionname, sspredictionname, pssmaliname,
            result_name,
        ])
        check_output(command)

    # Plot top L*1 contacts in a contact map (where L is the length of
    # the input sequence).
    # Those contacts are later used during protein folding.
    if plot_flag:
        # Add the native structure and the PSIPRED file only if present.
        plot_kwargs = {}
        if os.path.exists('native.pdb'):
            plot_kwargs['pdb_filename'] = 'native.pdb'
        if os.path.exists(seqfile + '.horiz'):
            plot_kwargs['psipred_filename'] = seqfile + '.horiz'
        plot_map(seqfile, result_name, 1., **plot_kwargs)
def _pconsc_write_text(path, text):
    """Write ``text`` to ``path``, closing the file even on error."""
    with open(path, 'w') as handle:
        handle.write(text)


def _pconsc_run_psicov(stem, label):
    """Create ``stem + '.jones'`` from the a3m alignment and run PSICOV.

    The PSICOV output is written to ``stem + '.psicov'``. A failing
    PSICOV run is tolerated: an empty file is written instead.
    """
    _pconsc_write_text(stem + '.jones',
                       check_output([trim2jones, stem + '.a3m']))
    sys.stderr.write(str(datetime.now()) + ' ' + label +
                     ': running PSICOV\nThis may take more than an hour.\n')
    try:
        # Joel @ NSC: Added -o flag, in case the psicov binary has not
        # been compiled with MINEFSEQS=0.
        contacts = check_output([psicov, '-o', stem + '.jones'])
    except Exception as err:
        # Deliberately best effort: continue with an empty contact file,
        # but no longer swallow KeyboardInterrupt/SystemExit and say why.
        sys.stderr.write(str(datetime.now()) + ' ' + label +
                         ': PSICOV failed (' + str(err) +
                         '), writing empty output\n')
        contacts = ''
    _pconsc_write_text(stem + '.psicov', contacts)


def _pconsc_run_plmdca(stem, label, n_cores):
    """Create ``stem + '.trimmed'`` from the a3m alignment and run plmDCA.

    Uses the ``plmdca`` executable when that module-level name is truthy
    and falls back to calling ``plmDCA_symmetric`` through MATLAB
    otherwise. The tool writes ``stem + '.plmdca'`` itself.
    """
    _pconsc_write_text(stem + '.trimmed',
                       check_output([trim2trimmed, stem + '.a3m']))
    sys.stderr.write(str(datetime.now()) + ' ' + label +
                     ': running plmDCA\nThis may take more than an hour.\n')
    if plmdca:
        check_output([plmdca, stem + ".trimmed", stem + ".plmdca",
                      "0.01", "0.01", "0.1", str(n_cores)])
    else:
        matlab_code = (
            "path(path, '" + plmdcapath + "'); " +
            "path(path, '" + plmdcapath + "/functions'); " +
            "path(path, '" + plmdcapath + "/3rd_party_code/minFunc/'); " +
            "plmDCA_symmetric ( '" + stem + ".trimmed', '" +
            stem + ".plmdca', 0.01, 0.01, 0.1, " + str(n_cores) + "); exit")
        check_output([matlab, '-nodesktop', '-nosplash', '-r', matlab_code])


def main(hhblitsdb, jackhmmerdb, seqfile, n_cores=1):
    """Run the complete PconsC1 pipeline for one sequence file.

    For each of the four E-value cutoffs a jackhmmer and an HHblits
    alignment is generated (unless already present), PSICOV and plmDCA
    are run on each alignment whose output file is missing, and the 16
    resulting contact files are passed to ``predict.py``. The predictor
    output is stored in ``seqfile + '.pconsc.out'`` and, if the
    module-level ``plot_flag`` is truthy, plotted with ``plot_map``.

    Relies on module-level names defined outside this block (tool paths
    such as ``jackhmmer``, ``hhblits``, ``reformat``, ``psicov``,
    ``plmdca``, plus ``root``, ``check_output``, ``plot_flag`` and
    ``plot_map``).

    Args:
        hhblitsdb: HHblits database prefix; a trailing ``_a3m_db`` is
            stripped before use.
        jackhmmerdb: Path of the database handed to jackhmmer.
        seqfile: Path of the input sequence (raw sequence or FASTA).
        n_cores: Number of cores passed to the external tools.

    Raises:
        SystemExit: With status 1 if a database or the sequence file
            does not exist.
    """
    suffix = '_a3m_db'
    if hhblitsdb.endswith(suffix):
        hhblitsdb = hhblitsdb[:-len(suffix)]

    # Validate all inputs up front; every failure exits with status 1.
    for required in (hhblitsdb + suffix, jackhmmerdb, seqfile):
        if not os.path.exists(required):
            sys.stderr.write('\n' + required + ' does not exist\n')
            sys.exit(1)

    with open(seqfile) as handle:
        sequence = handle.read()

    # Keep a backup of a pre-existing FASTA file before overwriting it.
    if os.path.exists(seqfile + '.fasta'):
        subprocess.call(['mv', seqfile + '.fasta', seqfile + '.bak'])

    # Normalise the input to a single-line FASTA record named "target".
    # startswith() also copes with an empty input file.
    if sequence.startswith('>'):
        record = '>target\n' + "".join(sequence.split('\n')[1:]) + '\n'
    else:
        record = '>target\n' + sequence + '\n'
    _pconsc_write_text(seqfile + '.fasta', record)

    names = ['E4', 'E0', 'E10', 'E40']
    cutoffs = ['1e-4', '1', '1e-10', '1e-40']

    jhpredictionnames = []
    hhpredictionnames = []

    for name, cutoff in zip(names, cutoffs):
        jh_stem = seqfile + '.jh' + name
        hh_stem = seqfile + '.hh' + name

        # Snapshot which files exist before this iteration creates any.
        exists_jh = os.path.exists(jh_stem + '.a3m')
        exists_jh_psicov = os.path.exists(jh_stem + '.psicov')
        exists_jh_plmdca = os.path.exists(jh_stem + '.plmdca')
        exists_hh = os.path.exists(hh_stem + '.a3m')
        exists_hh_psicov = os.path.exists(hh_stem + '.psicov')
        exists_hh_plmdca = os.path.exists(hh_stem + '.plmdca')

        # Only create the alignment file if at least one of the contact
        # maps is missing.
        if not exists_jh and not (exists_jh_psicov and exists_jh_plmdca):
            sys.stderr.write(
                str(datetime.now()) + ' jackhmmer ' + name +
                ': generating alignment\n'
                'This may take quite a few minutes!\n ')
            check_output([jackhmmer, '--cpu', str(n_cores), '-N', '5',
                          '-E', cutoff, '-A', jh_stem + '.ali',
                          seqfile + '.fasta', jackhmmerdb])
            check_output([reformat, 'sto', 'a3m',
                          jh_stem + '.ali', jh_stem + '.a3m'])
            check_output(['rm', jh_stem + '.ali'])

        if not exists_jh_psicov:
            _pconsc_run_psicov(jh_stem, 'jackhmmer ' + name)
        jhpredictionnames.append(jh_stem + '.psicov')

        if not exists_jh_plmdca:
            _pconsc_run_plmdca(jh_stem, 'jackhmmer ' + name, n_cores)
        jhpredictionnames.append(jh_stem + '.plmdca')

        # Only create the alignment file if at least one of the contact
        # maps is missing.
        if not exists_hh and not (exists_hh_psicov and exists_hh_plmdca):
            sys.stderr.write(
                str(datetime.now()) + ' HHblits ' + name +
                ': generating alignment\n'
                'This may take quite a few minutes!\n ')
            check_output([hhblits, '-all', '-oa3m', hh_stem + '.a3m',
                          '-e', cutoff, '-cpu', str(n_cores),
                          '-i', seqfile + '.fasta', '-d', hhblitsdb])

        if not exists_hh_psicov:
            _pconsc_run_psicov(hh_stem, 'HHblits ' + name)
        hhpredictionnames.append(hh_stem + '.psicov')

        if not exists_hh_plmdca:
            _pconsc_run_plmdca(hh_stem, 'HHblits ' + name, n_cores)
        hhpredictionnames.append(hh_stem + '.plmdca')

    sys.stderr.write("Predicting...\n")
    command = [root + '/src/predict.py']
    command.extend(jhpredictionnames)
    command.extend(hhpredictionnames)
    result_name = seqfile + '.pconsc.out'
    _pconsc_write_text(result_name, check_output(command))

    # Plot the top L*1 contacts in a contact map.
    # Those contacts are later used during protein folding.
    if plot_flag:
        # Pass the native structure and PSIPRED file only when present.
        plot_kwargs = {}
        if os.path.exists('native.pdb'):
            plot_kwargs['pdb_filename'] = 'native.pdb'
        if os.path.exists(seqfile + '.horiz'):
            plot_kwargs['psipred_filename'] = seqfile + '.horiz'
        plot_map(seqfile, result_name, 1.0, **plot_kwargs)