def storeMccTotalEner():
    """run extractMccTotalEner on every complex in the list file"""
    import sys
    lst_fn = sys.argv[1]
    complx_lst = getLst(lst_fn)
    for complx in complx_lst:
        extractMccTotalEner(complx)
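# getLst is imported from lst_sub throughout this section but never shown
# here; a minimal sketch of the assumed behavior (read a list file and
# return its whitespace-stripped, non-empty lines):
def getLst_sketch(lstfn):
    """sketch of the assumed lst_sub.getLst: one stripped entry per line"""
    with open(lstfn) as f:
        return [line.strip() for line in f if line.strip()]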
def lstCmp(path1, path2):
    """compare two list files and return (common, unique-to-1, unique-to-2)"""
    list1 = getLst(path1)
    list2 = getLst(path2)
    common = []
    unique1 = []
    unique2 = []
    for item in list1:
        if item in list2:
            common.append(item)
        else:
            unique1.append(item)
    for item in list2:
        if item not in list1:
            unique2.append(item)
    return common, unique1, unique2
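# hedged usage example for lstCmp, with hypothetical list files a.lst and
# b.lst; it reports the shared entries and the entries unique to each file:
# common, only_a, only_b = lstCmp('a.lst', 'b.lst')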
def subZ_ScoreSvr_tasks():
    """submit the loose grid jobs for SVR parameter selection using the
    z_score data set"""
    import lst_sub
    import sys
    filename = sys.argv[1]
    jobs = lst_sub.getLst(filename)
    tasks = Z_Svr_tasks(jobs, task_fn=filename)
    tasks.displayJobs()
    tasks.subLooseGrid()
def xargs(lstfn, pyscpt):
    """emulate the linux xargs function: use each entry as the argument
    for the script

    Arguments:
    - `lstfn`: name of the list file
    - `pyscpt`: python script
    """
    import os
    import time
    lst = getLst(lstfn)
    os.environ["script"] = pyscpt
    for task in lst:
        os.environ["task"] = task
        os.system("python $script $task")
        time.sleep(5)
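# usage sketch for xargs (process.py is a hypothetical script name): this
# runs 'python process.py <entry>' for every entry in the list file, sleeping
# 5 s between launches so a scheduler is not flooded:
# xargs('complx_lst', 'process.py')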
def storeMccTotalEnerParallelly(lst_fn):
    """parallel version of storeMccTotalEner using multiprocessing"""
    from multiprocessing import Pool
    complx_lst = getLst(lst_fn)
    pool = Pool()
    pool.map(extractMccTotalEner, complx_lst)
    pool.close()
    pool.join()
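# note on the parallel version: Pool() defaults to one worker per CPU core,
# and pool.map requires extractMccTotalEner to be picklable, i.e. defined at
# module level. usage sketch, assuming an 'exp_lst' list file:
# storeMccTotalEnerParallelly('exp_lst')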
def getIndividualPcc(self):
    """calculate the PCC of each complex's pre-cluster data"""
    from lst_sub import getLst
    cmplxes = getLst(self.lstfn)
    for cmplx in cmplxes:
        ener_fn = cmplx + '-A-' + self.ener_extd
        mcc_fn = cmplx + '-A-' + self.mcc_extd
        ener_dt = pd.read_csv(ener_fn, sep='\s+')
        mcc_dt = pd.read_csv(mcc_fn, header=None)
        mcc_dt.columns = ['mcc']
        dt = pd.concat([ener_dt, mcc_dt], axis=1)
        dt = dt.sort(columns=['mcc'])
        print "for the complex", cmplx
        preClusterAna.calcuPCC(dt)
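# calcuPCC is defined elsewhere; it is assumed to compute the Pearson
# correlation coefficient (PCC) between the 'mcc' column and the energy
# columns of dt, roughly what dt.corr()['mcc'] would give in pandas.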
def constructTotalDt(self):
    """construct the total data frame of all the complexes"""
    from lst_sub import getLst
    cmplxes = getLst(self.lstfn)
    dts = []
    for cmplx in cmplxes:
        ener_fn = cmplx + '-A-' + self.ener_extd
        mcc_fn = cmplx + '-A-' + self.mcc_extd
        ener_dt = pd.read_csv(ener_fn, sep='\s+')
        mcc_dt = pd.read_csv(mcc_fn, header=None)
        mcc_dt.columns = ['mcc']
        dt = pd.concat([ener_dt, mcc_dt], axis=1)
        dt = dt.sort(columns=['mcc'])
        dts.append(dt)
    self.total_dt = pd.concat(dts)
    self.total_dt = self.total_dt.sort(columns=['mcc'])
    self.total_dt.columns = self.total_columns
def getReps_bk(clustering_rslt_file):
    """read the clustering results and return one representative
    from each cluster"""
    import random
    print "clustering result\t", clustering_rslt_file
    clustering_rslt = getLst(clustering_rslt_file)
    clustering_rslt = [int(i) for i in clustering_rslt]
    total_grps = max(clustering_rslt) + 1
    total_pts = len(clustering_rslt)
    mappings = zip(range(total_pts), clustering_rslt)
    belongings = []
    for grp_num in range(total_grps):
        grp_member = [mapping[0] for mapping in mappings
                      if mapping[1] == grp_num]
        belongings.append({str(grp_num): grp_member})
    reps = []
    for belonging in belongings:
        reps.append(random.choice(belonging.values()[0]))
    return reps
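# worked example with hypothetical data: a clustering result file holding the
# labels 0, 1, 0, 1, 2 assigns points 0..4 to the clusters
# {0: [0, 2], 1: [1, 3], 2: [4]}, so getReps_bk returns one randomly drawn
# member per cluster, e.g. [2, 1, 4].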
import os
import time


class svm_tasks:
    def __init__(self, tasks, task_fn=' '):
        self.tasks = tasks
        self.task_fn = task_fn

    def displayJobs(self):
        """print the name of the jobs"""
        print "get tasks from ", self.task_fn
        for job_name in self.tasks:
            print job_name

    def subFineGrid(self):
        """submit the fine grid job for svc parameter selection"""
        for subset in self.tasks:
            time.sleep(3)
            os.environ['subset'] = subset
            print "submit fine grid job: ", subset
            os.system('qsub -v var1=$subset -N $subset fine_grid.pbs')


if __name__ == '__main__':
    '''iterate the list of orders'''
    import lst_sub
    import sys
    filename = sys.argv[1]
    jobs = lst_sub.getLst(filename)
    tasks = svm_tasks(jobs, task_fn=filename)
    tasks.displayJobs()
    tasks.subFineGrid()
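# note on the submission pattern above: os.environ['subset'] makes $subset
# visible to the shell spawned by os.system, and 'qsub -v var1=$subset'
# forwards that value into the PBS job's environment as var1, where
# fine_grid.pbs can read it.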
from lst_sub import getLst
import NB_classifier as nb

ener_rows_ifn = 'ener_row_name.txt'
ff = '08ff'
low_condi_dist_fn = ff + '_low_decoy.dist'
high_condi_dist_fn = ff + '_high_decoy.dist'
bayes_dist_ofn = ff + '_bayes.txt'

low_condi_dist = nb.loadCondiDistribution(low_condi_dist_fn)
high_condi_dist = nb.loadCondiDistribution(high_condi_dist_fn)
ener_rows = getLst(ener_rows_ifn)


def convertPdfName(dist_tuple):
    """convert the first letter of the distribution name to upper case"""
    name = dist_tuple[0]
    name = name.upper()[0] + name[1:]
    return (name, dist_tuple[1])

################################################################################
# converting
low_condi_dist = [convertPdfName(i) for i in low_condi_dist]
high_condi_dist = [convertPdfName(i) for i in high_condi_dist]

################################################################################
high_bayes_dists = [[ener_rows[i],
                     high_condi_dist[i][0],
                     high_condi_dist[i][1][0],
                     high_condi_dist[i][1][1]]
                    for i in range(len(ener_rows))]
low_bayes_dists = [[ener_rows[i],
                    low_condi_dist[i][0],
                    low_condi_dist[i][1][0],
                    low_condi_dist[i][1][1]]
                   for i in range(len(ener_rows))]
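# worked example for convertPdfName with a hypothetical distribution tuple:
# convertPdfName(('norm', (0.0, 1.0))) returns ('Norm', (0.0, 1.0)), i.e.
# only the first letter of the distribution name is capitalized.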
    dset = f[group_path]
    dset.attrs["prt_conf"] = prt_conf
    dset.attrs["lig_conf"] = lig_conf
    dset.create_dataset(str(rep_num), data=dt)
    f.flush()
    f.close()


if __name__ == "__main__":
    ################################################################################
    #
    # extract track
    from multiprocessing import Pool
    lst_fn = "exp_lst"
    complx_lst = getLst(lst_fn)
    # for complx in complx_lst:
    #     extractTrack(complx)
    pool = Pool()
    pool.map(extractTrack, complx_lst)
    pool.close()
    pool.join()

    ################################################################################
    # load mcc and energy components
    # lst_fn = "exp_lst"
    # complx_lst = getLst(lst_fn)
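    # read-back sketch for the layout written above (the file name, group_path,
    # and rep_num follow whatever convention extractTrack used; all hypothetical):
    # f = h5py.File(h5_path, 'r')
    # dset = f[group_path]
    # prt_conf = dset.attrs['prt_conf']
    # dt = dset[str(rep_num)][...]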
from pprint import pprint
import subprocess
from glob import glob
from lst_sub import getLst

lst_fn = 'total_lst'
rows = str(10001)
complxes = getLst(lst_fn)
incomplete_results = []
for complx in complxes:
    matxs = glob(complx + '/*.trace.mat')
    if len(matxs) == 10:
        for matx in matxs:
            wc = subprocess.check_output(['wc', '-l', matx])
            total_row = wc.split(' ')[0]
            if total_row != rows:
                incomplete_results.append(complx)
                print complx
                break
    else:
        incomplete_results.append(complx)
        print complx
pprint(incomplete_results)
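# note: on GNU coreutils, subprocess.check_output(['wc', '-l', matx]) returns
# a string such as '10001 1abc/run0.trace.mat\n' (file name hypothetical), so
# splitting on the first space recovers the line count compared against rows.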
import sys
import multi_process
from lst_sub import getLst

complx_lst = sys.argv[1]
complxes = getLst(complx_lst)
cmds = [('python',
         '/home/jaydy/Workspace/script/Pyscripts/prepare_cluster.py',
         complx)
        for complx in complxes]
for cmd in cmds:
    print cmd
multi_process.multi_process(cmds, 8)
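# multi_process is a local helper not shown in this section; a minimal sketch
# of the assumed behavior (run each command tuple as its own subprocess, at
# most n_workers at a time), built only on the standard library:
import subprocess
from multiprocessing.dummy import Pool as ThreadPool


def multi_process_sketch(cmds, n_workers):
    """run every command in its own subprocess, n_workers concurrently"""
    pool = ThreadPool(n_workers)
    pool.map(subprocess.call, [list(cmd) for cmd in cmds])
    pool.close()
    pool.join()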
""" f = h5py.File(self.h5_path) if self.non_centralized_path in f: del f[self.non_centralized_path] f.create_group(self.non_centralized_path) subgroup = f[self.non_centralized_path] subgroup.create_dataset('low_decoy', data=self.low_dt.values) subgroup.create_dataset('high_decoy', data=self.high_dt.values) f.flush() f.close() if __name__ == "__main__": import sys jobs_fn = sys.argv[1] complxes = getLst(jobs_fn) multi_decoys = MultiComplexesDecoy(complxes) ################################################################################ # collect raw data multi_decoys.cleanH5() multi_decoys.vstackAll() ################################################################################ # prepare for linear force field # multi_decoys.processMccEner(centralized=True, normed=True) # multi_decoys.splitHighLow() # multi_decoys.dumpLinearMccEner() # multi_decoys.printMccEner() ################################################################################
def loadClusteringRslts(self):
    """read the clustering result file into a list of integer cluster labels"""
    self.rslts = getLst(self.clust_rslt_fn)
    self.rslts = [int(i) for i in self.rslts]