def run(SelectedSGBs, configFile):
    config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
    config.read(configFile)
    build_representatives = config['build_representatives']

    print("Initializing")
    basedir = build_representatives['qp_base_dir']
    if not os.path.exists(basedir):
        os.makedirs(basedir)
    if not os.path.exists(build_representatives['output_cores_dir']):
        os.makedirs(build_representatives['output_cores_dir'])

    sethandlers()
    os.chdir(basedir)
    print("Starting")
    print(time.ctime())
    with fakeqp(jobname='build', q=['himem7.q']) as q:
        q.startpermanentrun()
        waiton = []
        # chunksize is assumed to be a plain integer in the config; int() avoids eval()
        chunk_size = int(build_representatives['chunksize'])
        # for chunk in range(0, len(SelectedSGBs), chunk_size):
        #     waiton.append(q.method(renameCore, (SelectedSGBs[chunk:chunk + chunk_size],
        #                                         build_representatives['output_cores_dir'],
        #                                         build_representatives['genomes_dir'])))
        # q.wait(waiton)
        waiton = [q.method(buildByCore, (SelectedSGBs,
                                         build_representatives['output_fasta'],
                                         build_representatives['output_cores_dir']))]
        q.wait(waiton)
        print(time.ctime())
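# A minimal sketch of the [build_representatives] config stanza that run() above reads.
# The keys are the ones accessed in the function; the path values here are illustrative
# placeholders, not the real locations.
import configparser

_example_build_representatives_stanza = """
[build_representatives]
qp_base_dir = /path/to/qp_jobs
output_cores_dir = /path/to/output/cores
output_fasta = /path/to/output/representatives.fasta
genomes_dir = /path/to/genomes
chunksize = 50
"""

_cfg = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
_cfg.read_string(_example_build_representatives_stanza)
assert _cfg['build_representatives']['chunksize'] == '50'  # ConfigParser values are strings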
def run(job_iterator: Iterable[JobInfo], data_iterator: Callable, xy_function: Callable,
        output_dir: str, use_fakeqp=False, qp_kwargs: Dict = None) -> pd.DataFrame:
    """Creates a job for each item in job_iterator and collects the results."""
    sethandlers()
    if qp_kwargs is None:
        qp_kwargs = {}
    qprovider = qp if not use_fakeqp else fakeqp
    with qprovider(**qp_kwargs) as q:
        q.startpermanentrun()
        tkttores = []
        for job_info in job_iterator:
            tkttores.append(
                q.method(_run_per_job, (job_info, data_iterator, xy_function, output_dir),
                         _job_name=job_info.name))
        fnames = []
        for r in tkttores:
            fnames.append(q.waitforresult(r))
    result = pd.concat((pd.read_hdf(f) for f in fnames), ignore_index=True)
    for f in fnames:
        os.remove(f)
    return result
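# A hypothetical call to run() above, only to show how the pieces fit together.
# It assumes JobInfo can be built with just the .name used for _job_name, and that
# data_iterator / xy_function follow whatever contract _run_per_job expects; the
# placeholders and the output path below are illustrative, not the real interfaces.
def _example_usage():
    jobs = [JobInfo(name=f'job_{i}') for i in range(3)]   # hypothetical constructor call
    data_iter = lambda job_info: iter([])                  # placeholder data iterator
    xy_func = lambda data: (None, None)                    # placeholder x/y builder
    return run(jobs, data_iter, xy_func,
               output_dir='/path/to/output',               # placeholder path
               use_fakeqp=True,                            # run locally instead of on the queue
               qp_kwargs={'jobname': 'example'})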
def runOnSGBs(configFile):
    config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
    config.read(configFile)
    run_pipeline = config['run_pipeline']

    if not os.path.exists(run_pipeline['representatives']):
        EatOrKeepSmallRepresentatives.run(configFile)
    SelectedSGBs = getAllSGBs(run_pipeline['representatives'], run_pipeline['genomes_dir'],
                              run_pipeline['all_large_or_new_sgbs'])

    if not os.path.exists(run_pipeline['stage1output']):
        print("Making representatives fasta", time.ctime())
        buildRepresentatives.run(SelectedSGBs, configFile)
        print("Building Bowtie index", time.ctime())
        build_big_bowtie.run(configFile)
        with open(run_pipeline['stage1output'], 'w') as donefile:
            donefile.write('Done\n')

    basedir = run_pipeline['qp_base_dir']
    score_output = run_pipeline['score_output']
    sethandlers()
    os.chdir(basedir)
    print("Starting")
    with qp(jobname='build', q=['himem7.q']) as q:
        q.startpermanentrun()
        waiton = []
        chunk_size = 50
        for chunk_start in range(0, len(SelectedSGBs), chunk_size):
            chunkSGBs = SelectedSGBs.loc[chunk_start:chunk_start + chunk_size - 1]
            waiton.append(q.method(runChuckOfSGBs, (chunkSGBs, configFile)))
        q.wait(waiton)
    print("Done running on %s chunks of SGBs" % len(waiton))
    print("Done", time.ctime())
    return
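# For reference, a sketch of the [run_pipeline] stanza that runOnSGBs() expects, based on
# the keys it reads above; the values and file extensions are placeholders.
_EXAMPLE_RUN_PIPELINE_STANZA = """
[run_pipeline]
representatives = /path/to/representatives.csv
genomes_dir = /path/to/genomes
all_large_or_new_sgbs = /path/to/all_large_or_new_sgbs.csv
stage1output = /path/to/stage1.done
qp_base_dir = /path/to/qp_jobs
score_output = /path/to/scores
"""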
import os

import mwas_annot
from LabQueue.qp import qp, fakeqp
from LabUtils.addloglevels import sethandlers

# parameters
body_site = 'Oral'  # TODO: don't forget to update majmin
output_dir = f'/net/mraid08/export/genie/LabData/Analyses/saarsh/PNP3_mwas/PNP3_mwas_{body_site.lower()}_0months_subtraction'
jobs_path = os.path.join(output_dir, 'jobs')
mwas_file_path = os.path.join(output_dir, 'mb_gwas_significant.h5')

# run
os.chdir(jobs_path)
sethandlers()
with qp(jobname=f'annot_{body_site}', _delete_csh_withnoerr=True, q=['himem7.q'], max_r=2, _mem_def='5G') as q:
    q.startpermanentrun()
    snps = q.method(mwas_annot.run, (mwas_file_path, output_dir, body_site))
    q.waitforresult(snps)
min_reads_per_snp = 3
min_common_positions = 20000
min_positions_per_sample = 20000
work_dir = os.path.join(config.analyses_dir, date2_dir())


def data_gen(loaders, subjects_df=None, **kwargs):
    from LabData.DataMergers.MultiDataLoader import MultiDataLoader
    accepts_subjects_df = all([l != 'SubjectLoader' for l in to_list(loaders)])
    return MultiDataLoader(loaders, subjects_df=subjects_df, **kwargs).get_data() if accepts_subjects_df \
        else MultiDataLoader(loaders, **kwargs).get_data()


def gen_pairwise_dists():
    write_members(os.path.join(P.work_dir, 'PARAMS.txt'), P)
    subjects_gen_f = lambda: data_gen(subjects_loaders, **subjects_get_data_args)
    from LabData.DataAnalyses.MBSNPs.MBSNPAnalyses import MBSNPPairwiseDistances
    MBSNPPairwiseDistances(**dict((key, value) for key, value in P.__dict__.items() if not key.startswith('__'))) \
        .run(subjects_gen_f=subjects_gen_f, species_set=species_set)


if __name__ == '__main__':
    sethandlers(file_dir=config.log_dir)
    gen_pairwise_dists()
import os
import glob

from LabQueue.qp import qp
from LabUtils.addloglevels import sethandlers
from LabData.DataLoaders.MBSNPLoader import OralMBSNPLoader


def func():
    potential_species = glob.glob(
        '/home/saarsh/Genie/LabData/Data/MBPipeline/Analyses/MBSNP/Oral/MAF/mb_snp_maf_SGB_*_R1_S100.h5')
    potential_species = ['SGB_' + s.split('_')[-3] for s in potential_species]

    done_species = glob.glob(
        '/home/saarsh/Genie/LabData/Data/MBPipeline/Analyses/MBSNP/Oral/MAF/mb_snp_annot_maf_SGB_*_R1_S100.h5')
    done_species = ['SGB_' + s.split('_')[-3] for s in done_species]

    species = list(set(potential_species) - set(done_species))

    ld = OralMBSNPLoader()
    ld._gen_species_set_maf_annot_data(species, min_reads_per_snp=1, min_samples_per_snp_cached=100)
    # TODO: make sure the gene annotation loader is using the OralMBLoader and not the Gut


sethandlers(file_dir='/home/saarsh/Analysis/antibiotics/jobs/')
os.chdir('/home/saarsh/Analysis/antibiotics/jobs/')
with qp(jobname='annot', _delete_csh_withnoerr=True, q=['himem7.q']) as q:
    q.startpermanentrun()
    tkttores = {}
    tkttores[0] = q.method(func)
    for k, v in tkttores.items():
        q.waitforresult(v)
jobs_dir = '/net/mraid08/export/jafar/Microbiome/Analyses/saar/NLDcopmJobs'


def func(folder):
    cmd = f'rm -Rf {folder} &'
    print(cmd)
    _shell_command(cmd)
    # files = glob.glob(os.path.join(folder, '*'))
    # len_files = len(files)
    # for i_file, file in enumerate(files):
    #     if not os.path.isdir(file):
    #         print(f'file {i_file + 1}/{len_files}')
    #         _shell_command('gzip -9 ' + file)


# queue
os.chdir(jobs_dir)
sethandlers(file_dir=jobs_dir)
with qp(jobname='NLDcomp', _mem_def='10G', _tryrerun=False) as q:
    q.startpermanentrun()
    tkttores = {}
    for i_folder, folder in enumerate(folders):
        tkttores[i_folder] = q.method(func, [folder])
    for k, v in tkttores.items():
        q.waitforresult(v)
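# 'folders' is used in the loop above but defined outside this excerpt. A minimal sketch of
# how such a folder list might be built, assuming the intent is one deletion job per
# subdirectory of some results root (the path below is a placeholder):
import glob
import os

example_folders = [p for p in glob.glob('/path/to/results/*') if os.path.isdir(p)]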
col and 'SNPB' not in col for col in df.columns]]

if not (df['RawReadLength'] == read_len).all():
    print('all hell broke loose')
    raise ValueError('RawReadLength does not match the expected read_len')
df['cnt'] = True
df.to_pickle(os.path.join(base_dir, 'DFOut', 'PostUniteMetadata.df'))
print(f'Running on {df.shape[0]} rows')

# pipeline
Email = ' [email protected]'
General_params = f' --max_r {MAX_JOBS} --use_general_python '
Modules = ' --module_seq "MID,UZP,URB,SNB" '

os.chdir(os.path.join(base_dir, 'tmp2', 'jobs'))
logf, warnstream = sethandlers(logging.INFO, True, True, file_prefix='mmmbp_')
with config.qp(jobname=run_name, max_r=MAX_JOBS, q=['himem7.q'], _tryrerun=True, delay_batch=10) as q:
    q.startpermanentrun()

    MID_params = '--mid_md_path ' + os.path.join(base_dir, 'DFOut', 'PostUniteMetadata.df ') + \
                 '--mid_input_path ' + os.path.join(os.path.dirname(os.path.dirname(post)), 'tmp2', 'UNT', ' ') + \
                 '--mid_ext .fastq.gz ' + \
                 '--mid_check_cont '

    URB_params = f' --urb_num_mapped_to_subsample {urb_num_mapped_to_subsample} ' \
                 f' --urb_min_mapped_to_retain {urb_min_mapped_to_retain} ' \