def main(earliest_night, latest_night, data_dir, jar, xml, db, out, queue, walltime,
         engine, num_runs, vmem, log_level, port, source, conditions, max_delta_t,
         local, password):
    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level,
    )

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    db_path = os.path.abspath(db)

    # create the output directory if it doesn't exist
    output_directory = os.path.dirname(outpath)
    os.makedirs(output_directory, exist_ok=True)
    logger.info("Writing output data to {}".format(out))

    factdb = sqlalchemy.create_engine(
        "mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password)
    )
    data_conditions = dcc.conditions[conditions]

    # query the run database for all runs matching the requested night range,
    # source and data quality conditions
    df_runs = erna.load(
        earliest_night,
        latest_night,
        data_dir,
        source_name=source,
        timedelta_in_minutes=max_delta_t,
        factdb=factdb,
        data_conditions=data_conditions,
    )

    logger.info("Would process {} jobs with {} runs per job".format(
        len(df_runs) // num_runs, num_runs
    ))
    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    # build one job per bunch of runs and execute them via gridmap
    job_list = make_jobs(
        jarpath, xmlpath, db_path, output_directory, df_runs,
        engine, queue, vmem, num_runs, walltime,
    )
    job_outputs = gridmap.process_jobs(job_list, max_processes=len(job_list), local=local)
    erna.collect_output(job_outputs, out, df_runs)
def main(jar, xml, out, mc_path, queue, walltime, engine, num_jobs, vmem,
         log_level, port, local):
    '''
    Script to execute fact-tools on Monte Carlo files.
    Use the MC_PATH argument to specify the folders containing the MC files.
    '''
    level = logging.INFO
    if log_level == 'DEBUG':
        level = logging.DEBUG
    elif log_level == 'WARN':
        level = logging.WARN
    elif log_level == 'INFO':
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level,
    )

    erna.ensure_output(out)
    jarpath = path.abspath(jar)
    xmlpath = path.abspath(xml)
    drspath = erna.mc_drs_file()
    logger.info('Using drs file at {}'.format(drspath))

    # collect all Monte Carlo event files from the given folders
    files = []
    for folder in tqdm(mc_path):
        pattern = path.join(folder, '**/*_Events.fit*')
        f = glob.glob(pattern, recursive=True)
        files = files + f

    num_files = len(files)
    logger.info("Found {} files.".format(num_files))
    if num_files == 1:
        logger.error("Need more than one file to work with.")
        return
    if num_jobs > num_files:
        logger.error("You specified more jobs than files. This doesn't make sense.")
        return

    click.confirm('Do you want to continue processing and start jobs?', abort=True)

    # every MC file is processed with the same simulated drs file
    mc_paths_array = np.array(files)
    drs_paths_array = np.repeat(np.array(drspath), len(mc_paths_array))

    job_list = make_jobs(
        jarpath, xmlpath, mc_paths_array, drs_paths_array,
        engine, queue, vmem, num_jobs, walltime,
    )
    job_outputs = gridmap.process_jobs(job_list, max_processes=num_jobs, local=local)
    erna.collect_output(job_outputs, out)
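# ---------------------------------------------------------------------------
# Note: the make_jobs helper called by both scripts above is defined elsewhere
# in the repository. The function below is only a minimal, illustrative sketch
# of what such a helper could look like for the MC case: it splits the input
# file arrays into num_jobs chunks and wraps each chunk in a gridmap Job. The
# worker function run_on_chunk is hypothetical, the Job keyword arguments
# should be checked against the gridmap version in use, and engine/walltime
# are accepted only to match the call sites above.
# ---------------------------------------------------------------------------
import numpy as np
from gridmap import Job


def run_on_chunk(jarpath, xmlpath, mc_files, drs_files, chunk_id):
    # placeholder worker: a real implementation would invoke fact-tools
    # (the given jar with the given xml) on each file and return the results
    return {'chunk': chunk_id, 'n_files': len(mc_files)}


def make_jobs_sketch(jarpath, xmlpath, mc_paths, drs_paths,
                     engine, queue, vmem, num_jobs, walltime):
    jobs = []
    # split the file arrays into num_jobs roughly equal chunks,
    # one gridmap Job per chunk
    mc_chunks = np.array_split(mc_paths, num_jobs)
    drs_chunks = np.array_split(drs_paths, num_jobs)
    for chunk_id, (mc_chunk, drs_chunk) in enumerate(zip(mc_chunks, drs_chunks)):
        job = Job(
            run_on_chunk,
            [jarpath, xmlpath, list(mc_chunk), list(drs_chunk), chunk_id],
            queue=queue,
            mem_free='{}mb'.format(vmem),
        )
        jobs.append(job)
    return jobs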
def main(earliest_night, latest_night, data_dir, jar, xml, db, out, queue, mail,
         walltime, engine, num_runs, qjobs, vmem, log_level, port, source,
         conditions, max_delta_t, local, password):
    level = logging.INFO
    if log_level == "DEBUG":
        level = logging.DEBUG
    elif log_level == "WARN":
        level = logging.WARN
    elif log_level == "INFO":
        level = logging.INFO

    logging.captureWarnings(True)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(message)s",
        level=level,
    )

    jarpath = os.path.abspath(jar)
    xmlpath = os.path.abspath(xml)
    outpath = os.path.abspath(out)
    erna.ensure_output(out)
    logger.info("Output data will be written to {}".format(out))

    db_path = os.path.abspath(db)

    # create the output directory if it doesn't exist
    output_directory = os.path.dirname(outpath)
    os.makedirs(output_directory, exist_ok=True)

    factdb = sqlalchemy.create_engine(
        "mysql+pymysql://factread:{}@129.194.168.95/factdata".format(password)
    )
    data_conditions = dcc.conditions[conditions]

    df_loaded = erna.load(
        earliest_night,
        latest_night,
        data_dir,
        source_name=source,
        timedelta_in_minutes=max_delta_t,
        factdb=factdb,
        data_conditions=data_conditions,
    )
    df_loaded.to_hdf(out + ".tmp", "loaded", mode="a")

    logger.info("Processing {} jobs with {} runs per job.".format(
        len(df_loaded) // num_runs, num_runs
    ))
    click.confirm("Do you want to continue processing and start jobs?", abort=True)

    # ensure that the maximum number of queueable jobs is not larger than
    # the total number of jobs
    if qjobs > len(df_loaded):
        qjobs = len(df_loaded)

    nfinished = 0
    nsubmitted = 1
    running_jobs = []
    pending_jobs = []
    last_finished = []
    jobids = []
    job_output_paths = []
    df_submitted = pd.DataFrame()

    # copy the dataframe with the loaded runs that still have to be submitted
    df_runs = df_loaded.copy()

    # submission loop: keep submitting bunches of runs until all
    # submitted jobs have finished
    while nfinished < nsubmitted:
        n_toqueue = qjobs - (len(pending_jobs) + len(running_jobs))
        logger.info("{} jobs to be queued".format(n_toqueue))

        if (n_toqueue > 0) and (len(df_runs) > 0):
            df_to_submit = df_runs.head(n_toqueue * num_runs).copy()
            processing_identifier = "{}_{}".format(source, time.strftime("%Y%m%d%H%M"))
            df_submitted_last = submit_qsub_jobs(
                processing_identifier,
                jarpath,
                xmlpath,
                db_path,
                df_to_submit,
                engine,
                queue,
                vmem,
                num_runs,
                walltime,
                db,
                mail,
            )
            df_submitted = df_submitted.append(df_submitted_last)

            # remember the job ids of all jobs submitted so far
            jobids = df_submitted["JOBID"].unique()
            df_runs = df_runs.drop(df_to_submit.index)
            nsubmitted = len(jobids)
            logger.info("Submitted {} jobs in last bunch".format(len(df_submitted_last)))
            logger.info("Submitted {} jobs in total".format(nsubmitted))

        finished_jobs = q.get_finished_jobs(jobids)
        running_jobs = q.get_running_jobs(jobids)
        pending_jobs = q.get_pending_jobs(jobids)
        nfinished = len(finished_jobs)
        logger.info(
            "Processing status: running: {}, pending: {}, queued: {}, finished: {}/{}".format(
                len(running_jobs), len(pending_jobs),
                nsubmitted - nfinished, nfinished, nsubmitted,
            )
        )

        # collect the output paths of the jobs that finished since the last check
        last_finished = np.setdiff1d(finished_jobs, last_finished)
        if len(last_finished) > 0:
            last_paths = last_finished_out_paths(df_submitted, last_finished)
            job_output_paths = np.append(job_output_paths, last_paths)
        last_finished = finished_jobs

        if nfinished < nsubmitted:
            logger.info("Checking qstat again in 5 minutes")
            time.sleep(5 * 60)

    logger.info("All jobs have finished, processing done")

    job_outputs = read_outputs_to_list(job_output_paths)
    erna.collect_output(job_outputs, out, df_started_runs=df_loaded)

    df_loaded.to_hdf(out, "loaded", mode="a")
    df_submitted.to_hdf(out, "jobinfo", mode="a")
    os.remove(out + ".tmp")