def verify_conf(parser):
    # Verify whatever preconditions we can verify
    ref = GlobalConf['reference_archive']
    if not (ref.endswith('.tar.gz') or ref.endswith('.tar.bz2') or ref.endswith('.tar')):
        parser.error("Reference {} doesn't seem to be an archive!".format(ref))
    if not phdfs.path.exists(ref):
        parser.error("Reference {} doesn't seem to exist".format(ref))
    if GlobalConf['job_manager_mem'] <= 100:
        parser.error("job_manager_mem of {:d} is too low".format(
            GlobalConf['job_manager_mem']))
    if GlobalConf['task_manager_mem'] <= 1000:
        parser.error("task_manager_mem of {:d} is too low".format(
            GlobalConf['task_manager_mem']))
    if GlobalConf.get('session_wait', 0) < 0:
        parser.error(
            "session_wait, if present, must be >= 0 (found {})".format(
                GlobalConf['session_wait']))
    # test whether we can find the executables we need to run
    for e in ('yarn-session.sh', 'flink', 'seal', 'yarn', 'hdfs'):
        get_exec(e)

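# `get_exec` is used throughout this module but defined elsewhere.  As a rough
# sketch (not necessarily the project's actual implementation), it is assumed
# to resolve an executable name to its full path via PATH and to fail loudly
# when the program is not installed:
#
# def get_exec(name):
#     from distutils.spawn import find_executable  # normally a top-level import
#     path = find_executable(name)  # searches the directories listed in PATH
#     if path is None:
#         raise RuntimeError("Couldn't find executable '{}' on PATH".format(name))
#     return path
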
def parse_args(args):
    p = make_parser()
    options = p.parse_args(args)
    # check bcl converter and bwa path
    if options.converter_path:
        if not os.path.exists(options.converter_path):
            p.error("Specified converter doesn't exist")
        if not os.access(options.converter_path, os.X_OK | os.R_OK):
            p.error("Specified converter is not executable")
    else:
        options.converter_path = get_exec('bcl2fastq')

    if options.bwa_path:
        if not os.path.exists(options.bwa_path):
            p.error("Specified bwa doesn't exist")
        if not os.access(options.bwa_path, os.X_OK | os.R_OK):
            p.error("Specified bwa is not executable")
    else:
        options.bwa_path = get_exec('bwa')

    if options.keep_intermediate and options.skip_bcl:
        p.error("--keep-intermediate and --skip-bcl are incompatible")

    try:
        log_level = getattr(logging, options.log_level)
        options.log_level = log_level  # overwrite the existing value
    except AttributeError as e:
        # this should never happen since we restricted the valid
        # choices at the level of the argument parser
        p.error("Invalid log level! " + e.message)

    verify_conf(p)
    return options

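# `make_parser` is not shown in this section.  The sketch below covers only the
# options that parse_args above actually reads; the flag spellings for
# --converter-path, --bwa-path and --log-level are assumptions (the real parser
# presumably also defines positional arguments such as the input and output paths).
#
# def make_parser():
#     import argparse  # normally a top-level import
#     parser = argparse.ArgumentParser(
#         description="BCL conversion and alignment pipeline")
#     parser.add_argument('--converter-path', help="path to the bcl2fastq executable")
#     parser.add_argument('--bwa-path', help="path to the bwa executable")
#     parser.add_argument('--keep-intermediate', action='store_true',
#                         help="keep intermediate data (incompatible with --skip-bcl)")
#     parser.add_argument('--skip-bcl', action='store_true',
#                         help="skip the bcl conversion step")
#     parser.add_argument('--log-level', default='INFO',
#                         choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
#                         help="logging verbosity")
#     return parser
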
def run_alignments(bcl_output_dir, output_dir):
    sample_directories = _get_samples_from_bcl_output(bcl_output_dir)
    logger.info("Found %d samples in bcl output directory", len(sample_directories))
    logger.debug("Making base output directory %s", output_dir)
    phdfs.mkdir(output_dir)
    # launch all the jobs
    base_cmd = [
        get_exec('seal'), 'seqal', '--align-only',
        '-D', 'seal.seqal.nthreads={:d}'.format(GlobalConf['seqal_nthreads']),
        '-D', 'mapreduce.map.cpu.vcores={:d}'.format(GlobalConf['seqal_yarn_cores']),
        '--input-format', GlobalConf.get('seqal_input_fmt', 'prq'),
        '--output-format', GlobalConf.get('seqal_output_fmt', 'sam'),
        '--ref-archive', GlobalConf['reference_archive'],
    ]

    def start_job(sample_dir):
        sample_output_dir = phdfs.path.join(output_dir, os.path.basename(sample_dir))
        cmd = base_cmd + [sample_dir, sample_output_dir]
        # LP: should refactor to start the job within the AlignJob object
        job = AlignJob(cmd=cmd, inputp=sample_dir, outputp=sample_output_dir)
        logger.info("Launching alignment of sample %s", os.path.basename(sample_dir))
        logger.debug("executing command: %s", cmd)
        job.popen_obj = subprocess.Popen(map(str, cmd), bufsize=4096)
        job.popen_obj.poll()
        logger.debug("job running with PID %d", job.popen_obj.pid)
        return job

    jobs = [start_job(s) for s in sample_directories]
    ok = _wait(jobs, GlobalConf['remove_output'])
    if not ok:
        errored_jobs = [j for j in jobs if j.failed]
        logger.error("%d alignment jobs failed", len(errored_jobs))
        logger.error("Here are the return codes: %s",
                     ', '.join([str(j.retcode) for j in errored_jobs]))
        raise RuntimeError("Some alignment jobs failed")

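# `AlignJob` is defined elsewhere; run_alignments above and _wait below rely on
# its `done`, `failed`, `retcode` and `output_path` attributes.  A minimal
# sketch of such a wrapper around the Popen object (an assumption, not the
# project's actual class) could be:
#
# class AlignJob(object):
#     def __init__(self, cmd=None, inputp=None, outputp=None):
#         self.cmd = cmd
#         self.input_path = inputp
#         self.output_path = outputp
#         self.popen_obj = None  # set by the caller after launching the process
#
#     @property
#     def retcode(self):
#         # poll() returns None while the process is still running
#         return self.popen_obj.poll() if self.popen_obj else None
#
#     @property
#     def done(self):
#         return self.retcode is not None
#
#     @property
#     def failed(self):
#         return self.done and self.retcode != 0
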
def _start_flink_yarn_session(n_nodes):
    """
    :return: yarn application id of the session
    """
    cmd = [
        get_exec('yarn-session.sh'),
        '-n', n_nodes * 2,
        '-jm', GlobalConf['job_manager_mem'],   # job manager memory
        '-tm', GlobalConf['task_manager_mem'],  # task manager memory
        '-s', GlobalConf['slots'],
        '-d',  # run in detached mode
    ]
    logger.info("Starting flink session on Yarn in detached mode")
    logger.info(
        "Configuration:\n\tnodes: %d\n\tjm mem: %d\n\ttm mem: %d\n\tslots: %d",
        n_nodes, GlobalConf['job_manager_mem'],
        GlobalConf['task_manager_mem'], GlobalConf['slots'])
    logger.debug("executing command: %s", cmd)
    try:
        output = subprocess.check_output(map(str, cmd))
    except subprocess.CalledProcessError:
        logger.error("Failed to start Flink session on Yarn!")
        raise
    logger.debug(
        "Session output\n============================================================\n"
        "%s\n============================================================", output)
    app_id = _parse_session_output(output)
    logger.info("Flink session started with application id '%s'", app_id)

    state, final_state = _get_app_status(app_id)
    while state != 'RUNNING' and final_state == 'UNDEFINED':
        logger.debug(
            "Waiting for session to enter the RUNNING state (currently in %s)", state)
        time.sleep(2)
        state, final_state = _get_app_status(app_id)
    if final_state != 'UNDEFINED':
        raise RuntimeError(
            "Problem!! Flink session {} has terminated! Final state: {}".format(
                app_id, final_state))
    logger.info("Flink session %s RUNNING", app_id)

    if GlobalConf.get('session_wait', 0) > 0:
        logger.info(
            "Waiting %d seconds for the Flink session to start its TaskManagers",
            GlobalConf['session_wait'])
        time.sleep(GlobalConf['session_wait'])
        logger.debug("Wait finished.")
    return app_id

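# `_get_app_status` is assumed to return a (state, final_state) pair scraped
# from the `yarn application -status <app-id>` report, whose output is assumed
# to contain "State : ..." and "Final-State : ..." lines.  A hedged sketch:
#
# def _get_app_status(app_id):
#     report = subprocess.check_output(
#         [get_exec('yarn'), 'application', '-status', app_id])
#     state, final_state = None, None
#     for line in report.splitlines():
#         line = line.strip()
#         if line.startswith('State :'):
#             state = line.split(':', 1)[1].strip()
#         elif line.startswith('Final-State :'):
#             final_state = line.split(':', 1)[1].strip()
#     return state, final_state
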
def _run_converter_and_yarn_session(input_dir, output_dir, n_nodes, jar_path):
    # setup properties file
    run_dir = tempfile.mkdtemp(prefix="bclconverter_run_dir")
    try:
        ## start by preparing the properties file (at the moment the program
        # doesn't accept command line arguments)
        tmp_conf_dir = os.path.join(run_dir, "conf")
        os.makedirs(tmp_conf_dir)
        props_file = os.path.join(tmp_conf_dir, GlobalConf['props_filename'])
        with open(props_file, 'w') as f:
            f.write("root = {}/\n".format(input_dir.rstrip('/')))
            f.write("fout = {}/\n".format(output_dir.rstrip('/')))
            f.write("numTasks = {:d}\n".format(GlobalConf['tasksPerNode'] * n_nodes))
            f.write("flinkpar = {:d}\n".format(GlobalConf['flinkpar']))
            f.write("jnum = {:d}\n".format(GlobalConf['jnum']))
        logger.info("Wrote properties in file %s", props_file)
        if logger.isEnabledFor(logging.DEBUG):
            with open(props_file) as f:
                logger.debug(
                    "\n=============================\n%s\n=====================\n",
                    f.read())
        # now run the program
        logger.debug("Running flink cwd %s", run_dir)
        cmd = [
            get_exec("flink"), "run",
            "-m", "yarn-cluster",
            '-yn', n_nodes,
            '-yjm', GlobalConf['job_manager_mem'],   # job manager memory
            '-ytm', GlobalConf['task_manager_mem'],  # task manager memory
            '-ys', GlobalConf['slots'],
            "-c", "bclconverter.bclreader.test",  # class name
            jar_path
        ]
        logger.debug("executing command: %s", cmd)
        with chdir(run_dir):
            logger.debug("In CWD, where we're going to run flink")
            logger.debug("cat conf/bclconverter.properties gives:")
            subprocess.check_call("cat conf/bclconverter.properties", shell=True)
            logger.debug("Now running flink")
            subprocess.check_call(map(str, cmd), cwd=run_dir)
    finally:
        logger.debug("Removing run directory %s", run_dir)
        try:
            shutil.rmtree(run_dir)
        except IOError as e:
            logger.debug("Error cleaning up temporary dir %s", run_dir)
            logger.debug(e.message)

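# `chdir` is used above as a context manager that temporarily changes the
# working directory.  Assuming it is the usual idiom, it would look like:
#
# from contextlib import contextmanager  # normally a top-level import
#
# @contextmanager
# def chdir(path):
#     old_cwd = os.getcwd()
#     os.chdir(path)
#     try:
#         yield
#     finally:
#         os.chdir(old_cwd)
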
def _yarn_kill_all_apps():
    error = False
    yarn_exec = get_exec('yarn')
    for app_id in _yarn_get_app_ids():
        cmd = [yarn_exec, 'application', '-kill', app_id]
        logger.debug("killing application %s: %s", app_id, cmd)
        retcode = subprocess.call(cmd)
        if retcode != 0:
            logger.info("Failed to kill yarn application %s", app_id)
            error = True
    if error:
        raise RuntimeError("Failed to kill some running yarn applications")

def _clear_caches(self):
    logger.info("Clearing system caches on cluster")
    nodes = yarn_get_node_list()
    hostnames = set([n.split(':')[0].strip() for n in nodes])
    logger.debug("Found %d yarn nodemanager hosts", len(hostnames))
    clean_cmd = "sudo sh -c 'echo 3 >/proc/sys/vm/drop_caches'"
    logger.debug("Using pdsh")
    pdsh_cmd = [get_exec('pdsh'), '-R', 'ssh', '-w', ','.join(hostnames), clean_cmd]
    logger.debug("cmd: %s", pdsh_cmd)
    subprocess.check_call(pdsh_cmd)

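# `yarn_get_node_list` is assumed to return one "host:port" entry per
# NodeManager, as expected by _clear_caches above.  A sketch that scrapes the
# `yarn node -list` report (the Node-Id is assumed to be the first column of
# each data row):
#
# def yarn_get_node_list():
#     output = subprocess.check_output([get_exec('yarn'), 'node', '-list'])
#     # keep only rows whose first field looks like "host:port"
#     return [line.split()[0] for line in output.splitlines()
#             if line.split() and ':' in line.split()[0]]
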
def _wait(jobs, remove_output):
    logger.info("Waiting for jobs to finish")
    running = list(jobs)
    secs = 0
    poll_freq = 2
    failed = False
    while running and not failed:
        failed = any((j.failed for j in running))
        if failed:
            break
        # update running list
        new_running = []
        for j in running:
            if j.done:
                # job just finished
                logger.info("Alignment job writing to %s just finished",
                            phdfs.path.basename(j.output_path))
                if remove_output:
                    logger.info("Removing output path %s", j.output_path)
                    _try_remove_hdfs_dir(j.output_path)
            else:
                new_running.append(j)
        running = new_running
        if secs % 8 == 0:
            logger.info("%d jobs (out of %d) haven't finished", len(running), len(jobs))
        if secs % 60 == 0:
            logger.debug("Logging free disk space situation")
            subprocess.call([get_exec('hdfs'), 'dfsadmin', '-report'])
        if running:
            time.sleep(poll_freq)
            secs += poll_freq
    if failed:
        logger.error("We have failed jobs :-(")
        logger.error("Killing all remaining jobs on Yarn cluster")
        try:
            _yarn_kill_all_apps()
        except StandardError as e:
            logger.error("Failed to clean up yarn cluster. Sorry!")
            logger.exception(e)
    else:
        logger.info("All jobs finished")
    return not failed

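# `_try_remove_hdfs_dir` is assumed to be a best-effort recursive delete on
# HDFS; pydoop's hdfs module (imported here as phdfs) provides rmr for that.
# A sketch:
#
# def _try_remove_hdfs_dir(path):
#     try:
#         phdfs.rmr(path)
#         return True
#     except IOError as e:
#         logger.error("Failed to remove HDFS path %s", path)
#         logger.exception(e)
#         return False
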
def _yarn_get_app_ids():
    yarn_exec = get_exec('yarn')
    yarn_output = subprocess.check_output([yarn_exec, 'application', '-list'])
    app_ids = [line.split('\t', 1)[0] for line in yarn_output.split('\n')[2:]]
    return app_ids